xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 682c9e0fed0115eb6f283e755901c0aac90e86e8)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 
50 /* count xmits ourselves, rather than via drbr */
51 #define NO_SLOW_STATS
52 #include <net/if.h>
53 #include <net/if_arp.h>
54 #include <net/ethernet.h>
55 #include <net/if_dl.h>
56 #include <net/if_media.h>
57 
58 #include <net/bpf.h>
59 
60 #include <net/if_types.h>
61 #include <net/if_vlan_var.h>
62 #include <net/zlib.h>
63 
64 #include <netinet/in_systm.h>
65 #include <netinet/in.h>
66 #include <netinet/ip.h>
67 #include <netinet/tcp.h>
68 
69 #include <machine/bus.h>
70 #include <machine/in_cksum.h>
71 #include <machine/resource.h>
72 #include <sys/bus.h>
73 #include <sys/rman.h>
74 #include <sys/smp.h>
75 
76 #include <dev/pci/pcireg.h>
77 #include <dev/pci/pcivar.h>
78 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
79 
80 #include <vm/vm.h>		/* for pmap_mapdev() */
81 #include <vm/pmap.h>
82 
83 #if defined(__i386) || defined(__amd64)
84 #include <machine/specialreg.h>
85 #endif
86 
87 #include <dev/mxge/mxge_mcp.h>
88 #include <dev/mxge/mcp_gen_header.h>
89 /*#define MXGE_FAKE_IFP*/
90 #include <dev/mxge/if_mxge_var.h>
91 #ifdef IFNET_BUF_RING
92 #include <sys/buf_ring.h>
93 #endif
94 
95 #include "opt_inet.h"
96 
97 /* tunable params */
98 static int mxge_nvidia_ecrc_enable = 1;
99 static int mxge_force_firmware = 0;
100 static int mxge_intr_coal_delay = 30;
101 static int mxge_deassert_wait = 1;
102 static int mxge_flow_control = 1;
103 static int mxge_verbose = 0;
104 static int mxge_lro_cnt = 8;
105 static int mxge_ticks;
106 static int mxge_max_slices = 1;
107 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
108 static int mxge_always_promisc = 0;
109 static int mxge_initial_mtu = ETHERMTU_JUMBO;
110 static int mxge_throttle = 0;
111 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
112 static char *mxge_fw_aligned = "mxge_eth_z8e";
113 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
114 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
115 
116 static int mxge_probe(device_t dev);
117 static int mxge_attach(device_t dev);
118 static int mxge_detach(device_t dev);
119 static int mxge_shutdown(device_t dev);
120 static void mxge_intr(void *arg);
121 
122 static device_method_t mxge_methods[] =
123 {
124   /* Device interface */
125   DEVMETHOD(device_probe, mxge_probe),
126   DEVMETHOD(device_attach, mxge_attach),
127   DEVMETHOD(device_detach, mxge_detach),
128   DEVMETHOD(device_shutdown, mxge_shutdown),
129   {0, 0}
130 };
131 
132 static driver_t mxge_driver =
133 {
134   "mxge",
135   mxge_methods,
136   sizeof(mxge_softc_t),
137 };
138 
139 static devclass_t mxge_devclass;
140 
141 /* Declare ourselves to be a child of the PCI bus.*/
142 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
143 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
144 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
145 
146 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
147 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
148 static int mxge_close(mxge_softc_t *sc, int down);
149 static int mxge_open(mxge_softc_t *sc);
150 static void mxge_tick(void *arg);
151 
152 static int
153 mxge_probe(device_t dev)
154 {
155 	int rev;
156 
157 
158 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
159 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
160 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
161 		rev = pci_get_revid(dev);
162 		switch (rev) {
163 		case MXGE_PCI_REV_Z8E:
164 			device_set_desc(dev, "Myri10G-PCIE-8A");
165 			break;
166 		case MXGE_PCI_REV_Z8ES:
167 			device_set_desc(dev, "Myri10G-PCIE-8B");
168 			break;
169 		default:
170 			device_set_desc(dev, "Myri10G-PCIE-8??");
171 			device_printf(dev, "Unrecognized rev %d NIC\n",
172 				      rev);
173 			break;
174 		}
175 		return 0;
176 	}
177 	return ENXIO;
178 }
179 
180 static void
181 mxge_enable_wc(mxge_softc_t *sc)
182 {
183 #if defined(__i386) || defined(__amd64)
184 	vm_offset_t len;
185 	int err;
186 
187 	sc->wc = 1;
188 	len = rman_get_size(sc->mem_res);
189 	err = pmap_change_attr((vm_offset_t) sc->sram,
190 			       len, PAT_WRITE_COMBINING);
191 	if (err != 0) {
192 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
193 			      err);
194 		sc->wc = 0;
195 	}
196 #endif
197 }
198 
199 
200 /* callback to get our DMA address */
201 static void
202 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
203 			 int error)
204 {
205 	if (error == 0) {
206 		*(bus_addr_t *) arg = segs->ds_addr;
207 	}
208 }
209 
210 static int
211 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
212 		   bus_size_t alignment)
213 {
214 	int err;
215 	device_t dev = sc->dev;
216 	bus_size_t boundary, maxsegsize;
217 
218 	if (bytes > 4096 && alignment == 4096) {
219 		boundary = 0;
220 		maxsegsize = bytes;
221 	} else {
222 		boundary = 4096;
223 		maxsegsize = 4096;
224 	}
225 
226 	/* allocate DMAable memory tags */
227 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
228 				 alignment,		/* alignment */
229 				 boundary,		/* boundary */
230 				 BUS_SPACE_MAXADDR,	/* low */
231 				 BUS_SPACE_MAXADDR,	/* high */
232 				 NULL, NULL,		/* filter */
233 				 bytes,			/* maxsize */
234 				 1,			/* num segs */
235 				 maxsegsize,		/* maxsegsize */
236 				 BUS_DMA_COHERENT,	/* flags */
237 				 NULL, NULL,		/* lock */
238 				 &dma->dmat);		/* tag */
239 	if (err != 0) {
240 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
241 		return err;
242 	}
243 
244 	/* allocate DMAable memory & map */
245 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
246 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
247 				| BUS_DMA_ZERO),  &dma->map);
248 	if (err != 0) {
249 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
250 		goto abort_with_dmat;
251 	}
252 
253 	/* load the memory */
254 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
255 			      mxge_dmamap_callback,
256 			      (void *)&dma->bus_addr, 0);
257 	if (err != 0) {
258 		device_printf(dev, "couldn't load map (err = %d)\n", err);
259 		goto abort_with_mem;
260 	}
261 	return 0;
262 
263 abort_with_mem:
264 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
265 abort_with_dmat:
266 	(void)bus_dma_tag_destroy(dma->dmat);
267 	return err;
268 }
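/*
 * Illustrative only (not part of the original driver): a minimal
 * sketch of how the mxge_dma_alloc()/mxge_dma_free() pair is meant
 * to be used; the 4096-byte size and alignment are example values.
 */
#if 0
	{
		mxge_dma_t example_dma;

		if (mxge_dma_alloc(sc, &example_dma, 4096, 4096) == 0) {
			/* example_dma.addr is the kernel VA; the load
			   callback filled in example_dma.bus_addr */
			mxge_dma_free(&example_dma);
		}
	}
#endif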
269 
270 
271 static void
272 mxge_dma_free(mxge_dma_t *dma)
273 {
274 	bus_dmamap_unload(dma->dmat, dma->map);
275 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
276 	(void)bus_dma_tag_destroy(dma->dmat);
277 }
278 
279 /*
280  * The eeprom strings on the lanaiX have the format
281  * SN=x\0
282  * MAC=x:x:x:x:x:x\0
283  * PC=text\0
284  */
285 
286 static int
287 mxge_parse_strings(mxge_softc_t *sc)
288 {
289 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
290 
291 	char *ptr, *limit;
292 	int i, found_mac;
293 
294 	ptr = sc->eeprom_strings;
295 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
296 	found_mac = 0;
297 	while (ptr < limit && *ptr != '\0') {
298 		if (memcmp(ptr, "MAC=", 4) == 0) {
299 			ptr += 1;
300 			sc->mac_addr_string = ptr;
301 			for (i = 0; i < 6; i++) {
302 				ptr += 3;
303 				if ((ptr + 2) > limit)
304 					goto abort;
305 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
306 				found_mac = 1;
307 			}
308 		} else if (memcmp(ptr, "PC=", 3) == 0) {
309 			ptr += 3;
310 			strncpy(sc->product_code_string, ptr,
311 				sizeof (sc->product_code_string) - 1);
312 		} else if (memcmp(ptr, "SN=", 3) == 0) {
313 			ptr += 3;
314 			strncpy(sc->serial_number_string, ptr,
315 				sizeof (sc->serial_number_string) - 1);
316 		}
317 		MXGE_NEXT_STRING(ptr);
318 	}
319 
320 	if (found_mac)
321 		return 0;
322 
323  abort:
324 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
325 
326 	return ENXIO;
327 }
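/*
 * Example of the layout parsed above, with hypothetical values: the
 * EEPROM holds consecutive NUL-terminated strings, ended by an empty
 * string, e.g.:
 *
 *	"MAC=00:60:dd:47:ab:cd\0" "SN=123456\0" "PC=10G-PCIE-8A-R\0" "\0"
 *
 * from which the loop extracts mac_addr[] =
 * {0x00, 0x60, 0xdd, 0x47, 0xab, 0xcd} and the SN=/PC= strings.
 */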
328 
329 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
330 static void
331 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
332 {
333 	uint32_t val;
334 	unsigned long base, off;
335 	char *va, *cfgptr;
336 	device_t pdev, mcp55;
337 	uint16_t vendor_id, device_id, word;
338 	uintptr_t bus, slot, func, ivend, idev;
339 	uint32_t *ptr32;
340 
341 
342 	if (!mxge_nvidia_ecrc_enable)
343 		return;
344 
345 	pdev = device_get_parent(device_get_parent(sc->dev));
346 	if (pdev == NULL) {
347 		device_printf(sc->dev, "could not find parent?\n");
348 		return;
349 	}
350 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
351 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
352 
353 	if (vendor_id != 0x10de)
354 		return;
355 
356 	base = 0;
357 
358 	if (device_id == 0x005d) {
359 		/* ck804, base address is magic */
360 		base = 0xe0000000UL;
361 	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
362 		/* mcp55, base address stored in chipset */
363 		mcp55 = pci_find_bsf(0, 0, 0);
364 		if (mcp55 &&
365 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
366 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
367 			word = pci_read_config(mcp55, 0x90, 2);
368 			base = ((unsigned long)word & 0x7ffeU) << 25;
369 		}
370 	}
371 	if (!base)
372 		return;
373 
374 	/* XXXX
375 	   The test below is commented out because it is believed that
376 	   doing a config read/write beyond 0xff accesses the config space
377 	   of the next larger function.  Uncomment this and remove
378 	   the hacky pmap_mapdev() way of accessing config space when
379 	   FreeBSD grows support for extended pcie config space access
380 	*/
381 #if 0
382 	/* See if we can, by some miracle, access the extended
383 	   config space */
384 	val = pci_read_config(pdev, 0x178, 4);
385 	if (val != 0xffffffff) {
386 		val |= 0x40;
387 		pci_write_config(pdev, 0x178, val, 4);
388 		return;
389 	}
390 #endif
391 	/* Rather than using normal pci config space writes, we must
392 	 * map the Nvidia config space ourselves.  This is because on
393 	 * opteron/nvidia class machines the 0xe0000000 mapping is
394 	 * handled by the nvidia chipset; that means the internal PCI
395 	 * device (the on-chip northbridge), or the amd-8131 bridge
396 	 * and things behind them are not visible by this method.
397 	 */
398 
399 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
400 		      PCI_IVAR_BUS, &bus);
401 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
402 		      PCI_IVAR_SLOT, &slot);
403 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
404 		      PCI_IVAR_FUNCTION, &func);
405 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
406 		      PCI_IVAR_VENDOR, &ivend);
407 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
408 		      PCI_IVAR_DEVICE, &idev);
409 
410 	off =  base
411 		+ 0x00100000UL * (unsigned long)bus
412 		+ 0x00001000UL * (unsigned long)(func
413 						 + 8 * slot);
414 
415 	/* map it into the kernel */
416 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
417 
418 
419 	if (va == NULL) {
420 		device_printf(sc->dev, "pmap_mapdev failed\n");
421 		return;
422 	}
423 	/* get a pointer to the config space mapped into the kernel */
424 	cfgptr = va + (off & PAGE_MASK);
425 
426 	/* make sure that we can really access it */
427 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
428 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
429 	if (! (vendor_id == ivend && device_id == idev)) {
430 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
431 			      vendor_id, device_id);
432 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
433 		return;
434 	}
435 
436 	ptr32 = (uint32_t*)(cfgptr + 0x178);
437 	val = *ptr32;
438 
439 	if (val == 0xffffffff) {
440 		device_printf(sc->dev, "extended mapping failed\n");
441 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
442 		return;
443 	}
444 	*ptr32 = val | 0x40;
445 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
446 	if (mxge_verbose)
447 		device_printf(sc->dev,
448 			      "Enabled ECRC on upstream Nvidia bridge "
449 			      "at %d:%d:%d\n",
450 			      (int)bus, (int)slot, (int)func);
451 	return;
452 }
453 #else
454 static void
455 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
456 {
457 	device_printf(sc->dev,
458 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
459 	return;
460 }
461 #endif
462 
463 
464 static int
465 mxge_dma_test(mxge_softc_t *sc, int test_type)
466 {
467 	mxge_cmd_t cmd;
468 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
469 	int status;
470 	uint32_t len;
471 	char *test = " ";
472 
473 
474 	/* Run a small DMA test.
475 	 * The magic multipliers to the length tell the firmware
476 	 * to do DMA read, write, or read+write tests.  The
477 	 * results are returned in cmd.data0.  The upper 16
478 	 * bits of the return is the number of transfers completed.
479 	 * The lower 16 bits is the time in 0.5us ticks that the
480 	 * transfers took to complete.
481 	 */
482 
483 	len = sc->tx_boundary;
484 
485 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
486 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
487 	cmd.data2 = len * 0x10000;
488 	status = mxge_send_cmd(sc, test_type, &cmd);
489 	if (status != 0) {
490 		test = "read";
491 		goto abort;
492 	}
493 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
494 		(cmd.data0 & 0xffff);
495 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
496 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
497 	cmd.data2 = len * 0x1;
498 	status = mxge_send_cmd(sc, test_type, &cmd);
499 	if (status != 0) {
500 		test = "write";
501 		goto abort;
502 	}
503 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
504 		(cmd.data0 & 0xffff);
505 
506 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
507 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
508 	cmd.data2 = len * 0x10001;
509 	status = mxge_send_cmd(sc, test_type, &cmd);
510 	if (status != 0) {
511 		test = "read/write";
512 		goto abort;
513 	}
514 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
515 		(cmd.data0 & 0xffff);
516 
517 abort:
518 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
519 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
520 			      test, status);
521 
522 	return status;
523 }
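/*
 * Worked example of the conversion above (example numbers only): the
 * firmware returns cmd.data0 = (transfers << 16) | ticks.  With
 * len = 4096, transfers = 1000 and ticks = 2000, the test moved
 * 1000 * 4096 bytes in 2000 * 0.5us = 1ms, so
 * (1000 * 4096 * 2) / 2000 = 4096 MB/s; the "* 2" converts the
 * 0.5us ticks into bytes per microsecond (i.e. MB/s).
 */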
524 
525 /*
526  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
527  * when the PCI-E Completion packets are aligned on an 8-byte
528  * boundary.  Some PCI-E chip sets always align Completion packets; on
529  * the ones that do not, the alignment can be enforced by enabling
530  * ECRC generation (if supported).
531  *
532  * When PCI-E Completion packets are not aligned, it is actually more
533  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
534  *
535  * If the driver can neither enable ECRC nor verify that it has
536  * already been enabled, then it must use a firmware image which works
537  * around unaligned completion packets (ethp_z8e.dat), and it should
538  * also ensure that it never gives the device a Read-DMA which is
539  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
540  * enabled, then the driver should use the aligned (eth_z8e.dat)
541  * firmware image, and set tx_boundary to 4KB.
542  */
543 
544 static int
545 mxge_firmware_probe(mxge_softc_t *sc)
546 {
547 	device_t dev = sc->dev;
548 	int reg, status;
549 	uint16_t pectl;
550 
551 	sc->tx_boundary = 4096;
552 	/*
553 	 * Verify the max read request size was set to 4KB
554 	 * before trying the test with 4KB.
555 	 */
556 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
557 		pectl = pci_read_config(dev, reg + 0x8, 2);
558 		if ((pectl & (5 << 12)) != (5 << 12)) {
559 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
560 				      pectl);
561 			sc->tx_boundary = 2048;
562 		}
563 	}
564 
565 	/*
566 	 * load the optimized firmware (which assumes aligned PCIe
567 	 * completions) in order to see if it works on this host.
568 	 */
569 	sc->fw_name = mxge_fw_aligned;
570 	status = mxge_load_firmware(sc, 1);
571 	if (status != 0) {
572 		return status;
573 	}
574 
575 	/*
576 	 * Enable ECRC if possible
577 	 */
578 	mxge_enable_nvidia_ecrc(sc);
579 
580 	/*
581 	 * Run a DMA test which watches for unaligned completions and
582 	 * aborts on the first one seen.
583 	 */
584 
585 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
586 	if (status == 0)
587 		return 0; /* keep the aligned firmware */
588 
589 	if (status != E2BIG)
590 		device_printf(dev, "DMA test failed: %d\n", status);
591 	if (status == ENOSYS)
592 		device_printf(dev, "Falling back to ethp! "
593 			      "Please install up to date fw\n");
594 	return status;
595 }
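/*
 * Note on the MRRS test in mxge_firmware_probe(): bits 14:12 of the
 * PCIe Device Control register encode Max_Read_Request_Size as
 * 128 << value, so the required value 5 corresponds to
 * 128 << 5 = 4096 bytes, hence the (5 << 12) mask comparison.
 */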
596 
597 static int
598 mxge_select_firmware(mxge_softc_t *sc)
599 {
600 	int aligned = 0;
601 	int force_firmware = mxge_force_firmware;
602 
603 	if (sc->throttle)
604 		force_firmware = sc->throttle;
605 
606 	if (force_firmware != 0) {
607 		if (force_firmware == 1)
608 			aligned = 1;
609 		else
610 			aligned = 0;
611 		if (mxge_verbose)
612 			device_printf(sc->dev,
613 				      "Assuming %s completions (forced)\n",
614 				      aligned ? "aligned" : "unaligned");
615 		goto abort;
616 	}
617 
618 	/* if the PCIe link width is 4 or less, we can use the aligned
619 	   firmware and skip any checks */
620 	if (sc->link_width != 0 && sc->link_width <= 4) {
621 		device_printf(sc->dev,
622 			      "PCIe x%d Link, expect reduced performance\n",
623 			      sc->link_width);
624 		aligned = 1;
625 		goto abort;
626 	}
627 
628 	if (0 == mxge_firmware_probe(sc))
629 		return 0;
630 
631 abort:
632 	if (aligned) {
633 		sc->fw_name = mxge_fw_aligned;
634 		sc->tx_boundary = 4096;
635 	} else {
636 		sc->fw_name = mxge_fw_unaligned;
637 		sc->tx_boundary = 2048;
638 	}
639 	return (mxge_load_firmware(sc, 0));
640 }
641 
642 union qualhack
643 {
644         const char *ro_char;
645         char *rw_char;
646 };
647 
648 static int
649 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
650 {
651 
652 
653 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
654 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
655 			      be32toh(hdr->mcp_type));
656 		return EIO;
657 	}
658 
659 	/* save firmware version for sysctl */
660 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
661 	if (mxge_verbose)
662 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
663 
664 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
665 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
666 
667 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
668 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
669 		device_printf(sc->dev, "Found firmware version %s\n",
670 			      sc->fw_version);
671 		device_printf(sc->dev, "Driver needs %d.%d\n",
672 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
673 		return EINVAL;
674 	}
675 	return 0;
676 
677 }
678 
679 static void *
680 z_alloc(void *nil, u_int items, u_int size)
681 {
682         void *ptr;
683 
684         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
685         return ptr;
686 }
687 
688 static void
689 z_free(void *nil, void *ptr)
690 {
691         free(ptr, M_TEMP);
692 }
693 
694 
695 static int
696 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
697 {
698 	z_stream zs;
699 	char *inflate_buffer;
700 	const struct firmware *fw;
701 	const mcp_gen_header_t *hdr;
702 	unsigned hdr_offset;
703 	int status;
704 	unsigned int i;
705 	char dummy;
706 	size_t fw_len;
707 
708 	fw = firmware_get(sc->fw_name);
709 	if (fw == NULL) {
710 		device_printf(sc->dev, "Could not find firmware image %s\n",
711 			      sc->fw_name);
712 		return ENOENT;
713 	}
714 
715 
716 
717 	/* setup zlib and decompress f/w */
718 	bzero(&zs, sizeof (zs));
719 	zs.zalloc = z_alloc;
720 	zs.zfree = z_free;
721 	status = inflateInit(&zs);
722 	if (status != Z_OK) {
723 		status = EIO;
724 		goto abort_with_fw;
725 	}
726 
727 	/* the uncompressed size is stored as the firmware version,
728 	   which would otherwise go unused */
729 	fw_len = (size_t) fw->version;
730 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
731 	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* don't report success if the malloc fails */
732 		goto abort_with_zs;
	}
733 	zs.avail_in = fw->datasize;
734 	zs.next_in = __DECONST(char *, fw->data);
735 	zs.avail_out = fw_len;
736 	zs.next_out = inflate_buffer;
737 	status = inflate(&zs, Z_FINISH);
738 	if (status != Z_STREAM_END) {
739 		device_printf(sc->dev, "zlib %d\n", status);
740 		status = EIO;
741 		goto abort_with_buffer;
742 	}
743 
744 	/* check id */
745 	hdr_offset = htobe32(*(const uint32_t *)
746 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
747 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
748 		device_printf(sc->dev, "Bad firmware file");
749 		status = EIO;
750 		goto abort_with_buffer;
751 	}
752 	hdr = (const void*)(inflate_buffer + hdr_offset);
753 
754 	status = mxge_validate_firmware(sc, hdr);
755 	if (status != 0)
756 		goto abort_with_buffer;
757 
758 	/* Copy the inflated firmware to NIC SRAM. */
759 	for (i = 0; i < fw_len; i += 256) {
760 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
761 			      inflate_buffer + i,
762 			      min(256U, (unsigned)(fw_len - i)));
763 		wmb();
764 		dummy = *sc->sram;
765 		wmb();
766 	}
767 
768 	*limit = fw_len;
769 	status = 0;
770 abort_with_buffer:
771 	free(inflate_buffer, M_TEMP);
772 abort_with_zs:
773 	inflateEnd(&zs);
774 abort_with_fw:
775 	firmware_put(fw, FIRMWARE_UNLOAD);
776 	return status;
777 }
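/*
 * Sketch of the decompressed image layout assumed by the helper
 * above: a big-endian 32-bit word at MCP_HEADER_PTR_OFFSET gives the
 * offset of the mcp_gen_header_t inside the image, and the validated
 * image is PIO-copied to NIC SRAM at MXGE_FW_OFFSET in 256-byte
 * chunks, reading back sc->sram after each chunk to flush the
 * posted writes.
 */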
778 
779 /*
780  * Enable or disable periodic RDMAs from the host to make certain
781  * chipsets resend dropped PCIe messages
782  */
783 
784 static void
785 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
786 {
787 	char buf_bytes[72];
788 	volatile uint32_t *confirm;
789 	volatile char *submit;
790 	uint32_t *buf, dma_low, dma_high;
791 	int i;
792 
793 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
794 
795 	/* clear confirmation addr */
796 	confirm = (volatile uint32_t *)sc->cmd;
797 	*confirm = 0;
798 	wmb();
799 
800 	/* send an rdma command to the PCIe engine, and wait for the
801 	   response in the confirmation address.  The firmware should
802 	   write a -1 there to indicate it is alive and well
803 	*/
804 
805 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
806 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
807 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
808 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
809 	buf[2] = htobe32(0xffffffff);		/* confirm data */
810 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
811 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
812 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
813 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
814 	buf[5] = htobe32(enable);			/* enable? */
815 
816 
817 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
818 
819 	mxge_pio_copy(submit, buf, 64);
820 	wmb();
821 	DELAY(1000);
822 	wmb();
823 	i = 0;
824 	while (*confirm != 0xffffffff && i < 20) {
825 		DELAY(1000);
826 		i++;
827 	}
828 	if (*confirm != 0xffffffff) {
829 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
830 			      (enable ? "enable" : "disable"), confirm,
831 			      *confirm);
832 	}
833 	return;
834 }
835 
836 static int
837 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
838 {
839 	mcp_cmd_t *buf;
840 	char buf_bytes[sizeof(*buf) + 8];
841 	volatile mcp_cmd_response_t *response = sc->cmd;
842 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
843 	uint32_t dma_low, dma_high;
844 	int err, sleep_total = 0;
845 
846 	/* ensure buf is aligned to 8 bytes */
847 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
848 
849 	buf->data0 = htobe32(data->data0);
850 	buf->data1 = htobe32(data->data1);
851 	buf->data2 = htobe32(data->data2);
852 	buf->cmd = htobe32(cmd);
853 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
854 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
855 
856 	buf->response_addr.low = htobe32(dma_low);
857 	buf->response_addr.high = htobe32(dma_high);
858 	mtx_lock(&sc->cmd_mtx);
859 	response->result = 0xffffffff;
860 	wmb();
861 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
862 
863 	/* wait up to 20ms */
864 	err = EAGAIN;
865 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
866 		bus_dmamap_sync(sc->cmd_dma.dmat,
867 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
868 		wmb();
869 		switch (be32toh(response->result)) {
870 		case 0:
871 			data->data0 = be32toh(response->data);
872 			err = 0;
873 			break;
874 		case 0xffffffff:
875 			DELAY(1000);
876 			break;
877 		case MXGEFW_CMD_UNKNOWN:
878 			err = ENOSYS;
879 			break;
880 		case MXGEFW_CMD_ERROR_UNALIGNED:
881 			err = E2BIG;
882 			break;
883 		case MXGEFW_CMD_ERROR_BUSY:
884 			err = EBUSY;
885 			break;
886 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
887 			err = ENXIO;
888 			break;
889 		default:
890 			device_printf(sc->dev,
891 				      "mxge: command %d "
892 				      "failed, result = %d\n",
893 				      cmd, be32toh(response->result));
894 			err = ENXIO;
895 			break;
896 		}
897 		if (err != EAGAIN)
898 			break;
899 	}
900 	if (err == EAGAIN)
901 		device_printf(sc->dev, "mxge: command %d timed out, "
902 			      "result = %d\n",
903 			      cmd, be32toh(response->result));
904 	mtx_unlock(&sc->cmd_mtx);
905 	return err;
906 }
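/*
 * Illustrative only: a minimal mxge_send_cmd() usage sketch.  A
 * command carries up to three 32-bit arguments in data0..data2 and
 * returns one 32-bit result in data0 (see the response handling
 * above).
 */
#if 0
	{
		mxge_cmd_t cmd;
		int err;

		memset(&cmd, 0, sizeof (cmd));	/* args are command-specific */
		err = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
		if (err != 0)
			device_printf(sc->dev, "reset failed: %d\n", err);
	}
#endif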
907 
908 static int
909 mxge_adopt_running_firmware(mxge_softc_t *sc)
910 {
911 	struct mcp_gen_header *hdr;
912 	const size_t bytes = sizeof (struct mcp_gen_header);
913 	size_t hdr_offset;
914 	int status;
915 
916 	/* find running firmware header */
917 	hdr_offset = htobe32(*(volatile uint32_t *)
918 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
919 
920 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
921 		device_printf(sc->dev,
922 			      "Running firmware has bad header offset (%d)\n",
923 			      (int)hdr_offset);
924 		return EIO;
925 	}
926 
927 	/* copy header of running firmware from SRAM to host memory to
928 	 * validate firmware */
929 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
930 	if (hdr == NULL) {
931 		device_printf(sc->dev, "could not malloc firmware hdr\n");
932 		return ENOMEM;
933 	}
934 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
935 				rman_get_bushandle(sc->mem_res),
936 				hdr_offset, (char *)hdr, bytes);
937 	status = mxge_validate_firmware(sc, hdr);
938 	free(hdr, M_DEVBUF);
939 
940 	/*
941 	 * check to see if adopted firmware has bug where adopting
942 	 * it will cause broadcasts to be filtered unless the NIC
943 	 * is kept in ALLMULTI mode
944 	 */
945 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
946 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
947 		sc->adopted_rx_filter_bug = 1;
948 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
949 			      "working around rx filter bug\n",
950 			      sc->fw_ver_major, sc->fw_ver_minor,
951 			      sc->fw_ver_tiny);
952 	}
953 
954 	return status;
955 }
956 
957 
958 static int
959 mxge_load_firmware(mxge_softc_t *sc, int adopt)
960 {
961 	volatile uint32_t *confirm;
962 	volatile char *submit;
963 	char buf_bytes[72];
964 	uint32_t *buf, size, dma_low, dma_high;
965 	int status, i;
966 
967 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
968 
969 	size = sc->sram_size;
970 	status = mxge_load_firmware_helper(sc, &size);
971 	if (status) {
972 		if (!adopt)
973 			return status;
974 		/* Try to use the currently running firmware, if
975 		   it is new enough */
976 		status = mxge_adopt_running_firmware(sc);
977 		if (status) {
978 			device_printf(sc->dev,
979 				      "failed to adopt running firmware\n");
980 			return status;
981 		}
982 		device_printf(sc->dev,
983 			      "Successfully adopted running firmware\n");
984 		if (sc->tx_boundary == 4096) {
985 			device_printf(sc->dev,
986 				"Using firmware currently running on NIC"
987 				 ".  For optimal\n");
988 			device_printf(sc->dev,
989 				 "performance consider loading optimized "
990 				 "firmware\n");
991 		}
992 		sc->fw_name = mxge_fw_unaligned;
993 		sc->tx_boundary = 2048;
994 		return 0;
995 	}
996 	/* clear confirmation addr */
997 	confirm = (volatile uint32_t *)sc->cmd;
998 	*confirm = 0;
999 	wmb();
1000 	/* send a reload command to the bootstrap MCP, and wait for the
1001 	   response in the confirmation address.  The firmware should
1002 	   write a -1 there to indicate it is alive and well
1003 	*/
1004 
1005 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
1006 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
1007 
1008 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
1009 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
1010 	buf[2] = htobe32(0xffffffff);	/* confirm data */
1011 
1012 	/* FIX: All newest firmware should un-protect the bottom of
1013 	   the sram before handoff. However, the very first interfaces
1014 	   do not. Therefore the handoff copy must skip the first 8 bytes
1015 	*/
1016 					/* where the code starts*/
1017 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
1018 	buf[4] = htobe32(size - 8); 	/* length of code */
1019 	buf[5] = htobe32(8);		/* where to copy to */
1020 	buf[6] = htobe32(0);		/* where to jump to */
1021 
1022 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
1023 	mxge_pio_copy(submit, buf, 64);
1024 	wmb();
1025 	DELAY(1000);
1026 	wmb();
1027 	i = 0;
1028 	while (*confirm != 0xffffffff && i < 20) {
1029 		DELAY(1000*10);
1030 		i++;
1031 		bus_dmamap_sync(sc->cmd_dma.dmat,
1032 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1033 	}
1034 	if (*confirm != 0xffffffff) {
1035 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
1036 			confirm, *confirm);
1037 
1038 		return ENXIO;
1039 	}
1040 	return 0;
1041 }
1042 
1043 static int
1044 mxge_update_mac_address(mxge_softc_t *sc)
1045 {
1046 	mxge_cmd_t cmd;
1047 	uint8_t *addr = sc->mac_addr;
1048 	int status;
1049 
1050 
1051 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1052 		     | (addr[2] << 8) | addr[3]);
1053 
1054 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1055 
1056 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1057 	return status;
1058 }
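/*
 * Worked example of the packing above, for the hypothetical address
 * 00:60:dd:47:ab:cd: cmd.data0 = 0x0060dd47 (bytes 0-3) and
 * cmd.data1 = 0x0000abcd (bytes 4-5).
 */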
1059 
1060 static int
1061 mxge_change_pause(mxge_softc_t *sc, int pause)
1062 {
1063 	mxge_cmd_t cmd;
1064 	int status;
1065 
1066 	if (pause)
1067 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1068 				       &cmd);
1069 	else
1070 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1071 				       &cmd);
1072 
1073 	if (status) {
1074 		device_printf(sc->dev, "Failed to set flow control mode\n");
1075 		return ENXIO;
1076 	}
1077 	sc->pause = pause;
1078 	return 0;
1079 }
1080 
1081 static void
1082 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1083 {
1084 	mxge_cmd_t cmd;
1085 	int status;
1086 
1087 	if (mxge_always_promisc)
1088 		promisc = 1;
1089 
1090 	if (promisc)
1091 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1092 				       &cmd);
1093 	else
1094 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1095 				       &cmd);
1096 
1097 	if (status) {
1098 		device_printf(sc->dev, "Failed to set promisc mode\n");
1099 	}
1100 }
1101 
1102 static void
1103 mxge_set_multicast_list(mxge_softc_t *sc)
1104 {
1105 	mxge_cmd_t cmd;
1106 	struct ifmultiaddr *ifma;
1107 	struct ifnet *ifp = sc->ifp;
1108 	int err;
1109 
1110 	/* This firmware is known to not support multicast */
1111 	if (!sc->fw_multicast_support)
1112 		return;
1113 
1114 	/* Disable multicast filtering while we play with the lists*/
1115 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1116 	if (err != 0) {
1117 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1118 		       " error status: %d\n", err);
1119 		return;
1120 	}
1121 
1122 	if (sc->adopted_rx_filter_bug)
1123 		return;
1124 
1125 	if (ifp->if_flags & IFF_ALLMULTI)
1126 		/* request to disable multicast filtering, so quit here */
1127 		return;
1128 
1129 	/* Flush all the filters */
1130 
1131 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1132 	if (err != 0) {
1133 		device_printf(sc->dev,
1134 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1135 			      ", error status: %d\n", err);
1136 		return;
1137 	}
1138 
1139 	/* Walk the multicast list, and add each address */
1140 
1141 	if_maddr_rlock(ifp);
1142 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1143 		if (ifma->ifma_addr->sa_family != AF_LINK)
1144 			continue;
1145 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1146 		      &cmd.data0, 4);
1147 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1148 		      &cmd.data1, 2);
1149 		cmd.data0 = htonl(cmd.data0);
1150 		cmd.data1 = htonl(cmd.data1);
1151 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1152 		if (err != 0) {
1153 			device_printf(sc->dev, "Failed "
1154 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
1155 			       "%d\n", err);
1156 			/* abort, leaving multicast filtering off */
1157 			if_maddr_runlock(ifp);
1158 			return;
1159 		}
1160 	}
1161 	if_maddr_runlock(ifp);
1162 	/* Enable multicast filtering */
1163 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1164 	if (err != 0) {
1165 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1166 		       ", error status: %d\n", err);
1167 	}
1168 }
1169 
1170 static int
1171 mxge_max_mtu(mxge_softc_t *sc)
1172 {
1173 	mxge_cmd_t cmd;
1174 	int status;
1175 
1176 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1177 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1178 
1179 	/* try to set nbufs to see if we can
1180 	   use virtually contiguous jumbos */
1181 	cmd.data0 = 0;
1182 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1183 			       &cmd);
1184 	if (status == 0)
1185 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1186 
1187 	/* otherwise, we're limited to MJUMPAGESIZE */
1188 	return MJUMPAGESIZE - MXGEFW_PAD;
1189 }
1190 
1191 static int
1192 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1193 {
1194 	struct mxge_slice_state *ss;
1195 	mxge_rx_done_t *rx_done;
1196 	volatile uint32_t *irq_claim;
1197 	mxge_cmd_t cmd;
1198 	int slice, status;
1199 
1200 	/* try to send a reset command to the card to see if it
1201 	   is alive */
1202 	memset(&cmd, 0, sizeof (cmd));
1203 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1204 	if (status != 0) {
1205 		device_printf(sc->dev, "failed reset\n");
1206 		return ENXIO;
1207 	}
1208 
1209 	mxge_dummy_rdma(sc, 1);
1210 
1211 
1212 	/* set the intrq size */
1213 	cmd.data0 = sc->rx_ring_size;
1214 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1215 
1216 	/*
1217 	 * Even though we already know how many slices are supported
1218 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1219 	 * has magic side effects, and must be called after a reset.
1220 	 * It must be called prior to calling any RSS related cmds,
1221 	 * including assigning an interrupt queue for anything but
1222 	 * slice 0.  It must also be called *after*
1223 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1224 	 * the firmware to compute offsets.
1225 	 */
1226 
1227 	if (sc->num_slices > 1) {
1228 		/* ask the maximum number of slices it supports */
1229 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1230 					   &cmd);
1231 		if (status != 0) {
1232 			device_printf(sc->dev,
1233 				      "failed to get number of slices\n");
1234 			return status;
1235 		}
1236 		/*
1237 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1238 		 * to setting up the interrupt queue DMA
1239 		 */
1240 		cmd.data0 = sc->num_slices;
1241 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1242 #ifdef IFNET_BUF_RING
1243 		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1244 #endif
1245 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1246 					   &cmd);
1247 		if (status != 0) {
1248 			device_printf(sc->dev,
1249 				      "failed to set number of slices\n");
1250 			return status;
1251 		}
1252 	}
1253 
1254 
1255 	if (interrupts_setup) {
1256 		/* Now exchange information about interrupts  */
1257 		for (slice = 0; slice < sc->num_slices; slice++) {
1258 			rx_done = &sc->ss[slice].rx_done;
1259 			memset(rx_done->entry, 0, sc->rx_ring_size);
1260 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1261 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1262 			cmd.data2 = slice;
1263 			status |= mxge_send_cmd(sc,
1264 						MXGEFW_CMD_SET_INTRQ_DMA,
1265 						&cmd);
1266 		}
1267 	}
1268 
1269 	status |= mxge_send_cmd(sc,
1270 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1271 
1272 
1273 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1274 
1275 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1276 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1277 
1278 
1279 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1280 				&cmd);
1281 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1282 	if (status != 0) {
1283 		device_printf(sc->dev, "failed set interrupt parameters\n");
1284 		return status;
1285 	}
1286 
1287 
1288 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1289 
1290 
1291 	/* run a DMA benchmark */
1292 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1293 
1294 	for (slice = 0; slice < sc->num_slices; slice++) {
1295 		ss = &sc->ss[slice];
1296 
1297 		ss->irq_claim = irq_claim + (2 * slice);
1298 		/* reset mcp/driver shared state back to 0 */
1299 		ss->rx_done.idx = 0;
1300 		ss->rx_done.cnt = 0;
1301 		ss->tx.req = 0;
1302 		ss->tx.done = 0;
1303 		ss->tx.pkt_done = 0;
1304 		ss->tx.queue_active = 0;
1305 		ss->tx.activate = 0;
1306 		ss->tx.deactivate = 0;
1307 		ss->tx.wake = 0;
1308 		ss->tx.defrag = 0;
1309 		ss->tx.stall = 0;
1310 		ss->rx_big.cnt = 0;
1311 		ss->rx_small.cnt = 0;
1312 		ss->lro_bad_csum = 0;
1313 		ss->lro_queued = 0;
1314 		ss->lro_flushed = 0;
1315 		if (ss->fw_stats != NULL) {
1316 			bzero(ss->fw_stats, sizeof *ss->fw_stats);
1317 		}
1318 	}
1319 	sc->rdma_tags_available = 15;
1320 	status = mxge_update_mac_address(sc);
1321 	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1322 	mxge_change_pause(sc, sc->pause);
1323 	mxge_set_multicast_list(sc);
1324 	if (sc->throttle) {
1325 		cmd.data0 = sc->throttle;
1326 		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
1327 				  &cmd)) {
1328 			device_printf(sc->dev,
1329 				      "can't enable throttle\n");
1330 		}
1331 	}
1332 	return status;
1333 }
1334 
1335 static int
1336 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1337 {
1338 	mxge_cmd_t cmd;
1339 	mxge_softc_t *sc;
1340 	int err;
1341 	unsigned int throttle;
1342 
1343 	sc = arg1;
1344 	throttle = sc->throttle;
1345 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1346         if (err != 0) {
1347                 return err;
1348         }
1349 
1350 	if (throttle == sc->throttle)
1351 		return 0;
1352 
1353         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1354                 return EINVAL;
1355 
1356 	mtx_lock(&sc->driver_mtx);
1357 	cmd.data0 = throttle;
1358 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1359 	if (err == 0)
1360 		sc->throttle = throttle;
1361 	mtx_unlock(&sc->driver_mtx);
1362 	return err;
1363 }
1364 
1365 static int
1366 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1367 {
1368         mxge_softc_t *sc;
1369         unsigned int intr_coal_delay;
1370         int err;
1371 
1372         sc = arg1;
1373         intr_coal_delay = sc->intr_coal_delay;
1374         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1375         if (err != 0) {
1376                 return err;
1377         }
1378         if (intr_coal_delay == sc->intr_coal_delay)
1379                 return 0;
1380 
1381         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1382                 return EINVAL;
1383 
1384 	mtx_lock(&sc->driver_mtx);
1385 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1386 	sc->intr_coal_delay = intr_coal_delay;
1387 
1388 	mtx_unlock(&sc->driver_mtx);
1389         return err;
1390 }
1391 
1392 static int
1393 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1394 {
1395         mxge_softc_t *sc;
1396         unsigned int enabled;
1397         int err;
1398 
1399         sc = arg1;
1400         enabled = sc->pause;
1401         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1402         if (err != 0) {
1403                 return err;
1404         }
1405         if (enabled == sc->pause)
1406                 return 0;
1407 
1408 	mtx_lock(&sc->driver_mtx);
1409 	err = mxge_change_pause(sc, enabled);
1410 	mtx_unlock(&sc->driver_mtx);
1411         return err;
1412 }
1413 
1414 static int
1415 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1416 {
1417 	struct ifnet *ifp;
1418 	int err = 0;
1419 
1420 	ifp = sc->ifp;
1421 	if (lro_cnt == 0)
1422 		ifp->if_capenable &= ~IFCAP_LRO;
1423 	else
1424 		ifp->if_capenable |= IFCAP_LRO;
1425 	sc->lro_cnt = lro_cnt;
1426 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1427 		mxge_close(sc, 0);
1428 		err = mxge_open(sc);
1429 	}
1430 	return err;
1431 }
1432 
1433 static int
1434 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1435 {
1436 	mxge_softc_t *sc;
1437 	unsigned int lro_cnt;
1438 	int err;
1439 
1440 	sc = arg1;
1441 	lro_cnt = sc->lro_cnt;
1442 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1443 	if (err != 0)
1444 		return err;
1445 
1446 	if (lro_cnt == sc->lro_cnt)
1447 		return 0;
1448 
1449 	if (lro_cnt > 128)
1450 		return EINVAL;
1451 
1452 	mtx_lock(&sc->driver_mtx);
1453 	err = mxge_change_lro_locked(sc, lro_cnt);
1454 	mtx_unlock(&sc->driver_mtx);
1455 	return err;
1456 }
1457 
1458 static int
1459 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1460 {
1461         int err;
1462 
1463         if (arg1 == NULL)
1464                 return EFAULT;
1465         arg2 = be32toh(*(int *)arg1);
1466         arg1 = NULL;
1467         err = sysctl_handle_int(oidp, arg1, arg2, req);
1468 
1469         return err;
1470 }
1471 
1472 static void
1473 mxge_rem_sysctls(mxge_softc_t *sc)
1474 {
1475 	struct mxge_slice_state *ss;
1476 	int slice;
1477 
1478 	if (sc->slice_sysctl_tree == NULL)
1479 		return;
1480 
1481 	for (slice = 0; slice < sc->num_slices; slice++) {
1482 		ss = &sc->ss[slice];
1483 		if (ss == NULL || ss->sysctl_tree == NULL)
1484 			continue;
1485 		sysctl_ctx_free(&ss->sysctl_ctx);
1486 		ss->sysctl_tree = NULL;
1487 	}
1488 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1489 	sc->slice_sysctl_tree = NULL;
1490 }
1491 
1492 static void
1493 mxge_add_sysctls(mxge_softc_t *sc)
1494 {
1495 	struct sysctl_ctx_list *ctx;
1496 	struct sysctl_oid_list *children;
1497 	mcp_irq_data_t *fw;
1498 	struct mxge_slice_state *ss;
1499 	int slice;
1500 	char slice_num[8];
1501 
1502 	ctx = device_get_sysctl_ctx(sc->dev);
1503 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1504 	fw = sc->ss[0].fw_stats;
1505 
1506 	/* random information */
1507 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1508 		       "firmware_version",
1509 		       CTLFLAG_RD, &sc->fw_version,
1510 		       0, "firmware version");
1511 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1512 		       "serial_number",
1513 		       CTLFLAG_RD, &sc->serial_number_string,
1514 		       0, "serial number");
1515 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1516 		       "product_code",
1517 		       CTLFLAG_RD, &sc->product_code_string,
1518 		       0, "product_code");
1519 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1520 		       "pcie_link_width",
1521 		       CTLFLAG_RD, &sc->link_width,
1522 		       0, "PCIe link width");
1523 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1524 		       "tx_boundary",
1525 		       CTLFLAG_RD, &sc->tx_boundary,
1526 		       0, "tx_boundary");
1527 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1528 		       "write_combine",
1529 		       CTLFLAG_RD, &sc->wc,
1530 		       0, "write combining PIO?");
1531 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1532 		       "read_dma_MBs",
1533 		       CTLFLAG_RD, &sc->read_dma,
1534 		       0, "DMA Read speed in MB/s");
1535 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1536 		       "write_dma_MBs",
1537 		       CTLFLAG_RD, &sc->write_dma,
1538 		       0, "DMA Write speed in MB/s");
1539 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1540 		       "read_write_dma_MBs",
1541 		       CTLFLAG_RD, &sc->read_write_dma,
1542 		       0, "DMA concurrent Read/Write speed in MB/s");
1543 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1544 		       "watchdog_resets",
1545 		       CTLFLAG_RD, &sc->watchdog_resets,
1546 		       0, "Number of times NIC was reset");
1547 
1548 
1549 	/* performance related tunables */
1550 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1551 			"intr_coal_delay",
1552 			CTLTYPE_INT|CTLFLAG_RW, sc,
1553 			0, mxge_change_intr_coal,
1554 			"I", "interrupt coalescing delay in usecs");
1555 
1556 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1557 			"throttle",
1558 			CTLTYPE_INT|CTLFLAG_RW, sc,
1559 			0, mxge_change_throttle,
1560 			"I", "transmit throttling");
1561 
1562 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1563 			"flow_control_enabled",
1564 			CTLTYPE_INT|CTLFLAG_RW, sc,
1565 			0, mxge_change_flow_control,
1566 			"I", "flow control enabled");
1567 
1568 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1569 		       "deassert_wait",
1570 		       CTLFLAG_RW, &mxge_deassert_wait,
1571 		       0, "Wait for IRQ line to go low in ihandler");
1572 
1573 	/* stats block from firmware is in network byte order.
1574 	   Need to swap it */
1575 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1576 			"link_up",
1577 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1578 			0, mxge_handle_be32,
1579 			"I", "link up");
1580 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1581 			"rdma_tags_available",
1582 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1583 			0, mxge_handle_be32,
1584 			"I", "rdma_tags_available");
1585 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1586 			"dropped_bad_crc32",
1587 			CTLTYPE_INT|CTLFLAG_RD,
1588 			&fw->dropped_bad_crc32,
1589 			0, mxge_handle_be32,
1590 			"I", "dropped_bad_crc32");
1591 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1592 			"dropped_bad_phy",
1593 			CTLTYPE_INT|CTLFLAG_RD,
1594 			&fw->dropped_bad_phy,
1595 			0, mxge_handle_be32,
1596 			"I", "dropped_bad_phy");
1597 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1598 			"dropped_link_error_or_filtered",
1599 			CTLTYPE_INT|CTLFLAG_RD,
1600 			&fw->dropped_link_error_or_filtered,
1601 			0, mxge_handle_be32,
1602 			"I", "dropped_link_error_or_filtered");
1603 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1604 			"dropped_link_overflow",
1605 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1606 			0, mxge_handle_be32,
1607 			"I", "dropped_link_overflow");
1608 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1609 			"dropped_multicast_filtered",
1610 			CTLTYPE_INT|CTLFLAG_RD,
1611 			&fw->dropped_multicast_filtered,
1612 			0, mxge_handle_be32,
1613 			"I", "dropped_multicast_filtered");
1614 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1615 			"dropped_no_big_buffer",
1616 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1617 			0, mxge_handle_be32,
1618 			"I", "dropped_no_big_buffer");
1619 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1620 			"dropped_no_small_buffer",
1621 			CTLTYPE_INT|CTLFLAG_RD,
1622 			&fw->dropped_no_small_buffer,
1623 			0, mxge_handle_be32,
1624 			"I", "dropped_no_small_buffer");
1625 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1626 			"dropped_overrun",
1627 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1628 			0, mxge_handle_be32,
1629 			"I", "dropped_overrun");
1630 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1631 			"dropped_pause",
1632 			CTLTYPE_INT|CTLFLAG_RD,
1633 			&fw->dropped_pause,
1634 			0, mxge_handle_be32,
1635 			"I", "dropped_pause");
1636 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1637 			"dropped_runt",
1638 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1639 			0, mxge_handle_be32,
1640 			"I", "dropped_runt");
1641 
1642 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1643 			"dropped_unicast_filtered",
1644 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1645 			0, mxge_handle_be32,
1646 			"I", "dropped_unicast_filtered");
1647 
1648 	/* verbose printing? */
1649 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1650 		       "verbose",
1651 		       CTLFLAG_RW, &mxge_verbose,
1652 		       0, "verbose printing");
1653 
1654 	/* lro */
1655 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1656 			"lro_cnt",
1657 			CTLTYPE_INT|CTLFLAG_RW, sc,
1658 			0, mxge_change_lro,
1659 			"I", "number of lro merge queues");
1660 
1661 
1662 	/* add counters exported for debugging from all slices */
1663 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1664 	sc->slice_sysctl_tree =
1665 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1666 				"slice", CTLFLAG_RD, 0, "");
1667 
1668 	for (slice = 0; slice < sc->num_slices; slice++) {
1669 		ss = &sc->ss[slice];
1670 		sysctl_ctx_init(&ss->sysctl_ctx);
1671 		ctx = &ss->sysctl_ctx;
1672 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1673 		sprintf(slice_num, "%d", slice);
1674 		ss->sysctl_tree =
1675 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1676 					CTLFLAG_RD, 0, "");
1677 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1678 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1679 			       "rx_small_cnt",
1680 			       CTLFLAG_RD, &ss->rx_small.cnt,
1681 			       0, "rx_small_cnt");
1682 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1683 			       "rx_big_cnt",
1684 			       CTLFLAG_RD, &ss->rx_big.cnt,
1685 			       0, "rx_big_cnt");
1686 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1687 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1688 			       0, "number of lro merge queues flushed");
1689 
1690 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1691 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1692 			       0, "number of frames appended to lro merge "
1693 			       "queues");
1694 
1695 #ifndef IFNET_BUF_RING
1696 		/* only transmit from slice 0 for now */
1697 		if (slice > 0)
1698 			continue;
1699 #endif
1700 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1701 			       "tx_req",
1702 			       CTLFLAG_RD, &ss->tx.req,
1703 			       0, "tx_req");
1704 
1705 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1706 			       "tx_done",
1707 			       CTLFLAG_RD, &ss->tx.done,
1708 			       0, "tx_done");
1709 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1710 			       "tx_pkt_done",
1711 			       CTLFLAG_RD, &ss->tx.pkt_done,
1712 			       0, "tx_pkt_done");
1713 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1714 			       "tx_stall",
1715 			       CTLFLAG_RD, &ss->tx.stall,
1716 			       0, "tx_stall");
1717 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1718 			       "tx_wake",
1719 			       CTLFLAG_RD, &ss->tx.wake,
1720 			       0, "tx_wake");
1721 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1722 			       "tx_defrag",
1723 			       CTLFLAG_RD, &ss->tx.defrag,
1724 			       0, "tx_defrag");
1725 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1726 			       "tx_queue_active",
1727 			       CTLFLAG_RD, &ss->tx.queue_active,
1728 			       0, "tx_queue_active");
1729 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1730 			       "tx_activate",
1731 			       CTLFLAG_RD, &ss->tx.activate,
1732 			       0, "tx_activate");
1733 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1734 			       "tx_deactivate",
1735 			       CTLFLAG_RD, &ss->tx.deactivate,
1736 			       0, "tx_deactivate");
1737 	}
1738 }
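/*
 * The knobs registered above appear under the device's sysctl tree.
 * Usage sketch (the unit number and values are examples only):
 *
 *	# sysctl dev.mxge.0.intr_coal_delay=30
 *	# sysctl dev.mxge.0.flow_control_enabled=1
 *	# sysctl dev.mxge.0.slice.0.tx_req
 */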
1739 
1740 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1741    backwards one at a time and handle ring wraps */
1742 
1743 static inline void
1744 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1745 			    mcp_kreq_ether_send_t *src, int cnt)
1746 {
1747         int idx, starting_slot;
1748         starting_slot = tx->req;
1749         while (cnt > 1) {
1750                 cnt--;
1751                 idx = (starting_slot + cnt) & tx->mask;
1752                 mxge_pio_copy(&tx->lanai[idx],
1753 			      &src[cnt], sizeof(*src));
1754                 wmb();
1755         }
1756 }
1757 
1758 /*
1759  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1760  * at most 32 bytes at a time, so as to avoid involving the software
1761  * pio handler in the nic.   We re-write the first segment's flags
1762  * to mark them valid only after writing the entire chain
1763  */
1764 
1765 static inline void
1766 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1767                   int cnt)
1768 {
1769         int idx, i;
1770         uint32_t *src_ints;
1771 	volatile uint32_t *dst_ints;
1772         mcp_kreq_ether_send_t *srcp;
1773 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1774 	uint8_t last_flags;
1775 
1776         idx = tx->req & tx->mask;
1777 
1778 	last_flags = src->flags;
1779 	src->flags = 0;
1780         wmb();
1781         dst = dstp = &tx->lanai[idx];
1782         srcp = src;
1783 
1784         if ((idx + cnt) < tx->mask) {
1785                 for (i = 0; i < (cnt - 1); i += 2) {
1786                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1787                         wmb(); /* force write every 32 bytes */
1788                         srcp += 2;
1789                         dstp += 2;
1790                 }
1791         } else {
1792                 /* submit all but the first request, and ensure
1793                    that it is submitted below */
1794                 mxge_submit_req_backwards(tx, src, cnt);
1795                 i = 0;
1796         }
1797         if (i < cnt) {
1798                 /* submit the first request */
1799                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1800                 wmb(); /* barrier before setting valid flag */
1801         }
1802 
1803         /* re-write the last 32-bits with the valid flags */
1804         src->flags = last_flags;
1805         src_ints = (uint32_t *)src;
1806         src_ints+=3;
1807         dst_ints = (volatile uint32_t *)dst;
1808         dst_ints+=3;
1809         *dst_ints =  *src_ints;
1810         tx->req += cnt;
1811         wmb();
1812 }
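/*
 * Worked example of the two copy paths above (hypothetical ring
 * state): with mask = 255, req = 254 and cnt = 3, idx + cnt = 257 is
 * not below the mask, so mxge_submit_req_backwards() writes slots 0
 * and 255 (newest first) and the forward path then writes slot 254,
 * before the first request's valid flags are rewritten last.
 */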
1813 
1814 #if IFCAP_TSO4
1815 
1816 static void
1817 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1818 	       int busdma_seg_cnt, int ip_off)
1819 {
1820 	mxge_tx_ring_t *tx;
1821 	mcp_kreq_ether_send_t *req;
1822 	bus_dma_segment_t *seg;
1823 	struct ip *ip;
1824 	struct tcphdr *tcp;
1825 	uint32_t low, high_swapped;
1826 	int len, seglen, cum_len, cum_len_next;
1827 	int next_is_first, chop, cnt, rdma_count, small;
1828 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1829 	uint8_t flags, flags_next;
1830 	static int once;
1831 
1832 	mss = m->m_pkthdr.tso_segsz;
1833 
1834 	/* negative cum_len signifies to the
1835 	 * send loop that we are still in the
1836 	 * header portion of the TSO packet.
1837 	 */
1838 
1839 	/* ensure we have the ethernet, IP and TCP
1840 	   header together in the first mbuf, copy
1841 	   it to a scratch buffer if not */
1842 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1843 		m_copydata(m, 0, ip_off + sizeof (*ip),
1844 			   ss->scratch);
1845 		ip = (struct ip *)(ss->scratch + ip_off);
1846 	} else {
1847 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1848 	}
1849 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1850 			    + sizeof (*tcp))) {
1851 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1852 			   + sizeof (*tcp), ss->scratch);
1853 		ip = (struct ip *)(ss->scratch + ip_off);
1854 	}
1855 
1856 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1857 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1858 	cksum_offset = ip_off + (ip->ip_hl << 2);
1859 
1860 	/* TSO implies checksum offload on this hardware */
1861 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP)) == 0)) {
1862 		/*
1863 		 * If packet has full TCP csum, replace it with pseudo hdr
1864 		 * sum that the NIC expects, otherwise the NIC will emit
1865 		 * packets with bad TCP checksums.
1866 		 */
1867 		m->m_pkthdr.csum_flags = CSUM_TCP;
1868 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1869 		tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1870 			htons(IPPROTO_TCP + (m->m_pkthdr.len - cksum_offset)));
1871 	}
1872 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1873 
1874 
1875 	/* for TSO, pseudo_hdr_offset holds mss.
1876 	 * The firmware figures out where to put
1877 	 * the checksum by parsing the header. */
1878 	pseudo_hdr_offset = htobe16(mss);
1879 
1880 	tx = &ss->tx;
1881 	req = tx->req_list;
1882 	seg = tx->seg_list;
1883 	cnt = 0;
1884 	rdma_count = 0;
1885 	/* "rdma_count" is the number of RDMAs belonging to the
1886 	 * current packet BEFORE the current send request. For
1887 	 * non-TSO packets, this is equal to "count".
1888 	 * For TSO packets, rdma_count needs to be reset
1889 	 * to 0 after a segment cut.
1890 	 *
1891 	 * The rdma_count field of the send request is
1892 	 * the number of RDMAs of the packet starting at
1893 	 * that request. For TSO send requests with one or more cuts
1894 	 * in the middle, this is the number of RDMAs starting
1895 	 * after the last cut in the request. All previous
1896 	 * segments before the last cut implicitly have 1 RDMA.
1897 	 *
1898 	 * Since the number of RDMAs is not known beforehand,
1899 	 * it must be filled-in retroactively - after each
1900 	 * segmentation cut or at the end of the entire packet.
1901 	 */
1902 
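	/* Worked example (hypothetical): a payload spanning three
	 * descriptors with no cut sees rdma_count advance 1, 2, 3;
	 * each pass rewrites the count at (req - rdma_count), the
	 * first descriptor of the run, which thus ends up holding 3.
	 * A chop or a segment boundary resets rdma_count so the
	 * next run is counted from its own first descriptor.
	 */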
1903 	while (busdma_seg_cnt) {
1904 		/* Break the busdma segment up into pieces */
1905 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1906 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1907 		len = seg->ds_len;
1908 
1909 		while (len) {
1910 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1911 			seglen = len;
1912 			cum_len_next = cum_len + seglen;
1913 			(req-rdma_count)->rdma_count = rdma_count + 1;
1914 			if (__predict_true(cum_len >= 0)) {
1915 				/* payload */
1916 				chop = (cum_len_next > mss);
1917 				cum_len_next = cum_len_next % mss;
1918 				next_is_first = (cum_len_next == 0);
1919 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1920 				flags_next |= next_is_first *
1921 					MXGEFW_FLAGS_FIRST;
1922 				rdma_count |= -(chop | next_is_first);
1923 				rdma_count += chop & !next_is_first;
1924 			} else if (cum_len_next >= 0) {
1925 				/* header ends */
1926 				rdma_count = -1;
1927 				cum_len_next = 0;
1928 				seglen = -cum_len;
1929 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1930 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1931 					MXGEFW_FLAGS_FIRST |
1932 					(small * MXGEFW_FLAGS_SMALL);
1933 			}
1934 
1935 			req->addr_high = high_swapped;
1936 			req->addr_low = htobe32(low);
1937 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1938 			req->pad = 0;
1939 			req->rdma_count = 1;
1940 			req->length = htobe16(seglen);
1941 			req->cksum_offset = cksum_offset;
1942 			req->flags = flags | ((cum_len & 1) *
1943 					      MXGEFW_FLAGS_ALIGN_ODD);
1944 			low += seglen;
1945 			len -= seglen;
1946 			cum_len = cum_len_next;
1947 			flags = flags_next;
1948 			req++;
1949 			cnt++;
1950 			rdma_count++;
1951 			if (__predict_false(cksum_offset > seglen))
1952 				cksum_offset -= seglen;
1953 			else
1954 				cksum_offset = 0;
1955 			if (__predict_false(cnt > tx->max_desc))
1956 				goto drop;
1957 		}
1958 		busdma_seg_cnt--;
1959 		seg++;
1960 	}
1961 	(req-rdma_count)->rdma_count = rdma_count;
1962 
1963 	do {
1964 		req--;
1965 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1966 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1967 
1968 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1969 	mxge_submit_req(tx, tx->req_list, cnt);
1970 #ifdef IFNET_BUF_RING
1971 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1972 		/* tell the NIC to start polling this slice */
1973 		*tx->send_go = 1;
1974 		tx->queue_active = 1;
1975 		tx->activate++;
1976 		wmb();
1977 	}
1978 #endif
1979 	return;
1980 
1981 drop:
1982 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1983 	m_freem(m);
1984 	ss->oerrors++;
1985 	if (!once) {
1986 		printf("tx->max_desc exceeded via TSO!\n");
1987 		printf("mss = %d, seg offset = %ld, max_desc = %d!\n", mss,
1988 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1989 		once = 1;
1990 	}
1991 	return;
1992 
1993 }
1994 
1995 #endif /* IFCAP_TSO4 */
1996 
1997 #ifdef MXGE_NEW_VLAN_API
1998 /*
1999  * We reproduce the software vlan tag insertion from
2000  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2001  * vlan tag insertion. We need to advertise this in order to have the
2002  * vlan interface respect our csum offload flags.
2003  */
2004 static struct mbuf *
2005 mxge_vlan_tag_insert(struct mbuf *m)
2006 {
2007 	struct ether_vlan_header *evl;
2008 
2009 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
2010 	if (__predict_false(m == NULL))
2011 		return NULL;
2012 	if (m->m_len < sizeof(*evl)) {
2013 		m = m_pullup(m, sizeof(*evl));
2014 		if (__predict_false(m == NULL))
2015 			return NULL;
2016 	}
2017 	/*
2018 	 * Transform the Ethernet header into an Ethernet header
2019 	 * with 802.1Q encapsulation.
2020 	 */
2021 	evl = mtod(m, struct ether_vlan_header *);
2022 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2023 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2024 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2025 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2026 	m->m_flags &= ~M_VLANTAG;
2027 	return m;
2028 }
2029 #endif /* MXGE_NEW_VLAN_API */
2030 
2031 static void
2032 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2033 {
2034 	mxge_softc_t *sc;
2035 	mcp_kreq_ether_send_t *req;
2036 	bus_dma_segment_t *seg;
2037 	struct mbuf *m_tmp;
2038 	struct ifnet *ifp;
2039 	mxge_tx_ring_t *tx;
2040 	struct ip *ip;
2041 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2042 	uint16_t pseudo_hdr_offset;
2043         uint8_t flags, cksum_offset;
2044 
2045 
2046 	sc = ss->sc;
2047 	ifp = sc->ifp;
2048 	tx = &ss->tx;
2049 
2050 	ip_off = sizeof (struct ether_header);
2051 #ifdef MXGE_NEW_VLAN_API
2052 	if (m->m_flags & M_VLANTAG) {
2053 		m = mxge_vlan_tag_insert(m);
2054 		if (__predict_false(m == NULL))
2055 			goto drop;
2056 		ip_off += ETHER_VLAN_ENCAP_LEN;
2057 	}
2058 #endif
2059 	/* (try to) map the frame for DMA */
2060 	idx = tx->req & tx->mask;
2061 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2062 				      m, tx->seg_list, &cnt,
2063 				      BUS_DMA_NOWAIT);
2064 	if (__predict_false(err == EFBIG)) {
2065 		/* Too many segments in the chain.  Try
2066 		   to defrag */
2067 		m_tmp = m_defrag(m, M_NOWAIT);
2068 		if (m_tmp == NULL) {
2069 			goto drop;
2070 		}
2071 		ss->tx.defrag++;
2072 		m = m_tmp;
2073 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2074 					      tx->info[idx].map,
2075 					      m, tx->seg_list, &cnt,
2076 					      BUS_DMA_NOWAIT);
2077 	}
2078 	if (__predict_false(err != 0)) {
2079 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2080 			      " packet len = %d\n", err, m->m_pkthdr.len);
2081 		goto drop;
2082 	}
2083 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2084 			BUS_DMASYNC_PREWRITE);
2085 	tx->info[idx].m = m;
2086 
2087 #if IFCAP_TSO4
2088 	/* TSO is different enough, we handle it in another routine */
2089 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2090 		mxge_encap_tso(ss, m, cnt, ip_off);
2091 		return;
2092 	}
2093 #endif
2094 
2095 	req = tx->req_list;
2096 	cksum_offset = 0;
2097 	pseudo_hdr_offset = 0;
2098 	flags = MXGEFW_FLAGS_NO_TSO;
2099 
2100 	/* checksum offloading? */
2101 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2102 		/* ensure ip header is in first mbuf, copy
2103 		   it to a scratch buffer if not */
2104 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2105 			m_copydata(m, 0, ip_off + sizeof (*ip),
2106 				   ss->scratch);
2107 			ip = (struct ip *)(ss->scratch + ip_off);
2108 		} else {
2109 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2110 		}
2111 		cksum_offset = ip_off + (ip->ip_hl << 2);
2112 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2113 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2114 		req->cksum_offset = cksum_offset;
2115 		flags |= MXGEFW_FLAGS_CKSUM;
2116 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2117 	} else {
2118 		odd_flag = 0;
2119 	}
2120 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2121 		flags |= MXGEFW_FLAGS_SMALL;
2122 
2123 	/* convert segments into a request list */
2124 	cum_len = 0;
2125 	seg = tx->seg_list;
2126 	req->flags = MXGEFW_FLAGS_FIRST;
2127 	for (i = 0; i < cnt; i++) {
2128 		req->addr_low =
2129 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2130 		req->addr_high =
2131 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2132 		req->length = htobe16(seg->ds_len);
2133 		req->cksum_offset = cksum_offset;
2134 		if (cksum_offset > seg->ds_len)
2135 			cksum_offset -= seg->ds_len;
2136 		else
2137 			cksum_offset = 0;
2138 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2139 		req->pad = 0; /* complete solid 16-byte block */
2140 		req->rdma_count = 1;
2141 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2142 		cum_len += seg->ds_len;
2143 		seg++;
2144 		req++;
2145 		req->flags = 0;
2146 	}
2147 	req--;
2148 	/* pad runts to 60 bytes (the minimum Ethernet frame, less FCS) */
2149 	if (cum_len < 60) {
2150 		req++;
2151 		req->addr_low =
2152 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2153 		req->addr_high =
2154 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2155 		req->length = htobe16(60 - cum_len);
2156 		req->cksum_offset = 0;
2157 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2158 		req->pad = 0; /* complete solid 16-byte block */
2159 		req->rdma_count = 1;
2160 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2161 		cnt++;
2162 	}
2163 
2164 	tx->req_list[0].rdma_count = cnt;
2165 #if 0
2166 	/* print what the firmware will see */
2167 	for (i = 0; i < cnt; i++) {
2168 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2169 		    "cso:%d, flags:0x%x, rdma:%d\n",
2170 		    i, (int)ntohl(tx->req_list[i].addr_high),
2171 		    (int)ntohl(tx->req_list[i].addr_low),
2172 		    (int)ntohs(tx->req_list[i].length),
2173 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2174 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2175 		    tx->req_list[i].rdma_count);
2176 	}
2177 	printf("--------------\n");
2178 #endif
2179 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2180 	mxge_submit_req(tx, tx->req_list, cnt);
2181 #ifdef IFNET_BUF_RING
2182 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2183 		/* tell the NIC to start polling this slice */
2184 		*tx->send_go = 1;
2185 		tx->queue_active = 1;
2186 		tx->activate++;
2187 		wmb();
2188 	}
2189 #endif
2190 	return;
2191 
2192 drop:
2193 	m_freem(m);
2194 	ss->oerrors++;
2195 	return;
2196 }
2197 
2198 #ifdef IFNET_BUF_RING
2199 static void
2200 mxge_qflush(struct ifnet *ifp)
2201 {
2202 	mxge_softc_t *sc = ifp->if_softc;
2203 	mxge_tx_ring_t *tx;
2204 	struct mbuf *m;
2205 	int slice;
2206 
2207 	for (slice = 0; slice < sc->num_slices; slice++) {
2208 		tx = &sc->ss[slice].tx;
2209 		mtx_lock(&tx->mtx);
2210 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2211 			m_freem(m);
2212 		mtx_unlock(&tx->mtx);
2213 	}
2214 	if_qflush(ifp);
2215 }
2216 
2217 static inline void
2218 mxge_start_locked(struct mxge_slice_state *ss)
2219 {
2220 	mxge_softc_t *sc;
2221 	struct mbuf *m;
2222 	struct ifnet *ifp;
2223 	mxge_tx_ring_t *tx;
2224 
2225 	sc = ss->sc;
2226 	ifp = sc->ifp;
2227 	tx = &ss->tx;
2228 
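	/* dequeue only while more than max_desc slots remain free,
	   since a single (TSO) frame may need up to tx->max_desc
	   send descriptors */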
2229 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2230 		m = drbr_dequeue(ifp, tx->br);
2231 		if (m == NULL) {
2232 			return;
2233 		}
2234 		/* let BPF see it */
2235 		BPF_MTAP(ifp, m);
2236 
2237 		/* give it to the nic */
2238 		mxge_encap(ss, m);
2239 	}
2240 	/* ran out of transmit slots */
2241 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2242 	    && (!drbr_empty(ifp, tx->br))) {
2243 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2244 		tx->stall++;
2245 	}
2246 }
2247 
2248 static int
2249 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2250 {
2251 	mxge_softc_t *sc;
2252 	struct ifnet *ifp;
2253 	mxge_tx_ring_t *tx;
2254 	int err;
2255 
2256 	sc = ss->sc;
2257 	ifp = sc->ifp;
2258 	tx = &ss->tx;
2259 
2260 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2261 	    IFF_DRV_RUNNING) {
2262 		err = drbr_enqueue(ifp, tx->br, m);
2263 		return (err);
2264 	}
2265 
2266 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2267 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2268 		/* let BPF see it */
2269 		BPF_MTAP(ifp, m);
2270 		/* give it to the nic */
2271 		mxge_encap(ss, m);
2272 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2273 		return (err);
2274 	}
2275 	if (!drbr_empty(ifp, tx->br))
2276 		mxge_start_locked(ss);
2277 	return (0);
2278 }
2279 
2280 static int
2281 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2282 {
2283 	mxge_softc_t *sc = ifp->if_softc;
2284 	struct mxge_slice_state *ss;
2285 	mxge_tx_ring_t *tx;
2286 	int err = 0;
2287 	int slice;
2288 
2289 	slice = m->m_pkthdr.flowid;
2290 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
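	/* e.g. with 4 slices, a flowid of 13 selects slice 13 & 3 == 1 */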
2291 
2292 	ss = &sc->ss[slice];
2293 	tx = &ss->tx;
2294 
2295 	if (mtx_trylock(&tx->mtx)) {
2296 		err = mxge_transmit_locked(ss, m);
2297 		mtx_unlock(&tx->mtx);
2298 	} else {
2299 		err = drbr_enqueue(ifp, tx->br, m);
2300 	}
2301 
2302 	return (err);
2303 }
2304 
2305 #else
2306 
2307 static inline void
2308 mxge_start_locked(struct mxge_slice_state *ss)
2309 {
2310 	mxge_softc_t *sc;
2311 	struct mbuf *m;
2312 	struct ifnet *ifp;
2313 	mxge_tx_ring_t *tx;
2314 
2315 	sc = ss->sc;
2316 	ifp = sc->ifp;
2317 	tx = &ss->tx;
2318 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2319 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2320 		if (m == NULL) {
2321 			return;
2322 		}
2323 		/* let BPF see it */
2324 		BPF_MTAP(ifp, m);
2325 
2326 		/* give it to the nic */
2327 		mxge_encap(ss, m);
2328 	}
2329 	/* ran out of transmit slots */
2330 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2331 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2332 		tx->stall++;
2333 	}
2334 }
2335 #endif
2336 static void
2337 mxge_start(struct ifnet *ifp)
2338 {
2339 	mxge_softc_t *sc = ifp->if_softc;
2340 	struct mxge_slice_state *ss;
2341 
2342 	/* only use the first slice for now */
2343 	ss = &sc->ss[0];
2344 	mtx_lock(&ss->tx.mtx);
2345 	mxge_start_locked(ss);
2346 	mtx_unlock(&ss->tx.mtx);
2347 }
2348 
2349 /*
2350  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2351  * at most 32 bytes at a time, so as to avoid involving the software
2352  * pio handler in the nic.   We re-write the first segment's low
2353  * DMA address to mark it valid only after we write the entire chunk
2354  * in a burst
2355  */
2356 static inline void
2357 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2358 		mcp_kreq_ether_recv_t *src)
2359 {
2360 	uint32_t low;
2361 
2362 	low = src->addr_low;
2363 	src->addr_low = 0xffffffff;
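	/* poison the first entry's address so the NIC treats the
	   8-entry burst as invalid until the real address is
	   restored below */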
2364 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2365 	wmb();
2366 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2367 	wmb();
2368 	src->addr_low = low;
2369 	dst->addr_low = low;
2370 	wmb();
2371 }
2372 
2373 static int
2374 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2375 {
2376 	bus_dma_segment_t seg;
2377 	struct mbuf *m;
2378 	mxge_rx_ring_t *rx = &ss->rx_small;
2379 	int cnt, err;
2380 
2381 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2382 	if (m == NULL) {
2383 		rx->alloc_fail++;
2384 		err = ENOBUFS;
2385 		goto done;
2386 	}
2387 	m->m_len = MHLEN;
2388 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2389 				      &seg, &cnt, BUS_DMA_NOWAIT);
2390 	if (err != 0) {
2391 		m_free(m);
2392 		goto done;
2393 	}
2394 	rx->info[idx].m = m;
2395 	rx->shadow[idx].addr_low =
2396 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2397 	rx->shadow[idx].addr_high =
2398 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2399 
2400 done:
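	/* receive buffers are handed to the NIC only in aligned
	   groups of 8, matching the two 32-byte bursts done by
	   mxge_submit_8rx() */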
2401 	if ((idx & 7) == 7)
2402 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2403 	return err;
2404 }
2405 
2406 static int
2407 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2408 {
2409 	bus_dma_segment_t seg[3];
2410 	struct mbuf *m;
2411 	mxge_rx_ring_t *rx = &ss->rx_big;
2412 	int cnt, err, i;
2413 
2414 	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2415 	if (m == NULL) {
2416 		rx->alloc_fail++;
2417 		err = ENOBUFS;
2418 		goto done;
2419 	}
2420 	m->m_len = rx->mlen;
2421 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2422 				      seg, &cnt, BUS_DMA_NOWAIT);
2423 	if (err != 0) {
2424 		m_free(m);
2425 		goto done;
2426 	}
2427 	rx->info[idx].m = m;
2428 	rx->shadow[idx].addr_low =
2429 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2430 	rx->shadow[idx].addr_high =
2431 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2432 
2433 #if MXGE_VIRT_JUMBOS
2434 	for (i = 1; i < cnt; i++) {
2435 		rx->shadow[idx + i].addr_low =
2436 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2437 		rx->shadow[idx + i].addr_high =
2438 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2439 	}
2440 #endif
2441 
2442 done:
2443 	for (i = 0; i < rx->nbufs; i++) {
2444 		if ((idx & 7) == 7) {
2445 			mxge_submit_8rx(&rx->lanai[idx - 7],
2446 					&rx->shadow[idx - 7]);
2447 		}
2448 		idx++;
2449 	}
2450 	return err;
2451 }
2452 
2453 /*
2454  *  Myri10GE hardware checksums are not valid if the sender
2455  *  padded the frame with non-zero padding.  This is because
2456  *  the firmware just does a simple 16-bit 1s complement
2457  *  checksum across the entire frame, excluding the first 14
2458  *  bytes.  It is best to simply check the checksum and
2459  *  tell the stack about it only if the checksum is good
2460  */
2461 
2462 static inline uint16_t
2463 mxge_rx_csum(struct mbuf *m, int csum)
2464 {
2465 	struct ether_header *eh;
2466 	struct ip *ip;
2467 	uint16_t c;
2468 
2469 	eh = mtod(m, struct ether_header *);
2470 
2471 	/* only deal with IPv4 TCP & UDP for now */
2472 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2473 		return 1;
2474 	ip = (struct ip *)(eh + 1);
2475 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2476 			    ip->ip_p != IPPROTO_UDP))
2477 		return 1;
2478 #ifdef INET
2479 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2480 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2481 			    (ip->ip_hl << 2) + ip->ip_p));
2482 #else
2483 	c = 1;
2484 #endif
2485 	c ^= 0xffff;
2486 	return (c);
2487 }
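
/*
 * Illustrative sketch (not part of the driver; csum_fold is a
 * hypothetical helper): the 16-bit one's complement folding that
 * in_pseudo() performs above, and which mxge_vlan_tag_remove()
 * below spells out inline.
 */
#if 0
static uint16_t
csum_fold(uint32_t sum)
{
	/* add the carries back in; two folds always suffice */
	sum = (sum >> 16) + (sum & 0xffff);
	sum = (sum >> 16) + (sum & 0xffff);
	return ((uint16_t)sum);
}
#endif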
2488 
2489 static void
2490 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2491 {
2492 	struct ether_vlan_header *evl;
2493 	struct ether_header *eh;
2494 	uint32_t partial;
2495 
2496 	evl = mtod(m, struct ether_vlan_header *);
2497 	eh = mtod(m, struct ether_header *);
2498 
2499 	/*
2500 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2501 	 * after what the firmware thought was the end of the ethernet
2502 	 * header.
2503 	 */
2504 
2505 	/* put checksum into host byte order */
2506 	*csum = ntohs(*csum);
2507 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2508 	(*csum) += ~partial;
2509 	(*csum) +=  ((*csum) < ~partial);
2510 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2511 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2512 
2513 	/* restore checksum to network byte order;
2514 	   later consumers expect this */
2515 	*csum = htons(*csum);
2516 
2517 	/* save the tag */
2518 #ifdef MXGE_NEW_VLAN_API
2519 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2520 #else
2521 	{
2522 		struct m_tag *mtag;
2523 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2524 				   M_NOWAIT);
2525 		if (mtag == NULL)
2526 			return;
2527 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2528 		m_tag_prepend(m, mtag);
2529 	}
2530 
2531 #endif
2532 	m->m_flags |= M_VLANTAG;
2533 
2534 	/*
2535 	 * Remove the 802.1q header by copying the Ethernet
2536 	 * addresses over it and adjusting the beginning of
2537 	 * the data in the mbuf.  The encapsulated Ethernet
2538 	 * type field is already in place.
2539 	 */
2540 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2541 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2542 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2543 }
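
/*
 * One's complement note on the code above: subtracting the four
 * 802.1q bytes is done by adding their complement (~partial) plus
 * the end-around carry, then folding the 32-bit sum back to 16 bits.
 */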
2544 
2545 
2546 static inline void
2547 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2548 {
2549 	mxge_softc_t *sc;
2550 	struct ifnet *ifp;
2551 	struct mbuf *m;
2552 	struct ether_header *eh;
2553 	mxge_rx_ring_t *rx;
2554 	bus_dmamap_t old_map;
2555 	int idx;
2556 	uint16_t tcpudp_csum;
2557 
2558 	sc = ss->sc;
2559 	ifp = sc->ifp;
2560 	rx = &ss->rx_big;
2561 	idx = rx->cnt & rx->mask;
2562 	rx->cnt += rx->nbufs;
2563 	/* save a pointer to the received mbuf */
2564 	m = rx->info[idx].m;
2565 	/* try to replace the received mbuf */
2566 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2567 		/* drop the frame -- the old mbuf is re-cycled */
2568 		ifp->if_ierrors++;
2569 		return;
2570 	}
2571 
2572 	/* unmap the received buffer */
2573 	old_map = rx->info[idx].map;
2574 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2575 	bus_dmamap_unload(rx->dmat, old_map);
2576 
2577 	/* swap the bus_dmamap_t's */
2578 	rx->info[idx].map = rx->extra_map;
2579 	rx->extra_map = old_map;
2580 
2581 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2582 	 * aligned */
2583 	m->m_data += MXGEFW_PAD;
2584 
2585 	m->m_pkthdr.rcvif = ifp;
2586 	m->m_len = m->m_pkthdr.len = len;
2587 	ss->ipackets++;
2588 	eh = mtod(m, struct ether_header *);
2589 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2590 		mxge_vlan_tag_remove(m, &csum);
2591 	}
2592 	/* if the checksum is valid, mark it in the mbuf header */
2593 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2594 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2595 			return;
2596 		/* otherwise, it was a UDP frame, or a TCP frame which
2597 		   we could not do LRO on.  Tell the stack that the
2598 		   checksum is good */
2599 		m->m_pkthdr.csum_data = 0xffff;
2600 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2601 	}
2602 	/* flowid only valid if RSS hashing is enabled */
2603 	if (sc->num_slices > 1) {
2604 		m->m_pkthdr.flowid = (ss - sc->ss);
2605 		m->m_flags |= M_FLOWID;
2606 	}
2607 	/* pass the frame up the stack */
2608 	(*ifp->if_input)(ifp, m);
2609 }
2610 
2611 static inline void
2612 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2613 {
2614 	mxge_softc_t *sc;
2615 	struct ifnet *ifp;
2616 	struct ether_header *eh;
2617 	struct mbuf *m;
2618 	mxge_rx_ring_t *rx;
2619 	bus_dmamap_t old_map;
2620 	int idx;
2621 	uint16_t tcpudp_csum;
2622 
2623 	sc = ss->sc;
2624 	ifp = sc->ifp;
2625 	rx = &ss->rx_small;
2626 	idx = rx->cnt & rx->mask;
2627 	rx->cnt++;
2628 	/* save a pointer to the received mbuf */
2629 	m = rx->info[idx].m;
2630 	/* try to replace the received mbuf */
2631 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2632 		/* drop the frame -- the old mbuf is re-cycled */
2633 		ifp->if_ierrors++;
2634 		return;
2635 	}
2636 
2637 	/* unmap the received buffer */
2638 	old_map = rx->info[idx].map;
2639 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2640 	bus_dmamap_unload(rx->dmat, old_map);
2641 
2642 	/* swap the bus_dmamap_t's */
2643 	rx->info[idx].map = rx->extra_map;
2644 	rx->extra_map = old_map;
2645 
2646 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2647 	 * aligned */
2648 	m->m_data += MXGEFW_PAD;
2649 
2650 	m->m_pkthdr.rcvif = ifp;
2651 	m->m_len = m->m_pkthdr.len = len;
2652 	ss->ipackets++;
2653 	eh = mtod(m, struct ether_header *);
2654 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2655 		mxge_vlan_tag_remove(m, &csum);
2656 	}
2657 	/* if the checksum is valid, mark it in the mbuf header */
2658 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2659 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2660 			return;
2661 		/* otherwise, it was a UDP frame, or a TCP frame which
2662 		   we could not do LRO on.  Tell the stack that the
2663 		   checksum is good */
2664 		m->m_pkthdr.csum_data = 0xffff;
2665 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2666 	}
2667 	/* flowid only valid if RSS hashing is enabled */
2668 	if (sc->num_slices > 1) {
2669 		m->m_pkthdr.flowid = (ss - sc->ss);
2670 		m->m_flags |= M_FLOWID;
2671 	}
2672 	/* pass the frame up the stack */
2673 	(*ifp->if_input)(ifp, m);
2674 }
2675 
2676 static inline void
2677 mxge_clean_rx_done(struct mxge_slice_state *ss)
2678 {
2679 	mxge_rx_done_t *rx_done = &ss->rx_done;
2680 	int limit = 0;
2681 	uint16_t length;
2682 	uint16_t checksum;
2683 
2684 
2685 	while (rx_done->entry[rx_done->idx].length != 0) {
2686 		length = ntohs(rx_done->entry[rx_done->idx].length);
2687 		rx_done->entry[rx_done->idx].length = 0;
2688 		checksum = rx_done->entry[rx_done->idx].checksum;
2689 		if (length <= (MHLEN - MXGEFW_PAD))
2690 			mxge_rx_done_small(ss, length, checksum);
2691 		else
2692 			mxge_rx_done_big(ss, length, checksum);
2693 		rx_done->cnt++;
2694 		rx_done->idx = rx_done->cnt & rx_done->mask;
2695 
2696 		/* limit potential for livelock */
2697 		if (__predict_false(++limit > rx_done->mask / 2))
2698 			break;
2699 	}
2700 #ifdef INET
2701 	while (!SLIST_EMPTY(&ss->lro_active)) {
2702 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2703 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2704 		mxge_lro_flush(ss, lro);
2705 	}
2706 #endif
2707 }
2708 
2709 
2710 static inline void
2711 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2712 {
2713 	struct ifnet *ifp;
2714 	mxge_tx_ring_t *tx;
2715 	struct mbuf *m;
2716 	bus_dmamap_t map;
2717 	int idx;
2718 	int *flags;
2719 
2720 	tx = &ss->tx;
2721 	ifp = ss->sc->ifp;
2722 	while (tx->pkt_done != mcp_idx) {
2723 		idx = tx->done & tx->mask;
2724 		tx->done++;
2725 		m = tx->info[idx].m;
2726 		/* mbuf and DMA map only attached to the first
2727 		   segment per-mbuf */
2728 		if (m != NULL) {
2729 			ss->obytes += m->m_pkthdr.len;
2730 			if (m->m_flags & M_MCAST)
2731 				ss->omcasts++;
2732 			ss->opackets++;
2733 			tx->info[idx].m = NULL;
2734 			map = tx->info[idx].map;
2735 			bus_dmamap_unload(tx->dmat, map);
2736 			m_freem(m);
2737 		}
2738 		if (tx->info[idx].flag) {
2739 			tx->info[idx].flag = 0;
2740 			tx->pkt_done++;
2741 		}
2742 	}
2743 
2744 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack that
2745 	   it's OK to send packets */
2746 #ifdef IFNET_BUF_RING
2747 	flags = &ss->if_drv_flags;
2748 #else
2749 	flags = &ifp->if_drv_flags;
2750 #endif
2751 	mtx_lock(&ss->tx.mtx);
2752 	if ((*flags) & IFF_DRV_OACTIVE &&
2753 	    tx->req - tx->done < (tx->mask + 1)/4) {
2754 		*(flags) &= ~IFF_DRV_OACTIVE;
2755 		ss->tx.wake++;
2756 		mxge_start_locked(ss);
2757 	}
2758 #ifdef IFNET_BUF_RING
2759 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2760 		/* let the NIC stop polling this queue, since there
2761 		 * are no more transmits pending */
2762 		*tx->send_stop = 1;
2763 		tx->queue_active = 0;
2764 		tx->deactivate++;
2765 		wmb();
2766 	}
2769 #endif
2770 	mtx_unlock(&ss->tx.mtx);
2771 
2772 }
2773 
2774 static struct mxge_media_type mxge_xfp_media_types[] =
2775 {
2776 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2777 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2778 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2779 	{0,		(1 << 5),	"10GBASE-ER"},
2780 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2781 	{0,		(1 << 3),	"10GBASE-SW"},
2782 	{0,		(1 << 2),	"10GBASE-LW"},
2783 	{0,		(1 << 1),	"10GBASE-EW"},
2784 	{0,		(1 << 0),	"Reserved"}
2785 };
2786 static struct mxge_media_type mxge_sfp_media_types[] =
2787 {
2788 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2789 	{0,		(1 << 7),	"Reserved"},
2790 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2791 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2792 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2793 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2794 };
2795 
2796 static void
2797 mxge_media_set(mxge_softc_t *sc, int media_type)
2798 {
2799 
2800 
2801 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2802 		    0, NULL);
2803 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2804 	sc->current_media = media_type;
2805 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2806 }
2807 
2808 static void
2809 mxge_media_init(mxge_softc_t *sc)
2810 {
2811 	char *ptr;
2812 	int i;
2813 
2814 	ifmedia_removeall(&sc->media);
2815 	mxge_media_set(sc, IFM_AUTO);
2816 
2817 	/*
2818 	 * parse the product code to determine the interface type
2819 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2820 	 * after the 3rd dash in the driver's cached copy of the
2821 	 * EEPROM's product code string.
2822 	 */
2823 	ptr = sc->product_code_string;
2824 	if (ptr == NULL) {
2825 		device_printf(sc->dev, "Missing product code\n");
2826 		return;
2827 	}
2828 
2829 	for (i = 0; i < 3; i++, ptr++) {
2830 		ptr = index(ptr, '-');
2831 		if (ptr == NULL) {
2832 			device_printf(sc->dev,
2833 				      "only %d dashes in PC?!?\n", i);
2834 			return;
2835 		}
2836 	}
2837 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2838 		/* -C is CX4 */
2839 		sc->connector = MXGE_CX4;
2840 		mxge_media_set(sc, IFM_10G_CX4);
2841 	} else if (*ptr == 'Q') {
2842 		/* -Q is Quad Ribbon Fiber */
2843 		sc->connector = MXGE_QRF;
2844 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2845 		/* FreeBSD has no media type for Quad ribbon fiber */
2846 	} else if (*ptr == 'R') {
2847 		/* -R is XFP */
2848 		sc->connector = MXGE_XFP;
2849 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2850 		/* -S or -2S is SFP+ */
2851 		sc->connector = MXGE_SFP;
2852 	} else {
2853 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2854 	}
2855 }
2856 
2857 /*
2858  * Determine the media type for a NIC.  Some XFPs will identify
2859  * themselves only when their link is up, so this is initiated via a
2860  * link up interrupt.  However, this can potentially take up to
2861  * several milliseconds, so it is run via the watchdog routine, rather
2862  * than in the interrupt handler itself.
2863  */
2864 static void
2865 mxge_media_probe(mxge_softc_t *sc)
2866 {
2867 	mxge_cmd_t cmd;
2868 	char *cage_type;
2869 
2870 	struct mxge_media_type *mxge_media_types = NULL;
2871 	int i, err, ms, mxge_media_type_entries;
2872 	uint32_t byte;
2873 
2874 	sc->need_media_probe = 0;
2875 
2876 	if (sc->connector == MXGE_XFP) {
2877 		/* -R is XFP */
2878 		mxge_media_types = mxge_xfp_media_types;
2879 		mxge_media_type_entries =
2880 			sizeof (mxge_xfp_media_types) /
2881 			sizeof (mxge_xfp_media_types[0]);
2882 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2883 		cage_type = "XFP";
2884 	} else 	if (sc->connector == MXGE_SFP) {
2885 		/* -S or -2S is SFP+ */
2886 		mxge_media_types = mxge_sfp_media_types;
2887 		mxge_media_type_entries =
2888 			sizeof (mxge_sfp_media_types) /
2889 			sizeof (mxge_sfp_media_types[0]);
2890 		cage_type = "SFP+";
2891 		byte = 3;
2892 	} else {
2893 		/* nothing to do; media type cannot change */
2894 		return;
2895 	}
2896 
2897 	/*
2898 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2899 	 * now we try to determine what is in the cage by using the
2900 	 * firmware's I2C commands to read the module's 10GbE
2901 	 * compliance register.  We read just one byte, which may
2902 	 * take over a millisecond.
2903 	 */
2904 
2905 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2906 	cmd.data1 = byte;
2907 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2908 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2909 		device_printf(sc->dev, "failed to read XFP\n");
2910 	}
2911 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2912 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2913 	}
2914 	if (err != MXGEFW_CMD_OK) {
2915 		return;
2916 	}
2917 
2918 	/* now we wait for the data to be cached */
2919 	cmd.data0 = byte;
2920 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2921 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2922 		DELAY(1000);
2923 		cmd.data0 = byte;
2924 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2925 	}
2926 	if (err != MXGEFW_CMD_OK) {
2927 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2928 			      cage_type, err, ms);
2929 		return;
2930 	}
2931 
2932 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2933 		if (mxge_verbose)
2934 			device_printf(sc->dev, "%s:%s\n", cage_type,
2935 				      mxge_media_types[0].name);
2936 		if (sc->current_media != mxge_media_types[0].flag) {
2937 			mxge_media_init(sc);
2938 			mxge_media_set(sc, mxge_media_types[0].flag);
2939 		}
2940 		return;
2941 	}
2942 	for (i = 1; i < mxge_media_type_entries; i++) {
2943 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2944 			if (mxge_verbose)
2945 				device_printf(sc->dev, "%s:%s\n",
2946 					      cage_type,
2947 					      mxge_media_types[i].name);
2948 
2949 			if (sc->current_media != mxge_media_types[i].flag) {
2950 				mxge_media_init(sc);
2951 				mxge_media_set(sc, mxge_media_types[i].flag);
2952 			}
2953 			return;
2954 		}
2955 	}
2956 	if (mxge_verbose)
2957 		device_printf(sc->dev, "%s media 0x%x unknown\n",
2958 			      cage_type, cmd.data0);
2959 
2960 	return;
2961 }
2962 
2963 static void
2964 mxge_intr(void *arg)
2965 {
2966 	struct mxge_slice_state *ss = arg;
2967 	mxge_softc_t *sc = ss->sc;
2968 	mcp_irq_data_t *stats = ss->fw_stats;
2969 	mxge_tx_ring_t *tx = &ss->tx;
2970 	mxge_rx_done_t *rx_done = &ss->rx_done;
2971 	uint32_t send_done_count;
2972 	uint8_t valid;
2973 
2974 
2975 #ifndef IFNET_BUF_RING
2976 	/* an interrupt on a non-zero slice is implicitly valid
2977 	   since MSI-X irqs are not shared */
2978 	if (ss != sc->ss) {
2979 		mxge_clean_rx_done(ss);
2980 		*ss->irq_claim = be32toh(3);
2981 		return;
2982 	}
2983 #endif
2984 
2985 	/* make sure the DMA has finished */
2986 	if (!stats->valid) {
2987 		return;
2988 	}
2989 	valid = stats->valid;
2990 
2991 	if (sc->legacy_irq) {
2992 		/* lower legacy IRQ  */
2993 		*sc->irq_deassert = 0;
2994 		if (!mxge_deassert_wait)
2995 			/* don't wait for conf. that irq is low */
2996 			stats->valid = 0;
2997 	} else {
2998 		stats->valid = 0;
2999 	}
3000 
3001 	/* loop while waiting for legacy irq deassertion */
3002 	do {
3003 		/* check for transmit completes and receives */
3004 		send_done_count = be32toh(stats->send_done_count);
3005 		while ((send_done_count != tx->pkt_done) ||
3006 		       (rx_done->entry[rx_done->idx].length != 0)) {
3007 			if (send_done_count != tx->pkt_done)
3008 				mxge_tx_done(ss, (int)send_done_count);
3009 			mxge_clean_rx_done(ss);
3010 			send_done_count = be32toh(stats->send_done_count);
3011 		}
3012 		if (sc->legacy_irq && mxge_deassert_wait)
3013 			wmb();
3014 	} while (*((volatile uint8_t *) &stats->valid));
3015 
3016 	/* fw link & error stats meaningful only on the first slice */
3017 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3018 		if (sc->link_state != stats->link_up) {
3019 			sc->link_state = stats->link_up;
3020 			if (sc->link_state) {
3021 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3022 				sc->ifp->if_baudrate = IF_Gbps(10UL);
3023 				if (mxge_verbose)
3024 					device_printf(sc->dev, "link up\n");
3025 			} else {
3026 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3027 				sc->ifp->if_baudrate = 0;
3028 				if (mxge_verbose)
3029 					device_printf(sc->dev, "link down\n");
3030 			}
3031 			sc->need_media_probe = 1;
3032 		}
3033 		if (sc->rdma_tags_available !=
3034 		    be32toh(stats->rdma_tags_available)) {
3035 			sc->rdma_tags_available =
3036 				be32toh(stats->rdma_tags_available);
3037 			device_printf(sc->dev, "RDMA timed out! %d tags "
3038 				      "left\n", sc->rdma_tags_available);
3039 		}
3040 
3041 		if (stats->link_down) {
3042 			sc->down_cnt += stats->link_down;
3043 			sc->link_state = 0;
3044 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3045 		}
3046 	}
3047 
3048 	/* check to see if we have rx token to pass back */
3049 	if (valid & 0x1)
3050 	    *ss->irq_claim = be32toh(3);
3051 	*(ss->irq_claim + 1) = be32toh(3);
3052 }
3053 
3054 static void
3055 mxge_init(void *arg)
3056 {
3057 	mxge_softc_t *sc = arg;
3058 	struct ifnet *ifp = sc->ifp;
3059 
3060 
3061 	mtx_lock(&sc->driver_mtx);
3062 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3063 		(void) mxge_open(sc);
3064 	mtx_unlock(&sc->driver_mtx);
3065 }
3066 
3067 
3068 
3069 static void
3070 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3071 {
3072 	struct lro_entry *lro_entry;
3073 	int i;
3074 
3075 	while (!SLIST_EMPTY(&ss->lro_free)) {
3076 		lro_entry = SLIST_FIRST(&ss->lro_free);
3077 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3078 		free(lro_entry, M_DEVBUF);
3079 	}
3080 
3081 	for (i = 0; i <= ss->rx_big.mask; i++) {
3082 		if (ss->rx_big.info[i].m == NULL)
3083 			continue;
3084 		bus_dmamap_unload(ss->rx_big.dmat,
3085 				  ss->rx_big.info[i].map);
3086 		m_freem(ss->rx_big.info[i].m);
3087 		ss->rx_big.info[i].m = NULL;
3088 	}
3089 
3090 	for (i = 0; i <= ss->rx_small.mask; i++) {
3091 		if (ss->rx_small.info[i].m == NULL)
3092 			continue;
3093 		bus_dmamap_unload(ss->rx_small.dmat,
3094 				  ss->rx_small.info[i].map);
3095 		m_freem(ss->rx_small.info[i].m);
3096 		ss->rx_small.info[i].m = NULL;
3097 	}
3098 
3099 	/* transmit ring used only on the first slice */
3100 	if (ss->tx.info == NULL)
3101 		return;
3102 
3103 	for (i = 0; i <= ss->tx.mask; i++) {
3104 		ss->tx.info[i].flag = 0;
3105 		if (ss->tx.info[i].m == NULL)
3106 			continue;
3107 		bus_dmamap_unload(ss->tx.dmat,
3108 				  ss->tx.info[i].map);
3109 		m_freem(ss->tx.info[i].m);
3110 		ss->tx.info[i].m = NULL;
3111 	}
3112 }
3113 
3114 static void
3115 mxge_free_mbufs(mxge_softc_t *sc)
3116 {
3117 	int slice;
3118 
3119 	for (slice = 0; slice < sc->num_slices; slice++)
3120 		mxge_free_slice_mbufs(&sc->ss[slice]);
3121 }
3122 
3123 static void
3124 mxge_free_slice_rings(struct mxge_slice_state *ss)
3125 {
3126 	int i;
3127 
3128 
3129 	if (ss->rx_done.entry != NULL)
3130 		mxge_dma_free(&ss->rx_done.dma);
3131 	ss->rx_done.entry = NULL;
3132 
3133 	if (ss->tx.req_bytes != NULL)
3134 		free(ss->tx.req_bytes, M_DEVBUF);
3135 	ss->tx.req_bytes = NULL;
3136 
3137 	if (ss->tx.seg_list != NULL)
3138 		free(ss->tx.seg_list, M_DEVBUF);
3139 	ss->tx.seg_list = NULL;
3140 
3141 	if (ss->rx_small.shadow != NULL)
3142 		free(ss->rx_small.shadow, M_DEVBUF);
3143 	ss->rx_small.shadow = NULL;
3144 
3145 	if (ss->rx_big.shadow != NULL)
3146 		free(ss->rx_big.shadow, M_DEVBUF);
3147 	ss->rx_big.shadow = NULL;
3148 
3149 	if (ss->tx.info != NULL) {
3150 		if (ss->tx.dmat != NULL) {
3151 			for (i = 0; i <= ss->tx.mask; i++) {
3152 				bus_dmamap_destroy(ss->tx.dmat,
3153 						   ss->tx.info[i].map);
3154 			}
3155 			bus_dma_tag_destroy(ss->tx.dmat);
3156 		}
3157 		free(ss->tx.info, M_DEVBUF);
3158 	}
3159 	ss->tx.info = NULL;
3160 
3161 	if (ss->rx_small.info != NULL) {
3162 		if (ss->rx_small.dmat != NULL) {
3163 			for (i = 0; i <= ss->rx_small.mask; i++) {
3164 				bus_dmamap_destroy(ss->rx_small.dmat,
3165 						   ss->rx_small.info[i].map);
3166 			}
3167 			bus_dmamap_destroy(ss->rx_small.dmat,
3168 					   ss->rx_small.extra_map);
3169 			bus_dma_tag_destroy(ss->rx_small.dmat);
3170 		}
3171 		free(ss->rx_small.info, M_DEVBUF);
3172 	}
3173 	ss->rx_small.info = NULL;
3174 
3175 	if (ss->rx_big.info != NULL) {
3176 		if (ss->rx_big.dmat != NULL) {
3177 			for (i = 0; i <= ss->rx_big.mask; i++) {
3178 				bus_dmamap_destroy(ss->rx_big.dmat,
3179 						   ss->rx_big.info[i].map);
3180 			}
3181 			bus_dmamap_destroy(ss->rx_big.dmat,
3182 					   ss->rx_big.extra_map);
3183 			bus_dma_tag_destroy(ss->rx_big.dmat);
3184 		}
3185 		free(ss->rx_big.info, M_DEVBUF);
3186 	}
3187 	ss->rx_big.info = NULL;
3188 }
3189 
3190 static void
3191 mxge_free_rings(mxge_softc_t *sc)
3192 {
3193 	int slice;
3194 
3195 	for (slice = 0; slice < sc->num_slices; slice++)
3196 		mxge_free_slice_rings(&sc->ss[slice]);
3197 }
3198 
3199 static int
3200 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3201 		       int tx_ring_entries)
3202 {
3203 	mxge_softc_t *sc = ss->sc;
3204 	size_t bytes;
3205 	int err, i;
3206 
3207 	err = ENOMEM;
3208 
3209 	/* allocate per-slice receive resources */
3210 
3211 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3212 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
3213 
3214 	/* allocate the rx shadow rings */
3215 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3216 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3217 	if (ss->rx_small.shadow == NULL)
3218 		return err;
3219 
3220 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3221 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3222 	if (ss->rx_big.shadow == NULL)
3223 		return err;
3224 
3225 	/* allocate the rx host info rings */
3226 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3227 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3228 	if (ss->rx_small.info == NULL)
3229 		return err;
3230 
3231 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3232 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3233 	if (ss->rx_big.info == NULL)
3234 		return err;
3235 
3236 	/* allocate the rx busdma resources */
3237 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3238 				 1,			/* alignment */
3239 				 4096,			/* boundary */
3240 				 BUS_SPACE_MAXADDR,	/* low */
3241 				 BUS_SPACE_MAXADDR,	/* high */
3242 				 NULL, NULL,		/* filter */
3243 				 MHLEN,			/* maxsize */
3244 				 1,			/* num segs */
3245 				 MHLEN,			/* maxsegsize */
3246 				 BUS_DMA_ALLOCNOW,	/* flags */
3247 				 NULL, NULL,		/* lock */
3248 				 &ss->rx_small.dmat);	/* tag */
3249 	if (err != 0) {
3250 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3251 			      err);
3252 		return err;
3253 	}
3254 
3255 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3256 				 1,			/* alignment */
3257 #if MXGE_VIRT_JUMBOS
3258 				 4096,			/* boundary */
3259 #else
3260 				 0,			/* boundary */
3261 #endif
3262 				 BUS_SPACE_MAXADDR,	/* low */
3263 				 BUS_SPACE_MAXADDR,	/* high */
3264 				 NULL, NULL,		/* filter */
3265 				 3*4096,		/* maxsize */
3266 #if MXGE_VIRT_JUMBOS
3267 				 3,			/* num segs */
3268 				 4096,			/* maxsegsize*/
3269 #else
3270 				 1,			/* num segs */
3271 				 MJUM9BYTES,		/* maxsegsize*/
3272 #endif
3273 				 BUS_DMA_ALLOCNOW,	/* flags */
3274 				 NULL, NULL,		/* lock */
3275 				 &ss->rx_big.dmat);	/* tag */
3276 	if (err != 0) {
3277 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3278 			      err);
3279 		return err;
3280 	}
3281 	for (i = 0; i <= ss->rx_small.mask; i++) {
3282 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3283 					&ss->rx_small.info[i].map);
3284 		if (err != 0) {
3285 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3286 				      err);
3287 			return err;
3288 		}
3289 	}
3290 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3291 				&ss->rx_small.extra_map);
3292 	if (err != 0) {
3293 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3294 			      err);
3295 		return err;
3296 	}
3297 
3298 	for (i = 0; i <= ss->rx_big.mask; i++) {
3299 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3300 					&ss->rx_big.info[i].map);
3301 		if (err != 0) {
3302 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3303 				      err);
3304 			return err;
3305 		}
3306 	}
3307 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3308 				&ss->rx_big.extra_map);
3309 	if (err != 0) {
3310 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3311 			      err);
3312 		return err;
3313 	}
3314 
3315 	/* now allocate TX resources */
3316 
3317 #ifndef IFNET_BUF_RING
3318 	/* only use a single TX ring for now */
3319 	if (ss != ss->sc->ss)
3320 		return 0;
3321 #endif
3322 
3323 	ss->tx.mask = tx_ring_entries - 1;
3324 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3325 
3326 
3327 	/* allocate the tx request copy block */
3328 	bytes = 8 +
3329 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3330 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3331 	if (ss->tx.req_bytes == NULL)
3332 		return err;
3333 	/* ensure req_list entries are aligned to 8 bytes */
3334 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3335 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
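	/* e.g. a req_bytes allocation ending in ...3 yields a
	   req_list pointer rounded up to end in ...8 */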
3336 
3337 	/* allocate the tx busdma segment list */
3338 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3339 	ss->tx.seg_list = (bus_dma_segment_t *)
3340 		malloc(bytes, M_DEVBUF, M_WAITOK);
3341 	if (ss->tx.seg_list == NULL)
3342 		return err;
3343 
3344 	/* allocate the tx host info ring */
3345 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3346 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3347 	if (ss->tx.info == NULL)
3348 		return err;
3349 
3350 	/* allocate the tx busdma resources */
3351 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3352 				 1,			/* alignment */
3353 				 sc->tx_boundary,	/* boundary */
3354 				 BUS_SPACE_MAXADDR,	/* low */
3355 				 BUS_SPACE_MAXADDR,	/* high */
3356 				 NULL, NULL,		/* filter */
3357 				 65536 + 256,		/* maxsize */
3358 				 ss->tx.max_desc - 2,	/* num segs */
3359 				 sc->tx_boundary,	/* maxsegsz */
3360 				 BUS_DMA_ALLOCNOW,	/* flags */
3361 				 NULL, NULL,		/* lock */
3362 				 &ss->tx.dmat);		/* tag */
3363 
3364 	if (err != 0) {
3365 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3366 			      err);
3367 		return err;
3368 	}
3369 
3370 	/* now use these tags to setup dmamaps for each slot
3371 	   in the ring */
3372 	for (i = 0; i <= ss->tx.mask; i++) {
3373 		err = bus_dmamap_create(ss->tx.dmat, 0,
3374 					&ss->tx.info[i].map);
3375 		if (err != 0) {
3376 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3377 				      err);
3378 			return err;
3379 		}
3380 	}
3381 	return 0;
3382 
3383 }
3384 
3385 static int
3386 mxge_alloc_rings(mxge_softc_t *sc)
3387 {
3388 	mxge_cmd_t cmd;
3389 	int tx_ring_size;
3390 	int tx_ring_entries, rx_ring_entries;
3391 	int err, slice;
3392 
3393 	/* get ring sizes */
3394 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3395 	tx_ring_size = cmd.data0;
3396 	if (err != 0) {
3397 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3398 		goto abort;
3399 	}
3400 
3401 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3402 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3403 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3404 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3405 	IFQ_SET_READY(&sc->ifp->if_snd);
3406 
3407 	for (slice = 0; slice < sc->num_slices; slice++) {
3408 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3409 					     rx_ring_entries,
3410 					     tx_ring_entries);
3411 		if (err != 0)
3412 			goto abort;
3413 	}
3414 	return 0;
3415 
3416 abort:
3417 	mxge_free_rings(sc);
3418 	return err;
3419 
3420 }
3421 
3422 
3423 static void
3424 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3425 {
3426 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3427 
3428 	if (bufsize < MCLBYTES) {
3429 		/* easy, everything fits in a single buffer */
3430 		*big_buf_size = MCLBYTES;
3431 		*cl_size = MCLBYTES;
3432 		*nbufs = 1;
3433 		return;
3434 	}
3435 
3436 	if (bufsize < MJUMPAGESIZE) {
3437 		/* still easy, everything still fits in a single buffer */
3438 		*big_buf_size = MJUMPAGESIZE;
3439 		*cl_size = MJUMPAGESIZE;
3440 		*nbufs = 1;
3441 		return;
3442 	}
3443 #if MXGE_VIRT_JUMBOS
3444 	/* now we need to use virtually contiguous buffers */
3445 	*cl_size = MJUM9BYTES;
3446 	*big_buf_size = 4096;
3447 	*nbufs = mtu / 4096 + 1;
3448 	/* needs to be a power of two, so round up */
3449 	if (*nbufs == 3)
3450 		*nbufs = 4;
3451 #else
3452 	*cl_size = MJUM9BYTES;
3453 	*big_buf_size = MJUM9BYTES;
3454 	*nbufs = 1;
3455 #endif
3456 }
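
/*
 * For example (assuming 4KB pages, so MJUMPAGESIZE is 4096): a
 * standard 1500 byte MTU yields a 1520 byte buffer and fits in a
 * single 2KB cluster, while a 9000 byte jumbo MTU needs a 9KB
 * cluster, or, with MXGE_VIRT_JUMBOS, four 4096 byte pieces of
 * one 9KB cluster.
 */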
3457 
3458 static int
3459 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3460 {
3461 	mxge_softc_t *sc;
3462 	mxge_cmd_t cmd;
3463 	bus_dmamap_t map;
3464 	struct lro_entry *lro_entry;
3465 	int err, i, slice;
3466 
3467 
3468 	sc = ss->sc;
3469 	slice = ss - sc->ss;
3470 
3471 	SLIST_INIT(&ss->lro_free);
3472 	SLIST_INIT(&ss->lro_active);
3473 
3474 	for (i = 0; i < sc->lro_cnt; i++) {
3475 		lro_entry = (struct lro_entry *)
3476 			malloc(sizeof (*lro_entry), M_DEVBUF,
3477 			       M_NOWAIT | M_ZERO);
3478 		if (lro_entry == NULL) {
3479 			sc->lro_cnt = i;
3480 			break;
3481 		}
3482 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3483 	}
3484 	/* get the lanai pointers to the send and receive rings */
3485 
3486 	err = 0;
3487 #ifndef IFNET_BUF_RING
3488 	/* We currently only send from the first slice */
3489 	if (slice == 0) {
3490 #endif
3491 		cmd.data0 = slice;
3492 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3493 		ss->tx.lanai =
3494 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3495 		ss->tx.send_go = (volatile uint32_t *)
3496 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3497 		ss->tx.send_stop = (volatile uint32_t *)
3498 			(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3499 #ifndef IFNET_BUF_RING
3500 	}
3501 #endif
3502 	cmd.data0 = slice;
3503 	err |= mxge_send_cmd(sc,
3504 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3505 	ss->rx_small.lanai =
3506 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3507 	cmd.data0 = slice;
3508 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3509 	ss->rx_big.lanai =
3510 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3511 
3512 	if (err != 0) {
3513 		device_printf(sc->dev,
3514 			      "failed to get ring sizes or locations\n");
3515 		return EIO;
3516 	}
3517 
3518 	/* stock receive rings */
3519 	for (i = 0; i <= ss->rx_small.mask; i++) {
3520 		map = ss->rx_small.info[i].map;
3521 		err = mxge_get_buf_small(ss, map, i);
3522 		if (err) {
3523 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3524 				      i, ss->rx_small.mask + 1);
3525 			return ENOMEM;
3526 		}
3527 	}
3528 	for (i = 0; i <= ss->rx_big.mask; i++) {
3529 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3530 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3531 	}
3532 	ss->rx_big.nbufs = nbufs;
3533 	ss->rx_big.cl_size = cl_size;
3534 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3535 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3536 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3537 		map = ss->rx_big.info[i].map;
3538 		err = mxge_get_buf_big(ss, map, i);
3539 		if (err) {
3540 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3541 				      i, ss->rx_big.mask + 1);
3542 			return ENOMEM;
3543 		}
3544 	}
3545 	return 0;
3546 }
3547 
3548 static int
3549 mxge_open(mxge_softc_t *sc)
3550 {
3551 	mxge_cmd_t cmd;
3552 	int err, big_bytes, nbufs, slice, cl_size, i;
3553 	bus_addr_t bus;
3554 	volatile uint8_t *itable;
3555 	struct mxge_slice_state *ss;
3556 
3557 	/* Copy the MAC address in case it was overridden */
3558 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3559 
3560 	err = mxge_reset(sc, 1);
3561 	if (err != 0) {
3562 		device_printf(sc->dev, "failed to reset\n");
3563 		return EIO;
3564 	}
3565 
3566 	if (sc->num_slices > 1) {
3567 		/* setup the indirection table */
3568 		cmd.data0 = sc->num_slices;
3569 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3570 				    &cmd);
3571 
3572 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3573 				     &cmd);
3574 		if (err != 0) {
3575 			device_printf(sc->dev,
3576 				      "failed to setup rss tables\n");
3577 			return err;
3578 		}
3579 
3580 		/* just enable an identity mapping */
3581 		itable = sc->sram + cmd.data0;
3582 		for (i = 0; i < sc->num_slices; i++)
3583 			itable[i] = (uint8_t)i;
3584 
3585 		cmd.data0 = 1;
3586 		cmd.data1 = mxge_rss_hash_type;
3587 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3588 		if (err != 0) {
3589 			device_printf(sc->dev, "failed to enable slices\n");
3590 			return err;
3591 		}
3592 	}
3593 
3594 
3595 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3596 
3597 	cmd.data0 = nbufs;
3598 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3599 			    &cmd);
3600 	/* error is only meaningful if we're trying to set
3601 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3602 	if (err && nbufs > 1) {
3603 		device_printf(sc->dev,
3604 			      "Failed to set always-use-n to %d\n",
3605 			      nbufs);
3606 		return EIO;
3607 	}
3608 	/* Give the firmware the mtu and the big and small buffer
3609 	   sizes.  The firmware wants the big buf size to be a power
3610 	   of two. Luckily, FreeBSD's clusters are powers of two */
3611 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3612 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3613 	cmd.data0 = MHLEN - MXGEFW_PAD;
3614 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3615 			     &cmd);
3616 	cmd.data0 = big_bytes;
3617 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3618 
3619 	if (err != 0) {
3620 		device_printf(sc->dev, "failed to setup params\n");
3621 		goto abort;
3622 	}
3623 
3624 	/* Now give the firmware the pointer to the stats block */
3625 	for (slice = 0;
3626 #ifdef IFNET_BUF_RING
3627 	     slice < sc->num_slices;
3628 #else
3629 	     slice < 1;
3630 #endif
3631 	     slice++) {
3632 		ss = &sc->ss[slice];
3633 		cmd.data0 =
3634 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3635 		cmd.data1 =
3636 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3637 		cmd.data2 = sizeof(struct mcp_irq_data);
3638 		cmd.data2 |= (slice << 16);
3639 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3640 	}
3641 
3642 	if (err != 0) {
3643 		bus = sc->ss->fw_stats_dma.bus_addr;
3644 		bus += offsetof(struct mcp_irq_data, send_done_count);
3645 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3646 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3647 		err = mxge_send_cmd(sc,
3648 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3649 				    &cmd);
3650 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3651 		sc->fw_multicast_support = 0;
3652 	} else {
3653 		sc->fw_multicast_support = 1;
3654 	}
3655 
3656 	if (err != 0) {
3657 		device_printf(sc->dev, "failed to setup params\n");
3658 		goto abort;
3659 	}
3660 
3661 	for (slice = 0; slice < sc->num_slices; slice++) {
3662 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3663 		if (err != 0) {
3664 			device_printf(sc->dev, "couldn't open slice %d\n",
3665 				      slice);
3666 			goto abort;
3667 		}
3668 	}
3669 
3670 	/* Finally, start the firmware running */
3671 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3672 	if (err) {
3673 		device_printf(sc->dev, "Couldn't bring up link\n");
3674 		goto abort;
3675 	}
3676 #ifdef IFNET_BUF_RING
3677 	for (slice = 0; slice < sc->num_slices; slice++) {
3678 		ss = &sc->ss[slice];
3679 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3680 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3681 	}
3682 #endif
3683 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3684 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3685 
3686 	return 0;
3687 
3688 
3689 abort:
3690 	mxge_free_mbufs(sc);
3691 
3692 	return err;
3693 }
3694 
3695 static int
3696 mxge_close(mxge_softc_t *sc, int down)
3697 {
3698 	mxge_cmd_t cmd;
3699 	int err, old_down_cnt;
3700 #ifdef IFNET_BUF_RING
3701 	struct mxge_slice_state *ss;
3702 	int slice;
3703 #endif
3704 
3705 #ifdef IFNET_BUF_RING
3706 	for (slice = 0; slice < sc->num_slices; slice++) {
3707 		ss = &sc->ss[slice];
3708 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3709 	}
3710 #endif
3711 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3712 	if (!down) {
3713 		old_down_cnt = sc->down_cnt;
3714 		wmb();
3715 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3716 		if (err) {
3717 			device_printf(sc->dev,
3718 				      "Couldn't bring down link\n");
3719 		}
3720 		if (old_down_cnt == sc->down_cnt) {
3721 			/* wait for down irq */
3722 			DELAY(10 * sc->intr_coal_delay);
3723 		}
3724 		wmb();
3725 		if (old_down_cnt == sc->down_cnt) {
3726 			device_printf(sc->dev, "never got down irq\n");
3727 		}
3728 	}
3729 	mxge_free_mbufs(sc);
3730 
3731 	return 0;
3732 }
3733 
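/*
 * Illustrative sketch (editor's addition, not driver code): the
 * down_cnt handshake in mxge_close() is a snapshot-and-poll pattern --
 * record the counter, issue the command, wait a bounded time, and see
 * whether the interrupt handler advanced it.  The wmb() calls keep the
 * snapshot and the re-read ordered around the device access.  A
 * userland analogue, with a hypothetical simulate_down_irq() standing
 * in for the interrupt handler:
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdio.h>

static volatile int down_cnt;		/* advanced by the "irq handler" */

static void
simulate_down_irq(void)			/* hypothetical stand-in */
{
	down_cnt++;
}

int
main(void)
{
	int old_down_cnt = down_cnt;	/* snapshot before the command */

	simulate_down_irq();		/* device acks with an interrupt */
	if (old_down_cnt == down_cnt)
		printf("never got down irq\n");
	else
		printf("down irq arrived\n");
	return 0;
}
#endif
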
3734 static void
3735 mxge_setup_cfg_space(mxge_softc_t *sc)
3736 {
3737 	device_t dev = sc->dev;
3738 	int reg;
3739 	uint16_t cmd, lnk, pectl;
3740 
3741 	/* find the PCIe link width and set max read request to 4KB */
3742 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3743 		lnk = pci_read_config(dev, reg + 0x12, 2);
3744 		sc->link_width = (lnk >> 4) & 0x3f;
3745 
3746 		if (sc->pectl == 0) {
3747 			pectl = pci_read_config(dev, reg + 0x8, 2);
3748 			pectl = (pectl & ~0x7000) | (5 << 12);
3749 			pci_write_config(dev, reg + 0x8, pectl, 2);
3750 			sc->pectl = pectl;
3751 		} else {
3752 			/* restore saved pectl after watchdog reset */
3753 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3754 		}
3755 	}
3756 
3757 	/* Enable DMA and Memory space access */
3758 	pci_enable_busmaster(dev);
3759 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3760 	cmd |= PCIM_CMD_MEMEN;
3761 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3762 }
3763 
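/*
 * Illustrative sketch (editor's addition, not driver code): the
 * register arithmetic in mxge_setup_cfg_space().  Link width is bits
 * 9:4 of the PCIe Link Status register, and Max_Read_Request_Size is
 * bits 14:12 of Device Control, encoded so that a field value v means
 * (128 << v) bytes; writing 5 therefore selects 4096-byte reads.  The
 * register values below are hypothetical.
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t lnk = 0x1081;		/* hypothetical Link Status */
	uint16_t pectl = 0x2810;	/* hypothetical Device Control */
	int width, mrrs_field;

	width = (lnk >> 4) & 0x3f;			/* bits 9:4 */
	pectl = (pectl & ~0x7000) | (5 << 12);		/* MRRS field = 5 */
	mrrs_field = (pectl >> 12) & 0x7;

	printf("link width x%d, max read request %d bytes\n",
	       width, 128 << mrrs_field);		/* 128 << 5 = 4096 */
	return 0;
}
#endif
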
3764 static uint32_t
3765 mxge_read_reboot(mxge_softc_t *sc)
3766 {
3767 	device_t dev = sc->dev;
3768 	uint32_t vs;
3769 
3770 	/* find the vendor specific offset */
3771 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3772 		device_printf(sc->dev,
3773 			      "could not find vendor specific offset\n");
3774 		return (uint32_t)-1;
3775 	}
3776 	/* enable read32 mode */
3777 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3778 	/* tell NIC which register to read */
3779 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3780 	return (pci_read_config(dev, vs + 0x14, 4));
3781 }
3782 
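/*
 * Illustrative sketch (editor's addition, not driver code):
 * mxge_read_reboot() drives an indirect register window in the
 * vendor-specific capability -- select 32-bit read mode, point the
 * window at the status register, then read the data port.  The demo
 * below models that protocol with stub accessors standing in for
 * pci_write_config()/pci_read_config(); offsets are relative to the
 * capability base, and the status value is invented.
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdint.h>
#include <stdio.h>

static uint32_t window_target;
static const uint32_t reboot_status = 0xdeadbeef;	/* demo value */

static void
cfg_write(int off, uint32_t val)	/* stands in for pci_write_config */
{
	if (off == 0x18)		/* window target register */
		window_target = val;
}

static uint32_t
cfg_read(int off)			/* stands in for pci_read_config */
{
	if (off == 0x14 && window_target == 0xfffffff0)
		return (reboot_status);	/* data port echoes the target */
	return (0);
}

int
main(void)
{
	int vs = 0;			/* capability base (demo value) */

	cfg_write(vs + 0x10, 0x3);	/* enable read32 mode */
	cfg_write(vs + 0x18, 0xfffffff0); /* select the reboot status reg */
	printf("reboot status = 0x%x\n", cfg_read(vs + 0x14));
	return 0;
}
#endif
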
3783 static void
3784 mxge_watchdog_reset(mxge_softc_t *sc)
3785 {
3786 	struct pci_devinfo *dinfo;
3787 	struct mxge_slice_state *ss;
3788 	int err, running, s, num_tx_slices = 1;
3789 	uint32_t reboot;
3790 	uint16_t cmd;
3791 
3792 	err = ENXIO;
3793 
3794 	device_printf(sc->dev, "Watchdog reset!\n");
3795 
3796 	/*
3797 	 * check to see if the NIC rebooted.  If it did, then all of
3798 	 * PCI config space has been reset, and things like the
3799 	 * busmaster bit will be zero.  If this is the case, then we
3800 	 * must restore PCI config space before the NIC can be used
3801 	 * again
3802 	 */
3803 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3804 	if (cmd == 0xffff) {
3805 		/*
3806 		 * maybe the watchdog caught the NIC rebooting; wait
3807 		 * up to 100ms for it to finish.  If it does not come
3808 		 * back, then give up
3809 		 */
3810 		DELAY(1000*100);
3811 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3812 		if (cmd == 0xffff) {
3813 			device_printf(sc->dev, "NIC disappeared!\n");
3814 		}
3815 	}
3816 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3817 		/* print the reboot status */
3818 		reboot = mxge_read_reboot(sc);
3819 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3820 			      reboot);
3821 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3822 		if (running) {
3823 
3824 			/*
3825 			 * quiesce NIC so that TX routines will not try to
3826 			 * xmit after restoration of BAR
3827 			 */
3828 
3829 			/* Mark the link as down */
3830 			if (sc->link_state) {
3831 				sc->link_state = 0;
3832 				if_link_state_change(sc->ifp,
3833 						     LINK_STATE_DOWN);
3834 			}
3835 #ifdef IFNET_BUF_RING
3836 			num_tx_slices = sc->num_slices;
3837 #endif
3838 			/* grab all TX locks to ensure no tx is in flight */
3839 			for (s = 0; s < num_tx_slices; s++) {
3840 				ss = &sc->ss[s];
3841 				mtx_lock(&ss->tx.mtx);
3842 			}
3843 			mxge_close(sc, 1);
3844 		}
3845 		/* restore PCI configuration space */
3846 		dinfo = device_get_ivars(sc->dev);
3847 		pci_cfg_restore(sc->dev, dinfo);
3848 
3849 		/* and redo any changes we made to our config space */
3850 		mxge_setup_cfg_space(sc);
3851 
3852 		/* reload f/w */
3853 		err = mxge_load_firmware(sc, 0);
3854 		if (err) {
3855 			device_printf(sc->dev,
3856 				      "Unable to re-load f/w\n");
3857 		}
3858 		if (running) {
3859 			if (!err)
3860 				err = mxge_open(sc);
3861 			/* release all TX locks */
3862 			for (s = 0; s < num_tx_slices; s++) {
3863 				ss = &sc->ss[s];
3864 #ifdef IFNET_BUF_RING
3865 				mxge_start_locked(ss);
3866 #endif
3867 				mtx_unlock(&ss->tx.mtx);
3868 			}
3869 		}
3870 		sc->watchdog_resets++;
3871 	} else {
3872 		device_printf(sc->dev,
3873 			      "NIC did not reboot, not resetting\n");
3874 		err = 0;
3875 	}
3876 	if (err) {
3877 		device_printf(sc->dev, "watchdog reset failed\n");
3878 	} else {
3879 		if (sc->dying == 2)
3880 			sc->dying = 0;
3881 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3882 	}
3883 }
3884 
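/*
 * Illustrative sketch (editor's addition, not driver code): the two
 * reboot checks used above.  A config read of all ones means the
 * device has fallen off the bus entirely; a present device whose
 * busmaster bit is clear has had its config space reset by a reboot
 * and must be restored before use.  The command-register value is
 * hypothetical.
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdint.h>
#include <stdio.h>

#define PCIM_CMD_BUSMASTEREN	0x0004

int
main(void)
{
	uint16_t cmd = 0x0002;		/* hypothetical PCIR_COMMAND value */

	if (cmd == 0xffff)
		printf("NIC disappeared!\n");
	else if ((cmd & PCIM_CMD_BUSMASTEREN) == 0)
		printf("NIC rebooted: config space needs restoring\n");
	else
		printf("NIC did not reboot\n");
	return 0;
}
#endif
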
3885 static void
3886 mxge_watchdog_task(void *arg, int pending)
3887 {
3888 	mxge_softc_t *sc = arg;
3889 
3890 
3891 	mtx_lock(&sc->driver_mtx);
3892 	mxge_watchdog_reset(sc);
3893 	mtx_unlock(&sc->driver_mtx);
3894 }
3895 
3896 static void
3897 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3898 {
3899 	tx = &sc->ss[slice].tx;
3900 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3901 	device_printf(sc->dev,
3902 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3903 		      tx->req, tx->done, tx->queue_active);
3904 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3905 		      tx->activate, tx->deactivate);
3906 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3907 		      tx->pkt_done,
3908 		      be32toh(sc->ss->fw_stats->send_done_count));
3909 }
3910 
3911 static int
3912 mxge_watchdog(mxge_softc_t *sc)
3913 {
3914 	mxge_tx_ring_t *tx;
3915 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3916 	int i, err = 0;
3917 
3918 	/* see if we have outstanding transmits, which
3919 	   have been pending for more than mxge_ticks */
3920 	for (i = 0;
3921 #ifdef IFNET_BUF_RING
3922 	     (i < sc->num_slices) && (err == 0);
3923 #else
3924 	     (i < 1) && (err == 0);
3925 #endif
3926 	     i++) {
3927 		tx = &sc->ss[i].tx;
3928 		if (tx->req != tx->done &&
3929 		    tx->watchdog_req != tx->watchdog_done &&
3930 		    tx->done == tx->watchdog_done) {
3931 			/* check for pause blocking before resetting */
3932 			if (tx->watchdog_rx_pause == rx_pause) {
3933 				mxge_warn_stuck(sc, tx, i);
3934 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3935 				return (ENXIO);
3936 			}
3937 			else
3938 				device_printf(sc->dev, "Flow control blocking "
3939 					      "xmits, check link partner\n");
3940 		}
3941 
3942 		tx->watchdog_req = tx->req;
3943 		tx->watchdog_done = tx->done;
3944 		tx->watchdog_rx_pause = rx_pause;
3945 	}
3946 
3947 	if (sc->need_media_probe)
3948 		mxge_media_probe(sc);
3949 	return (err);
3950 }
3951 
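/*
 * Illustrative sketch (editor's addition, not driver code): the
 * stuck-ring test in mxge_watchdog() distilled into a predicate.  A
 * ring is stuck when transmits are outstanding now, were outstanding
 * at the previous tick, and the completion index has not advanced in
 * between; a matching pause counter rules out flow control as the
 * cause.  The struct and values below are invented for the demo.
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdint.h>
#include <stdio.h>

struct tx_snap {
	uint32_t req, done;			/* current ring indexes */
	uint32_t watchdog_req, watchdog_done;	/* snapshot from last tick */
	uint32_t watchdog_rx_pause;		/* pause drops at last tick */
};

static int
tx_ring_stuck(const struct tx_snap *tx, uint32_t rx_pause)
{
	return (tx->req != tx->done &&
	    tx->watchdog_req != tx->watchdog_done &&
	    tx->done == tx->watchdog_done &&
	    tx->watchdog_rx_pause == rx_pause);
}

int
main(void)
{
	struct tx_snap tx = { 10, 4, 8, 4, 0 };	/* no progress since tick */

	printf("stuck: %d\n", tx_ring_stuck(&tx, 0));	/* prints 1 */
	return 0;
}
#endif
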
3952 static u_long
3953 mxge_update_stats(mxge_softc_t *sc)
3954 {
3955 	struct mxge_slice_state *ss;
3956 	u_long pkts = 0;
3957 	u_long ipackets = 0;
3958 	u_long opackets = 0;
3959 #ifdef IFNET_BUF_RING
3960 	u_long obytes = 0;
3961 	u_long omcasts = 0;
3962 	u_long odrops = 0;
3963 #endif
3964 	u_long oerrors = 0;
3965 	int slice;
3966 
3967 	for (slice = 0; slice < sc->num_slices; slice++) {
3968 		ss = &sc->ss[slice];
3969 		ipackets += ss->ipackets;
3970 		opackets += ss->opackets;
3971 #ifdef IFNET_BUF_RING
3972 		obytes += ss->obytes;
3973 		omcasts += ss->omcasts;
3974 		odrops += ss->tx.br->br_drops;
3975 #endif
3976 		oerrors += ss->oerrors;
3977 	}
3978 	pkts = (ipackets - sc->ifp->if_ipackets);
3979 	pkts += (opackets - sc->ifp->if_opackets);
3980 	sc->ifp->if_ipackets = ipackets;
3981 	sc->ifp->if_opackets = opackets;
3982 #ifdef IFNET_BUF_RING
3983 	sc->ifp->if_obytes = obytes;
3984 	sc->ifp->if_omcasts = omcasts;
3985 	sc->ifp->if_snd.ifq_drops = odrops;
3986 #endif
3987 	sc->ifp->if_oerrors = oerrors;
3988 	return pkts;
3989 }
3990 
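/*
 * Illustrative sketch (editor's addition, not driver code):
 * mxge_update_stats() both refreshes the interface counters and
 * returns the packet delta since the previous call, which mxge_tick()
 * uses as an idleness hint.  The same accumulate-then-diff pattern in
 * miniature, with invented counter values:
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdio.h>

int
main(void)
{
	unsigned long slice_ipackets[2] = { 100, 250 };	/* per-slice */
	unsigned long cached_ipackets = 300;	/* from the last tick */
	unsigned long ipackets = 0, pkts;
	int slice;

	for (slice = 0; slice < 2; slice++)
		ipackets += slice_ipackets[slice];
	pkts = ipackets - cached_ipackets;	/* traffic since last tick */
	cached_ipackets = ipackets;		/* roll the snapshot forward */

	printf("%lu total, %lu since the last tick\n",
	       cached_ipackets, pkts);		/* 350 total, 50 new */
	return 0;
}
#endif
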
3991 static void
3992 mxge_tick(void *arg)
3993 {
3994 	mxge_softc_t *sc = arg;
3995 	u_long pkts = 0;
3996 	int err = 0;
3997 	int running, ticks;
3998 	uint16_t cmd;
3999 
4000 	ticks = mxge_ticks;
4001 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4002 	if (running) {
4003 		/* aggregate stats from different slices */
4004 		pkts = mxge_update_stats(sc);
4005 		if (!sc->watchdog_countdown) {
4006 			err = mxge_watchdog(sc);
4007 			sc->watchdog_countdown = 4;
4008 		}
4009 		sc->watchdog_countdown--;
4010 	}
4011 	if (pkts == 0) {
4012 		/* ensure NIC did not suffer h/w fault while idle */
4013 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4014 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4015 			sc->dying = 2;
4016 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4017 			err = ENXIO;
4018 		}
4019 		/* look less often if NIC is idle */
4020 		ticks *= 4;
4021 	}
4022 
4023 	if (err == 0)
4024 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4025 
4026 }
4027 
4028 static int
4029 mxge_media_change(struct ifnet *ifp)
4030 {
4031 	return EINVAL;
4032 }
4033 
4034 static int
4035 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4036 {
4037 	struct ifnet *ifp = sc->ifp;
4038 	int real_mtu, old_mtu;
4039 	int err = 0;
4040 
4041 
4042 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4043 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4044 		return EINVAL;
4045 	mtx_lock(&sc->driver_mtx);
4046 	old_mtu = ifp->if_mtu;
4047 	ifp->if_mtu = mtu;
4048 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4049 		mxge_close(sc, 0);
4050 		err = mxge_open(sc);
4051 		if (err != 0) {
4052 			ifp->if_mtu = old_mtu;
4053 			mxge_close(sc, 0);
4054 			(void) mxge_open(sc);
4055 		}
4056 	}
4057 	mtx_unlock(&sc->driver_mtx);
4058 	return err;
4059 }
4060 
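/*
 * Illustrative sketch (editor's addition, not driver code): the
 * bounds check in mxge_change_mtu().  The on-wire frame adds the
 * 14-byte Ethernet header plus a 4-byte VLAN tag to the MTU, and that
 * total must fit the firmware limit while staying at least a minimal
 * frame size.  The max_mtu value below is hypothetical.
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdio.h>

#define ETHER_HDR_LEN		14
#define ETHER_VLAN_ENCAP_LEN	4

int
main(void)
{
	int mtu = 9000, max_mtu = 9018;		/* hypothetical limit */
	int real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;

	if (real_mtu > max_mtu || real_mtu < 60)
		printf("EINVAL: frame size %d out of range\n", real_mtu);
	else
		printf("frame size %d accepted\n", real_mtu);
	return 0;
}
#endif
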
4061 static void
4062 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4063 {
4064 	mxge_softc_t *sc = ifp->if_softc;
4065 
4066 
4067 	if (sc == NULL)
4068 		return;
4069 	ifmr->ifm_status = IFM_AVALID;
4070 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4071 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4072 	ifmr->ifm_active |= sc->current_media;
4073 }
4074 
4075 static int
4076 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4077 {
4078 	mxge_softc_t *sc = ifp->if_softc;
4079 	struct ifreq *ifr = (struct ifreq *)data;
4080 	int err, mask;
4081 
4082 	err = 0;
4083 	switch (command) {
4084 	case SIOCSIFADDR:
4085 	case SIOCGIFADDR:
4086 		err = ether_ioctl(ifp, command, data);
4087 		break;
4088 
4089 	case SIOCSIFMTU:
4090 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4091 		break;
4092 
4093 	case SIOCSIFFLAGS:
4094 		mtx_lock(&sc->driver_mtx);
4095 		if (sc->dying) {
4096 			mtx_unlock(&sc->driver_mtx);
4097 			return EINVAL;
4098 		}
4099 		if (ifp->if_flags & IFF_UP) {
4100 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4101 				err = mxge_open(sc);
4102 			} else {
4103 				/* take care of promisc and allmulti
4104 				   flag changes */
4105 				mxge_change_promisc(sc,
4106 						    ifp->if_flags & IFF_PROMISC);
4107 				mxge_set_multicast_list(sc);
4108 			}
4109 		} else {
4110 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4111 				mxge_close(sc, 0);
4112 			}
4113 		}
4114 		mtx_unlock(&sc->driver_mtx);
4115 		break;
4116 
4117 	case SIOCADDMULTI:
4118 	case SIOCDELMULTI:
4119 		mtx_lock(&sc->driver_mtx);
4120 		mxge_set_multicast_list(sc);
4121 		mtx_unlock(&sc->driver_mtx);
4122 		break;
4123 
4124 	case SIOCSIFCAP:
4125 		mtx_lock(&sc->driver_mtx);
4126 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4127 		if (mask & IFCAP_TXCSUM) {
4128 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4129 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4130 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4131 						      | CSUM_TSO);
4132 			} else {
4133 				ifp->if_capenable |= IFCAP_TXCSUM;
4134 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4135 			}
4136 		} else if (mask & IFCAP_RXCSUM) {
4137 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4138 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4139 				sc->csum_flag = 0;
4140 			} else {
4141 				ifp->if_capenable |= IFCAP_RXCSUM;
4142 				sc->csum_flag = 1;
4143 			}
4144 		}
4145 		if (mask & IFCAP_TSO4) {
4146 			if (IFCAP_TSO4 & ifp->if_capenable) {
4147 				ifp->if_capenable &= ~IFCAP_TSO4;
4148 				ifp->if_hwassist &= ~CSUM_TSO;
4149 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4150 				ifp->if_capenable |= IFCAP_TSO4;
4151 				ifp->if_hwassist |= CSUM_TSO;
4152 			} else {
4153 				printf("mxge requires tx checksum offload"
4154 				       " be enabled to use TSO\n");
4155 				err = EINVAL;
4156 			}
4157 		}
4158 		if (mask & IFCAP_LRO) {
4159 			if (IFCAP_LRO & ifp->if_capenable)
4160 				err = mxge_change_lro_locked(sc, 0);
4161 			else
4162 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4163 		}
4164 		if (mask & IFCAP_VLAN_HWTAGGING)
4165 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4166 		if (mask & IFCAP_VLAN_HWTSO)
4167 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4168 
4169 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4170 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4171 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4172 
4173 		mtx_unlock(&sc->driver_mtx);
4174 		VLAN_CAPABILITIES(ifp);
4175 
4176 		break;
4177 
4178 	case SIOCGIFMEDIA:
4179 		mtx_lock(&sc->driver_mtx);
4180 		mxge_media_probe(sc);
4181 		mtx_unlock(&sc->driver_mtx);
4182 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4183 				    &sc->media, command);
4184 		break;
4185 
4186 	default:
4187 		err = ENOTTY;
4188 	}
4189 	return err;
4190 }
4191 
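/*
 * Illustrative sketch (editor's addition, not driver code): the
 * SIOCSIFCAP handling above starts from mask = ifr_reqcap ^
 * if_capenable, i.e. the set of capability bits the caller wants
 * flipped; each interesting bit is then toggled along with its
 * dependencies (e.g. TSO4 rides on TXCSUM).  The CAP_* bit values
 * below are invented for the demo.
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdio.h>

#define CAP_TXCSUM	0x01	/* hypothetical bit values */
#define CAP_TSO4	0x02

int
main(void)
{
	int capenable = CAP_TXCSUM | CAP_TSO4;
	int reqcap = CAP_TSO4;			/* asks for TXCSUM off */
	int mask = reqcap ^ capenable;		/* bits to flip */

	if (mask & CAP_TXCSUM)			/* TSO4 depends on TXCSUM */
		capenable &= ~(CAP_TXCSUM | CAP_TSO4);

	printf("capenable now 0x%x\n", capenable);	/* prints 0x0 */
	return 0;
}
#endif
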
4192 static void
4193 mxge_fetch_tunables(mxge_softc_t *sc)
4194 {
4195 
4196 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4197 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4198 			  &mxge_flow_control);
4199 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4200 			  &mxge_intr_coal_delay);
4201 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4202 			  &mxge_nvidia_ecrc_enable);
4203 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4204 			  &mxge_force_firmware);
4205 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4206 			  &mxge_deassert_wait);
4207 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4208 			  &mxge_verbose);
4209 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4210 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4211 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4212 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4213 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
4214 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4215 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4216 	if (sc->lro_cnt != 0)
4217 		mxge_lro_cnt = sc->lro_cnt;
4218 
4219 	if (bootverbose)
4220 		mxge_verbose = 1;
4221 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4222 		mxge_intr_coal_delay = 30;
4223 	if (mxge_ticks == 0)
4224 		mxge_ticks = hz / 2;
4225 	sc->pause = mxge_flow_control;
4226 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4227 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4228 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4229 	}
4230 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4231 	    mxge_initial_mtu < ETHER_MIN_LEN)
4232 		mxge_initial_mtu = ETHERMTU_JUMBO;
4233 
4234 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4235 		mxge_throttle = MXGE_MAX_THROTTLE;
4236 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4237 		mxge_throttle = MXGE_MIN_THROTTLE;
4238 	sc->throttle = mxge_throttle;
4239 }
4240 
4241 
4242 static void
4243 mxge_free_slices(mxge_softc_t *sc)
4244 {
4245 	struct mxge_slice_state *ss;
4246 	int i;
4247 
4248 
4249 	if (sc->ss == NULL)
4250 		return;
4251 
4252 	for (i = 0; i < sc->num_slices; i++) {
4253 		ss = &sc->ss[i];
4254 		if (ss->fw_stats != NULL) {
4255 			mxge_dma_free(&ss->fw_stats_dma);
4256 			ss->fw_stats = NULL;
4257 #ifdef IFNET_BUF_RING
4258 			if (ss->tx.br != NULL) {
4259 				drbr_free(ss->tx.br, M_DEVBUF);
4260 				ss->tx.br = NULL;
4261 			}
4262 #endif
4263 			mtx_destroy(&ss->tx.mtx);
4264 		}
4265 		if (ss->rx_done.entry != NULL) {
4266 			mxge_dma_free(&ss->rx_done.dma);
4267 			ss->rx_done.entry = NULL;
4268 		}
4269 	}
4270 	free(sc->ss, M_DEVBUF);
4271 	sc->ss = NULL;
4272 }
4273 
4274 static int
4275 mxge_alloc_slices(mxge_softc_t *sc)
4276 {
4277 	mxge_cmd_t cmd;
4278 	struct mxge_slice_state *ss;
4279 	size_t bytes;
4280 	int err, i, max_intr_slots;
4281 
4282 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4283 	if (err != 0) {
4284 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4285 		return err;
4286 	}
4287 	sc->rx_ring_size = cmd.data0;
4288 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4289 
4290 	bytes = sizeof (*sc->ss) * sc->num_slices;
4291 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4292 	if (sc->ss == NULL)
4293 		return (ENOMEM);
4294 	for (i = 0; i < sc->num_slices; i++) {
4295 		ss = &sc->ss[i];
4296 
4297 		ss->sc = sc;
4298 
4299 		/* allocate per-slice rx interrupt queues */
4300 
4301 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4302 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4303 		if (err != 0)
4304 			goto abort;
4305 		ss->rx_done.entry = ss->rx_done.dma.addr;
4306 		bzero(ss->rx_done.entry, bytes);
4307 
4308 		/*
4309 		 * allocate the per-slice firmware stats; stats
4310 		 * (including tx) are used only on the first
4311 		 * slice for now
4312 		 */
4313 #ifndef IFNET_BUF_RING
4314 		if (i > 0)
4315 			continue;
4316 #endif
4317 
4318 		bytes = sizeof (*ss->fw_stats);
4319 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4320 				     sizeof (*ss->fw_stats), 64);
4321 		if (err != 0)
4322 			goto abort;
4323 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4324 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4325 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4326 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4327 #ifdef IFNET_BUF_RING
4328 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4329 					   &ss->tx.mtx);
4330 #endif
4331 	}
4332 
4333 	return (0);
4334 
4335 abort:
4336 	mxge_free_slices(sc);
4337 	return (ENOMEM);
4338 }
4339 
4340 static void
4341 mxge_slice_probe(mxge_softc_t *sc)
4342 {
4343 	mxge_cmd_t cmd;
4344 	char *old_fw;
4345 	int msix_cnt, status, max_intr_slots;
4346 
4347 	sc->num_slices = 1;
4348 	/*
4349 	 *  don't enable multiple slices if the tunable disables them,
4350 	 *  or if this is not an SMP system
4351 	 */
4352 
4353 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4354 		return;
4355 
4356 	/* see how many MSI-X interrupts are available */
4357 	msix_cnt = pci_msix_count(sc->dev);
4358 	if (msix_cnt < 2)
4359 		return;
4360 
4361 	/* now load the slice aware firmware and see what it supports */
4362 	old_fw = sc->fw_name;
4363 	if (old_fw == mxge_fw_aligned)
4364 		sc->fw_name = mxge_fw_rss_aligned;
4365 	else
4366 		sc->fw_name = mxge_fw_rss_unaligned;
4367 	status = mxge_load_firmware(sc, 0);
4368 	if (status != 0) {
4369 		device_printf(sc->dev, "Falling back to a single slice\n");
4370 		return;
4371 	}
4372 
4373 	/* try to send a reset command to the card to see if it
4374 	   is alive */
4375 	memset(&cmd, 0, sizeof (cmd));
4376 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4377 	if (status != 0) {
4378 		device_printf(sc->dev, "failed reset\n");
4379 		goto abort_with_fw;
4380 	}
4381 
4382 	/* get rx ring size */
4383 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4384 	if (status != 0) {
4385 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4386 		goto abort_with_fw;
4387 	}
4388 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4389 
4390 	/* tell it the size of the interrupt queues */
4391 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4392 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4393 	if (status != 0) {
4394 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4395 		goto abort_with_fw;
4396 	}
4397 
4398 	/* ask for the maximum number of slices it supports */
4399 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4400 	if (status != 0) {
4401 		device_printf(sc->dev,
4402 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4403 		goto abort_with_fw;
4404 	}
4405 	sc->num_slices = cmd.data0;
4406 	if (sc->num_slices > msix_cnt)
4407 		sc->num_slices = msix_cnt;
4408 
4409 	if (mxge_max_slices == -1) {
4410 		/* cap to number of CPUs in system */
4411 		if (sc->num_slices > mp_ncpus)
4412 			sc->num_slices = mp_ncpus;
4413 	} else {
4414 		if (sc->num_slices > mxge_max_slices)
4415 			sc->num_slices = mxge_max_slices;
4416 	}
4417 	/* make sure it is a power of two */
4418 	while (sc->num_slices & (sc->num_slices - 1))
4419 		sc->num_slices--;
4420 
4421 	if (mxge_verbose)
4422 		device_printf(sc->dev, "using %d slices\n",
4423 			      sc->num_slices);
4424 
4425 	return;
4426 
4427 abort_with_fw:
4428 	sc->fw_name = old_fw;
4429 	(void) mxge_load_firmware(sc, 0);
4430 }
4431 
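/*
 * Illustrative sketch (editor's addition, not driver code): the slice
 * count is forced to a power of two above, presumably so the
 * firmware's RSS hash can select a slice with a simple mask.
 * n & (n - 1) is zero exactly for powers of two, so the decrement
 * loop walks down to the nearest one; e.g. 6 -> 5 -> 4.
 */
#if 0	/* standalone demo, not compiled with the driver */
#include <stdio.h>

int
main(void)
{
	int n = 6;			/* e.g. slices capped by CPU count */

	while (n & (n - 1))		/* nonzero unless a power of two */
		n--;
	printf("rounded down to %d slices\n", n);	/* prints 4 */
	return 0;
}
#endif
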
4432 static int
4433 mxge_add_msix_irqs(mxge_softc_t *sc)
4434 {
4435 	size_t bytes;
4436 	int count, err, i, rid;
4437 
4438 	rid = PCIR_BAR(2);
4439 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4440 						    &rid, RF_ACTIVE);
4441 
4442 	if (sc->msix_table_res == NULL) {
4443 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4444 		return ENXIO;
4445 	}
4446 
4447 	count = sc->num_slices;
4448 	err = pci_alloc_msix(sc->dev, &count);
4449 	if (err != 0) {
4450 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4451 			      "err = %d\n", sc->num_slices, err);
4452 		goto abort_with_msix_table;
4453 	}
4454 	if (count < sc->num_slices) {
4455 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4456 			      sc->num_slices, count);
4457 		device_printf(sc->dev,
4458 			      "Try setting hw.mxge.max_slices to %d\n",
4459 			      count);
4460 		err = ENOSPC;
4461 		goto abort_with_msix;
4462 	}
4463 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4464 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4465 	if (sc->msix_irq_res == NULL) {
4466 		err = ENOMEM;
4467 		goto abort_with_msix;
4468 	}
4469 
4470 	for (i = 0; i < sc->num_slices; i++) {
4471 		rid = i + 1;
4472 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4473 							  SYS_RES_IRQ,
4474 							  &rid, RF_ACTIVE);
4475 		if (sc->msix_irq_res[i] == NULL) {
4476 			device_printf(sc->dev, "couldn't allocate IRQ res"
4477 				      " for message %d\n", i);
4478 			err = ENXIO;
4479 			goto abort_with_res;
4480 		}
4481 	}
4482 
4483 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4484 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4485 
4486 	for (i = 0; i < sc->num_slices; i++) {
4487 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4488 				     INTR_TYPE_NET | INTR_MPSAFE,
4489 #if __FreeBSD_version > 700030
4490 				     NULL,
4491 #endif
4492 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4493 		if (err != 0) {
4494 			device_printf(sc->dev, "couldn't setup intr for "
4495 				      "message %d\n", i);
4496 			goto abort_with_intr;
4497 		}
4498 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4499 				  sc->msix_ih[i], "s%d", i);
4500 	}
4501 
4502 	if (mxge_verbose) {
4503 		device_printf(sc->dev, "using %d msix IRQs:",
4504 			      sc->num_slices);
4505 		for (i = 0; i < sc->num_slices; i++)
4506 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4507 		printf("\n");
4508 	}
4509 	return (0);
4510 
4511 abort_with_intr:
4512 	for (i = 0; i < sc->num_slices; i++) {
4513 		if (sc->msix_ih[i] != NULL) {
4514 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4515 					  sc->msix_ih[i]);
4516 			sc->msix_ih[i] = NULL;
4517 		}
4518 	}
4519 	free(sc->msix_ih, M_DEVBUF);
4520 
4521 
4522 abort_with_res:
4523 	for (i = 0; i < sc->num_slices; i++) {
4524 		rid = i + 1;
4525 		if (sc->msix_irq_res[i] != NULL)
4526 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4527 					     sc->msix_irq_res[i]);
4528 		sc->msix_irq_res[i] = NULL;
4529 	}
4530 	free(sc->msix_irq_res, M_DEVBUF);
4531 
4532 
4533 abort_with_msix:
4534 	pci_release_msi(sc->dev);
4535 
4536 abort_with_msix_table:
4537 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4538 			     sc->msix_table_res);
4539 
4540 	return err;
4541 }
4542 
4543 static int
4544 mxge_add_single_irq(mxge_softc_t *sc)
4545 {
4546 	int count, err, rid;
4547 
4548 	count = pci_msi_count(sc->dev);
4549 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4550 		rid = 1;
4551 	} else {
4552 		rid = 0;
4553 		sc->legacy_irq = 1;
4554 	}
4555 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4556 					 1, RF_SHAREABLE | RF_ACTIVE);
4557 	if (sc->irq_res == NULL) {
4558 		device_printf(sc->dev, "could not alloc interrupt\n");
4559 		return ENXIO;
4560 	}
4561 	if (mxge_verbose)
4562 		device_printf(sc->dev, "using %s irq %ld\n",
4563 			      sc->legacy_irq ? "INTx" : "MSI",
4564 			      rman_get_start(sc->irq_res));
4565 	err = bus_setup_intr(sc->dev, sc->irq_res,
4566 			     INTR_TYPE_NET | INTR_MPSAFE,
4567 #if __FreeBSD_version > 700030
4568 			     NULL,
4569 #endif
4570 			     mxge_intr, &sc->ss[0], &sc->ih);
4571 	if (err != 0) {
4572 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4573 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4574 		if (!sc->legacy_irq)
4575 			pci_release_msi(sc->dev);
4576 	}
4577 	return err;
4578 }
4579 
4580 static void
4581 mxge_rem_msix_irqs(mxge_softc_t *sc)
4582 {
4583 	int i, rid;
4584 
4585 	for (i = 0; i < sc->num_slices; i++) {
4586 		if (sc->msix_ih[i] != NULL) {
4587 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4588 					  sc->msix_ih[i]);
4589 			sc->msix_ih[i] = NULL;
4590 		}
4591 	}
4592 	free(sc->msix_ih, M_DEVBUF);
4593 
4594 	for (i = 0; i < sc->num_slices; i++) {
4595 		rid = i + 1;
4596 		if (sc->msix_irq_res[i] != NULL)
4597 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4598 					     sc->msix_irq_res[i]);
4599 		sc->msix_irq_res[i] = NULL;
4600 	}
4601 	free(sc->msix_irq_res, M_DEVBUF);
4602 
4603 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4604 			     sc->msix_table_res);
4605 
4606 	pci_release_msi(sc->dev);
4607 	return;
4608 }
4609 
4610 static void
4611 mxge_rem_single_irq(mxge_softc_t *sc)
4612 {
4613 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4614 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4615 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4616 	if (!sc->legacy_irq)
4617 		pci_release_msi(sc->dev);
4618 }
4619 
4620 static void
4621 mxge_rem_irq(mxge_softc_t *sc)
4622 {
4623 	if (sc->num_slices > 1)
4624 		mxge_rem_msix_irqs(sc);
4625 	else
4626 		mxge_rem_single_irq(sc);
4627 }
4628 
4629 static int
4630 mxge_add_irq(mxge_softc_t *sc)
4631 {
4632 	int err;
4633 
4634 	if (sc->num_slices > 1)
4635 		err = mxge_add_msix_irqs(sc);
4636 	else
4637 		err = mxge_add_single_irq(sc);
4638 
4639 	if (0 && err == 0 && sc->num_slices > 1) {	/* disabled test path */
4640 		mxge_rem_msix_irqs(sc);
4641 		err = mxge_add_msix_irqs(sc);
4642 	}
4643 	return err;
4644 }
4645 
4646 
4647 static int
4648 mxge_attach(device_t dev)
4649 {
4650 	mxge_softc_t *sc = device_get_softc(dev);
4651 	struct ifnet *ifp;
4652 	int err, rid;
4653 
4654 	sc->dev = dev;
4655 	mxge_fetch_tunables(sc);
4656 
4657 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4658 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4659 				  taskqueue_thread_enqueue, &sc->tq);
4660 	if (sc->tq == NULL) {
4661 		err = ENOMEM;
4662 		goto abort_with_nothing;
4663 	}
4664 
4665 	err = bus_dma_tag_create(NULL,			/* parent */
4666 				 1,			/* alignment */
4667 				 0,			/* boundary */
4668 				 BUS_SPACE_MAXADDR,	/* low */
4669 				 BUS_SPACE_MAXADDR,	/* high */
4670 				 NULL, NULL,		/* filter */
4671 				 65536 + 256,		/* maxsize */
4672 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4673 				 65536,			/* maxsegsize */
4674 				 0,			/* flags */
4675 				 NULL, NULL,		/* lock */
4676 				 &sc->parent_dmat);	/* tag */
4677 
4678 	if (err != 0) {
4679 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4680 			      err);
4681 		goto abort_with_tq;
4682 	}
4683 
4684 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4685 	if (ifp == NULL) {
4686 		device_printf(dev, "can not if_alloc()\n");
4687 		err = ENOSPC;
4688 		goto abort_with_parent_dmat;
4689 	}
4690 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4691 
4692 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4693 		 device_get_nameunit(dev));
4694 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4695 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4696 		 "%s:drv", device_get_nameunit(dev));
4697 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4698 		 MTX_NETWORK_LOCK, MTX_DEF);
4699 
4700 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4701 
4702 	mxge_setup_cfg_space(sc);
4703 
4704 	/* Map the board into the kernel */
4705 	rid = PCIR_BARS;
4706 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4707 					 ~0, 1, RF_ACTIVE);
4708 	if (sc->mem_res == NULL) {
4709 		device_printf(dev, "could not map memory\n");
4710 		err = ENXIO;
4711 		goto abort_with_lock;
4712 	}
4713 	sc->sram = rman_get_virtual(sc->mem_res);
4714 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4715 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4716 		device_printf(dev, "impossible memory region size %ld\n",
4717 			      rman_get_size(sc->mem_res));
4718 		err = ENXIO;
4719 		goto abort_with_mem_res;
4720 	}
4721 
4722 	/* make a NULL-terminated copy of the EEPROM strings section
4723 	   of the LANai SRAM */
4724 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4725 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4726 				rman_get_bushandle(sc->mem_res),
4727 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4728 				sc->eeprom_strings,
4729 				MXGE_EEPROM_STRINGS_SIZE - 2);
4730 	err = mxge_parse_strings(sc);
4731 	if (err != 0)
4732 		goto abort_with_mem_res;
4733 
4734 	/* Enable write combining for efficient use of PCIe bus */
4735 	mxge_enable_wc(sc);
4736 
4737 	/* Allocate the out of band dma memory */
4738 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4739 			     sizeof (mxge_cmd_t), 64);
4740 	if (err != 0)
4741 		goto abort_with_mem_res;
4742 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4743 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4744 	if (err != 0)
4745 		goto abort_with_cmd_dma;
4746 
4747 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4748 	if (err != 0)
4749 		goto abort_with_zeropad_dma;
4750 
4751 	/* select & load the firmware */
4752 	err = mxge_select_firmware(sc);
4753 	if (err != 0)
4754 		goto abort_with_dmabench;
4755 	sc->intr_coal_delay = mxge_intr_coal_delay;
4756 
4757 	mxge_slice_probe(sc);
4758 	err = mxge_alloc_slices(sc);
4759 	if (err != 0)
4760 		goto abort_with_dmabench;
4761 
4762 	err = mxge_reset(sc, 0);
4763 	if (err != 0)
4764 		goto abort_with_slices;
4765 
4766 	err = mxge_alloc_rings(sc);
4767 	if (err != 0) {
4768 		device_printf(sc->dev, "failed to allocate rings\n");
4769 		goto abort_with_slices;
4770 	}
4771 
4772 	err = mxge_add_irq(sc);
4773 	if (err != 0) {
4774 		device_printf(sc->dev, "failed to add irq\n");
4775 		goto abort_with_rings;
4776 	}
4777 
4778 	ifp->if_baudrate = IF_Gbps(10UL);
4779 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4780 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
4781 #ifdef INET
4782 	ifp->if_capabilities |= IFCAP_LRO;
4783 #endif
4784 
4785 #ifdef MXGE_NEW_VLAN_API
4786 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4787 
4788 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4789 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4790 	    sc->fw_ver_tiny >= 32)
4791 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4792 #endif
4793 
4794 	sc->max_mtu = mxge_max_mtu(sc);
4795 	if (sc->max_mtu >= 9000)
4796 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4797 	else
4798 		device_printf(dev, "MTU limited to %d.  Install "
4799 			      "latest firmware for 9000 byte jumbo support\n",
4800 			      sc->max_mtu - ETHER_HDR_LEN);
4801 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4802 	ifp->if_capenable = ifp->if_capabilities;
4803 	if (sc->lro_cnt == 0)
4804 		ifp->if_capenable &= ~IFCAP_LRO;
4805 	sc->csum_flag = 1;
4806 	ifp->if_init = mxge_init;
4807 	ifp->if_softc = sc;
4808 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4809 	ifp->if_ioctl = mxge_ioctl;
4810 	ifp->if_start = mxge_start;
4811 	/* Initialise the ifmedia structure */
4812 	ifmedia_init(&sc->media, 0, mxge_media_change,
4813 		     mxge_media_status);
4814 	mxge_media_init(sc);
4815 	mxge_media_probe(sc);
4816 	sc->dying = 0;
4817 	ether_ifattach(ifp, sc->mac_addr);
4818 	/* ether_ifattach sets mtu to ETHERMTU */
4819 	if (mxge_initial_mtu != ETHERMTU)
4820 		mxge_change_mtu(sc, mxge_initial_mtu);
4821 
4822 	mxge_add_sysctls(sc);
4823 #ifdef IFNET_BUF_RING
4824 	ifp->if_transmit = mxge_transmit;
4825 	ifp->if_qflush = mxge_qflush;
4826 #endif
4827 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4828 				device_get_nameunit(sc->dev));
4829 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4830 	return 0;
4831 
4832 abort_with_rings:
4833 	mxge_free_rings(sc);
4834 abort_with_slices:
4835 	mxge_free_slices(sc);
4836 abort_with_dmabench:
4837 	mxge_dma_free(&sc->dmabench_dma);
4838 abort_with_zeropad_dma:
4839 	mxge_dma_free(&sc->zeropad_dma);
4840 abort_with_cmd_dma:
4841 	mxge_dma_free(&sc->cmd_dma);
4842 abort_with_mem_res:
4843 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4844 abort_with_lock:
4845 	pci_disable_busmaster(dev);
4846 	mtx_destroy(&sc->cmd_mtx);
4847 	mtx_destroy(&sc->driver_mtx);
4848 	if_free(ifp);
4849 abort_with_parent_dmat:
4850 	bus_dma_tag_destroy(sc->parent_dmat);
4851 abort_with_tq:
4852 	if (sc->tq != NULL) {
4853 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4854 		taskqueue_free(sc->tq);
4855 		sc->tq = NULL;
4856 	}
4857 abort_with_nothing:
4858 	return err;
4859 }
4860 
4861 static int
4862 mxge_detach(device_t dev)
4863 {
4864 	mxge_softc_t *sc = device_get_softc(dev);
4865 
4866 	if (mxge_vlans_active(sc)) {
4867 		device_printf(sc->dev,
4868 			      "Detach vlans before removing module\n");
4869 		return EBUSY;
4870 	}
4871 	mtx_lock(&sc->driver_mtx);
4872 	sc->dying = 1;
4873 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4874 		mxge_close(sc, 0);
4875 	mtx_unlock(&sc->driver_mtx);
4876 	ether_ifdetach(sc->ifp);
4877 	if (sc->tq != NULL) {
4878 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4879 		taskqueue_free(sc->tq);
4880 		sc->tq = NULL;
4881 	}
4882 	callout_drain(&sc->co_hdl);
4883 	ifmedia_removeall(&sc->media);
4884 	mxge_dummy_rdma(sc, 0);
4885 	mxge_rem_sysctls(sc);
4886 	mxge_rem_irq(sc);
4887 	mxge_free_rings(sc);
4888 	mxge_free_slices(sc);
4889 	mxge_dma_free(&sc->dmabench_dma);
4890 	mxge_dma_free(&sc->zeropad_dma);
4891 	mxge_dma_free(&sc->cmd_dma);
4892 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4893 	pci_disable_busmaster(dev);
4894 	mtx_destroy(&sc->cmd_mtx);
4895 	mtx_destroy(&sc->driver_mtx);
4896 	if_free(sc->ifp);
4897 	bus_dma_tag_destroy(sc->parent_dmat);
4898 	return 0;
4899 }
4900 
4901 static int
4902 mxge_shutdown(device_t dev)
4903 {
4904 	return 0;
4905 }
4906 
4907 /*
4908   This file uses Myri10GE driver indentation.
4909 
4910   Local Variables:
4911   c-file-style:"linux"
4912   tab-width:8
4913   End:
4914 */
4915