xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 35a04710d7286aa9538917fd7f8e417dbee95b82)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2007, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/memrange.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48 #include <sys/sx.h>
49 
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <machine/bus.h>
68 #include <machine/in_cksum.h>
69 #include <machine/resource.h>
70 #include <sys/bus.h>
71 #include <sys/rman.h>
72 
73 #include <dev/pci/pcireg.h>
74 #include <dev/pci/pcivar.h>
75 
76 #include <vm/vm.h>		/* for pmap_mapdev() */
77 #include <vm/pmap.h>
78 
79 #if defined(__i386) || defined(__amd64)
80 #include <machine/specialreg.h>
81 #endif
82 
83 #include <dev/mxge/mxge_mcp.h>
84 #include <dev/mxge/mcp_gen_header.h>
85 #include <dev/mxge/if_mxge_var.h>
86 
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on nVidia bridges */
static int mxge_force_firmware = 0;	/* 0 = probe, 1 = force aligned fw, other = force unaligned fw */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay */
static int mxge_deassert_wait = 1;	/* wait for interrupt deassertion */
static int mxge_flow_control = 1;	/* enable flow control by default */
static int mxge_verbose = 0;		/* extra console chatter */
static int mxge_lro_cnt = 8;		/* LRO count (semantics set elsewhere in file) */
static int mxge_ticks;			/* tick period; initialized at attach */
/* firmware image names: "ethp" tolerates unaligned PCIe completions,
   "eth" requires aligned completions but allows 4KB read DMAs */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";

/* newbus device interface entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* forward declarations of helpers defined later in this file */
static int mxge_load_firmware(mxge_softc_t *sc);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
134 
135 static int
136 mxge_probe(device_t dev)
137 {
138   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
139       ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
140        (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
141 	  device_set_desc(dev, "Myri10G-PCIE-8A");
142 	  return 0;
143   }
144   return ENXIO;
145 }
146 
/*
 * Try to make the mapping of the NIC's SRAM (sc->sram) write-combining
 * so PIO bursts to the board can be merged into large writes.  Prefer
 * pmap_change_attr() (PAT); fall back to an MTRR via mem_range_attr_set().
 * sc->wc records whether write-combining is believed to be in effect.
 * No-op on non-x86 platforms.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	/* assume success; cleared below if the MTRR fallback fails too */
	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
	/* PAT route failed: cover the whole BAR with a w/c MTRR */
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		sc->wc = 0;
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	}
#endif
}
180 
181 
182 /* callback to get our DMA address */
183 static void
184 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
185 			 int error)
186 {
187 	if (error == 0) {
188 		*(bus_addr_t *) arg = segs->ds_addr;
189 	}
190 }
191 
/*
 * Allocate a physically contiguous, DMA-able region of `bytes' bytes
 * with the given alignment, and load it so dma->bus_addr holds the
 * device-visible address.  Fills in dma->dmat, dma->map and dma->addr.
 * Returns 0 on success or a bus_dma error; on failure everything
 * allocated so far is torn down.  Pair with mxge_dma_free().
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map; zeroed so the NIC never sees junk */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; callback stores the bus address in dma->bus_addr */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
242 
243 
/*
 * Release a region created by mxge_dma_alloc(): unload the map,
 * free the memory, then destroy the tag (reverse order of creation).
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
251 
252 /*
253  * The eeprom strings on the lanaiX have the format
254  * SN=x\0
255  * MAC=x:x:x:x:x:x\0
256  * PC=text\0
257  */
258 
/*
 * Parse the NUL-separated EEPROM strings (see format comment above)
 * into sc->mac_addr / product_code_string / serial_number_string.
 * Returns 0 if a MAC address was found, else ENXIO.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* skip to just past the current string's terminating NUL
   (note: the macro ignores its argument and operates on `ptr') */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* the +1 here plus the +3 at the top of the loop
			   below add up to strlen("MAC="), so the first
			   iteration lands on the first hex digit; each
			   later +3 steps over "xx:" */
			ptr += 1;
			/* NOTE(review): this leaves mac_addr_string
			   pointing at "AC=..." — confirm consumers
			   expect that offset */
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* softc is zeroed at allocation, so the final
			   byte stays NUL despite strncpy's semantics */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
301 
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on an upstream nVidia (CK804/MCP55) PCIe
 * bridge so that completions arriving at the NIC are 8-byte aligned,
 * letting the faster "aligned" firmware be used.  The enable bit lives
 * in extended config space (offset 0x178), which the OS config
 * accessors cannot reach here, so the chipset's memory-mapped config
 * aperture is mapped directly.  Best-effort: silently returns if the
 * bridge isn't a supported nVidia part.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* grandparent of the NIC is the PCIe bridge we must poke */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)	/* not nVidia */
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* ECAM-style layout: 1MB per bus, 4KB per (slot,function) */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it by checking that the
	   mapped IDs match what the bus glue reported */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* ECRC enable bit (0x40) in the register at offset 0x178 */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* non-x86 stub: the nForce4 hack above is x86-only */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
435 
436 
/*
 * Ask the firmware to benchmark DMA read, write, and read+write
 * against sc->dmabench_dma, storing the resulting throughput figures
 * in sc->read_dma / write_dma / read_write_dma.  For
 * MXGEFW_CMD_UNALIGNED_TEST the firmware instead aborts on the first
 * unaligned completion (E2BIG), which mxge_firmware_probe() uses to
 * choose a firmware image.  Returns 0 or the mxge_send_cmd() error.
 */
static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx.boundary;

	/* read test: multiplier 0x10000 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	/* transfers * len * 2 bytes-per-tick (0.5us ticks) */
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	/* write test: multiplier 0x1 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	/* combined read+write test: multiplier 0x10001 */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	/* extra *2 because each iteration moves data both ways */
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	/* failure of the unaligned probe is expected, so stay quiet */
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
497 
498 /*
499  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
500  * when the PCI-E Completion packets are aligned on an 8-byte
501  * boundary.  Some PCI-E chip sets always align Completion packets; on
502  * the ones that do not, the alignment can be enforced by enabling
503  * ECRC generation (if supported).
504  *
505  * When PCI-E Completion packets are not aligned, it is actually more
506  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
507  *
508  * If the driver can neither enable ECRC nor verify that it has
509  * already been enabled, then it must use a firmware image which works
510  * around unaligned completion packets (ethp_z8e.dat), and it should
511  * also ensure that it never gives the device a Read-DMA which is
512  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
513  * enabled, then the driver should use the aligned (eth_z8e.dat)
514  * firmware image, and set tx.boundary to 4KB.
515  */
516 
517 static int
518 mxge_firmware_probe(mxge_softc_t *sc)
519 {
520 	device_t dev = sc->dev;
521 	int reg, status;
522 	uint16_t pectl;
523 
524 	sc->tx.boundary = 4096;
525 	/*
526 	 * Verify the max read request size was set to 4KB
527 	 * before trying the test with 4KB.
528 	 */
529 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
530 		pectl = pci_read_config(dev, reg + 0x8, 2);
531 		if ((pectl & (5 << 12)) != (5 << 12)) {
532 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
533 				      pectl);
534 			sc->tx.boundary = 2048;
535 		}
536 	}
537 
538 	/*
539 	 * load the optimized firmware (which assumes aligned PCIe
540 	 * completions) in order to see if it works on this host.
541 	 */
542 	sc->fw_name = mxge_fw_aligned;
543 	status = mxge_load_firmware(sc);
544 	if (status != 0) {
545 		return status;
546 	}
547 
548 	/*
549 	 * Enable ECRC if possible
550 	 */
551 	mxge_enable_nvidia_ecrc(sc);
552 
553 	/*
554 	 * Run a DMA test which watches for unaligned completions and
555 	 * aborts on the first one seen.
556 	 */
557 
558 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
559 	if (status == 0)
560 		return 0; /* keep the aligned firmware */
561 
562 	if (status != E2BIG)
563 		device_printf(dev, "DMA test failed: %d\n", status);
564 	if (status == ENOSYS)
565 		device_printf(dev, "Falling back to ethp! "
566 			      "Please install up to date fw\n");
567 	return status;
568 }
569 
570 static int
571 mxge_select_firmware(mxge_softc_t *sc)
572 {
573 	int aligned = 0;
574 
575 
576 	if (mxge_force_firmware != 0) {
577 		if (mxge_force_firmware == 1)
578 			aligned = 1;
579 		else
580 			aligned = 0;
581 		if (mxge_verbose)
582 			device_printf(sc->dev,
583 				      "Assuming %s completions (forced)\n",
584 				      aligned ? "aligned" : "unaligned");
585 		goto abort;
586 	}
587 
588 	/* if the PCIe link width is 4 or less, we can use the aligned
589 	   firmware and skip any checks */
590 	if (sc->link_width != 0 && sc->link_width <= 4) {
591 		device_printf(sc->dev,
592 			      "PCIe x%d Link, expect reduced performance\n",
593 			      sc->link_width);
594 		aligned = 1;
595 		goto abort;
596 	}
597 
598 	if (0 == mxge_firmware_probe(sc))
599 		return 0;
600 
601 abort:
602 	if (aligned) {
603 		sc->fw_name = mxge_fw_aligned;
604 		sc->tx.boundary = 4096;
605 	} else {
606 		sc->fw_name = mxge_fw_unaligned;
607 		sc->tx.boundary = 2048;
608 	}
609 	return (mxge_load_firmware(sc));
610 }
611 
/*
 * Union used to shed a const qualifier without a cast warning
 * (e.g. read-only firmware data handed to APIs wanting char *).
 * NOTE(review): not referenced in this portion of the file — confirm
 * it is still used elsewhere before removing.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
617 
618 static int
619 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
620 {
621 
622 
623 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
624 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
625 			      be32toh(hdr->mcp_type));
626 		return EIO;
627 	}
628 
629 	/* save firmware version for sysctl */
630 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
631 	if (mxge_verbose)
632 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
633 
634 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
635 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
636 
637 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
638 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
639 		device_printf(sc->dev, "Found firmware version %s\n",
640 			      sc->fw_version);
641 		device_printf(sc->dev, "Driver needs %d.%d\n",
642 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
643 		return EINVAL;
644 	}
645 	return 0;
646 
647 }
648 
649 static void *
650 z_alloc(void *nil, u_int items, u_int size)
651 {
652         void *ptr;
653 
654         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
655         return ptr;
656 }
657 
/* zlib deallocator shim: release memory obtained via z_alloc() */
static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}
663 
664 
665 static int
666 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
667 {
668 	z_stream zs;
669 	char *inflate_buffer;
670 	const struct firmware *fw;
671 	const mcp_gen_header_t *hdr;
672 	unsigned hdr_offset;
673 	int status;
674 	unsigned int i;
675 	char dummy;
676 	size_t fw_len;
677 
678 	fw = firmware_get(sc->fw_name);
679 	if (fw == NULL) {
680 		device_printf(sc->dev, "Could not find firmware image %s\n",
681 			      sc->fw_name);
682 		return ENOENT;
683 	}
684 
685 
686 
687 	/* setup zlib and decompress f/w */
688 	bzero(&zs, sizeof (zs));
689 	zs.zalloc = z_alloc;
690 	zs.zfree = z_free;
691 	status = inflateInit(&zs);
692 	if (status != Z_OK) {
693 		status = EIO;
694 		goto abort_with_fw;
695 	}
696 
697 	/* the uncompressed size is stored as the firmware version,
698 	   which would otherwise go unused */
699 	fw_len = (size_t) fw->version;
700 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
701 	if (inflate_buffer == NULL)
702 		goto abort_with_zs;
703 	zs.avail_in = fw->datasize;
704 	zs.next_in = __DECONST(char *, fw->data);
705 	zs.avail_out = fw_len;
706 	zs.next_out = inflate_buffer;
707 	status = inflate(&zs, Z_FINISH);
708 	if (status != Z_STREAM_END) {
709 		device_printf(sc->dev, "zlib %d\n", status);
710 		status = EIO;
711 		goto abort_with_buffer;
712 	}
713 
714 	/* check id */
715 	hdr_offset = htobe32(*(const uint32_t *)
716 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
717 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
718 		device_printf(sc->dev, "Bad firmware file");
719 		status = EIO;
720 		goto abort_with_buffer;
721 	}
722 	hdr = (const void*)(inflate_buffer + hdr_offset);
723 
724 	status = mxge_validate_firmware(sc, hdr);
725 	if (status != 0)
726 		goto abort_with_buffer;
727 
728 	/* Copy the inflated firmware to NIC SRAM. */
729 	for (i = 0; i < fw_len; i += 256) {
730 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
731 			      inflate_buffer + i,
732 			      min(256U, (unsigned)(fw_len - i)));
733 		mb();
734 		dummy = *sc->sram;
735 		mb();
736 	}
737 
738 	*limit = fw_len;
739 	status = 0;
740 abort_with_buffer:
741 	free(inflate_buffer, M_TEMP);
742 abort_with_zs:
743 	inflateEnd(&zs);
744 abort_with_fw:
745 	firmware_put(fw, FIRMWARE_UNLOAD);
746 	return status;
747 }
748 
749 /*
750  * Enable or disable periodic RDMAs from the host to make certain
751  * chipsets resend dropped PCIe messages
752  */
753 
/*
 * Tell the firmware to start (enable=1) or stop (enable=0) issuing
 * periodic dummy RDMAs (see comment above).  The request is a
 * 64-byte PIO write to MXGEFW_BOOT_DUMMY_RDMA; completion is signaled
 * by the firmware DMA-ing 0xffffffff into the confirmation word.
 * Waits up to ~20ms, then just logs a warning on timeout.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* hand-align the stack buffer to an 8-byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll the confirmation word for up to 20 x 1ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
805 
806 static int
807 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
808 {
809 	mcp_cmd_t *buf;
810 	char buf_bytes[sizeof(*buf) + 8];
811 	volatile mcp_cmd_response_t *response = sc->cmd;
812 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
813 	uint32_t dma_low, dma_high;
814 	int err, sleep_total = 0;
815 
816 	/* ensure buf is aligned to 8 bytes */
817 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
818 
819 	buf->data0 = htobe32(data->data0);
820 	buf->data1 = htobe32(data->data1);
821 	buf->data2 = htobe32(data->data2);
822 	buf->cmd = htobe32(cmd);
823 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
824 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
825 
826 	buf->response_addr.low = htobe32(dma_low);
827 	buf->response_addr.high = htobe32(dma_high);
828 	mtx_lock(&sc->cmd_mtx);
829 	response->result = 0xffffffff;
830 	mb();
831 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
832 
833 	/* wait up to 20ms */
834 	err = EAGAIN;
835 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
836 		bus_dmamap_sync(sc->cmd_dma.dmat,
837 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
838 		mb();
839 		switch (be32toh(response->result)) {
840 		case 0:
841 			data->data0 = be32toh(response->data);
842 			err = 0;
843 			break;
844 		case 0xffffffff:
845 			DELAY(1000);
846 			break;
847 		case MXGEFW_CMD_UNKNOWN:
848 			err = ENOSYS;
849 			break;
850 		case MXGEFW_CMD_ERROR_UNALIGNED:
851 			err = E2BIG;
852 			break;
853 		case MXGEFW_CMD_ERROR_BUSY:
854 			err = EBUSY;
855 			break;
856 		default:
857 			device_printf(sc->dev,
858 				      "mxge: command %d "
859 				      "failed, result = %d\n",
860 				      cmd, be32toh(response->result));
861 			err = ENXIO;
862 			break;
863 		}
864 		if (err != EAGAIN)
865 			break;
866 	}
867 	if (err == EAGAIN)
868 		device_printf(sc->dev, "mxge: command %d timed out"
869 			      "result = %d\n",
870 			      cmd, be32toh(response->result));
871 	mtx_unlock(&sc->cmd_mtx);
872 	return err;
873 }
874 
/*
 * Fall back to the firmware already running on the NIC: locate its
 * header in SRAM, copy it to host memory, and validate it.  Also sets
 * sc->adopted_rx_filter_bug for firmware revisions 1.4.4 - 1.4.11,
 * which filter broadcasts unless kept in ALLMULTI.  Returns 0 or an
 * errno from validation.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header (offset stored big-endian) */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
923 
924 
/*
 * Load sc->fw_name into NIC SRAM and hand control to it via the boot
 * MCP.  If the image cannot be loaded, try to adopt whatever firmware
 * is already running (forcing the unaligned/2KB-boundary profile).
 * The handoff is confirmed by the firmware DMA-ing 0xffffffff into
 * the confirmation word; returns 0, ENXIO on handoff timeout, or the
 * adoption error.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* hand-align the stack buffer to an 8-byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		/* adopted firmware may be unaligned-capable only, so
		   drop back to the conservative profile */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll the confirmation word for up to 20 x 10ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
1007 
1008 static int
1009 mxge_update_mac_address(mxge_softc_t *sc)
1010 {
1011 	mxge_cmd_t cmd;
1012 	uint8_t *addr = sc->mac_addr;
1013 	int status;
1014 
1015 
1016 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1017 		     | (addr[2] << 8) | addr[3]);
1018 
1019 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1020 
1021 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1022 	return status;
1023 }
1024 
1025 static int
1026 mxge_change_pause(mxge_softc_t *sc, int pause)
1027 {
1028 	mxge_cmd_t cmd;
1029 	int status;
1030 
1031 	if (pause)
1032 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1033 				       &cmd);
1034 	else
1035 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1036 				       &cmd);
1037 
1038 	if (status) {
1039 		device_printf(sc->dev, "Failed to set flow control mode\n");
1040 		return ENXIO;
1041 	}
1042 	sc->pause = pause;
1043 	return 0;
1044 }
1045 
1046 static void
1047 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1048 {
1049 	mxge_cmd_t cmd;
1050 	int status;
1051 
1052 	if (promisc)
1053 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1054 				       &cmd);
1055 	else
1056 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1057 				       &cmd);
1058 
1059 	if (status) {
1060 		device_printf(sc->dev, "Failed to set promisc mode\n");
1061 	}
1062 }
1063 
/*
 * Reprogram the firmware's multicast filter from the interface's
 * multicast list.  Sequence: go ALLMULTI while editing, flush the old
 * filters, join each link-level group, then re-enable filtering.  On
 * any error the NIC is left in ALLMULTI (accepting all multicast),
 * which is safe but unfiltered.  Also stays in ALLMULTI when IFF_ALLMULTI
 * is requested or for the adopted-firmware rx filter bug.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* buggy adopted firmware must stay in ALLMULTI (see
	   mxge_adopt_running_firmware) */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte MAC across data0 (4) / data1 (2),
		   converted to the firmware's big-endian layout */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
1131 
1132 static int
1133 mxge_max_mtu(mxge_softc_t *sc)
1134 {
1135 	mxge_cmd_t cmd;
1136 	int status;
1137 
1138 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1139 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1140 
1141 	/* try to set nbufs to see if it we can
1142 	   use virtually contiguous jumbos */
1143 	cmd.data0 = 0;
1144 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1145 			       &cmd);
1146 	if (status == 0)
1147 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1148 
1149 	/* otherwise, we're limited to MJUMPAGESIZE */
1150 	return MJUMPAGESIZE - MXGEFW_PAD;
1151 }
1152 
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	/*
	 * Reset the firmware to a known state and re-establish the
	 * driver/firmware shared state: interrupt queue placement
	 * (when interrupts_setup is non-zero), the sram offsets of the
	 * interrupt-coalescing / IRQ ack / IRQ deassert registers, all
	 * ring indices and counters, and the receive filter (MAC
	 * address, promisc off, pause, multicast list).
	 * Returns 0 on success or an errno-style value.
	 */

	mxge_cmd_t cmd;
	size_t bytes;
	int status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	/* warm up the firmware's DMA engine */
	mxge_dummy_rdma(sc, 1);

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		/* tell the firmware the size and bus address of the
		   host-side rx completion (interrupt) queue */
		bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
		memset(sc->rx_done.entry, 0, bytes);
		cmd.data0 = (uint32_t)bytes;
		status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
		cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
		cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
		status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
	}

	/* learn where the firmware exposes its interrupt-control
	   registers inside the mapped sram window */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}

	/* program the current coalescing delay; the NIC reads this
	   location in big-endian */

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	/* reset mcp/driver shared state back to 0 */
	sc->rx_done.idx = 0;
	sc->rx_done.cnt = 0;
	sc->tx.req = 0;
	sc->tx.done = 0;
	sc->tx.pkt_done = 0;
	sc->tx.wake = 0;
	sc->tx_defrag = 0;
	sc->tx.stall = 0;
	sc->rx_big.cnt = 0;
	sc->rx_small.cnt = 0;
	sc->rdma_tags_available = 15;
	sc->fw_stats->valid = 0;
	sc->fw_stats->send_done_count = 0;
	sc->lro_bad_csum = 0;
	sc->lro_queued = 0;
	sc->lro_flushed = 0;
	/* restore the receive filter settings */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}
1231 
1232 static int
1233 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1234 {
1235         mxge_softc_t *sc;
1236         unsigned int intr_coal_delay;
1237         int err;
1238 
1239         sc = arg1;
1240         intr_coal_delay = sc->intr_coal_delay;
1241         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1242         if (err != 0) {
1243                 return err;
1244         }
1245         if (intr_coal_delay == sc->intr_coal_delay)
1246                 return 0;
1247 
1248         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1249                 return EINVAL;
1250 
1251 	mtx_lock(&sc->driver_mtx);
1252 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1253 	sc->intr_coal_delay = intr_coal_delay;
1254 
1255 	mtx_unlock(&sc->driver_mtx);
1256         return err;
1257 }
1258 
1259 static int
1260 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1261 {
1262         mxge_softc_t *sc;
1263         unsigned int enabled;
1264         int err;
1265 
1266         sc = arg1;
1267         enabled = sc->pause;
1268         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1269         if (err != 0) {
1270                 return err;
1271         }
1272         if (enabled == sc->pause)
1273                 return 0;
1274 
1275 	mtx_lock(&sc->driver_mtx);
1276 	err = mxge_change_pause(sc, enabled);
1277 	mtx_unlock(&sc->driver_mtx);
1278         return err;
1279 }
1280 
1281 static int
1282 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1283 {
1284 	struct ifnet *ifp;
1285 	int err = 0;
1286 
1287 	ifp = sc->ifp;
1288 	if (lro_cnt == 0)
1289 		ifp->if_capenable &= ~IFCAP_LRO;
1290 	else
1291 		ifp->if_capenable |= IFCAP_LRO;
1292 	sc->lro_cnt = lro_cnt;
1293 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1294 		callout_stop(&sc->co_hdl);
1295 		mxge_close(sc);
1296 		err = mxge_open(sc);
1297 		if (err == 0)
1298 			callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
1299 	}
1300 	return err;
1301 }
1302 
1303 static int
1304 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1305 {
1306 	mxge_softc_t *sc;
1307 	unsigned int lro_cnt;
1308 	int err;
1309 
1310 	sc = arg1;
1311 	lro_cnt = sc->lro_cnt;
1312 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1313 	if (err != 0)
1314 		return err;
1315 
1316 	if (lro_cnt == sc->lro_cnt)
1317 		return 0;
1318 
1319 	if (lro_cnt > 128)
1320 		return EINVAL;
1321 
1322 	mtx_lock(&sc->driver_mtx);
1323 	err = mxge_change_lro_locked(sc, lro_cnt);
1324 	mtx_unlock(&sc->driver_mtx);
1325 	return err;
1326 }
1327 
1328 static int
1329 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1330 {
1331         int err;
1332 
1333         if (arg1 == NULL)
1334                 return EFAULT;
1335         arg2 = be32toh(*(int *)arg1);
1336         arg1 = NULL;
1337         err = sysctl_handle_int(oidp, arg1, arg2, req);
1338 
1339         return err;
1340 }
1341 
1342 static void
1343 mxge_add_sysctls(mxge_softc_t *sc)
1344 {
1345 	struct sysctl_ctx_list *ctx;
1346 	struct sysctl_oid_list *children;
1347 	mcp_irq_data_t *fw;
1348 
1349 	ctx = device_get_sysctl_ctx(sc->dev);
1350 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1351 	fw = sc->fw_stats;
1352 
1353 	/* random information */
1354 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1355 		       "firmware_version",
1356 		       CTLFLAG_RD, &sc->fw_version,
1357 		       0, "firmware version");
1358 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1359 		       "serial_number",
1360 		       CTLFLAG_RD, &sc->serial_number_string,
1361 		       0, "serial number");
1362 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1363 		       "product_code",
1364 		       CTLFLAG_RD, &sc->product_code_string,
1365 		       0, "product_code");
1366 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1367 		       "pcie_link_width",
1368 		       CTLFLAG_RD, &sc->link_width,
1369 		       0, "tx_boundary");
1370 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1371 		       "tx_boundary",
1372 		       CTLFLAG_RD, &sc->tx.boundary,
1373 		       0, "tx_boundary");
1374 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1375 		       "write_combine",
1376 		       CTLFLAG_RD, &sc->wc,
1377 		       0, "write combining PIO?");
1378 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1379 		       "read_dma_MBs",
1380 		       CTLFLAG_RD, &sc->read_dma,
1381 		       0, "DMA Read speed in MB/s");
1382 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1383 		       "write_dma_MBs",
1384 		       CTLFLAG_RD, &sc->write_dma,
1385 		       0, "DMA Write speed in MB/s");
1386 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1387 		       "read_write_dma_MBs",
1388 		       CTLFLAG_RD, &sc->read_write_dma,
1389 		       0, "DMA concurrent Read/Write speed in MB/s");
1390 
1391 
1392 	/* performance related tunables */
1393 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1394 			"intr_coal_delay",
1395 			CTLTYPE_INT|CTLFLAG_RW, sc,
1396 			0, mxge_change_intr_coal,
1397 			"I", "interrupt coalescing delay in usecs");
1398 
1399 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1400 			"flow_control_enabled",
1401 			CTLTYPE_INT|CTLFLAG_RW, sc,
1402 			0, mxge_change_flow_control,
1403 			"I", "interrupt coalescing delay in usecs");
1404 
1405 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1406 		       "deassert_wait",
1407 		       CTLFLAG_RW, &mxge_deassert_wait,
1408 		       0, "Wait for IRQ line to go low in ihandler");
1409 
1410 	/* stats block from firmware is in network byte order.
1411 	   Need to swap it */
1412 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1413 			"link_up",
1414 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1415 			0, mxge_handle_be32,
1416 			"I", "link up");
1417 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1418 			"rdma_tags_available",
1419 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1420 			0, mxge_handle_be32,
1421 			"I", "rdma_tags_available");
1422 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1423 			"dropped_bad_crc32",
1424 			CTLTYPE_INT|CTLFLAG_RD,
1425 			&fw->dropped_bad_crc32,
1426 			0, mxge_handle_be32,
1427 			"I", "dropped_bad_crc32");
1428 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1429 			"dropped_bad_phy",
1430 			CTLTYPE_INT|CTLFLAG_RD,
1431 			&fw->dropped_bad_phy,
1432 			0, mxge_handle_be32,
1433 			"I", "dropped_bad_phy");
1434 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1435 			"dropped_link_error_or_filtered",
1436 			CTLTYPE_INT|CTLFLAG_RD,
1437 			&fw->dropped_link_error_or_filtered,
1438 			0, mxge_handle_be32,
1439 			"I", "dropped_link_error_or_filtered");
1440 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1441 			"dropped_link_overflow",
1442 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1443 			0, mxge_handle_be32,
1444 			"I", "dropped_link_overflow");
1445 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1446 			"dropped_multicast_filtered",
1447 			CTLTYPE_INT|CTLFLAG_RD,
1448 			&fw->dropped_multicast_filtered,
1449 			0, mxge_handle_be32,
1450 			"I", "dropped_multicast_filtered");
1451 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1452 			"dropped_no_big_buffer",
1453 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1454 			0, mxge_handle_be32,
1455 			"I", "dropped_no_big_buffer");
1456 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1457 			"dropped_no_small_buffer",
1458 			CTLTYPE_INT|CTLFLAG_RD,
1459 			&fw->dropped_no_small_buffer,
1460 			0, mxge_handle_be32,
1461 			"I", "dropped_no_small_buffer");
1462 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1463 			"dropped_overrun",
1464 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1465 			0, mxge_handle_be32,
1466 			"I", "dropped_overrun");
1467 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1468 			"dropped_pause",
1469 			CTLTYPE_INT|CTLFLAG_RD,
1470 			&fw->dropped_pause,
1471 			0, mxge_handle_be32,
1472 			"I", "dropped_pause");
1473 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1474 			"dropped_runt",
1475 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1476 			0, mxge_handle_be32,
1477 			"I", "dropped_runt");
1478 
1479 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1480 			"dropped_unicast_filtered",
1481 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1482 			0, mxge_handle_be32,
1483 			"I", "dropped_unicast_filtered");
1484 
1485 	/* host counters exported for debugging */
1486 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1487 		       "rx_small_cnt",
1488 		       CTLFLAG_RD, &sc->rx_small.cnt,
1489 		       0, "rx_small_cnt");
1490 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1491 		       "rx_big_cnt",
1492 		       CTLFLAG_RD, &sc->rx_big.cnt,
1493 		       0, "rx_small_cnt");
1494 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1495 		       "tx_req",
1496 		       CTLFLAG_RD, &sc->tx.req,
1497 		       0, "tx_req");
1498 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1499 		       "tx_done",
1500 		       CTLFLAG_RD, &sc->tx.done,
1501 		       0, "tx_done");
1502 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1503 		       "tx_pkt_done",
1504 		       CTLFLAG_RD, &sc->tx.pkt_done,
1505 		       0, "tx_done");
1506 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1507 		       "tx_stall",
1508 		       CTLFLAG_RD, &sc->tx.stall,
1509 		       0, "tx_stall");
1510 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1511 		       "tx_wake",
1512 		       CTLFLAG_RD, &sc->tx.wake,
1513 		       0, "tx_wake");
1514 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1515 		       "tx_defrag",
1516 		       CTLFLAG_RD, &sc->tx_defrag,
1517 		       0, "tx_defrag");
1518 
1519 	/* verbose printing? */
1520 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1521 		       "verbose",
1522 		       CTLFLAG_RW, &mxge_verbose,
1523 		       0, "verbose printing");
1524 
1525 	/* lro */
1526 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1527 			"lro_cnt",
1528 			CTLTYPE_INT|CTLFLAG_RW, sc,
1529 			0, mxge_change_lro,
1530 			"I", "number of lro merge queues");
1531 
1532 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1533 		       "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
1534 		       0, "number of lro merge queues flushed");
1535 
1536 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1537 		       "lro_queued", CTLFLAG_RD, &sc->lro_queued,
1538 		       0, "number of frames appended to lro merge queues");
1539 
1540 }
1541 
/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        /* Copy all but src[0] to the NIC, last descriptor first, so
           the firmware never sees an earlier descriptor without its
           successors already in place.  src[0] itself is written and
           made valid by the caller (mxge_submit_req). */
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                /* mask handles wrap-around of the ring */
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb(); /* order each PIO burst before the next */
        }
}
1559 
1560 /*
1561  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1562  * at most 32 bytes at a time, so as to avoid involving the software
1563  * pio handler in the nic.   We re-write the first segment's flags
1564  * to mark them valid only after writing the entire chain
1565  */
1566 
1567 static inline void
1568 mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
1569                   int cnt)
1570 {
1571         int idx, i;
1572         uint32_t *src_ints;
1573 	volatile uint32_t *dst_ints;
1574         mcp_kreq_ether_send_t *srcp;
1575 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1576 	uint8_t last_flags;
1577 
1578         idx = tx->req & tx->mask;
1579 
1580 	last_flags = src->flags;
1581 	src->flags = 0;
1582         mb();
1583         dst = dstp = &tx->lanai[idx];
1584         srcp = src;
1585 
1586         if ((idx + cnt) < tx->mask) {
1587                 for (i = 0; i < (cnt - 1); i += 2) {
1588                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1589                         mb(); /* force write every 32 bytes */
1590                         srcp += 2;
1591                         dstp += 2;
1592                 }
1593         } else {
1594                 /* submit all but the first request, and ensure
1595                    that it is submitted below */
1596                 mxge_submit_req_backwards(tx, src, cnt);
1597                 i = 0;
1598         }
1599         if (i < cnt) {
1600                 /* submit the first request */
1601                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1602                 mb(); /* barrier before setting valid flag */
1603         }
1604 
1605         /* re-write the last 32-bits with the valid flags */
1606         src->flags = last_flags;
1607         src_ints = (uint32_t *)src;
1608         src_ints+=3;
1609         dst_ints = (volatile uint32_t *)dst;
1610         dst_ints+=3;
1611         *dst_ints =  *src_ints;
1612         tx->req += cnt;
1613         mb();
1614 }
1615 
1616 static void
1617 mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt,
1618 	       int ip_off)
1619 {
1620 	mxge_tx_buf_t *tx;
1621 	mcp_kreq_ether_send_t *req;
1622 	bus_dma_segment_t *seg;
1623 	struct ip *ip;
1624 	struct tcphdr *tcp;
1625 	uint32_t low, high_swapped;
1626 	int len, seglen, cum_len, cum_len_next;
1627 	int next_is_first, chop, cnt, rdma_count, small;
1628 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1629 	uint8_t flags, flags_next;
1630 	static int once;
1631 
1632 	mss = m->m_pkthdr.tso_segsz;
1633 
1634 	/* negative cum_len signifies to the
1635 	 * send loop that we are still in the
1636 	 * header portion of the TSO packet.
1637 	 */
1638 
1639 	/* ensure we have the ethernet, IP and TCP
1640 	   header together in the first mbuf, copy
1641 	   it to a scratch buffer if not */
1642 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1643 		m_copydata(m, 0, ip_off + sizeof (*ip),
1644 			   sc->scratch);
1645 		ip = (struct ip *)(sc->scratch + ip_off);
1646 	} else {
1647 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1648 	}
1649 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1650 			    + sizeof (*tcp))) {
1651 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1652 			   + sizeof (*tcp),  sc->scratch);
1653 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1654 	}
1655 
1656 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1657 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1658 
1659 	/* TSO implies checksum offload on this hardware */
1660 	cksum_offset = ip_off + (ip->ip_hl << 2);
1661 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1662 
1663 
1664 	/* for TSO, pseudo_hdr_offset holds mss.
1665 	 * The firmware figures out where to put
1666 	 * the checksum by parsing the header. */
1667 	pseudo_hdr_offset = htobe16(mss);
1668 
1669 	tx = &sc->tx;
1670 	req = tx->req_list;
1671 	seg = tx->seg_list;
1672 	cnt = 0;
1673 	rdma_count = 0;
1674 	/* "rdma_count" is the number of RDMAs belonging to the
1675 	 * current packet BEFORE the current send request. For
1676 	 * non-TSO packets, this is equal to "count".
1677 	 * For TSO packets, rdma_count needs to be reset
1678 	 * to 0 after a segment cut.
1679 	 *
1680 	 * The rdma_count field of the send request is
1681 	 * the number of RDMAs of the packet starting at
1682 	 * that request. For TSO send requests with one ore more cuts
1683 	 * in the middle, this is the number of RDMAs starting
1684 	 * after the last cut in the request. All previous
1685 	 * segments before the last cut implicitly have 1 RDMA.
1686 	 *
1687 	 * Since the number of RDMAs is not known beforehand,
1688 	 * it must be filled-in retroactively - after each
1689 	 * segmentation cut or at the end of the entire packet.
1690 	 */
1691 
1692 	while (busdma_seg_cnt) {
1693 		/* Break the busdma segment up into pieces*/
1694 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1695 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1696 		len = seg->ds_len;
1697 
1698 		while (len) {
1699 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1700 			seglen = len;
1701 			cum_len_next = cum_len + seglen;
1702 			(req-rdma_count)->rdma_count = rdma_count + 1;
1703 			if (__predict_true(cum_len >= 0)) {
1704 				/* payload */
1705 				chop = (cum_len_next > mss);
1706 				cum_len_next = cum_len_next % mss;
1707 				next_is_first = (cum_len_next == 0);
1708 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1709 				flags_next |= next_is_first *
1710 					MXGEFW_FLAGS_FIRST;
1711 				rdma_count |= -(chop | next_is_first);
1712 				rdma_count += chop & !next_is_first;
1713 			} else if (cum_len_next >= 0) {
1714 				/* header ends */
1715 				rdma_count = -1;
1716 				cum_len_next = 0;
1717 				seglen = -cum_len;
1718 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1719 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1720 					MXGEFW_FLAGS_FIRST |
1721 					(small * MXGEFW_FLAGS_SMALL);
1722 			    }
1723 
1724 			req->addr_high = high_swapped;
1725 			req->addr_low = htobe32(low);
1726 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1727 			req->pad = 0;
1728 			req->rdma_count = 1;
1729 			req->length = htobe16(seglen);
1730 			req->cksum_offset = cksum_offset;
1731 			req->flags = flags | ((cum_len & 1) *
1732 					      MXGEFW_FLAGS_ALIGN_ODD);
1733 			low += seglen;
1734 			len -= seglen;
1735 			cum_len = cum_len_next;
1736 			flags = flags_next;
1737 			req++;
1738 			cnt++;
1739 			rdma_count++;
1740 			if (__predict_false(cksum_offset > seglen))
1741 				cksum_offset -= seglen;
1742 			else
1743 				cksum_offset = 0;
1744 			if (__predict_false(cnt > tx->max_desc))
1745 				goto drop;
1746 		}
1747 		busdma_seg_cnt--;
1748 		seg++;
1749 	}
1750 	(req-rdma_count)->rdma_count = rdma_count;
1751 
1752 	do {
1753 		req--;
1754 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1755 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1756 
1757 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1758 	mxge_submit_req(tx, tx->req_list, cnt);
1759 	return;
1760 
1761 drop:
1762 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1763 	m_freem(m);
1764 	sc->ifp->if_oerrors++;
1765 	if (!once) {
1766 		printf("tx->max_desc exceeded via TSO!\n");
1767 		printf("mss = %d, %ld, %d!\n", mss,
1768 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1769 		once = 1;
1770 	}
1771 	return;
1772 
1773 }
1774 
1775 /*
1776  * We reproduce the software vlan tag insertion from
1777  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1778  * vlan tag insertion. We need to advertise this in order to have the
1779  * vlan interface respect our csum offload flags.
1780  */
1781 static struct mbuf *
1782 mxge_vlan_tag_insert(struct mbuf *m)
1783 {
1784 	struct ether_vlan_header *evl;
1785 
1786 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1787 	if (__predict_false(m == NULL))
1788 		return NULL;
1789 	if (m->m_len < sizeof(*evl)) {
1790 		m = m_pullup(m, sizeof(*evl));
1791 		if (__predict_false(m == NULL))
1792 			return NULL;
1793 	}
1794 	/*
1795 	 * Transform the Ethernet header into an Ethernet header
1796 	 * with 802.1Q encapsulation.
1797 	 */
1798 	evl = mtod(m, struct ether_vlan_header *);
1799 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1800 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1801 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1802 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1803 	m->m_flags &= ~M_VLANTAG;
1804 	return m;
1805 }
1806 
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	/*
	 * Map one outbound mbuf chain for DMA and hand it to the NIC:
	 * software-insert any vlan tag, load the chain (defragging
	 * once on EFBIG), then either punt to the TSO path or build a
	 * plain send-descriptor list with optional checksum offload
	 * and runt padding.  Consumes the mbuf on error.
	 */
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* the NIC has no vlan hardware tagging; inline the tag */
	ip_off = sizeof (struct ether_header);
	if (m->m_flags & M_VLANTAG) {
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop;
		ip_off += ETHER_VLAN_ENCAP_LEN;
	}

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		sc->tx_defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt, ip_off);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
			m_copydata(m, 0, ip_off + sizeof (*ip),
				   sc->scratch);
			ip = (struct ip *)(sc->scratch + ip_off);
		} else {
			ip = (struct ip *)(mtod(m, char *) + ip_off);
		}
		/* tell the firmware where the L4 checksum field is */
		cksum_offset = ip_off + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* cksum_offset is relative to the current segment;
		   once past it, further segments carry offset 0 */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* extra descriptor pointing at a DMA-able block of
		   zeros supplies the padding */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* remember which slot finishes this packet, then submit */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1961 
1962 
1963 
1964 
1965 static inline void
1966 mxge_start_locked(mxge_softc_t *sc)
1967 {
1968 	struct mbuf *m;
1969 	struct ifnet *ifp;
1970 	mxge_tx_buf_t *tx;
1971 
1972 	ifp = sc->ifp;
1973 	tx = &sc->tx;
1974 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1975 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1976 		if (m == NULL) {
1977 			return;
1978 		}
1979 		/* let BPF see it */
1980 		BPF_MTAP(ifp, m);
1981 
1982 		/* give it to the nic */
1983 		mxge_encap(sc, m);
1984 	}
1985 	/* ran out of transmit slots */
1986 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1987 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1988 		tx->stall++;
1989 	}
1990 }
1991 
1992 static void
1993 mxge_start(struct ifnet *ifp)
1994 {
1995 	mxge_softc_t *sc = ifp->if_softc;
1996 
1997 
1998 	mtx_lock(&sc->tx_mtx);
1999 	mxge_start_locked(sc);
2000 	mtx_unlock(&sc->tx_mtx);
2001 }
2002 
2003 /*
2004  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2005  * at most 32 bytes at a time, so as to avoid involving the software
2006  * pio handler in the nic.   We re-write the first segment's low
2007  * DMA address to mark it valid only after we write the entire chunk
2008  * in a burst
2009  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* stash the first descriptor's low address and poison it so
	   the NIC ignores the block until it is fully written */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* two 32-byte PIO bursts cover all 8 descriptors */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address, making the block valid */
	src->addr_low = low;
	dst->addr_low = low;
	mb();
}
2026 
2027 static int
2028 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
2029 {
2030 	bus_dma_segment_t seg;
2031 	struct mbuf *m;
2032 	mxge_rx_buf_t *rx = &sc->rx_small;
2033 	int cnt, err;
2034 
2035 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2036 	if (m == NULL) {
2037 		rx->alloc_fail++;
2038 		err = ENOBUFS;
2039 		goto done;
2040 	}
2041 	m->m_len = MHLEN;
2042 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2043 				      &seg, &cnt, BUS_DMA_NOWAIT);
2044 	if (err != 0) {
2045 		m_free(m);
2046 		goto done;
2047 	}
2048 	rx->info[idx].m = m;
2049 	rx->shadow[idx].addr_low =
2050 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2051 	rx->shadow[idx].addr_high =
2052 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2053 
2054 done:
2055 	if ((idx & 7) == 7)
2056 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2057 	return err;
2058 }
2059 
static int
mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
	/*
	 * Allocate and DMA-map one big receive cluster for ring slot
	 * idx.  A cluster may map to several segments (rx->nbufs
	 * descriptors); their addresses are recorded in the shadow
	 * ring, and any 8-descriptor group completed within the
	 * covered range is pushed to the NIC — even on allocation
	 * failure, to keep the NIC's ring position in sync.
	 */
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_buf_t *rx = &sc->rx_big;
	int cnt, err, i;

	/* regular 2KB clusters come from the common zone */
	if (rx->cl_size == MCLBYTES)
		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	else
		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->cl_size;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;

	/* one shadow descriptor per DMA segment of the cluster */
	for (i = 0; i < cnt; i++) {
		rx->shadow[idx + i].addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
		rx->shadow[idx + i].addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
       }


done:
       /* walk the nbufs slots this cluster occupies and submit any
	  8-aligned group that ends within them */
       for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2104 
2105 /*
2106  *  Myri10GE hardware checksums are not valid if the sender
2107  *  padded the frame with non-zero padding.  This is because
2108  *  the firmware just does a simple 16-bit 1s complement
2109  *  checksum across the entire frame, excluding the first 14
2110  *  bytes.  It is best to simply to check the checksum and
2111  *  tell the stack about it only if the checksum is good
2112  */
2113 
2114 static inline uint16_t
2115 mxge_rx_csum(struct mbuf *m, int csum)
2116 {
2117 	struct ether_header *eh;
2118 	struct ip *ip;
2119 	uint16_t c;
2120 
2121 	eh = mtod(m, struct ether_header *);
2122 
2123 	/* only deal with IPv4 TCP & UDP for now */
2124 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2125 		return 1;
2126 	ip = (struct ip *)(eh + 1);
2127 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2128 			    ip->ip_p != IPPROTO_UDP))
2129 		return 1;
2130 
2131 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2132 		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
2133 			    - (ip->ip_hl << 2) + ip->ip_p));
2134 	c ^= 0xffff;
2135 	return (c);
2136 }
2137 
/*
 * Strip an 802.1q header from the front of the frame, record the
 * VLAN tag in the mbuf packet header, and adjust the firmware's
 * partial checksum so it no longer covers the 4 encapsulation
 * bytes (1s-complement arithmetic throughout).
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	/* the 4 bytes being removed, read as one 32-bit word */
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* subtract via 1s-complement add of the complement, then
	   fold the carries back into 16 bits (twice, since the first
	   fold can itself carry) */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
	m->m_flags |= M_VLANTAG;
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2180 
2181 
/*
 * Handle one completed receive on the big (cluster) ring: refill
 * the slot, recycle the DMA map, strip any VLAN header, validate
 * the hardware checksum, and hand the frame to LRO or the stack.
 */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
{
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_buf_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	ifp = sc->ifp;
	rx = &sc->rx_big;
	idx = rx->cnt & rx->mask;
	/* a single big buffer spans nbufs ring slots */
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's: the spare map now owns this slot */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ifp->if_ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		/* on success the mbuf was presumably consumed by LRO --
		   NOTE(review): confirm mxge_lro_rx ownership contract */
		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2239 
2240 static inline void
2241 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2242 {
2243 	struct ifnet *ifp;
2244 	struct ether_header *eh;
2245 	struct mbuf *m;
2246 	mxge_rx_buf_t *rx;
2247 	bus_dmamap_t old_map;
2248 	int idx;
2249 	uint16_t tcpudp_csum;
2250 
2251 	ifp = sc->ifp;
2252 	rx = &sc->rx_small;
2253 	idx = rx->cnt & rx->mask;
2254 	rx->cnt++;
2255 	/* save a pointer to the received mbuf */
2256 	m = rx->info[idx].m;
2257 	/* try to replace the received mbuf */
2258 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2259 		/* drop the frame -- the old mbuf is re-cycled */
2260 		ifp->if_ierrors++;
2261 		return;
2262 	}
2263 
2264 	/* unmap the received buffer */
2265 	old_map = rx->info[idx].map;
2266 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2267 	bus_dmamap_unload(rx->dmat, old_map);
2268 
2269 	/* swap the bus_dmamap_t's */
2270 	rx->info[idx].map = rx->extra_map;
2271 	rx->extra_map = old_map;
2272 
2273 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2274 	 * aligned */
2275 	m->m_data += MXGEFW_PAD;
2276 
2277 	m->m_pkthdr.rcvif = ifp;
2278 	m->m_len = m->m_pkthdr.len = len;
2279 	ifp->if_ipackets++;
2280 	eh = mtod(m, struct ether_header *);
2281 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2282 		mxge_vlan_tag_remove(m, &csum);
2283 	}
2284 	/* if the checksum is valid, mark it in the mbuf header */
2285 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2286 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2287 			return;
2288 		/* otherwise, it was a UDP frame, or a TCP frame which
2289 		   we could not do LRO on.  Tell the stack that the
2290 		   checksum is good */
2291 		m->m_pkthdr.csum_data = 0xffff;
2292 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2293 	}
2294 
2295 	/* pass the frame up the stack */
2296 	(*ifp->if_input)(ifp, m);
2297 }
2298 
2299 static inline void
2300 mxge_clean_rx_done(mxge_softc_t *sc)
2301 {
2302 	mxge_rx_done_t *rx_done = &sc->rx_done;
2303 	struct lro_entry *lro;
2304 	int limit = 0;
2305 	uint16_t length;
2306 	uint16_t checksum;
2307 
2308 
2309 	while (rx_done->entry[rx_done->idx].length != 0) {
2310 		length = ntohs(rx_done->entry[rx_done->idx].length);
2311 		rx_done->entry[rx_done->idx].length = 0;
2312 		checksum = rx_done->entry[rx_done->idx].checksum;
2313 		if (length <= (MHLEN - MXGEFW_PAD))
2314 			mxge_rx_done_small(sc, length, checksum);
2315 		else
2316 			mxge_rx_done_big(sc, length, checksum);
2317 		rx_done->cnt++;
2318 		rx_done->idx = rx_done->cnt & rx_done->mask;
2319 
2320 		/* limit potential for livelock */
2321 		if (__predict_false(++limit > rx_done->mask / 2))
2322 			break;
2323 	}
2324 	while(!SLIST_EMPTY(&sc->lro_active)) {
2325 		lro = SLIST_FIRST(&sc->lro_active);
2326 		SLIST_REMOVE_HEAD(&sc->lro_active, next);
2327 		mxge_lro_flush(sc, lro);
2328 	}
2329 }
2330 
2331 
2332 static inline void
2333 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2334 {
2335 	struct ifnet *ifp;
2336 	mxge_tx_buf_t *tx;
2337 	struct mbuf *m;
2338 	bus_dmamap_t map;
2339 	int idx;
2340 
2341 	tx = &sc->tx;
2342 	ifp = sc->ifp;
2343 	while (tx->pkt_done != mcp_idx) {
2344 		idx = tx->done & tx->mask;
2345 		tx->done++;
2346 		m = tx->info[idx].m;
2347 		/* mbuf and DMA map only attached to the first
2348 		   segment per-mbuf */
2349 		if (m != NULL) {
2350 			ifp->if_opackets++;
2351 			tx->info[idx].m = NULL;
2352 			map = tx->info[idx].map;
2353 			bus_dmamap_unload(tx->dmat, map);
2354 			m_freem(m);
2355 		}
2356 		if (tx->info[idx].flag) {
2357 			tx->info[idx].flag = 0;
2358 			tx->pkt_done++;
2359 		}
2360 	}
2361 
2362 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2363            its OK to send packets */
2364 
2365 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2366 	    tx->req - tx->done < (tx->mask + 1)/4) {
2367 		mtx_lock(&sc->tx_mtx);
2368 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2369 		sc->tx.wake++;
2370 		mxge_start_locked(sc);
2371 		mtx_unlock(&sc->tx_mtx);
2372 	}
2373 }
2374 
/*
 * XFP 10GbE compliance-byte values mapped to ifmedia types.  A
 * zero flag means FreeBSD has no matching media type.  The first
 * entry (0x7f) is matched by equality in mxge_media_probe(); the
 * rest are matched bit-by-bit.
 */
static struct mxge_media_type mxge_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{0,		(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
2387 
/*
 * Accumulate the given media type into the softc's media flags,
 * then register and select the combined media word.
 */
static void
mxge_set_media(mxge_softc_t *sc, int type)
{
	sc->media_flags |= type;
	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
	ifmedia_set(&sc->media, sc->media_flags);
}
2395 
2396 
2397 /*
2398  * Determine the media type for a NIC.  Some XFPs will identify
2399  * themselves only when their link is up, so this is initiated via a
2400  * link up interrupt.  However, this can potentially take up to
2401  * several milliseconds, so it is run via the watchdog routine, rather
2402  * than in the interrupt handler itself.   This need only be done
2403  * once, not each time the link is up.
2404  */
2405 static void
2406 mxge_media_probe(mxge_softc_t *sc)
2407 {
2408 	mxge_cmd_t cmd;
2409 	char *ptr;
2410 	int i, err, ms;
2411 
2412 	sc->need_media_probe = 0;
2413 
2414 	/* if we've already set a media type, we're done */
2415 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2416 		return;
2417 
2418 	/*
2419 	 * parse the product code to deterimine the interface type
2420 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2421 	 * after the 3rd dash in the driver's cached copy of the
2422 	 * EEPROM's product code string.
2423 	 */
2424 	ptr = sc->product_code_string;
2425 	if (ptr == NULL) {
2426 		device_printf(sc->dev, "Missing product code\n");
2427 	}
2428 
2429 	for (i = 0; i < 3; i++, ptr++) {
2430 		ptr = strchr(ptr, '-');
2431 		if (ptr == NULL) {
2432 			device_printf(sc->dev,
2433 				      "only %d dashes in PC?!?\n", i);
2434 			return;
2435 		}
2436 	}
2437 	if (*ptr == 'C') {
2438 		mxge_set_media(sc, IFM_10G_CX4);
2439 		return;
2440 	}
2441 	else if (*ptr == 'Q') {
2442 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2443 		/* FreeBSD has no media type for Quad ribbon fiber */
2444 		return;
2445 	}
2446 
2447 	if (*ptr != 'R') {
2448 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2449 		return;
2450 	}
2451 
2452 	/*
2453 	 * At this point we know the NIC has an XFP cage, so now we
2454 	 * try to determine what is in the cage by using the
2455 	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
2456 	 * register.  We read just one byte, which may take over
2457 	 * a millisecond
2458 	 */
2459 
2460 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2461 	cmd.data1 = MXGE_XFP_COMPLIANCE_BYTE; /* the byte we want */
2462 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_I2C_READ, &cmd);
2463 	if (err == MXGEFW_CMD_ERROR_XFP_FAILURE) {
2464 		device_printf(sc->dev, "failed to read XFP\n");
2465 	}
2466 	if (err == MXGEFW_CMD_ERROR_XFP_ABSENT) {
2467 		device_printf(sc->dev, "Type R with no XFP!?!?\n");
2468 	}
2469 	if (err != MXGEFW_CMD_OK) {
2470 		return;
2471 	}
2472 
2473 	/* now we wait for the data to be cached */
2474 	cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2475 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2476 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2477 		DELAY(1000);
2478 		cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2479 		err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2480 	}
2481 	if (err != MXGEFW_CMD_OK) {
2482 		device_printf(sc->dev, "failed to read XFP (%d, %dms)\n",
2483 			      err, ms);
2484 		return;
2485 	}
2486 
2487 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2488 		if (mxge_verbose)
2489 			device_printf(sc->dev, "XFP:%s\n",
2490 				      mxge_media_types[0].name);
2491 		mxge_set_media(sc, IFM_10G_CX4);
2492 		return;
2493 	}
2494 	for (i = 1;
2495 	     i < sizeof (mxge_media_types) / sizeof (mxge_media_types[0]);
2496 	     i++) {
2497 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2498 			if (mxge_verbose)
2499 				device_printf(sc->dev, "XFP:%s\n",
2500 					      mxge_media_types[i].name);
2501 
2502 			mxge_set_media(sc, mxge_media_types[i].flag);
2503 			return;
2504 		}
2505 	}
2506 	device_printf(sc->dev, "XFP media 0x%x unknown\n", cmd.data0);
2507 
2508 	return;
2509 }
2510 
/*
 * Interrupt handler.  Reaps tx completions and rx frames, then
 * processes any firmware-pushed statistics/link updates, and
 * finally writes the irq_claim registers to re-arm the NIC.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (!sc->msi_enabled) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		/* MSI: no deassert handshake; just consume the flag */
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* the firmware sets stats_updated when the stats block holds
	   fresh link-state or rdma-tag information */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* some XFPs identify themselves only at link-up;
			   ask the watchdog to re-probe the media */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			/* down_cnt is also watched by mxge_close() to
			   detect the "down" confirmation interrupt */
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2584 
static void
mxge_init(void *arg)
{
	/* intentionally empty if_init stub; the interface is brought
	   up via mxge_open() -- NOTE(review): presumably invoked from
	   the ioctl path, confirm against the caller */
}
2589 
2590 
2591 
2592 static void
2593 mxge_free_mbufs(mxge_softc_t *sc)
2594 {
2595 	int i;
2596 
2597 	for (i = 0; i <= sc->rx_big.mask; i++) {
2598 		if (sc->rx_big.info[i].m == NULL)
2599 			continue;
2600 		bus_dmamap_unload(sc->rx_big.dmat,
2601 				  sc->rx_big.info[i].map);
2602 		m_freem(sc->rx_big.info[i].m);
2603 		sc->rx_big.info[i].m = NULL;
2604 	}
2605 
2606 	for (i = 0; i <= sc->rx_small.mask; i++) {
2607 		if (sc->rx_small.info[i].m == NULL)
2608 			continue;
2609 		bus_dmamap_unload(sc->rx_small.dmat,
2610 				  sc->rx_small.info[i].map);
2611 		m_freem(sc->rx_small.info[i].m);
2612 		sc->rx_small.info[i].m = NULL;
2613 	}
2614 
2615 	for (i = 0; i <= sc->tx.mask; i++) {
2616 		sc->tx.info[i].flag = 0;
2617 		if (sc->tx.info[i].m == NULL)
2618 			continue;
2619 		bus_dmamap_unload(sc->tx.dmat,
2620 				  sc->tx.info[i].map);
2621 		m_freem(sc->tx.info[i].m);
2622 		sc->tx.info[i].m = NULL;
2623 	}
2624 }
2625 
2626 static void
2627 mxge_free_rings(mxge_softc_t *sc)
2628 {
2629 	int i;
2630 
2631 	if (sc->rx_done.entry != NULL)
2632 		mxge_dma_free(&sc->rx_done.dma);
2633 	sc->rx_done.entry = NULL;
2634 	if (sc->tx.req_bytes != NULL)
2635 		free(sc->tx.req_bytes, M_DEVBUF);
2636 	if (sc->tx.seg_list != NULL)
2637 		free(sc->tx.seg_list, M_DEVBUF);
2638 	if (sc->rx_small.shadow != NULL)
2639 		free(sc->rx_small.shadow, M_DEVBUF);
2640 	if (sc->rx_big.shadow != NULL)
2641 		free(sc->rx_big.shadow, M_DEVBUF);
2642 	if (sc->tx.info != NULL) {
2643 		if (sc->tx.dmat != NULL) {
2644 			for (i = 0; i <= sc->tx.mask; i++) {
2645 				bus_dmamap_destroy(sc->tx.dmat,
2646 						   sc->tx.info[i].map);
2647 			}
2648 			bus_dma_tag_destroy(sc->tx.dmat);
2649 		}
2650 		free(sc->tx.info, M_DEVBUF);
2651 	}
2652 	if (sc->rx_small.info != NULL) {
2653 		if (sc->rx_small.dmat != NULL) {
2654 			for (i = 0; i <= sc->rx_small.mask; i++) {
2655 				bus_dmamap_destroy(sc->rx_small.dmat,
2656 						   sc->rx_small.info[i].map);
2657 			}
2658 			bus_dmamap_destroy(sc->rx_small.dmat,
2659 					   sc->rx_small.extra_map);
2660 			bus_dma_tag_destroy(sc->rx_small.dmat);
2661 		}
2662 		free(sc->rx_small.info, M_DEVBUF);
2663 	}
2664 	if (sc->rx_big.info != NULL) {
2665 		if (sc->rx_big.dmat != NULL) {
2666 			for (i = 0; i <= sc->rx_big.mask; i++) {
2667 				bus_dmamap_destroy(sc->rx_big.dmat,
2668 						   sc->rx_big.info[i].map);
2669 			}
2670 			bus_dmamap_destroy(sc->rx_big.dmat,
2671 					   sc->rx_big.extra_map);
2672 			bus_dma_tag_destroy(sc->rx_big.dmat);
2673 		}
2674 		free(sc->rx_big.info, M_DEVBUF);
2675 	}
2676 }
2677 
/*
 * Allocate all host-side ring state: the interrupt (rx_done)
 * queue, the tx request staging block and busdma segment list,
 * the rx shadow rings, the per-slot info arrays, the busdma
 * tags, and one dmamap per ring slot (plus a spare "extra" map
 * for each rx ring).  On failure, everything allocated so far is
 * torn down via mxge_free_rings().  Returns 0 or an errno.
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes (in bytes) from the firmware */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	/* convert byte sizes to entry counts */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	sc->tx.mask = tx_ring_entries - 1;
	sc->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
	sc->rx_done.mask = (2 * rx_ring_entries) - 1;

	err = ENOMEM;

	/* allocate interrupt queues */
	bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
	if (err != 0)
		goto abort_with_nothing;
	sc->rx_done.entry = sc->rx_done.dma.addr;
	bzero(sc->rx_done.entry, bytes);

	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (sc->tx.max_desc + 4);
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_alloc;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * sc->tx.max_desc;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 sc->tx.max_desc - 2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
				 3,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	/* the spare map used while replacing a received buffer */
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2871 
2872 static void
2873 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
2874 {
2875 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
2876 
2877 	if (bufsize < MCLBYTES) {
2878 		/* easy, everything fits in a single buffer */
2879 		*big_buf_size = MCLBYTES;
2880 		*cl_size = MCLBYTES;
2881 		*nbufs = 1;
2882 		return;
2883 	}
2884 
2885 	if (bufsize < MJUMPAGESIZE) {
2886 		/* still easy, everything still fits in a single buffer */
2887 		*big_buf_size = MJUMPAGESIZE;
2888 		*cl_size = MJUMPAGESIZE;
2889 		*nbufs = 1;
2890 		return;
2891 	}
2892 	/* now we need to use virtually contiguous buffers */
2893 	*cl_size = MJUM9BYTES;
2894 	*big_buf_size = 4096;
2895 	*nbufs = mtu / 4096 + 1;
2896 	/* needs to be a power of two, so round up */
2897 	if (*nbufs == 3)
2898 		*nbufs = 4;
2899 }
2900 
/*
 * Bring the interface up: allocate the LRO entry pool, reset the
 * NIC, configure buffer geometry, fetch the lanai ring pointers,
 * stock both receive rings, program MTU/buffer sizes and the
 * stats DMA block, and finally start the firmware.  Returns 0 or
 * an errno; on failure after the rings were stocked, the mbufs
 * are released again.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err, big_bytes;
	bus_dmamap_t map;
	bus_addr_t bus;
	struct lro_entry *lro_entry;

	/* pre-allocate the pool of free LRO tracking entries */
	SLIST_INIT(&sc->lro_free);
	SLIST_INIT(&sc->lro_active);

	for (i = 0; i < sc->lro_cnt; i++) {
		lro_entry = (struct lro_entry *)
			malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (lro_entry == NULL) {
			/* scale lro_cnt back to what we actually got */
			sc->lro_cnt = i;
			break;
		}
		SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
	}

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	/* pick buffer sizes / buffers-per-frame based on the MTU */
	mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
			   &sc->rx_big.cl_size, &sc->rx_big.nbufs);

	cmd.data0 = sc->rx_big.nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && sc->rx_big.nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      sc->rx_big.nbufs);
		return EIO;
	}
	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	/* pre-poison the big shadow ring so unfilled slots are
	   marked invalid (all-ones address) */
	for (i = 0; i <= sc->rx_big.mask; i++) {
		sc->rx_big.shadow[i].addr_low = 0xffffffff;
		sc->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	/* one big mbuf covers nbufs ring slots, hence the stride */
	for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	if (err != 0) {
		/* fall back to the obsolete stats interface, which
		   DMAs only the send_done_count field */
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
3047 
3048 static int
3049 mxge_close(mxge_softc_t *sc)
3050 {
3051 	struct lro_entry *lro_entry;
3052 	mxge_cmd_t cmd;
3053 	int err, old_down_cnt;
3054 
3055 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3056 	old_down_cnt = sc->down_cnt;
3057 	mb();
3058 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3059 	if (err) {
3060 		device_printf(sc->dev, "Couldn't bring down link\n");
3061 	}
3062 	if (old_down_cnt == sc->down_cnt) {
3063 		/* wait for down irq */
3064 		DELAY(10 * sc->intr_coal_delay);
3065 	}
3066 	if (old_down_cnt == sc->down_cnt) {
3067 		device_printf(sc->dev, "never got down irq\n");
3068 	}
3069 
3070 	mxge_free_mbufs(sc);
3071 
3072 	while (!SLIST_EMPTY(&sc->lro_free)) {
3073 		lro_entry = SLIST_FIRST(&sc->lro_free);
3074 		SLIST_REMOVE_HEAD(&sc->lro_free, next);
3075 	}
3076 	return 0;
3077 }
3078 
/*
 * (Re)program the PCI configuration state the driver depends on:
 * cache the negotiated PCIe link width, raise the max read
 * request size to 4KB, and enable bus mastering and memory-space
 * access.  Also used to restore config space after an NIC reboot.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* cap + 0x12 is the PCIe Link Status register;
		   bits 9:4 hold the negotiated link width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* cap + 0x8 is Device Control; bits 14:12 encode the
		   max read request size, 5 -> 4096 bytes */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
3102 
/*
 * Fetch the NIC's reboot status word through the Myricom
 * vendor-specific PCI capability (usable even when the NIC's memory
 * BAR is not accessible after a firmware crash/reboot).
 * Returns the status register value, or (uint32_t)-1 if the
 * vendor-specific capability cannot be found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read (0xfffffff0 = reboot status) */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	/* data window at vs + 0x14 returns the selected register */
	return (pci_read_config(dev, vs + 0x14, 4));
}
3121 
/*
 * Attempt to recover a hung NIC detected by the transmit watchdog.
 * Diagnoses whether the NIC rebooted (config space wiped), vanished
 * from the bus, or simply stalled; if the interface was running,
 * restarts it with close/open.  If recovery fails (err != 0), the
 * watchdog callout is stopped to avoid spamming the console.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	int err;
	uint32_t reboot;
	uint16_t cmd;

	/* assume the worst until close/open succeeds */
	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			/* reads still float — NIC is gone from the bus */
			device_printf(sc->dev, "NIC disappeared!\n");
			goto abort;
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */

		/* XXXX waiting for pci_cfg_restore() to be exported */
		goto abort; /* just abort for now */

		/*
		 * NOTE: the call below is intentionally unreachable
		 * until pci_cfg_restore() becomes available (see the
		 * XXXX above); it documents the intended recovery.
		 */
		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);
	} else {
		/* config space intact: firmware/ring stall, dump state */
		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
			      sc->tx.req, sc->tx.done);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      sc->tx.pkt_done,
			      be32toh(sc->fw_stats->send_done_count));
	}

	/* restart the interface if it was up; err reflects the reopen */
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}

abort:
	/*
	 * stop the watchdog if the nic is dead, to avoid spamming the
	 * console
	 */
	if (err != 0) {
		callout_stop(&sc->co_hdl);
	}
}
3189 
3190 static void
3191 mxge_watchdog(mxge_softc_t *sc)
3192 {
3193 	mxge_tx_buf_t *tx = &sc->tx;
3194 	uint32_t rx_pause = be32toh(sc->fw_stats->dropped_pause);
3195 
3196 	/* see if we have outstanding transmits, which
3197 	   have been pending for more than mxge_ticks */
3198 	if (tx->req != tx->done &&
3199 	    tx->watchdog_req != tx->watchdog_done &&
3200 	    tx->done == tx->watchdog_done) {
3201 		/* check for pause blocking before resetting */
3202 		if (tx->watchdog_rx_pause == rx_pause)
3203 			mxge_watchdog_reset(sc);
3204 		else
3205 			device_printf(sc->dev, "Flow control blocking "
3206 				      "xmits, check link partner\n");
3207 	}
3208 
3209 	tx->watchdog_req = tx->req;
3210 	tx->watchdog_done = tx->done;
3211 	tx->watchdog_rx_pause = rx_pause;
3212 
3213 	if (sc->need_media_probe)
3214 		mxge_media_probe(sc);
3215 }
3216 
/*
 * Periodic callout handler: reschedules itself every mxge_ticks and
 * runs the transmit watchdog.  The callout was initialized with
 * callout_init_mtx() on driver_mtx, so this runs with that mutex held.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;


	/* Synchronize with possible callout reset/stop. */
	if (callout_pending(&sc->co_hdl) ||
	    !callout_active(&sc->co_hdl)) {
		/*
		 * NOTE(review): co_hdl was callout_init_mtx()'d with
		 * flags 0, under which the callout code normally drops
		 * driver_mtx after the handler returns — confirm this
		 * explicit unlock cannot double-unlock on this branch.
		 */
		mtx_unlock(&sc->driver_mtx);
		return;
	}

	/* re-arm first so a reset inside the watchdog wins the race */
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	mxge_watchdog(sc);
}
3233 
3234 static int
3235 mxge_media_change(struct ifnet *ifp)
3236 {
3237 	return EINVAL;
3238 }
3239 
3240 static int
3241 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3242 {
3243 	struct ifnet *ifp = sc->ifp;
3244 	int real_mtu, old_mtu;
3245 	int err = 0;
3246 
3247 
3248 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3249 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3250 		return EINVAL;
3251 	mtx_lock(&sc->driver_mtx);
3252 	old_mtu = ifp->if_mtu;
3253 	ifp->if_mtu = mtu;
3254 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3255 		callout_stop(&sc->co_hdl);
3256 		mxge_close(sc);
3257 		err = mxge_open(sc);
3258 		if (err != 0) {
3259 			ifp->if_mtu = old_mtu;
3260 			mxge_close(sc);
3261 			(void) mxge_open(sc);
3262 		}
3263 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3264 	}
3265 	mtx_unlock(&sc->driver_mtx);
3266 	return err;
3267 }
3268 
3269 static void
3270 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3271 {
3272 	mxge_softc_t *sc = ifp->if_softc;
3273 
3274 
3275 	if (sc == NULL)
3276 		return;
3277 	ifmr->ifm_status = IFM_AVALID;
3278 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3279 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3280 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3281 }
3282 
/*
 * Interface ioctl handler.  Dispatches address/MTU/flags/capability
 * requests, taking driver_mtx around anything that touches driver
 * state.  Returns 0 or an errno (ENOTTY for unknown commands).
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		/* address handling is entirely generic */
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				/* bring the interface up and start the tick */
				err = mxge_open(sc);
				callout_reset(&sc->co_hdl, mxge_ticks,
					      mxge_tick, sc);
			} else {
				/* take care of promisc and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			/* interface going down: stop hardware and tick */
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc);
				callout_stop(&sc->co_hdl);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		/* mask holds only the capability bits being toggled */
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				/* TSO requires tx csum, so disable both */
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				/* TSO without tx checksum is unsupported */
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		if (mask & IFCAP_LRO) {
			if (IFCAP_LRO & ifp->if_capenable)
				err = mxge_change_lro_locked(sc, 0);
			else
				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
		}
		if (mask & IFCAP_VLAN_HWTAGGING)
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
		mtx_unlock(&sc->driver_mtx);
		/* propagate capability changes to vlan interfaces */
		VLAN_CAPABILITIES(ifp);

		break;

	case SIOCGIFMEDIA:
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
                break;

	default:
		err = ENOTTY;
        }
	return err;
}
3388 
3389 static void
3390 mxge_fetch_tunables(mxge_softc_t *sc)
3391 {
3392 
3393 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3394 			  &mxge_flow_control);
3395 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3396 			  &mxge_intr_coal_delay);
3397 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3398 			  &mxge_nvidia_ecrc_enable);
3399 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3400 			  &mxge_force_firmware);
3401 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3402 			  &mxge_deassert_wait);
3403 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3404 			  &mxge_verbose);
3405 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3406 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3407 	if (sc->lro_cnt != 0)
3408 		mxge_lro_cnt = sc->lro_cnt;
3409 
3410 	if (bootverbose)
3411 		mxge_verbose = 1;
3412 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3413 		mxge_intr_coal_delay = 30;
3414 	if (mxge_ticks == 0)
3415 		mxge_ticks = hz;
3416 	sc->pause = mxge_flow_control;
3417 
3418 }
3419 
/*
 * Device attach: allocate DMA resources, map the NIC's SRAM, load and
 * start the firmware, set up the interrupt, and register the network
 * interface.  Uses a goto-based cleanup chain that unwinds resources
 * in reverse order of acquisition on any failure.
 * Returns 0 on success or an errno.
 */
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	int count, rid, err;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* parent DMA tag; all other tags and maps derive from it */
	err = bus_dma_tag_create(NULL,			/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 4096,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_nothing;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	/* per-device mutex names so witness/lock output is unambiguous */
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
		 device_get_nameunit(dev));
	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	/* watchdog tick runs under driver_mtx */
	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	/* usable SRAM = 2MB minus firmware-reserved regions and pad */
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
			     sizeof (*sc->fw_stats), 64);
	if (err != 0)
		goto abort_with_zeropad_dma;
	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;

	/* scratch page used for DMA benchmarking */
	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_fw_stats;

	/* Add our ithread; prefer MSI, fall back to INTx (rid 0) */
	count = pci_msi_count(dev);
	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
		rid = 1;
		sc->msi_enabled = 1;
	} else {
		rid = 0;
	}
	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
					 1, RF_SHAREABLE | RF_ACTIVE);
	if (sc->irq_res == NULL) {
		device_printf(dev, "could not alloc interrupt\n");
		goto abort_with_dmabench;
	}
	if (mxge_verbose)
		device_printf(dev, "using %s irq %ld\n",
			      sc->msi_enabled ? "MSI" : "INTx",
			      rman_get_start(sc->irq_res));
	/* select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0)
		goto abort_with_irq_res;
	sc->intr_coal_delay = mxge_intr_coal_delay;
	err = mxge_reset(sc, 0);
	if (err != 0)
		goto abort_with_irq_res;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_irq_res;
	}

	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
			     NULL, mxge_intr, sc, &sc->ih);
	if (err != 0) {
		goto abort_with_rings;
	}
	/* hook into the network stack */
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_baudrate = 100000000;
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING |
		IFCAP_VLAN_HWCSUM | IFCAP_LRO;

	/* jumbo frames only when the firmware supports a 9000-byte MTU */
	sc->max_mtu = mxge_max_mtu(sc);
	if (sc->max_mtu >= 9000)
		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
	else
		device_printf(dev, "MTU limited to %d.  Install "
			      "latest firmware for 9000 byte jumbo support\n",
			      sc->max_mtu - ETHER_HDR_LEN);
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	if (sc->lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	sc->csum_flag = 1;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
	mxge_media_probe(sc);
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to 1500 */
	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
		ifp->if_mtu = 9000;

	mxge_add_sysctls(sc);
	return 0;

/* error unwinding: release in reverse order of acquisition */
abort_with_rings:
	mxge_free_rings(sc);
abort_with_irq_res:
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_fw_stats:
	mxge_dma_free(&sc->fw_stats_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);

abort_with_nothing:
	return err;
}
3628 
/*
 * Device detach: refuse while vlans are still attached, stop the
 * interface and watchdog, unhook from the network stack, and release
 * every resource acquired in mxge_attach (in reverse order).
 * Returns 0 on success or EBUSY if vlans remain.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (sc->ifp->if_vlantrunk != NULL) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	mtx_lock(&sc->driver_mtx);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	callout_stop(&sc->co_hdl);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	ifmedia_removeall(&sc->media);
	/* tell the firmware to stop its background DMA */
	mxge_dummy_rdma(sc, 0);
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_rings(sc);
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);

	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
3668 
3669 static int
3670 mxge_shutdown(device_t dev)
3671 {
3672 	return 0;
3673 }
3674 
3675 /*
3676   This file uses Myri10GE driver indentation.
3677 
3678   Local Variables:
3679   c-file-style:"linux"
3680   tab-width:8
3681   End:
3682 */
3683