/******************************************************************************

Copyright (c) 2006-2007, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		sc->wc = 0;
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
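/*
 * For example (illustrative values, not from real hardware), the
 * strings region might hold:
 *   "SN=12345\0MAC=00:60:dd:47:ab:cd\0PC=M3F-XXX\0\0"
 * mxge_parse_strings() below walks these NUL-separated strings.
 */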

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
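	/* NB: the macro advances the enclosing function's "ptr",
	   regardless of the argument it is passed */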

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
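			/* advance 1 byte here and 3 more at the top of
			   each loop pass: 1 + 3 skips "MAC=" and lands
			   on the first hex pair, then +3 per octet */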
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
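			/* config word 0x90 apparently holds the
			   extended-config window base: bits 1..14
			   become address bits 26..39 (shift by 25) */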
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);
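	/* (standard PCIe extended-config layout: 1MB per bus and 4KB
	   per function, 8 functions per slot) */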

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
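	/* Worked example (illustrative numbers): if cmd.data0 came
	 * back as (100 << 16) | 200, that is 100 transfers of len
	 * bytes in 200 * 0.5us = 100us; with len = 4096 the read
	 * bandwidth below works out to (100 * 4096 * 2) / 200 =
	 * 4096 MB/s.  The factor of 2 converts 0.5us ticks to us.
	 */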

	len = sc->tx.boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx.boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx.boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
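		/* bits 14:12 of the PCIe device control register
		   encode the max read request size; 5 means 4KB */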
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx.boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
	return (mxge_load_firmware(sc));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

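	/* round buf_bytes up to the next 8-byte boundary; the command
	   block handed to the NIC must be 8-byte aligned */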
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

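	/* hand the NIC a full 64-byte submit block; only buf[0..5] are
	   meaningful, and the 72-byte stack buffer guarantees the
	   aligned 64-byte copy stays in bounds */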
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	mb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		mb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

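	/* pack the 6-byte MAC big-endian style: bytes 0-3 into data0,
	   bytes 4-5 into data1 */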
	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       " %d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{

	mxge_cmd_t cmd;
	size_t bytes;
	int status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
		memset(sc->rx_done.entry, 0, bytes);
		cmd.data0 = (uint32_t)bytes;
		status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
		cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
		cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
		status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	/* reset mcp/driver shared state back to 0 */
	sc->rx_done.idx = 0;
	sc->rx_done.cnt = 0;
	sc->tx.req = 0;
	sc->tx.done = 0;
	sc->tx.pkt_done = 0;
	sc->tx.wake = 0;
	sc->tx_defrag = 0;
	sc->tx.stall = 0;
	sc->rx_big.cnt = 0;
	sc->rx_small.cnt = 0;
	sc->rdma_tags_available = 15;
	sc->fw_stats->valid = 0;
	sc->fw_stats->send_done_count = 0;
	sc->lro_bad_csum = 0;
	sc->lro_queued = 0;
	sc->lro_flushed = 0;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		callout_stop(&sc->co_hdl);
		mxge_close(sc);
		err = mxge_open(sc);
		if (err == 0)
			callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx.boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable/disable flow control");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* host counters exported for debugging */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_small_cnt",
		       CTLFLAG_RD, &sc->rx_small.cnt,
		       0, "rx_small_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_big_cnt",
		       CTLFLAG_RD, &sc->rx_big.cnt,
		       0, "rx_big_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_req",
		       CTLFLAG_RD, &sc->tx.req,
		       0, "tx_req");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_done",
		       CTLFLAG_RD, &sc->tx.done,
		       0, "tx_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_pkt_done",
		       CTLFLAG_RD, &sc->tx.pkt_done,
		       0, "tx_pkt_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_stall",
		       CTLFLAG_RD, &sc->tx.stall,
		       0, "tx_stall");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_wake",
		       CTLFLAG_RD, &sc->tx.wake,
		       0, "tx_wake");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_defrag",
		       CTLFLAG_RD, &sc->tx_defrag,
		       0, "tx_defrag");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
		       0, "number of lro merge queues flushed");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_queued", CTLFLAG_RD, &sc->lro_queued,
		       0, "number of frames appended to lro merge queues");

}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */
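/* (going backwards leaves the lowest-indexed slot for last, so the
   descriptor whose valid flags release the chain is written only
   after everything behind it is in place) */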

static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
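/* (each mcp_kreq_ether_send_t is a 16-byte block, so two descriptors
   fit in one 32-byte PIO write) */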

static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}

static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt,
	       int ip_off)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   sc->scratch);
		ip = (struct ip *)(sc->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
1682 	 * in the middle, this is the number of RDMAs starting
1683 	 * after the last cut in the request. All previous
1684 	 * segments before the last cut implicitly have 1 RDMA.
1685 	 *
1686 	 * Since the number of RDMAs is not known beforehand,
1687 	 * it must be filled-in retroactively - after each
1688 	 * segmentation cut or at the end of the entire packet.
1689 	 */
1690 
1691 	while (busdma_seg_cnt) {
1692 		/* Break the busdma segment up into pieces*/
1693 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1694 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1695 		len = seg->ds_len;
1696 
1697 		while (len) {
1698 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1699 			seglen = len;
1700 			cum_len_next = cum_len + seglen;
1701 			(req-rdma_count)->rdma_count = rdma_count + 1;
1702 			if (__predict_true(cum_len >= 0)) {
1703 				/* payload */
1704 				chop = (cum_len_next > mss);
1705 				cum_len_next = cum_len_next % mss;
1706 				next_is_first = (cum_len_next == 0);
1707 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1708 				flags_next |= next_is_first *
1709 					MXGEFW_FLAGS_FIRST;
1710 				rdma_count |= -(chop | next_is_first);
1711 				rdma_count += chop & !next_is_first;
1712 			} else if (cum_len_next >= 0) {
1713 				/* header ends */
1714 				rdma_count = -1;
1715 				cum_len_next = 0;
1716 				seglen = -cum_len;
1717 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1718 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1719 					MXGEFW_FLAGS_FIRST |
1720 					(small * MXGEFW_FLAGS_SMALL);
1721 			    }
1722 
1723 			req->addr_high = high_swapped;
1724 			req->addr_low = htobe32(low);
1725 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1726 			req->pad = 0;
1727 			req->rdma_count = 1;
1728 			req->length = htobe16(seglen);
1729 			req->cksum_offset = cksum_offset;
1730 			req->flags = flags | ((cum_len & 1) *
1731 					      MXGEFW_FLAGS_ALIGN_ODD);
1732 			low += seglen;
1733 			len -= seglen;
1734 			cum_len = cum_len_next;
1735 			flags = flags_next;
1736 			req++;
1737 			cnt++;
1738 			rdma_count++;
1739 			if (__predict_false(cksum_offset > seglen))
1740 				cksum_offset -= seglen;
1741 			else
1742 				cksum_offset = 0;
1743 			if (__predict_false(cnt > tx->max_desc))
1744 				goto drop;
1745 		}
1746 		busdma_seg_cnt--;
1747 		seg++;
1748 	}
1749 	(req-rdma_count)->rdma_count = rdma_count;
1750 
1751 	do {
1752 		req--;
1753 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1754 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1755 
1756 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1757 	mxge_submit_req(tx, tx->req_list, cnt);
1758 	return;
1759 
1760 drop:
1761 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1762 	m_freem(m);
1763 	sc->ifp->if_oerrors++;
1764 	if (!once) {
1765 		printf("tx->max_desc exceeded via TSO!\n");
1766 		printf("mss = %d, %ld, %d!\n", mss,
1767 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1768 		once = 1;
1769 	}
1770 	return;
1771 
1772 }
1773 
1774 /*
1775  * We reproduce the software vlan tag insertion from
1776  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1777  * vlan tag insertion. We need to advertise this in order to have the
1778  * vlan interface respect our csum offload flags.
1779  */
1780 static struct mbuf *
1781 mxge_vlan_tag_insert(struct mbuf *m)
1782 {
1783 	struct ether_vlan_header *evl;
1784 
1785 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1786 	if (__predict_false(m == NULL))
1787 		return NULL;
1788 	if (m->m_len < sizeof(*evl)) {
1789 		m = m_pullup(m, sizeof(*evl));
1790 		if (__predict_false(m == NULL))
1791 			return NULL;
1792 	}
1793 	/*
1794 	 * Transform the Ethernet header into an Ethernet header
1795 	 * with 802.1Q encapsulation.
1796 	 */
1797 	evl = mtod(m, struct ether_vlan_header *);
1798 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1799 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1800 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1801 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1802 	m->m_flags &= ~M_VLANTAG;
1803 	return m;
1804 }
1805 
1806 static void
1807 mxge_encap(mxge_softc_t *sc, struct mbuf *m)
1808 {
1809 	mcp_kreq_ether_send_t *req;
1810 	bus_dma_segment_t *seg;
1811 	struct mbuf *m_tmp;
1812 	struct ifnet *ifp;
1813 	mxge_tx_buf_t *tx;
1814 	struct ip *ip;
1815 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1816 	uint16_t pseudo_hdr_offset;
1817         uint8_t flags, cksum_offset;
1818 
1819 
1820 
1821 	ifp = sc->ifp;
1822 	tx = &sc->tx;
1823 
1824 	ip_off = sizeof (struct ether_header);
1825 	if (m->m_flags & M_VLANTAG) {
1826 		m = mxge_vlan_tag_insert(m);
1827 		if (__predict_false(m == NULL))
1828 			goto drop;
1829 		ip_off += ETHER_VLAN_ENCAP_LEN;
1830 	}
1831 
1832 	/* (try to) map the frame for DMA */
1833 	idx = tx->req & tx->mask;
1834 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1835 				      m, tx->seg_list, &cnt,
1836 				      BUS_DMA_NOWAIT);
1837 	if (__predict_false(err == EFBIG)) {
1838 		/* Too many segments in the chain.  Try
1839 		   to defrag */
1840 		m_tmp = m_defrag(m, M_NOWAIT);
1841 		if (m_tmp == NULL) {
1842 			goto drop;
1843 		}
1844 		sc->tx_defrag++;
1845 		m = m_tmp;
1846 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1847 					      tx->info[idx].map,
1848 					      m, tx->seg_list, &cnt,
1849 					      BUS_DMA_NOWAIT);
1850 	}
1851 	if (__predict_false(err != 0)) {
1852 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d,"
1853 			      " packet len = %d\n", err, m->m_pkthdr.len);
1854 		goto drop;
1855 	}
1856 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1857 			BUS_DMASYNC_PREWRITE);
1858 	tx->info[idx].m = m;
1859 
1860 
1861 	/* TSO is different enough that we handle it in another routine */
1862 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1863 		mxge_encap_tso(sc, m, cnt, ip_off);
1864 		return;
1865 	}
1866 
1867 	req = tx->req_list;
1868 	cksum_offset = 0;
1869 	pseudo_hdr_offset = 0;
1870 	flags = MXGEFW_FLAGS_NO_TSO;
1871 
1872 	/* checksum offloading? */
1873 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1874 		/* ensure ip header is in first mbuf, copy
1875 		   it to a scratch buffer if not */
1876 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1877 			m_copydata(m, 0, ip_off + sizeof (*ip),
1878 				   sc->scratch);
1879 			ip = (struct ip *)(sc->scratch + ip_off);
1880 		} else {
1881 			ip = (struct ip *)(mtod(m, char *) + ip_off);
1882 		}
1883 		cksum_offset = ip_off + (ip->ip_hl << 2);
1884 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1885 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1886 		req->cksum_offset = cksum_offset;
1887 		flags |= MXGEFW_FLAGS_CKSUM;
1888 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1889 	} else {
1890 		odd_flag = 0;
1891 	}
1892 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1893 		flags |= MXGEFW_FLAGS_SMALL;
1894 
1895 	/* convert segments into a request list */
1896 	cum_len = 0;
1897 	seg = tx->seg_list;
1898 	req->flags = MXGEFW_FLAGS_FIRST;
1899 	for (i = 0; i < cnt; i++) {
1900 		req->addr_low =
1901 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1902 		req->addr_high =
1903 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1904 		req->length = htobe16(seg->ds_len);
1905 		req->cksum_offset = cksum_offset;
1906 		if (cksum_offset > seg->ds_len)
1907 			cksum_offset -= seg->ds_len;
1908 		else
1909 			cksum_offset = 0;
1910 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1911 		req->pad = 0; /* complete solid 16-byte block */
1912 		req->rdma_count = 1;
1913 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1914 		cum_len += seg->ds_len;
1915 		seg++;
1916 		req++;
1917 		req->flags = 0;
1918 	}
1919 	req--;
1920 	/* pad runts to 60 bytes */
1921 	if (cum_len < 60) {
1922 		req++;
1923 		req->addr_low =
1924 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
1925 		req->addr_high =
1926 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
1927 		req->length = htobe16(60 - cum_len);
1928 		req->cksum_offset = 0;
1929 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1930 		req->pad = 0; /* complete solid 16-byte block */
1931 		req->rdma_count = 1;
1932 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1933 		cnt++;
1934 	}
1935 
1936 	tx->req_list[0].rdma_count = cnt;
1937 #if 0
1938 	/* print what the firmware will see */
1939 	for (i = 0; i < cnt; i++) {
1940 		printf("%d: addr: 0x%x 0x%x len:%d pso:%d,"
1941 		    "cso:%d, flags:0x%x, rdma:%d\n",
1942 		    i, (int)ntohl(tx->req_list[i].addr_high),
1943 		    (int)ntohl(tx->req_list[i].addr_low),
1944 		    (int)ntohs(tx->req_list[i].length),
1945 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1946 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1947 		    tx->req_list[i].rdma_count);
1948 	}
1949 	printf("--------------\n");
1950 #endif
1951 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1952 	mxge_submit_req(tx, tx->req_list, cnt);
1953 	return;
1954 
1955 drop:
1956 	m_freem(m);
1957 	ifp->if_oerrors++;
1958 	return;
1959 }
1960 
1961 
1962 
1963 
1964 static inline void
1965 mxge_start_locked(mxge_softc_t *sc)
1966 {
1967 	struct mbuf *m;
1968 	struct ifnet *ifp;
1969 	mxge_tx_buf_t *tx;
1970 
1971 	ifp = sc->ifp;
1972 	tx = &sc->tx;
1973 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1974 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1975 		if (m == NULL) {
1976 			return;
1977 		}
1978 		/* let BPF see it */
1979 		BPF_MTAP(ifp, m);
1980 
1981 		/* give it to the nic */
1982 		mxge_encap(sc, m);
1983 	}
1984 	/* ran out of transmit slots */
1985 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1986 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1987 		tx->stall++;
1988 	}
1989 }
1990 
1991 static void
1992 mxge_start(struct ifnet *ifp)
1993 {
1994 	mxge_softc_t *sc = ifp->if_softc;
1995 
1996 
1997 	mtx_lock(&sc->tx_mtx);
1998 	mxge_start_locked(sc);
1999 	mtx_unlock(&sc->tx_mtx);
2000 }
2001 
2002 /*
2003  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2004  * at most 32 bytes at a time, so as to avoid involving the software
2005  * pio handler in the nic.  We re-write the first segment's low
2006  * DMA address to mark it valid only after we write the entire chunk
2007  * in a burst.
2008  */
2009 static inline void
2010 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2011 		mcp_kreq_ether_recv_t *src)
2012 {
2013 	uint32_t low;
2014 
2015 	low = src->addr_low;
2016 	src->addr_low = 0xffffffff;
2017 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2018 	mb();
2019 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2020 	mb();
2021 	src->addr_low = low;
2022 	dst->addr_low = low;
2023 	mb();
2024 }
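
#if 0
/*
 * Illustrative sketch, not driver code: the burst copy above relies on
 * a word-at-a-time PIO copy helper (mxge_pio_copy(), defined in
 * if_mxge_var.h).  A minimal version might look like the following;
 * the point is that the NIC window is written only in pointer-sized
 * stores, which the chipset can coalesce into a single burst.
 */
static inline void
example_pio_copy(volatile void *to_v, void *from_v, size_t size)
{
	volatile uintptr_t *to = (volatile uintptr_t *)to_v;
	uintptr_t *from = (uintptr_t *)from_v;
	size_t i;

	/* copy one pointer-sized word per store */
	for (i = size / sizeof (uintptr_t); i != 0; i--)
		*to++ = *from++;
}
#endif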
2025 
2026 static int
2027 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
2028 {
2029 	bus_dma_segment_t seg;
2030 	struct mbuf *m;
2031 	mxge_rx_buf_t *rx = &sc->rx_small;
2032 	int cnt, err;
2033 
2034 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2035 	if (m == NULL) {
2036 		rx->alloc_fail++;
2037 		err = ENOBUFS;
2038 		goto done;
2039 	}
2040 	m->m_len = MHLEN;
2041 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2042 				      &seg, &cnt, BUS_DMA_NOWAIT);
2043 	if (err != 0) {
2044 		m_free(m);
2045 		goto done;
2046 	}
2047 	rx->info[idx].m = m;
2048 	rx->shadow[idx].addr_low =
2049 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2050 	rx->shadow[idx].addr_high =
2051 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2052 
2053 done:
2054 	if ((idx & 7) == 7)
2055 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2056 	return err;
2057 }
2058 
2059 static int
2060 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
2061 {
2062 	bus_dma_segment_t seg[3];
2063 	struct mbuf *m;
2064 	mxge_rx_buf_t *rx = &sc->rx_big;
2065 	int cnt, err, i;
2066 
2067 	if (rx->cl_size == MCLBYTES)
2068 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2069 	else
2070 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2071 	if (m == NULL) {
2072 		rx->alloc_fail++;
2073 		err = ENOBUFS;
2074 		goto done;
2075 	}
2076 	m->m_len = rx->cl_size;
2077 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2078 				      seg, &cnt, BUS_DMA_NOWAIT);
2079 	if (err != 0) {
2080 		m_free(m);
2081 		goto done;
2082 	}
2083 	rx->info[idx].m = m;
2084 
2085 	for (i = 0; i < cnt; i++) {
2086 		rx->shadow[idx + i].addr_low =
2087 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2088 		rx->shadow[idx + i].addr_high =
2089 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2090 	}
2091 
2092 
2093 done:
2094 	for (i = 0; i < rx->nbufs; i++) {
2095 		if ((idx & 7) == 7) {
2096 			mxge_submit_8rx(&rx->lanai[idx - 7],
2097 					&rx->shadow[idx - 7]);
2098 		}
2099 		idx++;
2100 	}
2101 	return err;
2102 }
2103 
2104 /*
2105  *  Myri10GE hardware checksums are not valid if the sender
2106  *  padded the frame with non-zero padding.  This is because
2107  *  the firmware just does a simple 16-bit 1s complement
2108  *  checksum across the entire frame, excluding the first 14
2109  *  bytes.  It is best to simply check the checksum and
2110  *  tell the stack about it only if the checksum is good.
2111  */
2112 
2113 static inline uint16_t
2114 mxge_rx_csum(struct mbuf *m, int csum)
2115 {
2116 	struct ether_header *eh;
2117 	struct ip *ip;
2118 	uint16_t c;
2119 
2120 	eh = mtod(m, struct ether_header *);
2121 
2122 	/* only deal with IPv4 TCP & UDP for now */
2123 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2124 		return 1;
2125 	ip = (struct ip *)(eh + 1);
2126 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2127 			    ip->ip_p != IPPROTO_UDP))
2128 		return 1;
2129 
2130 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2131 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2132 			    (ip->ip_hl << 2) + ip->ip_p));
2133 	c ^= 0xffff;
2134 	return (c);
2135 }
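
#if 0
/*
 * Illustrative sketch, not driver code: the comparison above works
 * because the 16-bit 1s complement sum over a valid TCP/UDP segment,
 * including its pseudo-header, is 0xffff.  in_pseudo() adds the
 * pseudo-header to the firmware's partial sum; folding the 32-bit
 * result down to 16 bits (RFC 1071) and inverting it yields 0 exactly
 * when the checksum is good:
 */
static uint16_t
example_csum_fold(uint32_t sum)
{
	/* add the carry bits back in until the sum fits in 16 bits */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);		/* 0xffff for a valid segment */
}
#endif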
2136 
2137 static void
2138 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2139 {
2140 	struct ether_vlan_header *evl;
2141 	struct ether_header *eh;
2142 	uint32_t partial;
2143 
2144 	evl = mtod(m, struct ether_vlan_header *);
2145 	eh = mtod(m, struct ether_header *);
2146 
2147 	/*
2148 	 * fix the checksum by subtracting the ETHER_VLAN_ENCAP_LEN
2149 	 * bytes that sit just after what the firmware thought was the
2150 	 * end of the ethernet header.
2151 	 */
2152 
2153 	/* put checksum into host byte order */
2154 	*csum = ntohs(*csum);
2155 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2156 	(*csum) += ~partial;
2157 	(*csum) +=  ((*csum) < ~partial);
2158 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2159 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2160 
2161 	/* restore checksum to network byte order;
2162 	   later consumers expect this */
2163 	*csum = htons(*csum);
2164 
2165 	/* save the tag */
2166 	m->m_flags |= M_VLANTAG;
2167 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2168 
2169 	/*
2170 	 * Remove the 802.1q header by copying the Ethernet
2171 	 * addresses over it and adjusting the beginning of
2172 	 * the data in the mbuf.  The encapsulated Ethernet
2173 	 * type field is already in place.
2174 	 */
2175 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2176 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2177 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2178 }
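
#if 0
/*
 * Illustrative sketch, not driver code: the arithmetic above is the
 * standard incremental checksum update (RFC 1624).  Removing a 32-bit
 * word from a 1s complement sum means adding its complement with an
 * end-around carry, then folding the result back to 16 bits:
 */
static uint16_t
example_csum_remove32(uint16_t csum16, uint32_t removed)
{
	uint32_t csum = csum16;

	csum += ~removed;			/* 1s complement subtract */
	csum += (csum < ~removed);		/* end-around carry */
	csum = (csum >> 16) + (csum & 0xffff);	/* fold to at most 17 bits */
	csum = (csum >> 16) + (csum & 0xffff);	/* fold to 16 bits */
	return ((uint16_t)csum);
}
#endif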
2179 
2180 
2181 static inline void
2182 mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2183 {
2184 	struct ifnet *ifp;
2185 	struct mbuf *m;
2186 	struct ether_header *eh;
2187 	mxge_rx_buf_t *rx;
2188 	bus_dmamap_t old_map;
2189 	int idx;
2190 	uint16_t tcpudp_csum;
2191 
2192 	ifp = sc->ifp;
2193 	rx = &sc->rx_big;
2194 	idx = rx->cnt & rx->mask;
2195 	rx->cnt += rx->nbufs;
2196 	/* save a pointer to the received mbuf */
2197 	m = rx->info[idx].m;
2198 	/* try to replace the received mbuf */
2199 	if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
2200 		/* drop the frame -- the old mbuf is re-cycled */
2201 		ifp->if_ierrors++;
2202 		return;
2203 	}
2204 
2205 	/* unmap the received buffer */
2206 	old_map = rx->info[idx].map;
2207 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2208 	bus_dmamap_unload(rx->dmat, old_map);
2209 
2210 	/* swap the bus_dmamap_t's */
2211 	rx->info[idx].map = rx->extra_map;
2212 	rx->extra_map = old_map;
2213 
2214 	/* mcp implicitly skips the first 2 bytes so that the packet
2215 	 * is properly aligned */
2216 	m->m_data += MXGEFW_PAD;
2217 
2218 	m->m_pkthdr.rcvif = ifp;
2219 	m->m_len = m->m_pkthdr.len = len;
2220 	ifp->if_ipackets++;
2221 	eh = mtod(m, struct ether_header *);
2222 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2223 		mxge_vlan_tag_remove(m, &csum);
2224 	}
2225 	/* if the checksum is valid, mark it in the mbuf header */
2226 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2227 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2228 			return;
2229 		/* otherwise, it was a UDP frame, or a TCP frame which
2230 		   we could not do LRO on.  Tell the stack that the
2231 		   checksum is good */
2232 		m->m_pkthdr.csum_data = 0xffff;
2233 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2234 	}
2235 	/* pass the frame up the stack */
2236 	(*ifp->if_input)(ifp, m);
2237 }
2238 
2239 static inline void
2240 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2241 {
2242 	struct ifnet *ifp;
2243 	struct ether_header *eh;
2244 	struct mbuf *m;
2245 	mxge_rx_buf_t *rx;
2246 	bus_dmamap_t old_map;
2247 	int idx;
2248 	uint16_t tcpudp_csum;
2249 
2250 	ifp = sc->ifp;
2251 	rx = &sc->rx_small;
2252 	idx = rx->cnt & rx->mask;
2253 	rx->cnt++;
2254 	/* save a pointer to the received mbuf */
2255 	m = rx->info[idx].m;
2256 	/* try to replace the received mbuf */
2257 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2258 		/* drop the frame -- the old mbuf is re-cycled */
2259 		ifp->if_ierrors++;
2260 		return;
2261 	}
2262 
2263 	/* unmap the received buffer */
2264 	old_map = rx->info[idx].map;
2265 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2266 	bus_dmamap_unload(rx->dmat, old_map);
2267 
2268 	/* swap the bus_dmamap_t's */
2269 	rx->info[idx].map = rx->extra_map;
2270 	rx->extra_map = old_map;
2271 
2272 	/* mcp implicitly skips the first 2 bytes so that the packet
2273 	 * is properly aligned */
2274 	m->m_data += MXGEFW_PAD;
2275 
2276 	m->m_pkthdr.rcvif = ifp;
2277 	m->m_len = m->m_pkthdr.len = len;
2278 	ifp->if_ipackets++;
2279 	eh = mtod(m, struct ether_header *);
2280 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2281 		mxge_vlan_tag_remove(m, &csum);
2282 	}
2283 	/* if the checksum is valid, mark it in the mbuf header */
2284 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2285 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2286 			return;
2287 		/* otherwise, it was a UDP frame, or a TCP frame which
2288 		   we could not do LRO on.  Tell the stack that the
2289 		   checksum is good */
2290 		m->m_pkthdr.csum_data = 0xffff;
2291 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2292 	}
2293 
2294 	/* pass the frame up the stack */
2295 	(*ifp->if_input)(ifp, m);
2296 }
2297 
2298 static inline void
2299 mxge_clean_rx_done(mxge_softc_t *sc)
2300 {
2301 	mxge_rx_done_t *rx_done = &sc->rx_done;
2302 	struct lro_entry *lro;
2303 	int limit = 0;
2304 	uint16_t length;
2305 	uint16_t checksum;
2306 
2307 
2308 	while (rx_done->entry[rx_done->idx].length != 0) {
2309 		length = ntohs(rx_done->entry[rx_done->idx].length);
2310 		rx_done->entry[rx_done->idx].length = 0;
2311 		checksum = rx_done->entry[rx_done->idx].checksum;
2312 		if (length <= (MHLEN - MXGEFW_PAD))
2313 			mxge_rx_done_small(sc, length, checksum);
2314 		else
2315 			mxge_rx_done_big(sc, length, checksum);
2316 		rx_done->cnt++;
2317 		rx_done->idx = rx_done->cnt & rx_done->mask;
2318 
2319 		/* limit potential for livelock */
2320 		if (__predict_false(++limit > rx_done->mask / 2))
2321 			break;
2322 	}
2323 	while (!SLIST_EMPTY(&sc->lro_active)) {
2324 		lro = SLIST_FIRST(&sc->lro_active);
2325 		SLIST_REMOVE_HEAD(&sc->lro_active, next);
2326 		mxge_lro_flush(sc, lro);
2327 	}
2328 }
2329 
2330 
2331 static inline void
2332 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2333 {
2334 	struct ifnet *ifp;
2335 	mxge_tx_buf_t *tx;
2336 	struct mbuf *m;
2337 	bus_dmamap_t map;
2338 	int idx;
2339 
2340 	tx = &sc->tx;
2341 	ifp = sc->ifp;
2342 	while (tx->pkt_done != mcp_idx) {
2343 		idx = tx->done & tx->mask;
2344 		tx->done++;
2345 		m = tx->info[idx].m;
2346 		/* mbuf and DMA map only attached to the first
2347 		   segment per-mbuf */
2348 		if (m != NULL) {
2349 			ifp->if_opackets++;
2350 			tx->info[idx].m = NULL;
2351 			map = tx->info[idx].map;
2352 			bus_dmamap_unload(tx->dmat, map);
2353 			m_freem(m);
2354 		}
2355 		if (tx->info[idx].flag) {
2356 			tx->info[idx].flag = 0;
2357 			tx->pkt_done++;
2358 		}
2359 	}
2360 
2361 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack
2362 	   that it's OK to send packets */
2363 
2364 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2365 	    tx->req - tx->done < (tx->mask + 1)/4) {
2366 		mtx_lock(&sc->tx_mtx);
2367 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2368 		sc->tx.wake++;
2369 		mxge_start_locked(sc);
2370 		mtx_unlock(&sc->tx_mtx);
2371 	}
2372 }
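
#if 0
/*
 * Illustrative sketch, not driver code: tx->req and tx->done are
 * free-running counters that are only masked down to a ring index
 * when a slot is touched.  With a power-of-two ring, occupancy and
 * index arithmetic stay correct even across 32-bit wraparound:
 */
static uint32_t
example_ring_space(uint32_t req, uint32_t done, uint32_t mask)
{
	uint32_t inflight = req - done;	/* unsigned math survives wrap */
	uint32_t idx = req & mask;	/* next slot to fill */

	(void)idx;
	/* free descriptors left in a ring of (mask + 1) entries */
	return (mask + 1 - inflight);
}
#endif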
2373 
2374 static struct mxge_media_type mxge_media_types[] =
2375 {
2376 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2377 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2378 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2379 	{0,		(1 << 5),	"10GBASE-ER"},
2380 	{0,		(1 << 4),	"10GBASE-LRM"},
2381 	{0,		(1 << 3),	"10GBASE-SW"},
2382 	{0,		(1 << 2),	"10GBASE-LW"},
2383 	{0,		(1 << 1),	"10GBASE-EW"},
2384 	{0,		(1 << 0),	"Reserved"}
2385 };
2386 
2387 static void
2388 mxge_set_media(mxge_softc_t *sc, int type)
2389 {
2390 	sc->media_flags |= type;
2391 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2392 	ifmedia_set(&sc->media, sc->media_flags);
2393 }
2394 
2395 
2396 /*
2397  * Determine the media type for a NIC.  Some XFPs will identify
2398  * themselves only when their link is up, so this is initiated via a
2399  * link up interrupt.  However, this can potentially take up to
2400  * several milliseconds, so it is run via the watchdog routine, rather
2401  * than in the interrupt handler itself.   This need only be done
2402  * once, not each time the link is up.
2403  */
2404 static void
2405 mxge_media_probe(mxge_softc_t *sc)
2406 {
2407 	mxge_cmd_t cmd;
2408 	char *ptr;
2409 	int i, err, ms;
2410 
2411 	sc->need_media_probe = 0;
2412 
2413 	/* if we've already set a media type, we're done */
2414 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2415 		return;
2416 
2417 	/*
2418 	 * parse the product code to determine the interface type
2419 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2420 	 * after the 3rd dash in the driver's cached copy of the
2421 	 * EEPROM's product code string.
2422 	 */
2423 	ptr = sc->product_code_string;
2424 	if (ptr == NULL) {
2425 		device_printf(sc->dev, "Missing product code\n");
2426 	}
2427 
2428 	for (i = 0; i < 3; i++, ptr++) {
2429 		ptr = strchr(ptr, '-');
2430 		if (ptr == NULL) {
2431 			device_printf(sc->dev,
2432 				      "only %d dashes in PC?!?\n", i);
2433 			return;
2434 		}
2435 	}
2436 	if (*ptr == 'C') {
2437 		mxge_set_media(sc, IFM_10G_CX4);
2438 		return;
2439 	}
2440 	else if (*ptr == 'Q') {
2441 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2442 		/* FreeBSD has no media type for Quad ribbon fiber */
2443 		return;
2444 	}
2445 
2446 	if (*ptr != 'R') {
2447 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2448 		return;
2449 	}
2450 
2451 	/*
2452 	 * At this point we know the NIC has an XFP cage, so now we
2453 	 * try to determine what is in the cage by using the
2454 	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2455 	 * register.  We read just one byte, which may take over
2456 	 * a millisecond.
2457 	 */
2458 
2459 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2460 	cmd.data1 = MXGE_XFP_COMPLIANCE_BYTE; /* the byte we want */
2461 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_I2C_READ, &cmd);
2462 	if (err == MXGEFW_CMD_ERROR_XFP_FAILURE) {
2463 		device_printf(sc->dev, "failed to read XFP\n");
2464 	}
2465 	if (err == MXGEFW_CMD_ERROR_XFP_ABSENT) {
2466 		device_printf(sc->dev, "Type R with no XFP!?!?\n");
2467 	}
2468 	if (err != MXGEFW_CMD_OK) {
2469 		return;
2470 	}
2471 
2472 	/* now we wait for the data to be cached */
2473 	cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2474 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2475 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2476 		DELAY(1000);
2477 		cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2478 		err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2479 	}
2480 	if (err != MXGEFW_CMD_OK) {
2481 		device_printf(sc->dev, "failed to read XFP (%d, %dms)\n",
2482 			      err, ms);
2483 		return;
2484 	}
2485 
2486 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2487 		if (mxge_verbose)
2488 			device_printf(sc->dev, "XFP:%s\n",
2489 				      mxge_media_types[0].name);
2490 		mxge_set_media(sc, IFM_10G_CX4);
2491 		return;
2492 	}
2493 	for (i = 1;
2494 	     i < sizeof (mxge_media_types) / sizeof (mxge_media_types[0]);
2495 	     i++) {
2496 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2497 			if (mxge_verbose)
2498 				device_printf(sc->dev, "XFP:%s\n",
2499 					      mxge_media_types[i].name);
2500 
2501 			mxge_set_media(sc, mxge_media_types[i].flag);
2502 			return;
2503 		}
2504 	}
2505 	device_printf(sc->dev, "XFP media 0x%x unknown\n", cmd.data0);
2506 
2507 	return;
2508 }
2509 
2510 static void
2511 mxge_intr(void *arg)
2512 {
2513 	mxge_softc_t *sc = arg;
2514 	mcp_irq_data_t *stats = sc->fw_stats;
2515 	mxge_tx_buf_t *tx = &sc->tx;
2516 	mxge_rx_done_t *rx_done = &sc->rx_done;
2517 	uint32_t send_done_count;
2518 	uint8_t valid;
2519 
2520 
2521 	/* make sure the DMA has finished */
2522 	if (!stats->valid) {
2523 		return;
2524 	}
2525 	valid = stats->valid;
2526 
2527 	if (!sc->msi_enabled) {
2528 		/* lower legacy IRQ  */
2529 		*sc->irq_deassert = 0;
2530 		if (!mxge_deassert_wait)
2531 			/* don't wait for confirmation that irq is low */
2532 			stats->valid = 0;
2533 	} else {
2534 		stats->valid = 0;
2535 	}
2536 
2537 	/* loop while waiting for legacy irq deassertion */
2538 	do {
2539 		/* check for transmit completes and receives */
2540 		send_done_count = be32toh(stats->send_done_count);
2541 		while ((send_done_count != tx->pkt_done) ||
2542 		       (rx_done->entry[rx_done->idx].length != 0)) {
2543 			mxge_tx_done(sc, (int)send_done_count);
2544 			mxge_clean_rx_done(sc);
2545 			send_done_count = be32toh(stats->send_done_count);
2546 		}
2547 	} while (*((volatile uint8_t *) &stats->valid));
2548 
2549 	if (__predict_false(stats->stats_updated)) {
2550 		if (sc->link_state != stats->link_up) {
2551 			sc->link_state = stats->link_up;
2552 			if (sc->link_state) {
2553 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2554 				if (mxge_verbose)
2555 					device_printf(sc->dev, "link up\n");
2556 			} else {
2557 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2558 				if (mxge_verbose)
2559 					device_printf(sc->dev, "link down\n");
2560 			}
2561 			sc->need_media_probe = 1;
2562 		}
2563 		if (sc->rdma_tags_available !=
2564 		    be32toh(sc->fw_stats->rdma_tags_available)) {
2565 			sc->rdma_tags_available =
2566 				be32toh(sc->fw_stats->rdma_tags_available);
2567 			device_printf(sc->dev, "RDMA timed out! %d tags "
2568 				      "left\n", sc->rdma_tags_available);
2569 		}
2570 
2571 		if (stats->link_down) {
2572 			sc->down_cnt += stats->link_down;
2573 			sc->link_state = 0;
2574 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2575 		}
2576 	}
2577 
2578 	/* check to see if we have an rx token to pass back */
2579 	if (valid & 0x1)
2580 		*sc->irq_claim = be32toh(3);
2581 	*(sc->irq_claim + 1) = be32toh(3);
2582 }
2583 
2584 static void
2585 mxge_init(void *arg)
2586 {
2587 }
2588 
2589 
2590 
2591 static void
2592 mxge_free_mbufs(mxge_softc_t *sc)
2593 {
2594 	int i;
2595 
2596 	for (i = 0; i <= sc->rx_big.mask; i++) {
2597 		if (sc->rx_big.info[i].m == NULL)
2598 			continue;
2599 		bus_dmamap_unload(sc->rx_big.dmat,
2600 				  sc->rx_big.info[i].map);
2601 		m_freem(sc->rx_big.info[i].m);
2602 		sc->rx_big.info[i].m = NULL;
2603 	}
2604 
2605 	for (i = 0; i <= sc->rx_small.mask; i++) {
2606 		if (sc->rx_small.info[i].m == NULL)
2607 			continue;
2608 		bus_dmamap_unload(sc->rx_small.dmat,
2609 				  sc->rx_small.info[i].map);
2610 		m_freem(sc->rx_small.info[i].m);
2611 		sc->rx_small.info[i].m = NULL;
2612 	}
2613 
2614 	for (i = 0; i <= sc->tx.mask; i++) {
2615 		sc->tx.info[i].flag = 0;
2616 		if (sc->tx.info[i].m == NULL)
2617 			continue;
2618 		bus_dmamap_unload(sc->tx.dmat,
2619 				  sc->tx.info[i].map);
2620 		m_freem(sc->tx.info[i].m);
2621 		sc->tx.info[i].m = NULL;
2622 	}
2623 }
2624 
2625 static void
2626 mxge_free_rings(mxge_softc_t *sc)
2627 {
2628 	int i;
2629 
2630 	if (sc->rx_done.entry != NULL)
2631 		mxge_dma_free(&sc->rx_done.dma);
2632 	sc->rx_done.entry = NULL;
2633 	if (sc->tx.req_bytes != NULL)
2634 		free(sc->tx.req_bytes, M_DEVBUF);
2635 	if (sc->tx.seg_list != NULL)
2636 		free(sc->tx.seg_list, M_DEVBUF);
2637 	if (sc->rx_small.shadow != NULL)
2638 		free(sc->rx_small.shadow, M_DEVBUF);
2639 	if (sc->rx_big.shadow != NULL)
2640 		free(sc->rx_big.shadow, M_DEVBUF);
2641 	if (sc->tx.info != NULL) {
2642 		if (sc->tx.dmat != NULL) {
2643 			for (i = 0; i <= sc->tx.mask; i++) {
2644 				bus_dmamap_destroy(sc->tx.dmat,
2645 						   sc->tx.info[i].map);
2646 			}
2647 			bus_dma_tag_destroy(sc->tx.dmat);
2648 		}
2649 		free(sc->tx.info, M_DEVBUF);
2650 	}
2651 	if (sc->rx_small.info != NULL) {
2652 		if (sc->rx_small.dmat != NULL) {
2653 			for (i = 0; i <= sc->rx_small.mask; i++) {
2654 				bus_dmamap_destroy(sc->rx_small.dmat,
2655 						   sc->rx_small.info[i].map);
2656 			}
2657 			bus_dmamap_destroy(sc->rx_small.dmat,
2658 					   sc->rx_small.extra_map);
2659 			bus_dma_tag_destroy(sc->rx_small.dmat);
2660 		}
2661 		free(sc->rx_small.info, M_DEVBUF);
2662 	}
2663 	if (sc->rx_big.info != NULL) {
2664 		if (sc->rx_big.dmat != NULL) {
2665 			for (i = 0; i <= sc->rx_big.mask; i++) {
2666 				bus_dmamap_destroy(sc->rx_big.dmat,
2667 						   sc->rx_big.info[i].map);
2668 			}
2669 			bus_dmamap_destroy(sc->rx_big.dmat,
2670 					   sc->rx_big.extra_map);
2671 			bus_dma_tag_destroy(sc->rx_big.dmat);
2672 		}
2673 		free(sc->rx_big.info, M_DEVBUF);
2674 	}
2675 }
2676 
2677 static int
2678 mxge_alloc_rings(mxge_softc_t *sc)
2679 {
2680 	mxge_cmd_t cmd;
2681 	int tx_ring_size, rx_ring_size;
2682 	int tx_ring_entries, rx_ring_entries;
2683 	int i, err;
2684 	unsigned long bytes;
2685 
2686 	/* get ring sizes */
2687 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2688 	tx_ring_size = cmd.data0;
2689 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2690 	if (err != 0) {
2691 		device_printf(sc->dev, "Cannot determine ring sizes\n");
2692 		goto abort_with_nothing;
2693 	}
2694 
2695 	rx_ring_size = cmd.data0;
2696 
2697 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2698 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
2699 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2700 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2701 	IFQ_SET_READY(&sc->ifp->if_snd);
2702 
2703 	sc->tx.mask = tx_ring_entries - 1;
2704 	sc->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2705 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
2706 	sc->rx_done.mask = (2 * rx_ring_entries) - 1;
2707 
2708 	err = ENOMEM;
2709 
2710 	/* allocate interrupt queues */
2711 	bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
2712 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2713 	if (err != 0)
2714 		goto abort_with_nothing;
2715 	sc->rx_done.entry = sc->rx_done.dma.addr;
2716 	bzero(sc->rx_done.entry, bytes);
2717 
2718 	/* allocate the tx request copy block */
2719 	bytes = 8 +
2720 		sizeof (*sc->tx.req_list) * (sc->tx.max_desc + 4);
2721 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2722 	if (sc->tx.req_bytes == NULL)
2723 		goto abort_with_alloc;
2724 	/* ensure req_list entries are aligned to 8 bytes */
2725 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2726 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
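	/*
	 * The expression above rounds req_bytes up to the next 8-byte
	 * boundary.  For example, a pointer ending in 0x403 becomes
	 * 0x40a after adding 7 and 0x408 after masking, which is 8-byte
	 * aligned and still within the 8 spare bytes allocated above.
	 */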
2727 
2728 	/* allocate the tx busdma segment list */
2729 	bytes = sizeof (*sc->tx.seg_list) * sc->tx.max_desc;
2730 	sc->tx.seg_list = (bus_dma_segment_t *)
2731 		malloc(bytes, M_DEVBUF, M_WAITOK);
2732 	if (sc->tx.seg_list == NULL)
2733 		goto abort_with_alloc;
2734 
2735 	/* allocate the rx shadow rings */
2736 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2737 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2738 	if (sc->rx_small.shadow == NULL)
2739 		goto abort_with_alloc;
2740 
2741 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2742 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2743 	if (sc->rx_big.shadow == NULL)
2744 		goto abort_with_alloc;
2745 
2746 	/* allocate the host info rings */
2747 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2748 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2749 	if (sc->tx.info == NULL)
2750 		goto abort_with_alloc;
2751 
2752 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2753 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2754 	if (sc->rx_small.info == NULL)
2755 		goto abort_with_alloc;
2756 
2757 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2758 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2759 	if (sc->rx_big.info == NULL)
2760 		goto abort_with_alloc;
2761 
2762 	/* allocate the busdma resources */
2763 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2764 				 1,			/* alignment */
2765 				 sc->tx.boundary,	/* boundary */
2766 				 BUS_SPACE_MAXADDR,	/* low */
2767 				 BUS_SPACE_MAXADDR,	/* high */
2768 				 NULL, NULL,		/* filter */
2769 				 65536 + 256,		/* maxsize */
2770 				 sc->tx.max_desc - 2,	/* num segs */
2771 				 sc->tx.boundary,	/* maxsegsize */
2772 				 BUS_DMA_ALLOCNOW,	/* flags */
2773 				 NULL, NULL,		/* lock */
2774 				 &sc->tx.dmat);		/* tag */
2775 
2776 	if (err != 0) {
2777 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2778 			      err);
2779 		goto abort_with_alloc;
2780 	}
2781 
2782 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2783 				 1,			/* alignment */
2784 				 4096,			/* boundary */
2785 				 BUS_SPACE_MAXADDR,	/* low */
2786 				 BUS_SPACE_MAXADDR,	/* high */
2787 				 NULL, NULL,		/* filter */
2788 				 MHLEN,			/* maxsize */
2789 				 1,			/* num segs */
2790 				 MHLEN,			/* maxsegsize */
2791 				 BUS_DMA_ALLOCNOW,	/* flags */
2792 				 NULL, NULL,		/* lock */
2793 				 &sc->rx_small.dmat);	/* tag */
2794 	if (err != 0) {
2795 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2796 			      err);
2797 		goto abort_with_alloc;
2798 	}
2799 
2800 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2801 				 1,			/* alignment */
2802 				 4096,			/* boundary */
2803 				 BUS_SPACE_MAXADDR,	/* low */
2804 				 BUS_SPACE_MAXADDR,	/* high */
2805 				 NULL, NULL,		/* filter */
2806 				 3*4096,		/* maxsize */
2807 				 3,			/* num segs */
2808 				 4096,			/* maxsegsize */
2809 				 BUS_DMA_ALLOCNOW,	/* flags */
2810 				 NULL, NULL,		/* lock */
2811 				 &sc->rx_big.dmat);	/* tag */
2812 	if (err != 0) {
2813 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2814 			      err);
2815 		goto abort_with_alloc;
2816 	}
2817 
2818 	/* now use these tags to set up dmamaps for each slot
2819 	   in each ring */
2820 	for (i = 0; i <= sc->tx.mask; i++) {
2821 		err = bus_dmamap_create(sc->tx.dmat, 0,
2822 					&sc->tx.info[i].map);
2823 		if (err != 0) {
2824 			device_printf(sc->dev, "Err %d  tx dmamap\n",
2825 			      err);
2826 			goto abort_with_alloc;
2827 		}
2828 	}
2829 	for (i = 0; i <= sc->rx_small.mask; i++) {
2830 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2831 					&sc->rx_small.info[i].map);
2832 		if (err != 0) {
2833 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2834 				      err);
2835 			goto abort_with_alloc;
2836 		}
2837 	}
2838 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2839 				&sc->rx_small.extra_map);
2840 	if (err != 0) {
2841 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2842 			      err);
2843 		goto abort_with_alloc;
2844 	}
2845 
2846 	for (i = 0; i <= sc->rx_big.mask; i++) {
2847 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2848 					&sc->rx_big.info[i].map);
2849 		if (err != 0) {
2850 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2851 			      err);
2852 			goto abort_with_alloc;
2853 		}
2854 	}
2855 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2856 				&sc->rx_big.extra_map);
2857 	if (err != 0) {
2858 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2859 			      err);
2860 		goto abort_with_alloc;
2861 	}
2862 	return 0;
2863 
2864 abort_with_alloc:
2865 	mxge_free_rings(sc);
2866 
2867 abort_with_nothing:
2868 	return err;
2869 }
2870 
2871 static void
2872 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
2873 {
2874 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
2875 
2876 	if (bufsize < MCLBYTES) {
2877 		/* easy, everything fits in a single buffer */
2878 		*big_buf_size = MCLBYTES;
2879 		*cl_size = MCLBYTES;
2880 		*nbufs = 1;
2881 		return;
2882 	}
2883 
2884 	if (bufsize < MJUMPAGESIZE) {
2885 		/* still easy, everything still fits in a single buffer */
2886 		*big_buf_size = MJUMPAGESIZE;
2887 		*cl_size = MJUMPAGESIZE;
2888 		*nbufs = 1;
2889 		return;
2890 	}
2891 	/* now we need to use virtually contiguous buffers */
2892 	*cl_size = MJUM9BYTES;
2893 	*big_buf_size = 4096;
2894 	*nbufs = mtu / 4096 + 1;
2895 	/* needs to be a power of two, so round up */
2896 	if (*nbufs == 3)
2897 		*nbufs = 4;
2898 }
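
/*
 * Worked example for mxge_choose_params(), assuming MXGEFW_PAD == 2,
 * MCLBYTES == 2048 and MJUMPAGESIZE == 4096 (i.e. 4KB pages):
 *
 *   MTU 1500: bufsize = 1500 + 14 + 4 + 2 = 1520 < MCLBYTES, so a
 *   single 2KB cluster holds the frame (nbufs = 1).
 *
 *   MTU 9000: bufsize = 9000 + 14 + 4 + 2 = 9020 > MJUMPAGESIZE, so a
 *   9KB cluster is handed to the NIC as 4096-byte pieces:
 *   nbufs = 9000 / 4096 + 1 = 3, rounded up to the power of two 4.
 */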
2899 
2900 static int
2901 mxge_open(mxge_softc_t *sc)
2902 {
2903 	mxge_cmd_t cmd;
2904 	int i, err, big_bytes;
2905 	bus_dmamap_t map;
2906 	bus_addr_t bus;
2907 	struct lro_entry *lro_entry;
2908 
2909 	SLIST_INIT(&sc->lro_free);
2910 	SLIST_INIT(&sc->lro_active);
2911 
2912 	for (i = 0; i < sc->lro_cnt; i++) {
2913 		lro_entry = (struct lro_entry *)
2914 			malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
2915 		if (lro_entry == NULL) {
2916 			sc->lro_cnt = i;
2917 			break;
2918 		}
2919 		SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
2920 	}
2921 
2922 	/* Copy the MAC address in case it was overridden */
2923 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2924 
2925 	err = mxge_reset(sc, 1);
2926 	if (err != 0) {
2927 		device_printf(sc->dev, "failed to reset\n");
2928 		return EIO;
2929 	}
2930 
2931 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
2932 			   &sc->rx_big.cl_size, &sc->rx_big.nbufs);
2933 
2934 	cmd.data0 = sc->rx_big.nbufs;
2935 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
2936 			    &cmd);
2937 	/* error is only meaningful if we're trying to set
2938 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
2939 	if (err && sc->rx_big.nbufs > 1) {
2940 		device_printf(sc->dev,
2941 			      "Failed to set alway-use-n to %d\n",
2942 			      sc->rx_big.nbufs);
2943 		return EIO;
2944 	}
2945 	/* get the lanai pointers to the send and receive rings */
2946 
2947 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2948 	sc->tx.lanai =
2949 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2950 	err |= mxge_send_cmd(sc,
2951 				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2952 	sc->rx_small.lanai =
2953 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2954 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2955 	sc->rx_big.lanai =
2956 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2957 
2958 	if (err != 0) {
2959 		device_printf(sc->dev,
2960 			      "failed to get ring sizes or locations\n");
2961 		return EIO;
2962 	}
2963 
2964 	/* stock receive rings */
2965 	for (i = 0; i <= sc->rx_small.mask; i++) {
2966 		map = sc->rx_small.info[i].map;
2967 		err = mxge_get_buf_small(sc, map, i);
2968 		if (err) {
2969 			device_printf(sc->dev, "alloced %d/%d smalls\n",
2970 				      i, sc->rx_small.mask + 1);
2971 			goto abort;
2972 		}
2973 	}
2974 	for (i = 0; i <= sc->rx_big.mask; i++) {
2975 		sc->rx_big.shadow[i].addr_low = 0xffffffff;
2976 		sc->rx_big.shadow[i].addr_high = 0xffffffff;
2977 	}
2978 	for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
2979 		map = sc->rx_big.info[i].map;
2980 		err = mxge_get_buf_big(sc, map, i);
2981 		if (err) {
2982 			device_printf(sc->dev, "alloced %d/%d bigs\n",
2983 				      i, sc->rx_big.mask + 1);
2984 			goto abort;
2985 		}
2986 	}
2987 
2988 	/* Give the firmware the mtu and the big and small buffer
2989 	   sizes.  The firmware wants the big buf size to be a power
2990 	   of two. Luckily, FreeBSD's clusters are powers of two */
2991 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
2992 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2993 	cmd.data0 = MHLEN - MXGEFW_PAD;
2994 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2995 			     &cmd);
2996 	cmd.data0 = big_bytes;
2997 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2998 
2999 	if (err != 0) {
3000 		device_printf(sc->dev, "failed to setup params\n");
3001 		goto abort;
3002 	}
3003 
3004 	/* Now give the firmware the pointer to the stats block */
3005 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
3006 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
3007 	cmd.data2 = sizeof(struct mcp_irq_data);
3008 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3009 
3010 	if (err != 0) {
3011 		bus = sc->fw_stats_dma.bus_addr;
3012 		bus += offsetof(struct mcp_irq_data, send_done_count);
3013 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3014 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3015 		err = mxge_send_cmd(sc,
3016 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3017 				    &cmd);
3018 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3019 		sc->fw_multicast_support = 0;
3020 	} else {
3021 		sc->fw_multicast_support = 1;
3022 	}
3023 
3024 	if (err != 0) {
3025 		device_printf(sc->dev, "failed to setup params\n");
3026 		goto abort;
3027 	}
3028 
3029 	/* Finally, start the firmware running */
3030 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3031 	if (err) {
3032 		device_printf(sc->dev, "Couldn't bring up link\n");
3033 		goto abort;
3034 	}
3035 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3036 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3037 
3038 	return 0;
3039 
3040 
3041 abort:
3042 	mxge_free_mbufs(sc);
3043 
3044 	return err;
3045 }
3046 
3047 static int
3048 mxge_close(mxge_softc_t *sc)
3049 {
3050 	struct lro_entry *lro_entry;
3051 	mxge_cmd_t cmd;
3052 	int err, old_down_cnt;
3053 
3054 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3055 	old_down_cnt = sc->down_cnt;
3056 	mb();
3057 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3058 	if (err) {
3059 		device_printf(sc->dev, "Couldn't bring down link\n");
3060 	}
3061 	if (old_down_cnt == sc->down_cnt) {
3062 		/* wait for down irq */
3063 		DELAY(10 * sc->intr_coal_delay);
3064 	}
3065 	if (old_down_cnt == sc->down_cnt) {
3066 		device_printf(sc->dev, "never got down irq\n");
3067 	}
3068 
3069 	mxge_free_mbufs(sc);
3070 
3071 	while (!SLIST_EMPTY(&sc->lro_free)) {
3072 		lro_entry = SLIST_FIRST(&sc->lro_free);
3073 		SLIST_REMOVE_HEAD(&sc->lro_free, next);
		free(lro_entry, M_DEVBUF); /* entries were malloc'd in mxge_open() */
3074 	}
3075 	return 0;
3076 }
3077 
3078 static void
3079 mxge_setup_cfg_space(mxge_softc_t *sc)
3080 {
3081 	device_t dev = sc->dev;
3082 	int reg;
3083 	uint16_t cmd, lnk, pectl;
3084 
3085 	/* find the PCIe link width and set max read request to 4KB */
3086 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3087 		lnk = pci_read_config(dev, reg + 0x12, 2);
3088 		sc->link_width = (lnk >> 4) & 0x3f;
3089 
3090 		pectl = pci_read_config(dev, reg + 0x8, 2);
3091 		pectl = (pectl & ~0x7000) | (5 << 12);
3092 		pci_write_config(dev, reg + 0x8, pectl, 2);
3093 	}
3094 
3095 	/* Enable DMA and Memory space access */
3096 	pci_enable_busmaster(dev);
3097 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3098 	cmd |= PCIM_CMD_MEMEN;
3099 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3100 }
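
#if 0
/*
 * Illustrative sketch, not driver code: in the PCIe Device Control
 * register manipulated above, bits 14:12 hold the Max_Read_Request_Size
 * field, encoded as (128 << value) bytes; writing 5 therefore selects
 * 128 << 5 = 4096 bytes:
 */
static int
example_mrrs_bytes(uint16_t pectl)
{
	return (128 << ((pectl >> 12) & 0x7));
}
#endif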
3101 
3102 static uint32_t
3103 mxge_read_reboot(mxge_softc_t *sc)
3104 {
3105 	device_t dev = sc->dev;
3106 	uint32_t vs;
3107 
3108 	/* find the vendor specific offset */
3109 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3110 		device_printf(sc->dev,
3111 			      "could not find vendor specific offset\n");
3112 		return (uint32_t)-1;
3113 	}
3114 	/* enable read32 mode */
3115 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3116 	/* tell NIC which register to read */
3117 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3118 	return (pci_read_config(dev, vs + 0x14, 4));
3119 }
3120 
3121 static void
3122 mxge_watchdog_reset(mxge_softc_t *sc)
3123 {
3124 	int err;
3125 	uint32_t reboot;
3126 	uint16_t cmd;
3127 
3128 	err = ENXIO;
3129 
3130 	device_printf(sc->dev, "Watchdog reset!\n");
3131 
3132 	/*
3133 	 * check to see if the NIC rebooted.  If it did, then all of
3134 	 * PCI config space has been reset, and things like the
3135 	 * busmaster bit will be zero.  If this is the case, then we
3136 	 * must restore PCI config space before the NIC can be used
3137 	 * again
3138 	 */
3139 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3140 	if (cmd == 0xffff) {
3141 		/*
3142 		 * maybe the watchdog caught the NIC rebooting; wait
3143 		 * up to 100ms for it to finish.  If it does not come
3144 		 * back, then give up
3145 		 */
3146 		DELAY(1000*100);
3147 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3148 		if (cmd == 0xffff) {
3149 			device_printf(sc->dev, "NIC disappeared!\n");
3150 			goto abort;
3151 		}
3152 	}
3153 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3154 		/* print the reboot status */
3155 		reboot = mxge_read_reboot(sc);
3156 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3157 			      reboot);
3158 		/* restore PCI configuration space */
3159 
3160 		/* XXXX waiting for pci_cfg_restore() to be exported */
3161 		goto abort; /* just abort for now */
3162 
3163 		/* and redo any changes we made to our config space */
3164 		mxge_setup_cfg_space(sc);
3165 	} else {
3166 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
3167 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
3168 			      sc->tx.req, sc->tx.done);
3169 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3170 			      sc->tx.pkt_done,
3171 			      be32toh(sc->fw_stats->send_done_count));
3172 	}
3173 
3174 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3175 		mxge_close(sc);
3176 		err = mxge_open(sc);
3177 	}
3178 
3179 abort:
3180 	/*
3181 	 * stop the watchdog if the nic is dead, to avoid spamming the
3182 	 * console
3183 	 */
3184 	if (err != 0) {
3185 		callout_stop(&sc->co_hdl);
3186 	}
3187 }
3188 
3189 static void
3190 mxge_watchdog(mxge_softc_t *sc)
3191 {
3192 	mxge_tx_buf_t *tx = &sc->tx;
3193 	uint32_t rx_pause = be32toh(sc->fw_stats->dropped_pause);
3194 
3195 	/* see if we have outstanding transmits, which
3196 	   have been pending for more than mxge_ticks */
3197 	if (tx->req != tx->done &&
3198 	    tx->watchdog_req != tx->watchdog_done &&
3199 	    tx->done == tx->watchdog_done) {
3200 		/* check for pause blocking before resetting */
3201 		if (tx->watchdog_rx_pause == rx_pause)
3202 			mxge_watchdog_reset(sc);
3203 		else
3204 			device_printf(sc->dev, "Flow control blocking "
3205 				      "xmits, check link partner\n");
3206 	}
3207 
3208 	tx->watchdog_req = tx->req;
3209 	tx->watchdog_done = tx->done;
3210 	tx->watchdog_rx_pause = rx_pause;
3211 
3212 	if (sc->need_media_probe)
3213 		mxge_media_probe(sc);
3214 }
3215 
3216 static void
3217 mxge_tick(void *arg)
3218 {
3219 	mxge_softc_t *sc = arg;
3220 
3221 
3222 	/* Synchronize with possible callout reset/stop. */
3223 	if (callout_pending(&sc->co_hdl) ||
3224 	    !callout_active(&sc->co_hdl)) {
3225 		mtx_unlock(&sc->driver_mtx);
3226 		return;
3227 	}
3228 
3229 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3230 	mxge_watchdog(sc);
3231 }
3232 
3233 static int
3234 mxge_media_change(struct ifnet *ifp)
3235 {
3236 	return EINVAL;
3237 }
3238 
3239 static int
3240 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3241 {
3242 	struct ifnet *ifp = sc->ifp;
3243 	int real_mtu, old_mtu;
3244 	int err = 0;
3245 
3246 
3247 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3248 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3249 		return EINVAL;
3250 	mtx_lock(&sc->driver_mtx);
3251 	old_mtu = ifp->if_mtu;
3252 	ifp->if_mtu = mtu;
3253 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3254 		callout_stop(&sc->co_hdl);
3255 		mxge_close(sc);
3256 		err = mxge_open(sc);
3257 		if (err != 0) {
3258 			ifp->if_mtu = old_mtu;
3259 			mxge_close(sc);
3260 			(void) mxge_open(sc);
3261 		}
3262 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3263 	}
3264 	mtx_unlock(&sc->driver_mtx);
3265 	return err;
3266 }
3267 
3268 static void
3269 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3270 {
3271 	mxge_softc_t *sc = ifp->if_softc;
3272 
3273 
3274 	if (sc == NULL)
3275 		return;
3276 	ifmr->ifm_status = IFM_AVALID;
3277 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3278 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3279 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3280 }
3281 
3282 static int
3283 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3284 {
3285 	mxge_softc_t *sc = ifp->if_softc;
3286 	struct ifreq *ifr = (struct ifreq *)data;
3287 	int err, mask;
3288 
3289 	err = 0;
3290 	switch (command) {
3291 	case SIOCSIFADDR:
3292 	case SIOCGIFADDR:
3293 		err = ether_ioctl(ifp, command, data);
3294 		break;
3295 
3296 	case SIOCSIFMTU:
3297 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3298 		break;
3299 
3300 	case SIOCSIFFLAGS:
3301 		mtx_lock(&sc->driver_mtx);
3302 		if (ifp->if_flags & IFF_UP) {
3303 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3304 				err = mxge_open(sc);
3305 				callout_reset(&sc->co_hdl, mxge_ticks,
3306 					      mxge_tick, sc);
3307 			} else {
3308 				/* take care of promisc and allmulti
3309 				   flag changes */
3310 				mxge_change_promisc(sc,
3311 						    ifp->if_flags & IFF_PROMISC);
3312 				mxge_set_multicast_list(sc);
3313 			}
3314 		} else {
3315 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3316 				mxge_close(sc);
3317 				callout_stop(&sc->co_hdl);
3318 			}
3319 		}
3320 		mtx_unlock(&sc->driver_mtx);
3321 		break;
3322 
3323 	case SIOCADDMULTI:
3324 	case SIOCDELMULTI:
3325 		mtx_lock(&sc->driver_mtx);
3326 		mxge_set_multicast_list(sc);
3327 		mtx_unlock(&sc->driver_mtx);
3328 		break;
3329 
3330 	case SIOCSIFCAP:
3331 		mtx_lock(&sc->driver_mtx);
3332 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3333 		if (mask & IFCAP_TXCSUM) {
3334 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3335 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3336 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3337 						      | CSUM_TSO);
3338 			} else {
3339 				ifp->if_capenable |= IFCAP_TXCSUM;
3340 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3341 			}
3342 		} else if (mask & IFCAP_RXCSUM) {
3343 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3344 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3345 				sc->csum_flag = 0;
3346 			} else {
3347 				ifp->if_capenable |= IFCAP_RXCSUM;
3348 				sc->csum_flag = 1;
3349 			}
3350 		}
3351 		if (mask & IFCAP_TSO4) {
3352 			if (IFCAP_TSO4 & ifp->if_capenable) {
3353 				ifp->if_capenable &= ~IFCAP_TSO4;
3354 				ifp->if_hwassist &= ~CSUM_TSO;
3355 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3356 				ifp->if_capenable |= IFCAP_TSO4;
3357 				ifp->if_hwassist |= CSUM_TSO;
3358 			} else {
3359 				printf("mxge requires tx checksum offload"
3360 				       " be enabled to use TSO\n");
3361 				err = EINVAL;
3362 			}
3363 		}
3364 		if (mask & IFCAP_LRO) {
3365 			if (IFCAP_LRO & ifp->if_capenable)
3366 				err = mxge_change_lro_locked(sc, 0);
3367 			else
3368 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3369 		}
3370 		if (mask & IFCAP_VLAN_HWTAGGING)
3371 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3372 		mtx_unlock(&sc->driver_mtx);
3373 		VLAN_CAPABILITIES(ifp);
3374 
3375 		break;
3376 
3377 	case SIOCGIFMEDIA:
3378 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3379 				    &sc->media, command);
3380 		break;
3381 
3382 	default:
3383 		err = ENOTTY;
3384 	}
3385 	return err;
3386 }
3387 
3388 static void
3389 mxge_fetch_tunables(mxge_softc_t *sc)
3390 {
3391 
3392 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3393 			  &mxge_flow_control);
3394 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3395 			  &mxge_intr_coal_delay);
3396 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3397 			  &mxge_nvidia_ecrc_enable);
3398 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3399 			  &mxge_force_firmware);
3400 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3401 			  &mxge_deassert_wait);
3402 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3403 			  &mxge_verbose);
3404 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3405 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3406 	if (sc->lro_cnt != 0)
3407 		mxge_lro_cnt = sc->lro_cnt;
3408 
3409 	if (bootverbose)
3410 		mxge_verbose = 1;
3411 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3412 		mxge_intr_coal_delay = 30;
3413 	if (mxge_ticks == 0)
3414 		mxge_ticks = hz;
3415 	sc->pause = mxge_flow_control;
3416 
3417 }
3418 
3419 static int
3420 mxge_attach(device_t dev)
3421 {
3422 	mxge_softc_t *sc = device_get_softc(dev);
3423 	struct ifnet *ifp;
3424 	int count, rid, err;
3425 
3426 	sc->dev = dev;
3427 	mxge_fetch_tunables(sc);
3428 
3429 	err = bus_dma_tag_create(NULL,			/* parent */
3430 				 1,			/* alignment */
3431 				 4096,			/* boundary */
3432 				 BUS_SPACE_MAXADDR,	/* low */
3433 				 BUS_SPACE_MAXADDR,	/* high */
3434 				 NULL, NULL,		/* filter */
3435 				 65536 + 256,		/* maxsize */
3436 				 MXGE_MAX_SEND_DESC, 	/* num segs */
3437 				 4096,			/* maxsegsize */
3438 				 0,			/* flags */
3439 				 NULL, NULL,		/* lock */
3440 				 &sc->parent_dmat);	/* tag */
3441 
3442 	if (err != 0) {
3443 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
3444 			      err);
3445 		goto abort_with_nothing;
3446 	}
3447 
3448 	ifp = sc->ifp = if_alloc(IFT_ETHER);
3449 	if (ifp == NULL) {
3450 		device_printf(dev, "can not if_alloc()\n");
3451 		err = ENOSPC;
3452 		goto abort_with_parent_dmat;
3453 	}
3454 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
3455 		 device_get_nameunit(dev));
3456 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
3457 	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
3458 		 device_get_nameunit(dev));
3459 	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
3460 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
3461 		 "%s:drv", device_get_nameunit(dev));
3462 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
3463 		 MTX_NETWORK_LOCK, MTX_DEF);
3464 
3465 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
3466 
3467 	mxge_setup_cfg_space(sc);
3468 
3469 	/* Map the board into the kernel */
3470 	rid = PCIR_BARS;
3471 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
3472 					 ~0, 1, RF_ACTIVE);
3473 	if (sc->mem_res == NULL) {
3474 		device_printf(dev, "could not map memory\n");
3475 		err = ENXIO;
3476 		goto abort_with_lock;
3477 	}
3478 	sc->sram = rman_get_virtual(sc->mem_res);
3479 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
3480 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
3481 		device_printf(dev, "impossible memory region size %ld\n",
3482 			      rman_get_size(sc->mem_res));
3483 		err = ENXIO;
3484 		goto abort_with_mem_res;
3485 	}
3486 
3487 	/* make a NUL-terminated copy of the EEPROM strings section of
3488 	   lanai SRAM */
3489 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
3490 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
3491 				rman_get_bushandle(sc->mem_res),
3492 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
3493 				sc->eeprom_strings,
3494 				MXGE_EEPROM_STRINGS_SIZE - 2);
3495 	err = mxge_parse_strings(sc);
3496 	if (err != 0)
3497 		goto abort_with_mem_res;
3498 
3499 	/* Enable write combining for efficient use of PCIe bus */
3500 	mxge_enable_wc(sc);
3501 
3502 	/* Allocate the out of band dma memory */
3503 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
3504 			     sizeof (mxge_cmd_t), 64);
3505 	if (err != 0)
3506 		goto abort_with_mem_res;
3507 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
3508 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
3509 	if (err != 0)
3510 		goto abort_with_cmd_dma;
3511 
3512 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
3513 			     sizeof (*sc->fw_stats), 64);
3514 	if (err != 0)
3515 		goto abort_with_zeropad_dma;
3516 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
3517 
3518 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
3519 	if (err != 0)
3520 		goto abort_with_fw_stats;
3521 
3522 	/* Add our ithread  */
3523 	count = pci_msi_count(dev);
3524 	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
3525 		rid = 1;
3526 		sc->msi_enabled = 1;
3527 	} else {
3528 		rid = 0;
3529 	}
3530 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3531 					 1, RF_SHAREABLE | RF_ACTIVE);
3532 	if (sc->irq_res == NULL) {
3533 		device_printf(dev, "could not alloc interrupt\n");
3534 		goto abort_with_dmabench;
3535 	}
3536 	if (mxge_verbose)
3537 		device_printf(dev, "using %s irq %ld\n",
3538 			      sc->msi_enabled ? "MSI" : "INTx",
3539 			      rman_get_start(sc->irq_res));
3540 	/* select & load the firmware */
3541 	err = mxge_select_firmware(sc);
3542 	if (err != 0)
3543 		goto abort_with_irq_res;
3544 	sc->intr_coal_delay = mxge_intr_coal_delay;
3545 	err = mxge_reset(sc, 0);
3546 	if (err != 0)
3547 		goto abort_with_irq_res;
3548 
3549 	err = mxge_alloc_rings(sc);
3550 	if (err != 0) {
3551 		device_printf(sc->dev, "failed to allocate rings\n");
3552 		goto abort_with_irq_res;
3553 	}
3554 
3555 	err = bus_setup_intr(sc->dev, sc->irq_res,
3556 			     INTR_TYPE_NET | INTR_MPSAFE,
3557 			     NULL, mxge_intr, sc, &sc->ih);
3558 	if (err != 0) {
3559 		goto abort_with_rings;
3560 	}
3561 	/* hook into the network stack */
3562 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3563 	ifp->if_baudrate = 100000000;
3564 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3565 		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING |
3566 		IFCAP_VLAN_HWCSUM | IFCAP_LRO;
3567 
3568 	sc->max_mtu = mxge_max_mtu(sc);
3569 	if (sc->max_mtu >= 9000)
3570 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
3571 	else
3572 		device_printf(dev, "MTU limited to %d.  Install "
3573 			      "latest firmware for 9000 byte jumbo support\n",
3574 			      sc->max_mtu - ETHER_HDR_LEN);
3575 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3576 	ifp->if_capenable = ifp->if_capabilities;
3577 	if (sc->lro_cnt == 0)
3578 		ifp->if_capenable &= ~IFCAP_LRO;
3579 	sc->csum_flag = 1;
3580 	ifp->if_init = mxge_init;
3581 	ifp->if_softc = sc;
3582 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3583 	ifp->if_ioctl = mxge_ioctl;
3584 	ifp->if_start = mxge_start;
3585 	/* Initialise the ifmedia structure */
3586 	ifmedia_init(&sc->media, 0, mxge_media_change,
3587 		     mxge_media_status);
3588 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
3589 	mxge_media_probe(sc);
3590 	ether_ifattach(ifp, sc->mac_addr);
3591 	/* ether_ifattach sets mtu to 1500 */
3592 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
3593 		ifp->if_mtu = 9000;
3594 
3595 	mxge_add_sysctls(sc);
3596 	return 0;
3597 
3598 abort_with_rings:
3599 	mxge_free_rings(sc);
3600 abort_with_irq_res:
3601 	bus_release_resource(dev, SYS_RES_IRQ,
3602 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3603 	if (sc->msi_enabled)
3604 		pci_release_msi(dev);
3605 abort_with_dmabench:
3606 	mxge_dma_free(&sc->dmabench_dma);
3607 abort_with_fw_stats:
3608 	mxge_dma_free(&sc->fw_stats_dma);
3609 abort_with_zeropad_dma:
3610 	mxge_dma_free(&sc->zeropad_dma);
3611 abort_with_cmd_dma:
3612 	mxge_dma_free(&sc->cmd_dma);
3613 abort_with_mem_res:
3614 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3615 abort_with_lock:
3616 	pci_disable_busmaster(dev);
3617 	mtx_destroy(&sc->cmd_mtx);
3618 	mtx_destroy(&sc->tx_mtx);
3619 	mtx_destroy(&sc->driver_mtx);
3620 	if_free(ifp);
3621 abort_with_parent_dmat:
3622 	bus_dma_tag_destroy(sc->parent_dmat);
3623 
3624 abort_with_nothing:
3625 	return err;
3626 }
3627 
3628 static int
3629 mxge_detach(device_t dev)
3630 {
3631 	mxge_softc_t *sc = device_get_softc(dev);
3632 
3633 	if (sc->ifp->if_vlantrunk != NULL) {
3634 		device_printf(sc->dev,
3635 			      "Detach vlans before removing module\n");
3636 		return EBUSY;
3637 	}
3638 	mtx_lock(&sc->driver_mtx);
3639 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3640 		mxge_close(sc);
3641 	callout_stop(&sc->co_hdl);
3642 	mtx_unlock(&sc->driver_mtx);
3643 	ether_ifdetach(sc->ifp);
3644 	ifmedia_removeall(&sc->media);
3645 	mxge_dummy_rdma(sc, 0);
3646 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3647 	mxge_free_rings(sc);
3648 	bus_release_resource(dev, SYS_RES_IRQ,
3649 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3650 	if (sc->msi_enabled)
3651 		pci_release_msi(dev);
3652 
3653 	sc->rx_done.entry = NULL;
3654 	mxge_dma_free(&sc->fw_stats_dma);
3655 	mxge_dma_free(&sc->dmabench_dma);
3656 	mxge_dma_free(&sc->zeropad_dma);
3657 	mxge_dma_free(&sc->cmd_dma);
3658 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3659 	pci_disable_busmaster(dev);
3660 	mtx_destroy(&sc->cmd_mtx);
3661 	mtx_destroy(&sc->tx_mtx);
3662 	mtx_destroy(&sc->driver_mtx);
3663 	if_free(sc->ifp);
3664 	bus_dma_tag_destroy(sc->parent_dmat);
3665 	return 0;
3666 }
3667 
3668 static int
3669 mxge_shutdown(device_t dev)
3670 {
3671 	return 0;
3672 }
3673 
3674 /*
3675   This file uses Myri10GE driver indentation.
3676 
3677   Local Variables:
3678   c-file-style:"linux"
3679   tab-width:8
3680   End:
3681 */
3682