xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 0c927cdd8e6e05387fc5a9ffcb5dbe128d4ad749)
/******************************************************************************

Copyright (c) 2006, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

 3. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	len = rman_get_size(sc->mem_res);
#if defined(__i386) || defined(__amd64)
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
#endif
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	} else {
		sc->wc = 1;
	}
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
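/*
 * Example usage (a sketch, not a call site from this file): the attach
 * path allocates small shared regions with this helper, e.g. something
 * like
 *	err = mxge_dma_alloc(sc, &sc->cmd_dma,
 *			     sizeof (mcp_cmd_response_t), 64);
 * The alignment argument matters because the firmware DMAs command
 * responses into this memory; the real call sites live in mxge_attach(),
 * outside this excerpt.
 */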


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
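/* Note: the macro ignores its "p" argument and advances the caller's
   "ptr" past the next NUL (stopping at "limit"); it only works where
   a char *ptr and char *limit are in scope, as they are below. */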

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
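/*
 * Worked example: given the (made-up) EEPROM blob
 *	"SN=123456\0MAC=00:60:dd:47:87:2f\0PC=M10GE-8A\0\0"
 * the MAC branch starts with ptr on the 'M'; the "ptr += 1" plus the
 * "ptr += 3" at the top of each loop pass lands ptr on each hex octet
 * in turn (offsets 4, 7, 10, ...), so strtoul() parses 00, 60, dd,
 * 47, 87, 2f into sc->mac_addr[].
 */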

#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   The test below is commented out because it is believed that
	   doing a config read/write beyond 0xff will access the config
	   space of the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access.
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
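/*
 * Worked example of the extended-config offset math above: with the
 * ck804 base of 0xe0000000, a bridge at bus 0x80, slot 0x0e, func 0
 * (illustrative numbers) maps at
 *	off = 0xe0000000 + 0x80 * 0x100000 + (0 + 8 * 0x0e) * 0x1000
 *	    = 0xe8070000
 * i.e. 1MB per bus and 4KB per function, the standard MMCONFIG layout.
 * Setting bit 0x40 of the PCIe register at offset 0x178 then turns on
 * ECRC generation on these Nvidia bridges.
 */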


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx.boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
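/*
 * Arithmetic behind the MB/s numbers above: each completed transfer
 * moves "len" bytes and a tick is 0.5us, so
 *	MB/s = (transfers * len) / (ticks * 0.5us)
 *	     = (transfers * len * 2) / ticks	(bytes per us == MB/s)
 * e.g. 2000 transfers of 4096 bytes in 10000 ticks (5ms) gives
 * 2000 * 4096 * 2 / 10000 = 1638 MB/s.  The read/write case is
 * multiplied by 2 again because data moves in both directions.
 * (The sample numbers are illustrative, not measurements.)
 */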

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx.boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx.boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx.boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date firmware\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
	return (mxge_load_firmware(sc));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	hack.ro_char = fw_data;
	/* Copy the firmware image to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
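/*
 * Image layout assumed above (a sketch): the firmware carries a 32-bit
 * big-endian pointer at MCP_HEADER_PTR_OFFSET giving the byte offset
 * of its mcp_gen_header_t:
 *
 *	+----------------------+ 0
 *	| ... code ...         |
 *	| header ptr (BE32)    | <- MCP_HEADER_PTR_OFFSET
 *	| ... code ...         |
 *	| mcp_gen_header_t     | <- hdr_offset (4-byte aligned)
 *	+----------------------+ fw->datasize
 *
 * Since htobe32() and be32toh() are the same byte swap on a
 * little-endian host, the htobe32() above works, though be32toh()
 * would state the intent more clearly.
 */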

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	mb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		mb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}
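/*
 * Typical call pattern (as used throughout this file): fill in the
 * data words, issue the command, and read any result back out of
 * cmd.data0, e.g.
 *	mxge_cmd_t cmd;
 *	cmd.data0 = (uint32_t)bytes;
 *	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
 * cmd_mtx serializes callers because there is only a single response
 * slot (sc->cmd) shared by all commands.
 */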

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}
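/*
 * Packing example: for the (made-up) MAC 00:60:dd:47:87:2f the two
 * command words above come out as
 *	cmd.data0 = 0x0060dd47	(address bytes 0-3)
 *	cmd.data1 = 0x0000872f	(address bytes 4-5)
 * mxge_send_cmd() then converts both to big-endian for the firmware.
 */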

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}
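/*
 * For scale: MXGEFW_PAD is the small pad the firmware places in front
 * of each received frame (2 bytes in the usual MCP definitions; an
 * assumption, not verified in this file), so the achievable MTU is
 * either the firmware maximum or one jumbo page, minus that pad.
 */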

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{

	mxge_cmd_t cmd;
	size_t bytes;
	int status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
		memset(sc->rx_done.entry, 0, bytes);
		cmd.data0 = (uint32_t)bytes;
		status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
		cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
		cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
		status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	/* reset mcp/driver shared state back to 0 */
	sc->rx_done.idx = 0;
	sc->rx_done.cnt = 0;
	sc->tx.req = 0;
	sc->tx.done = 0;
	sc->tx.pkt_done = 0;
	sc->tx.wake = 0;
	sc->tx_defrag = 0;
	sc->tx.stall = 0;
	sc->rx_big.cnt = 0;
	sc->rx_small.cnt = 0;
	sc->rdma_tags_available = 15;
	sc->fw_stats->valid = 0;
	sc->fw_stats->send_done_count = 0;
	sc->lro_bad_csum = 0;
	sc->lro_queued = 0;
	sc->lro_flushed = 0;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}
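/*
 * The trick above: the firmware counters live in SRAM in network byte
 * order, so the handler swaps the value into the local arg2 and passes
 * a NULL arg1; sysctl_handle_int() then exports arg2 as a read-only
 * snapshot instead of dereferencing the raw big-endian pointer.
 */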

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx.boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control for this interface");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* host counters exported for debugging */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_small_cnt",
		       CTLFLAG_RD, &sc->rx_small.cnt,
		       0, "rx_small_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_big_cnt",
		       CTLFLAG_RD, &sc->rx_big.cnt,
		       0, "rx_big_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_req",
		       CTLFLAG_RD, &sc->tx.req,
		       0, "tx_req");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_done",
		       CTLFLAG_RD, &sc->tx.done,
		       0, "tx_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_pkt_done",
		       CTLFLAG_RD, &sc->tx.pkt_done,
		       0, "tx_pkt_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_stall",
		       CTLFLAG_RD, &sc->tx.stall,
		       0, "tx_stall");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_wake",
		       CTLFLAG_RD, &sc->tx.wake,
		       0, "tx_wake");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_defrag",
		       CTLFLAG_RD, &sc->tx_defrag,
		       0, "tx_defrag");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_cnt", CTLFLAG_RD, &sc->lro_cnt,
		       0, "number of lro merge queues");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
		       0, "number of lro merge queues flushed");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "lro_queued", CTLFLAG_RD, &sc->lro_queued,
		       0, "number of frames appended to lro merge queues");

}
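/*
 * Everything registered above lands under the device's sysctl tree,
 * dev.mxge.<unit>; for example (hypothetical unit 0):
 *	sysctl dev.mxge.0.intr_coal_delay=75
 *	sysctl dev.mxge.0.read_dma_MBs
 * The firmware counters go through mxge_handle_be32() and so are
 * read-only snapshots.
 */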

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints += 3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints += 3;
        *dst_ints = *src_ints;
        tx->req += cnt;
        mb();
}

static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt,
	       int ip_off)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   sc->scratch);
		ip = (struct ip *)(sc->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces */
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
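				/*
				 * Branchless form of:
				 *	if (chop || next_is_first)
				 *		rdma_count = (chop &&
				 *		    !next_is_first) ? 0 : -1;
				 * i.e. reset the RDMA run at every
				 * segment cut; the rdma_count++ below
				 * restarts the count for the new frame.
				 */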
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			}

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > tx->max_desc))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("tx->max_desc exceeded via TSO!\n");
		printf("mss = %d, %ld, %d!\n", mss,
		       (long)seg - (long)tx->seg_list, tx->max_desc);
		once = 1;
	}
	return;

}
1681 
1682 /*
1683  * We reproduce the software vlan tag insertion from
1684  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1685  * vlan tag insertion. We need to advertise this in order to have the
1686  * vlan interface respect our csum offload flags.
1687  */
1688 static struct mbuf *
1689 mxge_vlan_tag_insert(struct mbuf *m)
1690 {
1691 	struct ether_vlan_header *evl;
1692 
1693 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1694 	if (__predict_false(m == NULL))
1695 		return NULL;
1696 	if (m->m_len < sizeof(*evl)) {
1697 		m = m_pullup(m, sizeof(*evl));
1698 		if (__predict_false(m == NULL))
1699 			return NULL;
1700 	}
1701 	/*
1702 	 * Transform the Ethernet header into an Ethernet header
1703 	 * with 802.1Q encapsulation.
1704 	 */
1705 	evl = mtod(m, struct ether_vlan_header *);
1706 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1707 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1708 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1709 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1710 	m->m_flags &= ~M_VLANTAG;
1711 	return m;
1712 }
1713 
1714 static void
1715 mxge_encap(mxge_softc_t *sc, struct mbuf *m)
1716 {
1717 	mcp_kreq_ether_send_t *req;
1718 	bus_dma_segment_t *seg;
1719 	struct mbuf *m_tmp;
1720 	struct ifnet *ifp;
1721 	mxge_tx_buf_t *tx;
1722 	struct ip *ip;
1723 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1724 	uint16_t pseudo_hdr_offset;
1725         uint8_t flags, cksum_offset;
1726 
1727 
1728 
1729 	ifp = sc->ifp;
1730 	tx = &sc->tx;
1731 
1732 	ip_off = sizeof (struct ether_header);
1733 	if (m->m_flags & M_VLANTAG) {
1734 		m = mxge_vlan_tag_insert(m);
1735 		if (__predict_false(m == NULL))
1736 			goto drop;
1737 		ip_off += ETHER_VLAN_ENCAP_LEN;
1738 	}
1739 
1740 	/* (try to) map the frame for DMA */
1741 	idx = tx->req & tx->mask;
1742 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1743 				      m, tx->seg_list, &cnt,
1744 				      BUS_DMA_NOWAIT);
1745 	if (__predict_false(err == EFBIG)) {
1746 		/* Too many segments in the chain.  Try
1747 		   to defrag */
1748 		m_tmp = m_defrag(m, M_NOWAIT);
1749 		if (m_tmp == NULL) {
1750 			goto drop;
1751 		}
1752 		sc->tx_defrag++;
1753 		m = m_tmp;
1754 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1755 					      tx->info[idx].map,
1756 					      m, tx->seg_list, &cnt,
1757 					      BUS_DMA_NOWAIT);
1758 	}
1759 	if (__predict_false(err != 0)) {
1760 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1761 			      " packet len = %d\n", err, m->m_pkthdr.len);
1762 		goto drop;
1763 	}
1764 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1765 			BUS_DMASYNC_PREWRITE);
1766 	tx->info[idx].m = m;
1767 
1768 
1769 	/* TSO is different enough, we handle it in another routine */
1770 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1771 		mxge_encap_tso(sc, m, cnt, ip_off);
1772 		return;
1773 	}
1774 
1775 	req = tx->req_list;
1776 	cksum_offset = 0;
1777 	pseudo_hdr_offset = 0;
1778 	flags = MXGEFW_FLAGS_NO_TSO;
1779 
1780 	/* checksum offloading? */
1781 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1782 		/* ensure ip header is in first mbuf, copy
1783 		   it to a scratch buffer if not */
1784 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1785 			m_copydata(m, 0, ip_off + sizeof (*ip),
1786 				   sc->scratch);
1787 			ip = (struct ip *)(sc->scratch + ip_off);
1788 		} else {
1789 			ip = (struct ip *)(mtod(m, char *) + ip_off);
1790 		}
1791 		cksum_offset = ip_off + (ip->ip_hl << 2);
1792 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1793 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1794 		req->cksum_offset = cksum_offset;
1795 		flags |= MXGEFW_FLAGS_CKSUM;
1796 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1797 	} else {
1798 		odd_flag = 0;
1799 	}
1800 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1801 		flags |= MXGEFW_FLAGS_SMALL;
1802 
1803 	/* convert segments into a request list */
1804 	cum_len = 0;
1805 	seg = tx->seg_list;
1806 	req->flags = MXGEFW_FLAGS_FIRST;
1807 	for (i = 0; i < cnt; i++) {
1808 		req->addr_low =
1809 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1810 		req->addr_high =
1811 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1812 		req->length = htobe16(seg->ds_len);
1813 		req->cksum_offset = cksum_offset;
1814 		if (cksum_offset > seg->ds_len)
1815 			cksum_offset -= seg->ds_len;
1816 		else
1817 			cksum_offset = 0;
1818 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1819 		req->pad = 0; /* complete solid 16-byte block */
1820 		req->rdma_count = 1;
1821 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1822 		cum_len += seg->ds_len;
1823 		seg++;
1824 		req++;
1825 		req->flags = 0;
1826 	}
1827 	req--;
1828 	/* pad runts to 60 bytes */
1829 	if (cum_len < 60) {
1830 		req++;
1831 		req->addr_low =
1832 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
1833 		req->addr_high =
1834 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
1835 		req->length = htobe16(60 - cum_len);
1836 		req->cksum_offset = 0;
1837 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1838 		req->pad = 0; /* complete solid 16-byte block */
1839 		req->rdma_count = 1;
1840 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1841 		cnt++;
1842 	}
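	/*
	 * (The 60-byte floor is ETHER_MIN_LEN minus the 4-byte FCS; the
	 * shortfall is supplied from the shared zeropad DMA region
	 * rather than by touching the mbuf chain.)
	 */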
1843 
1844 	tx->req_list[0].rdma_count = cnt;
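	/*
	 * Each descriptor above carried rdma_count = 1; the first
	 * descriptor's rdma_count is now overwritten with the total
	 * descriptor count for this send.
	 */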
1845 #if 0
1846 	/* print what the firmware will see */
1847 	for (i = 0; i < cnt; i++) {
1848 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1849 		    "cso:%d, flags:0x%x, rdma:%d\n",
1850 		    i, (int)ntohl(tx->req_list[i].addr_high),
1851 		    (int)ntohl(tx->req_list[i].addr_low),
1852 		    (int)ntohs(tx->req_list[i].length),
1853 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1854 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1855 		    tx->req_list[i].rdma_count);
1856 	}
1857 	printf("--------------\n");
1858 #endif
1859 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1860 	mxge_submit_req(tx, tx->req_list, cnt);
1861 	return;
1862 
1863 drop:
1864 	m_freem(m);
1865 	ifp->if_oerrors++;
1866 	return;
1867 }
1868 
1869 
1870 
1871 
1872 static inline void
1873 mxge_start_locked(mxge_softc_t *sc)
1874 {
1875 	struct mbuf *m;
1876 	struct ifnet *ifp;
1877 	mxge_tx_buf_t *tx;
1878 
1879 	ifp = sc->ifp;
1880 	tx = &sc->tx;
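	/*
	 * Dequeue only while the ring can absorb a worst-case frame:
	 * the free slot count is mask - (req - done), and max_desc is
	 * the most descriptors a single mxge_encap() can post.
	 */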
1881 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1882 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1883 		if (m == NULL) {
1884 			return;
1885 		}
1886 		/* let BPF see it */
1887 		BPF_MTAP(ifp, m);
1888 
1889 		/* give it to the nic */
1890 		mxge_encap(sc, m);
1891 	}
1892 	/* ran out of transmit slots */
1893 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1894 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1895 		tx->stall++;
1896 	}
1897 }
1898 
1899 static void
1900 mxge_start(struct ifnet *ifp)
1901 {
1902 	mxge_softc_t *sc = ifp->if_softc;
1903 
1904 
1905 	mtx_lock(&sc->tx_mtx);
1906 	mxge_start_locked(sc);
1907 	mtx_unlock(&sc->tx_mtx);
1908 }
1909 
1910 /*
1911  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1912  * at most 32 bytes at a time, so as to avoid involving the software
1913  * PIO handler in the NIC.  We re-write the first segment's low
1914  * DMA address to mark it valid only after we write the entire chunk
1915  * in a burst.
1916  */
1917 static inline void
1918 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1919 		mcp_kreq_ether_recv_t *src)
1920 {
1921 	uint32_t low;
1922 
1923 	low = src->addr_low;
1924 	src->addr_low = 0xffffffff;
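	/*
	 * Each mcp_kreq_ether_recv_t is just an 8-byte DMA address, so
	 * the 8 descriptors form one 64-byte block, copied as two
	 * 32-byte bursts.  addr_low of the first descriptor stays
	 * 0xffffffff (invalid) until the single write below publishes
	 * the real value, making the whole group visible to the
	 * firmware at once.
	 */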
1925 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
1926 	mb();
1927 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
1928 	mb();
1929 	src->addr_low = low;
1930 	dst->addr_low = low;
1931 	mb();
1932 }
1933 
1934 static int
1935 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1936 {
1937 	bus_dma_segment_t seg;
1938 	struct mbuf *m;
1939 	mxge_rx_buf_t *rx = &sc->rx_small;
1940 	int cnt, err;
1941 
1942 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1943 	if (m == NULL) {
1944 		rx->alloc_fail++;
1945 		err = ENOBUFS;
1946 		goto done;
1947 	}
1948 	m->m_len = MHLEN;
1949 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1950 				      &seg, &cnt, BUS_DMA_NOWAIT);
1951 	if (err != 0) {
1952 		m_free(m);
1953 		goto done;
1954 	}
1955 	rx->info[idx].m = m;
1956 	rx->shadow[idx].addr_low =
1957 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1958 	rx->shadow[idx].addr_high =
1959 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1960 
1961 done:
1962 	if ((idx & 7) == 7)
1963 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
1964 	return err;
1965 }
1966 
1967 static int
1968 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1969 {
1970 	bus_dma_segment_t seg[3];
1971 	struct mbuf *m;
1972 	mxge_rx_buf_t *rx = &sc->rx_big;
1973 	int cnt, err, i;
1974 
1975 	if (rx->cl_size == MCLBYTES)
1976 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
1977 	else
1978 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
1979 	if (m == NULL) {
1980 		rx->alloc_fail++;
1981 		err = ENOBUFS;
1982 		goto done;
1983 	}
1984 	m->m_len = rx->cl_size;
1985 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1986 				      seg, &cnt, BUS_DMA_NOWAIT);
1987 	if (err != 0) {
1988 		m_free(m);
1989 		goto done;
1990 	}
1991 	rx->info[idx].m = m;
1992 
1993 	for (i = 0; i < cnt; i++) {
1994 		rx->shadow[idx + i].addr_low =
1995 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
1996 		rx->shadow[idx + i].addr_high =
1997 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
1998 	}
1999 
2000 
2001 done:
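	/*
	 * A jumbo buffer spans rx->nbufs consecutive ring slots (e.g.
	 * four 4KB hardware buffers backing one 9KB cluster), and
	 * descriptors reach the NIC in groups of 8, so every slot
	 * advanced over here is checked for completing such a group.
	 */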
2002 	for (i = 0; i < rx->nbufs; i++) {
2003 		if ((idx & 7) == 7) {
2004 			mxge_submit_8rx(&rx->lanai[idx - 7],
2005 					&rx->shadow[idx - 7]);
2006 		}
2007 		idx++;
2008 	}
2009 	return err;
2010 }
2011 
2012 /*
2013  *  Myri10GE hardware checksums are not valid if the sender
2014  *  padded the frame with non-zero padding.  This is because
2015  *  the firmware just does a simple 16-bit 1s complement
2016  *  checksum across the entire frame, excluding the first 14
2017  *  bytes.  It is best to simply check the checksum and
2018  *  tell the stack about it only if the checksum is good
2019  */
2020 
2021 static inline uint16_t
2022 mxge_rx_csum(struct mbuf *m, int csum)
2023 {
2024 	struct ether_header *eh;
2025 	struct ip *ip;
2026 	uint16_t c;
2027 
2028 	eh = mtod(m, struct ether_header *);
2029 
2030 	/* only deal with IPv4 TCP & UDP for now */
2031 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2032 		return 1;
2033 	ip = (struct ip *)(eh + 1);
2034 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2035 			    ip->ip_p != IPPROTO_UDP))
2036 		return 1;
2037 
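	/*
	 * The firmware's csum is a raw ones-complement sum of
	 * everything past the 14-byte Ethernet header, i.e. the IP
	 * header plus the L4 segment.  A valid IP header sums to
	 * 0xffff, which is congruent to zero in ones-complement
	 * arithmetic, so csum is effectively the sum of the L4 segment
	 * alone.  Folding in the pseudo-header (addresses, protocol,
	 * and L4 length = ip_len minus header length) must then yield
	 * 0xffff for a good packet, making the return value 0.
	 */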
2038 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2039 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2040 			    (ip->ip_hl << 2) + ip->ip_p));
2041 	c ^= 0xffff;
2042 	return (c);
2043 }
2044 
2045 static void
2046 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2047 {
2048 	struct ether_vlan_header *evl;
2049 	struct ether_header *eh;
2050 	uint32_t partial;
2051 
2052 	evl = mtod(m, struct ether_vlan_header *);
2053 	eh = mtod(m, struct ether_header *);
2054 
2055 	/*
2056 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2057 	 * after what the firmware thought was the end of the ethernet
2058 	 * header.
2059 	 */
2060 
2061 	/* put checksum into host byte order */
2062 	*csum = ntohs(*csum);
2063 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2064 	(*csum) += ~partial;
2065 	(*csum) +=  ((*csum) < ~partial);
2066 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2067 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
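	/* The lines above are ones-complement arithmetic: adding
	   ~partial subtracts the four removed tag bytes, the next line
	   propagates the end-around carry, and the two folds reduce the
	   32-bit accumulator back to 16 bits (twice, since the first
	   fold can itself carry out). */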
2068 
2069 	/* restore checksum to network byte order;
2070 	   later consumers expect this */
2071 	*csum = htons(*csum);
2072 
2073 	/* save the tag */
2074 	m->m_flags |= M_VLANTAG;
2075 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2076 
2077 	/*
2078 	 * Remove the 802.1q header by copying the Ethernet
2079 	 * addresses over it and adjusting the beginning of
2080 	 * the data in the mbuf.  The encapsulated Ethernet
2081 	 * type field is already in place.
2082 	 */
2083 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2084 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2085 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2086 }
2087 
2088 
2089 static inline void
2090 mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2091 {
2092 	struct ifnet *ifp;
2093 	struct mbuf *m;
2094 	struct ether_header *eh;
2095 	mxge_rx_buf_t *rx;
2096 	bus_dmamap_t old_map;
2097 	int idx;
2098 	uint16_t tcpudp_csum;
2099 
2100 	ifp = sc->ifp;
2101 	rx = &sc->rx_big;
2102 	idx = rx->cnt & rx->mask;
2103 	rx->cnt += rx->nbufs;
2104 	/* save a pointer to the received mbuf */
2105 	m = rx->info[idx].m;
2106 	/* try to replace the received mbuf */
2107 	if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
2108 		/* drop the frame -- the old mbuf is re-cycled */
2109 		ifp->if_ierrors++;
2110 		return;
2111 	}
2112 
2113 	/* unmap the received buffer */
2114 	old_map = rx->info[idx].map;
2115 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2116 	bus_dmamap_unload(rx->dmat, old_map);
2117 
2118 	/* swap the bus_dmamap_t's */
2119 	rx->info[idx].map = rx->extra_map;
2120 	rx->extra_map = old_map;
2121 
2122 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2123 	 * aligned */
2124 	m->m_data += MXGEFW_PAD;
2125 
2126 	m->m_pkthdr.rcvif = ifp;
2127 	m->m_len = m->m_pkthdr.len = len;
2128 	ifp->if_ipackets++;
2129 	eh = mtod(m, struct ether_header *);
2130 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2131 		mxge_vlan_tag_remove(m, &csum);
2132 	}
2133 	/* if the checksum is valid, mark it in the mbuf header */
2134 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2135 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2136 			return;
2137 		/* otherwise, it was a UDP frame, or a TCP frame which
2138 		   we could not do LRO on.  Tell the stack that the
2139 		   checksum is good */
2140 		m->m_pkthdr.csum_data = 0xffff;
2141 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2142 	}
2143 	/* pass the frame up the stack */
2144 	(*ifp->if_input)(ifp, m);
2145 }
2146 
2147 static inline void
2148 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2149 {
2150 	struct ifnet *ifp;
2151 	struct ether_header *eh;
2152 	struct mbuf *m;
2153 	mxge_rx_buf_t *rx;
2154 	bus_dmamap_t old_map;
2155 	int idx;
2156 	uint16_t tcpudp_csum;
2157 
2158 	ifp = sc->ifp;
2159 	rx = &sc->rx_small;
2160 	idx = rx->cnt & rx->mask;
2161 	rx->cnt++;
2162 	/* save a pointer to the received mbuf */
2163 	m = rx->info[idx].m;
2164 	/* try to replace the received mbuf */
2165 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2166 		/* drop the frame -- the old mbuf is re-cycled */
2167 		ifp->if_ierrors++;
2168 		return;
2169 	}
2170 
2171 	/* unmap the received buffer */
2172 	old_map = rx->info[idx].map;
2173 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2174 	bus_dmamap_unload(rx->dmat, old_map);
2175 
2176 	/* swap the bus_dmamap_t's */
2177 	rx->info[idx].map = rx->extra_map;
2178 	rx->extra_map = old_map;
2179 
2180 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2181 	 * aligned */
2182 	m->m_data += MXGEFW_PAD;
2183 
2184 	m->m_pkthdr.rcvif = ifp;
2185 	m->m_len = m->m_pkthdr.len = len;
2186 	ifp->if_ipackets++;
2187 	eh = mtod(m, struct ether_header *);
2188 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2189 		mxge_vlan_tag_remove(m, &csum);
2190 	}
2191 	/* if the checksum is valid, mark it in the mbuf header */
2192 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2193 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2194 			return;
2195 		/* otherwise, it was a UDP frame, or a TCP frame which
2196 		   we could not do LRO on.  Tell the stack that the
2197 		   checksum is good */
2198 		m->m_pkthdr.csum_data = 0xffff;
2199 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2200 	}
2201 
2202 	/* pass the frame up the stack */
2203 	(*ifp->if_input)(ifp, m);
2204 }
2205 
2206 static inline void
2207 mxge_clean_rx_done(mxge_softc_t *sc)
2208 {
2209 	mxge_rx_done_t *rx_done = &sc->rx_done;
2210 	struct lro_entry *lro;
2211 	int limit = 0;
2212 	uint16_t length;
2213 	uint16_t checksum;
2214 
2215 
2216 	while (rx_done->entry[rx_done->idx].length != 0) {
2217 		length = ntohs(rx_done->entry[rx_done->idx].length);
2218 		rx_done->entry[rx_done->idx].length = 0;
2219 		checksum = rx_done->entry[rx_done->idx].checksum;
2220 		if (length <= (MHLEN - MXGEFW_PAD))
2221 			mxge_rx_done_small(sc, length, checksum);
2222 		else
2223 			mxge_rx_done_big(sc, length, checksum);
2224 		rx_done->cnt++;
2225 		rx_done->idx = rx_done->cnt & rx_done->mask;
2226 
2227 		/* limit potential for livelock */
2228 		if (__predict_false(++limit > 2 * rx_done->mask))
2229 			break;
2230 	}
2231 	while(!SLIST_EMPTY(&sc->lro_active)) {
2232 		lro = SLIST_FIRST(&sc->lro_active);
2233 		SLIST_REMOVE_HEAD(&sc->lro_active, next);
2234 		mxge_lro_flush(sc, lro);
2235 	}
2236 }
2237 
2238 
2239 static inline void
2240 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2241 {
2242 	struct ifnet *ifp;
2243 	mxge_tx_buf_t *tx;
2244 	struct mbuf *m;
2245 	bus_dmamap_t map;
2246 	int idx, limit;
2247 
2248 	limit = 0;
2249 	tx = &sc->tx;
2250 	ifp = sc->ifp;
2251 	while (tx->pkt_done != mcp_idx) {
2252 		idx = tx->done & tx->mask;
2253 		tx->done++;
2254 		m = tx->info[idx].m;
2255 		/* mbuf and DMA map only attached to the first
2256 		   segment per-mbuf */
2257 		if (m != NULL) {
2258 			ifp->if_opackets++;
2259 			tx->info[idx].m = NULL;
2260 			map = tx->info[idx].map;
2261 			bus_dmamap_unload(tx->dmat, map);
2262 			m_freem(m);
2263 		}
2264 		if (tx->info[idx].flag) {
2265 			tx->info[idx].flag = 0;
2266 			tx->pkt_done++;
2267 		}
2268 		/* limit potential for livelock by only handling
2269 		   2 full tx rings per call */
2270 		if (__predict_false(++limit >  2 * tx->mask))
2271 			break;
2272 	}
2273 
2274 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2275 	   it's OK to send packets */
2276 
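	/*
	 * Note the hysteresis: the queue is re-enabled only once the
	 * ring has drained below a quarter of its capacity, so
	 * IFF_DRV_OACTIVE does not flap on every completion.
	 */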
2277 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2278 	    tx->req - tx->done < (tx->mask + 1)/4) {
2279 		mtx_lock(&sc->tx_mtx);
2280 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2281 		sc->tx.wake++;
2282 		mxge_start_locked(sc);
2283 		mtx_unlock(&sc->tx_mtx);
2284 	}
2285 }
2286 
2287 static void
2288 mxge_intr(void *arg)
2289 {
2290 	mxge_softc_t *sc = arg;
2291 	mcp_irq_data_t *stats = sc->fw_stats;
2292 	mxge_tx_buf_t *tx = &sc->tx;
2293 	mxge_rx_done_t *rx_done = &sc->rx_done;
2294 	uint32_t send_done_count;
2295 	uint8_t valid;
2296 
2297 
2298 	/* make sure the DMA has finished */
2299 	if (!stats->valid) {
2300 		return;
2301 	}
2302 	valid = stats->valid;
2303 
2304 	if (!sc->msi_enabled) {
2305 		/* lower legacy IRQ  */
2306 		*sc->irq_deassert = 0;
2307 		if (!mxge_deassert_wait)
2308 			/* don't wait for confirmation that the irq is low */
2309 			stats->valid = 0;
2310 	} else {
2311 		stats->valid = 0;
2312 	}
2313 
2314 	/* loop while waiting for legacy irq deassertion */
2315 	do {
2316 		/* check for transmit completes and receives */
2317 		send_done_count = be32toh(stats->send_done_count);
2318 		while ((send_done_count != tx->pkt_done) ||
2319 		       (rx_done->entry[rx_done->idx].length != 0)) {
2320 			mxge_tx_done(sc, (int)send_done_count);
2321 			mxge_clean_rx_done(sc);
2322 			send_done_count = be32toh(stats->send_done_count);
2323 		}
2324 	} while (*((volatile uint8_t *) &stats->valid));
2325 
2326 	if (__predict_false(stats->stats_updated)) {
2327 		if (sc->link_state != stats->link_up) {
2328 			sc->link_state = stats->link_up;
2329 			if (sc->link_state) {
2330 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2331 				if (mxge_verbose)
2332 					device_printf(sc->dev, "link up\n");
2333 			} else {
2334 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2335 				if (mxge_verbose)
2336 					device_printf(sc->dev, "link down\n");
2337 			}
2338 		}
2339 		if (sc->rdma_tags_available !=
2340 		    be32toh(sc->fw_stats->rdma_tags_available)) {
2341 			sc->rdma_tags_available =
2342 				be32toh(sc->fw_stats->rdma_tags_available);
2343 			device_printf(sc->dev, "RDMA timed out! %d tags "
2344 				      "left\n", sc->rdma_tags_available);
2345 		}
2346 		sc->down_cnt += stats->link_down;
2347 	}
2348 
2349 	/* check to see if we have rx token to pass back */
2350 	if (valid & 0x1)
2351 		*sc->irq_claim = be32toh(3);
2352 	*(sc->irq_claim + 1) = be32toh(3);
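	/*
	 * The two claim words hand interrupt credits back to the NIC:
	 * the first is written only when bit 0 of `valid' carried an rx
	 * token, the second is always written to claim the interrupt
	 * itself.
	 */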
2353 }
2354 
2355 static void
2356 mxge_init(void *arg)
2357 {
2358 }
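/*
 * if_init is intentionally a no-op: the interface is opened and closed
 * from the SIOCSIFFLAGS case of mxge_ioctl(), which calls mxge_open()
 * and mxge_close() with the driver mutex held.
 */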
2359 
2360 
2361 
2362 static void
2363 mxge_free_mbufs(mxge_softc_t *sc)
2364 {
2365 	int i;
2366 
2367 	for (i = 0; i <= sc->rx_big.mask; i++) {
2368 		if (sc->rx_big.info[i].m == NULL)
2369 			continue;
2370 		bus_dmamap_unload(sc->rx_big.dmat,
2371 				  sc->rx_big.info[i].map);
2372 		m_freem(sc->rx_big.info[i].m);
2373 		sc->rx_big.info[i].m = NULL;
2374 	}
2375 
2376 	for (i = 0; i <= sc->rx_small.mask; i++) {
2377 		if (sc->rx_small.info[i].m == NULL)
2378 			continue;
2379 		bus_dmamap_unload(sc->rx_small.dmat,
2380 				  sc->rx_small.info[i].map);
2381 		m_freem(sc->rx_small.info[i].m);
2382 		sc->rx_small.info[i].m = NULL;
2383 	}
2384 
2385 	for (i = 0; i <= sc->tx.mask; i++) {
2386 		sc->tx.info[i].flag = 0;
2387 		if (sc->tx.info[i].m == NULL)
2388 			continue;
2389 		bus_dmamap_unload(sc->tx.dmat,
2390 				  sc->tx.info[i].map);
2391 		m_freem(sc->tx.info[i].m);
2392 		sc->tx.info[i].m = NULL;
2393 	}
2394 }
2395 
2396 static void
2397 mxge_free_rings(mxge_softc_t *sc)
2398 {
2399 	int i;
2400 
2401 	if (sc->rx_done.entry != NULL)
2402 		mxge_dma_free(&sc->rx_done.dma);
2403 	sc->rx_done.entry = NULL;
2404 	if (sc->tx.req_bytes != NULL)
2405 		free(sc->tx.req_bytes, M_DEVBUF);
2406 	if (sc->tx.seg_list != NULL)
2407 		free(sc->tx.seg_list, M_DEVBUF);
2408 	if (sc->rx_small.shadow != NULL)
2409 		free(sc->rx_small.shadow, M_DEVBUF);
2410 	if (sc->rx_big.shadow != NULL)
2411 		free(sc->rx_big.shadow, M_DEVBUF);
2412 	if (sc->tx.info != NULL) {
2413 		if (sc->tx.dmat != NULL) {
2414 			for (i = 0; i <= sc->tx.mask; i++) {
2415 				bus_dmamap_destroy(sc->tx.dmat,
2416 						   sc->tx.info[i].map);
2417 			}
2418 			bus_dma_tag_destroy(sc->tx.dmat);
2419 		}
2420 		free(sc->tx.info, M_DEVBUF);
2421 	}
2422 	if (sc->rx_small.info != NULL) {
2423 		if (sc->rx_small.dmat != NULL) {
2424 			for (i = 0; i <= sc->rx_small.mask; i++) {
2425 				bus_dmamap_destroy(sc->rx_small.dmat,
2426 						   sc->rx_small.info[i].map);
2427 			}
2428 			bus_dmamap_destroy(sc->rx_small.dmat,
2429 					   sc->rx_small.extra_map);
2430 			bus_dma_tag_destroy(sc->rx_small.dmat);
2431 		}
2432 		free(sc->rx_small.info, M_DEVBUF);
2433 	}
2434 	if (sc->rx_big.info != NULL) {
2435 		if (sc->rx_big.dmat != NULL) {
2436 			for (i = 0; i <= sc->rx_big.mask; i++) {
2437 				bus_dmamap_destroy(sc->rx_big.dmat,
2438 						   sc->rx_big.info[i].map);
2439 			}
2440 			bus_dmamap_destroy(sc->rx_big.dmat,
2441 					   sc->rx_big.extra_map);
2442 			bus_dma_tag_destroy(sc->rx_big.dmat);
2443 		}
2444 		free(sc->rx_big.info, M_DEVBUF);
2445 	}
2446 }
2447 
2448 static int
2449 mxge_alloc_rings(mxge_softc_t *sc)
2450 {
2451 	mxge_cmd_t cmd;
2452 	int tx_ring_size, rx_ring_size;
2453 	int tx_ring_entries, rx_ring_entries;
2454 	int i, err;
2455 	unsigned long bytes;
2456 
2457 	/* get ring sizes */
2458 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2459 	tx_ring_size = cmd.data0;
2460 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2461 	if (err != 0) {
2462 		device_printf(sc->dev, "Cannot determine ring sizes\n");
2463 		goto abort_with_nothing;
2464 	}
2465 
2466 	rx_ring_size = cmd.data0;
2467 
2468 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2469 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
2470 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2471 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2472 	IFQ_SET_READY(&sc->ifp->if_snd);
2473 
2474 	sc->tx.mask = tx_ring_entries - 1;
2475 	sc->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2476 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
2477 	sc->rx_done.mask = (2 * rx_ring_entries) - 1;
2478 
2479 	err = ENOMEM;
2480 
2481 	/* allocate interrupt queues */
2482 	bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
2483 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2484 	if (err != 0)
2485 		goto abort_with_nothing;
2486 	sc->rx_done.entry = sc->rx_done.dma.addr;
2487 	bzero(sc->rx_done.entry, bytes);
2488 
2489 	/* allocate the tx request copy block */
2490 	bytes = 8 +
2491 		sizeof (*sc->tx.req_list) * (sc->tx.max_desc + 4);
2492 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2493 	if (sc->tx.req_bytes == NULL)
2494 		goto abort_with_alloc;
2495 	/* ensure req_list entries are aligned to 8 bytes */
2496 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2497 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
2498 
2499 	/* allocate the tx busdma segment list */
2500 	bytes = sizeof (*sc->tx.seg_list) * sc->tx.max_desc;
2501 	sc->tx.seg_list = (bus_dma_segment_t *)
2502 		malloc(bytes, M_DEVBUF, M_WAITOK);
2503 	if (sc->tx.seg_list == NULL)
2504 		goto abort_with_alloc;
2505 
2506 	/* allocate the rx shadow rings */
2507 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2508 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2509 	if (sc->rx_small.shadow == NULL)
2510 		goto abort_with_alloc;
2511 
2512 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2513 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2514 	if (sc->rx_big.shadow == NULL)
2515 		goto abort_with_alloc;
2516 
2517 	/* allocate the host info rings */
2518 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2519 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2520 	if (sc->tx.info == NULL)
2521 		goto abort_with_alloc;
2522 
2523 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2524 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2525 	if (sc->rx_small.info == NULL)
2526 		goto abort_with_alloc;
2527 
2528 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2529 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2530 	if (sc->rx_big.info == NULL)
2531 		goto abort_with_alloc;
2532 
2533 	/* allocate the busdma resources */
2534 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2535 				 1,			/* alignment */
2536 				 sc->tx.boundary,	/* boundary */
2537 				 BUS_SPACE_MAXADDR,	/* low */
2538 				 BUS_SPACE_MAXADDR,	/* high */
2539 				 NULL, NULL,		/* filter */
2540 				 65536 + 256,		/* maxsize */
2541 				 sc->tx.max_desc - 2,	/* num segs */
2542 				 sc->tx.boundary,	/* maxsegsize */
2543 				 BUS_DMA_ALLOCNOW,	/* flags */
2544 				 NULL, NULL,		/* lock */
2545 				 &sc->tx.dmat);		/* tag */
2546 
2547 	if (err != 0) {
2548 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2549 			      err);
2550 		goto abort_with_alloc;
2551 	}
2552 
2553 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2554 				 1,			/* alignment */
2555 				 4096,			/* boundary */
2556 				 BUS_SPACE_MAXADDR,	/* low */
2557 				 BUS_SPACE_MAXADDR,	/* high */
2558 				 NULL, NULL,		/* filter */
2559 				 MHLEN,			/* maxsize */
2560 				 1,			/* num segs */
2561 				 MHLEN,			/* maxsegsize */
2562 				 BUS_DMA_ALLOCNOW,	/* flags */
2563 				 NULL, NULL,		/* lock */
2564 				 &sc->rx_small.dmat);	/* tag */
2565 	if (err != 0) {
2566 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2567 			      err);
2568 		goto abort_with_alloc;
2569 	}
2570 
2571 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2572 				 1,			/* alignment */
2573 				 4096,			/* boundary */
2574 				 BUS_SPACE_MAXADDR,	/* low */
2575 				 BUS_SPACE_MAXADDR,	/* high */
2576 				 NULL, NULL,		/* filter */
2577 				 3*4096,		/* maxsize */
2578 				 3,			/* num segs */
2579 				 4096,			/* maxsegsize */
2580 				 BUS_DMA_ALLOCNOW,	/* flags */
2581 				 NULL, NULL,		/* lock */
2582 				 &sc->rx_big.dmat);	/* tag */
2583 	if (err != 0) {
2584 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2585 			      err);
2586 		goto abort_with_alloc;
2587 	}
2588 
2589 	/* now use these tags to set up dmamaps for each slot
2590 	   in each ring */
2591 	for (i = 0; i <= sc->tx.mask; i++) {
2592 		err = bus_dmamap_create(sc->tx.dmat, 0,
2593 					&sc->tx.info[i].map);
2594 		if (err != 0) {
2595 			device_printf(sc->dev, "Err %d  tx dmamap\n",
2596 			      err);
2597 			goto abort_with_alloc;
2598 		}
2599 	}
2600 	for (i = 0; i <= sc->rx_small.mask; i++) {
2601 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2602 					&sc->rx_small.info[i].map);
2603 		if (err != 0) {
2604 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2605 				      err);
2606 			goto abort_with_alloc;
2607 		}
2608 	}
2609 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2610 				&sc->rx_small.extra_map);
2611 	if (err != 0) {
2612 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2613 			      err);
2614 		goto abort_with_alloc;
2615 	}
2616 
2617 	for (i = 0; i <= sc->rx_big.mask; i++) {
2618 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2619 					&sc->rx_big.info[i].map);
2620 		if (err != 0) {
2621 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2622 			      err);
2623 			goto abort_with_alloc;
2624 		}
2625 	}
2626 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2627 				&sc->rx_big.extra_map);
2628 	if (err != 0) {
2629 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2630 			      err);
2631 		goto abort_with_alloc;
2632 	}
2633 	return 0;
2634 
2635 abort_with_alloc:
2636 	mxge_free_rings(sc);
2637 
2638 abort_with_nothing:
2639 	return err;
2640 }
2641 
2642 static void
2643 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
2644 {
2645 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
2646 
2647 	if (bufsize < MCLBYTES) {
2648 		/* easy, everything fits in a single buffer */
2649 		*big_buf_size = MCLBYTES;
2650 		*cl_size = MCLBYTES;
2651 		*nbufs = 1;
2652 		return;
2653 	}
2654 
2655 	if (bufsize < MJUMPAGESIZE) {
2656 		/* still easy, everything still fits in a single buffer */
2657 		*big_buf_size = MJUMPAGESIZE;
2658 		*cl_size = MJUMPAGESIZE;
2659 		*nbufs = 1;
2660 		return;
2661 	}
2662 	/* now we need to use virtually contiguous buffers */
2663 	*cl_size = MJUM9BYTES;
2664 	*big_buf_size = 4096;
2665 	*nbufs = mtu / 4096 + 1;
2666 	/* needs to be a power of two, so round up */
2667 	if (*nbufs == 3)
2668 		*nbufs = 4;
2669 }
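/*
 * Worked examples (assuming a 4KB PAGE_SIZE, so MJUMPAGESIZE == 4096):
 *   mtu 1500 -> bufsize 1520 < MCLBYTES:  one 2KB cluster, nbufs = 1
 *   mtu 4000 -> bufsize 4020 < MJUMPAGESIZE: one page-sized cluster
 *   mtu 9000 -> bufsize 9020: one 9KB cluster (MJUM9BYTES) advertised
 *     to the NIC as 4096-byte buffers; 9000/4096 + 1 = 3, rounded up
 *     to 4 so the ring stride stays a power of two.
 */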
2670 
2671 static int
2672 mxge_open(mxge_softc_t *sc)
2673 {
2674 	mxge_cmd_t cmd;
2675 	int i, err, big_bytes;
2676 	bus_dmamap_t map;
2677 	bus_addr_t bus;
2678 	struct lro_entry *lro_entry;
2679 
2680 	SLIST_INIT(&sc->lro_free);
2681 	SLIST_INIT(&sc->lro_active);
2682 
2683 	for (i = 0; i < sc->lro_cnt; i++) {
2684 		lro_entry = (struct lro_entry *)
2685 			malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
2686 		if (lro_entry == NULL) {
2687 			sc->lro_cnt = i;
2688 			break;
2689 		}
2690 		SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
2691 	}
2692 
2693 	/* Copy the MAC address in case it was overridden */
2694 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2695 
2696 	err = mxge_reset(sc, 1);
2697 	if (err != 0) {
2698 		device_printf(sc->dev, "failed to reset\n");
2699 		return EIO;
2700 	}
2701 
2702 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
2703 			   &sc->rx_big.cl_size, &sc->rx_big.nbufs);
2704 
2705 	cmd.data0 = sc->rx_big.nbufs;
2706 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
2707 			    &cmd);
2708 	/* error is only meaningful if we're trying to set
2709 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
2710 	if (err && sc->rx_big.nbufs > 1) {
2711 		device_printf(sc->dev,
2712 			      "Failed to set always-use-n to %d\n",
2713 			      sc->rx_big.nbufs);
2714 		return EIO;
2715 	}
2716 	/* get the lanai pointers to the send and receive rings */
2717 
2718 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2719 	sc->tx.lanai =
2720 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2721 	err |= mxge_send_cmd(sc,
2722 				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2723 	sc->rx_small.lanai =
2724 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2725 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2726 	sc->rx_big.lanai =
2727 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2728 
2729 	if (err != 0) {
2730 		device_printf(sc->dev,
2731 			      "failed to get ring sizes or locations\n");
2732 		return EIO;
2733 	}
2734 
2735 	/* stock receive rings */
2736 	for (i = 0; i <= sc->rx_small.mask; i++) {
2737 		map = sc->rx_small.info[i].map;
2738 		err = mxge_get_buf_small(sc, map, i);
2739 		if (err) {
2740 			device_printf(sc->dev, "alloced %d/%d smalls\n",
2741 				      i, sc->rx_small.mask + 1);
2742 			goto abort;
2743 		}
2744 	}
2745 	for (i = 0; i <= sc->rx_big.mask; i++) {
2746 		sc->rx_big.shadow[i].addr_low = 0xffffffff;
2747 		sc->rx_big.shadow[i].addr_high = 0xffffffff;
2748 	}
2749 	for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
2750 		map = sc->rx_big.info[i].map;
2751 		err = mxge_get_buf_big(sc, map, i);
2752 		if (err) {
2753 			device_printf(sc->dev, "alloced %d/%d bigs\n",
2754 				      i, sc->rx_big.mask + 1);
2755 			goto abort;
2756 		}
2757 	}
2758 
2759 	/* Give the firmware the mtu and the big and small buffer
2760 	   sizes.  The firmware wants the big buf size to be a power
2761 	   of two. Luckily, FreeBSD's clusters are powers of two */
2762 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
2763 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2764 	cmd.data0 = MHLEN - MXGEFW_PAD;
2765 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2766 			     &cmd);
2767 	cmd.data0 = big_bytes;
2768 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2769 
2770 	if (err != 0) {
2771 		device_printf(sc->dev, "failed to setup params\n");
2772 		goto abort;
2773 	}
2774 
2775 	/* Now give him the pointer to the stats block */
2776 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2777 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2778 	cmd.data2 = sizeof(struct mcp_irq_data);
2779 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
2780 
2781 	if (err != 0) {
2782 		bus = sc->fw_stats_dma.bus_addr;
2783 		bus += offsetof(struct mcp_irq_data, send_done_count);
2784 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
2785 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
2786 		err = mxge_send_cmd(sc,
2787 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
2788 				    &cmd);
2789 		/* Firmware cannot support multicast without STATS_DMA_V2 */
2790 		sc->fw_multicast_support = 0;
2791 	} else {
2792 		sc->fw_multicast_support = 1;
2793 	}
2794 
2795 	if (err != 0) {
2796 		device_printf(sc->dev, "failed to set up the stats DMA block\n");
2797 		goto abort;
2798 	}
2799 
2800 	/* Finally, start the firmware running */
2801 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2802 	if (err) {
2803 		device_printf(sc->dev, "Couldn't bring up link\n");
2804 		goto abort;
2805 	}
2806 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2807 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2808 
2809 	return 0;
2810 
2811 
2812 abort:
2813 	mxge_free_mbufs(sc);
2814 
2815 	return err;
2816 }
2817 
2818 static int
2819 mxge_close(mxge_softc_t *sc)
2820 {
2821 	struct lro_entry *lro_entry;
2822 	mxge_cmd_t cmd;
2823 	int err, old_down_cnt;
2824 
2825 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2826 	old_down_cnt = sc->down_cnt;
2827 	mb();
2828 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2829 	if (err) {
2830 		device_printf(sc->dev, "Couldn't bring down link\n");
2831 	}
2832 	if (old_down_cnt == sc->down_cnt) {
2833 		/* wait for down irq */
2834 		DELAY(10 * sc->intr_coal_delay);
2835 	}
2836 	if (old_down_cnt == sc->down_cnt) {
2837 		device_printf(sc->dev, "never got down irq\n");
2838 	}
2839 
2840 	mxge_free_mbufs(sc);
2841 
2842 	while (!SLIST_EMPTY(&sc->lro_free)) {
2843 		lro_entry = SLIST_FIRST(&sc->lro_free);
2844 		SLIST_REMOVE_HEAD(&sc->lro_free, next);
		free(lro_entry, M_DEVBUF);	/* entries were malloc'd in mxge_open() */
2845 	}
2846 	return 0;
2847 }
2848 
2849 static void
2850 mxge_setup_cfg_space(mxge_softc_t *sc)
2851 {
2852 	device_t dev = sc->dev;
2853 	int reg;
2854 	uint16_t cmd, lnk, pectl;
2855 
2856 	/* find the PCIe link width and set max read request to 4KB */
2857 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
2858 		lnk = pci_read_config(dev, reg + 0x12, 2);
2859 		sc->link_width = (lnk >> 4) & 0x3f;
2860 
2861 		pectl = pci_read_config(dev, reg + 0x8, 2);
2862 		pectl = (pectl & ~0x7000) | (5 << 12);
2863 		pci_write_config(dev, reg + 0x8, pectl, 2);
2864 	}
2865 
2866 	/* Enable DMA and Memory space access */
2867 	pci_enable_busmaster(dev);
2868 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2869 	cmd |= PCIM_CMD_MEMEN;
2870 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2871 }
2872 
2873 static uint32_t
2874 mxge_read_reboot(mxge_softc_t *sc)
2875 {
2876 	device_t dev = sc->dev;
2877 	uint32_t vs;
2878 
2879 	/* find the vendor specific offset */
2880 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2881 		device_printf(sc->dev,
2882 			      "could not find vendor specific offset\n");
2883 		return (uint32_t)-1;
2884 	}
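	/*
	 * The accesses below go through Myricom's vendor-specific
	 * capability: vs + 0x10 selects the access mode (0x3 = 32-bit
	 * reads), vs + 0x18 takes the target address, and a config read
	 * of vs + 0x14 returns the data.  0xfffffff0 is where the NIC
	 * keeps its reboot status, reachable this way even when the
	 * memory BAR is unusable after a reboot.
	 */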
2885 	/* enable read32 mode */
2886 	pci_write_config(dev, vs + 0x10, 0x3, 1);
2887 	/* tell NIC which register to read */
2888 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
2889 	return (pci_read_config(dev, vs + 0x14, 4));
2890 }
2891 
2892 static void
2893 mxge_watchdog_reset(mxge_softc_t *sc)
2894 {
2895 	int err;
2896 	uint32_t reboot;
2897 	uint16_t cmd;
2898 
2899 	err = ENXIO;
2900 
2901 	device_printf(sc->dev, "Watchdog reset!\n");
2902 
2903 	/*
2904 	 * check to see if the NIC rebooted.  If it did, then all of
2905 	 * PCI config space has been reset, and things like the
2906 	 * busmaster bit will be zero.  If this is the case, then we
2907 	 * must restore PCI config space before the NIC can be used
2908 	 * again
2909 	 */
2910 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2911 	if (cmd == 0xffff) {
2912 		/*
2913 		 * maybe the watchdog caught the NIC rebooting; wait
2914 		 * up to 100ms for it to finish.  If it does not come
2915 		 * back, then give up
2916 		 */
2917 		DELAY(1000*100);
2918 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2919 		if (cmd == 0xffff) {
2920 			device_printf(sc->dev, "NIC disappeared!\n");
2921 			goto abort;
2922 		}
2923 	}
2924 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
2925 		/* print the reboot status */
2926 		reboot = mxge_read_reboot(sc);
2927 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
2928 			      reboot);
2929 		/* restore PCI configuration space */
2930 
2931 		/* XXXX waiting for pci_cfg_restore() to be exported */
2932 		goto abort; /* just abort for now */
2933 
2934 		/* and redo any changes we made to our config space */
2935 		mxge_setup_cfg_space(sc);
2936 	} else {
2937 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
2938 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
2939 			      sc->tx.req, sc->tx.done);
2940 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
2941 			      sc->tx.pkt_done,
2942 			      be32toh(sc->fw_stats->send_done_count));
2943 	}
2944 
2945 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
2946 		mxge_close(sc);
2947 		err = mxge_open(sc);
2948 	}
2949 
2950 abort:
2951 	/*
2952 	 * stop the watchdog if the nic is dead, to avoid spamming the
2953 	 * console
2954 	 */
2955 	if (err != 0) {
2956 		callout_stop(&sc->co_hdl);
2957 	}
2958 }
2959 
2960 static void
2961 mxge_watchdog(mxge_softc_t *sc)
2962 {
2963 	mxge_tx_buf_t *tx = &sc->tx;
2964 
2965 	/* see if we have outstanding transmits that have
2966 	   been pending for more than one mxge_ticks interval */
2967 	if (tx->req != tx->done &&
2968 	    tx->watchdog_req != tx->watchdog_done &&
2969 	    tx->done == tx->watchdog_done)
2970 		mxge_watchdog_reset(sc);
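	/*
	 * All three conditions must hold: transmits are outstanding
	 * now, transmits were already outstanding at the previous tick,
	 * and nothing has completed since that tick.  A busy ring that
	 * is still making progress never trips the reset.
	 */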
2971 
2972 	tx->watchdog_req = tx->req;
2973 	tx->watchdog_done = tx->done;
2974 }
2975 
2976 static void
2977 mxge_tick(void *arg)
2978 {
2979 	mxge_softc_t *sc = arg;
2980 
2981 
2982 	/* Synchronize with possible callout reset/stop. */
2983 	if (callout_pending(&sc->co_hdl) ||
2984 	    !callout_active(&sc->co_hdl)) {
2985 		mtx_unlock(&sc->driver_mtx);
2986 		return;
2987 	}
2988 
2989 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2990 	mxge_watchdog(sc);
2991 }
2992 
2993 static int
2994 mxge_media_change(struct ifnet *ifp)
2995 {
2996 	return EINVAL;
2997 }
2998 
2999 static int
3000 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3001 {
3002 	struct ifnet *ifp = sc->ifp;
3003 	int real_mtu, old_mtu;
3004 	int err = 0;
3005 
3006 
3007 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3008 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3009 		return EINVAL;
3010 	mtx_lock(&sc->driver_mtx);
3011 	old_mtu = ifp->if_mtu;
3012 	ifp->if_mtu = mtu;
3013 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3014 		callout_stop(&sc->co_hdl);
3015 		mxge_close(sc);
3016 		err = mxge_open(sc);
3017 		if (err != 0) {
3018 			ifp->if_mtu = old_mtu;
3019 			mxge_close(sc);
3020 			(void) mxge_open(sc);
3021 		}
3022 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3023 	}
3024 	mtx_unlock(&sc->driver_mtx);
3025 	return err;
3026 }
3027 
3028 static void
3029 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3030 {
3031 	mxge_softc_t *sc = ifp->if_softc;
3032 
3033 
3034 	if (sc == NULL)
3035 		return;
3036 	ifmr->ifm_status = IFM_AVALID;
3037 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
3038 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3039 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
3040 }
3041 
3042 static int
3043 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3044 {
3045 	mxge_softc_t *sc = ifp->if_softc;
3046 	struct ifreq *ifr = (struct ifreq *)data;
3047 	int err, mask;
3048 
3049 	err = 0;
3050 	switch (command) {
3051 	case SIOCSIFADDR:
3052 	case SIOCGIFADDR:
3053 		err = ether_ioctl(ifp, command, data);
3054 		break;
3055 
3056 	case SIOCSIFMTU:
3057 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3058 		break;
3059 
3060 	case SIOCSIFFLAGS:
3061 		mtx_lock(&sc->driver_mtx);
3062 		if (ifp->if_flags & IFF_UP) {
3063 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3064 				err = mxge_open(sc);
3065 				callout_reset(&sc->co_hdl, mxge_ticks,
3066 					      mxge_tick, sc);
3067 			} else {
3068 				/* take care of promisc and allmulti
3069 				   flag changes */
3070 				mxge_change_promisc(sc,
3071 						    ifp->if_flags & IFF_PROMISC);
3072 				mxge_set_multicast_list(sc);
3073 			}
3074 		} else {
3075 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3076 				mxge_close(sc);
3077 				callout_stop(&sc->co_hdl);
3078 			}
3079 		}
3080 		mtx_unlock(&sc->driver_mtx);
3081 		break;
3082 
3083 	case SIOCADDMULTI:
3084 	case SIOCDELMULTI:
3085 		mtx_lock(&sc->driver_mtx);
3086 		mxge_set_multicast_list(sc);
3087 		mtx_unlock(&sc->driver_mtx);
3088 		break;
3089 
3090 	case SIOCSIFCAP:
3091 		mtx_lock(&sc->driver_mtx);
3092 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3093 		if (mask & IFCAP_TXCSUM) {
3094 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3095 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3096 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3097 						      | CSUM_TSO);
3098 			} else {
3099 				ifp->if_capenable |= IFCAP_TXCSUM;
3100 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3101 			}
3102 		} else if (mask & IFCAP_RXCSUM) {
3103 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3104 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3105 				sc->csum_flag = 0;
3106 			} else {
3107 				ifp->if_capenable |= IFCAP_RXCSUM;
3108 				sc->csum_flag = 1;
3109 			}
3110 		}
3111 		if (mask & IFCAP_TSO4) {
3112 			if (IFCAP_TSO4 & ifp->if_capenable) {
3113 				ifp->if_capenable &= ~IFCAP_TSO4;
3114 				ifp->if_hwassist &= ~CSUM_TSO;
3115 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3116 				ifp->if_capenable |= IFCAP_TSO4;
3117 				ifp->if_hwassist |= CSUM_TSO;
3118 			} else {
3119 				printf("mxge requires tx checksum offload"
3120 				       " be enabled to use TSO\n");
3121 				err = EINVAL;
3122 			}
3123 		}
3124 
3125 		if (mask & IFCAP_VLAN_HWTAGGING)
3126 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3127 		mtx_unlock(&sc->driver_mtx);
3128 		VLAN_CAPABILITIES(ifp);
3129 
3130 		break;
3131 
3132 	case SIOCGIFMEDIA:
3133 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3134 				    &sc->media, command);
3135 		break;
3136 
3137 	default:
3138 		err = ENOTTY;
3139 	}
3140 	return err;
3141 }
3142 
3143 static void
3144 mxge_fetch_tunables(mxge_softc_t *sc)
3145 {
3146 
3147 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3148 			  &mxge_flow_control);
3149 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3150 			  &mxge_intr_coal_delay);
3151 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3152 			  &mxge_nvidia_ecrc_enable);
3153 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3154 			  &mxge_force_firmware);
3155 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3156 			  &mxge_deassert_wait);
3157 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3158 			  &mxge_verbose);
3159 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3160 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3161 
3162 	if (bootverbose)
3163 		mxge_verbose = 1;
3164 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3165 		mxge_intr_coal_delay = 30;
3166 	if (mxge_ticks == 0)
3167 		mxge_ticks = hz;
3168 	sc->pause = mxge_flow_control;
3169 
3170 }
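/*
 * These are standard FreeBSD tunables and can be set at boot from
 * /boot/loader.conf, for example:
 *
 *   hw.mxge.intr_coal_delay="30"
 *   hw.mxge.flow_control_enabled="1"
 *   hw.mxge.lro_cnt="8"
 */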
3171 
3172 static int
3173 mxge_attach(device_t dev)
3174 {
3175 	mxge_softc_t *sc = device_get_softc(dev);
3176 	struct ifnet *ifp;
3177 	int count, rid, err;
3178 
3179 	sc->dev = dev;
3180 	mxge_fetch_tunables(sc);
3181 
3182 	err = bus_dma_tag_create(NULL,			/* parent */
3183 				 1,			/* alignment */
3184 				 4096,			/* boundary */
3185 				 BUS_SPACE_MAXADDR,	/* low */
3186 				 BUS_SPACE_MAXADDR,	/* high */
3187 				 NULL, NULL,		/* filter */
3188 				 65536 + 256,		/* maxsize */
3189 				 MXGE_MAX_SEND_DESC, 	/* num segs */
3190 				 4096,			/* maxsegsize */
3191 				 0,			/* flags */
3192 				 NULL, NULL,		/* lock */
3193 				 &sc->parent_dmat);	/* tag */
3194 
3195 	if (err != 0) {
3196 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
3197 			      err);
3198 		goto abort_with_nothing;
3199 	}
3200 
3201 	ifp = sc->ifp = if_alloc(IFT_ETHER);
3202 	if (ifp == NULL) {
3203 		device_printf(dev, "can not if_alloc()\n");
3204 		err = ENOSPC;
3205 		goto abort_with_parent_dmat;
3206 	}
3207 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
3208 		 device_get_nameunit(dev));
3209 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
3210 	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
3211 		 device_get_nameunit(dev));
3212 	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
3213 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
3214 		 "%s:drv", device_get_nameunit(dev));
3215 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
3216 		 MTX_NETWORK_LOCK, MTX_DEF);
3217 
3218 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
3219 
3220 	mxge_setup_cfg_space(sc);
3221 
3222 	/* Map the board into the kernel */
3223 	rid = PCIR_BARS;
3224 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
3225 					 ~0, 1, RF_ACTIVE);
3226 	if (sc->mem_res == NULL) {
3227 		device_printf(dev, "could not map memory\n");
3228 		err = ENXIO;
3229 		goto abort_with_lock;
3230 	}
3231 	sc->sram = rman_get_virtual(sc->mem_res);
3232 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
3233 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
3234 		device_printf(dev, "impossible memory region size %ld\n",
3235 			      rman_get_size(sc->mem_res));
3236 		err = ENXIO;
3237 		goto abort_with_mem_res;
3238 	}
3239 
3240 	/* make a NULL-terminated copy of the EEPROM strings section of
3241 	   the lanai SRAM */
3242 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
3243 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
3244 				rman_get_bushandle(sc->mem_res),
3245 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
3246 				sc->eeprom_strings,
3247 				MXGE_EEPROM_STRINGS_SIZE - 2);
3248 	err = mxge_parse_strings(sc);
3249 	if (err != 0)
3250 		goto abort_with_mem_res;
3251 
3252 	/* Enable write combining for efficient use of PCIe bus */
3253 	mxge_enable_wc(sc);
3254 
3255 	/* Allocate the out of band dma memory */
3256 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
3257 			     sizeof (mxge_cmd_t), 64);
3258 	if (err != 0)
3259 		goto abort_with_mem_res;
3260 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
3261 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
3262 	if (err != 0)
3263 		goto abort_with_cmd_dma;
3264 
3265 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
3266 			     sizeof (*sc->fw_stats), 64);
3267 	if (err != 0)
3268 		goto abort_with_zeropad_dma;
3269 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
3270 
3271 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
3272 	if (err != 0)
3273 		goto abort_with_fw_stats;
3274 
3275 	/* Add our ithread  */
3276 	count = pci_msi_count(dev);
3277 	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
3278 		rid = 1;
3279 		sc->msi_enabled = 1;
3280 	} else {
3281 		rid = 0;
3282 	}
3283 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3284 					 1, RF_SHAREABLE | RF_ACTIVE);
3285 	if (sc->irq_res == NULL) {
3286 		device_printf(dev, "could not alloc interrupt\n");
3287 		goto abort_with_dmabench;
3288 	}
3289 	if (mxge_verbose)
3290 		device_printf(dev, "using %s irq %ld\n",
3291 			      sc->msi_enabled ? "MSI" : "INTx",
3292 			      rman_get_start(sc->irq_res));
3293 	/* select & load the firmware */
3294 	err = mxge_select_firmware(sc);
3295 	if (err != 0)
3296 		goto abort_with_irq_res;
3297 	sc->intr_coal_delay = mxge_intr_coal_delay;
3298 	err = mxge_reset(sc, 0);
3299 	if (err != 0)
3300 		goto abort_with_irq_res;
3301 
3302 	err = mxge_alloc_rings(sc);
3303 	if (err != 0) {
3304 		device_printf(sc->dev, "failed to allocate rings\n");
3305 		goto abort_with_irq_res;
3306 	}
3307 
3308 	err = bus_setup_intr(sc->dev, sc->irq_res,
3309 			     INTR_TYPE_NET | INTR_MPSAFE,
3310 			     NULL, mxge_intr, sc, &sc->ih);
3311 	if (err != 0) {
3312 		goto abort_with_rings;
3313 	}
3314 	/* hook into the network stack */
3315 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3316 	ifp->if_baudrate = 100000000;
3317 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3318 		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
3319 
3320 	sc->max_mtu = mxge_max_mtu(sc);
3321 	if (sc->max_mtu >= 9000)
3322 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
3323 	else
3324 		device_printf(dev, "MTU limited to %d.  Install "
3325 			      "latest firmware for 9000 byte jumbo support\n",
3326 			      sc->max_mtu - ETHER_HDR_LEN);
3327 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3328 	ifp->if_capenable = ifp->if_capabilities;
3329 	sc->csum_flag = 1;
3330 	ifp->if_init = mxge_init;
3331 	ifp->if_softc = sc;
3332 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3333 	ifp->if_ioctl = mxge_ioctl;
3334 	ifp->if_start = mxge_start;
3335 	ether_ifattach(ifp, sc->mac_addr);
3336 	/* ether_ifattach sets mtu to 1500 */
3337 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
3338 		ifp->if_mtu = 9000;
3339 
3340 	/* Initialise the ifmedia structure */
3341 	ifmedia_init(&sc->media, 0, mxge_media_change,
3342 		     mxge_media_status);
3343 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3344 	mxge_add_sysctls(sc);
3345 	return 0;
3346 
3347 abort_with_rings:
3348 	mxge_free_rings(sc);
3349 abort_with_irq_res:
3350 	bus_release_resource(dev, SYS_RES_IRQ,
3351 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3352 	if (sc->msi_enabled)
3353 		pci_release_msi(dev);
3354 abort_with_dmabench:
3355 	mxge_dma_free(&sc->dmabench_dma);
3356 abort_with_fw_stats:
3357 	mxge_dma_free(&sc->fw_stats_dma);
3358 abort_with_zeropad_dma:
3359 	mxge_dma_free(&sc->zeropad_dma);
3360 abort_with_cmd_dma:
3361 	mxge_dma_free(&sc->cmd_dma);
3362 abort_with_mem_res:
3363 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3364 abort_with_lock:
3365 	pci_disable_busmaster(dev);
3366 	mtx_destroy(&sc->cmd_mtx);
3367 	mtx_destroy(&sc->tx_mtx);
3368 	mtx_destroy(&sc->driver_mtx);
3369 	if_free(ifp);
3370 abort_with_parent_dmat:
3371 	bus_dma_tag_destroy(sc->parent_dmat);
3372 
3373 abort_with_nothing:
3374 	return err;
3375 }
3376 
3377 static int
3378 mxge_detach(device_t dev)
3379 {
3380 	mxge_softc_t *sc = device_get_softc(dev);
3381 
3382 	if (sc->ifp->if_vlantrunk != NULL) {
3383 		device_printf(sc->dev,
3384 			      "Detach vlans before removing module\n");
3385 		return EBUSY;
3386 	}
3387 	mtx_lock(&sc->driver_mtx);
3388 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3389 		mxge_close(sc);
3390 	callout_stop(&sc->co_hdl);
3391 	mtx_unlock(&sc->driver_mtx);
3392 	ether_ifdetach(sc->ifp);
3393 	ifmedia_removeall(&sc->media);
3394 	mxge_dummy_rdma(sc, 0);
3395 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3396 	mxge_free_rings(sc);
3397 	bus_release_resource(dev, SYS_RES_IRQ,
3398 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3399 	if (sc->msi_enabled)
3400 		pci_release_msi(dev);
3401 
3402 	sc->rx_done.entry = NULL;
3403 	mxge_dma_free(&sc->rx_done.dma);
3404 	mxge_dma_free(&sc->fw_stats_dma);
3405 	mxge_dma_free(&sc->dmabench_dma);
3406 	mxge_dma_free(&sc->zeropad_dma);
3407 	mxge_dma_free(&sc->cmd_dma);
3408 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3409 	pci_disable_busmaster(dev);
3410 	mtx_destroy(&sc->cmd_mtx);
3411 	mtx_destroy(&sc->tx_mtx);
3412 	mtx_destroy(&sc->driver_mtx);
3413 	if_free(sc->ifp);
3414 	bus_dma_tag_destroy(sc->parent_dmat);
3415 	return 0;
3416 }
3417 
3418 static int
3419 mxge_shutdown(device_t dev)
3420 {
3421 	return 0;
3422 }
3423 
3424 /*
3425   This file uses Myri10GE driver indentation.
3426 
3427   Local Variables:
3428   c-file-style:"linux"
3429   tab-width:8
3430   End:
3431 */
3432