xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 2665faf49713872c4bd3a175f85e6d5254e28259)
1 /******************************************************************************
2 
3 Copyright (c) 2006, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Myricom Inc, nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/linker.h>
40 #include <sys/firmware.h>
41 #include <sys/endian.h>
42 #include <sys/sockio.h>
43 #include <sys/mbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/kdb.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/module.h>
49 #include <sys/memrange.h>
50 #include <sys/socket.h>
51 #include <sys/sysctl.h>
52 #include <sys/sx.h>
53 
54 #include <net/if.h>
55 #include <net/if_arp.h>
56 #include <net/ethernet.h>
57 #include <net/if_dl.h>
58 #include <net/if_media.h>
59 
60 #include <net/bpf.h>
61 
62 #include <net/if_types.h>
63 #include <net/if_vlan_var.h>
64 #include <net/zlib.h>
65 
66 #include <netinet/in_systm.h>
67 #include <netinet/in.h>
68 #include <netinet/ip.h>
69 #include <netinet/tcp.h>
70 
71 #include <machine/bus.h>
72 #include <machine/in_cksum.h>
73 #include <machine/resource.h>
74 #include <sys/bus.h>
75 #include <sys/rman.h>
76 
77 #include <dev/pci/pcireg.h>
78 #include <dev/pci/pcivar.h>
79 
80 #include <vm/vm.h>		/* for pmap_mapdev() */
81 #include <vm/pmap.h>
82 
83 #include <dev/mxge/mxge_mcp.h>
84 #include <dev/mxge/mcp_gen_header.h>
85 #include <dev/mxge/if_mxge_var.h>
86 
87 /* tunable params */
88 static int mxge_nvidia_ecrc_enable = 1;
89 static int mxge_force_firmware = 0;
90 static int mxge_intr_coal_delay = 30;
91 static int mxge_deassert_wait = 1;
92 static int mxge_flow_control = 1;
93 static int mxge_verbose = 0;
94 static int mxge_ticks;
95 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
96 static char *mxge_fw_aligned = "mxge_eth_z8e";
97 
98 static int mxge_probe(device_t dev);
99 static int mxge_attach(device_t dev);
100 static int mxge_detach(device_t dev);
101 static int mxge_shutdown(device_t dev);
102 static void mxge_intr(void *arg);
103 
104 static device_method_t mxge_methods[] =
105 {
106   /* Device interface */
107   DEVMETHOD(device_probe, mxge_probe),
108   DEVMETHOD(device_attach, mxge_attach),
109   DEVMETHOD(device_detach, mxge_detach),
110   DEVMETHOD(device_shutdown, mxge_shutdown),
111   {0, 0}
112 };
113 
114 static driver_t mxge_driver =
115 {
116   "mxge",
117   mxge_methods,
118   sizeof(mxge_softc_t),
119 };
120 
121 static devclass_t mxge_devclass;
122 
123 /* Declare ourselves to be a child of the PCI bus.*/
124 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
125 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
126 
127 static int mxge_load_firmware(mxge_softc_t *sc);
128 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
129 
130 static int
131 mxge_probe(device_t dev)
132 {
133   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
134       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
135 	  device_set_desc(dev, "Myri10G-PCIE-8A");
136 	  return 0;
137   }
138   return ENXIO;
139 }
140 
141 static void
142 mxge_enable_wc(mxge_softc_t *sc)
143 {
144 	struct mem_range_desc mrdesc;
145 	vm_paddr_t pa;
146 	vm_offset_t len;
147 	int err, action;
148 
149 	pa = rman_get_start(sc->mem_res);
150 	len = rman_get_size(sc->mem_res);
151 	mrdesc.mr_base = pa;
152 	mrdesc.mr_len = len;
153 	mrdesc.mr_flags = MDF_WRITECOMBINE;
154 	action = MEMRANGE_SET_UPDATE;
155 	strcpy((char *)&mrdesc.mr_owner, "mxge");
156 	err = mem_range_attr_set(&mrdesc, &action);
157 	if (err != 0) {
158 		device_printf(sc->dev,
159 			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
160 			      (unsigned long)pa, (unsigned long)len, err);
161 	} else {
162 		sc->wc = 1;
163 	}
164 }
165 
166 
167 /* callback to get our DMA address */
168 static void
169 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
170 			 int error)
171 {
172 	if (error == 0) {
173 		*(bus_addr_t *) arg = segs->ds_addr;
174 	}
175 }
176 
177 static int
178 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
179 		   bus_size_t alignment)
180 {
181 	int err;
182 	device_t dev = sc->dev;
183 
184 	/* allocate DMAable memory tags */
185 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
186 				 alignment,		/* alignment */
187 				 4096,			/* boundary */
188 				 BUS_SPACE_MAXADDR,	/* low */
189 				 BUS_SPACE_MAXADDR,	/* high */
190 				 NULL, NULL,		/* filter */
191 				 bytes,			/* maxsize */
192 				 1,			/* num segs */
193 				 4096,			/* maxsegsize */
194 				 BUS_DMA_COHERENT,	/* flags */
195 				 NULL, NULL,		/* lock */
196 				 &dma->dmat);		/* tag */
197 	if (err != 0) {
198 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
199 		return err;
200 	}
201 
202 	/* allocate DMAable memory & map */
203 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
204 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
205 				| BUS_DMA_ZERO),  &dma->map);
206 	if (err != 0) {
207 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
208 		goto abort_with_dmat;
209 	}
210 
211 	/* load the memory */
212 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
213 			      mxge_dmamap_callback,
214 			      (void *)&dma->bus_addr, 0);
215 	if (err != 0) {
216 		device_printf(dev, "couldn't load map (err = %d)\n", err);
217 		goto abort_with_mem;
218 	}
219 	return 0;
220 
221 abort_with_mem:
222 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
223 abort_with_dmat:
224 	(void)bus_dma_tag_destroy(dma->dmat);
225 	return err;
226 }
227 
228 
229 static void
230 mxge_dma_free(mxge_dma_t *dma)
231 {
232 	bus_dmamap_unload(dma->dmat, dma->map);
233 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
234 	(void)bus_dma_tag_destroy(dma->dmat);
235 }
236 
237 /*
238  * The eeprom strings on the lanaiX have the format
239  * SN=x\0
240  * MAC=x:x:x:x:x:x\0
241  * PC=text\0
242  */
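/*
 * For example (hypothetical values), the raw block might contain:
 * "SN=123456\0MAC=00:60:dd:aa:bb:cc\0PC=somecode\0\0"
 */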
243 
244 static int
245 mxge_parse_strings(mxge_softc_t *sc)
246 {
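/* Note: despite taking an argument, this macro uses the enclosing
   scope's "ptr" and "limit" directly; it advances ptr just past the
   terminating NUL of the current string. */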
247 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
248 
249 	char *ptr, *limit;
250 	int i, found_mac;
251 
252 	ptr = sc->eeprom_strings;
253 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
254 	found_mac = 0;
255 	while (ptr < limit && *ptr != '\0') {
256 		if (memcmp(ptr, "MAC=", 4) == 0) {
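			/* "MAC=" is 4 bytes: the += 1 below plus the
			   += 3 on the first loop pass skip the tag, and
			   each later += 3 steps over one "xx:" group, so
			   strtoul() always starts at a hex pair. */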
257 			ptr += 1;
258 			sc->mac_addr_string = ptr;
259 			for (i = 0; i < 6; i++) {
260 				ptr += 3;
261 				if ((ptr + 2) > limit)
262 					goto abort;
263 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
264 				found_mac = 1;
265 			}
266 		} else if (memcmp(ptr, "PC=", 3) == 0) {
267 			ptr += 3;
268 			strncpy(sc->product_code_string, ptr,
269 				sizeof (sc->product_code_string) - 1);
270 		} else if (memcmp(ptr, "SN=", 3) == 0) {
271 			ptr += 3;
272 			strncpy(sc->serial_number_string, ptr,
273 				sizeof (sc->serial_number_string) - 1);
274 		}
275 		MXGE_NEXT_STRING(ptr);
276 	}
277 
278 	if (found_mac)
279 		return 0;
280 
281  abort:
282 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
283 
284 	return ENXIO;
285 }
286 
287 #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
288 static void
289 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
290 {
291 	uint32_t val;
292 	unsigned long base, off;
293 	char *va, *cfgptr;
294 	device_t pdev, mcp55;
295 	uint16_t vendor_id, device_id, word;
296 	uintptr_t bus, slot, func, ivend, idev;
297 	uint32_t *ptr32;
298 
299 
300 	if (!mxge_nvidia_ecrc_enable)
301 		return;
302 
303 	pdev = device_get_parent(device_get_parent(sc->dev));
304 	if (pdev == NULL) {
305 		device_printf(sc->dev, "could not find parent?\n");
306 		return;
307 	}
308 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
309 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
310 
311 	if (vendor_id != 0x10de)
312 		return;
313 
314 	base = 0;
315 
316 	if (device_id == 0x005d) {
317 		/* ck804, base address is magic */
318 		base = 0xe0000000UL;
319 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
320 		/* mcp55, base address stored in chipset */
321 		mcp55 = pci_find_bsf(0, 0, 0);
322 		if (mcp55 &&
323 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
324 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
325 			word = pci_read_config(mcp55, 0x90, 2);
326 			base = ((unsigned long)word & 0x7ffeU) << 25;
327 		}
328 	}
329 	if (!base)
330 		return;
331 
332 	/* XXXX
333 	   The test below is commented out because it is believed that
334 	   config reads/writes beyond 0xff will access the config space
335 	   of the next larger function.  Uncomment this, and remove the
336 	   hacky pmap_mapdev() way of accessing config space, when
337 	   FreeBSD grows support for extended PCIe config space access.
338 	*/
339 #if 0
340 	/* See if we can, by some miracle, access the extended
341 	   config space */
342 	val = pci_read_config(pdev, 0x178, 4);
343 	if (val != 0xffffffff) {
344 		val |= 0x40;
345 		pci_write_config(pdev, 0x178, val, 4);
346 		return;
347 	}
348 #endif
349 	/* Rather than using normal pci config space writes, we must
350 	 * map the Nvidia config space ourselves.  This is because on
351 	 * Opteron/Nvidia class machines the 0xe0000000 mapping is
352 	 * handled by the Nvidia chipset, which means the internal PCI
353 	 * device (the on-chip northbridge) and the amd-8131 bridge,
354 	 * and things behind them, are not visible by this method.
355 	 */
356 
357 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
358 		      PCI_IVAR_BUS, &bus);
359 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
360 		      PCI_IVAR_SLOT, &slot);
361 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
362 		      PCI_IVAR_FUNCTION, &func);
363 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
364 		      PCI_IVAR_VENDOR, &ivend);
365 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
366 		      PCI_IVAR_DEVICE, &idev);
367 
368 	off =  base
369 		+ 0x00100000UL * (unsigned long)bus
370 		+ 0x00001000UL * (unsigned long)(func
371 						 + 8 * slot);
372 
373 	/* map it into the kernel */
374 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
375 
376 
377 	if (va == NULL) {
378 		device_printf(sc->dev, "pmap_mapdev() failed\n");
379 		return;
380 	}
381 	/* get a pointer to the config space mapped into the kernel */
382 	cfgptr = va + (off & PAGE_MASK);
383 
384 	/* make sure that we can really access it */
385 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
386 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
387 	if (! (vendor_id == ivend && device_id == idev)) {
388 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
389 			      vendor_id, device_id);
390 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
391 		return;
392 	}
393 
394 	ptr32 = (uint32_t*)(cfgptr + 0x178);
395 	val = *ptr32;
396 
397 	if (val == 0xffffffff) {
398 		device_printf(sc->dev, "extended mapping failed\n");
399 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
400 		return;
401 	}
402 	*ptr32 = val | 0x40;
403 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
404 	if (mxge_verbose)
405 		device_printf(sc->dev,
406 			      "Enabled ECRC on upstream Nvidia bridge "
407 			      "at %d:%d:%d\n",
408 			      (int)bus, (int)slot, (int)func);
409 	return;
410 }
411 #else
412 static void
413 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
414 {
415 	device_printf(sc->dev,
416 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
417 	return;
418 }
419 #endif
420 
421 
422 static int
423 mxge_dma_test(mxge_softc_t *sc, int test_type)
424 {
425 	mxge_cmd_t cmd;
426 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
427 	int status;
428 	uint32_t len;
429 	char *test = " ";
430 
431 
432 	/* Run a small DMA test.
433 	 * The magic multipliers to the length tell the firmware
434 	 * to do DMA read, write, or read+write tests.  The
435 	 * results are returned in cmd.data0.  The upper 16
436 	 * bits of the return is the number of transfers completed.
437 	 * The lower 16 bits is the time in 0.5us ticks that the
438 	 * transfers took to complete.
439 	 */
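	/* Units, worked through: N transfers of len bytes completed in
	 * T half-microsecond ticks move (N * len) bytes in (T / 2) us,
	 * i.e. (N * len * 2) / T bytes/us, which is approximately MB/s;
	 * that is what the divisions below compute (with an extra
	 * factor of 2 for the read+write test, which moves each byte
	 * twice).
	 */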
440 
441 	len = sc->tx.boundary;
442 
443 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
444 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
445 	cmd.data2 = len * 0x10000;
446 	status = mxge_send_cmd(sc, test_type, &cmd);
447 	if (status != 0) {
448 		test = "read";
449 		goto abort;
450 	}
451 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
452 		(cmd.data0 & 0xffff);
453 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
454 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
455 	cmd.data2 = len * 0x1;
456 	status = mxge_send_cmd(sc, test_type, &cmd);
457 	if (status != 0) {
458 		test = "write";
459 		goto abort;
460 	}
461 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
462 		(cmd.data0 & 0xffff);
463 
464 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
465 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
466 	cmd.data2 = len * 0x10001;
467 	status = mxge_send_cmd(sc, test_type, &cmd);
468 	if (status != 0) {
469 		test = "read/write";
470 		goto abort;
471 	}
472 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
473 		(cmd.data0 & 0xffff);
474 
475 abort:
476 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
477 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
478 			      test, status);
479 
480 	return status;
481 }
482 
483 /*
484  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
485  * when the PCI-E Completion packets are aligned on an 8-byte
486  * boundary.  Some PCI-E chip sets always align Completion packets; on
487  * the ones that do not, the alignment can be enforced by enabling
488  * ECRC generation (if supported).
489  *
490  * When PCI-E Completion packets are not aligned, it is actually more
491  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
492  *
493  * If the driver can neither enable ECRC nor verify that it has
494  * already been enabled, then it must use a firmware image which works
495  * around unaligned completion packets (ethp_z8e.dat), and it should
496  * also ensure that it never gives the device a Read-DMA which is
497  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
498  * enabled, then the driver should use the aligned (eth_z8e.dat)
499  * firmware image, and set tx.boundary to 4KB.
500  */
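/*
 * The resulting policy, implemented by mxge_firmware_probe() and
 * mxge_select_firmware() below:
 *
 *   aligned completions verified (or forced) -> mxge_eth_z8e,  tx.boundary = 4096
 *   otherwise                                -> mxge_ethp_z8e, tx.boundary = 2048
 */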
501 
502 static int
503 mxge_firmware_probe(mxge_softc_t *sc)
504 {
505 	device_t dev = sc->dev;
506 	int reg, status;
507 	uint16_t pectl;
508 
509 	sc->tx.boundary = 4096;
510 	/*
511 	 * Verify the max read request size was set to 4KB
512 	 * before trying the test with 4KB.
513 	 */
514 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
515 		pectl = pci_read_config(dev, reg + 0x8, 2);
516 		if ((pectl & (5 << 12)) != (5 << 12)) {
517 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
518 				      pectl);
519 			sc->tx.boundary = 2048;
520 		}
521 	}
522 
523 	/*
524 	 * load the optimized firmware (which assumes aligned PCIe
525 	 * completions) in order to see if it works on this host.
526 	 */
527 	sc->fw_name = mxge_fw_aligned;
528 	status = mxge_load_firmware(sc);
529 	if (status != 0) {
530 		return status;
531 	}
532 
533 	/*
534 	 * Enable ECRC if possible
535 	 */
536 	mxge_enable_nvidia_ecrc(sc);
537 
538 	/*
539 	 * Run a DMA test which watches for unaligned completions and
540 	 * aborts on the first one seen.
541 	 */
542 
543 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
544 	if (status == 0)
545 		return 0; /* keep the aligned firmware */
546 
547 	if (status != E2BIG)
548 		device_printf(dev, "DMA test failed: %d\n", status);
549 	if (status == ENOSYS)
550 		device_printf(dev, "Falling back to ethp! "
551 			      "Please install up to date fw\n");
552 	return status;
553 }
554 
555 static int
556 mxge_select_firmware(mxge_softc_t *sc)
557 {
558 	int aligned = 0;
559 
560 
561 	if (mxge_force_firmware != 0) {
562 		if (mxge_force_firmware == 1)
563 			aligned = 1;
564 		else
565 			aligned = 0;
566 		if (mxge_verbose)
567 			device_printf(sc->dev,
568 				      "Assuming %s completions (forced)\n",
569 				      aligned ? "aligned" : "unaligned");
570 		goto abort;
571 	}
572 
573 	/* if the PCIe link width is 4 or less, we can use the aligned
574 	   firmware and skip any checks */
575 	if (sc->link_width != 0 && sc->link_width <= 4) {
576 		device_printf(sc->dev,
577 			      "PCIe x%d Link, expect reduced performance\n",
578 			      sc->link_width);
579 		aligned = 1;
580 		goto abort;
581 	}
582 
583 	if (0 == mxge_firmware_probe(sc))
584 		return 0;
585 
586 abort:
587 	if (aligned) {
588 		sc->fw_name = mxge_fw_aligned;
589 		sc->tx.boundary = 4096;
590 	} else {
591 		sc->fw_name = mxge_fw_unaligned;
592 		sc->tx.boundary = 2048;
593 	}
594 	return (mxge_load_firmware(sc));
595 }
596 
597 union qualhack
598 {
599         const char *ro_char;
600         char *rw_char;
601 };
602 
603 static int
604 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
605 {
606 
607 
608 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
609 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
610 			      be32toh(hdr->mcp_type));
611 		return EIO;
612 	}
613 
614 	/* save firmware version for sysctl */
615 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
616 	if (mxge_verbose)
617 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
618 
619 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
620 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
621 
622 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
623 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
624 		device_printf(sc->dev, "Found firmware version %s\n",
625 			      sc->fw_version);
626 		device_printf(sc->dev, "Driver needs %d.%d\n",
627 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
628 		return EINVAL;
629 	}
630 	return 0;
631 
632 }
633 
634 static int
635 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
636 {
637 	const struct firmware *fw;
638 	const mcp_gen_header_t *hdr;
639 	unsigned hdr_offset;
640 	const char *fw_data;
641 	union qualhack hack;
642 	int status;
643 	unsigned int i;
644 	char dummy;
645 
646 
647 	fw = firmware_get(sc->fw_name);
648 
649 	if (fw == NULL) {
650 		device_printf(sc->dev, "Could not find firmware image %s\n",
651 			      sc->fw_name);
652 		return ENOENT;
653 	}
654 	if (fw->datasize > *limit ||
655 	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
656 		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
657 			      sc->fw_name, (int)fw->datasize, (int) *limit);
658 		status = ENOSPC;
659 		goto abort_with_fw;
660 	}
661 	*limit = fw->datasize;
662 
663 	/* check id */
664 	fw_data = (const char *)fw->data;
665 	hdr_offset = htobe32(*(const uint32_t *)
666 			     (fw_data + MCP_HEADER_PTR_OFFSET));
667 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
668 		device_printf(sc->dev, "Bad firmware file\n");
669 		status = EIO;
670 		goto abort_with_fw;
671 	}
672 	hdr = (const void*)(fw_data + hdr_offset);
673 
674 	status = mxge_validate_firmware(sc, hdr);
675 	if (status != 0)
676 		goto abort_with_fw;
677 
678 	hack.ro_char = fw_data;
679 	/* Copy the inflated firmware to NIC SRAM. */
680 	for (i = 0; i < *limit; i += 256) {
681 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
682 			      hack.rw_char + i,
683 			      min(256U, (unsigned)(*limit - i)));
684 		mb();
685 		dummy = *sc->sram;
686 		mb();
687 	}
688 
689 	status = 0;
690 abort_with_fw:
691 	firmware_put(fw, FIRMWARE_UNLOAD);
692 	return status;
693 }
694 
695 /*
696  * Enable or disable periodic RDMAs from the host to make certain
697  * chipsets resend dropped PCIe messages
698  */
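/* mxge_reset() turns this on (enable = 1) right after a successful
   MXGEFW_CMD_RESET; the firmware acknowledges by writing -1 to the
   confirmation word that is polled below. */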
699 
700 static void
701 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
702 {
703 	char buf_bytes[72];
704 	volatile uint32_t *confirm;
705 	volatile char *submit;
706 	uint32_t *buf, dma_low, dma_high;
707 	int i;
708 
709 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
710 
711 	/* clear confirmation addr */
712 	confirm = (volatile uint32_t *)sc->cmd;
713 	*confirm = 0;
714 	mb();
715 
716 	/* send an rdma command to the PCIe engine, and wait for the
717 	   response in the confirmation address.  The firmware should
718 	   write a -1 there to indicate it is alive and well
719 	*/
720 
721 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
722 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
723 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
724 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
725 	buf[2] = htobe32(0xffffffff);		/* confirm data */
726 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
727 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
728 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
729 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
730 	buf[5] = htobe32(enable);			/* enable? */
731 
732 
733 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
734 
735 	mxge_pio_copy(submit, buf, 64);
736 	mb();
737 	DELAY(1000);
738 	mb();
739 	i = 0;
740 	while (*confirm != 0xffffffff && i < 20) {
741 		DELAY(1000);
742 		i++;
743 	}
744 	if (*confirm != 0xffffffff) {
745 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
746 			      (enable ? "enable" : "disable"), confirm,
747 			      *confirm);
748 	}
749 	return;
750 }
751 
752 static int
753 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
754 {
755 	mcp_cmd_t *buf;
756 	char buf_bytes[sizeof(*buf) + 8];
757 	volatile mcp_cmd_response_t *response = sc->cmd;
758 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
759 	uint32_t dma_low, dma_high;
760 	int err, sleep_total = 0;
761 
762 	/* ensure buf is aligned to 8 bytes */
763 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
764 
765 	buf->data0 = htobe32(data->data0);
766 	buf->data1 = htobe32(data->data1);
767 	buf->data2 = htobe32(data->data2);
768 	buf->cmd = htobe32(cmd);
769 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
770 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
771 
772 	buf->response_addr.low = htobe32(dma_low);
773 	buf->response_addr.high = htobe32(dma_high);
774 	mtx_lock(&sc->cmd_mtx);
775 	response->result = 0xffffffff;
776 	mb();
777 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
778 
779 	/* wait up to 20ms */
780 	err = EAGAIN;
781 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
782 		bus_dmamap_sync(sc->cmd_dma.dmat,
783 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
784 		mb();
785 		switch (be32toh(response->result)) {
786 		case 0:
787 			data->data0 = be32toh(response->data);
788 			err = 0;
789 			break;
790 		case 0xffffffff:
791 			DELAY(1000);
792 			break;
793 		case MXGEFW_CMD_UNKNOWN:
794 			err = ENOSYS;
795 			break;
796 		case MXGEFW_CMD_ERROR_UNALIGNED:
797 			err = E2BIG;
798 			break;
799 		default:
800 			device_printf(sc->dev,
801 				      "mxge: command %d "
802 				      "failed, result = %d\n",
803 				      cmd, be32toh(response->result));
804 			err = ENXIO;
805 			break;
806 		}
807 		if (err != EAGAIN)
808 			break;
809 	}
810 	if (err == EAGAIN)
811 		device_printf(sc->dev, "mxge: command %d timed out, "
812 			      "result = %d\n",
813 			      cmd, be32toh(response->result));
814 	mtx_unlock(&sc->cmd_mtx);
815 	return err;
816 }
817 
818 static int
819 mxge_adopt_running_firmware(mxge_softc_t *sc)
820 {
821 	struct mcp_gen_header *hdr;
822 	const size_t bytes = sizeof (struct mcp_gen_header);
823 	size_t hdr_offset;
824 	int status;
825 
826 	/* find running firmware header */
827 	hdr_offset = htobe32(*(volatile uint32_t *)
828 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
829 
830 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
831 		device_printf(sc->dev,
832 			      "Running firmware has bad header offset (%d)\n",
833 			      (int)hdr_offset);
834 		return EIO;
835 	}
836 
837 	/* copy header of running firmware from SRAM to host memory to
838 	 * validate firmware */
839 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
840 	if (hdr == NULL) {
841 		device_printf(sc->dev, "could not malloc firmware hdr\n");
842 		return ENOMEM;
843 	}
844 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
845 				rman_get_bushandle(sc->mem_res),
846 				hdr_offset, (char *)hdr, bytes);
847 	status = mxge_validate_firmware(sc, hdr);
848 	free(hdr, M_DEVBUF);
849 
850 	/*
851 	 * check to see if adopted firmware has bug where adopting
852 	 * it will cause broadcasts to be filtered unless the NIC
853 	 * is kept in ALLMULTI mode
854 	 */
855 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
856 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
857 		sc->adopted_rx_filter_bug = 1;
858 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
859 			      "working around rx filter bug\n",
860 			      sc->fw_ver_major, sc->fw_ver_minor,
861 			      sc->fw_ver_tiny);
862 	}
863 
864 	return status;
865 }
866 
867 
868 static int
869 mxge_load_firmware(mxge_softc_t *sc)
870 {
871 	volatile uint32_t *confirm;
872 	volatile char *submit;
873 	char buf_bytes[72];
874 	uint32_t *buf, size, dma_low, dma_high;
875 	int status, i;
876 
877 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
878 
879 	size = sc->sram_size;
880 	status = mxge_load_firmware_helper(sc, &size);
881 	if (status) {
882 		/* Try to use the currently running firmware, if
883 		   it is new enough */
884 		status = mxge_adopt_running_firmware(sc);
885 		if (status) {
886 			device_printf(sc->dev,
887 				      "failed to adopt running firmware\n");
888 			return status;
889 		}
890 		device_printf(sc->dev,
891 			      "Successfully adopted running firmware\n");
892 		if (sc->tx.boundary == 4096) {
893 			device_printf(sc->dev,
894 				"Using firmware currently running on NIC"
895 				 ".  For optimal\n");
896 			device_printf(sc->dev,
897 				 "performance consider loading optimized "
898 				 "firmware\n");
899 		}
900 		sc->fw_name = mxge_fw_unaligned;
901 		sc->tx.boundary = 2048;
902 		return 0;
903 	}
904 	/* clear confirmation addr */
905 	confirm = (volatile uint32_t *)sc->cmd;
906 	*confirm = 0;
907 	mb();
908 	/* send a reload command to the bootstrap MCP, and wait for the
909 	   response in the confirmation address.  The firmware should
910 	   write a -1 there to indicate it is alive and well
911 	*/
912 
913 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
914 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
915 
916 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
917 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
918 	buf[2] = htobe32(0xffffffff);	/* confirm data */
919 
920 	/* FIX: All newest firmware should un-protect the bottom of
921 	   the sram before handoff. However, the very first interfaces
922 	   do not. Therefore the handoff copy must skip the first 8 bytes
923 	*/
924 					/* where the code starts*/
925 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
926 	buf[4] = htobe32(size - 8); 	/* length of code */
927 	buf[5] = htobe32(8);		/* where to copy to */
928 	buf[6] = htobe32(0);		/* where to jump to */
929 
930 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
931 	mxge_pio_copy(submit, buf, 64);
932 	mb();
933 	DELAY(1000);
934 	mb();
935 	i = 0;
936 	while (*confirm != 0xffffffff && i < 20) {
937 		DELAY(1000*10);
938 		i++;
939 		bus_dmamap_sync(sc->cmd_dma.dmat,
940 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
941 	}
942 	if (*confirm != 0xffffffff) {
943 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
944 			confirm, *confirm);
945 
946 		return ENXIO;
947 	}
948 	return 0;
949 }
950 
951 static int
952 mxge_update_mac_address(mxge_softc_t *sc)
953 {
954 	mxge_cmd_t cmd;
955 	uint8_t *addr = sc->mac_addr;
956 	int status;
957 
958 
959 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
960 		     | (addr[2] << 8) | addr[3]);
961 
962 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
963 
964 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
965 	return status;
966 }
967 
968 static int
969 mxge_change_pause(mxge_softc_t *sc, int pause)
970 {
971 	mxge_cmd_t cmd;
972 	int status;
973 
974 	if (pause)
975 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
976 				       &cmd);
977 	else
978 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
979 				       &cmd);
980 
981 	if (status) {
982 		device_printf(sc->dev, "Failed to set flow control mode\n");
983 		return ENXIO;
984 	}
985 	sc->pause = pause;
986 	return 0;
987 }
988 
989 static void
990 mxge_change_promisc(mxge_softc_t *sc, int promisc)
991 {
992 	mxge_cmd_t cmd;
993 	int status;
994 
995 	if (promisc)
996 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
997 				       &cmd);
998 	else
999 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1000 				       &cmd);
1001 
1002 	if (status) {
1003 		device_printf(sc->dev, "Failed to set promisc mode\n");
1004 	}
1005 }
1006 
1007 static void
1008 mxge_set_multicast_list(mxge_softc_t *sc)
1009 {
1010 	mxge_cmd_t cmd;
1011 	struct ifmultiaddr *ifma;
1012 	struct ifnet *ifp = sc->ifp;
1013 	int err;
1014 
1015 	/* This firmware is known to not support multicast */
1016 	if (!sc->fw_multicast_support)
1017 		return;
1018 
1019 	/* Disable multicast filtering while we play with the lists*/
1020 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1021 	if (err != 0) {
1022 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1023 		       " error status: %d\n", err);
1024 		return;
1025 	}
1026 
1027 	if (sc->adopted_rx_filter_bug)
1028 		return;
1029 
1030 	if (ifp->if_flags & IFF_ALLMULTI)
1031 		/* request to disable multicast filtering, so quit here */
1032 		return;
1033 
1034 	/* Flush all the filters */
1035 
1036 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1037 	if (err != 0) {
1038 		device_printf(sc->dev,
1039 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1040 			      ", error status: %d\n", err);
1041 		return;
1042 	}
1043 
1044 	/* Walk the multicast list, and add each address */
1045 
1046 	IF_ADDR_LOCK(ifp);
1047 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1048 		if (ifma->ifma_addr->sa_family != AF_LINK)
1049 			continue;
1050 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1051 		      &cmd.data0, 4);
1052 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1053 		      &cmd.data1, 2);
1054 		cmd.data0 = htonl(cmd.data0);
1055 		cmd.data1 = htonl(cmd.data1);
1056 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1057 		if (err != 0) {
1058 			device_printf(sc->dev, "Failed "
1059 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1060 			       "%d\t", err);
1061 			/* abort, leaving multicast filtering off */
1062 			IF_ADDR_UNLOCK(ifp);
1063 			return;
1064 		}
1065 	}
1066 	IF_ADDR_UNLOCK(ifp);
1067 	/* Enable multicast filtering */
1068 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1069 	if (err != 0) {
1070 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1071 		       ", error status: %d\n", err);
1072 	}
1073 }
1074 
1075 static int
1076 mxge_max_mtu(mxge_softc_t *sc)
1077 {
1078 	mxge_cmd_t cmd;
1079 	int status;
1080 
1081 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1082 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1083 
1084 	/* try to set nbufs to see if we can
1085 	   use virtually contiguous jumbos */
1086 	cmd.data0 = 0;
1087 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1088 			       &cmd);
1089 	if (status == 0)
1090 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1091 
1092 	/* otherwise, we're limited to MJUMPAGESIZE */
1093 	return MJUMPAGESIZE - MXGEFW_PAD;
1094 }
1095 
1096 static int
1097 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1098 {
1099 
1100 	mxge_cmd_t cmd;
1101 	size_t bytes;
1102 	int status;
1103 
1104 	/* try to send a reset command to the card to see if it
1105 	   is alive */
1106 	memset(&cmd, 0, sizeof (cmd));
1107 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1108 	if (status != 0) {
1109 		device_printf(sc->dev, "failed reset\n");
1110 		return ENXIO;
1111 	}
1112 
1113 	mxge_dummy_rdma(sc, 1);
1114 
1115 	if (interrupts_setup) {
1116 		/* Now exchange information about interrupts  */
1117 		bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
1118 		memset(sc->rx_done.entry, 0, bytes);
1119 		cmd.data0 = (uint32_t)bytes;
1120 		status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1121 		cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
1122 		cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
1123 		status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
1124 	}
1125 
1126 	status |= mxge_send_cmd(sc,
1127 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1128 
1129 
1130 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1131 
1132 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1133 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1134 
1135 
1136 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1137 				&cmd);
1138 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1139 	if (status != 0) {
1140 		device_printf(sc->dev, "failed set interrupt parameters\n");
1141 		return status;
1142 	}
1143 
1144 
1145 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1146 
1147 
1148 	/* run a DMA benchmark */
1149 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1150 
1151 	/* reset mcp/driver shared state back to 0 */
1152 	sc->rx_done.idx = 0;
1153 	sc->rx_done.cnt = 0;
1154 	sc->tx.req = 0;
1155 	sc->tx.done = 0;
1156 	sc->tx.pkt_done = 0;
1157 	sc->tx.wake = 0;
1158 	sc->tx_defrag = 0;
1159 	sc->tx.stall = 0;
1160 	sc->rx_big.cnt = 0;
1161 	sc->rx_small.cnt = 0;
1162 	sc->rdma_tags_available = 15;
1163 	sc->fw_stats->valid = 0;
1164 	sc->fw_stats->send_done_count = 0;
1165 	sc->lro_bad_csum = 0;
1166 	sc->lro_queued = 0;
1167 	sc->lro_flushed = 0;
1168 	status = mxge_update_mac_address(sc);
1169 	mxge_change_promisc(sc, 0);
1170 	mxge_change_pause(sc, sc->pause);
1171 	mxge_set_multicast_list(sc);
1172 	return status;
1173 }
1174 
1175 static int
1176 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1177 {
1178         mxge_softc_t *sc;
1179         unsigned int intr_coal_delay;
1180         int err;
1181 
1182         sc = arg1;
1183         intr_coal_delay = sc->intr_coal_delay;
1184         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1185         if (err != 0) {
1186                 return err;
1187         }
1188         if (intr_coal_delay == sc->intr_coal_delay)
1189                 return 0;
1190 
1191         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1192                 return EINVAL;
1193 
1194 	mtx_lock(&sc->driver_mtx);
1195 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1196 	sc->intr_coal_delay = intr_coal_delay;
1197 
1198 	mtx_unlock(&sc->driver_mtx);
1199         return err;
1200 }
1201 
1202 static int
1203 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1204 {
1205         mxge_softc_t *sc;
1206         unsigned int enabled;
1207         int err;
1208 
1209         sc = arg1;
1210         enabled = sc->pause;
1211         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1212         if (err != 0) {
1213                 return err;
1214         }
1215         if (enabled == sc->pause)
1216                 return 0;
1217 
1218 	mtx_lock(&sc->driver_mtx);
1219 	err = mxge_change_pause(sc, enabled);
1220 	mtx_unlock(&sc->driver_mtx);
1221         return err;
1222 }
1223 
1224 static int
1225 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1226 {
1227         int err;
1228 
1229         if (arg1 == NULL)
1230                 return EFAULT;
1231         arg2 = be32toh(*(int *)arg1);
1232         arg1 = NULL;
1233         err = sysctl_handle_int(oidp, arg1, arg2, req);
1234 
1235         return err;
1236 }
1237 
1238 static void
1239 mxge_add_sysctls(mxge_softc_t *sc)
1240 {
1241 	struct sysctl_ctx_list *ctx;
1242 	struct sysctl_oid_list *children;
1243 	mcp_irq_data_t *fw;
1244 
1245 	ctx = device_get_sysctl_ctx(sc->dev);
1246 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1247 	fw = sc->fw_stats;
1248 
1249 	/* random information */
1250 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1251 		       "firmware_version",
1252 		       CTLFLAG_RD, &sc->fw_version,
1253 		       0, "firmware version");
1254 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1255 		       "serial_number",
1256 		       CTLFLAG_RD, &sc->serial_number_string,
1257 		       0, "serial number");
1258 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1259 		       "product_code",
1260 		       CTLFLAG_RD, &sc->product_code_string,
1261 		       0, "product_code");
1262 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1263 		       "pcie_link_width",
1264 		       CTLFLAG_RD, &sc->link_width,
1265 		       0, "tx_boundary");
1266 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1267 		       "tx_boundary",
1268 		       CTLFLAG_RD, &sc->tx.boundary,
1269 		       0, "tx_boundary");
1270 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1271 		       "write_combine",
1272 		       CTLFLAG_RD, &sc->wc,
1273 		       0, "write combining PIO?");
1274 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1275 		       "read_dma_MBs",
1276 		       CTLFLAG_RD, &sc->read_dma,
1277 		       0, "DMA Read speed in MB/s");
1278 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1279 		       "write_dma_MBs",
1280 		       CTLFLAG_RD, &sc->write_dma,
1281 		       0, "DMA Write speed in MB/s");
1282 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1283 		       "read_write_dma_MBs",
1284 		       CTLFLAG_RD, &sc->read_write_dma,
1285 		       0, "DMA concurrent Read/Write speed in MB/s");
1286 
1287 
1288 	/* performance related tunables */
1289 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1290 			"intr_coal_delay",
1291 			CTLTYPE_INT|CTLFLAG_RW, sc,
1292 			0, mxge_change_intr_coal,
1293 			"I", "interrupt coalescing delay in usecs");
1294 
1295 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1296 			"flow_control_enabled",
1297 			CTLTYPE_INT|CTLFLAG_RW, sc,
1298 			0, mxge_change_flow_control,
1299 			"I", "interrupt coalescing delay in usecs");
1300 
1301 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1302 		       "deassert_wait",
1303 		       CTLFLAG_RW, &mxge_deassert_wait,
1304 		       0, "Wait for IRQ line to go low in ihandler");
1305 
1306 	/* stats block from firmware is in network byte order.
1307 	   Need to swap it */
1308 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1309 			"link_up",
1310 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1311 			0, mxge_handle_be32,
1312 			"I", "link up");
1313 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1314 			"rdma_tags_available",
1315 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1316 			0, mxge_handle_be32,
1317 			"I", "rdma_tags_available");
1318 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1319 			"dropped_bad_crc32",
1320 			CTLTYPE_INT|CTLFLAG_RD,
1321 			&fw->dropped_bad_crc32,
1322 			0, mxge_handle_be32,
1323 			"I", "dropped_bad_crc32");
1324 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1325 			"dropped_bad_phy",
1326 			CTLTYPE_INT|CTLFLAG_RD,
1327 			&fw->dropped_bad_phy,
1328 			0, mxge_handle_be32,
1329 			"I", "dropped_bad_phy");
1330 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1331 			"dropped_link_error_or_filtered",
1332 			CTLTYPE_INT|CTLFLAG_RD,
1333 			&fw->dropped_link_error_or_filtered,
1334 			0, mxge_handle_be32,
1335 			"I", "dropped_link_error_or_filtered");
1336 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1337 			"dropped_link_overflow",
1338 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1339 			0, mxge_handle_be32,
1340 			"I", "dropped_link_overflow");
1341 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1342 			"dropped_multicast_filtered",
1343 			CTLTYPE_INT|CTLFLAG_RD,
1344 			&fw->dropped_multicast_filtered,
1345 			0, mxge_handle_be32,
1346 			"I", "dropped_multicast_filtered");
1347 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1348 			"dropped_no_big_buffer",
1349 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1350 			0, mxge_handle_be32,
1351 			"I", "dropped_no_big_buffer");
1352 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1353 			"dropped_no_small_buffer",
1354 			CTLTYPE_INT|CTLFLAG_RD,
1355 			&fw->dropped_no_small_buffer,
1356 			0, mxge_handle_be32,
1357 			"I", "dropped_no_small_buffer");
1358 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1359 			"dropped_overrun",
1360 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1361 			0, mxge_handle_be32,
1362 			"I", "dropped_overrun");
1363 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1364 			"dropped_pause",
1365 			CTLTYPE_INT|CTLFLAG_RD,
1366 			&fw->dropped_pause,
1367 			0, mxge_handle_be32,
1368 			"I", "dropped_pause");
1369 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1370 			"dropped_runt",
1371 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1372 			0, mxge_handle_be32,
1373 			"I", "dropped_runt");
1374 
1375 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1376 			"dropped_unicast_filtered",
1377 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1378 			0, mxge_handle_be32,
1379 			"I", "dropped_unicast_filtered");
1380 
1381 	/* host counters exported for debugging */
1382 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1383 		       "rx_small_cnt",
1384 		       CTLFLAG_RD, &sc->rx_small.cnt,
1385 		       0, "rx_small_cnt");
1386 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1387 		       "rx_big_cnt",
1388 		       CTLFLAG_RD, &sc->rx_big.cnt,
1389 		       0, "rx_small_cnt");
1390 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1391 		       "tx_req",
1392 		       CTLFLAG_RD, &sc->tx.req,
1393 		       0, "tx_req");
1394 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1395 		       "tx_done",
1396 		       CTLFLAG_RD, &sc->tx.done,
1397 		       0, "tx_done");
1398 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1399 		       "tx_pkt_done",
1400 		       CTLFLAG_RD, &sc->tx.pkt_done,
1401 		       0, "tx_done");
1402 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1403 		       "tx_stall",
1404 		       CTLFLAG_RD, &sc->tx.stall,
1405 		       0, "tx_stall");
1406 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1407 		       "tx_wake",
1408 		       CTLFLAG_RD, &sc->tx.wake,
1409 		       0, "tx_wake");
1410 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1411 		       "tx_defrag",
1412 		       CTLFLAG_RD, &sc->tx_defrag,
1413 		       0, "tx_defrag");
1414 
1415 	/* verbose printing? */
1416 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1417 		       "verbose",
1418 		       CTLFLAG_RW, &mxge_verbose,
1419 		       0, "verbose printing");
1420 
1421 	/* lro */
1422 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1423 		       "lro_cnt", CTLFLAG_RD, &sc->lro_cnt,
1424 		       0, "number of lro merge queues");
1425 
1426 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1427 		       "lro_flushed", CTLFLAG_RD, &sc->lro_flushed,
1428 		       0, "number of lro merge queues flushed");
1429 
1430 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1431 		       "lro_queued", CTLFLAG_RD, &sc->lro_queued,
1432 		       0, "number of frames appended to lro merge queues");
1433 
1434 }
1435 
1436 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1437    backwards one at a time and handle ring wraps */
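/* Copying in reverse order guarantees that, in the ring-wrap case,
   every slot after the first is visible to the NIC before
   mxge_submit_req() writes the first slot and flips its valid
   flags. */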
1438 
1439 static inline void
1440 mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1441 			    mcp_kreq_ether_send_t *src, int cnt)
1442 {
1443         int idx, starting_slot;
1444         starting_slot = tx->req;
1445         while (cnt > 1) {
1446                 cnt--;
1447                 idx = (starting_slot + cnt) & tx->mask;
1448                 mxge_pio_copy(&tx->lanai[idx],
1449 			      &src[cnt], sizeof(*src));
1450                 mb();
1451         }
1452 }
1453 
1454 /*
1455  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1456  * at most 32 bytes at a time, so as to avoid involving the software
1457  * pio handler in the nic.   We re-write the first segment's flags
1458  * to mark them valid only after writing the entire chain
1459  */
1460 
1461 static inline void
1462 mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
1463                   int cnt)
1464 {
1465         int idx, i;
1466         uint32_t *src_ints;
1467 	volatile uint32_t *dst_ints;
1468         mcp_kreq_ether_send_t *srcp;
1469 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1470 	uint8_t last_flags;
1471 
1472         idx = tx->req & tx->mask;
1473 
1474 	last_flags = src->flags;
1475 	src->flags = 0;
1476         mb();
1477         dst = dstp = &tx->lanai[idx];
1478         srcp = src;
1479 
1480         if ((idx + cnt) < tx->mask) {
1481                 for (i = 0; i < (cnt - 1); i += 2) {
1482                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1483                         mb(); /* force write every 32 bytes */
1484                         srcp += 2;
1485                         dstp += 2;
1486                 }
1487         } else {
1488                 /* submit all but the first request, and ensure
1489                    that it is submitted below */
1490                 mxge_submit_req_backwards(tx, src, cnt);
1491                 i = 0;
1492         }
1493         if (i < cnt) {
1494                 /* submit the first request */
1495                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1496                 mb(); /* barrier before setting valid flag */
1497         }
1498 
1499         /* re-write the last 32-bits with the valid flags */
1500         src->flags = last_flags;
1501         src_ints = (uint32_t *)src;
1502         src_ints+=3;
1503         dst_ints = (volatile uint32_t *)dst;
1504         dst_ints+=3;
1505         *dst_ints =  *src_ints;
1506         tx->req += cnt;
1507         mb();
1508 }
1509 
1510 static void
1511 mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt,
1512 	       int ip_off)
1513 {
1514 	mxge_tx_buf_t *tx;
1515 	mcp_kreq_ether_send_t *req;
1516 	bus_dma_segment_t *seg;
1517 	struct ip *ip;
1518 	struct tcphdr *tcp;
1519 	uint32_t low, high_swapped;
1520 	int len, seglen, cum_len, cum_len_next;
1521 	int next_is_first, chop, cnt, rdma_count, small;
1522 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1523 	uint8_t flags, flags_next;
1524 	static int once;
1525 
1526 	mss = m->m_pkthdr.tso_segsz;
1527 
1528 	/* negative cum_len signifies to the
1529 	 * send loop that we are still in the
1530 	 * header portion of the TSO packet.
1531 	 */
1532 
1533 	/* ensure we have the ethernet, IP and TCP
1534 	   header together in the first mbuf, copy
1535 	   it to a scratch buffer if not */
1536 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1537 		m_copydata(m, 0, ip_off + sizeof (*ip),
1538 			   sc->scratch);
1539 		ip = (struct ip *)(sc->scratch + ip_off);
1540 	} else {
1541 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1542 	}
1543 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1544 			    + sizeof (*tcp))) {
1545 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1546 			   + sizeof (*tcp),  sc->scratch);
1547 		ip = (struct ip *)(sc->scratch + ip_off);
1548 	}
1549 
1550 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1551 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1552 
1553 	/* TSO implies checksum offload on this hardware */
1554 	cksum_offset = ip_off + (ip->ip_hl << 2);
1555 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1556 
1557 
1558 	/* for TSO, pseudo_hdr_offset holds mss.
1559 	 * The firmware figures out where to put
1560 	 * the checksum by parsing the header. */
1561 	pseudo_hdr_offset = htobe16(mss);
1562 
1563 	tx = &sc->tx;
1564 	req = tx->req_list;
1565 	seg = tx->seg_list;
1566 	cnt = 0;
1567 	rdma_count = 0;
1568 	/* "rdma_count" is the number of RDMAs belonging to the
1569 	 * current packet BEFORE the current send request. For
1570 	 * non-TSO packets, this is equal to "count".
1571 	 * For TSO packets, rdma_count needs to be reset
1572 	 * to 0 after a segment cut.
1573 	 *
1574 	 * The rdma_count field of the send request is
1575 	 * the number of RDMAs of the packet starting at
1576 	 * that request. For TSO send requests with one or more cuts
1577 	 * in the middle, this is the number of RDMAs starting
1578 	 * after the last cut in the request. All previous
1579 	 * segments before the last cut implicitly have 1 RDMA.
1580 	 *
1581 	 * Since the number of RDMAs is not known beforehand,
1582 	 * it must be filled-in retroactively - after each
1583 	 * segmentation cut or at the end of the entire packet.
1584 	 */
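	/* Concretely, the "(req - rdma_count)->rdma_count = ..." store
	 * in the loop below reaches back and patches the request that
	 * began the current run of RDMAs once that run's length is
	 * finally known.
	 */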
1585 
1586 	while (busdma_seg_cnt) {
1587 		/* Break the busdma segment up into pieces*/
1588 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1589 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1590 		len = seg->ds_len;
1591 
1592 		while (len) {
1593 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1594 			seglen = len;
1595 			cum_len_next = cum_len + seglen;
1596 			(req-rdma_count)->rdma_count = rdma_count + 1;
1597 			if (__predict_true(cum_len >= 0)) {
1598 				/* payload */
1599 				chop = (cum_len_next > mss);
1600 				cum_len_next = cum_len_next % mss;
1601 				next_is_first = (cum_len_next == 0);
1602 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1603 				flags_next |= next_is_first *
1604 					MXGEFW_FLAGS_FIRST;
1605 				rdma_count |= -(chop | next_is_first);
1606 				rdma_count += chop & !next_is_first;
1607 			} else if (cum_len_next >= 0) {
1608 				/* header ends */
1609 				rdma_count = -1;
1610 				cum_len_next = 0;
1611 				seglen = -cum_len;
1612 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1613 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1614 					MXGEFW_FLAGS_FIRST |
1615 					(small * MXGEFW_FLAGS_SMALL);
1616 			    }
1617 
1618 			req->addr_high = high_swapped;
1619 			req->addr_low = htobe32(low);
1620 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1621 			req->pad = 0;
1622 			req->rdma_count = 1;
1623 			req->length = htobe16(seglen);
1624 			req->cksum_offset = cksum_offset;
1625 			req->flags = flags | ((cum_len & 1) *
1626 					      MXGEFW_FLAGS_ALIGN_ODD);
1627 			low += seglen;
1628 			len -= seglen;
1629 			cum_len = cum_len_next;
1630 			flags = flags_next;
1631 			req++;
1632 			cnt++;
1633 			rdma_count++;
1634 			if (__predict_false(cksum_offset > seglen))
1635 				cksum_offset -= seglen;
1636 			else
1637 				cksum_offset = 0;
1638 			if (__predict_false(cnt > tx->max_desc))
1639 				goto drop;
1640 		}
1641 		busdma_seg_cnt--;
1642 		seg++;
1643 	}
1644 	(req-rdma_count)->rdma_count = rdma_count;
1645 
1646 	do {
1647 		req--;
1648 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1649 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1650 
1651 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1652 	mxge_submit_req(tx, tx->req_list, cnt);
1653 	return;
1654 
1655 drop:
1656 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1657 	m_freem(m);
1658 	sc->ifp->if_oerrors++;
1659 	if (!once) {
1660 		printf("tx->max_desc exceeded via TSO!\n");
1661 		printf("mss = %d, %ld, %d!\n", mss,
1662 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1663 		once = 1;
1664 	}
1665 	return;
1666 
1667 }
1668 
1669 /*
1670  * We reproduce the software vlan tag insertion from
1671  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1672  * vlan tag insertion. We need to advertise this in order to have the
1673  * vlan interface respect our csum offload flags.
1674  */
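/*
 * Header layout before: [dst(6)][src(6)][type(2)] payload ...
 * Header layout after:  [dst(6)][src(6)][0x8100(2)][tag(2)][type(2)] payload ...
 */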
1675 static struct mbuf *
1676 mxge_vlan_tag_insert(struct mbuf *m)
1677 {
1678 	struct ether_vlan_header *evl;
1679 
1680 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1681 	if (__predict_false(m == NULL))
1682 		return NULL;
1683 	if (m->m_len < sizeof(*evl)) {
1684 		m = m_pullup(m, sizeof(*evl));
1685 		if (__predict_false(m == NULL))
1686 			return NULL;
1687 	}
1688 	/*
1689 	 * Transform the Ethernet header into an Ethernet header
1690 	 * with 802.1Q encapsulation.
1691 	 */
1692 	evl = mtod(m, struct ether_vlan_header *);
1693 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1694 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1695 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1696 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1697 	m->m_flags &= ~M_VLANTAG;
1698 	return m;
1699 }
1700 
1701 static void
1702 mxge_encap(mxge_softc_t *sc, struct mbuf *m)
1703 {
1704 	mcp_kreq_ether_send_t *req;
1705 	bus_dma_segment_t *seg;
1706 	struct mbuf *m_tmp;
1707 	struct ifnet *ifp;
1708 	mxge_tx_buf_t *tx;
1709 	struct ip *ip;
1710 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1711 	uint16_t pseudo_hdr_offset;
1712         uint8_t flags, cksum_offset;
1713 
1714 
1715 
1716 	ifp = sc->ifp;
1717 	tx = &sc->tx;
1718 
1719 	ip_off = sizeof (struct ether_header);
1720 	if (m->m_flags & M_VLANTAG) {
1721 		m = mxge_vlan_tag_insert(m);
1722 		if (__predict_false(m == NULL))
1723 			goto drop;
1724 		ip_off += ETHER_VLAN_ENCAP_LEN;
1725 	}
1726 
1727 	/* (try to) map the frame for DMA */
1728 	idx = tx->req & tx->mask;
1729 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1730 				      m, tx->seg_list, &cnt,
1731 				      BUS_DMA_NOWAIT);
1732 	if (__predict_false(err == EFBIG)) {
1733 		/* Too many segments in the chain.  Try
1734 		   to defrag */
1735 		m_tmp = m_defrag(m, M_NOWAIT);
1736 		if (m_tmp == NULL) {
1737 			goto drop;
1738 		}
1739 		sc->tx_defrag++;
1740 		m = m_tmp;
1741 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1742 					      tx->info[idx].map,
1743 					      m, tx->seg_list, &cnt,
1744 					      BUS_DMA_NOWAIT);
1745 	}
1746 	if (__predict_false(err != 0)) {
1747 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d,"
1748 			      " packet len = %d\n", err, m->m_pkthdr.len);
1749 		goto drop;
1750 	}
1751 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1752 			BUS_DMASYNC_PREWRITE);
1753 	tx->info[idx].m = m;
1754 
1755 
1756 	/* TSO is different enough, we handle it in another routine */
1757 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1758 		mxge_encap_tso(sc, m, cnt, ip_off);
1759 		return;
1760 	}
1761 
1762 	req = tx->req_list;
1763 	cksum_offset = 0;
1764 	pseudo_hdr_offset = 0;
1765 	flags = MXGEFW_FLAGS_NO_TSO;
1766 
1767 	/* checksum offloading? */
1768 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1769 		/* ensure ip header is in first mbuf, copy
1770 		   it to a scratch buffer if not */
1771 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1772 			m_copydata(m, 0, ip_off + sizeof (*ip),
1773 				   sc->scratch);
1774 			ip = (struct ip *)(sc->scratch + ip_off);
1775 		} else {
1776 			ip = (struct ip *)(mtod(m, char *) + ip_off);
1777 		}
1778 		cksum_offset = ip_off + (ip->ip_hl << 2);
1779 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1780 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1781 		req->cksum_offset = cksum_offset;
1782 		flags |= MXGEFW_FLAGS_CKSUM;
1783 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1784 	} else {
1785 		odd_flag = 0;
1786 	}
1787 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1788 		flags |= MXGEFW_FLAGS_SMALL;
1789 
1790 	/* convert segments into a request list */
1791 	cum_len = 0;
1792 	seg = tx->seg_list;
1793 	req->flags = MXGEFW_FLAGS_FIRST;
1794 	for (i = 0; i < cnt; i++) {
1795 		req->addr_low =
1796 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1797 		req->addr_high =
1798 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1799 		req->length = htobe16(seg->ds_len);
1800 		req->cksum_offset = cksum_offset;
1801 		if (cksum_offset > seg->ds_len)
1802 			cksum_offset -= seg->ds_len;
1803 		else
1804 			cksum_offset = 0;
1805 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1806 		req->pad = 0; /* complete solid 16-byte block */
1807 		req->rdma_count = 1;
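		/* branchless select: (cum_len & 1) is 0 or 1, so the
		   multiply ORs in odd_flag only when this segment
		   starts at an odd byte offset */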
1808 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1809 		cum_len += seg->ds_len;
1810 		seg++;
1811 		req++;
1812 		req->flags = 0;
1813 	}
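	/* the loop exits with req pointing one past the last used
	   descriptor, flags pre-cleared; req_list is allocated with
	   spare slots (max_desc + 4), so that write is safe */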
1814 	req--;
1815 	/* pad runts to 60 bytes */
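	/* 60 == ETHER_MIN_LEN (64) minus the 4-byte FCS; the
	   pre-zeroed zeropad_dma buffer supplies the fill, so no
	   mbuf data needs to be copied or extended */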
1816 	if (cum_len < 60) {
1817 		req++;
1818 		req->addr_low =
1819 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
1820 		req->addr_high =
1821 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
1822 		req->length = htobe16(60 - cum_len);
1823 		req->cksum_offset = 0;
1824 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1825 		req->pad = 0; /* complete solid 16-byte block */
1826 		req->rdma_count = 1;
1827 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1828 		cnt++;
1829 	}
1830 
1831 	tx->req_list[0].rdma_count = cnt;
1832 #if 0
1833 	/* print what the firmware will see */
1834 	for (i = 0; i < cnt; i++) {
1835 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1836 		    "cso:%d, flags:0x%x, rdma:%d\n",
1837 		    i, (int)ntohl(tx->req_list[i].addr_high),
1838 		    (int)ntohl(tx->req_list[i].addr_low),
1839 		    (int)ntohs(tx->req_list[i].length),
1840 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1841 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1842 		    tx->req_list[i].rdma_count);
1843 	}
1844 	printf("--------------\n");
1845 #endif
1846 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1847 	mxge_submit_req(tx, tx->req_list, cnt);
1848 	return;
1849 
1850 drop:
1851 	m_freem(m);
1852 	ifp->if_oerrors++;
1853 	return;
1854 }
1855 
1859 static inline void
1860 mxge_start_locked(mxge_softc_t *sc)
1861 {
1862 	struct mbuf *m;
1863 	struct ifnet *ifp;
1864 	mxge_tx_buf_t *tx;
1865 
1866 	ifp = sc->ifp;
1867 	tx = &sc->tx;
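	/* req - done is the count of in-flight descriptors; dequeue
	   only while at least max_desc + 2 slots remain free, which
	   guarantees room for a worst-case frame */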
1868 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
1869 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1870 		if (m == NULL) {
1871 			return;
1872 		}
1873 		/* let BPF see it */
1874 		BPF_MTAP(ifp, m);
1875 
1876 		/* give it to the nic */
1877 		mxge_encap(sc, m);
1878 	}
1879 	/* ran out of transmit slots */
1880 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1881 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1882 		tx->stall++;
1883 	}
1884 }
1885 
1886 static void
1887 mxge_start(struct ifnet *ifp)
1888 {
1889 	mxge_softc_t *sc = ifp->if_softc;
1890 
1891 
1892 	mtx_lock(&sc->tx_mtx);
1893 	mxge_start_locked(sc);
1894 	mtx_unlock(&sc->tx_mtx);
1895 }
1896 
1897 /*
1898  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1899  * at most 32 bytes at a time, so as to avoid involving the software
1900  * pio handler in the nic.   We re-write the first segment's low
1901  * DMA address to mark it valid only after we write the entire chunk
1902  * in a burst
1903  */
1904 static inline void
1905 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1906 		mcp_kreq_ether_recv_t *src)
1907 {
1908 	uint32_t low;
1909 
1910 	low = src->addr_low;
1911 	src->addr_low = 0xffffffff;
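	/* 0xffffffff is treated as an invalid address, so the
	   firmware ignores the group until the real low word is
	   written back below */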
1912 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
1913 	mb();
1914 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
1915 	mb();
1916 	src->addr_low = low;
1917 	dst->addr_low = low;
1918 	mb();
1919 }
1920 
1921 static int
1922 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1923 {
1924 	bus_dma_segment_t seg;
1925 	struct mbuf *m;
1926 	mxge_rx_buf_t *rx = &sc->rx_small;
1927 	int cnt, err;
1928 
1929 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1930 	if (m == NULL) {
1931 		rx->alloc_fail++;
1932 		err = ENOBUFS;
1933 		goto done;
1934 	}
1935 	m->m_len = MHLEN;
1936 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1937 				      &seg, &cnt, BUS_DMA_NOWAIT);
1938 	if (err != 0) {
1939 		m_free(m);
1940 		goto done;
1941 	}
1942 	rx->info[idx].m = m;
1943 	rx->shadow[idx].addr_low =
1944 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1945 	rx->shadow[idx].addr_high =
1946 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1947 
1948 done:
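	/* submit only on every 8th slot, so each PIO write hands the
	   firmware a full group of 8 receive descriptors at once */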
1949 	if ((idx & 7) == 7)
1950 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
1951 	return err;
1952 }
1953 
1954 static int
1955 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1956 {
1957 	bus_dma_segment_t seg[3];
1958 	struct mbuf *m;
1959 	mxge_rx_buf_t *rx = &sc->rx_big;
1960 	int cnt, err, i;
1961 
1962 	if (rx->cl_size == MCLBYTES)
1963 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
1964 	else
1965 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
1966 	if (m == NULL) {
1967 		rx->alloc_fail++;
1968 		err = ENOBUFS;
1969 		goto done;
1970 	}
1971 	m->m_len = rx->cl_size;
1972 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1973 				      seg, &cnt, BUS_DMA_NOWAIT);
1974 	if (err != 0) {
1975 		m_free(m);
1976 		goto done;
1977 	}
1978 	rx->info[idx].m = m;
1979 
1980 	for (i = 0; i < cnt; i++) {
1981 		rx->shadow[idx + i].addr_low =
1982 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
1983 		rx->shadow[idx + i].addr_high =
1984 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
1985 	}
1986 
1987 
1988 done:
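	/* a jumbo frame occupies nbufs consecutive ring slots; walk
	   them all so any 8-slot boundary crossed gets submitted */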
1989 	for (i = 0; i < rx->nbufs; i++) {
1990 		if ((idx & 7) == 7) {
1991 			mxge_submit_8rx(&rx->lanai[idx - 7],
1992 					&rx->shadow[idx - 7]);
1993 		}
1994 		idx++;
1995 	}
1996 	return err;
1997 }
1998 
1999 /*
2000  *  Myri10GE hardware checksums are not valid if the sender
2001  *  padded the frame with non-zero padding.  This is because
2002  *  the firmware just does a simple 16-bit 1s complement
2003  *  checksum across the entire frame, excluding the first 14
2004  *  bytes.  It is best to simply check the checksum and
2005  *  tell the stack about it only if the checksum is good
2006  */
2007 
2008 static inline uint16_t
2009 mxge_rx_csum(struct mbuf *m, int csum)
2010 {
2011 	struct ether_header *eh;
2012 	struct ip *ip;
2013 	uint16_t c;
2014 
2015 	eh = mtod(m, struct ether_header *);
2016 
2017 	/* only deal with IPv4 TCP & UDP for now */
2018 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2019 		return 1;
2020 	ip = (struct ip *)(eh + 1);
2021 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2022 			    ip->ip_p != IPPROTO_UDP))
2023 		return 1;
2024 
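	/* the firmware summed everything past the 14-byte Ethernet
	   header.  A valid IP header folds to 0xffff (one's-complement
	   zero), so its bytes drop out; adding the pseudo-header terms
	   below yields a normal L4 verification value */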
2025 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2026 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2027 			    (ip->ip_hl << 2) + ip->ip_p));
2028 	c ^= 0xffff;
2029 	return (c);
2030 }
2031 
2032 static void
2033 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2034 {
2035 	struct ether_vlan_header *evl;
2036 	struct ether_header *eh;
2037 	uint32_t partial;
2038 
2039 	evl = mtod(m, struct ether_vlan_header *);
2040 	eh = mtod(m, struct ether_header *);
2041 
2042 	/*
2043 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2044 	 * after what the firmware thought was the end of the ethernet
2045 	 * header.
2046 	 */
2047 
2048 	/* put checksum into host byte order */
2049 	*csum = ntohs(*csum);
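	/* one's-complement subtraction: add the complement of the
	   32-bit word holding the stripped bytes, then fold the
	   carries back into 16 bits (twice handles the worst case) */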
2050 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2051 	(*csum) += ~partial;
2052 	(*csum) +=  ((*csum) < ~partial);
2053 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2054 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2055 
2056 	/* restore checksum to network byte order;
2057 	   later consumers expect this */
2058 	*csum = htons(*csum);
2059 
2060 	/* save the tag */
2061 	m->m_flags |= M_VLANTAG;
2062 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2063 
2064 	/*
2065 	 * Remove the 802.1q header by copying the Ethernet
2066 	 * addresses over it and adjusting the beginning of
2067 	 * the data in the mbuf.  The encapsulated Ethernet
2068 	 * type field is already in place.
2069 	 */
2070 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2071 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2072 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2073 }
2074 
2075 
2076 static inline void
2077 mxge_rx_done_big(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2078 {
2079 	struct ifnet *ifp;
2080 	struct mbuf *m;
2081 	struct ether_header *eh;
2082 	mxge_rx_buf_t *rx;
2083 	bus_dmamap_t old_map;
2084 	int idx;
2085 	uint16_t tcpudp_csum;
2086 
2087 	ifp = sc->ifp;
2088 	rx = &sc->rx_big;
2089 	idx = rx->cnt & rx->mask;
2090 	rx->cnt += rx->nbufs;
2091 	/* save a pointer to the received mbuf */
2092 	m = rx->info[idx].m;
2093 	/* try to replace the received mbuf */
2094 	if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
2095 		/* drop the frame -- the old mbuf is re-cycled */
2096 		ifp->if_ierrors++;
2097 		return;
2098 	}
2099 
2100 	/* unmap the received buffer */
2101 	old_map = rx->info[idx].map;
2102 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2103 	bus_dmamap_unload(rx->dmat, old_map);
2104 
2105 	/* swap the bus_dmamap_t's */
2106 	rx->info[idx].map = rx->extra_map;
2107 	rx->extra_map = old_map;
2108 
2109 	/* mcp implicitly skips the first 2 bytes so that the packet
2110 	 * is properly aligned */
2111 	m->m_data += MXGEFW_PAD;
2112 
2113 	m->m_pkthdr.rcvif = ifp;
2114 	m->m_len = m->m_pkthdr.len = len;
2115 	ifp->if_ipackets++;
2116 	eh = mtod(m, struct ether_header *);
2117 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2118 		mxge_vlan_tag_remove(m, &csum);
2119 	}
2120 	/* if the checksum is valid, mark it in the mbuf header */
2121 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2122 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2123 			return;
2124 		/* otherwise, it was a UDP frame, or a TCP frame which
2125 		   we could not do LRO on.  Tell the stack that the
2126 		   checksum is good */
2127 		m->m_pkthdr.csum_data = 0xffff;
2128 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2129 	}
2130 	/* pass the frame up the stack */
2131 	(*ifp->if_input)(ifp, m);
2132 }
2133 
2134 static inline void
2135 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2136 {
2137 	struct ifnet *ifp;
2138 	struct ether_header *eh;
2139 	struct mbuf *m;
2140 	mxge_rx_buf_t *rx;
2141 	bus_dmamap_t old_map;
2142 	int idx;
2143 	uint16_t tcpudp_csum;
2144 
2145 	ifp = sc->ifp;
2146 	rx = &sc->rx_small;
2147 	idx = rx->cnt & rx->mask;
2148 	rx->cnt++;
2149 	/* save a pointer to the received mbuf */
2150 	m = rx->info[idx].m;
2151 	/* try to replace the received mbuf */
2152 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2153 		/* drop the frame -- the old mbuf is re-cycled */
2154 		ifp->if_ierrors++;
2155 		return;
2156 	}
2157 
2158 	/* unmap the received buffer */
2159 	old_map = rx->info[idx].map;
2160 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2161 	bus_dmamap_unload(rx->dmat, old_map);
2162 
2163 	/* swap the bus_dmamap_t's */
2164 	rx->info[idx].map = rx->extra_map;
2165 	rx->extra_map = old_map;
2166 
2167 	/* mcp implicitly skips the first 2 bytes so that the packet
2168 	 * is properly aligned */
2169 	m->m_data += MXGEFW_PAD;
2170 
2171 	m->m_pkthdr.rcvif = ifp;
2172 	m->m_len = m->m_pkthdr.len = len;
2173 	ifp->if_ipackets++;
2174 	eh = mtod(m, struct ether_header *);
2175 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2176 		mxge_vlan_tag_remove(m, &csum);
2177 	}
2178 	/* if the checksum is valid, mark it in the mbuf header */
2179 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2180 		if (sc->lro_cnt && (0 == mxge_lro_rx(sc, m, csum)))
2181 			return;
2182 		/* otherwise, it was a UDP frame, or a TCP frame which
2183 		   we could not do LRO on.  Tell the stack that the
2184 		   checksum is good */
2185 		m->m_pkthdr.csum_data = 0xffff;
2186 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2187 	}
2188 
2189 	/* pass the frame up the stack */
2190 	(*ifp->if_input)(ifp, m);
2191 }
2192 
2193 static inline void
2194 mxge_clean_rx_done(mxge_softc_t *sc)
2195 {
2196 	mxge_rx_done_t *rx_done = &sc->rx_done;
2197 	struct lro_entry *lro;
2198 	int limit = 0;
2199 	uint16_t length;
2200 	uint16_t checksum;
2201 
2202 
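	/* a non-zero length marks a completion the NIC has DMA'd in;
	   zeroing it hands the slot back for the firmware's next lap */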
2203 	while (rx_done->entry[rx_done->idx].length != 0) {
2204 		length = ntohs(rx_done->entry[rx_done->idx].length);
2205 		rx_done->entry[rx_done->idx].length = 0;
2206 		checksum = rx_done->entry[rx_done->idx].checksum;
2207 		if (length <= (MHLEN - MXGEFW_PAD))
2208 			mxge_rx_done_small(sc, length, checksum);
2209 		else
2210 			mxge_rx_done_big(sc, length, checksum);
2211 		rx_done->cnt++;
2212 		rx_done->idx = rx_done->cnt & rx_done->mask;
2213 
2214 		/* limit potential for livelock */
2215 		if (__predict_false(++limit > 2 * rx_done->mask))
2216 			break;
2217 	}
2218 	while (!SLIST_EMPTY(&sc->lro_active)) {
2219 		lro = SLIST_FIRST(&sc->lro_active);
2220 		SLIST_REMOVE_HEAD(&sc->lro_active, next);
2221 		mxge_lro_flush(sc, lro);
2222 	}
2223 }
2224 
2225 
2226 static inline void
2227 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2228 {
2229 	struct ifnet *ifp;
2230 	mxge_tx_buf_t *tx;
2231 	struct mbuf *m;
2232 	bus_dmamap_t map;
2233 	int idx, limit;
2234 
2235 	limit = 0;
2236 	tx = &sc->tx;
2237 	ifp = sc->ifp;
2238 	while (tx->pkt_done != mcp_idx) {
2239 		idx = tx->done & tx->mask;
2240 		tx->done++;
2241 		m = tx->info[idx].m;
2242 		/* mbuf and DMA map only attached to the first
2243 		   segment per-mbuf */
2244 		if (m != NULL) {
2245 			ifp->if_opackets++;
2246 			tx->info[idx].m = NULL;
2247 			map = tx->info[idx].map;
2248 			bus_dmamap_unload(tx->dmat, map);
2249 			m_freem(m);
2250 		}
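		/* flag was set on a frame's final descriptor at
		   submit time, so pkt_done counts whole frames */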
2251 		if (tx->info[idx].flag) {
2252 			tx->info[idx].flag = 0;
2253 			tx->pkt_done++;
2254 		}
2255 		/* limit potential for livelock by only handling
2256 		   2 full tx rings per call */
2257 		if (__predict_false(++limit >  2 * tx->mask))
2258 			break;
2259 	}
2260 
2261 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack
2262 	   that it's OK to send packets */
2263 
2264 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2265 	    tx->req - tx->done < (tx->mask + 1)/4) {
2266 		mtx_lock(&sc->tx_mtx);
2267 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2268 		sc->tx.wake++;
2269 		mxge_start_locked(sc);
2270 		mtx_unlock(&sc->tx_mtx);
2271 	}
2272 }
2273 
2274 static void
2275 mxge_intr(void *arg)
2276 {
2277 	mxge_softc_t *sc = arg;
2278 	mcp_irq_data_t *stats = sc->fw_stats;
2279 	mxge_tx_buf_t *tx = &sc->tx;
2280 	mxge_rx_done_t *rx_done = &sc->rx_done;
2281 	uint32_t send_done_count;
2282 	uint8_t valid;
2283 
2284 
2285 	/* make sure the DMA has finished */
2286 	if (!stats->valid) {
2287 		return;
2288 	}
2289 	valid = stats->valid;
2290 
2291 	if (!sc->msi_enabled) {
2292 		/* lower legacy IRQ  */
2293 		*sc->irq_deassert = 0;
2294 		if (!mxge_deassert_wait)
2295 			/* don't wait for confirmation that the irq is low */
2296 			stats->valid = 0;
2297 	} else {
2298 		stats->valid = 0;
2299 	}
2300 
2301 	/* loop while waiting for legacy irq deassertion */
2302 	do {
2303 		/* check for transmit completes and receives */
2304 		send_done_count = be32toh(stats->send_done_count);
2305 		while ((send_done_count != tx->pkt_done) ||
2306 		       (rx_done->entry[rx_done->idx].length != 0)) {
2307 			mxge_tx_done(sc, (int)send_done_count);
2308 			mxge_clean_rx_done(sc);
2309 			send_done_count = be32toh(stats->send_done_count);
2310 		}
2311 	} while (*((volatile uint8_t *) &stats->valid));
2312 
2313 	if (__predict_false(stats->stats_updated)) {
2314 		if (sc->link_state != stats->link_up) {
2315 			sc->link_state = stats->link_up;
2316 			if (sc->link_state) {
2317 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2318 				if (mxge_verbose)
2319 					device_printf(sc->dev, "link up\n");
2320 			} else {
2321 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2322 				if (mxge_verbose)
2323 					device_printf(sc->dev, "link down\n");
2324 			}
2325 		}
2326 		if (sc->rdma_tags_available !=
2327 		    be32toh(sc->fw_stats->rdma_tags_available)) {
2328 			sc->rdma_tags_available =
2329 				be32toh(sc->fw_stats->rdma_tags_available);
2330 			device_printf(sc->dev, "RDMA timed out! %d tags "
2331 				      "left\n", sc->rdma_tags_available);
2332 		}
2333 		sc->down_cnt += stats->link_down;
2334 	}
2335 
2336 	/* check to see if we have rx token to pass back */
2337 	if (valid & 0x1)
2338 		*sc->irq_claim = be32toh(3);
2339 	*(sc->irq_claim + 1) = be32toh(3);
2340 }
2341 
2342 static void
2343 mxge_init(void *arg)
2344 {
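	/* nothing to do here: the NIC is brought up through the
	   SIOCSIFFLAGS ioctl path (mxge_open), not via if_init */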
2345 }
2346 
2349 static void
2350 mxge_free_mbufs(mxge_softc_t *sc)
2351 {
2352 	int i;
2353 
2354 	for (i = 0; i <= sc->rx_big.mask; i++) {
2355 		if (sc->rx_big.info[i].m == NULL)
2356 			continue;
2357 		bus_dmamap_unload(sc->rx_big.dmat,
2358 				  sc->rx_big.info[i].map);
2359 		m_freem(sc->rx_big.info[i].m);
2360 		sc->rx_big.info[i].m = NULL;
2361 	}
2362 
2363 	for (i = 0; i <= sc->rx_small.mask; i++) {
2364 		if (sc->rx_small.info[i].m == NULL)
2365 			continue;
2366 		bus_dmamap_unload(sc->rx_small.dmat,
2367 				  sc->rx_small.info[i].map);
2368 		m_freem(sc->rx_small.info[i].m);
2369 		sc->rx_small.info[i].m = NULL;
2370 	}
2371 
2372 	for (i = 0; i <= sc->tx.mask; i++) {
2373 		sc->tx.info[i].flag = 0;
2374 		if (sc->tx.info[i].m == NULL)
2375 			continue;
2376 		bus_dmamap_unload(sc->tx.dmat,
2377 				  sc->tx.info[i].map);
2378 		m_freem(sc->tx.info[i].m);
2379 		sc->tx.info[i].m = NULL;
2380 	}
2381 }
2382 
2383 static void
2384 mxge_free_rings(mxge_softc_t *sc)
2385 {
2386 	int i;
2387 
2388 	if (sc->rx_done.entry != NULL)
2389 		mxge_dma_free(&sc->rx_done.dma);
2390 	sc->rx_done.entry = NULL;
2391 	if (sc->tx.req_bytes != NULL)
2392 		free(sc->tx.req_bytes, M_DEVBUF);
2393 	if (sc->tx.seg_list != NULL)
2394 		free(sc->tx.seg_list, M_DEVBUF);
2395 	if (sc->rx_small.shadow != NULL)
2396 		free(sc->rx_small.shadow, M_DEVBUF);
2397 	if (sc->rx_big.shadow != NULL)
2398 		free(sc->rx_big.shadow, M_DEVBUF);
2399 	if (sc->tx.info != NULL) {
2400 		if (sc->tx.dmat != NULL) {
2401 			for (i = 0; i <= sc->tx.mask; i++) {
2402 				bus_dmamap_destroy(sc->tx.dmat,
2403 						   sc->tx.info[i].map);
2404 			}
2405 			bus_dma_tag_destroy(sc->tx.dmat);
2406 		}
2407 		free(sc->tx.info, M_DEVBUF);
2408 	}
2409 	if (sc->rx_small.info != NULL) {
2410 		if (sc->rx_small.dmat != NULL) {
2411 			for (i = 0; i <= sc->rx_small.mask; i++) {
2412 				bus_dmamap_destroy(sc->rx_small.dmat,
2413 						   sc->rx_small.info[i].map);
2414 			}
2415 			bus_dmamap_destroy(sc->rx_small.dmat,
2416 					   sc->rx_small.extra_map);
2417 			bus_dma_tag_destroy(sc->rx_small.dmat);
2418 		}
2419 		free(sc->rx_small.info, M_DEVBUF);
2420 	}
2421 	if (sc->rx_big.info != NULL) {
2422 		if (sc->rx_big.dmat != NULL) {
2423 			for (i = 0; i <= sc->rx_big.mask; i++) {
2424 				bus_dmamap_destroy(sc->rx_big.dmat,
2425 						   sc->rx_big.info[i].map);
2426 			}
2427 			bus_dmamap_destroy(sc->rx_big.dmat,
2428 					   sc->rx_big.extra_map);
2429 			bus_dma_tag_destroy(sc->rx_big.dmat);
2430 		}
2431 		free(sc->rx_big.info, M_DEVBUF);
2432 	}
2433 }
2434 
2435 static int
2436 mxge_alloc_rings(mxge_softc_t *sc)
2437 {
2438 	mxge_cmd_t cmd;
2439 	int tx_ring_size, rx_ring_size;
2440 	int tx_ring_entries, rx_ring_entries;
2441 	int i, err;
2442 	unsigned long bytes;
2443 
2444 	/* get ring sizes */
2445 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2446 	tx_ring_size = cmd.data0;
2447 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2448 	if (err != 0) {
2449 		device_printf(sc->dev, "Cannot determine ring sizes\n");
2450 		goto abort_with_nothing;
2451 	}
2452 
2453 	rx_ring_size = cmd.data0;
2454 
2455 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2456 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
2457 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2458 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2459 	IFQ_SET_READY(&sc->ifp->if_snd);
2460 
2461 	sc->tx.mask = tx_ring_entries - 1;
2462 	sc->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2463 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
2464 	sc->rx_done.mask = (2 * rx_ring_entries) - 1;
2465 
2466 	err = ENOMEM;
2467 
2468 	/* allocate interrupt queues */
2469 	bytes = (sc->rx_done.mask + 1) * sizeof (*sc->rx_done.entry);
2470 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2471 	if (err != 0)
2472 		goto abort_with_nothing;
2473 	sc->rx_done.entry = sc->rx_done.dma.addr;
2474 	bzero(sc->rx_done.entry, bytes);
2475 
2476 	/* allocate the tx request copy block */
2477 	bytes = 8 +
2478 		sizeof (*sc->tx.req_list) * (sc->tx.max_desc + 4);
2479 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2480 	if (sc->tx.req_bytes == NULL)
2481 		goto abort_with_alloc;
2482 	/* ensure req_list entries are aligned to 8 bytes */
2483 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2484 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
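	/* (addr + 7) & ~7 rounds up to the next 8-byte boundary;
	   the 8 extra bytes allocated above absorb the shift */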
2485 
2486 	/* allocate the tx busdma segment list */
2487 	bytes = sizeof (*sc->tx.seg_list) * sc->tx.max_desc;
2488 	sc->tx.seg_list = (bus_dma_segment_t *)
2489 		malloc(bytes, M_DEVBUF, M_WAITOK);
2490 	if (sc->tx.seg_list == NULL)
2491 		goto abort_with_alloc;
2492 
2493 	/* allocate the rx shadow rings */
2494 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2495 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2496 	if (sc->rx_small.shadow == NULL)
2497 		goto abort_with_alloc;
2498 
2499 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2500 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2501 	if (sc->rx_big.shadow == NULL)
2502 		goto abort_with_alloc;
2503 
2504 	/* allocate the host info rings */
2505 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2506 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2507 	if (sc->tx.info == NULL)
2508 		goto abort_with_alloc;
2509 
2510 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2511 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2512 	if (sc->rx_small.info == NULL)
2513 		goto abort_with_alloc;
2514 
2515 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2516 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2517 	if (sc->rx_big.info == NULL)
2518 		goto abort_with_alloc;
2519 
2520 	/* allocate the busdma resources */
2521 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2522 				 1,			/* alignment */
2523 				 sc->tx.boundary,	/* boundary */
2524 				 BUS_SPACE_MAXADDR,	/* low */
2525 				 BUS_SPACE_MAXADDR,	/* high */
2526 				 NULL, NULL,		/* filter */
2527 				 65536 + 256,		/* maxsize */
2528 				 sc->tx.max_desc - 2,	/* num segs */
2529 				 sc->tx.boundary,	/* maxsegsize */
2530 				 BUS_DMA_ALLOCNOW,	/* flags */
2531 				 NULL, NULL,		/* lock */
2532 				 &sc->tx.dmat);		/* tag */
2533 
2534 	if (err != 0) {
2535 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2536 			      err);
2537 		goto abort_with_alloc;
2538 	}
2539 
2540 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2541 				 1,			/* alignment */
2542 				 4096,			/* boundary */
2543 				 BUS_SPACE_MAXADDR,	/* low */
2544 				 BUS_SPACE_MAXADDR,	/* high */
2545 				 NULL, NULL,		/* filter */
2546 				 MHLEN,			/* maxsize */
2547 				 1,			/* num segs */
2548 				 MHLEN,			/* maxsegsize */
2549 				 BUS_DMA_ALLOCNOW,	/* flags */
2550 				 NULL, NULL,		/* lock */
2551 				 &sc->rx_small.dmat);	/* tag */
2552 	if (err != 0) {
2553 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2554 			      err);
2555 		goto abort_with_alloc;
2556 	}
2557 
2558 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2559 				 1,			/* alignment */
2560 				 4096,			/* boundary */
2561 				 BUS_SPACE_MAXADDR,	/* low */
2562 				 BUS_SPACE_MAXADDR,	/* high */
2563 				 NULL, NULL,		/* filter */
2564 				 3*4096,		/* maxsize */
2565 				 3,			/* num segs */
2566 				 4096,			/* maxsegsize */
2567 				 BUS_DMA_ALLOCNOW,	/* flags */
2568 				 NULL, NULL,		/* lock */
2569 				 &sc->rx_big.dmat);	/* tag */
2570 	if (err != 0) {
2571 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2572 			      err);
2573 		goto abort_with_alloc;
2574 	}
2575 
2576 	/* now use these tags to set up dmamaps for each slot
2577 	   in each ring */
2578 	for (i = 0; i <= sc->tx.mask; i++) {
2579 		err = bus_dmamap_create(sc->tx.dmat, 0,
2580 					&sc->tx.info[i].map);
2581 		if (err != 0) {
2582 			device_printf(sc->dev, "Err %d  tx dmamap\n",
2583 			      err);
2584 			goto abort_with_alloc;
2585 		}
2586 	}
2587 	for (i = 0; i <= sc->rx_small.mask; i++) {
2588 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2589 					&sc->rx_small.info[i].map);
2590 		if (err != 0) {
2591 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2592 				      err);
2593 			goto abort_with_alloc;
2594 		}
2595 	}
2596 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2597 				&sc->rx_small.extra_map);
2598 	if (err != 0) {
2599 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2600 			      err);
2601 		goto abort_with_alloc;
2602 	}
2603 
2604 	for (i = 0; i <= sc->rx_big.mask; i++) {
2605 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2606 					&sc->rx_big.info[i].map);
2607 		if (err != 0) {
2608 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2609 			      err);
2610 			goto abort_with_alloc;
2611 		}
2612 	}
2613 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2614 				&sc->rx_big.extra_map);
2615 	if (err != 0) {
2616 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2617 			      err);
2618 		goto abort_with_alloc;
2619 	}
2620 	return 0;
2621 
2622 abort_with_alloc:
2623 	mxge_free_rings(sc);
2624 
2625 abort_with_nothing:
2626 	return err;
2627 }
2628 
2629 static void
2630 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
2631 {
2632 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
2633 
2634 	if (bufsize < MCLBYTES) {
2635 		/* easy, everything fits in a single buffer */
2636 		*big_buf_size = MCLBYTES;
2637 		*cl_size = MCLBYTES;
2638 		*nbufs = 1;
2639 		return;
2640 	}
2641 
2642 	if (bufsize < MJUMPAGESIZE) {
2643 		/* still easy, everything still fits in a single buffer */
2644 		*big_buf_size = MJUMPAGESIZE;
2645 		*cl_size = MJUMPAGESIZE;
2646 		*nbufs = 1;
2647 		return;
2648 	}
2649 	/* now we need to use virtually contiguous buffers */
2650 	*cl_size = MJUM9BYTES;
2651 	*big_buf_size = 4096;
2652 	*nbufs = mtu / 4096 + 1;
2653 	/* needs to be a power of two, so round up */
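	/* with MTUs capped near 9000 by this driver, nbufs can only
	   be 1, 2, or 3 here, so 3 is the lone case to round up */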
2654 	if (*nbufs == 3)
2655 		*nbufs = 4;
2656 }
2657 
2658 static int
2659 mxge_open(mxge_softc_t *sc)
2660 {
2661 	mxge_cmd_t cmd;
2662 	int i, err, big_bytes;
2663 	bus_dmamap_t map;
2664 	bus_addr_t bus;
2665 	struct lro_entry *lro_entry;
2666 
2667 	SLIST_INIT(&sc->lro_free);
2668 	SLIST_INIT(&sc->lro_active);
2669 
2670 	for (i = 0; i < sc->lro_cnt; i++) {
2671 		lro_entry = (struct lro_entry *)
2672 			malloc(sizeof (*lro_entry), M_DEVBUF, M_NOWAIT | M_ZERO);
2673 		if (lro_entry == NULL) {
2674 			sc->lro_cnt = i;
2675 			break;
2676 		}
2677 		SLIST_INSERT_HEAD(&sc->lro_free, lro_entry, next);
2678 	}
2679 
2680 	/* Copy the MAC address in case it was overridden */
2681 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2682 
2683 	err = mxge_reset(sc, 1);
2684 	if (err != 0) {
2685 		device_printf(sc->dev, "failed to reset\n");
2686 		return EIO;
2687 	}
2688 
2689 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes,
2690 			   &sc->rx_big.cl_size, &sc->rx_big.nbufs);
2691 
2692 	cmd.data0 = sc->rx_big.nbufs;
2693 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
2694 			    &cmd);
2695 	/* error is only meaningful if we're trying to set
2696 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
2697 	if (err && sc->rx_big.nbufs > 1) {
2698 		device_printf(sc->dev,
2699 			      "Failed to set always-use-n to %d\n",
2700 			      sc->rx_big.nbufs);
2701 		return EIO;
2702 	}
2703 	/* get the lanai pointers to the send and receive rings */
2704 
2705 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2706 	sc->tx.lanai =
2707 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2708 	err |= mxge_send_cmd(sc,
2709 				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2710 	sc->rx_small.lanai =
2711 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2712 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2713 	sc->rx_big.lanai =
2714 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2715 
2716 	if (err != 0) {
2717 		device_printf(sc->dev,
2718 			      "failed to get ring sizes or locations\n");
2719 		return EIO;
2720 	}
2721 
2722 	/* stock receive rings */
2723 	for (i = 0; i <= sc->rx_small.mask; i++) {
2724 		map = sc->rx_small.info[i].map;
2725 		err = mxge_get_buf_small(sc, map, i);
2726 		if (err) {
2727 			device_printf(sc->dev, "alloced %d/%d smalls\n",
2728 				      i, sc->rx_small.mask + 1);
2729 			goto abort;
2730 		}
2731 	}
2732 	for (i = 0; i <= sc->rx_big.mask; i++) {
2733 		sc->rx_big.shadow[i].addr_low = 0xffffffff;
2734 		sc->rx_big.shadow[i].addr_high = 0xffffffff;
2735 	}
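	/* pre-mark every big-ring slot invalid (0xffffffff); the
	   stocking loop below overwrites real addresses in nbufs
	   groups before they are submitted */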
2736 	for (i = 0; i <= sc->rx_big.mask; i += sc->rx_big.nbufs) {
2737 		map = sc->rx_big.info[i].map;
2738 		err = mxge_get_buf_big(sc, map, i);
2739 		if (err) {
2740 			device_printf(sc->dev, "alloced %d/%d bigs\n",
2741 				      i, sc->rx_big.mask + 1);
2742 			goto abort;
2743 		}
2744 	}
2745 
2746 	/* Give the firmware the mtu and the big and small buffer
2747 	   sizes.  The firmware wants the big buf size to be a power
2748 	   of two. Luckily, FreeBSD's clusters are powers of two */
2749 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
2750 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2751 	cmd.data0 = MHLEN - MXGEFW_PAD;
2752 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2753 			     &cmd);
2754 	cmd.data0 = big_bytes;
2755 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2756 
2757 	if (err != 0) {
2758 		device_printf(sc->dev, "failed to setup params\n");
2759 		goto abort;
2760 	}
2761 
2762 	/* Now give the firmware the pointer to the stats block */
2763 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2764 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2765 	cmd.data2 = sizeof(struct mcp_irq_data);
2766 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
2767 
2768 	if (err != 0) {
2769 		bus = sc->fw_stats_dma.bus_addr;
2770 		bus += offsetof(struct mcp_irq_data, send_done_count);
2771 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
2772 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
2773 		err = mxge_send_cmd(sc,
2774 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
2775 				    &cmd);
2776 		/* Firmware cannot support multicast without STATS_DMA_V2 */
2777 		sc->fw_multicast_support = 0;
2778 	} else {
2779 		sc->fw_multicast_support = 1;
2780 	}
2781 
2782 	if (err != 0) {
2783 		device_printf(sc->dev, "failed to setup params\n");
2784 		goto abort;
2785 	}
2786 
2787 	/* Finally, start the firmware running */
2788 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2789 	if (err) {
2790 		device_printf(sc->dev, "Couldn't bring up link\n");
2791 		goto abort;
2792 	}
2793 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2794 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2795 
2796 	return 0;
2797 
2798 
2799 abort:
2800 	mxge_free_mbufs(sc);
2801 
2802 	return err;
2803 }
2804 
2805 static int
2806 mxge_close(mxge_softc_t *sc)
2807 {
2808 	struct lro_entry *lro_entry;
2809 	mxge_cmd_t cmd;
2810 	int err, old_down_cnt;
2811 
2812 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2813 	old_down_cnt = sc->down_cnt;
2814 	mb();
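	/* mxge_intr increments down_cnt when the firmware reports
	   link_down, so a change below confirms the command landed */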
2815 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2816 	if (err) {
2817 		device_printf(sc->dev, "Couldn't bring down link\n");
2818 	}
2819 	if (old_down_cnt == sc->down_cnt) {
2820 		/* wait for down irq */
2821 		DELAY(10 * sc->intr_coal_delay);
2822 	}
2823 	if (old_down_cnt == sc->down_cnt) {
2824 		device_printf(sc->dev, "never got down irq\n");
2825 	}
2826 
2827 	mxge_free_mbufs(sc);
2828 
2829 	while (!SLIST_EMPTY(&sc->lro_free)) {
2830 		lro_entry = SLIST_FIRST(&sc->lro_free);
2831 		SLIST_REMOVE_HEAD(&sc->lro_free, next);
		free(lro_entry, M_DEVBUF); /* malloc'd in mxge_open; don't leak */
2832 	}
2833 	return 0;
2834 }
2835 
2836 static void
2837 mxge_setup_cfg_space(mxge_softc_t *sc)
2838 {
2839 	device_t dev = sc->dev;
2840 	int reg;
2841 	uint16_t cmd, lnk, pectl;
2842 
2843 	/* find the PCIe link width and set max read request to 4KB */
2844 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
2845 		lnk = pci_read_config(dev, reg + 0x12, 2);
2846 		sc->link_width = (lnk >> 4) & 0x3f;
2847 
2848 		pectl = pci_read_config(dev, reg + 0x8, 2);
2849 		pectl = (pectl & ~0x7000) | (5 << 12);
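		/* bits 14:12 of the PCIe device control register hold
		   max read request size as 128 << n; n == 5 -> 4096 */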
2850 		pci_write_config(dev, reg + 0x8, pectl, 2);
2851 	}
2852 
2853 	/* Enable DMA and Memory space access */
2854 	pci_enable_busmaster(dev);
2855 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2856 	cmd |= PCIM_CMD_MEMEN;
2857 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2858 }
2859 
2860 static uint32_t
2861 mxge_read_reboot(mxge_softc_t *sc)
2862 {
2863 	device_t dev = sc->dev;
2864 	uint32_t vs;
2865 
2866 	/* find the vendor specific offset */
2867 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2868 		device_printf(sc->dev,
2869 			      "could not find vendor specific offset\n");
2870 		return (uint32_t)-1;
2871 	}
2872 	/* enable read32 mode */
2873 	pci_write_config(dev, vs + 0x10, 0x3, 1);
2874 	/* tell NIC which register to read */
2875 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
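	/* 0xfffffff0 appears to be the NIC-side address of the reboot
	   status word; the read-back window is at vs + 0x14 */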
2876 	return (pci_read_config(dev, vs + 0x14, 4));
2877 }
2878 
2879 static void
2880 mxge_watchdog_reset(mxge_softc_t *sc)
2881 {
2882 	int err;
2883 	uint32_t reboot;
2884 	uint16_t cmd;
2885 
2886 	err = ENXIO;
2887 
2888 	device_printf(sc->dev, "Watchdog reset!\n");
2889 
2890 	/*
2891 	 * check to see if the NIC rebooted.  If it did, then all of
2892 	 * PCI config space has been reset, and things like the
2893 	 * busmaster bit will be zero.  If this is the case, then we
2894 	 * must restore PCI config space before the NIC can be used
2895 	 * again
2896 	 */
2897 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2898 	if (cmd == 0xffff) {
2899 		/*
2900 		 * maybe the watchdog caught the NIC rebooting; wait
2901 		 * up to 100ms for it to finish.  If it does not come
2902 		 * back, then give up
2903 		 */
2904 		DELAY(1000*100);
2905 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2906 		if (cmd == 0xffff) {
2907 			device_printf(sc->dev, "NIC disappeared!\n");
2908 			goto abort;
2909 		}
2910 	}
2911 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
2912 		/* print the reboot status */
2913 		reboot = mxge_read_reboot(sc);
2914 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
2915 			      reboot);
2916 		/* restore PCI configuration space */
2917 
2918 		/* XXXX waiting for pci_cfg_restore() to be exported */
2919 		goto abort; /* just abort for now */
2920 
2921 		/* and redo any changes we made to our config space */
2922 		mxge_setup_cfg_space(sc);
2923 	} else {
2924 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
2925 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
2926 			      sc->tx.req, sc->tx.done);
2927 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
2928 			      sc->tx.pkt_done,
2929 			      be32toh(sc->fw_stats->send_done_count));
2930 	}
2931 
2932 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
2933 		mxge_close(sc);
2934 		err = mxge_open(sc);
2935 	}
2936 
2937 abort:
2938 	/*
2939 	 * stop the watchdog if the nic is dead, to avoid spamming the
2940 	 * console
2941 	 */
2942 	if (err != 0) {
2943 		callout_stop(&sc->co_hdl);
2944 	}
2945 }
2946 
2947 static void
2948 mxge_watchdog(mxge_softc_t *sc)
2949 {
2950 	mxge_tx_buf_t *tx = &sc->tx;
2951 
2952 	/* see if we have outstanding transmits that have been
2953 	   pending for more than one mxge_ticks interval */
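	/* the three-way test fires only when work was already pending
	   at the previous tick (watchdog_req != watchdog_done) and no
	   completions have advanced since (done == watchdog_done) */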
2954 	if (tx->req != tx->done &&
2955 	    tx->watchdog_req != tx->watchdog_done &&
2956 	    tx->done == tx->watchdog_done)
2957 		mxge_watchdog_reset(sc);
2958 
2959 	tx->watchdog_req = tx->req;
2960 	tx->watchdog_done = tx->done;
2961 }
2962 
2963 static void
2964 mxge_tick(void *arg)
2965 {
2966 	mxge_softc_t *sc = arg;
2967 
2968 
2969 	/* Synchronize with possible callout reset/stop. */
2970 	if (callout_pending(&sc->co_hdl) ||
2971 	    !callout_active(&sc->co_hdl)) {
2972 		mtx_unlock(&sc->driver_mtx);
2973 		return;
2974 	}
2975 
2976 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2977 	mxge_watchdog(sc);
2978 }
2979 
2980 static int
2981 mxge_media_change(struct ifnet *ifp)
2982 {
2983 	return EINVAL;
2984 }
2985 
2986 static int
2987 mxge_change_mtu(mxge_softc_t *sc, int mtu)
2988 {
2989 	struct ifnet *ifp = sc->ifp;
2990 	int real_mtu, old_mtu;
2991 	int err = 0;
2992 
2993 
2994 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
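	/* real_mtu is the wire-level frame size: payload plus the
	   14-byte Ethernet header and a possible 4-byte VLAN tag */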
2995 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
2996 		return EINVAL;
2997 	mtx_lock(&sc->driver_mtx);
2998 	old_mtu = ifp->if_mtu;
2999 	ifp->if_mtu = mtu;
3000 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3001 		callout_stop(&sc->co_hdl);
3002 		mxge_close(sc);
3003 		err = mxge_open(sc);
3004 		if (err != 0) {
3005 			ifp->if_mtu = old_mtu;
3006 			mxge_close(sc);
3007 			(void) mxge_open(sc);
3008 		}
3009 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3010 	}
3011 	mtx_unlock(&sc->driver_mtx);
3012 	return err;
3013 }
3014 
3015 static void
3016 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3017 {
3018 	mxge_softc_t *sc = ifp->if_softc;
3019 
3020 
3021 	if (sc == NULL)
3022 		return;
3023 	ifmr->ifm_status = IFM_AVALID;
3024 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
3025 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3026 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
3027 }
3028 
3029 static int
3030 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3031 {
3032 	mxge_softc_t *sc = ifp->if_softc;
3033 	struct ifreq *ifr = (struct ifreq *)data;
3034 	int err, mask;
3035 
3036 	err = 0;
3037 	switch (command) {
3038 	case SIOCSIFADDR:
3039 	case SIOCGIFADDR:
3040 		err = ether_ioctl(ifp, command, data);
3041 		break;
3042 
3043 	case SIOCSIFMTU:
3044 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3045 		break;
3046 
3047 	case SIOCSIFFLAGS:
3048 		mtx_lock(&sc->driver_mtx);
3049 		if (ifp->if_flags & IFF_UP) {
3050 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3051 				err = mxge_open(sc);
3052 				callout_reset(&sc->co_hdl, mxge_ticks,
3053 					      mxge_tick, sc);
3054 			} else {
3055 				/* take care of promisc and allmulti
3056 				   flag changes */
3057 				mxge_change_promisc(sc,
3058 						    ifp->if_flags & IFF_PROMISC);
3059 				mxge_set_multicast_list(sc);
3060 			}
3061 		} else {
3062 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3063 				mxge_close(sc);
3064 				callout_stop(&sc->co_hdl);
3065 			}
3066 		}
3067 		mtx_unlock(&sc->driver_mtx);
3068 		break;
3069 
3070 	case SIOCADDMULTI:
3071 	case SIOCDELMULTI:
3072 		mtx_lock(&sc->driver_mtx);
3073 		mxge_set_multicast_list(sc);
3074 		mtx_unlock(&sc->driver_mtx);
3075 		break;
3076 
3077 	case SIOCSIFCAP:
3078 		mtx_lock(&sc->driver_mtx);
3079 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
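		/* mask has a bit set for each capability the request
		   asks to toggle relative to the current state */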
3080 		if (mask & IFCAP_TXCSUM) {
3081 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3082 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3083 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3084 						      | CSUM_TSO);
3085 			} else {
3086 				ifp->if_capenable |= IFCAP_TXCSUM;
3087 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3088 			}
3089 		} else if (mask & IFCAP_RXCSUM) {
3090 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3091 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3092 				sc->csum_flag = 0;
3093 			} else {
3094 				ifp->if_capenable |= IFCAP_RXCSUM;
3095 				sc->csum_flag = 1;
3096 			}
3097 		}
3098 		if (mask & IFCAP_TSO4) {
3099 			if (IFCAP_TSO4 & ifp->if_capenable) {
3100 				ifp->if_capenable &= ~IFCAP_TSO4;
3101 				ifp->if_hwassist &= ~CSUM_TSO;
3102 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3103 				ifp->if_capenable |= IFCAP_TSO4;
3104 				ifp->if_hwassist |= CSUM_TSO;
3105 			} else {
3106 				printf("mxge requires tx checksum offload"
3107 				       " be enabled to use TSO\n");
3108 				err = EINVAL;
3109 			}
3110 		}
3111 
3112 		if (mask & IFCAP_VLAN_HWTAGGING)
3113 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3114 		mtx_unlock(&sc->driver_mtx);
3115 		VLAN_CAPABILITIES(ifp);
3116 
3117 		break;
3118 
3119 	case SIOCGIFMEDIA:
3120 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3121 				    &sc->media, command);
3122 		break;
3123 
3124 	default:
3125 		err = ENOTTY;
3126 	}
3127 	return err;
3128 }
3129 
3130 static void
3131 mxge_fetch_tunables(mxge_softc_t *sc)
3132 {
3133 
3134 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3135 			  &mxge_flow_control);
3136 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3137 			  &mxge_intr_coal_delay);
3138 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3139 			  &mxge_nvidia_ecrc_enable);
3140 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3141 			  &mxge_force_firmware);
3142 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3143 			  &mxge_deassert_wait);
3144 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3145 			  &mxge_verbose);
3146 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3147 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3148 
3149 	if (bootverbose)
3150 		mxge_verbose = 1;
3151 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3152 		mxge_intr_coal_delay = 30;
3153 	if (mxge_ticks == 0)
3154 		mxge_ticks = hz;
3155 	sc->pause = mxge_flow_control;
3156 
3157 }
3158 
3159 static int
3160 mxge_attach(device_t dev)
3161 {
3162 	mxge_softc_t *sc = device_get_softc(dev);
3163 	struct ifnet *ifp;
3164 	int count, rid, err;
3165 
3166 	sc->dev = dev;
3167 	mxge_fetch_tunables(sc);
3168 
3169 	err = bus_dma_tag_create(NULL,			/* parent */
3170 				 1,			/* alignment */
3171 				 4096,			/* boundary */
3172 				 BUS_SPACE_MAXADDR,	/* low */
3173 				 BUS_SPACE_MAXADDR,	/* high */
3174 				 NULL, NULL,		/* filter */
3175 				 65536 + 256,		/* maxsize */
3176 				 MXGE_MAX_SEND_DESC, 	/* num segs */
3177 				 4096,			/* maxsegsize */
3178 				 0,			/* flags */
3179 				 NULL, NULL,		/* lock */
3180 				 &sc->parent_dmat);	/* tag */
3181 
3182 	if (err != 0) {
3183 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
3184 			      err);
3185 		goto abort_with_nothing;
3186 	}
3187 
3188 	ifp = sc->ifp = if_alloc(IFT_ETHER);
3189 	if (ifp == NULL) {
3190 		device_printf(dev, "can not if_alloc()\n");
3191 		err = ENOSPC;
3192 		goto abort_with_parent_dmat;
3193 	}
3194 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
3195 		 device_get_nameunit(dev));
3196 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
3197 	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
3198 		 device_get_nameunit(dev));
3199 	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
3200 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
3201 		 "%s:drv", device_get_nameunit(dev));
3202 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
3203 		 MTX_NETWORK_LOCK, MTX_DEF);
3204 
3205 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
3206 
3207 	mxge_setup_cfg_space(sc);
3208 
3209 	/* Map the board into the kernel */
3210 	rid = PCIR_BARS;
3211 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
3212 					 ~0, 1, RF_ACTIVE);
3213 	if (sc->mem_res == NULL) {
3214 		device_printf(dev, "could not map memory\n");
3215 		err = ENXIO;
3216 		goto abort_with_lock;
3217 	}
3218 	sc->sram = rman_get_virtual(sc->mem_res);
3219 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
3220 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
3221 		device_printf(dev, "impossible memory region size %ld\n",
3222 			      rman_get_size(sc->mem_res));
3223 		err = ENXIO;
3224 		goto abort_with_mem_res;
3225 	}
3226 
3227 	/* make NULL terminated copy of the EEPROM strings section of
3228 	   lanai SRAM */
3229 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
3230 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
3231 				rman_get_bushandle(sc->mem_res),
3232 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
3233 				sc->eeprom_strings,
3234 				MXGE_EEPROM_STRINGS_SIZE - 2);
3235 	err = mxge_parse_strings(sc);
3236 	if (err != 0)
3237 		goto abort_with_mem_res;
3238 
3239 	/* Enable write combining for efficient use of PCIe bus */
3240 	mxge_enable_wc(sc);
3241 
3242 	/* Allocate the out of band dma memory */
3243 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
3244 			     sizeof (mxge_cmd_t), 64);
3245 	if (err != 0)
3246 		goto abort_with_mem_res;
3247 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
3248 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
3249 	if (err != 0)
3250 		goto abort_with_cmd_dma;
3251 
3252 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
3253 			     sizeof (*sc->fw_stats), 64);
3254 	if (err != 0)
3255 		goto abort_with_zeropad_dma;
3256 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
3257 
3258 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
3259 	if (err != 0)
3260 		goto abort_with_fw_stats;
3261 
3262 	/* Add our ithread  */
3263 	count = pci_msi_count(dev);
3264 	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
3265 		rid = 1;
3266 		sc->msi_enabled = 1;
3267 	} else {
3268 		rid = 0;
3269 	}
3270 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3271 					 1, RF_SHAREABLE | RF_ACTIVE);
3272 	if (sc->irq_res == NULL) {
3273 		device_printf(dev, "could not alloc interrupt\n");
3274 		goto abort_with_dmabench;
3275 	}
3276 	if (mxge_verbose)
3277 		device_printf(dev, "using %s irq %ld\n",
3278 			      sc->msi_enabled ? "MSI" : "INTx",
3279 			      rman_get_start(sc->irq_res));
3280 	/* select & load the firmware */
3281 	err = mxge_select_firmware(sc);
3282 	if (err != 0)
3283 		goto abort_with_irq_res;
3284 	sc->intr_coal_delay = mxge_intr_coal_delay;
3285 	err = mxge_reset(sc, 0);
3286 	if (err != 0)
3287 		goto abort_with_irq_res;
3288 
3289 	err = mxge_alloc_rings(sc);
3290 	if (err != 0) {
3291 		device_printf(sc->dev, "failed to allocate rings\n");
3292 		goto abort_with_irq_res;
3293 	}
3294 
3295 	err = bus_setup_intr(sc->dev, sc->irq_res,
3296 			     INTR_TYPE_NET | INTR_MPSAFE,
3297 			     NULL, mxge_intr, sc, &sc->ih);
3298 	if (err != 0) {
3299 		goto abort_with_rings;
3300 	}
3301 	/* hook into the network stack */
3302 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3303 	ifp->if_baudrate = 100000000;
3304 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3305 		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
3306 
3307 	sc->max_mtu = mxge_max_mtu(sc);
3308 	if (sc->max_mtu >= 9000)
3309 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
3310 	else
3311 		device_printf(dev, "MTU limited to %d.  Install "
3312 			      "latest firmware for 9000 byte jumbo support\n",
3313 			      sc->max_mtu - ETHER_HDR_LEN);
3314 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3315 	ifp->if_capenable = ifp->if_capabilities;
3316 	sc->csum_flag = 1;
3317 	ifp->if_init = mxge_init;
3318 	ifp->if_softc = sc;
3319 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3320 	ifp->if_ioctl = mxge_ioctl;
3321 	ifp->if_start = mxge_start;
3322 	ether_ifattach(ifp, sc->mac_addr);
3323 	/* ether_ifattach sets mtu to 1500 */
3324 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
3325 		ifp->if_mtu = 9000;
3326 
3327 	/* Initialise the ifmedia structure */
3328 	ifmedia_init(&sc->media, 0, mxge_media_change,
3329 		     mxge_media_status);
3330 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3331 	mxge_add_sysctls(sc);
3332 	return 0;
3333 
3334 abort_with_rings:
3335 	mxge_free_rings(sc);
3336 abort_with_irq_res:
3337 	bus_release_resource(dev, SYS_RES_IRQ,
3338 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3339 	if (sc->msi_enabled)
3340 		pci_release_msi(dev);
3341 abort_with_dmabench:
3342 	mxge_dma_free(&sc->dmabench_dma);
3343 abort_with_fw_stats:
3344 	mxge_dma_free(&sc->fw_stats_dma);
3345 abort_with_zeropad_dma:
3346 	mxge_dma_free(&sc->zeropad_dma);
3347 abort_with_cmd_dma:
3348 	mxge_dma_free(&sc->cmd_dma);
3349 abort_with_mem_res:
3350 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3351 abort_with_lock:
3352 	pci_disable_busmaster(dev);
3353 	mtx_destroy(&sc->cmd_mtx);
3354 	mtx_destroy(&sc->tx_mtx);
3355 	mtx_destroy(&sc->driver_mtx);
3356 	if_free(ifp);
3357 abort_with_parent_dmat:
3358 	bus_dma_tag_destroy(sc->parent_dmat);
3359 
3360 abort_with_nothing:
3361 	return err;
3362 }
3363 
3364 static int
3365 mxge_detach(device_t dev)
3366 {
3367 	mxge_softc_t *sc = device_get_softc(dev);
3368 
3369 	if (sc->ifp->if_vlantrunk != NULL) {
3370 		device_printf(sc->dev,
3371 			      "Detach vlans before removing module\n");
3372 		return EBUSY;
3373 	}
3374 	mtx_lock(&sc->driver_mtx);
3375 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3376 		mxge_close(sc);
3377 	callout_stop(&sc->co_hdl);
3378 	mtx_unlock(&sc->driver_mtx);
3379 	ether_ifdetach(sc->ifp);
3380 	ifmedia_removeall(&sc->media);
3381 	mxge_dummy_rdma(sc, 0);
3382 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3383 	mxge_free_rings(sc);
3384 	bus_release_resource(dev, SYS_RES_IRQ,
3385 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3386 	if (sc->msi_enabled)
3387 		pci_release_msi(dev);
3388 
3389 	sc->rx_done.entry = NULL;
3390 	mxge_dma_free(&sc->rx_done.dma);
3391 	mxge_dma_free(&sc->fw_stats_dma);
3392 	mxge_dma_free(&sc->dmabench_dma);
3393 	mxge_dma_free(&sc->zeropad_dma);
3394 	mxge_dma_free(&sc->cmd_dma);
3395 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3396 	pci_disable_busmaster(dev);
3397 	mtx_destroy(&sc->cmd_mtx);
3398 	mtx_destroy(&sc->tx_mtx);
3399 	mtx_destroy(&sc->driver_mtx);
3400 	if_free(sc->ifp);
3401 	bus_dma_tag_destroy(sc->parent_dmat);
3402 	return 0;
3403 }
3404 
3405 static int
3406 mxge_shutdown(device_t dev)
3407 {
3408 	return 0;
3409 }
3410 
3411 /*
3412   This file uses Myri10GE driver indentation.
3413 
3414   Local Variables:
3415   c-file-style:"linux"
3416   tab-width:8
3417   End:
3418 */
3419