xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 94942af266ac119ede0ca836f9aa5a5ac0582938)
/******************************************************************************

Copyright (c) 2006, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

 3. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_max_intr_slots = 1024;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_ticks;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	pa = rman_get_start(sc->mem_res);
	len = rman_get_size(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	} else {
		sc->wc = 1;
	}
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO), &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
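
/*
 * For illustration, a hypothetical EEPROM string blob in the format
 * described above (values invented, not read from real hardware) would
 * look like this in memory, as consecutive NUL-terminated strings:
 *
 *   "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=M10GE-8A\0\0"
 *
 * mxge_parse_strings() below walks this region string by string; the
 * empty string (double NUL) terminates the walk.
 */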

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 4;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
				if (i < 5)
					ptr += 3;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined(__i386) || defined(i386) || defined(__i386__) || \
    defined(__x86_64__) || defined(__amd64__)
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   The test below is commented out because it is believed that a
	   config read/write beyond 0xff will access the config space of
	   the next larger function.  Uncomment this and remove the hacky
	   pmap_mapdev() way of accessing config space when FreeBSD grows
	   support for extended pcie config space access.
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif

static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";

	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
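
	/* For example (hypothetical numbers, just to show the units):
	 * if a read test returns cmd.data0 = (1000 << 16) | 1600, then
	 * 1000 transfers of len bytes each completed in 1600 ticks of
	 * 0.5us (800us total).  With len = 2048 that is
	 * (1000 * 2048 * 2) / 1600 = 2560; the "* 2" converts the 0.5us
	 * ticks to microseconds, so the quotient comes out in bytes/us,
	 * i.e. the MB/s that the sysctls below report.
	 */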

	len = sc->tx.boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx.boundary to 4KB.
 */
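
/*
 * Summarizing the policy above (firmware names as registered by this
 * driver's tunables):
 *
 *   completions known aligned (ECRC on, or forced) -> mxge_eth_z8e,
 *                                                     tx.boundary = 4096
 *   completions unaligned or unverifiable          -> mxge_ethp_z8e,
 *                                                     tx.boundary = 2048
 *
 * mxge_firmware_probe() below tries the aligned case first and
 * mxge_select_firmware() falls back to the workaround image.
 */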

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx.boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx.boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up-to-date firmware\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;

	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
	return (mxge_load_firmware(sc));
}

union qualhack
{
	const char *ro_char;
	char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;
}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;

	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	hack.ro_char = fw_data;
	/* Copy the firmware image to NIC SRAM, 256 bytes at a time. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	mb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		mb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC.  "
				"For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;

	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* Skip if the firmware does not support multicast filtering */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_reset(mxge_softc_t *sc)
{

	mxge_cmd_t cmd;
	size_t bytes;
	int status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);

	/* Now exchange information about interrupts */
	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
	memset(sc->rx_done.entry, 0, bytes);
	cmd.data0 = (uint32_t)bytes;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);

	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}

	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);

	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	/* reset mcp/driver shared state back to 0 */
	bzero(sc->rx_done.entry, bytes);
	sc->rx_done.idx = 0;
	sc->rx_done.cnt = 0;
	sc->tx.req = 0;
	sc->tx.done = 0;
	sc->tx.pkt_done = 0;
	sc->tx.wake = 0;
	sc->tx.stall = 0;
	sc->rx_big.cnt = 0;
	sc->rx_small.cnt = 0;
	sc->rdma_tags_available = 15;
	sc->fw_stats->valid = 0;
	sc->fw_stats->send_done_count = 0;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx.boundary,
		       0, "tx boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");

	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");

	/* host counters exported for debugging */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_small_cnt",
		       CTLFLAG_RD, &sc->rx_small.cnt,
		       0, "rx_small_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_big_cnt",
		       CTLFLAG_RD, &sc->rx_big.cnt,
		       0, "rx_big_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_req",
		       CTLFLAG_RD, &sc->tx.req,
		       0, "tx_req");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_done",
		       CTLFLAG_RD, &sc->tx.done,
		       0, "tx_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_pkt_done",
		       CTLFLAG_RD, &sc->tx.pkt_done,
		       0, "tx_pkt_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_stall",
		       CTLFLAG_RD, &sc->tx.stall,
		       0, "tx_stall");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_wake",
		       CTLFLAG_RD, &sc->tx.wake,
		       0, "tx_wake");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;

	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		mb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
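
/*
 * A note on the 32-byte figure above (my reading of the code, not from
 * vendor docs): each mcp_kreq_ether_send_t is a 16-byte descriptor (see
 * the "complete solid 16-byte block" comments in mxge_encap() below),
 * so the loop in mxge_submit_req() copies two descriptors per
 * mxge_pio_copy() call, i.e. 2 * sizeof(mcp_kreq_ether_send_t) == 32
 * bytes, with a memory barrier after each burst to keep the writes
 * from being merged into a transfer larger than the nic will accept
 * in hardware.
 */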

static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
		int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	mb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			mb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		mb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	mb();
}

static inline void
mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
	tx->req += cnt;
	mb();
	while (cnt >= 4) {
		mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
		mb();
		src += 4;
		cnt -= 4;
	}
	if (cnt > 0) {
		/* pad it to 64 bytes.  The src is 64 bytes bigger than it
		   needs to be so that we don't overrun it */
		mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
		mb();
	}
}

static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */
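
	/* For example (assuming minimal 20-byte IP and TCP headers):
	 * a frame with a 14-byte Ethernet header starts at
	 * cum_len = -(14 + 20 + 20) = -54, and cum_len crosses zero
	 * exactly where the TSO payload begins.
	 */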

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < sizeof (*eh)
			    + sizeof (*ip))) {
		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
			   sc->scratch);
		eh = (struct ether_header *)sc->scratch;
	} else {
		eh = mtod(m, struct ether_header *);
	}
	ip = (struct ip *) (eh + 1);
	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
			   + sizeof (*tcp), sc->scratch);
		eh = (struct ether_header *) sc->scratch;
		ip = (struct ip *) (eh + 1);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;

	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */
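
	/* A loose illustration of the retroactive fill (hypothetical
	 * numbers): if the last cut (or the start of the payload) is
	 * followed by five descriptors, rdma_count reaches 5 by the end
	 * of the packet, and the "(req-rdma_count)->rdma_count =
	 * rdma_count" store after the loop rewrites the first of those
	 * five descriptors' rdma_count from the provisional 1 to 5.
	 */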

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces */
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			}

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	(req-rdma_count)->rdma_count = rdma_count;

	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
		once = 1;
	}
	return;
}

static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
	uint8_t flags, cksum_offset;

	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d,"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < sizeof (*eh)
				    + sizeof (*ip))) {
			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
				   sc->scratch);
			eh = (struct ether_header *)sc->scratch;
		} else {
			eh = mtod(m, struct ether_header *);
		}
		ip = (struct ip *) (eh + 1);
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}


static inline void
mxge_start_locked(mxge_softc_t *sc)
{
	struct mbuf *m;
	struct ifnet *ifp;

	ifp = sc->ifp;
	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
	       > MXGE_MAX_SEND_DESC) {

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL) {
			return;
		}
		/* let BPF see it */
		BPF_MTAP(ifp, m);

		/* give it to the nic */
		mxge_encap(sc, m);
	}
	/* ran out of transmit slots */
	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
		sc->tx.stall++;
	}
}

static void
mxge_start(struct ifnet *ifp)
{
	mxge_softc_t *sc = ifp->if_softc;

	mtx_lock(&sc->tx_mtx);
	mxge_start_locked(sc);
	mtx_unlock(&sc->tx_mtx);
}

/*
 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst
 */
1826 static inline void
1827 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1828 		mcp_kreq_ether_recv_t *src)
1829 {
1830 	uint32_t low;
1831 
1832 	low = src->addr_low;
1833 	src->addr_low = 0xffffffff;
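	/* temporarily mark the first descriptor invalid so the NIC
	   ignores the chunk until the final addr_low write below */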
1834 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
1835 	mb();
1836 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
1837 	mb();
1838 	src->addr_low = low;
1839 	dst->addr_low = low;
1840 	mb();
1841 }
1842 
1843 static int
1844 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1845 {
1846 	bus_dma_segment_t seg;
1847 	struct mbuf *m;
1848 	mxge_rx_buf_t *rx = &sc->rx_small;
1849 	int cnt, err;
1850 
1851 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1852 	if (m == NULL) {
1853 		rx->alloc_fail++;
1854 		err = ENOBUFS;
1855 		goto done;
1856 	}
1857 	m->m_len = MHLEN;
1858 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1859 				      &seg, &cnt, BUS_DMA_NOWAIT);
1860 	if (err != 0) {
1861 		m_free(m);
1862 		goto done;
1863 	}
1864 	rx->info[idx].m = m;
1865 	rx->shadow[idx].addr_low =
1866 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1867 	rx->shadow[idx].addr_high =
1868 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1869 
1870 done:
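	/* receive descriptors are handed to the NIC in groups of 8;
	   only submit once the last slot of a group has been filled */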
1871 	if ((idx & 7) == 7) {
1872 		if (rx->wc_fifo == NULL)
1873 			mxge_submit_8rx(&rx->lanai[idx - 7],
1874 					&rx->shadow[idx - 7]);
1875 		else {
1876 			mb();
1877 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1878 		}
1879 	}
1880 	return err;
1881 }
1882 
1883 static int
1884 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1885 {
1886 	bus_dma_segment_t seg;
1887 	struct mbuf *m;
1888 	mxge_rx_buf_t *rx = &sc->rx_big;
1889 	int cnt, err;
1890 
1891 	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1892 	if (m == NULL) {
1893 		rx->alloc_fail++;
1894 		err = ENOBUFS;
1895 		goto done;
1896 	}
1897 	m->m_len = sc->big_bytes;
1898 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1899 				      &seg, &cnt, BUS_DMA_NOWAIT);
1900 	if (err != 0) {
1901 		m_free(m);
1902 		goto done;
1903 	}
1904 	rx->info[idx].m = m;
1905 	rx->shadow[idx].addr_low =
1906 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1907 	rx->shadow[idx].addr_high =
1908 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1909 
1910 done:
1911 	if ((idx & 7) == 7) {
1912 		if (rx->wc_fifo == NULL)
1913 			mxge_submit_8rx(&rx->lanai[idx - 7],
1914 					&rx->shadow[idx - 7]);
1915 		else {
1916 			mb();
1917 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1918 		}
1919 	}
1920 	return err;
1921 }
1922 
1923 static inline void
1924 mxge_rx_csum(struct mbuf *m, int csum)
1925 {
1926 	struct ether_header *eh;
1927 	struct ip *ip;
1928 
1929 	eh = mtod(m, struct ether_header *);
1930 
1931 	/* only deal with IPv4 TCP & UDP for now */
1932 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
1933 		return;
1934 	ip = (struct ip *)(eh + 1);
1935 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
1936 			    ip->ip_p != IPPROTO_UDP))
1937 		return;
1938 
1939 	/*
1940 	 *  Myri10GE hardware checksums are not valid if the sender
1941 	 *  padded the frame with non-zero padding.  This is because
1942 	 *  the firmware just does a simple 16-bit 1s complement
1943 	 *  checksum across the entire frame, excluding the first 14
1944 	 *  bytes.  It is easiest to simply assume the worst, and
1945 	 *  only apply hardware checksums to non-padded frames.  This
1946 	 *  is what nearly every other OS does by default.
1947 	 */
1948 
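	/* note: CSUM_DATA_VALID without CSUM_PSEUDO_HDR leaves the
	   raw hardware sum in csum_data for the stack to finish by
	   folding in the pseudo-header */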
1949 	if (__predict_true(m->m_pkthdr.len ==
1950 			   (ntohs(ip->ip_len) + ETHER_HDR_LEN))) {
1951 		m->m_pkthdr.csum_data = csum;
1952 		m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1953 	}
1954 }
1955 
1956 static inline void
1957 mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
1958 {
1959 	struct ifnet *ifp;
1960 	struct mbuf *m = 0; 		/* -Wuninitialized */
1961 	struct mbuf *m_prev = 0;	/* -Wuninitialized */
1962 	struct mbuf *m_head = 0;
1963 	bus_dmamap_t old_map;
1964 	mxge_rx_buf_t *rx;
1965 	int idx;
1966 
1967 
1968 	rx = &sc->rx_big;
1969 	ifp = sc->ifp;
1970 	while (len > 0) {
1971 		idx = rx->cnt & rx->mask;
1972 		rx->cnt++;
1973 		/* save a pointer to the received mbuf */
1974 		m = rx->info[idx].m;
1975 		/* try to replace the received mbuf */
1976 		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
1977 			goto drop;
1978 		}
1979 		/* unmap the received buffer */
1980 		old_map = rx->info[idx].map;
1981 		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1982 		bus_dmamap_unload(rx->dmat, old_map);
1983 
1984 		/* swap the bus_dmamap_t's */
1985 		rx->info[idx].map = rx->extra_map;
1986 		rx->extra_map = old_map;
1987 
1988 		/* chain multiple segments together */
1989 		if (!m_head) {
1990 			m_head = m;
1991 			/* mcp implicitly skips 1st 2 bytes so that
1992 			 * packet is properly aligned */
1993 			m->m_data += MXGEFW_PAD;
1994 			m->m_pkthdr.len = len;
1995 			m->m_len = sc->big_bytes - MXGEFW_PAD;
1996 		} else {
1997 			m->m_len = sc->big_bytes;
1998 			m->m_flags &= ~M_PKTHDR;
1999 			m_prev->m_next = m;
2000 		}
2001 		len -= m->m_len;
2002 		m_prev = m;
2003 	}
2004 
2005 	/* trim trailing garbage from the last mbuf in the chain.  If
2006 	 * there is any garbage, len will be negative */
2007 	m->m_len += len;
2008 
2009 	m_head->m_pkthdr.rcvif = ifp;
2010 	ifp->if_ipackets++;
2011 	/* if the checksum is valid, mark it in the mbuf header */
2012 	if (sc->csum_flag)
2013 		mxge_rx_csum(m_head, csum);
2014 
2015 	/* pass the frame up the stack */
2016 	(*ifp->if_input)(ifp, m_head);
2017 	return;
2018 
2019 drop:
2020 	/* drop the frame -- the old mbuf(s) are recycled by running
2021 	   every slot through the allocator */
2022 	if (m_head) {
2023 		len -= sc->big_bytes;
2024 		m_freem(m_head);
2025 	} else {
2026 		len -= (sc->big_bytes + MXGEFW_PAD);
2027 	}
2028 	while ((int)len > 0) {
2029 		idx = rx->cnt & rx->mask;
2030 		rx->cnt++;
2031 		m = rx->info[idx].m;
2032 		if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
2033 			m_freem(m);
2034 			/* unmap the received buffer */
2035 			old_map = rx->info[idx].map;
2036 			bus_dmamap_sync(rx->dmat, old_map,
2037 					BUS_DMASYNC_POSTREAD);
2038 			bus_dmamap_unload(rx->dmat, old_map);
2039 
2040 			/* swap the bus_dmamap_t's */
2041 			rx->info[idx].map = rx->extra_map;
2042 			rx->extra_map = old_map;
2043 		}
2044 		len -= sc->big_bytes;
2045 	}
2046 
2047 	ifp->if_ierrors++;
2048 
2049 }
2050 
2051 static inline void
2052 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
2053 {
2054 	struct ifnet *ifp;
2055 	struct mbuf *m;
2056 	mxge_rx_buf_t *rx;
2057 	bus_dmamap_t old_map;
2058 	int idx;
2059 
2060 	ifp = sc->ifp;
2061 	rx = &sc->rx_small;
2062 	idx = rx->cnt & rx->mask;
2063 	rx->cnt++;
2064 	/* save a pointer to the received mbuf */
2065 	m = rx->info[idx].m;
2066 	/* try to replace the received mbuf */
2067 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
2068 		/* drop the frame -- the old mbuf is re-cycled */
2069 		ifp->if_ierrors++;
2070 		return;
2071 	}
2072 
2073 	/* unmap the received buffer */
2074 	old_map = rx->info[idx].map;
2075 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2076 	bus_dmamap_unload(rx->dmat, old_map);
2077 
2078 	/* swap the bus_dmamap_t's */
2079 	rx->info[idx].map = rx->extra_map;
2080 	rx->extra_map = old_map;
2081 
2082 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2083 	 * aligned */
2084 	m->m_data += MXGEFW_PAD;
2085 
2086 	m->m_pkthdr.rcvif = ifp;
2087 	m->m_len = m->m_pkthdr.len = len;
2088 	ifp->if_ipackets++;
2089 	/* if the checksum is valid, mark it in the mbuf header */
2090 	if (sc->csum_flag)
2091 		mxge_rx_csum(m, csum);
2092 
2093 	/* pass the frame up the stack */
2094 	(*ifp->if_input)(ifp, m);
2095 }
2096 
2097 static inline void
2098 mxge_clean_rx_done(mxge_softc_t *sc)
2099 {
2100 	mxge_rx_done_t *rx_done = &sc->rx_done;
2101 	int limit = 0;
2102 	uint16_t length;
2103 	uint16_t checksum;
2104 
2105 
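	/* a non-zero length marks a valid completion; clearing it
	   returns the slot to the NIC.  The index mask below assumes
	   mxge_max_intr_slots is a power of two */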
2106 	while (rx_done->entry[rx_done->idx].length != 0) {
2107 		length = ntohs(rx_done->entry[rx_done->idx].length);
2108 		rx_done->entry[rx_done->idx].length = 0;
2109 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
2110 		if (length <= (MHLEN - MXGEFW_PAD))
2111 			mxge_rx_done_small(sc, length, checksum);
2112 		else
2113 			mxge_rx_done_big(sc, length, checksum);
2114 		rx_done->cnt++;
2115 		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
2116 
2117 		/* limit potential for livelock */
2118 		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
2119 			break;
2120 
2121 	}
2122 }
2123 
2124 
2125 static inline void
2126 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2127 {
2128 	struct ifnet *ifp;
2129 	mxge_tx_buf_t *tx;
2130 	struct mbuf *m;
2131 	bus_dmamap_t map;
2132 	int idx, limit;
2133 
2134 	limit = 0;
2135 	tx = &sc->tx;
2136 	ifp = sc->ifp;
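	/* mcp_idx is the firmware's running count of completed sends;
	   tx->info[].flag marks the last descriptor of each packet,
	   so pkt_done advances once per completed packet */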
2137 	while (tx->pkt_done != mcp_idx) {
2138 		idx = tx->done & tx->mask;
2139 		tx->done++;
2140 		m = tx->info[idx].m;
2141 		/* the mbuf and DMA map are attached only to the
2142 		   first segment of each transmitted mbuf */
2143 		if (m != NULL) {
2144 			ifp->if_opackets++;
2145 			tx->info[idx].m = NULL;
2146 			map = tx->info[idx].map;
2147 			bus_dmamap_unload(tx->dmat, map);
2148 			m_freem(m);
2149 		}
2150 		if (tx->info[idx].flag) {
2151 			tx->info[idx].flag = 0;
2152 			tx->pkt_done++;
2153 		}
2154 		/* limit potential for livelock by only handling
2155 		   2 full tx rings per call */
2156 		if (__predict_false(++limit >  2 * tx->mask))
2157 			break;
2158 	}
2159 
2160 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2161 	   it's OK to send packets */
2162 
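	/* (we wait until at least 3/4 of the ring is free rather than
	   waking the stack for every completed slot) */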
2163 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2164 	    tx->req - tx->done < (tx->mask + 1)/4) {
2165 		mtx_lock(&sc->tx_mtx);
2166 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2167 		sc->tx.wake++;
2168 		mxge_start_locked(sc);
2169 		mtx_unlock(&sc->tx_mtx);
2170 	}
2171 }
2172 
2173 static void
2174 mxge_intr(void *arg)
2175 {
2176 	mxge_softc_t *sc = arg;
2177 	mcp_irq_data_t *stats = sc->fw_stats;
2178 	mxge_tx_buf_t *tx = &sc->tx;
2179 	mxge_rx_done_t *rx_done = &sc->rx_done;
2180 	uint32_t send_done_count;
2181 	uint8_t valid;
2182 
2183 
2184 	/* make sure the DMA has finished */
2185 	if (!stats->valid) {
2186 		return;
2187 	}
2188 	valid = stats->valid;
2189 
2190 	if (!sc->msi_enabled) {
2191 		/* lower legacy IRQ  */
2192 		*sc->irq_deassert = 0;
2193 		if (!mxge_deassert_wait)
2194 			/* don't wait for confirmation that the irq is low */
2195 			stats->valid = 0;
2196 	} else {
2197 		stats->valid = 0;
2198 	}
2199 
2200 	/* loop while waiting for legacy irq deassertion */
2201 	do {
2202 		/* check for transmit completes and receives */
2203 		send_done_count = be32toh(stats->send_done_count);
2204 		while ((send_done_count != tx->pkt_done) ||
2205 		       (rx_done->entry[rx_done->idx].length != 0)) {
2206 			mxge_tx_done(sc, (int)send_done_count);
2207 			mxge_clean_rx_done(sc);
2208 			send_done_count = be32toh(stats->send_done_count);
2209 		}
2210 	} while (*((volatile uint8_t *) &stats->valid));
2211 
2212 	if (__predict_false(stats->stats_updated)) {
2213 		if (sc->link_state != stats->link_up) {
2214 			sc->link_state = stats->link_up;
2215 			if (sc->link_state) {
2216 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2217 				if (mxge_verbose)
2218 					device_printf(sc->dev, "link up\n");
2219 			} else {
2220 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2221 				if (mxge_verbose)
2222 					device_printf(sc->dev, "link down\n");
2223 			}
2224 		}
2225 		if (sc->rdma_tags_available !=
2226 		    be32toh(sc->fw_stats->rdma_tags_available)) {
2227 			sc->rdma_tags_available =
2228 				be32toh(sc->fw_stats->rdma_tags_available);
2229 			device_printf(sc->dev, "RDMA timed out! %d tags "
2230 				      "left\n", sc->rdma_tags_available);
2231 		}
2232 		sc->down_cnt += stats->link_down;
2233 	}
2234 
2235 	/* check to see if we have rx token to pass back */
2236 	if (valid & 0x1)
2237 	    *sc->irq_claim = be32toh(3);
2238 	*(sc->irq_claim + 1) = be32toh(3);
2239 }
2240 
2241 static void
2242 mxge_init(void *arg)
2243 {
2244 }
2245 
2246 
2247 
2248 static void
2249 mxge_free_mbufs(mxge_softc_t *sc)
2250 {
2251 	int i;
2252 
2253 	for (i = 0; i <= sc->rx_big.mask; i++) {
2254 		if (sc->rx_big.info[i].m == NULL)
2255 			continue;
2256 		bus_dmamap_unload(sc->rx_big.dmat,
2257 				  sc->rx_big.info[i].map);
2258 		m_freem(sc->rx_big.info[i].m);
2259 		sc->rx_big.info[i].m = NULL;
2260 	}
2261 
2262 	for (i = 0; i <= sc->rx_small.mask; i++) {
2263 		if (sc->rx_small.info[i].m == NULL)
2264 			continue;
2265 		bus_dmamap_unload(sc->rx_small.dmat,
2266 				  sc->rx_small.info[i].map);
2267 		m_freem(sc->rx_small.info[i].m);
2268 		sc->rx_small.info[i].m = NULL;
2269 	}
2270 
2271 	for (i = 0; i <= sc->tx.mask; i++) {
2272 		sc->tx.info[i].flag = 0;
2273 		if (sc->tx.info[i].m == NULL)
2274 			continue;
2275 		bus_dmamap_unload(sc->tx.dmat,
2276 				  sc->tx.info[i].map);
2277 		m_freem(sc->tx.info[i].m);
2278 		sc->tx.info[i].m = NULL;
2279 	}
2280 }
2281 
2282 static void
2283 mxge_free_rings(mxge_softc_t *sc)
2284 {
2285 	int i;
2286 
2287 	if (sc->tx.req_bytes != NULL)
2288 		free(sc->tx.req_bytes, M_DEVBUF);
2289 	if (sc->tx.seg_list != NULL)
2290 		free(sc->tx.seg_list, M_DEVBUF);
2291 	if (sc->rx_small.shadow != NULL)
2292 		free(sc->rx_small.shadow, M_DEVBUF);
2293 	if (sc->rx_big.shadow != NULL)
2294 		free(sc->rx_big.shadow, M_DEVBUF);
2295 	if (sc->tx.info != NULL) {
2296 		if (sc->tx.dmat != NULL) {
2297 			for (i = 0; i <= sc->tx.mask; i++) {
2298 				bus_dmamap_destroy(sc->tx.dmat,
2299 						   sc->tx.info[i].map);
2300 			}
2301 			bus_dma_tag_destroy(sc->tx.dmat);
2302 		}
2303 		free(sc->tx.info, M_DEVBUF);
2304 	}
2305 	if (sc->rx_small.info != NULL) {
2306 		if (sc->rx_small.dmat != NULL) {
2307 			for (i = 0; i <= sc->rx_small.mask; i++) {
2308 				bus_dmamap_destroy(sc->rx_small.dmat,
2309 						   sc->rx_small.info[i].map);
2310 			}
2311 			bus_dmamap_destroy(sc->rx_small.dmat,
2312 					   sc->rx_small.extra_map);
2313 			bus_dma_tag_destroy(sc->rx_small.dmat);
2314 		}
2315 		free(sc->rx_small.info, M_DEVBUF);
2316 	}
2317 	if (sc->rx_big.info != NULL) {
2318 		if (sc->rx_big.dmat != NULL) {
2319 			for (i = 0; i <= sc->rx_big.mask; i++) {
2320 				bus_dmamap_destroy(sc->rx_big.dmat,
2321 						   sc->rx_big.info[i].map);
2322 			}
2323 			bus_dmamap_destroy(sc->rx_big.dmat,
2324 					   sc->rx_big.extra_map);
2325 			bus_dma_tag_destroy(sc->rx_big.dmat);
2326 		}
2327 		free(sc->rx_big.info, M_DEVBUF);
2328 	}
2329 }
2330 
2331 static int
2332 mxge_alloc_rings(mxge_softc_t *sc)
2333 {
2334 	mxge_cmd_t cmd;
2335 	int tx_ring_size, rx_ring_size;
2336 	int tx_ring_entries, rx_ring_entries;
2337 	int i, err;
2338 	unsigned long bytes;
2339 
2340 	/* get ring sizes */
2341 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2342 	tx_ring_size = cmd.data0;
2343 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2344 	if (err != 0) {
2345 		device_printf(sc->dev, "Cannot determine ring sizes\n");
2346 		goto abort_with_nothing;
2347 	}
2348 
2349 	rx_ring_size = cmd.data0;
2350 
2351 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2352 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
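	/* the firmware reports ring sizes in bytes; the resulting
	   entry counts must be powers of two for the index masks
	   computed below */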
2353 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2354 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2355 	IFQ_SET_READY(&sc->ifp->if_snd);
2356 
2357 	sc->tx.mask = tx_ring_entries - 1;
2358 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
2359 
2360 	err = ENOMEM;
2361 
2362 	/* allocate the tx request copy block */
2363 	bytes = 8 +
2364 		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
2365 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2366 	if (sc->tx.req_bytes == NULL)
2367 		goto abort_with_nothing;
2368 	/* ensure req_list entries are aligned to 8 bytes */
2369 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2370 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
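	/* the 8 spare bytes in the allocation above guarantee that
	   rounding up to the next 8-byte boundary stays inside the
	   buffer */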
2371 
2372 	/* allocate the tx busdma segment list */
2373 	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
2374 	sc->tx.seg_list = (bus_dma_segment_t *)
2375 		malloc(bytes, M_DEVBUF, M_WAITOK);
2376 	if (sc->tx.seg_list == NULL)
2377 		goto abort_with_alloc;
2378 
2379 	/* allocate the rx shadow rings */
2380 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2381 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2382 	if (sc->rx_small.shadow == NULL)
2383 		goto abort_with_alloc;
2384 
2385 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2386 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2387 	if (sc->rx_big.shadow == NULL)
2388 		goto abort_with_alloc;
2389 
2390 	/* allocate the host info rings */
2391 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2392 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2393 	if (sc->tx.info == NULL)
2394 		goto abort_with_alloc;
2395 
2396 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2397 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2398 	if (sc->rx_small.info == NULL)
2399 		goto abort_with_alloc;
2400 
2401 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2402 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2403 	if (sc->rx_big.info == NULL)
2404 		goto abort_with_alloc;
2405 
2406 	/* allocate the busdma resources */
2407 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2408 				 1,			/* alignment */
2409 				 sc->tx.boundary,	/* boundary */
2410 				 BUS_SPACE_MAXADDR,	/* low */
2411 				 BUS_SPACE_MAXADDR,	/* high */
2412 				 NULL, NULL,		/* filter */
2413 				 65536 + 256,		/* maxsize */
2414 				 MXGE_MAX_SEND_DESC/2,	/* num segs */
2415 				 sc->tx.boundary,	/* maxsegsize */
2416 				 BUS_DMA_ALLOCNOW,	/* flags */
2417 				 NULL, NULL,		/* lock */
2418 				 &sc->tx.dmat);		/* tag */
2419 
2420 	if (err != 0) {
2421 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2422 			      err);
2423 		goto abort_with_alloc;
2424 	}
2425 
2426 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2427 				 1,			/* alignment */
2428 				 4096,			/* boundary */
2429 				 BUS_SPACE_MAXADDR,	/* low */
2430 				 BUS_SPACE_MAXADDR,	/* high */
2431 				 NULL, NULL,		/* filter */
2432 				 MHLEN,			/* maxsize */
2433 				 1,			/* num segs */
2434 				 MHLEN,			/* maxsegsize */
2435 				 BUS_DMA_ALLOCNOW,	/* flags */
2436 				 NULL, NULL,		/* lock */
2437 				 &sc->rx_small.dmat);	/* tag */
2438 	if (err != 0) {
2439 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2440 			      err);
2441 		goto abort_with_alloc;
2442 	}
2443 
2444 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2445 				 1,			/* alignment */
2446 				 4096,			/* boundary */
2447 				 BUS_SPACE_MAXADDR,	/* low */
2448 				 BUS_SPACE_MAXADDR,	/* high */
2449 				 NULL, NULL,		/* filter */
2450 				 4096,			/* maxsize */
2451 				 1,			/* num segs */
2452 				 4096,			/* maxsegsize */
2453 				 BUS_DMA_ALLOCNOW,	/* flags */
2454 				 NULL, NULL,		/* lock */
2455 				 &sc->rx_big.dmat);	/* tag */
2456 	if (err != 0) {
2457 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2458 			      err);
2459 		goto abort_with_alloc;
2460 	}
2461 
2462 	/* now use these tags to set up dmamaps for each slot
2463 	   in each ring */
2464 	for (i = 0; i <= sc->tx.mask; i++) {
2465 		err = bus_dmamap_create(sc->tx.dmat, 0,
2466 					&sc->tx.info[i].map);
2467 		if (err != 0) {
2468 			device_printf(sc->dev, "Err %d  tx dmamap\n",
2469 			      err);
2470 			goto abort_with_alloc;
2471 		}
2472 	}
2473 	for (i = 0; i <= sc->rx_small.mask; i++) {
2474 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2475 					&sc->rx_small.info[i].map);
2476 		if (err != 0) {
2477 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2478 				      err);
2479 			goto abort_with_alloc;
2480 		}
2481 	}
2482 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2483 				&sc->rx_small.extra_map);
2484 	if (err != 0) {
2485 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2486 			      err);
2487 		goto abort_with_alloc;
2488 	}
2489 
2490 	for (i = 0; i <= sc->rx_big.mask; i++) {
2491 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2492 					&sc->rx_big.info[i].map);
2493 		if (err != 0) {
2494 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2495 			      err);
2496 			goto abort_with_alloc;
2497 		}
2498 	}
2499 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2500 				&sc->rx_big.extra_map);
2501 	if (err != 0) {
2502 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2503 			      err);
2504 		goto abort_with_alloc;
2505 	}
2506 	return 0;
2507 
2508 abort_with_alloc:
2509 	mxge_free_rings(sc);
2510 
2511 abort_with_nothing:
2512 	return err;
2513 }
2514 
2515 static int
2516 mxge_open(mxge_softc_t *sc)
2517 {
2518 	mxge_cmd_t cmd;
2519 	int i, err;
2520 	bus_dmamap_t map;
2521 	bus_addr_t bus;
2522 
2523 
2524 	/* Copy the MAC address in case it was overridden */
2525 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2526 
2527 	err = mxge_reset(sc);
2528 	if (err != 0) {
2529 		device_printf(sc->dev, "failed to reset\n");
2530 		return EIO;
2531 	}
2532 	bzero(sc->rx_done.entry,
2533 	      mxge_max_intr_slots * sizeof(*sc->rx_done.entry));
2534 
2535 	if (MCLBYTES >=
2536 	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
2537 		sc->big_bytes = MCLBYTES;
2538 	else
2539 		sc->big_bytes = MJUMPAGESIZE;
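	/* both MCLBYTES and MJUMPAGESIZE are powers of two, as the
	   firmware requires of the big buffer size */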
2540 
2541 
2542 	/* get the lanai pointers to the send and receive rings */
2543 
2544 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2545 	sc->tx.lanai =
2546 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2547 	err |= mxge_send_cmd(sc,
2548 				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2549 	sc->rx_small.lanai =
2550 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2551 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2552 	sc->rx_big.lanai =
2553 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2554 
2555 	if (err != 0) {
2556 		device_printf(sc->dev,
2557 			      "failed to get ring sizes or locations\n");
2558 		return EIO;
2559 	}
2560 
2561 	if (sc->wc) {
2562 		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
2563 		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
2564 		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
2565 	} else {
2566 		sc->tx.wc_fifo = 0;
2567 		sc->rx_small.wc_fifo = 0;
2568 		sc->rx_big.wc_fifo = 0;
2569 	}
2570 
2571 
2572 	/* stock receive rings */
2573 	for (i = 0; i <= sc->rx_small.mask; i++) {
2574 		map = sc->rx_small.info[i].map;
2575 		err = mxge_get_buf_small(sc, map, i);
2576 		if (err) {
2577 			device_printf(sc->dev, "alloced %d/%d smalls\n",
2578 				      i, sc->rx_small.mask + 1);
2579 			goto abort;
2580 		}
2581 	}
2582 	for (i = 0; i <= sc->rx_big.mask; i++) {
2583 		map = sc->rx_big.info[i].map;
2584 		err = mxge_get_buf_big(sc, map, i);
2585 		if (err) {
2586 			device_printf(sc->dev, "alloced %d/%d bigs\n",
2587 				      i, sc->rx_big.mask + 1);
2588 			goto abort;
2589 		}
2590 	}
2591 
2592 	/* Give the firmware the mtu and the big and small buffer
2593 	   sizes.  The firmware wants the big buf size to be a power
2594 	   of two. Luckily, FreeBSD's clusters are powers of two */
2595 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
2596 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2597 	cmd.data0 = MHLEN - MXGEFW_PAD;
2598 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2599 			     &cmd);
2600 	cmd.data0 = sc->big_bytes;
2601 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2602 
2603 	if (err != 0) {
2604 		device_printf(sc->dev, "failed to setup params\n");
2605 		goto abort;
2606 	}
2607 
2608 	/* Now give the firmware the pointer to the stats block */
2609 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2610 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2611 	cmd.data2 = sizeof(struct mcp_irq_data);
2612 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
2613 
2614 	if (err != 0) {
2615 		bus = sc->fw_stats_dma.bus_addr;
2616 		bus += offsetof(struct mcp_irq_data, send_done_count);
2617 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
2618 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
2619 		err = mxge_send_cmd(sc,
2620 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
2621 				    &cmd);
2622 		/* Firmware cannot support multicast without STATS_DMA_V2 */
2623 		sc->fw_multicast_support = 0;
2624 	} else {
2625 		sc->fw_multicast_support = 1;
2626 	}
2627 
2628 	if (err != 0) {
2629 		device_printf(sc->dev, "failed to setup params\n");
2630 		goto abort;
2631 	}
2632 
2633 	/* Finally, start the firmware running */
2634 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2635 	if (err) {
2636 		device_printf(sc->dev, "Couldn't bring up link\n");
2637 		goto abort;
2638 	}
2639 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2640 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2641 
2642 	return 0;
2643 
2644 
2645 abort:
2646 	mxge_free_mbufs(sc);
2647 
2648 	return err;
2649 }
2650 
2651 static int
2652 mxge_close(mxge_softc_t *sc)
2653 {
2654 	mxge_cmd_t cmd;
2655 	int err, old_down_cnt;
2656 
2657 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2658 	old_down_cnt = sc->down_cnt;
2659 	mb();
2660 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2661 	if (err) {
2662 		device_printf(sc->dev, "Couldn't bring down link\n");
2663 	}
2664 	if (old_down_cnt == sc->down_cnt) {
2665 		/* wait for down irq */
2666 		DELAY(10 * sc->intr_coal_delay);
2667 	}
2668 	if (old_down_cnt == sc->down_cnt) {
2669 		device_printf(sc->dev, "never got down irq\n");
2670 	}
2671 
2672 	mxge_free_mbufs(sc);
2673 
2674 	return 0;
2675 }
2676 
2677 static void
2678 mxge_setup_cfg_space(mxge_softc_t *sc)
2679 {
2680 	device_t dev = sc->dev;
2681 	int reg;
2682 	uint16_t cmd, lnk, pectl;
2683 
2684 	/* find the PCIe link width and set max read request to 4KB */
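	/* in the PCIe capability, offset 0x12 is the Link Status
	   register (negotiated width in bits 9:4) and offset 0x8 is
	   Device Control, whose bits 14:12 encode the max read
	   request size (5 == 4096 bytes) */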
2685 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
2686 		lnk = pci_read_config(dev, reg + 0x12, 2);
2687 		sc->link_width = (lnk >> 4) & 0x3f;
2688 
2689 		pectl = pci_read_config(dev, reg + 0x8, 2);
2690 		pectl = (pectl & ~0x7000) | (5 << 12);
2691 		pci_write_config(dev, reg + 0x8, pectl, 2);
2692 	}
2693 
2694 	/* Enable DMA and Memory space access */
2695 	pci_enable_busmaster(dev);
2696 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2697 	cmd |= PCIM_CMD_MEMEN;
2698 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2699 }
2700 
2701 static uint32_t
2702 mxge_read_reboot(mxge_softc_t *sc)
2703 {
2704 	device_t dev = sc->dev;
2705 	uint32_t vs;
2706 
2707 	/* find the vendor specific offset */
2708 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2709 		device_printf(sc->dev,
2710 			      "could not find vendor specific offset\n");
2711 		return (uint32_t)-1;
2712 	}
2713 	/* enable read32 mode */
2714 	pci_write_config(dev, vs + 0x10, 0x3, 1);
2715 	/* tell NIC which register to read */
2716 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
2717 	return (pci_read_config(dev, vs + 0x14, 4));
2718 }
2719 
2720 static void
2721 mxge_watchdog_reset(mxge_softc_t *sc)
2722 {
2723 	int err;
2724 	uint32_t reboot;
2725 	uint16_t cmd;
2726 
2727 	err = ENXIO;
2728 
2729 	device_printf(sc->dev, "Watchdog reset!\n");
2730 
2731 	/*
2732 	 * check to see if the NIC rebooted.  If it did, then all of
2733 	 * PCI config space has been reset, and things like the
2734 	 * busmaster bit will be zero.  If this is the case, then we
2735 	 * must restore PCI config space before the NIC can be used
2736 	 * again
2737 	 */
2738 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2739 	if (cmd == 0xffff) {
2740 		/*
2741 		 * maybe the watchdog caught the NIC rebooting; wait
2742 		 * up to 100ms for it to finish.  If it does not come
2743 		 * back, then give up
2744 		 */
2745 		DELAY(1000*100);
2746 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2747 		if (cmd == 0xffff) {
2748 			device_printf(sc->dev, "NIC disappeared!\n");
2749 			goto abort;
2750 		}
2751 	}
2752 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
2753 		/* print the reboot status */
2754 		reboot = mxge_read_reboot(sc);
2755 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
2756 			      reboot);
2757 		/* restore PCI configuration space */
2758 
2759 		/* XXXX waiting for pci_cfg_restore() to be exported */
2760 		goto abort; /* just abort for now */
2761 
2762 		/* and redo any changes we made to our config space */
2763 		mxge_setup_cfg_space(sc);
2764 	} else {
2765 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
2766 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
2767 			      sc->tx.req, sc->tx.done);
2768 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
2769 			      sc->tx.pkt_done,
2770 			      be32toh(sc->fw_stats->send_done_count));
2771 	}
2772 
2773 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
2774 		mxge_close(sc);
2775 		err = mxge_open(sc);
2776 	}
2777 
2778 abort:
2779 	/*
2780 	 * stop the watchdog if the nic is dead, to avoid spamming the
2781 	 * console
2782 	 */
2783 	if (err != 0) {
2784 		callout_stop(&sc->co_hdl);
2785 	}
2786 }
2787 
2788 static void
2789 mxge_watchdog(mxge_softc_t *sc)
2790 {
2791 	mxge_tx_buf_t *tx = &sc->tx;
2792 
2793 	/* see if we have outstanding transmits, which
2794 	   have been pending for more than mxge_ticks */
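	/* (watchdog_req != watchdog_done means transmits were already
	   pending at the previous tick; done == watchdog_done means
	   none have completed since) */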
2795 	if (tx->req != tx->done &&
2796 	    tx->watchdog_req != tx->watchdog_done &&
2797 	    tx->done == tx->watchdog_done)
2798 		mxge_watchdog_reset(sc);
2799 
2800 	tx->watchdog_req = tx->req;
2801 	tx->watchdog_done = tx->done;
2802 }
2803 
2804 static void
2805 mxge_tick(void *arg)
2806 {
2807 	mxge_softc_t *sc = arg;
2808 
2809 
2810 	/* Synchronize with possible callout reset/stop. */
2811 	if (callout_pending(&sc->co_hdl) ||
2812 	    !callout_active(&sc->co_hdl)) {
2813 		mtx_unlock(&sc->driver_mtx);
2814 		return;
2815 	}
2816 
2817 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2818 	mxge_watchdog(sc);
2819 }
2820 
2821 static int
2822 mxge_media_change(struct ifnet *ifp)
2823 {
2824 	return EINVAL;
2825 }
2826 
2827 static int
2828 mxge_change_mtu(mxge_softc_t *sc, int mtu)
2829 {
2830 	struct ifnet *ifp = sc->ifp;
2831 	int real_mtu, old_mtu;
2832 	int err = 0;
2833 
2834 
2835 	real_mtu = mtu + ETHER_HDR_LEN;
2836 	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2837 	    real_mtu < 60)
2838 		return EINVAL;
2839 	mtx_lock(&sc->driver_mtx);
2840 	old_mtu = ifp->if_mtu;
2841 	ifp->if_mtu = mtu;
2842 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2843 		callout_stop(&sc->co_hdl);
2844 		mxge_close(sc);
2845 		err = mxge_open(sc);
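		/* if the new MTU cannot be brought up, fall back to
		   the old one so the interface stays usable */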
2846 		if (err != 0) {
2847 			ifp->if_mtu = old_mtu;
2848 			mxge_close(sc);
2849 			(void) mxge_open(sc);
2850 		}
2851 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2852 	}
2853 	mtx_unlock(&sc->driver_mtx);
2854 	return err;
2855 }
2856 
2857 static void
2858 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2859 {
2860 	mxge_softc_t *sc = ifp->if_softc;
2861 
2862 
2863 	if (sc == NULL)
2864 		return;
2865 	ifmr->ifm_status = IFM_AVALID;
2866 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2867 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2868 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2869 }
2870 
2871 static int
2872 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2873 {
2874 	mxge_softc_t *sc = ifp->if_softc;
2875 	struct ifreq *ifr = (struct ifreq *)data;
2876 	int err, mask;
2877 
2878 	err = 0;
2879 	switch (command) {
2880 	case SIOCSIFADDR:
2881 	case SIOCGIFADDR:
2882 		err = ether_ioctl(ifp, command, data);
2883 		break;
2884 
2885 	case SIOCSIFMTU:
2886 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2887 		break;
2888 
2889 	case SIOCSIFFLAGS:
2890 		mtx_lock(&sc->driver_mtx);
2891 		if (ifp->if_flags & IFF_UP) {
2892 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
2893 				err = mxge_open(sc);
2894 				callout_reset(&sc->co_hdl, mxge_ticks,
2895 					      mxge_tick, sc);
2896 			} else {
2897 				/* take care of promisc and allmulti
2898 				   flag changes */
2899 				mxge_change_promisc(sc,
2900 						    ifp->if_flags & IFF_PROMISC);
2901 				mxge_set_multicast_list(sc);
2902 			}
2903 		} else {
2904 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2905 				mxge_close(sc);
2906 				callout_stop(&sc->co_hdl);
2907 			}
2908 		}
2909 		mtx_unlock(&sc->driver_mtx);
2910 		break;
2911 
2912 	case SIOCADDMULTI:
2913 	case SIOCDELMULTI:
2914 		mtx_lock(&sc->driver_mtx);
2915 		mxge_set_multicast_list(sc);
2916 		mtx_unlock(&sc->driver_mtx);
2917 		break;
2918 
2919 	case SIOCSIFCAP:
2920 		mtx_lock(&sc->driver_mtx);
2921 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2922 		if (mask & IFCAP_TXCSUM) {
2923 			if (IFCAP_TXCSUM & ifp->if_capenable) {
2924 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2925 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2926 						      | CSUM_TSO);
2927 			} else {
2928 				ifp->if_capenable |= IFCAP_TXCSUM;
2929 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2930 			}
2931 		} else if (mask & IFCAP_RXCSUM) {
2932 			if (IFCAP_RXCSUM & ifp->if_capenable) {
2933 				ifp->if_capenable &= ~IFCAP_RXCSUM;
2934 				sc->csum_flag = 0;
2935 			} else {
2936 				ifp->if_capenable |= IFCAP_RXCSUM;
2937 				sc->csum_flag = 1;
2938 			}
2939 		}
2940 		if (mask & IFCAP_TSO4) {
2941 			if (IFCAP_TSO4 & ifp->if_capenable) {
2942 				ifp->if_capenable &= ~IFCAP_TSO4;
2943 				ifp->if_hwassist &= ~CSUM_TSO;
2944 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2945 				ifp->if_capenable |= IFCAP_TSO4;
2946 				ifp->if_hwassist |= CSUM_TSO;
2947 			} else {
2948 				printf("mxge requires tx checksum offload"
2949 				       " be enabled to use TSO\n");
2950 				err = EINVAL;
2951 			}
2952 		}
2953 		mtx_unlock(&sc->driver_mtx);
2954 		break;
2955 
2956 	case SIOCGIFMEDIA:
2957 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2958 				    &sc->media, command);
2959 		break;
2960 
2961 	default:
2962 		err = ENOTTY;
2963 	}
2964 	return err;
2965 }
2966 
2967 static void
2968 mxge_fetch_tunables(mxge_softc_t *sc)
2969 {
2970 
2971 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2972 			  &mxge_flow_control);
2973 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2974 			  &mxge_intr_coal_delay);
2975 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2976 			  &mxge_nvidia_ecrc_enable);
2977 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
2978 			  &mxge_force_firmware);
2979 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2980 			  &mxge_deassert_wait);
2981 	TUNABLE_INT_FETCH("hw.mxge.verbose",
2982 			  &mxge_verbose);
2983 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
2984 
2985 	if (bootverbose)
2986 		mxge_verbose = 1;
2987 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2988 		mxge_intr_coal_delay = 30;
2989 	if (mxge_ticks == 0)
2990 		mxge_ticks = hz;
2991 	sc->pause = mxge_flow_control;
2992 }
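
/*
 * Example (illustrative only): since these are fetched with
 * TUNABLE_INT_FETCH, the knobs above can be set from
 * /boot/loader.conf before the driver loads, e.g.:
 *
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 *	hw.mxge.verbose="1"
 */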
2993 
2994 static int
2995 mxge_attach(device_t dev)
2996 {
2997 	mxge_softc_t *sc = device_get_softc(dev);
2998 	struct ifnet *ifp;
2999 	size_t bytes;
3000 	int count, rid, err;
3001 
3002 	sc->dev = dev;
3003 	mxge_fetch_tunables(sc);
3004 
3005 	err = bus_dma_tag_create(NULL,			/* parent */
3006 				 1,			/* alignment */
3007 				 4096,			/* boundary */
3008 				 BUS_SPACE_MAXADDR,	/* low */
3009 				 BUS_SPACE_MAXADDR,	/* high */
3010 				 NULL, NULL,		/* filter */
3011 				 65536 + 256,		/* maxsize */
3012 				 MXGE_MAX_SEND_DESC, 	/* num segs */
3013 				 4096,			/* maxsegsize */
3014 				 0,			/* flags */
3015 				 NULL, NULL,		/* lock */
3016 				 &sc->parent_dmat);	/* tag */
3017 
3018 	if (err != 0) {
3019 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
3020 			      err);
3021 		goto abort_with_nothing;
3022 	}
3023 
3024 	ifp = sc->ifp = if_alloc(IFT_ETHER);
3025 	if (ifp == NULL) {
3026 		device_printf(dev, "can not if_alloc()\n");
3027 		err = ENOSPC;
3028 		goto abort_with_parent_dmat;
3029 	}
3030 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
3031 		 device_get_nameunit(dev));
3032 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
3033 	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
3034 		 device_get_nameunit(dev));
3035 	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
3036 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
3037 		 "%s:drv", device_get_nameunit(dev));
3038 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
3039 		 MTX_NETWORK_LOCK, MTX_DEF);
3040 
3041 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
3042 
3043 	mxge_setup_cfg_space(sc);
3044 
3045 	/* Map the board into the kernel */
3046 	rid = PCIR_BARS;
3047 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
3048 					 ~0, 1, RF_ACTIVE);
3049 	if (sc->mem_res == NULL) {
3050 		device_printf(dev, "could not map memory\n");
3051 		err = ENXIO;
3052 		goto abort_with_lock;
3053 	}
3054 	sc->sram = rman_get_virtual(sc->mem_res);
3055 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
3056 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
3057 		device_printf(dev, "impossible memory region size %ld\n",
3058 			      rman_get_size(sc->mem_res));
3059 		err = ENXIO;
3060 		goto abort_with_mem_res;
3061 	}
3062 
3063 	/* make a NUL-terminated copy of the EEPROM strings section
3064 	   of lanai SRAM */
3065 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
3066 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
3067 				rman_get_bushandle(sc->mem_res),
3068 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
3069 				sc->eeprom_strings,
3070 				MXGE_EEPROM_STRINGS_SIZE - 2);
3071 	err = mxge_parse_strings(sc);
3072 	if (err != 0)
3073 		goto abort_with_mem_res;
3074 
3075 	/* Enable write combining for efficient use of PCIe bus */
3076 	mxge_enable_wc(sc);
3077 
3078 	/* Allocate the out of band dma memory */
3079 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
3080 			     sizeof (mxge_cmd_t), 64);
3081 	if (err != 0)
3082 		goto abort_with_mem_res;
3083 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
3084 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
3085 	if (err != 0)
3086 		goto abort_with_cmd_dma;
3087 
3088 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
3089 			     sizeof (*sc->fw_stats), 64);
3090 	if (err != 0)
3091 		goto abort_with_zeropad_dma;
3092 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
3093 
3094 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
3095 	if (err != 0)
3096 		goto abort_with_fw_stats;
3097 
3098 	/* allocate interrupt queues */
3099 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
3100 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
3101 	if (err != 0)
3102 		goto abort_with_dmabench;
3103 	sc->rx_done.entry = sc->rx_done.dma.addr;
3104 	bzero(sc->rx_done.entry, bytes);
3105 
3106 	/* Add our ithread  */
3107 	count = pci_msi_count(dev);
3108 	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
3109 		rid = 1;
3110 		sc->msi_enabled = 1;
3111 	} else {
3112 		rid = 0;
3113 	}
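	/* MSI vectors are allocated starting at SYS_RES_IRQ rid 1;
	   rid 0 is the legacy INTx line */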
3114 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3115 					 1, RF_SHAREABLE | RF_ACTIVE);
3116 	if (sc->irq_res == NULL) {
3117 		device_printf(dev, "could not alloc interrupt\n");
3118 		goto abort_with_rx_done;
3119 	}
3120 	if (mxge_verbose)
3121 		device_printf(dev, "using %s irq %ld\n",
3122 			      sc->msi_enabled ? "MSI" : "INTx",
3123 			      rman_get_start(sc->irq_res));
3124 	/* select & load the firmware */
3125 	err = mxge_select_firmware(sc);
3126 	if (err != 0)
3127 		goto abort_with_irq_res;
3128 	sc->intr_coal_delay = mxge_intr_coal_delay;
3129 	err = mxge_reset(sc);
3130 	if (err != 0)
3131 		goto abort_with_irq_res;
3132 
3133 	err = mxge_alloc_rings(sc);
3134 	if (err != 0) {
3135 		device_printf(sc->dev, "failed to allocate rings\n");
3136 		goto abort_with_irq_res;
3137 	}
3138 
3139 	err = bus_setup_intr(sc->dev, sc->irq_res,
3140 			     INTR_TYPE_NET | INTR_MPSAFE,
3141 			     NULL, mxge_intr, sc, &sc->ih);
3142 	if (err != 0) {
3143 		goto abort_with_rings;
3144 	}
3145 	/* hook into the network stack */
3146 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3147 	ifp->if_baudrate = 100000000;
3148 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3149 		IFCAP_JUMBO_MTU;
3150 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3151 	ifp->if_capenable = ifp->if_capabilities;
3152 	sc->csum_flag = 1;
3153 	ifp->if_init = mxge_init;
3154 	ifp->if_softc = sc;
3155 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3156 	ifp->if_ioctl = mxge_ioctl;
3157 	ifp->if_start = mxge_start;
3158 	ether_ifattach(ifp, sc->mac_addr);
3159 	/* ether_ifattach sets mtu to 1500 */
3160 	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
3161 
3162 	/* Initialise the ifmedia structure */
3163 	ifmedia_init(&sc->media, 0, mxge_media_change,
3164 		     mxge_media_status);
3165 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3166 	mxge_add_sysctls(sc);
3167 	return 0;
3168 
3169 abort_with_rings:
3170 	mxge_free_rings(sc);
3171 abort_with_irq_res:
3172 	bus_release_resource(dev, SYS_RES_IRQ,
3173 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3174 	if (sc->msi_enabled)
3175 		pci_release_msi(dev);
3176 abort_with_rx_done:
3177 	sc->rx_done.entry = NULL;
3178 	mxge_dma_free(&sc->rx_done.dma);
3179 abort_with_dmabench:
3180 	mxge_dma_free(&sc->dmabench_dma);
3181 abort_with_fw_stats:
3182 	mxge_dma_free(&sc->fw_stats_dma);
3183 abort_with_zeropad_dma:
3184 	mxge_dma_free(&sc->zeropad_dma);
3185 abort_with_cmd_dma:
3186 	mxge_dma_free(&sc->cmd_dma);
3187 abort_with_mem_res:
3188 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3189 abort_with_lock:
3190 	pci_disable_busmaster(dev);
3191 	mtx_destroy(&sc->cmd_mtx);
3192 	mtx_destroy(&sc->tx_mtx);
3193 	mtx_destroy(&sc->driver_mtx);
3194 	if_free(ifp);
3195 abort_with_parent_dmat:
3196 	bus_dma_tag_destroy(sc->parent_dmat);
3197 
3198 abort_with_nothing:
3199 	return err;
3200 }
3201 
3202 static int
3203 mxge_detach(device_t dev)
3204 {
3205 	mxge_softc_t *sc = device_get_softc(dev);
3206 
3207 	mtx_lock(&sc->driver_mtx);
3208 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3209 		mxge_close(sc);
3210 	callout_stop(&sc->co_hdl);
3211 	mtx_unlock(&sc->driver_mtx);
3212 	ether_ifdetach(sc->ifp);
3213 	ifmedia_removeall(&sc->media);
3214 	mxge_dummy_rdma(sc, 0);
3215 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3216 	mxge_free_rings(sc);
3217 	bus_release_resource(dev, SYS_RES_IRQ,
3218 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3219 	if (sc->msi_enabled)
3220 		pci_release_msi(dev);
3221 
3222 	sc->rx_done.entry = NULL;
3223 	mxge_dma_free(&sc->rx_done.dma);
3224 	mxge_dma_free(&sc->fw_stats_dma);
3225 	mxge_dma_free(&sc->dmabench_dma);
3226 	mxge_dma_free(&sc->zeropad_dma);
3227 	mxge_dma_free(&sc->cmd_dma);
3228 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3229 	pci_disable_busmaster(dev);
3230 	mtx_destroy(&sc->cmd_mtx);
3231 	mtx_destroy(&sc->tx_mtx);
3232 	mtx_destroy(&sc->driver_mtx);
3233 	if_free(sc->ifp);
3234 	bus_dma_tag_destroy(sc->parent_dmat);
3235 	return 0;
3236 }
3237 
3238 static int
3239 mxge_shutdown(device_t dev)
3240 {
3241 	return 0;
3242 }
3243 
3244 /*
3245   This file uses Myri10GE driver indentation.
3246 
3247   Local Variables:
3248   c-file-style:"linux"
3249   tab-width:8
3250   End:
3251 */
3252