xref: /freebsd/sys/dev/mxge/if_mxge.c (revision bfe691b2f75de2224c7ceb304ebcdef2b42d4179)
1 /******************************************************************************
2 
3 Copyright (c) 2006, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Myricom Inc, nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/linker.h>
40 #include <sys/firmware.h>
41 #include <sys/endian.h>
42 #include <sys/sockio.h>
43 #include <sys/mbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/kdb.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/module.h>
49 #include <sys/memrange.h>
50 #include <sys/socket.h>
51 #include <sys/sysctl.h>
52 #include <sys/sx.h>
53 
54 #include <net/if.h>
55 #include <net/if_arp.h>
56 #include <net/ethernet.h>
57 #include <net/if_dl.h>
58 #include <net/if_media.h>
59 
60 #include <net/bpf.h>
61 
62 #include <net/if_types.h>
63 #include <net/if_vlan_var.h>
64 #include <net/zlib.h>
65 
66 #include <netinet/in_systm.h>
67 #include <netinet/in.h>
68 #include <netinet/ip.h>
69 #include <netinet/tcp.h>
70 
71 #include <machine/bus.h>
72 #include <machine/resource.h>
73 #include <sys/bus.h>
74 #include <sys/rman.h>
75 
76 #include <dev/pci/pcireg.h>
77 #include <dev/pci/pcivar.h>
78 
79 #include <vm/vm.h>		/* for pmap_mapdev() */
80 #include <vm/pmap.h>
81 
82 #include <dev/mxge/mxge_mcp.h>
83 #include <dev/mxge/mcp_gen_header.h>
84 #include <dev/mxge/if_mxge_var.h>
85 
86 /* tunable params */
87 static int mxge_nvidia_ecrc_enable = 1;
88 static int mxge_force_firmware = 0;
89 static int mxge_max_intr_slots = 1024;
90 static int mxge_intr_coal_delay = 30;
91 static int mxge_deassert_wait = 1;
92 static int mxge_flow_control = 1;
93 static int mxge_verbose = 0;
94 static int mxge_ticks;
95 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
96 static char *mxge_fw_aligned = "mxge_eth_z8e";
97 
98 static int mxge_probe(device_t dev);
99 static int mxge_attach(device_t dev);
100 static int mxge_detach(device_t dev);
101 static int mxge_shutdown(device_t dev);
102 static void mxge_intr(void *arg);
103 
104 static device_method_t mxge_methods[] =
105 {
106   /* Device interface */
107   DEVMETHOD(device_probe, mxge_probe),
108   DEVMETHOD(device_attach, mxge_attach),
109   DEVMETHOD(device_detach, mxge_detach),
110   DEVMETHOD(device_shutdown, mxge_shutdown),
111   {0, 0}
112 };
113 
114 static driver_t mxge_driver =
115 {
116   "mxge",
117   mxge_methods,
118   sizeof(mxge_softc_t),
119 };
120 
121 static devclass_t mxge_devclass;
122 
123 /* Declare ourselves to be a child of the PCI bus.*/
124 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
125 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
126 
127 static int
128 mxge_probe(device_t dev)
129 {
130   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
131       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
132 	  device_set_desc(dev, "Myri10G-PCIE-8A");
133 	  return 0;
134   }
135   return ENXIO;
136 }
137 
138 static void
139 mxge_enable_wc(mxge_softc_t *sc)
140 {
141 	struct mem_range_desc mrdesc;
142 	vm_paddr_t pa;
143 	vm_offset_t len;
144 	int err, action;
145 
146 	pa = rman_get_start(sc->mem_res);
147 	len = rman_get_size(sc->mem_res);
148 	mrdesc.mr_base = pa;
149 	mrdesc.mr_len = len;
150 	mrdesc.mr_flags = MDF_WRITECOMBINE;
151 	action = MEMRANGE_SET_UPDATE;
152 	strcpy((char *)&mrdesc.mr_owner, "mxge");
153 	err = mem_range_attr_set(&mrdesc, &action);
154 	if (err != 0) {
155 		device_printf(sc->dev,
156 			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
157 			      (unsigned long)pa, (unsigned long)len, err);
158 	} else {
159 		sc->wc = 1;
160 	}
161 }
162 
163 
164 /* callback to get our DMA address */
165 static void
166 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
167 			 int error)
168 {
169 	if (error == 0) {
170 		*(bus_addr_t *) arg = segs->ds_addr;
171 	}
172 }
173 
174 static int
175 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
176 		   bus_size_t alignment)
177 {
178 	int err;
179 	device_t dev = sc->dev;
180 
181 	/* allocate DMAable memory tags */
182 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
183 				 alignment,		/* alignment */
184 				 4096,			/* boundary */
185 				 BUS_SPACE_MAXADDR,	/* low */
186 				 BUS_SPACE_MAXADDR,	/* high */
187 				 NULL, NULL,		/* filter */
188 				 bytes,			/* maxsize */
189 				 1,			/* num segs */
190 				 4096,			/* maxsegsize */
191 				 BUS_DMA_COHERENT,	/* flags */
192 				 NULL, NULL,		/* lock */
193 				 &dma->dmat);		/* tag */
194 	if (err != 0) {
195 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
196 		return err;
197 	}
198 
199 	/* allocate DMAable memory & map */
200 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
201 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
202 				| BUS_DMA_ZERO),  &dma->map);
203 	if (err != 0) {
204 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
205 		goto abort_with_dmat;
206 	}
207 
208 	/* load the memory */
209 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
210 			      mxge_dmamap_callback,
211 			      (void *)&dma->bus_addr, 0);
212 	if (err != 0) {
213 		device_printf(dev, "couldn't load map (err = %d)\n", err);
214 		goto abort_with_mem;
215 	}
216 	return 0;
217 
218 abort_with_mem:
219 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
220 abort_with_dmat:
221 	(void)bus_dma_tag_destroy(dma->dmat);
222 	return err;
223 }
224 
225 
226 static void
227 mxge_dma_free(mxge_dma_t *dma)
228 {
229 	bus_dmamap_unload(dma->dmat, dma->map);
230 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
231 	(void)bus_dma_tag_destroy(dma->dmat);
232 }
233 
234 /*
235  * The eeprom strings on the lanaiX have the format
236  * SN=x\0
237  * MAC=x:x:x:x:x:x\0
238  * PC=text\0
239  */
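/*
 * Illustrative example only (not dumped from a real board): the
 * strings region might hold
 *   "SN=123456\0MAC=00:60:dd:47:12:34\0PC=M3F2-PCIXE-2\0\0"
 * and mxge_parse_strings() below walks it one NUL-terminated
 * string at a time.
 */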
240 
241 static int
242 mxge_parse_strings(mxge_softc_t *sc)
243 {
244 #define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)
245 
246 	char *ptr, *limit;
247 	int i, found_mac;
248 
249 	ptr = sc->eeprom_strings;
250 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
251 	found_mac = 0;
252 	while (ptr < limit && *ptr != '\0') {
253 		if (memcmp(ptr, "MAC=", 4) == 0) {
254 			ptr += 1;
255 			sc->mac_addr_string = ptr;
256 			for (i = 0; i < 6; i++) {
257 				ptr += 3;
258 				if ((ptr + 2) > limit)
259 					goto abort;
260 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
261 				found_mac = 1;
262 			}
263 		} else if (memcmp(ptr, "PC=", 3) == 0) {
264 			ptr += 3;
265 			strncpy(sc->product_code_string, ptr,
266 				sizeof (sc->product_code_string) - 1);
267 		} else if (memcmp(ptr, "SN=", 3) == 0) {
268 			ptr += 3;
269 			strncpy(sc->serial_number_string, ptr,
270 				sizeof (sc->serial_number_string) - 1);
271 		}
272 		MXGE_NEXT_STRING(ptr);
273 	}
274 
275 	if (found_mac)
276 		return 0;
277 
278  abort:
279 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
280 
281 	return ENXIO;
282 }
283 
284 #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
285 static int
286 mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
287 {
288 	uint32_t val;
289 	unsigned long off;
290 	char *va, *cfgptr;
291 	uint16_t vendor_id, device_id;
292 	uintptr_t bus, slot, func, ivend, idev;
293 	uint32_t *ptr32;
294 
295 	/* XXXX
296 	   Test below is commented out because it is believed that doing
297 	   a config read/write beyond 0xff will access the config space
298 	   of the next larger function.  Uncomment this and remove
299 	   the hacky pmap_mapdev() way of accessing config space when
300 	   FreeBSD grows support for extended PCIe config space access
301 	*/
302 #if 0
303 	/* See if we can, by some miracle, access the extended
304 	   config space */
305 	val = pci_read_config(pdev, 0x178, 4);
306 	if (val != 0xffffffff) {
307 		val |= 0x40;
308 		pci_write_config(pdev, 0x178, val, 4);
309 		return 0;
310 	}
311 #endif
312 	/* Rather than using normal pci config space writes, we must
313 	 * map the Nvidia config space ourselves.  This is because on
314 	 * opteron/nvidia class machines the 0xe0000000 mapping is
315 	 * handled by the nvidia chipset, which means the internal PCI
316 	 * device (the on-chip northbridge), or the amd-8131 bridge
317 	 * and things behind them are not visible via this method.
318 	 */
319 
320 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
321 		      PCI_IVAR_BUS, &bus);
322 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
323 		      PCI_IVAR_SLOT, &slot);
324 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
325 		      PCI_IVAR_FUNCTION, &func);
326 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
327 		      PCI_IVAR_VENDOR, &ivend);
328 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
329 		      PCI_IVAR_DEVICE, &idev);
330 
331 	off =  0xe0000000UL
332 		+ 0x00100000UL * (unsigned long)bus
333 		+ 0x00001000UL * (unsigned long)(func
334 						 + 8 * slot);
335 
336 	/* map it into the kernel */
337 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
338 
339 
340 	if (va == NULL) {
341 		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
342 		return EIO;
343 	}
344 	/* get a pointer to the config space mapped into the kernel */
345 	cfgptr = va + (off & PAGE_MASK);
346 
347 	/* make sure that we can really access it */
348 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
349 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
350 	if (! (vendor_id == ivend && device_id == idev)) {
351 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
352 			      vendor_id, device_id);
353 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
354 		return EIO;
355 	}
356 
357 	ptr32 = (uint32_t*)(cfgptr + 0x178);
358 	val = *ptr32;
359 
360 	if (val == 0xffffffff) {
361 		device_printf(sc->dev, "extended mapping failed\n");
362 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
363 		return EIO;
364 	}
365 	*ptr32 = val | 0x40;
366 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
367 	if (mxge_verbose)
368 		device_printf(sc->dev,
369 			      "Enabled ECRC on upstream Nvidia bridge "
370 			      "at %d:%d:%d\n",
371 			      (int)bus, (int)slot, (int)func);
372 	return 0;
373 }
374 #else
375 static int
376 mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
377 {
378 	device_printf(sc->dev,
379 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
380 	return ENXIO;
381 }
382 #endif
383 /*
384  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
385  * when the PCI-E Completion packets are aligned on an 8-byte
386  * boundary.  Some PCI-E chip sets always align Completion packets; on
387  * the ones that do not, the alignment can be enforced by enabling
388  * ECRC generation (if supported).
389  *
390  * When PCI-E Completion packets are not aligned, it is actually more
391  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
392  *
393  * If the driver can neither enable ECRC nor verify that it has
394  * already been enabled, then it must use a firmware image which works
395  * around unaligned completion packets (ethp_z8e.dat), and it should
396  * also ensure that it never gives the device a Read-DMA which is
397  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
398  * enabled, then the driver should use the aligned (eth_z8e.dat)
399  * firmware image, and set tx.boundary to 4KB.
400  */
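/*
 * In short, the selection made at the abort: label below is:
 *   aligned completions   -> mxge_eth_z8e  firmware, tx.boundary = 4096
 *   unaligned completions -> mxge_ethp_z8e firmware, tx.boundary = 2048
 */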
401 
402 static void
403 mxge_select_firmware(mxge_softc_t *sc)
404 {
405 	int err, aligned = 0;
406 	device_t pdev;
407 	uint16_t pvend, pdid;
408 
409 
410 	if (mxge_force_firmware != 0) {
411 		if (mxge_force_firmware == 1)
412 			aligned = 1;
413 		else
414 			aligned = 0;
415 		if (mxge_verbose)
416 			device_printf(sc->dev,
417 				      "Assuming %s completions (forced)\n",
418 				      aligned ? "aligned" : "unaligned");
419 		goto abort;
420 	}
421 
422 	/* if the PCIe link width is 4 or less, we can use the aligned
423 	   firmware and skip any checks */
424 	if (sc->link_width != 0 && sc->link_width <= 4) {
425 		device_printf(sc->dev,
426 			      "PCIe x%d Link, expect reduced performance\n",
427 			      sc->link_width);
428 		aligned = 1;
429 		goto abort;
430 	}
431 
432 	pdev = device_get_parent(device_get_parent(sc->dev));
433 	if (pdev == NULL) {
434 		device_printf(sc->dev, "could not find parent?\n");
435 		goto abort;
436 	}
437 	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
438 	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);
439 
440 	/* see if we can enable ECRC generation on an upstream
441 	   Nvidia bridge */
442 	if (mxge_nvidia_ecrc_enable &&
443 	    (pvend == 0x10de && pdid == 0x005d)) {
444 		err = mxge_enable_nvidia_ecrc(sc, pdev);
445 		if (err == 0) {
446 			aligned = 1;
447 			if (mxge_verbose)
448 				device_printf(sc->dev,
449 					      "Assuming aligned completions"
450 					      " (ECRC)\n");
451 		}
452 	}
453 	/* see if the upstream bridge is known to
454 	   provide aligned completions */
455 	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
456 	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
457 	    /* Intel */  (pvend == 0x8086 &&
458 	      /* E5000 NorthBridge*/((pdid >= 0x25f7 && pdid <= 0x25fa) ||
459 	      /* E5000 SouthBridge*/ (pdid >= 0x3510 && pdid <= 0x351b)))) {
460 		aligned = 1;
461 		if (mxge_verbose)
462 			device_printf(sc->dev,
463 				      "Assuming aligned completions "
464 				      "(0x%x:0x%x)\n", pvend, pdid);
465 	}
466 
467 abort:
468 	if (aligned) {
469 		sc->fw_name = mxge_fw_aligned;
470 		sc->tx.boundary = 4096;
471 	} else {
472 		sc->fw_name = mxge_fw_unaligned;
473 		sc->tx.boundary = 2048;
474 	}
475 }
476 
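/* used to cast away const so the firmware image can be handed to
   mxge_pio_copy(), which takes a non-const source pointer */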
477 union qualhack
478 {
479         const char *ro_char;
480         char *rw_char;
481 };
482 
483 static int
484 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
485 {
486 
487 
488 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
489 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
490 			      be32toh(hdr->mcp_type));
491 		return EIO;
492 	}
493 
494 	/* save firmware version for sysctl */
495 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
496 	if (mxge_verbose)
497 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
498 
499 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
500 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
501 
502 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
503 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
504 		device_printf(sc->dev, "Found firmware version %s\n",
505 			      sc->fw_version);
506 		device_printf(sc->dev, "Driver needs %d.%d\n",
507 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
508 		return EINVAL;
509 	}
510 	return 0;
511 
512 }
513 
514 static int
515 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
516 {
517 	const struct firmware *fw;
518 	const mcp_gen_header_t *hdr;
519 	unsigned hdr_offset;
520 	const char *fw_data;
521 	union qualhack hack;
522 	int status;
523 	unsigned int i;
524 	char dummy;
525 
526 
527 	fw = firmware_get(sc->fw_name);
528 
529 	if (fw == NULL) {
530 		device_printf(sc->dev, "Could not find firmware image %s\n",
531 			      sc->fw_name);
532 		return ENOENT;
533 	}
534 	if (fw->datasize > *limit ||
535 	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
536 		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
537 			      sc->fw_name, (int)fw->datasize, (int) *limit);
538 		status = ENOSPC;
539 		goto abort_with_fw;
540 	}
541 	*limit = fw->datasize;
542 
543 	/* check id */
544 	fw_data = (const char *)fw->data;
545 	hdr_offset = htobe32(*(const uint32_t *)
546 			     (fw_data + MCP_HEADER_PTR_OFFSET));
547 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
548 		device_printf(sc->dev, "Bad firmware file");
549 		status = EIO;
550 		goto abort_with_fw;
551 	}
552 	hdr = (const void*)(fw_data + hdr_offset);
553 
554 	status = mxge_validate_firmware(sc, hdr);
555 	if (status != 0)
556 		goto abort_with_fw;
557 
558 	hack.ro_char = fw_data;
559 	/* Copy the inflated firmware to NIC SRAM. */
560 	for (i = 0; i < *limit; i += 256) {
561 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
562 			      hack.rw_char + i,
563 			      min(256U, (unsigned)(*limit - i)));
564 		mb();
565 		dummy = *sc->sram;
566 		mb();
567 	}
568 
569 	status = 0;
570 abort_with_fw:
571 	firmware_put(fw, FIRMWARE_UNLOAD);
572 	return status;
573 }
574 
575 /*
576  * Enable or disable periodic RDMAs from the host to make certain
577  * chipsets resend dropped PCIe messages
578  */
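/*
 * Note the handshake used here (and again in mxge_load_firmware()):
 * the host zeroes the shared confirmation word, posts a request at a
 * fixed SRAM offset, then polls for the firmware to write 0xffffffff
 * back.
 */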
579 
580 static void
581 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
582 {
583 	char buf_bytes[72];
584 	volatile uint32_t *confirm;
585 	volatile char *submit;
586 	uint32_t *buf, dma_low, dma_high;
587 	int i;
588 
589 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
590 
591 	/* clear confirmation addr */
592 	confirm = (volatile uint32_t *)sc->cmd;
593 	*confirm = 0;
594 	mb();
595 
596 	/* send an rdma command to the PCIe engine, and wait for the
597 	   response in the confirmation address.  The firmware should
598 	   write a -1 there to indicate it is alive and well
599 	*/
600 
601 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
602 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
603 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
604 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
605 	buf[2] = htobe32(0xffffffff);		/* confirm data */
606 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
607 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
608 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
609 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
610 	buf[5] = htobe32(enable);			/* enable? */
611 
612 
613 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
614 
615 	mxge_pio_copy(submit, buf, 64);
616 	mb();
617 	DELAY(1000);
618 	mb();
619 	i = 0;
620 	while (*confirm != 0xffffffff && i < 20) {
621 		DELAY(1000);
622 		i++;
623 	}
624 	if (*confirm != 0xffffffff) {
625 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
626 			      (enable ? "enable" : "disable"), confirm,
627 			      *confirm);
628 	}
629 	return;
630 }
631 
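/*
 * Post a single command at the firmware's command offset in SRAM and
 * poll up to ~20ms for the DMA'ed response; serialized by cmd_mtx.
 */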
632 static int
633 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
634 {
635 	mcp_cmd_t *buf;
636 	char buf_bytes[sizeof(*buf) + 8];
637 	volatile mcp_cmd_response_t *response = sc->cmd;
638 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
639 	uint32_t dma_low, dma_high;
640 	int sleep_total = 0;
641 
642 	/* ensure buf is aligned to 8 bytes */
643 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
644 
645 	buf->data0 = htobe32(data->data0);
646 	buf->data1 = htobe32(data->data1);
647 	buf->data2 = htobe32(data->data2);
648 	buf->cmd = htobe32(cmd);
649 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
650 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
651 
652 	buf->response_addr.low = htobe32(dma_low);
653 	buf->response_addr.high = htobe32(dma_high);
654 	mtx_lock(&sc->cmd_mtx);
655 	response->result = 0xffffffff;
656 	mb();
657 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
658 
659 	/* wait up to 20ms */
660 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
661 		bus_dmamap_sync(sc->cmd_dma.dmat,
662 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
663 		mb();
664 		if (response->result != 0xffffffff) {
665 			if (response->result == 0) {
666 				data->data0 = be32toh(response->data);
667 				mtx_unlock(&sc->cmd_mtx);
668 				return 0;
669 			} else {
670 				device_printf(sc->dev,
671 					      "mxge: command %d "
672 					      "failed, result = %d\n",
673 					      cmd, be32toh(response->result));
674 				mtx_unlock(&sc->cmd_mtx);
675 				return ENXIO;
676 			}
677 		}
678 		DELAY(1000);
679 	}
680 	mtx_unlock(&sc->cmd_mtx);
681 	device_printf(sc->dev, "mxge: command %d timed out"
682 		      "result = %d\n",
683 		      cmd, be32toh(response->result));
684 	return EAGAIN;
685 }
686 
687 static int
688 mxge_adopt_running_firmware(mxge_softc_t *sc)
689 {
690 	struct mcp_gen_header *hdr;
691 	const size_t bytes = sizeof (struct mcp_gen_header);
692 	size_t hdr_offset;
693 	int status;
694 
695 	/* find running firmware header */
696 	hdr_offset = htobe32(*(volatile uint32_t *)
697 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
698 
699 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
700 		device_printf(sc->dev,
701 			      "Running firmware has bad header offset (%d)\n",
702 			      (int)hdr_offset);
703 		return EIO;
704 	}
705 
706 	/* copy header of running firmware from SRAM to host memory to
707 	 * validate firmware */
708 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
709 	if (hdr == NULL) {
710 		device_printf(sc->dev, "could not malloc firmware hdr\n");
711 		return ENOMEM;
712 	}
713 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
714 				rman_get_bushandle(sc->mem_res),
715 				hdr_offset, (char *)hdr, bytes);
716 	status = mxge_validate_firmware(sc, hdr);
717 	free(hdr, M_DEVBUF);
718 
719 	/*
720 	 * check to see if the adopted firmware has a bug where adopting
721 	 * it will cause broadcasts to be filtered unless the NIC
722 	 * is kept in ALLMULTI mode
723 	 */
724 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
725 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
726 		sc->adopted_rx_filter_bug = 1;
727 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
728 			      "working around rx filter bug\n",
729 			      sc->fw_ver_major, sc->fw_ver_minor,
730 			      sc->fw_ver_tiny);
731 	}
732 
733 	return status;
734 }
735 
736 
737 static int
738 mxge_load_firmware(mxge_softc_t *sc)
739 {
740 	volatile uint32_t *confirm;
741 	volatile char *submit;
742 	char buf_bytes[72];
743 	uint32_t *buf, size, dma_low, dma_high;
744 	int status, i;
745 
746 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
747 
748 	size = sc->sram_size;
749 	status = mxge_load_firmware_helper(sc, &size);
750 	if (status) {
751 		/* Try to use the currently running firmware, if
752 		   it is new enough */
753 		status = mxge_adopt_running_firmware(sc);
754 		if (status) {
755 			device_printf(sc->dev,
756 				      "failed to adopt running firmware\n");
757 			return status;
758 		}
759 		device_printf(sc->dev,
760 			      "Successfully adopted running firmware\n");
761 		if (sc->tx.boundary == 4096) {
762 			device_printf(sc->dev,
763 				"Using firmware currently running on NIC"
764 				 ".  For optimal\n");
765 			device_printf(sc->dev,
766 				 "performance consider loading optimized "
767 				 "firmware\n");
768 		}
769 		sc->fw_name = mxge_fw_unaligned;
770 		sc->tx.boundary = 2048;
771 		return 0;
772 	}
773 	/* clear confirmation addr */
774 	confirm = (volatile uint32_t *)sc->cmd;
775 	*confirm = 0;
776 	mb();
777 	/* send a reload command to the bootstrap MCP, and wait for the
778 	   response in the confirmation address.  The firmware should
779 	   write a -1 there to indicate it is alive and well
780 	*/
781 
782 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
783 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
784 
785 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
786 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
787 	buf[2] = htobe32(0xffffffff);	/* confirm data */
788 
789 	/* FIX: All newest firmware should un-protect the bottom of
790 	   the sram before handoff. However, the very first interfaces
791 	   do not. Therefore the handoff copy must skip the first 8 bytes
792 	*/
793 					/* where the code starts*/
794 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
795 	buf[4] = htobe32(size - 8); 	/* length of code */
796 	buf[5] = htobe32(8);		/* where to copy to */
797 	buf[6] = htobe32(0);		/* where to jump to */
798 
799 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
800 	mxge_pio_copy(submit, buf, 64);
801 	mb();
802 	DELAY(1000);
803 	mb();
804 	i = 0;
805 	while (*confirm != 0xffffffff && i < 20) {
806 		DELAY(1000*10);
807 		i++;
808 		bus_dmamap_sync(sc->cmd_dma.dmat,
809 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
810 	}
811 	if (*confirm != 0xffffffff) {
812 		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
813 			confirm, *confirm);
814 
815 		return ENXIO;
816 	}
817 	return 0;
818 }
819 
820 static int
821 mxge_update_mac_address(mxge_softc_t *sc)
822 {
823 	mxge_cmd_t cmd;
824 	uint8_t *addr = sc->mac_addr;
825 	int status;
826 
827 
828 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
829 		     | (addr[2] << 8) | addr[3]);
830 
831 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
832 
833 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
834 	return status;
835 }
836 
837 static int
838 mxge_change_pause(mxge_softc_t *sc, int pause)
839 {
840 	mxge_cmd_t cmd;
841 	int status;
842 
843 	if (pause)
844 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
845 				       &cmd);
846 	else
847 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
848 				       &cmd);
849 
850 	if (status) {
851 		device_printf(sc->dev, "Failed to set flow control mode\n");
852 		return ENXIO;
853 	}
854 	sc->pause = pause;
855 	return 0;
856 }
857 
858 static void
859 mxge_change_promisc(mxge_softc_t *sc, int promisc)
860 {
861 	mxge_cmd_t cmd;
862 	int status;
863 
864 	if (promisc)
865 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
866 				       &cmd);
867 	else
868 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
869 				       &cmd);
870 
871 	if (status) {
872 		device_printf(sc->dev, "Failed to set promisc mode\n");
873 	}
874 }
875 
876 static void
877 mxge_set_multicast_list(mxge_softc_t *sc)
878 {
879 	mxge_cmd_t cmd;
880 	struct ifmultiaddr *ifma;
881 	struct ifnet *ifp = sc->ifp;
882 	int err;
883 
884 	/* This firmware is known to not support multicast */
885 	if (!sc->fw_multicast_support)
886 		return;
887 
888 	/* Disable multicast filtering while we play with the lists*/
889 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
890 	if (err != 0) {
891 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
892 		       " error status: %d\n", err);
893 		return;
894 	}
895 
896 	if (sc->adopted_rx_filter_bug)
897 		return;
898 
899 	if (ifp->if_flags & IFF_ALLMULTI)
900 		/* request to disable multicast filtering, so quit here */
901 		return;
902 
903 	/* Flush all the filters */
904 
905 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
906 	if (err != 0) {
907 		device_printf(sc->dev,
908 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
909 			      ", error status: %d\n", err);
910 		return;
911 	}
912 
913 	/* Walk the multicast list, and add each address */
914 
915 	IF_ADDR_LOCK(ifp);
916 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
917 		if (ifma->ifma_addr->sa_family != AF_LINK)
918 			continue;
919 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
920 		      &cmd.data0, 4);
921 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
922 		      &cmd.data1, 2);
923 		cmd.data0 = htonl(cmd.data0);
924 		cmd.data1 = htonl(cmd.data1);
925 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
926 		if (err != 0) {
927 			device_printf(sc->dev, "Failed "
928 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
929 			       "%d\t", err);
930 			/* abort, leaving multicast filtering off */
931 			IF_ADDR_UNLOCK(ifp);
932 			return;
933 		}
934 	}
935 	IF_ADDR_UNLOCK(ifp);
936 	/* Enable multicast filtering */
937 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
938 	if (err != 0) {
939 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
940 		       ", error status: %d\n", err);
941 	}
942 }
943 
944 
945 static int
946 mxge_reset(mxge_softc_t *sc)
947 {
948 
949 	mxge_cmd_t cmd;
950 	size_t bytes;
951 	int status;
952 
953 	/* try to send a reset command to the card to see if it
954 	   is alive */
955 	memset(&cmd, 0, sizeof (cmd));
956 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
957 	if (status != 0) {
958 		device_printf(sc->dev, "failed reset\n");
959 		return ENXIO;
960 	}
961 
962 	mxge_dummy_rdma(sc, 1);
963 
964 	/* Now exchange information about interrupts  */
965 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
966 	memset(sc->rx_done.entry, 0, bytes);
967 	cmd.data0 = (uint32_t)bytes;
968 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
969 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
970 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
971 	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
972 
973 	status |= mxge_send_cmd(sc,
974 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
975 
976 
977 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
978 
979 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
980 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
981 
982 
983 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
984 				&cmd);
985 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
986 	if (status != 0) {
987 		device_printf(sc->dev, "failed set interrupt parameters\n");
988 		return status;
989 	}
990 
991 
992 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
993 
994 
995 	/* run a DMA benchmark */
996 	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
997 
998 	/* Read DMA */
999 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1000 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1001 	cmd.data2 = sc->tx.boundary * 0x10000;
1002 
1003 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1004 	if (status != 0)
1005 		device_printf(sc->dev, "read dma benchmark failed\n");
1006 	else
1007 		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1008 			(cmd.data0 & 0xffff);
1009 
1010 	/* Write DMA */
1011 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1012 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1013 	cmd.data2 = sc->tx.boundary * 0x1;
1014 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1015 	if (status != 0)
1016 		device_printf(sc->dev, "write dma benchmark failed\n");
1017 	else
1018 		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1019 			(cmd.data0 & 0xffff);
1020 	/* Read/Write DMA */
1021 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1022 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1023 	cmd.data2 = sc->tx.boundary * 0x10001;
1024 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1025 	if (status != 0)
1026 		device_printf(sc->dev, "read/write dma benchmark failed\n");
1027 	else
1028 		sc->read_write_dma =
1029 			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
1030 			(cmd.data0 & 0xffff);
1031 
1032 	/* reset mcp/driver shared state back to 0 */
1033 	bzero(sc->rx_done.entry, bytes);
1034 	sc->rx_done.idx = 0;
1035 	sc->rx_done.cnt = 0;
1036 	sc->tx.req = 0;
1037 	sc->tx.done = 0;
1038 	sc->tx.pkt_done = 0;
1039 	sc->tx.wake = 0;
1040 	sc->tx.stall = 0;
1041 	sc->rx_big.cnt = 0;
1042 	sc->rx_small.cnt = 0;
1043 	sc->rdma_tags_available = 15;
1044 	sc->fw_stats->valid = 0;
1045 	sc->fw_stats->send_done_count = 0;
1046 	status = mxge_update_mac_address(sc);
1047 	mxge_change_promisc(sc, 0);
1048 	mxge_change_pause(sc, sc->pause);
1049 	mxge_set_multicast_list(sc);
1050 	return status;
1051 }
1052 
1053 static int
1054 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1055 {
1056         mxge_softc_t *sc;
1057         unsigned int intr_coal_delay;
1058         int err;
1059 
1060         sc = arg1;
1061         intr_coal_delay = sc->intr_coal_delay;
1062         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1063         if (err != 0) {
1064                 return err;
1065         }
1066         if (intr_coal_delay == sc->intr_coal_delay)
1067                 return 0;
1068 
1069         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1070                 return EINVAL;
1071 
1072 	mtx_lock(&sc->driver_mtx);
1073 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1074 	sc->intr_coal_delay = intr_coal_delay;
1075 
1076 	mtx_unlock(&sc->driver_mtx);
1077         return err;
1078 }
1079 
1080 static int
1081 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1082 {
1083         mxge_softc_t *sc;
1084         unsigned int enabled;
1085         int err;
1086 
1087         sc = arg1;
1088         enabled = sc->pause;
1089         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1090         if (err != 0) {
1091                 return err;
1092         }
1093         if (enabled == sc->pause)
1094                 return 0;
1095 
1096 	mtx_lock(&sc->driver_mtx);
1097 	err = mxge_change_pause(sc, enabled);
1098 	mtx_unlock(&sc->driver_mtx);
1099         return err;
1100 }
1101 
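/*
 * Sysctl handler for the big-endian counters DMA'ed up by the
 * firmware: byte-swap the value into arg2 and let sysctl_handle_int()
 * export it read-only.
 */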
1102 static int
1103 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1104 {
1105         int err;
1106 
1107         if (arg1 == NULL)
1108                 return EFAULT;
1109         arg2 = be32toh(*(int *)arg1);
1110         arg1 = NULL;
1111         err = sysctl_handle_int(oidp, arg1, arg2, req);
1112 
1113         return err;
1114 }
1115 
1116 static void
1117 mxge_add_sysctls(mxge_softc_t *sc)
1118 {
1119 	struct sysctl_ctx_list *ctx;
1120 	struct sysctl_oid_list *children;
1121 	mcp_irq_data_t *fw;
1122 
1123 	ctx = device_get_sysctl_ctx(sc->dev);
1124 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1125 	fw = sc->fw_stats;
1126 
1127 	/* random information */
1128 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1129 		       "firmware_version",
1130 		       CTLFLAG_RD, &sc->fw_version,
1131 		       0, "firmware version");
1132 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1133 		       "serial_number",
1134 		       CTLFLAG_RD, &sc->serial_number_string,
1135 		       0, "serial number");
1136 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1137 		       "product_code",
1138 		       CTLFLAG_RD, &sc->product_code_string,
1139 		       0, "product_code");
1140 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1141 		       "pcie_link_width",
1142 		       CTLFLAG_RD, &sc->link_width,
1143 		       0, "tx_boundary");
1144 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1145 		       "tx_boundary",
1146 		       CTLFLAG_RD, &sc->tx.boundary,
1147 		       0, "tx_boundary");
1148 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1149 		       "write_combine",
1150 		       CTLFLAG_RD, &sc->wc,
1151 		       0, "write combining PIO?");
1152 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1153 		       "read_dma_MBs",
1154 		       CTLFLAG_RD, &sc->read_dma,
1155 		       0, "DMA Read speed in MB/s");
1156 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1157 		       "write_dma_MBs",
1158 		       CTLFLAG_RD, &sc->write_dma,
1159 		       0, "DMA Write speed in MB/s");
1160 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1161 		       "read_write_dma_MBs",
1162 		       CTLFLAG_RD, &sc->read_write_dma,
1163 		       0, "DMA concurrent Read/Write speed in MB/s");
1164 
1165 
1166 	/* performance related tunables */
1167 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1168 			"intr_coal_delay",
1169 			CTLTYPE_INT|CTLFLAG_RW, sc,
1170 			0, mxge_change_intr_coal,
1171 			"I", "interrupt coalescing delay in usecs");
1172 
1173 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1174 			"flow_control_enabled",
1175 			CTLTYPE_INT|CTLFLAG_RW, sc,
1176 			0, mxge_change_flow_control,
1177 			"I", "interrupt coalescing delay in usecs");
1178 
1179 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1180 		       "deassert_wait",
1181 		       CTLFLAG_RW, &mxge_deassert_wait,
1182 		       0, "Wait for IRQ line to go low in ihandler");
1183 
1184 	/* stats block from firmware is in network byte order.
1185 	   Need to swap it */
1186 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1187 			"link_up",
1188 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1189 			0, mxge_handle_be32,
1190 			"I", "link up");
1191 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1192 			"rdma_tags_available",
1193 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1194 			0, mxge_handle_be32,
1195 			"I", "rdma_tags_available");
1196 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1197 			"dropped_link_overflow",
1198 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1199 			0, mxge_handle_be32,
1200 			"I", "dropped_link_overflow");
1201 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1202 			"dropped_link_error_or_filtered",
1203 			CTLTYPE_INT|CTLFLAG_RD,
1204 			&fw->dropped_link_error_or_filtered,
1205 			0, mxge_handle_be32,
1206 			"I", "dropped_link_error_or_filtered");
1207 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1208 			"dropped_multicast_filtered",
1209 			CTLTYPE_INT|CTLFLAG_RD,
1210 			&fw->dropped_multicast_filtered,
1211 			0, mxge_handle_be32,
1212 			"I", "dropped_multicast_filtered");
1213 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1214 			"dropped_runt",
1215 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1216 			0, mxge_handle_be32,
1217 			"I", "dropped_runt");
1218 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1219 			"dropped_overrun",
1220 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1221 			0, mxge_handle_be32,
1222 			"I", "dropped_overrun");
1223 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1224 			"dropped_no_small_buffer",
1225 			CTLTYPE_INT|CTLFLAG_RD,
1226 			&fw->dropped_no_small_buffer,
1227 			0, mxge_handle_be32,
1228 			"I", "dropped_no_small_buffer");
1229 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1230 			"dropped_no_big_buffer",
1231 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1232 			0, mxge_handle_be32,
1233 			"I", "dropped_no_big_buffer");
1234 
1235 	/* host counters exported for debugging */
1236 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1237 		       "rx_small_cnt",
1238 		       CTLFLAG_RD, &sc->rx_small.cnt,
1239 		       0, "rx_small_cnt");
1240 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1241 		       "rx_big_cnt",
1242 		       CTLFLAG_RD, &sc->rx_big.cnt,
1243 		       0, "rx_small_cnt");
1244 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1245 		       "tx_req",
1246 		       CTLFLAG_RD, &sc->tx.req,
1247 		       0, "tx_req");
1248 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1249 		       "tx_done",
1250 		       CTLFLAG_RD, &sc->tx.done,
1251 		       0, "tx_done");
1252 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1253 		       "tx_pkt_done",
1254 		       CTLFLAG_RD, &sc->tx.pkt_done,
1255 		       0, "tx_done");
1256 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1257 		       "tx_stall",
1258 		       CTLFLAG_RD, &sc->tx.stall,
1259 		       0, "tx_stall");
1260 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1261 		       "tx_wake",
1262 		       CTLFLAG_RD, &sc->tx.wake,
1263 		       0, "tx_wake");
1264 
1265 	/* verbose printing? */
1266 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1267 		       "verbose",
1268 		       CTLFLAG_RW, &mxge_verbose,
1269 		       0, "verbose printing");
1270 
1271 }
1272 
1273 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1274    backwards one at a time and handle ring wraps */
1275 
1276 static inline void
1277 mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1278 			    mcp_kreq_ether_send_t *src, int cnt)
1279 {
1280         int idx, starting_slot;
1281         starting_slot = tx->req;
1282         while (cnt > 1) {
1283                 cnt--;
1284                 idx = (starting_slot + cnt) & tx->mask;
1285                 mxge_pio_copy(&tx->lanai[idx],
1286 			      &src[cnt], sizeof(*src));
1287                 mb();
1288         }
1289 }
1290 
1291 /*
1292  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1293  * at most 32 bytes at a time, so as to avoid involving the software
1294  * pio handler in the nic.   We re-write the first segment's flags
1295  * to mark them valid only after writing the entire chain
1296  */
1297 
1298 static inline void
1299 mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
1300                   int cnt)
1301 {
1302         int idx, i;
1303         uint32_t *src_ints;
1304 	volatile uint32_t *dst_ints;
1305         mcp_kreq_ether_send_t *srcp;
1306 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1307 	uint8_t last_flags;
1308 
1309         idx = tx->req & tx->mask;
1310 
1311 	last_flags = src->flags;
1312 	src->flags = 0;
1313         mb();
1314         dst = dstp = &tx->lanai[idx];
1315         srcp = src;
1316 
1317         if ((idx + cnt) < tx->mask) {
1318                 for (i = 0; i < (cnt - 1); i += 2) {
1319                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1320                         mb(); /* force write every 32 bytes */
1321                         srcp += 2;
1322                         dstp += 2;
1323                 }
1324         } else {
1325                 /* submit all but the first request, and ensure
1326                    that it is submitted below */
1327                 mxge_submit_req_backwards(tx, src, cnt);
1328                 i = 0;
1329         }
1330         if (i < cnt) {
1331                 /* submit the first request */
1332                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1333                 mb(); /* barrier before setting valid flag */
1334         }
1335 
1336         /* re-write the last 32-bits with the valid flags */
1337         src->flags = last_flags;
1338         src_ints = (uint32_t *)src;
1339         src_ints+=3;
1340         dst_ints = (volatile uint32_t *)dst;
1341         dst_ints+=3;
1342         *dst_ints =  *src_ints;
1343         tx->req += cnt;
1344         mb();
1345 }
1346 
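/*
 * Write-combining variant of the above: burst the requests into the
 * NIC's wc_fifo 64 bytes at a time, padding the final partial burst
 * from the deliberately over-sized source array.
 */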
1347 static inline void
1348 mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1349 {
1350     tx->req += cnt;
1351     mb();
1352     while (cnt >= 4) {
1353 	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
1354 	    mb();
1355 	    src += 4;
1356 	    cnt -= 4;
1357     }
1358     if (cnt > 0) {
1359 	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
1360 	       needs to be so that we don't overrun it */
1361 	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
1362 	    mb();
1363     }
1364 }
1365 
1366 static void
1367 mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
1368 {
1369 	mxge_tx_buf_t *tx;
1370 	mcp_kreq_ether_send_t *req;
1371 	bus_dma_segment_t *seg;
1372 	struct ether_header *eh;
1373 	struct ip *ip;
1374 	struct tcphdr *tcp;
1375 	uint32_t low, high_swapped;
1376 	int len, seglen, cum_len, cum_len_next;
1377 	int next_is_first, chop, cnt, rdma_count, small;
1378 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1379 	uint8_t flags, flags_next;
1380 	static int once;
1381 
1382 	mss = m->m_pkthdr.tso_segsz;
1383 
1384 	/* negative cum_len signifies to the
1385 	 * send loop that we are still in the
1386 	 * header portion of the TSO packet.
1387 	 */
1388 
1389 	/* ensure we have the ethernet, IP and TCP
1390 	   header together in the first mbuf, copy
1391 	   it to a scratch buffer if not */
1392 	if (__predict_false(m->m_len < sizeof (*eh)
1393 			    + sizeof (*ip))) {
1394 		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
1395 			   sc->scratch);
1396 		eh = (struct ether_header *)sc->scratch;
1397 	} else {
1398 		eh = mtod(m, struct ether_header *);
1399 	}
1400 	ip = (struct ip *) (eh + 1);
1401 	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
1402 			    + sizeof (*tcp))) {
1403 		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
1404 			   + sizeof (*tcp),  sc->scratch);
1405 		eh = (struct ether_header *) sc->scratch;
1406 		ip = (struct ip *) (eh + 1);
1407 	}
1408 
1409 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1410 	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));
1411 
1412 	/* TSO implies checksum offload on this hardware */
1413 	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
1414 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1415 
1416 
1417 	/* for TSO, pseudo_hdr_offset holds mss.
1418 	 * The firmware figures out where to put
1419 	 * the checksum by parsing the header. */
1420 	pseudo_hdr_offset = htobe16(mss);
1421 
1422 	tx = &sc->tx;
1423 	req = tx->req_list;
1424 	seg = tx->seg_list;
1425 	cnt = 0;
1426 	rdma_count = 0;
1427 	/* "rdma_count" is the number of RDMAs belonging to the
1428 	 * current packet BEFORE the current send request. For
1429 	 * non-TSO packets, this is equal to "count".
1430 	 * For TSO packets, rdma_count needs to be reset
1431 	 * to 0 after a segment cut.
1432 	 *
1433 	 * The rdma_count field of the send request is
1434 	 * the number of RDMAs of the packet starting at
1435 	 * that request. For TSO send requests with one or more cuts
1436 	 * in the middle, this is the number of RDMAs starting
1437 	 * after the last cut in the request. All previous
1438 	 * segments before the last cut implicitly have 1 RDMA.
1439 	 *
1440 	 * Since the number of RDMAs is not known beforehand,
1441 	 * it must be filled-in retroactively - after each
1442 	 * segmentation cut or at the end of the entire packet.
1443 	 */
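	/*
	 * Illustrative example (values not from a trace): with an mss of
	 * 1448, a descriptor whose payload crosses an mss boundary is
	 * flagged MXGEFW_FLAGS_TSO_CHOP, a descriptor that begins exactly
	 * on an mss boundary is flagged MXGEFW_FLAGS_FIRST, and
	 * rdma_count is reset at each cut, to be patched in
	 * retroactively via (req - rdma_count)->rdma_count.
	 */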
1444 
1445 	while (busdma_seg_cnt) {
1446 		/* Break the busdma segment up into pieces*/
1447 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1448 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1449 		len = seg->ds_len;
1450 
1451 		while (len) {
1452 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1453 			seglen = len;
1454 			cum_len_next = cum_len + seglen;
1455 			(req-rdma_count)->rdma_count = rdma_count + 1;
1456 			if (__predict_true(cum_len >= 0)) {
1457 				/* payload */
1458 				chop = (cum_len_next > mss);
1459 				cum_len_next = cum_len_next % mss;
1460 				next_is_first = (cum_len_next == 0);
1461 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1462 				flags_next |= next_is_first *
1463 					MXGEFW_FLAGS_FIRST;
1464 				rdma_count |= -(chop | next_is_first);
1465 				rdma_count += chop & !next_is_first;
1466 			} else if (cum_len_next >= 0) {
1467 				/* header ends */
1468 				rdma_count = -1;
1469 				cum_len_next = 0;
1470 				seglen = -cum_len;
1471 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1472 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1473 					MXGEFW_FLAGS_FIRST |
1474 					(small * MXGEFW_FLAGS_SMALL);
1475 			    }
1476 
1477 			req->addr_high = high_swapped;
1478 			req->addr_low = htobe32(low);
1479 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1480 			req->pad = 0;
1481 			req->rdma_count = 1;
1482 			req->length = htobe16(seglen);
1483 			req->cksum_offset = cksum_offset;
1484 			req->flags = flags | ((cum_len & 1) *
1485 					      MXGEFW_FLAGS_ALIGN_ODD);
1486 			low += seglen;
1487 			len -= seglen;
1488 			cum_len = cum_len_next;
1489 			flags = flags_next;
1490 			req++;
1491 			cnt++;
1492 			rdma_count++;
1493 			if (__predict_false(cksum_offset > seglen))
1494 				cksum_offset -= seglen;
1495 			else
1496 				cksum_offset = 0;
1497 			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
1498 				goto drop;
1499 		}
1500 		busdma_seg_cnt--;
1501 		seg++;
1502 	}
1503 	(req-rdma_count)->rdma_count = rdma_count;
1504 
1505 	do {
1506 		req--;
1507 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1508 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1509 
1510 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1511 	if (tx->wc_fifo == NULL)
1512 		mxge_submit_req(tx, tx->req_list, cnt);
1513 	else
1514 		mxge_submit_req_wc(tx, tx->req_list, cnt);
1515 	return;
1516 
1517 drop:
1518 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1519 	m_freem(m);
1520 	sc->ifp->if_oerrors++;
1521 	if (!once) {
1522 		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
1523 		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
1524 		once = 1;
1525 	}
1526 	return;
1527 
1528 }
1529 
1530 static void
1531 mxge_encap(mxge_softc_t *sc, struct mbuf *m)
1532 {
1533 	mcp_kreq_ether_send_t *req;
1534 	bus_dma_segment_t *seg;
1535 	struct mbuf *m_tmp;
1536 	struct ifnet *ifp;
1537 	mxge_tx_buf_t *tx;
1538 	struct ether_header *eh;
1539 	struct ip *ip;
1540 	int cnt, cum_len, err, i, idx, odd_flag;
1541 	uint16_t pseudo_hdr_offset;
1542         uint8_t flags, cksum_offset;
1543 
1544 
1545 
1546 	ifp = sc->ifp;
1547 	tx = &sc->tx;
1548 
1549 	/* (try to) map the frame for DMA */
1550 	idx = tx->req & tx->mask;
1551 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1552 				      m, tx->seg_list, &cnt,
1553 				      BUS_DMA_NOWAIT);
1554 	if (err == EFBIG) {
1555 		/* Too many segments in the chain.  Try
1556 		   to defrag */
1557 		m_tmp = m_defrag(m, M_NOWAIT);
1558 		if (m_tmp == NULL) {
1559 			goto drop;
1560 		}
1561 		m = m_tmp;
1562 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1563 					      tx->info[idx].map,
1564 					      m, tx->seg_list, &cnt,
1565 					      BUS_DMA_NOWAIT);
1566 	}
1567 	if (err != 0) {
1568 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1569 			      " packet len = %d\n", err, m->m_pkthdr.len);
1570 		goto drop;
1571 	}
1572 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1573 			BUS_DMASYNC_PREWRITE);
1574 	tx->info[idx].m = m;
1575 
1576 
1577 	/* TSO is different enough, we handle it in another routine */
1578 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1579 		mxge_encap_tso(sc, m, cnt);
1580 		return;
1581 	}
1582 
1583 	req = tx->req_list;
1584 	cksum_offset = 0;
1585 	pseudo_hdr_offset = 0;
1586 	flags = MXGEFW_FLAGS_NO_TSO;
1587 
1588 	/* checksum offloading? */
1589 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1590 		/* ensure ip header is in first mbuf, copy
1591 		   it to a scratch buffer if not */
1592 		if (__predict_false(m->m_len < sizeof (*eh)
1593 				    + sizeof (*ip))) {
1594 			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
1595 				   sc->scratch);
1596 			eh = (struct ether_header *)sc->scratch;
1597 		} else {
1598 			eh = mtod(m, struct ether_header *);
1599 		}
1600 		ip = (struct ip *) (eh + 1);
1601 		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
1602 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1603 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1604 		req->cksum_offset = cksum_offset;
1605 		flags |= MXGEFW_FLAGS_CKSUM;
1606 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1607 	} else {
1608 		odd_flag = 0;
1609 	}
1610 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1611 		flags |= MXGEFW_FLAGS_SMALL;
1612 
1613 	/* convert segments into a request list */
1614 	cum_len = 0;
1615 	seg = tx->seg_list;
1616 	req->flags = MXGEFW_FLAGS_FIRST;
1617 	for (i = 0; i < cnt; i++) {
1618 		req->addr_low =
1619 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1620 		req->addr_high =
1621 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1622 		req->length = htobe16(seg->ds_len);
1623 		req->cksum_offset = cksum_offset;
1624 		if (cksum_offset > seg->ds_len)
1625 			cksum_offset -= seg->ds_len;
1626 		else
1627 			cksum_offset = 0;
1628 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1629 		req->pad = 0; /* complete solid 16-byte block */
1630 		req->rdma_count = 1;
1631 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1632 		cum_len += seg->ds_len;
1633 		seg++;
1634 		req++;
1635 		req->flags = 0;
1636 	}
1637 	req--;
1638 	/* pad runts to 60 bytes */
1639 	if (cum_len < 60) {
1640 		req++;
1641 		req->addr_low =
1642 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
1643 		req->addr_high =
1644 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
1645 		req->length = htobe16(60 - cum_len);
1646 		req->cksum_offset = 0;
1647 		req->pseudo_hdr_offset = pseudo_hdr_offset;
1648 		req->pad = 0; /* complete solid 16-byte block */
1649 		req->rdma_count = 1;
1650 		req->flags |= flags | ((cum_len & 1) * odd_flag);
1651 		cnt++;
1652 	}
1653 
1654 	tx->req_list[0].rdma_count = cnt;
1655 #if 0
1656 	/* print what the firmware will see */
1657 	for (i = 0; i < cnt; i++) {
1658 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1659 		    "cso:%d, flags:0x%x, rdma:%d\n",
1660 		    i, (int)ntohl(tx->req_list[i].addr_high),
1661 		    (int)ntohl(tx->req_list[i].addr_low),
1662 		    (int)ntohs(tx->req_list[i].length),
1663 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1664 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1665 		    tx->req_list[i].rdma_count);
1666 	}
1667 	printf("--------------\n");
1668 #endif
1669 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1670 	if (tx->wc_fifo == NULL)
1671 		mxge_submit_req(tx, tx->req_list, cnt);
1672 	else
1673 		mxge_submit_req_wc(tx, tx->req_list, cnt);
1674 	return;
1675 
1676 drop:
1677 	m_freem(m);
1678 	ifp->if_oerrors++;
1679 	return;
1680 }
1681 
1682 
1683 
1684 
1685 static inline void
1686 mxge_start_locked(mxge_softc_t *sc)
1687 {
1688 	struct mbuf *m;
1689 	struct ifnet *ifp;
1690 
1691 	ifp = sc->ifp;
1692 	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1693 	       > MXGE_MAX_SEND_DESC) {
1694 
1695 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1696 		if (m == NULL) {
1697 			return;
1698 		}
1699 		/* let BPF see it */
1700 		BPF_MTAP(ifp, m);
1701 
1702 		/* give it to the nic */
1703 		mxge_encap(sc, m);
1704 	}
1705 	/* ran out of transmit slots */
1706 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1707 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1708 		sc->tx.stall++;
1709 	}
1710 }
1711 
1712 static void
1713 mxge_start(struct ifnet *ifp)
1714 {
1715 	mxge_softc_t *sc = ifp->if_softc;
1716 
1717 
1718 	mtx_lock(&sc->tx_mtx);
1719 	mxge_start_locked(sc);
1720 	mtx_unlock(&sc->tx_mtx);
1721 }
1722 
1723 /*
1724  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1725  * at most 32 bytes at a time, so as to avoid involving the software
1726  * pio handler in the nic.   We re-write the first segment's low
1727  * DMA address to mark it valid only after we write the entire chunk
1728  * in a burst
1729  */
1730 static inline void
1731 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1732 		mcp_kreq_ether_recv_t *src)
1733 {
1734 	uint32_t low;
1735 
1736 	low = src->addr_low;
1737 	src->addr_low = 0xffffffff;
1738 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
1739 	mb();
1740 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
1741 	mb();
1742 	src->addr_low = low;
1743 	dst->addr_low = low;
1744 	mb();
1745 }
1746 
1747 static int
1748 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1749 {
1750 	bus_dma_segment_t seg;
1751 	struct mbuf *m;
1752 	mxge_rx_buf_t *rx = &sc->rx_small;
1753 	int cnt, err;
1754 
1755 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1756 	if (m == NULL) {
1757 		rx->alloc_fail++;
1758 		err = ENOBUFS;
1759 		goto done;
1760 	}
1761 	m->m_len = MHLEN;
1762 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1763 				      &seg, &cnt, BUS_DMA_NOWAIT);
1764 	if (err != 0) {
1765 		m_free(m);
1766 		goto done;
1767 	}
1768 	rx->info[idx].m = m;
1769 	rx->shadow[idx].addr_low =
1770 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1771 	rx->shadow[idx].addr_high =
1772 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1773 
1774 done:
1775 	if ((idx & 7) == 7) {
1776 		if (rx->wc_fifo == NULL)
1777 			mxge_submit_8rx(&rx->lanai[idx - 7],
1778 					&rx->shadow[idx - 7]);
1779 		else {
1780 			mb();
1781 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1782 		}
1783         }
1784 	return err;
1785 }
1786 
1787 static int
1788 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1789 {
1790 	bus_dma_segment_t seg;
1791 	struct mbuf *m;
1792 	mxge_rx_buf_t *rx = &sc->rx_big;
1793 	int cnt, err;
1794 
1795 	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1796 	if (m == NULL) {
1797 		rx->alloc_fail++;
1798 		err = ENOBUFS;
1799 		goto done;
1800 	}
1801 	m->m_len = sc->big_bytes;
1802 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1803 				      &seg, &cnt, BUS_DMA_NOWAIT);
1804 	if (err != 0) {
1805 		m_free(m);
1806 		goto done;
1807 	}
1808 	rx->info[idx].m = m;
1809 	rx->shadow[idx].addr_low =
1810 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1811 	rx->shadow[idx].addr_high =
1812 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1813 
1814 done:
1815 	if ((idx & 7) == 7) {
1816 		if (rx->wc_fifo == NULL)
1817 			mxge_submit_8rx(&rx->lanai[idx - 7],
1818 					&rx->shadow[idx - 7]);
1819 		else {
1820 			mb();
1821 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1822 		}
1823 	}
1824 	return err;
1825 }
1826 
1827 static inline void
1828 mxge_rx_csum(struct mbuf *m, int csum)
1829 {
1830 	struct ether_header *eh;
1831 	struct ip *ip;
1832 
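	/* the NIC supplies a partial checksum for the frame; setting
	   CSUM_DATA_VALID (without CSUM_PSEUDO_HDR) tells the stack to
	   finish verifying the TCP/UDP checksum using csum_data */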
1833 	eh = mtod(m, struct ether_header *);
1834 	if (__predict_true(eh->ether_type == htons(ETHERTYPE_IP))) {
1835 		ip = (struct ip *)(eh + 1);
1836 		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1837 				   ip->ip_p == IPPROTO_UDP)) {
1838 			m->m_pkthdr.csum_data = csum;
1839 			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1840 		}
1841 	}
1842 }
1843 
1844 static inline void
1845 mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
1846 {
1847 	struct ifnet *ifp;
1848 	struct mbuf *m = 0; 		/* -Wuninitialized */
1849 	struct mbuf *m_prev = 0;	/* -Wuninitialized */
1850 	struct mbuf *m_head = 0;
1851 	bus_dmamap_t old_map;
1852 	mxge_rx_buf_t *rx;
1853 	int idx;
1854 
1855 
1856 	rx = &sc->rx_big;
1857 	ifp = sc->ifp;
1858 	while (len > 0) {
1859 		idx = rx->cnt & rx->mask;
1860 		rx->cnt++;
1861 		/* save a pointer to the received mbuf */
1862 		m = rx->info[idx].m;
1863 		/* try to replace the received mbuf */
1864 		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
1865 			goto drop;
1866 		}
1867 		/* unmap the received buffer */
1868 		old_map = rx->info[idx].map;
1869 		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1870 		bus_dmamap_unload(rx->dmat, old_map);
1871 
1872 		/* swap the bus_dmamap_t's */
1873 		rx->info[idx].map = rx->extra_map;
1874 		rx->extra_map = old_map;
1875 
1876 		/* chain multiple segments together */
1877 		if (!m_head) {
1878 			m_head = m;
1879 			/* mcp implicitly skips 1st 2 bytes so that
1880 			 * the packet is properly aligned */
1881 			m->m_data += MXGEFW_PAD;
1882 			m->m_pkthdr.len = len;
1883 			m->m_len = sc->big_bytes - MXGEFW_PAD;
1884 		} else {
1885 			m->m_len = sc->big_bytes;
1886 			m->m_flags &= ~M_PKTHDR;
1887 			m_prev->m_next = m;
1888 		}
1889 		len -= m->m_len;
1890 		m_prev = m;
1891 	}
1892 
1893 	/* trim trailing garbage from the last mbuf in the chain.  If
1894 	 * there is any garbage, len will be negative */
1895 	m->m_len += len;
1896 
1897 	/* if the checksum is valid, mark it in the mbuf header */
1898 	if (sc->csum_flag)
1899 		mxge_rx_csum(m_head, csum);
1900 
1901 	/* pass the frame up the stack */
1902 	m_head->m_pkthdr.rcvif = ifp;
1903 	ifp->if_ipackets++;
1904 	(*ifp->if_input)(ifp, m_head);
1905 	return;
1906 
1907 drop:
1908 	/* drop the frame -- the old mbuf(s) are re-cycled by running
1909 	   every slot through the allocator */
1910 	if (m_head) {
1911 		len -= sc->big_bytes;
1912 		m_freem(m_head);
1913 	} else {
1914 		len -= (sc->big_bytes + MXGEFW_PAD);
1915 	}
1916 	while ((int)len > 0) {
1917 		idx = rx->cnt & rx->mask;
1918 		rx->cnt++;
1919 		m = rx->info[idx].m;
1920 		if (mxge_get_buf_big(sc, rx->extra_map, idx) == 0) {
1921 			m_freem(m);
1922 			/* unmap the received buffer */
1923 			old_map = rx->info[idx].map;
1924 			bus_dmamap_sync(rx->dmat, old_map,
1925 					BUS_DMASYNC_POSTREAD);
1926 			bus_dmamap_unload(rx->dmat, old_map);
1927 
1928 			/* swap the bus_dmamap_t's */
1929 			rx->info[idx].map = rx->extra_map;
1930 			rx->extra_map = old_map;
1931 		}
1932 		len -= sc->big_bytes;
1933 	}
1934 
1935 	ifp->if_ierrors++;
1936 
1937 }
1938 
1939 static inline void
1940 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
1941 {
1942 	struct ifnet *ifp;
1943 	struct mbuf *m;
1944 	mxge_rx_buf_t *rx;
1945 	bus_dmamap_t old_map;
1946 	int idx;
1947 
1948 	ifp = sc->ifp;
1949 	rx = &sc->rx_small;
1950 	idx = rx->cnt & rx->mask;
1951 	rx->cnt++;
1952 	/* save a pointer to the received mbuf */
1953 	m = rx->info[idx].m;
1954 	/* try to replace the received mbuf */
1955 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
1956 		/* drop the frame -- the old mbuf is re-cycled */
1957 		ifp->if_ierrors++;
1958 		return;
1959 	}
1960 
1961 	/* unmap the received buffer */
1962 	old_map = rx->info[idx].map;
1963 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1964 	bus_dmamap_unload(rx->dmat, old_map);
1965 
1966 	/* swap the bus_dmamap_t's */
1967 	rx->info[idx].map = rx->extra_map;
1968 	rx->extra_map = old_map;
1969 
1970 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
1971 	 * aligned */
1972 	m->m_data += MXGEFW_PAD;
1973 
1974 	/* if the checksum is valid, mark it in the mbuf header */
1975 	if (sc->csum_flag)
1976 		mxge_rx_csum(m, csum);
1977 
1978 	/* pass the frame up the stack */
1979 	m->m_pkthdr.rcvif = ifp;
1980 	m->m_len = m->m_pkthdr.len = len;
1981 	ifp->if_ipackets++;
1982 	(*ifp->if_input)(ifp, m);
1983 }
1984 
1985 static inline void
1986 mxge_clean_rx_done(mxge_softc_t *sc)
1987 {
1988 	mxge_rx_done_t *rx_done = &sc->rx_done;
1989 	int limit = 0;
1990 	uint16_t length;
1991 	uint16_t checksum;
1992 
1993 
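	/* a non-zero length marks a valid completion; zero it so the
	   slot reads as empty until the firmware DMAs a new entry, and
	   wrap idx with the power-of-two ring size */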
1994 	while (rx_done->entry[rx_done->idx].length != 0) {
1995 		length = ntohs(rx_done->entry[rx_done->idx].length);
1996 		rx_done->entry[rx_done->idx].length = 0;
1997 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
1998 		if (length <= (MHLEN - MXGEFW_PAD))
1999 			mxge_rx_done_small(sc, length, checksum);
2000 		else
2001 			mxge_rx_done_big(sc, length, checksum);
2002 		rx_done->cnt++;
2003 		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
2004 
2005 		/* limit potential for livelock */
2006 		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
2007 			break;
2008 
2009 	}
2010 }
2011 
2012 
2013 static inline void
2014 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2015 {
2016 	struct ifnet *ifp;
2017 	mxge_tx_buf_t *tx;
2018 	struct mbuf *m;
2019 	bus_dmamap_t map;
2020 	int idx, limit;
2021 
2022 	limit = 0;
2023 	tx = &sc->tx;
2024 	ifp = sc->ifp;
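	/* only the last descriptor of a packet has info[].flag set
	   (see mxge_encap), so pkt_done advances once per packet while
	   tx->done advances once per descriptor */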
2025 	while (tx->pkt_done != mcp_idx) {
2026 		idx = tx->done & tx->mask;
2027 		tx->done++;
2028 		m = tx->info[idx].m;
2029 		/* mbuf and DMA map only attached to the first
2030 		   segment per-mbuf */
2031 		if (m != NULL) {
2032 			ifp->if_opackets++;
2033 			tx->info[idx].m = NULL;
2034 			map = tx->info[idx].map;
2035 			bus_dmamap_unload(tx->dmat, map);
2036 			m_freem(m);
2037 		}
2038 		if (tx->info[idx].flag) {
2039 			tx->info[idx].flag = 0;
2040 			tx->pkt_done++;
2041 		}
2042 		/* limit potential for livelock by only handling
2043 		   2 full tx rings per call */
2044 		if (__predict_false(++limit >  2 * tx->mask))
2045 			break;
2046 	}
2047 
2048 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack
2049 	   that it's OK to send packets */
2050 
2051 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2052 	    tx->req - tx->done < (tx->mask + 1)/4) {
2053 		mtx_lock(&sc->tx_mtx);
2054 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2055 		sc->tx.wake++;
2056 		mxge_start_locked(sc);
2057 		mtx_unlock(&sc->tx_mtx);
2058 	}
2059 }
2060 
2061 static void
2062 mxge_intr(void *arg)
2063 {
2064 	mxge_softc_t *sc = arg;
2065 	mcp_irq_data_t *stats = sc->fw_stats;
2066 	mxge_tx_buf_t *tx = &sc->tx;
2067 	mxge_rx_done_t *rx_done = &sc->rx_done;
2068 	uint32_t send_done_count;
2069 	uint8_t valid;
2070 
2071 
2072 	/* make sure the DMA has finished */
2073 	if (!stats->valid) {
2074 		return;
2075 	}
2076 	valid = stats->valid;
2077 
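	/* MSI is edge-like, so ack the event by clearing 'valid' now;
	   for legacy INTx, write the deassert register and, if
	   mxge_deassert_wait is set, leave 'valid' set until the
	   firmware confirms the line is low */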
2078 	if (!sc->msi_enabled) {
2079 		/* lower legacy IRQ  */
2080 		*sc->irq_deassert = 0;
2081 		if (!mxge_deassert_wait)
2082 			/* don't wait for conf. that irq is low */
2083 			stats->valid = 0;
2084 	} else {
2085 		stats->valid = 0;
2086 	}
2087 
2088 	/* loop while waiting for legacy irq deassertion */
2089 	do {
2090 		/* check for transmit completes and receives */
2091 		send_done_count = be32toh(stats->send_done_count);
2092 		while ((send_done_count != tx->pkt_done) ||
2093 		       (rx_done->entry[rx_done->idx].length != 0)) {
2094 			mxge_tx_done(sc, (int)send_done_count);
2095 			mxge_clean_rx_done(sc);
2096 			send_done_count = be32toh(stats->send_done_count);
2097 		}
2098 	} while (*((volatile uint8_t *) &stats->valid));
2099 
2100 	if (__predict_false(stats->stats_updated)) {
2101 		if (sc->link_state != stats->link_up) {
2102 			sc->link_state = stats->link_up;
2103 			if (sc->link_state) {
2104 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2105 				if (mxge_verbose)
2106 					device_printf(sc->dev, "link up\n");
2107 			} else {
2108 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2109 				if (mxge_verbose)
2110 					device_printf(sc->dev, "link down\n");
2111 			}
2112 		}
2113 		if (sc->rdma_tags_available !=
2114 		    be32toh(sc->fw_stats->rdma_tags_available)) {
2115 			sc->rdma_tags_available =
2116 				be32toh(sc->fw_stats->rdma_tags_available);
2117 			device_printf(sc->dev, "RDMA timed out! %d tags "
2118 				      "left\n", sc->rdma_tags_available);
2119 		}
2120 		sc->down_cnt += stats->link_down;
2121 	}
2122 
2123 	/* check to see if we have rx token to pass back */
2124 	if (valid & 0x1)
2125 	    *sc->irq_claim = be32toh(3);
2126 	*(sc->irq_claim + 1) = be32toh(3);
2127 }
2128 
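/*
 * if_init stub: the hardware is actually brought up by mxge_open(),
 * called from the SIOCSIFFLAGS ioctl handler.
 */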
2129 static void
2130 mxge_init(void *arg)
2131 {
2132 }
2133 
2134 
2135 
2136 static void
2137 mxge_free_mbufs(mxge_softc_t *sc)
2138 {
2139 	int i;
2140 
2141 	for (i = 0; i <= sc->rx_small.mask; i++) {
2142 		if (sc->rx_small.info[i].m == NULL)
2143 			continue;
2144 		bus_dmamap_unload(sc->rx_small.dmat,
2145 				  sc->rx_small.info[i].map);
2146 		m_freem(sc->rx_small.info[i].m);
2147 		sc->rx_small.info[i].m = NULL;
2148 	}
2149 
2150 	for (i = 0; i <= sc->rx_big.mask; i++) {
2151 		if (sc->rx_big.info[i].m == NULL)
2152 			continue;
2153 		bus_dmamap_unload(sc->rx_big.dmat,
2154 				  sc->rx_big.info[i].map);
2155 		m_freem(sc->rx_big.info[i].m);
2156 		sc->rx_big.info[i].m = NULL;
2157 	}
2158 
2159 	for (i = 0; i <= sc->tx.mask; i++) {
2160 		sc->tx.info[i].flag = 0;
2161 		if (sc->tx.info[i].m == NULL)
2162 			continue;
2163 		bus_dmamap_unload(sc->tx.dmat,
2164 				  sc->tx.info[i].map);
2165 		m_freem(sc->tx.info[i].m);
2166 		sc->tx.info[i].m = NULL;
2167 	}
2168 }
2169 
2170 static void
2171 mxge_free_rings(mxge_softc_t *sc)
2172 {
2173 	int i;
2174 
2175 	if (sc->tx.req_bytes != NULL)
2176 		free(sc->tx.req_bytes, M_DEVBUF);
2177 	if (sc->tx.seg_list != NULL)
2178 		free(sc->tx.seg_list, M_DEVBUF);
2179 	if (sc->rx_small.shadow != NULL)
2180 		free(sc->rx_small.shadow, M_DEVBUF);
2181 	if (sc->rx_big.shadow != NULL)
2182 		free(sc->rx_big.shadow, M_DEVBUF);
2183 	if (sc->tx.info != NULL) {
2184 		if (sc->tx.dmat != NULL) {
2185 			for (i = 0; i <= sc->tx.mask; i++) {
2186 				bus_dmamap_destroy(sc->tx.dmat,
2187 						   sc->tx.info[i].map);
2188 			}
2189 			bus_dma_tag_destroy(sc->tx.dmat);
2190 		}
2191 		free(sc->tx.info, M_DEVBUF);
2192 	}
2193 	if (sc->rx_small.info != NULL) {
2194 		if (sc->rx_small.dmat != NULL) {
2195 			for (i = 0; i <= sc->rx_small.mask; i++) {
2196 				bus_dmamap_destroy(sc->rx_small.dmat,
2197 						   sc->rx_small.info[i].map);
2198 			}
2199 			bus_dmamap_destroy(sc->rx_small.dmat,
2200 					   sc->rx_small.extra_map);
2201 			bus_dma_tag_destroy(sc->rx_small.dmat);
2202 		}
2203 		free(sc->rx_small.info, M_DEVBUF);
2204 	}
2205 	if (sc->rx_big.info != NULL) {
2206 		if (sc->rx_big.dmat != NULL) {
2207 			for (i = 0; i <= sc->rx_big.mask; i++) {
2208 				bus_dmamap_destroy(sc->rx_big.dmat,
2209 						   sc->rx_big.info[i].map);
2210 			}
2211 			bus_dmamap_destroy(sc->rx_big.dmat,
2212 					   sc->rx_big.extra_map);
2213 			bus_dma_tag_destroy(sc->rx_big.dmat);
2214 		}
2215 		free(sc->rx_big.info, M_DEVBUF);
2216 	}
2217 }
2218 
2219 static int
2220 mxge_alloc_rings(mxge_softc_t *sc)
2221 {
2222 	mxge_cmd_t cmd;
2223 	int tx_ring_size, rx_ring_size;
2224 	int tx_ring_entries, rx_ring_entries;
2225 	int i, err;
2226 	unsigned long bytes;
2227 
2228 	/* get ring sizes */
2229 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
2230 	tx_ring_size = cmd.data0;
2231 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
2232 	if (err != 0) {
2233 		device_printf(sc->dev, "Cannot determine ring sizes\n");
2234 		goto abort_with_nothing;
2235 	}
2236 
2237 	rx_ring_size = cmd.data0;
2238 
2239 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
2240 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
2241 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
2242 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
2243 	IFQ_SET_READY(&sc->ifp->if_snd);
2244 
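	/* ring sizes from the firmware are powers of two, so
	   (entries - 1) doubles as an index mask for cheap wrapping */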
2245 	sc->tx.mask = tx_ring_entries - 1;
2246 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
2247 
2248 	err = ENOMEM;
2249 
2250 	/* allocate the tx request copy block */
2251 	bytes = 8 +
2252 		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
2253 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2254 	if (sc->tx.req_bytes == NULL)
2255 		goto abort_with_nothing;
2256 	/* ensure req_list entries are aligned to 8 bytes */
2257 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
2258 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
2259 
2260 	/* allocate the tx busdma segment list */
2261 	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
2262 	sc->tx.seg_list = (bus_dma_segment_t *)
2263 		malloc(bytes, M_DEVBUF, M_WAITOK);
2264 	if (sc->tx.seg_list == NULL)
2265 		goto abort_with_alloc;
2266 
2267 	/* allocate the rx shadow rings */
2268 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
2269 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2270 	if (sc->rx_small.shadow == NULL)
2271 		goto abort_with_alloc;
2272 
2273 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
2274 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2275 	if (sc->rx_big.shadow == NULL)
2276 		goto abort_with_alloc;
2277 
2278 	/* allocate the host info rings */
2279 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
2280 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2281 	if (sc->tx.info == NULL)
2282 		goto abort_with_alloc;
2283 
2284 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
2285 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2286 	if (sc->rx_small.info == NULL)
2287 		goto abort_with_alloc;
2288 
2289 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
2290 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2291 	if (sc->rx_big.info == NULL)
2292 		goto abort_with_alloc;
2293 
2294 	/* allocate the busdma resources */
2295 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2296 				 1,			/* alignment */
2297 				 sc->tx.boundary,	/* boundary */
2298 				 BUS_SPACE_MAXADDR,	/* low */
2299 				 BUS_SPACE_MAXADDR,	/* high */
2300 				 NULL, NULL,		/* filter */
2301 				 65536 + 256,		/* maxsize */
2302 				 MXGE_MAX_SEND_DESC/2,	/* num segs */
2303 				 sc->tx.boundary,	/* maxsegsize */
2304 				 BUS_DMA_ALLOCNOW,	/* flags */
2305 				 NULL, NULL,		/* lock */
2306 				 &sc->tx.dmat);		/* tag */
2307 
2308 	if (err != 0) {
2309 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
2310 			      err);
2311 		goto abort_with_alloc;
2312 	}
2313 
2314 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2315 				 1,			/* alignment */
2316 				 4096,			/* boundary */
2317 				 BUS_SPACE_MAXADDR,	/* low */
2318 				 BUS_SPACE_MAXADDR,	/* high */
2319 				 NULL, NULL,		/* filter */
2320 				 MHLEN,			/* maxsize */
2321 				 1,			/* num segs */
2322 				 MHLEN,			/* maxsegsize */
2323 				 BUS_DMA_ALLOCNOW,	/* flags */
2324 				 NULL, NULL,		/* lock */
2325 				 &sc->rx_small.dmat);	/* tag */
2326 	if (err != 0) {
2327 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2328 			      err);
2329 		goto abort_with_alloc;
2330 	}
2331 
2332 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2333 				 1,			/* alignment */
2334 				 4096,			/* boundary */
2335 				 BUS_SPACE_MAXADDR,	/* low */
2336 				 BUS_SPACE_MAXADDR,	/* high */
2337 				 NULL, NULL,		/* filter */
2338 				 4096,			/* maxsize */
2339 				 1,			/* num segs */
2340 				 4096,			/* maxsegsize */
2341 				 BUS_DMA_ALLOCNOW,	/* flags */
2342 				 NULL, NULL,		/* lock */
2343 				 &sc->rx_big.dmat);	/* tag */
2344 	if (err != 0) {
2345 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2346 			      err);
2347 		goto abort_with_alloc;
2348 	}
2349 
2350 	/* now use these tags to setup dmamaps for each slot
2351 	   in each ring */
2352 	for (i = 0; i <= sc->tx.mask; i++) {
2353 		err = bus_dmamap_create(sc->tx.dmat, 0,
2354 					&sc->tx.info[i].map);
2355 		if (err != 0) {
2356 			device_printf(sc->dev, "Err %d  tx dmamap\n",
2357 			      err);
2358 			goto abort_with_alloc;
2359 		}
2360 	}
2361 	for (i = 0; i <= sc->rx_small.mask; i++) {
2362 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2363 					&sc->rx_small.info[i].map);
2364 		if (err != 0) {
2365 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2366 				      err);
2367 			goto abort_with_alloc;
2368 		}
2369 	}
2370 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2371 				&sc->rx_small.extra_map);
2372 	if (err != 0) {
2373 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2374 			      err);
2375 		goto abort_with_alloc;
2376 	}
2377 
2378 	for (i = 0; i <= sc->rx_big.mask; i++) {
2379 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2380 					&sc->rx_big.info[i].map);
2381 		if (err != 0) {
2382 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2383 			      err);
2384 			goto abort_with_alloc;
2385 		}
2386 	}
2387 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2388 				&sc->rx_big.extra_map);
2389 	if (err != 0) {
2390 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2391 			      err);
2392 		goto abort_with_alloc;
2393 	}
2394 	return 0;
2395 
2396 abort_with_alloc:
2397 	mxge_free_rings(sc);
2398 
2399 abort_with_nothing:
2400 	return err;
2401 }
2402 
2403 static int
2404 mxge_open(mxge_softc_t *sc)
2405 {
2406 	mxge_cmd_t cmd;
2407 	int i, err;
2408 	bus_dmamap_t map;
2409 	bus_addr_t bus;
2410 
2411 
2412 	/* Copy the MAC address in case it was overridden */
2413 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
2414 
2415 	err = mxge_reset(sc);
2416 	if (err != 0) {
2417 		device_printf(sc->dev, "failed to reset\n");
2418 		return EIO;
2419 	}
2420 	bzero(sc->rx_done.entry,
2421 	      mxge_max_intr_slots * sizeof(*sc->rx_done.entry));
2422 
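	/* pick the smallest power-of-two cluster that holds an
	   MTU-sized frame: a 2KB MCLBYTES cluster if it fits, else a
	   page-sized jumbo cluster (the firmware requires a
	   power-of-two big buffer size) */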
2423 	if (MCLBYTES >=
2424 	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
2425 		sc->big_bytes = MCLBYTES;
2426 	else
2427 		sc->big_bytes = MJUMPAGESIZE;
2428 
2429 
2430 	/* get the lanai pointers to the send and receive rings */
2431 
2432 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2433 	sc->tx.lanai =
2434 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2435 	err |= mxge_send_cmd(sc,
2436 				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2437 	sc->rx_small.lanai =
2438 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2439 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2440 	sc->rx_big.lanai =
2441 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2442 
2443 	if (err != 0) {
2444 		device_printf(sc->dev,
2445 			      "failed to get ring sizes or locations\n");
2446 		return EIO;
2447 	}
2448 
2449 	if (sc->wc) {
2450 		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
2451 		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
2452 		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
2453 	} else {
2454 		sc->tx.wc_fifo = 0;
2455 		sc->rx_small.wc_fifo = 0;
2456 		sc->rx_big.wc_fifo = 0;
2457 	}
2458 
2459 
2460 	/* stock receive rings */
2461 	for (i = 0; i <= sc->rx_small.mask; i++) {
2462 		map = sc->rx_small.info[i].map;
2463 		err = mxge_get_buf_small(sc, map, i);
2464 		if (err) {
2465 			device_printf(sc->dev, "allocated %d/%d smalls\n",
2466 				      i, sc->rx_small.mask + 1);
2467 			goto abort;
2468 		}
2469 	}
2470 	for (i = 0; i <= sc->rx_big.mask; i++) {
2471 		map = sc->rx_big.info[i].map;
2472 		err = mxge_get_buf_big(sc, map, i);
2473 		if (err) {
2474 			device_printf(sc->dev, "allocated %d/%d bigs\n",
2475 				      i, sc->rx_big.mask + 1);
2476 			goto abort;
2477 		}
2478 	}
2479 
2480 	/* Give the firmware the mtu and the big and small buffer
2481 	   sizes.  The firmware wants the big buf size to be a power
2482 	   of two. Luckily, FreeBSD's clusters are powers of two */
2483 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
2484 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2485 	cmd.data0 = MHLEN - MXGEFW_PAD;
2486 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2487 			     &cmd);
2488 	cmd.data0 = sc->big_bytes;
2489 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2490 
2491 	if (err != 0) {
2492 		device_printf(sc->dev, "failed to setup params\n");
2493 		goto abort;
2494 	}
2495 
2496 	/* Now give him the pointer to the stats block */
2497 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2498 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2499 	cmd.data2 = sizeof(struct mcp_irq_data);
2500 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
2501 
2502 	if (err != 0) {
2503 		bus = sc->fw_stats_dma.bus_addr;
2504 		bus += offsetof(struct mcp_irq_data, send_done_count);
2505 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
2506 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
2507 		err = mxge_send_cmd(sc,
2508 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
2509 				    &cmd);
2510 		/* Firmware cannot support multicast without STATS_DMA_V2 */
2511 		sc->fw_multicast_support = 0;
2512 	} else {
2513 		sc->fw_multicast_support = 1;
2514 	}
2515 
2516 	if (err != 0) {
2517 		device_printf(sc->dev, "failed to set up the stats DMA\n");
2518 		goto abort;
2519 	}
2520 
2521 	/* Finally, start the firmware running */
2522 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2523 	if (err) {
2524 		device_printf(sc->dev, "Couldn't bring up link\n");
2525 		goto abort;
2526 	}
2527 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2528 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2529 
2530 	return 0;
2531 
2532 
2533 abort:
2534 	mxge_free_mbufs(sc);
2535 
2536 	return err;
2537 }
2538 
2539 static int
2540 mxge_close(mxge_softc_t *sc)
2541 {
2542 	mxge_cmd_t cmd;
2543 	int err, old_down_cnt;
2544 
2545 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2546 	old_down_cnt = sc->down_cnt;
2547 	mb();
2548 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2549 	if (err) {
2550 		device_printf(sc->dev, "Couldn't bring down link\n");
2551 	}
2552 	if (old_down_cnt == sc->down_cnt) {
2553 		/* wait for down irq */
2554 		DELAY(10 * sc->intr_coal_delay);
2555 	}
2556 	if (old_down_cnt == sc->down_cnt) {
2557 		device_printf(sc->dev, "never got down irq\n");
2558 	}
2559 
2560 	mxge_free_mbufs(sc);
2561 
2562 	return 0;
2563 }
2564 
2565 static void
2566 mxge_setup_cfg_space(mxge_softc_t *sc)
2567 {
2568 	device_t dev = sc->dev;
2569 	int reg;
2570 	uint16_t cmd, lnk, pectl;
2571 
2572 	/* find the PCIe link width and set max read request to 4KB */
2573 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
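		/* Link Status lives at offset 0x12 of the PCIe
		   capability; bits 9:4 hold the negotiated width.
		   Device Control at offset 0x8 encodes max read
		   request in bits 14:12, where 5 selects 4096 bytes. */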
2574 		lnk = pci_read_config(dev, reg + 0x12, 2);
2575 		sc->link_width = (lnk >> 4) & 0x3f;
2576 
2577 		pectl = pci_read_config(dev, reg + 0x8, 2);
2578 		pectl = (pectl & ~0x7000) | (5 << 12);
2579 		pci_write_config(dev, reg + 0x8, pectl, 2);
2580 	}
2581 
2582 	/* Enable DMA and Memory space access */
2583 	pci_enable_busmaster(dev);
2584 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2585 	cmd |= PCIM_CMD_MEMEN;
2586 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2587 }
2588 
2589 static uint32_t
2590 mxge_read_reboot(mxge_softc_t *sc)
2591 {
2592 	device_t dev = sc->dev;
2593 	uint32_t vs;
2594 
2595 	/* find the vendor specific offset */
2596 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
2597 		device_printf(sc->dev,
2598 			      "could not find vendor specific offset\n");
2599 		return (uint32_t)-1;
2600 	}
2601 	/* enable read32 mode */
2602 	pci_write_config(dev, vs + 0x10, 0x3, 1);
2603 	/* tell NIC which register to read */
2604 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
2605 	return (pci_read_config(dev, vs + 0x14, 4));
2606 }
2607 
2608 static void
2609 mxge_watchdog_reset(mxge_softc_t *sc)
2610 {
2611 	int err;
2612 	uint32_t reboot;
2613 	uint16_t cmd;
2614 
2615 	err = ENXIO;
2616 
2617 	device_printf(sc->dev, "Watchdog reset!\n");
2618 
2619 	/*
2620 	 * check to see if the NIC rebooted.  If it did, then all of
2621 	 * PCI config space has been reset, and things like the
2622 	 * busmaster bit will be zero.  If this is the case, then we
2623 	 * must restore PCI config space before the NIC can be used
2624 	 * again
2625 	 */
2626 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2627 	if (cmd == 0xffff) {
2628 		/*
2629 		 * maybe the watchdog caught the NIC rebooting; wait
2630 		 * up to 100ms for it to finish.  If it does not come
2631 		 * back, then give up
2632 		 */
2633 		DELAY(1000*100);
2634 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
2635 		if (cmd == 0xffff) {
2636 			device_printf(sc->dev, "NIC disappeared!\n");
2637 			goto abort;
2638 		}
2639 	}
2640 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
2641 		/* print the reboot status */
2642 		reboot = mxge_read_reboot(sc);
2643 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
2644 			      reboot);
2645 		/* restore PCI configuration space */
2646 
2647 		/* XXXX waiting for pci_cfg_restore() to be exported */
2648 		goto abort; /* just abort for now */
2649 
2650 		/* and redo any changes we made to our config space */
2651 		mxge_setup_cfg_space(sc);
2652 	} else {
2653 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
2654 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
2655 			      sc->tx.req, sc->tx.done);
2656 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
2657 			      sc->tx.pkt_done,
2658 			      be32toh(sc->fw_stats->send_done_count));
2659 	}
2660 
2661 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
2662 		mxge_close(sc);
2663 		err = mxge_open(sc);
2664 	}
2665 
2666 abort:
2667 	/*
2668 	 * stop the watchdog if the nic is dead, to avoid spamming the
2669 	 * console
2670 	 */
2671 	if (err != 0) {
2672 		callout_stop(&sc->co_hdl);
2673 	}
2674 }
2675 
2676 static void
2677 mxge_watchdog(mxge_softc_t *sc)
2678 {
2679 	mxge_tx_buf_t *tx = &sc->tx;
2680 
2681 	/* see if we have transmits that were pending at the last
2682 	   tick and have made no completion progress since */
2683 	if (tx->req != tx->done &&
2684 	    tx->watchdog_req != tx->watchdog_done &&
2685 	    tx->done == tx->watchdog_done)
2686 		mxge_watchdog_reset(sc);
2687 
2688 	tx->watchdog_req = tx->req;
2689 	tx->watchdog_done = tx->done;
2690 }
2691 
2692 static void
2693 mxge_tick(void *arg)
2694 {
2695 	mxge_softc_t *sc = arg;
2696 
2697 
2698 	/* Synchronize with possible callout reset/stop. */
2699 	if (callout_pending(&sc->co_hdl) ||
2700 	    !callout_active(&sc->co_hdl)) {
2701 		mtx_unlock(&sc->driver_mtx);
2702 		return;
2703 	}
2704 
2705 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2706 	mxge_watchdog(sc);
2707 }
2708 
2709 static int
2710 mxge_media_change(struct ifnet *ifp)
2711 {
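	/* only IFM_AUTO is advertised; the 10GbE media is fixed */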
2712 	return EINVAL;
2713 }
2714 
2715 static int
2716 mxge_change_mtu(mxge_softc_t *sc, int mtu)
2717 {
2718 	struct ifnet *ifp = sc->ifp;
2719 	int real_mtu, old_mtu;
2720 	int err = 0;
2721 
2722 
2723 	real_mtu = mtu + ETHER_HDR_LEN;
2724 	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2725 	    real_mtu < 60)
2726 		return EINVAL;
2727 	mtx_lock(&sc->driver_mtx);
2728 	old_mtu = ifp->if_mtu;
2729 	ifp->if_mtu = mtu;
2730 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2731 		callout_stop(&sc->co_hdl);
2732 		mxge_close(sc);
2733 		err = mxge_open(sc);
2734 		if (err != 0) {
2735 			ifp->if_mtu = old_mtu;
2736 			mxge_close(sc);
2737 			(void) mxge_open(sc);
2738 		}
2739 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2740 	}
2741 	mtx_unlock(&sc->driver_mtx);
2742 	return err;
2743 }
2744 
2745 static void
2746 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2747 {
2748 	mxge_softc_t *sc = ifp->if_softc;
2749 
2750 
2751 	if (sc == NULL)
2752 		return;
2753 	ifmr->ifm_status = IFM_AVALID;
2754 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2755 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2756 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2757 }
2758 
2759 static int
2760 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2761 {
2762 	mxge_softc_t *sc = ifp->if_softc;
2763 	struct ifreq *ifr = (struct ifreq *)data;
2764 	int err, mask;
2765 
2766 	err = 0;
2767 	switch (command) {
2768 	case SIOCSIFADDR:
2769 	case SIOCGIFADDR:
2770 		err = ether_ioctl(ifp, command, data);
2771 		break;
2772 
2773 	case SIOCSIFMTU:
2774 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2775 		break;
2776 
2777 	case SIOCSIFFLAGS:
2778 		mtx_lock(&sc->driver_mtx);
2779 		if (ifp->if_flags & IFF_UP) {
2780 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
2781 				err = mxge_open(sc);
2782 				callout_reset(&sc->co_hdl, mxge_ticks,
2783 					      mxge_tick, sc);
2784 			} else {
2785 				/* take care of promisc and allmulti
2786 				   flag changes */
2787 				mxge_change_promisc(sc,
2788 						    ifp->if_flags & IFF_PROMISC);
2789 				mxge_set_multicast_list(sc);
2790 			}
2791 		} else {
2792 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2793 				mxge_close(sc);
2794 				callout_stop(&sc->co_hdl);
2795 			}
2796 		}
2797 		mtx_unlock(&sc->driver_mtx);
2798 		break;
2799 
2800 	case SIOCADDMULTI:
2801 	case SIOCDELMULTI:
2802 		mtx_lock(&sc->driver_mtx);
2803 		mxge_set_multicast_list(sc);
2804 		mtx_unlock(&sc->driver_mtx);
2805 		break;
2806 
2807 	case SIOCSIFCAP:
2808 		mtx_lock(&sc->driver_mtx);
2809 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2810 		if (mask & IFCAP_TXCSUM) {
2811 			if (IFCAP_TXCSUM & ifp->if_capenable) {
2812 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2813 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2814 						      | CSUM_TSO);
2815 			} else {
2816 				ifp->if_capenable |= IFCAP_TXCSUM;
2817 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2818 			}
2819 		} else if (mask & IFCAP_RXCSUM) {
2820 			if (IFCAP_RXCSUM & ifp->if_capenable) {
2821 				ifp->if_capenable &= ~IFCAP_RXCSUM;
2822 				sc->csum_flag = 0;
2823 			} else {
2824 				ifp->if_capenable |= IFCAP_RXCSUM;
2825 				sc->csum_flag = 1;
2826 			}
2827 		}
2828 		if (mask & IFCAP_TSO4) {
2829 			if (IFCAP_TSO4 & ifp->if_capenable) {
2830 				ifp->if_capenable &= ~IFCAP_TSO4;
2831 				ifp->if_hwassist &= ~CSUM_TSO;
2832 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2833 				ifp->if_capenable |= IFCAP_TSO4;
2834 				ifp->if_hwassist |= CSUM_TSO;
2835 			} else {
2836 				printf("mxge requires tx checksum offload"
2837 				       " be enabled to use TSO\n");
2838 				err = EINVAL;
2839 			}
2840 		}
2841 		mtx_unlock(&sc->driver_mtx);
2842 		break;
2843 
2844 	case SIOCGIFMEDIA:
2845 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2846 				    &sc->media, command);
2847 		break;
2848 
2849 	default:
2850 		err = ENOTTY;
2851 	}
2852 	return err;
2853 }
2854 
2855 static void
2856 mxge_fetch_tunables(mxge_softc_t *sc)
2857 {
2858 
2859 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2860 			  &mxge_flow_control);
2861 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2862 			  &mxge_intr_coal_delay);
2863 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2864 			  &mxge_nvidia_ecrc_enable);
2865 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
2866 			  &mxge_force_firmware);
2867 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2868 			  &mxge_deassert_wait);
2869 	TUNABLE_INT_FETCH("hw.mxge.verbose",
2870 			  &mxge_verbose);
2871 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
2872 
2873 	if (bootverbose)
2874 		mxge_verbose = 1;
2875 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2876 		mxge_intr_coal_delay = 30;
2877 	if (mxge_ticks == 0)
2878 		mxge_ticks = hz;
2879 	sc->pause = mxge_flow_control;
2880 }
2881 
2882 static int
2883 mxge_attach(device_t dev)
2884 {
2885 	mxge_softc_t *sc = device_get_softc(dev);
2886 	struct ifnet *ifp;
2887 	size_t bytes;
2888 	int count, rid, err;
2889 
2890 	sc->dev = dev;
2891 	mxge_fetch_tunables(sc);
2892 
2893 	err = bus_dma_tag_create(NULL,			/* parent */
2894 				 1,			/* alignment */
2895 				 4096,			/* boundary */
2896 				 BUS_SPACE_MAXADDR,	/* low */
2897 				 BUS_SPACE_MAXADDR,	/* high */
2898 				 NULL, NULL,		/* filter */
2899 				 65536 + 256,		/* maxsize */
2900 				 MXGE_MAX_SEND_DESC, 	/* num segs */
2901 				 4096,			/* maxsegsize */
2902 				 0,			/* flags */
2903 				 NULL, NULL,		/* lock */
2904 				 &sc->parent_dmat);	/* tag */
2905 
2906 	if (err != 0) {
2907 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
2908 			      err);
2909 		goto abort_with_nothing;
2910 	}
2911 
2912 	ifp = sc->ifp = if_alloc(IFT_ETHER);
2913 	if (ifp == NULL) {
2914 		device_printf(dev, "can not if_alloc()\n");
2915 		err = ENOSPC;
2916 		goto abort_with_parent_dmat;
2917 	}
2918 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
2919 		 device_get_nameunit(dev));
2920 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
2921 	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
2922 		 device_get_nameunit(dev));
2923 	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
2924 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
2925 		 "%s:drv", device_get_nameunit(dev));
2926 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
2927 		 MTX_NETWORK_LOCK, MTX_DEF);
2928 
2929 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
2930 
2931 	mxge_setup_cfg_space(sc);
2932 
2933 	/* Map the board into the kernel */
2934 	rid = PCIR_BARS;
2935 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
2936 					 ~0, 1, RF_ACTIVE);
2937 	if (sc->mem_res == NULL) {
2938 		device_printf(dev, "could not map memory\n");
2939 		err = ENXIO;
2940 		goto abort_with_lock;
2941 	}
2942 	sc->sram = rman_get_virtual(sc->mem_res);
2943 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
2944 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
2945 		device_printf(dev, "impossible memory region size %ld\n",
2946 			      rman_get_size(sc->mem_res));
2947 		err = ENXIO;
2948 		goto abort_with_mem_res;
2949 	}
2950 
2951 	/* make NULL terminated copy of the EEPROM strings section of
2952 	   lanai SRAM */
2953 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
2954 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
2955 				rman_get_bushandle(sc->mem_res),
2956 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
2957 				sc->eeprom_strings,
2958 				MXGE_EEPROM_STRINGS_SIZE - 2);
2959 	err = mxge_parse_strings(sc);
2960 	if (err != 0)
2961 		goto abort_with_mem_res;
2962 
2963 	/* Enable write combining for efficient use of PCIe bus */
2964 	mxge_enable_wc(sc);
2965 
2966 	/* Allocate the out of band dma memory */
2967 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
2968 			     sizeof (mxge_cmd_t), 64);
2969 	if (err != 0)
2970 		goto abort_with_mem_res;
2971 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
2972 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
2973 	if (err != 0)
2974 		goto abort_with_cmd_dma;
2975 
2976 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
2977 			     sizeof (*sc->fw_stats), 64);
2978 	if (err != 0)
2979 		goto abort_with_zeropad_dma;
2980 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
2981 
2982 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
2983 	if (err != 0)
2984 		goto abort_with_fw_stats;
2985 
2986 	/* allocate interrupt queues */
2987 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
2988 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2989 	if (err != 0)
2990 		goto abort_with_dmabench;
2991 	sc->rx_done.entry = sc->rx_done.dma.addr;
2992 	bzero(sc->rx_done.entry, bytes);
2993 
2994 	/* Add our ithread */
2995 	count = pci_msi_count(dev);
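	/* prefer a single MSI vector when available; resource id 1
	   selects the MSI interrupt, id 0 the legacy INTx line */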
2996 	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
2997 		rid = 1;
2998 		sc->msi_enabled = 1;
2999 	} else {
3000 		rid = 0;
3001 	}
3002 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
3003 					 1, RF_SHAREABLE | RF_ACTIVE);
3004 	if (sc->irq_res == NULL) {
3005 		device_printf(dev, "could not alloc interrupt\n");
3006 		goto abort_with_rx_done;
3007 	}
3008 	if (mxge_verbose)
3009 		device_printf(dev, "using %s irq %ld\n",
3010 			      sc->msi_enabled ? "MSI" : "INTx",
3011 			      rman_get_start(sc->irq_res));
3012 	/* load the firmware */
3013 	mxge_select_firmware(sc);
3014 
3015 	err = mxge_load_firmware(sc);
3016 	if (err != 0)
3017 		goto abort_with_irq_res;
3018 	sc->intr_coal_delay = mxge_intr_coal_delay;
3019 	err = mxge_reset(sc);
3020 	if (err != 0)
3021 		goto abort_with_irq_res;
3022 
3023 	err = mxge_alloc_rings(sc);
3024 	if (err != 0) {
3025 		device_printf(sc->dev, "failed to allocate rings\n");
3026 		goto abort_with_irq_res;
3027 	}
3028 
3029 	err = bus_setup_intr(sc->dev, sc->irq_res,
3030 			     INTR_TYPE_NET | INTR_MPSAFE,
3031 			     NULL, mxge_intr, sc, &sc->ih);
3032 	if (err != 0) {
3033 		goto abort_with_rings;
3034 	}
3035 	/* hook into the network stack */
3036 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3037 	ifp->if_baudrate = 100000000;
3038 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3039 		IFCAP_JUMBO_MTU;
3040 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3041 	ifp->if_capenable = ifp->if_capabilities;
3042 	sc->csum_flag = 1;
3043 	ifp->if_init = mxge_init;
3044 	ifp->if_softc = sc;
3045 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3046 	ifp->if_ioctl = mxge_ioctl;
3047 	ifp->if_start = mxge_start;
3048 	ether_ifattach(ifp, sc->mac_addr);
3049 	/* ether_ifattach sets mtu to 1500 */
3050 	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
3051 
3052 	/* Initialise the ifmedia structure */
3053 	ifmedia_init(&sc->media, 0, mxge_media_change,
3054 		     mxge_media_status);
3055 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3056 	mxge_add_sysctls(sc);
3057 	return 0;
3058 
3059 abort_with_rings:
3060 	mxge_free_rings(sc);
3061 abort_with_irq_res:
3062 	bus_release_resource(dev, SYS_RES_IRQ,
3063 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3064 	if (sc->msi_enabled)
3065 		pci_release_msi(dev);
3066 abort_with_rx_done:
3067 	sc->rx_done.entry = NULL;
3068 	mxge_dma_free(&sc->rx_done.dma);
3069 abort_with_dmabench:
3070 	mxge_dma_free(&sc->dmabench_dma);
3071 abort_with_fw_stats:
3072 	mxge_dma_free(&sc->fw_stats_dma);
3073 abort_with_zeropad_dma:
3074 	mxge_dma_free(&sc->zeropad_dma);
3075 abort_with_cmd_dma:
3076 	mxge_dma_free(&sc->cmd_dma);
3077 abort_with_mem_res:
3078 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3079 abort_with_lock:
3080 	pci_disable_busmaster(dev);
3081 	mtx_destroy(&sc->cmd_mtx);
3082 	mtx_destroy(&sc->tx_mtx);
3083 	mtx_destroy(&sc->driver_mtx);
3084 	if_free(ifp);
3085 abort_with_parent_dmat:
3086 	bus_dma_tag_destroy(sc->parent_dmat);
3087 
3088 abort_with_nothing:
3089 	return err;
3090 }
3091 
3092 static int
3093 mxge_detach(device_t dev)
3094 {
3095 	mxge_softc_t *sc = device_get_softc(dev);
3096 
3097 	mtx_lock(&sc->driver_mtx);
3098 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
3099 		mxge_close(sc);
3100 	callout_stop(&sc->co_hdl);
3101 	mtx_unlock(&sc->driver_mtx);
3102 	ether_ifdetach(sc->ifp);
3103 	ifmedia_removeall(&sc->media);
3104 	mxge_dummy_rdma(sc, 0);
3105 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
3106 	mxge_free_rings(sc);
3107 	bus_release_resource(dev, SYS_RES_IRQ,
3108 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3109 	if (sc->msi_enabled)
3110 		pci_release_msi(dev);
3111 
3112 	sc->rx_done.entry = NULL;
3113 	mxge_dma_free(&sc->rx_done.dma);
3114 	mxge_dma_free(&sc->fw_stats_dma);
3115 	mxge_dma_free(&sc->dmabench_dma);
3116 	mxge_dma_free(&sc->zeropad_dma);
3117 	mxge_dma_free(&sc->cmd_dma);
3118 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3119 	pci_disable_busmaster(dev);
3120 	mtx_destroy(&sc->cmd_mtx);
3121 	mtx_destroy(&sc->tx_mtx);
3122 	mtx_destroy(&sc->driver_mtx);
3123 	if_free(sc->ifp);
3124 	bus_dma_tag_destroy(sc->parent_dmat);
3125 	return 0;
3126 }
3127 
3128 static int
3129 mxge_shutdown(device_t dev)
3130 {
3131 	return 0;
3132 }
3133 
3134 /*
3135   This file uses Myri10GE driver indentation.
3136 
3137   Local Variables:
3138   c-file-style:"linux"
3139   tab-width:8
3140   End:
3141 */
3142