xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 2b743a9e9ddc6736208dc8ca1ce06ce64ad20a19)
1 /******************************************************************************
2 
3 Copyright (c) 2006, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Myricom Inc, nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/linker.h>
40 #include <sys/firmware.h>
41 #include <sys/endian.h>
42 #include <sys/sockio.h>
43 #include <sys/mbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/kdb.h>
46 #include <sys/kernel.h>
47 #include <sys/module.h>
48 #include <sys/memrange.h>
49 #include <sys/socket.h>
50 #include <sys/sysctl.h>
51 #include <sys/sx.h>
52 
53 #include <net/if.h>
54 #include <net/if_arp.h>
55 #include <net/ethernet.h>
56 #include <net/if_dl.h>
57 #include <net/if_media.h>
58 
59 #include <net/bpf.h>
60 
61 #include <net/if_types.h>
62 #include <net/if_vlan_var.h>
63 #include <net/zlib.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/tcp.h>
69 
70 #include <machine/bus.h>
71 #include <machine/resource.h>
72 #include <sys/bus.h>
73 #include <sys/rman.h>
74 
75 #include <dev/pci/pcireg.h>
76 #include <dev/pci/pcivar.h>
77 
78 #include <vm/vm.h>		/* for pmap_mapdev() */
79 #include <vm/pmap.h>
80 
81 #include <dev/mxge/mxge_mcp.h>
82 #include <dev/mxge/mcp_gen_header.h>
83 #include <dev/mxge/if_mxge_var.h>
84 
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on Nvidia bridges */
static int mxge_force_firmware = 0;	/* 0 = auto-detect, 1 = force aligned fw,
					   other = force unaligned fw */
static int mxge_max_intr_slots = 1024;	/* entries in the rx completion ring */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay
					   (microseconds, presumably — used
					   verbatim by the coalescing sysctl) */
static int mxge_deassert_wait = 1;	/* NOTE(review): not referenced in this
					   chunk; likely used by the intr path */
static int mxge_flow_control = 1;	/* default 802.3x pause setting */
static int mxge_verbose = 0;		/* extra probe/attach chatter */
static int mxge_ticks;			/* set elsewhere (tick period?) — TODO confirm */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";	/* fw for unaligned completions */
static char *mxge_fw_aligned = "mxge_eth_z8e";		/* fw for aligned completions */
96 
97 static int mxge_probe(device_t dev);
98 static int mxge_attach(device_t dev);
99 static int mxge_detach(device_t dev);
100 static int mxge_shutdown(device_t dev);
101 static void mxge_intr(void *arg);
102 
/* Standard newbus method table for the mxge driver. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}	/* terminator */
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),	/* per-instance softc allocated by newbus */
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* Firmware images are fetched through firmware(9). */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
125 
126 static int
127 mxge_probe(device_t dev)
128 {
129   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
130       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
131 	  device_set_desc(dev, "Myri10G-PCIE-8A");
132 	  return 0;
133   }
134   return ENXIO;
135 }
136 
/*
 * Try to mark the NIC's register BAR as write-combining through the
 * mem_range (MTRR-style) interface.  On success sc->wc is set so later
 * code knows PIO writes may be combined; on failure the device still
 * works, just with uncombined (slower) PIO writes.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	/* describe the entire register window */
	pa = rman_get_start(sc->mem_res);
	len = rman_get_size(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	} else {
		sc->wc = 1;
	}
}
161 
162 
163 /* callback to get our DMA address */
164 static void
165 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
166 			 int error)
167 {
168 	if (error == 0) {
169 		*(bus_addr_t *) arg = segs->ds_addr;
170 	}
171 }
172 
/*
 * Allocate a single-segment, DMA-visible buffer of 'bytes' bytes with
 * the requested alignment, recording the tag, map, kva and bus address
 * in 'dma'.  Returns 0 or a bus_dma error code; on failure every
 * partially acquired resource is released (goto-based unwinding).
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback fills in dma->bus_addr */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
223 
224 
/* Release everything acquired by mxge_dma_alloc(): unload the map,
 * free the memory, then destroy the tag. */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
232 
233 /*
234  * The eeprom strings on the lanaiX have the format
235  * SN=x\0
236  * MAC=x:x:x:x:x:x\0
237  * PC=text\0
238  */
239 
240 static int
241 mxge_parse_strings(mxge_softc_t *sc)
242 {
243 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
244 
245 	char *ptr, *limit;
246 	int i, found_mac;
247 
248 	ptr = sc->eeprom_strings;
249 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
250 	found_mac = 0;
251 	while (ptr < limit && *ptr != '\0') {
252 		if (memcmp(ptr, "MAC=", 4) == 0) {
253 			ptr += 1;
254 			sc->mac_addr_string = ptr;
255 			for (i = 0; i < 6; i++) {
256 				ptr += 3;
257 				if ((ptr + 2) > limit)
258 					goto abort;
259 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
260 				found_mac = 1;
261 			}
262 		} else if (memcmp(ptr, "PC=", 3) == 0) {
263 			ptr += 3;
264 			strncpy(sc->product_code_string, ptr,
265 				sizeof (sc->product_code_string) - 1);
266 		} else if (memcmp(ptr, "SN=", 3) == 0) {
267 			ptr += 3;
268 			strncpy(sc->serial_number_string, ptr,
269 				sizeof (sc->serial_number_string) - 1);
270 		}
271 		MXGE_NEXT_STRING(ptr);
272 	}
273 
274 	if (found_mac)
275 		return 0;
276 
277  abort:
278 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
279 
280 	return ENXIO;
281 }
282 
283 #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on the upstream Nvidia bridge (the caller
 * has already matched 0x10de:0x005d) by setting bit 0x40 in extended
 * config register 0x178.  The register lives beyond offset 0xff, so
 * it cannot be reached through normal config cycles (see comment
 * below); instead the chipset's memory-mapped extended config window
 * at 0xe0000000 is mapped directly with pmap_mapdev().
 * Returns 0 on success, EIO if the mapping or register access fails.
 */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* fetch the bridge's bus/slot/function and IDs from the bus */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of this function's extended config page:
	   1MB per bus, 4KB per (slot, function) */
	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it by checking that the
	   vendor/device IDs read back match what the bus reported */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	/* set the ECRC-generation enable bit and clean up */
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
373 #else
/* Non-x86 stub: the extended-config mapping above is x86/amd64-only,
 * and an Nforce4 chipset should never appear elsewhere. */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
381 #endif
382 /*
383  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
384  * when the PCI-E Completion packets are aligned on an 8-byte
385  * boundary.  Some PCI-E chip sets always align Completion packets; on
386  * the ones that do not, the alignment can be enforced by enabling
387  * ECRC generation (if supported).
388  *
389  * When PCI-E Completion packets are not aligned, it is actually more
390  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
391  *
392  * If the driver can neither enable ECRC nor verify that it has
393  * already been enabled, then it must use a firmware image which works
394  * around unaligned completion packets (ethp_z8e.dat), and it should
395  * also ensure that it never gives the device a Read-DMA which is
396  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
397  * enabled, then the driver should use the aligned (eth_z8e.dat)
398  * firmware image, and set tx.boundary to 4KB.
399  */
400 
401 static void
402 mxge_select_firmware(mxge_softc_t *sc)
403 {
404 	int err, aligned = 0;
405 	device_t pdev;
406 	uint16_t pvend, pdid;
407 
408 
409 	if (mxge_force_firmware != 0) {
410 		if (mxge_force_firmware == 1)
411 			aligned = 1;
412 		else
413 			aligned = 0;
414 		if (mxge_verbose)
415 			device_printf(sc->dev,
416 				      "Assuming %s completions (forced)\n",
417 				      aligned ? "aligned" : "unaligned");
418 		goto abort;
419 	}
420 
421 	/* if the PCIe link width is 4 or less, we can use the aligned
422 	   firmware and skip any checks */
423 	if (sc->link_width != 0 && sc->link_width <= 4) {
424 		device_printf(sc->dev,
425 			      "PCIe x%d Link, expect reduced performance\n",
426 			      sc->link_width);
427 		aligned = 1;
428 		goto abort;
429 	}
430 
431 	pdev = device_get_parent(device_get_parent(sc->dev));
432 	if (pdev == NULL) {
433 		device_printf(sc->dev, "could not find parent?\n");
434 		goto abort;
435 	}
436 	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
437 	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);
438 
439 	/* see if we can enable ECRC's on an upstream
440 	   Nvidia bridge */
441 	if (mxge_nvidia_ecrc_enable &&
442 	    (pvend == 0x10de && pdid == 0x005d)) {
443 		err = mxge_enable_nvidia_ecrc(sc, pdev);
444 		if (err == 0) {
445 			aligned = 1;
446 			if (mxge_verbose)
447 				device_printf(sc->dev,
448 					      "Assuming aligned completions"
449 					      " (ECRC)\n");
450 		}
451 	}
452 	/* see if the upstream bridge is known to
453 	   provided aligned completions */
454 	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
455 	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
456 	    /* Intel */  (pvend == 0x8086 &&
457 	      /* E5000 NorthBridge*/((pdid >= 0x25f7 && pdid <= 0x25fa) ||
458 	      /* E5000 SouthBridge*/ (pdid >= 0x3510 && pdid <= 0x351b)))) {
459 		aligned = 1;
460 		if (mxge_verbose)
461 			device_printf(sc->dev,
462 				      "Assuming aligned completions "
463 				      "(0x%x:0x%x)\n", pvend, pdid);
464 	}
465 
466 abort:
467 	if (aligned) {
468 		sc->fw_name = mxge_fw_aligned;
469 		sc->tx.boundary = 4096;
470 	} else {
471 		sc->fw_name = mxge_fw_unaligned;
472 		sc->tx.boundary = 2048;
473 	}
474 }
475 
/*
 * Cast-away-const hack: lets the const firmware image pointer be
 * handed to mxge_pio_copy(), whose source parameter is non-const.
 * The data is only ever read through rw_char.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
481 
482 static int
483 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
484 {
485 
486 
487 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
488 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
489 			      be32toh(hdr->mcp_type));
490 		return EIO;
491 	}
492 
493 	/* save firmware version for sysctl */
494 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
495 	if (mxge_verbose)
496 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
497 
498 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
499 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
500 
501 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
502 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
503 		device_printf(sc->dev, "Found firmware version %s\n",
504 			      sc->fw_version);
505 		device_printf(sc->dev, "Driver needs %d.%d\n",
506 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
507 		return EINVAL;
508 	}
509 	return 0;
510 
511 }
512 
/*
 * Fetch the firmware image named sc->fw_name via firmware(9),
 * validate its embedded header, and copy it into NIC SRAM at
 * MXGE_FW_OFFSET.  On entry *limit is the maximum acceptable image
 * size (SRAM size); on success it is updated to the actual image
 * size.  Returns 0 or an errno.
 */
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id: the 32-bit word at MCP_HEADER_PTR_OFFSET holds the
	   (big-endian) offset of the generation header in the image */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	/* qualhack strips const so the image can be a mxge_pio_copy()
	   source; the image itself is never modified */
	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		/* read back one SRAM byte between bursts — presumably to
		   flush posted writes; dummy is intentionally unused */
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
573 
574 /*
575  * Enable or disable periodic RDMAs from the host to make certain
576  * chipsets resend dropped PCIe messages
577  */
578 
/*
 * Ask the firmware to start (enable != 0) or stop (enable == 0) the
 * periodic dummy RDMA described in the comment above.  The request is
 * PIO'ed to the MXGEFW_BOOT_DUMMY_RDMA portal and acknowledged by the
 * firmware DMA-ing 0xffffffff into sc->cmd; a failure is only logged.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align buf on an 8-byte boundary inside buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll up to ~20ms for the firmware's 0xffffffff ack */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
630 
631 static int
632 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
633 {
634 	mcp_cmd_t *buf;
635 	char buf_bytes[sizeof(*buf) + 8];
636 	volatile mcp_cmd_response_t *response = sc->cmd;
637 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
638 	uint32_t dma_low, dma_high;
639 	int sleep_total = 0;
640 
641 	/* ensure buf is aligned to 8 bytes */
642 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
643 
644 	buf->data0 = htobe32(data->data0);
645 	buf->data1 = htobe32(data->data1);
646 	buf->data2 = htobe32(data->data2);
647 	buf->cmd = htobe32(cmd);
648 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
649 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
650 
651 	buf->response_addr.low = htobe32(dma_low);
652 	buf->response_addr.high = htobe32(dma_high);
653 	mtx_lock(&sc->cmd_mtx);
654 	response->result = 0xffffffff;
655 	mb();
656 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
657 
658 	/* wait up to 20ms */
659 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
660 		bus_dmamap_sync(sc->cmd_dma.dmat,
661 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
662 		mb();
663 		if (response->result != 0xffffffff) {
664 			if (response->result == 0) {
665 				data->data0 = be32toh(response->data);
666 				mtx_unlock(&sc->cmd_mtx);
667 				return 0;
668 			} else {
669 				device_printf(sc->dev,
670 					      "mxge: command %d "
671 					      "failed, result = %d\n",
672 					      cmd, be32toh(response->result));
673 				mtx_unlock(&sc->cmd_mtx);
674 				return ENXIO;
675 			}
676 		}
677 		DELAY(1000);
678 	}
679 	mtx_unlock(&sc->cmd_mtx);
680 	device_printf(sc->dev, "mxge: command %d timed out"
681 		      "result = %d\n",
682 		      cmd, be32toh(response->result));
683 	return EAGAIN;
684 }
685 
/*
 * When loading a fresh image fails, try to keep using the firmware
 * already running on the NIC: locate its generation header in SRAM,
 * copy it to host memory, and validate it.  Also flags firmware
 * versions 1.4.4–1.4.11, whose rx filter drops broadcasts unless the
 * NIC stays in ALLMULTI mode.  Returns the validation status or an
 * errno.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header: a (big-endian) pointer to it is
	   stored at MCP_HEADER_PTR_OFFSET in SRAM */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
734 
735 
/*
 * Load the selected firmware image into NIC SRAM and hand control to
 * it through the MXGEFW_BOOT_HANDOFF portal, confirming the new
 * firmware is alive via the 0xffffffff write-back to sc->cmd.  If no
 * image can be loaded, falls back to adopting the firmware already
 * running on the NIC (forcing the conservative 2KB tx boundary).
 * Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align buf on an 8-byte boundary inside buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		/* adopted firmware: assume unaligned completions */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll up to ~200ms for the new firmware's ack */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
818 
819 static int
820 mxge_update_mac_address(mxge_softc_t *sc)
821 {
822 	mxge_cmd_t cmd;
823 	uint8_t *addr = sc->mac_addr;
824 	int status;
825 
826 
827 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
828 		     | (addr[2] << 8) | addr[3]);
829 
830 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
831 
832 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
833 	return status;
834 }
835 
836 static int
837 mxge_change_pause(mxge_softc_t *sc, int pause)
838 {
839 	mxge_cmd_t cmd;
840 	int status;
841 
842 	if (pause)
843 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
844 				       &cmd);
845 	else
846 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
847 				       &cmd);
848 
849 	if (status) {
850 		device_printf(sc->dev, "Failed to set flow control mode\n");
851 		return ENXIO;
852 	}
853 	sc->pause = pause;
854 	return 0;
855 }
856 
857 static void
858 mxge_change_promisc(mxge_softc_t *sc, int promisc)
859 {
860 	mxge_cmd_t cmd;
861 	int status;
862 
863 	if (promisc)
864 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
865 				       &cmd);
866 	else
867 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
868 				       &cmd);
869 
870 	if (status) {
871 		device_printf(sc->dev, "Failed to set promisc mode\n");
872 	}
873 }
874 
/*
 * Program the firmware's multicast filter from the interface's
 * multicast address list.  Filtering is disabled (ALLMULTI) while
 * the list is rebuilt, and left disabled when the firmware lacks
 * multicast support, when the adopted firmware has the rx filter
 * bug, or when the user asked for IFF_ALLMULTI.  Errors from
 * individual firmware commands are logged and abort the update.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* stay in ALLMULTI: see mxge_adopt_running_firmware() */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link-level address across data0/data1 */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
942 
943 
944 static int
945 mxge_reset(mxge_softc_t *sc)
946 {
947 
948 	mxge_cmd_t cmd;
949 	size_t bytes;
950 	int status;
951 
952 	/* try to send a reset command to the card to see if it
953 	   is alive */
954 	memset(&cmd, 0, sizeof (cmd));
955 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
956 	if (status != 0) {
957 		device_printf(sc->dev, "failed reset\n");
958 		return ENXIO;
959 	}
960 
961 	mxge_dummy_rdma(sc, 1);
962 
963 	/* Now exchange information about interrupts  */
964 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
965 	memset(sc->rx_done.entry, 0, bytes);
966 	cmd.data0 = (uint32_t)bytes;
967 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
968 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
969 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
970 	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
971 
972 	status |= mxge_send_cmd(sc,
973 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
974 
975 
976 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
977 
978 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
979 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
980 
981 
982 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
983 				&cmd);
984 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
985 	if (status != 0) {
986 		device_printf(sc->dev, "failed set interrupt parameters\n");
987 		return status;
988 	}
989 
990 
991 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
992 
993 
994 	/* run a DMA benchmark */
995 	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
996 
997 	/* Read DMA */
998 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
999 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1000 	cmd.data2 = sc->tx.boundary * 0x10000;
1001 
1002 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1003 	if (status != 0)
1004 		device_printf(sc->dev, "read dma benchmark failed\n");
1005 	else
1006 		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1007 			(cmd.data0 & 0xffff);
1008 
1009 	/* Write DMA */
1010 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1011 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1012 	cmd.data2 = sc->tx.boundary * 0x1;
1013 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1014 	if (status != 0)
1015 		device_printf(sc->dev, "write dma benchmark failed\n");
1016 	else
1017 		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1018 			(cmd.data0 & 0xffff);
1019 	/* Read/Write DMA */
1020 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1021 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1022 	cmd.data2 = sc->tx.boundary * 0x10001;
1023 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1024 	if (status != 0)
1025 		device_printf(sc->dev, "read/write dma benchmark failed\n");
1026 	else
1027 		sc->read_write_dma =
1028 			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
1029 			(cmd.data0 & 0xffff);
1030 
1031 	/* reset mcp/driver shared state back to 0 */
1032 	bzero(sc->rx_done.entry, bytes);
1033 	sc->rx_done.idx = 0;
1034 	sc->rx_done.cnt = 0;
1035 	sc->tx.req = 0;
1036 	sc->tx.done = 0;
1037 	sc->tx.pkt_done = 0;
1038 	sc->tx.wake = 0;
1039 	sc->tx.stall = 0;
1040 	sc->rx_big.cnt = 0;
1041 	sc->rx_small.cnt = 0;
1042 	sc->rdma_tags_available = 15;
1043 	sc->fw_stats->valid = 0;
1044 	sc->fw_stats->send_done_count = 0;
1045 	status = mxge_update_mac_address(sc);
1046 	mxge_change_promisc(sc, 0);
1047 	mxge_change_pause(sc, sc->pause);
1048 	mxge_set_multicast_list(sc);
1049 	return status;
1050 }
1051 
1052 static int
1053 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1054 {
1055         mxge_softc_t *sc;
1056         unsigned int intr_coal_delay;
1057         int err;
1058 
1059         sc = arg1;
1060         intr_coal_delay = sc->intr_coal_delay;
1061         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1062         if (err != 0) {
1063                 return err;
1064         }
1065         if (intr_coal_delay == sc->intr_coal_delay)
1066                 return 0;
1067 
1068         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1069                 return EINVAL;
1070 
1071 	mtx_lock(&sc->driver_mtx);
1072 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1073 	sc->intr_coal_delay = intr_coal_delay;
1074 
1075 	mtx_unlock(&sc->driver_mtx);
1076         return err;
1077 }
1078 
1079 static int
1080 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1081 {
1082         mxge_softc_t *sc;
1083         unsigned int enabled;
1084         int err;
1085 
1086         sc = arg1;
1087         enabled = sc->pause;
1088         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1089         if (err != 0) {
1090                 return err;
1091         }
1092         if (enabled == sc->pause)
1093                 return 0;
1094 
1095 	mtx_lock(&sc->driver_mtx);
1096 	err = mxge_change_pause(sc, enabled);
1097 	mtx_unlock(&sc->driver_mtx);
1098         return err;
1099 }
1100 
1101 static int
1102 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1103 {
1104         int err;
1105 
1106         if (arg1 == NULL)
1107                 return EFAULT;
1108         arg2 = be32toh(*(int *)arg1);
1109         arg1 = NULL;
1110         err = sysctl_handle_int(oidp, arg1, arg2, req);
1111 
1112         return err;
1113 }
1114 
1115 static void
1116 mxge_add_sysctls(mxge_softc_t *sc)
1117 {
1118 	struct sysctl_ctx_list *ctx;
1119 	struct sysctl_oid_list *children;
1120 	mcp_irq_data_t *fw;
1121 
1122 	ctx = device_get_sysctl_ctx(sc->dev);
1123 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1124 	fw = sc->fw_stats;
1125 
1126 	/* random information */
1127 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1128 		       "firmware_version",
1129 		       CTLFLAG_RD, &sc->fw_version,
1130 		       0, "firmware version");
1131 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1132 		       "serial_number",
1133 		       CTLFLAG_RD, &sc->serial_number_string,
1134 		       0, "serial number");
1135 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1136 		       "product_code",
1137 		       CTLFLAG_RD, &sc->product_code_string,
1138 		       0, "product_code");
1139 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1140 		       "pcie_link_width",
1141 		       CTLFLAG_RD, &sc->link_width,
1142 		       0, "tx_boundary");
1143 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1144 		       "tx_boundary",
1145 		       CTLFLAG_RD, &sc->tx.boundary,
1146 		       0, "tx_boundary");
1147 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1148 		       "write_combine",
1149 		       CTLFLAG_RD, &sc->wc,
1150 		       0, "write combining PIO?");
1151 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1152 		       "read_dma_MBs",
1153 		       CTLFLAG_RD, &sc->read_dma,
1154 		       0, "DMA Read speed in MB/s");
1155 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1156 		       "write_dma_MBs",
1157 		       CTLFLAG_RD, &sc->write_dma,
1158 		       0, "DMA Write speed in MB/s");
1159 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1160 		       "read_write_dma_MBs",
1161 		       CTLFLAG_RD, &sc->read_write_dma,
1162 		       0, "DMA concurrent Read/Write speed in MB/s");
1163 
1164 
1165 	/* performance related tunables */
1166 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1167 			"intr_coal_delay",
1168 			CTLTYPE_INT|CTLFLAG_RW, sc,
1169 			0, mxge_change_intr_coal,
1170 			"I", "interrupt coalescing delay in usecs");
1171 
1172 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1173 			"flow_control_enabled",
1174 			CTLTYPE_INT|CTLFLAG_RW, sc,
1175 			0, mxge_change_flow_control,
1176 			"I", "interrupt coalescing delay in usecs");
1177 
1178 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1179 		       "deassert_wait",
1180 		       CTLFLAG_RW, &mxge_deassert_wait,
1181 		       0, "Wait for IRQ line to go low in ihandler");
1182 
1183 	/* stats block from firmware is in network byte order.
1184 	   Need to swap it */
1185 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1186 			"link_up",
1187 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1188 			0, mxge_handle_be32,
1189 			"I", "link up");
1190 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1191 			"rdma_tags_available",
1192 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1193 			0, mxge_handle_be32,
1194 			"I", "rdma_tags_available");
1195 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1196 			"dropped_link_overflow",
1197 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1198 			0, mxge_handle_be32,
1199 			"I", "dropped_link_overflow");
1200 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1201 			"dropped_link_error_or_filtered",
1202 			CTLTYPE_INT|CTLFLAG_RD,
1203 			&fw->dropped_link_error_or_filtered,
1204 			0, mxge_handle_be32,
1205 			"I", "dropped_link_error_or_filtered");
1206 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1207 			"dropped_multicast_filtered",
1208 			CTLTYPE_INT|CTLFLAG_RD,
1209 			&fw->dropped_multicast_filtered,
1210 			0, mxge_handle_be32,
1211 			"I", "dropped_multicast_filtered");
1212 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1213 			"dropped_runt",
1214 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1215 			0, mxge_handle_be32,
1216 			"I", "dropped_runt");
1217 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1218 			"dropped_overrun",
1219 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1220 			0, mxge_handle_be32,
1221 			"I", "dropped_overrun");
1222 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1223 			"dropped_no_small_buffer",
1224 			CTLTYPE_INT|CTLFLAG_RD,
1225 			&fw->dropped_no_small_buffer,
1226 			0, mxge_handle_be32,
1227 			"I", "dropped_no_small_buffer");
1228 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1229 			"dropped_no_big_buffer",
1230 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1231 			0, mxge_handle_be32,
1232 			"I", "dropped_no_big_buffer");
1233 
1234 	/* host counters exported for debugging */
1235 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1236 		       "rx_small_cnt",
1237 		       CTLFLAG_RD, &sc->rx_small.cnt,
1238 		       0, "rx_small_cnt");
1239 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1240 		       "rx_big_cnt",
1241 		       CTLFLAG_RD, &sc->rx_big.cnt,
1242 		       0, "rx_small_cnt");
1243 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1244 		       "tx_req",
1245 		       CTLFLAG_RD, &sc->tx.req,
1246 		       0, "tx_req");
1247 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1248 		       "tx_done",
1249 		       CTLFLAG_RD, &sc->tx.done,
1250 		       0, "tx_done");
1251 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1252 		       "tx_pkt_done",
1253 		       CTLFLAG_RD, &sc->tx.pkt_done,
1254 		       0, "tx_done");
1255 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1256 		       "tx_stall",
1257 		       CTLFLAG_RD, &sc->tx.stall,
1258 		       0, "tx_stall");
1259 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1260 		       "tx_wake",
1261 		       CTLFLAG_RD, &sc->tx.wake,
1262 		       0, "tx_wake");
1263 
1264 	/* verbose printing? */
1265 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1266 		       "verbose",
1267 		       CTLFLAG_RW, &mxge_verbose,
1268 		       0, "verbose printing");
1269 
1270 }
1271 
1272 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1273    backwards one at a time and handle ring wraps */
1274 
1275 static inline void
1276 mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1277 			    mcp_kreq_ether_send_t *src, int cnt)
1278 {
1279         int idx, starting_slot;
1280         starting_slot = tx->req;
1281         while (cnt > 1) {
1282                 cnt--;
1283                 idx = (starting_slot + cnt) & tx->mask;
1284                 mxge_pio_copy(&tx->lanai[idx],
1285 			      &src[cnt], sizeof(*src));
1286                 mb();
1287         }
1288 }
1289 
1290 /*
1291  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1292  * at most 32 bytes at a time, so as to avoid involving the software
1293  * pio handler in the nic.   We re-write the first segment's flags
1294  * to mark them valid only after writing the entire chain
1295  */
1296 
/* Copy a descriptor chain to the NIC via PIO, marking it valid only
   once the whole chain has been written (the NIC must never see a
   partially-written but valid-looking chain). */
static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Temporarily zero the first descriptor's flags so the NIC
	   treats the chain as invalid while it is being written. */
	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: burst two descriptors (32 bytes) at a time */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request (or, on the non-wrapping
                   path with odd cnt, the remaining final descriptor) */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
1345 
1346 static inline void
1347 mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1348 {
1349     tx->req += cnt;
1350     mb();
1351     while (cnt >= 4) {
1352 	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
1353 	    mb();
1354 	    src += 4;
1355 	    cnt -= 4;
1356     }
1357     if (cnt > 0) {
1358 	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
1359 	       needs to be so that we don't overrun it */
1360 	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
1361 	    mb();
1362     }
1363 }
1364 
/* Build and submit the transmit descriptor chain for a TSO frame.
   The already-DMA-mapped mbuf chain (busdma_seg_cnt segments in
   tx->seg_list) is split into per-MSS pieces, and the rdma_count
   fields are back-patched after each segmentation cut. */
static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;	/* rate-limits the overflow printf below */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < sizeof (*eh)
			    + sizeof (*ip))) {
		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
			   sc->scratch);
		eh = (struct ether_header *)sc->scratch;
	} else {
		eh = mtod(m, struct ether_header *);
	}
	ip = (struct ip *) (eh + 1);
	/* second pass: now that ip_hl is known, make sure the full
	   IP options + TCP header are contiguous as well */
	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		eh = (struct ether_header *) sc->scratch;
		ip = (struct ip *) (eh + 1);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	/* start cum_len at minus the total header length; it turns
	   non-negative exactly where the payload begins */
	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seglen = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			cum_len_next = cum_len + seglen;
			/* retroactively fix up the rdma_count of the
			   descriptor that started the current RDMA run */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free: reset rdma_count to 0 (via -1
				   then ++ below) at every segment boundary */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* cksum_offset counts down to 0 as the header is
			   consumed across descriptors */
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* patch the rdma_count for the final run of descriptors */
	(req-rdma_count)->rdma_count = rdma_count;

	/* mark every descriptor of the last TSO segment */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* flag the last ring slot so mxge_tx_done() counts the packet */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
		once = 1;
	}
	return;

}
1526 
/* Map a frame for DMA and submit it to the NIC as a list of send
   descriptors, handling checksum offload and runt padding; TSO
   frames are diverted to mxge_encap_tso().  On failure the mbuf is
   freed and if_oerrors is incremented. */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < sizeof (*eh)
				    + sizeof (*ip))) {
			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
				   sc->scratch);
			eh = (struct ether_header *)sc->scratch;
		} else {
			eh = mtod(m, struct ether_header *);
		}
		ip = (struct ip *) (eh + 1);
		/* cksum_offset: where the checksummed data starts;
		   pseudo_hdr_offset: where the checksum is stored
		   (stored big-endian for the firmware) */
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* cksum_offset counts down across segments until the
		   start of the checksummed region is reached */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* append one extra descriptor pointing at the
		   pre-allocated zero-filled pad buffer */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the last ring slot so mxge_tx_done() counts the packet */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1678 
1679 
1680 
1681 
1682 static inline void
1683 mxge_start_locked(mxge_softc_t *sc)
1684 {
1685 	struct mbuf *m;
1686 	struct ifnet *ifp;
1687 
1688 	ifp = sc->ifp;
1689 	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1690 	       > MXGE_MAX_SEND_DESC) {
1691 
1692 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1693 		if (m == NULL) {
1694 			return;
1695 		}
1696 		/* let BPF see it */
1697 		BPF_MTAP(ifp, m);
1698 
1699 		/* give it to the nic */
1700 		mxge_encap(sc, m);
1701 	}
1702 	/* ran out of transmit slots */
1703 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1704 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1705 		sc->tx.stall++;
1706 	}
1707 }
1708 
1709 static void
1710 mxge_start(struct ifnet *ifp)
1711 {
1712 	mxge_softc_t *sc = ifp->if_softc;
1713 
1714 
1715 	mtx_lock(&sc->tx_mtx);
1716 	mxge_start_locked(sc);
1717 	mtx_unlock(&sc->tx_mtx);
1718 }
1719 
1720 /*
1721  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1722  * at most 32 bytes at a time, so as to avoid involving the software
1723  * pio handler in the nic.   We re-write the first segment's low
1724  * DMA address to mark it valid only after we write the entire chunk
1725  * in a burst
1726  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* Poison the first descriptor's low address so the NIC cannot
	   treat the group as valid until it is rewritten below. */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* two 32-byte bursts (4 descriptors each) */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address on the NIC side, marking the
	   whole group of 8 valid */
	dst->addr_low = low;
	mb();
}
1742 
1743 static int
1744 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1745 {
1746 	bus_dma_segment_t seg;
1747 	struct mbuf *m;
1748 	mxge_rx_buf_t *rx = &sc->rx_small;
1749 	int cnt, err;
1750 
1751 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1752 	if (m == NULL) {
1753 		rx->alloc_fail++;
1754 		err = ENOBUFS;
1755 		goto done;
1756 	}
1757 	m->m_len = MHLEN;
1758 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1759 				      &seg, &cnt, BUS_DMA_NOWAIT);
1760 	if (err != 0) {
1761 		m_free(m);
1762 		goto done;
1763 	}
1764 	rx->info[idx].m = m;
1765 	rx->shadow[idx].addr_low =
1766 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1767 	rx->shadow[idx].addr_high =
1768 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1769 
1770 done:
1771 	if ((idx & 7) == 7) {
1772 		if (rx->wc_fifo == NULL)
1773 			mxge_submit_8rx(&rx->lanai[idx - 7],
1774 					&rx->shadow[idx - 7]);
1775 		else {
1776 			mb();
1777 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1778 		}
1779         }
1780 	return err;
1781 }
1782 
1783 static int
1784 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1785 {
1786 	bus_dma_segment_t seg;
1787 	struct mbuf *m;
1788 	mxge_rx_buf_t *rx = &sc->rx_big;
1789 	int cnt, err;
1790 
1791 	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1792 	if (m == NULL) {
1793 		rx->alloc_fail++;
1794 		err = ENOBUFS;
1795 		goto done;
1796 	}
1797 	m->m_len = sc->big_bytes;
1798 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1799 				      &seg, &cnt, BUS_DMA_NOWAIT);
1800 	if (err != 0) {
1801 		m_free(m);
1802 		goto done;
1803 	}
1804 	rx->info[idx].m = m;
1805 	rx->shadow[idx].addr_low =
1806 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1807 	rx->shadow[idx].addr_high =
1808 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1809 
1810 done:
1811 	if ((idx & 7) == 7) {
1812 		if (rx->wc_fifo == NULL)
1813 			mxge_submit_8rx(&rx->lanai[idx - 7],
1814 					&rx->shadow[idx - 7]);
1815 		else {
1816 			mb();
1817 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1818 		}
1819         }
1820 	return err;
1821 }
1822 
1823 static inline void
1824 mxge_rx_csum(struct mbuf *m, int csum)
1825 {
1826 	struct ether_header *eh;
1827 	struct ip *ip;
1828 
1829 	eh = mtod(m, struct ether_header *);
1830 	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
1831 		ip = (struct ip *)(eh + 1);
1832 		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1833 				   ip->ip_p == IPPROTO_UDP)) {
1834 			m->m_pkthdr.csum_data = csum;
1835 			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1836 		}
1837 	}
1838 }
1839 
/* Receive a frame spanning one or more big-ring buffers: re-stock
   each consumed ring slot, chain the mbufs, and pass the frame up
   the stack.  If a replacement buffer cannot be allocated, the
   frame is dropped and the remaining slots are recycled in place. */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wunitialized */
	struct mbuf *m_prev = 0;	/* -Wunitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st bytes (MXGEFW_PAD)
			 * so that packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
        if (m_head) {
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
		/* first slot failed; account for the pad bytes too */
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
		/* if a replacement was allocated, free the old buffer
		   and swap the DMA maps; otherwise the slot keeps the
		   old (still mapped) buffer */
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}
1934 
1935 static inline void
1936 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
1937 {
1938 	struct ifnet *ifp;
1939 	struct mbuf *m;
1940 	mxge_rx_buf_t *rx;
1941 	bus_dmamap_t old_map;
1942 	int idx;
1943 
1944 	ifp = sc->ifp;
1945 	rx = &sc->rx_small;
1946 	idx = rx->cnt & rx->mask;
1947 	rx->cnt++;
1948 	/* save a pointer to the received mbuf */
1949 	m = rx->info[idx].m;
1950 	/* try to replace the received mbuf */
1951 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
1952 		/* drop the frame -- the old mbuf is re-cycled */
1953 		ifp->if_ierrors++;
1954 		return;
1955 	}
1956 
1957 	/* unmap the received buffer */
1958 	old_map = rx->info[idx].map;
1959 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1960 	bus_dmamap_unload(rx->dmat, old_map);
1961 
1962 	/* swap the bus_dmamap_t's */
1963 	rx->info[idx].map = rx->extra_map;
1964 	rx->extra_map = old_map;
1965 
1966 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
1967 	 * aligned */
1968 	m->m_data += MXGEFW_PAD;
1969 
1970 	/* if the checksum is valid, mark it in the mbuf header */
1971 	if (sc->csum_flag)
1972 		mxge_rx_csum(m, csum);
1973 
1974 	/* pass the frame up the stack */
1975 	m->m_pkthdr.rcvif = ifp;
1976 	m->m_len = m->m_pkthdr.len = len;
1977 	ifp->if_ipackets++;
1978 	(*ifp->if_input)(ifp, m);
1979 }
1980 
1981 static inline void
1982 mxge_clean_rx_done(mxge_softc_t *sc)
1983 {
1984 	mxge_rx_done_t *rx_done = &sc->rx_done;
1985 	int limit = 0;
1986 	uint16_t length;
1987 	uint16_t checksum;
1988 
1989 
1990 	while (rx_done->entry[rx_done->idx].length != 0) {
1991 		length = ntohs(rx_done->entry[rx_done->idx].length);
1992 		rx_done->entry[rx_done->idx].length = 0;
1993 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
1994 		if (length <= (MHLEN - MXGEFW_PAD))
1995 			mxge_rx_done_small(sc, length, checksum);
1996 		else
1997 			mxge_rx_done_big(sc, length, checksum);
1998 		rx_done->cnt++;
1999 		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
2000 
2001 		/* limit potential for livelock */
2002 		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
2003 			break;
2004 
2005 	}
2006 }
2007 
2008 
/*
 * Reclaim transmit ring entries that the firmware reports as sent.
 * mcp_idx is the firmware's send_done_count; walk the ring until our
 * pkt_done counter catches up with it, unloading the DMA map and
 * freeing the mbuf attached to the first descriptor of each packet.
 * If at least 3/4 of the ring is now free and the stack was blocked,
 * clear IFF_DRV_OACTIVE and restart transmission.
 */
static inline void
mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx, limit;

	limit = 0;
	tx = &sc->tx;
	ifp = sc->ifp;
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ifp->if_opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		/* info[idx].flag marks one descriptor per packet; seeing
		   it means one whole packet has been completed */
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
		/* limit potential for livelock by only handling
		   2 full tx rings per call */
		if (__predict_false(++limit >  2 * tx->mask))
			break;
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
           its OK to send packets */

	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		mtx_lock(&sc->tx_mtx);
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		sc->tx.wake++;
		mxge_start_locked(sc);
		mtx_unlock(&sc->tx_mtx);
	}
}
2056 
/*
 * Interrupt handler (legacy INTx or MSI).  Drains transmit completions
 * and the receive completion ring, looping as long as the firmware's
 * DMA'd "valid" byte remains set (legacy irq still asserted), then
 * processes any firmware statistics update (link state transitions,
 * RDMA tag exhaustion, link-down counts).  Finally writes the
 * irq_claim registers to re-arm the interrupt.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (!sc->msi_enabled) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* the firmware raises stats_updated when the stats block changed */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		/* down_cnt lets mxge_close() detect the "down" irq */
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2124 
/*
 * Required if_init entry point.  Intentionally empty: the interface
 * is actually brought up by mxge_open(), called from the SIOCSIFFLAGS
 * path in mxge_ioctl().
 */
static void
mxge_init(void *arg)
{
}
2129 
2130 
2131 
2132 static void
2133 mxge_free_mbufs(mxge_softc_t *sc)
2134 {
2135 	int i;
2136 
2137 	for (i = 0; i <= sc->rx_big.mask; i++) {
2138 		if (sc->rx_big.info[i].m == NULL)
2139 			continue;
2140 		bus_dmamap_unload(sc->rx_big.dmat,
2141 				  sc->rx_big.info[i].map);
2142 		m_freem(sc->rx_big.info[i].m);
2143 		sc->rx_big.info[i].m = NULL;
2144 	}
2145 
2146 	for (i = 0; i <= sc->rx_big.mask; i++) {
2147 		if (sc->rx_big.info[i].m == NULL)
2148 			continue;
2149 		bus_dmamap_unload(sc->rx_big.dmat,
2150 				  sc->rx_big.info[i].map);
2151 		m_freem(sc->rx_big.info[i].m);
2152 		sc->rx_big.info[i].m = NULL;
2153 	}
2154 
2155 	for (i = 0; i <= sc->tx.mask; i++) {
2156 		sc->tx.info[i].flag = 0;
2157 		if (sc->tx.info[i].m == NULL)
2158 			continue;
2159 		bus_dmamap_unload(sc->tx.dmat,
2160 				  sc->tx.info[i].map);
2161 		m_freem(sc->tx.info[i].m);
2162 		sc->tx.info[i].m = NULL;
2163 	}
2164 }
2165 
/*
 * Free all host memory associated with the rings: the tx request copy
 * block, busdma segment list, rx shadow rings, the per-slot info
 * arrays, and the busdma maps and tags created in mxge_alloc_rings().
 * Every pointer is NULL-checked, so this is safe to call on a
 * partially-initialized softc from the alloc error paths.
 */
static void
mxge_free_rings(mxge_softc_t *sc)
{
	int i;

	if (sc->tx.req_bytes != NULL)
		free(sc->tx.req_bytes, M_DEVBUF);
	if (sc->tx.seg_list != NULL)
		free(sc->tx.seg_list, M_DEVBUF);
	if (sc->rx_small.shadow != NULL)
		free(sc->rx_small.shadow, M_DEVBUF);
	if (sc->rx_big.shadow != NULL)
		free(sc->rx_big.shadow, M_DEVBUF);
	/* destroy all per-slot maps before destroying their tag */
	if (sc->tx.info != NULL) {
		if (sc->tx.dmat != NULL) {
			for (i = 0; i <= sc->tx.mask; i++) {
				bus_dmamap_destroy(sc->tx.dmat,
						   sc->tx.info[i].map);
			}
			bus_dma_tag_destroy(sc->tx.dmat);
		}
		free(sc->tx.info, M_DEVBUF);
	}
	if (sc->rx_small.info != NULL) {
		if (sc->rx_small.dmat != NULL) {
			for (i = 0; i <= sc->rx_small.mask; i++) {
				bus_dmamap_destroy(sc->rx_small.dmat,
						   sc->rx_small.info[i].map);
			}
			bus_dmamap_destroy(sc->rx_small.dmat,
					   sc->rx_small.extra_map);
			bus_dma_tag_destroy(sc->rx_small.dmat);
		}
		free(sc->rx_small.info, M_DEVBUF);
	}
	if (sc->rx_big.info != NULL) {
		if (sc->rx_big.dmat != NULL) {
			for (i = 0; i <= sc->rx_big.mask; i++) {
				bus_dmamap_destroy(sc->rx_big.dmat,
						   sc->rx_big.info[i].map);
			}
			bus_dmamap_destroy(sc->rx_big.dmat,
					   sc->rx_big.extra_map);
			bus_dma_tag_destroy(sc->rx_big.dmat);
		}
		free(sc->rx_big.info, M_DEVBUF);
	}
}
2214 
/*
 * Query the firmware for the tx/rx ring sizes, then allocate all host
 * memory and busdma resources needed to drive them: the tx request
 * copy block (8-byte aligned), the tx segment list, rx shadow rings,
 * per-slot info arrays, DMA tags for tx/rx_small/rx_big, and one DMA
 * map per ring slot (plus one spare "extra" map per rx ring).
 *
 * Returns 0 on success or an errno; on failure everything allocated
 * so far is released via mxge_free_rings().
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	/* ring sizes are reported in bytes; convert to entry counts */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	sc->tx.mask = tx_ring_entries - 1;
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;

	err = ENOMEM;

	/* allocate the tx request copy block */
	/* NOTE(review): M_WAITOK allocations cannot return NULL, so the
	   NULL checks below are redundant but harmless */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_nothing;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC/2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 4096,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2398 
/*
 * Bring the interface up: reset the NIC, size the big receive buffers
 * for the current MTU, fetch the lanai ring pointers from the
 * firmware, stock both receive rings, program the firmware's MTU,
 * buffer sizes and stats block address, and finally start the
 * ethernet engine.
 *
 * Returns 0 on success or an errno; on failure any mbufs already
 * posted to the rings are freed.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err;
	bus_dmamap_t map;
	bus_addr_t bus;


	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}
	/* clear the interrupt (receive completion) queue */
	bzero(sc->rx_done.entry,
	      mxge_max_intr_slots * sizeof(*sc->rx_done.entry));

	/* use regular clusters if a full frame fits, else page-sized */
	if (MCLBYTES >=
	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
		sc->big_bytes = MCLBYTES;
	else
		sc->big_bytes = MJUMPAGESIZE;


	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* use the write-combining fifos only if WC mapping succeeded */
	if (sc->wc) {
		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
	} else {
		sc->tx.wc_fifo = 0;
		sc->rx_small.wc_fifo = 0;
		sc->rx_big.wc_fifo = 0;
	}


	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	for (i = 0; i <= sc->rx_big.mask; i++) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = sc->big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	if (err != 0) {
		/* fall back to the obsolete stats interface, pointed
		   directly at the send_done_count field */
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
2534 
2535 static int
2536 mxge_close(mxge_softc_t *sc)
2537 {
2538 	mxge_cmd_t cmd;
2539 	int err, old_down_cnt;
2540 
2541 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2542 	old_down_cnt = sc->down_cnt;
2543 	mb();
2544 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2545 	if (err) {
2546 		device_printf(sc->dev, "Couldn't bring down link\n");
2547 	}
2548 	if (old_down_cnt == sc->down_cnt) {
2549 		/* wait for down irq */
2550 		DELAY(10 * sc->intr_coal_delay);
2551 	}
2552 	if (old_down_cnt == sc->down_cnt) {
2553 		device_printf(sc->dev, "never got down irq\n");
2554 	}
2555 
2556 	mxge_free_mbufs(sc);
2557 
2558 	return 0;
2559 }
2560 
/*
 * Program the PCI config space bits the driver depends on: record the
 * negotiated PCIe link width, raise the maximum read request size to
 * 4KB, and enable bus mastering and memory space decoding.  Also
 * called from the watchdog path to redo these settings after a NIC
 * reboot (once config space restore becomes possible).
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* Link Status register lives at cap + 0x12; the
		   negotiated link width is in bits 9:4 */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* Device Control register at cap + 0x8: set the max
		   read request size field (bits 14:12) to 5 = 4096B */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
2584 
/*
 * Read the NIC's reboot status register through the Myricom
 * vendor-specific PCI capability.  Returns (uint32_t)-1 if the
 * capability cannot be found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	/* the requested register's value is returned at vs + 0x14 */
	return (pci_read_config(dev, vs + 0x14, 4));
}
2603 
/*
 * Attempt to recover from a transmit hang.  If PCI config space was
 * wiped (busmaster bit reads zero) the NIC rebooted; restoring config
 * space is not yet possible, so we give up in that case.  Otherwise
 * dump ring state and bounce the interface with close/open.  If
 * recovery fails, the watchdog callout is stopped to avoid spamming
 * the console.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	int err;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			goto abort;
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */

		/* XXXX waiting for pci_cfg_restore() to be exported */
		goto abort; /* just abort for now */

		/* NOTE: unreachable until the goto above is removed */
		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);
	} else {
		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
			      sc->tx.req, sc->tx.done);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      sc->tx.pkt_done,
			      be32toh(sc->fw_stats->send_done_count));
	}

	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}

abort:
	/*
	 * stop the watchdog if the nic is dead, to avoid spamming the
	 * console
	 */
	if (err != 0) {
		callout_stop(&sc->co_hdl);
	}
}
2671 
2672 static void
2673 mxge_watchdog(mxge_softc_t *sc)
2674 {
2675 	mxge_tx_buf_t *tx = &sc->tx;
2676 
2677 	/* see if we have outstanding transmits, which
2678 	   have been pending for more than mxge_ticks */
2679 	if (tx->req != tx->done &&
2680 	    tx->watchdog_req != tx->watchdog_done &&
2681 	    tx->done == tx->watchdog_done)
2682 		mxge_watchdog_reset(sc);
2683 
2684 	tx->watchdog_req = tx->req;
2685 	tx->watchdog_done = tx->done;
2686 }
2687 
/*
 * Periodic timer callout; reschedules itself every mxge_ticks and
 * runs the transmit watchdog.  The callout was initialized with
 * callout_init_mtx() on driver_mtx, so the mutex is held on entry.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;


	/* Synchronize with possible callout reset/stop. */
	if (callout_pending(&sc->co_hdl) ||
	    !callout_active(&sc->co_hdl)) {
		/* NOTE(review): co_hdl was callout_init_mtx()'d with
		   flags 0 (not CALLOUT_RETURNUNLOCKED); verify that
		   this explicit unlock on the race path is correct */
		mtx_unlock(&sc->driver_mtx);
		return;
	}

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	mxge_watchdog(sc);
}
2704 
2705 static int
2706 mxge_media_change(struct ifnet *ifp)
2707 {
2708 	return EINVAL;
2709 }
2710 
2711 static int
2712 mxge_change_mtu(mxge_softc_t *sc, int mtu)
2713 {
2714 	struct ifnet *ifp = sc->ifp;
2715 	int real_mtu, old_mtu;
2716 	int err = 0;
2717 
2718 
2719 	real_mtu = mtu + ETHER_HDR_LEN;
2720 	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2721 	    real_mtu < 60)
2722 		return EINVAL;
2723 	mtx_lock(&sc->driver_mtx);
2724 	old_mtu = ifp->if_mtu;
2725 	ifp->if_mtu = mtu;
2726 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2727 		callout_stop(&sc->co_hdl);
2728 		mxge_close(sc);
2729 		err = mxge_open(sc);
2730 		if (err != 0) {
2731 			ifp->if_mtu = old_mtu;
2732 			mxge_close(sc);
2733 			(void) mxge_open(sc);
2734 		}
2735 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
2736 	}
2737 	mtx_unlock(&sc->driver_mtx);
2738 	return err;
2739 }
2740 
2741 static void
2742 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2743 {
2744 	mxge_softc_t *sc = ifp->if_softc;
2745 
2746 
2747 	if (sc == NULL)
2748 		return;
2749 	ifmr->ifm_status = IFM_AVALID;
2750 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2751 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2752 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2753 }
2754 
2755 static int
2756 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2757 {
2758 	mxge_softc_t *sc = ifp->if_softc;
2759 	struct ifreq *ifr = (struct ifreq *)data;
2760 	int err, mask;
2761 
2762 	err = 0;
2763 	switch (command) {
2764 	case SIOCSIFADDR:
2765 	case SIOCGIFADDR:
2766 		err = ether_ioctl(ifp, command, data);
2767 		break;
2768 
2769 	case SIOCSIFMTU:
2770 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2771 		break;
2772 
2773 	case SIOCSIFFLAGS:
2774 		mtx_lock(&sc->driver_mtx);
2775 		if (ifp->if_flags & IFF_UP) {
2776 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
2777 				err = mxge_open(sc);
2778 				callout_reset(&sc->co_hdl, mxge_ticks,
2779 					      mxge_tick, sc);
2780 			} else {
2781 				/* take care of promis can allmulti
2782 				   flag chages */
2783 				mxge_change_promisc(sc,
2784 						    ifp->if_flags & IFF_PROMISC);
2785 				mxge_set_multicast_list(sc);
2786 			}
2787 		} else {
2788 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2789 				mxge_close(sc);
2790 				callout_stop(&sc->co_hdl);
2791 			}
2792 		}
2793 		mtx_unlock(&sc->driver_mtx);
2794 		break;
2795 
2796 	case SIOCADDMULTI:
2797 	case SIOCDELMULTI:
2798 		mtx_lock(&sc->driver_mtx);
2799 		mxge_set_multicast_list(sc);
2800 		mtx_unlock(&sc->driver_mtx);
2801 		break;
2802 
2803 	case SIOCSIFCAP:
2804 		mtx_lock(&sc->driver_mtx);
2805 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2806 		if (mask & IFCAP_TXCSUM) {
2807 			if (IFCAP_TXCSUM & ifp->if_capenable) {
2808 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2809 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2810 						      | CSUM_TSO);
2811 			} else {
2812 				ifp->if_capenable |= IFCAP_TXCSUM;
2813 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2814 			}
2815 		} else if (mask & IFCAP_RXCSUM) {
2816 			if (IFCAP_RXCSUM & ifp->if_capenable) {
2817 				ifp->if_capenable &= ~IFCAP_RXCSUM;
2818 				sc->csum_flag = 0;
2819 			} else {
2820 				ifp->if_capenable |= IFCAP_RXCSUM;
2821 				sc->csum_flag = 1;
2822 			}
2823 		}
2824 		if (mask & IFCAP_TSO4) {
2825 			if (IFCAP_TSO4 & ifp->if_capenable) {
2826 				ifp->if_capenable &= ~IFCAP_TSO4;
2827 				ifp->if_hwassist &= ~CSUM_TSO;
2828 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2829 				ifp->if_capenable |= IFCAP_TSO4;
2830 				ifp->if_hwassist |= CSUM_TSO;
2831 			} else {
2832 				printf("mxge requires tx checksum offload"
2833 				       " be enabled to use TSO\n");
2834 				err = EINVAL;
2835 			}
2836 		}
2837 		mtx_unlock(&sc->driver_mtx);
2838 		break;
2839 
2840 	case SIOCGIFMEDIA:
2841 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2842 				    &sc->media, command);
2843                 break;
2844 
2845 	default:
2846 		err = ENOTTY;
2847         }
2848 	return err;
2849 }
2850 
2851 static void
2852 mxge_fetch_tunables(mxge_softc_t *sc)
2853 {
2854 
2855 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2856 			  &mxge_flow_control);
2857 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2858 			  &mxge_intr_coal_delay);
2859 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2860 			  &mxge_nvidia_ecrc_enable);
2861 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
2862 			  &mxge_force_firmware);
2863 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2864 			  &mxge_deassert_wait);
2865 	TUNABLE_INT_FETCH("hw.mxge.verbose",
2866 			  &mxge_verbose);
2867 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
2868 
2869 	if (bootverbose)
2870 		mxge_verbose = 1;
2871 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2872 		mxge_intr_coal_delay = 30;
2873 	if (mxge_ticks == 0)
2874 		mxge_ticks = hz;
2875 	sc->pause = mxge_flow_control;
2876 }
2877 
2878 static int
2879 mxge_attach(device_t dev)
2880 {
2881 	mxge_softc_t *sc = device_get_softc(dev);
2882 	struct ifnet *ifp;
2883 	size_t bytes;
2884 	int count, rid, err;
2885 
2886 	sc->dev = dev;
2887 	mxge_fetch_tunables(sc);
2888 
2889 	err = bus_dma_tag_create(NULL,			/* parent */
2890 				 1,			/* alignment */
2891 				 4096,			/* boundary */
2892 				 BUS_SPACE_MAXADDR,	/* low */
2893 				 BUS_SPACE_MAXADDR,	/* high */
2894 				 NULL, NULL,		/* filter */
2895 				 65536 + 256,		/* maxsize */
2896 				 MXGE_MAX_SEND_DESC, 	/* num segs */
2897 				 4096,			/* maxsegsize */
2898 				 0,			/* flags */
2899 				 NULL, NULL,		/* lock */
2900 				 &sc->parent_dmat);	/* tag */
2901 
2902 	if (err != 0) {
2903 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
2904 			      err);
2905 		goto abort_with_nothing;
2906 	}
2907 
2908 	ifp = sc->ifp = if_alloc(IFT_ETHER);
2909 	if (ifp == NULL) {
2910 		device_printf(dev, "can not if_alloc()\n");
2911 		err = ENOSPC;
2912 		goto abort_with_parent_dmat;
2913 	}
2914 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
2915 		 device_get_nameunit(dev));
2916 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
2917 	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
2918 		 device_get_nameunit(dev));
2919 	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
2920 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
2921 		 "%s:drv", device_get_nameunit(dev));
2922 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
2923 		 MTX_NETWORK_LOCK, MTX_DEF);
2924 
2925 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
2926 
2927 	mxge_setup_cfg_space(sc);
2928 
2929 	/* Map the board into the kernel */
2930 	rid = PCIR_BARS;
2931 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
2932 					 ~0, 1, RF_ACTIVE);
2933 	if (sc->mem_res == NULL) {
2934 		device_printf(dev, "could not map memory\n");
2935 		err = ENXIO;
2936 		goto abort_with_lock;
2937 	}
2938 	sc->sram = rman_get_virtual(sc->mem_res);
2939 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
2940 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
2941 		device_printf(dev, "impossible memory region size %ld\n",
2942 			      rman_get_size(sc->mem_res));
2943 		err = ENXIO;
2944 		goto abort_with_mem_res;
2945 	}
2946 
2947 	/* make NULL terminated copy of the EEPROM strings section of
2948 	   lanai SRAM */
2949 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
2950 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
2951 				rman_get_bushandle(sc->mem_res),
2952 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
2953 				sc->eeprom_strings,
2954 				MXGE_EEPROM_STRINGS_SIZE - 2);
2955 	err = mxge_parse_strings(sc);
2956 	if (err != 0)
2957 		goto abort_with_mem_res;
2958 
2959 	/* Enable write combining for efficient use of PCIe bus */
2960 	mxge_enable_wc(sc);
2961 
2962 	/* Allocate the out of band dma memory */
2963 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
2964 			     sizeof (mxge_cmd_t), 64);
2965 	if (err != 0)
2966 		goto abort_with_mem_res;
2967 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
2968 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
2969 	if (err != 0)
2970 		goto abort_with_cmd_dma;
2971 
2972 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
2973 			     sizeof (*sc->fw_stats), 64);
2974 	if (err != 0)
2975 		goto abort_with_zeropad_dma;
2976 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
2977 
2978 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
2979 	if (err != 0)
2980 		goto abort_with_fw_stats;
2981 
2982 	/* allocate interrupt queues */
2983 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
2984 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2985 	if (err != 0)
2986 		goto abort_with_dmabench;
2987 	sc->rx_done.entry = sc->rx_done.dma.addr;
2988 	bzero(sc->rx_done.entry, bytes);
2989 
2990 	/* Add our ithread  */
2991 	count = pci_msi_count(dev);
2992 	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
2993 		rid = 1;
2994 		sc->msi_enabled = 1;
2995 	} else {
2996 		rid = 0;
2997 	}
2998 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
2999 					 1, RF_SHAREABLE | RF_ACTIVE);
3000 	if (sc->irq_res == NULL) {
3001 		device_printf(dev, "could not alloc interrupt\n");
3002 		goto abort_with_rx_done;
3003 	}
3004 	if (mxge_verbose)
3005 		device_printf(dev, "using %s irq %ld\n",
3006 			      sc->msi_enabled ? "MSI" : "INTx",
3007 			      rman_get_start(sc->irq_res));
3008 	/* load the firmware */
3009 	mxge_select_firmware(sc);
3010 
3011 	err = mxge_load_firmware(sc);
3012 	if (err != 0)
3013 		goto abort_with_irq_res;
3014 	sc->intr_coal_delay = mxge_intr_coal_delay;
3015 	err = mxge_reset(sc);
3016 	if (err != 0)
3017 		goto abort_with_irq_res;
3018 
3019 	err = mxge_alloc_rings(sc);
3020 	if (err != 0) {
3021 		device_printf(sc->dev, "failed to allocate rings\n");
3022 		goto abort_with_irq_res;
3023 	}
3024 
3025 	err = bus_setup_intr(sc->dev, sc->irq_res,
3026 			     INTR_TYPE_NET | INTR_MPSAFE,
3027 			     NULL, mxge_intr, sc, &sc->ih);
3028 	if (err != 0) {
3029 		goto abort_with_rings;
3030 	}
3031 	/* hook into the network stack */
3032 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
3033 	ifp->if_baudrate = 100000000;
3034 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
3035 		IFCAP_JUMBO_MTU;
3036 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
3037 	ifp->if_capenable = ifp->if_capabilities;
3038 	sc->csum_flag = 1;
3039         ifp->if_init = mxge_init;
3040         ifp->if_softc = sc;
3041         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
3042         ifp->if_ioctl = mxge_ioctl;
3043         ifp->if_start = mxge_start;
3044 	ether_ifattach(ifp, sc->mac_addr);
3045 	/* ether_ifattach sets mtu to 1500 */
3046 	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
3047 
3048 	/* Initialise the ifmedia structure */
3049 	ifmedia_init(&sc->media, 0, mxge_media_change,
3050 		     mxge_media_status);
3051 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
3052 	mxge_add_sysctls(sc);
3053 	return 0;
3054 
3055 abort_with_rings:
3056 	mxge_free_rings(sc);
3057 abort_with_irq_res:
3058 	bus_release_resource(dev, SYS_RES_IRQ,
3059 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
3060 	if (sc->msi_enabled)
3061 		pci_release_msi(dev);
3062 abort_with_rx_done:
3063 	sc->rx_done.entry = NULL;
3064 	mxge_dma_free(&sc->rx_done.dma);
3065 abort_with_dmabench:
3066 	mxge_dma_free(&sc->dmabench_dma);
3067 abort_with_fw_stats:
3068 	mxge_dma_free(&sc->fw_stats_dma);
3069 abort_with_zeropad_dma:
3070 	mxge_dma_free(&sc->zeropad_dma);
3071 abort_with_cmd_dma:
3072 	mxge_dma_free(&sc->cmd_dma);
3073 abort_with_mem_res:
3074 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
3075 abort_with_lock:
3076 	pci_disable_busmaster(dev);
3077 	mtx_destroy(&sc->cmd_mtx);
3078 	mtx_destroy(&sc->tx_mtx);
3079 	mtx_destroy(&sc->driver_mtx);
3080 	if_free(ifp);
3081 abort_with_parent_dmat:
3082 	bus_dma_tag_destroy(sc->parent_dmat);
3083 
3084 abort_with_nothing:
3085 	return err;
3086 }
3087 
/*
 * Device detach: close the interface if it is running, stop the
 * watchdog callout, detach from the network stack, then release the
 * interrupt, rings, DMA blocks, memory resource and mutexes acquired
 * in mxge_attach() (in reverse order).  Always returns 0.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	mtx_lock(&sc->driver_mtx);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	callout_stop(&sc->co_hdl);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	ifmedia_removeall(&sc->media);
	/* stop the NIC's dummy rdma engine before tearing down DMA */
	mxge_dummy_rdma(sc, 0);
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_rings(sc);
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);

	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
3123 
3124 static int
3125 mxge_shutdown(device_t dev)
3126 {
3127 	return 0;
3128 }
3129 
3130 /*
3131   This file uses Myri10GE driver indentation.
3132 
3133   Local Variables:
3134   c-file-style:"linux"
3135   tab-width:8
3136   End:
3137 */
3138