xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 85999a0155e389415cc476110fd5614baf543a55)
1 /******************************************************************************
2 
3 Copyright (c) 2006, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Myricom Inc, nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/linker.h>
40 #include <sys/firmware.h>
41 #include <sys/endian.h>
42 #include <sys/sockio.h>
43 #include <sys/mbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/kdb.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/module.h>
49 #include <sys/memrange.h>
50 #include <sys/socket.h>
51 #include <sys/sysctl.h>
52 #include <sys/sx.h>
53 
54 #include <net/if.h>
55 #include <net/if_arp.h>
56 #include <net/ethernet.h>
57 #include <net/if_dl.h>
58 #include <net/if_media.h>
59 
60 #include <net/bpf.h>
61 
62 #include <net/if_types.h>
63 #include <net/if_vlan_var.h>
64 #include <net/zlib.h>
65 
66 #include <netinet/in_systm.h>
67 #include <netinet/in.h>
68 #include <netinet/ip.h>
69 #include <netinet/tcp.h>
70 
71 #include <machine/bus.h>
72 #include <machine/resource.h>
73 #include <sys/bus.h>
74 #include <sys/rman.h>
75 
76 #include <dev/pci/pcireg.h>
77 #include <dev/pci/pcivar.h>
78 
79 #include <vm/vm.h>		/* for pmap_mapdev() */
80 #include <vm/pmap.h>
81 
82 #include <dev/mxge/mxge_mcp.h>
83 #include <dev/mxge/mcp_gen_header.h>
84 #include <dev/mxge/if_mxge_var.h>
85 
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on an upstream Nvidia bridge */
static int mxge_force_firmware = 0;	/* 0 = auto-detect; 1 = force aligned fw; other = force unaligned */
static int mxge_max_intr_slots = 1024;	/* number of rx completion (intr queue) slots */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay written to the NIC */
static int mxge_deassert_wait = 1;	/* NOTE(review): use not visible in this chunk — presumably wait for irq deassert */
static int mxge_flow_control = 1;	/* NOTE(review): use not visible in this chunk — presumably default pause setting */
static int mxge_verbose = 0;	/* enable extra device_printf chatter */
static int mxge_ticks;	/* NOTE(review): use not visible in this chunk — presumably callout period */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";	/* fw tolerating unaligned PCIe completions */
static char *mxge_fw_aligned = "mxge_eth_z8e";	/* fw requiring aligned PCIe completions */
97 
/* newbus entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

/* Device interface method table wiring the driver into newbus. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}	/* terminator */
};
113 
/* Driver description: name, methods, and per-instance softc size. */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware(9) images are loaded at attach time, so depend on it */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
126 
127 static int
128 mxge_probe(device_t dev)
129 {
130   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
131       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
132 	  device_set_desc(dev, "Myri10G-PCIE-8A");
133 	  return 0;
134   }
135   return ENXIO;
136 }
137 
138 static void
139 mxge_enable_wc(mxge_softc_t *sc)
140 {
141 	struct mem_range_desc mrdesc;
142 	vm_paddr_t pa;
143 	vm_offset_t len;
144 	int err, action;
145 
146 	pa = rman_get_start(sc->mem_res);
147 	len = rman_get_size(sc->mem_res);
148 	mrdesc.mr_base = pa;
149 	mrdesc.mr_len = len;
150 	mrdesc.mr_flags = MDF_WRITECOMBINE;
151 	action = MEMRANGE_SET_UPDATE;
152 	strcpy((char *)&mrdesc.mr_owner, "mxge");
153 	err = mem_range_attr_set(&mrdesc, &action);
154 	if (err != 0) {
155 		device_printf(sc->dev,
156 			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
157 			      (unsigned long)pa, (unsigned long)len, err);
158 	} else {
159 		sc->wc = 1;
160 	}
161 }
162 
163 
164 /* callback to get our DMA address */
165 static void
166 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
167 			 int error)
168 {
169 	if (error == 0) {
170 		*(bus_addr_t *) arg = segs->ds_addr;
171 	}
172 }
173 
/*
 * Allocate a coherent, zeroed DMA buffer of 'bytes' bytes with the
 * given alignment: create a single-segment busdma tag, allocate and
 * map the memory, then load the map to learn dma->bus_addr.
 * Returns 0 on success or a busdma errno; on failure every
 * intermediate resource is torn down (goto-cleanup pattern).
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; callback fills in dma->bus_addr */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
224 
225 
/*
 * Release a buffer created by mxge_dma_alloc(): unload the map,
 * free the memory, then destroy the tag (reverse order of creation).
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
233 
234 /*
235  * The eeprom strings on the lanaiX have the format
236  * SN=x\0
237  * MAC=x:x:x:x:x:x\0
238  * PC=text\0
239  */
240 
/*
 * Scan the NUL-separated "KEY=value" records in sc->eeprom_strings
 * and extract the MAC address (required), product code and serial
 * number (optional).  Returns 0 when a MAC was found, else ENXIO.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* advance ptr just past the current NUL-terminated record */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* only +1 here: the first "ptr += 3" in the
			   loop below consumes the remaining "AC=", so
			   every iteration lands on the next hex octet */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* NOTE(review): strncpy here relies on the softc
			   being zeroed for NUL termination */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
283 
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (nForce4) PCIe bridge
 * by mapping its extended config space directly with pmap_mapdev()
 * and setting bit 0x40 at offset 0x178.  Returns 0 on success,
 * EIO/ENXIO on failure.  x86/amd64 only.
 */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* fetch the bridge's bus/slot/function and IDs from its parent */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of the device's extended config window */
	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it by checking that the
	   IDs read back through the mapping match the expected ones */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	/* set the ECRC enable bit and unmap */
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
#else
/* Non-x86 stub: the nForce4 hack above only makes sense on x86/amd64. */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
#endif
383 /*
384  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
385  * when the PCI-E Completion packets are aligned on an 8-byte
386  * boundary.  Some PCI-E chip sets always align Completion packets; on
387  * the ones that do not, the alignment can be enforced by enabling
388  * ECRC generation (if supported).
389  *
390  * When PCI-E Completion packets are not aligned, it is actually more
391  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
392  *
393  * If the driver can neither enable ECRC nor verify that it has
394  * already been enabled, then it must use a firmware image which works
395  * around unaligned completion packets (ethp_z8e.dat), and it should
396  * also ensure that it never gives the device a Read-DMA which is
397  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
398  * enabled, then the driver should use the aligned (eth_z8e.dat)
399  * firmware image, and set tx.boundary to 4KB.
400  */
/*
 * Decide between the aligned and unaligned firmware images (see the
 * long comment above) and set sc->fw_name and sc->tx.boundary
 * accordingly: aligned => eth_z8e / 4KB, unaligned => ethp_z8e / 2KB.
 */
static void
mxge_select_firmware(mxge_softc_t *sc)
{
	int err, aligned = 0;
	device_t pdev;
	uint16_t pvend, pdid;


	/* administrator override via tunable: 1 = aligned, other = unaligned */
	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		goto abort;
	}
	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* see if we can enable ECRC's on an upstream
	   Nvidia bridge */
	if (mxge_nvidia_ecrc_enable &&
	    (pvend == 0x10de && pdid == 0x005d)) {
		err = mxge_enable_nvidia_ecrc(sc, pdev);
		if (err == 0) {
			aligned = 1;
			if (mxge_verbose)
				device_printf(sc->dev,
					      "Assuming aligned completions"
					      " (ECRC)\n");
		}
	}
	/* see if the upstream bridge is known to
	   provide aligned completions */
	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
	    /* Intel */  (pvend == 0x8086 &&
	      /* E5000 NorthBridge*/((pdid >= 0x25f7 && pdid <= 0x25fa) ||
	      /* E5000 SouthBridge*/ (pdid >= 0x3510 && pdid <= 0x351b)))) {
		aligned = 1;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming aligned completions "
				      "(0x%x:0x%x)\n", pvend, pdid);
	}

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
}
476 
/* Union used to strip a const qualifier from the firmware image
 * pointer without a cast (mxge_pio_copy takes a non-const source). */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
482 
483 static int
484 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
485 {
486 
487 
488 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
489 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
490 			      be32toh(hdr->mcp_type));
491 		return EIO;
492 	}
493 
494 	/* save firmware version for sysctl */
495 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
496 	if (mxge_verbose)
497 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
498 
499 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
500 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
501 
502 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
503 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
504 		device_printf(sc->dev, "Found firmware version %s\n",
505 			      sc->fw_version);
506 		device_printf(sc->dev, "Driver needs %d.%d\n",
507 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
508 		return EINVAL;
509 	}
510 	return 0;
511 
512 }
513 
/*
 * Fetch the firmware image named sc->fw_name via firmware(9),
 * validate its header, and PIO-copy it into NIC SRAM at
 * MXGE_FW_OFFSET.  On entry *limit holds the available SRAM size;
 * on success it is updated to the actual image size.  Returns 0 or
 * an errno; the firmware reference is always released.
 */
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	/* image must fit in SRAM and be big enough to hold the header ptr */
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	/* NOTE(review): htobe32 used where be32toh is meant; the two are
	   the same byte-swap on any fixed-endian host, so behavior is ok */
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM in 256-byte chunks,
	   reading back one byte between chunks to flush posted writes. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
574 
575 /*
576  * Enable or disable periodic RDMAs from the host to make certain
577  * chipsets resend dropped PCIe messages
578  */
579 
/*
 * Enable or disable the firmware's periodic dummy RDMA reads (used to
 * make certain chipsets resend dropped PCIe messages).  Builds an
 * 8-byte-aligned command block, PIO-writes it to the boot RDMA
 * mailbox, and polls the DMA'd confirmation word (firmware writes
 * 0xffffffff there) for up to ~21ms.  Failure is only logged.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align buf on an 8-byte boundary within buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll the confirmation word for up to 20 more ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
631 
632 static int
633 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
634 {
635 	mcp_cmd_t *buf;
636 	char buf_bytes[sizeof(*buf) + 8];
637 	volatile mcp_cmd_response_t *response = sc->cmd;
638 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
639 	uint32_t dma_low, dma_high;
640 	int sleep_total = 0;
641 
642 	/* ensure buf is aligned to 8 bytes */
643 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
644 
645 	buf->data0 = htobe32(data->data0);
646 	buf->data1 = htobe32(data->data1);
647 	buf->data2 = htobe32(data->data2);
648 	buf->cmd = htobe32(cmd);
649 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
650 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
651 
652 	buf->response_addr.low = htobe32(dma_low);
653 	buf->response_addr.high = htobe32(dma_high);
654 	mtx_lock(&sc->cmd_mtx);
655 	response->result = 0xffffffff;
656 	mb();
657 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
658 
659 	/* wait up to 20ms */
660 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
661 		bus_dmamap_sync(sc->cmd_dma.dmat,
662 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
663 		mb();
664 		if (response->result != 0xffffffff) {
665 			if (response->result == 0) {
666 				data->data0 = be32toh(response->data);
667 				mtx_unlock(&sc->cmd_mtx);
668 				return 0;
669 			} else {
670 				device_printf(sc->dev,
671 					      "mxge: command %d "
672 					      "failed, result = %d\n",
673 					      cmd, be32toh(response->result));
674 				mtx_unlock(&sc->cmd_mtx);
675 				return ENXIO;
676 			}
677 		}
678 		DELAY(1000);
679 	}
680 	mtx_unlock(&sc->cmd_mtx);
681 	device_printf(sc->dev, "mxge: command %d timed out"
682 		      "result = %d\n",
683 		      cmd, be32toh(response->result));
684 	return EAGAIN;
685 }
686 
/*
 * Validate the firmware already running on the NIC so it can be
 * adopted instead of reloading: locate its header via the SRAM
 * pointer, copy the header to host memory, and run the usual
 * validation.  Also flags the known 1.4.4–1.4.11 rx-filter bug so
 * the multicast code can work around it.  Returns 0 or an errno.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	/* NOTE(review): htobe32 where be32toh is meant — identical
	   byte-swap on a fixed-endian host, so this is harmless */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
735 
736 
/*
 * Load firmware onto the NIC: try the image selected in sc->fw_name;
 * if that fails, fall back to adopting the firmware already running
 * (forcing the conservative 2KB tx boundary).  After a successful
 * download, hand off to the bootstrap MCP and poll the DMA'd
 * confirmation word for the firmware's "alive" marker (0xffffffff).
 * Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align buf on an 8-byte boundary within buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		/* adopted firmware's alignment is unknown: be conservative */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll for up to ~200ms for the firmware's alive marker */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
819 
820 static int
821 mxge_update_mac_address(mxge_softc_t *sc)
822 {
823 	mxge_cmd_t cmd;
824 	uint8_t *addr = sc->mac_addr;
825 	int status;
826 
827 
828 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
829 		     | (addr[2] << 8) | addr[3]);
830 
831 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
832 
833 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
834 	return status;
835 }
836 
837 static int
838 mxge_change_pause(mxge_softc_t *sc, int pause)
839 {
840 	mxge_cmd_t cmd;
841 	int status;
842 
843 	if (pause)
844 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
845 				       &cmd);
846 	else
847 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
848 				       &cmd);
849 
850 	if (status) {
851 		device_printf(sc->dev, "Failed to set flow control mode\n");
852 		return ENXIO;
853 	}
854 	sc->pause = pause;
855 	return 0;
856 }
857 
858 static void
859 mxge_change_promisc(mxge_softc_t *sc, int promisc)
860 {
861 	mxge_cmd_t cmd;
862 	int status;
863 
864 	if (promisc)
865 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
866 				       &cmd);
867 	else
868 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
869 				       &cmd);
870 
871 	if (status) {
872 		device_printf(sc->dev, "Failed to set promisc mode\n");
873 	}
874 }
875 
/*
 * Reprogram the firmware's multicast filter from the interface's
 * multicast list.  Strategy: switch to ALLMULTI while rebuilding,
 * flush all filters, join each AF_LINK group address, then re-enable
 * filtering.  Left in ALLMULTI when the firmware lacks support, when
 * the adopted-firmware rx-filter bug is present, when the interface
 * requests IFF_ALLMULTI, or when any command fails mid-update.
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	/* buggy adopted firmware must stay in ALLMULTI (see
	   mxge_adopt_running_firmware) */
	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link-level address across data0/data1 */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
943 
944 
945 static int
946 mxge_reset(mxge_softc_t *sc)
947 {
948 
949 	mxge_cmd_t cmd;
950 	size_t bytes;
951 	int status;
952 
953 	/* try to send a reset command to the card to see if it
954 	   is alive */
955 	memset(&cmd, 0, sizeof (cmd));
956 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
957 	if (status != 0) {
958 		device_printf(sc->dev, "failed reset\n");
959 		return ENXIO;
960 	}
961 
962 	mxge_dummy_rdma(sc, 1);
963 
964 	/* Now exchange information about interrupts  */
965 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
966 	memset(sc->rx_done.entry, 0, bytes);
967 	cmd.data0 = (uint32_t)bytes;
968 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
969 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
970 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
971 	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
972 
973 	status |= mxge_send_cmd(sc,
974 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
975 
976 
977 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
978 
979 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
980 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
981 
982 
983 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
984 				&cmd);
985 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
986 	if (status != 0) {
987 		device_printf(sc->dev, "failed set interrupt parameters\n");
988 		return status;
989 	}
990 
991 
992 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
993 
994 
995 	/* run a DMA benchmark */
996 	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
997 
998 	/* Read DMA */
999 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1000 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1001 	cmd.data2 = sc->tx.boundary * 0x10000;
1002 
1003 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1004 	if (status != 0)
1005 		device_printf(sc->dev, "read dma benchmark failed\n");
1006 	else
1007 		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1008 			(cmd.data0 & 0xffff);
1009 
1010 	/* Write DMA */
1011 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1012 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1013 	cmd.data2 = sc->tx.boundary * 0x1;
1014 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1015 	if (status != 0)
1016 		device_printf(sc->dev, "write dma benchmark failed\n");
1017 	else
1018 		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1019 			(cmd.data0 & 0xffff);
1020 	/* Read/Write DMA */
1021 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1022 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1023 	cmd.data2 = sc->tx.boundary * 0x10001;
1024 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1025 	if (status != 0)
1026 		device_printf(sc->dev, "read/write dma benchmark failed\n");
1027 	else
1028 		sc->read_write_dma =
1029 			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
1030 			(cmd.data0 & 0xffff);
1031 
1032 	/* reset mcp/driver shared state back to 0 */
1033 	bzero(sc->rx_done.entry, bytes);
1034 	sc->rx_done.idx = 0;
1035 	sc->rx_done.cnt = 0;
1036 	sc->tx.req = 0;
1037 	sc->tx.done = 0;
1038 	sc->tx.pkt_done = 0;
1039 	sc->tx.wake = 0;
1040 	sc->tx.stall = 0;
1041 	sc->rx_big.cnt = 0;
1042 	sc->rx_small.cnt = 0;
1043 	sc->rdma_tags_available = 15;
1044 	sc->fw_stats->valid = 0;
1045 	sc->fw_stats->send_done_count = 0;
1046 	status = mxge_update_mac_address(sc);
1047 	mxge_change_promisc(sc, 0);
1048 	mxge_change_pause(sc, sc->pause);
1049 	mxge_set_multicast_list(sc);
1050 	return status;
1051 }
1052 
1053 static int
1054 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1055 {
1056         mxge_softc_t *sc;
1057         unsigned int intr_coal_delay;
1058         int err;
1059 
1060         sc = arg1;
1061         intr_coal_delay = sc->intr_coal_delay;
1062         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1063         if (err != 0) {
1064                 return err;
1065         }
1066         if (intr_coal_delay == sc->intr_coal_delay)
1067                 return 0;
1068 
1069         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1070                 return EINVAL;
1071 
1072 	mtx_lock(&sc->driver_mtx);
1073 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1074 	sc->intr_coal_delay = intr_coal_delay;
1075 
1076 	mtx_unlock(&sc->driver_mtx);
1077         return err;
1078 }
1079 
1080 static int
1081 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1082 {
1083         mxge_softc_t *sc;
1084         unsigned int enabled;
1085         int err;
1086 
1087         sc = arg1;
1088         enabled = sc->pause;
1089         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1090         if (err != 0) {
1091                 return err;
1092         }
1093         if (enabled == sc->pause)
1094                 return 0;
1095 
1096 	mtx_lock(&sc->driver_mtx);
1097 	err = mxge_change_pause(sc, enabled);
1098 	mtx_unlock(&sc->driver_mtx);
1099         return err;
1100 }
1101 
1102 static int
1103 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1104 {
1105         int err;
1106 
1107         if (arg1 == NULL)
1108                 return EFAULT;
1109         arg2 = be32toh(*(int *)arg1);
1110         arg1 = NULL;
1111         err = sysctl_handle_int(oidp, arg1, arg2, req);
1112 
1113         return err;
1114 }
1115 
1116 static void
1117 mxge_add_sysctls(mxge_softc_t *sc)
1118 {
1119 	struct sysctl_ctx_list *ctx;
1120 	struct sysctl_oid_list *children;
1121 	mcp_irq_data_t *fw;
1122 
1123 	ctx = device_get_sysctl_ctx(sc->dev);
1124 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1125 	fw = sc->fw_stats;
1126 
1127 	/* random information */
1128 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1129 		       "firmware_version",
1130 		       CTLFLAG_RD, &sc->fw_version,
1131 		       0, "firmware version");
1132 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1133 		       "serial_number",
1134 		       CTLFLAG_RD, &sc->serial_number_string,
1135 		       0, "serial number");
1136 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1137 		       "product_code",
1138 		       CTLFLAG_RD, &sc->product_code_string,
1139 		       0, "product_code");
1140 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1141 		       "pcie_link_width",
1142 		       CTLFLAG_RD, &sc->link_width,
1143 		       0, "tx_boundary");
1144 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1145 		       "tx_boundary",
1146 		       CTLFLAG_RD, &sc->tx.boundary,
1147 		       0, "tx_boundary");
1148 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1149 		       "write_combine",
1150 		       CTLFLAG_RD, &sc->wc,
1151 		       0, "write combining PIO?");
1152 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1153 		       "read_dma_MBs",
1154 		       CTLFLAG_RD, &sc->read_dma,
1155 		       0, "DMA Read speed in MB/s");
1156 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1157 		       "write_dma_MBs",
1158 		       CTLFLAG_RD, &sc->write_dma,
1159 		       0, "DMA Write speed in MB/s");
1160 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1161 		       "read_write_dma_MBs",
1162 		       CTLFLAG_RD, &sc->read_write_dma,
1163 		       0, "DMA concurrent Read/Write speed in MB/s");
1164 
1165 
1166 	/* performance related tunables */
1167 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1168 			"intr_coal_delay",
1169 			CTLTYPE_INT|CTLFLAG_RW, sc,
1170 			0, mxge_change_intr_coal,
1171 			"I", "interrupt coalescing delay in usecs");
1172 
1173 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1174 			"flow_control_enabled",
1175 			CTLTYPE_INT|CTLFLAG_RW, sc,
1176 			0, mxge_change_flow_control,
1177 			"I", "interrupt coalescing delay in usecs");
1178 
1179 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1180 		       "deassert_wait",
1181 		       CTLFLAG_RW, &mxge_deassert_wait,
1182 		       0, "Wait for IRQ line to go low in ihandler");
1183 
1184 	/* stats block from firmware is in network byte order.
1185 	   Need to swap it */
1186 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1187 			"link_up",
1188 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1189 			0, mxge_handle_be32,
1190 			"I", "link up");
1191 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1192 			"rdma_tags_available",
1193 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1194 			0, mxge_handle_be32,
1195 			"I", "rdma_tags_available");
1196 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1197 			"dropped_link_overflow",
1198 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1199 			0, mxge_handle_be32,
1200 			"I", "dropped_link_overflow");
1201 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1202 			"dropped_link_error_or_filtered",
1203 			CTLTYPE_INT|CTLFLAG_RD,
1204 			&fw->dropped_link_error_or_filtered,
1205 			0, mxge_handle_be32,
1206 			"I", "dropped_link_error_or_filtered");
1207 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1208 			"dropped_multicast_filtered",
1209 			CTLTYPE_INT|CTLFLAG_RD,
1210 			&fw->dropped_multicast_filtered,
1211 			0, mxge_handle_be32,
1212 			"I", "dropped_multicast_filtered");
1213 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1214 			"dropped_runt",
1215 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1216 			0, mxge_handle_be32,
1217 			"I", "dropped_runt");
1218 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1219 			"dropped_overrun",
1220 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1221 			0, mxge_handle_be32,
1222 			"I", "dropped_overrun");
1223 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1224 			"dropped_no_small_buffer",
1225 			CTLTYPE_INT|CTLFLAG_RD,
1226 			&fw->dropped_no_small_buffer,
1227 			0, mxge_handle_be32,
1228 			"I", "dropped_no_small_buffer");
1229 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1230 			"dropped_no_big_buffer",
1231 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1232 			0, mxge_handle_be32,
1233 			"I", "dropped_no_big_buffer");
1234 
1235 	/* host counters exported for debugging */
1236 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1237 		       "rx_small_cnt",
1238 		       CTLFLAG_RD, &sc->rx_small.cnt,
1239 		       0, "rx_small_cnt");
1240 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1241 		       "rx_big_cnt",
1242 		       CTLFLAG_RD, &sc->rx_big.cnt,
1243 		       0, "rx_small_cnt");
1244 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1245 		       "tx_req",
1246 		       CTLFLAG_RD, &sc->tx.req,
1247 		       0, "tx_req");
1248 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1249 		       "tx_done",
1250 		       CTLFLAG_RD, &sc->tx.done,
1251 		       0, "tx_done");
1252 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1253 		       "tx_pkt_done",
1254 		       CTLFLAG_RD, &sc->tx.pkt_done,
1255 		       0, "tx_done");
1256 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1257 		       "tx_stall",
1258 		       CTLFLAG_RD, &sc->tx.stall,
1259 		       0, "tx_stall");
1260 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1261 		       "tx_wake",
1262 		       CTLFLAG_RD, &sc->tx.wake,
1263 		       0, "tx_wake");
1264 
1265 	/* verbose printing? */
1266 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1267 		       "verbose",
1268 		       CTLFLAG_RW, &mxge_verbose,
1269 		       0, "verbose printing");
1270 
1271 }
1272 
1273 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1274    backwards one at a time and handle ring wraps */
1275 
1276 static inline void
1277 mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1278 			    mcp_kreq_ether_send_t *src, int cnt)
1279 {
1280         int idx, starting_slot;
1281         starting_slot = tx->req;
1282         while (cnt > 1) {
1283                 cnt--;
1284                 idx = (starting_slot + cnt) & tx->mask;
1285                 mxge_pio_copy(&tx->lanai[idx],
1286 			      &src[cnt], sizeof(*src));
1287                 mb();
1288         }
1289 }
1290 
1291 /*
1292  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1293  * at most 32 bytes at a time, so as to avoid involving the software
1294  * pio handler in the nic.   We re-write the first segment's flags
1295  * to mark them valid only after writing the entire chain
1296  */
1297 
static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Withhold the first descriptor's flags so the NIC treats the
	   chain as invalid until the very last write below. */
	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: burst descriptors forward two at a
		   time (32 bytes per PIO copy) */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request (or the odd leftover
                   descriptor when cnt is odd) */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
1346 
static inline void
mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
    /* Submit send descriptors through the write-combining FIFO in
       64-byte (4-descriptor) bursts.  The ring index is advanced
       before the PIO writes begin. */
    tx->req += cnt;
    mb();
    while (cnt >= 4) {
	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
	    mb();
	    src += 4;
	    cnt -= 4;
    }
    if (cnt > 0) {
	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
	       needs to be so that we don't overrun it */
	    /* NOTE(review): the tail burst targets an offset derived
	       from cnt — presumably this tells the firmware how many
	       of the 4 descriptors are real; confirm against the
	       MXGEFW_ETH_SEND_OFFSET definition. */
	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
	    mb();
    }
}
1365 
/*
 * Build and submit the send-descriptor chain for a TSO frame.  The
 * busdma segments (already loaded into tx->seg_list by mxge_encap())
 * are split at mss boundaries and the per-run RDMA counts are patched
 * in retroactively.  Drops the frame if it needs more than
 * MXGE_MAX_SEND_DESC descriptors.
 */
static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;	/* rate-limits the overflow printf below */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < sizeof (*eh)
			    + sizeof (*ip))) {
		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
			   sc->scratch);
		eh = (struct ether_header *)sc->scratch;
	} else {
		eh = mtod(m, struct ether_header *);
	}
	ip = (struct ip *) (eh + 1);
	/* second pass: now that ip_hl is known, make sure the TCP
	   header is contiguous too */
	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		eh = (struct ether_header *) sc->scratch;
		ip = (struct ip *) (eh + 1);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	/* start cum_len at minus the full header length; it crosses
	   zero exactly where the TSO payload begins */
	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			/* retroactively patch the RDMA count of the
			   request that began the current run */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				/* chop: this piece crosses an mss
				   boundary, forcing a segment cut */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				/* branch-free: reset the run count on a
				   cut, bump it when a chop is not also
				   the start of a new segment */
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				/* truncate this piece at the
				   header/payload boundary */
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;	/* patched retroactively above */
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* the checksum start only applies to the
			   descriptor that contains it */
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* patch the RDMA count of the final run of requests */
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards marking descriptors TSO_LAST until we cross
	   the last segment cut (or hit the first descriptor) */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* flag the slot holding the mbuf so mxge_tx_done can count it */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
		once = 1;
	}
	return;

}
1529 
/*
 * Enqueue one outgoing frame on the NIC's send ring.  The mbuf is
 * DMA-mapped (defragmenting once if it has too many segments),
 * converted into a chain of send descriptors with optional checksum
 * offload, padded to the 60-byte ethernet minimum, and submitted via
 * PIO.  TSO frames are handed off to mxge_encap_tso().  On failure
 * the mbuf is freed and if_oerrors incremented.
 */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < sizeof (*eh)
				    + sizeof (*ip))) {
			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
				   sc->scratch);
			eh = (struct ether_header *)sc->scratch;
		} else {
			eh = mtod(m, struct ether_header *);
		}
		ip = (struct ip *) (eh + 1);
		/* checksum computation starts just past the IP header */
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		/* the stack's partial sum sits csum_data bytes beyond
		   the checksum start */
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the checksum start only applies to the descriptor
		   that contains it; later ones get 0 */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		/* pre-clear the next slot's flags so |= below is safe */
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		/* point the extra descriptor at the shared zero page */
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the slot holding the mbuf so mxge_tx_done can count it */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1681 
1682 
1683 
1684 
1685 static inline void
1686 mxge_start_locked(mxge_softc_t *sc)
1687 {
1688 	struct mbuf *m;
1689 	struct ifnet *ifp;
1690 
1691 	ifp = sc->ifp;
1692 	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1693 	       > MXGE_MAX_SEND_DESC) {
1694 
1695 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1696 		if (m == NULL) {
1697 			return;
1698 		}
1699 		/* let BPF see it */
1700 		BPF_MTAP(ifp, m);
1701 
1702 		/* give it to the nic */
1703 		mxge_encap(sc, m);
1704 	}
1705 	/* ran out of transmit slots */
1706 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1707 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1708 		sc->tx.stall++;
1709 	}
1710 }
1711 
1712 static void
1713 mxge_start(struct ifnet *ifp)
1714 {
1715 	mxge_softc_t *sc = ifp->if_softc;
1716 
1717 
1718 	mtx_lock(&sc->tx_mtx);
1719 	mxge_start_locked(sc);
1720 	mtx_unlock(&sc->tx_mtx);
1721 }
1722 
1723 /*
1724  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1725  * at most 32 bytes at a time, so as to avoid involving the software
1726  * pio handler in the nic.   We re-write the first segment's low
1727  * DMA address to mark it valid only after we write the entire chunk
1728  * in a burst
1729  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* Stash the first descriptor's low DMA address and poison it so
	   the NIC ignores the group until the final write below. */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* copy the 8 descriptors in two 32-byte bursts */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address; writing it to the NIC marks the
	   whole group valid */
	src->addr_low = low;
	dst->addr_low = low;
	mb();
}
1746 
1747 static int
1748 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1749 {
1750 	bus_dma_segment_t seg;
1751 	struct mbuf *m;
1752 	mxge_rx_buf_t *rx = &sc->rx_small;
1753 	int cnt, err;
1754 
1755 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1756 	if (m == NULL) {
1757 		rx->alloc_fail++;
1758 		err = ENOBUFS;
1759 		goto done;
1760 	}
1761 	m->m_len = MHLEN;
1762 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1763 				      &seg, &cnt, BUS_DMA_NOWAIT);
1764 	if (err != 0) {
1765 		m_free(m);
1766 		goto done;
1767 	}
1768 	rx->info[idx].m = m;
1769 	rx->shadow[idx].addr_low =
1770 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1771 	rx->shadow[idx].addr_high =
1772 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1773 
1774 done:
1775 	if ((idx & 7) == 7) {
1776 		if (rx->wc_fifo == NULL)
1777 			mxge_submit_8rx(&rx->lanai[idx - 7],
1778 					&rx->shadow[idx - 7]);
1779 		else {
1780 			mb();
1781 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1782 		}
1783         }
1784 	return err;
1785 }
1786 
1787 static int
1788 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1789 {
1790 	bus_dma_segment_t seg;
1791 	struct mbuf *m;
1792 	mxge_rx_buf_t *rx = &sc->rx_big;
1793 	int cnt, err;
1794 
1795 	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1796 	if (m == NULL) {
1797 		rx->alloc_fail++;
1798 		err = ENOBUFS;
1799 		goto done;
1800 	}
1801 	m->m_len = sc->big_bytes;
1802 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1803 				      &seg, &cnt, BUS_DMA_NOWAIT);
1804 	if (err != 0) {
1805 		m_free(m);
1806 		goto done;
1807 	}
1808 	rx->info[idx].m = m;
1809 	rx->shadow[idx].addr_low =
1810 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1811 	rx->shadow[idx].addr_high =
1812 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1813 
1814 done:
1815 	if ((idx & 7) == 7) {
1816 		if (rx->wc_fifo == NULL)
1817 			mxge_submit_8rx(&rx->lanai[idx - 7],
1818 					&rx->shadow[idx - 7]);
1819 		else {
1820 			mb();
1821 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1822 		}
1823         }
1824 	return err;
1825 }
1826 
1827 static inline void
1828 mxge_rx_csum(struct mbuf *m, int csum)
1829 {
1830 	struct ether_header *eh;
1831 	struct ip *ip;
1832 
1833 	eh = mtod(m, struct ether_header *);
1834 
1835 	/* only deal with IPv4 TCP & UDP for now */
1836 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
1837 		return;
1838 	ip = (struct ip *)(eh + 1);
1839 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
1840 			    ip->ip_p != IPPROTO_UDP))
1841 		return;
1842 
1843 	/*
1844 	 *  Myri10GE hardware checksums are not valid if the sender
1845 	 *  padded the frame with non-zero padding.  This is because
1846 	 *  the firmware just does a simple 16-bit 1s complement
1847 	 *  checksum across the entire frame, excluding the first 14
1848 	 *  bytes.  It is easiest to simply to assume the worst, and
1849 	 *  only apply hardware checksums to non-padded frames.  This
1850 	 *  is what nearly every other OS does by default.
1851 	 */
1852 
1853 	if (__predict_true(m->m_pkthdr.len ==
1854 			   (ntohs(ip->ip_len) + ETHER_HDR_LEN))) {
1855 		m->m_pkthdr.csum_data = csum;
1856 		m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1857 	}
1858 }
1859 
/*
 * Receive one frame that arrived in the "big" (jumbo) ring.  A frame
 * may span several big buffers; they are collected into an mbuf chain
 * and passed up the stack.  Each consumed ring slot is refilled as it
 * is consumed; if any refill fails the frame is dropped and the
 * remaining slots are recycled in place.
 */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wunitialized */
	struct mbuf *m_prev = 0;	/* -Wunitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st bytes so that
			 * packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
        if (m_head) {
		/* the slots consumed so far were chained into m_head */
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
	/* walk the frame's remaining slots, freeing each old mbuf
	   whose slot could be refilled (failed slots keep theirs) */
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}
1954 
1955 static inline void
1956 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
1957 {
1958 	struct ifnet *ifp;
1959 	struct mbuf *m;
1960 	mxge_rx_buf_t *rx;
1961 	bus_dmamap_t old_map;
1962 	int idx;
1963 
1964 	ifp = sc->ifp;
1965 	rx = &sc->rx_small;
1966 	idx = rx->cnt & rx->mask;
1967 	rx->cnt++;
1968 	/* save a pointer to the received mbuf */
1969 	m = rx->info[idx].m;
1970 	/* try to replace the received mbuf */
1971 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
1972 		/* drop the frame -- the old mbuf is re-cycled */
1973 		ifp->if_ierrors++;
1974 		return;
1975 	}
1976 
1977 	/* unmap the received buffer */
1978 	old_map = rx->info[idx].map;
1979 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1980 	bus_dmamap_unload(rx->dmat, old_map);
1981 
1982 	/* swap the bus_dmamap_t's */
1983 	rx->info[idx].map = rx->extra_map;
1984 	rx->extra_map = old_map;
1985 
1986 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
1987 	 * aligned */
1988 	m->m_data += MXGEFW_PAD;
1989 
1990 	m->m_pkthdr.rcvif = ifp;
1991 	m->m_len = m->m_pkthdr.len = len;
1992 	ifp->if_ipackets++;
1993 	/* if the checksum is valid, mark it in the mbuf header */
1994 	if (sc->csum_flag)
1995 		mxge_rx_csum(m, csum);
1996 
1997 	/* pass the frame up the stack */
1998 	(*ifp->if_input)(ifp, m);
1999 }
2000 
2001 static inline void
2002 mxge_clean_rx_done(mxge_softc_t *sc)
2003 {
2004 	mxge_rx_done_t *rx_done = &sc->rx_done;
2005 	int limit = 0;
2006 	uint16_t length;
2007 	uint16_t checksum;
2008 
2009 
2010 	while (rx_done->entry[rx_done->idx].length != 0) {
2011 		length = ntohs(rx_done->entry[rx_done->idx].length);
2012 		rx_done->entry[rx_done->idx].length = 0;
2013 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
2014 		if (length <= (MHLEN - MXGEFW_PAD))
2015 			mxge_rx_done_small(sc, length, checksum);
2016 		else
2017 			mxge_rx_done_big(sc, length, checksum);
2018 		rx_done->cnt++;
2019 		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
2020 
2021 		/* limit potential for livelock */
2022 		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
2023 			break;
2024 
2025 	}
2026 }
2027 
2028 
2029 static inline void
2030 mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
2031 {
2032 	struct ifnet *ifp;
2033 	mxge_tx_buf_t *tx;
2034 	struct mbuf *m;
2035 	bus_dmamap_t map;
2036 	int idx, limit;
2037 
2038 	limit = 0;
2039 	tx = &sc->tx;
2040 	ifp = sc->ifp;
2041 	while (tx->pkt_done != mcp_idx) {
2042 		idx = tx->done & tx->mask;
2043 		tx->done++;
2044 		m = tx->info[idx].m;
2045 		/* mbuf and DMA map only attached to the first
2046 		   segment per-mbuf */
2047 		if (m != NULL) {
2048 			ifp->if_opackets++;
2049 			tx->info[idx].m = NULL;
2050 			map = tx->info[idx].map;
2051 			bus_dmamap_unload(tx->dmat, map);
2052 			m_freem(m);
2053 		}
2054 		if (tx->info[idx].flag) {
2055 			tx->info[idx].flag = 0;
2056 			tx->pkt_done++;
2057 		}
2058 		/* limit potential for livelock by only handling
2059 		   2 full tx rings per call */
2060 		if (__predict_false(++limit >  2 * tx->mask))
2061 			break;
2062 	}
2063 
2064 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2065            its OK to send packets */
2066 
2067 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2068 	    tx->req - tx->done < (tx->mask + 1)/4) {
2069 		mtx_lock(&sc->tx_mtx);
2070 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2071 		sc->tx.wake++;
2072 		mxge_start_locked(sc);
2073 		mtx_unlock(&sc->tx_mtx);
2074 	}
2075 }
2076 
/*
 * Interrupt handler (MSI or legacy INTx).  The firmware DMAs an
 * mcp_irq_data_t block (sc->fw_stats) to the host; stats->valid
 * becoming non-zero indicates the DMA completed and there is work
 * to do.  Reaps tx completions and rx frames, processes firmware
 * status updates (link state, RDMA tags), then re-arms the
 * interrupt by writing the irq claim registers.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (!sc->msi_enabled) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* firmware pushed a fresh stats block: look for link-state
	   changes and RDMA tag exhaustion */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2144 
/*
 * if_init stub: the interface is actually brought up via
 * mxge_open(), driven from the SIOCSIFFLAGS path in mxge_ioctl(),
 * so there is nothing to do here.
 */
static void
mxge_init(void *arg)
{
}
2149 
2150 
2151 
2152 static void
2153 mxge_free_mbufs(mxge_softc_t *sc)
2154 {
2155 	int i;
2156 
2157 	for (i = 0; i <= sc->rx_big.mask; i++) {
2158 		if (sc->rx_big.info[i].m == NULL)
2159 			continue;
2160 		bus_dmamap_unload(sc->rx_big.dmat,
2161 				  sc->rx_big.info[i].map);
2162 		m_freem(sc->rx_big.info[i].m);
2163 		sc->rx_big.info[i].m = NULL;
2164 	}
2165 
2166 	for (i = 0; i <= sc->rx_small.mask; i++) {
2167 		if (sc->rx_small.info[i].m == NULL)
2168 			continue;
2169 		bus_dmamap_unload(sc->rx_small.dmat,
2170 				  sc->rx_small.info[i].map);
2171 		m_freem(sc->rx_small.info[i].m);
2172 		sc->rx_small.info[i].m = NULL;
2173 	}
2174 
2175 	for (i = 0; i <= sc->tx.mask; i++) {
2176 		sc->tx.info[i].flag = 0;
2177 		if (sc->tx.info[i].m == NULL)
2178 			continue;
2179 		bus_dmamap_unload(sc->tx.dmat,
2180 				  sc->tx.info[i].map);
2181 		m_freem(sc->tx.info[i].m);
2182 		sc->tx.info[i].m = NULL;
2183 	}
2184 }
2185 
2186 static void
2187 mxge_free_rings(mxge_softc_t *sc)
2188 {
2189 	int i;
2190 
2191 	if (sc->tx.req_bytes != NULL)
2192 		free(sc->tx.req_bytes, M_DEVBUF);
2193 	if (sc->tx.seg_list != NULL)
2194 		free(sc->tx.seg_list, M_DEVBUF);
2195 	if (sc->rx_small.shadow != NULL)
2196 		free(sc->rx_small.shadow, M_DEVBUF);
2197 	if (sc->rx_big.shadow != NULL)
2198 		free(sc->rx_big.shadow, M_DEVBUF);
2199 	if (sc->tx.info != NULL) {
2200 		if (sc->tx.dmat != NULL) {
2201 			for (i = 0; i <= sc->tx.mask; i++) {
2202 				bus_dmamap_destroy(sc->tx.dmat,
2203 						   sc->tx.info[i].map);
2204 			}
2205 			bus_dma_tag_destroy(sc->tx.dmat);
2206 		}
2207 		free(sc->tx.info, M_DEVBUF);
2208 	}
2209 	if (sc->rx_small.info != NULL) {
2210 		if (sc->rx_small.dmat != NULL) {
2211 			for (i = 0; i <= sc->rx_small.mask; i++) {
2212 				bus_dmamap_destroy(sc->rx_small.dmat,
2213 						   sc->rx_small.info[i].map);
2214 			}
2215 			bus_dmamap_destroy(sc->rx_small.dmat,
2216 					   sc->rx_small.extra_map);
2217 			bus_dma_tag_destroy(sc->rx_small.dmat);
2218 		}
2219 		free(sc->rx_small.info, M_DEVBUF);
2220 	}
2221 	if (sc->rx_big.info != NULL) {
2222 		if (sc->rx_big.dmat != NULL) {
2223 			for (i = 0; i <= sc->rx_big.mask; i++) {
2224 				bus_dmamap_destroy(sc->rx_big.dmat,
2225 						   sc->rx_big.info[i].map);
2226 			}
2227 			bus_dmamap_destroy(sc->rx_big.dmat,
2228 					   sc->rx_big.extra_map);
2229 			bus_dma_tag_destroy(sc->rx_big.dmat);
2230 		}
2231 		free(sc->rx_big.info, M_DEVBUF);
2232 	}
2233 }
2234 
/*
 * Allocate all host-side ring state: the tx request copy block and
 * busdma segment list, the rx shadow rings, the per-slot host info
 * arrays, a busdma tag for each ring, and a dmamap for every ring
 * slot (plus one spare "extra" map per rx ring, used to swap maps
 * when a receive buffer is replaced).  On any failure, everything
 * allocated so far is torn down by mxge_free_rings().
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	/* entry counts are used as (count - 1) masks for cheap index
	   wrapping, so they are presumably powers of two -- the
	   firmware-supplied sizes are trusted here */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	sc->tx.mask = tx_ring_entries - 1;
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;

	err = ENOMEM;

	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_nothing;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC/2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 4096,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2418 
/*
 * Bring the interface up: reset the NIC, fetch the lanai ring
 * locations from the firmware, stock both receive rings with
 * mbufs, tell the firmware the MTU and buffer sizes, point it at
 * the stats DMA block, and finally start its ethernet engine.
 * Returns 0 on success or an errno; on failure after the rings
 * were stocked, the mbufs are released again.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err;
	bus_dmamap_t map;
	bus_addr_t bus;


	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}
	/* clear stale completions out of the interrupt queue */
	bzero(sc->rx_done.entry,
	      mxge_max_intr_slots * sizeof(*sc->rx_done.entry));

	/* pick the smallest cluster size that still holds a
	   max-sized frame */
	if (MCLBYTES >=
	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
		sc->big_bytes = MCLBYTES;
	else
		sc->big_bytes = MJUMPAGESIZE;


	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* use the write-combining fifos when WC mapping succeeded */
	if (sc->wc) {
		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
	} else {
		sc->tx.wc_fifo = 0;
		sc->rx_small.wc_fifo = 0;
		sc->rx_big.wc_fifo = 0;
	}


	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	for (i = 0; i <= sc->rx_big.mask; i++) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = sc->big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	/* fall back to the obsolete stats-DMA command for firmware
	   that does not implement STATS_DMA_V2 */
	if (err != 0) {
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
2554 
2555 static int
2556 mxge_close(mxge_softc_t *sc)
2557 {
2558 	mxge_cmd_t cmd;
2559 	int err, old_down_cnt;
2560 
2561 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2562 	old_down_cnt = sc->down_cnt;
2563 	mb();
2564 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2565 	if (err) {
2566 		device_printf(sc->dev, "Couldn't bring down link\n");
2567 	}
2568 	if (old_down_cnt == sc->down_cnt) {
2569 		/* wait for down irq */
2570 		DELAY(10 * sc->intr_coal_delay);
2571 	}
2572 	if (old_down_cnt == sc->down_cnt) {
2573 		device_printf(sc->dev, "never got down irq\n");
2574 	}
2575 
2576 	mxge_free_mbufs(sc);
2577 
2578 	return 0;
2579 }
2580 
/*
 * Program the device's PCI config space: record the negotiated
 * PCIe link width, raise the max read request size, and enable bus
 * mastering plus memory-space decoding.  Also re-run by the
 * watchdog after a NIC reboot (once config-space restore exists).
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* cap + 0x12 is the PCIe Link Status register;
		   bits 9:4 hold the negotiated link width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* cap + 0x8 is the PCIe Device Control register;
		   bits 14:12 encode max read request (5 == 4096B) */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
2604 
/*
 * Read the NIC's reboot status through the Myricom vendor-specific
 * PCI capability, which provides a config-space window for reading
 * a NIC register even after the NIC has rebooted.  Returns
 * (uint32_t)-1 if the vendor capability cannot be found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	return (pci_read_config(dev, vs + 0x14, 4));
}
2623 
/*
 * Attempt recovery from an apparent transmit hang (called by
 * mxge_watchdog()).  Figures out whether the NIC rebooted -- in
 * which case PCI config space was wiped and busmastering is off --
 * or is merely stuck, prints diagnostics, and tries to recover by
 * closing and re-opening the interface.  If the NIC is dead, the
 * watchdog callout is stopped so the console is not spammed.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	int err;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			goto abort;
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */

		/* XXXX waiting for pci_cfg_restore() to be exported */
		goto abort; /* just abort for now */

		/* NOTREACHED until the restore above is implemented */
		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);
	} else {
		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
			      sc->tx.req, sc->tx.done);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      sc->tx.pkt_done,
			      be32toh(sc->fw_stats->send_done_count));
	}

	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}

abort:
	/*
	 * stop the watchdog if the nic is dead, to avoid spamming the
	 * console
	 */
	if (err != 0) {
		callout_stop(&sc->co_hdl);
	}
}
2691 
2692 static void
2693 mxge_watchdog(mxge_softc_t *sc)
2694 {
2695 	mxge_tx_buf_t *tx = &sc->tx;
2696 
2697 	/* see if we have outstanding transmits, which
2698 	   have been pending for more than mxge_ticks */
2699 	if (tx->req != tx->done &&
2700 	    tx->watchdog_req != tx->watchdog_done &&
2701 	    tx->done == tx->watchdog_done)
2702 		mxge_watchdog_reset(sc);
2703 
2704 	tx->watchdog_req = tx->req;
2705 	tx->watchdog_done = tx->done;
2706 }
2707 
/*
 * Periodic housekeeping callout, re-armed every mxge_ticks to run
 * the transmit watchdog.  The callout is associated with the
 * driver mutex (callout_init_mtx() in attach), so it can race
 * callout_reset()/callout_stop() from the ioctl path -- hence the
 * pending/active check before doing any work.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;


	/* Synchronize with possible callout reset/stop. */
	if (callout_pending(&sc->co_hdl) ||
	    !callout_active(&sc->co_hdl)) {
		mtx_unlock(&sc->driver_mtx);
		return;
	}

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	mxge_watchdog(sc);
}
2724 
/*
 * Media cannot be selected manually: only IFM_ETHER|IFM_AUTO is
 * registered with ifmedia (see attach), so reject any change.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	return EINVAL;
}
2730 
/*
 * Set a new MTU.  Rejects values whose on-wire size
 * (mtu + ETHER_HDR_LEN) lies outside [60, MXGE_MAX_ETHER_MTU].
 * If the interface is running, it is closed and re-opened so the
 * firmware picks up the new buffer sizing; should the re-open
 * fail, the previous MTU is restored and the interface restarted.
 */
static int
mxge_change_mtu(mxge_softc_t *sc, int mtu)
{
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;
	int err = 0;


	real_mtu = mtu + ETHER_HDR_LEN;
	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
	    real_mtu < 60)
		return EINVAL;
	mtx_lock(&sc->driver_mtx);
	old_mtu = ifp->if_mtu;
	ifp->if_mtu = mtu;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		callout_stop(&sc->co_hdl);
		mxge_close(sc);
		err = mxge_open(sc);
		if (err != 0) {
			/* re-open failed: revert to the old MTU */
			ifp->if_mtu = old_mtu;
			mxge_close(sc);
			(void) mxge_open(sc);
		}
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
	mtx_unlock(&sc->driver_mtx);
	return err;
}
2760 
2761 static void
2762 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2763 {
2764 	mxge_softc_t *sc = ifp->if_softc;
2765 
2766 
2767 	if (sc == NULL)
2768 		return;
2769 	ifmr->ifm_status = IFM_AVALID;
2770 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2771 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2772 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2773 }
2774 
2775 static int
2776 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2777 {
2778 	mxge_softc_t *sc = ifp->if_softc;
2779 	struct ifreq *ifr = (struct ifreq *)data;
2780 	int err, mask;
2781 
2782 	err = 0;
2783 	switch (command) {
2784 	case SIOCSIFADDR:
2785 	case SIOCGIFADDR:
2786 		err = ether_ioctl(ifp, command, data);
2787 		break;
2788 
2789 	case SIOCSIFMTU:
2790 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2791 		break;
2792 
2793 	case SIOCSIFFLAGS:
2794 		mtx_lock(&sc->driver_mtx);
2795 		if (ifp->if_flags & IFF_UP) {
2796 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
2797 				err = mxge_open(sc);
2798 				callout_reset(&sc->co_hdl, mxge_ticks,
2799 					      mxge_tick, sc);
2800 			} else {
2801 				/* take care of promis can allmulti
2802 				   flag chages */
2803 				mxge_change_promisc(sc,
2804 						    ifp->if_flags & IFF_PROMISC);
2805 				mxge_set_multicast_list(sc);
2806 			}
2807 		} else {
2808 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2809 				mxge_close(sc);
2810 				callout_stop(&sc->co_hdl);
2811 			}
2812 		}
2813 		mtx_unlock(&sc->driver_mtx);
2814 		break;
2815 
2816 	case SIOCADDMULTI:
2817 	case SIOCDELMULTI:
2818 		mtx_lock(&sc->driver_mtx);
2819 		mxge_set_multicast_list(sc);
2820 		mtx_unlock(&sc->driver_mtx);
2821 		break;
2822 
2823 	case SIOCSIFCAP:
2824 		mtx_lock(&sc->driver_mtx);
2825 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2826 		if (mask & IFCAP_TXCSUM) {
2827 			if (IFCAP_TXCSUM & ifp->if_capenable) {
2828 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2829 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2830 						      | CSUM_TSO);
2831 			} else {
2832 				ifp->if_capenable |= IFCAP_TXCSUM;
2833 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2834 			}
2835 		} else if (mask & IFCAP_RXCSUM) {
2836 			if (IFCAP_RXCSUM & ifp->if_capenable) {
2837 				ifp->if_capenable &= ~IFCAP_RXCSUM;
2838 				sc->csum_flag = 0;
2839 			} else {
2840 				ifp->if_capenable |= IFCAP_RXCSUM;
2841 				sc->csum_flag = 1;
2842 			}
2843 		}
2844 		if (mask & IFCAP_TSO4) {
2845 			if (IFCAP_TSO4 & ifp->if_capenable) {
2846 				ifp->if_capenable &= ~IFCAP_TSO4;
2847 				ifp->if_hwassist &= ~CSUM_TSO;
2848 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2849 				ifp->if_capenable |= IFCAP_TSO4;
2850 				ifp->if_hwassist |= CSUM_TSO;
2851 			} else {
2852 				printf("mxge requires tx checksum offload"
2853 				       " be enabled to use TSO\n");
2854 				err = EINVAL;
2855 			}
2856 		}
2857 		mtx_unlock(&sc->driver_mtx);
2858 		break;
2859 
2860 	case SIOCGIFMEDIA:
2861 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2862 				    &sc->media, command);
2863                 break;
2864 
2865 	default:
2866 		err = ENOTTY;
2867         }
2868 	return err;
2869 }
2870 
2871 static void
2872 mxge_fetch_tunables(mxge_softc_t *sc)
2873 {
2874 
2875 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2876 			  &mxge_flow_control);
2877 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2878 			  &mxge_intr_coal_delay);
2879 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2880 			  &mxge_nvidia_ecrc_enable);
2881 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
2882 			  &mxge_force_firmware);
2883 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2884 			  &mxge_deassert_wait);
2885 	TUNABLE_INT_FETCH("hw.mxge.verbose",
2886 			  &mxge_verbose);
2887 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
2888 
2889 	if (bootverbose)
2890 		mxge_verbose = 1;
2891 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2892 		mxge_intr_coal_delay = 30;
2893 	if (mxge_ticks == 0)
2894 		mxge_ticks = hz;
2895 	sc->pause = mxge_flow_control;
2896 }
2897 
/*
 * Device attach: create the parent DMA tag and driver locks, map
 * the NIC SRAM BAR, parse the EEPROM strings (MAC address), enable
 * write combining, allocate all out-of-band DMA blocks and the
 * interrupt completion queue, set up MSI or INTx, load and reset
 * the firmware, allocate rings, hook the interrupt handler, and
 * finally attach to the network stack.  Errors unwind in reverse
 * order through the goto ladder at the bottom.
 */
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	size_t bytes;
	int count, rid, err;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* parent tag from which the per-ring DMA tags are derived */
	err = bus_dma_tag_create(NULL,			/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 4096,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_nothing;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
		 device_get_nameunit(dev));
	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	/* the watchdog callout runs under the driver mutex */
	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
			     sizeof (*sc->fw_stats), 64);
	if (err != 0)
		goto abort_with_zeropad_dma;
	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_fw_stats;

	/* allocate interrupt queues */
	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
	if (err != 0)
		goto abort_with_dmabench;
	sc->rx_done.entry = sc->rx_done.dma.addr;
	bzero(sc->rx_done.entry, bytes);

	/* Add our ithread  */
	count = pci_msi_count(dev);
	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
		rid = 1;
		sc->msi_enabled = 1;
	} else {
		rid = 0;
	}
	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
					 1, RF_SHAREABLE | RF_ACTIVE);
	if (sc->irq_res == NULL) {
		device_printf(dev, "could not alloc interrupt\n");
		goto abort_with_rx_done;
	}
	if (mxge_verbose)
		device_printf(dev, "using %s irq %ld\n",
			      sc->msi_enabled ? "MSI" : "INTx",
			      rman_get_start(sc->irq_res));
	/* load the firmware */
	mxge_select_firmware(sc);

	err = mxge_load_firmware(sc);
	if (err != 0)
		goto abort_with_irq_res;
	sc->intr_coal_delay = mxge_intr_coal_delay;
	err = mxge_reset(sc);
	if (err != 0)
		goto abort_with_irq_res;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_irq_res;
	}

	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
			     NULL, mxge_intr, sc, &sc->ih);
	if (err != 0) {
		goto abort_with_rings;
	}
	/* hook into the network stack */
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_baudrate = 100000000;
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_JUMBO_MTU;
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	sc->csum_flag = 1;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to 1500 */
	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;

	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
	mxge_add_sysctls(sc);
	return 0;

abort_with_rings:
	mxge_free_rings(sc);
abort_with_irq_res:
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);
abort_with_rx_done:
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_fw_stats:
	mxge_dma_free(&sc->fw_stats_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);

abort_with_nothing:
	return err;
}
3107 
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	/*
	 * Bring the interface down under the driver lock so that the
	 * periodic tick and ioctl paths cannot race with the teardown.
	 */
	mtx_lock(&sc->driver_mtx);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	/*
	 * NOTE(review): callout_stop() does not wait for a handler that
	 * is already executing on another CPU; callout_drain() (invoked
	 * with the lock dropped) would be the race-free choice — verify
	 * against the callout(9) contract.
	 */
	callout_stop(&sc->co_hdl);
	mtx_unlock(&sc->driver_mtx);
	/* Unhook from the network stack before releasing any resources. */
	ether_ifdetach(sc->ifp);
	ifmedia_removeall(&sc->media);
	/* Tell the firmware to quiesce its dummy RDMA engine. */
	mxge_dummy_rdma(sc, 0);
	/* Interrupt teardown must precede freeing the rings it services. */
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_rings(sc);
	/* rid 1 for MSI, rid 0 for legacy INTx — mirrors the attach path. */
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);

	/*
	 * rx_done.entry aliases storage inside rx_done.dma; clear the
	 * alias first, then free the DMA regions in reverse order of
	 * allocation.
	 */
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	/* Release the register BAR and disable bus mastering. */
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	/* Locks are destroyed last, after every path that could take them. */
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
3143 
3144 static int
3145 mxge_shutdown(device_t dev)
3146 {
3147 	return 0;
3148 }
3149 
3150 /*
3151   This file uses Myri10GE driver indentation.
3152 
3153   Local Variables:
3154   c-file-style:"linux"
3155   tab-width:8
3156   End:
3157 */
3158