xref: /freebsd/sys/dev/mxge/if_mxge.c (revision c96ae1968a6ab7056427a739bce81bf07447c2d4)
1 /******************************************************************************
2 
3 Copyright (c) 2006, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Myricom Inc, nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/linker.h>
40 #include <sys/firmware.h>
41 #include <sys/endian.h>
42 #include <sys/sockio.h>
43 #include <sys/mbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/kdb.h>
46 #include <sys/kernel.h>
47 #include <sys/module.h>
48 #include <sys/memrange.h>
49 #include <sys/socket.h>
50 #include <sys/sysctl.h>
51 #include <sys/sx.h>
52 
53 #include <net/if.h>
54 #include <net/if_arp.h>
55 #include <net/ethernet.h>
56 #include <net/if_dl.h>
57 #include <net/if_media.h>
58 
59 #include <net/bpf.h>
60 
61 #include <net/if_types.h>
62 #include <net/if_vlan_var.h>
63 #include <net/zlib.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/tcp.h>
69 
70 #include <machine/bus.h>
71 #include <machine/resource.h>
72 #include <sys/bus.h>
73 #include <sys/rman.h>
74 
75 #include <dev/pci/pcireg.h>
76 #include <dev/pci/pcivar.h>
77 
78 #include <vm/vm.h>		/* for pmap_mapdev() */
79 #include <vm/pmap.h>
80 
81 #include <dev/mxge/mxge_mcp.h>
82 #include <dev/mxge/mcp_gen_header.h>
83 #include <dev/mxge/if_mxge_var.h>
84 
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on nVidia bridges */
static int mxge_force_firmware = 0;	/* 0 = auto-select, 1 = force aligned fw, else force unaligned */
static int mxge_max_intr_slots = 1024;	/* entries in the rx completion (interrupt) queue */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay, microseconds */
static int mxge_deassert_wait = 1;	/* NOTE(review): not referenced in this chunk; presumably wait for IRQ deassert in the handler — confirm */
static int mxge_flow_control = 1;	/* NOTE(review): not referenced in this chunk; presumably default pause-frame setting — confirm */
static int mxge_verbose = 0;		/* enable chatty device_printf output */
static int mxge_ticks;			/* NOTE(review): not referenced in this chunk; presumably callout period set at attach — confirm */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";	/* fw image for unaligned PCIe completions */
static char *mxge_fw_aligned = "mxge_eth_z8e";		/* fw image for aligned PCIe completions */
96 
/* forward declarations for the newbus device methods and the
   interrupt handler registered below */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);
102 
/* newbus method table wiring the generic device entry points to the
   mxge implementations declared above */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};
112 
/* newbus driver description: driver name, method table, and the size
   of the per-device softc the bus code allocates for us */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware images are fetched through firmware(9) */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
125 
126 static int
127 mxge_probe(device_t dev)
128 {
129   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
130       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
131 	  device_set_desc(dev, "Myri10G-PCIE-8A");
132 	  return 0;
133   }
134   return ENXIO;
135 }
136 
137 static void
138 mxge_enable_wc(mxge_softc_t *sc)
139 {
140 	struct mem_range_desc mrdesc;
141 	vm_paddr_t pa;
142 	vm_offset_t len;
143 	int err, action;
144 
145 	pa = rman_get_start(sc->mem_res);
146 	len = rman_get_size(sc->mem_res);
147 	mrdesc.mr_base = pa;
148 	mrdesc.mr_len = len;
149 	mrdesc.mr_flags = MDF_WRITECOMBINE;
150 	action = MEMRANGE_SET_UPDATE;
151 	strcpy((char *)&mrdesc.mr_owner, "mxge");
152 	err = mem_range_attr_set(&mrdesc, &action);
153 	if (err != 0) {
154 		device_printf(sc->dev,
155 			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
156 			      (unsigned long)pa, (unsigned long)len, err);
157 	} else {
158 		sc->wc = 1;
159 	}
160 }
161 
162 
163 /* callback to get our DMA address */
164 static void
165 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
166 			 int error)
167 {
168 	if (error == 0) {
169 		*(bus_addr_t *) arg = segs->ds_addr;
170 	}
171 }
172 
/*
 * Allocate a DMA-coherent buffer of `bytes' bytes with the requested
 * alignment and record the tag, map, kernel va and bus address in
 * `dma'.  On any failure everything acquired so far is unwound via
 * the goto ladder and the bus_dma error is returned; on success the
 * caller must eventually release the buffer with mxge_dma_free().
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; mxge_dmamap_callback() records the bus
	   address of the single segment in dma->bus_addr */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
223 
224 
/* Release a buffer set up by mxge_dma_alloc(): unload the mapping,
 * free the memory, then destroy the tag — the reverse of the order in
 * which they were created. */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
232 
233 /*
234  * The eeprom strings on the lanaiX have the format
235  * SN=x\0
236  * MAC=x:x:x:x:x:x\0
237  * PC=text\0
238  */
239 
240 static int
241 mxge_parse_strings(mxge_softc_t *sc)
242 {
243 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
244 
245 	char *ptr, *limit;
246 	int i, found_mac;
247 
248 	ptr = sc->eeprom_strings;
249 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
250 	found_mac = 0;
251 	while (ptr < limit && *ptr != '\0') {
252 		if (memcmp(ptr, "MAC=", 4) == 0) {
253 			ptr += 1;
254 			sc->mac_addr_string = ptr;
255 			for (i = 0; i < 6; i++) {
256 				ptr += 3;
257 				if ((ptr + 2) > limit)
258 					goto abort;
259 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
260 				found_mac = 1;
261 			}
262 		} else if (memcmp(ptr, "PC=", 3) == 0) {
263 			ptr += 3;
264 			strncpy(sc->product_code_string, ptr,
265 				sizeof (sc->product_code_string) - 1);
266 		} else if (memcmp(ptr, "SN=", 3) == 0) {
267 			ptr += 3;
268 			strncpy(sc->serial_number_string, ptr,
269 				sizeof (sc->serial_number_string) - 1);
270 		}
271 		MXGE_NEXT_STRING(ptr);
272 	}
273 
274 	if (found_mac)
275 		return 0;
276 
277  abort:
278 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
279 
280 	return ENXIO;
281 }
282 
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on an upstream nVidia (nForce4-class) PCIe
 * bridge by setting bit 0x40 of its extended config register 0x178.
 * Since FreeBSD at this revision cannot issue extended (>0xff) config
 * space accesses, the bridge's config space is reached by mapping the
 * chipset's memory-mapped config window at 0xe0000000 directly with
 * pmap_mapdev().  Returns 0 on success, EIO if the mapping or the
 * vendor/device sanity checks fail.
 */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* ask the PCI bus for the bridge's location and identity */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of this function's config page within the
	   chipset's extended-config window: 1MB per bus, 4KB per
	   (slot, function) */
	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	/* set the ECRC-generation enable bit and unmap */
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
#else
/* Non-x86 stub: the nForce4 config-window hack above is x86-only. */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
#endif
382 /*
383  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
384  * when the PCI-E Completion packets are aligned on an 8-byte
385  * boundary.  Some PCI-E chip sets always align Completion packets; on
386  * the ones that do not, the alignment can be enforced by enabling
387  * ECRC generation (if supported).
388  *
389  * When PCI-E Completion packets are not aligned, it is actually more
390  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
391  *
392  * If the driver can neither enable ECRC nor verify that it has
393  * already been enabled, then it must use a firmware image which works
394  * around unaligned completion packets (ethp_z8e.dat), and it should
395  * also ensure that it never gives the device a Read-DMA which is
396  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
397  * enabled, then the driver should use the aligned (eth_z8e.dat)
398  * firmware image, and set tx.boundary to 4KB.
399  */
400 
/*
 * Choose between the aligned and unaligned firmware images (see the
 * block comment above) and set the matching tx.boundary: 4KB when
 * completions are known to be aligned, 2KB otherwise.  The decision
 * order is: force tunable, narrow PCIe link, ECRC enable attempt on
 * nVidia bridges, then a whitelist of bridges known to align.
 */
static void
mxge_select_firmware(mxge_softc_t *sc)
{
	int err, aligned = 0;
	device_t pdev;
	uint16_t pvend, pdid;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	/* the upstream bridge is two levels up: our parent is the PCI
	   bus, its parent is the bridge device */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		goto abort;
	}
	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* see if we can enable ECRC's on an upstream
	   Nvidia bridge */
	if (mxge_nvidia_ecrc_enable &&
	    (pvend == 0x10de && pdid == 0x005d)) {
		err = mxge_enable_nvidia_ecrc(sc, pdev);
		if (err == 0) {
			aligned = 1;
			if (mxge_verbose)
				device_printf(sc->dev,
					      "Assuming aligned completions"
					      " (ECRC)\n");
		}
	}
	/* see if the upstream bridge is known to
	   provide aligned completions */
	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
	    /* Intel */  (pvend == 0x8086 &&
	      /* E5000 NorthBridge*/((pdid >= 0x25f7 && pdid <= 0x25fa) ||
	      /* E5000 SouthBridge*/ (pdid >= 0x3510 && pdid <= 0x351b)))) {
		aligned = 1;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming aligned completions "
				      "(0x%x:0x%x)\n", pvend, pdid);
	}

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
}
475 
/* Union used to strip const from the firmware(9) image data so it can
 * be handed to mxge_pio_copy(), which takes a non-const source
 * pointer; the data is only ever read. */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
481 
482 static int
483 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
484 {
485 	int major, minor;
486 
487 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
488 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
489 			      be32toh(hdr->mcp_type));
490 		return EIO;
491 	}
492 
493 	/* save firmware version for sysctl */
494 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
495 	if (mxge_verbose)
496 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
497 
498 	sscanf(sc->fw_version, "%d.%d", &major, &minor);
499 
500 	if (!(major == MXGEFW_VERSION_MAJOR
501 	      && minor == MXGEFW_VERSION_MINOR)) {
502 		device_printf(sc->dev, "Found firmware version %s\n",
503 			      sc->fw_version);
504 		device_printf(sc->dev, "Driver needs %d.%d\n",
505 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
506 		return EINVAL;
507 	}
508 	return 0;
509 
510 }
511 
/*
 * Fetch the firmware image named sc->fw_name via firmware(9),
 * validate its embedded MCP header, and PIO-copy the image into NIC
 * SRAM at MXGE_FW_OFFSET.  On entry *limit is the maximum acceptable
 * image size; on success it is reduced to the actual image size.
 * Returns 0 or an errno; the firmware reference is always released.
 */
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	/* NOTE(review): htobe32() here swaps a big-endian word from
	   the image to host order; it is the same operation as
	   be32toh(), which would state the intent more clearly */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	/* cast away const via the union so mxge_pio_copy() accepts it */
	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		/* read back from SRAM, presumably to flush the posted
		   PIO writes — TODO confirm against Myricom docs */
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
572 
573 /*
574  * Enable or disable periodic RDMAs from the host to make certain
575  * chipsets resend dropped PCIe messages
576  */
577 
578 static void
579 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
580 {
581 	char buf_bytes[72];
582 	volatile uint32_t *confirm;
583 	volatile char *submit;
584 	uint32_t *buf, dma_low, dma_high;
585 	int i;
586 
587 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
588 
589 	/* clear confirmation addr */
590 	confirm = (volatile uint32_t *)sc->cmd;
591 	*confirm = 0;
592 	mb();
593 
594 	/* send an rdma command to the PCIe engine, and wait for the
595 	   response in the confirmation address.  The firmware should
596 	   write a -1 there to indicate it is alive and well
597 	*/
598 
599 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
600 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
601 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
602 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
603 	buf[2] = htobe32(0xffffffff);		/* confirm data */
604 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
605 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
606 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
607 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
608 	buf[5] = htobe32(enable);			/* enable? */
609 
610 
611 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
612 
613 	mxge_pio_copy(submit, buf, 64);
614 	mb();
615 	DELAY(1000);
616 	mb();
617 	i = 0;
618 	while (*confirm != 0xffffffff && i < 20) {
619 		DELAY(1000);
620 		i++;
621 	}
622 	if (*confirm != 0xffffffff) {
623 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
624 			      (enable ? "enable" : "disable"), confirm,
625 			      *confirm);
626 	}
627 	return;
628 }
629 
630 static int
631 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
632 {
633 	mcp_cmd_t *buf;
634 	char buf_bytes[sizeof(*buf) + 8];
635 	volatile mcp_cmd_response_t *response = sc->cmd;
636 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
637 	uint32_t dma_low, dma_high;
638 	int sleep_total = 0;
639 
640 	/* ensure buf is aligned to 8 bytes */
641 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
642 
643 	buf->data0 = htobe32(data->data0);
644 	buf->data1 = htobe32(data->data1);
645 	buf->data2 = htobe32(data->data2);
646 	buf->cmd = htobe32(cmd);
647 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
648 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
649 
650 	buf->response_addr.low = htobe32(dma_low);
651 	buf->response_addr.high = htobe32(dma_high);
652 	mtx_lock(&sc->cmd_mtx);
653 	response->result = 0xffffffff;
654 	mb();
655 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
656 
657 	/* wait up to 20ms */
658 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
659 		bus_dmamap_sync(sc->cmd_dma.dmat,
660 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
661 		mb();
662 		if (response->result != 0xffffffff) {
663 			if (response->result == 0) {
664 				data->data0 = be32toh(response->data);
665 				mtx_unlock(&sc->cmd_mtx);
666 				return 0;
667 			} else {
668 				device_printf(sc->dev,
669 					      "mxge: command %d "
670 					      "failed, result = %d\n",
671 					      cmd, be32toh(response->result));
672 				mtx_unlock(&sc->cmd_mtx);
673 				return ENXIO;
674 			}
675 		}
676 		DELAY(1000);
677 	}
678 	mtx_unlock(&sc->cmd_mtx);
679 	device_printf(sc->dev, "mxge: command %d timed out"
680 		      "result = %d\n",
681 		      cmd, be32toh(response->result));
682 	return EAGAIN;
683 }
684 
/*
 * Validate the firmware already running on the NIC (used when no
 * loadable image could be installed): read its header offset from
 * SRAM, copy the header into host memory, and run the usual version
 * checks on it.  Returns 0 or an errno.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	/* NOTE(review): htobe32() here swaps a big-endian SRAM word to
	   host order; it is the same operation as be32toh(), which
	   would state the intent more clearly */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);
	return status;
}
718 
719 
/*
 * Load the selected firmware image into the NIC and hand control to
 * it via the boot handoff doorbell.  If the image cannot be loaded,
 * fall back to adopting whatever firmware is already running (forcing
 * the conservative 2KB tx.boundary).  Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align buf on an 8-byte boundary inside buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		/* adopted firmware is treated as unaligned-capable
		   only, so cap Read-DMAs at 2KB */
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll the confirmation word for up to ~200ms */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
802 
803 static int
804 mxge_update_mac_address(mxge_softc_t *sc)
805 {
806 	mxge_cmd_t cmd;
807 	uint8_t *addr = sc->mac_addr;
808 	int status;
809 
810 
811 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
812 		     | (addr[2] << 8) | addr[3]);
813 
814 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
815 
816 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
817 	return status;
818 }
819 
820 static int
821 mxge_change_pause(mxge_softc_t *sc, int pause)
822 {
823 	mxge_cmd_t cmd;
824 	int status;
825 
826 	if (pause)
827 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
828 				       &cmd);
829 	else
830 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
831 				       &cmd);
832 
833 	if (status) {
834 		device_printf(sc->dev, "Failed to set flow control mode\n");
835 		return ENXIO;
836 	}
837 	sc->pause = pause;
838 	return 0;
839 }
840 
841 static void
842 mxge_change_promisc(mxge_softc_t *sc, int promisc)
843 {
844 	mxge_cmd_t cmd;
845 	int status;
846 
847 	if (promisc)
848 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
849 				       &cmd);
850 	else
851 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
852 				       &cmd);
853 
854 	if (status) {
855 		device_printf(sc->dev, "Failed to set promisc mode\n");
856 	}
857 }
858 
/*
 * Rebuild the firmware's multicast filter from the interface's
 * multicast address list.  The sequence is: disable filtering, bail
 * out if IFF_ALLMULTI (leaving filtering off), otherwise flush all
 * filters, join each AF_LINK address, then re-enable filtering.  Any
 * firmware error aborts, leaving filtering off (fail-open).
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}


	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* pack the 6-byte link-level address into data0 (4
		   bytes) and data1 (2 bytes); NOTE(review): only two
		   bytes of data1 are written before htonl() — the
		   upper bytes appear uninitialized, confirm whether
		   the firmware masks them */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
924 
925 
926 static int
927 mxge_reset(mxge_softc_t *sc)
928 {
929 
930 	mxge_cmd_t cmd;
931 	size_t bytes;
932 	int status;
933 
934 	/* try to send a reset command to the card to see if it
935 	   is alive */
936 	memset(&cmd, 0, sizeof (cmd));
937 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
938 	if (status != 0) {
939 		device_printf(sc->dev, "failed reset\n");
940 		return ENXIO;
941 	}
942 
943 	mxge_dummy_rdma(sc, 1);
944 
945 	/* Now exchange information about interrupts  */
946 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
947 	memset(sc->rx_done.entry, 0, bytes);
948 	cmd.data0 = (uint32_t)bytes;
949 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
950 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
951 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
952 	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
953 
954 	status |= mxge_send_cmd(sc,
955 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
956 
957 
958 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
959 
960 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
961 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
962 
963 
964 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
965 				&cmd);
966 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
967 	if (status != 0) {
968 		device_printf(sc->dev, "failed set interrupt parameters\n");
969 		return status;
970 	}
971 
972 
973 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
974 
975 
976 	/* run a DMA benchmark */
977 	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
978 
979 	/* Read DMA */
980 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
981 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
982 	cmd.data2 = sc->tx.boundary * 0x10000;
983 
984 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
985 	if (status != 0)
986 		device_printf(sc->dev, "read dma benchmark failed\n");
987 	else
988 		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
989 			(cmd.data0 & 0xffff);
990 
991 	/* Write DMA */
992 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
993 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
994 	cmd.data2 = sc->tx.boundary * 0x1;
995 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
996 	if (status != 0)
997 		device_printf(sc->dev, "write dma benchmark failed\n");
998 	else
999 		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
1000 			(cmd.data0 & 0xffff);
1001 	/* Read/Write DMA */
1002 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->dmabench_dma.bus_addr);
1003 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->dmabench_dma.bus_addr);
1004 	cmd.data2 = sc->tx.boundary * 0x10001;
1005 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
1006 	if (status != 0)
1007 		device_printf(sc->dev, "read/write dma benchmark failed\n");
1008 	else
1009 		sc->read_write_dma =
1010 			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
1011 			(cmd.data0 & 0xffff);
1012 
1013 	/* reset mcp/driver shared state back to 0 */
1014 	bzero(sc->rx_done.entry, bytes);
1015 	sc->rx_done.idx = 0;
1016 	sc->rx_done.cnt = 0;
1017 	sc->tx.req = 0;
1018 	sc->tx.done = 0;
1019 	sc->tx.pkt_done = 0;
1020 	sc->tx.wake = 0;
1021 	sc->tx.stall = 0;
1022 	sc->rx_big.cnt = 0;
1023 	sc->rx_small.cnt = 0;
1024 	sc->rdma_tags_available = 15;
1025 	sc->fw_stats->valid = 0;
1026 	sc->fw_stats->send_done_count = 0;
1027 	status = mxge_update_mac_address(sc);
1028 	mxge_change_promisc(sc, 0);
1029 	mxge_change_pause(sc, sc->pause);
1030 	mxge_set_multicast_list(sc);
1031 	return status;
1032 }
1033 
1034 static int
1035 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1036 {
1037         mxge_softc_t *sc;
1038         unsigned int intr_coal_delay;
1039         int err;
1040 
1041         sc = arg1;
1042         intr_coal_delay = sc->intr_coal_delay;
1043         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1044         if (err != 0) {
1045                 return err;
1046         }
1047         if (intr_coal_delay == sc->intr_coal_delay)
1048                 return 0;
1049 
1050         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1051                 return EINVAL;
1052 
1053 	mtx_lock(&sc->driver_mtx);
1054 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1055 	sc->intr_coal_delay = intr_coal_delay;
1056 
1057 	mtx_unlock(&sc->driver_mtx);
1058         return err;
1059 }
1060 
1061 static int
1062 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1063 {
1064         mxge_softc_t *sc;
1065         unsigned int enabled;
1066         int err;
1067 
1068         sc = arg1;
1069         enabled = sc->pause;
1070         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1071         if (err != 0) {
1072                 return err;
1073         }
1074         if (enabled == sc->pause)
1075                 return 0;
1076 
1077 	mtx_lock(&sc->driver_mtx);
1078 	err = mxge_change_pause(sc, enabled);
1079 	mtx_unlock(&sc->driver_mtx);
1080         return err;
1081 }
1082 
1083 static int
1084 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1085 {
1086         int err;
1087 
1088         if (arg1 == NULL)
1089                 return EFAULT;
1090         arg2 = be32toh(*(int *)arg1);
1091         arg1 = NULL;
1092         err = sysctl_handle_int(oidp, arg1, arg2, req);
1093 
1094         return err;
1095 }
1096 
1097 static void
1098 mxge_add_sysctls(mxge_softc_t *sc)
1099 {
1100 	struct sysctl_ctx_list *ctx;
1101 	struct sysctl_oid_list *children;
1102 	mcp_irq_data_t *fw;
1103 
1104 	ctx = device_get_sysctl_ctx(sc->dev);
1105 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1106 	fw = sc->fw_stats;
1107 
1108 	/* random information */
1109 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1110 		       "firmware_version",
1111 		       CTLFLAG_RD, &sc->fw_version,
1112 		       0, "firmware version");
1113 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1114 		       "serial_number",
1115 		       CTLFLAG_RD, &sc->serial_number_string,
1116 		       0, "serial number");
1117 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1118 		       "product_code",
1119 		       CTLFLAG_RD, &sc->product_code_string,
1120 		       0, "product_code");
1121 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1122 		       "pcie_link_width",
1123 		       CTLFLAG_RD, &sc->link_width,
1124 		       0, "tx_boundary");
1125 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1126 		       "tx_boundary",
1127 		       CTLFLAG_RD, &sc->tx.boundary,
1128 		       0, "tx_boundary");
1129 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1130 		       "write_combine",
1131 		       CTLFLAG_RD, &sc->wc,
1132 		       0, "write combining PIO?");
1133 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1134 		       "read_dma_MBs",
1135 		       CTLFLAG_RD, &sc->read_dma,
1136 		       0, "DMA Read speed in MB/s");
1137 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1138 		       "write_dma_MBs",
1139 		       CTLFLAG_RD, &sc->write_dma,
1140 		       0, "DMA Write speed in MB/s");
1141 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1142 		       "read_write_dma_MBs",
1143 		       CTLFLAG_RD, &sc->read_write_dma,
1144 		       0, "DMA concurrent Read/Write speed in MB/s");
1145 
1146 
1147 	/* performance related tunables */
1148 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1149 			"intr_coal_delay",
1150 			CTLTYPE_INT|CTLFLAG_RW, sc,
1151 			0, mxge_change_intr_coal,
1152 			"I", "interrupt coalescing delay in usecs");
1153 
1154 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1155 			"flow_control_enabled",
1156 			CTLTYPE_INT|CTLFLAG_RW, sc,
1157 			0, mxge_change_flow_control,
1158 			"I", "interrupt coalescing delay in usecs");
1159 
1160 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1161 		       "deassert_wait",
1162 		       CTLFLAG_RW, &mxge_deassert_wait,
1163 		       0, "Wait for IRQ line to go low in ihandler");
1164 
1165 	/* stats block from firmware is in network byte order.
1166 	   Need to swap it */
1167 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1168 			"link_up",
1169 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1170 			0, mxge_handle_be32,
1171 			"I", "link up");
1172 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1173 			"rdma_tags_available",
1174 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1175 			0, mxge_handle_be32,
1176 			"I", "rdma_tags_available");
1177 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1178 			"dropped_link_overflow",
1179 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1180 			0, mxge_handle_be32,
1181 			"I", "dropped_link_overflow");
1182 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1183 			"dropped_link_error_or_filtered",
1184 			CTLTYPE_INT|CTLFLAG_RD,
1185 			&fw->dropped_link_error_or_filtered,
1186 			0, mxge_handle_be32,
1187 			"I", "dropped_link_error_or_filtered");
1188 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1189 			"dropped_multicast_filtered",
1190 			CTLTYPE_INT|CTLFLAG_RD,
1191 			&fw->dropped_multicast_filtered,
1192 			0, mxge_handle_be32,
1193 			"I", "dropped_multicast_filtered");
1194 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1195 			"dropped_runt",
1196 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1197 			0, mxge_handle_be32,
1198 			"I", "dropped_runt");
1199 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1200 			"dropped_overrun",
1201 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1202 			0, mxge_handle_be32,
1203 			"I", "dropped_overrun");
1204 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1205 			"dropped_no_small_buffer",
1206 			CTLTYPE_INT|CTLFLAG_RD,
1207 			&fw->dropped_no_small_buffer,
1208 			0, mxge_handle_be32,
1209 			"I", "dropped_no_small_buffer");
1210 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1211 			"dropped_no_big_buffer",
1212 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1213 			0, mxge_handle_be32,
1214 			"I", "dropped_no_big_buffer");
1215 
1216 	/* host counters exported for debugging */
1217 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1218 		       "rx_small_cnt",
1219 		       CTLFLAG_RD, &sc->rx_small.cnt,
1220 		       0, "rx_small_cnt");
1221 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1222 		       "rx_big_cnt",
1223 		       CTLFLAG_RD, &sc->rx_big.cnt,
1224 		       0, "rx_small_cnt");
1225 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1226 		       "tx_req",
1227 		       CTLFLAG_RD, &sc->tx.req,
1228 		       0, "tx_req");
1229 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1230 		       "tx_done",
1231 		       CTLFLAG_RD, &sc->tx.done,
1232 		       0, "tx_done");
1233 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1234 		       "tx_pkt_done",
1235 		       CTLFLAG_RD, &sc->tx.pkt_done,
1236 		       0, "tx_done");
1237 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1238 		       "tx_stall",
1239 		       CTLFLAG_RD, &sc->tx.stall,
1240 		       0, "tx_stall");
1241 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1242 		       "tx_wake",
1243 		       CTLFLAG_RD, &sc->tx.wake,
1244 		       0, "tx_wake");
1245 
1246 	/* verbose printing? */
1247 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1248 		       "verbose",
1249 		       CTLFLAG_RW, &mxge_verbose,
1250 		       0, "verbose printing");
1251 
1252 }
1253 
1254 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1255    backwards one at a time and handle ring wraps */
1256 
static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;

        /*
         * Copy requests cnt-1 down to 1 into the NIC's tx ring,
         * handling ring wrap via tx->mask.  Entry 0 is deliberately
         * NOT written here: the caller (mxge_submit_req()) submits it
         * last so the NIC does not see a partially-written chain.
         */
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb();	/* complete each PIO burst before the next */
        }
}
1271 
1272 /*
1273  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1274  * at most 32 bytes at a time, so as to avoid involving the software
1275  * pio handler in the nic.   We re-write the first segment's flags
1276  * to mark them valid only after writing the entire chain
1277  */
1278 
static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Clear the first request's flags so the NIC ignores the
	   chain until the whole thing has been copied; the saved
	   flags are re-written at the very end to arm it. */
	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: stream requests forward in pairs
		   (32 bytes at a time) */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
1327 
/*
 * Submit a request list through the NIC's write-combining fifo.
 * The ring index is advanced up front; descriptors are then pushed
 * to the fifo in 64-byte bursts (4 requests at a time), with any
 * partial tail group padded out to a full 64-byte write.
 */
static inline void
mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
    tx->req += cnt;
    mb();
    while (cnt >= 4) {
	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
	    mb();	/* flush each 64-byte burst before the next */
	    src += 4;
	    cnt -= 4;
    }
    if (cnt > 0) {
	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
	       needs to be so that we don't overrun it */
	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
	    mb();
    }
}
1346 
/*
 * Build and submit the send-request chain for a TSO packet whose
 * busdma segments are already loaded in sc->tx.seg_list.  Each busdma
 * segment is split at MSS boundaries so the firmware can emit one
 * frame per MSS of payload.  On overflow of the request list the
 * packet is dropped and if_oerrors is bumped.
 */
static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;	/* rate-limits the overflow printf to one shot */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < sizeof (*eh)
			    + sizeof (*ip))) {
		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
			   sc->scratch);
		eh = (struct ether_header *)sc->scratch;
	} else {
		eh = mtod(m, struct ether_header *);
	}
	ip = (struct ip *) (eh + 1);
	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		eh = (struct ether_header *) sc->scratch;
		ip = (struct ip *) (eh + 1);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	/* cum_len starts at minus the full header length; it turns
	   non-negative exactly when the payload begins */
	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seglen = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			cum_len_next = cum_len + seglen;
			/* retroactively patch the previous group's
			   RDMA count (see big comment above) */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload: branchless updates -- chop
				   and next_is_first are 0/1, so the
				   multiplies/masks conditionally set
				   flags and reset rdma_count */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* cksum_offset only applies while we are still
			   inside the headers; decay it to zero */
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* patch the final group's RDMA count */
	(req-rdma_count)->rdma_count = rdma_count;

	/* walk backwards marking TSO_LAST on every request of the
	   final frame (stop at the frame's first/chop boundary) */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
		once = 1;
	}
	return;

}
1508 
/*
 * DMA-map a frame and hand it to the NIC as a chain of send requests.
 * TSO frames are diverted to mxge_encap_tso(); runts are padded to 60
 * bytes with an extra descriptor pointing at the shared zero page.
 * Caller holds sc->tx_mtx (called from mxge_start_locked()).
 */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	/* remember the mbuf so mxge_tx_done() can free it */
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < sizeof (*eh)
				    + sizeof (*ip))) {
			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
				   sc->scratch);
			eh = (struct ether_header *)sc->scratch;
		} else {
			eh = mtod(m, struct ether_header *);
		}
		ip = (struct ip *) (eh + 1);
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the checksum offset only applies until we pass the
		   headers; decay it to zero across segments */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		/* extra descriptor pointing at the shared zero pad */
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the slot of the last descriptor as end-of-packet */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1660 
1661 
1662 
1663 
1664 static inline void
1665 mxge_start_locked(mxge_softc_t *sc)
1666 {
1667 	struct mbuf *m;
1668 	struct ifnet *ifp;
1669 
1670 	ifp = sc->ifp;
1671 	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1672 	       > MXGE_MAX_SEND_DESC) {
1673 
1674 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1675 		if (m == NULL) {
1676 			return;
1677 		}
1678 		/* let BPF see it */
1679 		BPF_MTAP(ifp, m);
1680 
1681 		/* give it to the nic */
1682 		mxge_encap(sc, m);
1683 	}
1684 	/* ran out of transmit slots */
1685 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
1686 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1687 		sc->tx.stall++;
1688 	}
1689 }
1690 
1691 static void
1692 mxge_start(struct ifnet *ifp)
1693 {
1694 	mxge_softc_t *sc = ifp->if_softc;
1695 
1696 
1697 	mtx_lock(&sc->tx_mtx);
1698 	mxge_start_locked(sc);
1699 	mtx_unlock(&sc->tx_mtx);
1700 }
1701 
1702 /*
1703  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1704  * at most 32 bytes at a time, so as to avoid involving the software
1705  * pio handler in the nic.   We re-write the first segment's low
1706  * DMA address to mark it valid only after we write the entire chunk
1707  * in a burst
1708  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* Poison the first descriptor's low address so the NIC treats
	   the group as invalid until both 32-byte halves have landed;
	   re-writing the real address last arms all 8 slots at once. */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	dst->addr_low = low;
	mb();
}
1724 
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot
 * "idx", recording its bus address in the shadow ring.  Every 8th
 * slot, the last 8 shadow entries are pushed to the NIC -- this
 * happens even on allocation failure, which re-posts the old buffer
 * still recorded in the shadow entry (buffer recycling).
 * Returns 0 or an errno.
 */
static int
mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_buf_t *rx = &sc->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	if ((idx & 7) == 7) {
		if (rx->wc_fifo == NULL)
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		else {
			mb();
			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
		}
        }
	return err;
}
1764 
/*
 * Same as mxge_get_buf_small(), but for the big-buffer receive ring:
 * allocates a jumbo cluster of sc->big_bytes.  Every 8th slot pushes
 * the last 8 shadow entries to the NIC, even on allocation failure
 * (which recycles the previously-posted buffer).  Returns 0 or errno.
 */
static int
mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_buf_t *rx = &sc->rx_big;
	int cnt, err;

	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = sc->big_bytes;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	if ((idx & 7) == 7) {
		if (rx->wc_fifo == NULL)
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		else {
			mb();
			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
		}
        }
	return err;
}
1804 
1805 static inline void
1806 mxge_rx_csum(struct mbuf *m, int csum)
1807 {
1808 	struct ether_header *eh;
1809 	struct ip *ip;
1810 
1811 	eh = mtod(m, struct ether_header *);
1812 	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
1813 		ip = (struct ip *)(eh + 1);
1814 		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1815 				   ip->ip_p == IPPROTO_UDP)) {
1816 			m->m_pkthdr.csum_data = csum;
1817 			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1818 		}
1819 	}
1820 }
1821 
/*
 * Receive a frame that arrived in one or more big-ring buffers.
 * Each consumed slot is refilled before the old mbuf is used; if a
 * refill fails the whole frame is dropped and the remaining slots
 * are run through the allocator so nothing is leaked.
 */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wunitialized */
	struct mbuf *m_prev = 0;	/* -Wunitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st bytes so that
			 * packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
        if (m_head) {
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}
1916 
/*
 * Receive a frame that fits in a single small-ring buffer: refill
 * the slot, swap DMA maps, and pass the received mbuf up the stack.
 * If the refill fails, the frame is dropped and the old mbuf stays
 * posted (buffer recycling).
 */
static inline void
mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
{
	struct ifnet *ifp;
	struct mbuf *m;
	mxge_rx_buf_t *rx;
	bus_dmamap_t old_map;
	int idx;

	ifp = sc->ifp;
	rx = &sc->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m, csum);

	/* pass the frame up the stack */
	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m);
}
1962 
/*
 * Drain the firmware's receive-completion ring.  A non-zero length
 * field marks a valid entry; it is zeroed after being read to hand
 * the slot back to the firmware.  Frames are dispatched to the small
 * or big receive path based on size.
 */
static inline void
mxge_clean_rx_done(mxge_softc_t *sc)
{
	mxge_rx_done_t *rx_done = &sc->rx_done;
	int limit = 0;
	uint16_t length;
	uint16_t checksum;


	while (rx_done->entry[rx_done->idx].length != 0) {
		length = ntohs(rx_done->entry[rx_done->idx].length);
		/* zeroing the length returns the slot to the firmware */
		rx_done->entry[rx_done->idx].length = 0;
		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
		if (length <= (MHLEN - MXGEFW_PAD))
			mxge_rx_done_small(sc, length, checksum);
		else
			mxge_rx_done_big(sc, length, checksum);
		rx_done->cnt++;
		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);

		/* limit potential for livelock */
		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
			break;

	}
}
1989 
1990 
/*
 * Reclaim transmit ring entries up to the firmware's completion
 * index "mcp_idx": unload DMA maps, free sent mbufs, and advance
 * tx->done/tx->pkt_done.  Restarts the interface if it was blocked
 * on a full ring and at least 3/4 of the ring is now free.
 */
static inline void
mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx, limit;

	limit = 0;
	tx = &sc->tx;
	ifp = sc->ifp;
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ifp->if_opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		/* the flag marks the last slot of a packet (set in
		   mxge_encap()); count one completed packet per flag */
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
		/* limit potential for livelock by only handling
		   2 full tx rings per call */
		if (__predict_false(++limit >  2 * tx->mask))
			break;
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
           its OK to send packets */

	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		mtx_lock(&sc->tx_mtx);
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		sc->tx.wake++;
		mxge_start_locked(sc);
		mtx_unlock(&sc->tx_mtx);
	}
}
2038 
/*
 * Interrupt handler.  The firmware DMAs an irq-data block into host
 * memory; stats->valid going non-zero indicates that DMA completed.
 * For legacy interrupts the handler may loop until the firmware
 * confirms the IRQ line was deasserted (mxge_deassert_wait), then
 * claims the interrupt back to the NIC via irq_claim.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (!sc->msi_enabled) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* firmware set stats_updated: refresh link state and
	   drop counters */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2106 
/*
 * if_init handler.  Intentionally empty: the interface is brought up
 * via mxge_open(), called from mxge_ioctl() on SIOCSIFFLAGS.
 */
static void
mxge_init(void *arg)
{
}
2111 
2112 
2113 
2114 static void
2115 mxge_free_mbufs(mxge_softc_t *sc)
2116 {
2117 	int i;
2118 
2119 	for (i = 0; i <= sc->rx_big.mask; i++) {
2120 		if (sc->rx_big.info[i].m == NULL)
2121 			continue;
2122 		bus_dmamap_unload(sc->rx_big.dmat,
2123 				  sc->rx_big.info[i].map);
2124 		m_freem(sc->rx_big.info[i].m);
2125 		sc->rx_big.info[i].m = NULL;
2126 	}
2127 
2128 	for (i = 0; i <= sc->rx_big.mask; i++) {
2129 		if (sc->rx_big.info[i].m == NULL)
2130 			continue;
2131 		bus_dmamap_unload(sc->rx_big.dmat,
2132 				  sc->rx_big.info[i].map);
2133 		m_freem(sc->rx_big.info[i].m);
2134 		sc->rx_big.info[i].m = NULL;
2135 	}
2136 
2137 	for (i = 0; i <= sc->tx.mask; i++) {
2138 		sc->tx.info[i].flag = 0;
2139 		if (sc->tx.info[i].m == NULL)
2140 			continue;
2141 		bus_dmamap_unload(sc->tx.dmat,
2142 				  sc->tx.info[i].map);
2143 		m_freem(sc->tx.info[i].m);
2144 		sc->tx.info[i].m = NULL;
2145 	}
2146 }
2147 
2148 static void
2149 mxge_free_rings(mxge_softc_t *sc)
2150 {
2151 	int i;
2152 
2153 	if (sc->tx.req_bytes != NULL)
2154 		free(sc->tx.req_bytes, M_DEVBUF);
2155 	if (sc->tx.seg_list != NULL)
2156 		free(sc->tx.seg_list, M_DEVBUF);
2157 	if (sc->rx_small.shadow != NULL)
2158 		free(sc->rx_small.shadow, M_DEVBUF);
2159 	if (sc->rx_big.shadow != NULL)
2160 		free(sc->rx_big.shadow, M_DEVBUF);
2161 	if (sc->tx.info != NULL) {
2162 		if (sc->tx.dmat != NULL) {
2163 			for (i = 0; i <= sc->tx.mask; i++) {
2164 				bus_dmamap_destroy(sc->tx.dmat,
2165 						   sc->tx.info[i].map);
2166 			}
2167 			bus_dma_tag_destroy(sc->tx.dmat);
2168 		}
2169 		free(sc->tx.info, M_DEVBUF);
2170 	}
2171 	if (sc->rx_small.info != NULL) {
2172 		if (sc->rx_small.dmat != NULL) {
2173 			for (i = 0; i <= sc->rx_small.mask; i++) {
2174 				bus_dmamap_destroy(sc->rx_small.dmat,
2175 						   sc->rx_small.info[i].map);
2176 			}
2177 			bus_dmamap_destroy(sc->rx_small.dmat,
2178 					   sc->rx_small.extra_map);
2179 			bus_dma_tag_destroy(sc->rx_small.dmat);
2180 		}
2181 		free(sc->rx_small.info, M_DEVBUF);
2182 	}
2183 	if (sc->rx_big.info != NULL) {
2184 		if (sc->rx_big.dmat != NULL) {
2185 			for (i = 0; i <= sc->rx_big.mask; i++) {
2186 				bus_dmamap_destroy(sc->rx_big.dmat,
2187 						   sc->rx_big.info[i].map);
2188 			}
2189 			bus_dmamap_destroy(sc->rx_big.dmat,
2190 					   sc->rx_big.extra_map);
2191 			bus_dma_tag_destroy(sc->rx_big.dmat);
2192 		}
2193 		free(sc->rx_big.info, M_DEVBUF);
2194 	}
2195 }
2196 
/*
 * Ask the firmware for its send/receive ring sizes, then allocate all
 * host-side ring state: the 8-byte-aligned tx request copy block, the
 * tx busdma segment list, the rx shadow rings, the per-slot info
 * arrays, one busdma tag per ring, and a DMA map per slot (plus one
 * spare "extra" map per rx ring).  Returns 0 on success or an errno;
 * on failure mxge_free_rings() releases whatever was allocated.
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	/* entry counts derive from the byte sizes; masks are count-1 */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	sc->tx.mask = tx_ring_entries - 1;
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;

	err = ENOMEM;

	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
	/* NOTE(review): malloc(9) with M_WAITOK sleeps rather than
	   return NULL, so these NULL checks are defensive only */
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_nothing;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC/2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 4096,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	/* spare map used when replacing a loaded buffer fails */
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2380 
/*
 * Bring the interface up: reset the NIC, fetch the lanai ring
 * pointers, stock both receive rings with mbufs, program MTU/buffer
 * sizes and the stats DMA block into the firmware, and finally issue
 * ETHERNET_UP.  Returns 0 on success or an errno; on failure any
 * mbufs already posted are reclaimed.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err;
	bus_dmamap_t map;
	bus_addr_t bus;


	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}
	/* clear the interrupt queue so stale entries are not replayed */
	bzero(sc->rx_done.entry,
	      mxge_max_intr_slots * sizeof(*sc->rx_done.entry));

	/* pick big-buffer size: a regular cluster if the MTU fits,
	   else a page-sized jumbo cluster */
	if (MCLBYTES >=
	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
		sc->big_bytes = MCLBYTES;
	else
		sc->big_bytes = MJUMPAGESIZE;


	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* use the write-combining fifos only if WC mapping succeeded */
	if (sc->wc) {
		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
	} else {
		sc->tx.wc_fifo = 0;
		sc->rx_small.wc_fifo = 0;
		sc->rx_big.wc_fifo = 0;
	}


	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	for (i = 0; i <= sc->rx_big.mask; i++) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = sc->big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	/* fall back to the obsolete stats DMA command for old
	   firmware that lacks STATS_DMA_V2 */
	if (err != 0) {
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
2516 
/*
 * Bring the interface down: send ETHERNET_DOWN to the firmware, wait
 * briefly for the resulting "down" interrupt (observed via
 * sc->down_cnt advancing), then reclaim all posted mbufs.  Always
 * returns 0.
 */
static int
mxge_close(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;

	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	old_down_cnt = sc->down_cnt;
	/* order the flag/count updates before the firmware command */
	mb();
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring down link\n");
	}
	if (old_down_cnt == sc->down_cnt) {
		/* wait for down irq */
		DELAY(10 * sc->intr_coal_delay);
	}
	if (old_down_cnt == sc->down_cnt) {
		device_printf(sc->dev, "never got down irq\n");
	}

	mxge_free_mbufs(sc);

	return 0;
}
2542 
/*
 * Program the PCI config space bits the driver relies on: record the
 * negotiated PCIe link width, raise the max read request size, and
 * enable bus mastering and memory-space decoding.  Also re-run from
 * the watchdog path after a NIC reboot wipes config space.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* reg+0x12 is the PCIe Link Status register; bits
		   9:4 hold the negotiated link width */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		/* reg+0x8 is Device Control; field 14:12 = 5
		   selects a 4096-byte max read request */
		pectl = pci_read_config(dev, reg + 0x8, 2);
		pectl = (pectl & ~0x7000) | (5 << 12);
		pci_write_config(dev, reg + 0x8, pectl, 2);
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
2566 
/*
 * Read the NIC's reboot status register through the vendor-specific
 * PCI capability window (usable even when the SRAM mapping is dead
 * after a NIC reboot).  Returns the status word, or (uint32_t)-1 if
 * the capability cannot be found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	return (pci_read_config(dev, vs + 0x14, 4));
}
2585 
/*
 * Recover from a transmit hang detected by mxge_watchdog().  First
 * determine whether the NIC rebooted (config space reads as all-ones
 * or bus mastering got cleared); if it merely stalled, dump ring
 * state and cycle the interface with mxge_close()/mxge_open().  If
 * the NIC cannot be recovered, the watchdog callout is stopped so the
 * console is not spammed.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	int err;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
			goto abort;
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		/* restore PCI configuration space */

		/* XXXX waiting for pci_cfg_restore() to be exported */
		goto abort; /* just abort for now */

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);
	} else {
		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
			      sc->tx.req, sc->tx.done);
		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
			      sc->tx.pkt_done,
			      be32toh(sc->fw_stats->send_done_count));
	}

	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}

abort:
	/*
	 * stop the watchdog if the nic is dead, to avoid spamming the
	 * console
	 */
	if (err != 0) {
		callout_stop(&sc->co_hdl);
	}
}
2653 
2654 static void
2655 mxge_watchdog(mxge_softc_t *sc)
2656 {
2657 	mxge_tx_buf_t *tx = &sc->tx;
2658 
2659 	/* see if we have outstanding transmits, which
2660 	   have been pending for more than mxge_ticks */
2661 	if (tx->req != tx->done &&
2662 	    tx->watchdog_req != tx->watchdog_done &&
2663 	    tx->done == tx->watchdog_done)
2664 		mxge_watchdog_reset(sc);
2665 
2666 	tx->watchdog_req = tx->req;
2667 	tx->watchdog_done = tx->done;
2668 }
2669 
/*
 * Periodic callout, scheduled every mxge_ticks and run with
 * sc->driver_mtx held (callout_init_mtx).  Re-arms itself and runs
 * the transmit watchdog.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;


	/* Synchronize with possible callout reset/stop. */
	if (callout_pending(&sc->co_hdl) ||
	    !callout_active(&sc->co_hdl)) {
		mtx_unlock(&sc->driver_mtx);
		return;
	}

	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	mxge_watchdog(sc);
}
2686 
/*
 * ifmedia change handler.  The media is fixed (only IFM_AUTO is
 * registered in mxge_attach), so any change request is rejected.
 */
static int
mxge_media_change(struct ifnet *ifp)
{
	return EINVAL;
}
2692 
/*
 * Change the interface MTU.  Validates the range, then if the
 * interface is running, cycles it (close/open) so the firmware and
 * receive buffers are re-sized; on failure the old MTU is restored
 * and the interface re-opened with it.  Returns 0 or an errno.
 */
static int
mxge_change_mtu(mxge_softc_t *sc, int mtu)
{
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;
	int err = 0;


	real_mtu = mtu + ETHER_HDR_LEN;
	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
	    real_mtu < 60)
		return EINVAL;
	mtx_lock(&sc->driver_mtx);
	old_mtu = ifp->if_mtu;
	ifp->if_mtu = mtu;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		callout_stop(&sc->co_hdl);
		mxge_close(sc);
		err = mxge_open(sc);
		if (err != 0) {
			/* roll back to the old MTU and re-open */
			ifp->if_mtu = old_mtu;
			mxge_close(sc);
			(void) mxge_open(sc);
		}
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
	mtx_unlock(&sc->driver_mtx);
	return err;
}
2722 
2723 static void
2724 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2725 {
2726 	mxge_softc_t *sc = ifp->if_softc;
2727 
2728 
2729 	if (sc == NULL)
2730 		return;
2731 	ifmr->ifm_status = IFM_AVALID;
2732 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2733 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2734 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2735 }
2736 
/*
 * Interface ioctl handler.  Handles address, MTU, flag, multicast,
 * capability and media requests; everything else falls through to
 * ENOTTY.  Flag/multicast/capability paths run under sc->driver_mtx.
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				err = mxge_open(sc);
				callout_reset(&sc->co_hdl, mxge_ticks,
					      mxge_tick, sc);
			} else {
				/* take care of promisc and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc);
				callout_stop(&sc->co_hdl);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				/* disabling tx csum also disables TSO,
				   which depends on it */
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCGIFMEDIA:
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
                break;

	default:
		err = ENOTTY;
        }
	return err;
}
2832 
/*
 * Fetch hw.mxge.* tunables from the kernel environment and clamp
 * them to usable values.  Called once, early in mxge_attach().
 */
static void
mxge_fetch_tunables(mxge_softc_t *sc)
{

	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
			  &mxge_flow_control);
	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
			  &mxge_intr_coal_delay);
	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
			  &mxge_nvidia_ecrc_enable);
	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
			  &mxge_force_firmware);
	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
			  &mxge_deassert_wait);
	TUNABLE_INT_FETCH("hw.mxge.verbose",
			  &mxge_verbose);
	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);

	/* boot -v implies driver verbosity */
	if (bootverbose)
		mxge_verbose = 1;
	/* keep interrupt coalescing within 0..10ms; default to 30us */
	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
		mxge_intr_coal_delay = 30;
	/* watchdog period defaults to one second */
	if (mxge_ticks == 0)
		mxge_ticks = hz;
	sc->pause = mxge_flow_control;
}
2859 
/*
 * Device attach.  Acquires resources in order -- parent busdma tag,
 * ifnet, mutexes/callout, PCI config setup, BAR mapping, EEPROM
 * strings, out-of-band DMA blocks, interrupt (MSI if available),
 * firmware load/reset, rings, interrupt handler -- then hooks the
 * interface into the network stack.  Errors unwind through the
 * goto-label chain in strict reverse order of acquisition.
 */
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	size_t bytes;
	int count, rid, err;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* top-level DMA tag from which all ring tags are derived */
	err = bus_dma_tag_create(NULL,			/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 4096,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_nothing;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->tx_mtx_name, sizeof(sc->tx_mtx_name), "%s:tx",
		 device_get_nameunit(dev));
	mtx_init(&sc->tx_mtx, sc->tx_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	/* watchdog callout runs with driver_mtx held */
	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
			     sizeof (*sc->fw_stats), 64);
	if (err != 0)
		goto abort_with_zeropad_dma;
	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_fw_stats;

	/* allocate interrupt queues */
	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
	if (err != 0)
		goto abort_with_dmabench;
	sc->rx_done.entry = sc->rx_done.dma.addr;
	bzero(sc->rx_done.entry, bytes);

	/* Add our ithread  */
	count = pci_msi_count(dev);
	if (count == 1 && pci_alloc_msi(dev, &count) == 0) {
		rid = 1;
		sc->msi_enabled = 1;
	} else {
		rid = 0;
	}
	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
					 1, RF_SHAREABLE | RF_ACTIVE);
	if (sc->irq_res == NULL) {
		device_printf(dev, "could not alloc interrupt\n");
		goto abort_with_rx_done;
	}
	if (mxge_verbose)
		device_printf(dev, "using %s irq %ld\n",
			      sc->msi_enabled ? "MSI" : "INTx",
			      rman_get_start(sc->irq_res));
	/* load the firmware */
	mxge_select_firmware(sc);

	err = mxge_load_firmware(sc);
	if (err != 0)
		goto abort_with_irq_res;
	sc->intr_coal_delay = mxge_intr_coal_delay;
	err = mxge_reset(sc);
	if (err != 0)
		goto abort_with_irq_res;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_irq_res;
	}

	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
			     mxge_intr, sc, &sc->ih);
	if (err != 0) {
		goto abort_with_rings;
	}
	/* hook into the network stack */
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	/* NOTE(review): 100000000 is 100 Mb/s -- looks like a
	   placeholder for a 10 Gb/s part; verify before relying on
	   if_baudrate */
	ifp->if_baudrate = 100000000;
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_JUMBO_MTU;
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	sc->csum_flag = 1;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to 1500 */
	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;

	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
	mxge_add_sysctls(sc);
	return 0;

/* error unwind: strict reverse order of acquisition */
abort_with_rings:
	mxge_free_rings(sc);
abort_with_irq_res:
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);
abort_with_rx_done:
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_fw_stats:
	mxge_dma_free(&sc->fw_stats_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);

abort_with_nothing:
	return err;
}
3069 
/*
 * Device detach: stop the interface and watchdog, unhook from the
 * network stack, then release every resource acquired in
 * mxge_attach() in reverse order.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	mtx_lock(&sc->driver_mtx);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	callout_stop(&sc->co_hdl);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	ifmedia_removeall(&sc->media);
	/* tell the firmware to stop its dummy RDMA traffic */
	mxge_dummy_rdma(sc, 0);
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_rings(sc);
	bus_release_resource(dev, SYS_RES_IRQ,
			     sc->msi_enabled ? 1 : 0, sc->irq_res);
	if (sc->msi_enabled)
		pci_release_msi(dev);

	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->tx_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
3105 
/* Device shutdown method; nothing to do for this hardware. */
static int
mxge_shutdown(device_t dev)
{
	return 0;
}
3111 
3112 /*
3113   This file uses Myri10GE driver indentation.
3114 
3115   Local Variables:
3116   c-file-style:"linux"
3117   tab-width:8
3118   End:
3119 */
3120