xref: /freebsd/sys/dev/mxge/if_mxge.c (revision acd3428b7d3e94cef0e1881c868cb4b131d4ff41)
1 /******************************************************************************
2 
3 Copyright (c) 2006, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Myricom Inc, nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/linker.h>
40 #include <sys/firmware.h>
41 #include <sys/endian.h>
42 #include <sys/sockio.h>
43 #include <sys/mbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/kdb.h>
46 #include <sys/kernel.h>
47 #include <sys/module.h>
48 #include <sys/memrange.h>
49 #include <sys/socket.h>
50 #include <sys/sysctl.h>
51 #include <sys/sx.h>
52 
53 #include <net/if.h>
54 #include <net/if_arp.h>
55 #include <net/ethernet.h>
56 #include <net/if_dl.h>
57 #include <net/if_media.h>
58 
59 #include <net/bpf.h>
60 
61 #include <net/if_types.h>
62 #include <net/if_vlan_var.h>
63 #include <net/zlib.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/tcp.h>
69 
70 #include <machine/bus.h>
71 #include <machine/resource.h>
72 #include <sys/bus.h>
73 #include <sys/rman.h>
74 
75 #include <dev/pci/pcireg.h>
76 #include <dev/pci/pcivar.h>
77 
78 #include <vm/vm.h>		/* for pmap_mapdev() */
79 #include <vm/pmap.h>
80 
81 #include <dev/mxge/mxge_mcp.h>
82 #include <dev/mxge/mcp_gen_header.h>
83 #include <dev/mxge/if_mxge_var.h>
84 
/* tunable params (loader-settable defaults; not all are used in this chunk) */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on an upstream Nvidia bridge */
static int mxge_max_intr_slots = 1024;	/* number of rx_done interrupt queue entries */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay written to the NIC */
static int mxge_deassert_wait = 1;	/* NOTE(review): not referenced in this chunk — presumably irq deassert polling; confirm in mxge_intr() */
static int mxge_flow_control = 1;	/* NOTE(review): not referenced in this chunk — presumably default 802.3x pause setting; confirm in attach */
static int mxge_verbose = 0;		/* enable extra device_printf() chatter */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";	/* firmware for unaligned PCIe completions */
static char *mxge_fw_aligned = "mxge_eth_z8e";		/* firmware for aligned PCIe completions */
94 
/* newbus device method forward declarations */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

/* Dispatch table binding the generic device interface to this driver. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}	/* table terminator */
};

static driver_t mxge_driver =
{
  "mxge",		/* driver name */
  mxge_methods,
  sizeof(mxge_softc_t),	/* size of per-instance softc */
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware(9) is required for firmware_get()/firmware_put() below */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
123 
124 static int
125 mxge_probe(device_t dev)
126 {
127   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
128       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
129 	  device_set_desc(dev, "Myri10G-PCIE-8A");
130 	  return 0;
131   }
132   return ENXIO;
133 }
134 
135 static void
136 mxge_enable_wc(mxge_softc_t *sc)
137 {
138 	struct mem_range_desc mrdesc;
139 	vm_paddr_t pa;
140 	vm_offset_t len;
141 	int err, action;
142 
143 	pa = rman_get_start(sc->mem_res);
144 	len = rman_get_size(sc->mem_res);
145 	mrdesc.mr_base = pa;
146 	mrdesc.mr_len = len;
147 	mrdesc.mr_flags = MDF_WRITECOMBINE;
148 	action = MEMRANGE_SET_UPDATE;
149 	strcpy((char *)&mrdesc.mr_owner, "mxge");
150 	err = mem_range_attr_set(&mrdesc, &action);
151 	if (err != 0) {
152 		device_printf(sc->dev,
153 			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
154 			      (unsigned long)pa, (unsigned long)len, err);
155 	} else {
156 		sc->wc = 1;
157 	}
158 }
159 
160 
161 /* callback to get our DMA address */
162 static void
163 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
164 			 int error)
165 {
166 	if (error == 0) {
167 		*(bus_addr_t *) arg = segs->ds_addr;
168 	}
169 }
170 
/*
 * Allocate "bytes" of DMA-able memory with the requested alignment,
 * constrained to a single segment within a 4KB boundary, and record
 * the tag, map, kernel va (dma->addr) and bus address (dma->bus_addr)
 * in *dma.  Returns 0 on success or a bus_dma error code; on failure
 * every partially acquired resource is released via the goto chain.
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; mxge_dmamap_callback() stores the single
	   segment's bus address into dma->bus_addr */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
221 
222 
/*
 * Release a region obtained from mxge_dma_alloc(): unload the mapping,
 * free the memory, then destroy the tag — the reverse of allocation.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
230 
/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 *
 * Parse them into sc->mac_addr, sc->product_code_string and
 * sc->serial_number_string.  Returns 0 if a MAC address was found,
 * ENXIO otherwise.
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* advance ptr past the current NUL-terminated string (argument unused) */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* ptr+1 plus the loop's first "ptr += 3" lands on
			   the first hex digit past "MAC=".
			   NOTE(review): this leaves mac_addr_string
			   pointing at "AC=xx:..." rather than the first
			   hex digit — verify against its consumers. */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* NOTE(review): strncpy() does not NUL-terminate
			   on truncation; relies on the softc being zeroed */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
280 
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (nForce4-class) bridge
 * by setting bit 0x40 in the extended config register at offset 0x178.
 * Because the kernel's PCI config accessors do not reach extended
 * (>0xff) config space, the bridge's memory-mapped config window is
 * mapped directly with pmap_mapdev().  x86/amd64 only.
 * Returns 0 on success, EIO if the mapping or sanity checks fail.
 */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* ask the parent bus for the bridge's bus/slot/function and
	   expected vendor/device ids */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of the device's config window:
	   base 0xe0000000 + 1MB per bus + 4KB per function */
	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	/* extended config register 0x178; bit 0x40 enables ECRC */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
#else
/* Non-x86 stub: the nForce4 mapping trick above is x86-specific. */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
#endif
380 /*
381  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
382  * when the PCI-E Completion packets are aligned on an 8-byte
383  * boundary.  Some PCI-E chip sets always align Completion packets; on
384  * the ones that do not, the alignment can be enforced by enabling
385  * ECRC generation (if supported).
386  *
387  * When PCI-E Completion packets are not aligned, it is actually more
388  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
389  *
390  * If the driver can neither enable ECRC nor verify that it has
391  * already been enabled, then it must use a firmware image which works
392  * around unaligned completion packets (ethp_z8e.dat), and it should
393  * also ensure that it never gives the device a Read-DMA which is
394  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
395  * enabled, then the driver should use the aligned (eth_z8e.dat)
396  * firmware image, and set tx.boundary to 4KB.
397  */
398 
399 static void
400 mxge_select_firmware(mxge_softc_t *sc)
401 {
402 	int err, aligned = 0;
403 	device_t pdev;
404 	uint16_t pvend, pdid;
405 
406 	pdev = device_get_parent(device_get_parent(sc->dev));
407 	if (pdev == NULL) {
408 		device_printf(sc->dev, "could not find parent?\n");
409 		goto abort;
410 	}
411 	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
412 	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);
413 
414 	/* see if we can enable ECRC's on an upstream
415 	   Nvidia bridge */
416 	if (mxge_nvidia_ecrc_enable &&
417 	    (pvend == 0x10de && pdid == 0x005d)) {
418 		err = mxge_enable_nvidia_ecrc(sc, pdev);
419 		if (err == 0) {
420 			aligned = 1;
421 			if (mxge_verbose)
422 				device_printf(sc->dev,
423 					      "Assuming aligned completions"
424 					      " (ECRC)\n");
425 		}
426 	}
427 	/* see if the upstream bridge is known to
428 	   provided aligned completions */
429 	if (/* HT2000 */ (pvend == 0x1166 && pdid == 0x0132) ||
430 	    /* PLX */    (pvend == 0x10b5 && pdid == 0x8532) ||
431 	    /* Intel */   (pvend == 0x8086 &&
432 			   /* E5000 */(pdid >= 0x25f7 && pdid <= 0x25fa))) {
433 		if (mxge_verbose)
434 			device_printf(sc->dev,
435 				      "Assuming aligned completions "
436 				      "(0x%x:0x%x)\n", pvend, pdid);
437 	}
438 
439 abort:
440 	if (aligned) {
441 		sc->fw_name = mxge_fw_aligned;
442 		sc->tx.boundary = 4096;
443 	} else {
444 		sc->fw_name = mxge_fw_unaligned;
445 		sc->tx.boundary = 2048;
446 	}
447 }
448 
/*
 * Strip the const qualifier from the firmware image pointer so it can
 * be handed to mxge_pio_copy(), which takes a non-const source.  The
 * data is only ever read through rw_char.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
454 
455 static int
456 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
457 {
458 	int major, minor;
459 
460 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
461 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
462 			      be32toh(hdr->mcp_type));
463 		return EIO;
464 	}
465 
466 	/* save firmware version for sysctl */
467 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
468 	if (mxge_verbose)
469 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
470 
471 	sscanf(sc->fw_version, "%d.%d", &major, &minor);
472 
473 	if (!(major == MXGEFW_VERSION_MAJOR
474 	      && minor == MXGEFW_VERSION_MINOR)) {
475 		device_printf(sc->dev, "Found firmware version %s\n",
476 			      sc->fw_version);
477 		device_printf(sc->dev, "Driver needs %d.%d\n",
478 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
479 		return EINVAL;
480 	}
481 	return 0;
482 
483 }
484 
/*
 * Fetch the firmware image named by sc->fw_name via firmware(9),
 * validate its embedded header, and copy it into NIC SRAM at
 * MXGE_FW_OFFSET in 256-byte PIO bursts.  On entry *limit is the
 * maximum acceptable image size (SRAM size); on success it is updated
 * to the actual image size.  Returns 0 on success or an errno.
 */
static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	/* image must fit in SRAM and be big enough to hold the
	   header-pointer word */
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	/* NOTE(review): htobe32() is used where be32toh() is meant;
	   both perform the same 32-bit swap, so the result is correct */
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	/* launder away the const qualifier for mxge_pio_copy() */
	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		/* readback flushes the posted PIO writes */
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}
545 
/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* round the stack buffer up to an 8-byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	/* hand the command block to the boot firmware by PIO */
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll up to ~20ms for the firmware's -1 confirmation */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
602 
603 static int
604 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
605 {
606 	mcp_cmd_t *buf;
607 	char buf_bytes[sizeof(*buf) + 8];
608 	volatile mcp_cmd_response_t *response = sc->cmd;
609 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
610 	uint32_t dma_low, dma_high;
611 	int sleep_total = 0;
612 
613 	/* ensure buf is aligned to 8 bytes */
614 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
615 
616 	buf->data0 = htobe32(data->data0);
617 	buf->data1 = htobe32(data->data1);
618 	buf->data2 = htobe32(data->data2);
619 	buf->cmd = htobe32(cmd);
620 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
621 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
622 
623 	buf->response_addr.low = htobe32(dma_low);
624 	buf->response_addr.high = htobe32(dma_high);
625 	mtx_lock(&sc->cmd_lock);
626 	response->result = 0xffffffff;
627 	mb();
628 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
629 
630 	/* wait up to 20ms */
631 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
632 		bus_dmamap_sync(sc->cmd_dma.dmat,
633 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
634 		mb();
635 		if (response->result != 0xffffffff) {
636 			if (response->result == 0) {
637 				data->data0 = be32toh(response->data);
638 				mtx_unlock(&sc->cmd_lock);
639 				return 0;
640 			} else {
641 				device_printf(sc->dev,
642 					      "mxge: command %d "
643 					      "failed, result = %d\n",
644 					      cmd, be32toh(response->result));
645 				mtx_unlock(&sc->cmd_lock);
646 				return ENXIO;
647 			}
648 		}
649 		DELAY(1000);
650 	}
651 	mtx_unlock(&sc->cmd_lock);
652 	device_printf(sc->dev, "mxge: command %d timed out"
653 		      "result = %d\n",
654 		      cmd, be32toh(response->result));
655 	return EAGAIN;
656 }
657 
/*
 * When loading a fresh image fails, try to keep using the firmware
 * already running on the NIC: locate its header in SRAM, copy the
 * header to host memory, and validate it.  Returns 0 if the running
 * firmware is usable, or an errno.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	/* NOTE(review): htobe32() where be32toh() is meant — same
	   32-bit swap, so the value is correct */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	/* byte-wise read keeps the device access well-defined */
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);
	return status;
}
691 
692 
/*
 * Load (or adopt) firmware and hand control to it.  First the image
 * named by sc->fw_name is copied into SRAM; if that fails, the
 * firmware already running on the NIC is validated and adopted
 * instead.  Then a handoff command block is PIO'd to the bootstrap
 * MCP, which confirms by DMA-ing -1 into the confirmation word.
 * Returns 0 on success or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* round the stack buffer up to an 8-byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}

	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll up to ~200ms for the firmware's -1 confirmation */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
773 
774 static int
775 mxge_update_mac_address(mxge_softc_t *sc)
776 {
777 	mxge_cmd_t cmd;
778 	uint8_t *addr = sc->mac_addr;
779 	int status;
780 
781 
782 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
783 		     | (addr[2] << 8) | addr[3]);
784 
785 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
786 
787 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
788 	return status;
789 }
790 
791 static int
792 mxge_change_pause(mxge_softc_t *sc, int pause)
793 {
794 	mxge_cmd_t cmd;
795 	int status;
796 
797 	if (pause)
798 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
799 				       &cmd);
800 	else
801 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
802 				       &cmd);
803 
804 	if (status) {
805 		device_printf(sc->dev, "Failed to set flow control mode\n");
806 		return ENXIO;
807 	}
808 	sc->pause = pause;
809 	return 0;
810 }
811 
812 static void
813 mxge_change_promisc(mxge_softc_t *sc, int promisc)
814 {
815 	mxge_cmd_t cmd;
816 	int status;
817 
818 	if (promisc)
819 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
820 				       &cmd);
821 	else
822 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
823 				       &cmd);
824 
825 	if (status) {
826 		device_printf(sc->dev, "Failed to set promisc mode\n");
827 	}
828 }
829 
/*
 * Push the interface's multicast filter state to the firmware:
 * temporarily accept all multicast, flush the firmware's filter list,
 * then re-add every AF_LINK multicast address and re-enable
 * filtering.  Any command failure leaves the NIC in all-multi mode
 * (safe: it receives too much rather than too little).
 */
static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}


	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		/* split the 6-byte link address across data0 (4 bytes)
		   and data1 (2 bytes), then convert to network order */
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}
895 
896 
897 static int
898 mxge_reset(mxge_softc_t *sc)
899 {
900 
901 	mxge_cmd_t cmd;
902 	mxge_dma_t dmabench_dma;
903 	size_t bytes;
904 	int status;
905 
906 	/* try to send a reset command to the card to see if it
907 	   is alive */
908 	memset(&cmd, 0, sizeof (cmd));
909 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
910 	if (status != 0) {
911 		device_printf(sc->dev, "failed reset\n");
912 		return ENXIO;
913 	}
914 
915 	mxge_dummy_rdma(sc, 1);
916 
917 	/* Now exchange information about interrupts  */
918 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
919 	memset(sc->rx_done.entry, 0, bytes);
920 	cmd.data0 = (uint32_t)bytes;
921 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
922 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
923 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
924 	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
925 
926 	status |= mxge_send_cmd(sc,
927 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
928 
929 
930 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
931 
932 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
933 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
934 
935 
936 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
937 				&cmd);
938 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
939 	if (status != 0) {
940 		device_printf(sc->dev, "failed set interrupt parameters\n");
941 		return status;
942 	}
943 
944 
945 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
946 
947 
948 	/* run a DMA benchmark */
949 	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
950 	status = mxge_dma_alloc(sc, &dmabench_dma, 4096, 4096);
951 	if (status)
952 		goto dmabench_fail;
953 
954 	/* Read DMA */
955 	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
956 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
957 	cmd.data2 = sc->tx.boundary * 0x10000;
958 
959 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
960 	if (status != 0)
961 		device_printf(sc->dev, "read dma benchmark failed\n");
962 	else
963 		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
964 			(cmd.data0 & 0xffff);
965 
966 	/* Write DMA */
967 	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
968 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
969 	cmd.data2 = sc->tx.boundary * 0x1;
970 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
971 	if (status != 0)
972 		device_printf(sc->dev, "write dma benchmark failed\n");
973 	else
974 		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
975 			(cmd.data0 & 0xffff);
976 	/* Read/Write DMA */
977 	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
978 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
979 	cmd.data2 = sc->tx.boundary * 0x10001;
980 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
981 	if (status != 0)
982 		device_printf(sc->dev, "read/write dma benchmark failed\n");
983 	else
984 		sc->read_write_dma =
985 			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
986 			(cmd.data0 & 0xffff);
987 
988 	mxge_dma_free(&dmabench_dma);
989 
990 dmabench_fail:
991 	/* reset mcp/driver shared state back to 0 */
992 	bzero(sc->rx_done.entry, bytes);
993 	sc->rx_done.idx = 0;
994 	sc->rx_done.cnt = 0;
995 	sc->tx.req = 0;
996 	sc->tx.done = 0;
997 	sc->tx.pkt_done = 0;
998 	sc->rx_big.cnt = 0;
999 	sc->rx_small.cnt = 0;
1000 	sc->rdma_tags_available = 15;
1001 	status = mxge_update_mac_address(sc);
1002 	mxge_change_promisc(sc, 0);
1003 	mxge_change_pause(sc, sc->pause);
1004 	mxge_set_multicast_list(sc);
1005 	return status;
1006 }
1007 
1008 static int
1009 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1010 {
1011         mxge_softc_t *sc;
1012         unsigned int intr_coal_delay;
1013         int err;
1014 
1015         sc = arg1;
1016         intr_coal_delay = sc->intr_coal_delay;
1017         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1018         if (err != 0) {
1019                 return err;
1020         }
1021         if (intr_coal_delay == sc->intr_coal_delay)
1022                 return 0;
1023 
1024         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1025                 return EINVAL;
1026 
1027 	sx_xlock(&sc->driver_lock);
1028 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1029 	sc->intr_coal_delay = intr_coal_delay;
1030 
1031 	sx_xunlock(&sc->driver_lock);
1032         return err;
1033 }
1034 
1035 static int
1036 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1037 {
1038         mxge_softc_t *sc;
1039         unsigned int enabled;
1040         int err;
1041 
1042         sc = arg1;
1043         enabled = sc->pause;
1044         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1045         if (err != 0) {
1046                 return err;
1047         }
1048         if (enabled == sc->pause)
1049                 return 0;
1050 
1051 	sx_xlock(&sc->driver_lock);
1052 	err = mxge_change_pause(sc, enabled);
1053 	sx_xunlock(&sc->driver_lock);
1054         return err;
1055 }
1056 
1057 static int
1058 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1059 {
1060         int err;
1061 
1062         if (arg1 == NULL)
1063                 return EFAULT;
1064         arg2 = be32toh(*(int *)arg1);
1065         arg1 = NULL;
1066         err = sysctl_handle_int(oidp, arg1, arg2, req);
1067 
1068         return err;
1069 }
1070 
1071 static void
1072 mxge_add_sysctls(mxge_softc_t *sc)
1073 {
1074 	struct sysctl_ctx_list *ctx;
1075 	struct sysctl_oid_list *children;
1076 	mcp_irq_data_t *fw;
1077 
1078 	ctx = device_get_sysctl_ctx(sc->dev);
1079 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1080 	fw = sc->fw_stats;
1081 
1082 	/* random information */
1083 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1084 		       "firmware_version",
1085 		       CTLFLAG_RD, &sc->fw_version,
1086 		       0, "firmware version");
1087 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1088 		       "serial_number",
1089 		       CTLFLAG_RD, &sc->serial_number_string,
1090 		       0, "serial number");
1091 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1092 		       "product_code",
1093 		       CTLFLAG_RD, &sc->product_code_string,
1094 		       0, "product_code");
1095 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1096 		       "tx_boundary",
1097 		       CTLFLAG_RD, &sc->tx.boundary,
1098 		       0, "tx_boundary");
1099 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1100 		       "write_combine",
1101 		       CTLFLAG_RD, &sc->wc,
1102 		       0, "write combining PIO?");
1103 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1104 		       "read_dma_MBs",
1105 		       CTLFLAG_RD, &sc->read_dma,
1106 		       0, "DMA Read speed in MB/s");
1107 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1108 		       "write_dma_MBs",
1109 		       CTLFLAG_RD, &sc->write_dma,
1110 		       0, "DMA Write speed in MB/s");
1111 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1112 		       "read_write_dma_MBs",
1113 		       CTLFLAG_RD, &sc->read_write_dma,
1114 		       0, "DMA concurrent Read/Write speed in MB/s");
1115 
1116 
1117 	/* performance related tunables */
1118 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1119 			"intr_coal_delay",
1120 			CTLTYPE_INT|CTLFLAG_RW, sc,
1121 			0, mxge_change_intr_coal,
1122 			"I", "interrupt coalescing delay in usecs");
1123 
1124 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1125 			"flow_control_enabled",
1126 			CTLTYPE_INT|CTLFLAG_RW, sc,
1127 			0, mxge_change_flow_control,
1128 			"I", "interrupt coalescing delay in usecs");
1129 
1130 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1131 		       "deassert_wait",
1132 		       CTLFLAG_RW, &mxge_deassert_wait,
1133 		       0, "Wait for IRQ line to go low in ihandler");
1134 
1135 	/* stats block from firmware is in network byte order.
1136 	   Need to swap it */
1137 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1138 			"link_up",
1139 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1140 			0, mxge_handle_be32,
1141 			"I", "link up");
1142 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1143 			"rdma_tags_available",
1144 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1145 			0, mxge_handle_be32,
1146 			"I", "rdma_tags_available");
1147 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1148 			"dropped_link_overflow",
1149 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1150 			0, mxge_handle_be32,
1151 			"I", "dropped_link_overflow");
1152 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1153 			"dropped_link_error_or_filtered",
1154 			CTLTYPE_INT|CTLFLAG_RD,
1155 			&fw->dropped_link_error_or_filtered,
1156 			0, mxge_handle_be32,
1157 			"I", "dropped_link_error_or_filtered");
1158 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1159 			"dropped_multicast_filtered",
1160 			CTLTYPE_INT|CTLFLAG_RD,
1161 			&fw->dropped_multicast_filtered,
1162 			0, mxge_handle_be32,
1163 			"I", "dropped_multicast_filtered");
1164 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1165 			"dropped_runt",
1166 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1167 			0, mxge_handle_be32,
1168 			"I", "dropped_runt");
1169 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1170 			"dropped_overrun",
1171 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1172 			0, mxge_handle_be32,
1173 			"I", "dropped_overrun");
1174 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1175 			"dropped_no_small_buffer",
1176 			CTLTYPE_INT|CTLFLAG_RD,
1177 			&fw->dropped_no_small_buffer,
1178 			0, mxge_handle_be32,
1179 			"I", "dropped_no_small_buffer");
1180 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1181 			"dropped_no_big_buffer",
1182 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1183 			0, mxge_handle_be32,
1184 			"I", "dropped_no_big_buffer");
1185 
1186 	/* host counters exported for debugging */
1187 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1188 		       "rx_small_cnt",
1189 		       CTLFLAG_RD, &sc->rx_small.cnt,
1190 		       0, "rx_small_cnt");
1191 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1192 		       "rx_big_cnt",
1193 		       CTLFLAG_RD, &sc->rx_big.cnt,
1194 		       0, "rx_small_cnt");
1195 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1196 		       "tx_req",
1197 		       CTLFLAG_RD, &sc->tx.req,
1198 		       0, "tx_req");
1199 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1200 		       "tx_done",
1201 		       CTLFLAG_RD, &sc->tx.done,
1202 		       0, "tx_done");
1203 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1204 		       "tx_pkt_done",
1205 		       CTLFLAG_RD, &sc->tx.pkt_done,
1206 		       0, "tx_done");
1207 
1208 	/* verbose printing? */
1209 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1210 		       "verbose",
1211 		       CTLFLAG_RW, &mxge_verbose,
1212 		       0, "verbose printing");
1213 
1214 }
1215 
1216 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1217    backwards one at a time and handle ring wraps */
1218 
1219 static inline void
1220 mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1221 			    mcp_kreq_ether_send_t *src, int cnt)
1222 {
1223         int idx, starting_slot;
1224         starting_slot = tx->req;
1225         while (cnt > 1) {
1226                 cnt--;
1227                 idx = (starting_slot + cnt) & tx->mask;
1228                 mxge_pio_copy(&tx->lanai[idx],
1229 			      &src[cnt], sizeof(*src));
1230                 mb();
1231         }
1232 }
1233 
1234 /*
1235  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1236  * at most 32 bytes at a time, so as to avoid involving the software
1237  * pio handler in the nic.   We re-write the first segment's flags
1238  * to mark them valid only after writing the entire chain
1239  */
1240 
/*
 * Submit "cnt" send descriptors to the NIC via PIO.  The first
 * descriptor's flags byte is cleared before the copy and re-written
 * last, so the firmware only sees a valid chain once it is complete.
 */
static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* Stash the first descriptor's flags and zero them so the NIC
	   ignores the chain until we restore them at the end. */
	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* No ring wrap: burst descriptors forward in pairs
		   (32 bytes at a time). */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
	/* i < cnt here when cnt is odd (forward path leaves the last
	   descriptor uncopied) or after the backwards path (i == 0). */
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
1289 
1290 static inline void
1291 mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1292 {
1293     tx->req += cnt;
1294     mb();
1295     while (cnt >= 4) {
1296 	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
1297 	    mb();
1298 	    src += 4;
1299 	    cnt -= 4;
1300     }
1301     if (cnt > 0) {
1302 	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
1303 	       needs to be so that we don't overrun it */
1304 	    mxge_pio_copy(tx->wc_fifo + MXGEFW_ETH_SEND_OFFSET(cnt), src, 64);
1305 	    mb();
1306     }
1307 }
1308 
/*
 * Build and submit the send-descriptor chain for a TSO packet.  The
 * busdma segments are chopped at MSS boundaries so the firmware can
 * emit one wire frame per MSS of payload; rdma_count bookkeeping is
 * filled in retroactively (see the long comment below).  On overflow
 * of MXGE_MAX_SEND_DESC the mbuf is dropped.
 */
static void
mxge_encap_tso(mxge_softc_t *sc, struct mbuf *m, int busdma_seg_cnt)
{
	mxge_tx_buf_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ether_header *eh;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;	/* rate-limits the overflow printf below */

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < sizeof (*eh)
			    + sizeof (*ip))) {
		m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
			   sc->scratch);
		eh = (struct ether_header *)sc->scratch;
	} else {
		eh = mtod(m, struct ether_header *);
	}
	ip = (struct ip *) (eh + 1);
	/* second check once the real IP header length is known */
	if (__predict_false(m->m_len < sizeof (*eh) + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, sizeof (*eh) + (ip->ip_hl << 2)
			   + sizeof (*tcp),  sc->scratch);
		eh = (struct ether_header *) sc->scratch;
		ip = (struct ip *) (eh + 1);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	/* start cum_len at minus the total header length; it turns
	   non-negative exactly when the payload begins */
	cum_len = -(sizeof (*eh) + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &sc->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one ore more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seglen = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			cum_len_next = cum_len + seglen;
			/* back-fill the rdma count for the request that
			   started the current run of RDMAs */
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
				/* branch-free flag math: the multiplies
				   select the flag iff the condition is 1 */
				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
				flags_next |= next_is_first *
					MXGEFW_FLAGS_FIRST;
				rdma_count |= -(chop | next_is_first);
				rdma_count += chop & !next_is_first;
			} else if (cum_len_next >= 0) {
				/* header ends */
				rdma_count = -1;
				cum_len_next = 0;
				seglen = -cum_len;
				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
				flags_next = MXGEFW_FLAGS_TSO_PLD |
					MXGEFW_FLAGS_FIRST |
					(small * MXGEFW_FLAGS_SMALL);
			    }

			req->addr_high = high_swapped;
			req->addr_low = htobe32(low);
			req->pseudo_hdr_offset = pseudo_hdr_offset;
			req->pad = 0;
			req->rdma_count = 1;
			req->length = htobe16(seglen);
			req->cksum_offset = cksum_offset;
			req->flags = flags | ((cum_len & 1) *
					      MXGEFW_FLAGS_ALIGN_ODD);
			low += seglen;
			len -= seglen;
			cum_len = cum_len_next;
			flags = flags_next;
			req++;
			cnt++;
			rdma_count++;
			/* cksum_offset only applies until the header is
			   consumed; once past it, clear it */
			if (__predict_false(cksum_offset > seglen))
				cksum_offset -= seglen;
			else
				cksum_offset = 0;
			if (__predict_false(cnt > MXGE_MAX_SEND_DESC))
				goto drop;
		}
		busdma_seg_cnt--;
		seg++;
	}
	/* back-fill the rdma count of the final run */
	(req-rdma_count)->rdma_count = rdma_count;

	/* mark every trailing request of the last TSO segment */
	do {
		req--;
		req->flags |= MXGEFW_FLAGS_TSO_LAST;
	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));

	/* flag the last slot so mxge_tx_done knows a packet completed */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	sc->ifp->if_oerrors++;
	if (!once) {
		printf("MXGE_MAX_SEND_DESC exceeded via TSO!\n");
		printf("mss = %d, %ld!\n", mss, (long)seg - (long)tx->seg_list);
		once = 1;
	}
	return;

}
1470 
/*
 * Map an outgoing mbuf chain for DMA and hand it to the NIC as a list
 * of send descriptors.  Falls back to m_defrag() when the chain has
 * too many segments, diverts TSO frames to mxge_encap_tso(), pads
 * runts to the 60-byte ethernet minimum, and drops the frame on any
 * unrecoverable mapping error.
 */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	/* remember the mbuf so mxge_tx_done can free it later */
	tx->info[idx].m = m;


	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(sc, m, cnt);
		return;
	}

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < sizeof (*eh)
				    + sizeof (*ip))) {
			m_copydata(m, 0, sizeof (*eh) + sizeof (*ip),
				   sc->scratch);
			eh = (struct ether_header *)sc->scratch;
		} else {
			eh = mtod(m, struct ether_header *);
		}
		ip = (struct ip *) (eh + 1);
		/* tell the firmware where the L4 checksum field lives */
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* cksum_offset is relative to each descriptor's start;
		   once past the header it becomes zero */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* point the extra descriptor at the preallocated
		   zero-filled pad buffer */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the last slot so mxge_tx_done knows a packet completed */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1622 
1623 
1624 
1625 
1626 static inline void
1627 mxge_start_locked(mxge_softc_t *sc)
1628 {
1629 	struct mbuf *m;
1630 	struct ifnet *ifp;
1631 
1632 	ifp = sc->ifp;
1633 	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1634 	       > MXGE_MAX_SEND_DESC) {
1635 
1636 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1637 		if (m == NULL) {
1638 			return;
1639 		}
1640 		/* let BPF see it */
1641 		BPF_MTAP(ifp, m);
1642 
1643 		/* give it to the nic */
1644 		mxge_encap(sc, m);
1645 	}
1646 	/* ran out of transmit slots */
1647 	sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1648 }
1649 
1650 static void
1651 mxge_start(struct ifnet *ifp)
1652 {
1653 	mxge_softc_t *sc = ifp->if_softc;
1654 
1655 
1656 	mtx_lock(&sc->tx_lock);
1657 	mxge_start_locked(sc);
1658 	mtx_unlock(&sc->tx_lock);
1659 }
1660 
1661 /*
1662  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1663  * at most 32 bytes at a time, so as to avoid involving the software
1664  * pio handler in the nic.   We re-write the first segment's low
1665  * DMA address to mark it valid only after we write the entire chunk
1666  * in a burst
1667  */
1668 static inline void
1669 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
1670 		mcp_kreq_ether_recv_t *src)
1671 {
1672 	uint32_t low;
1673 
1674 	low = src->addr_low;
1675 	src->addr_low = 0xffffffff;
1676 	mxge_pio_copy(dst, src, 8 * sizeof (*src));
1677 	mb();
1678 	dst->addr_low = low;
1679 	mb();
1680 }
1681 
1682 static int
1683 mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1684 {
1685 	bus_dma_segment_t seg;
1686 	struct mbuf *m;
1687 	mxge_rx_buf_t *rx = &sc->rx_small;
1688 	int cnt, err;
1689 
1690 	m = m_gethdr(M_DONTWAIT, MT_DATA);
1691 	if (m == NULL) {
1692 		rx->alloc_fail++;
1693 		err = ENOBUFS;
1694 		goto done;
1695 	}
1696 	m->m_len = MHLEN;
1697 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1698 				      &seg, &cnt, BUS_DMA_NOWAIT);
1699 	if (err != 0) {
1700 		m_free(m);
1701 		goto done;
1702 	}
1703 	rx->info[idx].m = m;
1704 	rx->shadow[idx].addr_low =
1705 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1706 	rx->shadow[idx].addr_high =
1707 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1708 
1709 done:
1710 	if ((idx & 7) == 7) {
1711 		if (rx->wc_fifo == NULL)
1712 			mxge_submit_8rx(&rx->lanai[idx - 7],
1713 					&rx->shadow[idx - 7]);
1714 		else {
1715 			mb();
1716 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1717 		}
1718         }
1719 	return err;
1720 }
1721 
1722 static int
1723 mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
1724 {
1725 	bus_dma_segment_t seg;
1726 	struct mbuf *m;
1727 	mxge_rx_buf_t *rx = &sc->rx_big;
1728 	int cnt, err;
1729 
1730 	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
1731 	if (m == NULL) {
1732 		rx->alloc_fail++;
1733 		err = ENOBUFS;
1734 		goto done;
1735 	}
1736 	m->m_len = sc->big_bytes;
1737 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
1738 				      &seg, &cnt, BUS_DMA_NOWAIT);
1739 	if (err != 0) {
1740 		m_free(m);
1741 		goto done;
1742 	}
1743 	rx->info[idx].m = m;
1744 	rx->shadow[idx].addr_low =
1745 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
1746 	rx->shadow[idx].addr_high =
1747 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
1748 
1749 done:
1750 	if ((idx & 7) == 7) {
1751 		if (rx->wc_fifo == NULL)
1752 			mxge_submit_8rx(&rx->lanai[idx - 7],
1753 					&rx->shadow[idx - 7]);
1754 		else {
1755 			mb();
1756 			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
1757 		}
1758         }
1759 	return err;
1760 }
1761 
1762 static inline void
1763 mxge_rx_csum(struct mbuf *m, int csum)
1764 {
1765 	struct ether_header *eh;
1766 	struct ip *ip;
1767 
1768 	eh = mtod(m, struct ether_header *);
1769 	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
1770 		ip = (struct ip *)(eh + 1);
1771 		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1772 				   ip->ip_p == IPPROTO_UDP)) {
1773 			m->m_pkthdr.csum_data = csum;
1774 			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1775 		}
1776 	}
1777 }
1778 
/*
 * Receive completion for a frame that spans one or more big buffers.
 * Chains the filled mbufs together, trims trailing garbage from the
 * last one, and passes the chain up the stack.  If a replacement
 * buffer cannot be allocated mid-chain, the frame is dropped and the
 * remaining slots are walked to keep the ring consistent.
 */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wunitialized */
	struct mbuf *m_prev = 0;	/* -Wunitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st MXGEFW_PAD bytes so
			 * that the packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
        if (m_head) {
		/* partial chain built: account for its slots and free it */
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
	/* consume and refill the remaining ring slots of this frame */
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}
1873 
1874 static inline void
1875 mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
1876 {
1877 	struct ifnet *ifp;
1878 	struct mbuf *m;
1879 	mxge_rx_buf_t *rx;
1880 	bus_dmamap_t old_map;
1881 	int idx;
1882 
1883 	ifp = sc->ifp;
1884 	rx = &sc->rx_small;
1885 	idx = rx->cnt & rx->mask;
1886 	rx->cnt++;
1887 	/* save a pointer to the received mbuf */
1888 	m = rx->info[idx].m;
1889 	/* try to replace the received mbuf */
1890 	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
1891 		/* drop the frame -- the old mbuf is re-cycled */
1892 		ifp->if_ierrors++;
1893 		return;
1894 	}
1895 
1896 	/* unmap the received buffer */
1897 	old_map = rx->info[idx].map;
1898 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
1899 	bus_dmamap_unload(rx->dmat, old_map);
1900 
1901 	/* swap the bus_dmamap_t's */
1902 	rx->info[idx].map = rx->extra_map;
1903 	rx->extra_map = old_map;
1904 
1905 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
1906 	 * aligned */
1907 	m->m_data += MXGEFW_PAD;
1908 
1909 	/* if the checksum is valid, mark it in the mbuf header */
1910 	if (sc->csum_flag)
1911 		mxge_rx_csum(m, csum);
1912 
1913 	/* pass the frame up the stack */
1914 	m->m_pkthdr.rcvif = ifp;
1915 	m->m_len = m->m_pkthdr.len = len;
1916 	ifp->if_ipackets++;
1917 	(*ifp->if_input)(ifp, m);
1918 }
1919 
1920 static inline void
1921 mxge_clean_rx_done(mxge_softc_t *sc)
1922 {
1923 	mxge_rx_done_t *rx_done = &sc->rx_done;
1924 	int limit = 0;
1925 	uint16_t length;
1926 	uint16_t checksum;
1927 
1928 
1929 	while (rx_done->entry[rx_done->idx].length != 0) {
1930 		length = ntohs(rx_done->entry[rx_done->idx].length);
1931 		rx_done->entry[rx_done->idx].length = 0;
1932 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
1933 		if (length <= (MHLEN - MXGEFW_PAD))
1934 			mxge_rx_done_small(sc, length, checksum);
1935 		else
1936 			mxge_rx_done_big(sc, length, checksum);
1937 		rx_done->cnt++;
1938 		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
1939 
1940 		/* limit potential for livelock */
1941 		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
1942 			break;
1943 
1944 	}
1945 }
1946 
1947 
/*
 * Reclaim completed transmit slots up to the firmware's packet-done
 * index (mcp_idx), freeing mbufs and DMA maps, then restart the send
 * queue if enough ring space has opened up.
 */
static inline void
mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx, limit;

	limit = 0;
	tx = &sc->tx;
	ifp = sc->ifp;
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ifp->if_opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		/* the flag marks the last slot of a packet (set by
		   mxge_encap); count one completed packet per flag */
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
		/* limit potential for livelock by only handling
		   2 full tx rings per call */
		if (__predict_false(++limit >  2 * tx->mask))
			break;
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
           its OK to send packets */

	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		mtx_lock(&sc->tx_lock);
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		mxge_start_locked(sc);
		mtx_unlock(&sc->tx_lock);
	}
}
1994 
/*
 * Interrupt handler.  Drains tx completions and rx descriptors until
 * the firmware's DMA'd stats block reports no more work, then handles
 * link-state / RDMA-timeout notifications and re-arms the interrupt
 * by writing the claim registers.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	/* lower legacy IRQ  */
	*sc->irq_deassert = 0;
	mb();
	if (!mxge_deassert_wait)
		/* don't wait for conf. that irq is low */
		stats->valid = 0;
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
		/* loop until the firmware clears valid (confirming the
		   IRQ line is low) when mxge_deassert_wait is set */
	} while (*((volatile uint8_t *) &stats->valid));

	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	/* NOTE(review): only the first claim write is conditional on
	   valid bit 0; the second is unconditional — presumably the tx
	   claim must always be written.  Confirm against firmware spec. */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
2057 
/*
 * if_watchdog handler.  Currently a stub: it only logs that it fired
 * and performs no recovery (no reset, no ring cleanup).
 */
static void
mxge_watchdog(struct ifnet *ifp)
{
	printf("%s called\n", __FUNCTION__);
}
2063 
/*
 * if_init handler.  Intentionally empty: the interface is brought up
 * via mxge_open() from the SIOCSIFFLAGS ioctl path instead.
 */
static void
mxge_init(void *arg)
{
}
2068 
2069 
2070 
2071 static void
2072 mxge_free_mbufs(mxge_softc_t *sc)
2073 {
2074 	int i;
2075 
2076 	for (i = 0; i <= sc->rx_big.mask; i++) {
2077 		if (sc->rx_big.info[i].m == NULL)
2078 			continue;
2079 		bus_dmamap_unload(sc->rx_big.dmat,
2080 				  sc->rx_big.info[i].map);
2081 		m_freem(sc->rx_big.info[i].m);
2082 		sc->rx_big.info[i].m = NULL;
2083 	}
2084 
2085 	for (i = 0; i <= sc->rx_big.mask; i++) {
2086 		if (sc->rx_big.info[i].m == NULL)
2087 			continue;
2088 		bus_dmamap_unload(sc->rx_big.dmat,
2089 				  sc->rx_big.info[i].map);
2090 		m_freem(sc->rx_big.info[i].m);
2091 		sc->rx_big.info[i].m = NULL;
2092 	}
2093 
2094 	for (i = 0; i <= sc->tx.mask; i++) {
2095 		if (sc->tx.info[i].m == NULL)
2096 			continue;
2097 		bus_dmamap_unload(sc->tx.dmat,
2098 				  sc->tx.info[i].map);
2099 		m_freem(sc->tx.info[i].m);
2100 		sc->tx.info[i].m = NULL;
2101 	}
2102 }
2103 
2104 static void
2105 mxge_free_rings(mxge_softc_t *sc)
2106 {
2107 	int i;
2108 
2109 	if (sc->tx.req_bytes != NULL)
2110 		free(sc->tx.req_bytes, M_DEVBUF);
2111 	if (sc->tx.seg_list != NULL)
2112 		free(sc->tx.seg_list, M_DEVBUF);
2113 	if (sc->rx_small.shadow != NULL)
2114 		free(sc->rx_small.shadow, M_DEVBUF);
2115 	if (sc->rx_big.shadow != NULL)
2116 		free(sc->rx_big.shadow, M_DEVBUF);
2117 	if (sc->tx.info != NULL) {
2118 		for (i = 0; i <= sc->tx.mask; i++) {
2119 			if (sc->tx.info[i].map != NULL)
2120 				bus_dmamap_destroy(sc->tx.dmat,
2121 						   sc->tx.info[i].map);
2122 		}
2123 		free(sc->tx.info, M_DEVBUF);
2124 	}
2125 	if (sc->rx_small.info != NULL) {
2126 		for (i = 0; i <= sc->rx_small.mask; i++) {
2127 			if (sc->rx_small.info[i].map != NULL)
2128 				bus_dmamap_destroy(sc->rx_small.dmat,
2129 						   sc->rx_small.info[i].map);
2130 		}
2131 		free(sc->rx_small.info, M_DEVBUF);
2132 	}
2133 	if (sc->rx_big.info != NULL) {
2134 		for (i = 0; i <= sc->rx_big.mask; i++) {
2135 			if (sc->rx_big.info[i].map != NULL)
2136 				bus_dmamap_destroy(sc->rx_big.dmat,
2137 						   sc->rx_big.info[i].map);
2138 		}
2139 		free(sc->rx_big.info, M_DEVBUF);
2140 	}
2141 	if (sc->rx_big.extra_map != NULL)
2142 		bus_dmamap_destroy(sc->rx_big.dmat,
2143 				   sc->rx_big.extra_map);
2144 	if (sc->rx_small.extra_map != NULL)
2145 		bus_dmamap_destroy(sc->rx_small.dmat,
2146 				   sc->rx_small.extra_map);
2147 	if (sc->tx.dmat != NULL)
2148 		bus_dma_tag_destroy(sc->tx.dmat);
2149 	if (sc->rx_small.dmat != NULL)
2150 		bus_dma_tag_destroy(sc->rx_small.dmat);
2151 	if (sc->rx_big.dmat != NULL)
2152 		bus_dma_tag_destroy(sc->rx_big.dmat);
2153 }
2154 
/*
 * Size and allocate the transmit and receive rings.  Ring sizes come
 * from the firmware (GET_SEND_RING_SIZE / GET_RX_RING_SIZE); host-side
 * shadow/info arrays are sized to the entry counts and indexed through
 * the power-of-two masks stored in each ring.  Also creates the three
 * busdma tags and a DMA map per ring slot (plus one spare map per rx
 * ring).  Returns 0 or an errno; on failure everything allocated so
 * far is released via mxge_free_rings().
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size, rx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int i, err;
	unsigned long bytes;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine ring sizes\n");
		goto abort_with_nothing;
	}

	rx_ring_size = cmd.data0;

	/* firmware reports sizes in bytes; convert to entry counts */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	IFQ_SET_READY(&sc->ifp->if_snd);

	/* entry counts are powers of two, so count-1 works as a mask */
	sc->tx.mask = tx_ring_entries - 1;
	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;

	err = ENOMEM;

	/* allocate the tx request copy block */
	/* NOTE(review): malloc(M_WAITOK) cannot return NULL, so this and
	   the following NULL checks never fire; harmless but dead code */
	bytes = 8 +
		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.req_bytes == NULL)
		goto abort_with_nothing;
	/* ensure req_list entries are aligned to 8 bytes */
	sc->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*sc->tx.seg_list) * MXGE_MAX_SEND_DESC;
	sc->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (sc->tx.seg_list == NULL)
		goto abort_with_alloc;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.shadow == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.shadow == NULL)
		goto abort_with_alloc;

	/* allocate the host info rings */
	bytes = tx_ring_entries * sizeof (*sc->tx.info);
	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->tx.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_small.info == NULL)
		goto abort_with_alloc;

	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (sc->rx_big.info == NULL)
		goto abort_with_alloc;

	/* allocate the busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx.boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC/2,	/* num segs */
				 sc->tx.boundary,	/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		goto abort_with_alloc;
	}

	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 4096,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &sc->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		goto abort_with_alloc;
	}

	/* now use these tags to setup dmamaps for each slot
	   in each ring */
	for (i = 0; i <= sc->tx.mask; i++) {
		err = bus_dmamap_create(sc->tx.dmat, 0,
					&sc->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	for (i = 0; i <= sc->rx_small.mask; i++) {
		err = bus_dmamap_create(sc->rx_small.dmat, 0,
					&sc->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			goto abort_with_alloc;
		}
	}
	/* spare map used when replacing a buffer fails */
	err = bus_dmamap_create(sc->rx_small.dmat, 0,
				&sc->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
			goto abort_with_alloc;
	}

	for (i = 0; i <= sc->rx_big.mask; i++) {
		err = bus_dmamap_create(sc->rx_big.dmat, 0,
					&sc->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
		}
	}
	err = bus_dmamap_create(sc->rx_big.dmat, 0,
				&sc->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
			goto abort_with_alloc;
	}
	return 0;

abort_with_alloc:
	mxge_free_rings(sc);

abort_with_nothing:
	return err;
}
2338 
/*
 * Bring the interface up: reset the NIC, allocate rings, install the
 * interrupt handler, fetch the lanai ring pointers from the firmware,
 * stock the receive rings with mbufs, program MTU/buffer sizes and the
 * stats DMA block, and finally start the firmware (ETHERNET_UP).
 * Returns 0 or an errno; failures unwind through the abort labels.
 * Caller is expected to hold sc->driver_lock (called from the ioctl
 * and MTU-change paths under that lock).
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err;
	bus_dmamap_t map;
	bus_addr_t bus;


	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	/* pick the smallest cluster size that can hold a full frame */
	if (MCLBYTES >=
	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
		sc->big_bytes = MCLBYTES;
	else
		sc->big_bytes = MJUMPAGESIZE;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		return err;
	}

	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
			     mxge_intr, sc, &sc->ih);
	if (err != 0) {
		goto abort_with_rings;
	}

	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		err = EIO;
		goto abort_with_irq;
	}

	/* use the write-combining fifos only if WC mapping succeeded */
	if (sc->wc) {
		sc->tx.wc_fifo = sc->sram + MXGEFW_ETH_SEND_4;
		sc->rx_small.wc_fifo = sc->sram + MXGEFW_ETH_RECV_SMALL;
		sc->rx_big.wc_fifo = sc->sram + MXGEFW_ETH_RECV_BIG;
	} else {
		sc->tx.wc_fifo = 0;
		sc->rx_small.wc_fifo = 0;
		sc->rx_big.wc_fifo = 0;
	}


	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	for (i = 0; i <= sc->rx_big.mask; i++) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = sc->big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data2 = sizeof(struct mcp_irq_data);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);

	/* fall back to the obsolete stats DMA command for old firmware;
	   it only DMAs the send_done_count, and lacks multicast support */
	if (err != 0) {
		bus = sc->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);
abort_with_irq:
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
abort_with_rings:
	mxge_free_rings(sc);
	return err;
}
2488 
/*
 * Bring the interface down: ask the firmware to stop (ETHERNET_DOWN),
 * wait up to one second for the confirming "down" interrupt (seen as a
 * change in sc->down_cnt, which mxge_intr() advances), then tear down
 * the interrupt handler and release all mbufs and rings.  Always
 * returns 0, even if the firmware never acknowledged.
 */
static int
mxge_close(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;

	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	old_down_cnt = sc->down_cnt;
	/* make sure down_cnt is sampled before the command is issued */
	mb();
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring down link\n");
	}
	if (old_down_cnt == sc->down_cnt) {
		/* wait for down irq */
		(void)tsleep(&sc->down_cnt, PWAIT, "down mxge", hz);
	}
	if (old_down_cnt == sc->down_cnt) {
		device_printf(sc->dev, "never got down irq\n");
	}
	if (sc->ih != NULL)
		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_mbufs(sc);
	mxge_free_rings(sc);
	return 0;
}
2515 
2516 
/*
 * ifmedia change callback.  The 10GbE media on this adapter cannot be
 * reconfigured, so every change request is rejected.
 */
static int
mxge_media_change(struct ifnet *ifp)
{

	return (EINVAL);
}
2522 
/*
 * Validate and apply a new MTU.  real_mtu includes the Ethernet
 * header; it must lie in [60, MXGE_MAX_ETHER_MTU].  If the interface
 * is running it is restarted with the new MTU; on restart failure the
 * old MTU is restored and the interface restarted again (that second
 * open's result is deliberately ignored).  Returns 0 or an errno.
 */
static int
mxge_change_mtu(mxge_softc_t *sc, int mtu)
{
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;
	int err = 0;


	real_mtu = mtu + ETHER_HDR_LEN;
	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
	    real_mtu < 60)
		return EINVAL;
	/* driver_lock serializes against open/close from the ioctl path */
	sx_xlock(&sc->driver_lock);
	old_mtu = ifp->if_mtu;
	ifp->if_mtu = mtu;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
		if (err != 0) {
			ifp->if_mtu = old_mtu;
			mxge_close(sc);
			(void) mxge_open(sc);
		}
	}
	sx_xunlock(&sc->driver_lock);
	return err;
}
2550 
2551 static void
2552 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2553 {
2554 	mxge_softc_t *sc = ifp->if_softc;
2555 
2556 
2557 	if (sc == NULL)
2558 		return;
2559 	ifmr->ifm_status = IFM_AVALID;
2560 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2561 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2562 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2563 }
2564 
2565 static int
2566 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2567 {
2568 	mxge_softc_t *sc = ifp->if_softc;
2569 	struct ifreq *ifr = (struct ifreq *)data;
2570 	int err, mask;
2571 
2572 	err = 0;
2573 	switch (command) {
2574 	case SIOCSIFADDR:
2575 	case SIOCGIFADDR:
2576 		err = ether_ioctl(ifp, command, data);
2577 		break;
2578 
2579 	case SIOCSIFMTU:
2580 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2581 		break;
2582 
2583 	case SIOCSIFFLAGS:
2584 		sx_xlock(&sc->driver_lock);
2585 		if (ifp->if_flags & IFF_UP) {
2586 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2587 				err = mxge_open(sc);
2588 			else {
2589 				/* take care of promis can allmulti
2590 				   flag chages */
2591 				mxge_change_promisc(sc,
2592 						    ifp->if_flags & IFF_PROMISC);
2593 				mxge_set_multicast_list(sc);
2594 			}
2595 		} else {
2596 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2597 				mxge_close(sc);
2598 		}
2599 		sx_xunlock(&sc->driver_lock);
2600 		break;
2601 
2602 	case SIOCADDMULTI:
2603 	case SIOCDELMULTI:
2604 		sx_xlock(&sc->driver_lock);
2605 		mxge_set_multicast_list(sc);
2606 		sx_xunlock(&sc->driver_lock);
2607 		break;
2608 
2609 	case SIOCSIFCAP:
2610 		sx_xlock(&sc->driver_lock);
2611 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2612 		if (mask & IFCAP_TXCSUM) {
2613 			if (IFCAP_TXCSUM & ifp->if_capenable) {
2614 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
2615 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
2616 						      | CSUM_TSO);
2617 			} else {
2618 				ifp->if_capenable |= IFCAP_TXCSUM;
2619 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2620 			}
2621 		} else if (mask & IFCAP_RXCSUM) {
2622 			if (IFCAP_RXCSUM & ifp->if_capenable) {
2623 				ifp->if_capenable &= ~IFCAP_RXCSUM;
2624 				sc->csum_flag = 0;
2625 			} else {
2626 				ifp->if_capenable |= IFCAP_RXCSUM;
2627 				sc->csum_flag = 1;
2628 			}
2629 		}
2630 		if (mask & IFCAP_TSO4) {
2631 			if (IFCAP_TSO4 & ifp->if_capenable) {
2632 				ifp->if_capenable &= ~IFCAP_TSO4;
2633 				ifp->if_hwassist &= ~CSUM_TSO;
2634 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
2635 				ifp->if_capenable |= IFCAP_TSO4;
2636 				ifp->if_hwassist |= CSUM_TSO;
2637 			} else {
2638 				printf("mxge requires tx checksum offload"
2639 				       " be enabled to use TSO\n");
2640 				err = EINVAL;
2641 			}
2642 		}
2643 		sx_xunlock(&sc->driver_lock);
2644 		break;
2645 
2646 	case SIOCGIFMEDIA:
2647 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2648 				    &sc->media, command);
2649                 break;
2650 
2651 	default:
2652 		err = ENOTTY;
2653         }
2654 	return err;
2655 }
2656 
2657 static void
2658 mxge_fetch_tunables(mxge_softc_t *sc)
2659 {
2660 
2661 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2662 			  &mxge_flow_control);
2663 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2664 			  &mxge_intr_coal_delay);
2665 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2666 			  &mxge_nvidia_ecrc_enable);
2667 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2668 			  &mxge_deassert_wait);
2669 	TUNABLE_INT_FETCH("hw.mxge.verbose",
2670 			  &mxge_verbose);
2671 
2672 	if (bootverbose)
2673 		mxge_verbose = 1;
2674 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2675 		mxge_intr_coal_delay = 30;
2676 	sc->pause = mxge_flow_control;
2677 }
2678 
/*
 * Device attach: fetch tunables, create the parent DMA tag, allocate
 * the ifnet and locks, enable PCI busmastering, map the NIC's SRAM,
 * parse the EEPROM strings, enable write combining, allocate the
 * out-of-band DMA blocks (command, zero pad, stats, rx-done queue),
 * set up the interrupt resource, load and reset the firmware, and
 * finally attach to the network stack and publish sysctls.  Returns 0
 * or an errno; failures unwind through the abort labels in reverse
 * order of acquisition.
 */
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	size_t bytes;
	int rid, err;
	uint16_t cmd;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	err = bus_dma_tag_create(NULL,			/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 4096,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_nothing;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	mtx_init(&sc->cmd_lock, NULL,
		 MTX_NETWORK_LOCK, MTX_DEF);
	mtx_init(&sc->tx_lock, device_get_nameunit(dev),
		 MTX_NETWORK_LOCK, MTX_DEF);
	sx_init(&sc->driver_lock, device_get_nameunit(dev));

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	/* 2MB SRAM less the firmware/handoff regions and a guard page;
	   NOTE(review): the exact layout is firmware-defined — confirm
	   against the Myri10GE MCP documentation if this changes */
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
			     sizeof (*sc->fw_stats), 64);
	if (err != 0)
		goto abort_with_zeropad_dma;
	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;


	/* allocate interrupt queues */
	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
	if (err != 0)
		goto abort_with_fw_stats;
	sc->rx_done.entry = sc->rx_done.dma.addr;
	bzero(sc->rx_done.entry, bytes);
	/* Add our ithread  */
	rid = 0;
	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
					 1, RF_SHAREABLE | RF_ACTIVE);
	if (sc->irq_res == NULL) {
		device_printf(dev, "could not alloc interrupt\n");
		goto abort_with_rx_done;
	}

	/* load the firmware */
	mxge_select_firmware(sc);

	err = mxge_load_firmware(sc);
	if (err != 0)
		goto abort_with_irq_res;
	sc->intr_coal_delay = mxge_intr_coal_delay;
	err = mxge_reset(sc);
	if (err != 0)
		goto abort_with_irq_res;

	/* hook into the network stack */
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
	ifp->if_baudrate = 100000000;
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4;
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	sc->csum_flag = 1;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	ifp->if_watchdog = mxge_watchdog;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to 1500 */
	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;

	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
	mxge_add_sysctls(sc);
	return 0;

abort_with_irq_res:
	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
abort_with_rx_done:
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
abort_with_fw_stats:
	mxge_dma_free(&sc->fw_stats_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_lock);
	mtx_destroy(&sc->tx_lock);
	sx_destroy(&sc->driver_lock);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);

abort_with_nothing:
	return err;
}
2854 
/*
 * Device detach: close the interface if it is running, detach from the
 * network stack, quiesce the firmware's dummy RDMA engine, and release
 * every resource acquired in mxge_attach() in reverse order.  Always
 * returns 0.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	sx_xlock(&sc->driver_lock);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	sx_xunlock(&sc->driver_lock);
	ether_ifdetach(sc->ifp);
	mxge_dummy_rdma(sc, 0);
	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_lock);
	mtx_destroy(&sc->tx_lock);
	sx_destroy(&sc->driver_lock);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
2881 
2882 static int
2883 mxge_shutdown(device_t dev)
2884 {
2885 	return 0;
2886 }
2887 
2888 /*
2889   This file uses Myri10GE driver indentation.
2890 
2891   Local Variables:
2892   c-file-style:"linux"
2893   tab-width:8
2894   End:
2895 */
2896