/* xref: /freebsd/sys/dev/mxge/if_mxge.c (revision a4eb85b6acb49cb60c72c2cab0d0d3f00eaa6d46) */
/******************************************************************************

Copyright (c) 2006, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

 3. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_max_intr_slots = 1024;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	pa = rman_get_start(sc->mem_res);
	len = rman_get_size(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	} else {
		sc->wc = 1;
	}
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

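#if 0
/*
 * Usage sketch (not compiled in): the allocate/load/free cycle that the
 * two helpers above provide, shown for a 4KB-aligned scratch buffer like
 * the one mxge_reset() uses for its DMA benchmark.  "scratch" is purely
 * illustrative.
 */
	mxge_dma_t scratch;

	if (mxge_dma_alloc(sc, &scratch, 4096, 4096) == 0) {
		/* scratch.addr is the KVA, scratch.bus_addr the bus address */
		mxge_dma_free(&scratch);
	}
#endif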
/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

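/*
 * For illustration only (the values are made up): the raw region might
 * hold "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=sample\0\0", which
 * mxge_parse_strings() below walks as consecutive NUL-terminated
 * records.
 */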
static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
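			/* Skipping just one byte here makes the loop's
			   "ptr += 3" strides land on each hex pair: the
			   first hop consumes the remaining "AC=", later
			   hops consume the "xx:" groups. */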
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
#else
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
#endif
/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx.boundary to 4KB.
 */
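/*
 * In short (see mxge_select_firmware() below):
 *
 *	completions aligned?		firmware	tx.boundary
 *	yes (ECRC or known bridge)	mxge_eth_z8e	4096
 *	no / unknown			mxge_ethp_z8e	2048
 */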

static void
mxge_select_firmware(mxge_softc_t *sc)
{
	int err, aligned = 0;
	device_t pdev;
	uint16_t pvend, pdid;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		goto abort;
	}
	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* see if we can enable ECRC's on an upstream
	   Nvidia bridge */
	if (mxge_nvidia_ecrc_enable &&
	    (pvend == 0x10de && pdid == 0x005d)) {
		err = mxge_enable_nvidia_ecrc(sc, pdev);
		if (err == 0) {
			aligned = 1;
			if (mxge_verbose)
				device_printf(sc->dev,
					      "Assuming aligned completions"
					      " (ECRC)\n");
		}
	}
	/* see if the upstream bridge is known to
	   provide aligned completions */
	if (/* HT2000  */ (pvend == 0x1166 && pdid == 0x0132) ||
	    /* Ontario */ (pvend == 0x10b5 && pdid == 0x8532)) {
		aligned = 1;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming aligned completions "
				      "(0x%x:0x%x)\n", pvend, pdid);
	}

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx.boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx.boundary = 2048;
	}
}

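/* Strip the const qualifier from the firmware image pointer so it can
   be handed to mxge_pio_copy() without a discarded-qualifier warning. */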
union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{
	int major, minor;

	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d", &major, &minor);

	if (!(major == MXGEFW_VERSION_MAJOR
	      && minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	const char *fw_data;
	union qualhack hack;
	int status;
	unsigned int i;
	char dummy;


	fw = firmware_get(sc->fw_name);

	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}
	if (fw->datasize > *limit ||
	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
			      sc->fw_name, (int)fw->datasize, (int) *limit);
		status = ENOSPC;
		goto abort_with_fw;
	}
	*limit = fw->datasize;

	/* check id */
	fw_data = (const char *)fw->data;
	hdr_offset = htobe32(*(const uint32_t *)
			     (fw_data + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_fw;
	}
	hdr = (const void*)(fw_data + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_fw;

	hack.ro_char = fw_data;
	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < *limit; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      hack.rw_char + i,
			      min(256U, (unsigned)(*limit - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	status = 0;
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + 0xfc01c0);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

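/*
 * Issue a command to the firmware: the byte-swapped command block is
 * PIO-copied into the command slot in NIC SRAM, and the firmware DMAs
 * its status back into the host response buffer (sc->cmd), which is
 * polled for up to 20ms.
 */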
static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_CMD_OFFSET;
	uint32_t dma_low, dma_high;
	int sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_lock);
	response->result = 0xffffffff;
	mb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		mb();
		if (response->result != 0xffffffff) {
			if (response->result == 0) {
				data->data0 = be32toh(response->data);
				mtx_unlock(&sc->cmd_lock);
				return 0;
			} else {
				device_printf(sc->dev,
					      "mxge: command %d "
					      "failed, result = %d\n",
					      cmd, be32toh(response->result));
				mtx_unlock(&sc->cmd_lock);
				return ENXIO;
			}
		}
		DELAY(1000);
	}
	mtx_unlock(&sc->cmd_lock);
	device_printf(sc->dev, "mxge: command %d timed out, "
		      "result = %d\n",
		      cmd, be32toh(response->result));
	return EAGAIN;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);
	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}

	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + 0xfc0000);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	mxge_dummy_rdma(sc, 1);
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static int
mxge_reset(mxge_softc_t *sc)
{

	mxge_cmd_t cmd;
	mxge_dma_t dmabench_dma;
	size_t bytes;
	int status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	/* Now exchange information about interrupts  */
	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
	memset(sc->rx_done.entry, 0, bytes);
	cmd.data0 = (uint32_t)bytes;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
	status = mxge_dma_alloc(sc, &dmabench_dma, 4096, 4096);
	if (status)
		goto dmabench_fail;

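	/* For each test, cmd.data2 carries the transfer sizes: the
	   upper 16 bits hold the read size and the lower 16 bits the
	   write size (tx.boundary bytes each).  The firmware appears
	   to return a repetition count in the upper half of data0 and
	   the elapsed time in the lower half, which the expressions
	   below turn into MB/s. */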
	/* Read DMA */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
	cmd.data2 = sc->tx.boundary * 0x10000;

	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
	if (status != 0)
		device_printf(sc->dev, "read dma benchmark failed\n");
	else
		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
			(cmd.data0 & 0xffff);

	/* Write DMA */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
	cmd.data2 = sc->tx.boundary * 0x1;
	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
	if (status != 0)
		device_printf(sc->dev, "write dma benchmark failed\n");
	else
		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
			(cmd.data0 & 0xffff);
	/* Read/Write DMA */
	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
	cmd.data2 = sc->tx.boundary * 0x10001;
	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
	if (status != 0)
		device_printf(sc->dev, "read/write dma benchmark failed\n");
	else
		sc->read_write_dma =
			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
			(cmd.data0 & 0xffff);

	mxge_dma_free(&dmabench_dma);

dmabench_fail:
	/* reset mcp/driver shared state back to 0 */
	bzero(sc->rx_done.entry, bytes);
	sc->rx_done.idx = 0;
	sc->rx_done.cnt = 0;
	sc->tx.req = 0;
	sc->tx.done = 0;
	sc->tx.pkt_done = 0;
	sc->rx_big.cnt = 0;
	sc->rx_small.cnt = 0;
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	sx_xlock(&sc->driver_lock);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	sx_xunlock(&sc->driver_lock);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	sx_xlock(&sc->driver_lock);
	err = mxge_change_pause(sc, enabled);
	sx_xunlock(&sc->driver_lock);
        return err;
}

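/*
 * Sysctl handler for read-only counters that the firmware keeps in
 * network byte order: the swapped value is handed to sysctl_handle_int()
 * through arg2 with a NULL arg1, so the handler exports a snapshot
 * without giving userland anything to write to.
 */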
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx.boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable/disable flow control (pause frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");

	/* host counters exported for debugging */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_small_cnt",
		       CTLFLAG_RD, &sc->rx_small.cnt,
		       0, "rx_small_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "rx_big_cnt",
		       CTLFLAG_RD, &sc->rx_big.cnt,
		       0, "rx_big_cnt");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_req",
		       CTLFLAG_RD, &sc->tx.req,
		       0, "tx_req");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_done",
		       CTLFLAG_RD, &sc->tx.done,
		       0, "tx_done");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_pkt_done",
		       CTLFLAG_RD, &sc->tx.pkt_done,
		       0, "tx_pkt_done");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

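/* Note that slot 0 is deliberately not written here: mxge_submit_req()
   below stores the first request last, with its valid flags held back
   until the rest of the chain is visible to the NIC. */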
static inline void
mxge_submit_req_backwards(mxge_tx_buf_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}

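/*
 * Write-combining variant of the above: requests are streamed through
 * the 64-byte w/c FIFO in full bursts.  A trailing partial burst is
 * padded out to 64 bytes, and the submit address (wc_fifo + (cnt << 18))
 * appears to encode the request count so the firmware can tell how many
 * of the copied slots are valid.
 */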
static inline void
mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
    tx->req += cnt;
    mb();
    while (cnt >= 4) {
	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
	    mb();
	    src += 4;
	    cnt -= 4;
    }
    if (cnt > 0) {
	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
	       needs to be so that we don't overrun it */
	    mxge_pio_copy(tx->wc_fifo + (cnt<<18), src, 64);
	    mb();
    }
}

static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t seg_list[MXGE_MAX_SEND_DESC];
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx;
	uint16_t flags, pseudo_hdr_offset;
        uint8_t cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d\n",
			      err);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		eh = mtod(m, struct ether_header *);
		ip = (struct ip *) (eh + 1);
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
		cnt++;
	}

	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}




static inline void
mxge_start_locked(mxge_softc_t *sc)
{
	struct mbuf *m;
	struct ifnet *ifp;

	ifp = sc->ifp;
	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
	       > MXGE_MAX_SEND_DESC) {

		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL) {
			return;
		}
		/* let BPF see it */
		BPF_MTAP(ifp, m);

		/* give it to the nic */
		mxge_encap(sc, m);
	}
	/* ran out of transmit slots */
	sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
}

static void
mxge_start(struct ifnet *ifp)
{
	mxge_softc_t *sc = ifp->if_softc;


	mtx_lock(&sc->tx_lock);
	mxge_start_locked(sc);
	mtx_unlock(&sc->tx_lock);
}

/*
 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst
 */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	low = src->addr_low;
	src->addr_low = 0xffffffff;
	mxge_pio_copy(dst, src, 8 * sizeof (*src));
	mb();
	dst->addr_low = low;
	mb();
}

static int
mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_buf_t *rx = &sc->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	if ((idx & 7) == 7) {
		if (rx->wc_fifo == NULL)
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		else {
			mb();
			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
		}
        }
	return err;
}

static int
mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_buf_t *rx = &sc->rx_big;
	int cnt, err;

	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = sc->big_bytes;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	if ((idx & 7) == 7) {
		if (rx->wc_fifo == NULL)
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		else {
			mb();
			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
		}
        }
	return err;
}

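/* If the NIC computed a checksum that may cover the TCP/UDP payload
   (plain IPv4 TCP or UDP only), stash it in the mbuf header so the
   stack can finish verifying it. */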
static inline void
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;

	eh = mtod(m, struct ether_header *);
	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
		ip = (struct ip *)(eh + 1);
		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
				   ip->ip_p == IPPROTO_UDP)) {
			m->m_pkthdr.csum_data = csum;
			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
		}
	}
}

static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wuninitialized */
	struct mbuf *m_prev = 0;	/* -Wuninitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st 2 bytes so that
			 * packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
        if (m_head) {
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}

static inline void
mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
{
	struct ifnet *ifp;
	struct mbuf *m;
	mxge_rx_buf_t *rx;
	bus_dmamap_t old_map;
	int idx;

	ifp = sc->ifp;
	rx = &sc->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m, csum);

	/* pass the frame up the stack */
	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m);
}

static inline void
mxge_clean_rx_done(mxge_softc_t *sc)
{
	mxge_rx_done_t *rx_done = &sc->rx_done;
	int limit = 0;
	uint16_t length;
	uint16_t checksum;


	while (rx_done->entry[rx_done->idx].length != 0) {
		length = ntohs(rx_done->entry[rx_done->idx].length);
		rx_done->entry[rx_done->idx].length = 0;
		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
		if (length <= MHLEN)
			mxge_rx_done_small(sc, length, checksum);
		else
			mxge_rx_done_big(sc, length, checksum);
		rx_done->cnt++;
		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);

		/* limit potential for livelock */
		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
			break;

	}
}


static inline void
mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx, limit;

	limit = 0;
	tx = &sc->tx;
	ifp = sc->ifp;
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ifp->if_opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
		/* limit potential for livelock by only handling
		   2 full tx rings per call */
		if (__predict_false(++limit >  2 * tx->mask))
			break;
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
           it's OK to send packets */

	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		mtx_lock(&sc->tx_lock);
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		mxge_start_locked(sc);
		mtx_unlock(&sc->tx_lock);
	}
}

static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	/* lower legacy IRQ  */
	*sc->irq_deassert = 0;
	mb();
	if (!mxge_deassert_wait)
		/* don't wait for conf. that irq is low */
		stats->valid = 0;
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}

static void
mxge_watchdog(struct ifnet *ifp)
{
	printf("%s called\n", __FUNCTION__);
}

static void
mxge_init(void *arg)
{
}



static void
mxge_free_mbufs(mxge_softc_t *sc)
{
	int i;

	for (i = 0; i <= sc->rx_big.mask; i++) {
		if (sc->rx_big.info[i].m == NULL)
			continue;
		bus_dmamap_unload(sc->rx_big.dmat,
				  sc->rx_big.info[i].map);
		m_freem(sc->rx_big.info[i].m);
		sc->rx_big.info[i].m = NULL;
	}

	for (i = 0; i <= sc->rx_small.mask; i++) {
		if (sc->rx_small.info[i].m == NULL)
			continue;
		bus_dmamap_unload(sc->rx_small.dmat,
				  sc->rx_small.info[i].map);
		m_freem(sc->rx_small.info[i].m);
		sc->rx_small.info[i].m = NULL;
	}
1831 
1832 	for (i = 0; i <= sc->tx.mask; i++) {
1833 		if (sc->tx.info[i].m == NULL)
1834 			continue;
1835 		bus_dmamap_unload(sc->tx.dmat,
1836 				  sc->tx.info[i].map);
1837 		m_freem(sc->tx.info[i].m);
1838 		sc->tx.info[i].m = NULL;
1839 	}
1840 }
1841 
1842 static void
1843 mxge_free_rings(mxge_softc_t *sc)
1844 {
1845 	int i;
1846 
1847 	if (sc->tx.req_bytes != NULL) {
1848 		free(sc->tx.req_bytes, M_DEVBUF);
1849 	}
1850 	if (sc->rx_small.shadow != NULL)
1851 		free(sc->rx_small.shadow, M_DEVBUF);
1852 	if (sc->rx_big.shadow != NULL)
1853 		free(sc->rx_big.shadow, M_DEVBUF);
1854 	if (sc->tx.info != NULL) {
1855 		for (i = 0; i <= sc->tx.mask; i++) {
1856 			if (sc->tx.info[i].map != NULL)
1857 				bus_dmamap_destroy(sc->tx.dmat,
1858 						   sc->tx.info[i].map);
1859 		}
1860 		free(sc->tx.info, M_DEVBUF);
1861 	}
1862 	if (sc->rx_small.info != NULL) {
1863 		for (i = 0; i <= sc->rx_small.mask; i++) {
1864 			if (sc->rx_small.info[i].map != NULL)
1865 				bus_dmamap_destroy(sc->rx_small.dmat,
1866 						   sc->rx_small.info[i].map);
1867 		}
1868 		free(sc->rx_small.info, M_DEVBUF);
1869 	}
1870 	if (sc->rx_big.info != NULL) {
1871 		for (i = 0; i <= sc->rx_big.mask; i++) {
1872 			if (sc->rx_big.info[i].map != NULL)
1873 				bus_dmamap_destroy(sc->rx_big.dmat,
1874 						   sc->rx_big.info[i].map);
1875 		}
1876 		free(sc->rx_big.info, M_DEVBUF);
1877 	}
1878 	if (sc->rx_big.extra_map != NULL)
1879 		bus_dmamap_destroy(sc->rx_big.dmat,
1880 				   sc->rx_big.extra_map);
1881 	if (sc->rx_small.extra_map != NULL)
1882 		bus_dmamap_destroy(sc->rx_small.dmat,
1883 				   sc->rx_small.extra_map);
1884 	if (sc->tx.dmat != NULL)
1885 		bus_dma_tag_destroy(sc->tx.dmat);
1886 	if (sc->rx_small.dmat != NULL)
1887 		bus_dma_tag_destroy(sc->rx_small.dmat);
1888 	if (sc->rx_big.dmat != NULL)
1889 		bus_dma_tag_destroy(sc->rx_big.dmat);
1890 }
1891 
1892 static int
1893 mxge_alloc_rings(mxge_softc_t *sc)
1894 {
1895 	mxge_cmd_t cmd;
1896 	int tx_ring_size, rx_ring_size;
1897 	int tx_ring_entries, rx_ring_entries;
1898 	int i, err;
1899 	unsigned long bytes;
1900 
1901 	/* get ring sizes */
1902 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1903 	tx_ring_size = cmd.data0;
1904 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1905 	if (err != 0) {
1906 		device_printf(sc->dev, "Cannot determine ring sizes\n");
1907 		goto abort_with_nothing;
1908 	}
1909 
1910 	rx_ring_size = cmd.data0;
1911 
1912 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
1913 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
1914 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
1915 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
1916 	IFQ_SET_READY(&sc->ifp->if_snd);
1917 
1918 	sc->tx.mask = tx_ring_entries - 1;
1919 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
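	/*
	 * The masks rely on the firmware ring sizes being powers of
	 * two: a slot index can then wrap with a cheap bitwise AND,
	 * e.g. idx = (idx + 1) & sc->tx.mask, instead of a modulo.
	 */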
1920 
1921 	err = ENOMEM;
1922 
1923 	/* allocate the tx request copy block */
1924 	bytes = 8 +
1925 		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
1926 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
1927 	if (sc->tx.req_bytes == NULL)
1928 		goto abort_with_nothing;
1929 	/* ensure req_list entries are aligned to 8 bytes */
1930 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
1931 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
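	/*
	 * Worked example of the round-up: if req_bytes were 0x1005,
	 * (0x1005 + 7) & ~7UL == 0x1008, the next 8-byte boundary;
	 * an already-aligned pointer passes through unchanged.
	 */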
1932 
1933 	/* allocate the rx shadow rings */
1934 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
1935 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1936 	if (sc->rx_small.shadow == NULL)
1937 		goto abort_with_alloc;
1938 
1939 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
1940 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1941 	if (sc->rx_big.shadow == NULL)
1942 		goto abort_with_alloc;
1943 
1944 	/* allocate the host info rings */
1945 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
1946 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1947 	if (sc->tx.info == NULL)
1948 		goto abort_with_alloc;
1949 
1950 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
1951 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1952 	if (sc->rx_small.info == NULL)
1953 		goto abort_with_alloc;
1954 
1955 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
1956 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1957 	if (sc->rx_big.info == NULL)
1958 		goto abort_with_alloc;
1959 
1960 	/* allocate the busdma resources */
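	/*
	 * One tag per ring: a tag records the addressing limits and
	 * segment constraints that every dmamap created from it must
	 * honor.  For tx, the boundary doubles as the max segment
	 * size, so no send descriptor ever crosses it.
	 */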
1961 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
1962 				 1,			/* alignment */
1963 				 sc->tx.boundary,	/* boundary */
1964 				 BUS_SPACE_MAXADDR,	/* low */
1965 				 BUS_SPACE_MAXADDR,	/* high */
1966 				 NULL, NULL,		/* filter */
1967 				 MXGE_MAX_ETHER_MTU,	/* maxsize */
1968 				 MXGE_MAX_SEND_DESC,	/* num segs */
1969 				 sc->tx.boundary,	/* maxsegsize */
1970 				 BUS_DMA_ALLOCNOW,	/* flags */
1971 				 NULL, NULL,		/* lock */
1972 				 &sc->tx.dmat);		/* tag */
1973 
1974 	if (err != 0) {
1975 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
1976 			      err);
1977 		goto abort_with_alloc;
1978 	}
1979 
1980 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
1981 				 1,			/* alignment */
1982 				 4096,			/* boundary */
1983 				 BUS_SPACE_MAXADDR,	/* low */
1984 				 BUS_SPACE_MAXADDR,	/* high */
1985 				 NULL, NULL,		/* filter */
1986 				 MHLEN,			/* maxsize */
1987 				 1,			/* num segs */
1988 				 MHLEN,			/* maxsegsize */
1989 				 BUS_DMA_ALLOCNOW,	/* flags */
1990 				 NULL, NULL,		/* lock */
1991 				 &sc->rx_small.dmat);	/* tag */
1992 	if (err != 0) {
1993 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
1994 			      err);
1995 		goto abort_with_alloc;
1996 	}
1997 
1998 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
1999 				 1,			/* alignment */
2000 				 4096,			/* boundary */
2001 				 BUS_SPACE_MAXADDR,	/* low */
2002 				 BUS_SPACE_MAXADDR,	/* high */
2003 				 NULL, NULL,		/* filter */
2004 				 4096,			/* maxsize */
2005 				 1,			/* num segs */
2006 				 4096,			/* maxsegsize */
2007 				 BUS_DMA_ALLOCNOW,	/* flags */
2008 				 NULL, NULL,		/* lock */
2009 				 &sc->rx_big.dmat);	/* tag */
2010 	if (err != 0) {
2011 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2012 			      err);
2013 		goto abort_with_alloc;
2014 	}
2015 
2016 	/* now use these tags to set up dmamaps for each slot
2017 	   in each ring */
2018 	for (i = 0; i <= sc->tx.mask; i++) {
2019 		err = bus_dmamap_create(sc->tx.dmat, 0,
2020 					&sc->tx.info[i].map);
2021 		if (err != 0) {
2022 			device_printf(sc->dev, "Err %d tx dmamap\n",
2023 				      err);
2024 			goto abort_with_alloc;
2025 		}
2026 	}
2027 	for (i = 0; i <= sc->rx_small.mask; i++) {
2028 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2029 					&sc->rx_small.info[i].map);
2030 		if (err != 0) {
2031 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2032 				      err);
2033 			goto abort_with_alloc;
2034 		}
2035 	}
2036 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2037 				&sc->rx_small.extra_map);
2038 	if (err != 0) {
2039 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2040 			      err);
2041 		goto abort_with_alloc;
2042 	}
2043 
2044 	for (i = 0; i <= sc->rx_big.mask; i++) {
2045 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2046 					&sc->rx_big.info[i].map);
2047 		if (err != 0) {
2048 			device_printf(sc->dev, "Err %d rx_big dmamap\n",
2049 				      err);
2050 			goto abort_with_alloc;
2051 		}
2052 	}
2053 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2054 				&sc->rx_big.extra_map);
2055 	if (err != 0) {
2056 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2057 			      err);
2058 		goto abort_with_alloc;
2059 	}
2060 	return 0;
2061 
2062 abort_with_alloc:
2063 	mxge_free_rings(sc);
2064 
2065 abort_with_nothing:
2066 	return err;
2067 }
2068 
2069 static int
2070 mxge_open(mxge_softc_t *sc)
2071 {
2072 	mxge_cmd_t cmd;
2073 	int i, err;
2074 	bus_dmamap_t map;
2075 
2076 
2077 	err = mxge_reset(sc);
2078 	if (err != 0) {
2079 		device_printf(sc->dev, "failed to reset\n");
2080 		return EIO;
2081 	}
2082 
2083 	if (MCLBYTES >=
2084 	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
2085 		sc->big_bytes = MCLBYTES;
2086 	else
2087 		sc->big_bytes = MJUMPAGESIZE;
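	/*
	 * A standard 2KB cluster is used when the MTU plus Ethernet
	 * header and firmware pad fits; otherwise fall back to
	 * page-sized jumbo clusters.  Both are powers of two, as the
	 * firmware requires for the big buffer size (see below).
	 */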
2088 
2089 	err = mxge_alloc_rings(sc);
2090 	if (err != 0) {
2091 		device_printf(sc->dev, "failed to allocate rings\n");
2092 		return err;
2093 	}
2094 
2095 	err = bus_setup_intr(sc->dev, sc->irq_res,
2096 			     INTR_TYPE_NET | INTR_MPSAFE,
2097 			     mxge_intr, sc, &sc->ih);
2098 	if (err != 0) {
2099 		goto abort_with_rings;
2100 	}
2101 
2102 	/* get the lanai pointers to the send and receive rings */
2103 
2104 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
2105 	sc->tx.lanai =
2106 		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
2107 	err |= mxge_send_cmd(sc,
2108 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
2109 	sc->rx_small.lanai =
2110 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2111 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
2112 	sc->rx_big.lanai =
2113 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
2114 
2115 	if (err != 0) {
2116 		device_printf(sc->dev,
2117 			      "failed to get ring sizes or locations\n");
2118 		err = EIO;
2119 		goto abort_with_irq;
2120 	}
2121 
2122 	if (sc->wc) {
2123 		sc->tx.wc_fifo = sc->sram + 0x200000;
2124 		sc->rx_small.wc_fifo = sc->sram + 0x300000;
2125 		sc->rx_big.wc_fifo = sc->sram + 0x340000;
2126 	} else {
2127 		sc->tx.wc_fifo = 0;
2128 		sc->rx_small.wc_fifo = 0;
2129 		sc->rx_big.wc_fifo = 0;
2130 	}
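	/*
	 * With write combining enabled, requests are burst through
	 * dedicated FIFO windows in board SRAM; the offsets used here
	 * are evidently fixed by the NIC's address map.
	 */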
2131 
2132 
2133 	/* stock receive rings */
2134 	for (i = 0; i <= sc->rx_small.mask; i++) {
2135 		map = sc->rx_small.info[i].map;
2136 		err = mxge_get_buf_small(sc, map, i);
2137 		if (err) {
2138 			device_printf(sc->dev, "alloced %d/%d smalls\n",
2139 				      i, sc->rx_small.mask + 1);
2140 			goto abort;
2141 		}
2142 	}
2143 	for (i = 0; i <= sc->rx_big.mask; i++) {
2144 		map = sc->rx_big.info[i].map;
2145 		err = mxge_get_buf_big(sc, map, i);
2146 		if (err) {
2147 			device_printf(sc->dev, "alloced %d/%d bigs\n",
2148 				      i, sc->rx_big.mask + 1);
2149 			goto abort;
2150 		}
2151 	}
2152 
2153 	/* Give the firmware the mtu and the big and small buffer
2154 	   sizes.  The firmware wants the big buf size to be a power
2155 	   of two. Luckily, FreeBSD's clusters are powers of two */
2156 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
2157 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
2158 	cmd.data0 = MHLEN;
2159 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
2160 			     &cmd);
2161 	cmd.data0 = sc->big_bytes;
2162 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2163 	/* Now give the firmware the pointer to the stats block */
2164 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
2165 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
2166 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA, &cmd);
2167 
2168 	if (err != 0) {
2169 		device_printf(sc->dev, "failed to setup params\n");
2170 		goto abort;
2171 	}
2172 
2173 	/* Finally, start the firmware running */
2174 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
2175 	if (err) {
2176 		device_printf(sc->dev, "Couldn't bring up link\n");
2177 		goto abort;
2178 	}
2179 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
2180 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2181 
2182 	return 0;
2183 
2184 
2185 abort:
2186 	mxge_free_mbufs(sc);
2187 abort_with_irq:
2188 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
2189 abort_with_rings:
2190 	mxge_free_rings(sc);
2191 	return err;
2192 }
2193 
2194 static int
2195 mxge_close(mxge_softc_t *sc)
2196 {
2197 	mxge_cmd_t cmd;
2198 	int err, old_down_cnt;
2199 
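	/*
	 * Taking the link down makes the firmware deliver a final
	 * interrupt whose stats carry link_down; mxge_intr() folds
	 * that into down_cnt, so a change in down_cnt confirms the
	 * NIC processed the command.
	 */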
2200 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
2201 	old_down_cnt = sc->down_cnt;
2202 	mb();
2203 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2204 	if (err) {
2205 		device_printf(sc->dev, "Couldn't bring down link\n");
2206 	}
2207 	if (old_down_cnt == sc->down_cnt) {
2208 		/* wait for down irq */
2209 		(void)tsleep(&sc->down_cnt, PWAIT, "down mxge", hz);
2210 	}
2211 	if (old_down_cnt == sc->down_cnt) {
2212 		device_printf(sc->dev, "never got down irq\n");
2213 	}
2214 	if (sc->ih != NULL)
2215 		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
2216 	mxge_free_mbufs(sc);
2217 	mxge_free_rings(sc);
2218 	return 0;
2219 }
2220 
2221 
2222 static int
2223 mxge_media_change(struct ifnet *ifp)
2224 {
2225 	return EINVAL;
2226 }
2227 
2228 static int
2229 mxge_change_mtu(mxge_softc_t *sc, int mtu)
2230 {
2231 	struct ifnet *ifp = sc->ifp;
2232 	int real_mtu, old_mtu;
2233 	int err = 0;
2234 
2235 
2236 	real_mtu = mtu + ETHER_HDR_LEN;
2237 	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2238 	    (real_mtu < 60))
2239 		return EINVAL;
2240 	sx_xlock(&sc->driver_lock);
2241 	old_mtu = ifp->if_mtu;
2242 	ifp->if_mtu = mtu;
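	/*
	 * The MTU and buffer sizes are handed to the firmware inside
	 * mxge_open(), so a running interface must be closed and
	 * reopened; on failure, restore the old MTU the same way.
	 */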
2243 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2244 		mxge_close(sc);
2245 		err = mxge_open(sc);
2246 		if (err != 0) {
2247 			ifp->if_mtu = old_mtu;
2248 			mxge_close(sc);
2249 			(void) mxge_open(sc);
2250 		}
2251 	}
2252 	sx_xunlock(&sc->driver_lock);
2253 	return err;
2254 }
2255 
2256 static void
2257 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2258 {
2259 	mxge_softc_t *sc = ifp->if_softc;
2260 
2261 
2262 	if (sc == NULL)
2263 		return;
2264 	ifmr->ifm_status = IFM_AVALID;
2265 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2266 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2267 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2268 }
2269 
2270 static int
2271 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2272 {
2273 	mxge_softc_t *sc = ifp->if_softc;
2274 	struct ifreq *ifr = (struct ifreq *)data;
2275 	int err, mask;
2276 
2277 	err = 0;
2278 	switch (command) {
2279 	case SIOCSIFADDR:
2280 	case SIOCGIFADDR:
2281 		err = ether_ioctl(ifp, command, data);
2282 		break;
2283 
2284 	case SIOCSIFMTU:
2285 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2286 		break;
2287 
2288 	case SIOCSIFFLAGS:
2289 		sx_xlock(&sc->driver_lock);
2290 		if (ifp->if_flags & IFF_UP) {
2291 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2292 				err = mxge_open(sc);
2293 		} else {
2294 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2295 				mxge_close(sc);
2296 		}
2297 		sx_xunlock(&sc->driver_lock);
2298 		break;
2299 
2300 	case SIOCADDMULTI:
2301 	case SIOCDELMULTI:
2302 		err = 0;
2303 		break;
2304 
2305 	case SIOCSIFCAP:
2306 		sx_xlock(&sc->driver_lock);
2307 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
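		/*
		 * mask holds only the capability bits this request
		 * changes, e.g. "ifconfig mxge0 -txcsum" flips just
		 * IFCAP_TXCSUM.
		 */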
2308 		if (mask & IFCAP_TXCSUM) {
2309 			if (IFCAP_TXCSUM & ifp->if_capenable) {
2310 				ifp->if_capenable &= ~IFCAP_TXCSUM;
2311 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
2312 			} else {
2313 				ifp->if_capenable |= IFCAP_TXCSUM;
2314 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2315 			}
2316 		}
		/* RXCSUM may change in the same request as TXCSUM */
		if (mask & IFCAP_RXCSUM) {
2317 			if (IFCAP_RXCSUM & ifp->if_capenable) {
2318 				ifp->if_capenable &= ~IFCAP_RXCSUM;
2319 				sc->csum_flag = 0;
2320 			} else {
2321 				ifp->if_capenable |= IFCAP_RXCSUM;
2322 				sc->csum_flag = 1;
2323 			}
2324 		}
2325 		sx_xunlock(&sc->driver_lock);
2326 		break;
2327 
2328 	case SIOCGIFMEDIA:
2329 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2330 				    &sc->media, command);
2331 		break;
2332 
2333 	default:
2334 		err = ENOTTY;
2335 	}
2336 	return err;
2337 }
2338 
2339 static void
2340 mxge_fetch_tunables(mxge_softc_t *sc)
2341 {
2342 
2343 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2344 			  &mxge_flow_control);
2345 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2346 			  &mxge_intr_coal_delay);
2347 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2348 			  &mxge_nvidia_ecrc_enable);
2349 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2350 			  &mxge_deassert_wait);
2351 	TUNABLE_INT_FETCH("hw.mxge.verbose",
2352 			  &mxge_verbose);
2353 
2354 	if (bootverbose)
2355 		mxge_verbose = 1;
2356 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2357 		mxge_intr_coal_delay = 30;
2358 	sc->pause = mxge_flow_control;
2359 }
2360 
2361 static int
2362 mxge_attach(device_t dev)
2363 {
2364 	mxge_softc_t *sc = device_get_softc(dev);
2365 	struct ifnet *ifp;
2366 	size_t bytes;
2367 	int rid, err;
2368 	uint16_t cmd;
2369 
2370 	sc->dev = dev;
2371 	mxge_fetch_tunables(sc);
2372 
2373 	err = bus_dma_tag_create(NULL,			/* parent */
2374 				 1,			/* alignment */
2375 				 4096,			/* boundary */
2376 				 BUS_SPACE_MAXADDR,	/* low */
2377 				 BUS_SPACE_MAXADDR,	/* high */
2378 				 NULL, NULL,		/* filter */
2379 				 MXGE_MAX_ETHER_MTU,	/* maxsize */
2380 				 MXGE_MAX_SEND_DESC, 	/* num segs */
2381 				 4096,			/* maxsegsize */
2382 				 0,			/* flags */
2383 				 NULL, NULL,		/* lock */
2384 				 &sc->parent_dmat);	/* tag */
2385 
2386 	if (err != 0) {
2387 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
2388 			      err);
2389 		goto abort_with_nothing;
2390 	}
2391 
2392 	ifp = sc->ifp = if_alloc(IFT_ETHER);
2393 	if (ifp == NULL) {
2394 		device_printf(dev, "can not if_alloc()\n");
2395 		err = ENOSPC;
2396 		goto abort_with_parent_dmat;
2397 	}
2398 	mtx_init(&sc->cmd_lock, device_get_nameunit(dev),
2399 		 MTX_NETWORK_LOCK, MTX_DEF);
2400 	mtx_init(&sc->tx_lock, device_get_nameunit(dev),
2401 		 MTX_NETWORK_LOCK, MTX_DEF);
2402 	sx_init(&sc->driver_lock, device_get_nameunit(dev));
2403 
2404 	/* Enable DMA and Memory space access */
2405 	pci_enable_busmaster(dev);
2406 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2407 	cmd |= PCIM_CMD_MEMEN;
2408 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2409 
2410 	/* Map the board into the kernel */
2411 	rid = PCIR_BARS;
2412 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
2413 					 ~0, 1, RF_ACTIVE);
2414 	if (sc->mem_res == NULL) {
2415 		device_printf(dev, "could not map memory\n");
2416 		err = ENXIO;
2417 		goto abort_with_lock;
2418 	}
2419 	sc->sram = rman_get_virtual(sc->mem_res);
2420 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
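	/*
	 * The board exposes 2MB of SRAM; the subtraction apparently
	 * reserves firmware-owned regions (two 48KB areas and one
	 * 32KB area) plus a 0x100-byte pad at the top, leaving the
	 * portion usable by the host.
	 */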
2421 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
2422 		device_printf(dev, "impossible memory region size %ld\n",
2423 			      rman_get_size(sc->mem_res));
2424 		err = ENXIO;
2425 		goto abort_with_mem_res;
2426 	}
2427 
2428 	/* make a NUL-terminated copy of the EEPROM strings section
2429 	   of lanai SRAM */
2430 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
2431 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
2432 				rman_get_bushandle(sc->mem_res),
2433 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
2434 				sc->eeprom_strings,
2435 				MXGE_EEPROM_STRINGS_SIZE - 2);
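	/*
	 * The copy stops 2 bytes short of the buffer, which was just
	 * zeroed, so the string section is guaranteed to end in NUL
	 * bytes before parsing.
	 */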
2436 	err = mxge_parse_strings(sc);
2437 	if (err != 0)
2438 		goto abort_with_mem_res;
2439 
2440 	/* Enable write combining for efficient use of PCIe bus */
2441 	mxge_enable_wc(sc);
2442 
2443 	/* Allocate the out of band dma memory */
2444 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
2445 			     sizeof (mxge_cmd_t), 64);
2446 	if (err != 0)
2447 		goto abort_with_mem_res;
2448 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
2449 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
2450 	if (err != 0)
2451 		goto abort_with_cmd_dma;
2452 
2453 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
2454 			     sizeof (*sc->fw_stats), 64);
2455 	if (err != 0)
2456 		goto abort_with_zeropad_dma;
2457 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
2458 
2459 
2460 	/* allocate interrupt queues */
2461 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
2462 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2463 	if (err != 0)
2464 		goto abort_with_fw_stats;
2465 	sc->rx_done.entry = sc->rx_done.dma.addr;
2466 	bzero(sc->rx_done.entry, bytes);
2467 	/* Add our ithread */
2468 	rid = 0;
2469 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
2470 					 1, RF_SHAREABLE | RF_ACTIVE);
2471 	if (sc->irq_res == NULL) {
2472 		device_printf(dev, "could not alloc interrupt\n");
2473 		goto abort_with_rx_done;
2474 	}
2475 
2476 	/* load the firmware */
2477 	mxge_select_firmware(sc);
2478 
2479 	err = mxge_load_firmware(sc);
2480 	if (err != 0)
2481 		goto abort_with_irq_res;
2482 	sc->intr_coal_delay = mxge_intr_coal_delay;
2483 	err = mxge_reset(sc);
2484 	if (err != 0)
2485 		goto abort_with_irq_res;
2486 
2487 	/* hook into the network stack */
2488 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2489 	ifp->if_baudrate = 100000000;
2490 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM;
2491 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP;
2492 	ifp->if_capenable = ifp->if_capabilities;
2493 	sc->csum_flag = 1;
2494 	ifp->if_init = mxge_init;
2495 	ifp->if_softc = sc;
2496 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2497 	ifp->if_ioctl = mxge_ioctl;
2498 	ifp->if_start = mxge_start;
2499 	ifp->if_watchdog = mxge_watchdog;
2500 	ether_ifattach(ifp, sc->mac_addr);
2501 	/* ether_ifattach sets mtu to 1500 */
2502 	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
2503 
2504 	/* Initialise the ifmedia structure */
2505 	ifmedia_init(&sc->media, 0, mxge_media_change,
2506 		     mxge_media_status);
2507 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
2508 	mxge_add_sysctls(sc);
2509 	return 0;
2510 
2511 abort_with_irq_res:
2512 	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
2513 abort_with_rx_done:
2514 	sc->rx_done.entry = NULL;
2515 	mxge_dma_free(&sc->rx_done.dma);
2516 abort_with_fw_stats:
2517 	mxge_dma_free(&sc->fw_stats_dma);
2518 abort_with_zeropad_dma:
2519 	mxge_dma_free(&sc->zeropad_dma);
2520 abort_with_cmd_dma:
2521 	mxge_dma_free(&sc->cmd_dma);
2522 abort_with_mem_res:
2523 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
2524 abort_with_lock:
2525 	pci_disable_busmaster(dev);
2526 	mtx_destroy(&sc->cmd_lock);
2527 	mtx_destroy(&sc->tx_lock);
2528 	sx_destroy(&sc->driver_lock);
2529 	if_free(ifp);
2530 abort_with_parent_dmat:
2531 	bus_dma_tag_destroy(sc->parent_dmat);
2532 
2533 abort_with_nothing:
2534 	return err;
2535 }
2536 
2537 static int
2538 mxge_detach(device_t dev)
2539 {
2540 	mxge_softc_t *sc = device_get_softc(dev);
2541 
2542 	sx_xlock(&sc->driver_lock);
2543 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
2544 		mxge_close(sc);
2545 	sx_xunlock(&sc->driver_lock);
2546 	ether_ifdetach(sc->ifp);
2547 	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
2548 	sc->rx_done.entry = NULL;
2549 	mxge_dma_free(&sc->rx_done.dma);
2550 	mxge_dma_free(&sc->fw_stats_dma);
2551 	mxge_dma_free(&sc->zeropad_dma);
2552 	mxge_dma_free(&sc->cmd_dma);
2553 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
2554 	pci_disable_busmaster(dev);
2555 	mtx_destroy(&sc->cmd_lock);
2556 	mtx_destroy(&sc->tx_lock);
2557 	sx_destroy(&sc->driver_lock);
2558 	if_free(sc->ifp);
2559 	bus_dma_tag_destroy(sc->parent_dmat);
2560 	return 0;
2561 }
2562 
2563 static int
2564 mxge_shutdown(device_t dev)
2565 {
2566 	return 0;
2567 }
2568 
2569 /*
2570   This file uses Myri10GE driver indentation.
2571 
2572   Local Variables:
2573   c-file-style:"linux"
2574   tab-width:8
2575   End:
2576 */
2577