xref: /freebsd/sys/dev/mxge/if_mxge.c (revision e4e9813eb92cd7c4d4b819a8fbed5cbd3d92f5d8)
1 /******************************************************************************
2 
3 Copyright (c) 2006, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Redistributions in binary form must reproduce the above copyright
13     notice, this list of conditions and the following disclaimer in the
14     documentation and/or other materials provided with the distribution.
15 
16  3. Neither the name of the Myricom Inc, nor the names of its
17     contributors may be used to endorse or promote products derived from
18     this software without specific prior written permission.
19 
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
24 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
31 
32 ***************************************************************************/
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/linker.h>
40 #include <sys/firmware.h>
41 #include <sys/endian.h>
42 #include <sys/sockio.h>
43 #include <sys/mbuf.h>
44 #include <sys/malloc.h>
45 #include <sys/kdb.h>
46 #include <sys/kernel.h>
47 #include <sys/module.h>
48 #include <sys/memrange.h>
49 #include <sys/socket.h>
50 #include <sys/sysctl.h>
51 #include <sys/sx.h>
52 
53 #include <net/if.h>
54 #include <net/if_arp.h>
55 #include <net/ethernet.h>
56 #include <net/if_dl.h>
57 #include <net/if_media.h>
58 
59 #include <net/bpf.h>
60 
61 #include <net/if_types.h>
62 #include <net/if_vlan_var.h>
63 #include <net/zlib.h>
64 
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 
69 #include <machine/bus.h>
70 #include <machine/resource.h>
71 #include <sys/bus.h>
72 #include <sys/rman.h>
73 
74 #include <dev/pci/pcireg.h>
75 #include <dev/pci/pcivar.h>
76 
77 #include <vm/vm.h>		/* for pmap_mapdev() */
78 #include <vm/pmap.h>
79 
80 #include <dev/mxge/mxge_mcp.h>
81 #include <dev/mxge/mcp_gen_header.h>
82 #include <dev/mxge/if_mxge_var.h>
83 
84 /* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on an upstream
					   Nvidia (nForce4) bridge */
static int mxge_max_intr_slots = 1024;	/* entries in the rx_done interrupt
					   queue shared with the firmware */
static int mxge_intr_coal_delay = 30;	/* interrupt coalescing delay (usecs) */
static int mxge_deassert_wait = 1;	/* wait for IRQ line to deassert in
					   the interrupt handler */
static int mxge_flow_control = 1;	/* default flow-control (pause) setting;
					   presumably consumed at attach —
					   not visible in this chunk */
static int mxge_verbose = 0;		/* enable chatty diagnostics */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";	/* fw image tolerating
							   unaligned completions */
static char *mxge_fw_aligned = "mxge_eth_z8e";	/* fw image requiring aligned
						   PCIe completions */

/* newbus entry points and the interrupt handler (defined later in the file) */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);
99 
/* Newbus device methods implemented by this driver. */
static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}	/* terminator */
};
109 
/* Driver description handed to newbus: name, method table, softc size. */
static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};
116 
static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
/* firmware images are fetched through firmware(9) */
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
122 
123 static int
124 mxge_probe(device_t dev)
125 {
126   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
127       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E)) {
128 	  device_set_desc(dev, "Myri10G-PCIE-8A");
129 	  return 0;
130   }
131   return ENXIO;
132 }
133 
134 static void
135 mxge_enable_wc(mxge_softc_t *sc)
136 {
137 	struct mem_range_desc mrdesc;
138 	vm_paddr_t pa;
139 	vm_offset_t len;
140 	int err, action;
141 
142 	pa = rman_get_start(sc->mem_res);
143 	len = rman_get_size(sc->mem_res);
144 	mrdesc.mr_base = pa;
145 	mrdesc.mr_len = len;
146 	mrdesc.mr_flags = MDF_WRITECOMBINE;
147 	action = MEMRANGE_SET_UPDATE;
148 	strcpy((char *)&mrdesc.mr_owner, "mxge");
149 	err = mem_range_attr_set(&mrdesc, &action);
150 	if (err != 0) {
151 		device_printf(sc->dev,
152 			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
153 			      (unsigned long)pa, (unsigned long)len, err);
154 	} else {
155 		sc->wc = 1;
156 	}
157 }
158 
159 
160 /* callback to get our DMA address */
161 static void
162 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
163 			 int error)
164 {
165 	if (error == 0) {
166 		*(bus_addr_t *) arg = segs->ds_addr;
167 	}
168 }
169 
/*
 * Allocate a physically contiguous DMA buffer of 'bytes' bytes with
 * the requested alignment: create a tag, allocate zeroed coherent
 * memory, and load the map so dma->bus_addr holds the device-visible
 * address.  Returns 0 or a bus_dma error code; on failure every
 * partially acquired resource is released (goto-cleanup pattern).
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 4096,			/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback records the bus address */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
220 
221 
/* Release a buffer obtained from mxge_dma_alloc():
 * unload the map, free the memory, destroy the tag. */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
229 
230 /*
231  * The eeprom strings on the lanaiX have the format
232  * SN=x\0
233  * MAC=x:x:x:x:x:x\0
234  * PC=text\0
235  */
236 
237 static int
238 mxge_parse_strings(mxge_softc_t *sc)
239 {
240 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
241 
242 	char *ptr, *limit;
243 	int i, found_mac;
244 
245 	ptr = sc->eeprom_strings;
246 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
247 	found_mac = 0;
248 	while (ptr < limit && *ptr != '\0') {
249 		if (memcmp(ptr, "MAC=", 4) == 0) {
250 			ptr += 1;
251 			sc->mac_addr_string = ptr;
252 			for (i = 0; i < 6; i++) {
253 				ptr += 3;
254 				if ((ptr + 2) > limit)
255 					goto abort;
256 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
257 				found_mac = 1;
258 			}
259 		} else if (memcmp(ptr, "PC=", 3) == 0) {
260 			ptr += 3;
261 			strncpy(sc->product_code_string, ptr,
262 				sizeof (sc->product_code_string) - 1);
263 		} else if (memcmp(ptr, "SN=", 3) == 0) {
264 			ptr += 3;
265 			strncpy(sc->serial_number_string, ptr,
266 				sizeof (sc->serial_number_string) - 1);
267 		}
268 		MXGE_NEXT_STRING(ptr);
269 	}
270 
271 	if (found_mac)
272 		return 0;
273 
274  abort:
275 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
276 
277 	return ENXIO;
278 }
279 
#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
/*
 * Enable ECRC generation on the upstream Nvidia bridge by setting
 * bit 0x40 in its extended config register at offset 0x178.  Since
 * normal config accesses cannot reach offsets beyond 0xff, the
 * bridge's extended config space is mapped directly (pmap_mapdev of
 * the chipset's 0xe0000000 window).  Returns 0 on success, EIO if
 * the mapping or its sanity checks fail.
 */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	uint32_t val;
	unsigned long off;
	char *va, *cfgptr;
	uint16_t vendor_id, device_id;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return 0;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	/* gather the bridge's bus/slot/function plus vendor/device IDs
	   so we can compute its extended config address and verify the
	   mapping below */
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* physical address of this device's extended config space */
	off =  0xe0000000UL
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return EIO;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}

	/* extended config register 0x178; bit 0x40 enables ECRC */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return EIO;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return 0;
}
#else
/* Non-x86 stub: the nForce4 extended-config hack above only makes
 * sense on i386/amd64 machines. */
static int
mxge_enable_nvidia_ecrc(mxge_softc_t *sc, device_t pdev)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return ENXIO;
}
#endif
379 /*
380  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
381  * when the PCI-E Completion packets are aligned on an 8-byte
382  * boundary.  Some PCI-E chip sets always align Completion packets; on
383  * the ones that do not, the alignment can be enforced by enabling
384  * ECRC generation (if supported).
385  *
386  * When PCI-E Completion packets are not aligned, it is actually more
387  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
388  *
389  * If the driver can neither enable ECRC nor verify that it has
390  * already been enabled, then it must use a firmware image which works
391  * around unaligned completion packets (ethp_z8e.dat), and it should
392  * also ensure that it never gives the device a Read-DMA which is
393  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
394  * enabled, then the driver should use the aligned (eth_z8e.dat)
395  * firmware image, and set tx.boundary to 4KB.
396  */
397 
398 static void
399 mxge_select_firmware(mxge_softc_t *sc)
400 {
401 	int err, aligned = 0;
402 	device_t pdev;
403 	uint16_t pvend, pdid;
404 
405 	pdev = device_get_parent(device_get_parent(sc->dev));
406 	if (pdev == NULL) {
407 		device_printf(sc->dev, "could not find parent?\n");
408 		goto abort;
409 	}
410 	pvend = pci_read_config(pdev, PCIR_VENDOR, 2);
411 	pdid = pci_read_config(pdev, PCIR_DEVICE, 2);
412 
413 	/* see if we can enable ECRC's on an upstream
414 	   Nvidia bridge */
415 	if (mxge_nvidia_ecrc_enable &&
416 	    (pvend == 0x10de && pdid == 0x005d)) {
417 		err = mxge_enable_nvidia_ecrc(sc, pdev);
418 		if (err == 0) {
419 			aligned = 1;
420 			if (mxge_verbose)
421 				device_printf(sc->dev,
422 					      "Assuming aligned completions"
423 					      " (ECRC)\n");
424 		}
425 	}
426 	/* see if the upstream bridge is known to
427 	   provided aligned completions */
428 	if (/* HT2000  */ (pvend == 0x1166 && pdid == 0x0132) ||
429 	    /* Ontario */ (pvend == 0x10b5 && pdid == 0x8532)) {
430 		if (mxge_verbose)
431 			device_printf(sc->dev,
432 				      "Assuming aligned completions "
433 				      "(0x%x:0x%x)\n", pvend, pdid);
434 	}
435 
436 abort:
437 	if (aligned) {
438 		sc->fw_name = mxge_fw_aligned;
439 		sc->tx.boundary = 4096;
440 	} else {
441 		sc->fw_name = mxge_fw_unaligned;
442 		sc->tx.boundary = 2048;
443 	}
444 }
445 
/* Used to strip the const qualifier from the firmware image pointer
 * without a cast that would draw compiler warnings; the rw_char view
 * is only ever used as a copy source (see mxge_load_firmware_helper). */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
451 
452 static int
453 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
454 {
455 	int major, minor;
456 
457 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
458 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
459 			      be32toh(hdr->mcp_type));
460 		return EIO;
461 	}
462 
463 	/* save firmware version for sysctl */
464 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
465 	if (mxge_verbose)
466 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
467 
468 	sscanf(sc->fw_version, "%d.%d", &major, &minor);
469 
470 	if (!(major == MXGEFW_VERSION_MAJOR
471 	      && minor == MXGEFW_VERSION_MINOR)) {
472 		device_printf(sc->dev, "Found firmware version %s\n",
473 			      sc->fw_version);
474 		device_printf(sc->dev, "Driver needs %d.%d\n",
475 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
476 		return EINVAL;
477 	}
478 	return 0;
479 
480 }
481 
482 static int
483 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
484 {
485 	struct firmware *fw;
486 	const mcp_gen_header_t *hdr;
487 	unsigned hdr_offset;
488 	const char *fw_data;
489 	union qualhack hack;
490 	int status;
491 	unsigned int i;
492 	char dummy;
493 
494 
495 	fw = firmware_get(sc->fw_name);
496 
497 	if (fw == NULL) {
498 		device_printf(sc->dev, "Could not find firmware image %s\n",
499 			      sc->fw_name);
500 		return ENOENT;
501 	}
502 	if (fw->datasize > *limit ||
503 	    fw->datasize < MCP_HEADER_PTR_OFFSET + 4) {
504 		device_printf(sc->dev, "Firmware image %s too large (%d/%d)\n",
505 			      sc->fw_name, (int)fw->datasize, (int) *limit);
506 		status = ENOSPC;
507 		goto abort_with_fw;
508 	}
509 	*limit = fw->datasize;
510 
511 	/* check id */
512 	fw_data = (const char *)fw->data;
513 	hdr_offset = htobe32(*(const uint32_t *)
514 			     (fw_data + MCP_HEADER_PTR_OFFSET));
515 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw->datasize) {
516 		device_printf(sc->dev, "Bad firmware file");
517 		status = EIO;
518 		goto abort_with_fw;
519 	}
520 	hdr = (const void*)(fw_data + hdr_offset);
521 
522 	status = mxge_validate_firmware(sc, hdr);
523 	if (status != 0)
524 		goto abort_with_fw;
525 
526 	hack.ro_char = fw_data;
527 	/* Copy the inflated firmware to NIC SRAM. */
528 	for (i = 0; i < *limit; i += 256) {
529 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
530 			      hack.rw_char + i,
531 			      min(256U, (unsigned)(*limit - i)));
532 		mb();
533 		dummy = *sc->sram;
534 		mb();
535 	}
536 
537 	status = 0;
538 abort_with_fw:
539 	firmware_put(fw, FIRMWARE_UNLOAD);
540 	return status;
541 }
542 
543 /*
544  * Enable or disable periodic RDMAs from the host to make certain
545  * chipsets resend dropped PCIe messages
546  */
547 
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align the request buffer on an 8 byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	/* doorbell for the dummy-rdma request at a fixed SRAM offset */
	submit = (volatile char *)(sc->sram + 0xfc01c0);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll up to ~20ms for the firmware's -1 confirmation */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
599 
600 static int
601 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
602 {
603 	mcp_cmd_t *buf;
604 	char buf_bytes[sizeof(*buf) + 8];
605 	volatile mcp_cmd_response_t *response = sc->cmd;
606 	volatile char *cmd_addr = sc->sram + MXGEFW_CMD_OFFSET;
607 	uint32_t dma_low, dma_high;
608 	int sleep_total = 0;
609 
610 	/* ensure buf is aligned to 8 bytes */
611 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
612 
613 	buf->data0 = htobe32(data->data0);
614 	buf->data1 = htobe32(data->data1);
615 	buf->data2 = htobe32(data->data2);
616 	buf->cmd = htobe32(cmd);
617 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
618 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
619 
620 	buf->response_addr.low = htobe32(dma_low);
621 	buf->response_addr.high = htobe32(dma_high);
622 	mtx_lock(&sc->cmd_lock);
623 	response->result = 0xffffffff;
624 	mb();
625 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
626 
627 	/* wait up to 20ms */
628 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
629 		bus_dmamap_sync(sc->cmd_dma.dmat,
630 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
631 		mb();
632 		if (response->result != 0xffffffff) {
633 			if (response->result == 0) {
634 				data->data0 = be32toh(response->data);
635 				mtx_unlock(&sc->cmd_lock);
636 				return 0;
637 			} else {
638 				device_printf(sc->dev,
639 					      "mxge: command %d "
640 					      "failed, result = %d\n",
641 					      cmd, be32toh(response->result));
642 				mtx_unlock(&sc->cmd_lock);
643 				return ENXIO;
644 			}
645 		}
646 		DELAY(1000);
647 	}
648 	mtx_unlock(&sc->cmd_lock);
649 	device_printf(sc->dev, "mxge: command %d timed out"
650 		      "result = %d\n",
651 		      cmd, be32toh(response->result));
652 	return EAGAIN;
653 }
654 
655 static int
656 mxge_adopt_running_firmware(mxge_softc_t *sc)
657 {
658 	struct mcp_gen_header *hdr;
659 	const size_t bytes = sizeof (struct mcp_gen_header);
660 	size_t hdr_offset;
661 	int status;
662 
663 	/* find running firmware header */
664 	hdr_offset = htobe32(*(volatile uint32_t *)
665 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
666 
667 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
668 		device_printf(sc->dev,
669 			      "Running firmware has bad header offset (%d)\n",
670 			      (int)hdr_offset);
671 		return EIO;
672 	}
673 
674 	/* copy header of running firmware from SRAM to host memory to
675 	 * validate firmware */
676 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
677 	if (hdr == NULL) {
678 		device_printf(sc->dev, "could not malloc firmware hdr\n");
679 		return ENOMEM;
680 	}
681 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
682 				rman_get_bushandle(sc->mem_res),
683 				hdr_offset, (char *)hdr, bytes);
684 	status = mxge_validate_firmware(sc, hdr);
685 	free(hdr, M_DEVBUF);
686 	return status;
687 }
688 
689 
/*
 * Load firmware onto the NIC: try the image selected by
 * mxge_select_firmware(); if that fails, fall back to adopting the
 * firmware already running on the NIC.  Then hand control to the
 * new image through the bootstrap MCP and poll for its confirmation.
 * Returns 0 or an errno.
 */
static int
mxge_load_firmware(mxge_softc_t *sc)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align the handoff request buffer on an 8 byte boundary */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx.boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}

	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	/* bootstrap MCP doorbell at a fixed SRAM offset */
	submit = (volatile char *)(sc->sram + 0xfc0000);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	/* poll up to ~200ms for the firmware's -1 confirmation */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
770 
771 static int
772 mxge_update_mac_address(mxge_softc_t *sc)
773 {
774 	mxge_cmd_t cmd;
775 	uint8_t *addr = sc->mac_addr;
776 	int status;
777 
778 
779 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
780 		     | (addr[2] << 8) | addr[3]);
781 
782 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
783 
784 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
785 	return status;
786 }
787 
788 static int
789 mxge_change_pause(mxge_softc_t *sc, int pause)
790 {
791 	mxge_cmd_t cmd;
792 	int status;
793 
794 	if (pause)
795 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
796 				       &cmd);
797 	else
798 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
799 				       &cmd);
800 
801 	if (status) {
802 		device_printf(sc->dev, "Failed to set flow control mode\n");
803 		return ENXIO;
804 	}
805 	sc->pause = pause;
806 	return 0;
807 }
808 
809 static void
810 mxge_change_promisc(mxge_softc_t *sc, int promisc)
811 {
812 	mxge_cmd_t cmd;
813 	int status;
814 
815 	if (promisc)
816 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
817 				       &cmd);
818 	else
819 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
820 				       &cmd);
821 
822 	if (status) {
823 		device_printf(sc->dev, "Failed to set promisc mode\n");
824 	}
825 }
826 
827 static int
828 mxge_reset(mxge_softc_t *sc)
829 {
830 
831 	mxge_cmd_t cmd;
832 	mxge_dma_t dmabench_dma;
833 	size_t bytes;
834 	int status;
835 
836 	/* try to send a reset command to the card to see if it
837 	   is alive */
838 	memset(&cmd, 0, sizeof (cmd));
839 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
840 	if (status != 0) {
841 		device_printf(sc->dev, "failed reset\n");
842 		return ENXIO;
843 	}
844 
845 	mxge_dummy_rdma(sc, 1);
846 
847 	/* Now exchange information about interrupts  */
848 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);\
849 	memset(sc->rx_done.entry, 0, bytes);
850 	cmd.data0 = (uint32_t)bytes;
851 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
852 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->rx_done.dma.bus_addr);
853 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->rx_done.dma.bus_addr);
854 	status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA, &cmd);
855 
856 	status |= mxge_send_cmd(sc,
857 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
858 
859 
860 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
861 
862 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
863 	sc->irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
864 
865 
866 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
867 				&cmd);
868 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
869 	if (status != 0) {
870 		device_printf(sc->dev, "failed set interrupt parameters\n");
871 		return status;
872 	}
873 
874 
875 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
876 
877 
878 	/* run a DMA benchmark */
879 	sc->read_dma = sc->write_dma = sc->read_write_dma = 0;
880 	status = mxge_dma_alloc(sc, &dmabench_dma, 4096, 4096);
881 	if (status)
882 		goto dmabench_fail;
883 
884 	/* Read DMA */
885 	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
886 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
887 	cmd.data2 = sc->tx.boundary * 0x10000;
888 
889 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
890 	if (status != 0)
891 		device_printf(sc->dev, "read dma benchmark failed\n");
892 	else
893 		sc->read_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
894 			(cmd.data0 & 0xffff);
895 
896 	/* Write DMA */
897 	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
898 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
899 	cmd.data2 = sc->tx.boundary * 0x1;
900 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
901 	if (status != 0)
902 		device_printf(sc->dev, "write dma benchmark failed\n");
903 	else
904 		sc->write_dma = ((cmd.data0>>16) * sc->tx.boundary * 2) /
905 			(cmd.data0 & 0xffff);
906 	/* Read/Write DMA */
907 	cmd.data0 = MXGE_LOWPART_TO_U32(dmabench_dma.bus_addr);
908 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmabench_dma.bus_addr);
909 	cmd.data2 = sc->tx.boundary * 0x10001;
910 	status = mxge_send_cmd(sc, MXGEFW_DMA_TEST, &cmd);
911 	if (status != 0)
912 		device_printf(sc->dev, "read/write dma benchmark failed\n");
913 	else
914 		sc->read_write_dma =
915 			((cmd.data0>>16) * sc->tx.boundary * 2 * 2) /
916 			(cmd.data0 & 0xffff);
917 
918 	mxge_dma_free(&dmabench_dma);
919 
920 dmabench_fail:
921 	/* reset mcp/driver shared state back to 0 */
922 	bzero(sc->rx_done.entry, bytes);
923 	sc->rx_done.idx = 0;
924 	sc->rx_done.cnt = 0;
925 	sc->tx.req = 0;
926 	sc->tx.done = 0;
927 	sc->tx.pkt_done = 0;
928 	sc->rx_big.cnt = 0;
929 	sc->rx_small.cnt = 0;
930 	sc->rdma_tags_available = 15;
931 	status = mxge_update_mac_address(sc);
932 	mxge_change_promisc(sc, 0);
933 	mxge_change_pause(sc, sc->pause);
934 	return status;
935 }
936 
937 static int
938 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
939 {
940         mxge_softc_t *sc;
941         unsigned int intr_coal_delay;
942         int err;
943 
944         sc = arg1;
945         intr_coal_delay = sc->intr_coal_delay;
946         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
947         if (err != 0) {
948                 return err;
949         }
950         if (intr_coal_delay == sc->intr_coal_delay)
951                 return 0;
952 
953         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
954                 return EINVAL;
955 
956 	sx_xlock(&sc->driver_lock);
957 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
958 	sc->intr_coal_delay = intr_coal_delay;
959 
960 	sx_xunlock(&sc->driver_lock);
961         return err;
962 }
963 
964 static int
965 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
966 {
967         mxge_softc_t *sc;
968         unsigned int enabled;
969         int err;
970 
971         sc = arg1;
972         enabled = sc->pause;
973         err = sysctl_handle_int(oidp, &enabled, arg2, req);
974         if (err != 0) {
975                 return err;
976         }
977         if (enabled == sc->pause)
978                 return 0;
979 
980 	sx_xlock(&sc->driver_lock);
981 	err = mxge_change_pause(sc, enabled);
982 	sx_xunlock(&sc->driver_lock);
983         return err;
984 }
985 
986 static int
987 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
988 {
989         int err;
990 
991         if (arg1 == NULL)
992                 return EFAULT;
993         arg2 = be32toh(*(int *)arg1);
994         arg1 = NULL;
995         err = sysctl_handle_int(oidp, arg1, arg2, req);
996 
997         return err;
998 }
999 
1000 static void
1001 mxge_add_sysctls(mxge_softc_t *sc)
1002 {
1003 	struct sysctl_ctx_list *ctx;
1004 	struct sysctl_oid_list *children;
1005 	mcp_irq_data_t *fw;
1006 
1007 	ctx = device_get_sysctl_ctx(sc->dev);
1008 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1009 	fw = sc->fw_stats;
1010 
1011 	/* random information */
1012 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1013 		       "firmware_version",
1014 		       CTLFLAG_RD, &sc->fw_version,
1015 		       0, "firmware version");
1016 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1017 		       "serial_number",
1018 		       CTLFLAG_RD, &sc->serial_number_string,
1019 		       0, "serial number");
1020 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1021 		       "product_code",
1022 		       CTLFLAG_RD, &sc->product_code_string,
1023 		       0, "product_code");
1024 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1025 		       "tx_boundary",
1026 		       CTLFLAG_RD, &sc->tx.boundary,
1027 		       0, "tx_boundary");
1028 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1029 		       "write_combine",
1030 		       CTLFLAG_RD, &sc->wc,
1031 		       0, "write combining PIO?");
1032 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1033 		       "read_dma_MBs",
1034 		       CTLFLAG_RD, &sc->read_dma,
1035 		       0, "DMA Read speed in MB/s");
1036 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1037 		       "write_dma_MBs",
1038 		       CTLFLAG_RD, &sc->write_dma,
1039 		       0, "DMA Write speed in MB/s");
1040 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1041 		       "read_write_dma_MBs",
1042 		       CTLFLAG_RD, &sc->read_write_dma,
1043 		       0, "DMA concurrent Read/Write speed in MB/s");
1044 
1045 
1046 	/* performance related tunables */
1047 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1048 			"intr_coal_delay",
1049 			CTLTYPE_INT|CTLFLAG_RW, sc,
1050 			0, mxge_change_intr_coal,
1051 			"I", "interrupt coalescing delay in usecs");
1052 
1053 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1054 			"flow_control_enabled",
1055 			CTLTYPE_INT|CTLFLAG_RW, sc,
1056 			0, mxge_change_flow_control,
1057 			"I", "interrupt coalescing delay in usecs");
1058 
1059 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1060 		       "deassert_wait",
1061 		       CTLFLAG_RW, &mxge_deassert_wait,
1062 		       0, "Wait for IRQ line to go low in ihandler");
1063 
1064 	/* stats block from firmware is in network byte order.
1065 	   Need to swap it */
1066 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1067 			"link_up",
1068 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1069 			0, mxge_handle_be32,
1070 			"I", "link up");
1071 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1072 			"rdma_tags_available",
1073 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1074 			0, mxge_handle_be32,
1075 			"I", "rdma_tags_available");
1076 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1077 			"dropped_link_overflow",
1078 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1079 			0, mxge_handle_be32,
1080 			"I", "dropped_link_overflow");
1081 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1082 			"dropped_link_error_or_filtered",
1083 			CTLTYPE_INT|CTLFLAG_RD,
1084 			&fw->dropped_link_error_or_filtered,
1085 			0, mxge_handle_be32,
1086 			"I", "dropped_link_error_or_filtered");
1087 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1088 			"dropped_runt",
1089 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1090 			0, mxge_handle_be32,
1091 			"I", "dropped_runt");
1092 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1093 			"dropped_overrun",
1094 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1095 			0, mxge_handle_be32,
1096 			"I", "dropped_overrun");
1097 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1098 			"dropped_no_small_buffer",
1099 			CTLTYPE_INT|CTLFLAG_RD,
1100 			&fw->dropped_no_small_buffer,
1101 			0, mxge_handle_be32,
1102 			"I", "dropped_no_small_buffer");
1103 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1104 			"dropped_no_big_buffer",
1105 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1106 			0, mxge_handle_be32,
1107 			"I", "dropped_no_big_buffer");
1108 
1109 	/* host counters exported for debugging */
1110 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1111 		       "rx_small_cnt",
1112 		       CTLFLAG_RD, &sc->rx_small.cnt,
1113 		       0, "rx_small_cnt");
1114 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1115 		       "rx_big_cnt",
1116 		       CTLFLAG_RD, &sc->rx_big.cnt,
1117 		       0, "rx_small_cnt");
1118 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1119 		       "tx_req",
1120 		       CTLFLAG_RD, &sc->tx.req,
1121 		       0, "tx_req");
1122 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1123 		       "tx_done",
1124 		       CTLFLAG_RD, &sc->tx.done,
1125 		       0, "tx_done");
1126 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1127 		       "tx_pkt_done",
1128 		       CTLFLAG_RD, &sc->tx.pkt_done,
1129 		       0, "tx_done");
1130 
1131 	/* verbose printing? */
1132 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1133 		       "verbose",
1134 		       CTLFLAG_RW, &mxge_verbose,
1135 		       0, "verbose printing");
1136 
1137 }
1138 
1139 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1140    backwards one at a time and handle ring wraps */
1141 
1142 static inline void
1143 mxge_submit_req_backwards(mxge_tx_buf_t *tx,
1144 			    mcp_kreq_ether_send_t *src, int cnt)
1145 {
1146         int idx, starting_slot;
1147         starting_slot = tx->req;
1148         while (cnt > 1) {
1149                 cnt--;
1150                 idx = (starting_slot + cnt) & tx->mask;
1151                 mxge_pio_copy(&tx->lanai[idx],
1152 			      &src[cnt], sizeof(*src));
1153                 mb();
1154         }
1155 }
1156 
1157 /*
1158  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1159  * at most 32 bytes at a time, so as to avoid involving the software
1160  * pio handler in the nic.   We re-write the first segment's flags
1161  * to mark them valid only after writing the entire chain
1162  */
1163 
/*
 * Copy the cnt-entry request list in src to the NIC's send ring
 * starting at slot tx->req.  The first descriptor's flags byte is
 * cleared before anything is copied and only rewritten -- as the very
 * last 32-bit store -- after the whole chain is in place, so the
 * firmware can never observe a partially written chain.
 */
static inline void
mxge_submit_req(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* hide the valid flags until the entire chain is written */
	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy two descriptors (32 bytes) per burst */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request (wrap case), or the odd
                   trailing request left over by the loop above */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
1212 
/*
 * Submit cnt requests through the NIC's write-combining FIFO instead
 * of the normal send ring.  Descriptors are pushed in 64-byte
 * (4-descriptor) bursts; tx->req is advanced up front since the FIFO
 * write itself is what hands the requests to the NIC.
 */
static inline void
mxge_submit_req_wc(mxge_tx_buf_t *tx, mcp_kreq_ether_send_t *src, int cnt)
{
    tx->req += cnt;
    mb();
    while (cnt >= 4) {
	    mxge_pio_copy((volatile char *)tx->wc_fifo, src, 64);
	    mb();
	    src += 4;
	    cnt -= 4;
    }
    if (cnt > 0) {
	    /* pad it to 64 bytes.  The src is 64 bytes bigger than it
	       needs to be so that we don't overrun it */
	    /* NOTE(review): the (cnt<<18) FIFO address offset presumably
	       encodes how many of the 4 padded slots are valid -- confirm
	       against the Myri10GE MCP interface documentation */
	    mxge_pio_copy(tx->wc_fifo + (cnt<<18), src, 64);
	    mb();
    }
}
1231 
/*
 * Map an outbound mbuf chain for DMA and hand it to the NIC: build a
 * send-descriptor list covering every DMA segment, fill in the
 * checksum-offload fields when CSUM_DELAY_DATA is requested, pad runt
 * frames to the 60-byte minimum using a zeroed DMA page, and submit
 * the list via PIO or the write-combining FIFO.  On failure the mbuf
 * is freed and counted in if_oerrors.  Caller holds sc->tx_lock.
 */
static void
mxge_encap(mxge_softc_t *sc, struct mbuf *m)
{
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t seg_list[MXGE_MAX_SEND_DESC];
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct ether_header *eh;
	struct ip *ip;
	int cnt, cum_len, err, i, idx;
	uint16_t flags, pseudo_hdr_offset;
        uint8_t cksum_offset;



	ifp = sc->ifp;
	tx = &sc->tx;

	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (err == EFBIG) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (err != 0) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d\n",
			      err);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		eh = mtod(m, struct ether_header *);
		ip = (struct ip *) (eh + 1);
		/* checksumming starts just past the IP header; the fold
		   location (big-endian for the firmware) is at
		   cksum_offset + the stack-provided csum_data */
		cksum_offset = sizeof(*eh) + (ip->ip_hl << 2);
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* make cksum_offset relative to the start of the next
		   segment; zero once the checksum start is passed */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
		cnt++;
	}

	/* the first descriptor's rdma_count carries the total count */
	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* flag the packet's last slot so mxge_tx_done() counts packets */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	if (tx->wc_fifo == NULL)
		mxge_submit_req(tx, tx->req_list, cnt);
	else
		mxge_submit_req_wc(tx, tx->req_list, cnt);
	return;

drop:
	m_freem(m);
	ifp->if_oerrors++;
	return;
}
1365 
1366 
1367 
1368 
1369 static inline void
1370 mxge_start_locked(mxge_softc_t *sc)
1371 {
1372 	struct mbuf *m;
1373 	struct ifnet *ifp;
1374 
1375 	ifp = sc->ifp;
1376 	while ((sc->tx.mask - (sc->tx.req - sc->tx.done))
1377 	       > MXGE_MAX_SEND_DESC) {
1378 
1379 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
1380 		if (m == NULL) {
1381 			return;
1382 		}
1383 		/* let BPF see it */
1384 		BPF_MTAP(ifp, m);
1385 
1386 		/* give it to the nic */
1387 		mxge_encap(sc, m);
1388 	}
1389 	/* ran out of transmit slots */
1390 	sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
1391 }
1392 
1393 static void
1394 mxge_start(struct ifnet *ifp)
1395 {
1396 	mxge_softc_t *sc = ifp->if_softc;
1397 
1398 
1399 	mtx_lock(&sc->tx_lock);
1400 	mxge_start_locked(sc);
1401 	mtx_unlock(&sc->tx_lock);
1402 }
1403 
1404 /*
1405  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
1406  * at most 32 bytes at a time, so as to avoid involving the software
1407  * pio handler in the nic.   We re-write the first segment's low
1408  * DMA address to mark it valid only after we write the entire chunk
1409  * in a burst
1410  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* temporarily replace the first low address with an invalid
	   value so the NIC ignores the block while it is in flight */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	mxge_pio_copy(dst, src, 8 * sizeof (*src));
	mb();
	/* restoring the real low address marks all 8 entries valid */
	dst->addr_low = low;
	mb();
}
1424 
/*
 * Allocate a small (MHLEN) receive mbuf, DMA-map it with the supplied
 * map, and record its bus address in the small-ring shadow at slot
 * idx.  Receive slots are handed to the NIC in groups of 8: when the
 * last slot of a group is reached, the whole group is submitted (via
 * mxge_submit_8rx() or the write-combining FIFO) -- even on failure,
 * in which case the stale shadow entry recycles the previous buffer.
 * Returns 0 on success or an errno value.
 */
static int
mxge_get_buf_small(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_buf_t *rx = &sc->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* submit the group of 8 when its last slot has been handled */
	if ((idx & 7) == 7) {
		if (rx->wc_fifo == NULL)
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		else {
			mb();
			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
		}
        }
	return err;
}
1464 
/*
 * Allocate a big (sc->big_bytes, cluster-backed) receive mbuf,
 * DMA-map it with the supplied map, and record its bus address in the
 * big-ring shadow at slot idx.  As with the small ring, slots are
 * submitted to the NIC in groups of 8 once the group's last slot has
 * been handled -- even on failure, which recycles the previous
 * buffer.  Returns 0 on success or an errno value.
 */
static int
mxge_get_buf_big(mxge_softc_t *sc, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_buf_t *rx = &sc->rx_big;
	int cnt, err;

	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, sc->big_bytes);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = sc->big_bytes;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* submit the group of 8 when its last slot has been handled */
	if ((idx & 7) == 7) {
		if (rx->wc_fifo == NULL)
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		else {
			mb();
			mxge_pio_copy(rx->wc_fifo, &rx->shadow[idx - 7], 64);
		}
        }
	return err;
}
1504 
1505 static inline void
1506 mxge_rx_csum(struct mbuf *m, int csum)
1507 {
1508 	struct ether_header *eh;
1509 	struct ip *ip;
1510 
1511 	eh = mtod(m, struct ether_header *);
1512 	if (__predict_true(eh->ether_type ==  htons(ETHERTYPE_IP))) {
1513 		ip = (struct ip *)(eh + 1);
1514 		if (__predict_true(ip->ip_p == IPPROTO_TCP ||
1515 				   ip->ip_p == IPPROTO_UDP)) {
1516 			m->m_pkthdr.csum_data = csum;
1517 			m->m_pkthdr.csum_flags = CSUM_DATA_VALID;
1518 		}
1519 	}
1520 }
1521 
/*
 * Receive-completion handler for frames larger than MHLEN: the frame
 * occupies one or more big-ring buffers.  Each consumed slot is
 * refilled with a fresh mbuf (swapping in the spare DMA map) before
 * the old buffer is chained onto the packet; the finished chain is
 * checksum-stamped and passed up the stack.  If any refill fails, the
 * frame is dropped and the remaining slots are still run through the
 * allocator so their buffers get recycled.
 */
static inline void
mxge_rx_done_big(mxge_softc_t *sc, int len, int csum)
{
	struct ifnet *ifp;
	struct mbuf *m = 0; 		/* -Wunitialized */
	struct mbuf *m_prev = 0;	/* -Wunitialized */
	struct mbuf *m_head = 0;
	bus_dmamap_t old_map;
	mxge_rx_buf_t *rx;
	int idx;


	rx = &sc->rx_big;
	ifp = sc->ifp;
	while (len > 0) {
		idx = rx->cnt & rx->mask;
                rx->cnt++;
		/* save a pointer to the received mbuf */
		m = rx->info[idx].m;
		/* try to replace the received mbuf */
		if (mxge_get_buf_big(sc, rx->extra_map, idx)) {
			goto drop;
		}
		/* unmap the received buffer */
		old_map = rx->info[idx].map;
		bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(rx->dmat, old_map);

		/* swap the bus_dmamap_t's */
		rx->info[idx].map = rx->extra_map;
		rx->extra_map = old_map;

		/* chain multiple segments together */
		if (!m_head) {
			m_head = m;
			/* mcp implicitly skips 1st bytes so that
			 * packet is properly aligned */
			m->m_data += MXGEFW_PAD;
			m->m_pkthdr.len = len;
			m->m_len = sc->big_bytes - MXGEFW_PAD;
		} else {
			m->m_len = sc->big_bytes;
			m->m_flags &= ~M_PKTHDR;
			m_prev->m_next = m;
		}
		len -= m->m_len;
		m_prev = m;
	}

	/* trim trailing garbage from the last mbuf in the chain.  If
	 * there is any garbage, len will be negative */
	m->m_len += len;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m_head, csum);

	/* pass the frame up the stack */
	m_head->m_pkthdr.rcvif = ifp;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m_head);
	return;

drop:
	/* drop the frame -- the old mbuf(s) are re-cycled by running
	   every slot through the allocator */
	/* first account for the buffer(s) consumed before the failure */
        if (m_head) {
                len -= sc->big_bytes;
                m_freem(m_head);
        } else {
                len -= (sc->big_bytes + MXGEFW_PAD);
        }
        while ((int)len > 0) {
                idx = rx->cnt & rx->mask;
                rx->cnt++;
                m = rx->info[idx].m;
                if (0 == (mxge_get_buf_big(sc, rx->extra_map, idx))) {
			m_freem(m);
			/* unmap the received buffer */
			old_map = rx->info[idx].map;
			bus_dmamap_sync(rx->dmat, old_map,
					BUS_DMASYNC_POSTREAD);
			bus_dmamap_unload(rx->dmat, old_map);

			/* swap the bus_dmamap_t's */
			rx->info[idx].map = rx->extra_map;
			rx->extra_map = old_map;
		}
                len -= sc->big_bytes;
        }

	ifp->if_ierrors++;

}
1616 
/*
 * Receive-completion handler for frames that fit in a single small
 * (MHLEN) buffer.  The filled mbuf is replaced in the ring (swapping
 * in the spare DMA map), unmapped, stamped with length and checksum
 * information, and passed up the stack.  If no replacement can be
 * allocated, the frame is dropped and the old buffer stays in the
 * ring.
 */
static inline void
mxge_rx_done_small(mxge_softc_t *sc, uint32_t len, uint32_t csum)
{
	struct ifnet *ifp;
	struct mbuf *m;
	mxge_rx_buf_t *rx;
	bus_dmamap_t old_map;
	int idx;

	ifp = sc->ifp;
	rx = &sc->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(sc, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag)
		mxge_rx_csum(m, csum);

	/* pass the frame up the stack */
	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ifp->if_ipackets++;
	(*ifp->if_input)(ifp, m);
}
1662 
1663 static inline void
1664 mxge_clean_rx_done(mxge_softc_t *sc)
1665 {
1666 	mxge_rx_done_t *rx_done = &sc->rx_done;
1667 	int limit = 0;
1668 	uint16_t length;
1669 	uint16_t checksum;
1670 
1671 
1672 	while (rx_done->entry[rx_done->idx].length != 0) {
1673 		length = ntohs(rx_done->entry[rx_done->idx].length);
1674 		rx_done->entry[rx_done->idx].length = 0;
1675 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
1676 		if (length <= MHLEN)
1677 			mxge_rx_done_small(sc, length, checksum);
1678 		else
1679 			mxge_rx_done_big(sc, length, checksum);
1680 		rx_done->cnt++;
1681 		rx_done->idx = rx_done->cnt & (mxge_max_intr_slots - 1);
1682 
1683 		/* limit potential for livelock */
1684 		if (__predict_false(++limit > 2 * mxge_max_intr_slots))
1685 			break;
1686 
1687 	}
1688 }
1689 
1690 
/*
 * Reclaim completed transmit slots up to the firmware's cumulative
 * packet-completion count (mcp_idx): walk forward from tx->done,
 * unloading the DMA map and freeing the mbuf attached to each
 * packet's first descriptor, and advance tx->pkt_done at each slot
 * flagged as end-of-packet by mxge_encap().  Once the ring is at
 * least 3/4 empty, clear IFF_DRV_OACTIVE and restart transmission.
 */
static inline void
mxge_tx_done(mxge_softc_t *sc, uint32_t mcp_idx)
{
	struct ifnet *ifp;
	mxge_tx_buf_t *tx;
	struct mbuf *m;
	bus_dmamap_t map;
	int idx, limit;

	limit = 0;
	tx = &sc->tx;
	ifp = sc->ifp;
	while (tx->pkt_done != mcp_idx) {
		idx = tx->done & tx->mask;
		tx->done++;
		m = tx->info[idx].m;
		/* mbuf and DMA map only attached to the first
		   segment per-mbuf */
		if (m != NULL) {
			ifp->if_opackets++;
			tx->info[idx].m = NULL;
			map = tx->info[idx].map;
			bus_dmamap_unload(tx->dmat, map);
			m_freem(m);
		}
		if (tx->info[idx].flag) {
			tx->info[idx].flag = 0;
			tx->pkt_done++;
		}
		/* limit potential for livelock by only handling
		   2 full tx rings per call */
		if (__predict_false(++limit >  2 * tx->mask))
			break;
	}

	/* If we have space, clear IFF_OACTIVE to tell the stack that
           its OK to send packets */

	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
	    tx->req - tx->done < (tx->mask + 1)/4) {
		mtx_lock(&sc->tx_lock);
		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
		mxge_start_locked(sc);
		mtx_unlock(&sc->tx_lock);
	}
}
1737 
/*
 * Legacy interrupt handler.  The firmware DMAs an mcp_irq_data_t
 * status block into host memory; stats->valid becoming non-zero is
 * the DMA-complete signal.  Deassert the IRQ line, then loop
 * harvesting tx completions and rx descriptors until the firmware
 * stops posting work, handle any link-state or stats update, and
 * finally write the irq_claim registers to re-arm the interrupt.
 */
static void
mxge_intr(void *arg)
{
	mxge_softc_t *sc = arg;
	mcp_irq_data_t *stats = sc->fw_stats;
	mxge_tx_buf_t *tx = &sc->tx;
	mxge_rx_done_t *rx_done = &sc->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	/* lower legacy IRQ  */
	*sc->irq_deassert = 0;
	mb();
	if (!mxge_deassert_wait)
		/* don't wait for conf. that irq is low */
		stats->valid = 0;
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			mxge_tx_done(sc, (int)send_done_count);
			mxge_clean_rx_done(sc);
			send_done_count = be32toh(stats->send_done_count);
		}
	} while (*((volatile uint8_t *) &stats->valid));

	/* the firmware periodically refreshes the rest of the stats
	   block; stats_updated flags a fresh copy */
	if (__predict_false(stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
		}
		if (sc->rdma_tags_available !=
		    be32toh(sc->fw_stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(sc->fw_stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}
		sc->down_cnt += stats->link_down;
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *sc->irq_claim = be32toh(3);
	*(sc->irq_claim + 1) = be32toh(3);
}
1800 
/*
 * ifnet watchdog entry point.  Not yet implemented; just log that it
 * fired.  (Uses the standard C99 __func__ instead of the GCC-specific
 * __FUNCTION__ extension.)
 */
static void
mxge_watchdog(struct ifnet *ifp)
{
	printf("%s called\n", __func__);
}
1806 
/*
 * ifnet if_init entry point.  Intentionally empty; interface
 * bring-up is handled elsewhere (see mxge_open()).
 */
static void
mxge_init(void *arg)
{
}
1811 
1812 
1813 
1814 static void
1815 mxge_free_mbufs(mxge_softc_t *sc)
1816 {
1817 	int i;
1818 
1819 	for (i = 0; i <= sc->rx_big.mask; i++) {
1820 		if (sc->rx_big.info[i].m == NULL)
1821 			continue;
1822 		bus_dmamap_unload(sc->rx_big.dmat,
1823 				  sc->rx_big.info[i].map);
1824 		m_freem(sc->rx_big.info[i].m);
1825 		sc->rx_big.info[i].m = NULL;
1826 	}
1827 
1828 	for (i = 0; i <= sc->rx_big.mask; i++) {
1829 		if (sc->rx_big.info[i].m == NULL)
1830 			continue;
1831 		bus_dmamap_unload(sc->rx_big.dmat,
1832 				  sc->rx_big.info[i].map);
1833 		m_freem(sc->rx_big.info[i].m);
1834 		sc->rx_big.info[i].m = NULL;
1835 	}
1836 
1837 	for (i = 0; i <= sc->tx.mask; i++) {
1838 		if (sc->tx.info[i].m == NULL)
1839 			continue;
1840 		bus_dmamap_unload(sc->tx.dmat,
1841 				  sc->tx.info[i].map);
1842 		m_freem(sc->tx.info[i].m);
1843 		sc->tx.info[i].m = NULL;
1844 	}
1845 }
1846 
/*
 * Release everything mxge_alloc_rings() allocated: the tx request
 * copy block, the rx shadow rings, the per-slot host info arrays and
 * their DMA maps, the spare rx maps, and finally the three DMA tags.
 * Safe to call on a partially initialized softc; every release is
 * guarded by a NULL check.
 */
static void
mxge_free_rings(mxge_softc_t *sc)
{
	int i;

	if (sc->tx.req_bytes != NULL) {
		free(sc->tx.req_bytes, M_DEVBUF);
	}
	if (sc->rx_small.shadow != NULL)
		free(sc->rx_small.shadow, M_DEVBUF);
	if (sc->rx_big.shadow != NULL)
		free(sc->rx_big.shadow, M_DEVBUF);
	if (sc->tx.info != NULL) {
		for (i = 0; i <= sc->tx.mask; i++) {
			if (sc->tx.info[i].map != NULL)
				bus_dmamap_destroy(sc->tx.dmat,
						   sc->tx.info[i].map);
		}
		free(sc->tx.info, M_DEVBUF);
	}
	if (sc->rx_small.info != NULL) {
		for (i = 0; i <= sc->rx_small.mask; i++) {
			if (sc->rx_small.info[i].map != NULL)
				bus_dmamap_destroy(sc->rx_small.dmat,
						   sc->rx_small.info[i].map);
		}
		free(sc->rx_small.info, M_DEVBUF);
	}
	if (sc->rx_big.info != NULL) {
		for (i = 0; i <= sc->rx_big.mask; i++) {
			if (sc->rx_big.info[i].map != NULL)
				bus_dmamap_destroy(sc->rx_big.dmat,
						   sc->rx_big.info[i].map);
		}
		free(sc->rx_big.info, M_DEVBUF);
	}
	if (sc->rx_big.extra_map != NULL)
		bus_dmamap_destroy(sc->rx_big.dmat,
				   sc->rx_big.extra_map);
	if (sc->rx_small.extra_map != NULL)
		bus_dmamap_destroy(sc->rx_small.dmat,
				   sc->rx_small.extra_map);
	if (sc->tx.dmat != NULL)
		bus_dma_tag_destroy(sc->tx.dmat);
	if (sc->rx_small.dmat != NULL)
		bus_dma_tag_destroy(sc->rx_small.dmat);
	if (sc->rx_big.dmat != NULL)
		bus_dma_tag_destroy(sc->rx_big.dmat);
}
1896 
1897 static int
1898 mxge_alloc_rings(mxge_softc_t *sc)
1899 {
1900 	mxge_cmd_t cmd;
1901 	int tx_ring_size, rx_ring_size;
1902 	int tx_ring_entries, rx_ring_entries;
1903 	int i, err;
1904 	unsigned long bytes;
1905 
1906 	/* get ring sizes */
1907 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1908 	tx_ring_size = cmd.data0;
1909 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1910 	if (err != 0) {
1911 		device_printf(sc->dev, "Cannot determine ring sizes\n");
1912 		goto abort_with_nothing;
1913 	}
1914 
1915 	rx_ring_size = cmd.data0;
1916 
1917 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
1918 	rx_ring_entries = rx_ring_size / sizeof (mcp_dma_addr_t);
1919 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
1920 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
1921 	IFQ_SET_READY(&sc->ifp->if_snd);
1922 
1923 	sc->tx.mask = tx_ring_entries - 1;
1924 	sc->rx_small.mask = sc->rx_big.mask = rx_ring_entries - 1;
1925 
1926 	err = ENOMEM;
1927 
1928 	/* allocate the tx request copy block */
1929 	bytes = 8 +
1930 		sizeof (*sc->tx.req_list) * (MXGE_MAX_SEND_DESC + 4);
1931 	sc->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
1932 	if (sc->tx.req_bytes == NULL)
1933 		goto abort_with_nothing;
1934 	/* ensure req_list entries are aligned to 8 bytes */
1935 	sc->tx.req_list = (mcp_kreq_ether_send_t *)
1936 		((unsigned long)(sc->tx.req_bytes + 7) & ~7UL);
1937 
1938 	/* allocate the rx shadow rings */
1939 	bytes = rx_ring_entries * sizeof (*sc->rx_small.shadow);
1940 	sc->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1941 	if (sc->rx_small.shadow == NULL)
1942 		goto abort_with_alloc;
1943 
1944 	bytes = rx_ring_entries * sizeof (*sc->rx_big.shadow);
1945 	sc->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1946 	if (sc->rx_big.shadow == NULL)
1947 		goto abort_with_alloc;
1948 
1949 	/* allocate the host info rings */
1950 	bytes = tx_ring_entries * sizeof (*sc->tx.info);
1951 	sc->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1952 	if (sc->tx.info == NULL)
1953 		goto abort_with_alloc;
1954 
1955 	bytes = rx_ring_entries * sizeof (*sc->rx_small.info);
1956 	sc->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1957 	if (sc->rx_small.info == NULL)
1958 		goto abort_with_alloc;
1959 
1960 	bytes = rx_ring_entries * sizeof (*sc->rx_big.info);
1961 	sc->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
1962 	if (sc->rx_big.info == NULL)
1963 		goto abort_with_alloc;
1964 
1965 	/* allocate the busdma resources */
1966 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
1967 				 1,			/* alignment */
1968 				 sc->tx.boundary,	/* boundary */
1969 				 BUS_SPACE_MAXADDR,	/* low */
1970 				 BUS_SPACE_MAXADDR,	/* high */
1971 				 NULL, NULL,		/* filter */
1972 				 MXGE_MAX_ETHER_MTU,	/* maxsize */
1973 				 MXGE_MAX_SEND_DESC,	/* num segs */
1974 				 sc->tx.boundary,	/* maxsegsize */
1975 				 BUS_DMA_ALLOCNOW,	/* flags */
1976 				 NULL, NULL,		/* lock */
1977 				 &sc->tx.dmat);		/* tag */
1978 
1979 	if (err != 0) {
1980 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
1981 			      err);
1982 		goto abort_with_alloc;
1983 	}
1984 
1985 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
1986 				 1,			/* alignment */
1987 				 4096,			/* boundary */
1988 				 BUS_SPACE_MAXADDR,	/* low */
1989 				 BUS_SPACE_MAXADDR,	/* high */
1990 				 NULL, NULL,		/* filter */
1991 				 MHLEN,			/* maxsize */
1992 				 1,			/* num segs */
1993 				 MHLEN,			/* maxsegsize */
1994 				 BUS_DMA_ALLOCNOW,	/* flags */
1995 				 NULL, NULL,		/* lock */
1996 				 &sc->rx_small.dmat);	/* tag */
1997 	if (err != 0) {
1998 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
1999 			      err);
2000 		goto abort_with_alloc;
2001 	}
2002 
2003 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2004 				 1,			/* alignment */
2005 				 4096,			/* boundary */
2006 				 BUS_SPACE_MAXADDR,	/* low */
2007 				 BUS_SPACE_MAXADDR,	/* high */
2008 				 NULL, NULL,		/* filter */
2009 				 4096,			/* maxsize */
2010 				 1,			/* num segs */
2011 				 4096,			/* maxsegsize */
2012 				 BUS_DMA_ALLOCNOW,	/* flags */
2013 				 NULL, NULL,		/* lock */
2014 				 &sc->rx_big.dmat);	/* tag */
2015 	if (err != 0) {
2016 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2017 			      err);
2018 		goto abort_with_alloc;
2019 	}
2020 
2021 	/* now use these tags to setup dmamaps for each slot
2022 	   in each ring */
2023 	for (i = 0; i <= sc->tx.mask; i++) {
2024 		err = bus_dmamap_create(sc->tx.dmat, 0,
2025 					&sc->tx.info[i].map);
2026 		if (err != 0) {
2027 			device_printf(sc->dev, "Err %d  tx dmamap\n",
2028 			      err);
2029 			goto abort_with_alloc;
2030 		}
2031 	}
2032 	for (i = 0; i <= sc->rx_small.mask; i++) {
2033 		err = bus_dmamap_create(sc->rx_small.dmat, 0,
2034 					&sc->rx_small.info[i].map);
2035 		if (err != 0) {
2036 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2037 				      err);
2038 			goto abort_with_alloc;
2039 		}
2040 	}
2041 	err = bus_dmamap_create(sc->rx_small.dmat, 0,
2042 				&sc->rx_small.extra_map);
2043 	if (err != 0) {
2044 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2045 			      err);
2046 			goto abort_with_alloc;
2047 	}
2048 
2049 	for (i = 0; i <= sc->rx_big.mask; i++) {
2050 		err = bus_dmamap_create(sc->rx_big.dmat, 0,
2051 					&sc->rx_big.info[i].map);
2052 		if (err != 0) {
2053 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2054 			      err);
2055 			goto abort_with_alloc;
2056 		}
2057 	}
2058 	err = bus_dmamap_create(sc->rx_big.dmat, 0,
2059 				&sc->rx_big.extra_map);
2060 	if (err != 0) {
2061 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2062 			      err);
2063 			goto abort_with_alloc;
2064 	}
2065 	return 0;
2066 
2067 abort_with_alloc:
2068 	mxge_free_rings(sc);
2069 
2070 abort_with_nothing:
2071 	return err;
2072 }
2073 
/*
 * Bring the interface up: reset the NIC, size the big receive
 * buffers, allocate the DMA rings, hook up the interrupt handler,
 * hand the firmware its ring locations and buffer sizes, pre-fill
 * both receive rings, and start the ethernet engine.  Returns 0 on
 * success or an errno; on failure, everything acquired here is torn
 * back down via the goto-unwind labels at the bottom.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int i, err;
	bus_dmamap_t map;


	err = mxge_reset(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	/* Choose the big rx buffer size: a regular cluster if the
	   current MTU (plus header and firmware pad) fits, otherwise
	   a page-sized jumbo cluster. */
	if (MCLBYTES >=
	    sc->ifp->if_mtu + ETHER_HDR_LEN + MXGEFW_PAD)
		sc->big_bytes = MCLBYTES;
	else
		sc->big_bytes = MJUMPAGESIZE;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		return err;
	}

	err = bus_setup_intr(sc->dev, sc->irq_res,
			     INTR_TYPE_NET | INTR_MPSAFE,
			     mxge_intr, sc, &sc->ih);
	if (err != 0) {
		goto abort_with_rings;
	}

	/* get the lanai pointers to the send and receive rings */

	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
	sc->tx.lanai =
		(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
	/* errors from the three GET commands are OR'd together and
	   checked once below */
	err |= mxge_send_cmd(sc,
				 MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	sc->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	sc->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		err = EIO;
		goto abort_with_irq;
	}

	/* If write combining is in use, point each ring at its
	   write-combined fifo window in board SRAM; otherwise leave
	   the fifo pointers NULL so the slow path is taken. */
	if (sc->wc) {
		sc->tx.wc_fifo = sc->sram + 0x200000;
		sc->rx_small.wc_fifo = sc->sram + 0x300000;
		sc->rx_big.wc_fifo = sc->sram + 0x340000;
	} else {
		sc->tx.wc_fifo = 0;
		sc->rx_small.wc_fifo = 0;
		sc->rx_big.wc_fifo = 0;
	}


	/* stock receive rings */
	for (i = 0; i <= sc->rx_small.mask; i++) {
		map = sc->rx_small.info[i].map;
		err = mxge_get_buf_small(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, sc->rx_small.mask + 1);
			goto abort;
		}
	}
	for (i = 0; i <= sc->rx_big.mask; i++) {
		map = sc->rx_big.info[i].map;
		err = mxge_get_buf_big(sc, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, sc->rx_big.mask + 1);
			goto abort;
		}
	}

	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = sc->big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
	/* Now give him the pointer to the stats block */
	cmd.data0 = MXGE_LOWPART_TO_U32(sc->fw_stats_dma.bus_addr);
	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->fw_stats_dma.bus_addr);
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);
abort_with_irq:
	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
abort_with_rings:
	mxge_free_rings(sc);
	return err;
}
2198 
/*
 * Bring the interface down: mark it not running, ask the firmware to
 * stop, wait up to one second for the confirming "down" interrupt
 * (which bumps sc->down_cnt), then tear down the interrupt handler
 * and free the mbufs and rings.  Always returns 0.
 */
static int
mxge_close(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;

	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	old_down_cnt = sc->down_cnt;
	/* make the flag clear and counter snapshot visible before
	   issuing the DOWN command */
	mb();
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring down link\n");
	}
	if (old_down_cnt == sc->down_cnt) {
		/* wait for down irq */
		(void)tsleep(&sc->down_cnt, PWAIT, "down mxge", hz);
	}
	/* proceed with teardown even if the down irq never arrived */
	if (old_down_cnt == sc->down_cnt) {
		device_printf(sc->dev, "never got down irq\n");
	}
	if (sc->ih != NULL)
		bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
	mxge_free_mbufs(sc);
	mxge_free_rings(sc);
	return 0;
}
2225 
2226 
2227 static int
2228 mxge_media_change(struct ifnet *ifp)
2229 {
2230 	return EINVAL;
2231 }
2232 
2233 static int
2234 mxge_change_mtu(mxge_softc_t *sc, int mtu)
2235 {
2236 	struct ifnet *ifp = sc->ifp;
2237 	int real_mtu, old_mtu;
2238 	int err = 0;
2239 
2240 
2241 	real_mtu = mtu + ETHER_HDR_LEN;
2242 	if ((real_mtu > MXGE_MAX_ETHER_MTU) ||
2243 	    real_mtu < 60)
2244 		return EINVAL;
2245 	sx_xlock(&sc->driver_lock);
2246 	old_mtu = ifp->if_mtu;
2247 	ifp->if_mtu = mtu;
2248 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
2249 		mxge_close(sc);
2250 		err = mxge_open(sc);
2251 		if (err != 0) {
2252 			ifp->if_mtu = old_mtu;
2253 			mxge_close(sc);
2254 			(void) mxge_open(sc);
2255 		}
2256 	}
2257 	sx_xunlock(&sc->driver_lock);
2258 	return err;
2259 }
2260 
2261 static void
2262 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
2263 {
2264 	mxge_softc_t *sc = ifp->if_softc;
2265 
2266 
2267 	if (sc == NULL)
2268 		return;
2269 	ifmr->ifm_status = IFM_AVALID;
2270 	ifmr->ifm_status |= sc->fw_stats->link_up ? IFM_ACTIVE : 0;
2271 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
2272 	ifmr->ifm_active |= sc->fw_stats->link_up ? IFM_FDX : 0;
2273 }
2274 
2275 static int
2276 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
2277 {
2278 	mxge_softc_t *sc = ifp->if_softc;
2279 	struct ifreq *ifr = (struct ifreq *)data;
2280 	int err, mask;
2281 
2282 	err = 0;
2283 	switch (command) {
2284 	case SIOCSIFADDR:
2285 	case SIOCGIFADDR:
2286 		err = ether_ioctl(ifp, command, data);
2287 		break;
2288 
2289 	case SIOCSIFMTU:
2290 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
2291 		break;
2292 
2293 	case SIOCSIFFLAGS:
2294 		sx_xlock(&sc->driver_lock);
2295 		if (ifp->if_flags & IFF_UP) {
2296 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING))
2297 				err = mxge_open(sc);
2298 		} else {
2299 			if (ifp->if_drv_flags & IFF_DRV_RUNNING)
2300 				mxge_close(sc);
2301 		}
2302 		sx_xunlock(&sc->driver_lock);
2303 		break;
2304 
2305 	case SIOCADDMULTI:
2306 	case SIOCDELMULTI:
2307 		err = 0;
2308 		break;
2309 
2310 	case SIOCSIFCAP:
2311 		sx_xlock(&sc->driver_lock);
2312 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
2313 		if (mask & IFCAP_TXCSUM) {
2314 			if (IFCAP_TXCSUM & ifp->if_capenable) {
2315 				ifp->if_capenable &= ~IFCAP_TXCSUM;
2316 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
2317 			} else {
2318 				ifp->if_capenable |= IFCAP_TXCSUM;
2319 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
2320 			}
2321 		} else if (mask & IFCAP_RXCSUM) {
2322 			if (IFCAP_RXCSUM & ifp->if_capenable) {
2323 				ifp->if_capenable &= ~IFCAP_RXCSUM;
2324 				sc->csum_flag = 0;
2325 			} else {
2326 				ifp->if_capenable |= IFCAP_RXCSUM;
2327 				sc->csum_flag = 1;
2328 			}
2329 		}
2330 		sx_xunlock(&sc->driver_lock);
2331 		break;
2332 
2333 	case SIOCGIFMEDIA:
2334 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
2335 				    &sc->media, command);
2336                 break;
2337 
2338 	default:
2339 		err = ENOTTY;
2340         }
2341 	return err;
2342 }
2343 
2344 static void
2345 mxge_fetch_tunables(mxge_softc_t *sc)
2346 {
2347 
2348 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
2349 			  &mxge_flow_control);
2350 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
2351 			  &mxge_intr_coal_delay);
2352 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
2353 			  &mxge_nvidia_ecrc_enable);
2354 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
2355 			  &mxge_deassert_wait);
2356 	TUNABLE_INT_FETCH("hw.mxge.verbose",
2357 			  &mxge_verbose);
2358 
2359 	if (bootverbose)
2360 		mxge_verbose = 1;
2361 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
2362 		mxge_intr_coal_delay = 30;
2363 	sc->pause = mxge_flow_control;
2364 }
2365 
2366 static int
2367 mxge_attach(device_t dev)
2368 {
2369 	mxge_softc_t *sc = device_get_softc(dev);
2370 	struct ifnet *ifp;
2371 	size_t bytes;
2372 	int rid, err;
2373 	uint16_t cmd;
2374 
2375 	sc->dev = dev;
2376 	mxge_fetch_tunables(sc);
2377 
2378 	err = bus_dma_tag_create(NULL,			/* parent */
2379 				 1,			/* alignment */
2380 				 4096,			/* boundary */
2381 				 BUS_SPACE_MAXADDR,	/* low */
2382 				 BUS_SPACE_MAXADDR,	/* high */
2383 				 NULL, NULL,		/* filter */
2384 				 MXGE_MAX_ETHER_MTU,	/* maxsize */
2385 				 MXGE_MAX_SEND_DESC, 	/* num segs */
2386 				 4096,			/* maxsegsize */
2387 				 0,			/* flags */
2388 				 NULL, NULL,		/* lock */
2389 				 &sc->parent_dmat);	/* tag */
2390 
2391 	if (err != 0) {
2392 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
2393 			      err);
2394 		goto abort_with_nothing;
2395 	}
2396 
2397 	ifp = sc->ifp = if_alloc(IFT_ETHER);
2398 	if (ifp == NULL) {
2399 		device_printf(dev, "can not if_alloc()\n");
2400 		err = ENOSPC;
2401 		goto abort_with_parent_dmat;
2402 	}
2403 	mtx_init(&sc->cmd_lock, NULL,
2404 		 MTX_NETWORK_LOCK, MTX_DEF);
2405 	mtx_init(&sc->tx_lock, device_get_nameunit(dev),
2406 		 MTX_NETWORK_LOCK, MTX_DEF);
2407 	sx_init(&sc->driver_lock, device_get_nameunit(dev));
2408 
2409 	/* Enable DMA and Memory space access */
2410 	pci_enable_busmaster(dev);
2411 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
2412 	cmd |= PCIM_CMD_MEMEN;
2413 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
2414 
2415 	/* Map the board into the kernel */
2416 	rid = PCIR_BARS;
2417 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
2418 					 ~0, 1, RF_ACTIVE);
2419 	if (sc->mem_res == NULL) {
2420 		device_printf(dev, "could not map memory\n");
2421 		err = ENXIO;
2422 		goto abort_with_lock;
2423 	}
2424 	sc->sram = rman_get_virtual(sc->mem_res);
2425 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
2426 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
2427 		device_printf(dev, "impossible memory region size %ld\n",
2428 			      rman_get_size(sc->mem_res));
2429 		err = ENXIO;
2430 		goto abort_with_mem_res;
2431 	}
2432 
2433 	/* make NULL terminated copy of the EEPROM strings section of
2434 	   lanai SRAM */
2435 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
2436 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
2437 				rman_get_bushandle(sc->mem_res),
2438 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
2439 				sc->eeprom_strings,
2440 				MXGE_EEPROM_STRINGS_SIZE - 2);
2441 	err = mxge_parse_strings(sc);
2442 	if (err != 0)
2443 		goto abort_with_mem_res;
2444 
2445 	/* Enable write combining for efficient use of PCIe bus */
2446 	mxge_enable_wc(sc);
2447 
2448 	/* Allocate the out of band dma memory */
2449 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
2450 			     sizeof (mxge_cmd_t), 64);
2451 	if (err != 0)
2452 		goto abort_with_mem_res;
2453 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
2454 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
2455 	if (err != 0)
2456 		goto abort_with_cmd_dma;
2457 
2458 	err = mxge_dma_alloc(sc, &sc->fw_stats_dma,
2459 			     sizeof (*sc->fw_stats), 64);
2460 	if (err != 0)
2461 		goto abort_with_zeropad_dma;
2462 	sc->fw_stats = (mcp_irq_data_t *)sc->fw_stats_dma.addr;
2463 
2464 
2465 	/* allocate interrupt queues */
2466 	bytes = mxge_max_intr_slots * sizeof (*sc->rx_done.entry);
2467 	err = mxge_dma_alloc(sc, &sc->rx_done.dma, bytes, 4096);
2468 	if (err != 0)
2469 		goto abort_with_fw_stats;
2470 	sc->rx_done.entry = sc->rx_done.dma.addr;
2471 	bzero(sc->rx_done.entry, bytes);
2472 	/* Add our ithread  */
2473 	rid = 0;
2474 	sc->irq_res = bus_alloc_resource(dev, SYS_RES_IRQ, &rid, 0, ~0,
2475 					 1, RF_SHAREABLE | RF_ACTIVE);
2476 	if (sc->irq_res == NULL) {
2477 		device_printf(dev, "could not alloc interrupt\n");
2478 		goto abort_with_rx_done;
2479 	}
2480 
2481 	/* load the firmware */
2482 	mxge_select_firmware(sc);
2483 
2484 	err = mxge_load_firmware(sc);
2485 	if (err != 0)
2486 		goto abort_with_irq_res;
2487 	sc->intr_coal_delay = mxge_intr_coal_delay;
2488 	err = mxge_reset(sc);
2489 	if (err != 0)
2490 		goto abort_with_irq_res;
2491 
2492 	/* hook into the network stack */
2493 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
2494 	ifp->if_baudrate = 100000000;
2495 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM;
2496 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP;
2497 	ifp->if_capenable = ifp->if_capabilities;
2498 	sc->csum_flag = 1;
2499         ifp->if_init = mxge_init;
2500         ifp->if_softc = sc;
2501         ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
2502         ifp->if_ioctl = mxge_ioctl;
2503         ifp->if_start = mxge_start;
2504 	ifp->if_watchdog = mxge_watchdog;
2505 	ether_ifattach(ifp, sc->mac_addr);
2506 	/* ether_ifattach sets mtu to 1500 */
2507 	ifp->if_mtu = MXGE_MAX_ETHER_MTU - ETHER_HDR_LEN;
2508 
2509 	/* Initialise the ifmedia structure */
2510 	ifmedia_init(&sc->media, 0, mxge_media_change,
2511 		     mxge_media_status);
2512 	ifmedia_add(&sc->media, IFM_ETHER|IFM_AUTO, 0, NULL);
2513 	mxge_add_sysctls(sc);
2514 	return 0;
2515 
2516 abort_with_irq_res:
2517 	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
2518 abort_with_rx_done:
2519 	sc->rx_done.entry = NULL;
2520 	mxge_dma_free(&sc->rx_done.dma);
2521 abort_with_fw_stats:
2522 	mxge_dma_free(&sc->fw_stats_dma);
2523 abort_with_zeropad_dma:
2524 	mxge_dma_free(&sc->zeropad_dma);
2525 abort_with_cmd_dma:
2526 	mxge_dma_free(&sc->cmd_dma);
2527 abort_with_mem_res:
2528 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
2529 abort_with_lock:
2530 	pci_disable_busmaster(dev);
2531 	mtx_destroy(&sc->cmd_lock);
2532 	mtx_destroy(&sc->tx_lock);
2533 	sx_destroy(&sc->driver_lock);
2534 	if_free(ifp);
2535 abort_with_parent_dmat:
2536 	bus_dma_tag_destroy(sc->parent_dmat);
2537 
2538 abort_with_nothing:
2539 	return err;
2540 }
2541 
/*
 * Device detach: stop the interface if it is running, detach from
 * the network stack, then release every resource acquired in
 * mxge_attach in reverse order of acquisition.  Always returns 0.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	sx_xlock(&sc->driver_lock);
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc);
	sx_xunlock(&sc->driver_lock);
	ether_ifdetach(sc->ifp);
	/* second arg 0 presumably disables the firmware's dummy rdma
	   engine before memory goes away — see mxge_dummy_rdma */
	mxge_dummy_rdma(sc, 0);
	bus_release_resource(dev, SYS_RES_IRQ, 0, sc->irq_res);
	sc->rx_done.entry = NULL;
	mxge_dma_free(&sc->rx_done.dma);
	mxge_dma_free(&sc->fw_stats_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_lock);
	mtx_destroy(&sc->tx_lock);
	sx_destroy(&sc->driver_lock);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
2568 
2569 static int
2570 mxge_shutdown(device_t dev)
2571 {
2572 	return 0;
2573 }
2574 
2575 /*
2576   This file uses Myri10GE driver indentation.
2577 
2578   Local Variables:
2579   c-file-style:"linux"
2580   tab-width:8
2581   End:
2582 */
2583