1 /******************************************************************************
2 
3 Copyright (c) 2006-2008, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 
49 #include <net/if.h>
50 #include <net/if_arp.h>
51 #include <net/ethernet.h>
52 #include <net/if_dl.h>
53 #include <net/if_media.h>
54 
55 #include <net/bpf.h>
56 
57 #include <net/if_types.h>
58 #include <net/if_vlan_var.h>
59 #include <net/zlib.h>
60 
61 #include <netinet/in_systm.h>
62 #include <netinet/in.h>
63 #include <netinet/ip.h>
64 #include <netinet/tcp.h>
65 
66 #include <machine/bus.h>
67 #include <machine/in_cksum.h>
68 #include <machine/resource.h>
69 #include <sys/bus.h>
70 #include <sys/rman.h>
71 #include <sys/smp.h>
72 
73 #include <dev/pci/pcireg.h>
74 #include <dev/pci/pcivar.h>
75 
76 #include <vm/vm.h>		/* for pmap_mapdev() */
77 #include <vm/pmap.h>
78 
79 #if defined(__i386) || defined(__amd64)
80 #include <machine/specialreg.h>
81 #endif
82 
83 #include <dev/mxge/mxge_mcp.h>
84 #include <dev/mxge/mcp_gen_header.h>
85 /*#define MXGE_FAKE_IFP*/
86 #include <dev/mxge/if_mxge_var.h>
87 
88 /* tunable params */
89 static int mxge_nvidia_ecrc_enable = 1;
90 static int mxge_force_firmware = 0;
91 static int mxge_intr_coal_delay = 30;
92 static int mxge_deassert_wait = 1;
93 static int mxge_flow_control = 1;
94 static int mxge_verbose = 0;
95 static int mxge_lro_cnt = 8;
96 static int mxge_ticks;
97 static int mxge_max_slices = 1;
98 static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
99 static int mxge_always_promisc = 0;
100 static char *mxge_fw_unaligned = "mxge_ethp_z8e";
101 static char *mxge_fw_aligned = "mxge_eth_z8e";
102 static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
103 static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
104 
105 static int mxge_probe(device_t dev);
106 static int mxge_attach(device_t dev);
107 static int mxge_detach(device_t dev);
108 static int mxge_shutdown(device_t dev);
109 static void mxge_intr(void *arg);
110 
111 static device_method_t mxge_methods[] =
112 {
113   /* Device interface */
114   DEVMETHOD(device_probe, mxge_probe),
115   DEVMETHOD(device_attach, mxge_attach),
116   DEVMETHOD(device_detach, mxge_detach),
117   DEVMETHOD(device_shutdown, mxge_shutdown),
118   {0, 0}
119 };
120 
121 static driver_t mxge_driver =
122 {
123   "mxge",
124   mxge_methods,
125   sizeof(mxge_softc_t),
126 };
127 
128 static devclass_t mxge_devclass;
129 
130 /* Declare ourselves to be a child of the PCI bus.*/
131 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
132 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
133 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
134 
135 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
136 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
137 static int mxge_close(mxge_softc_t *sc);
138 static int mxge_open(mxge_softc_t *sc);
139 static void mxge_tick(void *arg);
140 
141 static int
142 mxge_probe(device_t dev)
143 {
144   if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
145       ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
146        (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
147 	  device_set_desc(dev, "Myri10G-PCIE-8A");
148 	  return 0;
149   }
150   return ENXIO;
151 }
152 
153 static void
154 mxge_enable_wc(mxge_softc_t *sc)
155 {
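	/* map the NIC's PIO region write-combining so bursts of small
	 * writes (e.g. from mxge_pio_copy()) can be coalesced by the CPU */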
156 #if defined(__i386) || defined(__amd64)
157 	vm_offset_t len;
158 	int err;
159 
160 	sc->wc = 1;
161 	len = rman_get_size(sc->mem_res);
162 	err = pmap_change_attr((vm_offset_t) sc->sram,
163 			       len, PAT_WRITE_COMBINING);
164 	if (err != 0) {
165 		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
166 			      err);
167 		sc->wc = 0;
168 	}
169 #endif
170 }
171 
172 
173 /* callback to get our DMA address */
174 static void
175 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
176 			 int error)
177 {
178 	if (error == 0) {
179 		*(bus_addr_t *) arg = segs->ds_addr;
180 	}
181 }
182 
183 static int
184 mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
185 		   bus_size_t alignment)
186 {
187 	int err;
188 	device_t dev = sc->dev;
189 	bus_size_t boundary, maxsegsize;
190 
191 	if (bytes > 4096 && alignment == 4096) {
192 		boundary = 0;
193 		maxsegsize = bytes;
194 	} else {
195 		boundary = 4096;
196 		maxsegsize = 4096;
197 	}
198 
199 	/* allocate DMAable memory tags */
200 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
201 				 alignment,		/* alignment */
202 				 boundary,		/* boundary */
203 				 BUS_SPACE_MAXADDR,	/* low */
204 				 BUS_SPACE_MAXADDR,	/* high */
205 				 NULL, NULL,		/* filter */
206 				 bytes,			/* maxsize */
207 				 1,			/* num segs */
208 				 maxsegsize,		/* maxsegsize */
209 				 BUS_DMA_COHERENT,	/* flags */
210 				 NULL, NULL,		/* lock */
211 				 &dma->dmat);		/* tag */
212 	if (err != 0) {
213 		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
214 		return err;
215 	}
216 
217 	/* allocate DMAable memory & map */
218 	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
219 			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
220 				| BUS_DMA_ZERO),  &dma->map);
221 	if (err != 0) {
222 		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
223 		goto abort_with_dmat;
224 	}
225 
226 	/* load the memory */
227 	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
228 			      mxge_dmamap_callback,
229 			      (void *)&dma->bus_addr, 0);
230 	if (err != 0) {
231 		device_printf(dev, "couldn't load map (err = %d)\n", err);
232 		goto abort_with_mem;
233 	}
234 	return 0;
235 
236 abort_with_mem:
237 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
238 abort_with_dmat:
239 	(void)bus_dma_tag_destroy(dma->dmat);
240 	return err;
241 }
242 
243 
244 static void
245 mxge_dma_free(mxge_dma_t *dma)
246 {
247 	bus_dmamap_unload(dma->dmat, dma->map);
248 	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
249 	(void)bus_dma_tag_destroy(dma->dmat);
250 }
251 
252 /*
253  * The eeprom strings on the lanaiX have the format
254  * SN=x\0
255  * MAC=x:x:x:x:x:x\0
256  * PC=text\0
257  */
258 
259 static int
260 mxge_parse_strings(mxge_softc_t *sc)
261 {
262 #define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)
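/* note: the macro ignores its argument and simply advances the
   enclosing 'ptr' past the current NUL-terminated string */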
263 
264 	char *ptr, *limit;
265 	int i, found_mac;
266 
267 	ptr = sc->eeprom_strings;
268 	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
269 	found_mac = 0;
270 	while (ptr < limit && *ptr != '\0') {
271 		if (memcmp(ptr, "MAC=", 4) == 0) {
272 			ptr += 1;
273 			sc->mac_addr_string = ptr;
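			/* "MAC=" is 4 characters, so the ptr += 1 above
			   plus the first ptr += 3 below lands on the
			   first hex octet; later iterations step over
			   one "xx:" group at a time */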
274 			for (i = 0; i < 6; i++) {
275 				ptr += 3;
276 				if ((ptr + 2) > limit)
277 					goto abort;
278 				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
279 				found_mac = 1;
280 			}
281 		} else if (memcmp(ptr, "PC=", 3) == 0) {
282 			ptr += 3;
283 			strncpy(sc->product_code_string, ptr,
284 				sizeof (sc->product_code_string) - 1);
285 		} else if (memcmp(ptr, "SN=", 3) == 0) {
286 			ptr += 3;
287 			strncpy(sc->serial_number_string, ptr,
288 				sizeof (sc->serial_number_string) - 1);
289 		}
290 		MXGE_NEXT_STRING(ptr);
291 	}
292 
293 	if (found_mac)
294 		return 0;
295 
296  abort:
297 	device_printf(sc->dev, "failed to parse eeprom_strings\n");
298 
299 	return ENXIO;
300 }
301 
302 #if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
303 static void
304 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
305 {
306 	uint32_t val;
307 	unsigned long base, off;
308 	char *va, *cfgptr;
309 	device_t pdev, mcp55;
310 	uint16_t vendor_id, device_id, word;
311 	uintptr_t bus, slot, func, ivend, idev;
312 	uint32_t *ptr32;
313 
314 
315 	if (!mxge_nvidia_ecrc_enable)
316 		return;
317 
318 	pdev = device_get_parent(device_get_parent(sc->dev));
319 	if (pdev == NULL) {
320 		device_printf(sc->dev, "could not find parent?\n");
321 		return;
322 	}
323 	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
324 	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
325 
326 	if (vendor_id != 0x10de)
327 		return;
328 
329 	base = 0;
330 
331 	if (device_id == 0x005d) {
332 		/* ck804, base address is magic */
333 		base = 0xe0000000UL;
334 	} else if (device_id >= 0x0374 && device_id <= 0x378) {
335 		/* mcp55, base address stored in chipset */
336 		mcp55 = pci_find_bsf(0, 0, 0);
337 		if (mcp55 &&
338 		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
339 		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
340 			word = pci_read_config(mcp55, 0x90, 2);
341 			base = ((unsigned long)word & 0x7ffeU) << 25;
342 		}
343 	}
344 	if (!base)
345 		return;
346 
347 	/* XXXX
348 	   The test below is commented out because it is believed that
349 	   doing a config read/write beyond 0xff will access the config
350 	   space of the next higher function.  Uncomment this, and remove
351 	   the hacky pmap_mapdev() way of accessing config space, when
352 	   FreeBSD grows support for extended PCIe config space access.
353 	*/
354 #if 0
355 	/* See if we can, by some miracle, access the extended
356 	   config space */
357 	val = pci_read_config(pdev, 0x178, 4);
358 	if (val != 0xffffffff) {
359 		val |= 0x40;
360 		pci_write_config(pdev, 0x178, val, 4);
361 		return;
362 	}
363 #endif
364 	/* Rather than using normal pci config space writes, we must
365 	 * map the Nvidia config space ourselves.  This is because on
366 	 * opteron/nvidia class machine the 0xe000000 mapping is
367 	 * handled by the nvidia chipset, that means the internal PCI
368 	 * device (the on-chip northbridge), or the amd-8131 bridge
369 	 * and things behind them are not visible by this method.
370 	 */
371 
372 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
373 		      PCI_IVAR_BUS, &bus);
374 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
375 		      PCI_IVAR_SLOT, &slot);
376 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
377 		      PCI_IVAR_FUNCTION, &func);
378 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
379 		      PCI_IVAR_VENDOR, &ivend);
380 	BUS_READ_IVAR(device_get_parent(pdev), pdev,
381 		      PCI_IVAR_DEVICE, &idev);
382 
383 	off =  base
384 		+ 0x00100000UL * (unsigned long)bus
385 		+ 0x00001000UL * (unsigned long)(func
386 						 + 8 * slot);
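	/* the extended config window is laid out like ECAM: 4KB per
	   function, 8 functions per slot, 1MB per bus */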
387 
388 	/* map it into the kernel */
389 	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
390 
391 
392 	if (va == NULL) {
393 		device_printf(sc->dev, "pmap_mapdev() failed\n");
394 		return;
395 	}
396 	/* get a pointer to the config space mapped into the kernel */
397 	cfgptr = va + (off & PAGE_MASK);
398 
399 	/* make sure that we can really access it */
400 	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
401 	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
402 	if (! (vendor_id == ivend && device_id == idev)) {
403 		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
404 			      vendor_id, device_id);
405 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
406 		return;
407 	}
408 
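	/* offset 0x178 holds the bridge control word this workaround
	   pokes; setting bit 0x40 below enables ECRC generation */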
409 	ptr32 = (uint32_t*)(cfgptr + 0x178);
410 	val = *ptr32;
411 
412 	if (val == 0xffffffff) {
413 		device_printf(sc->dev, "extended mapping failed\n");
414 		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
415 		return;
416 	}
417 	*ptr32 = val | 0x40;
418 	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
419 	if (mxge_verbose)
420 		device_printf(sc->dev,
421 			      "Enabled ECRC on upstream Nvidia bridge "
422 			      "at %d:%d:%d\n",
423 			      (int)bus, (int)slot, (int)func);
424 	return;
425 }
426 #else
427 static void
428 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
429 {
430 	device_printf(sc->dev,
431 		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
432 	return;
433 }
434 #endif
435 
436 
437 static int
438 mxge_dma_test(mxge_softc_t *sc, int test_type)
439 {
440 	mxge_cmd_t cmd;
441 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
442 	int status;
443 	uint32_t len;
444 	char *test = " ";
445 
446 
447 	/* Run a small DMA test.
448 	 * The magic multipliers to the length tell the firmware
449 	 * to do DMA read, write, or read+write tests.  The
450 	 * results are returned in cmd.data0.  The upper 16
451 	 * bits of the return is the number of transfers completed.
452 	 * The lower 16 bits is the time in 0.5us ticks that the
453 	 * transfers took to complete.
454 	 */
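	/*
	 * Illustrative decode (hypothetical numbers): a read test
	 * returning cmd.data0 = 0x00c807d0 means 0xc8 (200) transfers
	 * completed in 0x7d0 (2000) half-microsecond ticks, so the
	 * computation below yields (200 * len * 2) / 2000, roughly
	 * 819 MB/s when len is 4096.
	 */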
455 
456 	len = sc->tx_boundary;
457 
458 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
459 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
460 	cmd.data2 = len * 0x10000;
461 	status = mxge_send_cmd(sc, test_type, &cmd);
462 	if (status != 0) {
463 		test = "read";
464 		goto abort;
465 	}
466 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
467 		(cmd.data0 & 0xffff);
468 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
469 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
470 	cmd.data2 = len * 0x1;
471 	status = mxge_send_cmd(sc, test_type, &cmd);
472 	if (status != 0) {
473 		test = "write";
474 		goto abort;
475 	}
476 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
477 		(cmd.data0 & 0xffff);
478 
479 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
480 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
481 	cmd.data2 = len * 0x10001;
482 	status = mxge_send_cmd(sc, test_type, &cmd);
483 	if (status != 0) {
484 		test = "read/write";
485 		goto abort;
486 	}
487 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
488 		(cmd.data0 & 0xffff);
489 
490 abort:
491 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
492 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
493 			      test, status);
494 
495 	return status;
496 }
497 
498 /*
499  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
500  * when the PCI-E Completion packets are aligned on an 8-byte
501  * boundary.  Some PCI-E chip sets always align Completion packets; on
502  * the ones that do not, the alignment can be enforced by enabling
503  * ECRC generation (if supported).
504  *
505  * When PCI-E Completion packets are not aligned, it is actually more
506  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
507  *
508  * If the driver can neither enable ECRC nor verify that it has
509  * already been enabled, then it must use a firmware image which works
510  * around unaligned completion packets (ethp_z8e.dat), and it should
511  * also ensure that it never gives the device a Read-DMA which is
512  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
513  * enabled, then the driver should use the aligned (eth_z8e.dat)
514  * firmware image, and set tx_boundary to 4KB.
515  */
516 
517 static int
518 mxge_firmware_probe(mxge_softc_t *sc)
519 {
520 	device_t dev = sc->dev;
521 	int reg, status;
522 	uint16_t pectl;
523 
524 	sc->tx_boundary = 4096;
525 	/*
526 	 * Verify the max read request size was set to 4KB
527 	 * before trying the test with 4KB.
528 	 */
529 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
530 		pectl = pci_read_config(dev, reg + 0x8, 2);
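		/* offset 0x8 past the PCIe capability is the device
		   control register; bits 14:12 encode the max read
		   request size, where 5 means 4096 bytes */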
531 		if ((pectl & (5 << 12)) != (5 << 12)) {
532 			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
533 				      pectl);
534 			sc->tx_boundary = 2048;
535 		}
536 	}
537 
538 	/*
539 	 * load the optimized firmware (which assumes aligned PCIe
540 	 * completions) in order to see if it works on this host.
541 	 */
542 	sc->fw_name = mxge_fw_aligned;
543 	status = mxge_load_firmware(sc, 1);
544 	if (status != 0) {
545 		return status;
546 	}
547 
548 	/*
549 	 * Enable ECRC if possible
550 	 */
551 	mxge_enable_nvidia_ecrc(sc);
552 
553 	/*
554 	 * Run a DMA test which watches for unaligned completions and
555 	 * aborts on the first one seen.
556 	 */
557 
558 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
559 	if (status == 0)
560 		return 0; /* keep the aligned firmware */
561 
562 	if (status != E2BIG)
563 		device_printf(dev, "DMA test failed: %d\n", status);
564 	if (status == ENOSYS)
565 		device_printf(dev, "Falling back to ethp! "
566 			      "Please install up to date fw\n");
567 	return status;
568 }
569 
570 static int
571 mxge_select_firmware(mxge_softc_t *sc)
572 {
573 	int aligned = 0;
574 
575 
576 	if (mxge_force_firmware != 0) {
577 		if (mxge_force_firmware == 1)
578 			aligned = 1;
579 		else
580 			aligned = 0;
581 		if (mxge_verbose)
582 			device_printf(sc->dev,
583 				      "Assuming %s completions (forced)\n",
584 				      aligned ? "aligned" : "unaligned");
585 		goto abort;
586 	}
587 
588 	/* if the PCIe link width is 4 or less, we can use the aligned
589 	   firmware and skip any checks */
590 	if (sc->link_width != 0 && sc->link_width <= 4) {
591 		device_printf(sc->dev,
592 			      "PCIe x%d Link, expect reduced performance\n",
593 			      sc->link_width);
594 		aligned = 1;
595 		goto abort;
596 	}
597 
598 	if (0 == mxge_firmware_probe(sc))
599 		return 0;
600 
601 abort:
602 	if (aligned) {
603 		sc->fw_name = mxge_fw_aligned;
604 		sc->tx_boundary = 4096;
605 	} else {
606 		sc->fw_name = mxge_fw_unaligned;
607 		sc->tx_boundary = 2048;
608 	}
609 	return (mxge_load_firmware(sc, 0));
610 }
611 
612 union qualhack
613 {
614         const char *ro_char;
615         char *rw_char;
616 };
617 
618 static int
619 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
620 {
621 
622 
623 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
624 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
625 			      be32toh(hdr->mcp_type));
626 		return EIO;
627 	}
628 
629 	/* save firmware version for sysctl */
630 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
631 	if (mxge_verbose)
632 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
633 
634 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
635 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
636 
637 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
638 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
639 		device_printf(sc->dev, "Found firmware version %s\n",
640 			      sc->fw_version);
641 		device_printf(sc->dev, "Driver needs %d.%d\n",
642 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
643 		return EINVAL;
644 	}
645 	return 0;
646 
647 }
648 
649 static void *
650 z_alloc(void *nil, u_int items, u_int size)
651 {
652         void *ptr;
653 
654         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
655         return ptr;
656 }
657 
658 static void
659 z_free(void *nil, void *ptr)
660 {
661         free(ptr, M_TEMP);
662 }
663 
664 
665 static int
666 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
667 {
668 	z_stream zs;
669 	char *inflate_buffer;
670 	const struct firmware *fw;
671 	const mcp_gen_header_t *hdr;
672 	unsigned hdr_offset;
673 	int status;
674 	unsigned int i;
675 	char dummy;
676 	size_t fw_len;
677 
678 	fw = firmware_get(sc->fw_name);
679 	if (fw == NULL) {
680 		device_printf(sc->dev, "Could not find firmware image %s\n",
681 			      sc->fw_name);
682 		return ENOENT;
683 	}
684 
685 
686 
687 	/* setup zlib and decompress f/w */
688 	bzero(&zs, sizeof (zs));
689 	zs.zalloc = z_alloc;
690 	zs.zfree = z_free;
691 	status = inflateInit(&zs);
692 	if (status != Z_OK) {
693 		status = EIO;
694 		goto abort_with_fw;
695 	}
696 
697 	/* the uncompressed size is stored as the firmware version,
698 	   which would otherwise go unused */
699 	fw_len = (size_t) fw->version;
700 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
701 	if (inflate_buffer == NULL) {
		status = ENOMEM;	/* don't report success on a failed malloc */
702 		goto abort_with_zs;
	}
703 	zs.avail_in = fw->datasize;
704 	zs.next_in = __DECONST(char *, fw->data);
705 	zs.avail_out = fw_len;
706 	zs.next_out = inflate_buffer;
707 	status = inflate(&zs, Z_FINISH);
708 	if (status != Z_STREAM_END) {
709 		device_printf(sc->dev, "zlib %d\n", status);
710 		status = EIO;
711 		goto abort_with_buffer;
712 	}
713 
714 	/* check id */
715 	hdr_offset = htobe32(*(const uint32_t *)
716 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
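	/* the header pointer is stored big-endian in the image;
	   htobe32() performs the same swap as be32toh() here */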
717 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
718 		device_printf(sc->dev, "Bad firmware file\n");
719 		status = EIO;
720 		goto abort_with_buffer;
721 	}
722 	hdr = (const void*)(inflate_buffer + hdr_offset);
723 
724 	status = mxge_validate_firmware(sc, hdr);
725 	if (status != 0)
726 		goto abort_with_buffer;
727 
728 	/* Copy the inflated firmware to NIC SRAM. */
729 	for (i = 0; i < fw_len; i += 256) {
730 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
731 			      inflate_buffer + i,
732 			      min(256U, (unsigned)(fw_len - i)));
733 		mb();
734 		dummy = *sc->sram;
735 		mb();
736 	}
737 
738 	*limit = fw_len;
739 	status = 0;
740 abort_with_buffer:
741 	free(inflate_buffer, M_TEMP);
742 abort_with_zs:
743 	inflateEnd(&zs);
744 abort_with_fw:
745 	firmware_put(fw, FIRMWARE_UNLOAD);
746 	return status;
747 }
748 
749 /*
750  * Enable or disable periodic RDMAs from the host to make certain
751  * chipsets resend dropped PCIe messages
752  */
753 
754 static void
755 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
756 {
757 	char buf_bytes[72];
758 	volatile uint32_t *confirm;
759 	volatile char *submit;
760 	uint32_t *buf, dma_low, dma_high;
761 	int i;
762 
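	/* align buf on an 8-byte boundary within buf_bytes */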
763 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
764 
765 	/* clear confirmation addr */
766 	confirm = (volatile uint32_t *)sc->cmd;
767 	*confirm = 0;
768 	mb();
769 
770 	/* send an rdma command to the PCIe engine, and wait for the
771 	   response in the confirmation address.  The firmware should
772 	   write a -1 there to indicate it is alive and well
773 	*/
774 
775 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
776 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
777 	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
778 	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
779 	buf[2] = htobe32(0xffffffff);		/* confirm data */
780 	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
781 	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
782 	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
783 	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
784 	buf[5] = htobe32(enable);			/* enable? */
785 
786 
787 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
788 
789 	mxge_pio_copy(submit, buf, 64);
790 	mb();
791 	DELAY(1000);
792 	mb();
793 	i = 0;
794 	while (*confirm != 0xffffffff && i < 20) {
795 		DELAY(1000);
796 		i++;
797 	}
798 	if (*confirm != 0xffffffff) {
799 		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
800 			      (enable ? "enable" : "disable"), confirm,
801 			      *confirm);
802 	}
803 	return;
804 }
805 
806 static int
807 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
808 {
809 	mcp_cmd_t *buf;
810 	char buf_bytes[sizeof(*buf) + 8];
811 	volatile mcp_cmd_response_t *response = sc->cmd;
812 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
813 	uint32_t dma_low, dma_high;
814 	int err, sleep_total = 0;
815 
816 	/* ensure buf is aligned to 8 bytes */
817 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
818 
819 	buf->data0 = htobe32(data->data0);
820 	buf->data1 = htobe32(data->data1);
821 	buf->data2 = htobe32(data->data2);
822 	buf->cmd = htobe32(cmd);
823 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
824 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
825 
826 	buf->response_addr.low = htobe32(dma_low);
827 	buf->response_addr.high = htobe32(dma_high);
828 	mtx_lock(&sc->cmd_mtx);
829 	response->result = 0xffffffff;
830 	mb();
831 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
832 
833 	/* wait up to 20ms */
834 	err = EAGAIN;
835 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
836 		bus_dmamap_sync(sc->cmd_dma.dmat,
837 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
838 		mb();
839 		switch (be32toh(response->result)) {
840 		case 0:
841 			data->data0 = be32toh(response->data);
842 			err = 0;
843 			break;
844 		case 0xffffffff:
845 			DELAY(1000);
846 			break;
847 		case MXGEFW_CMD_UNKNOWN:
848 			err = ENOSYS;
849 			break;
850 		case MXGEFW_CMD_ERROR_UNALIGNED:
851 			err = E2BIG;
852 			break;
853 		case MXGEFW_CMD_ERROR_BUSY:
854 			err = EBUSY;
855 			break;
856 		default:
857 			device_printf(sc->dev,
858 				      "mxge: command %d "
859 				      "failed, result = %d\n",
860 				      cmd, be32toh(response->result));
861 			err = ENXIO;
862 			break;
863 		}
864 		if (err != EAGAIN)
865 			break;
866 	}
867 	if (err == EAGAIN)
868 		device_printf(sc->dev, "mxge: command %d timed out, "
869 			      "result = %d\n",
870 			      cmd, be32toh(response->result));
871 	mtx_unlock(&sc->cmd_mtx);
872 	return err;
873 }
874 
875 static int
876 mxge_adopt_running_firmware(mxge_softc_t *sc)
877 {
878 	struct mcp_gen_header *hdr;
879 	const size_t bytes = sizeof (struct mcp_gen_header);
880 	size_t hdr_offset;
881 	int status;
882 
883 	/* find running firmware header */
884 	hdr_offset = htobe32(*(volatile uint32_t *)
885 			     (sc->sram + MCP_HEADER_PTR_OFFSET));
886 
887 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
888 		device_printf(sc->dev,
889 			      "Running firmware has bad header offset (%d)\n",
890 			      (int)hdr_offset);
891 		return EIO;
892 	}
893 
894 	/* copy header of running firmware from SRAM to host memory to
895 	 * validate firmware */
896 	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
897 	if (hdr == NULL) {
898 		device_printf(sc->dev, "could not malloc firmware hdr\n");
899 		return ENOMEM;
900 	}
901 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
902 				rman_get_bushandle(sc->mem_res),
903 				hdr_offset, (char *)hdr, bytes);
904 	status = mxge_validate_firmware(sc, hdr);
905 	free(hdr, M_DEVBUF);
906 
907 	/*
908 	 * check to see if adopted firmware has bug where adopting
909 	 * it will cause broadcasts to be filtered unless the NIC
910 	 * is kept in ALLMULTI mode
911 	 */
912 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
913 	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
914 		sc->adopted_rx_filter_bug = 1;
915 		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
916 			      "working around rx filter bug\n",
917 			      sc->fw_ver_major, sc->fw_ver_minor,
918 			      sc->fw_ver_tiny);
919 	}
920 
921 	return status;
922 }
923 
924 
925 static int
926 mxge_load_firmware(mxge_softc_t *sc, int adopt)
927 {
928 	volatile uint32_t *confirm;
929 	volatile char *submit;
930 	char buf_bytes[72];
931 	uint32_t *buf, size, dma_low, dma_high;
932 	int status, i;
933 
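	/* align buf on an 8-byte boundary within buf_bytes */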
934 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
935 
936 	size = sc->sram_size;
937 	status = mxge_load_firmware_helper(sc, &size);
938 	if (status) {
939 		if (!adopt)
940 			return status;
941 		/* Try to use the currently running firmware, if
942 		   it is new enough */
943 		status = mxge_adopt_running_firmware(sc);
944 		if (status) {
945 			device_printf(sc->dev,
946 				      "failed to adopt running firmware\n");
947 			return status;
948 		}
949 		device_printf(sc->dev,
950 			      "Successfully adopted running firmware\n");
951 		if (sc->tx_boundary == 4096) {
952 			device_printf(sc->dev,
953 				"Using firmware currently running on NIC"
954 				 ".  For optimal\n");
955 			device_printf(sc->dev,
956 				 "performance consider loading optimized "
957 				 "firmware\n");
958 		}
959 		sc->fw_name = mxge_fw_unaligned;
960 		sc->tx_boundary = 2048;
961 		return 0;
962 	}
963 	/* clear confirmation addr */
964 	confirm = (volatile uint32_t *)sc->cmd;
965 	*confirm = 0;
966 	mb();
967 	/* send a reload command to the bootstrap MCP, and wait for the
968 	   response in the confirmation address.  The firmware should
969 	   write a -1 there to indicate it is alive and well
970 	*/
971 
972 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
973 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
974 
975 	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
976 	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
977 	buf[2] = htobe32(0xffffffff);	/* confirm data */
978 
979 	/* FIX: All newest firmware should un-protect the bottom of
980 	   the sram before handoff. However, the very first interfaces
981 	   do not. Therefore the handoff copy must skip the first 8 bytes
982 	*/
983 					/* where the code starts */
984 	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
985 	buf[4] = htobe32(size - 8); 	/* length of code */
986 	buf[5] = htobe32(8);		/* where to copy to */
987 	buf[6] = htobe32(0);		/* where to jump to */
988 
989 	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
990 	mxge_pio_copy(submit, buf, 64);
991 	mb();
992 	DELAY(1000);
993 	mb();
994 	i = 0;
995 	while (*confirm != 0xffffffff && i < 20) {
996 		DELAY(1000*10);
997 		i++;
998 		bus_dmamap_sync(sc->cmd_dma.dmat,
999 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
1000 	}
1001 	if (*confirm != 0xffffffff) {
1002 		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
1003 			confirm, *confirm);
1004 
1005 		return ENXIO;
1006 	}
1007 	return 0;
1008 }
1009 
1010 static int
1011 mxge_update_mac_address(mxge_softc_t *sc)
1012 {
1013 	mxge_cmd_t cmd;
1014 	uint8_t *addr = sc->mac_addr;
1015 	int status;
1016 
1017 
1018 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1019 		     | (addr[2] << 8) | addr[3]);
1020 
1021 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1022 
1023 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1024 	return status;
1025 }
1026 
1027 static int
1028 mxge_change_pause(mxge_softc_t *sc, int pause)
1029 {
1030 	mxge_cmd_t cmd;
1031 	int status;
1032 
1033 	if (pause)
1034 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1035 				       &cmd);
1036 	else
1037 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1038 				       &cmd);
1039 
1040 	if (status) {
1041 		device_printf(sc->dev, "Failed to set flow control mode\n");
1042 		return ENXIO;
1043 	}
1044 	sc->pause = pause;
1045 	return 0;
1046 }
1047 
1048 static void
1049 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1050 {
1051 	mxge_cmd_t cmd;
1052 	int status;
1053 
1054 	if (mxge_always_promisc)
1055 		promisc = 1;
1056 
1057 	if (promisc)
1058 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1059 				       &cmd);
1060 	else
1061 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1062 				       &cmd);
1063 
1064 	if (status) {
1065 		device_printf(sc->dev, "Failed to set promisc mode\n");
1066 	}
1067 }
1068 
1069 static void
1070 mxge_set_multicast_list(mxge_softc_t *sc)
1071 {
1072 	mxge_cmd_t cmd;
1073 	struct ifmultiaddr *ifma;
1074 	struct ifnet *ifp = sc->ifp;
1075 	int err;
1076 
1077 	/* This firmware is known to not support multicast */
1078 	if (!sc->fw_multicast_support)
1079 		return;
1080 
1081 	/* Disable multicast filtering while we play with the lists */
1082 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1083 	if (err != 0) {
1084 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1085 		       " error status: %d\n", err);
1086 		return;
1087 	}
1088 
1089 	if (sc->adopted_rx_filter_bug)
1090 		return;
1091 
1092 	if (ifp->if_flags & IFF_ALLMULTI)
1093 		/* request to disable multicast filtering, so quit here */
1094 		return;
1095 
1096 	/* Flush all the filters */
1097 
1098 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1099 	if (err != 0) {
1100 		device_printf(sc->dev,
1101 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1102 			      ", error status: %d\n", err);
1103 		return;
1104 	}
1105 
1106 	/* Walk the multicast list, and add each address */
1107 
1108 	IF_ADDR_LOCK(ifp);
1109 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1110 		if (ifma->ifma_addr->sa_family != AF_LINK)
1111 			continue;
1112 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1113 		      &cmd.data0, 4);
1114 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1115 		      &cmd.data1, 2);
1116 		cmd.data0 = htonl(cmd.data0);
1117 		cmd.data1 = htonl(cmd.data1);
1118 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1119 		if (err != 0) {
1120 			device_printf(sc->dev, "Failed "
1121 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1122 			       "%d\t", err);
1123 			/* abort, leaving multicast filtering off */
1124 			IF_ADDR_UNLOCK(ifp);
1125 			return;
1126 		}
1127 	}
1128 	IF_ADDR_UNLOCK(ifp);
1129 	/* Enable multicast filtering */
1130 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1131 	if (err != 0) {
1132 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1133 		       ", error status: %d\n", err);
1134 	}
1135 }
1136 
1137 static int
1138 mxge_max_mtu(mxge_softc_t *sc)
1139 {
1140 	mxge_cmd_t cmd;
1141 	int status;
1142 
1143 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1144 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1145 
1146 	/* try to set nbufs to see if we can
1147 	   use virtually contiguous jumbos */
1148 	cmd.data0 = 0;
1149 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1150 			       &cmd);
1151 	if (status == 0)
1152 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1153 
1154 	/* otherwise, we're limited to MJUMPAGESIZE */
1155 	return MJUMPAGESIZE - MXGEFW_PAD;
1156 }
1157 
1158 static int
1159 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1160 {
1161 	struct mxge_slice_state *ss;
1162 	mxge_rx_done_t *rx_done;
1163 	volatile uint32_t *irq_claim;
1164 	mxge_cmd_t cmd;
1165 	int slice, status;
1166 
1167 	/* try to send a reset command to the card to see if it
1168 	   is alive */
1169 	memset(&cmd, 0, sizeof (cmd));
1170 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1171 	if (status != 0) {
1172 		device_printf(sc->dev, "failed reset\n");
1173 		return ENXIO;
1174 	}
1175 
1176 	mxge_dummy_rdma(sc, 1);
1177 
1178 
1179 	/* set the intrq size */
1180 	cmd.data0 = sc->rx_ring_size;
1181 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1182 
1183 	/*
1184 	 * Even though we already know how many slices are supported
1185 	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1186 	 * has magic side effects, and must be called after a reset.
1187 	 * It must be called prior to calling any RSS related cmds,
1188 	 * including assigning an interrupt queue for anything but
1189 	 * slice 0.  It must also be called *after*
1190 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1191 	 * the firmware to compute offsets.
1192 	 */
1193 
1194 	if (sc->num_slices > 1) {
1195 		/* ask the maximum number of slices it supports */
1196 		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1197 					   &cmd);
1198 		if (status != 0) {
1199 			device_printf(sc->dev,
1200 				      "failed to get number of slices\n");
1201 			return status;
1202 		}
1203 		/*
1204 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1205 		 * to setting up the interrupt queue DMA
1206 		 */
1207 		cmd.data0 = sc->num_slices;
1208 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1209 		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1210 					   &cmd);
1211 		if (status != 0) {
1212 			device_printf(sc->dev,
1213 				      "failed to set number of slices\n");
1214 			return status;
1215 		}
1216 	}
1217 
1218 
1219 	if (interrupts_setup) {
1220 		/* Now exchange information about interrupts  */
1221 		for (slice = 0; slice < sc->num_slices; slice++) {
1222 			rx_done = &sc->ss[slice].rx_done;
1223 			memset(rx_done->entry, 0, sc->rx_ring_size);
1224 			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
1225 			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
1226 			cmd.data2 = slice;
1227 			status |= mxge_send_cmd(sc,
1228 						MXGEFW_CMD_SET_INTRQ_DMA,
1229 						&cmd);
1230 		}
1231 	}
1232 
1233 	status |= mxge_send_cmd(sc,
1234 				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1235 
1236 
1237 	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1238 
1239 	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1240 	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1241 
1242 
1243 	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
1244 				&cmd);
1245 	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1246 	if (status != 0) {
1247 		device_printf(sc->dev, "failed set interrupt parameters\n");
1248 		return status;
1249 	}
1250 
1251 
1252 	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1253 
1254 
1255 	/* run a DMA benchmark */
1256 	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);
1257 
1258 	for (slice = 0; slice < sc->num_slices; slice++) {
1259 		ss = &sc->ss[slice];
1260 
1261 		ss->irq_claim = irq_claim + (2 * slice);
1262 		/* reset mcp/driver shared state back to 0 */
1263 		ss->rx_done.idx = 0;
1264 		ss->rx_done.cnt = 0;
1265 		ss->tx.req = 0;
1266 		ss->tx.done = 0;
1267 		ss->tx.pkt_done = 0;
1268 		ss->tx.wake = 0;
1269 		ss->tx.defrag = 0;
1270 		ss->tx.stall = 0;
1271 		ss->rx_big.cnt = 0;
1272 		ss->rx_small.cnt = 0;
1273 		ss->lro_bad_csum = 0;
1274 		ss->lro_queued = 0;
1275 		ss->lro_flushed = 0;
1276 		if (ss->fw_stats != NULL) {
1277 			ss->fw_stats->valid = 0;
1278 			ss->fw_stats->send_done_count = 0;
1279 		}
1280 	}
1281 	sc->rdma_tags_available = 15;
1282 	status = mxge_update_mac_address(sc);
1283 	mxge_change_promisc(sc, 0);
1284 	mxge_change_pause(sc, sc->pause);
1285 	mxge_set_multicast_list(sc);
1286 	return status;
1287 }
1288 
1289 static int
1290 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1291 {
1292         mxge_softc_t *sc;
1293         unsigned int intr_coal_delay;
1294         int err;
1295 
1296         sc = arg1;
1297         intr_coal_delay = sc->intr_coal_delay;
1298         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1299         if (err != 0) {
1300                 return err;
1301         }
1302         if (intr_coal_delay == sc->intr_coal_delay)
1303                 return 0;
1304 
1305         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1306                 return EINVAL;
1307 
1308 	mtx_lock(&sc->driver_mtx);
1309 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1310 	sc->intr_coal_delay = intr_coal_delay;
1311 
1312 	mtx_unlock(&sc->driver_mtx);
1313         return err;
1314 }
1315 
1316 static int
1317 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1318 {
1319         mxge_softc_t *sc;
1320         unsigned int enabled;
1321         int err;
1322 
1323         sc = arg1;
1324         enabled = sc->pause;
1325         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1326         if (err != 0) {
1327                 return err;
1328         }
1329         if (enabled == sc->pause)
1330                 return 0;
1331 
1332 	mtx_lock(&sc->driver_mtx);
1333 	err = mxge_change_pause(sc, enabled);
1334 	mtx_unlock(&sc->driver_mtx);
1335         return err;
1336 }
1337 
1338 static int
1339 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1340 {
1341 	struct ifnet *ifp;
1342 	int err = 0;
1343 
1344 	ifp = sc->ifp;
1345 	if (lro_cnt == 0)
1346 		ifp->if_capenable &= ~IFCAP_LRO;
1347 	else
1348 		ifp->if_capenable |= IFCAP_LRO;
1349 	sc->lro_cnt = lro_cnt;
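	/* a running interface must be closed and re-opened for the new
	   lro setting to take effect */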
1350 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1351 		callout_stop(&sc->co_hdl);
1352 		mxge_close(sc);
1353 		err = mxge_open(sc);
1354 		if (err == 0)
1355 			callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
1356 	}
1357 	return err;
1358 }
1359 
1360 static int
1361 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1362 {
1363 	mxge_softc_t *sc;
1364 	unsigned int lro_cnt;
1365 	int err;
1366 
1367 	sc = arg1;
1368 	lro_cnt = sc->lro_cnt;
1369 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1370 	if (err != 0)
1371 		return err;
1372 
1373 	if (lro_cnt == sc->lro_cnt)
1374 		return 0;
1375 
1376 	if (lro_cnt > 128)
1377 		return EINVAL;
1378 
1379 	mtx_lock(&sc->driver_mtx);
1380 	err = mxge_change_lro_locked(sc, lro_cnt);
1381 	mtx_unlock(&sc->driver_mtx);
1382 	return err;
1383 }
1384 
1385 static int
1386 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1387 {
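        /* sysctl helper for the firmware's big-endian counters:
         * report the swapped value read-only by passing it via arg2
         * instead of a pointer */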
1388         int err;
1389 
1390         if (arg1 == NULL)
1391                 return EFAULT;
1392         arg2 = be32toh(*(int *)arg1);
1393         arg1 = NULL;
1394         err = sysctl_handle_int(oidp, arg1, arg2, req);
1395 
1396         return err;
1397 }
1398 
1399 static void
1400 mxge_rem_sysctls(mxge_softc_t *sc)
1401 {
1402 	struct mxge_slice_state *ss;
1403 	int slice;
1404 
1405 	if (sc->slice_sysctl_tree == NULL)
1406 		return;
1407 
1408 	for (slice = 0; slice < sc->num_slices; slice++) {
1409 		ss = &sc->ss[slice];
1410 		if (ss == NULL || ss->sysctl_tree == NULL)
1411 			continue;
1412 		sysctl_ctx_free(&ss->sysctl_ctx);
1413 		ss->sysctl_tree = NULL;
1414 	}
1415 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1416 	sc->slice_sysctl_tree = NULL;
1417 }
1418 
1419 static void
1420 mxge_add_sysctls(mxge_softc_t *sc)
1421 {
1422 	struct sysctl_ctx_list *ctx;
1423 	struct sysctl_oid_list *children;
1424 	mcp_irq_data_t *fw;
1425 	struct mxge_slice_state *ss;
1426 	int slice;
1427 	char slice_num[8];
1428 
1429 	ctx = device_get_sysctl_ctx(sc->dev);
1430 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1431 	fw = sc->ss[0].fw_stats;
1432 
1433 	/* random information */
1434 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1435 		       "firmware_version",
1436 		       CTLFLAG_RD, &sc->fw_version,
1437 		       0, "firmware version");
1438 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1439 		       "serial_number",
1440 		       CTLFLAG_RD, &sc->serial_number_string,
1441 		       0, "serial number");
1442 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1443 		       "product_code",
1444 		       CTLFLAG_RD, &sc->product_code_string,
1445 		       0, "product_code");
1446 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1447 		       "pcie_link_width",
1448 		       CTLFLAG_RD, &sc->link_width,
1449 		       0, "tx_boundary");
1450 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1451 		       "tx_boundary",
1452 		       CTLFLAG_RD, &sc->tx_boundary,
1453 		       0, "tx_boundary");
1454 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1455 		       "write_combine",
1456 		       CTLFLAG_RD, &sc->wc,
1457 		       0, "write combining PIO?");
1458 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1459 		       "read_dma_MBs",
1460 		       CTLFLAG_RD, &sc->read_dma,
1461 		       0, "DMA Read speed in MB/s");
1462 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1463 		       "write_dma_MBs",
1464 		       CTLFLAG_RD, &sc->write_dma,
1465 		       0, "DMA Write speed in MB/s");
1466 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1467 		       "read_write_dma_MBs",
1468 		       CTLFLAG_RD, &sc->read_write_dma,
1469 		       0, "DMA concurrent Read/Write speed in MB/s");
1470 
1471 
1472 	/* performance related tunables */
1473 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1474 			"intr_coal_delay",
1475 			CTLTYPE_INT|CTLFLAG_RW, sc,
1476 			0, mxge_change_intr_coal,
1477 			"I", "interrupt coalescing delay in usecs");
1478 
1479 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1480 			"flow_control_enabled",
1481 			CTLTYPE_INT|CTLFLAG_RW, sc,
1482 			0, mxge_change_flow_control,
1483 			"I", "interrupt coalescing delay in usecs");
1484 
1485 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1486 		       "deassert_wait",
1487 		       CTLFLAG_RW, &mxge_deassert_wait,
1488 		       0, "Wait for IRQ line to go low in ihandler");
1489 
1490 	/* stats block from firmware is in network byte order.
1491 	   Need to swap it */
1492 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1493 			"link_up",
1494 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1495 			0, mxge_handle_be32,
1496 			"I", "link up");
1497 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1498 			"rdma_tags_available",
1499 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1500 			0, mxge_handle_be32,
1501 			"I", "rdma_tags_available");
1502 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1503 			"dropped_bad_crc32",
1504 			CTLTYPE_INT|CTLFLAG_RD,
1505 			&fw->dropped_bad_crc32,
1506 			0, mxge_handle_be32,
1507 			"I", "dropped_bad_crc32");
1508 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1509 			"dropped_bad_phy",
1510 			CTLTYPE_INT|CTLFLAG_RD,
1511 			&fw->dropped_bad_phy,
1512 			0, mxge_handle_be32,
1513 			"I", "dropped_bad_phy");
1514 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1515 			"dropped_link_error_or_filtered",
1516 			CTLTYPE_INT|CTLFLAG_RD,
1517 			&fw->dropped_link_error_or_filtered,
1518 			0, mxge_handle_be32,
1519 			"I", "dropped_link_error_or_filtered");
1520 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1521 			"dropped_link_overflow",
1522 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1523 			0, mxge_handle_be32,
1524 			"I", "dropped_link_overflow");
1525 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1526 			"dropped_multicast_filtered",
1527 			CTLTYPE_INT|CTLFLAG_RD,
1528 			&fw->dropped_multicast_filtered,
1529 			0, mxge_handle_be32,
1530 			"I", "dropped_multicast_filtered");
1531 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1532 			"dropped_no_big_buffer",
1533 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1534 			0, mxge_handle_be32,
1535 			"I", "dropped_no_big_buffer");
1536 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1537 			"dropped_no_small_buffer",
1538 			CTLTYPE_INT|CTLFLAG_RD,
1539 			&fw->dropped_no_small_buffer,
1540 			0, mxge_handle_be32,
1541 			"I", "dropped_no_small_buffer");
1542 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1543 			"dropped_overrun",
1544 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1545 			0, mxge_handle_be32,
1546 			"I", "dropped_overrun");
1547 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1548 			"dropped_pause",
1549 			CTLTYPE_INT|CTLFLAG_RD,
1550 			&fw->dropped_pause,
1551 			0, mxge_handle_be32,
1552 			"I", "dropped_pause");
1553 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1554 			"dropped_runt",
1555 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1556 			0, mxge_handle_be32,
1557 			"I", "dropped_runt");
1558 
1559 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1560 			"dropped_unicast_filtered",
1561 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1562 			0, mxge_handle_be32,
1563 			"I", "dropped_unicast_filtered");
1564 
1565 	/* verbose printing? */
1566 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1567 		       "verbose",
1568 		       CTLFLAG_RW, &mxge_verbose,
1569 		       0, "verbose printing");
1570 
1571 	/* lro */
1572 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1573 			"lro_cnt",
1574 			CTLTYPE_INT|CTLFLAG_RW, sc,
1575 			0, mxge_change_lro,
1576 			"I", "number of lro merge queues");
1577 
1578 
1579 	/* add counters exported for debugging from all slices */
1580 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1581 	sc->slice_sysctl_tree =
1582 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1583 				"slice", CTLFLAG_RD, 0, "");
1584 
1585 	for (slice = 0; slice < sc->num_slices; slice++) {
1586 		ss = &sc->ss[slice];
1587 		sysctl_ctx_init(&ss->sysctl_ctx);
1588 		ctx = &ss->sysctl_ctx;
1589 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1590 		sprintf(slice_num, "%d", slice);
1591 		ss->sysctl_tree =
1592 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1593 					CTLFLAG_RD, 0, "");
1594 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1595 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1596 			       "rx_small_cnt",
1597 			       CTLFLAG_RD, &ss->rx_small.cnt,
1598 			       0, "rx_small_cnt");
1599 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1600 			       "rx_big_cnt",
1601 			       CTLFLAG_RD, &ss->rx_big.cnt,
1602 			       0, "rx_small_cnt");
1603 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1604 			       "tx_req",
1605 			       CTLFLAG_RD, &ss->tx.req,
1606 			       0, "tx_req");
1607 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1608 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1609 			       0, "number of lro merge queues flushed");
1610 
1611 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1612 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1613 			       0, "number of frames appended to lro merge"
1614 			       "queues");
1615 
1616 		/* only transmit from slice 0 for now */
1617 		if (slice > 0)
1618 			continue;
1619 
1620 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1621 			       "tx_done",
1622 			       CTLFLAG_RD, &ss->tx.done,
1623 			       0, "tx_done");
1624 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1625 			       "tx_pkt_done",
1626 			       CTLFLAG_RD, &ss->tx.pkt_done,
1627 			       0, "tx_done");
1628 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1629 			       "tx_stall",
1630 			       CTLFLAG_RD, &ss->tx.stall,
1631 			       0, "tx_stall");
1632 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1633 			       "tx_wake",
1634 			       CTLFLAG_RD, &ss->tx.wake,
1635 			       0, "tx_wake");
1636 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1637 			       "tx_defrag",
1638 			       CTLFLAG_RD, &ss->tx.defrag,
1639 			       0, "tx_defrag");
1640 	}
1641 }
1642 
1643 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1644    backwards one at a time and handle ring wraps */
1645 
1646 static inline void
1647 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1648 			    mcp_kreq_ether_send_t *src, int cnt)
1649 {
1650         int idx, starting_slot;
1651         starting_slot = tx->req;
1652         while (cnt > 1) {
1653                 cnt--;
1654                 idx = (starting_slot + cnt) & tx->mask;
1655                 mxge_pio_copy(&tx->lanai[idx],
1656 			      &src[cnt], sizeof(*src));
1657                 mb();
1658         }
1659 }
1660 
1661 /*
1662  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1663  * at most 32 bytes at a time, so as to avoid involving the software
1664  * pio handler in the nic.   We re-write the first segment's flags
1665  * to mark them valid only after writing the entire chain
1666  */
1667 
1668 static inline void
1669 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
1670                   int cnt)
1671 {
1672         int idx, i;
1673         uint32_t *src_ints;
1674 	volatile uint32_t *dst_ints;
1675         mcp_kreq_ether_send_t *srcp;
1676 	volatile mcp_kreq_ether_send_t *dstp, *dst;
1677 	uint8_t last_flags;
1678 
1679         idx = tx->req & tx->mask;
1680 
1681 	last_flags = src->flags;
1682 	src->flags = 0;
1683         mb();
1684         dst = dstp = &tx->lanai[idx];
1685         srcp = src;
1686 
1687         if ((idx + cnt) < tx->mask) {
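                /* no ring wrap: copy forward, two 16-byte requests
                   (one 32-byte PIO burst) at a time */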
1688                 for (i = 0; i < (cnt - 1); i += 2) {
1689                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1690                         mb(); /* force write every 32 bytes */
1691                         srcp += 2;
1692                         dstp += 2;
1693                 }
1694         } else {
1695                 /* submit all but the first request, and ensure
1696                    that it is submitted below */
1697                 mxge_submit_req_backwards(tx, src, cnt);
1698                 i = 0;
1699         }
1700         if (i < cnt) {
1701                 /* submit the first request */
1702                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1703                 mb(); /* barrier before setting valid flag */
1704         }
1705 
1706         /* re-write the last 32-bits with the valid flags */
1707         src->flags = last_flags;
1708         src_ints = (uint32_t *)src;
1709         src_ints+=3;
1710         dst_ints = (volatile uint32_t *)dst;
1711         dst_ints+=3;
1712         *dst_ints =  *src_ints;
1713         tx->req += cnt;
1714         mb();
1715 }
1716 
1717 #if IFCAP_TSO4
1718 
1719 static void
1720 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1721 	       int busdma_seg_cnt, int ip_off)
1722 {
1723 	mxge_tx_ring_t *tx;
1724 	mcp_kreq_ether_send_t *req;
1725 	bus_dma_segment_t *seg;
1726 	struct ip *ip;
1727 	struct tcphdr *tcp;
1728 	uint32_t low, high_swapped;
1729 	int len, seglen, cum_len, cum_len_next;
1730 	int next_is_first, chop, cnt, rdma_count, small;
1731 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1732 	uint8_t flags, flags_next;
1733 	static int once;
1734 
1735 	mss = m->m_pkthdr.tso_segsz;
1736 
1737 	/* negative cum_len signifies to the
1738 	 * send loop that we are still in the
1739 	 * header portion of the TSO packet.
1740 	 */
1741 
1742 	/* ensure we have the ethernet, IP and TCP
1743 	   header together in the first mbuf, copy
1744 	   it to a scratch buffer if not */
1745 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1746 		m_copydata(m, 0, ip_off + sizeof (*ip),
1747 			   ss->scratch);
1748 		ip = (struct ip *)(ss->scratch + ip_off);
1749 	} else {
1750 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1751 	}
1752 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1753 			    + sizeof (*tcp))) {
1754 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1755 			   + sizeof (*tcp),  ss->scratch);
1756 		ip = (struct ip *)(ss->scratch + ip_off);	/* headers were just copied to scratch */
1757 	}
1758 
1759 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1760 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1761 
1762 	/* TSO implies checksum offload on this hardware */
1763 	cksum_offset = ip_off + (ip->ip_hl << 2);
1764 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1765 
1766 
1767 	/* for TSO, pseudo_hdr_offset holds mss.
1768 	 * The firmware figures out where to put
1769 	 * the checksum by parsing the header. */
1770 	pseudo_hdr_offset = htobe16(mss);
1771 
1772 	tx = &ss->tx;
1773 	req = tx->req_list;
1774 	seg = tx->seg_list;
1775 	cnt = 0;
1776 	rdma_count = 0;
1777 	/* "rdma_count" is the number of RDMAs belonging to the
1778 	 * current packet BEFORE the current send request. For
1779 	 * non-TSO packets, this is equal to "count".
1780 	 * For TSO packets, rdma_count needs to be reset
1781 	 * to 0 after a segment cut.
1782 	 *
1783 	 * The rdma_count field of the send request is
1784 	 * the number of RDMAs of the packet starting at
1785 	 * that request. For TSO send requests with one or more cuts
1786 	 * in the middle, this is the number of RDMAs starting
1787 	 * after the last cut in the request. All previous
1788 	 * segments before the last cut implicitly have 1 RDMA.
1789 	 *
1790 	 * Since the number of RDMAs is not known beforehand,
1791 	 * it must be filled-in retroactively - after each
1792 	 * segmentation cut or at the end of the entire packet.
1793 	 */
1794 
1795 	while (busdma_seg_cnt) {
1796 		/* Break the busdma segment up into pieces*/
1797 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1798 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1799 		len = seg->ds_len;
1800 
1801 		while (len) {
1802 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1803 			seglen = len;
1804 			cum_len_next = cum_len + seglen;
1805 			(req-rdma_count)->rdma_count = rdma_count + 1;
1806 			if (__predict_true(cum_len >= 0)) {
1807 				/* payload */
1808 				chop = (cum_len_next > mss);
1809 				cum_len_next = cum_len_next % mss;
1810 				next_is_first = (cum_len_next == 0);
1811 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1812 				flags_next |= next_is_first *
1813 					MXGEFW_FLAGS_FIRST;
1814 				rdma_count |= -(chop | next_is_first);
1815 				rdma_count += chop & !next_is_first;
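				/* branchless bookkeeping: either kind of
				 * cut resets rdma_count to -1; a chop
				 * that does not land on an mss boundary
				 * adds one back, so the remainder of this
				 * segment is counted toward the next
				 * cut's RDMA total */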
1816 			} else if (cum_len_next >= 0) {
1817 				/* header ends */
1818 				rdma_count = -1;
1819 				cum_len_next = 0;
1820 				seglen = -cum_len;
1821 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1822 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1823 					MXGEFW_FLAGS_FIRST |
1824 					(small * MXGEFW_FLAGS_SMALL);
1825 			}
1826 
1827 			req->addr_high = high_swapped;
1828 			req->addr_low = htobe32(low);
1829 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1830 			req->pad = 0;
1831 			req->rdma_count = 1;
1832 			req->length = htobe16(seglen);
1833 			req->cksum_offset = cksum_offset;
1834 			req->flags = flags | ((cum_len & 1) *
1835 					      MXGEFW_FLAGS_ALIGN_ODD);
1836 			low += seglen;
1837 			len -= seglen;
1838 			cum_len = cum_len_next;
1839 			flags = flags_next;
1840 			req++;
1841 			cnt++;
1842 			rdma_count++;
1843 			if (__predict_false(cksum_offset > seglen))
1844 				cksum_offset -= seglen;
1845 			else
1846 				cksum_offset = 0;
1847 			if (__predict_false(cnt > tx->max_desc))
1848 				goto drop;
1849 		}
1850 		busdma_seg_cnt--;
1851 		seg++;
1852 	}
1853 	(req-rdma_count)->rdma_count = rdma_count;
1854 
1855 	do {
1856 		req--;
1857 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1858 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1859 
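	/*
	 * Mark the ring slot holding the final descriptor, so that
	 * mxge_tx_done() knows where to credit a completed packet
	 * when it reclaims this request.
	 */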
1860 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1861 	mxge_submit_req(tx, tx->req_list, cnt);
1862 	return;
1863 
1864 drop:
1865 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1866 	m_freem(m);
1867 	ss->sc->ifp->if_oerrors++;
1868 	if (!once) {
1869 		printf("tx->max_desc exceeded via TSO!\n");
1870 		printf("mss = %d, %ld, %d!\n", mss,
1871 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1872 		once = 1;
1873 	}
1874 	return;
1875 
1876 }
1877 
1878 #endif /* IFCAP_TSO4 */
1879 
1880 #ifdef MXGE_NEW_VLAN_API
1881 /*
1882  * We reproduce the software vlan tag insertion from
1883  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1884  * vlan tag insertion. We need to advertise this in order to have the
1885  * vlan interface respect our csum offload flags.
1886  */
1887 static struct mbuf *
1888 mxge_vlan_tag_insert(struct mbuf *m)
1889 {
1890 	struct ether_vlan_header *evl;
1891 
1892 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1893 	if (__predict_false(m == NULL))
1894 		return NULL;
1895 	if (m->m_len < sizeof(*evl)) {
1896 		m = m_pullup(m, sizeof(*evl));
1897 		if (__predict_false(m == NULL))
1898 			return NULL;
1899 	}
1900 	/*
1901 	 * Transform the Ethernet header into an Ethernet header
1902 	 * with 802.1Q encapsulation.
1903 	 */
1904 	evl = mtod(m, struct ether_vlan_header *);
1905 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1906 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1907 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1908 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1909 	m->m_flags &= ~M_VLANTAG;
1910 	return m;
1911 }
1912 #endif /* MXGE_NEW_VLAN_API */
1913 
1914 static void
1915 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1916 {
1917 	mxge_softc_t *sc;
1918 	mcp_kreq_ether_send_t *req;
1919 	bus_dma_segment_t *seg;
1920 	struct mbuf *m_tmp;
1921 	struct ifnet *ifp;
1922 	mxge_tx_ring_t *tx;
1923 	struct ip *ip;
1924 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1925 	uint16_t pseudo_hdr_offset;
1926 	uint8_t flags, cksum_offset;
1927 
1928 
1929 	sc = ss->sc;
1930 	ifp = sc->ifp;
1931 	tx = &ss->tx;
1932 
1933 	ip_off = sizeof (struct ether_header);
1934 #ifdef MXGE_NEW_VLAN_API
1935 	if (m->m_flags & M_VLANTAG) {
1936 		m = mxge_vlan_tag_insert(m);
1937 		if (__predict_false(m == NULL))
1938 			goto drop;
1939 		ip_off += ETHER_VLAN_ENCAP_LEN;
1940 	}
1941 #endif
1942 	/* (try to) map the frame for DMA */
1943 	idx = tx->req & tx->mask;
1944 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1945 				      m, tx->seg_list, &cnt,
1946 				      BUS_DMA_NOWAIT);
1947 	if (__predict_false(err == EFBIG)) {
1948 		/* Too many segments in the chain.  Try
1949 		   to defrag */
1950 		m_tmp = m_defrag(m, M_NOWAIT);
1951 		if (m_tmp == NULL) {
1952 			goto drop;
1953 		}
1954 		ss->tx.defrag++;
1955 		m = m_tmp;
1956 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1957 					      tx->info[idx].map,
1958 					      m, tx->seg_list, &cnt,
1959 					      BUS_DMA_NOWAIT);
1960 	}
1961 	if (__predict_false(err != 0)) {
1962 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1963 			      " packet len = %d\n", err, m->m_pkthdr.len);
1964 		goto drop;
1965 	}
1966 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1967 			BUS_DMASYNC_PREWRITE);
1968 	tx->info[idx].m = m;
1969 
1970 #if IFCAP_TSO4
1971 	/* TSO is different enough, we handle it in another routine */
1972 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1973 		mxge_encap_tso(ss, m, cnt, ip_off);
1974 		return;
1975 	}
1976 #endif
1977 
1978 	req = tx->req_list;
1979 	cksum_offset = 0;
1980 	pseudo_hdr_offset = 0;
1981 	flags = MXGEFW_FLAGS_NO_TSO;
1982 
1983 	/* checksum offloading? */
1984 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1985 		/* ensure ip header is in first mbuf, copy
1986 		   it to a scratch buffer if not */
1987 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1988 			m_copydata(m, 0, ip_off + sizeof (*ip),
1989 				   ss->scratch);
1990 			ip = (struct ip *)(ss->scratch + ip_off);
1991 		} else {
1992 			ip = (struct ip *)(mtod(m, char *) + ip_off);
1993 		}
1994 		cksum_offset = ip_off + (ip->ip_hl << 2);
1995 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1996 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1997 		req->cksum_offset = cksum_offset;
1998 		flags |= MXGEFW_FLAGS_CKSUM;
1999 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2000 	} else {
2001 		odd_flag = 0;
2002 	}
2003 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2004 		flags |= MXGEFW_FLAGS_SMALL;
2005 
2006 	/* convert segments into a request list */
2007 	cum_len = 0;
2008 	seg = tx->seg_list;
2009 	req->flags = MXGEFW_FLAGS_FIRST;
2010 	for (i = 0; i < cnt; i++) {
2011 		req->addr_low =
2012 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2013 		req->addr_high =
2014 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2015 		req->length = htobe16(seg->ds_len);
2016 		req->cksum_offset = cksum_offset;
2017 		if (cksum_offset > seg->ds_len)
2018 			cksum_offset -= seg->ds_len;
2019 		else
2020 			cksum_offset = 0;
2021 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2022 		req->pad = 0; /* complete solid 16-byte block */
2023 		req->rdma_count = 1;
2024 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2025 		cum_len += seg->ds_len;
2026 		seg++;
2027 		req++;
2028 		req->flags = 0;
2029 	}
2030 	req--;
2031 	/* pad runts to 60 bytes */
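	/* (60 is the 64-byte Ethernet minimum less the 4-byte FCS,
	   which the NIC appends; the padding bytes are DMA'd from the
	   shared zero-filled buffer in sc->zeropad_dma) */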
2032 	if (cum_len < 60) {
2033 		req++;
2034 		req->addr_low =
2035 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2036 		req->addr_high =
2037 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2038 		req->length = htobe16(60 - cum_len);
2039 		req->cksum_offset = 0;
2040 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2041 		req->pad = 0; /* complete solid 16-byte block */
2042 		req->rdma_count = 1;
2043 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2044 		cnt++;
2045 	}
2046 
2047 	tx->req_list[0].rdma_count = cnt;
2048 #if 0
2049 	/* print what the firmware will see */
2050 	for (i = 0; i < cnt; i++) {
2051 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2052 		    "cso:%d, flags:0x%x, rdma:%d\n",
2053 		    i, (int)ntohl(tx->req_list[i].addr_high),
2054 		    (int)ntohl(tx->req_list[i].addr_low),
2055 		    (int)ntohs(tx->req_list[i].length),
2056 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2057 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2058 		    tx->req_list[i].rdma_count);
2059 	}
2060 	printf("--------------\n");
2061 #endif
2062 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2063 	mxge_submit_req(tx, tx->req_list, cnt);
2064 	return;
2065 
2066 drop:
2067 	m_freem(m);
2068 	ifp->if_oerrors++;
2069 	return;
2070 }
2071 
2072 
2073 
2074 
2075 static inline void
2076 mxge_start_locked(struct mxge_slice_state *ss)
2077 {
2078 	mxge_softc_t *sc;
2079 	struct mbuf *m;
2080 	struct ifnet *ifp;
2081 	mxge_tx_ring_t *tx;
2082 
2083 	sc = ss->sc;
2084 	ifp = sc->ifp;
2085 	tx = &ss->tx;
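	/*
	 * tx->req - tx->done is the number of descriptors in flight;
	 * dequeue another packet only while more than max_desc slots
	 * remain free, since a single frame can consume up to
	 * max_desc descriptors.
	 */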
2086 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2087 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2088 		if (m == NULL) {
2089 			return;
2090 		}
2091 		/* let BPF see it */
2092 		BPF_MTAP(ifp, m);
2093 
2094 		/* give it to the nic */
2095 		mxge_encap(ss, m);
2096 	}
2097 	/* ran out of transmit slots */
2098 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2099 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2100 		tx->stall++;
2101 	}
2102 }
2103 
2104 static void
2105 mxge_start(struct ifnet *ifp)
2106 {
2107 	mxge_softc_t *sc = ifp->if_softc;
2108 	struct mxge_slice_state *ss;
2109 
2110 	/* only use the first slice for now */
2111 	ss = &sc->ss[0];
2112 	mtx_lock(&ss->tx.mtx);
2113 	mxge_start_locked(ss);
2114 	mtx_unlock(&ss->tx.mtx);
2115 }
2116 
2117 /*
2118  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2119  * at most 32 bytes at a time, so as to avoid involving the software
2120  * pio handler in the nic.   We re-write the first segment's low
2121  * DMA address to mark it valid only after we write the entire chunk
2122  * in a burst
2123  */
2124 static inline void
2125 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2126 		mcp_kreq_ether_recv_t *src)
2127 {
2128 	uint32_t low;
2129 
2130 	low = src->addr_low;
2131 	src->addr_low = 0xffffffff;
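	/*
	 * The all-ones low address marks the first descriptor invalid
	 * while the two 32-byte bursts are in flight; re-writing the
	 * real low address last publishes the whole block of 8 to the
	 * firmware at once.
	 */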
2132 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2133 	mb();
2134 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2135 	mb();
2136 	src->addr_low = low;
2137 	dst->addr_low = low;
2138 	mb();
2139 }
2140 
2141 static int
2142 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2143 {
2144 	bus_dma_segment_t seg;
2145 	struct mbuf *m;
2146 	mxge_rx_ring_t *rx = &ss->rx_small;
2147 	int cnt, err;
2148 
2149 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2150 	if (m == NULL) {
2151 		rx->alloc_fail++;
2152 		err = ENOBUFS;
2153 		goto done;
2154 	}
2155 	m->m_len = MHLEN;
2156 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2157 				      &seg, &cnt, BUS_DMA_NOWAIT);
2158 	if (err != 0) {
2159 		m_free(m);
2160 		goto done;
2161 	}
2162 	rx->info[idx].m = m;
2163 	rx->shadow[idx].addr_low =
2164 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2165 	rx->shadow[idx].addr_high =
2166 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2167 
2168 done:
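	/* receive descriptors are posted in aligned groups of 8; fire
	   one PIO burst when the last slot of a group has been filled */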
2169 	if ((idx & 7) == 7)
2170 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2171 	return err;
2172 }
2173 
2174 static int
2175 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2176 {
2177 	bus_dma_segment_t seg[3];
2178 	struct mbuf *m;
2179 	mxge_rx_ring_t *rx = &ss->rx_big;
2180 	int cnt, err, i;
2181 
2182 	if (rx->cl_size == MCLBYTES)
2183 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2184 	else
2185 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2186 	if (m == NULL) {
2187 		rx->alloc_fail++;
2188 		err = ENOBUFS;
2189 		goto done;
2190 	}
2191 	m->m_len = rx->cl_size;
2192 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2193 				      seg, &cnt, BUS_DMA_NOWAIT);
2194 	if (err != 0) {
2195 		m_free(m);
2196 		goto done;
2197 	}
2198 	rx->info[idx].m = m;
2199 	rx->shadow[idx].addr_low =
2200 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2201 	rx->shadow[idx].addr_high =
2202 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2203 
2204 #if MXGE_VIRT_JUMBOS
2205 	for (i = 1; i < cnt; i++) {
2206 		rx->shadow[idx + i].addr_low =
2207 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2208 		rx->shadow[idx + i].addr_high =
2209 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2210 	}
2211 #endif
2212 
2213 done:
2214 	for (i = 0; i < rx->nbufs; i++) {
2215 		if ((idx & 7) == 7) {
2216 			mxge_submit_8rx(&rx->lanai[idx - 7],
2217 					&rx->shadow[idx - 7]);
2218 		}
2219 		idx++;
2220 	}
2221 	return err;
2222 }
2223 
2224 /*
2225  *  Myri10GE hardware checksums are not valid if the sender
2226  *  padded the frame with non-zero padding.  This is because
2227  *  the firmware just does a simple 16-bit 1s complement
2228  *  checksum across the entire frame, excluding the first 14
2229  *  bytes.  It is best to simply check the checksum and
2230  *  tell the stack about it only if the checksum is good
2231  */
2232 
2233 static inline uint16_t
2234 mxge_rx_csum(struct mbuf *m, int csum)
2235 {
2236 	struct ether_header *eh;
2237 	struct ip *ip;
2238 	uint16_t c;
2239 
2240 	eh = mtod(m, struct ether_header *);
2241 
2242 	/* only deal with IPv4 TCP & UDP for now */
2243 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2244 		return 1;
2245 	ip = (struct ip *)(eh + 1);
2246 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2247 			    ip->ip_p != IPPROTO_UDP))
2248 		return 1;
2249 
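	/*
	 * The firmware checksum covers everything past the 14-byte
	 * Ethernet header, i.e. the IP header plus the TCP/UDP
	 * segment.  Since a valid IP header sums to 0xffff (zero in
	 * ones-complement), adding the pseudo-header fields (addresses,
	 * protocol, and segment length = ip_len - header length)
	 * yields 0xffff for a good frame; the xor below maps that
	 * to 0.
	 */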
2250 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2251 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2252 			    (ip->ip_hl << 2) + ip->ip_p));
2253 	c ^= 0xffff;
2254 	return (c);
2255 }
2256 
2257 static void
2258 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2259 {
2260 	struct ether_vlan_header *evl;
2261 	struct ether_header *eh;
2262 	uint32_t partial;
2263 
2264 	evl = mtod(m, struct ether_vlan_header *);
2265 	eh = mtod(m, struct ether_header *);
2266 
2267 	/*
2268 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2269 	 * after what the firmware thought was the end of the ethernet
2270 	 * header.
2271 	 */
2272 
2273 	/* put checksum into host byte order */
2274 	*csum = ntohs(*csum);
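	/* partial is the 32-bit word holding the 4 tag bytes being
	   removed from the checksummed region; adding its ones-
	   complement plus the end-around carry subtracts it, and the
	   two folds compress the 32-bit accumulator back to 16 bits */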
2275 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2276 	(*csum) += ~partial;
2277 	(*csum) +=  ((*csum) < ~partial);
2278 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2279 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2280 
2281 	/* restore checksum to network byte order;
2282 	   later consumers expect this */
2283 	*csum = htons(*csum);
2284 
2285 	/* save the tag */
2286 #ifdef MXGE_NEW_VLAN_API
2287 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2288 #else
2289 	{
2290 		struct m_tag *mtag;
2291 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2292 				   M_NOWAIT);
2293 		if (mtag == NULL)
2294 			return;
2295 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2296 		m_tag_prepend(m, mtag);
2297 	}
2298 
2299 #endif
2300 	m->m_flags |= M_VLANTAG;
2301 
2302 	/*
2303 	 * Remove the 802.1q header by copying the Ethernet
2304 	 * addresses over it and adjusting the beginning of
2305 	 * the data in the mbuf.  The encapsulated Ethernet
2306 	 * type field is already in place.
2307 	 */
2308 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2309 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2310 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2311 }
2312 
2313 
2314 static inline void
2315 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2316 {
2317 	mxge_softc_t *sc;
2318 	struct ifnet *ifp;
2319 	struct mbuf *m;
2320 	struct ether_header *eh;
2321 	mxge_rx_ring_t *rx;
2322 	bus_dmamap_t old_map;
2323 	int idx;
2324 	uint16_t tcpudp_csum;
2325 
2326 	sc = ss->sc;
2327 	ifp = sc->ifp;
2328 	rx = &ss->rx_big;
2329 	idx = rx->cnt & rx->mask;
2330 	rx->cnt += rx->nbufs;
2331 	/* save a pointer to the received mbuf */
2332 	m = rx->info[idx].m;
2333 	/* try to replace the received mbuf */
2334 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2335 		/* drop the frame -- the old mbuf is re-cycled */
2336 		ifp->if_ierrors++;
2337 		return;
2338 	}
2339 
2340 	/* unmap the received buffer */
2341 	old_map = rx->info[idx].map;
2342 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2343 	bus_dmamap_unload(rx->dmat, old_map);
2344 
2345 	/* swap the bus_dmamap_t's */
2346 	rx->info[idx].map = rx->extra_map;
2347 	rx->extra_map = old_map;
2348 
2349 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2350 	 * aligned */
2351 	m->m_data += MXGEFW_PAD;
2352 
2353 	m->m_pkthdr.rcvif = ifp;
2354 	m->m_len = m->m_pkthdr.len = len;
2355 	ss->ipackets++;
2356 	eh = mtod(m, struct ether_header *);
2357 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2358 		mxge_vlan_tag_remove(m, &csum);
2359 	}
2360 	/* if the checksum is valid, mark it in the mbuf header */
2361 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2362 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2363 			return;
2364 		/* otherwise, it was a UDP frame, or a TCP frame which
2365 		   we could not do LRO on.  Tell the stack that the
2366 		   checksum is good */
2367 		m->m_pkthdr.csum_data = 0xffff;
2368 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2369 	}
2370 	/* pass the frame up the stack */
2371 	(*ifp->if_input)(ifp, m);
2372 }
2373 
2374 static inline void
2375 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2376 {
2377 	mxge_softc_t *sc;
2378 	struct ifnet *ifp;
2379 	struct ether_header *eh;
2380 	struct mbuf *m;
2381 	mxge_rx_ring_t *rx;
2382 	bus_dmamap_t old_map;
2383 	int idx;
2384 	uint16_t tcpudp_csum;
2385 
2386 	sc = ss->sc;
2387 	ifp = sc->ifp;
2388 	rx = &ss->rx_small;
2389 	idx = rx->cnt & rx->mask;
2390 	rx->cnt++;
2391 	/* save a pointer to the received mbuf */
2392 	m = rx->info[idx].m;
2393 	/* try to replace the received mbuf */
2394 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2395 		/* drop the frame -- the old mbuf is re-cycled */
2396 		ifp->if_ierrors++;
2397 		return;
2398 	}
2399 
2400 	/* unmap the received buffer */
2401 	old_map = rx->info[idx].map;
2402 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2403 	bus_dmamap_unload(rx->dmat, old_map);
2404 
2405 	/* swap the bus_dmamap_t's */
2406 	rx->info[idx].map = rx->extra_map;
2407 	rx->extra_map = old_map;
2408 
2409 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2410 	 * aligned */
2411 	m->m_data += MXGEFW_PAD;
2412 
2413 	m->m_pkthdr.rcvif = ifp;
2414 	m->m_len = m->m_pkthdr.len = len;
2415 	ss->ipackets++;
2416 	eh = mtod(m, struct ether_header *);
2417 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2418 		mxge_vlan_tag_remove(m, &csum);
2419 	}
2420 	/* if the checksum is valid, mark it in the mbuf header */
2421 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2422 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2423 			return;
2424 		/* otherwise, it was a UDP frame, or a TCP frame which
2425 		   we could not do LRO on.  Tell the stack that the
2426 		   checksum is good */
2427 		m->m_pkthdr.csum_data = 0xffff;
2428 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2429 	}
2430 	/* pass the frame up the stack */
2431 	(*ifp->if_input)(ifp, m);
2432 }
2433 
2434 static inline void
2435 mxge_clean_rx_done(struct mxge_slice_state *ss)
2436 {
2437 	mxge_rx_done_t *rx_done = &ss->rx_done;
2438 	struct lro_entry *lro;
2439 	int limit = 0;
2440 	uint16_t length;
2441 	uint16_t checksum;
2442 
2443 
2444 	while (rx_done->entry[rx_done->idx].length != 0) {
2445 		length = ntohs(rx_done->entry[rx_done->idx].length);
2446 		rx_done->entry[rx_done->idx].length = 0;
2447 		checksum = rx_done->entry[rx_done->idx].checksum;
2448 		if (length <= (MHLEN - MXGEFW_PAD))
2449 			mxge_rx_done_small(ss, length, checksum);
2450 		else
2451 			mxge_rx_done_big(ss, length, checksum);
2452 		rx_done->cnt++;
2453 		rx_done->idx = rx_done->cnt & rx_done->mask;
2454 
2455 		/* limit potential for livelock */
2456 		if (__predict_false(++limit > rx_done->mask / 2))
2457 			break;
2458 	}
2459 	while (!SLIST_EMPTY(&ss->lro_active)) {
2460 		lro = SLIST_FIRST(&ss->lro_active);
2461 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2462 		mxge_lro_flush(ss, lro);
2463 	}
2464 }
2465 
2466 
2467 static inline void
2468 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2469 {
2470 	struct ifnet *ifp;
2471 	mxge_tx_ring_t *tx;
2472 	struct mbuf *m;
2473 	bus_dmamap_t map;
2474 	int idx;
2475 
2476 	tx = &ss->tx;
2477 	ifp = ss->sc->ifp;
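	/*
	 * mcp_idx is the firmware's running count of completed sends.
	 * Walk the ring reclaiming descriptors; the per-slot flag, set
	 * on a packet's final descriptor at submit time, tells us when
	 * to credit one whole packet to pkt_done.
	 */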
2478 	while (tx->pkt_done != mcp_idx) {
2479 		idx = tx->done & tx->mask;
2480 		tx->done++;
2481 		m = tx->info[idx].m;
2482 		/* mbuf and DMA map only attached to the first
2483 		   segment per-mbuf */
2484 		if (m != NULL) {
2485 			ifp->if_opackets++;
2486 			tx->info[idx].m = NULL;
2487 			map = tx->info[idx].map;
2488 			bus_dmamap_unload(tx->dmat, map);
2489 			m_freem(m);
2490 		}
2491 		if (tx->info[idx].flag) {
2492 			tx->info[idx].flag = 0;
2493 			tx->pkt_done++;
2494 		}
2495 	}
2496 
2497 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack that
2498 	   it's OK to send packets */
2499 
2500 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2501 	    tx->req - tx->done < (tx->mask + 1)/4) {
2502 		mtx_lock(&ss->tx.mtx);
2503 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2504 		ss->tx.wake++;
2505 		mxge_start_locked(ss);
2506 		mtx_unlock(&ss->tx.mtx);
2507 	}
2508 }
2509 
2510 static struct mxge_media_type mxge_media_types[] =
2511 {
2512 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2513 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2514 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2515 	{0,		(1 << 5),	"10GBASE-ER"},
2516 	{0,		(1 << 4),	"10GBASE-LRM"},
2517 	{0,		(1 << 3),	"10GBASE-SW"},
2518 	{0,		(1 << 2),	"10GBASE-LW"},
2519 	{0,		(1 << 1),	"10GBASE-EW"},
2520 	{0,		(1 << 0),	"Reserved"}
2521 };
2522 
2523 static void
2524 mxge_set_media(mxge_softc_t *sc, int type)
2525 {
2526 	sc->media_flags |= type;
2527 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2528 	ifmedia_set(&sc->media, sc->media_flags);
2529 }
2530 
2531 
2532 /*
2533  * Determine the media type for a NIC.  Some XFPs will identify
2534  * themselves only when their link is up, so this is initiated via a
2535  * link up interrupt.  However, this can potentially take up to
2536  * several milliseconds, so it is run via the watchdog routine, rather
2537  * than in the interrupt handler itself.   This need only be done
2538  * once, not each time the link is up.
2539  */
2540 static void
2541 mxge_media_probe(mxge_softc_t *sc)
2542 {
2543 	mxge_cmd_t cmd;
2544 	char *ptr;
2545 	int i, err, ms;
2546 
2547 	sc->need_media_probe = 0;
2548 
2549 	/* if we've already set a media type, we're done */
2550 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2551 		return;
2552 
2553 	/*
2554 	 * parse the product code to determine the interface type
2555 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2556 	 * after the 3rd dash in the driver's cached copy of the
2557 	 * EEPROM's product code string.
2558 	 */
2559 	ptr = sc->product_code_string;
2560 	if (ptr == NULL) {
2561 		device_printf(sc->dev, "Missing product code\n");
2562 	}
2563 
2564 	for (i = 0; i < 3; i++, ptr++) {
2565 		ptr = index(ptr, '-');
2566 		if (ptr == NULL) {
2567 			device_printf(sc->dev,
2568 				      "only %d dashes in PC?!?\n", i);
2569 			return;
2570 		}
2571 	}
2572 	if (*ptr == 'C') {
2573 		mxge_set_media(sc, IFM_10G_CX4);
2574 		return;
2575 	}
2576 	else if (*ptr == 'Q') {
2577 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2578 		/* FreeBSD has no media type for Quad ribbon fiber */
2579 		return;
2580 	}
2581 
2582 	if (*ptr != 'R') {
2583 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2584 		return;
2585 	}
2586 
2587 	/*
2588 	 * At this point we know the NIC has an XFP cage, so now we
2589 	 * try to determine what is in the cage by using the
2590 	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2591 	 * register.  We read just one byte, which may take over
2592 	 * a millisecond
2593 	 */
2594 
2595 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2596 	cmd.data1 = MXGE_XFP_COMPLIANCE_BYTE; /* the byte we want */
2597 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_I2C_READ, &cmd);
2598 	if (err == MXGEFW_CMD_ERROR_XFP_FAILURE) {
2599 		device_printf(sc->dev, "failed to read XFP\n");
2600 	}
2601 	if (err == MXGEFW_CMD_ERROR_XFP_ABSENT) {
2602 		device_printf(sc->dev, "Type R with no XFP!?!?\n");
2603 	}
2604 	if (err != MXGEFW_CMD_OK) {
2605 		return;
2606 	}
2607 
2608 	/* now we wait for the data to be cached */
2609 	cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2610 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2611 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2612 		DELAY(1000);
2613 		cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2614 		err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2615 	}
2616 	if (err != MXGEFW_CMD_OK) {
2617 		device_printf(sc->dev, "failed to read XFP (%d, %dms)\n",
2618 			      err, ms);
2619 		return;
2620 	}
2621 
2622 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2623 		if (mxge_verbose)
2624 			device_printf(sc->dev, "XFP:%s\n",
2625 				      mxge_media_types[0].name);
2626 		mxge_set_media(sc, IFM_10G_CX4);
2627 		return;
2628 	}
2629 	for (i = 1;
2630 	     i < sizeof (mxge_media_types) / sizeof (mxge_media_types[0]);
2631 	     i++) {
2632 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2633 			if (mxge_verbose)
2634 				device_printf(sc->dev, "XFP:%s\n",
2635 					      mxge_media_types[i].name);
2636 
2637 			mxge_set_media(sc, mxge_media_types[i].flag);
2638 			return;
2639 		}
2640 	}
2641 	device_printf(sc->dev, "XFP media 0x%x unknown\n", cmd.data0);
2642 
2643 	return;
2644 }
2645 
2646 static void
2647 mxge_intr(void *arg)
2648 {
2649 	struct mxge_slice_state *ss = arg;
2650 	mxge_softc_t *sc = ss->sc;
2651 	mcp_irq_data_t *stats = ss->fw_stats;
2652 	mxge_tx_ring_t *tx = &ss->tx;
2653 	mxge_rx_done_t *rx_done = &ss->rx_done;
2654 	uint32_t send_done_count;
2655 	uint8_t valid;
2656 
2657 
2658 	/* an interrupt on a non-zero slice is implicitly valid
2659 	   since MSI-X irqs are not shared */
2660 	if (ss != sc->ss) {
2661 		mxge_clean_rx_done(ss);
2662 		*ss->irq_claim = be32toh(3);
2663 		return;
2664 	}
2665 
2666 	/* make sure the DMA has finished */
2667 	if (!stats->valid) {
2668 		return;
2669 	}
2670 	valid = stats->valid;
2671 
2672 	if (sc->legacy_irq) {
2673 		/* lower legacy IRQ  */
2674 		*sc->irq_deassert = 0;
2675 		if (!mxge_deassert_wait)
2676 			/* don't wait for conf. that irq is low */
2677 			stats->valid = 0;
2678 	} else {
2679 		stats->valid = 0;
2680 	}
2681 
2682 	/* loop while waiting for legacy irq deassertion */
2683 	do {
2684 		/* check for transmit completes and receives */
2685 		send_done_count = be32toh(stats->send_done_count);
2686 		while ((send_done_count != tx->pkt_done) ||
2687 		       (rx_done->entry[rx_done->idx].length != 0)) {
2688 			mxge_tx_done(ss, (int)send_done_count);
2689 			mxge_clean_rx_done(ss);
2690 			send_done_count = be32toh(stats->send_done_count);
2691 		}
2692 		if (sc->legacy_irq && mxge_deassert_wait)
2693 			mb();
2694 	} while (*((volatile uint8_t *) &stats->valid));
2695 
2696 	if (__predict_false(stats->stats_updated)) {
2697 		if (sc->link_state != stats->link_up) {
2698 			sc->link_state = stats->link_up;
2699 			if (sc->link_state) {
2700 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2701 				if (mxge_verbose)
2702 					device_printf(sc->dev, "link up\n");
2703 			} else {
2704 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2705 				if (mxge_verbose)
2706 					device_printf(sc->dev, "link down\n");
2707 			}
2708 			sc->need_media_probe = 1;
2709 		}
2710 		if (sc->rdma_tags_available !=
2711 		    be32toh(stats->rdma_tags_available)) {
2712 			sc->rdma_tags_available =
2713 				be32toh(stats->rdma_tags_available);
2714 			device_printf(sc->dev, "RDMA timed out! %d tags "
2715 				      "left\n", sc->rdma_tags_available);
2716 		}
2717 
2718 		if (stats->link_down) {
2719 			sc->down_cnt += stats->link_down;
2720 			sc->link_state = 0;
2721 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2722 		}
2723 	}
2724 
2725 	/* check to see if we have rx token to pass back */
2726 	if (valid & 0x1)
2727 	    *ss->irq_claim = be32toh(3);
2728 	*(ss->irq_claim + 1) = be32toh(3);
2729 }
2730 
2731 static void
2732 mxge_init(void *arg)
2733 {
2734 }
2735 
2736 
2737 
2738 static void
2739 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2740 {
2741 	struct lro_entry *lro_entry;
2742 	int i;
2743 
2744 	while (!SLIST_EMPTY(&ss->lro_free)) {
2745 		lro_entry = SLIST_FIRST(&ss->lro_free);
2746 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2747 		free(lro_entry, M_DEVBUF);
2748 	}
2749 
2750 	for (i = 0; i <= ss->rx_big.mask; i++) {
2751 		if (ss->rx_big.info[i].m == NULL)
2752 			continue;
2753 		bus_dmamap_unload(ss->rx_big.dmat,
2754 				  ss->rx_big.info[i].map);
2755 		m_freem(ss->rx_big.info[i].m);
2756 		ss->rx_big.info[i].m = NULL;
2757 	}
2758 
2759 	for (i = 0; i <= ss->rx_small.mask; i++) {
2760 		if (ss->rx_small.info[i].m == NULL)
2761 			continue;
2762 		bus_dmamap_unload(ss->rx_small.dmat,
2763 				  ss->rx_small.info[i].map);
2764 		m_freem(ss->rx_small.info[i].m);
2765 		ss->rx_small.info[i].m = NULL;
2766 	}
2767 
2768 	/* transmit ring used only on the first slice */
2769 	if (ss->tx.info == NULL)
2770 		return;
2771 
2772 	for (i = 0; i <= ss->tx.mask; i++) {
2773 		ss->tx.info[i].flag = 0;
2774 		if (ss->tx.info[i].m == NULL)
2775 			continue;
2776 		bus_dmamap_unload(ss->tx.dmat,
2777 				  ss->tx.info[i].map);
2778 		m_freem(ss->tx.info[i].m);
2779 		ss->tx.info[i].m = NULL;
2780 	}
2781 }
2782 
2783 static void
2784 mxge_free_mbufs(mxge_softc_t *sc)
2785 {
2786 	int slice;
2787 
2788 	for (slice = 0; slice < sc->num_slices; slice++)
2789 		mxge_free_slice_mbufs(&sc->ss[slice]);
2790 }
2791 
2792 static void
2793 mxge_free_slice_rings(struct mxge_slice_state *ss)
2794 {
2795 	int i;
2796 
2797 
2798 	if (ss->rx_done.entry != NULL)
2799 		mxge_dma_free(&ss->rx_done.dma);
2800 	ss->rx_done.entry = NULL;
2801 
2802 	if (ss->tx.req_bytes != NULL)
2803 		free(ss->tx.req_bytes, M_DEVBUF);
2804 	ss->tx.req_bytes = NULL;
2805 
2806 	if (ss->tx.seg_list != NULL)
2807 		free(ss->tx.seg_list, M_DEVBUF);
2808 	ss->tx.seg_list = NULL;
2809 
2810 	if (ss->rx_small.shadow != NULL)
2811 		free(ss->rx_small.shadow, M_DEVBUF);
2812 	ss->rx_small.shadow = NULL;
2813 
2814 	if (ss->rx_big.shadow != NULL)
2815 		free(ss->rx_big.shadow, M_DEVBUF);
2816 	ss->rx_big.shadow = NULL;
2817 
2818 	if (ss->tx.info != NULL) {
2819 		if (ss->tx.dmat != NULL) {
2820 			for (i = 0; i <= ss->tx.mask; i++) {
2821 				bus_dmamap_destroy(ss->tx.dmat,
2822 						   ss->tx.info[i].map);
2823 			}
2824 			bus_dma_tag_destroy(ss->tx.dmat);
2825 		}
2826 		free(ss->tx.info, M_DEVBUF);
2827 	}
2828 	ss->tx.info = NULL;
2829 
2830 	if (ss->rx_small.info != NULL) {
2831 		if (ss->rx_small.dmat != NULL) {
2832 			for (i = 0; i <= ss->rx_small.mask; i++) {
2833 				bus_dmamap_destroy(ss->rx_small.dmat,
2834 						   ss->rx_small.info[i].map);
2835 			}
2836 			bus_dmamap_destroy(ss->rx_small.dmat,
2837 					   ss->rx_small.extra_map);
2838 			bus_dma_tag_destroy(ss->rx_small.dmat);
2839 		}
2840 		free(ss->rx_small.info, M_DEVBUF);
2841 	}
2842 	ss->rx_small.info = NULL;
2843 
2844 	if (ss->rx_big.info != NULL) {
2845 		if (ss->rx_big.dmat != NULL) {
2846 			for (i = 0; i <= ss->rx_big.mask; i++) {
2847 				bus_dmamap_destroy(ss->rx_big.dmat,
2848 						   ss->rx_big.info[i].map);
2849 			}
2850 			bus_dmamap_destroy(ss->rx_big.dmat,
2851 					   ss->rx_big.extra_map);
2852 			bus_dma_tag_destroy(ss->rx_big.dmat);
2853 		}
2854 		free(ss->rx_big.info, M_DEVBUF);
2855 	}
2856 	ss->rx_big.info = NULL;
2857 }
2858 
2859 static void
2860 mxge_free_rings(mxge_softc_t *sc)
2861 {
2862 	int slice;
2863 
2864 	for (slice = 0; slice < sc->num_slices; slice++)
2865 		mxge_free_slice_rings(&sc->ss[slice]);
2866 }
2867 
2868 static int
2869 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2870 		       int tx_ring_entries)
2871 {
2872 	mxge_softc_t *sc = ss->sc;
2873 	size_t bytes;
2874 	int err, i;
2875 
2876 	err = ENOMEM;
2877 
2878 	/* allocate per-slice receive resources */
2879 
2880 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2881 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2882 
2883 	/* allocate the rx shadow rings */
2884 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2885 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2886 	if (ss->rx_small.shadow == NULL)
2887 		return err;
2888 
2889 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2890 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2891 	if (ss->rx_big.shadow == NULL)
2892 		return err;
2893 
2894 	/* allocate the rx host info rings */
2895 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2896 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2897 	if (ss->rx_small.info == NULL)
2898 		return err;
2899 
2900 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2901 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2902 	if (ss->rx_big.info == NULL)
2903 		return err;
2904 
2905 	/* allocate the rx busdma resources */
2906 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2907 				 1,			/* alignment */
2908 				 4096,			/* boundary */
2909 				 BUS_SPACE_MAXADDR,	/* low */
2910 				 BUS_SPACE_MAXADDR,	/* high */
2911 				 NULL, NULL,		/* filter */
2912 				 MHLEN,			/* maxsize */
2913 				 1,			/* num segs */
2914 				 MHLEN,			/* maxsegsize */
2915 				 BUS_DMA_ALLOCNOW,	/* flags */
2916 				 NULL, NULL,		/* lock */
2917 				 &ss->rx_small.dmat);	/* tag */
2918 	if (err != 0) {
2919 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2920 			      err);
2921 		return err;
2922 	}
2923 
2924 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2925 				 1,			/* alignment */
2926 #if MXGE_VIRT_JUMBOS
2927 				 4096,			/* boundary */
2928 #else
2929 				 0,			/* boundary */
2930 #endif
2931 				 BUS_SPACE_MAXADDR,	/* low */
2932 				 BUS_SPACE_MAXADDR,	/* high */
2933 				 NULL, NULL,		/* filter */
2934 				 3*4096,		/* maxsize */
2935 #if MXGE_VIRT_JUMBOS
2936 				 3,			/* num segs */
2937 				 4096,			/* maxsegsize*/
2938 #else
2939 				 1,			/* num segs */
2940 				 MJUM9BYTES,		/* maxsegsize*/
2941 #endif
2942 				 BUS_DMA_ALLOCNOW,	/* flags */
2943 				 NULL, NULL,		/* lock */
2944 				 &ss->rx_big.dmat);	/* tag */
2945 	if (err != 0) {
2946 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2947 			      err);
2948 		return err;
2949 	}
2950 	for (i = 0; i <= ss->rx_small.mask; i++) {
2951 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
2952 					&ss->rx_small.info[i].map);
2953 		if (err != 0) {
2954 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2955 				      err);
2956 			return err;
2957 		}
2958 	}
2959 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
2960 				&ss->rx_small.extra_map);
2961 	if (err != 0) {
2962 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2963 			      err);
2964 		return err;
2965 	}
2966 
2967 	for (i = 0; i <= ss->rx_big.mask; i++) {
2968 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
2969 					&ss->rx_big.info[i].map);
2970 		if (err != 0) {
2971 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2972 				      err);
2973 			return err;
2974 		}
2975 	}
2976 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
2977 				&ss->rx_big.extra_map);
2978 	if (err != 0) {
2979 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2980 			      err);
2981 		return err;
2982 	}
2983 
2984 	/* now allocate TX resources */
2985 
2986 	/* only use a single TX ring for now */
2987 	if (ss != ss->sc->ss)
2988 		return 0;
2989 
2990 	ss->tx.mask = tx_ring_entries - 1;
2991 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2992 
2993 
2994 	/* allocate the tx request copy block */
2995 	bytes = 8 +
2996 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
2997 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2998 	if (ss->tx.req_bytes == NULL)
2999 		return err;
3000 	/* ensure req_list entries are aligned to 8 bytes */
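	/* ((p + 7) & ~7) rounds p up to the next 8-byte boundary; the
	   8 spare bytes in the allocation above keep that in bounds */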
3001 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3002 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3003 
3004 	/* allocate the tx busdma segment list */
3005 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3006 	ss->tx.seg_list = (bus_dma_segment_t *)
3007 		malloc(bytes, M_DEVBUF, M_WAITOK);
3008 	if (ss->tx.seg_list == NULL)
3009 		return err;
3010 
3011 	/* allocate the tx host info ring */
3012 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3013 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3014 	if (ss->tx.info == NULL)
3015 		return err;
3016 
3017 	/* allocate the tx busdma resources */
3018 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3019 				 1,			/* alignment */
3020 				 sc->tx_boundary,	/* boundary */
3021 				 BUS_SPACE_MAXADDR,	/* low */
3022 				 BUS_SPACE_MAXADDR,	/* high */
3023 				 NULL, NULL,		/* filter */
3024 				 65536 + 256,		/* maxsize */
3025 				 ss->tx.max_desc - 2,	/* num segs */
3026 				 sc->tx_boundary,	/* maxsegsz */
3027 				 BUS_DMA_ALLOCNOW,	/* flags */
3028 				 NULL, NULL,		/* lock */
3029 				 &ss->tx.dmat);		/* tag */
3030 
3031 	if (err != 0) {
3032 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3033 			      err);
3034 		return err;
3035 	}
3036 
3037 	/* now use these tags to setup dmamaps for each slot
3038 	   in the ring */
3039 	for (i = 0; i <= ss->tx.mask; i++) {
3040 		err = bus_dmamap_create(ss->tx.dmat, 0,
3041 					&ss->tx.info[i].map);
3042 		if (err != 0) {
3043 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3044 				      err);
3045 			return err;
3046 		}
3047 	}
3048 	return 0;
3049 
3050 }
3051 
3052 static int
3053 mxge_alloc_rings(mxge_softc_t *sc)
3054 {
3055 	mxge_cmd_t cmd;
3056 	int tx_ring_size;
3057 	int tx_ring_entries, rx_ring_entries;
3058 	int err, slice;
3059 
3060 	/* get ring sizes */
3061 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3062 	tx_ring_size = cmd.data0;
3063 	if (err != 0) {
3064 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3065 		goto abort;
3066 	}
3067 
3068 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3069 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3070 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3071 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3072 	IFQ_SET_READY(&sc->ifp->if_snd);
3073 
3074 	for (slice = 0; slice < sc->num_slices; slice++) {
3075 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3076 					     rx_ring_entries,
3077 					     tx_ring_entries);
3078 		if (err != 0)
3079 			goto abort;
3080 	}
3081 	return 0;
3082 
3083 abort:
3084 	mxge_free_rings(sc);
3085 	return err;
3086 
3087 }
3088 
3089 
3090 static void
3091 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3092 {
3093 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
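	/*
	 * For example, the standard 1500-byte MTU gives bufsize =
	 * 1500 + 14 + 4 + 2 = 1520 and fits a 2KB cluster, while a
	 * 9000-byte jumbo MTU gives 9020 and falls through to 9KB
	 * clusters (or several virtually contiguous 4KB pieces when
	 * MXGE_VIRT_JUMBOS is set).
	 */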
3094 
3095 	if (bufsize < MCLBYTES) {
3096 		/* easy, everything fits in a single buffer */
3097 		*big_buf_size = MCLBYTES;
3098 		*cl_size = MCLBYTES;
3099 		*nbufs = 1;
3100 		return;
3101 	}
3102 
3103 	if (bufsize < MJUMPAGESIZE) {
3104 		/* still easy, everything still fits in a single buffer */
3105 		*big_buf_size = MJUMPAGESIZE;
3106 		*cl_size = MJUMPAGESIZE;
3107 		*nbufs = 1;
3108 		return;
3109 	}
3110 #if MXGE_VIRT_JUMBOS
3111 	/* now we need to use virtually contiguous buffers */
3112 	*cl_size = MJUM9BYTES;
3113 	*big_buf_size = 4096;
3114 	*nbufs = mtu / 4096 + 1;
3115 	/* needs to be a power of two, so round up */
3116 	if (*nbufs == 3)
3117 		*nbufs = 4;
3118 #else
3119 	*cl_size = MJUM9BYTES;
3120 	*big_buf_size = MJUM9BYTES;
3121 	*nbufs = 1;
3122 #endif
3123 }
3124 
3125 static int
3126 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3127 {
3128 	mxge_softc_t *sc;
3129 	mxge_cmd_t cmd;
3130 	bus_dmamap_t map;
3131 	struct lro_entry *lro_entry;
3132 	int err, i, slice;
3133 
3134 
3135 	sc = ss->sc;
3136 	slice = ss - sc->ss;
3137 
3138 	SLIST_INIT(&ss->lro_free);
3139 	SLIST_INIT(&ss->lro_active);
3140 
3141 	for (i = 0; i < sc->lro_cnt; i++) {
3142 		lro_entry = (struct lro_entry *)
3143 			malloc(sizeof (*lro_entry), M_DEVBUF,
3144 			       M_NOWAIT | M_ZERO);
3145 		if (lro_entry == NULL) {
3146 			sc->lro_cnt = i;
3147 			break;
3148 		}
3149 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3150 	}
3151 	/* get the lanai pointers to the send and receive rings */
3152 
3153 	err = 0;
3154 	/* We currently only send from the first slice */
3155 	if (slice == 0) {
3156 		cmd.data0 = slice;
3157 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3158 		ss->tx.lanai =
3159 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3160 	}
3161 	cmd.data0 = slice;
3162 	err |= mxge_send_cmd(sc,
3163 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3164 	ss->rx_small.lanai =
3165 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3166 	cmd.data0 = slice;
3167 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3168 	ss->rx_big.lanai =
3169 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3170 
3171 	if (err != 0) {
3172 		device_printf(sc->dev,
3173 			      "failed to get ring sizes or locations\n");
3174 		return EIO;
3175 	}
3176 
3177 	/* stock receive rings */
3178 	for (i = 0; i <= ss->rx_small.mask; i++) {
3179 		map = ss->rx_small.info[i].map;
3180 		err = mxge_get_buf_small(ss, map, i);
3181 		if (err) {
3182 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3183 				      i, ss->rx_small.mask + 1);
3184 			return ENOMEM;
3185 		}
3186 	}
3187 	for (i = 0; i <= ss->rx_big.mask; i++) {
3188 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3189 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3190 	}
3191 	ss->rx_big.nbufs = nbufs;
3192 	ss->rx_big.cl_size = cl_size;
3193 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3194 		map = ss->rx_big.info[i].map;
3195 		err = mxge_get_buf_big(ss, map, i);
3196 		if (err) {
3197 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3198 				      i, ss->rx_big.mask + 1);
3199 			return ENOMEM;
3200 		}
3201 	}
3202 	return 0;
3203 }
3204 
3205 static int
3206 mxge_open(mxge_softc_t *sc)
3207 {
3208 	mxge_cmd_t cmd;
3209 	int err, big_bytes, nbufs, slice, cl_size, i;
3210 	bus_addr_t bus;
3211 	volatile uint8_t *itable;
3212 
3213 	/* Copy the MAC address in case it was overridden */
3214 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3215 
3216 	err = mxge_reset(sc, 1);
3217 	if (err != 0) {
3218 		device_printf(sc->dev, "failed to reset\n");
3219 		return EIO;
3220 	}
3221 
3222 	if (sc->num_slices > 1) {
3223 		/* setup the indirection table */
3224 		cmd.data0 = sc->num_slices;
3225 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3226 				    &cmd);
3227 
3228 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3229 				     &cmd);
3230 		if (err != 0) {
3231 			device_printf(sc->dev,
3232 				      "failed to setup rss tables\n");
3233 			return err;
3234 		}
3235 
3236 		/* just enable an identity mapping */
3237 		itable = sc->sram + cmd.data0;
3238 		for (i = 0; i < sc->num_slices; i++)
3239 			itable[i] = (uint8_t)i;
3240 
3241 		cmd.data0 = 1;
3242 		cmd.data1 = mxge_rss_hash_type;
3243 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3244 		if (err != 0) {
3245 			device_printf(sc->dev, "failed to enable slices\n");
3246 			return err;
3247 		}
3248 	}
3249 
3250 
3251 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3252 
3253 	cmd.data0 = nbufs;
3254 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3255 			    &cmd);
3256 	/* error is only meaningful if we're trying to set
3257 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3258 	if (err && nbufs > 1) {
3259 		device_printf(sc->dev,
3260 			      "Failed to set alway-use-n to %d\n",
3261 			      nbufs);
3262 		return EIO;
3263 	}
3264 	/* Give the firmware the mtu and the big and small buffer
3265 	   sizes.  The firmware wants the big buf size to be a power
3266 	   of two. Luckily, FreeBSD's clusters are powers of two */
3267 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3268 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3269 	cmd.data0 = MHLEN - MXGEFW_PAD;
3270 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3271 			     &cmd);
3272 	cmd.data0 = big_bytes;
3273 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3274 
3275 	if (err != 0) {
3276 		device_printf(sc->dev, "failed to setup params\n");
3277 		goto abort;
3278 	}
3279 
3280 	/* Now give the firmware the pointer to the stats block */
3281 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3282 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3283 	cmd.data2 = sizeof(struct mcp_irq_data);
3284 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3285 
3286 	if (err != 0) {
3287 		bus = sc->ss->fw_stats_dma.bus_addr;
3288 		bus += offsetof(struct mcp_irq_data, send_done_count);
3289 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3290 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3291 		err = mxge_send_cmd(sc,
3292 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3293 				    &cmd);
3294 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3295 		sc->fw_multicast_support = 0;
3296 	} else {
3297 		sc->fw_multicast_support = 1;
3298 	}
3299 
3300 	if (err != 0) {
3301 		device_printf(sc->dev, "failed to setup params\n");
3302 		goto abort;
3303 	}
3304 
3305 	for (slice = 0; slice < sc->num_slices; slice++) {
3306 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3307 		if (err != 0) {
3308 			device_printf(sc->dev, "couldn't open slice %d\n",
3309 				      slice);
3310 			goto abort;
3311 		}
3312 	}
3313 
3314 	/* Finally, start the firmware running */
3315 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3316 	if (err) {
3317 		device_printf(sc->dev, "Couldn't bring up link\n");
3318 		goto abort;
3319 	}
3320 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3321 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3322 
3323 	return 0;
3324 
3325 
3326 abort:
3327 	mxge_free_mbufs(sc);
3328 
3329 	return err;
3330 }
3331 
3332 static int
3333 mxge_close(mxge_softc_t *sc)
3334 {
3335 	mxge_cmd_t cmd;
3336 	int err, old_down_cnt;
3337 
3338 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3339 	old_down_cnt = sc->down_cnt;
3340 	mb();
3341 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3342 	if (err) {
3343 		device_printf(sc->dev, "Couldn't bring down link\n");
3344 	}
3345 	if (old_down_cnt == sc->down_cnt) {
3346 		/* wait for down irq */
3347 		DELAY(10 * sc->intr_coal_delay);
3348 	}
3349 	mb();
3350 	if (old_down_cnt == sc->down_cnt) {
3351 		device_printf(sc->dev, "never got down irq\n");
3352 	}
3353 
3354 	mxge_free_mbufs(sc);
3355 
3356 	return 0;
3357 }
3358 
3359 static void
3360 mxge_setup_cfg_space(mxge_softc_t *sc)
3361 {
3362 	device_t dev = sc->dev;
3363 	int reg;
3364 	uint16_t cmd, lnk, pectl;
3365 
3366 	/* find the PCIe link width and set max read request to 4KB */
3367 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3368 		lnk = pci_read_config(dev, reg + 0x12, 2);
3369 		sc->link_width = (lnk >> 4) & 0x3f;
3370 
3371 		pectl = pci_read_config(dev, reg + 0x8, 2);
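		/*
		 * Bits 14:12 of the PCIe device control register encode
		 * the max read request size as (128 << n) bytes; n = 5
		 * selects 4096.
		 */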
3372 		pectl = (pectl & ~0x7000) | (5 << 12);
3373 		pci_write_config(dev, reg + 0x8, pectl, 2);
3374 	}
3375 
3376 	/* Enable DMA and Memory space access */
3377 	pci_enable_busmaster(dev);
3378 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3379 	cmd |= PCIM_CMD_MEMEN;
3380 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3381 }
3382 
3383 static uint32_t
3384 mxge_read_reboot(mxge_softc_t *sc)
3385 {
3386 	device_t dev = sc->dev;
3387 	uint32_t vs;
3388 
3389 	/* find the vendor specific offset */
3390 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3391 		device_printf(sc->dev,
3392 			      "could not find vendor specific offset\n");
3393 		return (uint32_t)-1;
3394 	}
3395 	/* enable read32 mode */
3396 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3397 	/* tell NIC which register to read */
3398 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3399 	return (pci_read_config(dev, vs + 0x14, 4));
3400 }
3401 
3402 static void
3403 mxge_watchdog_reset(mxge_softc_t *sc)
3404 {
3405 	int err;
3406 	uint32_t reboot;
3407 	uint16_t cmd;
3408 
3409 	err = ENXIO;
3410 
3411 	device_printf(sc->dev, "Watchdog reset!\n");
3412 
3413 	/*
3414 	 * check to see if the NIC rebooted.  If it did, then all of
3415 	 * PCI config space has been reset, and things like the
3416 	 * busmaster bit will be zero.  If this is the case, then we
3417 	 * must restore PCI config space before the NIC can be used
3418 	 * again
3419 	 */
3420 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3421 	if (cmd == 0xffff) {
3422 		/*
3423 		 * maybe the watchdog caught the NIC rebooting; wait
3424 		 * up to 100ms for it to finish.  If it does not come
3425 		 * back, then give up
3426 		 */
3427 		DELAY(1000*100);
3428 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3429 		if (cmd == 0xffff) {
3430 			device_printf(sc->dev, "NIC disappeared!\n");
3431 			goto abort;
3432 		}
3433 	}
3434 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3435 		/* print the reboot status */
3436 		reboot = mxge_read_reboot(sc);
3437 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3438 			      reboot);
3439 		/* restore PCI configuration space */
3440 
3441 		/* XXXX waiting for pci_cfg_restore() to be exported */
3442 		goto abort; /* just abort for now */
3443 
3444 		/* and redo any changes we made to our config space */
3445 		mxge_setup_cfg_space(sc);
3446 
3447 		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3448 			mxge_close(sc);
3449 			err = mxge_open(sc);
3450 		}
3451 	} else {
3452 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
3453 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
3454 			      sc->ss->tx.req, sc->ss->tx.done);
3455 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3456 			      sc->ss->tx.pkt_done,
3457 			      be32toh(sc->ss->fw_stats->send_done_count));
3458 		device_printf(sc->dev, "not resetting\n");
3459 	}
3460 
3461 abort:
3462 	/*
3463 	 * stop the watchdog if the nic is dead, to avoid spamming the
3464 	 * console
3465 	 */
3466 	if (err != 0) {
3467 		callout_stop(&sc->co_hdl);
3468 	}
3469 }
3470 
3471 static void
3472 mxge_watchdog(mxge_softc_t *sc)
3473 {
3474 	mxge_tx_ring_t *tx = &sc->ss->tx;
3475 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3476 
3477 	/* see if we have outstanding transmits, which
3478 	   have been pending for more than mxge_ticks */
3479 	if (tx->req != tx->done &&
3480 	    tx->watchdog_req != tx->watchdog_done &&
3481 	    tx->done == tx->watchdog_done) {
3482 		/* check for pause blocking before resetting */
3483 		if (tx->watchdog_rx_pause == rx_pause)
3484 			mxge_watchdog_reset(sc);
3485 		else
3486 			device_printf(sc->dev, "Flow control blocking "
3487 				      "xmits, check link partner\n");
3488 	}
3489 
3490 	tx->watchdog_req = tx->req;
3491 	tx->watchdog_done = tx->done;
3492 	tx->watchdog_rx_pause = rx_pause;
3493 
3494 	if (sc->need_media_probe)
3495 		mxge_media_probe(sc);
3496 }
3497 
3498 static void
3499 mxge_update_stats(mxge_softc_t *sc)
3500 {
3501 	struct mxge_slice_state *ss;
3502 	u_long ipackets = 0;
3503 	int slice;
3504 
3505 	for (slice = 0; slice < sc->num_slices; slice++) {
3506 		ss = &sc->ss[slice];
3507 		ipackets += ss->ipackets;
3508 	}
3509 	sc->ifp->if_ipackets = ipackets;
3510 
3511 }
3512 static void
3513 mxge_tick(void *arg)
3514 {
3515 	mxge_softc_t *sc = arg;
3516 
3517 
3518 	/* Synchronize with possible callout reset/stop. */
3519 	if (callout_pending(&sc->co_hdl) ||
3520 	    !callout_active(&sc->co_hdl)) {
3521 		mtx_unlock(&sc->driver_mtx);
3522 		return;
3523 	}
3524 
3525 	/* aggregate stats from different slices */
3526 	mxge_update_stats(sc);
3527 
3528 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3529 	if (!sc->watchdog_countdown) {
3530 		mxge_watchdog(sc);
3531 		sc->watchdog_countdown = 4;
3532 	}
3533 	sc->watchdog_countdown--;
3534 }
3535 
3536 static int
3537 mxge_media_change(struct ifnet *ifp)
3538 {
3539 	return EINVAL;
3540 }
3541 
3542 static int
3543 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3544 {
3545 	struct ifnet *ifp = sc->ifp;
3546 	int real_mtu, old_mtu;
3547 	int err = 0;
3548 
3549 
3550 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3551 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3552 		return EINVAL;
3553 	mtx_lock(&sc->driver_mtx);
3554 	old_mtu = ifp->if_mtu;
3555 	ifp->if_mtu = mtu;
3556 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3557 		callout_stop(&sc->co_hdl);
3558 		mxge_close(sc);
3559 		err = mxge_open(sc);
3560 		if (err != 0) {
3561 			ifp->if_mtu = old_mtu;
3562 			mxge_close(sc);
3563 			(void) mxge_open(sc);
3564 		}
3565 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3566 	}
3567 	mtx_unlock(&sc->driver_mtx);
3568 	return err;
3569 }
3570 
3571 static void
3572 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3573 {
3574 	mxge_softc_t *sc = ifp->if_softc;
3575 
3576 
3577 	if (sc == NULL)
3578 		return;
3579 	ifmr->ifm_status = IFM_AVALID;
3580 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3581 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3582 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3583 }
3584 
3585 static int
3586 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3587 {
3588 	mxge_softc_t *sc = ifp->if_softc;
3589 	struct ifreq *ifr = (struct ifreq *)data;
3590 	int err, mask;
3591 
3592 	err = 0;
3593 	switch (command) {
3594 	case SIOCSIFADDR:
3595 	case SIOCGIFADDR:
3596 		err = ether_ioctl(ifp, command, data);
3597 		break;
3598 
3599 	case SIOCSIFMTU:
3600 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3601 		break;
3602 
3603 	case SIOCSIFFLAGS:
3604 		mtx_lock(&sc->driver_mtx);
3605 		if (ifp->if_flags & IFF_UP) {
3606 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3607 				err = mxge_open(sc);
3608 				callout_reset(&sc->co_hdl, mxge_ticks,
3609 					      mxge_tick, sc);
3610 			} else {
3611 				/* take care of promisc and allmulti
3612 				   flag changes */
3613 				mxge_change_promisc(sc,
3614 						    ifp->if_flags & IFF_PROMISC);
3615 				mxge_set_multicast_list(sc);
3616 			}
3617 		} else {
3618 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3619 				callout_stop(&sc->co_hdl);
3620 				mxge_close(sc);
3621 			}
3622 		}
3623 		mtx_unlock(&sc->driver_mtx);
3624 		break;
3625 
3626 	case SIOCADDMULTI:
3627 	case SIOCDELMULTI:
3628 		mtx_lock(&sc->driver_mtx);
3629 		mxge_set_multicast_list(sc);
3630 		mtx_unlock(&sc->driver_mtx);
3631 		break;
3632 
3633 	case SIOCSIFCAP:
3634 		mtx_lock(&sc->driver_mtx);
3635 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3636 		if (mask & IFCAP_TXCSUM) {
3637 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3638 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3639 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3640 						      | CSUM_TSO);
3641 			} else {
3642 				ifp->if_capenable |= IFCAP_TXCSUM;
3643 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3644 			}
3645 		} else if (mask & IFCAP_RXCSUM) {
3646 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3647 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3648 				sc->csum_flag = 0;
3649 			} else {
3650 				ifp->if_capenable |= IFCAP_RXCSUM;
3651 				sc->csum_flag = 1;
3652 			}
3653 		}
3654 		if (mask & IFCAP_TSO4) {
3655 			if (IFCAP_TSO4 & ifp->if_capenable) {
3656 				ifp->if_capenable &= ~IFCAP_TSO4;
3657 				ifp->if_hwassist &= ~CSUM_TSO;
3658 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3659 				ifp->if_capenable |= IFCAP_TSO4;
3660 				ifp->if_hwassist |= CSUM_TSO;
3661 			} else {
3662 				printf("mxge requires tx checksum offload"
3663 				       " be enabled to use TSO\n");
3664 				err = EINVAL;
3665 			}
3666 		}
3667 		if (mask & IFCAP_LRO) {
3668 			if (IFCAP_LRO & ifp->if_capenable)
3669 				err = mxge_change_lro_locked(sc, 0);
3670 			else
3671 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3672 		}
3673 		if (mask & IFCAP_VLAN_HWTAGGING)
3674 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3675 		mtx_unlock(&sc->driver_mtx);
3676 		VLAN_CAPABILITIES(ifp);
3677 
3678 		break;
3679 
3680 	case SIOCGIFMEDIA:
3681 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3682 				    &sc->media, command);
3683 		break;
3684 
3685 	default:
3686 		err = ENOTTY;
3687 	}
3688 	return err;
3689 }
3690 
3691 static void
3692 mxge_fetch_tunables(mxge_softc_t *sc)
3693 {
3694 
3695 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3696 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3697 			  &mxge_flow_control);
3698 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3699 			  &mxge_intr_coal_delay);
3700 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3701 			  &mxge_nvidia_ecrc_enable);
3702 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3703 			  &mxge_force_firmware);
3704 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3705 			  &mxge_deassert_wait);
3706 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3707 			  &mxge_verbose);
3708 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3709 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3710 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3711 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
3712 	if (sc->lro_cnt != 0)
3713 		mxge_lro_cnt = sc->lro_cnt;
3714 
3715 	if (bootverbose)
3716 		mxge_verbose = 1;
3717 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3718 		mxge_intr_coal_delay = 30;
3719 	if (mxge_ticks == 0)
3720 		mxge_ticks = hz / 2;
3721 	sc->pause = mxge_flow_control;
3722 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
3723 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) {
3724 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
3725 	}
3726 }
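/*
 * For reference, these tunables are normally set from
 * /boot/loader.conf before the module loads; hypothetical example:
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 */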
3727 
3728 
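/*
 * Free the per-slice state built by mxge_alloc_slices(): the
 * firmware stats DMA block and tx mutex (present on slice 0 only)
 * and each slice's rx completion queue, then the array itself.
 */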
3729 static void
3730 mxge_free_slices(mxge_softc_t *sc)
3731 {
3732 	struct mxge_slice_state *ss;
3733 	int i;
3734 
3735 
3736 	if (sc->ss == NULL)
3737 		return;
3738 
3739 	for (i = 0; i < sc->num_slices; i++) {
3740 		ss = &sc->ss[i];
3741 		if (ss->fw_stats != NULL) {
3742 			mxge_dma_free(&ss->fw_stats_dma);
3743 			ss->fw_stats = NULL;
3744 			mtx_destroy(&ss->tx.mtx);
3745 		}
3746 		if (ss->rx_done.entry != NULL) {
3747 			mxge_dma_free(&ss->rx_done.dma);
3748 			ss->rx_done.entry = NULL;
3749 		}
3750 	}
3751 	free(sc->ss, M_DEVBUF);
3752 	sc->ss = NULL;
3753 }
3754 
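/*
 * Allocate the per-slice state: an rx completion (interrupt)
 * queue per slice sized to cover both receive rings, plus a
 * firmware stats block and tx mutex on slice 0, the only slice
 * that carries tx and stats for now.
 */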
3755 static int
3756 mxge_alloc_slices(mxge_softc_t *sc)
3757 {
3758 	mxge_cmd_t cmd;
3759 	struct mxge_slice_state *ss;
3760 	size_t bytes;
3761 	int err, i, max_intr_slots;
3762 
3763 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3764 	if (err != 0) {
3765 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3766 		return err;
3767 	}
3768 	sc->rx_ring_size = cmd.data0;
3769 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3770 
3771 	bytes = sizeof (*sc->ss) * sc->num_slices;
3772 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
3773 	if (sc->ss == NULL)
3774 		return (ENOMEM);
3775 	for (i = 0; i < sc->num_slices; i++) {
3776 		ss = &sc->ss[i];
3777 
3778 		ss->sc = sc;
3779 
3780 		/* allocate per-slice rx interrupt queues */
3781 
3782 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
3783 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3784 		if (err != 0)
3785 			goto abort;
3786 		ss->rx_done.entry = ss->rx_done.dma.addr;
3787 		bzero(ss->rx_done.entry, bytes);
3788 
3789 		/*
3790 		 * allocate the per-slice firmware stats; stats
3791 		 * (including tx) are used only on the first
3792 		 * slice for now
3793 		 */
3794 		if (i > 0)
3795 			continue;
3796 
3797 		bytes = sizeof (*ss->fw_stats);
3798 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3799 				     bytes, 64);
3800 		if (err != 0)
3801 			goto abort;
3802 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
3803 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
3804 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
3805 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
3806 	}
3807 
3808 	return (0);
3809 
3810 abort:
3811 	mxge_free_slices(sc);
3812 	return (ENOMEM);
3813 }
3814 
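/*
 * Decide how many slices to use.  Multiple slices require the
 * hw.mxge.max_slices tunable to allow them, an SMP system, at
 * least two MSI-X vectors and the RSS firmware; the count is then
 * capped by the MSI-X vector count, the CPU count (or the
 * tunable) and rounded down to a power of two.
 */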
3815 static void
3816 mxge_slice_probe(mxge_softc_t *sc)
3817 {
3818 	mxge_cmd_t cmd;
3819 	char *old_fw;
3820 	int msix_cnt, status, max_intr_slots;
3821 
3822 	sc->num_slices = 1;
3823 	/*
3824 	 * don't enable multiple slices if they are disabled by the
3825 	 * hw.mxge.max_slices tunable, or if this is not an SMP system
3826 	 */
3827 
3828 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
3829 		return;
3830 
3831 	/* see how many MSI-X interrupts are available */
3832 	msix_cnt = pci_msix_count(sc->dev);
3833 	if (msix_cnt < 2)
3834 		return;
3835 
3836 	/* now load the slice-aware firmware and see what it supports */
3837 	old_fw = sc->fw_name;
3838 	if (old_fw == mxge_fw_aligned)
3839 		sc->fw_name = mxge_fw_rss_aligned;
3840 	else
3841 		sc->fw_name = mxge_fw_rss_unaligned;
3842 	status = mxge_load_firmware(sc, 0);
3843 	if (status != 0) {
3844 		device_printf(sc->dev, "Falling back to a single slice\n");
3845 		return;
3846 	}
3847 
3848 	/* try to send a reset command to the card to see if it
3849 	   is alive */
3850 	memset(&cmd, 0, sizeof (cmd));
3851 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3852 	if (status != 0) {
3853 		device_printf(sc->dev, "failed reset\n");
3854 		goto abort_with_fw;
3855 	}
3856 
3857 	/* get rx ring size */
3858 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3859 	if (status != 0) {
3860 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3861 		goto abort_with_fw;
3862 	}
3863 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3864 
3865 	/* tell it the size of the interrupt queues */
3866 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3867 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3868 	if (status != 0) {
3869 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3870 		goto abort_with_fw;
3871 	}
3872 
3873 	/* ask the firmware for the maximum number of slices it supports */
3874 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3875 	if (status != 0) {
3876 		device_printf(sc->dev,
3877 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3878 		goto abort_with_fw;
3879 	}
3880 	sc->num_slices = cmd.data0;
3881 	if (sc->num_slices > msix_cnt)
3882 		sc->num_slices = msix_cnt;
3883 
3884 	if (mxge_max_slices == -1) {
3885 		/* cap to number of CPUs in system */
3886 		if (sc->num_slices > mp_ncpus)
3887 			sc->num_slices = mp_ncpus;
3888 	} else {
3889 		if (sc->num_slices > mxge_max_slices)
3890 			sc->num_slices = mxge_max_slices;
3891 	}
3892 	/* round num_slices down to the nearest power of two */
3893 	while (sc->num_slices & (sc->num_slices - 1))
3894 		sc->num_slices--;
3895 
3896 	if (mxge_verbose)
3897 		device_printf(sc->dev, "using %d slices\n",
3898 			      sc->num_slices);
3899 
3900 	return;
3901 
3902 abort_with_fw:
3903 	sc->fw_name = old_fw;
3904 	(void) mxge_load_firmware(sc, 0);
3905 }
3906 
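/*
 * Allocate one MSI-X vector per slice (the vector table lives
 * behind BAR 2) and hook each vector to mxge_intr() with its
 * slice as the argument, unwinding everything on failure.
 */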
3907 static int
3908 mxge_add_msix_irqs(mxge_softc_t *sc)
3909 {
3910 	size_t bytes;
3911 	int count, err, i, rid;
3912 
3913 	rid = PCIR_BAR(2);
3914 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3915 						    &rid, RF_ACTIVE);
3916 
3917 	if (sc->msix_table_res == NULL) {
3918 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3919 		return ENXIO;
3920 	}
3921 
3922 	count = sc->num_slices;
3923 	err = pci_alloc_msix(sc->dev, &count);
3924 	if (err != 0) {
3925 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
3926 			      "err = %d\n", sc->num_slices, err);
3927 		goto abort_with_msix_table;
3928 	}
3929 	if (count < sc->num_slices) {
3930 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3931 			      sc->num_slices, count);
3932 		device_printf(sc->dev,
3933 			      "Try setting hw.mxge.max_slices to %d\n",
3934 			      count);
3935 		err = ENOSPC;
3936 		goto abort_with_msix;
3937 	}
3938 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3939 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3940 	if (sc->msix_irq_res == NULL) {
3941 		err = ENOMEM;
3942 		goto abort_with_msix;
3943 	}
3944 
3945 	for (i = 0; i < sc->num_slices; i++) {
3946 		rid = i + 1;
3947 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3948 							  SYS_RES_IRQ,
3949 							  &rid, RF_ACTIVE);
3950 		if (sc->msix_irq_res[i] == NULL) {
3951 			device_printf(sc->dev, "couldn't allocate IRQ res"
3952 				      " for message %d\n", i);
3953 			err = ENXIO;
3954 			goto abort_with_res;
3955 		}
3956 	}
3957 
3958 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3959 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
3960 
3961 	for (i = 0; i < sc->num_slices; i++) {
3962 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3963 				     INTR_TYPE_NET | INTR_MPSAFE,
3964 #if __FreeBSD_version > 700030
3965 				     NULL,
3966 #endif
3967 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
3968 		if (err != 0) {
3969 			device_printf(sc->dev, "couldn't setup intr for "
3970 				      "message %d\n", i);
3971 			goto abort_with_intr;
3972 		}
3973 	}
3974 
3975 	if (mxge_verbose) {
3976 		device_printf(sc->dev, "using %d msix IRQs:",
3977 			      sc->num_slices);
3978 		for (i = 0; i < sc->num_slices; i++)
3979 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
3980 		printf("\n");
3981 	}
3982 	return (0);
3983 
3984 abort_with_intr:
3985 	for (i = 0; i < sc->num_slices; i++) {
3986 		if (sc->msix_ih[i] != NULL) {
3987 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3988 					  sc->msix_ih[i]);
3989 			sc->msix_ih[i] = NULL;
3990 		}
3991 	}
3992 	free(sc->msix_ih, M_DEVBUF);
3993 
3994 
3995 abort_with_res:
3996 	for (i = 0; i < sc->num_slices; i++) {
3997 		rid = i + 1;
3998 		if (sc->msix_irq_res[i] != NULL)
3999 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4000 					     sc->msix_irq_res[i]);
4001 		sc->msix_irq_res[i] = NULL;
4002 	}
4003 	free(sc->msix_irq_res, M_DEVBUF);
4004 
4005 
4006 abort_with_msix:
4007 	pci_release_msi(sc->dev);
4008 
4009 abort_with_msix_table:
4010 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4011 			     sc->msix_table_res);
4012 
4013 	return err;
4014 }
4015 
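/*
 * Single-queue interrupt setup: use MSI when exactly one message
 * is available, otherwise fall back to a shared INTx interrupt.
 */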
4016 static int
4017 mxge_add_single_irq(mxge_softc_t *sc)
4018 {
4019 	int count, err, rid;
4020 
4021 	count = pci_msi_count(sc->dev);
4022 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4023 		rid = 1;
4024 	} else {
4025 		rid = 0;
4026 		sc->legacy_irq = 1;
4027 	}
4028 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4029 					     RF_SHAREABLE | RF_ACTIVE);
4030 	if (sc->irq_res == NULL) {
4031 		device_printf(sc->dev, "could not alloc interrupt\n");
4032 		return ENXIO;
4033 	}
4034 	if (mxge_verbose)
4035 		device_printf(sc->dev, "using %s irq %ld\n",
4036 			      sc->legacy_irq ? "INTx" : "MSI",
4037 			      rman_get_start(sc->irq_res));
4038 	err = bus_setup_intr(sc->dev, sc->irq_res,
4039 			     INTR_TYPE_NET | INTR_MPSAFE,
4040 #if __FreeBSD_version > 700030
4041 			     NULL,
4042 #endif
4043 			     mxge_intr, &sc->ss[0], &sc->ih);
4044 	if (err != 0) {
4045 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4046 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4047 		if (!sc->legacy_irq)
4048 			pci_release_msi(sc->dev);
4049 	}
4050 	return err;
4051 }
4052 
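/*
 * Undo mxge_add_msix_irqs(): tear down the handlers, release the
 * IRQ resources and the MSI-X table mapping, and free the
 * messages.
 */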
4053 static void
4054 mxge_rem_msix_irqs(mxge_softc_t *sc)
4055 {
4056 	int i, rid;
4057 
4058 	for (i = 0; i < sc->num_slices; i++) {
4059 		if (sc->msix_ih[i] != NULL) {
4060 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4061 					  sc->msix_ih[i]);
4062 			sc->msix_ih[i] = NULL;
4063 		}
4064 	}
4065 	free(sc->msix_ih, M_DEVBUF);
4066 
4067 	for (i = 0; i < sc->num_slices; i++) {
4068 		rid = i + 1;
4069 		if (sc->msix_irq_res[i] != NULL)
4070 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4071 					     sc->msix_irq_res[i]);
4072 		sc->msix_irq_res[i] = NULL;
4073 	}
4074 	free(sc->msix_irq_res, M_DEVBUF);
4075 
4076 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4077 			     sc->msix_table_res);
4078 
4079 	pci_release_msi(sc->dev);
4080 	return;
4081 }
4082 
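/*
 * Undo mxge_add_single_irq(), releasing the MSI message if one
 * was used.
 */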
4083 static void
4084 mxge_rem_single_irq(mxge_softc_t *sc)
4085 {
4086 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4087 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4088 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4089 	if (!sc->legacy_irq)
4090 		pci_release_msi(sc->dev);
4091 }
4092 
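/* Dispatch to the MSI-X or single-interrupt teardown path. */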
4093 static void
4094 mxge_rem_irq(mxge_softc_t *sc)
4095 {
4096 	if (sc->num_slices > 1)
4097 		mxge_rem_msix_irqs(sc);
4098 	else
4099 		mxge_rem_single_irq(sc);
4100 }
4101 
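/* Dispatch to the MSI-X or single-interrupt setup path. */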
4102 static int
4103 mxge_add_irq(mxge_softc_t *sc)
4104 {
4105 	int err;
4106 
4107 	if (sc->num_slices > 1)
4108 		err = mxge_add_msix_irqs(sc);
4109 	else
4110 		err = mxge_add_single_irq(sc);
4111 
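	/*
	 * Deliberately disabled ("0 &&"); this looks like a leftover
	 * debugging hook that re-ran the MSI-X setup path.
	 */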
4112 	if (0 && err == 0 && sc->num_slices > 1) {
4113 		mxge_rem_msix_irqs(sc);
4114 		err = mxge_add_msix_irqs(sc);
4115 	}
4116 	return err;
4117 }
4118 
4119 
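/*
 * Device attach: map the NIC and parse its EEPROM strings, load
 * and reset the firmware, size the slices, allocate rings and
 * interrupts, then attach the ifnet with whatever offload
 * capabilities the firmware supports.
 */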
4120 static int
4121 mxge_attach(device_t dev)
4122 {
4123 	mxge_softc_t *sc = device_get_softc(dev);
4124 	struct ifnet *ifp;
4125 	int err, rid;
4126 
4127 	sc->dev = dev;
4128 	mxge_fetch_tunables(sc);
4129 
4130 	err = bus_dma_tag_create(NULL,			/* parent */
4131 				 1,			/* alignment */
4132 				 0,			/* boundary */
4133 				 BUS_SPACE_MAXADDR,	/* low */
4134 				 BUS_SPACE_MAXADDR,	/* high */
4135 				 NULL, NULL,		/* filter */
4136 				 65536 + 256,		/* maxsize */
4137 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4138 				 65536,			/* maxsegsize */
4139 				 0,			/* flags */
4140 				 NULL, NULL,		/* lock */
4141 				 &sc->parent_dmat);	/* tag */
4142 
4143 	if (err != 0) {
4144 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4145 			      err);
4146 		goto abort_with_nothing;
4147 	}
4148 
4149 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4150 	if (ifp == NULL) {
4151 		device_printf(dev, "can not if_alloc()\n");
4152 		err = ENOSPC;
4153 		goto abort_with_parent_dmat;
4154 	}
4155 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4156 
4157 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4158 		 device_get_nameunit(dev));
4159 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4160 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4161 		 "%s:drv", device_get_nameunit(dev));
4162 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4163 		 MTX_NETWORK_LOCK, MTX_DEF);
4164 
4165 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4166 
4167 	mxge_setup_cfg_space(sc);
4168 
4169 	/* Map the board into the kernel */
4170 	rid = PCIR_BARS;
4171 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4172 					     RF_ACTIVE);
4173 	if (sc->mem_res == NULL) {
4174 		device_printf(dev, "could not map memory\n");
4175 		err = ENXIO;
4176 		goto abort_with_lock;
4177 	}
4178 	sc->sram = rman_get_virtual(sc->mem_res);
4179 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4180 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4181 		device_printf(dev, "impossible memory region size %ld\n",
4182 			      rman_get_size(sc->mem_res));
4183 		err = ENXIO;
4184 		goto abort_with_mem_res;
4185 	}
4186 
4187 	/* make a NULL-terminated copy of the EEPROM strings section of
4188 	   LANai SRAM */
4189 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4190 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4191 				rman_get_bushandle(sc->mem_res),
4192 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4193 				sc->eeprom_strings,
4194 				MXGE_EEPROM_STRINGS_SIZE - 2);
4195 	err = mxge_parse_strings(sc);
4196 	if (err != 0)
4197 		goto abort_with_mem_res;
4198 
4199 	/* Enable write combining for efficient use of PCIe bus */
4200 	mxge_enable_wc(sc);
4201 
4202 	/* Allocate the out of band dma memory */
4203 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4204 			     sizeof (mxge_cmd_t), 64);
4205 	if (err != 0)
4206 		goto abort_with_mem_res;
4207 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4208 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4209 	if (err != 0)
4210 		goto abort_with_cmd_dma;
4211 
4212 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4213 	if (err != 0)
4214 		goto abort_with_zeropad_dma;
4215 
4216 	/* select & load the firmware */
4217 	err = mxge_select_firmware(sc);
4218 	if (err != 0)
4219 		goto abort_with_dmabench;
4220 	sc->intr_coal_delay = mxge_intr_coal_delay;
4221 
4222 	mxge_slice_probe(sc);
4223 	err = mxge_alloc_slices(sc);
4224 	if (err != 0)
4225 		goto abort_with_dmabench;
4226 
4227 	err = mxge_reset(sc, 0);
4228 	if (err != 0)
4229 		goto abort_with_slices;
4230 
4231 	err = mxge_alloc_rings(sc);
4232 	if (err != 0) {
4233 		device_printf(sc->dev, "failed to allocate rings\n");
4234 		goto abort_with_slices;
4235 	}
4236 
4237 	err = mxge_add_irq(sc);
4238 	if (err != 0) {
4239 		device_printf(sc->dev, "failed to add irq\n");
4240 		goto abort_with_rings;
4241 	}
4242 
4243 	ifp->if_baudrate = IF_Gbps(10UL);
4244 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4245 		IFCAP_VLAN_MTU | IFCAP_LRO;
4246 
4247 #ifdef MXGE_NEW_VLAN_API
4248 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4249 #endif
4250 
4251 	sc->max_mtu = mxge_max_mtu(sc);
4252 	if (sc->max_mtu >= 9000)
4253 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4254 	else
4255 		device_printf(dev, "MTU limited to %d.  Install "
4256 			      "latest firmware for 9000 byte jumbo support\n",
4257 			      sc->max_mtu - ETHER_HDR_LEN);
4258 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4259 	ifp->if_capenable = ifp->if_capabilities;
4260 	if (sc->lro_cnt == 0)
4261 		ifp->if_capenable &= ~IFCAP_LRO;
4262 	sc->csum_flag = 1;
4263 	ifp->if_init = mxge_init;
4264 	ifp->if_softc = sc;
4265 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4266 	ifp->if_ioctl = mxge_ioctl;
4267 	ifp->if_start = mxge_start;
4268 	/* Initialise the ifmedia structure */
4269 	ifmedia_init(&sc->media, 0, mxge_media_change,
4270 		     mxge_media_status);
4271 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4272 	mxge_media_probe(sc);
4273 	ether_ifattach(ifp, sc->mac_addr);
4274 	/* ether_ifattach sets mtu to 1500 */
4275 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
4276 		ifp->if_mtu = 9000;
4277 
4278 	mxge_add_sysctls(sc);
4279 	return 0;
4280 
4281 abort_with_rings:
4282 	mxge_free_rings(sc);
4283 abort_with_slices:
4284 	mxge_free_slices(sc);
4285 abort_with_dmabench:
4286 	mxge_dma_free(&sc->dmabench_dma);
4287 abort_with_zeropad_dma:
4288 	mxge_dma_free(&sc->zeropad_dma);
4289 abort_with_cmd_dma:
4290 	mxge_dma_free(&sc->cmd_dma);
4291 abort_with_mem_res:
4292 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4293 abort_with_lock:
4294 	pci_disable_busmaster(dev);
4295 	mtx_destroy(&sc->cmd_mtx);
4296 	mtx_destroy(&sc->driver_mtx);
4297 	if_free(ifp);
4298 abort_with_parent_dmat:
4299 	bus_dma_tag_destroy(sc->parent_dmat);
4300 
4301 abort_with_nothing:
4302 	return err;
4303 }
4304 
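/*
 * Device detach: refuse while vlans are still attached, otherwise
 * stop the interface and release everything attach acquired, in
 * reverse order.
 */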
4305 static int
4306 mxge_detach(device_t dev)
4307 {
4308 	mxge_softc_t *sc = device_get_softc(dev);
4309 
4310 	if (mxge_vlans_active(sc)) {
4311 		device_printf(sc->dev,
4312 			      "Detach vlans before removing module\n");
4313 		return EBUSY;
4314 	}
4315 	mtx_lock(&sc->driver_mtx);
4316 	callout_stop(&sc->co_hdl);
4317 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4318 		mxge_close(sc);
4319 	mtx_unlock(&sc->driver_mtx);
4320 	ether_ifdetach(sc->ifp);
4321 	ifmedia_removeall(&sc->media);
4322 	mxge_dummy_rdma(sc, 0);
4323 	mxge_rem_sysctls(sc);
4324 	mxge_rem_irq(sc);
4325 	mxge_free_rings(sc);
4326 	mxge_free_slices(sc);
4327 	mxge_dma_free(&sc->dmabench_dma);
4328 	mxge_dma_free(&sc->zeropad_dma);
4329 	mxge_dma_free(&sc->cmd_dma);
4330 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4331 	pci_disable_busmaster(dev);
4332 	mtx_destroy(&sc->cmd_mtx);
4333 	mtx_destroy(&sc->driver_mtx);
4334 	if_free(sc->ifp);
4335 	bus_dma_tag_destroy(sc->parent_dmat);
4336 	return 0;
4337 }
4338 
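/* Nothing to quiesce at shutdown time. */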
4339 static int
4340 mxge_shutdown(device_t dev)
4341 {
4342 	return 0;
4343 }
4344 
4345 /*
4346   This file uses Myri10GE driver indentation.
4347 
4348   Local Variables:
4349   c-file-style:"linux"
4350   tab-width:8
4351   End:
4352 */
4353