xref: /freebsd/sys/dev/mxge/if_mxge.c (revision 2a664c03e55254b0f3b32dcdfc78179c0a57a8d2)
1 /******************************************************************************
2 
3 Copyright (c) 2006-2009, Myricom Inc.
4 All rights reserved.
5 
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8 
9  1. Redistributions of source code must retain the above copyright notice,
10     this list of conditions and the following disclaimer.
11 
12  2. Neither the name of the Myricom Inc, nor the names of its
13     contributors may be used to endorse or promote products derived from
14     this software without specific prior written permission.
15 
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27 
28 ***************************************************************************/
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/linker.h>
36 #include <sys/firmware.h>
37 #include <sys/endian.h>
38 #include <sys/sockio.h>
39 #include <sys/mbuf.h>
40 #include <sys/malloc.h>
41 #include <sys/kdb.h>
42 #include <sys/kernel.h>
43 #include <sys/lock.h>
44 #include <sys/module.h>
45 #include <sys/socket.h>
46 #include <sys/sysctl.h>
47 #include <sys/sx.h>
48 #include <sys/taskqueue.h>
49 
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ethernet.h>
53 #include <net/if_dl.h>
54 #include <net/if_media.h>
55 
56 #include <net/bpf.h>
57 
58 #include <net/if_types.h>
59 #include <net/if_vlan_var.h>
60 #include <net/zlib.h>
61 
62 #include <netinet/in_systm.h>
63 #include <netinet/in.h>
64 #include <netinet/ip.h>
65 #include <netinet/tcp.h>
66 
67 #include <machine/bus.h>
68 #include <machine/in_cksum.h>
69 #include <machine/resource.h>
70 #include <sys/bus.h>
71 #include <sys/rman.h>
72 #include <sys/smp.h>
73 
74 #include <dev/pci/pcireg.h>
75 #include <dev/pci/pcivar.h>
76 #include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */
77 
78 #include <vm/vm.h>		/* for pmap_mapdev() */
79 #include <vm/pmap.h>
80 
81 #if defined(__i386) || defined(__amd64)
82 #include <machine/specialreg.h>
83 #endif
84 
85 #include <dev/mxge/mxge_mcp.h>
86 #include <dev/mxge/mcp_gen_header.h>
87 /*#define MXGE_FAKE_IFP*/
88 #include <dev/mxge/if_mxge_var.h>
89 #ifdef IFNET_BUF_RING
90 #include <sys/buf_ring.h>
91 #endif
92 
93 #include "opt_inet.h"
94 
/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;	/* try to enable ECRC on Nvidia bridges */
static int mxge_force_firmware = 0;	/* 0 = probe; 1 = force aligned fw; >1 = force unaligned */
static int mxge_intr_coal_delay = 30;	/* NOTE(review): presumably usecs -- confirm at use site */
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;		/* extra diagnostics via device_printf */
static int mxge_lro_cnt = 8;
static int mxge_ticks;			/* 0 here; presumably initialized at attach */
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;	/* force promiscuous mode regardless of ifnet flags */
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
/* firmware image names: "aligned" variants require 8-byte aligned PCIe
 * completions; "rss" variants support multiple slices */
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

/* newbus entry points */
static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

/* forward declarations for routines used before they are defined */
static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);
149 
150 static int
151 mxge_probe(device_t dev)
152 {
153 	int rev;
154 
155 
156 	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
157 	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
158 	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
159 		rev = pci_get_revid(dev);
160 		switch (rev) {
161 		case MXGE_PCI_REV_Z8E:
162 			device_set_desc(dev, "Myri10G-PCIE-8A");
163 			break;
164 		case MXGE_PCI_REV_Z8ES:
165 			device_set_desc(dev, "Myri10G-PCIE-8B");
166 			break;
167 		default:
168 			device_set_desc(dev, "Myri10G-PCIE-8??");
169 			device_printf(dev, "Unrecognized rev %d NIC\n",
170 				      rev);
171 			break;
172 		}
173 		return 0;
174 	}
175 	return ENXIO;
176 }
177 
/*
 * Try to remap the NIC SRAM aperture as write-combining on x86.
 * On success sc->wc stays set so PIO copies benefit from WC; on
 * failure sc->wc is cleared.  No-op on other architectures.
 */
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	/* retag the entire BAR mapping with the WC PAT attribute */
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}
196 
197 
198 /* callback to get our DMA address */
199 static void
200 mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
201 			 int error)
202 {
203 	if (error == 0) {
204 		*(bus_addr_t *) arg = segs->ds_addr;
205 	}
206 }
207 
/*
 * Allocate, map, and load a coherent DMA buffer described by 'dma':
 * create a single-segment tag, allocate zeroed DMAable memory, and
 * load it to obtain dma->bus_addr.  Returns 0 or a bus_dma error,
 * unwinding any partial state on failure.  The caller releases the
 * resources with mxge_dma_free().
 */
static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	/* Large page-aligned buffers may span 4KB boundaries;
	 * everything else is confined to a single 4KB window. */
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory; the callback stores the bus address */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}
267 
268 
/*
 * Release a buffer set up by mxge_dma_alloc(): unload the map, free
 * the memory, then destroy the tag -- order matters.
 */
static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
276 
277 /*
278  * The eeprom strings on the lanaiX have the format
279  * SN=x\0
280  * MAC=x:x:x:x:x:x\0
281  * PC=text\0
282  */
283 
/*
 * Walk the NUL-separated eeprom strings ("SN=", "MAC=", "PC=") and
 * extract the MAC address, product code and serial number into the
 * softc.  Returns 0 if a MAC was found, ENXIO otherwise.
 */
static int
mxge_parse_strings(mxge_softc_t *sc)
{
/* skip to the byte after the current string's terminating NUL */
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* NOTE(review): ptr += 1 leaves mac_addr_string
			 * pointing at "AC=..."; the loop's leading
			 * ptr += 3 then lands strtoul() on each hex
			 * octet, so the parse is correct, but confirm
			 * consumers of mac_addr_string expect this
			 * offset. */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			/* NOTE(review): strncpy() does not terminate on
			 * truncation; relies on the softc being zeroed
			 * beforehand -- confirm */
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}
326 
/* NOTE(review): this spelling of the x86 test differs from the
 * "defined(__i386) || defined(__amd64)" form used earlier in the
 * file -- presumably equivalent on FreeBSD compilers; confirm. */
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
/*
 * Enable ECRC generation on an upstream Nvidia (ck804/mcp55) bridge
 * so PCIe completions arrive aligned.  The ECRC control register
 * lives in extended config space (offset 0x178), which is reached by
 * mapping the chipset's memory-mapped config aperture directly.
 */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	/* the bridge of interest is the grandparent of our device */
	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	/* only Nvidia bridges are handled */
	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	/* offset of this function's config space within the aperture:
	 * 1MB per bus, 4KB per (slot, function) */
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_kenter_temporary didn't\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	/* 0x178 holds the ECRC control bits; 0x40 enables generation */
	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
/* Non-x86 fallback: Nvidia Nforce4 chipsets only exist on x86. */
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif
460 
461 
462 static int
463 mxge_dma_test(mxge_softc_t *sc, int test_type)
464 {
465 	mxge_cmd_t cmd;
466 	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
467 	int status;
468 	uint32_t len;
469 	char *test = " ";
470 
471 
472 	/* Run a small DMA test.
473 	 * The magic multipliers to the length tell the firmware
474 	 * to do DMA read, write, or read+write tests.  The
475 	 * results are returned in cmd.data0.  The upper 16
476 	 * bits of the return is the number of transfers completed.
477 	 * The lower 16 bits is the time in 0.5us ticks that the
478 	 * transfers took to complete.
479 	 */
480 
481 	len = sc->tx_boundary;
482 
483 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
484 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
485 	cmd.data2 = len * 0x10000;
486 	status = mxge_send_cmd(sc, test_type, &cmd);
487 	if (status != 0) {
488 		test = "read";
489 		goto abort;
490 	}
491 	sc->read_dma = ((cmd.data0>>16) * len * 2) /
492 		(cmd.data0 & 0xffff);
493 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
494 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
495 	cmd.data2 = len * 0x1;
496 	status = mxge_send_cmd(sc, test_type, &cmd);
497 	if (status != 0) {
498 		test = "write";
499 		goto abort;
500 	}
501 	sc->write_dma = ((cmd.data0>>16) * len * 2) /
502 		(cmd.data0 & 0xffff);
503 
504 	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
505 	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
506 	cmd.data2 = len * 0x10001;
507 	status = mxge_send_cmd(sc, test_type, &cmd);
508 	if (status != 0) {
509 		test = "read/write";
510 		goto abort;
511 	}
512 	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
513 		(cmd.data0 & 0xffff);
514 
515 abort:
516 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
517 		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
518 			      test, status);
519 
520 	return status;
521 }
522 
523 /*
524  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
525  * when the PCI-E Completion packets are aligned on an 8-byte
526  * boundary.  Some PCI-E chip sets always align Completion packets; on
527  * the ones that do not, the alignment can be enforced by enabling
528  * ECRC generation (if supported).
529  *
530  * When PCI-E Completion packets are not aligned, it is actually more
531  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
532  *
533  * If the driver can neither enable ECRC nor verify that it has
534  * already been enabled, then it must use a firmware image which works
535  * around unaligned completion packets (ethp_z8e.dat), and it should
536  * also ensure that it never gives the device a Read-DMA which is
537  * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
538  * enabled, then the driver should use the aligned (eth_z8e.dat)
539  * firmware image, and set tx_boundary to 4KB.
540  */
541 
542 static int
543 mxge_firmware_probe(mxge_softc_t *sc)
544 {
545 	device_t dev = sc->dev;
546 	int reg, status;
547 	uint16_t pectl;
548 
549 	sc->tx_boundary = 4096;
550 	/*
551 	 * Verify the max read request size was set to 4KB
552 	 * before trying the test with 4KB.
553 	 */
554 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
555 		pectl = pci_read_config(dev, reg + 0x8, 2);
556 		if ((pectl & (5 << 12)) != (5 << 12)) {
557 			device_printf(dev, "Max Read Req. size != 4k (0x%x\n",
558 				      pectl);
559 			sc->tx_boundary = 2048;
560 		}
561 	}
562 
563 	/*
564 	 * load the optimized firmware (which assumes aligned PCIe
565 	 * completions) in order to see if it works on this host.
566 	 */
567 	sc->fw_name = mxge_fw_aligned;
568 	status = mxge_load_firmware(sc, 1);
569 	if (status != 0) {
570 		return status;
571 	}
572 
573 	/*
574 	 * Enable ECRC if possible
575 	 */
576 	mxge_enable_nvidia_ecrc(sc);
577 
578 	/*
579 	 * Run a DMA test which watches for unaligned completions and
580 	 * aborts on the first one seen.
581 	 */
582 
583 	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
584 	if (status == 0)
585 		return 0; /* keep the aligned firmware */
586 
587 	if (status != E2BIG)
588 		device_printf(dev, "DMA test failed: %d\n", status);
589 	if (status == ENOSYS)
590 		device_printf(dev, "Falling back to ethp! "
591 			      "Please install up to date fw\n");
592 	return status;
593 }
594 
595 static int
596 mxge_select_firmware(mxge_softc_t *sc)
597 {
598 	int aligned = 0;
599 	int force_firmware = mxge_force_firmware;
600 
601 	if (sc->throttle)
602 		force_firmware = sc->throttle;
603 
604 	if (force_firmware != 0) {
605 		if (force_firmware == 1)
606 			aligned = 1;
607 		else
608 			aligned = 0;
609 		if (mxge_verbose)
610 			device_printf(sc->dev,
611 				      "Assuming %s completions (forced)\n",
612 				      aligned ? "aligned" : "unaligned");
613 		goto abort;
614 	}
615 
616 	/* if the PCIe link width is 4 or less, we can use the aligned
617 	   firmware and skip any checks */
618 	if (sc->link_width != 0 && sc->link_width <= 4) {
619 		device_printf(sc->dev,
620 			      "PCIe x%d Link, expect reduced performance\n",
621 			      sc->link_width);
622 		aligned = 1;
623 		goto abort;
624 	}
625 
626 	if (0 == mxge_firmware_probe(sc))
627 		return 0;
628 
629 abort:
630 	if (aligned) {
631 		sc->fw_name = mxge_fw_aligned;
632 		sc->tx_boundary = 4096;
633 	} else {
634 		sc->fw_name = mxge_fw_unaligned;
635 		sc->tx_boundary = 2048;
636 	}
637 	return (mxge_load_firmware(sc, 0));
638 }
639 
/*
 * Union for viewing the same pointer as const and non-const char *,
 * i.e. a warning-free way to cast away const.  NOTE(review): no use
 * is visible in this part of the file (__DECONST() is used instead);
 * confirm it is still needed.
 */
union qualhack
{
        const char *ro_char;
        char *rw_char;
};
645 
646 static int
647 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
648 {
649 
650 
651 	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
652 		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
653 			      be32toh(hdr->mcp_type));
654 		return EIO;
655 	}
656 
657 	/* save firmware version for sysctl */
658 	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
659 	if (mxge_verbose)
660 		device_printf(sc->dev, "firmware id: %s\n", hdr->version);
661 
662 	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
663 	       &sc->fw_ver_minor, &sc->fw_ver_tiny);
664 
665 	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
666 	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
667 		device_printf(sc->dev, "Found firmware version %s\n",
668 			      sc->fw_version);
669 		device_printf(sc->dev, "Driver needs %d.%d\n",
670 			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
671 		return EINVAL;
672 	}
673 	return 0;
674 
675 }
676 
677 static void *
678 z_alloc(void *nil, u_int items, u_int size)
679 {
680         void *ptr;
681 
682         ptr = malloc(items * size, M_TEMP, M_NOWAIT);
683         return ptr;
684 }
685 
/* zlib deallocator shim: return memory obtained via z_alloc(). */
static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}
691 
692 
693 static int
694 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
695 {
696 	z_stream zs;
697 	char *inflate_buffer;
698 	const struct firmware *fw;
699 	const mcp_gen_header_t *hdr;
700 	unsigned hdr_offset;
701 	int status;
702 	unsigned int i;
703 	char dummy;
704 	size_t fw_len;
705 
706 	fw = firmware_get(sc->fw_name);
707 	if (fw == NULL) {
708 		device_printf(sc->dev, "Could not find firmware image %s\n",
709 			      sc->fw_name);
710 		return ENOENT;
711 	}
712 
713 
714 
715 	/* setup zlib and decompress f/w */
716 	bzero(&zs, sizeof (zs));
717 	zs.zalloc = z_alloc;
718 	zs.zfree = z_free;
719 	status = inflateInit(&zs);
720 	if (status != Z_OK) {
721 		status = EIO;
722 		goto abort_with_fw;
723 	}
724 
725 	/* the uncompressed size is stored as the firmware version,
726 	   which would otherwise go unused */
727 	fw_len = (size_t) fw->version;
728 	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
729 	if (inflate_buffer == NULL)
730 		goto abort_with_zs;
731 	zs.avail_in = fw->datasize;
732 	zs.next_in = __DECONST(char *, fw->data);
733 	zs.avail_out = fw_len;
734 	zs.next_out = inflate_buffer;
735 	status = inflate(&zs, Z_FINISH);
736 	if (status != Z_STREAM_END) {
737 		device_printf(sc->dev, "zlib %d\n", status);
738 		status = EIO;
739 		goto abort_with_buffer;
740 	}
741 
742 	/* check id */
743 	hdr_offset = htobe32(*(const uint32_t *)
744 			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
745 	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
746 		device_printf(sc->dev, "Bad firmware file");
747 		status = EIO;
748 		goto abort_with_buffer;
749 	}
750 	hdr = (const void*)(inflate_buffer + hdr_offset);
751 
752 	status = mxge_validate_firmware(sc, hdr);
753 	if (status != 0)
754 		goto abort_with_buffer;
755 
756 	/* Copy the inflated firmware to NIC SRAM. */
757 	for (i = 0; i < fw_len; i += 256) {
758 		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
759 			      inflate_buffer + i,
760 			      min(256U, (unsigned)(fw_len - i)));
761 		wmb();
762 		dummy = *sc->sram;
763 		wmb();
764 	}
765 
766 	*limit = fw_len;
767 	status = 0;
768 abort_with_buffer:
769 	free(inflate_buffer, M_TEMP);
770 abort_with_zs:
771 	inflateEnd(&zs);
772 abort_with_fw:
773 	firmware_put(fw, FIRMWARE_UNLOAD);
774 	return status;
775 }
776 
777 /*
778  * Enable or disable periodic RDMAs from the host to make certain
779  * chipsets resend dropped PCIe messages
780  */
781 
/*
 * Enable or disable the firmware's periodic dummy RDMA reads (a
 * workaround that makes certain chipsets resend dropped PCIe
 * messages).  Builds a 6-word boot command in an 8-byte-aligned
 * buffer, PIO-copies it to the boot command slot in SRAM, and polls
 * the confirmation word (up to ~20ms) for the firmware's 0xffffffff
 * acknowledgement.
 */
static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	/* align buf to an 8-byte boundary within buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);			/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll for the firmware's -1 acknowledgement, 1ms per try */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}
833 
834 static int
835 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
836 {
837 	mcp_cmd_t *buf;
838 	char buf_bytes[sizeof(*buf) + 8];
839 	volatile mcp_cmd_response_t *response = sc->cmd;
840 	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
841 	uint32_t dma_low, dma_high;
842 	int err, sleep_total = 0;
843 
844 	/* ensure buf is aligned to 8 bytes */
845 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
846 
847 	buf->data0 = htobe32(data->data0);
848 	buf->data1 = htobe32(data->data1);
849 	buf->data2 = htobe32(data->data2);
850 	buf->cmd = htobe32(cmd);
851 	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
852 	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
853 
854 	buf->response_addr.low = htobe32(dma_low);
855 	buf->response_addr.high = htobe32(dma_high);
856 	mtx_lock(&sc->cmd_mtx);
857 	response->result = 0xffffffff;
858 	wmb();
859 	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
860 
861 	/* wait up to 20ms */
862 	err = EAGAIN;
863 	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
864 		bus_dmamap_sync(sc->cmd_dma.dmat,
865 				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
866 		wmb();
867 		switch (be32toh(response->result)) {
868 		case 0:
869 			data->data0 = be32toh(response->data);
870 			err = 0;
871 			break;
872 		case 0xffffffff:
873 			DELAY(1000);
874 			break;
875 		case MXGEFW_CMD_UNKNOWN:
876 			err = ENOSYS;
877 			break;
878 		case MXGEFW_CMD_ERROR_UNALIGNED:
879 			err = E2BIG;
880 			break;
881 		case MXGEFW_CMD_ERROR_BUSY:
882 			err = EBUSY;
883 			break;
884 		case MXGEFW_CMD_ERROR_I2C_ABSENT:
885 			err = ENXIO;
886 			break;
887 		default:
888 			device_printf(sc->dev,
889 				      "mxge: command %d "
890 				      "failed, result = %d\n",
891 				      cmd, be32toh(response->result));
892 			err = ENXIO;
893 			break;
894 		}
895 		if (err != EAGAIN)
896 			break;
897 	}
898 	if (err == EAGAIN)
899 		device_printf(sc->dev, "mxge: command %d timed out"
900 			      "result = %d\n",
901 			      cmd, be32toh(response->result));
902 	mtx_unlock(&sc->cmd_mtx);
903 	return err;
904 }
905 
/*
 * Fall back to the firmware already running on the NIC: locate its
 * header in SRAM, copy it to host memory, and validate it.  Also
 * flags the known 1.4.4-1.4.11 rx filter bug so the driver keeps the
 * NIC in ALLMULTI mode.  Returns 0 or an errno.
 */
static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header; the pointer is stored
	 * big-endian in SRAM (htobe32() performs the same byte swap
	 * as be32toh()) */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}
954 
955 
/*
 * Load firmware into the NIC and hand off execution to it.  First
 * tries to download and inflate the image named by sc->fw_name; if
 * that fails and 'adopt' is set, falls back to validating/adopting
 * the firmware already running on the NIC (forcing the unaligned
 * image name and a 2KB tx boundary).  Otherwise builds a boot
 * handoff command, PIO-copies it to the boot handoff slot, and polls
 * the confirmation word for the firmware's acknowledgement.
 * Returns 0, an errno from the helpers, or ENXIO on handoff timeout.
 */
static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	/* align buf to an 8-byte boundary within buf_bytes */
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	/* poll for the firmware's -1 acknowledgement, 10ms per try */
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}
1040 
1041 static int
1042 mxge_update_mac_address(mxge_softc_t *sc)
1043 {
1044 	mxge_cmd_t cmd;
1045 	uint8_t *addr = sc->mac_addr;
1046 	int status;
1047 
1048 
1049 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1050 		     | (addr[2] << 8) | addr[3]);
1051 
1052 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1053 
1054 	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
1055 	return status;
1056 }
1057 
1058 static int
1059 mxge_change_pause(mxge_softc_t *sc, int pause)
1060 {
1061 	mxge_cmd_t cmd;
1062 	int status;
1063 
1064 	if (pause)
1065 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
1066 				       &cmd);
1067 	else
1068 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
1069 				       &cmd);
1070 
1071 	if (status) {
1072 		device_printf(sc->dev, "Failed to set flow control mode\n");
1073 		return ENXIO;
1074 	}
1075 	sc->pause = pause;
1076 	return 0;
1077 }
1078 
1079 static void
1080 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1081 {
1082 	mxge_cmd_t cmd;
1083 	int status;
1084 
1085 	if (mxge_always_promisc)
1086 		promisc = 1;
1087 
1088 	if (promisc)
1089 		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
1090 				       &cmd);
1091 	else
1092 		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
1093 				       &cmd);
1094 
1095 	if (status) {
1096 		device_printf(sc->dev, "Failed to set promisc mode\n");
1097 	}
1098 }
1099 
1100 static void
1101 mxge_set_multicast_list(mxge_softc_t *sc)
1102 {
1103 	mxge_cmd_t cmd;
1104 	struct ifmultiaddr *ifma;
1105 	struct ifnet *ifp = sc->ifp;
1106 	int err;
1107 
1108 	/* This firmware is known to not support multicast */
1109 	if (!sc->fw_multicast_support)
1110 		return;
1111 
1112 	/* Disable multicast filtering while we play with the lists*/
1113 	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1114 	if (err != 0) {
1115 		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
1116 		       " error status: %d\n", err);
1117 		return;
1118 	}
1119 
1120 	if (sc->adopted_rx_filter_bug)
1121 		return;
1122 
1123 	if (ifp->if_flags & IFF_ALLMULTI)
1124 		/* request to disable multicast filtering, so quit here */
1125 		return;
1126 
1127 	/* Flush all the filters */
1128 
1129 	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1130 	if (err != 0) {
1131 		device_printf(sc->dev,
1132 			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
1133 			      ", error status: %d\n", err);
1134 		return;
1135 	}
1136 
1137 	/* Walk the multicast list, and add each address */
1138 
1139 	if_maddr_rlock(ifp);
1140 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1141 		if (ifma->ifma_addr->sa_family != AF_LINK)
1142 			continue;
1143 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1144 		      &cmd.data0, 4);
1145 		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1146 		      &cmd.data1, 2);
1147 		cmd.data0 = htonl(cmd.data0);
1148 		cmd.data1 = htonl(cmd.data1);
1149 		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1150 		if (err != 0) {
1151 			device_printf(sc->dev, "Failed "
1152 			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1153 			       "%d\t", err);
1154 			/* abort, leaving multicast filtering off */
1155 			if_maddr_runlock(ifp);
1156 			return;
1157 		}
1158 	}
1159 	if_maddr_runlock(ifp);
1160 	/* Enable multicast filtering */
1161 	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1162 	if (err != 0) {
1163 		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
1164 		       ", error status: %d\n", err);
1165 	}
1166 }
1167 
1168 static int
1169 mxge_max_mtu(mxge_softc_t *sc)
1170 {
1171 	mxge_cmd_t cmd;
1172 	int status;
1173 
1174 	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
1175 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1176 
1177 	/* try to set nbufs to see if it we can
1178 	   use virtually contiguous jumbos */
1179 	cmd.data0 = 0;
1180 	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1181 			       &cmd);
1182 	if (status == 0)
1183 		return  MXGEFW_MAX_MTU - MXGEFW_PAD;
1184 
1185 	/* otherwise, we're limited to MJUMPAGESIZE */
1186 	return MJUMPAGESIZE - MXGEFW_PAD;
1187 }
1188 
/*
 * Reset the NIC and re-establish driver/firmware shared state:
 * interrupt queue DMA addresses, coalescing/ack register offsets,
 * per-slice counters, MAC address, promiscuity, pause, multicast
 * filters, and (optionally) the transmit throttle.  The firmware
 * command ordering below is significant — see the inline comments.
 * Returns 0 on success or a firmware/errno-style error.
 */
static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			/* Zero the receive-done ring and tell the firmware
			   where each slice's interrupt queue lives. */
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			/* errors from here on are ORed into status and
			   reported once below */
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	/* Fetch SRAM offsets of the firmware's interrupt-control
	   registers; cmd.data0 holds each offset on return. */
	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	/* program the current coalescing delay into the NIC */
	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		/* each slice's claim register is 2 words apart */
		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	/* restore host-configured NIC settings after the reset */
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}
1332 
1333 static int
1334 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1335 {
1336 	mxge_cmd_t cmd;
1337 	mxge_softc_t *sc;
1338 	int err;
1339 	unsigned int throttle;
1340 
1341 	sc = arg1;
1342 	throttle = sc->throttle;
1343 	err = sysctl_handle_int(oidp, &throttle, arg2, req);
1344         if (err != 0) {
1345                 return err;
1346         }
1347 
1348 	if (throttle == sc->throttle)
1349 		return 0;
1350 
1351         if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1352                 return EINVAL;
1353 
1354 	mtx_lock(&sc->driver_mtx);
1355 	cmd.data0 = throttle;
1356 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1357 	if (err == 0)
1358 		sc->throttle = throttle;
1359 	mtx_unlock(&sc->driver_mtx);
1360 	return err;
1361 }
1362 
1363 static int
1364 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1365 {
1366         mxge_softc_t *sc;
1367         unsigned int intr_coal_delay;
1368         int err;
1369 
1370         sc = arg1;
1371         intr_coal_delay = sc->intr_coal_delay;
1372         err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1373         if (err != 0) {
1374                 return err;
1375         }
1376         if (intr_coal_delay == sc->intr_coal_delay)
1377                 return 0;
1378 
1379         if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1380                 return EINVAL;
1381 
1382 	mtx_lock(&sc->driver_mtx);
1383 	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1384 	sc->intr_coal_delay = intr_coal_delay;
1385 
1386 	mtx_unlock(&sc->driver_mtx);
1387         return err;
1388 }
1389 
1390 static int
1391 mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
1392 {
1393         mxge_softc_t *sc;
1394         unsigned int enabled;
1395         int err;
1396 
1397         sc = arg1;
1398         enabled = sc->pause;
1399         err = sysctl_handle_int(oidp, &enabled, arg2, req);
1400         if (err != 0) {
1401                 return err;
1402         }
1403         if (enabled == sc->pause)
1404                 return 0;
1405 
1406 	mtx_lock(&sc->driver_mtx);
1407 	err = mxge_change_pause(sc, enabled);
1408 	mtx_unlock(&sc->driver_mtx);
1409         return err;
1410 }
1411 
1412 static int
1413 mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
1414 {
1415 	struct ifnet *ifp;
1416 	int err = 0;
1417 
1418 	ifp = sc->ifp;
1419 	if (lro_cnt == 0)
1420 		ifp->if_capenable &= ~IFCAP_LRO;
1421 	else
1422 		ifp->if_capenable |= IFCAP_LRO;
1423 	sc->lro_cnt = lro_cnt;
1424 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1425 		mxge_close(sc, 0);
1426 		err = mxge_open(sc);
1427 	}
1428 	return err;
1429 }
1430 
1431 static int
1432 mxge_change_lro(SYSCTL_HANDLER_ARGS)
1433 {
1434 	mxge_softc_t *sc;
1435 	unsigned int lro_cnt;
1436 	int err;
1437 
1438 	sc = arg1;
1439 	lro_cnt = sc->lro_cnt;
1440 	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
1441 	if (err != 0)
1442 		return err;
1443 
1444 	if (lro_cnt == sc->lro_cnt)
1445 		return 0;
1446 
1447 	if (lro_cnt > 128)
1448 		return EINVAL;
1449 
1450 	mtx_lock(&sc->driver_mtx);
1451 	err = mxge_change_lro_locked(sc, lro_cnt);
1452 	mtx_unlock(&sc->driver_mtx);
1453 	return err;
1454 }
1455 
1456 static int
1457 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1458 {
1459         int err;
1460 
1461         if (arg1 == NULL)
1462                 return EFAULT;
1463         arg2 = be32toh(*(int *)arg1);
1464         arg1 = NULL;
1465         err = sysctl_handle_int(oidp, arg1, arg2, req);
1466 
1467         return err;
1468 }
1469 
1470 static void
1471 mxge_rem_sysctls(mxge_softc_t *sc)
1472 {
1473 	struct mxge_slice_state *ss;
1474 	int slice;
1475 
1476 	if (sc->slice_sysctl_tree == NULL)
1477 		return;
1478 
1479 	for (slice = 0; slice < sc->num_slices; slice++) {
1480 		ss = &sc->ss[slice];
1481 		if (ss == NULL || ss->sysctl_tree == NULL)
1482 			continue;
1483 		sysctl_ctx_free(&ss->sysctl_ctx);
1484 		ss->sysctl_tree = NULL;
1485 	}
1486 	sysctl_ctx_free(&sc->slice_sysctl_ctx);
1487 	sc->slice_sysctl_tree = NULL;
1488 }
1489 
1490 static void
1491 mxge_add_sysctls(mxge_softc_t *sc)
1492 {
1493 	struct sysctl_ctx_list *ctx;
1494 	struct sysctl_oid_list *children;
1495 	mcp_irq_data_t *fw;
1496 	struct mxge_slice_state *ss;
1497 	int slice;
1498 	char slice_num[8];
1499 
1500 	ctx = device_get_sysctl_ctx(sc->dev);
1501 	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1502 	fw = sc->ss[0].fw_stats;
1503 
1504 	/* random information */
1505 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1506 		       "firmware_version",
1507 		       CTLFLAG_RD, &sc->fw_version,
1508 		       0, "firmware version");
1509 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1510 		       "serial_number",
1511 		       CTLFLAG_RD, &sc->serial_number_string,
1512 		       0, "serial number");
1513 	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
1514 		       "product_code",
1515 		       CTLFLAG_RD, &sc->product_code_string,
1516 		       0, "product_code");
1517 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1518 		       "pcie_link_width",
1519 		       CTLFLAG_RD, &sc->link_width,
1520 		       0, "tx_boundary");
1521 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1522 		       "tx_boundary",
1523 		       CTLFLAG_RD, &sc->tx_boundary,
1524 		       0, "tx_boundary");
1525 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1526 		       "write_combine",
1527 		       CTLFLAG_RD, &sc->wc,
1528 		       0, "write combining PIO?");
1529 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1530 		       "read_dma_MBs",
1531 		       CTLFLAG_RD, &sc->read_dma,
1532 		       0, "DMA Read speed in MB/s");
1533 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1534 		       "write_dma_MBs",
1535 		       CTLFLAG_RD, &sc->write_dma,
1536 		       0, "DMA Write speed in MB/s");
1537 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1538 		       "read_write_dma_MBs",
1539 		       CTLFLAG_RD, &sc->read_write_dma,
1540 		       0, "DMA concurrent Read/Write speed in MB/s");
1541 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1542 		       "watchdog_resets",
1543 		       CTLFLAG_RD, &sc->watchdog_resets,
1544 		       0, "Number of times NIC was reset");
1545 
1546 
1547 	/* performance related tunables */
1548 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1549 			"intr_coal_delay",
1550 			CTLTYPE_INT|CTLFLAG_RW, sc,
1551 			0, mxge_change_intr_coal,
1552 			"I", "interrupt coalescing delay in usecs");
1553 
1554 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1555 			"throttle",
1556 			CTLTYPE_INT|CTLFLAG_RW, sc,
1557 			0, mxge_change_throttle,
1558 			"I", "transmit throttling");
1559 
1560 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1561 			"flow_control_enabled",
1562 			CTLTYPE_INT|CTLFLAG_RW, sc,
1563 			0, mxge_change_flow_control,
1564 			"I", "interrupt coalescing delay in usecs");
1565 
1566 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1567 		       "deassert_wait",
1568 		       CTLFLAG_RW, &mxge_deassert_wait,
1569 		       0, "Wait for IRQ line to go low in ihandler");
1570 
1571 	/* stats block from firmware is in network byte order.
1572 	   Need to swap it */
1573 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1574 			"link_up",
1575 			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
1576 			0, mxge_handle_be32,
1577 			"I", "link up");
1578 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1579 			"rdma_tags_available",
1580 			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
1581 			0, mxge_handle_be32,
1582 			"I", "rdma_tags_available");
1583 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1584 			"dropped_bad_crc32",
1585 			CTLTYPE_INT|CTLFLAG_RD,
1586 			&fw->dropped_bad_crc32,
1587 			0, mxge_handle_be32,
1588 			"I", "dropped_bad_crc32");
1589 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1590 			"dropped_bad_phy",
1591 			CTLTYPE_INT|CTLFLAG_RD,
1592 			&fw->dropped_bad_phy,
1593 			0, mxge_handle_be32,
1594 			"I", "dropped_bad_phy");
1595 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1596 			"dropped_link_error_or_filtered",
1597 			CTLTYPE_INT|CTLFLAG_RD,
1598 			&fw->dropped_link_error_or_filtered,
1599 			0, mxge_handle_be32,
1600 			"I", "dropped_link_error_or_filtered");
1601 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1602 			"dropped_link_overflow",
1603 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
1604 			0, mxge_handle_be32,
1605 			"I", "dropped_link_overflow");
1606 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1607 			"dropped_multicast_filtered",
1608 			CTLTYPE_INT|CTLFLAG_RD,
1609 			&fw->dropped_multicast_filtered,
1610 			0, mxge_handle_be32,
1611 			"I", "dropped_multicast_filtered");
1612 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1613 			"dropped_no_big_buffer",
1614 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
1615 			0, mxge_handle_be32,
1616 			"I", "dropped_no_big_buffer");
1617 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1618 			"dropped_no_small_buffer",
1619 			CTLTYPE_INT|CTLFLAG_RD,
1620 			&fw->dropped_no_small_buffer,
1621 			0, mxge_handle_be32,
1622 			"I", "dropped_no_small_buffer");
1623 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1624 			"dropped_overrun",
1625 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
1626 			0, mxge_handle_be32,
1627 			"I", "dropped_overrun");
1628 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1629 			"dropped_pause",
1630 			CTLTYPE_INT|CTLFLAG_RD,
1631 			&fw->dropped_pause,
1632 			0, mxge_handle_be32,
1633 			"I", "dropped_pause");
1634 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1635 			"dropped_runt",
1636 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
1637 			0, mxge_handle_be32,
1638 			"I", "dropped_runt");
1639 
1640 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1641 			"dropped_unicast_filtered",
1642 			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
1643 			0, mxge_handle_be32,
1644 			"I", "dropped_unicast_filtered");
1645 
1646 	/* verbose printing? */
1647 	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1648 		       "verbose",
1649 		       CTLFLAG_RW, &mxge_verbose,
1650 		       0, "verbose printing");
1651 
1652 	/* lro */
1653 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
1654 			"lro_cnt",
1655 			CTLTYPE_INT|CTLFLAG_RW, sc,
1656 			0, mxge_change_lro,
1657 			"I", "number of lro merge queues");
1658 
1659 
1660 	/* add counters exported for debugging from all slices */
1661 	sysctl_ctx_init(&sc->slice_sysctl_ctx);
1662 	sc->slice_sysctl_tree =
1663 		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
1664 				"slice", CTLFLAG_RD, 0, "");
1665 
1666 	for (slice = 0; slice < sc->num_slices; slice++) {
1667 		ss = &sc->ss[slice];
1668 		sysctl_ctx_init(&ss->sysctl_ctx);
1669 		ctx = &ss->sysctl_ctx;
1670 		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1671 		sprintf(slice_num, "%d", slice);
1672 		ss->sysctl_tree =
1673 			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
1674 					CTLFLAG_RD, 0, "");
1675 		children = SYSCTL_CHILDREN(ss->sysctl_tree);
1676 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1677 			       "rx_small_cnt",
1678 			       CTLFLAG_RD, &ss->rx_small.cnt,
1679 			       0, "rx_small_cnt");
1680 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1681 			       "rx_big_cnt",
1682 			       CTLFLAG_RD, &ss->rx_big.cnt,
1683 			       0, "rx_small_cnt");
1684 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1685 			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
1686 			       0, "number of lro merge queues flushed");
1687 
1688 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1689 			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1690 			       0, "number of frames appended to lro merge"
1691 			       "queues");
1692 
1693 #ifndef IFNET_BUF_RING
1694 		/* only transmit from slice 0 for now */
1695 		if (slice > 0)
1696 			continue;
1697 #endif
1698 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1699 			       "tx_req",
1700 			       CTLFLAG_RD, &ss->tx.req,
1701 			       0, "tx_req");
1702 
1703 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1704 			       "tx_done",
1705 			       CTLFLAG_RD, &ss->tx.done,
1706 			       0, "tx_done");
1707 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1708 			       "tx_pkt_done",
1709 			       CTLFLAG_RD, &ss->tx.pkt_done,
1710 			       0, "tx_done");
1711 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1712 			       "tx_stall",
1713 			       CTLFLAG_RD, &ss->tx.stall,
1714 			       0, "tx_stall");
1715 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1716 			       "tx_wake",
1717 			       CTLFLAG_RD, &ss->tx.wake,
1718 			       0, "tx_wake");
1719 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1720 			       "tx_defrag",
1721 			       CTLFLAG_RD, &ss->tx.defrag,
1722 			       0, "tx_defrag");
1723 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1724 			       "tx_queue_active",
1725 			       CTLFLAG_RD, &ss->tx.queue_active,
1726 			       0, "tx_queue_active");
1727 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1728 			       "tx_activate",
1729 			       CTLFLAG_RD, &ss->tx.activate,
1730 			       0, "tx_activate");
1731 		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
1732 			       "tx_deactivate",
1733 			       CTLFLAG_RD, &ss->tx.deactivate,
1734 			       0, "tx_deactivate");
1735 	}
1736 }
1737 
1738 /* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1739    backwards one at a time and handle ring wraps */
1740 
1741 static inline void
1742 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1743 			    mcp_kreq_ether_send_t *src, int cnt)
1744 {
1745         int idx, starting_slot;
1746         starting_slot = tx->req;
1747         while (cnt > 1) {
1748                 cnt--;
1749                 idx = (starting_slot + cnt) & tx->mask;
1750                 mxge_pio_copy(&tx->lanai[idx],
1751 			      &src[cnt], sizeof(*src));
1752                 wmb();
1753         }
1754 }
1755 
1756 /*
1757  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
1758  * at most 32 bytes at a time, so as to avoid involving the software
1759  * pio handler in the nic.   We re-write the first segment's flags
1760  * to mark them valid only after writing the entire chain
1761  */
1762 
/*
 * Copy a chain of send descriptors into the NIC's lanai ring.  The
 * first descriptor's flags byte is cleared before the copy and only
 * rewritten (making the descriptor valid) after the whole chain is
 * in place, so the NIC never consumes a partially-written chain.
 * wmb() calls bound each PIO burst to 32 bytes.
 */
static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	/* hold back the valid flags until the entire chain is written */
	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy forwards, two descriptors at a time */
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        wmb();
}
1811 
1812 #if IFCAP_TSO4
1813 
1814 static void
1815 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1816 	       int busdma_seg_cnt, int ip_off)
1817 {
1818 	mxge_tx_ring_t *tx;
1819 	mcp_kreq_ether_send_t *req;
1820 	bus_dma_segment_t *seg;
1821 	struct ip *ip;
1822 	struct tcphdr *tcp;
1823 	uint32_t low, high_swapped;
1824 	int len, seglen, cum_len, cum_len_next;
1825 	int next_is_first, chop, cnt, rdma_count, small;
1826 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1827 	uint8_t flags, flags_next;
1828 	static int once;
1829 
1830 	mss = m->m_pkthdr.tso_segsz;
1831 
1832 	/* negative cum_len signifies to the
1833 	 * send loop that we are still in the
1834 	 * header portion of the TSO packet.
1835 	 */
1836 
1837 	/* ensure we have the ethernet, IP and TCP
1838 	   header together in the first mbuf, copy
1839 	   it to a scratch buffer if not */
1840 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1841 		m_copydata(m, 0, ip_off + sizeof (*ip),
1842 			   ss->scratch);
1843 		ip = (struct ip *)(ss->scratch + ip_off);
1844 	} else {
1845 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1846 	}
1847 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1848 			    + sizeof (*tcp))) {
1849 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1850 			   + sizeof (*tcp),  ss->scratch);
1851 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1852 	}
1853 
1854 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1855 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1856 	cksum_offset = ip_off + (ip->ip_hl << 2);
1857 
1858 	/* TSO implies checksum offload on this hardware */
1859 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP)) == 0)) {
1860 		/*
1861 		 * If packet has full TCP csum, replace it with pseudo hdr
1862 		 * sum that the NIC expects, otherwise the NIC will emit
1863 		 * packets with bad TCP checksums.
1864 		 */
1865 		m->m_pkthdr.csum_flags = CSUM_TCP;
1866 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1867 		tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1868 			htons(IPPROTO_TCP + (m->m_pkthdr.len - cksum_offset)));
1869 	}
1870 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1871 
1872 
1873 	/* for TSO, pseudo_hdr_offset holds mss.
1874 	 * The firmware figures out where to put
1875 	 * the checksum by parsing the header. */
1876 	pseudo_hdr_offset = htobe16(mss);
1877 
1878 	tx = &ss->tx;
1879 	req = tx->req_list;
1880 	seg = tx->seg_list;
1881 	cnt = 0;
1882 	rdma_count = 0;
1883 	/* "rdma_count" is the number of RDMAs belonging to the
1884 	 * current packet BEFORE the current send request. For
1885 	 * non-TSO packets, this is equal to "count".
1886 	 * For TSO packets, rdma_count needs to be reset
1887 	 * to 0 after a segment cut.
1888 	 *
1889 	 * The rdma_count field of the send request is
1890 	 * the number of RDMAs of the packet starting at
1891 	 * that request. For TSO send requests with one ore more cuts
1892 	 * in the middle, this is the number of RDMAs starting
1893 	 * after the last cut in the request. All previous
1894 	 * segments before the last cut implicitly have 1 RDMA.
1895 	 *
1896 	 * Since the number of RDMAs is not known beforehand,
1897 	 * it must be filled-in retroactively - after each
1898 	 * segmentation cut or at the end of the entire packet.
1899 	 */
1900 
1901 	while (busdma_seg_cnt) {
1902 		/* Break the busdma segment up into pieces*/
1903 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1904 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1905 		len = seg->ds_len;
1906 
1907 		while (len) {
1908 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1909 			seglen = len;
1910 			cum_len_next = cum_len + seglen;
1911 			(req-rdma_count)->rdma_count = rdma_count + 1;
1912 			if (__predict_true(cum_len >= 0)) {
1913 				/* payload */
1914 				chop = (cum_len_next > mss);
1915 				cum_len_next = cum_len_next % mss;
1916 				next_is_first = (cum_len_next == 0);
1917 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1918 				flags_next |= next_is_first *
1919 					MXGEFW_FLAGS_FIRST;
1920 				rdma_count |= -(chop | next_is_first);
1921 				rdma_count += chop & !next_is_first;
1922 			} else if (cum_len_next >= 0) {
1923 				/* header ends */
1924 				rdma_count = -1;
1925 				cum_len_next = 0;
1926 				seglen = -cum_len;
1927 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1928 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1929 					MXGEFW_FLAGS_FIRST |
1930 					(small * MXGEFW_FLAGS_SMALL);
1931 			    }
1932 
1933 			req->addr_high = high_swapped;
1934 			req->addr_low = htobe32(low);
1935 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1936 			req->pad = 0;
1937 			req->rdma_count = 1;
1938 			req->length = htobe16(seglen);
1939 			req->cksum_offset = cksum_offset;
1940 			req->flags = flags | ((cum_len & 1) *
1941 					      MXGEFW_FLAGS_ALIGN_ODD);
1942 			low += seglen;
1943 			len -= seglen;
1944 			cum_len = cum_len_next;
1945 			flags = flags_next;
1946 			req++;
1947 			cnt++;
1948 			rdma_count++;
1949 			if (__predict_false(cksum_offset > seglen))
1950 				cksum_offset -= seglen;
1951 			else
1952 				cksum_offset = 0;
1953 			if (__predict_false(cnt > tx->max_desc))
1954 				goto drop;
1955 		}
1956 		busdma_seg_cnt--;
1957 		seg++;
1958 	}
1959 	(req-rdma_count)->rdma_count = rdma_count;
1960 
1961 	do {
1962 		req--;
1963 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1964 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1965 
1966 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1967 	mxge_submit_req(tx, tx->req_list, cnt);
1968 #ifdef IFNET_BUF_RING
1969 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1970 		/* tell the NIC to start polling this slice */
1971 		*tx->send_go = 1;
1972 		tx->queue_active = 1;
1973 		tx->activate++;
1974 		wmb();
1975 	}
1976 #endif
1977 	return;
1978 
1979 drop:
1980 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1981 	m_freem(m);
1982 	ss->oerrors++;
1983 	if (!once) {
1984 		printf("tx->max_desc exceeded via TSO!\n");
1985 		printf("mss = %d, %ld, %d!\n", mss,
1986 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1987 		once = 1;
1988 	}
1989 	return;
1990 
1991 }
1992 
1993 #endif /* IFCAP_TSO4 */
1994 
#ifdef MXGE_NEW_VLAN_API
/*
 * We reproduce the software vlan tag insertion from
 * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
 * vlan tag insertion. We need to advertise this in order to have the
 * vlan interface respect our csum offload flags.
 *
 * Returns the (possibly reallocated) mbuf chain with the 802.1Q
 * header inserted and M_VLANTAG cleared, or NULL if allocation
 * failed (the chain is freed by M_PREPEND/m_pullup in that case).
 */
static struct mbuf *
mxge_vlan_tag_insert(struct mbuf *m)
{
	struct ether_vlan_header *evl;

	/* make room for the 4-byte 802.1Q encapsulation */
	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
	if (__predict_false(m == NULL))
		return NULL;
	/* ensure the whole VLAN header is contiguous */
	if (m->m_len < sizeof(*evl)) {
		m = m_pullup(m, sizeof(*evl));
		if (__predict_false(m == NULL))
			return NULL;
	}
	/*
	 * Transform the Ethernet header into an Ethernet header
	 * with 802.1Q encapsulation.
	 */
	evl = mtod(m, struct ether_vlan_header *);
	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
	m->m_flags &= ~M_VLANTAG;
	return m;
}
#endif /* MXGE_NEW_VLAN_API */
2028 
/*
 * Map an outgoing mbuf chain for DMA and post it to this slice's send
 * ring as a list of send descriptors (one per DMA segment), handling
 * software vlan tag insertion, checksum-offload setup, TSO hand-off,
 * and padding of runt frames.  On any failure the mbuf is freed and
 * ss->oerrors is incremented.
 */
static void
mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
{
	mxge_softc_t *sc;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct mbuf *m_tmp;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	struct ip *ip;
	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
	uint16_t pseudo_hdr_offset;
        uint8_t flags, cksum_offset;


	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	/* byte offset of the IP header within the frame */
	ip_off = sizeof (struct ether_header);
#ifdef MXGE_NEW_VLAN_API
	if (m->m_flags & M_VLANTAG) {
		/* insert the tag in software; shifts the IP header by 4 */
		m = mxge_vlan_tag_insert(m);
		if (__predict_false(m == NULL))
			goto drop;
		ip_off += ETHER_VLAN_ENCAP_LEN;
	}
#endif
	/* (try to) map the frame for DMA */
	idx = tx->req & tx->mask;
	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
				      m, tx->seg_list, &cnt,
				      BUS_DMA_NOWAIT);
	if (__predict_false(err == EFBIG)) {
		/* Too many segments in the chain.  Try
		   to defrag */
		m_tmp = m_defrag(m, M_NOWAIT);
		if (m_tmp == NULL) {
			goto drop;
		}
		ss->tx.defrag++;
		m = m_tmp;
		err = bus_dmamap_load_mbuf_sg(tx->dmat,
					      tx->info[idx].map,
					      m, tx->seg_list, &cnt,
					      BUS_DMA_NOWAIT);
	}
	if (__predict_false(err != 0)) {
		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
			      " packet len = %d\n", err, m->m_pkthdr.len);
		goto drop;
	}
	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
			BUS_DMASYNC_PREWRITE);
	tx->info[idx].m = m;

#if IFCAP_TSO4
	/* TSO is different enough, we handle it in another routine */
	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
		mxge_encap_tso(ss, m, cnt, ip_off);
		return;
	}
#endif

	req = tx->req_list;
	cksum_offset = 0;
	pseudo_hdr_offset = 0;
	flags = MXGEFW_FLAGS_NO_TSO;

	/* checksum offloading? */
	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
		/* ensure ip header is in first mbuf, copy
		   it to a scratch buffer if not */
		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
			m_copydata(m, 0, ip_off + sizeof (*ip),
				   ss->scratch);
			ip = (struct ip *)(ss->scratch + ip_off);
		} else {
			ip = (struct ip *)(mtod(m, char *) + ip_off);
		}
		/* checksumming starts right after the IP header */
		cksum_offset = ip_off + (ip->ip_hl << 2);
		/* frame offset of the transport checksum field
		   (csum_data is relative to the checksum start) */
		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
		req->cksum_offset = cksum_offset;
		flags |= MXGEFW_FLAGS_CKSUM;
		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
	} else {
		odd_flag = 0;
	}
	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
		flags |= MXGEFW_FLAGS_SMALL;

	/* convert segments into a request list */
	cum_len = 0;
	seg = tx->seg_list;
	req->flags = MXGEFW_FLAGS_FIRST;
	for (i = 0; i < cnt; i++) {
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		req->length = htobe16(seg->ds_len);
		req->cksum_offset = cksum_offset;
		/* the checksum start may lie beyond this segment; keep
		   the remaining offset relative to each segment start */
		if (cksum_offset > seg->ds_len)
			cksum_offset -= seg->ds_len;
		else
			cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		/* ALIGN_ODD only when the segment starts at an odd
		   byte offset within the frame */
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cum_len += seg->ds_len;
		seg++;
		req++;
		req->flags = 0;
	}
	req--;
	/* pad runts to 60 bytes */
	if (cum_len < 60) {
		/* append one descriptor pointing at the shared zero pad */
		req++;
		req->addr_low =
			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
		req->length = htobe16(60 - cum_len);
		req->cksum_offset = 0;
		req->pseudo_hdr_offset = pseudo_hdr_offset;
		req->pad = 0; /* complete solid 16-byte block */
		req->rdma_count = 1;
		req->flags |= flags | ((cum_len & 1) * odd_flag);
		cnt++;
	}

	/* first descriptor's rdma_count carries the total count */
	tx->req_list[0].rdma_count = cnt;
#if 0
	/* print what the firmware will see */
	for (i = 0; i < cnt; i++) {
		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
		    "cso:%d, flags:0x%x, rdma:%d\n",
		    i, (int)ntohl(tx->req_list[i].addr_high),
		    (int)ntohl(tx->req_list[i].addr_low),
		    (int)ntohs(tx->req_list[i].length),
		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
		    tx->req_list[i].rdma_count);
	}
	printf("--------------\n");
#endif
	/* mark the slot that ends this packet, for mxge_tx_done() */
	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
	mxge_submit_req(tx, tx->req_list, cnt);
#ifdef IFNET_BUF_RING
	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
		/* tell the NIC to start polling this slice */
		*tx->send_go = 1;
		tx->queue_active = 1;
		tx->activate++;
		wmb();
	}
#endif
	return;

drop:
	m_freem(m);
	ss->oerrors++;
	return;
}
2195 
2196 #ifdef IFNET_BUF_RING
2197 static void
2198 mxge_qflush(struct ifnet *ifp)
2199 {
2200 	mxge_softc_t *sc = ifp->if_softc;
2201 	mxge_tx_ring_t *tx;
2202 	struct mbuf *m;
2203 	int slice;
2204 
2205 	for (slice = 0; slice < sc->num_slices; slice++) {
2206 		tx = &sc->ss[slice].tx;
2207 		mtx_lock(&tx->mtx);
2208 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2209 			m_freem(m);
2210 		mtx_unlock(&tx->mtx);
2211 	}
2212 	if_qflush(ifp);
2213 }
2214 
/*
 * Drain this slice's buf_ring into the NIC while transmit descriptors
 * remain.  If the ring fills while packets are still queued, set the
 * per-slice OACTIVE flag and count a stall.  Called with tx->mtx held.
 */
static inline void
mxge_start_locked(struct mxge_slice_state *ss)
{
	mxge_softc_t *sc;
	struct mbuf *m;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	/* stop when fewer than max_desc descriptors remain free */
	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
		m = drbr_dequeue(ifp, tx->br);
		if (m == NULL) {
			return;
		}
		/* let BPF see it */
		BPF_MTAP(ifp, m);

		/* give it to the nic */
		mxge_encap(ss, m);
	}
	/* ran out of transmit slots */
	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
	    && (!drbr_empty(ifp, tx->br))) {
		ss->if_drv_flags |= IFF_DRV_OACTIVE;
		tx->stall++;
	}
}
2245 
/*
 * Transmit or queue one mbuf on this slice; called with tx->mtx held.
 * Sends directly to the NIC when the buf_ring is empty and enough
 * descriptors are free, otherwise enqueues; finally drains anything
 * still queued via mxge_start_locked().  Returns 0 or an enqueue errno.
 */
static int
mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	mxge_tx_ring_t *tx;
	int err;

	sc = ss->sc;
	ifp = sc->ifp;
	tx = &ss->tx;

	/* interface not running, or slice stalled: just queue it */
	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
	    IFF_DRV_RUNNING) {
		err = drbr_enqueue(ifp, tx->br, m);
		return (err);
	}

	/* fast path: nothing queued ahead of us and room in the ring */
	if (!drbr_needs_enqueue(ifp, tx->br) &&
	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
		/* let BPF see it */
		BPF_MTAP(ifp, m);
		/* give it to the nic */
		mxge_encap(ss, m);
	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
		return (err);
	}
	if (!drbr_empty(ifp, tx->br))
		mxge_start_locked(ss);
	return (0);
}
2277 
2278 static int
2279 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2280 {
2281 	mxge_softc_t *sc = ifp->if_softc;
2282 	struct mxge_slice_state *ss;
2283 	mxge_tx_ring_t *tx;
2284 	int err = 0;
2285 	int slice;
2286 
2287 	slice = m->m_pkthdr.flowid;
2288 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2289 
2290 	ss = &sc->ss[slice];
2291 	tx = &ss->tx;
2292 
2293 	if (mtx_trylock(&tx->mtx)) {
2294 		err = mxge_transmit_locked(ss, m);
2295 		mtx_unlock(&tx->mtx);
2296 	} else {
2297 		err = drbr_enqueue(ifp, tx->br, m);
2298 	}
2299 
2300 	return (err);
2301 }
2302 
2303 #else
2304 
2305 static inline void
2306 mxge_start_locked(struct mxge_slice_state *ss)
2307 {
2308 	mxge_softc_t *sc;
2309 	struct mbuf *m;
2310 	struct ifnet *ifp;
2311 	mxge_tx_ring_t *tx;
2312 
2313 	sc = ss->sc;
2314 	ifp = sc->ifp;
2315 	tx = &ss->tx;
2316 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2317 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2318 		if (m == NULL) {
2319 			return;
2320 		}
2321 		/* let BPF see it */
2322 		BPF_MTAP(ifp, m);
2323 
2324 		/* give it to the nic */
2325 		mxge_encap(ss, m);
2326 	}
2327 	/* ran out of transmit slots */
2328 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2329 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2330 		tx->stall++;
2331 	}
2332 }
2333 #endif
2334 static void
2335 mxge_start(struct ifnet *ifp)
2336 {
2337 	mxge_softc_t *sc = ifp->if_softc;
2338 	struct mxge_slice_state *ss;
2339 
2340 	/* only use the first slice for now */
2341 	ss = &sc->ss[0];
2342 	mtx_lock(&ss->tx.mtx);
2343 	mxge_start_locked(ss);
2344 	mtx_unlock(&ss->tx.mtx);
2345 }
2346 
2347 /*
2348  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2349  * at most 32 bytes at a time, so as to avoid involving the software
2350  * pio handler in the nic.   We re-write the first segment's low
2351  * DMA address to mark it valid only after we write the entire chunk
2352  * in a burst
2353  */
static inline void
mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
		mcp_kreq_ether_recv_t *src)
{
	uint32_t low;

	/* temporarily invalidate the first descriptor's low address... */
	low = src->addr_low;
	src->addr_low = 0xffffffff;
	/* ...copy all 8 descriptors as two 32-byte bursts... */
	mxge_pio_copy(dst, src, 4 * sizeof (*src));
	wmb();
	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	wmb();
	/* ...then rewrite the real address to mark the chunk valid */
	src->addr_low = low;
	dst->addr_low = low;
	wmb();
}
2370 
/*
 * Allocate and DMA-map a small (MHLEN) receive mbuf for ring slot idx,
 * recording its bus address in the shadow ring.  Every completed group
 * of 8 slots is posted to the NIC -- even after a failure, in which
 * case the shadow entry still holds the previous buffer's address.
 * Returns 0, ENOBUFS, or a DMA-load errno.
 */
static int
mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg;
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_small;
	int cnt, err;

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = MHLEN;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      &seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));

done:
	/* post to the NIC in bursts of 8 descriptors */
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
	return err;
}
2403 
/*
 * Allocate and DMA-map a jumbo-cluster receive mbuf for ring slot idx,
 * recording each DMA segment's bus address in the shadow ring (one
 * slot per segment when MXGE_VIRT_JUMBOS).  Completed groups of 8
 * descriptors are posted to the NIC, even after a failure (the shadow
 * then still holds the previous buffer's address).  Returns 0 or errno.
 */
static int
mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
{
	bus_dma_segment_t seg[3];
	struct mbuf *m;
	mxge_rx_ring_t *rx = &ss->rx_big;
	int cnt, err, i;

	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
	if (m == NULL) {
		rx->alloc_fail++;
		err = ENOBUFS;
		goto done;
	}
	m->m_len = rx->mlen;
	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
				      seg, &cnt, BUS_DMA_NOWAIT);
	if (err != 0) {
		m_free(m);
		goto done;
	}
	rx->info[idx].m = m;
	rx->shadow[idx].addr_low =
		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
	rx->shadow[idx].addr_high =
		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));

#if MXGE_VIRT_JUMBOS
	/* additional segments each occupy their own ring slot */
	for (i = 1; i < cnt; i++) {
		rx->shadow[idx + i].addr_low =
			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
		rx->shadow[idx + i].addr_high =
			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
       }
#endif

done:
       /* post each group of 8 slots this buffer completes */
       for (i = 0; i < rx->nbufs; i++) {
		if ((idx & 7) == 7) {
			mxge_submit_8rx(&rx->lanai[idx - 7],
					&rx->shadow[idx - 7]);
		}
		idx++;
	}
	return err;
}
2450 
2451 /*
2452  *  Myri10GE hardware checksums are not valid if the sender
2453  *  padded the frame with non-zero padding.  This is because
2454  *  the firmware just does a simple 16-bit 1s complement
2455  *  checksum across the entire frame, excluding the first 14
2456  *  bytes.  It is best to simply to check the checksum and
2457  *  tell the stack about it only if the checksum is good
2458  */
2459 
/*
 * Verify the firmware's partial receive checksum for an IPv4 TCP/UDP
 * frame by folding in the pseudo-header.  Returns 0 when the checksum
 * is good, non-zero otherwise (including frames we don't handle).
 */
static inline uint16_t
mxge_rx_csum(struct mbuf *m, int csum)
{
	struct ether_header *eh;
	struct ip *ip;
	uint16_t c;

	eh = mtod(m, struct ether_header *);

	/* only deal with IPv4 TCP & UDP for now */
	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
		return 1;
	ip = (struct ip *)(eh + 1);
	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
			    ip->ip_p != IPPROTO_UDP))
		return 1;
#ifdef INET
	/* fold the pseudo-header into the hardware sum; the IP header
	   length is subtracted since the pseudo-header covers only the
	   transport payload length */
	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		      htonl(ntohs(csum) + ntohs(ip->ip_len) +
			    - (ip->ip_hl << 2) + ip->ip_p));
#else
	c = 1;
#endif
	c ^= 0xffff;	/* becomes 0 iff the checksum verified */
	return (c);
}
2486 
/*
 * Strip the 802.1Q header from a received frame: save the tag into
 * the mbuf packet header (or an m_tag on the old API), adjust the
 * firmware's partial checksum for the 4 removed bytes, and slide the
 * ethernet addresses over the encapsulation.
 */
static void
mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
{
	struct ether_vlan_header *evl;
	struct ether_header *eh;
	uint32_t partial;

	evl = mtod(m, struct ether_vlan_header *);
	eh = mtod(m, struct ether_header *);

	/*
	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
	 * after what the firmware thought was the end of the ethernet
	 * header.
	 */

	/* put checksum into host byte order */
	*csum = ntohs(*csum);
	/* the 4 bytes being removed, read as one 32-bit word */
	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
	/* ones-complement subtract, then fold carries back into 16 bits */
	(*csum) += ~partial;
	(*csum) +=  ((*csum) < ~partial);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);

	/* restore checksum to network byte order;
	   later consumers expect this */
	*csum = htons(*csum);

	/* save the tag */
#ifdef MXGE_NEW_VLAN_API
	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
#else
	{
		struct m_tag *mtag;
		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
				   M_NOWAIT);
		if (mtag == NULL)
			return;
		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
		m_tag_prepend(m, mtag);
	}

#endif
	m->m_flags |= M_VLANTAG;

	/*
	 * Remove the 802.1q header by copying the Ethernet
	 * addresses over it and adjusting the beginning of
	 * the data in the mbuf.  The encapsulated Ethernet
	 * type field is already in place.
	 */
	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
	m_adj(m, ETHER_VLAN_ENCAP_LEN);
}
2542 
2543 
/*
 * Handle one received frame from the big-buffer ring: replace the
 * buffer, swap DMA maps, strip any vlan tag, validate the checksum
 * (attempting LRO first), and pass the frame up the stack.  If the
 * replacement allocation fails the frame is dropped and the old
 * buffer stays posted.
 */
static inline void
mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct mbuf *m;
	struct ether_header *eh;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_big;
	idx = rx->cnt & rx->mask;
	rx->cnt += rx->nbufs;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2608 
/*
 * Handle one received frame from the small-buffer ring; mirrors
 * mxge_rx_done_big() but replaces a single MHLEN buffer and advances
 * the ring count by one.
 */
static inline void
mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
{
	mxge_softc_t *sc;
	struct ifnet *ifp;
	struct ether_header *eh;
	struct mbuf *m;
	mxge_rx_ring_t *rx;
	bus_dmamap_t old_map;
	int idx;
	uint16_t tcpudp_csum;

	sc = ss->sc;
	ifp = sc->ifp;
	rx = &ss->rx_small;
	idx = rx->cnt & rx->mask;
	rx->cnt++;
	/* save a pointer to the received mbuf */
	m = rx->info[idx].m;
	/* try to replace the received mbuf */
	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
		/* drop the frame -- the old mbuf is re-cycled */
		ifp->if_ierrors++;
		return;
	}

	/* unmap the received buffer */
	old_map = rx->info[idx].map;
	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
	bus_dmamap_unload(rx->dmat, old_map);

	/* swap the bus_dmamap_t's */
	rx->info[idx].map = rx->extra_map;
	rx->extra_map = old_map;

	/* mcp implicitly skips 1st 2 bytes so that packet is properly
	 * aligned */
	m->m_data += MXGEFW_PAD;

	m->m_pkthdr.rcvif = ifp;
	m->m_len = m->m_pkthdr.len = len;
	ss->ipackets++;
	eh = mtod(m, struct ether_header *);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		mxge_vlan_tag_remove(m, &csum);
	}
	/* if the checksum is valid, mark it in the mbuf header */
	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
			return;
		/* otherwise, it was a UDP frame, or a TCP frame which
		   we could not do LRO on.  Tell the stack that the
		   checksum is good */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
	}
	/* flowid only valid if RSS hashing is enabled */
	if (sc->num_slices > 1) {
		m->m_pkthdr.flowid = (ss - sc->ss);
		m->m_flags |= M_FLOWID;
	}
	/* pass the frame up the stack */
	(*ifp->if_input)(ifp, m);
}
2673 
/*
 * Drain this slice's receive completion ring, dispatching each entry
 * to the small- or big-buffer handler based on frame length, then
 * flush any active LRO sessions.  Processing is bounded to half the
 * ring per call to limit potential for livelock.
 */
static inline void
mxge_clean_rx_done(struct mxge_slice_state *ss)
{
	mxge_rx_done_t *rx_done = &ss->rx_done;
	int limit = 0;
	uint16_t length;
	uint16_t checksum;


	/* a non-zero length marks a valid (unconsumed) entry */
	while (rx_done->entry[rx_done->idx].length != 0) {
		length = ntohs(rx_done->entry[rx_done->idx].length);
		rx_done->entry[rx_done->idx].length = 0;
		checksum = rx_done->entry[rx_done->idx].checksum;
		if (length <= (MHLEN - MXGEFW_PAD))
			mxge_rx_done_small(ss, length, checksum);
		else
			mxge_rx_done_big(ss, length, checksum);
		rx_done->cnt++;
		rx_done->idx = rx_done->cnt & rx_done->mask;

		/* limit potential for livelock */
		if (__predict_false(++limit > rx_done->mask / 2))
			break;
	}
#ifdef INET
	while (!SLIST_EMPTY(&ss->lro_active)) {
		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
		SLIST_REMOVE_HEAD(&ss->lro_active, next);
		mxge_lro_flush(ss, lro);
	}
#endif
}
2706 
2707 
2708 static inline void
2709 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2710 {
2711 	struct ifnet *ifp;
2712 	mxge_tx_ring_t *tx;
2713 	struct mbuf *m;
2714 	bus_dmamap_t map;
2715 	int idx;
2716 	int *flags;
2717 
2718 	tx = &ss->tx;
2719 	ifp = ss->sc->ifp;
2720 	while (tx->pkt_done != mcp_idx) {
2721 		idx = tx->done & tx->mask;
2722 		tx->done++;
2723 		m = tx->info[idx].m;
2724 		/* mbuf and DMA map only attached to the first
2725 		   segment per-mbuf */
2726 		if (m != NULL) {
2727 			ss->obytes += m->m_pkthdr.len;
2728 			if (m->m_flags & M_MCAST)
2729 				ss->omcasts++;
2730 			ss->opackets++;
2731 			tx->info[idx].m = NULL;
2732 			map = tx->info[idx].map;
2733 			bus_dmamap_unload(tx->dmat, map);
2734 			m_freem(m);
2735 		}
2736 		if (tx->info[idx].flag) {
2737 			tx->info[idx].flag = 0;
2738 			tx->pkt_done++;
2739 		}
2740 	}
2741 
2742 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2743            its OK to send packets */
2744 #ifdef IFNET_BUF_RING
2745 	flags = &ss->if_drv_flags;
2746 #else
2747 	flags = &ifp->if_drv_flags;
2748 #endif
2749 	mtx_lock(&ss->tx.mtx);
2750 	if ((*flags) & IFF_DRV_OACTIVE &&
2751 	    tx->req - tx->done < (tx->mask + 1)/4) {
2752 		*(flags) &= ~IFF_DRV_OACTIVE;
2753 		ss->tx.wake++;
2754 		mxge_start_locked(ss);
2755 	}
2756 #ifdef IFNET_BUF_RING
2757 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2758 		/* let the NIC stop polling this queue, since there
2759 		 * are no more transmits pending */
2760 		if (tx->req == tx->done) {
2761 			*tx->send_stop = 1;
2762 			tx->queue_active = 0;
2763 			tx->deactivate++;
2764 			wmb();
2765 		}
2766 	}
2767 #endif
2768 	mtx_unlock(&ss->tx.mtx);
2769 
2770 }
2771 
/*
 * XFP module compliance bits mapped to ifmedia types; entries with a
 * zero media type have no FreeBSD equivalent.  NOTE(review): bit
 * layout presumably follows the XFP MSA 10GbE compliance byte read
 * in mxge_media_probe() -- confirm against the MSA specification.
 */
static struct mxge_media_type mxge_xfp_media_types[] =
{
	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
	{0,		(1 << 5),	"10GBASE-ER"},
	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
	{0,		(1 << 3),	"10GBASE-SW"},
	{0,		(1 << 2),	"10GBASE-LW"},
	{0,		(1 << 1),	"10GBASE-EW"},
	{0,		(1 << 0),	"Reserved"}
};
/*
 * SFP+ module compliance bits mapped to ifmedia types.  The first
 * entry (bitmask 0) is the fallback used when no compliance bit
 * matches.  NOTE(review): presumably byte 3 of the SFF-8472 ID data
 * (see mxge_media_probe()) -- confirm against the SFF spec.
 */
static struct mxge_media_type mxge_sfp_media_types[] =
{
	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
	{0,		(1 << 7),	"Reserved"},
	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
};
2793 
2794 static void
2795 mxge_media_set(mxge_softc_t *sc, int media_type)
2796 {
2797 
2798 
2799 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2800 		    0, NULL);
2801 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2802 	sc->current_media = media_type;
2803 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2804 }
2805 
2806 static void
2807 mxge_media_init(mxge_softc_t *sc)
2808 {
2809 	char *ptr;
2810 	int i;
2811 
2812 	ifmedia_removeall(&sc->media);
2813 	mxge_media_set(sc, IFM_AUTO);
2814 
2815 	/*
2816 	 * parse the product code to deterimine the interface type
2817 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2818 	 * after the 3rd dash in the driver's cached copy of the
2819 	 * EEPROM's product code string.
2820 	 */
2821 	ptr = sc->product_code_string;
2822 	if (ptr == NULL) {
2823 		device_printf(sc->dev, "Missing product code\n");
2824 		return;
2825 	}
2826 
2827 	for (i = 0; i < 3; i++, ptr++) {
2828 		ptr = strchr(ptr, '-');
2829 		if (ptr == NULL) {
2830 			device_printf(sc->dev,
2831 				      "only %d dashes in PC?!?\n", i);
2832 			return;
2833 		}
2834 	}
2835 	if (*ptr == 'C' || *(ptr +1) == 'C') {
2836 		/* -C is CX4 */
2837 		sc->connector = MXGE_CX4;
2838 		mxge_media_set(sc, IFM_10G_CX4);
2839 	} else if (*ptr == 'Q') {
2840 		/* -Q is Quad Ribbon Fiber */
2841 		sc->connector = MXGE_QRF;
2842 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2843 		/* FreeBSD has no media type for Quad ribbon fiber */
2844 	} else if (*ptr == 'R') {
2845 		/* -R is XFP */
2846 		sc->connector = MXGE_XFP;
2847 	} else if (*ptr == 'S' || *(ptr +1) == 'S') {
2848 		/* -S or -2S is SFP+ */
2849 		sc->connector = MXGE_SFP;
2850 	} else {
2851 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2852 	}
2853 }
2854 
2855 /*
2856  * Determine the media type for a NIC.  Some XFPs will identify
2857  * themselves only when their link is up, so this is initiated via a
2858  * link up interrupt.  However, this can potentially take up to
2859  * several milliseconds, so it is run via the watchdog routine, rather
2860  * than in the interrupt handler itself.
2861  */
static void
mxge_media_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *cage_type;

	struct mxge_media_type *mxge_media_types = NULL;
	int i, err, ms, mxge_media_type_entries;
	uint32_t byte;

	sc->need_media_probe = 0;

	/* pick the compliance-bit table and I2C byte for the cage type */
	if (sc->connector == MXGE_XFP) {
		/* -R is XFP */
		mxge_media_types = mxge_xfp_media_types;
		mxge_media_type_entries =
			sizeof (mxge_xfp_media_types) /
			sizeof (mxge_xfp_media_types[0]);
		byte = MXGE_XFP_COMPLIANCE_BYTE;
		cage_type = "XFP";
	} else 	if (sc->connector == MXGE_SFP) {
		/* -S or -2S is SFP+ */
		mxge_media_types = mxge_sfp_media_types;
		mxge_media_type_entries =
			sizeof (mxge_sfp_media_types) /
			sizeof (mxge_sfp_media_types[0]);
		cage_type = "SFP+";
		byte = 3;
	} else {
		/* nothing to do; media type cannot change */
		return;
	}

	/*
	 * At this point we know the NIC has an XFP cage, so now we
	 * try to determine what is in the cage by using the
	 * firmware's XFP I2C commands to read the XFP 10GbE compilance
	 * register.  We read just one byte, which may take over
	 * a millisecond
	 */

	/* kick off the asynchronous I2C read in the firmware */
	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
	cmd.data1 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
		device_printf(sc->dev, "failed to read XFP\n");
	}
	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
	}
	if (err != MXGEFW_CMD_OK) {
		return;
	}

	/* now we wait for the data to be cached */
	cmd.data0 = byte;
	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	/* poll up to 50ms in 1ms steps while the firmware is busy */
	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
		DELAY(1000);
		cmd.data0 = byte;
		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
	}
	if (err != MXGEFW_CMD_OK) {
		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
			      cage_type, err, ms);
		return;
	}

	/* entry 0 is the exact-match / fallback entry for this table */
	if (cmd.data0 == mxge_media_types[0].bitmask) {
		if (mxge_verbose)
			device_printf(sc->dev, "%s:%s\n", cage_type,
				      mxge_media_types[0].name);
		if (sc->current_media != mxge_media_types[0].flag) {
			mxge_media_init(sc);
			mxge_media_set(sc, mxge_media_types[0].flag);
		}
		return;
	}
	/* otherwise match individual compliance bits */
	for (i = 1; i < mxge_media_type_entries; i++) {
		if (cmd.data0 & mxge_media_types[i].bitmask) {
			if (mxge_verbose)
				device_printf(sc->dev, "%s:%s\n",
					      cage_type,
					      mxge_media_types[i].name);

			if (sc->current_media != mxge_media_types[i].flag) {
				mxge_media_init(sc);
				mxge_media_set(sc, mxge_media_types[i].flag);
			}
			return;
		}
	}
	if (mxge_verbose)
		device_printf(sc->dev, "%s media 0x%x unknown\n",
			      cage_type, cmd.data0);

	return;
}
2960 
/*
 * Per-slice interrupt handler (slice 0 also services legacy irqs).
 * Reaps transmit completions and received packets until the firmware
 * marks the interrupt handled, updates link state and firmware error
 * stats (meaningful on the first slice only), and returns the irq
 * credits to the NIC.
 */
static void
mxge_intr(void *arg)
{
	struct mxge_slice_state *ss = arg;
	mxge_softc_t *sc = ss->sc;
	mcp_irq_data_t *stats = ss->fw_stats;
	mxge_tx_ring_t *tx = &ss->tx;
	mxge_rx_done_t *rx_done = &ss->rx_done;
	uint32_t send_done_count;
	uint8_t valid;


#ifndef IFNET_BUF_RING
	/* an interrupt on a non-zero slice is implicitly valid
	   since MSI-X irqs are not shared */
	if (ss != sc->ss) {
		mxge_clean_rx_done(ss);
		*ss->irq_claim = be32toh(3);
		return;
	}
#endif

	/* make sure the DMA has finished */
	if (!stats->valid) {
		return;
	}
	valid = stats->valid;

	if (sc->legacy_irq) {
		/* lower legacy IRQ  */
		*sc->irq_deassert = 0;
		if (!mxge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
	} else {
		stats->valid = 0;
	}

	/* loop while waiting for legacy irq deassertion */
	do {
		/* check for transmit completes and receives */
		send_done_count = be32toh(stats->send_done_count);
		while ((send_done_count != tx->pkt_done) ||
		       (rx_done->entry[rx_done->idx].length != 0)) {
			if (send_done_count != tx->pkt_done)
				mxge_tx_done(ss, (int)send_done_count);
			mxge_clean_rx_done(ss);
			send_done_count = be32toh(stats->send_done_count);
		}
		if (sc->legacy_irq && mxge_deassert_wait)
			wmb();
	} while (*((volatile uint8_t *) &stats->valid));

	/* fw link & error stats meaningful only on the first slice */
	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
		if (sc->link_state != stats->link_up) {
			sc->link_state = stats->link_up;
			if (sc->link_state) {
				if_link_state_change(sc->ifp, LINK_STATE_UP);
				if_initbaudrate(sc->ifp, IF_Gbps(10));
				if (mxge_verbose)
					device_printf(sc->dev, "link up\n");
			} else {
				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
				sc->ifp->if_baudrate = 0;
				if (mxge_verbose)
					device_printf(sc->dev, "link down\n");
			}
			/* XFPs may only identify when link comes up */
			sc->need_media_probe = 1;
		}
		if (sc->rdma_tags_available !=
		    be32toh(stats->rdma_tags_available)) {
			sc->rdma_tags_available =
				be32toh(stats->rdma_tags_available);
			device_printf(sc->dev, "RDMA timed out! %d tags "
				      "left\n", sc->rdma_tags_available);
		}

		if (stats->link_down) {
			sc->down_cnt += stats->link_down;
			sc->link_state = 0;
			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
		}
	}

	/* check to see if we have rx token to pass back */
	if (valid & 0x1)
	    *ss->irq_claim = be32toh(3);
	*(ss->irq_claim + 1) = be32toh(3);
}
3051 
3052 static void
3053 mxge_init(void *arg)
3054 {
3055 	mxge_softc_t *sc = arg;
3056 	struct ifnet *ifp = sc->ifp;
3057 
3058 
3059 	mtx_lock(&sc->driver_mtx);
3060 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3061 		(void) mxge_open(sc);
3062 	mtx_unlock(&sc->driver_mtx);
3063 }
3064 
3065 
3066 
3067 static void
3068 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3069 {
3070 	struct lro_entry *lro_entry;
3071 	int i;
3072 
3073 	while (!SLIST_EMPTY(&ss->lro_free)) {
3074 		lro_entry = SLIST_FIRST(&ss->lro_free);
3075 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3076 		free(lro_entry, M_DEVBUF);
3077 	}
3078 
3079 	for (i = 0; i <= ss->rx_big.mask; i++) {
3080 		if (ss->rx_big.info[i].m == NULL)
3081 			continue;
3082 		bus_dmamap_unload(ss->rx_big.dmat,
3083 				  ss->rx_big.info[i].map);
3084 		m_freem(ss->rx_big.info[i].m);
3085 		ss->rx_big.info[i].m = NULL;
3086 	}
3087 
3088 	for (i = 0; i <= ss->rx_small.mask; i++) {
3089 		if (ss->rx_small.info[i].m == NULL)
3090 			continue;
3091 		bus_dmamap_unload(ss->rx_small.dmat,
3092 				  ss->rx_small.info[i].map);
3093 		m_freem(ss->rx_small.info[i].m);
3094 		ss->rx_small.info[i].m = NULL;
3095 	}
3096 
3097 	/* transmit ring used only on the first slice */
3098 	if (ss->tx.info == NULL)
3099 		return;
3100 
3101 	for (i = 0; i <= ss->tx.mask; i++) {
3102 		ss->tx.info[i].flag = 0;
3103 		if (ss->tx.info[i].m == NULL)
3104 			continue;
3105 		bus_dmamap_unload(ss->tx.dmat,
3106 				  ss->tx.info[i].map);
3107 		m_freem(ss->tx.info[i].m);
3108 		ss->tx.info[i].m = NULL;
3109 	}
3110 }
3111 
3112 static void
3113 mxge_free_mbufs(mxge_softc_t *sc)
3114 {
3115 	int slice;
3116 
3117 	for (slice = 0; slice < sc->num_slices; slice++)
3118 		mxge_free_slice_mbufs(&sc->ss[slice]);
3119 }
3120 
/*
 * Free all per-slice ring resources: the rx completion DMA block, the
 * tx request copy block and segment list, the rx shadow rings, and the
 * host info rings together with their busdma maps and tags.  For each
 * info ring the per-slot maps (and the rx extra_map) are destroyed
 * before the tag itself.  Safe to call on a partially-initialized
 * slice; every pointer is NULL-checked and cleared afterwards.
 */
static void
mxge_free_slice_rings(struct mxge_slice_state *ss)
{
	int i;


	if (ss->rx_done.entry != NULL)
		mxge_dma_free(&ss->rx_done.dma);
	ss->rx_done.entry = NULL;

	if (ss->tx.req_bytes != NULL)
		free(ss->tx.req_bytes, M_DEVBUF);
	ss->tx.req_bytes = NULL;

	if (ss->tx.seg_list != NULL)
		free(ss->tx.seg_list, M_DEVBUF);
	ss->tx.seg_list = NULL;

	if (ss->rx_small.shadow != NULL)
		free(ss->rx_small.shadow, M_DEVBUF);
	ss->rx_small.shadow = NULL;

	if (ss->rx_big.shadow != NULL)
		free(ss->rx_big.shadow, M_DEVBUF);
	ss->rx_big.shadow = NULL;

	/* tx ring: destroy per-slot dmamaps, then the tag, then the info array */
	if (ss->tx.info != NULL) {
		if (ss->tx.dmat != NULL) {
			for (i = 0; i <= ss->tx.mask; i++) {
				bus_dmamap_destroy(ss->tx.dmat,
						   ss->tx.info[i].map);
			}
			bus_dma_tag_destroy(ss->tx.dmat);
		}
		free(ss->tx.info, M_DEVBUF);
	}
	ss->tx.info = NULL;

	/* small rx ring: per-slot maps plus the spare extra_map */
	if (ss->rx_small.info != NULL) {
		if (ss->rx_small.dmat != NULL) {
			for (i = 0; i <= ss->rx_small.mask; i++) {
				bus_dmamap_destroy(ss->rx_small.dmat,
						   ss->rx_small.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_small.dmat,
					   ss->rx_small.extra_map);
			bus_dma_tag_destroy(ss->rx_small.dmat);
		}
		free(ss->rx_small.info, M_DEVBUF);
	}
	ss->rx_small.info = NULL;

	/* big rx ring: per-slot maps plus the spare extra_map */
	if (ss->rx_big.info != NULL) {
		if (ss->rx_big.dmat != NULL) {
			for (i = 0; i <= ss->rx_big.mask; i++) {
				bus_dmamap_destroy(ss->rx_big.dmat,
						   ss->rx_big.info[i].map);
			}
			bus_dmamap_destroy(ss->rx_big.dmat,
					   ss->rx_big.extra_map);
			bus_dma_tag_destroy(ss->rx_big.dmat);
		}
		free(ss->rx_big.info, M_DEVBUF);
	}
	ss->rx_big.info = NULL;
}
3187 
3188 static void
3189 mxge_free_rings(mxge_softc_t *sc)
3190 {
3191 	int slice;
3192 
3193 	for (slice = 0; slice < sc->num_slices; slice++)
3194 		mxge_free_slice_rings(&sc->ss[slice]);
3195 }
3196 
/*
 * Allocate all per-slice ring resources for rx_ring_entries receive
 * slots and tx_ring_entries transmit slots (both powers of two; masks
 * are entries - 1): shadow rings, host info rings, busdma tags and
 * per-slot dmamaps for small/big rx and (first slice only, unless
 * IFNET_BUF_RING) tx, plus the 8-byte-aligned tx request copy block
 * and segment list.
 *
 * Returns 0 on success or an errno.  On failure the caller is
 * expected to clean up via mxge_free_slice_rings(); nothing is freed
 * here.
 */
static int
mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
		       int tx_ring_entries)
{
	mxge_softc_t *sc = ss->sc;
	size_t bytes;
	int err, i;

	err = ENOMEM;

	/* allocate per-slice receive resources */

	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
	/* completion ring holds entries for both rx rings */
	ss->rx_done.mask = (2 * rx_ring_entries) - 1;

	/* allocate the rx shadow rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_small.shadow == NULL)
		return err;

	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_big.shadow == NULL)
		return err;

	/* allocate the rx host info rings */
	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_small.info == NULL)
		return err;

	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->rx_big.info == NULL)
		return err;

	/* allocate the rx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 4096,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 MHLEN,			/* maxsize */
				 1,			/* num segs */
				 MHLEN,			/* maxsegsize */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_small.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
			      err);
		return err;
	}

	/*
	 * Big rx tag: with MXGE_VIRT_JUMBOS a jumbo frame may span up
	 * to three page-sized segments; otherwise a single 9KB cluster
	 * segment is used.
	 */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
#if MXGE_VIRT_JUMBOS
				 4096,			/* boundary */
#else
				 0,			/* boundary */
#endif
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 3*4096,		/* maxsize */
#if MXGE_VIRT_JUMBOS
				 3,			/* num segs */
				 4096,			/* maxsegsize*/
#else
				 1,			/* num segs */
				 MJUM9BYTES,		/* maxsegsize*/
#endif
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->rx_big.dmat);	/* tag */
	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
			      err);
		return err;
	}
	/* one dmamap per small rx slot, plus a spare for swap-on-refill */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		err = bus_dmamap_create(ss->rx_small.dmat, 0,
					&ss->rx_small.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_small.dmat, 0,
				&ss->rx_small.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
			      err);
		return err;
	}

	/* one dmamap per big rx slot, plus a spare */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		err = bus_dmamap_create(ss->rx_big.dmat, 0,
					&ss->rx_big.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
				      err);
			return err;
		}
	}
	err = bus_dmamap_create(ss->rx_big.dmat, 0,
				&ss->rx_big.extra_map);
	if (err != 0) {
		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
			      err);
		return err;
	}

	/* now allocate TX resources */

#ifndef IFNET_BUF_RING
	/* only use a single TX ring for now */
	if (ss != ss->sc->ss)
		return 0;
#endif

	ss->tx.mask = tx_ring_entries - 1;
	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);


	/* allocate the tx request copy block */
	bytes = 8 +
		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
	if (ss->tx.req_bytes == NULL)
		return err;
	/* ensure req_list entries are aligned to 8 bytes */
	ss->tx.req_list = (mcp_kreq_ether_send_t *)
		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);

	/* allocate the tx busdma segment list */
	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
	ss->tx.seg_list = (bus_dma_segment_t *)
		malloc(bytes, M_DEVBUF, M_WAITOK);
	if (ss->tx.seg_list == NULL)
		return err;

	/* allocate the tx host info ring */
	bytes = tx_ring_entries * sizeof (*ss->tx.info);
	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
	if (ss->tx.info == NULL)
		return err;

	/* allocate the tx busdma resources */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 1,			/* alignment */
				 sc->tx_boundary,	/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 ss->tx.max_desc - 2,	/* num segs */
				 sc->tx_boundary,	/* maxsegsz */
				 BUS_DMA_ALLOCNOW,	/* flags */
				 NULL, NULL,		/* lock */
				 &ss->tx.dmat);		/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating tx dmat\n",
			      err);
		return err;
	}

	/* now use these tags to setup dmamaps for each slot
	   in the ring */
	for (i = 0; i <= ss->tx.mask; i++) {
		err = bus_dmamap_create(ss->tx.dmat, 0,
					&ss->tx.info[i].map);
		if (err != 0) {
			device_printf(sc->dev, "Err %d  tx dmamap\n",
				      err);
			return err;
		}
	}
	return 0;

}
3382 
/*
 * Query the firmware for the send ring size, size the interface send
 * queue to match, and allocate ring resources for every slice.  On
 * any failure all partially-allocated rings are freed and the errno
 * is returned.
 */
static int
mxge_alloc_rings(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int tx_ring_size;
	int tx_ring_entries, rx_ring_entries;
	int err, slice;

	/* get ring sizes */
	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
	tx_ring_size = cmd.data0;
	if (err != 0) {
		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
		goto abort;
	}

	/* convert byte sizes to descriptor counts */
	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
	IFQ_SET_READY(&sc->ifp->if_snd);

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_alloc_slice_rings(&sc->ss[slice],
					     rx_ring_entries,
					     tx_ring_entries);
		if (err != 0)
			goto abort;
	}
	return 0;

abort:
	mxge_free_rings(sc);
	return err;

}
3419 
3420 
3421 static void
3422 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3423 {
3424 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3425 
3426 	if (bufsize < MCLBYTES) {
3427 		/* easy, everything fits in a single buffer */
3428 		*big_buf_size = MCLBYTES;
3429 		*cl_size = MCLBYTES;
3430 		*nbufs = 1;
3431 		return;
3432 	}
3433 
3434 	if (bufsize < MJUMPAGESIZE) {
3435 		/* still easy, everything still fits in a single buffer */
3436 		*big_buf_size = MJUMPAGESIZE;
3437 		*cl_size = MJUMPAGESIZE;
3438 		*nbufs = 1;
3439 		return;
3440 	}
3441 #if MXGE_VIRT_JUMBOS
3442 	/* now we need to use virtually contiguous buffers */
3443 	*cl_size = MJUM9BYTES;
3444 	*big_buf_size = 4096;
3445 	*nbufs = mtu / 4096 + 1;
3446 	/* needs to be a power of two, so round up */
3447 	if (*nbufs == 3)
3448 		*nbufs = 4;
3449 #else
3450 	*cl_size = MJUM9BYTES;
3451 	*big_buf_size = MJUM9BYTES;
3452 	*nbufs = 1;
3453 #endif
3454 }
3455 
/*
 * Bring one slice online: pre-allocate its LRO descriptor pool, fetch
 * the lanai (NIC sram) pointers for its send/receive rings from the
 * firmware, then stock the small and big receive rings with mbufs.
 * Big ring slots are first poisoned with 0xffffffff addresses so the
 * NIC never DMAs into an unstocked slot.  Returns 0, EIO (firmware
 * query failed) or ENOMEM (ring could not be fully stocked).
 */
static int
mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
{
	mxge_softc_t *sc;
	mxge_cmd_t cmd;
	bus_dmamap_t map;
	struct lro_entry *lro_entry;
	int err, i, slice;


	sc = ss->sc;
	slice = ss - sc->ss;

	SLIST_INIT(&ss->lro_free);
	SLIST_INIT(&ss->lro_active);

	/* pre-allocate LRO descriptors; trim lro_cnt if memory runs out */
	for (i = 0; i < sc->lro_cnt; i++) {
		lro_entry = (struct lro_entry *)
			malloc(sizeof (*lro_entry), M_DEVBUF,
			       M_NOWAIT | M_ZERO);
		if (lro_entry == NULL) {
			sc->lro_cnt = i;
			break;
		}
		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
	}
	/* get the lanai pointers to the send and receive rings */

	err = 0;
#ifndef IFNET_BUF_RING
	/* We currently only send from the first slice */
	if (slice == 0) {
#endif
		cmd.data0 = slice;
		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
		ss->tx.lanai =
			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
		ss->tx.send_go = (volatile uint32_t *)
			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
		ss->tx.send_stop = (volatile uint32_t *)
		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
#ifndef IFNET_BUF_RING
	}
#endif
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc,
			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
	ss->rx_small.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
	cmd.data0 = slice;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
	ss->rx_big.lanai =
		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);

	if (err != 0) {
		device_printf(sc->dev,
			      "failed to get ring sizes or locations\n");
		return EIO;
	}

	/* stock receive rings */
	for (i = 0; i <= ss->rx_small.mask; i++) {
		map = ss->rx_small.info[i].map;
		err = mxge_get_buf_small(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d smalls\n",
				      i, ss->rx_small.mask + 1);
			return ENOMEM;
		}
	}
	/* poison big ring slots so the NIC cannot use them until stocked */
	for (i = 0; i <= ss->rx_big.mask; i++) {
		ss->rx_big.shadow[i].addr_low = 0xffffffff;
		ss->rx_big.shadow[i].addr_high = 0xffffffff;
	}
	ss->rx_big.nbufs = nbufs;
	ss->rx_big.cl_size = cl_size;
	/* largest receive frame: MTU plus ethernet/vlan headers and pad */
	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
	/* stock in groups of nbufs, one cluster split across nbufs slots */
	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
		map = ss->rx_big.info[i].map;
		err = mxge_get_buf_big(ss, map, i);
		if (err) {
			device_printf(sc->dev, "alloced %d/%d bigs\n",
				      i, ss->rx_big.mask + 1);
			return ENOMEM;
		}
	}
	return 0;
}
3545 
/*
 * Bring the interface fully up: reset the NIC, program the RSS
 * indirection table (multi-slice only), tell the firmware the MTU and
 * buffer sizes, hand it each slice's stats DMA block (falling back to
 * the obsolete single-stats command on old firmware, which also
 * disables multicast support), open every slice, and finally issue
 * ETHERNET_UP and mark the ifnet running.  On failure after buffer
 * allocation has begun, all mbufs are freed.  Called with the driver
 * lock held.
 */
static int
mxge_open(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int err, big_bytes, nbufs, slice, cl_size, i;
	bus_addr_t bus;
	volatile uint8_t *itable;
	struct mxge_slice_state *ss;

	/* Copy the MAC address in case it was overridden */
	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);

	err = mxge_reset(sc, 1);
	if (err != 0) {
		device_printf(sc->dev, "failed to reset\n");
		return EIO;
	}

	if (sc->num_slices > 1) {
		/* setup the indirection table */
		cmd.data0 = sc->num_slices;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
				    &cmd);

		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
				     &cmd);
		if (err != 0) {
			device_printf(sc->dev,
				      "failed to setup rss tables\n");
			return err;
		}

		/* just enable an identity mapping */
		itable = sc->sram + cmd.data0;
		for (i = 0; i < sc->num_slices; i++)
			itable[i] = (uint8_t)i;

		cmd.data0 = 1;
		cmd.data1 = mxge_rss_hash_type;
		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "failed to enable slices\n");
			return err;
		}
	}


	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);

	cmd.data0 = nbufs;
	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			    &cmd);
	/* error is only meaningful if we're trying to set
	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
	if (err && nbufs > 1) {
		device_printf(sc->dev,
			      "Failed to set alway-use-n to %d\n",
			      nbufs);
		return EIO;
	}
	/* Give the firmware the mtu and the big and small buffer
	   sizes.  The firmware wants the big buf size to be a power
	   of two. Luckily, FreeBSD's clusters are powers of two */
	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
	cmd.data0 = MHLEN - MXGEFW_PAD;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
			     &cmd);
	cmd.data0 = big_bytes;
	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	/* Now give him the pointer to the stats block */
	for (slice = 0;
#ifdef IFNET_BUF_RING
	     slice < sc->num_slices;
#else
	     slice < 1;
#endif
	     slice++) {
		ss = &sc->ss[slice];
		cmd.data0 =
			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data1 =
			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
		cmd.data2 = sizeof(struct mcp_irq_data);
		/* slice number is encoded in the upper 16 bits */
		cmd.data2 |= (slice << 16);
		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
	}

	if (err != 0) {
		/* old firmware: fall back to the obsolete stats command */
		bus = sc->ss->fw_stats_dma.bus_addr;
		bus += offsetof(struct mcp_irq_data, send_done_count);
		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
		err = mxge_send_cmd(sc,
				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
				    &cmd);
		/* Firmware cannot support multicast without STATS_DMA_V2 */
		sc->fw_multicast_support = 0;
	} else {
		sc->fw_multicast_support = 1;
	}

	if (err != 0) {
		device_printf(sc->dev, "failed to setup params\n");
		goto abort;
	}

	for (slice = 0; slice < sc->num_slices; slice++) {
		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
		if (err != 0) {
			device_printf(sc->dev, "couldn't open slice %d\n",
				      slice);
			goto abort;
		}
	}

	/* Finally, start the firmware running */
	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
	if (err) {
		device_printf(sc->dev, "Couldn't bring up link\n");
		goto abort;
	}
#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags |= IFF_DRV_RUNNING;
		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
	}
#endif
	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;

	return 0;


abort:
	mxge_free_mbufs(sc);

	return err;
}
3692 
/*
 * Take the interface down and free all ring mbufs.  Unless "down" is
 * set (the NIC is already known to be down, e.g. during a watchdog
 * reset), issue ETHERNET_DOWN and wait for the down interrupt to bump
 * sc->down_cnt, allowing up to 10x the interrupt coalescing delay.
 * Always returns 0.  Called with the driver lock held.
 */
static int
mxge_close(mxge_softc_t *sc, int down)
{
	mxge_cmd_t cmd;
	int err, old_down_cnt;
#ifdef IFNET_BUF_RING
	struct mxge_slice_state *ss;
	int slice;
#endif

#ifdef IFNET_BUF_RING
	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
	}
#endif
	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
	if (!down) {
		old_down_cnt = sc->down_cnt;
		wmb();
		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
		if (err) {
			device_printf(sc->dev,
				      "Couldn't bring down link\n");
		}
		if (old_down_cnt == sc->down_cnt) {
			/* wait for down irq */
			DELAY(10 * sc->intr_coal_delay);
		}
		wmb();
		if (old_down_cnt == sc->down_cnt) {
			device_printf(sc->dev, "never got down irq\n");
		}
	}
	mxge_free_mbufs(sc);

	return 0;
}
3731 
/*
 * Program the PCI config space bits the driver relies on: record the
 * PCIe link width, set the max read request size to 4KB in the device
 * control register (or restore the saved value after a watchdog
 * reset), and enable bus mastering and memory space access.
 */
static void
mxge_setup_cfg_space(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg;
	uint16_t cmd, lnk, pectl;

	/* find the PCIe link width and set max read request to 4KB*/
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		/* link status register: negotiated width in bits 4-9 */
		lnk = pci_read_config(dev, reg + 0x12, 2);
		sc->link_width = (lnk >> 4) & 0x3f;

		if (sc->pectl == 0) {
			/* device control: max read request field = 5 (4KB) */
			pectl = pci_read_config(dev, reg + 0x8, 2);
			pectl = (pectl & ~0x7000) | (5 << 12);
			pci_write_config(dev, reg + 0x8, pectl, 2);
			sc->pectl = pectl;
		} else {
			/* restore saved pectl after watchdog reset */
			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
		}
	}

	/* Enable DMA and Memory space access */
	pci_enable_busmaster(dev);
	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
	cmd |= PCIM_CMD_MEMEN;
	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
}
3761 
/*
 * Read the NIC's reboot status register through the vendor-specific
 * PCI capability window (usable even when the sram BAR mapping is
 * unreliable after a NIC reboot).  Returns the status value, or
 * (uint32_t)-1 if the vendor capability cannot be found.
 */
static uint32_t
mxge_read_reboot(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	uint32_t vs;

	/* find the vendor specific offset */
	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
		device_printf(sc->dev,
			      "could not find vendor specific offset\n");
		return (uint32_t)-1;
	}
	/* enable read32 mode */
	pci_write_config(dev, vs + 0x10, 0x3, 1);
	/* tell NIC which register to read */
	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
	return (pci_read_config(dev, vs + 0x14, 4));
}
3780 
/*
 * Recover from a watchdog-detected hang.  If PCI config space shows
 * the NIC rebooted (busmaster bit cleared), quiesce transmit by
 * taking every tx lock, close the interface, restore PCI config
 * space, reload the firmware, and reopen.  If the NIC did not reboot
 * nothing is reset.  On success the tick callout is re-armed; on
 * failure the interface is left down.  Called from the watchdog task
 * with the driver lock held.
 */
static void
mxge_watchdog_reset(mxge_softc_t *sc)
{
	struct pci_devinfo *dinfo;
	struct mxge_slice_state *ss;
	int err, running, s, num_tx_slices = 1;
	uint32_t reboot;
	uint16_t cmd;

	err = ENXIO;

	device_printf(sc->dev, "Watchdog reset!\n");

	/*
	 * check to see if the NIC rebooted.  If it did, then all of
	 * PCI config space has been reset, and things like the
	 * busmaster bit will be zero.  If this is the case, then we
	 * must restore PCI config space before the NIC can be used
	 * again
	 */
	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
	if (cmd == 0xffff) {
		/*
		 * maybe the watchdog caught the NIC rebooting; wait
		 * up to 100ms for it to finish.  If it does not come
		 * back, then give up
		 */
		DELAY(1000*100);
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if (cmd == 0xffff) {
			device_printf(sc->dev, "NIC disappeared!\n");
		}
	}
	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
		/* print the reboot status */
		reboot = mxge_read_reboot(sc);
		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
			      reboot);
		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
		if (running) {

			/*
			 * quiesce NIC so that TX routines will not try to
			 * xmit after restoration of BAR
			 */

			/* Mark the link as down */
			if (sc->link_state) {
				sc->link_state = 0;
				if_link_state_change(sc->ifp,
						     LINK_STATE_DOWN);
			}
#ifdef IFNET_BUF_RING
			num_tx_slices = sc->num_slices;
#endif
			/* grab all TX locks to ensure no tx  */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
				mtx_lock(&ss->tx.mtx);
			}
			/* NIC already down; skip the ETHERNET_DOWN handshake */
			mxge_close(sc, 1);
		}
		/* restore PCI configuration space */
		dinfo = device_get_ivars(sc->dev);
		pci_cfg_restore(sc->dev, dinfo);

		/* and redo any changes we made to our config space */
		mxge_setup_cfg_space(sc);

		/* reload f/w */
		err = mxge_load_firmware(sc, 0);
		if (err) {
			device_printf(sc->dev,
				      "Unable to re-load f/w\n");
		}
		if (running) {
			if (!err)
				err = mxge_open(sc);
			/* release all TX locks */
			for (s = 0; s < num_tx_slices; s++) {
				ss = &sc->ss[s];
#ifdef IFNET_BUF_RING
				mxge_start_locked(ss);
#endif
				mtx_unlock(&ss->tx.mtx);
			}
		}
		sc->watchdog_resets++;
	} else {
		device_printf(sc->dev,
			      "NIC did not reboot, not resetting\n");
		err = 0;
	}
	if (err) {
		device_printf(sc->dev, "watchdog reset failed\n");
	} else {
		/* recovered from an idle-time fault; clear the dying flag */
		if (sc->dying == 2)
			sc->dying = 0;
		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
}
3882 
/*
 * Taskqueue handler for watchdog-triggered resets; runs the reset
 * with the driver lock held so it cannot race ioctls or tick.
 */
static void
mxge_watchdog_task(void *arg, int pending)
{
	mxge_softc_t *sc = arg;


	mtx_lock(&sc->driver_mtx);
	mxge_watchdog_reset(sc);
	mtx_unlock(&sc->driver_mtx);
}
3893 
3894 static void
3895 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3896 {
3897 	tx = &sc->ss[slice].tx;
3898 	device_printf(sc->dev, "slice %d struck? ring state:\n", slice);
3899 	device_printf(sc->dev,
3900 		      "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3901 		      tx->req, tx->done, tx->queue_active);
3902 	device_printf(sc->dev, "tx.activate=%d tx.deactivate=%d\n",
3903 			      tx->activate, tx->deactivate);
3904 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3905 		      tx->pkt_done,
3906 		      be32toh(sc->ss->fw_stats->send_done_count));
3907 }
3908 
/*
 * Periodic transmit hang check, called from mxge_tick().  For each tx
 * ring, a hang is suspected when requests are outstanding and the
 * done count has not advanced since the last check.  If the NIC's
 * dropped_pause counter also stopped moving the ring is genuinely
 * stuck and a watchdog reset task is queued (returns ENXIO);
 * otherwise the stall is attributed to flow control from the link
 * partner and only a warning is printed.  Returns 0 when healthy.
 */
static int
mxge_watchdog(mxge_softc_t *sc)
{
	mxge_tx_ring_t *tx;
	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
	int i, err = 0;

	/* see if we have outstanding transmits, which
	   have been pending for more than mxge_ticks */
	for (i = 0;
#ifdef IFNET_BUF_RING
	     (i < sc->num_slices) && (err == 0);
#else
	     (i < 1) && (err == 0);
#endif
	     i++) {
		tx = &sc->ss[i].tx;
		if (tx->req != tx->done &&
		    tx->watchdog_req != tx->watchdog_done &&
		    tx->done == tx->watchdog_done) {
			/* check for pause blocking before resetting */
			if (tx->watchdog_rx_pause == rx_pause) {
				mxge_warn_stuck(sc, tx, i);
				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
				return (ENXIO);
			}
			else
				device_printf(sc->dev, "Flow control blocking "
					      "xmits, check link partner\n");
		}

		/* snapshot counters for the next check */
		tx->watchdog_req = tx->req;
		tx->watchdog_done = tx->done;
		tx->watchdog_rx_pause = rx_pause;
	}

	if (sc->need_media_probe)
		mxge_media_probe(sc);
	return (err);
}
3949 
3950 static u_long
3951 mxge_update_stats(mxge_softc_t *sc)
3952 {
3953 	struct mxge_slice_state *ss;
3954 	u_long pkts = 0;
3955 	u_long ipackets = 0;
3956 	u_long opackets = 0;
3957 #ifdef IFNET_BUF_RING
3958 	u_long obytes = 0;
3959 	u_long omcasts = 0;
3960 	u_long odrops = 0;
3961 #endif
3962 	u_long oerrors = 0;
3963 	int slice;
3964 
3965 	for (slice = 0; slice < sc->num_slices; slice++) {
3966 		ss = &sc->ss[slice];
3967 		ipackets += ss->ipackets;
3968 		opackets += ss->opackets;
3969 #ifdef IFNET_BUF_RING
3970 		obytes += ss->obytes;
3971 		omcasts += ss->omcasts;
3972 		odrops += ss->tx.br->br_drops;
3973 #endif
3974 		oerrors += ss->oerrors;
3975 	}
3976 	pkts = (ipackets - sc->ifp->if_ipackets);
3977 	pkts += (opackets - sc->ifp->if_opackets);
3978 	sc->ifp->if_ipackets = ipackets;
3979 	sc->ifp->if_opackets = opackets;
3980 #ifdef IFNET_BUF_RING
3981 	sc->ifp->if_obytes = obytes;
3982 	sc->ifp->if_omcasts = omcasts;
3983 	sc->ifp->if_snd.ifq_drops = odrops;
3984 #endif
3985 	sc->ifp->if_oerrors = oerrors;
3986 	return pkts;
3987 }
3988 
/*
 * Periodic callout: aggregate statistics, run the transmit watchdog
 * every 4th tick while the interface is up, and probe PCI config
 * space for a hardware fault when the NIC has been idle (queuing a
 * watchdog reset if the busmaster bit vanished).  Re-arms itself at
 * mxge_ticks (4x slower when idle) unless an error was detected, in
 * which case the watchdog task takes over re-arming.
 */
static void
mxge_tick(void *arg)
{
	mxge_softc_t *sc = arg;
	u_long pkts = 0;
	int err = 0;
	int running, ticks;
	uint16_t cmd;

	ticks = mxge_ticks;
	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
	if (running) {
		/* aggregate stats from different slices */
		pkts = mxge_update_stats(sc);
		if (!sc->watchdog_countdown) {
			err = mxge_watchdog(sc);
			sc->watchdog_countdown = 4;
		}
		sc->watchdog_countdown--;
	}
	if (pkts == 0) {
		/* ensure NIC did not suffer h/w fault while idle */
		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
			sc->dying = 2;
			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
			err = ENXIO;
		}
		/* look less often if NIC is idle */
		ticks *= 4;
	}

	if (err == 0)
		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);

}
4025 
4026 static int
4027 mxge_media_change(struct ifnet *ifp)
4028 {
4029 	return EINVAL;
4030 }
4031 
/*
 * Change the interface MTU.  Rejects MTUs whose full frame size
 * (with ethernet + vlan headers) exceeds the NIC's max or falls below
 * 60 bytes.  If the interface is running it is closed and reopened so
 * the new buffer geometry takes effect; on reopen failure the old MTU
 * is restored and a best-effort reopen is attempted.  Returns 0 or an
 * errno from mxge_open().
 */
static int
mxge_change_mtu(mxge_softc_t *sc, int mtu)
{
	struct ifnet *ifp = sc->ifp;
	int real_mtu, old_mtu;
	int err = 0;


	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
		return EINVAL;
	mtx_lock(&sc->driver_mtx);
	old_mtu = ifp->if_mtu;
	ifp->if_mtu = mtu;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc, 0);
		err = mxge_open(sc);
		if (err != 0) {
			/* reopen failed: roll back to the old MTU */
			ifp->if_mtu = old_mtu;
			mxge_close(sc, 0);
			(void) mxge_open(sc);
		}
	}
	mtx_unlock(&sc->driver_mtx);
	return err;
}
4058 
4059 static void
4060 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4061 {
4062 	mxge_softc_t *sc = ifp->if_softc;
4063 
4064 
4065 	if (sc == NULL)
4066 		return;
4067 	ifmr->ifm_status = IFM_AVALID;
4068 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4069 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4070 	ifmr->ifm_active |= sc->current_media;
4071 }
4072 
/*
 * ifnet ioctl handler.  Handles address/MTU/flags changes, multicast
 * list updates, capability toggles (checksum offload, TSO, LRO, VLAN
 * tagging/TSO) and media queries.  Most cases take the driver lock;
 * capability changes enforce that TSO requires tx checksum offload
 * and that VLAN HWTSO requires VLAN hardware tagging.  Returns 0 or
 * an errno (ENOTTY for unknown commands).
 */
static int
mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
{
	mxge_softc_t *sc = ifp->if_softc;
	struct ifreq *ifr = (struct ifreq *)data;
	int err, mask;

	err = 0;
	switch (command) {
	case SIOCSIFADDR:
	case SIOCGIFADDR:
		err = ether_ioctl(ifp, command, data);
		break;

	case SIOCSIFMTU:
		err = mxge_change_mtu(sc, ifr->ifr_mtu);
		break;

	case SIOCSIFFLAGS:
		mtx_lock(&sc->driver_mtx);
		/* refuse flag changes while the device is being detached */
		if (sc->dying) {
			mtx_unlock(&sc->driver_mtx);
			return EINVAL;
		}
		if (ifp->if_flags & IFF_UP) {
			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
				err = mxge_open(sc);
			} else {
				/* take care of promisc and allmulti
				   flag changes */
				mxge_change_promisc(sc,
						    ifp->if_flags & IFF_PROMISC);
				mxge_set_multicast_list(sc);
			}
		} else {
			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
				mxge_close(sc, 0);
			}
		}
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		mtx_lock(&sc->driver_mtx);
		mxge_set_multicast_list(sc);
		mtx_unlock(&sc->driver_mtx);
		break;

	case SIOCSIFCAP:
		mtx_lock(&sc->driver_mtx);
		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
		if (mask & IFCAP_TXCSUM) {
			if (IFCAP_TXCSUM & ifp->if_capenable) {
				/* disabling tx csum also disables TSO */
				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
						      | CSUM_TSO);
			} else {
				ifp->if_capenable |= IFCAP_TXCSUM;
				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
			}
		} else if (mask & IFCAP_RXCSUM) {
			if (IFCAP_RXCSUM & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_RXCSUM;
				sc->csum_flag = 0;
			} else {
				ifp->if_capenable |= IFCAP_RXCSUM;
				sc->csum_flag = 1;
			}
		}
		if (mask & IFCAP_TSO4) {
			if (IFCAP_TSO4 & ifp->if_capenable) {
				ifp->if_capenable &= ~IFCAP_TSO4;
				ifp->if_hwassist &= ~CSUM_TSO;
			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
				ifp->if_capenable |= IFCAP_TSO4;
				ifp->if_hwassist |= CSUM_TSO;
			} else {
				printf("mxge requires tx checksum offload"
				       " be enabled to use TSO\n");
				err = EINVAL;
			}
		}
		if (mask & IFCAP_LRO) {
			if (IFCAP_LRO & ifp->if_capenable)
				err = mxge_change_lro_locked(sc, 0);
			else
				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
		}
		if (mask & IFCAP_VLAN_HWTAGGING)
			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
		if (mask & IFCAP_VLAN_HWTSO)
			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;

		/* VLAN HWTSO is only valid with VLAN hardware tagging */
		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;

		mtx_unlock(&sc->driver_mtx);
		VLAN_CAPABILITIES(ifp);

		break;

	case SIOCGIFMEDIA:
		mtx_lock(&sc->driver_mtx);
		mxge_media_probe(sc);
		mtx_unlock(&sc->driver_mtx);
		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
				    &sc->media, command);
                break;

	default:
		err = ENOTTY;
        }
	return err;
}
4189 
/*
 * Read the hw.mxge.* loader tunables into the driver's globals and
 * per-softc fields, then clamp them to sane ranges (interrupt
 * coalescing delay, tick rate, rss hash type, initial MTU, throttle).
 */
static void
mxge_fetch_tunables(mxge_softc_t *sc)
{

	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
			  &mxge_flow_control);
	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
			  &mxge_intr_coal_delay);
	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
			  &mxge_nvidia_ecrc_enable);
	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
			  &mxge_force_firmware);
	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
			  &mxge_deassert_wait);
	TUNABLE_INT_FETCH("hw.mxge.verbose",
			  &mxge_verbose);
	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
	/* both spellings fetched; presumably one is a legacy alias — verify */
	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
	if (sc->lro_cnt != 0)
		mxge_lro_cnt = sc->lro_cnt;

	if (bootverbose)
		mxge_verbose = 1;
	/* clamp coalescing delay to [0, 10ms]; default 30us if out of range */
	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
		mxge_intr_coal_delay = 30;
	if (mxge_ticks == 0)
		mxge_ticks = hz / 2;
	sc->pause = mxge_flow_control;
	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
	}
	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
	    mxge_initial_mtu < ETHER_MIN_LEN)
		mxge_initial_mtu = ETHERMTU_JUMBO;

	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
		mxge_throttle = MXGE_MAX_THROTTLE;
	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
		mxge_throttle = MXGE_MIN_THROTTLE;
	sc->throttle = mxge_throttle;
}
4238 
4239 
4240 static void
4241 mxge_free_slices(mxge_softc_t *sc)
4242 {
4243 	struct mxge_slice_state *ss;
4244 	int i;
4245 
4246 
4247 	if (sc->ss == NULL)
4248 		return;
4249 
4250 	for (i = 0; i < sc->num_slices; i++) {
4251 		ss = &sc->ss[i];
4252 		if (ss->fw_stats != NULL) {
4253 			mxge_dma_free(&ss->fw_stats_dma);
4254 			ss->fw_stats = NULL;
4255 #ifdef IFNET_BUF_RING
4256 			if (ss->tx.br != NULL) {
4257 				drbr_free(ss->tx.br, M_DEVBUF);
4258 				ss->tx.br = NULL;
4259 			}
4260 #endif
4261 			mtx_destroy(&ss->tx.mtx);
4262 		}
4263 		if (ss->rx_done.entry != NULL) {
4264 			mxge_dma_free(&ss->rx_done.dma);
4265 			ss->rx_done.entry = NULL;
4266 		}
4267 	}
4268 	free(sc->ss, M_DEVBUF);
4269 	sc->ss = NULL;
4270 }
4271 
4272 static int
4273 mxge_alloc_slices(mxge_softc_t *sc)
4274 {
4275 	mxge_cmd_t cmd;
4276 	struct mxge_slice_state *ss;
4277 	size_t bytes;
4278 	int err, i, max_intr_slots;
4279 
4280 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4281 	if (err != 0) {
4282 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4283 		return err;
4284 	}
4285 	sc->rx_ring_size = cmd.data0;
4286 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
4287 
4288 	bytes = sizeof (*sc->ss) * sc->num_slices;
4289 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4290 	if (sc->ss == NULL)
4291 		return (ENOMEM);
4292 	for (i = 0; i < sc->num_slices; i++) {
4293 		ss = &sc->ss[i];
4294 
4295 		ss->sc = sc;
4296 
4297 		/* allocate per-slice rx interrupt queues */
4298 
4299 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4300 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4301 		if (err != 0)
4302 			goto abort;
4303 		ss->rx_done.entry = ss->rx_done.dma.addr;
4304 		bzero(ss->rx_done.entry, bytes);
4305 
4306 		/*
4307 		 * allocate the per-slice firmware stats; stats
4308 		 * (including tx) are used used only on the first
4309 		 * slice for now
4310 		 */
4311 #ifndef IFNET_BUF_RING
4312 		if (i > 0)
4313 			continue;
4314 #endif
4315 
4316 		bytes = sizeof (*ss->fw_stats);
4317 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4318 				     sizeof (*ss->fw_stats), 64);
4319 		if (err != 0)
4320 			goto abort;
4321 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4322 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4323 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4324 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4325 #ifdef IFNET_BUF_RING
4326 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4327 					   &ss->tx.mtx);
4328 #endif
4329 	}
4330 
4331 	return (0);
4332 
4333 abort:
4334 	mxge_free_slices(sc);
4335 	return (ENOMEM);
4336 }
4337 
/*
 * Decide how many slices (rx/tx queue pairs, one MSI-X vector each)
 * this card and system can support, leaving the result in
 * sc->num_slices.  Falls back to a single slice — restoring the
 * original firmware — if anything along the way fails.
 */
static void
mxge_slice_probe(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	char *old_fw;
	int msix_cnt, status, max_intr_slots;

	sc->num_slices = 1;
	/*
	 * don't enable multiple slices if they are disabled by the
	 * tunable, or if this is not an SMP system
	 */

	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
		return;

	/* see how many MSI-X interrupts are available */
	msix_cnt = pci_msix_count(sc->dev);
	if (msix_cnt < 2)
		return;

	/* now load the slice aware firmware see what it supports */
	old_fw = sc->fw_name;
	if (old_fw == mxge_fw_aligned)
		sc->fw_name = mxge_fw_rss_aligned;
	else
		sc->fw_name = mxge_fw_rss_unaligned;
	status = mxge_load_firmware(sc, 0);
	if (status != 0) {
		device_printf(sc->dev, "Falling back to a single slice\n");
		return;
	}

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		goto abort_with_fw;
	}

	/* get rx ring size */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "Cannot determine rx ring size\n");
		goto abort_with_fw;
	}
	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));

	/* tell it the size of the interrupt queues */
	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
		goto abort_with_fw;
	}

	/* ask the maximum number of slices it supports */
	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
	if (status != 0) {
		device_printf(sc->dev,
			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
		goto abort_with_fw;
	}
	sc->num_slices = cmd.data0;
	/* can't use more slices than we have MSI-X vectors */
	if (sc->num_slices > msix_cnt)
		sc->num_slices = msix_cnt;

	if (mxge_max_slices == -1) {
		/* cap to number of CPUs in system */
		if (sc->num_slices > mp_ncpus)
			sc->num_slices = mp_ncpus;
	} else {
		if (sc->num_slices > mxge_max_slices)
			sc->num_slices = mxge_max_slices;
	}
	/* make sure it is a power of two */
	while (sc->num_slices & (sc->num_slices - 1))
		sc->num_slices--;

	if (mxge_verbose)
		device_printf(sc->dev, "using %d slices\n",
			      sc->num_slices);

	return;

abort_with_fw:
	/* restore and reload the original (non-RSS) firmware */
	sc->fw_name = old_fw;
	(void) mxge_load_firmware(sc, 0);
}
4429 
4430 static int
4431 mxge_add_msix_irqs(mxge_softc_t *sc)
4432 {
4433 	size_t bytes;
4434 	int count, err, i, rid;
4435 
4436 	rid = PCIR_BAR(2);
4437 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4438 						    &rid, RF_ACTIVE);
4439 
4440 	if (sc->msix_table_res == NULL) {
4441 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4442 		return ENXIO;
4443 	}
4444 
4445 	count = sc->num_slices;
4446 	err = pci_alloc_msix(sc->dev, &count);
4447 	if (err != 0) {
4448 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d"
4449 			      "err = %d \n", sc->num_slices, err);
4450 		goto abort_with_msix_table;
4451 	}
4452 	if (count < sc->num_slices) {
4453 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4454 			      count, sc->num_slices);
4455 		device_printf(sc->dev,
4456 			      "Try setting hw.mxge.max_slices to %d\n",
4457 			      count);
4458 		err = ENOSPC;
4459 		goto abort_with_msix;
4460 	}
4461 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4462 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4463 	if (sc->msix_irq_res == NULL) {
4464 		err = ENOMEM;
4465 		goto abort_with_msix;
4466 	}
4467 
4468 	for (i = 0; i < sc->num_slices; i++) {
4469 		rid = i + 1;
4470 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4471 							  SYS_RES_IRQ,
4472 							  &rid, RF_ACTIVE);
4473 		if (sc->msix_irq_res[i] == NULL) {
4474 			device_printf(sc->dev, "couldn't allocate IRQ res"
4475 				      " for message %d\n", i);
4476 			err = ENXIO;
4477 			goto abort_with_res;
4478 		}
4479 	}
4480 
4481 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4482 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4483 
4484 	for (i = 0; i < sc->num_slices; i++) {
4485 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4486 				     INTR_TYPE_NET | INTR_MPSAFE,
4487 #if __FreeBSD_version > 700030
4488 				     NULL,
4489 #endif
4490 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4491 		if (err != 0) {
4492 			device_printf(sc->dev, "couldn't setup intr for "
4493 				      "message %d\n", i);
4494 			goto abort_with_intr;
4495 		}
4496 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4497 				  sc->msix_ih[i], "s%d", i);
4498 	}
4499 
4500 	if (mxge_verbose) {
4501 		device_printf(sc->dev, "using %d msix IRQs:",
4502 			      sc->num_slices);
4503 		for (i = 0; i < sc->num_slices; i++)
4504 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4505 		printf("\n");
4506 	}
4507 	return (0);
4508 
4509 abort_with_intr:
4510 	for (i = 0; i < sc->num_slices; i++) {
4511 		if (sc->msix_ih[i] != NULL) {
4512 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4513 					  sc->msix_ih[i]);
4514 			sc->msix_ih[i] = NULL;
4515 		}
4516 	}
4517 	free(sc->msix_ih, M_DEVBUF);
4518 
4519 
4520 abort_with_res:
4521 	for (i = 0; i < sc->num_slices; i++) {
4522 		rid = i + 1;
4523 		if (sc->msix_irq_res[i] != NULL)
4524 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4525 					     sc->msix_irq_res[i]);
4526 		sc->msix_irq_res[i] = NULL;
4527 	}
4528 	free(sc->msix_irq_res, M_DEVBUF);
4529 
4530 
4531 abort_with_msix:
4532 	pci_release_msi(sc->dev);
4533 
4534 abort_with_msix_table:
4535 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4536 			     sc->msix_table_res);
4537 
4538 	return err;
4539 }
4540 
4541 static int
4542 mxge_add_single_irq(mxge_softc_t *sc)
4543 {
4544 	int count, err, rid;
4545 
4546 	count = pci_msi_count(sc->dev);
4547 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4548 		rid = 1;
4549 	} else {
4550 		rid = 0;
4551 		sc->legacy_irq = 1;
4552 	}
4553 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4554 					 1, RF_SHAREABLE | RF_ACTIVE);
4555 	if (sc->irq_res == NULL) {
4556 		device_printf(sc->dev, "could not alloc interrupt\n");
4557 		return ENXIO;
4558 	}
4559 	if (mxge_verbose)
4560 		device_printf(sc->dev, "using %s irq %ld\n",
4561 			      sc->legacy_irq ? "INTx" : "MSI",
4562 			      rman_get_start(sc->irq_res));
4563 	err = bus_setup_intr(sc->dev, sc->irq_res,
4564 			     INTR_TYPE_NET | INTR_MPSAFE,
4565 #if __FreeBSD_version > 700030
4566 			     NULL,
4567 #endif
4568 			     mxge_intr, &sc->ss[0], &sc->ih);
4569 	if (err != 0) {
4570 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4571 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4572 		if (!sc->legacy_irq)
4573 			pci_release_msi(sc->dev);
4574 	}
4575 	return err;
4576 }
4577 
4578 static void
4579 mxge_rem_msix_irqs(mxge_softc_t *sc)
4580 {
4581 	int i, rid;
4582 
4583 	for (i = 0; i < sc->num_slices; i++) {
4584 		if (sc->msix_ih[i] != NULL) {
4585 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4586 					  sc->msix_ih[i]);
4587 			sc->msix_ih[i] = NULL;
4588 		}
4589 	}
4590 	free(sc->msix_ih, M_DEVBUF);
4591 
4592 	for (i = 0; i < sc->num_slices; i++) {
4593 		rid = i + 1;
4594 		if (sc->msix_irq_res[i] != NULL)
4595 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4596 					     sc->msix_irq_res[i]);
4597 		sc->msix_irq_res[i] = NULL;
4598 	}
4599 	free(sc->msix_irq_res, M_DEVBUF);
4600 
4601 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4602 			     sc->msix_table_res);
4603 
4604 	pci_release_msi(sc->dev);
4605 	return;
4606 }
4607 
4608 static void
4609 mxge_rem_single_irq(mxge_softc_t *sc)
4610 {
4611 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4612 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4613 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4614 	if (!sc->legacy_irq)
4615 		pci_release_msi(sc->dev);
4616 }
4617 
4618 static void
4619 mxge_rem_irq(mxge_softc_t *sc)
4620 {
4621 	if (sc->num_slices > 1)
4622 		mxge_rem_msix_irqs(sc);
4623 	else
4624 		mxge_rem_single_irq(sc);
4625 }
4626 
4627 static int
4628 mxge_add_irq(mxge_softc_t *sc)
4629 {
4630 	int err;
4631 
4632 	if (sc->num_slices > 1)
4633 		err = mxge_add_msix_irqs(sc);
4634 	else
4635 		err = mxge_add_single_irq(sc);
4636 
4637 	if (0 && err == 0 && sc->num_slices > 1) {
4638 		mxge_rem_msix_irqs(sc);
4639 		err = mxge_add_msix_irqs(sc);
4640 	}
4641 	return err;
4642 }
4643 
4644 
/*
 * Device attach: create the taskqueue and DMA tag, map the NIC,
 * parse the EEPROM strings, load firmware, allocate slices, rings
 * and interrupts, then attach the ifnet and start the clock.  On any
 * failure the goto ladder at the bottom unwinds exactly what was set
 * up before the failing step.
 */
static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp;
	int err, rid;

	sc->dev = dev;
	mxge_fetch_tunables(sc);

	/* watchdog taskqueue; its threads are started at the very end,
	   once everything the task touches exists */
	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
				  taskqueue_thread_enqueue, &sc->tq);
	if (sc->tq == NULL) {
		err = ENOMEM;
		goto abort_with_nothing;
	}

	/* parent DMA tag from which the ring/slice tags are derived */
	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
				 1,			/* alignment */
				 0,			/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 65536 + 256,		/* maxsize */
				 MXGE_MAX_SEND_DESC, 	/* num segs */
				 65536,			/* maxsegsize */
				 0,			/* flags */
				 NULL, NULL,		/* lock */
				 &sc->parent_dmat);	/* tag */

	if (err != 0) {
		device_printf(sc->dev, "Err %d allocating parent dmat\n",
			      err);
		goto abort_with_tq;
	}

	ifp = sc->ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		device_printf(dev, "can not if_alloc()\n");
		err = ENOSPC;
		goto abort_with_parent_dmat;
	}
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/* cmd_mtx guards firmware commands; driver_mtx is the main lock */
	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
		 device_get_nameunit(dev));
	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
		 "%s:drv", device_get_nameunit(dev));
	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
		 MTX_NETWORK_LOCK, MTX_DEF);

	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);

	mxge_setup_cfg_space(sc);

	/* Map the board into the kernel */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
					 ~0, 1, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto abort_with_lock;
	}
	sc->sram = rman_get_virtual(sc->mem_res);
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
			      rman_get_size(sc->mem_res));
		err = ENXIO;
		goto abort_with_mem_res;
	}

	/* make NULL terminated copy of the EEPROM strings section of
	   lanai SRAM */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
				sc->eeprom_strings,
				MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0)
		goto abort_with_mem_res;

	/* Enable write combining for efficient use of PCIe bus */
	mxge_enable_wc(sc);

	/* Allocate the out of band dma memory */
	err = mxge_dma_alloc(sc, &sc->cmd_dma,
			     sizeof (mxge_cmd_t), 64);
	if (err != 0)
		goto abort_with_mem_res;
	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0)
		goto abort_with_cmd_dma;

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0)
		goto abort_with_zeropad_dma;

	/* select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0)
		goto abort_with_dmabench;
	sc->intr_coal_delay = mxge_intr_coal_delay;

	/* determine how many slices we can use, then allocate them */
	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
	if (err != 0)
		goto abort_with_dmabench;

	err = mxge_reset(sc, 0);
	if (err != 0)
		goto abort_with_slices;

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to allocate rings\n");
		goto abort_with_slices;
	}

	err = mxge_add_irq(sc);
	if (err != 0) {
		device_printf(sc->dev, "failed to add irq\n");
		goto abort_with_rings;
	}

	/* advertise interface capabilities */
	if_initbaudrate(ifp, IF_Gbps(10));
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
		IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
#ifdef INET
	ifp->if_capabilities |= IFCAP_LRO;
#endif

#ifdef MXGE_NEW_VLAN_API
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;

	/* Only FW 1.4.32 and newer can do TSO over vlans */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 32)
		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
#endif

	sc->max_mtu = mxge_max_mtu(sc);
	if (sc->max_mtu >= 9000)
		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
	else
		device_printf(dev, "MTU limited to %d.  Install "
			      "latest firmware for 9000 byte jumbo support\n",
			      sc->max_mtu - ETHER_HDR_LEN);
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
	ifp->if_capenable = ifp->if_capabilities;
	if (sc->lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	sc->csum_flag = 1;
        ifp->if_init = mxge_init;
        ifp->if_softc = sc;
        ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
        ifp->if_ioctl = mxge_ioctl;
        ifp->if_start = mxge_start;
	/* Initialise the ifmedia structure */
	ifmedia_init(&sc->media, 0, mxge_media_change,
		     mxge_media_status);
	mxge_media_init(sc);
	mxge_media_probe(sc);
	sc->dying = 0;
	ether_ifattach(ifp, sc->mac_addr);
	/* ether_ifattach sets mtu to ETHERMTU */
	if (mxge_initial_mtu != ETHERMTU)
		mxge_change_mtu(sc, mxge_initial_mtu);

	mxge_add_sysctls(sc);
#ifdef IFNET_BUF_RING
	ifp->if_transmit = mxge_transmit;
	ifp->if_qflush = mxge_qflush;
#endif
	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
				device_get_nameunit(sc->dev));
	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	return 0;

	/* error unwind: each label frees what was acquired before the
	   corresponding goto */
abort_with_rings:
	mxge_free_rings(sc);
abort_with_slices:
	mxge_free_slices(sc);
abort_with_dmabench:
	mxge_dma_free(&sc->dmabench_dma);
abort_with_zeropad_dma:
	mxge_dma_free(&sc->zeropad_dma);
abort_with_cmd_dma:
	mxge_dma_free(&sc->cmd_dma);
abort_with_mem_res:
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
abort_with_lock:
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(ifp);
abort_with_parent_dmat:
	bus_dma_tag_destroy(sc->parent_dmat);
abort_with_tq:
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
abort_with_nothing:
	return err;
}
4858 
/*
 * Device detach: refuse while vlans are stacked on us, stop the
 * interface, then release everything mxge_attach() acquired,
 * roughly in reverse order of acquisition.
 */
static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	/* vlan interfaces on top of us must be destroyed first */
	if (mxge_vlans_active(sc)) {
		device_printf(sc->dev,
			      "Detach vlans before removing module\n");
		return EBUSY;
	}
	mtx_lock(&sc->driver_mtx);
	sc->dying = 1;	/* tells the periodic/watchdog paths to stand down */
	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
		mxge_close(sc, 0);
	mtx_unlock(&sc->driver_mtx);
	ether_ifdetach(sc->ifp);
	/* drain and destroy the taskqueue before freeing anything the
	   watchdog task might still touch */
	if (sc->tq != NULL) {
		taskqueue_drain(sc->tq, &sc->watchdog_task);
		taskqueue_free(sc->tq);
		sc->tq = NULL;
	}
	callout_drain(&sc->co_hdl);
	ifmedia_removeall(&sc->media);
	/* second arg 0 presumably disables the dummy rdma engine —
	   see mxge_dummy_rdma() */
	mxge_dummy_rdma(sc, 0);
	mxge_rem_sysctls(sc);
	mxge_rem_irq(sc);
	mxge_free_rings(sc);
	mxge_free_slices(sc);
	mxge_dma_free(&sc->dmabench_dma);
	mxge_dma_free(&sc->zeropad_dma);
	mxge_dma_free(&sc->cmd_dma);
	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
	pci_disable_busmaster(dev);
	mtx_destroy(&sc->cmd_mtx);
	mtx_destroy(&sc->driver_mtx);
	if_free(sc->ifp);
	bus_dma_tag_destroy(sc->parent_dmat);
	return 0;
}
4898 
4899 static int
4900 mxge_shutdown(device_t dev)
4901 {
4902 	return 0;
4903 }
4904 
4905 /*
4906   This file uses Myri10GE driver indentation.
4907 
4908   Local Variables:
4909   c-file-style:"linux"
4910   tab-width:8
4911   End:
4912 */
4913