/******************************************************************************

Copyright (c) 2006-2009, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>
#ifdef IFNET_BUF_RING
#include <sys/buf_ring.h>
#endif

#include "opt_inet.h"

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int mxge_always_promisc = 0;
static int mxge_initial_mtu = ETHERMTU_JUMBO;
static int mxge_throttle = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),

  DEVMETHOD_END
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc, int down);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
	int rev;


	if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
	    ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
	     (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
		rev = pci_get_revid(dev);
		switch (rev) {
		case MXGE_PCI_REV_Z8E:
			device_set_desc(dev, "Myri10G-PCIE-8A");
			break;
		case MXGE_PCI_REV_Z8ES:
			device_set_desc(dev, "Myri10G-PCIE-8B");
			break;
		default:
			device_set_desc(dev, "Myri10G-PCIE-8??");
			device_printf(dev, "Unrecognized rev %d NIC\n",
				      rev);
			break;
		}
		return 0;
	}
	return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

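	/*
	 * Page-aligned allocations larger than a page may span 4KB
	 * boundaries within their single segment; anything smaller
	 * is kept inside a single 4KB page.
	 */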
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while ((p) < limit && *(p)++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
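			/*
			 * Advancing 3 chars per pass lands on each
			 * 2-digit octet: the first hop skips the
			 * remaining "AC=", later hops skip "xx:".
			 */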
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe0000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

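	/* extended config space: 1MB per bus, 4KB per function (8 per slot) */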
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
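	/*
	 * cmd.data0 >> 16 is the number of transfers and the low 16
	 * bits the elapsed time in 0.5us ticks, so bytes * 2 / ticks
	 * yields MB/s.
	 */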
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
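		/* a value of 5 in the MRRS field (bits 14:12) selects 4KB */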
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;
	int force_firmware = mxge_force_firmware;

	if (sc->throttle)
		force_firmware = sc->throttle;

	if (force_firmware != 0) {
		if (force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
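		/* read back from SRAM to flush the posted PIO writes */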
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

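	/* align the scratch buffer on an 8-byte boundary */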
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total <  20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		case MXGEFW_CMD_ERROR_I2C_ABSENT:
			err = ENXIO;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC.  "
				"For optimal\n");
			device_printf(sc->dev,
				"performance consider loading optimized "
				"firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


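	/* pack the first 4 bytes of the MAC into data0, the last 2 into data1 */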
	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	if_maddr_rlock(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
1154 			       "%d\t", err);
			/* abort, leaving multicast filtering off */
			if_maddr_runlock(ifp);
			return;
		}
	}
	if_maddr_runlock(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD >  MXGEFW_MAX_MTU)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return  MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
#ifdef IFNET_BUF_RING
		cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
#endif
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


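	/* the coalescing delay register lives in NIC SRAM and is big-endian */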
	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.queue_active = 0;
		ss->tx.activate = 0;
		ss->tx.deactivate = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			bzero(ss->fw_stats, sizeof *ss->fw_stats);
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	if (sc->throttle) {
		cmd.data0 = sc->throttle;
		if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR,
				  &cmd)) {
			device_printf(sc->dev,
				      "can't enable throttle\n");
		}
	}
	return status;
}

static int
mxge_change_throttle(SYSCTL_HANDLER_ARGS)
{
	mxge_cmd_t cmd;
	mxge_softc_t *sc;
	int err;
	unsigned int throttle;

	sc = arg1;
	throttle = sc->throttle;
	err = sysctl_handle_int(oidp, &throttle, arg2, req);
        if (err != 0) {
                return err;
        }

	if (throttle == sc->throttle)
		return 0;

        if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	cmd.data0 = throttle;
	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
	if (err == 0)
		sc->throttle = throttle;
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
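	/* apply the new setting by restarting the interface if it is up */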
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc, 0);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
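        /*
         * Pass the byte-swapped value via arg2 with arg1 NULL so that
         * sysctl_handle_int() exports it as a read-only constant.
         */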
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
1521 		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "watchdog_resets",
		       CTLFLAG_RD, &sc->watchdog_resets,
		       0, "Number of times NIC was reset");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"throttle",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_throttle,
			"I", "transmit throttling");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
1565 			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
1684 			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
1691 			       0, "number of frames appended to lro merge"
1692 			       "queues");

#ifndef IFNET_BUF_RING
		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;
#endif
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
1711 			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_queue_active",
			       CTLFLAG_RD, &ss->tx.queue_active,
			       0, "tx_queue_active");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_activate",
			       CTLFLAG_RD, &ss->tx.activate,
			       0, "tx_activate");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_deactivate",
			       CTLFLAG_RD, &ss->tx.deactivate,
			       0, "tx_deactivate");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

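        /* fast path: the whole request fits without wrapping the send ring */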
1783         if ((idx + cnt) < tx->mask) {
1784                 for (i = 0; i < (cnt - 1); i += 2) {
1785                         mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1786                         wmb(); /* force write every 32 bytes */
1787                         srcp += 2;
1788                         dstp += 2;
1789                 }
1790         } else {
1791                 /* submit all but the first request, and ensure
1792                    that it is submitted below */
1793                 mxge_submit_req_backwards(tx, src, cnt);
1794                 i = 0;
1795         }
1796         if (i < cnt) {
1797                 /* submit the first request */
1798                 mxge_pio_copy(dstp, srcp, sizeof(*src));
1799                 wmb(); /* barrier before setting valid flag */
1800         }
1801 
1802         /* re-write the last 32-bits with the valid flags */
1803         src->flags = last_flags;
1804         src_ints = (uint32_t *)src;
1805         src_ints+=3;
1806         dst_ints = (volatile uint32_t *)dst;
1807         dst_ints+=3;
1808         *dst_ints =  *src_ints;
1809         tx->req += cnt;
1810         wmb();
1811 }
1812 
1813 #if IFCAP_TSO4
1814 
1815 static void
1816 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1817 	       int busdma_seg_cnt, int ip_off)
1818 {
1819 	mxge_tx_ring_t *tx;
1820 	mcp_kreq_ether_send_t *req;
1821 	bus_dma_segment_t *seg;
1822 	struct ip *ip;
1823 	struct tcphdr *tcp;
1824 	uint32_t low, high_swapped;
1825 	int len, seglen, cum_len, cum_len_next;
1826 	int next_is_first, chop, cnt, rdma_count, small;
1827 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1828 	uint8_t flags, flags_next;
1829 	static int once;
1830 
1831 	mss = m->m_pkthdr.tso_segsz;
1832 
1833 	/* negative cum_len signifies to the
1834 	 * send loop that we are still in the
1835 	 * header portion of the TSO packet.
1836 	 */
1837 
1838 	/* ensure we have the ethernet, IP and TCP
1839 	   header together in the first mbuf, copy
1840 	   it to a scratch buffer if not */
1841 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1842 		m_copydata(m, 0, ip_off + sizeof (*ip),
1843 			   ss->scratch);
1844 		ip = (struct ip *)(ss->scratch + ip_off);
1845 	} else {
1846 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1847 	}
1848 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1849 			    + sizeof (*tcp))) {
1850 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1851 			   + sizeof (*tcp),  ss->scratch);
1852 		ip = (struct ip *)(ss->scratch + ip_off);
1853 	}
1854 
1855 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1856 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1857 	cksum_offset = ip_off + (ip->ip_hl << 2);
1858 
1859 	/* TSO implies checksum offload on this hardware */
1860 	if (__predict_false((m->m_pkthdr.csum_flags & (CSUM_TCP)) == 0)) {
1861 		/*
1862 		 * The stack computed a full TCP csum (offload was not
1863 		 * requested), so replace it with the pseudo hdr sum that
1864 		 * the NIC expects, else the NIC will emit bad checksums.
1865 		 */
1866 		m->m_pkthdr.csum_flags = CSUM_TCP;
1867 		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1868 		tcp->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1869 			htons(IPPROTO_TCP + (m->m_pkthdr.len - cksum_offset)));
1870 	}
1871 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1872 
1873 
1874 	/* for TSO, pseudo_hdr_offset holds mss.
1875 	 * The firmware figures out where to put
1876 	 * the checksum by parsing the header. */
1877 	pseudo_hdr_offset = htobe16(mss);
1878 
1879 	tx = &ss->tx;
1880 	req = tx->req_list;
1881 	seg = tx->seg_list;
1882 	cnt = 0;
1883 	rdma_count = 0;
1884 	/* "rdma_count" is the number of RDMAs belonging to the
1885 	 * current packet BEFORE the current send request. For
1886 	 * non-TSO packets, this is equal to "count".
1887 	 * For TSO packets, rdma_count needs to be reset
1888 	 * to 0 after a segment cut.
1889 	 *
1890 	 * The rdma_count field of the send request is
1891 	 * the number of RDMAs of the packet starting at
1892 	 * that request. For TSO send requests with one or more cuts
1893 	 * in the middle, this is the number of RDMAs starting
1894 	 * after the last cut in the request. All previous
1895 	 * segments before the last cut implicitly have 1 RDMA.
1896 	 *
1897 	 * Since the number of RDMAs is not known beforehand,
1898 	 * it must be filled-in retroactively - after each
1899 	 * segmentation cut or at the end of the entire packet.
1900 	 */
1901 
1902 	while (busdma_seg_cnt) {
1903 		/* Break the busdma segment up into pieces*/
1904 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1905 		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1906 		len = seg->ds_len;
1907 
1908 		while (len) {
1909 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1910 			seglen = len;
1911 			cum_len_next = cum_len + seglen;
1912 			(req-rdma_count)->rdma_count = rdma_count + 1;
1913 			if (__predict_true(cum_len >= 0)) {
1914 				/* payload */
1915 				chop = (cum_len_next > mss);
1916 				cum_len_next = cum_len_next % mss;
1917 				next_is_first = (cum_len_next == 0);
1918 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1919 				flags_next |= next_is_first *
1920 					MXGEFW_FLAGS_FIRST;
1921 				rdma_count |= -(chop | next_is_first);
1922 				rdma_count += chop & !next_is_first;
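				/* the two lines above are a branch-free
				 * equivalent of:
				 *   if (chop || next_is_first)
				 *           rdma_count = -1;
				 *   if (chop && !next_is_first)
				 *           rdma_count = 0;
				 * so that the rdma_count++ below restarts
				 * the RDMA count at each segmentation cut */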
1923 			} else if (cum_len_next >= 0) {
1924 				/* header ends */
1925 				rdma_count = -1;
1926 				cum_len_next = 0;
1927 				seglen = -cum_len;
1928 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1929 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1930 					MXGEFW_FLAGS_FIRST |
1931 					(small * MXGEFW_FLAGS_SMALL);
1932 			}
1933 
1934 			req->addr_high = high_swapped;
1935 			req->addr_low = htobe32(low);
1936 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1937 			req->pad = 0;
1938 			req->rdma_count = 1;
1939 			req->length = htobe16(seglen);
1940 			req->cksum_offset = cksum_offset;
1941 			req->flags = flags | ((cum_len & 1) *
1942 					      MXGEFW_FLAGS_ALIGN_ODD);
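			/* the (cum_len & 1) multiply above sets
			 * MXGEFW_FLAGS_ALIGN_ODD only for descriptors
			 * whose payload starts at an odd byte offset */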
1943 			low += seglen;
1944 			len -= seglen;
1945 			cum_len = cum_len_next;
1946 			flags = flags_next;
1947 			req++;
1948 			cnt++;
1949 			rdma_count++;
1950 			if (__predict_false(cksum_offset > seglen))
1951 				cksum_offset -= seglen;
1952 			else
1953 				cksum_offset = 0;
1954 			if (__predict_false(cnt > tx->max_desc))
1955 				goto drop;
1956 		}
1957 		busdma_seg_cnt--;
1958 		seg++;
1959 	}
1960 	(req-rdma_count)->rdma_count = rdma_count;
1961 
1962 	do {
1963 		req--;
1964 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1965 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1966 
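	/* flag the last descriptor so that mxge_tx_done() counts one
	 * completed packet when it reaches this slot */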
1967 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1968 	mxge_submit_req(tx, tx->req_list, cnt);
1969 #ifdef IFNET_BUF_RING
1970 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
1971 		/* tell the NIC to start polling this slice */
1972 		*tx->send_go = 1;
1973 		tx->queue_active = 1;
1974 		tx->activate++;
1975 		wmb();
1976 	}
1977 #endif
1978 	return;
1979 
1980 drop:
1981 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1982 	m_freem(m);
1983 	ss->oerrors++;
1984 	if (!once) {
1985 		printf("tx->max_desc exceeded via TSO!\n");
1986 		printf("mss = %d, %ld, %d!\n", mss,
1987 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1988 		once = 1;
1989 	}
1990 	return;
1991 
1992 }
1993 
1994 #endif /* IFCAP_TSO4 */
1995 
1996 #ifdef MXGE_NEW_VLAN_API
1997 /*
1998  * We reproduce the software vlan tag insertion from
1999  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
2000  * vlan tag insertion. We need to advertise this in order to have the
2001  * vlan interface respect our csum offload flags.
2002  */
2003 static struct mbuf *
2004 mxge_vlan_tag_insert(struct mbuf *m)
2005 {
2006 	struct ether_vlan_header *evl;
2007 
2008 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
2009 	if (__predict_false(m == NULL))
2010 		return NULL;
2011 	if (m->m_len < sizeof(*evl)) {
2012 		m = m_pullup(m, sizeof(*evl));
2013 		if (__predict_false(m == NULL))
2014 			return NULL;
2015 	}
2016 	/*
2017 	 * Transform the Ethernet header into an Ethernet header
2018 	 * with 802.1Q encapsulation.
2019 	 */
2020 	evl = mtod(m, struct ether_vlan_header *);
2021 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
2022 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
2023 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
2024 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
2025 	m->m_flags &= ~M_VLANTAG;
2026 	return m;
2027 }
2028 #endif /* MXGE_NEW_VLAN_API */
2029 
2030 static void
2031 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
2032 {
2033 	mxge_softc_t *sc;
2034 	mcp_kreq_ether_send_t *req;
2035 	bus_dma_segment_t *seg;
2036 	struct mbuf *m_tmp;
2037 	struct ifnet *ifp;
2038 	mxge_tx_ring_t *tx;
2039 	struct ip *ip;
2040 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
2041 	uint16_t pseudo_hdr_offset;
2042         uint8_t flags, cksum_offset;
2043 
2044 
2045 	sc = ss->sc;
2046 	ifp = sc->ifp;
2047 	tx = &ss->tx;
2048 
2049 	ip_off = sizeof (struct ether_header);
2050 #ifdef MXGE_NEW_VLAN_API
2051 	if (m->m_flags & M_VLANTAG) {
2052 		m = mxge_vlan_tag_insert(m);
2053 		if (__predict_false(m == NULL))
2054 			goto drop;
2055 		ip_off += ETHER_VLAN_ENCAP_LEN;
2056 	}
2057 #endif
2058 	/* (try to) map the frame for DMA */
2059 	idx = tx->req & tx->mask;
2060 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
2061 				      m, tx->seg_list, &cnt,
2062 				      BUS_DMA_NOWAIT);
2063 	if (__predict_false(err == EFBIG)) {
2064 		/* Too many segments in the chain.  Try
2065 		   to defrag */
2066 		m_tmp = m_defrag(m, M_NOWAIT);
2067 		if (m_tmp == NULL) {
2068 			goto drop;
2069 		}
2070 		ss->tx.defrag++;
2071 		m = m_tmp;
2072 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
2073 					      tx->info[idx].map,
2074 					      m, tx->seg_list, &cnt,
2075 					      BUS_DMA_NOWAIT);
2076 	}
2077 	if (__predict_false(err != 0)) {
2078 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
2079 			      " packet len = %d\n", err, m->m_pkthdr.len);
2080 		goto drop;
2081 	}
2082 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
2083 			BUS_DMASYNC_PREWRITE);
2084 	tx->info[idx].m = m;
2085 
2086 #if IFCAP_TSO4
2087 	/* TSO is different enough, we handle it in another routine */
2088 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
2089 		mxge_encap_tso(ss, m, cnt, ip_off);
2090 		return;
2091 	}
2092 #endif
2093 
2094 	req = tx->req_list;
2095 	cksum_offset = 0;
2096 	pseudo_hdr_offset = 0;
2097 	flags = MXGEFW_FLAGS_NO_TSO;
2098 
2099 	/* checksum offloading? */
2100 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2101 		/* ensure ip header is in first mbuf, copy
2102 		   it to a scratch buffer if not */
2103 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2104 			m_copydata(m, 0, ip_off + sizeof (*ip),
2105 				   ss->scratch);
2106 			ip = (struct ip *)(ss->scratch + ip_off);
2107 		} else {
2108 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2109 		}
2110 		cksum_offset = ip_off + (ip->ip_hl << 2);
2111 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2112 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2113 		req->cksum_offset = cksum_offset;
2114 		flags |= MXGEFW_FLAGS_CKSUM;
2115 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2116 	} else {
2117 		odd_flag = 0;
2118 	}
2119 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2120 		flags |= MXGEFW_FLAGS_SMALL;
2121 
2122 	/* convert segments into a request list */
2123 	cum_len = 0;
2124 	seg = tx->seg_list;
2125 	req->flags = MXGEFW_FLAGS_FIRST;
2126 	for (i = 0; i < cnt; i++) {
2127 		req->addr_low =
2128 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2129 		req->addr_high =
2130 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2131 		req->length = htobe16(seg->ds_len);
2132 		req->cksum_offset = cksum_offset;
2133 		if (cksum_offset > seg->ds_len)
2134 			cksum_offset -= seg->ds_len;
2135 		else
2136 			cksum_offset = 0;
2137 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2138 		req->pad = 0; /* complete solid 16-byte block */
2139 		req->rdma_count = 1;
2140 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2141 		cum_len += seg->ds_len;
2142 		seg++;
2143 		req++;
2144 		req->flags = 0;
2145 	}
2146 	req--;
2147 	/* pad runts to 60 bytes */
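	/* 60 is ETHER_MIN_LEN minus the 4-byte FCS; the padding is
	 * DMAed from the driver's pre-zeroed pad buffer */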
2148 	if (cum_len < 60) {
2149 		req++;
2150 		req->addr_low =
2151 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2152 		req->addr_high =
2153 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2154 		req->length = htobe16(60 - cum_len);
2155 		req->cksum_offset = 0;
2156 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2157 		req->pad = 0; /* complete solid 16-byte block */
2158 		req->rdma_count = 1;
2159 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2160 		cnt++;
2161 	}
2162 
2163 	tx->req_list[0].rdma_count = cnt;
2164 #if 0
2165 	/* print what the firmware will see */
2166 	for (i = 0; i < cnt; i++) {
2167 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2168 		    "cso:%d, flags:0x%x, rdma:%d\n",
2169 		    i, (int)ntohl(tx->req_list[i].addr_high),
2170 		    (int)ntohl(tx->req_list[i].addr_low),
2171 		    (int)ntohs(tx->req_list[i].length),
2172 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2173 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2174 		    tx->req_list[i].rdma_count);
2175 	}
2176 	printf("--------------\n");
2177 #endif
2178 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2179 	mxge_submit_req(tx, tx->req_list, cnt);
2180 #ifdef IFNET_BUF_RING
2181 	if ((ss->sc->num_slices > 1) && tx->queue_active == 0) {
2182 		/* tell the NIC to start polling this slice */
2183 		*tx->send_go = 1;
2184 		tx->queue_active = 1;
2185 		tx->activate++;
2186 		wmb();
2187 	}
2188 #endif
2189 	return;
2190 
2191 drop:
2192 	m_freem(m);
2193 	ss->oerrors++;
2194 	return;
2195 }
2196 
2197 #ifdef IFNET_BUF_RING
2198 static void
2199 mxge_qflush(struct ifnet *ifp)
2200 {
2201 	mxge_softc_t *sc = ifp->if_softc;
2202 	mxge_tx_ring_t *tx;
2203 	struct mbuf *m;
2204 	int slice;
2205 
2206 	for (slice = 0; slice < sc->num_slices; slice++) {
2207 		tx = &sc->ss[slice].tx;
2208 		mtx_lock(&tx->mtx);
2209 		while ((m = buf_ring_dequeue_sc(tx->br)) != NULL)
2210 			m_freem(m);
2211 		mtx_unlock(&tx->mtx);
2212 	}
2213 	if_qflush(ifp);
2214 }
2215 
2216 static inline void
2217 mxge_start_locked(struct mxge_slice_state *ss)
2218 {
2219 	mxge_softc_t *sc;
2220 	struct mbuf *m;
2221 	struct ifnet *ifp;
2222 	mxge_tx_ring_t *tx;
2223 
2224 	sc = ss->sc;
2225 	ifp = sc->ifp;
2226 	tx = &ss->tx;
2227 
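	/* dequeue as long as the ring can hold a worst-case (max_desc)
	 * request list */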
2228 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2229 		m = drbr_dequeue(ifp, tx->br);
2230 		if (m == NULL) {
2231 			return;
2232 		}
2233 		/* let BPF see it */
2234 		BPF_MTAP(ifp, m);
2235 
2236 		/* give it to the nic */
2237 		mxge_encap(ss, m);
2238 	}
2239 	/* ran out of transmit slots */
2240 	if (((ss->if_drv_flags & IFF_DRV_OACTIVE) == 0)
2241 	    && (!drbr_empty(ifp, tx->br))) {
2242 		ss->if_drv_flags |= IFF_DRV_OACTIVE;
2243 		tx->stall++;
2244 	}
2245 }
2246 
2247 static int
2248 mxge_transmit_locked(struct mxge_slice_state *ss, struct mbuf *m)
2249 {
2250 	mxge_softc_t *sc;
2251 	struct ifnet *ifp;
2252 	mxge_tx_ring_t *tx;
2253 	int err;
2254 
2255 	sc = ss->sc;
2256 	ifp = sc->ifp;
2257 	tx = &ss->tx;
2258 
2259 	if ((ss->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) !=
2260 	    IFF_DRV_RUNNING) {
2261 		err = drbr_enqueue(ifp, tx->br, m);
2262 		return (err);
2263 	}
2264 
2265 	if (!drbr_needs_enqueue(ifp, tx->br) &&
2266 	    ((tx->mask - (tx->req - tx->done)) > tx->max_desc)) {
2267 		/* let BPF see it */
2268 		BPF_MTAP(ifp, m);
2269 		/* give it to the nic */
2270 		mxge_encap(ss, m);
2271 	} else if ((err = drbr_enqueue(ifp, tx->br, m)) != 0) {
2272 		return (err);
2273 	}
2274 	if (!drbr_empty(ifp, tx->br))
2275 		mxge_start_locked(ss);
2276 	return (0);
2277 }
2278 
2279 static int
2280 mxge_transmit(struct ifnet *ifp, struct mbuf *m)
2281 {
2282 	mxge_softc_t *sc = ifp->if_softc;
2283 	struct mxge_slice_state *ss;
2284 	mxge_tx_ring_t *tx;
2285 	int err = 0;
2286 	int slice;
2287 
2288 	slice = m->m_pkthdr.flowid;
2289 	slice &= (sc->num_slices - 1);  /* num_slices always power of 2 */
2290 
2291 	ss = &sc->ss[slice];
2292 	tx = &ss->tx;
2293 
2294 	if (mtx_trylock(&tx->mtx)) {
2295 		err = mxge_transmit_locked(ss, m);
2296 		mtx_unlock(&tx->mtx);
2297 	} else {
2298 		err = drbr_enqueue(ifp, tx->br, m);
2299 	}
2300 
2301 	return (err);
2302 }
2303 
2304 #else
2305 
2306 static inline void
2307 mxge_start_locked(struct mxge_slice_state *ss)
2308 {
2309 	mxge_softc_t *sc;
2310 	struct mbuf *m;
2311 	struct ifnet *ifp;
2312 	mxge_tx_ring_t *tx;
2313 
2314 	sc = ss->sc;
2315 	ifp = sc->ifp;
2316 	tx = &ss->tx;
2317 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2318 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2319 		if (m == NULL) {
2320 			return;
2321 		}
2322 		/* let BPF see it */
2323 		BPF_MTAP(ifp, m);
2324 
2325 		/* give it to the nic */
2326 		mxge_encap(ss, m);
2327 	}
2328 	/* ran out of transmit slots */
2329 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2330 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2331 		tx->stall++;
2332 	}
2333 }
2334 #endif
2335 static void
2336 mxge_start(struct ifnet *ifp)
2337 {
2338 	mxge_softc_t *sc = ifp->if_softc;
2339 	struct mxge_slice_state *ss;
2340 
2341 	/* only use the first slice for now */
2342 	ss = &sc->ss[0];
2343 	mtx_lock(&ss->tx.mtx);
2344 	mxge_start_locked(ss);
2345 	mtx_unlock(&ss->tx.mtx);
2346 }
2347 
2348 /*
2349  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2350  * at most 32 bytes at a time, so as to avoid involving the software
2351  * pio handler in the nic.   We re-write the first segment's low
2352  * DMA address to mark it valid only after we write the entire chunk
2353  * in a burst
2354  */
2355 static inline void
2356 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2357 		mcp_kreq_ether_recv_t *src)
2358 {
2359 	uint32_t low;
2360 
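	/* 0xffffffff in addr_low marks the first descriptor invalid to
	 * the firmware until the whole 8-entry chunk has been written */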
2361 	low = src->addr_low;
2362 	src->addr_low = 0xffffffff;
2363 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2364 	wmb();
2365 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2366 	wmb();
2367 	src->addr_low = low;
2368 	dst->addr_low = low;
2369 	wmb();
2370 }
2371 
2372 static int
2373 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2374 {
2375 	bus_dma_segment_t seg;
2376 	struct mbuf *m;
2377 	mxge_rx_ring_t *rx = &ss->rx_small;
2378 	int cnt, err;
2379 
2380 	m = m_gethdr(M_NOWAIT, MT_DATA);
2381 	if (m == NULL) {
2382 		rx->alloc_fail++;
2383 		err = ENOBUFS;
2384 		goto done;
2385 	}
2386 	m->m_len = MHLEN;
2387 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2388 				      &seg, &cnt, BUS_DMA_NOWAIT);
2389 	if (err != 0) {
2390 		m_free(m);
2391 		goto done;
2392 	}
2393 	rx->info[idx].m = m;
2394 	rx->shadow[idx].addr_low =
2395 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2396 	rx->shadow[idx].addr_high =
2397 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2398 
2399 done:
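	/* receive buffers are handed to the NIC in aligned bursts of
	 * 8 descriptors, so submit only on every 8th slot */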
2400 	if ((idx & 7) == 7)
2401 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2402 	return err;
2403 }
2404 
2405 static int
2406 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2407 {
2408 	bus_dma_segment_t seg[3];
2409 	struct mbuf *m;
2410 	mxge_rx_ring_t *rx = &ss->rx_big;
2411 	int cnt, err, i;
2412 
2413 	m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2414 	if (m == NULL) {
2415 		rx->alloc_fail++;
2416 		err = ENOBUFS;
2417 		goto done;
2418 	}
2419 	m->m_len = rx->mlen;
2420 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2421 				      seg, &cnt, BUS_DMA_NOWAIT);
2422 	if (err != 0) {
2423 		m_free(m);
2424 		goto done;
2425 	}
2426 	rx->info[idx].m = m;
2427 	rx->shadow[idx].addr_low =
2428 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2429 	rx->shadow[idx].addr_high =
2430 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2431 
2432 #if MXGE_VIRT_JUMBOS
2433 	for (i = 1; i < cnt; i++) {
2434 		rx->shadow[idx + i].addr_low =
2435 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2436 		rx->shadow[idx + i].addr_high =
2437 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2438 	}
2439 #endif
2440 
2441 done:
2442 	for (i = 0; i < rx->nbufs; i++) {
2443 		if ((idx & 7) == 7) {
2444 			mxge_submit_8rx(&rx->lanai[idx - 7],
2445 					&rx->shadow[idx - 7]);
2446 		}
2447 		idx++;
2448 	}
2449 	return err;
2450 }
2451 
2452 /*
2453  *  Myri10GE hardware checksums are not valid if the sender
2454  *  padded the frame with non-zero padding.  This is because
2455  *  the firmware just does a simple 16-bit 1s complement
2456  *  checksum across the entire frame, excluding the first 14
2457  *  bytes.  It is best to simply check the checksum and
2458  *  tell the stack about it only if the checksum is good
2459  */
2460 
2461 static inline uint16_t
2462 mxge_rx_csum(struct mbuf *m, int csum)
2463 {
2464 	struct ether_header *eh;
2465 	struct ip *ip;
2466 	uint16_t c;
2467 
2468 	eh = mtod(m, struct ether_header *);
2469 
2470 	/* only deal with IPv4 TCP & UDP for now */
2471 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2472 		return 1;
2473 	ip = (struct ip *)(eh + 1);
2474 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2475 			    ip->ip_p != IPPROTO_UDP))
2476 		return 1;
2477 #ifdef INET
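	/* combine the NIC's raw frame checksum with the IPv4
	 * pseudo header sum; a frame with a valid TCP/UDP checksum
	 * folds to 0xffff, so the XOR below returns zero on success */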
2478 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2479 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2480 			    (ip->ip_hl << 2) + ip->ip_p));
2481 #else
2482 	c = 1;
2483 #endif
2484 	c ^= 0xffff;
2485 	return (c);
2486 }
2487 
2488 static void
2489 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2490 {
2491 	struct ether_vlan_header *evl;
2492 	struct ether_header *eh;
2493 	uint32_t partial;
2494 
2495 	evl = mtod(m, struct ether_vlan_header *);
2496 	eh = mtod(m, struct ether_header *);
2497 
2498 	/*
2499 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2500 	 * after what the firmware thought was the end of the ethernet
2501 	 * header.
2502 	 */
2503 
2504 	/* put checksum into host byte order */
2505 	*csum = ntohs(*csum);
2506 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
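	/* subtract the 4 encapsulation bytes in one's complement
	 * arithmetic: add the complement of the 32-bit word that
	 * followed the ethernet header, propagate the carry, then fold
	 * the sum back to 16 bits (twice, since the first fold can
	 * itself carry) */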
2507 	(*csum) += ~partial;
2508 	(*csum) +=  ((*csum) < ~partial);
2509 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2510 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2511 
2512 	/* restore checksum to network byte order;
2513 	   later consumers expect this */
2514 	*csum = htons(*csum);
2515 
2516 	/* save the tag */
2517 #ifdef MXGE_NEW_VLAN_API
2518 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2519 #else
2520 	{
2521 		struct m_tag *mtag;
2522 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2523 				   M_NOWAIT);
2524 		if (mtag == NULL)
2525 			return;
2526 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2527 		m_tag_prepend(m, mtag);
2528 	}
2529 
2530 #endif
2531 	m->m_flags |= M_VLANTAG;
2532 
2533 	/*
2534 	 * Remove the 802.1q header by copying the Ethernet
2535 	 * addresses over it and adjusting the beginning of
2536 	 * the data in the mbuf.  The encapsulated Ethernet
2537 	 * type field is already in place.
2538 	 */
2539 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2540 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2541 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2542 }
2543 
2544 
2545 static inline void
2546 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2547 {
2548 	mxge_softc_t *sc;
2549 	struct ifnet *ifp;
2550 	struct mbuf *m;
2551 	struct ether_header *eh;
2552 	mxge_rx_ring_t *rx;
2553 	bus_dmamap_t old_map;
2554 	int idx;
2555 	uint16_t tcpudp_csum;
2556 
2557 	sc = ss->sc;
2558 	ifp = sc->ifp;
2559 	rx = &ss->rx_big;
2560 	idx = rx->cnt & rx->mask;
2561 	rx->cnt += rx->nbufs;
2562 	/* save a pointer to the received mbuf */
2563 	m = rx->info[idx].m;
2564 	/* try to replace the received mbuf */
2565 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2566 		/* drop the frame -- the old mbuf is re-cycled */
2567 		ifp->if_ierrors++;
2568 		return;
2569 	}
2570 
2571 	/* unmap the received buffer */
2572 	old_map = rx->info[idx].map;
2573 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2574 	bus_dmamap_unload(rx->dmat, old_map);
2575 
2576 	/* swap the bus_dmamap_t's */
2577 	rx->info[idx].map = rx->extra_map;
2578 	rx->extra_map = old_map;
2579 
2580 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2581 	 * aligned */
2582 	m->m_data += MXGEFW_PAD;
2583 
2584 	m->m_pkthdr.rcvif = ifp;
2585 	m->m_len = m->m_pkthdr.len = len;
2586 	ss->ipackets++;
2587 	eh = mtod(m, struct ether_header *);
2588 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2589 		mxge_vlan_tag_remove(m, &csum);
2590 	}
2591 	/* if the checksum is valid, mark it in the mbuf header */
2592 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2593 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2594 			return;
2595 		/* otherwise, it was a UDP frame, or a TCP frame which
2596 		   we could not do LRO on.  Tell the stack that the
2597 		   checksum is good */
2598 		m->m_pkthdr.csum_data = 0xffff;
2599 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2600 	}
2601 	/* flowid only valid if RSS hashing is enabled */
2602 	if (sc->num_slices > 1) {
2603 		m->m_pkthdr.flowid = (ss - sc->ss);
2604 		m->m_flags |= M_FLOWID;
2605 	}
2606 	/* pass the frame up the stack */
2607 	(*ifp->if_input)(ifp, m);
2608 }
2609 
2610 static inline void
2611 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2612 {
2613 	mxge_softc_t *sc;
2614 	struct ifnet *ifp;
2615 	struct ether_header *eh;
2616 	struct mbuf *m;
2617 	mxge_rx_ring_t *rx;
2618 	bus_dmamap_t old_map;
2619 	int idx;
2620 	uint16_t tcpudp_csum;
2621 
2622 	sc = ss->sc;
2623 	ifp = sc->ifp;
2624 	rx = &ss->rx_small;
2625 	idx = rx->cnt & rx->mask;
2626 	rx->cnt++;
2627 	/* save a pointer to the received mbuf */
2628 	m = rx->info[idx].m;
2629 	/* try to replace the received mbuf */
2630 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2631 		/* drop the frame -- the old mbuf is re-cycled */
2632 		ifp->if_ierrors++;
2633 		return;
2634 	}
2635 
2636 	/* unmap the received buffer */
2637 	old_map = rx->info[idx].map;
2638 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2639 	bus_dmamap_unload(rx->dmat, old_map);
2640 
2641 	/* swap the bus_dmamap_t's */
2642 	rx->info[idx].map = rx->extra_map;
2643 	rx->extra_map = old_map;
2644 
2645 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2646 	 * aligned */
2647 	m->m_data += MXGEFW_PAD;
2648 
2649 	m->m_pkthdr.rcvif = ifp;
2650 	m->m_len = m->m_pkthdr.len = len;
2651 	ss->ipackets++;
2652 	eh = mtod(m, struct ether_header *);
2653 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2654 		mxge_vlan_tag_remove(m, &csum);
2655 	}
2656 	/* if the checksum is valid, mark it in the mbuf header */
2657 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2658 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2659 			return;
2660 		/* otherwise, it was a UDP frame, or a TCP frame which
2661 		   we could not do LRO on.  Tell the stack that the
2662 		   checksum is good */
2663 		m->m_pkthdr.csum_data = 0xffff;
2664 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2665 	}
2666 	/* flowid only valid if RSS hashing is enabled */
2667 	if (sc->num_slices > 1) {
2668 		m->m_pkthdr.flowid = (ss - sc->ss);
2669 		m->m_flags |= M_FLOWID;
2670 	}
2671 	/* pass the frame up the stack */
2672 	(*ifp->if_input)(ifp, m);
2673 }
2674 
2675 static inline void
2676 mxge_clean_rx_done(struct mxge_slice_state *ss)
2677 {
2678 	mxge_rx_done_t *rx_done = &ss->rx_done;
2679 	int limit = 0;
2680 	uint16_t length;
2681 	uint16_t checksum;
2682 
2683 
2684 	while (rx_done->entry[rx_done->idx].length != 0) {
2685 		length = ntohs(rx_done->entry[rx_done->idx].length);
2686 		rx_done->entry[rx_done->idx].length = 0;
2687 		checksum = rx_done->entry[rx_done->idx].checksum;
2688 		if (length <= (MHLEN - MXGEFW_PAD))
2689 			mxge_rx_done_small(ss, length, checksum);
2690 		else
2691 			mxge_rx_done_big(ss, length, checksum);
2692 		rx_done->cnt++;
2693 		rx_done->idx = rx_done->cnt & rx_done->mask;
2694 
2695 		/* limit potential for livelock */
2696 		if (__predict_false(++limit > rx_done->mask / 2))
2697 			break;
2698 	}
2699 #ifdef INET
2700 	while (!SLIST_EMPTY(&ss->lro_active)) {
2701 		struct lro_entry *lro = SLIST_FIRST(&ss->lro_active);
2702 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2703 		mxge_lro_flush(ss, lro);
2704 	}
2705 #endif
2706 }
2707 
2708 
2709 static inline void
2710 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2711 {
2712 	struct ifnet *ifp;
2713 	mxge_tx_ring_t *tx;
2714 	struct mbuf *m;
2715 	bus_dmamap_t map;
2716 	int idx;
2717 	int *flags;
2718 
2719 	tx = &ss->tx;
2720 	ifp = ss->sc->ifp;
2721 	while (tx->pkt_done != mcp_idx) {
2722 		idx = tx->done & tx->mask;
2723 		tx->done++;
2724 		m = tx->info[idx].m;
2725 		/* mbuf and DMA map only attached to the first
2726 		   segment per-mbuf */
2727 		if (m != NULL) {
2728 			ss->obytes += m->m_pkthdr.len;
2729 			if (m->m_flags & M_MCAST)
2730 				ss->omcasts++;
2731 			ss->opackets++;
2732 			tx->info[idx].m = NULL;
2733 			map = tx->info[idx].map;
2734 			bus_dmamap_unload(tx->dmat, map);
2735 			m_freem(m);
2736 		}
2737 		if (tx->info[idx].flag) {
2738 			tx->info[idx].flag = 0;
2739 			tx->pkt_done++;
2740 		}
2741 	}
2742 
2743 	/* If the ring has drained below a quarter full, clear
2744 	   IFF_DRV_OACTIVE to tell the stack that it's OK to send packets */
2745 #ifdef IFNET_BUF_RING
2746 	flags = &ss->if_drv_flags;
2747 #else
2748 	flags = &ifp->if_drv_flags;
2749 #endif
2750 	mtx_lock(&ss->tx.mtx);
2751 	if ((*flags) & IFF_DRV_OACTIVE &&
2752 	    tx->req - tx->done < (tx->mask + 1)/4) {
2753 		*(flags) &= ~IFF_DRV_OACTIVE;
2754 		ss->tx.wake++;
2755 		mxge_start_locked(ss);
2756 	}
2757 #ifdef IFNET_BUF_RING
2758 	if ((ss->sc->num_slices > 1) && (tx->req == tx->done)) {
2759 		/* let the NIC stop polling this queue, since there
2760 		 * are no more transmits pending */
2761 		*tx->send_stop = 1;
2762 		tx->queue_active = 0;
2763 		tx->deactivate++;
2764 		wmb();
2765 	}
2768 #endif
2769 	mtx_unlock(&ss->tx.mtx);
2770 
2771 }
2772 
2773 static struct mxge_media_type mxge_xfp_media_types[] =
2774 {
2775 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2776 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2777 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2778 	{0,		(1 << 5),	"10GBASE-ER"},
2779 	{IFM_10G_LRM,	(1 << 4),	"10GBASE-LRM"},
2780 	{0,		(1 << 3),	"10GBASE-SW"},
2781 	{0,		(1 << 2),	"10GBASE-LW"},
2782 	{0,		(1 << 1),	"10GBASE-EW"},
2783 	{0,		(1 << 0),	"Reserved"}
2784 };
2785 static struct mxge_media_type mxge_sfp_media_types[] =
2786 {
2787 	{IFM_10G_TWINAX,      0,	"10GBASE-Twinax"},
2788 	{0,		(1 << 7),	"Reserved"},
2789 	{IFM_10G_LRM,	(1 << 6),	"10GBASE-LRM"},
2790 	{IFM_10G_LR, 	(1 << 5),	"10GBASE-LR"},
2791 	{IFM_10G_SR,	(1 << 4),	"10GBASE-SR"},
2792 	{IFM_10G_TWINAX,(1 << 0),	"10GBASE-Twinax"}
2793 };
2794 
2795 static void
2796 mxge_media_set(mxge_softc_t *sc, int media_type)
2797 {
2798 
2799 
2800 	ifmedia_add(&sc->media, IFM_ETHER | IFM_FDX | media_type,
2801 		    0, NULL);
2802 	ifmedia_set(&sc->media, IFM_ETHER | IFM_FDX | media_type);
2803 	sc->current_media = media_type;
2804 	sc->media.ifm_media = sc->media.ifm_cur->ifm_media;
2805 }
2806 
2807 static void
2808 mxge_media_init(mxge_softc_t *sc)
2809 {
2810 	char *ptr;
2811 	int i;
2812 
2813 	ifmedia_removeall(&sc->media);
2814 	mxge_media_set(sc, IFM_AUTO);
2815 
2816 	/*
2817 	 * parse the product code to determine the interface type
2818 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2819 	 * after the 3rd dash in the driver's cached copy of the
2820 	 * EEPROM's product code string.
2821 	 */
2822 	ptr = sc->product_code_string;
2823 	if (ptr == NULL) {
2824 		device_printf(sc->dev, "Missing product code\n");
2825 		return;
2826 	}
2827 
2828 	for (i = 0; i < 3; i++, ptr++) {
2829 		ptr = strchr(ptr, '-');
2830 		if (ptr == NULL) {
2831 			device_printf(sc->dev,
2832 				      "only %d dashes in PC?!?\n", i);
2833 			return;
2834 		}
2835 	}
2836 	if (*ptr == 'C' || *(ptr + 1) == 'C') {
2837 		/* -C is CX4 */
2838 		sc->connector = MXGE_CX4;
2839 		mxge_media_set(sc, IFM_10G_CX4);
2840 	} else if (*ptr == 'Q') {
2841 		/* -Q is Quad Ribbon Fiber */
2842 		sc->connector = MXGE_QRF;
2843 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2844 		/* FreeBSD has no media type for Quad ribbon fiber */
2845 	} else if (*ptr == 'R') {
2846 		/* -R is XFP */
2847 		sc->connector = MXGE_XFP;
2848 	} else if (*ptr == 'S' || *(ptr + 1) == 'S') {
2849 		/* -S or -2S is SFP+ */
2850 		sc->connector = MXGE_SFP;
2851 	} else {
2852 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2853 	}
2854 }
2855 
2856 /*
2857  * Determine the media type for a NIC.  Some XFPs will identify
2858  * themselves only when their link is up, so this is initiated via a
2859  * link up interrupt.  However, this can potentially take up to
2860  * several milliseconds, so it is run via the watchdog routine, rather
2861  * than in the interrupt handler itself.
2862  */
2863 static void
2864 mxge_media_probe(mxge_softc_t *sc)
2865 {
2866 	mxge_cmd_t cmd;
2867 	char *cage_type;
2868 
2869 	struct mxge_media_type *mxge_media_types = NULL;
2870 	int i, err, ms, mxge_media_type_entries;
2871 	uint32_t byte;
2872 
2873 	sc->need_media_probe = 0;
2874 
2875 	if (sc->connector == MXGE_XFP) {
2876 		/* -R is XFP */
2877 		mxge_media_types = mxge_xfp_media_types;
2878 		mxge_media_type_entries =
2879 			sizeof (mxge_xfp_media_types) /
2880 			sizeof (mxge_xfp_media_types[0]);
2881 		byte = MXGE_XFP_COMPLIANCE_BYTE;
2882 		cage_type = "XFP";
2883 	} else 	if (sc->connector == MXGE_SFP) {
2884 		/* -S or -2S is SFP+ */
2885 		mxge_media_types = mxge_sfp_media_types;
2886 		mxge_media_type_entries =
2887 			sizeof (mxge_sfp_media_types) /
2888 			sizeof (mxge_sfp_media_types[0]);
2889 		cage_type = "SFP+";
2890 		byte = 3;
2891 	} else {
2892 		/* nothing to do; media type cannot change */
2893 		return;
2894 	}
2895 
2896 	/*
2897 	 * At this point we know the NIC has an XFP or SFP+ cage, so
2898 	 * now we try to determine what is in the cage by using the
2899 	 * firmware's I2C commands to read the module's 10GbE compliance
2900 	 * register.  We read just one byte, which may take over
2901 	 * a millisecond.
2902 	 */
2903 
2904 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2905 	cmd.data1 = byte;
2906 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2907 	if (err == MXGEFW_CMD_ERROR_I2C_FAILURE) {
2908 		device_printf(sc->dev, "failed to read XFP\n");
2909 	}
2910 	if (err == MXGEFW_CMD_ERROR_I2C_ABSENT) {
2911 		device_printf(sc->dev, "Type R/S with no XFP!?!?\n");
2912 	}
2913 	if (err != MXGEFW_CMD_OK) {
2914 		return;
2915 	}
2916 
2917 	/* now we wait for the data to be cached */
2918 	cmd.data0 = byte;
2919 	err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2920 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2921 		DELAY(1000);
2922 		cmd.data0 = byte;
2923 		err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2924 	}
2925 	if (err != MXGEFW_CMD_OK) {
2926 		device_printf(sc->dev, "failed to read %s (%d, %dms)\n",
2927 			      cage_type, err, ms);
2928 		return;
2929 	}
2930 
2931 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2932 		if (mxge_verbose)
2933 			device_printf(sc->dev, "%s:%s\n", cage_type,
2934 				      mxge_media_types[0].name);
2935 		if (sc->current_media != mxge_media_types[0].flag) {
2936 			mxge_media_init(sc);
2937 			mxge_media_set(sc, mxge_media_types[0].flag);
2938 		}
2939 		return;
2940 	}
2941 	for (i = 1; i < mxge_media_type_entries; i++) {
2942 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2943 			if (mxge_verbose)
2944 				device_printf(sc->dev, "%s:%s\n",
2945 					      cage_type,
2946 					      mxge_media_types[i].name);
2947 
2948 			if (sc->current_media != mxge_media_types[i].flag) {
2949 				mxge_media_init(sc);
2950 				mxge_media_set(sc, mxge_media_types[i].flag);
2951 			}
2952 			return;
2953 		}
2954 	}
2955 	if (mxge_verbose)
2956 		device_printf(sc->dev, "%s media 0x%x unknown\n",
2957 			      cage_type, cmd.data0);
2958 
2959 	return;
2960 }
2961 
2962 static void
2963 mxge_intr(void *arg)
2964 {
2965 	struct mxge_slice_state *ss = arg;
2966 	mxge_softc_t *sc = ss->sc;
2967 	mcp_irq_data_t *stats = ss->fw_stats;
2968 	mxge_tx_ring_t *tx = &ss->tx;
2969 	mxge_rx_done_t *rx_done = &ss->rx_done;
2970 	uint32_t send_done_count;
2971 	uint8_t valid;
2972 
2973 
2974 #ifndef IFNET_BUF_RING
2975 	/* an interrupt on a non-zero slice is implicitly valid
2976 	   since MSI-X irqs are not shared */
2977 	if (ss != sc->ss) {
2978 		mxge_clean_rx_done(ss);
2979 		*ss->irq_claim = be32toh(3);
2980 		return;
2981 	}
2982 #endif
2983 
2984 	/* make sure the DMA has finished */
2985 	if (!stats->valid) {
2986 		return;
2987 	}
2988 	valid = stats->valid;
2989 
2990 	if (sc->legacy_irq) {
2991 		/* lower legacy IRQ  */
2992 		*sc->irq_deassert = 0;
2993 		if (!mxge_deassert_wait)
2994 			/* don't wait for conf. that irq is low */
2995 			stats->valid = 0;
2996 	} else {
2997 		stats->valid = 0;
2998 	}
2999 
3000 	/* loop while waiting for legacy irq deassertion */
3001 	do {
3002 		/* check for transmit completes and receives */
3003 		send_done_count = be32toh(stats->send_done_count);
3004 		while ((send_done_count != tx->pkt_done) ||
3005 		       (rx_done->entry[rx_done->idx].length != 0)) {
3006 			if (send_done_count != tx->pkt_done)
3007 				mxge_tx_done(ss, (int)send_done_count);
3008 			mxge_clean_rx_done(ss);
3009 			send_done_count = be32toh(stats->send_done_count);
3010 		}
3011 		if (sc->legacy_irq && mxge_deassert_wait)
3012 			wmb();
3013 	} while (*((volatile uint8_t *) &stats->valid));
3014 
3015 	/* fw link & error stats meaningful only on the first slice */
3016 	if (__predict_false((ss == sc->ss) && stats->stats_updated)) {
3017 		if (sc->link_state != stats->link_up) {
3018 			sc->link_state = stats->link_up;
3019 			if (sc->link_state) {
3020 				if_link_state_change(sc->ifp, LINK_STATE_UP);
3021 				if_initbaudrate(sc->ifp, IF_Gbps(10));
3022 				if (mxge_verbose)
3023 					device_printf(sc->dev, "link up\n");
3024 			} else {
3025 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3026 				sc->ifp->if_baudrate = 0;
3027 				if (mxge_verbose)
3028 					device_printf(sc->dev, "link down\n");
3029 			}
3030 			sc->need_media_probe = 1;
3031 		}
3032 		if (sc->rdma_tags_available !=
3033 		    be32toh(stats->rdma_tags_available)) {
3034 			sc->rdma_tags_available =
3035 				be32toh(stats->rdma_tags_available);
3036 			device_printf(sc->dev, "RDMA timed out! %d tags "
3037 				      "left\n", sc->rdma_tags_available);
3038 		}
3039 
3040 		if (stats->link_down) {
3041 			sc->down_cnt += stats->link_down;
3042 			sc->link_state = 0;
3043 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
3044 		}
3045 	}
3046 
3047 	/* check to see if we have rx token to pass back */
3048 	if (valid & 0x1)
3049 	    *ss->irq_claim = be32toh(3);
3050 	*(ss->irq_claim + 1) = be32toh(3);
3051 }
3052 
3053 static void
3054 mxge_init(void *arg)
3055 {
3056 	mxge_softc_t *sc = arg;
3057 	struct ifnet *ifp = sc->ifp;
3058 
3059 
3060 	mtx_lock(&sc->driver_mtx);
3061 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
3062 		(void) mxge_open(sc);
3063 	mtx_unlock(&sc->driver_mtx);
3064 }
3065 
3066 
3067 
3068 static void
3069 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
3070 {
3071 	struct lro_entry *lro_entry;
3072 	int i;
3073 
3074 	while (!SLIST_EMPTY(&ss->lro_free)) {
3075 		lro_entry = SLIST_FIRST(&ss->lro_free);
3076 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
3077 		free(lro_entry, M_DEVBUF);
3078 	}
3079 
3080 	for (i = 0; i <= ss->rx_big.mask; i++) {
3081 		if (ss->rx_big.info[i].m == NULL)
3082 			continue;
3083 		bus_dmamap_unload(ss->rx_big.dmat,
3084 				  ss->rx_big.info[i].map);
3085 		m_freem(ss->rx_big.info[i].m);
3086 		ss->rx_big.info[i].m = NULL;
3087 	}
3088 
3089 	for (i = 0; i <= ss->rx_small.mask; i++) {
3090 		if (ss->rx_small.info[i].m == NULL)
3091 			continue;
3092 		bus_dmamap_unload(ss->rx_small.dmat,
3093 				  ss->rx_small.info[i].map);
3094 		m_freem(ss->rx_small.info[i].m);
3095 		ss->rx_small.info[i].m = NULL;
3096 	}
3097 
3098 	/* transmit ring used only on the first slice, unless IFNET_BUF_RING */
3099 	if (ss->tx.info == NULL)
3100 		return;
3101 
3102 	for (i = 0; i <= ss->tx.mask; i++) {
3103 		ss->tx.info[i].flag = 0;
3104 		if (ss->tx.info[i].m == NULL)
3105 			continue;
3106 		bus_dmamap_unload(ss->tx.dmat,
3107 				  ss->tx.info[i].map);
3108 		m_freem(ss->tx.info[i].m);
3109 		ss->tx.info[i].m = NULL;
3110 	}
3111 }
3112 
3113 static void
3114 mxge_free_mbufs(mxge_softc_t *sc)
3115 {
3116 	int slice;
3117 
3118 	for (slice = 0; slice < sc->num_slices; slice++)
3119 		mxge_free_slice_mbufs(&sc->ss[slice]);
3120 }
3121 
3122 static void
3123 mxge_free_slice_rings(struct mxge_slice_state *ss)
3124 {
3125 	int i;
3126 
3127 
3128 	if (ss->rx_done.entry != NULL)
3129 		mxge_dma_free(&ss->rx_done.dma);
3130 	ss->rx_done.entry = NULL;
3131 
3132 	if (ss->tx.req_bytes != NULL)
3133 		free(ss->tx.req_bytes, M_DEVBUF);
3134 	ss->tx.req_bytes = NULL;
3135 
3136 	if (ss->tx.seg_list != NULL)
3137 		free(ss->tx.seg_list, M_DEVBUF);
3138 	ss->tx.seg_list = NULL;
3139 
3140 	if (ss->rx_small.shadow != NULL)
3141 		free(ss->rx_small.shadow, M_DEVBUF);
3142 	ss->rx_small.shadow = NULL;
3143 
3144 	if (ss->rx_big.shadow != NULL)
3145 		free(ss->rx_big.shadow, M_DEVBUF);
3146 	ss->rx_big.shadow = NULL;
3147 
3148 	if (ss->tx.info != NULL) {
3149 		if (ss->tx.dmat != NULL) {
3150 			for (i = 0; i <= ss->tx.mask; i++) {
3151 				bus_dmamap_destroy(ss->tx.dmat,
3152 						   ss->tx.info[i].map);
3153 			}
3154 			bus_dma_tag_destroy(ss->tx.dmat);
3155 		}
3156 		free(ss->tx.info, M_DEVBUF);
3157 	}
3158 	ss->tx.info = NULL;
3159 
3160 	if (ss->rx_small.info != NULL) {
3161 		if (ss->rx_small.dmat != NULL) {
3162 			for (i = 0; i <= ss->rx_small.mask; i++) {
3163 				bus_dmamap_destroy(ss->rx_small.dmat,
3164 						   ss->rx_small.info[i].map);
3165 			}
3166 			bus_dmamap_destroy(ss->rx_small.dmat,
3167 					   ss->rx_small.extra_map);
3168 			bus_dma_tag_destroy(ss->rx_small.dmat);
3169 		}
3170 		free(ss->rx_small.info, M_DEVBUF);
3171 	}
3172 	ss->rx_small.info = NULL;
3173 
3174 	if (ss->rx_big.info != NULL) {
3175 		if (ss->rx_big.dmat != NULL) {
3176 			for (i = 0; i <= ss->rx_big.mask; i++) {
3177 				bus_dmamap_destroy(ss->rx_big.dmat,
3178 						   ss->rx_big.info[i].map);
3179 			}
3180 			bus_dmamap_destroy(ss->rx_big.dmat,
3181 					   ss->rx_big.extra_map);
3182 			bus_dma_tag_destroy(ss->rx_big.dmat);
3183 		}
3184 		free(ss->rx_big.info, M_DEVBUF);
3185 	}
3186 	ss->rx_big.info = NULL;
3187 }
3188 
3189 static void
3190 mxge_free_rings(mxge_softc_t *sc)
3191 {
3192 	int slice;
3193 
3194 	for (slice = 0; slice < sc->num_slices; slice++)
3195 		mxge_free_slice_rings(&sc->ss[slice]);
3196 }
3197 
3198 static int
3199 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
3200 		       int tx_ring_entries)
3201 {
3202 	mxge_softc_t *sc = ss->sc;
3203 	size_t bytes;
3204 	int err, i;
3205 
3206 	err = ENOMEM;
3207 
3208 	/* allocate per-slice receive resources */
3209 
3210 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
3211 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
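	/* the completion ring collects events from both the small and
	 * big receive rings, so it is sized for twice the entries */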
3212 
3213 	/* allocate the rx shadow rings */
3214 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
3215 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3216 	if (ss->rx_small.shadow == NULL)
3217 		return err;
3218 
3219 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
3220 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3221 	if (ss->rx_big.shadow == NULL)
3222 		return err;
3223 
3224 	/* allocate the rx host info rings */
3225 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
3226 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3227 	if (ss->rx_small.info == NULL)
3228 		return err;
3229 
3230 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
3231 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3232 	if (ss->rx_big.info == NULL)
3233 		return err;
3234 
3235 	/* allocate the rx busdma resources */
3236 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3237 				 1,			/* alignment */
3238 				 4096,			/* boundary */
3239 				 BUS_SPACE_MAXADDR,	/* low */
3240 				 BUS_SPACE_MAXADDR,	/* high */
3241 				 NULL, NULL,		/* filter */
3242 				 MHLEN,			/* maxsize */
3243 				 1,			/* num segs */
3244 				 MHLEN,			/* maxsegsize */
3245 				 BUS_DMA_ALLOCNOW,	/* flags */
3246 				 NULL, NULL,		/* lock */
3247 				 &ss->rx_small.dmat);	/* tag */
3248 	if (err != 0) {
3249 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3250 			      err);
3251 		return err;
3252 	}
3253 
3254 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3255 				 1,			/* alignment */
3256 #if MXGE_VIRT_JUMBOS
3257 				 4096,			/* boundary */
3258 #else
3259 				 0,			/* boundary */
3260 #endif
3261 				 BUS_SPACE_MAXADDR,	/* low */
3262 				 BUS_SPACE_MAXADDR,	/* high */
3263 				 NULL, NULL,		/* filter */
3264 				 3*4096,		/* maxsize */
3265 #if MXGE_VIRT_JUMBOS
3266 				 3,			/* num segs */
3267 				 4096,			/* maxsegsize*/
3268 #else
3269 				 1,			/* num segs */
3270 				 MJUM9BYTES,		/* maxsegsize*/
3271 #endif
3272 				 BUS_DMA_ALLOCNOW,	/* flags */
3273 				 NULL, NULL,		/* lock */
3274 				 &ss->rx_big.dmat);	/* tag */
3275 	if (err != 0) {
3276 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3277 			      err);
3278 		return err;
3279 	}
3280 	for (i = 0; i <= ss->rx_small.mask; i++) {
3281 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
3282 					&ss->rx_small.info[i].map);
3283 		if (err != 0) {
3284 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
3285 				      err);
3286 			return err;
3287 		}
3288 	}
3289 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
3290 				&ss->rx_small.extra_map);
3291 	if (err != 0) {
3292 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
3293 			      err);
3294 		return err;
3295 	}
3296 
3297 	for (i = 0; i <= ss->rx_big.mask; i++) {
3298 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
3299 					&ss->rx_big.info[i].map);
3300 		if (err != 0) {
3301 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
3302 				      err);
3303 			return err;
3304 		}
3305 	}
3306 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
3307 				&ss->rx_big.extra_map);
3308 	if (err != 0) {
3309 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
3310 			      err);
3311 		return err;
3312 	}
3313 
3314 	/* now allocate TX resources */
3315 
3316 #ifndef IFNET_BUF_RING
3317 	/* only use a single TX ring for now */
3318 	if (ss != ss->sc->ss)
3319 		return 0;
3320 #endif
3321 
3322 	ss->tx.mask = tx_ring_entries - 1;
3323 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3324 
3325 
3326 	/* allocate the tx request copy block */
3327 	bytes = 8 +
3328 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3329 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3330 	if (ss->tx.req_bytes == NULL)
3331 		return err;
3332 	/* ensure req_list entries are aligned to 8 bytes */
3333 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3334 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3335 
3336 	/* allocate the tx busdma segment list */
3337 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3338 	ss->tx.seg_list = (bus_dma_segment_t *)
3339 		malloc(bytes, M_DEVBUF, M_WAITOK);
3340 	if (ss->tx.seg_list == NULL)
3341 		return err;
3342 
3343 	/* allocate the tx host info ring */
3344 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3345 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3346 	if (ss->tx.info == NULL)
3347 		return err;
3348 
3349 	/* allocate the tx busdma resources */
3350 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3351 				 1,			/* alignment */
3352 				 sc->tx_boundary,	/* boundary */
3353 				 BUS_SPACE_MAXADDR,	/* low */
3354 				 BUS_SPACE_MAXADDR,	/* high */
3355 				 NULL, NULL,		/* filter */
3356 				 65536 + 256,		/* maxsize */
3357 				 ss->tx.max_desc - 2,	/* num segs */
3358 				 sc->tx_boundary,	/* maxsegsz */
3359 				 BUS_DMA_ALLOCNOW,	/* flags */
3360 				 NULL, NULL,		/* lock */
3361 				 &ss->tx.dmat);		/* tag */
3362 
3363 	if (err != 0) {
3364 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3365 			      err);
3366 		return err;
3367 	}
3368 
3369 	/* now use these tags to setup dmamaps for each slot
3370 	   in the ring */
3371 	for (i = 0; i <= ss->tx.mask; i++) {
3372 		err = bus_dmamap_create(ss->tx.dmat, 0,
3373 					&ss->tx.info[i].map);
3374 		if (err != 0) {
3375 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3376 				      err);
3377 			return err;
3378 		}
3379 	}
3380 	return 0;
3381 
3382 }
3383 
3384 static int
3385 mxge_alloc_rings(mxge_softc_t *sc)
3386 {
3387 	mxge_cmd_t cmd;
3388 	int tx_ring_size;
3389 	int tx_ring_entries, rx_ring_entries;
3390 	int err, slice;
3391 
3392 	/* get ring sizes */
3393 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3394 	tx_ring_size = cmd.data0;
3395 	if (err != 0) {
3396 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3397 		goto abort;
3398 	}
3399 
3400 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3401 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3402 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3403 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3404 	IFQ_SET_READY(&sc->ifp->if_snd);
3405 
3406 	for (slice = 0; slice < sc->num_slices; slice++) {
3407 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3408 					     rx_ring_entries,
3409 					     tx_ring_entries);
3410 		if (err != 0)
3411 			goto abort;
3412 	}
3413 	return 0;
3414 
3415 abort:
3416 	mxge_free_rings(sc);
3417 	return err;
3418 
3419 }
3420 
3421 
3422 static void
3423 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3424 {
3425 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
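	/* e.g. a standard 1500 byte MTU gives 1500 + 14 + 4 + 2 = 1520
	 * bytes, which fits in a single 2KB MCLBYTES cluster */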
3426 
3427 	if (bufsize < MCLBYTES) {
3428 		/* easy, everything fits in a single buffer */
3429 		*big_buf_size = MCLBYTES;
3430 		*cl_size = MCLBYTES;
3431 		*nbufs = 1;
3432 		return;
3433 	}
3434 
3435 	if (bufsize < MJUMPAGESIZE) {
3436 		/* still easy, everything still fits in a single buffer */
3437 		*big_buf_size = MJUMPAGESIZE;
3438 		*cl_size = MJUMPAGESIZE;
3439 		*nbufs = 1;
3440 		return;
3441 	}
3442 #if MXGE_VIRT_JUMBOS
3443 	/* now we need to use virtually contiguous buffers */
3444 	*cl_size = MJUM9BYTES;
3445 	*big_buf_size = 4096;
3446 	*nbufs = mtu / 4096 + 1;
3447 	/* needs to be a power of two, so round up */
3448 	if (*nbufs == 3)
3449 		*nbufs = 4;
3450 #else
3451 	*cl_size = MJUM9BYTES;
3452 	*big_buf_size = MJUM9BYTES;
3453 	*nbufs = 1;
3454 #endif
3455 }
3456 
3457 static int
3458 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3459 {
3460 	mxge_softc_t *sc;
3461 	mxge_cmd_t cmd;
3462 	bus_dmamap_t map;
3463 	struct lro_entry *lro_entry;
3464 	int err, i, slice;
3465 
3466 
3467 	sc = ss->sc;
3468 	slice = ss - sc->ss;
3469 
3470 	SLIST_INIT(&ss->lro_free);
3471 	SLIST_INIT(&ss->lro_active);
3472 
3473 	for (i = 0; i < sc->lro_cnt; i++) {
3474 		lro_entry = (struct lro_entry *)
3475 			malloc(sizeof (*lro_entry), M_DEVBUF,
3476 			       M_NOWAIT | M_ZERO);
3477 		if (lro_entry == NULL) {
3478 			sc->lro_cnt = i;
3479 			break;
3480 		}
3481 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3482 	}
3483 	/* get the lanai pointers to the send and receive rings */
3484 
3485 	err = 0;
3486 #ifndef IFNET_BUF_RING
3487 	/* We currently only send from the first slice */
3488 	if (slice == 0) {
3489 #endif
3490 		cmd.data0 = slice;
3491 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3492 		ss->tx.lanai =
3493 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3494 		ss->tx.send_go = (volatile uint32_t *)
3495 			(sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3496 		ss->tx.send_stop = (volatile uint32_t *)
3497 		(sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3498 #ifndef IFNET_BUF_RING
3499 	}
3500 #endif
3501 	cmd.data0 = slice;
3502 	err |= mxge_send_cmd(sc,
3503 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3504 	ss->rx_small.lanai =
3505 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3506 	cmd.data0 = slice;
3507 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3508 	ss->rx_big.lanai =
3509 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3510 
3511 	if (err != 0) {
3512 		device_printf(sc->dev,
3513 			      "failed to get ring sizes or locations\n");
3514 		return EIO;
3515 	}
3516 
3517 	/* stock receive rings */
3518 	for (i = 0; i <= ss->rx_small.mask; i++) {
3519 		map = ss->rx_small.info[i].map;
3520 		err = mxge_get_buf_small(ss, map, i);
3521 		if (err) {
3522 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3523 				      i, ss->rx_small.mask + 1);
3524 			return ENOMEM;
3525 		}
3526 	}
3527 	for (i = 0; i <= ss->rx_big.mask; i++) {
3528 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3529 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3530 	}
3531 	ss->rx_big.nbufs = nbufs;
3532 	ss->rx_big.cl_size = cl_size;
3533 	ss->rx_big.mlen = ss->sc->ifp->if_mtu + ETHER_HDR_LEN +
3534 		ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3535 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3536 		map = ss->rx_big.info[i].map;
3537 		err = mxge_get_buf_big(ss, map, i);
3538 		if (err) {
3539 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3540 				      i, ss->rx_big.mask + 1);
3541 			return ENOMEM;
3542 		}
3543 	}
3544 	return 0;
3545 }
3546 
3547 static int
3548 mxge_open(mxge_softc_t *sc)
3549 {
3550 	mxge_cmd_t cmd;
3551 	int err, big_bytes, nbufs, slice, cl_size, i;
3552 	bus_addr_t bus;
3553 	volatile uint8_t *itable;
3554 	struct mxge_slice_state *ss;
3555 
3556 	/* Copy the MAC address in case it was overridden */
3557 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3558 
3559 	err = mxge_reset(sc, 1);
3560 	if (err != 0) {
3561 		device_printf(sc->dev, "failed to reset\n");
3562 		return EIO;
3563 	}
3564 
3565 	if (sc->num_slices > 1) {
3566 		/* setup the indirection table */
3567 		cmd.data0 = sc->num_slices;
3568 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3569 				    &cmd);
3570 
3571 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3572 				     &cmd);
3573 		if (err != 0) {
3574 			device_printf(sc->dev,
3575 				      "failed to setup rss tables\n");
3576 			return err;
3577 		}
3578 
3579 		/* just enable an identity mapping */
3580 		itable = sc->sram + cmd.data0;
3581 		for (i = 0; i < sc->num_slices; i++)
3582 			itable[i] = (uint8_t)i;
3583 
3584 		cmd.data0 = 1;
3585 		cmd.data1 = mxge_rss_hash_type;
3586 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3587 		if (err != 0) {
3588 			device_printf(sc->dev, "failed to enable slices\n");
3589 			return err;
3590 		}
3591 	}
3592 
3593 
3594 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3595 
3596 	cmd.data0 = nbufs;
3597 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3598 			    &cmd);
3599 	/* error is only meaningful if we're trying to set
3600 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3601 	if (err && nbufs > 1) {
3602 		device_printf(sc->dev,
3603 			      "Failed to set always-use-n to %d\n",
3604 			      nbufs);
3605 		return EIO;
3606 	}
3607 	/* Give the firmware the mtu and the big and small buffer
3608 	   sizes.  The firmware wants the big buf size to be a power
3609 	   of two. Luckily, FreeBSD's clusters are powers of two */
3610 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3611 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3612 	cmd.data0 = MHLEN - MXGEFW_PAD;
3613 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3614 			     &cmd);
3615 	cmd.data0 = big_bytes;
3616 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3617 
3618 	if (err != 0) {
3619 		device_printf(sc->dev, "failed to setup params\n");
3620 		goto abort;
3621 	}
3622 
3623 	/* Now give him the pointer to the stats block */
3624 	for (slice = 0;
3625 #ifdef IFNET_BUF_RING
3626 	     slice < sc->num_slices;
3627 #else
3628 	     slice < 1;
3629 #endif
3630 	     slice++) {
3631 		ss = &sc->ss[slice];
3632 		cmd.data0 =
3633 			MXGE_LOWPART_TO_U32(ss->fw_stats_dma.bus_addr);
3634 		cmd.data1 =
3635 			MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.bus_addr);
3636 		cmd.data2 = sizeof(struct mcp_irq_data);
3637 		cmd.data2 |= (slice << 16);
3638 		err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3639 	}
3640 
3641 	if (err != 0) {
3642 		bus = sc->ss->fw_stats_dma.bus_addr;
3643 		bus += offsetof(struct mcp_irq_data, send_done_count);
3644 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3645 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3646 		err = mxge_send_cmd(sc,
3647 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3648 				    &cmd);
3649 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3650 		sc->fw_multicast_support = 0;
3651 	} else {
3652 		sc->fw_multicast_support = 1;
3653 	}
3654 
3655 	if (err != 0) {
3656 		device_printf(sc->dev, "failed to setup params\n");
3657 		goto abort;
3658 	}
3659 
3660 	for (slice = 0; slice < sc->num_slices; slice++) {
3661 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3662 		if (err != 0) {
3663 			device_printf(sc->dev, "couldn't open slice %d\n",
3664 				      slice);
3665 			goto abort;
3666 		}
3667 	}
3668 
3669 	/* Finally, start the firmware running */
3670 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3671 	if (err) {
3672 		device_printf(sc->dev, "Couldn't bring up link\n");
3673 		goto abort;
3674 	}
3675 #ifdef IFNET_BUF_RING
3676 	for (slice = 0; slice < sc->num_slices; slice++) {
3677 		ss = &sc->ss[slice];
3678 		ss->if_drv_flags |= IFF_DRV_RUNNING;
3679 		ss->if_drv_flags &= ~IFF_DRV_OACTIVE;
3680 	}
3681 #endif
3682 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3683 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3684 
3685 	return 0;
3686 
3687 
3688 abort:
3689 	mxge_free_mbufs(sc);
3690 
3691 	return err;
3692 }
3693 
3694 static int
3695 mxge_close(mxge_softc_t *sc, int down)
3696 {
3697 	mxge_cmd_t cmd;
3698 	int err, old_down_cnt;
3699 #ifdef IFNET_BUF_RING
3700 	struct mxge_slice_state *ss;
3701 	int slice;
3702 #endif
3703 
3704 #ifdef IFNET_BUF_RING
3705 	for (slice = 0; slice < sc->num_slices; slice++) {
3706 		ss = &sc->ss[slice];
3707 		ss->if_drv_flags &= ~IFF_DRV_RUNNING;
3708 	}
3709 #endif
3710 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3711 	if (!down) {
3712 		old_down_cnt = sc->down_cnt;
3713 		wmb();
3714 		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3715 		if (err) {
3716 			device_printf(sc->dev,
3717 				      "Couldn't bring down link\n");
3718 		}
3719 		if (old_down_cnt == sc->down_cnt) {
3720 			/* wait for down irq */
3721 			DELAY(10 * sc->intr_coal_delay);
3722 		}
3723 		wmb();
3724 		if (old_down_cnt == sc->down_cnt) {
3725 			device_printf(sc->dev, "never got down irq\n");
3726 		}
3727 	}
3728 	mxge_free_mbufs(sc);
3729 
3730 	return 0;
3731 }
3732 
3733 static void
3734 mxge_setup_cfg_space(mxge_softc_t *sc)
3735 {
3736 	device_t dev = sc->dev;
3737 	int reg;
3738 	uint16_t cmd, lnk, pectl;
3739 
3740 	/* find the PCIe link width and set max read request to 4KB */
3741 	if (pci_find_cap(dev, PCIY_EXPRESS, &reg) == 0) {
3742 		lnk = pci_read_config(dev, reg + 0x12, 2);
3743 		sc->link_width = (lnk >> 4) & 0x3f;
3744 
3745 		if (sc->pectl == 0) {
3746 			pectl = pci_read_config(dev, reg + 0x8, 2);
3747 			pectl = (pectl & ~0x7000) | (5 << 12);
3748 			pci_write_config(dev, reg + 0x8, pectl, 2);
3749 			sc->pectl = pectl;
3750 		} else {
3751 			/* restore saved pectl after watchdog reset */
3752 			pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3753 		}
3754 	}
3755 
3756 	/* Enable DMA and Memory space access */
3757 	pci_enable_busmaster(dev);
3758 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3759 	cmd |= PCIM_CMD_MEMEN;
3760 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3761 }
3762 
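/*
 * Editor's sketch: the register arithmetic in mxge_setup_cfg_space()
 * touches two standard PCIe fields.  In the Device Control register
 * (cap + 0x8) bits 14:12 encode the max read request size as 128 << n,
 * so the value 5 written above selects 4096 bytes; in the Link Status
 * register (cap + 0x12) bits 9:4 hold the negotiated link width.
 * Standalone decoders, for illustration only:
 */
#include <stdint.h>

static inline unsigned int
pcie_max_read_req_bytes(uint16_t devctl)
{
	return (128u << ((devctl >> 12) & 0x7));
}

static inline unsigned int
pcie_link_width(uint16_t lnksta)
{
	return ((lnksta >> 4) & 0x3f);
}
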
3763 static uint32_t
3764 mxge_read_reboot(mxge_softc_t *sc)
3765 {
3766 	device_t dev = sc->dev;
3767 	uint32_t vs;
3768 
3769 	/* find the vendor specific offset */
3770 	if (pci_find_cap(dev, PCIY_VENDOR, &vs) != 0) {
3771 		device_printf(sc->dev,
3772 			      "could not find vendor specific offset\n");
3773 		return (uint32_t)-1;
3774 	}
3775 	/* enable read32 mode */
3776 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3777 	/* tell NIC which register to read */
3778 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3779 	return (pci_read_config(dev, vs + 0x14, 4));
3780 }
3781 
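/*
 * Editor's sketch of the indirect-window pattern mxge_read_reboot()
 * uses: one vendor-specific config register enables 32-bit reads, a
 * second selects the target address, and a third returns the data.
 * The cfg ops below are hypothetical stand-ins for pci_read_config()
 * and pci_write_config(); the offsets are those used above.
 */
#include <stdint.h>

struct cfg_ops {
	uint32_t (*read)(unsigned int off, int width);
	void	 (*write)(unsigned int off, uint32_t val, int width);
};

static uint32_t
vendor_window_read(const struct cfg_ops *ops, unsigned int vs, uint32_t reg)
{
	ops->write(vs + 0x10, 0x3, 1);	  /* enable read32 mode */
	ops->write(vs + 0x18, reg, 4);	  /* select register to read */
	return (ops->read(vs + 0x14, 4)); /* fetch its value */
}
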
3782 static void
3783 mxge_watchdog_reset(mxge_softc_t *sc)
3784 {
3785 	struct pci_devinfo *dinfo;
3786 	struct mxge_slice_state *ss;
3787 	int err, running, s, num_tx_slices = 1;
3788 	uint32_t reboot;
3789 	uint16_t cmd;
3790 
3791 	err = ENXIO;
3792 
3793 	device_printf(sc->dev, "Watchdog reset!\n");
3794 
3795 	/*
3796 	 * check to see if the NIC rebooted.  If it did, then all of
3797 	 * PCI config space has been reset, and things like the
3798 	 * busmaster bit will be zero.  If this is the case, then we
3799 	 * must restore PCI config space before the NIC can be used
3800 	 * again
3801 	 */
3802 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3803 	if (cmd == 0xffff) {
3804 		/*
3805 		 * maybe the watchdog caught the NIC rebooting; wait
3806 		 * up to 100ms for it to finish.  If it does not come
3807 		 * back, then give up
3808 		 */
3809 		DELAY(1000*100);
3810 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3811 		if (cmd == 0xffff) {
3812 			device_printf(sc->dev, "NIC disappeared!\n");
3813 		}
3814 	}
3815 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3816 		/* print the reboot status */
3817 		reboot = mxge_read_reboot(sc);
3818 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3819 			      reboot);
3820 		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
3821 		if (running) {
3822 
3823 			/*
3824 			 * quiesce NIC so that TX routines will not try to
3825 			 * xmit after restoration of BAR
3826 			 */
3827 
3828 			/* Mark the link as down */
3829 			if (sc->link_state) {
3830 				sc->link_state = 0;
3831 				if_link_state_change(sc->ifp,
3832 						     LINK_STATE_DOWN);
3833 			}
3834 #ifdef IFNET_BUF_RING
3835 			num_tx_slices = sc->num_slices;
3836 #endif
3837 			/* grab all TX locks to ensure no tx  */
3838 			for (s = 0; s < num_tx_slices; s++) {
3839 				ss = &sc->ss[s];
3840 				mtx_lock(&ss->tx.mtx);
3841 			}
3842 			mxge_close(sc, 1);
3843 		}
3844 		/* restore PCI configuration space */
3845 		dinfo = device_get_ivars(sc->dev);
3846 		pci_cfg_restore(sc->dev, dinfo);
3847 
3848 		/* and redo any changes we made to our config space */
3849 		mxge_setup_cfg_space(sc);
3850 
3851 		/* reload f/w */
3852 		err = mxge_load_firmware(sc, 0);
3853 		if (err) {
3854 			device_printf(sc->dev,
3855 				      "Unable to re-load f/w\n");
3856 		}
3857 		if (running) {
3858 			if (!err)
3859 				err = mxge_open(sc);
3860 			/* release all TX locks */
3861 			for (s = 0; s < num_tx_slices; s++) {
3862 				ss = &sc->ss[s];
3863 #ifdef IFNET_BUF_RING
3864 				mxge_start_locked(ss);
3865 #endif
3866 				mtx_unlock(&ss->tx.mtx);
3867 			}
3868 		}
3869 		sc->watchdog_resets++;
3870 	} else {
3871 		device_printf(sc->dev,
3872 			      "NIC did not reboot, not resetting\n");
3873 		err = 0;
3874 	}
3875 	if (err) {
3876 		device_printf(sc->dev, "watchdog reset failed\n");
3877 	} else {
3878 		if (sc->dying == 2)
3879 			sc->dying = 0;
3880 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3881 	}
3882 }
3883 
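/*
 * Editor's sketch: the two config-space probes in mxge_watchdog_reset()
 * classify the NIC from a single read of the PCI command register;
 * 0x0004 is the standard bus-master enable bit (PCIM_CMD_BUSMASTEREN).
 * Illustrative only:
 */
#include <stdint.h>

enum nic_state { NIC_OK, NIC_REBOOTED, NIC_GONE };

static enum nic_state
classify_nic(uint16_t pci_cmd)
{
	if (pci_cmd == 0xffff)
		return (NIC_GONE);	/* reads all-ones: device fell off the bus */
	if ((pci_cmd & 0x0004) == 0)
		return (NIC_REBOOTED);	/* config space was reset by a reboot */
	return (NIC_OK);
}
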
3884 static void
3885 mxge_watchdog_task(void *arg, int pending)
3886 {
3887 	mxge_softc_t *sc = arg;
3888 
3889 
3890 	mtx_lock(&sc->driver_mtx);
3891 	mxge_watchdog_reset(sc);
3892 	mtx_unlock(&sc->driver_mtx);
3893 }
3894 
3895 static void
3896 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3897 {
3898 	tx = &sc->ss[slice].tx;
3899 	device_printf(sc->dev, "slice %d stuck? ring state:\n", slice);
3900 	device_printf(sc->dev,
3901 		      "tx.req=%d, tx.done=%d, tx.queue_active=%d\n",
3902 		      tx->req, tx->done, tx->queue_active);
3903 	device_printf(sc->dev, "tx.activate=%d, tx.deactivate=%d\n",
3904 		      tx->activate, tx->deactivate);
3905 	device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3906 		      tx->pkt_done,
3907 		      be32toh(sc->ss->fw_stats->send_done_count));
3908 }
3909 
3910 static int
3911 mxge_watchdog(mxge_softc_t *sc)
3912 {
3913 	mxge_tx_ring_t *tx;
3914 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3915 	int i, err = 0;
3916 
3917 	/* see if we have outstanding transmits, which
3918 	   have been pending for more than mxge_ticks */
3919 	for (i = 0;
3920 #ifdef IFNET_BUF_RING
3921 	     (i < sc->num_slices) && (err == 0);
3922 #else
3923 	     (i < 1) && (err == 0);
3924 #endif
3925 	     i++) {
3926 		tx = &sc->ss[i].tx;
3927 		if (tx->req != tx->done &&
3928 		    tx->watchdog_req != tx->watchdog_done &&
3929 		    tx->done == tx->watchdog_done) {
3930 			/* check for pause blocking before resetting */
3931 			if (tx->watchdog_rx_pause == rx_pause) {
3932 				mxge_warn_stuck(sc, tx, i);
3933 				taskqueue_enqueue(sc->tq, &sc->watchdog_task);
3934 				return (ENXIO);
3935 			}
3936 			else
3937 				device_printf(sc->dev, "Flow control blocking "
3938 					      "xmits, check link partner\n");
3939 		}
3940 
3941 		tx->watchdog_req = tx->req;
3942 		tx->watchdog_done = tx->done;
3943 		tx->watchdog_rx_pause = rx_pause;
3944 	}
3945 
3946 	if (sc->need_media_probe)
3947 		mxge_media_probe(sc);
3948 	return (err);
3949 }
3950 
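/*
 * Editor's sketch of the stall predicate in mxge_watchdog(): a ring is
 * suspect when requests are outstanding now, requests were already
 * outstanding at the previous pass, and nothing has completed since.
 * Illustrative restatement, not driver code:
 */
#include <stdint.h>

static inline int
tx_ring_stuck(uint32_t req, uint32_t done,
	      uint32_t watchdog_req, uint32_t watchdog_done)
{
	return (req != done &&
	    watchdog_req != watchdog_done &&
	    done == watchdog_done);
}
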
3951 static u_long
3952 mxge_update_stats(mxge_softc_t *sc)
3953 {
3954 	struct mxge_slice_state *ss;
3955 	u_long pkts = 0;
3956 	u_long ipackets = 0;
3957 	u_long opackets = 0;
3958 #ifdef IFNET_BUF_RING
3959 	u_long obytes = 0;
3960 	u_long omcasts = 0;
3961 	u_long odrops = 0;
3962 #endif
3963 	u_long oerrors = 0;
3964 	int slice;
3965 
3966 	for (slice = 0; slice < sc->num_slices; slice++) {
3967 		ss = &sc->ss[slice];
3968 		ipackets += ss->ipackets;
3969 		opackets += ss->opackets;
3970 #ifdef IFNET_BUF_RING
3971 		obytes += ss->obytes;
3972 		omcasts += ss->omcasts;
3973 		odrops += ss->tx.br->br_drops;
3974 #endif
3975 		oerrors += ss->oerrors;
3976 	}
3977 	pkts = (ipackets - sc->ifp->if_ipackets);
3978 	pkts += (opackets - sc->ifp->if_opackets);
3979 	sc->ifp->if_ipackets = ipackets;
3980 	sc->ifp->if_opackets = opackets;
3981 #ifdef IFNET_BUF_RING
3982 	sc->ifp->if_obytes = obytes;
3983 	sc->ifp->if_omcasts = omcasts;
3984 	sc->ifp->if_snd.ifq_drops = odrops;
3985 #endif
3986 	sc->ifp->if_oerrors = oerrors;
3987 	return pkts;
3988 }
3989 
3990 static void
3991 mxge_tick(void *arg)
3992 {
3993 	mxge_softc_t *sc = arg;
3994 	u_long pkts = 0;
3995 	int err = 0;
3996 	int running, ticks;
3997 	uint16_t cmd;
3998 
3999 	ticks = mxge_ticks;
4000 	running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
4001 	if (running) {
4002 		/* aggregate stats from different slices */
4003 		pkts = mxge_update_stats(sc);
4004 		if (!sc->watchdog_countdown) {
4005 			err = mxge_watchdog(sc);
4006 			sc->watchdog_countdown = 4;
4007 		}
4008 		sc->watchdog_countdown--;
4009 	}
4010 	if (pkts == 0) {
4011 		/* ensure NIC did not suffer h/w fault while idle */
4012 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
4013 		if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
4014 			sc->dying = 2;
4015 			taskqueue_enqueue(sc->tq, &sc->watchdog_task);
4016 			err = ENXIO;
4017 		}
4018 		/* look less often if NIC is idle */
4019 		ticks *= 4;
4020 	}
4021 
4022 	if (err == 0)
4023 		callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
4024 
4025 }
4026 
4027 static int
4028 mxge_media_change(struct ifnet *ifp)
4029 {
4030 	return EINVAL;
4031 }
4032 
4033 static int
4034 mxge_change_mtu(mxge_softc_t *sc, int mtu)
4035 {
4036 	struct ifnet *ifp = sc->ifp;
4037 	int real_mtu, old_mtu;
4038 	int err = 0;
4039 
4040 
4041 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
4042 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
4043 		return EINVAL;
4044 	mtx_lock(&sc->driver_mtx);
4045 	old_mtu = ifp->if_mtu;
4046 	ifp->if_mtu = mtu;
4047 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4048 		mxge_close(sc, 0);
4049 		err = mxge_open(sc);
4050 		if (err != 0) {
4051 			ifp->if_mtu = old_mtu;
4052 			mxge_close(sc, 0);
4053 			(void) mxge_open(sc);
4054 		}
4055 	}
4056 	mtx_unlock(&sc->driver_mtx);
4057 	return err;
4058 }
4059 
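/*
 * Editor's sketch: the bound mxge_change_mtu() checks is the on-wire
 * frame size, i.e. the requested MTU plus the 14-byte Ethernet header
 * and one 4-byte VLAN tag; 60 is the minimum Ethernet frame length
 * excluding the FCS.  Hypothetical helper:
 */
static inline int
mtu_in_range(int mtu, int max_mtu)
{
	int real_mtu = mtu + 14 + 4;	/* ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN */

	return (real_mtu >= 60 && real_mtu <= max_mtu);
}
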
4060 static void
4061 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
4062 {
4063 	mxge_softc_t *sc = ifp->if_softc;
4064 
4065 
4066 	if (sc == NULL)
4067 		return;
4068 	ifmr->ifm_status = IFM_AVALID;
4069 	ifmr->ifm_active = IFM_ETHER | IFM_FDX;
4070 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
4071 	ifmr->ifm_active |= sc->current_media;
4072 }
4073 
4074 static int
4075 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
4076 {
4077 	mxge_softc_t *sc = ifp->if_softc;
4078 	struct ifreq *ifr = (struct ifreq *)data;
4079 	int err, mask;
4080 
4081 	err = 0;
4082 	switch (command) {
4083 	case SIOCSIFADDR:
4084 	case SIOCGIFADDR:
4085 		err = ether_ioctl(ifp, command, data);
4086 		break;
4087 
4088 	case SIOCSIFMTU:
4089 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
4090 		break;
4091 
4092 	case SIOCSIFFLAGS:
4093 		mtx_lock(&sc->driver_mtx);
4094 		if (sc->dying) {
4095 			mtx_unlock(&sc->driver_mtx);
4096 			return EINVAL;
4097 		}
4098 		if (ifp->if_flags & IFF_UP) {
4099 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
4100 				err = mxge_open(sc);
4101 			} else {
4102 				/* take care of promisc and allmulti
4103 				   flag changes */
4104 				mxge_change_promisc(sc,
4105 						    ifp->if_flags & IFF_PROMISC);
4106 				mxge_set_multicast_list(sc);
4107 			}
4108 		} else {
4109 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
4110 				mxge_close(sc, 0);
4111 			}
4112 		}
4113 		mtx_unlock(&sc->driver_mtx);
4114 		break;
4115 
4116 	case SIOCADDMULTI:
4117 	case SIOCDELMULTI:
4118 		mtx_lock(&sc->driver_mtx);
4119 		mxge_set_multicast_list(sc);
4120 		mtx_unlock(&sc->driver_mtx);
4121 		break;
4122 
4123 	case SIOCSIFCAP:
4124 		mtx_lock(&sc->driver_mtx);
4125 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
4126 		if (mask & IFCAP_TXCSUM) {
4127 			if (IFCAP_TXCSUM & ifp->if_capenable) {
4128 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
4129 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
4130 						      | CSUM_TSO);
4131 			} else {
4132 				ifp->if_capenable |= IFCAP_TXCSUM;
4133 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
4134 			}
4135 		} else if (mask & IFCAP_RXCSUM) {
4136 			if (IFCAP_RXCSUM & ifp->if_capenable) {
4137 				ifp->if_capenable &= ~IFCAP_RXCSUM;
4138 				sc->csum_flag = 0;
4139 			} else {
4140 				ifp->if_capenable |= IFCAP_RXCSUM;
4141 				sc->csum_flag = 1;
4142 			}
4143 		}
4144 		if (mask & IFCAP_TSO4) {
4145 			if (IFCAP_TSO4 & ifp->if_capenable) {
4146 				ifp->if_capenable &= ~IFCAP_TSO4;
4147 				ifp->if_hwassist &= ~CSUM_TSO;
4148 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
4149 				ifp->if_capenable |= IFCAP_TSO4;
4150 				ifp->if_hwassist |= CSUM_TSO;
4151 			} else {
4152 				printf("mxge requires tx checksum offload"
4153 				       " be enabled to use TSO\n");
4154 				err = EINVAL;
4155 			}
4156 		}
4157 		if (mask & IFCAP_LRO) {
4158 			if (IFCAP_LRO & ifp->if_capenable)
4159 				err = mxge_change_lro_locked(sc, 0);
4160 			else
4161 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
4162 		}
4163 		if (mask & IFCAP_VLAN_HWTAGGING)
4164 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
4165 		if (mask & IFCAP_VLAN_HWTSO)
4166 			ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
4167 
4168 		if (!(ifp->if_capabilities & IFCAP_VLAN_HWTSO) ||
4169 		    !(ifp->if_capenable & IFCAP_VLAN_HWTAGGING))
4170 			ifp->if_capenable &= ~IFCAP_VLAN_HWTSO;
4171 
4172 		mtx_unlock(&sc->driver_mtx);
4173 		VLAN_CAPABILITIES(ifp);
4174 
4175 		break;
4176 
4177 	case SIOCGIFMEDIA:
4178 		mtx_lock(&sc->driver_mtx);
4179 		mxge_media_probe(sc);
4180 		mtx_unlock(&sc->driver_mtx);
4181 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
4182 				    &sc->media, command);
4183 		break;
4184 
4185 	default:
4186 		err = ENOTTY;
4187 	}
4188 	return err;
4189 }
4190 
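/*
 * Editor's sketch: the SIOCSIFCAP case above XORs the requested and
 * currently-enabled capability masks, so each set bit in the result
 * marks one capability to toggle (note capenable ^ mask == reqcap).
 * Hypothetical helper:
 */
static inline unsigned int
caps_to_toggle(unsigned int reqcap, unsigned int capenable)
{
	return (reqcap ^ capenable);
}
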
4191 static void
4192 mxge_fetch_tunables(mxge_softc_t *sc)
4193 {
4194 
4195 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
4196 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
4197 			  &mxge_flow_control);
4198 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
4199 			  &mxge_intr_coal_delay);
4200 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
4201 			  &mxge_nvidia_ecrc_enable);
4202 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
4203 			  &mxge_force_firmware);
4204 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
4205 			  &mxge_deassert_wait);
4206 	TUNABLE_INT_FETCH("hw.mxge.verbose",
4207 			  &mxge_verbose);
4208 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
4209 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
4210 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
4211 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
4212 	TUNABLE_INT_FETCH("hw.mxge.rss_hashtype", &mxge_rss_hash_type);
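	/*
	 * editor's note: both spellings of the RSS hash tunable are
	 * fetched above; the duplicate presumably keeps an older name
	 * working.
	 */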
4213 	TUNABLE_INT_FETCH("hw.mxge.initial_mtu", &mxge_initial_mtu);
4214 	TUNABLE_INT_FETCH("hw.mxge.throttle", &mxge_throttle);
4215 	if (sc->lro_cnt != 0)
4216 		mxge_lro_cnt = sc->lro_cnt;
4217 
4218 	if (bootverbose)
4219 		mxge_verbose = 1;
4220 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
4221 		mxge_intr_coal_delay = 30;
4222 	if (mxge_ticks == 0)
4223 		mxge_ticks = hz / 2;
4224 	sc->pause = mxge_flow_control;
4225 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
4226 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_MAX) {
4227 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4228 	}
4229 	if (mxge_initial_mtu > ETHERMTU_JUMBO ||
4230 	    mxge_initial_mtu < ETHER_MIN_LEN)
4231 		mxge_initial_mtu = ETHERMTU_JUMBO;
4232 
4233 	if (mxge_throttle && mxge_throttle > MXGE_MAX_THROTTLE)
4234 		mxge_throttle = MXGE_MAX_THROTTLE;
4235 	if (mxge_throttle && mxge_throttle < MXGE_MIN_THROTTLE)
4236 		mxge_throttle = MXGE_MIN_THROTTLE;
4237 	sc->throttle = mxge_throttle;
4238 }
4239 
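/*
 * Editor's sketch: the throttle checks in mxge_fetch_tunables() amount
 * to a clamp applied only when the tunable is nonzero (zero presumably
 * meaning "leave throttling disabled").  Equivalent form:
 */
static inline int
clamp_if_set(int v, int lo, int hi)
{
	if (v == 0)
		return (0);
	return (v < lo ? lo : (v > hi ? hi : v));
}
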
4240 
4241 static void
4242 mxge_free_slices(mxge_softc_t *sc)
4243 {
4244 	struct mxge_slice_state *ss;
4245 	int i;
4246 
4247 
4248 	if (sc->ss == NULL)
4249 		return;
4250 
4251 	for (i = 0; i < sc->num_slices; i++) {
4252 		ss = &sc->ss[i];
4253 		if (ss->fw_stats != NULL) {
4254 			mxge_dma_free(&ss->fw_stats_dma);
4255 			ss->fw_stats = NULL;
4256 #ifdef IFNET_BUF_RING
4257 			if (ss->tx.br != NULL) {
4258 				drbr_free(ss->tx.br, M_DEVBUF);
4259 				ss->tx.br = NULL;
4260 			}
4261 #endif
4262 			mtx_destroy(&ss->tx.mtx);
4263 		}
4264 		if (ss->rx_done.entry != NULL) {
4265 			mxge_dma_free(&ss->rx_done.dma);
4266 			ss->rx_done.entry = NULL;
4267 		}
4268 	}
4269 	free(sc->ss, M_DEVBUF);
4270 	sc->ss = NULL;
4271 }
4272 
4273 static int
4274 mxge_alloc_slices(mxge_softc_t *sc)
4275 {
4276 	mxge_cmd_t cmd;
4277 	struct mxge_slice_state *ss;
4278 	size_t bytes;
4279 	int err, i, max_intr_slots;
4280 
4281 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4282 	if (err != 0) {
4283 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4284 		return err;
4285 	}
4286 	sc->rx_ring_size = cmd.data0;
4287 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
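	/*
	 * editor's note (assumption): the factor of two above presumably
	 * leaves room for completions from both the small and big
	 * receive rings.
	 */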
4288 
4289 	bytes = sizeof (*sc->ss) * sc->num_slices;
4290 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
4291 	if (sc->ss == NULL)
4292 		return (ENOMEM);
4293 	for (i = 0; i < sc->num_slices; i++) {
4294 		ss = &sc->ss[i];
4295 
4296 		ss->sc = sc;
4297 
4298 		/* allocate per-slice rx interrupt queues */
4299 
4300 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
4301 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
4302 		if (err != 0)
4303 			goto abort;
4304 		ss->rx_done.entry = ss->rx_done.dma.addr;
4305 		bzero(ss->rx_done.entry, bytes);
4306 
4307 		/*
4308 		 * allocate the per-slice firmware stats; stats
4309 		 * (including tx) are used only on the first
4310 		 * slice for now
4311 		 */
4312 #ifndef IFNET_BUF_RING
4313 		if (i > 0)
4314 			continue;
4315 #endif
4316 
4317 		bytes = sizeof (*ss->fw_stats);
4318 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4319 				     bytes, 64);
4320 		if (err != 0)
4321 			goto abort;
4322 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
4323 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
4324 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
4325 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
4326 #ifdef IFNET_BUF_RING
4327 		ss->tx.br = buf_ring_alloc(2048, M_DEVBUF, M_WAITOK,
4328 					   &ss->tx.mtx);
4329 #endif
4330 	}
4331 
4332 	return (0);
4333 
4334 abort:
4335 	mxge_free_slices(sc);
4336 	return (ENOMEM);
4337 }
4338 
4339 static void
4340 mxge_slice_probe(mxge_softc_t *sc)
4341 {
4342 	mxge_cmd_t cmd;
4343 	char *old_fw;
4344 	int msix_cnt, status, max_intr_slots;
4345 
4346 	sc->num_slices = 1;
4347 	/*
4348 	 *  don't enable multiple slices unless the tunable requests
4349 	 *  them, and never on a non-SMP system
4350 	 */
4351 
4352 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
4353 		return;
4354 
4355 	/* see how many MSI-X interrupts are available */
4356 	msix_cnt = pci_msix_count(sc->dev);
4357 	if (msix_cnt < 2)
4358 		return;
4359 
4360 	/* now load the slice-aware firmware and see what it supports */
4361 	old_fw = sc->fw_name;
4362 	if (old_fw == mxge_fw_aligned)
4363 		sc->fw_name = mxge_fw_rss_aligned;
4364 	else
4365 		sc->fw_name = mxge_fw_rss_unaligned;
4366 	status = mxge_load_firmware(sc, 0);
4367 	if (status != 0) {
4368 		device_printf(sc->dev, "Falling back to a single slice\n");
4369 		return;
4370 	}
4371 
4372 	/* try to send a reset command to the card to see if it
4373 	   is alive */
4374 	memset(&cmd, 0, sizeof (cmd));
4375 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4376 	if (status != 0) {
4377 		device_printf(sc->dev, "failed reset\n");
4378 		goto abort_with_fw;
4379 	}
4380 
4381 	/* get rx ring size */
4382 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4383 	if (status != 0) {
4384 		device_printf(sc->dev, "Cannot determine rx ring size\n");
4385 		goto abort_with_fw;
4386 	}
4387 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
4388 
4389 	/* tell it the size of the interrupt queues */
4390 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
4391 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4392 	if (status != 0) {
4393 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4394 		goto abort_with_fw;
4395 	}
4396 
4397 	/* ask for the maximum number of slices it supports */
4398 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4399 	if (status != 0) {
4400 		device_printf(sc->dev,
4401 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4402 		goto abort_with_fw;
4403 	}
4404 	sc->num_slices = cmd.data0;
4405 	if (sc->num_slices > msix_cnt)
4406 		sc->num_slices = msix_cnt;
4407 
4408 	if (mxge_max_slices == -1) {
4409 		/* cap to number of CPUs in system */
4410 		if (sc->num_slices > mp_ncpus)
4411 			sc->num_slices = mp_ncpus;
4412 	} else {
4413 		if (sc->num_slices > mxge_max_slices)
4414 			sc->num_slices = mxge_max_slices;
4415 	}
4416 	/* make sure it is a power of two */
4417 	while (sc->num_slices & (sc->num_slices - 1))
4418 		sc->num_slices--;
4419 
4420 	if (mxge_verbose)
4421 		device_printf(sc->dev, "using %d slices\n",
4422 			      sc->num_slices);
4423 
4424 	return;
4425 
4426 abort_with_fw:
4427 	sc->fw_name = old_fw;
4428 	(void) mxge_load_firmware(sc, 0);
4429 }
4430 
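/*
 * Editor's sketch: the decrement loop in mxge_slice_probe() rounds
 * num_slices down to a power of two one step at a time; an equivalent
 * form walks the powers of two directly.  Hypothetical helper,
 * requires n >= 1:
 */
static inline int
round_down_pow2(int n)
{
	int p = 1;

	while (p <= n / 2)	/* highest power of two <= n */
		p *= 2;
	return (p);
}
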
4431 static int
4432 mxge_add_msix_irqs(mxge_softc_t *sc)
4433 {
4434 	size_t bytes;
4435 	int count, err, i, rid;
4436 
4437 	rid = PCIR_BAR(2);
4438 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
4439 						    &rid, RF_ACTIVE);
4440 
4441 	if (sc->msix_table_res == NULL) {
4442 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
4443 		return ENXIO;
4444 	}
4445 
4446 	count = sc->num_slices;
4447 	err = pci_alloc_msix(sc->dev, &count);
4448 	if (err != 0) {
4449 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
4450 			      "err = %d\n", sc->num_slices, err);
4451 		goto abort_with_msix_table;
4452 	}
4453 	if (count < sc->num_slices) {
4454 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
4455 			      sc->num_slices, count);
4456 		device_printf(sc->dev,
4457 			      "Try setting hw.mxge.max_slices to %d\n",
4458 			      count);
4459 		err = ENOSPC;
4460 		goto abort_with_msix;
4461 	}
4462 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
4463 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4464 	if (sc->msix_irq_res == NULL) {
4465 		err = ENOMEM;
4466 		goto abort_with_msix;
4467 	}
4468 
4469 	for (i = 0; i < sc->num_slices; i++) {
4470 		rid = i + 1;
4471 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
4472 							  SYS_RES_IRQ,
4473 							  &rid, RF_ACTIVE);
4474 		if (sc->msix_irq_res[i] == NULL) {
4475 			device_printf(sc->dev, "couldn't allocate IRQ res"
4476 				      " for message %d\n", i);
4477 			err = ENXIO;
4478 			goto abort_with_res;
4479 		}
4480 	}
4481 
4482 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
4483 	sc->msix_ih =  malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
4484 
4485 	for (i = 0; i < sc->num_slices; i++) {
4486 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
4487 				     INTR_TYPE_NET | INTR_MPSAFE,
4488 #if __FreeBSD_version > 700030
4489 				     NULL,
4490 #endif
4491 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
4492 		if (err != 0) {
4493 			device_printf(sc->dev, "couldn't setup intr for "
4494 				      "message %d\n", i);
4495 			goto abort_with_intr;
4496 		}
4497 		bus_describe_intr(sc->dev, sc->msix_irq_res[i],
4498 				  sc->msix_ih[i], "s%d", i);
4499 	}
4500 
4501 	if (mxge_verbose) {
4502 		device_printf(sc->dev, "using %d msix IRQs:",
4503 			      sc->num_slices);
4504 		for (i = 0; i < sc->num_slices; i++)
4505 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
4506 		printf("\n");
4507 	}
4508 	return (0);
4509 
4510 abort_with_intr:
4511 	for (i = 0; i < sc->num_slices; i++) {
4512 		if (sc->msix_ih[i] != NULL) {
4513 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4514 					  sc->msix_ih[i]);
4515 			sc->msix_ih[i] = NULL;
4516 		}
4517 	}
4518 	free(sc->msix_ih, M_DEVBUF);
4519 
4520 
4521 abort_with_res:
4522 	for (i = 0; i < sc->num_slices; i++) {
4523 		rid = i + 1;
4524 		if (sc->msix_irq_res[i] != NULL)
4525 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4526 					     sc->msix_irq_res[i]);
4527 		sc->msix_irq_res[i] = NULL;
4528 	}
4529 	free(sc->msix_irq_res, M_DEVBUF);
4530 
4531 
4532 abort_with_msix:
4533 	pci_release_msi(sc->dev);
4534 
4535 abort_with_msix_table:
4536 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4537 			     sc->msix_table_res);
4538 
4539 	return err;
4540 }
4541 
4542 static int
4543 mxge_add_single_irq(mxge_softc_t *sc)
4544 {
4545 	int count, err, rid;
4546 
4547 	count = pci_msi_count(sc->dev);
4548 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4549 		rid = 1;
4550 	} else {
4551 		rid = 0;
4552 		sc->legacy_irq = 1;
4553 	}
4554 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4555 					 1, RF_SHAREABLE | RF_ACTIVE);
4556 	if (sc->irq_res == NULL) {
4557 		device_printf(sc->dev, "could not alloc interrupt\n");
4558 		return ENXIO;
4559 	}
4560 	if (mxge_verbose)
4561 		device_printf(sc->dev, "using %s irq %ld\n",
4562 			      sc->legacy_irq ? "INTx" : "MSI",
4563 			      rman_get_start(sc->irq_res));
4564 	err = bus_setup_intr(sc->dev, sc->irq_res,
4565 			     INTR_TYPE_NET | INTR_MPSAFE,
4566 #if __FreeBSD_version > 700030
4567 			     NULL,
4568 #endif
4569 			     mxge_intr, &sc->ss[0], &sc->ih);
4570 	if (err != 0) {
4571 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4572 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4573 		if (!sc->legacy_irq)
4574 			pci_release_msi(sc->dev);
4575 	}
4576 	return err;
4577 }
4578 
4579 static void
4580 mxge_rem_msix_irqs(mxge_softc_t *sc)
4581 {
4582 	int i, rid;
4583 
4584 	for (i = 0; i < sc->num_slices; i++) {
4585 		if (sc->msix_ih[i] != NULL) {
4586 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4587 					  sc->msix_ih[i]);
4588 			sc->msix_ih[i] = NULL;
4589 		}
4590 	}
4591 	free(sc->msix_ih, M_DEVBUF);
4592 
4593 	for (i = 0; i < sc->num_slices; i++) {
4594 		rid = i + 1;
4595 		if (sc->msix_irq_res[i] != NULL)
4596 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4597 					     sc->msix_irq_res[i]);
4598 		sc->msix_irq_res[i] = NULL;
4599 	}
4600 	free(sc->msix_irq_res, M_DEVBUF);
4601 
4602 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4603 			     sc->msix_table_res);
4604 
4605 	pci_release_msi(sc->dev);
4606 	return;
4607 }
4608 
4609 static void
4610 mxge_rem_single_irq(mxge_softc_t *sc)
4611 {
4612 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4613 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4614 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4615 	if (!sc->legacy_irq)
4616 		pci_release_msi(sc->dev);
4617 }
4618 
4619 static void
4620 mxge_rem_irq(mxge_softc_t *sc)
4621 {
4622 	if (sc->num_slices > 1)
4623 		mxge_rem_msix_irqs(sc);
4624 	else
4625 		mxge_rem_single_irq(sc);
4626 }
4627 
4628 static int
4629 mxge_add_irq(mxge_softc_t *sc)
4630 {
4631 	int err;
4632 
4633 	if (sc->num_slices > 1)
4634 		err = mxge_add_msix_irqs(sc);
4635 	else
4636 		err = mxge_add_single_irq(sc);
4637 
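	/* editor's note: the "0 &&" below deliberately short-circuits
	   this MSI-X re-add pass, leaving it disabled */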
4638 	if (0 && err == 0 && sc->num_slices > 1) {
4639 		mxge_rem_msix_irqs(sc);
4640 		err = mxge_add_msix_irqs(sc);
4641 	}
4642 	return err;
4643 }
4644 
4645 
4646 static int
4647 mxge_attach(device_t dev)
4648 {
4649 	mxge_softc_t *sc = device_get_softc(dev);
4650 	struct ifnet *ifp;
4651 	int err, rid;
4652 
4653 	sc->dev = dev;
4654 	mxge_fetch_tunables(sc);
4655 
4656 	TASK_INIT(&sc->watchdog_task, 1, mxge_watchdog_task, sc);
4657 	sc->tq = taskqueue_create("mxge_taskq", M_WAITOK,
4658 				  taskqueue_thread_enqueue, &sc->tq);
4659 	if (sc->tq == NULL) {
4660 		err = ENOMEM;
4661 		goto abort_with_nothing;
4662 	}
4663 
4664 	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
4665 				 1,			/* alignment */
4666 				 0,			/* boundary */
4667 				 BUS_SPACE_MAXADDR,	/* low */
4668 				 BUS_SPACE_MAXADDR,	/* high */
4669 				 NULL, NULL,		/* filter */
4670 				 65536 + 256,		/* maxsize */
4671 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4672 				 65536,			/* maxsegsize */
4673 				 0,			/* flags */
4674 				 NULL, NULL,		/* lock */
4675 				 &sc->parent_dmat);	/* tag */
4676 
4677 	if (err != 0) {
4678 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4679 			      err);
4680 		goto abort_with_tq;
4681 	}
4682 
4683 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4684 	if (ifp == NULL) {
4685 		device_printf(dev, "can not if_alloc()\n");
4686 		err = ENOSPC;
4687 		goto abort_with_parent_dmat;
4688 	}
4689 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4690 
4691 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4692 		 device_get_nameunit(dev));
4693 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4694 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4695 		 "%s:drv", device_get_nameunit(dev));
4696 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4697 		 MTX_NETWORK_LOCK, MTX_DEF);
4698 
4699 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4700 
4701 	mxge_setup_cfg_space(sc);
4702 
4703 	/* Map the board into the kernel */
4704 	rid = PCIR_BARS;
4705 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4706 					 ~0, 1, RF_ACTIVE);
4707 	if (sc->mem_res == NULL) {
4708 		device_printf(dev, "could not map memory\n");
4709 		err = ENXIO;
4710 		goto abort_with_lock;
4711 	}
4712 	sc->sram = rman_get_virtual(sc->mem_res);
4713 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4714 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4715 		device_printf(dev, "impossible memory region size %ld\n",
4716 			      rman_get_size(sc->mem_res));
4717 		err = ENXIO;
4718 		goto abort_with_mem_res;
4719 	}
4720 
4721 	/* make a NUL-terminated copy of the EEPROM strings section of
4722 	   LANai SRAM */
4723 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4724 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4725 				rman_get_bushandle(sc->mem_res),
4726 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4727 				sc->eeprom_strings,
4728 				MXGE_EEPROM_STRINGS_SIZE - 2);
4729 	err = mxge_parse_strings(sc);
4730 	if (err != 0)
4731 		goto abort_with_mem_res;
4732 
4733 	/* Enable write combining for efficient use of PCIe bus */
4734 	mxge_enable_wc(sc);
4735 
4736 	/* Allocate the out of band dma memory */
4737 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4738 			     sizeof (mxge_cmd_t), 64);
4739 	if (err != 0)
4740 		goto abort_with_mem_res;
4741 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4742 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4743 	if (err != 0)
4744 		goto abort_with_cmd_dma;
4745 
4746 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4747 	if (err != 0)
4748 		goto abort_with_zeropad_dma;
4749 
4750 	/* select & load the firmware */
4751 	err = mxge_select_firmware(sc);
4752 	if (err != 0)
4753 		goto abort_with_dmabench;
4754 	sc->intr_coal_delay = mxge_intr_coal_delay;
4755 
4756 	mxge_slice_probe(sc);
4757 	err = mxge_alloc_slices(sc);
4758 	if (err != 0)
4759 		goto abort_with_dmabench;
4760 
4761 	err = mxge_reset(sc, 0);
4762 	if (err != 0)
4763 		goto abort_with_slices;
4764 
4765 	err = mxge_alloc_rings(sc);
4766 	if (err != 0) {
4767 		device_printf(sc->dev, "failed to allocate rings\n");
4768 		goto abort_with_slices;
4769 	}
4770 
4771 	err = mxge_add_irq(sc);
4772 	if (err != 0) {
4773 		device_printf(sc->dev, "failed to add irq\n");
4774 		goto abort_with_rings;
4775 	}
4776 
4777 	if_initbaudrate(ifp, IF_Gbps(10));
4778 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4779 		IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
4780 #ifdef INET
4781 	ifp->if_capabilities |= IFCAP_LRO;
4782 #endif
4783 
4784 #ifdef MXGE_NEW_VLAN_API
4785 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4786 
4787 	/* Only FW 1.4.32 and newer can do TSO over vlans */
4788 	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
4789 	    sc->fw_ver_tiny >= 32)
4790 		ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
4791 #endif
4792 
4793 	sc->max_mtu = mxge_max_mtu(sc);
4794 	if (sc->max_mtu >= 9000)
4795 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4796 	else
4797 		device_printf(dev, "MTU limited to %d.  Install "
4798 			      "latest firmware for 9000 byte jumbo support\n",
4799 			      sc->max_mtu - ETHER_HDR_LEN);
4800 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4801 	ifp->if_capenable = ifp->if_capabilities;
4802 	if (sc->lro_cnt == 0)
4803 		ifp->if_capenable &= ~IFCAP_LRO;
4804 	sc->csum_flag = 1;
4805 	ifp->if_init = mxge_init;
4806 	ifp->if_softc = sc;
4807 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4808 	ifp->if_ioctl = mxge_ioctl;
4809 	ifp->if_start = mxge_start;
4810 	/* Initialise the ifmedia structure */
4811 	ifmedia_init(&sc->media, 0, mxge_media_change,
4812 		     mxge_media_status);
4813 	mxge_media_init(sc);
4814 	mxge_media_probe(sc);
4815 	sc->dying = 0;
4816 	ether_ifattach(ifp, sc->mac_addr);
4817 	/* ether_ifattach sets mtu to ETHERMTU */
4818 	if (mxge_initial_mtu != ETHERMTU)
4819 		mxge_change_mtu(sc, mxge_initial_mtu);
4820 
4821 	mxge_add_sysctls(sc);
4822 #ifdef IFNET_BUF_RING
4823 	ifp->if_transmit = mxge_transmit;
4824 	ifp->if_qflush = mxge_qflush;
4825 #endif
4826 	taskqueue_start_threads(&sc->tq, 1, PI_NET, "%s taskq",
4827 				device_get_nameunit(sc->dev));
4828 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
4829 	return 0;
4830 
4831 abort_with_rings:
4832 	mxge_free_rings(sc);
4833 abort_with_slices:
4834 	mxge_free_slices(sc);
4835 abort_with_dmabench:
4836 	mxge_dma_free(&sc->dmabench_dma);
4837 abort_with_zeropad_dma:
4838 	mxge_dma_free(&sc->zeropad_dma);
4839 abort_with_cmd_dma:
4840 	mxge_dma_free(&sc->cmd_dma);
4841 abort_with_mem_res:
4842 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4843 abort_with_lock:
4844 	pci_disable_busmaster(dev);
4845 	mtx_destroy(&sc->cmd_mtx);
4846 	mtx_destroy(&sc->driver_mtx);
4847 	if_free(ifp);
4848 abort_with_parent_dmat:
4849 	bus_dma_tag_destroy(sc->parent_dmat);
4850 abort_with_tq:
4851 	if (sc->tq != NULL) {
4852 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4853 		taskqueue_free(sc->tq);
4854 		sc->tq = NULL;
4855 	}
4856 abort_with_nothing:
4857 	return err;
4858 }
4859 
4860 static int
4861 mxge_detach(device_t dev)
4862 {
4863 	mxge_softc_t *sc = device_get_softc(dev);
4864 
4865 	if (mxge_vlans_active(sc)) {
4866 		device_printf(sc->dev,
4867 			      "Detach vlans before removing module\n");
4868 		return EBUSY;
4869 	}
4870 	mtx_lock(&sc->driver_mtx);
4871 	sc->dying = 1;
4872 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4873 		mxge_close(sc, 0);
4874 	mtx_unlock(&sc->driver_mtx);
4875 	ether_ifdetach(sc->ifp);
4876 	if (sc->tq != NULL) {
4877 		taskqueue_drain(sc->tq, &sc->watchdog_task);
4878 		taskqueue_free(sc->tq);
4879 		sc->tq = NULL;
4880 	}
4881 	callout_drain(&sc->co_hdl);
4882 	ifmedia_removeall(&sc->media);
4883 	mxge_dummy_rdma(sc, 0);
4884 	mxge_rem_sysctls(sc);
4885 	mxge_rem_irq(sc);
4886 	mxge_free_rings(sc);
4887 	mxge_free_slices(sc);
4888 	mxge_dma_free(&sc->dmabench_dma);
4889 	mxge_dma_free(&sc->zeropad_dma);
4890 	mxge_dma_free(&sc->cmd_dma);
4891 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4892 	pci_disable_busmaster(dev);
4893 	mtx_destroy(&sc->cmd_mtx);
4894 	mtx_destroy(&sc->driver_mtx);
4895 	if_free(sc->ifp);
4896 	bus_dma_tag_destroy(sc->parent_dmat);
4897 	return 0;
4898 }
4899 
4900 static int
4901 mxge_shutdown(device_t dev)
4902 {
4903 	return 0;
4904 }
4905 
4906 /*
4907   This file uses Myri10GE driver indentation.
4908 
4909   Local Variables:
4910   c-file-style:"linux"
4911   tab-width:8
4912   End:
4913 */
4914