/******************************************************************************

Copyright (c) 2006-2008, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus. */
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		sc->wc = 0;
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
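/*
 * For illustration (example data only, not from a real board), the
 * block is a sequence of NUL-terminated records:
 *
 *   "SN=123456\0MAC=00:60:dd:47:ab:cd\0PC=EXAMPLE-CODE\0\0"
 *
 * mxge_parse_strings() below walks it record by record; for "MAC="
 * the initial "ptr += 1" plus the "ptr += 3" per octet lands on each
 * hex pair in turn (first skipping "AC=", then each pair and its ':'
 * separator) before handing it to strtoul().
 */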

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   The test below is commented out because it is believed that
	   doing a config read/write beyond 0xff will access the config
	   space of the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended PCIe config space access.
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset; that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge,
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);
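
	/*
	 * Worked example with hypothetical geometry: for base
	 * 0xe0000000, bus 5, slot 0, func 0, the expression above
	 * yields 0xe0000000 + 5 * 0x00100000 + (0 + 8 * 0) * 0x00001000
	 * = 0xe0500000.
	 */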

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);

	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return are the number of transfers completed.
	 * The lower 16 bits are the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
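
	/*
	 * Worked example with illustrative numbers: cmd.data0 =
	 * 0x00640190 would mean 0x64 = 100 transfers completed in
	 * 0x190 = 400 half-microsecond ticks (200us).  With len =
	 * 4096, the expression below gives (100 * 4096 * 2) / 400 =
	 * 2048 bytes per microsecond, which is why the result is
	 * exported via sysctl as MB/s.
	 */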

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
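
/*
 * To summarize the policy above as a sketch:
 *
 *	completions aligned (or ECRC enabled):	eth_z8e,  tx_boundary 4096
 *	completions possibly unaligned:		ethp_z8e, tx_boundary 2048
 *
 * mxge_firmware_probe() below tries the aligned case first, and the
 * caller falls back to the unaligned firmware when the
 * MXGEFW_CMD_UNALIGNED_TEST DMA test fails.
 */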

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
	const char *ro_char;
	char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
	void *ptr;

	ptr = malloc(items * size, M_TEMP, M_NOWAIT);
	return ptr;
}

static void
z_free(void *nil, void *ptr)
{
	free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL)
		goto abort_with_zs;
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
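	/* Worked example of the round-up-and-mask trick above: had
	 * buf_bytes started at 0x1003, adding 7 gives 0x100a and
	 * masking with ~7UL gives 0x1008, the first 8-byte-aligned
	 * address within the 72-byte scratch buffer. */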

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	mb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		mb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts */
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists */
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status: "
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int intr_coal_delay;
	int err;

	sc = arg1;
	intr_coal_delay = sc->intr_coal_delay;
	err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
	if (err != 0) {
		return err;
	}
	if (intr_coal_delay == sc->intr_coal_delay)
		return 0;

	if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int enabled;
	int err;

	sc = arg1;
	enabled = sc->pause;
	err = sysctl_handle_int(oidp, &enabled, arg2, req);
	if (err != 0) {
		return err;
	}
	if (enabled == sc->pause)
		return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		callout_stop(&sc->co_hdl);
		mxge_close(sc);
		err = mxge_open(sc);
		if (err == 0)
			callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
	int err;

	if (arg1 == NULL)
		return EFAULT;
	arg2 = be32toh(*(int *)arg1);
	arg1 = NULL;
	err = sysctl_handle_int(oidp, arg1, arg2, req);

	return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable or disable flow control pause frames");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
	int idx, starting_slot;
	starting_slot = tx->req;
	while (cnt > 1) {
		cnt--;
		idx = (starting_slot + cnt) & tx->mask;
		mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
		mb();
	}
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.  We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */
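
/*
 * A sketch of the arithmetic behind the 32-byte limit: each
 * mcp_kreq_ether_send_t is 16 bytes, so the loop below copies two
 * requests per mxge_pio_copy()/mb() pair, keeping every
 * write-combining burst at 32 bytes and thereby out of the NIC's
 * software PIO handler.
 */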

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
		  int cnt)
{
	int idx, i;
	uint32_t *src_ints;
	volatile uint32_t *dst_ints;
	mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
	mb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		for (i = 0; i < (cnt - 1); i += 2) {
			mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
			mb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/* submit all but the first request, and ensure
		   that it is submitted below */
		mxge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/* submit the first request */
		mxge_pio_copy(dstp, srcp, sizeof(*src));
		mb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags = last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (volatile uint32_t *)dst;
	dst_ints += 3;
	*dst_ints = *src_ints;
	tx->req += cnt;
	mb();
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */
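
	/* Worked example (assuming the common case of a 14-byte
	 * Ethernet header and 20-byte IP and TCP headers): cum_len
	 * starts at -54 below, stays negative while descriptors are
	 * still covering header bytes, and crosses zero exactly where
	 * the TSO payload begins. */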

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */
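
	/* For illustration: if a run of three descriptors ends in a
	 * chop, the "(req-rdma_count)->rdma_count = rdma_count + 1"
	 * store in the loop below reaches back and stamps the first
	 * descriptor of that run with the run length so far, after
	 * which rdma_count restarts for the next segment. */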
1810 
1811 	while (busdma_seg_cnt) {
1812 		/* Break the busdma segment up into pieces*/
1813 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1814 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1815 		len = seg->ds_len;
1816 
1817 		while (len) {
1818 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1819 			seglen = len;
1820 			cum_len_next = cum_len + seglen;
1821 			(req-rdma_count)->rdma_count = rdma_count + 1;
1822 			if (__predict_true(cum_len >= 0)) {
1823 				/* payload */
1824 				chop = (cum_len_next > mss);
1825 				cum_len_next = cum_len_next % mss;
1826 				next_is_first = (cum_len_next == 0);
1827 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1828 				flags_next |= next_is_first *
1829 					MXGEFW_FLAGS_FIRST;
1830 				rdma_count |= -(chop | next_is_first);
1831 				rdma_count += chop & !next_is_first;
1832 			} else if (cum_len_next >= 0) {
1833 				/* header ends */
1834 				rdma_count = -1;
1835 				cum_len_next = 0;
1836 				seglen = -cum_len;
1837 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1838 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1839 					MXGEFW_FLAGS_FIRST |
1840 					(small * MXGEFW_FLAGS_SMALL);
1841 			}
1842 
1843 			req->addr_high = high_swapped;
1844 			req->addr_low = htobe32(low);
1845 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1846 			req->pad = 0;
1847 			req->rdma_count = 1;
1848 			req->length = htobe16(seglen);
1849 			req->cksum_offset = cksum_offset;
1850 			req->flags = flags | ((cum_len & 1) *
1851 					      MXGEFW_FLAGS_ALIGN_ODD);
1852 			low += seglen;
1853 			len -= seglen;
1854 			cum_len = cum_len_next;
1855 			flags = flags_next;
1856 			req++;
1857 			cnt++;
1858 			rdma_count++;
1859 			if (__predict_false(cksum_offset > seglen))
1860 				cksum_offset -= seglen;
1861 			else
1862 				cksum_offset = 0;
1863 			if (__predict_false(cnt > tx->max_desc))
1864 				goto drop;
1865 		}
1866 		busdma_seg_cnt--;
1867 		seg++;
1868 	}
1869 	(req-rdma_count)->rdma_count = rdma_count;
1870 
1871 	do {
1872 		req--;
1873 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1874 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1875 
1876 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1877 	mxge_submit_req(tx, tx->req_list, cnt);
1878 	return;
1879 
1880 drop:
1881 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1882 	m_freem(m);
1883 	ss->sc->ifp->if_oerrors++;
1884 	if (!once) {
1885 		printf("tx->max_desc exceeded via TSO!\n");
1886 		printf("mss = %d, %ld, %d!\n", mss,
1887 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1888 		once = 1;
1889 	}
1890 	return;
1891 
1892 }
1893 
1894 #endif /* IFCAP_TSO4 */
1895 
1896 #ifdef MXGE_NEW_VLAN_API
1897 /*
1898  * We reproduce the software vlan tag insertion from
1899  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1900  * vlan tag insertion. We need to advertise this in order to have the
1901  * vlan interface respect our csum offload flags.
1902  */
1903 static struct mbuf *
1904 mxge_vlan_tag_insert(struct mbuf *m)
1905 {
1906 	struct ether_vlan_header *evl;
1907 
1908 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1909 	if (__predict_false(m == NULL))
1910 		return NULL;
1911 	if (m->m_len < sizeof(*evl)) {
1912 		m = m_pullup(m, sizeof(*evl));
1913 		if (__predict_false(m == NULL))
1914 			return NULL;
1915 	}
1916 	/*
1917 	 * Transform the Ethernet header into an Ethernet header
1918 	 * with 802.1Q encapsulation.
1919 	 */
1920 	evl = mtod(m, struct ether_vlan_header *);
1921 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1922 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1923 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1924 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1925 	m->m_flags &= ~M_VLANTAG;
1926 	return m;
1927 }
1928 #endif /* MXGE_NEW_VLAN_API */
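/*
 * A minimal sketch of the header transformation done above (byte
 * offsets; assumes an untagged input frame):
 *
 *   before: [dst 0-5][src 6-11][type 12-13][payload ...]
 *   after:  [dst 0-5][src 6-11][0x8100 12-13][tag 14-15][type 16-17][payload ...]
 *
 * M_PREPEND opens ETHER_VLAN_ENCAP_LEN (4) bytes at the front, the
 * bcopy slides only the two MAC addresses into that space, and the
 * original ethertype stays put as the encapsulated type.
 */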
1929 
1930 static void
1931 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1932 {
1933 	mxge_softc_t *sc;
1934 	mcp_kreq_ether_send_t *req;
1935 	bus_dma_segment_t *seg;
1936 	struct mbuf *m_tmp;
1937 	struct ifnet *ifp;
1938 	mxge_tx_ring_t *tx;
1939 	struct ip *ip;
1940 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1941 	uint16_t pseudo_hdr_offset;
1942 	uint8_t flags, cksum_offset;
1943 
1944 
1945 	sc = ss->sc;
1946 	ifp = sc->ifp;
1947 	tx = &ss->tx;
1948 
1949 	ip_off = sizeof (struct ether_header);
1950 #ifdef MXGE_NEW_VLAN_API
1951 	if (m->m_flags & M_VLANTAG) {
1952 		m = mxge_vlan_tag_insert(m);
1953 		if (__predict_false(m == NULL))
1954 			goto drop;
1955 		ip_off += ETHER_VLAN_ENCAP_LEN;
1956 	}
1957 #endif
1958 	/* (try to) map the frame for DMA */
1959 	idx = tx->req & tx->mask;
1960 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1961 				      m, tx->seg_list, &cnt,
1962 				      BUS_DMA_NOWAIT);
1963 	if (__predict_false(err == EFBIG)) {
1964 		/* Too many segments in the chain.  Try
1965 		   to defrag */
1966 		m_tmp = m_defrag(m, M_NOWAIT);
1967 		if (m_tmp == NULL) {
1968 			goto drop;
1969 		}
1970 		ss->tx.defrag++;
1971 		m = m_tmp;
1972 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1973 					      tx->info[idx].map,
1974 					      m, tx->seg_list, &cnt,
1975 					      BUS_DMA_NOWAIT);
1976 	}
1977 	if (__predict_false(err != 0)) {
1978 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1979 			      " packet len = %d\n", err, m->m_pkthdr.len);
1980 		goto drop;
1981 	}
1982 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1983 			BUS_DMASYNC_PREWRITE);
1984 	tx->info[idx].m = m;
1985 
1986 #if IFCAP_TSO4
1987 	/* TSO is different enough, we handle it in another routine */
1988 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1989 		mxge_encap_tso(ss, m, cnt, ip_off);
1990 		return;
1991 	}
1992 #endif
1993 
1994 	req = tx->req_list;
1995 	cksum_offset = 0;
1996 	pseudo_hdr_offset = 0;
1997 	flags = MXGEFW_FLAGS_NO_TSO;
1998 
1999 	/* checksum offloading? */
2000 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
2001 		/* ensure ip header is in first mbuf, copy
2002 		   it to a scratch buffer if not */
2003 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
2004 			m_copydata(m, 0, ip_off + sizeof (*ip),
2005 				   ss->scratch);
2006 			ip = (struct ip *)(ss->scratch + ip_off);
2007 		} else {
2008 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2009 		}
2010 		cksum_offset = ip_off + (ip->ip_hl << 2);
2011 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2012 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2013 		req->cksum_offset = cksum_offset;
2014 		flags |= MXGEFW_FLAGS_CKSUM;
2015 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2016 	} else {
2017 		odd_flag = 0;
2018 	}
2019 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2020 		flags |= MXGEFW_FLAGS_SMALL;
2021 
2022 	/* convert segments into a request list */
2023 	cum_len = 0;
2024 	seg = tx->seg_list;
2025 	req->flags = MXGEFW_FLAGS_FIRST;
2026 	for (i = 0; i < cnt; i++) {
2027 		req->addr_low =
2028 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2029 		req->addr_high =
2030 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2031 		req->length = htobe16(seg->ds_len);
2032 		req->cksum_offset = cksum_offset;
2033 		if (cksum_offset > seg->ds_len)
2034 			cksum_offset -= seg->ds_len;
2035 		else
2036 			cksum_offset = 0;
2037 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2038 		req->pad = 0; /* complete solid 16-byte block */
2039 		req->rdma_count = 1;
2040 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2041 		cum_len += seg->ds_len;
2042 		seg++;
2043 		req++;
2044 		req->flags = 0;
2045 	}
2046 	req--;
2047 	/* pad runts to 60 bytes */
2048 	/* pad runts to 60 bytes (64 on the wire, counting the 4-byte FCS) */
2049 		req++;
2050 		req->addr_low =
2051 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2052 		req->addr_high =
2053 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2054 		req->length = htobe16(60 - cum_len);
2055 		req->cksum_offset = 0;
2056 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2057 		req->pad = 0; /* complete solid 16-byte block */
2058 		req->rdma_count = 1;
2059 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2060 		cnt++;
2061 	}
2062 
2063 	tx->req_list[0].rdma_count = cnt;
2064 #if 0
2065 	/* print what the firmware will see */
2066 	for (i = 0; i < cnt; i++) {
2067 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2068 		    "cso:%d, flags:0x%x, rdma:%d\n",
2069 		    i, (int)ntohl(tx->req_list[i].addr_high),
2070 		    (int)ntohl(tx->req_list[i].addr_low),
2071 		    (int)ntohs(tx->req_list[i].length),
2072 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2073 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2074 		    tx->req_list[i].rdma_count);
2075 	}
2076 	printf("--------------\n");
2077 #endif
2078 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2079 	mxge_submit_req(tx, tx->req_list, cnt);
2080 	return;
2081 
2082 drop:
2083 	m_freem(m);
2084 	ifp->if_oerrors++;
2085 	return;
2086 }
2087 
2088 
2089 
2090 
2091 static inline void
2092 mxge_start_locked(struct mxge_slice_state *ss)
2093 {
2094 	mxge_softc_t *sc;
2095 	struct mbuf *m;
2096 	struct ifnet *ifp;
2097 	mxge_tx_ring_t *tx;
2098 
2099 	sc = ss->sc;
2100 	ifp = sc->ifp;
2101 	tx = &ss->tx;
2102 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2103 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2104 		if (m == NULL) {
2105 			return;
2106 		}
2107 		/* let BPF see it */
2108 		BPF_MTAP(ifp, m);
2109 
2110 		/* give it to the nic */
2111 		mxge_encap(ss, m);
2112 	}
2113 	/* ran out of transmit slots */
2114 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2115 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2116 		tx->stall++;
2117 	}
2118 }
2119 
2120 static void
2121 mxge_start(struct ifnet *ifp)
2122 {
2123 	mxge_softc_t *sc = ifp->if_softc;
2124 	struct mxge_slice_state *ss;
2125 
2126 	/* only use the first slice for now */
2127 	ss = &sc->ss[0];
2128 	mtx_lock(&ss->tx.mtx);
2129 	mxge_start_locked(ss);
2130 	mtx_unlock(&ss->tx.mtx);
2131 }
2132 
2133 /*
2134  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2135  * at most 32 bytes at a time, so as to avoid involving the software
2136  * pio handler in the nic.   We re-write the first segment's low
2137  * DMA address to mark it valid only after we write the entire chunk
2138  * in a burst
2139  */
2140 static inline void
2141 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2142 		mcp_kreq_ether_recv_t *src)
2143 {
2144 	uint32_t low;
2145 
2146 	low = src->addr_low;
2147 	src->addr_low = 0xffffffff;
2148 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2149 	mb();
2150 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2151 	mb();
2152 	src->addr_low = low;
2153 	dst->addr_low = low;
2154 	mb();
2155 }
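
#if 0
/*
 * Disabled usage sketch (mirrors how mxge_get_buf_small() below calls
 * this): the firmware treats addr_low == 0xffffffff as "slot not yet
 * valid", so a group of 8 descriptors only becomes visible once the
 * final dst->addr_low store lands, after both 32-byte PIO bursts.
 */
static void
mxge_submit_8rx_example(mxge_rx_ring_t *rx, int idx)
{
	if ((idx & 7) == 7)
		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
}
#endif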
2156 
2157 static int
2158 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2159 {
2160 	bus_dma_segment_t seg;
2161 	struct mbuf *m;
2162 	mxge_rx_ring_t *rx = &ss->rx_small;
2163 	int cnt, err;
2164 
2165 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2166 	if (m == NULL) {
2167 		rx->alloc_fail++;
2168 		err = ENOBUFS;
2169 		goto done;
2170 	}
2171 	m->m_len = MHLEN;
2172 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2173 				      &seg, &cnt, BUS_DMA_NOWAIT);
2174 	if (err != 0) {
2175 		m_free(m);
2176 		goto done;
2177 	}
2178 	rx->info[idx].m = m;
2179 	rx->shadow[idx].addr_low =
2180 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2181 	rx->shadow[idx].addr_high =
2182 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2183 
2184 done:
2185 	if ((idx & 7) == 7)
2186 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2187 	return err;
2188 }
2189 
2190 static int
2191 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2192 {
2193 	bus_dma_segment_t seg[3];
2194 	struct mbuf *m;
2195 	mxge_rx_ring_t *rx = &ss->rx_big;
2196 	int cnt, err, i;
2197 
2198 	if (rx->cl_size == MCLBYTES)
2199 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2200 	else
2201 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2202 	if (m == NULL) {
2203 		rx->alloc_fail++;
2204 		err = ENOBUFS;
2205 		goto done;
2206 	}
2207 	m->m_len = rx->cl_size;
2208 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2209 				      seg, &cnt, BUS_DMA_NOWAIT);
2210 	if (err != 0) {
2211 		m_free(m);
2212 		goto done;
2213 	}
2214 	rx->info[idx].m = m;
2215 	rx->shadow[idx].addr_low =
2216 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2217 	rx->shadow[idx].addr_high =
2218 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2219 
2220 #if MXGE_VIRT_JUMBOS
2221 	for (i = 1; i < cnt; i++) {
2222 		rx->shadow[idx + i].addr_low =
2223 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2224 		rx->shadow[idx + i].addr_high =
2225 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2226 	}
2227 #endif
2228 
2229 done:
2230 	for (i = 0; i < rx->nbufs; i++) {
2231 		if ((idx & 7) == 7) {
2232 			mxge_submit_8rx(&rx->lanai[idx - 7],
2233 					&rx->shadow[idx - 7]);
2234 		}
2235 		idx++;
2236 	}
2237 	return err;
2238 }
2239 
2240 /*
2241  *  Myri10GE hardware checksums are not valid if the sender
2242  *  padded the frame with non-zero padding.  This is because
2243  *  the firmware just does a simple 16-bit 1s complement
2244  *  checksum across the entire frame, excluding the first 14
2245  *  bytes.  It is best to simply check the checksum and
2246  *  tell the stack about it only if the checksum is good
2247  */
2248 
2249 static inline uint16_t
2250 mxge_rx_csum(struct mbuf *m, int csum)
2251 {
2252 	struct ether_header *eh;
2253 	struct ip *ip;
2254 	uint16_t c;
2255 
2256 	eh = mtod(m, struct ether_header *);
2257 
2258 	/* only deal with IPv4 TCP & UDP for now */
2259 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2260 		return 1;
2261 	ip = (struct ip *)(eh + 1);
2262 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2263 			    ip->ip_p != IPPROTO_UDP))
2264 		return 1;
2265 
2266 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2267 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2268 			    (ip->ip_hl << 2) + ip->ip_p));
2269 	c ^= 0xffff;
2270 	return (c);
2271 }
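
/*
 * Worked example (made-up values): for a good TCP packet, the 1s
 * complement sum of pseudo-header plus TCP segment is 0xffff.  The
 * firmware's csum covers everything past the 14-byte Ethernet header,
 * i.e. the IP header (which itself sums to 0xffff when its checksum
 * is valid) plus the segment, so after in_pseudo() folds in the
 * addresses, protocol, and segment length (ip_len minus the header
 * length), c == 0xffff and c ^= 0xffff yields 0 exactly on success.
 */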
2272 
2273 static void
2274 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2275 {
2276 	struct ether_vlan_header *evl;
2277 	struct ether_header *eh;
2278 	uint32_t partial;
2279 
2280 	evl = mtod(m, struct ether_vlan_header *);
2281 	eh = mtod(m, struct ether_header *);
2282 
2283 	/*
2284 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2285 	 * after what the firmware thought was the end of the ethernet
2286 	 * header.
2287 	 */
2288 
2289 	/* put checksum into host byte order */
2290 	*csum = ntohs(*csum);
2291 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2292 	(*csum) += ~partial;
2293 	(*csum) +=  ((*csum) < ~partial);
2294 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2295 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2296 
2297 	/* restore checksum to network byte order;
2298 	   later consumers expect this */
2299 	*csum = htons(*csum);
2300 
2301 	/* save the tag */
2302 #ifdef MXGE_NEW_VLAN_API
2303 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2304 #else
2305 	{
2306 		struct m_tag *mtag;
2307 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2308 				   M_NOWAIT);
2309 		if (mtag == NULL)
2310 			return;
2311 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2312 		m_tag_prepend(m, mtag);
2313 	}
2314 
2315 #endif
2316 	m->m_flags |= M_VLANTAG;
2317 
2318 	/*
2319 	 * Remove the 802.1q header by copying the Ethernet
2320 	 * addresses over it and adjusting the beginning of
2321 	 * the data in the mbuf.  The encapsulated Ethernet
2322 	 * type field is already in place.
2323 	 */
2324 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2325 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2326 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2327 }
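
/*
 * The adjustment above is 1s complement arithmetic: subtracting the
 * removed 4-byte 802.1q header's 32-bit image ("partial") from the
 * hardware checksum is done by adding ~partial with an end-around
 * carry, then folding the 32-bit accumulator to 16 bits.  The fold is
 * done twice because the first fold can itself carry into the high
 * half.
 */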
2328 
2329 
2330 static inline void
2331 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2332 {
2333 	mxge_softc_t *sc;
2334 	struct ifnet *ifp;
2335 	struct mbuf *m;
2336 	struct ether_header *eh;
2337 	mxge_rx_ring_t *rx;
2338 	bus_dmamap_t old_map;
2339 	int idx;
2340 	uint16_t tcpudp_csum;
2341 
2342 	sc = ss->sc;
2343 	ifp = sc->ifp;
2344 	rx = &ss->rx_big;
2345 	idx = rx->cnt & rx->mask;
2346 	rx->cnt += rx->nbufs;
2347 	/* save a pointer to the received mbuf */
2348 	m = rx->info[idx].m;
2349 	/* try to replace the received mbuf */
2350 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2351 		/* drop the frame -- the old mbuf is re-cycled */
2352 		ifp->if_ierrors++;
2353 		return;
2354 	}
2355 
2356 	/* unmap the received buffer */
2357 	old_map = rx->info[idx].map;
2358 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2359 	bus_dmamap_unload(rx->dmat, old_map);
2360 
2361 	/* swap the bus_dmamap_t's */
2362 	rx->info[idx].map = rx->extra_map;
2363 	rx->extra_map = old_map;
2364 
2365 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2366 	 * aligned */
2367 	m->m_data += MXGEFW_PAD;
2368 
2369 	m->m_pkthdr.rcvif = ifp;
2370 	m->m_len = m->m_pkthdr.len = len;
2371 	ss->ipackets++;
2372 	eh = mtod(m, struct ether_header *);
2373 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2374 		mxge_vlan_tag_remove(m, &csum);
2375 	}
2376 	/* if the checksum is valid, mark it in the mbuf header */
2377 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2378 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2379 			return;
2380 		/* otherwise, it was a UDP frame, or a TCP frame which
2381 		   we could not do LRO on.  Tell the stack that the
2382 		   checksum is good */
2383 		m->m_pkthdr.csum_data = 0xffff;
2384 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2385 	}
2386 	/* pass the frame up the stack */
2387 	(*ifp->if_input)(ifp, m);
2388 }
2389 
2390 static inline void
2391 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2392 {
2393 	mxge_softc_t *sc;
2394 	struct ifnet *ifp;
2395 	struct ether_header *eh;
2396 	struct mbuf *m;
2397 	mxge_rx_ring_t *rx;
2398 	bus_dmamap_t old_map;
2399 	int idx;
2400 	uint16_t tcpudp_csum;
2401 
2402 	sc = ss->sc;
2403 	ifp = sc->ifp;
2404 	rx = &ss->rx_small;
2405 	idx = rx->cnt & rx->mask;
2406 	rx->cnt++;
2407 	/* save a pointer to the received mbuf */
2408 	m = rx->info[idx].m;
2409 	/* try to replace the received mbuf */
2410 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2411 		/* drop the frame -- the old mbuf is re-cycled */
2412 		ifp->if_ierrors++;
2413 		return;
2414 	}
2415 
2416 	/* unmap the received buffer */
2417 	old_map = rx->info[idx].map;
2418 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2419 	bus_dmamap_unload(rx->dmat, old_map);
2420 
2421 	/* swap the bus_dmamap_t's */
2422 	rx->info[idx].map = rx->extra_map;
2423 	rx->extra_map = old_map;
2424 
2425 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2426 	 * aligned */
2427 	m->m_data += MXGEFW_PAD;
2428 
2429 	m->m_pkthdr.rcvif = ifp;
2430 	m->m_len = m->m_pkthdr.len = len;
2431 	ss->ipackets++;
2432 	eh = mtod(m, struct ether_header *);
2433 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2434 		mxge_vlan_tag_remove(m, &csum);
2435 	}
2436 	/* if the checksum is valid, mark it in the mbuf header */
2437 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2438 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2439 			return;
2440 		/* otherwise, it was a UDP frame, or a TCP frame which
2441 		   we could not do LRO on.  Tell the stack that the
2442 		   checksum is good */
2443 		m->m_pkthdr.csum_data = 0xffff;
2444 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2445 	}
2446 	/* pass the frame up the stack */
2447 	(*ifp->if_input)(ifp, m);
2448 }
2449 
2450 static inline void
2451 mxge_clean_rx_done(struct mxge_slice_state *ss)
2452 {
2453 	mxge_rx_done_t *rx_done = &ss->rx_done;
2454 	struct lro_entry *lro;
2455 	int limit = 0;
2456 	uint16_t length;
2457 	uint16_t checksum;
2458 
2459 
2460 	while (rx_done->entry[rx_done->idx].length != 0) {
2461 		length = ntohs(rx_done->entry[rx_done->idx].length);
2462 		rx_done->entry[rx_done->idx].length = 0;
2463 		checksum = rx_done->entry[rx_done->idx].checksum;
2464 		if (length <= (MHLEN - MXGEFW_PAD))
2465 			mxge_rx_done_small(ss, length, checksum);
2466 		else
2467 			mxge_rx_done_big(ss, length, checksum);
2468 		rx_done->cnt++;
2469 		rx_done->idx = rx_done->cnt & rx_done->mask;
2470 
2471 		/* limit potential for livelock */
2472 		if (__predict_false(++limit > rx_done->mask / 2))
2473 			break;
2474 	}
2475 	while (!SLIST_EMPTY(&ss->lro_active)) {
2476 		lro = SLIST_FIRST(&ss->lro_active);
2477 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2478 		mxge_lro_flush(ss, lro);
2479 	}
2480 }
2481 
2482 
2483 static inline void
2484 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2485 {
2486 	struct ifnet *ifp;
2487 	mxge_tx_ring_t *tx;
2488 	struct mbuf *m;
2489 	bus_dmamap_t map;
2490 	int idx;
2491 
2492 	tx = &ss->tx;
2493 	ifp = ss->sc->ifp;
2494 	while (tx->pkt_done != mcp_idx) {
2495 		idx = tx->done & tx->mask;
2496 		tx->done++;
2497 		m = tx->info[idx].m;
2498 		/* mbuf and DMA map only attached to the first
2499 		   segment per-mbuf */
2500 		if (m != NULL) {
2501 			ifp->if_opackets++;
2502 			tx->info[idx].m = NULL;
2503 			map = tx->info[idx].map;
2504 			bus_dmamap_unload(tx->dmat, map);
2505 			m_freem(m);
2506 		}
2507 		if (tx->info[idx].flag) {
2508 			tx->info[idx].flag = 0;
2509 			tx->pkt_done++;
2510 		}
2511 	}
2512 
2513 	/* If we have space, clear IFF_DRV_OACTIVE to tell the stack
2514 	   that it's OK to send packets */
2515 
2516 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2517 	    tx->req - tx->done < (tx->mask + 1)/4) {
2518 		mtx_lock(&ss->tx.mtx);
2519 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2520 		ss->tx.wake++;
2521 		mxge_start_locked(ss);
2522 		mtx_unlock(&ss->tx.mtx);
2523 	}
2524 }
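
/*
 * Example of the restart threshold above (hypothetical ring size):
 * with tx->mask + 1 == 1024 send descriptors, IFF_DRV_OACTIVE is
 * cleared once fewer than 256 remain outstanding, batching wakeups
 * instead of toggling the flag on every completion interrupt.
 */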
2525 
2526 static struct mxge_media_type mxge_media_types[] =
2527 {
2528 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2529 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2530 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2531 	{0,		(1 << 5),	"10GBASE-ER"},
2532 	{0,		(1 << 4),	"10GBASE-LRM"},
2533 	{0,		(1 << 3),	"10GBASE-SW"},
2534 	{0,		(1 << 2),	"10GBASE-LW"},
2535 	{0,		(1 << 1),	"10GBASE-EW"},
2536 	{0,		(1 << 0),	"Reserved"}
2537 };
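
/*
 * The bitmask column mirrors bits of the XFP module's 10GbE
 * compliance byte that mxge_media_probe() reads below over the
 * firmware's I2C interface; note the CX4 entry is matched with an
 * exact compare against 0x7f rather than a per-bit test.
 */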
2538 
2539 static void
2540 mxge_set_media(mxge_softc_t *sc, int type)
2541 {
2542 	sc->media_flags |= type;
2543 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2544 	ifmedia_set(&sc->media, sc->media_flags);
2545 }
2546 
2547 
2548 /*
2549  * Determine the media type for a NIC.  Some XFPs will identify
2550  * themselves only when their link is up, so this is initiated via a
2551  * link up interrupt.  However, this can potentially take up to
2552  * several milliseconds, so it is run via the watchdog routine, rather
2553  * than in the interrupt handler itself.   This need only be done
2554  * once, not each time the link is up.
2555  */
2556 static void
2557 mxge_media_probe(mxge_softc_t *sc)
2558 {
2559 	mxge_cmd_t cmd;
2560 	char *ptr;
2561 	int i, err, ms;
2562 
2563 	sc->need_media_probe = 0;
2564 
2565 	/* if we've already set a media type, we're done */
2566 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2567 		return;
2568 
2569 	/*
2570 	 * parse the product code to determine the interface type
2571 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2572 	 * after the 3rd dash in the driver's cached copy of the
2573 	 * EEPROM's product code string.
2574 	 */
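	/*
	 * For example, in a hypothetical product code such as
	 * "10G-PCIE-8A-C", the media character 'C' (CX4) follows the
	 * third dash; 'R' would denote an XFP cage and 'Q' quad
	 * ribbon fiber, matching the tests below.
	 */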
2575 	ptr = sc->product_code_string;
2576 	if (ptr == NULL) {
2577 		device_printf(sc->dev, "Missing product code\n");
		return;
2578 	}
2579 
2580 	for (i = 0; i < 3; i++, ptr++) {
2581 		ptr = index(ptr, '-');
2582 		if (ptr == NULL) {
2583 			device_printf(sc->dev,
2584 				      "only %d dashes in PC?!?\n", i);
2585 			return;
2586 		}
2587 	}
2588 	if (*ptr == 'C') {
2589 		mxge_set_media(sc, IFM_10G_CX4);
2590 		return;
2591 	} else if (*ptr == 'Q') {
2593 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2594 		/* FreeBSD has no media type for Quad ribbon fiber */
2595 		return;
2596 	}
2597 
2598 	if (*ptr != 'R') {
2599 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2600 		return;
2601 	}
2602 
2603 	/*
2604 	 * At this point we know the NIC has an XFP cage, so now we
2605 	 * try to determine what is in the cage by using the
2606 	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2607 	 * register.  We read just one byte, which may take over
2608 	 * a millisecond
2609 	 */
2610 
2611 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2612 	cmd.data1 = MXGE_XFP_COMPLIANCE_BYTE; /* the byte we want */
2613 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_I2C_READ, &cmd);
2614 	if (err == MXGEFW_CMD_ERROR_XFP_FAILURE) {
2615 		device_printf(sc->dev, "failed to read XFP\n");
2616 	}
2617 	if (err == MXGEFW_CMD_ERROR_XFP_ABSENT) {
2618 		device_printf(sc->dev, "Type R with no XFP!?!?\n");
2619 	}
2620 	if (err != MXGEFW_CMD_OK) {
2621 		return;
2622 	}
2623 
2624 	/* now we wait for the data to be cached */
2625 	cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2626 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2627 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2628 		DELAY(1000);
2629 		cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2630 		err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2631 	}
2632 	if (err != MXGEFW_CMD_OK) {
2633 		device_printf(sc->dev, "failed to read XFP (%d, %dms)\n",
2634 			      err, ms);
2635 		return;
2636 	}
2637 
2638 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2639 		if (mxge_verbose)
2640 			device_printf(sc->dev, "XFP:%s\n",
2641 				      mxge_media_types[0].name);
2642 		mxge_set_media(sc, IFM_10G_CX4);
2643 		return;
2644 	}
2645 	for (i = 1;
2646 	     i < sizeof (mxge_media_types) / sizeof (mxge_media_types[0]);
2647 	     i++) {
2648 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2649 			if (mxge_verbose)
2650 				device_printf(sc->dev, "XFP:%s\n",
2651 					      mxge_media_types[i].name);
2652 
2653 			mxge_set_media(sc, mxge_media_types[i].flag);
2654 			return;
2655 		}
2656 	}
2657 	device_printf(sc->dev, "XFP media 0x%x unknown\n", cmd.data0);
2658 
2659 	return;
2660 }
2661 
2662 static void
2663 mxge_intr(void *arg)
2664 {
2665 	struct mxge_slice_state *ss = arg;
2666 	mxge_softc_t *sc = ss->sc;
2667 	mcp_irq_data_t *stats = ss->fw_stats;
2668 	mxge_tx_ring_t *tx = &ss->tx;
2669 	mxge_rx_done_t *rx_done = &ss->rx_done;
2670 	uint32_t send_done_count;
2671 	uint8_t valid;
2672 
2673 
2674 	/* an interrupt on a non-zero slice is implicitly valid
2675 	   since MSI-X irqs are not shared */
2676 	if (ss != sc->ss) {
2677 		mxge_clean_rx_done(ss);
2678 		*ss->irq_claim = be32toh(3);
2679 		return;
2680 	}
2681 
2682 	/* make sure the DMA has finished */
2683 	if (!stats->valid) {
2684 		return;
2685 	}
2686 	valid = stats->valid;
2687 
2688 	if (sc->legacy_irq) {
2689 		/* lower legacy IRQ  */
2690 		*sc->irq_deassert = 0;
2691 		if (!mxge_deassert_wait)
2692 			/* don't wait for confirmation that the irq is low */
2693 			stats->valid = 0;
2694 	} else {
2695 		stats->valid = 0;
2696 	}
2697 
2698 	/* loop while waiting for legacy irq deassertion */
2699 	do {
2700 		/* check for transmit completes and receives */
2701 		send_done_count = be32toh(stats->send_done_count);
2702 		while ((send_done_count != tx->pkt_done) ||
2703 		       (rx_done->entry[rx_done->idx].length != 0)) {
2704 			mxge_tx_done(ss, (int)send_done_count);
2705 			mxge_clean_rx_done(ss);
2706 			send_done_count = be32toh(stats->send_done_count);
2707 		}
2708 		if (sc->legacy_irq && mxge_deassert_wait)
2709 			mb();
2710 	} while (*((volatile uint8_t *) &stats->valid));
2711 
2712 	if (__predict_false(stats->stats_updated)) {
2713 		if (sc->link_state != stats->link_up) {
2714 			sc->link_state = stats->link_up;
2715 			if (sc->link_state) {
2716 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2717 				if (mxge_verbose)
2718 					device_printf(sc->dev, "link up\n");
2719 			} else {
2720 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2721 				if (mxge_verbose)
2722 					device_printf(sc->dev, "link down\n");
2723 			}
2724 			sc->need_media_probe = 1;
2725 		}
2726 		if (sc->rdma_tags_available !=
2727 		    be32toh(stats->rdma_tags_available)) {
2728 			sc->rdma_tags_available =
2729 				be32toh(stats->rdma_tags_available);
2730 			device_printf(sc->dev, "RDMA timed out! %d tags "
2731 				      "left\n", sc->rdma_tags_available);
2732 		}
2733 
2734 		if (stats->link_down) {
2735 			sc->down_cnt += stats->link_down;
2736 			sc->link_state = 0;
2737 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2738 		}
2739 	}
2740 
2741 	/* check to see if we have rx token to pass back */
2742 	if (valid & 0x1)
2743 		*ss->irq_claim = be32toh(3);
2744 	*(ss->irq_claim + 1) = be32toh(3);
2745 }
2746 
2747 static void
2748 mxge_init(void *arg)
2749 {
2750 }
2751 
2752 
2753 
2754 static void
2755 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2756 {
2757 	struct lro_entry *lro_entry;
2758 	int i;
2759 
2760 	while (!SLIST_EMPTY(&ss->lro_free)) {
2761 		lro_entry = SLIST_FIRST(&ss->lro_free);
2762 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2763 		free(lro_entry, M_DEVBUF);
2764 	}
2765 
2766 	for (i = 0; i <= ss->rx_big.mask; i++) {
2767 		if (ss->rx_big.info[i].m == NULL)
2768 			continue;
2769 		bus_dmamap_unload(ss->rx_big.dmat,
2770 				  ss->rx_big.info[i].map);
2771 		m_freem(ss->rx_big.info[i].m);
2772 		ss->rx_big.info[i].m = NULL;
2773 	}
2774 
2775 	for (i = 0; i <= ss->rx_small.mask; i++) {
2776 		if (ss->rx_small.info[i].m == NULL)
2777 			continue;
2778 		bus_dmamap_unload(ss->rx_small.dmat,
2779 				  ss->rx_small.info[i].map);
2780 		m_freem(ss->rx_small.info[i].m);
2781 		ss->rx_small.info[i].m = NULL;
2782 	}
2783 
2784 	/* transmit ring used only on the first slice */
2785 	if (ss->tx.info == NULL)
2786 		return;
2787 
2788 	for (i = 0; i <= ss->tx.mask; i++) {
2789 		ss->tx.info[i].flag = 0;
2790 		if (ss->tx.info[i].m == NULL)
2791 			continue;
2792 		bus_dmamap_unload(ss->tx.dmat,
2793 				  ss->tx.info[i].map);
2794 		m_freem(ss->tx.info[i].m);
2795 		ss->tx.info[i].m = NULL;
2796 	}
2797 }
2798 
2799 static void
2800 mxge_free_mbufs(mxge_softc_t *sc)
2801 {
2802 	int slice;
2803 
2804 	for (slice = 0; slice < sc->num_slices; slice++)
2805 		mxge_free_slice_mbufs(&sc->ss[slice]);
2806 }
2807 
2808 static void
2809 mxge_free_slice_rings(struct mxge_slice_state *ss)
2810 {
2811 	int i;
2812 
2813 
2814 	if (ss->rx_done.entry != NULL)
2815 		mxge_dma_free(&ss->rx_done.dma);
2816 	ss->rx_done.entry = NULL;
2817 
2818 	if (ss->tx.req_bytes != NULL)
2819 		free(ss->tx.req_bytes, M_DEVBUF);
2820 	ss->tx.req_bytes = NULL;
2821 
2822 	if (ss->tx.seg_list != NULL)
2823 		free(ss->tx.seg_list, M_DEVBUF);
2824 	ss->tx.seg_list = NULL;
2825 
2826 	if (ss->rx_small.shadow != NULL)
2827 		free(ss->rx_small.shadow, M_DEVBUF);
2828 	ss->rx_small.shadow = NULL;
2829 
2830 	if (ss->rx_big.shadow != NULL)
2831 		free(ss->rx_big.shadow, M_DEVBUF);
2832 	ss->rx_big.shadow = NULL;
2833 
2834 	if (ss->tx.info != NULL) {
2835 		if (ss->tx.dmat != NULL) {
2836 			for (i = 0; i <= ss->tx.mask; i++) {
2837 				bus_dmamap_destroy(ss->tx.dmat,
2838 						   ss->tx.info[i].map);
2839 			}
2840 			bus_dma_tag_destroy(ss->tx.dmat);
2841 		}
2842 		free(ss->tx.info, M_DEVBUF);
2843 	}
2844 	ss->tx.info = NULL;
2845 
2846 	if (ss->rx_small.info != NULL) {
2847 		if (ss->rx_small.dmat != NULL) {
2848 			for (i = 0; i <= ss->rx_small.mask; i++) {
2849 				bus_dmamap_destroy(ss->rx_small.dmat,
2850 						   ss->rx_small.info[i].map);
2851 			}
2852 			bus_dmamap_destroy(ss->rx_small.dmat,
2853 					   ss->rx_small.extra_map);
2854 			bus_dma_tag_destroy(ss->rx_small.dmat);
2855 		}
2856 		free(ss->rx_small.info, M_DEVBUF);
2857 	}
2858 	ss->rx_small.info = NULL;
2859 
2860 	if (ss->rx_big.info != NULL) {
2861 		if (ss->rx_big.dmat != NULL) {
2862 			for (i = 0; i <= ss->rx_big.mask; i++) {
2863 				bus_dmamap_destroy(ss->rx_big.dmat,
2864 						   ss->rx_big.info[i].map);
2865 			}
2866 			bus_dmamap_destroy(ss->rx_big.dmat,
2867 					   ss->rx_big.extra_map);
2868 			bus_dma_tag_destroy(ss->rx_big.dmat);
2869 		}
2870 		free(ss->rx_big.info, M_DEVBUF);
2871 	}
2872 	ss->rx_big.info = NULL;
2873 }
2874 
2875 static void
2876 mxge_free_rings(mxge_softc_t *sc)
2877 {
2878 	int slice;
2879 
2880 	for (slice = 0; slice < sc->num_slices; slice++)
2881 		mxge_free_slice_rings(&sc->ss[slice]);
2882 }
2883 
2884 static int
2885 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2886 		       int tx_ring_entries)
2887 {
2888 	mxge_softc_t *sc = ss->sc;
2889 	size_t bytes;
2890 	int err, i;
2891 
2892 	err = ENOMEM;
2893 
2894 	/* allocate per-slice receive resources */
2895 
2896 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2897 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2898 
2899 	/* allocate the rx shadow rings */
2900 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2901 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2902 	if (ss->rx_small.shadow == NULL)
2903 		return err;
2904 
2905 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2906 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2907 	if (ss->rx_big.shadow == NULL)
2908 		return err;
2909 
2910 	/* allocate the rx host info rings */
2911 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2912 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2913 	if (ss->rx_small.info == NULL)
2914 		return err;
2915 
2916 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2917 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2918 	if (ss->rx_big.info == NULL)
2919 		return err;
2920 
2921 	/* allocate the rx busdma resources */
2922 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2923 				 1,			/* alignment */
2924 				 4096,			/* boundary */
2925 				 BUS_SPACE_MAXADDR,	/* low */
2926 				 BUS_SPACE_MAXADDR,	/* high */
2927 				 NULL, NULL,		/* filter */
2928 				 MHLEN,			/* maxsize */
2929 				 1,			/* num segs */
2930 				 MHLEN,			/* maxsegsize */
2931 				 BUS_DMA_ALLOCNOW,	/* flags */
2932 				 NULL, NULL,		/* lock */
2933 				 &ss->rx_small.dmat);	/* tag */
2934 	if (err != 0) {
2935 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2936 			      err);
2937 		return err;
2938 	}
2939 
2940 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2941 				 1,			/* alignment */
2942 #if MXGE_VIRT_JUMBOS
2943 				 4096,			/* boundary */
2944 #else
2945 				 0,			/* boundary */
2946 #endif
2947 				 BUS_SPACE_MAXADDR,	/* low */
2948 				 BUS_SPACE_MAXADDR,	/* high */
2949 				 NULL, NULL,		/* filter */
2950 				 3*4096,		/* maxsize */
2951 #if MXGE_VIRT_JUMBOS
2952 				 3,			/* num segs */
2953 				 4096,			/* maxsegsize*/
2954 #else
2955 				 1,			/* num segs */
2956 				 MJUM9BYTES,		/* maxsegsize*/
2957 #endif
2958 				 BUS_DMA_ALLOCNOW,	/* flags */
2959 				 NULL, NULL,		/* lock */
2960 				 &ss->rx_big.dmat);	/* tag */
2961 	if (err != 0) {
2962 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2963 			      err);
2964 		return err;
2965 	}
2966 	for (i = 0; i <= ss->rx_small.mask; i++) {
2967 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
2968 					&ss->rx_small.info[i].map);
2969 		if (err != 0) {
2970 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2971 				      err);
2972 			return err;
2973 		}
2974 	}
2975 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
2976 				&ss->rx_small.extra_map);
2977 	if (err != 0) {
2978 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2979 			      err);
2980 		return err;
2981 	}
2982 
2983 	for (i = 0; i <= ss->rx_big.mask; i++) {
2984 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
2985 					&ss->rx_big.info[i].map);
2986 		if (err != 0) {
2987 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2988 				      err);
2989 			return err;
2990 		}
2991 	}
2992 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
2993 				&ss->rx_big.extra_map);
2994 	if (err != 0) {
2995 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2996 			      err);
2997 		return err;
2998 	}
2999 
3000 	/* now allocate TX resources */
3001 
3002 	/* only use a single TX ring for now */
3003 	if (ss != ss->sc->ss)
3004 		return 0;
3005 
3006 	ss->tx.mask = tx_ring_entries - 1;
3007 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3008 
3009 
3010 	/* allocate the tx request copy block */
3011 	bytes = 8 +
3012 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
3013 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
3014 	if (ss->tx.req_bytes == NULL)
3015 		return err;
3016 	/* ensure req_list entries are aligned to 8 bytes */
3017 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3018 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
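	/* e.g. (illustrative): a req_bytes pointer ending in ...0x5
	   rounds up as (...0x5 + 7) & ~7 == ...0x8, the next 8-byte
	   boundary; the 8 spare bytes allocated above guarantee the
	   round-up stays in bounds */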
3019 
3020 	/* allocate the tx busdma segment list */
3021 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3022 	ss->tx.seg_list = (bus_dma_segment_t *)
3023 		malloc(bytes, M_DEVBUF, M_WAITOK);
3024 	if (ss->tx.seg_list == NULL)
3025 		return err;
3026 
3027 	/* allocate the tx host info ring */
3028 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3029 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3030 	if (ss->tx.info == NULL)
3031 		return err;
3032 
3033 	/* allocate the tx busdma resources */
3034 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3035 				 1,			/* alignment */
3036 				 sc->tx_boundary,	/* boundary */
3037 				 BUS_SPACE_MAXADDR,	/* low */
3038 				 BUS_SPACE_MAXADDR,	/* high */
3039 				 NULL, NULL,		/* filter */
3040 				 65536 + 256,		/* maxsize */
3041 				 ss->tx.max_desc - 2,	/* num segs */
3042 				 sc->tx_boundary,	/* maxsegsz */
3043 				 BUS_DMA_ALLOCNOW,	/* flags */
3044 				 NULL, NULL,		/* lock */
3045 				 &ss->tx.dmat);		/* tag */
3046 
3047 	if (err != 0) {
3048 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3049 			      err);
3050 		return err;
3051 	}
3052 
3053 	/* now use these tags to setup dmamaps for each slot
3054 	   in the ring */
3055 	for (i = 0; i <= ss->tx.mask; i++) {
3056 		err = bus_dmamap_create(ss->tx.dmat, 0,
3057 					&ss->tx.info[i].map);
3058 		if (err != 0) {
3059 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3060 				      err);
3061 			return err;
3062 		}
3063 	}
3064 	return 0;
3065 
3066 }
3067 
3068 static int
3069 mxge_alloc_rings(mxge_softc_t *sc)
3070 {
3071 	mxge_cmd_t cmd;
3072 	int tx_ring_size;
3073 	int tx_ring_entries, rx_ring_entries;
3074 	int err, slice;
3075 
3076 	/* get ring sizes */
3077 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3078 	tx_ring_size = cmd.data0;
3079 	if (err != 0) {
3080 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3081 		goto abort;
3082 	}
3083 
3084 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3085 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3086 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3087 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3088 	IFQ_SET_READY(&sc->ifp->if_snd);
3089 
3090 	for (slice = 0; slice < sc->num_slices; slice++) {
3091 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3092 					     rx_ring_entries,
3093 					     tx_ring_entries);
3094 		if (err != 0)
3095 			goto abort;
3096 	}
3097 	return 0;
3098 
3099 abort:
3100 	mxge_free_rings(sc);
3101 	return err;
3102 
3103 }
3104 
3105 
3106 static void
3107 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3108 {
3109 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3110 
3111 	if (bufsize < MCLBYTES) {
3112 		/* easy, everything fits in a single buffer */
3113 		*big_buf_size = MCLBYTES;
3114 		*cl_size = MCLBYTES;
3115 		*nbufs = 1;
3116 		return;
3117 	}
3118 
3119 	if (bufsize < MJUMPAGESIZE) {
3120 		/* still easy, everything still fits in a single buffer */
3121 		*big_buf_size = MJUMPAGESIZE;
3122 		*cl_size = MJUMPAGESIZE;
3123 		*nbufs = 1;
3124 		return;
3125 	}
3126 #if MXGE_VIRT_JUMBOS
3127 	/* now we need to use virtually contiguous buffers */
3128 	*cl_size = MJUM9BYTES;
3129 	*big_buf_size = 4096;
3130 	*nbufs = mtu / 4096 + 1;
3131 	/* needs to be a power of two, so round up */
3132 	if (*nbufs == 3)
3133 		*nbufs = 4;
3134 #else
3135 	*cl_size = MJUM9BYTES;
3136 	*big_buf_size = MJUM9BYTES;
3137 	*nbufs = 1;
3138 #endif
3139 }
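
/*
 * Worked example (assumes 4KB pages and stock FreeBSD cluster sizes):
 * for a 9000-byte MTU, bufsize = 9000 + 14 + 4 + 2 = 9020, exceeding
 * both MCLBYTES and MJUMPAGESIZE, so the last branch picks a single
 * MJUM9BYTES cluster per frame; with MXGE_VIRT_JUMBOS it instead uses
 * 4KB chunks, nbufs = 9000/4096 + 1 = 3, rounded up to 4.
 */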
3140 
3141 static int
3142 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3143 {
3144 	mxge_softc_t *sc;
3145 	mxge_cmd_t cmd;
3146 	bus_dmamap_t map;
3147 	struct lro_entry *lro_entry;
3148 	int err, i, slice;
3149 
3150 
3151 	sc = ss->sc;
3152 	slice = ss - sc->ss;
3153 
3154 	SLIST_INIT(&ss->lro_free);
3155 	SLIST_INIT(&ss->lro_active);
3156 
3157 	for (i = 0; i < sc->lro_cnt; i++) {
3158 		lro_entry = (struct lro_entry *)
3159 			malloc(sizeof (*lro_entry), M_DEVBUF,
3160 			       M_NOWAIT | M_ZERO);
3161 		if (lro_entry == NULL) {
3162 			sc->lro_cnt = i;
3163 			break;
3164 		}
3165 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3166 	}
3167 	/* get the lanai pointers to the send and receive rings */
3168 
3169 	err = 0;
3170 	/* We currently only send from the first slice */
3171 	if (slice == 0) {
3172 		cmd.data0 = slice;
3173 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3174 		ss->tx.lanai =
3175 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3176 	}
3177 	cmd.data0 = slice;
3178 	err |= mxge_send_cmd(sc,
3179 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3180 	ss->rx_small.lanai =
3181 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3182 	cmd.data0 = slice;
3183 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3184 	ss->rx_big.lanai =
3185 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3186 
3187 	if (err != 0) {
3188 		device_printf(sc->dev,
3189 			      "failed to get ring sizes or locations\n");
3190 		return EIO;
3191 	}
3192 
3193 	/* stock receive rings */
3194 	for (i = 0; i <= ss->rx_small.mask; i++) {
3195 		map = ss->rx_small.info[i].map;
3196 		err = mxge_get_buf_small(ss, map, i);
3197 		if (err) {
3198 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3199 				      i, ss->rx_small.mask + 1);
3200 			return ENOMEM;
3201 		}
3202 	}
3203 	for (i = 0; i <= ss->rx_big.mask; i++) {
3204 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3205 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3206 	}
3207 	ss->rx_big.nbufs = nbufs;
3208 	ss->rx_big.cl_size = cl_size;
3209 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3210 		map = ss->rx_big.info[i].map;
3211 		err = mxge_get_buf_big(ss, map, i);
3212 		if (err) {
3213 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3214 				      i, ss->rx_big.mask + 1);
3215 			return ENOMEM;
3216 		}
3217 	}
3218 	return 0;
3219 }
3220 
3221 static int
3222 mxge_open(mxge_softc_t *sc)
3223 {
3224 	mxge_cmd_t cmd;
3225 	int err, big_bytes, nbufs, slice, cl_size, i;
3226 	bus_addr_t bus;
3227 	volatile uint8_t *itable;
3228 
3229 	/* Copy the MAC address in case it was overridden */
3230 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3231 
3232 	err = mxge_reset(sc, 1);
3233 	if (err != 0) {
3234 		device_printf(sc->dev, "failed to reset\n");
3235 		return EIO;
3236 	}
3237 
3238 	if (sc->num_slices > 1) {
3239 		/* setup the indirection table */
3240 		cmd.data0 = sc->num_slices;
3241 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3242 				    &cmd);
3243 
3244 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3245 				     &cmd);
3246 		if (err != 0) {
3247 			device_printf(sc->dev,
3248 				      "failed to setup rss tables\n");
3249 			return err;
3250 		}
3251 
3252 		/* just enable an identity mapping */
3253 		itable = sc->sram + cmd.data0;
3254 		for (i = 0; i < sc->num_slices; i++)
3255 			itable[i] = (uint8_t)i;
3256 
3257 		cmd.data0 = 1;
3258 		cmd.data1 = mxge_rss_hash_type;
3259 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3260 		if (err != 0) {
3261 			device_printf(sc->dev, "failed to enable slices\n");
3262 			return err;
3263 		}
3264 	}
3265 
3266 
3267 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3268 
3269 	cmd.data0 = nbufs;
3270 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3271 			    &cmd);
3272 	/* error is only meaningful if we're trying to set
3273 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3274 	if (err && nbufs > 1) {
3275 		device_printf(sc->dev,
3276 			      "Failed to set always-use-n to %d\n",
3277 			      nbufs);
3278 		return EIO;
3279 	}
3280 	/* Give the firmware the mtu and the big and small buffer
3281 	   sizes.  The firmware wants the big buf size to be a power
3282 	   of two. Luckily, FreeBSD's clusters are powers of two */
3283 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3284 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3285 	cmd.data0 = MHLEN - MXGEFW_PAD;
3286 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3287 			     &cmd);
3288 	cmd.data0 = big_bytes;
3289 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3290 
3291 	if (err != 0) {
3292 		device_printf(sc->dev, "failed to setup params\n");
3293 		goto abort;
3294 	}
3295 
3296 	/* Now give the firmware the pointer to the stats block */
3297 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3298 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3299 	cmd.data2 = sizeof(struct mcp_irq_data);
3300 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3301 
3302 	if (err != 0) {
3303 		bus = sc->ss->fw_stats_dma.bus_addr;
3304 		bus += offsetof(struct mcp_irq_data, send_done_count);
3305 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3306 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3307 		err = mxge_send_cmd(sc,
3308 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3309 				    &cmd);
3310 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3311 		sc->fw_multicast_support = 0;
3312 	} else {
3313 		sc->fw_multicast_support = 1;
3314 	}
3315 
3316 	if (err != 0) {
3317 		device_printf(sc->dev, "failed to setup params\n");
3318 		goto abort;
3319 	}
3320 
3321 	for (slice = 0; slice < sc->num_slices; slice++) {
3322 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3323 		if (err != 0) {
3324 			device_printf(sc->dev, "couldn't open slice %d\n",
3325 				      slice);
3326 			goto abort;
3327 		}
3328 	}
3329 
3330 	/* Finally, start the firmware running */
3331 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3332 	if (err) {
3333 		device_printf(sc->dev, "Couldn't bring up link\n");
3334 		goto abort;
3335 	}
3336 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3337 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3338 
3339 	return 0;
3340 
3341 
3342 abort:
3343 	mxge_free_mbufs(sc);
3344 
3345 	return err;
3346 }
3347 
3348 static int
3349 mxge_close(mxge_softc_t *sc)
3350 {
3351 	mxge_cmd_t cmd;
3352 	int err, old_down_cnt;
3353 
3354 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3355 	old_down_cnt = sc->down_cnt;
3356 	mb();
3357 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3358 	if (err) {
3359 		device_printf(sc->dev, "Couldn't bring down link\n");
3360 	}
3361 	if (old_down_cnt == sc->down_cnt) {
3362 		/* wait for down irq */
3363 		DELAY(10 * sc->intr_coal_delay);
3364 	}
3365 	mb();
3366 	if (old_down_cnt == sc->down_cnt) {
3367 		device_printf(sc->dev, "never got down irq\n");
3368 	}
3369 
3370 	mxge_free_mbufs(sc);
3371 
3372 	return 0;
3373 }
3374 
3375 static void
3376 mxge_setup_cfg_space(mxge_softc_t *sc)
3377 {
3378 	device_t dev = sc->dev;
3379 	int reg;
3380 	uint16_t cmd, lnk, pectl;
3381 
3382 	/* find the PCIe link width and set max read request to 4KB*/
3383 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3384 		lnk = pci_read_config(dev, reg + 0x12, 2);
3385 		sc->link_width = (lnk >> 4) & 0x3f;
3386 
3387 		pectl = pci_read_config(dev, reg + 0x8, 2);
3388 		pectl = (pectl & ~0x7000) | (5 << 12);
3389 		pci_write_config(dev, reg + 0x8, pectl, 2);
3390 	}
3391 
3392 	/* Enable DMA and Memory space access */
3393 	pci_enable_busmaster(dev);
3394 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3395 	cmd |= PCIM_CMD_MEMEN;
3396 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3397 }
3398 
3399 static uint32_t
3400 mxge_read_reboot(mxge_softc_t *sc)
3401 {
3402 	device_t dev = sc->dev;
3403 	uint32_t vs;
3404 
3405 	/* find the vendor specific offset */
3406 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3407 		device_printf(sc->dev,
3408 			      "could not find vendor specific offset\n");
3409 		return (uint32_t)-1;
3410 	}
3411 	/* enable read32 mode */
3412 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3413 	/* tell NIC which register to read */
3414 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3415 	return (pci_read_config(dev, vs + 0x14, 4));
3416 }
3417 
3418 static void
3419 mxge_watchdog_reset(mxge_softc_t *sc)
3420 {
3421 	int err;
3422 	uint32_t reboot;
3423 	uint16_t cmd;
3424 
3425 	err = ENXIO;
3426 
3427 	device_printf(sc->dev, "Watchdog reset!\n");
3428 
3429 	/*
3430 	 * check to see if the NIC rebooted.  If it did, then all of
3431 	 * PCI config space has been reset, and things like the
3432 	 * busmaster bit will be zero.  If this is the case, then we
3433 	 * must restore PCI config space before the NIC can be used
3434 	 * again
3435 	 */
3436 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3437 	if (cmd == 0xffff) {
3438 		/*
3439 		 * maybe the watchdog caught the NIC rebooting; wait
3440 		 * up to 100ms for it to finish.  If it does not come
3441 		 * back, then give up
3442 		 */
3443 		DELAY(1000*100);
3444 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3445 		if (cmd == 0xffff) {
3446 			device_printf(sc->dev, "NIC disappeared!\n");
3447 			goto abort;
3448 		}
3449 	}
3450 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3451 		/* print the reboot status */
3452 		reboot = mxge_read_reboot(sc);
3453 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3454 			      reboot);
3455 		/* restore PCI configuration space */
3456 
3457 		/* XXXX waiting for pci_cfg_restore() to be exported */
3458 		goto abort; /* just abort for now */
3459 
3460 		/* and redo any changes we made to our config space */
3461 		mxge_setup_cfg_space(sc);
3462 
3463 		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3464 			mxge_close(sc);
3465 			err = mxge_open(sc);
3466 		}
3467 	} else {
3468 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
3469 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
3470 			      sc->ss->tx.req, sc->ss->tx.done);
3471 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3472 			      sc->ss->tx.pkt_done,
3473 			      be32toh(sc->ss->fw_stats->send_done_count));
3474 		device_printf(sc->dev, "not resetting\n");
3475 	}
3476 
3477 abort:
3478 	/*
3479 	 * stop the watchdog if the nic is dead, to avoid spamming the
3480 	 * console
3481 	 */
3482 	if (err != 0) {
3483 		callout_stop(&sc->co_hdl);
3484 	}
3485 }
3486 
3487 static void
3488 mxge_watchdog(mxge_softc_t *sc)
3489 {
3490 	mxge_tx_ring_t *tx = &sc->ss->tx;
3491 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3492 
3493 	/* see if we have outstanding transmits, which
3494 	   have been pending for more than mxge_ticks */
3495 	if (tx->req != tx->done &&
3496 	    tx->watchdog_req != tx->watchdog_done &&
3497 	    tx->done == tx->watchdog_done) {
3498 		/* check for pause blocking before resetting */
3499 		if (tx->watchdog_rx_pause == rx_pause)
3500 			mxge_watchdog_reset(sc);
3501 		else
3502 			device_printf(sc->dev, "Flow control blocking "
3503 				      "xmits, check link partner\n");
3504 	}
3505 
3506 	tx->watchdog_req = tx->req;
3507 	tx->watchdog_done = tx->done;
3508 	tx->watchdog_rx_pause = rx_pause;
3509 
3510 	if (sc->need_media_probe)
3511 		mxge_media_probe(sc);
3512 }
3513 
3514 static void
3515 mxge_update_stats(mxge_softc_t *sc)
3516 {
3517 	struct mxge_slice_state *ss;
3518 	u_long ipackets = 0;
3519 	int slice;
3520 
3521 	for(slice = 0; slice < sc->num_slices; slice++) {
3522 		ss = &sc->ss[slice];
3523 		ipackets += ss->ipackets;
3524 	}
3525 	sc->ifp->if_ipackets = ipackets;
3526 
3527 }
3528 static void
3529 mxge_tick(void *arg)
3530 {
3531 	mxge_softc_t *sc = arg;
3532 
3533 
3534 	/* Synchronize with possible callout reset/stop. */
3535 	if (callout_pending(&sc->co_hdl) ||
3536 	    !callout_active(&sc->co_hdl)) {
3537 		mtx_unlock(&sc->driver_mtx);
3538 		return;
3539 	}
3540 
3541 	/* aggregate stats from different slices */
3542 	mxge_update_stats(sc);
3543 
3544 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3545 	if (!sc->watchdog_countdown) {
3546 		mxge_watchdog(sc);
3547 		sc->watchdog_countdown = 4;
3548 	}
3549 	sc->watchdog_countdown--;
3550 }
3551 
3552 static int
3553 mxge_media_change(struct ifnet *ifp)
3554 {
3555 	return EINVAL;
3556 }
3557 
3558 static int
3559 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3560 {
3561 	struct ifnet *ifp = sc->ifp;
3562 	int real_mtu, old_mtu;
3563 	int err = 0;
3564 
3565 
3566 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3567 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3568 		return EINVAL;
3569 	mtx_lock(&sc->driver_mtx);
3570 	old_mtu = ifp->if_mtu;
3571 	ifp->if_mtu = mtu;
3572 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3573 		callout_stop(&sc->co_hdl);
3574 		mxge_close(sc);
3575 		err = mxge_open(sc);
3576 		if (err != 0) {
3577 			ifp->if_mtu = old_mtu;
3578 			mxge_close(sc);
3579 			(void) mxge_open(sc);
3580 		}
3581 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3582 	}
3583 	mtx_unlock(&sc->driver_mtx);
3584 	return err;
3585 }
3586 
3587 static void
3588 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3589 {
3590 	mxge_softc_t *sc = ifp->if_softc;
3591 
3592 
3593 	if (sc == NULL)
3594 		return;
3595 	ifmr->ifm_status = IFM_AVALID;
3596 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3597 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3598 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3599 }
3600 
3601 static int
3602 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3603 {
3604 	mxge_softc_t *sc = ifp->if_softc;
3605 	struct ifreq *ifr = (struct ifreq *)data;
3606 	int err, mask;
3607 
3608 	err = 0;
3609 	switch (command) {
3610 	case SIOCSIFADDR:
3611 	case SIOCGIFADDR:
3612 		err = ether_ioctl(ifp, command, data);
3613 		break;
3614 
3615 	case SIOCSIFMTU:
3616 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3617 		break;
3618 
3619 	case SIOCSIFFLAGS:
3620 		mtx_lock(&sc->driver_mtx);
3621 		if (ifp->if_flags & IFF_UP) {
3622 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3623 				err = mxge_open(sc);
3624 				callout_reset(&sc->co_hdl, mxge_ticks,
3625 					      mxge_tick, sc);
3626 			} else {
3627 				/* take care of promisc and allmulti
3628 				   flag changes */
3629 				mxge_change_promisc(sc,
3630 						    ifp->if_flags & IFF_PROMISC);
3631 				mxge_set_multicast_list(sc);
3632 			}
3633 		} else {
3634 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3635 				callout_stop(&sc->co_hdl);
3636 				mxge_close(sc);
3637 			}
3638 		}
3639 		mtx_unlock(&sc->driver_mtx);
3640 		break;
3641 
3642 	case SIOCADDMULTI:
3643 	case SIOCDELMULTI:
3644 		mtx_lock(&sc->driver_mtx);
3645 		mxge_set_multicast_list(sc);
3646 		mtx_unlock(&sc->driver_mtx);
3647 		break;
3648 
3649 	case SIOCSIFCAP:
3650 		mtx_lock(&sc->driver_mtx);
3651 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3652 		if (mask & IFCAP_TXCSUM) {
3653 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3654 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3655 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3656 						      | CSUM_TSO);
3657 			} else {
3658 				ifp->if_capenable |= IFCAP_TXCSUM;
3659 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3660 			}
3661 		} else if (mask & IFCAP_RXCSUM) {
3662 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3663 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3664 				sc->csum_flag = 0;
3665 			} else {
3666 				ifp->if_capenable |= IFCAP_RXCSUM;
3667 				sc->csum_flag = 1;
3668 			}
3669 		}
3670 		if (mask & IFCAP_TSO4) {
3671 			if (IFCAP_TSO4 & ifp->if_capenable) {
3672 				ifp->if_capenable &= ~IFCAP_TSO4;
3673 				ifp->if_hwassist &= ~CSUM_TSO;
3674 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3675 				ifp->if_capenable |= IFCAP_TSO4;
3676 				ifp->if_hwassist |= CSUM_TSO;
3677 			} else {
3678 				printf("mxge requires tx checksum offload"
3679 				       " be enabled to use TSO\n");
3680 				err = EINVAL;
3681 			}
3682 		}
3683 		if (mask & IFCAP_LRO) {
3684 			if (IFCAP_LRO & ifp->if_capenable)
3685 				err = mxge_change_lro_locked(sc, 0);
3686 			else
3687 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3688 		}
3689 		if (mask & IFCAP_VLAN_HWTAGGING)
3690 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3691 		mtx_unlock(&sc->driver_mtx);
3692 		VLAN_CAPABILITIES(ifp);
3693 
3694 		break;
3695 
3696 	case SIOCGIFMEDIA:
3697 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3698 				    &sc->media, command);
3699 		break;
3700 
3701 	default:
3702 		err = ENOTTY;
3703 	}
3704 	return err;
3705 }
3706 
3707 static void
3708 mxge_fetch_tunables(mxge_softc_t *sc)
3709 {
3710 
3711 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3712 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3713 			  &mxge_flow_control);
3714 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3715 			  &mxge_intr_coal_delay);
3716 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3717 			  &mxge_nvidia_ecrc_enable);
3718 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3719 			  &mxge_force_firmware);
3720 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3721 			  &mxge_deassert_wait);
3722 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3723 			  &mxge_verbose);
3724 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3725 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3726 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3727 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
3728 	if (sc->lro_cnt != 0)
3729 		mxge_lro_cnt = sc->lro_cnt;
3730 
3731 	if (bootverbose)
3732 		mxge_verbose = 1;
3733 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3734 		mxge_intr_coal_delay = 30;
3735 	if (mxge_ticks == 0)
3736 		mxge_ticks = hz / 2;
3737 	sc->pause = mxge_flow_control;
3738 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
3739 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) {
3740 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
3741 	}
3742 }
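
/*
 * Configuration sketch (illustrative values, not from the original
 * source): these knobs are kernel environment tunables, so they are
 * normally set in /boot/loader.conf before the driver attaches, e.g.:
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 *
 * Out-of-range values fall back to the defaults, per the checks above.
 */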
3743 
3744 
3745 static void
3746 mxge_free_slices(mxge_softc_t *sc)
3747 {
3748 	struct mxge_slice_state *ss;
3749 	int i;
3750 
3751 
3752 	if (sc->ss == NULL)
3753 		return;
3754 
3755 	for (i = 0; i < sc->num_slices; i++) {
3756 		ss = &sc->ss[i];
3757 		if (ss->fw_stats != NULL) {
3758 			mxge_dma_free(&ss->fw_stats_dma);
3759 			ss->fw_stats = NULL;
3760 			mtx_destroy(&ss->tx.mtx);
3761 		}
3762 		if (ss->rx_done.entry != NULL) {
3763 			mxge_dma_free(&ss->rx_done.dma);
3764 			ss->rx_done.entry = NULL;
3765 		}
3766 	}
3767 	free(sc->ss, M_DEVBUF);
3768 	sc->ss = NULL;
3769 }
3770 
3771 static int
3772 mxge_alloc_slices(mxge_softc_t *sc)
3773 {
3774 	mxge_cmd_t cmd;
3775 	struct mxge_slice_state *ss;
3776 	size_t bytes;
3777 	int err, i, max_intr_slots;
3778 
3779 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3780 	if (err != 0) {
3781 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3782 		return err;
3783 	}
3784 	sc->rx_ring_size = cmd.data0;
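	/*
	 * Sizing note (inferred from the code, not in the original):
	 * the firmware reports the rx ring size in bytes, and each rx
	 * descriptor is an mcp_dma_addr_t.  With two receive rings
	 * (small and big) feeding one interrupt queue, 2x the per-ring
	 * entry count bounds the completions outstanding at once.
	 */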
3785 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3786 
3787 	bytes = sizeof (*sc->ss) * sc->num_slices;
3788 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
3789 	if (sc->ss == NULL)
3790 		return (ENOMEM);
3791 	for (i = 0; i < sc->num_slices; i++) {
3792 		ss = &sc->ss[i];
3793 
3794 		ss->sc = sc;
3795 
3796 		/* allocate per-slice rx interrupt queues */
3797 
3798 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
3799 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3800 		if (err != 0)
3801 			goto abort;
3802 		ss->rx_done.entry = ss->rx_done.dma.addr;
3803 		bzero(ss->rx_done.entry, bytes);
3804 
3805 		/*
3806 		 * allocate the per-slice firmware stats; stats
3807 		 * (including tx) are used only on the first
3808 		 * slice for now
3809 		 */
3810 		if (i > 0)
3811 			continue;
3812 
3813 		bytes = sizeof (*ss->fw_stats);
3814 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3815 				     bytes, 64);
3816 		if (err != 0)
3817 			goto abort;
3818 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
3819 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
3820 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
3821 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
3822 	}
3823 
3824 	return (0);
3825 
3826 abort:
3827 	mxge_free_slices(sc);
3828 	return (ENOMEM);
3829 }
3830 
3831 static void
3832 mxge_slice_probe(mxge_softc_t *sc)
3833 {
3834 	mxge_cmd_t cmd;
3835 	char *old_fw;
3836 	int msix_cnt, status, max_intr_slots;
3837 
3838 	sc->num_slices = 1;
3839 	/*
3840 	 *  don't enable multiple slices if they have been disabled via
3841 	 *  the hw.mxge.max_slices tunable, or if this is not an SMP system
3842 	 */
3843 
3844 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
3845 		return;
3846 
3847 	/* see how many MSI-X interrupts are available */
3848 	msix_cnt = pci_msix_count(sc->dev);
3849 	if (msix_cnt < 2)
3850 		return;
3851 
3852 	/* now load the slice-aware firmware and see what it supports */
3853 	old_fw = sc->fw_name;
3854 	if (old_fw == mxge_fw_aligned)
3855 		sc->fw_name = mxge_fw_rss_aligned;
3856 	else
3857 		sc->fw_name = mxge_fw_rss_unaligned;
3858 	status = mxge_load_firmware(sc, 0);
3859 	if (status != 0) {
3860 		device_printf(sc->dev, "Falling back to a single slice\n");
3861 		return;
3862 	}
3863 
3864 	/* try to send a reset command to the card to see if it
3865 	   is alive */
3866 	memset(&cmd, 0, sizeof (cmd));
3867 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3868 	if (status != 0) {
3869 		device_printf(sc->dev, "failed reset\n");
3870 		goto abort_with_fw;
3871 	}
3872 
3873 	/* get rx ring size */
3874 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3875 	if (status != 0) {
3876 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3877 		goto abort_with_fw;
3878 	}
3879 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3880 
3881 	/* tell it the size of the interrupt queues */
3882 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3883 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3884 	if (status != 0) {
3885 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3886 		goto abort_with_fw;
3887 	}
3888 
3889 	/* ask for the maximum number of slices it supports */
3890 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3891 	if (status != 0) {
3892 		device_printf(sc->dev,
3893 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3894 		goto abort_with_fw;
3895 	}
3896 	sc->num_slices = cmd.data0;
3897 	if (sc->num_slices > msix_cnt)
3898 		sc->num_slices = msix_cnt;
3899 
3900 	if (mxge_max_slices == -1) {
3901 		/* cap to number of CPUs in system */
3902 		if (sc->num_slices > mp_ncpus)
3903 			sc->num_slices = mp_ncpus;
3904 	} else {
3905 		if (sc->num_slices > mxge_max_slices)
3906 			sc->num_slices = mxge_max_slices;
3907 	}
3908 	/* round num_slices down to the nearest power of two (e.g. 6 -> 4) */
3909 	while (sc->num_slices & (sc->num_slices - 1))
3910 		sc->num_slices--;
3911 
3912 	if (mxge_verbose)
3913 		device_printf(sc->dev, "using %d slices\n",
3914 			      sc->num_slices);
3915 
3916 	return;
3917 
3918 abort_with_fw:
3919 	sc->fw_name = old_fw;
3920 	(void) mxge_load_firmware(sc, 0);
3921 }
3922 
3923 static int
3924 mxge_add_msix_irqs(mxge_softc_t *sc)
3925 {
3926 	size_t bytes;
3927 	int count, err, i, rid;
3928 
3929 	rid = PCIR_BAR(2);
3930 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3931 						    &rid, RF_ACTIVE);
3932 
3933 	if (sc->msix_table_res == NULL) {
3934 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3935 		return ENXIO;
3936 	}
3937 
3938 	count = sc->num_slices;
3939 	err = pci_alloc_msix(sc->dev, &count);
3940 	if (err != 0) {
3941 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
3942 			      "err = %d\n", sc->num_slices, err);
3943 		goto abort_with_msix_table;
3944 	}
3945 	if (count < sc->num_slices) {
3946 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3947 			      sc->num_slices, count);
3948 		device_printf(sc->dev,
3949 			      "Try setting hw.mxge.max_slices to %d\n",
3950 			      count);
3951 		err = ENOSPC;
3952 		goto abort_with_msix;
3953 	}
3954 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3955 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3956 	if (sc->msix_irq_res == NULL) {
3957 		err = ENOMEM;
3958 		goto abort_with_msix;
3959 	}
3960 
3961 	for (i = 0; i < sc->num_slices; i++) {
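		/*
		 * SYS_RES_IRQ rid 0 is the legacy INTx line; MSI-X
		 * messages are numbered from rid 1, hence i + 1 below.
		 */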
3962 		rid = i + 1;
3963 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3964 							  SYS_RES_IRQ,
3965 							  &rid, RF_ACTIVE);
3966 		if (sc->msix_irq_res[i] == NULL) {
3967 			device_printf(sc->dev, "couldn't allocate IRQ res"
3968 				      " for message %d\n", i);
3969 			err = ENXIO;
3970 			goto abort_with_res;
3971 		}
3972 	}
3973 
3974 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3975 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
3976 
3977 	for (i = 0; i < sc->num_slices; i++) {
3978 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3979 				     INTR_TYPE_NET | INTR_MPSAFE,
3980 #if __FreeBSD_version > 700030
3981 				     NULL,
3982 #endif
3983 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
3984 		if (err != 0) {
3985 			device_printf(sc->dev, "couldn't setup intr for "
3986 				      "message %d\n", i);
3987 			goto abort_with_intr;
3988 		}
3989 	}
3990 
3991 	if (mxge_verbose) {
3992 		device_printf(sc->dev, "using %d MSI-X IRQs:",
3993 			      sc->num_slices);
3994 		for (i = 0; i < sc->num_slices; i++)
3995 			printf(" %ld",  rman_get_start(sc->msix_irq_res[i]));
3996 		printf("\n");
3997 	}
3998 	return (0);
3999 
4000 abort_with_intr:
4001 	for (i = 0; i < sc->num_slices; i++) {
4002 		if (sc->msix_ih[i] != NULL) {
4003 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4004 					  sc->msix_ih[i]);
4005 			sc->msix_ih[i] = NULL;
4006 		}
4007 	}
4008 	free(sc->msix_ih, M_DEVBUF);
4009 
4010 
4011 abort_with_res:
4012 	for (i = 0; i < sc->num_slices; i++) {
4013 		rid = i + 1;
4014 		if (sc->msix_irq_res[i] != NULL)
4015 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4016 					     sc->msix_irq_res[i]);
4017 		sc->msix_irq_res[i] = NULL;
4018 	}
4019 	free(sc->msix_irq_res, M_DEVBUF);
4020 
4021 
4022 abort_with_msix:
4023 	pci_release_msi(sc->dev);
4024 
4025 abort_with_msix_table:
4026 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4027 			     sc->msix_table_res);
4028 
4029 	return err;
4030 }
4031 
4032 static int
4033 mxge_add_single_irq(mxge_softc_t *sc)
4034 {
4035 	int count, err, rid;
4036 
4037 	count = pci_msi_count(sc->dev);
4038 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4039 		rid = 1;
4040 	} else {
4041 		rid = 0;
4042 		sc->legacy_irq = 1;
4043 	}
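	/* rid 1 selects the single MSI message; rid 0 is legacy INTx */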
4044 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
4045 					 1, RF_SHAREABLE | RF_ACTIVE);
4046 	if (sc->irq_res == NULL) {
4047 		device_printf(sc->dev, "could not alloc interrupt\n");
4048 		return ENXIO;
4049 	}
4050 	if (mxge_verbose)
4051 		device_printf(sc->dev, "using %s irq %ld\n",
4052 			      sc->legacy_irq ? "INTx" : "MSI",
4053 			      rman_get_start(sc->irq_res));
4054 	err = bus_setup_intr(sc->dev, sc->irq_res,
4055 			     INTR_TYPE_NET | INTR_MPSAFE,
4056 #if __FreeBSD_version > 700030
4057 			     NULL,
4058 #endif
4059 			     mxge_intr, &sc->ss[0], &sc->ih);
4060 	if (err != 0) {
4061 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4062 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4063 		if (!sc->legacy_irq)
4064 			pci_release_msi(sc->dev);
4065 	}
4066 	return err;
4067 }
4068 
4069 static void
4070 mxge_rem_msix_irqs(mxge_softc_t *sc)
4071 {
4072 	int i, rid;
4073 
4074 	for (i = 0; i < sc->num_slices; i++) {
4075 		if (sc->msix_ih[i] != NULL) {
4076 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4077 					  sc->msix_ih[i]);
4078 			sc->msix_ih[i] = NULL;
4079 		}
4080 	}
4081 	free(sc->msix_ih, M_DEVBUF);
4082 
4083 	for (i = 0; i < sc->num_slices; i++) {
4084 		rid = i + 1;
4085 		if (sc->msix_irq_res[i] != NULL)
4086 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4087 					     sc->msix_irq_res[i]);
4088 		sc->msix_irq_res[i] = NULL;
4089 	}
4090 	free(sc->msix_irq_res, M_DEVBUF);
4091 
4092 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4093 			     sc->msix_table_res);
4094 
4095 	pci_release_msi(sc->dev);
4096 	return;
4097 }
4098 
4099 static void
4100 mxge_rem_single_irq(mxge_softc_t *sc)
4101 {
4102 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4103 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4104 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4105 	if (!sc->legacy_irq)
4106 		pci_release_msi(sc->dev);
4107 }
4108 
4109 static void
4110 mxge_rem_irq(mxge_softc_t *sc)
4111 {
4112 	if (sc->num_slices > 1)
4113 		mxge_rem_msix_irqs(sc);
4114 	else
4115 		mxge_rem_single_irq(sc);
4116 }
4117 
4118 static int
4119 mxge_add_irq(mxge_softc_t *sc)
4120 {
4121 	int err;
4122 
4123 	if (sc->num_slices > 1)
4124 		err = mxge_add_msix_irqs(sc);
4125 	else
4126 		err = mxge_add_single_irq(sc);
4127 
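	/* apparently a debug hook, disabled by the "0 &&": it would
	   tear down and re-add the MSI-X vectors */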
4128 	if (0 && err == 0 && sc->num_slices > 1) {
4129 		mxge_rem_msix_irqs(sc);
4130 		err = mxge_add_msix_irqs(sc);
4131 	}
4132 	return err;
4133 }
4134 
4135 
4136 static int
4137 mxge_attach(device_t dev)
4138 {
4139 	mxge_softc_t *sc = device_get_softc(dev);
4140 	struct ifnet *ifp;
4141 	int err, rid;
4142 
4143 	sc->dev = dev;
4144 	mxge_fetch_tunables(sc);
4145 
4146 	err = bus_dma_tag_create(NULL,			/* parent */
4147 				 1,			/* alignment */
4148 				 0,			/* boundary */
4149 				 BUS_SPACE_MAXADDR,	/* low */
4150 				 BUS_SPACE_MAXADDR,	/* high */
4151 				 NULL, NULL,		/* filter */
4152 				 65536 + 256,		/* maxsize */
4153 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4154 				 65536,			/* maxsegsize */
4155 				 0,			/* flags */
4156 				 NULL, NULL,		/* lock */
4157 				 &sc->parent_dmat);	/* tag */
4158 
4159 	if (err != 0) {
4160 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4161 			      err);
4162 		goto abort_with_nothing;
4163 	}
4164 
4165 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4166 	if (ifp == NULL) {
4167 		device_printf(dev, "can not if_alloc()\n");
4168 		err = ENOSPC;
4169 		goto abort_with_parent_dmat;
4170 	}
4171 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4172 
4173 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4174 		 device_get_nameunit(dev));
4175 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4176 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4177 		 "%s:drv", device_get_nameunit(dev));
4178 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4179 		 MTX_NETWORK_LOCK, MTX_DEF);
4180 
4181 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
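	/* the tick callout shares driver_mtx, so callout_stop() in the
	   ioctl and detach paths excludes a concurrently running tick */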
4182 
4183 	mxge_setup_cfg_space(sc);
4184 
4185 	/* Map the board into the kernel */
4186 	rid = PCIR_BARS;
4187 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4188 					 ~0, 1, RF_ACTIVE);
4189 	if (sc->mem_res == NULL) {
4190 		device_printf(dev, "could not map memory\n");
4191 		err = ENXIO;
4192 		goto abort_with_lock;
4193 	}
4194 	sc->sram = rman_get_virtual(sc->mem_res);
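	/*
	 * Layout note (inferred from the constants below): the usable
	 * SRAM window is the 2MB aperture less regions the firmware
	 * reserves at the top (two 48KB areas and one 32KB area here)
	 * and a 256-byte pad.
	 */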
4195 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4196 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4197 		device_printf(dev, "impossible memory region size %ld\n",
4198 			      rman_get_size(sc->mem_res));
4199 		err = ENXIO;
4200 		goto abort_with_mem_res;
4201 	}
4202 
4203 	/* make a NUL-terminated copy of the EEPROM strings section of lanai
4204 	   SRAM (read 2 bytes short so the bzero'd tail terminates it) */
4205 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4206 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4207 				rman_get_bushandle(sc->mem_res),
4208 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4209 				sc->eeprom_strings,
4210 				MXGE_EEPROM_STRINGS_SIZE - 2);
4211 	err = mxge_parse_strings(sc);
4212 	if (err != 0)
4213 		goto abort_with_mem_res;
4214 
4215 	/* Enable write combining for efficient use of PCIe bus */
4216 	mxge_enable_wc(sc);
4217 
4218 	/* Allocate the out of band dma memory */
4219 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4220 			     sizeof (mxge_cmd_t), 64);
4221 	if (err != 0)
4222 		goto abort_with_mem_res;
4223 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4224 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4225 	if (err != 0)
4226 		goto abort_with_cmd_dma;
4227 
4228 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4229 	if (err != 0)
4230 		goto abort_with_zeropad_dma;
4231 
4232 	/* select & load the firmware */
4233 	err = mxge_select_firmware(sc);
4234 	if (err != 0)
4235 		goto abort_with_dmabench;
4236 	sc->intr_coal_delay = mxge_intr_coal_delay;
4237 
4238 	mxge_slice_probe(sc);
4239 	err = mxge_alloc_slices(sc);
4240 	if (err != 0)
4241 		goto abort_with_dmabench;
4242 
4243 	err = mxge_reset(sc, 0);
4244 	if (err != 0)
4245 		goto abort_with_slices;
4246 
4247 	err = mxge_alloc_rings(sc);
4248 	if (err != 0) {
4249 		device_printf(sc->dev, "failed to allocate rings\n");
4250 		goto abort_with_slices;
4251 	}
4252 
4253 	err = mxge_add_irq(sc);
4254 	if (err != 0) {
4255 		device_printf(sc->dev, "failed to add irq\n");
4256 		goto abort_with_rings;
4257 	}
4258 
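	/* XXX if_baudrate is a u_long; 10 Gb/s would overflow it on
	   32-bit platforms, which presumably explains this low value */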
4259 	ifp->if_baudrate = 100000000;
4260 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4261 		IFCAP_VLAN_MTU | IFCAP_LRO;
4262 
4263 #ifdef MXGE_NEW_VLAN_API
4264 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4265 #endif
4266 
4267 	sc->max_mtu = mxge_max_mtu(sc);
4268 	if (sc->max_mtu >= 9000)
4269 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4270 	else
4271 		device_printf(dev, "MTU limited to %d.  Install "
4272 			      "latest firmware for 9000 byte jumbo support\n",
4273 			      sc->max_mtu - ETHER_HDR_LEN);
4274 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4275 	ifp->if_capenable = ifp->if_capabilities;
4276 	if (sc->lro_cnt == 0)
4277 		ifp->if_capenable &= ~IFCAP_LRO;
4278 	sc->csum_flag = 1;
4279 	ifp->if_init = mxge_init;
4280 	ifp->if_softc = sc;
4281 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4282 	ifp->if_ioctl = mxge_ioctl;
4283 	ifp->if_start = mxge_start;
4284 	/* Initialise the ifmedia structure */
4285 	ifmedia_init(&sc->media, 0, mxge_media_change,
4286 		     mxge_media_status);
4287 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4288 	mxge_media_probe(sc);
4289 	ether_ifattach(ifp, sc->mac_addr);
4290 	/* ether_ifattach sets mtu to 1500 */
4291 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
4292 		ifp->if_mtu = 9000;
4293 
4294 	mxge_add_sysctls(sc);
4295 	return 0;
4296 
4297 abort_with_rings:
4298 	mxge_free_rings(sc);
4299 abort_with_slices:
4300 	mxge_free_slices(sc);
4301 abort_with_dmabench:
4302 	mxge_dma_free(&sc->dmabench_dma);
4303 abort_with_zeropad_dma:
4304 	mxge_dma_free(&sc->zeropad_dma);
4305 abort_with_cmd_dma:
4306 	mxge_dma_free(&sc->cmd_dma);
4307 abort_with_mem_res:
4308 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4309 abort_with_lock:
4310 	pci_disable_busmaster(dev);
4311 	mtx_destroy(&sc->cmd_mtx);
4312 	mtx_destroy(&sc->driver_mtx);
4313 	if_free(ifp);
4314 abort_with_parent_dmat:
4315 	bus_dma_tag_destroy(sc->parent_dmat);
4316 
4317 abort_with_nothing:
4318 	return err;
4319 }
4320 
4321 static int
4322 mxge_detach(device_t dev)
4323 {
4324 	mxge_softc_t *sc = device_get_softc(dev);
4325 
4326 	if (mxge_vlans_active(sc)) {
4327 		device_printf(sc->dev,
4328 			      "Detach vlans before removing module\n");
4329 		return EBUSY;
4330 	}
4331 	mtx_lock(&sc->driver_mtx);
4332 	callout_stop(&sc->co_hdl);
4333 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4334 		mxge_close(sc);
4335 	mtx_unlock(&sc->driver_mtx);
4336 	ether_ifdetach(sc->ifp);
4337 	ifmedia_removeall(&sc->media);
4338 	mxge_dummy_rdma(sc, 0);
4339 	mxge_rem_sysctls(sc);
4340 	mxge_rem_irq(sc);
4341 	mxge_free_rings(sc);
4342 	mxge_free_slices(sc);
4343 	mxge_dma_free(&sc->dmabench_dma);
4344 	mxge_dma_free(&sc->zeropad_dma);
4345 	mxge_dma_free(&sc->cmd_dma);
4346 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4347 	pci_disable_busmaster(dev);
4348 	mtx_destroy(&sc->cmd_mtx);
4349 	mtx_destroy(&sc->driver_mtx);
4350 	if_free(sc->ifp);
4351 	bus_dma_tag_destroy(sc->parent_dmat);
4352 	return 0;
4353 }
4354 
4355 static int
4356 mxge_shutdown(device_t dev)
4357 {
4358 	return 0;
4359 }
4360 
4361 /*
4362   This file uses Myri10GE driver indentation.
4363 
4364   Local Variables:
4365   c-file-style:"linux"
4366   tab-width:8
4367   End:
4368 */
4369