/******************************************************************************

Copyright (c) 2006-2008, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/memrange.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	struct mem_range_desc mrdesc;
	vm_paddr_t pa;
	vm_offset_t len;
	int err, action;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err == 0)
		return;
	else
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
	pa = rman_get_start(sc->mem_res);
	mrdesc.mr_base = pa;
	mrdesc.mr_len = len;
	mrdesc.mr_flags = MDF_WRITECOMBINE;
	action = MEMRANGE_SET_UPDATE;
	strcpy((char *)&mrdesc.mr_owner, "mxge");
	err = mem_range_attr_set(&mrdesc, &action);
	if (err != 0) {
		sc->wc = 0;
		device_printf(sc->dev,
			      "w/c failed for pa 0x%lx, len 0x%lx, err = %d\n",
			      (unsigned long)pa, (unsigned long)len, err);
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}
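
/*
 * Usage sketch (illustrative only; values are hypothetical): a typical
 * allocate/use/free cycle for a 4KB, 4KB-aligned DMA region.  dma.addr
 * is the kernel virtual address; dma.bus_addr is what the NIC is given.
 *
 *	mxge_dma_t dma;
 *
 *	if (mxge_dma_alloc(sc, &dma, 4096, 4096) == 0) {
 *		... fill dma.addr, hand dma.bus_addr to the NIC ...
 *		mxge_dma_free(&dma);
 *	}
 */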

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */
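
/*
 * For example (values hypothetical), the string section might hold:
 *
 *	SN=123456\0MAC=00:60:dd:12:34:56\0PC=SAMPLE-CODE\0\0
 *
 * mxge_parse_strings() below walks these NUL-separated strings and
 * records the MAC address, product code and serial number.
 */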

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)	/* skip past the next NUL */

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			/* step to 'A' so that each "ptr += 3" below
			   lands on the next two-digit hex byte */
			ptr += 1;
			sc->mac_addr_string = ptr;
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if defined(__i386) || defined(i386) || defined(__i386__) || defined(__x86_64__) || defined(__amd64)
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x0378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machines the 0xe0000000 mapping is
	 * handled by the nvidia chipset, which means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}
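
/*
 * Worked example (hypothetical numbers): with len = 4096 and a completion
 * of cmd.data0 = (1000 << 16) | 500, the firmware completed 1000 transfers
 * of 4096 bytes in 500 ticks of 0.5us each (250us total).  The "* 2" in
 * the math above converts 0.5us ticks to microseconds, so
 * read_dma = (1000 * 4096 * 2) / 500 = 16384 bytes/us, i.e. 16384 MB/s.
 */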

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */
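
/*
 * Sketch of the resulting policy (a summary of the comment above, not
 * additional logic):
 *
 *	completions known aligned  ->  mxge_eth_z8e,  tx_boundary = 4096
 *	unaligned or unverifiable  ->  mxge_ethp_z8e, tx_boundary = 2048
 */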

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		mb();
		dummy = *sc->sram;
		mb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	mb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		mb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}
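
/*
 * Usage sketch, mirroring mxge_reset() below: query a firmware offset
 * and only use the returned cmd.data0 when the command succeeded.
 *
 *	mxge_cmd_t cmd;
 *
 *	cmd.data0 = 0;
 *	if (mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd) == 0)
 *		irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
 */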

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	mb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	mb();
	DELAY(1000);
	mb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev,"handoff failed (%p = 0x%x)",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}
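
/*
 * For example, a (hypothetical) MAC address of 00:60:dd:11:22:33 packs
 * into cmd.data0 = 0x0060dd11 and cmd.data1 = 0x00002233 above.
 */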

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       " %d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed to set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		callout_stop(&sc->co_hdl);
		mxge_close(sc);
		err = mxge_open(sc);
		if (err == 0)
			callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "enable flow control (PAUSE frames)");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
	}
}
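
/*
 * These OIDs hang off the device's sysctl tree, so (assuming unit 0)
 * they can be read and tuned from userland, e.g.:
 *
 *	sysctl dev.mxge.0.intr_coal_delay=30
 *	sysctl dev.mxge.0.slice.0.rx_small_cnt
 */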

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                mb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        mb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        mb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                mb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        mb();
}
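
/*
 * Wrap example (hypothetical ring size): with tx->mask == 511 and
 * tx->req == 510, a 4-descriptor request fails the (idx + cnt) < tx->mask
 * test, so mxge_submit_req_backwards() writes slots 1, 0 and 511 first,
 * and slot 510 (the first descriptor) is written and validated last.
 */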
1732 
1733 static void
1734 mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
1735 	       int busdma_seg_cnt, int ip_off)
1736 {
1737 	mxge_tx_ring_t *tx;
1738 	mcp_kreq_ether_send_t *req;
1739 	bus_dma_segment_t *seg;
1740 	struct ip *ip;
1741 	struct tcphdr *tcp;
1742 	uint32_t low, high_swapped;
1743 	int len, seglen, cum_len, cum_len_next;
1744 	int next_is_first, chop, cnt, rdma_count, small;
1745 	uint16_t pseudo_hdr_offset, cksum_offset, mss;
1746 	uint8_t flags, flags_next;
1747 	static int once;
1748 
1749 	mss = m->m_pkthdr.tso_segsz;
1750 
1751 	/* negative cum_len signifies to the
1752 	 * send loop that we are still in the
1753 	 * header portion of the TSO packet.
1754 	 */
1755 
1756 	/* ensure we have the ethernet, IP and TCP
1757 	   header together in the first mbuf, copy
1758 	   it to a scratch buffer if not */
1759 	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1760 		m_copydata(m, 0, ip_off + sizeof (*ip),
1761 			   ss->scratch);
1762 		ip = (struct ip *)(ss->scratch + ip_off);
1763 	} else {
1764 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1765 	}
1766 	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
1767 			    + sizeof (*tcp))) {
1768 		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
1769 			   + sizeof (*tcp),  ss->scratch);
1770 		ip = (struct ip *)(mtod(m, char *) + ip_off);
1771 	}
1772 
1773 	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
1774 	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));
1775 
1776 	/* TSO implies checksum offload on this hardware */
1777 	cksum_offset = ip_off + (ip->ip_hl << 2);
1778 	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1779 
1780 
1781 	/* for TSO, pseudo_hdr_offset holds mss.
1782 	 * The firmware figures out where to put
1783 	 * the checksum by parsing the header. */
1784 	pseudo_hdr_offset = htobe16(mss);
1785 
1786 	tx = &ss->tx;
1787 	req = tx->req_list;
1788 	seg = tx->seg_list;
1789 	cnt = 0;
1790 	rdma_count = 0;
1791 	/* "rdma_count" is the number of RDMAs belonging to the
1792 	 * current packet BEFORE the current send request. For
1793 	 * non-TSO packets, this is equal to "count".
1794 	 * For TSO packets, rdma_count needs to be reset
1795 	 * to 0 after a segment cut.
1796 	 *
1797 	 * The rdma_count field of the send request is
1798 	 * the number of RDMAs of the packet starting at
1799 	 * that request. For TSO send requests with one ore more cuts
1800 	 * in the middle, this is the number of RDMAs starting
1801 	 * after the last cut in the request. All previous
1802 	 * segments before the last cut implicitly have 1 RDMA.
1803 	 *
1804 	 * Since the number of RDMAs is not known beforehand,
1805 	 * it must be filled-in retroactively - after each
1806 	 * segmentation cut or at the end of the entire packet.
1807 	 */
1808 
1809 	while (busdma_seg_cnt) {
1810 		/* Break the busdma segment up into pieces*/
1811 		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1812 		high_swapped = 	htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1813 		len = seg->ds_len;
1814 
1815 		while (len) {
1816 			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1817 			seglen = len;
1818 			cum_len_next = cum_len + seglen;
1819 			(req-rdma_count)->rdma_count = rdma_count + 1;
1820 			if (__predict_true(cum_len >= 0)) {
1821 				/* payload */
1822 				chop = (cum_len_next > mss);
1823 				cum_len_next = cum_len_next % mss;
1824 				next_is_first = (cum_len_next == 0);
1825 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1826 				flags_next |= next_is_first *
1827 					MXGEFW_FLAGS_FIRST;
1828 				rdma_count |= -(chop | next_is_first);
1829 				rdma_count += chop & !next_is_first;
1830 			} else if (cum_len_next >= 0) {
1831 				/* header ends */
1832 				rdma_count = -1;
1833 				cum_len_next = 0;
1834 				seglen = -cum_len;
1835 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1836 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1837 					MXGEFW_FLAGS_FIRST |
1838 					(small * MXGEFW_FLAGS_SMALL);
1839 			}
1840 
1841 			req->addr_high = high_swapped;
1842 			req->addr_low = htobe32(low);
1843 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1844 			req->pad = 0;
1845 			req->rdma_count = 1;
1846 			req->length = htobe16(seglen);
1847 			req->cksum_offset = cksum_offset;
1848 			req->flags = flags | ((cum_len & 1) *
1849 					      MXGEFW_FLAGS_ALIGN_ODD);
1850 			low += seglen;
1851 			len -= seglen;
1852 			cum_len = cum_len_next;
1853 			flags = flags_next;
1854 			req++;
1855 			cnt++;
1856 			rdma_count++;
1857 			if (__predict_false(cksum_offset > seglen))
1858 				cksum_offset -= seglen;
1859 			else
1860 				cksum_offset = 0;
1861 			if (__predict_false(cnt > tx->max_desc))
1862 				goto drop;
1863 		}
1864 		busdma_seg_cnt--;
1865 		seg++;
1866 	}
1867 	(req-rdma_count)->rdma_count = rdma_count;
1868 
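	/*
	 * Walk backwards from the final descriptor, tagging each one
	 * with TSO_LAST until we reach the descriptor that opened the
	 * last TSO segment (flagged FIRST, or carrying the last CHOP).
	 */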
1869 	do {
1870 		req--;
1871 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1872 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1873 
1874 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1875 	mxge_submit_req(tx, tx->req_list, cnt);
1876 	return;
1877 
1878 drop:
1879 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1880 	m_freem(m);
1881 	ss->sc->ifp->if_oerrors++;
1882 	if (!once) {
1883 		printf("tx->max_desc exceeded via TSO!\n");
1884 		printf("mss = %d, %ld, %d!\n", mss,
1885 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1886 		once = 1;
1887 	}
1888 	return;
1889 
1890 }
1891 
1892 /*
1893  * We reproduce the software vlan tag insertion from
1894  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1895  * vlan tag insertion. We need to advertise this in order to have the
1896  * vlan interface respect our csum offload flags.
1897  */
1898 static struct mbuf *
1899 mxge_vlan_tag_insert(struct mbuf *m)
1900 {
1901 	struct ether_vlan_header *evl;
1902 
1903 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1904 	if (__predict_false(m == NULL))
1905 		return NULL;
1906 	if (m->m_len < sizeof(*evl)) {
1907 		m = m_pullup(m, sizeof(*evl));
1908 		if (__predict_false(m == NULL))
1909 			return NULL;
1910 	}
1911 	/*
1912 	 * Transform the Ethernet header into an Ethernet header
1913 	 * with 802.1Q encapsulation.
1914 	 */
1915 	evl = mtod(m, struct ether_vlan_header *);
1916 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1917 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1918 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1919 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1920 	m->m_flags &= ~M_VLANTAG;
1921 	return m;
1922 }
1923 
1924 static void
1925 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1926 {
1927 	mxge_softc_t *sc;
1928 	mcp_kreq_ether_send_t *req;
1929 	bus_dma_segment_t *seg;
1930 	struct mbuf *m_tmp;
1931 	struct ifnet *ifp;
1932 	mxge_tx_ring_t *tx;
1933 	struct ip *ip;
1934 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1935 	uint16_t pseudo_hdr_offset;
1936         uint8_t flags, cksum_offset;
1937 
1938 
1939 	sc = ss->sc;
1940 	ifp = sc->ifp;
1941 	tx = &ss->tx;
1942 
1943 	ip_off = sizeof (struct ether_header);
1944 	if (m->m_flags & M_VLANTAG) {
1945 		m = mxge_vlan_tag_insert(m);
1946 		if (__predict_false(m == NULL))
1947 			goto drop;
1948 		ip_off += ETHER_VLAN_ENCAP_LEN;
1949 	}
1950 
1951 	/* (try to) map the frame for DMA */
1952 	idx = tx->req & tx->mask;
1953 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1954 				      m, tx->seg_list, &cnt,
1955 				      BUS_DMA_NOWAIT);
1956 	if (__predict_false(err == EFBIG)) {
1957 		/* Too many segments in the chain.  Try
1958 		   to defrag */
1959 		m_tmp = m_defrag(m, M_NOWAIT);
1960 		if (m_tmp == NULL) {
1961 			goto drop;
1962 		}
1963 		ss->tx.defrag++;
1964 		m = m_tmp;
1965 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1966 					      tx->info[idx].map,
1967 					      m, tx->seg_list, &cnt,
1968 					      BUS_DMA_NOWAIT);
1969 	}
1970 	if (__predict_false(err != 0)) {
1971 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1972 			      " packet len = %d\n", err, m->m_pkthdr.len);
1973 		goto drop;
1974 	}
1975 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1976 			BUS_DMASYNC_PREWRITE);
1977 	tx->info[idx].m = m;
1978 
1979 
1980 	/* TSO is different enough, we handle it in another routine */
1981 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1982 		mxge_encap_tso(ss, m, cnt, ip_off);
1983 		return;
1984 	}
1985 
1986 	req = tx->req_list;
1987 	cksum_offset = 0;
1988 	pseudo_hdr_offset = 0;
1989 	flags = MXGEFW_FLAGS_NO_TSO;
1990 
1991 	/* checksum offloading? */
1992 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1993 		/* ensure ip header is in first mbuf, copy
1994 		   it to a scratch buffer if not */
1995 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1996 			m_copydata(m, 0, ip_off + sizeof (*ip),
1997 				   ss->scratch);
1998 			ip = (struct ip *)(ss->scratch + ip_off);
1999 		} else {
2000 			ip = (struct ip *)(mtod(m, char *) + ip_off);
2001 		}
2002 		cksum_offset = ip_off + (ip->ip_hl << 2);
2003 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
2004 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
2005 		req->cksum_offset = cksum_offset;
2006 		flags |= MXGEFW_FLAGS_CKSUM;
2007 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
2008 	} else {
2009 		odd_flag = 0;
2010 	}
2011 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2012 		flags |= MXGEFW_FLAGS_SMALL;
2013 
2014 	/* convert segments into a request list */
2015 	cum_len = 0;
2016 	seg = tx->seg_list;
2017 	req->flags = MXGEFW_FLAGS_FIRST;
2018 	for (i = 0; i < cnt; i++) {
2019 		req->addr_low =
2020 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2021 		req->addr_high =
2022 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2023 		req->length = htobe16(seg->ds_len);
2024 		req->cksum_offset = cksum_offset;
2025 		if (cksum_offset > seg->ds_len)
2026 			cksum_offset -= seg->ds_len;
2027 		else
2028 			cksum_offset = 0;
2029 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2030 		req->pad = 0; /* complete solid 16-byte block */
2031 		req->rdma_count = 1;
2032 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2033 		cum_len += seg->ds_len;
2034 		seg++;
2035 		req++;
2036 		req->flags = 0;
2037 	}
2038 	req--;
2039 	/* pad runts to 60 bytes (the Ethernet minimum frame size,
2039 	   less the 4-byte FCS, which the NIC appends) */
2040 	if (cum_len < 60) {
2041 		req++;
2042 		req->addr_low =
2043 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2044 		req->addr_high =
2045 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2046 		req->length = htobe16(60 - cum_len);
2047 		req->cksum_offset = 0;
2048 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2049 		req->pad = 0; /* complete solid 16-byte block */
2050 		req->rdma_count = 1;
2051 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2052 		cnt++;
2053 	}
2054 
2055 	tx->req_list[0].rdma_count = cnt;
2056 #if 0
2057 	/* print what the firmware will see */
2058 	for (i = 0; i < cnt; i++) {
2059 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2060 		    "cso:%d, flags:0x%x, rdma:%d\n",
2061 		    i, (int)ntohl(tx->req_list[i].addr_high),
2062 		    (int)ntohl(tx->req_list[i].addr_low),
2063 		    (int)ntohs(tx->req_list[i].length),
2064 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2065 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2066 		    tx->req_list[i].rdma_count);
2067 	}
2068 	printf("--------------\n");
2069 #endif
2070 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2071 	mxge_submit_req(tx, tx->req_list, cnt);
2072 	return;
2073 
2074 drop:
2075 	m_freem(m);
2076 	ifp->if_oerrors++;
2077 	return;
2078 }
2079 
2080 
2081 
2082 
2083 static inline void
2084 mxge_start_locked(struct mxge_slice_state *ss)
2085 {
2086 	mxge_softc_t *sc;
2087 	struct mbuf *m;
2088 	struct ifnet *ifp;
2089 	mxge_tx_ring_t *tx;
2090 
2091 	sc = ss->sc;
2092 	ifp = sc->ifp;
2093 	tx = &ss->tx;
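	/* dequeue and transmit while the ring still has room for at
	   least one worst-case request (tx->max_desc descriptors) */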
2094 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2095 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2096 		if (m == NULL) {
2097 			return;
2098 		}
2099 		/* let BPF see it */
2100 		BPF_MTAP(ifp, m);
2101 
2102 		/* give it to the nic */
2103 		mxge_encap(ss, m);
2104 	}
2105 	/* ran out of transmit slots */
2106 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2107 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2108 		tx->stall++;
2109 	}
2110 }
2111 
2112 static void
2113 mxge_start(struct ifnet *ifp)
2114 {
2115 	mxge_softc_t *sc = ifp->if_softc;
2116 	struct mxge_slice_state *ss;
2117 
2118 	/* only use the first slice for now */
2119 	ss = &sc->ss[0];
2120 	mtx_lock(&ss->tx.mtx);
2121 	mxge_start_locked(ss);
2122 	mtx_unlock(&ss->tx.mtx);
2123 }
2124 
2125 /*
2126  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2127  * at most 32 bytes at a time, so as to avoid involving the software
2128  * pio handler in the nic.   We re-write the first segment's low
2129  * DMA address to mark it valid only after we write the entire chunk
2130  * in a burst
2131  */
2132 static inline void
2133 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2134 		mcp_kreq_ether_recv_t *src)
2135 {
2136 	uint32_t low;
2137 
2138 	low = src->addr_low;
2139 	src->addr_low = 0xffffffff;
2140 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2141 	mb();
2142 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2143 	mb();
2144 	src->addr_low = low;
2145 	dst->addr_low = low;
2146 	mb();
2147 }
2148 
2149 static int
2150 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2151 {
2152 	bus_dma_segment_t seg;
2153 	struct mbuf *m;
2154 	mxge_rx_ring_t *rx = &ss->rx_small;
2155 	int cnt, err;
2156 
2157 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2158 	if (m == NULL) {
2159 		rx->alloc_fail++;
2160 		err = ENOBUFS;
2161 		goto done;
2162 	}
2163 	m->m_len = MHLEN;
2164 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2165 				      &seg, &cnt, BUS_DMA_NOWAIT);
2166 	if (err != 0) {
2167 		m_free(m);
2168 		goto done;
2169 	}
2170 	rx->info[idx].m = m;
2171 	rx->shadow[idx].addr_low =
2172 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2173 	rx->shadow[idx].addr_high =
2174 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2175 
2176 done:
2177 	if ((idx & 7) == 7)
2178 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2179 	return err;
2180 }
2181 
2182 static int
2183 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2184 {
2185 	bus_dma_segment_t seg[3];
2186 	struct mbuf *m;
2187 	mxge_rx_ring_t *rx = &ss->rx_big;
2188 	int cnt, err, i;
2189 
2190 	if (rx->cl_size == MCLBYTES)
2191 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2192 	else
2193 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2194 	if (m == NULL) {
2195 		rx->alloc_fail++;
2196 		err = ENOBUFS;
2197 		goto done;
2198 	}
2199 	m->m_len = rx->cl_size;
2200 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2201 				      seg, &cnt, BUS_DMA_NOWAIT);
2202 	if (err != 0) {
2203 		m_free(m);
2204 		goto done;
2205 	}
2206 	rx->info[idx].m = m;
2207 
2208 	for (i = 0; i < cnt; i++) {
2209 		rx->shadow[idx + i].addr_low =
2210 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2211 		rx->shadow[idx + i].addr_high =
2212 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2213 	}
2214 
2215 
2216 done:
2217 	for (i = 0; i < rx->nbufs; i++) {
2218 		if ((idx & 7) == 7) {
2219 			mxge_submit_8rx(&rx->lanai[idx - 7],
2220 					&rx->shadow[idx - 7]);
2221 		}
2222 		idx++;
2223 	}
2224 	return err;
2225 }
2226 
2227 /*
2228  *  Myri10GE hardware checksums are not valid if the sender
2229  *  padded the frame with non-zero padding.  This is because
2230  *  the firmware just does a simple 16-bit 1s complement
2231  *  checksum across the entire frame, excluding the first 14
2232  *  bytes.  It is best to simply check the checksum and
2233  *  tell the stack about it only if the checksum is good
2234  */
2235 
2236 static inline uint16_t
2237 mxge_rx_csum(struct mbuf *m, int csum)
2238 {
2239 	struct ether_header *eh;
2240 	struct ip *ip;
2241 	uint16_t c;
2242 
2243 	eh = mtod(m, struct ether_header *);
2244 
2245 	/* only deal with IPv4 TCP & UDP for now */
2246 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2247 		return 1;
2248 	ip = (struct ip *)(eh + 1);
2249 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2250 			    ip->ip_p != IPPROTO_UDP))
2251 		return 1;
2252 
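	/*
	 * The firmware's csum covers the IP header and the TCP/UDP
	 * segment.  A valid IP header sums to 0xffff (zero in ones-
	 * complement), so csum is effectively the sum of the segment
	 * alone.  Folding in the pseudo-header (addresses, protocol,
	 * and the segment length ip_len minus the IP header length)
	 * must then yield 0xffff for a good packet; the xor below
	 * maps that onto the 0 the callers test for.
	 */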
2253 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2254 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2255 			    (ip->ip_hl << 2) + ip->ip_p));
2256 	c ^= 0xffff;
2257 	return (c);
2258 }
2259 
2260 static void
2261 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2262 {
2263 	struct ether_vlan_header *evl;
2264 	struct ether_header *eh;
2265 	uint32_t partial;
2266 
2267 	evl = mtod(m, struct ether_vlan_header *);
2268 	eh = mtod(m, struct ether_header *);
2269 
2270 	/*
2271 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2272 	 * after what the firmware thought was the end of the ethernet
2273 	 * header.
2274 	 */
2275 
2276 	/* put checksum into host byte order */
2277 	*csum = ntohs(*csum);
2278 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
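	/*
	 * Ones-complement subtraction: adding ~partial subtracts the
	 * 4 tag bytes from the running sum, the (< ~partial) test
	 * re-adds the end-around carry, and the two folds reduce the
	 * 32-bit accumulator back to 16 bits.
	 */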
2279 	(*csum) += ~partial;
2280 	(*csum) +=  ((*csum) < ~partial);
2281 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2282 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2283 
2284 	/* restore checksum to network byte order;
2285 	   later consumers expect this */
2286 	*csum = htons(*csum);
2287 
2288 	/* save the tag */
2289 	m->m_flags |= M_VLANTAG;
2290 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2291 
2292 	/*
2293 	 * Remove the 802.1q header by copying the Ethernet
2294 	 * addresses over it and adjusting the beginning of
2295 	 * the data in the mbuf.  The encapsulated Ethernet
2296 	 * type field is already in place.
2297 	 */
2298 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2299 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2300 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2301 }
2302 
2303 
2304 static inline void
2305 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2306 {
2307 	mxge_softc_t *sc;
2308 	struct ifnet *ifp;
2309 	struct mbuf *m;
2310 	struct ether_header *eh;
2311 	mxge_rx_ring_t *rx;
2312 	bus_dmamap_t old_map;
2313 	int idx;
2314 	uint16_t tcpudp_csum;
2315 
2316 	sc = ss->sc;
2317 	ifp = sc->ifp;
2318 	rx = &ss->rx_big;
2319 	idx = rx->cnt & rx->mask;
2320 	rx->cnt += rx->nbufs;
2321 	/* save a pointer to the received mbuf */
2322 	m = rx->info[idx].m;
2323 	/* try to replace the received mbuf */
2324 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2325 		/* drop the frame -- the old mbuf is re-cycled */
2326 		ifp->if_ierrors++;
2327 		return;
2328 	}
2329 
2330 	/* unmap the received buffer */
2331 	old_map = rx->info[idx].map;
2332 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2333 	bus_dmamap_unload(rx->dmat, old_map);
2334 
2335 	/* swap the bus_dmamap_t's */
2336 	rx->info[idx].map = rx->extra_map;
2337 	rx->extra_map = old_map;
2338 
2339 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2340 	 * aligned */
2341 	m->m_data += MXGEFW_PAD;
2342 
2343 	m->m_pkthdr.rcvif = ifp;
2344 	m->m_len = m->m_pkthdr.len = len;
2345 	ss->ipackets++;
2346 	eh = mtod(m, struct ether_header *);
2347 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2348 		mxge_vlan_tag_remove(m, &csum);
2349 	}
2350 	/* if the checksum is valid, mark it in the mbuf header */
2351 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2352 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2353 			return;
2354 		/* otherwise, it was a UDP frame, or a TCP frame which
2355 		   we could not do LRO on.  Tell the stack that the
2356 		   checksum is good */
2357 		m->m_pkthdr.csum_data = 0xffff;
2358 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2359 	}
2360 	/* pass the frame up the stack */
2361 	(*ifp->if_input)(ifp, m);
2362 }
2363 
2364 static inline void
2365 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2366 {
2367 	mxge_softc_t *sc;
2368 	struct ifnet *ifp;
2369 	struct ether_header *eh;
2370 	struct mbuf *m;
2371 	mxge_rx_ring_t *rx;
2372 	bus_dmamap_t old_map;
2373 	int idx;
2374 	uint16_t tcpudp_csum;
2375 
2376 	sc = ss->sc;
2377 	ifp = sc->ifp;
2378 	rx = &ss->rx_small;
2379 	idx = rx->cnt & rx->mask;
2380 	rx->cnt++;
2381 	/* save a pointer to the received mbuf */
2382 	m = rx->info[idx].m;
2383 	/* try to replace the received mbuf */
2384 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2385 		/* drop the frame -- the old mbuf is re-cycled */
2386 		ifp->if_ierrors++;
2387 		return;
2388 	}
2389 
2390 	/* unmap the received buffer */
2391 	old_map = rx->info[idx].map;
2392 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2393 	bus_dmamap_unload(rx->dmat, old_map);
2394 
2395 	/* swap the bus_dmamap_t's */
2396 	rx->info[idx].map = rx->extra_map;
2397 	rx->extra_map = old_map;
2398 
2399 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2400 	 * aligned */
2401 	m->m_data += MXGEFW_PAD;
2402 
2403 	m->m_pkthdr.rcvif = ifp;
2404 	m->m_len = m->m_pkthdr.len = len;
2405 	ss->ipackets++;
2406 	eh = mtod(m, struct ether_header *);
2407 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2408 		mxge_vlan_tag_remove(m, &csum);
2409 	}
2410 	/* if the checksum is valid, mark it in the mbuf header */
2411 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2412 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2413 			return;
2414 		/* otherwise, it was a UDP frame, or a TCP frame which
2415 		   we could not do LRO on.  Tell the stack that the
2416 		   checksum is good */
2417 		m->m_pkthdr.csum_data = 0xffff;
2418 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2419 	}
2420 	/* pass the frame up the stack */
2421 	(*ifp->if_input)(ifp, m);
2422 }
2423 
2424 static inline void
2425 mxge_clean_rx_done(struct mxge_slice_state *ss)
2426 {
2427 	mxge_rx_done_t *rx_done = &ss->rx_done;
2428 	struct lro_entry *lro;
2429 	int limit = 0;
2430 	uint16_t length;
2431 	uint16_t checksum;
2432 
2433 
2434 	while (rx_done->entry[rx_done->idx].length != 0) {
2435 		length = ntohs(rx_done->entry[rx_done->idx].length);
2436 		rx_done->entry[rx_done->idx].length = 0;
2437 		checksum = rx_done->entry[rx_done->idx].checksum;
2438 		if (length <= (MHLEN - MXGEFW_PAD))
2439 			mxge_rx_done_small(ss, length, checksum);
2440 		else
2441 			mxge_rx_done_big(ss, length, checksum);
2442 		rx_done->cnt++;
2443 		rx_done->idx = rx_done->cnt & rx_done->mask;
2444 
2445 		/* limit potential for livelock */
2446 		if (__predict_false(++limit > rx_done->mask / 2))
2447 			break;
2448 	}
2449 	while (!SLIST_EMPTY(&ss->lro_active)) {
2450 		lro = SLIST_FIRST(&ss->lro_active);
2451 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2452 		mxge_lro_flush(ss, lro);
2453 	}
2454 }
2455 
2456 
2457 static inline void
2458 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2459 {
2460 	struct ifnet *ifp;
2461 	mxge_tx_ring_t *tx;
2462 	struct mbuf *m;
2463 	bus_dmamap_t map;
2464 	int idx;
2465 
2466 	tx = &ss->tx;
2467 	ifp = ss->sc->ifp;
2468 	while (tx->pkt_done != mcp_idx) {
2469 		idx = tx->done & tx->mask;
2470 		tx->done++;
2471 		m = tx->info[idx].m;
2472 		/* mbuf and DMA map only attached to the first
2473 		   segment per-mbuf */
2474 		if (m != NULL) {
2475 			ifp->if_opackets++;
2476 			tx->info[idx].m = NULL;
2477 			map = tx->info[idx].map;
2478 			bus_dmamap_unload(tx->dmat, map);
2479 			m_freem(m);
2480 		}
2481 		if (tx->info[idx].flag) {
2482 			tx->info[idx].flag = 0;
2483 			tx->pkt_done++;
2484 		}
2485 	}
2486 
2487 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2488 	   it's OK to send packets */
2489 
2490 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2491 	    tx->req - tx->done < (tx->mask + 1)/4) {
2492 		mtx_lock(&ss->tx.mtx);
2493 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2494 		ss->tx.wake++;
2495 		mxge_start_locked(ss);
2496 		mtx_unlock(&ss->tx.mtx);
2497 	}
2498 }
2499 
2500 static struct mxge_media_type mxge_media_types[] =
2501 {
2502 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2503 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2504 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2505 	{0,		(1 << 5),	"10GBASE-ER"},
2506 	{0,		(1 << 4),	"10GBASE-LRM"},
2507 	{0,		(1 << 3),	"10GBASE-SW"},
2508 	{0,		(1 << 2),	"10GBASE-LW"},
2509 	{0,		(1 << 1),	"10GBASE-EW"},
2510 	{0,		(1 << 0),	"Reserved"}
2511 };
2512 
2513 static void
2514 mxge_set_media(mxge_softc_t *sc, int type)
2515 {
2516 	sc->media_flags |= type;
2517 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2518 	ifmedia_set(&sc->media, sc->media_flags);
2519 }
2520 
2521 
2522 /*
2523  * Determine the media type for a NIC.  Some XFPs will identify
2524  * themselves only when their link is up, so this is initiated via a
2525  * link up interrupt.  However, this can potentially take up to
2526  * several milliseconds, so it is run via the watchdog routine, rather
2527  * than in the interrupt handler itself.   This need only be done
2528  * once, not each time the link is up.
2529  */
2530 static void
2531 mxge_media_probe(mxge_softc_t *sc)
2532 {
2533 	mxge_cmd_t cmd;
2534 	char *ptr;
2535 	int i, err, ms;
2536 
2537 	sc->need_media_probe = 0;
2538 
2539 	/* if we've already set a media type, we're done */
2540 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2541 		return;
2542 
2543 	/*
2544 	 * parse the product code to determine the interface type
2545 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2546 	 * after the 3rd dash in the driver's cached copy of the
2547 	 * EEPROM's product code string.
2548 	 */
2549 	ptr = sc->product_code_string;
2550 	if (ptr == NULL) {
2551 		device_printf(sc->dev, "Missing product code\n");
2552 	}
2553 
2554 	for (i = 0; i < 3; i++, ptr++) {
2555 		ptr = strchr(ptr, '-');
2556 		if (ptr == NULL) {
2557 			device_printf(sc->dev,
2558 				      "only %d dashes in PC?!?\n", i);
2559 			return;
2560 		}
2561 	}
2562 	if (*ptr == 'C') {
2563 		mxge_set_media(sc, IFM_10G_CX4);
2564 		return;
2565 	}
2566 	else if (*ptr == 'Q') {
2567 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2568 		/* FreeBSD has no media type for Quad ribbon fiber */
2569 		return;
2570 	}
2571 
2572 	if (*ptr != 'R') {
2573 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2574 		return;
2575 	}
2576 
2577 	/*
2578 	 * At this point we know the NIC has an XFP cage, so now we
2579 	 * try to determine what is in the cage by using the
2580 	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2581 	 * register.  We read just one byte, which may take over
2582 	 * a millisecond
2583 	 */
2584 
2585 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2586 	cmd.data1 = MXGE_XFP_COMPLIANCE_BYTE; /* the byte we want */
2587 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_I2C_READ, &cmd);
2588 	if (err == MXGEFW_CMD_ERROR_XFP_FAILURE) {
2589 		device_printf(sc->dev, "failed to read XFP\n");
2590 	}
2591 	if (err == MXGEFW_CMD_ERROR_XFP_ABSENT) {
2592 		device_printf(sc->dev, "Type R with no XFP!?!?\n");
2593 	}
2594 	if (err != MXGEFW_CMD_OK) {
2595 		return;
2596 	}
2597 
2598 	/* now we wait for the data to be cached */
2599 	cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2600 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2601 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2602 		DELAY(1000);
2603 		cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2604 		err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2605 	}
2606 	if (err != MXGEFW_CMD_OK) {
2607 		device_printf(sc->dev, "failed to read XFP (%d, %dms)\n",
2608 			      err, ms);
2609 		return;
2610 	}
2611 
2612 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2613 		if (mxge_verbose)
2614 			device_printf(sc->dev, "XFP:%s\n",
2615 				      mxge_media_types[0].name);
2616 		mxge_set_media(sc, IFM_10G_CX4);
2617 		return;
2618 	}
2619 	for (i = 1;
2620 	     i < sizeof (mxge_media_types) / sizeof (mxge_media_types[0]);
2621 	     i++) {
2622 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2623 			if (mxge_verbose)
2624 				device_printf(sc->dev, "XFP:%s\n",
2625 					      mxge_media_types[i].name);
2626 
2627 			mxge_set_media(sc, mxge_media_types[i].flag);
2628 			return;
2629 		}
2630 	}
2631 	device_printf(sc->dev, "XFP media 0x%x unknown\n", cmd.data0);
2632 
2633 	return;
2634 }
2635 
2636 static void
2637 mxge_intr(void *arg)
2638 {
2639 	struct mxge_slice_state *ss = arg;
2640 	mxge_softc_t *sc = ss->sc;
2641 	mcp_irq_data_t *stats = ss->fw_stats;
2642 	mxge_tx_ring_t *tx = &ss->tx;
2643 	mxge_rx_done_t *rx_done = &ss->rx_done;
2644 	uint32_t send_done_count;
2645 	uint8_t valid;
2646 
2647 
2648 	/* an interrupt on a non-zero slice is implicitly valid
2649 	   since MSI-X irqs are not shared */
2650 	if (ss != sc->ss) {
2651 		mxge_clean_rx_done(ss);
2652 		*ss->irq_claim = be32toh(3);
2653 		return;
2654 	}
2655 
2656 	/* make sure the DMA has finished */
2657 	if (!stats->valid) {
2658 		return;
2659 	}
2660 	valid = stats->valid;
2661 
2662 	if (!sc->msi_enabled) {
2663 		/* lower legacy IRQ  */
2664 		*sc->irq_deassert = 0;
2665 		if (!mxge_deassert_wait)
2666 			/* don't wait for conf. that irq is low */
2667 			stats->valid = 0;
2668 	} else {
2669 		stats->valid = 0;
2670 	}
2671 
2672 	/* loop while waiting for legacy irq deassertion */
2673 	do {
2674 		/* check for transmit completes and receives */
2675 		send_done_count = be32toh(stats->send_done_count);
2676 		while ((send_done_count != tx->pkt_done) ||
2677 		       (rx_done->entry[rx_done->idx].length != 0)) {
2678 			mxge_tx_done(ss, (int)send_done_count);
2679 			mxge_clean_rx_done(ss);
2680 			send_done_count = be32toh(stats->send_done_count);
2681 		}
2682 	} while (*((volatile uint8_t *) &stats->valid));
2683 
2684 	if (__predict_false(stats->stats_updated)) {
2685 		if (sc->link_state != stats->link_up) {
2686 			sc->link_state = stats->link_up;
2687 			if (sc->link_state) {
2688 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2689 				if (mxge_verbose)
2690 					device_printf(sc->dev, "link up\n");
2691 			} else {
2692 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2693 				if (mxge_verbose)
2694 					device_printf(sc->dev, "link down\n");
2695 			}
2696 			sc->need_media_probe = 1;
2697 		}
2698 		if (sc->rdma_tags_available !=
2699 		    be32toh(stats->rdma_tags_available)) {
2700 			sc->rdma_tags_available =
2701 				be32toh(stats->rdma_tags_available);
2702 			device_printf(sc->dev, "RDMA timed out! %d tags "
2703 				      "left\n", sc->rdma_tags_available);
2704 		}
2705 
2706 		if (stats->link_down) {
2707 			sc->down_cnt += stats->link_down;
2708 			sc->link_state = 0;
2709 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2710 		}
2711 	}
2712 
2713 	/* check to see if we have an rx token to pass back */
2714 	if (valid & 0x1)
2715 	    *ss->irq_claim = be32toh(3);
2716 	*(ss->irq_claim + 1) = be32toh(3);
2717 }
2718 
2719 static void
2720 mxge_init(void *arg)
2721 {
2722 }
2723 
2724 
2725 
2726 static void
2727 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2728 {
2729 	struct lro_entry *lro_entry;
2730 	int i;
2731 
2732 	while (!SLIST_EMPTY(&ss->lro_free)) {
2733 		lro_entry = SLIST_FIRST(&ss->lro_free);
2734 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2735 		free(lro_entry, M_DEVBUF);
2736 	}
2737 
2738 	for (i = 0; i <= ss->rx_big.mask; i++) {
2739 		if (ss->rx_big.info[i].m == NULL)
2740 			continue;
2741 		bus_dmamap_unload(ss->rx_big.dmat,
2742 				  ss->rx_big.info[i].map);
2743 		m_freem(ss->rx_big.info[i].m);
2744 		ss->rx_big.info[i].m = NULL;
2745 	}
2746 
2747 	for (i = 0; i <= ss->rx_small.mask; i++) {
2748 		if (ss->rx_small.info[i].m == NULL)
2749 			continue;
2750 		bus_dmamap_unload(ss->rx_small.dmat,
2751 				  ss->rx_small.info[i].map);
2752 		m_freem(ss->rx_small.info[i].m);
2753 		ss->rx_small.info[i].m = NULL;
2754 	}
2755 
2756 	/* transmit ring used only on the first slice */
2757 	if (ss->tx.info == NULL)
2758 		return;
2759 
2760 	for (i = 0; i <= ss->tx.mask; i++) {
2761 		ss->tx.info[i].flag = 0;
2762 		if (ss->tx.info[i].m == NULL)
2763 			continue;
2764 		bus_dmamap_unload(ss->tx.dmat,
2765 				  ss->tx.info[i].map);
2766 		m_freem(ss->tx.info[i].m);
2767 		ss->tx.info[i].m = NULL;
2768 	}
2769 }
2770 
2771 static void
2772 mxge_free_mbufs(mxge_softc_t *sc)
2773 {
2774 	int slice;
2775 
2776 	for (slice = 0; slice < sc->num_slices; slice++)
2777 		mxge_free_slice_mbufs(&sc->ss[slice]);
2778 }
2779 
2780 static void
2781 mxge_free_slice_rings(struct mxge_slice_state *ss)
2782 {
2783 	int i;
2784 
2785 
2786 	if (ss->rx_done.entry != NULL)
2787 		mxge_dma_free(&ss->rx_done.dma);
2788 	ss->rx_done.entry = NULL;
2789 
2790 	if (ss->tx.req_bytes != NULL)
2791 		free(ss->tx.req_bytes, M_DEVBUF);
2792 	ss->tx.req_bytes = NULL;
2793 
2794 	if (ss->tx.seg_list != NULL)
2795 		free(ss->tx.seg_list, M_DEVBUF);
2796 	ss->tx.seg_list = NULL;
2797 
2798 	if (ss->rx_small.shadow != NULL)
2799 		free(ss->rx_small.shadow, M_DEVBUF);
2800 	ss->rx_small.shadow = NULL;
2801 
2802 	if (ss->rx_big.shadow != NULL)
2803 		free(ss->rx_big.shadow, M_DEVBUF);
2804 	ss->rx_big.shadow = NULL;
2805 
2806 	if (ss->tx.info != NULL) {
2807 		if (ss->tx.dmat != NULL) {
2808 			for (i = 0; i <= ss->tx.mask; i++) {
2809 				bus_dmamap_destroy(ss->tx.dmat,
2810 						   ss->tx.info[i].map);
2811 			}
2812 			bus_dma_tag_destroy(ss->tx.dmat);
2813 		}
2814 		free(ss->tx.info, M_DEVBUF);
2815 	}
2816 	ss->tx.info = NULL;
2817 
2818 	if (ss->rx_small.info != NULL) {
2819 		if (ss->rx_small.dmat != NULL) {
2820 			for (i = 0; i <= ss->rx_small.mask; i++) {
2821 				bus_dmamap_destroy(ss->rx_small.dmat,
2822 						   ss->rx_small.info[i].map);
2823 			}
2824 			bus_dmamap_destroy(ss->rx_small.dmat,
2825 					   ss->rx_small.extra_map);
2826 			bus_dma_tag_destroy(ss->rx_small.dmat);
2827 		}
2828 		free(ss->rx_small.info, M_DEVBUF);
2829 	}
2830 	ss->rx_small.info = NULL;
2831 
2832 	if (ss->rx_big.info != NULL) {
2833 		if (ss->rx_big.dmat != NULL) {
2834 			for (i = 0; i <= ss->rx_big.mask; i++) {
2835 				bus_dmamap_destroy(ss->rx_big.dmat,
2836 						   ss->rx_big.info[i].map);
2837 			}
2838 			bus_dmamap_destroy(ss->rx_big.dmat,
2839 					   ss->rx_big.extra_map);
2840 			bus_dma_tag_destroy(ss->rx_big.dmat);
2841 		}
2842 		free(ss->rx_big.info, M_DEVBUF);
2843 	}
2844 	ss->rx_big.info = NULL;
2845 }
2846 
2847 static void
2848 mxge_free_rings(mxge_softc_t *sc)
2849 {
2850 	int slice;
2851 
2852 	for (slice = 0; slice < sc->num_slices; slice++)
2853 		mxge_free_slice_rings(&sc->ss[slice]);
2854 }
2855 
2856 static int
2857 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2858 		       int tx_ring_entries)
2859 {
2860 	mxge_softc_t *sc = ss->sc;
2861 	size_t bytes;
2862 	int err, i;
2863 
2864 	err = ENOMEM;
2865 
2866 	/* allocate per-slice receive resources */
2867 
2868 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2869 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2870 
2871 	/* allocate the rx shadow rings */
2872 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2873 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2874 	if (ss->rx_small.shadow == NULL)
2875 		return err;
2876 
2877 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2878 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2879 	if (ss->rx_big.shadow == NULL)
2880 		return err;
2881 
2882 	/* allocate the rx host info rings */
2883 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2884 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2885 	if (ss->rx_small.info == NULL)
2886 		return err;
2887 
2888 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2889 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2890 	if (ss->rx_big.info == NULL)
2891 		return err;
2892 
2893 	/* allocate the rx busdma resources */
2894 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2895 				 1,			/* alignment */
2896 				 4096,			/* boundary */
2897 				 BUS_SPACE_MAXADDR,	/* low */
2898 				 BUS_SPACE_MAXADDR,	/* high */
2899 				 NULL, NULL,		/* filter */
2900 				 MHLEN,			/* maxsize */
2901 				 1,			/* num segs */
2902 				 MHLEN,			/* maxsegsize */
2903 				 BUS_DMA_ALLOCNOW,	/* flags */
2904 				 NULL, NULL,		/* lock */
2905 				 &ss->rx_small.dmat);	/* tag */
2906 	if (err != 0) {
2907 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2908 			      err);
2909 		return err;
2910 	}
2911 
2912 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2913 				 1,			/* alignment */
2914 				 4096,			/* boundary */
2915 				 BUS_SPACE_MAXADDR,	/* low */
2916 				 BUS_SPACE_MAXADDR,	/* high */
2917 				 NULL, NULL,		/* filter */
2918 				 3*4096,		/* maxsize */
2919 				 3,			/* num segs */
2920 				 4096,			/* maxsegsize */
2921 				 BUS_DMA_ALLOCNOW,	/* flags */
2922 				 NULL, NULL,		/* lock */
2923 				 &ss->rx_big.dmat);	/* tag */
2924 	if (err != 0) {
2925 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2926 			      err);
2927 		return err;
2928 	}
2929 	for (i = 0; i <= ss->rx_small.mask; i++) {
2930 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
2931 					&ss->rx_small.info[i].map);
2932 		if (err != 0) {
2933 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2934 				      err);
2935 			return err;
2936 		}
2937 	}
2938 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
2939 				&ss->rx_small.extra_map);
2940 	if (err != 0) {
2941 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2942 			      err);
2943 		return err;
2944 	}
2945 
2946 	for (i = 0; i <= ss->rx_big.mask; i++) {
2947 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
2948 					&ss->rx_big.info[i].map);
2949 		if (err != 0) {
2950 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2951 				      err);
2952 			return err;
2953 		}
2954 	}
2955 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
2956 				&ss->rx_big.extra_map);
2957 	if (err != 0) {
2958 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2959 			      err);
2960 		return err;
2961 	}
2962 
2963 	/* now allocate TX resources */
2964 
2965 	/* only use a single TX ring for now */
2966 	if (ss != ss->sc->ss)
2967 		return 0;
2968 
2969 	ss->tx.mask = tx_ring_entries - 1;
2970 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2971 
2972 
2973 	/* allocate the tx request copy block */
2974 	bytes = 8 +
2975 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
2976 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2977 	if (ss->tx.req_bytes == NULL)
2978 		return err;
2979 	/* ensure req_list entries are aligned to 8 bytes */
2980 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
2981 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
2982 
2983 	/* allocate the tx busdma segment list */
2984 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
2985 	ss->tx.seg_list = (bus_dma_segment_t *)
2986 		malloc(bytes, M_DEVBUF, M_WAITOK);
2987 	if (ss->tx.seg_list == NULL)
2988 		return err;
2989 
2990 	/* allocate the tx host info ring */
2991 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2992 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2993 	if (ss->tx.info == NULL)
2994 		return err;
2995 
2996 	/* allocate the tx busdma resources */
2997 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2998 				 1,			/* alignment */
2999 				 sc->tx_boundary,	/* boundary */
3000 				 BUS_SPACE_MAXADDR,	/* low */
3001 				 BUS_SPACE_MAXADDR,	/* high */
3002 				 NULL, NULL,		/* filter */
3003 				 65536 + 256,		/* maxsize */
3004 				 ss->tx.max_desc - 2,	/* num segs */
3005 				 sc->tx_boundary,	/* maxsegsz */
3006 				 BUS_DMA_ALLOCNOW,	/* flags */
3007 				 NULL, NULL,		/* lock */
3008 				 &ss->tx.dmat);		/* tag */
3009 
3010 	if (err != 0) {
3011 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3012 			      err);
3013 		return err;
3014 	}
3015 
3016 	/* now use these tags to setup dmamaps for each slot
3017 	   in the ring */
3018 	for (i = 0; i <= ss->tx.mask; i++) {
3019 		err = bus_dmamap_create(ss->tx.dmat, 0,
3020 					&ss->tx.info[i].map);
3021 		if (err != 0) {
3022 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3023 				      err);
3024 			return err;
3025 		}
3026 	}
3027 	return 0;
3028 
3029 }
3030 
3031 static int
3032 mxge_alloc_rings(mxge_softc_t *sc)
3033 {
3034 	mxge_cmd_t cmd;
3035 	int tx_ring_size;
3036 	int tx_ring_entries, rx_ring_entries;
3037 	int err, slice;
3038 
3039 	/* get ring sizes */
3040 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3041 	tx_ring_size = cmd.data0;
3042 	if (err != 0) {
3043 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3044 		goto abort;
3045 	}
3046 
3047 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3048 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3049 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3050 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3051 	IFQ_SET_READY(&sc->ifp->if_snd);
3052 
3053 	for (slice = 0; slice < sc->num_slices; slice++) {
3054 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3055 					     rx_ring_entries,
3056 					     tx_ring_entries);
3057 		if (err != 0)
3058 			goto abort;
3059 	}
3060 	return 0;
3061 
3062 abort:
3063 	mxge_free_rings(sc);
3064 	return err;
3065 
3066 }
3067 
3068 
3069 static void
3070 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3071 {
3072 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3073 
3074 	if (bufsize < MCLBYTES) {
3075 		/* easy, everything fits in a single buffer */
3076 		*big_buf_size = MCLBYTES;
3077 		*cl_size = MCLBYTES;
3078 		*nbufs = 1;
3079 		return;
3080 	}
3081 
3082 	if (bufsize < MJUMPAGESIZE) {
3083 		/* still easy, everything still fits in a single buffer */
3084 		*big_buf_size = MJUMPAGESIZE;
3085 		*cl_size = MJUMPAGESIZE;
3086 		*nbufs = 1;
3087 		return;
3088 	}
3089 	/* now we need to use virtually contiguous buffers */
3090 	*cl_size = MJUM9BYTES;
3091 	*big_buf_size = 4096;
3092 	*nbufs = mtu / 4096 + 1;
3093 	/* needs to be a power of two, so round up */
3094 	if (*nbufs == 3)
3095 		*nbufs = 4;
3096 }
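/*
 * Worked example (assuming 4KB pages, so MJUMPAGESIZE is 4096, and
 * MXGEFW_PAD of 2): for the standard 1500-byte MTU, bufsize is
 * 1500 + 14 + 4 + 2 = 1520, which fits a 2KB cluster, so cl_size =
 * MCLBYTES and nbufs = 1.  For a 9000-byte jumbo MTU, bufsize is
 * 9020, so the buffer is built from virtually contiguous 4096-byte
 * pieces of a 9KB cluster: cl_size = MJUM9BYTES, big_buf_size =
 * 4096, and nbufs = 9000/4096 + 1 = 3, rounded up to 4.
 */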
3097 
3098 static int
3099 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3100 {
3101 	mxge_softc_t *sc;
3102 	mxge_cmd_t cmd;
3103 	bus_dmamap_t map;
3104 	struct lro_entry *lro_entry;
3105 	int err, i, slice;
3106 
3107 
3108 	sc = ss->sc;
3109 	slice = ss - sc->ss;
3110 
3111 	SLIST_INIT(&ss->lro_free);
3112 	SLIST_INIT(&ss->lro_active);
3113 
3114 	for (i = 0; i < sc->lro_cnt; i++) {
3115 		lro_entry = (struct lro_entry *)
3116 			malloc(sizeof (*lro_entry), M_DEVBUF,
3117 			       M_NOWAIT | M_ZERO);
3118 		if (lro_entry == NULL) {
3119 			sc->lro_cnt = i;
3120 			break;
3121 		}
3122 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3123 	}
3124 	/* get the lanai pointers to the send and receive rings */
3125 
3126 	err = 0;
3127 	/* We currently only send from the first slice */
3128 	if (slice == 0) {
3129 		cmd.data0 = slice;
3130 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3131 		ss->tx.lanai =
3132 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3133 	}
3134 	cmd.data0 = slice;
3135 	err |= mxge_send_cmd(sc,
3136 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3137 	ss->rx_small.lanai =
3138 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3139 	cmd.data0 = slice;
3140 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3141 	ss->rx_big.lanai =
3142 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3143 
3144 	if (err != 0) {
3145 		device_printf(sc->dev,
3146 			      "failed to get ring sizes or locations\n");
3147 		return EIO;
3148 	}
3149 
3150 	/* stock receive rings */
3151 	for (i = 0; i <= ss->rx_small.mask; i++) {
3152 		map = ss->rx_small.info[i].map;
3153 		err = mxge_get_buf_small(ss, map, i);
3154 		if (err) {
3155 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3156 				      i, ss->rx_small.mask + 1);
3157 			return ENOMEM;
3158 		}
3159 	}
3160 	for (i = 0; i <= ss->rx_big.mask; i++) {
3161 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3162 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3163 	}
3164 	ss->rx_big.nbufs = nbufs;
3165 	ss->rx_big.cl_size = cl_size;
3166 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3167 		map = ss->rx_big.info[i].map;
3168 		err = mxge_get_buf_big(ss, map, i);
3169 		if (err) {
3170 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3171 				      i, ss->rx_big.mask + 1);
3172 			return ENOMEM;
3173 		}
3174 	}
3175 	return 0;
3176 }
3177 
3178 static int
3179 mxge_open(mxge_softc_t *sc)
3180 {
3181 	mxge_cmd_t cmd;
3182 	int err, big_bytes, nbufs, slice, cl_size, i;
3183 	bus_addr_t bus;
3184 	volatile uint8_t *itable;
3185 
3186 	/* Copy the MAC address in case it was overridden */
3187 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3188 
3189 	err = mxge_reset(sc, 1);
3190 	if (err != 0) {
3191 		device_printf(sc->dev, "failed to reset\n");
3192 		return EIO;
3193 	}
3194 
3195 	if (sc->num_slices > 1) {
3196 		/* setup the indirection table */
3197 		cmd.data0 = sc->num_slices;
3198 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3199 				    &cmd);
3200 
3201 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3202 				     &cmd);
3203 		if (err != 0) {
3204 			device_printf(sc->dev,
3205 				      "failed to setup rss tables\n");
3206 			return err;
3207 		}
3208 
3209 		/* just enable an identity mapping */
3210 		itable = sc->sram + cmd.data0;
3211 		for (i = 0; i < sc->num_slices; i++)
3212 			itable[i] = (uint8_t)i;
3213 
3214 		cmd.data0 = 1;
3215 		cmd.data1 = mxge_rss_hash_type;
3216 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3217 		if (err != 0) {
3218 			device_printf(sc->dev, "failed to enable slices\n");
3219 			return err;
3220 		}
3221 	}
3222 
3223 
3224 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3225 
3226 	cmd.data0 = nbufs;
3227 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3228 			    &cmd);
3229 	/* error is only meaningful if we're trying to set
3230 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3231 	if (err && nbufs > 1) {
3232 		device_printf(sc->dev,
3233 			      "Failed to set alway-use-n to %d\n",
3234 			      nbufs);
3235 		return EIO;
3236 	}
3237 	/* Give the firmware the mtu and the big and small buffer
3238 	   sizes.  The firmware wants the big buf size to be a power
3239 	   of two. Luckily, FreeBSD's clusters are powers of two */
3240 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3241 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3242 	cmd.data0 = MHLEN - MXGEFW_PAD;
3243 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3244 			     &cmd);
3245 	cmd.data0 = big_bytes;
3246 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3247 
3248 	if (err != 0) {
3249 		device_printf(sc->dev, "failed to setup params\n");
3250 		goto abort;
3251 	}
3252 
3253 	/* Now give him the pointer to the stats block */
3254 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3255 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3256 	cmd.data2 = sizeof(struct mcp_irq_data);
3257 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3258 
3259 	if (err != 0) {
3260 		bus = sc->ss->fw_stats_dma.bus_addr;
3261 		bus += offsetof(struct mcp_irq_data, send_done_count);
3262 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3263 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3264 		err = mxge_send_cmd(sc,
3265 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3266 				    &cmd);
3267 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3268 		sc->fw_multicast_support = 0;
3269 	} else {
3270 		sc->fw_multicast_support = 1;
3271 	}
3272 
3273 	if (err != 0) {
3274 		device_printf(sc->dev, "failed to setup params\n");
3275 		goto abort;
3276 	}
3277 
3278 	for (slice = 0; slice < sc->num_slices; slice++) {
3279 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3280 		if (err != 0) {
3281 			device_printf(sc->dev, "couldn't open slice %d\n",
3282 				      slice);
3283 			goto abort;
3284 		}
3285 	}
3286 
3287 	/* Finally, start the firmware running */
3288 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3289 	if (err) {
3290 		device_printf(sc->dev, "Couldn't bring up link\n");
3291 		goto abort;
3292 	}
3293 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3294 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3295 
3296 	return 0;
3297 
3298 
3299 abort:
3300 	mxge_free_mbufs(sc);
3301 
3302 	return err;
3303 }
3304 
3305 static int
3306 mxge_close(mxge_softc_t *sc)
3307 {
3308 	mxge_cmd_t cmd;
3309 	int err, old_down_cnt;
3310 
3311 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3312 	old_down_cnt = sc->down_cnt;
3313 	mb();
3314 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3315 	if (err) {
3316 		device_printf(sc->dev, "Couldn't bring down link\n");
3317 	}
3318 	if (old_down_cnt == sc->down_cnt) {
3319 		/* wait for down irq */
3320 		DELAY(10 * sc->intr_coal_delay);
3321 	}
3322 	mb();
3323 	if (old_down_cnt == sc->down_cnt) {
3324 		device_printf(sc->dev, "never got down irq\n");
3325 	}
3326 
3327 	mxge_free_mbufs(sc);
3328 
3329 	return 0;
3330 }
3331 
3332 static void
3333 mxge_setup_cfg_space(mxge_softc_t *sc)
3334 {
3335 	device_t dev = sc->dev;
3336 	int reg;
3337 	uint16_t cmd, lnk, pectl;
3338 
3339 	/* find the PCIe link width and set max read request to 4KB */
3340 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3341 		lnk = pci_read_config(dev, reg + 0x12, 2);
3342 		sc->link_width = (lnk >> 4) & 0x3f;
3343 
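		/*
		 * Device Control register: the max read request size
		 * field occupies bits 14:12 (hence the 0x7000 mask);
		 * the encoding 5 (101b) selects 4096-byte reads.
		 */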
3344 		pectl = pci_read_config(dev, reg + 0x8, 2);
3345 		pectl = (pectl & ~0x7000) | (5 << 12);
3346 		pci_write_config(dev, reg + 0x8, pectl, 2);
3347 	}
3348 
3349 	/* Enable DMA and Memory space access */
3350 	pci_enable_busmaster(dev);
3351 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3352 	cmd |= PCIM_CMD_MEMEN;
3353 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3354 }
3355 
3356 static uint32_t
3357 mxge_read_reboot(mxge_softc_t *sc)
3358 {
3359 	device_t dev = sc->dev;
3360 	uint32_t vs;
3361 
3362 	/* find the vendor specific offset */
3363 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3364 		device_printf(sc->dev,
3365 			      "could not find vendor specific offset\n");
3366 		return (uint32_t)-1;
3367 	}
3368 	/* enable read32 mode */
3369 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3370 	/* tell NIC which register to read */
3371 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3372 	return (pci_read_config(dev, vs + 0x14, 4));
3373 }
3374 
3375 static void
3376 mxge_watchdog_reset(mxge_softc_t *sc)
3377 {
3378 	int err;
3379 	uint32_t reboot;
3380 	uint16_t cmd;
3381 
3382 	err = ENXIO;
3383 
3384 	device_printf(sc->dev, "Watchdog reset!\n");
3385 
3386 	/*
3387 	 * check to see if the NIC rebooted.  If it did, then all of
3388 	 * PCI config space has been reset, and things like the
3389 	 * busmaster bit will be zero.  If this is the case, then we
3390 	 * must restore PCI config space before the NIC can be used
3391 	 * again
3392 	 */
3393 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3394 	if (cmd == 0xffff) {
3395 		/*
3396 		 * maybe the watchdog caught the NIC rebooting; wait
3397 		 * up to 100ms for it to finish.  If it does not come
3398 		 * back, then give up
3399 		 */
3400 		DELAY(1000*100);
3401 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3402 		if (cmd == 0xffff) {
3403 			device_printf(sc->dev, "NIC disappeared!\n");
3404 			goto abort;
3405 		}
3406 	}
3407 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3408 		/* print the reboot status */
3409 		reboot = mxge_read_reboot(sc);
3410 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3411 			      reboot);
3412 		/* restore PCI configuration space */
3413 
3414 		/* XXXX waiting for pci_cfg_restore() to be exported */
3415 		goto abort; /* just abort for now */
3416 
3417 		/* and redo any changes we made to our config space */
3418 		mxge_setup_cfg_space(sc);
3419 	} else {
3420 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
3421 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
3422 			      sc->ss->tx.req, sc->ss->tx.done);
3423 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3424 			      sc->ss->tx.pkt_done,
3425 			      be32toh(sc->ss->fw_stats->send_done_count));
3426 	}
3427 
3428 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3429 		mxge_close(sc);
3430 		err = mxge_open(sc);
3431 	}
3432 
3433 abort:
3434 	/*
3435 	 * stop the watchdog if the nic is dead, to avoid spamming the
3436 	 * console
3437 	 */
3438 	if (err != 0) {
3439 		callout_stop(&sc->co_hdl);
3440 	}
3441 }
3442 
3443 static void
3444 mxge_watchdog(mxge_softc_t *sc)
3445 {
3446 	mxge_tx_ring_t *tx = &sc->ss->tx;
3447 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3448 
3449 	/* see if we have outstanding transmits, which
3450 	   have been pending for more than mxge_ticks */
3451 	if (tx->req != tx->done &&
3452 	    tx->watchdog_req != tx->watchdog_done &&
3453 	    tx->done == tx->watchdog_done) {
3454 		/* check for pause blocking before resetting */
3455 		if (tx->watchdog_rx_pause == rx_pause)
3456 			mxge_watchdog_reset(sc);
3457 		else
3458 			device_printf(sc->dev, "Flow control blocking "
3459 				      "xmits, check link partner\n");
3460 	}
3461 
3462 	tx->watchdog_req = tx->req;
3463 	tx->watchdog_done = tx->done;
3464 	tx->watchdog_rx_pause = rx_pause;
3465 
3466 	if (sc->need_media_probe)
3467 		mxge_media_probe(sc);
3468 }
3469 
3470 static void
3471 mxge_update_stats(mxge_softc_t *sc)
3472 {
3473 	struct mxge_slice_state *ss;
3474 	u_long ipackets = 0;
3475 	int slice;
3476 
3477 	for (slice = 0; slice < sc->num_slices; slice++) {
3478 		ss = &sc->ss[slice];
3479 		ipackets += ss->ipackets;
3480 	}
3481 	sc->ifp->if_ipackets = ipackets;
3482 
3483 }
3484 static void
3485 mxge_tick(void *arg)
3486 {
3487 	mxge_softc_t *sc = arg;
3488 
3489 
3490 	/* Synchronize with possible callout reset/stop. */
3491 	if (callout_pending(&sc->co_hdl) ||
3492 	    !callout_active(&sc->co_hdl)) {
3493 		mtx_unlock(&sc->driver_mtx);
3494 		return;
3495 	}
3496 
3497 	/* aggregate stats from different slices */
3498 	mxge_update_stats(sc);
3499 
3500 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3501 	if (!sc->watchdog_countdown) {
3502 		mxge_watchdog(sc);
3503 		sc->watchdog_countdown = 4;
3504 	}
3505 	sc->watchdog_countdown--;
3506 }
3507 
3508 static int
3509 mxge_media_change(struct ifnet *ifp)
3510 {
3511 	return EINVAL;
3512 }
3513 
3514 static int
3515 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3516 {
3517 	struct ifnet *ifp = sc->ifp;
3518 	int real_mtu, old_mtu;
3519 	int err = 0;
3520 
3521 
3522 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3523 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3524 		return EINVAL;
3525 	mtx_lock(&sc->driver_mtx);
3526 	old_mtu = ifp->if_mtu;
3527 	ifp->if_mtu = mtu;
3528 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3529 		callout_stop(&sc->co_hdl);
3530 		mxge_close(sc);
3531 		err = mxge_open(sc);
3532 		if (err != 0) {
3533 			ifp->if_mtu = old_mtu;
3534 			mxge_close(sc);
3535 			(void) mxge_open(sc);
3536 		}
3537 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3538 	}
3539 	mtx_unlock(&sc->driver_mtx);
3540 	return err;
3541 }
3542 
3543 static void
3544 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3545 {
3546 	mxge_softc_t *sc = ifp->if_softc;
3547 
3548 
3549 	if (sc == NULL)
3550 		return;
3551 	ifmr->ifm_status = IFM_AVALID;
3552 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3553 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3554 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3555 }
3556 
3557 static int
3558 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3559 {
3560 	mxge_softc_t *sc = ifp->if_softc;
3561 	struct ifreq *ifr = (struct ifreq *)data;
3562 	int err, mask;
3563 
3564 	err = 0;
3565 	switch (command) {
3566 	case SIOCSIFADDR:
3567 	case SIOCGIFADDR:
3568 		err = ether_ioctl(ifp, command, data);
3569 		break;
3570 
3571 	case SIOCSIFMTU:
3572 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3573 		break;
3574 
3575 	case SIOCSIFFLAGS:
3576 		mtx_lock(&sc->driver_mtx);
3577 		if (ifp->if_flags & IFF_UP) {
3578 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3579 				err = mxge_open(sc);
3580 				callout_reset(&sc->co_hdl, mxge_ticks,
3581 					      mxge_tick, sc);
3582 			} else {
3583 				/* take care of promisc and allmulti
3584 				   flag changes */
3585 				mxge_change_promisc(sc,
3586 						    ifp->if_flags & IFF_PROMISC);
3587 				mxge_set_multicast_list(sc);
3588 			}
3589 		} else {
3590 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3591 				callout_stop(&sc->co_hdl);
3592 				mxge_close(sc);
3593 			}
3594 		}
3595 		mtx_unlock(&sc->driver_mtx);
3596 		break;
3597 
3598 	case SIOCADDMULTI:
3599 	case SIOCDELMULTI:
3600 		mtx_lock(&sc->driver_mtx);
3601 		mxge_set_multicast_list(sc);
3602 		mtx_unlock(&sc->driver_mtx);
3603 		break;
3604 
3605 	case SIOCSIFCAP:
3606 		mtx_lock(&sc->driver_mtx);
3607 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3608 		if (mask & IFCAP_TXCSUM) {
3609 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3610 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3611 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3612 						      | CSUM_TSO);
3613 			} else {
3614 				ifp->if_capenable |= IFCAP_TXCSUM;
3615 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3616 			}
3617 		} else if (mask & IFCAP_RXCSUM) {
3618 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3619 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3620 				sc->csum_flag = 0;
3621 			} else {
3622 				ifp->if_capenable |= IFCAP_RXCSUM;
3623 				sc->csum_flag = 1;
3624 			}
3625 		}
3626 		if (mask & IFCAP_TSO4) {
3627 			if (IFCAP_TSO4 & ifp->if_capenable) {
3628 				ifp->if_capenable &= ~IFCAP_TSO4;
3629 				ifp->if_hwassist &= ~CSUM_TSO;
3630 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3631 				ifp->if_capenable |= IFCAP_TSO4;
3632 				ifp->if_hwassist |= CSUM_TSO;
3633 			} else {
3634 				printf("mxge requires tx checksum offload"
3635 				       " be enabled to use TSO\n");
3636 				err = EINVAL;
3637 			}
3638 		}
3639 		if (mask & IFCAP_LRO) {
3640 			if (IFCAP_LRO & ifp->if_capenable)
3641 				err = mxge_change_lro_locked(sc, 0);
3642 			else
3643 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3644 		}
3645 		if (mask & IFCAP_VLAN_HWTAGGING)
3646 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3647 		mtx_unlock(&sc->driver_mtx);
3648 		VLAN_CAPABILITIES(ifp);
3649 
3650 		break;
3651 
3652 	case SIOCGIFMEDIA:
3653 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3654 				    &sc->media, command);
3655                 break;
3656 
3657 	default:
3658 		err = ENOTTY;
3659         }
3660 	return err;
3661 }
3662 
3663 static void
3664 mxge_fetch_tunables(mxge_softc_t *sc)
3665 {
3666 
3667 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3668 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3669 			  &mxge_flow_control);
3670 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3671 			  &mxge_intr_coal_delay);
3672 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3673 			  &mxge_nvidia_ecrc_enable);
3674 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3675 			  &mxge_force_firmware);
3676 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3677 			  &mxge_deassert_wait);
3678 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3679 			  &mxge_verbose);
3680 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3681 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3682 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3683 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
3684 	if (sc->lro_cnt != 0)
3685 		mxge_lro_cnt = sc->lro_cnt;
3686 
3687 	if (bootverbose)
3688 		mxge_verbose = 1;
3689 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3690 		mxge_intr_coal_delay = 30;
3691 	if (mxge_ticks == 0)
3692 		mxge_ticks = hz / 2;
3693 	sc->pause = mxge_flow_control;
3694 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
3695 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) {
3696 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
3697 	}
3698 }
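/*
 * The tunables fetched above come from the loader environment, e.g.
 * via /boot/loader.conf (illustrative values, not recommendations):
 *
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 *	hw.mxge.max_slices="1"
 *	hw.mxge.verbose="1"
 */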
3699 
3700 
3701 static void
3702 mxge_free_slices(mxge_softc_t *sc)
3703 {
3704 	struct mxge_slice_state *ss;
3705 	int i;
3706 
3707 
3708 	if (sc->ss == NULL)
3709 		return;
3710 
3711 	for (i = 0; i < sc->num_slices; i++) {
3712 		ss = &sc->ss[i];
3713 		if (ss->fw_stats != NULL) {
3714 			mxge_dma_free(&ss->fw_stats_dma);
3715 			ss->fw_stats = NULL;
3716 			mtx_destroy(&ss->tx.mtx);	/* only initialized on slice 0 */
3717 		}
3718 		if (ss->rx_done.entry != NULL) {
3719 			mxge_dma_free(&ss->rx_done.dma);
3720 			ss->rx_done.entry = NULL;
3721 		}
3722 	}
3723 	free(sc->ss, M_DEVBUF);
3724 	sc->ss = NULL;
3725 }
3726 
3727 static int
3728 mxge_alloc_slices(mxge_softc_t *sc)
3729 {
3730 	mxge_cmd_t cmd;
3731 	struct mxge_slice_state *ss;
3732 	size_t bytes;
3733 	int err, i, max_intr_slots;
3734 
3735 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3736 	if (err != 0) {
3737 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3738 		return err;
3739 	}
3740 	sc->rx_ring_size = cmd.data0;
3741 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
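	/*
	 * The computation above: cmd.data0 is the rx ring size in bytes,
	 * and each ring entry is an mcp_dma_addr_t, so the division gives
	 * the entry count.  The factor of two is headroom for the
	 * interrupt queue (a hedged reading: both the small and big rx
	 * rings complete into the same per-slice queue).
	 */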
3742 
3743 	bytes = sizeof (*sc->ss) * sc->num_slices;
3744 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
3745 	if (sc->ss == NULL)
3746 		return (ENOMEM);
3747 	for (i = 0; i < sc->num_slices; i++) {
3748 		ss = &sc->ss[i];
3749 
3750 		ss->sc = sc;
3751 
3752 		/* allocate per-slice rx interrupt queues */
3753 
3754 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
3755 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3756 		if (err != 0)
3757 			goto abort;
3758 		ss->rx_done.entry = ss->rx_done.dma.addr;
3759 		bzero(ss->rx_done.entry, bytes);
3760 
3761 		/*
3762 		 * allocate the per-slice firmware stats; stats
3763 		 * (including tx) are used only on the first
3764 		 * slice for now
3765 		 */
3766 		if (i > 0)
3767 			continue;
3768 
3769 		bytes = sizeof (*ss->fw_stats);
3770 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3771 				     bytes, 64);
3772 		if (err != 0)
3773 			goto abort;
3774 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
3775 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
3776 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
3777 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
3778 	}
3779 
3780 	return (0);
3781 
3782 abort:
3783 	mxge_free_slices(sc);
3784 	return (ENOMEM);
3785 }
3786 
3787 static void
3788 mxge_slice_probe(mxge_softc_t *sc)
3789 {
3790 	mxge_cmd_t cmd;
3791 	char *old_fw;
3792 	int msix_cnt, status, max_intr_slots;
3793 
3794 	sc->num_slices = 1;
3795 	/*
3796 	 *  don't enable multiple slices if they are disabled by the
3797 	 *  tunable, or if this is not an SMP system
3798 	 */
3799 
3800 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
3801 		return;
3802 
3803 	/* see how many MSI-X interrupts are available */
3804 	msix_cnt = pci_msix_count(sc->dev);
3805 	if (msix_cnt < 2)
3806 		return;
3807 
3808 	/* now load the slice-aware firmware and see what it supports */
3809 	old_fw = sc->fw_name;
3810 	if (old_fw == mxge_fw_aligned)
3811 		sc->fw_name = mxge_fw_rss_aligned;
3812 	else
3813 		sc->fw_name = mxge_fw_rss_unaligned;
3814 	status = mxge_load_firmware(sc, 0);
3815 	if (status != 0) {
3816 		device_printf(sc->dev, "Falling back to a single slice\n");
		/* restore the non-RSS firmware name and image */
		goto abort_with_fw;
3818 	}
3819 
3820 	/* try to send a reset command to the card to see if it
3821 	   is alive */
3822 	memset(&cmd, 0, sizeof (cmd));
3823 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3824 	if (status != 0) {
3825 		device_printf(sc->dev, "failed reset\n");
3826 		goto abort_with_fw;
3827 	}
3828 
3829 	/* get rx ring size */
3830 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3831 	if (status != 0) {
3832 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3833 		goto abort_with_fw;
3834 	}
3835 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3836 
3837 	/* tell it the size of the interrupt queues */
3838 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3839 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3840 	if (status != 0) {
3841 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3842 		goto abort_with_fw;
3843 	}
3844 
3845 	/* ask for the maximum number of slices it supports */
3846 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3847 	if (status != 0) {
3848 		device_printf(sc->dev,
3849 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3850 		goto abort_with_fw;
3851 	}
3852 	sc->num_slices = cmd.data0;
3853 	if (sc->num_slices > msix_cnt)
3854 		sc->num_slices = msix_cnt;
3855 
3856 	if (mxge_max_slices == -1) {
3857 		/* cap to number of CPUs in system */
3858 		if (sc->num_slices > mp_ncpus)
3859 			sc->num_slices = mp_ncpus;
3860 	} else {
3861 		if (sc->num_slices > mxge_max_slices)
3862 			sc->num_slices = mxge_max_slices;
3863 	}
3864 	/* make sure it is a power of two */
3865 	while (sc->num_slices & (sc->num_slices - 1))
3866 		sc->num_slices--;
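	/*
	 * The loop above rounds num_slices down to the largest power of
	 * two <= the negotiated value; (n & (n - 1)) == 0 only once no
	 * lower bits remain set.
	 */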
3867 
3868 	if (mxge_verbose)
3869 		device_printf(sc->dev, "using %d slices\n",
3870 			      sc->num_slices);
3871 
3872 	return;
3873 
3874 abort_with_fw:
3875 	sc->fw_name = old_fw;
3876 	(void) mxge_load_firmware(sc, 0);
3877 }
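/*
 * Summary of the negotiation above: load the RSS-capable firmware, reset
 * the NIC, size the interrupt queues, ask the firmware for its maximum
 * RSS queue count, clamp that to the available MSI-X vectors and to the
 * CPU count (when hw.mxge.max_slices is -1) or the tunable, and round
 * down to a power of two.  Any failure falls back to the original
 * firmware and a single slice.
 */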
3878 
3879 static int
3880 mxge_add_msix_irqs(mxge_softc_t *sc)
3881 {
3882 	size_t bytes;
3883 	int count, err, i, rid;
3884 
3885 	rid = PCIR_BAR(2);
3886 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3887 						    &rid, RF_ACTIVE);
3888 
3889 	if (sc->msix_table_res == NULL) {
3890 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3891 		return ENXIO;
3892 	}
3893 
3894 	count = sc->num_slices;
3895 	err = pci_alloc_msix(sc->dev, &count);
3896 	if (err != 0) {
3897 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
3898 			      "err = %d\n", sc->num_slices, err);
3899 		goto abort_with_msix_table;
3900 	}
3901 	if (count < sc->num_slices) {
3902 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3903 			      sc->num_slices, count);
3904 		device_printf(sc->dev,
3905 			      "Try setting hw.mxge.max_slices to %d\n",
3906 			      count);
3907 		err = ENOSPC;
3908 		goto abort_with_msix;
3909 	}
3910 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3911 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3912 	if (sc->msix_irq_res == NULL) {
3913 		err = ENOMEM;
3914 		goto abort_with_msix;
3915 	}
3916 
3917 	for (i = 0; i < sc->num_slices; i++) {
3918 		rid = i + 1;
3919 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3920 							  SYS_RES_IRQ,
3921 							  &rid, RF_ACTIVE);
3922 		if (sc->msix_irq_res[i] == NULL) {
3923 			device_printf(sc->dev, "couldn't allocate IRQ res"
3924 				      " for message %d\n", i);
3925 			err = ENXIO;
3926 			goto abort_with_res;
3927 		}
3928 	}
3929 
3930 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3931 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
3932 
3933 	for (i = 0; i < sc->num_slices; i++) {
3934 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3935 				     INTR_TYPE_NET | INTR_MPSAFE,
3936 				     NULL, mxge_intr, &sc->ss[i],
3937 				     &sc->msix_ih[i]);
3938 		if (err != 0) {
3939 			device_printf(sc->dev, "couldn't setup intr for "
3940 				      "message %d\n", i);
3941 			goto abort_with_intr;
3942 		}
3943 	}
3944 
3945 	if (mxge_verbose) {
3946 		device_printf(sc->dev, "using %d msix IRQs:",
3947 			      sc->num_slices);
3948 		for (i = 0; i < sc->num_slices; i++)
3949 			printf(" %ld", rman_get_start(sc->msix_irq_res[i]));
3950 		printf("\n");
3951 	}
3952 	return (0);
3953 
3954 abort_with_intr:
3955 	for (i = 0; i < sc->num_slices; i++) {
3956 		if (sc->msix_ih[i] != NULL) {
3957 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3958 					  sc->msix_ih[i]);
3959 			sc->msix_ih[i] = NULL;
3960 		}
3961 	}
3962 	free(sc->msix_ih, M_DEVBUF);
3963 
3964 
3965 abort_with_res:
3966 	for (i = 0; i < sc->num_slices; i++) {
3967 		rid = i + 1;
3968 		if (sc->msix_irq_res[i] != NULL)
3969 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3970 					     sc->msix_irq_res[i]);
3971 		sc->msix_irq_res[i] = NULL;
3972 	}
3973 	free(sc->msix_irq_res, M_DEVBUF);
3974 
3975 
3976 abort_with_msix:
3977 	pci_release_msi(sc->dev);
3978 
3979 abort_with_msix_table:
3980 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
3981 			     sc->msix_table_res);
3982 
3983 	return err;
3984 }
3985 
3986 static int
3987 mxge_add_single_irq(mxge_softc_t *sc)
3988 {
3989 	int count, err, rid;
3990 
3991 	count = pci_msi_count(sc->dev);
3992 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
3993 		rid = 1;
3994 		sc->msi_enabled = 1;
3995 	} else {
3996 		rid = 0;
3997 	}
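	/*
	 * rid 1 names the first MSI message; rid 0 is the legacy INTx
	 * line routed through the PCI interrupt pin.
	 */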
3998 	sc->irq_res = bus_alloc_resource(sc->dev, SYS_RES_IRQ, &rid, 0, ~0,
3999 					 1, RF_SHAREABLE | RF_ACTIVE);
4000 	if (sc->irq_res == NULL) {
4001 		device_printf(sc->dev, "could not alloc interrupt\n");
4002 		return ENXIO;
4003 	}
4004 	if (mxge_verbose)
4005 		device_printf(sc->dev, "using %s irq %ld\n",
4006 			      sc->msi_enabled ? "MSI" : "INTx",
4007 			      rman_get_start(sc->irq_res));
4008 	err = bus_setup_intr(sc->dev, sc->irq_res,
4009 			     INTR_TYPE_NET | INTR_MPSAFE,
4010 			     NULL, mxge_intr, &sc->ss[0], &sc->ih);
4011 	if (err != 0) {
4012 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4013 				     sc->msi_enabled ? 1 : 0, sc->irq_res);
4014 		if (sc->msi_enabled)
4015 			pci_release_msi(sc->dev);
4016 	}
4017 	return err;
4018 }
4019 
4020 static void
4021 mxge_rem_msix_irqs(mxge_softc_t *sc)
4022 {
4023 	int i, rid;
4024 
4025 	for (i = 0; i < sc->num_slices; i++) {
4026 		if (sc->msix_ih[i] != NULL) {
4027 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4028 					  sc->msix_ih[i]);
4029 			sc->msix_ih[i] = NULL;
4030 		}
4031 	}
4032 	free(sc->msix_ih, M_DEVBUF);
4033 
4034 	for (i = 0; i < sc->num_slices; i++) {
4035 		rid = i + 1;
4036 		if (sc->msix_irq_res[i] != NULL)
4037 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4038 					     sc->msix_irq_res[i]);
4039 		sc->msix_irq_res[i] = NULL;
4040 	}
4041 	free(sc->msix_irq_res, M_DEVBUF);
4042 
4043 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4044 			     sc->msix_table_res);
4045 
4046 	pci_release_msi(sc->dev);
4047 	return;
4048 }
4049 
4050 static void
4051 mxge_rem_single_irq(mxge_softc_t *sc)
4052 {
4053 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4054 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4055 			     sc->msi_enabled ? 1 : 0, sc->irq_res);
4056 	if (sc->msi_enabled)
4057 		pci_release_msi(sc->dev);
4058 }
4059 
4060 static void
4061 mxge_rem_irq(mxge_softc_t *sc)
4062 {
4063 	if (sc->num_slices > 1)
4064 		mxge_rem_msix_irqs(sc);
4065 	else
4066 		mxge_rem_single_irq(sc);
4067 }
4068 
4069 static int
4070 mxge_add_irq(mxge_softc_t *sc)
4071 {
4072 	int err;
4073 
4074 	if (sc->num_slices > 1)
4075 		err = mxge_add_msix_irqs(sc);
4076 	else
4077 		err = mxge_add_single_irq(sc);
4078 
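	/*
	 * XXX: the "0 &&" below disables this block; it appears to be a
	 * leftover debugging hook that would tear down and re-add the
	 * MSI-X interrupts after a successful setup.
	 */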
4079 	if (0 && err == 0 && sc->num_slices > 1) {
4080 		mxge_rem_msix_irqs(sc);
4081 		err = mxge_add_msix_irqs(sc);
4082 	}
4083 	return err;
4084 }
4085 
4086 
4087 static int
4088 mxge_attach(device_t dev)
4089 {
4090 	mxge_softc_t *sc = device_get_softc(dev);
4091 	struct ifnet *ifp;
4092 	int err, rid;
4093 
4094 	sc->dev = dev;
4095 	mxge_fetch_tunables(sc);
4096 
4097 	err = bus_dma_tag_create(NULL,			/* parent */
4098 				 1,			/* alignment */
4099 				 0,			/* boundary */
4100 				 BUS_SPACE_MAXADDR,	/* low */
4101 				 BUS_SPACE_MAXADDR,	/* high */
4102 				 NULL, NULL,		/* filter */
4103 				 65536 + 256,		/* maxsize */
4104 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4105 				 65536,			/* maxsegsize */
4106 				 0,			/* flags */
4107 				 NULL, NULL,		/* lock */
4108 				 &sc->parent_dmat);	/* tag */
4109 
4110 	if (err != 0) {
4111 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4112 			      err);
4113 		goto abort_with_nothing;
4114 	}
4115 
4116 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4117 	if (ifp == NULL) {
4118 		device_printf(dev, "cannot if_alloc()\n");
4119 		err = ENOSPC;
4120 		goto abort_with_parent_dmat;
4121 	}
4122 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4123 
4124 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4125 		 device_get_nameunit(dev));
4126 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4127 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4128 		 "%s:drv", device_get_nameunit(dev));
4129 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4130 		 MTX_NETWORK_LOCK, MTX_DEF);
4131 
4132 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4133 
4134 	mxge_setup_cfg_space(sc);
4135 
4136 	/* Map the board into the kernel */
4137 	rid = PCIR_BARS;
4138 	sc->mem_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0,
4139 					 ~0, 1, RF_ACTIVE);
4140 	if (sc->mem_res == NULL) {
4141 		device_printf(dev, "could not map memory\n");
4142 		err = ENXIO;
4143 		goto abort_with_lock;
4144 	}
4145 	sc->sram = rman_get_virtual(sc->mem_res);
4146 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4147 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4148 		device_printf(dev, "impossible memory region size %ld\n",
4149 			      rman_get_size(sc->mem_res));
4150 		err = ENXIO;
4151 		goto abort_with_mem_res;
4152 	}
4153 
4154 	/* make a NUL-terminated copy of the EEPROM strings section of
4155 	   lanai SRAM */
4156 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4157 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4158 				rman_get_bushandle(sc->mem_res),
4159 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4160 				sc->eeprom_strings,
4161 				MXGE_EEPROM_STRINGS_SIZE - 2);
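	/*
	 * Only SIZE - 2 bytes are copied into a buffer that was just
	 * zeroed, so the string section is guaranteed to end in NUL
	 * bytes regardless of what the SRAM contains.
	 */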
4162 	err = mxge_parse_strings(sc);
4163 	if (err != 0)
4164 		goto abort_with_mem_res;
4165 
4166 	/* Enable write combining for efficient use of PCIe bus */
4167 	mxge_enable_wc(sc);
4168 
4169 	/* Allocate the out of band dma memory */
4170 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4171 			     sizeof (mxge_cmd_t), 64);
4172 	if (err != 0)
4173 		goto abort_with_mem_res;
4174 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4175 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4176 	if (err != 0)
4177 		goto abort_with_cmd_dma;
4178 
4179 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4180 	if (err != 0)
4181 		goto abort_with_zeropad_dma;
4182 
4183 	/* select & load the firmware */
4184 	err = mxge_select_firmware(sc);
4185 	if (err != 0)
4186 		goto abort_with_dmabench;
4187 	sc->intr_coal_delay = mxge_intr_coal_delay;
4188 
4189 	mxge_slice_probe(sc);
4190 	err = mxge_alloc_slices(sc);
4191 	if (err != 0)
4192 		goto abort_with_dmabench;
4193 
4194 	err = mxge_reset(sc, 0);
4195 	if (err != 0)
4196 		goto abort_with_slices;
4197 
4198 	err = mxge_alloc_rings(sc);
4199 	if (err != 0) {
4200 		device_printf(sc->dev, "failed to allocate rings\n");
4201 		goto abort_with_dmabench;
4202 	}
4203 
4204 	err = mxge_add_irq(sc);
4205 	if (err != 0) {
4206 		device_printf(sc->dev, "failed to add irq\n");
4207 		goto abort_with_rings;
4208 	}
4209 
4210 	ifp->if_baudrate = 100000000;	/* nominal; 10 Gb/s overflows a 32-bit u_long */
4211 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4212 		IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING |
4213 		IFCAP_VLAN_HWCSUM | IFCAP_LRO;
4214 
4215 	sc->max_mtu = mxge_max_mtu(sc);
4216 	if (sc->max_mtu >= 9000)
4217 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4218 	else
4219 		device_printf(dev, "MTU limited to %d.  Install "
4220 			      "latest firmware for 9000 byte jumbo support\n",
4221 			      sc->max_mtu - ETHER_HDR_LEN);
4222 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4223 	ifp->if_capenable = ifp->if_capabilities;
4224 	if (sc->lro_cnt == 0)
4225 		ifp->if_capenable &= ~IFCAP_LRO;
4226 	sc->csum_flag = 1;
4227 	ifp->if_init = mxge_init;
4228 	ifp->if_softc = sc;
4229 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4230 	ifp->if_ioctl = mxge_ioctl;
4231 	ifp->if_start = mxge_start;
4232 	/* Initialise the ifmedia structure */
4233 	ifmedia_init(&sc->media, 0, mxge_media_change,
4234 		     mxge_media_status);
4235 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4236 	mxge_media_probe(sc);
4237 	ether_ifattach(ifp, sc->mac_addr);
4238 	/* ether_ifattach sets mtu to 1500 */
4239 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
4240 		ifp->if_mtu = 9000;
4241 
4242 	mxge_add_sysctls(sc);
4243 	return 0;
4244 
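	/*
	 * Error unwinding: each label below releases what was allocated
	 * before the corresponding failure point, in reverse order of
	 * the allocations above.
	 */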
4245 abort_with_rings:
4246 	mxge_free_rings(sc);
4247 abort_with_slices:
4248 	mxge_free_slices(sc);
4249 abort_with_dmabench:
4250 	mxge_dma_free(&sc->dmabench_dma);
4251 abort_with_zeropad_dma:
4252 	mxge_dma_free(&sc->zeropad_dma);
4253 abort_with_cmd_dma:
4254 	mxge_dma_free(&sc->cmd_dma);
4255 abort_with_mem_res:
4256 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4257 abort_with_lock:
4258 	pci_disable_busmaster(dev);
4259 	mtx_destroy(&sc->cmd_mtx);
4260 	mtx_destroy(&sc->driver_mtx);
4261 	if_free(ifp);
4262 abort_with_parent_dmat:
4263 	bus_dma_tag_destroy(sc->parent_dmat);
4264 
4265 abort_with_nothing:
4266 	return err;
4267 }
4268 
4269 static int
4270 mxge_detach(device_t dev)
4271 {
4272 	mxge_softc_t *sc = device_get_softc(dev);
4273 
4274 	if (sc->ifp->if_vlantrunk != NULL) {
4275 		device_printf(sc->dev,
4276 			      "Detach vlans before removing module\n");
4277 		return EBUSY;
4278 	}
4279 	mtx_lock(&sc->driver_mtx);
4280 	callout_stop(&sc->co_hdl);
4281 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4282 		mxge_close(sc);
4283 	mtx_unlock(&sc->driver_mtx);
4284 	ether_ifdetach(sc->ifp);
4285 	ifmedia_removeall(&sc->media);
4286 	mxge_dummy_rdma(sc, 0);
4287 	mxge_rem_sysctls(sc);
4288 	mxge_rem_irq(sc);
4289 	mxge_free_rings(sc);
4290 	mxge_free_slices(sc);
4291 	mxge_dma_free(&sc->dmabench_dma);
4292 	mxge_dma_free(&sc->zeropad_dma);
4293 	mxge_dma_free(&sc->cmd_dma);
4294 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4295 	pci_disable_busmaster(dev);
4296 	mtx_destroy(&sc->cmd_mtx);
4297 	mtx_destroy(&sc->driver_mtx);
4298 	if_free(sc->ifp);
4299 	bus_dma_tag_destroy(sc->parent_dmat);
4300 	return 0;
4301 }
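/*
 * Teardown mirrors attach: stop traffic and the tick callout under the
 * driver lock, detach from the network stack, then release interrupts,
 * rings, slices, DMA memory, bus resources, and locks in reverse order
 * of allocation.
 */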
4302 
4303 static int
4304 mxge_shutdown(device_t dev)
4305 {
4306 	return 0;
4307 }
4308 
4309 /*
4310   This file uses Myri10GE driver indentation.
4311 
4312   Local Variables:
4313   c-file-style:"linux"
4314   tab-width:8
4315   End:
4316 */
4317