/******************************************************************************

Copyright (c) 2006-2008, Myricom Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 1. Redistributions of source code must retain the above copyright notice,
    this list of conditions and the following disclaimer.

 2. Neither the name of the Myricom Inc, nor the names of its
    contributors may be used to endorse or promote products derived from
    this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

***************************************************************************/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/linker.h>
#include <sys/firmware.h>
#include <sys/endian.h>
#include <sys/sockio.h>
#include <sys/mbuf.h>
#include <sys/malloc.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/sx.h>

#include <net/if.h>
#include <net/if_arp.h>
#include <net/ethernet.h>
#include <net/if_dl.h>
#include <net/if_media.h>

#include <net/bpf.h>

#include <net/if_types.h>
#include <net/if_vlan_var.h>
#include <net/zlib.h>

#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

#include <machine/bus.h>
#include <machine/in_cksum.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <sys/smp.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h> /* XXX for pci_cfg_restore */

#include <vm/vm.h>		/* for pmap_mapdev() */
#include <vm/pmap.h>

#if defined(__i386) || defined(__amd64)
#include <machine/specialreg.h>
#endif

#include <dev/mxge/mxge_mcp.h>
#include <dev/mxge/mcp_gen_header.h>
/*#define MXGE_FAKE_IFP*/
#include <dev/mxge/if_mxge_var.h>

/* tunable params */
static int mxge_nvidia_ecrc_enable = 1;
static int mxge_force_firmware = 0;
static int mxge_intr_coal_delay = 30;
static int mxge_deassert_wait = 1;
static int mxge_flow_control = 1;
static int mxge_verbose = 0;
static int mxge_lro_cnt = 8;
static int mxge_ticks;
static int mxge_max_slices = 1;
static int mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
static int mxge_always_promisc = 0;
static char *mxge_fw_unaligned = "mxge_ethp_z8e";
static char *mxge_fw_aligned = "mxge_eth_z8e";
static char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
static char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";

static int mxge_probe(device_t dev);
static int mxge_attach(device_t dev);
static int mxge_detach(device_t dev);
static int mxge_shutdown(device_t dev);
static void mxge_intr(void *arg);

static device_method_t mxge_methods[] =
{
  /* Device interface */
  DEVMETHOD(device_probe, mxge_probe),
  DEVMETHOD(device_attach, mxge_attach),
  DEVMETHOD(device_detach, mxge_detach),
  DEVMETHOD(device_shutdown, mxge_shutdown),
  {0, 0}
};

static driver_t mxge_driver =
{
  "mxge",
  mxge_methods,
  sizeof(mxge_softc_t),
};

static devclass_t mxge_devclass;

/* Declare ourselves to be a child of the PCI bus.*/
DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, 0, 0);
MODULE_DEPEND(mxge, firmware, 1, 1, 1);
MODULE_DEPEND(mxge, zlib, 1, 1, 1);

static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
static int mxge_close(mxge_softc_t *sc);
static int mxge_open(mxge_softc_t *sc);
static void mxge_tick(void *arg);

static int
mxge_probe(device_t dev)
{
  if ((pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM) &&
      ((pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E) ||
       (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9))) {
	  device_set_desc(dev, "Myri10G-PCIE-8A");
	  return 0;
  }
  return ENXIO;
}

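/* Map the NIC SRAM with a write-combining memory attribute so that
   bursts of PIO writes (e.g. from mxge_pio_copy()) can be merged by
   the CPU before they reach the PCIe bus. */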
static void
mxge_enable_wc(mxge_softc_t *sc)
{
#if defined(__i386) || defined(__amd64)
	vm_offset_t len;
	int err;

	sc->wc = 1;
	len = rman_get_size(sc->mem_res);
	err = pmap_change_attr((vm_offset_t) sc->sram,
			       len, PAT_WRITE_COMBINING);
	if (err != 0) {
		device_printf(sc->dev, "pmap_change_attr failed, %d\n",
			      err);
		sc->wc = 0;
	}
#endif
}


/* callback to get our DMA address */
static void
mxge_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs,
			 int error)
{
	if (error == 0) {
		*(bus_addr_t *) arg = segs->ds_addr;
	}
}

static int
mxge_dma_alloc(mxge_softc_t *sc, mxge_dma_t *dma, size_t bytes,
		   bus_size_t alignment)
{
	int err;
	device_t dev = sc->dev;
	bus_size_t boundary, maxsegsize;

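	/* A single segment larger than 4KB cannot also honor a 4KB
	   boundary, so drop the boundary restriction for big aligned
	   allocations; smaller ones stay within one 4KB page. */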
	if (bytes > 4096 && alignment == 4096) {
		boundary = 0;
		maxsegsize = bytes;
	} else {
		boundary = 4096;
		maxsegsize = 4096;
	}

	/* allocate DMAable memory tags */
	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
				 alignment,		/* alignment */
				 boundary,		/* boundary */
				 BUS_SPACE_MAXADDR,	/* low */
				 BUS_SPACE_MAXADDR,	/* high */
				 NULL, NULL,		/* filter */
				 bytes,			/* maxsize */
				 1,			/* num segs */
				 maxsegsize,		/* maxsegsize */
				 BUS_DMA_COHERENT,	/* flags */
				 NULL, NULL,		/* lock */
				 &dma->dmat);		/* tag */
	if (err != 0) {
		device_printf(dev, "couldn't alloc tag (err = %d)\n", err);
		return err;
	}

	/* allocate DMAable memory & map */
	err = bus_dmamem_alloc(dma->dmat, &dma->addr,
			       (BUS_DMA_WAITOK | BUS_DMA_COHERENT
				| BUS_DMA_ZERO),  &dma->map);
	if (err != 0) {
		device_printf(dev, "couldn't alloc mem (err = %d)\n", err);
		goto abort_with_dmat;
	}

	/* load the memory */
	err = bus_dmamap_load(dma->dmat, dma->map, dma->addr, bytes,
			      mxge_dmamap_callback,
			      (void *)&dma->bus_addr, 0);
	if (err != 0) {
		device_printf(dev, "couldn't load map (err = %d)\n", err);
		goto abort_with_mem;
	}
	return 0;

abort_with_mem:
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
abort_with_dmat:
	(void)bus_dma_tag_destroy(dma->dmat);
	return err;
}


static void
mxge_dma_free(mxge_dma_t *dma)
{
	bus_dmamap_unload(dma->dmat, dma->map);
	bus_dmamem_free(dma->dmat, dma->addr, dma->map);
	(void)bus_dma_tag_destroy(dma->dmat);
}

/*
 * The eeprom strings on the lanaiX have the format
 * SN=x\0
 * MAC=x:x:x:x:x:x\0
 * PC=text\0
 */

static int
mxge_parse_strings(mxge_softc_t *sc)
{
#define MXGE_NEXT_STRING(p) while(ptr < limit && *ptr++)

	char *ptr, *limit;
	int i, found_mac;

	ptr = sc->eeprom_strings;
	limit = sc->eeprom_strings + MXGE_EEPROM_STRINGS_SIZE;
	found_mac = 0;
	while (ptr < limit && *ptr != '\0') {
		if (memcmp(ptr, "MAC=", 4) == 0) {
			ptr += 1;
			sc->mac_addr_string = ptr;
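			/* ptr was advanced one past the 'M', so the
			   first 'ptr += 3' below skips "AC=" and each
			   subsequent one skips a "xx:" group, leaving
			   ptr on the next hex octet for strtoul() */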
			for (i = 0; i < 6; i++) {
				ptr += 3;
				if ((ptr + 2) > limit)
					goto abort;
				sc->mac_addr[i] = strtoul(ptr, NULL, 16);
				found_mac = 1;
			}
		} else if (memcmp(ptr, "PC=", 3) == 0) {
			ptr += 3;
			strncpy(sc->product_code_string, ptr,
				sizeof (sc->product_code_string) - 1);
		} else if (memcmp(ptr, "SN=", 3) == 0) {
			ptr += 3;
			strncpy(sc->serial_number_string, ptr,
				sizeof (sc->serial_number_string) - 1);
		}
		MXGE_NEXT_STRING(ptr);
	}

	if (found_mac)
		return 0;

 abort:
	device_printf(sc->dev, "failed to parse eeprom_strings\n");

	return ENXIO;
}

#if #cpu(i386) || defined __i386 || defined i386 || defined __i386__ || #cpu(x86_64) || defined __x86_64__
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	uint32_t val;
	unsigned long base, off;
	char *va, *cfgptr;
	device_t pdev, mcp55;
	uint16_t vendor_id, device_id, word;
	uintptr_t bus, slot, func, ivend, idev;
	uint32_t *ptr32;


	if (!mxge_nvidia_ecrc_enable)
		return;

	pdev = device_get_parent(device_get_parent(sc->dev));
	if (pdev == NULL) {
		device_printf(sc->dev, "could not find parent?\n");
		return;
	}
	vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
	device_id = pci_read_config(pdev, PCIR_DEVICE, 2);

	if (vendor_id != 0x10de)
		return;

	base = 0;

	if (device_id == 0x005d) {
		/* ck804, base address is magic */
		base = 0xe0000000UL;
	} else if (device_id >= 0x0374 && device_id <= 0x378) {
		/* mcp55, base address stored in chipset */
		mcp55 = pci_find_bsf(0, 0, 0);
		if (mcp55 &&
		    0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
		    0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
			word = pci_read_config(mcp55, 0x90, 2);
			base = ((unsigned long)word & 0x7ffeU) << 25;
		}
	}
	if (!base)
		return;

	/* XXXX
	   Test below is commented because it is believed that doing
	   config read/write beyond 0xff will access the config space
	   for the next larger function.  Uncomment this and remove
	   the hacky pmap_mapdev() way of accessing config space when
	   FreeBSD grows support for extended pcie config space access
	*/
#if 0
	/* See if we can, by some miracle, access the extended
	   config space */
	val = pci_read_config(pdev, 0x178, 4);
	if (val != 0xffffffff) {
		val |= 0x40;
		pci_write_config(pdev, 0x178, val, 4);
		return;
	}
#endif
	/* Rather than using normal pci config space writes, we must
	 * map the Nvidia config space ourselves.  This is because on
	 * opteron/nvidia class machine the 0xe000000 mapping is
	 * handled by the nvidia chipset, that means the internal PCI
	 * device (the on-chip northbridge), or the amd-8131 bridge
	 * and things behind them are not visible by this method.
	 */

	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_BUS, &bus);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_SLOT, &slot);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_FUNCTION, &func);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_VENDOR, &ivend);
	BUS_READ_IVAR(device_get_parent(pdev), pdev,
		      PCI_IVAR_DEVICE, &idev);

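	/* Compute the extended config address the way PCIe ECAM lays
	   it out: 1MB of config space per bus and 4KB per function,
	   with 8 functions per device slot. */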
	off =  base
		+ 0x00100000UL * (unsigned long)bus
		+ 0x00001000UL * (unsigned long)(func
						 + 8 * slot);

	/* map it into the kernel */
	va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);


	if (va == NULL) {
		device_printf(sc->dev, "pmap_mapdev() failed\n");
		return;
	}
	/* get a pointer to the config space mapped into the kernel */
	cfgptr = va + (off & PAGE_MASK);

	/* make sure that we can really access it */
	vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
	device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
	if (! (vendor_id == ivend && device_id == idev)) {
		device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
			      vendor_id, device_id);
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}

	ptr32 = (uint32_t*)(cfgptr + 0x178);
	val = *ptr32;

	if (val == 0xffffffff) {
		device_printf(sc->dev, "extended mapping failed\n");
		pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
		return;
	}
	*ptr32 = val | 0x40;
	pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
	if (mxge_verbose)
		device_printf(sc->dev,
			      "Enabled ECRC on upstream Nvidia bridge "
			      "at %d:%d:%d\n",
			      (int)bus, (int)slot, (int)func);
	return;
}
#else
static void
mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
{
	device_printf(sc->dev,
		      "Nforce 4 chipset on non-x86/amd64!?!?!\n");
	return;
}
#endif


static int
mxge_dma_test(mxge_softc_t *sc, int test_type)
{
	mxge_cmd_t cmd;
	bus_addr_t dmatest_bus = sc->dmabench_dma.bus_addr;
	int status;
	uint32_t len;
	char *test = " ";


	/* Run a small DMA test.
	 * The magic multipliers to the length tell the firmware
	 * to do DMA read, write, or read+write tests.  The
	 * results are returned in cmd.data0.  The upper 16
	 * bits of the return is the number of transfers completed.
	 * The lower 16 bits is the time in 0.5us ticks that the
	 * transfers took to complete.
	 */
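	/* Example: a result of 1000 transfers in 4096 ticks with
	   len == 4096 decodes to (1000 * 4096 * 2) / 4096 = 2000 MB/s,
	   since bytes per microsecond is numerically MB/s. */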

	len = sc->tx_boundary;

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10000;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read";
		goto abort;
	}
	sc->read_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);
	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x1;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "write";
		goto abort;
	}
	sc->write_dma = ((cmd.data0>>16) * len * 2) /
		(cmd.data0 & 0xffff);

	cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
	cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
	cmd.data2 = len * 0x10001;
	status = mxge_send_cmd(sc, test_type, &cmd);
	if (status != 0) {
		test = "read/write";
		goto abort;
	}
	sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
		(cmd.data0 & 0xffff);

abort:
	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
		device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
			      test, status);

	return status;
}

/*
 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
 * when the PCI-E Completion packets are aligned on an 8-byte
 * boundary.  Some PCI-E chip sets always align Completion packets; on
 * the ones that do not, the alignment can be enforced by enabling
 * ECRC generation (if supported).
 *
 * When PCI-E Completion packets are not aligned, it is actually more
 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
 *
 * If the driver can neither enable ECRC nor verify that it has
 * already been enabled, then it must use a firmware image which works
 * around unaligned completion packets (ethp_z8e.dat), and it should
 * also ensure that it never gives the device a Read-DMA which is
 * larger than 2KB by setting the tx_boundary to 2KB.  If ECRC is
 * enabled, then the driver should use the aligned (eth_z8e.dat)
 * firmware image, and set tx_boundary to 4KB.
 */

static int
mxge_firmware_probe(mxge_softc_t *sc)
{
	device_t dev = sc->dev;
	int reg, status;
	uint16_t pectl;

	sc->tx_boundary = 4096;
	/*
	 * Verify the max read request size was set to 4KB
	 * before trying the test with 4KB.
	 */
	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
		pectl = pci_read_config(dev, reg + 0x8, 2);
		if ((pectl & (5 << 12)) != (5 << 12)) {
			device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
				      pectl);
			sc->tx_boundary = 2048;
		}
	}

	/*
	 * load the optimized firmware (which assumes aligned PCIe
	 * completions) in order to see if it works on this host.
	 */
	sc->fw_name = mxge_fw_aligned;
	status = mxge_load_firmware(sc, 1);
	if (status != 0) {
		return status;
	}

	/*
	 * Enable ECRC if possible
	 */
	mxge_enable_nvidia_ecrc(sc);

	/*
	 * Run a DMA test which watches for unaligned completions and
	 * aborts on the first one seen.
	 */

	status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
	if (status == 0)
		return 0; /* keep the aligned firmware */

	if (status != E2BIG)
		device_printf(dev, "DMA test failed: %d\n", status);
	if (status == ENOSYS)
		device_printf(dev, "Falling back to ethp! "
			      "Please install up to date fw\n");
	return status;
}

static int
mxge_select_firmware(mxge_softc_t *sc)
{
	int aligned = 0;


	if (mxge_force_firmware != 0) {
		if (mxge_force_firmware == 1)
			aligned = 1;
		else
			aligned = 0;
		if (mxge_verbose)
			device_printf(sc->dev,
				      "Assuming %s completions (forced)\n",
				      aligned ? "aligned" : "unaligned");
		goto abort;
	}

	/* if the PCIe link width is 4 or less, we can use the aligned
	   firmware and skip any checks */
	if (sc->link_width != 0 && sc->link_width <= 4) {
		device_printf(sc->dev,
			      "PCIe x%d Link, expect reduced performance\n",
			      sc->link_width);
		aligned = 1;
		goto abort;
	}

	if (0 == mxge_firmware_probe(sc))
		return 0;

abort:
	if (aligned) {
		sc->fw_name = mxge_fw_aligned;
		sc->tx_boundary = 4096;
	} else {
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
	}
	return (mxge_load_firmware(sc, 0));
}

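/* scratch union apparently used to strip 'const' from firmware image
   pointers without drawing cast warnings */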
union qualhack
{
        const char *ro_char;
        char *rw_char;
};

static int
mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
{


	if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
		device_printf(sc->dev, "Bad firmware type: 0x%x\n",
			      be32toh(hdr->mcp_type));
		return EIO;
	}

	/* save firmware version for sysctl */
	strncpy(sc->fw_version, hdr->version, sizeof (sc->fw_version));
	if (mxge_verbose)
		device_printf(sc->dev, "firmware id: %s\n", hdr->version);

	sscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
	       &sc->fw_ver_minor, &sc->fw_ver_tiny);

	if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR
	      && sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
		device_printf(sc->dev, "Found firmware version %s\n",
			      sc->fw_version);
		device_printf(sc->dev, "Driver needs %d.%d\n",
			      MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
		return EINVAL;
	}
	return 0;

}

static void *
z_alloc(void *nil, u_int items, u_int size)
{
        void *ptr;

        ptr = malloc(items * size, M_TEMP, M_NOWAIT);
        return ptr;
}

static void
z_free(void *nil, void *ptr)
{
        free(ptr, M_TEMP);
}


static int
mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
{
	z_stream zs;
	char *inflate_buffer;
	const struct firmware *fw;
	const mcp_gen_header_t *hdr;
	unsigned hdr_offset;
	int status;
	unsigned int i;
	char dummy;
	size_t fw_len;

	fw = firmware_get(sc->fw_name);
	if (fw == NULL) {
		device_printf(sc->dev, "Could not find firmware image %s\n",
			      sc->fw_name);
		return ENOENT;
	}



	/* setup zlib and decompress f/w */
	bzero(&zs, sizeof (zs));
	zs.zalloc = z_alloc;
	zs.zfree = z_free;
	status = inflateInit(&zs);
	if (status != Z_OK) {
		status = EIO;
		goto abort_with_fw;
	}

	/* the uncompressed size is stored as the firmware version,
	   which would otherwise go unused */
	fw_len = (size_t) fw->version;
	inflate_buffer = malloc(fw_len, M_TEMP, M_NOWAIT);
	if (inflate_buffer == NULL) {
		status = ENOMEM;
		goto abort_with_zs;
	}
	zs.avail_in = fw->datasize;
	zs.next_in = __DECONST(char *, fw->data);
	zs.avail_out = fw_len;
	zs.next_out = inflate_buffer;
	status = inflate(&zs, Z_FINISH);
	if (status != Z_STREAM_END) {
		device_printf(sc->dev, "zlib %d\n", status);
		status = EIO;
		goto abort_with_buffer;
	}

	/* check id */
	hdr_offset = htobe32(*(const uint32_t *)
			     (inflate_buffer + MCP_HEADER_PTR_OFFSET));
	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
		device_printf(sc->dev, "Bad firmware file\n");
		status = EIO;
		goto abort_with_buffer;
	}
	hdr = (const void*)(inflate_buffer + hdr_offset);

	status = mxge_validate_firmware(sc, hdr);
	if (status != 0)
		goto abort_with_buffer;

	/* Copy the inflated firmware to NIC SRAM. */
	for (i = 0; i < fw_len; i += 256) {
		mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i,
			      inflate_buffer + i,
			      min(256U, (unsigned)(fw_len - i)));
		wmb();
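		/* read a byte back from SRAM to flush the preceding
		   posted PIO writes */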
		dummy = *sc->sram;
		wmb();
	}

	*limit = fw_len;
	status = 0;
abort_with_buffer:
	free(inflate_buffer, M_TEMP);
abort_with_zs:
	inflateEnd(&zs);
abort_with_fw:
	firmware_put(fw, FIRMWARE_UNLOAD);
	return status;
}

/*
 * Enable or disable periodic RDMAs from the host to make certain
 * chipsets resend dropped PCIe messages
 */

static void
mxge_dummy_rdma(mxge_softc_t *sc, int enable)
{
	char buf_bytes[72];
	volatile uint32_t *confirm;
	volatile char *submit;
	uint32_t *buf, dma_low, dma_high;
	int i;

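	/* round the command block up to an 8-byte boundary; buf_bytes
	   carries 8 bytes of slack for this */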
	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();

	/* send an rdma command to the PCIe engine, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);
	buf[0] = htobe32(dma_high);		/* confirm addr MSW */
	buf[1] = htobe32(dma_low);		/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);		/* confirm data */
	dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr);
	buf[3] = htobe32(dma_high); 		/* dummy addr MSW */
	buf[4] = htobe32(dma_low); 		/* dummy addr LSW */
	buf[5] = htobe32(enable);		/* enable? */


	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);

	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000);
		i++;
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "dummy rdma %s failed (%p = 0x%x)\n",
			      (enable ? "enable" : "disable"), confirm,
			      *confirm);
	}
	return;
}

static int
mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
{
	mcp_cmd_t *buf;
	char buf_bytes[sizeof(*buf) + 8];
	volatile mcp_cmd_response_t *response = sc->cmd;
	volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
	uint32_t dma_low, dma_high;
	int err, sleep_total = 0;

	/* ensure buf is aligned to 8 bytes */
	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	buf->data0 = htobe32(data->data0);
	buf->data1 = htobe32(data->data1);
	buf->data2 = htobe32(data->data2);
	buf->cmd = htobe32(cmd);
	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf->response_addr.low = htobe32(dma_low);
	buf->response_addr.high = htobe32(dma_high);
	mtx_lock(&sc->cmd_mtx);
	response->result = 0xffffffff;
	wmb();
	mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));

	/* wait up to 20ms */
	err = EAGAIN;
	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
		wmb();
		switch (be32toh(response->result)) {
		case 0:
			data->data0 = be32toh(response->data);
			err = 0;
			break;
		case 0xffffffff:
			DELAY(1000);
			break;
		case MXGEFW_CMD_UNKNOWN:
			err = ENOSYS;
			break;
		case MXGEFW_CMD_ERROR_UNALIGNED:
			err = E2BIG;
			break;
		case MXGEFW_CMD_ERROR_BUSY:
			err = EBUSY;
			break;
		default:
			device_printf(sc->dev,
				      "mxge: command %d "
				      "failed, result = %d\n",
				      cmd, be32toh(response->result));
			err = ENXIO;
			break;
		}
		if (err != EAGAIN)
			break;
	}
	if (err == EAGAIN)
		device_printf(sc->dev, "mxge: command %d timed out, "
			      "result = %d\n",
			      cmd, be32toh(response->result));
	mtx_unlock(&sc->cmd_mtx);
	return err;
}

static int
mxge_adopt_running_firmware(mxge_softc_t *sc)
{
	struct mcp_gen_header *hdr;
	const size_t bytes = sizeof (struct mcp_gen_header);
	size_t hdr_offset;
	int status;

	/* find running firmware header */
	hdr_offset = htobe32(*(volatile uint32_t *)
			     (sc->sram + MCP_HEADER_PTR_OFFSET));

	if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
		device_printf(sc->dev,
			      "Running firmware has bad header offset (%d)\n",
			      (int)hdr_offset);
		return EIO;
	}

	/* copy header of running firmware from SRAM to host memory to
	 * validate firmware */
	hdr = malloc(bytes, M_DEVBUF, M_NOWAIT);
	if (hdr == NULL) {
		device_printf(sc->dev, "could not malloc firmware hdr\n");
		return ENOMEM;
	}
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
				rman_get_bushandle(sc->mem_res),
				hdr_offset, (char *)hdr, bytes);
	status = mxge_validate_firmware(sc, hdr);
	free(hdr, M_DEVBUF);

	/*
	 * check to see if adopted firmware has bug where adopting
	 * it will cause broadcasts to be filtered unless the NIC
	 * is kept in ALLMULTI mode
	 */
	if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
	    sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
		sc->adopted_rx_filter_bug = 1;
		device_printf(sc->dev, "Adopting fw %d.%d.%d: "
			      "working around rx filter bug\n",
			      sc->fw_ver_major, sc->fw_ver_minor,
			      sc->fw_ver_tiny);
	}

	return status;
}


static int
mxge_load_firmware(mxge_softc_t *sc, int adopt)
{
	volatile uint32_t *confirm;
	volatile char *submit;
	char buf_bytes[72];
	uint32_t *buf, size, dma_low, dma_high;
	int status, i;

	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);

	size = sc->sram_size;
	status = mxge_load_firmware_helper(sc, &size);
	if (status) {
		if (!adopt)
			return status;
		/* Try to use the currently running firmware, if
		   it is new enough */
		status = mxge_adopt_running_firmware(sc);
		if (status) {
			device_printf(sc->dev,
				      "failed to adopt running firmware\n");
			return status;
		}
		device_printf(sc->dev,
			      "Successfully adopted running firmware\n");
		if (sc->tx_boundary == 4096) {
			device_printf(sc->dev,
				"Using firmware currently running on NIC"
				 ".  For optimal\n");
			device_printf(sc->dev,
				 "performance consider loading optimized "
				 "firmware\n");
		}
		sc->fw_name = mxge_fw_unaligned;
		sc->tx_boundary = 2048;
		return 0;
	}
	/* clear confirmation addr */
	confirm = (volatile uint32_t *)sc->cmd;
	*confirm = 0;
	wmb();
	/* send a reload command to the bootstrap MCP, and wait for the
	   response in the confirmation address.  The firmware should
	   write a -1 there to indicate it is alive and well
	*/

	dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.bus_addr);
	dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.bus_addr);

	buf[0] = htobe32(dma_high);	/* confirm addr MSW */
	buf[1] = htobe32(dma_low);	/* confirm addr LSW */
	buf[2] = htobe32(0xffffffff);	/* confirm data */

	/* FIX: All newest firmware should un-protect the bottom of
	   the sram before handoff. However, the very first interfaces
	   do not. Therefore the handoff copy must skip the first 8 bytes
	*/
					/* where the code starts*/
	buf[3] = htobe32(MXGE_FW_OFFSET + 8);
	buf[4] = htobe32(size - 8); 	/* length of code */
	buf[5] = htobe32(8);		/* where to copy to */
	buf[6] = htobe32(0);		/* where to jump to */

	submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
	mxge_pio_copy(submit, buf, 64);
	wmb();
	DELAY(1000);
	wmb();
	i = 0;
	while (*confirm != 0xffffffff && i < 20) {
		DELAY(1000*10);
		i++;
		bus_dmamap_sync(sc->cmd_dma.dmat,
				sc->cmd_dma.map, BUS_DMASYNC_POSTREAD);
	}
	if (*confirm != 0xffffffff) {
		device_printf(sc->dev, "handoff failed (%p = 0x%x)\n",
			confirm, *confirm);

		return ENXIO;
	}
	return 0;
}

static int
mxge_update_mac_address(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	uint8_t *addr = sc->mac_addr;
	int status;


	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
		     | (addr[2] << 8) | addr[3]);

	cmd.data1 = ((addr[4] << 8) | (addr[5]));

	status = mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
	return status;
}

static int
mxge_change_pause(mxge_softc_t *sc, int pause)
{
	mxge_cmd_t cmd;
	int status;

	if (pause)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set flow control mode\n");
		return ENXIO;
	}
	sc->pause = pause;
	return 0;
}

static void
mxge_change_promisc(mxge_softc_t *sc, int promisc)
{
	mxge_cmd_t cmd;
	int status;

	if (mxge_always_promisc)
		promisc = 1;

	if (promisc)
		status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC,
				       &cmd);
	else
		status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC,
				       &cmd);

	if (status) {
		device_printf(sc->dev, "Failed to set promisc mode\n");
	}
}

static void
mxge_set_multicast_list(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	struct ifmultiaddr *ifma;
	struct ifnet *ifp = sc->ifp;
	int err;

	/* This firmware is known to not support multicast */
	if (!sc->fw_multicast_support)
		return;

	/* Disable multicast filtering while we play with the lists*/
	err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_ENABLE_ALLMULTI,"
		       " error status: %d\n", err);
		return;
	}

	if (sc->adopted_rx_filter_bug)
		return;

	if (ifp->if_flags & IFF_ALLMULTI)
		/* request to disable multicast filtering, so quit here */
		return;

	/* Flush all the filters */

	err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
	if (err != 0) {
		device_printf(sc->dev,
			      "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS"
			      ", error status: %d\n", err);
		return;
	}

	/* Walk the multicast list, and add each address */

	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_LINK)
			continue;
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
		      &cmd.data0, 4);
		bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
		      &cmd.data1, 2);
		cmd.data0 = htonl(cmd.data0);
		cmd.data1 = htonl(cmd.data1);
		err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
		if (err != 0) {
			device_printf(sc->dev, "Failed "
			       "MXGEFW_JOIN_MULTICAST_GROUP, error status:"
			       "%d\n", err);
			/* abort, leaving multicast filtering off */
			IF_ADDR_UNLOCK(ifp);
			return;
		}
	}
	IF_ADDR_UNLOCK(ifp);
	/* Enable multicast filtering */
	err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
	if (err != 0) {
		device_printf(sc->dev, "Failed MXGEFW_DISABLE_ALLMULTI"
		       ", error status: %d\n", err);
	}
}

static int
mxge_max_mtu(mxge_softc_t *sc)
{
	mxge_cmd_t cmd;
	int status;

	if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* try to set nbufs to see if we can
	   use virtually contiguous jumbos */
	cmd.data0 = 0;
	status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
			       &cmd);
	if (status == 0)
		return MXGEFW_MAX_MTU - MXGEFW_PAD;

	/* otherwise, we're limited to MJUMPAGESIZE */
	return MJUMPAGESIZE - MXGEFW_PAD;
}

static int
mxge_reset(mxge_softc_t *sc, int interrupts_setup)
{
	struct mxge_slice_state *ss;
	mxge_rx_done_t *rx_done;
	volatile uint32_t *irq_claim;
	mxge_cmd_t cmd;
	int slice, status;

	/* try to send a reset command to the card to see if it
	   is alive */
	memset(&cmd, 0, sizeof (cmd));
	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		device_printf(sc->dev, "failed reset\n");
		return ENXIO;
	}

	mxge_dummy_rdma(sc, 1);


	/* set the intrq size */
	cmd.data0 = sc->rx_ring_size;
	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);

	/*
	 * Even though we already know how many slices are supported
	 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
	 * has magic side effects, and must be called after a reset.
	 * It must be called prior to calling any RSS related cmds,
	 * including assigning an interrupt queue for anything but
	 * slice 0.  It must also be called *after*
	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
	 * the firmware to compute offsets.
	 */

	if (sc->num_slices > 1) {
		/* ask the maximum number of slices it supports */
		status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to get number of slices\n");
			return status;
		}
		/*
		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
		 * to setting up the interrupt queue DMA
		 */
		cmd.data0 = sc->num_slices;
		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
		status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES,
					   &cmd);
		if (status != 0) {
			device_printf(sc->dev,
				      "failed to set number of slices\n");
			return status;
		}
	}


	if (interrupts_setup) {
		/* Now exchange information about interrupts  */
		for (slice = 0; slice < sc->num_slices; slice++) {
			rx_done = &sc->ss[slice].rx_done;
			memset(rx_done->entry, 0, sc->rx_ring_size);
			cmd.data0 = MXGE_LOWPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data1 = MXGE_HIGHPART_TO_U32(rx_done->dma.bus_addr);
			cmd.data2 = slice;
			status |= mxge_send_cmd(sc,
						MXGEFW_CMD_SET_INTRQ_DMA,
						&cmd);
		}
	}

	status |= mxge_send_cmd(sc,
				MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);


	sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);

	status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
	irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);


	status |= mxge_send_cmd(sc,  MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET,
				&cmd);
	sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
	if (status != 0) {
		device_printf(sc->dev, "failed set interrupt parameters\n");
		return status;
	}


	*sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);


	/* run a DMA benchmark */
	(void) mxge_dma_test(sc, MXGEFW_DMA_TEST);

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];

		ss->irq_claim = irq_claim + (2 * slice);
		/* reset mcp/driver shared state back to 0 */
		ss->rx_done.idx = 0;
		ss->rx_done.cnt = 0;
		ss->tx.req = 0;
		ss->tx.done = 0;
		ss->tx.pkt_done = 0;
		ss->tx.wake = 0;
		ss->tx.defrag = 0;
		ss->tx.stall = 0;
		ss->rx_big.cnt = 0;
		ss->rx_small.cnt = 0;
		ss->lro_bad_csum = 0;
		ss->lro_queued = 0;
		ss->lro_flushed = 0;
		if (ss->fw_stats != NULL) {
			ss->fw_stats->valid = 0;
			ss->fw_stats->send_done_count = 0;
		}
	}
	sc->rdma_tags_available = 15;
	status = mxge_update_mac_address(sc);
	mxge_change_promisc(sc, 0);
	mxge_change_pause(sc, sc->pause);
	mxge_set_multicast_list(sc);
	return status;
}

static int
mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int intr_coal_delay;
        int err;

        sc = arg1;
        intr_coal_delay = sc->intr_coal_delay;
        err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
        if (err != 0) {
                return err;
        }
        if (intr_coal_delay == sc->intr_coal_delay)
                return 0;

        if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
                return EINVAL;

	mtx_lock(&sc->driver_mtx);
	*sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
	sc->intr_coal_delay = intr_coal_delay;

	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_flow_control(SYSCTL_HANDLER_ARGS)
{
        mxge_softc_t *sc;
        unsigned int enabled;
        int err;

        sc = arg1;
        enabled = sc->pause;
        err = sysctl_handle_int(oidp, &enabled, arg2, req);
        if (err != 0) {
                return err;
        }
        if (enabled == sc->pause)
                return 0;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_pause(sc, enabled);
	mtx_unlock(&sc->driver_mtx);
        return err;
}

static int
mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt)
{
	struct ifnet *ifp;
	int err = 0;

	ifp = sc->ifp;
	if (lro_cnt == 0)
		ifp->if_capenable &= ~IFCAP_LRO;
	else
		ifp->if_capenable |= IFCAP_LRO;
	sc->lro_cnt = lro_cnt;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
		mxge_close(sc);
		err = mxge_open(sc);
	}
	return err;
}

static int
mxge_change_lro(SYSCTL_HANDLER_ARGS)
{
	mxge_softc_t *sc;
	unsigned int lro_cnt;
	int err;

	sc = arg1;
	lro_cnt = sc->lro_cnt;
	err = sysctl_handle_int(oidp, &lro_cnt, arg2, req);
	if (err != 0)
		return err;

	if (lro_cnt == sc->lro_cnt)
		return 0;

	if (lro_cnt > 128)
		return EINVAL;

	mtx_lock(&sc->driver_mtx);
	err = mxge_change_lro_locked(sc, lro_cnt);
	mtx_unlock(&sc->driver_mtx);
	return err;
}

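/* sysctl handler for the firmware stats exported below: the stats
   block is DMAed in network byte order, so byteswap the counter and
   report it as a plain int */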
static int
mxge_handle_be32(SYSCTL_HANDLER_ARGS)
{
        int err;

        if (arg1 == NULL)
                return EFAULT;
        arg2 = be32toh(*(int *)arg1);
        arg1 = NULL;
        err = sysctl_handle_int(oidp, arg1, arg2, req);

        return err;
}

static void
mxge_rem_sysctls(mxge_softc_t *sc)
{
	struct mxge_slice_state *ss;
	int slice;

	if (sc->slice_sysctl_tree == NULL)
		return;

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		if (ss == NULL || ss->sysctl_tree == NULL)
			continue;
		sysctl_ctx_free(&ss->sysctl_ctx);
		ss->sysctl_tree = NULL;
	}
	sysctl_ctx_free(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree = NULL;
}

static void
mxge_add_sysctls(mxge_softc_t *sc)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid_list *children;
	mcp_irq_data_t *fw;
	struct mxge_slice_state *ss;
	int slice;
	char slice_num[8];

	ctx = device_get_sysctl_ctx(sc->dev);
	children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
	fw = sc->ss[0].fw_stats;

	/* random information */
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "firmware_version",
		       CTLFLAG_RD, &sc->fw_version,
		       0, "firmware version");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "serial_number",
		       CTLFLAG_RD, &sc->serial_number_string,
		       0, "serial number");
	SYSCTL_ADD_STRING(ctx, children, OID_AUTO,
		       "product_code",
		       CTLFLAG_RD, &sc->product_code_string,
		       0, "product_code");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "pcie_link_width",
		       CTLFLAG_RD, &sc->link_width,
		       0, "PCIe link width");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "tx_boundary",
		       CTLFLAG_RD, &sc->tx_boundary,
		       0, "tx_boundary");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_combine",
		       CTLFLAG_RD, &sc->wc,
		       0, "write combining PIO?");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_dma_MBs",
		       CTLFLAG_RD, &sc->read_dma,
		       0, "DMA Read speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "write_dma_MBs",
		       CTLFLAG_RD, &sc->write_dma,
		       0, "DMA Write speed in MB/s");
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "read_write_dma_MBs",
		       CTLFLAG_RD, &sc->read_write_dma,
		       0, "DMA concurrent Read/Write speed in MB/s");


	/* performance related tunables */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"intr_coal_delay",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_intr_coal,
			"I", "interrupt coalescing delay in usecs");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"flow_control_enabled",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_flow_control,
			"I", "flow control (pause frames) enabled");

	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "deassert_wait",
		       CTLFLAG_RW, &mxge_deassert_wait,
		       0, "Wait for IRQ line to go low in ihandler");

	/* stats block from firmware is in network byte order.
	   Need to swap it */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"link_up",
			CTLTYPE_INT|CTLFLAG_RD, &fw->link_up,
			0, mxge_handle_be32,
			"I", "link up");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"rdma_tags_available",
			CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available,
			0, mxge_handle_be32,
			"I", "rdma_tags_available");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_crc32",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_crc32,
			0, mxge_handle_be32,
			"I", "dropped_bad_crc32");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_bad_phy",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_bad_phy,
			0, mxge_handle_be32,
			"I", "dropped_bad_phy");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_error_or_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_link_error_or_filtered,
			0, mxge_handle_be32,
			"I", "dropped_link_error_or_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_link_overflow",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow,
			0, mxge_handle_be32,
			"I", "dropped_link_overflow");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_multicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_multicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_multicast_filtered");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_big_buffer",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_big_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_no_small_buffer",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_no_small_buffer,
			0, mxge_handle_be32,
			"I", "dropped_no_small_buffer");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_overrun",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun,
			0, mxge_handle_be32,
			"I", "dropped_overrun");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_pause",
			CTLTYPE_INT|CTLFLAG_RD,
			&fw->dropped_pause,
			0, mxge_handle_be32,
			"I", "dropped_pause");
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_runt",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt,
			0, mxge_handle_be32,
			"I", "dropped_runt");

	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"dropped_unicast_filtered",
			CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered,
			0, mxge_handle_be32,
			"I", "dropped_unicast_filtered");

	/* verbose printing? */
	SYSCTL_ADD_INT(ctx, children, OID_AUTO,
		       "verbose",
		       CTLFLAG_RW, &mxge_verbose,
		       0, "verbose printing");

	/* lro */
	SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
			"lro_cnt",
			CTLTYPE_INT|CTLFLAG_RW, sc,
			0, mxge_change_lro,
			"I", "number of lro merge queues");


	/* add counters exported for debugging from all slices */
	sysctl_ctx_init(&sc->slice_sysctl_ctx);
	sc->slice_sysctl_tree =
		SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx, children, OID_AUTO,
				"slice", CTLFLAG_RD, 0, "");

	for (slice = 0; slice < sc->num_slices; slice++) {
		ss = &sc->ss[slice];
		sysctl_ctx_init(&ss->sysctl_ctx);
		ctx = &ss->sysctl_ctx;
		children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
		sprintf(slice_num, "%d", slice);
		ss->sysctl_tree =
			SYSCTL_ADD_NODE(ctx, children, OID_AUTO, slice_num,
					CTLFLAG_RD, 0, "");
		children = SYSCTL_CHILDREN(ss->sysctl_tree);
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_small_cnt",
			       CTLFLAG_RD, &ss->rx_small.cnt,
			       0, "rx_small_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "rx_big_cnt",
			       CTLFLAG_RD, &ss->rx_big.cnt,
			       0, "rx_big_cnt");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_req",
			       CTLFLAG_RD, &ss->tx.req,
			       0, "tx_req");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_flushed", CTLFLAG_RD, &ss->lro_flushed,
			       0, "number of lro merge queues flushed");

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "lro_queued", CTLFLAG_RD, &ss->lro_queued,
			       0, "number of frames appended to lro merge "
			       "queues");

		/* only transmit from slice 0 for now */
		if (slice > 0)
			continue;

		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_done",
			       CTLFLAG_RD, &ss->tx.done,
			       0, "tx_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_pkt_done",
			       CTLFLAG_RD, &ss->tx.pkt_done,
			       0, "tx_pkt_done");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_stall",
			       CTLFLAG_RD, &ss->tx.stall,
			       0, "tx_stall");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_wake",
			       CTLFLAG_RD, &ss->tx.wake,
			       0, "tx_wake");
		SYSCTL_ADD_INT(ctx, children, OID_AUTO,
			       "tx_defrag",
			       CTLFLAG_RD, &ss->tx.defrag,
			       0, "tx_defrag");
	}
}

/* copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
   backwards one at a time and handle ring wraps */

static inline void
mxge_submit_req_backwards(mxge_tx_ring_t *tx,
			    mcp_kreq_ether_send_t *src, int cnt)
{
        int idx, starting_slot;
        starting_slot = tx->req;
        while (cnt > 1) {
                cnt--;
                idx = (starting_slot + cnt) & tx->mask;
                mxge_pio_copy(&tx->lanai[idx],
			      &src[cnt], sizeof(*src));
                wmb();
        }
}

/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 */

static inline void
mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
                  int cnt)
{
        int idx, i;
        uint32_t *src_ints;
	volatile uint32_t *dst_ints;
        mcp_kreq_ether_send_t *srcp;
	volatile mcp_kreq_ether_send_t *dstp, *dst;
	uint8_t last_flags;

        idx = tx->req & tx->mask;

	last_flags = src->flags;
	src->flags = 0;
        wmb();
        dst = dstp = &tx->lanai[idx];
        srcp = src;

        if ((idx + cnt) < tx->mask) {
                for (i = 0; i < (cnt - 1); i += 2) {
                        mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
                        wmb(); /* force write every 32 bytes */
                        srcp += 2;
                        dstp += 2;
                }
        } else {
                /* submit all but the first request, and ensure
                   that it is submitted below */
                mxge_submit_req_backwards(tx, src, cnt);
                i = 0;
        }
        if (i < cnt) {
                /* submit the first request */
                mxge_pio_copy(dstp, srcp, sizeof(*src));
                wmb(); /* barrier before setting valid flag */
        }

        /* re-write the last 32-bits with the valid flags */
        src->flags = last_flags;
        src_ints = (uint32_t *)src;
        src_ints+=3;
        dst_ints = (volatile uint32_t *)dst;
        dst_ints+=3;
        *dst_ints =  *src_ints;
        tx->req += cnt;
        wmb();
}

#if IFCAP_TSO4

static void
mxge_encap_tso(struct mxge_slice_state *ss, struct mbuf *m,
	       int busdma_seg_cnt, int ip_off)
{
	mxge_tx_ring_t *tx;
	mcp_kreq_ether_send_t *req;
	bus_dma_segment_t *seg;
	struct ip *ip;
	struct tcphdr *tcp;
	uint32_t low, high_swapped;
	int len, seglen, cum_len, cum_len_next;
	int next_is_first, chop, cnt, rdma_count, small;
	uint16_t pseudo_hdr_offset, cksum_offset, mss;
	uint8_t flags, flags_next;
	static int once;

	mss = m->m_pkthdr.tso_segsz;

	/* negative cum_len signifies to the
	 * send loop that we are still in the
	 * header portion of the TSO packet.
	 */

	/* ensure we have the ethernet, IP and TCP
	   header together in the first mbuf, copy
	   it to a scratch buffer if not */
	if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
		m_copydata(m, 0, ip_off + sizeof (*ip),
			   ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	} else {
		ip = (struct ip *)(mtod(m, char *) + ip_off);
	}
	if (__predict_false(m->m_len < ip_off + (ip->ip_hl << 2)
			    + sizeof (*tcp))) {
		m_copydata(m, 0, ip_off + (ip->ip_hl << 2)
			   + sizeof (*tcp),  ss->scratch);
		ip = (struct ip *)(ss->scratch + ip_off);
	}

	tcp = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
	cum_len = -(ip_off + ((ip->ip_hl + tcp->th_off) << 2));

	/* TSO implies checksum offload on this hardware */
	cksum_offset = ip_off + (ip->ip_hl << 2);
	flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;


	/* for TSO, pseudo_hdr_offset holds mss.
	 * The firmware figures out where to put
	 * the checksum by parsing the header. */
	pseudo_hdr_offset = htobe16(mss);

	tx = &ss->tx;
	req = tx->req_list;
	seg = tx->seg_list;
	cnt = 0;
	rdma_count = 0;
	/* "rdma_count" is the number of RDMAs belonging to the
	 * current packet BEFORE the current send request. For
	 * non-TSO packets, this is equal to "count".
	 * For TSO packets, rdma_count needs to be reset
	 * to 0 after a segment cut.
	 *
	 * The rdma_count field of the send request is
	 * the number of RDMAs of the packet starting at
	 * that request. For TSO send requests with one or more cuts
	 * in the middle, this is the number of RDMAs starting
	 * after the last cut in the request. All previous
	 * segments before the last cut implicitly have 1 RDMA.
	 *
	 * Since the number of RDMAs is not known beforehand,
	 * it must be filled-in retroactively - after each
	 * segmentation cut or at the end of the entire packet.
	 */

	while (busdma_seg_cnt) {
		/* Break the busdma segment up into pieces*/
		low = MXGE_LOWPART_TO_U32(seg->ds_addr);
		high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
		len = seg->ds_len;

		while (len) {
			flags_next = flags & ~MXGEFW_FLAGS_FIRST;
			seglen = len;
			cum_len_next = cum_len + seglen;
			(req-rdma_count)->rdma_count = rdma_count + 1;
			if (__predict_true(cum_len >= 0)) {
				/* payload */
				chop = (cum_len_next > mss);
				cum_len_next = cum_len_next % mss;
				next_is_first = (cum_len_next == 0);
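				/* branchless bookkeeping: 'chop' and
				   'next_is_first' are 0 or 1, so the
				   multiplies below set the CHOP/FIRST
				   flags and the mask/add reset
				   rdma_count when a segment cut or
				   boundary is crossed */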
1809 				flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1810 				flags_next |= next_is_first *
1811 					MXGEFW_FLAGS_FIRST;
1812 				rdma_count |= -(chop | next_is_first);
1813 				rdma_count += chop & !next_is_first;
1814 			} else if (cum_len_next >= 0) {
1815 				/* header ends */
1816 				rdma_count = -1;
1817 				cum_len_next = 0;
1818 				seglen = -cum_len;
1819 				small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1820 				flags_next = MXGEFW_FLAGS_TSO_PLD |
1821 					MXGEFW_FLAGS_FIRST |
1822 					(small * MXGEFW_FLAGS_SMALL);
1823 			    }
1824 
1825 			req->addr_high = high_swapped;
1826 			req->addr_low = htobe32(low);
1827 			req->pseudo_hdr_offset = pseudo_hdr_offset;
1828 			req->pad = 0;
1829 			req->rdma_count = 1;
1830 			req->length = htobe16(seglen);
1831 			req->cksum_offset = cksum_offset;
1832 			req->flags = flags | ((cum_len & 1) *
1833 					      MXGEFW_FLAGS_ALIGN_ODD);
1834 			low += seglen;
1835 			len -= seglen;
1836 			cum_len = cum_len_next;
1837 			flags = flags_next;
1838 			req++;
1839 			cnt++;
1840 			rdma_count++;
1841 			if (__predict_false(cksum_offset > seglen))
1842 				cksum_offset -= seglen;
1843 			else
1844 				cksum_offset = 0;
1845 			if (__predict_false(cnt > tx->max_desc))
1846 				goto drop;
1847 		}
1848 		busdma_seg_cnt--;
1849 		seg++;
1850 	}
1851 	(req-rdma_count)->rdma_count = rdma_count;
1852 
1853 	do {
1854 		req--;
1855 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
1856 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1857 
1858 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
1859 	mxge_submit_req(tx, tx->req_list, cnt);
1860 	return;
1861 
1862 drop:
1863 	bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1864 	m_freem(m);
1865 	ss->sc->ifp->if_oerrors++;
1866 	if (!once) {
1867 		printf("tx->max_desc exceeded via TSO!\n");
1868 		printf("mss = %d, %ld, %d!\n", mss,
1869 		       (long)seg - (long)tx->seg_list, tx->max_desc);
1870 		once = 1;
1871 	}
1872 	return;
1873 
1874 }
1875 
1876 #endif /* IFCAP_TSO4 */
1877 
1878 #ifdef MXGE_NEW_VLAN_API
1879 /*
1880  * We reproduce the software vlan tag insertion from
1881  * net/if_vlan.c:vlan_start() here so that we can advertise "hardware"
1882  * vlan tag insertion. We need to advertise this in order to have the
1883  * vlan interface respect our csum offload flags.
1884  */
1885 static struct mbuf *
1886 mxge_vlan_tag_insert(struct mbuf *m)
1887 {
1888 	struct ether_vlan_header *evl;
1889 
1890 	M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_DONTWAIT);
1891 	if (__predict_false(m == NULL))
1892 		return NULL;
1893 	if (m->m_len < sizeof(*evl)) {
1894 		m = m_pullup(m, sizeof(*evl));
1895 		if (__predict_false(m == NULL))
1896 			return NULL;
1897 	}
1898 	/*
1899 	 * Transform the Ethernet header into an Ethernet header
1900 	 * with 802.1Q encapsulation.
1901 	 */
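	/*
	 * M_PREPEND left ETHER_VLAN_ENCAP_LEN free bytes at the front
	 * of the mbuf; the bcopy below slides the destination and
	 * source addresses toward the head by that amount, opening a
	 * 4-byte gap just before the type field for the 802.1Q
	 * ethertype and tag.
	 */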
1902 	evl = mtod(m, struct ether_vlan_header *);
1903 	bcopy((char *)evl + ETHER_VLAN_ENCAP_LEN,
1904 	      (char *)evl, ETHER_HDR_LEN - ETHER_TYPE_LEN);
1905 	evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
1906 	evl->evl_tag = htons(m->m_pkthdr.ether_vtag);
1907 	m->m_flags &= ~M_VLANTAG;
1908 	return m;
1909 }
1910 #endif /* MXGE_NEW_VLAN_API */
1911 
1912 static void
1913 mxge_encap(struct mxge_slice_state *ss, struct mbuf *m)
1914 {
1915 	mxge_softc_t *sc;
1916 	mcp_kreq_ether_send_t *req;
1917 	bus_dma_segment_t *seg;
1918 	struct mbuf *m_tmp;
1919 	struct ifnet *ifp;
1920 	mxge_tx_ring_t *tx;
1921 	struct ip *ip;
1922 	int cnt, cum_len, err, i, idx, odd_flag, ip_off;
1923 	uint16_t pseudo_hdr_offset;
1924 	uint8_t flags, cksum_offset;
1925 
1926 
1927 	sc = ss->sc;
1928 	ifp = sc->ifp;
1929 	tx = &ss->tx;
1930 
1931 	ip_off = sizeof (struct ether_header);
1932 #ifdef MXGE_NEW_VLAN_API
1933 	if (m->m_flags & M_VLANTAG) {
1934 		m = mxge_vlan_tag_insert(m);
1935 		if (__predict_false(m == NULL))
1936 			goto drop;
1937 		ip_off += ETHER_VLAN_ENCAP_LEN;
1938 	}
1939 #endif
1940 	/* (try to) map the frame for DMA */
1941 	idx = tx->req & tx->mask;
1942 	err = bus_dmamap_load_mbuf_sg(tx->dmat, tx->info[idx].map,
1943 				      m, tx->seg_list, &cnt,
1944 				      BUS_DMA_NOWAIT);
1945 	if (__predict_false(err == EFBIG)) {
1946 		/* Too many segments in the chain.  Try
1947 		   to defrag */
1948 		m_tmp = m_defrag(m, M_NOWAIT);
1949 		if (m_tmp == NULL) {
1950 			goto drop;
1951 		}
1952 		ss->tx.defrag++;
1953 		m = m_tmp;
1954 		err = bus_dmamap_load_mbuf_sg(tx->dmat,
1955 					      tx->info[idx].map,
1956 					      m, tx->seg_list, &cnt,
1957 					      BUS_DMA_NOWAIT);
1958 	}
1959 	if (__predict_false(err != 0)) {
1960 		device_printf(sc->dev, "bus_dmamap_load_mbuf_sg returned %d"
1961 			      " packet len = %d\n", err, m->m_pkthdr.len);
1962 		goto drop;
1963 	}
1964 	bus_dmamap_sync(tx->dmat, tx->info[idx].map,
1965 			BUS_DMASYNC_PREWRITE);
1966 	tx->info[idx].m = m;
1967 
1968 #if IFCAP_TSO4
1969 	/* TSO is different enough, we handle it in another routine */
1970 	if (m->m_pkthdr.csum_flags & (CSUM_TSO)) {
1971 		mxge_encap_tso(ss, m, cnt, ip_off);
1972 		return;
1973 	}
1974 #endif
1975 
1976 	req = tx->req_list;
1977 	cksum_offset = 0;
1978 	pseudo_hdr_offset = 0;
1979 	flags = MXGEFW_FLAGS_NO_TSO;
1980 
1981 	/* checksum offloading? */
1982 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA)) {
1983 		/* ensure ip header is in first mbuf, copy
1984 		   it to a scratch buffer if not */
1985 		if (__predict_false(m->m_len < ip_off + sizeof (*ip))) {
1986 			m_copydata(m, 0, ip_off + sizeof (*ip),
1987 				   ss->scratch);
1988 			ip = (struct ip *)(ss->scratch + ip_off);
1989 		} else {
1990 			ip = (struct ip *)(mtod(m, char *) + ip_off);
1991 		}
1992 		cksum_offset = ip_off + (ip->ip_hl << 2);
1993 		pseudo_hdr_offset = cksum_offset +  m->m_pkthdr.csum_data;
1994 		pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1995 		req->cksum_offset = cksum_offset;
1996 		flags |= MXGEFW_FLAGS_CKSUM;
1997 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1998 	} else {
1999 		odd_flag = 0;
2000 	}
2001 	if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
2002 		flags |= MXGEFW_FLAGS_SMALL;
2003 
2004 	/* convert segments into a request list */
2005 	cum_len = 0;
2006 	seg = tx->seg_list;
2007 	req->flags = MXGEFW_FLAGS_FIRST;
2008 	for (i = 0; i < cnt; i++) {
2009 		req->addr_low =
2010 			htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2011 		req->addr_high =
2012 			htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2013 		req->length = htobe16(seg->ds_len);
2014 		req->cksum_offset = cksum_offset;
2015 		if (cksum_offset > seg->ds_len)
2016 			cksum_offset -= seg->ds_len;
2017 		else
2018 			cksum_offset = 0;
2019 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2020 		req->pad = 0; /* complete solid 16-byte block */
2021 		req->rdma_count = 1;
2022 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2023 		cum_len += seg->ds_len;
2024 		seg++;
2025 		req++;
2026 		req->flags = 0;
2027 	}
2028 	req--;
2029 	/* pad runts to 60 bytes */
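	/* (the minimum Ethernet frame is 64 bytes including the
	 *  4-byte FCS, so anything shorter than 60 bytes here is
	 *  extended with zeros DMA'd from the shared zeropad block) */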
2030 	if (cum_len < 60) {
2031 		req++;
2032 		req->addr_low =
2033 			htobe32(MXGE_LOWPART_TO_U32(sc->zeropad_dma.bus_addr));
2034 		req->addr_high =
2035 			htobe32(MXGE_HIGHPART_TO_U32(sc->zeropad_dma.bus_addr));
2036 		req->length = htobe16(60 - cum_len);
2037 		req->cksum_offset = 0;
2038 		req->pseudo_hdr_offset = pseudo_hdr_offset;
2039 		req->pad = 0; /* complete solid 16-byte block */
2040 		req->rdma_count = 1;
2041 		req->flags |= flags | ((cum_len & 1) * odd_flag);
2042 		cnt++;
2043 	}
2044 
2045 	tx->req_list[0].rdma_count = cnt;
2046 #if 0
2047 	/* print what the firmware will see */
2048 	for (i = 0; i < cnt; i++) {
2049 		printf("%d: addr: 0x%x 0x%x len:%d pso%d,"
2050 		    "cso:%d, flags:0x%x, rdma:%d\n",
2051 		    i, (int)ntohl(tx->req_list[i].addr_high),
2052 		    (int)ntohl(tx->req_list[i].addr_low),
2053 		    (int)ntohs(tx->req_list[i].length),
2054 		    (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
2055 		    tx->req_list[i].cksum_offset, tx->req_list[i].flags,
2056 		    tx->req_list[i].rdma_count);
2057 	}
2058 	printf("--------------\n");
2059 #endif
2060 	tx->info[((cnt - 1) + tx->req) & tx->mask].flag = 1;
2061 	mxge_submit_req(tx, tx->req_list, cnt);
2062 	return;
2063 
2064 drop:
2065 	m_freem(m);
2066 	ifp->if_oerrors++;
2067 	return;
2068 }
2069 
2070 
2071 
2072 
2073 static inline void
2074 mxge_start_locked(struct mxge_slice_state *ss)
2075 {
2076 	mxge_softc_t *sc;
2077 	struct mbuf *m;
2078 	struct ifnet *ifp;
2079 	mxge_tx_ring_t *tx;
2080 
2081 	sc = ss->sc;
2082 	ifp = sc->ifp;
2083 	tx = &ss->tx;
2084 	while ((tx->mask - (tx->req - tx->done)) > tx->max_desc) {
2085 		IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
2086 		if (m == NULL) {
2087 			return;
2088 		}
2089 		/* let BPF see it */
2090 		BPF_MTAP(ifp, m);
2091 
2092 		/* give it to the nic */
2093 		mxge_encap(ss, m);
2094 	}
2095 	/* ran out of transmit slots */
2096 	if ((sc->ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
2097 		sc->ifp->if_drv_flags |= IFF_DRV_OACTIVE;
2098 		tx->stall++;
2099 	}
2100 }
2101 
2102 static void
2103 mxge_start(struct ifnet *ifp)
2104 {
2105 	mxge_softc_t *sc = ifp->if_softc;
2106 	struct mxge_slice_state *ss;
2107 
2108 	/* only use the first slice for now */
2109 	ss = &sc->ss[0];
2110 	mtx_lock(&ss->tx.mtx);
2111 	mxge_start_locked(ss);
2112 	mtx_unlock(&ss->tx.mtx);
2113 }
2114 
2115 /*
2116  * Copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
2117  * at most 32 bytes at a time, so as to avoid involving the software
2118  * PIO handler in the NIC.  We re-write the first segment's low
2119  * DMA address to mark it valid only after we write the entire chunk
2120  * in a burst.
2121  */
2122 static inline void
2123 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2124 		mcp_kreq_ether_recv_t *src)
2125 {
2126 	uint32_t low;
2127 
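	/* Poison the first entry's low address while the two 32-byte
	 * PIO bursts stream out, then write the real address last:
	 * the firmware ignores the chunk until that final store makes
	 * all 8 receive descriptors visible at once. */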
2128 	low = src->addr_low;
2129 	src->addr_low = 0xffffffff;
2130 	mxge_pio_copy(dst, src, 4 * sizeof (*src));
2131 	wmb();
2132 	mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2133 	wmb();
2134 	src->addr_low = low;
2135 	dst->addr_low = low;
2136 	wmb();
2137 }
2138 
2139 static int
2140 mxge_get_buf_small(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2141 {
2142 	bus_dma_segment_t seg;
2143 	struct mbuf *m;
2144 	mxge_rx_ring_t *rx = &ss->rx_small;
2145 	int cnt, err;
2146 
2147 	m = m_gethdr(M_DONTWAIT, MT_DATA);
2148 	if (m == NULL) {
2149 		rx->alloc_fail++;
2150 		err = ENOBUFS;
2151 		goto done;
2152 	}
2153 	m->m_len = MHLEN;
2154 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2155 				      &seg, &cnt, BUS_DMA_NOWAIT);
2156 	if (err != 0) {
2157 		m_free(m);
2158 		goto done;
2159 	}
2160 	rx->info[idx].m = m;
2161 	rx->shadow[idx].addr_low =
2162 		htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2163 	rx->shadow[idx].addr_high =
2164 		htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2165 
2166 done:
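	/* descriptors are handed to the NIC only in aligned batches
	 * of 8, matching the bursts mxge_submit_8rx writes */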
2167 	if ((idx & 7) == 7)
2168 		mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2169 	return err;
2170 }
2171 
2172 static int
2173 mxge_get_buf_big(struct mxge_slice_state *ss, bus_dmamap_t map, int idx)
2174 {
2175 	bus_dma_segment_t seg[3];
2176 	struct mbuf *m;
2177 	mxge_rx_ring_t *rx = &ss->rx_big;
2178 	int cnt, err, i;
2179 
2180 	if (rx->cl_size == MCLBYTES)
2181 		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2182 	else
2183 		m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, rx->cl_size);
2184 	if (m == NULL) {
2185 		rx->alloc_fail++;
2186 		err = ENOBUFS;
2187 		goto done;
2188 	}
2189 	m->m_len = rx->cl_size;
2190 	err = bus_dmamap_load_mbuf_sg(rx->dmat, map, m,
2191 				      seg, &cnt, BUS_DMA_NOWAIT);
2192 	if (err != 0) {
2193 		m_free(m);
2194 		goto done;
2195 	}
2196 	rx->info[idx].m = m;
2197 	rx->shadow[idx].addr_low =
2198 		htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
2199 	rx->shadow[idx].addr_high =
2200 		htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
2201 
2202 #if MXGE_VIRT_JUMBOS
2203 	for (i = 1; i < cnt; i++) {
2204 		rx->shadow[idx + i].addr_low =
2205 			htobe32(MXGE_LOWPART_TO_U32(seg[i].ds_addr));
2206 		rx->shadow[idx + i].addr_high =
2207 			htobe32(MXGE_HIGHPART_TO_U32(seg[i].ds_addr));
2208 	}
2209 #endif
2210 
2211 done:
2212 	for (i = 0; i < rx->nbufs; i++) {
2213 		if ((idx & 7) == 7) {
2214 			mxge_submit_8rx(&rx->lanai[idx - 7],
2215 					&rx->shadow[idx - 7]);
2216 		}
2217 		idx++;
2218 	}
2219 	return err;
2220 }
2221 
2222 /*
2223  *  Myri10GE hardware checksums are not valid if the sender
2224  *  padded the frame with non-zero padding.  This is because
2225  *  the firmware just does a simple 16-bit 1s complement
2226  *  checksum across the entire frame, excluding the first 14
2227  *  bytes.  It is best to simply check the checksum and
2228  *  tell the stack about it only if the checksum is good.
2229  */
2230 
2231 static inline uint16_t
2232 mxge_rx_csum(struct mbuf *m, int csum)
2233 {
2234 	struct ether_header *eh;
2235 	struct ip *ip;
2236 	uint16_t c;
2237 
2238 	eh = mtod(m, struct ether_header *);
2239 
2240 	/* only deal with IPv4 TCP & UDP for now */
2241 	if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2242 		return 1;
2243 	ip = (struct ip *)(eh + 1);
2244 	if (__predict_false(ip->ip_p != IPPROTO_TCP &&
2245 			    ip->ip_p != IPPROTO_UDP))
2246 		return 1;
2247 
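	/* The firmware's csum is a raw ones-complement sum over
	 * everything past the Ethernet header.  A valid IP header
	 * sums to 0xffff (negative zero) and so drops out; folding in
	 * the pseudo-header terms (addresses, protocol, and the
	 * TCP/UDP length, i.e. ip_len minus the IP header length)
	 * must then yield 0xffff for a good checksum, which the final
	 * XOR maps to a return value of zero. */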
2248 	c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2249 		      htonl(ntohs(csum) + ntohs(ip->ip_len) -
2250 			    (ip->ip_hl << 2) + ip->ip_p));
2251 	c ^= 0xffff;
2252 	return (c);
2253 }
2254 
2255 static void
2256 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2257 {
2258 	struct ether_vlan_header *evl;
2259 	struct ether_header *eh;
2260 	uint32_t partial;
2261 
2262 	evl = mtod(m, struct ether_vlan_header *);
2263 	eh = mtod(m, struct ether_header *);
2264 
2265 	/*
2266 	 * fix checksum by subtracting ETHER_VLAN_ENCAP_LEN bytes
2267 	 * after what the firmware thought was the end of the ethernet
2268 	 * header.
2269 	 */
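	/*
	 * Ones-complement subtraction: adding the complement of the
	 * four tag bytes, propagating the carry, and folding the sum
	 * back to 16 bits (twice, since the first fold can itself
	 * carry) is equivalent to removing them from the checksum.
	 */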
2270 
2271 	/* put checksum into host byte order */
2272 	*csum = ntohs(*csum);
2273 	partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2274 	(*csum) += ~partial;
2275 	(*csum) +=  ((*csum) < ~partial);
2276 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2277 	(*csum) = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2278 
2279 	/* restore checksum to network byte order;
2280 	   later consumers expect this */
2281 	*csum = htons(*csum);
2282 
2283 	/* save the tag */
2284 #ifdef MXGE_NEW_VLAN_API
2285 	m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
2286 #else
2287 	{
2288 		struct m_tag *mtag;
2289 		mtag = m_tag_alloc(MTAG_VLAN, MTAG_VLAN_TAG, sizeof(u_int),
2290 				   M_NOWAIT);
2291 		if (mtag == NULL)
2292 			return;
2293 		VLAN_TAG_VALUE(mtag) = ntohs(evl->evl_tag);
2294 		m_tag_prepend(m, mtag);
2295 	}
2296 
2297 #endif
2298 	m->m_flags |= M_VLANTAG;
2299 
2300 	/*
2301 	 * Remove the 802.1q header by copying the Ethernet
2302 	 * addresses over it and adjusting the beginning of
2303 	 * the data in the mbuf.  The encapsulated Ethernet
2304 	 * type field is already in place.
2305 	 */
2306 	bcopy((char *)evl, (char *)evl + ETHER_VLAN_ENCAP_LEN,
2307 	      ETHER_HDR_LEN - ETHER_TYPE_LEN);
2308 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
2309 }
2310 
2311 
2312 static inline void
2313 mxge_rx_done_big(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2314 {
2315 	mxge_softc_t *sc;
2316 	struct ifnet *ifp;
2317 	struct mbuf *m;
2318 	struct ether_header *eh;
2319 	mxge_rx_ring_t *rx;
2320 	bus_dmamap_t old_map;
2321 	int idx;
2322 	uint16_t tcpudp_csum;
2323 
2324 	sc = ss->sc;
2325 	ifp = sc->ifp;
2326 	rx = &ss->rx_big;
2327 	idx = rx->cnt & rx->mask;
2328 	rx->cnt += rx->nbufs;
2329 	/* save a pointer to the received mbuf */
2330 	m = rx->info[idx].m;
2331 	/* try to replace the received mbuf */
2332 	if (mxge_get_buf_big(ss, rx->extra_map, idx)) {
2333 		/* drop the frame -- the old mbuf is re-cycled */
2334 		ifp->if_ierrors++;
2335 		return;
2336 	}
2337 
2338 	/* unmap the received buffer */
2339 	old_map = rx->info[idx].map;
2340 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2341 	bus_dmamap_unload(rx->dmat, old_map);
2342 
2343 	/* swap the bus_dmamap_t's */
2344 	rx->info[idx].map = rx->extra_map;
2345 	rx->extra_map = old_map;
2346 
2347 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2348 	 * aligned */
2349 	m->m_data += MXGEFW_PAD;
2350 
2351 	m->m_pkthdr.rcvif = ifp;
2352 	m->m_len = m->m_pkthdr.len = len;
2353 	ss->ipackets++;
2354 	eh = mtod(m, struct ether_header *);
2355 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2356 		mxge_vlan_tag_remove(m, &csum);
2357 	}
2358 	/* if the checksum is valid, mark it in the mbuf header */
2359 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2360 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2361 			return;
2362 		/* otherwise, it was a UDP frame, or a TCP frame which
2363 		   we could not do LRO on.  Tell the stack that the
2364 		   checksum is good */
2365 		m->m_pkthdr.csum_data = 0xffff;
2366 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2367 	}
2368 	/* pass the frame up the stack */
2369 	(*ifp->if_input)(ifp, m);
2370 }
2371 
2372 static inline void
2373 mxge_rx_done_small(struct mxge_slice_state *ss, uint32_t len, uint32_t csum)
2374 {
2375 	mxge_softc_t *sc;
2376 	struct ifnet *ifp;
2377 	struct ether_header *eh;
2378 	struct mbuf *m;
2379 	mxge_rx_ring_t *rx;
2380 	bus_dmamap_t old_map;
2381 	int idx;
2382 	uint16_t tcpudp_csum;
2383 
2384 	sc = ss->sc;
2385 	ifp = sc->ifp;
2386 	rx = &ss->rx_small;
2387 	idx = rx->cnt & rx->mask;
2388 	rx->cnt++;
2389 	/* save a pointer to the received mbuf */
2390 	m = rx->info[idx].m;
2391 	/* try to replace the received mbuf */
2392 	if (mxge_get_buf_small(ss, rx->extra_map, idx)) {
2393 		/* drop the frame -- the old mbuf is re-cycled */
2394 		ifp->if_ierrors++;
2395 		return;
2396 	}
2397 
2398 	/* unmap the received buffer */
2399 	old_map = rx->info[idx].map;
2400 	bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2401 	bus_dmamap_unload(rx->dmat, old_map);
2402 
2403 	/* swap the bus_dmamap_t's */
2404 	rx->info[idx].map = rx->extra_map;
2405 	rx->extra_map = old_map;
2406 
2407 	/* mcp implicitly skips 1st 2 bytes so that packet is properly
2408 	 * aligned */
2409 	m->m_data += MXGEFW_PAD;
2410 
2411 	m->m_pkthdr.rcvif = ifp;
2412 	m->m_len = m->m_pkthdr.len = len;
2413 	ss->ipackets++;
2414 	eh = mtod(m, struct ether_header *);
2415 	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
2416 		mxge_vlan_tag_remove(m, &csum);
2417 	}
2418 	/* if the checksum is valid, mark it in the mbuf header */
2419 	if (sc->csum_flag && (0 == (tcpudp_csum = mxge_rx_csum(m, csum)))) {
2420 		if (sc->lro_cnt && (0 == mxge_lro_rx(ss, m, csum)))
2421 			return;
2422 		/* otherwise, it was a UDP frame, or a TCP frame which
2423 		   we could not do LRO on.  Tell the stack that the
2424 		   checksum is good */
2425 		m->m_pkthdr.csum_data = 0xffff;
2426 		m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR | CSUM_DATA_VALID;
2427 	}
2428 	/* pass the frame up the stack */
2429 	(*ifp->if_input)(ifp, m);
2430 }
2431 
2432 static inline void
2433 mxge_clean_rx_done(struct mxge_slice_state *ss)
2434 {
2435 	mxge_rx_done_t *rx_done = &ss->rx_done;
2436 	struct lro_entry *lro;
2437 	int limit = 0;
2438 	uint16_t length;
2439 	uint16_t checksum;
2440 
2441 
2442 	while (rx_done->entry[rx_done->idx].length != 0) {
2443 		length = ntohs(rx_done->entry[rx_done->idx].length);
2444 		rx_done->entry[rx_done->idx].length = 0;
2445 		checksum = rx_done->entry[rx_done->idx].checksum;
2446 		if (length <= (MHLEN - MXGEFW_PAD))
2447 			mxge_rx_done_small(ss, length, checksum);
2448 		else
2449 			mxge_rx_done_big(ss, length, checksum);
2450 		rx_done->cnt++;
2451 		rx_done->idx = rx_done->cnt & rx_done->mask;
2452 
2453 		/* limit potential for livelock */
2454 		if (__predict_false(++limit > rx_done->mask / 2))
2455 			break;
2456 	}
2457 	while (!SLIST_EMPTY(&ss->lro_active)) {
2458 		lro = SLIST_FIRST(&ss->lro_active);
2459 		SLIST_REMOVE_HEAD(&ss->lro_active, next);
2460 		mxge_lro_flush(ss, lro);
2461 	}
2462 }
2463 
2464 
2465 static inline void
2466 mxge_tx_done(struct mxge_slice_state *ss, uint32_t mcp_idx)
2467 {
2468 	struct ifnet *ifp;
2469 	mxge_tx_ring_t *tx;
2470 	struct mbuf *m;
2471 	bus_dmamap_t map;
2472 	int idx;
2473 
2474 	tx = &ss->tx;
2475 	ifp = ss->sc->ifp;
2476 	while (tx->pkt_done != mcp_idx) {
2477 		idx = tx->done & tx->mask;
2478 		tx->done++;
2479 		m = tx->info[idx].m;
2480 		/* mbuf and DMA map only attached to the first
2481 		   segment per-mbuf */
2482 		if (m != NULL) {
2483 			ifp->if_opackets++;
2484 			tx->info[idx].m = NULL;
2485 			map = tx->info[idx].map;
2486 			bus_dmamap_unload(tx->dmat, map);
2487 			m_freem(m);
2488 		}
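		/* the flag marks the last descriptor of a request;
		 * counting flags advances pkt_done in step with the
		 * firmware's send_done_count */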
2489 		if (tx->info[idx].flag) {
2490 			tx->info[idx].flag = 0;
2491 			tx->pkt_done++;
2492 		}
2493 	}
2494 
2495 	/* If we have space, clear IFF_OACTIVE to tell the stack that
2496 	   it's OK to send packets */
2497 
2498 	if (ifp->if_drv_flags & IFF_DRV_OACTIVE &&
2499 	    tx->req - tx->done < (tx->mask + 1)/4) {
2500 		mtx_lock(&ss->tx.mtx);
2501 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
2502 		ss->tx.wake++;
2503 		mxge_start_locked(ss);
2504 		mtx_unlock(&ss->tx.mtx);
2505 	}
2506 }
2507 
2508 static struct mxge_media_type mxge_media_types[] =
2509 {
2510 	{IFM_10G_CX4,	0x7f, 		"10GBASE-CX4 (module)"},
2511 	{IFM_10G_SR, 	(1 << 7),	"10GBASE-SR"},
2512 	{IFM_10G_LR, 	(1 << 6),	"10GBASE-LR"},
2513 	{0,		(1 << 5),	"10GBASE-ER"},
2514 	{0,		(1 << 4),	"10GBASE-LRM"},
2515 	{0,		(1 << 3),	"10GBASE-SW"},
2516 	{0,		(1 << 2),	"10GBASE-LW"},
2517 	{0,		(1 << 1),	"10GBASE-EW"},
2518 	{0,		(1 << 0),	"Reserved"}
2519 };
2520 
2521 static void
2522 mxge_set_media(mxge_softc_t *sc, int type)
2523 {
2524 	sc->media_flags |= type;
2525 	ifmedia_add(&sc->media, sc->media_flags, 0, NULL);
2526 	ifmedia_set(&sc->media, sc->media_flags);
2527 }
2528 
2529 
2530 /*
2531  * Determine the media type for a NIC.  Some XFPs will identify
2532  * themselves only when their link is up, so this is initiated via a
2533  * link up interrupt.  However, this can potentially take up to
2534  * several milliseconds, so it is run via the watchdog routine, rather
2535  * than in the interrupt handler itself.   This need only be done
2536  * once, not each time the link is up.
2537  */
2538 static void
2539 mxge_media_probe(mxge_softc_t *sc)
2540 {
2541 	mxge_cmd_t cmd;
2542 	char *ptr;
2543 	int i, err, ms;
2544 
2545 	sc->need_media_probe = 0;
2546 
2547 	/* if we've already set a media type, we're done */
2548 	if (sc->media_flags  != (IFM_ETHER | IFM_AUTO))
2549 		return;
2550 
2551 	/*
2552 	 * parse the product code to determine the interface type
2553 	 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2554 	 * after the 3rd dash in the driver's cached copy of the
2555 	 * EEPROM's product code string.
2556 	 */
2557 	ptr = sc->product_code_string;
2558 	if (ptr == NULL) {
2559 		device_printf(sc->dev, "Missing product code\n");
		return;	/* ptr is dereferenced just below */
2560 	}
2561 
2562 	for (i = 0; i < 3; i++, ptr++) {
2563 		ptr = index(ptr, '-');
2564 		if (ptr == NULL) {
2565 			device_printf(sc->dev,
2566 				      "only %d dashes in PC?!?\n", i);
2567 			return;
2568 		}
2569 	}
2570 	if (*ptr == 'C') {
2571 		mxge_set_media(sc, IFM_10G_CX4);
2572 		return;
2573 	}
2574 	else if (*ptr == 'Q') {
2575 		device_printf(sc->dev, "Quad Ribbon Fiber Media\n");
2576 		/* FreeBSD has no media type for Quad ribbon fiber */
2577 		return;
2578 	}
2579 
2580 	if (*ptr != 'R') {
2581 		device_printf(sc->dev, "Unknown media type: %c\n", *ptr);
2582 		return;
2583 	}
2584 
2585 	/*
2586 	 * At this point we know the NIC has an XFP cage, so now we
2587 	 * try to determine what is in the cage by using the
2588 	 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2589 	 * register.  We read just one byte, which may take over
2590 	 * a millisecond.
2591 	 */
2592 
2593 	cmd.data0 = 0;	 /* just fetch 1 byte, not all 256 */
2594 	cmd.data1 = MXGE_XFP_COMPLIANCE_BYTE; /* the byte we want */
2595 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_I2C_READ, &cmd);
2596 	if (err == MXGEFW_CMD_ERROR_XFP_FAILURE) {
2597 		device_printf(sc->dev, "failed to read XFP\n");
2598 	}
2599 	if (err == MXGEFW_CMD_ERROR_XFP_ABSENT) {
2600 		device_printf(sc->dev, "Type R with no XFP!?!?\n");
2601 	}
2602 	if (err != MXGEFW_CMD_OK) {
2603 		return;
2604 	}
2605 
2606 	/* now we wait for the data to be cached */
2607 	cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2608 	err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2609 	for (ms = 0; (err == EBUSY) && (ms < 50); ms++) {
2610 		DELAY(1000);
2611 		cmd.data0 = MXGE_XFP_COMPLIANCE_BYTE;
2612 		err = mxge_send_cmd(sc, MXGEFW_CMD_XFP_BYTE, &cmd);
2613 	}
2614 	if (err != MXGEFW_CMD_OK) {
2615 		device_printf(sc->dev, "failed to read XFP (%d, %dms)\n",
2616 			      err, ms);
2617 		return;
2618 	}
2619 
2620 	if (cmd.data0 == mxge_media_types[0].bitmask) {
2621 		if (mxge_verbose)
2622 			device_printf(sc->dev, "XFP:%s\n",
2623 				      mxge_media_types[0].name);
2624 		mxge_set_media(sc, IFM_10G_CX4);
2625 		return;
2626 	}
2627 	for (i = 1;
2628 	     i < sizeof (mxge_media_types) / sizeof (mxge_media_types[0]);
2629 	     i++) {
2630 		if (cmd.data0 & mxge_media_types[i].bitmask) {
2631 			if (mxge_verbose)
2632 				device_printf(sc->dev, "XFP:%s\n",
2633 					      mxge_media_types[i].name);
2634 
2635 			mxge_set_media(sc, mxge_media_types[i].flag);
2636 			return;
2637 		}
2638 	}
2639 	device_printf(sc->dev, "XFP media 0x%x unknown\n", cmd.data0);
2640 
2641 	return;
2642 }
2643 
2644 static void
2645 mxge_intr(void *arg)
2646 {
2647 	struct mxge_slice_state *ss = arg;
2648 	mxge_softc_t *sc = ss->sc;
2649 	mcp_irq_data_t *stats = ss->fw_stats;
2650 	mxge_tx_ring_t *tx = &ss->tx;
2651 	mxge_rx_done_t *rx_done = &ss->rx_done;
2652 	uint32_t send_done_count;
2653 	uint8_t valid;
2654 
2655 
2656 	/* an interrupt on a non-zero slice is implicitly valid
2657 	   since MSI-X irqs are not shared */
2658 	if (ss != sc->ss) {
2659 		mxge_clean_rx_done(ss);
2660 		*ss->irq_claim = be32toh(3);
2661 		return;
2662 	}
2663 
2664 	/* make sure the DMA has finished */
2665 	if (!stats->valid) {
2666 		return;
2667 	}
2668 	valid = stats->valid;
2669 
2670 	if (sc->legacy_irq) {
2671 		/* lower legacy IRQ  */
2672 		*sc->irq_deassert = 0;
2673 		if (!mxge_deassert_wait)
2674 			/* don't wait for confirmation that irq is low */
2675 			stats->valid = 0;
2676 	} else {
2677 		stats->valid = 0;
2678 	}
2679 
2680 	/* loop while waiting for legacy irq deassertion */
2681 	do {
2682 		/* check for transmit completes and receives */
2683 		send_done_count = be32toh(stats->send_done_count);
2684 		while ((send_done_count != tx->pkt_done) ||
2685 		       (rx_done->entry[rx_done->idx].length != 0)) {
2686 			mxge_tx_done(ss, (int)send_done_count);
2687 			mxge_clean_rx_done(ss);
2688 			send_done_count = be32toh(stats->send_done_count);
2689 		}
2690 		if (sc->legacy_irq && mxge_deassert_wait)
2691 			wmb();
2692 	} while (*((volatile uint8_t *) &stats->valid));
2693 
2694 	if (__predict_false(stats->stats_updated)) {
2695 		if (sc->link_state != stats->link_up) {
2696 			sc->link_state = stats->link_up;
2697 			if (sc->link_state) {
2698 				if_link_state_change(sc->ifp, LINK_STATE_UP);
2699 				if (mxge_verbose)
2700 					device_printf(sc->dev, "link up\n");
2701 			} else {
2702 				if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2703 				if (mxge_verbose)
2704 					device_printf(sc->dev, "link down\n");
2705 			}
2706 			sc->need_media_probe = 1;
2707 		}
2708 		if (sc->rdma_tags_available !=
2709 		    be32toh(stats->rdma_tags_available)) {
2710 			sc->rdma_tags_available =
2711 				be32toh(stats->rdma_tags_available);
2712 			device_printf(sc->dev, "RDMA timed out! %d tags "
2713 				      "left\n", sc->rdma_tags_available);
2714 		}
2715 
2716 		if (stats->link_down) {
2717 			sc->down_cnt += stats->link_down;
2718 			sc->link_state = 0;
2719 			if_link_state_change(sc->ifp, LINK_STATE_DOWN);
2720 		}
2721 	}
2722 
2723 	/* check to see if we have rx token to pass back */
2724 	if (valid & 0x1)
2725 	    *ss->irq_claim = be32toh(3);
2726 	*(ss->irq_claim + 1) = be32toh(3);
2727 }
2728 
2729 static void
2730 mxge_init(void *arg)
2731 {
2732 }
2733 
2734 
2735 
2736 static void
2737 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2738 {
2739 	struct lro_entry *lro_entry;
2740 	int i;
2741 
2742 	while (!SLIST_EMPTY(&ss->lro_free)) {
2743 		lro_entry = SLIST_FIRST(&ss->lro_free);
2744 		SLIST_REMOVE_HEAD(&ss->lro_free, next);
2745 		free(lro_entry, M_DEVBUF);
2746 	}
2747 
2748 	for (i = 0; i <= ss->rx_big.mask; i++) {
2749 		if (ss->rx_big.info[i].m == NULL)
2750 			continue;
2751 		bus_dmamap_unload(ss->rx_big.dmat,
2752 				  ss->rx_big.info[i].map);
2753 		m_freem(ss->rx_big.info[i].m);
2754 		ss->rx_big.info[i].m = NULL;
2755 	}
2756 
2757 	for (i = 0; i <= ss->rx_small.mask; i++) {
2758 		if (ss->rx_small.info[i].m == NULL)
2759 			continue;
2760 		bus_dmamap_unload(ss->rx_small.dmat,
2761 				  ss->rx_small.info[i].map);
2762 		m_freem(ss->rx_small.info[i].m);
2763 		ss->rx_small.info[i].m = NULL;
2764 	}
2765 
2766 	/* transmit ring used only on the first slice */
2767 	if (ss->tx.info == NULL)
2768 		return;
2769 
2770 	for (i = 0; i <= ss->tx.mask; i++) {
2771 		ss->tx.info[i].flag = 0;
2772 		if (ss->tx.info[i].m == NULL)
2773 			continue;
2774 		bus_dmamap_unload(ss->tx.dmat,
2775 				  ss->tx.info[i].map);
2776 		m_freem(ss->tx.info[i].m);
2777 		ss->tx.info[i].m = NULL;
2778 	}
2779 }
2780 
2781 static void
2782 mxge_free_mbufs(mxge_softc_t *sc)
2783 {
2784 	int slice;
2785 
2786 	for (slice = 0; slice < sc->num_slices; slice++)
2787 		mxge_free_slice_mbufs(&sc->ss[slice]);
2788 }
2789 
2790 static void
2791 mxge_free_slice_rings(struct mxge_slice_state *ss)
2792 {
2793 	int i;
2794 
2795 
2796 	if (ss->rx_done.entry != NULL)
2797 		mxge_dma_free(&ss->rx_done.dma);
2798 	ss->rx_done.entry = NULL;
2799 
2800 	if (ss->tx.req_bytes != NULL)
2801 		free(ss->tx.req_bytes, M_DEVBUF);
2802 	ss->tx.req_bytes = NULL;
2803 
2804 	if (ss->tx.seg_list != NULL)
2805 		free(ss->tx.seg_list, M_DEVBUF);
2806 	ss->tx.seg_list = NULL;
2807 
2808 	if (ss->rx_small.shadow != NULL)
2809 		free(ss->rx_small.shadow, M_DEVBUF);
2810 	ss->rx_small.shadow = NULL;
2811 
2812 	if (ss->rx_big.shadow != NULL)
2813 		free(ss->rx_big.shadow, M_DEVBUF);
2814 	ss->rx_big.shadow = NULL;
2815 
2816 	if (ss->tx.info != NULL) {
2817 		if (ss->tx.dmat != NULL) {
2818 			for (i = 0; i <= ss->tx.mask; i++) {
2819 				bus_dmamap_destroy(ss->tx.dmat,
2820 						   ss->tx.info[i].map);
2821 			}
2822 			bus_dma_tag_destroy(ss->tx.dmat);
2823 		}
2824 		free(ss->tx.info, M_DEVBUF);
2825 	}
2826 	ss->tx.info = NULL;
2827 
2828 	if (ss->rx_small.info != NULL) {
2829 		if (ss->rx_small.dmat != NULL) {
2830 			for (i = 0; i <= ss->rx_small.mask; i++) {
2831 				bus_dmamap_destroy(ss->rx_small.dmat,
2832 						   ss->rx_small.info[i].map);
2833 			}
2834 			bus_dmamap_destroy(ss->rx_small.dmat,
2835 					   ss->rx_small.extra_map);
2836 			bus_dma_tag_destroy(ss->rx_small.dmat);
2837 		}
2838 		free(ss->rx_small.info, M_DEVBUF);
2839 	}
2840 	ss->rx_small.info = NULL;
2841 
2842 	if (ss->rx_big.info != NULL) {
2843 		if (ss->rx_big.dmat != NULL) {
2844 			for (i = 0; i <= ss->rx_big.mask; i++) {
2845 				bus_dmamap_destroy(ss->rx_big.dmat,
2846 						   ss->rx_big.info[i].map);
2847 			}
2848 			bus_dmamap_destroy(ss->rx_big.dmat,
2849 					   ss->rx_big.extra_map);
2850 			bus_dma_tag_destroy(ss->rx_big.dmat);
2851 		}
2852 		free(ss->rx_big.info, M_DEVBUF);
2853 	}
2854 	ss->rx_big.info = NULL;
2855 }
2856 
2857 static void
2858 mxge_free_rings(mxge_softc_t *sc)
2859 {
2860 	int slice;
2861 
2862 	for (slice = 0; slice < sc->num_slices; slice++)
2863 		mxge_free_slice_rings(&sc->ss[slice]);
2864 }
2865 
2866 static int
2867 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2868 		       int tx_ring_entries)
2869 {
2870 	mxge_softc_t *sc = ss->sc;
2871 	size_t bytes;
2872 	int err, i;
2873 
2874 	err = ENOMEM;
2875 
2876 	/* allocate per-slice receive resources */
2877 
2878 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
2879 	ss->rx_done.mask = (2 * rx_ring_entries) - 1;
2880 
2881 	/* allocate the rx shadow rings */
2882 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2883 	ss->rx_small.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2884 	if (ss->rx_small.shadow == NULL)
2885 		return err;
2886 
2887 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2888 	ss->rx_big.shadow = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2889 	if (ss->rx_big.shadow == NULL)
2890 		return err;
2891 
2892 	/* allocate the rx host info rings */
2893 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2894 	ss->rx_small.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2895 	if (ss->rx_small.info == NULL)
2896 		return err;
2897 
2898 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2899 	ss->rx_big.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
2900 	if (ss->rx_big.info == NULL)
2901 		return err;
2902 
2903 	/* allocate the rx busdma resources */
2904 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2905 				 1,			/* alignment */
2906 				 4096,			/* boundary */
2907 				 BUS_SPACE_MAXADDR,	/* low */
2908 				 BUS_SPACE_MAXADDR,	/* high */
2909 				 NULL, NULL,		/* filter */
2910 				 MHLEN,			/* maxsize */
2911 				 1,			/* num segs */
2912 				 MHLEN,			/* maxsegsize */
2913 				 BUS_DMA_ALLOCNOW,	/* flags */
2914 				 NULL, NULL,		/* lock */
2915 				 &ss->rx_small.dmat);	/* tag */
2916 	if (err != 0) {
2917 		device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
2918 			      err);
2919 		return err;
2920 	}
2921 
2922 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
2923 				 1,			/* alignment */
2924 #if MXGE_VIRT_JUMBOS
2925 				 4096,			/* boundary */
2926 #else
2927 				 0,			/* boundary */
2928 #endif
2929 				 BUS_SPACE_MAXADDR,	/* low */
2930 				 BUS_SPACE_MAXADDR,	/* high */
2931 				 NULL, NULL,		/* filter */
2932 				 3*4096,		/* maxsize */
2933 #if MXGE_VIRT_JUMBOS
2934 				 3,			/* num segs */
2935 				 4096,			/* maxsegsize*/
2936 #else
2937 				 1,			/* num segs */
2938 				 MJUM9BYTES,		/* maxsegsize*/
2939 #endif
2940 				 BUS_DMA_ALLOCNOW,	/* flags */
2941 				 NULL, NULL,		/* lock */
2942 				 &ss->rx_big.dmat);	/* tag */
2943 	if (err != 0) {
2944 		device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
2945 			      err);
2946 		return err;
2947 	}
2948 	for (i = 0; i <= ss->rx_small.mask; i++) {
2949 		err = bus_dmamap_create(ss->rx_small.dmat, 0,
2950 					&ss->rx_small.info[i].map);
2951 		if (err != 0) {
2952 			device_printf(sc->dev, "Err %d  rx_small dmamap\n",
2953 				      err);
2954 			return err;
2955 		}
2956 	}
2957 	err = bus_dmamap_create(ss->rx_small.dmat, 0,
2958 				&ss->rx_small.extra_map);
2959 	if (err != 0) {
2960 		device_printf(sc->dev, "Err %d extra rx_small dmamap\n",
2961 			      err);
2962 		return err;
2963 	}
2964 
2965 	for (i = 0; i <= ss->rx_big.mask; i++) {
2966 		err = bus_dmamap_create(ss->rx_big.dmat, 0,
2967 					&ss->rx_big.info[i].map);
2968 		if (err != 0) {
2969 			device_printf(sc->dev, "Err %d  rx_big dmamap\n",
2970 				      err);
2971 			return err;
2972 		}
2973 	}
2974 	err = bus_dmamap_create(ss->rx_big.dmat, 0,
2975 				&ss->rx_big.extra_map);
2976 	if (err != 0) {
2977 		device_printf(sc->dev, "Err %d extra rx_big dmamap\n",
2978 			      err);
2979 		return err;
2980 	}
2981 
2982 	/* now allocate TX resources */
2983 
2984 	/* only use a single TX ring for now */
2985 	if (ss != ss->sc->ss)
2986 		return 0;
2987 
2988 	ss->tx.mask = tx_ring_entries - 1;
2989 	ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
2990 
2991 
2992 	/* allocate the tx request copy block */
2993 	bytes = 8 +
2994 		sizeof (*ss->tx.req_list) * (ss->tx.max_desc + 4);
2995 	ss->tx.req_bytes = malloc(bytes, M_DEVBUF, M_WAITOK);
2996 	if (ss->tx.req_bytes == NULL)
2997 		return err;
2998 	/* ensure req_list entries are aligned to 8 bytes */
2999 	ss->tx.req_list = (mcp_kreq_ether_send_t *)
3000 		((unsigned long)(ss->tx.req_bytes + 7) & ~7UL);
3001 
3002 	/* allocate the tx busdma segment list */
3003 	bytes = sizeof (*ss->tx.seg_list) * ss->tx.max_desc;
3004 	ss->tx.seg_list = (bus_dma_segment_t *)
3005 		malloc(bytes, M_DEVBUF, M_WAITOK);
3006 	if (ss->tx.seg_list == NULL)
3007 		return err;
3008 
3009 	/* allocate the tx host info ring */
3010 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
3011 	ss->tx.info = malloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3012 	if (ss->tx.info == NULL)
3013 		return err;
3014 
3015 	/* allocate the tx busdma resources */
3016 	err = bus_dma_tag_create(sc->parent_dmat,	/* parent */
3017 				 1,			/* alignment */
3018 				 sc->tx_boundary,	/* boundary */
3019 				 BUS_SPACE_MAXADDR,	/* low */
3020 				 BUS_SPACE_MAXADDR,	/* high */
3021 				 NULL, NULL,		/* filter */
3022 				 65536 + 256,		/* maxsize */
3023 				 ss->tx.max_desc - 2,	/* num segs */
3024 				 sc->tx_boundary,	/* maxsegsz */
3025 				 BUS_DMA_ALLOCNOW,	/* flags */
3026 				 NULL, NULL,		/* lock */
3027 				 &ss->tx.dmat);		/* tag */
3028 
3029 	if (err != 0) {
3030 		device_printf(sc->dev, "Err %d allocating tx dmat\n",
3031 			      err);
3032 		return err;
3033 	}
3034 
3035 	/* now use these tags to setup dmamaps for each slot
3036 	   in the ring */
3037 	for (i = 0; i <= ss->tx.mask; i++) {
3038 		err = bus_dmamap_create(ss->tx.dmat, 0,
3039 					&ss->tx.info[i].map);
3040 		if (err != 0) {
3041 			device_printf(sc->dev, "Err %d  tx dmamap\n",
3042 				      err);
3043 			return err;
3044 		}
3045 	}
3046 	return 0;
3047 
3048 }
3049 
3050 static int
3051 mxge_alloc_rings(mxge_softc_t *sc)
3052 {
3053 	mxge_cmd_t cmd;
3054 	int tx_ring_size;
3055 	int tx_ring_entries, rx_ring_entries;
3056 	int err, slice;
3057 
3058 	/* get ring sizes */
3059 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3060 	tx_ring_size = cmd.data0;
3061 	if (err != 0) {
3062 		device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3063 		goto abort;
3064 	}
3065 
3066 	tx_ring_entries = tx_ring_size / sizeof (mcp_kreq_ether_send_t);
3067 	rx_ring_entries = sc->rx_ring_size / sizeof (mcp_dma_addr_t);
3068 	IFQ_SET_MAXLEN(&sc->ifp->if_snd, tx_ring_entries - 1);
3069 	sc->ifp->if_snd.ifq_drv_maxlen = sc->ifp->if_snd.ifq_maxlen;
3070 	IFQ_SET_READY(&sc->ifp->if_snd);
3071 
3072 	for (slice = 0; slice < sc->num_slices; slice++) {
3073 		err = mxge_alloc_slice_rings(&sc->ss[slice],
3074 					     rx_ring_entries,
3075 					     tx_ring_entries);
3076 		if (err != 0)
3077 			goto abort;
3078 	}
3079 	return 0;
3080 
3081 abort:
3082 	mxge_free_rings(sc);
3083 	return err;
3084 
3085 }
3086 
3087 
3088 static void
3089 mxge_choose_params(int mtu, int *big_buf_size, int *cl_size, int *nbufs)
3090 {
3091 	int bufsize = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + MXGEFW_PAD;
3092 
3093 	if (bufsize < MCLBYTES) {
3094 		/* easy, everything fits in a single buffer */
3095 		*big_buf_size = MCLBYTES;
3096 		*cl_size = MCLBYTES;
3097 		*nbufs = 1;
3098 		return;
3099 	}
3100 
3101 	if (bufsize < MJUMPAGESIZE) {
3102 		/* still easy, everything still fits in a single buffer */
3103 		*big_buf_size = MJUMPAGESIZE;
3104 		*cl_size = MJUMPAGESIZE;
3105 		*nbufs = 1;
3106 		return;
3107 	}
3108 #if MXGE_VIRT_JUMBOS
3109 	/* now we need to use virtually contiguous buffers */
3110 	*cl_size = MJUM9BYTES;
3111 	*big_buf_size = 4096;
3112 	*nbufs = mtu / 4096 + 1;
3113 	/* needs to be a power of two, so round up */
3114 	if (*nbufs == 3)
3115 		*nbufs = 4;
3116 #else
3117 	*cl_size = MJUM9BYTES;
3118 	*big_buf_size = MJUM9BYTES;
3119 	*nbufs = 1;
3120 #endif
3121 }
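/*
 * Worked example (hypothetical MTU): a 9000-byte MTU gives bufsize =
 * 9000 + 14 + 4 + 2 = 9020, too big for MJUMPAGESIZE on 4KB-page
 * systems, so the driver uses 9KB clusters; when MXGE_VIRT_JUMBOS is
 * set, it instead uses mtu/4096 + 1 = 3 chunks per frame, rounded up
 * to 4 so that the count stays a power of two.
 */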
3122 
3123 static int
3124 mxge_slice_open(struct mxge_slice_state *ss, int nbufs, int cl_size)
3125 {
3126 	mxge_softc_t *sc;
3127 	mxge_cmd_t cmd;
3128 	bus_dmamap_t map;
3129 	struct lro_entry *lro_entry;
3130 	int err, i, slice;
3131 
3132 
3133 	sc = ss->sc;
3134 	slice = ss - sc->ss;
3135 
3136 	SLIST_INIT(&ss->lro_free);
3137 	SLIST_INIT(&ss->lro_active);
3138 
3139 	for (i = 0; i < sc->lro_cnt; i++) {
3140 		lro_entry = (struct lro_entry *)
3141 			malloc(sizeof (*lro_entry), M_DEVBUF,
3142 			       M_NOWAIT | M_ZERO);
3143 		if (lro_entry == NULL) {
3144 			sc->lro_cnt = i;
3145 			break;
3146 		}
3147 		SLIST_INSERT_HEAD(&ss->lro_free, lro_entry, next);
3148 	}
3149 	/* get the lanai pointers to the send and receive rings */
3150 
3151 	err = 0;
3152 	/* We currently only send from the first slice */
3153 	if (slice == 0) {
3154 		cmd.data0 = slice;
3155 		err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3156 		ss->tx.lanai =
3157 			(volatile mcp_kreq_ether_send_t *)(sc->sram + cmd.data0);
3158 	}
3159 	cmd.data0 = slice;
3160 	err |= mxge_send_cmd(sc,
3161 			     MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3162 	ss->rx_small.lanai =
3163 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3164 	cmd.data0 = slice;
3165 	err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3166 	ss->rx_big.lanai =
3167 		(volatile mcp_kreq_ether_recv_t *)(sc->sram + cmd.data0);
3168 
3169 	if (err != 0) {
3170 		device_printf(sc->dev,
3171 			      "failed to get ring sizes or locations\n");
3172 		return EIO;
3173 	}
3174 
3175 	/* stock receive rings */
3176 	for (i = 0; i <= ss->rx_small.mask; i++) {
3177 		map = ss->rx_small.info[i].map;
3178 		err = mxge_get_buf_small(ss, map, i);
3179 		if (err) {
3180 			device_printf(sc->dev, "alloced %d/%d smalls\n",
3181 				      i, ss->rx_small.mask + 1);
3182 			return ENOMEM;
3183 		}
3184 	}
3185 	for (i = 0; i <= ss->rx_big.mask; i++) {
3186 		ss->rx_big.shadow[i].addr_low = 0xffffffff;
3187 		ss->rx_big.shadow[i].addr_high = 0xffffffff;
3188 	}
3189 	ss->rx_big.nbufs = nbufs;
3190 	ss->rx_big.cl_size = cl_size;
3191 	for (i = 0; i <= ss->rx_big.mask; i += ss->rx_big.nbufs) {
3192 		map = ss->rx_big.info[i].map;
3193 		err = mxge_get_buf_big(ss, map, i);
3194 		if (err) {
3195 			device_printf(sc->dev, "alloced %d/%d bigs\n",
3196 				      i, ss->rx_big.mask + 1);
3197 			return ENOMEM;
3198 		}
3199 	}
3200 	return 0;
3201 }
3202 
3203 static int
3204 mxge_open(mxge_softc_t *sc)
3205 {
3206 	mxge_cmd_t cmd;
3207 	int err, big_bytes, nbufs, slice, cl_size, i;
3208 	bus_addr_t bus;
3209 	volatile uint8_t *itable;
3210 
3211 	/* Copy the MAC address in case it was overridden */
3212 	bcopy(IF_LLADDR(sc->ifp), sc->mac_addr, ETHER_ADDR_LEN);
3213 
3214 	err = mxge_reset(sc, 1);
3215 	if (err != 0) {
3216 		device_printf(sc->dev, "failed to reset\n");
3217 		return EIO;
3218 	}
3219 
3220 	if (sc->num_slices > 1) {
3221 		/* setup the indirection table */
3222 		cmd.data0 = sc->num_slices;
3223 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
3224 				    &cmd);
3225 
3226 		err |= mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_TABLE_OFFSET,
3227 				     &cmd);
3228 		if (err != 0) {
3229 			device_printf(sc->dev,
3230 				      "failed to setup rss tables\n");
3231 			return err;
3232 		}
3233 
3234 		/* just enable an identity mapping */
3235 		itable = sc->sram + cmd.data0;
3236 		for (i = 0; i < sc->num_slices; i++)
3237 			itable[i] = (uint8_t)i;
3238 
3239 		cmd.data0 = 1;
3240 		cmd.data1 = mxge_rss_hash_type;
3241 		err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3242 		if (err != 0) {
3243 			device_printf(sc->dev, "failed to enable slices\n");
3244 			return err;
3245 		}
3246 	}
3247 
3248 
3249 	mxge_choose_params(sc->ifp->if_mtu, &big_bytes, &cl_size, &nbufs);
3250 
3251 	cmd.data0 = nbufs;
3252 	err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
3253 			    &cmd);
3254 	/* error is only meaningful if we're trying to set
3255 	   MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1 */
3256 	if (err && nbufs > 1) {
3257 		device_printf(sc->dev,
3258 			      "Failed to set always-use-n to %d\n",
3259 			      nbufs);
3260 		return EIO;
3261 	}
3262 	/* Give the firmware the mtu and the big and small buffer
3263 	   sizes.  The firmware wants the big buf size to be a power
3264 	   of two. Luckily, FreeBSD's clusters are powers of two */
3265 	cmd.data0 = sc->ifp->if_mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3266 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3267 	cmd.data0 = MHLEN - MXGEFW_PAD;
3268 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE,
3269 			     &cmd);
3270 	cmd.data0 = big_bytes;
3271 	err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3272 
3273 	if (err != 0) {
3274 		device_printf(sc->dev, "failed to setup params\n");
3275 		goto abort;
3276 	}
3277 
3278 	/* Now give the firmware the pointer to the stats block */
3279 	cmd.data0 = MXGE_LOWPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3280 	cmd.data1 = MXGE_HIGHPART_TO_U32(sc->ss->fw_stats_dma.bus_addr);
3281 	cmd.data2 = sizeof(struct mcp_irq_data);
3282 	err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3283 
3284 	if (err != 0) {
3285 		bus = sc->ss->fw_stats_dma.bus_addr;
3286 		bus += offsetof(struct mcp_irq_data, send_done_count);
3287 		cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3288 		cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3289 		err = mxge_send_cmd(sc,
3290 				    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3291 				    &cmd);
3292 		/* Firmware cannot support multicast without STATS_DMA_V2 */
3293 		sc->fw_multicast_support = 0;
3294 	} else {
3295 		sc->fw_multicast_support = 1;
3296 	}
3297 
3298 	if (err != 0) {
3299 		device_printf(sc->dev, "failed to setup params\n");
3300 		goto abort;
3301 	}
3302 
3303 	for (slice = 0; slice < sc->num_slices; slice++) {
3304 		err = mxge_slice_open(&sc->ss[slice], nbufs, cl_size);
3305 		if (err != 0) {
3306 			device_printf(sc->dev, "couldn't open slice %d\n",
3307 				      slice);
3308 			goto abort;
3309 		}
3310 	}
3311 
3312 	/* Finally, start the firmware running */
3313 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3314 	if (err) {
3315 		device_printf(sc->dev, "Couldn't bring up link\n");
3316 		goto abort;
3317 	}
3318 	sc->ifp->if_drv_flags |= IFF_DRV_RUNNING;
3319 	sc->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
3320 	callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3321 
3322 	return 0;
3323 
3324 
3325 abort:
3326 	mxge_free_mbufs(sc);
3327 
3328 	return err;
3329 }
3330 
3331 static int
3332 mxge_close(mxge_softc_t *sc)
3333 {
3334 	mxge_cmd_t cmd;
3335 	int err, old_down_cnt;
3336 
3337 	callout_stop(&sc->co_hdl);
3338 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
3339 	old_down_cnt = sc->down_cnt;
3340 	wmb();
3341 	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3342 	if (err) {
3343 		device_printf(sc->dev, "Couldn't bring down link\n");
3344 	}
3345 	if (old_down_cnt == sc->down_cnt) {
3346 		/* wait for down irq */
3347 		DELAY(10 * sc->intr_coal_delay);
3348 	}
3349 	wmb();
3350 	if (old_down_cnt == sc->down_cnt) {
3351 		device_printf(sc->dev, "never got down irq\n");
3352 	}
3353 
3354 	mxge_free_mbufs(sc);
3355 
3356 	return 0;
3357 }
3358 
3359 static void
3360 mxge_setup_cfg_space(mxge_softc_t *sc)
3361 {
3362 	device_t dev = sc->dev;
3363 	int reg;
3364 	uint16_t cmd, lnk, pectl;
3365 
3366 	/* find the PCIe link width and set max read request to 4KB */
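	/* (offset 0x12 within the PCIe capability is the Link Status
	 *  register, bits 9:4 of which hold the negotiated width;
	 *  offset 0x8 is Device Control, where the value 5 in bits
	 *  14:12 selects a 4096-byte max read request size) */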
3367 	if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3368 		lnk = pci_read_config(dev, reg + 0x12, 2);
3369 		sc->link_width = (lnk >> 4) & 0x3f;
3370 
3371 		pectl = pci_read_config(dev, reg + 0x8, 2);
3372 		pectl = (pectl & ~0x7000) | (5 << 12);
3373 		pci_write_config(dev, reg + 0x8, pectl, 2);
3374 	}
3375 
3376 	/* Enable DMA and Memory space access */
3377 	pci_enable_busmaster(dev);
3378 	cmd = pci_read_config(dev, PCIR_COMMAND, 2);
3379 	cmd |= PCIM_CMD_MEMEN;
3380 	pci_write_config(dev, PCIR_COMMAND, cmd, 2);
3381 }
3382 
3383 static uint32_t
3384 mxge_read_reboot(mxge_softc_t *sc)
3385 {
3386 	device_t dev = sc->dev;
3387 	uint32_t vs;
3388 
3389 	/* find the vendor specific offset */
3390 	if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3391 		device_printf(sc->dev,
3392 			      "could not find vendor specific offset\n");
3393 		return (uint32_t)-1;
3394 	}
3395 	/* enable read32 mode */
3396 	pci_write_config(dev, vs + 0x10, 0x3, 1);
3397 	/* tell NIC which register to read */
3398 	pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3399 	return (pci_read_config(dev, vs + 0x14, 4));
3400 }
3401 
3402 static int
3403 mxge_watchdog_reset(mxge_softc_t *sc)
3404 {
3405 	struct pci_devinfo *dinfo;
3406 	int err;
3407 	uint32_t reboot;
3408 	uint16_t cmd;
3409 
3410 	err = ENXIO;
3411 
3412 	device_printf(sc->dev, "Watchdog reset!\n");
3413 
3414 	/*
3415 	 * check to see if the NIC rebooted.  If it did, then all of
3416 	 * PCI config space has been reset, and things like the
3417 	 * busmaster bit will be zero.  If this is the case, then we
3418 	 * must restore PCI config space before the NIC can be used
3419 	 * again
3420 	 */
3421 	cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3422 	if (cmd == 0xffff) {
3423 		/*
3424 		 * maybe the watchdog caught the NIC rebooting; wait
3425 		 * up to 100ms for it to finish.  If it does not come
3426 		 * back, then give up
3427 		 */
3428 		DELAY(1000*100);
3429 		cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3430 		if (cmd == 0xffff) {
3431 			device_printf(sc->dev, "NIC disappeared!\n");
3432 			return (err);
3433 		}
3434 	}
3435 	if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3436 		/* print the reboot status */
3437 		reboot = mxge_read_reboot(sc);
3438 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
3439 			      reboot);
3440 		/* restore PCI configuration space */
3441 		dinfo = device_get_ivars(sc->dev);
3442 		pci_cfg_restore(sc->dev, dinfo);
3443 
3444 		/* and redo any changes we made to our config space */
3445 		mxge_setup_cfg_space(sc);
3446 
3447 		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
3448 			mxge_close(sc);
3449 			err = mxge_open(sc);
3450 		}
3451 	} else {
3452 		device_printf(sc->dev, "NIC did not reboot, ring state:\n");
3453 		device_printf(sc->dev, "tx.req=%d tx.done=%d\n",
3454 			      sc->ss->tx.req, sc->ss->tx.done);
3455 		device_printf(sc->dev, "pkt_done=%d fw=%d\n",
3456 			      sc->ss->tx.pkt_done,
3457 			      be32toh(sc->ss->fw_stats->send_done_count));
3458 		device_printf(sc->dev, "not resetting\n");
3459 	}
3460 	return (err);
3461 }
3462 
3463 static int
3464 mxge_watchdog(mxge_softc_t *sc)
3465 {
3466 	mxge_tx_ring_t *tx = &sc->ss->tx;
3467 	uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
3468 	int err = 0;
3469 
3470 	/* see if we have outstanding transmits, which
3471 	   have been pending for more than mxge_ticks */
3472 	if (tx->req != tx->done &&
3473 	    tx->watchdog_req != tx->watchdog_done &&
3474 	    tx->done == tx->watchdog_done) {
3475 		/* check for pause blocking before resetting */
3476 		if (tx->watchdog_rx_pause == rx_pause)
3477 			err = mxge_watchdog_reset(sc);
3478 		else
3479 			device_printf(sc->dev, "Flow control blocking "
3480 				      "xmits, check link partner\n");
3481 	}
3482 
3483 	tx->watchdog_req = tx->req;
3484 	tx->watchdog_done = tx->done;
3485 	tx->watchdog_rx_pause = rx_pause;
3486 
3487 	if (sc->need_media_probe)
3488 		mxge_media_probe(sc);
3489 	return (err);
3490 }
3491 
3492 static void
3493 mxge_update_stats(mxge_softc_t *sc)
3494 {
3495 	struct mxge_slice_state *ss;
3496 	u_long ipackets = 0;
3497 	int slice;
3498 
3499 	for(slice = 0; slice < sc->num_slices; slice++) {
3500 		ss = &sc->ss[slice];
3501 		ipackets += ss->ipackets;
3502 	}
3503 	sc->ifp->if_ipackets = ipackets;
3504 
3505 }
3506 static void
3507 mxge_tick(void *arg)
3508 {
3509 	mxge_softc_t *sc = arg;
3510 	int err = 0;
3511 
3512 	/* aggregate stats from different slices */
3513 	mxge_update_stats(sc);
3514 	if (!sc->watchdog_countdown) {
3515 		err = mxge_watchdog(sc);
3516 		sc->watchdog_countdown = 4;
3517 	}
3518 	sc->watchdog_countdown--;
3519 	if (err == 0)
3520 		callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3521 
3522 }
3523 
3524 static int
3525 mxge_media_change(struct ifnet *ifp)
3526 {
3527 	return EINVAL;
3528 }
3529 
3530 static int
3531 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3532 {
3533 	struct ifnet *ifp = sc->ifp;
3534 	int real_mtu, old_mtu;
3535 	int err = 0;
3536 
3537 
3538 	real_mtu = mtu + ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3539 	if ((real_mtu > sc->max_mtu) || real_mtu < 60)
3540 		return EINVAL;
3541 	mtx_lock(&sc->driver_mtx);
3542 	old_mtu = ifp->if_mtu;
3543 	ifp->if_mtu = mtu;
3544 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3545 		mxge_close(sc);
3546 		err = mxge_open(sc);
3547 		if (err != 0) {
3548 			ifp->if_mtu = old_mtu;
3549 			mxge_close(sc);
3550 			(void) mxge_open(sc);
3551 		}
3552 	}
3553 	mtx_unlock(&sc->driver_mtx);
3554 	return err;
3555 }
3556 
3557 static void
3558 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3559 {
3560 	mxge_softc_t *sc = ifp->if_softc;
3561 
3562 
3563 	if (sc == NULL)
3564 		return;
3565 	ifmr->ifm_status = IFM_AVALID;
3566 	ifmr->ifm_status |= sc->link_state ? IFM_ACTIVE : 0;
3567 	ifmr->ifm_active = IFM_AUTO | IFM_ETHER;
3568 	ifmr->ifm_active |= sc->link_state ? IFM_FDX : 0;
3569 }
3570 
3571 static int
3572 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
3573 {
3574 	mxge_softc_t *sc = ifp->if_softc;
3575 	struct ifreq *ifr = (struct ifreq *)data;
3576 	int err, mask;
3577 
3578 	err = 0;
3579 	switch (command) {
3580 	case SIOCSIFADDR:
3581 	case SIOCGIFADDR:
3582 		err = ether_ioctl(ifp, command, data);
3583 		break;
3584 
3585 	case SIOCSIFMTU:
3586 		err = mxge_change_mtu(sc, ifr->ifr_mtu);
3587 		break;
3588 
3589 	case SIOCSIFFLAGS:
3590 		mtx_lock(&sc->driver_mtx);
3591 		if (ifp->if_flags & IFF_UP) {
3592 			if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
3593 				err = mxge_open(sc);
3594 			} else {
3595 				/* take care of promisc and allmulti
3596 				   flag changes */
3597 				mxge_change_promisc(sc,
3598 						    ifp->if_flags & IFF_PROMISC);
3599 				mxge_set_multicast_list(sc);
3600 			}
3601 		} else {
3602 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
3603 				mxge_close(sc);
3604 			}
3605 		}
3606 		mtx_unlock(&sc->driver_mtx);
3607 		break;
3608 
3609 	case SIOCADDMULTI:
3610 	case SIOCDELMULTI:
3611 		mtx_lock(&sc->driver_mtx);
3612 		mxge_set_multicast_list(sc);
3613 		mtx_unlock(&sc->driver_mtx);
3614 		break;
3615 
3616 	case SIOCSIFCAP:
3617 		mtx_lock(&sc->driver_mtx);
3618 		mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3619 		if (mask & IFCAP_TXCSUM) {
3620 			if (IFCAP_TXCSUM & ifp->if_capenable) {
3621 				ifp->if_capenable &= ~(IFCAP_TXCSUM|IFCAP_TSO4);
3622 				ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP
3623 						      | CSUM_TSO);
3624 			} else {
3625 				ifp->if_capenable |= IFCAP_TXCSUM;
3626 				ifp->if_hwassist |= (CSUM_TCP | CSUM_UDP);
3627 			}
3628 		} else if (mask & IFCAP_RXCSUM) {
3629 			if (IFCAP_RXCSUM & ifp->if_capenable) {
3630 				ifp->if_capenable &= ~IFCAP_RXCSUM;
3631 				sc->csum_flag = 0;
3632 			} else {
3633 				ifp->if_capenable |= IFCAP_RXCSUM;
3634 				sc->csum_flag = 1;
3635 			}
3636 		}
3637 		if (mask & IFCAP_TSO4) {
3638 			if (IFCAP_TSO4 & ifp->if_capenable) {
3639 				ifp->if_capenable &= ~IFCAP_TSO4;
3640 				ifp->if_hwassist &= ~CSUM_TSO;
3641 			} else if (IFCAP_TXCSUM & ifp->if_capenable) {
3642 				ifp->if_capenable |= IFCAP_TSO4;
3643 				ifp->if_hwassist |= CSUM_TSO;
3644 			} else {
3645 				printf("mxge requires tx checksum offload"
3646 				       " be enabled to use TSO\n");
3647 				err = EINVAL;
3648 			}
3649 		}
3650 		if (mask & IFCAP_LRO) {
3651 			if (IFCAP_LRO & ifp->if_capenable)
3652 				err = mxge_change_lro_locked(sc, 0);
3653 			else
3654 				err = mxge_change_lro_locked(sc, mxge_lro_cnt);
3655 		}
3656 		if (mask & IFCAP_VLAN_HWTAGGING)
3657 			ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3658 		mtx_unlock(&sc->driver_mtx);
3659 		VLAN_CAPABILITIES(ifp);
3660 
3661 		break;
3662 
3663 	case SIOCGIFMEDIA:
3664 		err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3665 				    &sc->media, command);
3666 		break;
3667 
3668 	default:
3669 		err = ENOTTY;
3670 	}
3671 	return err;
3672 }
3673 
3674 static void
3675 mxge_fetch_tunables(mxge_softc_t *sc)
3676 {
3677 
3678 	TUNABLE_INT_FETCH("hw.mxge.max_slices", &mxge_max_slices);
3679 	TUNABLE_INT_FETCH("hw.mxge.flow_control_enabled",
3680 			  &mxge_flow_control);
3681 	TUNABLE_INT_FETCH("hw.mxge.intr_coal_delay",
3682 			  &mxge_intr_coal_delay);
3683 	TUNABLE_INT_FETCH("hw.mxge.nvidia_ecrc_enable",
3684 			  &mxge_nvidia_ecrc_enable);
3685 	TUNABLE_INT_FETCH("hw.mxge.force_firmware",
3686 			  &mxge_force_firmware);
3687 	TUNABLE_INT_FETCH("hw.mxge.deassert_wait",
3688 			  &mxge_deassert_wait);
3689 	TUNABLE_INT_FETCH("hw.mxge.verbose",
3690 			  &mxge_verbose);
3691 	TUNABLE_INT_FETCH("hw.mxge.ticks", &mxge_ticks);
3692 	TUNABLE_INT_FETCH("hw.mxge.lro_cnt", &sc->lro_cnt);
3693 	TUNABLE_INT_FETCH("hw.mxge.always_promisc", &mxge_always_promisc);
3694 	TUNABLE_INT_FETCH("hw.mxge.rss_hash_type", &mxge_rss_hash_type);
3695 	if (sc->lro_cnt != 0)
3696 		mxge_lro_cnt = sc->lro_cnt;
3697 
3698 	if (bootverbose)
3699 		mxge_verbose = 1;
3700 	if (mxge_intr_coal_delay < 0 || mxge_intr_coal_delay > 10*1000)
3701 		mxge_intr_coal_delay = 30;
3702 	if (mxge_ticks == 0)
3703 		mxge_ticks = hz / 2;
3704 	sc->pause = mxge_flow_control;
3705 	if (mxge_rss_hash_type < MXGEFW_RSS_HASH_TYPE_IPV4
3706 	    || mxge_rss_hash_type > MXGEFW_RSS_HASH_TYPE_SRC_PORT) {
3707 		mxge_rss_hash_type = MXGEFW_RSS_HASH_TYPE_SRC_PORT;
3708 	}
3709 }
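
/*
 * Sketch (not part of this file): the variables above are loader
 * tunables read from the kernel environment, so they would normally
 * be set in /boot/loader.conf before the driver initializes, e.g.:
 *
 *	hw.mxge.max_slices="4"
 *	hw.mxge.intr_coal_delay="30"
 *	hw.mxge.flow_control_enabled="1"
 *
 * The values shown are illustrative only, not recommendations.
 */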
3710 
3712 static void
3713 mxge_free_slices(mxge_softc_t *sc)
3714 {
3715 	struct mxge_slice_state *ss;
3716 	int i;
3717 
3719 	if (sc->ss == NULL)
3720 		return;
3721 
3722 	for (i = 0; i < sc->num_slices; i++) {
3723 		ss = &sc->ss[i];
3724 		if (ss->fw_stats != NULL) {
3725 			mxge_dma_free(&ss->fw_stats_dma);
3726 			ss->fw_stats = NULL;
3727 			mtx_destroy(&ss->tx.mtx);
3728 		}
3729 		if (ss->rx_done.entry != NULL) {
3730 			mxge_dma_free(&ss->rx_done.dma);
3731 			ss->rx_done.entry = NULL;
3732 		}
3733 	}
3734 	free(sc->ss, M_DEVBUF);
3735 	sc->ss = NULL;
3736 }
3737 
3738 static int
3739 mxge_alloc_slices(mxge_softc_t *sc)
3740 {
3741 	mxge_cmd_t cmd;
3742 	struct mxge_slice_state *ss;
3743 	size_t bytes;
3744 	int err, i, max_intr_slots;
3745 
3746 	err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3747 	if (err != 0) {
3748 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3749 		return err;
3750 	}
3751 	sc->rx_ring_size = cmd.data0;
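	/*
	 * rx_ring_size is in bytes, so dividing by the size of one
	 * DMA address gives the number of receive slots; the interrupt
	 * queue is sized at twice that, presumably so it can never be
	 * overrun by completed events.
	 */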
3752 	max_intr_slots = 2 * (sc->rx_ring_size / sizeof (mcp_dma_addr_t));
3753 
3754 	bytes = sizeof (*sc->ss) * sc->num_slices;
3755 	sc->ss = malloc(bytes, M_DEVBUF, M_NOWAIT | M_ZERO);
3756 	if (sc->ss == NULL)
3757 		return (ENOMEM);
3758 	for (i = 0; i < sc->num_slices; i++) {
3759 		ss = &sc->ss[i];
3760 
3761 		ss->sc = sc;
3762 
3763 		/* allocate per-slice rx interrupt queues */
3764 
3765 		bytes = max_intr_slots * sizeof (*ss->rx_done.entry);
3766 		err = mxge_dma_alloc(sc, &ss->rx_done.dma, bytes, 4096);
3767 		if (err != 0)
3768 			goto abort;
3769 		ss->rx_done.entry = ss->rx_done.dma.addr;
3770 		bzero(ss->rx_done.entry, bytes);
3771 
3772 		/*
3773 		 * allocate the per-slice firmware stats; stats
3774 		 * (including tx) are used only on the first
3775 		 * slice for now
3776 		 */
3777 		if (i > 0)
3778 			continue;
3779 
3780 		bytes = sizeof (*ss->fw_stats);
3781 		err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
3782 				     bytes, 64);
3783 		if (err != 0)
3784 			goto abort;
3785 		ss->fw_stats = (mcp_irq_data_t *)ss->fw_stats_dma.addr;
3786 		snprintf(ss->tx.mtx_name, sizeof(ss->tx.mtx_name),
3787 			 "%s:tx(%d)", device_get_nameunit(sc->dev), i);
3788 		mtx_init(&ss->tx.mtx, ss->tx.mtx_name, NULL, MTX_DEF);
3789 	}
3790 
3791 	return (0);
3792 
3793 abort:
3794 	mxge_free_slices(sc);
3795 	return (err);
3796 }
3797 
3798 static void
3799 mxge_slice_probe(mxge_softc_t *sc)
3800 {
3801 	mxge_cmd_t cmd;
3802 	char *old_fw;
3803 	int msix_cnt, status, max_intr_slots;
3804 
3805 	sc->num_slices = 1;
3806 	/*
3807 	 *  don't enable multiple slices unless the hw.mxge.max_slices
3808 	 *  tunable requests them, or if this is not an SMP system
3809 	 */
3810 
3811 	if (mxge_max_slices == 0 || mxge_max_slices == 1 || mp_ncpus < 2)
3812 		return;
3813 
3814 	/* see how many MSI-X interrupts are available */
3815 	msix_cnt = pci_msix_count(sc->dev);
3816 	if (msix_cnt < 2)
3817 		return;
3818 
3819 	/* now load the slice-aware firmware and see what it supports */
3820 	old_fw = sc->fw_name;
3821 	if (old_fw == mxge_fw_aligned)
3822 		sc->fw_name = mxge_fw_rss_aligned;
3823 	else
3824 		sc->fw_name = mxge_fw_rss_unaligned;
3825 	status = mxge_load_firmware(sc, 0);
3826 	if (status != 0) {
3827 		device_printf(sc->dev, "Falling back to a single slice\n");
3828 		return;
3829 	}
3830 
3831 	/* try to send a reset command to the card to see if it
3832 	   is alive */
3833 	memset(&cmd, 0, sizeof (cmd));
3834 	status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
3835 	if (status != 0) {
3836 		device_printf(sc->dev, "failed reset\n");
3837 		goto abort_with_fw;
3838 	}
3839 
3840 	/* get rx ring size */
3841 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3842 	if (status != 0) {
3843 		device_printf(sc->dev, "Cannot determine rx ring size\n");
3844 		goto abort_with_fw;
3845 	}
3846 	max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
3847 
3848 	/* tell it the size of the interrupt queues */
3849 	cmd.data0 = max_intr_slots * sizeof (struct mcp_slot);
3850 	status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
3851 	if (status != 0) {
3852 		device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
3853 		goto abort_with_fw;
3854 	}
3855 
3856 	/* ask for the maximum number of slices it supports */
3857 	status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
3858 	if (status != 0) {
3859 		device_printf(sc->dev,
3860 			      "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
3861 		goto abort_with_fw;
3862 	}
3863 	sc->num_slices = cmd.data0;
3864 	if (sc->num_slices > msix_cnt)
3865 		sc->num_slices = msix_cnt;
3866 
3867 	if (mxge_max_slices == -1) {
3868 		/* cap to number of CPUs in system */
3869 		if (sc->num_slices > mp_ncpus)
3870 			sc->num_slices = mp_ncpus;
3871 	} else {
3872 		if (sc->num_slices > mxge_max_slices)
3873 			sc->num_slices = mxge_max_slices;
3874 	}
3875 	/* make sure it is a power of two */
3876 	while (sc->num_slices & (sc->num_slices - 1))
3877 		sc->num_slices--;
3878 
3879 	if (mxge_verbose)
3880 		device_printf(sc->dev, "using %d slices\n",
3881 			      sc->num_slices);
3882 
3883 	return;
3884 
3885 abort_with_fw:
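	/* fall back to the previously loaded non-RSS firmware */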
3886 	sc->fw_name = old_fw;
3887 	(void) mxge_load_firmware(sc, 0);
3888 }
3889 
3890 static int
3891 mxge_add_msix_irqs(mxge_softc_t *sc)
3892 {
3893 	size_t bytes;
3894 	int count, err, i, rid;
3895 
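	/* map BAR 2, which holds the MSI-X table, so it stays accessible */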
3896 	rid = PCIR_BAR(2);
3897 	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
3898 						    &rid, RF_ACTIVE);
3899 
3900 	if (sc->msix_table_res == NULL) {
3901 		device_printf(sc->dev, "couldn't alloc MSIX table res\n");
3902 		return ENXIO;
3903 	}
3904 
3905 	count = sc->num_slices;
3906 	err = pci_alloc_msix(sc->dev, &count);
3907 	if (err != 0) {
3908 		device_printf(sc->dev, "pci_alloc_msix: failed, wanted %d, "
3909 			      "err = %d\n", sc->num_slices, err);
3910 		goto abort_with_msix_table;
3911 	}
3912 	if (count < sc->num_slices) {
3913 		device_printf(sc->dev, "pci_alloc_msix: need %d, got %d\n",
3914 			      sc->num_slices, count);
3915 		device_printf(sc->dev,
3916 			      "Try setting hw.mxge.max_slices to %d\n",
3917 			      count);
3918 		err = ENOSPC;
3919 		goto abort_with_msix;
3920 	}
3921 	bytes = sizeof (*sc->msix_irq_res) * sc->num_slices;
3922 	sc->msix_irq_res = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
3923 	if (sc->msix_irq_res == NULL) {
3924 		err = ENOMEM;
3925 		goto abort_with_msix;
3926 	}
3927 
3928 	for (i = 0; i < sc->num_slices; i++) {
3929 		rid = i + 1;
3930 		sc->msix_irq_res[i] = bus_alloc_resource_any(sc->dev,
3931 							  SYS_RES_IRQ,
3932 							  &rid, RF_ACTIVE);
3933 		if (sc->msix_irq_res[i] == NULL) {
3934 			device_printf(sc->dev, "couldn't allocate IRQ res"
3935 				      " for message %d\n", i);
3936 			err = ENXIO;
3937 			goto abort_with_res;
3938 		}
3939 	}
3940 
3941 	bytes = sizeof (*sc->msix_ih) * sc->num_slices;
3942 	sc->msix_ih = malloc(bytes, M_DEVBUF, M_NOWAIT|M_ZERO);
	if (sc->msix_ih == NULL) {
		err = ENOMEM;
		goto abort_with_res;
	}
3943 
3944 	for (i = 0; i < sc->num_slices; i++) {
3945 		err = bus_setup_intr(sc->dev, sc->msix_irq_res[i],
3946 				     INTR_TYPE_NET | INTR_MPSAFE,
3947 #if __FreeBSD_version > 700030
3948 				     NULL,
3949 #endif
3950 				     mxge_intr, &sc->ss[i], &sc->msix_ih[i]);
3951 		if (err != 0) {
3952 			device_printf(sc->dev, "couldn't setup intr for "
3953 				      "message %d\n", i);
3954 			goto abort_with_intr;
3955 		}
3956 	}
3957 
3958 	if (mxge_verbose) {
3959 		device_printf(sc->dev, "using %d msix IRQs:",
3960 			      sc->num_slices);
3961 		for (i = 0; i < sc->num_slices; i++)
3962 			printf(" %ld", rman_get_start(sc->msix_irq_res[i]));
3963 		printf("\n");
3964 	}
3965 	return (0);
3966 
3967 abort_with_intr:
3968 	for (i = 0; i < sc->num_slices; i++) {
3969 		if (sc->msix_ih[i] != NULL) {
3970 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
3971 					  sc->msix_ih[i]);
3972 			sc->msix_ih[i] = NULL;
3973 		}
3974 	}
3975 	free(sc->msix_ih, M_DEVBUF);
3976 
3978 abort_with_res:
3979 	for (i = 0; i < sc->num_slices; i++) {
3980 		rid = i + 1;
3981 		if (sc->msix_irq_res[i] != NULL)
3982 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
3983 					     sc->msix_irq_res[i]);
3984 		sc->msix_irq_res[i] = NULL;
3985 	}
3986 	free(sc->msix_irq_res, M_DEVBUF);
3987 
3989 abort_with_msix:
3990 	pci_release_msi(sc->dev);
3991 
3992 abort_with_msix_table:
3993 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
3994 			     sc->msix_table_res);
3995 
3996 	return err;
3997 }
3998 
3999 static int
4000 mxge_add_single_irq(mxge_softc_t *sc)
4001 {
4002 	int count, err, rid;
4003 
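	/* prefer a single MSI message; fall back to the legacy INTx pin */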
4004 	count = pci_msi_count(sc->dev);
4005 	if (count == 1 && pci_alloc_msi(sc->dev, &count) == 0) {
4006 		rid = 1;
4007 	} else {
4008 		rid = 0;
4009 		sc->legacy_irq = 1;
4010 	}
4011 	sc->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &rid,
4012 					     RF_SHAREABLE | RF_ACTIVE);
4013 	if (sc->irq_res == NULL) {
4014 		device_printf(sc->dev, "could not alloc interrupt\n");
4015 		return ENXIO;
4016 	}
4017 	if (mxge_verbose)
4018 		device_printf(sc->dev, "using %s irq %ld\n",
4019 			      sc->legacy_irq ? "INTx" : "MSI",
4020 			      rman_get_start(sc->irq_res));
4021 	err = bus_setup_intr(sc->dev, sc->irq_res,
4022 			     INTR_TYPE_NET | INTR_MPSAFE,
4023 #if __FreeBSD_version > 700030
4024 			     NULL,
4025 #endif
4026 			     mxge_intr, &sc->ss[0], &sc->ih);
4027 	if (err != 0) {
4028 		bus_release_resource(sc->dev, SYS_RES_IRQ,
4029 				     sc->legacy_irq ? 0 : 1, sc->irq_res);
4030 		if (!sc->legacy_irq)
4031 			pci_release_msi(sc->dev);
4032 	}
4033 	return err;
4034 }
4035 
4036 static void
4037 mxge_rem_msix_irqs(mxge_softc_t *sc)
4038 {
4039 	int i, rid;
4040 
4041 	for (i = 0; i < sc->num_slices; i++) {
4042 		if (sc->msix_ih[i] != NULL) {
4043 			bus_teardown_intr(sc->dev, sc->msix_irq_res[i],
4044 					  sc->msix_ih[i]);
4045 			sc->msix_ih[i] = NULL;
4046 		}
4047 	}
4048 	free(sc->msix_ih, M_DEVBUF);
4049 
4050 	for (i = 0; i < sc->num_slices; i++) {
4051 		rid = i + 1;
4052 		if (sc->msix_irq_res[i] != NULL)
4053 			bus_release_resource(sc->dev, SYS_RES_IRQ, rid,
4054 					     sc->msix_irq_res[i]);
4055 		sc->msix_irq_res[i] = NULL;
4056 	}
4057 	free(sc->msix_irq_res, M_DEVBUF);
4058 
4059 	bus_release_resource(sc->dev, SYS_RES_MEMORY, PCIR_BAR(2),
4060 			     sc->msix_table_res);
4061 
4062 	pci_release_msi(sc->dev);
4064 }
4065 
4066 static void
4067 mxge_rem_single_irq(mxge_softc_t *sc)
4068 {
4069 	bus_teardown_intr(sc->dev, sc->irq_res, sc->ih);
4070 	bus_release_resource(sc->dev, SYS_RES_IRQ,
4071 			     sc->legacy_irq ? 0 : 1, sc->irq_res);
4072 	if (!sc->legacy_irq)
4073 		pci_release_msi(sc->dev);
4074 }
4075 
4076 static void
4077 mxge_rem_irq(mxge_softc_t *sc)
4078 {
4079 	if (sc->num_slices > 1)
4080 		mxge_rem_msix_irqs(sc);
4081 	else
4082 		mxge_rem_single_irq(sc);
4083 }
4084 
4085 static int
4086 mxge_add_irq(mxge_softc_t *sc)
4087 {
4088 	int err;
4089 
4090 	if (sc->num_slices > 1)
4091 		err = mxge_add_msix_irqs(sc);
4092 	else
4093 		err = mxge_add_single_irq(sc);
4094 
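	/*
	 * The "0 &&" below disables this block; it appears to be
	 * leftover scaffolding for re-probing the MSI-X vectors and is
	 * kept for reference only.
	 */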
4095 	if (0 && err == 0 && sc->num_slices > 1) {
4096 		mxge_rem_msix_irqs(sc);
4097 		err = mxge_add_msix_irqs(sc);
4098 	}
4099 	return err;
4100 }
4101 
4103 static int
4104 mxge_attach(device_t dev)
4105 {
4106 	mxge_softc_t *sc = device_get_softc(dev);
4107 	struct ifnet *ifp;
4108 	int err, rid;
4109 
4110 	sc->dev = dev;
4111 	mxge_fetch_tunables(sc);
4112 
4113 	err = bus_dma_tag_create(NULL,			/* parent */
4114 				 1,			/* alignment */
4115 				 0,			/* boundary */
4116 				 BUS_SPACE_MAXADDR,	/* low */
4117 				 BUS_SPACE_MAXADDR,	/* high */
4118 				 NULL, NULL,		/* filter */
4119 				 65536 + 256,		/* maxsize */
4120 				 MXGE_MAX_SEND_DESC, 	/* num segs */
4121 				 65536,			/* maxsegsize */
4122 				 0,			/* flags */
4123 				 NULL, NULL,		/* lock */
4124 				 &sc->parent_dmat);	/* tag */
4125 
4126 	if (err != 0) {
4127 		device_printf(sc->dev, "Err %d allocating parent dmat\n",
4128 			      err);
4129 		goto abort_with_nothing;
4130 	}
4131 
4132 	ifp = sc->ifp = if_alloc(IFT_ETHER);
4133 	if (ifp == NULL) {
4134 		device_printf(dev, "can not if_alloc()\n");
4135 		err = ENOSPC;
4136 		goto abort_with_parent_dmat;
4137 	}
4138 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
4139 
4140 	snprintf(sc->cmd_mtx_name, sizeof(sc->cmd_mtx_name), "%s:cmd",
4141 		 device_get_nameunit(dev));
4142 	mtx_init(&sc->cmd_mtx, sc->cmd_mtx_name, NULL, MTX_DEF);
4143 	snprintf(sc->driver_mtx_name, sizeof(sc->driver_mtx_name),
4144 		 "%s:drv", device_get_nameunit(dev));
4145 	mtx_init(&sc->driver_mtx, sc->driver_mtx_name,
4146 		 MTX_NETWORK_LOCK, MTX_DEF);
4147 
4148 	callout_init_mtx(&sc->co_hdl, &sc->driver_mtx, 0);
4149 
4150 	mxge_setup_cfg_space(sc);
4151 
4152 	/* Map the board into the kernel */
4153 	rid = PCIR_BARS;
4154 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
4155 					     RF_ACTIVE);
4156 	if (sc->mem_res == NULL) {
4157 		device_printf(dev, "could not map memory\n");
4158 		err = ENXIO;
4159 		goto abort_with_lock;
4160 	}
4161 	sc->sram = rman_get_virtual(sc->mem_res);
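	/*
	 * Usable SRAM is the 2MB of board memory less the regions
	 * reserved by the firmware (apparently two 48KB blocks, one
	 * 32KB block, and 0x100 bytes); the EEPROM strings are read
	 * from the top of this window just below.
	 */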
4162 	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
4163 	if (sc->sram_size > rman_get_size(sc->mem_res)) {
4164 		device_printf(dev, "impossible memory region size %ld\n",
4165 			      rman_get_size(sc->mem_res));
4166 		err = ENXIO;
4167 		goto abort_with_mem_res;
4168 	}
4169 
4170 	/* make a NULL-terminated copy of the EEPROM strings section of
4171 	   the Lanai SRAM */
4172 	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
4173 	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
4174 				rman_get_bushandle(sc->mem_res),
4175 				sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
4176 				sc->eeprom_strings,
4177 				MXGE_EEPROM_STRINGS_SIZE - 2);
4178 	err = mxge_parse_strings(sc);
4179 	if (err != 0)
4180 		goto abort_with_mem_res;
4181 
4182 	/* Enable write combining for efficient use of the PCIe bus */
4183 	mxge_enable_wc(sc);
4184 
4185 	/* Allocate the out-of-band DMA memory */
4186 	err = mxge_dma_alloc(sc, &sc->cmd_dma,
4187 			     sizeof (mxge_cmd_t), 64);
4188 	if (err != 0)
4189 		goto abort_with_mem_res;
4190 	sc->cmd = (mcp_cmd_response_t *) sc->cmd_dma.addr;
4191 	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
4192 	if (err != 0)
4193 		goto abort_with_cmd_dma;
4194 
4195 	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
4196 	if (err != 0)
4197 		goto abort_with_zeropad_dma;
4198 
4199 	/* select & load the firmware */
4200 	err = mxge_select_firmware(sc);
4201 	if (err != 0)
4202 		goto abort_with_dmabench;
4203 	sc->intr_coal_delay = mxge_intr_coal_delay;
4204 
4205 	mxge_slice_probe(sc);
4206 	err = mxge_alloc_slices(sc);
4207 	if (err != 0)
4208 		goto abort_with_dmabench;
4209 
4210 	err = mxge_reset(sc, 0);
4211 	if (err != 0)
4212 		goto abort_with_slices;
4213 
4214 	err = mxge_alloc_rings(sc);
4215 	if (err != 0) {
4216 		device_printf(sc->dev, "failed to allocate rings\n");
4217 		goto abort_with_slices;
4218 	}
4219 
4220 	err = mxge_add_irq(sc);
4221 	if (err != 0) {
4222 		device_printf(sc->dev, "failed to add irq\n");
4223 		goto abort_with_rings;
4224 	}
4225 
4226 	ifp->if_baudrate = IF_Gbps(10UL);
4227 	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO4 |
4228 		IFCAP_VLAN_MTU | IFCAP_LRO;
4229 
4230 #ifdef MXGE_NEW_VLAN_API
4231 	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
4232 #endif
4233 
4234 	sc->max_mtu = mxge_max_mtu(sc);
4235 	if (sc->max_mtu >= 9000)
4236 		ifp->if_capabilities |= IFCAP_JUMBO_MTU;
4237 	else
4238 		device_printf(dev, "MTU limited to %d.  Install "
4239 			      "latest firmware for 9000 byte jumbo support\n",
4240 			      sc->max_mtu - ETHER_HDR_LEN);
4241 	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;
4242 	ifp->if_capenable = ifp->if_capabilities;
4243 	if (sc->lro_cnt == 0)
4244 		ifp->if_capenable &= ~IFCAP_LRO;
4245 	sc->csum_flag = 1;
4246 	ifp->if_init = mxge_init;
4247 	ifp->if_softc = sc;
4248 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
4249 	ifp->if_ioctl = mxge_ioctl;
4250 	ifp->if_start = mxge_start;
4251 	/* Initialise the ifmedia structure */
4252 	ifmedia_init(&sc->media, 0, mxge_media_change,
4253 		     mxge_media_status);
4254 	mxge_set_media(sc, IFM_ETHER | IFM_AUTO);
4255 	mxge_media_probe(sc);
4256 	ether_ifattach(ifp, sc->mac_addr);
4257 	/* ether_ifattach sets mtu to 1500 */
4258 	if (ifp->if_capabilities & IFCAP_JUMBO_MTU)
4259 		ifp->if_mtu = 9000;
4260 
4261 	mxge_add_sysctls(sc);
4262 	return 0;
4263 
4264 abort_with_rings:
4265 	mxge_free_rings(sc);
4266 abort_with_slices:
4267 	mxge_free_slices(sc);
4268 abort_with_dmabench:
4269 	mxge_dma_free(&sc->dmabench_dma);
4270 abort_with_zeropad_dma:
4271 	mxge_dma_free(&sc->zeropad_dma);
4272 abort_with_cmd_dma:
4273 	mxge_dma_free(&sc->cmd_dma);
4274 abort_with_mem_res:
4275 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4276 abort_with_lock:
4277 	pci_disable_busmaster(dev);
4278 	mtx_destroy(&sc->cmd_mtx);
4279 	mtx_destroy(&sc->driver_mtx);
4280 	if_free(ifp);
4281 abort_with_parent_dmat:
4282 	bus_dma_tag_destroy(sc->parent_dmat);
4283 
4284 abort_with_nothing:
4285 	return err;
4286 }
4287 
4288 static int
4289 mxge_detach(device_t dev)
4290 {
4291 	mxge_softc_t *sc = device_get_softc(dev);
4292 
4293 	if (mxge_vlans_active(sc)) {
4294 		device_printf(sc->dev,
4295 			      "Detach vlans before removing module\n");
4296 		return EBUSY;
4297 	}
4298 	mtx_lock(&sc->driver_mtx);
4299 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
4300 		mxge_close(sc);
4301 	mtx_unlock(&sc->driver_mtx);
4302 	ether_ifdetach(sc->ifp);
4303 	callout_drain(&sc->co_hdl);
4304 	ifmedia_removeall(&sc->media);
4305 	mxge_dummy_rdma(sc, 0);
4306 	mxge_rem_sysctls(sc);
4307 	mxge_rem_irq(sc);
4308 	mxge_free_rings(sc);
4309 	mxge_free_slices(sc);
4310 	mxge_dma_free(&sc->dmabench_dma);
4311 	mxge_dma_free(&sc->zeropad_dma);
4312 	mxge_dma_free(&sc->cmd_dma);
4313 	bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS, sc->mem_res);
4314 	pci_disable_busmaster(dev);
4315 	mtx_destroy(&sc->cmd_mtx);
4316 	mtx_destroy(&sc->driver_mtx);
4317 	if_free(sc->ifp);
4318 	bus_dma_tag_destroy(sc->parent_dmat);
4319 	return 0;
4320 }
4321 
4322 static int
4323 mxge_shutdown(device_t dev)
4324 {
4325 	return 0;
4326 }
4327 
4328 /*
4329   This file uses Myri10GE driver indentation.
4330 
4331   Local Variables:
4332   c-file-style:"linux"
4333   tab-width:8
4334   End:
4335 */
4336