xref: /titanic_50/usr/src/uts/common/io/myri10ge/drv/myri10ge.c (revision abc79d9dd51e98eafb6fc25b4a0b4f66bef40b00)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
#ifndef	lint
/* Revision-control identification string for this source file. */
static const char __idstring[] =
	"@(#)$Id: myri10ge.c,v 1.186 2009-06-29 13:47:22 gallatin Exp $";
#endif
36 
37 #define	MXGEFW_NDIS
38 #include "myri10ge_var.h"
39 #include "rss_eth_z8e.h"
40 #include "rss_ethp_z8e.h"
41 #include "mcp_gen_header.h"
42 
43 #define	MYRI10GE_MAX_ETHER_MTU 9014
44 
45 #define	MYRI10GE_ETH_STOPPED 0
46 #define	MYRI10GE_ETH_STOPPING 1
47 #define	MYRI10GE_ETH_STARTING 2
48 #define	MYRI10GE_ETH_RUNNING 3
49 #define	MYRI10GE_ETH_OPEN_FAILED 4
50 #define	MYRI10GE_ETH_SUSPENDED_RUNNING 5
51 
/*
 * File-scope configuration variables and their compiled-in defaults.
 * NOTE(review): most of these are consumed outside the section reviewed
 * here; confirm how each one is read before changing a default.
 */
static int myri10ge_small_bytes = 510;
static int myri10ge_intr_coal_delay = 125;
static int myri10ge_flow_control = 1;
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static int myri10ge_nvidia_ecrc_enable = 1;
#endif
static int myri10ge_mtu_override = 0;
static int myri10ge_tx_copylen = 512;
static int myri10ge_deassert_wait = 1;
static int myri10ge_verbose = 0;
static int myri10ge_watchdog_reset = 0;
static int myri10ge_use_msix = 1;
static int myri10ge_max_slices = -1;
static int myri10ge_use_msi = 1;
int myri10ge_force_firmware = 0;
static boolean_t myri10ge_use_lso = B_TRUE;
static int myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int myri10ge_tx_hash = 1;
static int myri10ge_lro = 0;
static int myri10ge_lro_cnt = 8;
int myri10ge_lro_max_aggr = 2;
static int myri10ge_lso_copy = 0;
static mblk_t *myri10ge_send_wrapper(void *arg, mblk_t *mp);
/* number of tx DMA handles myri10ge_prepare_tx_ring() pre-allocates */
int myri10ge_tx_handles_initial = 128;

static 	kmutex_t myri10ge_param_lock;
static void* myri10ge_db_lastfree;
79 
/* attach/detach/quiesce entry points referenced by myri10ge_ops below */
static int myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int myri10ge_quiesce(dev_info_t *dip);

/* dev_ops for the driver, built from the entry points declared above */
DDI_DEFINE_STREAM_OPS(myri10ge_ops, nulldev, nulldev, myri10ge_attach,
    myri10ge_detach, nodev, NULL, D_MP, NULL, myri10ge_quiesce);


/* driver linkage record: description string plus the dev_ops */
static struct modldrv modldrv = {
	&mod_driverops,
	"Myricom 10G driver (10GbE)",
	&myri10ge_ops,
};


/* module linkage: a single driver linkage record, NULL terminated */
static struct modlinkage modlinkage = {
	MODREV_1,
	{&modldrv, NULL},
};
99 
/* six bytes of 0xff: the Ethernet broadcast address */
unsigned char myri10ge_broadcastaddr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
101 
/*
 * DMA attributes requesting 4096-byte alignment and a single
 * scatter/gather element.
 * NOTE(review): the users of this structure are outside the section
 * reviewed here.
 */
static ddi_dma_attr_t myri10ge_misc_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	(uint64_t)0x7fffffff,		/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
116 
117 /*
118  * The Myri10GE NIC has the following constraints on receive buffers:
119  * 1) Buffers which cross a 4KB boundary must be aligned to 4KB
120  * 2) Buffers which are not aligned to 4KB must not cross a 4KB boundary
121  */
122 
/*
 * DMA attributes for 4KB-aligned, single-segment buffers.  Used by
 * myri10ge_add_jbuf() when myri10ge_mtu is 2048 or more, and by
 * myri10ge_prepare_tx_ring() for the transmit copyblocks.
 */
static ddi_dma_attr_t myri10ge_rx_jumbo_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
137 
/*
 * DMA attributes used by myri10ge_add_jbuf() when myri10ge_mtu is below
 * 2048.  On sparc64/sparcv9 builds buffers are 4KB aligned with an
 * unrestricted segment size; elsewhere they start out 128-byte aligned
 * with a 0xfff segment limit.
 *
 * This structure is not constant: myri10ge_add_jbuf() rewrites
 * dma_attr_align to 4096 and dma_attr_seg to UINT64_MAX when a buffer
 * crosses a 4KB boundary or an allocation fails.
 */
static ddi_dma_attr_t myri10ge_rx_std_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
#if defined sparc64 || defined __sparcv9
	(uint64_t)4096,			/* alignment */
#else
	(uint64_t)0x80,			/* alignment */
#endif
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
#if defined sparc64 || defined __sparcv9
	UINT64_MAX,			/* maximum segment size */
#else
	(uint64_t)0xfff,		/* maximum segment size */
#endif
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
160 
/*
 * DMA attributes for the transmit DMA handles allocated by
 * myri10ge_add_tx_handle(): byte alignment and an effectively
 * unlimited scatter/gather list length.
 */
static ddi_dma_attr_t myri10ge_tx_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)1,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	INT32_MAX,			/* scatter/gather list length */
	1,				/* granularity */
	0			/* attribute flags */
};
175 
/*
 * WC selects the data-ordering attribute below: merging is permitted
 * everywhere except on sparc64/sparcv9 builds, which use strict
 * ordering.
 */
#if defined sparc64 || defined __sparcv9
#define	WC 0
#else
#define	WC 1
#endif

/* access attributes passed to ddi_dma_mem_alloc() by this driver */
struct ddi_device_acc_attr myri10ge_dev_access_attr = {
	DDI_DEVICE_ATTR_V0,		/* version */
	DDI_NEVERSWAP_ACC,		/* endian flags */
#if WC
	DDI_MERGING_OK_ACC		/* data order */
#else
	DDI_STRICTORDER_ACC		/* data order */
#endif
};
191 
static void myri10ge_watchdog(void *arg);

/*
 * Size in bytes of each receive buffer allocated by myri10ge_add_jbuf():
 * the largest frame plus the firmware pad and room for a VLAN tag.
 * MYRICOM_PRIV builds default to the jumbo maximum.
 */
#ifdef MYRICOM_PRIV
int myri10ge_mtu = MYRI10GE_MAX_ETHER_MTU + MXGEFW_PAD + VLAN_TAGSZ;
#else
int myri10ge_mtu = ETHERMAX + MXGEFW_PAD + VLAN_TAGSZ;
#endif
/* NOTE(review): presumably receive-buffer pool sizing; users not shown */
int myri10ge_bigbufs_initial = 1024;
int myri10ge_bigbufs_max = 4096;
201 
202 
203 caddr_t
204 myri10ge_dma_alloc(dev_info_t *dip, size_t len,
205     ddi_dma_attr_t *attr, ddi_device_acc_attr_t  *accattr,
206     uint_t alloc_flags, int bind_flags, struct myri10ge_dma_stuff *dma,
207     int warn, int (*wait)(caddr_t))
208 {
209 	caddr_t  kaddr;
210 	size_t real_length;
211 	ddi_dma_cookie_t cookie;
212 	uint_t count;
213 	int err;
214 
215 	err = ddi_dma_alloc_handle(dip, attr, wait,
216 	    NULL, &dma->handle);
217 	if (err != DDI_SUCCESS) {
218 		if (warn)
219 			cmn_err(CE_WARN,
220 			    "myri10ge: ddi_dma_alloc_handle failed\n");
221 		goto abort_with_nothing;
222 	}
223 
224 	err = ddi_dma_mem_alloc(dma->handle, len, accattr, alloc_flags,
225 	    wait, NULL, &kaddr, &real_length,
226 	    &dma->acc_handle);
227 	if (err != DDI_SUCCESS) {
228 		if (warn)
229 			cmn_err(CE_WARN,
230 			    "myri10ge: ddi_dma_mem_alloc failed\n");
231 		goto abort_with_handle;
232 	}
233 
234 	err = ddi_dma_addr_bind_handle(dma->handle, NULL, kaddr, len,
235 	    bind_flags, wait, NULL, &cookie, &count);
236 
237 	if (err != DDI_SUCCESS) {
238 		if (warn)
239 			cmn_err(CE_WARN,
240 			    "myri10ge: ddi_dma_addr_bind_handle failed\n");
241 		goto abort_with_mem;
242 	}
243 
244 	if (count != 1) {
245 		if (warn)
246 			cmn_err(CE_WARN,
247 			    "myri10ge: got too many dma segments ");
248 		goto abort_with_bind;
249 	}
250 	dma->low = htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
251 	dma->high = htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
252 	return (kaddr);
253 
254 abort_with_bind:
255 	(void) ddi_dma_unbind_handle(dma->handle);
256 
257 abort_with_mem:
258 	ddi_dma_mem_free(&dma->acc_handle);
259 
260 abort_with_handle:
261 	ddi_dma_free_handle(&dma->handle);
262 abort_with_nothing:
263 	if (warn) {
264 		cmn_err(CE_WARN, "myri10ge: myri10ge_dma_alloc failed.\n  ");
265 		cmn_err(CE_WARN, "args: dip=%p len=0x%lx ddi_dma_attr=%p\n",
266 		    (void*) dip, len, (void*) attr);
267 		cmn_err(CE_WARN,
268 		    "args: ddi_device_acc_attr=%p  alloc_flags=0x%x\n",
269 		    (void*) accattr, alloc_flags);
270 		cmn_err(CE_WARN, "args: bind_flags=0x%x  dmastuff=%p",
271 		    bind_flags, (void*) dma);
272 	}
273 	return (NULL);
274 
275 }
276 
/*
 * Release a region described by "dma": unbind the DMA handle, free the
 * memory behind the access handle, then free the DMA handle.  This is
 * the reverse of the order in which myri10ge_dma_alloc() acquires them.
 */
void
myri10ge_dma_free(struct myri10ge_dma_stuff *dma)
{
	(void) ddi_dma_unbind_handle(dma->handle);
	ddi_dma_mem_free(&dma->acc_handle);
	ddi_dma_free_handle(&dma->handle);
}
284 
/*
 * Copy "size" bytes from from32 to "to" one 32-bit word at a time, in
 * ascending address order.  The destination is written through a
 * volatile pointer.  Any remainder of size / 4 is not copied.
 */
static inline void
myri10ge_pio_copy32(void *to, uint32_t *from32, size_t size)
{
	volatile uint32_t *dst = (volatile uint32_t *)to;
	const uint32_t *src = from32;
	size_t words = size / 4;

	while (words != 0) {
		*dst++ = *src++;
		words--;
	}
}
298 
#if defined(_LP64)
/*
 * Copy "size" bytes from from64 to "to" one 64-bit word at a time, in
 * ascending address order.  The destination is written through a
 * volatile pointer.  Any remainder of size / 8 is not copied.
 */
static inline void
myri10ge_pio_copy64(void *to, uint64_t *from64, size_t size)
{
	volatile uint64_t *dst = (volatile uint64_t *)to;
	const uint64_t *src = from64;
	size_t words = size / 8;

	while (words != 0) {
		*dst++ = *src++;
		words--;
	}
}
#endif
314 
/*
 * Copy memory from the host to the NIC using the widest word the
 * build supports: 64-bit words on _LP64 builds, 32-bit words otherwise.
 * "size" must be a multiple of that word size (asserted), and to/from
 * must be naturally aligned.
 */
static inline void
myri10ge_pio_copy(void *to, void *from, size_t size)
{
#if defined(_LP64)
	ASSERT((size % 8) == 0);
	myri10ge_pio_copy64(to, (uint64_t *)from, size);
#else
	ASSERT((size % 4) == 0);
	myri10ge_pio_copy32(to, (uint32_t *)from, size);
#endif
}
332 
333 
334 /*
335  * Due to various bugs in Solaris (especially bug 6186772 where the
336  * TCP/UDP checksum is calculated incorrectly on mblk chains with more
337  * than two elements), and the design bug where hardware checksums are
338  * ignored on mblk chains with more than 2 elements, we need to
339  * allocate private pool of physically contiguous receive buffers.
340  */
341 
342 static void
343 myri10ge_jpool_init(struct myri10ge_slice_state *ss)
344 {
345 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
346 
347 	bzero(jpool, sizeof (*jpool));
348 	mutex_init(&jpool->mtx, NULL, MUTEX_DRIVER,
349 	    ss->mgp->icookie);
350 	jpool->head = NULL;
351 }
352 
353 static void
354 myri10ge_jpool_fini(struct myri10ge_slice_state *ss)
355 {
356 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
357 
358 	if (jpool->head != NULL) {
359 		cmn_err(CE_WARN,
360 		    "%s: BUG! myri10ge_jpool_fini called on non-empty pool\n",
361 		    ss->mgp->name);
362 	}
363 	mutex_destroy(&jpool->mtx);
364 }
365 
366 
367 /*
368  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
369  * at most 32 bytes at a time, so as to avoid involving the software
370  * pio handler in the nic.   We re-write the first segment's low
371  * DMA address to mark it valid only after we write the entire chunk
372  * in a burst
373  */
static inline void
myri10ge_submit_8rx(mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src)
{
	/* set the low bit of the first address while the batch is copied */
	src->addr_low |= BE_32(1);
	/* first four descriptors, then a barrier before the next burst */
	myri10ge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	/* remaining four descriptors */
	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address and write it last, after both bursts */
	src->addr_low &= ~(BE_32(1));
	dst->addr_low = src->addr_low;
	mb();
}
386 
/*
 * Move every buffer sitting on the per-CPU free lists onto the tail of
 * the slice's main jumbo pool list.  Each per-CPU list is detached with
 * an atomic swap, so it can be emptied while myri10ge_jfree_rtn() is
 * pushing onto it.
 *
 * jpool->head itself is modified without any locking here.
 * NOTE(review): callers visible in this file hold jpool->mtx or are
 * documented as requiring it -- confirm for any new caller.
 */
static void
myri10ge_pull_jpool(struct myri10ge_slice_state *ss)
{
	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
	struct myri10ge_jpool_entry *jtail, *j, *jfree;
	volatile uintptr_t *putp;
	uintptr_t put;
	int i;

	/* find tail */
	jtail = NULL;
	if (jpool->head != NULL) {
		j = jpool->head;
		while (j->next != NULL)
			j = j->next;
		jtail = j;
	}

	/*
	 * iterate over all per-CPU caches, and add contents into
	 * jpool
	 */
	for (i = 0; i < MYRI10GE_MAX_CPUS; i++) {
		/* take per-CPU free list */
		putp = (void *)&jpool->cpu[i & MYRI10GE_MAX_CPU_MASK].head;
		/* unlocked peek: skip lists that look empty */
		if (*putp == NULL)
			continue;
		/* detach the whole list in one atomic step */
		put = atomic_swap_ulong(putp, 0);
		jfree = (struct myri10ge_jpool_entry *)put;

		/* append to pool */
		if (jtail == NULL) {
			jpool->head = jfree;
		} else {
			jtail->next = jfree;
		}
		/* walk to the end of what was just appended */
		j = jfree;
		while (j->next != NULL)
			j = j->next;
		jtail = j;
	}
}
429 
430 /*
431  * Transfers buffers from the free pool to the nic
432  * Must be called holding the jpool mutex.
433  */
434 
435 static inline void
436 myri10ge_restock_jumbos(struct myri10ge_slice_state *ss)
437 {
438 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
439 	struct myri10ge_jpool_entry *j;
440 	myri10ge_rx_ring_t *rx;
441 	int i, idx, limit;
442 
443 	rx = &ss->rx_big;
444 	limit = ss->j_rx_cnt + (rx->mask + 1);
445 
446 	for (i = rx->cnt; i != limit; i++) {
447 		idx = i & (rx->mask);
448 		j = jpool->head;
449 		if (j == NULL) {
450 			myri10ge_pull_jpool(ss);
451 			j = jpool->head;
452 			if (j == NULL) {
453 				break;
454 			}
455 		}
456 		jpool->head = j->next;
457 		rx->info[idx].j = j;
458 		rx->shadow[idx].addr_low = j->dma.low;
459 		rx->shadow[idx].addr_high = j->dma.high;
460 		/* copy 4 descriptors (32-bytes) to the mcp at a time */
461 		if ((idx & 7) == 7) {
462 			myri10ge_submit_8rx(&rx->lanai[idx - 7],
463 			    &rx->shadow[idx - 7]);
464 		}
465 	}
466 	rx->cnt = i;
467 }
468 
469 /*
470  * Transfer buffers from the nic to the free pool.
471  * Should be called holding the jpool mutex
472  */
473 
474 static inline void
475 myri10ge_unstock_jumbos(struct myri10ge_slice_state *ss)
476 {
477 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
478 	struct myri10ge_jpool_entry *j;
479 	myri10ge_rx_ring_t *rx;
480 	int i;
481 
482 	mutex_enter(&jpool->mtx);
483 	rx = &ss->rx_big;
484 
485 	for (i = 0; i < rx->mask + 1; i++) {
486 		j = rx->info[i].j;
487 		rx->info[i].j = NULL;
488 		if (j == NULL)
489 			continue;
490 		j->next = jpool->head;
491 		jpool->head = j;
492 	}
493 	mutex_exit(&jpool->mtx);
494 
495 }
496 
497 
/*
 * Free routine which is called when the mblk allocated via
 * esballoc() is freed.   Here we return the jumbo buffer to the
 * free list of the CPU we are running on, without taking any lock.
 * myri10ge_pull_jpool() later moves those per-CPU lists into the
 * main pool.
 */

static void
myri10ge_jfree_rtn(void *arg)
{
	struct myri10ge_jpool_entry *j = (struct myri10ge_jpool_entry *)arg;
	struct myri10ge_jpool_stuff *jpool;
	volatile uintptr_t *putp;
	uintptr_t old, new;

	jpool = &j->ss->jpool;

	/* prepend buffer locklessly to per-CPU freelist */
	putp = (void *)&jpool->cpu[CPU->cpu_seqid & MYRI10GE_MAX_CPU_MASK].head;
	new = (uintptr_t)j;
	/* retry until the head is unchanged between the read and the CAS */
	do {
		old = *putp;
		j->next = (void *)old;
	} while (atomic_cas_ulong(putp, old, new) != old);
}
523 
/*
 * Destroy one jumbo buffer: unbind its DMA handle, free the DMA memory,
 * free the handle, and finally free the entry structure itself.  The
 * entry must have been fully set up (handle allocated, memory
 * allocated and bound), as done by myri10ge_add_jbuf().
 */
static void
myri10ge_remove_jbuf(struct myri10ge_jpool_entry *j)
{
	(void) ddi_dma_unbind_handle(j->dma_handle);
	ddi_dma_mem_free(&j->acc_handle);
	ddi_dma_free_handle(&j->dma_handle);
	kmem_free(j, sizeof (*j));
}
532 
533 
534 /*
535  * Allocates one physically contiguous descriptor
536  * and add it to the jumbo buffer pool.
537  */
538 
539 static int
540 myri10ge_add_jbuf(struct myri10ge_slice_state *ss)
541 {
542 	struct myri10ge_jpool_entry *j;
543 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
544 	ddi_dma_attr_t *rx_dma_attr;
545 	size_t real_length;
546 	ddi_dma_cookie_t cookie;
547 	uint_t count;
548 	int err;
549 
550 	if (myri10ge_mtu < 2048)
551 		rx_dma_attr = &myri10ge_rx_std_dma_attr;
552 	else
553 		rx_dma_attr = &myri10ge_rx_jumbo_dma_attr;
554 
555 again:
556 	j = (struct myri10ge_jpool_entry *)
557 	    kmem_alloc(sizeof (*j), KM_SLEEP);
558 	err = ddi_dma_alloc_handle(ss->mgp->dip, rx_dma_attr,
559 	    DDI_DMA_DONTWAIT, NULL, &j->dma_handle);
560 	if (err != DDI_SUCCESS)
561 		goto abort_with_j;
562 
563 	err = ddi_dma_mem_alloc(j->dma_handle, myri10ge_mtu,
564 	    &myri10ge_dev_access_attr,  DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
565 	    NULL, &j->buf, &real_length, &j->acc_handle);
566 	if (err != DDI_SUCCESS)
567 		goto abort_with_handle;
568 
569 	err = ddi_dma_addr_bind_handle(j->dma_handle, NULL, j->buf,
570 	    real_length, DDI_DMA_READ|DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
571 	    NULL, &cookie, &count);
572 	if (err != DDI_SUCCESS)
573 		goto abort_with_mem;
574 
575 	/*
576 	 * Make certain std MTU buffers do not cross a 4KB boundary:
577 	 *
578 	 * Setting dma_attr_align=4096 will do this, but the system
579 	 * will only allocate 1 RX buffer per 4KB page, rather than 2.
580 	 * Setting dma_attr_granular=4096 *seems* to work around this,
581 	 * but I'm paranoid about future systems no longer honoring
582 	 * this, so fall back to the safe, but memory wasting way if a
583 	 * buffer crosses a 4KB boundary.
584 	 */
585 
586 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
587 	    rx_dma_attr->dma_attr_align != 4096) {
588 		uint32_t start, end;
589 
590 		start = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
591 		end = start + myri10ge_mtu;
592 		if (((end >> 12) != (start >> 12)) && (start & 4095U)) {
593 			printf("std buffer crossed a 4KB boundary!\n");
594 			myri10ge_remove_jbuf(j);
595 			rx_dma_attr->dma_attr_align = 4096;
596 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
597 			goto again;
598 		}
599 	}
600 
601 	j->dma.low =
602 	    htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
603 	j->dma.high =
604 	    htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
605 	j->ss = ss;
606 
607 
608 	j->free_func.free_func = myri10ge_jfree_rtn;
609 	j->free_func.free_arg = (char *)j;
610 	mutex_enter(&jpool->mtx);
611 	j->next = jpool->head;
612 	jpool->head = j;
613 	jpool->num_alloc++;
614 	mutex_exit(&jpool->mtx);
615 	return (0);
616 
617 abort_with_mem:
618 	ddi_dma_mem_free(&j->acc_handle);
619 
620 abort_with_handle:
621 	ddi_dma_free_handle(&j->dma_handle);
622 
623 abort_with_j:
624 	kmem_free(j, sizeof (*j));
625 
626 	/*
627 	 * If an allocation failed, perhaps it failed because it could
628 	 * not satisfy granularity requirement.  Disable that, and
629 	 * try agin.
630 	 */
631 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
632 	    rx_dma_attr->dma_attr_align != 4096) {
633 			cmn_err(CE_NOTE,
634 			    "!alloc failed, reverting to gran=1\n");
635 			rx_dma_attr->dma_attr_align = 4096;
636 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
637 			goto again;
638 	}
639 	return (err);
640 }
641 
642 static int
643 myri10ge_jfree_cnt(struct myri10ge_jpool_stuff *jpool)
644 {
645 	int i;
646 	struct myri10ge_jpool_entry *j;
647 
648 	mutex_enter(&jpool->mtx);
649 	j = jpool->head;
650 	i = 0;
651 	while (j != NULL) {
652 		i++;
653 		j = j->next;
654 	}
655 	mutex_exit(&jpool->mtx);
656 	return (i);
657 }
658 
659 static int
660 myri10ge_add_jbufs(struct myri10ge_slice_state *ss, int num, int total)
661 {
662 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
663 	int allocated = 0;
664 	int err;
665 	int needed;
666 
667 	/*
668 	 * if total is set, user wants "num" jbufs in the pool,
669 	 * otherwise the user wants to "num" additional jbufs
670 	 * added to the pool
671 	 */
672 	if (total && jpool->num_alloc) {
673 		allocated = myri10ge_jfree_cnt(jpool);
674 		needed = num - allocated;
675 	} else {
676 		needed = num;
677 	}
678 
679 	while (needed > 0) {
680 		needed--;
681 		err = myri10ge_add_jbuf(ss);
682 		if (err == 0) {
683 			allocated++;
684 		}
685 	}
686 	return (allocated);
687 }
688 
689 static void
690 myri10ge_remove_jbufs(struct myri10ge_slice_state *ss)
691 {
692 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
693 	struct myri10ge_jpool_entry *j;
694 
695 	mutex_enter(&jpool->mtx);
696 	myri10ge_pull_jpool(ss);
697 	while (jpool->head != NULL) {
698 		jpool->num_alloc--;
699 		j = jpool->head;
700 		jpool->head = j->next;
701 		myri10ge_remove_jbuf(j);
702 	}
703 	mutex_exit(&jpool->mtx);
704 }
705 
706 static void
707 myri10ge_carve_up_jbufs_into_small_ring(struct myri10ge_slice_state *ss)
708 {
709 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
710 	struct myri10ge_jpool_entry *j = NULL;
711 	caddr_t ptr;
712 	uint32_t dma_low, dma_high;
713 	int idx, len;
714 	unsigned int alloc_size;
715 
716 	dma_low = dma_high = len = 0;
717 	alloc_size = myri10ge_small_bytes + MXGEFW_PAD;
718 	ptr = NULL;
719 	for (idx = 0; idx < ss->rx_small.mask + 1; idx++) {
720 		/* Allocate a jumbo frame and carve it into small frames */
721 		if (len < alloc_size) {
722 			mutex_enter(&jpool->mtx);
723 			/* remove jumbo from freelist */
724 			j = jpool->head;
725 			jpool->head = j->next;
726 			/* place it onto small list */
727 			j->next = ss->small_jpool;
728 			ss->small_jpool = j;
729 			mutex_exit(&jpool->mtx);
730 			len = myri10ge_mtu;
731 			dma_low = ntohl(j->dma.low);
732 			dma_high = ntohl(j->dma.high);
733 			ptr = j->buf;
734 		}
735 		ss->rx_small.info[idx].ptr = ptr;
736 		ss->rx_small.shadow[idx].addr_low = htonl(dma_low);
737 		ss->rx_small.shadow[idx].addr_high = htonl(dma_high);
738 		len -= alloc_size;
739 		ptr += alloc_size;
740 		dma_low += alloc_size;
741 	}
742 }
743 
744 /*
745  * Return the jumbo bufs we carved up for small to the jumbo pool
746  */
747 
748 static void
749 myri10ge_release_small_jbufs(struct myri10ge_slice_state *ss)
750 {
751 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
752 	struct myri10ge_jpool_entry *j = NULL;
753 
754 	mutex_enter(&jpool->mtx);
755 	while (ss->small_jpool != NULL) {
756 		j = ss->small_jpool;
757 		ss->small_jpool = j->next;
758 		j->next = jpool->head;
759 		jpool->head = j;
760 	}
761 	mutex_exit(&jpool->mtx);
762 	ss->jbufs_for_smalls = 0;
763 }
764 
765 static int
766 myri10ge_add_tx_handle(struct myri10ge_slice_state *ss)
767 {
768 	myri10ge_tx_ring_t *tx = &ss->tx;
769 	struct myri10ge_priv *mgp = ss->mgp;
770 	struct myri10ge_tx_dma_handle *handle;
771 	int err;
772 
773 	handle = kmem_zalloc(sizeof (*handle), KM_SLEEP);
774 	err = ddi_dma_alloc_handle(mgp->dip,
775 	    &myri10ge_tx_dma_attr,
776 	    DDI_DMA_SLEEP, NULL,
777 	    &handle->h);
778 	if (err) {
779 		static int limit = 0;
780 		if (limit == 0)
781 			cmn_err(CE_WARN, "%s: Falled to alloc tx dma handle\n",
782 			    mgp->name);
783 		limit++;
784 		kmem_free(handle, sizeof (*handle));
785 		return (err);
786 	}
787 	mutex_enter(&tx->handle_lock);
788 	MYRI10GE_SLICE_STAT_INC(tx_handles_alloced);
789 	handle->next = tx->free_tx_handles;
790 	tx->free_tx_handles = handle;
791 	mutex_exit(&tx->handle_lock);
792 	return (DDI_SUCCESS);
793 }
794 
795 static void
796 myri10ge_remove_tx_handles(struct myri10ge_slice_state *ss)
797 {
798 	myri10ge_tx_ring_t *tx = &ss->tx;
799 	struct myri10ge_tx_dma_handle *handle;
800 	mutex_enter(&tx->handle_lock);
801 
802 	handle = tx->free_tx_handles;
803 	while (handle != NULL) {
804 		tx->free_tx_handles = handle->next;
805 		ddi_dma_free_handle(&handle->h);
806 		kmem_free(handle, sizeof (*handle));
807 		handle = tx->free_tx_handles;
808 		MYRI10GE_SLICE_STAT_DEC(tx_handles_alloced);
809 	}
810 	mutex_exit(&tx->handle_lock);
811 	if (MYRI10GE_SLICE_STAT(tx_handles_alloced) != 0) {
812 		cmn_err(CE_WARN, "%s: %d tx dma handles allocated at close\n",
813 		    ss->mgp->name,
814 		    (int)MYRI10GE_SLICE_STAT(tx_handles_alloced));
815 	}
816 }
817 
818 static void
819 myri10ge_free_tx_handles(myri10ge_tx_ring_t *tx,
820     struct myri10ge_tx_dma_handle_head *list)
821 {
822 	mutex_enter(&tx->handle_lock);
823 	list->tail->next = tx->free_tx_handles;
824 	tx->free_tx_handles = list->head;
825 	mutex_exit(&tx->handle_lock);
826 }
827 
828 static void
829 myri10ge_free_tx_handle_slist(myri10ge_tx_ring_t *tx,
830     struct myri10ge_tx_dma_handle *handle)
831 {
832 	struct myri10ge_tx_dma_handle_head list;
833 
834 	if (handle == NULL)
835 		return;
836 	list.head = handle;
837 	list.tail = handle;
838 	while (handle != NULL) {
839 		list.tail = handle;
840 		handle = handle->next;
841 	}
842 	myri10ge_free_tx_handles(tx, &list);
843 }
844 
845 static int
846 myri10ge_alloc_tx_handles(struct myri10ge_slice_state *ss, int count,
847     struct myri10ge_tx_dma_handle **ret)
848 {
849 	myri10ge_tx_ring_t *tx = &ss->tx;
850 	struct myri10ge_tx_dma_handle *handle;
851 	int err, i;
852 
853 	mutex_enter(&tx->handle_lock);
854 	for (i = 0; i < count; i++) {
855 		handle = tx->free_tx_handles;
856 		while (handle == NULL) {
857 			mutex_exit(&tx->handle_lock);
858 			err = myri10ge_add_tx_handle(ss);
859 			if (err != DDI_SUCCESS) {
860 				goto abort_with_handles;
861 			}
862 			mutex_enter(&tx->handle_lock);
863 			handle = tx->free_tx_handles;
864 		}
865 		tx->free_tx_handles = handle->next;
866 		handle->next = *ret;
867 		*ret = handle;
868 	}
869 	mutex_exit(&tx->handle_lock);
870 	return (DDI_SUCCESS);
871 
872 abort_with_handles:
873 	myri10ge_free_tx_handle_slist(tx, *ret);
874 	return (err);
875 }
876 
877 
878 /*
879  * Frees DMA resources associated with the send ring
880  */
881 static void
882 myri10ge_unprepare_tx_ring(struct myri10ge_slice_state *ss)
883 {
884 	myri10ge_tx_ring_t *tx;
885 	struct myri10ge_tx_dma_handle_head handles;
886 	size_t bytes;
887 	int idx;
888 
889 	tx = &ss->tx;
890 	handles.head = NULL;
891 	handles.tail = NULL;
892 	for (idx = 0; idx < ss->tx.mask + 1; idx++) {
893 		if (tx->info[idx].m) {
894 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
895 			handles.head = tx->info[idx].handle;
896 			if (handles.tail == NULL)
897 				handles.tail = tx->info[idx].handle;
898 			freeb(tx->info[idx].m);
899 			tx->info[idx].m = 0;
900 			tx->info[idx].handle = 0;
901 		}
902 		tx->cp[idx].va = NULL;
903 		myri10ge_dma_free(&tx->cp[idx].dma);
904 	}
905 	bytes = sizeof (*tx->cp) * (tx->mask + 1);
906 	kmem_free(tx->cp, bytes);
907 	tx->cp = NULL;
908 	if (handles.head != NULL)
909 		myri10ge_free_tx_handles(tx, &handles);
910 	myri10ge_remove_tx_handles(ss);
911 }
912 
913 /*
914  * Allocates DMA handles associated with the send ring
915  */
916 static inline int
917 myri10ge_prepare_tx_ring(struct myri10ge_slice_state *ss)
918 {
919 	struct myri10ge_tx_dma_handle *handles;
920 	int h;
921 	size_t bytes;
922 
923 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
924 	ss->tx.cp = kmem_zalloc(bytes, KM_SLEEP);
925 	if (ss->tx.cp == NULL) {
926 		cmn_err(CE_WARN,
927 		    "%s: Failed to allocate tx copyblock storage\n",
928 		    ss->mgp->name);
929 		return (DDI_FAILURE);
930 	}
931 
932 
933 	/* allocate the TX copyblocks */
934 	for (h = 0; h < ss->tx.mask + 1; h++) {
935 		ss->tx.cp[h].va = myri10ge_dma_alloc(ss->mgp->dip,
936 		    4096, &myri10ge_rx_jumbo_dma_attr,
937 		    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
938 		    DDI_DMA_WRITE|DDI_DMA_STREAMING, &ss->tx.cp[h].dma, 1,
939 		    DDI_DMA_DONTWAIT);
940 		if (ss->tx.cp[h].va == NULL) {
941 			cmn_err(CE_WARN, "%s: Failed to allocate tx "
942 			    "copyblock %d\n", ss->mgp->name, h);
943 			goto abort_with_copyblocks;
944 		}
945 	}
946 	/* pre-allocate transmit handles */
947 	handles = NULL;
948 	(void) myri10ge_alloc_tx_handles(ss, myri10ge_tx_handles_initial,
949 	    &handles);
950 	if (handles != NULL)
951 		myri10ge_free_tx_handle_slist(&ss->tx, handles);
952 
953 	return (DDI_SUCCESS);
954 
955 abort_with_copyblocks:
956 	while (h > 0)  {
957 		h--;
958 		myri10ge_dma_free(&ss->tx.cp[h].dma);
959 	}
960 
961 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
962 	kmem_free(ss->tx.cp, bytes);
963 	ss->tx.cp = NULL;
964 	return (DDI_FAILURE);
965 }
966 
967 /*
968  * The eeprom strings on the lanaiX have the format
969  * SN=x\0
970  * MAC=x:x:x:x:x:x\0
971  * PT:ddd mmm xx xx:xx:xx xx\0
972  * PV:ddd mmm xx xx:xx:xx xx\0
973  */
974 static int
975 myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
976 {
977 #define	MYRI10GE_NEXT_STRING(p) while (ptr < limit && *ptr++)
978 #define	myri10ge_digit(c) (((c) >= '0' && (c) <= '9') ? ((c) - '0') :	\
979 		(((c) >= 'A' && (c) <= 'F') ? (10 + (c) - 'A') :	\
980 		(((c) >= 'a' && (c) <= 'f') ? (10 + (c) - 'a') : -1)))
981 
982 	char *ptr, *limit;
983 	int i, hv, lv;
984 
985 	ptr = mgp->eeprom_strings;
986 	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
987 
988 	while (*ptr != '\0' && ptr < limit) {
989 		if (memcmp(ptr, "MAC=", 4) == 0) {
990 			ptr += 4;
991 			if (myri10ge_verbose)
992 				printf("%s: mac address = %s\n", mgp->name,
993 				    ptr);
994 			mgp->mac_addr_string = ptr;
995 			for (i = 0; i < 6; i++) {
996 				if ((ptr + 2) > limit)
997 					goto abort;
998 
999 				if (*(ptr+1) == ':') {
1000 					hv = 0;
1001 					lv = myri10ge_digit(*ptr); ptr++;
1002 				} else {
1003 					hv = myri10ge_digit(*ptr); ptr++;
1004 					lv = myri10ge_digit(*ptr); ptr++;
1005 				}
1006 				mgp->mac_addr[i] = (hv << 4) | lv;
1007 				ptr++;
1008 			}
1009 		}
1010 		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
1011 			ptr += 3;
1012 			mgp->sn_str = (char *)ptr;
1013 		}
1014 		if (memcmp((const void *)ptr, "PC=", 3) == 0) {
1015 			ptr += 3;
1016 			mgp->pc_str = (char *)ptr;
1017 		}
1018 		MYRI10GE_NEXT_STRING(ptr);
1019 	}
1020 
1021 	return (0);
1022 
1023 abort:
1024 	cmn_err(CE_WARN, "%s: failed to parse eeprom_strings", mgp->name);
1025 	return (ENXIO);
1026 }
1027 
1028 
1029 /*
1030  * Determine the register set containing the PCI resource we
1031  * want to map: the memory-mappable part of the interface. We do
1032  * this by scanning the DDI "reg" property of the interface,
1033  * which is an array of mx_ddi_reg_set structures.
1034  */
1035 static int
1036 myri10ge_reg_set(dev_info_t *dip, int *reg_set, int *span,
1037     unsigned long *busno, unsigned long *devno,
1038     unsigned long *funcno)
1039 {
1040 
1041 #define	REGISTER_NUMBER(ip)	(ip[0] >>  0 & 0xff)
1042 #define	FUNCTION_NUMBER(ip)	(ip[0] >>  8 & 0x07)
1043 #define	DEVICE_NUMBER(ip)	(ip[0] >> 11 & 0x1f)
1044 #define	BUS_NUMBER(ip)		(ip[0] >> 16 & 0xff)
1045 #define	ADDRESS_SPACE(ip)	(ip[0] >> 24 & 0x03)
1046 #define	PCI_ADDR_HIGH(ip)	(ip[1])
1047 #define	PCI_ADDR_LOW(ip) 	(ip[2])
1048 #define	PCI_SPAN_HIGH(ip)	(ip[3])
1049 #define	PCI_SPAN_LOW(ip)	(ip[4])
1050 
1051 #define	MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
1052 #define	MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3
1053 
1054 	int *data, i, *rs;
1055 	uint32_t nelementsp;
1056 
1057 #ifdef MYRI10GE_REGSET_VERBOSE
1058 	char *address_space_name[] = { "Configuration Space",
1059 					"I/O Space",
1060 					"32-bit Memory Space",
1061 					"64-bit Memory Space"
1062 	};
1063 #endif
1064 
1065 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1066 	    "reg", &data, &nelementsp) != DDI_SUCCESS) {
1067 		printf("Could not determine register set.\n");
1068 		return (ENXIO);
1069 	}
1070 
1071 #ifdef MYRI10GE_REGSET_VERBOSE
1072 	printf("There are %d register sets.\n", nelementsp / 5);
1073 #endif
1074 	if (!nelementsp) {
1075 		printf("Didn't find any \"reg\" properties.\n");
1076 		ddi_prop_free(data);
1077 		return (ENODEV);
1078 	}
1079 
1080 	/* Scan for the register number. */
1081 	rs = &data[0];
1082 	*busno = BUS_NUMBER(rs);
1083 	*devno = DEVICE_NUMBER(rs);
1084 	*funcno = FUNCTION_NUMBER(rs);
1085 
1086 #ifdef MYRI10GE_REGSET_VERBOSE
1087 	printf("*** Scanning for register number.\n");
1088 #endif
1089 	for (i = 0; i < nelementsp / 5; i++) {
1090 		rs = &data[5 * i];
1091 #ifdef MYRI10GE_REGSET_VERBOSE
1092 		printf("Examining register set %d:\n", i);
1093 		printf("  Register number = %d.\n", REGISTER_NUMBER(rs));
1094 		printf("  Function number = %d.\n", FUNCTION_NUMBER(rs));
1095 		printf("  Device number   = %d.\n", DEVICE_NUMBER(rs));
1096 		printf("  Bus number      = %d.\n", BUS_NUMBER(rs));
1097 		printf("  Address space   = %d (%s ).\n", ADDRESS_SPACE(rs),
1098 		    address_space_name[ADDRESS_SPACE(rs)]);
1099 		printf("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH(rs),
1100 		    PCI_ADDR_LOW(rs));
1101 		printf("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH(rs),
1102 		    PCI_SPAN_LOW(rs));
1103 #endif
1104 		/* We are looking for a memory property. */
1105 
1106 		if (ADDRESS_SPACE(rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE ||
1107 		    ADDRESS_SPACE(rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
1108 			*reg_set = i;
1109 
1110 #ifdef MYRI10GE_REGSET_VERBOSE
1111 			printf("%s uses register set %d.\n",
1112 			    address_space_name[ADDRESS_SPACE(rs)], *reg_set);
1113 #endif
1114 
1115 			*span = (PCI_SPAN_LOW(rs));
1116 #ifdef MYRI10GE_REGSET_VERBOSE
1117 			printf("Board span is 0x%x\n", *span);
1118 #endif
1119 			break;
1120 		}
1121 	}
1122 
1123 	ddi_prop_free(data);
1124 
1125 	/* If no match, fail. */
1126 	if (i >= nelementsp / 5) {
1127 		return (EIO);
1128 	}
1129 
1130 	return (0);
1131 }
1132 
1133 
1134 static int
1135 myri10ge_load_firmware_from_zlib(struct myri10ge_priv *mgp, uint32_t *limit)
1136 {
1137 	void *inflate_buffer;
1138 	int rv, status;
1139 	size_t sram_size = mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE;
1140 	size_t destlen;
1141 	mcp_gen_header_t *hdr;
1142 	unsigned hdr_offset, i;
1143 
1144 
1145 	*limit = 0; /* -Wuninitialized */
1146 	status = 0;
1147 
1148 	inflate_buffer = kmem_zalloc(sram_size, KM_NOSLEEP);
1149 	if (!inflate_buffer) {
1150 		cmn_err(CE_WARN,
1151 		    "%s: Could not allocate buffer to inflate mcp\n",
1152 		    mgp->name);
1153 		return (ENOMEM);
1154 	}
1155 
1156 	destlen = sram_size;
1157 	rv = z_uncompress(inflate_buffer, &destlen, mgp->eth_z8e,
1158 	    mgp->eth_z8e_length);
1159 
1160 	if (rv != Z_OK) {
1161 		cmn_err(CE_WARN, "%s: Could not inflate mcp: %s\n",
1162 		    mgp->name, z_strerror(rv));
1163 		status = ENXIO;
1164 		goto abort;
1165 	}
1166 
1167 	*limit = (uint32_t)destlen;
1168 
1169 	hdr_offset = htonl(*(uint32_t *)(void *)((char *)inflate_buffer +
1170 	    MCP_HEADER_PTR_OFFSET));
1171 	hdr = (void *)((char *)inflate_buffer + hdr_offset);
1172 	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
1173 		cmn_err(CE_WARN, "%s: Bad firmware type: 0x%x\n", mgp->name,
1174 		    ntohl(hdr->mcp_type));
1175 		status = EIO;
1176 		goto abort;
1177 	}
1178 
1179 	/* save firmware version for kstat */
1180 	(void) strncpy(mgp->fw_version, hdr->version, sizeof (mgp->fw_version));
1181 	if (myri10ge_verbose)
1182 		printf("%s: firmware id: %s\n", mgp->name, hdr->version);
1183 
1184 	/* Copy the inflated firmware to NIC SRAM. */
1185 	for (i = 0; i < *limit; i += 256) {
1186 		myri10ge_pio_copy((char *)mgp->sram + MYRI10GE_FW_OFFSET + i,
1187 		    (char *)inflate_buffer + i,
1188 		    min(256U, (unsigned)(*limit - i)));
1189 		mb();
1190 		(void) *(int *)(void *)mgp->sram;
1191 		mb();
1192 	}
1193 
1194 abort:
1195 	kmem_free(inflate_buffer, sram_size);
1196 
1197 	return (status);
1198 
1199 }
1200 
1201 
1202 int
1203 myri10ge_send_cmd(struct myri10ge_priv *mgp, uint32_t cmd,
1204 		myri10ge_cmd_t *data)
1205 {
1206 	mcp_cmd_t *buf;
1207 	char buf_bytes[sizeof (*buf) + 8];
1208 	volatile mcp_cmd_response_t *response = mgp->cmd;
1209 	volatile char *cmd_addr =
1210 	    (volatile char *)mgp->sram + MXGEFW_ETH_CMD;
1211 	int sleep_total = 0;
1212 
1213 	/* ensure buf is aligned to 8 bytes */
1214 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1215 
1216 	buf->data0 = htonl(data->data0);
1217 	buf->data1 = htonl(data->data1);
1218 	buf->data2 = htonl(data->data2);
1219 	buf->cmd = htonl(cmd);
1220 	buf->response_addr.low = mgp->cmd_dma.low;
1221 	buf->response_addr.high = mgp->cmd_dma.high;
1222 	mutex_enter(&mgp->cmd_lock);
1223 	response->result = 0xffffffff;
1224 	mb();
1225 
1226 	myri10ge_pio_copy((void *)cmd_addr, buf, sizeof (*buf));
1227 
1228 	/* wait up to 20ms */
1229 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
1230 		mb();
1231 		if (response->result != 0xffffffff) {
1232 			if (response->result == 0) {
1233 				data->data0 = ntohl(response->data);
1234 				mutex_exit(&mgp->cmd_lock);
1235 				return (0);
1236 			} else if (ntohl(response->result)
1237 			    == MXGEFW_CMD_UNKNOWN) {
1238 				mutex_exit(&mgp->cmd_lock);
1239 				return (ENOSYS);
1240 			} else if (ntohl(response->result)
1241 			    == MXGEFW_CMD_ERROR_UNALIGNED) {
1242 				mutex_exit(&mgp->cmd_lock);
1243 				return (E2BIG);
1244 			} else {
1245 				cmn_err(CE_WARN,
1246 				    "%s: command %d failed, result = %d\n",
1247 				    mgp->name, cmd, ntohl(response->result));
1248 				mutex_exit(&mgp->cmd_lock);
1249 				return (ENXIO);
1250 			}
1251 		}
1252 		drv_usecwait(1000);
1253 	}
1254 	mutex_exit(&mgp->cmd_lock);
1255 	cmn_err(CE_WARN, "%s: command %d timed out, result = %d\n",
1256 	    mgp->name, cmd, ntohl(response->result));
1257 	return (EAGAIN);
1258 }
1259 
1260 /*
1261  * Enable or disable periodic RDMAs from the host to make certain
1262  * chipsets resend dropped PCIe messages
1263  */
1264 
1265 static void
1266 myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
1267 {
1268 	char buf_bytes[72];
1269 	volatile uint32_t *confirm;
1270 	volatile char *submit;
1271 	uint32_t *buf;
1272 	int i;
1273 
1274 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1275 
1276 	/* clear confirmation addr */
1277 	confirm = (volatile uint32_t *)mgp->cmd;
1278 	*confirm = 0;
1279 	mb();
1280 
1281 	/*
1282 	 * send an rdma command to the PCIe engine, and wait for the
1283 	 * response in the confirmation address.  The firmware should
1284 	 *  write a -1 there to indicate it is alive and well
1285 	 */
1286 
1287 	buf[0] = mgp->cmd_dma.high;		/* confirm addr MSW */
1288 	buf[1] = mgp->cmd_dma.low;		/* confirm addr LSW */
1289 	buf[2] = htonl(0xffffffff);		/* confirm data */
1290 	buf[3] = htonl(mgp->cmd_dma.high); 	/* dummy addr MSW */
1291 	buf[4] = htonl(mgp->cmd_dma.low); 	/* dummy addr LSW */
1292 	buf[5] = htonl(enable);			/* enable? */
1293 
1294 
1295 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_DUMMY_RDMA);
1296 
1297 	myri10ge_pio_copy((char *)submit, buf, 64);
1298 	mb();
1299 	drv_usecwait(1000);
1300 	mb();
1301 	i = 0;
1302 	while (*confirm != 0xffffffff && i < 20) {
1303 		drv_usecwait(1000);
1304 		i++;
1305 	}
1306 	if (*confirm != 0xffffffff) {
1307 		cmn_err(CE_WARN, "%s: dummy rdma %s failed (%p = 0x%x)",
1308 		    mgp->name,
1309 		    (enable ? "enable" : "disable"), (void*) confirm, *confirm);
1310 	}
1311 }
1312 
1313 static int
1314 myri10ge_load_firmware(struct myri10ge_priv *mgp)
1315 {
1316 	myri10ge_cmd_t cmd;
1317 	volatile uint32_t *confirm;
1318 	volatile char *submit;
1319 	char buf_bytes[72];
1320 	uint32_t *buf, size;
1321 	int status, i;
1322 
1323 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1324 
1325 	status = myri10ge_load_firmware_from_zlib(mgp, &size);
1326 	if (status) {
1327 		cmn_err(CE_WARN, "%s: firmware loading failed\n", mgp->name);
1328 		return (status);
1329 	}
1330 
1331 	/* clear confirmation addr */
1332 	confirm = (volatile uint32_t *)mgp->cmd;
1333 	*confirm = 0;
1334 	mb();
1335 
1336 	/*
1337 	 * send a reload command to the bootstrap MCP, and wait for the
1338 	 * response in the confirmation address.  The firmware should
1339 	 * write a -1 there to indicate it is alive and well
1340 	 */
1341 
1342 	buf[0] = mgp->cmd_dma.high;	/* confirm addr MSW */
1343 	buf[1] = mgp->cmd_dma.low;	/* confirm addr LSW */
1344 	buf[2] = htonl(0xffffffff);	/* confirm data */
1345 
1346 	/*
1347 	 * FIX: All newest firmware should un-protect the bottom of
1348 	 * the sram before handoff. However, the very first interfaces
1349 	 * do not. Therefore the handoff copy must skip the first 8 bytes
1350 	 */
1351 	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8); /* where the code starts */
1352 	buf[4] = htonl(size - 8); 	/* length of code */
1353 	buf[5] = htonl(8);		/* where to copy to */
1354 	buf[6] = htonl(0);		/* where to jump to */
1355 
1356 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_HANDOFF);
1357 
1358 	myri10ge_pio_copy((char *)submit, buf, 64);
1359 	mb();
1360 	drv_usecwait(1000);
1361 	mb();
1362 	i = 0;
1363 	while (*confirm != 0xffffffff && i < 1000) {
1364 		drv_usecwait(1000);
1365 		i++;
1366 	}
1367 	if (*confirm != 0xffffffff) {
1368 		cmn_err(CE_WARN, "%s: handoff failed (%p = 0x%x)",
1369 		    mgp->name, (void *) confirm, *confirm);
1370 
1371 		return (ENXIO);
1372 	}
1373 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1374 	if (status != 0) {
1375 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_GET_RX_RING_SIZE\n",
1376 		    mgp->name);
1377 		return (ENXIO);
1378 	}
1379 
1380 	mgp->max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
1381 	myri10ge_dummy_rdma(mgp, 1);
1382 	return (0);
1383 }
1384 
1385 static int
1386 myri10ge_m_unicst(void *arg, const uint8_t *addr)
1387 {
1388 	struct myri10ge_priv *mgp = arg;
1389 	myri10ge_cmd_t cmd;
1390 	int status;
1391 
1392 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1393 	    | (addr[2] << 8) | addr[3]);
1394 
1395 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1396 
1397 	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd);
1398 	if (status == 0 && (addr != mgp->mac_addr))
1399 		(void) memcpy(mgp->mac_addr, addr, sizeof (mgp->mac_addr));
1400 
1401 	return (status);
1402 }
1403 
1404 static int
1405 myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
1406 {
1407 	myri10ge_cmd_t cmd;
1408 	int status;
1409 
1410 	if (pause)
1411 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_FLOW_CONTROL,
1412 		    &cmd);
1413 	else
1414 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_FLOW_CONTROL,
1415 		    &cmd);
1416 
1417 	if (status) {
1418 		cmn_err(CE_WARN, "%s: Failed to set flow control mode\n",
1419 		    mgp->name);
1420 		return (ENXIO);
1421 	}
1422 	mgp->pause = pause;
1423 	return (0);
1424 }
1425 
1426 static void
1427 myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc)
1428 {
1429 	myri10ge_cmd_t cmd;
1430 	int status;
1431 
1432 	if (promisc)
1433 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_PROMISC, &cmd);
1434 	else
1435 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_PROMISC, &cmd);
1436 
1437 	if (status) {
1438 		cmn_err(CE_WARN, "%s: Failed to set promisc mode\n",
1439 		    mgp->name);
1440 	}
1441 }
1442 
1443 static int
1444 myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
1445 {
1446 	myri10ge_cmd_t cmd;
1447 	int status;
1448 	uint32_t len;
1449 	void *dmabench;
1450 	struct myri10ge_dma_stuff dmabench_dma;
1451 	char *test = " ";
1452 
1453 	/*
1454 	 * Run a small DMA test.
1455 	 * The magic multipliers to the length tell the firmware
1456 	 * tp do DMA read, write, or read+write tests.  The
1457 	 * results are returned in cmd.data0.  The upper 16
1458 	 * bits or the return is the number of transfers completed.
1459 	 * The lower 16 bits is the time in 0.5us ticks that the
1460 	 * transfers took to complete
1461 	 */
1462 
1463 	len = mgp->tx_boundary;
1464 
1465 	dmabench = myri10ge_dma_alloc(mgp->dip, len,
1466 	    &myri10ge_rx_jumbo_dma_attr, &myri10ge_dev_access_attr,
1467 	    DDI_DMA_STREAMING,  DDI_DMA_RDWR|DDI_DMA_STREAMING,
1468 	    &dmabench_dma, 1, DDI_DMA_DONTWAIT);
1469 	mgp->read_dma = mgp->write_dma = mgp->read_write_dma = 0;
1470 	if (dmabench == NULL) {
1471 		cmn_err(CE_WARN, "%s dma benchmark aborted\n", mgp->name);
1472 		return (ENOMEM);
1473 	}
1474 
1475 	cmd.data0 = ntohl(dmabench_dma.low);
1476 	cmd.data1 = ntohl(dmabench_dma.high);
1477 	cmd.data2 = len * 0x10000;
1478 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1479 	if (status != 0) {
1480 		test = "read";
1481 		goto abort;
1482 	}
1483 	mgp->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1484 
1485 	cmd.data0 = ntohl(dmabench_dma.low);
1486 	cmd.data1 = ntohl(dmabench_dma.high);
1487 	cmd.data2 = len * 0x1;
1488 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1489 	if (status != 0) {
1490 		test = "write";
1491 		goto abort;
1492 	}
1493 	mgp->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1494 
1495 	cmd.data0 = ntohl(dmabench_dma.low);
1496 	cmd.data1 = ntohl(dmabench_dma.high);
1497 	cmd.data2 = len * 0x10001;
1498 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1499 	if (status != 0) {
1500 		test = "read/write";
1501 		goto abort;
1502 	}
1503 	mgp->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
1504 	    (cmd.data0 & 0xffff);
1505 
1506 
1507 abort:
1508 	myri10ge_dma_free(&dmabench_dma);
1509 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
1510 		cmn_err(CE_WARN, "%s %s dma benchmark failed\n", mgp->name,
1511 		    test);
1512 	return (status);
1513 }
1514 
1515 static int
1516 myri10ge_reset(struct myri10ge_priv *mgp)
1517 {
1518 	myri10ge_cmd_t cmd;
1519 	struct myri10ge_nic_stat *ethstat;
1520 	struct myri10ge_slice_state *ss;
1521 	int i, status;
1522 	size_t bytes;
1523 
1524 	/* send a reset command to the card to see if it is alive */
1525 	(void) memset(&cmd, 0, sizeof (cmd));
1526 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
1527 	if (status != 0) {
1528 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
1529 		return (ENXIO);
1530 	}
1531 
1532 	/* Now exchange information about interrupts  */
1533 
1534 	bytes = mgp->max_intr_slots * sizeof (*mgp->ss[0].rx_done.entry);
1535 	cmd.data0 = (uint32_t)bytes;
1536 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1537 
1538 	/*
1539 	 * Even though we already know how many slices are supported
1540 	 * via myri10ge_probe_slices() MXGEFW_CMD_GET_MAX_RSS_QUEUES
1541 	 * has magic side effects, and must be called after a reset.
1542 	 * It must be called prior to calling any RSS related cmds,
1543 	 * including assigning an interrupt queue for anything but
1544 	 * slice 0.  It must also be called *after*
1545 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1546 	 * the firmware to compute offsets.
1547 	 */
1548 
1549 	if (mgp->num_slices > 1) {
1550 
1551 		/* ask the maximum number of slices it supports */
1552 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1553 		    &cmd);
1554 		if (status != 0) {
1555 			cmn_err(CE_WARN,
1556 			    "%s: failed to get number of slices\n",
1557 			    mgp->name);
1558 			return (status);
1559 		}
1560 
1561 		/*
1562 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1563 		 * to setting up the interrupt queue DMA
1564 		 */
1565 
1566 		cmd.data0 = mgp->num_slices;
1567 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE |
1568 		    MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1569 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1570 		    &cmd);
1571 		if (status != 0) {
1572 			cmn_err(CE_WARN,
1573 			    "%s: failed to set number of slices\n",
1574 			    mgp->name);
1575 			return (status);
1576 		}
1577 	}
1578 	for (i = 0; i < mgp->num_slices; i++) {
1579 		ss = &mgp->ss[i];
1580 		cmd.data0 = ntohl(ss->rx_done.dma.low);
1581 		cmd.data1 = ntohl(ss->rx_done.dma.high);
1582 		cmd.data2 = i;
1583 		status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA,
1584 		    &cmd);
1585 	};
1586 
1587 	status |= myri10ge_send_cmd(mgp,  MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1588 	for (i = 0; i < mgp->num_slices; i++) {
1589 		ss = &mgp->ss[i];
1590 		ss->irq_claim = (volatile unsigned int *)
1591 		    (void *)(mgp->sram + cmd.data0 + 8 * i);
1592 	}
1593 
1594 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
1595 		status |= myri10ge_send_cmd(mgp,
1596 		    MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1597 		mgp->irq_deassert = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1598 	}
1599 
1600 	status |= myri10ge_send_cmd(mgp,
1601 	    MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1602 	mgp->intr_coal_delay_ptr = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1603 
1604 	if (status != 0) {
1605 		cmn_err(CE_WARN, "%s: failed set interrupt parameters\n",
1606 		    mgp->name);
1607 		return (status);
1608 	}
1609 
1610 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
1611 	(void) myri10ge_dma_test(mgp, MXGEFW_DMA_TEST);
1612 
1613 	/* reset mcp/driver shared state back to 0 */
1614 
1615 	for (i = 0; i < mgp->num_slices; i++) {
1616 		ss = &mgp->ss[i];
1617 		bytes = mgp->max_intr_slots *
1618 		    sizeof (*mgp->ss[0].rx_done.entry);
1619 		(void) memset(ss->rx_done.entry, 0, bytes);
1620 		ss->tx.req = 0;
1621 		ss->tx.done = 0;
1622 		ss->tx.pkt_done = 0;
1623 		ss->rx_big.cnt = 0;
1624 		ss->rx_small.cnt = 0;
1625 		ss->rx_done.idx = 0;
1626 		ss->rx_done.cnt = 0;
1627 		ss->rx_token = 0;
1628 		ss->tx.watchdog_done = 0;
1629 		ss->tx.watchdog_req = 0;
1630 		ss->tx.active = 0;
1631 		ss->tx.activate = 0;
1632 	}
1633 	mgp->watchdog_rx_pause = 0;
1634 	if (mgp->ksp_stat != NULL) {
1635 		ethstat = (struct myri10ge_nic_stat *)mgp->ksp_stat->ks_data;
1636 		ethstat->link_changes.value.ul = 0;
1637 	}
1638 	status = myri10ge_m_unicst(mgp, mgp->mac_addr);
1639 	myri10ge_change_promisc(mgp, 0);
1640 	(void) myri10ge_change_pause(mgp, mgp->pause);
1641 	return (status);
1642 }
1643 
1644 static int
1645 myri10ge_init_toeplitz(struct myri10ge_priv *mgp)
1646 {
1647 	myri10ge_cmd_t cmd;
1648 	int i, b, s, t, j;
1649 	int status;
1650 	uint32_t k[8];
1651 	uint32_t tmp;
1652 	uint8_t *key;
1653 
1654 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
1655 	    &cmd);
1656 	if (status != 0) {
1657 		cmn_err(CE_WARN, "%s: failed to get rss key\n",
1658 		    mgp->name);
1659 		return (EIO);
1660 	}
1661 	myri10ge_pio_copy32(mgp->rss_key,
1662 	    (uint32_t *)(void*)((char *)mgp->sram + cmd.data0),
1663 	    sizeof (mgp->rss_key));
1664 
1665 	mgp->toeplitz_hash_table = kmem_alloc(sizeof (uint32_t) * 12 * 256,
1666 	    KM_SLEEP);
1667 	key = (uint8_t *)mgp->rss_key;
1668 	t = 0;
1669 	for (b = 0; b < 12; b++) {
1670 		for (s = 0; s < 8; s++) {
1671 			/* Bits: b*8+s, ..., b*8+s+31 */
1672 			k[s] = 0;
1673 			for (j = 0; j < 32; j++) {
1674 				int bit = b*8+s+j;
1675 				bit = 0x1 & (key[bit / 8] >> (7 -(bit & 0x7)));
1676 				k[s] |= bit << (31 - j);
1677 			}
1678 		}
1679 
1680 		for (i = 0; i <= 0xff; i++) {
1681 			tmp = 0;
1682 			if (i & (1 << 7)) { tmp ^= k[0]; }
1683 			if (i & (1 << 6)) { tmp ^= k[1]; }
1684 			if (i & (1 << 5)) { tmp ^= k[2]; }
1685 			if (i & (1 << 4)) { tmp ^= k[3]; }
1686 			if (i & (1 << 3)) { tmp ^= k[4]; }
1687 			if (i & (1 << 2)) { tmp ^= k[5]; }
1688 			if (i & (1 << 1)) { tmp ^= k[6]; }
1689 			if (i & (1 << 0)) { tmp ^= k[7]; }
1690 			mgp->toeplitz_hash_table[t++] = tmp;
1691 		}
1692 	}
1693 	return (0);
1694 }
1695 
1696 static inline struct myri10ge_slice_state *
1697 myri10ge_toeplitz_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1698 {
1699 	struct tcphdr *hdr;
1700 	uint32_t saddr, daddr;
1701 	uint32_t hash, slice;
1702 	uint32_t *table = mgp->toeplitz_hash_table;
1703 	uint16_t src, dst;
1704 
1705 	/*
1706 	 * Note hashing order is reversed from how it is done
1707 	 * in the NIC, so as to generate the same hash value
1708 	 * for the connection to try to keep connections CPU local
1709 	 */
1710 
1711 	/* hash on IPv4 src/dst address */
1712 	saddr = ntohl(ip->ip_src.s_addr);
1713 	daddr = ntohl(ip->ip_dst.s_addr);
1714 	hash = table[(256 * 0) + ((daddr >> 24) & 0xff)];
1715 	hash ^= table[(256 * 1) + ((daddr >> 16) & 0xff)];
1716 	hash ^= table[(256 * 2) + ((daddr >> 8) & 0xff)];
1717 	hash ^= table[(256 * 3) + ((daddr) & 0xff)];
1718 	hash ^= table[(256 * 4) + ((saddr >> 24) & 0xff)];
1719 	hash ^= table[(256 * 5) + ((saddr >> 16) & 0xff)];
1720 	hash ^= table[(256 * 6) + ((saddr >> 8) & 0xff)];
1721 	hash ^= table[(256 * 7) + ((saddr) & 0xff)];
1722 	/* hash on TCP port, if required */
1723 	if ((myri10ge_rss_hash & MXGEFW_RSS_HASH_TYPE_TCP_IPV4) &&
1724 	    ip->ip_p == IPPROTO_TCP) {
1725 		hdr = (struct tcphdr *)(void *)
1726 		    (((uint8_t *)ip) +  (ip->ip_hl << 2));
1727 		src = ntohs(hdr->th_sport);
1728 		dst = ntohs(hdr->th_dport);
1729 
1730 		hash ^= table[(256 * 8) + ((dst >> 8) & 0xff)];
1731 		hash ^= table[(256 * 9) + ((dst) & 0xff)];
1732 		hash ^= table[(256 * 10) + ((src >> 8) & 0xff)];
1733 		hash ^= table[(256 * 11) + ((src) & 0xff)];
1734 	}
1735 	slice = (mgp->num_slices - 1) & hash;
1736 	return (&mgp->ss[slice]);
1737 
1738 }
1739 
1740 static inline struct myri10ge_slice_state *
1741 myri10ge_simple_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1742 {
1743 	struct tcphdr *hdr;
1744 	uint32_t slice, hash_val;
1745 
1746 
1747 	if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) {
1748 		return (&mgp->ss[0]);
1749 	}
1750 	hdr = (struct tcphdr *)(void *)(((uint8_t *)ip) +  (ip->ip_hl << 2));
1751 
1752 	/*
1753 	 * Use the second byte of the *destination* address for
1754 	 * MXGEFW_RSS_HASH_TYPE_SRC_PORT, so as to match NIC's hashing
1755 	 */
1756 	hash_val = ntohs(hdr->th_dport) & 0xff;
1757 	if (myri10ge_rss_hash == MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT)
1758 		hash_val += ntohs(hdr->th_sport) & 0xff;
1759 
1760 	slice = (mgp->num_slices - 1) & hash_val;
1761 	return (&mgp->ss[slice]);
1762 }
1763 
1764 static inline struct myri10ge_slice_state *
1765 myri10ge_send_hash(struct myri10ge_priv *mgp, mblk_t *mp)
1766 {
1767 	unsigned int slice = 0;
1768 	struct ether_header *eh;
1769 	struct ether_vlan_header *vh;
1770 	struct ip *ip;
1771 	int ehl, ihl;
1772 
1773 	if (mgp->num_slices == 1)
1774 		return (&mgp->ss[0]);
1775 
1776 	if (myri10ge_tx_hash == 0) {
1777 		slice = CPU->cpu_id & (mgp->num_slices - 1);
1778 		return (&mgp->ss[slice]);
1779 	}
1780 
1781 	/*
1782 	 *  ensure it is a TCP or UDP over IPv4 packet, and that the
1783 	 *  headers are in the 1st mblk.  Otherwise, punt
1784 	 */
1785 	ehl = sizeof (*eh);
1786 	ihl = sizeof (*ip);
1787 	if ((MBLKL(mp)) <  (ehl + ihl + 8))
1788 		return (&mgp->ss[0]);
1789 	eh = (struct ether_header *)(void *)mp->b_rptr;
1790 	ip = (struct ip *)(void *)(eh + 1);
1791 	if (eh->ether_type != BE_16(ETHERTYPE_IP)) {
1792 		if (eh->ether_type != BE_16(ETHERTYPE_VLAN))
1793 			return (&mgp->ss[0]);
1794 		vh = (struct ether_vlan_header *)(void *)mp->b_rptr;
1795 		if (vh->ether_type != BE_16(ETHERTYPE_IP))
1796 			return (&mgp->ss[0]);
1797 		ehl += 4;
1798 		ip = (struct ip *)(void *)(vh + 1);
1799 	}
1800 	ihl = ip->ip_hl << 2;
1801 	if (MBLKL(mp) <  (ehl + ihl + 8))
1802 		return (&mgp->ss[0]);
1803 	switch (myri10ge_rss_hash) {
1804 	case MXGEFW_RSS_HASH_TYPE_IPV4:
1805 		/* fallthru */
1806 	case MXGEFW_RSS_HASH_TYPE_TCP_IPV4:
1807 		/* fallthru */
1808 	case (MXGEFW_RSS_HASH_TYPE_IPV4|MXGEFW_RSS_HASH_TYPE_TCP_IPV4):
1809 		return (myri10ge_toeplitz_send_hash(mgp, ip));
1810 	case MXGEFW_RSS_HASH_TYPE_SRC_PORT:
1811 		/* fallthru */
1812 	case MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT:
1813 		return (myri10ge_simple_send_hash(mgp, ip));
1814 	default:
1815 		break;
1816 	}
1817 	return (&mgp->ss[0]);
1818 }
1819 
1820 static int
1821 myri10ge_setup_slice(struct myri10ge_slice_state *ss)
1822 {
1823 	struct myri10ge_priv *mgp = ss->mgp;
1824 	myri10ge_cmd_t cmd;
1825 	int tx_ring_size, rx_ring_size;
1826 	int tx_ring_entries, rx_ring_entries;
1827 	int slice, status;
1828 	int allocated, idx;
1829 	size_t bytes;
1830 
1831 	slice = ss - mgp->ss;
1832 	cmd.data0 = slice;
1833 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1834 	tx_ring_size = cmd.data0;
1835 	cmd.data0 = slice;
1836 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1837 	if (status != 0)
1838 		return (status);
1839 	rx_ring_size = cmd.data0;
1840 
1841 	tx_ring_entries = tx_ring_size / sizeof (struct mcp_kreq_ether_send);
1842 	rx_ring_entries = rx_ring_size / sizeof (struct mcp_dma_addr);
1843 	ss->tx.mask = tx_ring_entries - 1;
1844 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
1845 
1846 	/* get the lanai pointers to the send and receive rings */
1847 
1848 	cmd.data0 = slice;
1849 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
1850 	ss->tx.lanai = (mcp_kreq_ether_send_t *)(void *)(mgp->sram + cmd.data0);
1851 	if (mgp->num_slices > 1) {
1852 		ss->tx.go = (char *)mgp->sram + MXGEFW_ETH_SEND_GO + 64 * slice;
1853 		ss->tx.stop = (char *)mgp->sram + MXGEFW_ETH_SEND_STOP +
1854 		    64 * slice;
1855 	} else {
1856 		ss->tx.go = NULL;
1857 		ss->tx.stop = NULL;
1858 	}
1859 
1860 	cmd.data0 = slice;
1861 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
1862 	ss->rx_small.lanai = (mcp_kreq_ether_recv_t *)
1863 	    (void *)(mgp->sram + cmd.data0);
1864 
1865 	cmd.data0 = slice;
1866 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
1867 	ss->rx_big.lanai = (mcp_kreq_ether_recv_t *)(void *)
1868 	    (mgp->sram + cmd.data0);
1869 
1870 	if (status != 0) {
1871 		cmn_err(CE_WARN,
1872 		    "%s: failed to get ring sizes or locations\n", mgp->name);
1873 		return (status);
1874 	}
1875 
1876 	status = ENOMEM;
1877 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
1878 	ss->rx_small.shadow = kmem_zalloc(bytes, KM_SLEEP);
1879 	if (ss->rx_small.shadow == NULL)
1880 		goto abort;
1881 	(void) memset(ss->rx_small.shadow, 0, bytes);
1882 
1883 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
1884 	ss->rx_big.shadow = kmem_zalloc(bytes, KM_SLEEP);
1885 	if (ss->rx_big.shadow == NULL)
1886 		goto abort_with_rx_small_shadow;
1887 	(void) memset(ss->rx_big.shadow, 0, bytes);
1888 
1889 	/* allocate the host info rings */
1890 
1891 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
1892 	ss->tx.info = kmem_zalloc(bytes, KM_SLEEP);
1893 	if (ss->tx.info == NULL)
1894 		goto abort_with_rx_big_shadow;
1895 	(void) memset(ss->tx.info, 0, bytes);
1896 
1897 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1898 	ss->rx_small.info = kmem_zalloc(bytes, KM_SLEEP);
1899 	if (ss->rx_small.info == NULL)
1900 		goto abort_with_tx_info;
1901 	(void) memset(ss->rx_small.info, 0, bytes);
1902 
1903 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1904 	ss->rx_big.info = kmem_zalloc(bytes, KM_SLEEP);
1905 	if (ss->rx_big.info == NULL)
1906 		goto abort_with_rx_small_info;
1907 	(void) memset(ss->rx_big.info, 0, bytes);
1908 
1909 	ss->tx.stall = ss->tx.sched = 0;
1910 	ss->tx.stall_early = ss->tx.stall_late = 0;
1911 
1912 	ss->jbufs_for_smalls = 1 + (1 + ss->rx_small.mask) /
1913 	    (myri10ge_mtu / (myri10ge_small_bytes + MXGEFW_PAD));
1914 
1915 	allocated = myri10ge_add_jbufs(ss,
1916 	    myri10ge_bigbufs_initial + ss->jbufs_for_smalls, 1);
1917 	if (allocated < ss->jbufs_for_smalls + myri10ge_bigbufs_initial) {
1918 		cmn_err(CE_WARN,
1919 		    "%s: Could not allocate enough receive buffers (%d/%d)\n",
1920 		    mgp->name, allocated,
1921 		    myri10ge_bigbufs_initial + ss->jbufs_for_smalls);
1922 		goto abort_with_jumbos;
1923 	}
1924 
1925 	myri10ge_carve_up_jbufs_into_small_ring(ss);
1926 	ss->j_rx_cnt = 0;
1927 
1928 	mutex_enter(&ss->jpool.mtx);
1929 	if (allocated < rx_ring_entries)
1930 		ss->jpool.low_water = allocated / 4;
1931 	else
1932 		ss->jpool.low_water = rx_ring_entries / 2;
1933 
1934 	/*
1935 	 * invalidate the big receive ring in case we do not
1936 	 * allocate sufficient jumbos to fill it
1937 	 */
1938 	(void) memset(ss->rx_big.shadow, 1,
1939 	    (ss->rx_big.mask + 1) * sizeof (ss->rx_big.shadow[0]));
1940 	for (idx = 7; idx <= ss->rx_big.mask; idx += 8) {
1941 		myri10ge_submit_8rx(&ss->rx_big.lanai[idx - 7],
1942 		    &ss->rx_big.shadow[idx - 7]);
1943 		mb();
1944 	}
1945 
1946 
1947 	myri10ge_restock_jumbos(ss);
1948 
1949 	for (idx = 7; idx <= ss->rx_small.mask; idx += 8) {
1950 		myri10ge_submit_8rx(&ss->rx_small.lanai[idx - 7],
1951 		    &ss->rx_small.shadow[idx - 7]);
1952 		mb();
1953 	}
1954 	ss->rx_small.cnt = ss->rx_small.mask + 1;
1955 
1956 	mutex_exit(&ss->jpool.mtx);
1957 
1958 	status = myri10ge_prepare_tx_ring(ss);
1959 
1960 	if (status != 0)
1961 		goto abort_with_small_jbufs;
1962 
1963 	cmd.data0 = ntohl(ss->fw_stats_dma.low);
1964 	cmd.data1 = ntohl(ss->fw_stats_dma.high);
1965 	cmd.data2 = sizeof (mcp_irq_data_t);
1966 	cmd.data2 |= (slice << 16);
1967 	bzero(ss->fw_stats, sizeof (*ss->fw_stats));
1968 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
1969 	if (status == ENOSYS) {
1970 		cmd.data0 = ntohl(ss->fw_stats_dma.low) +
1971 		    offsetof(mcp_irq_data_t, send_done_count);
1972 		cmd.data1 = ntohl(ss->fw_stats_dma.high);
1973 		status = myri10ge_send_cmd(mgp,
1974 		    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd);
1975 	}
1976 	if (status) {
1977 		cmn_err(CE_WARN, "%s: Couldn't set stats DMA\n", mgp->name);
1978 		goto abort_with_tx;
1979 	}
1980 
1981 	return (0);
1982 
1983 abort_with_tx:
1984 	myri10ge_unprepare_tx_ring(ss);
1985 
1986 abort_with_small_jbufs:
1987 	myri10ge_release_small_jbufs(ss);
1988 
1989 abort_with_jumbos:
1990 	if (allocated != 0) {
1991 		mutex_enter(&ss->jpool.mtx);
1992 		ss->jpool.low_water = 0;
1993 		mutex_exit(&ss->jpool.mtx);
1994 		myri10ge_unstock_jumbos(ss);
1995 		myri10ge_remove_jbufs(ss);
1996 	}
1997 
1998 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1999 	kmem_free(ss->rx_big.info, bytes);
2000 
2001 abort_with_rx_small_info:
2002 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2003 	kmem_free(ss->rx_small.info, bytes);
2004 
2005 abort_with_tx_info:
2006 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2007 	kmem_free(ss->tx.info, bytes);
2008 
2009 abort_with_rx_big_shadow:
2010 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2011 	kmem_free(ss->rx_big.shadow, bytes);
2012 
2013 abort_with_rx_small_shadow:
2014 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2015 	kmem_free(ss->rx_small.shadow, bytes);
2016 abort:
2017 	return (status);
2018 
2019 }
2020 
2021 static void
2022 myri10ge_teardown_slice(struct myri10ge_slice_state *ss)
2023 {
2024 	int tx_ring_entries, rx_ring_entries;
2025 	size_t bytes;
2026 
2027 	/* ignore slices that have not been fully setup */
2028 	if (ss->tx.cp == NULL)
2029 		return;
2030 	/* Free the TX copy buffers */
2031 	myri10ge_unprepare_tx_ring(ss);
2032 
2033 	/* stop passing returned buffers to firmware */
2034 
2035 	mutex_enter(&ss->jpool.mtx);
2036 	ss->jpool.low_water = 0;
2037 	mutex_exit(&ss->jpool.mtx);
2038 	myri10ge_release_small_jbufs(ss);
2039 
2040 	/* Release the free jumbo frame pool */
2041 	myri10ge_unstock_jumbos(ss);
2042 	myri10ge_remove_jbufs(ss);
2043 
2044 	rx_ring_entries = ss->rx_big.mask + 1;
2045 	tx_ring_entries = ss->tx.mask + 1;
2046 
2047 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2048 	kmem_free(ss->rx_big.info, bytes);
2049 
2050 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2051 	kmem_free(ss->rx_small.info, bytes);
2052 
2053 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2054 	kmem_free(ss->tx.info, bytes);
2055 
2056 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2057 	kmem_free(ss->rx_big.shadow, bytes);
2058 
2059 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2060 	kmem_free(ss->rx_small.shadow, bytes);
2061 
2062 }
2063 static int
2064 myri10ge_start_locked(struct myri10ge_priv *mgp)
2065 {
2066 	myri10ge_cmd_t cmd;
2067 	int status, big_pow2, i;
2068 	volatile uint8_t *itable;
2069 
2070 	status = DDI_SUCCESS;
2071 	/* Allocate DMA resources and receive buffers */
2072 
2073 	status = myri10ge_reset(mgp);
2074 	if (status != 0) {
2075 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
2076 		return (DDI_FAILURE);
2077 	}
2078 
2079 	if (mgp->num_slices > 1) {
2080 		cmd.data0 = mgp->num_slices;
2081 		cmd.data1 = 1; /* use MSI-X */
2082 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
2083 		    &cmd);
2084 		if (status != 0) {
2085 			cmn_err(CE_WARN,
2086 			    "%s: failed to set number of slices\n",
2087 			    mgp->name);
2088 			goto abort_with_nothing;
2089 		}
2090 		/* setup the indirection table */
2091 		cmd.data0 = mgp->num_slices;
2092 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
2093 		    &cmd);
2094 
2095 		status |= myri10ge_send_cmd(mgp,
2096 		    MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
2097 		if (status != 0) {
2098 			cmn_err(CE_WARN,
2099 			    "%s: failed to setup rss tables\n", mgp->name);
2100 		}
2101 
2102 		/* just enable an identity mapping */
2103 		itable = mgp->sram + cmd.data0;
2104 		for (i = 0; i < mgp->num_slices; i++)
2105 			itable[i] = (uint8_t)i;
2106 
2107 		if (myri10ge_rss_hash & MYRI10GE_TOEPLITZ_HASH) {
2108 			status = myri10ge_init_toeplitz(mgp);
2109 			if (status != 0) {
2110 				cmn_err(CE_WARN, "%s: failed to setup "
2111 				    "toeplitz tx hash table", mgp->name);
2112 				goto abort_with_nothing;
2113 			}
2114 		}
2115 		cmd.data0 = 1;
2116 		cmd.data1 = myri10ge_rss_hash;
2117 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_ENABLE,
2118 		    &cmd);
2119 		if (status != 0) {
2120 			cmn_err(CE_WARN,
2121 			    "%s: failed to enable slices\n", mgp->name);
2122 			goto abort_with_toeplitz;
2123 		}
2124 	}
2125 
2126 	for (i = 0; i < mgp->num_slices; i++) {
2127 		status = myri10ge_setup_slice(&mgp->ss[i]);
2128 		if (status != 0)
2129 			goto abort_with_slices;
2130 	}
2131 
2132 	/*
2133 	 * Tell the MCP how many buffers he has, and to
2134 	 *  bring the ethernet interface up
2135 	 *
2136 	 * Firmware needs the big buff size as a power of 2.  Lie and
2137 	 * tell him the buffer is larger, because we only use 1
2138 	 * buffer/pkt, and the mtu will prevent overruns
2139 	 */
2140 	big_pow2 = myri10ge_mtu + MXGEFW_PAD;
2141 	while ((big_pow2 & (big_pow2 - 1)) != 0)
2142 		big_pow2++;
2143 
2144 	/* now give firmware buffers sizes, and MTU */
2145 	cmd.data0 = myri10ge_mtu;
2146 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd);
2147 	cmd.data0 = myri10ge_small_bytes;
2148 	status |=
2149 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
2150 	cmd.data0 = big_pow2;
2151 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2152 	if (status) {
2153 		cmn_err(CE_WARN, "%s: Couldn't set buffer sizes\n", mgp->name);
2154 		goto abort_with_slices;
2155 	}
2156 
2157 
2158 	cmd.data0 = 1;
2159 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_TSO_MODE, &cmd);
2160 	if (status) {
2161 		cmn_err(CE_WARN, "%s: unable to setup TSO (%d)\n",
2162 		    mgp->name, status);
2163 	} else {
2164 		mgp->features |= MYRI10GE_TSO;
2165 	}
2166 
2167 	mgp->link_state = -1;
2168 	mgp->rdma_tags_available = 15;
2169 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd);
2170 	if (status) {
2171 		cmn_err(CE_WARN, "%s: unable to start ethernet\n", mgp->name);
2172 		goto abort_with_slices;
2173 	}
2174 	mgp->running = MYRI10GE_ETH_RUNNING;
2175 	return (DDI_SUCCESS);
2176 
2177 abort_with_slices:
2178 	for (i = 0; i < mgp->num_slices; i++)
2179 		myri10ge_teardown_slice(&mgp->ss[i]);
2180 
2181 	mgp->running = MYRI10GE_ETH_STOPPED;
2182 
2183 abort_with_toeplitz:
2184 	if (mgp->toeplitz_hash_table != NULL) {
2185 		kmem_free(mgp->toeplitz_hash_table,
2186 		    sizeof (uint32_t) * 12 * 256);
2187 		mgp->toeplitz_hash_table = NULL;
2188 	}
2189 
2190 abort_with_nothing:
2191 	return (DDI_FAILURE);
2192 }
2193 
2194 static void
2195 myri10ge_stop_locked(struct myri10ge_priv *mgp)
2196 {
2197 	int status, old_down_cnt;
2198 	myri10ge_cmd_t cmd;
2199 	int wait_time = 10;
2200 	int i, polling;
2201 
2202 	old_down_cnt = mgp->down_cnt;
2203 	mb();
2204 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2205 	if (status) {
2206 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
2207 	}
2208 
2209 	while (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2210 		delay(1 * drv_usectohz(1000000));
2211 		wait_time--;
2212 		if (wait_time == 0)
2213 			break;
2214 	}
2215 again:
2216 	if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2217 		cmn_err(CE_WARN, "%s: didn't get down irq\n", mgp->name);
2218 		for (i = 0; i < mgp->num_slices; i++) {
2219 			/*
2220 			 * take and release the rx lock to ensure
2221 			 * that no interrupt thread is blocked
2222 			 * elsewhere in the stack, preventing
2223 			 * completion
2224 			 */
2225 
2226 			mutex_enter(&mgp->ss[i].rx_lock);
2227 			printf("%s: slice %d rx irq idle\n",
2228 			    mgp->name, i);
2229 			mutex_exit(&mgp->ss[i].rx_lock);
2230 
2231 			/* verify that the poll handler is inactive */
2232 			mutex_enter(&mgp->ss->poll_lock);
2233 			polling = mgp->ss->rx_polling;
2234 			mutex_exit(&mgp->ss->poll_lock);
2235 			if (polling) {
2236 				printf("%s: slice %d is polling\n",
2237 				    mgp->name, i);
2238 				delay(1 * drv_usectohz(1000000));
2239 				goto again;
2240 			}
2241 		}
2242 		delay(1 * drv_usectohz(1000000));
2243 		if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2244 			cmn_err(CE_WARN, "%s: Never got down irq\n", mgp->name);
2245 		}
2246 	}
2247 
2248 	for (i = 0; i < mgp->num_slices; i++)
2249 		myri10ge_teardown_slice(&mgp->ss[i]);
2250 
2251 	if (mgp->toeplitz_hash_table != NULL) {
2252 		kmem_free(mgp->toeplitz_hash_table,
2253 		    sizeof (uint32_t) * 12 * 256);
2254 		mgp->toeplitz_hash_table = NULL;
2255 	}
2256 	mgp->running = MYRI10GE_ETH_STOPPED;
2257 }
2258 
2259 static int
2260 myri10ge_m_start(void *arg)
2261 {
2262 	struct myri10ge_priv *mgp = arg;
2263 	int status;
2264 
2265 	mutex_enter(&mgp->intrlock);
2266 
2267 	if (mgp->running != MYRI10GE_ETH_STOPPED) {
2268 		mutex_exit(&mgp->intrlock);
2269 		return (DDI_FAILURE);
2270 	}
2271 	status = myri10ge_start_locked(mgp);
2272 	mutex_exit(&mgp->intrlock);
2273 
2274 	if (status != DDI_SUCCESS)
2275 		return (status);
2276 
2277 	/* start the watchdog timer */
2278 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
2279 	    mgp->timer_ticks);
2280 	return (DDI_SUCCESS);
2281 
2282 }
2283 
2284 static void
2285 myri10ge_m_stop(void *arg)
2286 {
2287 	struct myri10ge_priv *mgp = arg;
2288 
2289 	mutex_enter(&mgp->intrlock);
2290 	/* if the device not running give up */
2291 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
2292 		mutex_exit(&mgp->intrlock);
2293 		return;
2294 	}
2295 
2296 	mgp->running = MYRI10GE_ETH_STOPPING;
2297 	mutex_exit(&mgp->intrlock);
2298 	(void) untimeout(mgp->timer_id);
2299 	mutex_enter(&mgp->intrlock);
2300 	myri10ge_stop_locked(mgp);
2301 	mutex_exit(&mgp->intrlock);
2302 
2303 }
2304 
/*
 * Update the broadcast/multicast receive counters for a received frame
 * and, for TCP or UDP over IPv4/IPv6, attach the checksum reported by
 * the NIC to the mblk as a partial checksum (HCK_PARTIALCKSUM).
 *
 *   mp   - received frame; b_rptr points at the ethernet header
 *   s    - receive statistics to update
 *   csum - checksum value for this frame as passed in by the caller
 *
 * Frames that are neither IPv4 nor IPv6, that carry another protocol,
 * or whose mblk is longer than the length the IP header claims are
 * left without checksum metadata.
 */
static inline void
myri10ge_rx_csum(mblk_t *mp, struct myri10ge_rx_ring_stats *s, uint32_t csum)
{
	struct ether_header *eh;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t start, stuff, end, partial, hdrlen;


	/* byte-swap the low 16 bits of the value handed in */
	csum = ntohs((uint16_t)csum);
	eh = (struct ether_header *)(void *)mp->b_rptr;
	hdrlen = sizeof (*eh);
	/* low bit of the first destination octet set: group address */
	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
			s->brdcstrcv++;
		else
			s->multircv++;
	}

	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
		/*
		 * fix checksum by subtracting 4 bytes after what the
		 * firmware thought was the end of the ether hdr
		 */
		partial = *(uint32_t *)
		    (void *)(mp->b_rptr + ETHERNET_HEADER_SIZE);
		/* ones-complement subtract with end-around carry, then fold */
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
		hdrlen += VLAN_TAGSZ;
	}

	/*
	 * NOTE(review): eh->ether_type is not re-read after the VLAN tag,
	 * so a frame that took the branch above appears to match neither
	 * test below and to return from the final else -- confirm whether
	 * tagged frames are meant to receive checksum metadata.
	 */
	if (eh->ether_type ==  BE_16(ETHERTYPE_IP)) {
		ip = (struct ip *)(void *)(mp->b_rptr + hdrlen);
		/* start/stuff/end are measured from the IP header */
		start = ip->ip_hl << 2;

		if (ip->ip_p == IPPROTO_TCP)
			stuff = start + offsetof(struct tcphdr, th_sum);
		else if (ip->ip_p == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = ntohs(ip->ip_len);
	} else if (eh->ether_type ==  BE_16(ETHERTYPE_IPV6)) {
		ip6 = (struct ip6_hdr *)(void *)(mp->b_rptr + hdrlen);
		start = sizeof (*ip6);
		/* only the first next-header value is examined */
		if (ip6->ip6_nxt == IPPROTO_TCP) {
			stuff = start + offsetof(struct tcphdr, th_sum);
		} else if (ip6->ip6_nxt == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = start + ntohs(ip6->ip6_plen);
		/*
		 * IPv6 headers do not contain a checksum, and hence
		 * do not checksum to zero, so they don't "fall out"
		 * of the partial checksum calculation like IPv4
		 * headers do.  We need to fix the partial checksum by
		 * subtracting the checksum of the IPv6 header.
		 */

		partial = myri10ge_csum_generic((uint16_t *)ip6, sizeof (*ip6));
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
	} else {
		return;
	}

	if (MBLKL(mp) > hdrlen + end) {
		/* padded frame, so hw csum may be invalid */
		return;
	}

	mac_hcksum_set(mp, start, stuff, end, csum, HCK_PARTIALCKSUM);
}
2384 
2385 static mblk_t *
2386 myri10ge_rx_done_small(struct myri10ge_slice_state *ss, uint32_t len,
2387     uint32_t csum)
2388 {
2389 	mblk_t *mp;
2390 	myri10ge_rx_ring_t *rx;
2391 	int idx;
2392 
2393 	rx = &ss->rx_small;
2394 	idx = rx->cnt & rx->mask;
2395 	ss->rx_small.cnt++;
2396 
2397 	/* allocate a new buffer to pass up the stack */
2398 	mp = allocb(len + MXGEFW_PAD, 0);
2399 	if (mp == NULL) {
2400 		MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_small_nobuf);
2401 		goto abort;
2402 	}
2403 	bcopy(ss->rx_small.info[idx].ptr,
2404 	    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2405 	mp->b_wptr += len + MXGEFW_PAD;
2406 	mp->b_rptr += MXGEFW_PAD;
2407 
2408 	ss->rx_stats.ibytes += len;
2409 	ss->rx_stats.ipackets += 1;
2410 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2411 
2412 abort:
2413 	if ((idx & 7) == 7) {
2414 		myri10ge_submit_8rx(&rx->lanai[idx - 7],
2415 		    &rx->shadow[idx - 7]);
2416 	}
2417 
2418 	return (mp);
2419 }
2420 
2421 
/*
 * Consume the next big (jumbo) receive buffer and return it as an mblk.
 *
 * Normally the jumbo buffer itself is loaned to the stack through
 * desballoc() with the pool entry's free_func.  When the count of
 * buffers outstanding (rx_big.cnt - j_rx_cnt) is still below 16 after
 * a restock, the frame is copied into a freshly allocated mblk instead
 * and the jumbo buffer is handed back via myri10ge_jfree_rtn().
 *
 * Returns NULL if the ring slot holds no buffer or if no mblk could be
 * allocated; in the latter case the jumbo buffer is released and
 * rx_big_nobuf is incremented.
 */
static mblk_t *
myri10ge_rx_done_big(struct myri10ge_slice_state *ss, uint32_t len,
    uint32_t csum)
{
	struct myri10ge_jpool_stuff *jpool;
	struct myri10ge_jpool_entry *j;
	mblk_t *mp;
	int idx, num_owned_by_mcp;

	jpool = &ss->jpool;
	idx = ss->j_rx_cnt & ss->rx_big.mask;
	j = ss->rx_big.info[idx].j;

	if (j == NULL) {
		printf("%s: null j at idx=%d, rx_big.cnt = %d, j_rx_cnt=%d\n",
		    ss->mgp->name, idx, ss->rx_big.cnt, ss->j_rx_cnt);
		return (NULL);
	}


	/* take the buffer out of the ring slot before handing it on */
	ss->rx_big.info[idx].j = NULL;
	ss->j_rx_cnt++;


	/*
	 * Check to see if we are low on rx buffers.
	 * Note that we must leave at least 8 free so there are
	 * enough to free in a single 64-byte write.
	 */
	num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
	if (num_owned_by_mcp < jpool->low_water) {
		mutex_enter(&jpool->mtx);
		myri10ge_restock_jumbos(ss);
		mutex_exit(&jpool->mtx);
		num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
		/* if we are still low, then we have to copy */
		if (num_owned_by_mcp < 16) {
			MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_copy);
			/* allocate a new buffer to pass up the stack */
			mp = allocb(len + MXGEFW_PAD, 0);
			if (mp == NULL) {
				goto abort;
			}
			bcopy(j->buf,
			    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
			/* the copy is done, so the jumbo buffer can go back */
			myri10ge_jfree_rtn(j);
			/* push buffer back to NIC */
			mutex_enter(&jpool->mtx);
			myri10ge_restock_jumbos(ss);
			mutex_exit(&jpool->mtx);
			goto set_len;
		}
	}

	/* loan our buffer to the stack */
	mp = desballoc((unsigned char *)j->buf, myri10ge_mtu, 0, &j->free_func);
	if (mp == NULL) {
		goto abort;
	}

set_len:
	/* common to both paths: skip the pad and set the frame length */
	mp->b_rptr += MXGEFW_PAD;
	mp->b_wptr = ((unsigned char *) mp->b_rptr + len);

	ss->rx_stats.ibytes += len;
	ss->rx_stats.ipackets += 1;
	myri10ge_rx_csum(mp, &ss->rx_stats, csum);

	return (mp);

abort:
	/* no mblk available: release the jumbo buffer and drop the frame */
	myri10ge_jfree_rtn(j);
	MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_big_nobuf);
	return (NULL);
}
2497 
/*
 * Free all transmit buffers up until the specified index
 *
 * Walks the tx ring from tx->done until tx->pkt_done reaches mcp_index,
 * unbinding and collecting DMA handles, freeing mblks and folding the
 * per-slot statistics into the ring totals.  A stalled queue is woken
 * through mac_tx_ring_update() once enough slots are free, and the NIC
 * is told to stop polling the ring when nothing remains outstanding.
 * The collected DMA handles are returned in one call at the end.
 */
static inline void
myri10ge_tx_done(struct myri10ge_slice_state *ss, uint32_t mcp_index)
{
	myri10ge_tx_ring_t *tx;
	struct myri10ge_tx_dma_handle_head handles;
	int idx;
	int limit = 0;

	tx = &ss->tx;
	handles.head = NULL;
	handles.tail = NULL;
	while (tx->pkt_done != (int)mcp_index) {
		idx = tx->done & tx->mask;

		/*
		 * mblk & DMA handle attached only to first slot
		 * per buffer in the packet
		 */

		if (tx->info[idx].m) {
			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
			/* push the handle onto the local list for later free */
			tx->info[idx].handle->next = handles.head;
			handles.head = tx->info[idx].handle;
			if (handles.tail == NULL)
				handles.tail = tx->info[idx].handle;
			freeb(tx->info[idx].m);
			tx->info[idx].m = 0;
			tx->info[idx].handle = 0;
		}
		/* a slot with a non-zero packet count completes a packet */
		if (tx->info[idx].ostat.opackets != 0) {
			tx->stats.multixmt += tx->info[idx].ostat.multixmt;
			tx->stats.brdcstxmt += tx->info[idx].ostat.brdcstxmt;
			tx->stats.obytes += tx->info[idx].ostat.obytes;
			tx->stats.opackets += tx->info[idx].ostat.opackets;
			tx->info[idx].stat.un.all = 0;
			tx->pkt_done++;
		}

		tx->done++;
		/*
		 * if we stalled the queue, wake it.  But Wait until
		 * we have at least 1/2 our slots free.
		 */
		if ((tx->req - tx->done) < (tx->mask >> 1) &&
		    tx->stall != tx->sched) {
			mutex_enter(&ss->tx.lock);
			tx->sched = tx->stall;
			mutex_exit(&ss->tx.lock);
			mac_tx_ring_update(ss->mgp->mh, tx->rh);
		}

		/* limit potential for livelock */
		if (unlikely(++limit >  2 * tx->mask))
			break;
	}
	if (tx->req == tx->done && tx->stop != NULL) {
		/*
		 * Nic has sent all pending requests, allow him
		 * to stop polling this queue
		 */
		mutex_enter(&tx->lock);
		/* re-check under the lock before writing the stop word */
		if (tx->req == tx->done && tx->active) {
			*(int *)(void *)tx->stop = 1;
			tx->active = 0;
			mb();
		}
		mutex_exit(&tx->lock);
	}
	if (handles.head != NULL)
		myri10ge_free_tx_handles(tx, &handles);
}
2572 
2573 static void
2574 myri10ge_mbl_init(struct myri10ge_mblk_list *mbl)
2575 {
2576 	mbl->head = NULL;
2577 	mbl->tail = &mbl->head;
2578 	mbl->cnt = 0;
2579 }
2580 
2581 /*ARGSUSED*/
2582 void
2583 myri10ge_mbl_append(struct myri10ge_slice_state *ss,
2584     struct myri10ge_mblk_list *mbl, mblk_t *mp)
2585 {
2586 	*(mbl->tail) = mp;
2587 	mbl->tail = &mp->b_next;
2588 	mp->b_next = NULL;
2589 	mbl->cnt++;
2590 }
2591 
2592 
2593 static inline void
2594 myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
2595     struct myri10ge_mblk_list *mbl, int limit, boolean_t *stop)
2596 {
2597 	myri10ge_rx_done_t *rx_done = &ss->rx_done;
2598 	struct myri10ge_priv *mgp = ss->mgp;
2599 	mblk_t *mp;
2600 	struct lro_entry *lro;
2601 	uint16_t length;
2602 	uint16_t checksum;
2603 
2604 
2605 	while (rx_done->entry[rx_done->idx].length != 0) {
2606 		if (unlikely (*stop)) {
2607 			break;
2608 		}
2609 		length = ntohs(rx_done->entry[rx_done->idx].length);
2610 		length &= (~MXGEFW_RSS_HASH_MASK);
2611 
2612 		/* limit potential for livelock */
2613 		limit -= length;
2614 		if (unlikely(limit < 0))
2615 			break;
2616 
2617 		rx_done->entry[rx_done->idx].length = 0;
2618 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
2619 		if (length <= myri10ge_small_bytes)
2620 			mp = myri10ge_rx_done_small(ss, length, checksum);
2621 		else
2622 			mp = myri10ge_rx_done_big(ss, length, checksum);
2623 		if (mp != NULL) {
2624 			if (!myri10ge_lro ||
2625 			    0 != myri10ge_lro_rx(ss, mp, checksum, mbl))
2626 				myri10ge_mbl_append(ss, mbl, mp);
2627 		}
2628 		rx_done->cnt++;
2629 		rx_done->idx = rx_done->cnt & (mgp->max_intr_slots - 1);
2630 	}
2631 	while (ss->lro_active != NULL) {
2632 		lro = ss->lro_active;
2633 		ss->lro_active = lro->next;
2634 		myri10ge_lro_flush(ss, lro, mbl);
2635 	}
2636 }
2637 
2638 static void
2639 myri10ge_intr_rx(struct myri10ge_slice_state *ss)
2640 {
2641 	uint64_t gen;
2642 	struct myri10ge_mblk_list mbl;
2643 
2644 	myri10ge_mbl_init(&mbl);
2645 	if (mutex_tryenter(&ss->rx_lock) == 0)
2646 		return;
2647 	gen = ss->rx_gen_num;
2648 	myri10ge_clean_rx_done(ss, &mbl, MYRI10GE_POLL_NULL,
2649 	    &ss->rx_polling);
2650 	if (mbl.head != NULL)
2651 		mac_rx_ring(ss->mgp->mh, ss->rx_rh, mbl.head, gen);
2652 	mutex_exit(&ss->rx_lock);
2653 
2654 }
2655 
2656 static mblk_t *
2657 myri10ge_poll_rx(void *arg, int bytes)
2658 {
2659 	struct myri10ge_slice_state *ss = arg;
2660 	struct myri10ge_mblk_list mbl;
2661 	boolean_t dummy = B_FALSE;
2662 
2663 	if (bytes == 0)
2664 		return (NULL);
2665 
2666 	myri10ge_mbl_init(&mbl);
2667 	mutex_enter(&ss->rx_lock);
2668 	if (ss->rx_polling)
2669 		myri10ge_clean_rx_done(ss, &mbl, bytes, &dummy);
2670 	else
2671 		printf("%d: poll_rx: token=%d, polling=%d\n", (int)(ss -
2672 		    ss->mgp->ss), ss->rx_token, ss->rx_polling);
2673 	mutex_exit(&ss->rx_lock);
2674 	return (mbl.head);
2675 }
2676 
/*
 * Interrupt handler for one slice; arg0 is the slice state.
 *
 * Returns DDI_INTR_UNCLAIMED when the firmware stats block is not
 * marked valid, otherwise processes receives and transmit completions,
 * tracks link state and RDMA tag changes from the stats block, writes
 * the irq claim words and returns DDI_INTR_CLAIMED.
 */
/*ARGSUSED*/
static uint_t
myri10ge_intr(caddr_t arg0, caddr_t arg1)
{
	struct myri10ge_slice_state *ss =
	    (struct myri10ge_slice_state *)(void *)arg0;
	struct myri10ge_priv *mgp = ss->mgp;
	mcp_irq_data_t *stats = ss->fw_stats;
	myri10ge_tx_ring_t *tx = &ss->tx;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return (DDI_INTR_UNCLAIMED);
	}
	/* snapshot the flags; stats->valid itself is cleared below */
	valid = stats->valid;

	/* low bit indicates receives are present */
	if (valid & 1)
		myri10ge_intr_rx(ss);

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		if (!myri10ge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
		mb();
	} else {
		/* no need to wait for conf. that irq is low */
		stats->valid = 0;
	}

	/*
	 * Keep reaping transmit completions for as long as the valid
	 * byte reads non-zero (it is re-read on every pass).
	 */
	do {
		/* check for transmit completes and receives */
		send_done_count = ntohl(stats->send_done_count);
		if (send_done_count != tx->pkt_done)
			myri10ge_tx_done(ss, (int)send_done_count);
	} while (*((volatile uint8_t *) &stats->valid));

	if (stats->stats_updated) {
		if (mgp->link_state != stats->link_up || stats->link_down) {
			mgp->link_state = stats->link_up;
			if (stats->link_down) {
				/*
				 * myri10ge_stop_locked() watches down_cnt
				 * to learn that the link went down
				 */
				mgp->down_cnt += stats->link_down;
				mgp->link_state = 0;
			}
			if (mgp->link_state) {
				if (myri10ge_verbose)
					printf("%s: link up\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_UP);
			} else {
				if (myri10ge_verbose)
					printf("%s: link down\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_DOWN);
			}
			MYRI10GE_NIC_STAT_INC(link_changes);
		}
		if (mgp->rdma_tags_available !=
		    ntohl(ss->fw_stats->rdma_tags_available)) {
			mgp->rdma_tags_available =
			    ntohl(ss->fw_stats->rdma_tags_available);
			cmn_err(CE_NOTE, "%s: RDMA timed out! "
			    "%d tags left\n", mgp->name,
			    mgp->rdma_tags_available);
		}
	}

	mb();
	/* check to see if we have rx token to pass back */
	if (valid & 0x1) {
		mutex_enter(&ss->poll_lock);
		if (ss->rx_polling) {
			/* polling is active: hold on to the token for now */
			ss->rx_token = 1;
		} else {
			*ss->irq_claim = BE_32(3);
			ss->rx_token = 0;
		}
		mutex_exit(&ss->poll_lock);
	}
	/* the second claim word is written on every claimed interrupt */
	*(ss->irq_claim + 1) = BE_32(3);
	return (DDI_INTR_CLAIMED);
}
2762 
2763 /*
2764  * Add or remove a multicast address.  This is called with our
2765  * macinfo's lock held by GLD, so we do not need to worry about
2766  * our own locking here.
2767  */
2768 static int
2769 myri10ge_m_multicst(void *arg, boolean_t add, const uint8_t *multicastaddr)
2770 {
2771 	myri10ge_cmd_t cmd;
2772 	struct myri10ge_priv *mgp = arg;
2773 	int status, join_leave;
2774 
2775 	if (add)
2776 		join_leave = MXGEFW_JOIN_MULTICAST_GROUP;
2777 	else
2778 		join_leave = MXGEFW_LEAVE_MULTICAST_GROUP;
2779 	(void) memcpy(&cmd.data0, multicastaddr, 4);
2780 	(void) memcpy(&cmd.data1, multicastaddr + 4, 2);
2781 	cmd.data0 = htonl(cmd.data0);
2782 	cmd.data1 = htonl(cmd.data1);
2783 	status = myri10ge_send_cmd(mgp, join_leave, &cmd);
2784 	if (status == 0)
2785 		return (0);
2786 
2787 	cmn_err(CE_WARN, "%s: failed to set multicast address\n",
2788 	    mgp->name);
2789 	return (status);
2790 }
2791 
2792 
2793 static int
2794 myri10ge_m_promisc(void *arg, boolean_t on)
2795 {
2796 	struct myri10ge_priv *mgp = arg;
2797 
2798 	myri10ge_change_promisc(mgp, on);
2799 	return (0);
2800 }
2801 
2802 /*
2803  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2804  *  backwards one at a time and handle ring wraps
2805  */
2806 
2807 static inline void
2808 myri10ge_submit_req_backwards(myri10ge_tx_ring_t *tx,
2809     mcp_kreq_ether_send_t *src, int cnt)
2810 {
2811 	int idx, starting_slot;
2812 	starting_slot = tx->req;
2813 	while (cnt > 1) {
2814 		cnt--;
2815 		idx = (starting_slot + cnt) & tx->mask;
2816 		myri10ge_pio_copy(&tx->lanai[idx],
2817 		    &src[cnt], sizeof (*src));
2818 		mb();
2819 	}
2820 }
2821 
/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 *
 *   tx  - transmit ring; tx->req is advanced by cnt
 *   src - request array; src[0].flags is temporarily cleared and then
 *         restored by this function
 *   cnt - number of requests in src
 */

static inline void
myri10ge_submit_req(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
    int cnt)
{
	int idx, i;
	uint32_t *src_ints, *dst_ints;
	mcp_kreq_ether_send_t *srcp, *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	/* hold back the first request's flags until the chain is written */
	last_flags = src->flags;
	src->flags = 0;
	mb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy forwards, two requests at a time */
		for (i = 0; i < (cnt - 1); i += 2) {
			myri10ge_pio_copy(dstp, srcp, 2 * sizeof (*src));
			mb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * submit all but the first request, and ensure
		 *  that it is submitted below
		 */
		myri10ge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/*
		 * One request is still unwritten: the odd one left over
		 * by the pairwise loop, or the first request after the
		 * backwards copy.
		 */
		/* submit the first request */
		myri10ge_pio_copy(dstp, srcp, sizeof (*src));
		mb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags |= last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (uint32_t *)dst;
	dst_ints += 3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	mb();
	/* notify NIC to poll this tx ring */
	if (!tx->active && tx->go != NULL) {
		*(int *)(void *)tx->go = 1;
		tx->active = 1;
		tx->activate++;
		mb();
	}
}
2884 
2885 /* ARGSUSED */
2886 static inline void
2887 myri10ge_lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
2888 {
2889 	uint32_t lso_flag;
2890 	mac_lso_get(mp, mss, &lso_flag);
2891 	(*flags) |= lso_flag;
2892 }
2893 
2894 
2895 /* like pullupmsg, except preserve hcksum/LSO attributes */
2896 static int
2897 myri10ge_pullup(struct myri10ge_slice_state *ss, mblk_t *mp)
2898 {
2899 	uint32_t start, stuff, tx_offload_flags, mss;
2900 	int ok;
2901 
2902 	mss = 0;
2903 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
2904 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
2905 
2906 	ok = pullupmsg(mp, -1);
2907 	if (!ok) {
2908 		printf("pullupmsg failed");
2909 		return (DDI_FAILURE);
2910 	}
2911 	MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_pullup);
2912 	mac_hcksum_set(mp, start, stuff, NULL, NULL, tx_offload_flags);
2913 	if (tx_offload_flags & HW_LSO)
2914 		DB_LSOMSS(mp) = (uint16_t)mss;
2915 	lso_info_set(mp, mss, tx_offload_flags);
2916 	return (DDI_SUCCESS);
2917 }
2918 
2919 static inline void
2920 myri10ge_tx_stat(struct myri10ge_tx_pkt_stats *s, struct ether_header *eh,
2921     int opackets, int obytes)
2922 {
2923 	s->un.all = 0;
2924 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2925 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2926 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2927 			s->un.s.brdcstxmt = 1;
2928 		else
2929 			s->un.s.multixmt = 1;
2930 	}
2931 	s->un.s.opackets = (uint16_t)opackets;
2932 	s->un.s.obytes = obytes;
2933 }
2934 
2935 static int
2936 myri10ge_tx_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
2937     mcp_kreq_ether_send_t *req)
2938 {
2939 	myri10ge_tx_ring_t *tx = &ss->tx;
2940 	caddr_t ptr;
2941 	struct myri10ge_tx_copybuf *cp;
2942 	mblk_t *bp;
2943 	int idx, mblen, avail;
2944 	uint16_t len;
2945 
2946 	mutex_enter(&tx->lock);
2947 	avail = tx->mask - (tx->req - tx->done);
2948 	if (avail <= 1) {
2949 		mutex_exit(&tx->lock);
2950 		return (EBUSY);
2951 	}
2952 	idx = tx->req & tx->mask;
2953 	cp = &tx->cp[idx];
2954 	ptr = cp->va;
2955 	for (len = 0, bp = mp; bp != NULL; bp = bp->b_cont) {
2956 		mblen = MBLKL(bp);
2957 		bcopy(bp->b_rptr, ptr, mblen);
2958 		ptr += mblen;
2959 		len += mblen;
2960 	}
2961 	/* ensure runts are padded to 60 bytes */
2962 	if (len < 60) {
2963 		bzero(ptr, 64 - len);
2964 		len = 60;
2965 	}
2966 	req->addr_low = cp->dma.low;
2967 	req->addr_high = cp->dma.high;
2968 	req->length = htons(len);
2969 	req->pad = 0;
2970 	req->rdma_count = 1;
2971 	myri10ge_tx_stat(&tx->info[idx].stat,
2972 	    (struct ether_header *)(void *)cp->va, 1, len);
2973 	(void) ddi_dma_sync(cp->dma.handle, 0, len, DDI_DMA_SYNC_FORDEV);
2974 	myri10ge_submit_req(&ss->tx, req, 1);
2975 	mutex_exit(&tx->lock);
2976 	freemsg(mp);
2977 	return (DDI_SUCCESS);
2978 }
2979 
2980 
2981 static void
2982 myri10ge_send_locked(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *req_list,
2983     struct myri10ge_tx_buffer_state *tx_info,
2984     int count)
2985 {
2986 	int i, idx;
2987 
2988 	idx = 0; /* gcc -Wuninitialized */
2989 	/* store unmapping and bp info for tx irq handler */
2990 	for (i = 0; i < count; i++) {
2991 		idx = (tx->req + i) & tx->mask;
2992 		tx->info[idx].m = tx_info[i].m;
2993 		tx->info[idx].handle = tx_info[i].handle;
2994 	}
2995 	tx->info[idx].stat.un.all = tx_info[0].stat.un.all;
2996 
2997 	/* submit the frame to the nic */
2998 	myri10ge_submit_req(tx, req_list, count);
2999 
3000 
3001 }
3002 
3003 
3004 
3005 static void
3006 myri10ge_copydata(mblk_t *mp, int off, int len, caddr_t buf)
3007 {
3008 	mblk_t *bp;
3009 	int seglen;
3010 	uint_t count;
3011 
3012 	bp = mp;
3013 
3014 	while (off > 0) {
3015 		seglen = MBLKL(bp);
3016 		if (off < seglen)
3017 			break;
3018 		off -= seglen;
3019 		bp = bp->b_cont;
3020 	}
3021 	while (len > 0) {
3022 		seglen = MBLKL(bp);
3023 		count = min(seglen - off, len);
3024 		bcopy(bp->b_rptr + off, buf, count);
3025 		len -= count;
3026 		buf += count;
3027 		off = 0;
3028 		bp = bp->b_cont;
3029 	}
3030 }
3031 
3032 static int
3033 myri10ge_ether_parse_header(mblk_t *mp)
3034 {
3035 	struct ether_header eh_copy;
3036 	struct ether_header *eh;
3037 	int eth_hdr_len, seglen;
3038 
3039 	seglen = MBLKL(mp);
3040 	eth_hdr_len = sizeof (*eh);
3041 	if (seglen < eth_hdr_len) {
3042 		myri10ge_copydata(mp, 0, eth_hdr_len, (caddr_t)&eh_copy);
3043 		eh = &eh_copy;
3044 	} else {
3045 		eh = (struct ether_header *)(void *)mp->b_rptr;
3046 	}
3047 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
3048 		eth_hdr_len += 4;
3049 	}
3050 
3051 	return (eth_hdr_len);
3052 }
3053 
/*
 * Locate the IPv4 and TCP headers of an LSO packet, starting off bytes
 * into the chain mp, and return the total header length: off plus the
 * IP and TCP header lengths.  As a side effect a non-zero IP header
 * checksum is zeroed in the packet itself.
 *
 * When the headers are not contiguous in the first mblk they are
 * gathered into a local buffer for inspection.
 */
static int
myri10ge_lso_parse_header(mblk_t *mp, int off)
{
	char buf[128];
	int seglen, sum_off;
	struct ip *ip;
	struct tcphdr *tcp;

	seglen = MBLKL(mp);
	/* first make the fixed part of the IP header readable */
	if (seglen < off + sizeof (*ip)) {
		myri10ge_copydata(mp, off, sizeof (*ip), buf);
		ip = (struct ip *)(void *)buf;
	} else {
		ip = (struct ip *)(void *)(mp->b_rptr + off);
	}
	/* then the full IP header (with options) plus the TCP header */
	if (seglen < off + (ip->ip_hl << 2) + sizeof (*tcp)) {
		myri10ge_copydata(mp, off,
		    (ip->ip_hl << 2) + sizeof (*tcp), buf);
		ip = (struct ip *)(void *)buf;
	}
	tcp = (struct tcphdr *)(void *)((char *)ip + (ip->ip_hl << 2));

	/*
	 * NIC expects ip_sum to be zero.  Recent changes to
	 * OpenSolaris leave the correct ip checksum there, rather
	 * than the required zero, so we need to zero it.  Otherwise,
	 * the NIC will produce bad checksums when sending LSO packets.
	 */
	if (ip->ip_sum != 0) {
		if (((char *)ip) != buf) {
			/* ip points into mblk, so just zero it */
			ip->ip_sum = 0;
		} else {
			/*
			 * ip points into a copy, so walk the chain
			 * to find the ip_csum, then zero it
			 */
			sum_off = off + _PTRDIFF(&ip->ip_sum, buf);
			/* find the mblk holding the first checksum byte */
			while (sum_off > (int)(MBLKL(mp) - 1)) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
			/* the second byte may live in a later mblk */
			sum_off++;
			while (sum_off > MBLKL(mp) - 1) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
		}
	}
	/* ip_hl and th_off are both counted in 32-bit words */
	return (off + ((ip->ip_hl + tcp->th_off) << 2));
}
3107 
3108 static int
3109 myri10ge_tx_tso_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
3110     mcp_kreq_ether_send_t *req_list, int hdr_size, int pkt_size,
3111     uint16_t mss, uint8_t cksum_offset)
3112 {
3113 	myri10ge_tx_ring_t *tx = &ss->tx;
3114 	struct myri10ge_priv *mgp = ss->mgp;
3115 	mblk_t *bp;
3116 	mcp_kreq_ether_send_t *req;
3117 	struct myri10ge_tx_copybuf *cp;
3118 	caddr_t rptr, ptr;
3119 	int mblen, count, cum_len, mss_resid, tx_req, pkt_size_tmp;
3120 	int resid, avail, idx, hdr_size_tmp, tx_boundary;
3121 	int rdma_count;
3122 	uint32_t seglen, len, boundary, low, high_swapped;
3123 	uint16_t pseudo_hdr_offset = htons(mss);
3124 	uint8_t flags;
3125 
3126 	tx_boundary = mgp->tx_boundary;
3127 	hdr_size_tmp = hdr_size;
3128 	resid = tx_boundary;
3129 	count = 1;
3130 	mutex_enter(&tx->lock);
3131 
3132 	/* check to see if the slots are really there */
3133 	avail = tx->mask - (tx->req - tx->done);
3134 	if (unlikely(avail <=  MYRI10GE_MAX_SEND_DESC_TSO)) {
3135 		atomic_add_32(&tx->stall, 1);
3136 		mutex_exit(&tx->lock);
3137 		return (EBUSY);
3138 	}
3139 
3140 	/* copy */
3141 	cum_len = -hdr_size;
3142 	count = 0;
3143 	req = req_list;
3144 	idx = tx->mask & tx->req;
3145 	cp = &tx->cp[idx];
3146 	low = ntohl(cp->dma.low);
3147 	ptr = cp->va;
3148 	cp->len = 0;
3149 	if (mss) {
3150 		int payload = pkt_size - hdr_size;
3151 		uint16_t opackets = (payload / mss) + ((payload % mss) != 0);
3152 		tx->info[idx].ostat.opackets = opackets;
3153 		tx->info[idx].ostat.obytes = (opackets - 1) * hdr_size
3154 		    + pkt_size;
3155 	}
3156 	hdr_size_tmp = hdr_size;
3157 	mss_resid = mss;
3158 	flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3159 	tx_req = tx->req;
3160 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3161 		mblen = MBLKL(bp);
3162 		rptr = (caddr_t)bp->b_rptr;
3163 		len = min(hdr_size_tmp, mblen);
3164 		if (len) {
3165 			bcopy(rptr, ptr, len);
3166 			rptr += len;
3167 			ptr += len;
3168 			resid -= len;
3169 			mblen -= len;
3170 			hdr_size_tmp -= len;
3171 			cp->len += len;
3172 			if (hdr_size_tmp)
3173 				continue;
3174 			if (resid < mss) {
3175 				tx_req++;
3176 				idx = tx->mask & tx_req;
3177 				cp = &tx->cp[idx];
3178 				low = ntohl(cp->dma.low);
3179 				ptr = cp->va;
3180 				resid = tx_boundary;
3181 			}
3182 		}
3183 		while (mblen) {
3184 			len = min(mss_resid, mblen);
3185 			bcopy(rptr, ptr, len);
3186 			mss_resid -= len;
3187 			resid -= len;
3188 			mblen -= len;
3189 			rptr += len;
3190 			ptr += len;
3191 			cp->len += len;
3192 			if (mss_resid == 0) {
3193 				mss_resid = mss;
3194 				if (resid < mss) {
3195 					tx_req++;
3196 					idx = tx->mask & tx_req;
3197 					cp = &tx->cp[idx];
3198 					cp->len = 0;
3199 					low = ntohl(cp->dma.low);
3200 					ptr = cp->va;
3201 					resid = tx_boundary;
3202 				}
3203 			}
3204 		}
3205 	}
3206 
3207 	req = req_list;
3208 	pkt_size_tmp = pkt_size;
3209 	count = 0;
3210 	rdma_count = 0;
3211 	tx_req = tx->req;
3212 	while (pkt_size_tmp) {
3213 		idx = tx->mask & tx_req;
3214 		cp = &tx->cp[idx];
3215 		high_swapped = cp->dma.high;
3216 		low = ntohl(cp->dma.low);
3217 		len = cp->len;
3218 		if (len == 0) {
3219 			printf("len=0! pkt_size_tmp=%d, pkt_size=%d\n",
3220 			    pkt_size_tmp, pkt_size);
3221 			for (bp = mp; bp != NULL; bp = bp->b_cont) {
3222 				mblen = MBLKL(bp);
3223 				printf("mblen:%d\n", mblen);
3224 			}
3225 			pkt_size_tmp = pkt_size;
3226 			tx_req = tx->req;
3227 			while (pkt_size_tmp > 0) {
3228 				idx = tx->mask & tx_req;
3229 				cp = &tx->cp[idx];
3230 				printf("cp->len = %d\n", cp->len);
3231 				pkt_size_tmp -= cp->len;
3232 				tx_req++;
3233 			}
3234 			printf("dropped\n");
3235 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3236 			goto done;
3237 		}
3238 		pkt_size_tmp -= len;
3239 		while (len) {
3240 			while (len) {
3241 				uint8_t flags_next;
3242 				int cum_len_next;
3243 
3244 				boundary = (low + mgp->tx_boundary) &
3245 				    ~(mgp->tx_boundary - 1);
3246 				seglen = boundary - low;
3247 				if (seglen > len)
3248 					seglen = len;
3249 
3250 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3251 				cum_len_next = cum_len + seglen;
3252 				(req-rdma_count)->rdma_count = rdma_count + 1;
3253 				if (likely(cum_len >= 0)) {
3254 					/* payload */
3255 					int next_is_first, chop;
3256 
3257 					chop = (cum_len_next > mss);
3258 					cum_len_next = cum_len_next % mss;
3259 					next_is_first = (cum_len_next == 0);
3260 					flags |= chop *
3261 					    MXGEFW_FLAGS_TSO_CHOP;
3262 					flags_next |= next_is_first *
3263 					    MXGEFW_FLAGS_FIRST;
3264 					rdma_count |= -(chop | next_is_first);
3265 					rdma_count += chop & !next_is_first;
3266 				} else if (likely(cum_len_next >= 0)) {
3267 					/* header ends */
3268 					int small;
3269 
3270 					rdma_count = -1;
3271 					cum_len_next = 0;
3272 					seglen = -cum_len;
3273 					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
3274 					flags_next = MXGEFW_FLAGS_TSO_PLD |
3275 					    MXGEFW_FLAGS_FIRST |
3276 					    (small * MXGEFW_FLAGS_SMALL);
3277 				}
3278 				req->addr_high = high_swapped;
3279 				req->addr_low = htonl(low);
3280 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3281 				req->pad = 0; /* complete solid 16-byte block */
3282 				req->rdma_count = 1;
3283 				req->cksum_offset = cksum_offset;
3284 				req->length = htons(seglen);
3285 				req->flags = flags | ((cum_len & 1) *
3286 				    MXGEFW_FLAGS_ALIGN_ODD);
3287 				if (cksum_offset > seglen)
3288 					cksum_offset -= seglen;
3289 				else
3290 					cksum_offset = 0;
3291 				low += seglen;
3292 				len -= seglen;
3293 				cum_len = cum_len_next;
3294 				req++;
3295 				req->flags = 0;
3296 				flags = flags_next;
3297 				count++;
3298 				rdma_count++;
3299 			}
3300 		}
3301 		tx_req++;
3302 	}
3303 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3304 	do {
3305 		req--;
3306 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
3307 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3308 	    MXGEFW_FLAGS_FIRST)));
3309 
3310 	myri10ge_submit_req(tx, req_list, count);
3311 done:
3312 	mutex_exit(&tx->lock);
3313 	freemsg(mp);
3314 	return (DDI_SUCCESS);
3315 }
3316 
3317 /*
3318  * Try to send the chain of buffers described by the mp.  We must not
3319  * encapsulate more than eth->tx.req - eth->tx.done, or
3320  * MXGEFW_MAX_SEND_DESC, whichever is more.
3321  */
3322 
3323 static int
3324 myri10ge_send(struct myri10ge_slice_state *ss, mblk_t *mp,
3325     mcp_kreq_ether_send_t *req_list, struct myri10ge_tx_buffer_state *tx_info)
3326 {
3327 	struct myri10ge_priv *mgp = ss->mgp;
3328 	myri10ge_tx_ring_t *tx = &ss->tx;
3329 	mcp_kreq_ether_send_t *req;
3330 	struct myri10ge_tx_dma_handle *handles, *dma_handle = NULL;
3331 	mblk_t  *bp;
3332 	ddi_dma_cookie_t cookie;
3333 	int err, rv, count, avail, mblen, try_pullup, i, max_segs, maclen,
3334 	    rdma_count, cum_len, lso_hdr_size;
3335 	uint32_t start, stuff, tx_offload_flags;
3336 	uint32_t seglen, len, mss, boundary, low, high_swapped;
3337 	uint_t ncookies;
3338 	uint16_t pseudo_hdr_offset;
3339 	uint8_t flags, cksum_offset, odd_flag;
3340 	int pkt_size;
3341 	int lso_copy = myri10ge_lso_copy;
3342 	try_pullup = 1;
3343 
3344 again:
3345 	/* Setup checksum offloading, if needed */
3346 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
3347 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
3348 	if (tx_offload_flags & HW_LSO) {
3349 		max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3350 		if ((tx_offload_flags & HCK_PARTIALCKSUM) == 0) {
3351 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_lsobadflags);
3352 			freemsg(mp);
3353 			return (DDI_SUCCESS);
3354 		}
3355 	} else {
3356 		max_segs = MXGEFW_MAX_SEND_DESC;
3357 		mss = 0;
3358 	}
3359 	req = req_list;
3360 	cksum_offset = 0;
3361 	pseudo_hdr_offset = 0;
3362 
3363 	/* leave an extra slot keep the ring from wrapping */
3364 	avail = tx->mask - (tx->req - tx->done);
3365 
3366 	/*
3367 	 * If we have > MXGEFW_MAX_SEND_DESC, then any over-length
3368 	 * message will need to be pulled up in order to fit.
3369 	 * Otherwise, we are low on transmit descriptors, it is
3370 	 * probably better to stall and try again rather than pullup a
3371 	 * message to fit.
3372 	 */
3373 
3374 	if (avail < max_segs) {
3375 		err = EBUSY;
3376 		atomic_add_32(&tx->stall_early, 1);
3377 		goto stall;
3378 	}
3379 
3380 	/* find out how long the frame is and how many segments it is */
3381 	count = 0;
3382 	odd_flag = 0;
3383 	pkt_size = 0;
3384 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
3385 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3386 		dblk_t *dbp;
3387 		mblen = MBLKL(bp);
3388 		if (mblen == 0) {
3389 			/*
3390 			 * we can't simply skip over 0-length mblks
3391 			 * because the hardware can't deal with them,
3392 			 * and we could leak them.
3393 			 */
3394 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_zero_len);
3395 			err = EIO;
3396 			goto pullup;
3397 		}
3398 		/*
3399 		 * There's no advantage to copying most gesballoc
3400 		 * attached blocks, so disable lso copy in that case
3401 		 */
3402 		if (mss && lso_copy == 1 && ((dbp = bp->b_datap) != NULL)) {
3403 			if ((void *)dbp->db_lastfree != myri10ge_db_lastfree) {
3404 				lso_copy = 0;
3405 			}
3406 		}
3407 		pkt_size += mblen;
3408 		count++;
3409 	}
3410 
3411 	/* Try to pull up excessivly long chains */
3412 	if (count >= max_segs) {
3413 		err = myri10ge_pullup(ss, mp);
3414 		if (likely(err == DDI_SUCCESS)) {
3415 			count = 1;
3416 		} else {
3417 			if (count <  MYRI10GE_MAX_SEND_DESC_TSO) {
3418 				/*
3419 				 * just let the h/w send it, it will be
3420 				 * inefficient, but us better than dropping
3421 				 */
3422 				max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3423 			} else {
3424 				/* drop it */
3425 				MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3426 				freemsg(mp);
3427 				return (0);
3428 			}
3429 		}
3430 	}
3431 
3432 	cum_len = 0;
3433 	maclen = myri10ge_ether_parse_header(mp);
3434 
3435 	if (tx_offload_flags & HCK_PARTIALCKSUM) {
3436 
3437 		cksum_offset = start + maclen;
3438 		pseudo_hdr_offset = htons(stuff + maclen);
3439 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
3440 		flags |= MXGEFW_FLAGS_CKSUM;
3441 	}
3442 
3443 	lso_hdr_size = 0; /* -Wunitinialized */
3444 	if (mss) { /* LSO */
3445 		/* this removes any CKSUM flag from before */
3446 		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3447 		/*
3448 		 * parse the headers and set cum_len to a negative
3449 		 * value to reflect the offset of the TCP payload
3450 		 */
3451 		lso_hdr_size =  myri10ge_lso_parse_header(mp, maclen);
3452 		cum_len = -lso_hdr_size;
3453 		if ((mss < mgp->tx_boundary) && lso_copy) {
3454 			err = myri10ge_tx_tso_copy(ss, mp, req_list,
3455 			    lso_hdr_size, pkt_size, mss, cksum_offset);
3456 			return (err);
3457 		}
3458 
3459 		/*
3460 		 * for TSO, pseudo_hdr_offset holds mss.  The firmware
3461 		 * figures out where to put the checksum by parsing
3462 		 * the header.
3463 		 */
3464 
3465 		pseudo_hdr_offset = htons(mss);
3466 	} else if (pkt_size <= MXGEFW_SEND_SMALL_SIZE) {
3467 		flags |= MXGEFW_FLAGS_SMALL;
3468 		if (pkt_size < myri10ge_tx_copylen) {
3469 			req->cksum_offset = cksum_offset;
3470 			req->pseudo_hdr_offset = pseudo_hdr_offset;
3471 			req->flags = flags;
3472 			err = myri10ge_tx_copy(ss, mp, req);
3473 			return (err);
3474 		}
3475 		cum_len = 0;
3476 	}
3477 
3478 	/* pull one DMA handle for each bp from our freelist */
3479 	handles = NULL;
3480 	err = myri10ge_alloc_tx_handles(ss, count, &handles);
3481 	if (err != DDI_SUCCESS) {
3482 		err = DDI_FAILURE;
3483 		goto stall;
3484 	}
3485 	count = 0;
3486 	rdma_count = 0;
3487 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3488 		mblen = MBLKL(bp);
3489 		dma_handle = handles;
3490 		handles = handles->next;
3491 
3492 		rv = ddi_dma_addr_bind_handle(dma_handle->h, NULL,
3493 		    (caddr_t)bp->b_rptr, mblen,
3494 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
3495 		    &cookie, &ncookies);
3496 		if (unlikely(rv != DDI_DMA_MAPPED)) {
3497 			err = EIO;
3498 			try_pullup = 0;
3499 			dma_handle->next = handles;
3500 			handles = dma_handle;
3501 			goto abort_with_handles;
3502 		}
3503 
3504 		/* reserve the slot */
3505 		tx_info[count].m = bp;
3506 		tx_info[count].handle = dma_handle;
3507 
3508 		for (; ; ) {
3509 			low = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
3510 			high_swapped =
3511 			    htonl(MYRI10GE_HIGHPART_TO_U32(
3512 			    cookie.dmac_laddress));
3513 			len = (uint32_t)cookie.dmac_size;
3514 			while (len) {
3515 				uint8_t flags_next;
3516 				int cum_len_next;
3517 
3518 				boundary = (low + mgp->tx_boundary) &
3519 				    ~(mgp->tx_boundary - 1);
3520 				seglen = boundary - low;
3521 				if (seglen > len)
3522 					seglen = len;
3523 
3524 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3525 				cum_len_next = cum_len + seglen;
3526 				if (mss) {
3527 					(req-rdma_count)->rdma_count =
3528 					    rdma_count + 1;
3529 					if (likely(cum_len >= 0)) {
3530 						/* payload */
3531 						int next_is_first, chop;
3532 
3533 						chop = (cum_len_next > mss);
3534 						cum_len_next =
3535 						    cum_len_next % mss;
3536 						next_is_first =
3537 						    (cum_len_next == 0);
3538 						flags |= chop *
3539 						    MXGEFW_FLAGS_TSO_CHOP;
3540 						flags_next |= next_is_first *
3541 						    MXGEFW_FLAGS_FIRST;
3542 						rdma_count |=
3543 						    -(chop | next_is_first);
3544 						rdma_count +=
3545 						    chop & !next_is_first;
3546 					} else if (likely(cum_len_next >= 0)) {
3547 						/* header ends */
3548 						int small;
3549 
3550 						rdma_count = -1;
3551 						cum_len_next = 0;
3552 						seglen = -cum_len;
3553 						small = (mss <=
3554 						    MXGEFW_SEND_SMALL_SIZE);
3555 						flags_next =
3556 						    MXGEFW_FLAGS_TSO_PLD
3557 						    | MXGEFW_FLAGS_FIRST
3558 						    | (small *
3559 						    MXGEFW_FLAGS_SMALL);
3560 					}
3561 				}
3562 				req->addr_high = high_swapped;
3563 				req->addr_low = htonl(low);
3564 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3565 				req->pad = 0; /* complete solid 16-byte block */
3566 				req->rdma_count = 1;
3567 				req->cksum_offset = cksum_offset;
3568 				req->length = htons(seglen);
3569 				req->flags = flags | ((cum_len & 1) * odd_flag);
3570 				if (cksum_offset > seglen)
3571 					cksum_offset -= seglen;
3572 				else
3573 					cksum_offset = 0;
3574 				low += seglen;
3575 				len -= seglen;
3576 				cum_len = cum_len_next;
3577 				count++;
3578 				rdma_count++;
3579 				/*  make sure all the segments will fit */
3580 				if (unlikely(count >= max_segs)) {
3581 					MYRI10GE_ATOMIC_SLICE_STAT_INC(
3582 					    xmit_lowbuf);
3583 					/* may try a pullup */
3584 					err = EBUSY;
3585 					if (try_pullup)
3586 						try_pullup = 2;
3587 					goto abort_with_handles;
3588 				}
3589 				req++;
3590 				req->flags = 0;
3591 				flags = flags_next;
3592 				tx_info[count].m = 0;
3593 			}
3594 			ncookies--;
3595 			if (ncookies == 0)
3596 				break;
3597 			ddi_dma_nextcookie(dma_handle->h, &cookie);
3598 		}
3599 	}
3600 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3601 
3602 	if (mss) {
3603 		do {
3604 			req--;
3605 			req->flags |= MXGEFW_FLAGS_TSO_LAST;
3606 		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3607 		    MXGEFW_FLAGS_FIRST)));
3608 	}
3609 
3610 	/* calculate tx stats */
3611 	if (mss) {
3612 		uint16_t opackets;
3613 		int payload;
3614 
3615 		payload = pkt_size - lso_hdr_size;
3616 		opackets = (payload / mss) + ((payload % mss) != 0);
3617 		tx_info[0].stat.un.all = 0;
3618 		tx_info[0].ostat.opackets = opackets;
3619 		tx_info[0].ostat.obytes = (opackets - 1) * lso_hdr_size
3620 		    + pkt_size;
3621 	} else {
3622 		myri10ge_tx_stat(&tx_info[0].stat,
3623 		    (struct ether_header *)(void *)mp->b_rptr, 1, pkt_size);
3624 	}
3625 	mutex_enter(&tx->lock);
3626 
3627 	/* check to see if the slots are really there */
3628 	avail = tx->mask - (tx->req - tx->done);
3629 	if (unlikely(avail <= count)) {
3630 		mutex_exit(&tx->lock);
3631 		err = 0;
3632 		goto late_stall;
3633 	}
3634 
3635 	myri10ge_send_locked(tx, req_list, tx_info, count);
3636 	mutex_exit(&tx->lock);
3637 	return (DDI_SUCCESS);
3638 
3639 late_stall:
3640 	try_pullup = 0;
3641 	atomic_add_32(&tx->stall_late, 1);
3642 
3643 abort_with_handles:
3644 	/* unbind and free handles from previous mblks */
3645 	for (i = 0; i < count; i++) {
3646 		bp = tx_info[i].m;
3647 		tx_info[i].m = 0;
3648 		if (bp) {
3649 			dma_handle = tx_info[i].handle;
3650 			(void) ddi_dma_unbind_handle(dma_handle->h);
3651 			dma_handle->next = handles;
3652 			handles = dma_handle;
3653 			tx_info[i].handle = NULL;
3654 			tx_info[i].m = NULL;
3655 		}
3656 	}
3657 	myri10ge_free_tx_handle_slist(tx, handles);
3658 pullup:
3659 	if (try_pullup) {
3660 		err = myri10ge_pullup(ss, mp);
3661 		if (err != DDI_SUCCESS && try_pullup == 2) {
3662 			/* drop */
3663 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3664 			freemsg(mp);
3665 			return (0);
3666 		}
3667 		try_pullup = 0;
3668 		goto again;
3669 	}
3670 
3671 stall:
3672 	if (err != 0) {
3673 		if (err == EBUSY) {
3674 			atomic_add_32(&tx->stall, 1);
3675 		} else {
3676 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3677 		}
3678 	}
3679 	return (err);
3680 }
3681 
3682 static mblk_t *
3683 myri10ge_send_wrapper(void *arg, mblk_t *mp)
3684 {
3685 	struct myri10ge_slice_state *ss = arg;
3686 	int err = 0;
3687 	mcp_kreq_ether_send_t *req_list;
3688 #if defined(__i386)
3689 	/*
3690 	 * We need about 2.5KB of scratch space to handle transmits.
3691 	 * i86pc has only 8KB of kernel stack space, so we malloc the
3692 	 * scratch space there rather than keeping it on the stack.
3693 	 */
3694 	size_t req_size, tx_info_size;
3695 	struct myri10ge_tx_buffer_state *tx_info;
3696 	caddr_t req_bytes;
3697 
3698 	req_size = sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3699 	    + 8;
3700 	req_bytes = kmem_alloc(req_size, KM_SLEEP);
3701 	tx_info_size = sizeof (*tx_info) * (MYRI10GE_MAX_SEND_DESC_TSO + 1);
3702 	tx_info = kmem_alloc(tx_info_size, KM_SLEEP);
3703 #else
3704 	char req_bytes[sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3705 	    + 8];
3706 	struct myri10ge_tx_buffer_state tx_info[MYRI10GE_MAX_SEND_DESC_TSO + 1];
3707 #endif
3708 
3709 	/* ensure req_list entries are aligned to 8 bytes */
3710 	req_list = (struct mcp_kreq_ether_send *)
3711 	    (((unsigned long)req_bytes + 7UL) & ~7UL);
3712 
3713 	err = myri10ge_send(ss, mp, req_list, tx_info);
3714 
3715 #if defined(__i386)
3716 	kmem_free(tx_info, tx_info_size);
3717 	kmem_free(req_bytes, req_size);
3718 #endif
3719 	if (err)
3720 		return (mp);
3721 	else
3722 		return (NULL);
3723 }
3724 
3725 static int
3726 myri10ge_addmac(void *arg, const uint8_t *mac_addr)
3727 {
3728 	struct myri10ge_priv *mgp = arg;
3729 	int err;
3730 
3731 	if (mac_addr == NULL)
3732 		return (EINVAL);
3733 
3734 	mutex_enter(&mgp->intrlock);
3735 	if (mgp->macaddr_cnt) {
3736 		mutex_exit(&mgp->intrlock);
3737 		return (ENOSPC);
3738 	}
3739 	err = myri10ge_m_unicst(mgp, mac_addr);
3740 	if (!err)
3741 		mgp->macaddr_cnt++;
3742 
3743 	mutex_exit(&mgp->intrlock);
3744 	if (err)
3745 		return (err);
3746 
3747 	bcopy(mac_addr, mgp->mac_addr, sizeof (mgp->mac_addr));
3748 	return (0);
3749 }
3750 
3751 /*ARGSUSED*/
3752 static int
3753 myri10ge_remmac(void *arg, const uint8_t *mac_addr)
3754 {
3755 	struct myri10ge_priv *mgp = arg;
3756 
3757 	mutex_enter(&mgp->intrlock);
3758 	mgp->macaddr_cnt--;
3759 	mutex_exit(&mgp->intrlock);
3760 
3761 	return (0);
3762 }
3763 
3764 /*ARGSUSED*/
3765 static void
3766 myri10ge_fill_group(void *arg, mac_ring_type_t rtype, const int index,
3767     mac_group_info_t *infop, mac_group_handle_t gh)
3768 {
3769 	struct myri10ge_priv *mgp = arg;
3770 
3771 	if (rtype != MAC_RING_TYPE_RX)
3772 		return;
3773 
3774 	infop->mgi_driver = (mac_group_driver_t)mgp;
3775 	infop->mgi_start = NULL;
3776 	infop->mgi_stop = NULL;
3777 	infop->mgi_addmac = myri10ge_addmac;
3778 	infop->mgi_remmac = myri10ge_remmac;
3779 	infop->mgi_count = mgp->num_slices;
3780 }
3781 
3782 static int
3783 myri10ge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
3784 {
3785 	struct myri10ge_slice_state *ss;
3786 
3787 	ss = (struct myri10ge_slice_state *)rh;
3788 	mutex_enter(&ss->rx_lock);
3789 	ss->rx_gen_num = mr_gen_num;
3790 	mutex_exit(&ss->rx_lock);
3791 	return (0);
3792 }
3793 
3794 /*
3795  * Retrieve a value for one of the statistics for a particular rx ring
3796  */
3797 int
3798 myri10ge_rx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3799 {
3800 	struct myri10ge_slice_state *ss;
3801 
3802 	ss = (struct myri10ge_slice_state *)rh;
3803 	switch (stat) {
3804 	case MAC_STAT_RBYTES:
3805 		*val = ss->rx_stats.ibytes;
3806 		break;
3807 
3808 	case MAC_STAT_IPACKETS:
3809 		*val = ss->rx_stats.ipackets;
3810 		break;
3811 
3812 	default:
3813 		*val = 0;
3814 		return (ENOTSUP);
3815 	}
3816 
3817 	return (0);
3818 }
3819 
3820 /*
3821  * Retrieve a value for one of the statistics for a particular tx ring
3822  */
3823 int
3824 myri10ge_tx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3825 {
3826 	struct myri10ge_slice_state *ss;
3827 
3828 	ss = (struct myri10ge_slice_state *)rh;
3829 	switch (stat) {
3830 	case MAC_STAT_OBYTES:
3831 		*val = ss->tx.stats.obytes;
3832 		break;
3833 
3834 	case MAC_STAT_OPACKETS:
3835 		*val = ss->tx.stats.opackets;
3836 		break;
3837 
3838 	default:
3839 		*val = 0;
3840 		return (ENOTSUP);
3841 	}
3842 
3843 	return (0);
3844 }
3845 
3846 static int
3847 myri10ge_rx_ring_intr_disable(mac_intr_handle_t intrh)
3848 {
3849 	struct myri10ge_slice_state *ss;
3850 
3851 	ss = (struct myri10ge_slice_state *)intrh;
3852 	mutex_enter(&ss->poll_lock);
3853 	ss->rx_polling = B_TRUE;
3854 	mutex_exit(&ss->poll_lock);
3855 	return (0);
3856 }
3857 
3858 static int
3859 myri10ge_rx_ring_intr_enable(mac_intr_handle_t intrh)
3860 {
3861 	struct myri10ge_slice_state *ss;
3862 
3863 	ss = (struct myri10ge_slice_state *)intrh;
3864 	mutex_enter(&ss->poll_lock);
3865 	ss->rx_polling = B_FALSE;
3866 	if (ss->rx_token) {
3867 		*ss->irq_claim = BE_32(3);
3868 		ss->rx_token = 0;
3869 	}
3870 	mutex_exit(&ss->poll_lock);
3871 	return (0);
3872 }
3873 
3874 /*ARGSUSED*/
3875 static void
3876 myri10ge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
3877     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
3878 {
3879 	struct myri10ge_priv *mgp = arg;
3880 	struct myri10ge_slice_state *ss;
3881 	mac_intr_t *mintr = &infop->mri_intr;
3882 
3883 	ASSERT((unsigned int)ring_index < mgp->num_slices);
3884 
3885 	ss = &mgp->ss[ring_index];
3886 	switch (rtype) {
3887 	case MAC_RING_TYPE_RX:
3888 		ss->rx_rh = rh;
3889 		infop->mri_driver = (mac_ring_driver_t)ss;
3890 		infop->mri_start = myri10ge_ring_start;
3891 		infop->mri_stop = NULL;
3892 		infop->mri_poll = myri10ge_poll_rx;
3893 		infop->mri_stat = myri10ge_rx_ring_stat;
3894 		mintr->mi_handle = (mac_intr_handle_t)ss;
3895 		mintr->mi_enable = myri10ge_rx_ring_intr_enable;
3896 		mintr->mi_disable = myri10ge_rx_ring_intr_disable;
3897 		break;
3898 	case MAC_RING_TYPE_TX:
3899 		ss->tx.rh = rh;
3900 		infop->mri_driver = (mac_ring_driver_t)ss;
3901 		infop->mri_start = NULL;
3902 		infop->mri_stop = NULL;
3903 		infop->mri_tx = myri10ge_send_wrapper;
3904 		infop->mri_stat = myri10ge_tx_ring_stat;
3905 		break;
3906 	default:
3907 		break;
3908 	}
3909 }
3910 
3911 static void
3912 myri10ge_nic_stat_destroy(struct myri10ge_priv *mgp)
3913 {
3914 	if (mgp->ksp_stat == NULL)
3915 		return;
3916 
3917 	kstat_delete(mgp->ksp_stat);
3918 	mgp->ksp_stat = NULL;
3919 }
3920 
3921 static void
3922 myri10ge_slice_stat_destroy(struct myri10ge_slice_state *ss)
3923 {
3924 	if (ss->ksp_stat == NULL)
3925 		return;
3926 
3927 	kstat_delete(ss->ksp_stat);
3928 	ss->ksp_stat = NULL;
3929 }
3930 
3931 static void
3932 myri10ge_info_destroy(struct myri10ge_priv *mgp)
3933 {
3934 	if (mgp->ksp_info == NULL)
3935 		return;
3936 
3937 	kstat_delete(mgp->ksp_info);
3938 	mgp->ksp_info = NULL;
3939 }
3940 
3941 static int
3942 myri10ge_nic_stat_kstat_update(kstat_t *ksp, int rw)
3943 {
3944 	struct myri10ge_nic_stat *ethstat;
3945 	struct myri10ge_priv *mgp;
3946 	mcp_irq_data_t *fw_stats;
3947 
3948 
3949 	if (rw == KSTAT_WRITE)
3950 		return (EACCES);
3951 
3952 	ethstat = (struct myri10ge_nic_stat *)ksp->ks_data;
3953 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3954 	fw_stats = mgp->ss[0].fw_stats;
3955 
3956 	ethstat->dma_read_bw_MBs.value.ul = mgp->read_dma;
3957 	ethstat->dma_write_bw_MBs.value.ul = mgp->write_dma;
3958 	ethstat->dma_read_write_bw_MBs.value.ul = mgp->read_write_dma;
3959 	if (myri10ge_tx_dma_attr.dma_attr_flags & DDI_DMA_FORCE_PHYSICAL)
3960 		ethstat->dma_force_physical.value.ul = 1;
3961 	else
3962 		ethstat->dma_force_physical.value.ul = 0;
3963 	ethstat->lanes.value.ul = mgp->pcie_link_width;
3964 	ethstat->dropped_bad_crc32.value.ul =
3965 	    ntohl(fw_stats->dropped_bad_crc32);
3966 	ethstat->dropped_bad_phy.value.ul =
3967 	    ntohl(fw_stats->dropped_bad_phy);
3968 	ethstat->dropped_link_error_or_filtered.value.ul =
3969 	    ntohl(fw_stats->dropped_link_error_or_filtered);
3970 	ethstat->dropped_link_overflow.value.ul =
3971 	    ntohl(fw_stats->dropped_link_overflow);
3972 	ethstat->dropped_multicast_filtered.value.ul =
3973 	    ntohl(fw_stats->dropped_multicast_filtered);
3974 	ethstat->dropped_no_big_buffer.value.ul =
3975 	    ntohl(fw_stats->dropped_no_big_buffer);
3976 	ethstat->dropped_no_small_buffer.value.ul =
3977 	    ntohl(fw_stats->dropped_no_small_buffer);
3978 	ethstat->dropped_overrun.value.ul =
3979 	    ntohl(fw_stats->dropped_overrun);
3980 	ethstat->dropped_pause.value.ul =
3981 	    ntohl(fw_stats->dropped_pause);
3982 	ethstat->dropped_runt.value.ul =
3983 	    ntohl(fw_stats->dropped_runt);
3984 	ethstat->link_up.value.ul =
3985 	    ntohl(fw_stats->link_up);
3986 	ethstat->dropped_unicast_filtered.value.ul =
3987 	    ntohl(fw_stats->dropped_unicast_filtered);
3988 	return (0);
3989 }
3990 
3991 static int
3992 myri10ge_slice_stat_kstat_update(kstat_t *ksp, int rw)
3993 {
3994 	struct myri10ge_slice_stat *ethstat;
3995 	struct myri10ge_slice_state *ss;
3996 
3997 	if (rw == KSTAT_WRITE)
3998 		return (EACCES);
3999 
4000 	ethstat = (struct myri10ge_slice_stat *)ksp->ks_data;
4001 	ss = (struct myri10ge_slice_state *)ksp->ks_private;
4002 
4003 	ethstat->rx_big.value.ul = ss->j_rx_cnt;
4004 	ethstat->rx_bigbuf_firmware.value.ul = ss->rx_big.cnt - ss->j_rx_cnt;
4005 	ethstat->rx_bigbuf_pool.value.ul =
4006 	    ss->jpool.num_alloc - ss->jbufs_for_smalls;
4007 	ethstat->rx_bigbuf_smalls.value.ul = ss->jbufs_for_smalls;
4008 	ethstat->rx_small.value.ul = ss->rx_small.cnt -
4009 	    (ss->rx_small.mask + 1);
4010 	ethstat->tx_done.value.ul = ss->tx.done;
4011 	ethstat->tx_req.value.ul = ss->tx.req;
4012 	ethstat->tx_activate.value.ul = ss->tx.activate;
4013 	ethstat->xmit_sched.value.ul = ss->tx.sched;
4014 	ethstat->xmit_stall.value.ul = ss->tx.stall;
4015 	ethstat->xmit_stall_early.value.ul = ss->tx.stall_early;
4016 	ethstat->xmit_stall_late.value.ul = ss->tx.stall_late;
4017 	ethstat->xmit_err.value.ul =  MYRI10GE_SLICE_STAT(xmit_err);
4018 	return (0);
4019 }
4020 
4021 static int
4022 myri10ge_info_kstat_update(kstat_t *ksp, int rw)
4023 {
4024 	struct myri10ge_info *info;
4025 	struct myri10ge_priv *mgp;
4026 
4027 
4028 	if (rw == KSTAT_WRITE)
4029 		return (EACCES);
4030 
4031 	info = (struct myri10ge_info *)ksp->ks_data;
4032 	mgp = (struct myri10ge_priv *)ksp->ks_private;
4033 	kstat_named_setstr(&info->driver_version, MYRI10GE_VERSION_STR);
4034 	kstat_named_setstr(&info->firmware_version, mgp->fw_version);
4035 	kstat_named_setstr(&info->firmware_name, mgp->fw_name);
4036 	kstat_named_setstr(&info->interrupt_type, mgp->intr_type);
4037 	kstat_named_setstr(&info->product_code, mgp->pc_str);
4038 	kstat_named_setstr(&info->serial_number, mgp->sn_str);
4039 	return (0);
4040 }
4041 
/*
 * Named-kstat entries for the "myri10ge_info" kstat, all of string type.
 * myri10ge_info_init() creates that kstat with KSTAT_FLAG_VIRTUAL and
 * points ks_data at this single static object, and the update handler
 * fills the strings in through ks_data.
 */
static struct myri10ge_info myri10ge_info_template = {
	{ "driver_version",	KSTAT_DATA_STRING },
	{ "firmware_version",	KSTAT_DATA_STRING },
	{ "firmware_name",	KSTAT_DATA_STRING },
	{ "interrupt_type",	KSTAT_DATA_STRING },
	{ "product_code",	KSTAT_DATA_STRING },
	{ "serial_number",	KSTAT_DATA_STRING },
};
/* installed as ks_lock for the info kstat by myri10ge_info_init() */
static kmutex_t myri10ge_info_template_lock;
4051 
4052 
4053 static int
4054 myri10ge_info_init(struct myri10ge_priv *mgp)
4055 {
4056 	struct kstat *ksp;
4057 
4058 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4059 	    "myri10ge_info", "net", KSTAT_TYPE_NAMED,
4060 	    sizeof (myri10ge_info_template) /
4061 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4062 	if (ksp == NULL) {
4063 		cmn_err(CE_WARN,
4064 		    "%s: myri10ge_info_init: kstat_create failed", mgp->name);
4065 		return (DDI_FAILURE);
4066 	}
4067 	mgp->ksp_info = ksp;
4068 	ksp->ks_update = myri10ge_info_kstat_update;
4069 	ksp->ks_private = (void *) mgp;
4070 	ksp->ks_data = &myri10ge_info_template;
4071 	ksp->ks_lock = &myri10ge_info_template_lock;
4072 	if (MYRI10GE_VERSION_STR != NULL)
4073 		ksp->ks_data_size += strlen(MYRI10GE_VERSION_STR) + 1;
4074 	if (mgp->fw_version != NULL)
4075 		ksp->ks_data_size += strlen(mgp->fw_version) + 1;
4076 	ksp->ks_data_size += strlen(mgp->fw_name) + 1;
4077 	ksp->ks_data_size += strlen(mgp->intr_type) + 1;
4078 	if (mgp->pc_str != NULL)
4079 		ksp->ks_data_size += strlen(mgp->pc_str) + 1;
4080 	if (mgp->sn_str != NULL)
4081 		ksp->ks_data_size += strlen(mgp->sn_str) + 1;
4082 
4083 	kstat_install(ksp);
4084 	return (DDI_SUCCESS);
4085 }
4086 
4087 
4088 static int
4089 myri10ge_nic_stat_init(struct myri10ge_priv *mgp)
4090 {
4091 	struct kstat *ksp;
4092 	struct myri10ge_nic_stat *ethstat;
4093 
4094 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4095 	    "myri10ge_nic_stats", "net", KSTAT_TYPE_NAMED,
4096 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4097 	if (ksp == NULL) {
4098 		cmn_err(CE_WARN,
4099 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4100 		return (DDI_FAILURE);
4101 	}
4102 	mgp->ksp_stat = ksp;
4103 	ethstat = (struct myri10ge_nic_stat *)(ksp->ks_data);
4104 
4105 	kstat_named_init(&ethstat->dma_read_bw_MBs,
4106 	    "dma_read_bw_MBs", KSTAT_DATA_ULONG);
4107 	kstat_named_init(&ethstat->dma_write_bw_MBs,
4108 	    "dma_write_bw_MBs", KSTAT_DATA_ULONG);
4109 	kstat_named_init(&ethstat->dma_read_write_bw_MBs,
4110 	    "dma_read_write_bw_MBs", KSTAT_DATA_ULONG);
4111 	kstat_named_init(&ethstat->dma_force_physical,
4112 	    "dma_force_physical", KSTAT_DATA_ULONG);
4113 	kstat_named_init(&ethstat->lanes,
4114 	    "lanes", KSTAT_DATA_ULONG);
4115 	kstat_named_init(&ethstat->dropped_bad_crc32,
4116 	    "dropped_bad_crc32", KSTAT_DATA_ULONG);
4117 	kstat_named_init(&ethstat->dropped_bad_phy,
4118 	    "dropped_bad_phy", KSTAT_DATA_ULONG);
4119 	kstat_named_init(&ethstat->dropped_link_error_or_filtered,
4120 	    "dropped_link_error_or_filtered", KSTAT_DATA_ULONG);
4121 	kstat_named_init(&ethstat->dropped_link_overflow,
4122 	    "dropped_link_overflow", KSTAT_DATA_ULONG);
4123 	kstat_named_init(&ethstat->dropped_multicast_filtered,
4124 	    "dropped_multicast_filtered", KSTAT_DATA_ULONG);
4125 	kstat_named_init(&ethstat->dropped_no_big_buffer,
4126 	    "dropped_no_big_buffer", KSTAT_DATA_ULONG);
4127 	kstat_named_init(&ethstat->dropped_no_small_buffer,
4128 	    "dropped_no_small_buffer", KSTAT_DATA_ULONG);
4129 	kstat_named_init(&ethstat->dropped_overrun,
4130 	    "dropped_overrun", KSTAT_DATA_ULONG);
4131 	kstat_named_init(&ethstat->dropped_pause,
4132 	    "dropped_pause", KSTAT_DATA_ULONG);
4133 	kstat_named_init(&ethstat->dropped_runt,
4134 	    "dropped_runt", KSTAT_DATA_ULONG);
4135 	kstat_named_init(&ethstat->dropped_unicast_filtered,
4136 	    "dropped_unicast_filtered", KSTAT_DATA_ULONG);
4137 	kstat_named_init(&ethstat->dropped_runt, "dropped_runt",
4138 	    KSTAT_DATA_ULONG);
4139 	kstat_named_init(&ethstat->link_up, "link_up", KSTAT_DATA_ULONG);
4140 	kstat_named_init(&ethstat->link_changes, "link_changes",
4141 	    KSTAT_DATA_ULONG);
4142 	ksp->ks_update = myri10ge_nic_stat_kstat_update;
4143 	ksp->ks_private = (void *) mgp;
4144 	kstat_install(ksp);
4145 	return (DDI_SUCCESS);
4146 }
4147 
4148 static int
4149 myri10ge_slice_stat_init(struct myri10ge_slice_state *ss)
4150 {
4151 	struct myri10ge_priv *mgp = ss->mgp;
4152 	struct kstat *ksp;
4153 	struct myri10ge_slice_stat *ethstat;
4154 	int instance;
4155 
4156 	/*
4157 	 * fake an instance so that the same slice numbers from
4158 	 * different instances do not collide
4159 	 */
4160 	instance = (ddi_get_instance(mgp->dip) * 1000) +  (int)(ss - mgp->ss);
4161 	ksp = kstat_create("myri10ge", instance,
4162 	    "myri10ge_slice_stats", "net", KSTAT_TYPE_NAMED,
4163 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4164 	if (ksp == NULL) {
4165 		cmn_err(CE_WARN,
4166 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4167 		return (DDI_FAILURE);
4168 	}
4169 	ss->ksp_stat = ksp;
4170 	ethstat = (struct myri10ge_slice_stat *)(ksp->ks_data);
4171 	kstat_named_init(&ethstat->lro_bad_csum, "lro_bad_csum",
4172 	    KSTAT_DATA_ULONG);
4173 	kstat_named_init(&ethstat->lro_flushed, "lro_flushed",
4174 	    KSTAT_DATA_ULONG);
4175 	kstat_named_init(&ethstat->lro_queued, "lro_queued",
4176 	    KSTAT_DATA_ULONG);
4177 	kstat_named_init(&ethstat->rx_bigbuf_firmware, "rx_bigbuf_firmware",
4178 	    KSTAT_DATA_ULONG);
4179 	kstat_named_init(&ethstat->rx_bigbuf_pool, "rx_bigbuf_pool",
4180 	    KSTAT_DATA_ULONG);
4181 	kstat_named_init(&ethstat->rx_bigbuf_smalls, "rx_bigbuf_smalls",
4182 	    KSTAT_DATA_ULONG);
4183 	kstat_named_init(&ethstat->rx_copy, "rx_copy",
4184 	    KSTAT_DATA_ULONG);
4185 	kstat_named_init(&ethstat->rx_big_nobuf, "rx_big_nobuf",
4186 	    KSTAT_DATA_ULONG);
4187 	kstat_named_init(&ethstat->rx_small_nobuf, "rx_small_nobuf",
4188 	    KSTAT_DATA_ULONG);
4189 	kstat_named_init(&ethstat->xmit_zero_len, "xmit_zero_len",
4190 	    KSTAT_DATA_ULONG);
4191 	kstat_named_init(&ethstat->xmit_pullup, "xmit_pullup",
4192 	    KSTAT_DATA_ULONG);
4193 	kstat_named_init(&ethstat->xmit_pullup_first, "xmit_pullup_first",
4194 	    KSTAT_DATA_ULONG);
4195 	kstat_named_init(&ethstat->xmit_lowbuf, "xmit_lowbuf",
4196 	    KSTAT_DATA_ULONG);
4197 	kstat_named_init(&ethstat->xmit_lsobadflags, "xmit_lsobadflags",
4198 	    KSTAT_DATA_ULONG);
4199 	kstat_named_init(&ethstat->xmit_sched, "xmit_sched",
4200 	    KSTAT_DATA_ULONG);
4201 	kstat_named_init(&ethstat->xmit_stall, "xmit_stall",
4202 	    KSTAT_DATA_ULONG);
4203 	kstat_named_init(&ethstat->xmit_stall_early, "xmit_stall_early",
4204 	    KSTAT_DATA_ULONG);
4205 	kstat_named_init(&ethstat->xmit_stall_late, "xmit_stall_late",
4206 	    KSTAT_DATA_ULONG);
4207 	kstat_named_init(&ethstat->xmit_err, "xmit_err",
4208 	    KSTAT_DATA_ULONG);
4209 	kstat_named_init(&ethstat->tx_req, "tx_req",
4210 	    KSTAT_DATA_ULONG);
4211 	kstat_named_init(&ethstat->tx_activate, "tx_activate",
4212 	    KSTAT_DATA_ULONG);
4213 	kstat_named_init(&ethstat->tx_done, "tx_done",
4214 	    KSTAT_DATA_ULONG);
4215 	kstat_named_init(&ethstat->tx_handles_alloced, "tx_handles_alloced",
4216 	    KSTAT_DATA_ULONG);
4217 	kstat_named_init(&ethstat->rx_big, "rx_big",
4218 	    KSTAT_DATA_ULONG);
4219 	kstat_named_init(&ethstat->rx_small, "rx_small",
4220 	    KSTAT_DATA_ULONG);
4221 	ksp->ks_update = myri10ge_slice_stat_kstat_update;
4222 	ksp->ks_private = (void *) ss;
4223 	kstat_install(ksp);
4224 	return (DDI_SUCCESS);
4225 }
4226 
4227 
4228 
4229 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4230 
4231 #include <vm/hat.h>
4232 #include <sys/ddi_isa.h>
4233 void *device_arena_alloc(size_t size, int vm_flag);
4234 void device_arena_free(void *vaddr, size_t size);
4235 
4236 static void
4237 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4238 {
4239 	dev_info_t *parent_dip;
4240 	ddi_acc_handle_t handle;
4241 	unsigned long bus_number, dev_number, func_number;
4242 	unsigned long cfg_pa, paddr, base, pgoffset;
4243 	char 		*cvaddr, *ptr;
4244 	uint32_t	*ptr32;
4245 	int 		retval = DDI_FAILURE;
4246 	int dontcare;
4247 	uint16_t read_vid, read_did, vendor_id, device_id;
4248 
4249 	if (!myri10ge_nvidia_ecrc_enable)
4250 		return;
4251 
4252 	parent_dip = ddi_get_parent(mgp->dip);
4253 	if (parent_dip == NULL) {
4254 		cmn_err(CE_WARN, "%s: I'm an orphan?", mgp->name);
4255 		return;
4256 	}
4257 
4258 	if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
4259 		cmn_err(CE_WARN,
4260 		    "%s: Could not access my parent's registers", mgp->name);
4261 		return;
4262 	}
4263 
4264 	vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
4265 	device_id = pci_config_get16(handle, PCI_CONF_DEVID);
4266 	pci_config_teardown(&handle);
4267 
4268 	if (myri10ge_verbose) {
4269 		unsigned long 	bus_number, dev_number, func_number;
4270 		int 		reg_set, span;
4271 		(void) myri10ge_reg_set(parent_dip, &reg_set, &span,
4272 		    &bus_number, &dev_number, &func_number);
4273 		if (myri10ge_verbose)
4274 			printf("%s: parent at %ld:%ld:%ld\n", mgp->name,
4275 			    bus_number, dev_number, func_number);
4276 	}
4277 
4278 	if (vendor_id !=  0x10de)
4279 		return;
4280 
4281 	if (device_id != 0x005d /* CK804 */ &&
4282 	    (device_id < 0x374 || device_id > 0x378) /* MCP55 */) {
4283 		return;
4284 	}
4285 	(void) myri10ge_reg_set(parent_dip, &dontcare, &dontcare,
4286 	    &bus_number, &dev_number, &func_number);
4287 
4288 	for (cfg_pa = 0xf0000000UL;
4289 	    retval != DDI_SUCCESS && cfg_pa >= 0xe0000000UL;
4290 	    cfg_pa -= 0x10000000UL) {
4291 		/* find the config space address for the nvidia bridge */
4292 		paddr = (cfg_pa + bus_number * 0x00100000UL +
4293 		    (dev_number * 8 + func_number) * 0x00001000UL);
4294 
4295 		base = paddr & (~MMU_PAGEOFFSET);
4296 		pgoffset = paddr & MMU_PAGEOFFSET;
4297 
4298 		/* map it into the kernel */
4299 		cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
4300 		if (cvaddr == NULL)
4301 			cmn_err(CE_WARN, "%s: failed to map nf4: cvaddr\n",
4302 			    mgp->name);
4303 
4304 		hat_devload(kas.a_hat, cvaddr, mmu_ptob(1),
4305 		    i_ddi_paddr_to_pfn(base),
4306 		    PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
4307 
4308 		ptr = cvaddr + pgoffset;
4309 		read_vid = *(uint16_t *)(void *)(ptr + PCI_CONF_VENID);
4310 		read_did = *(uint16_t *)(void *)(ptr + PCI_CONF_DEVID);
4311 		if (vendor_id ==  read_did || device_id == read_did) {
4312 			ptr32 = (uint32_t *)(void *)(ptr + 0x178);
4313 			if (myri10ge_verbose)
4314 				printf("%s: Enabling ECRC on upstream "
4315 				    "Nvidia bridge (0x%x:0x%x) "
4316 				    "at %ld:%ld:%ld\n", mgp->name,
4317 				    read_vid, read_did, bus_number,
4318 				    dev_number, func_number);
4319 			*ptr32 |= 0x40;
4320 			retval = DDI_SUCCESS;
4321 		}
4322 		hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
4323 		device_arena_free(cvaddr, ptob(1));
4324 	}
4325 }
4326 
4327 #else
4328 /*ARGSUSED*/
4329 static void
4330 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4331 {
4332 }
4333 #endif /* i386 */
4334 
4335 
4336 /*
4337  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
4338  * when the PCI-E Completion packets are aligned on an 8-byte
4339  * boundary.  Some PCI-E chip sets always align Completion packets; on
4340  * the ones that do not, the alignment can be enforced by enabling
4341  * ECRC generation (if supported).
4342  *
4343  * When PCI-E Completion packets are not aligned, it is actually more
4344  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
4345  *
4346  * If the driver can neither enable ECRC nor verify that it has
4347  * already been enabled, then it must use a firmware image which works
4348  * around unaligned completion packets (ethp_z8e.dat), and it should
4349  * also ensure that it never gives the device a Read-DMA which is
4350  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
4351  * enabled, then the driver should use the aligned (eth_z8e.dat)
4352  * firmware image, and set tx.boundary to 4KB.
4353  */
4354 
4355 
4356 static int
4357 myri10ge_firmware_probe(struct myri10ge_priv *mgp)
4358 {
4359 	int status;
4360 
4361 	mgp->tx_boundary = 4096;
4362 	/*
4363 	 * Verify the max read request size was set to 4KB
4364 	 * before trying the test with 4KB.
4365 	 */
4366 	if (mgp->max_read_request_4k == 0)
4367 		mgp->tx_boundary = 2048;
4368 	/*
4369 	 * load the optimized firmware which assumes aligned PCIe
4370 	 * completions in order to see if it works on this host.
4371 	 */
4372 
4373 	mgp->fw_name = "rss_eth_z8e";
4374 	mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4375 	mgp->eth_z8e_length = rss_eth_z8e_length;
4376 
4377 	status = myri10ge_load_firmware(mgp);
4378 	if (status != 0) {
4379 		return (status);
4380 	}
4381 	/*
4382 	 * Enable ECRC if possible
4383 	 */
4384 	myri10ge_enable_nvidia_ecrc(mgp);
4385 
4386 	/*
4387 	 * Run a DMA test which watches for unaligned completions and
4388 	 * aborts on the first one seen.
4389 	 */
4390 	status = myri10ge_dma_test(mgp, MXGEFW_CMD_UNALIGNED_TEST);
4391 	if (status == 0)
4392 		return (0); /* keep the aligned firmware */
4393 
4394 	if (status != E2BIG)
4395 		cmn_err(CE_WARN, "%s: DMA test failed: %d\n",
4396 		    mgp->name, status);
4397 	if (status == ENOSYS)
4398 		cmn_err(CE_WARN, "%s: Falling back to ethp! "
4399 		    "Please install up to date fw\n", mgp->name);
4400 	return (status);
4401 }
4402 
4403 static int
4404 myri10ge_select_firmware(struct myri10ge_priv *mgp)
4405 {
4406 	int aligned;
4407 
4408 	aligned = 0;
4409 
4410 	if (myri10ge_force_firmware == 1) {
4411 		if (myri10ge_verbose)
4412 			printf("%s: Assuming aligned completions (forced)\n",
4413 			    mgp->name);
4414 		aligned = 1;
4415 		goto done;
4416 	}
4417 
4418 	if (myri10ge_force_firmware == 2) {
4419 		if (myri10ge_verbose)
4420 			printf("%s: Assuming unaligned completions (forced)\n",
4421 			    mgp->name);
4422 		aligned = 0;
4423 		goto done;
4424 	}
4425 
4426 	/* If the width is less than 8, we may used the aligned firmware */
4427 	if (mgp->pcie_link_width != 0 && mgp->pcie_link_width < 8) {
4428 		cmn_err(CE_WARN, "!%s: PCIe link running at x%d\n",
4429 		    mgp->name, mgp->pcie_link_width);
4430 		aligned = 1;
4431 		goto done;
4432 	}
4433 
4434 	if (0 == myri10ge_firmware_probe(mgp))
4435 		return (0);  /* keep optimized firmware */
4436 
4437 done:
4438 	if (aligned) {
4439 		mgp->fw_name = "rss_eth_z8e";
4440 		mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4441 		mgp->eth_z8e_length = rss_eth_z8e_length;
4442 		mgp->tx_boundary = 4096;
4443 	} else {
4444 		mgp->fw_name = "rss_ethp_z8e";
4445 		mgp->eth_z8e = (unsigned char *)rss_ethp_z8e;
4446 		mgp->eth_z8e_length = rss_ethp_z8e_length;
4447 		mgp->tx_boundary = 2048;
4448 	}
4449 
4450 	return (myri10ge_load_firmware(mgp));
4451 }
4452 
4453 static int
4454 myri10ge_add_intrs(struct myri10ge_priv *mgp, int add_handler)
4455 {
4456 	dev_info_t *devinfo = mgp->dip;
4457 	int count, avail, actual, intr_types;
4458 	int x, y, rc, inum = 0;
4459 
4460 
4461 	rc = ddi_intr_get_supported_types(devinfo, &intr_types);
4462 	if (rc != DDI_SUCCESS) {
4463 		cmn_err(CE_WARN,
4464 		    "!%s: ddi_intr_get_nintrs() failure, rc = %d\n", mgp->name,
4465 		    rc);
4466 		return (DDI_FAILURE);
4467 	}
4468 
4469 	if (!myri10ge_use_msi)
4470 		intr_types &= ~DDI_INTR_TYPE_MSI;
4471 	if (!myri10ge_use_msix)
4472 		intr_types &= ~DDI_INTR_TYPE_MSIX;
4473 
4474 	if (intr_types & DDI_INTR_TYPE_MSIX) {
4475 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSIX;
4476 		mgp->intr_type = "MSI-X";
4477 	} else if (intr_types & DDI_INTR_TYPE_MSI) {
4478 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSI;
4479 		mgp->intr_type = "MSI";
4480 	} else {
4481 		mgp->ddi_intr_type = DDI_INTR_TYPE_FIXED;
4482 		mgp->intr_type = "Legacy";
4483 	}
4484 	/* Get number of interrupts */
4485 	rc = ddi_intr_get_nintrs(devinfo, mgp->ddi_intr_type, &count);
4486 	if ((rc != DDI_SUCCESS) || (count == 0)) {
4487 		cmn_err(CE_WARN, "%s: ddi_intr_get_nintrs() failure, rc: %d, "
4488 		    "count: %d", mgp->name, rc, count);
4489 
4490 		return (DDI_FAILURE);
4491 	}
4492 
4493 	/* Get number of available interrupts */
4494 	rc = ddi_intr_get_navail(devinfo, mgp->ddi_intr_type, &avail);
4495 	if ((rc != DDI_SUCCESS) || (avail == 0)) {
4496 		cmn_err(CE_WARN, "%s: ddi_intr_get_navail() failure, "
4497 		    "rc: %d, avail: %d\n", mgp->name, rc, avail);
4498 		return (DDI_FAILURE);
4499 	}
4500 	if (avail < count) {
4501 		cmn_err(CE_NOTE,
4502 		    "!%s: nintrs() returned %d, navail returned %d",
4503 		    mgp->name, count, avail);
4504 		count = avail;
4505 	}
4506 
4507 	if (count < mgp->num_slices)
4508 		return (DDI_FAILURE);
4509 
4510 	if (count > mgp->num_slices)
4511 		count = mgp->num_slices;
4512 
4513 	/* Allocate memory for MSI interrupts */
4514 	mgp->intr_size = count * sizeof (ddi_intr_handle_t);
4515 	mgp->htable = kmem_alloc(mgp->intr_size, KM_SLEEP);
4516 
4517 	rc = ddi_intr_alloc(devinfo, mgp->htable, mgp->ddi_intr_type, inum,
4518 	    count, &actual, DDI_INTR_ALLOC_NORMAL);
4519 
4520 	if ((rc != DDI_SUCCESS) || (actual == 0)) {
4521 		cmn_err(CE_WARN, "%s: ddi_intr_alloc() failed: %d",
4522 		    mgp->name, rc);
4523 
4524 		kmem_free(mgp->htable, mgp->intr_size);
4525 		mgp->htable = NULL;
4526 		return (DDI_FAILURE);
4527 	}
4528 
4529 	if ((actual < count) && myri10ge_verbose) {
4530 		cmn_err(CE_NOTE, "%s: got %d/%d slices",
4531 		    mgp->name, actual, count);
4532 	}
4533 
4534 	mgp->intr_cnt = actual;
4535 
4536 	/*
4537 	 * Get priority for first irq, assume remaining are all the same
4538 	 */
4539 	if (ddi_intr_get_pri(mgp->htable[0], &mgp->intr_pri)
4540 	    != DDI_SUCCESS) {
4541 		cmn_err(CE_WARN, "%s: ddi_intr_get_pri() failed", mgp->name);
4542 
4543 		/* Free already allocated intr */
4544 		for (y = 0; y < actual; y++) {
4545 			(void) ddi_intr_free(mgp->htable[y]);
4546 		}
4547 
4548 		kmem_free(mgp->htable, mgp->intr_size);
4549 		mgp->htable = NULL;
4550 		return (DDI_FAILURE);
4551 	}
4552 
4553 	mgp->icookie = (void *)(uintptr_t)mgp->intr_pri;
4554 
4555 	if (!add_handler)
4556 		return (DDI_SUCCESS);
4557 
4558 	/* Call ddi_intr_add_handler() */
4559 	for (x = 0; x < actual; x++) {
4560 		if (ddi_intr_add_handler(mgp->htable[x], myri10ge_intr,
4561 		    (caddr_t)&mgp->ss[x], NULL) != DDI_SUCCESS) {
4562 			cmn_err(CE_WARN, "%s: ddi_intr_add_handler() failed",
4563 			    mgp->name);
4564 
4565 			/* Free already allocated intr */
4566 			for (y = 0; y < actual; y++) {
4567 				(void) ddi_intr_free(mgp->htable[y]);
4568 			}
4569 
4570 			kmem_free(mgp->htable, mgp->intr_size);
4571 			mgp->htable = NULL;
4572 			return (DDI_FAILURE);
4573 		}
4574 	}
4575 
4576 	(void) ddi_intr_get_cap(mgp->htable[0], &mgp->intr_cap);
4577 	if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4578 		/* Call ddi_intr_block_enable() for MSI */
4579 		(void) ddi_intr_block_enable(mgp->htable, mgp->intr_cnt);
4580 	} else {
4581 		/* Call ddi_intr_enable() for MSI non block enable */
4582 		for (x = 0; x < mgp->intr_cnt; x++) {
4583 			(void) ddi_intr_enable(mgp->htable[x]);
4584 		}
4585 	}
4586 
4587 	return (DDI_SUCCESS);
4588 }
4589 
4590 static void
4591 myri10ge_rem_intrs(struct myri10ge_priv *mgp, int handler_installed)
4592 {
4593 	int x, err;
4594 
4595 	/* Disable all interrupts */
4596 	if (handler_installed) {
4597 		if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4598 			/* Call ddi_intr_block_disable() */
4599 			(void) ddi_intr_block_disable(mgp->htable,
4600 			    mgp->intr_cnt);
4601 		} else {
4602 			for (x = 0; x < mgp->intr_cnt; x++) {
4603 				(void) ddi_intr_disable(mgp->htable[x]);
4604 			}
4605 		}
4606 	}
4607 
4608 	for (x = 0; x < mgp->intr_cnt; x++) {
4609 		if (handler_installed) {
4610 		/* Call ddi_intr_remove_handler() */
4611 			err = ddi_intr_remove_handler(mgp->htable[x]);
4612 			if (err != DDI_SUCCESS) {
4613 				cmn_err(CE_WARN,
4614 				    "%s: ddi_intr_remove_handler for"
4615 				    "vec %d returned %d\n", mgp->name,
4616 				    x, err);
4617 			}
4618 		}
4619 		err = ddi_intr_free(mgp->htable[x]);
4620 		if (err != DDI_SUCCESS) {
4621 			cmn_err(CE_WARN,
4622 			    "%s: ddi_intr_free for vec %d returned %d\n",
4623 			    mgp->name, x, err);
4624 		}
4625 	}
4626 	kmem_free(mgp->htable, mgp->intr_size);
4627 	mgp->htable = NULL;
4628 }
4629 
4630 static void
4631 myri10ge_test_physical(dev_info_t *dip)
4632 {
4633 	ddi_dma_handle_t	handle;
4634 	struct myri10ge_dma_stuff dma;
4635 	void *addr;
4636 	int err;
4637 
4638 	/* test #1, sufficient for older sparc systems */
4639 	myri10ge_tx_dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
4640 	err = ddi_dma_alloc_handle(dip, &myri10ge_tx_dma_attr,
4641 	    DDI_DMA_DONTWAIT, NULL, &handle);
4642 	if (err == DDI_DMA_BADATTR)
4643 		goto fail;
4644 	ddi_dma_free_handle(&handle);
4645 
4646 	/* test #2, required on Olympis where the bind is what fails */
4647 	addr = myri10ge_dma_alloc(dip, 128, &myri10ge_tx_dma_attr,
4648 	    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
4649 	    DDI_DMA_WRITE|DDI_DMA_STREAMING, &dma, 0, DDI_DMA_DONTWAIT);
4650 	if (addr == NULL)
4651 		goto fail;
4652 	myri10ge_dma_free(&dma);
4653 	return;
4654 
4655 fail:
4656 	if (myri10ge_verbose)
4657 		printf("myri10ge%d: DDI_DMA_FORCE_PHYSICAL failed, "
4658 		    "using IOMMU\n", ddi_get_instance(dip));
4659 
4660 	myri10ge_tx_dma_attr.dma_attr_flags &= ~DDI_DMA_FORCE_PHYSICAL;
4661 }
4662 
4663 static void
4664 myri10ge_get_props(dev_info_t *dip)
4665 {
4666 
4667 	myri10ge_flow_control =  ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4668 	    "myri10ge_flow_control", myri10ge_flow_control);
4669 
4670 	myri10ge_intr_coal_delay = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4671 	    "myri10ge_intr_coal_delay", myri10ge_intr_coal_delay);
4672 
4673 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4674 	myri10ge_nvidia_ecrc_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4675 	    "myri10ge_nvidia_ecrc_enable", 1);
4676 #endif
4677 
4678 
4679 	myri10ge_use_msi = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4680 	    "myri10ge_use_msi", myri10ge_use_msi);
4681 
4682 	myri10ge_deassert_wait = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4683 	    "myri10ge_deassert_wait",  myri10ge_deassert_wait);
4684 
4685 	myri10ge_verbose = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4686 	    "myri10ge_verbose", myri10ge_verbose);
4687 
4688 	myri10ge_tx_copylen = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4689 	    "myri10ge_tx_copylen", myri10ge_tx_copylen);
4690 
4691 	if (myri10ge_tx_copylen < 60) {
4692 		cmn_err(CE_WARN,
4693 		    "myri10ge_tx_copylen must be >= 60 bytes\n");
4694 		myri10ge_tx_copylen = 60;
4695 	}
4696 
4697 	myri10ge_mtu_override = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4698 	    "myri10ge_mtu_override", myri10ge_mtu_override);
4699 
4700 	if (myri10ge_mtu_override >= 1500 && myri10ge_mtu_override <= 9000)
4701 		myri10ge_mtu = myri10ge_mtu_override +
4702 		    sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ;
4703 	else if (myri10ge_mtu_override != 0) {
4704 		cmn_err(CE_WARN,
4705 		    "myri10ge_mtu_override must be between 1500 and "
4706 		    "9000 bytes\n");
4707 	}
4708 
4709 	myri10ge_bigbufs_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4710 	    "myri10ge_bigbufs_initial", myri10ge_bigbufs_initial);
4711 	myri10ge_bigbufs_max = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4712 	    "myri10ge_bigbufs_max", myri10ge_bigbufs_max);
4713 
4714 	myri10ge_watchdog_reset = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4715 	    "myri10ge_watchdog_reset", myri10ge_watchdog_reset);
4716 
4717 	if (myri10ge_bigbufs_initial < 128) {
4718 		cmn_err(CE_WARN,
4719 		    "myri10ge_bigbufs_initial be at least 128\n");
4720 		myri10ge_bigbufs_initial = 128;
4721 	}
4722 	if (myri10ge_bigbufs_max < 128) {
4723 		cmn_err(CE_WARN,
4724 		    "myri10ge_bigbufs_max be at least 128\n");
4725 		myri10ge_bigbufs_max = 128;
4726 	}
4727 
4728 	if (myri10ge_bigbufs_max < myri10ge_bigbufs_initial) {
4729 		cmn_err(CE_WARN,
4730 		    "myri10ge_bigbufs_max must be >=  "
4731 		    "myri10ge_bigbufs_initial\n");
4732 		myri10ge_bigbufs_max = myri10ge_bigbufs_initial;
4733 	}
4734 
4735 	myri10ge_force_firmware = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4736 	    "myri10ge_force_firmware", myri10ge_force_firmware);
4737 
4738 	myri10ge_max_slices = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4739 	    "myri10ge_max_slices", myri10ge_max_slices);
4740 
4741 	myri10ge_use_msix = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4742 	    "myri10ge_use_msix", myri10ge_use_msix);
4743 
4744 	myri10ge_rss_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4745 	    "myri10ge_rss_hash", myri10ge_rss_hash);
4746 
4747 	if (myri10ge_rss_hash > MXGEFW_RSS_HASH_TYPE_MAX ||
4748 	    myri10ge_rss_hash < MXGEFW_RSS_HASH_TYPE_IPV4) {
4749 		cmn_err(CE_WARN, "myri10ge: Illegal rssh hash type %d\n",
4750 		    myri10ge_rss_hash);
4751 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4752 	}
4753 	myri10ge_lro = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4754 	    "myri10ge_lro", myri10ge_lro);
4755 	myri10ge_lro_cnt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4756 	    "myri10ge_lro_cnt", myri10ge_lro_cnt);
4757 	myri10ge_lro_max_aggr = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4758 	    "myri10ge_lro_max_aggr", myri10ge_lro_max_aggr);
4759 	myri10ge_tx_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4760 	    "myri10ge_tx_hash", myri10ge_tx_hash);
4761 	myri10ge_use_lso = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4762 	    "myri10ge_use_lso", myri10ge_use_lso);
4763 	myri10ge_lso_copy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4764 	    "myri10ge_lso_copy", myri10ge_lso_copy);
4765 	myri10ge_tx_handles_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4766 	    "myri10ge_tx_handles_initial", myri10ge_tx_handles_initial);
4767 	myri10ge_small_bytes = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4768 	    "myri10ge_small_bytes", myri10ge_small_bytes);
4769 	if ((myri10ge_small_bytes + MXGEFW_PAD) & (128 -1)) {
4770 		cmn_err(CE_WARN, "myri10ge: myri10ge_small_bytes (%d)\n",
4771 		    myri10ge_small_bytes);
4772 		cmn_err(CE_WARN, "must be aligned on 128b bndry -2\n");
4773 		myri10ge_small_bytes += 128;
4774 		myri10ge_small_bytes &= ~(128 -1);
4775 		myri10ge_small_bytes -= MXGEFW_PAD;
4776 		cmn_err(CE_WARN, "rounded up to %d\n",
4777 		    myri10ge_small_bytes);
4778 
4779 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4780 	}
4781 }
4782 
4783 #ifndef	PCI_EXP_LNKSTA
4784 #define	PCI_EXP_LNKSTA 18
4785 #endif
4786 
4787 static int
4788 myri10ge_find_cap(ddi_acc_handle_t handle, uint8_t *capptr, uint8_t capid)
4789 {
4790 	uint16_t	status;
4791 	uint8_t 	ptr;
4792 
4793 	/* check to see if we have capabilities */
4794 	status = pci_config_get16(handle, PCI_CONF_STAT);
4795 	if (!(status & PCI_STAT_CAP)) {
4796 		cmn_err(CE_WARN, "PCI_STAT_CAP not found\n");
4797 		return (ENXIO);
4798 	}
4799 
4800 	ptr = pci_config_get8(handle, PCI_CONF_CAP_PTR);
4801 
4802 	/* Walk the capabilities list, looking for a PCI Express cap */
4803 	while (ptr != PCI_CAP_NEXT_PTR_NULL) {
4804 		if (pci_config_get8(handle, ptr + PCI_CAP_ID) == capid)
4805 			break;
4806 		ptr = pci_config_get8(handle, ptr + PCI_CAP_NEXT_PTR);
4807 	}
4808 	if (ptr < 64) {
4809 		cmn_err(CE_WARN, "Bad capability offset %d\n", ptr);
4810 		return (ENXIO);
4811 	}
4812 	*capptr = ptr;
4813 	return (0);
4814 }
4815 
4816 static int
4817 myri10ge_set_max_readreq(ddi_acc_handle_t handle)
4818 {
4819 	int err;
4820 	uint16_t	val;
4821 	uint8_t		ptr;
4822 
4823 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4824 	if (err != 0) {
4825 		cmn_err(CE_WARN, "could not find PCIe cap\n");
4826 		return (ENXIO);
4827 	}
4828 
4829 	/* set max read req to 4096 */
4830 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4831 	val = (val & ~PCIE_DEVCTL_MAX_READ_REQ_MASK) |
4832 	    PCIE_DEVCTL_MAX_READ_REQ_4096;
4833 	pci_config_put16(handle, ptr + PCIE_DEVCTL, val);
4834 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4835 	if ((val & (PCIE_DEVCTL_MAX_READ_REQ_4096)) !=
4836 	    PCIE_DEVCTL_MAX_READ_REQ_4096) {
4837 		cmn_err(CE_WARN, "could not set max read req (%x)\n", val);
4838 		return (EINVAL);
4839 	}
4840 	return (0);
4841 }
4842 
4843 static int
4844 myri10ge_read_pcie_link_width(ddi_acc_handle_t handle, int *link)
4845 {
4846 	int err;
4847 	uint16_t	val;
4848 	uint8_t		ptr;
4849 
4850 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4851 	if (err != 0) {
4852 		cmn_err(CE_WARN, "could not set max read req\n");
4853 		return (ENXIO);
4854 	}
4855 
4856 	/* read link width */
4857 	val = pci_config_get16(handle, ptr + PCIE_LINKSTS);
4858 	val &= PCIE_LINKSTS_NEG_WIDTH_MASK;
4859 	*link = (val >> 4);
4860 	return (0);
4861 }
4862 
4863 static int
4864 myri10ge_reset_nic(struct myri10ge_priv *mgp)
4865 {
4866 	ddi_acc_handle_t handle = mgp->cfg_hdl;
4867 	uint32_t reboot;
4868 	uint16_t cmd;
4869 	int err;
4870 
4871 	cmd = pci_config_get16(handle, PCI_CONF_COMM);
4872 	if ((cmd & PCI_COMM_ME) == 0) {
4873 		/*
4874 		 * Bus master DMA disabled?  Check to see if the card
4875 		 * rebooted due to a parity error For now, just report
4876 		 * it
4877 		 */
4878 
4879 		/* enter read32 mode */
4880 		pci_config_put8(handle, mgp->vso + 0x10, 0x3);
4881 		/* read REBOOT_STATUS (0xfffffff0) */
4882 		pci_config_put32(handle, mgp->vso + 0x18, 0xfffffff0);
4883 		reboot = pci_config_get16(handle, mgp->vso + 0x14);
4884 		cmn_err(CE_WARN, "%s NIC rebooted 0x%x\n", mgp->name, reboot);
4885 		return (0);
4886 	}
4887 	if (!myri10ge_watchdog_reset) {
4888 		cmn_err(CE_WARN, "%s: not resetting\n", mgp->name);
4889 		return (1);
4890 	}
4891 
4892 	myri10ge_stop_locked(mgp);
4893 	err = myri10ge_start_locked(mgp);
4894 	if (err == DDI_FAILURE) {
4895 		return (0);
4896 	}
4897 	mac_tx_update(mgp->mh);
4898 	return (1);
4899 }
4900 
4901 static inline int
4902 myri10ge_ring_stalled(myri10ge_tx_ring_t *tx)
4903 {
4904 	if (tx->sched != tx->stall &&
4905 	    tx->done == tx->watchdog_done &&
4906 	    tx->watchdog_req != tx->watchdog_done)
4907 		return (1);
4908 	return (0);
4909 }
4910 
4911 static void
4912 myri10ge_watchdog(void *arg)
4913 {
4914 	struct myri10ge_priv *mgp;
4915 	struct myri10ge_slice_state *ss;
4916 	myri10ge_tx_ring_t *tx;
4917 	int nic_ok = 1;
4918 	int slices_stalled, rx_pause, i;
4919 	int add_rx;
4920 
4921 	mgp = arg;
4922 	mutex_enter(&mgp->intrlock);
4923 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
4924 		cmn_err(CE_WARN,
4925 		    "%s not running, not rearming watchdog (%d)\n",
4926 		    mgp->name, mgp->running);
4927 		mutex_exit(&mgp->intrlock);
4928 		return;
4929 	}
4930 
4931 	rx_pause = ntohl(mgp->ss[0].fw_stats->dropped_pause);
4932 
4933 	/*
4934 	 * make sure nic is stalled before we reset the nic, so as to
4935 	 * ensure we don't rip the transmit data structures out from
4936 	 * under a pending transmit
4937 	 */
4938 
4939 	for (slices_stalled = 0, i = 0; i < mgp->num_slices; i++) {
4940 		tx = &mgp->ss[i].tx;
4941 		slices_stalled = myri10ge_ring_stalled(tx);
4942 		if (slices_stalled)
4943 			break;
4944 	}
4945 
4946 	if (slices_stalled) {
4947 		if (mgp->watchdog_rx_pause == rx_pause) {
4948 			cmn_err(CE_WARN,
4949 			    "%s slice %d stalled:(%d, %d, %d, %d, %d %d %d\n)",
4950 			    mgp->name, i, tx->sched, tx->stall,
4951 			    tx->done, tx->watchdog_done, tx->req, tx->pkt_done,
4952 			    (int)ntohl(mgp->ss[i].fw_stats->send_done_count));
4953 			nic_ok = myri10ge_reset_nic(mgp);
4954 		} else {
4955 			cmn_err(CE_WARN,
4956 			    "%s Flow controlled, check link partner\n",
4957 			    mgp->name);
4958 		}
4959 	}
4960 
4961 	if (!nic_ok) {
4962 		cmn_err(CE_WARN,
4963 		    "%s Nic dead, not rearming watchdog\n", mgp->name);
4964 		mutex_exit(&mgp->intrlock);
4965 		return;
4966 	}
4967 	for (i = 0; i < mgp->num_slices; i++) {
4968 		ss = &mgp->ss[i];
4969 		tx = &ss->tx;
4970 		tx->watchdog_done = tx->done;
4971 		tx->watchdog_req = tx->req;
4972 		if (ss->watchdog_rx_copy != MYRI10GE_SLICE_STAT(rx_copy)) {
4973 			ss->watchdog_rx_copy = MYRI10GE_SLICE_STAT(rx_copy);
4974 			add_rx =
4975 			    min(ss->jpool.num_alloc,
4976 			    myri10ge_bigbufs_max -
4977 			    (ss->jpool.num_alloc -
4978 			    ss->jbufs_for_smalls));
4979 			if (add_rx != 0) {
4980 				(void) myri10ge_add_jbufs(ss, add_rx, 0);
4981 				/* now feed them to the firmware */
4982 				mutex_enter(&ss->jpool.mtx);
4983 				myri10ge_restock_jumbos(ss);
4984 				mutex_exit(&ss->jpool.mtx);
4985 			}
4986 		}
4987 	}
4988 	mgp->watchdog_rx_pause = rx_pause;
4989 
4990 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
4991 	    mgp->timer_ticks);
4992 	mutex_exit(&mgp->intrlock);
4993 }
4994 
4995 /*ARGSUSED*/
4996 static int
4997 myri10ge_get_coalesce(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
4998 
4999 {
5000 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5001 	(void) mi_mpprintf(mp, "%d", mgp->intr_coal_delay);
5002 	return (0);
5003 }
5004 
5005 /*ARGSUSED*/
5006 static int
5007 myri10ge_set_coalesce(queue_t *q, mblk_t *mp, char *value,
5008     caddr_t cp, cred_t *credp)
5009 
5010 {
5011 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5012 	char *end;
5013 	size_t new_value;
5014 
5015 	new_value = mi_strtol(value, &end, 10);
5016 	if (end == value)
5017 		return (EINVAL);
5018 
5019 	mutex_enter(&myri10ge_param_lock);
5020 	mgp->intr_coal_delay = (int)new_value;
5021 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
5022 	mutex_exit(&myri10ge_param_lock);
5023 	return (0);
5024 }
5025 
5026 /*ARGSUSED*/
5027 static int
5028 myri10ge_get_pauseparam(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5029 
5030 {
5031 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5032 	(void) mi_mpprintf(mp, "%d", mgp->pause);
5033 	return (0);
5034 }
5035 
5036 /*ARGSUSED*/
5037 static int
5038 myri10ge_set_pauseparam(queue_t *q, mblk_t *mp, char *value,
5039 			caddr_t cp, cred_t *credp)
5040 
5041 {
5042 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5043 	char *end;
5044 	size_t new_value;
5045 	int err = 0;
5046 
5047 	new_value = mi_strtol(value, &end, 10);
5048 	if (end == value)
5049 		return (EINVAL);
5050 	if (new_value != 0)
5051 		new_value = 1;
5052 
5053 	mutex_enter(&myri10ge_param_lock);
5054 	if (new_value != mgp->pause)
5055 		err = myri10ge_change_pause(mgp, new_value);
5056 	mutex_exit(&myri10ge_param_lock);
5057 	return (err);
5058 }
5059 
5060 /*ARGSUSED*/
5061 static int
5062 myri10ge_get_int(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5063 
5064 {
5065 	(void) mi_mpprintf(mp, "%d", *(int *)(void *)cp);
5066 	return (0);
5067 }
5068 
5069 /*ARGSUSED*/
5070 static int
5071 myri10ge_set_int(queue_t *q, mblk_t *mp, char *value,
5072     caddr_t cp, cred_t *credp)
5073 
5074 {
5075 	char *end;
5076 	size_t new_value;
5077 
5078 	new_value = mi_strtol(value, &end, 10);
5079 	if (end == value)
5080 		return (EINVAL);
5081 	*(int *)(void *)cp = new_value;
5082 
5083 	return (0);
5084 }
5085 
5086 static void
5087 myri10ge_ndd_init(struct myri10ge_priv *mgp)
5088 {
5089 	mgp->nd_head = NULL;
5090 
5091 	(void) nd_load(&mgp->nd_head, "myri10ge_intr_coal_delay",
5092 	    myri10ge_get_coalesce, myri10ge_set_coalesce, (caddr_t)mgp);
5093 	(void) nd_load(&mgp->nd_head, "myri10ge_flow_control",
5094 	    myri10ge_get_pauseparam, myri10ge_set_pauseparam, (caddr_t)mgp);
5095 	(void) nd_load(&mgp->nd_head, "myri10ge_verbose",
5096 	    myri10ge_get_int, myri10ge_set_int, (caddr_t)&myri10ge_verbose);
5097 	(void) nd_load(&mgp->nd_head, "myri10ge_deassert_wait",
5098 	    myri10ge_get_int, myri10ge_set_int,
5099 	    (caddr_t)&myri10ge_deassert_wait);
5100 	(void) nd_load(&mgp->nd_head, "myri10ge_bigbufs_max",
5101 	    myri10ge_get_int, myri10ge_set_int,
5102 	    (caddr_t)&myri10ge_bigbufs_max);
5103 	(void) nd_load(&mgp->nd_head, "myri10ge_lro",
5104 	    myri10ge_get_int, myri10ge_set_int,
5105 	    (caddr_t)&myri10ge_lro);
5106 	(void) nd_load(&mgp->nd_head, "myri10ge_lro_max_aggr",
5107 	    myri10ge_get_int, myri10ge_set_int,
5108 	    (caddr_t)&myri10ge_lro_max_aggr);
5109 	(void) nd_load(&mgp->nd_head, "myri10ge_tx_hash",
5110 	    myri10ge_get_int, myri10ge_set_int,
5111 	    (caddr_t)&myri10ge_tx_hash);
5112 	(void) nd_load(&mgp->nd_head, "myri10ge_lso_copy",
5113 	    myri10ge_get_int, myri10ge_set_int,
5114 	    (caddr_t)&myri10ge_lso_copy);
5115 }
5116 
5117 static void
5118 myri10ge_ndd_fini(struct myri10ge_priv *mgp)
5119 {
5120 	nd_free(&mgp->nd_head);
5121 }
5122 
5123 static void
5124 myri10ge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
5125 {
5126 	struct iocblk *iocp;
5127 	struct myri10ge_priv *mgp = arg;
5128 	int cmd, ok, err;
5129 
5130 	iocp = (struct iocblk *)(void *)mp->b_rptr;
5131 	cmd = iocp->ioc_cmd;
5132 
5133 	ok = 0;
5134 	err = 0;
5135 
5136 	switch (cmd) {
5137 	case ND_GET:
5138 	case ND_SET:
5139 		ok = nd_getset(wq, mgp->nd_head, mp);
5140 		break;
5141 	default:
5142 		break;
5143 	}
5144 	if (!ok)
5145 		err = EINVAL;
5146 	else
5147 		err = iocp->ioc_error;
5148 
5149 	if (!err)
5150 		miocack(wq, mp, iocp->ioc_count, err);
5151 	else
5152 		miocnak(wq, mp, 0, err);
5153 }
5154 
5155 static struct myri10ge_priv *mgp_list;
5156 
5157 struct myri10ge_priv *
5158 myri10ge_get_instance(uint_t unit)
5159 {
5160 	struct myri10ge_priv *mgp;
5161 
5162 	mutex_enter(&myri10ge_param_lock);
5163 	for (mgp = mgp_list; mgp != NULL; mgp = mgp->next) {
5164 		if (unit == ddi_get_instance(mgp->dip)) {
5165 			mgp->refcnt++;
5166 			break;
5167 		}
5168 	}
5169 	mutex_exit(&myri10ge_param_lock);
5170 	return (mgp);
5171 }
5172 
5173 void
5174 myri10ge_put_instance(struct myri10ge_priv *mgp)
5175 {
5176 	mutex_enter(&myri10ge_param_lock);
5177 	mgp->refcnt--;
5178 	mutex_exit(&myri10ge_param_lock);
5179 }
5180 
5181 static boolean_t
5182 myri10ge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
5183 {
5184 	struct myri10ge_priv *mgp = arg;
5185 	uint32_t *cap_hcksum;
5186 	mac_capab_lso_t *cap_lso;
5187 	mac_capab_rings_t *cap_rings;
5188 
5189 	switch (cap) {
5190 	case MAC_CAPAB_HCKSUM:
5191 		cap_hcksum = cap_data;
5192 		*cap_hcksum = HCKSUM_INET_PARTIAL;
5193 		break;
5194 	case MAC_CAPAB_RINGS:
5195 		cap_rings = cap_data;
5196 		switch (cap_rings->mr_type) {
5197 		case MAC_RING_TYPE_RX:
5198 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5199 			cap_rings->mr_rnum = mgp->num_slices;
5200 			cap_rings->mr_gnum = 1;
5201 			cap_rings->mr_rget = myri10ge_fill_ring;
5202 			cap_rings->mr_gget = myri10ge_fill_group;
5203 			break;
5204 		case MAC_RING_TYPE_TX:
5205 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5206 			cap_rings->mr_rnum = mgp->num_slices;
5207 			cap_rings->mr_gnum = 0;
5208 			cap_rings->mr_rget = myri10ge_fill_ring;
5209 			cap_rings->mr_gget = NULL;
5210 			break;
5211 		default:
5212 			return (B_FALSE);
5213 		}
5214 		break;
5215 	case MAC_CAPAB_LSO:
5216 		cap_lso = cap_data;
5217 		if (!myri10ge_use_lso)
5218 			return (B_FALSE);
5219 		if (!(mgp->features & MYRI10GE_TSO))
5220 			return (B_FALSE);
5221 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
5222 		cap_lso->lso_basic_tcp_ipv4.lso_max = (uint16_t)-1;
5223 		break;
5224 
5225 	default:
5226 		return (B_FALSE);
5227 	}
5228 	return (B_TRUE);
5229 }
5230 
5231 
5232 static int
5233 myri10ge_m_stat(void *arg, uint_t stat, uint64_t *val)
5234 {
5235 	struct myri10ge_priv *mgp = arg;
5236 	struct myri10ge_rx_ring_stats *rstat;
5237 	struct myri10ge_tx_ring_stats *tstat;
5238 	mcp_irq_data_t *fw_stats = mgp->ss[0].fw_stats;
5239 	struct myri10ge_slice_state *ss;
5240 	uint64_t tmp = 0;
5241 	int i;
5242 
5243 	switch (stat) {
5244 	case MAC_STAT_IFSPEED:
5245 		*val = 10ull * 1000ull * 1000000ull;
5246 		break;
5247 
5248 	case MAC_STAT_MULTIRCV:
5249 		for (i = 0; i < mgp->num_slices; i++) {
5250 			rstat = &mgp->ss[i].rx_stats;
5251 			tmp += rstat->multircv;
5252 		}
5253 		*val = tmp;
5254 		break;
5255 
5256 	case MAC_STAT_BRDCSTRCV:
5257 		for (i = 0; i < mgp->num_slices; i++) {
5258 			rstat = &mgp->ss[i].rx_stats;
5259 			tmp += rstat->brdcstrcv;
5260 		}
5261 		*val = tmp;
5262 		break;
5263 
5264 	case MAC_STAT_MULTIXMT:
5265 		for (i = 0; i < mgp->num_slices; i++) {
5266 			tstat = &mgp->ss[i].tx.stats;
5267 			tmp += tstat->multixmt;
5268 		}
5269 		*val = tmp;
5270 		break;
5271 
5272 	case MAC_STAT_BRDCSTXMT:
5273 		for (i = 0; i < mgp->num_slices; i++) {
5274 			tstat = &mgp->ss[i].tx.stats;
5275 			tmp += tstat->brdcstxmt;
5276 		}
5277 		*val = tmp;
5278 		break;
5279 
5280 	case MAC_STAT_NORCVBUF:
5281 		tmp = ntohl(fw_stats->dropped_no_big_buffer);
5282 		tmp += ntohl(fw_stats->dropped_no_small_buffer);
5283 		tmp += ntohl(fw_stats->dropped_link_overflow);
5284 		for (i = 0; i < mgp->num_slices; i++) {
5285 			ss = &mgp->ss[i];
5286 			tmp += MYRI10GE_SLICE_STAT(rx_big_nobuf);
5287 			tmp += MYRI10GE_SLICE_STAT(rx_small_nobuf);
5288 		}
5289 		*val = tmp;
5290 		break;
5291 
5292 	case MAC_STAT_IERRORS:
5293 		tmp += ntohl(fw_stats->dropped_bad_crc32);
5294 		tmp += ntohl(fw_stats->dropped_bad_phy);
5295 		tmp += ntohl(fw_stats->dropped_runt);
5296 		tmp += ntohl(fw_stats->dropped_overrun);
5297 		*val = tmp;
5298 		break;
5299 
5300 	case MAC_STAT_OERRORS:
5301 		for (i = 0; i < mgp->num_slices; i++) {
5302 			ss = &mgp->ss[i];
5303 			tmp += MYRI10GE_SLICE_STAT(xmit_lsobadflags);
5304 			tmp += MYRI10GE_SLICE_STAT(xmit_err);
5305 		}
5306 		*val = tmp;
5307 		break;
5308 
5309 	case MAC_STAT_RBYTES:
5310 		for (i = 0; i < mgp->num_slices; i++) {
5311 			rstat = &mgp->ss[i].rx_stats;
5312 			tmp += rstat->ibytes;
5313 		}
5314 		*val = tmp;
5315 		break;
5316 
5317 	case MAC_STAT_IPACKETS:
5318 		for (i = 0; i < mgp->num_slices; i++) {
5319 			rstat = &mgp->ss[i].rx_stats;
5320 			tmp += rstat->ipackets;
5321 		}
5322 		*val = tmp;
5323 		break;
5324 
5325 	case MAC_STAT_OBYTES:
5326 		for (i = 0; i < mgp->num_slices; i++) {
5327 			tstat = &mgp->ss[i].tx.stats;
5328 			tmp += tstat->obytes;
5329 		}
5330 		*val = tmp;
5331 		break;
5332 
5333 	case MAC_STAT_OPACKETS:
5334 		for (i = 0; i < mgp->num_slices; i++) {
5335 			tstat = &mgp->ss[i].tx.stats;
5336 			tmp += tstat->opackets;
5337 		}
5338 		*val = tmp;
5339 		break;
5340 
5341 	case ETHER_STAT_TOOLONG_ERRORS:
5342 		*val = ntohl(fw_stats->dropped_overrun);
5343 		break;
5344 
5345 #ifdef SOLARIS_S11
5346 	case ETHER_STAT_TOOSHORT_ERRORS:
5347 		*val = ntohl(fw_stats->dropped_runt);
5348 		break;
5349 #endif
5350 
5351 	case ETHER_STAT_LINK_PAUSE:
5352 		*val = mgp->pause;
5353 		break;
5354 
5355 	case ETHER_STAT_LINK_AUTONEG:
5356 		*val = 1;
5357 		break;
5358 
5359 	case ETHER_STAT_LINK_DUPLEX:
5360 		*val = LINK_DUPLEX_FULL;
5361 		break;
5362 
5363 	default:
5364 		return (ENOTSUP);
5365 	}
5366 
5367 	return (0);
5368 }
5369 
5370 static mac_callbacks_t myri10ge_m_callbacks = {
5371 	(MC_IOCTL | MC_GETCAPAB),
5372 	myri10ge_m_stat,
5373 	myri10ge_m_start,
5374 	myri10ge_m_stop,
5375 	myri10ge_m_promisc,
5376 	myri10ge_m_multicst,
5377 	NULL,
5378 	NULL,
5379 	NULL,
5380 	myri10ge_m_ioctl,
5381 	myri10ge_m_getcapab
5382 };
5383 
5384 
5385 static int
5386 myri10ge_probe_slices(struct myri10ge_priv *mgp)
5387 {
5388 	myri10ge_cmd_t cmd;
5389 	int status;
5390 
5391 	mgp->num_slices = 1;
5392 
5393 	/* hit the board with a reset to ensure it is alive */
5394 	(void) memset(&cmd, 0, sizeof (cmd));
5395 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
5396 	if (status != 0) {
5397 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
5398 		return (ENXIO);
5399 	}
5400 
5401 	if (myri10ge_use_msix == 0)
5402 		return (0);
5403 
5404 	/* tell it the size of the interrupt queues */
5405 	cmd.data0 = mgp->max_intr_slots * sizeof (struct mcp_slot);
5406 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
5407 	if (status != 0) {
5408 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_SET_INTRQ_SIZE\n",
5409 		    mgp->name);
5410 		return (ENXIO);
5411 	}
5412 
5413 	/* ask the maximum number of slices it supports */
5414 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
5415 	    &cmd);
5416 	if (status != 0)
5417 		return (0);
5418 
5419 	mgp->num_slices = cmd.data0;
5420 
5421 	/*
5422 	 * if the admin did not specify a limit to how many
5423 	 * slices we should use, cap it automatically to the
5424 	 * number of CPUs currently online
5425 	 */
5426 	if (myri10ge_max_slices == -1)
5427 		myri10ge_max_slices = ncpus;
5428 
5429 	if (mgp->num_slices > myri10ge_max_slices)
5430 		mgp->num_slices = myri10ge_max_slices;
5431 
5432 
5433 	/*
5434 	 * Now try to allocate as many MSI-X vectors as we have
5435 	 * slices. We give up on MSI-X if we can only get a single
5436 	 * vector.
5437 	 */
5438 	while (mgp->num_slices > 1) {
5439 		/* make sure it is a power of two */
5440 		while (mgp->num_slices & (mgp->num_slices - 1))
5441 			mgp->num_slices--;
5442 		if (mgp->num_slices == 1)
5443 			return (0);
5444 
5445 		status = myri10ge_add_intrs(mgp, 0);
5446 		if (status == 0) {
5447 			myri10ge_rem_intrs(mgp, 0);
5448 			if (mgp->intr_cnt == mgp->num_slices) {
5449 				if (myri10ge_verbose)
5450 					printf("Got %d slices!\n",
5451 					    mgp->num_slices);
5452 				return (0);
5453 			}
5454 			mgp->num_slices = mgp->intr_cnt;
5455 		} else {
5456 			mgp->num_slices = mgp->num_slices / 2;
5457 		}
5458 	}
5459 
5460 	if (myri10ge_verbose)
5461 		printf("Got %d slices\n", mgp->num_slices);
5462 	return (0);
5463 }
5464 
5465 static void
5466 myri10ge_lro_free(struct myri10ge_slice_state *ss)
5467 {
5468 	struct lro_entry *lro;
5469 
5470 	while (ss->lro_free != NULL) {
5471 		lro = ss->lro_free;
5472 		ss->lro_free = lro->next;
5473 		kmem_free(lro, sizeof (*lro));
5474 	}
5475 }
5476 
5477 static void
5478 myri10ge_lro_alloc(struct myri10ge_slice_state *ss)
5479 {
5480 	struct lro_entry *lro;
5481 	int idx;
5482 
5483 	ss->lro_free = NULL;
5484 	ss->lro_active = NULL;
5485 
5486 	for (idx = 0; idx < myri10ge_lro_cnt; idx++) {
5487 		lro = kmem_zalloc(sizeof (*lro), KM_SLEEP);
5488 		if (lro == NULL)
5489 			continue;
5490 		lro->next = ss->lro_free;
5491 		ss->lro_free = lro;
5492 	}
5493 }
5494 
5495 static void
5496 myri10ge_free_slices(struct myri10ge_priv *mgp)
5497 {
5498 	struct myri10ge_slice_state *ss;
5499 	size_t bytes;
5500 	int i;
5501 
5502 	if (mgp->ss == NULL)
5503 		return;
5504 
5505 	for (i = 0; i < mgp->num_slices; i++) {
5506 		ss = &mgp->ss[i];
5507 		if (ss->rx_done.entry == NULL)
5508 			continue;
5509 		myri10ge_dma_free(&ss->rx_done.dma);
5510 		ss->rx_done.entry = NULL;
5511 		if (ss->fw_stats == NULL)
5512 			continue;
5513 		myri10ge_dma_free(&ss->fw_stats_dma);
5514 		ss->fw_stats = NULL;
5515 		mutex_destroy(&ss->rx_lock);
5516 		mutex_destroy(&ss->tx.lock);
5517 		mutex_destroy(&ss->tx.handle_lock);
5518 		mutex_destroy(&ss->poll_lock);
5519 		myri10ge_jpool_fini(ss);
5520 		myri10ge_slice_stat_destroy(ss);
5521 		myri10ge_lro_free(ss);
5522 	}
5523 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5524 	kmem_free(mgp->ss, bytes);
5525 	mgp->ss = NULL;
5526 }
5527 
5528 
5529 static int
5530 myri10ge_alloc_slices(struct myri10ge_priv *mgp)
5531 {
5532 	struct myri10ge_slice_state *ss;
5533 	size_t bytes;
5534 	int i;
5535 
5536 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5537 	mgp->ss = kmem_zalloc(bytes, KM_SLEEP);
5538 	if (mgp->ss == NULL)
5539 		return (ENOMEM);
5540 	for (i = 0; i < mgp->num_slices; i++) {
5541 		ss = &mgp->ss[i];
5542 
5543 		ss->mgp = mgp;
5544 
5545 		/* allocate the per-slice firmware stats */
5546 		bytes = sizeof (*ss->fw_stats);
5547 		ss->fw_stats = (mcp_irq_data_t *)(void *)
5548 		    myri10ge_dma_alloc(mgp->dip, bytes,
5549 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5550 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5551 		    &ss->fw_stats_dma, 1, DDI_DMA_DONTWAIT);
5552 		if (ss->fw_stats == NULL)
5553 			goto abort;
5554 		(void) memset(ss->fw_stats, 0, bytes);
5555 
5556 		/* allocate rx done ring */
5557 		bytes = mgp->max_intr_slots *
5558 		    sizeof (*ss->rx_done.entry);
5559 		ss->rx_done.entry = (mcp_slot_t *)(void *)
5560 		    myri10ge_dma_alloc(mgp->dip, bytes,
5561 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5562 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5563 		    &ss->rx_done.dma, 1, DDI_DMA_DONTWAIT);
5564 		if (ss->rx_done.entry == NULL) {
5565 			goto abort;
5566 		}
5567 		(void) memset(ss->rx_done.entry, 0, bytes);
5568 		mutex_init(&ss->rx_lock,   NULL, MUTEX_DEFAULT, mgp->icookie);
5569 		mutex_init(&ss->tx.lock,   NULL, MUTEX_DEFAULT, NULL);
5570 		mutex_init(&ss->tx.handle_lock,   NULL, MUTEX_DEFAULT, NULL);
5571 		mutex_init(&ss->poll_lock,   NULL, MUTEX_DEFAULT, NULL);
5572 		myri10ge_jpool_init(ss);
5573 		(void) myri10ge_slice_stat_init(ss);
5574 		myri10ge_lro_alloc(ss);
5575 	}
5576 
5577 	return (0);
5578 
5579 abort:
5580 	myri10ge_free_slices(mgp);
5581 	return (ENOMEM);
5582 }
5583 
5584 static int
5585 myri10ge_save_msi_state(struct myri10ge_priv *mgp,
5586     ddi_acc_handle_t handle)
5587 {
5588 	uint8_t ptr;
5589 	int err;
5590 
5591 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5592 	if (err != 0) {
5593 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5594 		    mgp->name);
5595 		return (DDI_FAILURE);
5596 	}
5597 	mgp->pci_saved_state.msi_ctrl =
5598 	    pci_config_get16(handle, ptr + PCI_MSI_CTRL);
5599 	mgp->pci_saved_state.msi_addr_low =
5600 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET);
5601 	mgp->pci_saved_state.msi_addr_high =
5602 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4);
5603 	mgp->pci_saved_state.msi_data_32 =
5604 	    pci_config_get16(handle, ptr + PCI_MSI_32BIT_DATA);
5605 	mgp->pci_saved_state.msi_data_64 =
5606 	    pci_config_get16(handle, ptr + PCI_MSI_64BIT_DATA);
5607 	return (DDI_SUCCESS);
5608 }
5609 
5610 static int
5611 myri10ge_restore_msi_state(struct myri10ge_priv *mgp,
5612     ddi_acc_handle_t handle)
5613 {
5614 	uint8_t ptr;
5615 	int err;
5616 
5617 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5618 	if (err != 0) {
5619 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5620 		    mgp->name);
5621 		return (DDI_FAILURE);
5622 	}
5623 
5624 	pci_config_put16(handle, ptr + PCI_MSI_CTRL,
5625 	    mgp->pci_saved_state.msi_ctrl);
5626 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET,
5627 	    mgp->pci_saved_state.msi_addr_low);
5628 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4,
5629 	    mgp->pci_saved_state.msi_addr_high);
5630 	pci_config_put16(handle, ptr + PCI_MSI_32BIT_DATA,
5631 	    mgp->pci_saved_state.msi_data_32);
5632 	pci_config_put16(handle, ptr + PCI_MSI_64BIT_DATA,
5633 	    mgp->pci_saved_state.msi_data_64);
5634 
5635 	return (DDI_SUCCESS);
5636 }
5637 
5638 static int
5639 myri10ge_save_pci_state(struct myri10ge_priv *mgp)
5640 {
5641 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5642 	int i;
5643 	int err = DDI_SUCCESS;
5644 
5645 
5646 	/* Save the non-extended PCI config space 32-bits at a time */
5647 	for (i = 0; i < 16; i++)
5648 		mgp->pci_saved_state.base[i] =
5649 		    pci_config_get32(handle, i*4);
5650 
5651 	/* now save MSI interrupt state *, if needed */
5652 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5653 		err = myri10ge_save_msi_state(mgp, handle);
5654 
5655 	return (err);
5656 }
5657 
5658 static int
5659 myri10ge_restore_pci_state(struct myri10ge_priv *mgp)
5660 {
5661 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5662 	int i;
5663 	int err = DDI_SUCCESS;
5664 
5665 
5666 	/* Restore the non-extended PCI config space 32-bits at a time */
5667 	for (i = 15; i >= 0; i--)
5668 		pci_config_put32(handle, i*4, mgp->pci_saved_state.base[i]);
5669 
5670 	/* now restore MSI interrupt state *, if needed */
5671 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5672 		err = myri10ge_restore_msi_state(mgp, handle);
5673 
5674 	if (mgp->max_read_request_4k)
5675 		(void) myri10ge_set_max_readreq(handle);
5676 	return (err);
5677 }
5678 
5679 
5680 static int
5681 myri10ge_suspend(dev_info_t *dip)
5682 {
5683 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5684 	int status;
5685 
5686 	if (mgp == NULL) {
5687 		cmn_err(CE_WARN, "null dip in myri10ge_suspend\n");
5688 		return (DDI_FAILURE);
5689 	}
5690 	if (mgp->dip != dip) {
5691 		cmn_err(CE_WARN, "bad dip in myri10ge_suspend\n");
5692 		return (DDI_FAILURE);
5693 	}
5694 	mutex_enter(&mgp->intrlock);
5695 	if (mgp->running == MYRI10GE_ETH_RUNNING) {
5696 		mgp->running = MYRI10GE_ETH_STOPPING;
5697 		mutex_exit(&mgp->intrlock);
5698 		(void) untimeout(mgp->timer_id);
5699 		mutex_enter(&mgp->intrlock);
5700 		myri10ge_stop_locked(mgp);
5701 		mgp->running = MYRI10GE_ETH_SUSPENDED_RUNNING;
5702 	}
5703 	status = myri10ge_save_pci_state(mgp);
5704 	mutex_exit(&mgp->intrlock);
5705 	return (status);
5706 }
5707 
5708 static int
5709 myri10ge_resume(dev_info_t *dip)
5710 {
5711 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5712 	int status = DDI_SUCCESS;
5713 
5714 	if (mgp == NULL) {
5715 		cmn_err(CE_WARN, "null dip in myri10ge_resume\n");
5716 		return (DDI_FAILURE);
5717 	}
5718 	if (mgp->dip != dip) {
5719 		cmn_err(CE_WARN, "bad dip in myri10ge_resume\n");
5720 		return (DDI_FAILURE);
5721 	}
5722 
5723 	mutex_enter(&mgp->intrlock);
5724 	status = myri10ge_restore_pci_state(mgp);
5725 	if (status == DDI_SUCCESS &&
5726 	    mgp->running == MYRI10GE_ETH_SUSPENDED_RUNNING) {
5727 		status = myri10ge_start_locked(mgp);
5728 	}
5729 	mutex_exit(&mgp->intrlock);
5730 	if (status != DDI_SUCCESS)
5731 		return (status);
5732 
5733 	/* start the watchdog timer */
5734 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
5735 	    mgp->timer_ticks);
5736 	return (DDI_SUCCESS);
5737 }
5738 
5739 static int
5740 myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5741 {
5742 
5743 	struct myri10ge_priv *mgp;
5744 	mac_register_t *macp, *omacp;
5745 	ddi_acc_handle_t handle;
5746 	uint32_t csr, hdr_offset;
5747 	int status, span, link_width, max_read_request_4k;
5748 	unsigned long bus_number, dev_number, func_number;
5749 	size_t bytes;
5750 	offset_t ss_offset;
5751 	uint8_t vso;
5752 
5753 	if (cmd == DDI_RESUME) {
5754 		return (myri10ge_resume(dip));
5755 	}
5756 
5757 	if (cmd != DDI_ATTACH)
5758 		return (DDI_FAILURE);
5759 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS)
5760 		return (DDI_FAILURE);
5761 
5762 	/* enable busmater and io space access */
5763 	csr = pci_config_get32(handle, PCI_CONF_COMM);
5764 	pci_config_put32(handle, PCI_CONF_COMM,
5765 	    (csr |PCI_COMM_ME|PCI_COMM_MAE));
5766 	status = myri10ge_read_pcie_link_width(handle, &link_width);
5767 	if (status != 0) {
5768 		cmn_err(CE_WARN, "could not read link width!\n");
5769 		link_width = 0;
5770 	}
5771 	max_read_request_4k = !myri10ge_set_max_readreq(handle);
5772 	status = myri10ge_find_cap(handle, &vso, PCI_CAP_ID_VS);
5773 	if (status != 0)
5774 		goto abort_with_cfg_hdl;
5775 	if ((omacp = mac_alloc(MAC_VERSION)) == NULL)
5776 		goto abort_with_cfg_hdl;
5777 	/*
5778 	 * XXXX Hack: mac_register_t grows in newer kernels.  To be
5779 	 * able to write newer fields, such as m_margin, without
5780 	 * writing outside allocated memory, we allocate our own macp
5781 	 * and pass that to mac_register()
5782 	 */
5783 	macp = kmem_zalloc(sizeof (*macp) * 8, KM_SLEEP);
5784 	macp->m_version = omacp->m_version;
5785 
5786 	if ((mgp = (struct myri10ge_priv *)
5787 	    kmem_zalloc(sizeof (*mgp), KM_SLEEP)) == NULL) {
5788 		goto abort_with_macinfo;
5789 	}
5790 	ddi_set_driver_private(dip, mgp);
5791 
5792 	/* setup device name for log messages */
5793 	(void) sprintf(mgp->name, "myri10ge%d", ddi_get_instance(dip));
5794 
5795 	mutex_enter(&myri10ge_param_lock);
5796 	myri10ge_get_props(dip);
5797 	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
5798 	mgp->pause = myri10ge_flow_control;
5799 	mutex_exit(&myri10ge_param_lock);
5800 
5801 	mgp->max_read_request_4k = max_read_request_4k;
5802 	mgp->pcie_link_width = link_width;
5803 	mgp->running = MYRI10GE_ETH_STOPPED;
5804 	mgp->vso = vso;
5805 	mgp->dip = dip;
5806 	mgp->cfg_hdl = handle;
5807 
5808 	mgp->timer_ticks = 5 * drv_usectohz(1000000); /* 5 seconds */
5809 	myri10ge_test_physical(dip);
5810 
5811 	/* allocate command page */
5812 	bytes = sizeof (*mgp->cmd);
5813 	mgp->cmd = (mcp_cmd_response_t *)
5814 	    (void *)myri10ge_dma_alloc(dip, bytes,
5815 	    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5816 	    DDI_DMA_CONSISTENT,	DDI_DMA_RDWR|DDI_DMA_CONSISTENT,
5817 	    &mgp->cmd_dma, 1, DDI_DMA_DONTWAIT);
5818 	if (mgp->cmd == NULL)
5819 		goto abort_with_mgp;
5820 
5821 	(void) myri10ge_reg_set(dip, &mgp->reg_set, &span, &bus_number,
5822 	    &dev_number, &func_number);
5823 	if (myri10ge_verbose)
5824 		printf("%s at %ld:%ld:%ld attaching\n", mgp->name,
5825 		    bus_number, dev_number, func_number);
5826 	status = ddi_regs_map_setup(dip, mgp->reg_set, (caddr_t *)&mgp->sram,
5827 	    (offset_t)0, (offset_t)span,  &myri10ge_dev_access_attr,
5828 	    &mgp->io_handle);
5829 	if (status != DDI_SUCCESS) {
5830 		cmn_err(CE_WARN, "%s: couldn't map memory space", mgp->name);
5831 		printf("%s: reg_set = %d, span = %d, status = %d",
5832 		    mgp->name, mgp->reg_set, span, status);
5833 		goto abort_with_mgp;
5834 	}
5835 
5836 	hdr_offset = *(uint32_t *)(void*)(mgp->sram +  MCP_HEADER_PTR_OFFSET);
5837 	hdr_offset = ntohl(hdr_offset) & 0xffffc;
5838 	ss_offset = hdr_offset +
5839 	    offsetof(struct mcp_gen_header, string_specs);
5840 	mgp->sram_size = ntohl(*(uint32_t *)(void*)(mgp->sram + ss_offset));
5841 	myri10ge_pio_copy32(mgp->eeprom_strings,
5842 	    (uint32_t *)(void*)((char *)mgp->sram + mgp->sram_size),
5843 	    MYRI10GE_EEPROM_STRINGS_SIZE);
5844 	(void) memset(mgp->eeprom_strings +
5845 	    MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
5846 
5847 	status = myri10ge_read_mac_addr(mgp);
5848 	if (status) {
5849 		goto abort_with_mapped;
5850 	}
5851 
5852 	status = myri10ge_select_firmware(mgp);
5853 	if (status != 0) {
5854 		cmn_err(CE_WARN, "%s: failed to load firmware\n", mgp->name);
5855 		goto abort_with_mapped;
5856 	}
5857 
5858 	status = myri10ge_probe_slices(mgp);
5859 	if (status != 0) {
5860 		cmn_err(CE_WARN, "%s: failed to probe slices\n", mgp->name);
5861 		goto abort_with_dummy_rdma;
5862 	}
5863 
5864 	status = myri10ge_alloc_slices(mgp);
5865 	if (status != 0) {
5866 		cmn_err(CE_WARN, "%s: failed to alloc slices\n", mgp->name);
5867 		goto abort_with_dummy_rdma;
5868 	}
5869 
5870 	/* add the interrupt handler */
5871 	status = myri10ge_add_intrs(mgp, 1);
5872 	if (status != 0) {
5873 		cmn_err(CE_WARN, "%s: Failed to add interrupt\n",
5874 		    mgp->name);
5875 		goto abort_with_slices;
5876 	}
5877 
5878 	/* now that we have an iblock_cookie, init the mutexes */
5879 	mutex_init(&mgp->cmd_lock, NULL, MUTEX_DRIVER, mgp->icookie);
5880 	mutex_init(&mgp->intrlock, NULL, MUTEX_DRIVER, mgp->icookie);
5881 
5882 
5883 	status = myri10ge_nic_stat_init(mgp);
5884 	if (status != DDI_SUCCESS)
5885 		goto abort_with_interrupts;
5886 	status = myri10ge_info_init(mgp);
5887 	if (status != DDI_SUCCESS)
5888 		goto abort_with_stats;
5889 
5890 	/*
5891 	 *	Initialize  GLD state
5892 	 */
5893 
5894 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
5895 	macp->m_driver = mgp;
5896 	macp->m_dip = dip;
5897 	macp->m_src_addr = mgp->mac_addr;
5898 	macp->m_callbacks = &myri10ge_m_callbacks;
5899 	macp->m_min_sdu = 0;
5900 	macp->m_max_sdu = myri10ge_mtu -
5901 	    (sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ);
5902 #ifdef SOLARIS_S11
5903 	macp->m_margin = VLAN_TAGSZ;
5904 #endif
5905 	macp->m_v12n = MAC_VIRT_LEVEL1;
5906 	status = mac_register(macp, &mgp->mh);
5907 	if (status != 0) {
5908 		cmn_err(CE_WARN, "%s: mac_register failed with %d\n",
5909 		    mgp->name, status);
5910 		goto abort_with_info;
5911 	}
5912 	myri10ge_ndd_init(mgp);
5913 	if (myri10ge_verbose)
5914 		printf("%s: %s, tx bndry %d, fw %s\n", mgp->name,
5915 		    mgp->intr_type, mgp->tx_boundary, mgp->fw_name);
5916 	mutex_enter(&myri10ge_param_lock);
5917 	mgp->next = mgp_list;
5918 	mgp_list = mgp;
5919 	mutex_exit(&myri10ge_param_lock);
5920 	kmem_free(macp, sizeof (*macp) * 8);
5921 	mac_free(omacp);
5922 	return (DDI_SUCCESS);
5923 
5924 abort_with_info:
5925 	myri10ge_info_destroy(mgp);
5926 
5927 abort_with_stats:
5928 	myri10ge_nic_stat_destroy(mgp);
5929 
5930 abort_with_interrupts:
5931 	mutex_destroy(&mgp->cmd_lock);
5932 	mutex_destroy(&mgp->intrlock);
5933 	myri10ge_rem_intrs(mgp, 1);
5934 
5935 abort_with_slices:
5936 	myri10ge_free_slices(mgp);
5937 
5938 abort_with_dummy_rdma:
5939 	myri10ge_dummy_rdma(mgp, 0);
5940 
5941 abort_with_mapped:
5942 	ddi_regs_map_free(&mgp->io_handle);
5943 
5944 	myri10ge_dma_free(&mgp->cmd_dma);
5945 
5946 abort_with_mgp:
5947 	kmem_free(mgp, sizeof (*mgp));
5948 
5949 abort_with_macinfo:
5950 	kmem_free(macp, sizeof (*macp) * 8);
5951 	mac_free(omacp);
5952 
5953 abort_with_cfg_hdl:
5954 	pci_config_teardown(&handle);
5955 	return (DDI_FAILURE);
5956 
5957 }
5958 
5959 
5960 static int
5961 myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5962 {
5963 	struct myri10ge_priv	*mgp, *tmp;
5964 	int 			status, i, jbufs_alloced;
5965 
5966 	if (cmd == DDI_SUSPEND) {
5967 		status = myri10ge_suspend(dip);
5968 		return (status);
5969 	}
5970 
5971 	if (cmd != DDI_DETACH) {
5972 		return (DDI_FAILURE);
5973 	}
5974 	/* Get the driver private (gld_mac_info_t) structure */
5975 	mgp = ddi_get_driver_private(dip);
5976 
5977 	mutex_enter(&mgp->intrlock);
5978 	jbufs_alloced = 0;
5979 	for (i = 0; i < mgp->num_slices; i++) {
5980 		myri10ge_remove_jbufs(&mgp->ss[i]);
5981 		jbufs_alloced += mgp->ss[i].jpool.num_alloc;
5982 	}
5983 	mutex_exit(&mgp->intrlock);
5984 	if (jbufs_alloced != 0) {
5985 		cmn_err(CE_NOTE, "%s: %d loaned rx buffers remain\n",
5986 		    mgp->name, jbufs_alloced);
5987 		return (DDI_FAILURE);
5988 	}
5989 
5990 	mutex_enter(&myri10ge_param_lock);
5991 	if (mgp->refcnt != 0) {
5992 		mutex_exit(&myri10ge_param_lock);
5993 		cmn_err(CE_NOTE, "%s: %d external refs remain\n",
5994 		    mgp->name, mgp->refcnt);
5995 		return (DDI_FAILURE);
5996 	}
5997 	mutex_exit(&myri10ge_param_lock);
5998 
5999 	status = mac_unregister(mgp->mh);
6000 	if (status != DDI_SUCCESS)
6001 		return (status);
6002 
6003 	myri10ge_ndd_fini(mgp);
6004 	myri10ge_dummy_rdma(mgp, 0);
6005 	myri10ge_nic_stat_destroy(mgp);
6006 	myri10ge_info_destroy(mgp);
6007 
6008 	mutex_destroy(&mgp->cmd_lock);
6009 	mutex_destroy(&mgp->intrlock);
6010 
6011 	myri10ge_rem_intrs(mgp, 1);
6012 
6013 	myri10ge_free_slices(mgp);
6014 	ddi_regs_map_free(&mgp->io_handle);
6015 	myri10ge_dma_free(&mgp->cmd_dma);
6016 	pci_config_teardown(&mgp->cfg_hdl);
6017 
6018 	mutex_enter(&myri10ge_param_lock);
6019 	if (mgp_list == mgp) {
6020 		mgp_list = mgp->next;
6021 	} else {
6022 		tmp = mgp_list;
6023 		while (tmp->next != mgp && tmp->next != NULL)
6024 			tmp = tmp->next;
6025 		if (tmp->next != NULL)
6026 			tmp->next = tmp->next->next;
6027 	}
6028 	kmem_free(mgp, sizeof (*mgp));
6029 	mutex_exit(&myri10ge_param_lock);
6030 	return (DDI_SUCCESS);
6031 }
6032 
6033 /*
6034  * Helper for quiesce entry point: Interrupt threads are not being
6035  * scheduled, so we must poll for the confirmation DMA to arrive in
6036  * the firmware stats block for slice 0.  We're essentially running
6037  * the guts of the interrupt handler, and just cherry picking the
6038  * confirmation that the NIC is queuesced (stats->link_down)
6039  */
6040 
6041 static int
6042 myri10ge_poll_down(struct myri10ge_priv *mgp)
6043 {
6044 	struct myri10ge_slice_state *ss = mgp->ss;
6045 	mcp_irq_data_t *stats = ss->fw_stats;
6046 	int valid;
6047 	int found_down = 0;
6048 
6049 
6050 	/* check for a pending IRQ */
6051 
6052 	if (! *((volatile uint8_t *)& stats->valid))
6053 		return (0);
6054 	valid = stats->valid;
6055 
6056 	/*
6057 	 * Make sure to tell the NIC to lower a legacy IRQ, else
6058 	 * it may have corrupt state after restarting
6059 	 */
6060 
6061 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
6062 		/* lower legacy IRQ  */
6063 		*mgp->irq_deassert = 0;
6064 		mb();
6065 		/* wait for irq conf DMA */
6066 		while (*((volatile uint8_t *)& stats->valid))
6067 			;
6068 	}
6069 	if (stats->stats_updated && stats->link_down)
6070 		found_down = 1;
6071 
6072 	if (valid & 0x1)
6073 		*ss->irq_claim = BE_32(3);
6074 	*(ss->irq_claim + 1) = BE_32(3);
6075 
6076 	return (found_down);
6077 }
6078 
6079 static int
6080 myri10ge_quiesce(dev_info_t *dip)
6081 {
6082 	struct myri10ge_priv *mgp;
6083 	myri10ge_cmd_t cmd;
6084 	int status, down, i;
6085 
6086 	mgp = ddi_get_driver_private(dip);
6087 	if (mgp == NULL)
6088 		return (DDI_FAILURE);
6089 
6090 	/* if devices was unplumbed, it is guaranteed to be quiescent */
6091 	if (mgp->running == MYRI10GE_ETH_STOPPED)
6092 		return (DDI_SUCCESS);
6093 
6094 	/* send a down CMD to queuesce NIC */
6095 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
6096 	if (status) {
6097 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
6098 		return (DDI_FAILURE);
6099 	}
6100 
6101 	for (i = 0; i < 20; i++) {
6102 		down = myri10ge_poll_down(mgp);
6103 		if (down)
6104 			break;
6105 		delay(drv_usectohz(100000));
6106 		mb();
6107 	}
6108 	if (down)
6109 		return (DDI_SUCCESS);
6110 	return (DDI_FAILURE);
6111 }
6112 
6113 /*
6114  * Distinguish between allocb'ed blocks, and gesballoc'ed attached
6115  * storage.
6116  */
6117 static void
6118 myri10ge_find_lastfree(void)
6119 {
6120 	mblk_t *mp = allocb(1024, 0);
6121 	dblk_t *dbp;
6122 
6123 	if (mp == NULL) {
6124 		cmn_err(CE_WARN, "myri10ge_find_lastfree failed\n");
6125 		return;
6126 	}
6127 	dbp = mp->b_datap;
6128 	myri10ge_db_lastfree = (void *)dbp->db_lastfree;
6129 }
6130 
6131 int
6132 _init(void)
6133 {
6134 	int i;
6135 
6136 	if (myri10ge_verbose)
6137 		cmn_err(CE_NOTE,
6138 		    "Myricom 10G driver (10GbE) version %s loading\n",
6139 		    MYRI10GE_VERSION_STR);
6140 	myri10ge_find_lastfree();
6141 	mac_init_ops(&myri10ge_ops, "myri10ge");
6142 	mutex_init(&myri10ge_param_lock, NULL, MUTEX_DEFAULT, NULL);
6143 	if ((i = mod_install(&modlinkage)) != 0) {
6144 		cmn_err(CE_WARN, "mod_install returned %d\n", i);
6145 		mac_fini_ops(&myri10ge_ops);
6146 		mutex_destroy(&myri10ge_param_lock);
6147 	}
6148 	return (i);
6149 }
6150 
6151 int
6152 _fini(void)
6153 {
6154 	int i;
6155 	i = mod_remove(&modlinkage);
6156 	if (i != 0) {
6157 		return (i);
6158 	}
6159 	mac_fini_ops(&myri10ge_ops);
6160 	mutex_destroy(&myri10ge_param_lock);
6161 	return (0);
6162 }
6163 
6164 int
6165 _info(struct modinfo *modinfop)
6166 {
6167 	return (mod_info(&modlinkage, modinfop));
6168 }
6169 
6170 
6171 /*
6172  *  This file uses MyriGE driver indentation.
6173  *
6174  * Local Variables:
6175  * c-file-style:"sun"
6176  * tab-width:8
6177  * End:
6178  */
6179