xref: /illumos-gate/usr/src/uts/common/io/myri10ge/drv/myri10ge.c (revision 5328fc53d11d7151861fa272e4fb0248b8f0e145)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
32 /*
33  * Copyright (c) 2014, Joyent, Inc.
34  * Copyright (c) 2016 by Delphix. All rights reserved.
35  */
36 
37 #define	MXGEFW_NDIS
38 #include "myri10ge_var.h"
39 #include "rss_eth_z8e.h"
40 #include "rss_ethp_z8e.h"
41 #include "mcp_gen_header.h"
42 
/*
 * MTU limits.  MYRI10GE_MAX_ETHER_MTU feeds the MYRICOM_PRIV default of
 * myri10ge_mtu; the two GLD values select MYRI10GE_DEFAULT_GLD_MTU
 * further down in this file.
 */
43 #define	MYRI10GE_MAX_ETHER_MTU 9014
44 #define	MYRI10GE_MAX_GLD_MTU	9000
45 #define	MYRI10GE_MIN_GLD_MTU	1500
46 
/*
 * Interface state codes.
 * NOTE(review): the field that stores these is not visible in this part
 * of the file -- confirm against myri10ge_var.h.
 */
47 #define	MYRI10GE_ETH_STOPPED 0
48 #define	MYRI10GE_ETH_STOPPING 1
49 #define	MYRI10GE_ETH_STARTING 2
50 #define	MYRI10GE_ETH_RUNNING 3
51 #define	MYRI10GE_ETH_OPEN_FAILED 4
52 #define	MYRI10GE_ETH_SUSPENDED_RUNNING 5
53 
/*
 * Driver tunables with their compiled-in defaults.  Only a few are
 * consumed in this part of the file:
 *   myri10ge_small_bytes        - payload size of one small-ring receive
 *                                 slot (MXGEFW_PAD is added when jumbo
 *                                 buffers are carved up)
 *   myri10ge_verbose            - enables extra printf() output, e.g. the
 *                                 MAC address found in the EEPROM strings
 *   myri10ge_tx_handles_initial - number of transmit DMA handles
 *                                 pre-allocated per transmit ring
 * NOTE(review): the remaining knobs are used outside the visible source;
 * their meaning is presumed from their names only.
 */
54 static int myri10ge_small_bytes = 510;
55 static int myri10ge_intr_coal_delay = 125;
56 static int myri10ge_flow_control = 1;
57 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
58 static int myri10ge_nvidia_ecrc_enable = 1;
59 #endif
60 static int myri10ge_mtu_override = 0;
61 static int myri10ge_tx_copylen = 512;
62 static int myri10ge_deassert_wait = 1;
63 static int myri10ge_verbose = 0;
64 static int myri10ge_watchdog_reset = 0;
65 static int myri10ge_use_msix = 1;
66 static int myri10ge_max_slices = -1;
67 static int myri10ge_use_msi = 1;
68 int myri10ge_force_firmware = 0;
69 static boolean_t myri10ge_use_lso = B_TRUE;
70 static int myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
71 static int myri10ge_tx_hash = 1;
72 static int myri10ge_lro = 0;
73 static int myri10ge_lro_cnt = 8;
74 int myri10ge_lro_max_aggr = 2;
75 static int myri10ge_lso_copy = 0;
/* forward declaration; the definition is outside this part of the file */
76 static mblk_t *myri10ge_send_wrapper(void *arg, mblk_t *mp);
77 int myri10ge_tx_handles_initial = 128;
78 
/*
 * File-scope lock and pointer.
 * NOTE(review): neither is referenced in the visible part of the file;
 * check their users before changing them.
 */
79 static	kmutex_t myri10ge_param_lock;
80 static void* myri10ge_db_lastfree;
81 
/* Driver entry points registered through myri10ge_ops below. */
82 static int myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
83 static int myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
84 static int myri10ge_quiesce(dev_info_t *dip);
85 
/*
 * Defines myri10ge_ops with the attach, detach and quiesce routines
 * declared above; the other slots are filled with nulldev/nodev/NULL.
 */
86 DDI_DEFINE_STREAM_OPS(myri10ge_ops, nulldev, nulldev, myri10ge_attach,
87     myri10ge_detach, nodev, NULL, D_MP, NULL, myri10ge_quiesce);
88 
89 
/* Module driver description: name string plus the ops vector above. */
90 static struct modldrv modldrv = {
91 	&mod_driverops,
92 	"Myricom 10G driver (10GbE)",
93 	&myri10ge_ops,
94 };
95 
96 
/* Module linkage: a NULL-terminated list holding only modldrv. */
97 static struct modlinkage modlinkage = {
98 	MODREV_1,
99 	{&modldrv, NULL},
100 };
101 
/* Six-byte all-ones (broadcast) Ethernet address. */
102 unsigned char myri10ge_broadcastaddr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
103 
/*
 * DMA attributes for miscellaneous allocations: 4KB aligned and limited
 * to a single cookie (scatter/gather length 1).
 * NOTE(review): not referenced in the visible part of the file.
 */
104 static ddi_dma_attr_t myri10ge_misc_dma_attr = {
105 	DMA_ATTR_V0,			/* version number. */
106 	(uint64_t)0,			/* low address */
107 	(uint64_t)0xffffffffffffffffULL, /* high address */
108 	(uint64_t)0x7ffffff,		/* address counter max */
109 	(uint64_t)4096,			/* alignment */
110 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
111 	(uint32_t)0x1,			/* minimum transfer size */
112 	(uint64_t)0x7fffffff,		/* maximum transfer size */
113 	(uint64_t)0x7fffffff,		/* maximum segment size */
114 	1,				/* scatter/gather list length */
115 	1,				/* granularity */
116 	0				/* attribute flags */
117 };
118 
119 /*
120  * The Myri10GE NIC has the following constraints on receive buffers:
121  * 1) Buffers which cross a 4KB boundary must be aligned to 4KB
122  * 2) Buffers which are not aligned to 4KB must not cross a 4KB boundary
123  */
124 
/*
 * Attributes for 4KB-aligned, single-cookie buffers.  Selected by
 * myri10ge_add_jbuf() when myri10ge_mtu is 2048 or more, and also used
 * by myri10ge_prepare_tx_ring() for the 4096-byte transmit copyblocks.
 */
125 static ddi_dma_attr_t myri10ge_rx_jumbo_dma_attr = {
126 	DMA_ATTR_V0,			/* version number. */
127 	(uint64_t)0,			/* low address */
128 	(uint64_t)0xffffffffffffffffULL, /* high address */
129 	(uint64_t)0x7ffffff,		/* address counter max */
130 	(uint64_t)4096,			/* alignment */
131 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
132 	(uint32_t)0x1,			/* minimum transfer size */
133 	(uint64_t)0x7fffffff,		/* maximum transfer size */
134 	UINT64_MAX,			/* maximum segment size */
135 	1,				/* scatter/gather list length */
136 	1,				/* granularity */
137 	0				/* attribute flags */
138 };
139 
/*
 * Attributes for receive buffers when myri10ge_mtu is below 2048.
 *
 * On non-SPARC builds the initial alignment is 0x80 with a 0xfff segment
 * limit.  This structure is deliberately not const: myri10ge_add_jbuf()
 * rewrites dma_attr_align to 4096 and dma_attr_seg to UINT64_MAX at run
 * time if a buffer crosses a 4KB boundary or an allocation fails.
 */
140 static ddi_dma_attr_t myri10ge_rx_std_dma_attr = {
141 	DMA_ATTR_V0,			/* version number. */
142 	(uint64_t)0,			/* low address */
143 	(uint64_t)0xffffffffffffffffULL, /* high address */
144 	(uint64_t)0x7ffffff,		/* address counter max */
145 #if defined sparc64 || defined __sparcv9
146 	(uint64_t)4096,			/* alignment */
147 #else
148 	(uint64_t)0x80,			/* alignment */
149 #endif
150 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
151 	(uint32_t)0x1,			/* minimum transfer size */
152 	(uint64_t)0x7fffffff,		/* maximum transfer size */
153 #if defined sparc64 || defined __sparcv9
154 	UINT64_MAX,			/* maximum segment size */
155 #else
156 	(uint64_t)0xfff,		/* maximum segment size */
157 #endif
158 	1,				/* scatter/gather list length */
159 	1,				/* granularity */
160 	0				/* attribute flags */
161 };
162 
/*
 * Attributes for the transmit DMA handles created by
 * myri10ge_add_tx_handle(): byte alignment and an effectively unlimited
 * scatter/gather list (INT32_MAX cookies).
 */
163 static ddi_dma_attr_t myri10ge_tx_dma_attr = {
164 	DMA_ATTR_V0,			/* version number. */
165 	(uint64_t)0,			/* low address */
166 	(uint64_t)0xffffffffffffffffULL, /* high address */
167 	(uint64_t)0x7ffffff,		/* address counter max */
168 	(uint64_t)1,			/* alignment */
169 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
170 	(uint32_t)0x1,			/* minimum transfer size */
171 	(uint64_t)0x7fffffff,		/* maximum transfer size */
172 	UINT64_MAX,			/* maximum segment size */
173 	INT32_MAX,			/* scatter/gather list length */
174 	1,				/* granularity */
175 	0			/* attribute flags */
176 };
177 
/*
 * WC selects the data-ordering attribute below: merging is allowed
 * everywhere except on SPARC, which uses strict ordering.
 */
177 #if defined sparc64 || defined __sparcv9
178 #define	WC 0
179 #else
180 #define	WC 1
181 #endif
182 
/* Access attributes handed to ddi_dma_mem_alloc() by this driver. */
183 struct ddi_device_acc_attr myri10ge_dev_access_attr = {
184 	DDI_DEVICE_ATTR_V0,		/* version */
185 	DDI_NEVERSWAP_ACC,		/* endian flags */
186 #if WC
187 	DDI_MERGING_OK_ACC		/* data order */
188 #else
189 	DDI_STRICTORDER_ACC		/* data order */
190 #endif
191 };
193 
193 static void myri10ge_watchdog(void *arg);
194 
/*
 * myri10ge_mtu is the size in bytes of every buffer in the jumbo pool
 * (see myri10ge_add_jbuf()).  It also decides which receive DMA
 * attributes are used: below 2048 the "std" set, otherwise "jumbo".
 * MYRICOM_PRIV builds default to the jumbo frame size.
 */
195 #ifdef MYRICOM_PRIV
196 int myri10ge_mtu = MYRI10GE_MAX_ETHER_MTU + MXGEFW_PAD + VLAN_TAGSZ;
197 #define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MAX_GLD_MTU
198 #else
199 int myri10ge_mtu = ETHERMAX + MXGEFW_PAD + VLAN_TAGSZ;
200 #define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MIN_GLD_MTU
201 #endif
/*
 * NOTE(review): presumably the initial and maximum number of big receive
 * buffers per slice; their users are outside the visible source.
 */
202 int myri10ge_bigbufs_initial = 1024;
203 int myri10ge_bigbufs_max = 4096;
205 
206 
207 caddr_t
208 myri10ge_dma_alloc(dev_info_t *dip, size_t len,
209     ddi_dma_attr_t *attr, ddi_device_acc_attr_t  *accattr,
210     uint_t alloc_flags, int bind_flags, struct myri10ge_dma_stuff *dma,
211     int warn, int (*wait)(caddr_t))
212 {
213 	caddr_t  kaddr;
214 	size_t real_length;
215 	ddi_dma_cookie_t cookie;
216 	uint_t count;
217 	int err;
218 
219 	err = ddi_dma_alloc_handle(dip, attr, wait,
220 	    NULL, &dma->handle);
221 	if (err != DDI_SUCCESS) {
222 		if (warn)
223 			cmn_err(CE_WARN,
224 			    "myri10ge: ddi_dma_alloc_handle failed\n");
225 		goto abort_with_nothing;
226 	}
227 
228 	err = ddi_dma_mem_alloc(dma->handle, len, accattr, alloc_flags,
229 	    wait, NULL, &kaddr, &real_length,
230 	    &dma->acc_handle);
231 	if (err != DDI_SUCCESS) {
232 		if (warn)
233 			cmn_err(CE_WARN,
234 			    "myri10ge: ddi_dma_mem_alloc failed\n");
235 		goto abort_with_handle;
236 	}
237 
238 	err = ddi_dma_addr_bind_handle(dma->handle, NULL, kaddr, len,
239 	    bind_flags, wait, NULL, &cookie, &count);
240 
241 	if (err != DDI_SUCCESS) {
242 		if (warn)
243 			cmn_err(CE_WARN,
244 			    "myri10ge: ddi_dma_addr_bind_handle failed\n");
245 		goto abort_with_mem;
246 	}
247 
248 	if (count != 1) {
249 		if (warn)
250 			cmn_err(CE_WARN,
251 			    "myri10ge: got too many dma segments ");
252 		goto abort_with_bind;
253 	}
254 	dma->low = htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
255 	dma->high = htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
256 	return (kaddr);
257 
258 abort_with_bind:
259 	(void) ddi_dma_unbind_handle(dma->handle);
260 
261 abort_with_mem:
262 	ddi_dma_mem_free(&dma->acc_handle);
263 
264 abort_with_handle:
265 	ddi_dma_free_handle(&dma->handle);
266 abort_with_nothing:
267 	if (warn) {
268 		cmn_err(CE_WARN, "myri10ge: myri10ge_dma_alloc failed.\n  ");
269 		cmn_err(CE_WARN, "args: dip=%p len=0x%lx ddi_dma_attr=%p\n",
270 		    (void*) dip, len, (void*) attr);
271 		cmn_err(CE_WARN,
272 		    "args: ddi_device_acc_attr=%p  alloc_flags=0x%x\n",
273 		    (void*) accattr, alloc_flags);
274 		cmn_err(CE_WARN, "args: bind_flags=0x%x  dmastuff=%p",
275 		    bind_flags, (void*) dma);
276 	}
277 	return (NULL);
278 
279 }
280 
281 void
282 myri10ge_dma_free(struct myri10ge_dma_stuff *dma)
283 {
284 	(void) ddi_dma_unbind_handle(dma->handle);
285 	ddi_dma_mem_free(&dma->acc_handle);
286 	ddi_dma_free_handle(&dma->handle);
287 }
288 
/*
 * Copy size / 4 32-bit words from from32 to the (volatile) destination,
 * one store per word in ascending address order.  Any trailing bytes
 * beyond the last whole word are not copied.
 */
static inline void
myri10ge_pio_copy32(void *to, uint32_t *from32, size_t size)
{
	volatile uint32_t *dst = (volatile uint32_t *)to;
	size_t nwords = size / 4;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from32[w];
}
302 
#if defined(_LP64)
/*
 * 64-bit variant of myri10ge_pio_copy32(): copies size / 8 64-bit words
 * with one store per word in ascending address order.  Only built for
 * LP64 kernels.
 */
static inline void
myri10ge_pio_copy64(void *to, uint64_t *from64, size_t size)
{
	volatile uint64_t *dst = (volatile uint64_t *)to;
	size_t nwords = size / 8;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from64[w];
}
#endif
318 
/*
 * Copy host memory to the NIC using the widest native word.
 *
 * "size" has to be a multiple of the word size used (8 bytes on LP64
 * kernels, 4 bytes otherwise), which is ASSERTed, and both pointers
 * must be naturally aligned for that word size.
 */
static inline void
myri10ge_pio_copy(void *to, void *from, size_t size)
{
#if defined(_LP64)
	ASSERT((size % 8) == 0);
	myri10ge_pio_copy64(to, (uint64_t *)from, size);
#else
	ASSERT((size % 4) == 0);
	myri10ge_pio_copy32(to, (uint32_t *)from, size);
#endif
}
336 
337 
338 /*
339  * Due to various bugs in Solaris (especially bug 6186772 where the
340  * TCP/UDP checksum is calculated incorrectly on mblk chains with more
341  * than two elements), and the design bug where hardware checksums are
342  * ignored on mblk chains with more than 2 elements, we need to
343  * allocate private pool of physically contiguous receive buffers.
344  */
345 
346 static void
347 myri10ge_jpool_init(struct myri10ge_slice_state *ss)
348 {
349 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
350 
351 	bzero(jpool, sizeof (*jpool));
352 	mutex_init(&jpool->mtx, NULL, MUTEX_DRIVER,
353 	    ss->mgp->icookie);
354 	jpool->head = NULL;
355 }
356 
357 static void
358 myri10ge_jpool_fini(struct myri10ge_slice_state *ss)
359 {
360 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
361 
362 	if (jpool->head != NULL) {
363 		cmn_err(CE_WARN,
364 		    "%s: BUG! myri10ge_jpool_fini called on non-empty pool\n",
365 		    ss->mgp->name);
366 	}
367 	mutex_destroy(&jpool->mtx);
368 }
369 
370 
371 /*
372  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
373  * at most 32 bytes at a time, so as to avoid involving the software
374  * pio handler in the nic.   We re-write the first segment's low
375  * DMA address to mark it valid only after we write the entire chunk
376  * in a burst
377  */
378 static inline void
379 myri10ge_submit_8rx(mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src)
380 {
381 	src->addr_low |= BE_32(1);
382 	myri10ge_pio_copy(dst, src, 4 * sizeof (*src));
383 	mb();
384 	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
385 	mb();
386 	src->addr_low &= ~(BE_32(1));
387 	dst->addr_low = src->addr_low;
388 	mb();
389 }
390 
391 static void
392 myri10ge_pull_jpool(struct myri10ge_slice_state *ss)
393 {
394 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
395 	struct myri10ge_jpool_entry *jtail, *j, *jfree;
396 	volatile void *putp;
397 	int i;
398 
399 	/* find tail */
400 	jtail = NULL;
401 	if (jpool->head != NULL) {
402 		j = jpool->head;
403 		while (j->next != NULL)
404 			j = j->next;
405 		jtail = j;
406 	}
407 
408 	/*
409 	 * iterate over all per-CPU caches, and add contents into
410 	 * jpool
411 	 */
412 	for (i = 0; i < MYRI10GE_MAX_CPUS; i++) {
413 		/* take per-CPU free list */
414 		putp = &jpool->cpu[i & MYRI10GE_MAX_CPU_MASK].head;
415 		jfree = atomic_swap_ptr(putp, NULL);
416 		if (jfree == NULL)
417 			continue;
418 
419 		/* append to pool */
420 		if (jtail == NULL) {
421 			jpool->head = jfree;
422 		} else {
423 			jtail->next = jfree;
424 		}
425 		j = jfree;
426 		while (j->next != NULL)
427 			j = j->next;
428 		jtail = j;
429 	}
430 }
431 
432 /*
433  * Transfers buffers from the free pool to the nic
434  * Must be called holding the jpool mutex.
435  */
436 
437 static inline void
438 myri10ge_restock_jumbos(struct myri10ge_slice_state *ss)
439 {
440 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
441 	struct myri10ge_jpool_entry *j;
442 	myri10ge_rx_ring_t *rx;
443 	int i, idx, limit;
444 
445 	rx = &ss->rx_big;
446 	limit = ss->j_rx_cnt + (rx->mask + 1);
447 
448 	for (i = rx->cnt; i != limit; i++) {
449 		idx = i & (rx->mask);
450 		j = jpool->head;
451 		if (j == NULL) {
452 			myri10ge_pull_jpool(ss);
453 			j = jpool->head;
454 			if (j == NULL) {
455 				break;
456 			}
457 		}
458 		jpool->head = j->next;
459 		rx->info[idx].j = j;
460 		rx->shadow[idx].addr_low = j->dma.low;
461 		rx->shadow[idx].addr_high = j->dma.high;
462 		/* copy 4 descriptors (32-bytes) to the mcp at a time */
463 		if ((idx & 7) == 7) {
464 			myri10ge_submit_8rx(&rx->lanai[idx - 7],
465 			    &rx->shadow[idx - 7]);
466 		}
467 	}
468 	rx->cnt = i;
469 }
470 
471 /*
472  * Transfer buffers from the nic to the free pool.
473  * Should be called holding the jpool mutex
474  */
475 
476 static inline void
477 myri10ge_unstock_jumbos(struct myri10ge_slice_state *ss)
478 {
479 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
480 	struct myri10ge_jpool_entry *j;
481 	myri10ge_rx_ring_t *rx;
482 	int i;
483 
484 	mutex_enter(&jpool->mtx);
485 	rx = &ss->rx_big;
486 
487 	for (i = 0; i < rx->mask + 1; i++) {
488 		j = rx->info[i].j;
489 		rx->info[i].j = NULL;
490 		if (j == NULL)
491 			continue;
492 		j->next = jpool->head;
493 		jpool->head = j;
494 	}
495 	mutex_exit(&jpool->mtx);
496 
497 }
498 
499 
500 /*
501  * Free routine which is called when the mblk allocated via
502  * esballoc() is freed.   Here we return the jumbo buffer
503  * to the free pool, and possibly pass some jumbo buffers
504  * to the nic
505  */
506 
507 static void
508 myri10ge_jfree_rtn(void *arg)
509 {
510 	struct myri10ge_jpool_entry *j = (struct myri10ge_jpool_entry *)arg;
511 	struct myri10ge_jpool_stuff *jpool;
512 	volatile uintptr_t *putp;
513 	uintptr_t old, new;
514 
515 	jpool = &j->ss->jpool;
516 
517 	/* prepend buffer locklessly to per-CPU freelist */
518 	putp = (void *)&jpool->cpu[CPU->cpu_seqid & MYRI10GE_MAX_CPU_MASK].head;
519 	new = (uintptr_t)j;
520 	do {
521 		old = *putp;
522 		j->next = (void *)old;
523 	} while (atomic_cas_ulong(putp, old, new) != old);
524 }
525 
526 static void
527 myri10ge_remove_jbuf(struct myri10ge_jpool_entry *j)
528 {
529 	(void) ddi_dma_unbind_handle(j->dma_handle);
530 	ddi_dma_mem_free(&j->acc_handle);
531 	ddi_dma_free_handle(&j->dma_handle);
532 	kmem_free(j, sizeof (*j));
533 }
534 
535 
536 /*
537  * Allocates one physically contiguous descriptor
538  * and add it to the jumbo buffer pool.
539  */
540 
541 static int
542 myri10ge_add_jbuf(struct myri10ge_slice_state *ss)
543 {
544 	struct myri10ge_jpool_entry *j;
545 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
546 	ddi_dma_attr_t *rx_dma_attr;
547 	size_t real_length;
548 	ddi_dma_cookie_t cookie;
549 	uint_t count;
550 	int err;
551 
552 	if (myri10ge_mtu < 2048)
553 		rx_dma_attr = &myri10ge_rx_std_dma_attr;
554 	else
555 		rx_dma_attr = &myri10ge_rx_jumbo_dma_attr;
556 
557 again:
558 	j = (struct myri10ge_jpool_entry *)
559 	    kmem_alloc(sizeof (*j), KM_SLEEP);
560 	err = ddi_dma_alloc_handle(ss->mgp->dip, rx_dma_attr,
561 	    DDI_DMA_DONTWAIT, NULL, &j->dma_handle);
562 	if (err != DDI_SUCCESS)
563 		goto abort_with_j;
564 
565 	err = ddi_dma_mem_alloc(j->dma_handle, myri10ge_mtu,
566 	    &myri10ge_dev_access_attr,  DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
567 	    NULL, &j->buf, &real_length, &j->acc_handle);
568 	if (err != DDI_SUCCESS)
569 		goto abort_with_handle;
570 
571 	err = ddi_dma_addr_bind_handle(j->dma_handle, NULL, j->buf,
572 	    real_length, DDI_DMA_READ|DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
573 	    NULL, &cookie, &count);
574 	if (err != DDI_SUCCESS)
575 		goto abort_with_mem;
576 
577 	/*
578 	 * Make certain std MTU buffers do not cross a 4KB boundary:
579 	 *
580 	 * Setting dma_attr_align=4096 will do this, but the system
581 	 * will only allocate 1 RX buffer per 4KB page, rather than 2.
582 	 * Setting dma_attr_granular=4096 *seems* to work around this,
583 	 * but I'm paranoid about future systems no longer honoring
584 	 * this, so fall back to the safe, but memory wasting way if a
585 	 * buffer crosses a 4KB boundary.
586 	 */
587 
588 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
589 	    rx_dma_attr->dma_attr_align != 4096) {
590 		uint32_t start, end;
591 
592 		start = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
593 		end = start + myri10ge_mtu;
594 		if (((end >> 12) != (start >> 12)) && (start & 4095U)) {
595 			printf("std buffer crossed a 4KB boundary!\n");
596 			myri10ge_remove_jbuf(j);
597 			rx_dma_attr->dma_attr_align = 4096;
598 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
599 			goto again;
600 		}
601 	}
602 
603 	j->dma.low =
604 	    htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
605 	j->dma.high =
606 	    htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
607 	j->ss = ss;
608 
609 
610 	j->free_func.free_func = myri10ge_jfree_rtn;
611 	j->free_func.free_arg = (char *)j;
612 	mutex_enter(&jpool->mtx);
613 	j->next = jpool->head;
614 	jpool->head = j;
615 	jpool->num_alloc++;
616 	mutex_exit(&jpool->mtx);
617 	return (0);
618 
619 abort_with_mem:
620 	ddi_dma_mem_free(&j->acc_handle);
621 
622 abort_with_handle:
623 	ddi_dma_free_handle(&j->dma_handle);
624 
625 abort_with_j:
626 	kmem_free(j, sizeof (*j));
627 
628 	/*
629 	 * If an allocation failed, perhaps it failed because it could
630 	 * not satisfy granularity requirement.  Disable that, and
631 	 * try agin.
632 	 */
633 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
634 	    rx_dma_attr->dma_attr_align != 4096) {
635 			cmn_err(CE_NOTE,
636 			    "!alloc failed, reverting to gran=1\n");
637 			rx_dma_attr->dma_attr_align = 4096;
638 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
639 			goto again;
640 	}
641 	return (err);
642 }
643 
644 static int
645 myri10ge_jfree_cnt(struct myri10ge_jpool_stuff *jpool)
646 {
647 	int i;
648 	struct myri10ge_jpool_entry *j;
649 
650 	mutex_enter(&jpool->mtx);
651 	j = jpool->head;
652 	i = 0;
653 	while (j != NULL) {
654 		i++;
655 		j = j->next;
656 	}
657 	mutex_exit(&jpool->mtx);
658 	return (i);
659 }
660 
661 static int
662 myri10ge_add_jbufs(struct myri10ge_slice_state *ss, int num, int total)
663 {
664 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
665 	int allocated = 0;
666 	int err;
667 	int needed;
668 
669 	/*
670 	 * if total is set, user wants "num" jbufs in the pool,
671 	 * otherwise the user wants to "num" additional jbufs
672 	 * added to the pool
673 	 */
674 	if (total && jpool->num_alloc) {
675 		allocated = myri10ge_jfree_cnt(jpool);
676 		needed = num - allocated;
677 	} else {
678 		needed = num;
679 	}
680 
681 	while (needed > 0) {
682 		needed--;
683 		err = myri10ge_add_jbuf(ss);
684 		if (err == 0) {
685 			allocated++;
686 		}
687 	}
688 	return (allocated);
689 }
690 
691 static void
692 myri10ge_remove_jbufs(struct myri10ge_slice_state *ss)
693 {
694 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
695 	struct myri10ge_jpool_entry *j;
696 
697 	mutex_enter(&jpool->mtx);
698 	myri10ge_pull_jpool(ss);
699 	while (jpool->head != NULL) {
700 		jpool->num_alloc--;
701 		j = jpool->head;
702 		jpool->head = j->next;
703 		myri10ge_remove_jbuf(j);
704 	}
705 	mutex_exit(&jpool->mtx);
706 }
707 
708 static void
709 myri10ge_carve_up_jbufs_into_small_ring(struct myri10ge_slice_state *ss)
710 {
711 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
712 	struct myri10ge_jpool_entry *j = NULL;
713 	caddr_t ptr;
714 	uint32_t dma_low, dma_high;
715 	int idx, len;
716 	unsigned int alloc_size;
717 
718 	dma_low = dma_high = len = 0;
719 	alloc_size = myri10ge_small_bytes + MXGEFW_PAD;
720 	ptr = NULL;
721 	for (idx = 0; idx < ss->rx_small.mask + 1; idx++) {
722 		/* Allocate a jumbo frame and carve it into small frames */
723 		if (len < alloc_size) {
724 			mutex_enter(&jpool->mtx);
725 			/* remove jumbo from freelist */
726 			j = jpool->head;
727 			jpool->head = j->next;
728 			/* place it onto small list */
729 			j->next = ss->small_jpool;
730 			ss->small_jpool = j;
731 			mutex_exit(&jpool->mtx);
732 			len = myri10ge_mtu;
733 			dma_low = ntohl(j->dma.low);
734 			dma_high = ntohl(j->dma.high);
735 			ptr = j->buf;
736 		}
737 		ss->rx_small.info[idx].ptr = ptr;
738 		ss->rx_small.shadow[idx].addr_low = htonl(dma_low);
739 		ss->rx_small.shadow[idx].addr_high = htonl(dma_high);
740 		len -= alloc_size;
741 		ptr += alloc_size;
742 		dma_low += alloc_size;
743 	}
744 }
745 
746 /*
747  * Return the jumbo bufs we carved up for small to the jumbo pool
748  */
749 
750 static void
751 myri10ge_release_small_jbufs(struct myri10ge_slice_state *ss)
752 {
753 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
754 	struct myri10ge_jpool_entry *j = NULL;
755 
756 	mutex_enter(&jpool->mtx);
757 	while (ss->small_jpool != NULL) {
758 		j = ss->small_jpool;
759 		ss->small_jpool = j->next;
760 		j->next = jpool->head;
761 		jpool->head = j;
762 	}
763 	mutex_exit(&jpool->mtx);
764 	ss->jbufs_for_smalls = 0;
765 }
766 
767 static int
768 myri10ge_add_tx_handle(struct myri10ge_slice_state *ss)
769 {
770 	myri10ge_tx_ring_t *tx = &ss->tx;
771 	struct myri10ge_priv *mgp = ss->mgp;
772 	struct myri10ge_tx_dma_handle *handle;
773 	int err;
774 
775 	handle = kmem_zalloc(sizeof (*handle), KM_SLEEP);
776 	err = ddi_dma_alloc_handle(mgp->dip,
777 	    &myri10ge_tx_dma_attr,
778 	    DDI_DMA_SLEEP, NULL,
779 	    &handle->h);
780 	if (err) {
781 		static int limit = 0;
782 		if (limit == 0)
783 			cmn_err(CE_WARN, "%s: Falled to alloc tx dma handle\n",
784 			    mgp->name);
785 		limit++;
786 		kmem_free(handle, sizeof (*handle));
787 		return (err);
788 	}
789 	mutex_enter(&tx->handle_lock);
790 	MYRI10GE_SLICE_STAT_INC(tx_handles_alloced);
791 	handle->next = tx->free_tx_handles;
792 	tx->free_tx_handles = handle;
793 	mutex_exit(&tx->handle_lock);
794 	return (DDI_SUCCESS);
795 }
796 
797 static void
798 myri10ge_remove_tx_handles(struct myri10ge_slice_state *ss)
799 {
800 	myri10ge_tx_ring_t *tx = &ss->tx;
801 	struct myri10ge_tx_dma_handle *handle;
802 	mutex_enter(&tx->handle_lock);
803 
804 	handle = tx->free_tx_handles;
805 	while (handle != NULL) {
806 		tx->free_tx_handles = handle->next;
807 		ddi_dma_free_handle(&handle->h);
808 		kmem_free(handle, sizeof (*handle));
809 		handle = tx->free_tx_handles;
810 		MYRI10GE_SLICE_STAT_DEC(tx_handles_alloced);
811 	}
812 	mutex_exit(&tx->handle_lock);
813 	if (MYRI10GE_SLICE_STAT(tx_handles_alloced) != 0) {
814 		cmn_err(CE_WARN, "%s: %d tx dma handles allocated at close\n",
815 		    ss->mgp->name,
816 		    (int)MYRI10GE_SLICE_STAT(tx_handles_alloced));
817 	}
818 }
819 
820 static void
821 myri10ge_free_tx_handles(myri10ge_tx_ring_t *tx,
822     struct myri10ge_tx_dma_handle_head *list)
823 {
824 	mutex_enter(&tx->handle_lock);
825 	list->tail->next = tx->free_tx_handles;
826 	tx->free_tx_handles = list->head;
827 	mutex_exit(&tx->handle_lock);
828 }
829 
830 static void
831 myri10ge_free_tx_handle_slist(myri10ge_tx_ring_t *tx,
832     struct myri10ge_tx_dma_handle *handle)
833 {
834 	struct myri10ge_tx_dma_handle_head list;
835 
836 	if (handle == NULL)
837 		return;
838 	list.head = handle;
839 	list.tail = handle;
840 	while (handle != NULL) {
841 		list.tail = handle;
842 		handle = handle->next;
843 	}
844 	myri10ge_free_tx_handles(tx, &list);
845 }
846 
847 static int
848 myri10ge_alloc_tx_handles(struct myri10ge_slice_state *ss, int count,
849     struct myri10ge_tx_dma_handle **ret)
850 {
851 	myri10ge_tx_ring_t *tx = &ss->tx;
852 	struct myri10ge_tx_dma_handle *handle;
853 	int err, i;
854 
855 	mutex_enter(&tx->handle_lock);
856 	for (i = 0; i < count; i++) {
857 		handle = tx->free_tx_handles;
858 		while (handle == NULL) {
859 			mutex_exit(&tx->handle_lock);
860 			err = myri10ge_add_tx_handle(ss);
861 			if (err != DDI_SUCCESS) {
862 				goto abort_with_handles;
863 			}
864 			mutex_enter(&tx->handle_lock);
865 			handle = tx->free_tx_handles;
866 		}
867 		tx->free_tx_handles = handle->next;
868 		handle->next = *ret;
869 		*ret = handle;
870 	}
871 	mutex_exit(&tx->handle_lock);
872 	return (DDI_SUCCESS);
873 
874 abort_with_handles:
875 	myri10ge_free_tx_handle_slist(tx, *ret);
876 	return (err);
877 }
878 
879 
880 /*
881  * Frees DMA resources associated with the send ring
882  */
883 static void
884 myri10ge_unprepare_tx_ring(struct myri10ge_slice_state *ss)
885 {
886 	myri10ge_tx_ring_t *tx;
887 	struct myri10ge_tx_dma_handle_head handles;
888 	size_t bytes;
889 	int idx;
890 
891 	tx = &ss->tx;
892 	handles.head = NULL;
893 	handles.tail = NULL;
894 	for (idx = 0; idx < ss->tx.mask + 1; idx++) {
895 		if (tx->info[idx].m) {
896 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
897 			handles.head = tx->info[idx].handle;
898 			if (handles.tail == NULL)
899 				handles.tail = tx->info[idx].handle;
900 			freeb(tx->info[idx].m);
901 			tx->info[idx].m = 0;
902 			tx->info[idx].handle = 0;
903 		}
904 		tx->cp[idx].va = NULL;
905 		myri10ge_dma_free(&tx->cp[idx].dma);
906 	}
907 	bytes = sizeof (*tx->cp) * (tx->mask + 1);
908 	kmem_free(tx->cp, bytes);
909 	tx->cp = NULL;
910 	if (handles.head != NULL)
911 		myri10ge_free_tx_handles(tx, &handles);
912 	myri10ge_remove_tx_handles(ss);
913 }
914 
915 /*
916  * Allocates DMA handles associated with the send ring
917  */
918 static inline int
919 myri10ge_prepare_tx_ring(struct myri10ge_slice_state *ss)
920 {
921 	struct myri10ge_tx_dma_handle *handles;
922 	int h;
923 	size_t bytes;
924 
925 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
926 	ss->tx.cp = kmem_zalloc(bytes, KM_SLEEP);
927 	if (ss->tx.cp == NULL) {
928 		cmn_err(CE_WARN,
929 		    "%s: Failed to allocate tx copyblock storage\n",
930 		    ss->mgp->name);
931 		return (DDI_FAILURE);
932 	}
933 
934 
935 	/* allocate the TX copyblocks */
936 	for (h = 0; h < ss->tx.mask + 1; h++) {
937 		ss->tx.cp[h].va = myri10ge_dma_alloc(ss->mgp->dip,
938 		    4096, &myri10ge_rx_jumbo_dma_attr,
939 		    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
940 		    DDI_DMA_WRITE|DDI_DMA_STREAMING, &ss->tx.cp[h].dma, 1,
941 		    DDI_DMA_DONTWAIT);
942 		if (ss->tx.cp[h].va == NULL) {
943 			cmn_err(CE_WARN, "%s: Failed to allocate tx "
944 			    "copyblock %d\n", ss->mgp->name, h);
945 			goto abort_with_copyblocks;
946 		}
947 	}
948 	/* pre-allocate transmit handles */
949 	handles = NULL;
950 	(void) myri10ge_alloc_tx_handles(ss, myri10ge_tx_handles_initial,
951 	    &handles);
952 	if (handles != NULL)
953 		myri10ge_free_tx_handle_slist(&ss->tx, handles);
954 
955 	return (DDI_SUCCESS);
956 
957 abort_with_copyblocks:
958 	while (h > 0)  {
959 		h--;
960 		myri10ge_dma_free(&ss->tx.cp[h].dma);
961 	}
962 
963 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
964 	kmem_free(ss->tx.cp, bytes);
965 	ss->tx.cp = NULL;
966 	return (DDI_FAILURE);
967 }
968 
969 /*
970  * The eeprom strings on the lanaiX have the format
971  * SN=x\0
972  * MAC=x:x:x:x:x:x\0
973  * PT:ddd mmm xx xx:xx:xx xx\0
974  * PV:ddd mmm xx xx:xx:xx xx\0
975  */
976 static int
977 myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
978 {
979 #define	MYRI10GE_NEXT_STRING(p) while (ptr < limit && *ptr++)
980 #define	myri10ge_digit(c) (((c) >= '0' && (c) <= '9') ? ((c) - '0') :	\
981 		(((c) >= 'A' && (c) <= 'F') ? (10 + (c) - 'A') :	\
982 		(((c) >= 'a' && (c) <= 'f') ? (10 + (c) - 'a') : -1)))
983 
984 	char *ptr, *limit;
985 	int i, hv, lv;
986 
987 	ptr = mgp->eeprom_strings;
988 	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
989 
990 	while (*ptr != '\0' && ptr < limit) {
991 		if (memcmp(ptr, "MAC=", 4) == 0) {
992 			ptr += 4;
993 			if (myri10ge_verbose)
994 				printf("%s: mac address = %s\n", mgp->name,
995 				    ptr);
996 			mgp->mac_addr_string = ptr;
997 			for (i = 0; i < 6; i++) {
998 				if ((ptr + 2) > limit)
999 					goto abort;
1000 
1001 				if (*(ptr+1) == ':') {
1002 					hv = 0;
1003 					lv = myri10ge_digit(*ptr); ptr++;
1004 				} else {
1005 					hv = myri10ge_digit(*ptr); ptr++;
1006 					lv = myri10ge_digit(*ptr); ptr++;
1007 				}
1008 				mgp->mac_addr[i] = (hv << 4) | lv;
1009 				ptr++;
1010 			}
1011 		}
1012 		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
1013 			ptr += 3;
1014 			mgp->sn_str = (char *)ptr;
1015 		}
1016 		if (memcmp((const void *)ptr, "PC=", 3) == 0) {
1017 			ptr += 3;
1018 			mgp->pc_str = (char *)ptr;
1019 		}
1020 		MYRI10GE_NEXT_STRING(ptr);
1021 	}
1022 
1023 	return (0);
1024 
1025 abort:
1026 	cmn_err(CE_WARN, "%s: failed to parse eeprom_strings", mgp->name);
1027 	return (ENXIO);
1028 }
1029 
1030 
1031 /*
1032  * Determine the register set containing the PCI resource we
1033  * want to map: the memory-mappable part of the interface. We do
1034  * this by scanning the DDI "reg" property of the interface,
1035  * which is an array of mx_ddi_reg_set structures.
1036  */
1037 static int
1038 myri10ge_reg_set(dev_info_t *dip, int *reg_set, int *span,
1039     unsigned long *busno, unsigned long *devno,
1040     unsigned long *funcno)
1041 {
1042 
1043 #define	REGISTER_NUMBER(ip)	(ip[0] >>  0 & 0xff)
1044 #define	FUNCTION_NUMBER(ip)	(ip[0] >>  8 & 0x07)
1045 #define	DEVICE_NUMBER(ip)	(ip[0] >> 11 & 0x1f)
1046 #define	BUS_NUMBER(ip)		(ip[0] >> 16 & 0xff)
1047 #define	ADDRESS_SPACE(ip)	(ip[0] >> 24 & 0x03)
1048 #define	PCI_ADDR_HIGH(ip)	(ip[1])
1049 #define	PCI_ADDR_LOW(ip)	(ip[2])
1050 #define	PCI_SPAN_HIGH(ip)	(ip[3])
1051 #define	PCI_SPAN_LOW(ip)	(ip[4])
1052 
1053 #define	MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
1054 #define	MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3
1055 
1056 	int *data, i, *rs;
1057 	uint32_t nelementsp;
1058 
1059 #ifdef MYRI10GE_REGSET_VERBOSE
1060 	char *address_space_name[] = { "Configuration Space",
1061 					"I/O Space",
1062 					"32-bit Memory Space",
1063 					"64-bit Memory Space"
1064 	};
1065 #endif
1066 
1067 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1068 	    "reg", &data, &nelementsp) != DDI_SUCCESS) {
1069 		printf("Could not determine register set.\n");
1070 		return (ENXIO);
1071 	}
1072 
1073 #ifdef MYRI10GE_REGSET_VERBOSE
1074 	printf("There are %d register sets.\n", nelementsp / 5);
1075 #endif
1076 	if (!nelementsp) {
1077 		printf("Didn't find any \"reg\" properties.\n");
1078 		ddi_prop_free(data);
1079 		return (ENODEV);
1080 	}
1081 
1082 	/* Scan for the register number. */
1083 	rs = &data[0];
1084 	*busno = BUS_NUMBER(rs);
1085 	*devno = DEVICE_NUMBER(rs);
1086 	*funcno = FUNCTION_NUMBER(rs);
1087 
1088 #ifdef MYRI10GE_REGSET_VERBOSE
1089 	printf("*** Scanning for register number.\n");
1090 #endif
1091 	for (i = 0; i < nelementsp / 5; i++) {
1092 		rs = &data[5 * i];
1093 #ifdef MYRI10GE_REGSET_VERBOSE
1094 		printf("Examining register set %d:\n", i);
1095 		printf("  Register number = %d.\n", REGISTER_NUMBER(rs));
1096 		printf("  Function number = %d.\n", FUNCTION_NUMBER(rs));
1097 		printf("  Device number   = %d.\n", DEVICE_NUMBER(rs));
1098 		printf("  Bus number      = %d.\n", BUS_NUMBER(rs));
1099 		printf("  Address space   = %d (%s ).\n", ADDRESS_SPACE(rs),
1100 		    address_space_name[ADDRESS_SPACE(rs)]);
1101 		printf("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH(rs),
1102 		    PCI_ADDR_LOW(rs));
1103 		printf("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH(rs),
1104 		    PCI_SPAN_LOW(rs));
1105 #endif
1106 		/* We are looking for a memory property. */
1107 
1108 		if (ADDRESS_SPACE(rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE ||
1109 		    ADDRESS_SPACE(rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
1110 			*reg_set = i;
1111 
1112 #ifdef MYRI10GE_REGSET_VERBOSE
1113 			printf("%s uses register set %d.\n",
1114 			    address_space_name[ADDRESS_SPACE(rs)], *reg_set);
1115 #endif
1116 
1117 			*span = (PCI_SPAN_LOW(rs));
1118 #ifdef MYRI10GE_REGSET_VERBOSE
1119 			printf("Board span is 0x%x\n", *span);
1120 #endif
1121 			break;
1122 		}
1123 	}
1124 
1125 	ddi_prop_free(data);
1126 
1127 	/* If no match, fail. */
1128 	if (i >= nelementsp / 5) {
1129 		return (EIO);
1130 	}
1131 
1132 	return (0);
1133 }
1134 
1135 
1136 static int
1137 myri10ge_load_firmware_from_zlib(struct myri10ge_priv *mgp, uint32_t *limit)
1138 {
1139 	void *inflate_buffer;
1140 	int rv, status;
1141 	size_t sram_size = mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE;
1142 	size_t destlen;
1143 	mcp_gen_header_t *hdr;
1144 	unsigned hdr_offset, i;
1145 
1146 
1147 	*limit = 0; /* -Wuninitialized */
1148 	status = 0;
1149 
1150 	inflate_buffer = kmem_zalloc(sram_size, KM_NOSLEEP);
1151 	if (!inflate_buffer) {
1152 		cmn_err(CE_WARN,
1153 		    "%s: Could not allocate buffer to inflate mcp\n",
1154 		    mgp->name);
1155 		return (ENOMEM);
1156 	}
1157 
1158 	destlen = sram_size;
1159 	rv = z_uncompress(inflate_buffer, &destlen, mgp->eth_z8e,
1160 	    mgp->eth_z8e_length);
1161 
1162 	if (rv != Z_OK) {
1163 		cmn_err(CE_WARN, "%s: Could not inflate mcp: %s\n",
1164 		    mgp->name, z_strerror(rv));
1165 		status = ENXIO;
1166 		goto abort;
1167 	}
1168 
1169 	*limit = (uint32_t)destlen;
1170 
1171 	hdr_offset = htonl(*(uint32_t *)(void *)((char *)inflate_buffer +
1172 	    MCP_HEADER_PTR_OFFSET));
1173 	hdr = (void *)((char *)inflate_buffer + hdr_offset);
1174 	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
1175 		cmn_err(CE_WARN, "%s: Bad firmware type: 0x%x\n", mgp->name,
1176 		    ntohl(hdr->mcp_type));
1177 		status = EIO;
1178 		goto abort;
1179 	}
1180 
1181 	/* save firmware version for kstat */
1182 	(void) strncpy(mgp->fw_version, hdr->version, sizeof (mgp->fw_version));
1183 	if (myri10ge_verbose)
1184 		printf("%s: firmware id: %s\n", mgp->name, hdr->version);
1185 
1186 	/* Copy the inflated firmware to NIC SRAM. */
1187 	for (i = 0; i < *limit; i += 256) {
1188 		myri10ge_pio_copy((char *)mgp->sram + MYRI10GE_FW_OFFSET + i,
1189 		    (char *)inflate_buffer + i,
1190 		    min(256U, (unsigned)(*limit - i)));
1191 		mb();
1192 		(void) *(int *)(void *)mgp->sram;
1193 		mb();
1194 	}
1195 
1196 abort:
1197 	kmem_free(inflate_buffer, sram_size);
1198 
1199 	return (status);
1200 
1201 }
1202 
1203 
1204 int
1205 myri10ge_send_cmd(struct myri10ge_priv *mgp, uint32_t cmd,
1206     myri10ge_cmd_t *data)
1207 {
1208 	mcp_cmd_t *buf;
1209 	char buf_bytes[sizeof (*buf) + 8];
1210 	volatile mcp_cmd_response_t *response = mgp->cmd;
1211 	volatile char *cmd_addr =
1212 	    (volatile char *)mgp->sram + MXGEFW_ETH_CMD;
1213 	int sleep_total = 0;
1214 
1215 	/* ensure buf is aligned to 8 bytes */
1216 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1217 
1218 	buf->data0 = htonl(data->data0);
1219 	buf->data1 = htonl(data->data1);
1220 	buf->data2 = htonl(data->data2);
1221 	buf->cmd = htonl(cmd);
1222 	buf->response_addr.low = mgp->cmd_dma.low;
1223 	buf->response_addr.high = mgp->cmd_dma.high;
1224 	mutex_enter(&mgp->cmd_lock);
1225 	response->result = 0xffffffff;
1226 	mb();
1227 
1228 	myri10ge_pio_copy((void *)cmd_addr, buf, sizeof (*buf));
1229 
1230 	/* wait up to 20ms */
1231 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
1232 		mb();
1233 		if (response->result != 0xffffffff) {
1234 			if (response->result == 0) {
1235 				data->data0 = ntohl(response->data);
1236 				mutex_exit(&mgp->cmd_lock);
1237 				return (0);
1238 			} else if (ntohl(response->result)
1239 			    == MXGEFW_CMD_UNKNOWN) {
1240 				mutex_exit(&mgp->cmd_lock);
1241 				return (ENOSYS);
1242 			} else if (ntohl(response->result)
1243 			    == MXGEFW_CMD_ERROR_UNALIGNED) {
1244 				mutex_exit(&mgp->cmd_lock);
1245 				return (E2BIG);
1246 			} else {
1247 				cmn_err(CE_WARN,
1248 				    "%s: command %d failed, result = %d\n",
1249 				    mgp->name, cmd, ntohl(response->result));
1250 				mutex_exit(&mgp->cmd_lock);
1251 				return (ENXIO);
1252 			}
1253 		}
1254 		drv_usecwait(1000);
1255 	}
1256 	mutex_exit(&mgp->cmd_lock);
1257 	cmn_err(CE_WARN, "%s: command %d timed out, result = %d\n",
1258 	    mgp->name, cmd, ntohl(response->result));
1259 	return (EAGAIN);
1260 }
1261 
1262 /*
1263  * Enable or disable periodic RDMAs from the host to make certain
1264  * chipsets resend dropped PCIe messages
1265  */
1266 
1267 static void
1268 myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
1269 {
1270 	char buf_bytes[72];
1271 	volatile uint32_t *confirm;
1272 	volatile char *submit;
1273 	uint32_t *buf;
1274 	int i;
1275 
1276 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1277 
1278 	/* clear confirmation addr */
1279 	confirm = (volatile uint32_t *)mgp->cmd;
1280 	*confirm = 0;
1281 	mb();
1282 
1283 	/*
1284 	 * send an rdma command to the PCIe engine, and wait for the
1285 	 * response in the confirmation address.  The firmware should
1286 	 *  write a -1 there to indicate it is alive and well
1287 	 */
1288 
1289 	buf[0] = mgp->cmd_dma.high;		/* confirm addr MSW */
1290 	buf[1] = mgp->cmd_dma.low;		/* confirm addr LSW */
1291 	buf[2] = htonl(0xffffffff);		/* confirm data */
1292 	buf[3] = htonl(mgp->cmd_dma.high);	/* dummy addr MSW */
1293 	buf[4] = htonl(mgp->cmd_dma.low);	/* dummy addr LSW */
1294 	buf[5] = htonl(enable);			/* enable? */
1295 
1296 
1297 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_DUMMY_RDMA);
1298 
1299 	myri10ge_pio_copy((char *)submit, buf, 64);
1300 	mb();
1301 	drv_usecwait(1000);
1302 	mb();
1303 	i = 0;
1304 	while (*confirm != 0xffffffff && i < 20) {
1305 		drv_usecwait(1000);
1306 		i++;
1307 	}
1308 	if (*confirm != 0xffffffff) {
1309 		cmn_err(CE_WARN, "%s: dummy rdma %s failed (%p = 0x%x)",
1310 		    mgp->name,
1311 		    (enable ? "enable" : "disable"), (void*) confirm, *confirm);
1312 	}
1313 }
1314 
1315 static int
1316 myri10ge_load_firmware(struct myri10ge_priv *mgp)
1317 {
1318 	myri10ge_cmd_t cmd;
1319 	volatile uint32_t *confirm;
1320 	volatile char *submit;
1321 	char buf_bytes[72];
1322 	uint32_t *buf, size;
1323 	int status, i;
1324 
1325 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1326 
1327 	status = myri10ge_load_firmware_from_zlib(mgp, &size);
1328 	if (status) {
1329 		cmn_err(CE_WARN, "%s: firmware loading failed\n", mgp->name);
1330 		return (status);
1331 	}
1332 
1333 	/* clear confirmation addr */
1334 	confirm = (volatile uint32_t *)mgp->cmd;
1335 	*confirm = 0;
1336 	mb();
1337 
1338 	/*
1339 	 * send a reload command to the bootstrap MCP, and wait for the
1340 	 * response in the confirmation address.  The firmware should
1341 	 * write a -1 there to indicate it is alive and well
1342 	 */
1343 
1344 	buf[0] = mgp->cmd_dma.high;	/* confirm addr MSW */
1345 	buf[1] = mgp->cmd_dma.low;	/* confirm addr LSW */
1346 	buf[2] = htonl(0xffffffff);	/* confirm data */
1347 
1348 	/*
1349 	 * FIX: All newest firmware should un-protect the bottom of
1350 	 * the sram before handoff. However, the very first interfaces
1351 	 * do not. Therefore the handoff copy must skip the first 8 bytes
1352 	 */
1353 	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8); /* where the code starts */
1354 	buf[4] = htonl(size - 8);	/* length of code */
1355 	buf[5] = htonl(8);		/* where to copy to */
1356 	buf[6] = htonl(0);		/* where to jump to */
1357 
1358 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_HANDOFF);
1359 
1360 	myri10ge_pio_copy((char *)submit, buf, 64);
1361 	mb();
1362 	drv_usecwait(1000);
1363 	mb();
1364 	i = 0;
1365 	while (*confirm != 0xffffffff && i < 1000) {
1366 		drv_usecwait(1000);
1367 		i++;
1368 	}
1369 	if (*confirm != 0xffffffff) {
1370 		cmn_err(CE_WARN, "%s: handoff failed (%p = 0x%x)",
1371 		    mgp->name, (void *) confirm, *confirm);
1372 
1373 		return (ENXIO);
1374 	}
1375 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1376 	if (status != 0) {
1377 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_GET_RX_RING_SIZE\n",
1378 		    mgp->name);
1379 		return (ENXIO);
1380 	}
1381 
1382 	mgp->max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
1383 	myri10ge_dummy_rdma(mgp, 1);
1384 	return (0);
1385 }
1386 
1387 static int
1388 myri10ge_m_unicst(void *arg, const uint8_t *addr)
1389 {
1390 	struct myri10ge_priv *mgp = arg;
1391 	myri10ge_cmd_t cmd;
1392 	int status;
1393 
1394 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1395 	    | (addr[2] << 8) | addr[3]);
1396 
1397 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1398 
1399 	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd);
1400 	if (status == 0 && (addr != mgp->mac_addr))
1401 		(void) memcpy(mgp->mac_addr, addr, sizeof (mgp->mac_addr));
1402 
1403 	return (status);
1404 }
1405 
1406 static int
1407 myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
1408 {
1409 	myri10ge_cmd_t cmd;
1410 	int status;
1411 
1412 	if (pause)
1413 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_FLOW_CONTROL,
1414 		    &cmd);
1415 	else
1416 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_FLOW_CONTROL,
1417 		    &cmd);
1418 
1419 	if (status) {
1420 		cmn_err(CE_WARN, "%s: Failed to set flow control mode\n",
1421 		    mgp->name);
1422 		return (ENXIO);
1423 	}
1424 	mgp->pause = pause;
1425 	return (0);
1426 }
1427 
1428 static void
1429 myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc)
1430 {
1431 	myri10ge_cmd_t cmd;
1432 	int status;
1433 
1434 	if (promisc)
1435 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_PROMISC, &cmd);
1436 	else
1437 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_PROMISC, &cmd);
1438 
1439 	if (status) {
1440 		cmn_err(CE_WARN, "%s: Failed to set promisc mode\n",
1441 		    mgp->name);
1442 	}
1443 }
1444 
1445 static int
1446 myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
1447 {
1448 	myri10ge_cmd_t cmd;
1449 	int status;
1450 	uint32_t len;
1451 	void *dmabench;
1452 	struct myri10ge_dma_stuff dmabench_dma;
1453 	char *test = " ";
1454 
1455 	/*
1456 	 * Run a small DMA test.
1457 	 * The magic multipliers to the length tell the firmware
1458 	 * tp do DMA read, write, or read+write tests.  The
1459 	 * results are returned in cmd.data0.  The upper 16
1460 	 * bits or the return is the number of transfers completed.
1461 	 * The lower 16 bits is the time in 0.5us ticks that the
1462 	 * transfers took to complete
1463 	 */
1464 
1465 	len = mgp->tx_boundary;
1466 
1467 	dmabench = myri10ge_dma_alloc(mgp->dip, len,
1468 	    &myri10ge_rx_jumbo_dma_attr, &myri10ge_dev_access_attr,
1469 	    DDI_DMA_STREAMING,  DDI_DMA_RDWR|DDI_DMA_STREAMING,
1470 	    &dmabench_dma, 1, DDI_DMA_DONTWAIT);
1471 	mgp->read_dma = mgp->write_dma = mgp->read_write_dma = 0;
1472 	if (dmabench == NULL) {
1473 		cmn_err(CE_WARN, "%s dma benchmark aborted\n", mgp->name);
1474 		return (ENOMEM);
1475 	}
1476 
1477 	cmd.data0 = ntohl(dmabench_dma.low);
1478 	cmd.data1 = ntohl(dmabench_dma.high);
1479 	cmd.data2 = len * 0x10000;
1480 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1481 	if (status != 0) {
1482 		test = "read";
1483 		goto abort;
1484 	}
1485 	mgp->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1486 
1487 	cmd.data0 = ntohl(dmabench_dma.low);
1488 	cmd.data1 = ntohl(dmabench_dma.high);
1489 	cmd.data2 = len * 0x1;
1490 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1491 	if (status != 0) {
1492 		test = "write";
1493 		goto abort;
1494 	}
1495 	mgp->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1496 
1497 	cmd.data0 = ntohl(dmabench_dma.low);
1498 	cmd.data1 = ntohl(dmabench_dma.high);
1499 	cmd.data2 = len * 0x10001;
1500 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1501 	if (status != 0) {
1502 		test = "read/write";
1503 		goto abort;
1504 	}
1505 	mgp->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
1506 	    (cmd.data0 & 0xffff);
1507 
1508 
1509 abort:
1510 	myri10ge_dma_free(&dmabench_dma);
1511 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
1512 		cmn_err(CE_WARN, "%s %s dma benchmark failed\n", mgp->name,
1513 		    test);
1514 	return (status);
1515 }
1516 
1517 static int
1518 myri10ge_reset(struct myri10ge_priv *mgp)
1519 {
1520 	myri10ge_cmd_t cmd;
1521 	struct myri10ge_nic_stat *ethstat;
1522 	struct myri10ge_slice_state *ss;
1523 	int i, status;
1524 	size_t bytes;
1525 
1526 	/* send a reset command to the card to see if it is alive */
1527 	(void) memset(&cmd, 0, sizeof (cmd));
1528 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
1529 	if (status != 0) {
1530 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
1531 		return (ENXIO);
1532 	}
1533 
1534 	/* Now exchange information about interrupts  */
1535 
1536 	bytes = mgp->max_intr_slots * sizeof (*mgp->ss[0].rx_done.entry);
1537 	cmd.data0 = (uint32_t)bytes;
1538 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1539 
1540 	/*
1541 	 * Even though we already know how many slices are supported
1542 	 * via myri10ge_probe_slices() MXGEFW_CMD_GET_MAX_RSS_QUEUES
1543 	 * has magic side effects, and must be called after a reset.
1544 	 * It must be called prior to calling any RSS related cmds,
1545 	 * including assigning an interrupt queue for anything but
1546 	 * slice 0.  It must also be called *after*
1547 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1548 	 * the firmware to compute offsets.
1549 	 */
1550 
1551 	if (mgp->num_slices > 1) {
1552 
1553 		/* ask the maximum number of slices it supports */
1554 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1555 		    &cmd);
1556 		if (status != 0) {
1557 			cmn_err(CE_WARN,
1558 			    "%s: failed to get number of slices\n",
1559 			    mgp->name);
1560 			return (status);
1561 		}
1562 
1563 		/*
1564 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1565 		 * to setting up the interrupt queue DMA
1566 		 */
1567 
1568 		cmd.data0 = mgp->num_slices;
1569 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE |
1570 		    MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1571 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1572 		    &cmd);
1573 		if (status != 0) {
1574 			cmn_err(CE_WARN,
1575 			    "%s: failed to set number of slices\n",
1576 			    mgp->name);
1577 			return (status);
1578 		}
1579 	}
1580 	for (i = 0; i < mgp->num_slices; i++) {
1581 		ss = &mgp->ss[i];
1582 		cmd.data0 = ntohl(ss->rx_done.dma.low);
1583 		cmd.data1 = ntohl(ss->rx_done.dma.high);
1584 		cmd.data2 = i;
1585 		status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA,
1586 		    &cmd);
1587 	};
1588 
1589 	status |= myri10ge_send_cmd(mgp,  MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1590 	for (i = 0; i < mgp->num_slices; i++) {
1591 		ss = &mgp->ss[i];
1592 		ss->irq_claim = (volatile unsigned int *)
1593 		    (void *)(mgp->sram + cmd.data0 + 8 * i);
1594 	}
1595 
1596 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
1597 		status |= myri10ge_send_cmd(mgp,
1598 		    MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1599 		mgp->irq_deassert = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1600 	}
1601 
1602 	status |= myri10ge_send_cmd(mgp,
1603 	    MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1604 	mgp->intr_coal_delay_ptr = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1605 
1606 	if (status != 0) {
1607 		cmn_err(CE_WARN, "%s: failed set interrupt parameters\n",
1608 		    mgp->name);
1609 		return (status);
1610 	}
1611 
1612 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
1613 	(void) myri10ge_dma_test(mgp, MXGEFW_DMA_TEST);
1614 
1615 	/* reset mcp/driver shared state back to 0 */
1616 
1617 	for (i = 0; i < mgp->num_slices; i++) {
1618 		ss = &mgp->ss[i];
1619 		bytes = mgp->max_intr_slots *
1620 		    sizeof (*mgp->ss[0].rx_done.entry);
1621 		(void) memset(ss->rx_done.entry, 0, bytes);
1622 		ss->tx.req = 0;
1623 		ss->tx.done = 0;
1624 		ss->tx.pkt_done = 0;
1625 		ss->rx_big.cnt = 0;
1626 		ss->rx_small.cnt = 0;
1627 		ss->rx_done.idx = 0;
1628 		ss->rx_done.cnt = 0;
1629 		ss->rx_token = 0;
1630 		ss->tx.watchdog_done = 0;
1631 		ss->tx.watchdog_req = 0;
1632 		ss->tx.active = 0;
1633 		ss->tx.activate = 0;
1634 	}
1635 	mgp->watchdog_rx_pause = 0;
1636 	if (mgp->ksp_stat != NULL) {
1637 		ethstat = (struct myri10ge_nic_stat *)mgp->ksp_stat->ks_data;
1638 		ethstat->link_changes.value.ul = 0;
1639 	}
1640 	status = myri10ge_m_unicst(mgp, mgp->mac_addr);
1641 	myri10ge_change_promisc(mgp, 0);
1642 	(void) myri10ge_change_pause(mgp, mgp->pause);
1643 	return (status);
1644 }
1645 
1646 static int
1647 myri10ge_init_toeplitz(struct myri10ge_priv *mgp)
1648 {
1649 	myri10ge_cmd_t cmd;
1650 	int i, b, s, t, j;
1651 	int status;
1652 	uint32_t k[8];
1653 	uint32_t tmp;
1654 	uint8_t *key;
1655 
1656 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
1657 	    &cmd);
1658 	if (status != 0) {
1659 		cmn_err(CE_WARN, "%s: failed to get rss key\n",
1660 		    mgp->name);
1661 		return (EIO);
1662 	}
1663 	myri10ge_pio_copy32(mgp->rss_key,
1664 	    (uint32_t *)(void*)((char *)mgp->sram + cmd.data0),
1665 	    sizeof (mgp->rss_key));
1666 
1667 	mgp->toeplitz_hash_table = kmem_alloc(sizeof (uint32_t) * 12 * 256,
1668 	    KM_SLEEP);
1669 	key = (uint8_t *)mgp->rss_key;
1670 	t = 0;
1671 	for (b = 0; b < 12; b++) {
1672 		for (s = 0; s < 8; s++) {
1673 			/* Bits: b*8+s, ..., b*8+s+31 */
1674 			k[s] = 0;
1675 			for (j = 0; j < 32; j++) {
1676 				int bit = b*8+s+j;
1677 				bit = 0x1 & (key[bit / 8] >> (7 -(bit & 0x7)));
1678 				k[s] |= bit << (31 - j);
1679 			}
1680 		}
1681 
1682 		for (i = 0; i <= 0xff; i++) {
1683 			tmp = 0;
1684 			if (i & (1 << 7)) { tmp ^= k[0]; }
1685 			if (i & (1 << 6)) { tmp ^= k[1]; }
1686 			if (i & (1 << 5)) { tmp ^= k[2]; }
1687 			if (i & (1 << 4)) { tmp ^= k[3]; }
1688 			if (i & (1 << 3)) { tmp ^= k[4]; }
1689 			if (i & (1 << 2)) { tmp ^= k[5]; }
1690 			if (i & (1 << 1)) { tmp ^= k[6]; }
1691 			if (i & (1 << 0)) { tmp ^= k[7]; }
1692 			mgp->toeplitz_hash_table[t++] = tmp;
1693 		}
1694 	}
1695 	return (0);
1696 }
1697 
1698 static inline struct myri10ge_slice_state *
1699 myri10ge_toeplitz_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1700 {
1701 	struct tcphdr *hdr;
1702 	uint32_t saddr, daddr;
1703 	uint32_t hash, slice;
1704 	uint32_t *table = mgp->toeplitz_hash_table;
1705 	uint16_t src, dst;
1706 
1707 	/*
1708 	 * Note hashing order is reversed from how it is done
1709 	 * in the NIC, so as to generate the same hash value
1710 	 * for the connection to try to keep connections CPU local
1711 	 */
1712 
1713 	/* hash on IPv4 src/dst address */
1714 	saddr = ntohl(ip->ip_src.s_addr);
1715 	daddr = ntohl(ip->ip_dst.s_addr);
1716 	hash = table[(256 * 0) + ((daddr >> 24) & 0xff)];
1717 	hash ^= table[(256 * 1) + ((daddr >> 16) & 0xff)];
1718 	hash ^= table[(256 * 2) + ((daddr >> 8) & 0xff)];
1719 	hash ^= table[(256 * 3) + ((daddr) & 0xff)];
1720 	hash ^= table[(256 * 4) + ((saddr >> 24) & 0xff)];
1721 	hash ^= table[(256 * 5) + ((saddr >> 16) & 0xff)];
1722 	hash ^= table[(256 * 6) + ((saddr >> 8) & 0xff)];
1723 	hash ^= table[(256 * 7) + ((saddr) & 0xff)];
1724 	/* hash on TCP port, if required */
1725 	if ((myri10ge_rss_hash & MXGEFW_RSS_HASH_TYPE_TCP_IPV4) &&
1726 	    ip->ip_p == IPPROTO_TCP) {
1727 		hdr = (struct tcphdr *)(void *)
1728 		    (((uint8_t *)ip) +  (ip->ip_hl << 2));
1729 		src = ntohs(hdr->th_sport);
1730 		dst = ntohs(hdr->th_dport);
1731 
1732 		hash ^= table[(256 * 8) + ((dst >> 8) & 0xff)];
1733 		hash ^= table[(256 * 9) + ((dst) & 0xff)];
1734 		hash ^= table[(256 * 10) + ((src >> 8) & 0xff)];
1735 		hash ^= table[(256 * 11) + ((src) & 0xff)];
1736 	}
1737 	slice = (mgp->num_slices - 1) & hash;
1738 	return (&mgp->ss[slice]);
1739 
1740 }
1741 
1742 static inline struct myri10ge_slice_state *
1743 myri10ge_simple_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1744 {
1745 	struct tcphdr *hdr;
1746 	uint32_t slice, hash_val;
1747 
1748 
1749 	if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) {
1750 		return (&mgp->ss[0]);
1751 	}
1752 	hdr = (struct tcphdr *)(void *)(((uint8_t *)ip) +  (ip->ip_hl << 2));
1753 
1754 	/*
1755 	 * Use the second byte of the *destination* address for
1756 	 * MXGEFW_RSS_HASH_TYPE_SRC_PORT, so as to match NIC's hashing
1757 	 */
1758 	hash_val = ntohs(hdr->th_dport) & 0xff;
1759 	if (myri10ge_rss_hash == MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT)
1760 		hash_val += ntohs(hdr->th_sport) & 0xff;
1761 
1762 	slice = (mgp->num_slices - 1) & hash_val;
1763 	return (&mgp->ss[slice]);
1764 }
1765 
1766 static inline struct myri10ge_slice_state *
1767 myri10ge_send_hash(struct myri10ge_priv *mgp, mblk_t *mp)
1768 {
1769 	unsigned int slice = 0;
1770 	struct ether_header *eh;
1771 	struct ether_vlan_header *vh;
1772 	struct ip *ip;
1773 	int ehl, ihl;
1774 
1775 	if (mgp->num_slices == 1)
1776 		return (&mgp->ss[0]);
1777 
1778 	if (myri10ge_tx_hash == 0) {
1779 		slice = CPU->cpu_id & (mgp->num_slices - 1);
1780 		return (&mgp->ss[slice]);
1781 	}
1782 
1783 	/*
1784 	 *  ensure it is a TCP or UDP over IPv4 packet, and that the
1785 	 *  headers are in the 1st mblk.  Otherwise, punt
1786 	 */
1787 	ehl = sizeof (*eh);
1788 	ihl = sizeof (*ip);
1789 	if ((MBLKL(mp)) <  (ehl + ihl + 8))
1790 		return (&mgp->ss[0]);
1791 	eh = (struct ether_header *)(void *)mp->b_rptr;
1792 	ip = (struct ip *)(void *)(eh + 1);
1793 	if (eh->ether_type != BE_16(ETHERTYPE_IP)) {
1794 		if (eh->ether_type != BE_16(ETHERTYPE_VLAN))
1795 			return (&mgp->ss[0]);
1796 		vh = (struct ether_vlan_header *)(void *)mp->b_rptr;
1797 		if (vh->ether_type != BE_16(ETHERTYPE_IP))
1798 			return (&mgp->ss[0]);
1799 		ehl += 4;
1800 		ip = (struct ip *)(void *)(vh + 1);
1801 	}
1802 	ihl = ip->ip_hl << 2;
1803 	if (MBLKL(mp) <  (ehl + ihl + 8))
1804 		return (&mgp->ss[0]);
1805 	switch (myri10ge_rss_hash) {
1806 	case MXGEFW_RSS_HASH_TYPE_IPV4:
1807 		/* fallthru */
1808 	case MXGEFW_RSS_HASH_TYPE_TCP_IPV4:
1809 		/* fallthru */
1810 	case (MXGEFW_RSS_HASH_TYPE_IPV4|MXGEFW_RSS_HASH_TYPE_TCP_IPV4):
1811 		return (myri10ge_toeplitz_send_hash(mgp, ip));
1812 	case MXGEFW_RSS_HASH_TYPE_SRC_PORT:
1813 		/* fallthru */
1814 	case MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT:
1815 		return (myri10ge_simple_send_hash(mgp, ip));
1816 	default:
1817 		break;
1818 	}
1819 	return (&mgp->ss[0]);
1820 }
1821 
1822 static int
1823 myri10ge_setup_slice(struct myri10ge_slice_state *ss)
1824 {
1825 	struct myri10ge_priv *mgp = ss->mgp;
1826 	myri10ge_cmd_t cmd;
1827 	int tx_ring_size, rx_ring_size;
1828 	int tx_ring_entries, rx_ring_entries;
1829 	int slice, status;
1830 	int allocated, idx;
1831 	size_t bytes;
1832 
1833 	slice = ss - mgp->ss;
1834 	cmd.data0 = slice;
1835 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1836 	tx_ring_size = cmd.data0;
1837 	cmd.data0 = slice;
1838 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1839 	if (status != 0)
1840 		return (status);
1841 	rx_ring_size = cmd.data0;
1842 
1843 	tx_ring_entries = tx_ring_size / sizeof (struct mcp_kreq_ether_send);
1844 	rx_ring_entries = rx_ring_size / sizeof (struct mcp_dma_addr);
1845 	ss->tx.mask = tx_ring_entries - 1;
1846 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
1847 
1848 	/* get the lanai pointers to the send and receive rings */
1849 
1850 	cmd.data0 = slice;
1851 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
1852 	ss->tx.lanai = (mcp_kreq_ether_send_t *)(void *)(mgp->sram + cmd.data0);
1853 	if (mgp->num_slices > 1) {
1854 		ss->tx.go = (char *)mgp->sram + MXGEFW_ETH_SEND_GO + 64 * slice;
1855 		ss->tx.stop = (char *)mgp->sram + MXGEFW_ETH_SEND_STOP +
1856 		    64 * slice;
1857 	} else {
1858 		ss->tx.go = NULL;
1859 		ss->tx.stop = NULL;
1860 	}
1861 
1862 	cmd.data0 = slice;
1863 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
1864 	ss->rx_small.lanai = (mcp_kreq_ether_recv_t *)
1865 	    (void *)(mgp->sram + cmd.data0);
1866 
1867 	cmd.data0 = slice;
1868 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
1869 	ss->rx_big.lanai = (mcp_kreq_ether_recv_t *)(void *)
1870 	    (mgp->sram + cmd.data0);
1871 
1872 	if (status != 0) {
1873 		cmn_err(CE_WARN,
1874 		    "%s: failed to get ring sizes or locations\n", mgp->name);
1875 		return (status);
1876 	}
1877 
1878 	status = ENOMEM;
1879 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
1880 	ss->rx_small.shadow = kmem_zalloc(bytes, KM_SLEEP);
1881 	if (ss->rx_small.shadow == NULL)
1882 		goto abort;
1883 	(void) memset(ss->rx_small.shadow, 0, bytes);
1884 
1885 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
1886 	ss->rx_big.shadow = kmem_zalloc(bytes, KM_SLEEP);
1887 	if (ss->rx_big.shadow == NULL)
1888 		goto abort_with_rx_small_shadow;
1889 	(void) memset(ss->rx_big.shadow, 0, bytes);
1890 
1891 	/* allocate the host info rings */
1892 
1893 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
1894 	ss->tx.info = kmem_zalloc(bytes, KM_SLEEP);
1895 	if (ss->tx.info == NULL)
1896 		goto abort_with_rx_big_shadow;
1897 	(void) memset(ss->tx.info, 0, bytes);
1898 
1899 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1900 	ss->rx_small.info = kmem_zalloc(bytes, KM_SLEEP);
1901 	if (ss->rx_small.info == NULL)
1902 		goto abort_with_tx_info;
1903 	(void) memset(ss->rx_small.info, 0, bytes);
1904 
1905 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1906 	ss->rx_big.info = kmem_zalloc(bytes, KM_SLEEP);
1907 	if (ss->rx_big.info == NULL)
1908 		goto abort_with_rx_small_info;
1909 	(void) memset(ss->rx_big.info, 0, bytes);
1910 
1911 	ss->tx.stall = ss->tx.sched = 0;
1912 	ss->tx.stall_early = ss->tx.stall_late = 0;
1913 
1914 	ss->jbufs_for_smalls = 1 + (1 + ss->rx_small.mask) /
1915 	    (myri10ge_mtu / (myri10ge_small_bytes + MXGEFW_PAD));
1916 
1917 	allocated = myri10ge_add_jbufs(ss,
1918 	    myri10ge_bigbufs_initial + ss->jbufs_for_smalls, 1);
1919 	if (allocated < ss->jbufs_for_smalls + myri10ge_bigbufs_initial) {
1920 		cmn_err(CE_WARN,
1921 		    "%s: Could not allocate enough receive buffers (%d/%d)\n",
1922 		    mgp->name, allocated,
1923 		    myri10ge_bigbufs_initial + ss->jbufs_for_smalls);
1924 		goto abort_with_jumbos;
1925 	}
1926 
1927 	myri10ge_carve_up_jbufs_into_small_ring(ss);
1928 	ss->j_rx_cnt = 0;
1929 
1930 	mutex_enter(&ss->jpool.mtx);
1931 	if (allocated < rx_ring_entries)
1932 		ss->jpool.low_water = allocated / 4;
1933 	else
1934 		ss->jpool.low_water = rx_ring_entries / 2;
1935 
1936 	/*
1937 	 * invalidate the big receive ring in case we do not
1938 	 * allocate sufficient jumbos to fill it
1939 	 */
1940 	(void) memset(ss->rx_big.shadow, 1,
1941 	    (ss->rx_big.mask + 1) * sizeof (ss->rx_big.shadow[0]));
1942 	for (idx = 7; idx <= ss->rx_big.mask; idx += 8) {
1943 		myri10ge_submit_8rx(&ss->rx_big.lanai[idx - 7],
1944 		    &ss->rx_big.shadow[idx - 7]);
1945 		mb();
1946 	}
1947 
1948 
1949 	myri10ge_restock_jumbos(ss);
1950 
1951 	for (idx = 7; idx <= ss->rx_small.mask; idx += 8) {
1952 		myri10ge_submit_8rx(&ss->rx_small.lanai[idx - 7],
1953 		    &ss->rx_small.shadow[idx - 7]);
1954 		mb();
1955 	}
1956 	ss->rx_small.cnt = ss->rx_small.mask + 1;
1957 
1958 	mutex_exit(&ss->jpool.mtx);
1959 
1960 	status = myri10ge_prepare_tx_ring(ss);
1961 
1962 	if (status != 0)
1963 		goto abort_with_small_jbufs;
1964 
1965 	cmd.data0 = ntohl(ss->fw_stats_dma.low);
1966 	cmd.data1 = ntohl(ss->fw_stats_dma.high);
1967 	cmd.data2 = sizeof (mcp_irq_data_t);
1968 	cmd.data2 |= (slice << 16);
1969 	bzero(ss->fw_stats, sizeof (*ss->fw_stats));
1970 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
1971 	if (status == ENOSYS) {
1972 		cmd.data0 = ntohl(ss->fw_stats_dma.low) +
1973 		    offsetof(mcp_irq_data_t, send_done_count);
1974 		cmd.data1 = ntohl(ss->fw_stats_dma.high);
1975 		status = myri10ge_send_cmd(mgp,
1976 		    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd);
1977 	}
1978 	if (status) {
1979 		cmn_err(CE_WARN, "%s: Couldn't set stats DMA\n", mgp->name);
1980 		goto abort_with_tx;
1981 	}
1982 
1983 	return (0);
1984 
1985 abort_with_tx:
1986 	myri10ge_unprepare_tx_ring(ss);
1987 
1988 abort_with_small_jbufs:
1989 	myri10ge_release_small_jbufs(ss);
1990 
1991 abort_with_jumbos:
1992 	if (allocated != 0) {
1993 		mutex_enter(&ss->jpool.mtx);
1994 		ss->jpool.low_water = 0;
1995 		mutex_exit(&ss->jpool.mtx);
1996 		myri10ge_unstock_jumbos(ss);
1997 		myri10ge_remove_jbufs(ss);
1998 	}
1999 
2000 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2001 	kmem_free(ss->rx_big.info, bytes);
2002 
2003 abort_with_rx_small_info:
2004 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2005 	kmem_free(ss->rx_small.info, bytes);
2006 
2007 abort_with_tx_info:
2008 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2009 	kmem_free(ss->tx.info, bytes);
2010 
2011 abort_with_rx_big_shadow:
2012 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2013 	kmem_free(ss->rx_big.shadow, bytes);
2014 
2015 abort_with_rx_small_shadow:
2016 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2017 	kmem_free(ss->rx_small.shadow, bytes);
2018 abort:
2019 	return (status);
2020 
2021 }
2022 
2023 static void
2024 myri10ge_teardown_slice(struct myri10ge_slice_state *ss)
2025 {
2026 	int tx_ring_entries, rx_ring_entries;
2027 	size_t bytes;
2028 
2029 	/* ignore slices that have not been fully setup */
2030 	if (ss->tx.cp == NULL)
2031 		return;
2032 	/* Free the TX copy buffers */
2033 	myri10ge_unprepare_tx_ring(ss);
2034 
2035 	/* stop passing returned buffers to firmware */
2036 
2037 	mutex_enter(&ss->jpool.mtx);
2038 	ss->jpool.low_water = 0;
2039 	mutex_exit(&ss->jpool.mtx);
2040 	myri10ge_release_small_jbufs(ss);
2041 
2042 	/* Release the free jumbo frame pool */
2043 	myri10ge_unstock_jumbos(ss);
2044 	myri10ge_remove_jbufs(ss);
2045 
2046 	rx_ring_entries = ss->rx_big.mask + 1;
2047 	tx_ring_entries = ss->tx.mask + 1;
2048 
2049 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2050 	kmem_free(ss->rx_big.info, bytes);
2051 
2052 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2053 	kmem_free(ss->rx_small.info, bytes);
2054 
2055 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2056 	kmem_free(ss->tx.info, bytes);
2057 
2058 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2059 	kmem_free(ss->rx_big.shadow, bytes);
2060 
2061 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2062 	kmem_free(ss->rx_small.shadow, bytes);
2063 
2064 }
2065 static int
2066 myri10ge_start_locked(struct myri10ge_priv *mgp)
2067 {
2068 	myri10ge_cmd_t cmd;
2069 	int status, big_pow2, i;
2070 	volatile uint8_t *itable;
2071 
2072 	status = DDI_SUCCESS;
2073 	/* Allocate DMA resources and receive buffers */
2074 
2075 	status = myri10ge_reset(mgp);
2076 	if (status != 0) {
2077 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
2078 		return (DDI_FAILURE);
2079 	}
2080 
2081 	if (mgp->num_slices > 1) {
2082 		cmd.data0 = mgp->num_slices;
2083 		cmd.data1 = 1; /* use MSI-X */
2084 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
2085 		    &cmd);
2086 		if (status != 0) {
2087 			cmn_err(CE_WARN,
2088 			    "%s: failed to set number of slices\n",
2089 			    mgp->name);
2090 			goto abort_with_nothing;
2091 		}
2092 		/* setup the indirection table */
2093 		cmd.data0 = mgp->num_slices;
2094 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
2095 		    &cmd);
2096 
2097 		status |= myri10ge_send_cmd(mgp,
2098 		    MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
2099 		if (status != 0) {
2100 			cmn_err(CE_WARN,
2101 			    "%s: failed to setup rss tables\n", mgp->name);
2102 		}
2103 
2104 		/* just enable an identity mapping */
2105 		itable = mgp->sram + cmd.data0;
2106 		for (i = 0; i < mgp->num_slices; i++)
2107 			itable[i] = (uint8_t)i;
2108 
2109 		if (myri10ge_rss_hash & MYRI10GE_TOEPLITZ_HASH) {
2110 			status = myri10ge_init_toeplitz(mgp);
2111 			if (status != 0) {
2112 				cmn_err(CE_WARN, "%s: failed to setup "
2113 				    "toeplitz tx hash table", mgp->name);
2114 				goto abort_with_nothing;
2115 			}
2116 		}
2117 		cmd.data0 = 1;
2118 		cmd.data1 = myri10ge_rss_hash;
2119 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_ENABLE,
2120 		    &cmd);
2121 		if (status != 0) {
2122 			cmn_err(CE_WARN,
2123 			    "%s: failed to enable slices\n", mgp->name);
2124 			goto abort_with_toeplitz;
2125 		}
2126 	}
2127 
2128 	for (i = 0; i < mgp->num_slices; i++) {
2129 		status = myri10ge_setup_slice(&mgp->ss[i]);
2130 		if (status != 0)
2131 			goto abort_with_slices;
2132 	}
2133 
2134 	/*
2135 	 * Tell the MCP how many buffers it has, and to
2136 	 *  bring the ethernet interface up
2137 	 *
2138 	 * Firmware needs the big buff size as a power of 2.  Lie and
2139 	 * tell it the buffer is larger, because we only use 1
2140 	 * buffer/pkt, and the mtu will prevent overruns
2141 	 */
2142 	big_pow2 = myri10ge_mtu + MXGEFW_PAD;
2143 	while (!ISP2(big_pow2))
2144 		big_pow2++;
2145 
2146 	/* now give firmware buffers sizes, and MTU */
2147 	cmd.data0 = myri10ge_mtu;
2148 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd);
2149 	cmd.data0 = myri10ge_small_bytes;
2150 	status |=
2151 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
2152 	cmd.data0 = big_pow2;
2153 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2154 	if (status) {
2155 		cmn_err(CE_WARN, "%s: Couldn't set buffer sizes\n", mgp->name);
2156 		goto abort_with_slices;
2157 	}
2158 
2159 
2160 	cmd.data0 = 1;
2161 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_TSO_MODE, &cmd);
2162 	if (status) {
2163 		cmn_err(CE_WARN, "%s: unable to setup TSO (%d)\n",
2164 		    mgp->name, status);
2165 	} else {
2166 		mgp->features |= MYRI10GE_TSO;
2167 	}
2168 
2169 	mgp->link_state = -1;
2170 	mgp->rdma_tags_available = 15;
2171 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd);
2172 	if (status) {
2173 		cmn_err(CE_WARN, "%s: unable to start ethernet\n", mgp->name);
2174 		goto abort_with_slices;
2175 	}
2176 	mgp->running = MYRI10GE_ETH_RUNNING;
2177 	return (DDI_SUCCESS);
2178 
2179 abort_with_slices:
2180 	for (i = 0; i < mgp->num_slices; i++)
2181 		myri10ge_teardown_slice(&mgp->ss[i]);
2182 
2183 	mgp->running = MYRI10GE_ETH_STOPPED;
2184 
2185 abort_with_toeplitz:
2186 	if (mgp->toeplitz_hash_table != NULL) {
2187 		kmem_free(mgp->toeplitz_hash_table,
2188 		    sizeof (uint32_t) * 12 * 256);
2189 		mgp->toeplitz_hash_table = NULL;
2190 	}
2191 
2192 abort_with_nothing:
2193 	return (DDI_FAILURE);
2194 }
2195 
2196 static void
2197 myri10ge_stop_locked(struct myri10ge_priv *mgp)
2198 {
2199 	int status, old_down_cnt;
2200 	myri10ge_cmd_t cmd;
2201 	int wait_time = 10;
2202 	int i, polling;
2203 
2204 	old_down_cnt = mgp->down_cnt;
2205 	mb();
2206 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2207 	if (status) {
2208 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
2209 	}
2210 
2211 	while (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2212 		delay(1 * drv_usectohz(1000000));
2213 		wait_time--;
2214 		if (wait_time == 0)
2215 			break;
2216 	}
2217 again:
2218 	if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2219 		cmn_err(CE_WARN, "%s: didn't get down irq\n", mgp->name);
2220 		for (i = 0; i < mgp->num_slices; i++) {
2221 			/*
2222 			 * take and release the rx lock to ensure
2223 			 * that no interrupt thread is blocked
2224 			 * elsewhere in the stack, preventing
2225 			 * completion
2226 			 */
2227 
2228 			mutex_enter(&mgp->ss[i].rx_lock);
2229 			printf("%s: slice %d rx irq idle\n",
2230 			    mgp->name, i);
2231 			mutex_exit(&mgp->ss[i].rx_lock);
2232 
2233 			/* verify that the poll handler is inactive */
2234 			mutex_enter(&mgp->ss->poll_lock);
2235 			polling = mgp->ss->rx_polling;
2236 			mutex_exit(&mgp->ss->poll_lock);
2237 			if (polling) {
2238 				printf("%s: slice %d is polling\n",
2239 				    mgp->name, i);
2240 				delay(1 * drv_usectohz(1000000));
2241 				goto again;
2242 			}
2243 		}
2244 		delay(1 * drv_usectohz(1000000));
2245 		if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2246 			cmn_err(CE_WARN, "%s: Never got down irq\n", mgp->name);
2247 		}
2248 	}
2249 
2250 	for (i = 0; i < mgp->num_slices; i++)
2251 		myri10ge_teardown_slice(&mgp->ss[i]);
2252 
2253 	if (mgp->toeplitz_hash_table != NULL) {
2254 		kmem_free(mgp->toeplitz_hash_table,
2255 		    sizeof (uint32_t) * 12 * 256);
2256 		mgp->toeplitz_hash_table = NULL;
2257 	}
2258 	mgp->running = MYRI10GE_ETH_STOPPED;
2259 }
2260 
2261 static int
2262 myri10ge_m_start(void *arg)
2263 {
2264 	struct myri10ge_priv *mgp = arg;
2265 	int status;
2266 
2267 	mutex_enter(&mgp->intrlock);
2268 
2269 	if (mgp->running != MYRI10GE_ETH_STOPPED) {
2270 		mutex_exit(&mgp->intrlock);
2271 		return (DDI_FAILURE);
2272 	}
2273 	status = myri10ge_start_locked(mgp);
2274 	mutex_exit(&mgp->intrlock);
2275 
2276 	if (status != DDI_SUCCESS)
2277 		return (status);
2278 
2279 	/* start the watchdog timer */
2280 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
2281 	    mgp->timer_ticks);
2282 	return (DDI_SUCCESS);
2283 
2284 }
2285 
2286 static void
2287 myri10ge_m_stop(void *arg)
2288 {
2289 	struct myri10ge_priv *mgp = arg;
2290 
2291 	mutex_enter(&mgp->intrlock);
2292 	/* if the device not running give up */
2293 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
2294 		mutex_exit(&mgp->intrlock);
2295 		return;
2296 	}
2297 
2298 	mgp->running = MYRI10GE_ETH_STOPPING;
2299 	mutex_exit(&mgp->intrlock);
2300 	(void) untimeout(mgp->timer_id);
2301 	mutex_enter(&mgp->intrlock);
2302 	myri10ge_stop_locked(mgp);
2303 	mutex_exit(&mgp->intrlock);
2304 
2305 }
2306 
/*
 * Account a received frame as broadcast/multicast where applicable and,
 * for TCP or UDP carried over IPv4 or IPv6, attach the supplied partial
 * checksum to the mblk with mac_hcksum_set(HCK_PARTIALCKSUM).
 *
 * mp	- received frame; mp->b_rptr is read as the ethernet header
 * s	- receive statistics updated for group-addressed frames
 * csum	- checksum value for the frame; only its low 16 bits are used,
 *	  after an ntohs()
 *
 * Nothing is attached for other protocols, or when the mblk holds more
 * bytes than the headers plus the IP length account for.
 */
static inline void
myri10ge_rx_csum(mblk_t *mp, struct myri10ge_rx_ring_stats *s, uint32_t csum)
{
	struct ether_header *eh;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t start, stuff, end, partial, hdrlen;


	csum = ntohs((uint16_t)csum);
	eh = (struct ether_header *)(void *)mp->b_rptr;
	hdrlen = sizeof (*eh);
	/* low bit of the first destination octet set: group address */
	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
			s->brdcstrcv++;
		else
			s->multircv++;
	}

	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
		/*
		 * fix checksum by subtracting 4 bytes after what the
		 * firmware thought was the end of the ether hdr
		 */
		partial = *(uint32_t *)
		    (void *)(mp->b_rptr + ETHERNET_HEADER_SIZE);
		/* one's-complement subtract with end-around carry ... */
		csum += ~partial;
		csum +=  (csum < ~partial);
		/* ... then fold the 32-bit sum down to 16 bits */
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
		hdrlen += VLAN_TAGSZ;
	}

	/*
	 * NOTE(review): for a VLAN-tagged frame eh->ether_type is still
	 * ETHERTYPE_VLAN here, so neither test below can match and the
	 * function returns without attaching a checksum -- confirm
	 * whether the encapsulated ethertype was meant to be examined.
	 */
	if (eh->ether_type ==  BE_16(ETHERTYPE_IP)) {
		ip = (struct ip *)(void *)(mp->b_rptr + hdrlen);
		/* ip_hl counts 32-bit words; start is the IP header length */
		start = ip->ip_hl << 2;

		if (ip->ip_p == IPPROTO_TCP)
			stuff = start + offsetof(struct tcphdr, th_sum);
		else if (ip->ip_p == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = ntohs(ip->ip_len);
	} else if (eh->ether_type ==  BE_16(ETHERTYPE_IPV6)) {
		ip6 = (struct ip6_hdr *)(void *)(mp->b_rptr + hdrlen);
		start = sizeof (*ip6);
		/* only the first next-header value is examined */
		if (ip6->ip6_nxt == IPPROTO_TCP) {
			stuff = start + offsetof(struct tcphdr, th_sum);
		} else if (ip6->ip6_nxt == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = start + ntohs(ip6->ip6_plen);
		/*
		 * IPv6 headers do not contain a checksum, and hence
		 * do not checksum to zero, so they don't "fall out"
		 * of the partial checksum calculation like IPv4
		 * headers do.  We need to fix the partial checksum by
		 * subtracting the checksum of the IPv6 header.
		 */

		partial = myri10ge_csum_generic((uint16_t *)ip6, sizeof (*ip6));
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
	} else {
		return;
	}

	if (MBLKL(mp) > hdrlen + end) {
		/* padded frame, so hw csum may be invalid */
		return;
	}

	/* start/stuff/end are offsets measured from the IP header */
	mac_hcksum_set(mp, start, stuff, end, csum, HCK_PARTIALCKSUM);
}
2386 
2387 static mblk_t *
2388 myri10ge_rx_done_small(struct myri10ge_slice_state *ss, uint32_t len,
2389     uint32_t csum)
2390 {
2391 	mblk_t *mp;
2392 	myri10ge_rx_ring_t *rx;
2393 	int idx;
2394 
2395 	rx = &ss->rx_small;
2396 	idx = rx->cnt & rx->mask;
2397 	ss->rx_small.cnt++;
2398 
2399 	/* allocate a new buffer to pass up the stack */
2400 	mp = allocb(len + MXGEFW_PAD, 0);
2401 	if (mp == NULL) {
2402 		MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_small_nobuf);
2403 		goto abort;
2404 	}
2405 	bcopy(ss->rx_small.info[idx].ptr,
2406 	    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2407 	mp->b_wptr += len + MXGEFW_PAD;
2408 	mp->b_rptr += MXGEFW_PAD;
2409 
2410 	ss->rx_stats.ibytes += len;
2411 	ss->rx_stats.ipackets += 1;
2412 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2413 
2414 abort:
2415 	if ((idx & 7) == 7) {
2416 		myri10ge_submit_8rx(&rx->lanai[idx - 7],
2417 		    &rx->shadow[idx - 7]);
2418 	}
2419 
2420 	return (mp);
2421 }
2422 
2423 
2424 static mblk_t *
2425 myri10ge_rx_done_big(struct myri10ge_slice_state *ss, uint32_t len,
2426     uint32_t csum)
2427 {
2428 	struct myri10ge_jpool_stuff *jpool;
2429 	struct myri10ge_jpool_entry *j;
2430 	mblk_t *mp;
2431 	int idx, num_owned_by_mcp;
2432 
2433 	jpool = &ss->jpool;
2434 	idx = ss->j_rx_cnt & ss->rx_big.mask;
2435 	j = ss->rx_big.info[idx].j;
2436 
2437 	if (j == NULL) {
2438 		printf("%s: null j at idx=%d, rx_big.cnt = %d, j_rx_cnt=%d\n",
2439 		    ss->mgp->name, idx, ss->rx_big.cnt, ss->j_rx_cnt);
2440 		return (NULL);
2441 	}
2442 
2443 
2444 	ss->rx_big.info[idx].j = NULL;
2445 	ss->j_rx_cnt++;
2446 
2447 
2448 	/*
2449 	 * Check to see if we are low on rx buffers.
2450 	 * Note that we must leave at least 8 free so there are
2451 	 * enough to free in a single 64-byte write.
2452 	 */
2453 	num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2454 	if (num_owned_by_mcp < jpool->low_water) {
2455 		mutex_enter(&jpool->mtx);
2456 		myri10ge_restock_jumbos(ss);
2457 		mutex_exit(&jpool->mtx);
2458 		num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2459 		/* if we are still low, then we have to copy */
2460 		if (num_owned_by_mcp < 16) {
2461 			MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_copy);
2462 			/* allocate a new buffer to pass up the stack */
2463 			mp = allocb(len + MXGEFW_PAD, 0);
2464 			if (mp == NULL) {
2465 				goto abort;
2466 			}
2467 			bcopy(j->buf,
2468 			    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2469 			myri10ge_jfree_rtn(j);
2470 			/* push buffer back to NIC */
2471 			mutex_enter(&jpool->mtx);
2472 			myri10ge_restock_jumbos(ss);
2473 			mutex_exit(&jpool->mtx);
2474 			goto set_len;
2475 		}
2476 	}
2477 
2478 	/* loan our buffer to the stack */
2479 	mp = desballoc((unsigned char *)j->buf, myri10ge_mtu, 0, &j->free_func);
2480 	if (mp == NULL) {
2481 		goto abort;
2482 	}
2483 
2484 set_len:
2485 	mp->b_rptr += MXGEFW_PAD;
2486 	mp->b_wptr = ((unsigned char *) mp->b_rptr + len);
2487 
2488 	ss->rx_stats.ibytes += len;
2489 	ss->rx_stats.ipackets += 1;
2490 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2491 
2492 	return (mp);
2493 
2494 abort:
2495 	myri10ge_jfree_rtn(j);
2496 	MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_big_nobuf);
2497 	return (NULL);
2498 }
2499 
/*
 * Free all transmit buffers up until the specified index
 *
 * ss		- slice whose transmit ring is being reaped
 * mcp_index	- completed-packet count to advance tx->pkt_done to
 *
 * Ring slots are walked from tx->done.  DMA handles are unbound and
 * collected on a local list that is handed to
 * myri10ge_free_tx_handles() in one call at the end.  Per-packet
 * statistics stored in a slot are folded into the ring totals, and
 * tx->pkt_done only advances on slots that carry such statistics.
 */
static inline void
myri10ge_tx_done(struct myri10ge_slice_state *ss, uint32_t mcp_index)
{
	myri10ge_tx_ring_t *tx;
	struct myri10ge_tx_dma_handle_head handles;
	int idx;
	int limit = 0;

	tx = &ss->tx;
	handles.head = NULL;
	handles.tail = NULL;
	while (tx->pkt_done != (int)mcp_index) {
		idx = tx->done & tx->mask;

		/*
		 * mblk & DMA handle attached only to first slot
		 * per buffer in the packet
		 */

		if (tx->info[idx].m) {
			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
			/* push the handle on the front of the local list */
			tx->info[idx].handle->next = handles.head;
			handles.head = tx->info[idx].handle;
			if (handles.tail == NULL)
				handles.tail = tx->info[idx].handle;
			freeb(tx->info[idx].m);
			tx->info[idx].m = 0;
			tx->info[idx].handle = 0;
		}
		/* a non-zero packet count marks a slot holding pkt stats */
		if (tx->info[idx].ostat.opackets != 0) {
			tx->stats.multixmt += tx->info[idx].ostat.multixmt;
			tx->stats.brdcstxmt += tx->info[idx].ostat.brdcstxmt;
			tx->stats.obytes += tx->info[idx].ostat.obytes;
			tx->stats.opackets += tx->info[idx].ostat.opackets;
			tx->info[idx].stat.un.all = 0;
			tx->pkt_done++;
		}

		tx->done++;
		/*
		 * if we stalled the queue, wake it.  But Wait until
		 * we have at least 1/2 our slots free.
		 */
		if ((tx->req - tx->done) < (tx->mask >> 1) &&
		    tx->stall != tx->sched) {
			mutex_enter(&ss->tx.lock);
			tx->sched = tx->stall;
			mutex_exit(&ss->tx.lock);
			mac_tx_ring_update(ss->mgp->mh, tx->rh);
		}

		/* limit potential for livelock */
		if (unlikely(++limit >  2 * tx->mask))
			break;
	}
	if (tx->req == tx->done && tx->stop != NULL) {
		/*
		 * Nic has sent all pending requests, allow it
		 * to stop polling this queue
		 */
		mutex_enter(&tx->lock);
		/* re-check under the lock before writing the stop word */
		if (tx->req == tx->done && tx->active) {
			*(int *)(void *)tx->stop = 1;
			tx->active = 0;
			mb();
		}
		mutex_exit(&tx->lock);
	}
	if (handles.head != NULL)
		myri10ge_free_tx_handles(tx, &handles);
}
2574 
2575 static void
2576 myri10ge_mbl_init(struct myri10ge_mblk_list *mbl)
2577 {
2578 	mbl->head = NULL;
2579 	mbl->tail = &mbl->head;
2580 	mbl->cnt = 0;
2581 }
2582 
2583 /*ARGSUSED*/
2584 void
2585 myri10ge_mbl_append(struct myri10ge_slice_state *ss,
2586     struct myri10ge_mblk_list *mbl, mblk_t *mp)
2587 {
2588 	*(mbl->tail) = mp;
2589 	mbl->tail = &mp->b_next;
2590 	mp->b_next = NULL;
2591 	mbl->cnt++;
2592 }
2593 
2594 
2595 static inline void
2596 myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
2597     struct myri10ge_mblk_list *mbl, int limit, boolean_t *stop)
2598 {
2599 	myri10ge_rx_done_t *rx_done = &ss->rx_done;
2600 	struct myri10ge_priv *mgp = ss->mgp;
2601 	mblk_t *mp;
2602 	struct lro_entry *lro;
2603 	uint16_t length;
2604 	uint16_t checksum;
2605 
2606 
2607 	while (rx_done->entry[rx_done->idx].length != 0) {
2608 		if (unlikely (*stop)) {
2609 			break;
2610 		}
2611 		length = ntohs(rx_done->entry[rx_done->idx].length);
2612 		length &= (~MXGEFW_RSS_HASH_MASK);
2613 
2614 		/* limit potential for livelock */
2615 		limit -= length;
2616 		if (unlikely(limit < 0))
2617 			break;
2618 
2619 		rx_done->entry[rx_done->idx].length = 0;
2620 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
2621 		if (length <= myri10ge_small_bytes)
2622 			mp = myri10ge_rx_done_small(ss, length, checksum);
2623 		else
2624 			mp = myri10ge_rx_done_big(ss, length, checksum);
2625 		if (mp != NULL) {
2626 			if (!myri10ge_lro ||
2627 			    0 != myri10ge_lro_rx(ss, mp, checksum, mbl))
2628 				myri10ge_mbl_append(ss, mbl, mp);
2629 		}
2630 		rx_done->cnt++;
2631 		rx_done->idx = rx_done->cnt & (mgp->max_intr_slots - 1);
2632 	}
2633 	while (ss->lro_active != NULL) {
2634 		lro = ss->lro_active;
2635 		ss->lro_active = lro->next;
2636 		myri10ge_lro_flush(ss, lro, mbl);
2637 	}
2638 }
2639 
2640 static void
2641 myri10ge_intr_rx(struct myri10ge_slice_state *ss)
2642 {
2643 	uint64_t gen;
2644 	struct myri10ge_mblk_list mbl;
2645 
2646 	myri10ge_mbl_init(&mbl);
2647 	if (mutex_tryenter(&ss->rx_lock) == 0)
2648 		return;
2649 	gen = ss->rx_gen_num;
2650 	myri10ge_clean_rx_done(ss, &mbl, MYRI10GE_POLL_NULL,
2651 	    &ss->rx_polling);
2652 	if (mbl.head != NULL)
2653 		mac_rx_ring(ss->mgp->mh, ss->rx_rh, mbl.head, gen);
2654 	mutex_exit(&ss->rx_lock);
2655 
2656 }
2657 
2658 static mblk_t *
2659 myri10ge_poll_rx(void *arg, int bytes)
2660 {
2661 	struct myri10ge_slice_state *ss = arg;
2662 	struct myri10ge_mblk_list mbl;
2663 	boolean_t dummy = B_FALSE;
2664 
2665 	if (bytes == 0)
2666 		return (NULL);
2667 
2668 	myri10ge_mbl_init(&mbl);
2669 	mutex_enter(&ss->rx_lock);
2670 	if (ss->rx_polling)
2671 		myri10ge_clean_rx_done(ss, &mbl, bytes, &dummy);
2672 	else
2673 		printf("%d: poll_rx: token=%d, polling=%d\n", (int)(ss -
2674 		    ss->mgp->ss), ss->rx_token, ss->rx_polling);
2675 	mutex_exit(&ss->rx_lock);
2676 	return (mbl.head);
2677 }
2678 
/*
 * Interrupt handler for one slice; arg0 is the slice state and arg1 is
 * unused.
 *
 * Returns DDI_INTR_UNCLAIMED if the firmware stats block is not marked
 * valid, DDI_INTR_CLAIMED otherwise.  When claimed it runs receive
 * processing (if the low bit of the valid byte is set), reaps transmit
 * completions, applies link state / RDMA tag updates from the stats
 * block, and writes the irq_claim words.
 */
/*ARGSUSED*/
static uint_t
myri10ge_intr(caddr_t arg0, caddr_t arg1)
{
	struct myri10ge_slice_state *ss =
	    (struct myri10ge_slice_state *)(void *)arg0;
	struct myri10ge_priv *mgp = ss->mgp;
	mcp_irq_data_t *stats = ss->fw_stats;
	myri10ge_tx_ring_t *tx = &ss->tx;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return (DDI_INTR_UNCLAIMED);
	}
	/* keep a copy; stats->valid itself is cleared below */
	valid = stats->valid;

	/* low bit indicates receives are present */
	if (valid & 1)
		myri10ge_intr_rx(ss);

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		if (!myri10ge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
		mb();
	} else {
		/* no need to wait for conf. that irq is low */
		stats->valid = 0;
	}

	/*
	 * Loop for as long as the valid byte is still set (re-read
	 * through a volatile pointer on each pass).
	 */
	do {
		/* check for transmit completes and receives */
		send_done_count = ntohl(stats->send_done_count);
		if (send_done_count != tx->pkt_done)
			myri10ge_tx_done(ss, (int)send_done_count);
	} while (*((volatile uint8_t *) &stats->valid));

	if (stats->stats_updated) {
		if (mgp->link_state != stats->link_up || stats->link_down) {
			mgp->link_state = stats->link_up;
			if (stats->link_down) {
				/* down_cnt is what myri10ge_stop_locked waits on */
				mgp->down_cnt += stats->link_down;
				mgp->link_state = 0;
			}
			if (mgp->link_state) {
				if (myri10ge_verbose)
					printf("%s: link up\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_UP);
			} else {
				if (myri10ge_verbose)
					printf("%s: link down\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_DOWN);
			}
			MYRI10GE_NIC_STAT_INC(link_changes);
		}
		/* a change in the tag count is reported as an RDMA timeout */
		if (mgp->rdma_tags_available !=
		    ntohl(ss->fw_stats->rdma_tags_available)) {
			mgp->rdma_tags_available =
			    ntohl(ss->fw_stats->rdma_tags_available);
			cmn_err(CE_NOTE, "%s: RDMA timed out! "
			    "%d tags left\n", mgp->name,
			    mgp->rdma_tags_available);
		}
	}

	mb();
	/* check to see if we have rx token to pass back */
	if (valid & 0x1) {
		mutex_enter(&ss->poll_lock);
		if (ss->rx_polling) {
			/* polling is active: hold the token, claim later */
			ss->rx_token = 1;
		} else {
			*ss->irq_claim = BE_32(3);
			ss->rx_token = 0;
		}
		mutex_exit(&ss->poll_lock);
	}
	/* the second claim word is written on every claimed interrupt */
	*(ss->irq_claim + 1) = BE_32(3);
	return (DDI_INTR_CLAIMED);
}
2764 
2765 /*
2766  * Add or remove a multicast address.  This is called with our
2767  * macinfo's lock held by GLD, so we do not need to worry about
2768  * our own locking here.
2769  */
2770 static int
2771 myri10ge_m_multicst(void *arg, boolean_t add, const uint8_t *multicastaddr)
2772 {
2773 	myri10ge_cmd_t cmd;
2774 	struct myri10ge_priv *mgp = arg;
2775 	int status, join_leave;
2776 
2777 	if (add)
2778 		join_leave = MXGEFW_JOIN_MULTICAST_GROUP;
2779 	else
2780 		join_leave = MXGEFW_LEAVE_MULTICAST_GROUP;
2781 	(void) memcpy(&cmd.data0, multicastaddr, 4);
2782 	(void) memcpy(&cmd.data1, multicastaddr + 4, 2);
2783 	cmd.data0 = htonl(cmd.data0);
2784 	cmd.data1 = htonl(cmd.data1);
2785 	status = myri10ge_send_cmd(mgp, join_leave, &cmd);
2786 	if (status == 0)
2787 		return (0);
2788 
2789 	cmn_err(CE_WARN, "%s: failed to set multicast address\n",
2790 	    mgp->name);
2791 	return (status);
2792 }
2793 
2794 
2795 static int
2796 myri10ge_m_promisc(void *arg, boolean_t on)
2797 {
2798 	struct myri10ge_priv *mgp = arg;
2799 
2800 	myri10ge_change_promisc(mgp, on);
2801 	return (0);
2802 }
2803 
2804 /*
2805  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2806  *  backwards one at a time and handle ring wraps
2807  */
2808 
2809 static inline void
2810 myri10ge_submit_req_backwards(myri10ge_tx_ring_t *tx,
2811     mcp_kreq_ether_send_t *src, int cnt)
2812 {
2813 	int idx, starting_slot;
2814 	starting_slot = tx->req;
2815 	while (cnt > 1) {
2816 		cnt--;
2817 		idx = (starting_slot + cnt) & tx->mask;
2818 		myri10ge_pio_copy(&tx->lanai[idx],
2819 		    &src[cnt], sizeof (*src));
2820 		mb();
2821 	}
2822 }
2823 
/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 *
 * tx	- transmit ring to submit on; tx->req is advanced by cnt
 * src	- array of cnt requests; src[0].flags is cleared while the
 *	  chain is written and restored before the final write
 * cnt	- number of requests in src
 */

static inline void
myri10ge_submit_req(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
    int cnt)
{
	int idx, i;
	uint32_t *src_ints, *dst_ints;
	mcp_kreq_ether_send_t *srcp, *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	/* hold back the first request's flags until the chain is written */
	last_flags = src->flags;
	src->flags = 0;
	mb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy forwards, two requests per write */
		for (i = 0; i < (cnt - 1); i += 2) {
			myri10ge_pio_copy(dstp, srcp, 2 * sizeof (*src));
			mb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * submit all but the first request, and ensure
		 *  that it is submitted below
		 */
		myri10ge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	if (i < cnt) {
		/*
		 * submit the first request (wrap path), or the single
		 * request left over by the pairwise loop when cnt is odd
		 */
		myri10ge_pio_copy(dstp, srcp, sizeof (*src));
		mb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags |= last_flags;
	/* word index 3 is the fourth 32-bit word of the first request */
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (uint32_t *)dst;
	dst_ints += 3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	mb();
	/* notify NIC to poll this tx ring */
	if (!tx->active && tx->go != NULL) {
		*(int *)(void *)tx->go = 1;
		tx->active = 1;
		tx->activate++;
		mb();
	}
}
2886 
2887 /* ARGSUSED */
2888 static inline void
2889 myri10ge_lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
2890 {
2891 	uint32_t lso_flag;
2892 	mac_lso_get(mp, mss, &lso_flag);
2893 	(*flags) |= lso_flag;
2894 }
2895 
2896 
2897 /* like pullupmsg, except preserve hcksum/LSO attributes */
2898 static int
2899 myri10ge_pullup(struct myri10ge_slice_state *ss, mblk_t *mp)
2900 {
2901 	uint32_t start, stuff, tx_offload_flags, mss;
2902 	int ok;
2903 
2904 	mss = 0;
2905 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
2906 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
2907 
2908 	ok = pullupmsg(mp, -1);
2909 	if (!ok) {
2910 		printf("pullupmsg failed");
2911 		return (DDI_FAILURE);
2912 	}
2913 	MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_pullup);
2914 	mac_hcksum_set(mp, start, stuff, 0, 0, tx_offload_flags);
2915 	if (tx_offload_flags & HW_LSO)
2916 		DB_LSOMSS(mp) = (uint16_t)mss;
2917 	lso_info_set(mp, mss, tx_offload_flags);
2918 	return (DDI_SUCCESS);
2919 }
2920 
2921 static inline void
2922 myri10ge_tx_stat(struct myri10ge_tx_pkt_stats *s, struct ether_header *eh,
2923     int opackets, int obytes)
2924 {
2925 	s->un.all = 0;
2926 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2927 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2928 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2929 			s->un.s.brdcstxmt = 1;
2930 		else
2931 			s->un.s.multixmt = 1;
2932 	}
2933 	s->un.s.opackets = (uint16_t)opackets;
2934 	s->un.s.obytes = obytes;
2935 }
2936 
2937 static int
2938 myri10ge_tx_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
2939     mcp_kreq_ether_send_t *req)
2940 {
2941 	myri10ge_tx_ring_t *tx = &ss->tx;
2942 	caddr_t ptr;
2943 	struct myri10ge_tx_copybuf *cp;
2944 	mblk_t *bp;
2945 	int idx, mblen, avail;
2946 	uint16_t len;
2947 
2948 	mutex_enter(&tx->lock);
2949 	avail = tx->mask - (tx->req - tx->done);
2950 	if (avail <= 1) {
2951 		mutex_exit(&tx->lock);
2952 		return (EBUSY);
2953 	}
2954 	idx = tx->req & tx->mask;
2955 	cp = &tx->cp[idx];
2956 	ptr = cp->va;
2957 	for (len = 0, bp = mp; bp != NULL; bp = bp->b_cont) {
2958 		mblen = MBLKL(bp);
2959 		bcopy(bp->b_rptr, ptr, mblen);
2960 		ptr += mblen;
2961 		len += mblen;
2962 	}
2963 	/* ensure runts are padded to 60 bytes */
2964 	if (len < 60) {
2965 		bzero(ptr, 64 - len);
2966 		len = 60;
2967 	}
2968 	req->addr_low = cp->dma.low;
2969 	req->addr_high = cp->dma.high;
2970 	req->length = htons(len);
2971 	req->pad = 0;
2972 	req->rdma_count = 1;
2973 	myri10ge_tx_stat(&tx->info[idx].stat,
2974 	    (struct ether_header *)(void *)cp->va, 1, len);
2975 	(void) ddi_dma_sync(cp->dma.handle, 0, len, DDI_DMA_SYNC_FORDEV);
2976 	myri10ge_submit_req(&ss->tx, req, 1);
2977 	mutex_exit(&tx->lock);
2978 	freemsg(mp);
2979 	return (DDI_SUCCESS);
2980 }
2981 
2982 
2983 static void
2984 myri10ge_send_locked(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *req_list,
2985     struct myri10ge_tx_buffer_state *tx_info,
2986     int count)
2987 {
2988 	int i, idx;
2989 
2990 	idx = 0; /* gcc -Wuninitialized */
2991 	/* store unmapping and bp info for tx irq handler */
2992 	for (i = 0; i < count; i++) {
2993 		idx = (tx->req + i) & tx->mask;
2994 		tx->info[idx].m = tx_info[i].m;
2995 		tx->info[idx].handle = tx_info[i].handle;
2996 	}
2997 	tx->info[idx].stat.un.all = tx_info[0].stat.un.all;
2998 
2999 	/* submit the frame to the nic */
3000 	myri10ge_submit_req(tx, req_list, count);
3001 
3002 
3003 }
3004 
3005 
3006 
3007 static void
3008 myri10ge_copydata(mblk_t *mp, int off, int len, caddr_t buf)
3009 {
3010 	mblk_t *bp;
3011 	int seglen;
3012 	uint_t count;
3013 
3014 	bp = mp;
3015 
3016 	while (off > 0) {
3017 		seglen = MBLKL(bp);
3018 		if (off < seglen)
3019 			break;
3020 		off -= seglen;
3021 		bp = bp->b_cont;
3022 	}
3023 	while (len > 0) {
3024 		seglen = MBLKL(bp);
3025 		count = min(seglen - off, len);
3026 		bcopy(bp->b_rptr + off, buf, count);
3027 		len -= count;
3028 		buf += count;
3029 		off = 0;
3030 		bp = bp->b_cont;
3031 	}
3032 }
3033 
3034 static int
3035 myri10ge_ether_parse_header(mblk_t *mp)
3036 {
3037 	struct ether_header eh_copy;
3038 	struct ether_header *eh;
3039 	int eth_hdr_len, seglen;
3040 
3041 	seglen = MBLKL(mp);
3042 	eth_hdr_len = sizeof (*eh);
3043 	if (seglen < eth_hdr_len) {
3044 		myri10ge_copydata(mp, 0, eth_hdr_len, (caddr_t)&eh_copy);
3045 		eh = &eh_copy;
3046 	} else {
3047 		eh = (struct ether_header *)(void *)mp->b_rptr;
3048 	}
3049 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
3050 		eth_hdr_len += 4;
3051 	}
3052 
3053 	return (eth_hdr_len);
3054 }
3055 
/*
 * Parse the IPv4 and TCP headers of an LSO packet.
 *
 * mp	- the message; may be split across several fragments
 * off	- byte offset of the IP header within the message
 *
 * Zeroes the IP header checksum in the message if it is non-zero, and
 * returns the offset of the first byte after the TCP header (off plus
 * the IP and TCP header lengths).  When the headers are not contiguous
 * in the first fragment they are gathered into a local buffer for
 * reading.
 */
static int
myri10ge_lso_parse_header(mblk_t *mp, int off)
{
	char buf[128];
	int seglen, sum_off;
	struct ip *ip;
	struct tcphdr *tcp;

	seglen = MBLKL(mp);
	/* first make sure the fixed part of the IP header is readable */
	if (seglen < off + sizeof (*ip)) {
		myri10ge_copydata(mp, off, sizeof (*ip), buf);
		ip = (struct ip *)(void *)buf;
	} else {
		ip = (struct ip *)(void *)(mp->b_rptr + off);
	}
	/* then the full IP header (ip_hl 32-bit words) plus a TCP header */
	if (seglen < off + (ip->ip_hl << 2) + sizeof (*tcp)) {
		myri10ge_copydata(mp, off,
		    (ip->ip_hl << 2) + sizeof (*tcp), buf);
		ip = (struct ip *)(void *)buf;
	}
	tcp = (struct tcphdr *)(void *)((char *)ip + (ip->ip_hl << 2));

	/*
	 * NIC expects ip_sum to be zero.  Recent changes to
	 * OpenSolaris leave the correct ip checksum there, rather
	 * than the required zero, so we need to zero it.  Otherwise,
	 * the NIC will produce bad checksums when sending LSO packets.
	 */
	if (ip->ip_sum != 0) {
		if (((char *)ip) != buf) {
			/* ip points into mblk, so just zero it */
			ip->ip_sum = 0;
		} else {
			/*
			 * ip points into a copy, so walk the chain
			 * to find the ip_csum, then zero it
			 */
			sum_off = off + _PTRDIFF(&ip->ip_sum, buf);
			/* locate the fragment holding the first csum byte */
			while (sum_off > (int)(MBLKL(mp) - 1)) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
			/* the second byte may sit in a later fragment */
			sum_off++;
			while (sum_off > MBLKL(mp) - 1) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
		}
	}
	/* ip_hl and th_off are both counted in 32-bit words */
	return (off + ((ip->ip_hl + tcp->th_off) << 2));
}
3109 
3110 static int
3111 myri10ge_tx_tso_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
3112     mcp_kreq_ether_send_t *req_list, int hdr_size, int pkt_size,
3113     uint16_t mss, uint8_t cksum_offset)
3114 {
3115 	myri10ge_tx_ring_t *tx = &ss->tx;
3116 	struct myri10ge_priv *mgp = ss->mgp;
3117 	mblk_t *bp;
3118 	mcp_kreq_ether_send_t *req;
3119 	struct myri10ge_tx_copybuf *cp;
3120 	caddr_t rptr, ptr;
3121 	int mblen, count, cum_len, mss_resid, tx_req, pkt_size_tmp;
3122 	int resid, avail, idx, hdr_size_tmp, tx_boundary;
3123 	int rdma_count;
3124 	uint32_t seglen, len, boundary, low, high_swapped;
3125 	uint16_t pseudo_hdr_offset = htons(mss);
3126 	uint8_t flags;
3127 
3128 	tx_boundary = mgp->tx_boundary;
3129 	hdr_size_tmp = hdr_size;
3130 	resid = tx_boundary;
3131 	count = 1;
3132 	mutex_enter(&tx->lock);
3133 
3134 	/* check to see if the slots are really there */
3135 	avail = tx->mask - (tx->req - tx->done);
3136 	if (unlikely(avail <=  MYRI10GE_MAX_SEND_DESC_TSO)) {
3137 		atomic_inc_32(&tx->stall);
3138 		mutex_exit(&tx->lock);
3139 		return (EBUSY);
3140 	}
3141 
3142 	/* copy */
3143 	cum_len = -hdr_size;
3144 	count = 0;
3145 	req = req_list;
3146 	idx = tx->mask & tx->req;
3147 	cp = &tx->cp[idx];
3148 	low = ntohl(cp->dma.low);
3149 	ptr = cp->va;
3150 	cp->len = 0;
3151 	if (mss) {
3152 		int payload = pkt_size - hdr_size;
3153 		uint16_t opackets = (payload / mss) + ((payload % mss) != 0);
3154 		tx->info[idx].ostat.opackets = opackets;
3155 		tx->info[idx].ostat.obytes = (opackets - 1) * hdr_size
3156 		    + pkt_size;
3157 	}
3158 	hdr_size_tmp = hdr_size;
3159 	mss_resid = mss;
3160 	flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3161 	tx_req = tx->req;
3162 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3163 		mblen = MBLKL(bp);
3164 		rptr = (caddr_t)bp->b_rptr;
3165 		len = min(hdr_size_tmp, mblen);
3166 		if (len) {
3167 			bcopy(rptr, ptr, len);
3168 			rptr += len;
3169 			ptr += len;
3170 			resid -= len;
3171 			mblen -= len;
3172 			hdr_size_tmp -= len;
3173 			cp->len += len;
3174 			if (hdr_size_tmp)
3175 				continue;
3176 			if (resid < mss) {
3177 				tx_req++;
3178 				idx = tx->mask & tx_req;
3179 				cp = &tx->cp[idx];
3180 				low = ntohl(cp->dma.low);
3181 				ptr = cp->va;
3182 				resid = tx_boundary;
3183 			}
3184 		}
3185 		while (mblen) {
3186 			len = min(mss_resid, mblen);
3187 			bcopy(rptr, ptr, len);
3188 			mss_resid -= len;
3189 			resid -= len;
3190 			mblen -= len;
3191 			rptr += len;
3192 			ptr += len;
3193 			cp->len += len;
3194 			if (mss_resid == 0) {
3195 				mss_resid = mss;
3196 				if (resid < mss) {
3197 					tx_req++;
3198 					idx = tx->mask & tx_req;
3199 					cp = &tx->cp[idx];
3200 					cp->len = 0;
3201 					low = ntohl(cp->dma.low);
3202 					ptr = cp->va;
3203 					resid = tx_boundary;
3204 				}
3205 			}
3206 		}
3207 	}
3208 
3209 	req = req_list;
3210 	pkt_size_tmp = pkt_size;
3211 	count = 0;
3212 	rdma_count = 0;
3213 	tx_req = tx->req;
3214 	while (pkt_size_tmp) {
3215 		idx = tx->mask & tx_req;
3216 		cp = &tx->cp[idx];
3217 		high_swapped = cp->dma.high;
3218 		low = ntohl(cp->dma.low);
3219 		len = cp->len;
3220 		if (len == 0) {
3221 			printf("len=0! pkt_size_tmp=%d, pkt_size=%d\n",
3222 			    pkt_size_tmp, pkt_size);
3223 			for (bp = mp; bp != NULL; bp = bp->b_cont) {
3224 				mblen = MBLKL(bp);
3225 				printf("mblen:%d\n", mblen);
3226 			}
3227 			pkt_size_tmp = pkt_size;
3228 			tx_req = tx->req;
3229 			while (pkt_size_tmp > 0) {
3230 				idx = tx->mask & tx_req;
3231 				cp = &tx->cp[idx];
3232 				printf("cp->len = %d\n", cp->len);
3233 				pkt_size_tmp -= cp->len;
3234 				tx_req++;
3235 			}
3236 			printf("dropped\n");
3237 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3238 			goto done;
3239 		}
3240 		pkt_size_tmp -= len;
3241 		while (len) {
3242 			while (len) {
3243 				uint8_t flags_next;
3244 				int cum_len_next;
3245 
3246 				boundary = (low + mgp->tx_boundary) &
3247 				    ~(mgp->tx_boundary - 1);
3248 				seglen = boundary - low;
3249 				if (seglen > len)
3250 					seglen = len;
3251 
3252 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3253 				cum_len_next = cum_len + seglen;
3254 				(req-rdma_count)->rdma_count = rdma_count + 1;
3255 				if (likely(cum_len >= 0)) {
3256 					/* payload */
3257 					int next_is_first, chop;
3258 
3259 					chop = (cum_len_next > mss);
3260 					cum_len_next = cum_len_next % mss;
3261 					next_is_first = (cum_len_next == 0);
3262 					flags |= chop *
3263 					    MXGEFW_FLAGS_TSO_CHOP;
3264 					flags_next |= next_is_first *
3265 					    MXGEFW_FLAGS_FIRST;
3266 					rdma_count |= -(chop | next_is_first);
3267 					rdma_count += chop & !next_is_first;
3268 				} else if (likely(cum_len_next >= 0)) {
3269 					/* header ends */
3270 					int small;
3271 
3272 					rdma_count = -1;
3273 					cum_len_next = 0;
3274 					seglen = -cum_len;
3275 					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
3276 					flags_next = MXGEFW_FLAGS_TSO_PLD |
3277 					    MXGEFW_FLAGS_FIRST |
3278 					    (small * MXGEFW_FLAGS_SMALL);
3279 				}
3280 				req->addr_high = high_swapped;
3281 				req->addr_low = htonl(low);
3282 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3283 				req->pad = 0; /* complete solid 16-byte block */
3284 				req->rdma_count = 1;
3285 				req->cksum_offset = cksum_offset;
3286 				req->length = htons(seglen);
3287 				req->flags = flags | ((cum_len & 1) *
3288 				    MXGEFW_FLAGS_ALIGN_ODD);
3289 				if (cksum_offset > seglen)
3290 					cksum_offset -= seglen;
3291 				else
3292 					cksum_offset = 0;
3293 				low += seglen;
3294 				len -= seglen;
3295 				cum_len = cum_len_next;
3296 				req++;
3297 				req->flags = 0;
3298 				flags = flags_next;
3299 				count++;
3300 				rdma_count++;
3301 			}
3302 		}
3303 		tx_req++;
3304 	}
3305 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3306 	do {
3307 		req--;
3308 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
3309 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3310 	    MXGEFW_FLAGS_FIRST)));
3311 
3312 	myri10ge_submit_req(tx, req_list, count);
3313 done:
3314 	mutex_exit(&tx->lock);
3315 	freemsg(mp);
3316 	return (DDI_SUCCESS);
3317 }
3318 
3319 /*
3320  * Try to send the chain of buffers described by the mp.  We must not
3321  * encapsulate more than eth->tx.req - eth->tx.done, or
3322  * MXGEFW_MAX_SEND_DESC, whichever is more.
3323  */
3324 
3325 static int
3326 myri10ge_send(struct myri10ge_slice_state *ss, mblk_t *mp,
3327     mcp_kreq_ether_send_t *req_list, struct myri10ge_tx_buffer_state *tx_info)
3328 {
3329 	struct myri10ge_priv *mgp = ss->mgp;
3330 	myri10ge_tx_ring_t *tx = &ss->tx;
3331 	mcp_kreq_ether_send_t *req;
3332 	struct myri10ge_tx_dma_handle *handles, *dma_handle = NULL;
3333 	mblk_t  *bp;
3334 	ddi_dma_cookie_t cookie;
3335 	int err, rv, count, avail, mblen, try_pullup, i, max_segs, maclen,
3336 	    rdma_count, cum_len, lso_hdr_size;
3337 	uint32_t start, stuff, tx_offload_flags;
3338 	uint32_t seglen, len, mss, boundary, low, high_swapped;
3339 	uint_t ncookies;
3340 	uint16_t pseudo_hdr_offset;
3341 	uint8_t flags, cksum_offset, odd_flag;
3342 	int pkt_size;
3343 	int lso_copy = myri10ge_lso_copy;
3344 	try_pullup = 1;
3345 
3346 again:
3347 	/* Setup checksum offloading, if needed */
3348 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
3349 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
3350 	if (tx_offload_flags & HW_LSO) {
3351 		max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3352 		if ((tx_offload_flags & HCK_PARTIALCKSUM) == 0) {
3353 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_lsobadflags);
3354 			freemsg(mp);
3355 			return (DDI_SUCCESS);
3356 		}
3357 	} else {
3358 		max_segs = MXGEFW_MAX_SEND_DESC;
3359 		mss = 0;
3360 	}
3361 	req = req_list;
3362 	cksum_offset = 0;
3363 	pseudo_hdr_offset = 0;
3364 
3365 	/* leave an extra slot keep the ring from wrapping */
3366 	avail = tx->mask - (tx->req - tx->done);
3367 
3368 	/*
3369 	 * If we have > MXGEFW_MAX_SEND_DESC, then any over-length
3370 	 * message will need to be pulled up in order to fit.
3371 	 * Otherwise, we are low on transmit descriptors, it is
3372 	 * probably better to stall and try again rather than pullup a
3373 	 * message to fit.
3374 	 */
3375 
3376 	if (avail < max_segs) {
3377 		err = EBUSY;
3378 		atomic_inc_32(&tx->stall_early);
3379 		goto stall;
3380 	}
3381 
3382 	/* find out how long the frame is and how many segments it is */
3383 	count = 0;
3384 	odd_flag = 0;
3385 	pkt_size = 0;
3386 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
3387 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3388 		dblk_t *dbp;
3389 		mblen = MBLKL(bp);
3390 		if (mblen == 0) {
3391 			/*
3392 			 * we can't simply skip over 0-length mblks
3393 			 * because the hardware can't deal with them,
3394 			 * and we could leak them.
3395 			 */
3396 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_zero_len);
3397 			err = EIO;
3398 			goto pullup;
3399 		}
3400 		/*
3401 		 * There's no advantage to copying most gesballoc
3402 		 * attached blocks, so disable lso copy in that case
3403 		 */
3404 		if (mss && lso_copy == 1 && ((dbp = bp->b_datap) != NULL)) {
3405 			if ((void *)dbp->db_lastfree != myri10ge_db_lastfree) {
3406 				lso_copy = 0;
3407 			}
3408 		}
3409 		pkt_size += mblen;
3410 		count++;
3411 	}
3412 
3413 	/* Try to pull up excessivly long chains */
3414 	if (count >= max_segs) {
3415 		err = myri10ge_pullup(ss, mp);
3416 		if (likely(err == DDI_SUCCESS)) {
3417 			count = 1;
3418 		} else {
3419 			if (count <  MYRI10GE_MAX_SEND_DESC_TSO) {
3420 				/*
3421 				 * just let the h/w send it, it will be
3422 				 * inefficient, but us better than dropping
3423 				 */
3424 				max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3425 			} else {
3426 				/* drop it */
3427 				MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3428 				freemsg(mp);
3429 				return (0);
3430 			}
3431 		}
3432 	}
3433 
3434 	cum_len = 0;
3435 	maclen = myri10ge_ether_parse_header(mp);
3436 
3437 	if (tx_offload_flags & HCK_PARTIALCKSUM) {
3438 
3439 		cksum_offset = start + maclen;
3440 		pseudo_hdr_offset = htons(stuff + maclen);
3441 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
3442 		flags |= MXGEFW_FLAGS_CKSUM;
3443 	}
3444 
3445 	lso_hdr_size = 0; /* -Wunitinialized */
3446 	if (mss) { /* LSO */
3447 		/* this removes any CKSUM flag from before */
3448 		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3449 		/*
3450 		 * parse the headers and set cum_len to a negative
3451 		 * value to reflect the offset of the TCP payload
3452 		 */
3453 		lso_hdr_size =  myri10ge_lso_parse_header(mp, maclen);
3454 		cum_len = -lso_hdr_size;
3455 		if ((mss < mgp->tx_boundary) && lso_copy) {
3456 			err = myri10ge_tx_tso_copy(ss, mp, req_list,
3457 			    lso_hdr_size, pkt_size, mss, cksum_offset);
3458 			return (err);
3459 		}
3460 
3461 		/*
3462 		 * for TSO, pseudo_hdr_offset holds mss.  The firmware
3463 		 * figures out where to put the checksum by parsing
3464 		 * the header.
3465 		 */
3466 
3467 		pseudo_hdr_offset = htons(mss);
3468 	} else if (pkt_size <= MXGEFW_SEND_SMALL_SIZE) {
3469 		flags |= MXGEFW_FLAGS_SMALL;
3470 		if (pkt_size < myri10ge_tx_copylen) {
3471 			req->cksum_offset = cksum_offset;
3472 			req->pseudo_hdr_offset = pseudo_hdr_offset;
3473 			req->flags = flags;
3474 			err = myri10ge_tx_copy(ss, mp, req);
3475 			return (err);
3476 		}
3477 		cum_len = 0;
3478 	}
3479 
3480 	/* pull one DMA handle for each bp from our freelist */
3481 	handles = NULL;
3482 	err = myri10ge_alloc_tx_handles(ss, count, &handles);
3483 	if (err != DDI_SUCCESS) {
3484 		err = DDI_FAILURE;
3485 		goto stall;
3486 	}
3487 	count = 0;
3488 	rdma_count = 0;
3489 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3490 		mblen = MBLKL(bp);
3491 		dma_handle = handles;
3492 		handles = handles->next;
3493 
3494 		rv = ddi_dma_addr_bind_handle(dma_handle->h, NULL,
3495 		    (caddr_t)bp->b_rptr, mblen,
3496 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
3497 		    &cookie, &ncookies);
3498 		if (unlikely(rv != DDI_DMA_MAPPED)) {
3499 			err = EIO;
3500 			try_pullup = 0;
3501 			dma_handle->next = handles;
3502 			handles = dma_handle;
3503 			goto abort_with_handles;
3504 		}
3505 
3506 		/* reserve the slot */
3507 		tx_info[count].m = bp;
3508 		tx_info[count].handle = dma_handle;
3509 
3510 		for (; ; ) {
3511 			low = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
3512 			high_swapped =
3513 			    htonl(MYRI10GE_HIGHPART_TO_U32(
3514 			    cookie.dmac_laddress));
3515 			len = (uint32_t)cookie.dmac_size;
3516 			while (len) {
3517 				uint8_t flags_next;
3518 				int cum_len_next;
3519 
3520 				boundary = (low + mgp->tx_boundary) &
3521 				    ~(mgp->tx_boundary - 1);
3522 				seglen = boundary - low;
3523 				if (seglen > len)
3524 					seglen = len;
3525 
3526 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3527 				cum_len_next = cum_len + seglen;
3528 				if (mss) {
3529 					(req-rdma_count)->rdma_count =
3530 					    rdma_count + 1;
3531 					if (likely(cum_len >= 0)) {
3532 						/* payload */
3533 						int next_is_first, chop;
3534 
3535 						chop = (cum_len_next > mss);
3536 						cum_len_next =
3537 						    cum_len_next % mss;
3538 						next_is_first =
3539 						    (cum_len_next == 0);
3540 						flags |= chop *
3541 						    MXGEFW_FLAGS_TSO_CHOP;
3542 						flags_next |= next_is_first *
3543 						    MXGEFW_FLAGS_FIRST;
3544 						rdma_count |=
3545 						    -(chop | next_is_first);
3546 						rdma_count +=
3547 						    chop & !next_is_first;
3548 					} else if (likely(cum_len_next >= 0)) {
3549 						/* header ends */
3550 						int small;
3551 
3552 						rdma_count = -1;
3553 						cum_len_next = 0;
3554 						seglen = -cum_len;
3555 						small = (mss <=
3556 						    MXGEFW_SEND_SMALL_SIZE);
3557 						flags_next =
3558 						    MXGEFW_FLAGS_TSO_PLD
3559 						    | MXGEFW_FLAGS_FIRST
3560 						    | (small *
3561 						    MXGEFW_FLAGS_SMALL);
3562 					}
3563 				}
3564 				req->addr_high = high_swapped;
3565 				req->addr_low = htonl(low);
3566 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3567 				req->pad = 0; /* complete solid 16-byte block */
3568 				req->rdma_count = 1;
3569 				req->cksum_offset = cksum_offset;
3570 				req->length = htons(seglen);
3571 				req->flags = flags | ((cum_len & 1) * odd_flag);
3572 				if (cksum_offset > seglen)
3573 					cksum_offset -= seglen;
3574 				else
3575 					cksum_offset = 0;
3576 				low += seglen;
3577 				len -= seglen;
3578 				cum_len = cum_len_next;
3579 				count++;
3580 				rdma_count++;
3581 				/*  make sure all the segments will fit */
3582 				if (unlikely(count >= max_segs)) {
3583 					MYRI10GE_ATOMIC_SLICE_STAT_INC(
3584 					    xmit_lowbuf);
3585 					/* may try a pullup */
3586 					err = EBUSY;
3587 					if (try_pullup)
3588 						try_pullup = 2;
3589 					goto abort_with_handles;
3590 				}
3591 				req++;
3592 				req->flags = 0;
3593 				flags = flags_next;
3594 				tx_info[count].m = 0;
3595 			}
3596 			ncookies--;
3597 			if (ncookies == 0)
3598 				break;
3599 			ddi_dma_nextcookie(dma_handle->h, &cookie);
3600 		}
3601 	}
3602 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3603 
3604 	if (mss) {
3605 		do {
3606 			req--;
3607 			req->flags |= MXGEFW_FLAGS_TSO_LAST;
3608 		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3609 		    MXGEFW_FLAGS_FIRST)));
3610 	}
3611 
3612 	/* calculate tx stats */
3613 	if (mss) {
3614 		uint16_t opackets;
3615 		int payload;
3616 
3617 		payload = pkt_size - lso_hdr_size;
3618 		opackets = (payload / mss) + ((payload % mss) != 0);
3619 		tx_info[0].stat.un.all = 0;
3620 		tx_info[0].ostat.opackets = opackets;
3621 		tx_info[0].ostat.obytes = (opackets - 1) * lso_hdr_size
3622 		    + pkt_size;
3623 	} else {
3624 		myri10ge_tx_stat(&tx_info[0].stat,
3625 		    (struct ether_header *)(void *)mp->b_rptr, 1, pkt_size);
3626 	}
3627 	mutex_enter(&tx->lock);
3628 
3629 	/* check to see if the slots are really there */
3630 	avail = tx->mask - (tx->req - tx->done);
3631 	if (unlikely(avail <= count)) {
3632 		mutex_exit(&tx->lock);
3633 		err = 0;
3634 		goto late_stall;
3635 	}
3636 
3637 	myri10ge_send_locked(tx, req_list, tx_info, count);
3638 	mutex_exit(&tx->lock);
3639 	return (DDI_SUCCESS);
3640 
3641 late_stall:
3642 	try_pullup = 0;
3643 	atomic_inc_32(&tx->stall_late);
3644 
3645 abort_with_handles:
3646 	/* unbind and free handles from previous mblks */
3647 	for (i = 0; i < count; i++) {
3648 		bp = tx_info[i].m;
3649 		tx_info[i].m = 0;
3650 		if (bp) {
3651 			dma_handle = tx_info[i].handle;
3652 			(void) ddi_dma_unbind_handle(dma_handle->h);
3653 			dma_handle->next = handles;
3654 			handles = dma_handle;
3655 			tx_info[i].handle = NULL;
3656 			tx_info[i].m = NULL;
3657 		}
3658 	}
3659 	myri10ge_free_tx_handle_slist(tx, handles);
3660 pullup:
3661 	if (try_pullup) {
3662 		err = myri10ge_pullup(ss, mp);
3663 		if (err != DDI_SUCCESS && try_pullup == 2) {
3664 			/* drop */
3665 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3666 			freemsg(mp);
3667 			return (0);
3668 		}
3669 		try_pullup = 0;
3670 		goto again;
3671 	}
3672 
3673 stall:
3674 	if (err != 0) {
3675 		if (err == EBUSY) {
3676 			atomic_inc_32(&tx->stall);
3677 		} else {
3678 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3679 		}
3680 	}
3681 	return (err);
3682 }
3683 
3684 static mblk_t *
3685 myri10ge_send_wrapper(void *arg, mblk_t *mp)
3686 {
3687 	struct myri10ge_slice_state *ss = arg;
3688 	int err = 0;
3689 	mcp_kreq_ether_send_t *req_list;
3690 #if defined(__i386)
3691 	/*
3692 	 * We need about 2.5KB of scratch space to handle transmits.
3693 	 * i86pc has only 8KB of kernel stack space, so we malloc the
3694 	 * scratch space there rather than keeping it on the stack.
3695 	 */
3696 	size_t req_size, tx_info_size;
3697 	struct myri10ge_tx_buffer_state *tx_info;
3698 	caddr_t req_bytes;
3699 
3700 	req_size = sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3701 	    + 8;
3702 	req_bytes = kmem_alloc(req_size, KM_SLEEP);
3703 	tx_info_size = sizeof (*tx_info) * (MYRI10GE_MAX_SEND_DESC_TSO + 1);
3704 	tx_info = kmem_alloc(tx_info_size, KM_SLEEP);
3705 #else
3706 	char req_bytes[sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3707 	    + 8];
3708 	struct myri10ge_tx_buffer_state tx_info[MYRI10GE_MAX_SEND_DESC_TSO + 1];
3709 #endif
3710 
3711 	/* ensure req_list entries are aligned to 8 bytes */
3712 	req_list = (struct mcp_kreq_ether_send *)
3713 	    (((unsigned long)req_bytes + 7UL) & ~7UL);
3714 
3715 	err = myri10ge_send(ss, mp, req_list, tx_info);
3716 
3717 #if defined(__i386)
3718 	kmem_free(tx_info, tx_info_size);
3719 	kmem_free(req_bytes, req_size);
3720 #endif
3721 	if (err)
3722 		return (mp);
3723 	else
3724 		return (NULL);
3725 }
3726 
3727 static int
3728 myri10ge_addmac(void *arg, const uint8_t *mac_addr)
3729 {
3730 	struct myri10ge_priv *mgp = arg;
3731 	int err;
3732 
3733 	if (mac_addr == NULL)
3734 		return (EINVAL);
3735 
3736 	mutex_enter(&mgp->intrlock);
3737 	if (mgp->macaddr_cnt) {
3738 		mutex_exit(&mgp->intrlock);
3739 		return (ENOSPC);
3740 	}
3741 	err = myri10ge_m_unicst(mgp, mac_addr);
3742 	if (!err)
3743 		mgp->macaddr_cnt++;
3744 
3745 	mutex_exit(&mgp->intrlock);
3746 	if (err)
3747 		return (err);
3748 
3749 	bcopy(mac_addr, mgp->mac_addr, sizeof (mgp->mac_addr));
3750 	return (0);
3751 }
3752 
3753 /*ARGSUSED*/
3754 static int
3755 myri10ge_remmac(void *arg, const uint8_t *mac_addr)
3756 {
3757 	struct myri10ge_priv *mgp = arg;
3758 
3759 	mutex_enter(&mgp->intrlock);
3760 	mgp->macaddr_cnt--;
3761 	mutex_exit(&mgp->intrlock);
3762 
3763 	return (0);
3764 }
3765 
3766 /*ARGSUSED*/
3767 static void
3768 myri10ge_fill_group(void *arg, mac_ring_type_t rtype, const int index,
3769     mac_group_info_t *infop, mac_group_handle_t gh)
3770 {
3771 	struct myri10ge_priv *mgp = arg;
3772 
3773 	if (rtype != MAC_RING_TYPE_RX)
3774 		return;
3775 
3776 	infop->mgi_driver = (mac_group_driver_t)mgp;
3777 	infop->mgi_start = NULL;
3778 	infop->mgi_stop = NULL;
3779 	infop->mgi_addmac = myri10ge_addmac;
3780 	infop->mgi_remmac = myri10ge_remmac;
3781 	infop->mgi_count = mgp->num_slices;
3782 }
3783 
3784 static int
3785 myri10ge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
3786 {
3787 	struct myri10ge_slice_state *ss;
3788 
3789 	ss = (struct myri10ge_slice_state *)rh;
3790 	mutex_enter(&ss->rx_lock);
3791 	ss->rx_gen_num = mr_gen_num;
3792 	mutex_exit(&ss->rx_lock);
3793 	return (0);
3794 }
3795 
3796 /*
3797  * Retrieve a value for one of the statistics for a particular rx ring
3798  */
3799 int
3800 myri10ge_rx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3801 {
3802 	struct myri10ge_slice_state *ss;
3803 
3804 	ss = (struct myri10ge_slice_state *)rh;
3805 	switch (stat) {
3806 	case MAC_STAT_RBYTES:
3807 		*val = ss->rx_stats.ibytes;
3808 		break;
3809 
3810 	case MAC_STAT_IPACKETS:
3811 		*val = ss->rx_stats.ipackets;
3812 		break;
3813 
3814 	default:
3815 		*val = 0;
3816 		return (ENOTSUP);
3817 	}
3818 
3819 	return (0);
3820 }
3821 
3822 /*
3823  * Retrieve a value for one of the statistics for a particular tx ring
3824  */
3825 int
3826 myri10ge_tx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3827 {
3828 	struct myri10ge_slice_state *ss;
3829 
3830 	ss = (struct myri10ge_slice_state *)rh;
3831 	switch (stat) {
3832 	case MAC_STAT_OBYTES:
3833 		*val = ss->tx.stats.obytes;
3834 		break;
3835 
3836 	case MAC_STAT_OPACKETS:
3837 		*val = ss->tx.stats.opackets;
3838 		break;
3839 
3840 	default:
3841 		*val = 0;
3842 		return (ENOTSUP);
3843 	}
3844 
3845 	return (0);
3846 }
3847 
3848 static int
3849 myri10ge_rx_ring_intr_disable(mac_intr_handle_t intrh)
3850 {
3851 	struct myri10ge_slice_state *ss;
3852 
3853 	ss = (struct myri10ge_slice_state *)intrh;
3854 	mutex_enter(&ss->poll_lock);
3855 	ss->rx_polling = B_TRUE;
3856 	mutex_exit(&ss->poll_lock);
3857 	return (0);
3858 }
3859 
3860 static int
3861 myri10ge_rx_ring_intr_enable(mac_intr_handle_t intrh)
3862 {
3863 	struct myri10ge_slice_state *ss;
3864 
3865 	ss = (struct myri10ge_slice_state *)intrh;
3866 	mutex_enter(&ss->poll_lock);
3867 	ss->rx_polling = B_FALSE;
3868 	if (ss->rx_token) {
3869 		*ss->irq_claim = BE_32(3);
3870 		ss->rx_token = 0;
3871 	}
3872 	mutex_exit(&ss->poll_lock);
3873 	return (0);
3874 }
3875 
3876 /*ARGSUSED*/
3877 static void
3878 myri10ge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
3879     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
3880 {
3881 	struct myri10ge_priv *mgp = arg;
3882 	struct myri10ge_slice_state *ss;
3883 	mac_intr_t *mintr = &infop->mri_intr;
3884 
3885 	ASSERT((unsigned int)ring_index < mgp->num_slices);
3886 
3887 	ss = &mgp->ss[ring_index];
3888 	switch (rtype) {
3889 	case MAC_RING_TYPE_RX:
3890 		ss->rx_rh = rh;
3891 		infop->mri_driver = (mac_ring_driver_t)ss;
3892 		infop->mri_start = myri10ge_ring_start;
3893 		infop->mri_stop = NULL;
3894 		infop->mri_poll = myri10ge_poll_rx;
3895 		infop->mri_stat = myri10ge_rx_ring_stat;
3896 		mintr->mi_handle = (mac_intr_handle_t)ss;
3897 		mintr->mi_enable = myri10ge_rx_ring_intr_enable;
3898 		mintr->mi_disable = myri10ge_rx_ring_intr_disable;
3899 		break;
3900 	case MAC_RING_TYPE_TX:
3901 		ss->tx.rh = rh;
3902 		infop->mri_driver = (mac_ring_driver_t)ss;
3903 		infop->mri_start = NULL;
3904 		infop->mri_stop = NULL;
3905 		infop->mri_tx = myri10ge_send_wrapper;
3906 		infop->mri_stat = myri10ge_tx_ring_stat;
3907 		break;
3908 	default:
3909 		break;
3910 	}
3911 }
3912 
3913 static void
3914 myri10ge_nic_stat_destroy(struct myri10ge_priv *mgp)
3915 {
3916 	if (mgp->ksp_stat == NULL)
3917 		return;
3918 
3919 	kstat_delete(mgp->ksp_stat);
3920 	mgp->ksp_stat = NULL;
3921 }
3922 
3923 static void
3924 myri10ge_slice_stat_destroy(struct myri10ge_slice_state *ss)
3925 {
3926 	if (ss->ksp_stat == NULL)
3927 		return;
3928 
3929 	kstat_delete(ss->ksp_stat);
3930 	ss->ksp_stat = NULL;
3931 }
3932 
3933 static void
3934 myri10ge_info_destroy(struct myri10ge_priv *mgp)
3935 {
3936 	if (mgp->ksp_info == NULL)
3937 		return;
3938 
3939 	kstat_delete(mgp->ksp_info);
3940 	mgp->ksp_info = NULL;
3941 }
3942 
3943 static int
3944 myri10ge_nic_stat_kstat_update(kstat_t *ksp, int rw)
3945 {
3946 	struct myri10ge_nic_stat *ethstat;
3947 	struct myri10ge_priv *mgp;
3948 	mcp_irq_data_t *fw_stats;
3949 
3950 
3951 	if (rw == KSTAT_WRITE)
3952 		return (EACCES);
3953 
3954 	ethstat = (struct myri10ge_nic_stat *)ksp->ks_data;
3955 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3956 	fw_stats = mgp->ss[0].fw_stats;
3957 
3958 	ethstat->dma_read_bw_MBs.value.ul = mgp->read_dma;
3959 	ethstat->dma_write_bw_MBs.value.ul = mgp->write_dma;
3960 	ethstat->dma_read_write_bw_MBs.value.ul = mgp->read_write_dma;
3961 	if (myri10ge_tx_dma_attr.dma_attr_flags & DDI_DMA_FORCE_PHYSICAL)
3962 		ethstat->dma_force_physical.value.ul = 1;
3963 	else
3964 		ethstat->dma_force_physical.value.ul = 0;
3965 	ethstat->lanes.value.ul = mgp->pcie_link_width;
3966 	ethstat->dropped_bad_crc32.value.ul =
3967 	    ntohl(fw_stats->dropped_bad_crc32);
3968 	ethstat->dropped_bad_phy.value.ul =
3969 	    ntohl(fw_stats->dropped_bad_phy);
3970 	ethstat->dropped_link_error_or_filtered.value.ul =
3971 	    ntohl(fw_stats->dropped_link_error_or_filtered);
3972 	ethstat->dropped_link_overflow.value.ul =
3973 	    ntohl(fw_stats->dropped_link_overflow);
3974 	ethstat->dropped_multicast_filtered.value.ul =
3975 	    ntohl(fw_stats->dropped_multicast_filtered);
3976 	ethstat->dropped_no_big_buffer.value.ul =
3977 	    ntohl(fw_stats->dropped_no_big_buffer);
3978 	ethstat->dropped_no_small_buffer.value.ul =
3979 	    ntohl(fw_stats->dropped_no_small_buffer);
3980 	ethstat->dropped_overrun.value.ul =
3981 	    ntohl(fw_stats->dropped_overrun);
3982 	ethstat->dropped_pause.value.ul =
3983 	    ntohl(fw_stats->dropped_pause);
3984 	ethstat->dropped_runt.value.ul =
3985 	    ntohl(fw_stats->dropped_runt);
3986 	ethstat->link_up.value.ul =
3987 	    ntohl(fw_stats->link_up);
3988 	ethstat->dropped_unicast_filtered.value.ul =
3989 	    ntohl(fw_stats->dropped_unicast_filtered);
3990 	return (0);
3991 }
3992 
3993 static int
3994 myri10ge_slice_stat_kstat_update(kstat_t *ksp, int rw)
3995 {
3996 	struct myri10ge_slice_stat *ethstat;
3997 	struct myri10ge_slice_state *ss;
3998 
3999 	if (rw == KSTAT_WRITE)
4000 		return (EACCES);
4001 
4002 	ethstat = (struct myri10ge_slice_stat *)ksp->ks_data;
4003 	ss = (struct myri10ge_slice_state *)ksp->ks_private;
4004 
4005 	ethstat->rx_big.value.ul = ss->j_rx_cnt;
4006 	ethstat->rx_bigbuf_firmware.value.ul = ss->rx_big.cnt - ss->j_rx_cnt;
4007 	ethstat->rx_bigbuf_pool.value.ul =
4008 	    ss->jpool.num_alloc - ss->jbufs_for_smalls;
4009 	ethstat->rx_bigbuf_smalls.value.ul = ss->jbufs_for_smalls;
4010 	ethstat->rx_small.value.ul = ss->rx_small.cnt -
4011 	    (ss->rx_small.mask + 1);
4012 	ethstat->tx_done.value.ul = ss->tx.done;
4013 	ethstat->tx_req.value.ul = ss->tx.req;
4014 	ethstat->tx_activate.value.ul = ss->tx.activate;
4015 	ethstat->xmit_sched.value.ul = ss->tx.sched;
4016 	ethstat->xmit_stall.value.ul = ss->tx.stall;
4017 	ethstat->xmit_stall_early.value.ul = ss->tx.stall_early;
4018 	ethstat->xmit_stall_late.value.ul = ss->tx.stall_late;
4019 	ethstat->xmit_err.value.ul =  MYRI10GE_SLICE_STAT(xmit_err);
4020 	return (0);
4021 }
4022 
4023 static int
4024 myri10ge_info_kstat_update(kstat_t *ksp, int rw)
4025 {
4026 	struct myri10ge_info *info;
4027 	struct myri10ge_priv *mgp;
4028 
4029 
4030 	if (rw == KSTAT_WRITE)
4031 		return (EACCES);
4032 
4033 	info = (struct myri10ge_info *)ksp->ks_data;
4034 	mgp = (struct myri10ge_priv *)ksp->ks_private;
4035 	kstat_named_setstr(&info->driver_version, MYRI10GE_VERSION_STR);
4036 	kstat_named_setstr(&info->firmware_version, mgp->fw_version);
4037 	kstat_named_setstr(&info->firmware_name, mgp->fw_name);
4038 	kstat_named_setstr(&info->interrupt_type, mgp->intr_type);
4039 	kstat_named_setstr(&info->product_code, mgp->pc_str);
4040 	kstat_named_setstr(&info->serial_number, mgp->sn_str);
4041 	return (0);
4042 }
4043 
4044 static struct myri10ge_info myri10ge_info_template = {
4045 	{ "driver_version",	KSTAT_DATA_STRING },
4046 	{ "firmware_version",	KSTAT_DATA_STRING },
4047 	{ "firmware_name",	KSTAT_DATA_STRING },
4048 	{ "interrupt_type",	KSTAT_DATA_STRING },
4049 	{ "product_code",	KSTAT_DATA_STRING },
4050 	{ "serial_number",	KSTAT_DATA_STRING },
4051 };
4052 static kmutex_t myri10ge_info_template_lock;
4053 
4054 
4055 static int
4056 myri10ge_info_init(struct myri10ge_priv *mgp)
4057 {
4058 	struct kstat *ksp;
4059 
4060 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4061 	    "myri10ge_info", "net", KSTAT_TYPE_NAMED,
4062 	    sizeof (myri10ge_info_template) /
4063 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4064 	if (ksp == NULL) {
4065 		cmn_err(CE_WARN,
4066 		    "%s: myri10ge_info_init: kstat_create failed", mgp->name);
4067 		return (DDI_FAILURE);
4068 	}
4069 	mgp->ksp_info = ksp;
4070 	ksp->ks_update = myri10ge_info_kstat_update;
4071 	ksp->ks_private = (void *) mgp;
4072 	ksp->ks_data = &myri10ge_info_template;
4073 	ksp->ks_lock = &myri10ge_info_template_lock;
4074 	if (MYRI10GE_VERSION_STR != NULL)
4075 		ksp->ks_data_size += strlen(MYRI10GE_VERSION_STR) + 1;
4076 	if (mgp->fw_version != NULL)
4077 		ksp->ks_data_size += strlen(mgp->fw_version) + 1;
4078 	ksp->ks_data_size += strlen(mgp->fw_name) + 1;
4079 	ksp->ks_data_size += strlen(mgp->intr_type) + 1;
4080 	if (mgp->pc_str != NULL)
4081 		ksp->ks_data_size += strlen(mgp->pc_str) + 1;
4082 	if (mgp->sn_str != NULL)
4083 		ksp->ks_data_size += strlen(mgp->sn_str) + 1;
4084 
4085 	kstat_install(ksp);
4086 	return (DDI_SUCCESS);
4087 }
4088 
4089 
4090 static int
4091 myri10ge_nic_stat_init(struct myri10ge_priv *mgp)
4092 {
4093 	struct kstat *ksp;
4094 	struct myri10ge_nic_stat *ethstat;
4095 
4096 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4097 	    "myri10ge_nic_stats", "net", KSTAT_TYPE_NAMED,
4098 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4099 	if (ksp == NULL) {
4100 		cmn_err(CE_WARN,
4101 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4102 		return (DDI_FAILURE);
4103 	}
4104 	mgp->ksp_stat = ksp;
4105 	ethstat = (struct myri10ge_nic_stat *)(ksp->ks_data);
4106 
4107 	kstat_named_init(&ethstat->dma_read_bw_MBs,
4108 	    "dma_read_bw_MBs", KSTAT_DATA_ULONG);
4109 	kstat_named_init(&ethstat->dma_write_bw_MBs,
4110 	    "dma_write_bw_MBs", KSTAT_DATA_ULONG);
4111 	kstat_named_init(&ethstat->dma_read_write_bw_MBs,
4112 	    "dma_read_write_bw_MBs", KSTAT_DATA_ULONG);
4113 	kstat_named_init(&ethstat->dma_force_physical,
4114 	    "dma_force_physical", KSTAT_DATA_ULONG);
4115 	kstat_named_init(&ethstat->lanes,
4116 	    "lanes", KSTAT_DATA_ULONG);
4117 	kstat_named_init(&ethstat->dropped_bad_crc32,
4118 	    "dropped_bad_crc32", KSTAT_DATA_ULONG);
4119 	kstat_named_init(&ethstat->dropped_bad_phy,
4120 	    "dropped_bad_phy", KSTAT_DATA_ULONG);
4121 	kstat_named_init(&ethstat->dropped_link_error_or_filtered,
4122 	    "dropped_link_error_or_filtered", KSTAT_DATA_ULONG);
4123 	kstat_named_init(&ethstat->dropped_link_overflow,
4124 	    "dropped_link_overflow", KSTAT_DATA_ULONG);
4125 	kstat_named_init(&ethstat->dropped_multicast_filtered,
4126 	    "dropped_multicast_filtered", KSTAT_DATA_ULONG);
4127 	kstat_named_init(&ethstat->dropped_no_big_buffer,
4128 	    "dropped_no_big_buffer", KSTAT_DATA_ULONG);
4129 	kstat_named_init(&ethstat->dropped_no_small_buffer,
4130 	    "dropped_no_small_buffer", KSTAT_DATA_ULONG);
4131 	kstat_named_init(&ethstat->dropped_overrun,
4132 	    "dropped_overrun", KSTAT_DATA_ULONG);
4133 	kstat_named_init(&ethstat->dropped_pause,
4134 	    "dropped_pause", KSTAT_DATA_ULONG);
4135 	kstat_named_init(&ethstat->dropped_runt,
4136 	    "dropped_runt", KSTAT_DATA_ULONG);
4137 	kstat_named_init(&ethstat->dropped_unicast_filtered,
4138 	    "dropped_unicast_filtered", KSTAT_DATA_ULONG);
4139 	kstat_named_init(&ethstat->dropped_runt, "dropped_runt",
4140 	    KSTAT_DATA_ULONG);
4141 	kstat_named_init(&ethstat->link_up, "link_up", KSTAT_DATA_ULONG);
4142 	kstat_named_init(&ethstat->link_changes, "link_changes",
4143 	    KSTAT_DATA_ULONG);
4144 	ksp->ks_update = myri10ge_nic_stat_kstat_update;
4145 	ksp->ks_private = (void *) mgp;
4146 	kstat_install(ksp);
4147 	return (DDI_SUCCESS);
4148 }
4149 
4150 static int
4151 myri10ge_slice_stat_init(struct myri10ge_slice_state *ss)
4152 {
4153 	struct myri10ge_priv *mgp = ss->mgp;
4154 	struct kstat *ksp;
4155 	struct myri10ge_slice_stat *ethstat;
4156 	int instance;
4157 
4158 	/*
4159 	 * fake an instance so that the same slice numbers from
4160 	 * different instances do not collide
4161 	 */
4162 	instance = (ddi_get_instance(mgp->dip) * 1000) +  (int)(ss - mgp->ss);
4163 	ksp = kstat_create("myri10ge", instance,
4164 	    "myri10ge_slice_stats", "net", KSTAT_TYPE_NAMED,
4165 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4166 	if (ksp == NULL) {
4167 		cmn_err(CE_WARN,
4168 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4169 		return (DDI_FAILURE);
4170 	}
4171 	ss->ksp_stat = ksp;
4172 	ethstat = (struct myri10ge_slice_stat *)(ksp->ks_data);
4173 	kstat_named_init(&ethstat->lro_bad_csum, "lro_bad_csum",
4174 	    KSTAT_DATA_ULONG);
4175 	kstat_named_init(&ethstat->lro_flushed, "lro_flushed",
4176 	    KSTAT_DATA_ULONG);
4177 	kstat_named_init(&ethstat->lro_queued, "lro_queued",
4178 	    KSTAT_DATA_ULONG);
4179 	kstat_named_init(&ethstat->rx_bigbuf_firmware, "rx_bigbuf_firmware",
4180 	    KSTAT_DATA_ULONG);
4181 	kstat_named_init(&ethstat->rx_bigbuf_pool, "rx_bigbuf_pool",
4182 	    KSTAT_DATA_ULONG);
4183 	kstat_named_init(&ethstat->rx_bigbuf_smalls, "rx_bigbuf_smalls",
4184 	    KSTAT_DATA_ULONG);
4185 	kstat_named_init(&ethstat->rx_copy, "rx_copy",
4186 	    KSTAT_DATA_ULONG);
4187 	kstat_named_init(&ethstat->rx_big_nobuf, "rx_big_nobuf",
4188 	    KSTAT_DATA_ULONG);
4189 	kstat_named_init(&ethstat->rx_small_nobuf, "rx_small_nobuf",
4190 	    KSTAT_DATA_ULONG);
4191 	kstat_named_init(&ethstat->xmit_zero_len, "xmit_zero_len",
4192 	    KSTAT_DATA_ULONG);
4193 	kstat_named_init(&ethstat->xmit_pullup, "xmit_pullup",
4194 	    KSTAT_DATA_ULONG);
4195 	kstat_named_init(&ethstat->xmit_pullup_first, "xmit_pullup_first",
4196 	    KSTAT_DATA_ULONG);
4197 	kstat_named_init(&ethstat->xmit_lowbuf, "xmit_lowbuf",
4198 	    KSTAT_DATA_ULONG);
4199 	kstat_named_init(&ethstat->xmit_lsobadflags, "xmit_lsobadflags",
4200 	    KSTAT_DATA_ULONG);
4201 	kstat_named_init(&ethstat->xmit_sched, "xmit_sched",
4202 	    KSTAT_DATA_ULONG);
4203 	kstat_named_init(&ethstat->xmit_stall, "xmit_stall",
4204 	    KSTAT_DATA_ULONG);
4205 	kstat_named_init(&ethstat->xmit_stall_early, "xmit_stall_early",
4206 	    KSTAT_DATA_ULONG);
4207 	kstat_named_init(&ethstat->xmit_stall_late, "xmit_stall_late",
4208 	    KSTAT_DATA_ULONG);
4209 	kstat_named_init(&ethstat->xmit_err, "xmit_err",
4210 	    KSTAT_DATA_ULONG);
4211 	kstat_named_init(&ethstat->tx_req, "tx_req",
4212 	    KSTAT_DATA_ULONG);
4213 	kstat_named_init(&ethstat->tx_activate, "tx_activate",
4214 	    KSTAT_DATA_ULONG);
4215 	kstat_named_init(&ethstat->tx_done, "tx_done",
4216 	    KSTAT_DATA_ULONG);
4217 	kstat_named_init(&ethstat->tx_handles_alloced, "tx_handles_alloced",
4218 	    KSTAT_DATA_ULONG);
4219 	kstat_named_init(&ethstat->rx_big, "rx_big",
4220 	    KSTAT_DATA_ULONG);
4221 	kstat_named_init(&ethstat->rx_small, "rx_small",
4222 	    KSTAT_DATA_ULONG);
4223 	ksp->ks_update = myri10ge_slice_stat_kstat_update;
4224 	ksp->ks_private = (void *) ss;
4225 	kstat_install(ksp);
4226 	return (DDI_SUCCESS);
4227 }
4228 
4229 
4230 
4231 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4232 
4233 #include <vm/hat.h>
4234 #include <sys/ddi_isa.h>
4235 void *device_arena_alloc(size_t size, int vm_flag);
4236 void device_arena_free(void *vaddr, size_t size);
4237 
4238 static void
4239 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4240 {
4241 	dev_info_t *parent_dip;
4242 	ddi_acc_handle_t handle;
4243 	unsigned long bus_number, dev_number, func_number;
4244 	unsigned long cfg_pa, paddr, base, pgoffset;
4245 	char		*cvaddr, *ptr;
4246 	uint32_t	*ptr32;
4247 	int		retval = DDI_FAILURE;
4248 	int dontcare;
4249 	uint16_t read_vid, read_did, vendor_id, device_id;
4250 
4251 	if (!myri10ge_nvidia_ecrc_enable)
4252 		return;
4253 
4254 	parent_dip = ddi_get_parent(mgp->dip);
4255 	if (parent_dip == NULL) {
4256 		cmn_err(CE_WARN, "%s: I'm an orphan?", mgp->name);
4257 		return;
4258 	}
4259 
4260 	if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
4261 		cmn_err(CE_WARN,
4262 		    "%s: Could not access my parent's registers", mgp->name);
4263 		return;
4264 	}
4265 
4266 	vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
4267 	device_id = pci_config_get16(handle, PCI_CONF_DEVID);
4268 	pci_config_teardown(&handle);
4269 
4270 	if (myri10ge_verbose) {
4271 		unsigned long	bus_number, dev_number, func_number;
4272 		int		reg_set, span;
4273 		(void) myri10ge_reg_set(parent_dip, &reg_set, &span,
4274 		    &bus_number, &dev_number, &func_number);
4275 		if (myri10ge_verbose)
4276 			printf("%s: parent at %ld:%ld:%ld\n", mgp->name,
4277 			    bus_number, dev_number, func_number);
4278 	}
4279 
4280 	if (vendor_id !=  0x10de)
4281 		return;
4282 
4283 	if (device_id != 0x005d /* CK804 */ &&
4284 	    (device_id < 0x374 || device_id > 0x378) /* MCP55 */) {
4285 		return;
4286 	}
4287 	(void) myri10ge_reg_set(parent_dip, &dontcare, &dontcare,
4288 	    &bus_number, &dev_number, &func_number);
4289 
4290 	for (cfg_pa = 0xf0000000UL;
4291 	    retval != DDI_SUCCESS && cfg_pa >= 0xe0000000UL;
4292 	    cfg_pa -= 0x10000000UL) {
4293 		/* find the config space address for the nvidia bridge */
4294 		paddr = (cfg_pa + bus_number * 0x00100000UL +
4295 		    (dev_number * 8 + func_number) * 0x00001000UL);
4296 
4297 		base = paddr & (~MMU_PAGEOFFSET);
4298 		pgoffset = paddr & MMU_PAGEOFFSET;
4299 
4300 		/* map it into the kernel */
4301 		cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
4302 		if (cvaddr == NULL)
4303 			cmn_err(CE_WARN, "%s: failed to map nf4: cvaddr\n",
4304 			    mgp->name);
4305 
4306 		hat_devload(kas.a_hat, cvaddr, mmu_ptob(1),
4307 		    i_ddi_paddr_to_pfn(base),
4308 		    PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
4309 
4310 		ptr = cvaddr + pgoffset;
4311 		read_vid = *(uint16_t *)(void *)(ptr + PCI_CONF_VENID);
4312 		read_did = *(uint16_t *)(void *)(ptr + PCI_CONF_DEVID);
4313 		if (vendor_id ==  read_did || device_id == read_did) {
4314 			ptr32 = (uint32_t *)(void *)(ptr + 0x178);
4315 			if (myri10ge_verbose)
4316 				printf("%s: Enabling ECRC on upstream "
4317 				    "Nvidia bridge (0x%x:0x%x) "
4318 				    "at %ld:%ld:%ld\n", mgp->name,
4319 				    read_vid, read_did, bus_number,
4320 				    dev_number, func_number);
4321 			*ptr32 |= 0x40;
4322 			retval = DDI_SUCCESS;
4323 		}
4324 		hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
4325 		device_arena_free(cvaddr, ptob(1));
4326 	}
4327 }
4328 
4329 #else
4330 /*ARGSUSED*/
4331 static void
4332 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4333 {
4334 }
4335 #endif /* i386 */
4336 
4337 
4338 /*
4339  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
4340  * when the PCI-E Completion packets are aligned on an 8-byte
4341  * boundary.  Some PCI-E chip sets always align Completion packets; on
4342  * the ones that do not, the alignment can be enforced by enabling
4343  * ECRC generation (if supported).
4344  *
4345  * When PCI-E Completion packets are not aligned, it is actually more
4346  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
4347  *
4348  * If the driver can neither enable ECRC nor verify that it has
4349  * already been enabled, then it must use a firmware image which works
4350  * around unaligned completion packets (ethp_z8e.dat), and it should
4351  * also ensure that it never gives the device a Read-DMA which is
4352  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
4353  * enabled, then the driver should use the aligned (eth_z8e.dat)
4354  * firmware image, and set tx.boundary to 4KB.
4355  */
4356 
4357 
4358 static int
4359 myri10ge_firmware_probe(struct myri10ge_priv *mgp)
4360 {
4361 	int status;
4362 
4363 	mgp->tx_boundary = 4096;
4364 	/*
4365 	 * Verify the max read request size was set to 4KB
4366 	 * before trying the test with 4KB.
4367 	 */
4368 	if (mgp->max_read_request_4k == 0)
4369 		mgp->tx_boundary = 2048;
4370 	/*
4371 	 * load the optimized firmware which assumes aligned PCIe
4372 	 * completions in order to see if it works on this host.
4373 	 */
4374 
4375 	mgp->fw_name = "rss_eth_z8e";
4376 	mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4377 	mgp->eth_z8e_length = rss_eth_z8e_length;
4378 
4379 	status = myri10ge_load_firmware(mgp);
4380 	if (status != 0) {
4381 		return (status);
4382 	}
4383 	/*
4384 	 * Enable ECRC if possible
4385 	 */
4386 	myri10ge_enable_nvidia_ecrc(mgp);
4387 
4388 	/*
4389 	 * Run a DMA test which watches for unaligned completions and
4390 	 * aborts on the first one seen.
4391 	 */
4392 	status = myri10ge_dma_test(mgp, MXGEFW_CMD_UNALIGNED_TEST);
4393 	if (status == 0)
4394 		return (0); /* keep the aligned firmware */
4395 
4396 	if (status != E2BIG)
4397 		cmn_err(CE_WARN, "%s: DMA test failed: %d\n",
4398 		    mgp->name, status);
4399 	if (status == ENOSYS)
4400 		cmn_err(CE_WARN, "%s: Falling back to ethp! "
4401 		    "Please install up to date fw\n", mgp->name);
4402 	return (status);
4403 }
4404 
4405 static int
4406 myri10ge_select_firmware(struct myri10ge_priv *mgp)
4407 {
4408 	int aligned;
4409 
4410 	aligned = 0;
4411 
4412 	if (myri10ge_force_firmware == 1) {
4413 		if (myri10ge_verbose)
4414 			printf("%s: Assuming aligned completions (forced)\n",
4415 			    mgp->name);
4416 		aligned = 1;
4417 		goto done;
4418 	}
4419 
4420 	if (myri10ge_force_firmware == 2) {
4421 		if (myri10ge_verbose)
4422 			printf("%s: Assuming unaligned completions (forced)\n",
4423 			    mgp->name);
4424 		aligned = 0;
4425 		goto done;
4426 	}
4427 
4428 	/* If the width is less than 8, we may used the aligned firmware */
4429 	if (mgp->pcie_link_width != 0 && mgp->pcie_link_width < 8) {
4430 		cmn_err(CE_WARN, "!%s: PCIe link running at x%d\n",
4431 		    mgp->name, mgp->pcie_link_width);
4432 		aligned = 1;
4433 		goto done;
4434 	}
4435 
4436 	if (0 == myri10ge_firmware_probe(mgp))
4437 		return (0);  /* keep optimized firmware */
4438 
4439 done:
4440 	if (aligned) {
4441 		mgp->fw_name = "rss_eth_z8e";
4442 		mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4443 		mgp->eth_z8e_length = rss_eth_z8e_length;
4444 		mgp->tx_boundary = 4096;
4445 	} else {
4446 		mgp->fw_name = "rss_ethp_z8e";
4447 		mgp->eth_z8e = (unsigned char *)rss_ethp_z8e;
4448 		mgp->eth_z8e_length = rss_ethp_z8e_length;
4449 		mgp->tx_boundary = 2048;
4450 	}
4451 
4452 	return (myri10ge_load_firmware(mgp));
4453 }
4454 
4455 static int
4456 myri10ge_add_intrs(struct myri10ge_priv *mgp, int add_handler)
4457 {
4458 	dev_info_t *devinfo = mgp->dip;
4459 	int count, avail, actual, intr_types;
4460 	int x, y, rc, inum = 0;
4461 
4462 
4463 	rc = ddi_intr_get_supported_types(devinfo, &intr_types);
4464 	if (rc != DDI_SUCCESS) {
4465 		cmn_err(CE_WARN,
4466 		    "!%s: ddi_intr_get_nintrs() failure, rc = %d\n", mgp->name,
4467 		    rc);
4468 		return (DDI_FAILURE);
4469 	}
4470 
4471 	if (!myri10ge_use_msi)
4472 		intr_types &= ~DDI_INTR_TYPE_MSI;
4473 	if (!myri10ge_use_msix)
4474 		intr_types &= ~DDI_INTR_TYPE_MSIX;
4475 
4476 	if (intr_types & DDI_INTR_TYPE_MSIX) {
4477 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSIX;
4478 		mgp->intr_type = "MSI-X";
4479 	} else if (intr_types & DDI_INTR_TYPE_MSI) {
4480 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSI;
4481 		mgp->intr_type = "MSI";
4482 	} else {
4483 		mgp->ddi_intr_type = DDI_INTR_TYPE_FIXED;
4484 		mgp->intr_type = "Legacy";
4485 	}
4486 	/* Get number of interrupts */
4487 	rc = ddi_intr_get_nintrs(devinfo, mgp->ddi_intr_type, &count);
4488 	if ((rc != DDI_SUCCESS) || (count == 0)) {
4489 		cmn_err(CE_WARN, "%s: ddi_intr_get_nintrs() failure, rc: %d, "
4490 		    "count: %d", mgp->name, rc, count);
4491 
4492 		return (DDI_FAILURE);
4493 	}
4494 
4495 	/* Get number of available interrupts */
4496 	rc = ddi_intr_get_navail(devinfo, mgp->ddi_intr_type, &avail);
4497 	if ((rc != DDI_SUCCESS) || (avail == 0)) {
4498 		cmn_err(CE_WARN, "%s: ddi_intr_get_navail() failure, "
4499 		    "rc: %d, avail: %d\n", mgp->name, rc, avail);
4500 		return (DDI_FAILURE);
4501 	}
4502 	if (avail < count) {
4503 		cmn_err(CE_NOTE,
4504 		    "!%s: nintrs() returned %d, navail returned %d",
4505 		    mgp->name, count, avail);
4506 		count = avail;
4507 	}
4508 
4509 	if (count < mgp->num_slices)
4510 		return (DDI_FAILURE);
4511 
4512 	if (count > mgp->num_slices)
4513 		count = mgp->num_slices;
4514 
4515 	/* Allocate memory for MSI interrupts */
4516 	mgp->intr_size = count * sizeof (ddi_intr_handle_t);
4517 	mgp->htable = kmem_alloc(mgp->intr_size, KM_SLEEP);
4518 
4519 	rc = ddi_intr_alloc(devinfo, mgp->htable, mgp->ddi_intr_type, inum,
4520 	    count, &actual, DDI_INTR_ALLOC_NORMAL);
4521 
4522 	if ((rc != DDI_SUCCESS) || (actual == 0)) {
4523 		cmn_err(CE_WARN, "%s: ddi_intr_alloc() failed: %d",
4524 		    mgp->name, rc);
4525 
4526 		kmem_free(mgp->htable, mgp->intr_size);
4527 		mgp->htable = NULL;
4528 		return (DDI_FAILURE);
4529 	}
4530 
4531 	if ((actual < count) && myri10ge_verbose) {
4532 		cmn_err(CE_NOTE, "%s: got %d/%d slices",
4533 		    mgp->name, actual, count);
4534 	}
4535 
4536 	mgp->intr_cnt = actual;
4537 
4538 	/*
4539 	 * Get priority for first irq, assume remaining are all the same
4540 	 */
4541 	if (ddi_intr_get_pri(mgp->htable[0], &mgp->intr_pri)
4542 	    != DDI_SUCCESS) {
4543 		cmn_err(CE_WARN, "%s: ddi_intr_get_pri() failed", mgp->name);
4544 
4545 		/* Free already allocated intr */
4546 		for (y = 0; y < actual; y++) {
4547 			(void) ddi_intr_free(mgp->htable[y]);
4548 		}
4549 
4550 		kmem_free(mgp->htable, mgp->intr_size);
4551 		mgp->htable = NULL;
4552 		return (DDI_FAILURE);
4553 	}
4554 
4555 	mgp->icookie = (void *)(uintptr_t)mgp->intr_pri;
4556 
4557 	if (!add_handler)
4558 		return (DDI_SUCCESS);
4559 
4560 	/* Call ddi_intr_add_handler() */
4561 	for (x = 0; x < actual; x++) {
4562 		if (ddi_intr_add_handler(mgp->htable[x], myri10ge_intr,
4563 		    (caddr_t)&mgp->ss[x], NULL) != DDI_SUCCESS) {
4564 			cmn_err(CE_WARN, "%s: ddi_intr_add_handler() failed",
4565 			    mgp->name);
4566 
4567 			/* Free already allocated intr */
4568 			for (y = 0; y < actual; y++) {
4569 				(void) ddi_intr_free(mgp->htable[y]);
4570 			}
4571 
4572 			kmem_free(mgp->htable, mgp->intr_size);
4573 			mgp->htable = NULL;
4574 			return (DDI_FAILURE);
4575 		}
4576 	}
4577 
4578 	(void) ddi_intr_get_cap(mgp->htable[0], &mgp->intr_cap);
4579 	if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4580 		/* Call ddi_intr_block_enable() for MSI */
4581 		(void) ddi_intr_block_enable(mgp->htable, mgp->intr_cnt);
4582 	} else {
4583 		/* Call ddi_intr_enable() for MSI non block enable */
4584 		for (x = 0; x < mgp->intr_cnt; x++) {
4585 			(void) ddi_intr_enable(mgp->htable[x]);
4586 		}
4587 	}
4588 
4589 	return (DDI_SUCCESS);
4590 }
4591 
4592 static void
4593 myri10ge_rem_intrs(struct myri10ge_priv *mgp, int handler_installed)
4594 {
4595 	int x, err;
4596 
4597 	/* Disable all interrupts */
4598 	if (handler_installed) {
4599 		if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4600 			/* Call ddi_intr_block_disable() */
4601 			(void) ddi_intr_block_disable(mgp->htable,
4602 			    mgp->intr_cnt);
4603 		} else {
4604 			for (x = 0; x < mgp->intr_cnt; x++) {
4605 				(void) ddi_intr_disable(mgp->htable[x]);
4606 			}
4607 		}
4608 	}
4609 
4610 	for (x = 0; x < mgp->intr_cnt; x++) {
4611 		if (handler_installed) {
4612 		/* Call ddi_intr_remove_handler() */
4613 			err = ddi_intr_remove_handler(mgp->htable[x]);
4614 			if (err != DDI_SUCCESS) {
4615 				cmn_err(CE_WARN,
4616 				    "%s: ddi_intr_remove_handler for"
4617 				    "vec %d returned %d\n", mgp->name,
4618 				    x, err);
4619 			}
4620 		}
4621 		err = ddi_intr_free(mgp->htable[x]);
4622 		if (err != DDI_SUCCESS) {
4623 			cmn_err(CE_WARN,
4624 			    "%s: ddi_intr_free for vec %d returned %d\n",
4625 			    mgp->name, x, err);
4626 		}
4627 	}
4628 	kmem_free(mgp->htable, mgp->intr_size);
4629 	mgp->htable = NULL;
4630 }
4631 
4632 static void
4633 myri10ge_test_physical(dev_info_t *dip)
4634 {
4635 	ddi_dma_handle_t	handle;
4636 	struct myri10ge_dma_stuff dma;
4637 	void *addr;
4638 	int err;
4639 
4640 	/* test #1, sufficient for older sparc systems */
4641 	myri10ge_tx_dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
4642 	err = ddi_dma_alloc_handle(dip, &myri10ge_tx_dma_attr,
4643 	    DDI_DMA_DONTWAIT, NULL, &handle);
4644 	if (err == DDI_DMA_BADATTR)
4645 		goto fail;
4646 	ddi_dma_free_handle(&handle);
4647 
4648 	/* test #2, required on Olympis where the bind is what fails */
4649 	addr = myri10ge_dma_alloc(dip, 128, &myri10ge_tx_dma_attr,
4650 	    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
4651 	    DDI_DMA_WRITE|DDI_DMA_STREAMING, &dma, 0, DDI_DMA_DONTWAIT);
4652 	if (addr == NULL)
4653 		goto fail;
4654 	myri10ge_dma_free(&dma);
4655 	return;
4656 
4657 fail:
4658 	if (myri10ge_verbose)
4659 		printf("myri10ge%d: DDI_DMA_FORCE_PHYSICAL failed, "
4660 		    "using IOMMU\n", ddi_get_instance(dip));
4661 
4662 	myri10ge_tx_dma_attr.dma_attr_flags &= ~DDI_DMA_FORCE_PHYSICAL;
4663 }
4664 
4665 static void
4666 myri10ge_get_props(dev_info_t *dip)
4667 {
4668 
4669 	myri10ge_flow_control =  ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4670 	    "myri10ge_flow_control", myri10ge_flow_control);
4671 
4672 	myri10ge_intr_coal_delay = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4673 	    "myri10ge_intr_coal_delay", myri10ge_intr_coal_delay);
4674 
4675 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4676 	myri10ge_nvidia_ecrc_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4677 	    "myri10ge_nvidia_ecrc_enable", 1);
4678 #endif
4679 
4680 
4681 	myri10ge_use_msi = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4682 	    "myri10ge_use_msi", myri10ge_use_msi);
4683 
4684 	myri10ge_deassert_wait = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4685 	    "myri10ge_deassert_wait",  myri10ge_deassert_wait);
4686 
4687 	myri10ge_verbose = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4688 	    "myri10ge_verbose", myri10ge_verbose);
4689 
4690 	myri10ge_tx_copylen = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4691 	    "myri10ge_tx_copylen", myri10ge_tx_copylen);
4692 
4693 	if (myri10ge_tx_copylen < 60) {
4694 		cmn_err(CE_WARN,
4695 		    "myri10ge_tx_copylen must be >= 60 bytes\n");
4696 		myri10ge_tx_copylen = 60;
4697 	}
4698 
4699 	myri10ge_mtu_override = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4700 	    "myri10ge_mtu_override", myri10ge_mtu_override);
4701 
4702 	if (myri10ge_mtu_override >= MYRI10GE_MIN_GLD_MTU &&
4703 	    myri10ge_mtu_override <= MYRI10GE_MAX_GLD_MTU)
4704 		myri10ge_mtu = myri10ge_mtu_override +
4705 		    sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ;
4706 	else if (myri10ge_mtu_override != 0) {
4707 		cmn_err(CE_WARN,
4708 		    "myri10ge_mtu_override must be between 1500 and "
4709 		    "9000 bytes\n");
4710 	}
4711 
4712 	myri10ge_bigbufs_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4713 	    "myri10ge_bigbufs_initial", myri10ge_bigbufs_initial);
4714 	myri10ge_bigbufs_max = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4715 	    "myri10ge_bigbufs_max", myri10ge_bigbufs_max);
4716 
4717 	myri10ge_watchdog_reset = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4718 	    "myri10ge_watchdog_reset", myri10ge_watchdog_reset);
4719 
4720 	if (myri10ge_bigbufs_initial < 128) {
4721 		cmn_err(CE_WARN,
4722 		    "myri10ge_bigbufs_initial be at least 128\n");
4723 		myri10ge_bigbufs_initial = 128;
4724 	}
4725 	if (myri10ge_bigbufs_max < 128) {
4726 		cmn_err(CE_WARN,
4727 		    "myri10ge_bigbufs_max be at least 128\n");
4728 		myri10ge_bigbufs_max = 128;
4729 	}
4730 
4731 	if (myri10ge_bigbufs_max < myri10ge_bigbufs_initial) {
4732 		cmn_err(CE_WARN,
4733 		    "myri10ge_bigbufs_max must be >=  "
4734 		    "myri10ge_bigbufs_initial\n");
4735 		myri10ge_bigbufs_max = myri10ge_bigbufs_initial;
4736 	}
4737 
4738 	myri10ge_force_firmware = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4739 	    "myri10ge_force_firmware", myri10ge_force_firmware);
4740 
4741 	myri10ge_max_slices = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4742 	    "myri10ge_max_slices", myri10ge_max_slices);
4743 
4744 	myri10ge_use_msix = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4745 	    "myri10ge_use_msix", myri10ge_use_msix);
4746 
4747 	myri10ge_rss_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4748 	    "myri10ge_rss_hash", myri10ge_rss_hash);
4749 
4750 	if (myri10ge_rss_hash > MXGEFW_RSS_HASH_TYPE_MAX ||
4751 	    myri10ge_rss_hash < MXGEFW_RSS_HASH_TYPE_IPV4) {
4752 		cmn_err(CE_WARN, "myri10ge: Illegal rssh hash type %d\n",
4753 		    myri10ge_rss_hash);
4754 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4755 	}
4756 	myri10ge_lro = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4757 	    "myri10ge_lro", myri10ge_lro);
4758 	myri10ge_lro_cnt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4759 	    "myri10ge_lro_cnt", myri10ge_lro_cnt);
4760 	myri10ge_lro_max_aggr = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4761 	    "myri10ge_lro_max_aggr", myri10ge_lro_max_aggr);
4762 	myri10ge_tx_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4763 	    "myri10ge_tx_hash", myri10ge_tx_hash);
4764 	myri10ge_use_lso = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4765 	    "myri10ge_use_lso", myri10ge_use_lso);
4766 	myri10ge_lso_copy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4767 	    "myri10ge_lso_copy", myri10ge_lso_copy);
4768 	myri10ge_tx_handles_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4769 	    "myri10ge_tx_handles_initial", myri10ge_tx_handles_initial);
4770 	myri10ge_small_bytes = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4771 	    "myri10ge_small_bytes", myri10ge_small_bytes);
4772 	if ((myri10ge_small_bytes + MXGEFW_PAD) & (128 -1)) {
4773 		cmn_err(CE_WARN, "myri10ge: myri10ge_small_bytes (%d)\n",
4774 		    myri10ge_small_bytes);
4775 		cmn_err(CE_WARN, "must be aligned on 128b bndry -2\n");
4776 		myri10ge_small_bytes += 128;
4777 		myri10ge_small_bytes &= ~(128 -1);
4778 		myri10ge_small_bytes -= MXGEFW_PAD;
4779 		cmn_err(CE_WARN, "rounded up to %d\n",
4780 		    myri10ge_small_bytes);
4781 
4782 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4783 	}
4784 }
4785 
4786 #ifndef	PCI_EXP_LNKSTA
4787 #define	PCI_EXP_LNKSTA 18
4788 #endif
4789 
4790 static int
4791 myri10ge_find_cap(ddi_acc_handle_t handle, uint8_t *capptr, uint8_t capid)
4792 {
4793 	uint16_t	status;
4794 	uint8_t		ptr;
4795 
4796 	/* check to see if we have capabilities */
4797 	status = pci_config_get16(handle, PCI_CONF_STAT);
4798 	if (!(status & PCI_STAT_CAP)) {
4799 		cmn_err(CE_WARN, "PCI_STAT_CAP not found\n");
4800 		return (ENXIO);
4801 	}
4802 
4803 	ptr = pci_config_get8(handle, PCI_CONF_CAP_PTR);
4804 
4805 	/* Walk the capabilities list, looking for a PCI Express cap */
4806 	while (ptr != PCI_CAP_NEXT_PTR_NULL) {
4807 		if (pci_config_get8(handle, ptr + PCI_CAP_ID) == capid)
4808 			break;
4809 		ptr = pci_config_get8(handle, ptr + PCI_CAP_NEXT_PTR);
4810 	}
4811 	if (ptr < 64) {
4812 		cmn_err(CE_WARN, "Bad capability offset %d\n", ptr);
4813 		return (ENXIO);
4814 	}
4815 	*capptr = ptr;
4816 	return (0);
4817 }
4818 
4819 static int
4820 myri10ge_set_max_readreq(ddi_acc_handle_t handle)
4821 {
4822 	int err;
4823 	uint16_t	val;
4824 	uint8_t		ptr;
4825 
4826 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4827 	if (err != 0) {
4828 		cmn_err(CE_WARN, "could not find PCIe cap\n");
4829 		return (ENXIO);
4830 	}
4831 
4832 	/* set max read req to 4096 */
4833 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4834 	val = (val & ~PCIE_DEVCTL_MAX_READ_REQ_MASK) |
4835 	    PCIE_DEVCTL_MAX_READ_REQ_4096;
4836 	pci_config_put16(handle, ptr + PCIE_DEVCTL, val);
4837 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4838 	if ((val & (PCIE_DEVCTL_MAX_READ_REQ_4096)) !=
4839 	    PCIE_DEVCTL_MAX_READ_REQ_4096) {
4840 		cmn_err(CE_WARN, "could not set max read req (%x)\n", val);
4841 		return (EINVAL);
4842 	}
4843 	return (0);
4844 }
4845 
4846 static int
4847 myri10ge_read_pcie_link_width(ddi_acc_handle_t handle, int *link)
4848 {
4849 	int err;
4850 	uint16_t	val;
4851 	uint8_t		ptr;
4852 
4853 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4854 	if (err != 0) {
4855 		cmn_err(CE_WARN, "could not set max read req\n");
4856 		return (ENXIO);
4857 	}
4858 
4859 	/* read link width */
4860 	val = pci_config_get16(handle, ptr + PCIE_LINKSTS);
4861 	val &= PCIE_LINKSTS_NEG_WIDTH_MASK;
4862 	*link = (val >> 4);
4863 	return (0);
4864 }
4865 
4866 static int
4867 myri10ge_reset_nic(struct myri10ge_priv *mgp)
4868 {
4869 	ddi_acc_handle_t handle = mgp->cfg_hdl;
4870 	uint32_t reboot;
4871 	uint16_t cmd;
4872 	int err;
4873 
4874 	cmd = pci_config_get16(handle, PCI_CONF_COMM);
4875 	if ((cmd & PCI_COMM_ME) == 0) {
4876 		/*
4877 		 * Bus master DMA disabled?  Check to see if the card
4878 		 * rebooted due to a parity error For now, just report
4879 		 * it
4880 		 */
4881 
4882 		/* enter read32 mode */
4883 		pci_config_put8(handle, mgp->vso + 0x10, 0x3);
4884 		/* read REBOOT_STATUS (0xfffffff0) */
4885 		pci_config_put32(handle, mgp->vso + 0x18, 0xfffffff0);
4886 		reboot = pci_config_get16(handle, mgp->vso + 0x14);
4887 		cmn_err(CE_WARN, "%s NIC rebooted 0x%x\n", mgp->name, reboot);
4888 		return (0);
4889 	}
4890 	if (!myri10ge_watchdog_reset) {
4891 		cmn_err(CE_WARN, "%s: not resetting\n", mgp->name);
4892 		return (1);
4893 	}
4894 
4895 	myri10ge_stop_locked(mgp);
4896 	err = myri10ge_start_locked(mgp);
4897 	if (err == DDI_FAILURE) {
4898 		return (0);
4899 	}
4900 	mac_tx_update(mgp->mh);
4901 	return (1);
4902 }
4903 
4904 static inline int
4905 myri10ge_ring_stalled(myri10ge_tx_ring_t *tx)
4906 {
4907 	if (tx->sched != tx->stall &&
4908 	    tx->done == tx->watchdog_done &&
4909 	    tx->watchdog_req != tx->watchdog_done)
4910 		return (1);
4911 	return (0);
4912 }
4913 
4914 static void
4915 myri10ge_watchdog(void *arg)
4916 {
4917 	struct myri10ge_priv *mgp;
4918 	struct myri10ge_slice_state *ss;
4919 	myri10ge_tx_ring_t *tx;
4920 	int nic_ok = 1;
4921 	int slices_stalled, rx_pause, i;
4922 	int add_rx;
4923 
4924 	mgp = arg;
4925 	mutex_enter(&mgp->intrlock);
4926 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
4927 		cmn_err(CE_WARN,
4928 		    "%s not running, not rearming watchdog (%d)\n",
4929 		    mgp->name, mgp->running);
4930 		mutex_exit(&mgp->intrlock);
4931 		return;
4932 	}
4933 
4934 	rx_pause = ntohl(mgp->ss[0].fw_stats->dropped_pause);
4935 
4936 	/*
4937 	 * make sure nic is stalled before we reset the nic, so as to
4938 	 * ensure we don't rip the transmit data structures out from
4939 	 * under a pending transmit
4940 	 */
4941 
4942 	for (slices_stalled = 0, i = 0; i < mgp->num_slices; i++) {
4943 		tx = &mgp->ss[i].tx;
4944 		slices_stalled = myri10ge_ring_stalled(tx);
4945 		if (slices_stalled)
4946 			break;
4947 	}
4948 
4949 	if (slices_stalled) {
4950 		if (mgp->watchdog_rx_pause == rx_pause) {
4951 			cmn_err(CE_WARN,
4952 			    "%s slice %d stalled:(%d, %d, %d, %d, %d %d %d\n)",
4953 			    mgp->name, i, tx->sched, tx->stall,
4954 			    tx->done, tx->watchdog_done, tx->req, tx->pkt_done,
4955 			    (int)ntohl(mgp->ss[i].fw_stats->send_done_count));
4956 			nic_ok = myri10ge_reset_nic(mgp);
4957 		} else {
4958 			cmn_err(CE_WARN,
4959 			    "%s Flow controlled, check link partner\n",
4960 			    mgp->name);
4961 		}
4962 	}
4963 
4964 	if (!nic_ok) {
4965 		cmn_err(CE_WARN,
4966 		    "%s Nic dead, not rearming watchdog\n", mgp->name);
4967 		mutex_exit(&mgp->intrlock);
4968 		return;
4969 	}
4970 	for (i = 0; i < mgp->num_slices; i++) {
4971 		ss = &mgp->ss[i];
4972 		tx = &ss->tx;
4973 		tx->watchdog_done = tx->done;
4974 		tx->watchdog_req = tx->req;
4975 		if (ss->watchdog_rx_copy != MYRI10GE_SLICE_STAT(rx_copy)) {
4976 			ss->watchdog_rx_copy = MYRI10GE_SLICE_STAT(rx_copy);
4977 			add_rx =
4978 			    min(ss->jpool.num_alloc,
4979 			    myri10ge_bigbufs_max -
4980 			    (ss->jpool.num_alloc -
4981 			    ss->jbufs_for_smalls));
4982 			if (add_rx != 0) {
4983 				(void) myri10ge_add_jbufs(ss, add_rx, 0);
4984 				/* now feed them to the firmware */
4985 				mutex_enter(&ss->jpool.mtx);
4986 				myri10ge_restock_jumbos(ss);
4987 				mutex_exit(&ss->jpool.mtx);
4988 			}
4989 		}
4990 	}
4991 	mgp->watchdog_rx_pause = rx_pause;
4992 
4993 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
4994 	    mgp->timer_ticks);
4995 	mutex_exit(&mgp->intrlock);
4996 }
4997 
4998 /*ARGSUSED*/
4999 static int
5000 myri10ge_get_coalesce(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5001 {
5002 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5003 	(void) mi_mpprintf(mp, "%d", mgp->intr_coal_delay);
5004 	return (0);
5005 }
5006 
5007 /*ARGSUSED*/
5008 static int
5009 myri10ge_set_coalesce(queue_t *q, mblk_t *mp, char *value,
5010     caddr_t cp, cred_t *credp)
5011 {
5012 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5013 	char *end;
5014 	size_t new_value;
5015 
5016 	new_value = mi_strtol(value, &end, 10);
5017 	if (end == value)
5018 		return (EINVAL);
5019 
5020 	mutex_enter(&myri10ge_param_lock);
5021 	mgp->intr_coal_delay = (int)new_value;
5022 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
5023 	mutex_exit(&myri10ge_param_lock);
5024 	return (0);
5025 }
5026 
5027 /*ARGSUSED*/
5028 static int
5029 myri10ge_get_pauseparam(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5030 {
5031 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5032 	(void) mi_mpprintf(mp, "%d", mgp->pause);
5033 	return (0);
5034 }
5035 
5036 /*ARGSUSED*/
5037 static int
5038 myri10ge_set_pauseparam(queue_t *q, mblk_t *mp, char *value,
5039     caddr_t cp, cred_t *credp)
5040 {
5041 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5042 	char *end;
5043 	size_t new_value;
5044 	int err = 0;
5045 
5046 	new_value = mi_strtol(value, &end, 10);
5047 	if (end == value)
5048 		return (EINVAL);
5049 	if (new_value != 0)
5050 		new_value = 1;
5051 
5052 	mutex_enter(&myri10ge_param_lock);
5053 	if (new_value != mgp->pause)
5054 		err = myri10ge_change_pause(mgp, new_value);
5055 	mutex_exit(&myri10ge_param_lock);
5056 	return (err);
5057 }
5058 
5059 /*ARGSUSED*/
5060 static int
5061 myri10ge_get_int(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5062 {
5063 	(void) mi_mpprintf(mp, "%d", *(int *)(void *)cp);
5064 	return (0);
5065 }
5066 
5067 /*ARGSUSED*/
5068 static int
5069 myri10ge_set_int(queue_t *q, mblk_t *mp, char *value,
5070     caddr_t cp, cred_t *credp)
5071 {
5072 	char *end;
5073 	size_t new_value;
5074 
5075 	new_value = mi_strtol(value, &end, 10);
5076 	if (end == value)
5077 		return (EINVAL);
5078 	*(int *)(void *)cp = new_value;
5079 
5080 	return (0);
5081 }
5082 
5083 static void
5084 myri10ge_ndd_init(struct myri10ge_priv *mgp)
5085 {
5086 	mgp->nd_head = NULL;
5087 
5088 	(void) nd_load(&mgp->nd_head, "myri10ge_intr_coal_delay",
5089 	    myri10ge_get_coalesce, myri10ge_set_coalesce, (caddr_t)mgp);
5090 	(void) nd_load(&mgp->nd_head, "myri10ge_flow_control",
5091 	    myri10ge_get_pauseparam, myri10ge_set_pauseparam, (caddr_t)mgp);
5092 	(void) nd_load(&mgp->nd_head, "myri10ge_verbose",
5093 	    myri10ge_get_int, myri10ge_set_int, (caddr_t)&myri10ge_verbose);
5094 	(void) nd_load(&mgp->nd_head, "myri10ge_deassert_wait",
5095 	    myri10ge_get_int, myri10ge_set_int,
5096 	    (caddr_t)&myri10ge_deassert_wait);
5097 	(void) nd_load(&mgp->nd_head, "myri10ge_bigbufs_max",
5098 	    myri10ge_get_int, myri10ge_set_int,
5099 	    (caddr_t)&myri10ge_bigbufs_max);
5100 	(void) nd_load(&mgp->nd_head, "myri10ge_lro",
5101 	    myri10ge_get_int, myri10ge_set_int,
5102 	    (caddr_t)&myri10ge_lro);
5103 	(void) nd_load(&mgp->nd_head, "myri10ge_lro_max_aggr",
5104 	    myri10ge_get_int, myri10ge_set_int,
5105 	    (caddr_t)&myri10ge_lro_max_aggr);
5106 	(void) nd_load(&mgp->nd_head, "myri10ge_tx_hash",
5107 	    myri10ge_get_int, myri10ge_set_int,
5108 	    (caddr_t)&myri10ge_tx_hash);
5109 	(void) nd_load(&mgp->nd_head, "myri10ge_lso_copy",
5110 	    myri10ge_get_int, myri10ge_set_int,
5111 	    (caddr_t)&myri10ge_lso_copy);
5112 }
5113 
5114 static void
5115 myri10ge_ndd_fini(struct myri10ge_priv *mgp)
5116 {
5117 	nd_free(&mgp->nd_head);
5118 }
5119 
5120 static void
5121 myri10ge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
5122 {
5123 	struct iocblk *iocp;
5124 	struct myri10ge_priv *mgp = arg;
5125 	int cmd, ok, err;
5126 
5127 	iocp = (struct iocblk *)(void *)mp->b_rptr;
5128 	cmd = iocp->ioc_cmd;
5129 
5130 	ok = 0;
5131 	err = 0;
5132 
5133 	switch (cmd) {
5134 	case ND_GET:
5135 	case ND_SET:
5136 		ok = nd_getset(wq, mgp->nd_head, mp);
5137 		break;
5138 	default:
5139 		break;
5140 	}
5141 	if (!ok)
5142 		err = EINVAL;
5143 	else
5144 		err = iocp->ioc_error;
5145 
5146 	if (!err)
5147 		miocack(wq, mp, iocp->ioc_count, err);
5148 	else
5149 		miocnak(wq, mp, 0, err);
5150 }
5151 
5152 static struct myri10ge_priv *mgp_list;
5153 
5154 struct myri10ge_priv *
5155 myri10ge_get_instance(uint_t unit)
5156 {
5157 	struct myri10ge_priv *mgp;
5158 
5159 	mutex_enter(&myri10ge_param_lock);
5160 	for (mgp = mgp_list; mgp != NULL; mgp = mgp->next) {
5161 		if (unit == ddi_get_instance(mgp->dip)) {
5162 			mgp->refcnt++;
5163 			break;
5164 		}
5165 	}
5166 	mutex_exit(&myri10ge_param_lock);
5167 	return (mgp);
5168 }
5169 
5170 void
5171 myri10ge_put_instance(struct myri10ge_priv *mgp)
5172 {
5173 	mutex_enter(&myri10ge_param_lock);
5174 	mgp->refcnt--;
5175 	mutex_exit(&myri10ge_param_lock);
5176 }
5177 
5178 static boolean_t
5179 myri10ge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
5180 {
5181 	struct myri10ge_priv *mgp = arg;
5182 	uint32_t *cap_hcksum;
5183 	mac_capab_lso_t *cap_lso;
5184 	mac_capab_rings_t *cap_rings;
5185 
5186 	switch (cap) {
5187 	case MAC_CAPAB_HCKSUM:
5188 		cap_hcksum = cap_data;
5189 		*cap_hcksum = HCKSUM_INET_PARTIAL;
5190 		break;
5191 	case MAC_CAPAB_RINGS:
5192 		cap_rings = cap_data;
5193 		switch (cap_rings->mr_type) {
5194 		case MAC_RING_TYPE_RX:
5195 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5196 			cap_rings->mr_rnum = mgp->num_slices;
5197 			cap_rings->mr_gnum = 1;
5198 			cap_rings->mr_rget = myri10ge_fill_ring;
5199 			cap_rings->mr_gget = myri10ge_fill_group;
5200 			break;
5201 		case MAC_RING_TYPE_TX:
5202 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5203 			cap_rings->mr_rnum = mgp->num_slices;
5204 			cap_rings->mr_gnum = 0;
5205 			cap_rings->mr_rget = myri10ge_fill_ring;
5206 			cap_rings->mr_gget = NULL;
5207 			break;
5208 		default:
5209 			return (B_FALSE);
5210 		}
5211 		break;
5212 	case MAC_CAPAB_LSO:
5213 		cap_lso = cap_data;
5214 		if (!myri10ge_use_lso)
5215 			return (B_FALSE);
5216 		if (!(mgp->features & MYRI10GE_TSO))
5217 			return (B_FALSE);
5218 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
5219 		cap_lso->lso_basic_tcp_ipv4.lso_max = (uint16_t)-1;
5220 		break;
5221 
5222 	default:
5223 		return (B_FALSE);
5224 	}
5225 	return (B_TRUE);
5226 }
5227 
5228 
5229 static int
5230 myri10ge_m_stat(void *arg, uint_t stat, uint64_t *val)
5231 {
5232 	struct myri10ge_priv *mgp = arg;
5233 	struct myri10ge_rx_ring_stats *rstat;
5234 	struct myri10ge_tx_ring_stats *tstat;
5235 	mcp_irq_data_t *fw_stats = mgp->ss[0].fw_stats;
5236 	struct myri10ge_slice_state *ss;
5237 	uint64_t tmp = 0;
5238 	int i;
5239 
5240 	switch (stat) {
5241 	case MAC_STAT_IFSPEED:
5242 		*val = 10ull * 1000ull * 1000000ull;
5243 		break;
5244 
5245 	case MAC_STAT_MULTIRCV:
5246 		for (i = 0; i < mgp->num_slices; i++) {
5247 			rstat = &mgp->ss[i].rx_stats;
5248 			tmp += rstat->multircv;
5249 		}
5250 		*val = tmp;
5251 		break;
5252 
5253 	case MAC_STAT_BRDCSTRCV:
5254 		for (i = 0; i < mgp->num_slices; i++) {
5255 			rstat = &mgp->ss[i].rx_stats;
5256 			tmp += rstat->brdcstrcv;
5257 		}
5258 		*val = tmp;
5259 		break;
5260 
5261 	case MAC_STAT_MULTIXMT:
5262 		for (i = 0; i < mgp->num_slices; i++) {
5263 			tstat = &mgp->ss[i].tx.stats;
5264 			tmp += tstat->multixmt;
5265 		}
5266 		*val = tmp;
5267 		break;
5268 
5269 	case MAC_STAT_BRDCSTXMT:
5270 		for (i = 0; i < mgp->num_slices; i++) {
5271 			tstat = &mgp->ss[i].tx.stats;
5272 			tmp += tstat->brdcstxmt;
5273 		}
5274 		*val = tmp;
5275 		break;
5276 
5277 	case MAC_STAT_NORCVBUF:
5278 		tmp = ntohl(fw_stats->dropped_no_big_buffer);
5279 		tmp += ntohl(fw_stats->dropped_no_small_buffer);
5280 		tmp += ntohl(fw_stats->dropped_link_overflow);
5281 		for (i = 0; i < mgp->num_slices; i++) {
5282 			ss = &mgp->ss[i];
5283 			tmp += MYRI10GE_SLICE_STAT(rx_big_nobuf);
5284 			tmp += MYRI10GE_SLICE_STAT(rx_small_nobuf);
5285 		}
5286 		*val = tmp;
5287 		break;
5288 
5289 	case MAC_STAT_IERRORS:
5290 		tmp += ntohl(fw_stats->dropped_bad_crc32);
5291 		tmp += ntohl(fw_stats->dropped_bad_phy);
5292 		tmp += ntohl(fw_stats->dropped_runt);
5293 		tmp += ntohl(fw_stats->dropped_overrun);
5294 		*val = tmp;
5295 		break;
5296 
5297 	case MAC_STAT_OERRORS:
5298 		for (i = 0; i < mgp->num_slices; i++) {
5299 			ss = &mgp->ss[i];
5300 			tmp += MYRI10GE_SLICE_STAT(xmit_lsobadflags);
5301 			tmp += MYRI10GE_SLICE_STAT(xmit_err);
5302 		}
5303 		*val = tmp;
5304 		break;
5305 
5306 	case MAC_STAT_RBYTES:
5307 		for (i = 0; i < mgp->num_slices; i++) {
5308 			rstat = &mgp->ss[i].rx_stats;
5309 			tmp += rstat->ibytes;
5310 		}
5311 		*val = tmp;
5312 		break;
5313 
5314 	case MAC_STAT_IPACKETS:
5315 		for (i = 0; i < mgp->num_slices; i++) {
5316 			rstat = &mgp->ss[i].rx_stats;
5317 			tmp += rstat->ipackets;
5318 		}
5319 		*val = tmp;
5320 		break;
5321 
5322 	case MAC_STAT_OBYTES:
5323 		for (i = 0; i < mgp->num_slices; i++) {
5324 			tstat = &mgp->ss[i].tx.stats;
5325 			tmp += tstat->obytes;
5326 		}
5327 		*val = tmp;
5328 		break;
5329 
5330 	case MAC_STAT_OPACKETS:
5331 		for (i = 0; i < mgp->num_slices; i++) {
5332 			tstat = &mgp->ss[i].tx.stats;
5333 			tmp += tstat->opackets;
5334 		}
5335 		*val = tmp;
5336 		break;
5337 
5338 	case ETHER_STAT_TOOLONG_ERRORS:
5339 		*val = ntohl(fw_stats->dropped_overrun);
5340 		break;
5341 
5342 #ifdef SOLARIS_S11
5343 	case ETHER_STAT_TOOSHORT_ERRORS:
5344 		*val = ntohl(fw_stats->dropped_runt);
5345 		break;
5346 #endif
5347 
5348 	case ETHER_STAT_LINK_PAUSE:
5349 		*val = mgp->pause;
5350 		break;
5351 
5352 	case ETHER_STAT_LINK_AUTONEG:
5353 		*val = 1;
5354 		break;
5355 
5356 	case ETHER_STAT_LINK_DUPLEX:
5357 		*val = LINK_DUPLEX_FULL;
5358 		break;
5359 
5360 	default:
5361 		return (ENOTSUP);
5362 	}
5363 
5364 	return (0);
5365 }
5366 
5367 /* ARGSUSED */
5368 static void
5369 myri10ge_m_propinfo(void *arg, const char *pr_name,
5370     mac_prop_id_t pr_num, mac_prop_info_handle_t prh)
5371 {
5372 	switch (pr_num) {
5373 	case MAC_PROP_MTU:
5374 		mac_prop_info_set_default_uint32(prh, MYRI10GE_DEFAULT_GLD_MTU);
5375 		mac_prop_info_set_range_uint32(prh, MYRI10GE_MIN_GLD_MTU,
5376 		    MYRI10GE_MAX_GLD_MTU);
5377 		break;
5378 	default:
5379 		break;
5380 	}
5381 }
5382 
5383 /*ARGSUSED*/
5384 static int
5385 myri10ge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
5386     uint_t pr_valsize, const void *pr_val)
5387 {
5388 	int err = 0;
5389 	struct myri10ge_priv *mgp = arg;
5390 
5391 	switch (pr_num) {
5392 	case MAC_PROP_MTU: {
5393 		uint32_t mtu;
5394 		if (pr_valsize < sizeof (mtu)) {
5395 			err = EINVAL;
5396 			break;
5397 		}
5398 		bcopy(pr_val, &mtu, sizeof (mtu));
5399 		if (mtu > MYRI10GE_MAX_GLD_MTU ||
5400 		    mtu < MYRI10GE_MIN_GLD_MTU) {
5401 			err = EINVAL;
5402 			break;
5403 		}
5404 
5405 		mutex_enter(&mgp->intrlock);
5406 		if (mgp->running != MYRI10GE_ETH_STOPPED) {
5407 			err = EBUSY;
5408 			mutex_exit(&mgp->intrlock);
5409 			break;
5410 		}
5411 
5412 		myri10ge_mtu = mtu + sizeof (struct ether_header) +
5413 		    MXGEFW_PAD + VLAN_TAGSZ;
5414 		mutex_exit(&mgp->intrlock);
5415 		break;
5416 	}
5417 	default:
5418 		err = ENOTSUP;
5419 		break;
5420 	}
5421 
5422 	return (err);
5423 }
5424 
/*
 * Callback table given to the MAC layer; myri10ge_attach() stores its
 * address in macp->m_callbacks before calling mac_register().
 *
 * The first member is a flag word built from MC_IOCTL, MC_GETCAPAB,
 * MC_SETPROP and MC_PROPINFO, matching the four optional entry points
 * supplied below (ioctl, getcapab, setprop, propinfo).  The remaining
 * members are positional, so their order has to follow the declaration
 * of mac_callbacks_t; a NULL marks a slot this driver leaves empty.
 */
static mac_callbacks_t myri10ge_m_callbacks = {
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO),
	myri10ge_m_stat,
	myri10ge_m_start,
	myri10ge_m_stop,
	myri10ge_m_promisc,
	myri10ge_m_multicst,
	NULL,
	NULL,
	NULL,
	myri10ge_m_ioctl,
	myri10ge_m_getcapab,
	NULL,
	NULL,
	myri10ge_m_setprop,
	NULL,
	myri10ge_m_propinfo
};
5443 
5444 
5445 static int
5446 myri10ge_probe_slices(struct myri10ge_priv *mgp)
5447 {
5448 	myri10ge_cmd_t cmd;
5449 	int status;
5450 
5451 	mgp->num_slices = 1;
5452 
5453 	/* hit the board with a reset to ensure it is alive */
5454 	(void) memset(&cmd, 0, sizeof (cmd));
5455 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
5456 	if (status != 0) {
5457 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
5458 		return (ENXIO);
5459 	}
5460 
5461 	if (myri10ge_use_msix == 0)
5462 		return (0);
5463 
5464 	/* tell it the size of the interrupt queues */
5465 	cmd.data0 = mgp->max_intr_slots * sizeof (struct mcp_slot);
5466 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
5467 	if (status != 0) {
5468 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_SET_INTRQ_SIZE\n",
5469 		    mgp->name);
5470 		return (ENXIO);
5471 	}
5472 
5473 	/* ask the maximum number of slices it supports */
5474 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
5475 	    &cmd);
5476 	if (status != 0)
5477 		return (0);
5478 
5479 	mgp->num_slices = cmd.data0;
5480 
5481 	/*
5482 	 * if the admin did not specify a limit to how many
5483 	 * slices we should use, cap it automatically to the
5484 	 * number of CPUs currently online
5485 	 */
5486 	if (myri10ge_max_slices == -1)
5487 		myri10ge_max_slices = ncpus;
5488 
5489 	if (mgp->num_slices > myri10ge_max_slices)
5490 		mgp->num_slices = myri10ge_max_slices;
5491 
5492 
5493 	/*
5494 	 * Now try to allocate as many MSI-X vectors as we have
5495 	 * slices. We give up on MSI-X if we can only get a single
5496 	 * vector.
5497 	 */
5498 	while (mgp->num_slices > 1) {
5499 		/* make sure it is a power of two */
5500 		while (!ISP2(mgp->num_slices))
5501 			mgp->num_slices--;
5502 		if (mgp->num_slices == 1)
5503 			return (0);
5504 
5505 		status = myri10ge_add_intrs(mgp, 0);
5506 		if (status == 0) {
5507 			myri10ge_rem_intrs(mgp, 0);
5508 			if (mgp->intr_cnt == mgp->num_slices) {
5509 				if (myri10ge_verbose)
5510 					printf("Got %d slices!\n",
5511 					    mgp->num_slices);
5512 				return (0);
5513 			}
5514 			mgp->num_slices = mgp->intr_cnt;
5515 		} else {
5516 			mgp->num_slices = mgp->num_slices / 2;
5517 		}
5518 	}
5519 
5520 	if (myri10ge_verbose)
5521 		printf("Got %d slices\n", mgp->num_slices);
5522 	return (0);
5523 }
5524 
5525 static void
5526 myri10ge_lro_free(struct myri10ge_slice_state *ss)
5527 {
5528 	struct lro_entry *lro;
5529 
5530 	while (ss->lro_free != NULL) {
5531 		lro = ss->lro_free;
5532 		ss->lro_free = lro->next;
5533 		kmem_free(lro, sizeof (*lro));
5534 	}
5535 }
5536 
5537 static void
5538 myri10ge_lro_alloc(struct myri10ge_slice_state *ss)
5539 {
5540 	struct lro_entry *lro;
5541 	int idx;
5542 
5543 	ss->lro_free = NULL;
5544 	ss->lro_active = NULL;
5545 
5546 	for (idx = 0; idx < myri10ge_lro_cnt; idx++) {
5547 		lro = kmem_zalloc(sizeof (*lro), KM_SLEEP);
5548 		if (lro == NULL)
5549 			continue;
5550 		lro->next = ss->lro_free;
5551 		ss->lro_free = lro;
5552 	}
5553 }
5554 
5555 static void
5556 myri10ge_free_slices(struct myri10ge_priv *mgp)
5557 {
5558 	struct myri10ge_slice_state *ss;
5559 	size_t bytes;
5560 	int i;
5561 
5562 	if (mgp->ss == NULL)
5563 		return;
5564 
5565 	for (i = 0; i < mgp->num_slices; i++) {
5566 		ss = &mgp->ss[i];
5567 		if (ss->rx_done.entry == NULL)
5568 			continue;
5569 		myri10ge_dma_free(&ss->rx_done.dma);
5570 		ss->rx_done.entry = NULL;
5571 		if (ss->fw_stats == NULL)
5572 			continue;
5573 		myri10ge_dma_free(&ss->fw_stats_dma);
5574 		ss->fw_stats = NULL;
5575 		mutex_destroy(&ss->rx_lock);
5576 		mutex_destroy(&ss->tx.lock);
5577 		mutex_destroy(&ss->tx.handle_lock);
5578 		mutex_destroy(&ss->poll_lock);
5579 		myri10ge_jpool_fini(ss);
5580 		myri10ge_slice_stat_destroy(ss);
5581 		myri10ge_lro_free(ss);
5582 	}
5583 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5584 	kmem_free(mgp->ss, bytes);
5585 	mgp->ss = NULL;
5586 }
5587 
5588 
/*
 * Allocate and initialize the per-slice state array (mgp->ss) for
 * mgp->num_slices slices.
 *
 * For every slice, in this order: a zeroed firmware stats block and a
 * zeroed rx done ring of mgp->max_intr_slots entries are allocated with
 * myri10ge_dma_alloc(), then the slice locks, jumbo buffer pool, slice
 * kstats and LRO entries are set up.  myri10ge_free_slices() treats a
 * non-NULL rx_done.entry as the sign that a slice got that second group
 * of resources, so the order of the steps below matters.
 *
 * Returns 0 on success.  If a DMA allocation fails, whatever was built
 * so far is handed to myri10ge_free_slices() and ENOMEM is returned.
 */
static int
myri10ge_alloc_slices(struct myri10ge_priv *mgp)
{
	struct myri10ge_slice_state *ss;
	size_t bytes;
	int i;

	bytes = sizeof (*mgp->ss) * mgp->num_slices;
	mgp->ss = kmem_zalloc(bytes, KM_SLEEP);
	if (mgp->ss == NULL)
		return (ENOMEM);
	for (i = 0; i < mgp->num_slices; i++) {
		ss = &mgp->ss[i];

		/* back pointer from the slice to its owning instance */
		ss->mgp = mgp;

		/* allocate the per-slice firmware stats */
		bytes = sizeof (*ss->fw_stats);
		ss->fw_stats = (mcp_irq_data_t *)(void *)
		    myri10ge_dma_alloc(mgp->dip, bytes,
		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
		    &ss->fw_stats_dma, 1, DDI_DMA_DONTWAIT);
		if (ss->fw_stats == NULL)
			goto abort;
		(void) memset(ss->fw_stats, 0, bytes);

		/* allocate rx done ring */
		bytes = mgp->max_intr_slots *
		    sizeof (*ss->rx_done.entry);
		ss->rx_done.entry = (mcp_slot_t *)(void *)
		    myri10ge_dma_alloc(mgp->dip, bytes,
		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
		    &ss->rx_done.dma, 1, DDI_DMA_DONTWAIT);
		if (ss->rx_done.entry == NULL) {
			goto abort;
		}
		(void) memset(ss->rx_done.entry, 0, bytes);
		/* both DMA areas exist: set up the rest of the slice */
		mutex_init(&ss->rx_lock,   NULL, MUTEX_DEFAULT, mgp->icookie);
		mutex_init(&ss->tx.lock,   NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ss->tx.handle_lock,   NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ss->poll_lock,   NULL, MUTEX_DEFAULT, NULL);
		myri10ge_jpool_init(ss);
		/* NOTE(review): the kstat init result is deliberately ignored */
		(void) myri10ge_slice_stat_init(ss);
		myri10ge_lro_alloc(ss);
	}

	return (0);

abort:
	/* releases the slices built so far and the array itself */
	myri10ge_free_slices(mgp);
	return (ENOMEM);
}
5643 
5644 static int
5645 myri10ge_save_msi_state(struct myri10ge_priv *mgp,
5646     ddi_acc_handle_t handle)
5647 {
5648 	uint8_t ptr;
5649 	int err;
5650 
5651 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5652 	if (err != 0) {
5653 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5654 		    mgp->name);
5655 		return (DDI_FAILURE);
5656 	}
5657 	mgp->pci_saved_state.msi_ctrl =
5658 	    pci_config_get16(handle, ptr + PCI_MSI_CTRL);
5659 	mgp->pci_saved_state.msi_addr_low =
5660 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET);
5661 	mgp->pci_saved_state.msi_addr_high =
5662 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4);
5663 	mgp->pci_saved_state.msi_data_32 =
5664 	    pci_config_get16(handle, ptr + PCI_MSI_32BIT_DATA);
5665 	mgp->pci_saved_state.msi_data_64 =
5666 	    pci_config_get16(handle, ptr + PCI_MSI_64BIT_DATA);
5667 	return (DDI_SUCCESS);
5668 }
5669 
5670 static int
5671 myri10ge_restore_msi_state(struct myri10ge_priv *mgp,
5672     ddi_acc_handle_t handle)
5673 {
5674 	uint8_t ptr;
5675 	int err;
5676 
5677 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5678 	if (err != 0) {
5679 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5680 		    mgp->name);
5681 		return (DDI_FAILURE);
5682 	}
5683 
5684 	pci_config_put16(handle, ptr + PCI_MSI_CTRL,
5685 	    mgp->pci_saved_state.msi_ctrl);
5686 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET,
5687 	    mgp->pci_saved_state.msi_addr_low);
5688 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4,
5689 	    mgp->pci_saved_state.msi_addr_high);
5690 	pci_config_put16(handle, ptr + PCI_MSI_32BIT_DATA,
5691 	    mgp->pci_saved_state.msi_data_32);
5692 	pci_config_put16(handle, ptr + PCI_MSI_64BIT_DATA,
5693 	    mgp->pci_saved_state.msi_data_64);
5694 
5695 	return (DDI_SUCCESS);
5696 }
5697 
5698 static int
5699 myri10ge_save_pci_state(struct myri10ge_priv *mgp)
5700 {
5701 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5702 	int i;
5703 	int err = DDI_SUCCESS;
5704 
5705 
5706 	/* Save the non-extended PCI config space 32-bits at a time */
5707 	for (i = 0; i < 16; i++)
5708 		mgp->pci_saved_state.base[i] =
5709 		    pci_config_get32(handle, i*4);
5710 
5711 	/* now save MSI interrupt state *, if needed */
5712 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5713 		err = myri10ge_save_msi_state(mgp, handle);
5714 
5715 	return (err);
5716 }
5717 
5718 static int
5719 myri10ge_restore_pci_state(struct myri10ge_priv *mgp)
5720 {
5721 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5722 	int i;
5723 	int err = DDI_SUCCESS;
5724 
5725 
5726 	/* Restore the non-extended PCI config space 32-bits at a time */
5727 	for (i = 15; i >= 0; i--)
5728 		pci_config_put32(handle, i*4, mgp->pci_saved_state.base[i]);
5729 
5730 	/* now restore MSI interrupt state *, if needed */
5731 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5732 		err = myri10ge_restore_msi_state(mgp, handle);
5733 
5734 	if (mgp->max_read_request_4k)
5735 		(void) myri10ge_set_max_readreq(handle);
5736 	return (err);
5737 }
5738 
5739 
/*
 * DDI_SUSPEND handler, called from myri10ge_detach().
 *
 * If the interface is running it is stopped and its state is recorded
 * as MYRI10GE_ETH_SUSPENDED_RUNNING, which myri10ge_resume() checks to
 * decide whether to restart it.  PCI config state is then saved in all
 * cases.  Returns DDI_FAILURE if the driver private data is missing or
 * does not belong to 'dip'; otherwise the result of
 * myri10ge_save_pci_state().
 */
static int
myri10ge_suspend(dev_info_t *dip)
{
	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
	int status;

	if (mgp == NULL) {
		cmn_err(CE_WARN, "null dip in myri10ge_suspend\n");
		return (DDI_FAILURE);
	}
	if (mgp->dip != dip) {
		cmn_err(CE_WARN, "bad dip in myri10ge_suspend\n");
		return (DDI_FAILURE);
	}
	mutex_enter(&mgp->intrlock);
	if (mgp->running == MYRI10GE_ETH_RUNNING) {
		mgp->running = MYRI10GE_ETH_STOPPING;
		/*
		 * intrlock is released while the watchdog timeout is
		 * cancelled and taken again before the interface is
		 * stopped.
		 * NOTE(review): presumably because the watchdog handler
		 * takes intrlock itself - confirm against
		 * myri10ge_watchdog().
		 */
		mutex_exit(&mgp->intrlock);
		(void) untimeout(mgp->timer_id);
		mutex_enter(&mgp->intrlock);
		myri10ge_stop_locked(mgp);
		/* remember that resume has to restart the interface */
		mgp->running = MYRI10GE_ETH_SUSPENDED_RUNNING;
	}
	status = myri10ge_save_pci_state(mgp);
	mutex_exit(&mgp->intrlock);
	return (status);
}
5767 
5768 static int
5769 myri10ge_resume(dev_info_t *dip)
5770 {
5771 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5772 	int status = DDI_SUCCESS;
5773 
5774 	if (mgp == NULL) {
5775 		cmn_err(CE_WARN, "null dip in myri10ge_resume\n");
5776 		return (DDI_FAILURE);
5777 	}
5778 	if (mgp->dip != dip) {
5779 		cmn_err(CE_WARN, "bad dip in myri10ge_resume\n");
5780 		return (DDI_FAILURE);
5781 	}
5782 
5783 	mutex_enter(&mgp->intrlock);
5784 	status = myri10ge_restore_pci_state(mgp);
5785 	if (status == DDI_SUCCESS &&
5786 	    mgp->running == MYRI10GE_ETH_SUSPENDED_RUNNING) {
5787 		status = myri10ge_start_locked(mgp);
5788 	}
5789 	mutex_exit(&mgp->intrlock);
5790 	if (status != DDI_SUCCESS)
5791 		return (status);
5792 
5793 	/* start the watchdog timer */
5794 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
5795 	    mgp->timer_ticks);
5796 	return (DDI_SUCCESS);
5797 }
5798 
5799 static int
5800 myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5801 {
5802 
5803 	struct myri10ge_priv *mgp;
5804 	mac_register_t *macp, *omacp;
5805 	ddi_acc_handle_t handle;
5806 	uint32_t csr, hdr_offset;
5807 	int status, span, link_width, max_read_request_4k;
5808 	unsigned long bus_number, dev_number, func_number;
5809 	size_t bytes;
5810 	offset_t ss_offset;
5811 	uint8_t vso;
5812 
5813 	if (cmd == DDI_RESUME) {
5814 		return (myri10ge_resume(dip));
5815 	}
5816 
5817 	if (cmd != DDI_ATTACH)
5818 		return (DDI_FAILURE);
5819 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS)
5820 		return (DDI_FAILURE);
5821 
5822 	/* enable busmater and io space access */
5823 	csr = pci_config_get32(handle, PCI_CONF_COMM);
5824 	pci_config_put32(handle, PCI_CONF_COMM,
5825 	    (csr |PCI_COMM_ME|PCI_COMM_MAE));
5826 	status = myri10ge_read_pcie_link_width(handle, &link_width);
5827 	if (status != 0) {
5828 		cmn_err(CE_WARN, "could not read link width!\n");
5829 		link_width = 0;
5830 	}
5831 	max_read_request_4k = !myri10ge_set_max_readreq(handle);
5832 	status = myri10ge_find_cap(handle, &vso, PCI_CAP_ID_VS);
5833 	if (status != 0)
5834 		goto abort_with_cfg_hdl;
5835 	if ((omacp = mac_alloc(MAC_VERSION)) == NULL)
5836 		goto abort_with_cfg_hdl;
5837 	/*
5838 	 * XXXX Hack: mac_register_t grows in newer kernels.  To be
5839 	 * able to write newer fields, such as m_margin, without
5840 	 * writing outside allocated memory, we allocate our own macp
5841 	 * and pass that to mac_register()
5842 	 */
5843 	macp = kmem_zalloc(sizeof (*macp) * 8, KM_SLEEP);
5844 	macp->m_version = omacp->m_version;
5845 
5846 	if ((mgp = (struct myri10ge_priv *)
5847 	    kmem_zalloc(sizeof (*mgp), KM_SLEEP)) == NULL) {
5848 		goto abort_with_macinfo;
5849 	}
5850 	ddi_set_driver_private(dip, mgp);
5851 
5852 	/* setup device name for log messages */
5853 	(void) sprintf(mgp->name, "myri10ge%d", ddi_get_instance(dip));
5854 
5855 	mutex_enter(&myri10ge_param_lock);
5856 	myri10ge_get_props(dip);
5857 	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
5858 	mgp->pause = myri10ge_flow_control;
5859 	mutex_exit(&myri10ge_param_lock);
5860 
5861 	mgp->max_read_request_4k = max_read_request_4k;
5862 	mgp->pcie_link_width = link_width;
5863 	mgp->running = MYRI10GE_ETH_STOPPED;
5864 	mgp->vso = vso;
5865 	mgp->dip = dip;
5866 	mgp->cfg_hdl = handle;
5867 
5868 	mgp->timer_ticks = 5 * drv_usectohz(1000000); /* 5 seconds */
5869 	myri10ge_test_physical(dip);
5870 
5871 	/* allocate command page */
5872 	bytes = sizeof (*mgp->cmd);
5873 	mgp->cmd = (mcp_cmd_response_t *)
5874 	    (void *)myri10ge_dma_alloc(dip, bytes,
5875 	    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5876 	    DDI_DMA_CONSISTENT,	DDI_DMA_RDWR|DDI_DMA_CONSISTENT,
5877 	    &mgp->cmd_dma, 1, DDI_DMA_DONTWAIT);
5878 	if (mgp->cmd == NULL)
5879 		goto abort_with_mgp;
5880 
5881 	(void) myri10ge_reg_set(dip, &mgp->reg_set, &span, &bus_number,
5882 	    &dev_number, &func_number);
5883 	if (myri10ge_verbose)
5884 		printf("%s at %ld:%ld:%ld attaching\n", mgp->name,
5885 		    bus_number, dev_number, func_number);
5886 	status = ddi_regs_map_setup(dip, mgp->reg_set, (caddr_t *)&mgp->sram,
5887 	    (offset_t)0, (offset_t)span,  &myri10ge_dev_access_attr,
5888 	    &mgp->io_handle);
5889 	if (status != DDI_SUCCESS) {
5890 		cmn_err(CE_WARN, "%s: couldn't map memory space", mgp->name);
5891 		printf("%s: reg_set = %d, span = %d, status = %d",
5892 		    mgp->name, mgp->reg_set, span, status);
5893 		goto abort_with_mgp;
5894 	}
5895 
5896 	hdr_offset = *(uint32_t *)(void*)(mgp->sram +  MCP_HEADER_PTR_OFFSET);
5897 	hdr_offset = ntohl(hdr_offset) & 0xffffc;
5898 	ss_offset = hdr_offset +
5899 	    offsetof(struct mcp_gen_header, string_specs);
5900 	mgp->sram_size = ntohl(*(uint32_t *)(void*)(mgp->sram + ss_offset));
5901 	myri10ge_pio_copy32(mgp->eeprom_strings,
5902 	    (uint32_t *)(void*)((char *)mgp->sram + mgp->sram_size),
5903 	    MYRI10GE_EEPROM_STRINGS_SIZE);
5904 	(void) memset(mgp->eeprom_strings +
5905 	    MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
5906 
5907 	status = myri10ge_read_mac_addr(mgp);
5908 	if (status) {
5909 		goto abort_with_mapped;
5910 	}
5911 
5912 	status = myri10ge_select_firmware(mgp);
5913 	if (status != 0) {
5914 		cmn_err(CE_WARN, "%s: failed to load firmware\n", mgp->name);
5915 		goto abort_with_mapped;
5916 	}
5917 
5918 	status = myri10ge_probe_slices(mgp);
5919 	if (status != 0) {
5920 		cmn_err(CE_WARN, "%s: failed to probe slices\n", mgp->name);
5921 		goto abort_with_dummy_rdma;
5922 	}
5923 
5924 	status = myri10ge_alloc_slices(mgp);
5925 	if (status != 0) {
5926 		cmn_err(CE_WARN, "%s: failed to alloc slices\n", mgp->name);
5927 		goto abort_with_dummy_rdma;
5928 	}
5929 
5930 	/* add the interrupt handler */
5931 	status = myri10ge_add_intrs(mgp, 1);
5932 	if (status != 0) {
5933 		cmn_err(CE_WARN, "%s: Failed to add interrupt\n",
5934 		    mgp->name);
5935 		goto abort_with_slices;
5936 	}
5937 
5938 	/* now that we have an iblock_cookie, init the mutexes */
5939 	mutex_init(&mgp->cmd_lock, NULL, MUTEX_DRIVER, mgp->icookie);
5940 	mutex_init(&mgp->intrlock, NULL, MUTEX_DRIVER, mgp->icookie);
5941 
5942 
5943 	status = myri10ge_nic_stat_init(mgp);
5944 	if (status != DDI_SUCCESS)
5945 		goto abort_with_interrupts;
5946 	status = myri10ge_info_init(mgp);
5947 	if (status != DDI_SUCCESS)
5948 		goto abort_with_stats;
5949 
5950 	/*
5951 	 *	Initialize  GLD state
5952 	 */
5953 
5954 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
5955 	macp->m_driver = mgp;
5956 	macp->m_dip = dip;
5957 	macp->m_src_addr = mgp->mac_addr;
5958 	macp->m_callbacks = &myri10ge_m_callbacks;
5959 	macp->m_min_sdu = 0;
5960 	macp->m_max_sdu = myri10ge_mtu -
5961 	    (sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ);
5962 #ifdef SOLARIS_S11
5963 	macp->m_margin = VLAN_TAGSZ;
5964 #endif
5965 	macp->m_v12n = MAC_VIRT_LEVEL1;
5966 	status = mac_register(macp, &mgp->mh);
5967 	if (status != 0) {
5968 		cmn_err(CE_WARN, "%s: mac_register failed with %d\n",
5969 		    mgp->name, status);
5970 		goto abort_with_info;
5971 	}
5972 	myri10ge_ndd_init(mgp);
5973 	if (myri10ge_verbose)
5974 		printf("%s: %s, tx bndry %d, fw %s\n", mgp->name,
5975 		    mgp->intr_type, mgp->tx_boundary, mgp->fw_name);
5976 	mutex_enter(&myri10ge_param_lock);
5977 	mgp->next = mgp_list;
5978 	mgp_list = mgp;
5979 	mutex_exit(&myri10ge_param_lock);
5980 	kmem_free(macp, sizeof (*macp) * 8);
5981 	mac_free(omacp);
5982 	return (DDI_SUCCESS);
5983 
5984 abort_with_info:
5985 	myri10ge_info_destroy(mgp);
5986 
5987 abort_with_stats:
5988 	myri10ge_nic_stat_destroy(mgp);
5989 
5990 abort_with_interrupts:
5991 	mutex_destroy(&mgp->cmd_lock);
5992 	mutex_destroy(&mgp->intrlock);
5993 	myri10ge_rem_intrs(mgp, 1);
5994 
5995 abort_with_slices:
5996 	myri10ge_free_slices(mgp);
5997 
5998 abort_with_dummy_rdma:
5999 	myri10ge_dummy_rdma(mgp, 0);
6000 
6001 abort_with_mapped:
6002 	ddi_regs_map_free(&mgp->io_handle);
6003 
6004 	myri10ge_dma_free(&mgp->cmd_dma);
6005 
6006 abort_with_mgp:
6007 	kmem_free(mgp, sizeof (*mgp));
6008 
6009 abort_with_macinfo:
6010 	kmem_free(macp, sizeof (*macp) * 8);
6011 	mac_free(omacp);
6012 
6013 abort_with_cfg_hdl:
6014 	pci_config_teardown(&handle);
6015 	return (DDI_FAILURE);
6016 
6017 }
6018 
6019 
/*
 * detach(9E) entry point.
 *
 * DDI_SUSPEND is forwarded to myri10ge_suspend(); any command other
 * than DDI_DETACH fails.  For DDI_DETACH the instance is torn down only
 * if nothing else still depends on it:
 *
 *   - receive buffers are reclaimed from every slice; if any are still
 *     allocated afterwards (reported as "loaned") the detach fails
 *   - external references taken through myri10ge_get_instance() must
 *     all have been dropped
 *   - mac_unregister() must succeed
 *
 * After that the resources acquired in myri10ge_attach() are released
 * and the instance is unlinked from mgp_list and freed.
 *
 * Returns DDI_SUCCESS, DDI_FAILURE, or the status of mac_unregister().
 */
static int
myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct myri10ge_priv	*mgp, *tmp;
	int			status, i, jbufs_alloced;

	if (cmd == DDI_SUSPEND) {
		status = myri10ge_suspend(dip);
		return (status);
	}

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}
	/* Get the driver private (struct myri10ge_priv) structure */
	mgp = ddi_get_driver_private(dip);

	/*
	 * Reclaim the jumbo buffers of every slice and count those that
	 * remain allocated.
	 */
	mutex_enter(&mgp->intrlock);
	jbufs_alloced = 0;
	for (i = 0; i < mgp->num_slices; i++) {
		myri10ge_remove_jbufs(&mgp->ss[i]);
		jbufs_alloced += mgp->ss[i].jpool.num_alloc;
	}
	mutex_exit(&mgp->intrlock);
	if (jbufs_alloced != 0) {
		cmn_err(CE_NOTE, "%s: %d loaned rx buffers remain\n",
		    mgp->name, jbufs_alloced);
		return (DDI_FAILURE);
	}

	/* refuse to go away while myri10ge_get_instance() users remain */
	mutex_enter(&myri10ge_param_lock);
	if (mgp->refcnt != 0) {
		mutex_exit(&myri10ge_param_lock);
		cmn_err(CE_NOTE, "%s: %d external refs remain\n",
		    mgp->name, mgp->refcnt);
		return (DDI_FAILURE);
	}
	mutex_exit(&myri10ge_param_lock);

	/* nothing has been torn down yet if the MAC layer refuses */
	status = mac_unregister(mgp->mh);
	if (status != DDI_SUCCESS)
		return (status);

	myri10ge_ndd_fini(mgp);
	myri10ge_dummy_rdma(mgp, 0);
	myri10ge_nic_stat_destroy(mgp);
	myri10ge_info_destroy(mgp);

	mutex_destroy(&mgp->cmd_lock);
	mutex_destroy(&mgp->intrlock);

	myri10ge_rem_intrs(mgp, 1);

	myri10ge_free_slices(mgp);
	ddi_regs_map_free(&mgp->io_handle);
	myri10ge_dma_free(&mgp->cmd_dma);
	pci_config_teardown(&mgp->cfg_hdl);

	/* unlink this instance from the global list, then free it */
	mutex_enter(&myri10ge_param_lock);
	if (mgp_list == mgp) {
		mgp_list = mgp->next;
	} else {
		/* find the predecessor of mgp, if mgp is on the list */
		tmp = mgp_list;
		while (tmp->next != mgp && tmp->next != NULL)
			tmp = tmp->next;
		if (tmp->next != NULL)
			tmp->next = tmp->next->next;
	}
	kmem_free(mgp, sizeof (*mgp));
	mutex_exit(&myri10ge_param_lock);
	return (DDI_SUCCESS);
}
6092 
/*
 * Helper for the quiesce entry point: interrupt threads are not being
 * scheduled, so we must poll for the confirmation DMA to arrive in
 * the firmware stats block for slice 0.  We're essentially running
 * the guts of the interrupt handler, and just cherry-picking the
 * confirmation that the NIC is quiesced (stats->link_down).
 *
 * Returns 1 when the stats block reports an update with link_down set,
 * and 0 otherwise (including when no interrupt is pending at all).
 */

static int
myri10ge_poll_down(struct myri10ge_priv *mgp)
{
	/* mgp->ss points at the slice array, i.e. at slice 0 */
	struct myri10ge_slice_state *ss = mgp->ss;
	mcp_irq_data_t *stats = ss->fw_stats;
	int valid;
	int found_down = 0;


	/* check for a pending IRQ */

	if (! *((volatile uint8_t *)& stats->valid))
		return (0);
	valid = stats->valid;

	/*
	 * Make sure to tell the NIC to lower a legacy IRQ, else
	 * it may have corrupt state after restarting
	 */

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		mb();
		/* wait for irq conf DMA; spins until 'valid' reads as 0 */
		while (*((volatile uint8_t *)& stats->valid))
			;
	}
	if (stats->stats_updated && stats->link_down)
		found_down = 1;

	/*
	 * Claim the interrupt: the first claim word is written only when
	 * bit 0 of the sampled 'valid' byte was set, the second always.
	 * NOTE(review): presumably this mirrors the claim sequence of the
	 * regular interrupt handler - confirm against myri10ge_intr().
	 */
	if (valid & 0x1)
		*ss->irq_claim = BE_32(3);
	*(ss->irq_claim + 1) = BE_32(3);

	return (found_down);
}
6138 
6139 static int
6140 myri10ge_quiesce(dev_info_t *dip)
6141 {
6142 	struct myri10ge_priv *mgp;
6143 	myri10ge_cmd_t cmd;
6144 	int status, down, i;
6145 
6146 	mgp = ddi_get_driver_private(dip);
6147 	if (mgp == NULL)
6148 		return (DDI_FAILURE);
6149 
6150 	/* if devices was unplumbed, it is guaranteed to be quiescent */
6151 	if (mgp->running == MYRI10GE_ETH_STOPPED)
6152 		return (DDI_SUCCESS);
6153 
6154 	/* send a down CMD to queuesce NIC */
6155 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
6156 	if (status) {
6157 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
6158 		return (DDI_FAILURE);
6159 	}
6160 
6161 	for (i = 0; i < 20; i++) {
6162 		down = myri10ge_poll_down(mgp);
6163 		if (down)
6164 			break;
6165 		delay(drv_usectohz(100000));
6166 		mb();
6167 	}
6168 	if (down)
6169 		return (DDI_SUCCESS);
6170 	return (DDI_FAILURE);
6171 }
6172 
6173 /*
6174  * Distinguish between allocb'ed blocks, and gesballoc'ed attached
6175  * storage.
6176  */
6177 static void
6178 myri10ge_find_lastfree(void)
6179 {
6180 	mblk_t *mp = allocb(1024, 0);
6181 	dblk_t *dbp;
6182 
6183 	if (mp == NULL) {
6184 		cmn_err(CE_WARN, "myri10ge_find_lastfree failed\n");
6185 		return;
6186 	}
6187 	dbp = mp->b_datap;
6188 	myri10ge_db_lastfree = (void *)dbp->db_lastfree;
6189 }
6190 
6191 int
6192 _init(void)
6193 {
6194 	int i;
6195 
6196 	if (myri10ge_verbose)
6197 		cmn_err(CE_NOTE,
6198 		    "Myricom 10G driver (10GbE) version %s loading\n",
6199 		    MYRI10GE_VERSION_STR);
6200 	myri10ge_find_lastfree();
6201 	mac_init_ops(&myri10ge_ops, "myri10ge");
6202 	mutex_init(&myri10ge_param_lock, NULL, MUTEX_DEFAULT, NULL);
6203 	if ((i = mod_install(&modlinkage)) != 0) {
6204 		cmn_err(CE_WARN, "mod_install returned %d\n", i);
6205 		mac_fini_ops(&myri10ge_ops);
6206 		mutex_destroy(&myri10ge_param_lock);
6207 	}
6208 	return (i);
6209 }
6210 
6211 int
6212 _fini(void)
6213 {
6214 	int i;
6215 	i = mod_remove(&modlinkage);
6216 	if (i != 0) {
6217 		return (i);
6218 	}
6219 	mac_fini_ops(&myri10ge_ops);
6220 	mutex_destroy(&myri10ge_param_lock);
6221 	return (0);
6222 }
6223 
6224 int
6225 _info(struct modinfo *modinfop)
6226 {
6227 	return (mod_info(&modlinkage, modinfop));
6228 }
6229 
6230 
6231 /*
6232  *  This file uses MyriGE driver indentation.
6233  *
6234  * Local Variables:
6235  * c-file-style:"sun"
6236  * tab-width:8
6237  * End:
6238  */
6239