xref: /titanic_50/usr/src/uts/common/io/myri10ge/drv/myri10ge.c (revision de572d98af8238405c5d1292a788b1a85b0c68eb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
32 /*
33  * Copyright (c) 2014, Joyent, Inc.
34  */
35 
/* Version identification string (RCS $Id$ keyword); omitted under lint. */
#ifndef	lint
static const char __idstring[] =
	"@(#)$Id: myri10ge.c,v 1.186 2009-06-29 13:47:22 gallatin Exp $";
#endif
40 
41 #define	MXGEFW_NDIS
42 #include "myri10ge_var.h"
43 #include "rss_eth_z8e.h"
44 #include "rss_ethp_z8e.h"
45 #include "mcp_gen_header.h"
46 
/* MTU limits. */
#define	MYRI10GE_MAX_ETHER_MTU 9014
#define	MYRI10GE_MAX_GLD_MTU	9000
#define	MYRI10GE_MIN_GLD_MTU	1500

/*
 * Interface state values.
 * NOTE(review): the code that moves between these states is outside
 * this part of the file; meanings are as suggested by the names.
 */
#define	MYRI10GE_ETH_STOPPED 0
#define	MYRI10GE_ETH_STOPPING 1
#define	MYRI10GE_ETH_STARTING 2
#define	MYRI10GE_ETH_RUNNING 3
#define	MYRI10GE_ETH_OPEN_FAILED 4
#define	MYRI10GE_ETH_SUSPENDED_RUNNING 5
57 
/*
 * Driver tunables and file-scope state.
 * NOTE(review): most of these are consumed outside this part of the
 * file; only the uses noted below are visible here.
 */
/* size of a small receive buffer; MXGEFW_PAD is added when carving */
static int myri10ge_small_bytes = 510;
static int myri10ge_intr_coal_delay = 125;
static int myri10ge_flow_control = 1;
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static int myri10ge_nvidia_ecrc_enable = 1;
#endif
static int myri10ge_mtu_override = 0;
static int myri10ge_tx_copylen = 512;
static int myri10ge_deassert_wait = 1;
/* non-zero enables extra printf() output, e.g. the parsed MAC string */
static int myri10ge_verbose = 0;
static int myri10ge_watchdog_reset = 0;
static int myri10ge_use_msix = 1;
static int myri10ge_max_slices = -1;
static int myri10ge_use_msi = 1;
int myri10ge_force_firmware = 0;
static boolean_t myri10ge_use_lso = B_TRUE;
static int myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int myri10ge_tx_hash = 1;
static int myri10ge_lro = 0;
static int myri10ge_lro_cnt = 8;
int myri10ge_lro_max_aggr = 2;
static int myri10ge_lso_copy = 0;
static mblk_t *myri10ge_send_wrapper(void *arg, mblk_t *mp);
/* number of TX DMA handles pre-allocated by myri10ge_prepare_tx_ring() */
int myri10ge_tx_handles_initial = 128;

static 	kmutex_t myri10ge_param_lock;
static void* myri10ge_db_lastfree;
85 
/* Autoconfiguration entry points, defined later in this file. */
static int myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int myri10ge_quiesce(dev_info_t *dip);

/* Device operations structure wiring up attach, detach and quiesce. */
DDI_DEFINE_STREAM_OPS(myri10ge_ops, nulldev, nulldev, myri10ge_attach,
    myri10ge_detach, nodev, NULL, D_MP, NULL, myri10ge_quiesce);


/* Loadable-module description of this driver. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Myricom 10G driver (10GbE)",
	&myri10ge_ops,
};


/* Module linkage: this module contains the single driver above. */
static struct modlinkage modlinkage = {
	MODREV_1,
	{&modldrv, NULL},
};

/* Ethernet broadcast address (all ones). */
unsigned char myri10ge_broadcastaddr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
107 
/*
 * DMA attributes requesting 4KB-aligned memory that binds to a single
 * cookie (scatter/gather list length of 1).
 * NOTE(review): the users of this attribute set are outside this part
 * of the file.
 */
static ddi_dma_attr_t myri10ge_misc_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	(uint64_t)0x7fffffff,		/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
122 
/*
 * The Myri10GE NIC has the following constraints on receive buffers:
 * 1) Buffers which cross a 4KB boundary must be aligned to 4KB
 * 2) Buffers which are not aligned to 4KB must not cross a 4KB boundary
 *
 * This attribute set always asks for 4KB alignment.  It is used by
 * myri10ge_add_jbuf() when myri10ge_mtu is 2048 or larger, and for the
 * TX copyblocks allocated in myri10ge_prepare_tx_ring().
 */

static ddi_dma_attr_t myri10ge_rx_jumbo_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
143 
/*
 * Attributes used by myri10ge_add_jbuf() when myri10ge_mtu is below
 * 2048.  On non-SPARC builds this starts out with 128-byte alignment
 * and a 0xfff segment limit; myri10ge_add_jbuf() rewrites
 * dma_attr_align to 4096 and dma_attr_seg to UINT64_MAX at run time if
 * a buffer crosses a 4KB boundary or an allocation fails, which is why
 * this structure is not const.
 */
static ddi_dma_attr_t myri10ge_rx_std_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
#if defined sparc64 || defined __sparcv9
	(uint64_t)4096,			/* alignment */
#else
	(uint64_t)0x80,			/* alignment */
#endif
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
#if defined sparc64 || defined __sparcv9
	UINT64_MAX,			/* maximum segment size */
#else
	(uint64_t)0xfff,		/* maximum segment size */
#endif
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
166 
/*
 * Attributes for the transmit DMA handles allocated in
 * myri10ge_add_tx_handle(): byte alignment and an effectively
 * unbounded scatter/gather list length.
 */
static ddi_dma_attr_t myri10ge_tx_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)1,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	INT32_MAX,			/* scatter/gather list length */
	1,				/* granularity */
	0			/* attribute flags */
};
181 
/*
 * WC selects the data-ordering attribute below: merging is allowed
 * everywhere except on SPARC, where strict ordering is requested.
 */
#if defined sparc64 || defined __sparcv9
#define	WC 0
#else
#define	WC 1
#endif

/* Device access attributes passed to the DMA memory allocations. */
struct ddi_device_acc_attr myri10ge_dev_access_attr = {
	DDI_DEVICE_ATTR_V0,		/* version */
	DDI_NEVERSWAP_ACC,		/* endian flags */
#if WC
	DDI_MERGING_OK_ACC		/* data order */
#else
	DDI_STRICTORDER_ACC
#endif
};
197 
static void myri10ge_watchdog(void *arg);

/*
 * myri10ge_mtu is the receive buffer size in bytes: it is the length
 * passed to ddi_dma_mem_alloc() for every jumbo-pool buffer.  The
 * default depends on whether MYRICOM_PRIV is defined.
 */
#ifdef MYRICOM_PRIV
int myri10ge_mtu = MYRI10GE_MAX_ETHER_MTU + MXGEFW_PAD + VLAN_TAGSZ;
#define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MAX_GLD_MTU
#else
int myri10ge_mtu = ETHERMAX + MXGEFW_PAD + VLAN_TAGSZ;
#define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MIN_GLD_MTU
#endif
/* NOTE(review): consumers of the two bigbuf counts are not visible here. */
int myri10ge_bigbufs_initial = 1024;
int myri10ge_bigbufs_max = 4096;
209 
210 
211 caddr_t
212 myri10ge_dma_alloc(dev_info_t *dip, size_t len,
213     ddi_dma_attr_t *attr, ddi_device_acc_attr_t  *accattr,
214     uint_t alloc_flags, int bind_flags, struct myri10ge_dma_stuff *dma,
215     int warn, int (*wait)(caddr_t))
216 {
217 	caddr_t  kaddr;
218 	size_t real_length;
219 	ddi_dma_cookie_t cookie;
220 	uint_t count;
221 	int err;
222 
223 	err = ddi_dma_alloc_handle(dip, attr, wait,
224 	    NULL, &dma->handle);
225 	if (err != DDI_SUCCESS) {
226 		if (warn)
227 			cmn_err(CE_WARN,
228 			    "myri10ge: ddi_dma_alloc_handle failed\n");
229 		goto abort_with_nothing;
230 	}
231 
232 	err = ddi_dma_mem_alloc(dma->handle, len, accattr, alloc_flags,
233 	    wait, NULL, &kaddr, &real_length,
234 	    &dma->acc_handle);
235 	if (err != DDI_SUCCESS) {
236 		if (warn)
237 			cmn_err(CE_WARN,
238 			    "myri10ge: ddi_dma_mem_alloc failed\n");
239 		goto abort_with_handle;
240 	}
241 
242 	err = ddi_dma_addr_bind_handle(dma->handle, NULL, kaddr, len,
243 	    bind_flags, wait, NULL, &cookie, &count);
244 
245 	if (err != DDI_SUCCESS) {
246 		if (warn)
247 			cmn_err(CE_WARN,
248 			    "myri10ge: ddi_dma_addr_bind_handle failed\n");
249 		goto abort_with_mem;
250 	}
251 
252 	if (count != 1) {
253 		if (warn)
254 			cmn_err(CE_WARN,
255 			    "myri10ge: got too many dma segments ");
256 		goto abort_with_bind;
257 	}
258 	dma->low = htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
259 	dma->high = htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
260 	return (kaddr);
261 
262 abort_with_bind:
263 	(void) ddi_dma_unbind_handle(dma->handle);
264 
265 abort_with_mem:
266 	ddi_dma_mem_free(&dma->acc_handle);
267 
268 abort_with_handle:
269 	ddi_dma_free_handle(&dma->handle);
270 abort_with_nothing:
271 	if (warn) {
272 		cmn_err(CE_WARN, "myri10ge: myri10ge_dma_alloc failed.\n  ");
273 		cmn_err(CE_WARN, "args: dip=%p len=0x%lx ddi_dma_attr=%p\n",
274 		    (void*) dip, len, (void*) attr);
275 		cmn_err(CE_WARN,
276 		    "args: ddi_device_acc_attr=%p  alloc_flags=0x%x\n",
277 		    (void*) accattr, alloc_flags);
278 		cmn_err(CE_WARN, "args: bind_flags=0x%x  dmastuff=%p",
279 		    bind_flags, (void*) dma);
280 	}
281 	return (NULL);
282 
283 }
284 
285 void
286 myri10ge_dma_free(struct myri10ge_dma_stuff *dma)
287 {
288 	(void) ddi_dma_unbind_handle(dma->handle);
289 	ddi_dma_mem_free(&dma->acc_handle);
290 	ddi_dma_free_handle(&dma->handle);
291 }
292 
/*
 * Copy size / 4 32-bit words from "from32" to "to", one volatile
 * store per word.  Any remainder of size modulo 4 is not copied.
 */
static inline void
myri10ge_pio_copy32(void *to, uint32_t *from32, size_t size)
{
	volatile uint32_t *dst = (volatile uint32_t *)to;
	size_t nwords = size / 4;

	while (nwords-- != 0)
		*dst++ = *from32++;
}
306 
#if defined(_LP64)
/*
 * Copy size / 8 64-bit words from "from64" to "to", one volatile
 * store per word.  Any remainder of size modulo 8 is not copied.
 * Only built for LP64.
 */
static inline void
myri10ge_pio_copy64(void *to, uint64_t *from64, size_t size)
{
	volatile uint64_t *dst = (volatile uint64_t *)to;
	size_t nwords = size / 8;

	while (nwords-- != 0)
		*dst++ = *from64++;
}
#endif
322 
323 /*
324  * This routine copies memory from the host to the NIC.
325  * The "size" argument must always be a multiple of
326  * the size of long (4 or 8 bytes), and to/from must also
327  * be naturally aligned.
328  */
329 static inline void
330 myri10ge_pio_copy(void *to, void *from, size_t size)
331 {
332 #if !defined(_LP64)
333 	ASSERT((size % 4) == 0);
334 	myri10ge_pio_copy32(to, (uint32_t *)from, size);
335 #else
336 	ASSERT((size % 8) == 0);
337 	myri10ge_pio_copy64(to, (uint64_t *)from, size);
338 #endif
339 }
340 
341 
342 /*
343  * Due to various bugs in Solaris (especially bug 6186772 where the
344  * TCP/UDP checksum is calculated incorrectly on mblk chains with more
345  * than two elements), and the design bug where hardware checksums are
346  * ignored on mblk chains with more than 2 elements, we need to
347  * allocate private pool of physically contiguous receive buffers.
348  */
349 
350 static void
351 myri10ge_jpool_init(struct myri10ge_slice_state *ss)
352 {
353 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
354 
355 	bzero(jpool, sizeof (*jpool));
356 	mutex_init(&jpool->mtx, NULL, MUTEX_DRIVER,
357 	    ss->mgp->icookie);
358 	jpool->head = NULL;
359 }
360 
361 static void
362 myri10ge_jpool_fini(struct myri10ge_slice_state *ss)
363 {
364 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
365 
366 	if (jpool->head != NULL) {
367 		cmn_err(CE_WARN,
368 		    "%s: BUG! myri10ge_jpool_fini called on non-empty pool\n",
369 		    ss->mgp->name);
370 	}
371 	mutex_destroy(&jpool->mtx);
372 }
373 
374 
/*
 * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's low
 * DMA address to mark it valid only after we write the entire chunk
 * in a burst
 *
 * Eight descriptors are submitted per call, as two copies of four.
 * The order of the stores and barriers below is significant; do not
 * rearrange them.
 */
static inline void
myri10ge_submit_8rx(mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src)
{
	/* set the low address bit of the first descriptor before copying */
	src->addr_low |= BE_32(1);
	/* first four descriptors */
	myri10ge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	/* remaining four descriptors */
	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address in the shadow copy ... */
	src->addr_low &= ~(BE_32(1));
	/* ... and write it to the NIC last, once the whole chunk is there */
	dst->addr_low = src->addr_low;
	mb();
}
394 
395 static void
396 myri10ge_pull_jpool(struct myri10ge_slice_state *ss)
397 {
398 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
399 	struct myri10ge_jpool_entry *jtail, *j, *jfree;
400 	volatile uintptr_t *putp;
401 	uintptr_t put;
402 	int i;
403 
404 	/* find tail */
405 	jtail = NULL;
406 	if (jpool->head != NULL) {
407 		j = jpool->head;
408 		while (j->next != NULL)
409 			j = j->next;
410 		jtail = j;
411 	}
412 
413 	/*
414 	 * iterate over all per-CPU caches, and add contents into
415 	 * jpool
416 	 */
417 	for (i = 0; i < MYRI10GE_MAX_CPUS; i++) {
418 		/* take per-CPU free list */
419 		putp = (void *)&jpool->cpu[i & MYRI10GE_MAX_CPU_MASK].head;
420 		if (*putp == NULL)
421 			continue;
422 		put = atomic_swap_ulong(putp, 0);
423 		jfree = (struct myri10ge_jpool_entry *)put;
424 
425 		/* append to pool */
426 		if (jtail == NULL) {
427 			jpool->head = jfree;
428 		} else {
429 			jtail->next = jfree;
430 		}
431 		j = jfree;
432 		while (j->next != NULL)
433 			j = j->next;
434 		jtail = j;
435 	}
436 }
437 
438 /*
439  * Transfers buffers from the free pool to the nic
440  * Must be called holding the jpool mutex.
441  */
442 
443 static inline void
444 myri10ge_restock_jumbos(struct myri10ge_slice_state *ss)
445 {
446 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
447 	struct myri10ge_jpool_entry *j;
448 	myri10ge_rx_ring_t *rx;
449 	int i, idx, limit;
450 
451 	rx = &ss->rx_big;
452 	limit = ss->j_rx_cnt + (rx->mask + 1);
453 
454 	for (i = rx->cnt; i != limit; i++) {
455 		idx = i & (rx->mask);
456 		j = jpool->head;
457 		if (j == NULL) {
458 			myri10ge_pull_jpool(ss);
459 			j = jpool->head;
460 			if (j == NULL) {
461 				break;
462 			}
463 		}
464 		jpool->head = j->next;
465 		rx->info[idx].j = j;
466 		rx->shadow[idx].addr_low = j->dma.low;
467 		rx->shadow[idx].addr_high = j->dma.high;
468 		/* copy 4 descriptors (32-bytes) to the mcp at a time */
469 		if ((idx & 7) == 7) {
470 			myri10ge_submit_8rx(&rx->lanai[idx - 7],
471 			    &rx->shadow[idx - 7]);
472 		}
473 	}
474 	rx->cnt = i;
475 }
476 
477 /*
478  * Transfer buffers from the nic to the free pool.
479  * Should be called holding the jpool mutex
480  */
481 
482 static inline void
483 myri10ge_unstock_jumbos(struct myri10ge_slice_state *ss)
484 {
485 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
486 	struct myri10ge_jpool_entry *j;
487 	myri10ge_rx_ring_t *rx;
488 	int i;
489 
490 	mutex_enter(&jpool->mtx);
491 	rx = &ss->rx_big;
492 
493 	for (i = 0; i < rx->mask + 1; i++) {
494 		j = rx->info[i].j;
495 		rx->info[i].j = NULL;
496 		if (j == NULL)
497 			continue;
498 		j->next = jpool->head;
499 		jpool->head = j;
500 	}
501 	mutex_exit(&jpool->mtx);
502 
503 }
504 
505 
506 /*
507  * Free routine which is called when the mblk allocated via
508  * esballoc() is freed.   Here we return the jumbo buffer
509  * to the free pool, and possibly pass some jumbo buffers
510  * to the nic
511  */
512 
513 static void
514 myri10ge_jfree_rtn(void *arg)
515 {
516 	struct myri10ge_jpool_entry *j = (struct myri10ge_jpool_entry *)arg;
517 	struct myri10ge_jpool_stuff *jpool;
518 	volatile uintptr_t *putp;
519 	uintptr_t old, new;
520 
521 	jpool = &j->ss->jpool;
522 
523 	/* prepend buffer locklessly to per-CPU freelist */
524 	putp = (void *)&jpool->cpu[CPU->cpu_seqid & MYRI10GE_MAX_CPU_MASK].head;
525 	new = (uintptr_t)j;
526 	do {
527 		old = *putp;
528 		j->next = (void *)old;
529 	} while (atomic_cas_ulong(putp, old, new) != old);
530 }
531 
532 static void
533 myri10ge_remove_jbuf(struct myri10ge_jpool_entry *j)
534 {
535 	(void) ddi_dma_unbind_handle(j->dma_handle);
536 	ddi_dma_mem_free(&j->acc_handle);
537 	ddi_dma_free_handle(&j->dma_handle);
538 	kmem_free(j, sizeof (*j));
539 }
540 
541 
542 /*
543  * Allocates one physically contiguous descriptor
544  * and add it to the jumbo buffer pool.
545  */
546 
547 static int
548 myri10ge_add_jbuf(struct myri10ge_slice_state *ss)
549 {
550 	struct myri10ge_jpool_entry *j;
551 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
552 	ddi_dma_attr_t *rx_dma_attr;
553 	size_t real_length;
554 	ddi_dma_cookie_t cookie;
555 	uint_t count;
556 	int err;
557 
558 	if (myri10ge_mtu < 2048)
559 		rx_dma_attr = &myri10ge_rx_std_dma_attr;
560 	else
561 		rx_dma_attr = &myri10ge_rx_jumbo_dma_attr;
562 
563 again:
564 	j = (struct myri10ge_jpool_entry *)
565 	    kmem_alloc(sizeof (*j), KM_SLEEP);
566 	err = ddi_dma_alloc_handle(ss->mgp->dip, rx_dma_attr,
567 	    DDI_DMA_DONTWAIT, NULL, &j->dma_handle);
568 	if (err != DDI_SUCCESS)
569 		goto abort_with_j;
570 
571 	err = ddi_dma_mem_alloc(j->dma_handle, myri10ge_mtu,
572 	    &myri10ge_dev_access_attr,  DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
573 	    NULL, &j->buf, &real_length, &j->acc_handle);
574 	if (err != DDI_SUCCESS)
575 		goto abort_with_handle;
576 
577 	err = ddi_dma_addr_bind_handle(j->dma_handle, NULL, j->buf,
578 	    real_length, DDI_DMA_READ|DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
579 	    NULL, &cookie, &count);
580 	if (err != DDI_SUCCESS)
581 		goto abort_with_mem;
582 
583 	/*
584 	 * Make certain std MTU buffers do not cross a 4KB boundary:
585 	 *
586 	 * Setting dma_attr_align=4096 will do this, but the system
587 	 * will only allocate 1 RX buffer per 4KB page, rather than 2.
588 	 * Setting dma_attr_granular=4096 *seems* to work around this,
589 	 * but I'm paranoid about future systems no longer honoring
590 	 * this, so fall back to the safe, but memory wasting way if a
591 	 * buffer crosses a 4KB boundary.
592 	 */
593 
594 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
595 	    rx_dma_attr->dma_attr_align != 4096) {
596 		uint32_t start, end;
597 
598 		start = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
599 		end = start + myri10ge_mtu;
600 		if (((end >> 12) != (start >> 12)) && (start & 4095U)) {
601 			printf("std buffer crossed a 4KB boundary!\n");
602 			myri10ge_remove_jbuf(j);
603 			rx_dma_attr->dma_attr_align = 4096;
604 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
605 			goto again;
606 		}
607 	}
608 
609 	j->dma.low =
610 	    htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
611 	j->dma.high =
612 	    htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
613 	j->ss = ss;
614 
615 
616 	j->free_func.free_func = myri10ge_jfree_rtn;
617 	j->free_func.free_arg = (char *)j;
618 	mutex_enter(&jpool->mtx);
619 	j->next = jpool->head;
620 	jpool->head = j;
621 	jpool->num_alloc++;
622 	mutex_exit(&jpool->mtx);
623 	return (0);
624 
625 abort_with_mem:
626 	ddi_dma_mem_free(&j->acc_handle);
627 
628 abort_with_handle:
629 	ddi_dma_free_handle(&j->dma_handle);
630 
631 abort_with_j:
632 	kmem_free(j, sizeof (*j));
633 
634 	/*
635 	 * If an allocation failed, perhaps it failed because it could
636 	 * not satisfy granularity requirement.  Disable that, and
637 	 * try agin.
638 	 */
639 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
640 	    rx_dma_attr->dma_attr_align != 4096) {
641 			cmn_err(CE_NOTE,
642 			    "!alloc failed, reverting to gran=1\n");
643 			rx_dma_attr->dma_attr_align = 4096;
644 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
645 			goto again;
646 	}
647 	return (err);
648 }
649 
650 static int
651 myri10ge_jfree_cnt(struct myri10ge_jpool_stuff *jpool)
652 {
653 	int i;
654 	struct myri10ge_jpool_entry *j;
655 
656 	mutex_enter(&jpool->mtx);
657 	j = jpool->head;
658 	i = 0;
659 	while (j != NULL) {
660 		i++;
661 		j = j->next;
662 	}
663 	mutex_exit(&jpool->mtx);
664 	return (i);
665 }
666 
667 static int
668 myri10ge_add_jbufs(struct myri10ge_slice_state *ss, int num, int total)
669 {
670 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
671 	int allocated = 0;
672 	int err;
673 	int needed;
674 
675 	/*
676 	 * if total is set, user wants "num" jbufs in the pool,
677 	 * otherwise the user wants to "num" additional jbufs
678 	 * added to the pool
679 	 */
680 	if (total && jpool->num_alloc) {
681 		allocated = myri10ge_jfree_cnt(jpool);
682 		needed = num - allocated;
683 	} else {
684 		needed = num;
685 	}
686 
687 	while (needed > 0) {
688 		needed--;
689 		err = myri10ge_add_jbuf(ss);
690 		if (err == 0) {
691 			allocated++;
692 		}
693 	}
694 	return (allocated);
695 }
696 
697 static void
698 myri10ge_remove_jbufs(struct myri10ge_slice_state *ss)
699 {
700 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
701 	struct myri10ge_jpool_entry *j;
702 
703 	mutex_enter(&jpool->mtx);
704 	myri10ge_pull_jpool(ss);
705 	while (jpool->head != NULL) {
706 		jpool->num_alloc--;
707 		j = jpool->head;
708 		jpool->head = j->next;
709 		myri10ge_remove_jbuf(j);
710 	}
711 	mutex_exit(&jpool->mtx);
712 }
713 
714 static void
715 myri10ge_carve_up_jbufs_into_small_ring(struct myri10ge_slice_state *ss)
716 {
717 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
718 	struct myri10ge_jpool_entry *j = NULL;
719 	caddr_t ptr;
720 	uint32_t dma_low, dma_high;
721 	int idx, len;
722 	unsigned int alloc_size;
723 
724 	dma_low = dma_high = len = 0;
725 	alloc_size = myri10ge_small_bytes + MXGEFW_PAD;
726 	ptr = NULL;
727 	for (idx = 0; idx < ss->rx_small.mask + 1; idx++) {
728 		/* Allocate a jumbo frame and carve it into small frames */
729 		if (len < alloc_size) {
730 			mutex_enter(&jpool->mtx);
731 			/* remove jumbo from freelist */
732 			j = jpool->head;
733 			jpool->head = j->next;
734 			/* place it onto small list */
735 			j->next = ss->small_jpool;
736 			ss->small_jpool = j;
737 			mutex_exit(&jpool->mtx);
738 			len = myri10ge_mtu;
739 			dma_low = ntohl(j->dma.low);
740 			dma_high = ntohl(j->dma.high);
741 			ptr = j->buf;
742 		}
743 		ss->rx_small.info[idx].ptr = ptr;
744 		ss->rx_small.shadow[idx].addr_low = htonl(dma_low);
745 		ss->rx_small.shadow[idx].addr_high = htonl(dma_high);
746 		len -= alloc_size;
747 		ptr += alloc_size;
748 		dma_low += alloc_size;
749 	}
750 }
751 
752 /*
753  * Return the jumbo bufs we carved up for small to the jumbo pool
754  */
755 
756 static void
757 myri10ge_release_small_jbufs(struct myri10ge_slice_state *ss)
758 {
759 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
760 	struct myri10ge_jpool_entry *j = NULL;
761 
762 	mutex_enter(&jpool->mtx);
763 	while (ss->small_jpool != NULL) {
764 		j = ss->small_jpool;
765 		ss->small_jpool = j->next;
766 		j->next = jpool->head;
767 		jpool->head = j;
768 	}
769 	mutex_exit(&jpool->mtx);
770 	ss->jbufs_for_smalls = 0;
771 }
772 
773 static int
774 myri10ge_add_tx_handle(struct myri10ge_slice_state *ss)
775 {
776 	myri10ge_tx_ring_t *tx = &ss->tx;
777 	struct myri10ge_priv *mgp = ss->mgp;
778 	struct myri10ge_tx_dma_handle *handle;
779 	int err;
780 
781 	handle = kmem_zalloc(sizeof (*handle), KM_SLEEP);
782 	err = ddi_dma_alloc_handle(mgp->dip,
783 	    &myri10ge_tx_dma_attr,
784 	    DDI_DMA_SLEEP, NULL,
785 	    &handle->h);
786 	if (err) {
787 		static int limit = 0;
788 		if (limit == 0)
789 			cmn_err(CE_WARN, "%s: Falled to alloc tx dma handle\n",
790 			    mgp->name);
791 		limit++;
792 		kmem_free(handle, sizeof (*handle));
793 		return (err);
794 	}
795 	mutex_enter(&tx->handle_lock);
796 	MYRI10GE_SLICE_STAT_INC(tx_handles_alloced);
797 	handle->next = tx->free_tx_handles;
798 	tx->free_tx_handles = handle;
799 	mutex_exit(&tx->handle_lock);
800 	return (DDI_SUCCESS);
801 }
802 
803 static void
804 myri10ge_remove_tx_handles(struct myri10ge_slice_state *ss)
805 {
806 	myri10ge_tx_ring_t *tx = &ss->tx;
807 	struct myri10ge_tx_dma_handle *handle;
808 	mutex_enter(&tx->handle_lock);
809 
810 	handle = tx->free_tx_handles;
811 	while (handle != NULL) {
812 		tx->free_tx_handles = handle->next;
813 		ddi_dma_free_handle(&handle->h);
814 		kmem_free(handle, sizeof (*handle));
815 		handle = tx->free_tx_handles;
816 		MYRI10GE_SLICE_STAT_DEC(tx_handles_alloced);
817 	}
818 	mutex_exit(&tx->handle_lock);
819 	if (MYRI10GE_SLICE_STAT(tx_handles_alloced) != 0) {
820 		cmn_err(CE_WARN, "%s: %d tx dma handles allocated at close\n",
821 		    ss->mgp->name,
822 		    (int)MYRI10GE_SLICE_STAT(tx_handles_alloced));
823 	}
824 }
825 
826 static void
827 myri10ge_free_tx_handles(myri10ge_tx_ring_t *tx,
828     struct myri10ge_tx_dma_handle_head *list)
829 {
830 	mutex_enter(&tx->handle_lock);
831 	list->tail->next = tx->free_tx_handles;
832 	tx->free_tx_handles = list->head;
833 	mutex_exit(&tx->handle_lock);
834 }
835 
836 static void
837 myri10ge_free_tx_handle_slist(myri10ge_tx_ring_t *tx,
838     struct myri10ge_tx_dma_handle *handle)
839 {
840 	struct myri10ge_tx_dma_handle_head list;
841 
842 	if (handle == NULL)
843 		return;
844 	list.head = handle;
845 	list.tail = handle;
846 	while (handle != NULL) {
847 		list.tail = handle;
848 		handle = handle->next;
849 	}
850 	myri10ge_free_tx_handles(tx, &list);
851 }
852 
853 static int
854 myri10ge_alloc_tx_handles(struct myri10ge_slice_state *ss, int count,
855     struct myri10ge_tx_dma_handle **ret)
856 {
857 	myri10ge_tx_ring_t *tx = &ss->tx;
858 	struct myri10ge_tx_dma_handle *handle;
859 	int err, i;
860 
861 	mutex_enter(&tx->handle_lock);
862 	for (i = 0; i < count; i++) {
863 		handle = tx->free_tx_handles;
864 		while (handle == NULL) {
865 			mutex_exit(&tx->handle_lock);
866 			err = myri10ge_add_tx_handle(ss);
867 			if (err != DDI_SUCCESS) {
868 				goto abort_with_handles;
869 			}
870 			mutex_enter(&tx->handle_lock);
871 			handle = tx->free_tx_handles;
872 		}
873 		tx->free_tx_handles = handle->next;
874 		handle->next = *ret;
875 		*ret = handle;
876 	}
877 	mutex_exit(&tx->handle_lock);
878 	return (DDI_SUCCESS);
879 
880 abort_with_handles:
881 	myri10ge_free_tx_handle_slist(tx, *ret);
882 	return (err);
883 }
884 
885 
886 /*
887  * Frees DMA resources associated with the send ring
888  */
889 static void
890 myri10ge_unprepare_tx_ring(struct myri10ge_slice_state *ss)
891 {
892 	myri10ge_tx_ring_t *tx;
893 	struct myri10ge_tx_dma_handle_head handles;
894 	size_t bytes;
895 	int idx;
896 
897 	tx = &ss->tx;
898 	handles.head = NULL;
899 	handles.tail = NULL;
900 	for (idx = 0; idx < ss->tx.mask + 1; idx++) {
901 		if (tx->info[idx].m) {
902 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
903 			handles.head = tx->info[idx].handle;
904 			if (handles.tail == NULL)
905 				handles.tail = tx->info[idx].handle;
906 			freeb(tx->info[idx].m);
907 			tx->info[idx].m = 0;
908 			tx->info[idx].handle = 0;
909 		}
910 		tx->cp[idx].va = NULL;
911 		myri10ge_dma_free(&tx->cp[idx].dma);
912 	}
913 	bytes = sizeof (*tx->cp) * (tx->mask + 1);
914 	kmem_free(tx->cp, bytes);
915 	tx->cp = NULL;
916 	if (handles.head != NULL)
917 		myri10ge_free_tx_handles(tx, &handles);
918 	myri10ge_remove_tx_handles(ss);
919 }
920 
921 /*
922  * Allocates DMA handles associated with the send ring
923  */
924 static inline int
925 myri10ge_prepare_tx_ring(struct myri10ge_slice_state *ss)
926 {
927 	struct myri10ge_tx_dma_handle *handles;
928 	int h;
929 	size_t bytes;
930 
931 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
932 	ss->tx.cp = kmem_zalloc(bytes, KM_SLEEP);
933 	if (ss->tx.cp == NULL) {
934 		cmn_err(CE_WARN,
935 		    "%s: Failed to allocate tx copyblock storage\n",
936 		    ss->mgp->name);
937 		return (DDI_FAILURE);
938 	}
939 
940 
941 	/* allocate the TX copyblocks */
942 	for (h = 0; h < ss->tx.mask + 1; h++) {
943 		ss->tx.cp[h].va = myri10ge_dma_alloc(ss->mgp->dip,
944 		    4096, &myri10ge_rx_jumbo_dma_attr,
945 		    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
946 		    DDI_DMA_WRITE|DDI_DMA_STREAMING, &ss->tx.cp[h].dma, 1,
947 		    DDI_DMA_DONTWAIT);
948 		if (ss->tx.cp[h].va == NULL) {
949 			cmn_err(CE_WARN, "%s: Failed to allocate tx "
950 			    "copyblock %d\n", ss->mgp->name, h);
951 			goto abort_with_copyblocks;
952 		}
953 	}
954 	/* pre-allocate transmit handles */
955 	handles = NULL;
956 	(void) myri10ge_alloc_tx_handles(ss, myri10ge_tx_handles_initial,
957 	    &handles);
958 	if (handles != NULL)
959 		myri10ge_free_tx_handle_slist(&ss->tx, handles);
960 
961 	return (DDI_SUCCESS);
962 
963 abort_with_copyblocks:
964 	while (h > 0)  {
965 		h--;
966 		myri10ge_dma_free(&ss->tx.cp[h].dma);
967 	}
968 
969 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
970 	kmem_free(ss->tx.cp, bytes);
971 	ss->tx.cp = NULL;
972 	return (DDI_FAILURE);
973 }
974 
975 /*
976  * The eeprom strings on the lanaiX have the format
977  * SN=x\0
978  * MAC=x:x:x:x:x:x\0
979  * PT:ddd mmm xx xx:xx:xx xx\0
980  * PV:ddd mmm xx xx:xx:xx xx\0
981  */
982 static int
983 myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
984 {
985 #define	MYRI10GE_NEXT_STRING(p) while (ptr < limit && *ptr++)
986 #define	myri10ge_digit(c) (((c) >= '0' && (c) <= '9') ? ((c) - '0') :	\
987 		(((c) >= 'A' && (c) <= 'F') ? (10 + (c) - 'A') :	\
988 		(((c) >= 'a' && (c) <= 'f') ? (10 + (c) - 'a') : -1)))
989 
990 	char *ptr, *limit;
991 	int i, hv, lv;
992 
993 	ptr = mgp->eeprom_strings;
994 	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
995 
996 	while (*ptr != '\0' && ptr < limit) {
997 		if (memcmp(ptr, "MAC=", 4) == 0) {
998 			ptr += 4;
999 			if (myri10ge_verbose)
1000 				printf("%s: mac address = %s\n", mgp->name,
1001 				    ptr);
1002 			mgp->mac_addr_string = ptr;
1003 			for (i = 0; i < 6; i++) {
1004 				if ((ptr + 2) > limit)
1005 					goto abort;
1006 
1007 				if (*(ptr+1) == ':') {
1008 					hv = 0;
1009 					lv = myri10ge_digit(*ptr); ptr++;
1010 				} else {
1011 					hv = myri10ge_digit(*ptr); ptr++;
1012 					lv = myri10ge_digit(*ptr); ptr++;
1013 				}
1014 				mgp->mac_addr[i] = (hv << 4) | lv;
1015 				ptr++;
1016 			}
1017 		}
1018 		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
1019 			ptr += 3;
1020 			mgp->sn_str = (char *)ptr;
1021 		}
1022 		if (memcmp((const void *)ptr, "PC=", 3) == 0) {
1023 			ptr += 3;
1024 			mgp->pc_str = (char *)ptr;
1025 		}
1026 		MYRI10GE_NEXT_STRING(ptr);
1027 	}
1028 
1029 	return (0);
1030 
1031 abort:
1032 	cmn_err(CE_WARN, "%s: failed to parse eeprom_strings", mgp->name);
1033 	return (ENXIO);
1034 }
1035 
1036 
1037 /*
1038  * Determine the register set containing the PCI resource we
1039  * want to map: the memory-mappable part of the interface. We do
1040  * this by scanning the DDI "reg" property of the interface,
1041  * which is an array of mx_ddi_reg_set structures.
1042  */
1043 static int
1044 myri10ge_reg_set(dev_info_t *dip, int *reg_set, int *span,
1045     unsigned long *busno, unsigned long *devno,
1046     unsigned long *funcno)
1047 {
1048 
1049 #define	REGISTER_NUMBER(ip)	(ip[0] >>  0 & 0xff)
1050 #define	FUNCTION_NUMBER(ip)	(ip[0] >>  8 & 0x07)
1051 #define	DEVICE_NUMBER(ip)	(ip[0] >> 11 & 0x1f)
1052 #define	BUS_NUMBER(ip)		(ip[0] >> 16 & 0xff)
1053 #define	ADDRESS_SPACE(ip)	(ip[0] >> 24 & 0x03)
1054 #define	PCI_ADDR_HIGH(ip)	(ip[1])
1055 #define	PCI_ADDR_LOW(ip) 	(ip[2])
1056 #define	PCI_SPAN_HIGH(ip)	(ip[3])
1057 #define	PCI_SPAN_LOW(ip)	(ip[4])
1058 
1059 #define	MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
1060 #define	MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3
1061 
1062 	int *data, i, *rs;
1063 	uint32_t nelementsp;
1064 
1065 #ifdef MYRI10GE_REGSET_VERBOSE
1066 	char *address_space_name[] = { "Configuration Space",
1067 					"I/O Space",
1068 					"32-bit Memory Space",
1069 					"64-bit Memory Space"
1070 	};
1071 #endif
1072 
1073 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1074 	    "reg", &data, &nelementsp) != DDI_SUCCESS) {
1075 		printf("Could not determine register set.\n");
1076 		return (ENXIO);
1077 	}
1078 
1079 #ifdef MYRI10GE_REGSET_VERBOSE
1080 	printf("There are %d register sets.\n", nelementsp / 5);
1081 #endif
1082 	if (!nelementsp) {
1083 		printf("Didn't find any \"reg\" properties.\n");
1084 		ddi_prop_free(data);
1085 		return (ENODEV);
1086 	}
1087 
1088 	/* Scan for the register number. */
1089 	rs = &data[0];
1090 	*busno = BUS_NUMBER(rs);
1091 	*devno = DEVICE_NUMBER(rs);
1092 	*funcno = FUNCTION_NUMBER(rs);
1093 
1094 #ifdef MYRI10GE_REGSET_VERBOSE
1095 	printf("*** Scanning for register number.\n");
1096 #endif
1097 	for (i = 0; i < nelementsp / 5; i++) {
1098 		rs = &data[5 * i];
1099 #ifdef MYRI10GE_REGSET_VERBOSE
1100 		printf("Examining register set %d:\n", i);
1101 		printf("  Register number = %d.\n", REGISTER_NUMBER(rs));
1102 		printf("  Function number = %d.\n", FUNCTION_NUMBER(rs));
1103 		printf("  Device number   = %d.\n", DEVICE_NUMBER(rs));
1104 		printf("  Bus number      = %d.\n", BUS_NUMBER(rs));
1105 		printf("  Address space   = %d (%s ).\n", ADDRESS_SPACE(rs),
1106 		    address_space_name[ADDRESS_SPACE(rs)]);
1107 		printf("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH(rs),
1108 		    PCI_ADDR_LOW(rs));
1109 		printf("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH(rs),
1110 		    PCI_SPAN_LOW(rs));
1111 #endif
1112 		/* We are looking for a memory property. */
1113 
1114 		if (ADDRESS_SPACE(rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE ||
1115 		    ADDRESS_SPACE(rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
1116 			*reg_set = i;
1117 
1118 #ifdef MYRI10GE_REGSET_VERBOSE
1119 			printf("%s uses register set %d.\n",
1120 			    address_space_name[ADDRESS_SPACE(rs)], *reg_set);
1121 #endif
1122 
1123 			*span = (PCI_SPAN_LOW(rs));
1124 #ifdef MYRI10GE_REGSET_VERBOSE
1125 			printf("Board span is 0x%x\n", *span);
1126 #endif
1127 			break;
1128 		}
1129 	}
1130 
1131 	ddi_prop_free(data);
1132 
1133 	/* If no match, fail. */
1134 	if (i >= nelementsp / 5) {
1135 		return (EIO);
1136 	}
1137 
1138 	return (0);
1139 }
1140 
1141 
1142 static int
1143 myri10ge_load_firmware_from_zlib(struct myri10ge_priv *mgp, uint32_t *limit)
1144 {
1145 	void *inflate_buffer;
1146 	int rv, status;
1147 	size_t sram_size = mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE;
1148 	size_t destlen;
1149 	mcp_gen_header_t *hdr;
1150 	unsigned hdr_offset, i;
1151 
1152 
1153 	*limit = 0; /* -Wuninitialized */
1154 	status = 0;
1155 
1156 	inflate_buffer = kmem_zalloc(sram_size, KM_NOSLEEP);
1157 	if (!inflate_buffer) {
1158 		cmn_err(CE_WARN,
1159 		    "%s: Could not allocate buffer to inflate mcp\n",
1160 		    mgp->name);
1161 		return (ENOMEM);
1162 	}
1163 
1164 	destlen = sram_size;
1165 	rv = z_uncompress(inflate_buffer, &destlen, mgp->eth_z8e,
1166 	    mgp->eth_z8e_length);
1167 
1168 	if (rv != Z_OK) {
1169 		cmn_err(CE_WARN, "%s: Could not inflate mcp: %s\n",
1170 		    mgp->name, z_strerror(rv));
1171 		status = ENXIO;
1172 		goto abort;
1173 	}
1174 
1175 	*limit = (uint32_t)destlen;
1176 
1177 	hdr_offset = htonl(*(uint32_t *)(void *)((char *)inflate_buffer +
1178 	    MCP_HEADER_PTR_OFFSET));
1179 	hdr = (void *)((char *)inflate_buffer + hdr_offset);
1180 	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
1181 		cmn_err(CE_WARN, "%s: Bad firmware type: 0x%x\n", mgp->name,
1182 		    ntohl(hdr->mcp_type));
1183 		status = EIO;
1184 		goto abort;
1185 	}
1186 
1187 	/* save firmware version for kstat */
1188 	(void) strncpy(mgp->fw_version, hdr->version, sizeof (mgp->fw_version));
1189 	if (myri10ge_verbose)
1190 		printf("%s: firmware id: %s\n", mgp->name, hdr->version);
1191 
1192 	/* Copy the inflated firmware to NIC SRAM. */
1193 	for (i = 0; i < *limit; i += 256) {
1194 		myri10ge_pio_copy((char *)mgp->sram + MYRI10GE_FW_OFFSET + i,
1195 		    (char *)inflate_buffer + i,
1196 		    min(256U, (unsigned)(*limit - i)));
1197 		mb();
1198 		(void) *(int *)(void *)mgp->sram;
1199 		mb();
1200 	}
1201 
1202 abort:
1203 	kmem_free(inflate_buffer, sram_size);
1204 
1205 	return (status);
1206 
1207 }
1208 
1209 
1210 int
1211 myri10ge_send_cmd(struct myri10ge_priv *mgp, uint32_t cmd,
1212 		myri10ge_cmd_t *data)
1213 {
1214 	mcp_cmd_t *buf;
1215 	char buf_bytes[sizeof (*buf) + 8];
1216 	volatile mcp_cmd_response_t *response = mgp->cmd;
1217 	volatile char *cmd_addr =
1218 	    (volatile char *)mgp->sram + MXGEFW_ETH_CMD;
1219 	int sleep_total = 0;
1220 
1221 	/* ensure buf is aligned to 8 bytes */
1222 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1223 
1224 	buf->data0 = htonl(data->data0);
1225 	buf->data1 = htonl(data->data1);
1226 	buf->data2 = htonl(data->data2);
1227 	buf->cmd = htonl(cmd);
1228 	buf->response_addr.low = mgp->cmd_dma.low;
1229 	buf->response_addr.high = mgp->cmd_dma.high;
1230 	mutex_enter(&mgp->cmd_lock);
1231 	response->result = 0xffffffff;
1232 	mb();
1233 
1234 	myri10ge_pio_copy((void *)cmd_addr, buf, sizeof (*buf));
1235 
1236 	/* wait up to 20ms */
1237 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
1238 		mb();
1239 		if (response->result != 0xffffffff) {
1240 			if (response->result == 0) {
1241 				data->data0 = ntohl(response->data);
1242 				mutex_exit(&mgp->cmd_lock);
1243 				return (0);
1244 			} else if (ntohl(response->result)
1245 			    == MXGEFW_CMD_UNKNOWN) {
1246 				mutex_exit(&mgp->cmd_lock);
1247 				return (ENOSYS);
1248 			} else if (ntohl(response->result)
1249 			    == MXGEFW_CMD_ERROR_UNALIGNED) {
1250 				mutex_exit(&mgp->cmd_lock);
1251 				return (E2BIG);
1252 			} else {
1253 				cmn_err(CE_WARN,
1254 				    "%s: command %d failed, result = %d\n",
1255 				    mgp->name, cmd, ntohl(response->result));
1256 				mutex_exit(&mgp->cmd_lock);
1257 				return (ENXIO);
1258 			}
1259 		}
1260 		drv_usecwait(1000);
1261 	}
1262 	mutex_exit(&mgp->cmd_lock);
1263 	cmn_err(CE_WARN, "%s: command %d timed out, result = %d\n",
1264 	    mgp->name, cmd, ntohl(response->result));
1265 	return (EAGAIN);
1266 }
1267 
1268 /*
1269  * Enable or disable periodic RDMAs from the host to make certain
1270  * chipsets resend dropped PCIe messages
1271  */
1272 
1273 static void
1274 myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
1275 {
1276 	char buf_bytes[72];
1277 	volatile uint32_t *confirm;
1278 	volatile char *submit;
1279 	uint32_t *buf;
1280 	int i;
1281 
1282 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1283 
1284 	/* clear confirmation addr */
1285 	confirm = (volatile uint32_t *)mgp->cmd;
1286 	*confirm = 0;
1287 	mb();
1288 
1289 	/*
1290 	 * send an rdma command to the PCIe engine, and wait for the
1291 	 * response in the confirmation address.  The firmware should
1292 	 *  write a -1 there to indicate it is alive and well
1293 	 */
1294 
1295 	buf[0] = mgp->cmd_dma.high;		/* confirm addr MSW */
1296 	buf[1] = mgp->cmd_dma.low;		/* confirm addr LSW */
1297 	buf[2] = htonl(0xffffffff);		/* confirm data */
1298 	buf[3] = htonl(mgp->cmd_dma.high); 	/* dummy addr MSW */
1299 	buf[4] = htonl(mgp->cmd_dma.low); 	/* dummy addr LSW */
1300 	buf[5] = htonl(enable);			/* enable? */
1301 
1302 
1303 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_DUMMY_RDMA);
1304 
1305 	myri10ge_pio_copy((char *)submit, buf, 64);
1306 	mb();
1307 	drv_usecwait(1000);
1308 	mb();
1309 	i = 0;
1310 	while (*confirm != 0xffffffff && i < 20) {
1311 		drv_usecwait(1000);
1312 		i++;
1313 	}
1314 	if (*confirm != 0xffffffff) {
1315 		cmn_err(CE_WARN, "%s: dummy rdma %s failed (%p = 0x%x)",
1316 		    mgp->name,
1317 		    (enable ? "enable" : "disable"), (void*) confirm, *confirm);
1318 	}
1319 }
1320 
1321 static int
1322 myri10ge_load_firmware(struct myri10ge_priv *mgp)
1323 {
1324 	myri10ge_cmd_t cmd;
1325 	volatile uint32_t *confirm;
1326 	volatile char *submit;
1327 	char buf_bytes[72];
1328 	uint32_t *buf, size;
1329 	int status, i;
1330 
1331 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1332 
1333 	status = myri10ge_load_firmware_from_zlib(mgp, &size);
1334 	if (status) {
1335 		cmn_err(CE_WARN, "%s: firmware loading failed\n", mgp->name);
1336 		return (status);
1337 	}
1338 
1339 	/* clear confirmation addr */
1340 	confirm = (volatile uint32_t *)mgp->cmd;
1341 	*confirm = 0;
1342 	mb();
1343 
1344 	/*
1345 	 * send a reload command to the bootstrap MCP, and wait for the
1346 	 * response in the confirmation address.  The firmware should
1347 	 * write a -1 there to indicate it is alive and well
1348 	 */
1349 
1350 	buf[0] = mgp->cmd_dma.high;	/* confirm addr MSW */
1351 	buf[1] = mgp->cmd_dma.low;	/* confirm addr LSW */
1352 	buf[2] = htonl(0xffffffff);	/* confirm data */
1353 
1354 	/*
1355 	 * FIX: All newest firmware should un-protect the bottom of
1356 	 * the sram before handoff. However, the very first interfaces
1357 	 * do not. Therefore the handoff copy must skip the first 8 bytes
1358 	 */
1359 	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8); /* where the code starts */
1360 	buf[4] = htonl(size - 8); 	/* length of code */
1361 	buf[5] = htonl(8);		/* where to copy to */
1362 	buf[6] = htonl(0);		/* where to jump to */
1363 
1364 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_HANDOFF);
1365 
1366 	myri10ge_pio_copy((char *)submit, buf, 64);
1367 	mb();
1368 	drv_usecwait(1000);
1369 	mb();
1370 	i = 0;
1371 	while (*confirm != 0xffffffff && i < 1000) {
1372 		drv_usecwait(1000);
1373 		i++;
1374 	}
1375 	if (*confirm != 0xffffffff) {
1376 		cmn_err(CE_WARN, "%s: handoff failed (%p = 0x%x)",
1377 		    mgp->name, (void *) confirm, *confirm);
1378 
1379 		return (ENXIO);
1380 	}
1381 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1382 	if (status != 0) {
1383 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_GET_RX_RING_SIZE\n",
1384 		    mgp->name);
1385 		return (ENXIO);
1386 	}
1387 
1388 	mgp->max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
1389 	myri10ge_dummy_rdma(mgp, 1);
1390 	return (0);
1391 }
1392 
1393 static int
1394 myri10ge_m_unicst(void *arg, const uint8_t *addr)
1395 {
1396 	struct myri10ge_priv *mgp = arg;
1397 	myri10ge_cmd_t cmd;
1398 	int status;
1399 
1400 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1401 	    | (addr[2] << 8) | addr[3]);
1402 
1403 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1404 
1405 	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd);
1406 	if (status == 0 && (addr != mgp->mac_addr))
1407 		(void) memcpy(mgp->mac_addr, addr, sizeof (mgp->mac_addr));
1408 
1409 	return (status);
1410 }
1411 
1412 static int
1413 myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
1414 {
1415 	myri10ge_cmd_t cmd;
1416 	int status;
1417 
1418 	if (pause)
1419 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_FLOW_CONTROL,
1420 		    &cmd);
1421 	else
1422 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_FLOW_CONTROL,
1423 		    &cmd);
1424 
1425 	if (status) {
1426 		cmn_err(CE_WARN, "%s: Failed to set flow control mode\n",
1427 		    mgp->name);
1428 		return (ENXIO);
1429 	}
1430 	mgp->pause = pause;
1431 	return (0);
1432 }
1433 
1434 static void
1435 myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc)
1436 {
1437 	myri10ge_cmd_t cmd;
1438 	int status;
1439 
1440 	if (promisc)
1441 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_PROMISC, &cmd);
1442 	else
1443 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_PROMISC, &cmd);
1444 
1445 	if (status) {
1446 		cmn_err(CE_WARN, "%s: Failed to set promisc mode\n",
1447 		    mgp->name);
1448 	}
1449 }
1450 
1451 static int
1452 myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
1453 {
1454 	myri10ge_cmd_t cmd;
1455 	int status;
1456 	uint32_t len;
1457 	void *dmabench;
1458 	struct myri10ge_dma_stuff dmabench_dma;
1459 	char *test = " ";
1460 
1461 	/*
1462 	 * Run a small DMA test.
1463 	 * The magic multipliers to the length tell the firmware
1464 	 * tp do DMA read, write, or read+write tests.  The
1465 	 * results are returned in cmd.data0.  The upper 16
1466 	 * bits or the return is the number of transfers completed.
1467 	 * The lower 16 bits is the time in 0.5us ticks that the
1468 	 * transfers took to complete
1469 	 */
1470 
1471 	len = mgp->tx_boundary;
1472 
1473 	dmabench = myri10ge_dma_alloc(mgp->dip, len,
1474 	    &myri10ge_rx_jumbo_dma_attr, &myri10ge_dev_access_attr,
1475 	    DDI_DMA_STREAMING,  DDI_DMA_RDWR|DDI_DMA_STREAMING,
1476 	    &dmabench_dma, 1, DDI_DMA_DONTWAIT);
1477 	mgp->read_dma = mgp->write_dma = mgp->read_write_dma = 0;
1478 	if (dmabench == NULL) {
1479 		cmn_err(CE_WARN, "%s dma benchmark aborted\n", mgp->name);
1480 		return (ENOMEM);
1481 	}
1482 
1483 	cmd.data0 = ntohl(dmabench_dma.low);
1484 	cmd.data1 = ntohl(dmabench_dma.high);
1485 	cmd.data2 = len * 0x10000;
1486 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1487 	if (status != 0) {
1488 		test = "read";
1489 		goto abort;
1490 	}
1491 	mgp->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1492 
1493 	cmd.data0 = ntohl(dmabench_dma.low);
1494 	cmd.data1 = ntohl(dmabench_dma.high);
1495 	cmd.data2 = len * 0x1;
1496 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1497 	if (status != 0) {
1498 		test = "write";
1499 		goto abort;
1500 	}
1501 	mgp->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1502 
1503 	cmd.data0 = ntohl(dmabench_dma.low);
1504 	cmd.data1 = ntohl(dmabench_dma.high);
1505 	cmd.data2 = len * 0x10001;
1506 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1507 	if (status != 0) {
1508 		test = "read/write";
1509 		goto abort;
1510 	}
1511 	mgp->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
1512 	    (cmd.data0 & 0xffff);
1513 
1514 
1515 abort:
1516 	myri10ge_dma_free(&dmabench_dma);
1517 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
1518 		cmn_err(CE_WARN, "%s %s dma benchmark failed\n", mgp->name,
1519 		    test);
1520 	return (status);
1521 }
1522 
1523 static int
1524 myri10ge_reset(struct myri10ge_priv *mgp)
1525 {
1526 	myri10ge_cmd_t cmd;
1527 	struct myri10ge_nic_stat *ethstat;
1528 	struct myri10ge_slice_state *ss;
1529 	int i, status;
1530 	size_t bytes;
1531 
1532 	/* send a reset command to the card to see if it is alive */
1533 	(void) memset(&cmd, 0, sizeof (cmd));
1534 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
1535 	if (status != 0) {
1536 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
1537 		return (ENXIO);
1538 	}
1539 
1540 	/* Now exchange information about interrupts  */
1541 
1542 	bytes = mgp->max_intr_slots * sizeof (*mgp->ss[0].rx_done.entry);
1543 	cmd.data0 = (uint32_t)bytes;
1544 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1545 
1546 	/*
1547 	 * Even though we already know how many slices are supported
1548 	 * via myri10ge_probe_slices() MXGEFW_CMD_GET_MAX_RSS_QUEUES
1549 	 * has magic side effects, and must be called after a reset.
1550 	 * It must be called prior to calling any RSS related cmds,
1551 	 * including assigning an interrupt queue for anything but
1552 	 * slice 0.  It must also be called *after*
1553 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1554 	 * the firmware to compute offsets.
1555 	 */
1556 
1557 	if (mgp->num_slices > 1) {
1558 
1559 		/* ask the maximum number of slices it supports */
1560 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1561 		    &cmd);
1562 		if (status != 0) {
1563 			cmn_err(CE_WARN,
1564 			    "%s: failed to get number of slices\n",
1565 			    mgp->name);
1566 			return (status);
1567 		}
1568 
1569 		/*
1570 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1571 		 * to setting up the interrupt queue DMA
1572 		 */
1573 
1574 		cmd.data0 = mgp->num_slices;
1575 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE |
1576 		    MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1577 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1578 		    &cmd);
1579 		if (status != 0) {
1580 			cmn_err(CE_WARN,
1581 			    "%s: failed to set number of slices\n",
1582 			    mgp->name);
1583 			return (status);
1584 		}
1585 	}
1586 	for (i = 0; i < mgp->num_slices; i++) {
1587 		ss = &mgp->ss[i];
1588 		cmd.data0 = ntohl(ss->rx_done.dma.low);
1589 		cmd.data1 = ntohl(ss->rx_done.dma.high);
1590 		cmd.data2 = i;
1591 		status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA,
1592 		    &cmd);
1593 	};
1594 
1595 	status |= myri10ge_send_cmd(mgp,  MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1596 	for (i = 0; i < mgp->num_slices; i++) {
1597 		ss = &mgp->ss[i];
1598 		ss->irq_claim = (volatile unsigned int *)
1599 		    (void *)(mgp->sram + cmd.data0 + 8 * i);
1600 	}
1601 
1602 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
1603 		status |= myri10ge_send_cmd(mgp,
1604 		    MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1605 		mgp->irq_deassert = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1606 	}
1607 
1608 	status |= myri10ge_send_cmd(mgp,
1609 	    MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1610 	mgp->intr_coal_delay_ptr = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1611 
1612 	if (status != 0) {
1613 		cmn_err(CE_WARN, "%s: failed set interrupt parameters\n",
1614 		    mgp->name);
1615 		return (status);
1616 	}
1617 
1618 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
1619 	(void) myri10ge_dma_test(mgp, MXGEFW_DMA_TEST);
1620 
1621 	/* reset mcp/driver shared state back to 0 */
1622 
1623 	for (i = 0; i < mgp->num_slices; i++) {
1624 		ss = &mgp->ss[i];
1625 		bytes = mgp->max_intr_slots *
1626 		    sizeof (*mgp->ss[0].rx_done.entry);
1627 		(void) memset(ss->rx_done.entry, 0, bytes);
1628 		ss->tx.req = 0;
1629 		ss->tx.done = 0;
1630 		ss->tx.pkt_done = 0;
1631 		ss->rx_big.cnt = 0;
1632 		ss->rx_small.cnt = 0;
1633 		ss->rx_done.idx = 0;
1634 		ss->rx_done.cnt = 0;
1635 		ss->rx_token = 0;
1636 		ss->tx.watchdog_done = 0;
1637 		ss->tx.watchdog_req = 0;
1638 		ss->tx.active = 0;
1639 		ss->tx.activate = 0;
1640 	}
1641 	mgp->watchdog_rx_pause = 0;
1642 	if (mgp->ksp_stat != NULL) {
1643 		ethstat = (struct myri10ge_nic_stat *)mgp->ksp_stat->ks_data;
1644 		ethstat->link_changes.value.ul = 0;
1645 	}
1646 	status = myri10ge_m_unicst(mgp, mgp->mac_addr);
1647 	myri10ge_change_promisc(mgp, 0);
1648 	(void) myri10ge_change_pause(mgp, mgp->pause);
1649 	return (status);
1650 }
1651 
1652 static int
1653 myri10ge_init_toeplitz(struct myri10ge_priv *mgp)
1654 {
1655 	myri10ge_cmd_t cmd;
1656 	int i, b, s, t, j;
1657 	int status;
1658 	uint32_t k[8];
1659 	uint32_t tmp;
1660 	uint8_t *key;
1661 
1662 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
1663 	    &cmd);
1664 	if (status != 0) {
1665 		cmn_err(CE_WARN, "%s: failed to get rss key\n",
1666 		    mgp->name);
1667 		return (EIO);
1668 	}
1669 	myri10ge_pio_copy32(mgp->rss_key,
1670 	    (uint32_t *)(void*)((char *)mgp->sram + cmd.data0),
1671 	    sizeof (mgp->rss_key));
1672 
1673 	mgp->toeplitz_hash_table = kmem_alloc(sizeof (uint32_t) * 12 * 256,
1674 	    KM_SLEEP);
1675 	key = (uint8_t *)mgp->rss_key;
1676 	t = 0;
1677 	for (b = 0; b < 12; b++) {
1678 		for (s = 0; s < 8; s++) {
1679 			/* Bits: b*8+s, ..., b*8+s+31 */
1680 			k[s] = 0;
1681 			for (j = 0; j < 32; j++) {
1682 				int bit = b*8+s+j;
1683 				bit = 0x1 & (key[bit / 8] >> (7 -(bit & 0x7)));
1684 				k[s] |= bit << (31 - j);
1685 			}
1686 		}
1687 
1688 		for (i = 0; i <= 0xff; i++) {
1689 			tmp = 0;
1690 			if (i & (1 << 7)) { tmp ^= k[0]; }
1691 			if (i & (1 << 6)) { tmp ^= k[1]; }
1692 			if (i & (1 << 5)) { tmp ^= k[2]; }
1693 			if (i & (1 << 4)) { tmp ^= k[3]; }
1694 			if (i & (1 << 3)) { tmp ^= k[4]; }
1695 			if (i & (1 << 2)) { tmp ^= k[5]; }
1696 			if (i & (1 << 1)) { tmp ^= k[6]; }
1697 			if (i & (1 << 0)) { tmp ^= k[7]; }
1698 			mgp->toeplitz_hash_table[t++] = tmp;
1699 		}
1700 	}
1701 	return (0);
1702 }
1703 
1704 static inline struct myri10ge_slice_state *
1705 myri10ge_toeplitz_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1706 {
1707 	struct tcphdr *hdr;
1708 	uint32_t saddr, daddr;
1709 	uint32_t hash, slice;
1710 	uint32_t *table = mgp->toeplitz_hash_table;
1711 	uint16_t src, dst;
1712 
1713 	/*
1714 	 * Note hashing order is reversed from how it is done
1715 	 * in the NIC, so as to generate the same hash value
1716 	 * for the connection to try to keep connections CPU local
1717 	 */
1718 
1719 	/* hash on IPv4 src/dst address */
1720 	saddr = ntohl(ip->ip_src.s_addr);
1721 	daddr = ntohl(ip->ip_dst.s_addr);
1722 	hash = table[(256 * 0) + ((daddr >> 24) & 0xff)];
1723 	hash ^= table[(256 * 1) + ((daddr >> 16) & 0xff)];
1724 	hash ^= table[(256 * 2) + ((daddr >> 8) & 0xff)];
1725 	hash ^= table[(256 * 3) + ((daddr) & 0xff)];
1726 	hash ^= table[(256 * 4) + ((saddr >> 24) & 0xff)];
1727 	hash ^= table[(256 * 5) + ((saddr >> 16) & 0xff)];
1728 	hash ^= table[(256 * 6) + ((saddr >> 8) & 0xff)];
1729 	hash ^= table[(256 * 7) + ((saddr) & 0xff)];
1730 	/* hash on TCP port, if required */
1731 	if ((myri10ge_rss_hash & MXGEFW_RSS_HASH_TYPE_TCP_IPV4) &&
1732 	    ip->ip_p == IPPROTO_TCP) {
1733 		hdr = (struct tcphdr *)(void *)
1734 		    (((uint8_t *)ip) +  (ip->ip_hl << 2));
1735 		src = ntohs(hdr->th_sport);
1736 		dst = ntohs(hdr->th_dport);
1737 
1738 		hash ^= table[(256 * 8) + ((dst >> 8) & 0xff)];
1739 		hash ^= table[(256 * 9) + ((dst) & 0xff)];
1740 		hash ^= table[(256 * 10) + ((src >> 8) & 0xff)];
1741 		hash ^= table[(256 * 11) + ((src) & 0xff)];
1742 	}
1743 	slice = (mgp->num_slices - 1) & hash;
1744 	return (&mgp->ss[slice]);
1745 
1746 }
1747 
1748 static inline struct myri10ge_slice_state *
1749 myri10ge_simple_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1750 {
1751 	struct tcphdr *hdr;
1752 	uint32_t slice, hash_val;
1753 
1754 
1755 	if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) {
1756 		return (&mgp->ss[0]);
1757 	}
1758 	hdr = (struct tcphdr *)(void *)(((uint8_t *)ip) +  (ip->ip_hl << 2));
1759 
1760 	/*
1761 	 * Use the second byte of the *destination* address for
1762 	 * MXGEFW_RSS_HASH_TYPE_SRC_PORT, so as to match NIC's hashing
1763 	 */
1764 	hash_val = ntohs(hdr->th_dport) & 0xff;
1765 	if (myri10ge_rss_hash == MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT)
1766 		hash_val += ntohs(hdr->th_sport) & 0xff;
1767 
1768 	slice = (mgp->num_slices - 1) & hash_val;
1769 	return (&mgp->ss[slice]);
1770 }
1771 
1772 static inline struct myri10ge_slice_state *
1773 myri10ge_send_hash(struct myri10ge_priv *mgp, mblk_t *mp)
1774 {
1775 	unsigned int slice = 0;
1776 	struct ether_header *eh;
1777 	struct ether_vlan_header *vh;
1778 	struct ip *ip;
1779 	int ehl, ihl;
1780 
1781 	if (mgp->num_slices == 1)
1782 		return (&mgp->ss[0]);
1783 
1784 	if (myri10ge_tx_hash == 0) {
1785 		slice = CPU->cpu_id & (mgp->num_slices - 1);
1786 		return (&mgp->ss[slice]);
1787 	}
1788 
1789 	/*
1790 	 *  ensure it is a TCP or UDP over IPv4 packet, and that the
1791 	 *  headers are in the 1st mblk.  Otherwise, punt
1792 	 */
1793 	ehl = sizeof (*eh);
1794 	ihl = sizeof (*ip);
1795 	if ((MBLKL(mp)) <  (ehl + ihl + 8))
1796 		return (&mgp->ss[0]);
1797 	eh = (struct ether_header *)(void *)mp->b_rptr;
1798 	ip = (struct ip *)(void *)(eh + 1);
1799 	if (eh->ether_type != BE_16(ETHERTYPE_IP)) {
1800 		if (eh->ether_type != BE_16(ETHERTYPE_VLAN))
1801 			return (&mgp->ss[0]);
1802 		vh = (struct ether_vlan_header *)(void *)mp->b_rptr;
1803 		if (vh->ether_type != BE_16(ETHERTYPE_IP))
1804 			return (&mgp->ss[0]);
1805 		ehl += 4;
1806 		ip = (struct ip *)(void *)(vh + 1);
1807 	}
1808 	ihl = ip->ip_hl << 2;
1809 	if (MBLKL(mp) <  (ehl + ihl + 8))
1810 		return (&mgp->ss[0]);
1811 	switch (myri10ge_rss_hash) {
1812 	case MXGEFW_RSS_HASH_TYPE_IPV4:
1813 		/* fallthru */
1814 	case MXGEFW_RSS_HASH_TYPE_TCP_IPV4:
1815 		/* fallthru */
1816 	case (MXGEFW_RSS_HASH_TYPE_IPV4|MXGEFW_RSS_HASH_TYPE_TCP_IPV4):
1817 		return (myri10ge_toeplitz_send_hash(mgp, ip));
1818 	case MXGEFW_RSS_HASH_TYPE_SRC_PORT:
1819 		/* fallthru */
1820 	case MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT:
1821 		return (myri10ge_simple_send_hash(mgp, ip));
1822 	default:
1823 		break;
1824 	}
1825 	return (&mgp->ss[0]);
1826 }
1827 
1828 static int
1829 myri10ge_setup_slice(struct myri10ge_slice_state *ss)
1830 {
1831 	struct myri10ge_priv *mgp = ss->mgp;
1832 	myri10ge_cmd_t cmd;
1833 	int tx_ring_size, rx_ring_size;
1834 	int tx_ring_entries, rx_ring_entries;
1835 	int slice, status;
1836 	int allocated, idx;
1837 	size_t bytes;
1838 
1839 	slice = ss - mgp->ss;
1840 	cmd.data0 = slice;
1841 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1842 	tx_ring_size = cmd.data0;
1843 	cmd.data0 = slice;
1844 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1845 	if (status != 0)
1846 		return (status);
1847 	rx_ring_size = cmd.data0;
1848 
1849 	tx_ring_entries = tx_ring_size / sizeof (struct mcp_kreq_ether_send);
1850 	rx_ring_entries = rx_ring_size / sizeof (struct mcp_dma_addr);
1851 	ss->tx.mask = tx_ring_entries - 1;
1852 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
1853 
1854 	/* get the lanai pointers to the send and receive rings */
1855 
1856 	cmd.data0 = slice;
1857 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
1858 	ss->tx.lanai = (mcp_kreq_ether_send_t *)(void *)(mgp->sram + cmd.data0);
1859 	if (mgp->num_slices > 1) {
1860 		ss->tx.go = (char *)mgp->sram + MXGEFW_ETH_SEND_GO + 64 * slice;
1861 		ss->tx.stop = (char *)mgp->sram + MXGEFW_ETH_SEND_STOP +
1862 		    64 * slice;
1863 	} else {
1864 		ss->tx.go = NULL;
1865 		ss->tx.stop = NULL;
1866 	}
1867 
1868 	cmd.data0 = slice;
1869 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
1870 	ss->rx_small.lanai = (mcp_kreq_ether_recv_t *)
1871 	    (void *)(mgp->sram + cmd.data0);
1872 
1873 	cmd.data0 = slice;
1874 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
1875 	ss->rx_big.lanai = (mcp_kreq_ether_recv_t *)(void *)
1876 	    (mgp->sram + cmd.data0);
1877 
1878 	if (status != 0) {
1879 		cmn_err(CE_WARN,
1880 		    "%s: failed to get ring sizes or locations\n", mgp->name);
1881 		return (status);
1882 	}
1883 
1884 	status = ENOMEM;
1885 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
1886 	ss->rx_small.shadow = kmem_zalloc(bytes, KM_SLEEP);
1887 	if (ss->rx_small.shadow == NULL)
1888 		goto abort;
1889 	(void) memset(ss->rx_small.shadow, 0, bytes);
1890 
1891 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
1892 	ss->rx_big.shadow = kmem_zalloc(bytes, KM_SLEEP);
1893 	if (ss->rx_big.shadow == NULL)
1894 		goto abort_with_rx_small_shadow;
1895 	(void) memset(ss->rx_big.shadow, 0, bytes);
1896 
1897 	/* allocate the host info rings */
1898 
1899 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
1900 	ss->tx.info = kmem_zalloc(bytes, KM_SLEEP);
1901 	if (ss->tx.info == NULL)
1902 		goto abort_with_rx_big_shadow;
1903 	(void) memset(ss->tx.info, 0, bytes);
1904 
1905 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1906 	ss->rx_small.info = kmem_zalloc(bytes, KM_SLEEP);
1907 	if (ss->rx_small.info == NULL)
1908 		goto abort_with_tx_info;
1909 	(void) memset(ss->rx_small.info, 0, bytes);
1910 
1911 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1912 	ss->rx_big.info = kmem_zalloc(bytes, KM_SLEEP);
1913 	if (ss->rx_big.info == NULL)
1914 		goto abort_with_rx_small_info;
1915 	(void) memset(ss->rx_big.info, 0, bytes);
1916 
1917 	ss->tx.stall = ss->tx.sched = 0;
1918 	ss->tx.stall_early = ss->tx.stall_late = 0;
1919 
1920 	ss->jbufs_for_smalls = 1 + (1 + ss->rx_small.mask) /
1921 	    (myri10ge_mtu / (myri10ge_small_bytes + MXGEFW_PAD));
1922 
1923 	allocated = myri10ge_add_jbufs(ss,
1924 	    myri10ge_bigbufs_initial + ss->jbufs_for_smalls, 1);
1925 	if (allocated < ss->jbufs_for_smalls + myri10ge_bigbufs_initial) {
1926 		cmn_err(CE_WARN,
1927 		    "%s: Could not allocate enough receive buffers (%d/%d)\n",
1928 		    mgp->name, allocated,
1929 		    myri10ge_bigbufs_initial + ss->jbufs_for_smalls);
1930 		goto abort_with_jumbos;
1931 	}
1932 
1933 	myri10ge_carve_up_jbufs_into_small_ring(ss);
1934 	ss->j_rx_cnt = 0;
1935 
1936 	mutex_enter(&ss->jpool.mtx);
1937 	if (allocated < rx_ring_entries)
1938 		ss->jpool.low_water = allocated / 4;
1939 	else
1940 		ss->jpool.low_water = rx_ring_entries / 2;
1941 
1942 	/*
1943 	 * invalidate the big receive ring in case we do not
1944 	 * allocate sufficient jumbos to fill it
1945 	 */
1946 	(void) memset(ss->rx_big.shadow, 1,
1947 	    (ss->rx_big.mask + 1) * sizeof (ss->rx_big.shadow[0]));
1948 	for (idx = 7; idx <= ss->rx_big.mask; idx += 8) {
1949 		myri10ge_submit_8rx(&ss->rx_big.lanai[idx - 7],
1950 		    &ss->rx_big.shadow[idx - 7]);
1951 		mb();
1952 	}
1953 
1954 
1955 	myri10ge_restock_jumbos(ss);
1956 
1957 	for (idx = 7; idx <= ss->rx_small.mask; idx += 8) {
1958 		myri10ge_submit_8rx(&ss->rx_small.lanai[idx - 7],
1959 		    &ss->rx_small.shadow[idx - 7]);
1960 		mb();
1961 	}
1962 	ss->rx_small.cnt = ss->rx_small.mask + 1;
1963 
1964 	mutex_exit(&ss->jpool.mtx);
1965 
1966 	status = myri10ge_prepare_tx_ring(ss);
1967 
1968 	if (status != 0)
1969 		goto abort_with_small_jbufs;
1970 
1971 	cmd.data0 = ntohl(ss->fw_stats_dma.low);
1972 	cmd.data1 = ntohl(ss->fw_stats_dma.high);
1973 	cmd.data2 = sizeof (mcp_irq_data_t);
1974 	cmd.data2 |= (slice << 16);
1975 	bzero(ss->fw_stats, sizeof (*ss->fw_stats));
1976 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
1977 	if (status == ENOSYS) {
1978 		cmd.data0 = ntohl(ss->fw_stats_dma.low) +
1979 		    offsetof(mcp_irq_data_t, send_done_count);
1980 		cmd.data1 = ntohl(ss->fw_stats_dma.high);
1981 		status = myri10ge_send_cmd(mgp,
1982 		    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd);
1983 	}
1984 	if (status) {
1985 		cmn_err(CE_WARN, "%s: Couldn't set stats DMA\n", mgp->name);
1986 		goto abort_with_tx;
1987 	}
1988 
1989 	return (0);
1990 
1991 abort_with_tx:
1992 	myri10ge_unprepare_tx_ring(ss);
1993 
1994 abort_with_small_jbufs:
1995 	myri10ge_release_small_jbufs(ss);
1996 
1997 abort_with_jumbos:
1998 	if (allocated != 0) {
1999 		mutex_enter(&ss->jpool.mtx);
2000 		ss->jpool.low_water = 0;
2001 		mutex_exit(&ss->jpool.mtx);
2002 		myri10ge_unstock_jumbos(ss);
2003 		myri10ge_remove_jbufs(ss);
2004 	}
2005 
2006 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2007 	kmem_free(ss->rx_big.info, bytes);
2008 
2009 abort_with_rx_small_info:
2010 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2011 	kmem_free(ss->rx_small.info, bytes);
2012 
2013 abort_with_tx_info:
2014 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2015 	kmem_free(ss->tx.info, bytes);
2016 
2017 abort_with_rx_big_shadow:
2018 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2019 	kmem_free(ss->rx_big.shadow, bytes);
2020 
2021 abort_with_rx_small_shadow:
2022 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2023 	kmem_free(ss->rx_small.shadow, bytes);
2024 abort:
2025 	return (status);
2026 
2027 }
2028 
2029 static void
2030 myri10ge_teardown_slice(struct myri10ge_slice_state *ss)
2031 {
2032 	int tx_ring_entries, rx_ring_entries;
2033 	size_t bytes;
2034 
2035 	/* ignore slices that have not been fully setup */
2036 	if (ss->tx.cp == NULL)
2037 		return;
2038 	/* Free the TX copy buffers */
2039 	myri10ge_unprepare_tx_ring(ss);
2040 
2041 	/* stop passing returned buffers to firmware */
2042 
2043 	mutex_enter(&ss->jpool.mtx);
2044 	ss->jpool.low_water = 0;
2045 	mutex_exit(&ss->jpool.mtx);
2046 	myri10ge_release_small_jbufs(ss);
2047 
2048 	/* Release the free jumbo frame pool */
2049 	myri10ge_unstock_jumbos(ss);
2050 	myri10ge_remove_jbufs(ss);
2051 
2052 	rx_ring_entries = ss->rx_big.mask + 1;
2053 	tx_ring_entries = ss->tx.mask + 1;
2054 
2055 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2056 	kmem_free(ss->rx_big.info, bytes);
2057 
2058 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2059 	kmem_free(ss->rx_small.info, bytes);
2060 
2061 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2062 	kmem_free(ss->tx.info, bytes);
2063 
2064 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2065 	kmem_free(ss->rx_big.shadow, bytes);
2066 
2067 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2068 	kmem_free(ss->rx_small.shadow, bytes);
2069 
2070 }
2071 static int
2072 myri10ge_start_locked(struct myri10ge_priv *mgp)
2073 {
2074 	myri10ge_cmd_t cmd;
2075 	int status, big_pow2, i;
2076 	volatile uint8_t *itable;
2077 
2078 	status = DDI_SUCCESS;
2079 	/* Allocate DMA resources and receive buffers */
2080 
2081 	status = myri10ge_reset(mgp);
2082 	if (status != 0) {
2083 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
2084 		return (DDI_FAILURE);
2085 	}
2086 
2087 	if (mgp->num_slices > 1) {
2088 		cmd.data0 = mgp->num_slices;
2089 		cmd.data1 = 1; /* use MSI-X */
2090 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
2091 		    &cmd);
2092 		if (status != 0) {
2093 			cmn_err(CE_WARN,
2094 			    "%s: failed to set number of slices\n",
2095 			    mgp->name);
2096 			goto abort_with_nothing;
2097 		}
2098 		/* setup the indirection table */
2099 		cmd.data0 = mgp->num_slices;
2100 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
2101 		    &cmd);
2102 
2103 		status |= myri10ge_send_cmd(mgp,
2104 		    MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
2105 		if (status != 0) {
2106 			cmn_err(CE_WARN,
2107 			    "%s: failed to setup rss tables\n", mgp->name);
2108 		}
2109 
2110 		/* just enable an identity mapping */
2111 		itable = mgp->sram + cmd.data0;
2112 		for (i = 0; i < mgp->num_slices; i++)
2113 			itable[i] = (uint8_t)i;
2114 
2115 		if (myri10ge_rss_hash & MYRI10GE_TOEPLITZ_HASH) {
2116 			status = myri10ge_init_toeplitz(mgp);
2117 			if (status != 0) {
2118 				cmn_err(CE_WARN, "%s: failed to setup "
2119 				    "toeplitz tx hash table", mgp->name);
2120 				goto abort_with_nothing;
2121 			}
2122 		}
2123 		cmd.data0 = 1;
2124 		cmd.data1 = myri10ge_rss_hash;
2125 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_ENABLE,
2126 		    &cmd);
2127 		if (status != 0) {
2128 			cmn_err(CE_WARN,
2129 			    "%s: failed to enable slices\n", mgp->name);
2130 			goto abort_with_toeplitz;
2131 		}
2132 	}
2133 
2134 	for (i = 0; i < mgp->num_slices; i++) {
2135 		status = myri10ge_setup_slice(&mgp->ss[i]);
2136 		if (status != 0)
2137 			goto abort_with_slices;
2138 	}
2139 
2140 	/*
2141 	 * Tell the MCP how many buffers he has, and to
2142 	 *  bring the ethernet interface up
2143 	 *
2144 	 * Firmware needs the big buff size as a power of 2.  Lie and
2145 	 * tell him the buffer is larger, because we only use 1
2146 	 * buffer/pkt, and the mtu will prevent overruns
2147 	 */
2148 	big_pow2 = myri10ge_mtu + MXGEFW_PAD;
2149 	while (!ISP2(big_pow2))
2150 		big_pow2++;
2151 
2152 	/* now give firmware buffers sizes, and MTU */
2153 	cmd.data0 = myri10ge_mtu;
2154 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd);
2155 	cmd.data0 = myri10ge_small_bytes;
2156 	status |=
2157 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
2158 	cmd.data0 = big_pow2;
2159 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2160 	if (status) {
2161 		cmn_err(CE_WARN, "%s: Couldn't set buffer sizes\n", mgp->name);
2162 		goto abort_with_slices;
2163 	}
2164 
2165 
2166 	cmd.data0 = 1;
2167 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_TSO_MODE, &cmd);
2168 	if (status) {
2169 		cmn_err(CE_WARN, "%s: unable to setup TSO (%d)\n",
2170 		    mgp->name, status);
2171 	} else {
2172 		mgp->features |= MYRI10GE_TSO;
2173 	}
2174 
2175 	mgp->link_state = -1;
2176 	mgp->rdma_tags_available = 15;
2177 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd);
2178 	if (status) {
2179 		cmn_err(CE_WARN, "%s: unable to start ethernet\n", mgp->name);
2180 		goto abort_with_slices;
2181 	}
2182 	mgp->running = MYRI10GE_ETH_RUNNING;
2183 	return (DDI_SUCCESS);
2184 
2185 abort_with_slices:
2186 	for (i = 0; i < mgp->num_slices; i++)
2187 		myri10ge_teardown_slice(&mgp->ss[i]);
2188 
2189 	mgp->running = MYRI10GE_ETH_STOPPED;
2190 
2191 abort_with_toeplitz:
2192 	if (mgp->toeplitz_hash_table != NULL) {
2193 		kmem_free(mgp->toeplitz_hash_table,
2194 		    sizeof (uint32_t) * 12 * 256);
2195 		mgp->toeplitz_hash_table = NULL;
2196 	}
2197 
2198 abort_with_nothing:
2199 	return (DDI_FAILURE);
2200 }
2201 
2202 static void
2203 myri10ge_stop_locked(struct myri10ge_priv *mgp)
2204 {
2205 	int status, old_down_cnt;
2206 	myri10ge_cmd_t cmd;
2207 	int wait_time = 10;
2208 	int i, polling;
2209 
2210 	old_down_cnt = mgp->down_cnt;
2211 	mb();
2212 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2213 	if (status) {
2214 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
2215 	}
2216 
2217 	while (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2218 		delay(1 * drv_usectohz(1000000));
2219 		wait_time--;
2220 		if (wait_time == 0)
2221 			break;
2222 	}
2223 again:
2224 	if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2225 		cmn_err(CE_WARN, "%s: didn't get down irq\n", mgp->name);
2226 		for (i = 0; i < mgp->num_slices; i++) {
2227 			/*
2228 			 * take and release the rx lock to ensure
2229 			 * that no interrupt thread is blocked
2230 			 * elsewhere in the stack, preventing
2231 			 * completion
2232 			 */
2233 
2234 			mutex_enter(&mgp->ss[i].rx_lock);
2235 			printf("%s: slice %d rx irq idle\n",
2236 			    mgp->name, i);
2237 			mutex_exit(&mgp->ss[i].rx_lock);
2238 
2239 			/* verify that the poll handler is inactive */
2240 			mutex_enter(&mgp->ss->poll_lock);
2241 			polling = mgp->ss->rx_polling;
2242 			mutex_exit(&mgp->ss->poll_lock);
2243 			if (polling) {
2244 				printf("%s: slice %d is polling\n",
2245 				    mgp->name, i);
2246 				delay(1 * drv_usectohz(1000000));
2247 				goto again;
2248 			}
2249 		}
2250 		delay(1 * drv_usectohz(1000000));
2251 		if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2252 			cmn_err(CE_WARN, "%s: Never got down irq\n", mgp->name);
2253 		}
2254 	}
2255 
2256 	for (i = 0; i < mgp->num_slices; i++)
2257 		myri10ge_teardown_slice(&mgp->ss[i]);
2258 
2259 	if (mgp->toeplitz_hash_table != NULL) {
2260 		kmem_free(mgp->toeplitz_hash_table,
2261 		    sizeof (uint32_t) * 12 * 256);
2262 		mgp->toeplitz_hash_table = NULL;
2263 	}
2264 	mgp->running = MYRI10GE_ETH_STOPPED;
2265 }
2266 
2267 static int
2268 myri10ge_m_start(void *arg)
2269 {
2270 	struct myri10ge_priv *mgp = arg;
2271 	int status;
2272 
2273 	mutex_enter(&mgp->intrlock);
2274 
2275 	if (mgp->running != MYRI10GE_ETH_STOPPED) {
2276 		mutex_exit(&mgp->intrlock);
2277 		return (DDI_FAILURE);
2278 	}
2279 	status = myri10ge_start_locked(mgp);
2280 	mutex_exit(&mgp->intrlock);
2281 
2282 	if (status != DDI_SUCCESS)
2283 		return (status);
2284 
2285 	/* start the watchdog timer */
2286 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
2287 	    mgp->timer_ticks);
2288 	return (DDI_SUCCESS);
2289 
2290 }
2291 
2292 static void
2293 myri10ge_m_stop(void *arg)
2294 {
2295 	struct myri10ge_priv *mgp = arg;
2296 
2297 	mutex_enter(&mgp->intrlock);
2298 	/* if the device not running give up */
2299 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
2300 		mutex_exit(&mgp->intrlock);
2301 		return;
2302 	}
2303 
2304 	mgp->running = MYRI10GE_ETH_STOPPING;
2305 	mutex_exit(&mgp->intrlock);
2306 	(void) untimeout(mgp->timer_id);
2307 	mutex_enter(&mgp->intrlock);
2308 	myri10ge_stop_locked(mgp);
2309 	mutex_exit(&mgp->intrlock);
2310 
2311 }
2312 
/*
 * Account broadcast/multicast receives and, for TCP or UDP carried
 * directly over IPv4 or IPv6, attach the partial checksum to the mblk
 * with mac_hcksum_set(HCK_PARTIALCKSUM).
 *
 * mp	- received frame; b_rptr points at the ethernet header
 * s	- receive statistics to update (brdcstrcv / multircv)
 * csum	- checksum handed in by the caller; only its low 16 bits are
 *	  used, after an ntohs()
 *
 * Returns without setting any checksum metadata for other protocols and
 * for frames that are longer than the IP datagram they carry.
 */
static inline void
myri10ge_rx_csum(mblk_t *mp, struct myri10ge_rx_ring_stats *s, uint32_t csum)
{
	struct ether_header *eh;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t start, stuff, end, partial, hdrlen;


	csum = ntohs((uint16_t)csum);
	eh = (struct ether_header *)(void *)mp->b_rptr;
	hdrlen = sizeof (*eh);
	/* low bit of the first destination octet marks a group address */
	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
			s->brdcstrcv++;
		else
			s->multircv++;
	}

	/*
	 * NOTE(review): eh->ether_type is not re-read after this branch,
	 * so for a VLAN-tagged frame neither the IP nor the IPv6 test
	 * below can match and the function returns at the final else.
	 * The adjustment made here therefore looks unused -- confirm
	 * whether the encapsulated ethertype was meant to be examined.
	 */
	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
		/*
		 * fix checksum by subtracting 4 bytes after what the
		 * firmware thought was the end of the ether hdr
		 */
		partial = *(uint32_t *)
		    (void *)(mp->b_rptr + ETHERNET_HEADER_SIZE);
		/* add ~partial with end-around carry, then fold to 16 bits */
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
		hdrlen += VLAN_TAGSZ;
	}

	/*
	 * start/stuff/end are offsets measured from the start of the
	 * IP header: start is where the transport header begins, stuff
	 * is where the TCP/UDP checksum field lives, end is the end of
	 * the datagram.
	 */
	if (eh->ether_type ==  BE_16(ETHERTYPE_IP)) {
		ip = (struct ip *)(void *)(mp->b_rptr + hdrlen);
		start = ip->ip_hl << 2;

		if (ip->ip_p == IPPROTO_TCP)
			stuff = start + offsetof(struct tcphdr, th_sum);
		else if (ip->ip_p == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = ntohs(ip->ip_len);
	} else if (eh->ether_type ==  BE_16(ETHERTYPE_IPV6)) {
		ip6 = (struct ip6_hdr *)(void *)(mp->b_rptr + hdrlen);
		start = sizeof (*ip6);
		/* only the first next-header value is examined */
		if (ip6->ip6_nxt == IPPROTO_TCP) {
			stuff = start + offsetof(struct tcphdr, th_sum);
		} else if (ip6->ip6_nxt == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = start + ntohs(ip6->ip6_plen);
		/*
		 * IPv6 headers do not contain a checksum, and hence
		 * do not checksum to zero, so they don't "fall out"
		 * of the partial checksum calculation like IPv4
		 * headers do.  We need to fix the partial checksum by
		 * subtracting the checksum of the IPv6 header.
		 */

		partial = myri10ge_csum_generic((uint16_t *)ip6, sizeof (*ip6));
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
	} else {
		return;
	}

	if (MBLKL(mp) > hdrlen + end) {
		/* padded frame, so hw csum may be invalid */
		return;
	}

	mac_hcksum_set(mp, start, stuff, end, csum, HCK_PARTIALCKSUM);
}
2392 
2393 static mblk_t *
2394 myri10ge_rx_done_small(struct myri10ge_slice_state *ss, uint32_t len,
2395     uint32_t csum)
2396 {
2397 	mblk_t *mp;
2398 	myri10ge_rx_ring_t *rx;
2399 	int idx;
2400 
2401 	rx = &ss->rx_small;
2402 	idx = rx->cnt & rx->mask;
2403 	ss->rx_small.cnt++;
2404 
2405 	/* allocate a new buffer to pass up the stack */
2406 	mp = allocb(len + MXGEFW_PAD, 0);
2407 	if (mp == NULL) {
2408 		MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_small_nobuf);
2409 		goto abort;
2410 	}
2411 	bcopy(ss->rx_small.info[idx].ptr,
2412 	    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2413 	mp->b_wptr += len + MXGEFW_PAD;
2414 	mp->b_rptr += MXGEFW_PAD;
2415 
2416 	ss->rx_stats.ibytes += len;
2417 	ss->rx_stats.ipackets += 1;
2418 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2419 
2420 abort:
2421 	if ((idx & 7) == 7) {
2422 		myri10ge_submit_8rx(&rx->lanai[idx - 7],
2423 		    &rx->shadow[idx - 7]);
2424 	}
2425 
2426 	return (mp);
2427 }
2428 
2429 
2430 static mblk_t *
2431 myri10ge_rx_done_big(struct myri10ge_slice_state *ss, uint32_t len,
2432     uint32_t csum)
2433 {
2434 	struct myri10ge_jpool_stuff *jpool;
2435 	struct myri10ge_jpool_entry *j;
2436 	mblk_t *mp;
2437 	int idx, num_owned_by_mcp;
2438 
2439 	jpool = &ss->jpool;
2440 	idx = ss->j_rx_cnt & ss->rx_big.mask;
2441 	j = ss->rx_big.info[idx].j;
2442 
2443 	if (j == NULL) {
2444 		printf("%s: null j at idx=%d, rx_big.cnt = %d, j_rx_cnt=%d\n",
2445 		    ss->mgp->name, idx, ss->rx_big.cnt, ss->j_rx_cnt);
2446 		return (NULL);
2447 	}
2448 
2449 
2450 	ss->rx_big.info[idx].j = NULL;
2451 	ss->j_rx_cnt++;
2452 
2453 
2454 	/*
2455 	 * Check to see if we are low on rx buffers.
2456 	 * Note that we must leave at least 8 free so there are
2457 	 * enough to free in a single 64-byte write.
2458 	 */
2459 	num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2460 	if (num_owned_by_mcp < jpool->low_water) {
2461 		mutex_enter(&jpool->mtx);
2462 		myri10ge_restock_jumbos(ss);
2463 		mutex_exit(&jpool->mtx);
2464 		num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2465 		/* if we are still low, then we have to copy */
2466 		if (num_owned_by_mcp < 16) {
2467 			MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_copy);
2468 			/* allocate a new buffer to pass up the stack */
2469 			mp = allocb(len + MXGEFW_PAD, 0);
2470 			if (mp == NULL) {
2471 				goto abort;
2472 			}
2473 			bcopy(j->buf,
2474 			    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2475 			myri10ge_jfree_rtn(j);
2476 			/* push buffer back to NIC */
2477 			mutex_enter(&jpool->mtx);
2478 			myri10ge_restock_jumbos(ss);
2479 			mutex_exit(&jpool->mtx);
2480 			goto set_len;
2481 		}
2482 	}
2483 
2484 	/* loan our buffer to the stack */
2485 	mp = desballoc((unsigned char *)j->buf, myri10ge_mtu, 0, &j->free_func);
2486 	if (mp == NULL) {
2487 		goto abort;
2488 	}
2489 
2490 set_len:
2491 	mp->b_rptr += MXGEFW_PAD;
2492 	mp->b_wptr = ((unsigned char *) mp->b_rptr + len);
2493 
2494 	ss->rx_stats.ibytes += len;
2495 	ss->rx_stats.ipackets += 1;
2496 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2497 
2498 	return (mp);
2499 
2500 abort:
2501 	myri10ge_jfree_rtn(j);
2502 	MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_big_nobuf);
2503 	return (NULL);
2504 }
2505 
/*
 * Free all transmit buffers up until the specified index
 *
 * ss		- slice whose tx ring is being reaped
 * mcp_index	- completed-packet count to catch up to; slots are reaped
 *		  until tx->pkt_done equals it
 *
 * For each reaped slot the mblk is freed and its DMA handle is unbound
 * and collected on a local list, which is passed to
 * myri10ge_free_tx_handles() in one call at the end.  Per-packet
 * statistics recorded in the slot are folded into the ring totals.
 */
static inline void
myri10ge_tx_done(struct myri10ge_slice_state *ss, uint32_t mcp_index)
{
	myri10ge_tx_ring_t *tx;
	struct myri10ge_tx_dma_handle_head handles;
	int idx;
	int limit = 0;

	tx = &ss->tx;
	handles.head = NULL;
	handles.tail = NULL;
	while (tx->pkt_done != (int)mcp_index) {
		idx = tx->done & tx->mask;

		/*
		 * mblk & DMA handle attached only to first slot
		 * per buffer in the packet
		 */

		if (tx->info[idx].m) {
			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
			/* push the handle on the front of the local list */
			tx->info[idx].handle->next = handles.head;
			handles.head = tx->info[idx].handle;
			if (handles.tail == NULL)
				handles.tail = tx->info[idx].handle;
			freeb(tx->info[idx].m);
			tx->info[idx].m = 0;
			tx->info[idx].handle = 0;
		}
		/*
		 * a non-zero packet count marks the slot that carries a
		 * packet's statistics; only such slots advance pkt_done
		 */
		if (tx->info[idx].ostat.opackets != 0) {
			tx->stats.multixmt += tx->info[idx].ostat.multixmt;
			tx->stats.brdcstxmt += tx->info[idx].ostat.brdcstxmt;
			tx->stats.obytes += tx->info[idx].ostat.obytes;
			tx->stats.opackets += tx->info[idx].ostat.opackets;
			tx->info[idx].stat.un.all = 0;
			tx->pkt_done++;
		}

		tx->done++;
		/*
		 * if we stalled the queue, wake it.  But Wait until
		 * we have at least 1/2 our slots free.
		 */
		if ((tx->req - tx->done) < (tx->mask >> 1) &&
		    tx->stall != tx->sched) {
			mutex_enter(&ss->tx.lock);
			tx->sched = tx->stall;
			mutex_exit(&ss->tx.lock);
			mac_tx_ring_update(ss->mgp->mh, tx->rh);
		}

		/* limit potential for livelock */
		if (unlikely(++limit >  2 * tx->mask))
			break;
	}
	if (tx->req == tx->done && tx->stop != NULL) {
		/*
		 * Nic has sent all pending requests, allow him
		 * to stop polling this queue
		 */
		mutex_enter(&tx->lock);
		/* re-check under the lock before writing the stop word */
		if (tx->req == tx->done && tx->active) {
			*(int *)(void *)tx->stop = 1;
			tx->active = 0;
			mb();
		}
		mutex_exit(&tx->lock);
	}
	if (handles.head != NULL)
		myri10ge_free_tx_handles(tx, &handles);
}
2580 
2581 static void
2582 myri10ge_mbl_init(struct myri10ge_mblk_list *mbl)
2583 {
2584 	mbl->head = NULL;
2585 	mbl->tail = &mbl->head;
2586 	mbl->cnt = 0;
2587 }
2588 
2589 /*ARGSUSED*/
2590 void
2591 myri10ge_mbl_append(struct myri10ge_slice_state *ss,
2592     struct myri10ge_mblk_list *mbl, mblk_t *mp)
2593 {
2594 	*(mbl->tail) = mp;
2595 	mbl->tail = &mp->b_next;
2596 	mp->b_next = NULL;
2597 	mbl->cnt++;
2598 }
2599 
2600 
/*
 * Drain the slice's receive-completion ring into mbl.
 *
 * ss	 - slice being serviced
 * mbl	 - list that received mblks are appended to
 * limit - byte budget; processing stops once the lengths consumed
 *	   exceed it
 * stop	 - re-read before every entry; a non-zero value ends the loop
 *
 * An entry with a non-zero length is treated as ready and has its
 * length cleared once consumed.  Frames go through LRO when
 * myri10ge_lro is set; any LRO sessions still active at the end are
 * flushed into mbl.
 */
static inline void
myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
    struct myri10ge_mblk_list *mbl, int limit, boolean_t *stop)
{
	myri10ge_rx_done_t *rx_done = &ss->rx_done;
	struct myri10ge_priv *mgp = ss->mgp;
	mblk_t *mp;
	struct lro_entry *lro;
	uint16_t length;
	uint16_t checksum;


	while (rx_done->entry[rx_done->idx].length != 0) {
		if (unlikely (*stop)) {
			break;
		}
		length = ntohs(rx_done->entry[rx_done->idx].length);
		/* strip the bits covered by MXGEFW_RSS_HASH_MASK */
		length &= (~MXGEFW_RSS_HASH_MASK);

		/* limit potential for livelock */
		limit -= length;
		if (unlikely(limit < 0))
			break;

		/* mark the entry consumed */
		rx_done->entry[rx_done->idx].length = 0;
		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
		/* frame size decides which receive ring holds the data */
		if (length <= myri10ge_small_bytes)
			mp = myri10ge_rx_done_small(ss, length, checksum);
		else
			mp = myri10ge_rx_done_big(ss, length, checksum);
		if (mp != NULL) {
			/*
			 * append directly when LRO is off or when
			 * myri10ge_lro_rx() returns non-zero
			 */
			if (!myri10ge_lro ||
			    0 != myri10ge_lro_rx(ss, mp, checksum, mbl))
				myri10ge_mbl_append(ss, mbl, mp);
		}
		rx_done->cnt++;
		rx_done->idx = rx_done->cnt & (mgp->max_intr_slots - 1);
	}
	/* flush every LRO session that is still open */
	while (ss->lro_active != NULL) {
		lro = ss->lro_active;
		ss->lro_active = lro->next;
		myri10ge_lro_flush(ss, lro, mbl);
	}
}
2645 
2646 static void
2647 myri10ge_intr_rx(struct myri10ge_slice_state *ss)
2648 {
2649 	uint64_t gen;
2650 	struct myri10ge_mblk_list mbl;
2651 
2652 	myri10ge_mbl_init(&mbl);
2653 	if (mutex_tryenter(&ss->rx_lock) == 0)
2654 		return;
2655 	gen = ss->rx_gen_num;
2656 	myri10ge_clean_rx_done(ss, &mbl, MYRI10GE_POLL_NULL,
2657 	    &ss->rx_polling);
2658 	if (mbl.head != NULL)
2659 		mac_rx_ring(ss->mgp->mh, ss->rx_rh, mbl.head, gen);
2660 	mutex_exit(&ss->rx_lock);
2661 
2662 }
2663 
2664 static mblk_t *
2665 myri10ge_poll_rx(void *arg, int bytes)
2666 {
2667 	struct myri10ge_slice_state *ss = arg;
2668 	struct myri10ge_mblk_list mbl;
2669 	boolean_t dummy = B_FALSE;
2670 
2671 	if (bytes == 0)
2672 		return (NULL);
2673 
2674 	myri10ge_mbl_init(&mbl);
2675 	mutex_enter(&ss->rx_lock);
2676 	if (ss->rx_polling)
2677 		myri10ge_clean_rx_done(ss, &mbl, bytes, &dummy);
2678 	else
2679 		printf("%d: poll_rx: token=%d, polling=%d\n", (int)(ss -
2680 		    ss->mgp->ss), ss->rx_token, ss->rx_polling);
2681 	mutex_exit(&ss->rx_lock);
2682 	return (mbl.head);
2683 }
2684 
/*
 * Per-slice interrupt handler.  arg0 is the slice state; arg1 is
 * unused.
 *
 * Returns DDI_INTR_UNCLAIMED when the firmware stats block is not
 * marked valid, DDI_INTR_CLAIMED otherwise.  In the claimed case it
 * services receives, reaps transmit completions, propagates link-state
 * changes to the MAC layer and writes the irq_claim words.
 */
/*ARGSUSED*/
static uint_t
myri10ge_intr(caddr_t arg0, caddr_t arg1)
{
	struct myri10ge_slice_state *ss =
	    (struct myri10ge_slice_state *)(void *)arg0;
	struct myri10ge_priv *mgp = ss->mgp;
	mcp_irq_data_t *stats = ss->fw_stats;
	myri10ge_tx_ring_t *tx = &ss->tx;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return (DDI_INTR_UNCLAIMED);
	}
	/* keep a private copy: stats->valid is cleared further down */
	valid = stats->valid;

	/* low bit indicates receives are present */
	if (valid & 1)
		myri10ge_intr_rx(ss);

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		if (!myri10ge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
		mb();
	} else {
		/* no need to wait for conf. that irq is low */
		stats->valid = 0;
	}

	/*
	 * Keep reaping while stats->valid reads non-zero; when it was
	 * already cleared above the body runs exactly once.
	 */
	do {
		/* check for transmit completes and receives */
		send_done_count = ntohl(stats->send_done_count);
		if (send_done_count != tx->pkt_done)
			myri10ge_tx_done(ss, (int)send_done_count);
	} while (*((volatile uint8_t *) &stats->valid));

	if (stats->stats_updated) {
		if (mgp->link_state != stats->link_up || stats->link_down) {
			mgp->link_state = stats->link_up;
			if (stats->link_down) {
				/*
				 * myri10ge_stop_locked() watches down_cnt
				 * to learn that the interface went down
				 */
				mgp->down_cnt += stats->link_down;
				mgp->link_state = 0;
			}
			if (mgp->link_state) {
				if (myri10ge_verbose)
					printf("%s: link up\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_UP);
			} else {
				if (myri10ge_verbose)
					printf("%s: link down\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_DOWN);
			}
			MYRI10GE_NIC_STAT_INC(link_changes);
		}
		if (mgp->rdma_tags_available !=
		    ntohl(ss->fw_stats->rdma_tags_available)) {
			mgp->rdma_tags_available =
			    ntohl(ss->fw_stats->rdma_tags_available);
			cmn_err(CE_NOTE, "%s: RDMA timed out! "
			    "%d tags left\n", mgp->name,
			    mgp->rdma_tags_available);
		}
	}

	mb();
	/* check to see if we have rx token to pass back */
	if (valid & 0x1) {
		mutex_enter(&ss->poll_lock);
		if (ss->rx_polling) {
			/* polling is active: hold on to the token */
			ss->rx_token = 1;
		} else {
			*ss->irq_claim = BE_32(3);
			ss->rx_token = 0;
		}
		mutex_exit(&ss->poll_lock);
	}
	/* the second claim word is written on every claimed interrupt */
	*(ss->irq_claim + 1) = BE_32(3);
	return (DDI_INTR_CLAIMED);
}
2770 
2771 /*
2772  * Add or remove a multicast address.  This is called with our
2773  * macinfo's lock held by GLD, so we do not need to worry about
2774  * our own locking here.
2775  */
2776 static int
2777 myri10ge_m_multicst(void *arg, boolean_t add, const uint8_t *multicastaddr)
2778 {
2779 	myri10ge_cmd_t cmd;
2780 	struct myri10ge_priv *mgp = arg;
2781 	int status, join_leave;
2782 
2783 	if (add)
2784 		join_leave = MXGEFW_JOIN_MULTICAST_GROUP;
2785 	else
2786 		join_leave = MXGEFW_LEAVE_MULTICAST_GROUP;
2787 	(void) memcpy(&cmd.data0, multicastaddr, 4);
2788 	(void) memcpy(&cmd.data1, multicastaddr + 4, 2);
2789 	cmd.data0 = htonl(cmd.data0);
2790 	cmd.data1 = htonl(cmd.data1);
2791 	status = myri10ge_send_cmd(mgp, join_leave, &cmd);
2792 	if (status == 0)
2793 		return (0);
2794 
2795 	cmn_err(CE_WARN, "%s: failed to set multicast address\n",
2796 	    mgp->name);
2797 	return (status);
2798 }
2799 
2800 
2801 static int
2802 myri10ge_m_promisc(void *arg, boolean_t on)
2803 {
2804 	struct myri10ge_priv *mgp = arg;
2805 
2806 	myri10ge_change_promisc(mgp, on);
2807 	return (0);
2808 }
2809 
2810 /*
2811  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2812  *  backwards one at a time and handle ring wraps
2813  */
2814 
2815 static inline void
2816 myri10ge_submit_req_backwards(myri10ge_tx_ring_t *tx,
2817     mcp_kreq_ether_send_t *src, int cnt)
2818 {
2819 	int idx, starting_slot;
2820 	starting_slot = tx->req;
2821 	while (cnt > 1) {
2822 		cnt--;
2823 		idx = (starting_slot + cnt) & tx->mask;
2824 		myri10ge_pio_copy(&tx->lanai[idx],
2825 		    &src[cnt], sizeof (*src));
2826 		mb();
2827 	}
2828 }
2829 
/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 *
 * tx	- transmit ring; tx->req is advanced by cnt
 * src	- array of cnt requests; src[0].flags is cleared while the chain
 *	  is written and restored afterwards
 * cnt	- number of requests in src
 */

static inline void
myri10ge_submit_req(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
    int cnt)
{
	int idx, i;
	uint32_t *src_ints, *dst_ints;
	mcp_kreq_ether_send_t *srcp, *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	/* hide the first request's flags until the whole chain is written */
	last_flags = src->flags;
	src->flags = 0;
	mb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy the requests forward, two at a time */
		for (i = 0; i < (cnt - 1); i += 2) {
			myri10ge_pio_copy(dstp, srcp, 2 * sizeof (*src));
			mb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * submit all but the first request, and ensure
		 *  that it is submitted below
		 */
		myri10ge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	/*
	 * Forward path: copies the odd request left over by the pairwise
	 * loop, if any.  Backward path: srcp/dstp still point at the
	 * first request, which is copied here with its flags zeroed.
	 */
	if (i < cnt) {
		/* submit the first request */
		myri10ge_pio_copy(dstp, srcp, sizeof (*src));
		mb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags |= last_flags;
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (uint32_t *)dst;
	dst_ints += 3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	mb();
	/* notify NIC to poll this tx ring */
	if (!tx->active && tx->go != NULL) {
		*(int *)(void *)tx->go = 1;
		tx->active = 1;
		tx->activate++;
		mb();
	}
}
2892 
2893 /* ARGSUSED */
2894 static inline void
2895 myri10ge_lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
2896 {
2897 	uint32_t lso_flag;
2898 	mac_lso_get(mp, mss, &lso_flag);
2899 	(*flags) |= lso_flag;
2900 }
2901 
2902 
2903 /* like pullupmsg, except preserve hcksum/LSO attributes */
2904 static int
2905 myri10ge_pullup(struct myri10ge_slice_state *ss, mblk_t *mp)
2906 {
2907 	uint32_t start, stuff, tx_offload_flags, mss;
2908 	int ok;
2909 
2910 	mss = 0;
2911 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
2912 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
2913 
2914 	ok = pullupmsg(mp, -1);
2915 	if (!ok) {
2916 		printf("pullupmsg failed");
2917 		return (DDI_FAILURE);
2918 	}
2919 	MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_pullup);
2920 	mac_hcksum_set(mp, start, stuff, NULL, NULL, tx_offload_flags);
2921 	if (tx_offload_flags & HW_LSO)
2922 		DB_LSOMSS(mp) = (uint16_t)mss;
2923 	lso_info_set(mp, mss, tx_offload_flags);
2924 	return (DDI_SUCCESS);
2925 }
2926 
2927 static inline void
2928 myri10ge_tx_stat(struct myri10ge_tx_pkt_stats *s, struct ether_header *eh,
2929     int opackets, int obytes)
2930 {
2931 	s->un.all = 0;
2932 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2933 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2934 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2935 			s->un.s.brdcstxmt = 1;
2936 		else
2937 			s->un.s.multixmt = 1;
2938 	}
2939 	s->un.s.opackets = (uint16_t)opackets;
2940 	s->un.s.obytes = obytes;
2941 }
2942 
2943 static int
2944 myri10ge_tx_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
2945     mcp_kreq_ether_send_t *req)
2946 {
2947 	myri10ge_tx_ring_t *tx = &ss->tx;
2948 	caddr_t ptr;
2949 	struct myri10ge_tx_copybuf *cp;
2950 	mblk_t *bp;
2951 	int idx, mblen, avail;
2952 	uint16_t len;
2953 
2954 	mutex_enter(&tx->lock);
2955 	avail = tx->mask - (tx->req - tx->done);
2956 	if (avail <= 1) {
2957 		mutex_exit(&tx->lock);
2958 		return (EBUSY);
2959 	}
2960 	idx = tx->req & tx->mask;
2961 	cp = &tx->cp[idx];
2962 	ptr = cp->va;
2963 	for (len = 0, bp = mp; bp != NULL; bp = bp->b_cont) {
2964 		mblen = MBLKL(bp);
2965 		bcopy(bp->b_rptr, ptr, mblen);
2966 		ptr += mblen;
2967 		len += mblen;
2968 	}
2969 	/* ensure runts are padded to 60 bytes */
2970 	if (len < 60) {
2971 		bzero(ptr, 64 - len);
2972 		len = 60;
2973 	}
2974 	req->addr_low = cp->dma.low;
2975 	req->addr_high = cp->dma.high;
2976 	req->length = htons(len);
2977 	req->pad = 0;
2978 	req->rdma_count = 1;
2979 	myri10ge_tx_stat(&tx->info[idx].stat,
2980 	    (struct ether_header *)(void *)cp->va, 1, len);
2981 	(void) ddi_dma_sync(cp->dma.handle, 0, len, DDI_DMA_SYNC_FORDEV);
2982 	myri10ge_submit_req(&ss->tx, req, 1);
2983 	mutex_exit(&tx->lock);
2984 	freemsg(mp);
2985 	return (DDI_SUCCESS);
2986 }
2987 
2988 
2989 static void
2990 myri10ge_send_locked(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *req_list,
2991     struct myri10ge_tx_buffer_state *tx_info,
2992     int count)
2993 {
2994 	int i, idx;
2995 
2996 	idx = 0; /* gcc -Wuninitialized */
2997 	/* store unmapping and bp info for tx irq handler */
2998 	for (i = 0; i < count; i++) {
2999 		idx = (tx->req + i) & tx->mask;
3000 		tx->info[idx].m = tx_info[i].m;
3001 		tx->info[idx].handle = tx_info[i].handle;
3002 	}
3003 	tx->info[idx].stat.un.all = tx_info[0].stat.un.all;
3004 
3005 	/* submit the frame to the nic */
3006 	myri10ge_submit_req(tx, req_list, count);
3007 
3008 
3009 }
3010 
3011 
3012 
3013 static void
3014 myri10ge_copydata(mblk_t *mp, int off, int len, caddr_t buf)
3015 {
3016 	mblk_t *bp;
3017 	int seglen;
3018 	uint_t count;
3019 
3020 	bp = mp;
3021 
3022 	while (off > 0) {
3023 		seglen = MBLKL(bp);
3024 		if (off < seglen)
3025 			break;
3026 		off -= seglen;
3027 		bp = bp->b_cont;
3028 	}
3029 	while (len > 0) {
3030 		seglen = MBLKL(bp);
3031 		count = min(seglen - off, len);
3032 		bcopy(bp->b_rptr + off, buf, count);
3033 		len -= count;
3034 		buf += count;
3035 		off = 0;
3036 		bp = bp->b_cont;
3037 	}
3038 }
3039 
3040 static int
3041 myri10ge_ether_parse_header(mblk_t *mp)
3042 {
3043 	struct ether_header eh_copy;
3044 	struct ether_header *eh;
3045 	int eth_hdr_len, seglen;
3046 
3047 	seglen = MBLKL(mp);
3048 	eth_hdr_len = sizeof (*eh);
3049 	if (seglen < eth_hdr_len) {
3050 		myri10ge_copydata(mp, 0, eth_hdr_len, (caddr_t)&eh_copy);
3051 		eh = &eh_copy;
3052 	} else {
3053 		eh = (struct ether_header *)(void *)mp->b_rptr;
3054 	}
3055 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
3056 		eth_hdr_len += 4;
3057 	}
3058 
3059 	return (eth_hdr_len);
3060 }
3061 
/*
 * Locate the IP and TCP headers of an LSO packet and zero the IP header
 * checksum in the packet itself.
 *
 * mp	- packet (mblk chain)
 * off	- offset of the IP header from the start of mp
 *
 * Returns the offset just past the TCP header, i.e.
 * off + IP header length + TCP header length.
 *
 * When the headers are not contiguous in the first mblk they are
 * gathered into a local buffer for parsing; the checksum is then zeroed
 * by walking the chain to the two bytes that hold it.
 */
static int
myri10ge_lso_parse_header(mblk_t *mp, int off)
{
	char buf[128];
	int seglen, sum_off;
	struct ip *ip;
	struct tcphdr *tcp;

	seglen = MBLKL(mp);
	/* first make sure the fixed part of the IP header is readable */
	if (seglen < off + sizeof (*ip)) {
		myri10ge_copydata(mp, off, sizeof (*ip), buf);
		ip = (struct ip *)(void *)buf;
	} else {
		ip = (struct ip *)(void *)(mp->b_rptr + off);
	}
	/* then, with ip_hl known, the full IP header plus the TCP header */
	if (seglen < off + (ip->ip_hl << 2) + sizeof (*tcp)) {
		myri10ge_copydata(mp, off,
		    (ip->ip_hl << 2) + sizeof (*tcp), buf);
		ip = (struct ip *)(void *)buf;
	}
	tcp = (struct tcphdr *)(void *)((char *)ip + (ip->ip_hl << 2));

	/*
	 * NIC expects ip_sum to be zero.  Recent changes to
	 * OpenSolaris leave the correct ip checksum there, rather
	 * than the required zero, so we need to zero it.  Otherwise,
	 * the NIC will produce bad checksums when sending LSO packets.
	 */
	if (ip->ip_sum != 0) {
		if (((char *)ip) != buf) {
			/* ip points into mblk, so just zero it */
			ip->ip_sum = 0;
		} else {
			/*
			 * ip points into a copy, so walk the chain
			 * to find the ip_csum, then zero it
			 */
			sum_off = off + _PTRDIFF(&ip->ip_sum, buf);
			/* advance to the mblk holding the first csum byte */
			while (sum_off > (int)(MBLKL(mp) - 1)) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
			/* the second byte may sit in a later mblk */
			sum_off++;
			while (sum_off > MBLKL(mp) - 1) {
				sum_off -= MBLKL(mp);
				mp = mp->b_cont;
			}
			mp->b_rptr[sum_off] = 0;
		}
	}
	/* ip_hl and th_off both count 32-bit words */
	return (off + ((ip->ip_hl + tcp->th_off) << 2));
}
3115 
3116 static int
3117 myri10ge_tx_tso_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
3118     mcp_kreq_ether_send_t *req_list, int hdr_size, int pkt_size,
3119     uint16_t mss, uint8_t cksum_offset)
3120 {
3121 	myri10ge_tx_ring_t *tx = &ss->tx;
3122 	struct myri10ge_priv *mgp = ss->mgp;
3123 	mblk_t *bp;
3124 	mcp_kreq_ether_send_t *req;
3125 	struct myri10ge_tx_copybuf *cp;
3126 	caddr_t rptr, ptr;
3127 	int mblen, count, cum_len, mss_resid, tx_req, pkt_size_tmp;
3128 	int resid, avail, idx, hdr_size_tmp, tx_boundary;
3129 	int rdma_count;
3130 	uint32_t seglen, len, boundary, low, high_swapped;
3131 	uint16_t pseudo_hdr_offset = htons(mss);
3132 	uint8_t flags;
3133 
3134 	tx_boundary = mgp->tx_boundary;
3135 	hdr_size_tmp = hdr_size;
3136 	resid = tx_boundary;
3137 	count = 1;
3138 	mutex_enter(&tx->lock);
3139 
3140 	/* check to see if the slots are really there */
3141 	avail = tx->mask - (tx->req - tx->done);
3142 	if (unlikely(avail <=  MYRI10GE_MAX_SEND_DESC_TSO)) {
3143 		atomic_inc_32(&tx->stall);
3144 		mutex_exit(&tx->lock);
3145 		return (EBUSY);
3146 	}
3147 
3148 	/* copy */
3149 	cum_len = -hdr_size;
3150 	count = 0;
3151 	req = req_list;
3152 	idx = tx->mask & tx->req;
3153 	cp = &tx->cp[idx];
3154 	low = ntohl(cp->dma.low);
3155 	ptr = cp->va;
3156 	cp->len = 0;
3157 	if (mss) {
3158 		int payload = pkt_size - hdr_size;
3159 		uint16_t opackets = (payload / mss) + ((payload % mss) != 0);
3160 		tx->info[idx].ostat.opackets = opackets;
3161 		tx->info[idx].ostat.obytes = (opackets - 1) * hdr_size
3162 		    + pkt_size;
3163 	}
3164 	hdr_size_tmp = hdr_size;
3165 	mss_resid = mss;
3166 	flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3167 	tx_req = tx->req;
3168 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3169 		mblen = MBLKL(bp);
3170 		rptr = (caddr_t)bp->b_rptr;
3171 		len = min(hdr_size_tmp, mblen);
3172 		if (len) {
3173 			bcopy(rptr, ptr, len);
3174 			rptr += len;
3175 			ptr += len;
3176 			resid -= len;
3177 			mblen -= len;
3178 			hdr_size_tmp -= len;
3179 			cp->len += len;
3180 			if (hdr_size_tmp)
3181 				continue;
3182 			if (resid < mss) {
3183 				tx_req++;
3184 				idx = tx->mask & tx_req;
3185 				cp = &tx->cp[idx];
3186 				low = ntohl(cp->dma.low);
3187 				ptr = cp->va;
3188 				resid = tx_boundary;
3189 			}
3190 		}
3191 		while (mblen) {
3192 			len = min(mss_resid, mblen);
3193 			bcopy(rptr, ptr, len);
3194 			mss_resid -= len;
3195 			resid -= len;
3196 			mblen -= len;
3197 			rptr += len;
3198 			ptr += len;
3199 			cp->len += len;
3200 			if (mss_resid == 0) {
3201 				mss_resid = mss;
3202 				if (resid < mss) {
3203 					tx_req++;
3204 					idx = tx->mask & tx_req;
3205 					cp = &tx->cp[idx];
3206 					cp->len = 0;
3207 					low = ntohl(cp->dma.low);
3208 					ptr = cp->va;
3209 					resid = tx_boundary;
3210 				}
3211 			}
3212 		}
3213 	}
3214 
3215 	req = req_list;
3216 	pkt_size_tmp = pkt_size;
3217 	count = 0;
3218 	rdma_count = 0;
3219 	tx_req = tx->req;
3220 	while (pkt_size_tmp) {
3221 		idx = tx->mask & tx_req;
3222 		cp = &tx->cp[idx];
3223 		high_swapped = cp->dma.high;
3224 		low = ntohl(cp->dma.low);
3225 		len = cp->len;
3226 		if (len == 0) {
3227 			printf("len=0! pkt_size_tmp=%d, pkt_size=%d\n",
3228 			    pkt_size_tmp, pkt_size);
3229 			for (bp = mp; bp != NULL; bp = bp->b_cont) {
3230 				mblen = MBLKL(bp);
3231 				printf("mblen:%d\n", mblen);
3232 			}
3233 			pkt_size_tmp = pkt_size;
3234 			tx_req = tx->req;
3235 			while (pkt_size_tmp > 0) {
3236 				idx = tx->mask & tx_req;
3237 				cp = &tx->cp[idx];
3238 				printf("cp->len = %d\n", cp->len);
3239 				pkt_size_tmp -= cp->len;
3240 				tx_req++;
3241 			}
3242 			printf("dropped\n");
3243 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3244 			goto done;
3245 		}
3246 		pkt_size_tmp -= len;
3247 		while (len) {
3248 			while (len) {
3249 				uint8_t flags_next;
3250 				int cum_len_next;
3251 
3252 				boundary = (low + mgp->tx_boundary) &
3253 				    ~(mgp->tx_boundary - 1);
3254 				seglen = boundary - low;
3255 				if (seglen > len)
3256 					seglen = len;
3257 
3258 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3259 				cum_len_next = cum_len + seglen;
3260 				(req-rdma_count)->rdma_count = rdma_count + 1;
3261 				if (likely(cum_len >= 0)) {
3262 					/* payload */
3263 					int next_is_first, chop;
3264 
3265 					chop = (cum_len_next > mss);
3266 					cum_len_next = cum_len_next % mss;
3267 					next_is_first = (cum_len_next == 0);
3268 					flags |= chop *
3269 					    MXGEFW_FLAGS_TSO_CHOP;
3270 					flags_next |= next_is_first *
3271 					    MXGEFW_FLAGS_FIRST;
3272 					rdma_count |= -(chop | next_is_first);
3273 					rdma_count += chop & !next_is_first;
3274 				} else if (likely(cum_len_next >= 0)) {
3275 					/* header ends */
3276 					int small;
3277 
3278 					rdma_count = -1;
3279 					cum_len_next = 0;
3280 					seglen = -cum_len;
3281 					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
3282 					flags_next = MXGEFW_FLAGS_TSO_PLD |
3283 					    MXGEFW_FLAGS_FIRST |
3284 					    (small * MXGEFW_FLAGS_SMALL);
3285 				}
3286 				req->addr_high = high_swapped;
3287 				req->addr_low = htonl(low);
3288 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3289 				req->pad = 0; /* complete solid 16-byte block */
3290 				req->rdma_count = 1;
3291 				req->cksum_offset = cksum_offset;
3292 				req->length = htons(seglen);
3293 				req->flags = flags | ((cum_len & 1) *
3294 				    MXGEFW_FLAGS_ALIGN_ODD);
3295 				if (cksum_offset > seglen)
3296 					cksum_offset -= seglen;
3297 				else
3298 					cksum_offset = 0;
3299 				low += seglen;
3300 				len -= seglen;
3301 				cum_len = cum_len_next;
3302 				req++;
3303 				req->flags = 0;
3304 				flags = flags_next;
3305 				count++;
3306 				rdma_count++;
3307 			}
3308 		}
3309 		tx_req++;
3310 	}
3311 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3312 	do {
3313 		req--;
3314 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
3315 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3316 	    MXGEFW_FLAGS_FIRST)));
3317 
3318 	myri10ge_submit_req(tx, req_list, count);
3319 done:
3320 	mutex_exit(&tx->lock);
3321 	freemsg(mp);
3322 	return (DDI_SUCCESS);
3323 }
3324 
3325 /*
3326  * Try to send the chain of buffers described by the mp.  We must not
3327  * encapsulate more than eth->tx.req - eth->tx.done, or
3328  * MXGEFW_MAX_SEND_DESC, whichever is more.
3329  */
3330 
3331 static int
3332 myri10ge_send(struct myri10ge_slice_state *ss, mblk_t *mp,
3333     mcp_kreq_ether_send_t *req_list, struct myri10ge_tx_buffer_state *tx_info)
3334 {
3335 	struct myri10ge_priv *mgp = ss->mgp;
3336 	myri10ge_tx_ring_t *tx = &ss->tx;
3337 	mcp_kreq_ether_send_t *req;
3338 	struct myri10ge_tx_dma_handle *handles, *dma_handle = NULL;
3339 	mblk_t  *bp;
3340 	ddi_dma_cookie_t cookie;
3341 	int err, rv, count, avail, mblen, try_pullup, i, max_segs, maclen,
3342 	    rdma_count, cum_len, lso_hdr_size;
3343 	uint32_t start, stuff, tx_offload_flags;
3344 	uint32_t seglen, len, mss, boundary, low, high_swapped;
3345 	uint_t ncookies;
3346 	uint16_t pseudo_hdr_offset;
3347 	uint8_t flags, cksum_offset, odd_flag;
3348 	int pkt_size;
3349 	int lso_copy = myri10ge_lso_copy;
3350 	try_pullup = 1;
3351 
3352 again:
3353 	/* Setup checksum offloading, if needed */
3354 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
3355 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
3356 	if (tx_offload_flags & HW_LSO) {
3357 		max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3358 		if ((tx_offload_flags & HCK_PARTIALCKSUM) == 0) {
3359 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_lsobadflags);
3360 			freemsg(mp);
3361 			return (DDI_SUCCESS);
3362 		}
3363 	} else {
3364 		max_segs = MXGEFW_MAX_SEND_DESC;
3365 		mss = 0;
3366 	}
3367 	req = req_list;
3368 	cksum_offset = 0;
3369 	pseudo_hdr_offset = 0;
3370 
3371 	/* leave an extra slot keep the ring from wrapping */
3372 	avail = tx->mask - (tx->req - tx->done);
3373 
3374 	/*
3375 	 * If we have > MXGEFW_MAX_SEND_DESC, then any over-length
3376 	 * message will need to be pulled up in order to fit.
3377 	 * Otherwise, we are low on transmit descriptors, it is
3378 	 * probably better to stall and try again rather than pullup a
3379 	 * message to fit.
3380 	 */
3381 
3382 	if (avail < max_segs) {
3383 		err = EBUSY;
3384 		atomic_inc_32(&tx->stall_early);
3385 		goto stall;
3386 	}
3387 
3388 	/* find out how long the frame is and how many segments it is */
3389 	count = 0;
3390 	odd_flag = 0;
3391 	pkt_size = 0;
3392 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
3393 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3394 		dblk_t *dbp;
3395 		mblen = MBLKL(bp);
3396 		if (mblen == 0) {
3397 			/*
3398 			 * we can't simply skip over 0-length mblks
3399 			 * because the hardware can't deal with them,
3400 			 * and we could leak them.
3401 			 */
3402 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_zero_len);
3403 			err = EIO;
3404 			goto pullup;
3405 		}
3406 		/*
3407 		 * There's no advantage to copying most gesballoc
3408 		 * attached blocks, so disable lso copy in that case
3409 		 */
3410 		if (mss && lso_copy == 1 && ((dbp = bp->b_datap) != NULL)) {
3411 			if ((void *)dbp->db_lastfree != myri10ge_db_lastfree) {
3412 				lso_copy = 0;
3413 			}
3414 		}
3415 		pkt_size += mblen;
3416 		count++;
3417 	}
3418 
3419 	/* Try to pull up excessivly long chains */
3420 	if (count >= max_segs) {
3421 		err = myri10ge_pullup(ss, mp);
3422 		if (likely(err == DDI_SUCCESS)) {
3423 			count = 1;
3424 		} else {
3425 			if (count <  MYRI10GE_MAX_SEND_DESC_TSO) {
3426 				/*
3427 				 * just let the h/w send it, it will be
3428 				 * inefficient, but us better than dropping
3429 				 */
3430 				max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3431 			} else {
3432 				/* drop it */
3433 				MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3434 				freemsg(mp);
3435 				return (0);
3436 			}
3437 		}
3438 	}
3439 
3440 	cum_len = 0;
3441 	maclen = myri10ge_ether_parse_header(mp);
3442 
3443 	if (tx_offload_flags & HCK_PARTIALCKSUM) {
3444 
3445 		cksum_offset = start + maclen;
3446 		pseudo_hdr_offset = htons(stuff + maclen);
3447 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
3448 		flags |= MXGEFW_FLAGS_CKSUM;
3449 	}
3450 
3451 	lso_hdr_size = 0; /* -Wunitinialized */
3452 	if (mss) { /* LSO */
3453 		/* this removes any CKSUM flag from before */
3454 		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3455 		/*
3456 		 * parse the headers and set cum_len to a negative
3457 		 * value to reflect the offset of the TCP payload
3458 		 */
3459 		lso_hdr_size =  myri10ge_lso_parse_header(mp, maclen);
3460 		cum_len = -lso_hdr_size;
3461 		if ((mss < mgp->tx_boundary) && lso_copy) {
3462 			err = myri10ge_tx_tso_copy(ss, mp, req_list,
3463 			    lso_hdr_size, pkt_size, mss, cksum_offset);
3464 			return (err);
3465 		}
3466 
3467 		/*
3468 		 * for TSO, pseudo_hdr_offset holds mss.  The firmware
3469 		 * figures out where to put the checksum by parsing
3470 		 * the header.
3471 		 */
3472 
3473 		pseudo_hdr_offset = htons(mss);
3474 	} else if (pkt_size <= MXGEFW_SEND_SMALL_SIZE) {
3475 		flags |= MXGEFW_FLAGS_SMALL;
3476 		if (pkt_size < myri10ge_tx_copylen) {
3477 			req->cksum_offset = cksum_offset;
3478 			req->pseudo_hdr_offset = pseudo_hdr_offset;
3479 			req->flags = flags;
3480 			err = myri10ge_tx_copy(ss, mp, req);
3481 			return (err);
3482 		}
3483 		cum_len = 0;
3484 	}
3485 
3486 	/* pull one DMA handle for each bp from our freelist */
3487 	handles = NULL;
3488 	err = myri10ge_alloc_tx_handles(ss, count, &handles);
3489 	if (err != DDI_SUCCESS) {
3490 		err = DDI_FAILURE;
3491 		goto stall;
3492 	}
3493 	count = 0;
3494 	rdma_count = 0;
3495 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3496 		mblen = MBLKL(bp);
3497 		dma_handle = handles;
3498 		handles = handles->next;
3499 
3500 		rv = ddi_dma_addr_bind_handle(dma_handle->h, NULL,
3501 		    (caddr_t)bp->b_rptr, mblen,
3502 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
3503 		    &cookie, &ncookies);
3504 		if (unlikely(rv != DDI_DMA_MAPPED)) {
3505 			err = EIO;
3506 			try_pullup = 0;
3507 			dma_handle->next = handles;
3508 			handles = dma_handle;
3509 			goto abort_with_handles;
3510 		}
3511 
3512 		/* reserve the slot */
3513 		tx_info[count].m = bp;
3514 		tx_info[count].handle = dma_handle;
3515 
3516 		for (; ; ) {
3517 			low = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
3518 			high_swapped =
3519 			    htonl(MYRI10GE_HIGHPART_TO_U32(
3520 			    cookie.dmac_laddress));
3521 			len = (uint32_t)cookie.dmac_size;
3522 			while (len) {
3523 				uint8_t flags_next;
3524 				int cum_len_next;
3525 
3526 				boundary = (low + mgp->tx_boundary) &
3527 				    ~(mgp->tx_boundary - 1);
3528 				seglen = boundary - low;
3529 				if (seglen > len)
3530 					seglen = len;
3531 
3532 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3533 				cum_len_next = cum_len + seglen;
3534 				if (mss) {
3535 					(req-rdma_count)->rdma_count =
3536 					    rdma_count + 1;
3537 					if (likely(cum_len >= 0)) {
3538 						/* payload */
3539 						int next_is_first, chop;
3540 
3541 						chop = (cum_len_next > mss);
3542 						cum_len_next =
3543 						    cum_len_next % mss;
3544 						next_is_first =
3545 						    (cum_len_next == 0);
3546 						flags |= chop *
3547 						    MXGEFW_FLAGS_TSO_CHOP;
3548 						flags_next |= next_is_first *
3549 						    MXGEFW_FLAGS_FIRST;
3550 						rdma_count |=
3551 						    -(chop | next_is_first);
3552 						rdma_count +=
3553 						    chop & !next_is_first;
3554 					} else if (likely(cum_len_next >= 0)) {
3555 						/* header ends */
3556 						int small;
3557 
3558 						rdma_count = -1;
3559 						cum_len_next = 0;
3560 						seglen = -cum_len;
3561 						small = (mss <=
3562 						    MXGEFW_SEND_SMALL_SIZE);
3563 						flags_next =
3564 						    MXGEFW_FLAGS_TSO_PLD
3565 						    | MXGEFW_FLAGS_FIRST
3566 						    | (small *
3567 						    MXGEFW_FLAGS_SMALL);
3568 					}
3569 				}
3570 				req->addr_high = high_swapped;
3571 				req->addr_low = htonl(low);
3572 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3573 				req->pad = 0; /* complete solid 16-byte block */
3574 				req->rdma_count = 1;
3575 				req->cksum_offset = cksum_offset;
3576 				req->length = htons(seglen);
3577 				req->flags = flags | ((cum_len & 1) * odd_flag);
3578 				if (cksum_offset > seglen)
3579 					cksum_offset -= seglen;
3580 				else
3581 					cksum_offset = 0;
3582 				low += seglen;
3583 				len -= seglen;
3584 				cum_len = cum_len_next;
3585 				count++;
3586 				rdma_count++;
3587 				/*  make sure all the segments will fit */
3588 				if (unlikely(count >= max_segs)) {
3589 					MYRI10GE_ATOMIC_SLICE_STAT_INC(
3590 					    xmit_lowbuf);
3591 					/* may try a pullup */
3592 					err = EBUSY;
3593 					if (try_pullup)
3594 						try_pullup = 2;
3595 					goto abort_with_handles;
3596 				}
3597 				req++;
3598 				req->flags = 0;
3599 				flags = flags_next;
3600 				tx_info[count].m = 0;
3601 			}
3602 			ncookies--;
3603 			if (ncookies == 0)
3604 				break;
3605 			ddi_dma_nextcookie(dma_handle->h, &cookie);
3606 		}
3607 	}
3608 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3609 
3610 	if (mss) {
3611 		do {
3612 			req--;
3613 			req->flags |= MXGEFW_FLAGS_TSO_LAST;
3614 		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3615 		    MXGEFW_FLAGS_FIRST)));
3616 	}
3617 
3618 	/* calculate tx stats */
3619 	if (mss) {
3620 		uint16_t opackets;
3621 		int payload;
3622 
3623 		payload = pkt_size - lso_hdr_size;
3624 		opackets = (payload / mss) + ((payload % mss) != 0);
3625 		tx_info[0].stat.un.all = 0;
3626 		tx_info[0].ostat.opackets = opackets;
3627 		tx_info[0].ostat.obytes = (opackets - 1) * lso_hdr_size
3628 		    + pkt_size;
3629 	} else {
3630 		myri10ge_tx_stat(&tx_info[0].stat,
3631 		    (struct ether_header *)(void *)mp->b_rptr, 1, pkt_size);
3632 	}
3633 	mutex_enter(&tx->lock);
3634 
3635 	/* check to see if the slots are really there */
3636 	avail = tx->mask - (tx->req - tx->done);
3637 	if (unlikely(avail <= count)) {
3638 		mutex_exit(&tx->lock);
3639 		err = 0;
3640 		goto late_stall;
3641 	}
3642 
3643 	myri10ge_send_locked(tx, req_list, tx_info, count);
3644 	mutex_exit(&tx->lock);
3645 	return (DDI_SUCCESS);
3646 
3647 late_stall:
3648 	try_pullup = 0;
3649 	atomic_inc_32(&tx->stall_late);
3650 
3651 abort_with_handles:
3652 	/* unbind and free handles from previous mblks */
3653 	for (i = 0; i < count; i++) {
3654 		bp = tx_info[i].m;
3655 		tx_info[i].m = 0;
3656 		if (bp) {
3657 			dma_handle = tx_info[i].handle;
3658 			(void) ddi_dma_unbind_handle(dma_handle->h);
3659 			dma_handle->next = handles;
3660 			handles = dma_handle;
3661 			tx_info[i].handle = NULL;
3662 			tx_info[i].m = NULL;
3663 		}
3664 	}
3665 	myri10ge_free_tx_handle_slist(tx, handles);
3666 pullup:
3667 	if (try_pullup) {
3668 		err = myri10ge_pullup(ss, mp);
3669 		if (err != DDI_SUCCESS && try_pullup == 2) {
3670 			/* drop */
3671 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3672 			freemsg(mp);
3673 			return (0);
3674 		}
3675 		try_pullup = 0;
3676 		goto again;
3677 	}
3678 
3679 stall:
3680 	if (err != 0) {
3681 		if (err == EBUSY) {
3682 			atomic_inc_32(&tx->stall);
3683 		} else {
3684 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3685 		}
3686 	}
3687 	return (err);
3688 }
3689 
3690 static mblk_t *
3691 myri10ge_send_wrapper(void *arg, mblk_t *mp)
3692 {
3693 	struct myri10ge_slice_state *ss = arg;
3694 	int err = 0;
3695 	mcp_kreq_ether_send_t *req_list;
3696 #if defined(__i386)
3697 	/*
3698 	 * We need about 2.5KB of scratch space to handle transmits.
3699 	 * i86pc has only 8KB of kernel stack space, so we malloc the
3700 	 * scratch space there rather than keeping it on the stack.
3701 	 */
3702 	size_t req_size, tx_info_size;
3703 	struct myri10ge_tx_buffer_state *tx_info;
3704 	caddr_t req_bytes;
3705 
3706 	req_size = sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3707 	    + 8;
3708 	req_bytes = kmem_alloc(req_size, KM_SLEEP);
3709 	tx_info_size = sizeof (*tx_info) * (MYRI10GE_MAX_SEND_DESC_TSO + 1);
3710 	tx_info = kmem_alloc(tx_info_size, KM_SLEEP);
3711 #else
3712 	char req_bytes[sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3713 	    + 8];
3714 	struct myri10ge_tx_buffer_state tx_info[MYRI10GE_MAX_SEND_DESC_TSO + 1];
3715 #endif
3716 
3717 	/* ensure req_list entries are aligned to 8 bytes */
3718 	req_list = (struct mcp_kreq_ether_send *)
3719 	    (((unsigned long)req_bytes + 7UL) & ~7UL);
3720 
3721 	err = myri10ge_send(ss, mp, req_list, tx_info);
3722 
3723 #if defined(__i386)
3724 	kmem_free(tx_info, tx_info_size);
3725 	kmem_free(req_bytes, req_size);
3726 #endif
3727 	if (err)
3728 		return (mp);
3729 	else
3730 		return (NULL);
3731 }
3732 
3733 static int
3734 myri10ge_addmac(void *arg, const uint8_t *mac_addr)
3735 {
3736 	struct myri10ge_priv *mgp = arg;
3737 	int err;
3738 
3739 	if (mac_addr == NULL)
3740 		return (EINVAL);
3741 
3742 	mutex_enter(&mgp->intrlock);
3743 	if (mgp->macaddr_cnt) {
3744 		mutex_exit(&mgp->intrlock);
3745 		return (ENOSPC);
3746 	}
3747 	err = myri10ge_m_unicst(mgp, mac_addr);
3748 	if (!err)
3749 		mgp->macaddr_cnt++;
3750 
3751 	mutex_exit(&mgp->intrlock);
3752 	if (err)
3753 		return (err);
3754 
3755 	bcopy(mac_addr, mgp->mac_addr, sizeof (mgp->mac_addr));
3756 	return (0);
3757 }
3758 
3759 /*ARGSUSED*/
3760 static int
3761 myri10ge_remmac(void *arg, const uint8_t *mac_addr)
3762 {
3763 	struct myri10ge_priv *mgp = arg;
3764 
3765 	mutex_enter(&mgp->intrlock);
3766 	mgp->macaddr_cnt--;
3767 	mutex_exit(&mgp->intrlock);
3768 
3769 	return (0);
3770 }
3771 
3772 /*ARGSUSED*/
3773 static void
3774 myri10ge_fill_group(void *arg, mac_ring_type_t rtype, const int index,
3775     mac_group_info_t *infop, mac_group_handle_t gh)
3776 {
3777 	struct myri10ge_priv *mgp = arg;
3778 
3779 	if (rtype != MAC_RING_TYPE_RX)
3780 		return;
3781 
3782 	infop->mgi_driver = (mac_group_driver_t)mgp;
3783 	infop->mgi_start = NULL;
3784 	infop->mgi_stop = NULL;
3785 	infop->mgi_addmac = myri10ge_addmac;
3786 	infop->mgi_remmac = myri10ge_remmac;
3787 	infop->mgi_count = mgp->num_slices;
3788 }
3789 
3790 static int
3791 myri10ge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
3792 {
3793 	struct myri10ge_slice_state *ss;
3794 
3795 	ss = (struct myri10ge_slice_state *)rh;
3796 	mutex_enter(&ss->rx_lock);
3797 	ss->rx_gen_num = mr_gen_num;
3798 	mutex_exit(&ss->rx_lock);
3799 	return (0);
3800 }
3801 
3802 /*
3803  * Retrieve a value for one of the statistics for a particular rx ring
3804  */
3805 int
3806 myri10ge_rx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3807 {
3808 	struct myri10ge_slice_state *ss;
3809 
3810 	ss = (struct myri10ge_slice_state *)rh;
3811 	switch (stat) {
3812 	case MAC_STAT_RBYTES:
3813 		*val = ss->rx_stats.ibytes;
3814 		break;
3815 
3816 	case MAC_STAT_IPACKETS:
3817 		*val = ss->rx_stats.ipackets;
3818 		break;
3819 
3820 	default:
3821 		*val = 0;
3822 		return (ENOTSUP);
3823 	}
3824 
3825 	return (0);
3826 }
3827 
3828 /*
3829  * Retrieve a value for one of the statistics for a particular tx ring
3830  */
3831 int
3832 myri10ge_tx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3833 {
3834 	struct myri10ge_slice_state *ss;
3835 
3836 	ss = (struct myri10ge_slice_state *)rh;
3837 	switch (stat) {
3838 	case MAC_STAT_OBYTES:
3839 		*val = ss->tx.stats.obytes;
3840 		break;
3841 
3842 	case MAC_STAT_OPACKETS:
3843 		*val = ss->tx.stats.opackets;
3844 		break;
3845 
3846 	default:
3847 		*val = 0;
3848 		return (ENOTSUP);
3849 	}
3850 
3851 	return (0);
3852 }
3853 
3854 static int
3855 myri10ge_rx_ring_intr_disable(mac_intr_handle_t intrh)
3856 {
3857 	struct myri10ge_slice_state *ss;
3858 
3859 	ss = (struct myri10ge_slice_state *)intrh;
3860 	mutex_enter(&ss->poll_lock);
3861 	ss->rx_polling = B_TRUE;
3862 	mutex_exit(&ss->poll_lock);
3863 	return (0);
3864 }
3865 
3866 static int
3867 myri10ge_rx_ring_intr_enable(mac_intr_handle_t intrh)
3868 {
3869 	struct myri10ge_slice_state *ss;
3870 
3871 	ss = (struct myri10ge_slice_state *)intrh;
3872 	mutex_enter(&ss->poll_lock);
3873 	ss->rx_polling = B_FALSE;
3874 	if (ss->rx_token) {
3875 		*ss->irq_claim = BE_32(3);
3876 		ss->rx_token = 0;
3877 	}
3878 	mutex_exit(&ss->poll_lock);
3879 	return (0);
3880 }
3881 
3882 /*ARGSUSED*/
3883 static void
3884 myri10ge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
3885     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
3886 {
3887 	struct myri10ge_priv *mgp = arg;
3888 	struct myri10ge_slice_state *ss;
3889 	mac_intr_t *mintr = &infop->mri_intr;
3890 
3891 	ASSERT((unsigned int)ring_index < mgp->num_slices);
3892 
3893 	ss = &mgp->ss[ring_index];
3894 	switch (rtype) {
3895 	case MAC_RING_TYPE_RX:
3896 		ss->rx_rh = rh;
3897 		infop->mri_driver = (mac_ring_driver_t)ss;
3898 		infop->mri_start = myri10ge_ring_start;
3899 		infop->mri_stop = NULL;
3900 		infop->mri_poll = myri10ge_poll_rx;
3901 		infop->mri_stat = myri10ge_rx_ring_stat;
3902 		mintr->mi_handle = (mac_intr_handle_t)ss;
3903 		mintr->mi_enable = myri10ge_rx_ring_intr_enable;
3904 		mintr->mi_disable = myri10ge_rx_ring_intr_disable;
3905 		break;
3906 	case MAC_RING_TYPE_TX:
3907 		ss->tx.rh = rh;
3908 		infop->mri_driver = (mac_ring_driver_t)ss;
3909 		infop->mri_start = NULL;
3910 		infop->mri_stop = NULL;
3911 		infop->mri_tx = myri10ge_send_wrapper;
3912 		infop->mri_stat = myri10ge_tx_ring_stat;
3913 		break;
3914 	default:
3915 		break;
3916 	}
3917 }
3918 
3919 static void
3920 myri10ge_nic_stat_destroy(struct myri10ge_priv *mgp)
3921 {
3922 	if (mgp->ksp_stat == NULL)
3923 		return;
3924 
3925 	kstat_delete(mgp->ksp_stat);
3926 	mgp->ksp_stat = NULL;
3927 }
3928 
3929 static void
3930 myri10ge_slice_stat_destroy(struct myri10ge_slice_state *ss)
3931 {
3932 	if (ss->ksp_stat == NULL)
3933 		return;
3934 
3935 	kstat_delete(ss->ksp_stat);
3936 	ss->ksp_stat = NULL;
3937 }
3938 
3939 static void
3940 myri10ge_info_destroy(struct myri10ge_priv *mgp)
3941 {
3942 	if (mgp->ksp_info == NULL)
3943 		return;
3944 
3945 	kstat_delete(mgp->ksp_info);
3946 	mgp->ksp_info = NULL;
3947 }
3948 
3949 static int
3950 myri10ge_nic_stat_kstat_update(kstat_t *ksp, int rw)
3951 {
3952 	struct myri10ge_nic_stat *ethstat;
3953 	struct myri10ge_priv *mgp;
3954 	mcp_irq_data_t *fw_stats;
3955 
3956 
3957 	if (rw == KSTAT_WRITE)
3958 		return (EACCES);
3959 
3960 	ethstat = (struct myri10ge_nic_stat *)ksp->ks_data;
3961 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3962 	fw_stats = mgp->ss[0].fw_stats;
3963 
3964 	ethstat->dma_read_bw_MBs.value.ul = mgp->read_dma;
3965 	ethstat->dma_write_bw_MBs.value.ul = mgp->write_dma;
3966 	ethstat->dma_read_write_bw_MBs.value.ul = mgp->read_write_dma;
3967 	if (myri10ge_tx_dma_attr.dma_attr_flags & DDI_DMA_FORCE_PHYSICAL)
3968 		ethstat->dma_force_physical.value.ul = 1;
3969 	else
3970 		ethstat->dma_force_physical.value.ul = 0;
3971 	ethstat->lanes.value.ul = mgp->pcie_link_width;
3972 	ethstat->dropped_bad_crc32.value.ul =
3973 	    ntohl(fw_stats->dropped_bad_crc32);
3974 	ethstat->dropped_bad_phy.value.ul =
3975 	    ntohl(fw_stats->dropped_bad_phy);
3976 	ethstat->dropped_link_error_or_filtered.value.ul =
3977 	    ntohl(fw_stats->dropped_link_error_or_filtered);
3978 	ethstat->dropped_link_overflow.value.ul =
3979 	    ntohl(fw_stats->dropped_link_overflow);
3980 	ethstat->dropped_multicast_filtered.value.ul =
3981 	    ntohl(fw_stats->dropped_multicast_filtered);
3982 	ethstat->dropped_no_big_buffer.value.ul =
3983 	    ntohl(fw_stats->dropped_no_big_buffer);
3984 	ethstat->dropped_no_small_buffer.value.ul =
3985 	    ntohl(fw_stats->dropped_no_small_buffer);
3986 	ethstat->dropped_overrun.value.ul =
3987 	    ntohl(fw_stats->dropped_overrun);
3988 	ethstat->dropped_pause.value.ul =
3989 	    ntohl(fw_stats->dropped_pause);
3990 	ethstat->dropped_runt.value.ul =
3991 	    ntohl(fw_stats->dropped_runt);
3992 	ethstat->link_up.value.ul =
3993 	    ntohl(fw_stats->link_up);
3994 	ethstat->dropped_unicast_filtered.value.ul =
3995 	    ntohl(fw_stats->dropped_unicast_filtered);
3996 	return (0);
3997 }
3998 
3999 static int
4000 myri10ge_slice_stat_kstat_update(kstat_t *ksp, int rw)
4001 {
4002 	struct myri10ge_slice_stat *ethstat;
4003 	struct myri10ge_slice_state *ss;
4004 
4005 	if (rw == KSTAT_WRITE)
4006 		return (EACCES);
4007 
4008 	ethstat = (struct myri10ge_slice_stat *)ksp->ks_data;
4009 	ss = (struct myri10ge_slice_state *)ksp->ks_private;
4010 
4011 	ethstat->rx_big.value.ul = ss->j_rx_cnt;
4012 	ethstat->rx_bigbuf_firmware.value.ul = ss->rx_big.cnt - ss->j_rx_cnt;
4013 	ethstat->rx_bigbuf_pool.value.ul =
4014 	    ss->jpool.num_alloc - ss->jbufs_for_smalls;
4015 	ethstat->rx_bigbuf_smalls.value.ul = ss->jbufs_for_smalls;
4016 	ethstat->rx_small.value.ul = ss->rx_small.cnt -
4017 	    (ss->rx_small.mask + 1);
4018 	ethstat->tx_done.value.ul = ss->tx.done;
4019 	ethstat->tx_req.value.ul = ss->tx.req;
4020 	ethstat->tx_activate.value.ul = ss->tx.activate;
4021 	ethstat->xmit_sched.value.ul = ss->tx.sched;
4022 	ethstat->xmit_stall.value.ul = ss->tx.stall;
4023 	ethstat->xmit_stall_early.value.ul = ss->tx.stall_early;
4024 	ethstat->xmit_stall_late.value.ul = ss->tx.stall_late;
4025 	ethstat->xmit_err.value.ul =  MYRI10GE_SLICE_STAT(xmit_err);
4026 	return (0);
4027 }
4028 
4029 static int
4030 myri10ge_info_kstat_update(kstat_t *ksp, int rw)
4031 {
4032 	struct myri10ge_info *info;
4033 	struct myri10ge_priv *mgp;
4034 
4035 
4036 	if (rw == KSTAT_WRITE)
4037 		return (EACCES);
4038 
4039 	info = (struct myri10ge_info *)ksp->ks_data;
4040 	mgp = (struct myri10ge_priv *)ksp->ks_private;
4041 	kstat_named_setstr(&info->driver_version, MYRI10GE_VERSION_STR);
4042 	kstat_named_setstr(&info->firmware_version, mgp->fw_version);
4043 	kstat_named_setstr(&info->firmware_name, mgp->fw_name);
4044 	kstat_named_setstr(&info->interrupt_type, mgp->intr_type);
4045 	kstat_named_setstr(&info->product_code, mgp->pc_str);
4046 	kstat_named_setstr(&info->serial_number, mgp->sn_str);
4047 	return (0);
4048 }
4049 
/*
 * Backing storage for the "myri10ge_info" kstat: one string-typed named
 * entry per piece of driver/firmware identification.  myri10ge_info_init()
 * installs this object as ks_data of a KSTAT_FLAG_VIRTUAL kstat, and
 * myri10ge_info_kstat_update() fills in the string pointers.
 * NOTE(review): the template is a single file-scope object, so it appears
 * to be shared by every instance that creates an info kstat -- confirm.
 */
4050 static struct myri10ge_info myri10ge_info_template = {
4051 	{ "driver_version",	KSTAT_DATA_STRING },
4052 	{ "firmware_version",	KSTAT_DATA_STRING },
4053 	{ "firmware_name",	KSTAT_DATA_STRING },
4054 	{ "interrupt_type",	KSTAT_DATA_STRING },
4055 	{ "product_code",	KSTAT_DATA_STRING },
4056 	{ "serial_number",	KSTAT_DATA_STRING },
4057 };
/* Installed as ks_lock of the info kstat by myri10ge_info_init(). */
4058 static kmutex_t myri10ge_info_template_lock;
4059 
4060 
4061 static int
4062 myri10ge_info_init(struct myri10ge_priv *mgp)
4063 {
4064 	struct kstat *ksp;
4065 
4066 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4067 	    "myri10ge_info", "net", KSTAT_TYPE_NAMED,
4068 	    sizeof (myri10ge_info_template) /
4069 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4070 	if (ksp == NULL) {
4071 		cmn_err(CE_WARN,
4072 		    "%s: myri10ge_info_init: kstat_create failed", mgp->name);
4073 		return (DDI_FAILURE);
4074 	}
4075 	mgp->ksp_info = ksp;
4076 	ksp->ks_update = myri10ge_info_kstat_update;
4077 	ksp->ks_private = (void *) mgp;
4078 	ksp->ks_data = &myri10ge_info_template;
4079 	ksp->ks_lock = &myri10ge_info_template_lock;
4080 	if (MYRI10GE_VERSION_STR != NULL)
4081 		ksp->ks_data_size += strlen(MYRI10GE_VERSION_STR) + 1;
4082 	if (mgp->fw_version != NULL)
4083 		ksp->ks_data_size += strlen(mgp->fw_version) + 1;
4084 	ksp->ks_data_size += strlen(mgp->fw_name) + 1;
4085 	ksp->ks_data_size += strlen(mgp->intr_type) + 1;
4086 	if (mgp->pc_str != NULL)
4087 		ksp->ks_data_size += strlen(mgp->pc_str) + 1;
4088 	if (mgp->sn_str != NULL)
4089 		ksp->ks_data_size += strlen(mgp->sn_str) + 1;
4090 
4091 	kstat_install(ksp);
4092 	return (DDI_SUCCESS);
4093 }
4094 
4095 
4096 static int
4097 myri10ge_nic_stat_init(struct myri10ge_priv *mgp)
4098 {
4099 	struct kstat *ksp;
4100 	struct myri10ge_nic_stat *ethstat;
4101 
4102 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4103 	    "myri10ge_nic_stats", "net", KSTAT_TYPE_NAMED,
4104 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4105 	if (ksp == NULL) {
4106 		cmn_err(CE_WARN,
4107 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4108 		return (DDI_FAILURE);
4109 	}
4110 	mgp->ksp_stat = ksp;
4111 	ethstat = (struct myri10ge_nic_stat *)(ksp->ks_data);
4112 
4113 	kstat_named_init(&ethstat->dma_read_bw_MBs,
4114 	    "dma_read_bw_MBs", KSTAT_DATA_ULONG);
4115 	kstat_named_init(&ethstat->dma_write_bw_MBs,
4116 	    "dma_write_bw_MBs", KSTAT_DATA_ULONG);
4117 	kstat_named_init(&ethstat->dma_read_write_bw_MBs,
4118 	    "dma_read_write_bw_MBs", KSTAT_DATA_ULONG);
4119 	kstat_named_init(&ethstat->dma_force_physical,
4120 	    "dma_force_physical", KSTAT_DATA_ULONG);
4121 	kstat_named_init(&ethstat->lanes,
4122 	    "lanes", KSTAT_DATA_ULONG);
4123 	kstat_named_init(&ethstat->dropped_bad_crc32,
4124 	    "dropped_bad_crc32", KSTAT_DATA_ULONG);
4125 	kstat_named_init(&ethstat->dropped_bad_phy,
4126 	    "dropped_bad_phy", KSTAT_DATA_ULONG);
4127 	kstat_named_init(&ethstat->dropped_link_error_or_filtered,
4128 	    "dropped_link_error_or_filtered", KSTAT_DATA_ULONG);
4129 	kstat_named_init(&ethstat->dropped_link_overflow,
4130 	    "dropped_link_overflow", KSTAT_DATA_ULONG);
4131 	kstat_named_init(&ethstat->dropped_multicast_filtered,
4132 	    "dropped_multicast_filtered", KSTAT_DATA_ULONG);
4133 	kstat_named_init(&ethstat->dropped_no_big_buffer,
4134 	    "dropped_no_big_buffer", KSTAT_DATA_ULONG);
4135 	kstat_named_init(&ethstat->dropped_no_small_buffer,
4136 	    "dropped_no_small_buffer", KSTAT_DATA_ULONG);
4137 	kstat_named_init(&ethstat->dropped_overrun,
4138 	    "dropped_overrun", KSTAT_DATA_ULONG);
4139 	kstat_named_init(&ethstat->dropped_pause,
4140 	    "dropped_pause", KSTAT_DATA_ULONG);
4141 	kstat_named_init(&ethstat->dropped_runt,
4142 	    "dropped_runt", KSTAT_DATA_ULONG);
4143 	kstat_named_init(&ethstat->dropped_unicast_filtered,
4144 	    "dropped_unicast_filtered", KSTAT_DATA_ULONG);
4145 	kstat_named_init(&ethstat->dropped_runt, "dropped_runt",
4146 	    KSTAT_DATA_ULONG);
4147 	kstat_named_init(&ethstat->link_up, "link_up", KSTAT_DATA_ULONG);
4148 	kstat_named_init(&ethstat->link_changes, "link_changes",
4149 	    KSTAT_DATA_ULONG);
4150 	ksp->ks_update = myri10ge_nic_stat_kstat_update;
4151 	ksp->ks_private = (void *) mgp;
4152 	kstat_install(ksp);
4153 	return (DDI_SUCCESS);
4154 }
4155 
4156 static int
4157 myri10ge_slice_stat_init(struct myri10ge_slice_state *ss)
4158 {
4159 	struct myri10ge_priv *mgp = ss->mgp;
4160 	struct kstat *ksp;
4161 	struct myri10ge_slice_stat *ethstat;
4162 	int instance;
4163 
4164 	/*
4165 	 * fake an instance so that the same slice numbers from
4166 	 * different instances do not collide
4167 	 */
4168 	instance = (ddi_get_instance(mgp->dip) * 1000) +  (int)(ss - mgp->ss);
4169 	ksp = kstat_create("myri10ge", instance,
4170 	    "myri10ge_slice_stats", "net", KSTAT_TYPE_NAMED,
4171 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4172 	if (ksp == NULL) {
4173 		cmn_err(CE_WARN,
4174 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4175 		return (DDI_FAILURE);
4176 	}
4177 	ss->ksp_stat = ksp;
4178 	ethstat = (struct myri10ge_slice_stat *)(ksp->ks_data);
4179 	kstat_named_init(&ethstat->lro_bad_csum, "lro_bad_csum",
4180 	    KSTAT_DATA_ULONG);
4181 	kstat_named_init(&ethstat->lro_flushed, "lro_flushed",
4182 	    KSTAT_DATA_ULONG);
4183 	kstat_named_init(&ethstat->lro_queued, "lro_queued",
4184 	    KSTAT_DATA_ULONG);
4185 	kstat_named_init(&ethstat->rx_bigbuf_firmware, "rx_bigbuf_firmware",
4186 	    KSTAT_DATA_ULONG);
4187 	kstat_named_init(&ethstat->rx_bigbuf_pool, "rx_bigbuf_pool",
4188 	    KSTAT_DATA_ULONG);
4189 	kstat_named_init(&ethstat->rx_bigbuf_smalls, "rx_bigbuf_smalls",
4190 	    KSTAT_DATA_ULONG);
4191 	kstat_named_init(&ethstat->rx_copy, "rx_copy",
4192 	    KSTAT_DATA_ULONG);
4193 	kstat_named_init(&ethstat->rx_big_nobuf, "rx_big_nobuf",
4194 	    KSTAT_DATA_ULONG);
4195 	kstat_named_init(&ethstat->rx_small_nobuf, "rx_small_nobuf",
4196 	    KSTAT_DATA_ULONG);
4197 	kstat_named_init(&ethstat->xmit_zero_len, "xmit_zero_len",
4198 	    KSTAT_DATA_ULONG);
4199 	kstat_named_init(&ethstat->xmit_pullup, "xmit_pullup",
4200 	    KSTAT_DATA_ULONG);
4201 	kstat_named_init(&ethstat->xmit_pullup_first, "xmit_pullup_first",
4202 	    KSTAT_DATA_ULONG);
4203 	kstat_named_init(&ethstat->xmit_lowbuf, "xmit_lowbuf",
4204 	    KSTAT_DATA_ULONG);
4205 	kstat_named_init(&ethstat->xmit_lsobadflags, "xmit_lsobadflags",
4206 	    KSTAT_DATA_ULONG);
4207 	kstat_named_init(&ethstat->xmit_sched, "xmit_sched",
4208 	    KSTAT_DATA_ULONG);
4209 	kstat_named_init(&ethstat->xmit_stall, "xmit_stall",
4210 	    KSTAT_DATA_ULONG);
4211 	kstat_named_init(&ethstat->xmit_stall_early, "xmit_stall_early",
4212 	    KSTAT_DATA_ULONG);
4213 	kstat_named_init(&ethstat->xmit_stall_late, "xmit_stall_late",
4214 	    KSTAT_DATA_ULONG);
4215 	kstat_named_init(&ethstat->xmit_err, "xmit_err",
4216 	    KSTAT_DATA_ULONG);
4217 	kstat_named_init(&ethstat->tx_req, "tx_req",
4218 	    KSTAT_DATA_ULONG);
4219 	kstat_named_init(&ethstat->tx_activate, "tx_activate",
4220 	    KSTAT_DATA_ULONG);
4221 	kstat_named_init(&ethstat->tx_done, "tx_done",
4222 	    KSTAT_DATA_ULONG);
4223 	kstat_named_init(&ethstat->tx_handles_alloced, "tx_handles_alloced",
4224 	    KSTAT_DATA_ULONG);
4225 	kstat_named_init(&ethstat->rx_big, "rx_big",
4226 	    KSTAT_DATA_ULONG);
4227 	kstat_named_init(&ethstat->rx_small, "rx_small",
4228 	    KSTAT_DATA_ULONG);
4229 	ksp->ks_update = myri10ge_slice_stat_kstat_update;
4230 	ksp->ks_private = (void *) ss;
4231 	kstat_install(ksp);
4232 	return (DDI_SUCCESS);
4233 }
4234 
4235 
4236 
4237 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4238 
4239 #include <vm/hat.h>
4240 #include <sys/ddi_isa.h>
4241 void *device_arena_alloc(size_t size, int vm_flag);
4242 void device_arena_free(void *vaddr, size_t size);
4243 
4244 static void
4245 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4246 {
4247 	dev_info_t *parent_dip;
4248 	ddi_acc_handle_t handle;
4249 	unsigned long bus_number, dev_number, func_number;
4250 	unsigned long cfg_pa, paddr, base, pgoffset;
4251 	char 		*cvaddr, *ptr;
4252 	uint32_t	*ptr32;
4253 	int 		retval = DDI_FAILURE;
4254 	int dontcare;
4255 	uint16_t read_vid, read_did, vendor_id, device_id;
4256 
4257 	if (!myri10ge_nvidia_ecrc_enable)
4258 		return;
4259 
4260 	parent_dip = ddi_get_parent(mgp->dip);
4261 	if (parent_dip == NULL) {
4262 		cmn_err(CE_WARN, "%s: I'm an orphan?", mgp->name);
4263 		return;
4264 	}
4265 
4266 	if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
4267 		cmn_err(CE_WARN,
4268 		    "%s: Could not access my parent's registers", mgp->name);
4269 		return;
4270 	}
4271 
4272 	vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
4273 	device_id = pci_config_get16(handle, PCI_CONF_DEVID);
4274 	pci_config_teardown(&handle);
4275 
4276 	if (myri10ge_verbose) {
4277 		unsigned long 	bus_number, dev_number, func_number;
4278 		int 		reg_set, span;
4279 		(void) myri10ge_reg_set(parent_dip, &reg_set, &span,
4280 		    &bus_number, &dev_number, &func_number);
4281 		if (myri10ge_verbose)
4282 			printf("%s: parent at %ld:%ld:%ld\n", mgp->name,
4283 			    bus_number, dev_number, func_number);
4284 	}
4285 
4286 	if (vendor_id !=  0x10de)
4287 		return;
4288 
4289 	if (device_id != 0x005d /* CK804 */ &&
4290 	    (device_id < 0x374 || device_id > 0x378) /* MCP55 */) {
4291 		return;
4292 	}
4293 	(void) myri10ge_reg_set(parent_dip, &dontcare, &dontcare,
4294 	    &bus_number, &dev_number, &func_number);
4295 
4296 	for (cfg_pa = 0xf0000000UL;
4297 	    retval != DDI_SUCCESS && cfg_pa >= 0xe0000000UL;
4298 	    cfg_pa -= 0x10000000UL) {
4299 		/* find the config space address for the nvidia bridge */
4300 		paddr = (cfg_pa + bus_number * 0x00100000UL +
4301 		    (dev_number * 8 + func_number) * 0x00001000UL);
4302 
4303 		base = paddr & (~MMU_PAGEOFFSET);
4304 		pgoffset = paddr & MMU_PAGEOFFSET;
4305 
4306 		/* map it into the kernel */
4307 		cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
4308 		if (cvaddr == NULL)
4309 			cmn_err(CE_WARN, "%s: failed to map nf4: cvaddr\n",
4310 			    mgp->name);
4311 
4312 		hat_devload(kas.a_hat, cvaddr, mmu_ptob(1),
4313 		    i_ddi_paddr_to_pfn(base),
4314 		    PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
4315 
4316 		ptr = cvaddr + pgoffset;
4317 		read_vid = *(uint16_t *)(void *)(ptr + PCI_CONF_VENID);
4318 		read_did = *(uint16_t *)(void *)(ptr + PCI_CONF_DEVID);
4319 		if (vendor_id ==  read_did || device_id == read_did) {
4320 			ptr32 = (uint32_t *)(void *)(ptr + 0x178);
4321 			if (myri10ge_verbose)
4322 				printf("%s: Enabling ECRC on upstream "
4323 				    "Nvidia bridge (0x%x:0x%x) "
4324 				    "at %ld:%ld:%ld\n", mgp->name,
4325 				    read_vid, read_did, bus_number,
4326 				    dev_number, func_number);
4327 			*ptr32 |= 0x40;
4328 			retval = DDI_SUCCESS;
4329 		}
4330 		hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
4331 		device_arena_free(cvaddr, ptob(1));
4332 	}
4333 }
4334 
4335 #else
/*
 * Variant of myri10ge_enable_nvidia_ecrc() compiled when none of the
 * x86 platform macros are defined: intentionally does nothing.
 */
/*ARGSUSED*/
static void
myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
{
	/* no bridge tweaking on this architecture */
}
4341 #endif /* i386 */
4342 
4343 
4344 /*
4345  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
4346  * when the PCI-E Completion packets are aligned on an 8-byte
4347  * boundary.  Some PCI-E chip sets always align Completion packets; on
4348  * the ones that do not, the alignment can be enforced by enabling
4349  * ECRC generation (if supported).
4350  *
4351  * When PCI-E Completion packets are not aligned, it is actually more
4352  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
4353  *
4354  * If the driver can neither enable ECRC nor verify that it has
4355  * already been enabled, then it must use a firmware image which works
4356  * around unaligned completion packets (ethp_z8e.dat), and it should
4357  * also ensure that it never gives the device a Read-DMA which is
4358  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
4359  * enabled, then the driver should use the aligned (eth_z8e.dat)
4360  * firmware image, and set tx.boundary to 4KB.
4361  */
4362 
4363 
4364 static int
4365 myri10ge_firmware_probe(struct myri10ge_priv *mgp)
4366 {
4367 	int status;
4368 
4369 	mgp->tx_boundary = 4096;
4370 	/*
4371 	 * Verify the max read request size was set to 4KB
4372 	 * before trying the test with 4KB.
4373 	 */
4374 	if (mgp->max_read_request_4k == 0)
4375 		mgp->tx_boundary = 2048;
4376 	/*
4377 	 * load the optimized firmware which assumes aligned PCIe
4378 	 * completions in order to see if it works on this host.
4379 	 */
4380 
4381 	mgp->fw_name = "rss_eth_z8e";
4382 	mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4383 	mgp->eth_z8e_length = rss_eth_z8e_length;
4384 
4385 	status = myri10ge_load_firmware(mgp);
4386 	if (status != 0) {
4387 		return (status);
4388 	}
4389 	/*
4390 	 * Enable ECRC if possible
4391 	 */
4392 	myri10ge_enable_nvidia_ecrc(mgp);
4393 
4394 	/*
4395 	 * Run a DMA test which watches for unaligned completions and
4396 	 * aborts on the first one seen.
4397 	 */
4398 	status = myri10ge_dma_test(mgp, MXGEFW_CMD_UNALIGNED_TEST);
4399 	if (status == 0)
4400 		return (0); /* keep the aligned firmware */
4401 
4402 	if (status != E2BIG)
4403 		cmn_err(CE_WARN, "%s: DMA test failed: %d\n",
4404 		    mgp->name, status);
4405 	if (status == ENOSYS)
4406 		cmn_err(CE_WARN, "%s: Falling back to ethp! "
4407 		    "Please install up to date fw\n", mgp->name);
4408 	return (status);
4409 }
4410 
4411 static int
4412 myri10ge_select_firmware(struct myri10ge_priv *mgp)
4413 {
4414 	int aligned;
4415 
4416 	aligned = 0;
4417 
4418 	if (myri10ge_force_firmware == 1) {
4419 		if (myri10ge_verbose)
4420 			printf("%s: Assuming aligned completions (forced)\n",
4421 			    mgp->name);
4422 		aligned = 1;
4423 		goto done;
4424 	}
4425 
4426 	if (myri10ge_force_firmware == 2) {
4427 		if (myri10ge_verbose)
4428 			printf("%s: Assuming unaligned completions (forced)\n",
4429 			    mgp->name);
4430 		aligned = 0;
4431 		goto done;
4432 	}
4433 
4434 	/* If the width is less than 8, we may used the aligned firmware */
4435 	if (mgp->pcie_link_width != 0 && mgp->pcie_link_width < 8) {
4436 		cmn_err(CE_WARN, "!%s: PCIe link running at x%d\n",
4437 		    mgp->name, mgp->pcie_link_width);
4438 		aligned = 1;
4439 		goto done;
4440 	}
4441 
4442 	if (0 == myri10ge_firmware_probe(mgp))
4443 		return (0);  /* keep optimized firmware */
4444 
4445 done:
4446 	if (aligned) {
4447 		mgp->fw_name = "rss_eth_z8e";
4448 		mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4449 		mgp->eth_z8e_length = rss_eth_z8e_length;
4450 		mgp->tx_boundary = 4096;
4451 	} else {
4452 		mgp->fw_name = "rss_ethp_z8e";
4453 		mgp->eth_z8e = (unsigned char *)rss_ethp_z8e;
4454 		mgp->eth_z8e_length = rss_ethp_z8e_length;
4455 		mgp->tx_boundary = 2048;
4456 	}
4457 
4458 	return (myri10ge_load_firmware(mgp));
4459 }
4460 
4461 static int
4462 myri10ge_add_intrs(struct myri10ge_priv *mgp, int add_handler)
4463 {
4464 	dev_info_t *devinfo = mgp->dip;
4465 	int count, avail, actual, intr_types;
4466 	int x, y, rc, inum = 0;
4467 
4468 
4469 	rc = ddi_intr_get_supported_types(devinfo, &intr_types);
4470 	if (rc != DDI_SUCCESS) {
4471 		cmn_err(CE_WARN,
4472 		    "!%s: ddi_intr_get_nintrs() failure, rc = %d\n", mgp->name,
4473 		    rc);
4474 		return (DDI_FAILURE);
4475 	}
4476 
4477 	if (!myri10ge_use_msi)
4478 		intr_types &= ~DDI_INTR_TYPE_MSI;
4479 	if (!myri10ge_use_msix)
4480 		intr_types &= ~DDI_INTR_TYPE_MSIX;
4481 
4482 	if (intr_types & DDI_INTR_TYPE_MSIX) {
4483 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSIX;
4484 		mgp->intr_type = "MSI-X";
4485 	} else if (intr_types & DDI_INTR_TYPE_MSI) {
4486 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSI;
4487 		mgp->intr_type = "MSI";
4488 	} else {
4489 		mgp->ddi_intr_type = DDI_INTR_TYPE_FIXED;
4490 		mgp->intr_type = "Legacy";
4491 	}
4492 	/* Get number of interrupts */
4493 	rc = ddi_intr_get_nintrs(devinfo, mgp->ddi_intr_type, &count);
4494 	if ((rc != DDI_SUCCESS) || (count == 0)) {
4495 		cmn_err(CE_WARN, "%s: ddi_intr_get_nintrs() failure, rc: %d, "
4496 		    "count: %d", mgp->name, rc, count);
4497 
4498 		return (DDI_FAILURE);
4499 	}
4500 
4501 	/* Get number of available interrupts */
4502 	rc = ddi_intr_get_navail(devinfo, mgp->ddi_intr_type, &avail);
4503 	if ((rc != DDI_SUCCESS) || (avail == 0)) {
4504 		cmn_err(CE_WARN, "%s: ddi_intr_get_navail() failure, "
4505 		    "rc: %d, avail: %d\n", mgp->name, rc, avail);
4506 		return (DDI_FAILURE);
4507 	}
4508 	if (avail < count) {
4509 		cmn_err(CE_NOTE,
4510 		    "!%s: nintrs() returned %d, navail returned %d",
4511 		    mgp->name, count, avail);
4512 		count = avail;
4513 	}
4514 
4515 	if (count < mgp->num_slices)
4516 		return (DDI_FAILURE);
4517 
4518 	if (count > mgp->num_slices)
4519 		count = mgp->num_slices;
4520 
4521 	/* Allocate memory for MSI interrupts */
4522 	mgp->intr_size = count * sizeof (ddi_intr_handle_t);
4523 	mgp->htable = kmem_alloc(mgp->intr_size, KM_SLEEP);
4524 
4525 	rc = ddi_intr_alloc(devinfo, mgp->htable, mgp->ddi_intr_type, inum,
4526 	    count, &actual, DDI_INTR_ALLOC_NORMAL);
4527 
4528 	if ((rc != DDI_SUCCESS) || (actual == 0)) {
4529 		cmn_err(CE_WARN, "%s: ddi_intr_alloc() failed: %d",
4530 		    mgp->name, rc);
4531 
4532 		kmem_free(mgp->htable, mgp->intr_size);
4533 		mgp->htable = NULL;
4534 		return (DDI_FAILURE);
4535 	}
4536 
4537 	if ((actual < count) && myri10ge_verbose) {
4538 		cmn_err(CE_NOTE, "%s: got %d/%d slices",
4539 		    mgp->name, actual, count);
4540 	}
4541 
4542 	mgp->intr_cnt = actual;
4543 
4544 	/*
4545 	 * Get priority for first irq, assume remaining are all the same
4546 	 */
4547 	if (ddi_intr_get_pri(mgp->htable[0], &mgp->intr_pri)
4548 	    != DDI_SUCCESS) {
4549 		cmn_err(CE_WARN, "%s: ddi_intr_get_pri() failed", mgp->name);
4550 
4551 		/* Free already allocated intr */
4552 		for (y = 0; y < actual; y++) {
4553 			(void) ddi_intr_free(mgp->htable[y]);
4554 		}
4555 
4556 		kmem_free(mgp->htable, mgp->intr_size);
4557 		mgp->htable = NULL;
4558 		return (DDI_FAILURE);
4559 	}
4560 
4561 	mgp->icookie = (void *)(uintptr_t)mgp->intr_pri;
4562 
4563 	if (!add_handler)
4564 		return (DDI_SUCCESS);
4565 
4566 	/* Call ddi_intr_add_handler() */
4567 	for (x = 0; x < actual; x++) {
4568 		if (ddi_intr_add_handler(mgp->htable[x], myri10ge_intr,
4569 		    (caddr_t)&mgp->ss[x], NULL) != DDI_SUCCESS) {
4570 			cmn_err(CE_WARN, "%s: ddi_intr_add_handler() failed",
4571 			    mgp->name);
4572 
4573 			/* Free already allocated intr */
4574 			for (y = 0; y < actual; y++) {
4575 				(void) ddi_intr_free(mgp->htable[y]);
4576 			}
4577 
4578 			kmem_free(mgp->htable, mgp->intr_size);
4579 			mgp->htable = NULL;
4580 			return (DDI_FAILURE);
4581 		}
4582 	}
4583 
4584 	(void) ddi_intr_get_cap(mgp->htable[0], &mgp->intr_cap);
4585 	if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4586 		/* Call ddi_intr_block_enable() for MSI */
4587 		(void) ddi_intr_block_enable(mgp->htable, mgp->intr_cnt);
4588 	} else {
4589 		/* Call ddi_intr_enable() for MSI non block enable */
4590 		for (x = 0; x < mgp->intr_cnt; x++) {
4591 			(void) ddi_intr_enable(mgp->htable[x]);
4592 		}
4593 	}
4594 
4595 	return (DDI_SUCCESS);
4596 }
4597 
4598 static void
4599 myri10ge_rem_intrs(struct myri10ge_priv *mgp, int handler_installed)
4600 {
4601 	int x, err;
4602 
4603 	/* Disable all interrupts */
4604 	if (handler_installed) {
4605 		if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4606 			/* Call ddi_intr_block_disable() */
4607 			(void) ddi_intr_block_disable(mgp->htable,
4608 			    mgp->intr_cnt);
4609 		} else {
4610 			for (x = 0; x < mgp->intr_cnt; x++) {
4611 				(void) ddi_intr_disable(mgp->htable[x]);
4612 			}
4613 		}
4614 	}
4615 
4616 	for (x = 0; x < mgp->intr_cnt; x++) {
4617 		if (handler_installed) {
4618 		/* Call ddi_intr_remove_handler() */
4619 			err = ddi_intr_remove_handler(mgp->htable[x]);
4620 			if (err != DDI_SUCCESS) {
4621 				cmn_err(CE_WARN,
4622 				    "%s: ddi_intr_remove_handler for"
4623 				    "vec %d returned %d\n", mgp->name,
4624 				    x, err);
4625 			}
4626 		}
4627 		err = ddi_intr_free(mgp->htable[x]);
4628 		if (err != DDI_SUCCESS) {
4629 			cmn_err(CE_WARN,
4630 			    "%s: ddi_intr_free for vec %d returned %d\n",
4631 			    mgp->name, x, err);
4632 		}
4633 	}
4634 	kmem_free(mgp->htable, mgp->intr_size);
4635 	mgp->htable = NULL;
4636 }
4637 
4638 static void
4639 myri10ge_test_physical(dev_info_t *dip)
4640 {
4641 	ddi_dma_handle_t	handle;
4642 	struct myri10ge_dma_stuff dma;
4643 	void *addr;
4644 	int err;
4645 
4646 	/* test #1, sufficient for older sparc systems */
4647 	myri10ge_tx_dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
4648 	err = ddi_dma_alloc_handle(dip, &myri10ge_tx_dma_attr,
4649 	    DDI_DMA_DONTWAIT, NULL, &handle);
4650 	if (err == DDI_DMA_BADATTR)
4651 		goto fail;
4652 	ddi_dma_free_handle(&handle);
4653 
4654 	/* test #2, required on Olympis where the bind is what fails */
4655 	addr = myri10ge_dma_alloc(dip, 128, &myri10ge_tx_dma_attr,
4656 	    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
4657 	    DDI_DMA_WRITE|DDI_DMA_STREAMING, &dma, 0, DDI_DMA_DONTWAIT);
4658 	if (addr == NULL)
4659 		goto fail;
4660 	myri10ge_dma_free(&dma);
4661 	return;
4662 
4663 fail:
4664 	if (myri10ge_verbose)
4665 		printf("myri10ge%d: DDI_DMA_FORCE_PHYSICAL failed, "
4666 		    "using IOMMU\n", ddi_get_instance(dip));
4667 
4668 	myri10ge_tx_dma_attr.dma_attr_flags &= ~DDI_DMA_FORCE_PHYSICAL;
4669 }
4670 
4671 static void
4672 myri10ge_get_props(dev_info_t *dip)
4673 {
4674 
4675 	myri10ge_flow_control =  ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4676 	    "myri10ge_flow_control", myri10ge_flow_control);
4677 
4678 	myri10ge_intr_coal_delay = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4679 	    "myri10ge_intr_coal_delay", myri10ge_intr_coal_delay);
4680 
4681 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4682 	myri10ge_nvidia_ecrc_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4683 	    "myri10ge_nvidia_ecrc_enable", 1);
4684 #endif
4685 
4686 
4687 	myri10ge_use_msi = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4688 	    "myri10ge_use_msi", myri10ge_use_msi);
4689 
4690 	myri10ge_deassert_wait = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4691 	    "myri10ge_deassert_wait",  myri10ge_deassert_wait);
4692 
4693 	myri10ge_verbose = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4694 	    "myri10ge_verbose", myri10ge_verbose);
4695 
4696 	myri10ge_tx_copylen = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4697 	    "myri10ge_tx_copylen", myri10ge_tx_copylen);
4698 
4699 	if (myri10ge_tx_copylen < 60) {
4700 		cmn_err(CE_WARN,
4701 		    "myri10ge_tx_copylen must be >= 60 bytes\n");
4702 		myri10ge_tx_copylen = 60;
4703 	}
4704 
4705 	myri10ge_mtu_override = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4706 	    "myri10ge_mtu_override", myri10ge_mtu_override);
4707 
4708 	if (myri10ge_mtu_override >= MYRI10GE_MIN_GLD_MTU &&
4709 	    myri10ge_mtu_override <= MYRI10GE_MAX_GLD_MTU)
4710 		myri10ge_mtu = myri10ge_mtu_override +
4711 		    sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ;
4712 	else if (myri10ge_mtu_override != 0) {
4713 		cmn_err(CE_WARN,
4714 		    "myri10ge_mtu_override must be between 1500 and "
4715 		    "9000 bytes\n");
4716 	}
4717 
4718 	myri10ge_bigbufs_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4719 	    "myri10ge_bigbufs_initial", myri10ge_bigbufs_initial);
4720 	myri10ge_bigbufs_max = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4721 	    "myri10ge_bigbufs_max", myri10ge_bigbufs_max);
4722 
4723 	myri10ge_watchdog_reset = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4724 	    "myri10ge_watchdog_reset", myri10ge_watchdog_reset);
4725 
4726 	if (myri10ge_bigbufs_initial < 128) {
4727 		cmn_err(CE_WARN,
4728 		    "myri10ge_bigbufs_initial be at least 128\n");
4729 		myri10ge_bigbufs_initial = 128;
4730 	}
4731 	if (myri10ge_bigbufs_max < 128) {
4732 		cmn_err(CE_WARN,
4733 		    "myri10ge_bigbufs_max be at least 128\n");
4734 		myri10ge_bigbufs_max = 128;
4735 	}
4736 
4737 	if (myri10ge_bigbufs_max < myri10ge_bigbufs_initial) {
4738 		cmn_err(CE_WARN,
4739 		    "myri10ge_bigbufs_max must be >=  "
4740 		    "myri10ge_bigbufs_initial\n");
4741 		myri10ge_bigbufs_max = myri10ge_bigbufs_initial;
4742 	}
4743 
4744 	myri10ge_force_firmware = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4745 	    "myri10ge_force_firmware", myri10ge_force_firmware);
4746 
4747 	myri10ge_max_slices = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4748 	    "myri10ge_max_slices", myri10ge_max_slices);
4749 
4750 	myri10ge_use_msix = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4751 	    "myri10ge_use_msix", myri10ge_use_msix);
4752 
4753 	myri10ge_rss_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4754 	    "myri10ge_rss_hash", myri10ge_rss_hash);
4755 
4756 	if (myri10ge_rss_hash > MXGEFW_RSS_HASH_TYPE_MAX ||
4757 	    myri10ge_rss_hash < MXGEFW_RSS_HASH_TYPE_IPV4) {
4758 		cmn_err(CE_WARN, "myri10ge: Illegal rssh hash type %d\n",
4759 		    myri10ge_rss_hash);
4760 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4761 	}
4762 	myri10ge_lro = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4763 	    "myri10ge_lro", myri10ge_lro);
4764 	myri10ge_lro_cnt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4765 	    "myri10ge_lro_cnt", myri10ge_lro_cnt);
4766 	myri10ge_lro_max_aggr = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4767 	    "myri10ge_lro_max_aggr", myri10ge_lro_max_aggr);
4768 	myri10ge_tx_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4769 	    "myri10ge_tx_hash", myri10ge_tx_hash);
4770 	myri10ge_use_lso = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4771 	    "myri10ge_use_lso", myri10ge_use_lso);
4772 	myri10ge_lso_copy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4773 	    "myri10ge_lso_copy", myri10ge_lso_copy);
4774 	myri10ge_tx_handles_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4775 	    "myri10ge_tx_handles_initial", myri10ge_tx_handles_initial);
4776 	myri10ge_small_bytes = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4777 	    "myri10ge_small_bytes", myri10ge_small_bytes);
4778 	if ((myri10ge_small_bytes + MXGEFW_PAD) & (128 -1)) {
4779 		cmn_err(CE_WARN, "myri10ge: myri10ge_small_bytes (%d)\n",
4780 		    myri10ge_small_bytes);
4781 		cmn_err(CE_WARN, "must be aligned on 128b bndry -2\n");
4782 		myri10ge_small_bytes += 128;
4783 		myri10ge_small_bytes &= ~(128 -1);
4784 		myri10ge_small_bytes -= MXGEFW_PAD;
4785 		cmn_err(CE_WARN, "rounded up to %d\n",
4786 		    myri10ge_small_bytes);
4787 
4788 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4789 	}
4790 }
4791 
4792 #ifndef	PCI_EXP_LNKSTA
4793 #define	PCI_EXP_LNKSTA 18
4794 #endif
4795 
4796 static int
4797 myri10ge_find_cap(ddi_acc_handle_t handle, uint8_t *capptr, uint8_t capid)
4798 {
4799 	uint16_t	status;
4800 	uint8_t 	ptr;
4801 
4802 	/* check to see if we have capabilities */
4803 	status = pci_config_get16(handle, PCI_CONF_STAT);
4804 	if (!(status & PCI_STAT_CAP)) {
4805 		cmn_err(CE_WARN, "PCI_STAT_CAP not found\n");
4806 		return (ENXIO);
4807 	}
4808 
4809 	ptr = pci_config_get8(handle, PCI_CONF_CAP_PTR);
4810 
4811 	/* Walk the capabilities list, looking for a PCI Express cap */
4812 	while (ptr != PCI_CAP_NEXT_PTR_NULL) {
4813 		if (pci_config_get8(handle, ptr + PCI_CAP_ID) == capid)
4814 			break;
4815 		ptr = pci_config_get8(handle, ptr + PCI_CAP_NEXT_PTR);
4816 	}
4817 	if (ptr < 64) {
4818 		cmn_err(CE_WARN, "Bad capability offset %d\n", ptr);
4819 		return (ENXIO);
4820 	}
4821 	*capptr = ptr;
4822 	return (0);
4823 }
4824 
4825 static int
4826 myri10ge_set_max_readreq(ddi_acc_handle_t handle)
4827 {
4828 	int err;
4829 	uint16_t	val;
4830 	uint8_t		ptr;
4831 
4832 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4833 	if (err != 0) {
4834 		cmn_err(CE_WARN, "could not find PCIe cap\n");
4835 		return (ENXIO);
4836 	}
4837 
4838 	/* set max read req to 4096 */
4839 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4840 	val = (val & ~PCIE_DEVCTL_MAX_READ_REQ_MASK) |
4841 	    PCIE_DEVCTL_MAX_READ_REQ_4096;
4842 	pci_config_put16(handle, ptr + PCIE_DEVCTL, val);
4843 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4844 	if ((val & (PCIE_DEVCTL_MAX_READ_REQ_4096)) !=
4845 	    PCIE_DEVCTL_MAX_READ_REQ_4096) {
4846 		cmn_err(CE_WARN, "could not set max read req (%x)\n", val);
4847 		return (EINVAL);
4848 	}
4849 	return (0);
4850 }
4851 
4852 static int
4853 myri10ge_read_pcie_link_width(ddi_acc_handle_t handle, int *link)
4854 {
4855 	int err;
4856 	uint16_t	val;
4857 	uint8_t		ptr;
4858 
4859 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4860 	if (err != 0) {
4861 		cmn_err(CE_WARN, "could not set max read req\n");
4862 		return (ENXIO);
4863 	}
4864 
4865 	/* read link width */
4866 	val = pci_config_get16(handle, ptr + PCIE_LINKSTS);
4867 	val &= PCIE_LINKSTS_NEG_WIDTH_MASK;
4868 	*link = (val >> 4);
4869 	return (0);
4870 }
4871 
4872 static int
4873 myri10ge_reset_nic(struct myri10ge_priv *mgp)
4874 {
4875 	ddi_acc_handle_t handle = mgp->cfg_hdl;
4876 	uint32_t reboot;
4877 	uint16_t cmd;
4878 	int err;
4879 
4880 	cmd = pci_config_get16(handle, PCI_CONF_COMM);
4881 	if ((cmd & PCI_COMM_ME) == 0) {
4882 		/*
4883 		 * Bus master DMA disabled?  Check to see if the card
4884 		 * rebooted due to a parity error For now, just report
4885 		 * it
4886 		 */
4887 
4888 		/* enter read32 mode */
4889 		pci_config_put8(handle, mgp->vso + 0x10, 0x3);
4890 		/* read REBOOT_STATUS (0xfffffff0) */
4891 		pci_config_put32(handle, mgp->vso + 0x18, 0xfffffff0);
4892 		reboot = pci_config_get16(handle, mgp->vso + 0x14);
4893 		cmn_err(CE_WARN, "%s NIC rebooted 0x%x\n", mgp->name, reboot);
4894 		return (0);
4895 	}
4896 	if (!myri10ge_watchdog_reset) {
4897 		cmn_err(CE_WARN, "%s: not resetting\n", mgp->name);
4898 		return (1);
4899 	}
4900 
4901 	myri10ge_stop_locked(mgp);
4902 	err = myri10ge_start_locked(mgp);
4903 	if (err == DDI_FAILURE) {
4904 		return (0);
4905 	}
4906 	mac_tx_update(mgp->mh);
4907 	return (1);
4908 }
4909 
4910 static inline int
4911 myri10ge_ring_stalled(myri10ge_tx_ring_t *tx)
4912 {
4913 	if (tx->sched != tx->stall &&
4914 	    tx->done == tx->watchdog_done &&
4915 	    tx->watchdog_req != tx->watchdog_done)
4916 		return (1);
4917 	return (0);
4918 }
4919 
4920 static void
4921 myri10ge_watchdog(void *arg)
4922 {
4923 	struct myri10ge_priv *mgp;
4924 	struct myri10ge_slice_state *ss;
4925 	myri10ge_tx_ring_t *tx;
4926 	int nic_ok = 1;
4927 	int slices_stalled, rx_pause, i;
4928 	int add_rx;
4929 
4930 	mgp = arg;
4931 	mutex_enter(&mgp->intrlock);
4932 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
4933 		cmn_err(CE_WARN,
4934 		    "%s not running, not rearming watchdog (%d)\n",
4935 		    mgp->name, mgp->running);
4936 		mutex_exit(&mgp->intrlock);
4937 		return;
4938 	}
4939 
4940 	rx_pause = ntohl(mgp->ss[0].fw_stats->dropped_pause);
4941 
4942 	/*
4943 	 * make sure nic is stalled before we reset the nic, so as to
4944 	 * ensure we don't rip the transmit data structures out from
4945 	 * under a pending transmit
4946 	 */
4947 
4948 	for (slices_stalled = 0, i = 0; i < mgp->num_slices; i++) {
4949 		tx = &mgp->ss[i].tx;
4950 		slices_stalled = myri10ge_ring_stalled(tx);
4951 		if (slices_stalled)
4952 			break;
4953 	}
4954 
4955 	if (slices_stalled) {
4956 		if (mgp->watchdog_rx_pause == rx_pause) {
4957 			cmn_err(CE_WARN,
4958 			    "%s slice %d stalled:(%d, %d, %d, %d, %d %d %d\n)",
4959 			    mgp->name, i, tx->sched, tx->stall,
4960 			    tx->done, tx->watchdog_done, tx->req, tx->pkt_done,
4961 			    (int)ntohl(mgp->ss[i].fw_stats->send_done_count));
4962 			nic_ok = myri10ge_reset_nic(mgp);
4963 		} else {
4964 			cmn_err(CE_WARN,
4965 			    "%s Flow controlled, check link partner\n",
4966 			    mgp->name);
4967 		}
4968 	}
4969 
4970 	if (!nic_ok) {
4971 		cmn_err(CE_WARN,
4972 		    "%s Nic dead, not rearming watchdog\n", mgp->name);
4973 		mutex_exit(&mgp->intrlock);
4974 		return;
4975 	}
4976 	for (i = 0; i < mgp->num_slices; i++) {
4977 		ss = &mgp->ss[i];
4978 		tx = &ss->tx;
4979 		tx->watchdog_done = tx->done;
4980 		tx->watchdog_req = tx->req;
4981 		if (ss->watchdog_rx_copy != MYRI10GE_SLICE_STAT(rx_copy)) {
4982 			ss->watchdog_rx_copy = MYRI10GE_SLICE_STAT(rx_copy);
4983 			add_rx =
4984 			    min(ss->jpool.num_alloc,
4985 			    myri10ge_bigbufs_max -
4986 			    (ss->jpool.num_alloc -
4987 			    ss->jbufs_for_smalls));
4988 			if (add_rx != 0) {
4989 				(void) myri10ge_add_jbufs(ss, add_rx, 0);
4990 				/* now feed them to the firmware */
4991 				mutex_enter(&ss->jpool.mtx);
4992 				myri10ge_restock_jumbos(ss);
4993 				mutex_exit(&ss->jpool.mtx);
4994 			}
4995 		}
4996 	}
4997 	mgp->watchdog_rx_pause = rx_pause;
4998 
4999 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
5000 	    mgp->timer_ticks);
5001 	mutex_exit(&mgp->intrlock);
5002 }
5003 
5004 /*ARGSUSED*/
5005 static int
5006 myri10ge_get_coalesce(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5007 
5008 {
5009 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5010 	(void) mi_mpprintf(mp, "%d", mgp->intr_coal_delay);
5011 	return (0);
5012 }
5013 
5014 /*ARGSUSED*/
5015 static int
5016 myri10ge_set_coalesce(queue_t *q, mblk_t *mp, char *value,
5017     caddr_t cp, cred_t *credp)
5018 
5019 {
5020 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5021 	char *end;
5022 	size_t new_value;
5023 
5024 	new_value = mi_strtol(value, &end, 10);
5025 	if (end == value)
5026 		return (EINVAL);
5027 
5028 	mutex_enter(&myri10ge_param_lock);
5029 	mgp->intr_coal_delay = (int)new_value;
5030 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
5031 	mutex_exit(&myri10ge_param_lock);
5032 	return (0);
5033 }
5034 
5035 /*ARGSUSED*/
5036 static int
5037 myri10ge_get_pauseparam(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5038 
5039 {
5040 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5041 	(void) mi_mpprintf(mp, "%d", mgp->pause);
5042 	return (0);
5043 }
5044 
5045 /*ARGSUSED*/
5046 static int
5047 myri10ge_set_pauseparam(queue_t *q, mblk_t *mp, char *value,
5048 			caddr_t cp, cred_t *credp)
5049 
5050 {
5051 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5052 	char *end;
5053 	size_t new_value;
5054 	int err = 0;
5055 
5056 	new_value = mi_strtol(value, &end, 10);
5057 	if (end == value)
5058 		return (EINVAL);
5059 	if (new_value != 0)
5060 		new_value = 1;
5061 
5062 	mutex_enter(&myri10ge_param_lock);
5063 	if (new_value != mgp->pause)
5064 		err = myri10ge_change_pause(mgp, new_value);
5065 	mutex_exit(&myri10ge_param_lock);
5066 	return (err);
5067 }
5068 
5069 /*ARGSUSED*/
5070 static int
5071 myri10ge_get_int(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5072 
5073 {
5074 	(void) mi_mpprintf(mp, "%d", *(int *)(void *)cp);
5075 	return (0);
5076 }
5077 
5078 /*ARGSUSED*/
5079 static int
5080 myri10ge_set_int(queue_t *q, mblk_t *mp, char *value,
5081     caddr_t cp, cred_t *credp)
5082 
5083 {
5084 	char *end;
5085 	size_t new_value;
5086 
5087 	new_value = mi_strtol(value, &end, 10);
5088 	if (end == value)
5089 		return (EINVAL);
5090 	*(int *)(void *)cp = new_value;
5091 
5092 	return (0);
5093 }
5094 
5095 static void
5096 myri10ge_ndd_init(struct myri10ge_priv *mgp)
5097 {
5098 	mgp->nd_head = NULL;
5099 
5100 	(void) nd_load(&mgp->nd_head, "myri10ge_intr_coal_delay",
5101 	    myri10ge_get_coalesce, myri10ge_set_coalesce, (caddr_t)mgp);
5102 	(void) nd_load(&mgp->nd_head, "myri10ge_flow_control",
5103 	    myri10ge_get_pauseparam, myri10ge_set_pauseparam, (caddr_t)mgp);
5104 	(void) nd_load(&mgp->nd_head, "myri10ge_verbose",
5105 	    myri10ge_get_int, myri10ge_set_int, (caddr_t)&myri10ge_verbose);
5106 	(void) nd_load(&mgp->nd_head, "myri10ge_deassert_wait",
5107 	    myri10ge_get_int, myri10ge_set_int,
5108 	    (caddr_t)&myri10ge_deassert_wait);
5109 	(void) nd_load(&mgp->nd_head, "myri10ge_bigbufs_max",
5110 	    myri10ge_get_int, myri10ge_set_int,
5111 	    (caddr_t)&myri10ge_bigbufs_max);
5112 	(void) nd_load(&mgp->nd_head, "myri10ge_lro",
5113 	    myri10ge_get_int, myri10ge_set_int,
5114 	    (caddr_t)&myri10ge_lro);
5115 	(void) nd_load(&mgp->nd_head, "myri10ge_lro_max_aggr",
5116 	    myri10ge_get_int, myri10ge_set_int,
5117 	    (caddr_t)&myri10ge_lro_max_aggr);
5118 	(void) nd_load(&mgp->nd_head, "myri10ge_tx_hash",
5119 	    myri10ge_get_int, myri10ge_set_int,
5120 	    (caddr_t)&myri10ge_tx_hash);
5121 	(void) nd_load(&mgp->nd_head, "myri10ge_lso_copy",
5122 	    myri10ge_get_int, myri10ge_set_int,
5123 	    (caddr_t)&myri10ge_lso_copy);
5124 }
5125 
5126 static void
5127 myri10ge_ndd_fini(struct myri10ge_priv *mgp)
5128 {
5129 	nd_free(&mgp->nd_head);
5130 }
5131 
5132 static void
5133 myri10ge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
5134 {
5135 	struct iocblk *iocp;
5136 	struct myri10ge_priv *mgp = arg;
5137 	int cmd, ok, err;
5138 
5139 	iocp = (struct iocblk *)(void *)mp->b_rptr;
5140 	cmd = iocp->ioc_cmd;
5141 
5142 	ok = 0;
5143 	err = 0;
5144 
5145 	switch (cmd) {
5146 	case ND_GET:
5147 	case ND_SET:
5148 		ok = nd_getset(wq, mgp->nd_head, mp);
5149 		break;
5150 	default:
5151 		break;
5152 	}
5153 	if (!ok)
5154 		err = EINVAL;
5155 	else
5156 		err = iocp->ioc_error;
5157 
5158 	if (!err)
5159 		miocack(wq, mp, iocp->ioc_count, err);
5160 	else
5161 		miocnak(wq, mp, 0, err);
5162 }
5163 
5164 static struct myri10ge_priv *mgp_list;
5165 
5166 struct myri10ge_priv *
5167 myri10ge_get_instance(uint_t unit)
5168 {
5169 	struct myri10ge_priv *mgp;
5170 
5171 	mutex_enter(&myri10ge_param_lock);
5172 	for (mgp = mgp_list; mgp != NULL; mgp = mgp->next) {
5173 		if (unit == ddi_get_instance(mgp->dip)) {
5174 			mgp->refcnt++;
5175 			break;
5176 		}
5177 	}
5178 	mutex_exit(&myri10ge_param_lock);
5179 	return (mgp);
5180 }
5181 
/*
 * Drop one reference on mgp (the counterpart of the increment done in
 * myri10ge_get_instance()).  The counter is modified under
 * myri10ge_param_lock.
 */
void
myri10ge_put_instance(struct myri10ge_priv *mgp)
{
	mutex_enter(&myri10ge_param_lock);
	mgp->refcnt--;
	mutex_exit(&myri10ge_param_lock);
}
5189 
5190 static boolean_t
5191 myri10ge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
5192 {
5193 	struct myri10ge_priv *mgp = arg;
5194 	uint32_t *cap_hcksum;
5195 	mac_capab_lso_t *cap_lso;
5196 	mac_capab_rings_t *cap_rings;
5197 
5198 	switch (cap) {
5199 	case MAC_CAPAB_HCKSUM:
5200 		cap_hcksum = cap_data;
5201 		*cap_hcksum = HCKSUM_INET_PARTIAL;
5202 		break;
5203 	case MAC_CAPAB_RINGS:
5204 		cap_rings = cap_data;
5205 		switch (cap_rings->mr_type) {
5206 		case MAC_RING_TYPE_RX:
5207 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5208 			cap_rings->mr_rnum = mgp->num_slices;
5209 			cap_rings->mr_gnum = 1;
5210 			cap_rings->mr_rget = myri10ge_fill_ring;
5211 			cap_rings->mr_gget = myri10ge_fill_group;
5212 			break;
5213 		case MAC_RING_TYPE_TX:
5214 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5215 			cap_rings->mr_rnum = mgp->num_slices;
5216 			cap_rings->mr_gnum = 0;
5217 			cap_rings->mr_rget = myri10ge_fill_ring;
5218 			cap_rings->mr_gget = NULL;
5219 			break;
5220 		default:
5221 			return (B_FALSE);
5222 		}
5223 		break;
5224 	case MAC_CAPAB_LSO:
5225 		cap_lso = cap_data;
5226 		if (!myri10ge_use_lso)
5227 			return (B_FALSE);
5228 		if (!(mgp->features & MYRI10GE_TSO))
5229 			return (B_FALSE);
5230 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
5231 		cap_lso->lso_basic_tcp_ipv4.lso_max = (uint16_t)-1;
5232 		break;
5233 
5234 	default:
5235 		return (B_FALSE);
5236 	}
5237 	return (B_TRUE);
5238 }
5239 
5240 
5241 static int
5242 myri10ge_m_stat(void *arg, uint_t stat, uint64_t *val)
5243 {
5244 	struct myri10ge_priv *mgp = arg;
5245 	struct myri10ge_rx_ring_stats *rstat;
5246 	struct myri10ge_tx_ring_stats *tstat;
5247 	mcp_irq_data_t *fw_stats = mgp->ss[0].fw_stats;
5248 	struct myri10ge_slice_state *ss;
5249 	uint64_t tmp = 0;
5250 	int i;
5251 
5252 	switch (stat) {
5253 	case MAC_STAT_IFSPEED:
5254 		*val = 10ull * 1000ull * 1000000ull;
5255 		break;
5256 
5257 	case MAC_STAT_MULTIRCV:
5258 		for (i = 0; i < mgp->num_slices; i++) {
5259 			rstat = &mgp->ss[i].rx_stats;
5260 			tmp += rstat->multircv;
5261 		}
5262 		*val = tmp;
5263 		break;
5264 
5265 	case MAC_STAT_BRDCSTRCV:
5266 		for (i = 0; i < mgp->num_slices; i++) {
5267 			rstat = &mgp->ss[i].rx_stats;
5268 			tmp += rstat->brdcstrcv;
5269 		}
5270 		*val = tmp;
5271 		break;
5272 
5273 	case MAC_STAT_MULTIXMT:
5274 		for (i = 0; i < mgp->num_slices; i++) {
5275 			tstat = &mgp->ss[i].tx.stats;
5276 			tmp += tstat->multixmt;
5277 		}
5278 		*val = tmp;
5279 		break;
5280 
5281 	case MAC_STAT_BRDCSTXMT:
5282 		for (i = 0; i < mgp->num_slices; i++) {
5283 			tstat = &mgp->ss[i].tx.stats;
5284 			tmp += tstat->brdcstxmt;
5285 		}
5286 		*val = tmp;
5287 		break;
5288 
5289 	case MAC_STAT_NORCVBUF:
5290 		tmp = ntohl(fw_stats->dropped_no_big_buffer);
5291 		tmp += ntohl(fw_stats->dropped_no_small_buffer);
5292 		tmp += ntohl(fw_stats->dropped_link_overflow);
5293 		for (i = 0; i < mgp->num_slices; i++) {
5294 			ss = &mgp->ss[i];
5295 			tmp += MYRI10GE_SLICE_STAT(rx_big_nobuf);
5296 			tmp += MYRI10GE_SLICE_STAT(rx_small_nobuf);
5297 		}
5298 		*val = tmp;
5299 		break;
5300 
5301 	case MAC_STAT_IERRORS:
5302 		tmp += ntohl(fw_stats->dropped_bad_crc32);
5303 		tmp += ntohl(fw_stats->dropped_bad_phy);
5304 		tmp += ntohl(fw_stats->dropped_runt);
5305 		tmp += ntohl(fw_stats->dropped_overrun);
5306 		*val = tmp;
5307 		break;
5308 
5309 	case MAC_STAT_OERRORS:
5310 		for (i = 0; i < mgp->num_slices; i++) {
5311 			ss = &mgp->ss[i];
5312 			tmp += MYRI10GE_SLICE_STAT(xmit_lsobadflags);
5313 			tmp += MYRI10GE_SLICE_STAT(xmit_err);
5314 		}
5315 		*val = tmp;
5316 		break;
5317 
5318 	case MAC_STAT_RBYTES:
5319 		for (i = 0; i < mgp->num_slices; i++) {
5320 			rstat = &mgp->ss[i].rx_stats;
5321 			tmp += rstat->ibytes;
5322 		}
5323 		*val = tmp;
5324 		break;
5325 
5326 	case MAC_STAT_IPACKETS:
5327 		for (i = 0; i < mgp->num_slices; i++) {
5328 			rstat = &mgp->ss[i].rx_stats;
5329 			tmp += rstat->ipackets;
5330 		}
5331 		*val = tmp;
5332 		break;
5333 
5334 	case MAC_STAT_OBYTES:
5335 		for (i = 0; i < mgp->num_slices; i++) {
5336 			tstat = &mgp->ss[i].tx.stats;
5337 			tmp += tstat->obytes;
5338 		}
5339 		*val = tmp;
5340 		break;
5341 
5342 	case MAC_STAT_OPACKETS:
5343 		for (i = 0; i < mgp->num_slices; i++) {
5344 			tstat = &mgp->ss[i].tx.stats;
5345 			tmp += tstat->opackets;
5346 		}
5347 		*val = tmp;
5348 		break;
5349 
5350 	case ETHER_STAT_TOOLONG_ERRORS:
5351 		*val = ntohl(fw_stats->dropped_overrun);
5352 		break;
5353 
5354 #ifdef SOLARIS_S11
5355 	case ETHER_STAT_TOOSHORT_ERRORS:
5356 		*val = ntohl(fw_stats->dropped_runt);
5357 		break;
5358 #endif
5359 
5360 	case ETHER_STAT_LINK_PAUSE:
5361 		*val = mgp->pause;
5362 		break;
5363 
5364 	case ETHER_STAT_LINK_AUTONEG:
5365 		*val = 1;
5366 		break;
5367 
5368 	case ETHER_STAT_LINK_DUPLEX:
5369 		*val = LINK_DUPLEX_FULL;
5370 		break;
5371 
5372 	default:
5373 		return (ENOTSUP);
5374 	}
5375 
5376 	return (0);
5377 }
5378 
5379 /* ARGSUSED */
5380 static void
5381 myri10ge_m_propinfo(void *arg, const char *pr_name,
5382     mac_prop_id_t pr_num, mac_prop_info_handle_t prh)
5383 {
5384 	switch (pr_num) {
5385 	case MAC_PROP_MTU:
5386 		mac_prop_info_set_default_uint32(prh, MYRI10GE_DEFAULT_GLD_MTU);
5387 		mac_prop_info_set_range_uint32(prh, MYRI10GE_MIN_GLD_MTU,
5388 		    MYRI10GE_MAX_GLD_MTU);
5389 		break;
5390 	default:
5391 		break;
5392 	}
5393 }
5394 
5395 /*ARGSUSED*/
5396 static int
5397 myri10ge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
5398     uint_t pr_valsize, const void *pr_val)
5399 {
5400 	int err = 0;
5401 	struct myri10ge_priv *mgp = arg;
5402 
5403 	switch (pr_num) {
5404 	case MAC_PROP_MTU: {
5405 		uint32_t mtu;
5406 		if (pr_valsize < sizeof (mtu)) {
5407 			err = EINVAL;
5408 			break;
5409 		}
5410 		bcopy(pr_val, &mtu, sizeof (mtu));
5411 		if (mtu > MYRI10GE_MAX_GLD_MTU ||
5412 		    mtu < MYRI10GE_MIN_GLD_MTU) {
5413 			err = EINVAL;
5414 			break;
5415 		}
5416 
5417 		mutex_enter(&mgp->intrlock);
5418 		if (mgp->running != MYRI10GE_ETH_STOPPED) {
5419 			err = EBUSY;
5420 			mutex_exit(&mgp->intrlock);
5421 			break;
5422 		}
5423 
5424 		myri10ge_mtu = mtu + sizeof (struct ether_header) +
5425 		    MXGEFW_PAD + VLAN_TAGSZ;
5426 		mutex_exit(&mgp->intrlock);
5427 		break;
5428 	}
5429 	default:
5430 		err = ENOTSUP;
5431 		break;
5432 	}
5433 
5434 	return (err);
5435 }
5436 
/*
 * Callback vector handed to the MAC layer through macp->m_callbacks in
 * myri10ge_attach().  The first member is the flag word announcing which
 * optional callbacks are implemented (ioctl, getcapab, setprop, propinfo);
 * the remaining positional slots hold the entry points, with NULL for
 * slots this driver does not provide.
 * NOTE(review): slot meaning follows the field order of mac_callbacks_t,
 * which is declared outside this file -- confirm against that header when
 * editing.
 */
static mac_callbacks_t myri10ge_m_callbacks = {
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO),
	myri10ge_m_stat,
	myri10ge_m_start,
	myri10ge_m_stop,
	myri10ge_m_promisc,
	myri10ge_m_multicst,
	NULL,
	NULL,
	NULL,
	myri10ge_m_ioctl,
	myri10ge_m_getcapab,
	NULL,
	NULL,
	myri10ge_m_setprop,
	NULL,
	myri10ge_m_propinfo
};
5455 
5456 
/*
 * Decide how many slices (mgp->num_slices) this instance will use.
 *
 * The NIC is reset first; a failed reset returns ENXIO.  With MSI-X
 * disabled (myri10ge_use_msix == 0) a single slice is used.  Otherwise
 * the firmware is told the interrupt queue size and asked for its
 * maximum number of RSS queues; that figure is clamped to
 * myri10ge_max_slices (which is itself set to ncpus here if it was left
 * at -1), rounded down to a power of two, and then reduced until
 * myri10ge_add_intrs() can actually supply that many vectors.
 *
 * Returns 0 on success (possibly with a single slice) or ENXIO if the
 * reset or the SET_INTRQ_SIZE command fails.
 */
static int
myri10ge_probe_slices(struct myri10ge_priv *mgp)
{
	myri10ge_cmd_t cmd;
	int status;

	mgp->num_slices = 1;

	/* hit the board with a reset to ensure it is alive */
	(void) memset(&cmd, 0, sizeof (cmd));
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
		return (ENXIO);
	}

	if (myri10ge_use_msix == 0)
		return (0);

	/* tell it the size of the interrupt queues */
	cmd.data0 = mgp->max_intr_slots * sizeof (struct mcp_slot);
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_SET_INTRQ_SIZE\n",
		    mgp->name);
		return (ENXIO);
	}

	/* ask the maximum number of slices it supports */
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
	    &cmd);
	/* a failure here is not fatal: carry on with the single slice */
	if (status != 0)
		return (0);

	mgp->num_slices = cmd.data0;

	/*
	 * if the admin did not specify a limit to how many
	 * slices we should use, cap it automatically to the
	 * number of CPUs currently online
	 */
	if (myri10ge_max_slices == -1)
		myri10ge_max_slices = ncpus;

	if (mgp->num_slices > myri10ge_max_slices)
		mgp->num_slices = myri10ge_max_slices;


	/*
	 * Now try to allocate as many MSI-X vectors as we have
	 * slices. We give up on MSI-X if we can only get a single
	 * vector.
	 */
	while (mgp->num_slices > 1) {
		/* make sure it is a power of two */
		while (!ISP2(mgp->num_slices))
			mgp->num_slices--;
		if (mgp->num_slices == 1)
			return (0);

		/* trial allocation only; the vectors are released again */
		status = myri10ge_add_intrs(mgp, 0);
		if (status == 0) {
			myri10ge_rem_intrs(mgp, 0);
			if (mgp->intr_cnt == mgp->num_slices) {
				if (myri10ge_verbose)
					printf("Got %d slices!\n",
					    mgp->num_slices);
				return (0);
			}
			/* got a different count: retry with what we got */
			mgp->num_slices = mgp->intr_cnt;
		} else {
			/* allocation failed outright: retry with half */
			mgp->num_slices = mgp->num_slices / 2;
		}
	}

	if (myri10ge_verbose)
		printf("Got %d slices\n", mgp->num_slices);
	return (0);
}
5536 
5537 static void
5538 myri10ge_lro_free(struct myri10ge_slice_state *ss)
5539 {
5540 	struct lro_entry *lro;
5541 
5542 	while (ss->lro_free != NULL) {
5543 		lro = ss->lro_free;
5544 		ss->lro_free = lro->next;
5545 		kmem_free(lro, sizeof (*lro));
5546 	}
5547 }
5548 
5549 static void
5550 myri10ge_lro_alloc(struct myri10ge_slice_state *ss)
5551 {
5552 	struct lro_entry *lro;
5553 	int idx;
5554 
5555 	ss->lro_free = NULL;
5556 	ss->lro_active = NULL;
5557 
5558 	for (idx = 0; idx < myri10ge_lro_cnt; idx++) {
5559 		lro = kmem_zalloc(sizeof (*lro), KM_SLEEP);
5560 		if (lro == NULL)
5561 			continue;
5562 		lro->next = ss->lro_free;
5563 		ss->lro_free = lro;
5564 	}
5565 }
5566 
5567 static void
5568 myri10ge_free_slices(struct myri10ge_priv *mgp)
5569 {
5570 	struct myri10ge_slice_state *ss;
5571 	size_t bytes;
5572 	int i;
5573 
5574 	if (mgp->ss == NULL)
5575 		return;
5576 
5577 	for (i = 0; i < mgp->num_slices; i++) {
5578 		ss = &mgp->ss[i];
5579 		if (ss->rx_done.entry == NULL)
5580 			continue;
5581 		myri10ge_dma_free(&ss->rx_done.dma);
5582 		ss->rx_done.entry = NULL;
5583 		if (ss->fw_stats == NULL)
5584 			continue;
5585 		myri10ge_dma_free(&ss->fw_stats_dma);
5586 		ss->fw_stats = NULL;
5587 		mutex_destroy(&ss->rx_lock);
5588 		mutex_destroy(&ss->tx.lock);
5589 		mutex_destroy(&ss->tx.handle_lock);
5590 		mutex_destroy(&ss->poll_lock);
5591 		myri10ge_jpool_fini(ss);
5592 		myri10ge_slice_stat_destroy(ss);
5593 		myri10ge_lro_free(ss);
5594 	}
5595 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5596 	kmem_free(mgp->ss, bytes);
5597 	mgp->ss = NULL;
5598 }
5599 
5600 
5601 static int
5602 myri10ge_alloc_slices(struct myri10ge_priv *mgp)
5603 {
5604 	struct myri10ge_slice_state *ss;
5605 	size_t bytes;
5606 	int i;
5607 
5608 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5609 	mgp->ss = kmem_zalloc(bytes, KM_SLEEP);
5610 	if (mgp->ss == NULL)
5611 		return (ENOMEM);
5612 	for (i = 0; i < mgp->num_slices; i++) {
5613 		ss = &mgp->ss[i];
5614 
5615 		ss->mgp = mgp;
5616 
5617 		/* allocate the per-slice firmware stats */
5618 		bytes = sizeof (*ss->fw_stats);
5619 		ss->fw_stats = (mcp_irq_data_t *)(void *)
5620 		    myri10ge_dma_alloc(mgp->dip, bytes,
5621 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5622 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5623 		    &ss->fw_stats_dma, 1, DDI_DMA_DONTWAIT);
5624 		if (ss->fw_stats == NULL)
5625 			goto abort;
5626 		(void) memset(ss->fw_stats, 0, bytes);
5627 
5628 		/* allocate rx done ring */
5629 		bytes = mgp->max_intr_slots *
5630 		    sizeof (*ss->rx_done.entry);
5631 		ss->rx_done.entry = (mcp_slot_t *)(void *)
5632 		    myri10ge_dma_alloc(mgp->dip, bytes,
5633 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5634 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5635 		    &ss->rx_done.dma, 1, DDI_DMA_DONTWAIT);
5636 		if (ss->rx_done.entry == NULL) {
5637 			goto abort;
5638 		}
5639 		(void) memset(ss->rx_done.entry, 0, bytes);
5640 		mutex_init(&ss->rx_lock,   NULL, MUTEX_DEFAULT, mgp->icookie);
5641 		mutex_init(&ss->tx.lock,   NULL, MUTEX_DEFAULT, NULL);
5642 		mutex_init(&ss->tx.handle_lock,   NULL, MUTEX_DEFAULT, NULL);
5643 		mutex_init(&ss->poll_lock,   NULL, MUTEX_DEFAULT, NULL);
5644 		myri10ge_jpool_init(ss);
5645 		(void) myri10ge_slice_stat_init(ss);
5646 		myri10ge_lro_alloc(ss);
5647 	}
5648 
5649 	return (0);
5650 
5651 abort:
5652 	myri10ge_free_slices(mgp);
5653 	return (ENOMEM);
5654 }
5655 
5656 static int
5657 myri10ge_save_msi_state(struct myri10ge_priv *mgp,
5658     ddi_acc_handle_t handle)
5659 {
5660 	uint8_t ptr;
5661 	int err;
5662 
5663 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5664 	if (err != 0) {
5665 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5666 		    mgp->name);
5667 		return (DDI_FAILURE);
5668 	}
5669 	mgp->pci_saved_state.msi_ctrl =
5670 	    pci_config_get16(handle, ptr + PCI_MSI_CTRL);
5671 	mgp->pci_saved_state.msi_addr_low =
5672 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET);
5673 	mgp->pci_saved_state.msi_addr_high =
5674 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4);
5675 	mgp->pci_saved_state.msi_data_32 =
5676 	    pci_config_get16(handle, ptr + PCI_MSI_32BIT_DATA);
5677 	mgp->pci_saved_state.msi_data_64 =
5678 	    pci_config_get16(handle, ptr + PCI_MSI_64BIT_DATA);
5679 	return (DDI_SUCCESS);
5680 }
5681 
5682 static int
5683 myri10ge_restore_msi_state(struct myri10ge_priv *mgp,
5684     ddi_acc_handle_t handle)
5685 {
5686 	uint8_t ptr;
5687 	int err;
5688 
5689 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5690 	if (err != 0) {
5691 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5692 		    mgp->name);
5693 		return (DDI_FAILURE);
5694 	}
5695 
5696 	pci_config_put16(handle, ptr + PCI_MSI_CTRL,
5697 	    mgp->pci_saved_state.msi_ctrl);
5698 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET,
5699 	    mgp->pci_saved_state.msi_addr_low);
5700 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4,
5701 	    mgp->pci_saved_state.msi_addr_high);
5702 	pci_config_put16(handle, ptr + PCI_MSI_32BIT_DATA,
5703 	    mgp->pci_saved_state.msi_data_32);
5704 	pci_config_put16(handle, ptr + PCI_MSI_64BIT_DATA,
5705 	    mgp->pci_saved_state.msi_data_64);
5706 
5707 	return (DDI_SUCCESS);
5708 }
5709 
5710 static int
5711 myri10ge_save_pci_state(struct myri10ge_priv *mgp)
5712 {
5713 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5714 	int i;
5715 	int err = DDI_SUCCESS;
5716 
5717 
5718 	/* Save the non-extended PCI config space 32-bits at a time */
5719 	for (i = 0; i < 16; i++)
5720 		mgp->pci_saved_state.base[i] =
5721 		    pci_config_get32(handle, i*4);
5722 
5723 	/* now save MSI interrupt state *, if needed */
5724 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5725 		err = myri10ge_save_msi_state(mgp, handle);
5726 
5727 	return (err);
5728 }
5729 
5730 static int
5731 myri10ge_restore_pci_state(struct myri10ge_priv *mgp)
5732 {
5733 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5734 	int i;
5735 	int err = DDI_SUCCESS;
5736 
5737 
5738 	/* Restore the non-extended PCI config space 32-bits at a time */
5739 	for (i = 15; i >= 0; i--)
5740 		pci_config_put32(handle, i*4, mgp->pci_saved_state.base[i]);
5741 
5742 	/* now restore MSI interrupt state *, if needed */
5743 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5744 		err = myri10ge_restore_msi_state(mgp, handle);
5745 
5746 	if (mgp->max_read_request_4k)
5747 		(void) myri10ge_set_max_readreq(handle);
5748 	return (err);
5749 }
5750 
5751 
/*
 * DDI_SUSPEND handler (called from myri10ge_detach()).
 *
 * If the interface is running it is stopped and marked
 * MYRI10GE_ETH_SUSPENDED_RUNNING so that myri10ge_resume() knows to
 * restart it.  PCI config state is then saved in every case.
 *
 * Returns DDI_FAILURE if the dip carries no (or a mismatching) soft
 * state, otherwise the result of myri10ge_save_pci_state().
 */
static int
myri10ge_suspend(dev_info_t *dip)
{
	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
	int status;

	if (mgp == NULL) {
		cmn_err(CE_WARN, "null dip in myri10ge_suspend\n");
		return (DDI_FAILURE);
	}
	if (mgp->dip != dip) {
		cmn_err(CE_WARN, "bad dip in myri10ge_suspend\n");
		return (DDI_FAILURE);
	}
	mutex_enter(&mgp->intrlock);
	if (mgp->running == MYRI10GE_ETH_RUNNING) {
		mgp->running = MYRI10GE_ETH_STOPPING;
		/* intrlock is dropped around the untimeout() call */
		mutex_exit(&mgp->intrlock);
		(void) untimeout(mgp->timer_id);
		mutex_enter(&mgp->intrlock);
		myri10ge_stop_locked(mgp);
		/* remember that resume has to bring the interface back up */
		mgp->running = MYRI10GE_ETH_SUSPENDED_RUNNING;
	}
	status = myri10ge_save_pci_state(mgp);
	mutex_exit(&mgp->intrlock);
	return (status);
}
5779 
5780 static int
5781 myri10ge_resume(dev_info_t *dip)
5782 {
5783 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5784 	int status = DDI_SUCCESS;
5785 
5786 	if (mgp == NULL) {
5787 		cmn_err(CE_WARN, "null dip in myri10ge_resume\n");
5788 		return (DDI_FAILURE);
5789 	}
5790 	if (mgp->dip != dip) {
5791 		cmn_err(CE_WARN, "bad dip in myri10ge_resume\n");
5792 		return (DDI_FAILURE);
5793 	}
5794 
5795 	mutex_enter(&mgp->intrlock);
5796 	status = myri10ge_restore_pci_state(mgp);
5797 	if (status == DDI_SUCCESS &&
5798 	    mgp->running == MYRI10GE_ETH_SUSPENDED_RUNNING) {
5799 		status = myri10ge_start_locked(mgp);
5800 	}
5801 	mutex_exit(&mgp->intrlock);
5802 	if (status != DDI_SUCCESS)
5803 		return (status);
5804 
5805 	/* start the watchdog timer */
5806 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
5807 	    mgp->timer_ticks);
5808 	return (DDI_SUCCESS);
5809 }
5810 
5811 static int
5812 myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5813 {
5814 
5815 	struct myri10ge_priv *mgp;
5816 	mac_register_t *macp, *omacp;
5817 	ddi_acc_handle_t handle;
5818 	uint32_t csr, hdr_offset;
5819 	int status, span, link_width, max_read_request_4k;
5820 	unsigned long bus_number, dev_number, func_number;
5821 	size_t bytes;
5822 	offset_t ss_offset;
5823 	uint8_t vso;
5824 
5825 	if (cmd == DDI_RESUME) {
5826 		return (myri10ge_resume(dip));
5827 	}
5828 
5829 	if (cmd != DDI_ATTACH)
5830 		return (DDI_FAILURE);
5831 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS)
5832 		return (DDI_FAILURE);
5833 
5834 	/* enable busmater and io space access */
5835 	csr = pci_config_get32(handle, PCI_CONF_COMM);
5836 	pci_config_put32(handle, PCI_CONF_COMM,
5837 	    (csr |PCI_COMM_ME|PCI_COMM_MAE));
5838 	status = myri10ge_read_pcie_link_width(handle, &link_width);
5839 	if (status != 0) {
5840 		cmn_err(CE_WARN, "could not read link width!\n");
5841 		link_width = 0;
5842 	}
5843 	max_read_request_4k = !myri10ge_set_max_readreq(handle);
5844 	status = myri10ge_find_cap(handle, &vso, PCI_CAP_ID_VS);
5845 	if (status != 0)
5846 		goto abort_with_cfg_hdl;
5847 	if ((omacp = mac_alloc(MAC_VERSION)) == NULL)
5848 		goto abort_with_cfg_hdl;
5849 	/*
5850 	 * XXXX Hack: mac_register_t grows in newer kernels.  To be
5851 	 * able to write newer fields, such as m_margin, without
5852 	 * writing outside allocated memory, we allocate our own macp
5853 	 * and pass that to mac_register()
5854 	 */
5855 	macp = kmem_zalloc(sizeof (*macp) * 8, KM_SLEEP);
5856 	macp->m_version = omacp->m_version;
5857 
5858 	if ((mgp = (struct myri10ge_priv *)
5859 	    kmem_zalloc(sizeof (*mgp), KM_SLEEP)) == NULL) {
5860 		goto abort_with_macinfo;
5861 	}
5862 	ddi_set_driver_private(dip, mgp);
5863 
5864 	/* setup device name for log messages */
5865 	(void) sprintf(mgp->name, "myri10ge%d", ddi_get_instance(dip));
5866 
5867 	mutex_enter(&myri10ge_param_lock);
5868 	myri10ge_get_props(dip);
5869 	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
5870 	mgp->pause = myri10ge_flow_control;
5871 	mutex_exit(&myri10ge_param_lock);
5872 
5873 	mgp->max_read_request_4k = max_read_request_4k;
5874 	mgp->pcie_link_width = link_width;
5875 	mgp->running = MYRI10GE_ETH_STOPPED;
5876 	mgp->vso = vso;
5877 	mgp->dip = dip;
5878 	mgp->cfg_hdl = handle;
5879 
5880 	mgp->timer_ticks = 5 * drv_usectohz(1000000); /* 5 seconds */
5881 	myri10ge_test_physical(dip);
5882 
5883 	/* allocate command page */
5884 	bytes = sizeof (*mgp->cmd);
5885 	mgp->cmd = (mcp_cmd_response_t *)
5886 	    (void *)myri10ge_dma_alloc(dip, bytes,
5887 	    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5888 	    DDI_DMA_CONSISTENT,	DDI_DMA_RDWR|DDI_DMA_CONSISTENT,
5889 	    &mgp->cmd_dma, 1, DDI_DMA_DONTWAIT);
5890 	if (mgp->cmd == NULL)
5891 		goto abort_with_mgp;
5892 
5893 	(void) myri10ge_reg_set(dip, &mgp->reg_set, &span, &bus_number,
5894 	    &dev_number, &func_number);
5895 	if (myri10ge_verbose)
5896 		printf("%s at %ld:%ld:%ld attaching\n", mgp->name,
5897 		    bus_number, dev_number, func_number);
5898 	status = ddi_regs_map_setup(dip, mgp->reg_set, (caddr_t *)&mgp->sram,
5899 	    (offset_t)0, (offset_t)span,  &myri10ge_dev_access_attr,
5900 	    &mgp->io_handle);
5901 	if (status != DDI_SUCCESS) {
5902 		cmn_err(CE_WARN, "%s: couldn't map memory space", mgp->name);
5903 		printf("%s: reg_set = %d, span = %d, status = %d",
5904 		    mgp->name, mgp->reg_set, span, status);
5905 		goto abort_with_mgp;
5906 	}
5907 
5908 	hdr_offset = *(uint32_t *)(void*)(mgp->sram +  MCP_HEADER_PTR_OFFSET);
5909 	hdr_offset = ntohl(hdr_offset) & 0xffffc;
5910 	ss_offset = hdr_offset +
5911 	    offsetof(struct mcp_gen_header, string_specs);
5912 	mgp->sram_size = ntohl(*(uint32_t *)(void*)(mgp->sram + ss_offset));
5913 	myri10ge_pio_copy32(mgp->eeprom_strings,
5914 	    (uint32_t *)(void*)((char *)mgp->sram + mgp->sram_size),
5915 	    MYRI10GE_EEPROM_STRINGS_SIZE);
5916 	(void) memset(mgp->eeprom_strings +
5917 	    MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
5918 
5919 	status = myri10ge_read_mac_addr(mgp);
5920 	if (status) {
5921 		goto abort_with_mapped;
5922 	}
5923 
5924 	status = myri10ge_select_firmware(mgp);
5925 	if (status != 0) {
5926 		cmn_err(CE_WARN, "%s: failed to load firmware\n", mgp->name);
5927 		goto abort_with_mapped;
5928 	}
5929 
5930 	status = myri10ge_probe_slices(mgp);
5931 	if (status != 0) {
5932 		cmn_err(CE_WARN, "%s: failed to probe slices\n", mgp->name);
5933 		goto abort_with_dummy_rdma;
5934 	}
5935 
5936 	status = myri10ge_alloc_slices(mgp);
5937 	if (status != 0) {
5938 		cmn_err(CE_WARN, "%s: failed to alloc slices\n", mgp->name);
5939 		goto abort_with_dummy_rdma;
5940 	}
5941 
5942 	/* add the interrupt handler */
5943 	status = myri10ge_add_intrs(mgp, 1);
5944 	if (status != 0) {
5945 		cmn_err(CE_WARN, "%s: Failed to add interrupt\n",
5946 		    mgp->name);
5947 		goto abort_with_slices;
5948 	}
5949 
5950 	/* now that we have an iblock_cookie, init the mutexes */
5951 	mutex_init(&mgp->cmd_lock, NULL, MUTEX_DRIVER, mgp->icookie);
5952 	mutex_init(&mgp->intrlock, NULL, MUTEX_DRIVER, mgp->icookie);
5953 
5954 
5955 	status = myri10ge_nic_stat_init(mgp);
5956 	if (status != DDI_SUCCESS)
5957 		goto abort_with_interrupts;
5958 	status = myri10ge_info_init(mgp);
5959 	if (status != DDI_SUCCESS)
5960 		goto abort_with_stats;
5961 
5962 	/*
5963 	 *	Initialize  GLD state
5964 	 */
5965 
5966 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
5967 	macp->m_driver = mgp;
5968 	macp->m_dip = dip;
5969 	macp->m_src_addr = mgp->mac_addr;
5970 	macp->m_callbacks = &myri10ge_m_callbacks;
5971 	macp->m_min_sdu = 0;
5972 	macp->m_max_sdu = myri10ge_mtu -
5973 	    (sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ);
5974 #ifdef SOLARIS_S11
5975 	macp->m_margin = VLAN_TAGSZ;
5976 #endif
5977 	macp->m_v12n = MAC_VIRT_LEVEL1;
5978 	status = mac_register(macp, &mgp->mh);
5979 	if (status != 0) {
5980 		cmn_err(CE_WARN, "%s: mac_register failed with %d\n",
5981 		    mgp->name, status);
5982 		goto abort_with_info;
5983 	}
5984 	myri10ge_ndd_init(mgp);
5985 	if (myri10ge_verbose)
5986 		printf("%s: %s, tx bndry %d, fw %s\n", mgp->name,
5987 		    mgp->intr_type, mgp->tx_boundary, mgp->fw_name);
5988 	mutex_enter(&myri10ge_param_lock);
5989 	mgp->next = mgp_list;
5990 	mgp_list = mgp;
5991 	mutex_exit(&myri10ge_param_lock);
5992 	kmem_free(macp, sizeof (*macp) * 8);
5993 	mac_free(omacp);
5994 	return (DDI_SUCCESS);
5995 
5996 abort_with_info:
5997 	myri10ge_info_destroy(mgp);
5998 
5999 abort_with_stats:
6000 	myri10ge_nic_stat_destroy(mgp);
6001 
6002 abort_with_interrupts:
6003 	mutex_destroy(&mgp->cmd_lock);
6004 	mutex_destroy(&mgp->intrlock);
6005 	myri10ge_rem_intrs(mgp, 1);
6006 
6007 abort_with_slices:
6008 	myri10ge_free_slices(mgp);
6009 
6010 abort_with_dummy_rdma:
6011 	myri10ge_dummy_rdma(mgp, 0);
6012 
6013 abort_with_mapped:
6014 	ddi_regs_map_free(&mgp->io_handle);
6015 
6016 	myri10ge_dma_free(&mgp->cmd_dma);
6017 
6018 abort_with_mgp:
6019 	kmem_free(mgp, sizeof (*mgp));
6020 
6021 abort_with_macinfo:
6022 	kmem_free(macp, sizeof (*macp) * 8);
6023 	mac_free(omacp);
6024 
6025 abort_with_cfg_hdl:
6026 	pci_config_teardown(&handle);
6027 	return (DDI_FAILURE);
6028 
6029 }
6030 
6031 
6032 static int
6033 myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
6034 {
6035 	struct myri10ge_priv	*mgp, *tmp;
6036 	int 			status, i, jbufs_alloced;
6037 
6038 	if (cmd == DDI_SUSPEND) {
6039 		status = myri10ge_suspend(dip);
6040 		return (status);
6041 	}
6042 
6043 	if (cmd != DDI_DETACH) {
6044 		return (DDI_FAILURE);
6045 	}
6046 	/* Get the driver private (gld_mac_info_t) structure */
6047 	mgp = ddi_get_driver_private(dip);
6048 
6049 	mutex_enter(&mgp->intrlock);
6050 	jbufs_alloced = 0;
6051 	for (i = 0; i < mgp->num_slices; i++) {
6052 		myri10ge_remove_jbufs(&mgp->ss[i]);
6053 		jbufs_alloced += mgp->ss[i].jpool.num_alloc;
6054 	}
6055 	mutex_exit(&mgp->intrlock);
6056 	if (jbufs_alloced != 0) {
6057 		cmn_err(CE_NOTE, "%s: %d loaned rx buffers remain\n",
6058 		    mgp->name, jbufs_alloced);
6059 		return (DDI_FAILURE);
6060 	}
6061 
6062 	mutex_enter(&myri10ge_param_lock);
6063 	if (mgp->refcnt != 0) {
6064 		mutex_exit(&myri10ge_param_lock);
6065 		cmn_err(CE_NOTE, "%s: %d external refs remain\n",
6066 		    mgp->name, mgp->refcnt);
6067 		return (DDI_FAILURE);
6068 	}
6069 	mutex_exit(&myri10ge_param_lock);
6070 
6071 	status = mac_unregister(mgp->mh);
6072 	if (status != DDI_SUCCESS)
6073 		return (status);
6074 
6075 	myri10ge_ndd_fini(mgp);
6076 	myri10ge_dummy_rdma(mgp, 0);
6077 	myri10ge_nic_stat_destroy(mgp);
6078 	myri10ge_info_destroy(mgp);
6079 
6080 	mutex_destroy(&mgp->cmd_lock);
6081 	mutex_destroy(&mgp->intrlock);
6082 
6083 	myri10ge_rem_intrs(mgp, 1);
6084 
6085 	myri10ge_free_slices(mgp);
6086 	ddi_regs_map_free(&mgp->io_handle);
6087 	myri10ge_dma_free(&mgp->cmd_dma);
6088 	pci_config_teardown(&mgp->cfg_hdl);
6089 
6090 	mutex_enter(&myri10ge_param_lock);
6091 	if (mgp_list == mgp) {
6092 		mgp_list = mgp->next;
6093 	} else {
6094 		tmp = mgp_list;
6095 		while (tmp->next != mgp && tmp->next != NULL)
6096 			tmp = tmp->next;
6097 		if (tmp->next != NULL)
6098 			tmp->next = tmp->next->next;
6099 	}
6100 	kmem_free(mgp, sizeof (*mgp));
6101 	mutex_exit(&myri10ge_param_lock);
6102 	return (DDI_SUCCESS);
6103 }
6104 
/*
 * Helper for quiesce entry point: Interrupt threads are not being
 * scheduled, so we must poll for the confirmation DMA to arrive in
 * the firmware stats block for slice 0.  We're essentially running
 * the guts of the interrupt handler, and just cherry picking the
 * confirmation that the NIC is quiesced (stats->link_down).
 *
 * Returns 1 if this poll observed an updated stats block reporting
 * link_down, 0 otherwise (including when no interrupt was pending).
 */

static int
myri10ge_poll_down(struct myri10ge_priv *mgp)
{
	struct myri10ge_slice_state *ss = mgp->ss;
	mcp_irq_data_t *stats = ss->fw_stats;
	int valid;
	int found_down = 0;


	/* check for a pending IRQ */

	if (! *((volatile uint8_t *)& stats->valid))
		return (0);
	/* remember the flags before the deassert handshake clears them */
	valid = stats->valid;

	/*
	 * Make sure to tell the NIC to lower a legacy IRQ, else
	 * it may have corrupt state after restarting
	 */

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		mb();
		/* wait for irq conf DMA; this spin has no timeout */
		while (*((volatile uint8_t *)& stats->valid))
			;
	}
	if (stats->stats_updated && stats->link_down)
		found_down = 1;

	/* claim the interrupt: first word only if bit 0 of valid was set */
	if (valid & 0x1)
		*ss->irq_claim = BE_32(3);
	*(ss->irq_claim + 1) = BE_32(3);

	return (found_down);
}
6150 
6151 static int
6152 myri10ge_quiesce(dev_info_t *dip)
6153 {
6154 	struct myri10ge_priv *mgp;
6155 	myri10ge_cmd_t cmd;
6156 	int status, down, i;
6157 
6158 	mgp = ddi_get_driver_private(dip);
6159 	if (mgp == NULL)
6160 		return (DDI_FAILURE);
6161 
6162 	/* if devices was unplumbed, it is guaranteed to be quiescent */
6163 	if (mgp->running == MYRI10GE_ETH_STOPPED)
6164 		return (DDI_SUCCESS);
6165 
6166 	/* send a down CMD to queuesce NIC */
6167 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
6168 	if (status) {
6169 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
6170 		return (DDI_FAILURE);
6171 	}
6172 
6173 	for (i = 0; i < 20; i++) {
6174 		down = myri10ge_poll_down(mgp);
6175 		if (down)
6176 			break;
6177 		delay(drv_usectohz(100000));
6178 		mb();
6179 	}
6180 	if (down)
6181 		return (DDI_SUCCESS);
6182 	return (DDI_FAILURE);
6183 }
6184 
6185 /*
6186  * Distinguish between allocb'ed blocks, and gesballoc'ed attached
6187  * storage.
6188  */
6189 static void
6190 myri10ge_find_lastfree(void)
6191 {
6192 	mblk_t *mp = allocb(1024, 0);
6193 	dblk_t *dbp;
6194 
6195 	if (mp == NULL) {
6196 		cmn_err(CE_WARN, "myri10ge_find_lastfree failed\n");
6197 		return;
6198 	}
6199 	dbp = mp->b_datap;
6200 	myri10ge_db_lastfree = (void *)dbp->db_lastfree;
6201 }
6202 
6203 int
6204 _init(void)
6205 {
6206 	int i;
6207 
6208 	if (myri10ge_verbose)
6209 		cmn_err(CE_NOTE,
6210 		    "Myricom 10G driver (10GbE) version %s loading\n",
6211 		    MYRI10GE_VERSION_STR);
6212 	myri10ge_find_lastfree();
6213 	mac_init_ops(&myri10ge_ops, "myri10ge");
6214 	mutex_init(&myri10ge_param_lock, NULL, MUTEX_DEFAULT, NULL);
6215 	if ((i = mod_install(&modlinkage)) != 0) {
6216 		cmn_err(CE_WARN, "mod_install returned %d\n", i);
6217 		mac_fini_ops(&myri10ge_ops);
6218 		mutex_destroy(&myri10ge_param_lock);
6219 	}
6220 	return (i);
6221 }
6222 
6223 int
6224 _fini(void)
6225 {
6226 	int i;
6227 	i = mod_remove(&modlinkage);
6228 	if (i != 0) {
6229 		return (i);
6230 	}
6231 	mac_fini_ops(&myri10ge_ops);
6232 	mutex_destroy(&myri10ge_param_lock);
6233 	return (0);
6234 }
6235 
/*
 * Module information entry point: passes the request straight to
 * mod_info() for this module's linkage.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
6241 
6242 
6243 /*
6244  *  This file uses MyriGE driver indentation.
6245  *
6246  * Local Variables:
6247  * c-file-style:"sun"
6248  * tab-width:8
6249  * End:
6250  */
6251