xref: /illumos-gate/usr/src/uts/common/io/myri10ge/drv/myri10ge.c (revision 6e6c7d67bf5ba2efa13619acd59395d0f278ee75)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
32 /*
33  * Copyright (c) 2014, Joyent, Inc.
34  * Copyright (c) 2016 by Delphix. All rights reserved.
35  */
36 
37 #define	MXGEFW_NDIS
38 #include "myri10ge_var.h"
39 #include "rss_eth_z8e.h"
40 #include "rss_ethp_z8e.h"
41 #include "mcp_gen_header.h"
42 
/*
 * MTU bounds.  MYRI10GE_MAX_ETHER_MTU seeds the default myri10ge_mtu when
 * MYRICOM_PRIV is defined; the two GLD values are the candidates for
 * MYRI10GE_DEFAULT_GLD_MTU.
 */
43 #define	MYRI10GE_MAX_ETHER_MTU 9014
44 #define	MYRI10GE_MAX_GLD_MTU	9000
45 #define	MYRI10GE_MIN_GLD_MTU	1500
46 
/*
 * NOTE(review): these look like the interface run states; nothing in this
 * part of the file consumes them, so confirm against the start/stop code.
 */
47 #define	MYRI10GE_ETH_STOPPED 0
48 #define	MYRI10GE_ETH_STOPPING 1
49 #define	MYRI10GE_ETH_STARTING 2
50 #define	MYRI10GE_ETH_RUNNING 3
51 #define	MYRI10GE_ETH_OPEN_FAILED 4
52 #define	MYRI10GE_ETH_SUSPENDED_RUNNING 5
53 
/*
 * File-scope settings with their compiled-in defaults.  Only the ones
 * annotated below are consumed in this part of the file; the meaning of
 * the others should be taken from the code that reads them.
 */
/* size of one small receive buffer before MXGEFW_PAD is added */
54 static int myri10ge_small_bytes = 510;
55 static int myri10ge_intr_coal_delay = 125;
56 static int myri10ge_flow_control = 1;
57 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
58 static int myri10ge_nvidia_ecrc_enable = 1;
59 #endif
60 static int myri10ge_mtu_override = 0;
61 static int myri10ge_tx_copylen = 512;
62 static int myri10ge_deassert_wait = 1;
/* non-zero enables extra printf output, e.g. the MAC string being parsed */
63 static int myri10ge_verbose = 0;
64 static int myri10ge_watchdog_reset = 0;
65 static int myri10ge_use_msix = 1;
66 static int myri10ge_max_slices = -1;
67 static int myri10ge_use_msi = 1;
68 int myri10ge_force_firmware = 0;
69 static boolean_t myri10ge_use_lso = B_TRUE;
70 static int myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
71 static int myri10ge_tx_hash = 1;
72 static int myri10ge_lro = 0;
73 static int myri10ge_lro_cnt = 8;
74 int myri10ge_lro_max_aggr = 2;
75 static int myri10ge_lso_copy = 0;
/* forward declaration; the definition is not in this part of the file */
76 static mblk_t *myri10ge_send_wrapper(void *arg, mblk_t *mp);
/* tx DMA handles requested up front by myri10ge_prepare_tx_ring() */
77 int myri10ge_tx_handles_initial = 128;
78 
/* NOTE(review): neither of the two objects below is used in this part of the file */
79 static 	kmutex_t myri10ge_param_lock;
80 static void* myri10ge_db_lastfree;
81 
/* Driver entry points referenced by the dev_ops definition below. */
82 static int myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
83 static int myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
84 static int myri10ge_quiesce(dev_info_t *dip);
85 
/*
 * Defines myri10ge_ops, wiring in the attach, detach and quiesce routines
 * declared above; the remaining slots are nulldev/nodev/NULL.
 */
86 DDI_DEFINE_STREAM_OPS(myri10ge_ops, nulldev, nulldev, myri10ge_attach,
87     myri10ge_detach, nodev, NULL, D_MP, NULL, myri10ge_quiesce);
88 
89 
/* Loadable-driver description: module ops, description string, dev_ops. */
90 static struct modldrv modldrv = {
91 	&mod_driverops,
92 	"Myricom 10G driver (10GbE)",
93 	&myri10ge_ops,
94 };
95 
96 
/* Module linkage listing the single modldrv above, NULL terminated. */
97 static struct modlinkage modlinkage = {
98 	MODREV_1,
99 	{&modldrv, NULL},
100 };
101 
/* Six 0xff bytes: the all-ones (broadcast) hardware address. */
102 unsigned char myri10ge_broadcastaddr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
103 
/*
 * DMA attributes requesting 4096-byte alignment and a single
 * scatter/gather element.
 * NOTE(review): no user of this attribute set appears in this part of
 * the file.
 */
104 static ddi_dma_attr_t myri10ge_misc_dma_attr = {
105 	DMA_ATTR_V0,			/* version number. */
106 	(uint64_t)0, 			/* low address */
107 	(uint64_t)0xffffffffffffffffULL, /* high address */
108 	(uint64_t)0x7ffffff,		/* address counter max */
109 	(uint64_t)4096,			/* alignment */
110 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
111 	(uint32_t)0x1,			/* minimum transfer size */
112 	(uint64_t)0x7fffffff,		/* maximum transfer size */
113 	(uint64_t)0x7fffffff,		/* maximum segment size */
114 	1,				/* scatter/gather list length */
115 	1,				/* granularity */
116 	0				/* attribute flags */
117 };
118 
119 /*
120  * The Myri10GE NIC has the following constraints on receive buffers:
121  * 1) Buffers which cross a 4KB boundary must be aligned to 4KB
122  * 2) Buffers which are not aligned to 4KB must not cross a 4KB boundary
123  */
124 
/*
 * Attributes for 4096-byte aligned, single-cookie buffers.  Selected by
 * myri10ge_add_jbuf() when myri10ge_mtu is 2048 or more, and also used by
 * myri10ge_prepare_tx_ring() for the 4096-byte tx copyblocks.
 */
125 static ddi_dma_attr_t myri10ge_rx_jumbo_dma_attr = {
126 	DMA_ATTR_V0,			/* version number. */
127 	(uint64_t)0, 			/* low address */
128 	(uint64_t)0xffffffffffffffffULL, /* high address */
129 	(uint64_t)0x7ffffff,		/* address counter max */
130 	(uint64_t)4096,			/* alignment */
131 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
132 	(uint32_t)0x1,			/* minimum transfer size */
133 	(uint64_t)0x7fffffff,		/* maximum transfer size */
134 	UINT64_MAX,			/* maximum segment size */
135 	1,				/* scatter/gather list length */
136 	1,				/* granularity */
137 	0				/* attribute flags */
138 };
139 
/*
 * Attributes selected by myri10ge_add_jbuf() when myri10ge_mtu is below
 * 2048.  On non-sparc builds the alignment starts at 0x80 with a 0xfff
 * segment limit; myri10ge_add_jbuf() rewrites dma_attr_align to 4096 and
 * dma_attr_seg to UINT64_MAX at run time if a buffer crosses a 4KB
 * boundary or an allocation fails, so this object must stay writable.
 */
140 static ddi_dma_attr_t myri10ge_rx_std_dma_attr = {
141 	DMA_ATTR_V0,			/* version number. */
142 	(uint64_t)0, 			/* low address */
143 	(uint64_t)0xffffffffffffffffULL, /* high address */
144 	(uint64_t)0x7ffffff,		/* address counter max */
145 #if defined sparc64 || defined __sparcv9
146 	(uint64_t)4096,			/* alignment */
147 #else
148 	(uint64_t)0x80,			/* alignment */
149 #endif
150 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
151 	(uint32_t)0x1,			/* minimum transfer size */
152 	(uint64_t)0x7fffffff,		/* maximum transfer size */
153 #if defined sparc64 || defined __sparcv9
154 	UINT64_MAX,			/* maximum segment size */
155 #else
156 	(uint64_t)0xfff,		/* maximum segment size */
157 #endif
158 	1,				/* scatter/gather list length */
159 	1,				/* granularity */
160 	0				/* attribute flags */
161 };
162 
/*
 * Attributes for the transmit DMA handles created by
 * myri10ge_add_tx_handle(): byte alignment and a scatter/gather list of
 * up to INT32_MAX elements.
 */
163 static ddi_dma_attr_t myri10ge_tx_dma_attr = {
164 	DMA_ATTR_V0,			/* version number. */
165 	(uint64_t)0, 			/* low address */
166 	(uint64_t)0xffffffffffffffffULL, /* high address */
167 	(uint64_t)0x7ffffff,		/* address counter max */
168 	(uint64_t)1,			/* alignment */
169 	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
170 	(uint32_t)0x1,			/* minimum transfer size */
171 	(uint64_t)0x7fffffff,		/* maximum transfer size */
172 	UINT64_MAX,			/* maximum segment size */
173 	INT32_MAX,			/* scatter/gather list length */
174 	1,				/* granularity */
175 	0			/* attribute flags */
176 };
177 
/*
 * WC selects the data-ordering attribute below: 0 on sparc64/__sparcv9
 * builds, 1 everywhere else.
 * NOTE(review): the name presumably stands for write combining - confirm.
 */
178 #if defined sparc64 || defined __sparcv9
179 #define	WC 0
180 #else
181 #define	WC 1
182 #endif
183 
/*
 * Access attributes passed to ddi_dma_mem_alloc() by myri10ge_add_jbuf()
 * and myri10ge_prepare_tx_ring(): no byte swapping, and merging allowed
 * only when WC is non-zero.
 */
184 struct ddi_device_acc_attr myri10ge_dev_access_attr = {
185 	DDI_DEVICE_ATTR_V0,		/* version */
186 	DDI_NEVERSWAP_ACC,		/* endian flags */
187 #if WC
188 	DDI_MERGING_OK_ACC		/* data order */
189 #else
190 	DDI_STRICTORDER_ACC
191 #endif
192 };
193 
/* forward declaration; the definition is not in this part of the file */
194 static void myri10ge_watchdog(void *arg);
195 
/*
 * myri10ge_mtu is the byte size of every receive buffer allocated by
 * myri10ge_add_jbuf(); it also decides which rx DMA attribute set is used
 * (below 2048 selects the "std" set).  The default depends on
 * MYRICOM_PRIV, as does MYRI10GE_DEFAULT_GLD_MTU.
 */
196 #ifdef MYRICOM_PRIV
197 int myri10ge_mtu = MYRI10GE_MAX_ETHER_MTU + MXGEFW_PAD + VLAN_TAGSZ;
198 #define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MAX_GLD_MTU
199 #else
200 int myri10ge_mtu = ETHERMAX + MXGEFW_PAD + VLAN_TAGSZ;
201 #define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MIN_GLD_MTU
202 #endif
/* NOTE(review): presumably initial/maximum big-buffer counts; not used in this part of the file */
203 int myri10ge_bigbufs_initial = 1024;
204 int myri10ge_bigbufs_max = 4096;
205 
206 
207 caddr_t
208 myri10ge_dma_alloc(dev_info_t *dip, size_t len,
209     ddi_dma_attr_t *attr, ddi_device_acc_attr_t  *accattr,
210     uint_t alloc_flags, int bind_flags, struct myri10ge_dma_stuff *dma,
211     int warn, int (*wait)(caddr_t))
212 {
213 	caddr_t  kaddr;
214 	size_t real_length;
215 	ddi_dma_cookie_t cookie;
216 	uint_t count;
217 	int err;
218 
219 	err = ddi_dma_alloc_handle(dip, attr, wait,
220 	    NULL, &dma->handle);
221 	if (err != DDI_SUCCESS) {
222 		if (warn)
223 			cmn_err(CE_WARN,
224 			    "myri10ge: ddi_dma_alloc_handle failed\n");
225 		goto abort_with_nothing;
226 	}
227 
228 	err = ddi_dma_mem_alloc(dma->handle, len, accattr, alloc_flags,
229 	    wait, NULL, &kaddr, &real_length,
230 	    &dma->acc_handle);
231 	if (err != DDI_SUCCESS) {
232 		if (warn)
233 			cmn_err(CE_WARN,
234 			    "myri10ge: ddi_dma_mem_alloc failed\n");
235 		goto abort_with_handle;
236 	}
237 
238 	err = ddi_dma_addr_bind_handle(dma->handle, NULL, kaddr, len,
239 	    bind_flags, wait, NULL, &cookie, &count);
240 
241 	if (err != DDI_SUCCESS) {
242 		if (warn)
243 			cmn_err(CE_WARN,
244 			    "myri10ge: ddi_dma_addr_bind_handle failed\n");
245 		goto abort_with_mem;
246 	}
247 
248 	if (count != 1) {
249 		if (warn)
250 			cmn_err(CE_WARN,
251 			    "myri10ge: got too many dma segments ");
252 		goto abort_with_bind;
253 	}
254 	dma->low = htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
255 	dma->high = htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
256 	return (kaddr);
257 
258 abort_with_bind:
259 	(void) ddi_dma_unbind_handle(dma->handle);
260 
261 abort_with_mem:
262 	ddi_dma_mem_free(&dma->acc_handle);
263 
264 abort_with_handle:
265 	ddi_dma_free_handle(&dma->handle);
266 abort_with_nothing:
267 	if (warn) {
268 		cmn_err(CE_WARN, "myri10ge: myri10ge_dma_alloc failed.\n  ");
269 		cmn_err(CE_WARN, "args: dip=%p len=0x%lx ddi_dma_attr=%p\n",
270 		    (void*) dip, len, (void*) attr);
271 		cmn_err(CE_WARN,
272 		    "args: ddi_device_acc_attr=%p  alloc_flags=0x%x\n",
273 		    (void*) accattr, alloc_flags);
274 		cmn_err(CE_WARN, "args: bind_flags=0x%x  dmastuff=%p",
275 		    bind_flags, (void*) dma);
276 	}
277 	return (NULL);
278 
279 }
280 
281 void
282 myri10ge_dma_free(struct myri10ge_dma_stuff *dma)
283 {
284 	(void) ddi_dma_unbind_handle(dma->handle);
285 	ddi_dma_mem_free(&dma->acc_handle);
286 	ddi_dma_free_handle(&dma->handle);
287 }
288 
/*
 * Copy size / 4 32-bit words from "from32" to "to", lowest address first.
 * Stores go through a volatile pointer so each word is written
 * individually; any remainder of size modulo 4 is not copied.
 */
static inline void
myri10ge_pio_copy32(void *to, uint32_t *from32, size_t size)
{
	volatile uint32_t *dst = (volatile uint32_t *)to;
	size_t nwords = size / 4;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from32[w];
}
302 
#if defined(_LP64)
/*
 * 64-bit counterpart of myri10ge_pio_copy32(): copy size / 8 64-bit words
 * from "from64" to "to", lowest address first, through a volatile
 * destination pointer.  Only built for LP64.
 */
static inline void
myri10ge_pio_copy64(void *to, uint64_t *from64, size_t size)
{
	volatile uint64_t *dst = (volatile uint64_t *)to;
	size_t nwords = size / 8;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from64[w];
}
#endif
318 
/*
 * Copy "size" bytes from host memory to the NIC using the widest word
 * copy available: 64-bit words on LP64 builds, 32-bit words otherwise.
 * "size" must be a multiple of that word size (asserted), and both
 * pointers must be naturally aligned for it.
 */
static inline void
myri10ge_pio_copy(void *to, void *from, size_t size)
{
#if defined(_LP64)
	ASSERT((size % 8) == 0);
	myri10ge_pio_copy64(to, (uint64_t *)from, size);
#else
	ASSERT((size % 4) == 0);
	myri10ge_pio_copy32(to, (uint32_t *)from, size);
#endif
}
336 
337 
338 /*
339  * Due to various bugs in Solaris (especially bug 6186772 where the
340  * TCP/UDP checksum is calculated incorrectly on mblk chains with more
341  * than two elements), and the design bug where hardware checksums are
342  * ignored on mblk chains with more than 2 elements, we need to
343  * allocate private pool of physically contiguous receive buffers.
344  */
345 
346 static void
347 myri10ge_jpool_init(struct myri10ge_slice_state *ss)
348 {
349 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
350 
351 	bzero(jpool, sizeof (*jpool));
352 	mutex_init(&jpool->mtx, NULL, MUTEX_DRIVER,
353 	    ss->mgp->icookie);
354 	jpool->head = NULL;
355 }
356 
357 static void
358 myri10ge_jpool_fini(struct myri10ge_slice_state *ss)
359 {
360 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
361 
362 	if (jpool->head != NULL) {
363 		cmn_err(CE_WARN,
364 		    "%s: BUG! myri10ge_jpool_fini called on non-empty pool\n",
365 		    ss->mgp->name);
366 	}
367 	mutex_destroy(&jpool->mtx);
368 }
369 
370 
371 /*
372  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
373  * at most 32 bytes at a time, so as to avoid involving the software
374  * pio handler in the nic.   We re-write the first segment's low
375  * DMA address to mark it valid only after we write the entire chunk
376  * in a burst
377  */
378 static inline void
379 myri10ge_submit_8rx(mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src)
380 {
381 	src->addr_low |= BE_32(1);
382 	myri10ge_pio_copy(dst, src, 4 * sizeof (*src));
383 	mb();
384 	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
385 	mb();
386 	src->addr_low &= ~(BE_32(1));
387 	dst->addr_low = src->addr_low;
388 	mb();
389 }
390 
391 static void
392 myri10ge_pull_jpool(struct myri10ge_slice_state *ss)
393 {
394 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
395 	struct myri10ge_jpool_entry *jtail, *j, *jfree;
396 	volatile uintptr_t *putp;
397 	uintptr_t put;
398 	int i;
399 
400 	/* find tail */
401 	jtail = NULL;
402 	if (jpool->head != NULL) {
403 		j = jpool->head;
404 		while (j->next != NULL)
405 			j = j->next;
406 		jtail = j;
407 	}
408 
409 	/*
410 	 * iterate over all per-CPU caches, and add contents into
411 	 * jpool
412 	 */
413 	for (i = 0; i < MYRI10GE_MAX_CPUS; i++) {
414 		/* take per-CPU free list */
415 		putp = (void *)&jpool->cpu[i & MYRI10GE_MAX_CPU_MASK].head;
416 		if (*putp == NULL)
417 			continue;
418 		put = atomic_swap_ulong(putp, 0);
419 		jfree = (struct myri10ge_jpool_entry *)put;
420 
421 		/* append to pool */
422 		if (jtail == NULL) {
423 			jpool->head = jfree;
424 		} else {
425 			jtail->next = jfree;
426 		}
427 		j = jfree;
428 		while (j->next != NULL)
429 			j = j->next;
430 		jtail = j;
431 	}
432 }
433 
434 /*
435  * Transfers buffers from the free pool to the nic
436  * Must be called holding the jpool mutex.
437  */
438 
439 static inline void
440 myri10ge_restock_jumbos(struct myri10ge_slice_state *ss)
441 {
442 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
443 	struct myri10ge_jpool_entry *j;
444 	myri10ge_rx_ring_t *rx;
445 	int i, idx, limit;
446 
447 	rx = &ss->rx_big;
448 	limit = ss->j_rx_cnt + (rx->mask + 1);
449 
450 	for (i = rx->cnt; i != limit; i++) {
451 		idx = i & (rx->mask);
452 		j = jpool->head;
453 		if (j == NULL) {
454 			myri10ge_pull_jpool(ss);
455 			j = jpool->head;
456 			if (j == NULL) {
457 				break;
458 			}
459 		}
460 		jpool->head = j->next;
461 		rx->info[idx].j = j;
462 		rx->shadow[idx].addr_low = j->dma.low;
463 		rx->shadow[idx].addr_high = j->dma.high;
464 		/* copy 4 descriptors (32-bytes) to the mcp at a time */
465 		if ((idx & 7) == 7) {
466 			myri10ge_submit_8rx(&rx->lanai[idx - 7],
467 			    &rx->shadow[idx - 7]);
468 		}
469 	}
470 	rx->cnt = i;
471 }
472 
473 /*
474  * Transfer buffers from the nic to the free pool.
475  * Should be called holding the jpool mutex
476  */
477 
478 static inline void
479 myri10ge_unstock_jumbos(struct myri10ge_slice_state *ss)
480 {
481 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
482 	struct myri10ge_jpool_entry *j;
483 	myri10ge_rx_ring_t *rx;
484 	int i;
485 
486 	mutex_enter(&jpool->mtx);
487 	rx = &ss->rx_big;
488 
489 	for (i = 0; i < rx->mask + 1; i++) {
490 		j = rx->info[i].j;
491 		rx->info[i].j = NULL;
492 		if (j == NULL)
493 			continue;
494 		j->next = jpool->head;
495 		jpool->head = j;
496 	}
497 	mutex_exit(&jpool->mtx);
498 
499 }
500 
501 
502 /*
503  * Free routine which is called when the mblk allocated via
504  * esballoc() is freed.   Here we return the jumbo buffer
505  * to the free pool, and possibly pass some jumbo buffers
506  * to the nic
507  */
508 
509 static void
510 myri10ge_jfree_rtn(void *arg)
511 {
512 	struct myri10ge_jpool_entry *j = (struct myri10ge_jpool_entry *)arg;
513 	struct myri10ge_jpool_stuff *jpool;
514 	volatile uintptr_t *putp;
515 	uintptr_t old, new;
516 
517 	jpool = &j->ss->jpool;
518 
519 	/* prepend buffer locklessly to per-CPU freelist */
520 	putp = (void *)&jpool->cpu[CPU->cpu_seqid & MYRI10GE_MAX_CPU_MASK].head;
521 	new = (uintptr_t)j;
522 	do {
523 		old = *putp;
524 		j->next = (void *)old;
525 	} while (atomic_cas_ulong(putp, old, new) != old);
526 }
527 
528 static void
529 myri10ge_remove_jbuf(struct myri10ge_jpool_entry *j)
530 {
531 	(void) ddi_dma_unbind_handle(j->dma_handle);
532 	ddi_dma_mem_free(&j->acc_handle);
533 	ddi_dma_free_handle(&j->dma_handle);
534 	kmem_free(j, sizeof (*j));
535 }
536 
537 
538 /*
539  * Allocates one physically contiguous descriptor
540  * and add it to the jumbo buffer pool.
541  */
542 
543 static int
544 myri10ge_add_jbuf(struct myri10ge_slice_state *ss)
545 {
546 	struct myri10ge_jpool_entry *j;
547 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
548 	ddi_dma_attr_t *rx_dma_attr;
549 	size_t real_length;
550 	ddi_dma_cookie_t cookie;
551 	uint_t count;
552 	int err;
553 
554 	if (myri10ge_mtu < 2048)
555 		rx_dma_attr = &myri10ge_rx_std_dma_attr;
556 	else
557 		rx_dma_attr = &myri10ge_rx_jumbo_dma_attr;
558 
559 again:
560 	j = (struct myri10ge_jpool_entry *)
561 	    kmem_alloc(sizeof (*j), KM_SLEEP);
562 	err = ddi_dma_alloc_handle(ss->mgp->dip, rx_dma_attr,
563 	    DDI_DMA_DONTWAIT, NULL, &j->dma_handle);
564 	if (err != DDI_SUCCESS)
565 		goto abort_with_j;
566 
567 	err = ddi_dma_mem_alloc(j->dma_handle, myri10ge_mtu,
568 	    &myri10ge_dev_access_attr,  DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
569 	    NULL, &j->buf, &real_length, &j->acc_handle);
570 	if (err != DDI_SUCCESS)
571 		goto abort_with_handle;
572 
573 	err = ddi_dma_addr_bind_handle(j->dma_handle, NULL, j->buf,
574 	    real_length, DDI_DMA_READ|DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
575 	    NULL, &cookie, &count);
576 	if (err != DDI_SUCCESS)
577 		goto abort_with_mem;
578 
579 	/*
580 	 * Make certain std MTU buffers do not cross a 4KB boundary:
581 	 *
582 	 * Setting dma_attr_align=4096 will do this, but the system
583 	 * will only allocate 1 RX buffer per 4KB page, rather than 2.
584 	 * Setting dma_attr_granular=4096 *seems* to work around this,
585 	 * but I'm paranoid about future systems no longer honoring
586 	 * this, so fall back to the safe, but memory wasting way if a
587 	 * buffer crosses a 4KB boundary.
588 	 */
589 
590 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
591 	    rx_dma_attr->dma_attr_align != 4096) {
592 		uint32_t start, end;
593 
594 		start = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
595 		end = start + myri10ge_mtu;
596 		if (((end >> 12) != (start >> 12)) && (start & 4095U)) {
597 			printf("std buffer crossed a 4KB boundary!\n");
598 			myri10ge_remove_jbuf(j);
599 			rx_dma_attr->dma_attr_align = 4096;
600 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
601 			goto again;
602 		}
603 	}
604 
605 	j->dma.low =
606 	    htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
607 	j->dma.high =
608 	    htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
609 	j->ss = ss;
610 
611 
612 	j->free_func.free_func = myri10ge_jfree_rtn;
613 	j->free_func.free_arg = (char *)j;
614 	mutex_enter(&jpool->mtx);
615 	j->next = jpool->head;
616 	jpool->head = j;
617 	jpool->num_alloc++;
618 	mutex_exit(&jpool->mtx);
619 	return (0);
620 
621 abort_with_mem:
622 	ddi_dma_mem_free(&j->acc_handle);
623 
624 abort_with_handle:
625 	ddi_dma_free_handle(&j->dma_handle);
626 
627 abort_with_j:
628 	kmem_free(j, sizeof (*j));
629 
630 	/*
631 	 * If an allocation failed, perhaps it failed because it could
632 	 * not satisfy granularity requirement.  Disable that, and
633 	 * try agin.
634 	 */
635 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
636 	    rx_dma_attr->dma_attr_align != 4096) {
637 			cmn_err(CE_NOTE,
638 			    "!alloc failed, reverting to gran=1\n");
639 			rx_dma_attr->dma_attr_align = 4096;
640 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
641 			goto again;
642 	}
643 	return (err);
644 }
645 
646 static int
647 myri10ge_jfree_cnt(struct myri10ge_jpool_stuff *jpool)
648 {
649 	int i;
650 	struct myri10ge_jpool_entry *j;
651 
652 	mutex_enter(&jpool->mtx);
653 	j = jpool->head;
654 	i = 0;
655 	while (j != NULL) {
656 		i++;
657 		j = j->next;
658 	}
659 	mutex_exit(&jpool->mtx);
660 	return (i);
661 }
662 
663 static int
664 myri10ge_add_jbufs(struct myri10ge_slice_state *ss, int num, int total)
665 {
666 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
667 	int allocated = 0;
668 	int err;
669 	int needed;
670 
671 	/*
672 	 * if total is set, user wants "num" jbufs in the pool,
673 	 * otherwise the user wants to "num" additional jbufs
674 	 * added to the pool
675 	 */
676 	if (total && jpool->num_alloc) {
677 		allocated = myri10ge_jfree_cnt(jpool);
678 		needed = num - allocated;
679 	} else {
680 		needed = num;
681 	}
682 
683 	while (needed > 0) {
684 		needed--;
685 		err = myri10ge_add_jbuf(ss);
686 		if (err == 0) {
687 			allocated++;
688 		}
689 	}
690 	return (allocated);
691 }
692 
693 static void
694 myri10ge_remove_jbufs(struct myri10ge_slice_state *ss)
695 {
696 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
697 	struct myri10ge_jpool_entry *j;
698 
699 	mutex_enter(&jpool->mtx);
700 	myri10ge_pull_jpool(ss);
701 	while (jpool->head != NULL) {
702 		jpool->num_alloc--;
703 		j = jpool->head;
704 		jpool->head = j->next;
705 		myri10ge_remove_jbuf(j);
706 	}
707 	mutex_exit(&jpool->mtx);
708 }
709 
710 static void
711 myri10ge_carve_up_jbufs_into_small_ring(struct myri10ge_slice_state *ss)
712 {
713 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
714 	struct myri10ge_jpool_entry *j = NULL;
715 	caddr_t ptr;
716 	uint32_t dma_low, dma_high;
717 	int idx, len;
718 	unsigned int alloc_size;
719 
720 	dma_low = dma_high = len = 0;
721 	alloc_size = myri10ge_small_bytes + MXGEFW_PAD;
722 	ptr = NULL;
723 	for (idx = 0; idx < ss->rx_small.mask + 1; idx++) {
724 		/* Allocate a jumbo frame and carve it into small frames */
725 		if (len < alloc_size) {
726 			mutex_enter(&jpool->mtx);
727 			/* remove jumbo from freelist */
728 			j = jpool->head;
729 			jpool->head = j->next;
730 			/* place it onto small list */
731 			j->next = ss->small_jpool;
732 			ss->small_jpool = j;
733 			mutex_exit(&jpool->mtx);
734 			len = myri10ge_mtu;
735 			dma_low = ntohl(j->dma.low);
736 			dma_high = ntohl(j->dma.high);
737 			ptr = j->buf;
738 		}
739 		ss->rx_small.info[idx].ptr = ptr;
740 		ss->rx_small.shadow[idx].addr_low = htonl(dma_low);
741 		ss->rx_small.shadow[idx].addr_high = htonl(dma_high);
742 		len -= alloc_size;
743 		ptr += alloc_size;
744 		dma_low += alloc_size;
745 	}
746 }
747 
748 /*
749  * Return the jumbo bufs we carved up for small to the jumbo pool
750  */
751 
752 static void
753 myri10ge_release_small_jbufs(struct myri10ge_slice_state *ss)
754 {
755 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
756 	struct myri10ge_jpool_entry *j = NULL;
757 
758 	mutex_enter(&jpool->mtx);
759 	while (ss->small_jpool != NULL) {
760 		j = ss->small_jpool;
761 		ss->small_jpool = j->next;
762 		j->next = jpool->head;
763 		jpool->head = j;
764 	}
765 	mutex_exit(&jpool->mtx);
766 	ss->jbufs_for_smalls = 0;
767 }
768 
769 static int
770 myri10ge_add_tx_handle(struct myri10ge_slice_state *ss)
771 {
772 	myri10ge_tx_ring_t *tx = &ss->tx;
773 	struct myri10ge_priv *mgp = ss->mgp;
774 	struct myri10ge_tx_dma_handle *handle;
775 	int err;
776 
777 	handle = kmem_zalloc(sizeof (*handle), KM_SLEEP);
778 	err = ddi_dma_alloc_handle(mgp->dip,
779 	    &myri10ge_tx_dma_attr,
780 	    DDI_DMA_SLEEP, NULL,
781 	    &handle->h);
782 	if (err) {
783 		static int limit = 0;
784 		if (limit == 0)
785 			cmn_err(CE_WARN, "%s: Falled to alloc tx dma handle\n",
786 			    mgp->name);
787 		limit++;
788 		kmem_free(handle, sizeof (*handle));
789 		return (err);
790 	}
791 	mutex_enter(&tx->handle_lock);
792 	MYRI10GE_SLICE_STAT_INC(tx_handles_alloced);
793 	handle->next = tx->free_tx_handles;
794 	tx->free_tx_handles = handle;
795 	mutex_exit(&tx->handle_lock);
796 	return (DDI_SUCCESS);
797 }
798 
799 static void
800 myri10ge_remove_tx_handles(struct myri10ge_slice_state *ss)
801 {
802 	myri10ge_tx_ring_t *tx = &ss->tx;
803 	struct myri10ge_tx_dma_handle *handle;
804 	mutex_enter(&tx->handle_lock);
805 
806 	handle = tx->free_tx_handles;
807 	while (handle != NULL) {
808 		tx->free_tx_handles = handle->next;
809 		ddi_dma_free_handle(&handle->h);
810 		kmem_free(handle, sizeof (*handle));
811 		handle = tx->free_tx_handles;
812 		MYRI10GE_SLICE_STAT_DEC(tx_handles_alloced);
813 	}
814 	mutex_exit(&tx->handle_lock);
815 	if (MYRI10GE_SLICE_STAT(tx_handles_alloced) != 0) {
816 		cmn_err(CE_WARN, "%s: %d tx dma handles allocated at close\n",
817 		    ss->mgp->name,
818 		    (int)MYRI10GE_SLICE_STAT(tx_handles_alloced));
819 	}
820 }
821 
822 static void
823 myri10ge_free_tx_handles(myri10ge_tx_ring_t *tx,
824     struct myri10ge_tx_dma_handle_head *list)
825 {
826 	mutex_enter(&tx->handle_lock);
827 	list->tail->next = tx->free_tx_handles;
828 	tx->free_tx_handles = list->head;
829 	mutex_exit(&tx->handle_lock);
830 }
831 
832 static void
833 myri10ge_free_tx_handle_slist(myri10ge_tx_ring_t *tx,
834     struct myri10ge_tx_dma_handle *handle)
835 {
836 	struct myri10ge_tx_dma_handle_head list;
837 
838 	if (handle == NULL)
839 		return;
840 	list.head = handle;
841 	list.tail = handle;
842 	while (handle != NULL) {
843 		list.tail = handle;
844 		handle = handle->next;
845 	}
846 	myri10ge_free_tx_handles(tx, &list);
847 }
848 
849 static int
850 myri10ge_alloc_tx_handles(struct myri10ge_slice_state *ss, int count,
851     struct myri10ge_tx_dma_handle **ret)
852 {
853 	myri10ge_tx_ring_t *tx = &ss->tx;
854 	struct myri10ge_tx_dma_handle *handle;
855 	int err, i;
856 
857 	mutex_enter(&tx->handle_lock);
858 	for (i = 0; i < count; i++) {
859 		handle = tx->free_tx_handles;
860 		while (handle == NULL) {
861 			mutex_exit(&tx->handle_lock);
862 			err = myri10ge_add_tx_handle(ss);
863 			if (err != DDI_SUCCESS) {
864 				goto abort_with_handles;
865 			}
866 			mutex_enter(&tx->handle_lock);
867 			handle = tx->free_tx_handles;
868 		}
869 		tx->free_tx_handles = handle->next;
870 		handle->next = *ret;
871 		*ret = handle;
872 	}
873 	mutex_exit(&tx->handle_lock);
874 	return (DDI_SUCCESS);
875 
876 abort_with_handles:
877 	myri10ge_free_tx_handle_slist(tx, *ret);
878 	return (err);
879 }
880 
881 
882 /*
883  * Frees DMA resources associated with the send ring
884  */
885 static void
886 myri10ge_unprepare_tx_ring(struct myri10ge_slice_state *ss)
887 {
888 	myri10ge_tx_ring_t *tx;
889 	struct myri10ge_tx_dma_handle_head handles;
890 	size_t bytes;
891 	int idx;
892 
893 	tx = &ss->tx;
894 	handles.head = NULL;
895 	handles.tail = NULL;
896 	for (idx = 0; idx < ss->tx.mask + 1; idx++) {
897 		if (tx->info[idx].m) {
898 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
899 			handles.head = tx->info[idx].handle;
900 			if (handles.tail == NULL)
901 				handles.tail = tx->info[idx].handle;
902 			freeb(tx->info[idx].m);
903 			tx->info[idx].m = 0;
904 			tx->info[idx].handle = 0;
905 		}
906 		tx->cp[idx].va = NULL;
907 		myri10ge_dma_free(&tx->cp[idx].dma);
908 	}
909 	bytes = sizeof (*tx->cp) * (tx->mask + 1);
910 	kmem_free(tx->cp, bytes);
911 	tx->cp = NULL;
912 	if (handles.head != NULL)
913 		myri10ge_free_tx_handles(tx, &handles);
914 	myri10ge_remove_tx_handles(ss);
915 }
916 
917 /*
918  * Allocates DMA handles associated with the send ring
919  */
920 static inline int
921 myri10ge_prepare_tx_ring(struct myri10ge_slice_state *ss)
922 {
923 	struct myri10ge_tx_dma_handle *handles;
924 	int h;
925 	size_t bytes;
926 
927 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
928 	ss->tx.cp = kmem_zalloc(bytes, KM_SLEEP);
929 	if (ss->tx.cp == NULL) {
930 		cmn_err(CE_WARN,
931 		    "%s: Failed to allocate tx copyblock storage\n",
932 		    ss->mgp->name);
933 		return (DDI_FAILURE);
934 	}
935 
936 
937 	/* allocate the TX copyblocks */
938 	for (h = 0; h < ss->tx.mask + 1; h++) {
939 		ss->tx.cp[h].va = myri10ge_dma_alloc(ss->mgp->dip,
940 		    4096, &myri10ge_rx_jumbo_dma_attr,
941 		    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
942 		    DDI_DMA_WRITE|DDI_DMA_STREAMING, &ss->tx.cp[h].dma, 1,
943 		    DDI_DMA_DONTWAIT);
944 		if (ss->tx.cp[h].va == NULL) {
945 			cmn_err(CE_WARN, "%s: Failed to allocate tx "
946 			    "copyblock %d\n", ss->mgp->name, h);
947 			goto abort_with_copyblocks;
948 		}
949 	}
950 	/* pre-allocate transmit handles */
951 	handles = NULL;
952 	(void) myri10ge_alloc_tx_handles(ss, myri10ge_tx_handles_initial,
953 	    &handles);
954 	if (handles != NULL)
955 		myri10ge_free_tx_handle_slist(&ss->tx, handles);
956 
957 	return (DDI_SUCCESS);
958 
959 abort_with_copyblocks:
960 	while (h > 0)  {
961 		h--;
962 		myri10ge_dma_free(&ss->tx.cp[h].dma);
963 	}
964 
965 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
966 	kmem_free(ss->tx.cp, bytes);
967 	ss->tx.cp = NULL;
968 	return (DDI_FAILURE);
969 }
970 
971 /*
972  * The eeprom strings on the lanaiX have the format
973  * SN=x\0
974  * MAC=x:x:x:x:x:x\0
975  * PT:ddd mmm xx xx:xx:xx xx\0
976  * PV:ddd mmm xx xx:xx:xx xx\0
977  */
978 static int
979 myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
980 {
981 #define	MYRI10GE_NEXT_STRING(p) while (ptr < limit && *ptr++)
982 #define	myri10ge_digit(c) (((c) >= '0' && (c) <= '9') ? ((c) - '0') :	\
983 		(((c) >= 'A' && (c) <= 'F') ? (10 + (c) - 'A') :	\
984 		(((c) >= 'a' && (c) <= 'f') ? (10 + (c) - 'a') : -1)))
985 
986 	char *ptr, *limit;
987 	int i, hv, lv;
988 
989 	ptr = mgp->eeprom_strings;
990 	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
991 
992 	while (*ptr != '\0' && ptr < limit) {
993 		if (memcmp(ptr, "MAC=", 4) == 0) {
994 			ptr += 4;
995 			if (myri10ge_verbose)
996 				printf("%s: mac address = %s\n", mgp->name,
997 				    ptr);
998 			mgp->mac_addr_string = ptr;
999 			for (i = 0; i < 6; i++) {
1000 				if ((ptr + 2) > limit)
1001 					goto abort;
1002 
1003 				if (*(ptr+1) == ':') {
1004 					hv = 0;
1005 					lv = myri10ge_digit(*ptr); ptr++;
1006 				} else {
1007 					hv = myri10ge_digit(*ptr); ptr++;
1008 					lv = myri10ge_digit(*ptr); ptr++;
1009 				}
1010 				mgp->mac_addr[i] = (hv << 4) | lv;
1011 				ptr++;
1012 			}
1013 		}
1014 		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
1015 			ptr += 3;
1016 			mgp->sn_str = (char *)ptr;
1017 		}
1018 		if (memcmp((const void *)ptr, "PC=", 3) == 0) {
1019 			ptr += 3;
1020 			mgp->pc_str = (char *)ptr;
1021 		}
1022 		MYRI10GE_NEXT_STRING(ptr);
1023 	}
1024 
1025 	return (0);
1026 
1027 abort:
1028 	cmn_err(CE_WARN, "%s: failed to parse eeprom_strings", mgp->name);
1029 	return (ENXIO);
1030 }
1031 
1032 
1033 /*
1034  * Determine the register set containing the PCI resource we
1035  * want to map: the memory-mappable part of the interface. We do
1036  * this by scanning the DDI "reg" property of the interface,
1037  * which is an array of mx_ddi_reg_set structures.
1038  */
1039 static int
1040 myri10ge_reg_set(dev_info_t *dip, int *reg_set, int *span,
1041     unsigned long *busno, unsigned long *devno,
1042     unsigned long *funcno)
1043 {
1044 
1045 #define	REGISTER_NUMBER(ip)	(ip[0] >>  0 & 0xff)
1046 #define	FUNCTION_NUMBER(ip)	(ip[0] >>  8 & 0x07)
1047 #define	DEVICE_NUMBER(ip)	(ip[0] >> 11 & 0x1f)
1048 #define	BUS_NUMBER(ip)		(ip[0] >> 16 & 0xff)
1049 #define	ADDRESS_SPACE(ip)	(ip[0] >> 24 & 0x03)
1050 #define	PCI_ADDR_HIGH(ip)	(ip[1])
1051 #define	PCI_ADDR_LOW(ip) 	(ip[2])
1052 #define	PCI_SPAN_HIGH(ip)	(ip[3])
1053 #define	PCI_SPAN_LOW(ip)	(ip[4])
1054 
1055 #define	MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
1056 #define	MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3
1057 
1058 	int *data, i, *rs;
1059 	uint32_t nelementsp;
1060 
1061 #ifdef MYRI10GE_REGSET_VERBOSE
1062 	char *address_space_name[] = { "Configuration Space",
1063 					"I/O Space",
1064 					"32-bit Memory Space",
1065 					"64-bit Memory Space"
1066 	};
1067 #endif
1068 
1069 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1070 	    "reg", &data, &nelementsp) != DDI_SUCCESS) {
1071 		printf("Could not determine register set.\n");
1072 		return (ENXIO);
1073 	}
1074 
1075 #ifdef MYRI10GE_REGSET_VERBOSE
1076 	printf("There are %d register sets.\n", nelementsp / 5);
1077 #endif
1078 	if (!nelementsp) {
1079 		printf("Didn't find any \"reg\" properties.\n");
1080 		ddi_prop_free(data);
1081 		return (ENODEV);
1082 	}
1083 
1084 	/* Scan for the register number. */
1085 	rs = &data[0];
1086 	*busno = BUS_NUMBER(rs);
1087 	*devno = DEVICE_NUMBER(rs);
1088 	*funcno = FUNCTION_NUMBER(rs);
1089 
1090 #ifdef MYRI10GE_REGSET_VERBOSE
1091 	printf("*** Scanning for register number.\n");
1092 #endif
1093 	for (i = 0; i < nelementsp / 5; i++) {
1094 		rs = &data[5 * i];
1095 #ifdef MYRI10GE_REGSET_VERBOSE
1096 		printf("Examining register set %d:\n", i);
1097 		printf("  Register number = %d.\n", REGISTER_NUMBER(rs));
1098 		printf("  Function number = %d.\n", FUNCTION_NUMBER(rs));
1099 		printf("  Device number   = %d.\n", DEVICE_NUMBER(rs));
1100 		printf("  Bus number      = %d.\n", BUS_NUMBER(rs));
1101 		printf("  Address space   = %d (%s ).\n", ADDRESS_SPACE(rs),
1102 		    address_space_name[ADDRESS_SPACE(rs)]);
1103 		printf("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH(rs),
1104 		    PCI_ADDR_LOW(rs));
1105 		printf("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH(rs),
1106 		    PCI_SPAN_LOW(rs));
1107 #endif
1108 		/* We are looking for a memory property. */
1109 
1110 		if (ADDRESS_SPACE(rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE ||
1111 		    ADDRESS_SPACE(rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
1112 			*reg_set = i;
1113 
1114 #ifdef MYRI10GE_REGSET_VERBOSE
1115 			printf("%s uses register set %d.\n",
1116 			    address_space_name[ADDRESS_SPACE(rs)], *reg_set);
1117 #endif
1118 
1119 			*span = (PCI_SPAN_LOW(rs));
1120 #ifdef MYRI10GE_REGSET_VERBOSE
1121 			printf("Board span is 0x%x\n", *span);
1122 #endif
1123 			break;
1124 		}
1125 	}
1126 
1127 	ddi_prop_free(data);
1128 
1129 	/* If no match, fail. */
1130 	if (i >= nelementsp / 5) {
1131 		return (EIO);
1132 	}
1133 
1134 	return (0);
1135 }
1136 
1137 
1138 static int
1139 myri10ge_load_firmware_from_zlib(struct myri10ge_priv *mgp, uint32_t *limit)
1140 {
1141 	void *inflate_buffer;
1142 	int rv, status;
1143 	size_t sram_size = mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE;
1144 	size_t destlen;
1145 	mcp_gen_header_t *hdr;
1146 	unsigned hdr_offset, i;
1147 
1148 
1149 	*limit = 0; /* -Wuninitialized */
1150 	status = 0;
1151 
1152 	inflate_buffer = kmem_zalloc(sram_size, KM_NOSLEEP);
1153 	if (!inflate_buffer) {
1154 		cmn_err(CE_WARN,
1155 		    "%s: Could not allocate buffer to inflate mcp\n",
1156 		    mgp->name);
1157 		return (ENOMEM);
1158 	}
1159 
1160 	destlen = sram_size;
1161 	rv = z_uncompress(inflate_buffer, &destlen, mgp->eth_z8e,
1162 	    mgp->eth_z8e_length);
1163 
1164 	if (rv != Z_OK) {
1165 		cmn_err(CE_WARN, "%s: Could not inflate mcp: %s\n",
1166 		    mgp->name, z_strerror(rv));
1167 		status = ENXIO;
1168 		goto abort;
1169 	}
1170 
1171 	*limit = (uint32_t)destlen;
1172 
1173 	hdr_offset = htonl(*(uint32_t *)(void *)((char *)inflate_buffer +
1174 	    MCP_HEADER_PTR_OFFSET));
1175 	hdr = (void *)((char *)inflate_buffer + hdr_offset);
1176 	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
1177 		cmn_err(CE_WARN, "%s: Bad firmware type: 0x%x\n", mgp->name,
1178 		    ntohl(hdr->mcp_type));
1179 		status = EIO;
1180 		goto abort;
1181 	}
1182 
1183 	/* save firmware version for kstat */
1184 	(void) strncpy(mgp->fw_version, hdr->version, sizeof (mgp->fw_version));
1185 	if (myri10ge_verbose)
1186 		printf("%s: firmware id: %s\n", mgp->name, hdr->version);
1187 
1188 	/* Copy the inflated firmware to NIC SRAM. */
1189 	for (i = 0; i < *limit; i += 256) {
1190 		myri10ge_pio_copy((char *)mgp->sram + MYRI10GE_FW_OFFSET + i,
1191 		    (char *)inflate_buffer + i,
1192 		    min(256U, (unsigned)(*limit - i)));
1193 		mb();
1194 		(void) *(int *)(void *)mgp->sram;
1195 		mb();
1196 	}
1197 
1198 abort:
1199 	kmem_free(inflate_buffer, sram_size);
1200 
1201 	return (status);
1202 
1203 }
1204 
1205 
1206 int
1207 myri10ge_send_cmd(struct myri10ge_priv *mgp, uint32_t cmd,
1208     myri10ge_cmd_t *data)
1209 {
1210 	mcp_cmd_t *buf;
1211 	char buf_bytes[sizeof (*buf) + 8];
1212 	volatile mcp_cmd_response_t *response = mgp->cmd;
1213 	volatile char *cmd_addr =
1214 	    (volatile char *)mgp->sram + MXGEFW_ETH_CMD;
1215 	int sleep_total = 0;
1216 
1217 	/* ensure buf is aligned to 8 bytes */
1218 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1219 
1220 	buf->data0 = htonl(data->data0);
1221 	buf->data1 = htonl(data->data1);
1222 	buf->data2 = htonl(data->data2);
1223 	buf->cmd = htonl(cmd);
1224 	buf->response_addr.low = mgp->cmd_dma.low;
1225 	buf->response_addr.high = mgp->cmd_dma.high;
1226 	mutex_enter(&mgp->cmd_lock);
1227 	response->result = 0xffffffff;
1228 	mb();
1229 
1230 	myri10ge_pio_copy((void *)cmd_addr, buf, sizeof (*buf));
1231 
1232 	/* wait up to 20ms */
1233 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
1234 		mb();
1235 		if (response->result != 0xffffffff) {
1236 			if (response->result == 0) {
1237 				data->data0 = ntohl(response->data);
1238 				mutex_exit(&mgp->cmd_lock);
1239 				return (0);
1240 			} else if (ntohl(response->result)
1241 			    == MXGEFW_CMD_UNKNOWN) {
1242 				mutex_exit(&mgp->cmd_lock);
1243 				return (ENOSYS);
1244 			} else if (ntohl(response->result)
1245 			    == MXGEFW_CMD_ERROR_UNALIGNED) {
1246 				mutex_exit(&mgp->cmd_lock);
1247 				return (E2BIG);
1248 			} else {
1249 				cmn_err(CE_WARN,
1250 				    "%s: command %d failed, result = %d\n",
1251 				    mgp->name, cmd, ntohl(response->result));
1252 				mutex_exit(&mgp->cmd_lock);
1253 				return (ENXIO);
1254 			}
1255 		}
1256 		drv_usecwait(1000);
1257 	}
1258 	mutex_exit(&mgp->cmd_lock);
1259 	cmn_err(CE_WARN, "%s: command %d timed out, result = %d\n",
1260 	    mgp->name, cmd, ntohl(response->result));
1261 	return (EAGAIN);
1262 }
1263 
1264 /*
1265  * Enable or disable periodic RDMAs from the host to make certain
1266  * chipsets resend dropped PCIe messages
1267  */
1268 
1269 static void
1270 myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
1271 {
1272 	char buf_bytes[72];
1273 	volatile uint32_t *confirm;
1274 	volatile char *submit;
1275 	uint32_t *buf;
1276 	int i;
1277 
1278 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1279 
1280 	/* clear confirmation addr */
1281 	confirm = (volatile uint32_t *)mgp->cmd;
1282 	*confirm = 0;
1283 	mb();
1284 
1285 	/*
1286 	 * send an rdma command to the PCIe engine, and wait for the
1287 	 * response in the confirmation address.  The firmware should
1288 	 *  write a -1 there to indicate it is alive and well
1289 	 */
1290 
1291 	buf[0] = mgp->cmd_dma.high;		/* confirm addr MSW */
1292 	buf[1] = mgp->cmd_dma.low;		/* confirm addr LSW */
1293 	buf[2] = htonl(0xffffffff);		/* confirm data */
1294 	buf[3] = htonl(mgp->cmd_dma.high); 	/* dummy addr MSW */
1295 	buf[4] = htonl(mgp->cmd_dma.low); 	/* dummy addr LSW */
1296 	buf[5] = htonl(enable);			/* enable? */
1297 
1298 
1299 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_DUMMY_RDMA);
1300 
1301 	myri10ge_pio_copy((char *)submit, buf, 64);
1302 	mb();
1303 	drv_usecwait(1000);
1304 	mb();
1305 	i = 0;
1306 	while (*confirm != 0xffffffff && i < 20) {
1307 		drv_usecwait(1000);
1308 		i++;
1309 	}
1310 	if (*confirm != 0xffffffff) {
1311 		cmn_err(CE_WARN, "%s: dummy rdma %s failed (%p = 0x%x)",
1312 		    mgp->name,
1313 		    (enable ? "enable" : "disable"), (void*) confirm, *confirm);
1314 	}
1315 }
1316 
1317 static int
1318 myri10ge_load_firmware(struct myri10ge_priv *mgp)
1319 {
1320 	myri10ge_cmd_t cmd;
1321 	volatile uint32_t *confirm;
1322 	volatile char *submit;
1323 	char buf_bytes[72];
1324 	uint32_t *buf, size;
1325 	int status, i;
1326 
1327 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1328 
1329 	status = myri10ge_load_firmware_from_zlib(mgp, &size);
1330 	if (status) {
1331 		cmn_err(CE_WARN, "%s: firmware loading failed\n", mgp->name);
1332 		return (status);
1333 	}
1334 
1335 	/* clear confirmation addr */
1336 	confirm = (volatile uint32_t *)mgp->cmd;
1337 	*confirm = 0;
1338 	mb();
1339 
1340 	/*
1341 	 * send a reload command to the bootstrap MCP, and wait for the
1342 	 * response in the confirmation address.  The firmware should
1343 	 * write a -1 there to indicate it is alive and well
1344 	 */
1345 
1346 	buf[0] = mgp->cmd_dma.high;	/* confirm addr MSW */
1347 	buf[1] = mgp->cmd_dma.low;	/* confirm addr LSW */
1348 	buf[2] = htonl(0xffffffff);	/* confirm data */
1349 
1350 	/*
1351 	 * FIX: All newest firmware should un-protect the bottom of
1352 	 * the sram before handoff. However, the very first interfaces
1353 	 * do not. Therefore the handoff copy must skip the first 8 bytes
1354 	 */
1355 	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8); /* where the code starts */
1356 	buf[4] = htonl(size - 8); 	/* length of code */
1357 	buf[5] = htonl(8);		/* where to copy to */
1358 	buf[6] = htonl(0);		/* where to jump to */
1359 
1360 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_HANDOFF);
1361 
1362 	myri10ge_pio_copy((char *)submit, buf, 64);
1363 	mb();
1364 	drv_usecwait(1000);
1365 	mb();
1366 	i = 0;
1367 	while (*confirm != 0xffffffff && i < 1000) {
1368 		drv_usecwait(1000);
1369 		i++;
1370 	}
1371 	if (*confirm != 0xffffffff) {
1372 		cmn_err(CE_WARN, "%s: handoff failed (%p = 0x%x)",
1373 		    mgp->name, (void *) confirm, *confirm);
1374 
1375 		return (ENXIO);
1376 	}
1377 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1378 	if (status != 0) {
1379 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_GET_RX_RING_SIZE\n",
1380 		    mgp->name);
1381 		return (ENXIO);
1382 	}
1383 
1384 	mgp->max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
1385 	myri10ge_dummy_rdma(mgp, 1);
1386 	return (0);
1387 }
1388 
1389 static int
1390 myri10ge_m_unicst(void *arg, const uint8_t *addr)
1391 {
1392 	struct myri10ge_priv *mgp = arg;
1393 	myri10ge_cmd_t cmd;
1394 	int status;
1395 
1396 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1397 	    | (addr[2] << 8) | addr[3]);
1398 
1399 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1400 
1401 	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd);
1402 	if (status == 0 && (addr != mgp->mac_addr))
1403 		(void) memcpy(mgp->mac_addr, addr, sizeof (mgp->mac_addr));
1404 
1405 	return (status);
1406 }
1407 
1408 static int
1409 myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
1410 {
1411 	myri10ge_cmd_t cmd;
1412 	int status;
1413 
1414 	if (pause)
1415 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_FLOW_CONTROL,
1416 		    &cmd);
1417 	else
1418 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_FLOW_CONTROL,
1419 		    &cmd);
1420 
1421 	if (status) {
1422 		cmn_err(CE_WARN, "%s: Failed to set flow control mode\n",
1423 		    mgp->name);
1424 		return (ENXIO);
1425 	}
1426 	mgp->pause = pause;
1427 	return (0);
1428 }
1429 
1430 static void
1431 myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc)
1432 {
1433 	myri10ge_cmd_t cmd;
1434 	int status;
1435 
1436 	if (promisc)
1437 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_PROMISC, &cmd);
1438 	else
1439 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_PROMISC, &cmd);
1440 
1441 	if (status) {
1442 		cmn_err(CE_WARN, "%s: Failed to set promisc mode\n",
1443 		    mgp->name);
1444 	}
1445 }
1446 
1447 static int
1448 myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
1449 {
1450 	myri10ge_cmd_t cmd;
1451 	int status;
1452 	uint32_t len;
1453 	void *dmabench;
1454 	struct myri10ge_dma_stuff dmabench_dma;
1455 	char *test = " ";
1456 
1457 	/*
1458 	 * Run a small DMA test.
1459 	 * The magic multipliers to the length tell the firmware
1460 	 * tp do DMA read, write, or read+write tests.  The
1461 	 * results are returned in cmd.data0.  The upper 16
1462 	 * bits or the return is the number of transfers completed.
1463 	 * The lower 16 bits is the time in 0.5us ticks that the
1464 	 * transfers took to complete
1465 	 */
1466 
1467 	len = mgp->tx_boundary;
1468 
1469 	dmabench = myri10ge_dma_alloc(mgp->dip, len,
1470 	    &myri10ge_rx_jumbo_dma_attr, &myri10ge_dev_access_attr,
1471 	    DDI_DMA_STREAMING,  DDI_DMA_RDWR|DDI_DMA_STREAMING,
1472 	    &dmabench_dma, 1, DDI_DMA_DONTWAIT);
1473 	mgp->read_dma = mgp->write_dma = mgp->read_write_dma = 0;
1474 	if (dmabench == NULL) {
1475 		cmn_err(CE_WARN, "%s dma benchmark aborted\n", mgp->name);
1476 		return (ENOMEM);
1477 	}
1478 
1479 	cmd.data0 = ntohl(dmabench_dma.low);
1480 	cmd.data1 = ntohl(dmabench_dma.high);
1481 	cmd.data2 = len * 0x10000;
1482 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1483 	if (status != 0) {
1484 		test = "read";
1485 		goto abort;
1486 	}
1487 	mgp->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1488 
1489 	cmd.data0 = ntohl(dmabench_dma.low);
1490 	cmd.data1 = ntohl(dmabench_dma.high);
1491 	cmd.data2 = len * 0x1;
1492 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1493 	if (status != 0) {
1494 		test = "write";
1495 		goto abort;
1496 	}
1497 	mgp->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1498 
1499 	cmd.data0 = ntohl(dmabench_dma.low);
1500 	cmd.data1 = ntohl(dmabench_dma.high);
1501 	cmd.data2 = len * 0x10001;
1502 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1503 	if (status != 0) {
1504 		test = "read/write";
1505 		goto abort;
1506 	}
1507 	mgp->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
1508 	    (cmd.data0 & 0xffff);
1509 
1510 
1511 abort:
1512 	myri10ge_dma_free(&dmabench_dma);
1513 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
1514 		cmn_err(CE_WARN, "%s %s dma benchmark failed\n", mgp->name,
1515 		    test);
1516 	return (status);
1517 }
1518 
1519 static int
1520 myri10ge_reset(struct myri10ge_priv *mgp)
1521 {
1522 	myri10ge_cmd_t cmd;
1523 	struct myri10ge_nic_stat *ethstat;
1524 	struct myri10ge_slice_state *ss;
1525 	int i, status;
1526 	size_t bytes;
1527 
1528 	/* send a reset command to the card to see if it is alive */
1529 	(void) memset(&cmd, 0, sizeof (cmd));
1530 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
1531 	if (status != 0) {
1532 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
1533 		return (ENXIO);
1534 	}
1535 
1536 	/* Now exchange information about interrupts  */
1537 
1538 	bytes = mgp->max_intr_slots * sizeof (*mgp->ss[0].rx_done.entry);
1539 	cmd.data0 = (uint32_t)bytes;
1540 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1541 
1542 	/*
1543 	 * Even though we already know how many slices are supported
1544 	 * via myri10ge_probe_slices() MXGEFW_CMD_GET_MAX_RSS_QUEUES
1545 	 * has magic side effects, and must be called after a reset.
1546 	 * It must be called prior to calling any RSS related cmds,
1547 	 * including assigning an interrupt queue for anything but
1548 	 * slice 0.  It must also be called *after*
1549 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1550 	 * the firmware to compute offsets.
1551 	 */
1552 
1553 	if (mgp->num_slices > 1) {
1554 
1555 		/* ask the maximum number of slices it supports */
1556 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1557 		    &cmd);
1558 		if (status != 0) {
1559 			cmn_err(CE_WARN,
1560 			    "%s: failed to get number of slices\n",
1561 			    mgp->name);
1562 			return (status);
1563 		}
1564 
1565 		/*
1566 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1567 		 * to setting up the interrupt queue DMA
1568 		 */
1569 
1570 		cmd.data0 = mgp->num_slices;
1571 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE |
1572 		    MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1573 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1574 		    &cmd);
1575 		if (status != 0) {
1576 			cmn_err(CE_WARN,
1577 			    "%s: failed to set number of slices\n",
1578 			    mgp->name);
1579 			return (status);
1580 		}
1581 	}
1582 	for (i = 0; i < mgp->num_slices; i++) {
1583 		ss = &mgp->ss[i];
1584 		cmd.data0 = ntohl(ss->rx_done.dma.low);
1585 		cmd.data1 = ntohl(ss->rx_done.dma.high);
1586 		cmd.data2 = i;
1587 		status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA,
1588 		    &cmd);
1589 	};
1590 
1591 	status |= myri10ge_send_cmd(mgp,  MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1592 	for (i = 0; i < mgp->num_slices; i++) {
1593 		ss = &mgp->ss[i];
1594 		ss->irq_claim = (volatile unsigned int *)
1595 		    (void *)(mgp->sram + cmd.data0 + 8 * i);
1596 	}
1597 
1598 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
1599 		status |= myri10ge_send_cmd(mgp,
1600 		    MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1601 		mgp->irq_deassert = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1602 	}
1603 
1604 	status |= myri10ge_send_cmd(mgp,
1605 	    MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1606 	mgp->intr_coal_delay_ptr = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1607 
1608 	if (status != 0) {
1609 		cmn_err(CE_WARN, "%s: failed set interrupt parameters\n",
1610 		    mgp->name);
1611 		return (status);
1612 	}
1613 
1614 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
1615 	(void) myri10ge_dma_test(mgp, MXGEFW_DMA_TEST);
1616 
1617 	/* reset mcp/driver shared state back to 0 */
1618 
1619 	for (i = 0; i < mgp->num_slices; i++) {
1620 		ss = &mgp->ss[i];
1621 		bytes = mgp->max_intr_slots *
1622 		    sizeof (*mgp->ss[0].rx_done.entry);
1623 		(void) memset(ss->rx_done.entry, 0, bytes);
1624 		ss->tx.req = 0;
1625 		ss->tx.done = 0;
1626 		ss->tx.pkt_done = 0;
1627 		ss->rx_big.cnt = 0;
1628 		ss->rx_small.cnt = 0;
1629 		ss->rx_done.idx = 0;
1630 		ss->rx_done.cnt = 0;
1631 		ss->rx_token = 0;
1632 		ss->tx.watchdog_done = 0;
1633 		ss->tx.watchdog_req = 0;
1634 		ss->tx.active = 0;
1635 		ss->tx.activate = 0;
1636 	}
1637 	mgp->watchdog_rx_pause = 0;
1638 	if (mgp->ksp_stat != NULL) {
1639 		ethstat = (struct myri10ge_nic_stat *)mgp->ksp_stat->ks_data;
1640 		ethstat->link_changes.value.ul = 0;
1641 	}
1642 	status = myri10ge_m_unicst(mgp, mgp->mac_addr);
1643 	myri10ge_change_promisc(mgp, 0);
1644 	(void) myri10ge_change_pause(mgp, mgp->pause);
1645 	return (status);
1646 }
1647 
1648 static int
1649 myri10ge_init_toeplitz(struct myri10ge_priv *mgp)
1650 {
1651 	myri10ge_cmd_t cmd;
1652 	int i, b, s, t, j;
1653 	int status;
1654 	uint32_t k[8];
1655 	uint32_t tmp;
1656 	uint8_t *key;
1657 
1658 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
1659 	    &cmd);
1660 	if (status != 0) {
1661 		cmn_err(CE_WARN, "%s: failed to get rss key\n",
1662 		    mgp->name);
1663 		return (EIO);
1664 	}
1665 	myri10ge_pio_copy32(mgp->rss_key,
1666 	    (uint32_t *)(void*)((char *)mgp->sram + cmd.data0),
1667 	    sizeof (mgp->rss_key));
1668 
1669 	mgp->toeplitz_hash_table = kmem_alloc(sizeof (uint32_t) * 12 * 256,
1670 	    KM_SLEEP);
1671 	key = (uint8_t *)mgp->rss_key;
1672 	t = 0;
1673 	for (b = 0; b < 12; b++) {
1674 		for (s = 0; s < 8; s++) {
1675 			/* Bits: b*8+s, ..., b*8+s+31 */
1676 			k[s] = 0;
1677 			for (j = 0; j < 32; j++) {
1678 				int bit = b*8+s+j;
1679 				bit = 0x1 & (key[bit / 8] >> (7 -(bit & 0x7)));
1680 				k[s] |= bit << (31 - j);
1681 			}
1682 		}
1683 
1684 		for (i = 0; i <= 0xff; i++) {
1685 			tmp = 0;
1686 			if (i & (1 << 7)) { tmp ^= k[0]; }
1687 			if (i & (1 << 6)) { tmp ^= k[1]; }
1688 			if (i & (1 << 5)) { tmp ^= k[2]; }
1689 			if (i & (1 << 4)) { tmp ^= k[3]; }
1690 			if (i & (1 << 3)) { tmp ^= k[4]; }
1691 			if (i & (1 << 2)) { tmp ^= k[5]; }
1692 			if (i & (1 << 1)) { tmp ^= k[6]; }
1693 			if (i & (1 << 0)) { tmp ^= k[7]; }
1694 			mgp->toeplitz_hash_table[t++] = tmp;
1695 		}
1696 	}
1697 	return (0);
1698 }
1699 
1700 static inline struct myri10ge_slice_state *
1701 myri10ge_toeplitz_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1702 {
1703 	struct tcphdr *hdr;
1704 	uint32_t saddr, daddr;
1705 	uint32_t hash, slice;
1706 	uint32_t *table = mgp->toeplitz_hash_table;
1707 	uint16_t src, dst;
1708 
1709 	/*
1710 	 * Note hashing order is reversed from how it is done
1711 	 * in the NIC, so as to generate the same hash value
1712 	 * for the connection to try to keep connections CPU local
1713 	 */
1714 
1715 	/* hash on IPv4 src/dst address */
1716 	saddr = ntohl(ip->ip_src.s_addr);
1717 	daddr = ntohl(ip->ip_dst.s_addr);
1718 	hash = table[(256 * 0) + ((daddr >> 24) & 0xff)];
1719 	hash ^= table[(256 * 1) + ((daddr >> 16) & 0xff)];
1720 	hash ^= table[(256 * 2) + ((daddr >> 8) & 0xff)];
1721 	hash ^= table[(256 * 3) + ((daddr) & 0xff)];
1722 	hash ^= table[(256 * 4) + ((saddr >> 24) & 0xff)];
1723 	hash ^= table[(256 * 5) + ((saddr >> 16) & 0xff)];
1724 	hash ^= table[(256 * 6) + ((saddr >> 8) & 0xff)];
1725 	hash ^= table[(256 * 7) + ((saddr) & 0xff)];
1726 	/* hash on TCP port, if required */
1727 	if ((myri10ge_rss_hash & MXGEFW_RSS_HASH_TYPE_TCP_IPV4) &&
1728 	    ip->ip_p == IPPROTO_TCP) {
1729 		hdr = (struct tcphdr *)(void *)
1730 		    (((uint8_t *)ip) +  (ip->ip_hl << 2));
1731 		src = ntohs(hdr->th_sport);
1732 		dst = ntohs(hdr->th_dport);
1733 
1734 		hash ^= table[(256 * 8) + ((dst >> 8) & 0xff)];
1735 		hash ^= table[(256 * 9) + ((dst) & 0xff)];
1736 		hash ^= table[(256 * 10) + ((src >> 8) & 0xff)];
1737 		hash ^= table[(256 * 11) + ((src) & 0xff)];
1738 	}
1739 	slice = (mgp->num_slices - 1) & hash;
1740 	return (&mgp->ss[slice]);
1741 
1742 }
1743 
1744 static inline struct myri10ge_slice_state *
1745 myri10ge_simple_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1746 {
1747 	struct tcphdr *hdr;
1748 	uint32_t slice, hash_val;
1749 
1750 
1751 	if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) {
1752 		return (&mgp->ss[0]);
1753 	}
1754 	hdr = (struct tcphdr *)(void *)(((uint8_t *)ip) +  (ip->ip_hl << 2));
1755 
1756 	/*
1757 	 * Use the second byte of the *destination* address for
1758 	 * MXGEFW_RSS_HASH_TYPE_SRC_PORT, so as to match NIC's hashing
1759 	 */
1760 	hash_val = ntohs(hdr->th_dport) & 0xff;
1761 	if (myri10ge_rss_hash == MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT)
1762 		hash_val += ntohs(hdr->th_sport) & 0xff;
1763 
1764 	slice = (mgp->num_slices - 1) & hash_val;
1765 	return (&mgp->ss[slice]);
1766 }
1767 
1768 static inline struct myri10ge_slice_state *
1769 myri10ge_send_hash(struct myri10ge_priv *mgp, mblk_t *mp)
1770 {
1771 	unsigned int slice = 0;
1772 	struct ether_header *eh;
1773 	struct ether_vlan_header *vh;
1774 	struct ip *ip;
1775 	int ehl, ihl;
1776 
1777 	if (mgp->num_slices == 1)
1778 		return (&mgp->ss[0]);
1779 
1780 	if (myri10ge_tx_hash == 0) {
1781 		slice = CPU->cpu_id & (mgp->num_slices - 1);
1782 		return (&mgp->ss[slice]);
1783 	}
1784 
1785 	/*
1786 	 *  ensure it is a TCP or UDP over IPv4 packet, and that the
1787 	 *  headers are in the 1st mblk.  Otherwise, punt
1788 	 */
1789 	ehl = sizeof (*eh);
1790 	ihl = sizeof (*ip);
1791 	if ((MBLKL(mp)) <  (ehl + ihl + 8))
1792 		return (&mgp->ss[0]);
1793 	eh = (struct ether_header *)(void *)mp->b_rptr;
1794 	ip = (struct ip *)(void *)(eh + 1);
1795 	if (eh->ether_type != BE_16(ETHERTYPE_IP)) {
1796 		if (eh->ether_type != BE_16(ETHERTYPE_VLAN))
1797 			return (&mgp->ss[0]);
1798 		vh = (struct ether_vlan_header *)(void *)mp->b_rptr;
1799 		if (vh->ether_type != BE_16(ETHERTYPE_IP))
1800 			return (&mgp->ss[0]);
1801 		ehl += 4;
1802 		ip = (struct ip *)(void *)(vh + 1);
1803 	}
1804 	ihl = ip->ip_hl << 2;
1805 	if (MBLKL(mp) <  (ehl + ihl + 8))
1806 		return (&mgp->ss[0]);
1807 	switch (myri10ge_rss_hash) {
1808 	case MXGEFW_RSS_HASH_TYPE_IPV4:
1809 		/* fallthru */
1810 	case MXGEFW_RSS_HASH_TYPE_TCP_IPV4:
1811 		/* fallthru */
1812 	case (MXGEFW_RSS_HASH_TYPE_IPV4|MXGEFW_RSS_HASH_TYPE_TCP_IPV4):
1813 		return (myri10ge_toeplitz_send_hash(mgp, ip));
1814 	case MXGEFW_RSS_HASH_TYPE_SRC_PORT:
1815 		/* fallthru */
1816 	case MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT:
1817 		return (myri10ge_simple_send_hash(mgp, ip));
1818 	default:
1819 		break;
1820 	}
1821 	return (&mgp->ss[0]);
1822 }
1823 
1824 static int
1825 myri10ge_setup_slice(struct myri10ge_slice_state *ss)
1826 {
1827 	struct myri10ge_priv *mgp = ss->mgp;
1828 	myri10ge_cmd_t cmd;
1829 	int tx_ring_size, rx_ring_size;
1830 	int tx_ring_entries, rx_ring_entries;
1831 	int slice, status;
1832 	int allocated, idx;
1833 	size_t bytes;
1834 
1835 	slice = ss - mgp->ss;
1836 	cmd.data0 = slice;
1837 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1838 	tx_ring_size = cmd.data0;
1839 	cmd.data0 = slice;
1840 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1841 	if (status != 0)
1842 		return (status);
1843 	rx_ring_size = cmd.data0;
1844 
1845 	tx_ring_entries = tx_ring_size / sizeof (struct mcp_kreq_ether_send);
1846 	rx_ring_entries = rx_ring_size / sizeof (struct mcp_dma_addr);
1847 	ss->tx.mask = tx_ring_entries - 1;
1848 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
1849 
1850 	/* get the lanai pointers to the send and receive rings */
1851 
1852 	cmd.data0 = slice;
1853 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
1854 	ss->tx.lanai = (mcp_kreq_ether_send_t *)(void *)(mgp->sram + cmd.data0);
1855 	if (mgp->num_slices > 1) {
1856 		ss->tx.go = (char *)mgp->sram + MXGEFW_ETH_SEND_GO + 64 * slice;
1857 		ss->tx.stop = (char *)mgp->sram + MXGEFW_ETH_SEND_STOP +
1858 		    64 * slice;
1859 	} else {
1860 		ss->tx.go = NULL;
1861 		ss->tx.stop = NULL;
1862 	}
1863 
1864 	cmd.data0 = slice;
1865 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
1866 	ss->rx_small.lanai = (mcp_kreq_ether_recv_t *)
1867 	    (void *)(mgp->sram + cmd.data0);
1868 
1869 	cmd.data0 = slice;
1870 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
1871 	ss->rx_big.lanai = (mcp_kreq_ether_recv_t *)(void *)
1872 	    (mgp->sram + cmd.data0);
1873 
1874 	if (status != 0) {
1875 		cmn_err(CE_WARN,
1876 		    "%s: failed to get ring sizes or locations\n", mgp->name);
1877 		return (status);
1878 	}
1879 
1880 	status = ENOMEM;
1881 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
1882 	ss->rx_small.shadow = kmem_zalloc(bytes, KM_SLEEP);
1883 	if (ss->rx_small.shadow == NULL)
1884 		goto abort;
1885 	(void) memset(ss->rx_small.shadow, 0, bytes);
1886 
1887 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
1888 	ss->rx_big.shadow = kmem_zalloc(bytes, KM_SLEEP);
1889 	if (ss->rx_big.shadow == NULL)
1890 		goto abort_with_rx_small_shadow;
1891 	(void) memset(ss->rx_big.shadow, 0, bytes);
1892 
1893 	/* allocate the host info rings */
1894 
1895 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
1896 	ss->tx.info = kmem_zalloc(bytes, KM_SLEEP);
1897 	if (ss->tx.info == NULL)
1898 		goto abort_with_rx_big_shadow;
1899 	(void) memset(ss->tx.info, 0, bytes);
1900 
1901 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1902 	ss->rx_small.info = kmem_zalloc(bytes, KM_SLEEP);
1903 	if (ss->rx_small.info == NULL)
1904 		goto abort_with_tx_info;
1905 	(void) memset(ss->rx_small.info, 0, bytes);
1906 
1907 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1908 	ss->rx_big.info = kmem_zalloc(bytes, KM_SLEEP);
1909 	if (ss->rx_big.info == NULL)
1910 		goto abort_with_rx_small_info;
1911 	(void) memset(ss->rx_big.info, 0, bytes);
1912 
1913 	ss->tx.stall = ss->tx.sched = 0;
1914 	ss->tx.stall_early = ss->tx.stall_late = 0;
1915 
1916 	ss->jbufs_for_smalls = 1 + (1 + ss->rx_small.mask) /
1917 	    (myri10ge_mtu / (myri10ge_small_bytes + MXGEFW_PAD));
1918 
1919 	allocated = myri10ge_add_jbufs(ss,
1920 	    myri10ge_bigbufs_initial + ss->jbufs_for_smalls, 1);
1921 	if (allocated < ss->jbufs_for_smalls + myri10ge_bigbufs_initial) {
1922 		cmn_err(CE_WARN,
1923 		    "%s: Could not allocate enough receive buffers (%d/%d)\n",
1924 		    mgp->name, allocated,
1925 		    myri10ge_bigbufs_initial + ss->jbufs_for_smalls);
1926 		goto abort_with_jumbos;
1927 	}
1928 
1929 	myri10ge_carve_up_jbufs_into_small_ring(ss);
1930 	ss->j_rx_cnt = 0;
1931 
1932 	mutex_enter(&ss->jpool.mtx);
1933 	if (allocated < rx_ring_entries)
1934 		ss->jpool.low_water = allocated / 4;
1935 	else
1936 		ss->jpool.low_water = rx_ring_entries / 2;
1937 
1938 	/*
1939 	 * invalidate the big receive ring in case we do not
1940 	 * allocate sufficient jumbos to fill it
1941 	 */
1942 	(void) memset(ss->rx_big.shadow, 1,
1943 	    (ss->rx_big.mask + 1) * sizeof (ss->rx_big.shadow[0]));
1944 	for (idx = 7; idx <= ss->rx_big.mask; idx += 8) {
1945 		myri10ge_submit_8rx(&ss->rx_big.lanai[idx - 7],
1946 		    &ss->rx_big.shadow[idx - 7]);
1947 		mb();
1948 	}
1949 
1950 
1951 	myri10ge_restock_jumbos(ss);
1952 
1953 	for (idx = 7; idx <= ss->rx_small.mask; idx += 8) {
1954 		myri10ge_submit_8rx(&ss->rx_small.lanai[idx - 7],
1955 		    &ss->rx_small.shadow[idx - 7]);
1956 		mb();
1957 	}
1958 	ss->rx_small.cnt = ss->rx_small.mask + 1;
1959 
1960 	mutex_exit(&ss->jpool.mtx);
1961 
1962 	status = myri10ge_prepare_tx_ring(ss);
1963 
1964 	if (status != 0)
1965 		goto abort_with_small_jbufs;
1966 
1967 	cmd.data0 = ntohl(ss->fw_stats_dma.low);
1968 	cmd.data1 = ntohl(ss->fw_stats_dma.high);
1969 	cmd.data2 = sizeof (mcp_irq_data_t);
1970 	cmd.data2 |= (slice << 16);
1971 	bzero(ss->fw_stats, sizeof (*ss->fw_stats));
1972 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
1973 	if (status == ENOSYS) {
1974 		cmd.data0 = ntohl(ss->fw_stats_dma.low) +
1975 		    offsetof(mcp_irq_data_t, send_done_count);
1976 		cmd.data1 = ntohl(ss->fw_stats_dma.high);
1977 		status = myri10ge_send_cmd(mgp,
1978 		    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd);
1979 	}
1980 	if (status) {
1981 		cmn_err(CE_WARN, "%s: Couldn't set stats DMA\n", mgp->name);
1982 		goto abort_with_tx;
1983 	}
1984 
1985 	return (0);
1986 
1987 abort_with_tx:
1988 	myri10ge_unprepare_tx_ring(ss);
1989 
1990 abort_with_small_jbufs:
1991 	myri10ge_release_small_jbufs(ss);
1992 
1993 abort_with_jumbos:
1994 	if (allocated != 0) {
1995 		mutex_enter(&ss->jpool.mtx);
1996 		ss->jpool.low_water = 0;
1997 		mutex_exit(&ss->jpool.mtx);
1998 		myri10ge_unstock_jumbos(ss);
1999 		myri10ge_remove_jbufs(ss);
2000 	}
2001 
2002 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2003 	kmem_free(ss->rx_big.info, bytes);
2004 
2005 abort_with_rx_small_info:
2006 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2007 	kmem_free(ss->rx_small.info, bytes);
2008 
2009 abort_with_tx_info:
2010 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2011 	kmem_free(ss->tx.info, bytes);
2012 
2013 abort_with_rx_big_shadow:
2014 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2015 	kmem_free(ss->rx_big.shadow, bytes);
2016 
2017 abort_with_rx_small_shadow:
2018 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2019 	kmem_free(ss->rx_small.shadow, bytes);
2020 abort:
2021 	return (status);
2022 
2023 }
2024 
2025 static void
2026 myri10ge_teardown_slice(struct myri10ge_slice_state *ss)
2027 {
2028 	int tx_ring_entries, rx_ring_entries;
2029 	size_t bytes;
2030 
2031 	/* ignore slices that have not been fully setup */
2032 	if (ss->tx.cp == NULL)
2033 		return;
2034 	/* Free the TX copy buffers */
2035 	myri10ge_unprepare_tx_ring(ss);
2036 
2037 	/* stop passing returned buffers to firmware */
2038 
2039 	mutex_enter(&ss->jpool.mtx);
2040 	ss->jpool.low_water = 0;
2041 	mutex_exit(&ss->jpool.mtx);
2042 	myri10ge_release_small_jbufs(ss);
2043 
2044 	/* Release the free jumbo frame pool */
2045 	myri10ge_unstock_jumbos(ss);
2046 	myri10ge_remove_jbufs(ss);
2047 
2048 	rx_ring_entries = ss->rx_big.mask + 1;
2049 	tx_ring_entries = ss->tx.mask + 1;
2050 
2051 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2052 	kmem_free(ss->rx_big.info, bytes);
2053 
2054 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2055 	kmem_free(ss->rx_small.info, bytes);
2056 
2057 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2058 	kmem_free(ss->tx.info, bytes);
2059 
2060 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2061 	kmem_free(ss->rx_big.shadow, bytes);
2062 
2063 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2064 	kmem_free(ss->rx_small.shadow, bytes);
2065 
2066 }
2067 static int
2068 myri10ge_start_locked(struct myri10ge_priv *mgp)
2069 {
2070 	myri10ge_cmd_t cmd;
2071 	int status, big_pow2, i;
2072 	volatile uint8_t *itable;
2073 
2074 	status = DDI_SUCCESS;
2075 	/* Allocate DMA resources and receive buffers */
2076 
2077 	status = myri10ge_reset(mgp);
2078 	if (status != 0) {
2079 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
2080 		return (DDI_FAILURE);
2081 	}
2082 
2083 	if (mgp->num_slices > 1) {
2084 		cmd.data0 = mgp->num_slices;
2085 		cmd.data1 = 1; /* use MSI-X */
2086 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
2087 		    &cmd);
2088 		if (status != 0) {
2089 			cmn_err(CE_WARN,
2090 			    "%s: failed to set number of slices\n",
2091 			    mgp->name);
2092 			goto abort_with_nothing;
2093 		}
2094 		/* setup the indirection table */
2095 		cmd.data0 = mgp->num_slices;
2096 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
2097 		    &cmd);
2098 
2099 		status |= myri10ge_send_cmd(mgp,
2100 		    MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
2101 		if (status != 0) {
2102 			cmn_err(CE_WARN,
2103 			    "%s: failed to setup rss tables\n", mgp->name);
2104 		}
2105 
2106 		/* just enable an identity mapping */
2107 		itable = mgp->sram + cmd.data0;
2108 		for (i = 0; i < mgp->num_slices; i++)
2109 			itable[i] = (uint8_t)i;
2110 
2111 		if (myri10ge_rss_hash & MYRI10GE_TOEPLITZ_HASH) {
2112 			status = myri10ge_init_toeplitz(mgp);
2113 			if (status != 0) {
2114 				cmn_err(CE_WARN, "%s: failed to setup "
2115 				    "toeplitz tx hash table", mgp->name);
2116 				goto abort_with_nothing;
2117 			}
2118 		}
2119 		cmd.data0 = 1;
2120 		cmd.data1 = myri10ge_rss_hash;
2121 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_ENABLE,
2122 		    &cmd);
2123 		if (status != 0) {
2124 			cmn_err(CE_WARN,
2125 			    "%s: failed to enable slices\n", mgp->name);
2126 			goto abort_with_toeplitz;
2127 		}
2128 	}
2129 
2130 	for (i = 0; i < mgp->num_slices; i++) {
2131 		status = myri10ge_setup_slice(&mgp->ss[i]);
2132 		if (status != 0)
2133 			goto abort_with_slices;
2134 	}
2135 
2136 	/*
2137 	 * Tell the MCP how many buffers it has, and to
2138 	 *  bring the ethernet interface up
2139 	 *
2140 	 * Firmware needs the big buff size as a power of 2.  Lie and
2141 	 * tell it the buffer is larger, because we only use 1
2142 	 * buffer/pkt, and the mtu will prevent overruns
2143 	 */
2144 	big_pow2 = myri10ge_mtu + MXGEFW_PAD;
2145 	while (!ISP2(big_pow2))
2146 		big_pow2++;
2147 
2148 	/* now give firmware buffers sizes, and MTU */
2149 	cmd.data0 = myri10ge_mtu;
2150 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd);
2151 	cmd.data0 = myri10ge_small_bytes;
2152 	status |=
2153 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
2154 	cmd.data0 = big_pow2;
2155 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2156 	if (status) {
2157 		cmn_err(CE_WARN, "%s: Couldn't set buffer sizes\n", mgp->name);
2158 		goto abort_with_slices;
2159 	}
2160 
2161 
2162 	cmd.data0 = 1;
2163 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_TSO_MODE, &cmd);
2164 	if (status) {
2165 		cmn_err(CE_WARN, "%s: unable to setup TSO (%d)\n",
2166 		    mgp->name, status);
2167 	} else {
2168 		mgp->features |= MYRI10GE_TSO;
2169 	}
2170 
2171 	mgp->link_state = -1;
2172 	mgp->rdma_tags_available = 15;
2173 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd);
2174 	if (status) {
2175 		cmn_err(CE_WARN, "%s: unable to start ethernet\n", mgp->name);
2176 		goto abort_with_slices;
2177 	}
2178 	mgp->running = MYRI10GE_ETH_RUNNING;
2179 	return (DDI_SUCCESS);
2180 
2181 abort_with_slices:
2182 	for (i = 0; i < mgp->num_slices; i++)
2183 		myri10ge_teardown_slice(&mgp->ss[i]);
2184 
2185 	mgp->running = MYRI10GE_ETH_STOPPED;
2186 
2187 abort_with_toeplitz:
2188 	if (mgp->toeplitz_hash_table != NULL) {
2189 		kmem_free(mgp->toeplitz_hash_table,
2190 		    sizeof (uint32_t) * 12 * 256);
2191 		mgp->toeplitz_hash_table = NULL;
2192 	}
2193 
2194 abort_with_nothing:
2195 	return (DDI_FAILURE);
2196 }
2197 
2198 static void
2199 myri10ge_stop_locked(struct myri10ge_priv *mgp)
2200 {
2201 	int status, old_down_cnt;
2202 	myri10ge_cmd_t cmd;
2203 	int wait_time = 10;
2204 	int i, polling;
2205 
2206 	old_down_cnt = mgp->down_cnt;
2207 	mb();
2208 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2209 	if (status) {
2210 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
2211 	}
2212 
2213 	while (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2214 		delay(1 * drv_usectohz(1000000));
2215 		wait_time--;
2216 		if (wait_time == 0)
2217 			break;
2218 	}
2219 again:
2220 	if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2221 		cmn_err(CE_WARN, "%s: didn't get down irq\n", mgp->name);
2222 		for (i = 0; i < mgp->num_slices; i++) {
2223 			/*
2224 			 * take and release the rx lock to ensure
2225 			 * that no interrupt thread is blocked
2226 			 * elsewhere in the stack, preventing
2227 			 * completion
2228 			 */
2229 
2230 			mutex_enter(&mgp->ss[i].rx_lock);
2231 			printf("%s: slice %d rx irq idle\n",
2232 			    mgp->name, i);
2233 			mutex_exit(&mgp->ss[i].rx_lock);
2234 
2235 			/* verify that the poll handler is inactive */
2236 			mutex_enter(&mgp->ss->poll_lock);
2237 			polling = mgp->ss->rx_polling;
2238 			mutex_exit(&mgp->ss->poll_lock);
2239 			if (polling) {
2240 				printf("%s: slice %d is polling\n",
2241 				    mgp->name, i);
2242 				delay(1 * drv_usectohz(1000000));
2243 				goto again;
2244 			}
2245 		}
2246 		delay(1 * drv_usectohz(1000000));
2247 		if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2248 			cmn_err(CE_WARN, "%s: Never got down irq\n", mgp->name);
2249 		}
2250 	}
2251 
2252 	for (i = 0; i < mgp->num_slices; i++)
2253 		myri10ge_teardown_slice(&mgp->ss[i]);
2254 
2255 	if (mgp->toeplitz_hash_table != NULL) {
2256 		kmem_free(mgp->toeplitz_hash_table,
2257 		    sizeof (uint32_t) * 12 * 256);
2258 		mgp->toeplitz_hash_table = NULL;
2259 	}
2260 	mgp->running = MYRI10GE_ETH_STOPPED;
2261 }
2262 
2263 static int
2264 myri10ge_m_start(void *arg)
2265 {
2266 	struct myri10ge_priv *mgp = arg;
2267 	int status;
2268 
2269 	mutex_enter(&mgp->intrlock);
2270 
2271 	if (mgp->running != MYRI10GE_ETH_STOPPED) {
2272 		mutex_exit(&mgp->intrlock);
2273 		return (DDI_FAILURE);
2274 	}
2275 	status = myri10ge_start_locked(mgp);
2276 	mutex_exit(&mgp->intrlock);
2277 
2278 	if (status != DDI_SUCCESS)
2279 		return (status);
2280 
2281 	/* start the watchdog timer */
2282 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
2283 	    mgp->timer_ticks);
2284 	return (DDI_SUCCESS);
2285 
2286 }
2287 
2288 static void
2289 myri10ge_m_stop(void *arg)
2290 {
2291 	struct myri10ge_priv *mgp = arg;
2292 
2293 	mutex_enter(&mgp->intrlock);
2294 	/* if the device not running give up */
2295 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
2296 		mutex_exit(&mgp->intrlock);
2297 		return;
2298 	}
2299 
2300 	mgp->running = MYRI10GE_ETH_STOPPING;
2301 	mutex_exit(&mgp->intrlock);
2302 	(void) untimeout(mgp->timer_id);
2303 	mutex_enter(&mgp->intrlock);
2304 	myri10ge_stop_locked(mgp);
2305 	mutex_exit(&mgp->intrlock);
2306 
2307 }
2308 
2309 static inline void
2310 myri10ge_rx_csum(mblk_t *mp, struct myri10ge_rx_ring_stats *s, uint32_t csum)
2311 {
2312 	struct ether_header *eh;
2313 	struct ip *ip;
2314 	struct ip6_hdr *ip6;
2315 	uint32_t start, stuff, end, partial, hdrlen;
2316 
2317 
2318 	csum = ntohs((uint16_t)csum);
2319 	eh = (struct ether_header *)(void *)mp->b_rptr;
2320 	hdrlen = sizeof (*eh);
2321 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2322 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2323 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2324 			s->brdcstrcv++;
2325 		else
2326 			s->multircv++;
2327 	}
2328 
2329 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
2330 		/*
2331 		 * fix checksum by subtracting 4 bytes after what the
2332 		 * firmware thought was the end of the ether hdr
2333 		 */
2334 		partial = *(uint32_t *)
2335 		    (void *)(mp->b_rptr + ETHERNET_HEADER_SIZE);
2336 		csum += ~partial;
2337 		csum +=  (csum < ~partial);
2338 		csum = (csum >> 16) + (csum & 0xFFFF);
2339 		csum = (csum >> 16) + (csum & 0xFFFF);
2340 		hdrlen += VLAN_TAGSZ;
2341 	}
2342 
2343 	if (eh->ether_type ==  BE_16(ETHERTYPE_IP)) {
2344 		ip = (struct ip *)(void *)(mp->b_rptr + hdrlen);
2345 		start = ip->ip_hl << 2;
2346 
2347 		if (ip->ip_p == IPPROTO_TCP)
2348 			stuff = start + offsetof(struct tcphdr, th_sum);
2349 		else if (ip->ip_p == IPPROTO_UDP)
2350 			stuff = start + offsetof(struct udphdr, uh_sum);
2351 		else
2352 			return;
2353 		end = ntohs(ip->ip_len);
2354 	} else if (eh->ether_type ==  BE_16(ETHERTYPE_IPV6)) {
2355 		ip6 = (struct ip6_hdr *)(void *)(mp->b_rptr + hdrlen);
2356 		start = sizeof (*ip6);
2357 		if (ip6->ip6_nxt == IPPROTO_TCP) {
2358 			stuff = start + offsetof(struct tcphdr, th_sum);
2359 		} else if (ip6->ip6_nxt == IPPROTO_UDP)
2360 			stuff = start + offsetof(struct udphdr, uh_sum);
2361 		else
2362 			return;
2363 		end = start + ntohs(ip6->ip6_plen);
2364 		/*
2365 		 * IPv6 headers do not contain a checksum, and hence
2366 		 * do not checksum to zero, so they don't "fall out"
2367 		 * of the partial checksum calculation like IPv4
2368 		 * headers do.  We need to fix the partial checksum by
2369 		 * subtracting the checksum of the IPv6 header.
2370 		 */
2371 
2372 		partial = myri10ge_csum_generic((uint16_t *)ip6, sizeof (*ip6));
2373 		csum += ~partial;
2374 		csum +=  (csum < ~partial);
2375 		csum = (csum >> 16) + (csum & 0xFFFF);
2376 		csum = (csum >> 16) + (csum & 0xFFFF);
2377 	} else {
2378 		return;
2379 	}
2380 
2381 	if (MBLKL(mp) > hdrlen + end) {
2382 		/* padded frame, so hw csum may be invalid */
2383 		return;
2384 	}
2385 
2386 	mac_hcksum_set(mp, start, stuff, end, csum, HCK_PARTIALCKSUM);
2387 }
2388 
2389 static mblk_t *
2390 myri10ge_rx_done_small(struct myri10ge_slice_state *ss, uint32_t len,
2391     uint32_t csum)
2392 {
2393 	mblk_t *mp;
2394 	myri10ge_rx_ring_t *rx;
2395 	int idx;
2396 
2397 	rx = &ss->rx_small;
2398 	idx = rx->cnt & rx->mask;
2399 	ss->rx_small.cnt++;
2400 
2401 	/* allocate a new buffer to pass up the stack */
2402 	mp = allocb(len + MXGEFW_PAD, 0);
2403 	if (mp == NULL) {
2404 		MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_small_nobuf);
2405 		goto abort;
2406 	}
2407 	bcopy(ss->rx_small.info[idx].ptr,
2408 	    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2409 	mp->b_wptr += len + MXGEFW_PAD;
2410 	mp->b_rptr += MXGEFW_PAD;
2411 
2412 	ss->rx_stats.ibytes += len;
2413 	ss->rx_stats.ipackets += 1;
2414 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2415 
2416 abort:
2417 	if ((idx & 7) == 7) {
2418 		myri10ge_submit_8rx(&rx->lanai[idx - 7],
2419 		    &rx->shadow[idx - 7]);
2420 	}
2421 
2422 	return (mp);
2423 }
2424 
2425 
2426 static mblk_t *
2427 myri10ge_rx_done_big(struct myri10ge_slice_state *ss, uint32_t len,
2428     uint32_t csum)
2429 {
2430 	struct myri10ge_jpool_stuff *jpool;
2431 	struct myri10ge_jpool_entry *j;
2432 	mblk_t *mp;
2433 	int idx, num_owned_by_mcp;
2434 
2435 	jpool = &ss->jpool;
2436 	idx = ss->j_rx_cnt & ss->rx_big.mask;
2437 	j = ss->rx_big.info[idx].j;
2438 
2439 	if (j == NULL) {
2440 		printf("%s: null j at idx=%d, rx_big.cnt = %d, j_rx_cnt=%d\n",
2441 		    ss->mgp->name, idx, ss->rx_big.cnt, ss->j_rx_cnt);
2442 		return (NULL);
2443 	}
2444 
2445 
2446 	ss->rx_big.info[idx].j = NULL;
2447 	ss->j_rx_cnt++;
2448 
2449 
2450 	/*
2451 	 * Check to see if we are low on rx buffers.
2452 	 * Note that we must leave at least 8 free so there are
2453 	 * enough to free in a single 64-byte write.
2454 	 */
2455 	num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2456 	if (num_owned_by_mcp < jpool->low_water) {
2457 		mutex_enter(&jpool->mtx);
2458 		myri10ge_restock_jumbos(ss);
2459 		mutex_exit(&jpool->mtx);
2460 		num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2461 		/* if we are still low, then we have to copy */
2462 		if (num_owned_by_mcp < 16) {
2463 			MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_copy);
2464 			/* allocate a new buffer to pass up the stack */
2465 			mp = allocb(len + MXGEFW_PAD, 0);
2466 			if (mp == NULL) {
2467 				goto abort;
2468 			}
2469 			bcopy(j->buf,
2470 			    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2471 			myri10ge_jfree_rtn(j);
2472 			/* push buffer back to NIC */
2473 			mutex_enter(&jpool->mtx);
2474 			myri10ge_restock_jumbos(ss);
2475 			mutex_exit(&jpool->mtx);
2476 			goto set_len;
2477 		}
2478 	}
2479 
2480 	/* loan our buffer to the stack */
2481 	mp = desballoc((unsigned char *)j->buf, myri10ge_mtu, 0, &j->free_func);
2482 	if (mp == NULL) {
2483 		goto abort;
2484 	}
2485 
2486 set_len:
2487 	mp->b_rptr += MXGEFW_PAD;
2488 	mp->b_wptr = ((unsigned char *) mp->b_rptr + len);
2489 
2490 	ss->rx_stats.ibytes += len;
2491 	ss->rx_stats.ipackets += 1;
2492 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2493 
2494 	return (mp);
2495 
2496 abort:
2497 	myri10ge_jfree_rtn(j);
2498 	MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_big_nobuf);
2499 	return (NULL);
2500 }
2501 
2502 /*
2503  * Free all transmit buffers up until the specified index
2504  */
2505 static inline void
2506 myri10ge_tx_done(struct myri10ge_slice_state *ss, uint32_t mcp_index)
2507 {
2508 	myri10ge_tx_ring_t *tx;
2509 	struct myri10ge_tx_dma_handle_head handles;
2510 	int idx;
2511 	int limit = 0;
2512 
2513 	tx = &ss->tx;
2514 	handles.head = NULL;
2515 	handles.tail = NULL;
2516 	while (tx->pkt_done != (int)mcp_index) {
2517 		idx = tx->done & tx->mask;
2518 
2519 		/*
2520 		 * mblk & DMA handle attached only to first slot
2521 		 * per buffer in the packet
2522 		 */
2523 
2524 		if (tx->info[idx].m) {
2525 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
2526 			tx->info[idx].handle->next = handles.head;
2527 			handles.head = tx->info[idx].handle;
2528 			if (handles.tail == NULL)
2529 				handles.tail = tx->info[idx].handle;
2530 			freeb(tx->info[idx].m);
2531 			tx->info[idx].m = 0;
2532 			tx->info[idx].handle = 0;
2533 		}
2534 		if (tx->info[idx].ostat.opackets != 0) {
2535 			tx->stats.multixmt += tx->info[idx].ostat.multixmt;
2536 			tx->stats.brdcstxmt += tx->info[idx].ostat.brdcstxmt;
2537 			tx->stats.obytes += tx->info[idx].ostat.obytes;
2538 			tx->stats.opackets += tx->info[idx].ostat.opackets;
2539 			tx->info[idx].stat.un.all = 0;
2540 			tx->pkt_done++;
2541 		}
2542 
2543 		tx->done++;
2544 		/*
2545 		 * if we stalled the queue, wake it.  But Wait until
2546 		 * we have at least 1/2 our slots free.
2547 		 */
2548 		if ((tx->req - tx->done) < (tx->mask >> 1) &&
2549 		    tx->stall != tx->sched) {
2550 			mutex_enter(&ss->tx.lock);
2551 			tx->sched = tx->stall;
2552 			mutex_exit(&ss->tx.lock);
2553 			mac_tx_ring_update(ss->mgp->mh, tx->rh);
2554 		}
2555 
2556 		/* limit potential for livelock */
2557 		if (unlikely(++limit >  2 * tx->mask))
2558 			break;
2559 	}
2560 	if (tx->req == tx->done && tx->stop != NULL) {
2561 		/*
2562 		 * Nic has sent all pending requests, allow it
2563 		 * to stop polling this queue
2564 		 */
2565 		mutex_enter(&tx->lock);
2566 		if (tx->req == tx->done && tx->active) {
2567 			*(int *)(void *)tx->stop = 1;
2568 			tx->active = 0;
2569 			mb();
2570 		}
2571 		mutex_exit(&tx->lock);
2572 	}
2573 	if (handles.head != NULL)
2574 		myri10ge_free_tx_handles(tx, &handles);
2575 }
2576 
2577 static void
2578 myri10ge_mbl_init(struct myri10ge_mblk_list *mbl)
2579 {
2580 	mbl->head = NULL;
2581 	mbl->tail = &mbl->head;
2582 	mbl->cnt = 0;
2583 }
2584 
2585 /*ARGSUSED*/
2586 void
2587 myri10ge_mbl_append(struct myri10ge_slice_state *ss,
2588     struct myri10ge_mblk_list *mbl, mblk_t *mp)
2589 {
2590 	*(mbl->tail) = mp;
2591 	mbl->tail = &mp->b_next;
2592 	mp->b_next = NULL;
2593 	mbl->cnt++;
2594 }
2595 
2596 
2597 static inline void
2598 myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
2599     struct myri10ge_mblk_list *mbl, int limit, boolean_t *stop)
2600 {
2601 	myri10ge_rx_done_t *rx_done = &ss->rx_done;
2602 	struct myri10ge_priv *mgp = ss->mgp;
2603 	mblk_t *mp;
2604 	struct lro_entry *lro;
2605 	uint16_t length;
2606 	uint16_t checksum;
2607 
2608 
2609 	while (rx_done->entry[rx_done->idx].length != 0) {
2610 		if (unlikely (*stop)) {
2611 			break;
2612 		}
2613 		length = ntohs(rx_done->entry[rx_done->idx].length);
2614 		length &= (~MXGEFW_RSS_HASH_MASK);
2615 
2616 		/* limit potential for livelock */
2617 		limit -= length;
2618 		if (unlikely(limit < 0))
2619 			break;
2620 
2621 		rx_done->entry[rx_done->idx].length = 0;
2622 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
2623 		if (length <= myri10ge_small_bytes)
2624 			mp = myri10ge_rx_done_small(ss, length, checksum);
2625 		else
2626 			mp = myri10ge_rx_done_big(ss, length, checksum);
2627 		if (mp != NULL) {
2628 			if (!myri10ge_lro ||
2629 			    0 != myri10ge_lro_rx(ss, mp, checksum, mbl))
2630 				myri10ge_mbl_append(ss, mbl, mp);
2631 		}
2632 		rx_done->cnt++;
2633 		rx_done->idx = rx_done->cnt & (mgp->max_intr_slots - 1);
2634 	}
2635 	while (ss->lro_active != NULL) {
2636 		lro = ss->lro_active;
2637 		ss->lro_active = lro->next;
2638 		myri10ge_lro_flush(ss, lro, mbl);
2639 	}
2640 }
2641 
2642 static void
2643 myri10ge_intr_rx(struct myri10ge_slice_state *ss)
2644 {
2645 	uint64_t gen;
2646 	struct myri10ge_mblk_list mbl;
2647 
2648 	myri10ge_mbl_init(&mbl);
2649 	if (mutex_tryenter(&ss->rx_lock) == 0)
2650 		return;
2651 	gen = ss->rx_gen_num;
2652 	myri10ge_clean_rx_done(ss, &mbl, MYRI10GE_POLL_NULL,
2653 	    &ss->rx_polling);
2654 	if (mbl.head != NULL)
2655 		mac_rx_ring(ss->mgp->mh, ss->rx_rh, mbl.head, gen);
2656 	mutex_exit(&ss->rx_lock);
2657 
2658 }
2659 
2660 static mblk_t *
2661 myri10ge_poll_rx(void *arg, int bytes)
2662 {
2663 	struct myri10ge_slice_state *ss = arg;
2664 	struct myri10ge_mblk_list mbl;
2665 	boolean_t dummy = B_FALSE;
2666 
2667 	if (bytes == 0)
2668 		return (NULL);
2669 
2670 	myri10ge_mbl_init(&mbl);
2671 	mutex_enter(&ss->rx_lock);
2672 	if (ss->rx_polling)
2673 		myri10ge_clean_rx_done(ss, &mbl, bytes, &dummy);
2674 	else
2675 		printf("%d: poll_rx: token=%d, polling=%d\n", (int)(ss -
2676 		    ss->mgp->ss), ss->rx_token, ss->rx_polling);
2677 	mutex_exit(&ss->rx_lock);
2678 	return (mbl.head);
2679 }
2680 
2681 /*ARGSUSED*/
2682 static uint_t
2683 myri10ge_intr(caddr_t arg0, caddr_t arg1)
2684 {
2685 	struct myri10ge_slice_state *ss =
2686 	    (struct myri10ge_slice_state *)(void *)arg0;
2687 	struct myri10ge_priv *mgp = ss->mgp;
2688 	mcp_irq_data_t *stats = ss->fw_stats;
2689 	myri10ge_tx_ring_t *tx = &ss->tx;
2690 	uint32_t send_done_count;
2691 	uint8_t valid;
2692 
2693 
2694 	/* make sure the DMA has finished */
2695 	if (!stats->valid) {
2696 		return (DDI_INTR_UNCLAIMED);
2697 	}
2698 	valid = stats->valid;
2699 
2700 	/* low bit indicates receives are present */
2701 	if (valid & 1)
2702 		myri10ge_intr_rx(ss);
2703 
2704 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
2705 		/* lower legacy IRQ  */
2706 		*mgp->irq_deassert = 0;
2707 		if (!myri10ge_deassert_wait)
2708 			/* don't wait for conf. that irq is low */
2709 			stats->valid = 0;
2710 		mb();
2711 	} else {
2712 		/* no need to wait for conf. that irq is low */
2713 		stats->valid = 0;
2714 	}
2715 
2716 	do {
2717 		/* check for transmit completes and receives */
2718 		send_done_count = ntohl(stats->send_done_count);
2719 		if (send_done_count != tx->pkt_done)
2720 			myri10ge_tx_done(ss, (int)send_done_count);
2721 	} while (*((volatile uint8_t *) &stats->valid));
2722 
2723 	if (stats->stats_updated) {
2724 		if (mgp->link_state != stats->link_up || stats->link_down) {
2725 			mgp->link_state = stats->link_up;
2726 			if (stats->link_down) {
2727 				mgp->down_cnt += stats->link_down;
2728 				mgp->link_state = 0;
2729 			}
2730 			if (mgp->link_state) {
2731 				if (myri10ge_verbose)
2732 					printf("%s: link up\n", mgp->name);
2733 				mac_link_update(mgp->mh, LINK_STATE_UP);
2734 			} else {
2735 				if (myri10ge_verbose)
2736 					printf("%s: link down\n", mgp->name);
2737 				mac_link_update(mgp->mh, LINK_STATE_DOWN);
2738 			}
2739 			MYRI10GE_NIC_STAT_INC(link_changes);
2740 		}
2741 		if (mgp->rdma_tags_available !=
2742 		    ntohl(ss->fw_stats->rdma_tags_available)) {
2743 			mgp->rdma_tags_available =
2744 			    ntohl(ss->fw_stats->rdma_tags_available);
2745 			cmn_err(CE_NOTE, "%s: RDMA timed out! "
2746 			    "%d tags left\n", mgp->name,
2747 			    mgp->rdma_tags_available);
2748 		}
2749 	}
2750 
2751 	mb();
2752 	/* check to see if we have rx token to pass back */
2753 	if (valid & 0x1) {
2754 		mutex_enter(&ss->poll_lock);
2755 		if (ss->rx_polling) {
2756 			ss->rx_token = 1;
2757 		} else {
2758 			*ss->irq_claim = BE_32(3);
2759 			ss->rx_token = 0;
2760 		}
2761 		mutex_exit(&ss->poll_lock);
2762 	}
2763 	*(ss->irq_claim + 1) = BE_32(3);
2764 	return (DDI_INTR_CLAIMED);
2765 }
2766 
2767 /*
2768  * Add or remove a multicast address.  This is called with our
2769  * macinfo's lock held by GLD, so we do not need to worry about
2770  * our own locking here.
2771  */
2772 static int
2773 myri10ge_m_multicst(void *arg, boolean_t add, const uint8_t *multicastaddr)
2774 {
2775 	myri10ge_cmd_t cmd;
2776 	struct myri10ge_priv *mgp = arg;
2777 	int status, join_leave;
2778 
2779 	if (add)
2780 		join_leave = MXGEFW_JOIN_MULTICAST_GROUP;
2781 	else
2782 		join_leave = MXGEFW_LEAVE_MULTICAST_GROUP;
2783 	(void) memcpy(&cmd.data0, multicastaddr, 4);
2784 	(void) memcpy(&cmd.data1, multicastaddr + 4, 2);
2785 	cmd.data0 = htonl(cmd.data0);
2786 	cmd.data1 = htonl(cmd.data1);
2787 	status = myri10ge_send_cmd(mgp, join_leave, &cmd);
2788 	if (status == 0)
2789 		return (0);
2790 
2791 	cmn_err(CE_WARN, "%s: failed to set multicast address\n",
2792 	    mgp->name);
2793 	return (status);
2794 }
2795 
2796 
2797 static int
2798 myri10ge_m_promisc(void *arg, boolean_t on)
2799 {
2800 	struct myri10ge_priv *mgp = arg;
2801 
2802 	myri10ge_change_promisc(mgp, on);
2803 	return (0);
2804 }
2805 
2806 /*
2807  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2808  *  backwards one at a time and handle ring wraps
2809  */
2810 
2811 static inline void
2812 myri10ge_submit_req_backwards(myri10ge_tx_ring_t *tx,
2813     mcp_kreq_ether_send_t *src, int cnt)
2814 {
2815 	int idx, starting_slot;
2816 	starting_slot = tx->req;
2817 	while (cnt > 1) {
2818 		cnt--;
2819 		idx = (starting_slot + cnt) & tx->mask;
2820 		myri10ge_pio_copy(&tx->lanai[idx],
2821 		    &src[cnt], sizeof (*src));
2822 		mb();
2823 	}
2824 }
2825 
2826 /*
2827  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2828  * at most 32 bytes at a time, so as to avoid involving the software
2829  * pio handler in the nic.   We re-write the first segment's flags
2830  * to mark them valid only after writing the entire chain
2831  */
2832 
2833 static inline void
2834 myri10ge_submit_req(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
2835     int cnt)
2836 {
2837 	int idx, i;
2838 	uint32_t *src_ints, *dst_ints;
2839 	mcp_kreq_ether_send_t *srcp, *dstp, *dst;
2840 	uint8_t last_flags;
2841 
2842 	idx = tx->req & tx->mask;
2843 
2844 	last_flags = src->flags;
2845 	src->flags = 0;
2846 	mb();
2847 	dst = dstp = &tx->lanai[idx];
2848 	srcp = src;
2849 
2850 	if ((idx + cnt) < tx->mask) {
2851 		for (i = 0; i < (cnt - 1); i += 2) {
2852 			myri10ge_pio_copy(dstp, srcp, 2 * sizeof (*src));
2853 			mb(); /* force write every 32 bytes */
2854 			srcp += 2;
2855 			dstp += 2;
2856 		}
2857 	} else {
2858 		/*
2859 		 * submit all but the first request, and ensure
2860 		 *  that it is submitted below
2861 		 */
2862 		myri10ge_submit_req_backwards(tx, src, cnt);
2863 		i = 0;
2864 	}
2865 	if (i < cnt) {
2866 		/* submit the first request */
2867 		myri10ge_pio_copy(dstp, srcp, sizeof (*src));
2868 		mb(); /* barrier before setting valid flag */
2869 	}
2870 
2871 	/* re-write the last 32-bits with the valid flags */
2872 	src->flags |= last_flags;
2873 	src_ints = (uint32_t *)src;
2874 	src_ints += 3;
2875 	dst_ints = (uint32_t *)dst;
2876 	dst_ints += 3;
2877 	*dst_ints =  *src_ints;
2878 	tx->req += cnt;
2879 	mb();
2880 	/* notify NIC to poll this tx ring */
2881 	if (!tx->active && tx->go != NULL) {
2882 		*(int *)(void *)tx->go = 1;
2883 		tx->active = 1;
2884 		tx->activate++;
2885 		mb();
2886 	}
2887 }
2888 
2889 /* ARGSUSED */
2890 static inline void
2891 myri10ge_lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
2892 {
2893 	uint32_t lso_flag;
2894 	mac_lso_get(mp, mss, &lso_flag);
2895 	(*flags) |= lso_flag;
2896 }
2897 
2898 
2899 /* like pullupmsg, except preserve hcksum/LSO attributes */
2900 static int
2901 myri10ge_pullup(struct myri10ge_slice_state *ss, mblk_t *mp)
2902 {
2903 	uint32_t start, stuff, tx_offload_flags, mss;
2904 	int ok;
2905 
2906 	mss = 0;
2907 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
2908 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
2909 
2910 	ok = pullupmsg(mp, -1);
2911 	if (!ok) {
2912 		printf("pullupmsg failed");
2913 		return (DDI_FAILURE);
2914 	}
2915 	MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_pullup);
2916 	mac_hcksum_set(mp, start, stuff, NULL, NULL, tx_offload_flags);
2917 	if (tx_offload_flags & HW_LSO)
2918 		DB_LSOMSS(mp) = (uint16_t)mss;
2919 	lso_info_set(mp, mss, tx_offload_flags);
2920 	return (DDI_SUCCESS);
2921 }
2922 
2923 static inline void
2924 myri10ge_tx_stat(struct myri10ge_tx_pkt_stats *s, struct ether_header *eh,
2925     int opackets, int obytes)
2926 {
2927 	s->un.all = 0;
2928 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2929 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2930 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2931 			s->un.s.brdcstxmt = 1;
2932 		else
2933 			s->un.s.multixmt = 1;
2934 	}
2935 	s->un.s.opackets = (uint16_t)opackets;
2936 	s->un.s.obytes = obytes;
2937 }
2938 
2939 static int
2940 myri10ge_tx_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
2941     mcp_kreq_ether_send_t *req)
2942 {
2943 	myri10ge_tx_ring_t *tx = &ss->tx;
2944 	caddr_t ptr;
2945 	struct myri10ge_tx_copybuf *cp;
2946 	mblk_t *bp;
2947 	int idx, mblen, avail;
2948 	uint16_t len;
2949 
2950 	mutex_enter(&tx->lock);
2951 	avail = tx->mask - (tx->req - tx->done);
2952 	if (avail <= 1) {
2953 		mutex_exit(&tx->lock);
2954 		return (EBUSY);
2955 	}
2956 	idx = tx->req & tx->mask;
2957 	cp = &tx->cp[idx];
2958 	ptr = cp->va;
2959 	for (len = 0, bp = mp; bp != NULL; bp = bp->b_cont) {
2960 		mblen = MBLKL(bp);
2961 		bcopy(bp->b_rptr, ptr, mblen);
2962 		ptr += mblen;
2963 		len += mblen;
2964 	}
2965 	/* ensure runts are padded to 60 bytes */
2966 	if (len < 60) {
2967 		bzero(ptr, 64 - len);
2968 		len = 60;
2969 	}
2970 	req->addr_low = cp->dma.low;
2971 	req->addr_high = cp->dma.high;
2972 	req->length = htons(len);
2973 	req->pad = 0;
2974 	req->rdma_count = 1;
2975 	myri10ge_tx_stat(&tx->info[idx].stat,
2976 	    (struct ether_header *)(void *)cp->va, 1, len);
2977 	(void) ddi_dma_sync(cp->dma.handle, 0, len, DDI_DMA_SYNC_FORDEV);
2978 	myri10ge_submit_req(&ss->tx, req, 1);
2979 	mutex_exit(&tx->lock);
2980 	freemsg(mp);
2981 	return (DDI_SUCCESS);
2982 }
2983 
2984 
2985 static void
2986 myri10ge_send_locked(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *req_list,
2987     struct myri10ge_tx_buffer_state *tx_info,
2988     int count)
2989 {
2990 	int i, idx;
2991 
2992 	idx = 0; /* gcc -Wuninitialized */
2993 	/* store unmapping and bp info for tx irq handler */
2994 	for (i = 0; i < count; i++) {
2995 		idx = (tx->req + i) & tx->mask;
2996 		tx->info[idx].m = tx_info[i].m;
2997 		tx->info[idx].handle = tx_info[i].handle;
2998 	}
2999 	tx->info[idx].stat.un.all = tx_info[0].stat.un.all;
3000 
3001 	/* submit the frame to the nic */
3002 	myri10ge_submit_req(tx, req_list, count);
3003 
3004 
3005 }
3006 
3007 
3008 
3009 static void
3010 myri10ge_copydata(mblk_t *mp, int off, int len, caddr_t buf)
3011 {
3012 	mblk_t *bp;
3013 	int seglen;
3014 	uint_t count;
3015 
3016 	bp = mp;
3017 
3018 	while (off > 0) {
3019 		seglen = MBLKL(bp);
3020 		if (off < seglen)
3021 			break;
3022 		off -= seglen;
3023 		bp = bp->b_cont;
3024 	}
3025 	while (len > 0) {
3026 		seglen = MBLKL(bp);
3027 		count = min(seglen - off, len);
3028 		bcopy(bp->b_rptr + off, buf, count);
3029 		len -= count;
3030 		buf += count;
3031 		off = 0;
3032 		bp = bp->b_cont;
3033 	}
3034 }
3035 
3036 static int
3037 myri10ge_ether_parse_header(mblk_t *mp)
3038 {
3039 	struct ether_header eh_copy;
3040 	struct ether_header *eh;
3041 	int eth_hdr_len, seglen;
3042 
3043 	seglen = MBLKL(mp);
3044 	eth_hdr_len = sizeof (*eh);
3045 	if (seglen < eth_hdr_len) {
3046 		myri10ge_copydata(mp, 0, eth_hdr_len, (caddr_t)&eh_copy);
3047 		eh = &eh_copy;
3048 	} else {
3049 		eh = (struct ether_header *)(void *)mp->b_rptr;
3050 	}
3051 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
3052 		eth_hdr_len += 4;
3053 	}
3054 
3055 	return (eth_hdr_len);
3056 }
3057 
3058 static int
3059 myri10ge_lso_parse_header(mblk_t *mp, int off)
3060 {
3061 	char buf[128];
3062 	int seglen, sum_off;
3063 	struct ip *ip;
3064 	struct tcphdr *tcp;
3065 
3066 	seglen = MBLKL(mp);
3067 	if (seglen < off + sizeof (*ip)) {
3068 		myri10ge_copydata(mp, off, sizeof (*ip), buf);
3069 		ip = (struct ip *)(void *)buf;
3070 	} else {
3071 		ip = (struct ip *)(void *)(mp->b_rptr + off);
3072 	}
3073 	if (seglen < off + (ip->ip_hl << 2) + sizeof (*tcp)) {
3074 		myri10ge_copydata(mp, off,
3075 		    (ip->ip_hl << 2) + sizeof (*tcp), buf);
3076 		ip = (struct ip *)(void *)buf;
3077 	}
3078 	tcp = (struct tcphdr *)(void *)((char *)ip + (ip->ip_hl << 2));
3079 
3080 	/*
3081 	 * NIC expects ip_sum to be zero.  Recent changes to
3082 	 * OpenSolaris leave the correct ip checksum there, rather
3083 	 * than the required zero, so we need to zero it.  Otherwise,
3084 	 * the NIC will produce bad checksums when sending LSO packets.
3085 	 */
3086 	if (ip->ip_sum != 0) {
3087 		if (((char *)ip) != buf) {
3088 			/* ip points into mblk, so just zero it */
3089 			ip->ip_sum = 0;
3090 		} else {
3091 			/*
3092 			 * ip points into a copy, so walk the chain
3093 			 * to find the ip_csum, then zero it
3094 			 */
3095 			sum_off = off + _PTRDIFF(&ip->ip_sum, buf);
3096 			while (sum_off > (int)(MBLKL(mp) - 1)) {
3097 				sum_off -= MBLKL(mp);
3098 				mp = mp->b_cont;
3099 			}
3100 			mp->b_rptr[sum_off] = 0;
3101 			sum_off++;
3102 			while (sum_off > MBLKL(mp) - 1) {
3103 				sum_off -= MBLKL(mp);
3104 				mp = mp->b_cont;
3105 			}
3106 			mp->b_rptr[sum_off] = 0;
3107 		}
3108 	}
3109 	return (off + ((ip->ip_hl + tcp->th_off) << 2));
3110 }
3111 
3112 static int
3113 myri10ge_tx_tso_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
3114     mcp_kreq_ether_send_t *req_list, int hdr_size, int pkt_size,
3115     uint16_t mss, uint8_t cksum_offset)
3116 {
3117 	myri10ge_tx_ring_t *tx = &ss->tx;
3118 	struct myri10ge_priv *mgp = ss->mgp;
3119 	mblk_t *bp;
3120 	mcp_kreq_ether_send_t *req;
3121 	struct myri10ge_tx_copybuf *cp;
3122 	caddr_t rptr, ptr;
3123 	int mblen, count, cum_len, mss_resid, tx_req, pkt_size_tmp;
3124 	int resid, avail, idx, hdr_size_tmp, tx_boundary;
3125 	int rdma_count;
3126 	uint32_t seglen, len, boundary, low, high_swapped;
3127 	uint16_t pseudo_hdr_offset = htons(mss);
3128 	uint8_t flags;
3129 
3130 	tx_boundary = mgp->tx_boundary;
3131 	hdr_size_tmp = hdr_size;
3132 	resid = tx_boundary;
3133 	count = 1;
3134 	mutex_enter(&tx->lock);
3135 
3136 	/* check to see if the slots are really there */
3137 	avail = tx->mask - (tx->req - tx->done);
3138 	if (unlikely(avail <=  MYRI10GE_MAX_SEND_DESC_TSO)) {
3139 		atomic_inc_32(&tx->stall);
3140 		mutex_exit(&tx->lock);
3141 		return (EBUSY);
3142 	}
3143 
3144 	/* copy */
3145 	cum_len = -hdr_size;
3146 	count = 0;
3147 	req = req_list;
3148 	idx = tx->mask & tx->req;
3149 	cp = &tx->cp[idx];
3150 	low = ntohl(cp->dma.low);
3151 	ptr = cp->va;
3152 	cp->len = 0;
3153 	if (mss) {
3154 		int payload = pkt_size - hdr_size;
3155 		uint16_t opackets = (payload / mss) + ((payload % mss) != 0);
3156 		tx->info[idx].ostat.opackets = opackets;
3157 		tx->info[idx].ostat.obytes = (opackets - 1) * hdr_size
3158 		    + pkt_size;
3159 	}
3160 	hdr_size_tmp = hdr_size;
3161 	mss_resid = mss;
3162 	flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3163 	tx_req = tx->req;
3164 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3165 		mblen = MBLKL(bp);
3166 		rptr = (caddr_t)bp->b_rptr;
3167 		len = min(hdr_size_tmp, mblen);
3168 		if (len) {
3169 			bcopy(rptr, ptr, len);
3170 			rptr += len;
3171 			ptr += len;
3172 			resid -= len;
3173 			mblen -= len;
3174 			hdr_size_tmp -= len;
3175 			cp->len += len;
3176 			if (hdr_size_tmp)
3177 				continue;
3178 			if (resid < mss) {
3179 				tx_req++;
3180 				idx = tx->mask & tx_req;
3181 				cp = &tx->cp[idx];
3182 				low = ntohl(cp->dma.low);
3183 				ptr = cp->va;
3184 				resid = tx_boundary;
3185 			}
3186 		}
3187 		while (mblen) {
3188 			len = min(mss_resid, mblen);
3189 			bcopy(rptr, ptr, len);
3190 			mss_resid -= len;
3191 			resid -= len;
3192 			mblen -= len;
3193 			rptr += len;
3194 			ptr += len;
3195 			cp->len += len;
3196 			if (mss_resid == 0) {
3197 				mss_resid = mss;
3198 				if (resid < mss) {
3199 					tx_req++;
3200 					idx = tx->mask & tx_req;
3201 					cp = &tx->cp[idx];
3202 					cp->len = 0;
3203 					low = ntohl(cp->dma.low);
3204 					ptr = cp->va;
3205 					resid = tx_boundary;
3206 				}
3207 			}
3208 		}
3209 	}
3210 
3211 	req = req_list;
3212 	pkt_size_tmp = pkt_size;
3213 	count = 0;
3214 	rdma_count = 0;
3215 	tx_req = tx->req;
3216 	while (pkt_size_tmp) {
3217 		idx = tx->mask & tx_req;
3218 		cp = &tx->cp[idx];
3219 		high_swapped = cp->dma.high;
3220 		low = ntohl(cp->dma.low);
3221 		len = cp->len;
3222 		if (len == 0) {
3223 			printf("len=0! pkt_size_tmp=%d, pkt_size=%d\n",
3224 			    pkt_size_tmp, pkt_size);
3225 			for (bp = mp; bp != NULL; bp = bp->b_cont) {
3226 				mblen = MBLKL(bp);
3227 				printf("mblen:%d\n", mblen);
3228 			}
3229 			pkt_size_tmp = pkt_size;
3230 			tx_req = tx->req;
3231 			while (pkt_size_tmp > 0) {
3232 				idx = tx->mask & tx_req;
3233 				cp = &tx->cp[idx];
3234 				printf("cp->len = %d\n", cp->len);
3235 				pkt_size_tmp -= cp->len;
3236 				tx_req++;
3237 			}
3238 			printf("dropped\n");
3239 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3240 			goto done;
3241 		}
3242 		pkt_size_tmp -= len;
3243 		while (len) {
3244 			while (len) {
3245 				uint8_t flags_next;
3246 				int cum_len_next;
3247 
3248 				boundary = (low + mgp->tx_boundary) &
3249 				    ~(mgp->tx_boundary - 1);
3250 				seglen = boundary - low;
3251 				if (seglen > len)
3252 					seglen = len;
3253 
3254 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3255 				cum_len_next = cum_len + seglen;
3256 				(req-rdma_count)->rdma_count = rdma_count + 1;
3257 				if (likely(cum_len >= 0)) {
3258 					/* payload */
3259 					int next_is_first, chop;
3260 
3261 					chop = (cum_len_next > mss);
3262 					cum_len_next = cum_len_next % mss;
3263 					next_is_first = (cum_len_next == 0);
3264 					flags |= chop *
3265 					    MXGEFW_FLAGS_TSO_CHOP;
3266 					flags_next |= next_is_first *
3267 					    MXGEFW_FLAGS_FIRST;
3268 					rdma_count |= -(chop | next_is_first);
3269 					rdma_count += chop & !next_is_first;
3270 				} else if (likely(cum_len_next >= 0)) {
3271 					/* header ends */
3272 					int small;
3273 
3274 					rdma_count = -1;
3275 					cum_len_next = 0;
3276 					seglen = -cum_len;
3277 					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
3278 					flags_next = MXGEFW_FLAGS_TSO_PLD |
3279 					    MXGEFW_FLAGS_FIRST |
3280 					    (small * MXGEFW_FLAGS_SMALL);
3281 				}
3282 				req->addr_high = high_swapped;
3283 				req->addr_low = htonl(low);
3284 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3285 				req->pad = 0; /* complete solid 16-byte block */
3286 				req->rdma_count = 1;
3287 				req->cksum_offset = cksum_offset;
3288 				req->length = htons(seglen);
3289 				req->flags = flags | ((cum_len & 1) *
3290 				    MXGEFW_FLAGS_ALIGN_ODD);
3291 				if (cksum_offset > seglen)
3292 					cksum_offset -= seglen;
3293 				else
3294 					cksum_offset = 0;
3295 				low += seglen;
3296 				len -= seglen;
3297 				cum_len = cum_len_next;
3298 				req++;
3299 				req->flags = 0;
3300 				flags = flags_next;
3301 				count++;
3302 				rdma_count++;
3303 			}
3304 		}
3305 		tx_req++;
3306 	}
3307 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3308 	do {
3309 		req--;
3310 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
3311 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3312 	    MXGEFW_FLAGS_FIRST)));
3313 
3314 	myri10ge_submit_req(tx, req_list, count);
3315 done:
3316 	mutex_exit(&tx->lock);
3317 	freemsg(mp);
3318 	return (DDI_SUCCESS);
3319 }
3320 
3321 /*
3322  * Try to send the chain of buffers described by the mp.  We must not
3323  * encapsulate more than eth->tx.req - eth->tx.done, or
3324  * MXGEFW_MAX_SEND_DESC, whichever is more.
3325  */
3326 
3327 static int
3328 myri10ge_send(struct myri10ge_slice_state *ss, mblk_t *mp,
3329     mcp_kreq_ether_send_t *req_list, struct myri10ge_tx_buffer_state *tx_info)
3330 {
3331 	struct myri10ge_priv *mgp = ss->mgp;
3332 	myri10ge_tx_ring_t *tx = &ss->tx;
3333 	mcp_kreq_ether_send_t *req;
3334 	struct myri10ge_tx_dma_handle *handles, *dma_handle = NULL;
3335 	mblk_t  *bp;
3336 	ddi_dma_cookie_t cookie;
3337 	int err, rv, count, avail, mblen, try_pullup, i, max_segs, maclen,
3338 	    rdma_count, cum_len, lso_hdr_size;
3339 	uint32_t start, stuff, tx_offload_flags;
3340 	uint32_t seglen, len, mss, boundary, low, high_swapped;
3341 	uint_t ncookies;
3342 	uint16_t pseudo_hdr_offset;
3343 	uint8_t flags, cksum_offset, odd_flag;
3344 	int pkt_size;
3345 	int lso_copy = myri10ge_lso_copy;
3346 	try_pullup = 1;
3347 
3348 again:
3349 	/* Setup checksum offloading, if needed */
3350 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
3351 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
3352 	if (tx_offload_flags & HW_LSO) {
3353 		max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3354 		if ((tx_offload_flags & HCK_PARTIALCKSUM) == 0) {
3355 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_lsobadflags);
3356 			freemsg(mp);
3357 			return (DDI_SUCCESS);
3358 		}
3359 	} else {
3360 		max_segs = MXGEFW_MAX_SEND_DESC;
3361 		mss = 0;
3362 	}
3363 	req = req_list;
3364 	cksum_offset = 0;
3365 	pseudo_hdr_offset = 0;
3366 
3367 	/* leave an extra slot keep the ring from wrapping */
3368 	avail = tx->mask - (tx->req - tx->done);
3369 
3370 	/*
3371 	 * If we have > MXGEFW_MAX_SEND_DESC, then any over-length
3372 	 * message will need to be pulled up in order to fit.
3373 	 * Otherwise, we are low on transmit descriptors, it is
3374 	 * probably better to stall and try again rather than pullup a
3375 	 * message to fit.
3376 	 */
3377 
3378 	if (avail < max_segs) {
3379 		err = EBUSY;
3380 		atomic_inc_32(&tx->stall_early);
3381 		goto stall;
3382 	}
3383 
3384 	/* find out how long the frame is and how many segments it is */
3385 	count = 0;
3386 	odd_flag = 0;
3387 	pkt_size = 0;
3388 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
3389 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3390 		dblk_t *dbp;
3391 		mblen = MBLKL(bp);
3392 		if (mblen == 0) {
3393 			/*
3394 			 * we can't simply skip over 0-length mblks
3395 			 * because the hardware can't deal with them,
3396 			 * and we could leak them.
3397 			 */
3398 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_zero_len);
3399 			err = EIO;
3400 			goto pullup;
3401 		}
3402 		/*
3403 		 * There's no advantage to copying most gesballoc
3404 		 * attached blocks, so disable lso copy in that case
3405 		 */
3406 		if (mss && lso_copy == 1 && ((dbp = bp->b_datap) != NULL)) {
3407 			if ((void *)dbp->db_lastfree != myri10ge_db_lastfree) {
3408 				lso_copy = 0;
3409 			}
3410 		}
3411 		pkt_size += mblen;
3412 		count++;
3413 	}
3414 
3415 	/* Try to pull up excessivly long chains */
3416 	if (count >= max_segs) {
3417 		err = myri10ge_pullup(ss, mp);
3418 		if (likely(err == DDI_SUCCESS)) {
3419 			count = 1;
3420 		} else {
3421 			if (count <  MYRI10GE_MAX_SEND_DESC_TSO) {
3422 				/*
3423 				 * just let the h/w send it, it will be
3424 				 * inefficient, but us better than dropping
3425 				 */
3426 				max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3427 			} else {
3428 				/* drop it */
3429 				MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3430 				freemsg(mp);
3431 				return (0);
3432 			}
3433 		}
3434 	}
3435 
3436 	cum_len = 0;
3437 	maclen = myri10ge_ether_parse_header(mp);
3438 
3439 	if (tx_offload_flags & HCK_PARTIALCKSUM) {
3440 
3441 		cksum_offset = start + maclen;
3442 		pseudo_hdr_offset = htons(stuff + maclen);
3443 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
3444 		flags |= MXGEFW_FLAGS_CKSUM;
3445 	}
3446 
3447 	lso_hdr_size = 0; /* -Wunitinialized */
3448 	if (mss) { /* LSO */
3449 		/* this removes any CKSUM flag from before */
3450 		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3451 		/*
3452 		 * parse the headers and set cum_len to a negative
3453 		 * value to reflect the offset of the TCP payload
3454 		 */
3455 		lso_hdr_size =  myri10ge_lso_parse_header(mp, maclen);
3456 		cum_len = -lso_hdr_size;
3457 		if ((mss < mgp->tx_boundary) && lso_copy) {
3458 			err = myri10ge_tx_tso_copy(ss, mp, req_list,
3459 			    lso_hdr_size, pkt_size, mss, cksum_offset);
3460 			return (err);
3461 		}
3462 
3463 		/*
3464 		 * for TSO, pseudo_hdr_offset holds mss.  The firmware
3465 		 * figures out where to put the checksum by parsing
3466 		 * the header.
3467 		 */
3468 
3469 		pseudo_hdr_offset = htons(mss);
3470 	} else if (pkt_size <= MXGEFW_SEND_SMALL_SIZE) {
3471 		flags |= MXGEFW_FLAGS_SMALL;
3472 		if (pkt_size < myri10ge_tx_copylen) {
3473 			req->cksum_offset = cksum_offset;
3474 			req->pseudo_hdr_offset = pseudo_hdr_offset;
3475 			req->flags = flags;
3476 			err = myri10ge_tx_copy(ss, mp, req);
3477 			return (err);
3478 		}
3479 		cum_len = 0;
3480 	}
3481 
3482 	/* pull one DMA handle for each bp from our freelist */
3483 	handles = NULL;
3484 	err = myri10ge_alloc_tx_handles(ss, count, &handles);
3485 	if (err != DDI_SUCCESS) {
3486 		err = DDI_FAILURE;
3487 		goto stall;
3488 	}
3489 	count = 0;
3490 	rdma_count = 0;
3491 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3492 		mblen = MBLKL(bp);
3493 		dma_handle = handles;
3494 		handles = handles->next;
3495 
3496 		rv = ddi_dma_addr_bind_handle(dma_handle->h, NULL,
3497 		    (caddr_t)bp->b_rptr, mblen,
3498 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
3499 		    &cookie, &ncookies);
3500 		if (unlikely(rv != DDI_DMA_MAPPED)) {
3501 			err = EIO;
3502 			try_pullup = 0;
3503 			dma_handle->next = handles;
3504 			handles = dma_handle;
3505 			goto abort_with_handles;
3506 		}
3507 
3508 		/* reserve the slot */
3509 		tx_info[count].m = bp;
3510 		tx_info[count].handle = dma_handle;
3511 
3512 		for (; ; ) {
3513 			low = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
3514 			high_swapped =
3515 			    htonl(MYRI10GE_HIGHPART_TO_U32(
3516 			    cookie.dmac_laddress));
3517 			len = (uint32_t)cookie.dmac_size;
3518 			while (len) {
3519 				uint8_t flags_next;
3520 				int cum_len_next;
3521 
3522 				boundary = (low + mgp->tx_boundary) &
3523 				    ~(mgp->tx_boundary - 1);
3524 				seglen = boundary - low;
3525 				if (seglen > len)
3526 					seglen = len;
3527 
3528 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3529 				cum_len_next = cum_len + seglen;
3530 				if (mss) {
3531 					(req-rdma_count)->rdma_count =
3532 					    rdma_count + 1;
3533 					if (likely(cum_len >= 0)) {
3534 						/* payload */
3535 						int next_is_first, chop;
3536 
3537 						chop = (cum_len_next > mss);
3538 						cum_len_next =
3539 						    cum_len_next % mss;
3540 						next_is_first =
3541 						    (cum_len_next == 0);
3542 						flags |= chop *
3543 						    MXGEFW_FLAGS_TSO_CHOP;
3544 						flags_next |= next_is_first *
3545 						    MXGEFW_FLAGS_FIRST;
3546 						rdma_count |=
3547 						    -(chop | next_is_first);
3548 						rdma_count +=
3549 						    chop & !next_is_first;
3550 					} else if (likely(cum_len_next >= 0)) {
3551 						/* header ends */
3552 						int small;
3553 
3554 						rdma_count = -1;
3555 						cum_len_next = 0;
3556 						seglen = -cum_len;
3557 						small = (mss <=
3558 						    MXGEFW_SEND_SMALL_SIZE);
3559 						flags_next =
3560 						    MXGEFW_FLAGS_TSO_PLD
3561 						    | MXGEFW_FLAGS_FIRST
3562 						    | (small *
3563 						    MXGEFW_FLAGS_SMALL);
3564 					}
3565 				}
3566 				req->addr_high = high_swapped;
3567 				req->addr_low = htonl(low);
3568 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3569 				req->pad = 0; /* complete solid 16-byte block */
3570 				req->rdma_count = 1;
3571 				req->cksum_offset = cksum_offset;
3572 				req->length = htons(seglen);
3573 				req->flags = flags | ((cum_len & 1) * odd_flag);
3574 				if (cksum_offset > seglen)
3575 					cksum_offset -= seglen;
3576 				else
3577 					cksum_offset = 0;
3578 				low += seglen;
3579 				len -= seglen;
3580 				cum_len = cum_len_next;
3581 				count++;
3582 				rdma_count++;
3583 				/*  make sure all the segments will fit */
3584 				if (unlikely(count >= max_segs)) {
3585 					MYRI10GE_ATOMIC_SLICE_STAT_INC(
3586 					    xmit_lowbuf);
3587 					/* may try a pullup */
3588 					err = EBUSY;
3589 					if (try_pullup)
3590 						try_pullup = 2;
3591 					goto abort_with_handles;
3592 				}
3593 				req++;
3594 				req->flags = 0;
3595 				flags = flags_next;
3596 				tx_info[count].m = 0;
3597 			}
3598 			ncookies--;
3599 			if (ncookies == 0)
3600 				break;
3601 			ddi_dma_nextcookie(dma_handle->h, &cookie);
3602 		}
3603 	}
3604 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3605 
3606 	if (mss) {
3607 		do {
3608 			req--;
3609 			req->flags |= MXGEFW_FLAGS_TSO_LAST;
3610 		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3611 		    MXGEFW_FLAGS_FIRST)));
3612 	}
3613 
3614 	/* calculate tx stats */
3615 	if (mss) {
3616 		uint16_t opackets;
3617 		int payload;
3618 
3619 		payload = pkt_size - lso_hdr_size;
3620 		opackets = (payload / mss) + ((payload % mss) != 0);
3621 		tx_info[0].stat.un.all = 0;
3622 		tx_info[0].ostat.opackets = opackets;
3623 		tx_info[0].ostat.obytes = (opackets - 1) * lso_hdr_size
3624 		    + pkt_size;
3625 	} else {
3626 		myri10ge_tx_stat(&tx_info[0].stat,
3627 		    (struct ether_header *)(void *)mp->b_rptr, 1, pkt_size);
3628 	}
3629 	mutex_enter(&tx->lock);
3630 
3631 	/* check to see if the slots are really there */
3632 	avail = tx->mask - (tx->req - tx->done);
3633 	if (unlikely(avail <= count)) {
3634 		mutex_exit(&tx->lock);
3635 		err = 0;
3636 		goto late_stall;
3637 	}
3638 
3639 	myri10ge_send_locked(tx, req_list, tx_info, count);
3640 	mutex_exit(&tx->lock);
3641 	return (DDI_SUCCESS);
3642 
3643 late_stall:
3644 	try_pullup = 0;
3645 	atomic_inc_32(&tx->stall_late);
3646 
3647 abort_with_handles:
3648 	/* unbind and free handles from previous mblks */
3649 	for (i = 0; i < count; i++) {
3650 		bp = tx_info[i].m;
3651 		tx_info[i].m = 0;
3652 		if (bp) {
3653 			dma_handle = tx_info[i].handle;
3654 			(void) ddi_dma_unbind_handle(dma_handle->h);
3655 			dma_handle->next = handles;
3656 			handles = dma_handle;
3657 			tx_info[i].handle = NULL;
3658 			tx_info[i].m = NULL;
3659 		}
3660 	}
3661 	myri10ge_free_tx_handle_slist(tx, handles);
3662 pullup:
3663 	if (try_pullup) {
3664 		err = myri10ge_pullup(ss, mp);
3665 		if (err != DDI_SUCCESS && try_pullup == 2) {
3666 			/* drop */
3667 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3668 			freemsg(mp);
3669 			return (0);
3670 		}
3671 		try_pullup = 0;
3672 		goto again;
3673 	}
3674 
3675 stall:
3676 	if (err != 0) {
3677 		if (err == EBUSY) {
3678 			atomic_inc_32(&tx->stall);
3679 		} else {
3680 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3681 		}
3682 	}
3683 	return (err);
3684 }
3685 
3686 static mblk_t *
3687 myri10ge_send_wrapper(void *arg, mblk_t *mp)
3688 {
3689 	struct myri10ge_slice_state *ss = arg;
3690 	int err = 0;
3691 	mcp_kreq_ether_send_t *req_list;
3692 #if defined(__i386)
3693 	/*
3694 	 * We need about 2.5KB of scratch space to handle transmits.
3695 	 * i86pc has only 8KB of kernel stack space, so we malloc the
3696 	 * scratch space there rather than keeping it on the stack.
3697 	 */
3698 	size_t req_size, tx_info_size;
3699 	struct myri10ge_tx_buffer_state *tx_info;
3700 	caddr_t req_bytes;
3701 
3702 	req_size = sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3703 	    + 8;
3704 	req_bytes = kmem_alloc(req_size, KM_SLEEP);
3705 	tx_info_size = sizeof (*tx_info) * (MYRI10GE_MAX_SEND_DESC_TSO + 1);
3706 	tx_info = kmem_alloc(tx_info_size, KM_SLEEP);
3707 #else
3708 	char req_bytes[sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3709 	    + 8];
3710 	struct myri10ge_tx_buffer_state tx_info[MYRI10GE_MAX_SEND_DESC_TSO + 1];
3711 #endif
3712 
3713 	/* ensure req_list entries are aligned to 8 bytes */
3714 	req_list = (struct mcp_kreq_ether_send *)
3715 	    (((unsigned long)req_bytes + 7UL) & ~7UL);
3716 
3717 	err = myri10ge_send(ss, mp, req_list, tx_info);
3718 
3719 #if defined(__i386)
3720 	kmem_free(tx_info, tx_info_size);
3721 	kmem_free(req_bytes, req_size);
3722 #endif
3723 	if (err)
3724 		return (mp);
3725 	else
3726 		return (NULL);
3727 }
3728 
3729 static int
3730 myri10ge_addmac(void *arg, const uint8_t *mac_addr)
3731 {
3732 	struct myri10ge_priv *mgp = arg;
3733 	int err;
3734 
3735 	if (mac_addr == NULL)
3736 		return (EINVAL);
3737 
3738 	mutex_enter(&mgp->intrlock);
3739 	if (mgp->macaddr_cnt) {
3740 		mutex_exit(&mgp->intrlock);
3741 		return (ENOSPC);
3742 	}
3743 	err = myri10ge_m_unicst(mgp, mac_addr);
3744 	if (!err)
3745 		mgp->macaddr_cnt++;
3746 
3747 	mutex_exit(&mgp->intrlock);
3748 	if (err)
3749 		return (err);
3750 
3751 	bcopy(mac_addr, mgp->mac_addr, sizeof (mgp->mac_addr));
3752 	return (0);
3753 }
3754 
3755 /*ARGSUSED*/
3756 static int
3757 myri10ge_remmac(void *arg, const uint8_t *mac_addr)
3758 {
3759 	struct myri10ge_priv *mgp = arg;
3760 
3761 	mutex_enter(&mgp->intrlock);
3762 	mgp->macaddr_cnt--;
3763 	mutex_exit(&mgp->intrlock);
3764 
3765 	return (0);
3766 }
3767 
3768 /*ARGSUSED*/
3769 static void
3770 myri10ge_fill_group(void *arg, mac_ring_type_t rtype, const int index,
3771     mac_group_info_t *infop, mac_group_handle_t gh)
3772 {
3773 	struct myri10ge_priv *mgp = arg;
3774 
3775 	if (rtype != MAC_RING_TYPE_RX)
3776 		return;
3777 
3778 	infop->mgi_driver = (mac_group_driver_t)mgp;
3779 	infop->mgi_start = NULL;
3780 	infop->mgi_stop = NULL;
3781 	infop->mgi_addmac = myri10ge_addmac;
3782 	infop->mgi_remmac = myri10ge_remmac;
3783 	infop->mgi_count = mgp->num_slices;
3784 }
3785 
3786 static int
3787 myri10ge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
3788 {
3789 	struct myri10ge_slice_state *ss;
3790 
3791 	ss = (struct myri10ge_slice_state *)rh;
3792 	mutex_enter(&ss->rx_lock);
3793 	ss->rx_gen_num = mr_gen_num;
3794 	mutex_exit(&ss->rx_lock);
3795 	return (0);
3796 }
3797 
3798 /*
3799  * Retrieve a value for one of the statistics for a particular rx ring
3800  */
3801 int
3802 myri10ge_rx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3803 {
3804 	struct myri10ge_slice_state *ss;
3805 
3806 	ss = (struct myri10ge_slice_state *)rh;
3807 	switch (stat) {
3808 	case MAC_STAT_RBYTES:
3809 		*val = ss->rx_stats.ibytes;
3810 		break;
3811 
3812 	case MAC_STAT_IPACKETS:
3813 		*val = ss->rx_stats.ipackets;
3814 		break;
3815 
3816 	default:
3817 		*val = 0;
3818 		return (ENOTSUP);
3819 	}
3820 
3821 	return (0);
3822 }
3823 
3824 /*
3825  * Retrieve a value for one of the statistics for a particular tx ring
3826  */
3827 int
3828 myri10ge_tx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3829 {
3830 	struct myri10ge_slice_state *ss;
3831 
3832 	ss = (struct myri10ge_slice_state *)rh;
3833 	switch (stat) {
3834 	case MAC_STAT_OBYTES:
3835 		*val = ss->tx.stats.obytes;
3836 		break;
3837 
3838 	case MAC_STAT_OPACKETS:
3839 		*val = ss->tx.stats.opackets;
3840 		break;
3841 
3842 	default:
3843 		*val = 0;
3844 		return (ENOTSUP);
3845 	}
3846 
3847 	return (0);
3848 }
3849 
3850 static int
3851 myri10ge_rx_ring_intr_disable(mac_intr_handle_t intrh)
3852 {
3853 	struct myri10ge_slice_state *ss;
3854 
3855 	ss = (struct myri10ge_slice_state *)intrh;
3856 	mutex_enter(&ss->poll_lock);
3857 	ss->rx_polling = B_TRUE;
3858 	mutex_exit(&ss->poll_lock);
3859 	return (0);
3860 }
3861 
3862 static int
3863 myri10ge_rx_ring_intr_enable(mac_intr_handle_t intrh)
3864 {
3865 	struct myri10ge_slice_state *ss;
3866 
3867 	ss = (struct myri10ge_slice_state *)intrh;
3868 	mutex_enter(&ss->poll_lock);
3869 	ss->rx_polling = B_FALSE;
3870 	if (ss->rx_token) {
3871 		*ss->irq_claim = BE_32(3);
3872 		ss->rx_token = 0;
3873 	}
3874 	mutex_exit(&ss->poll_lock);
3875 	return (0);
3876 }
3877 
3878 /*ARGSUSED*/
3879 static void
3880 myri10ge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
3881     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
3882 {
3883 	struct myri10ge_priv *mgp = arg;
3884 	struct myri10ge_slice_state *ss;
3885 	mac_intr_t *mintr = &infop->mri_intr;
3886 
3887 	ASSERT((unsigned int)ring_index < mgp->num_slices);
3888 
3889 	ss = &mgp->ss[ring_index];
3890 	switch (rtype) {
3891 	case MAC_RING_TYPE_RX:
3892 		ss->rx_rh = rh;
3893 		infop->mri_driver = (mac_ring_driver_t)ss;
3894 		infop->mri_start = myri10ge_ring_start;
3895 		infop->mri_stop = NULL;
3896 		infop->mri_poll = myri10ge_poll_rx;
3897 		infop->mri_stat = myri10ge_rx_ring_stat;
3898 		mintr->mi_handle = (mac_intr_handle_t)ss;
3899 		mintr->mi_enable = myri10ge_rx_ring_intr_enable;
3900 		mintr->mi_disable = myri10ge_rx_ring_intr_disable;
3901 		break;
3902 	case MAC_RING_TYPE_TX:
3903 		ss->tx.rh = rh;
3904 		infop->mri_driver = (mac_ring_driver_t)ss;
3905 		infop->mri_start = NULL;
3906 		infop->mri_stop = NULL;
3907 		infop->mri_tx = myri10ge_send_wrapper;
3908 		infop->mri_stat = myri10ge_tx_ring_stat;
3909 		break;
3910 	default:
3911 		break;
3912 	}
3913 }
3914 
3915 static void
3916 myri10ge_nic_stat_destroy(struct myri10ge_priv *mgp)
3917 {
3918 	if (mgp->ksp_stat == NULL)
3919 		return;
3920 
3921 	kstat_delete(mgp->ksp_stat);
3922 	mgp->ksp_stat = NULL;
3923 }
3924 
3925 static void
3926 myri10ge_slice_stat_destroy(struct myri10ge_slice_state *ss)
3927 {
3928 	if (ss->ksp_stat == NULL)
3929 		return;
3930 
3931 	kstat_delete(ss->ksp_stat);
3932 	ss->ksp_stat = NULL;
3933 }
3934 
3935 static void
3936 myri10ge_info_destroy(struct myri10ge_priv *mgp)
3937 {
3938 	if (mgp->ksp_info == NULL)
3939 		return;
3940 
3941 	kstat_delete(mgp->ksp_info);
3942 	mgp->ksp_info = NULL;
3943 }
3944 
3945 static int
3946 myri10ge_nic_stat_kstat_update(kstat_t *ksp, int rw)
3947 {
3948 	struct myri10ge_nic_stat *ethstat;
3949 	struct myri10ge_priv *mgp;
3950 	mcp_irq_data_t *fw_stats;
3951 
3952 
3953 	if (rw == KSTAT_WRITE)
3954 		return (EACCES);
3955 
3956 	ethstat = (struct myri10ge_nic_stat *)ksp->ks_data;
3957 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3958 	fw_stats = mgp->ss[0].fw_stats;
3959 
3960 	ethstat->dma_read_bw_MBs.value.ul = mgp->read_dma;
3961 	ethstat->dma_write_bw_MBs.value.ul = mgp->write_dma;
3962 	ethstat->dma_read_write_bw_MBs.value.ul = mgp->read_write_dma;
3963 	if (myri10ge_tx_dma_attr.dma_attr_flags & DDI_DMA_FORCE_PHYSICAL)
3964 		ethstat->dma_force_physical.value.ul = 1;
3965 	else
3966 		ethstat->dma_force_physical.value.ul = 0;
3967 	ethstat->lanes.value.ul = mgp->pcie_link_width;
3968 	ethstat->dropped_bad_crc32.value.ul =
3969 	    ntohl(fw_stats->dropped_bad_crc32);
3970 	ethstat->dropped_bad_phy.value.ul =
3971 	    ntohl(fw_stats->dropped_bad_phy);
3972 	ethstat->dropped_link_error_or_filtered.value.ul =
3973 	    ntohl(fw_stats->dropped_link_error_or_filtered);
3974 	ethstat->dropped_link_overflow.value.ul =
3975 	    ntohl(fw_stats->dropped_link_overflow);
3976 	ethstat->dropped_multicast_filtered.value.ul =
3977 	    ntohl(fw_stats->dropped_multicast_filtered);
3978 	ethstat->dropped_no_big_buffer.value.ul =
3979 	    ntohl(fw_stats->dropped_no_big_buffer);
3980 	ethstat->dropped_no_small_buffer.value.ul =
3981 	    ntohl(fw_stats->dropped_no_small_buffer);
3982 	ethstat->dropped_overrun.value.ul =
3983 	    ntohl(fw_stats->dropped_overrun);
3984 	ethstat->dropped_pause.value.ul =
3985 	    ntohl(fw_stats->dropped_pause);
3986 	ethstat->dropped_runt.value.ul =
3987 	    ntohl(fw_stats->dropped_runt);
3988 	ethstat->link_up.value.ul =
3989 	    ntohl(fw_stats->link_up);
3990 	ethstat->dropped_unicast_filtered.value.ul =
3991 	    ntohl(fw_stats->dropped_unicast_filtered);
3992 	return (0);
3993 }
3994 
3995 static int
3996 myri10ge_slice_stat_kstat_update(kstat_t *ksp, int rw)
3997 {
3998 	struct myri10ge_slice_stat *ethstat;
3999 	struct myri10ge_slice_state *ss;
4000 
4001 	if (rw == KSTAT_WRITE)
4002 		return (EACCES);
4003 
4004 	ethstat = (struct myri10ge_slice_stat *)ksp->ks_data;
4005 	ss = (struct myri10ge_slice_state *)ksp->ks_private;
4006 
4007 	ethstat->rx_big.value.ul = ss->j_rx_cnt;
4008 	ethstat->rx_bigbuf_firmware.value.ul = ss->rx_big.cnt - ss->j_rx_cnt;
4009 	ethstat->rx_bigbuf_pool.value.ul =
4010 	    ss->jpool.num_alloc - ss->jbufs_for_smalls;
4011 	ethstat->rx_bigbuf_smalls.value.ul = ss->jbufs_for_smalls;
4012 	ethstat->rx_small.value.ul = ss->rx_small.cnt -
4013 	    (ss->rx_small.mask + 1);
4014 	ethstat->tx_done.value.ul = ss->tx.done;
4015 	ethstat->tx_req.value.ul = ss->tx.req;
4016 	ethstat->tx_activate.value.ul = ss->tx.activate;
4017 	ethstat->xmit_sched.value.ul = ss->tx.sched;
4018 	ethstat->xmit_stall.value.ul = ss->tx.stall;
4019 	ethstat->xmit_stall_early.value.ul = ss->tx.stall_early;
4020 	ethstat->xmit_stall_late.value.ul = ss->tx.stall_late;
4021 	ethstat->xmit_err.value.ul =  MYRI10GE_SLICE_STAT(xmit_err);
4022 	return (0);
4023 }
4024 
4025 static int
4026 myri10ge_info_kstat_update(kstat_t *ksp, int rw)
4027 {
4028 	struct myri10ge_info *info;
4029 	struct myri10ge_priv *mgp;
4030 
4031 
4032 	if (rw == KSTAT_WRITE)
4033 		return (EACCES);
4034 
4035 	info = (struct myri10ge_info *)ksp->ks_data;
4036 	mgp = (struct myri10ge_priv *)ksp->ks_private;
4037 	kstat_named_setstr(&info->driver_version, MYRI10GE_VERSION_STR);
4038 	kstat_named_setstr(&info->firmware_version, mgp->fw_version);
4039 	kstat_named_setstr(&info->firmware_name, mgp->fw_name);
4040 	kstat_named_setstr(&info->interrupt_type, mgp->intr_type);
4041 	kstat_named_setstr(&info->product_code, mgp->pc_str);
4042 	kstat_named_setstr(&info->serial_number, mgp->sn_str);
4043 	return (0);
4044 }
4045 
/*
 * Backing storage for the "myri10ge_info" kstat: six string-typed named
 * entries.  myri10ge_info_init() installs this object as ks_data, and
 * myri10ge_info_kstat_update() points the entries at the current
 * strings.
 */
4046 static struct myri10ge_info myri10ge_info_template = {
4047 	{ "driver_version",	KSTAT_DATA_STRING },
4048 	{ "firmware_version",	KSTAT_DATA_STRING },
4049 	{ "firmware_name",	KSTAT_DATA_STRING },
4050 	{ "interrupt_type",	KSTAT_DATA_STRING },
4051 	{ "product_code",	KSTAT_DATA_STRING },
4052 	{ "serial_number",	KSTAT_DATA_STRING },
4053 };
/* Installed as ks_lock of the info kstat by myri10ge_info_init(). */
4054 static kmutex_t myri10ge_info_template_lock;
4055 
4056 
4057 static int
4058 myri10ge_info_init(struct myri10ge_priv *mgp)
4059 {
4060 	struct kstat *ksp;
4061 
4062 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4063 	    "myri10ge_info", "net", KSTAT_TYPE_NAMED,
4064 	    sizeof (myri10ge_info_template) /
4065 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4066 	if (ksp == NULL) {
4067 		cmn_err(CE_WARN,
4068 		    "%s: myri10ge_info_init: kstat_create failed", mgp->name);
4069 		return (DDI_FAILURE);
4070 	}
4071 	mgp->ksp_info = ksp;
4072 	ksp->ks_update = myri10ge_info_kstat_update;
4073 	ksp->ks_private = (void *) mgp;
4074 	ksp->ks_data = &myri10ge_info_template;
4075 	ksp->ks_lock = &myri10ge_info_template_lock;
4076 	if (MYRI10GE_VERSION_STR != NULL)
4077 		ksp->ks_data_size += strlen(MYRI10GE_VERSION_STR) + 1;
4078 	if (mgp->fw_version != NULL)
4079 		ksp->ks_data_size += strlen(mgp->fw_version) + 1;
4080 	ksp->ks_data_size += strlen(mgp->fw_name) + 1;
4081 	ksp->ks_data_size += strlen(mgp->intr_type) + 1;
4082 	if (mgp->pc_str != NULL)
4083 		ksp->ks_data_size += strlen(mgp->pc_str) + 1;
4084 	if (mgp->sn_str != NULL)
4085 		ksp->ks_data_size += strlen(mgp->sn_str) + 1;
4086 
4087 	kstat_install(ksp);
4088 	return (DDI_SUCCESS);
4089 }
4090 
4091 
4092 static int
4093 myri10ge_nic_stat_init(struct myri10ge_priv *mgp)
4094 {
4095 	struct kstat *ksp;
4096 	struct myri10ge_nic_stat *ethstat;
4097 
4098 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4099 	    "myri10ge_nic_stats", "net", KSTAT_TYPE_NAMED,
4100 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4101 	if (ksp == NULL) {
4102 		cmn_err(CE_WARN,
4103 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4104 		return (DDI_FAILURE);
4105 	}
4106 	mgp->ksp_stat = ksp;
4107 	ethstat = (struct myri10ge_nic_stat *)(ksp->ks_data);
4108 
4109 	kstat_named_init(&ethstat->dma_read_bw_MBs,
4110 	    "dma_read_bw_MBs", KSTAT_DATA_ULONG);
4111 	kstat_named_init(&ethstat->dma_write_bw_MBs,
4112 	    "dma_write_bw_MBs", KSTAT_DATA_ULONG);
4113 	kstat_named_init(&ethstat->dma_read_write_bw_MBs,
4114 	    "dma_read_write_bw_MBs", KSTAT_DATA_ULONG);
4115 	kstat_named_init(&ethstat->dma_force_physical,
4116 	    "dma_force_physical", KSTAT_DATA_ULONG);
4117 	kstat_named_init(&ethstat->lanes,
4118 	    "lanes", KSTAT_DATA_ULONG);
4119 	kstat_named_init(&ethstat->dropped_bad_crc32,
4120 	    "dropped_bad_crc32", KSTAT_DATA_ULONG);
4121 	kstat_named_init(&ethstat->dropped_bad_phy,
4122 	    "dropped_bad_phy", KSTAT_DATA_ULONG);
4123 	kstat_named_init(&ethstat->dropped_link_error_or_filtered,
4124 	    "dropped_link_error_or_filtered", KSTAT_DATA_ULONG);
4125 	kstat_named_init(&ethstat->dropped_link_overflow,
4126 	    "dropped_link_overflow", KSTAT_DATA_ULONG);
4127 	kstat_named_init(&ethstat->dropped_multicast_filtered,
4128 	    "dropped_multicast_filtered", KSTAT_DATA_ULONG);
4129 	kstat_named_init(&ethstat->dropped_no_big_buffer,
4130 	    "dropped_no_big_buffer", KSTAT_DATA_ULONG);
4131 	kstat_named_init(&ethstat->dropped_no_small_buffer,
4132 	    "dropped_no_small_buffer", KSTAT_DATA_ULONG);
4133 	kstat_named_init(&ethstat->dropped_overrun,
4134 	    "dropped_overrun", KSTAT_DATA_ULONG);
4135 	kstat_named_init(&ethstat->dropped_pause,
4136 	    "dropped_pause", KSTAT_DATA_ULONG);
4137 	kstat_named_init(&ethstat->dropped_runt,
4138 	    "dropped_runt", KSTAT_DATA_ULONG);
4139 	kstat_named_init(&ethstat->dropped_unicast_filtered,
4140 	    "dropped_unicast_filtered", KSTAT_DATA_ULONG);
4141 	kstat_named_init(&ethstat->dropped_runt, "dropped_runt",
4142 	    KSTAT_DATA_ULONG);
4143 	kstat_named_init(&ethstat->link_up, "link_up", KSTAT_DATA_ULONG);
4144 	kstat_named_init(&ethstat->link_changes, "link_changes",
4145 	    KSTAT_DATA_ULONG);
4146 	ksp->ks_update = myri10ge_nic_stat_kstat_update;
4147 	ksp->ks_private = (void *) mgp;
4148 	kstat_install(ksp);
4149 	return (DDI_SUCCESS);
4150 }
4151 
4152 static int
4153 myri10ge_slice_stat_init(struct myri10ge_slice_state *ss)
4154 {
4155 	struct myri10ge_priv *mgp = ss->mgp;
4156 	struct kstat *ksp;
4157 	struct myri10ge_slice_stat *ethstat;
4158 	int instance;
4159 
4160 	/*
4161 	 * fake an instance so that the same slice numbers from
4162 	 * different instances do not collide
4163 	 */
4164 	instance = (ddi_get_instance(mgp->dip) * 1000) +  (int)(ss - mgp->ss);
4165 	ksp = kstat_create("myri10ge", instance,
4166 	    "myri10ge_slice_stats", "net", KSTAT_TYPE_NAMED,
4167 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4168 	if (ksp == NULL) {
4169 		cmn_err(CE_WARN,
4170 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4171 		return (DDI_FAILURE);
4172 	}
4173 	ss->ksp_stat = ksp;
4174 	ethstat = (struct myri10ge_slice_stat *)(ksp->ks_data);
4175 	kstat_named_init(&ethstat->lro_bad_csum, "lro_bad_csum",
4176 	    KSTAT_DATA_ULONG);
4177 	kstat_named_init(&ethstat->lro_flushed, "lro_flushed",
4178 	    KSTAT_DATA_ULONG);
4179 	kstat_named_init(&ethstat->lro_queued, "lro_queued",
4180 	    KSTAT_DATA_ULONG);
4181 	kstat_named_init(&ethstat->rx_bigbuf_firmware, "rx_bigbuf_firmware",
4182 	    KSTAT_DATA_ULONG);
4183 	kstat_named_init(&ethstat->rx_bigbuf_pool, "rx_bigbuf_pool",
4184 	    KSTAT_DATA_ULONG);
4185 	kstat_named_init(&ethstat->rx_bigbuf_smalls, "rx_bigbuf_smalls",
4186 	    KSTAT_DATA_ULONG);
4187 	kstat_named_init(&ethstat->rx_copy, "rx_copy",
4188 	    KSTAT_DATA_ULONG);
4189 	kstat_named_init(&ethstat->rx_big_nobuf, "rx_big_nobuf",
4190 	    KSTAT_DATA_ULONG);
4191 	kstat_named_init(&ethstat->rx_small_nobuf, "rx_small_nobuf",
4192 	    KSTAT_DATA_ULONG);
4193 	kstat_named_init(&ethstat->xmit_zero_len, "xmit_zero_len",
4194 	    KSTAT_DATA_ULONG);
4195 	kstat_named_init(&ethstat->xmit_pullup, "xmit_pullup",
4196 	    KSTAT_DATA_ULONG);
4197 	kstat_named_init(&ethstat->xmit_pullup_first, "xmit_pullup_first",
4198 	    KSTAT_DATA_ULONG);
4199 	kstat_named_init(&ethstat->xmit_lowbuf, "xmit_lowbuf",
4200 	    KSTAT_DATA_ULONG);
4201 	kstat_named_init(&ethstat->xmit_lsobadflags, "xmit_lsobadflags",
4202 	    KSTAT_DATA_ULONG);
4203 	kstat_named_init(&ethstat->xmit_sched, "xmit_sched",
4204 	    KSTAT_DATA_ULONG);
4205 	kstat_named_init(&ethstat->xmit_stall, "xmit_stall",
4206 	    KSTAT_DATA_ULONG);
4207 	kstat_named_init(&ethstat->xmit_stall_early, "xmit_stall_early",
4208 	    KSTAT_DATA_ULONG);
4209 	kstat_named_init(&ethstat->xmit_stall_late, "xmit_stall_late",
4210 	    KSTAT_DATA_ULONG);
4211 	kstat_named_init(&ethstat->xmit_err, "xmit_err",
4212 	    KSTAT_DATA_ULONG);
4213 	kstat_named_init(&ethstat->tx_req, "tx_req",
4214 	    KSTAT_DATA_ULONG);
4215 	kstat_named_init(&ethstat->tx_activate, "tx_activate",
4216 	    KSTAT_DATA_ULONG);
4217 	kstat_named_init(&ethstat->tx_done, "tx_done",
4218 	    KSTAT_DATA_ULONG);
4219 	kstat_named_init(&ethstat->tx_handles_alloced, "tx_handles_alloced",
4220 	    KSTAT_DATA_ULONG);
4221 	kstat_named_init(&ethstat->rx_big, "rx_big",
4222 	    KSTAT_DATA_ULONG);
4223 	kstat_named_init(&ethstat->rx_small, "rx_small",
4224 	    KSTAT_DATA_ULONG);
4225 	ksp->ks_update = myri10ge_slice_stat_kstat_update;
4226 	ksp->ks_private = (void *) ss;
4227 	kstat_install(ksp);
4228 	return (DDI_SUCCESS);
4229 }
4230 
4231 
4232 
4233 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4234 
4235 #include <vm/hat.h>
4236 #include <sys/ddi_isa.h>
4237 void *device_arena_alloc(size_t size, int vm_flag);
4238 void device_arena_free(void *vaddr, size_t size);
4239 
4240 static void
4241 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4242 {
4243 	dev_info_t *parent_dip;
4244 	ddi_acc_handle_t handle;
4245 	unsigned long bus_number, dev_number, func_number;
4246 	unsigned long cfg_pa, paddr, base, pgoffset;
4247 	char 		*cvaddr, *ptr;
4248 	uint32_t	*ptr32;
4249 	int 		retval = DDI_FAILURE;
4250 	int dontcare;
4251 	uint16_t read_vid, read_did, vendor_id, device_id;
4252 
4253 	if (!myri10ge_nvidia_ecrc_enable)
4254 		return;
4255 
4256 	parent_dip = ddi_get_parent(mgp->dip);
4257 	if (parent_dip == NULL) {
4258 		cmn_err(CE_WARN, "%s: I'm an orphan?", mgp->name);
4259 		return;
4260 	}
4261 
4262 	if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
4263 		cmn_err(CE_WARN,
4264 		    "%s: Could not access my parent's registers", mgp->name);
4265 		return;
4266 	}
4267 
4268 	vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
4269 	device_id = pci_config_get16(handle, PCI_CONF_DEVID);
4270 	pci_config_teardown(&handle);
4271 
4272 	if (myri10ge_verbose) {
4273 		unsigned long 	bus_number, dev_number, func_number;
4274 		int 		reg_set, span;
4275 		(void) myri10ge_reg_set(parent_dip, &reg_set, &span,
4276 		    &bus_number, &dev_number, &func_number);
4277 		if (myri10ge_verbose)
4278 			printf("%s: parent at %ld:%ld:%ld\n", mgp->name,
4279 			    bus_number, dev_number, func_number);
4280 	}
4281 
4282 	if (vendor_id !=  0x10de)
4283 		return;
4284 
4285 	if (device_id != 0x005d /* CK804 */ &&
4286 	    (device_id < 0x374 || device_id > 0x378) /* MCP55 */) {
4287 		return;
4288 	}
4289 	(void) myri10ge_reg_set(parent_dip, &dontcare, &dontcare,
4290 	    &bus_number, &dev_number, &func_number);
4291 
4292 	for (cfg_pa = 0xf0000000UL;
4293 	    retval != DDI_SUCCESS && cfg_pa >= 0xe0000000UL;
4294 	    cfg_pa -= 0x10000000UL) {
4295 		/* find the config space address for the nvidia bridge */
4296 		paddr = (cfg_pa + bus_number * 0x00100000UL +
4297 		    (dev_number * 8 + func_number) * 0x00001000UL);
4298 
4299 		base = paddr & (~MMU_PAGEOFFSET);
4300 		pgoffset = paddr & MMU_PAGEOFFSET;
4301 
4302 		/* map it into the kernel */
4303 		cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
4304 		if (cvaddr == NULL)
4305 			cmn_err(CE_WARN, "%s: failed to map nf4: cvaddr\n",
4306 			    mgp->name);
4307 
4308 		hat_devload(kas.a_hat, cvaddr, mmu_ptob(1),
4309 		    i_ddi_paddr_to_pfn(base),
4310 		    PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
4311 
4312 		ptr = cvaddr + pgoffset;
4313 		read_vid = *(uint16_t *)(void *)(ptr + PCI_CONF_VENID);
4314 		read_did = *(uint16_t *)(void *)(ptr + PCI_CONF_DEVID);
4315 		if (vendor_id ==  read_did || device_id == read_did) {
4316 			ptr32 = (uint32_t *)(void *)(ptr + 0x178);
4317 			if (myri10ge_verbose)
4318 				printf("%s: Enabling ECRC on upstream "
4319 				    "Nvidia bridge (0x%x:0x%x) "
4320 				    "at %ld:%ld:%ld\n", mgp->name,
4321 				    read_vid, read_did, bus_number,
4322 				    dev_number, func_number);
4323 			*ptr32 |= 0x40;
4324 			retval = DDI_SUCCESS;
4325 		}
4326 		hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
4327 		device_arena_free(cvaddr, ptob(1));
4328 	}
4329 }
4330 
4331 #else
4332 /*ARGSUSED*/
4333 static void
4334 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4335 {
4336 }
4337 #endif /* i386 */
4338 
4339 
4340 /*
4341  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
4342  * when the PCI-E Completion packets are aligned on an 8-byte
4343  * boundary.  Some PCI-E chip sets always align Completion packets; on
4344  * the ones that do not, the alignment can be enforced by enabling
4345  * ECRC generation (if supported).
4346  *
4347  * When PCI-E Completion packets are not aligned, it is actually more
4348  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
4349  *
4350  * If the driver can neither enable ECRC nor verify that it has
4351  * already been enabled, then it must use a firmware image which works
4352  * around unaligned completion packets (ethp_z8e.dat), and it should
4353  * also ensure that it never gives the device a Read-DMA which is
4354  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
4355  * enabled, then the driver should use the aligned (eth_z8e.dat)
4356  * firmware image, and set tx.boundary to 4KB.
4357  */
4358 
4359 
4360 static int
4361 myri10ge_firmware_probe(struct myri10ge_priv *mgp)
4362 {
4363 	int status;
4364 
4365 	mgp->tx_boundary = 4096;
4366 	/*
4367 	 * Verify the max read request size was set to 4KB
4368 	 * before trying the test with 4KB.
4369 	 */
4370 	if (mgp->max_read_request_4k == 0)
4371 		mgp->tx_boundary = 2048;
4372 	/*
4373 	 * load the optimized firmware which assumes aligned PCIe
4374 	 * completions in order to see if it works on this host.
4375 	 */
4376 
4377 	mgp->fw_name = "rss_eth_z8e";
4378 	mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4379 	mgp->eth_z8e_length = rss_eth_z8e_length;
4380 
4381 	status = myri10ge_load_firmware(mgp);
4382 	if (status != 0) {
4383 		return (status);
4384 	}
4385 	/*
4386 	 * Enable ECRC if possible
4387 	 */
4388 	myri10ge_enable_nvidia_ecrc(mgp);
4389 
4390 	/*
4391 	 * Run a DMA test which watches for unaligned completions and
4392 	 * aborts on the first one seen.
4393 	 */
4394 	status = myri10ge_dma_test(mgp, MXGEFW_CMD_UNALIGNED_TEST);
4395 	if (status == 0)
4396 		return (0); /* keep the aligned firmware */
4397 
4398 	if (status != E2BIG)
4399 		cmn_err(CE_WARN, "%s: DMA test failed: %d\n",
4400 		    mgp->name, status);
4401 	if (status == ENOSYS)
4402 		cmn_err(CE_WARN, "%s: Falling back to ethp! "
4403 		    "Please install up to date fw\n", mgp->name);
4404 	return (status);
4405 }
4406 
4407 static int
4408 myri10ge_select_firmware(struct myri10ge_priv *mgp)
4409 {
4410 	int aligned;
4411 
4412 	aligned = 0;
4413 
4414 	if (myri10ge_force_firmware == 1) {
4415 		if (myri10ge_verbose)
4416 			printf("%s: Assuming aligned completions (forced)\n",
4417 			    mgp->name);
4418 		aligned = 1;
4419 		goto done;
4420 	}
4421 
4422 	if (myri10ge_force_firmware == 2) {
4423 		if (myri10ge_verbose)
4424 			printf("%s: Assuming unaligned completions (forced)\n",
4425 			    mgp->name);
4426 		aligned = 0;
4427 		goto done;
4428 	}
4429 
4430 	/* If the width is less than 8, we may used the aligned firmware */
4431 	if (mgp->pcie_link_width != 0 && mgp->pcie_link_width < 8) {
4432 		cmn_err(CE_WARN, "!%s: PCIe link running at x%d\n",
4433 		    mgp->name, mgp->pcie_link_width);
4434 		aligned = 1;
4435 		goto done;
4436 	}
4437 
4438 	if (0 == myri10ge_firmware_probe(mgp))
4439 		return (0);  /* keep optimized firmware */
4440 
4441 done:
4442 	if (aligned) {
4443 		mgp->fw_name = "rss_eth_z8e";
4444 		mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4445 		mgp->eth_z8e_length = rss_eth_z8e_length;
4446 		mgp->tx_boundary = 4096;
4447 	} else {
4448 		mgp->fw_name = "rss_ethp_z8e";
4449 		mgp->eth_z8e = (unsigned char *)rss_ethp_z8e;
4450 		mgp->eth_z8e_length = rss_ethp_z8e_length;
4451 		mgp->tx_boundary = 2048;
4452 	}
4453 
4454 	return (myri10ge_load_firmware(mgp));
4455 }
4456 
4457 static int
4458 myri10ge_add_intrs(struct myri10ge_priv *mgp, int add_handler)
4459 {
4460 	dev_info_t *devinfo = mgp->dip;
4461 	int count, avail, actual, intr_types;
4462 	int x, y, rc, inum = 0;
4463 
4464 
4465 	rc = ddi_intr_get_supported_types(devinfo, &intr_types);
4466 	if (rc != DDI_SUCCESS) {
4467 		cmn_err(CE_WARN,
4468 		    "!%s: ddi_intr_get_nintrs() failure, rc = %d\n", mgp->name,
4469 		    rc);
4470 		return (DDI_FAILURE);
4471 	}
4472 
4473 	if (!myri10ge_use_msi)
4474 		intr_types &= ~DDI_INTR_TYPE_MSI;
4475 	if (!myri10ge_use_msix)
4476 		intr_types &= ~DDI_INTR_TYPE_MSIX;
4477 
4478 	if (intr_types & DDI_INTR_TYPE_MSIX) {
4479 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSIX;
4480 		mgp->intr_type = "MSI-X";
4481 	} else if (intr_types & DDI_INTR_TYPE_MSI) {
4482 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSI;
4483 		mgp->intr_type = "MSI";
4484 	} else {
4485 		mgp->ddi_intr_type = DDI_INTR_TYPE_FIXED;
4486 		mgp->intr_type = "Legacy";
4487 	}
4488 	/* Get number of interrupts */
4489 	rc = ddi_intr_get_nintrs(devinfo, mgp->ddi_intr_type, &count);
4490 	if ((rc != DDI_SUCCESS) || (count == 0)) {
4491 		cmn_err(CE_WARN, "%s: ddi_intr_get_nintrs() failure, rc: %d, "
4492 		    "count: %d", mgp->name, rc, count);
4493 
4494 		return (DDI_FAILURE);
4495 	}
4496 
4497 	/* Get number of available interrupts */
4498 	rc = ddi_intr_get_navail(devinfo, mgp->ddi_intr_type, &avail);
4499 	if ((rc != DDI_SUCCESS) || (avail == 0)) {
4500 		cmn_err(CE_WARN, "%s: ddi_intr_get_navail() failure, "
4501 		    "rc: %d, avail: %d\n", mgp->name, rc, avail);
4502 		return (DDI_FAILURE);
4503 	}
4504 	if (avail < count) {
4505 		cmn_err(CE_NOTE,
4506 		    "!%s: nintrs() returned %d, navail returned %d",
4507 		    mgp->name, count, avail);
4508 		count = avail;
4509 	}
4510 
4511 	if (count < mgp->num_slices)
4512 		return (DDI_FAILURE);
4513 
4514 	if (count > mgp->num_slices)
4515 		count = mgp->num_slices;
4516 
4517 	/* Allocate memory for MSI interrupts */
4518 	mgp->intr_size = count * sizeof (ddi_intr_handle_t);
4519 	mgp->htable = kmem_alloc(mgp->intr_size, KM_SLEEP);
4520 
4521 	rc = ddi_intr_alloc(devinfo, mgp->htable, mgp->ddi_intr_type, inum,
4522 	    count, &actual, DDI_INTR_ALLOC_NORMAL);
4523 
4524 	if ((rc != DDI_SUCCESS) || (actual == 0)) {
4525 		cmn_err(CE_WARN, "%s: ddi_intr_alloc() failed: %d",
4526 		    mgp->name, rc);
4527 
4528 		kmem_free(mgp->htable, mgp->intr_size);
4529 		mgp->htable = NULL;
4530 		return (DDI_FAILURE);
4531 	}
4532 
4533 	if ((actual < count) && myri10ge_verbose) {
4534 		cmn_err(CE_NOTE, "%s: got %d/%d slices",
4535 		    mgp->name, actual, count);
4536 	}
4537 
4538 	mgp->intr_cnt = actual;
4539 
4540 	/*
4541 	 * Get priority for first irq, assume remaining are all the same
4542 	 */
4543 	if (ddi_intr_get_pri(mgp->htable[0], &mgp->intr_pri)
4544 	    != DDI_SUCCESS) {
4545 		cmn_err(CE_WARN, "%s: ddi_intr_get_pri() failed", mgp->name);
4546 
4547 		/* Free already allocated intr */
4548 		for (y = 0; y < actual; y++) {
4549 			(void) ddi_intr_free(mgp->htable[y]);
4550 		}
4551 
4552 		kmem_free(mgp->htable, mgp->intr_size);
4553 		mgp->htable = NULL;
4554 		return (DDI_FAILURE);
4555 	}
4556 
4557 	mgp->icookie = (void *)(uintptr_t)mgp->intr_pri;
4558 
4559 	if (!add_handler)
4560 		return (DDI_SUCCESS);
4561 
4562 	/* Call ddi_intr_add_handler() */
4563 	for (x = 0; x < actual; x++) {
4564 		if (ddi_intr_add_handler(mgp->htable[x], myri10ge_intr,
4565 		    (caddr_t)&mgp->ss[x], NULL) != DDI_SUCCESS) {
4566 			cmn_err(CE_WARN, "%s: ddi_intr_add_handler() failed",
4567 			    mgp->name);
4568 
4569 			/* Free already allocated intr */
4570 			for (y = 0; y < actual; y++) {
4571 				(void) ddi_intr_free(mgp->htable[y]);
4572 			}
4573 
4574 			kmem_free(mgp->htable, mgp->intr_size);
4575 			mgp->htable = NULL;
4576 			return (DDI_FAILURE);
4577 		}
4578 	}
4579 
4580 	(void) ddi_intr_get_cap(mgp->htable[0], &mgp->intr_cap);
4581 	if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4582 		/* Call ddi_intr_block_enable() for MSI */
4583 		(void) ddi_intr_block_enable(mgp->htable, mgp->intr_cnt);
4584 	} else {
4585 		/* Call ddi_intr_enable() for MSI non block enable */
4586 		for (x = 0; x < mgp->intr_cnt; x++) {
4587 			(void) ddi_intr_enable(mgp->htable[x]);
4588 		}
4589 	}
4590 
4591 	return (DDI_SUCCESS);
4592 }
4593 
4594 static void
4595 myri10ge_rem_intrs(struct myri10ge_priv *mgp, int handler_installed)
4596 {
4597 	int x, err;
4598 
4599 	/* Disable all interrupts */
4600 	if (handler_installed) {
4601 		if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4602 			/* Call ddi_intr_block_disable() */
4603 			(void) ddi_intr_block_disable(mgp->htable,
4604 			    mgp->intr_cnt);
4605 		} else {
4606 			for (x = 0; x < mgp->intr_cnt; x++) {
4607 				(void) ddi_intr_disable(mgp->htable[x]);
4608 			}
4609 		}
4610 	}
4611 
4612 	for (x = 0; x < mgp->intr_cnt; x++) {
4613 		if (handler_installed) {
4614 		/* Call ddi_intr_remove_handler() */
4615 			err = ddi_intr_remove_handler(mgp->htable[x]);
4616 			if (err != DDI_SUCCESS) {
4617 				cmn_err(CE_WARN,
4618 				    "%s: ddi_intr_remove_handler for"
4619 				    "vec %d returned %d\n", mgp->name,
4620 				    x, err);
4621 			}
4622 		}
4623 		err = ddi_intr_free(mgp->htable[x]);
4624 		if (err != DDI_SUCCESS) {
4625 			cmn_err(CE_WARN,
4626 			    "%s: ddi_intr_free for vec %d returned %d\n",
4627 			    mgp->name, x, err);
4628 		}
4629 	}
4630 	kmem_free(mgp->htable, mgp->intr_size);
4631 	mgp->htable = NULL;
4632 }
4633 
4634 static void
4635 myri10ge_test_physical(dev_info_t *dip)
4636 {
4637 	ddi_dma_handle_t	handle;
4638 	struct myri10ge_dma_stuff dma;
4639 	void *addr;
4640 	int err;
4641 
4642 	/* test #1, sufficient for older sparc systems */
4643 	myri10ge_tx_dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
4644 	err = ddi_dma_alloc_handle(dip, &myri10ge_tx_dma_attr,
4645 	    DDI_DMA_DONTWAIT, NULL, &handle);
4646 	if (err == DDI_DMA_BADATTR)
4647 		goto fail;
4648 	ddi_dma_free_handle(&handle);
4649 
4650 	/* test #2, required on Olympis where the bind is what fails */
4651 	addr = myri10ge_dma_alloc(dip, 128, &myri10ge_tx_dma_attr,
4652 	    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
4653 	    DDI_DMA_WRITE|DDI_DMA_STREAMING, &dma, 0, DDI_DMA_DONTWAIT);
4654 	if (addr == NULL)
4655 		goto fail;
4656 	myri10ge_dma_free(&dma);
4657 	return;
4658 
4659 fail:
4660 	if (myri10ge_verbose)
4661 		printf("myri10ge%d: DDI_DMA_FORCE_PHYSICAL failed, "
4662 		    "using IOMMU\n", ddi_get_instance(dip));
4663 
4664 	myri10ge_tx_dma_attr.dma_attr_flags &= ~DDI_DMA_FORCE_PHYSICAL;
4665 }
4666 
4667 static void
4668 myri10ge_get_props(dev_info_t *dip)
4669 {
4670 
4671 	myri10ge_flow_control =  ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4672 	    "myri10ge_flow_control", myri10ge_flow_control);
4673 
4674 	myri10ge_intr_coal_delay = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4675 	    "myri10ge_intr_coal_delay", myri10ge_intr_coal_delay);
4676 
4677 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4678 	myri10ge_nvidia_ecrc_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4679 	    "myri10ge_nvidia_ecrc_enable", 1);
4680 #endif
4681 
4682 
4683 	myri10ge_use_msi = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4684 	    "myri10ge_use_msi", myri10ge_use_msi);
4685 
4686 	myri10ge_deassert_wait = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4687 	    "myri10ge_deassert_wait",  myri10ge_deassert_wait);
4688 
4689 	myri10ge_verbose = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4690 	    "myri10ge_verbose", myri10ge_verbose);
4691 
4692 	myri10ge_tx_copylen = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4693 	    "myri10ge_tx_copylen", myri10ge_tx_copylen);
4694 
4695 	if (myri10ge_tx_copylen < 60) {
4696 		cmn_err(CE_WARN,
4697 		    "myri10ge_tx_copylen must be >= 60 bytes\n");
4698 		myri10ge_tx_copylen = 60;
4699 	}
4700 
4701 	myri10ge_mtu_override = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4702 	    "myri10ge_mtu_override", myri10ge_mtu_override);
4703 
4704 	if (myri10ge_mtu_override >= MYRI10GE_MIN_GLD_MTU &&
4705 	    myri10ge_mtu_override <= MYRI10GE_MAX_GLD_MTU)
4706 		myri10ge_mtu = myri10ge_mtu_override +
4707 		    sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ;
4708 	else if (myri10ge_mtu_override != 0) {
4709 		cmn_err(CE_WARN,
4710 		    "myri10ge_mtu_override must be between 1500 and "
4711 		    "9000 bytes\n");
4712 	}
4713 
4714 	myri10ge_bigbufs_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4715 	    "myri10ge_bigbufs_initial", myri10ge_bigbufs_initial);
4716 	myri10ge_bigbufs_max = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4717 	    "myri10ge_bigbufs_max", myri10ge_bigbufs_max);
4718 
4719 	myri10ge_watchdog_reset = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4720 	    "myri10ge_watchdog_reset", myri10ge_watchdog_reset);
4721 
4722 	if (myri10ge_bigbufs_initial < 128) {
4723 		cmn_err(CE_WARN,
4724 		    "myri10ge_bigbufs_initial be at least 128\n");
4725 		myri10ge_bigbufs_initial = 128;
4726 	}
4727 	if (myri10ge_bigbufs_max < 128) {
4728 		cmn_err(CE_WARN,
4729 		    "myri10ge_bigbufs_max be at least 128\n");
4730 		myri10ge_bigbufs_max = 128;
4731 	}
4732 
4733 	if (myri10ge_bigbufs_max < myri10ge_bigbufs_initial) {
4734 		cmn_err(CE_WARN,
4735 		    "myri10ge_bigbufs_max must be >=  "
4736 		    "myri10ge_bigbufs_initial\n");
4737 		myri10ge_bigbufs_max = myri10ge_bigbufs_initial;
4738 	}
4739 
4740 	myri10ge_force_firmware = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4741 	    "myri10ge_force_firmware", myri10ge_force_firmware);
4742 
4743 	myri10ge_max_slices = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4744 	    "myri10ge_max_slices", myri10ge_max_slices);
4745 
4746 	myri10ge_use_msix = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4747 	    "myri10ge_use_msix", myri10ge_use_msix);
4748 
4749 	myri10ge_rss_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4750 	    "myri10ge_rss_hash", myri10ge_rss_hash);
4751 
4752 	if (myri10ge_rss_hash > MXGEFW_RSS_HASH_TYPE_MAX ||
4753 	    myri10ge_rss_hash < MXGEFW_RSS_HASH_TYPE_IPV4) {
4754 		cmn_err(CE_WARN, "myri10ge: Illegal rssh hash type %d\n",
4755 		    myri10ge_rss_hash);
4756 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4757 	}
4758 	myri10ge_lro = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4759 	    "myri10ge_lro", myri10ge_lro);
4760 	myri10ge_lro_cnt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4761 	    "myri10ge_lro_cnt", myri10ge_lro_cnt);
4762 	myri10ge_lro_max_aggr = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4763 	    "myri10ge_lro_max_aggr", myri10ge_lro_max_aggr);
4764 	myri10ge_tx_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4765 	    "myri10ge_tx_hash", myri10ge_tx_hash);
4766 	myri10ge_use_lso = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4767 	    "myri10ge_use_lso", myri10ge_use_lso);
4768 	myri10ge_lso_copy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4769 	    "myri10ge_lso_copy", myri10ge_lso_copy);
4770 	myri10ge_tx_handles_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4771 	    "myri10ge_tx_handles_initial", myri10ge_tx_handles_initial);
4772 	myri10ge_small_bytes = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4773 	    "myri10ge_small_bytes", myri10ge_small_bytes);
4774 	if ((myri10ge_small_bytes + MXGEFW_PAD) & (128 -1)) {
4775 		cmn_err(CE_WARN, "myri10ge: myri10ge_small_bytes (%d)\n",
4776 		    myri10ge_small_bytes);
4777 		cmn_err(CE_WARN, "must be aligned on 128b bndry -2\n");
4778 		myri10ge_small_bytes += 128;
4779 		myri10ge_small_bytes &= ~(128 -1);
4780 		myri10ge_small_bytes -= MXGEFW_PAD;
4781 		cmn_err(CE_WARN, "rounded up to %d\n",
4782 		    myri10ge_small_bytes);
4783 
4784 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4785 	}
4786 }
4787 
4788 #ifndef	PCI_EXP_LNKSTA
4789 #define	PCI_EXP_LNKSTA 18
4790 #endif
4791 
4792 static int
4793 myri10ge_find_cap(ddi_acc_handle_t handle, uint8_t *capptr, uint8_t capid)
4794 {
4795 	uint16_t	status;
4796 	uint8_t 	ptr;
4797 
4798 	/* check to see if we have capabilities */
4799 	status = pci_config_get16(handle, PCI_CONF_STAT);
4800 	if (!(status & PCI_STAT_CAP)) {
4801 		cmn_err(CE_WARN, "PCI_STAT_CAP not found\n");
4802 		return (ENXIO);
4803 	}
4804 
4805 	ptr = pci_config_get8(handle, PCI_CONF_CAP_PTR);
4806 
4807 	/* Walk the capabilities list, looking for a PCI Express cap */
4808 	while (ptr != PCI_CAP_NEXT_PTR_NULL) {
4809 		if (pci_config_get8(handle, ptr + PCI_CAP_ID) == capid)
4810 			break;
4811 		ptr = pci_config_get8(handle, ptr + PCI_CAP_NEXT_PTR);
4812 	}
4813 	if (ptr < 64) {
4814 		cmn_err(CE_WARN, "Bad capability offset %d\n", ptr);
4815 		return (ENXIO);
4816 	}
4817 	*capptr = ptr;
4818 	return (0);
4819 }
4820 
4821 static int
4822 myri10ge_set_max_readreq(ddi_acc_handle_t handle)
4823 {
4824 	int err;
4825 	uint16_t	val;
4826 	uint8_t		ptr;
4827 
4828 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4829 	if (err != 0) {
4830 		cmn_err(CE_WARN, "could not find PCIe cap\n");
4831 		return (ENXIO);
4832 	}
4833 
4834 	/* set max read req to 4096 */
4835 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4836 	val = (val & ~PCIE_DEVCTL_MAX_READ_REQ_MASK) |
4837 	    PCIE_DEVCTL_MAX_READ_REQ_4096;
4838 	pci_config_put16(handle, ptr + PCIE_DEVCTL, val);
4839 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4840 	if ((val & (PCIE_DEVCTL_MAX_READ_REQ_4096)) !=
4841 	    PCIE_DEVCTL_MAX_READ_REQ_4096) {
4842 		cmn_err(CE_WARN, "could not set max read req (%x)\n", val);
4843 		return (EINVAL);
4844 	}
4845 	return (0);
4846 }
4847 
4848 static int
4849 myri10ge_read_pcie_link_width(ddi_acc_handle_t handle, int *link)
4850 {
4851 	int err;
4852 	uint16_t	val;
4853 	uint8_t		ptr;
4854 
4855 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4856 	if (err != 0) {
4857 		cmn_err(CE_WARN, "could not set max read req\n");
4858 		return (ENXIO);
4859 	}
4860 
4861 	/* read link width */
4862 	val = pci_config_get16(handle, ptr + PCIE_LINKSTS);
4863 	val &= PCIE_LINKSTS_NEG_WIDTH_MASK;
4864 	*link = (val >> 4);
4865 	return (0);
4866 }
4867 
4868 static int
4869 myri10ge_reset_nic(struct myri10ge_priv *mgp)
4870 {
4871 	ddi_acc_handle_t handle = mgp->cfg_hdl;
4872 	uint32_t reboot;
4873 	uint16_t cmd;
4874 	int err;
4875 
4876 	cmd = pci_config_get16(handle, PCI_CONF_COMM);
4877 	if ((cmd & PCI_COMM_ME) == 0) {
4878 		/*
4879 		 * Bus master DMA disabled?  Check to see if the card
4880 		 * rebooted due to a parity error For now, just report
4881 		 * it
4882 		 */
4883 
4884 		/* enter read32 mode */
4885 		pci_config_put8(handle, mgp->vso + 0x10, 0x3);
4886 		/* read REBOOT_STATUS (0xfffffff0) */
4887 		pci_config_put32(handle, mgp->vso + 0x18, 0xfffffff0);
4888 		reboot = pci_config_get16(handle, mgp->vso + 0x14);
4889 		cmn_err(CE_WARN, "%s NIC rebooted 0x%x\n", mgp->name, reboot);
4890 		return (0);
4891 	}
4892 	if (!myri10ge_watchdog_reset) {
4893 		cmn_err(CE_WARN, "%s: not resetting\n", mgp->name);
4894 		return (1);
4895 	}
4896 
4897 	myri10ge_stop_locked(mgp);
4898 	err = myri10ge_start_locked(mgp);
4899 	if (err == DDI_FAILURE) {
4900 		return (0);
4901 	}
4902 	mac_tx_update(mgp->mh);
4903 	return (1);
4904 }
4905 
4906 static inline int
4907 myri10ge_ring_stalled(myri10ge_tx_ring_t *tx)
4908 {
4909 	if (tx->sched != tx->stall &&
4910 	    tx->done == tx->watchdog_done &&
4911 	    tx->watchdog_req != tx->watchdog_done)
4912 		return (1);
4913 	return (0);
4914 }
4915 
4916 static void
4917 myri10ge_watchdog(void *arg)
4918 {
4919 	struct myri10ge_priv *mgp;
4920 	struct myri10ge_slice_state *ss;
4921 	myri10ge_tx_ring_t *tx;
4922 	int nic_ok = 1;
4923 	int slices_stalled, rx_pause, i;
4924 	int add_rx;
4925 
4926 	mgp = arg;
4927 	mutex_enter(&mgp->intrlock);
4928 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
4929 		cmn_err(CE_WARN,
4930 		    "%s not running, not rearming watchdog (%d)\n",
4931 		    mgp->name, mgp->running);
4932 		mutex_exit(&mgp->intrlock);
4933 		return;
4934 	}
4935 
4936 	rx_pause = ntohl(mgp->ss[0].fw_stats->dropped_pause);
4937 
4938 	/*
4939 	 * make sure nic is stalled before we reset the nic, so as to
4940 	 * ensure we don't rip the transmit data structures out from
4941 	 * under a pending transmit
4942 	 */
4943 
4944 	for (slices_stalled = 0, i = 0; i < mgp->num_slices; i++) {
4945 		tx = &mgp->ss[i].tx;
4946 		slices_stalled = myri10ge_ring_stalled(tx);
4947 		if (slices_stalled)
4948 			break;
4949 	}
4950 
4951 	if (slices_stalled) {
4952 		if (mgp->watchdog_rx_pause == rx_pause) {
4953 			cmn_err(CE_WARN,
4954 			    "%s slice %d stalled:(%d, %d, %d, %d, %d %d %d\n)",
4955 			    mgp->name, i, tx->sched, tx->stall,
4956 			    tx->done, tx->watchdog_done, tx->req, tx->pkt_done,
4957 			    (int)ntohl(mgp->ss[i].fw_stats->send_done_count));
4958 			nic_ok = myri10ge_reset_nic(mgp);
4959 		} else {
4960 			cmn_err(CE_WARN,
4961 			    "%s Flow controlled, check link partner\n",
4962 			    mgp->name);
4963 		}
4964 	}
4965 
4966 	if (!nic_ok) {
4967 		cmn_err(CE_WARN,
4968 		    "%s Nic dead, not rearming watchdog\n", mgp->name);
4969 		mutex_exit(&mgp->intrlock);
4970 		return;
4971 	}
4972 	for (i = 0; i < mgp->num_slices; i++) {
4973 		ss = &mgp->ss[i];
4974 		tx = &ss->tx;
4975 		tx->watchdog_done = tx->done;
4976 		tx->watchdog_req = tx->req;
4977 		if (ss->watchdog_rx_copy != MYRI10GE_SLICE_STAT(rx_copy)) {
4978 			ss->watchdog_rx_copy = MYRI10GE_SLICE_STAT(rx_copy);
4979 			add_rx =
4980 			    min(ss->jpool.num_alloc,
4981 			    myri10ge_bigbufs_max -
4982 			    (ss->jpool.num_alloc -
4983 			    ss->jbufs_for_smalls));
4984 			if (add_rx != 0) {
4985 				(void) myri10ge_add_jbufs(ss, add_rx, 0);
4986 				/* now feed them to the firmware */
4987 				mutex_enter(&ss->jpool.mtx);
4988 				myri10ge_restock_jumbos(ss);
4989 				mutex_exit(&ss->jpool.mtx);
4990 			}
4991 		}
4992 	}
4993 	mgp->watchdog_rx_pause = rx_pause;
4994 
4995 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
4996 	    mgp->timer_ticks);
4997 	mutex_exit(&mgp->intrlock);
4998 }
4999 
5000 /*ARGSUSED*/
5001 static int
5002 myri10ge_get_coalesce(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5003 {
5004 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5005 	(void) mi_mpprintf(mp, "%d", mgp->intr_coal_delay);
5006 	return (0);
5007 }
5008 
5009 /*ARGSUSED*/
5010 static int
5011 myri10ge_set_coalesce(queue_t *q, mblk_t *mp, char *value,
5012     caddr_t cp, cred_t *credp)
5013 {
5014 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5015 	char *end;
5016 	size_t new_value;
5017 
5018 	new_value = mi_strtol(value, &end, 10);
5019 	if (end == value)
5020 		return (EINVAL);
5021 
5022 	mutex_enter(&myri10ge_param_lock);
5023 	mgp->intr_coal_delay = (int)new_value;
5024 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
5025 	mutex_exit(&myri10ge_param_lock);
5026 	return (0);
5027 }
5028 
5029 /*ARGSUSED*/
5030 static int
5031 myri10ge_get_pauseparam(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5032 {
5033 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5034 	(void) mi_mpprintf(mp, "%d", mgp->pause);
5035 	return (0);
5036 }
5037 
5038 /*ARGSUSED*/
5039 static int
5040 myri10ge_set_pauseparam(queue_t *q, mblk_t *mp, char *value,
5041     caddr_t cp, cred_t *credp)
5042 {
5043 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5044 	char *end;
5045 	size_t new_value;
5046 	int err = 0;
5047 
5048 	new_value = mi_strtol(value, &end, 10);
5049 	if (end == value)
5050 		return (EINVAL);
5051 	if (new_value != 0)
5052 		new_value = 1;
5053 
5054 	mutex_enter(&myri10ge_param_lock);
5055 	if (new_value != mgp->pause)
5056 		err = myri10ge_change_pause(mgp, new_value);
5057 	mutex_exit(&myri10ge_param_lock);
5058 	return (err);
5059 }
5060 
5061 /*ARGSUSED*/
5062 static int
5063 myri10ge_get_int(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5064 {
5065 	(void) mi_mpprintf(mp, "%d", *(int *)(void *)cp);
5066 	return (0);
5067 }
5068 
5069 /*ARGSUSED*/
5070 static int
5071 myri10ge_set_int(queue_t *q, mblk_t *mp, char *value,
5072     caddr_t cp, cred_t *credp)
5073 {
5074 	char *end;
5075 	size_t new_value;
5076 
5077 	new_value = mi_strtol(value, &end, 10);
5078 	if (end == value)
5079 		return (EINVAL);
5080 	*(int *)(void *)cp = new_value;
5081 
5082 	return (0);
5083 }
5084 
5085 static void
5086 myri10ge_ndd_init(struct myri10ge_priv *mgp)
5087 {
5088 	mgp->nd_head = NULL;
5089 
5090 	(void) nd_load(&mgp->nd_head, "myri10ge_intr_coal_delay",
5091 	    myri10ge_get_coalesce, myri10ge_set_coalesce, (caddr_t)mgp);
5092 	(void) nd_load(&mgp->nd_head, "myri10ge_flow_control",
5093 	    myri10ge_get_pauseparam, myri10ge_set_pauseparam, (caddr_t)mgp);
5094 	(void) nd_load(&mgp->nd_head, "myri10ge_verbose",
5095 	    myri10ge_get_int, myri10ge_set_int, (caddr_t)&myri10ge_verbose);
5096 	(void) nd_load(&mgp->nd_head, "myri10ge_deassert_wait",
5097 	    myri10ge_get_int, myri10ge_set_int,
5098 	    (caddr_t)&myri10ge_deassert_wait);
5099 	(void) nd_load(&mgp->nd_head, "myri10ge_bigbufs_max",
5100 	    myri10ge_get_int, myri10ge_set_int,
5101 	    (caddr_t)&myri10ge_bigbufs_max);
5102 	(void) nd_load(&mgp->nd_head, "myri10ge_lro",
5103 	    myri10ge_get_int, myri10ge_set_int,
5104 	    (caddr_t)&myri10ge_lro);
5105 	(void) nd_load(&mgp->nd_head, "myri10ge_lro_max_aggr",
5106 	    myri10ge_get_int, myri10ge_set_int,
5107 	    (caddr_t)&myri10ge_lro_max_aggr);
5108 	(void) nd_load(&mgp->nd_head, "myri10ge_tx_hash",
5109 	    myri10ge_get_int, myri10ge_set_int,
5110 	    (caddr_t)&myri10ge_tx_hash);
5111 	(void) nd_load(&mgp->nd_head, "myri10ge_lso_copy",
5112 	    myri10ge_get_int, myri10ge_set_int,
5113 	    (caddr_t)&myri10ge_lso_copy);
5114 }
5115 
5116 static void
5117 myri10ge_ndd_fini(struct myri10ge_priv *mgp)
5118 {
5119 	nd_free(&mgp->nd_head);
5120 }
5121 
5122 static void
5123 myri10ge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
5124 {
5125 	struct iocblk *iocp;
5126 	struct myri10ge_priv *mgp = arg;
5127 	int cmd, ok, err;
5128 
5129 	iocp = (struct iocblk *)(void *)mp->b_rptr;
5130 	cmd = iocp->ioc_cmd;
5131 
5132 	ok = 0;
5133 	err = 0;
5134 
5135 	switch (cmd) {
5136 	case ND_GET:
5137 	case ND_SET:
5138 		ok = nd_getset(wq, mgp->nd_head, mp);
5139 		break;
5140 	default:
5141 		break;
5142 	}
5143 	if (!ok)
5144 		err = EINVAL;
5145 	else
5146 		err = iocp->ioc_error;
5147 
5148 	if (!err)
5149 		miocack(wq, mp, iocp->ioc_count, err);
5150 	else
5151 		miocnak(wq, mp, 0, err);
5152 }
5153 
5154 static struct myri10ge_priv *mgp_list;
5155 
5156 struct myri10ge_priv *
5157 myri10ge_get_instance(uint_t unit)
5158 {
5159 	struct myri10ge_priv *mgp;
5160 
5161 	mutex_enter(&myri10ge_param_lock);
5162 	for (mgp = mgp_list; mgp != NULL; mgp = mgp->next) {
5163 		if (unit == ddi_get_instance(mgp->dip)) {
5164 			mgp->refcnt++;
5165 			break;
5166 		}
5167 	}
5168 	mutex_exit(&myri10ge_param_lock);
5169 	return (mgp);
5170 }
5171 
5172 void
5173 myri10ge_put_instance(struct myri10ge_priv *mgp)
5174 {
5175 	mutex_enter(&myri10ge_param_lock);
5176 	mgp->refcnt--;
5177 	mutex_exit(&myri10ge_param_lock);
5178 }
5179 
5180 static boolean_t
5181 myri10ge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
5182 {
5183 	struct myri10ge_priv *mgp = arg;
5184 	uint32_t *cap_hcksum;
5185 	mac_capab_lso_t *cap_lso;
5186 	mac_capab_rings_t *cap_rings;
5187 
5188 	switch (cap) {
5189 	case MAC_CAPAB_HCKSUM:
5190 		cap_hcksum = cap_data;
5191 		*cap_hcksum = HCKSUM_INET_PARTIAL;
5192 		break;
5193 	case MAC_CAPAB_RINGS:
5194 		cap_rings = cap_data;
5195 		switch (cap_rings->mr_type) {
5196 		case MAC_RING_TYPE_RX:
5197 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5198 			cap_rings->mr_rnum = mgp->num_slices;
5199 			cap_rings->mr_gnum = 1;
5200 			cap_rings->mr_rget = myri10ge_fill_ring;
5201 			cap_rings->mr_gget = myri10ge_fill_group;
5202 			break;
5203 		case MAC_RING_TYPE_TX:
5204 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5205 			cap_rings->mr_rnum = mgp->num_slices;
5206 			cap_rings->mr_gnum = 0;
5207 			cap_rings->mr_rget = myri10ge_fill_ring;
5208 			cap_rings->mr_gget = NULL;
5209 			break;
5210 		default:
5211 			return (B_FALSE);
5212 		}
5213 		break;
5214 	case MAC_CAPAB_LSO:
5215 		cap_lso = cap_data;
5216 		if (!myri10ge_use_lso)
5217 			return (B_FALSE);
5218 		if (!(mgp->features & MYRI10GE_TSO))
5219 			return (B_FALSE);
5220 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
5221 		cap_lso->lso_basic_tcp_ipv4.lso_max = (uint16_t)-1;
5222 		break;
5223 
5224 	default:
5225 		return (B_FALSE);
5226 	}
5227 	return (B_TRUE);
5228 }
5229 
5230 
5231 static int
5232 myri10ge_m_stat(void *arg, uint_t stat, uint64_t *val)
5233 {
5234 	struct myri10ge_priv *mgp = arg;
5235 	struct myri10ge_rx_ring_stats *rstat;
5236 	struct myri10ge_tx_ring_stats *tstat;
5237 	mcp_irq_data_t *fw_stats = mgp->ss[0].fw_stats;
5238 	struct myri10ge_slice_state *ss;
5239 	uint64_t tmp = 0;
5240 	int i;
5241 
5242 	switch (stat) {
5243 	case MAC_STAT_IFSPEED:
5244 		*val = 10ull * 1000ull * 1000000ull;
5245 		break;
5246 
5247 	case MAC_STAT_MULTIRCV:
5248 		for (i = 0; i < mgp->num_slices; i++) {
5249 			rstat = &mgp->ss[i].rx_stats;
5250 			tmp += rstat->multircv;
5251 		}
5252 		*val = tmp;
5253 		break;
5254 
5255 	case MAC_STAT_BRDCSTRCV:
5256 		for (i = 0; i < mgp->num_slices; i++) {
5257 			rstat = &mgp->ss[i].rx_stats;
5258 			tmp += rstat->brdcstrcv;
5259 		}
5260 		*val = tmp;
5261 		break;
5262 
5263 	case MAC_STAT_MULTIXMT:
5264 		for (i = 0; i < mgp->num_slices; i++) {
5265 			tstat = &mgp->ss[i].tx.stats;
5266 			tmp += tstat->multixmt;
5267 		}
5268 		*val = tmp;
5269 		break;
5270 
5271 	case MAC_STAT_BRDCSTXMT:
5272 		for (i = 0; i < mgp->num_slices; i++) {
5273 			tstat = &mgp->ss[i].tx.stats;
5274 			tmp += tstat->brdcstxmt;
5275 		}
5276 		*val = tmp;
5277 		break;
5278 
5279 	case MAC_STAT_NORCVBUF:
5280 		tmp = ntohl(fw_stats->dropped_no_big_buffer);
5281 		tmp += ntohl(fw_stats->dropped_no_small_buffer);
5282 		tmp += ntohl(fw_stats->dropped_link_overflow);
5283 		for (i = 0; i < mgp->num_slices; i++) {
5284 			ss = &mgp->ss[i];
5285 			tmp += MYRI10GE_SLICE_STAT(rx_big_nobuf);
5286 			tmp += MYRI10GE_SLICE_STAT(rx_small_nobuf);
5287 		}
5288 		*val = tmp;
5289 		break;
5290 
5291 	case MAC_STAT_IERRORS:
5292 		tmp += ntohl(fw_stats->dropped_bad_crc32);
5293 		tmp += ntohl(fw_stats->dropped_bad_phy);
5294 		tmp += ntohl(fw_stats->dropped_runt);
5295 		tmp += ntohl(fw_stats->dropped_overrun);
5296 		*val = tmp;
5297 		break;
5298 
5299 	case MAC_STAT_OERRORS:
5300 		for (i = 0; i < mgp->num_slices; i++) {
5301 			ss = &mgp->ss[i];
5302 			tmp += MYRI10GE_SLICE_STAT(xmit_lsobadflags);
5303 			tmp += MYRI10GE_SLICE_STAT(xmit_err);
5304 		}
5305 		*val = tmp;
5306 		break;
5307 
5308 	case MAC_STAT_RBYTES:
5309 		for (i = 0; i < mgp->num_slices; i++) {
5310 			rstat = &mgp->ss[i].rx_stats;
5311 			tmp += rstat->ibytes;
5312 		}
5313 		*val = tmp;
5314 		break;
5315 
5316 	case MAC_STAT_IPACKETS:
5317 		for (i = 0; i < mgp->num_slices; i++) {
5318 			rstat = &mgp->ss[i].rx_stats;
5319 			tmp += rstat->ipackets;
5320 		}
5321 		*val = tmp;
5322 		break;
5323 
5324 	case MAC_STAT_OBYTES:
5325 		for (i = 0; i < mgp->num_slices; i++) {
5326 			tstat = &mgp->ss[i].tx.stats;
5327 			tmp += tstat->obytes;
5328 		}
5329 		*val = tmp;
5330 		break;
5331 
5332 	case MAC_STAT_OPACKETS:
5333 		for (i = 0; i < mgp->num_slices; i++) {
5334 			tstat = &mgp->ss[i].tx.stats;
5335 			tmp += tstat->opackets;
5336 		}
5337 		*val = tmp;
5338 		break;
5339 
5340 	case ETHER_STAT_TOOLONG_ERRORS:
5341 		*val = ntohl(fw_stats->dropped_overrun);
5342 		break;
5343 
5344 #ifdef SOLARIS_S11
5345 	case ETHER_STAT_TOOSHORT_ERRORS:
5346 		*val = ntohl(fw_stats->dropped_runt);
5347 		break;
5348 #endif
5349 
5350 	case ETHER_STAT_LINK_PAUSE:
5351 		*val = mgp->pause;
5352 		break;
5353 
5354 	case ETHER_STAT_LINK_AUTONEG:
5355 		*val = 1;
5356 		break;
5357 
5358 	case ETHER_STAT_LINK_DUPLEX:
5359 		*val = LINK_DUPLEX_FULL;
5360 		break;
5361 
5362 	default:
5363 		return (ENOTSUP);
5364 	}
5365 
5366 	return (0);
5367 }
5368 
5369 /* ARGSUSED */
5370 static void
5371 myri10ge_m_propinfo(void *arg, const char *pr_name,
5372     mac_prop_id_t pr_num, mac_prop_info_handle_t prh)
5373 {
5374 	switch (pr_num) {
5375 	case MAC_PROP_MTU:
5376 		mac_prop_info_set_default_uint32(prh, MYRI10GE_DEFAULT_GLD_MTU);
5377 		mac_prop_info_set_range_uint32(prh, MYRI10GE_MIN_GLD_MTU,
5378 		    MYRI10GE_MAX_GLD_MTU);
5379 		break;
5380 	default:
5381 		break;
5382 	}
5383 }
5384 
5385 /*ARGSUSED*/
5386 static int
5387 myri10ge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
5388     uint_t pr_valsize, const void *pr_val)
5389 {
5390 	int err = 0;
5391 	struct myri10ge_priv *mgp = arg;
5392 
5393 	switch (pr_num) {
5394 	case MAC_PROP_MTU: {
5395 		uint32_t mtu;
5396 		if (pr_valsize < sizeof (mtu)) {
5397 			err = EINVAL;
5398 			break;
5399 		}
5400 		bcopy(pr_val, &mtu, sizeof (mtu));
5401 		if (mtu > MYRI10GE_MAX_GLD_MTU ||
5402 		    mtu < MYRI10GE_MIN_GLD_MTU) {
5403 			err = EINVAL;
5404 			break;
5405 		}
5406 
5407 		mutex_enter(&mgp->intrlock);
5408 		if (mgp->running != MYRI10GE_ETH_STOPPED) {
5409 			err = EBUSY;
5410 			mutex_exit(&mgp->intrlock);
5411 			break;
5412 		}
5413 
5414 		myri10ge_mtu = mtu + sizeof (struct ether_header) +
5415 		    MXGEFW_PAD + VLAN_TAGSZ;
5416 		mutex_exit(&mgp->intrlock);
5417 		break;
5418 	}
5419 	default:
5420 		err = ENOTSUP;
5421 		break;
5422 	}
5423 
5424 	return (err);
5425 }
5426 
/*
 * Callback vector handed to the MAC layer through macp->m_callbacks in
 * myri10ge_attach().  This is a positional initializer: the first
 * member is the mask of optional callbacks supplied (ioctl, getcapab,
 * setprop, propinfo) and NULL marks entries this driver leaves unset.
 * NOTE(review): the slot order must match the mac_callbacks_t
 * definition, which is not visible here -- confirm against the MAC
 * provider header before inserting or reordering entries.
 */
static mac_callbacks_t myri10ge_m_callbacks = {
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO),
	myri10ge_m_stat,
	myri10ge_m_start,
	myri10ge_m_stop,
	myri10ge_m_promisc,
	myri10ge_m_multicst,
	NULL,
	NULL,
	NULL,
	myri10ge_m_ioctl,
	myri10ge_m_getcapab,
	NULL,
	NULL,
	myri10ge_m_setprop,
	NULL,
	myri10ge_m_propinfo
};
5445 
5446 
/*
 * Decide how many slices this instance will use and record the answer
 * in mgp->num_slices.
 *
 * The NIC is reset first.  If MSI-X use is disabled (myri10ge_use_msix
 * == 0), or the firmware does not answer MXGEFW_CMD_GET_MAX_RSS_QUEUES,
 * a single slice is used.  Otherwise the firmware's maximum is capped
 * by myri10ge_max_slices (which defaults to ncpus when left at -1),
 * rounded down to a power of two, and then trial interrupt allocations
 * are made until the number of interrupts obtained matches the number
 * of slices.
 *
 * Returns 0 on success, ENXIO if the reset or the
 * MXGEFW_CMD_SET_INTRQ_SIZE command fails.
 */
static int
myri10ge_probe_slices(struct myri10ge_priv *mgp)
{
	myri10ge_cmd_t cmd;
	int status;

	/* default answer for every early-return path below */
	mgp->num_slices = 1;

	/* hit the board with a reset to ensure it is alive */
	(void) memset(&cmd, 0, sizeof (cmd));
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
	if (status != 0) {
		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
		return (ENXIO);
	}

	if (myri10ge_use_msix == 0)
		return (0);

	/* tell it the size of the interrupt queues */
	cmd.data0 = mgp->max_intr_slots * sizeof (struct mcp_slot);
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
	if (status != 0) {
		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_SET_INTRQ_SIZE\n",
		    mgp->name);
		return (ENXIO);
	}

	/* ask the maximum number of slices it supports */
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
	    &cmd);
	/* not an error: fall back to the single slice set above */
	if (status != 0)
		return (0);

	mgp->num_slices = cmd.data0;

	/*
	 * if the admin did not specify a limit to how many
	 * slices we should use, cap it automatically to the
	 * number of CPUs currently online
	 */
	if (myri10ge_max_slices == -1)
		myri10ge_max_slices = ncpus;

	if (mgp->num_slices > myri10ge_max_slices)
		mgp->num_slices = myri10ge_max_slices;


	/*
	 * Now try to allocate as many MSI-X vectors as we have
	 * slices. We give up on MSI-X if we can only get a single
	 * vector.
	 */
	while (mgp->num_slices > 1) {
		/* make sure it is a power of two */
		while (!ISP2(mgp->num_slices))
			mgp->num_slices--;
		if (mgp->num_slices == 1)
			return (0);

		/*
		 * Trial allocation, released again immediately; only
		 * the resulting mgp->intr_cnt is of interest.
		 * NOTE(review): the meaning of the second argument (0
		 * here, 1 in attach/detach) is defined by
		 * myri10ge_add_intrs()/myri10ge_rem_intrs(), not
		 * visible in this section -- confirm there.
		 */
		status = myri10ge_add_intrs(mgp, 0);
		if (status == 0) {
			myri10ge_rem_intrs(mgp, 0);
			if (mgp->intr_cnt == mgp->num_slices) {
				if (myri10ge_verbose)
					printf("Got %d slices!\n",
					    mgp->num_slices);
				return (0);
			}
			/* got a different count; retry with that count */
			mgp->num_slices = mgp->intr_cnt;
		} else {
			/* allocation failed outright; halve and retry */
			mgp->num_slices = mgp->num_slices / 2;
		}
	}

	if (myri10ge_verbose)
		printf("Got %d slices\n", mgp->num_slices);
	return (0);
}
5526 
5527 static void
5528 myri10ge_lro_free(struct myri10ge_slice_state *ss)
5529 {
5530 	struct lro_entry *lro;
5531 
5532 	while (ss->lro_free != NULL) {
5533 		lro = ss->lro_free;
5534 		ss->lro_free = lro->next;
5535 		kmem_free(lro, sizeof (*lro));
5536 	}
5537 }
5538 
5539 static void
5540 myri10ge_lro_alloc(struct myri10ge_slice_state *ss)
5541 {
5542 	struct lro_entry *lro;
5543 	int idx;
5544 
5545 	ss->lro_free = NULL;
5546 	ss->lro_active = NULL;
5547 
5548 	for (idx = 0; idx < myri10ge_lro_cnt; idx++) {
5549 		lro = kmem_zalloc(sizeof (*lro), KM_SLEEP);
5550 		if (lro == NULL)
5551 			continue;
5552 		lro->next = ss->lro_free;
5553 		ss->lro_free = lro;
5554 	}
5555 }
5556 
5557 static void
5558 myri10ge_free_slices(struct myri10ge_priv *mgp)
5559 {
5560 	struct myri10ge_slice_state *ss;
5561 	size_t bytes;
5562 	int i;
5563 
5564 	if (mgp->ss == NULL)
5565 		return;
5566 
5567 	for (i = 0; i < mgp->num_slices; i++) {
5568 		ss = &mgp->ss[i];
5569 		if (ss->rx_done.entry == NULL)
5570 			continue;
5571 		myri10ge_dma_free(&ss->rx_done.dma);
5572 		ss->rx_done.entry = NULL;
5573 		if (ss->fw_stats == NULL)
5574 			continue;
5575 		myri10ge_dma_free(&ss->fw_stats_dma);
5576 		ss->fw_stats = NULL;
5577 		mutex_destroy(&ss->rx_lock);
5578 		mutex_destroy(&ss->tx.lock);
5579 		mutex_destroy(&ss->tx.handle_lock);
5580 		mutex_destroy(&ss->poll_lock);
5581 		myri10ge_jpool_fini(ss);
5582 		myri10ge_slice_stat_destroy(ss);
5583 		myri10ge_lro_free(ss);
5584 	}
5585 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5586 	kmem_free(mgp->ss, bytes);
5587 	mgp->ss = NULL;
5588 }
5589 
5590 
5591 static int
5592 myri10ge_alloc_slices(struct myri10ge_priv *mgp)
5593 {
5594 	struct myri10ge_slice_state *ss;
5595 	size_t bytes;
5596 	int i;
5597 
5598 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5599 	mgp->ss = kmem_zalloc(bytes, KM_SLEEP);
5600 	if (mgp->ss == NULL)
5601 		return (ENOMEM);
5602 	for (i = 0; i < mgp->num_slices; i++) {
5603 		ss = &mgp->ss[i];
5604 
5605 		ss->mgp = mgp;
5606 
5607 		/* allocate the per-slice firmware stats */
5608 		bytes = sizeof (*ss->fw_stats);
5609 		ss->fw_stats = (mcp_irq_data_t *)(void *)
5610 		    myri10ge_dma_alloc(mgp->dip, bytes,
5611 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5612 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5613 		    &ss->fw_stats_dma, 1, DDI_DMA_DONTWAIT);
5614 		if (ss->fw_stats == NULL)
5615 			goto abort;
5616 		(void) memset(ss->fw_stats, 0, bytes);
5617 
5618 		/* allocate rx done ring */
5619 		bytes = mgp->max_intr_slots *
5620 		    sizeof (*ss->rx_done.entry);
5621 		ss->rx_done.entry = (mcp_slot_t *)(void *)
5622 		    myri10ge_dma_alloc(mgp->dip, bytes,
5623 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5624 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5625 		    &ss->rx_done.dma, 1, DDI_DMA_DONTWAIT);
5626 		if (ss->rx_done.entry == NULL) {
5627 			goto abort;
5628 		}
5629 		(void) memset(ss->rx_done.entry, 0, bytes);
5630 		mutex_init(&ss->rx_lock,   NULL, MUTEX_DEFAULT, mgp->icookie);
5631 		mutex_init(&ss->tx.lock,   NULL, MUTEX_DEFAULT, NULL);
5632 		mutex_init(&ss->tx.handle_lock,   NULL, MUTEX_DEFAULT, NULL);
5633 		mutex_init(&ss->poll_lock,   NULL, MUTEX_DEFAULT, NULL);
5634 		myri10ge_jpool_init(ss);
5635 		(void) myri10ge_slice_stat_init(ss);
5636 		myri10ge_lro_alloc(ss);
5637 	}
5638 
5639 	return (0);
5640 
5641 abort:
5642 	myri10ge_free_slices(mgp);
5643 	return (ENOMEM);
5644 }
5645 
5646 static int
5647 myri10ge_save_msi_state(struct myri10ge_priv *mgp,
5648     ddi_acc_handle_t handle)
5649 {
5650 	uint8_t ptr;
5651 	int err;
5652 
5653 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5654 	if (err != 0) {
5655 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5656 		    mgp->name);
5657 		return (DDI_FAILURE);
5658 	}
5659 	mgp->pci_saved_state.msi_ctrl =
5660 	    pci_config_get16(handle, ptr + PCI_MSI_CTRL);
5661 	mgp->pci_saved_state.msi_addr_low =
5662 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET);
5663 	mgp->pci_saved_state.msi_addr_high =
5664 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4);
5665 	mgp->pci_saved_state.msi_data_32 =
5666 	    pci_config_get16(handle, ptr + PCI_MSI_32BIT_DATA);
5667 	mgp->pci_saved_state.msi_data_64 =
5668 	    pci_config_get16(handle, ptr + PCI_MSI_64BIT_DATA);
5669 	return (DDI_SUCCESS);
5670 }
5671 
5672 static int
5673 myri10ge_restore_msi_state(struct myri10ge_priv *mgp,
5674     ddi_acc_handle_t handle)
5675 {
5676 	uint8_t ptr;
5677 	int err;
5678 
5679 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5680 	if (err != 0) {
5681 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5682 		    mgp->name);
5683 		return (DDI_FAILURE);
5684 	}
5685 
5686 	pci_config_put16(handle, ptr + PCI_MSI_CTRL,
5687 	    mgp->pci_saved_state.msi_ctrl);
5688 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET,
5689 	    mgp->pci_saved_state.msi_addr_low);
5690 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4,
5691 	    mgp->pci_saved_state.msi_addr_high);
5692 	pci_config_put16(handle, ptr + PCI_MSI_32BIT_DATA,
5693 	    mgp->pci_saved_state.msi_data_32);
5694 	pci_config_put16(handle, ptr + PCI_MSI_64BIT_DATA,
5695 	    mgp->pci_saved_state.msi_data_64);
5696 
5697 	return (DDI_SUCCESS);
5698 }
5699 
5700 static int
5701 myri10ge_save_pci_state(struct myri10ge_priv *mgp)
5702 {
5703 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5704 	int i;
5705 	int err = DDI_SUCCESS;
5706 
5707 
5708 	/* Save the non-extended PCI config space 32-bits at a time */
5709 	for (i = 0; i < 16; i++)
5710 		mgp->pci_saved_state.base[i] =
5711 		    pci_config_get32(handle, i*4);
5712 
5713 	/* now save MSI interrupt state *, if needed */
5714 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5715 		err = myri10ge_save_msi_state(mgp, handle);
5716 
5717 	return (err);
5718 }
5719 
5720 static int
5721 myri10ge_restore_pci_state(struct myri10ge_priv *mgp)
5722 {
5723 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5724 	int i;
5725 	int err = DDI_SUCCESS;
5726 
5727 
5728 	/* Restore the non-extended PCI config space 32-bits at a time */
5729 	for (i = 15; i >= 0; i--)
5730 		pci_config_put32(handle, i*4, mgp->pci_saved_state.base[i]);
5731 
5732 	/* now restore MSI interrupt state *, if needed */
5733 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5734 		err = myri10ge_restore_msi_state(mgp, handle);
5735 
5736 	if (mgp->max_read_request_4k)
5737 		(void) myri10ge_set_max_readreq(handle);
5738 	return (err);
5739 }
5740 
5741 
5742 static int
5743 myri10ge_suspend(dev_info_t *dip)
5744 {
5745 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5746 	int status;
5747 
5748 	if (mgp == NULL) {
5749 		cmn_err(CE_WARN, "null dip in myri10ge_suspend\n");
5750 		return (DDI_FAILURE);
5751 	}
5752 	if (mgp->dip != dip) {
5753 		cmn_err(CE_WARN, "bad dip in myri10ge_suspend\n");
5754 		return (DDI_FAILURE);
5755 	}
5756 	mutex_enter(&mgp->intrlock);
5757 	if (mgp->running == MYRI10GE_ETH_RUNNING) {
5758 		mgp->running = MYRI10GE_ETH_STOPPING;
5759 		mutex_exit(&mgp->intrlock);
5760 		(void) untimeout(mgp->timer_id);
5761 		mutex_enter(&mgp->intrlock);
5762 		myri10ge_stop_locked(mgp);
5763 		mgp->running = MYRI10GE_ETH_SUSPENDED_RUNNING;
5764 	}
5765 	status = myri10ge_save_pci_state(mgp);
5766 	mutex_exit(&mgp->intrlock);
5767 	return (status);
5768 }
5769 
5770 static int
5771 myri10ge_resume(dev_info_t *dip)
5772 {
5773 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5774 	int status = DDI_SUCCESS;
5775 
5776 	if (mgp == NULL) {
5777 		cmn_err(CE_WARN, "null dip in myri10ge_resume\n");
5778 		return (DDI_FAILURE);
5779 	}
5780 	if (mgp->dip != dip) {
5781 		cmn_err(CE_WARN, "bad dip in myri10ge_resume\n");
5782 		return (DDI_FAILURE);
5783 	}
5784 
5785 	mutex_enter(&mgp->intrlock);
5786 	status = myri10ge_restore_pci_state(mgp);
5787 	if (status == DDI_SUCCESS &&
5788 	    mgp->running == MYRI10GE_ETH_SUSPENDED_RUNNING) {
5789 		status = myri10ge_start_locked(mgp);
5790 	}
5791 	mutex_exit(&mgp->intrlock);
5792 	if (status != DDI_SUCCESS)
5793 		return (status);
5794 
5795 	/* start the watchdog timer */
5796 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
5797 	    mgp->timer_ticks);
5798 	return (DDI_SUCCESS);
5799 }
5800 
5801 static int
5802 myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5803 {
5804 
5805 	struct myri10ge_priv *mgp;
5806 	mac_register_t *macp, *omacp;
5807 	ddi_acc_handle_t handle;
5808 	uint32_t csr, hdr_offset;
5809 	int status, span, link_width, max_read_request_4k;
5810 	unsigned long bus_number, dev_number, func_number;
5811 	size_t bytes;
5812 	offset_t ss_offset;
5813 	uint8_t vso;
5814 
5815 	if (cmd == DDI_RESUME) {
5816 		return (myri10ge_resume(dip));
5817 	}
5818 
5819 	if (cmd != DDI_ATTACH)
5820 		return (DDI_FAILURE);
5821 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS)
5822 		return (DDI_FAILURE);
5823 
5824 	/* enable busmater and io space access */
5825 	csr = pci_config_get32(handle, PCI_CONF_COMM);
5826 	pci_config_put32(handle, PCI_CONF_COMM,
5827 	    (csr |PCI_COMM_ME|PCI_COMM_MAE));
5828 	status = myri10ge_read_pcie_link_width(handle, &link_width);
5829 	if (status != 0) {
5830 		cmn_err(CE_WARN, "could not read link width!\n");
5831 		link_width = 0;
5832 	}
5833 	max_read_request_4k = !myri10ge_set_max_readreq(handle);
5834 	status = myri10ge_find_cap(handle, &vso, PCI_CAP_ID_VS);
5835 	if (status != 0)
5836 		goto abort_with_cfg_hdl;
5837 	if ((omacp = mac_alloc(MAC_VERSION)) == NULL)
5838 		goto abort_with_cfg_hdl;
5839 	/*
5840 	 * XXXX Hack: mac_register_t grows in newer kernels.  To be
5841 	 * able to write newer fields, such as m_margin, without
5842 	 * writing outside allocated memory, we allocate our own macp
5843 	 * and pass that to mac_register()
5844 	 */
5845 	macp = kmem_zalloc(sizeof (*macp) * 8, KM_SLEEP);
5846 	macp->m_version = omacp->m_version;
5847 
5848 	if ((mgp = (struct myri10ge_priv *)
5849 	    kmem_zalloc(sizeof (*mgp), KM_SLEEP)) == NULL) {
5850 		goto abort_with_macinfo;
5851 	}
5852 	ddi_set_driver_private(dip, mgp);
5853 
5854 	/* setup device name for log messages */
5855 	(void) sprintf(mgp->name, "myri10ge%d", ddi_get_instance(dip));
5856 
5857 	mutex_enter(&myri10ge_param_lock);
5858 	myri10ge_get_props(dip);
5859 	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
5860 	mgp->pause = myri10ge_flow_control;
5861 	mutex_exit(&myri10ge_param_lock);
5862 
5863 	mgp->max_read_request_4k = max_read_request_4k;
5864 	mgp->pcie_link_width = link_width;
5865 	mgp->running = MYRI10GE_ETH_STOPPED;
5866 	mgp->vso = vso;
5867 	mgp->dip = dip;
5868 	mgp->cfg_hdl = handle;
5869 
5870 	mgp->timer_ticks = 5 * drv_usectohz(1000000); /* 5 seconds */
5871 	myri10ge_test_physical(dip);
5872 
5873 	/* allocate command page */
5874 	bytes = sizeof (*mgp->cmd);
5875 	mgp->cmd = (mcp_cmd_response_t *)
5876 	    (void *)myri10ge_dma_alloc(dip, bytes,
5877 	    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5878 	    DDI_DMA_CONSISTENT,	DDI_DMA_RDWR|DDI_DMA_CONSISTENT,
5879 	    &mgp->cmd_dma, 1, DDI_DMA_DONTWAIT);
5880 	if (mgp->cmd == NULL)
5881 		goto abort_with_mgp;
5882 
5883 	(void) myri10ge_reg_set(dip, &mgp->reg_set, &span, &bus_number,
5884 	    &dev_number, &func_number);
5885 	if (myri10ge_verbose)
5886 		printf("%s at %ld:%ld:%ld attaching\n", mgp->name,
5887 		    bus_number, dev_number, func_number);
5888 	status = ddi_regs_map_setup(dip, mgp->reg_set, (caddr_t *)&mgp->sram,
5889 	    (offset_t)0, (offset_t)span,  &myri10ge_dev_access_attr,
5890 	    &mgp->io_handle);
5891 	if (status != DDI_SUCCESS) {
5892 		cmn_err(CE_WARN, "%s: couldn't map memory space", mgp->name);
5893 		printf("%s: reg_set = %d, span = %d, status = %d",
5894 		    mgp->name, mgp->reg_set, span, status);
5895 		goto abort_with_mgp;
5896 	}
5897 
5898 	hdr_offset = *(uint32_t *)(void*)(mgp->sram +  MCP_HEADER_PTR_OFFSET);
5899 	hdr_offset = ntohl(hdr_offset) & 0xffffc;
5900 	ss_offset = hdr_offset +
5901 	    offsetof(struct mcp_gen_header, string_specs);
5902 	mgp->sram_size = ntohl(*(uint32_t *)(void*)(mgp->sram + ss_offset));
5903 	myri10ge_pio_copy32(mgp->eeprom_strings,
5904 	    (uint32_t *)(void*)((char *)mgp->sram + mgp->sram_size),
5905 	    MYRI10GE_EEPROM_STRINGS_SIZE);
5906 	(void) memset(mgp->eeprom_strings +
5907 	    MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
5908 
5909 	status = myri10ge_read_mac_addr(mgp);
5910 	if (status) {
5911 		goto abort_with_mapped;
5912 	}
5913 
5914 	status = myri10ge_select_firmware(mgp);
5915 	if (status != 0) {
5916 		cmn_err(CE_WARN, "%s: failed to load firmware\n", mgp->name);
5917 		goto abort_with_mapped;
5918 	}
5919 
5920 	status = myri10ge_probe_slices(mgp);
5921 	if (status != 0) {
5922 		cmn_err(CE_WARN, "%s: failed to probe slices\n", mgp->name);
5923 		goto abort_with_dummy_rdma;
5924 	}
5925 
5926 	status = myri10ge_alloc_slices(mgp);
5927 	if (status != 0) {
5928 		cmn_err(CE_WARN, "%s: failed to alloc slices\n", mgp->name);
5929 		goto abort_with_dummy_rdma;
5930 	}
5931 
5932 	/* add the interrupt handler */
5933 	status = myri10ge_add_intrs(mgp, 1);
5934 	if (status != 0) {
5935 		cmn_err(CE_WARN, "%s: Failed to add interrupt\n",
5936 		    mgp->name);
5937 		goto abort_with_slices;
5938 	}
5939 
5940 	/* now that we have an iblock_cookie, init the mutexes */
5941 	mutex_init(&mgp->cmd_lock, NULL, MUTEX_DRIVER, mgp->icookie);
5942 	mutex_init(&mgp->intrlock, NULL, MUTEX_DRIVER, mgp->icookie);
5943 
5944 
5945 	status = myri10ge_nic_stat_init(mgp);
5946 	if (status != DDI_SUCCESS)
5947 		goto abort_with_interrupts;
5948 	status = myri10ge_info_init(mgp);
5949 	if (status != DDI_SUCCESS)
5950 		goto abort_with_stats;
5951 
5952 	/*
5953 	 *	Initialize  GLD state
5954 	 */
5955 
5956 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
5957 	macp->m_driver = mgp;
5958 	macp->m_dip = dip;
5959 	macp->m_src_addr = mgp->mac_addr;
5960 	macp->m_callbacks = &myri10ge_m_callbacks;
5961 	macp->m_min_sdu = 0;
5962 	macp->m_max_sdu = myri10ge_mtu -
5963 	    (sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ);
5964 #ifdef SOLARIS_S11
5965 	macp->m_margin = VLAN_TAGSZ;
5966 #endif
5967 	macp->m_v12n = MAC_VIRT_LEVEL1;
5968 	status = mac_register(macp, &mgp->mh);
5969 	if (status != 0) {
5970 		cmn_err(CE_WARN, "%s: mac_register failed with %d\n",
5971 		    mgp->name, status);
5972 		goto abort_with_info;
5973 	}
5974 	myri10ge_ndd_init(mgp);
5975 	if (myri10ge_verbose)
5976 		printf("%s: %s, tx bndry %d, fw %s\n", mgp->name,
5977 		    mgp->intr_type, mgp->tx_boundary, mgp->fw_name);
5978 	mutex_enter(&myri10ge_param_lock);
5979 	mgp->next = mgp_list;
5980 	mgp_list = mgp;
5981 	mutex_exit(&myri10ge_param_lock);
5982 	kmem_free(macp, sizeof (*macp) * 8);
5983 	mac_free(omacp);
5984 	return (DDI_SUCCESS);
5985 
5986 abort_with_info:
5987 	myri10ge_info_destroy(mgp);
5988 
5989 abort_with_stats:
5990 	myri10ge_nic_stat_destroy(mgp);
5991 
5992 abort_with_interrupts:
5993 	mutex_destroy(&mgp->cmd_lock);
5994 	mutex_destroy(&mgp->intrlock);
5995 	myri10ge_rem_intrs(mgp, 1);
5996 
5997 abort_with_slices:
5998 	myri10ge_free_slices(mgp);
5999 
6000 abort_with_dummy_rdma:
6001 	myri10ge_dummy_rdma(mgp, 0);
6002 
6003 abort_with_mapped:
6004 	ddi_regs_map_free(&mgp->io_handle);
6005 
6006 	myri10ge_dma_free(&mgp->cmd_dma);
6007 
6008 abort_with_mgp:
6009 	kmem_free(mgp, sizeof (*mgp));
6010 
6011 abort_with_macinfo:
6012 	kmem_free(macp, sizeof (*macp) * 8);
6013 	mac_free(omacp);
6014 
6015 abort_with_cfg_hdl:
6016 	pci_config_teardown(&handle);
6017 	return (DDI_FAILURE);
6018 
6019 }
6020 
6021 
6022 static int
6023 myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
6024 {
6025 	struct myri10ge_priv	*mgp, *tmp;
6026 	int 			status, i, jbufs_alloced;
6027 
6028 	if (cmd == DDI_SUSPEND) {
6029 		status = myri10ge_suspend(dip);
6030 		return (status);
6031 	}
6032 
6033 	if (cmd != DDI_DETACH) {
6034 		return (DDI_FAILURE);
6035 	}
6036 	/* Get the driver private (gld_mac_info_t) structure */
6037 	mgp = ddi_get_driver_private(dip);
6038 
6039 	mutex_enter(&mgp->intrlock);
6040 	jbufs_alloced = 0;
6041 	for (i = 0; i < mgp->num_slices; i++) {
6042 		myri10ge_remove_jbufs(&mgp->ss[i]);
6043 		jbufs_alloced += mgp->ss[i].jpool.num_alloc;
6044 	}
6045 	mutex_exit(&mgp->intrlock);
6046 	if (jbufs_alloced != 0) {
6047 		cmn_err(CE_NOTE, "%s: %d loaned rx buffers remain\n",
6048 		    mgp->name, jbufs_alloced);
6049 		return (DDI_FAILURE);
6050 	}
6051 
6052 	mutex_enter(&myri10ge_param_lock);
6053 	if (mgp->refcnt != 0) {
6054 		mutex_exit(&myri10ge_param_lock);
6055 		cmn_err(CE_NOTE, "%s: %d external refs remain\n",
6056 		    mgp->name, mgp->refcnt);
6057 		return (DDI_FAILURE);
6058 	}
6059 	mutex_exit(&myri10ge_param_lock);
6060 
6061 	status = mac_unregister(mgp->mh);
6062 	if (status != DDI_SUCCESS)
6063 		return (status);
6064 
6065 	myri10ge_ndd_fini(mgp);
6066 	myri10ge_dummy_rdma(mgp, 0);
6067 	myri10ge_nic_stat_destroy(mgp);
6068 	myri10ge_info_destroy(mgp);
6069 
6070 	mutex_destroy(&mgp->cmd_lock);
6071 	mutex_destroy(&mgp->intrlock);
6072 
6073 	myri10ge_rem_intrs(mgp, 1);
6074 
6075 	myri10ge_free_slices(mgp);
6076 	ddi_regs_map_free(&mgp->io_handle);
6077 	myri10ge_dma_free(&mgp->cmd_dma);
6078 	pci_config_teardown(&mgp->cfg_hdl);
6079 
6080 	mutex_enter(&myri10ge_param_lock);
6081 	if (mgp_list == mgp) {
6082 		mgp_list = mgp->next;
6083 	} else {
6084 		tmp = mgp_list;
6085 		while (tmp->next != mgp && tmp->next != NULL)
6086 			tmp = tmp->next;
6087 		if (tmp->next != NULL)
6088 			tmp->next = tmp->next->next;
6089 	}
6090 	kmem_free(mgp, sizeof (*mgp));
6091 	mutex_exit(&myri10ge_param_lock);
6092 	return (DDI_SUCCESS);
6093 }
6094 
/*
 * Helper for quiesce entry point: Interrupt threads are not being
 * scheduled, so we must poll for the confirmation DMA to arrive in
 * the firmware stats block for slice 0.  We're essentially running
 * the guts of the interrupt handler, and just cherry picking the
 * confirmation that the NIC is quiesced (stats->link_down)
 *
 * Returns 1 when the stats block reports an update with link_down
 * set, 0 when no interrupt is pending or the link is not yet reported
 * down.
 */

static int
myri10ge_poll_down(struct myri10ge_priv *mgp)
{
	/* mgp->ss is the slice array, so this is slice 0 */
	struct myri10ge_slice_state *ss = mgp->ss;
	mcp_irq_data_t *stats = ss->fw_stats;
	int valid;
	int found_down = 0;


	/* check for a pending IRQ */

	if (! *((volatile uint8_t *)& stats->valid))
		return (0);
	/* remember the flags; the wait below runs until valid is clear */
	valid = stats->valid;

	/*
	 * Make sure to tell the NIC to lower a legacy IRQ, else
	 * it may have corrupt state after restarting
	 */

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		mb();
		/* wait for irq conf DMA */
		while (*((volatile uint8_t *)& stats->valid))
			;
	}
	if (stats->stats_updated && stats->link_down)
		found_down = 1;

	/*
	 * Claim the interrupt: the first claim word is written only
	 * when bit 0 of the saved valid flags was set, the second
	 * unconditionally.
	 */
	if (valid & 0x1)
		*ss->irq_claim = BE_32(3);
	*(ss->irq_claim + 1) = BE_32(3);

	return (found_down);
}
6140 
6141 static int
6142 myri10ge_quiesce(dev_info_t *dip)
6143 {
6144 	struct myri10ge_priv *mgp;
6145 	myri10ge_cmd_t cmd;
6146 	int status, down, i;
6147 
6148 	mgp = ddi_get_driver_private(dip);
6149 	if (mgp == NULL)
6150 		return (DDI_FAILURE);
6151 
6152 	/* if devices was unplumbed, it is guaranteed to be quiescent */
6153 	if (mgp->running == MYRI10GE_ETH_STOPPED)
6154 		return (DDI_SUCCESS);
6155 
6156 	/* send a down CMD to queuesce NIC */
6157 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
6158 	if (status) {
6159 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
6160 		return (DDI_FAILURE);
6161 	}
6162 
6163 	for (i = 0; i < 20; i++) {
6164 		down = myri10ge_poll_down(mgp);
6165 		if (down)
6166 			break;
6167 		delay(drv_usectohz(100000));
6168 		mb();
6169 	}
6170 	if (down)
6171 		return (DDI_SUCCESS);
6172 	return (DDI_FAILURE);
6173 }
6174 
6175 /*
6176  * Distinguish between allocb'ed blocks, and gesballoc'ed attached
6177  * storage.
6178  */
6179 static void
6180 myri10ge_find_lastfree(void)
6181 {
6182 	mblk_t *mp = allocb(1024, 0);
6183 	dblk_t *dbp;
6184 
6185 	if (mp == NULL) {
6186 		cmn_err(CE_WARN, "myri10ge_find_lastfree failed\n");
6187 		return;
6188 	}
6189 	dbp = mp->b_datap;
6190 	myri10ge_db_lastfree = (void *)dbp->db_lastfree;
6191 }
6192 
6193 int
6194 _init(void)
6195 {
6196 	int i;
6197 
6198 	if (myri10ge_verbose)
6199 		cmn_err(CE_NOTE,
6200 		    "Myricom 10G driver (10GbE) version %s loading\n",
6201 		    MYRI10GE_VERSION_STR);
6202 	myri10ge_find_lastfree();
6203 	mac_init_ops(&myri10ge_ops, "myri10ge");
6204 	mutex_init(&myri10ge_param_lock, NULL, MUTEX_DEFAULT, NULL);
6205 	if ((i = mod_install(&modlinkage)) != 0) {
6206 		cmn_err(CE_WARN, "mod_install returned %d\n", i);
6207 		mac_fini_ops(&myri10ge_ops);
6208 		mutex_destroy(&myri10ge_param_lock);
6209 	}
6210 	return (i);
6211 }
6212 
6213 int
6214 _fini(void)
6215 {
6216 	int i;
6217 	i = mod_remove(&modlinkage);
6218 	if (i != 0) {
6219 		return (i);
6220 	}
6221 	mac_fini_ops(&myri10ge_ops);
6222 	mutex_destroy(&myri10ge_param_lock);
6223 	return (0);
6224 }
6225 
6226 int
6227 _info(struct modinfo *modinfop)
6228 {
6229 	return (mod_info(&modlinkage, modinfop));
6230 }
6231 
6232 
6233 /*
6234  *  This file uses MyriGE driver indentation.
6235  *
6236  * Local Variables:
6237  * c-file-style:"sun"
6238  * tab-width:8
6239  * End:
6240  */
6241