xref: /illumos-gate/usr/src/uts/common/io/myri10ge/drv/myri10ge.c (revision 3e6960d70408b9f4e09714ed3341173673ed28b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
29  * Use is subject to license terms.
30  */
31 
32 /*
33  * Copyright (c) 2014, Joyent, Inc.
34  * Copyright (c) 2016 by Delphix. All rights reserved.
35  */
36 
/*
 * Revision-control identification string for this source file.  It is
 * compiled out when building under lint.
 */
#ifndef	lint
static const char __idstring[] =
	"@(#)$Id: myri10ge.c,v 1.186 2009-06-29 13:47:22 gallatin Exp $";
#endif
41 
42 #define	MXGEFW_NDIS
43 #include "myri10ge_var.h"
44 #include "rss_eth_z8e.h"
45 #include "rss_ethp_z8e.h"
46 #include "mcp_gen_header.h"
47 
/*
 * MTU limits.  MYRI10GE_MAX_ETHER_MTU feeds the default value of
 * myri10ge_mtu when MYRICOM_PRIV is defined; the GLD limits select
 * MYRI10GE_DEFAULT_GLD_MTU.
 */
#define	MYRI10GE_MAX_ETHER_MTU 9014
#define	MYRI10GE_MAX_GLD_MTU	9000
#define	MYRI10GE_MIN_GLD_MTU	1500

/*
 * Interface run states.
 * NOTE(review): the state machine that uses these is outside this
 * portion of the file.
 */
#define	MYRI10GE_ETH_STOPPED 0
#define	MYRI10GE_ETH_STOPPING 1
#define	MYRI10GE_ETH_STARTING 2
#define	MYRI10GE_ETH_RUNNING 3
#define	MYRI10GE_ETH_OPEN_FAILED 4
#define	MYRI10GE_ETH_SUSPENDED_RUNNING 5

/*
 * Driver-wide tunables and their defaults.  Only a few of them are
 * consumed in the code near here; those are annotated individually.
 */

/* payload bytes per small rx buffer; MXGEFW_PAD is added when carving */
static int myri10ge_small_bytes = 510;
static int myri10ge_intr_coal_delay = 125;
static int myri10ge_flow_control = 1;
#if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
static int myri10ge_nvidia_ecrc_enable = 1;
#endif
static int myri10ge_mtu_override = 0;
static int myri10ge_tx_copylen = 512;
static int myri10ge_deassert_wait = 1;
/* non-zero enables extra printf() output, e.g. the parsed MAC string */
static int myri10ge_verbose = 0;
static int myri10ge_watchdog_reset = 0;
static int myri10ge_use_msix = 1;
static int myri10ge_max_slices = -1;
static int myri10ge_use_msi = 1;
int myri10ge_force_firmware = 0;
static boolean_t myri10ge_use_lso = B_TRUE;
static int myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int myri10ge_tx_hash = 1;
static int myri10ge_lro = 0;
static int myri10ge_lro_cnt = 8;
int myri10ge_lro_max_aggr = 2;
static int myri10ge_lso_copy = 0;
static mblk_t *myri10ge_send_wrapper(void *arg, mblk_t *mp);
/* tx DMA handles pre-allocated per slice by myri10ge_prepare_tx_ring() */
int myri10ge_tx_handles_initial = 128;

static 	kmutex_t myri10ge_param_lock;
static void* myri10ge_db_lastfree;
86 
/* autoconfiguration entry points referenced by the dev_ops below */
static int myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int myri10ge_quiesce(dev_info_t *dip);

/*
 * Device operations: only attach, detach and quiesce are supplied by
 * the driver; the remaining slots are nulldev/nodev/NULL.
 */
DDI_DEFINE_STREAM_OPS(myri10ge_ops, nulldev, nulldev, myri10ge_attach,
    myri10ge_detach, nodev, NULL, D_MP, NULL, myri10ge_quiesce);


/* loadable-driver linkage: module ops, description string, dev_ops */
static struct modldrv modldrv = {
	&mod_driverops,
	"Myricom 10G driver (10GbE)",
	&myri10ge_ops,
};


/* module linkage wrapping the single modldrv above */
static struct modlinkage modlinkage = {
	MODREV_1,
	{&modldrv, NULL},
};
106 
/* the all-ones (ff:ff:ff:ff:ff:ff) Ethernet address */
unsigned char myri10ge_broadcastaddr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
108 
/*
 * DMA attributes for miscellaneous allocations: 4KB aligned, a single
 * cookie (scatter/gather length 1), any 64-bit address.
 * NOTE(review): the users of this attribute set are outside this
 * portion of the file.
 */
static ddi_dma_attr_t myri10ge_misc_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	(uint64_t)0x7fffffff,		/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
123 
124 /*
125  * The Myri10GE NIC has the following constraints on receive buffers:
126  * 1) Buffers which cross a 4KB boundary must be aligned to 4KB
127  * 2) Buffers which are not aligned to 4KB must not cross a 4KB boundary
128  */
129 
/*
 * DMA attributes for receive buffers when myri10ge_mtu is 2048 or
 * larger (see myri10ge_add_jbuf()): 4KB aligned, single cookie.  The
 * same attribute set is also used for the 4096-byte transmit
 * copyblocks allocated in myri10ge_prepare_tx_ring().
 */
static ddi_dma_attr_t myri10ge_rx_jumbo_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
144 
/*
 * DMA attributes for receive buffers when myri10ge_mtu is below 2048
 * (see myri10ge_add_jbuf()).  On non-sparc builds it starts out with
 * 128-byte alignment and a 0xfff segment limit.
 *
 * This structure is deliberately not const: myri10ge_add_jbuf()
 * rewrites dma_attr_align to 4096 and dma_attr_seg to UINT64_MAX at
 * run time if a buffer crosses a 4KB boundary or an allocation fails.
 */
static ddi_dma_attr_t myri10ge_rx_std_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
#if defined sparc64 || defined __sparcv9
	(uint64_t)4096,			/* alignment */
#else
	(uint64_t)0x80,			/* alignment */
#endif
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
#if defined sparc64 || defined __sparcv9
	UINT64_MAX,			/* maximum segment size */
#else
	(uint64_t)0xfff,		/* maximum segment size */
#endif
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
167 
/*
 * DMA attributes for the transmit DMA handles created by
 * myri10ge_add_tx_handle(): byte alignment and an effectively
 * unlimited scatter/gather list length.
 */
static ddi_dma_attr_t myri10ge_tx_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)1,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	INT32_MAX,			/* scatter/gather list length */
	1,				/* granularity */
	0			/* attribute flags */
};
182 
/*
 * WC selects the data-ordering attribute used below: merging accesses
 * are allowed everywhere except on sparc, where strict ordering is
 * requested.
 */
#if defined sparc64 || defined __sparcv9
#define	WC 0
#else
#define	WC 1
#endif

/*
 * Device access attributes: no byte swapping, with the data ordering
 * chosen by WC.  Passed to ddi_dma_mem_alloc() for the buffers
 * allocated in this file.
 */
struct ddi_device_acc_attr myri10ge_dev_access_attr = {
	DDI_DEVICE_ATTR_V0,		/* version */
	DDI_NEVERSWAP_ACC,		/* endian flags */
#if WC
	DDI_MERGING_OK_ACC		/* data order */
#else
	DDI_STRICTORDER_ACC		/* data order */
#endif
};
198 
static void myri10ge_watchdog(void *arg);

/*
 * myri10ge_mtu is the size, in bytes, of each buffer placed in the
 * jumbo pool (see myri10ge_add_jbuf()): the largest Ethernet frame
 * plus the firmware pad and a VLAN tag.  Builds with MYRICOM_PRIV
 * default to jumbo frames; others default to standard frames.
 */
#ifdef MYRICOM_PRIV
int myri10ge_mtu = MYRI10GE_MAX_ETHER_MTU + MXGEFW_PAD + VLAN_TAGSZ;
#define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MAX_GLD_MTU
#else
int myri10ge_mtu = ETHERMAX + MXGEFW_PAD + VLAN_TAGSZ;
#define	MYRI10GE_DEFAULT_GLD_MTU	MYRI10GE_MIN_GLD_MTU
#endif
/*
 * NOTE(review): presumably the initial and maximum number of big
 * receive buffers per pool; the consumers are not in this portion of
 * the file -- confirm.
 */
int myri10ge_bigbufs_initial = 1024;
int myri10ge_bigbufs_max = 4096;
210 
211 
212 caddr_t
213 myri10ge_dma_alloc(dev_info_t *dip, size_t len,
214     ddi_dma_attr_t *attr, ddi_device_acc_attr_t  *accattr,
215     uint_t alloc_flags, int bind_flags, struct myri10ge_dma_stuff *dma,
216     int warn, int (*wait)(caddr_t))
217 {
218 	caddr_t  kaddr;
219 	size_t real_length;
220 	ddi_dma_cookie_t cookie;
221 	uint_t count;
222 	int err;
223 
224 	err = ddi_dma_alloc_handle(dip, attr, wait,
225 	    NULL, &dma->handle);
226 	if (err != DDI_SUCCESS) {
227 		if (warn)
228 			cmn_err(CE_WARN,
229 			    "myri10ge: ddi_dma_alloc_handle failed\n");
230 		goto abort_with_nothing;
231 	}
232 
233 	err = ddi_dma_mem_alloc(dma->handle, len, accattr, alloc_flags,
234 	    wait, NULL, &kaddr, &real_length,
235 	    &dma->acc_handle);
236 	if (err != DDI_SUCCESS) {
237 		if (warn)
238 			cmn_err(CE_WARN,
239 			    "myri10ge: ddi_dma_mem_alloc failed\n");
240 		goto abort_with_handle;
241 	}
242 
243 	err = ddi_dma_addr_bind_handle(dma->handle, NULL, kaddr, len,
244 	    bind_flags, wait, NULL, &cookie, &count);
245 
246 	if (err != DDI_SUCCESS) {
247 		if (warn)
248 			cmn_err(CE_WARN,
249 			    "myri10ge: ddi_dma_addr_bind_handle failed\n");
250 		goto abort_with_mem;
251 	}
252 
253 	if (count != 1) {
254 		if (warn)
255 			cmn_err(CE_WARN,
256 			    "myri10ge: got too many dma segments ");
257 		goto abort_with_bind;
258 	}
259 	dma->low = htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
260 	dma->high = htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
261 	return (kaddr);
262 
263 abort_with_bind:
264 	(void) ddi_dma_unbind_handle(dma->handle);
265 
266 abort_with_mem:
267 	ddi_dma_mem_free(&dma->acc_handle);
268 
269 abort_with_handle:
270 	ddi_dma_free_handle(&dma->handle);
271 abort_with_nothing:
272 	if (warn) {
273 		cmn_err(CE_WARN, "myri10ge: myri10ge_dma_alloc failed.\n  ");
274 		cmn_err(CE_WARN, "args: dip=%p len=0x%lx ddi_dma_attr=%p\n",
275 		    (void*) dip, len, (void*) attr);
276 		cmn_err(CE_WARN,
277 		    "args: ddi_device_acc_attr=%p  alloc_flags=0x%x\n",
278 		    (void*) accattr, alloc_flags);
279 		cmn_err(CE_WARN, "args: bind_flags=0x%x  dmastuff=%p",
280 		    bind_flags, (void*) dma);
281 	}
282 	return (NULL);
283 
284 }
285 
286 void
287 myri10ge_dma_free(struct myri10ge_dma_stuff *dma)
288 {
289 	(void) ddi_dma_unbind_handle(dma->handle);
290 	ddi_dma_mem_free(&dma->acc_handle);
291 	ddi_dma_free_handle(&dma->handle);
292 }
293 
/*
 * Copy "size" bytes from "from32" to "to" using one volatile 32-bit
 * store per word, in ascending address order.  Only size / 4 whole
 * words are copied; any remainder is ignored.
 */
static inline void
myri10ge_pio_copy32(void *to, uint32_t *from32, size_t size)
{
	volatile uint32_t *dst = (volatile uint32_t *)to;
	size_t nwords = size / 4;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from32[w];
}
307 
#if defined(_LP64)
/*
 * 64-bit flavour of myri10ge_pio_copy32(): copies size / 8 whole
 * 64-bit words with volatile stores in ascending address order.  Only
 * built for LP64 kernels.
 */
static inline void
myri10ge_pio_copy64(void *to, uint64_t *from64, size_t size)
{
	volatile uint64_t *dst = (volatile uint64_t *)to;
	size_t nwords = size / 8;
	size_t w;

	for (w = 0; w < nwords; w++)
		dst[w] = from64[w];
}
#endif
323 
/*
 * Copy memory from the host to the NIC one machine word at a time.
 *
 * "size" must be a multiple of the size of long (8 bytes on LP64
 * kernels, 4 bytes otherwise) and both "to" and "from" must be
 * naturally aligned for that word size.
 */
static inline void
myri10ge_pio_copy(void *to, void *from, size_t size)
{
#if defined(_LP64)
	ASSERT((size % 8) == 0);
	myri10ge_pio_copy64(to, (uint64_t *)from, size);
#else
	ASSERT((size % 4) == 0);
	myri10ge_pio_copy32(to, (uint32_t *)from, size);
#endif
}
341 
342 
/*
 * Due to various bugs in Solaris (especially bug 6186772, where the
 * TCP/UDP checksum is calculated incorrectly on mblk chains with more
 * than two elements), and the design bug where hardware checksums are
 * ignored on mblk chains with more than two elements, we need to
 * allocate a private pool of physically contiguous receive buffers.
 */
350 
351 static void
352 myri10ge_jpool_init(struct myri10ge_slice_state *ss)
353 {
354 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
355 
356 	bzero(jpool, sizeof (*jpool));
357 	mutex_init(&jpool->mtx, NULL, MUTEX_DRIVER,
358 	    ss->mgp->icookie);
359 	jpool->head = NULL;
360 }
361 
362 static void
363 myri10ge_jpool_fini(struct myri10ge_slice_state *ss)
364 {
365 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
366 
367 	if (jpool->head != NULL) {
368 		cmn_err(CE_WARN,
369 		    "%s: BUG! myri10ge_jpool_fini called on non-empty pool\n",
370 		    ss->mgp->name);
371 	}
372 	mutex_destroy(&jpool->mtx);
373 }
374 
375 
/*
 * Copy an array of eight mcp_kreq_ether_recv_t's to the mcp.  The copy
 * is done as two bursts of four descriptors each, so as to avoid
 * involving the software pio handler in the nic.  (The original note
 * says each burst is at most 32 bytes; that relies on the descriptor
 * size, which is declared elsewhere.)
 *
 * The low bit of the first descriptor's addr_low is set (in big-endian
 * form) while the bursts are written, and the real value is written to
 * the NIC only after the entire chunk is in place, which marks the
 * chunk valid.  The order of the stores and barriers below is
 * therefore significant.
 *
 * dst: first of eight descriptors in NIC memory
 * src: first of eight host (shadow) descriptors; src->addr_low is
 *      modified temporarily and restored before return
 */
static inline void
myri10ge_submit_8rx(mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src)
{
	/* tag the first descriptor so it is not yet taken as valid */
	src->addr_low |= BE_32(1);
	/* first burst: descriptors 0..3 */
	myri10ge_pio_copy(dst, src, 4 * sizeof (*src));
	mb();
	/* second burst: descriptors 4..7 */
	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
	mb();
	/* restore the real address and publish it last */
	src->addr_low &= ~(BE_32(1));
	dst->addr_low = src->addr_low;
	mb();
}
395 
396 static void
397 myri10ge_pull_jpool(struct myri10ge_slice_state *ss)
398 {
399 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
400 	struct myri10ge_jpool_entry *jtail, *j, *jfree;
401 	volatile uintptr_t *putp;
402 	uintptr_t put;
403 	int i;
404 
405 	/* find tail */
406 	jtail = NULL;
407 	if (jpool->head != NULL) {
408 		j = jpool->head;
409 		while (j->next != NULL)
410 			j = j->next;
411 		jtail = j;
412 	}
413 
414 	/*
415 	 * iterate over all per-CPU caches, and add contents into
416 	 * jpool
417 	 */
418 	for (i = 0; i < MYRI10GE_MAX_CPUS; i++) {
419 		/* take per-CPU free list */
420 		putp = (void *)&jpool->cpu[i & MYRI10GE_MAX_CPU_MASK].head;
421 		if (*putp == NULL)
422 			continue;
423 		put = atomic_swap_ulong(putp, 0);
424 		jfree = (struct myri10ge_jpool_entry *)put;
425 
426 		/* append to pool */
427 		if (jtail == NULL) {
428 			jpool->head = jfree;
429 		} else {
430 			jtail->next = jfree;
431 		}
432 		j = jfree;
433 		while (j->next != NULL)
434 			j = j->next;
435 		jtail = j;
436 	}
437 }
438 
439 /*
440  * Transfers buffers from the free pool to the nic
441  * Must be called holding the jpool mutex.
442  */
443 
444 static inline void
445 myri10ge_restock_jumbos(struct myri10ge_slice_state *ss)
446 {
447 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
448 	struct myri10ge_jpool_entry *j;
449 	myri10ge_rx_ring_t *rx;
450 	int i, idx, limit;
451 
452 	rx = &ss->rx_big;
453 	limit = ss->j_rx_cnt + (rx->mask + 1);
454 
455 	for (i = rx->cnt; i != limit; i++) {
456 		idx = i & (rx->mask);
457 		j = jpool->head;
458 		if (j == NULL) {
459 			myri10ge_pull_jpool(ss);
460 			j = jpool->head;
461 			if (j == NULL) {
462 				break;
463 			}
464 		}
465 		jpool->head = j->next;
466 		rx->info[idx].j = j;
467 		rx->shadow[idx].addr_low = j->dma.low;
468 		rx->shadow[idx].addr_high = j->dma.high;
469 		/* copy 4 descriptors (32-bytes) to the mcp at a time */
470 		if ((idx & 7) == 7) {
471 			myri10ge_submit_8rx(&rx->lanai[idx - 7],
472 			    &rx->shadow[idx - 7]);
473 		}
474 	}
475 	rx->cnt = i;
476 }
477 
478 /*
479  * Transfer buffers from the nic to the free pool.
480  * Should be called holding the jpool mutex
481  */
482 
483 static inline void
484 myri10ge_unstock_jumbos(struct myri10ge_slice_state *ss)
485 {
486 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
487 	struct myri10ge_jpool_entry *j;
488 	myri10ge_rx_ring_t *rx;
489 	int i;
490 
491 	mutex_enter(&jpool->mtx);
492 	rx = &ss->rx_big;
493 
494 	for (i = 0; i < rx->mask + 1; i++) {
495 		j = rx->info[i].j;
496 		rx->info[i].j = NULL;
497 		if (j == NULL)
498 			continue;
499 		j->next = jpool->head;
500 		jpool->head = j;
501 	}
502 	mutex_exit(&jpool->mtx);
503 
504 }
505 
506 
507 /*
508  * Free routine which is called when the mblk allocated via
509  * esballoc() is freed.   Here we return the jumbo buffer
510  * to the free pool, and possibly pass some jumbo buffers
511  * to the nic
512  */
513 
514 static void
515 myri10ge_jfree_rtn(void *arg)
516 {
517 	struct myri10ge_jpool_entry *j = (struct myri10ge_jpool_entry *)arg;
518 	struct myri10ge_jpool_stuff *jpool;
519 	volatile uintptr_t *putp;
520 	uintptr_t old, new;
521 
522 	jpool = &j->ss->jpool;
523 
524 	/* prepend buffer locklessly to per-CPU freelist */
525 	putp = (void *)&jpool->cpu[CPU->cpu_seqid & MYRI10GE_MAX_CPU_MASK].head;
526 	new = (uintptr_t)j;
527 	do {
528 		old = *putp;
529 		j->next = (void *)old;
530 	} while (atomic_cas_ulong(putp, old, new) != old);
531 }
532 
533 static void
534 myri10ge_remove_jbuf(struct myri10ge_jpool_entry *j)
535 {
536 	(void) ddi_dma_unbind_handle(j->dma_handle);
537 	ddi_dma_mem_free(&j->acc_handle);
538 	ddi_dma_free_handle(&j->dma_handle);
539 	kmem_free(j, sizeof (*j));
540 }
541 
542 
/*
 * Allocates one physically contiguous receive buffer of myri10ge_mtu
 * bytes and adds it to the slice's jumbo buffer pool.
 *
 * The DMA attributes are chosen by buffer size: the "std" attributes
 * for myri10ge_mtu < 2048, the "jumbo" attributes otherwise.  When the
 * std attributes are in use with an alignment other than 4096, two
 * situations switch them, permanently and for all later callers since
 * the attribute structure is a file-scope global, to 4096-byte
 * alignment with an unlimited segment size, after which the allocation
 * is retried:
 *   - the buffer obtained starts off a 4KB boundary and crosses one
 *   - any of the DDI allocation steps fails
 *
 * Returns 0 on success, otherwise the status of the DDI call that
 * failed.
 */

static int
myri10ge_add_jbuf(struct myri10ge_slice_state *ss)
{
	struct myri10ge_jpool_entry *j;
	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
	ddi_dma_attr_t *rx_dma_attr;
	size_t real_length;
	ddi_dma_cookie_t cookie;
	uint_t count;
	int err;

	if (myri10ge_mtu < 2048)
		rx_dma_attr = &myri10ge_rx_std_dma_attr;
	else
		rx_dma_attr = &myri10ge_rx_jumbo_dma_attr;

again:
	/* KM_SLEEP: the entry allocation itself does not fail */
	j = (struct myri10ge_jpool_entry *)
	    kmem_alloc(sizeof (*j), KM_SLEEP);
	err = ddi_dma_alloc_handle(ss->mgp->dip, rx_dma_attr,
	    DDI_DMA_DONTWAIT, NULL, &j->dma_handle);
	if (err != DDI_SUCCESS)
		goto abort_with_j;

	err = ddi_dma_mem_alloc(j->dma_handle, myri10ge_mtu,
	    &myri10ge_dev_access_attr,  DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    NULL, &j->buf, &real_length, &j->acc_handle);
	if (err != DDI_SUCCESS)
		goto abort_with_handle;

	/* bind the whole allocation (real_length), not just myri10ge_mtu */
	err = ddi_dma_addr_bind_handle(j->dma_handle, NULL, j->buf,
	    real_length, DDI_DMA_READ|DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
	    NULL, &cookie, &count);
	if (err != DDI_SUCCESS)
		goto abort_with_mem;

	/*
	 * Make certain std MTU buffers do not cross a 4KB boundary:
	 *
	 * Setting dma_attr_align=4096 will do this, but the system
	 * will only allocate 1 RX buffer per 4KB page, rather than 2.
	 * Setting dma_attr_granular=4096 *seems* to work around this,
	 * but I'm paranoid about future systems no longer honoring
	 * this, so fall back to the safe, but memory wasting way if a
	 * buffer crosses a 4KB boundary.
	 */

	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
	    rx_dma_attr->dma_attr_align != 4096) {
		uint32_t start, end;

		/* only the low 32 bits matter for a 4KB page comparison */
		start = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
		end = start + myri10ge_mtu;
		if (((end >> 12) != (start >> 12)) && (start & 4095U)) {
			printf("std buffer crossed a 4KB boundary!\n");
			/* discard this buffer and retry with 4KB alignment */
			myri10ge_remove_jbuf(j);
			rx_dma_attr->dma_attr_align = 4096;
			rx_dma_attr->dma_attr_seg = UINT64_MAX;
			goto again;
		}
	}

	/* the DMA address is kept as two 32-bit halves, via htonl() */
	j->dma.low =
	    htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
	j->dma.high =
	    htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
	j->ss = ss;


	/* free routine that returns this buffer to the pool */
	j->free_func.free_func = myri10ge_jfree_rtn;
	j->free_func.free_arg = (char *)j;
	mutex_enter(&jpool->mtx);
	j->next = jpool->head;
	jpool->head = j;
	jpool->num_alloc++;
	mutex_exit(&jpool->mtx);
	return (0);

abort_with_mem:
	ddi_dma_mem_free(&j->acc_handle);

abort_with_handle:
	ddi_dma_free_handle(&j->dma_handle);

abort_with_j:
	kmem_free(j, sizeof (*j));

	/*
	 * If an allocation failed, perhaps it failed because it could
	 * not satisfy the relaxed alignment/segment constraints.  Switch
	 * to 4KB alignment and try again.  (The message below says
	 * "gran=1", but what is changed is the alignment and the
	 * segment size.)  Since the alignment is 4096 afterwards, this
	 * retry happens at most once.
	 */
	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
	    rx_dma_attr->dma_attr_align != 4096) {
			cmn_err(CE_NOTE,
			    "!alloc failed, reverting to gran=1\n");
			rx_dma_attr->dma_attr_align = 4096;
			rx_dma_attr->dma_attr_seg = UINT64_MAX;
			goto again;
	}
	return (err);
}
650 
651 static int
652 myri10ge_jfree_cnt(struct myri10ge_jpool_stuff *jpool)
653 {
654 	int i;
655 	struct myri10ge_jpool_entry *j;
656 
657 	mutex_enter(&jpool->mtx);
658 	j = jpool->head;
659 	i = 0;
660 	while (j != NULL) {
661 		i++;
662 		j = j->next;
663 	}
664 	mutex_exit(&jpool->mtx);
665 	return (i);
666 }
667 
668 static int
669 myri10ge_add_jbufs(struct myri10ge_slice_state *ss, int num, int total)
670 {
671 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
672 	int allocated = 0;
673 	int err;
674 	int needed;
675 
676 	/*
677 	 * if total is set, user wants "num" jbufs in the pool,
678 	 * otherwise the user wants to "num" additional jbufs
679 	 * added to the pool
680 	 */
681 	if (total && jpool->num_alloc) {
682 		allocated = myri10ge_jfree_cnt(jpool);
683 		needed = num - allocated;
684 	} else {
685 		needed = num;
686 	}
687 
688 	while (needed > 0) {
689 		needed--;
690 		err = myri10ge_add_jbuf(ss);
691 		if (err == 0) {
692 			allocated++;
693 		}
694 	}
695 	return (allocated);
696 }
697 
698 static void
699 myri10ge_remove_jbufs(struct myri10ge_slice_state *ss)
700 {
701 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
702 	struct myri10ge_jpool_entry *j;
703 
704 	mutex_enter(&jpool->mtx);
705 	myri10ge_pull_jpool(ss);
706 	while (jpool->head != NULL) {
707 		jpool->num_alloc--;
708 		j = jpool->head;
709 		jpool->head = j->next;
710 		myri10ge_remove_jbuf(j);
711 	}
712 	mutex_exit(&jpool->mtx);
713 }
714 
715 static void
716 myri10ge_carve_up_jbufs_into_small_ring(struct myri10ge_slice_state *ss)
717 {
718 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
719 	struct myri10ge_jpool_entry *j = NULL;
720 	caddr_t ptr;
721 	uint32_t dma_low, dma_high;
722 	int idx, len;
723 	unsigned int alloc_size;
724 
725 	dma_low = dma_high = len = 0;
726 	alloc_size = myri10ge_small_bytes + MXGEFW_PAD;
727 	ptr = NULL;
728 	for (idx = 0; idx < ss->rx_small.mask + 1; idx++) {
729 		/* Allocate a jumbo frame and carve it into small frames */
730 		if (len < alloc_size) {
731 			mutex_enter(&jpool->mtx);
732 			/* remove jumbo from freelist */
733 			j = jpool->head;
734 			jpool->head = j->next;
735 			/* place it onto small list */
736 			j->next = ss->small_jpool;
737 			ss->small_jpool = j;
738 			mutex_exit(&jpool->mtx);
739 			len = myri10ge_mtu;
740 			dma_low = ntohl(j->dma.low);
741 			dma_high = ntohl(j->dma.high);
742 			ptr = j->buf;
743 		}
744 		ss->rx_small.info[idx].ptr = ptr;
745 		ss->rx_small.shadow[idx].addr_low = htonl(dma_low);
746 		ss->rx_small.shadow[idx].addr_high = htonl(dma_high);
747 		len -= alloc_size;
748 		ptr += alloc_size;
749 		dma_low += alloc_size;
750 	}
751 }
752 
753 /*
754  * Return the jumbo bufs we carved up for small to the jumbo pool
755  */
756 
757 static void
758 myri10ge_release_small_jbufs(struct myri10ge_slice_state *ss)
759 {
760 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
761 	struct myri10ge_jpool_entry *j = NULL;
762 
763 	mutex_enter(&jpool->mtx);
764 	while (ss->small_jpool != NULL) {
765 		j = ss->small_jpool;
766 		ss->small_jpool = j->next;
767 		j->next = jpool->head;
768 		jpool->head = j;
769 	}
770 	mutex_exit(&jpool->mtx);
771 	ss->jbufs_for_smalls = 0;
772 }
773 
774 static int
775 myri10ge_add_tx_handle(struct myri10ge_slice_state *ss)
776 {
777 	myri10ge_tx_ring_t *tx = &ss->tx;
778 	struct myri10ge_priv *mgp = ss->mgp;
779 	struct myri10ge_tx_dma_handle *handle;
780 	int err;
781 
782 	handle = kmem_zalloc(sizeof (*handle), KM_SLEEP);
783 	err = ddi_dma_alloc_handle(mgp->dip,
784 	    &myri10ge_tx_dma_attr,
785 	    DDI_DMA_SLEEP, NULL,
786 	    &handle->h);
787 	if (err) {
788 		static int limit = 0;
789 		if (limit == 0)
790 			cmn_err(CE_WARN, "%s: Falled to alloc tx dma handle\n",
791 			    mgp->name);
792 		limit++;
793 		kmem_free(handle, sizeof (*handle));
794 		return (err);
795 	}
796 	mutex_enter(&tx->handle_lock);
797 	MYRI10GE_SLICE_STAT_INC(tx_handles_alloced);
798 	handle->next = tx->free_tx_handles;
799 	tx->free_tx_handles = handle;
800 	mutex_exit(&tx->handle_lock);
801 	return (DDI_SUCCESS);
802 }
803 
804 static void
805 myri10ge_remove_tx_handles(struct myri10ge_slice_state *ss)
806 {
807 	myri10ge_tx_ring_t *tx = &ss->tx;
808 	struct myri10ge_tx_dma_handle *handle;
809 	mutex_enter(&tx->handle_lock);
810 
811 	handle = tx->free_tx_handles;
812 	while (handle != NULL) {
813 		tx->free_tx_handles = handle->next;
814 		ddi_dma_free_handle(&handle->h);
815 		kmem_free(handle, sizeof (*handle));
816 		handle = tx->free_tx_handles;
817 		MYRI10GE_SLICE_STAT_DEC(tx_handles_alloced);
818 	}
819 	mutex_exit(&tx->handle_lock);
820 	if (MYRI10GE_SLICE_STAT(tx_handles_alloced) != 0) {
821 		cmn_err(CE_WARN, "%s: %d tx dma handles allocated at close\n",
822 		    ss->mgp->name,
823 		    (int)MYRI10GE_SLICE_STAT(tx_handles_alloced));
824 	}
825 }
826 
827 static void
828 myri10ge_free_tx_handles(myri10ge_tx_ring_t *tx,
829     struct myri10ge_tx_dma_handle_head *list)
830 {
831 	mutex_enter(&tx->handle_lock);
832 	list->tail->next = tx->free_tx_handles;
833 	tx->free_tx_handles = list->head;
834 	mutex_exit(&tx->handle_lock);
835 }
836 
837 static void
838 myri10ge_free_tx_handle_slist(myri10ge_tx_ring_t *tx,
839     struct myri10ge_tx_dma_handle *handle)
840 {
841 	struct myri10ge_tx_dma_handle_head list;
842 
843 	if (handle == NULL)
844 		return;
845 	list.head = handle;
846 	list.tail = handle;
847 	while (handle != NULL) {
848 		list.tail = handle;
849 		handle = handle->next;
850 	}
851 	myri10ge_free_tx_handles(tx, &list);
852 }
853 
854 static int
855 myri10ge_alloc_tx_handles(struct myri10ge_slice_state *ss, int count,
856     struct myri10ge_tx_dma_handle **ret)
857 {
858 	myri10ge_tx_ring_t *tx = &ss->tx;
859 	struct myri10ge_tx_dma_handle *handle;
860 	int err, i;
861 
862 	mutex_enter(&tx->handle_lock);
863 	for (i = 0; i < count; i++) {
864 		handle = tx->free_tx_handles;
865 		while (handle == NULL) {
866 			mutex_exit(&tx->handle_lock);
867 			err = myri10ge_add_tx_handle(ss);
868 			if (err != DDI_SUCCESS) {
869 				goto abort_with_handles;
870 			}
871 			mutex_enter(&tx->handle_lock);
872 			handle = tx->free_tx_handles;
873 		}
874 		tx->free_tx_handles = handle->next;
875 		handle->next = *ret;
876 		*ret = handle;
877 	}
878 	mutex_exit(&tx->handle_lock);
879 	return (DDI_SUCCESS);
880 
881 abort_with_handles:
882 	myri10ge_free_tx_handle_slist(tx, *ret);
883 	return (err);
884 }
885 
886 
887 /*
888  * Frees DMA resources associated with the send ring
889  */
890 static void
891 myri10ge_unprepare_tx_ring(struct myri10ge_slice_state *ss)
892 {
893 	myri10ge_tx_ring_t *tx;
894 	struct myri10ge_tx_dma_handle_head handles;
895 	size_t bytes;
896 	int idx;
897 
898 	tx = &ss->tx;
899 	handles.head = NULL;
900 	handles.tail = NULL;
901 	for (idx = 0; idx < ss->tx.mask + 1; idx++) {
902 		if (tx->info[idx].m) {
903 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
904 			handles.head = tx->info[idx].handle;
905 			if (handles.tail == NULL)
906 				handles.tail = tx->info[idx].handle;
907 			freeb(tx->info[idx].m);
908 			tx->info[idx].m = 0;
909 			tx->info[idx].handle = 0;
910 		}
911 		tx->cp[idx].va = NULL;
912 		myri10ge_dma_free(&tx->cp[idx].dma);
913 	}
914 	bytes = sizeof (*tx->cp) * (tx->mask + 1);
915 	kmem_free(tx->cp, bytes);
916 	tx->cp = NULL;
917 	if (handles.head != NULL)
918 		myri10ge_free_tx_handles(tx, &handles);
919 	myri10ge_remove_tx_handles(ss);
920 }
921 
922 /*
923  * Allocates DMA handles associated with the send ring
924  */
925 static inline int
926 myri10ge_prepare_tx_ring(struct myri10ge_slice_state *ss)
927 {
928 	struct myri10ge_tx_dma_handle *handles;
929 	int h;
930 	size_t bytes;
931 
932 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
933 	ss->tx.cp = kmem_zalloc(bytes, KM_SLEEP);
934 	if (ss->tx.cp == NULL) {
935 		cmn_err(CE_WARN,
936 		    "%s: Failed to allocate tx copyblock storage\n",
937 		    ss->mgp->name);
938 		return (DDI_FAILURE);
939 	}
940 
941 
942 	/* allocate the TX copyblocks */
943 	for (h = 0; h < ss->tx.mask + 1; h++) {
944 		ss->tx.cp[h].va = myri10ge_dma_alloc(ss->mgp->dip,
945 		    4096, &myri10ge_rx_jumbo_dma_attr,
946 		    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
947 		    DDI_DMA_WRITE|DDI_DMA_STREAMING, &ss->tx.cp[h].dma, 1,
948 		    DDI_DMA_DONTWAIT);
949 		if (ss->tx.cp[h].va == NULL) {
950 			cmn_err(CE_WARN, "%s: Failed to allocate tx "
951 			    "copyblock %d\n", ss->mgp->name, h);
952 			goto abort_with_copyblocks;
953 		}
954 	}
955 	/* pre-allocate transmit handles */
956 	handles = NULL;
957 	(void) myri10ge_alloc_tx_handles(ss, myri10ge_tx_handles_initial,
958 	    &handles);
959 	if (handles != NULL)
960 		myri10ge_free_tx_handle_slist(&ss->tx, handles);
961 
962 	return (DDI_SUCCESS);
963 
964 abort_with_copyblocks:
965 	while (h > 0)  {
966 		h--;
967 		myri10ge_dma_free(&ss->tx.cp[h].dma);
968 	}
969 
970 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
971 	kmem_free(ss->tx.cp, bytes);
972 	ss->tx.cp = NULL;
973 	return (DDI_FAILURE);
974 }
975 
976 /*
977  * The eeprom strings on the lanaiX have the format
978  * SN=x\0
979  * MAC=x:x:x:x:x:x\0
980  * PT:ddd mmm xx xx:xx:xx xx\0
981  * PV:ddd mmm xx xx:xx:xx xx\0
982  */
983 static int
984 myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
985 {
986 #define	MYRI10GE_NEXT_STRING(p) while (ptr < limit && *ptr++)
987 #define	myri10ge_digit(c) (((c) >= '0' && (c) <= '9') ? ((c) - '0') :	\
988 		(((c) >= 'A' && (c) <= 'F') ? (10 + (c) - 'A') :	\
989 		(((c) >= 'a' && (c) <= 'f') ? (10 + (c) - 'a') : -1)))
990 
991 	char *ptr, *limit;
992 	int i, hv, lv;
993 
994 	ptr = mgp->eeprom_strings;
995 	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
996 
997 	while (*ptr != '\0' && ptr < limit) {
998 		if (memcmp(ptr, "MAC=", 4) == 0) {
999 			ptr += 4;
1000 			if (myri10ge_verbose)
1001 				printf("%s: mac address = %s\n", mgp->name,
1002 				    ptr);
1003 			mgp->mac_addr_string = ptr;
1004 			for (i = 0; i < 6; i++) {
1005 				if ((ptr + 2) > limit)
1006 					goto abort;
1007 
1008 				if (*(ptr+1) == ':') {
1009 					hv = 0;
1010 					lv = myri10ge_digit(*ptr); ptr++;
1011 				} else {
1012 					hv = myri10ge_digit(*ptr); ptr++;
1013 					lv = myri10ge_digit(*ptr); ptr++;
1014 				}
1015 				mgp->mac_addr[i] = (hv << 4) | lv;
1016 				ptr++;
1017 			}
1018 		}
1019 		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
1020 			ptr += 3;
1021 			mgp->sn_str = (char *)ptr;
1022 		}
1023 		if (memcmp((const void *)ptr, "PC=", 3) == 0) {
1024 			ptr += 3;
1025 			mgp->pc_str = (char *)ptr;
1026 		}
1027 		MYRI10GE_NEXT_STRING(ptr);
1028 	}
1029 
1030 	return (0);
1031 
1032 abort:
1033 	cmn_err(CE_WARN, "%s: failed to parse eeprom_strings", mgp->name);
1034 	return (ENXIO);
1035 }
1036 
1037 
1038 /*
1039  * Determine the register set containing the PCI resource we
1040  * want to map: the memory-mappable part of the interface. We do
1041  * this by scanning the DDI "reg" property of the interface,
1042  * which is an array of mx_ddi_reg_set structures.
1043  */
1044 static int
1045 myri10ge_reg_set(dev_info_t *dip, int *reg_set, int *span,
1046     unsigned long *busno, unsigned long *devno,
1047     unsigned long *funcno)
1048 {
1049 
1050 #define	REGISTER_NUMBER(ip)	(ip[0] >>  0 & 0xff)
1051 #define	FUNCTION_NUMBER(ip)	(ip[0] >>  8 & 0x07)
1052 #define	DEVICE_NUMBER(ip)	(ip[0] >> 11 & 0x1f)
1053 #define	BUS_NUMBER(ip)		(ip[0] >> 16 & 0xff)
1054 #define	ADDRESS_SPACE(ip)	(ip[0] >> 24 & 0x03)
1055 #define	PCI_ADDR_HIGH(ip)	(ip[1])
1056 #define	PCI_ADDR_LOW(ip) 	(ip[2])
1057 #define	PCI_SPAN_HIGH(ip)	(ip[3])
1058 #define	PCI_SPAN_LOW(ip)	(ip[4])
1059 
1060 #define	MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
1061 #define	MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3
1062 
1063 	int *data, i, *rs;
1064 	uint32_t nelementsp;
1065 
1066 #ifdef MYRI10GE_REGSET_VERBOSE
1067 	char *address_space_name[] = { "Configuration Space",
1068 					"I/O Space",
1069 					"32-bit Memory Space",
1070 					"64-bit Memory Space"
1071 	};
1072 #endif
1073 
1074 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1075 	    "reg", &data, &nelementsp) != DDI_SUCCESS) {
1076 		printf("Could not determine register set.\n");
1077 		return (ENXIO);
1078 	}
1079 
1080 #ifdef MYRI10GE_REGSET_VERBOSE
1081 	printf("There are %d register sets.\n", nelementsp / 5);
1082 #endif
1083 	if (!nelementsp) {
1084 		printf("Didn't find any \"reg\" properties.\n");
1085 		ddi_prop_free(data);
1086 		return (ENODEV);
1087 	}
1088 
1089 	/* Scan for the register number. */
1090 	rs = &data[0];
1091 	*busno = BUS_NUMBER(rs);
1092 	*devno = DEVICE_NUMBER(rs);
1093 	*funcno = FUNCTION_NUMBER(rs);
1094 
1095 #ifdef MYRI10GE_REGSET_VERBOSE
1096 	printf("*** Scanning for register number.\n");
1097 #endif
1098 	for (i = 0; i < nelementsp / 5; i++) {
1099 		rs = &data[5 * i];
1100 #ifdef MYRI10GE_REGSET_VERBOSE
1101 		printf("Examining register set %d:\n", i);
1102 		printf("  Register number = %d.\n", REGISTER_NUMBER(rs));
1103 		printf("  Function number = %d.\n", FUNCTION_NUMBER(rs));
1104 		printf("  Device number   = %d.\n", DEVICE_NUMBER(rs));
1105 		printf("  Bus number      = %d.\n", BUS_NUMBER(rs));
1106 		printf("  Address space   = %d (%s ).\n", ADDRESS_SPACE(rs),
1107 		    address_space_name[ADDRESS_SPACE(rs)]);
1108 		printf("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH(rs),
1109 		    PCI_ADDR_LOW(rs));
1110 		printf("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH(rs),
1111 		    PCI_SPAN_LOW(rs));
1112 #endif
1113 		/* We are looking for a memory property. */
1114 
1115 		if (ADDRESS_SPACE(rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE ||
1116 		    ADDRESS_SPACE(rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
1117 			*reg_set = i;
1118 
1119 #ifdef MYRI10GE_REGSET_VERBOSE
1120 			printf("%s uses register set %d.\n",
1121 			    address_space_name[ADDRESS_SPACE(rs)], *reg_set);
1122 #endif
1123 
1124 			*span = (PCI_SPAN_LOW(rs));
1125 #ifdef MYRI10GE_REGSET_VERBOSE
1126 			printf("Board span is 0x%x\n", *span);
1127 #endif
1128 			break;
1129 		}
1130 	}
1131 
1132 	ddi_prop_free(data);
1133 
1134 	/* If no match, fail. */
1135 	if (i >= nelementsp / 5) {
1136 		return (EIO);
1137 	}
1138 
1139 	return (0);
1140 }
1141 
1142 
1143 static int
1144 myri10ge_load_firmware_from_zlib(struct myri10ge_priv *mgp, uint32_t *limit)
1145 {
1146 	void *inflate_buffer;
1147 	int rv, status;
1148 	size_t sram_size = mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE;
1149 	size_t destlen;
1150 	mcp_gen_header_t *hdr;
1151 	unsigned hdr_offset, i;
1152 
1153 
1154 	*limit = 0; /* -Wuninitialized */
1155 	status = 0;
1156 
1157 	inflate_buffer = kmem_zalloc(sram_size, KM_NOSLEEP);
1158 	if (!inflate_buffer) {
1159 		cmn_err(CE_WARN,
1160 		    "%s: Could not allocate buffer to inflate mcp\n",
1161 		    mgp->name);
1162 		return (ENOMEM);
1163 	}
1164 
1165 	destlen = sram_size;
1166 	rv = z_uncompress(inflate_buffer, &destlen, mgp->eth_z8e,
1167 	    mgp->eth_z8e_length);
1168 
1169 	if (rv != Z_OK) {
1170 		cmn_err(CE_WARN, "%s: Could not inflate mcp: %s\n",
1171 		    mgp->name, z_strerror(rv));
1172 		status = ENXIO;
1173 		goto abort;
1174 	}
1175 
1176 	*limit = (uint32_t)destlen;
1177 
1178 	hdr_offset = htonl(*(uint32_t *)(void *)((char *)inflate_buffer +
1179 	    MCP_HEADER_PTR_OFFSET));
1180 	hdr = (void *)((char *)inflate_buffer + hdr_offset);
1181 	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
1182 		cmn_err(CE_WARN, "%s: Bad firmware type: 0x%x\n", mgp->name,
1183 		    ntohl(hdr->mcp_type));
1184 		status = EIO;
1185 		goto abort;
1186 	}
1187 
1188 	/* save firmware version for kstat */
1189 	(void) strncpy(mgp->fw_version, hdr->version, sizeof (mgp->fw_version));
1190 	if (myri10ge_verbose)
1191 		printf("%s: firmware id: %s\n", mgp->name, hdr->version);
1192 
1193 	/* Copy the inflated firmware to NIC SRAM. */
1194 	for (i = 0; i < *limit; i += 256) {
1195 		myri10ge_pio_copy((char *)mgp->sram + MYRI10GE_FW_OFFSET + i,
1196 		    (char *)inflate_buffer + i,
1197 		    min(256U, (unsigned)(*limit - i)));
1198 		mb();
1199 		(void) *(int *)(void *)mgp->sram;
1200 		mb();
1201 	}
1202 
1203 abort:
1204 	kmem_free(inflate_buffer, sram_size);
1205 
1206 	return (status);
1207 
1208 }
1209 
1210 
1211 int
1212 myri10ge_send_cmd(struct myri10ge_priv *mgp, uint32_t cmd,
1213 		myri10ge_cmd_t *data)
1214 {
1215 	mcp_cmd_t *buf;
1216 	char buf_bytes[sizeof (*buf) + 8];
1217 	volatile mcp_cmd_response_t *response = mgp->cmd;
1218 	volatile char *cmd_addr =
1219 	    (volatile char *)mgp->sram + MXGEFW_ETH_CMD;
1220 	int sleep_total = 0;
1221 
1222 	/* ensure buf is aligned to 8 bytes */
1223 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1224 
1225 	buf->data0 = htonl(data->data0);
1226 	buf->data1 = htonl(data->data1);
1227 	buf->data2 = htonl(data->data2);
1228 	buf->cmd = htonl(cmd);
1229 	buf->response_addr.low = mgp->cmd_dma.low;
1230 	buf->response_addr.high = mgp->cmd_dma.high;
1231 	mutex_enter(&mgp->cmd_lock);
1232 	response->result = 0xffffffff;
1233 	mb();
1234 
1235 	myri10ge_pio_copy((void *)cmd_addr, buf, sizeof (*buf));
1236 
1237 	/* wait up to 20ms */
1238 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
1239 		mb();
1240 		if (response->result != 0xffffffff) {
1241 			if (response->result == 0) {
1242 				data->data0 = ntohl(response->data);
1243 				mutex_exit(&mgp->cmd_lock);
1244 				return (0);
1245 			} else if (ntohl(response->result)
1246 			    == MXGEFW_CMD_UNKNOWN) {
1247 				mutex_exit(&mgp->cmd_lock);
1248 				return (ENOSYS);
1249 			} else if (ntohl(response->result)
1250 			    == MXGEFW_CMD_ERROR_UNALIGNED) {
1251 				mutex_exit(&mgp->cmd_lock);
1252 				return (E2BIG);
1253 			} else {
1254 				cmn_err(CE_WARN,
1255 				    "%s: command %d failed, result = %d\n",
1256 				    mgp->name, cmd, ntohl(response->result));
1257 				mutex_exit(&mgp->cmd_lock);
1258 				return (ENXIO);
1259 			}
1260 		}
1261 		drv_usecwait(1000);
1262 	}
1263 	mutex_exit(&mgp->cmd_lock);
1264 	cmn_err(CE_WARN, "%s: command %d timed out, result = %d\n",
1265 	    mgp->name, cmd, ntohl(response->result));
1266 	return (EAGAIN);
1267 }
1268 
1269 /*
1270  * Enable or disable periodic RDMAs from the host to make certain
1271  * chipsets resend dropped PCIe messages
1272  */
1273 
1274 static void
1275 myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
1276 {
1277 	char buf_bytes[72];
1278 	volatile uint32_t *confirm;
1279 	volatile char *submit;
1280 	uint32_t *buf;
1281 	int i;
1282 
1283 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1284 
1285 	/* clear confirmation addr */
1286 	confirm = (volatile uint32_t *)mgp->cmd;
1287 	*confirm = 0;
1288 	mb();
1289 
1290 	/*
1291 	 * send an rdma command to the PCIe engine, and wait for the
1292 	 * response in the confirmation address.  The firmware should
1293 	 *  write a -1 there to indicate it is alive and well
1294 	 */
1295 
1296 	buf[0] = mgp->cmd_dma.high;		/* confirm addr MSW */
1297 	buf[1] = mgp->cmd_dma.low;		/* confirm addr LSW */
1298 	buf[2] = htonl(0xffffffff);		/* confirm data */
1299 	buf[3] = htonl(mgp->cmd_dma.high); 	/* dummy addr MSW */
1300 	buf[4] = htonl(mgp->cmd_dma.low); 	/* dummy addr LSW */
1301 	buf[5] = htonl(enable);			/* enable? */
1302 
1303 
1304 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_DUMMY_RDMA);
1305 
1306 	myri10ge_pio_copy((char *)submit, buf, 64);
1307 	mb();
1308 	drv_usecwait(1000);
1309 	mb();
1310 	i = 0;
1311 	while (*confirm != 0xffffffff && i < 20) {
1312 		drv_usecwait(1000);
1313 		i++;
1314 	}
1315 	if (*confirm != 0xffffffff) {
1316 		cmn_err(CE_WARN, "%s: dummy rdma %s failed (%p = 0x%x)",
1317 		    mgp->name,
1318 		    (enable ? "enable" : "disable"), (void*) confirm, *confirm);
1319 	}
1320 }
1321 
1322 static int
1323 myri10ge_load_firmware(struct myri10ge_priv *mgp)
1324 {
1325 	myri10ge_cmd_t cmd;
1326 	volatile uint32_t *confirm;
1327 	volatile char *submit;
1328 	char buf_bytes[72];
1329 	uint32_t *buf, size;
1330 	int status, i;
1331 
1332 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1333 
1334 	status = myri10ge_load_firmware_from_zlib(mgp, &size);
1335 	if (status) {
1336 		cmn_err(CE_WARN, "%s: firmware loading failed\n", mgp->name);
1337 		return (status);
1338 	}
1339 
1340 	/* clear confirmation addr */
1341 	confirm = (volatile uint32_t *)mgp->cmd;
1342 	*confirm = 0;
1343 	mb();
1344 
1345 	/*
1346 	 * send a reload command to the bootstrap MCP, and wait for the
1347 	 * response in the confirmation address.  The firmware should
1348 	 * write a -1 there to indicate it is alive and well
1349 	 */
1350 
1351 	buf[0] = mgp->cmd_dma.high;	/* confirm addr MSW */
1352 	buf[1] = mgp->cmd_dma.low;	/* confirm addr LSW */
1353 	buf[2] = htonl(0xffffffff);	/* confirm data */
1354 
1355 	/*
1356 	 * FIX: All newest firmware should un-protect the bottom of
1357 	 * the sram before handoff. However, the very first interfaces
1358 	 * do not. Therefore the handoff copy must skip the first 8 bytes
1359 	 */
1360 	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8); /* where the code starts */
1361 	buf[4] = htonl(size - 8); 	/* length of code */
1362 	buf[5] = htonl(8);		/* where to copy to */
1363 	buf[6] = htonl(0);		/* where to jump to */
1364 
1365 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_HANDOFF);
1366 
1367 	myri10ge_pio_copy((char *)submit, buf, 64);
1368 	mb();
1369 	drv_usecwait(1000);
1370 	mb();
1371 	i = 0;
1372 	while (*confirm != 0xffffffff && i < 1000) {
1373 		drv_usecwait(1000);
1374 		i++;
1375 	}
1376 	if (*confirm != 0xffffffff) {
1377 		cmn_err(CE_WARN, "%s: handoff failed (%p = 0x%x)",
1378 		    mgp->name, (void *) confirm, *confirm);
1379 
1380 		return (ENXIO);
1381 	}
1382 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1383 	if (status != 0) {
1384 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_GET_RX_RING_SIZE\n",
1385 		    mgp->name);
1386 		return (ENXIO);
1387 	}
1388 
1389 	mgp->max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
1390 	myri10ge_dummy_rdma(mgp, 1);
1391 	return (0);
1392 }
1393 
1394 static int
1395 myri10ge_m_unicst(void *arg, const uint8_t *addr)
1396 {
1397 	struct myri10ge_priv *mgp = arg;
1398 	myri10ge_cmd_t cmd;
1399 	int status;
1400 
1401 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1402 	    | (addr[2] << 8) | addr[3]);
1403 
1404 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1405 
1406 	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd);
1407 	if (status == 0 && (addr != mgp->mac_addr))
1408 		(void) memcpy(mgp->mac_addr, addr, sizeof (mgp->mac_addr));
1409 
1410 	return (status);
1411 }
1412 
1413 static int
1414 myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
1415 {
1416 	myri10ge_cmd_t cmd;
1417 	int status;
1418 
1419 	if (pause)
1420 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_FLOW_CONTROL,
1421 		    &cmd);
1422 	else
1423 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_FLOW_CONTROL,
1424 		    &cmd);
1425 
1426 	if (status) {
1427 		cmn_err(CE_WARN, "%s: Failed to set flow control mode\n",
1428 		    mgp->name);
1429 		return (ENXIO);
1430 	}
1431 	mgp->pause = pause;
1432 	return (0);
1433 }
1434 
1435 static void
1436 myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc)
1437 {
1438 	myri10ge_cmd_t cmd;
1439 	int status;
1440 
1441 	if (promisc)
1442 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_PROMISC, &cmd);
1443 	else
1444 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_PROMISC, &cmd);
1445 
1446 	if (status) {
1447 		cmn_err(CE_WARN, "%s: Failed to set promisc mode\n",
1448 		    mgp->name);
1449 	}
1450 }
1451 
1452 static int
1453 myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
1454 {
1455 	myri10ge_cmd_t cmd;
1456 	int status;
1457 	uint32_t len;
1458 	void *dmabench;
1459 	struct myri10ge_dma_stuff dmabench_dma;
1460 	char *test = " ";
1461 
1462 	/*
1463 	 * Run a small DMA test.
1464 	 * The magic multipliers to the length tell the firmware
1465 	 * tp do DMA read, write, or read+write tests.  The
1466 	 * results are returned in cmd.data0.  The upper 16
1467 	 * bits or the return is the number of transfers completed.
1468 	 * The lower 16 bits is the time in 0.5us ticks that the
1469 	 * transfers took to complete
1470 	 */
1471 
1472 	len = mgp->tx_boundary;
1473 
1474 	dmabench = myri10ge_dma_alloc(mgp->dip, len,
1475 	    &myri10ge_rx_jumbo_dma_attr, &myri10ge_dev_access_attr,
1476 	    DDI_DMA_STREAMING,  DDI_DMA_RDWR|DDI_DMA_STREAMING,
1477 	    &dmabench_dma, 1, DDI_DMA_DONTWAIT);
1478 	mgp->read_dma = mgp->write_dma = mgp->read_write_dma = 0;
1479 	if (dmabench == NULL) {
1480 		cmn_err(CE_WARN, "%s dma benchmark aborted\n", mgp->name);
1481 		return (ENOMEM);
1482 	}
1483 
1484 	cmd.data0 = ntohl(dmabench_dma.low);
1485 	cmd.data1 = ntohl(dmabench_dma.high);
1486 	cmd.data2 = len * 0x10000;
1487 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1488 	if (status != 0) {
1489 		test = "read";
1490 		goto abort;
1491 	}
1492 	mgp->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1493 
1494 	cmd.data0 = ntohl(dmabench_dma.low);
1495 	cmd.data1 = ntohl(dmabench_dma.high);
1496 	cmd.data2 = len * 0x1;
1497 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1498 	if (status != 0) {
1499 		test = "write";
1500 		goto abort;
1501 	}
1502 	mgp->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1503 
1504 	cmd.data0 = ntohl(dmabench_dma.low);
1505 	cmd.data1 = ntohl(dmabench_dma.high);
1506 	cmd.data2 = len * 0x10001;
1507 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1508 	if (status != 0) {
1509 		test = "read/write";
1510 		goto abort;
1511 	}
1512 	mgp->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
1513 	    (cmd.data0 & 0xffff);
1514 
1515 
1516 abort:
1517 	myri10ge_dma_free(&dmabench_dma);
1518 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
1519 		cmn_err(CE_WARN, "%s %s dma benchmark failed\n", mgp->name,
1520 		    test);
1521 	return (status);
1522 }
1523 
1524 static int
1525 myri10ge_reset(struct myri10ge_priv *mgp)
1526 {
1527 	myri10ge_cmd_t cmd;
1528 	struct myri10ge_nic_stat *ethstat;
1529 	struct myri10ge_slice_state *ss;
1530 	int i, status;
1531 	size_t bytes;
1532 
1533 	/* send a reset command to the card to see if it is alive */
1534 	(void) memset(&cmd, 0, sizeof (cmd));
1535 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
1536 	if (status != 0) {
1537 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
1538 		return (ENXIO);
1539 	}
1540 
1541 	/* Now exchange information about interrupts  */
1542 
1543 	bytes = mgp->max_intr_slots * sizeof (*mgp->ss[0].rx_done.entry);
1544 	cmd.data0 = (uint32_t)bytes;
1545 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1546 
1547 	/*
1548 	 * Even though we already know how many slices are supported
1549 	 * via myri10ge_probe_slices() MXGEFW_CMD_GET_MAX_RSS_QUEUES
1550 	 * has magic side effects, and must be called after a reset.
1551 	 * It must be called prior to calling any RSS related cmds,
1552 	 * including assigning an interrupt queue for anything but
1553 	 * slice 0.  It must also be called *after*
1554 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1555 	 * the firmware to compute offsets.
1556 	 */
1557 
1558 	if (mgp->num_slices > 1) {
1559 
1560 		/* ask the maximum number of slices it supports */
1561 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1562 		    &cmd);
1563 		if (status != 0) {
1564 			cmn_err(CE_WARN,
1565 			    "%s: failed to get number of slices\n",
1566 			    mgp->name);
1567 			return (status);
1568 		}
1569 
1570 		/*
1571 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1572 		 * to setting up the interrupt queue DMA
1573 		 */
1574 
1575 		cmd.data0 = mgp->num_slices;
1576 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE |
1577 		    MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1578 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1579 		    &cmd);
1580 		if (status != 0) {
1581 			cmn_err(CE_WARN,
1582 			    "%s: failed to set number of slices\n",
1583 			    mgp->name);
1584 			return (status);
1585 		}
1586 	}
1587 	for (i = 0; i < mgp->num_slices; i++) {
1588 		ss = &mgp->ss[i];
1589 		cmd.data0 = ntohl(ss->rx_done.dma.low);
1590 		cmd.data1 = ntohl(ss->rx_done.dma.high);
1591 		cmd.data2 = i;
1592 		status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA,
1593 		    &cmd);
1594 	};
1595 
1596 	status |= myri10ge_send_cmd(mgp,  MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1597 	for (i = 0; i < mgp->num_slices; i++) {
1598 		ss = &mgp->ss[i];
1599 		ss->irq_claim = (volatile unsigned int *)
1600 		    (void *)(mgp->sram + cmd.data0 + 8 * i);
1601 	}
1602 
1603 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
1604 		status |= myri10ge_send_cmd(mgp,
1605 		    MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1606 		mgp->irq_deassert = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1607 	}
1608 
1609 	status |= myri10ge_send_cmd(mgp,
1610 	    MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1611 	mgp->intr_coal_delay_ptr = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1612 
1613 	if (status != 0) {
1614 		cmn_err(CE_WARN, "%s: failed set interrupt parameters\n",
1615 		    mgp->name);
1616 		return (status);
1617 	}
1618 
1619 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
1620 	(void) myri10ge_dma_test(mgp, MXGEFW_DMA_TEST);
1621 
1622 	/* reset mcp/driver shared state back to 0 */
1623 
1624 	for (i = 0; i < mgp->num_slices; i++) {
1625 		ss = &mgp->ss[i];
1626 		bytes = mgp->max_intr_slots *
1627 		    sizeof (*mgp->ss[0].rx_done.entry);
1628 		(void) memset(ss->rx_done.entry, 0, bytes);
1629 		ss->tx.req = 0;
1630 		ss->tx.done = 0;
1631 		ss->tx.pkt_done = 0;
1632 		ss->rx_big.cnt = 0;
1633 		ss->rx_small.cnt = 0;
1634 		ss->rx_done.idx = 0;
1635 		ss->rx_done.cnt = 0;
1636 		ss->rx_token = 0;
1637 		ss->tx.watchdog_done = 0;
1638 		ss->tx.watchdog_req = 0;
1639 		ss->tx.active = 0;
1640 		ss->tx.activate = 0;
1641 	}
1642 	mgp->watchdog_rx_pause = 0;
1643 	if (mgp->ksp_stat != NULL) {
1644 		ethstat = (struct myri10ge_nic_stat *)mgp->ksp_stat->ks_data;
1645 		ethstat->link_changes.value.ul = 0;
1646 	}
1647 	status = myri10ge_m_unicst(mgp, mgp->mac_addr);
1648 	myri10ge_change_promisc(mgp, 0);
1649 	(void) myri10ge_change_pause(mgp, mgp->pause);
1650 	return (status);
1651 }
1652 
1653 static int
1654 myri10ge_init_toeplitz(struct myri10ge_priv *mgp)
1655 {
1656 	myri10ge_cmd_t cmd;
1657 	int i, b, s, t, j;
1658 	int status;
1659 	uint32_t k[8];
1660 	uint32_t tmp;
1661 	uint8_t *key;
1662 
1663 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
1664 	    &cmd);
1665 	if (status != 0) {
1666 		cmn_err(CE_WARN, "%s: failed to get rss key\n",
1667 		    mgp->name);
1668 		return (EIO);
1669 	}
1670 	myri10ge_pio_copy32(mgp->rss_key,
1671 	    (uint32_t *)(void*)((char *)mgp->sram + cmd.data0),
1672 	    sizeof (mgp->rss_key));
1673 
1674 	mgp->toeplitz_hash_table = kmem_alloc(sizeof (uint32_t) * 12 * 256,
1675 	    KM_SLEEP);
1676 	key = (uint8_t *)mgp->rss_key;
1677 	t = 0;
1678 	for (b = 0; b < 12; b++) {
1679 		for (s = 0; s < 8; s++) {
1680 			/* Bits: b*8+s, ..., b*8+s+31 */
1681 			k[s] = 0;
1682 			for (j = 0; j < 32; j++) {
1683 				int bit = b*8+s+j;
1684 				bit = 0x1 & (key[bit / 8] >> (7 -(bit & 0x7)));
1685 				k[s] |= bit << (31 - j);
1686 			}
1687 		}
1688 
1689 		for (i = 0; i <= 0xff; i++) {
1690 			tmp = 0;
1691 			if (i & (1 << 7)) { tmp ^= k[0]; }
1692 			if (i & (1 << 6)) { tmp ^= k[1]; }
1693 			if (i & (1 << 5)) { tmp ^= k[2]; }
1694 			if (i & (1 << 4)) { tmp ^= k[3]; }
1695 			if (i & (1 << 3)) { tmp ^= k[4]; }
1696 			if (i & (1 << 2)) { tmp ^= k[5]; }
1697 			if (i & (1 << 1)) { tmp ^= k[6]; }
1698 			if (i & (1 << 0)) { tmp ^= k[7]; }
1699 			mgp->toeplitz_hash_table[t++] = tmp;
1700 		}
1701 	}
1702 	return (0);
1703 }
1704 
1705 static inline struct myri10ge_slice_state *
1706 myri10ge_toeplitz_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1707 {
1708 	struct tcphdr *hdr;
1709 	uint32_t saddr, daddr;
1710 	uint32_t hash, slice;
1711 	uint32_t *table = mgp->toeplitz_hash_table;
1712 	uint16_t src, dst;
1713 
1714 	/*
1715 	 * Note hashing order is reversed from how it is done
1716 	 * in the NIC, so as to generate the same hash value
1717 	 * for the connection to try to keep connections CPU local
1718 	 */
1719 
1720 	/* hash on IPv4 src/dst address */
1721 	saddr = ntohl(ip->ip_src.s_addr);
1722 	daddr = ntohl(ip->ip_dst.s_addr);
1723 	hash = table[(256 * 0) + ((daddr >> 24) & 0xff)];
1724 	hash ^= table[(256 * 1) + ((daddr >> 16) & 0xff)];
1725 	hash ^= table[(256 * 2) + ((daddr >> 8) & 0xff)];
1726 	hash ^= table[(256 * 3) + ((daddr) & 0xff)];
1727 	hash ^= table[(256 * 4) + ((saddr >> 24) & 0xff)];
1728 	hash ^= table[(256 * 5) + ((saddr >> 16) & 0xff)];
1729 	hash ^= table[(256 * 6) + ((saddr >> 8) & 0xff)];
1730 	hash ^= table[(256 * 7) + ((saddr) & 0xff)];
1731 	/* hash on TCP port, if required */
1732 	if ((myri10ge_rss_hash & MXGEFW_RSS_HASH_TYPE_TCP_IPV4) &&
1733 	    ip->ip_p == IPPROTO_TCP) {
1734 		hdr = (struct tcphdr *)(void *)
1735 		    (((uint8_t *)ip) +  (ip->ip_hl << 2));
1736 		src = ntohs(hdr->th_sport);
1737 		dst = ntohs(hdr->th_dport);
1738 
1739 		hash ^= table[(256 * 8) + ((dst >> 8) & 0xff)];
1740 		hash ^= table[(256 * 9) + ((dst) & 0xff)];
1741 		hash ^= table[(256 * 10) + ((src >> 8) & 0xff)];
1742 		hash ^= table[(256 * 11) + ((src) & 0xff)];
1743 	}
1744 	slice = (mgp->num_slices - 1) & hash;
1745 	return (&mgp->ss[slice]);
1746 
1747 }
1748 
1749 static inline struct myri10ge_slice_state *
1750 myri10ge_simple_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1751 {
1752 	struct tcphdr *hdr;
1753 	uint32_t slice, hash_val;
1754 
1755 
1756 	if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) {
1757 		return (&mgp->ss[0]);
1758 	}
1759 	hdr = (struct tcphdr *)(void *)(((uint8_t *)ip) +  (ip->ip_hl << 2));
1760 
1761 	/*
1762 	 * Use the second byte of the *destination* address for
1763 	 * MXGEFW_RSS_HASH_TYPE_SRC_PORT, so as to match NIC's hashing
1764 	 */
1765 	hash_val = ntohs(hdr->th_dport) & 0xff;
1766 	if (myri10ge_rss_hash == MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT)
1767 		hash_val += ntohs(hdr->th_sport) & 0xff;
1768 
1769 	slice = (mgp->num_slices - 1) & hash_val;
1770 	return (&mgp->ss[slice]);
1771 }
1772 
1773 static inline struct myri10ge_slice_state *
1774 myri10ge_send_hash(struct myri10ge_priv *mgp, mblk_t *mp)
1775 {
1776 	unsigned int slice = 0;
1777 	struct ether_header *eh;
1778 	struct ether_vlan_header *vh;
1779 	struct ip *ip;
1780 	int ehl, ihl;
1781 
1782 	if (mgp->num_slices == 1)
1783 		return (&mgp->ss[0]);
1784 
1785 	if (myri10ge_tx_hash == 0) {
1786 		slice = CPU->cpu_id & (mgp->num_slices - 1);
1787 		return (&mgp->ss[slice]);
1788 	}
1789 
1790 	/*
1791 	 *  ensure it is a TCP or UDP over IPv4 packet, and that the
1792 	 *  headers are in the 1st mblk.  Otherwise, punt
1793 	 */
1794 	ehl = sizeof (*eh);
1795 	ihl = sizeof (*ip);
1796 	if ((MBLKL(mp)) <  (ehl + ihl + 8))
1797 		return (&mgp->ss[0]);
1798 	eh = (struct ether_header *)(void *)mp->b_rptr;
1799 	ip = (struct ip *)(void *)(eh + 1);
1800 	if (eh->ether_type != BE_16(ETHERTYPE_IP)) {
1801 		if (eh->ether_type != BE_16(ETHERTYPE_VLAN))
1802 			return (&mgp->ss[0]);
1803 		vh = (struct ether_vlan_header *)(void *)mp->b_rptr;
1804 		if (vh->ether_type != BE_16(ETHERTYPE_IP))
1805 			return (&mgp->ss[0]);
1806 		ehl += 4;
1807 		ip = (struct ip *)(void *)(vh + 1);
1808 	}
1809 	ihl = ip->ip_hl << 2;
1810 	if (MBLKL(mp) <  (ehl + ihl + 8))
1811 		return (&mgp->ss[0]);
1812 	switch (myri10ge_rss_hash) {
1813 	case MXGEFW_RSS_HASH_TYPE_IPV4:
1814 		/* fallthru */
1815 	case MXGEFW_RSS_HASH_TYPE_TCP_IPV4:
1816 		/* fallthru */
1817 	case (MXGEFW_RSS_HASH_TYPE_IPV4|MXGEFW_RSS_HASH_TYPE_TCP_IPV4):
1818 		return (myri10ge_toeplitz_send_hash(mgp, ip));
1819 	case MXGEFW_RSS_HASH_TYPE_SRC_PORT:
1820 		/* fallthru */
1821 	case MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT:
1822 		return (myri10ge_simple_send_hash(mgp, ip));
1823 	default:
1824 		break;
1825 	}
1826 	return (&mgp->ss[0]);
1827 }
1828 
1829 static int
1830 myri10ge_setup_slice(struct myri10ge_slice_state *ss)
1831 {
1832 	struct myri10ge_priv *mgp = ss->mgp;
1833 	myri10ge_cmd_t cmd;
1834 	int tx_ring_size, rx_ring_size;
1835 	int tx_ring_entries, rx_ring_entries;
1836 	int slice, status;
1837 	int allocated, idx;
1838 	size_t bytes;
1839 
1840 	slice = ss - mgp->ss;
1841 	cmd.data0 = slice;
1842 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1843 	tx_ring_size = cmd.data0;
1844 	cmd.data0 = slice;
1845 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1846 	if (status != 0)
1847 		return (status);
1848 	rx_ring_size = cmd.data0;
1849 
1850 	tx_ring_entries = tx_ring_size / sizeof (struct mcp_kreq_ether_send);
1851 	rx_ring_entries = rx_ring_size / sizeof (struct mcp_dma_addr);
1852 	ss->tx.mask = tx_ring_entries - 1;
1853 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
1854 
1855 	/* get the lanai pointers to the send and receive rings */
1856 
1857 	cmd.data0 = slice;
1858 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
1859 	ss->tx.lanai = (mcp_kreq_ether_send_t *)(void *)(mgp->sram + cmd.data0);
1860 	if (mgp->num_slices > 1) {
1861 		ss->tx.go = (char *)mgp->sram + MXGEFW_ETH_SEND_GO + 64 * slice;
1862 		ss->tx.stop = (char *)mgp->sram + MXGEFW_ETH_SEND_STOP +
1863 		    64 * slice;
1864 	} else {
1865 		ss->tx.go = NULL;
1866 		ss->tx.stop = NULL;
1867 	}
1868 
1869 	cmd.data0 = slice;
1870 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
1871 	ss->rx_small.lanai = (mcp_kreq_ether_recv_t *)
1872 	    (void *)(mgp->sram + cmd.data0);
1873 
1874 	cmd.data0 = slice;
1875 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
1876 	ss->rx_big.lanai = (mcp_kreq_ether_recv_t *)(void *)
1877 	    (mgp->sram + cmd.data0);
1878 
1879 	if (status != 0) {
1880 		cmn_err(CE_WARN,
1881 		    "%s: failed to get ring sizes or locations\n", mgp->name);
1882 		return (status);
1883 	}
1884 
1885 	status = ENOMEM;
1886 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
1887 	ss->rx_small.shadow = kmem_zalloc(bytes, KM_SLEEP);
1888 	if (ss->rx_small.shadow == NULL)
1889 		goto abort;
1890 	(void) memset(ss->rx_small.shadow, 0, bytes);
1891 
1892 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
1893 	ss->rx_big.shadow = kmem_zalloc(bytes, KM_SLEEP);
1894 	if (ss->rx_big.shadow == NULL)
1895 		goto abort_with_rx_small_shadow;
1896 	(void) memset(ss->rx_big.shadow, 0, bytes);
1897 
1898 	/* allocate the host info rings */
1899 
1900 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
1901 	ss->tx.info = kmem_zalloc(bytes, KM_SLEEP);
1902 	if (ss->tx.info == NULL)
1903 		goto abort_with_rx_big_shadow;
1904 	(void) memset(ss->tx.info, 0, bytes);
1905 
1906 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1907 	ss->rx_small.info = kmem_zalloc(bytes, KM_SLEEP);
1908 	if (ss->rx_small.info == NULL)
1909 		goto abort_with_tx_info;
1910 	(void) memset(ss->rx_small.info, 0, bytes);
1911 
1912 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1913 	ss->rx_big.info = kmem_zalloc(bytes, KM_SLEEP);
1914 	if (ss->rx_big.info == NULL)
1915 		goto abort_with_rx_small_info;
1916 	(void) memset(ss->rx_big.info, 0, bytes);
1917 
1918 	ss->tx.stall = ss->tx.sched = 0;
1919 	ss->tx.stall_early = ss->tx.stall_late = 0;
1920 
1921 	ss->jbufs_for_smalls = 1 + (1 + ss->rx_small.mask) /
1922 	    (myri10ge_mtu / (myri10ge_small_bytes + MXGEFW_PAD));
1923 
1924 	allocated = myri10ge_add_jbufs(ss,
1925 	    myri10ge_bigbufs_initial + ss->jbufs_for_smalls, 1);
1926 	if (allocated < ss->jbufs_for_smalls + myri10ge_bigbufs_initial) {
1927 		cmn_err(CE_WARN,
1928 		    "%s: Could not allocate enough receive buffers (%d/%d)\n",
1929 		    mgp->name, allocated,
1930 		    myri10ge_bigbufs_initial + ss->jbufs_for_smalls);
1931 		goto abort_with_jumbos;
1932 	}
1933 
1934 	myri10ge_carve_up_jbufs_into_small_ring(ss);
1935 	ss->j_rx_cnt = 0;
1936 
1937 	mutex_enter(&ss->jpool.mtx);
1938 	if (allocated < rx_ring_entries)
1939 		ss->jpool.low_water = allocated / 4;
1940 	else
1941 		ss->jpool.low_water = rx_ring_entries / 2;
1942 
1943 	/*
1944 	 * invalidate the big receive ring in case we do not
1945 	 * allocate sufficient jumbos to fill it
1946 	 */
1947 	(void) memset(ss->rx_big.shadow, 1,
1948 	    (ss->rx_big.mask + 1) * sizeof (ss->rx_big.shadow[0]));
1949 	for (idx = 7; idx <= ss->rx_big.mask; idx += 8) {
1950 		myri10ge_submit_8rx(&ss->rx_big.lanai[idx - 7],
1951 		    &ss->rx_big.shadow[idx - 7]);
1952 		mb();
1953 	}
1954 
1955 
1956 	myri10ge_restock_jumbos(ss);
1957 
1958 	for (idx = 7; idx <= ss->rx_small.mask; idx += 8) {
1959 		myri10ge_submit_8rx(&ss->rx_small.lanai[idx - 7],
1960 		    &ss->rx_small.shadow[idx - 7]);
1961 		mb();
1962 	}
1963 	ss->rx_small.cnt = ss->rx_small.mask + 1;
1964 
1965 	mutex_exit(&ss->jpool.mtx);
1966 
1967 	status = myri10ge_prepare_tx_ring(ss);
1968 
1969 	if (status != 0)
1970 		goto abort_with_small_jbufs;
1971 
1972 	cmd.data0 = ntohl(ss->fw_stats_dma.low);
1973 	cmd.data1 = ntohl(ss->fw_stats_dma.high);
1974 	cmd.data2 = sizeof (mcp_irq_data_t);
1975 	cmd.data2 |= (slice << 16);
1976 	bzero(ss->fw_stats, sizeof (*ss->fw_stats));
1977 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
1978 	if (status == ENOSYS) {
1979 		cmd.data0 = ntohl(ss->fw_stats_dma.low) +
1980 		    offsetof(mcp_irq_data_t, send_done_count);
1981 		cmd.data1 = ntohl(ss->fw_stats_dma.high);
1982 		status = myri10ge_send_cmd(mgp,
1983 		    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd);
1984 	}
1985 	if (status) {
1986 		cmn_err(CE_WARN, "%s: Couldn't set stats DMA\n", mgp->name);
1987 		goto abort_with_tx;
1988 	}
1989 
1990 	return (0);
1991 
1992 abort_with_tx:
1993 	myri10ge_unprepare_tx_ring(ss);
1994 
1995 abort_with_small_jbufs:
1996 	myri10ge_release_small_jbufs(ss);
1997 
1998 abort_with_jumbos:
1999 	if (allocated != 0) {
2000 		mutex_enter(&ss->jpool.mtx);
2001 		ss->jpool.low_water = 0;
2002 		mutex_exit(&ss->jpool.mtx);
2003 		myri10ge_unstock_jumbos(ss);
2004 		myri10ge_remove_jbufs(ss);
2005 	}
2006 
2007 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2008 	kmem_free(ss->rx_big.info, bytes);
2009 
2010 abort_with_rx_small_info:
2011 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2012 	kmem_free(ss->rx_small.info, bytes);
2013 
2014 abort_with_tx_info:
2015 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2016 	kmem_free(ss->tx.info, bytes);
2017 
2018 abort_with_rx_big_shadow:
2019 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2020 	kmem_free(ss->rx_big.shadow, bytes);
2021 
2022 abort_with_rx_small_shadow:
2023 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2024 	kmem_free(ss->rx_small.shadow, bytes);
2025 abort:
2026 	return (status);
2027 
2028 }
2029 
2030 static void
2031 myri10ge_teardown_slice(struct myri10ge_slice_state *ss)
2032 {
2033 	int tx_ring_entries, rx_ring_entries;
2034 	size_t bytes;
2035 
2036 	/* ignore slices that have not been fully setup */
2037 	if (ss->tx.cp == NULL)
2038 		return;
2039 	/* Free the TX copy buffers */
2040 	myri10ge_unprepare_tx_ring(ss);
2041 
2042 	/* stop passing returned buffers to firmware */
2043 
2044 	mutex_enter(&ss->jpool.mtx);
2045 	ss->jpool.low_water = 0;
2046 	mutex_exit(&ss->jpool.mtx);
2047 	myri10ge_release_small_jbufs(ss);
2048 
2049 	/* Release the free jumbo frame pool */
2050 	myri10ge_unstock_jumbos(ss);
2051 	myri10ge_remove_jbufs(ss);
2052 
2053 	rx_ring_entries = ss->rx_big.mask + 1;
2054 	tx_ring_entries = ss->tx.mask + 1;
2055 
2056 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2057 	kmem_free(ss->rx_big.info, bytes);
2058 
2059 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2060 	kmem_free(ss->rx_small.info, bytes);
2061 
2062 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2063 	kmem_free(ss->tx.info, bytes);
2064 
2065 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2066 	kmem_free(ss->rx_big.shadow, bytes);
2067 
2068 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2069 	kmem_free(ss->rx_small.shadow, bytes);
2070 
2071 }
2072 static int
2073 myri10ge_start_locked(struct myri10ge_priv *mgp)
2074 {
2075 	myri10ge_cmd_t cmd;
2076 	int status, big_pow2, i;
2077 	volatile uint8_t *itable;
2078 
2079 	status = DDI_SUCCESS;
2080 	/* Allocate DMA resources and receive buffers */
2081 
2082 	status = myri10ge_reset(mgp);
2083 	if (status != 0) {
2084 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
2085 		return (DDI_FAILURE);
2086 	}
2087 
2088 	if (mgp->num_slices > 1) {
2089 		cmd.data0 = mgp->num_slices;
2090 		cmd.data1 = 1; /* use MSI-X */
2091 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
2092 		    &cmd);
2093 		if (status != 0) {
2094 			cmn_err(CE_WARN,
2095 			    "%s: failed to set number of slices\n",
2096 			    mgp->name);
2097 			goto abort_with_nothing;
2098 		}
2099 		/* setup the indirection table */
2100 		cmd.data0 = mgp->num_slices;
2101 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
2102 		    &cmd);
2103 
2104 		status |= myri10ge_send_cmd(mgp,
2105 		    MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
2106 		if (status != 0) {
2107 			cmn_err(CE_WARN,
2108 			    "%s: failed to setup rss tables\n", mgp->name);
2109 		}
2110 
2111 		/* just enable an identity mapping */
2112 		itable = mgp->sram + cmd.data0;
2113 		for (i = 0; i < mgp->num_slices; i++)
2114 			itable[i] = (uint8_t)i;
2115 
2116 		if (myri10ge_rss_hash & MYRI10GE_TOEPLITZ_HASH) {
2117 			status = myri10ge_init_toeplitz(mgp);
2118 			if (status != 0) {
2119 				cmn_err(CE_WARN, "%s: failed to setup "
2120 				    "toeplitz tx hash table", mgp->name);
2121 				goto abort_with_nothing;
2122 			}
2123 		}
2124 		cmd.data0 = 1;
2125 		cmd.data1 = myri10ge_rss_hash;
2126 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_ENABLE,
2127 		    &cmd);
2128 		if (status != 0) {
2129 			cmn_err(CE_WARN,
2130 			    "%s: failed to enable slices\n", mgp->name);
2131 			goto abort_with_toeplitz;
2132 		}
2133 	}
2134 
2135 	for (i = 0; i < mgp->num_slices; i++) {
2136 		status = myri10ge_setup_slice(&mgp->ss[i]);
2137 		if (status != 0)
2138 			goto abort_with_slices;
2139 	}
2140 
2141 	/*
2142 	 * Tell the MCP how many buffers it has, and to
2143 	 *  bring the ethernet interface up
2144 	 *
2145 	 * Firmware needs the big buff size as a power of 2.  Lie and
2146 	 * tell it the buffer is larger, because we only use 1
2147 	 * buffer/pkt, and the mtu will prevent overruns
2148 	 */
2149 	big_pow2 = myri10ge_mtu + MXGEFW_PAD;
2150 	while (!ISP2(big_pow2))
2151 		big_pow2++;
2152 
2153 	/* now give firmware buffers sizes, and MTU */
2154 	cmd.data0 = myri10ge_mtu;
2155 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd);
2156 	cmd.data0 = myri10ge_small_bytes;
2157 	status |=
2158 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
2159 	cmd.data0 = big_pow2;
2160 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2161 	if (status) {
2162 		cmn_err(CE_WARN, "%s: Couldn't set buffer sizes\n", mgp->name);
2163 		goto abort_with_slices;
2164 	}
2165 
2166 
2167 	cmd.data0 = 1;
2168 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_TSO_MODE, &cmd);
2169 	if (status) {
2170 		cmn_err(CE_WARN, "%s: unable to setup TSO (%d)\n",
2171 		    mgp->name, status);
2172 	} else {
2173 		mgp->features |= MYRI10GE_TSO;
2174 	}
2175 
2176 	mgp->link_state = -1;
2177 	mgp->rdma_tags_available = 15;
2178 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd);
2179 	if (status) {
2180 		cmn_err(CE_WARN, "%s: unable to start ethernet\n", mgp->name);
2181 		goto abort_with_slices;
2182 	}
2183 	mgp->running = MYRI10GE_ETH_RUNNING;
2184 	return (DDI_SUCCESS);
2185 
2186 abort_with_slices:
2187 	for (i = 0; i < mgp->num_slices; i++)
2188 		myri10ge_teardown_slice(&mgp->ss[i]);
2189 
2190 	mgp->running = MYRI10GE_ETH_STOPPED;
2191 
2192 abort_with_toeplitz:
2193 	if (mgp->toeplitz_hash_table != NULL) {
2194 		kmem_free(mgp->toeplitz_hash_table,
2195 		    sizeof (uint32_t) * 12 * 256);
2196 		mgp->toeplitz_hash_table = NULL;
2197 	}
2198 
2199 abort_with_nothing:
2200 	return (DDI_FAILURE);
2201 }
2202 
2203 static void
2204 myri10ge_stop_locked(struct myri10ge_priv *mgp)
2205 {
2206 	int status, old_down_cnt;
2207 	myri10ge_cmd_t cmd;
2208 	int wait_time = 10;
2209 	int i, polling;
2210 
2211 	old_down_cnt = mgp->down_cnt;
2212 	mb();
2213 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2214 	if (status) {
2215 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
2216 	}
2217 
2218 	while (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2219 		delay(1 * drv_usectohz(1000000));
2220 		wait_time--;
2221 		if (wait_time == 0)
2222 			break;
2223 	}
2224 again:
2225 	if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2226 		cmn_err(CE_WARN, "%s: didn't get down irq\n", mgp->name);
2227 		for (i = 0; i < mgp->num_slices; i++) {
2228 			/*
2229 			 * take and release the rx lock to ensure
2230 			 * that no interrupt thread is blocked
2231 			 * elsewhere in the stack, preventing
2232 			 * completion
2233 			 */
2234 
2235 			mutex_enter(&mgp->ss[i].rx_lock);
2236 			printf("%s: slice %d rx irq idle\n",
2237 			    mgp->name, i);
2238 			mutex_exit(&mgp->ss[i].rx_lock);
2239 
2240 			/* verify that the poll handler is inactive */
2241 			mutex_enter(&mgp->ss->poll_lock);
2242 			polling = mgp->ss->rx_polling;
2243 			mutex_exit(&mgp->ss->poll_lock);
2244 			if (polling) {
2245 				printf("%s: slice %d is polling\n",
2246 				    mgp->name, i);
2247 				delay(1 * drv_usectohz(1000000));
2248 				goto again;
2249 			}
2250 		}
2251 		delay(1 * drv_usectohz(1000000));
2252 		if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2253 			cmn_err(CE_WARN, "%s: Never got down irq\n", mgp->name);
2254 		}
2255 	}
2256 
2257 	for (i = 0; i < mgp->num_slices; i++)
2258 		myri10ge_teardown_slice(&mgp->ss[i]);
2259 
2260 	if (mgp->toeplitz_hash_table != NULL) {
2261 		kmem_free(mgp->toeplitz_hash_table,
2262 		    sizeof (uint32_t) * 12 * 256);
2263 		mgp->toeplitz_hash_table = NULL;
2264 	}
2265 	mgp->running = MYRI10GE_ETH_STOPPED;
2266 }
2267 
2268 static int
2269 myri10ge_m_start(void *arg)
2270 {
2271 	struct myri10ge_priv *mgp = arg;
2272 	int status;
2273 
2274 	mutex_enter(&mgp->intrlock);
2275 
2276 	if (mgp->running != MYRI10GE_ETH_STOPPED) {
2277 		mutex_exit(&mgp->intrlock);
2278 		return (DDI_FAILURE);
2279 	}
2280 	status = myri10ge_start_locked(mgp);
2281 	mutex_exit(&mgp->intrlock);
2282 
2283 	if (status != DDI_SUCCESS)
2284 		return (status);
2285 
2286 	/* start the watchdog timer */
2287 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
2288 	    mgp->timer_ticks);
2289 	return (DDI_SUCCESS);
2290 
2291 }
2292 
2293 static void
2294 myri10ge_m_stop(void *arg)
2295 {
2296 	struct myri10ge_priv *mgp = arg;
2297 
2298 	mutex_enter(&mgp->intrlock);
2299 	/* if the device not running give up */
2300 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
2301 		mutex_exit(&mgp->intrlock);
2302 		return;
2303 	}
2304 
2305 	mgp->running = MYRI10GE_ETH_STOPPING;
2306 	mutex_exit(&mgp->intrlock);
2307 	(void) untimeout(mgp->timer_id);
2308 	mutex_enter(&mgp->intrlock);
2309 	myri10ge_stop_locked(mgp);
2310 	mutex_exit(&mgp->intrlock);
2311 
2312 }
2313 
2314 static inline void
2315 myri10ge_rx_csum(mblk_t *mp, struct myri10ge_rx_ring_stats *s, uint32_t csum)
2316 {
2317 	struct ether_header *eh;
2318 	struct ip *ip;
2319 	struct ip6_hdr *ip6;
2320 	uint32_t start, stuff, end, partial, hdrlen;
2321 
2322 
2323 	csum = ntohs((uint16_t)csum);
2324 	eh = (struct ether_header *)(void *)mp->b_rptr;
2325 	hdrlen = sizeof (*eh);
2326 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2327 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2328 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2329 			s->brdcstrcv++;
2330 		else
2331 			s->multircv++;
2332 	}
2333 
2334 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
2335 		/*
2336 		 * fix checksum by subtracting 4 bytes after what the
2337 		 * firmware thought was the end of the ether hdr
2338 		 */
2339 		partial = *(uint32_t *)
2340 		    (void *)(mp->b_rptr + ETHERNET_HEADER_SIZE);
2341 		csum += ~partial;
2342 		csum +=  (csum < ~partial);
2343 		csum = (csum >> 16) + (csum & 0xFFFF);
2344 		csum = (csum >> 16) + (csum & 0xFFFF);
2345 		hdrlen += VLAN_TAGSZ;
2346 	}
2347 
2348 	if (eh->ether_type ==  BE_16(ETHERTYPE_IP)) {
2349 		ip = (struct ip *)(void *)(mp->b_rptr + hdrlen);
2350 		start = ip->ip_hl << 2;
2351 
2352 		if (ip->ip_p == IPPROTO_TCP)
2353 			stuff = start + offsetof(struct tcphdr, th_sum);
2354 		else if (ip->ip_p == IPPROTO_UDP)
2355 			stuff = start + offsetof(struct udphdr, uh_sum);
2356 		else
2357 			return;
2358 		end = ntohs(ip->ip_len);
2359 	} else if (eh->ether_type ==  BE_16(ETHERTYPE_IPV6)) {
2360 		ip6 = (struct ip6_hdr *)(void *)(mp->b_rptr + hdrlen);
2361 		start = sizeof (*ip6);
2362 		if (ip6->ip6_nxt == IPPROTO_TCP) {
2363 			stuff = start + offsetof(struct tcphdr, th_sum);
2364 		} else if (ip6->ip6_nxt == IPPROTO_UDP)
2365 			stuff = start + offsetof(struct udphdr, uh_sum);
2366 		else
2367 			return;
2368 		end = start + ntohs(ip6->ip6_plen);
2369 		/*
2370 		 * IPv6 headers do not contain a checksum, and hence
2371 		 * do not checksum to zero, so they don't "fall out"
2372 		 * of the partial checksum calculation like IPv4
2373 		 * headers do.  We need to fix the partial checksum by
2374 		 * subtracting the checksum of the IPv6 header.
2375 		 */
2376 
2377 		partial = myri10ge_csum_generic((uint16_t *)ip6, sizeof (*ip6));
2378 		csum += ~partial;
2379 		csum +=  (csum < ~partial);
2380 		csum = (csum >> 16) + (csum & 0xFFFF);
2381 		csum = (csum >> 16) + (csum & 0xFFFF);
2382 	} else {
2383 		return;
2384 	}
2385 
2386 	if (MBLKL(mp) > hdrlen + end) {
2387 		/* padded frame, so hw csum may be invalid */
2388 		return;
2389 	}
2390 
2391 	mac_hcksum_set(mp, start, stuff, end, csum, HCK_PARTIALCKSUM);
2392 }
2393 
2394 static mblk_t *
2395 myri10ge_rx_done_small(struct myri10ge_slice_state *ss, uint32_t len,
2396     uint32_t csum)
2397 {
2398 	mblk_t *mp;
2399 	myri10ge_rx_ring_t *rx;
2400 	int idx;
2401 
2402 	rx = &ss->rx_small;
2403 	idx = rx->cnt & rx->mask;
2404 	ss->rx_small.cnt++;
2405 
2406 	/* allocate a new buffer to pass up the stack */
2407 	mp = allocb(len + MXGEFW_PAD, 0);
2408 	if (mp == NULL) {
2409 		MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_small_nobuf);
2410 		goto abort;
2411 	}
2412 	bcopy(ss->rx_small.info[idx].ptr,
2413 	    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2414 	mp->b_wptr += len + MXGEFW_PAD;
2415 	mp->b_rptr += MXGEFW_PAD;
2416 
2417 	ss->rx_stats.ibytes += len;
2418 	ss->rx_stats.ipackets += 1;
2419 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2420 
2421 abort:
2422 	if ((idx & 7) == 7) {
2423 		myri10ge_submit_8rx(&rx->lanai[idx - 7],
2424 		    &rx->shadow[idx - 7]);
2425 	}
2426 
2427 	return (mp);
2428 }
2429 
2430 
2431 static mblk_t *
2432 myri10ge_rx_done_big(struct myri10ge_slice_state *ss, uint32_t len,
2433     uint32_t csum)
2434 {
2435 	struct myri10ge_jpool_stuff *jpool;
2436 	struct myri10ge_jpool_entry *j;
2437 	mblk_t *mp;
2438 	int idx, num_owned_by_mcp;
2439 
2440 	jpool = &ss->jpool;
2441 	idx = ss->j_rx_cnt & ss->rx_big.mask;
2442 	j = ss->rx_big.info[idx].j;
2443 
2444 	if (j == NULL) {
2445 		printf("%s: null j at idx=%d, rx_big.cnt = %d, j_rx_cnt=%d\n",
2446 		    ss->mgp->name, idx, ss->rx_big.cnt, ss->j_rx_cnt);
2447 		return (NULL);
2448 	}
2449 
2450 
2451 	ss->rx_big.info[idx].j = NULL;
2452 	ss->j_rx_cnt++;
2453 
2454 
2455 	/*
2456 	 * Check to see if we are low on rx buffers.
2457 	 * Note that we must leave at least 8 free so there are
2458 	 * enough to free in a single 64-byte write.
2459 	 */
2460 	num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2461 	if (num_owned_by_mcp < jpool->low_water) {
2462 		mutex_enter(&jpool->mtx);
2463 		myri10ge_restock_jumbos(ss);
2464 		mutex_exit(&jpool->mtx);
2465 		num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
2466 		/* if we are still low, then we have to copy */
2467 		if (num_owned_by_mcp < 16) {
2468 			MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_copy);
2469 			/* allocate a new buffer to pass up the stack */
2470 			mp = allocb(len + MXGEFW_PAD, 0);
2471 			if (mp == NULL) {
2472 				goto abort;
2473 			}
2474 			bcopy(j->buf,
2475 			    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2476 			myri10ge_jfree_rtn(j);
2477 			/* push buffer back to NIC */
2478 			mutex_enter(&jpool->mtx);
2479 			myri10ge_restock_jumbos(ss);
2480 			mutex_exit(&jpool->mtx);
2481 			goto set_len;
2482 		}
2483 	}
2484 
2485 	/* loan our buffer to the stack */
2486 	mp = desballoc((unsigned char *)j->buf, myri10ge_mtu, 0, &j->free_func);
2487 	if (mp == NULL) {
2488 		goto abort;
2489 	}
2490 
2491 set_len:
2492 	mp->b_rptr += MXGEFW_PAD;
2493 	mp->b_wptr = ((unsigned char *) mp->b_rptr + len);
2494 
2495 	ss->rx_stats.ibytes += len;
2496 	ss->rx_stats.ipackets += 1;
2497 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2498 
2499 	return (mp);
2500 
2501 abort:
2502 	myri10ge_jfree_rtn(j);
2503 	MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_big_nobuf);
2504 	return (NULL);
2505 }
2506 
2507 /*
2508  * Free all transmit buffers up until the specified index
2509  */
2510 static inline void
2511 myri10ge_tx_done(struct myri10ge_slice_state *ss, uint32_t mcp_index)
2512 {
2513 	myri10ge_tx_ring_t *tx;
2514 	struct myri10ge_tx_dma_handle_head handles;
2515 	int idx;
2516 	int limit = 0;
2517 
2518 	tx = &ss->tx;
2519 	handles.head = NULL;
2520 	handles.tail = NULL;
2521 	while (tx->pkt_done != (int)mcp_index) {
2522 		idx = tx->done & tx->mask;
2523 
2524 		/*
2525 		 * mblk & DMA handle attached only to first slot
2526 		 * per buffer in the packet
2527 		 */
2528 
2529 		if (tx->info[idx].m) {
2530 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
2531 			tx->info[idx].handle->next = handles.head;
2532 			handles.head = tx->info[idx].handle;
2533 			if (handles.tail == NULL)
2534 				handles.tail = tx->info[idx].handle;
2535 			freeb(tx->info[idx].m);
2536 			tx->info[idx].m = 0;
2537 			tx->info[idx].handle = 0;
2538 		}
2539 		if (tx->info[idx].ostat.opackets != 0) {
2540 			tx->stats.multixmt += tx->info[idx].ostat.multixmt;
2541 			tx->stats.brdcstxmt += tx->info[idx].ostat.brdcstxmt;
2542 			tx->stats.obytes += tx->info[idx].ostat.obytes;
2543 			tx->stats.opackets += tx->info[idx].ostat.opackets;
2544 			tx->info[idx].stat.un.all = 0;
2545 			tx->pkt_done++;
2546 		}
2547 
2548 		tx->done++;
2549 		/*
2550 		 * if we stalled the queue, wake it.  But Wait until
2551 		 * we have at least 1/2 our slots free.
2552 		 */
2553 		if ((tx->req - tx->done) < (tx->mask >> 1) &&
2554 		    tx->stall != tx->sched) {
2555 			mutex_enter(&ss->tx.lock);
2556 			tx->sched = tx->stall;
2557 			mutex_exit(&ss->tx.lock);
2558 			mac_tx_ring_update(ss->mgp->mh, tx->rh);
2559 		}
2560 
2561 		/* limit potential for livelock */
2562 		if (unlikely(++limit >  2 * tx->mask))
2563 			break;
2564 	}
2565 	if (tx->req == tx->done && tx->stop != NULL) {
2566 		/*
2567 		 * Nic has sent all pending requests, allow it
2568 		 * to stop polling this queue
2569 		 */
2570 		mutex_enter(&tx->lock);
2571 		if (tx->req == tx->done && tx->active) {
2572 			*(int *)(void *)tx->stop = 1;
2573 			tx->active = 0;
2574 			mb();
2575 		}
2576 		mutex_exit(&tx->lock);
2577 	}
2578 	if (handles.head != NULL)
2579 		myri10ge_free_tx_handles(tx, &handles);
2580 }
2581 
2582 static void
2583 myri10ge_mbl_init(struct myri10ge_mblk_list *mbl)
2584 {
2585 	mbl->head = NULL;
2586 	mbl->tail = &mbl->head;
2587 	mbl->cnt = 0;
2588 }
2589 
2590 /*ARGSUSED*/
2591 void
2592 myri10ge_mbl_append(struct myri10ge_slice_state *ss,
2593     struct myri10ge_mblk_list *mbl, mblk_t *mp)
2594 {
2595 	*(mbl->tail) = mp;
2596 	mbl->tail = &mp->b_next;
2597 	mp->b_next = NULL;
2598 	mbl->cnt++;
2599 }
2600 
2601 
2602 static inline void
2603 myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
2604     struct myri10ge_mblk_list *mbl, int limit, boolean_t *stop)
2605 {
2606 	myri10ge_rx_done_t *rx_done = &ss->rx_done;
2607 	struct myri10ge_priv *mgp = ss->mgp;
2608 	mblk_t *mp;
2609 	struct lro_entry *lro;
2610 	uint16_t length;
2611 	uint16_t checksum;
2612 
2613 
2614 	while (rx_done->entry[rx_done->idx].length != 0) {
2615 		if (unlikely (*stop)) {
2616 			break;
2617 		}
2618 		length = ntohs(rx_done->entry[rx_done->idx].length);
2619 		length &= (~MXGEFW_RSS_HASH_MASK);
2620 
2621 		/* limit potential for livelock */
2622 		limit -= length;
2623 		if (unlikely(limit < 0))
2624 			break;
2625 
2626 		rx_done->entry[rx_done->idx].length = 0;
2627 		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
2628 		if (length <= myri10ge_small_bytes)
2629 			mp = myri10ge_rx_done_small(ss, length, checksum);
2630 		else
2631 			mp = myri10ge_rx_done_big(ss, length, checksum);
2632 		if (mp != NULL) {
2633 			if (!myri10ge_lro ||
2634 			    0 != myri10ge_lro_rx(ss, mp, checksum, mbl))
2635 				myri10ge_mbl_append(ss, mbl, mp);
2636 		}
2637 		rx_done->cnt++;
2638 		rx_done->idx = rx_done->cnt & (mgp->max_intr_slots - 1);
2639 	}
2640 	while (ss->lro_active != NULL) {
2641 		lro = ss->lro_active;
2642 		ss->lro_active = lro->next;
2643 		myri10ge_lro_flush(ss, lro, mbl);
2644 	}
2645 }
2646 
2647 static void
2648 myri10ge_intr_rx(struct myri10ge_slice_state *ss)
2649 {
2650 	uint64_t gen;
2651 	struct myri10ge_mblk_list mbl;
2652 
2653 	myri10ge_mbl_init(&mbl);
2654 	if (mutex_tryenter(&ss->rx_lock) == 0)
2655 		return;
2656 	gen = ss->rx_gen_num;
2657 	myri10ge_clean_rx_done(ss, &mbl, MYRI10GE_POLL_NULL,
2658 	    &ss->rx_polling);
2659 	if (mbl.head != NULL)
2660 		mac_rx_ring(ss->mgp->mh, ss->rx_rh, mbl.head, gen);
2661 	mutex_exit(&ss->rx_lock);
2662 
2663 }
2664 
2665 static mblk_t *
2666 myri10ge_poll_rx(void *arg, int bytes)
2667 {
2668 	struct myri10ge_slice_state *ss = arg;
2669 	struct myri10ge_mblk_list mbl;
2670 	boolean_t dummy = B_FALSE;
2671 
2672 	if (bytes == 0)
2673 		return (NULL);
2674 
2675 	myri10ge_mbl_init(&mbl);
2676 	mutex_enter(&ss->rx_lock);
2677 	if (ss->rx_polling)
2678 		myri10ge_clean_rx_done(ss, &mbl, bytes, &dummy);
2679 	else
2680 		printf("%d: poll_rx: token=%d, polling=%d\n", (int)(ss -
2681 		    ss->mgp->ss), ss->rx_token, ss->rx_polling);
2682 	mutex_exit(&ss->rx_lock);
2683 	return (mbl.head);
2684 }
2685 
2686 /*ARGSUSED*/
2687 static uint_t
2688 myri10ge_intr(caddr_t arg0, caddr_t arg1)
2689 {
2690 	struct myri10ge_slice_state *ss =
2691 	    (struct myri10ge_slice_state *)(void *)arg0;
2692 	struct myri10ge_priv *mgp = ss->mgp;
2693 	mcp_irq_data_t *stats = ss->fw_stats;
2694 	myri10ge_tx_ring_t *tx = &ss->tx;
2695 	uint32_t send_done_count;
2696 	uint8_t valid;
2697 
2698 
2699 	/* make sure the DMA has finished */
2700 	if (!stats->valid) {
2701 		return (DDI_INTR_UNCLAIMED);
2702 	}
2703 	valid = stats->valid;
2704 
2705 	/* low bit indicates receives are present */
2706 	if (valid & 1)
2707 		myri10ge_intr_rx(ss);
2708 
2709 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
2710 		/* lower legacy IRQ  */
2711 		*mgp->irq_deassert = 0;
2712 		if (!myri10ge_deassert_wait)
2713 			/* don't wait for conf. that irq is low */
2714 			stats->valid = 0;
2715 		mb();
2716 	} else {
2717 		/* no need to wait for conf. that irq is low */
2718 		stats->valid = 0;
2719 	}
2720 
2721 	do {
2722 		/* check for transmit completes and receives */
2723 		send_done_count = ntohl(stats->send_done_count);
2724 		if (send_done_count != tx->pkt_done)
2725 			myri10ge_tx_done(ss, (int)send_done_count);
2726 	} while (*((volatile uint8_t *) &stats->valid));
2727 
2728 	if (stats->stats_updated) {
2729 		if (mgp->link_state != stats->link_up || stats->link_down) {
2730 			mgp->link_state = stats->link_up;
2731 			if (stats->link_down) {
2732 				mgp->down_cnt += stats->link_down;
2733 				mgp->link_state = 0;
2734 			}
2735 			if (mgp->link_state) {
2736 				if (myri10ge_verbose)
2737 					printf("%s: link up\n", mgp->name);
2738 				mac_link_update(mgp->mh, LINK_STATE_UP);
2739 			} else {
2740 				if (myri10ge_verbose)
2741 					printf("%s: link down\n", mgp->name);
2742 				mac_link_update(mgp->mh, LINK_STATE_DOWN);
2743 			}
2744 			MYRI10GE_NIC_STAT_INC(link_changes);
2745 		}
2746 		if (mgp->rdma_tags_available !=
2747 		    ntohl(ss->fw_stats->rdma_tags_available)) {
2748 			mgp->rdma_tags_available =
2749 			    ntohl(ss->fw_stats->rdma_tags_available);
2750 			cmn_err(CE_NOTE, "%s: RDMA timed out! "
2751 			    "%d tags left\n", mgp->name,
2752 			    mgp->rdma_tags_available);
2753 		}
2754 	}
2755 
2756 	mb();
2757 	/* check to see if we have rx token to pass back */
2758 	if (valid & 0x1) {
2759 		mutex_enter(&ss->poll_lock);
2760 		if (ss->rx_polling) {
2761 			ss->rx_token = 1;
2762 		} else {
2763 			*ss->irq_claim = BE_32(3);
2764 			ss->rx_token = 0;
2765 		}
2766 		mutex_exit(&ss->poll_lock);
2767 	}
2768 	*(ss->irq_claim + 1) = BE_32(3);
2769 	return (DDI_INTR_CLAIMED);
2770 }
2771 
2772 /*
2773  * Add or remove a multicast address.  This is called with our
2774  * macinfo's lock held by GLD, so we do not need to worry about
2775  * our own locking here.
2776  */
2777 static int
2778 myri10ge_m_multicst(void *arg, boolean_t add, const uint8_t *multicastaddr)
2779 {
2780 	myri10ge_cmd_t cmd;
2781 	struct myri10ge_priv *mgp = arg;
2782 	int status, join_leave;
2783 
2784 	if (add)
2785 		join_leave = MXGEFW_JOIN_MULTICAST_GROUP;
2786 	else
2787 		join_leave = MXGEFW_LEAVE_MULTICAST_GROUP;
2788 	(void) memcpy(&cmd.data0, multicastaddr, 4);
2789 	(void) memcpy(&cmd.data1, multicastaddr + 4, 2);
2790 	cmd.data0 = htonl(cmd.data0);
2791 	cmd.data1 = htonl(cmd.data1);
2792 	status = myri10ge_send_cmd(mgp, join_leave, &cmd);
2793 	if (status == 0)
2794 		return (0);
2795 
2796 	cmn_err(CE_WARN, "%s: failed to set multicast address\n",
2797 	    mgp->name);
2798 	return (status);
2799 }
2800 
2801 
2802 static int
2803 myri10ge_m_promisc(void *arg, boolean_t on)
2804 {
2805 	struct myri10ge_priv *mgp = arg;
2806 
2807 	myri10ge_change_promisc(mgp, on);
2808 	return (0);
2809 }
2810 
2811 /*
2812  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2813  *  backwards one at a time and handle ring wraps
2814  */
2815 
2816 static inline void
2817 myri10ge_submit_req_backwards(myri10ge_tx_ring_t *tx,
2818     mcp_kreq_ether_send_t *src, int cnt)
2819 {
2820 	int idx, starting_slot;
2821 	starting_slot = tx->req;
2822 	while (cnt > 1) {
2823 		cnt--;
2824 		idx = (starting_slot + cnt) & tx->mask;
2825 		myri10ge_pio_copy(&tx->lanai[idx],
2826 		    &src[cnt], sizeof (*src));
2827 		mb();
2828 	}
2829 }
2830 
2831 /*
2832  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2833  * at most 32 bytes at a time, so as to avoid involving the software
2834  * pio handler in the nic.   We re-write the first segment's flags
2835  * to mark them valid only after writing the entire chain
2836  */
2837 
2838 static inline void
2839 myri10ge_submit_req(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
2840     int cnt)
2841 {
2842 	int idx, i;
2843 	uint32_t *src_ints, *dst_ints;
2844 	mcp_kreq_ether_send_t *srcp, *dstp, *dst;
2845 	uint8_t last_flags;
2846 
2847 	idx = tx->req & tx->mask;
2848 
2849 	last_flags = src->flags;
2850 	src->flags = 0;
2851 	mb();
2852 	dst = dstp = &tx->lanai[idx];
2853 	srcp = src;
2854 
2855 	if ((idx + cnt) < tx->mask) {
2856 		for (i = 0; i < (cnt - 1); i += 2) {
2857 			myri10ge_pio_copy(dstp, srcp, 2 * sizeof (*src));
2858 			mb(); /* force write every 32 bytes */
2859 			srcp += 2;
2860 			dstp += 2;
2861 		}
2862 	} else {
2863 		/*
2864 		 * submit all but the first request, and ensure
2865 		 *  that it is submitted below
2866 		 */
2867 		myri10ge_submit_req_backwards(tx, src, cnt);
2868 		i = 0;
2869 	}
2870 	if (i < cnt) {
2871 		/* submit the first request */
2872 		myri10ge_pio_copy(dstp, srcp, sizeof (*src));
2873 		mb(); /* barrier before setting valid flag */
2874 	}
2875 
2876 	/* re-write the last 32-bits with the valid flags */
2877 	src->flags |= last_flags;
2878 	src_ints = (uint32_t *)src;
2879 	src_ints += 3;
2880 	dst_ints = (uint32_t *)dst;
2881 	dst_ints += 3;
2882 	*dst_ints =  *src_ints;
2883 	tx->req += cnt;
2884 	mb();
2885 	/* notify NIC to poll this tx ring */
2886 	if (!tx->active && tx->go != NULL) {
2887 		*(int *)(void *)tx->go = 1;
2888 		tx->active = 1;
2889 		tx->activate++;
2890 		mb();
2891 	}
2892 }
2893 
2894 /* ARGSUSED */
2895 static inline void
2896 myri10ge_lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
2897 {
2898 	uint32_t lso_flag;
2899 	mac_lso_get(mp, mss, &lso_flag);
2900 	(*flags) |= lso_flag;
2901 }
2902 
2903 
2904 /* like pullupmsg, except preserve hcksum/LSO attributes */
2905 static int
2906 myri10ge_pullup(struct myri10ge_slice_state *ss, mblk_t *mp)
2907 {
2908 	uint32_t start, stuff, tx_offload_flags, mss;
2909 	int ok;
2910 
2911 	mss = 0;
2912 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
2913 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
2914 
2915 	ok = pullupmsg(mp, -1);
2916 	if (!ok) {
2917 		printf("pullupmsg failed");
2918 		return (DDI_FAILURE);
2919 	}
2920 	MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_pullup);
2921 	mac_hcksum_set(mp, start, stuff, NULL, NULL, tx_offload_flags);
2922 	if (tx_offload_flags & HW_LSO)
2923 		DB_LSOMSS(mp) = (uint16_t)mss;
2924 	lso_info_set(mp, mss, tx_offload_flags);
2925 	return (DDI_SUCCESS);
2926 }
2927 
2928 static inline void
2929 myri10ge_tx_stat(struct myri10ge_tx_pkt_stats *s, struct ether_header *eh,
2930     int opackets, int obytes)
2931 {
2932 	s->un.all = 0;
2933 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2934 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2935 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2936 			s->un.s.brdcstxmt = 1;
2937 		else
2938 			s->un.s.multixmt = 1;
2939 	}
2940 	s->un.s.opackets = (uint16_t)opackets;
2941 	s->un.s.obytes = obytes;
2942 }
2943 
2944 static int
2945 myri10ge_tx_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
2946     mcp_kreq_ether_send_t *req)
2947 {
2948 	myri10ge_tx_ring_t *tx = &ss->tx;
2949 	caddr_t ptr;
2950 	struct myri10ge_tx_copybuf *cp;
2951 	mblk_t *bp;
2952 	int idx, mblen, avail;
2953 	uint16_t len;
2954 
2955 	mutex_enter(&tx->lock);
2956 	avail = tx->mask - (tx->req - tx->done);
2957 	if (avail <= 1) {
2958 		mutex_exit(&tx->lock);
2959 		return (EBUSY);
2960 	}
2961 	idx = tx->req & tx->mask;
2962 	cp = &tx->cp[idx];
2963 	ptr = cp->va;
2964 	for (len = 0, bp = mp; bp != NULL; bp = bp->b_cont) {
2965 		mblen = MBLKL(bp);
2966 		bcopy(bp->b_rptr, ptr, mblen);
2967 		ptr += mblen;
2968 		len += mblen;
2969 	}
2970 	/* ensure runts are padded to 60 bytes */
2971 	if (len < 60) {
2972 		bzero(ptr, 64 - len);
2973 		len = 60;
2974 	}
2975 	req->addr_low = cp->dma.low;
2976 	req->addr_high = cp->dma.high;
2977 	req->length = htons(len);
2978 	req->pad = 0;
2979 	req->rdma_count = 1;
2980 	myri10ge_tx_stat(&tx->info[idx].stat,
2981 	    (struct ether_header *)(void *)cp->va, 1, len);
2982 	(void) ddi_dma_sync(cp->dma.handle, 0, len, DDI_DMA_SYNC_FORDEV);
2983 	myri10ge_submit_req(&ss->tx, req, 1);
2984 	mutex_exit(&tx->lock);
2985 	freemsg(mp);
2986 	return (DDI_SUCCESS);
2987 }
2988 
2989 
2990 static void
2991 myri10ge_send_locked(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *req_list,
2992     struct myri10ge_tx_buffer_state *tx_info,
2993     int count)
2994 {
2995 	int i, idx;
2996 
2997 	idx = 0; /* gcc -Wuninitialized */
2998 	/* store unmapping and bp info for tx irq handler */
2999 	for (i = 0; i < count; i++) {
3000 		idx = (tx->req + i) & tx->mask;
3001 		tx->info[idx].m = tx_info[i].m;
3002 		tx->info[idx].handle = tx_info[i].handle;
3003 	}
3004 	tx->info[idx].stat.un.all = tx_info[0].stat.un.all;
3005 
3006 	/* submit the frame to the nic */
3007 	myri10ge_submit_req(tx, req_list, count);
3008 
3009 
3010 }
3011 
3012 
3013 
3014 static void
3015 myri10ge_copydata(mblk_t *mp, int off, int len, caddr_t buf)
3016 {
3017 	mblk_t *bp;
3018 	int seglen;
3019 	uint_t count;
3020 
3021 	bp = mp;
3022 
3023 	while (off > 0) {
3024 		seglen = MBLKL(bp);
3025 		if (off < seglen)
3026 			break;
3027 		off -= seglen;
3028 		bp = bp->b_cont;
3029 	}
3030 	while (len > 0) {
3031 		seglen = MBLKL(bp);
3032 		count = min(seglen - off, len);
3033 		bcopy(bp->b_rptr + off, buf, count);
3034 		len -= count;
3035 		buf += count;
3036 		off = 0;
3037 		bp = bp->b_cont;
3038 	}
3039 }
3040 
3041 static int
3042 myri10ge_ether_parse_header(mblk_t *mp)
3043 {
3044 	struct ether_header eh_copy;
3045 	struct ether_header *eh;
3046 	int eth_hdr_len, seglen;
3047 
3048 	seglen = MBLKL(mp);
3049 	eth_hdr_len = sizeof (*eh);
3050 	if (seglen < eth_hdr_len) {
3051 		myri10ge_copydata(mp, 0, eth_hdr_len, (caddr_t)&eh_copy);
3052 		eh = &eh_copy;
3053 	} else {
3054 		eh = (struct ether_header *)(void *)mp->b_rptr;
3055 	}
3056 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
3057 		eth_hdr_len += 4;
3058 	}
3059 
3060 	return (eth_hdr_len);
3061 }
3062 
3063 static int
3064 myri10ge_lso_parse_header(mblk_t *mp, int off)
3065 {
3066 	char buf[128];
3067 	int seglen, sum_off;
3068 	struct ip *ip;
3069 	struct tcphdr *tcp;
3070 
3071 	seglen = MBLKL(mp);
3072 	if (seglen < off + sizeof (*ip)) {
3073 		myri10ge_copydata(mp, off, sizeof (*ip), buf);
3074 		ip = (struct ip *)(void *)buf;
3075 	} else {
3076 		ip = (struct ip *)(void *)(mp->b_rptr + off);
3077 	}
3078 	if (seglen < off + (ip->ip_hl << 2) + sizeof (*tcp)) {
3079 		myri10ge_copydata(mp, off,
3080 		    (ip->ip_hl << 2) + sizeof (*tcp), buf);
3081 		ip = (struct ip *)(void *)buf;
3082 	}
3083 	tcp = (struct tcphdr *)(void *)((char *)ip + (ip->ip_hl << 2));
3084 
3085 	/*
3086 	 * NIC expects ip_sum to be zero.  Recent changes to
3087 	 * OpenSolaris leave the correct ip checksum there, rather
3088 	 * than the required zero, so we need to zero it.  Otherwise,
3089 	 * the NIC will produce bad checksums when sending LSO packets.
3090 	 */
3091 	if (ip->ip_sum != 0) {
3092 		if (((char *)ip) != buf) {
3093 			/* ip points into mblk, so just zero it */
3094 			ip->ip_sum = 0;
3095 		} else {
3096 			/*
3097 			 * ip points into a copy, so walk the chain
3098 			 * to find the ip_csum, then zero it
3099 			 */
3100 			sum_off = off + _PTRDIFF(&ip->ip_sum, buf);
3101 			while (sum_off > (int)(MBLKL(mp) - 1)) {
3102 				sum_off -= MBLKL(mp);
3103 				mp = mp->b_cont;
3104 			}
3105 			mp->b_rptr[sum_off] = 0;
3106 			sum_off++;
3107 			while (sum_off > MBLKL(mp) - 1) {
3108 				sum_off -= MBLKL(mp);
3109 				mp = mp->b_cont;
3110 			}
3111 			mp->b_rptr[sum_off] = 0;
3112 		}
3113 	}
3114 	return (off + ((ip->ip_hl + tcp->th_off) << 2));
3115 }
3116 
3117 static int
3118 myri10ge_tx_tso_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
3119     mcp_kreq_ether_send_t *req_list, int hdr_size, int pkt_size,
3120     uint16_t mss, uint8_t cksum_offset)
3121 {
3122 	myri10ge_tx_ring_t *tx = &ss->tx;
3123 	struct myri10ge_priv *mgp = ss->mgp;
3124 	mblk_t *bp;
3125 	mcp_kreq_ether_send_t *req;
3126 	struct myri10ge_tx_copybuf *cp;
3127 	caddr_t rptr, ptr;
3128 	int mblen, count, cum_len, mss_resid, tx_req, pkt_size_tmp;
3129 	int resid, avail, idx, hdr_size_tmp, tx_boundary;
3130 	int rdma_count;
3131 	uint32_t seglen, len, boundary, low, high_swapped;
3132 	uint16_t pseudo_hdr_offset = htons(mss);
3133 	uint8_t flags;
3134 
3135 	tx_boundary = mgp->tx_boundary;
3136 	hdr_size_tmp = hdr_size;
3137 	resid = tx_boundary;
3138 	count = 1;
3139 	mutex_enter(&tx->lock);
3140 
3141 	/* check to see if the slots are really there */
3142 	avail = tx->mask - (tx->req - tx->done);
3143 	if (unlikely(avail <=  MYRI10GE_MAX_SEND_DESC_TSO)) {
3144 		atomic_inc_32(&tx->stall);
3145 		mutex_exit(&tx->lock);
3146 		return (EBUSY);
3147 	}
3148 
3149 	/* copy */
3150 	cum_len = -hdr_size;
3151 	count = 0;
3152 	req = req_list;
3153 	idx = tx->mask & tx->req;
3154 	cp = &tx->cp[idx];
3155 	low = ntohl(cp->dma.low);
3156 	ptr = cp->va;
3157 	cp->len = 0;
3158 	if (mss) {
3159 		int payload = pkt_size - hdr_size;
3160 		uint16_t opackets = (payload / mss) + ((payload % mss) != 0);
3161 		tx->info[idx].ostat.opackets = opackets;
3162 		tx->info[idx].ostat.obytes = (opackets - 1) * hdr_size
3163 		    + pkt_size;
3164 	}
3165 	hdr_size_tmp = hdr_size;
3166 	mss_resid = mss;
3167 	flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3168 	tx_req = tx->req;
3169 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3170 		mblen = MBLKL(bp);
3171 		rptr = (caddr_t)bp->b_rptr;
3172 		len = min(hdr_size_tmp, mblen);
3173 		if (len) {
3174 			bcopy(rptr, ptr, len);
3175 			rptr += len;
3176 			ptr += len;
3177 			resid -= len;
3178 			mblen -= len;
3179 			hdr_size_tmp -= len;
3180 			cp->len += len;
3181 			if (hdr_size_tmp)
3182 				continue;
3183 			if (resid < mss) {
3184 				tx_req++;
3185 				idx = tx->mask & tx_req;
3186 				cp = &tx->cp[idx];
3187 				low = ntohl(cp->dma.low);
3188 				ptr = cp->va;
3189 				resid = tx_boundary;
3190 			}
3191 		}
3192 		while (mblen) {
3193 			len = min(mss_resid, mblen);
3194 			bcopy(rptr, ptr, len);
3195 			mss_resid -= len;
3196 			resid -= len;
3197 			mblen -= len;
3198 			rptr += len;
3199 			ptr += len;
3200 			cp->len += len;
3201 			if (mss_resid == 0) {
3202 				mss_resid = mss;
3203 				if (resid < mss) {
3204 					tx_req++;
3205 					idx = tx->mask & tx_req;
3206 					cp = &tx->cp[idx];
3207 					cp->len = 0;
3208 					low = ntohl(cp->dma.low);
3209 					ptr = cp->va;
3210 					resid = tx_boundary;
3211 				}
3212 			}
3213 		}
3214 	}
3215 
3216 	req = req_list;
3217 	pkt_size_tmp = pkt_size;
3218 	count = 0;
3219 	rdma_count = 0;
3220 	tx_req = tx->req;
3221 	while (pkt_size_tmp) {
3222 		idx = tx->mask & tx_req;
3223 		cp = &tx->cp[idx];
3224 		high_swapped = cp->dma.high;
3225 		low = ntohl(cp->dma.low);
3226 		len = cp->len;
3227 		if (len == 0) {
3228 			printf("len=0! pkt_size_tmp=%d, pkt_size=%d\n",
3229 			    pkt_size_tmp, pkt_size);
3230 			for (bp = mp; bp != NULL; bp = bp->b_cont) {
3231 				mblen = MBLKL(bp);
3232 				printf("mblen:%d\n", mblen);
3233 			}
3234 			pkt_size_tmp = pkt_size;
3235 			tx_req = tx->req;
3236 			while (pkt_size_tmp > 0) {
3237 				idx = tx->mask & tx_req;
3238 				cp = &tx->cp[idx];
3239 				printf("cp->len = %d\n", cp->len);
3240 				pkt_size_tmp -= cp->len;
3241 				tx_req++;
3242 			}
3243 			printf("dropped\n");
3244 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3245 			goto done;
3246 		}
3247 		pkt_size_tmp -= len;
3248 		while (len) {
3249 			while (len) {
3250 				uint8_t flags_next;
3251 				int cum_len_next;
3252 
3253 				boundary = (low + mgp->tx_boundary) &
3254 				    ~(mgp->tx_boundary - 1);
3255 				seglen = boundary - low;
3256 				if (seglen > len)
3257 					seglen = len;
3258 
3259 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3260 				cum_len_next = cum_len + seglen;
3261 				(req-rdma_count)->rdma_count = rdma_count + 1;
3262 				if (likely(cum_len >= 0)) {
3263 					/* payload */
3264 					int next_is_first, chop;
3265 
3266 					chop = (cum_len_next > mss);
3267 					cum_len_next = cum_len_next % mss;
3268 					next_is_first = (cum_len_next == 0);
3269 					flags |= chop *
3270 					    MXGEFW_FLAGS_TSO_CHOP;
3271 					flags_next |= next_is_first *
3272 					    MXGEFW_FLAGS_FIRST;
3273 					rdma_count |= -(chop | next_is_first);
3274 					rdma_count += chop & !next_is_first;
3275 				} else if (likely(cum_len_next >= 0)) {
3276 					/* header ends */
3277 					int small;
3278 
3279 					rdma_count = -1;
3280 					cum_len_next = 0;
3281 					seglen = -cum_len;
3282 					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
3283 					flags_next = MXGEFW_FLAGS_TSO_PLD |
3284 					    MXGEFW_FLAGS_FIRST |
3285 					    (small * MXGEFW_FLAGS_SMALL);
3286 				}
3287 				req->addr_high = high_swapped;
3288 				req->addr_low = htonl(low);
3289 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3290 				req->pad = 0; /* complete solid 16-byte block */
3291 				req->rdma_count = 1;
3292 				req->cksum_offset = cksum_offset;
3293 				req->length = htons(seglen);
3294 				req->flags = flags | ((cum_len & 1) *
3295 				    MXGEFW_FLAGS_ALIGN_ODD);
3296 				if (cksum_offset > seglen)
3297 					cksum_offset -= seglen;
3298 				else
3299 					cksum_offset = 0;
3300 				low += seglen;
3301 				len -= seglen;
3302 				cum_len = cum_len_next;
3303 				req++;
3304 				req->flags = 0;
3305 				flags = flags_next;
3306 				count++;
3307 				rdma_count++;
3308 			}
3309 		}
3310 		tx_req++;
3311 	}
3312 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3313 	do {
3314 		req--;
3315 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
3316 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3317 	    MXGEFW_FLAGS_FIRST)));
3318 
3319 	myri10ge_submit_req(tx, req_list, count);
3320 done:
3321 	mutex_exit(&tx->lock);
3322 	freemsg(mp);
3323 	return (DDI_SUCCESS);
3324 }
3325 
3326 /*
3327  * Try to send the chain of buffers described by the mp.  We must not
3328  * encapsulate more than eth->tx.req - eth->tx.done, or
3329  * MXGEFW_MAX_SEND_DESC, whichever is more.
3330  */
3331 
3332 static int
3333 myri10ge_send(struct myri10ge_slice_state *ss, mblk_t *mp,
3334     mcp_kreq_ether_send_t *req_list, struct myri10ge_tx_buffer_state *tx_info)
3335 {
3336 	struct myri10ge_priv *mgp = ss->mgp;
3337 	myri10ge_tx_ring_t *tx = &ss->tx;
3338 	mcp_kreq_ether_send_t *req;
3339 	struct myri10ge_tx_dma_handle *handles, *dma_handle = NULL;
3340 	mblk_t  *bp;
3341 	ddi_dma_cookie_t cookie;
3342 	int err, rv, count, avail, mblen, try_pullup, i, max_segs, maclen,
3343 	    rdma_count, cum_len, lso_hdr_size;
3344 	uint32_t start, stuff, tx_offload_flags;
3345 	uint32_t seglen, len, mss, boundary, low, high_swapped;
3346 	uint_t ncookies;
3347 	uint16_t pseudo_hdr_offset;
3348 	uint8_t flags, cksum_offset, odd_flag;
3349 	int pkt_size;
3350 	int lso_copy = myri10ge_lso_copy;
3351 	try_pullup = 1;
3352 
3353 again:
3354 	/* Setup checksum offloading, if needed */
3355 	mac_hcksum_get(mp, &start, &stuff, NULL, NULL, &tx_offload_flags);
3356 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
3357 	if (tx_offload_flags & HW_LSO) {
3358 		max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3359 		if ((tx_offload_flags & HCK_PARTIALCKSUM) == 0) {
3360 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_lsobadflags);
3361 			freemsg(mp);
3362 			return (DDI_SUCCESS);
3363 		}
3364 	} else {
3365 		max_segs = MXGEFW_MAX_SEND_DESC;
3366 		mss = 0;
3367 	}
3368 	req = req_list;
3369 	cksum_offset = 0;
3370 	pseudo_hdr_offset = 0;
3371 
3372 	/* leave an extra slot keep the ring from wrapping */
3373 	avail = tx->mask - (tx->req - tx->done);
3374 
3375 	/*
3376 	 * If we have > MXGEFW_MAX_SEND_DESC, then any over-length
3377 	 * message will need to be pulled up in order to fit.
3378 	 * Otherwise, we are low on transmit descriptors, it is
3379 	 * probably better to stall and try again rather than pullup a
3380 	 * message to fit.
3381 	 */
3382 
3383 	if (avail < max_segs) {
3384 		err = EBUSY;
3385 		atomic_inc_32(&tx->stall_early);
3386 		goto stall;
3387 	}
3388 
3389 	/* find out how long the frame is and how many segments it is */
3390 	count = 0;
3391 	odd_flag = 0;
3392 	pkt_size = 0;
3393 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
3394 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3395 		dblk_t *dbp;
3396 		mblen = MBLKL(bp);
3397 		if (mblen == 0) {
3398 			/*
3399 			 * we can't simply skip over 0-length mblks
3400 			 * because the hardware can't deal with them,
3401 			 * and we could leak them.
3402 			 */
3403 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_zero_len);
3404 			err = EIO;
3405 			goto pullup;
3406 		}
3407 		/*
3408 		 * There's no advantage to copying most gesballoc
3409 		 * attached blocks, so disable lso copy in that case
3410 		 */
3411 		if (mss && lso_copy == 1 && ((dbp = bp->b_datap) != NULL)) {
3412 			if ((void *)dbp->db_lastfree != myri10ge_db_lastfree) {
3413 				lso_copy = 0;
3414 			}
3415 		}
3416 		pkt_size += mblen;
3417 		count++;
3418 	}
3419 
3420 	/* Try to pull up excessivly long chains */
3421 	if (count >= max_segs) {
3422 		err = myri10ge_pullup(ss, mp);
3423 		if (likely(err == DDI_SUCCESS)) {
3424 			count = 1;
3425 		} else {
3426 			if (count <  MYRI10GE_MAX_SEND_DESC_TSO) {
3427 				/*
3428 				 * just let the h/w send it, it will be
3429 				 * inefficient, but us better than dropping
3430 				 */
3431 				max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3432 			} else {
3433 				/* drop it */
3434 				MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3435 				freemsg(mp);
3436 				return (0);
3437 			}
3438 		}
3439 	}
3440 
3441 	cum_len = 0;
3442 	maclen = myri10ge_ether_parse_header(mp);
3443 
3444 	if (tx_offload_flags & HCK_PARTIALCKSUM) {
3445 
3446 		cksum_offset = start + maclen;
3447 		pseudo_hdr_offset = htons(stuff + maclen);
3448 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
3449 		flags |= MXGEFW_FLAGS_CKSUM;
3450 	}
3451 
3452 	lso_hdr_size = 0; /* -Wunitinialized */
3453 	if (mss) { /* LSO */
3454 		/* this removes any CKSUM flag from before */
3455 		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3456 		/*
3457 		 * parse the headers and set cum_len to a negative
3458 		 * value to reflect the offset of the TCP payload
3459 		 */
3460 		lso_hdr_size =  myri10ge_lso_parse_header(mp, maclen);
3461 		cum_len = -lso_hdr_size;
3462 		if ((mss < mgp->tx_boundary) && lso_copy) {
3463 			err = myri10ge_tx_tso_copy(ss, mp, req_list,
3464 			    lso_hdr_size, pkt_size, mss, cksum_offset);
3465 			return (err);
3466 		}
3467 
3468 		/*
3469 		 * for TSO, pseudo_hdr_offset holds mss.  The firmware
3470 		 * figures out where to put the checksum by parsing
3471 		 * the header.
3472 		 */
3473 
3474 		pseudo_hdr_offset = htons(mss);
3475 	} else if (pkt_size <= MXGEFW_SEND_SMALL_SIZE) {
3476 		flags |= MXGEFW_FLAGS_SMALL;
3477 		if (pkt_size < myri10ge_tx_copylen) {
3478 			req->cksum_offset = cksum_offset;
3479 			req->pseudo_hdr_offset = pseudo_hdr_offset;
3480 			req->flags = flags;
3481 			err = myri10ge_tx_copy(ss, mp, req);
3482 			return (err);
3483 		}
3484 		cum_len = 0;
3485 	}
3486 
3487 	/* pull one DMA handle for each bp from our freelist */
3488 	handles = NULL;
3489 	err = myri10ge_alloc_tx_handles(ss, count, &handles);
3490 	if (err != DDI_SUCCESS) {
3491 		err = DDI_FAILURE;
3492 		goto stall;
3493 	}
3494 	count = 0;
3495 	rdma_count = 0;
3496 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3497 		mblen = MBLKL(bp);
3498 		dma_handle = handles;
3499 		handles = handles->next;
3500 
3501 		rv = ddi_dma_addr_bind_handle(dma_handle->h, NULL,
3502 		    (caddr_t)bp->b_rptr, mblen,
3503 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
3504 		    &cookie, &ncookies);
3505 		if (unlikely(rv != DDI_DMA_MAPPED)) {
3506 			err = EIO;
3507 			try_pullup = 0;
3508 			dma_handle->next = handles;
3509 			handles = dma_handle;
3510 			goto abort_with_handles;
3511 		}
3512 
3513 		/* reserve the slot */
3514 		tx_info[count].m = bp;
3515 		tx_info[count].handle = dma_handle;
3516 
3517 		for (; ; ) {
3518 			low = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
3519 			high_swapped =
3520 			    htonl(MYRI10GE_HIGHPART_TO_U32(
3521 			    cookie.dmac_laddress));
3522 			len = (uint32_t)cookie.dmac_size;
3523 			while (len) {
3524 				uint8_t flags_next;
3525 				int cum_len_next;
3526 
3527 				boundary = (low + mgp->tx_boundary) &
3528 				    ~(mgp->tx_boundary - 1);
3529 				seglen = boundary - low;
3530 				if (seglen > len)
3531 					seglen = len;
3532 
3533 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3534 				cum_len_next = cum_len + seglen;
3535 				if (mss) {
3536 					(req-rdma_count)->rdma_count =
3537 					    rdma_count + 1;
3538 					if (likely(cum_len >= 0)) {
3539 						/* payload */
3540 						int next_is_first, chop;
3541 
3542 						chop = (cum_len_next > mss);
3543 						cum_len_next =
3544 						    cum_len_next % mss;
3545 						next_is_first =
3546 						    (cum_len_next == 0);
3547 						flags |= chop *
3548 						    MXGEFW_FLAGS_TSO_CHOP;
3549 						flags_next |= next_is_first *
3550 						    MXGEFW_FLAGS_FIRST;
3551 						rdma_count |=
3552 						    -(chop | next_is_first);
3553 						rdma_count +=
3554 						    chop & !next_is_first;
3555 					} else if (likely(cum_len_next >= 0)) {
3556 						/* header ends */
3557 						int small;
3558 
3559 						rdma_count = -1;
3560 						cum_len_next = 0;
3561 						seglen = -cum_len;
3562 						small = (mss <=
3563 						    MXGEFW_SEND_SMALL_SIZE);
3564 						flags_next =
3565 						    MXGEFW_FLAGS_TSO_PLD
3566 						    | MXGEFW_FLAGS_FIRST
3567 						    | (small *
3568 						    MXGEFW_FLAGS_SMALL);
3569 					}
3570 				}
3571 				req->addr_high = high_swapped;
3572 				req->addr_low = htonl(low);
3573 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3574 				req->pad = 0; /* complete solid 16-byte block */
3575 				req->rdma_count = 1;
3576 				req->cksum_offset = cksum_offset;
3577 				req->length = htons(seglen);
3578 				req->flags = flags | ((cum_len & 1) * odd_flag);
3579 				if (cksum_offset > seglen)
3580 					cksum_offset -= seglen;
3581 				else
3582 					cksum_offset = 0;
3583 				low += seglen;
3584 				len -= seglen;
3585 				cum_len = cum_len_next;
3586 				count++;
3587 				rdma_count++;
3588 				/*  make sure all the segments will fit */
3589 				if (unlikely(count >= max_segs)) {
3590 					MYRI10GE_ATOMIC_SLICE_STAT_INC(
3591 					    xmit_lowbuf);
3592 					/* may try a pullup */
3593 					err = EBUSY;
3594 					if (try_pullup)
3595 						try_pullup = 2;
3596 					goto abort_with_handles;
3597 				}
3598 				req++;
3599 				req->flags = 0;
3600 				flags = flags_next;
3601 				tx_info[count].m = 0;
3602 			}
3603 			ncookies--;
3604 			if (ncookies == 0)
3605 				break;
3606 			ddi_dma_nextcookie(dma_handle->h, &cookie);
3607 		}
3608 	}
3609 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3610 
3611 	if (mss) {
3612 		do {
3613 			req--;
3614 			req->flags |= MXGEFW_FLAGS_TSO_LAST;
3615 		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3616 		    MXGEFW_FLAGS_FIRST)));
3617 	}
3618 
3619 	/* calculate tx stats */
3620 	if (mss) {
3621 		uint16_t opackets;
3622 		int payload;
3623 
3624 		payload = pkt_size - lso_hdr_size;
3625 		opackets = (payload / mss) + ((payload % mss) != 0);
3626 		tx_info[0].stat.un.all = 0;
3627 		tx_info[0].ostat.opackets = opackets;
3628 		tx_info[0].ostat.obytes = (opackets - 1) * lso_hdr_size
3629 		    + pkt_size;
3630 	} else {
3631 		myri10ge_tx_stat(&tx_info[0].stat,
3632 		    (struct ether_header *)(void *)mp->b_rptr, 1, pkt_size);
3633 	}
3634 	mutex_enter(&tx->lock);
3635 
3636 	/* check to see if the slots are really there */
3637 	avail = tx->mask - (tx->req - tx->done);
3638 	if (unlikely(avail <= count)) {
3639 		mutex_exit(&tx->lock);
3640 		err = 0;
3641 		goto late_stall;
3642 	}
3643 
3644 	myri10ge_send_locked(tx, req_list, tx_info, count);
3645 	mutex_exit(&tx->lock);
3646 	return (DDI_SUCCESS);
3647 
3648 late_stall:
3649 	try_pullup = 0;
3650 	atomic_inc_32(&tx->stall_late);
3651 
3652 abort_with_handles:
3653 	/* unbind and free handles from previous mblks */
3654 	for (i = 0; i < count; i++) {
3655 		bp = tx_info[i].m;
3656 		tx_info[i].m = 0;
3657 		if (bp) {
3658 			dma_handle = tx_info[i].handle;
3659 			(void) ddi_dma_unbind_handle(dma_handle->h);
3660 			dma_handle->next = handles;
3661 			handles = dma_handle;
3662 			tx_info[i].handle = NULL;
3663 			tx_info[i].m = NULL;
3664 		}
3665 	}
3666 	myri10ge_free_tx_handle_slist(tx, handles);
3667 pullup:
3668 	if (try_pullup) {
3669 		err = myri10ge_pullup(ss, mp);
3670 		if (err != DDI_SUCCESS && try_pullup == 2) {
3671 			/* drop */
3672 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3673 			freemsg(mp);
3674 			return (0);
3675 		}
3676 		try_pullup = 0;
3677 		goto again;
3678 	}
3679 
3680 stall:
3681 	if (err != 0) {
3682 		if (err == EBUSY) {
3683 			atomic_inc_32(&tx->stall);
3684 		} else {
3685 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3686 		}
3687 	}
3688 	return (err);
3689 }
3690 
3691 static mblk_t *
3692 myri10ge_send_wrapper(void *arg, mblk_t *mp)
3693 {
3694 	struct myri10ge_slice_state *ss = arg;
3695 	int err = 0;
3696 	mcp_kreq_ether_send_t *req_list;
3697 #if defined(__i386)
3698 	/*
3699 	 * We need about 2.5KB of scratch space to handle transmits.
3700 	 * i86pc has only 8KB of kernel stack space, so we malloc the
3701 	 * scratch space there rather than keeping it on the stack.
3702 	 */
3703 	size_t req_size, tx_info_size;
3704 	struct myri10ge_tx_buffer_state *tx_info;
3705 	caddr_t req_bytes;
3706 
3707 	req_size = sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3708 	    + 8;
3709 	req_bytes = kmem_alloc(req_size, KM_SLEEP);
3710 	tx_info_size = sizeof (*tx_info) * (MYRI10GE_MAX_SEND_DESC_TSO + 1);
3711 	tx_info = kmem_alloc(tx_info_size, KM_SLEEP);
3712 #else
3713 	char req_bytes[sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3714 	    + 8];
3715 	struct myri10ge_tx_buffer_state tx_info[MYRI10GE_MAX_SEND_DESC_TSO + 1];
3716 #endif
3717 
3718 	/* ensure req_list entries are aligned to 8 bytes */
3719 	req_list = (struct mcp_kreq_ether_send *)
3720 	    (((unsigned long)req_bytes + 7UL) & ~7UL);
3721 
3722 	err = myri10ge_send(ss, mp, req_list, tx_info);
3723 
3724 #if defined(__i386)
3725 	kmem_free(tx_info, tx_info_size);
3726 	kmem_free(req_bytes, req_size);
3727 #endif
3728 	if (err)
3729 		return (mp);
3730 	else
3731 		return (NULL);
3732 }
3733 
3734 static int
3735 myri10ge_addmac(void *arg, const uint8_t *mac_addr)
3736 {
3737 	struct myri10ge_priv *mgp = arg;
3738 	int err;
3739 
3740 	if (mac_addr == NULL)
3741 		return (EINVAL);
3742 
3743 	mutex_enter(&mgp->intrlock);
3744 	if (mgp->macaddr_cnt) {
3745 		mutex_exit(&mgp->intrlock);
3746 		return (ENOSPC);
3747 	}
3748 	err = myri10ge_m_unicst(mgp, mac_addr);
3749 	if (!err)
3750 		mgp->macaddr_cnt++;
3751 
3752 	mutex_exit(&mgp->intrlock);
3753 	if (err)
3754 		return (err);
3755 
3756 	bcopy(mac_addr, mgp->mac_addr, sizeof (mgp->mac_addr));
3757 	return (0);
3758 }
3759 
3760 /*ARGSUSED*/
3761 static int
3762 myri10ge_remmac(void *arg, const uint8_t *mac_addr)
3763 {
3764 	struct myri10ge_priv *mgp = arg;
3765 
3766 	mutex_enter(&mgp->intrlock);
3767 	mgp->macaddr_cnt--;
3768 	mutex_exit(&mgp->intrlock);
3769 
3770 	return (0);
3771 }
3772 
3773 /*ARGSUSED*/
3774 static void
3775 myri10ge_fill_group(void *arg, mac_ring_type_t rtype, const int index,
3776     mac_group_info_t *infop, mac_group_handle_t gh)
3777 {
3778 	struct myri10ge_priv *mgp = arg;
3779 
3780 	if (rtype != MAC_RING_TYPE_RX)
3781 		return;
3782 
3783 	infop->mgi_driver = (mac_group_driver_t)mgp;
3784 	infop->mgi_start = NULL;
3785 	infop->mgi_stop = NULL;
3786 	infop->mgi_addmac = myri10ge_addmac;
3787 	infop->mgi_remmac = myri10ge_remmac;
3788 	infop->mgi_count = mgp->num_slices;
3789 }
3790 
3791 static int
3792 myri10ge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
3793 {
3794 	struct myri10ge_slice_state *ss;
3795 
3796 	ss = (struct myri10ge_slice_state *)rh;
3797 	mutex_enter(&ss->rx_lock);
3798 	ss->rx_gen_num = mr_gen_num;
3799 	mutex_exit(&ss->rx_lock);
3800 	return (0);
3801 }
3802 
3803 /*
3804  * Retrieve a value for one of the statistics for a particular rx ring
3805  */
3806 int
3807 myri10ge_rx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3808 {
3809 	struct myri10ge_slice_state *ss;
3810 
3811 	ss = (struct myri10ge_slice_state *)rh;
3812 	switch (stat) {
3813 	case MAC_STAT_RBYTES:
3814 		*val = ss->rx_stats.ibytes;
3815 		break;
3816 
3817 	case MAC_STAT_IPACKETS:
3818 		*val = ss->rx_stats.ipackets;
3819 		break;
3820 
3821 	default:
3822 		*val = 0;
3823 		return (ENOTSUP);
3824 	}
3825 
3826 	return (0);
3827 }
3828 
3829 /*
3830  * Retrieve a value for one of the statistics for a particular tx ring
3831  */
3832 int
3833 myri10ge_tx_ring_stat(mac_ring_driver_t rh, uint_t stat, uint64_t *val)
3834 {
3835 	struct myri10ge_slice_state *ss;
3836 
3837 	ss = (struct myri10ge_slice_state *)rh;
3838 	switch (stat) {
3839 	case MAC_STAT_OBYTES:
3840 		*val = ss->tx.stats.obytes;
3841 		break;
3842 
3843 	case MAC_STAT_OPACKETS:
3844 		*val = ss->tx.stats.opackets;
3845 		break;
3846 
3847 	default:
3848 		*val = 0;
3849 		return (ENOTSUP);
3850 	}
3851 
3852 	return (0);
3853 }
3854 
3855 static int
3856 myri10ge_rx_ring_intr_disable(mac_intr_handle_t intrh)
3857 {
3858 	struct myri10ge_slice_state *ss;
3859 
3860 	ss = (struct myri10ge_slice_state *)intrh;
3861 	mutex_enter(&ss->poll_lock);
3862 	ss->rx_polling = B_TRUE;
3863 	mutex_exit(&ss->poll_lock);
3864 	return (0);
3865 }
3866 
3867 static int
3868 myri10ge_rx_ring_intr_enable(mac_intr_handle_t intrh)
3869 {
3870 	struct myri10ge_slice_state *ss;
3871 
3872 	ss = (struct myri10ge_slice_state *)intrh;
3873 	mutex_enter(&ss->poll_lock);
3874 	ss->rx_polling = B_FALSE;
3875 	if (ss->rx_token) {
3876 		*ss->irq_claim = BE_32(3);
3877 		ss->rx_token = 0;
3878 	}
3879 	mutex_exit(&ss->poll_lock);
3880 	return (0);
3881 }
3882 
3883 /*ARGSUSED*/
3884 static void
3885 myri10ge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
3886     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
3887 {
3888 	struct myri10ge_priv *mgp = arg;
3889 	struct myri10ge_slice_state *ss;
3890 	mac_intr_t *mintr = &infop->mri_intr;
3891 
3892 	ASSERT((unsigned int)ring_index < mgp->num_slices);
3893 
3894 	ss = &mgp->ss[ring_index];
3895 	switch (rtype) {
3896 	case MAC_RING_TYPE_RX:
3897 		ss->rx_rh = rh;
3898 		infop->mri_driver = (mac_ring_driver_t)ss;
3899 		infop->mri_start = myri10ge_ring_start;
3900 		infop->mri_stop = NULL;
3901 		infop->mri_poll = myri10ge_poll_rx;
3902 		infop->mri_stat = myri10ge_rx_ring_stat;
3903 		mintr->mi_handle = (mac_intr_handle_t)ss;
3904 		mintr->mi_enable = myri10ge_rx_ring_intr_enable;
3905 		mintr->mi_disable = myri10ge_rx_ring_intr_disable;
3906 		break;
3907 	case MAC_RING_TYPE_TX:
3908 		ss->tx.rh = rh;
3909 		infop->mri_driver = (mac_ring_driver_t)ss;
3910 		infop->mri_start = NULL;
3911 		infop->mri_stop = NULL;
3912 		infop->mri_tx = myri10ge_send_wrapper;
3913 		infop->mri_stat = myri10ge_tx_ring_stat;
3914 		break;
3915 	default:
3916 		break;
3917 	}
3918 }
3919 
3920 static void
3921 myri10ge_nic_stat_destroy(struct myri10ge_priv *mgp)
3922 {
3923 	if (mgp->ksp_stat == NULL)
3924 		return;
3925 
3926 	kstat_delete(mgp->ksp_stat);
3927 	mgp->ksp_stat = NULL;
3928 }
3929 
3930 static void
3931 myri10ge_slice_stat_destroy(struct myri10ge_slice_state *ss)
3932 {
3933 	if (ss->ksp_stat == NULL)
3934 		return;
3935 
3936 	kstat_delete(ss->ksp_stat);
3937 	ss->ksp_stat = NULL;
3938 }
3939 
3940 static void
3941 myri10ge_info_destroy(struct myri10ge_priv *mgp)
3942 {
3943 	if (mgp->ksp_info == NULL)
3944 		return;
3945 
3946 	kstat_delete(mgp->ksp_info);
3947 	mgp->ksp_info = NULL;
3948 }
3949 
3950 static int
3951 myri10ge_nic_stat_kstat_update(kstat_t *ksp, int rw)
3952 {
3953 	struct myri10ge_nic_stat *ethstat;
3954 	struct myri10ge_priv *mgp;
3955 	mcp_irq_data_t *fw_stats;
3956 
3957 
3958 	if (rw == KSTAT_WRITE)
3959 		return (EACCES);
3960 
3961 	ethstat = (struct myri10ge_nic_stat *)ksp->ks_data;
3962 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3963 	fw_stats = mgp->ss[0].fw_stats;
3964 
3965 	ethstat->dma_read_bw_MBs.value.ul = mgp->read_dma;
3966 	ethstat->dma_write_bw_MBs.value.ul = mgp->write_dma;
3967 	ethstat->dma_read_write_bw_MBs.value.ul = mgp->read_write_dma;
3968 	if (myri10ge_tx_dma_attr.dma_attr_flags & DDI_DMA_FORCE_PHYSICAL)
3969 		ethstat->dma_force_physical.value.ul = 1;
3970 	else
3971 		ethstat->dma_force_physical.value.ul = 0;
3972 	ethstat->lanes.value.ul = mgp->pcie_link_width;
3973 	ethstat->dropped_bad_crc32.value.ul =
3974 	    ntohl(fw_stats->dropped_bad_crc32);
3975 	ethstat->dropped_bad_phy.value.ul =
3976 	    ntohl(fw_stats->dropped_bad_phy);
3977 	ethstat->dropped_link_error_or_filtered.value.ul =
3978 	    ntohl(fw_stats->dropped_link_error_or_filtered);
3979 	ethstat->dropped_link_overflow.value.ul =
3980 	    ntohl(fw_stats->dropped_link_overflow);
3981 	ethstat->dropped_multicast_filtered.value.ul =
3982 	    ntohl(fw_stats->dropped_multicast_filtered);
3983 	ethstat->dropped_no_big_buffer.value.ul =
3984 	    ntohl(fw_stats->dropped_no_big_buffer);
3985 	ethstat->dropped_no_small_buffer.value.ul =
3986 	    ntohl(fw_stats->dropped_no_small_buffer);
3987 	ethstat->dropped_overrun.value.ul =
3988 	    ntohl(fw_stats->dropped_overrun);
3989 	ethstat->dropped_pause.value.ul =
3990 	    ntohl(fw_stats->dropped_pause);
3991 	ethstat->dropped_runt.value.ul =
3992 	    ntohl(fw_stats->dropped_runt);
3993 	ethstat->link_up.value.ul =
3994 	    ntohl(fw_stats->link_up);
3995 	ethstat->dropped_unicast_filtered.value.ul =
3996 	    ntohl(fw_stats->dropped_unicast_filtered);
3997 	return (0);
3998 }
3999 
4000 static int
4001 myri10ge_slice_stat_kstat_update(kstat_t *ksp, int rw)
4002 {
4003 	struct myri10ge_slice_stat *ethstat;
4004 	struct myri10ge_slice_state *ss;
4005 
4006 	if (rw == KSTAT_WRITE)
4007 		return (EACCES);
4008 
4009 	ethstat = (struct myri10ge_slice_stat *)ksp->ks_data;
4010 	ss = (struct myri10ge_slice_state *)ksp->ks_private;
4011 
4012 	ethstat->rx_big.value.ul = ss->j_rx_cnt;
4013 	ethstat->rx_bigbuf_firmware.value.ul = ss->rx_big.cnt - ss->j_rx_cnt;
4014 	ethstat->rx_bigbuf_pool.value.ul =
4015 	    ss->jpool.num_alloc - ss->jbufs_for_smalls;
4016 	ethstat->rx_bigbuf_smalls.value.ul = ss->jbufs_for_smalls;
4017 	ethstat->rx_small.value.ul = ss->rx_small.cnt -
4018 	    (ss->rx_small.mask + 1);
4019 	ethstat->tx_done.value.ul = ss->tx.done;
4020 	ethstat->tx_req.value.ul = ss->tx.req;
4021 	ethstat->tx_activate.value.ul = ss->tx.activate;
4022 	ethstat->xmit_sched.value.ul = ss->tx.sched;
4023 	ethstat->xmit_stall.value.ul = ss->tx.stall;
4024 	ethstat->xmit_stall_early.value.ul = ss->tx.stall_early;
4025 	ethstat->xmit_stall_late.value.ul = ss->tx.stall_late;
4026 	ethstat->xmit_err.value.ul =  MYRI10GE_SLICE_STAT(xmit_err);
4027 	return (0);
4028 }
4029 
4030 static int
4031 myri10ge_info_kstat_update(kstat_t *ksp, int rw)
4032 {
4033 	struct myri10ge_info *info;
4034 	struct myri10ge_priv *mgp;
4035 
4036 
4037 	if (rw == KSTAT_WRITE)
4038 		return (EACCES);
4039 
4040 	info = (struct myri10ge_info *)ksp->ks_data;
4041 	mgp = (struct myri10ge_priv *)ksp->ks_private;
4042 	kstat_named_setstr(&info->driver_version, MYRI10GE_VERSION_STR);
4043 	kstat_named_setstr(&info->firmware_version, mgp->fw_version);
4044 	kstat_named_setstr(&info->firmware_name, mgp->fw_name);
4045 	kstat_named_setstr(&info->interrupt_type, mgp->intr_type);
4046 	kstat_named_setstr(&info->product_code, mgp->pc_str);
4047 	kstat_named_setstr(&info->serial_number, mgp->sn_str);
4048 	return (0);
4049 }
4050 
4051 static struct myri10ge_info myri10ge_info_template = {
4052 	{ "driver_version",	KSTAT_DATA_STRING },
4053 	{ "firmware_version",	KSTAT_DATA_STRING },
4054 	{ "firmware_name",	KSTAT_DATA_STRING },
4055 	{ "interrupt_type",	KSTAT_DATA_STRING },
4056 	{ "product_code",	KSTAT_DATA_STRING },
4057 	{ "serial_number",	KSTAT_DATA_STRING },
4058 };
4059 static kmutex_t myri10ge_info_template_lock;
4060 
4061 
4062 static int
4063 myri10ge_info_init(struct myri10ge_priv *mgp)
4064 {
4065 	struct kstat *ksp;
4066 
4067 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4068 	    "myri10ge_info", "net", KSTAT_TYPE_NAMED,
4069 	    sizeof (myri10ge_info_template) /
4070 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
4071 	if (ksp == NULL) {
4072 		cmn_err(CE_WARN,
4073 		    "%s: myri10ge_info_init: kstat_create failed", mgp->name);
4074 		return (DDI_FAILURE);
4075 	}
4076 	mgp->ksp_info = ksp;
4077 	ksp->ks_update = myri10ge_info_kstat_update;
4078 	ksp->ks_private = (void *) mgp;
4079 	ksp->ks_data = &myri10ge_info_template;
4080 	ksp->ks_lock = &myri10ge_info_template_lock;
4081 	if (MYRI10GE_VERSION_STR != NULL)
4082 		ksp->ks_data_size += strlen(MYRI10GE_VERSION_STR) + 1;
4083 	if (mgp->fw_version != NULL)
4084 		ksp->ks_data_size += strlen(mgp->fw_version) + 1;
4085 	ksp->ks_data_size += strlen(mgp->fw_name) + 1;
4086 	ksp->ks_data_size += strlen(mgp->intr_type) + 1;
4087 	if (mgp->pc_str != NULL)
4088 		ksp->ks_data_size += strlen(mgp->pc_str) + 1;
4089 	if (mgp->sn_str != NULL)
4090 		ksp->ks_data_size += strlen(mgp->sn_str) + 1;
4091 
4092 	kstat_install(ksp);
4093 	return (DDI_SUCCESS);
4094 }
4095 
4096 
4097 static int
4098 myri10ge_nic_stat_init(struct myri10ge_priv *mgp)
4099 {
4100 	struct kstat *ksp;
4101 	struct myri10ge_nic_stat *ethstat;
4102 
4103 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4104 	    "myri10ge_nic_stats", "net", KSTAT_TYPE_NAMED,
4105 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4106 	if (ksp == NULL) {
4107 		cmn_err(CE_WARN,
4108 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4109 		return (DDI_FAILURE);
4110 	}
4111 	mgp->ksp_stat = ksp;
4112 	ethstat = (struct myri10ge_nic_stat *)(ksp->ks_data);
4113 
4114 	kstat_named_init(&ethstat->dma_read_bw_MBs,
4115 	    "dma_read_bw_MBs", KSTAT_DATA_ULONG);
4116 	kstat_named_init(&ethstat->dma_write_bw_MBs,
4117 	    "dma_write_bw_MBs", KSTAT_DATA_ULONG);
4118 	kstat_named_init(&ethstat->dma_read_write_bw_MBs,
4119 	    "dma_read_write_bw_MBs", KSTAT_DATA_ULONG);
4120 	kstat_named_init(&ethstat->dma_force_physical,
4121 	    "dma_force_physical", KSTAT_DATA_ULONG);
4122 	kstat_named_init(&ethstat->lanes,
4123 	    "lanes", KSTAT_DATA_ULONG);
4124 	kstat_named_init(&ethstat->dropped_bad_crc32,
4125 	    "dropped_bad_crc32", KSTAT_DATA_ULONG);
4126 	kstat_named_init(&ethstat->dropped_bad_phy,
4127 	    "dropped_bad_phy", KSTAT_DATA_ULONG);
4128 	kstat_named_init(&ethstat->dropped_link_error_or_filtered,
4129 	    "dropped_link_error_or_filtered", KSTAT_DATA_ULONG);
4130 	kstat_named_init(&ethstat->dropped_link_overflow,
4131 	    "dropped_link_overflow", KSTAT_DATA_ULONG);
4132 	kstat_named_init(&ethstat->dropped_multicast_filtered,
4133 	    "dropped_multicast_filtered", KSTAT_DATA_ULONG);
4134 	kstat_named_init(&ethstat->dropped_no_big_buffer,
4135 	    "dropped_no_big_buffer", KSTAT_DATA_ULONG);
4136 	kstat_named_init(&ethstat->dropped_no_small_buffer,
4137 	    "dropped_no_small_buffer", KSTAT_DATA_ULONG);
4138 	kstat_named_init(&ethstat->dropped_overrun,
4139 	    "dropped_overrun", KSTAT_DATA_ULONG);
4140 	kstat_named_init(&ethstat->dropped_pause,
4141 	    "dropped_pause", KSTAT_DATA_ULONG);
4142 	kstat_named_init(&ethstat->dropped_runt,
4143 	    "dropped_runt", KSTAT_DATA_ULONG);
4144 	kstat_named_init(&ethstat->dropped_unicast_filtered,
4145 	    "dropped_unicast_filtered", KSTAT_DATA_ULONG);
4146 	kstat_named_init(&ethstat->dropped_runt, "dropped_runt",
4147 	    KSTAT_DATA_ULONG);
4148 	kstat_named_init(&ethstat->link_up, "link_up", KSTAT_DATA_ULONG);
4149 	kstat_named_init(&ethstat->link_changes, "link_changes",
4150 	    KSTAT_DATA_ULONG);
4151 	ksp->ks_update = myri10ge_nic_stat_kstat_update;
4152 	ksp->ks_private = (void *) mgp;
4153 	kstat_install(ksp);
4154 	return (DDI_SUCCESS);
4155 }
4156 
4157 static int
4158 myri10ge_slice_stat_init(struct myri10ge_slice_state *ss)
4159 {
4160 	struct myri10ge_priv *mgp = ss->mgp;
4161 	struct kstat *ksp;
4162 	struct myri10ge_slice_stat *ethstat;
4163 	int instance;
4164 
4165 	/*
4166 	 * fake an instance so that the same slice numbers from
4167 	 * different instances do not collide
4168 	 */
4169 	instance = (ddi_get_instance(mgp->dip) * 1000) +  (int)(ss - mgp->ss);
4170 	ksp = kstat_create("myri10ge", instance,
4171 	    "myri10ge_slice_stats", "net", KSTAT_TYPE_NAMED,
4172 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4173 	if (ksp == NULL) {
4174 		cmn_err(CE_WARN,
4175 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4176 		return (DDI_FAILURE);
4177 	}
4178 	ss->ksp_stat = ksp;
4179 	ethstat = (struct myri10ge_slice_stat *)(ksp->ks_data);
4180 	kstat_named_init(&ethstat->lro_bad_csum, "lro_bad_csum",
4181 	    KSTAT_DATA_ULONG);
4182 	kstat_named_init(&ethstat->lro_flushed, "lro_flushed",
4183 	    KSTAT_DATA_ULONG);
4184 	kstat_named_init(&ethstat->lro_queued, "lro_queued",
4185 	    KSTAT_DATA_ULONG);
4186 	kstat_named_init(&ethstat->rx_bigbuf_firmware, "rx_bigbuf_firmware",
4187 	    KSTAT_DATA_ULONG);
4188 	kstat_named_init(&ethstat->rx_bigbuf_pool, "rx_bigbuf_pool",
4189 	    KSTAT_DATA_ULONG);
4190 	kstat_named_init(&ethstat->rx_bigbuf_smalls, "rx_bigbuf_smalls",
4191 	    KSTAT_DATA_ULONG);
4192 	kstat_named_init(&ethstat->rx_copy, "rx_copy",
4193 	    KSTAT_DATA_ULONG);
4194 	kstat_named_init(&ethstat->rx_big_nobuf, "rx_big_nobuf",
4195 	    KSTAT_DATA_ULONG);
4196 	kstat_named_init(&ethstat->rx_small_nobuf, "rx_small_nobuf",
4197 	    KSTAT_DATA_ULONG);
4198 	kstat_named_init(&ethstat->xmit_zero_len, "xmit_zero_len",
4199 	    KSTAT_DATA_ULONG);
4200 	kstat_named_init(&ethstat->xmit_pullup, "xmit_pullup",
4201 	    KSTAT_DATA_ULONG);
4202 	kstat_named_init(&ethstat->xmit_pullup_first, "xmit_pullup_first",
4203 	    KSTAT_DATA_ULONG);
4204 	kstat_named_init(&ethstat->xmit_lowbuf, "xmit_lowbuf",
4205 	    KSTAT_DATA_ULONG);
4206 	kstat_named_init(&ethstat->xmit_lsobadflags, "xmit_lsobadflags",
4207 	    KSTAT_DATA_ULONG);
4208 	kstat_named_init(&ethstat->xmit_sched, "xmit_sched",
4209 	    KSTAT_DATA_ULONG);
4210 	kstat_named_init(&ethstat->xmit_stall, "xmit_stall",
4211 	    KSTAT_DATA_ULONG);
4212 	kstat_named_init(&ethstat->xmit_stall_early, "xmit_stall_early",
4213 	    KSTAT_DATA_ULONG);
4214 	kstat_named_init(&ethstat->xmit_stall_late, "xmit_stall_late",
4215 	    KSTAT_DATA_ULONG);
4216 	kstat_named_init(&ethstat->xmit_err, "xmit_err",
4217 	    KSTAT_DATA_ULONG);
4218 	kstat_named_init(&ethstat->tx_req, "tx_req",
4219 	    KSTAT_DATA_ULONG);
4220 	kstat_named_init(&ethstat->tx_activate, "tx_activate",
4221 	    KSTAT_DATA_ULONG);
4222 	kstat_named_init(&ethstat->tx_done, "tx_done",
4223 	    KSTAT_DATA_ULONG);
4224 	kstat_named_init(&ethstat->tx_handles_alloced, "tx_handles_alloced",
4225 	    KSTAT_DATA_ULONG);
4226 	kstat_named_init(&ethstat->rx_big, "rx_big",
4227 	    KSTAT_DATA_ULONG);
4228 	kstat_named_init(&ethstat->rx_small, "rx_small",
4229 	    KSTAT_DATA_ULONG);
4230 	ksp->ks_update = myri10ge_slice_stat_kstat_update;
4231 	ksp->ks_private = (void *) ss;
4232 	kstat_install(ksp);
4233 	return (DDI_SUCCESS);
4234 }
4235 
4236 
4237 
4238 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4239 
4240 #include <vm/hat.h>
4241 #include <sys/ddi_isa.h>
4242 void *device_arena_alloc(size_t size, int vm_flag);
4243 void device_arena_free(void *vaddr, size_t size);
4244 
4245 static void
4246 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4247 {
4248 	dev_info_t *parent_dip;
4249 	ddi_acc_handle_t handle;
4250 	unsigned long bus_number, dev_number, func_number;
4251 	unsigned long cfg_pa, paddr, base, pgoffset;
4252 	char 		*cvaddr, *ptr;
4253 	uint32_t	*ptr32;
4254 	int 		retval = DDI_FAILURE;
4255 	int dontcare;
4256 	uint16_t read_vid, read_did, vendor_id, device_id;
4257 
4258 	if (!myri10ge_nvidia_ecrc_enable)
4259 		return;
4260 
4261 	parent_dip = ddi_get_parent(mgp->dip);
4262 	if (parent_dip == NULL) {
4263 		cmn_err(CE_WARN, "%s: I'm an orphan?", mgp->name);
4264 		return;
4265 	}
4266 
4267 	if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
4268 		cmn_err(CE_WARN,
4269 		    "%s: Could not access my parent's registers", mgp->name);
4270 		return;
4271 	}
4272 
4273 	vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
4274 	device_id = pci_config_get16(handle, PCI_CONF_DEVID);
4275 	pci_config_teardown(&handle);
4276 
4277 	if (myri10ge_verbose) {
4278 		unsigned long 	bus_number, dev_number, func_number;
4279 		int 		reg_set, span;
4280 		(void) myri10ge_reg_set(parent_dip, &reg_set, &span,
4281 		    &bus_number, &dev_number, &func_number);
4282 		if (myri10ge_verbose)
4283 			printf("%s: parent at %ld:%ld:%ld\n", mgp->name,
4284 			    bus_number, dev_number, func_number);
4285 	}
4286 
4287 	if (vendor_id !=  0x10de)
4288 		return;
4289 
4290 	if (device_id != 0x005d /* CK804 */ &&
4291 	    (device_id < 0x374 || device_id > 0x378) /* MCP55 */) {
4292 		return;
4293 	}
4294 	(void) myri10ge_reg_set(parent_dip, &dontcare, &dontcare,
4295 	    &bus_number, &dev_number, &func_number);
4296 
4297 	for (cfg_pa = 0xf0000000UL;
4298 	    retval != DDI_SUCCESS && cfg_pa >= 0xe0000000UL;
4299 	    cfg_pa -= 0x10000000UL) {
4300 		/* find the config space address for the nvidia bridge */
4301 		paddr = (cfg_pa + bus_number * 0x00100000UL +
4302 		    (dev_number * 8 + func_number) * 0x00001000UL);
4303 
4304 		base = paddr & (~MMU_PAGEOFFSET);
4305 		pgoffset = paddr & MMU_PAGEOFFSET;
4306 
4307 		/* map it into the kernel */
4308 		cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
4309 		if (cvaddr == NULL)
4310 			cmn_err(CE_WARN, "%s: failed to map nf4: cvaddr\n",
4311 			    mgp->name);
4312 
4313 		hat_devload(kas.a_hat, cvaddr, mmu_ptob(1),
4314 		    i_ddi_paddr_to_pfn(base),
4315 		    PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
4316 
4317 		ptr = cvaddr + pgoffset;
4318 		read_vid = *(uint16_t *)(void *)(ptr + PCI_CONF_VENID);
4319 		read_did = *(uint16_t *)(void *)(ptr + PCI_CONF_DEVID);
4320 		if (vendor_id ==  read_did || device_id == read_did) {
4321 			ptr32 = (uint32_t *)(void *)(ptr + 0x178);
4322 			if (myri10ge_verbose)
4323 				printf("%s: Enabling ECRC on upstream "
4324 				    "Nvidia bridge (0x%x:0x%x) "
4325 				    "at %ld:%ld:%ld\n", mgp->name,
4326 				    read_vid, read_did, bus_number,
4327 				    dev_number, func_number);
4328 			*ptr32 |= 0x40;
4329 			retval = DDI_SUCCESS;
4330 		}
4331 		hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
4332 		device_arena_free(cvaddr, ptob(1));
4333 	}
4334 }
4335 
4336 #else
/*
 * Stub compiled when none of the x86 preprocessor symbols tested by the
 * enclosing #if are defined: intentionally does nothing.
 */
/*ARGSUSED*/
static void
myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
{
}
4342 #endif /* i386 */
4343 
4344 
4345 /*
4346  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
4347  * when the PCI-E Completion packets are aligned on an 8-byte
4348  * boundary.  Some PCI-E chip sets always align Completion packets; on
4349  * the ones that do not, the alignment can be enforced by enabling
4350  * ECRC generation (if supported).
4351  *
4352  * When PCI-E Completion packets are not aligned, it is actually more
4353  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
4354  *
4355  * If the driver can neither enable ECRC nor verify that it has
4356  * already been enabled, then it must use a firmware image which works
4357  * around unaligned completion packets (ethp_z8e.dat), and it should
4358  * also ensure that it never gives the device a Read-DMA which is
4359  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
4360  * enabled, then the driver should use the aligned (eth_z8e.dat)
4361  * firmware image, and set tx.boundary to 4KB.
4362  */
4363 
4364 
4365 static int
4366 myri10ge_firmware_probe(struct myri10ge_priv *mgp)
4367 {
4368 	int status;
4369 
4370 	mgp->tx_boundary = 4096;
4371 	/*
4372 	 * Verify the max read request size was set to 4KB
4373 	 * before trying the test with 4KB.
4374 	 */
4375 	if (mgp->max_read_request_4k == 0)
4376 		mgp->tx_boundary = 2048;
4377 	/*
4378 	 * load the optimized firmware which assumes aligned PCIe
4379 	 * completions in order to see if it works on this host.
4380 	 */
4381 
4382 	mgp->fw_name = "rss_eth_z8e";
4383 	mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4384 	mgp->eth_z8e_length = rss_eth_z8e_length;
4385 
4386 	status = myri10ge_load_firmware(mgp);
4387 	if (status != 0) {
4388 		return (status);
4389 	}
4390 	/*
4391 	 * Enable ECRC if possible
4392 	 */
4393 	myri10ge_enable_nvidia_ecrc(mgp);
4394 
4395 	/*
4396 	 * Run a DMA test which watches for unaligned completions and
4397 	 * aborts on the first one seen.
4398 	 */
4399 	status = myri10ge_dma_test(mgp, MXGEFW_CMD_UNALIGNED_TEST);
4400 	if (status == 0)
4401 		return (0); /* keep the aligned firmware */
4402 
4403 	if (status != E2BIG)
4404 		cmn_err(CE_WARN, "%s: DMA test failed: %d\n",
4405 		    mgp->name, status);
4406 	if (status == ENOSYS)
4407 		cmn_err(CE_WARN, "%s: Falling back to ethp! "
4408 		    "Please install up to date fw\n", mgp->name);
4409 	return (status);
4410 }
4411 
4412 static int
4413 myri10ge_select_firmware(struct myri10ge_priv *mgp)
4414 {
4415 	int aligned;
4416 
4417 	aligned = 0;
4418 
4419 	if (myri10ge_force_firmware == 1) {
4420 		if (myri10ge_verbose)
4421 			printf("%s: Assuming aligned completions (forced)\n",
4422 			    mgp->name);
4423 		aligned = 1;
4424 		goto done;
4425 	}
4426 
4427 	if (myri10ge_force_firmware == 2) {
4428 		if (myri10ge_verbose)
4429 			printf("%s: Assuming unaligned completions (forced)\n",
4430 			    mgp->name);
4431 		aligned = 0;
4432 		goto done;
4433 	}
4434 
4435 	/* If the width is less than 8, we may used the aligned firmware */
4436 	if (mgp->pcie_link_width != 0 && mgp->pcie_link_width < 8) {
4437 		cmn_err(CE_WARN, "!%s: PCIe link running at x%d\n",
4438 		    mgp->name, mgp->pcie_link_width);
4439 		aligned = 1;
4440 		goto done;
4441 	}
4442 
4443 	if (0 == myri10ge_firmware_probe(mgp))
4444 		return (0);  /* keep optimized firmware */
4445 
4446 done:
4447 	if (aligned) {
4448 		mgp->fw_name = "rss_eth_z8e";
4449 		mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4450 		mgp->eth_z8e_length = rss_eth_z8e_length;
4451 		mgp->tx_boundary = 4096;
4452 	} else {
4453 		mgp->fw_name = "rss_ethp_z8e";
4454 		mgp->eth_z8e = (unsigned char *)rss_ethp_z8e;
4455 		mgp->eth_z8e_length = rss_ethp_z8e_length;
4456 		mgp->tx_boundary = 2048;
4457 	}
4458 
4459 	return (myri10ge_load_firmware(mgp));
4460 }
4461 
4462 static int
4463 myri10ge_add_intrs(struct myri10ge_priv *mgp, int add_handler)
4464 {
4465 	dev_info_t *devinfo = mgp->dip;
4466 	int count, avail, actual, intr_types;
4467 	int x, y, rc, inum = 0;
4468 
4469 
4470 	rc = ddi_intr_get_supported_types(devinfo, &intr_types);
4471 	if (rc != DDI_SUCCESS) {
4472 		cmn_err(CE_WARN,
4473 		    "!%s: ddi_intr_get_nintrs() failure, rc = %d\n", mgp->name,
4474 		    rc);
4475 		return (DDI_FAILURE);
4476 	}
4477 
4478 	if (!myri10ge_use_msi)
4479 		intr_types &= ~DDI_INTR_TYPE_MSI;
4480 	if (!myri10ge_use_msix)
4481 		intr_types &= ~DDI_INTR_TYPE_MSIX;
4482 
4483 	if (intr_types & DDI_INTR_TYPE_MSIX) {
4484 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSIX;
4485 		mgp->intr_type = "MSI-X";
4486 	} else if (intr_types & DDI_INTR_TYPE_MSI) {
4487 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSI;
4488 		mgp->intr_type = "MSI";
4489 	} else {
4490 		mgp->ddi_intr_type = DDI_INTR_TYPE_FIXED;
4491 		mgp->intr_type = "Legacy";
4492 	}
4493 	/* Get number of interrupts */
4494 	rc = ddi_intr_get_nintrs(devinfo, mgp->ddi_intr_type, &count);
4495 	if ((rc != DDI_SUCCESS) || (count == 0)) {
4496 		cmn_err(CE_WARN, "%s: ddi_intr_get_nintrs() failure, rc: %d, "
4497 		    "count: %d", mgp->name, rc, count);
4498 
4499 		return (DDI_FAILURE);
4500 	}
4501 
4502 	/* Get number of available interrupts */
4503 	rc = ddi_intr_get_navail(devinfo, mgp->ddi_intr_type, &avail);
4504 	if ((rc != DDI_SUCCESS) || (avail == 0)) {
4505 		cmn_err(CE_WARN, "%s: ddi_intr_get_navail() failure, "
4506 		    "rc: %d, avail: %d\n", mgp->name, rc, avail);
4507 		return (DDI_FAILURE);
4508 	}
4509 	if (avail < count) {
4510 		cmn_err(CE_NOTE,
4511 		    "!%s: nintrs() returned %d, navail returned %d",
4512 		    mgp->name, count, avail);
4513 		count = avail;
4514 	}
4515 
4516 	if (count < mgp->num_slices)
4517 		return (DDI_FAILURE);
4518 
4519 	if (count > mgp->num_slices)
4520 		count = mgp->num_slices;
4521 
4522 	/* Allocate memory for MSI interrupts */
4523 	mgp->intr_size = count * sizeof (ddi_intr_handle_t);
4524 	mgp->htable = kmem_alloc(mgp->intr_size, KM_SLEEP);
4525 
4526 	rc = ddi_intr_alloc(devinfo, mgp->htable, mgp->ddi_intr_type, inum,
4527 	    count, &actual, DDI_INTR_ALLOC_NORMAL);
4528 
4529 	if ((rc != DDI_SUCCESS) || (actual == 0)) {
4530 		cmn_err(CE_WARN, "%s: ddi_intr_alloc() failed: %d",
4531 		    mgp->name, rc);
4532 
4533 		kmem_free(mgp->htable, mgp->intr_size);
4534 		mgp->htable = NULL;
4535 		return (DDI_FAILURE);
4536 	}
4537 
4538 	if ((actual < count) && myri10ge_verbose) {
4539 		cmn_err(CE_NOTE, "%s: got %d/%d slices",
4540 		    mgp->name, actual, count);
4541 	}
4542 
4543 	mgp->intr_cnt = actual;
4544 
4545 	/*
4546 	 * Get priority for first irq, assume remaining are all the same
4547 	 */
4548 	if (ddi_intr_get_pri(mgp->htable[0], &mgp->intr_pri)
4549 	    != DDI_SUCCESS) {
4550 		cmn_err(CE_WARN, "%s: ddi_intr_get_pri() failed", mgp->name);
4551 
4552 		/* Free already allocated intr */
4553 		for (y = 0; y < actual; y++) {
4554 			(void) ddi_intr_free(mgp->htable[y]);
4555 		}
4556 
4557 		kmem_free(mgp->htable, mgp->intr_size);
4558 		mgp->htable = NULL;
4559 		return (DDI_FAILURE);
4560 	}
4561 
4562 	mgp->icookie = (void *)(uintptr_t)mgp->intr_pri;
4563 
4564 	if (!add_handler)
4565 		return (DDI_SUCCESS);
4566 
4567 	/* Call ddi_intr_add_handler() */
4568 	for (x = 0; x < actual; x++) {
4569 		if (ddi_intr_add_handler(mgp->htable[x], myri10ge_intr,
4570 		    (caddr_t)&mgp->ss[x], NULL) != DDI_SUCCESS) {
4571 			cmn_err(CE_WARN, "%s: ddi_intr_add_handler() failed",
4572 			    mgp->name);
4573 
4574 			/* Free already allocated intr */
4575 			for (y = 0; y < actual; y++) {
4576 				(void) ddi_intr_free(mgp->htable[y]);
4577 			}
4578 
4579 			kmem_free(mgp->htable, mgp->intr_size);
4580 			mgp->htable = NULL;
4581 			return (DDI_FAILURE);
4582 		}
4583 	}
4584 
4585 	(void) ddi_intr_get_cap(mgp->htable[0], &mgp->intr_cap);
4586 	if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4587 		/* Call ddi_intr_block_enable() for MSI */
4588 		(void) ddi_intr_block_enable(mgp->htable, mgp->intr_cnt);
4589 	} else {
4590 		/* Call ddi_intr_enable() for MSI non block enable */
4591 		for (x = 0; x < mgp->intr_cnt; x++) {
4592 			(void) ddi_intr_enable(mgp->htable[x]);
4593 		}
4594 	}
4595 
4596 	return (DDI_SUCCESS);
4597 }
4598 
4599 static void
4600 myri10ge_rem_intrs(struct myri10ge_priv *mgp, int handler_installed)
4601 {
4602 	int x, err;
4603 
4604 	/* Disable all interrupts */
4605 	if (handler_installed) {
4606 		if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4607 			/* Call ddi_intr_block_disable() */
4608 			(void) ddi_intr_block_disable(mgp->htable,
4609 			    mgp->intr_cnt);
4610 		} else {
4611 			for (x = 0; x < mgp->intr_cnt; x++) {
4612 				(void) ddi_intr_disable(mgp->htable[x]);
4613 			}
4614 		}
4615 	}
4616 
4617 	for (x = 0; x < mgp->intr_cnt; x++) {
4618 		if (handler_installed) {
4619 		/* Call ddi_intr_remove_handler() */
4620 			err = ddi_intr_remove_handler(mgp->htable[x]);
4621 			if (err != DDI_SUCCESS) {
4622 				cmn_err(CE_WARN,
4623 				    "%s: ddi_intr_remove_handler for"
4624 				    "vec %d returned %d\n", mgp->name,
4625 				    x, err);
4626 			}
4627 		}
4628 		err = ddi_intr_free(mgp->htable[x]);
4629 		if (err != DDI_SUCCESS) {
4630 			cmn_err(CE_WARN,
4631 			    "%s: ddi_intr_free for vec %d returned %d\n",
4632 			    mgp->name, x, err);
4633 		}
4634 	}
4635 	kmem_free(mgp->htable, mgp->intr_size);
4636 	mgp->htable = NULL;
4637 }
4638 
4639 static void
4640 myri10ge_test_physical(dev_info_t *dip)
4641 {
4642 	ddi_dma_handle_t	handle;
4643 	struct myri10ge_dma_stuff dma;
4644 	void *addr;
4645 	int err;
4646 
4647 	/* test #1, sufficient for older sparc systems */
4648 	myri10ge_tx_dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
4649 	err = ddi_dma_alloc_handle(dip, &myri10ge_tx_dma_attr,
4650 	    DDI_DMA_DONTWAIT, NULL, &handle);
4651 	if (err == DDI_DMA_BADATTR)
4652 		goto fail;
4653 	ddi_dma_free_handle(&handle);
4654 
4655 	/* test #2, required on Olympis where the bind is what fails */
4656 	addr = myri10ge_dma_alloc(dip, 128, &myri10ge_tx_dma_attr,
4657 	    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
4658 	    DDI_DMA_WRITE|DDI_DMA_STREAMING, &dma, 0, DDI_DMA_DONTWAIT);
4659 	if (addr == NULL)
4660 		goto fail;
4661 	myri10ge_dma_free(&dma);
4662 	return;
4663 
4664 fail:
4665 	if (myri10ge_verbose)
4666 		printf("myri10ge%d: DDI_DMA_FORCE_PHYSICAL failed, "
4667 		    "using IOMMU\n", ddi_get_instance(dip));
4668 
4669 	myri10ge_tx_dma_attr.dma_attr_flags &= ~DDI_DMA_FORCE_PHYSICAL;
4670 }
4671 
4672 static void
4673 myri10ge_get_props(dev_info_t *dip)
4674 {
4675 
4676 	myri10ge_flow_control =  ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4677 	    "myri10ge_flow_control", myri10ge_flow_control);
4678 
4679 	myri10ge_intr_coal_delay = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4680 	    "myri10ge_intr_coal_delay", myri10ge_intr_coal_delay);
4681 
4682 #if defined __i386 || defined i386 || defined __i386__ || defined __x86_64__
4683 	myri10ge_nvidia_ecrc_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4684 	    "myri10ge_nvidia_ecrc_enable", 1);
4685 #endif
4686 
4687 
4688 	myri10ge_use_msi = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4689 	    "myri10ge_use_msi", myri10ge_use_msi);
4690 
4691 	myri10ge_deassert_wait = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4692 	    "myri10ge_deassert_wait",  myri10ge_deassert_wait);
4693 
4694 	myri10ge_verbose = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4695 	    "myri10ge_verbose", myri10ge_verbose);
4696 
4697 	myri10ge_tx_copylen = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4698 	    "myri10ge_tx_copylen", myri10ge_tx_copylen);
4699 
4700 	if (myri10ge_tx_copylen < 60) {
4701 		cmn_err(CE_WARN,
4702 		    "myri10ge_tx_copylen must be >= 60 bytes\n");
4703 		myri10ge_tx_copylen = 60;
4704 	}
4705 
4706 	myri10ge_mtu_override = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4707 	    "myri10ge_mtu_override", myri10ge_mtu_override);
4708 
4709 	if (myri10ge_mtu_override >= MYRI10GE_MIN_GLD_MTU &&
4710 	    myri10ge_mtu_override <= MYRI10GE_MAX_GLD_MTU)
4711 		myri10ge_mtu = myri10ge_mtu_override +
4712 		    sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ;
4713 	else if (myri10ge_mtu_override != 0) {
4714 		cmn_err(CE_WARN,
4715 		    "myri10ge_mtu_override must be between 1500 and "
4716 		    "9000 bytes\n");
4717 	}
4718 
4719 	myri10ge_bigbufs_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4720 	    "myri10ge_bigbufs_initial", myri10ge_bigbufs_initial);
4721 	myri10ge_bigbufs_max = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4722 	    "myri10ge_bigbufs_max", myri10ge_bigbufs_max);
4723 
4724 	myri10ge_watchdog_reset = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4725 	    "myri10ge_watchdog_reset", myri10ge_watchdog_reset);
4726 
4727 	if (myri10ge_bigbufs_initial < 128) {
4728 		cmn_err(CE_WARN,
4729 		    "myri10ge_bigbufs_initial be at least 128\n");
4730 		myri10ge_bigbufs_initial = 128;
4731 	}
4732 	if (myri10ge_bigbufs_max < 128) {
4733 		cmn_err(CE_WARN,
4734 		    "myri10ge_bigbufs_max be at least 128\n");
4735 		myri10ge_bigbufs_max = 128;
4736 	}
4737 
4738 	if (myri10ge_bigbufs_max < myri10ge_bigbufs_initial) {
4739 		cmn_err(CE_WARN,
4740 		    "myri10ge_bigbufs_max must be >=  "
4741 		    "myri10ge_bigbufs_initial\n");
4742 		myri10ge_bigbufs_max = myri10ge_bigbufs_initial;
4743 	}
4744 
4745 	myri10ge_force_firmware = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4746 	    "myri10ge_force_firmware", myri10ge_force_firmware);
4747 
4748 	myri10ge_max_slices = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4749 	    "myri10ge_max_slices", myri10ge_max_slices);
4750 
4751 	myri10ge_use_msix = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4752 	    "myri10ge_use_msix", myri10ge_use_msix);
4753 
4754 	myri10ge_rss_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4755 	    "myri10ge_rss_hash", myri10ge_rss_hash);
4756 
4757 	if (myri10ge_rss_hash > MXGEFW_RSS_HASH_TYPE_MAX ||
4758 	    myri10ge_rss_hash < MXGEFW_RSS_HASH_TYPE_IPV4) {
4759 		cmn_err(CE_WARN, "myri10ge: Illegal rssh hash type %d\n",
4760 		    myri10ge_rss_hash);
4761 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4762 	}
4763 	myri10ge_lro = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4764 	    "myri10ge_lro", myri10ge_lro);
4765 	myri10ge_lro_cnt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4766 	    "myri10ge_lro_cnt", myri10ge_lro_cnt);
4767 	myri10ge_lro_max_aggr = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4768 	    "myri10ge_lro_max_aggr", myri10ge_lro_max_aggr);
4769 	myri10ge_tx_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4770 	    "myri10ge_tx_hash", myri10ge_tx_hash);
4771 	myri10ge_use_lso = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4772 	    "myri10ge_use_lso", myri10ge_use_lso);
4773 	myri10ge_lso_copy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4774 	    "myri10ge_lso_copy", myri10ge_lso_copy);
4775 	myri10ge_tx_handles_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4776 	    "myri10ge_tx_handles_initial", myri10ge_tx_handles_initial);
4777 	myri10ge_small_bytes = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4778 	    "myri10ge_small_bytes", myri10ge_small_bytes);
4779 	if ((myri10ge_small_bytes + MXGEFW_PAD) & (128 -1)) {
4780 		cmn_err(CE_WARN, "myri10ge: myri10ge_small_bytes (%d)\n",
4781 		    myri10ge_small_bytes);
4782 		cmn_err(CE_WARN, "must be aligned on 128b bndry -2\n");
4783 		myri10ge_small_bytes += 128;
4784 		myri10ge_small_bytes &= ~(128 -1);
4785 		myri10ge_small_bytes -= MXGEFW_PAD;
4786 		cmn_err(CE_WARN, "rounded up to %d\n",
4787 		    myri10ge_small_bytes);
4788 
4789 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4790 	}
4791 }
4792 
4793 #ifndef	PCI_EXP_LNKSTA
4794 #define	PCI_EXP_LNKSTA 18
4795 #endif
4796 
4797 static int
4798 myri10ge_find_cap(ddi_acc_handle_t handle, uint8_t *capptr, uint8_t capid)
4799 {
4800 	uint16_t	status;
4801 	uint8_t 	ptr;
4802 
4803 	/* check to see if we have capabilities */
4804 	status = pci_config_get16(handle, PCI_CONF_STAT);
4805 	if (!(status & PCI_STAT_CAP)) {
4806 		cmn_err(CE_WARN, "PCI_STAT_CAP not found\n");
4807 		return (ENXIO);
4808 	}
4809 
4810 	ptr = pci_config_get8(handle, PCI_CONF_CAP_PTR);
4811 
4812 	/* Walk the capabilities list, looking for a PCI Express cap */
4813 	while (ptr != PCI_CAP_NEXT_PTR_NULL) {
4814 		if (pci_config_get8(handle, ptr + PCI_CAP_ID) == capid)
4815 			break;
4816 		ptr = pci_config_get8(handle, ptr + PCI_CAP_NEXT_PTR);
4817 	}
4818 	if (ptr < 64) {
4819 		cmn_err(CE_WARN, "Bad capability offset %d\n", ptr);
4820 		return (ENXIO);
4821 	}
4822 	*capptr = ptr;
4823 	return (0);
4824 }
4825 
4826 static int
4827 myri10ge_set_max_readreq(ddi_acc_handle_t handle)
4828 {
4829 	int err;
4830 	uint16_t	val;
4831 	uint8_t		ptr;
4832 
4833 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4834 	if (err != 0) {
4835 		cmn_err(CE_WARN, "could not find PCIe cap\n");
4836 		return (ENXIO);
4837 	}
4838 
4839 	/* set max read req to 4096 */
4840 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4841 	val = (val & ~PCIE_DEVCTL_MAX_READ_REQ_MASK) |
4842 	    PCIE_DEVCTL_MAX_READ_REQ_4096;
4843 	pci_config_put16(handle, ptr + PCIE_DEVCTL, val);
4844 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4845 	if ((val & (PCIE_DEVCTL_MAX_READ_REQ_4096)) !=
4846 	    PCIE_DEVCTL_MAX_READ_REQ_4096) {
4847 		cmn_err(CE_WARN, "could not set max read req (%x)\n", val);
4848 		return (EINVAL);
4849 	}
4850 	return (0);
4851 }
4852 
4853 static int
4854 myri10ge_read_pcie_link_width(ddi_acc_handle_t handle, int *link)
4855 {
4856 	int err;
4857 	uint16_t	val;
4858 	uint8_t		ptr;
4859 
4860 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4861 	if (err != 0) {
4862 		cmn_err(CE_WARN, "could not set max read req\n");
4863 		return (ENXIO);
4864 	}
4865 
4866 	/* read link width */
4867 	val = pci_config_get16(handle, ptr + PCIE_LINKSTS);
4868 	val &= PCIE_LINKSTS_NEG_WIDTH_MASK;
4869 	*link = (val >> 4);
4870 	return (0);
4871 }
4872 
4873 static int
4874 myri10ge_reset_nic(struct myri10ge_priv *mgp)
4875 {
4876 	ddi_acc_handle_t handle = mgp->cfg_hdl;
4877 	uint32_t reboot;
4878 	uint16_t cmd;
4879 	int err;
4880 
4881 	cmd = pci_config_get16(handle, PCI_CONF_COMM);
4882 	if ((cmd & PCI_COMM_ME) == 0) {
4883 		/*
4884 		 * Bus master DMA disabled?  Check to see if the card
4885 		 * rebooted due to a parity error For now, just report
4886 		 * it
4887 		 */
4888 
4889 		/* enter read32 mode */
4890 		pci_config_put8(handle, mgp->vso + 0x10, 0x3);
4891 		/* read REBOOT_STATUS (0xfffffff0) */
4892 		pci_config_put32(handle, mgp->vso + 0x18, 0xfffffff0);
4893 		reboot = pci_config_get16(handle, mgp->vso + 0x14);
4894 		cmn_err(CE_WARN, "%s NIC rebooted 0x%x\n", mgp->name, reboot);
4895 		return (0);
4896 	}
4897 	if (!myri10ge_watchdog_reset) {
4898 		cmn_err(CE_WARN, "%s: not resetting\n", mgp->name);
4899 		return (1);
4900 	}
4901 
4902 	myri10ge_stop_locked(mgp);
4903 	err = myri10ge_start_locked(mgp);
4904 	if (err == DDI_FAILURE) {
4905 		return (0);
4906 	}
4907 	mac_tx_update(mgp->mh);
4908 	return (1);
4909 }
4910 
4911 static inline int
4912 myri10ge_ring_stalled(myri10ge_tx_ring_t *tx)
4913 {
4914 	if (tx->sched != tx->stall &&
4915 	    tx->done == tx->watchdog_done &&
4916 	    tx->watchdog_req != tx->watchdog_done)
4917 		return (1);
4918 	return (0);
4919 }
4920 
/*
 * Periodic watchdog, run as a timeout(9F) callback with the per-device
 * state as its argument.  Under mgp->intrlock it:
 *
 *  - returns without re-arming if the interface is not running;
 *  - looks for a stalled transmit ring and, unless the firmware's
 *    dropped_pause counter moved since the previous run, resets the NIC;
 *  - snapshots each ring's done/req counters for the next run;
 *  - adds jumbo buffers to any slice whose rx_copy statistic changed
 *    since the previous run;
 *  - re-arms itself for mgp->timer_ticks from now.
 */
static void
myri10ge_watchdog(void *arg)
{
	struct myri10ge_priv *mgp;
	struct myri10ge_slice_state *ss;
	myri10ge_tx_ring_t *tx;
	int nic_ok = 1;
	int slices_stalled, rx_pause, i;
	int add_rx;

	mgp = arg;
	mutex_enter(&mgp->intrlock);
	if (mgp->running != MYRI10GE_ETH_RUNNING) {
		cmn_err(CE_WARN,
		    "%s not running, not rearming watchdog (%d)\n",
		    mgp->name, mgp->running);
		mutex_exit(&mgp->intrlock);
		return;
	}

	/* firmware counter from slice 0, converted from network byte order */
	rx_pause = ntohl(mgp->ss[0].fw_stats->dropped_pause);

	/*
	 * make sure nic is stalled before we reset the nic, so as to
	 * ensure we don't rip the transmit data structures out from
	 * under a pending transmit
	 */

	/* stop at the first stalled ring; i and tx then identify it */
	for (slices_stalled = 0, i = 0; i < mgp->num_slices; i++) {
		tx = &mgp->ss[i].tx;
		slices_stalled = myri10ge_ring_stalled(tx);
		if (slices_stalled)
			break;
	}

	if (slices_stalled) {
		/*
		 * An unchanged pause-drop count leads to a reset; a changed
		 * one is only reported as flow control by the link partner.
		 */
		if (mgp->watchdog_rx_pause == rx_pause) {
			cmn_err(CE_WARN,
			    "%s slice %d stalled:(%d, %d, %d, %d, %d %d %d\n)",
			    mgp->name, i, tx->sched, tx->stall,
			    tx->done, tx->watchdog_done, tx->req, tx->pkt_done,
			    (int)ntohl(mgp->ss[i].fw_stats->send_done_count));
			nic_ok = myri10ge_reset_nic(mgp);
		} else {
			cmn_err(CE_WARN,
			    "%s Flow controlled, check link partner\n",
			    mgp->name);
		}
	}

	if (!nic_ok) {
		cmn_err(CE_WARN,
		    "%s Nic dead, not rearming watchdog\n", mgp->name);
		mutex_exit(&mgp->intrlock);
		return;
	}
	for (i = 0; i < mgp->num_slices; i++) {
		ss = &mgp->ss[i];
		tx = &ss->tx;
		/* snapshot consumed by myri10ge_ring_stalled() on the next run */
		tx->watchdog_done = tx->done;
		tx->watchdog_req = tx->req;
		/*
		 * NOTE(review): MYRI10GE_SLICE_STAT() is defined elsewhere;
		 * it presumably reads the statistic of the slice in `ss' --
		 * confirm before renaming that local.
		 */
		if (ss->watchdog_rx_copy != MYRI10GE_SLICE_STAT(rx_copy)) {
			ss->watchdog_rx_copy = MYRI10GE_SLICE_STAT(rx_copy);
			/*
			 * Request as many buffers as are already allocated,
			 * limited by myri10ge_bigbufs_max less the buffers
			 * not counted in jbufs_for_smalls.
			 */
			add_rx =
			    min(ss->jpool.num_alloc,
			    myri10ge_bigbufs_max -
			    (ss->jpool.num_alloc -
			    ss->jbufs_for_smalls));
			if (add_rx != 0) {
				(void) myri10ge_add_jbufs(ss, add_rx, 0);
				/* now feed them to the firmware */
				mutex_enter(&ss->jpool.mtx);
				myri10ge_restock_jumbos(ss);
				mutex_exit(&ss->jpool.mtx);
			}
		}
	}
	mgp->watchdog_rx_pause = rx_pause;

	/* re-arm for the next period */
	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
	    mgp->timer_ticks);
	mutex_exit(&mgp->intrlock);
}
5004 
5005 /*ARGSUSED*/
5006 static int
5007 myri10ge_get_coalesce(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5008 
5009 {
5010 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5011 	(void) mi_mpprintf(mp, "%d", mgp->intr_coal_delay);
5012 	return (0);
5013 }
5014 
5015 /*ARGSUSED*/
5016 static int
5017 myri10ge_set_coalesce(queue_t *q, mblk_t *mp, char *value,
5018     caddr_t cp, cred_t *credp)
5019 
5020 {
5021 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5022 	char *end;
5023 	size_t new_value;
5024 
5025 	new_value = mi_strtol(value, &end, 10);
5026 	if (end == value)
5027 		return (EINVAL);
5028 
5029 	mutex_enter(&myri10ge_param_lock);
5030 	mgp->intr_coal_delay = (int)new_value;
5031 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
5032 	mutex_exit(&myri10ge_param_lock);
5033 	return (0);
5034 }
5035 
5036 /*ARGSUSED*/
5037 static int
5038 myri10ge_get_pauseparam(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5039 
5040 {
5041 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5042 	(void) mi_mpprintf(mp, "%d", mgp->pause);
5043 	return (0);
5044 }
5045 
5046 /*ARGSUSED*/
5047 static int
5048 myri10ge_set_pauseparam(queue_t *q, mblk_t *mp, char *value,
5049 			caddr_t cp, cred_t *credp)
5050 
5051 {
5052 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
5053 	char *end;
5054 	size_t new_value;
5055 	int err = 0;
5056 
5057 	new_value = mi_strtol(value, &end, 10);
5058 	if (end == value)
5059 		return (EINVAL);
5060 	if (new_value != 0)
5061 		new_value = 1;
5062 
5063 	mutex_enter(&myri10ge_param_lock);
5064 	if (new_value != mgp->pause)
5065 		err = myri10ge_change_pause(mgp, new_value);
5066 	mutex_exit(&myri10ge_param_lock);
5067 	return (err);
5068 }
5069 
5070 /*ARGSUSED*/
5071 static int
5072 myri10ge_get_int(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
5073 
5074 {
5075 	(void) mi_mpprintf(mp, "%d", *(int *)(void *)cp);
5076 	return (0);
5077 }
5078 
5079 /*ARGSUSED*/
5080 static int
5081 myri10ge_set_int(queue_t *q, mblk_t *mp, char *value,
5082     caddr_t cp, cred_t *credp)
5083 
5084 {
5085 	char *end;
5086 	size_t new_value;
5087 
5088 	new_value = mi_strtol(value, &end, 10);
5089 	if (end == value)
5090 		return (EINVAL);
5091 	*(int *)(void *)cp = new_value;
5092 
5093 	return (0);
5094 }
5095 
5096 static void
5097 myri10ge_ndd_init(struct myri10ge_priv *mgp)
5098 {
5099 	mgp->nd_head = NULL;
5100 
5101 	(void) nd_load(&mgp->nd_head, "myri10ge_intr_coal_delay",
5102 	    myri10ge_get_coalesce, myri10ge_set_coalesce, (caddr_t)mgp);
5103 	(void) nd_load(&mgp->nd_head, "myri10ge_flow_control",
5104 	    myri10ge_get_pauseparam, myri10ge_set_pauseparam, (caddr_t)mgp);
5105 	(void) nd_load(&mgp->nd_head, "myri10ge_verbose",
5106 	    myri10ge_get_int, myri10ge_set_int, (caddr_t)&myri10ge_verbose);
5107 	(void) nd_load(&mgp->nd_head, "myri10ge_deassert_wait",
5108 	    myri10ge_get_int, myri10ge_set_int,
5109 	    (caddr_t)&myri10ge_deassert_wait);
5110 	(void) nd_load(&mgp->nd_head, "myri10ge_bigbufs_max",
5111 	    myri10ge_get_int, myri10ge_set_int,
5112 	    (caddr_t)&myri10ge_bigbufs_max);
5113 	(void) nd_load(&mgp->nd_head, "myri10ge_lro",
5114 	    myri10ge_get_int, myri10ge_set_int,
5115 	    (caddr_t)&myri10ge_lro);
5116 	(void) nd_load(&mgp->nd_head, "myri10ge_lro_max_aggr",
5117 	    myri10ge_get_int, myri10ge_set_int,
5118 	    (caddr_t)&myri10ge_lro_max_aggr);
5119 	(void) nd_load(&mgp->nd_head, "myri10ge_tx_hash",
5120 	    myri10ge_get_int, myri10ge_set_int,
5121 	    (caddr_t)&myri10ge_tx_hash);
5122 	(void) nd_load(&mgp->nd_head, "myri10ge_lso_copy",
5123 	    myri10ge_get_int, myri10ge_set_int,
5124 	    (caddr_t)&myri10ge_lso_copy);
5125 }
5126 
/*
 * Release the ndd parameter list rooted at mgp->nd_head.
 */
static void
myri10ge_ndd_fini(struct myri10ge_priv *mgp)
{
	nd_free(&mgp->nd_head);
}
5132 
5133 static void
5134 myri10ge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
5135 {
5136 	struct iocblk *iocp;
5137 	struct myri10ge_priv *mgp = arg;
5138 	int cmd, ok, err;
5139 
5140 	iocp = (struct iocblk *)(void *)mp->b_rptr;
5141 	cmd = iocp->ioc_cmd;
5142 
5143 	ok = 0;
5144 	err = 0;
5145 
5146 	switch (cmd) {
5147 	case ND_GET:
5148 	case ND_SET:
5149 		ok = nd_getset(wq, mgp->nd_head, mp);
5150 		break;
5151 	default:
5152 		break;
5153 	}
5154 	if (!ok)
5155 		err = EINVAL;
5156 	else
5157 		err = iocp->ioc_error;
5158 
5159 	if (!err)
5160 		miocack(wq, mp, iocp->ioc_count, err);
5161 	else
5162 		miocnak(wq, mp, 0, err);
5163 }
5164 
5165 static struct myri10ge_priv *mgp_list;
5166 
5167 struct myri10ge_priv *
5168 myri10ge_get_instance(uint_t unit)
5169 {
5170 	struct myri10ge_priv *mgp;
5171 
5172 	mutex_enter(&myri10ge_param_lock);
5173 	for (mgp = mgp_list; mgp != NULL; mgp = mgp->next) {
5174 		if (unit == ddi_get_instance(mgp->dip)) {
5175 			mgp->refcnt++;
5176 			break;
5177 		}
5178 	}
5179 	mutex_exit(&myri10ge_param_lock);
5180 	return (mgp);
5181 }
5182 
5183 void
5184 myri10ge_put_instance(struct myri10ge_priv *mgp)
5185 {
5186 	mutex_enter(&myri10ge_param_lock);
5187 	mgp->refcnt--;
5188 	mutex_exit(&myri10ge_param_lock);
5189 }
5190 
5191 static boolean_t
5192 myri10ge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
5193 {
5194 	struct myri10ge_priv *mgp = arg;
5195 	uint32_t *cap_hcksum;
5196 	mac_capab_lso_t *cap_lso;
5197 	mac_capab_rings_t *cap_rings;
5198 
5199 	switch (cap) {
5200 	case MAC_CAPAB_HCKSUM:
5201 		cap_hcksum = cap_data;
5202 		*cap_hcksum = HCKSUM_INET_PARTIAL;
5203 		break;
5204 	case MAC_CAPAB_RINGS:
5205 		cap_rings = cap_data;
5206 		switch (cap_rings->mr_type) {
5207 		case MAC_RING_TYPE_RX:
5208 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5209 			cap_rings->mr_rnum = mgp->num_slices;
5210 			cap_rings->mr_gnum = 1;
5211 			cap_rings->mr_rget = myri10ge_fill_ring;
5212 			cap_rings->mr_gget = myri10ge_fill_group;
5213 			break;
5214 		case MAC_RING_TYPE_TX:
5215 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5216 			cap_rings->mr_rnum = mgp->num_slices;
5217 			cap_rings->mr_gnum = 0;
5218 			cap_rings->mr_rget = myri10ge_fill_ring;
5219 			cap_rings->mr_gget = NULL;
5220 			break;
5221 		default:
5222 			return (B_FALSE);
5223 		}
5224 		break;
5225 	case MAC_CAPAB_LSO:
5226 		cap_lso = cap_data;
5227 		if (!myri10ge_use_lso)
5228 			return (B_FALSE);
5229 		if (!(mgp->features & MYRI10GE_TSO))
5230 			return (B_FALSE);
5231 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
5232 		cap_lso->lso_basic_tcp_ipv4.lso_max = (uint16_t)-1;
5233 		break;
5234 
5235 	default:
5236 		return (B_FALSE);
5237 	}
5238 	return (B_TRUE);
5239 }
5240 
5241 
5242 static int
5243 myri10ge_m_stat(void *arg, uint_t stat, uint64_t *val)
5244 {
5245 	struct myri10ge_priv *mgp = arg;
5246 	struct myri10ge_rx_ring_stats *rstat;
5247 	struct myri10ge_tx_ring_stats *tstat;
5248 	mcp_irq_data_t *fw_stats = mgp->ss[0].fw_stats;
5249 	struct myri10ge_slice_state *ss;
5250 	uint64_t tmp = 0;
5251 	int i;
5252 
5253 	switch (stat) {
5254 	case MAC_STAT_IFSPEED:
5255 		*val = 10ull * 1000ull * 1000000ull;
5256 		break;
5257 
5258 	case MAC_STAT_MULTIRCV:
5259 		for (i = 0; i < mgp->num_slices; i++) {
5260 			rstat = &mgp->ss[i].rx_stats;
5261 			tmp += rstat->multircv;
5262 		}
5263 		*val = tmp;
5264 		break;
5265 
5266 	case MAC_STAT_BRDCSTRCV:
5267 		for (i = 0; i < mgp->num_slices; i++) {
5268 			rstat = &mgp->ss[i].rx_stats;
5269 			tmp += rstat->brdcstrcv;
5270 		}
5271 		*val = tmp;
5272 		break;
5273 
5274 	case MAC_STAT_MULTIXMT:
5275 		for (i = 0; i < mgp->num_slices; i++) {
5276 			tstat = &mgp->ss[i].tx.stats;
5277 			tmp += tstat->multixmt;
5278 		}
5279 		*val = tmp;
5280 		break;
5281 
5282 	case MAC_STAT_BRDCSTXMT:
5283 		for (i = 0; i < mgp->num_slices; i++) {
5284 			tstat = &mgp->ss[i].tx.stats;
5285 			tmp += tstat->brdcstxmt;
5286 		}
5287 		*val = tmp;
5288 		break;
5289 
5290 	case MAC_STAT_NORCVBUF:
5291 		tmp = ntohl(fw_stats->dropped_no_big_buffer);
5292 		tmp += ntohl(fw_stats->dropped_no_small_buffer);
5293 		tmp += ntohl(fw_stats->dropped_link_overflow);
5294 		for (i = 0; i < mgp->num_slices; i++) {
5295 			ss = &mgp->ss[i];
5296 			tmp += MYRI10GE_SLICE_STAT(rx_big_nobuf);
5297 			tmp += MYRI10GE_SLICE_STAT(rx_small_nobuf);
5298 		}
5299 		*val = tmp;
5300 		break;
5301 
5302 	case MAC_STAT_IERRORS:
5303 		tmp += ntohl(fw_stats->dropped_bad_crc32);
5304 		tmp += ntohl(fw_stats->dropped_bad_phy);
5305 		tmp += ntohl(fw_stats->dropped_runt);
5306 		tmp += ntohl(fw_stats->dropped_overrun);
5307 		*val = tmp;
5308 		break;
5309 
5310 	case MAC_STAT_OERRORS:
5311 		for (i = 0; i < mgp->num_slices; i++) {
5312 			ss = &mgp->ss[i];
5313 			tmp += MYRI10GE_SLICE_STAT(xmit_lsobadflags);
5314 			tmp += MYRI10GE_SLICE_STAT(xmit_err);
5315 		}
5316 		*val = tmp;
5317 		break;
5318 
5319 	case MAC_STAT_RBYTES:
5320 		for (i = 0; i < mgp->num_slices; i++) {
5321 			rstat = &mgp->ss[i].rx_stats;
5322 			tmp += rstat->ibytes;
5323 		}
5324 		*val = tmp;
5325 		break;
5326 
5327 	case MAC_STAT_IPACKETS:
5328 		for (i = 0; i < mgp->num_slices; i++) {
5329 			rstat = &mgp->ss[i].rx_stats;
5330 			tmp += rstat->ipackets;
5331 		}
5332 		*val = tmp;
5333 		break;
5334 
5335 	case MAC_STAT_OBYTES:
5336 		for (i = 0; i < mgp->num_slices; i++) {
5337 			tstat = &mgp->ss[i].tx.stats;
5338 			tmp += tstat->obytes;
5339 		}
5340 		*val = tmp;
5341 		break;
5342 
5343 	case MAC_STAT_OPACKETS:
5344 		for (i = 0; i < mgp->num_slices; i++) {
5345 			tstat = &mgp->ss[i].tx.stats;
5346 			tmp += tstat->opackets;
5347 		}
5348 		*val = tmp;
5349 		break;
5350 
5351 	case ETHER_STAT_TOOLONG_ERRORS:
5352 		*val = ntohl(fw_stats->dropped_overrun);
5353 		break;
5354 
5355 #ifdef SOLARIS_S11
5356 	case ETHER_STAT_TOOSHORT_ERRORS:
5357 		*val = ntohl(fw_stats->dropped_runt);
5358 		break;
5359 #endif
5360 
5361 	case ETHER_STAT_LINK_PAUSE:
5362 		*val = mgp->pause;
5363 		break;
5364 
5365 	case ETHER_STAT_LINK_AUTONEG:
5366 		*val = 1;
5367 		break;
5368 
5369 	case ETHER_STAT_LINK_DUPLEX:
5370 		*val = LINK_DUPLEX_FULL;
5371 		break;
5372 
5373 	default:
5374 		return (ENOTSUP);
5375 	}
5376 
5377 	return (0);
5378 }
5379 
5380 /* ARGSUSED */
5381 static void
5382 myri10ge_m_propinfo(void *arg, const char *pr_name,
5383     mac_prop_id_t pr_num, mac_prop_info_handle_t prh)
5384 {
5385 	switch (pr_num) {
5386 	case MAC_PROP_MTU:
5387 		mac_prop_info_set_default_uint32(prh, MYRI10GE_DEFAULT_GLD_MTU);
5388 		mac_prop_info_set_range_uint32(prh, MYRI10GE_MIN_GLD_MTU,
5389 		    MYRI10GE_MAX_GLD_MTU);
5390 		break;
5391 	default:
5392 		break;
5393 	}
5394 }
5395 
5396 /*ARGSUSED*/
5397 static int
5398 myri10ge_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
5399     uint_t pr_valsize, const void *pr_val)
5400 {
5401 	int err = 0;
5402 	struct myri10ge_priv *mgp = arg;
5403 
5404 	switch (pr_num) {
5405 	case MAC_PROP_MTU: {
5406 		uint32_t mtu;
5407 		if (pr_valsize < sizeof (mtu)) {
5408 			err = EINVAL;
5409 			break;
5410 		}
5411 		bcopy(pr_val, &mtu, sizeof (mtu));
5412 		if (mtu > MYRI10GE_MAX_GLD_MTU ||
5413 		    mtu < MYRI10GE_MIN_GLD_MTU) {
5414 			err = EINVAL;
5415 			break;
5416 		}
5417 
5418 		mutex_enter(&mgp->intrlock);
5419 		if (mgp->running != MYRI10GE_ETH_STOPPED) {
5420 			err = EBUSY;
5421 			mutex_exit(&mgp->intrlock);
5422 			break;
5423 		}
5424 
5425 		myri10ge_mtu = mtu + sizeof (struct ether_header) +
5426 		    MXGEFW_PAD + VLAN_TAGSZ;
5427 		mutex_exit(&mgp->intrlock);
5428 		break;
5429 	}
5430 	default:
5431 		err = ENOTSUP;
5432 		break;
5433 	}
5434 
5435 	return (err);
5436 }
5437 
5438 static mac_callbacks_t myri10ge_m_callbacks = {
5439 	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO),
5440 	myri10ge_m_stat,
5441 	myri10ge_m_start,
5442 	myri10ge_m_stop,
5443 	myri10ge_m_promisc,
5444 	myri10ge_m_multicst,
5445 	NULL,
5446 	NULL,
5447 	NULL,
5448 	myri10ge_m_ioctl,
5449 	myri10ge_m_getcapab,
5450 	NULL,
5451 	NULL,
5452 	myri10ge_m_setprop,
5453 	NULL,
5454 	myri10ge_m_propinfo
5455 };
5456 
5457 
5458 static int
5459 myri10ge_probe_slices(struct myri10ge_priv *mgp)
5460 {
5461 	myri10ge_cmd_t cmd;
5462 	int status;
5463 
5464 	mgp->num_slices = 1;
5465 
5466 	/* hit the board with a reset to ensure it is alive */
5467 	(void) memset(&cmd, 0, sizeof (cmd));
5468 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
5469 	if (status != 0) {
5470 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
5471 		return (ENXIO);
5472 	}
5473 
5474 	if (myri10ge_use_msix == 0)
5475 		return (0);
5476 
5477 	/* tell it the size of the interrupt queues */
5478 	cmd.data0 = mgp->max_intr_slots * sizeof (struct mcp_slot);
5479 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
5480 	if (status != 0) {
5481 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_SET_INTRQ_SIZE\n",
5482 		    mgp->name);
5483 		return (ENXIO);
5484 	}
5485 
5486 	/* ask the maximum number of slices it supports */
5487 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
5488 	    &cmd);
5489 	if (status != 0)
5490 		return (0);
5491 
5492 	mgp->num_slices = cmd.data0;
5493 
5494 	/*
5495 	 * if the admin did not specify a limit to how many
5496 	 * slices we should use, cap it automatically to the
5497 	 * number of CPUs currently online
5498 	 */
5499 	if (myri10ge_max_slices == -1)
5500 		myri10ge_max_slices = ncpus;
5501 
5502 	if (mgp->num_slices > myri10ge_max_slices)
5503 		mgp->num_slices = myri10ge_max_slices;
5504 
5505 
5506 	/*
5507 	 * Now try to allocate as many MSI-X vectors as we have
5508 	 * slices. We give up on MSI-X if we can only get a single
5509 	 * vector.
5510 	 */
5511 	while (mgp->num_slices > 1) {
5512 		/* make sure it is a power of two */
5513 		while (!ISP2(mgp->num_slices))
5514 			mgp->num_slices--;
5515 		if (mgp->num_slices == 1)
5516 			return (0);
5517 
5518 		status = myri10ge_add_intrs(mgp, 0);
5519 		if (status == 0) {
5520 			myri10ge_rem_intrs(mgp, 0);
5521 			if (mgp->intr_cnt == mgp->num_slices) {
5522 				if (myri10ge_verbose)
5523 					printf("Got %d slices!\n",
5524 					    mgp->num_slices);
5525 				return (0);
5526 			}
5527 			mgp->num_slices = mgp->intr_cnt;
5528 		} else {
5529 			mgp->num_slices = mgp->num_slices / 2;
5530 		}
5531 	}
5532 
5533 	if (myri10ge_verbose)
5534 		printf("Got %d slices\n", mgp->num_slices);
5535 	return (0);
5536 }
5537 
5538 static void
5539 myri10ge_lro_free(struct myri10ge_slice_state *ss)
5540 {
5541 	struct lro_entry *lro;
5542 
5543 	while (ss->lro_free != NULL) {
5544 		lro = ss->lro_free;
5545 		ss->lro_free = lro->next;
5546 		kmem_free(lro, sizeof (*lro));
5547 	}
5548 }
5549 
5550 static void
5551 myri10ge_lro_alloc(struct myri10ge_slice_state *ss)
5552 {
5553 	struct lro_entry *lro;
5554 	int idx;
5555 
5556 	ss->lro_free = NULL;
5557 	ss->lro_active = NULL;
5558 
5559 	for (idx = 0; idx < myri10ge_lro_cnt; idx++) {
5560 		lro = kmem_zalloc(sizeof (*lro), KM_SLEEP);
5561 		if (lro == NULL)
5562 			continue;
5563 		lro->next = ss->lro_free;
5564 		ss->lro_free = lro;
5565 	}
5566 }
5567 
5568 static void
5569 myri10ge_free_slices(struct myri10ge_priv *mgp)
5570 {
5571 	struct myri10ge_slice_state *ss;
5572 	size_t bytes;
5573 	int i;
5574 
5575 	if (mgp->ss == NULL)
5576 		return;
5577 
5578 	for (i = 0; i < mgp->num_slices; i++) {
5579 		ss = &mgp->ss[i];
5580 		if (ss->rx_done.entry == NULL)
5581 			continue;
5582 		myri10ge_dma_free(&ss->rx_done.dma);
5583 		ss->rx_done.entry = NULL;
5584 		if (ss->fw_stats == NULL)
5585 			continue;
5586 		myri10ge_dma_free(&ss->fw_stats_dma);
5587 		ss->fw_stats = NULL;
5588 		mutex_destroy(&ss->rx_lock);
5589 		mutex_destroy(&ss->tx.lock);
5590 		mutex_destroy(&ss->tx.handle_lock);
5591 		mutex_destroy(&ss->poll_lock);
5592 		myri10ge_jpool_fini(ss);
5593 		myri10ge_slice_stat_destroy(ss);
5594 		myri10ge_lro_free(ss);
5595 	}
5596 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5597 	kmem_free(mgp->ss, bytes);
5598 	mgp->ss = NULL;
5599 }
5600 
5601 
5602 static int
5603 myri10ge_alloc_slices(struct myri10ge_priv *mgp)
5604 {
5605 	struct myri10ge_slice_state *ss;
5606 	size_t bytes;
5607 	int i;
5608 
5609 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5610 	mgp->ss = kmem_zalloc(bytes, KM_SLEEP);
5611 	if (mgp->ss == NULL)
5612 		return (ENOMEM);
5613 	for (i = 0; i < mgp->num_slices; i++) {
5614 		ss = &mgp->ss[i];
5615 
5616 		ss->mgp = mgp;
5617 
5618 		/* allocate the per-slice firmware stats */
5619 		bytes = sizeof (*ss->fw_stats);
5620 		ss->fw_stats = (mcp_irq_data_t *)(void *)
5621 		    myri10ge_dma_alloc(mgp->dip, bytes,
5622 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5623 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5624 		    &ss->fw_stats_dma, 1, DDI_DMA_DONTWAIT);
5625 		if (ss->fw_stats == NULL)
5626 			goto abort;
5627 		(void) memset(ss->fw_stats, 0, bytes);
5628 
5629 		/* allocate rx done ring */
5630 		bytes = mgp->max_intr_slots *
5631 		    sizeof (*ss->rx_done.entry);
5632 		ss->rx_done.entry = (mcp_slot_t *)(void *)
5633 		    myri10ge_dma_alloc(mgp->dip, bytes,
5634 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5635 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5636 		    &ss->rx_done.dma, 1, DDI_DMA_DONTWAIT);
5637 		if (ss->rx_done.entry == NULL) {
5638 			goto abort;
5639 		}
5640 		(void) memset(ss->rx_done.entry, 0, bytes);
5641 		mutex_init(&ss->rx_lock,   NULL, MUTEX_DEFAULT, mgp->icookie);
5642 		mutex_init(&ss->tx.lock,   NULL, MUTEX_DEFAULT, NULL);
5643 		mutex_init(&ss->tx.handle_lock,   NULL, MUTEX_DEFAULT, NULL);
5644 		mutex_init(&ss->poll_lock,   NULL, MUTEX_DEFAULT, NULL);
5645 		myri10ge_jpool_init(ss);
5646 		(void) myri10ge_slice_stat_init(ss);
5647 		myri10ge_lro_alloc(ss);
5648 	}
5649 
5650 	return (0);
5651 
5652 abort:
5653 	myri10ge_free_slices(mgp);
5654 	return (ENOMEM);
5655 }
5656 
5657 static int
5658 myri10ge_save_msi_state(struct myri10ge_priv *mgp,
5659     ddi_acc_handle_t handle)
5660 {
5661 	uint8_t ptr;
5662 	int err;
5663 
5664 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5665 	if (err != 0) {
5666 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5667 		    mgp->name);
5668 		return (DDI_FAILURE);
5669 	}
5670 	mgp->pci_saved_state.msi_ctrl =
5671 	    pci_config_get16(handle, ptr + PCI_MSI_CTRL);
5672 	mgp->pci_saved_state.msi_addr_low =
5673 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET);
5674 	mgp->pci_saved_state.msi_addr_high =
5675 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4);
5676 	mgp->pci_saved_state.msi_data_32 =
5677 	    pci_config_get16(handle, ptr + PCI_MSI_32BIT_DATA);
5678 	mgp->pci_saved_state.msi_data_64 =
5679 	    pci_config_get16(handle, ptr + PCI_MSI_64BIT_DATA);
5680 	return (DDI_SUCCESS);
5681 }
5682 
5683 static int
5684 myri10ge_restore_msi_state(struct myri10ge_priv *mgp,
5685     ddi_acc_handle_t handle)
5686 {
5687 	uint8_t ptr;
5688 	int err;
5689 
5690 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5691 	if (err != 0) {
5692 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5693 		    mgp->name);
5694 		return (DDI_FAILURE);
5695 	}
5696 
5697 	pci_config_put16(handle, ptr + PCI_MSI_CTRL,
5698 	    mgp->pci_saved_state.msi_ctrl);
5699 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET,
5700 	    mgp->pci_saved_state.msi_addr_low);
5701 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4,
5702 	    mgp->pci_saved_state.msi_addr_high);
5703 	pci_config_put16(handle, ptr + PCI_MSI_32BIT_DATA,
5704 	    mgp->pci_saved_state.msi_data_32);
5705 	pci_config_put16(handle, ptr + PCI_MSI_64BIT_DATA,
5706 	    mgp->pci_saved_state.msi_data_64);
5707 
5708 	return (DDI_SUCCESS);
5709 }
5710 
5711 static int
5712 myri10ge_save_pci_state(struct myri10ge_priv *mgp)
5713 {
5714 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5715 	int i;
5716 	int err = DDI_SUCCESS;
5717 
5718 
5719 	/* Save the non-extended PCI config space 32-bits at a time */
5720 	for (i = 0; i < 16; i++)
5721 		mgp->pci_saved_state.base[i] =
5722 		    pci_config_get32(handle, i*4);
5723 
5724 	/* now save MSI interrupt state *, if needed */
5725 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5726 		err = myri10ge_save_msi_state(mgp, handle);
5727 
5728 	return (err);
5729 }
5730 
5731 static int
5732 myri10ge_restore_pci_state(struct myri10ge_priv *mgp)
5733 {
5734 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5735 	int i;
5736 	int err = DDI_SUCCESS;
5737 
5738 
5739 	/* Restore the non-extended PCI config space 32-bits at a time */
5740 	for (i = 15; i >= 0; i--)
5741 		pci_config_put32(handle, i*4, mgp->pci_saved_state.base[i]);
5742 
5743 	/* now restore MSI interrupt state *, if needed */
5744 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5745 		err = myri10ge_restore_msi_state(mgp, handle);
5746 
5747 	if (mgp->max_read_request_4k)
5748 		(void) myri10ge_set_max_readreq(handle);
5749 	return (err);
5750 }
5751 
5752 
5753 static int
5754 myri10ge_suspend(dev_info_t *dip)
5755 {
5756 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5757 	int status;
5758 
5759 	if (mgp == NULL) {
5760 		cmn_err(CE_WARN, "null dip in myri10ge_suspend\n");
5761 		return (DDI_FAILURE);
5762 	}
5763 	if (mgp->dip != dip) {
5764 		cmn_err(CE_WARN, "bad dip in myri10ge_suspend\n");
5765 		return (DDI_FAILURE);
5766 	}
5767 	mutex_enter(&mgp->intrlock);
5768 	if (mgp->running == MYRI10GE_ETH_RUNNING) {
5769 		mgp->running = MYRI10GE_ETH_STOPPING;
5770 		mutex_exit(&mgp->intrlock);
5771 		(void) untimeout(mgp->timer_id);
5772 		mutex_enter(&mgp->intrlock);
5773 		myri10ge_stop_locked(mgp);
5774 		mgp->running = MYRI10GE_ETH_SUSPENDED_RUNNING;
5775 	}
5776 	status = myri10ge_save_pci_state(mgp);
5777 	mutex_exit(&mgp->intrlock);
5778 	return (status);
5779 }
5780 
5781 static int
5782 myri10ge_resume(dev_info_t *dip)
5783 {
5784 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5785 	int status = DDI_SUCCESS;
5786 
5787 	if (mgp == NULL) {
5788 		cmn_err(CE_WARN, "null dip in myri10ge_resume\n");
5789 		return (DDI_FAILURE);
5790 	}
5791 	if (mgp->dip != dip) {
5792 		cmn_err(CE_WARN, "bad dip in myri10ge_resume\n");
5793 		return (DDI_FAILURE);
5794 	}
5795 
5796 	mutex_enter(&mgp->intrlock);
5797 	status = myri10ge_restore_pci_state(mgp);
5798 	if (status == DDI_SUCCESS &&
5799 	    mgp->running == MYRI10GE_ETH_SUSPENDED_RUNNING) {
5800 		status = myri10ge_start_locked(mgp);
5801 	}
5802 	mutex_exit(&mgp->intrlock);
5803 	if (status != DDI_SUCCESS)
5804 		return (status);
5805 
5806 	/* start the watchdog timer */
5807 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
5808 	    mgp->timer_ticks);
5809 	return (DDI_SUCCESS);
5810 }
5811 
5812 static int
5813 myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5814 {
5815 
5816 	struct myri10ge_priv *mgp;
5817 	mac_register_t *macp, *omacp;
5818 	ddi_acc_handle_t handle;
5819 	uint32_t csr, hdr_offset;
5820 	int status, span, link_width, max_read_request_4k;
5821 	unsigned long bus_number, dev_number, func_number;
5822 	size_t bytes;
5823 	offset_t ss_offset;
5824 	uint8_t vso;
5825 
5826 	if (cmd == DDI_RESUME) {
5827 		return (myri10ge_resume(dip));
5828 	}
5829 
5830 	if (cmd != DDI_ATTACH)
5831 		return (DDI_FAILURE);
5832 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS)
5833 		return (DDI_FAILURE);
5834 
5835 	/* enable busmater and io space access */
5836 	csr = pci_config_get32(handle, PCI_CONF_COMM);
5837 	pci_config_put32(handle, PCI_CONF_COMM,
5838 	    (csr |PCI_COMM_ME|PCI_COMM_MAE));
5839 	status = myri10ge_read_pcie_link_width(handle, &link_width);
5840 	if (status != 0) {
5841 		cmn_err(CE_WARN, "could not read link width!\n");
5842 		link_width = 0;
5843 	}
5844 	max_read_request_4k = !myri10ge_set_max_readreq(handle);
5845 	status = myri10ge_find_cap(handle, &vso, PCI_CAP_ID_VS);
5846 	if (status != 0)
5847 		goto abort_with_cfg_hdl;
5848 	if ((omacp = mac_alloc(MAC_VERSION)) == NULL)
5849 		goto abort_with_cfg_hdl;
5850 	/*
5851 	 * XXXX Hack: mac_register_t grows in newer kernels.  To be
5852 	 * able to write newer fields, such as m_margin, without
5853 	 * writing outside allocated memory, we allocate our own macp
5854 	 * and pass that to mac_register()
5855 	 */
5856 	macp = kmem_zalloc(sizeof (*macp) * 8, KM_SLEEP);
5857 	macp->m_version = omacp->m_version;
5858 
5859 	if ((mgp = (struct myri10ge_priv *)
5860 	    kmem_zalloc(sizeof (*mgp), KM_SLEEP)) == NULL) {
5861 		goto abort_with_macinfo;
5862 	}
5863 	ddi_set_driver_private(dip, mgp);
5864 
5865 	/* setup device name for log messages */
5866 	(void) sprintf(mgp->name, "myri10ge%d", ddi_get_instance(dip));
5867 
5868 	mutex_enter(&myri10ge_param_lock);
5869 	myri10ge_get_props(dip);
5870 	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
5871 	mgp->pause = myri10ge_flow_control;
5872 	mutex_exit(&myri10ge_param_lock);
5873 
5874 	mgp->max_read_request_4k = max_read_request_4k;
5875 	mgp->pcie_link_width = link_width;
5876 	mgp->running = MYRI10GE_ETH_STOPPED;
5877 	mgp->vso = vso;
5878 	mgp->dip = dip;
5879 	mgp->cfg_hdl = handle;
5880 
5881 	mgp->timer_ticks = 5 * drv_usectohz(1000000); /* 5 seconds */
5882 	myri10ge_test_physical(dip);
5883 
5884 	/* allocate command page */
5885 	bytes = sizeof (*mgp->cmd);
5886 	mgp->cmd = (mcp_cmd_response_t *)
5887 	    (void *)myri10ge_dma_alloc(dip, bytes,
5888 	    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5889 	    DDI_DMA_CONSISTENT,	DDI_DMA_RDWR|DDI_DMA_CONSISTENT,
5890 	    &mgp->cmd_dma, 1, DDI_DMA_DONTWAIT);
5891 	if (mgp->cmd == NULL)
5892 		goto abort_with_mgp;
5893 
5894 	(void) myri10ge_reg_set(dip, &mgp->reg_set, &span, &bus_number,
5895 	    &dev_number, &func_number);
5896 	if (myri10ge_verbose)
5897 		printf("%s at %ld:%ld:%ld attaching\n", mgp->name,
5898 		    bus_number, dev_number, func_number);
5899 	status = ddi_regs_map_setup(dip, mgp->reg_set, (caddr_t *)&mgp->sram,
5900 	    (offset_t)0, (offset_t)span,  &myri10ge_dev_access_attr,
5901 	    &mgp->io_handle);
5902 	if (status != DDI_SUCCESS) {
5903 		cmn_err(CE_WARN, "%s: couldn't map memory space", mgp->name);
5904 		printf("%s: reg_set = %d, span = %d, status = %d",
5905 		    mgp->name, mgp->reg_set, span, status);
5906 		goto abort_with_mgp;
5907 	}
5908 
5909 	hdr_offset = *(uint32_t *)(void*)(mgp->sram +  MCP_HEADER_PTR_OFFSET);
5910 	hdr_offset = ntohl(hdr_offset) & 0xffffc;
5911 	ss_offset = hdr_offset +
5912 	    offsetof(struct mcp_gen_header, string_specs);
5913 	mgp->sram_size = ntohl(*(uint32_t *)(void*)(mgp->sram + ss_offset));
5914 	myri10ge_pio_copy32(mgp->eeprom_strings,
5915 	    (uint32_t *)(void*)((char *)mgp->sram + mgp->sram_size),
5916 	    MYRI10GE_EEPROM_STRINGS_SIZE);
5917 	(void) memset(mgp->eeprom_strings +
5918 	    MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
5919 
5920 	status = myri10ge_read_mac_addr(mgp);
5921 	if (status) {
5922 		goto abort_with_mapped;
5923 	}
5924 
5925 	status = myri10ge_select_firmware(mgp);
5926 	if (status != 0) {
5927 		cmn_err(CE_WARN, "%s: failed to load firmware\n", mgp->name);
5928 		goto abort_with_mapped;
5929 	}
5930 
5931 	status = myri10ge_probe_slices(mgp);
5932 	if (status != 0) {
5933 		cmn_err(CE_WARN, "%s: failed to probe slices\n", mgp->name);
5934 		goto abort_with_dummy_rdma;
5935 	}
5936 
5937 	status = myri10ge_alloc_slices(mgp);
5938 	if (status != 0) {
5939 		cmn_err(CE_WARN, "%s: failed to alloc slices\n", mgp->name);
5940 		goto abort_with_dummy_rdma;
5941 	}
5942 
5943 	/* add the interrupt handler */
5944 	status = myri10ge_add_intrs(mgp, 1);
5945 	if (status != 0) {
5946 		cmn_err(CE_WARN, "%s: Failed to add interrupt\n",
5947 		    mgp->name);
5948 		goto abort_with_slices;
5949 	}
5950 
5951 	/* now that we have an iblock_cookie, init the mutexes */
5952 	mutex_init(&mgp->cmd_lock, NULL, MUTEX_DRIVER, mgp->icookie);
5953 	mutex_init(&mgp->intrlock, NULL, MUTEX_DRIVER, mgp->icookie);
5954 
5955 
5956 	status = myri10ge_nic_stat_init(mgp);
5957 	if (status != DDI_SUCCESS)
5958 		goto abort_with_interrupts;
5959 	status = myri10ge_info_init(mgp);
5960 	if (status != DDI_SUCCESS)
5961 		goto abort_with_stats;
5962 
5963 	/*
5964 	 *	Initialize  GLD state
5965 	 */
5966 
5967 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
5968 	macp->m_driver = mgp;
5969 	macp->m_dip = dip;
5970 	macp->m_src_addr = mgp->mac_addr;
5971 	macp->m_callbacks = &myri10ge_m_callbacks;
5972 	macp->m_min_sdu = 0;
5973 	macp->m_max_sdu = myri10ge_mtu -
5974 	    (sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ);
5975 #ifdef SOLARIS_S11
5976 	macp->m_margin = VLAN_TAGSZ;
5977 #endif
5978 	macp->m_v12n = MAC_VIRT_LEVEL1;
5979 	status = mac_register(macp, &mgp->mh);
5980 	if (status != 0) {
5981 		cmn_err(CE_WARN, "%s: mac_register failed with %d\n",
5982 		    mgp->name, status);
5983 		goto abort_with_info;
5984 	}
5985 	myri10ge_ndd_init(mgp);
5986 	if (myri10ge_verbose)
5987 		printf("%s: %s, tx bndry %d, fw %s\n", mgp->name,
5988 		    mgp->intr_type, mgp->tx_boundary, mgp->fw_name);
5989 	mutex_enter(&myri10ge_param_lock);
5990 	mgp->next = mgp_list;
5991 	mgp_list = mgp;
5992 	mutex_exit(&myri10ge_param_lock);
5993 	kmem_free(macp, sizeof (*macp) * 8);
5994 	mac_free(omacp);
5995 	return (DDI_SUCCESS);
5996 
5997 abort_with_info:
5998 	myri10ge_info_destroy(mgp);
5999 
6000 abort_with_stats:
6001 	myri10ge_nic_stat_destroy(mgp);
6002 
6003 abort_with_interrupts:
6004 	mutex_destroy(&mgp->cmd_lock);
6005 	mutex_destroy(&mgp->intrlock);
6006 	myri10ge_rem_intrs(mgp, 1);
6007 
6008 abort_with_slices:
6009 	myri10ge_free_slices(mgp);
6010 
6011 abort_with_dummy_rdma:
6012 	myri10ge_dummy_rdma(mgp, 0);
6013 
6014 abort_with_mapped:
6015 	ddi_regs_map_free(&mgp->io_handle);
6016 
6017 	myri10ge_dma_free(&mgp->cmd_dma);
6018 
6019 abort_with_mgp:
6020 	kmem_free(mgp, sizeof (*mgp));
6021 
6022 abort_with_macinfo:
6023 	kmem_free(macp, sizeof (*macp) * 8);
6024 	mac_free(omacp);
6025 
6026 abort_with_cfg_hdl:
6027 	pci_config_teardown(&handle);
6028 	return (DDI_FAILURE);
6029 
6030 }
6031 
6032 
6033 static int
6034 myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
6035 {
6036 	struct myri10ge_priv	*mgp, *tmp;
6037 	int 			status, i, jbufs_alloced;
6038 
6039 	if (cmd == DDI_SUSPEND) {
6040 		status = myri10ge_suspend(dip);
6041 		return (status);
6042 	}
6043 
6044 	if (cmd != DDI_DETACH) {
6045 		return (DDI_FAILURE);
6046 	}
6047 	/* Get the driver private (gld_mac_info_t) structure */
6048 	mgp = ddi_get_driver_private(dip);
6049 
6050 	mutex_enter(&mgp->intrlock);
6051 	jbufs_alloced = 0;
6052 	for (i = 0; i < mgp->num_slices; i++) {
6053 		myri10ge_remove_jbufs(&mgp->ss[i]);
6054 		jbufs_alloced += mgp->ss[i].jpool.num_alloc;
6055 	}
6056 	mutex_exit(&mgp->intrlock);
6057 	if (jbufs_alloced != 0) {
6058 		cmn_err(CE_NOTE, "%s: %d loaned rx buffers remain\n",
6059 		    mgp->name, jbufs_alloced);
6060 		return (DDI_FAILURE);
6061 	}
6062 
6063 	mutex_enter(&myri10ge_param_lock);
6064 	if (mgp->refcnt != 0) {
6065 		mutex_exit(&myri10ge_param_lock);
6066 		cmn_err(CE_NOTE, "%s: %d external refs remain\n",
6067 		    mgp->name, mgp->refcnt);
6068 		return (DDI_FAILURE);
6069 	}
6070 	mutex_exit(&myri10ge_param_lock);
6071 
6072 	status = mac_unregister(mgp->mh);
6073 	if (status != DDI_SUCCESS)
6074 		return (status);
6075 
6076 	myri10ge_ndd_fini(mgp);
6077 	myri10ge_dummy_rdma(mgp, 0);
6078 	myri10ge_nic_stat_destroy(mgp);
6079 	myri10ge_info_destroy(mgp);
6080 
6081 	mutex_destroy(&mgp->cmd_lock);
6082 	mutex_destroy(&mgp->intrlock);
6083 
6084 	myri10ge_rem_intrs(mgp, 1);
6085 
6086 	myri10ge_free_slices(mgp);
6087 	ddi_regs_map_free(&mgp->io_handle);
6088 	myri10ge_dma_free(&mgp->cmd_dma);
6089 	pci_config_teardown(&mgp->cfg_hdl);
6090 
6091 	mutex_enter(&myri10ge_param_lock);
6092 	if (mgp_list == mgp) {
6093 		mgp_list = mgp->next;
6094 	} else {
6095 		tmp = mgp_list;
6096 		while (tmp->next != mgp && tmp->next != NULL)
6097 			tmp = tmp->next;
6098 		if (tmp->next != NULL)
6099 			tmp->next = tmp->next->next;
6100 	}
6101 	kmem_free(mgp, sizeof (*mgp));
6102 	mutex_exit(&myri10ge_param_lock);
6103 	return (DDI_SUCCESS);
6104 }
6105 
6106 /*
6107  * Helper for quiesce entry point: Interrupt threads are not being
6108  * scheduled, so we must poll for the confirmation DMA to arrive in
6109  * the firmware stats block for slice 0.  We're essentially running
6110  * the guts of the interrupt handler, and just cherry picking the
6111  * confirmation that the NIC is queuesced (stats->link_down)
6112  */
6113 
6114 static int
6115 myri10ge_poll_down(struct myri10ge_priv *mgp)
6116 {
6117 	struct myri10ge_slice_state *ss = mgp->ss;
6118 	mcp_irq_data_t *stats = ss->fw_stats;
6119 	int valid;
6120 	int found_down = 0;
6121 
6122 
6123 	/* check for a pending IRQ */
6124 
6125 	if (! *((volatile uint8_t *)& stats->valid))
6126 		return (0);
6127 	valid = stats->valid;
6128 
6129 	/*
6130 	 * Make sure to tell the NIC to lower a legacy IRQ, else
6131 	 * it may have corrupt state after restarting
6132 	 */
6133 
6134 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
6135 		/* lower legacy IRQ  */
6136 		*mgp->irq_deassert = 0;
6137 		mb();
6138 		/* wait for irq conf DMA */
6139 		while (*((volatile uint8_t *)& stats->valid))
6140 			;
6141 	}
6142 	if (stats->stats_updated && stats->link_down)
6143 		found_down = 1;
6144 
6145 	if (valid & 0x1)
6146 		*ss->irq_claim = BE_32(3);
6147 	*(ss->irq_claim + 1) = BE_32(3);
6148 
6149 	return (found_down);
6150 }
6151 
6152 static int
6153 myri10ge_quiesce(dev_info_t *dip)
6154 {
6155 	struct myri10ge_priv *mgp;
6156 	myri10ge_cmd_t cmd;
6157 	int status, down, i;
6158 
6159 	mgp = ddi_get_driver_private(dip);
6160 	if (mgp == NULL)
6161 		return (DDI_FAILURE);
6162 
6163 	/* if devices was unplumbed, it is guaranteed to be quiescent */
6164 	if (mgp->running == MYRI10GE_ETH_STOPPED)
6165 		return (DDI_SUCCESS);
6166 
6167 	/* send a down CMD to queuesce NIC */
6168 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
6169 	if (status) {
6170 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
6171 		return (DDI_FAILURE);
6172 	}
6173 
6174 	for (i = 0; i < 20; i++) {
6175 		down = myri10ge_poll_down(mgp);
6176 		if (down)
6177 			break;
6178 		delay(drv_usectohz(100000));
6179 		mb();
6180 	}
6181 	if (down)
6182 		return (DDI_SUCCESS);
6183 	return (DDI_FAILURE);
6184 }
6185 
6186 /*
6187  * Distinguish between allocb'ed blocks, and gesballoc'ed attached
6188  * storage.
6189  */
6190 static void
6191 myri10ge_find_lastfree(void)
6192 {
6193 	mblk_t *mp = allocb(1024, 0);
6194 	dblk_t *dbp;
6195 
6196 	if (mp == NULL) {
6197 		cmn_err(CE_WARN, "myri10ge_find_lastfree failed\n");
6198 		return;
6199 	}
6200 	dbp = mp->b_datap;
6201 	myri10ge_db_lastfree = (void *)dbp->db_lastfree;
6202 }
6203 
6204 int
6205 _init(void)
6206 {
6207 	int i;
6208 
6209 	if (myri10ge_verbose)
6210 		cmn_err(CE_NOTE,
6211 		    "Myricom 10G driver (10GbE) version %s loading\n",
6212 		    MYRI10GE_VERSION_STR);
6213 	myri10ge_find_lastfree();
6214 	mac_init_ops(&myri10ge_ops, "myri10ge");
6215 	mutex_init(&myri10ge_param_lock, NULL, MUTEX_DEFAULT, NULL);
6216 	if ((i = mod_install(&modlinkage)) != 0) {
6217 		cmn_err(CE_WARN, "mod_install returned %d\n", i);
6218 		mac_fini_ops(&myri10ge_ops);
6219 		mutex_destroy(&myri10ge_param_lock);
6220 	}
6221 	return (i);
6222 }
6223 
6224 int
6225 _fini(void)
6226 {
6227 	int i;
6228 	i = mod_remove(&modlinkage);
6229 	if (i != 0) {
6230 		return (i);
6231 	}
6232 	mac_fini_ops(&myri10ge_ops);
6233 	mutex_destroy(&myri10ge_param_lock);
6234 	return (0);
6235 }
6236 
6237 int
6238 _info(struct modinfo *modinfop)
6239 {
6240 	return (mod_info(&modlinkage, modinfop));
6241 }
6242 
6243 
6244 /*
6245  *  This file uses MyriGE driver indentation.
6246  *
6247  * Local Variables:
6248  * c-file-style:"sun"
6249  * tab-width:8
6250  * End:
6251  */
6252