xref: /illumos-gate/usr/src/uts/common/io/myri10ge/drv/myri10ge.c (revision 04b6cca3fef9f6205a9aa479c48d196116193dd9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007-2009 Myricom, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef	lint
28 static const char __idstring[] =
29 	"@(#)$Id: myri10ge.c,v 1.186 2009-06-29 13:47:22 gallatin Exp $";
30 #endif
31 
32 #define	MXGEFW_NDIS
33 #include "myri10ge_var.h"
34 #include "rss_eth_z8e.h"
35 #include "rss_ethp_z8e.h"
36 #include "mcp_gen_header.h"
37 
38 #define	MYRI10GE_MAX_ETHER_MTU 9014
39 
40 #define	MYRI10GE_ETH_STOPPED 0
41 #define	MYRI10GE_ETH_STOPPING 1
42 #define	MYRI10GE_ETH_STARTING 2
43 #define	MYRI10GE_ETH_RUNNING 3
44 #define	MYRI10GE_ETH_OPEN_FAILED 4
45 #define	MYRI10GE_ETH_SUSPENDED_RUNNING 5
46 
/*
 * Driver tunables and file-scope state.
 *
 * Only a few of these are consumed in this part of the file:
 *  - myri10ge_small_bytes (+ MXGEFW_PAD) is the size of each small receive
 *    buffer carved out of a jumbo buffer.
 *  - myri10ge_verbose enables extra printf() output (e.g. the MAC address
 *    string found in the EEPROM).
 *  - myri10ge_tx_handles_initial is the number of transmit DMA handles
 *    pre-allocated when a transmit ring is prepared.
 * The remaining knobs are used elsewhere in the driver.
 */
static int myri10ge_small_bytes = 510;
static int myri10ge_intr_coal_delay = 125;
static int myri10ge_flow_control = 1;
/* only defined when building for x86 */
#if #cpu(i386) || defined __i386 || defined i386 ||	\
	defined __i386__ || #cpu(x86_64) || defined __x86_64__
static int myri10ge_nvidia_ecrc_enable = 1;
#endif
static int myri10ge_mtu_override = 0;
static int myri10ge_tx_copylen = 512;
static int myri10ge_deassert_wait = 1;
static int myri10ge_verbose = 0;
static int myri10ge_watchdog_reset = 0;
static int myri10ge_use_msix = 1;
static int myri10ge_max_slices = -1;
static int myri10ge_use_msi = 1;
int myri10ge_force_firmware = 0;
static boolean_t myri10ge_use_lso = B_TRUE;
static int myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
static int myri10ge_tx_hash = 1;
static int myri10ge_lro = 1;
static int myri10ge_lro_cnt = 8;
int myri10ge_lro_max_aggr = 2;
static int myri10ge_lso_copy = 0;
/* forward declaration; the definition is later in the file */
static mblk_t *myri10ge_send_wrapper(void *arg, mblk_t *mp);
int myri10ge_tx_handles_initial = 128;

/* NOTE(review): presumably serializes access to the tunables -- confirm */
static 	kmutex_t myri10ge_param_lock;
static void* myri10ge_db_lastfree;
75 
/*
 * Forward declarations of the attach, detach and quiesce entry points,
 * which are installed in the myri10ge_ops dev_ops table defined below.
 */
static int myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int myri10ge_quiesce(dev_info_t *dip);

DDI_DEFINE_STREAM_OPS(myri10ge_ops, nulldev, nulldev, myri10ge_attach,
    myri10ge_detach, nodev, NULL, D_MP, NULL, myri10ge_quiesce);
82 
83 
/* Loadable-module description: a device driver using myri10ge_ops. */
static struct modldrv modldrv = {
	&mod_driverops,
	"Myricom 10G driver (10GbE)",
	&myri10ge_ops,
};


/* Module linkage listing the single modldrv above. */
static struct modlinkage modlinkage = {
	MODREV_1,
	{&modldrv, NULL},
};

/* The all-ones Ethernet broadcast address. */
unsigned char myri10ge_broadcastaddr[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
97 
/*
 * DMA attributes requesting 4096-byte alignment and a scatter/gather
 * list of length one (a single cookie).  Not referenced in this part of
 * the file; presumably used for miscellaneous control-structure
 * allocations -- TODO confirm against the callers.
 */
static ddi_dma_attr_t myri10ge_misc_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	(uint64_t)0x7fffffff,		/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
112 
113 /*
114  * The Myri10GE NIC has the following constraints on receive buffers:
115  * 1) Buffers which cross a 4KB boundary must be aligned to 4KB
116  * 2) Buffers which are not aligned to 4KB must not cross a 4KB boundary
117  */
118 
/*
 * DMA attributes for 4KB-aligned, single-cookie buffers.  Used by
 * myri10ge_add_jbuf() when myri10ge_mtu is 2048 or more, and by
 * myri10ge_prepare_tx_ring() for the 4096-byte transmit copyblocks.
 */
static ddi_dma_attr_t myri10ge_rx_jumbo_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)4096,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
133 
/*
 * DMA attributes used by myri10ge_add_jbuf() when myri10ge_mtu is below
 * 2048.  On non-SPARC builds the buffers start out 128-byte aligned with
 * a 0xfff segment limit, so that more than one can share a 4KB page.
 *
 * This structure is deliberately not const: myri10ge_add_jbuf() rewrites
 * dma_attr_align to 4096 and dma_attr_seg to UINT64_MAX at run time if a
 * buffer turns out to cross a 4KB boundary or an allocation fails.
 */
static ddi_dma_attr_t myri10ge_rx_std_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
#if defined sparc64 || defined __sparcv9
	(uint64_t)4096,			/* alignment */
#else
	(uint64_t)0x80,			/* alignment */
#endif
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
#if defined sparc64 || defined __sparcv9
	UINT64_MAX,			/* maximum segment size */
#else
	(uint64_t)0xfff,		/* maximum segment size */
#endif
	1,				/* scatter/gather list length */
	1,				/* granularity */
	0				/* attribute flags */
};
156 
/*
 * DMA attributes for transmit DMA handles (see myri10ge_add_tx_handle()):
 * byte alignment and a scatter/gather list of up to INT32_MAX entries.
 */
static ddi_dma_attr_t myri10ge_tx_dma_attr = {
	DMA_ATTR_V0,			/* version number. */
	(uint64_t)0, 			/* low address */
	(uint64_t)0xffffffffffffffffULL, /* high address */
	(uint64_t)0x7ffffff,		/* address counter max */
	(uint64_t)1,			/* alignment */
	(uint_t)0x7f,			/* burstsizes for 32b and 64b xfers */
	(uint32_t)0x1,			/* minimum transfer size */
	(uint64_t)0x7fffffff,		/* maximum transfer size */
	UINT64_MAX,			/* maximum segment size */
	INT32_MAX,			/* scatter/gather list length */
	1,				/* granularity */
	0			/* attribute flags */
};
171 
/*
 * WC selects the data-ordering attribute used for device access below:
 * merging is allowed everywhere except on SPARC, where strict ordering
 * is requested instead.
 */
#if defined sparc64 || defined __sparcv9
#define	WC 0
#else
#define	WC 1
#endif

/* Device access attributes used for the driver's DMA memory allocations. */
struct ddi_device_acc_attr myri10ge_dev_access_attr = {
	DDI_DEVICE_ATTR_V0,		/* version */
	DDI_NEVERSWAP_ACC,		/* endian flags */
#if WC
	DDI_MERGING_OK_ACC		/* data order */
#else
	DDI_STRICTORDER_ACC
#endif
};
187 
/* forward declaration; the definition is later in the file */
static void myri10ge_watchdog(void *arg);

/*
 * myri10ge_mtu is the size in bytes of every buffer in the jumbo pool
 * (see myri10ge_add_jbuf()).  It includes the firmware pad and room for
 * a VLAN tag; MYRICOM_PRIV builds default to the jumbo-frame size.
 */
#ifdef MYRICOM_PRIV
int myri10ge_mtu = MYRI10GE_MAX_ETHER_MTU + MXGEFW_PAD + VLAN_TAGSZ;
#else
int myri10ge_mtu = ETHERMAX + MXGEFW_PAD + VLAN_TAGSZ;
#endif
/* NOTE(review): presumably initial/maximum jumbo pool sizes -- confirm */
int myri10ge_bigbufs_initial = 1024;
int myri10ge_bigbufs_max = 4096;
197 
198 
199 caddr_t
200 myri10ge_dma_alloc(dev_info_t *dip, size_t len,
201     ddi_dma_attr_t *attr, ddi_device_acc_attr_t  *accattr,
202     uint_t alloc_flags, int bind_flags, struct myri10ge_dma_stuff *dma,
203     int warn, int (*wait)(caddr_t))
204 {
205 	caddr_t  kaddr;
206 	size_t real_length;
207 	ddi_dma_cookie_t cookie;
208 	uint_t count;
209 	int err;
210 
211 	err = ddi_dma_alloc_handle(dip, attr, wait,
212 	    NULL, &dma->handle);
213 	if (err != DDI_SUCCESS) {
214 		if (warn)
215 			cmn_err(CE_WARN,
216 			    "myri10ge: ddi_dma_alloc_handle failed\n");
217 		goto abort_with_nothing;
218 	}
219 
220 	err = ddi_dma_mem_alloc(dma->handle, len, accattr, alloc_flags,
221 	    wait, NULL, &kaddr, &real_length,
222 	    &dma->acc_handle);
223 	if (err != DDI_SUCCESS) {
224 		if (warn)
225 			cmn_err(CE_WARN,
226 			    "myri10ge: ddi_dma_mem_alloc failed\n");
227 		goto abort_with_handle;
228 	}
229 
230 	err = ddi_dma_addr_bind_handle(dma->handle, NULL, kaddr, len,
231 	    bind_flags, wait, NULL, &cookie, &count);
232 
233 	if (err != DDI_SUCCESS) {
234 		if (warn)
235 			cmn_err(CE_WARN,
236 			    "myri10ge: ddi_dma_addr_bind_handle failed\n");
237 		goto abort_with_mem;
238 	}
239 
240 	if (count != 1) {
241 		if (warn)
242 			cmn_err(CE_WARN,
243 			    "myri10ge: got too many dma segments ");
244 		goto abort_with_bind;
245 	}
246 	dma->low = htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
247 	dma->high = htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
248 	return (kaddr);
249 
250 abort_with_bind:
251 	(void) ddi_dma_unbind_handle(dma->handle);
252 
253 abort_with_mem:
254 	ddi_dma_mem_free(&dma->acc_handle);
255 
256 abort_with_handle:
257 	ddi_dma_free_handle(&dma->handle);
258 abort_with_nothing:
259 	if (warn) {
260 		cmn_err(CE_WARN, "myri10ge: myri10ge_dma_alloc failed.\n  ");
261 		cmn_err(CE_WARN, "args: dip=%p len=0x%lx ddi_dma_attr=%p\n",
262 		    (void*) dip, len, (void*) attr);
263 		cmn_err(CE_WARN,
264 		    "args: ddi_device_acc_attr=%p  alloc_flags=0x%x\n",
265 		    (void*) accattr, alloc_flags);
266 		cmn_err(CE_WARN, "args: bind_flags=0x%x  dmastuff=%p",
267 		    bind_flags, (void*) dma);
268 	}
269 	return (NULL);
270 
271 }
272 
/*
 * Release a region obtained from myri10ge_dma_alloc(): unbind the DMA
 * handle, free the memory, then free the handle -- the reverse of the
 * order in which they were acquired.
 */
void
myri10ge_dma_free(struct myri10ge_dma_stuff *dma)
{
	(void) ddi_dma_unbind_handle(dma->handle);
	ddi_dma_mem_free(&dma->acc_handle);
	ddi_dma_free_handle(&dma->handle);
}
280 
/*
 * Copy "size" bytes from "from32" to "to" using one volatile 32-bit store
 * per word, in ascending address order.  Any trailing bytes beyond a
 * multiple of four are not copied.
 */
static inline void
myri10ge_pio_copy32(void *to, uint32_t *from32, size_t size)
{
	volatile uint32_t *dst = (volatile uint32_t *)to;
	size_t words = size / 4;

	while (words != 0) {
		*dst++ = *from32++;
		words--;
	}
}
294 
#if defined(_LP64)
/*
 * Copy "size" bytes from "from64" to "to" using one volatile 64-bit store
 * per word, in ascending address order.  Any trailing bytes beyond a
 * multiple of eight are not copied.  Only built for LP64.
 */
static inline void
myri10ge_pio_copy64(void *to, uint64_t *from64, size_t size)
{
	volatile uint64_t *dst = (volatile uint64_t *)to;
	size_t words = size / 8;

	while (words != 0) {
		*dst++ = *from64++;
		words--;
	}
}
#endif
310 
/*
 * Copy memory from the host to the NIC one machine word at a time:
 * 64-bit stores on LP64 kernels, 32-bit stores otherwise.
 *
 * "size" must be a multiple of that word size (asserted on DEBUG
 * kernels), and "to" and "from" must be naturally aligned.
 */
static inline void
myri10ge_pio_copy(void *to, void *from, size_t size)
{
#if defined(_LP64)
	ASSERT((size & 7) == 0);
	myri10ge_pio_copy64(to, (uint64_t *)from, size);
#else
	ASSERT((size & 3) == 0);
	myri10ge_pio_copy32(to, (uint32_t *)from, size);
#endif
}
328 
329 
/*
 * Due to various bugs in Solaris (especially bug 6186772 where the
 * TCP/UDP checksum is calculated incorrectly on mblk chains with more
 * than two elements), and the design bug where hardware checksums are
 * ignored on mblk chains with more than 2 elements, we need to
 * allocate a private pool of physically contiguous receive buffers.
 */
337 
338 static void
339 myri10ge_jpool_init(struct myri10ge_slice_state *ss)
340 {
341 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
342 
343 	bzero(jpool, sizeof (*jpool));
344 	mutex_init(&jpool->mtx, NULL, MUTEX_DRIVER,
345 	    ss->mgp->icookie);
346 	jpool->head = NULL;
347 }
348 
349 static void
350 myri10ge_jpool_fini(struct myri10ge_slice_state *ss)
351 {
352 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
353 
354 	if (jpool->head != NULL) {
355 		cmn_err(CE_WARN,
356 		    "%s: BUG! myri10ge_jpool_fini called on non-empty pool\n",
357 		    ss->mgp->name);
358 	}
359 	mutex_destroy(&jpool->mtx);
360 }
361 
362 
363 /*
364  * copy an array of mcp_kreq_ether_recv_t's to the mcp.  Copy
365  * at most 32 bytes at a time, so as to avoid involving the software
366  * pio handler in the nic.   We re-write the first segment's low
367  * DMA address to mark it valid only after we write the entire chunk
368  * in a burst
369  */
370 static inline void
371 myri10ge_submit_8rx(mcp_kreq_ether_recv_t *dst, mcp_kreq_ether_recv_t *src)
372 {
373 	src->addr_low |= BE_32(1);
374 	myri10ge_pio_copy(dst, src, 4 * sizeof (*src));
375 	mb();
376 	myri10ge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
377 	mb();
378 	src->addr_low &= ~(BE_32(1));
379 	dst->addr_low = src->addr_low;
380 	mb();
381 }
382 
383 static void
384 myri10ge_pull_jpool(struct myri10ge_slice_state *ss)
385 {
386 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
387 	struct myri10ge_jpool_entry *jtail, *j, *jfree;
388 	volatile uintptr_t *putp;
389 	uintptr_t put;
390 	int i;
391 
392 	/* find tail */
393 	jtail = NULL;
394 	if (jpool->head != NULL) {
395 		j = jpool->head;
396 		while (j->next != NULL)
397 			j = j->next;
398 		jtail = j;
399 	}
400 
401 	/*
402 	 * iterate over all per-CPU caches, and add contents into
403 	 * jpool
404 	 */
405 	for (i = 0; i < MYRI10GE_MAX_CPUS; i++) {
406 		/* take per-CPU free list */
407 		putp = (void *)&jpool->cpu[i & MYRI10GE_MAX_CPU_MASK].head;
408 		if (*putp == NULL)
409 			continue;
410 		put = atomic_swap_ulong(putp, 0);
411 		jfree = (struct myri10ge_jpool_entry *)put;
412 
413 		/* append to pool */
414 		if (jtail == NULL) {
415 			jpool->head = jfree;
416 		} else {
417 			jtail->next = jfree;
418 		}
419 		j = jfree;
420 		while (j->next != NULL)
421 			j = j->next;
422 		jtail = j;
423 	}
424 }
425 
426 /*
427  * Transfers buffers from the free pool to the nic
428  * Must be called holding the jpool mutex.
429  */
430 
431 static inline void
432 myri10ge_restock_jumbos(struct myri10ge_slice_state *ss)
433 {
434 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
435 	struct myri10ge_jpool_entry *j;
436 	myri10ge_rx_ring_t *rx;
437 	int i, idx, limit;
438 
439 	rx = &ss->rx_big;
440 	limit = ss->j_rx_cnt + (rx->mask + 1);
441 
442 	for (i = rx->cnt; i != limit; i++) {
443 		idx = i & (rx->mask);
444 		j = jpool->head;
445 		if (j == NULL) {
446 			myri10ge_pull_jpool(ss);
447 			j = jpool->head;
448 			if (j == NULL) {
449 				break;
450 			}
451 		}
452 		jpool->head = j->next;
453 		rx->info[idx].j = j;
454 		rx->shadow[idx].addr_low = j->dma.low;
455 		rx->shadow[idx].addr_high = j->dma.high;
456 		/* copy 4 descriptors (32-bytes) to the mcp at a time */
457 		if ((idx & 7) == 7) {
458 			myri10ge_submit_8rx(&rx->lanai[idx - 7],
459 			    &rx->shadow[idx - 7]);
460 		}
461 	}
462 	rx->cnt = i;
463 }
464 
465 /*
466  * Transfer buffers from the nic to the free pool.
467  * Should be called holding the jpool mutex
468  */
469 
470 static inline void
471 myri10ge_unstock_jumbos(struct myri10ge_slice_state *ss)
472 {
473 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
474 	struct myri10ge_jpool_entry *j;
475 	myri10ge_rx_ring_t *rx;
476 	int i;
477 
478 	mutex_enter(&jpool->mtx);
479 	rx = &ss->rx_big;
480 
481 	for (i = 0; i < rx->mask + 1; i++) {
482 		j = rx->info[i].j;
483 		rx->info[i].j = NULL;
484 		if (j == NULL)
485 			continue;
486 		j->next = jpool->head;
487 		jpool->head = j;
488 	}
489 	mutex_exit(&jpool->mtx);
490 
491 }
492 
493 
494 /*
495  * Free routine which is called when the mblk allocated via
496  * esballoc() is freed.   Here we return the jumbo buffer
497  * to the free pool, and possibly pass some jumbo buffers
498  * to the nic
499  */
500 
501 static void
502 myri10ge_jfree_rtn(void *arg)
503 {
504 	struct myri10ge_jpool_entry *j = (struct myri10ge_jpool_entry *)arg;
505 	struct myri10ge_jpool_stuff *jpool;
506 	volatile uintptr_t *putp;
507 	uintptr_t old, new;
508 
509 	jpool = &j->ss->jpool;
510 
511 	/* prepend buffer locklessly to per-CPU freelist */
512 	putp = (void *)&jpool->cpu[CPU->cpu_seqid & MYRI10GE_MAX_CPU_MASK].head;
513 	new = (uintptr_t)j;
514 	do {
515 		old = *putp;
516 		j->next = (void *)old;
517 	} while (atomic_cas_ulong(putp, old, new) != old);
518 }
519 
/*
 * Destroy one jumbo buffer: unbind and free its DMA memory and handle,
 * then free the pool entry itself.  The entry must already have been
 * taken off any list, since it is not unlinked here.
 */
static void
myri10ge_remove_jbuf(struct myri10ge_jpool_entry *j)
{
	(void) ddi_dma_unbind_handle(j->dma_handle);
	ddi_dma_mem_free(&j->acc_handle);
	ddi_dma_free_handle(&j->dma_handle);
	kmem_free(j, sizeof (*j));
}
528 
529 
530 /*
531  * Allocates one physically contiguous descriptor
532  * and add it to the jumbo buffer pool.
533  */
534 
535 static int
536 myri10ge_add_jbuf(struct myri10ge_slice_state *ss)
537 {
538 	struct myri10ge_jpool_entry *j;
539 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
540 	ddi_dma_attr_t *rx_dma_attr;
541 	size_t real_length;
542 	ddi_dma_cookie_t cookie;
543 	uint_t count;
544 	int err;
545 
546 	if (myri10ge_mtu < 2048)
547 		rx_dma_attr = &myri10ge_rx_std_dma_attr;
548 	else
549 		rx_dma_attr = &myri10ge_rx_jumbo_dma_attr;
550 
551 again:
552 	j = (struct myri10ge_jpool_entry *)
553 	    kmem_alloc(sizeof (*j), KM_SLEEP);
554 	err = ddi_dma_alloc_handle(ss->mgp->dip, rx_dma_attr,
555 	    DDI_DMA_DONTWAIT, NULL, &j->dma_handle);
556 	if (err != DDI_SUCCESS)
557 		goto abort_with_j;
558 
559 	err = ddi_dma_mem_alloc(j->dma_handle, myri10ge_mtu,
560 	    &myri10ge_dev_access_attr,  DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
561 	    NULL, &j->buf, &real_length, &j->acc_handle);
562 	if (err != DDI_SUCCESS)
563 		goto abort_with_handle;
564 
565 	err = ddi_dma_addr_bind_handle(j->dma_handle, NULL, j->buf,
566 	    real_length, DDI_DMA_READ|DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
567 	    NULL, &cookie, &count);
568 	if (err != DDI_SUCCESS)
569 		goto abort_with_mem;
570 
571 	/*
572 	 * Make certain std MTU buffers do not cross a 4KB boundary:
573 	 *
574 	 * Setting dma_attr_align=4096 will do this, but the system
575 	 * will only allocate 1 RX buffer per 4KB page, rather than 2.
576 	 * Setting dma_attr_granular=4096 *seems* to work around this,
577 	 * but I'm paranoid about future systems no longer honoring
578 	 * this, so fall back to the safe, but memory wasting way if a
579 	 * buffer crosses a 4KB boundary.
580 	 */
581 
582 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
583 	    rx_dma_attr->dma_attr_align != 4096) {
584 		uint32_t start, end;
585 
586 		start = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
587 		end = start + myri10ge_mtu;
588 		if (((end >> 12) != (start >> 12)) && (start & 4095U)) {
589 			printf("std buffer crossed a 4KB boundary!\n");
590 			myri10ge_remove_jbuf(j);
591 			rx_dma_attr->dma_attr_align = 4096;
592 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
593 			goto again;
594 		}
595 	}
596 
597 	j->dma.low =
598 	    htonl(MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress));
599 	j->dma.high =
600 	    htonl(MYRI10GE_HIGHPART_TO_U32(cookie.dmac_laddress));
601 	j->ss = ss;
602 
603 
604 	j->free_func.free_func = myri10ge_jfree_rtn;
605 	j->free_func.free_arg = (char *)j;
606 	mutex_enter(&jpool->mtx);
607 	j->next = jpool->head;
608 	jpool->head = j;
609 	jpool->num_alloc++;
610 	mutex_exit(&jpool->mtx);
611 	return (0);
612 
613 abort_with_mem:
614 	ddi_dma_mem_free(&j->acc_handle);
615 
616 abort_with_handle:
617 	ddi_dma_free_handle(&j->dma_handle);
618 
619 abort_with_j:
620 	kmem_free(j, sizeof (*j));
621 
622 	/*
623 	 * If an allocation failed, perhaps it failed because it could
624 	 * not satisfy granularity requirement.  Disable that, and
625 	 * try agin.
626 	 */
627 	if (rx_dma_attr == &myri10ge_rx_std_dma_attr &&
628 	    rx_dma_attr->dma_attr_align != 4096) {
629 			cmn_err(CE_NOTE,
630 			    "!alloc failed, reverting to gran=1\n");
631 			rx_dma_attr->dma_attr_align = 4096;
632 			rx_dma_attr->dma_attr_seg = UINT64_MAX;
633 			goto again;
634 	}
635 	return (err);
636 }
637 
638 static int
639 myri10ge_jfree_cnt(struct myri10ge_jpool_stuff *jpool)
640 {
641 	int i;
642 	struct myri10ge_jpool_entry *j;
643 
644 	mutex_enter(&jpool->mtx);
645 	j = jpool->head;
646 	i = 0;
647 	while (j != NULL) {
648 		i++;
649 		j = j->next;
650 	}
651 	mutex_exit(&jpool->mtx);
652 	return (i);
653 }
654 
655 static int
656 myri10ge_add_jbufs(struct myri10ge_slice_state *ss, int num, int total)
657 {
658 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
659 	int allocated = 0;
660 	int err;
661 	int needed;
662 
663 	/*
664 	 * if total is set, user wants "num" jbufs in the pool,
665 	 * otherwise the user wants to "num" additional jbufs
666 	 * added to the pool
667 	 */
668 	if (total && jpool->num_alloc) {
669 		allocated = myri10ge_jfree_cnt(jpool);
670 		needed = num - allocated;
671 	} else {
672 		needed = num;
673 	}
674 
675 	while (needed > 0) {
676 		needed--;
677 		err = myri10ge_add_jbuf(ss);
678 		if (err == 0) {
679 			allocated++;
680 		}
681 	}
682 	return (allocated);
683 }
684 
685 static void
686 myri10ge_remove_jbufs(struct myri10ge_slice_state *ss)
687 {
688 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
689 	struct myri10ge_jpool_entry *j;
690 
691 	mutex_enter(&jpool->mtx);
692 	myri10ge_pull_jpool(ss);
693 	while (jpool->head != NULL) {
694 		jpool->num_alloc--;
695 		j = jpool->head;
696 		jpool->head = j->next;
697 		myri10ge_remove_jbuf(j);
698 	}
699 	mutex_exit(&jpool->mtx);
700 }
701 
702 static void
703 myri10ge_carve_up_jbufs_into_small_ring(struct myri10ge_slice_state *ss)
704 {
705 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
706 	struct myri10ge_jpool_entry *j = NULL;
707 	caddr_t ptr;
708 	uint32_t dma_low, dma_high;
709 	int idx, len;
710 	unsigned int alloc_size;
711 
712 	dma_low = dma_high = len = 0;
713 	alloc_size = myri10ge_small_bytes + MXGEFW_PAD;
714 	ptr = NULL;
715 	for (idx = 0; idx < ss->rx_small.mask + 1; idx++) {
716 		/* Allocate a jumbo frame and carve it into small frames */
717 		if (len < alloc_size) {
718 			mutex_enter(&jpool->mtx);
719 			/* remove jumbo from freelist */
720 			j = jpool->head;
721 			jpool->head = j->next;
722 			/* place it onto small list */
723 			j->next = ss->small_jpool;
724 			ss->small_jpool = j;
725 			mutex_exit(&jpool->mtx);
726 			len = myri10ge_mtu;
727 			dma_low = ntohl(j->dma.low);
728 			dma_high = ntohl(j->dma.high);
729 			ptr = j->buf;
730 		}
731 		ss->rx_small.info[idx].ptr = ptr;
732 		ss->rx_small.shadow[idx].addr_low = htonl(dma_low);
733 		ss->rx_small.shadow[idx].addr_high = htonl(dma_high);
734 		len -= alloc_size;
735 		ptr += alloc_size;
736 		dma_low += alloc_size;
737 	}
738 }
739 
740 /*
741  * Return the jumbo bufs we carved up for small to the jumbo pool
742  */
743 
744 static void
745 myri10ge_release_small_jbufs(struct myri10ge_slice_state *ss)
746 {
747 	struct myri10ge_jpool_stuff *jpool = &ss->jpool;
748 	struct myri10ge_jpool_entry *j = NULL;
749 
750 	mutex_enter(&jpool->mtx);
751 	while (ss->small_jpool != NULL) {
752 		j = ss->small_jpool;
753 		ss->small_jpool = j->next;
754 		j->next = jpool->head;
755 		jpool->head = j;
756 	}
757 	mutex_exit(&jpool->mtx);
758 	ss->jbufs_for_smalls = 0;
759 }
760 
761 static int
762 myri10ge_add_tx_handle(struct myri10ge_slice_state *ss)
763 {
764 	myri10ge_tx_ring_t *tx = &ss->tx;
765 	struct myri10ge_priv *mgp = ss->mgp;
766 	struct myri10ge_tx_dma_handle *handle;
767 	int err;
768 
769 	handle = kmem_zalloc(sizeof (*handle), KM_SLEEP);
770 	err = ddi_dma_alloc_handle(mgp->dip,
771 	    &myri10ge_tx_dma_attr,
772 	    DDI_DMA_SLEEP, NULL,
773 	    &handle->h);
774 	if (err) {
775 		static int limit = 0;
776 		if (limit == 0)
777 			cmn_err(CE_WARN, "%s: Falled to alloc tx dma handle\n",
778 			    mgp->name);
779 		limit++;
780 		kmem_free(handle, sizeof (*handle));
781 		return (err);
782 	}
783 	mutex_enter(&tx->handle_lock);
784 	MYRI10GE_SLICE_STAT_INC(tx_handles_alloced);
785 	handle->next = tx->free_tx_handles;
786 	tx->free_tx_handles = handle;
787 	mutex_exit(&tx->handle_lock);
788 	return (DDI_SUCCESS);
789 }
790 
791 static void
792 myri10ge_remove_tx_handles(struct myri10ge_slice_state *ss)
793 {
794 	myri10ge_tx_ring_t *tx = &ss->tx;
795 	struct myri10ge_tx_dma_handle *handle;
796 	mutex_enter(&tx->handle_lock);
797 
798 	handle = tx->free_tx_handles;
799 	while (handle != NULL) {
800 		tx->free_tx_handles = handle->next;
801 		ddi_dma_free_handle(&handle->h);
802 		kmem_free(handle, sizeof (*handle));
803 		handle = tx->free_tx_handles;
804 		MYRI10GE_SLICE_STAT_DEC(tx_handles_alloced);
805 	}
806 	mutex_exit(&tx->handle_lock);
807 	if (MYRI10GE_SLICE_STAT(tx_handles_alloced) != 0) {
808 		cmn_err(CE_WARN, "%s: %d tx dma handles allocated at close\n",
809 		    ss->mgp->name,
810 		    (int)MYRI10GE_SLICE_STAT(tx_handles_alloced));
811 	}
812 }
813 
/*
 * Splice a chain of transmit DMA handles onto the front of the ring's
 * free list, under the handle lock.
 *
 * "list" must be non-empty and properly linked from list->head through
 * list->tail: list->tail is dereferenced unconditionally.
 */
static void
myri10ge_free_tx_handles(myri10ge_tx_ring_t *tx,
    struct myri10ge_tx_dma_handle_head *list)
{
	mutex_enter(&tx->handle_lock);
	/* old free list goes behind the chain, chain becomes the new head */
	list->tail->next = tx->free_tx_handles;
	tx->free_tx_handles = list->head;
	mutex_exit(&tx->handle_lock);
}
823 
824 static void
825 myri10ge_free_tx_handle_slist(myri10ge_tx_ring_t *tx,
826     struct myri10ge_tx_dma_handle *handle)
827 {
828 	struct myri10ge_tx_dma_handle_head list;
829 
830 	if (handle == NULL)
831 		return;
832 	list.head = handle;
833 	list.tail = handle;
834 	while (handle != NULL) {
835 		list.tail = handle;
836 		handle = handle->next;
837 	}
838 	myri10ge_free_tx_handles(tx, &list);
839 }
840 
841 static int
842 myri10ge_alloc_tx_handles(struct myri10ge_slice_state *ss, int count,
843     struct myri10ge_tx_dma_handle **ret)
844 {
845 	myri10ge_tx_ring_t *tx = &ss->tx;
846 	struct myri10ge_tx_dma_handle *handle;
847 	int err, i;
848 
849 	mutex_enter(&tx->handle_lock);
850 	for (i = 0; i < count; i++) {
851 		handle = tx->free_tx_handles;
852 		while (handle == NULL) {
853 			mutex_exit(&tx->handle_lock);
854 			err = myri10ge_add_tx_handle(ss);
855 			if (err != DDI_SUCCESS) {
856 				goto abort_with_handles;
857 			}
858 			mutex_enter(&tx->handle_lock);
859 			handle = tx->free_tx_handles;
860 		}
861 		tx->free_tx_handles = handle->next;
862 		handle->next = *ret;
863 		*ret = handle;
864 	}
865 	mutex_exit(&tx->handle_lock);
866 	return (DDI_SUCCESS);
867 
868 abort_with_handles:
869 	myri10ge_free_tx_handle_slist(tx, *ret);
870 	return (err);
871 }
872 
873 
874 /*
875  * Frees DMA resources associated with the send ring
876  */
877 static void
878 myri10ge_unprepare_tx_ring(struct myri10ge_slice_state *ss)
879 {
880 	myri10ge_tx_ring_t *tx;
881 	struct myri10ge_tx_dma_handle_head handles;
882 	size_t bytes;
883 	int idx;
884 
885 	tx = &ss->tx;
886 	handles.head = NULL;
887 	handles.tail = NULL;
888 	for (idx = 0; idx < ss->tx.mask + 1; idx++) {
889 		if (tx->info[idx].m) {
890 			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
891 			handles.head = tx->info[idx].handle;
892 			if (handles.tail == NULL)
893 				handles.tail = tx->info[idx].handle;
894 			freeb(tx->info[idx].m);
895 			tx->info[idx].m = 0;
896 			tx->info[idx].handle = 0;
897 		}
898 		tx->cp[idx].va = NULL;
899 		myri10ge_dma_free(&tx->cp[idx].dma);
900 	}
901 	bytes = sizeof (*tx->cp) * (tx->mask + 1);
902 	kmem_free(tx->cp, bytes);
903 	tx->cp = NULL;
904 	if (handles.head != NULL)
905 		myri10ge_free_tx_handles(tx, &handles);
906 	myri10ge_remove_tx_handles(ss);
907 }
908 
909 /*
910  * Allocates DMA handles associated with the send ring
911  */
912 static inline int
913 myri10ge_prepare_tx_ring(struct myri10ge_slice_state *ss)
914 {
915 	struct myri10ge_tx_dma_handle *handles;
916 	int h;
917 	size_t bytes;
918 
919 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
920 	ss->tx.cp = kmem_zalloc(bytes, KM_SLEEP);
921 	if (ss->tx.cp == NULL) {
922 		cmn_err(CE_WARN,
923 		    "%s: Failed to allocate tx copyblock storage\n",
924 		    ss->mgp->name);
925 		return (DDI_FAILURE);
926 	}
927 
928 
929 	/* allocate the TX copyblocks */
930 	for (h = 0; h < ss->tx.mask + 1; h++) {
931 		ss->tx.cp[h].va = myri10ge_dma_alloc(ss->mgp->dip,
932 		    4096, &myri10ge_rx_jumbo_dma_attr,
933 		    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
934 		    DDI_DMA_WRITE|DDI_DMA_STREAMING, &ss->tx.cp[h].dma, 1,
935 		    DDI_DMA_DONTWAIT);
936 		if (ss->tx.cp[h].va == NULL) {
937 			cmn_err(CE_WARN, "%s: Failed to allocate tx "
938 			    "copyblock %d\n", ss->mgp->name, h);
939 			goto abort_with_copyblocks;
940 		}
941 	}
942 	/* pre-allocate transmit handles */
943 	handles = NULL;
944 	(void) myri10ge_alloc_tx_handles(ss, myri10ge_tx_handles_initial,
945 	    &handles);
946 	if (handles != NULL)
947 		myri10ge_free_tx_handle_slist(&ss->tx, handles);
948 
949 	return (DDI_SUCCESS);
950 
951 abort_with_copyblocks:
952 	while (h > 0)  {
953 		h--;
954 		myri10ge_dma_free(&ss->tx.cp[h].dma);
955 	}
956 
957 	bytes = sizeof (*ss->tx.cp) * (ss->tx.mask + 1);
958 	kmem_free(ss->tx.cp, bytes);
959 	ss->tx.cp = NULL;
960 	return (DDI_FAILURE);
961 }
962 
963 /*
964  * The eeprom strings on the lanaiX have the format
965  * SN=x\0
966  * MAC=x:x:x:x:x:x\0
967  * PT:ddd mmm xx xx:xx:xx xx\0
968  * PV:ddd mmm xx xx:xx:xx xx\0
969  */
970 static int
971 myri10ge_read_mac_addr(struct myri10ge_priv *mgp)
972 {
973 #define	MYRI10GE_NEXT_STRING(p) while (ptr < limit && *ptr++)
974 #define	myri10ge_digit(c) (((c) >= '0' && (c) <= '9') ? ((c) - '0') :	\
975 		(((c) >= 'A' && (c) <= 'F') ? (10 + (c) - 'A') :	\
976 		(((c) >= 'a' && (c) <= 'f') ? (10 + (c) - 'a') : -1)))
977 
978 	char *ptr, *limit;
979 	int i, hv, lv;
980 
981 	ptr = mgp->eeprom_strings;
982 	limit = mgp->eeprom_strings + MYRI10GE_EEPROM_STRINGS_SIZE;
983 
984 	while (*ptr != '\0' && ptr < limit) {
985 		if (memcmp(ptr, "MAC=", 4) == 0) {
986 			ptr += 4;
987 			if (myri10ge_verbose)
988 				printf("%s: mac address = %s\n", mgp->name,
989 				    ptr);
990 			mgp->mac_addr_string = ptr;
991 			for (i = 0; i < 6; i++) {
992 				if ((ptr + 2) > limit)
993 					goto abort;
994 
995 				if (*(ptr+1) == ':') {
996 					hv = 0;
997 					lv = myri10ge_digit(*ptr); ptr++;
998 				} else {
999 					hv = myri10ge_digit(*ptr); ptr++;
1000 					lv = myri10ge_digit(*ptr); ptr++;
1001 				}
1002 				mgp->mac_addr[i] = (hv << 4) | lv;
1003 				ptr++;
1004 			}
1005 		}
1006 		if (memcmp((const void *)ptr, "SN=", 3) == 0) {
1007 			ptr += 3;
1008 			mgp->sn_str = (char *)ptr;
1009 		}
1010 		if (memcmp((const void *)ptr, "PC=", 3) == 0) {
1011 			ptr += 3;
1012 			mgp->pc_str = (char *)ptr;
1013 		}
1014 		MYRI10GE_NEXT_STRING(ptr);
1015 	}
1016 
1017 	return (0);
1018 
1019 abort:
1020 	cmn_err(CE_WARN, "%s: failed to parse eeprom_strings", mgp->name);
1021 	return (ENXIO);
1022 }
1023 
1024 
1025 /*
1026  * Determine the register set containing the PCI resource we
1027  * want to map: the memory-mappable part of the interface. We do
1028  * this by scanning the DDI "reg" property of the interface,
1029  * which is an array of mx_ddi_reg_set structures.
1030  */
1031 static int
1032 myri10ge_reg_set(dev_info_t *dip, int *reg_set, int *span,
1033     unsigned long *busno, unsigned long *devno,
1034     unsigned long *funcno)
1035 {
1036 
1037 #define	REGISTER_NUMBER(ip)	(ip[0] >>  0 & 0xff)
1038 #define	FUNCTION_NUMBER(ip)	(ip[0] >>  8 & 0x07)
1039 #define	DEVICE_NUMBER(ip)	(ip[0] >> 11 & 0x1f)
1040 #define	BUS_NUMBER(ip)		(ip[0] >> 16 & 0xff)
1041 #define	ADDRESS_SPACE(ip)	(ip[0] >> 24 & 0x03)
1042 #define	PCI_ADDR_HIGH(ip)	(ip[1])
1043 #define	PCI_ADDR_LOW(ip) 	(ip[2])
1044 #define	PCI_SPAN_HIGH(ip)	(ip[3])
1045 #define	PCI_SPAN_LOW(ip)	(ip[4])
1046 
1047 #define	MX_DDI_REG_SET_32_BIT_MEMORY_SPACE 2
1048 #define	MX_DDI_REG_SET_64_BIT_MEMORY_SPACE 3
1049 
1050 	int *data, i, *rs;
1051 	uint32_t nelementsp;
1052 
1053 #ifdef MYRI10GE_REGSET_VERBOSE
1054 	char *address_space_name[] = { "Configuration Space",
1055 					"I/O Space",
1056 					"32-bit Memory Space",
1057 					"64-bit Memory Space"
1058 	};
1059 #endif
1060 
1061 	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1062 	    "reg", &data, &nelementsp) != DDI_SUCCESS) {
1063 		printf("Could not determine register set.\n");
1064 		return (ENXIO);
1065 	}
1066 
1067 #ifdef MYRI10GE_REGSET_VERBOSE
1068 	printf("There are %d register sets.\n", nelementsp / 5);
1069 #endif
1070 	if (!nelementsp) {
1071 		printf("Didn't find any \"reg\" properties.\n");
1072 		ddi_prop_free(data);
1073 		return (ENODEV);
1074 	}
1075 
1076 	/* Scan for the register number. */
1077 	rs = &data[0];
1078 	*busno = BUS_NUMBER(rs);
1079 	*devno = DEVICE_NUMBER(rs);
1080 	*funcno = FUNCTION_NUMBER(rs);
1081 
1082 #ifdef MYRI10GE_REGSET_VERBOSE
1083 	printf("*** Scanning for register number.\n");
1084 #endif
1085 	for (i = 0; i < nelementsp / 5; i++) {
1086 		rs = &data[5 * i];
1087 #ifdef MYRI10GE_REGSET_VERBOSE
1088 		printf("Examining register set %d:\n", i);
1089 		printf("  Register number = %d.\n", REGISTER_NUMBER(rs));
1090 		printf("  Function number = %d.\n", FUNCTION_NUMBER(rs));
1091 		printf("  Device number   = %d.\n", DEVICE_NUMBER(rs));
1092 		printf("  Bus number      = %d.\n", BUS_NUMBER(rs));
1093 		printf("  Address space   = %d (%s ).\n", ADDRESS_SPACE(rs),
1094 		    address_space_name[ADDRESS_SPACE(rs)]);
1095 		printf("  pci address 0x%08x %08x\n", PCI_ADDR_HIGH(rs),
1096 		    PCI_ADDR_LOW(rs));
1097 		printf("  pci span 0x%08x %08x\n", PCI_SPAN_HIGH(rs),
1098 		    PCI_SPAN_LOW(rs));
1099 #endif
1100 		/* We are looking for a memory property. */
1101 
1102 		if (ADDRESS_SPACE(rs) == MX_DDI_REG_SET_64_BIT_MEMORY_SPACE ||
1103 		    ADDRESS_SPACE(rs) == MX_DDI_REG_SET_32_BIT_MEMORY_SPACE) {
1104 			*reg_set = i;
1105 
1106 #ifdef MYRI10GE_REGSET_VERBOSE
1107 			printf("%s uses register set %d.\n",
1108 			    address_space_name[ADDRESS_SPACE(rs)], *reg_set);
1109 #endif
1110 
1111 			*span = (PCI_SPAN_LOW(rs));
1112 #ifdef MYRI10GE_REGSET_VERBOSE
1113 			printf("Board span is 0x%x\n", *span);
1114 #endif
1115 			break;
1116 		}
1117 	}
1118 
1119 	ddi_prop_free(data);
1120 
1121 	/* If no match, fail. */
1122 	if (i >= nelementsp / 5) {
1123 		return (EIO);
1124 	}
1125 
1126 	return (0);
1127 }
1128 
1129 
1130 static int
1131 myri10ge_load_firmware_from_zlib(struct myri10ge_priv *mgp, uint32_t *limit)
1132 {
1133 	void *inflate_buffer;
1134 	int rv, status;
1135 	size_t sram_size = mgp->sram_size - MYRI10GE_EEPROM_STRINGS_SIZE;
1136 	size_t destlen;
1137 	mcp_gen_header_t *hdr;
1138 	unsigned hdr_offset, i;
1139 
1140 
1141 	*limit = 0; /* -Wuninitialized */
1142 	status = 0;
1143 
1144 	inflate_buffer = kmem_zalloc(sram_size, KM_NOSLEEP);
1145 	if (!inflate_buffer) {
1146 		cmn_err(CE_WARN,
1147 		    "%s: Could not allocate buffer to inflate mcp\n",
1148 		    mgp->name);
1149 		return (ENOMEM);
1150 	}
1151 
1152 	destlen = sram_size;
1153 	rv = z_uncompress(inflate_buffer, &destlen, mgp->eth_z8e,
1154 	    mgp->eth_z8e_length);
1155 
1156 	if (rv != Z_OK) {
1157 		cmn_err(CE_WARN, "%s: Could not inflate mcp: %s\n",
1158 		    mgp->name, z_strerror(rv));
1159 		status = ENXIO;
1160 		goto abort;
1161 	}
1162 
1163 	*limit = (uint32_t)destlen;
1164 
1165 	hdr_offset = htonl(*(uint32_t *)(void *)((char *)inflate_buffer +
1166 	    MCP_HEADER_PTR_OFFSET));
1167 	hdr = (void *)((char *)inflate_buffer + hdr_offset);
1168 	if (ntohl(hdr->mcp_type) != MCP_TYPE_ETH) {
1169 		cmn_err(CE_WARN, "%s: Bad firmware type: 0x%x\n", mgp->name,
1170 		    ntohl(hdr->mcp_type));
1171 		status = EIO;
1172 		goto abort;
1173 	}
1174 
1175 	/* save firmware version for kstat */
1176 	(void) strncpy(mgp->fw_version, hdr->version, sizeof (mgp->fw_version));
1177 	if (myri10ge_verbose)
1178 		printf("%s: firmware id: %s\n", mgp->name, hdr->version);
1179 
1180 	/* Copy the inflated firmware to NIC SRAM. */
1181 	for (i = 0; i < *limit; i += 256) {
1182 		myri10ge_pio_copy((char *)mgp->sram + MYRI10GE_FW_OFFSET + i,
1183 		    (char *)inflate_buffer + i,
1184 		    min(256U, (unsigned)(*limit - i)));
1185 		mb();
1186 		(void) *(int *)(void *)mgp->sram;
1187 		mb();
1188 	}
1189 
1190 abort:
1191 	kmem_free(inflate_buffer, sram_size);
1192 
1193 	return (status);
1194 
1195 }
1196 
1197 
1198 int
1199 myri10ge_send_cmd(struct myri10ge_priv *mgp, uint32_t cmd,
1200 		myri10ge_cmd_t *data)
1201 {
1202 	mcp_cmd_t *buf;
1203 	char buf_bytes[sizeof (*buf) + 8];
1204 	volatile mcp_cmd_response_t *response = mgp->cmd;
1205 	volatile char *cmd_addr =
1206 	    (volatile char *)mgp->sram + MXGEFW_ETH_CMD;
1207 	int sleep_total = 0;
1208 
1209 	/* ensure buf is aligned to 8 bytes */
1210 	buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1211 
1212 	buf->data0 = htonl(data->data0);
1213 	buf->data1 = htonl(data->data1);
1214 	buf->data2 = htonl(data->data2);
1215 	buf->cmd = htonl(cmd);
1216 	buf->response_addr.low = mgp->cmd_dma.low;
1217 	buf->response_addr.high = mgp->cmd_dma.high;
1218 	mutex_enter(&mgp->cmd_lock);
1219 	response->result = 0xffffffff;
1220 	mb();
1221 
1222 	myri10ge_pio_copy((void *)cmd_addr, buf, sizeof (*buf));
1223 
1224 	/* wait up to 20ms */
1225 	for (sleep_total = 0; sleep_total < 20; sleep_total++) {
1226 		mb();
1227 		if (response->result != 0xffffffff) {
1228 			if (response->result == 0) {
1229 				data->data0 = ntohl(response->data);
1230 				mutex_exit(&mgp->cmd_lock);
1231 				return (0);
1232 			} else if (ntohl(response->result)
1233 			    == MXGEFW_CMD_UNKNOWN) {
1234 				mutex_exit(&mgp->cmd_lock);
1235 				return (ENOSYS);
1236 			} else if (ntohl(response->result)
1237 			    == MXGEFW_CMD_ERROR_UNALIGNED) {
1238 				mutex_exit(&mgp->cmd_lock);
1239 				return (E2BIG);
1240 			} else {
1241 				cmn_err(CE_WARN,
1242 				    "%s: command %d failed, result = %d\n",
1243 				    mgp->name, cmd, ntohl(response->result));
1244 				mutex_exit(&mgp->cmd_lock);
1245 				return (ENXIO);
1246 			}
1247 		}
1248 		drv_usecwait(1000);
1249 	}
1250 	mutex_exit(&mgp->cmd_lock);
1251 	cmn_err(CE_WARN, "%s: command %d timed out, result = %d\n",
1252 	    mgp->name, cmd, ntohl(response->result));
1253 	return (EAGAIN);
1254 }
1255 
1256 /*
1257  * Enable or disable periodic RDMAs from the host to make certain
1258  * chipsets resend dropped PCIe messages
1259  */
1260 
1261 static void
1262 myri10ge_dummy_rdma(struct myri10ge_priv *mgp, int enable)
1263 {
1264 	char buf_bytes[72];
1265 	volatile uint32_t *confirm;
1266 	volatile char *submit;
1267 	uint32_t *buf;
1268 	int i;
1269 
1270 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1271 
1272 	/* clear confirmation addr */
1273 	confirm = (volatile uint32_t *)mgp->cmd;
1274 	*confirm = 0;
1275 	mb();
1276 
1277 	/*
1278 	 * send an rdma command to the PCIe engine, and wait for the
1279 	 * response in the confirmation address.  The firmware should
1280 	 *  write a -1 there to indicate it is alive and well
1281 	 */
1282 
1283 	buf[0] = mgp->cmd_dma.high;		/* confirm addr MSW */
1284 	buf[1] = mgp->cmd_dma.low;		/* confirm addr LSW */
1285 	buf[2] = htonl(0xffffffff);		/* confirm data */
1286 	buf[3] = htonl(mgp->cmd_dma.high); 	/* dummy addr MSW */
1287 	buf[4] = htonl(mgp->cmd_dma.low); 	/* dummy addr LSW */
1288 	buf[5] = htonl(enable);			/* enable? */
1289 
1290 
1291 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_DUMMY_RDMA);
1292 
1293 	myri10ge_pio_copy((char *)submit, buf, 64);
1294 	mb();
1295 	drv_usecwait(1000);
1296 	mb();
1297 	i = 0;
1298 	while (*confirm != 0xffffffff && i < 20) {
1299 		drv_usecwait(1000);
1300 		i++;
1301 	}
1302 	if (*confirm != 0xffffffff) {
1303 		cmn_err(CE_WARN, "%s: dummy rdma %s failed (%p = 0x%x)",
1304 		    mgp->name,
1305 		    (enable ? "enable" : "disable"), (void*) confirm, *confirm);
1306 	}
1307 }
1308 
1309 static int
1310 myri10ge_load_firmware(struct myri10ge_priv *mgp)
1311 {
1312 	myri10ge_cmd_t cmd;
1313 	volatile uint32_t *confirm;
1314 	volatile char *submit;
1315 	char buf_bytes[72];
1316 	uint32_t *buf, size;
1317 	int status, i;
1318 
1319 	buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
1320 
1321 	status = myri10ge_load_firmware_from_zlib(mgp, &size);
1322 	if (status) {
1323 		cmn_err(CE_WARN, "%s: firmware loading failed\n", mgp->name);
1324 		return (status);
1325 	}
1326 
1327 	/* clear confirmation addr */
1328 	confirm = (volatile uint32_t *)mgp->cmd;
1329 	*confirm = 0;
1330 	mb();
1331 
1332 	/*
1333 	 * send a reload command to the bootstrap MCP, and wait for the
1334 	 * response in the confirmation address.  The firmware should
1335 	 * write a -1 there to indicate it is alive and well
1336 	 */
1337 
1338 	buf[0] = mgp->cmd_dma.high;	/* confirm addr MSW */
1339 	buf[1] = mgp->cmd_dma.low;	/* confirm addr LSW */
1340 	buf[2] = htonl(0xffffffff);	/* confirm data */
1341 
1342 	/*
1343 	 * FIX: All newest firmware should un-protect the bottom of
1344 	 * the sram before handoff. However, the very first interfaces
1345 	 * do not. Therefore the handoff copy must skip the first 8 bytes
1346 	 */
1347 	buf[3] = htonl(MYRI10GE_FW_OFFSET + 8); /* where the code starts */
1348 	buf[4] = htonl(size - 8); 	/* length of code */
1349 	buf[5] = htonl(8);		/* where to copy to */
1350 	buf[6] = htonl(0);		/* where to jump to */
1351 
1352 	submit = (volatile char *)(mgp->sram + MXGEFW_BOOT_HANDOFF);
1353 
1354 	myri10ge_pio_copy((char *)submit, buf, 64);
1355 	mb();
1356 	drv_usecwait(1000);
1357 	mb();
1358 	i = 0;
1359 	while (*confirm != 0xffffffff && i < 1000) {
1360 		drv_usecwait(1000);
1361 		i++;
1362 	}
1363 	if (*confirm != 0xffffffff) {
1364 		cmn_err(CE_WARN, "%s: handoff failed (%p = 0x%x)",
1365 		    mgp->name, (void *) confirm, *confirm);
1366 
1367 		return (ENXIO);
1368 	}
1369 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1370 	if (status != 0) {
1371 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_GET_RX_RING_SIZE\n",
1372 		    mgp->name);
1373 		return (ENXIO);
1374 	}
1375 
1376 	mgp->max_intr_slots = 2 * (cmd.data0 / sizeof (mcp_dma_addr_t));
1377 	myri10ge_dummy_rdma(mgp, 1);
1378 	return (0);
1379 }
1380 
1381 static int
1382 myri10ge_m_unicst(void *arg, const uint8_t *addr)
1383 {
1384 	struct myri10ge_priv *mgp = arg;
1385 	myri10ge_cmd_t cmd;
1386 	int status;
1387 
1388 	cmd.data0 = ((addr[0] << 24) | (addr[1] << 16)
1389 	    | (addr[2] << 8) | addr[3]);
1390 
1391 	cmd.data1 = ((addr[4] << 8) | (addr[5]));
1392 
1393 	status = myri10ge_send_cmd(mgp, MXGEFW_SET_MAC_ADDRESS, &cmd);
1394 	if (status == 0 && (addr != mgp->mac_addr))
1395 		(void) memcpy(mgp->mac_addr, addr, sizeof (mgp->mac_addr));
1396 
1397 	return (status);
1398 }
1399 
1400 static int
1401 myri10ge_change_pause(struct myri10ge_priv *mgp, int pause)
1402 {
1403 	myri10ge_cmd_t cmd;
1404 	int status;
1405 
1406 	if (pause)
1407 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_FLOW_CONTROL,
1408 		    &cmd);
1409 	else
1410 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_FLOW_CONTROL,
1411 		    &cmd);
1412 
1413 	if (status) {
1414 		cmn_err(CE_WARN, "%s: Failed to set flow control mode\n",
1415 		    mgp->name);
1416 		return (ENXIO);
1417 	}
1418 	mgp->pause = pause;
1419 	return (0);
1420 }
1421 
1422 static void
1423 myri10ge_change_promisc(struct myri10ge_priv *mgp, int promisc)
1424 {
1425 	myri10ge_cmd_t cmd;
1426 	int status;
1427 
1428 	if (promisc)
1429 		status = myri10ge_send_cmd(mgp, MXGEFW_ENABLE_PROMISC, &cmd);
1430 	else
1431 		status = myri10ge_send_cmd(mgp, MXGEFW_DISABLE_PROMISC, &cmd);
1432 
1433 	if (status) {
1434 		cmn_err(CE_WARN, "%s: Failed to set promisc mode\n",
1435 		    mgp->name);
1436 	}
1437 }
1438 
1439 static int
1440 myri10ge_dma_test(struct myri10ge_priv *mgp, int test_type)
1441 {
1442 	myri10ge_cmd_t cmd;
1443 	int status;
1444 	uint32_t len;
1445 	void *dmabench;
1446 	struct myri10ge_dma_stuff dmabench_dma;
1447 	char *test = " ";
1448 
1449 	/*
1450 	 * Run a small DMA test.
1451 	 * The magic multipliers to the length tell the firmware
1452 	 * tp do DMA read, write, or read+write tests.  The
1453 	 * results are returned in cmd.data0.  The upper 16
1454 	 * bits or the return is the number of transfers completed.
1455 	 * The lower 16 bits is the time in 0.5us ticks that the
1456 	 * transfers took to complete
1457 	 */
1458 
1459 	len = mgp->tx_boundary;
1460 
1461 	dmabench = myri10ge_dma_alloc(mgp->dip, len,
1462 	    &myri10ge_rx_jumbo_dma_attr, &myri10ge_dev_access_attr,
1463 	    DDI_DMA_STREAMING,  DDI_DMA_RDWR|DDI_DMA_STREAMING,
1464 	    &dmabench_dma, 1, DDI_DMA_DONTWAIT);
1465 	mgp->read_dma = mgp->write_dma = mgp->read_write_dma = 0;
1466 	if (dmabench == NULL) {
1467 		cmn_err(CE_WARN, "%s dma benchmark aborted\n", mgp->name);
1468 		return (ENOMEM);
1469 	}
1470 
1471 	cmd.data0 = ntohl(dmabench_dma.low);
1472 	cmd.data1 = ntohl(dmabench_dma.high);
1473 	cmd.data2 = len * 0x10000;
1474 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1475 	if (status != 0) {
1476 		test = "read";
1477 		goto abort;
1478 	}
1479 	mgp->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1480 
1481 	cmd.data0 = ntohl(dmabench_dma.low);
1482 	cmd.data1 = ntohl(dmabench_dma.high);
1483 	cmd.data2 = len * 0x1;
1484 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1485 	if (status != 0) {
1486 		test = "write";
1487 		goto abort;
1488 	}
1489 	mgp->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
1490 
1491 	cmd.data0 = ntohl(dmabench_dma.low);
1492 	cmd.data1 = ntohl(dmabench_dma.high);
1493 	cmd.data2 = len * 0x10001;
1494 	status = myri10ge_send_cmd(mgp, test_type, &cmd);
1495 	if (status != 0) {
1496 		test = "read/write";
1497 		goto abort;
1498 	}
1499 	mgp->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
1500 	    (cmd.data0 & 0xffff);
1501 
1502 
1503 abort:
1504 	myri10ge_dma_free(&dmabench_dma);
1505 	if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST)
1506 		cmn_err(CE_WARN, "%s %s dma benchmark failed\n", mgp->name,
1507 		    test);
1508 	return (status);
1509 }
1510 
1511 static int
1512 myri10ge_reset(struct myri10ge_priv *mgp)
1513 {
1514 	myri10ge_cmd_t cmd;
1515 	struct myri10ge_nic_stat *ethstat;
1516 	struct myri10ge_slice_state *ss;
1517 	int i, status;
1518 	size_t bytes;
1519 
1520 	/* send a reset command to the card to see if it is alive */
1521 	(void) memset(&cmd, 0, sizeof (cmd));
1522 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
1523 	if (status != 0) {
1524 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
1525 		return (ENXIO);
1526 	}
1527 
1528 	/* Now exchange information about interrupts  */
1529 
1530 	bytes = mgp->max_intr_slots * sizeof (*mgp->ss[0].rx_done.entry);
1531 	cmd.data0 = (uint32_t)bytes;
1532 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1533 
1534 	/*
1535 	 * Even though we already know how many slices are supported
1536 	 * via myri10ge_probe_slices() MXGEFW_CMD_GET_MAX_RSS_QUEUES
1537 	 * has magic side effects, and must be called after a reset.
1538 	 * It must be called prior to calling any RSS related cmds,
1539 	 * including assigning an interrupt queue for anything but
1540 	 * slice 0.  It must also be called *after*
1541 	 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1542 	 * the firmware to compute offsets.
1543 	 */
1544 
1545 	if (mgp->num_slices > 1) {
1546 
1547 		/* ask the maximum number of slices it supports */
1548 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
1549 		    &cmd);
1550 		if (status != 0) {
1551 			cmn_err(CE_WARN,
1552 			    "%s: failed to get number of slices\n",
1553 			    mgp->name);
1554 			return (status);
1555 		}
1556 
1557 		/*
1558 		 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1559 		 * to setting up the interrupt queue DMA
1560 		 */
1561 
1562 		cmd.data0 = mgp->num_slices;
1563 		cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE |
1564 		    MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1565 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
1566 		    &cmd);
1567 		if (status != 0) {
1568 			cmn_err(CE_WARN,
1569 			    "%s: failed to set number of slices\n",
1570 			    mgp->name);
1571 			return (status);
1572 		}
1573 	}
1574 	for (i = 0; i < mgp->num_slices; i++) {
1575 		ss = &mgp->ss[i];
1576 		cmd.data0 = ntohl(ss->rx_done.dma.low);
1577 		cmd.data1 = ntohl(ss->rx_done.dma.high);
1578 		cmd.data2 = i;
1579 		status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_DMA,
1580 		    &cmd);
1581 	};
1582 
1583 	status |= myri10ge_send_cmd(mgp,  MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1584 	for (i = 0; i < mgp->num_slices; i++) {
1585 		ss = &mgp->ss[i];
1586 		ss->irq_claim = (volatile unsigned int *)
1587 		    (void *)(mgp->sram + cmd.data0 + 8 * i);
1588 	}
1589 
1590 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
1591 		status |= myri10ge_send_cmd(mgp,
1592 		    MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1593 		mgp->irq_deassert = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1594 	}
1595 
1596 	status |= myri10ge_send_cmd(mgp,
1597 	    MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET, &cmd);
1598 	mgp->intr_coal_delay_ptr = (uint32_t *)(void *)(mgp->sram + cmd.data0);
1599 
1600 	if (status != 0) {
1601 		cmn_err(CE_WARN, "%s: failed set interrupt parameters\n",
1602 		    mgp->name);
1603 		return (status);
1604 	}
1605 
1606 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
1607 	(void) myri10ge_dma_test(mgp, MXGEFW_DMA_TEST);
1608 
1609 	/* reset mcp/driver shared state back to 0 */
1610 
1611 	for (i = 0; i < mgp->num_slices; i++) {
1612 		ss = &mgp->ss[i];
1613 		bytes = mgp->max_intr_slots *
1614 		    sizeof (*mgp->ss[0].rx_done.entry);
1615 		(void) memset(ss->rx_done.entry, 0, bytes);
1616 		ss->tx.req = 0;
1617 		ss->tx.done = 0;
1618 		ss->tx.pkt_done = 0;
1619 		ss->rx_big.cnt = 0;
1620 		ss->rx_small.cnt = 0;
1621 		ss->rx_done.idx = 0;
1622 		ss->rx_done.cnt = 0;
1623 		ss->rx_token = 0;
1624 		ss->tx.watchdog_done = 0;
1625 		ss->tx.watchdog_req = 0;
1626 		ss->tx.active = 0;
1627 		ss->tx.activate = 0;
1628 	}
1629 	mgp->watchdog_rx_pause = 0;
1630 	if (mgp->ksp_stat != NULL) {
1631 		ethstat = (struct myri10ge_nic_stat *)mgp->ksp_stat->ks_data;
1632 		ethstat->link_changes.value.ul = 0;
1633 	}
1634 	status = myri10ge_m_unicst(mgp, mgp->mac_addr);
1635 	myri10ge_change_promisc(mgp, 0);
1636 	(void) myri10ge_change_pause(mgp, mgp->pause);
1637 	return (status);
1638 }
1639 
1640 static int
1641 myri10ge_init_toeplitz(struct myri10ge_priv *mgp)
1642 {
1643 	myri10ge_cmd_t cmd;
1644 	int i, b, s, t, j;
1645 	int status;
1646 	uint32_t k[8];
1647 	uint32_t tmp;
1648 	uint8_t *key;
1649 
1650 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
1651 	    &cmd);
1652 	if (status != 0) {
1653 		cmn_err(CE_WARN, "%s: failed to get rss key\n",
1654 		    mgp->name);
1655 		return (EIO);
1656 	}
1657 	myri10ge_pio_copy32(mgp->rss_key,
1658 	    (uint32_t *)(void*)((char *)mgp->sram + cmd.data0),
1659 	    sizeof (mgp->rss_key));
1660 
1661 	mgp->toeplitz_hash_table = kmem_alloc(sizeof (uint32_t) * 12 * 256,
1662 	    KM_SLEEP);
1663 	key = (uint8_t *)mgp->rss_key;
1664 	t = 0;
1665 	for (b = 0; b < 12; b++) {
1666 		for (s = 0; s < 8; s++) {
1667 			/* Bits: b*8+s, ..., b*8+s+31 */
1668 			k[s] = 0;
1669 			for (j = 0; j < 32; j++) {
1670 				int bit = b*8+s+j;
1671 				bit = 0x1 & (key[bit / 8] >> (7 -(bit & 0x7)));
1672 				k[s] |= bit << (31 - j);
1673 			}
1674 		}
1675 
1676 		for (i = 0; i <= 0xff; i++) {
1677 			tmp = 0;
1678 			if (i & (1 << 7)) { tmp ^= k[0]; }
1679 			if (i & (1 << 6)) { tmp ^= k[1]; }
1680 			if (i & (1 << 5)) { tmp ^= k[2]; }
1681 			if (i & (1 << 4)) { tmp ^= k[3]; }
1682 			if (i & (1 << 3)) { tmp ^= k[4]; }
1683 			if (i & (1 << 2)) { tmp ^= k[5]; }
1684 			if (i & (1 << 1)) { tmp ^= k[6]; }
1685 			if (i & (1 << 0)) { tmp ^= k[7]; }
1686 			mgp->toeplitz_hash_table[t++] = tmp;
1687 		}
1688 	}
1689 	return (0);
1690 }
1691 
1692 static inline struct myri10ge_slice_state *
1693 myri10ge_toeplitz_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1694 {
1695 	struct tcphdr *hdr;
1696 	uint32_t saddr, daddr;
1697 	uint32_t hash, slice;
1698 	uint32_t *table = mgp->toeplitz_hash_table;
1699 	uint16_t src, dst;
1700 
1701 	/*
1702 	 * Note hashing order is reversed from how it is done
1703 	 * in the NIC, so as to generate the same hash value
1704 	 * for the connection to try to keep connections CPU local
1705 	 */
1706 
1707 	/* hash on IPv4 src/dst address */
1708 	saddr = ntohl(ip->ip_src.s_addr);
1709 	daddr = ntohl(ip->ip_dst.s_addr);
1710 	hash = table[(256 * 0) + ((daddr >> 24) & 0xff)];
1711 	hash ^= table[(256 * 1) + ((daddr >> 16) & 0xff)];
1712 	hash ^= table[(256 * 2) + ((daddr >> 8) & 0xff)];
1713 	hash ^= table[(256 * 3) + ((daddr) & 0xff)];
1714 	hash ^= table[(256 * 4) + ((saddr >> 24) & 0xff)];
1715 	hash ^= table[(256 * 5) + ((saddr >> 16) & 0xff)];
1716 	hash ^= table[(256 * 6) + ((saddr >> 8) & 0xff)];
1717 	hash ^= table[(256 * 7) + ((saddr) & 0xff)];
1718 	/* hash on TCP port, if required */
1719 	if ((myri10ge_rss_hash & MXGEFW_RSS_HASH_TYPE_TCP_IPV4) &&
1720 	    ip->ip_p == IPPROTO_TCP) {
1721 		hdr = (struct tcphdr *)(void *)
1722 		    (((uint8_t *)ip) +  (ip->ip_hl << 2));
1723 		src = ntohs(hdr->th_sport);
1724 		dst = ntohs(hdr->th_dport);
1725 
1726 		hash ^= table[(256 * 8) + ((dst >> 8) & 0xff)];
1727 		hash ^= table[(256 * 9) + ((dst) & 0xff)];
1728 		hash ^= table[(256 * 10) + ((src >> 8) & 0xff)];
1729 		hash ^= table[(256 * 11) + ((src) & 0xff)];
1730 	}
1731 	slice = (mgp->num_slices - 1) & hash;
1732 	return (&mgp->ss[slice]);
1733 
1734 }
1735 
1736 static inline struct myri10ge_slice_state *
1737 myri10ge_simple_send_hash(struct myri10ge_priv *mgp, struct ip *ip)
1738 {
1739 	struct tcphdr *hdr;
1740 	uint32_t slice, hash_val;
1741 
1742 
1743 	if (ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP) {
1744 		return (&mgp->ss[0]);
1745 	}
1746 	hdr = (struct tcphdr *)(void *)(((uint8_t *)ip) +  (ip->ip_hl << 2));
1747 
1748 	/*
1749 	 * Use the second byte of the *destination* address for
1750 	 * MXGEFW_RSS_HASH_TYPE_SRC_PORT, so as to match NIC's hashing
1751 	 */
1752 	hash_val = ntohs(hdr->th_dport) & 0xff;
1753 	if (myri10ge_rss_hash == MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT)
1754 		hash_val += ntohs(hdr->th_sport) & 0xff;
1755 
1756 	slice = (mgp->num_slices - 1) & hash_val;
1757 	return (&mgp->ss[slice]);
1758 }
1759 
1760 static inline struct myri10ge_slice_state *
1761 myri10ge_send_hash(struct myri10ge_priv *mgp, mblk_t *mp)
1762 {
1763 	unsigned int slice = 0;
1764 	struct ether_header *eh;
1765 	struct ether_vlan_header *vh;
1766 	struct ip *ip;
1767 	int ehl, ihl;
1768 
1769 	if (mgp->num_slices == 1)
1770 		return (&mgp->ss[0]);
1771 
1772 	if (myri10ge_tx_hash == 0) {
1773 		slice = CPU->cpu_id & (mgp->num_slices - 1);
1774 		return (&mgp->ss[slice]);
1775 	}
1776 
1777 	/*
1778 	 *  ensure it is a TCP or UDP over IPv4 packet, and that the
1779 	 *  headers are in the 1st mblk.  Otherwise, punt
1780 	 */
1781 	ehl = sizeof (*eh);
1782 	ihl = sizeof (*ip);
1783 	if ((MBLKL(mp)) <  (ehl + ihl + 8))
1784 		return (&mgp->ss[0]);
1785 	eh = (struct ether_header *)(void *)mp->b_rptr;
1786 	ip = (struct ip *)(void *)(eh + 1);
1787 	if (eh->ether_type != BE_16(ETHERTYPE_IP)) {
1788 		if (eh->ether_type != BE_16(ETHERTYPE_VLAN))
1789 			return (&mgp->ss[0]);
1790 		vh = (struct ether_vlan_header *)(void *)mp->b_rptr;
1791 		if (vh->ether_type != BE_16(ETHERTYPE_IP))
1792 			return (&mgp->ss[0]);
1793 		ehl += 4;
1794 		ip = (struct ip *)(void *)(vh + 1);
1795 	}
1796 	ihl = ip->ip_hl << 2;
1797 	if (MBLKL(mp) <  (ehl + ihl + 8))
1798 		return (&mgp->ss[0]);
1799 	switch (myri10ge_rss_hash) {
1800 	case MXGEFW_RSS_HASH_TYPE_IPV4:
1801 		/* fallthru */
1802 	case MXGEFW_RSS_HASH_TYPE_TCP_IPV4:
1803 		/* fallthru */
1804 	case (MXGEFW_RSS_HASH_TYPE_IPV4|MXGEFW_RSS_HASH_TYPE_TCP_IPV4):
1805 		return (myri10ge_toeplitz_send_hash(mgp, ip));
1806 	case MXGEFW_RSS_HASH_TYPE_SRC_PORT:
1807 		/* fallthru */
1808 	case MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT:
1809 		return (myri10ge_simple_send_hash(mgp, ip));
1810 	default:
1811 		break;
1812 	}
1813 	return (&mgp->ss[0]);
1814 }
1815 
1816 static int
1817 myri10ge_setup_slice(struct myri10ge_slice_state *ss)
1818 {
1819 	struct myri10ge_priv *mgp = ss->mgp;
1820 	myri10ge_cmd_t cmd;
1821 	int tx_ring_size, rx_ring_size;
1822 	int tx_ring_entries, rx_ring_entries;
1823 	int slice, status;
1824 	int allocated, idx;
1825 	size_t bytes;
1826 
1827 	slice = ss - mgp->ss;
1828 	cmd.data0 = slice;
1829 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
1830 	tx_ring_size = cmd.data0;
1831 	cmd.data0 = slice;
1832 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
1833 	if (status != 0)
1834 		return (status);
1835 	rx_ring_size = cmd.data0;
1836 
1837 	tx_ring_entries = tx_ring_size / sizeof (struct mcp_kreq_ether_send);
1838 	rx_ring_entries = rx_ring_size / sizeof (struct mcp_dma_addr);
1839 	ss->tx.mask = tx_ring_entries - 1;
1840 	ss->rx_small.mask = ss->rx_big.mask = rx_ring_entries - 1;
1841 
1842 	/* get the lanai pointers to the send and receive rings */
1843 
1844 	cmd.data0 = slice;
1845 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
1846 	ss->tx.lanai = (mcp_kreq_ether_send_t *)(void *)(mgp->sram + cmd.data0);
1847 	if (mgp->num_slices > 1) {
1848 		ss->tx.go = (char *)mgp->sram + MXGEFW_ETH_SEND_GO + 64 * slice;
1849 		ss->tx.stop = (char *)mgp->sram + MXGEFW_ETH_SEND_STOP +
1850 		    64 * slice;
1851 	} else {
1852 		ss->tx.go = NULL;
1853 		ss->tx.stop = NULL;
1854 	}
1855 
1856 	cmd.data0 = slice;
1857 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
1858 	ss->rx_small.lanai = (mcp_kreq_ether_recv_t *)
1859 	    (void *)(mgp->sram + cmd.data0);
1860 
1861 	cmd.data0 = slice;
1862 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
1863 	ss->rx_big.lanai = (mcp_kreq_ether_recv_t *)(void *)
1864 	    (mgp->sram + cmd.data0);
1865 
1866 	if (status != 0) {
1867 		cmn_err(CE_WARN,
1868 		    "%s: failed to get ring sizes or locations\n", mgp->name);
1869 		return (status);
1870 	}
1871 
1872 	status = ENOMEM;
1873 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
1874 	ss->rx_small.shadow = kmem_zalloc(bytes, KM_SLEEP);
1875 	if (ss->rx_small.shadow == NULL)
1876 		goto abort;
1877 	(void) memset(ss->rx_small.shadow, 0, bytes);
1878 
1879 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
1880 	ss->rx_big.shadow = kmem_zalloc(bytes, KM_SLEEP);
1881 	if (ss->rx_big.shadow == NULL)
1882 		goto abort_with_rx_small_shadow;
1883 	(void) memset(ss->rx_big.shadow, 0, bytes);
1884 
1885 	/* allocate the host info rings */
1886 
1887 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
1888 	ss->tx.info = kmem_zalloc(bytes, KM_SLEEP);
1889 	if (ss->tx.info == NULL)
1890 		goto abort_with_rx_big_shadow;
1891 	(void) memset(ss->tx.info, 0, bytes);
1892 
1893 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1894 	ss->rx_small.info = kmem_zalloc(bytes, KM_SLEEP);
1895 	if (ss->rx_small.info == NULL)
1896 		goto abort_with_tx_info;
1897 	(void) memset(ss->rx_small.info, 0, bytes);
1898 
1899 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1900 	ss->rx_big.info = kmem_zalloc(bytes, KM_SLEEP);
1901 	if (ss->rx_big.info == NULL)
1902 		goto abort_with_rx_small_info;
1903 	(void) memset(ss->rx_big.info, 0, bytes);
1904 
1905 	ss->tx.stall = ss->tx.sched = 0;
1906 	ss->tx.stall_early = ss->tx.stall_late = 0;
1907 
1908 	ss->jbufs_for_smalls = 1 + (1 + ss->rx_small.mask) /
1909 	    (myri10ge_mtu / (myri10ge_small_bytes + MXGEFW_PAD));
1910 
1911 	allocated = myri10ge_add_jbufs(ss,
1912 	    myri10ge_bigbufs_initial + ss->jbufs_for_smalls, 1);
1913 	if (allocated < ss->jbufs_for_smalls + myri10ge_bigbufs_initial) {
1914 		cmn_err(CE_WARN,
1915 		    "%s: Could not allocate enough receive buffers (%d/%d)\n",
1916 		    mgp->name, allocated,
1917 		    myri10ge_bigbufs_initial + ss->jbufs_for_smalls);
1918 		goto abort_with_jumbos;
1919 	}
1920 
1921 	myri10ge_carve_up_jbufs_into_small_ring(ss);
1922 	ss->j_rx_cnt = 0;
1923 
1924 	mutex_enter(&ss->jpool.mtx);
1925 	if (allocated < rx_ring_entries)
1926 		ss->jpool.low_water = allocated / 4;
1927 	else
1928 		ss->jpool.low_water = rx_ring_entries / 2;
1929 
1930 	/*
1931 	 * invalidate the big receive ring in case we do not
1932 	 * allocate sufficient jumbos to fill it
1933 	 */
1934 	(void) memset(ss->rx_big.shadow, 1,
1935 	    (ss->rx_big.mask + 1) * sizeof (ss->rx_big.shadow[0]));
1936 	for (idx = 7; idx <= ss->rx_big.mask; idx += 8) {
1937 		myri10ge_submit_8rx(&ss->rx_big.lanai[idx - 7],
1938 		    &ss->rx_big.shadow[idx - 7]);
1939 		mb();
1940 	}
1941 
1942 
1943 	myri10ge_restock_jumbos(ss);
1944 
1945 	for (idx = 7; idx <= ss->rx_small.mask; idx += 8) {
1946 		myri10ge_submit_8rx(&ss->rx_small.lanai[idx - 7],
1947 		    &ss->rx_small.shadow[idx - 7]);
1948 		mb();
1949 	}
1950 	ss->rx_small.cnt = ss->rx_small.mask + 1;
1951 
1952 	mutex_exit(&ss->jpool.mtx);
1953 
1954 	status = myri10ge_prepare_tx_ring(ss);
1955 
1956 	if (status != 0)
1957 		goto abort_with_small_jbufs;
1958 
1959 	cmd.data0 = ntohl(ss->fw_stats_dma.low);
1960 	cmd.data1 = ntohl(ss->fw_stats_dma.high);
1961 	cmd.data2 = sizeof (mcp_irq_data_t);
1962 	cmd.data2 |= (slice << 16);
1963 	bzero(ss->fw_stats, sizeof (*ss->fw_stats));
1964 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
1965 	if (status == ENOSYS) {
1966 		cmd.data0 = ntohl(ss->fw_stats_dma.low) +
1967 		    offsetof(mcp_irq_data_t, send_done_count);
1968 		cmd.data1 = ntohl(ss->fw_stats_dma.high);
1969 		status = myri10ge_send_cmd(mgp,
1970 		    MXGEFW_CMD_SET_STATS_DMA_OBSOLETE, &cmd);
1971 	}
1972 	if (status) {
1973 		cmn_err(CE_WARN, "%s: Couldn't set stats DMA\n", mgp->name);
1974 		goto abort_with_tx;
1975 	}
1976 
1977 	return (0);
1978 
1979 abort_with_tx:
1980 	myri10ge_unprepare_tx_ring(ss);
1981 
1982 abort_with_small_jbufs:
1983 	myri10ge_release_small_jbufs(ss);
1984 
1985 abort_with_jumbos:
1986 	if (allocated != 0) {
1987 		mutex_enter(&ss->jpool.mtx);
1988 		ss->jpool.low_water = 0;
1989 		mutex_exit(&ss->jpool.mtx);
1990 		myri10ge_unstock_jumbos(ss);
1991 		myri10ge_remove_jbufs(ss);
1992 	}
1993 
1994 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
1995 	kmem_free(ss->rx_big.info, bytes);
1996 
1997 abort_with_rx_small_info:
1998 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
1999 	kmem_free(ss->rx_small.info, bytes);
2000 
2001 abort_with_tx_info:
2002 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2003 	kmem_free(ss->tx.info, bytes);
2004 
2005 abort_with_rx_big_shadow:
2006 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2007 	kmem_free(ss->rx_big.shadow, bytes);
2008 
2009 abort_with_rx_small_shadow:
2010 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2011 	kmem_free(ss->rx_small.shadow, bytes);
2012 abort:
2013 	return (status);
2014 
2015 }
2016 
2017 static void
2018 myri10ge_teardown_slice(struct myri10ge_slice_state *ss)
2019 {
2020 	int tx_ring_entries, rx_ring_entries;
2021 	size_t bytes;
2022 
2023 	/* ignore slices that have not been fully setup */
2024 	if (ss->tx.cp == NULL)
2025 		return;
2026 	/* Free the TX copy buffers */
2027 	myri10ge_unprepare_tx_ring(ss);
2028 
2029 	/* stop passing returned buffers to firmware */
2030 
2031 	mutex_enter(&ss->jpool.mtx);
2032 	ss->jpool.low_water = 0;
2033 	mutex_exit(&ss->jpool.mtx);
2034 	myri10ge_release_small_jbufs(ss);
2035 
2036 	/* Release the free jumbo frame pool */
2037 	myri10ge_unstock_jumbos(ss);
2038 	myri10ge_remove_jbufs(ss);
2039 
2040 	rx_ring_entries = ss->rx_big.mask + 1;
2041 	tx_ring_entries = ss->tx.mask + 1;
2042 
2043 	bytes = rx_ring_entries * sizeof (*ss->rx_big.info);
2044 	kmem_free(ss->rx_big.info, bytes);
2045 
2046 	bytes = rx_ring_entries * sizeof (*ss->rx_small.info);
2047 	kmem_free(ss->rx_small.info, bytes);
2048 
2049 	bytes = tx_ring_entries * sizeof (*ss->tx.info);
2050 	kmem_free(ss->tx.info, bytes);
2051 
2052 	bytes = rx_ring_entries * sizeof (*ss->rx_big.shadow);
2053 	kmem_free(ss->rx_big.shadow, bytes);
2054 
2055 	bytes = rx_ring_entries * sizeof (*ss->rx_small.shadow);
2056 	kmem_free(ss->rx_small.shadow, bytes);
2057 
2058 }
2059 static int
2060 myri10ge_start_locked(struct myri10ge_priv *mgp)
2061 {
2062 	myri10ge_cmd_t cmd;
2063 	int status, big_pow2, i;
2064 	volatile uint8_t *itable;
2065 
2066 	status = DDI_SUCCESS;
2067 	/* Allocate DMA resources and receive buffers */
2068 
2069 	status = myri10ge_reset(mgp);
2070 	if (status != 0) {
2071 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
2072 		return (DDI_FAILURE);
2073 	}
2074 
2075 	if (mgp->num_slices > 1) {
2076 		cmd.data0 = mgp->num_slices;
2077 		cmd.data1 = 1; /* use MSI-X */
2078 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ENABLE_RSS_QUEUES,
2079 		    &cmd);
2080 		if (status != 0) {
2081 			cmn_err(CE_WARN,
2082 			    "%s: failed to set number of slices\n",
2083 			    mgp->name);
2084 			goto abort_with_nothing;
2085 		}
2086 		/* setup the indirection table */
2087 		cmd.data0 = mgp->num_slices;
2088 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_TABLE_SIZE,
2089 		    &cmd);
2090 
2091 		status |= myri10ge_send_cmd(mgp,
2092 		    MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
2093 		if (status != 0) {
2094 			cmn_err(CE_WARN,
2095 			    "%s: failed to setup rss tables\n", mgp->name);
2096 		}
2097 
2098 		/* just enable an identity mapping */
2099 		itable = mgp->sram + cmd.data0;
2100 		for (i = 0; i < mgp->num_slices; i++)
2101 			itable[i] = (uint8_t)i;
2102 
2103 		if (myri10ge_rss_hash & MYRI10GE_TOEPLITZ_HASH) {
2104 			status = myri10ge_init_toeplitz(mgp);
2105 			if (status != 0) {
2106 				cmn_err(CE_WARN, "%s: failed to setup "
2107 				    "toeplitz tx hash table", mgp->name);
2108 				goto abort_with_nothing;
2109 			}
2110 		}
2111 		cmd.data0 = 1;
2112 		cmd.data1 = myri10ge_rss_hash;
2113 		status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_RSS_ENABLE,
2114 		    &cmd);
2115 		if (status != 0) {
2116 			cmn_err(CE_WARN,
2117 			    "%s: failed to enable slices\n", mgp->name);
2118 			goto abort_with_toeplitz;
2119 		}
2120 	}
2121 
2122 	for (i = 0; i < mgp->num_slices; i++) {
2123 		status = myri10ge_setup_slice(&mgp->ss[i]);
2124 		if (status != 0)
2125 			goto abort_with_slices;
2126 	}
2127 
2128 	/*
2129 	 * Tell the MCP how many buffers he has, and to
2130 	 *  bring the ethernet interface up
2131 	 *
2132 	 * Firmware needs the big buff size as a power of 2.  Lie and
2133 	 * tell him the buffer is larger, because we only use 1
2134 	 * buffer/pkt, and the mtu will prevent overruns
2135 	 */
2136 	big_pow2 = myri10ge_mtu + MXGEFW_PAD;
2137 	while ((big_pow2 & (big_pow2 - 1)) != 0)
2138 		big_pow2++;
2139 
2140 	/* now give firmware buffers sizes, and MTU */
2141 	cmd.data0 = myri10ge_mtu;
2142 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_MTU, &cmd);
2143 	cmd.data0 = myri10ge_small_bytes;
2144 	status |=
2145 	    myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
2146 	cmd.data0 = big_pow2;
2147 	status |= myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
2148 	if (status) {
2149 		cmn_err(CE_WARN, "%s: Couldn't set buffer sizes\n", mgp->name);
2150 		goto abort_with_slices;
2151 	}
2152 
2153 
2154 	cmd.data0 = 1;
2155 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_TSO_MODE, &cmd);
2156 	if (status) {
2157 		cmn_err(CE_WARN, "%s: unable to setup TSO (%d)\n",
2158 		    mgp->name, status);
2159 	} else {
2160 		mgp->features |= MYRI10GE_TSO;
2161 	}
2162 
2163 	mgp->link_state = -1;
2164 	mgp->rdma_tags_available = 15;
2165 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_UP, &cmd);
2166 	if (status) {
2167 		cmn_err(CE_WARN, "%s: unable to start ethernet\n", mgp->name);
2168 		goto abort_with_slices;
2169 	}
2170 	mgp->running = MYRI10GE_ETH_RUNNING;
2171 	return (DDI_SUCCESS);
2172 
2173 abort_with_slices:
2174 	for (i = 0; i < mgp->num_slices; i++)
2175 		myri10ge_teardown_slice(&mgp->ss[i]);
2176 
2177 	mgp->running = MYRI10GE_ETH_STOPPED;
2178 
2179 abort_with_toeplitz:
2180 	if (mgp->toeplitz_hash_table != NULL) {
2181 		kmem_free(mgp->toeplitz_hash_table,
2182 		    sizeof (uint32_t) * 12 * 256);
2183 		mgp->toeplitz_hash_table = NULL;
2184 	}
2185 
2186 abort_with_nothing:
2187 	return (DDI_FAILURE);
2188 }
2189 
2190 static void
2191 myri10ge_stop_locked(struct myri10ge_priv *mgp)
2192 {
2193 	int status, old_down_cnt;
2194 	myri10ge_cmd_t cmd;
2195 	int wait_time = 10;
2196 	int i, polling;
2197 
2198 	old_down_cnt = mgp->down_cnt;
2199 	mb();
2200 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
2201 	if (status) {
2202 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
2203 	}
2204 
2205 	while (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2206 		delay(1 * drv_usectohz(1000000));
2207 		wait_time--;
2208 		if (wait_time == 0)
2209 			break;
2210 	}
2211 again:
2212 	if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2213 		cmn_err(CE_WARN, "%s: didn't get down irq\n", mgp->name);
2214 		for (i = 0; i < mgp->num_slices; i++) {
2215 			/*
2216 			 * take and release the rx lock to ensure
2217 			 * that no interrupt thread is blocked
2218 			 * elsewhere in the stack, preventing
2219 			 * completion
2220 			 */
2221 
2222 			mutex_enter(&mgp->ss[i].rx_lock);
2223 			printf("%s: slice %d rx irq idle\n",
2224 			    mgp->name, i);
2225 			mutex_exit(&mgp->ss[i].rx_lock);
2226 
2227 			/* verify that the poll handler is inactive */
2228 			mutex_enter(&mgp->ss->poll_lock);
2229 			polling = mgp->ss->rx_polling;
2230 			mutex_exit(&mgp->ss->poll_lock);
2231 			if (polling) {
2232 				printf("%s: slice %d is polling\n",
2233 				    mgp->name, i);
2234 				delay(1 * drv_usectohz(1000000));
2235 				goto again;
2236 			}
2237 		}
2238 		delay(1 * drv_usectohz(1000000));
2239 		if (old_down_cnt == *((volatile int *)&mgp->down_cnt)) {
2240 			cmn_err(CE_WARN, "%s: Never got down irq\n", mgp->name);
2241 		}
2242 	}
2243 
2244 	for (i = 0; i < mgp->num_slices; i++)
2245 		myri10ge_teardown_slice(&mgp->ss[i]);
2246 
2247 	if (mgp->toeplitz_hash_table != NULL) {
2248 		kmem_free(mgp->toeplitz_hash_table,
2249 		    sizeof (uint32_t) * 12 * 256);
2250 		mgp->toeplitz_hash_table = NULL;
2251 	}
2252 	mgp->running = MYRI10GE_ETH_STOPPED;
2253 }
2254 
2255 static int
2256 myri10ge_m_start(void *arg)
2257 {
2258 	struct myri10ge_priv *mgp = arg;
2259 	int status;
2260 
2261 	mutex_enter(&mgp->intrlock);
2262 
2263 	if (mgp->running != MYRI10GE_ETH_STOPPED) {
2264 		mutex_exit(&mgp->intrlock);
2265 		return (DDI_FAILURE);
2266 	}
2267 	status = myri10ge_start_locked(mgp);
2268 	mutex_exit(&mgp->intrlock);
2269 
2270 	if (status != DDI_SUCCESS)
2271 		return (status);
2272 
2273 	/* start the watchdog timer */
2274 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
2275 	    mgp->timer_ticks);
2276 	return (DDI_SUCCESS);
2277 
2278 }
2279 
2280 static void
2281 myri10ge_m_stop(void *arg)
2282 {
2283 	struct myri10ge_priv *mgp = arg;
2284 
2285 	mutex_enter(&mgp->intrlock);
2286 	/* if the device not running give up */
2287 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
2288 		mutex_exit(&mgp->intrlock);
2289 		return;
2290 	}
2291 
2292 	mgp->running = MYRI10GE_ETH_STOPPING;
2293 	mutex_exit(&mgp->intrlock);
2294 	(void) untimeout(mgp->timer_id);
2295 	mutex_enter(&mgp->intrlock);
2296 	myri10ge_stop_locked(mgp);
2297 	mutex_exit(&mgp->intrlock);
2298 
2299 }
2300 
/*
 * Update the broadcast/multicast receive counters in 's' for the frame
 * in 'mp' and, for TCP or UDP carried over IPv4 or IPv6, attach the
 * firmware-supplied checksum 'csum' to the mblk as a partial checksum
 * (HCK_PARTIALCKSUM).  Other frames are left without checksum
 * metadata, as are frames whose mblk is longer than the length the IP
 * header claims.
 */
static inline void
myri10ge_rx_csum(mblk_t *mp, struct myri10ge_rx_ring_stats *s, uint32_t csum)
{
	struct ether_header *eh;
	struct ip *ip;
	struct ip6_hdr *ip6;
	uint32_t start, stuff, end, partial, hdrlen;


	/* only the low 16 bits of csum are used, byte-swapped via ntohs */
	csum = ntohs((uint16_t)csum);
	eh = (struct ether_header *)(void *)mp->b_rptr;
	hdrlen = sizeof (*eh);
	/* low bit of the first destination octet set: group address */
	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
			s->brdcstrcv++;
		else
			s->multircv++;
	}

	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
		/*
		 * fix checksum by subtracting 4 bytes after what the
		 * firmware thought was the end of the ether hdr
		 */
		partial = *(uint32_t *)
		    (void *)(mp->b_rptr + ETHERNET_HEADER_SIZE);
		/* add the complement with carry, then fold to 16 bits */
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
		hdrlen += VLAN_TAGSZ;
	}

	/*
	 * NOTE(review): eh->ether_type is still the outer ethertype
	 * here.  For a VLAN-tagged frame it equals ETHERTYPE_VLAN, so
	 * neither branch below appears to match and the function
	 * returns without attaching a checksum -- confirm whether the
	 * encapsulated ethertype was meant to be examined instead.
	 */
	if (eh->ether_type ==  BE_16(ETHERTYPE_IP)) {
		ip = (struct ip *)(void *)(mp->b_rptr + hdrlen);
		/* start: IPv4 header length in bytes */
		start = ip->ip_hl << 2;

		/* stuff: offset of the TCP/UDP checksum field */
		if (ip->ip_p == IPPROTO_TCP)
			stuff = start + offsetof(struct tcphdr, th_sum);
		else if (ip->ip_p == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = ntohs(ip->ip_len);
	} else if (eh->ether_type ==  BE_16(ETHERTYPE_IPV6)) {
		ip6 = (struct ip6_hdr *)(void *)(mp->b_rptr + hdrlen);
		/* only a TCP/UDP header directly after the fixed header */
		start = sizeof (*ip6);
		if (ip6->ip6_nxt == IPPROTO_TCP) {
			stuff = start + offsetof(struct tcphdr, th_sum);
		} else if (ip6->ip6_nxt == IPPROTO_UDP)
			stuff = start + offsetof(struct udphdr, uh_sum);
		else
			return;
		end = start + ntohs(ip6->ip6_plen);
		/*
		 * IPv6 headers do not contain a checksum, and hence
		 * do not checksum to zero, so they don't "fall out"
		 * of the partial checksum calculation like IPv4
		 * headers do.  We need to fix the partial checksum by
		 * subtracting the checksum of the IPv6 header.
		 */

		partial = myri10ge_csum_generic((uint16_t *)ip6, sizeof (*ip6));
		csum += ~partial;
		csum +=  (csum < ~partial);
		csum = (csum >> 16) + (csum & 0xFFFF);
		csum = (csum >> 16) + (csum & 0xFFFF);
	} else {
		return;
	}

	/* mblk holds more bytes than link header + IP length */
	if (MBLKL(mp) > hdrlen + end) {
		/* padded frame, so hw csum may be invalid */
		return;
	}

	(void) hcksum_assoc(mp, NULL, NULL, start, stuff, end,
	    csum, HCK_PARTIALCKSUM, 0);
}
2381 
2382 static mblk_t *
2383 myri10ge_rx_done_small(struct myri10ge_slice_state *ss, uint32_t len,
2384     uint32_t csum)
2385 {
2386 	mblk_t *mp;
2387 	myri10ge_rx_ring_t *rx;
2388 	int idx;
2389 
2390 	rx = &ss->rx_small;
2391 	idx = rx->cnt & rx->mask;
2392 	ss->rx_small.cnt++;
2393 
2394 	/* allocate a new buffer to pass up the stack */
2395 	mp = allocb(len + MXGEFW_PAD, 0);
2396 	if (mp == NULL) {
2397 		MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_small_nobuf);
2398 		goto abort;
2399 	}
2400 	bcopy(ss->rx_small.info[idx].ptr,
2401 	    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
2402 	mp->b_wptr += len + MXGEFW_PAD;
2403 	mp->b_rptr += MXGEFW_PAD;
2404 
2405 	ss->rx_stats.ibytes += len;
2406 	ss->rx_stats.ipackets += 1;
2407 	myri10ge_rx_csum(mp, &ss->rx_stats, csum);
2408 
2409 abort:
2410 	if ((idx & 7) == 7) {
2411 		myri10ge_submit_8rx(&rx->lanai[idx - 7],
2412 		    &rx->shadow[idx - 7]);
2413 	}
2414 
2415 	return (mp);
2416 }
2417 
2418 
/*
 * Handle one completed receive on the big (jumbo) buffer ring.
 *
 * Normally the jumbo buffer itself is loaned to the stack through
 * desballoc().  When the number of buffers still posted to the NIC
 * drops below the pool's low-water mark and restocking does not bring
 * it back to at least 16, the frame is copied into an allocb() buffer
 * instead and the jumbo buffer is returned right away.
 *
 * Returns the mblk to pass up, or NULL if the ring slot had no buffer
 * or an mblk could not be allocated.
 */
static mblk_t *
myri10ge_rx_done_big(struct myri10ge_slice_state *ss, uint32_t len,
    uint32_t csum)
{
	struct myri10ge_jpool_stuff *jpool;
	struct myri10ge_jpool_entry *j;
	mblk_t *mp;
	int idx, num_owned_by_mcp;

	jpool = &ss->jpool;
	idx = ss->j_rx_cnt & ss->rx_big.mask;
	j = ss->rx_big.info[idx].j;

	if (j == NULL) {
		printf("%s: null j at idx=%d, rx_big.cnt = %d, j_rx_cnt=%d\n",
		    ss->mgp->name, idx, ss->rx_big.cnt, ss->j_rx_cnt);
		return (NULL);
	}


	/* the slot no longer owns the buffer; count it as consumed */
	ss->rx_big.info[idx].j = NULL;
	ss->j_rx_cnt++;


	/*
	 * Check to see if we are low on rx buffers.
	 * Note that we must leave at least 8 free so there are
	 * enough to free in a single 64-byte write.
	 */
	num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
	if (num_owned_by_mcp < jpool->low_water) {
		mutex_enter(&jpool->mtx);
		myri10ge_restock_jumbos(ss);
		mutex_exit(&jpool->mtx);
		num_owned_by_mcp = ss->rx_big.cnt - ss->j_rx_cnt;
		/* if we are still low, then we have to copy */
		if (num_owned_by_mcp < 16) {
			MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_copy);
			/* allocate a new buffer to pass up the stack */
			mp = allocb(len + MXGEFW_PAD, 0);
			if (mp == NULL) {
				goto abort;
			}
			bcopy(j->buf,
			    (caddr_t)mp->b_wptr, len + MXGEFW_PAD);
			/* data is copied out, so the jumbo buffer is done */
			myri10ge_jfree_rtn(j);
			/* push buffer back to NIC */
			mutex_enter(&jpool->mtx);
			myri10ge_restock_jumbos(ss);
			mutex_exit(&jpool->mtx);
			goto set_len;
		}
	}

	/* loan our buffer to the stack */
	mp = desballoc((unsigned char *)j->buf, myri10ge_mtu, 0, &j->free_func);
	if (mp == NULL) {
		goto abort;
	}

set_len:
	/* skip the pad bytes; the frame is the 'len' bytes that follow */
	mp->b_rptr += MXGEFW_PAD;
	mp->b_wptr = ((unsigned char *) mp->b_rptr + len);

	ss->rx_stats.ibytes += len;
	ss->rx_stats.ipackets += 1;
	myri10ge_rx_csum(mp, &ss->rx_stats, csum);

	return (mp);

abort:
	/* no mblk could be obtained: release the buffer and drop the frame */
	myri10ge_jfree_rtn(j);
	MYRI10GE_ATOMIC_SLICE_STAT_INC(rx_big_nobuf);
	return (NULL);
}
2494 
/*
 * Free all transmit buffers up until the specified index
 *
 * Walks the transmit ring from tx->done, unbinding DMA handles and
 * freeing mblks, until tx->pkt_done matches mcp_index (the firmware's
 * count of completed packets) or the iteration limit is hit.
 * Per-packet statistics recorded in the ring slots are folded into
 * tx->stats, a stalled ring is rescheduled once enough slots are
 * free, and the collected DMA handles are released in one batch at
 * the end.
 */
static inline void
myri10ge_tx_done(struct myri10ge_slice_state *ss, uint32_t mcp_index)
{
	myri10ge_tx_ring_t *tx;
	struct myri10ge_tx_dma_handle_head handles;
	int idx;
	int limit = 0;

	tx = &ss->tx;
	/* local list of DMA handles to free after the loop */
	handles.head = NULL;
	handles.tail = NULL;
	while (tx->pkt_done != (int)mcp_index) {
		idx = tx->done & tx->mask;

		/*
		 * mblk & DMA handle attached only to first slot
		 * per buffer in the packet
		 */

		if (tx->info[idx].m) {
			(void) ddi_dma_unbind_handle(tx->info[idx].handle->h);
			/* push the handle onto the local list */
			tx->info[idx].handle->next = handles.head;
			handles.head = tx->info[idx].handle;
			if (handles.tail == NULL)
				handles.tail = tx->info[idx].handle;
			freeb(tx->info[idx].m);
			tx->info[idx].m = 0;
			tx->info[idx].handle = 0;
		}
		/*
		 * A non-zero packet count in this slot's stats marks a
		 * completed packet; accumulate its counters and clear
		 * them.  (ostat presumably aliases part of stat.un --
		 * confirm in myri10ge_var.h.)
		 */
		if (tx->info[idx].ostat.opackets != 0) {
			tx->stats.multixmt += tx->info[idx].ostat.multixmt;
			tx->stats.brdcstxmt += tx->info[idx].ostat.brdcstxmt;
			tx->stats.obytes += tx->info[idx].ostat.obytes;
			tx->stats.opackets += tx->info[idx].ostat.opackets;
			tx->info[idx].stat.un.all = 0;
			tx->pkt_done++;
		}

		tx->done++;
		/*
		 * if we stalled the queue, wake it.  But Wait until
		 * we have at least 1/2 our slots free.
		 */
		if ((tx->req - tx->done) < (tx->mask >> 1) &&
		    tx->stall != tx->sched) {
			mutex_enter(&ss->tx.lock);
			tx->sched = tx->stall;
			mutex_exit(&ss->tx.lock);
			mac_tx_ring_update(ss->mgp->mh, tx->rh);
		}

		/* limit potential for livelock */
		if (unlikely(++limit >  2 * tx->mask))
			break;
	}
	if (tx->req == tx->done && tx->stop != NULL) {
		/*
		 * Nic has sent all pending requests, allow him
		 * to stop polling this queue
		 */
		mutex_enter(&tx->lock);
		/* re-check under the lock before writing the stop word */
		if (tx->req == tx->done && tx->active) {
			*(int *)(void *)tx->stop = 1;
			tx->active = 0;
			mb();
		}
		mutex_exit(&tx->lock);
	}
	if (handles.head != NULL)
		myri10ge_free_tx_handles(tx, &handles);
}
2569 
2570 static void
2571 myri10ge_mbl_init(struct myri10ge_mblk_list *mbl)
2572 {
2573 	mbl->head = NULL;
2574 	mbl->tail = &mbl->head;
2575 	mbl->cnt = 0;
2576 }
2577 
2578 /*ARGSUSED*/
2579 void
2580 myri10ge_mbl_append(struct myri10ge_slice_state *ss,
2581     struct myri10ge_mblk_list *mbl, mblk_t *mp)
2582 {
2583 	*(mbl->tail) = mp;
2584 	mbl->tail = &mp->b_next;
2585 	mp->b_next = NULL;
2586 	mbl->cnt++;
2587 }
2588 
2589 
/*
 * Drain the receive-completion ring for a slice.
 *
 * Each completion entry with a non-zero length is turned into an mblk
 * by the small or big receive path (chosen by comparing the length
 * with myri10ge_small_bytes) and either merged by LRO or appended to
 * 'mbl'.  Processing stops when the ring is empty, when *stop becomes
 * true, or when the lengths processed would exceed 'limit' bytes.
 * Any LRO sessions still active afterwards are flushed into 'mbl'.
 */
static inline void
myri10ge_clean_rx_done(struct myri10ge_slice_state *ss,
    struct myri10ge_mblk_list *mbl, int limit, boolean_t *stop)
{
	myri10ge_rx_done_t *rx_done = &ss->rx_done;
	struct myri10ge_priv *mgp = ss->mgp;
	mblk_t *mp;
	struct lro_entry *lro;
	uint16_t length;
	uint16_t checksum;


	while (rx_done->entry[rx_done->idx].length != 0) {
		/* *stop is re-read on every iteration */
		if (unlikely (*stop)) {
			break;
		}
		length = ntohs(rx_done->entry[rx_done->idx].length);
		/* strip the bits covered by MXGEFW_RSS_HASH_MASK */
		length &= (~MXGEFW_RSS_HASH_MASK);

		/* limit potential for livelock */
		limit -= length;
		if (unlikely(limit < 0))
			break;

		/* a zero length marks the entry as consumed */
		rx_done->entry[rx_done->idx].length = 0;
		checksum = ntohs(rx_done->entry[rx_done->idx].checksum);
		if (length <= myri10ge_small_bytes)
			mp = myri10ge_rx_done_small(ss, length, checksum);
		else
			mp = myri10ge_rx_done_big(ss, length, checksum);
		if (mp != NULL) {
			/*
			 * append directly when LRO is disabled or when
			 * myri10ge_lro_rx() returns non-zero
			 */
			if (!myri10ge_lro ||
			    0 != myri10ge_lro_rx(ss, mp, checksum, mbl))
				myri10ge_mbl_append(ss, mbl, mp);
		}
		rx_done->cnt++;
		rx_done->idx = rx_done->cnt & (mgp->max_intr_slots - 1);
	}
	/* flush every LRO session that is still active */
	while (ss->lro_active != NULL) {
		lro = ss->lro_active;
		ss->lro_active = lro->next;
		myri10ge_lro_flush(ss, lro, mbl);
	}
}
2634 
2635 static void
2636 myri10ge_intr_rx(struct myri10ge_slice_state *ss)
2637 {
2638 	uint64_t gen;
2639 	struct myri10ge_mblk_list mbl;
2640 
2641 	myri10ge_mbl_init(&mbl);
2642 	if (mutex_tryenter(&ss->rx_lock) == 0)
2643 		return;
2644 	gen = ss->rx_gen_num;
2645 	myri10ge_clean_rx_done(ss, &mbl, MYRI10GE_POLL_NULL,
2646 	    &ss->rx_polling);
2647 	if (mbl.head != NULL)
2648 		mac_rx_ring(ss->mgp->mh, ss->rx_rh, mbl.head, gen);
2649 	mutex_exit(&ss->rx_lock);
2650 
2651 }
2652 
2653 static mblk_t *
2654 myri10ge_poll_rx(void *arg, int bytes)
2655 {
2656 	struct myri10ge_slice_state *ss = arg;
2657 	struct myri10ge_mblk_list mbl;
2658 	boolean_t dummy = B_FALSE;
2659 
2660 	if (bytes == 0)
2661 		return (NULL);
2662 
2663 	myri10ge_mbl_init(&mbl);
2664 	mutex_enter(&ss->rx_lock);
2665 	if (ss->rx_polling)
2666 		myri10ge_clean_rx_done(ss, &mbl, bytes, &dummy);
2667 	else
2668 		printf("%d: poll_rx: token=%d, polling=%d\n", (int)(ss -
2669 		    ss->mgp->ss), ss->rx_token, ss->rx_polling);
2670 	mutex_exit(&ss->rx_lock);
2671 	return (mbl.head);
2672 }
2673 
/*
 * Interrupt handler for one slice; arg0 is the slice state.
 *
 * Returns DDI_INTR_UNCLAIMED if the firmware statistics block is not
 * marked valid.  Otherwise it processes receives (when flagged),
 * acknowledges the interrupt, reaps transmit completions, handles
 * link-state and RDMA-tag changes reported in the statistics block,
 * and writes the irq_claim words before returning DDI_INTR_CLAIMED.
 */
/*ARGSUSED*/
static uint_t
myri10ge_intr(caddr_t arg0, caddr_t arg1)
{
	struct myri10ge_slice_state *ss =
	    (struct myri10ge_slice_state *)(void *)arg0;
	struct myri10ge_priv *mgp = ss->mgp;
	mcp_irq_data_t *stats = ss->fw_stats;
	myri10ge_tx_ring_t *tx = &ss->tx;
	uint32_t send_done_count;
	uint8_t valid;


	/* make sure the DMA has finished */
	if (!stats->valid) {
		return (DDI_INTR_UNCLAIMED);
	}
	/* keep a private copy; stats->valid is cleared further down */
	valid = stats->valid;

	/* low bit indicates receives are present */
	if (valid & 1)
		myri10ge_intr_rx(ss);

	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
		/* lower legacy IRQ  */
		*mgp->irq_deassert = 0;
		if (!myri10ge_deassert_wait)
			/* don't wait for conf. that irq is low */
			stats->valid = 0;
		mb();
	} else {
		/* no need to wait for conf. that irq is low */
		stats->valid = 0;
	}

	/*
	 * Reap transmit completions, repeating for as long as
	 * stats->valid reads non-zero.
	 */
	do {
		/* check for transmit completes and receives */
		send_done_count = ntohl(stats->send_done_count);
		if (send_done_count != tx->pkt_done)
			myri10ge_tx_done(ss, (int)send_done_count);
	} while (*((volatile uint8_t *) &stats->valid));

	if (stats->stats_updated) {
		if (mgp->link_state != stats->link_up || stats->link_down) {
			mgp->link_state = stats->link_up;
			if (stats->link_down) {
				/*
				 * myri10ge_stop_locked() waits for
				 * down_cnt to change
				 */
				mgp->down_cnt += stats->link_down;
				mgp->link_state = 0;
			}
			if (mgp->link_state) {
				if (myri10ge_verbose)
					printf("%s: link up\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_UP);
			} else {
				if (myri10ge_verbose)
					printf("%s: link down\n", mgp->name);
				mac_link_update(mgp->mh, LINK_STATE_DOWN);
			}
			MYRI10GE_NIC_STAT_INC(link_changes);
		}
		/* a change in the firmware's tag count is logged */
		if (mgp->rdma_tags_available !=
		    ntohl(ss->fw_stats->rdma_tags_available)) {
			mgp->rdma_tags_available =
			    ntohl(ss->fw_stats->rdma_tags_available);
			cmn_err(CE_NOTE, "%s: RDMA timed out! "
			    "%d tags left\n", mgp->name,
			    mgp->rdma_tags_available);
		}
	}

	mb();
	/* check to see if we have rx token to pass back */
	if (valid & 0x1) {
		mutex_enter(&ss->poll_lock);
		if (ss->rx_polling) {
			/* polling mode: remember the token, don't claim */
			ss->rx_token = 1;
		} else {
			*ss->irq_claim = BE_32(3);
			ss->rx_token = 0;
		}
		mutex_exit(&ss->poll_lock);
	}
	/* the second claim word is written on every claimed interrupt */
	*(ss->irq_claim + 1) = BE_32(3);
	return (DDI_INTR_CLAIMED);
}
2759 
2760 /*
2761  * Add or remove a multicast address.  This is called with our
2762  * macinfo's lock held by GLD, so we do not need to worry about
2763  * our own locking here.
2764  */
2765 static int
2766 myri10ge_m_multicst(void *arg, boolean_t add, const uint8_t *multicastaddr)
2767 {
2768 	myri10ge_cmd_t cmd;
2769 	struct myri10ge_priv *mgp = arg;
2770 	int status, join_leave;
2771 
2772 	if (add)
2773 		join_leave = MXGEFW_JOIN_MULTICAST_GROUP;
2774 	else
2775 		join_leave = MXGEFW_LEAVE_MULTICAST_GROUP;
2776 	(void) memcpy(&cmd.data0, multicastaddr, 4);
2777 	(void) memcpy(&cmd.data1, multicastaddr + 4, 2);
2778 	cmd.data0 = htonl(cmd.data0);
2779 	cmd.data1 = htonl(cmd.data1);
2780 	status = myri10ge_send_cmd(mgp, join_leave, &cmd);
2781 	if (status == 0)
2782 		return (0);
2783 
2784 	cmn_err(CE_WARN, "%s: failed to set multicast address\n",
2785 	    mgp->name);
2786 	return (status);
2787 }
2788 
2789 
2790 static int
2791 myri10ge_m_promisc(void *arg, boolean_t on)
2792 {
2793 	struct myri10ge_priv *mgp = arg;
2794 
2795 	myri10ge_change_promisc(mgp, on);
2796 	return (0);
2797 }
2798 
2799 /*
2800  * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
2801  *  backwards one at a time and handle ring wraps
2802  */
2803 
2804 static inline void
2805 myri10ge_submit_req_backwards(myri10ge_tx_ring_t *tx,
2806     mcp_kreq_ether_send_t *src, int cnt)
2807 {
2808 	int idx, starting_slot;
2809 	starting_slot = tx->req;
2810 	while (cnt > 1) {
2811 		cnt--;
2812 		idx = (starting_slot + cnt) & tx->mask;
2813 		myri10ge_pio_copy(&tx->lanai[idx],
2814 		    &src[cnt], sizeof (*src));
2815 		mb();
2816 	}
2817 }
2818 
/*
 * copy an array of mcp_kreq_ether_send_t's to the mcp.  Copy
 * at most 32 bytes at a time, so as to avoid involving the software
 * pio handler in the nic.   We re-write the first segment's flags
 * to mark them valid only after writing the entire chain
 *
 * tx:  transmit ring; tx->req is advanced by cnt
 * src: array of cnt send requests; src[0].flags is cleared while the
 *      chain is written and restored before this function returns
 * cnt: number of requests in src
 */

static inline void
myri10ge_submit_req(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *src,
    int cnt)
{
	int idx, i;
	uint32_t *src_ints, *dst_ints;
	mcp_kreq_ether_send_t *srcp, *dstp, *dst;
	uint8_t last_flags;

	idx = tx->req & tx->mask;

	/* save the first request's flags and write it with flags == 0 */
	last_flags = src->flags;
	src->flags = 0;
	mb();
	dst = dstp = &tx->lanai[idx];
	srcp = src;

	if ((idx + cnt) < tx->mask) {
		/* no ring wrap: copy forwards, two requests per PIO copy */
		for (i = 0; i < (cnt - 1); i += 2) {
			myri10ge_pio_copy(dstp, srcp, 2 * sizeof (*src));
			mb(); /* force write every 32 bytes */
			srcp += 2;
			dstp += 2;
		}
	} else {
		/*
		 * submit all but the first request, and ensure
		 *  that it is submitted below
		 */
		myri10ge_submit_req_backwards(tx, src, cnt);
		i = 0;
	}
	/*
	 * One request is still unwritten here: the odd trailing one
	 * after the pairwise loop, or the first one after the
	 * backwards copy (srcp/dstp still point at slot 0 then).
	 */
	if (i < cnt) {
		/* submit the first request */
		myri10ge_pio_copy(dstp, srcp, sizeof (*src));
		mb(); /* barrier before setting valid flag */
	}

	/* re-write the last 32-bits with the valid flags */
	src->flags |= last_flags;
	/* word index 3 of the first request is rewritten in the ring */
	src_ints = (uint32_t *)src;
	src_ints += 3;
	dst_ints = (uint32_t *)dst;
	dst_ints += 3;
	*dst_ints =  *src_ints;
	tx->req += cnt;
	mb();
	/* notify NIC to poll this tx ring */
	if (!tx->active && tx->go != NULL) {
		*(int *)(void *)tx->go = 1;
		tx->active = 1;
		tx->activate++;
		mb();
	}
}
2881 
2882 /* ARGSUSED */
2883 static inline void
2884 myri10ge_lso_info_get(mblk_t *mp, uint32_t *mss, uint32_t *flags)
2885 {
2886 	uint32_t lso_flag;
2887 	lso_info_get(mp, mss, &lso_flag);
2888 	(*flags) |= lso_flag;
2889 }
2890 
2891 
2892 /* like pullupmsg, except preserve hcksum/LSO attributes */
2893 static int
2894 myri10ge_pullup(struct myri10ge_slice_state *ss, mblk_t *mp)
2895 {
2896 	uint32_t start, stuff, tx_offload_flags, mss;
2897 	int ok;
2898 
2899 	mss = 0;
2900 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, NULL, NULL,
2901 	    &tx_offload_flags);
2902 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
2903 
2904 	ok = pullupmsg(mp, -1);
2905 	if (!ok) {
2906 		printf("pullupmsg failed");
2907 		return (DDI_FAILURE);
2908 	}
2909 	MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_pullup);
2910 	(void) hcksum_assoc(mp, NULL, NULL, start, stuff, NULL,
2911 	    NULL, tx_offload_flags, 0);
2912 	if (tx_offload_flags & HW_LSO)
2913 		DB_LSOMSS(mp) = (uint16_t)mss;
2914 	lso_info_set(mp, mss, tx_offload_flags);
2915 	return (DDI_SUCCESS);
2916 }
2917 
2918 static inline void
2919 myri10ge_tx_stat(struct myri10ge_tx_pkt_stats *s, struct ether_header *eh,
2920     int opackets, int obytes)
2921 {
2922 	s->un.all = 0;
2923 	if (eh->ether_dhost.ether_addr_octet[0] & 1) {
2924 		if (0 == (bcmp(eh->ether_dhost.ether_addr_octet,
2925 		    myri10ge_broadcastaddr, sizeof (eh->ether_dhost))))
2926 			s->un.s.brdcstxmt = 1;
2927 		else
2928 			s->un.s.multixmt = 1;
2929 	}
2930 	s->un.s.opackets = (uint16_t)opackets;
2931 	s->un.s.obytes = obytes;
2932 }
2933 
2934 static int
2935 myri10ge_tx_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
2936     mcp_kreq_ether_send_t *req)
2937 {
2938 	myri10ge_tx_ring_t *tx = &ss->tx;
2939 	caddr_t ptr;
2940 	struct myri10ge_tx_copybuf *cp;
2941 	mblk_t *bp;
2942 	int idx, mblen, avail;
2943 	uint16_t len;
2944 
2945 	mutex_enter(&tx->lock);
2946 	avail = tx->mask - (tx->req - tx->done);
2947 	if (avail <= 1) {
2948 		mutex_exit(&tx->lock);
2949 		return (EBUSY);
2950 	}
2951 	idx = tx->req & tx->mask;
2952 	cp = &tx->cp[idx];
2953 	ptr = cp->va;
2954 	for (len = 0, bp = mp; bp != NULL; bp = bp->b_cont) {
2955 		mblen = MBLKL(bp);
2956 		bcopy(bp->b_rptr, ptr, mblen);
2957 		ptr += mblen;
2958 		len += mblen;
2959 	}
2960 	/* ensure runts are padded to 60 bytes */
2961 	if (len < 60) {
2962 		bzero(ptr, 64 - len);
2963 		len = 60;
2964 	}
2965 	req->addr_low = cp->dma.low;
2966 	req->addr_high = cp->dma.high;
2967 	req->length = htons(len);
2968 	req->pad = 0;
2969 	req->rdma_count = 1;
2970 	myri10ge_tx_stat(&tx->info[idx].stat,
2971 	    (struct ether_header *)(void *)cp->va, 1, len);
2972 	(void) ddi_dma_sync(cp->dma.handle, 0, len, DDI_DMA_SYNC_FORDEV);
2973 	myri10ge_submit_req(&ss->tx, req, 1);
2974 	mutex_exit(&tx->lock);
2975 	freemsg(mp);
2976 	return (DDI_SUCCESS);
2977 }
2978 
2979 
2980 static void
2981 myri10ge_send_locked(myri10ge_tx_ring_t *tx, mcp_kreq_ether_send_t *req_list,
2982     struct myri10ge_tx_buffer_state *tx_info,
2983     int count)
2984 {
2985 	int i, idx;
2986 
2987 	idx = 0; /* gcc -Wuninitialized */
2988 	/* store unmapping and bp info for tx irq handler */
2989 	for (i = 0; i < count; i++) {
2990 		idx = (tx->req + i) & tx->mask;
2991 		tx->info[idx].m = tx_info[i].m;
2992 		tx->info[idx].handle = tx_info[i].handle;
2993 	}
2994 	tx->info[idx].stat.un.all = tx_info[0].stat.un.all;
2995 
2996 	/* submit the frame to the nic */
2997 	myri10ge_submit_req(tx, req_list, count);
2998 
2999 
3000 }
3001 
3002 
3003 
3004 static void
3005 myri10ge_copydata(mblk_t *mp, int off, int len, caddr_t buf)
3006 {
3007 	mblk_t *bp;
3008 	int seglen;
3009 	uint_t count;
3010 
3011 	bp = mp;
3012 
3013 	while (off > 0) {
3014 		seglen = MBLKL(bp);
3015 		if (off < seglen)
3016 			break;
3017 		off -= seglen;
3018 		bp = bp->b_cont;
3019 	}
3020 	while (len > 0) {
3021 		seglen = MBLKL(bp);
3022 		count = min(seglen - off, len);
3023 		bcopy(bp->b_rptr + off, buf, count);
3024 		len -= count;
3025 		buf += count;
3026 		off = 0;
3027 		bp = bp->b_cont;
3028 	}
3029 }
3030 
3031 static int
3032 myri10ge_ether_parse_header(mblk_t *mp)
3033 {
3034 	struct ether_header eh_copy;
3035 	struct ether_header *eh;
3036 	int eth_hdr_len, seglen;
3037 
3038 	seglen = MBLKL(mp);
3039 	eth_hdr_len = sizeof (*eh);
3040 	if (seglen < eth_hdr_len) {
3041 		myri10ge_copydata(mp, 0, eth_hdr_len, (caddr_t)&eh_copy);
3042 		eh = &eh_copy;
3043 	} else {
3044 		eh = (struct ether_header *)(void *)mp->b_rptr;
3045 	}
3046 	if (eh->ether_type == BE_16(ETHERTYPE_VLAN)) {
3047 		eth_hdr_len += 4;
3048 	}
3049 
3050 	return (eth_hdr_len);
3051 }
3052 
3053 static int
3054 myri10ge_lso_parse_header(mblk_t *mp, int off)
3055 {
3056 	char buf[128];
3057 	int seglen;
3058 	struct ip *ip;
3059 	struct tcphdr *tcp;
3060 
3061 	seglen = MBLKL(mp);
3062 	if (seglen < off + sizeof (*ip)) {
3063 		myri10ge_copydata(mp, off, sizeof (*ip), buf);
3064 		ip = (struct ip *)(void *)buf;
3065 	} else {
3066 		ip = (struct ip *)(void *)(mp->b_rptr + off);
3067 	}
3068 	if (seglen < off + (ip->ip_hl << 2) + sizeof (*tcp)) {
3069 		myri10ge_copydata(mp, off,
3070 		    (ip->ip_hl << 2) + sizeof (*tcp), buf);
3071 		ip = (struct ip *)(void *)buf;
3072 	}
3073 	tcp = (struct tcphdr *)(void *)((char *)ip + (ip->ip_hl << 2));
3074 	return (off + ((ip->ip_hl + tcp->th_off) << 2));
3075 }
3076 
3077 static int
3078 myri10ge_tx_tso_copy(struct myri10ge_slice_state *ss, mblk_t *mp,
3079     mcp_kreq_ether_send_t *req_list, int hdr_size, int pkt_size,
3080     uint16_t mss, uint8_t cksum_offset)
3081 {
3082 	myri10ge_tx_ring_t *tx = &ss->tx;
3083 	struct myri10ge_priv *mgp = ss->mgp;
3084 	mblk_t *bp;
3085 	mcp_kreq_ether_send_t *req;
3086 	struct myri10ge_tx_copybuf *cp;
3087 	caddr_t rptr, ptr;
3088 	int mblen, count, cum_len, mss_resid, tx_req, pkt_size_tmp;
3089 	int resid, avail, idx, hdr_size_tmp, tx_boundary;
3090 	int rdma_count;
3091 	uint32_t seglen, len, boundary, low, high_swapped;
3092 	uint16_t pseudo_hdr_offset = htons(mss);
3093 	uint8_t flags;
3094 
3095 	tx_boundary = mgp->tx_boundary;
3096 	hdr_size_tmp = hdr_size;
3097 	resid = tx_boundary;
3098 	count = 1;
3099 	mutex_enter(&tx->lock);
3100 
3101 	/* check to see if the slots are really there */
3102 	avail = tx->mask - (tx->req - tx->done);
3103 	if (unlikely(avail <=  MYRI10GE_MAX_SEND_DESC_TSO)) {
3104 		atomic_add_32(&tx->stall, 1);
3105 		mutex_exit(&tx->lock);
3106 		return (EBUSY);
3107 	}
3108 
3109 	/* copy */
3110 	cum_len = -hdr_size;
3111 	count = 0;
3112 	req = req_list;
3113 	idx = tx->mask & tx->req;
3114 	cp = &tx->cp[idx];
3115 	low = ntohl(cp->dma.low);
3116 	ptr = cp->va;
3117 	cp->len = 0;
3118 	if (mss) {
3119 		int payload = pkt_size - hdr_size;
3120 		uint16_t opackets = (payload / mss) + ((payload % mss) != 0);
3121 		tx->info[idx].ostat.opackets = opackets;
3122 		tx->info[idx].ostat.obytes = (opackets - 1) * hdr_size
3123 		    + pkt_size;
3124 	}
3125 	hdr_size_tmp = hdr_size;
3126 	mss_resid = mss;
3127 	flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3128 	tx_req = tx->req;
3129 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3130 		mblen = MBLKL(bp);
3131 		rptr = (caddr_t)bp->b_rptr;
3132 		len = min(hdr_size_tmp, mblen);
3133 		if (len) {
3134 			bcopy(rptr, ptr, len);
3135 			rptr += len;
3136 			ptr += len;
3137 			resid -= len;
3138 			mblen -= len;
3139 			hdr_size_tmp -= len;
3140 			cp->len += len;
3141 			if (hdr_size_tmp)
3142 				continue;
3143 			if (resid < mss) {
3144 				tx_req++;
3145 				idx = tx->mask & tx_req;
3146 				cp = &tx->cp[idx];
3147 				low = ntohl(cp->dma.low);
3148 				ptr = cp->va;
3149 				resid = tx_boundary;
3150 			}
3151 		}
3152 		while (mblen) {
3153 			len = min(mss_resid, mblen);
3154 			bcopy(rptr, ptr, len);
3155 			mss_resid -= len;
3156 			resid -= len;
3157 			mblen -= len;
3158 			rptr += len;
3159 			ptr += len;
3160 			cp->len += len;
3161 			if (mss_resid == 0) {
3162 				mss_resid = mss;
3163 				if (resid < mss) {
3164 					tx_req++;
3165 					idx = tx->mask & tx_req;
3166 					cp = &tx->cp[idx];
3167 					cp->len = 0;
3168 					low = ntohl(cp->dma.low);
3169 					ptr = cp->va;
3170 					resid = tx_boundary;
3171 				}
3172 			}
3173 		}
3174 	}
3175 
3176 	req = req_list;
3177 	pkt_size_tmp = pkt_size;
3178 	count = 0;
3179 	rdma_count = 0;
3180 	tx_req = tx->req;
3181 	while (pkt_size_tmp) {
3182 		idx = tx->mask & tx_req;
3183 		cp = &tx->cp[idx];
3184 		high_swapped = cp->dma.high;
3185 		low = ntohl(cp->dma.low);
3186 		len = cp->len;
3187 		if (len == 0) {
3188 			printf("len=0! pkt_size_tmp=%d, pkt_size=%d\n",
3189 			    pkt_size_tmp, pkt_size);
3190 			for (bp = mp; bp != NULL; bp = bp->b_cont) {
3191 				mblen = MBLKL(bp);
3192 				printf("mblen:%d\n", mblen);
3193 			}
3194 			pkt_size_tmp = pkt_size;
3195 			tx_req = tx->req;
3196 			while (pkt_size_tmp > 0) {
3197 				idx = tx->mask & tx_req;
3198 				cp = &tx->cp[idx];
3199 				printf("cp->len = %d\n", cp->len);
3200 				pkt_size_tmp -= cp->len;
3201 				tx_req++;
3202 			}
3203 			printf("dropped\n");
3204 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3205 			goto done;
3206 		}
3207 		pkt_size_tmp -= len;
3208 		while (len) {
3209 			while (len) {
3210 				uint8_t flags_next;
3211 				int cum_len_next;
3212 
3213 				boundary = (low + mgp->tx_boundary) &
3214 				    ~(mgp->tx_boundary - 1);
3215 				seglen = boundary - low;
3216 				if (seglen > len)
3217 					seglen = len;
3218 
3219 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3220 				cum_len_next = cum_len + seglen;
3221 				(req-rdma_count)->rdma_count = rdma_count + 1;
3222 				if (likely(cum_len >= 0)) {
3223 					/* payload */
3224 					int next_is_first, chop;
3225 
3226 					chop = (cum_len_next > mss);
3227 					cum_len_next = cum_len_next % mss;
3228 					next_is_first = (cum_len_next == 0);
3229 					flags |= chop *
3230 					    MXGEFW_FLAGS_TSO_CHOP;
3231 					flags_next |= next_is_first *
3232 					    MXGEFW_FLAGS_FIRST;
3233 					rdma_count |= -(chop | next_is_first);
3234 					rdma_count += chop & !next_is_first;
3235 				} else if (likely(cum_len_next >= 0)) {
3236 					/* header ends */
3237 					int small;
3238 
3239 					rdma_count = -1;
3240 					cum_len_next = 0;
3241 					seglen = -cum_len;
3242 					small = (mss <= MXGEFW_SEND_SMALL_SIZE);
3243 					flags_next = MXGEFW_FLAGS_TSO_PLD |
3244 					    MXGEFW_FLAGS_FIRST |
3245 					    (small * MXGEFW_FLAGS_SMALL);
3246 				}
3247 				req->addr_high = high_swapped;
3248 				req->addr_low = htonl(low);
3249 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3250 				req->pad = 0; /* complete solid 16-byte block */
3251 				req->rdma_count = 1;
3252 				req->cksum_offset = cksum_offset;
3253 				req->length = htons(seglen);
3254 				req->flags = flags | ((cum_len & 1) *
3255 				    MXGEFW_FLAGS_ALIGN_ODD);
3256 				if (cksum_offset > seglen)
3257 					cksum_offset -= seglen;
3258 				else
3259 					cksum_offset = 0;
3260 				low += seglen;
3261 				len -= seglen;
3262 				cum_len = cum_len_next;
3263 				req++;
3264 				req->flags = 0;
3265 				flags = flags_next;
3266 				count++;
3267 				rdma_count++;
3268 			}
3269 		}
3270 		tx_req++;
3271 	}
3272 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3273 	do {
3274 		req--;
3275 		req->flags |= MXGEFW_FLAGS_TSO_LAST;
3276 	} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3277 	    MXGEFW_FLAGS_FIRST)));
3278 
3279 	myri10ge_submit_req(tx, req_list, count);
3280 done:
3281 	mutex_exit(&tx->lock);
3282 	freemsg(mp);
3283 	return (DDI_SUCCESS);
3284 }
3285 
3286 /*
3287  * Try to send the chain of buffers described by the mp.  We must not
3288  * encapsulate more than eth->tx.req - eth->tx.done, or
3289  * MXGEFW_MAX_SEND_DESC, whichever is more.
3290  */
3291 
3292 static int
3293 myri10ge_send(struct myri10ge_slice_state *ss, mblk_t *mp,
3294     mcp_kreq_ether_send_t *req_list, struct myri10ge_tx_buffer_state *tx_info)
3295 {
3296 	struct myri10ge_priv *mgp = ss->mgp;
3297 	myri10ge_tx_ring_t *tx = &ss->tx;
3298 	mcp_kreq_ether_send_t *req;
3299 	struct myri10ge_tx_dma_handle *handles, *dma_handle = NULL;
3300 	mblk_t  *bp;
3301 	ddi_dma_cookie_t cookie;
3302 	int err, rv, count, avail, mblen, try_pullup, i, max_segs, maclen,
3303 	    rdma_count, cum_len, lso_hdr_size;
3304 	uint32_t start, stuff, tx_offload_flags;
3305 	uint32_t seglen, len, mss, boundary, low, high_swapped;
3306 	uint_t ncookies;
3307 	uint16_t pseudo_hdr_offset;
3308 	uint8_t flags, cksum_offset, odd_flag;
3309 	int pkt_size;
3310 	int lso_copy = myri10ge_lso_copy;
3311 	try_pullup = 1;
3312 
3313 again:
3314 	/* Setup checksum offloading, if needed */
3315 	hcksum_retrieve(mp, NULL, NULL, &start, &stuff, NULL, NULL,
3316 	    &tx_offload_flags);
3317 	myri10ge_lso_info_get(mp, &mss, &tx_offload_flags);
3318 	if (tx_offload_flags & HW_LSO) {
3319 		max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3320 		if ((tx_offload_flags & HCK_PARTIALCKSUM) == 0) {
3321 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_lsobadflags);
3322 			freemsg(mp);
3323 			return (DDI_SUCCESS);
3324 		}
3325 	} else {
3326 		max_segs = MXGEFW_MAX_SEND_DESC;
3327 		mss = 0;
3328 	}
3329 	req = req_list;
3330 	cksum_offset = 0;
3331 	pseudo_hdr_offset = 0;
3332 
3333 	/* leave an extra slot keep the ring from wrapping */
3334 	avail = tx->mask - (tx->req - tx->done);
3335 
3336 	/*
3337 	 * If we have > MXGEFW_MAX_SEND_DESC, then any over-length
3338 	 * message will need to be pulled up in order to fit.
3339 	 * Otherwise, we are low on transmit descriptors, it is
3340 	 * probably better to stall and try again rather than pullup a
3341 	 * message to fit.
3342 	 */
3343 
3344 	if (avail < max_segs) {
3345 		err = EBUSY;
3346 		atomic_add_32(&tx->stall_early, 1);
3347 		goto stall;
3348 	}
3349 
3350 	/* find out how long the frame is and how many segments it is */
3351 	count = 0;
3352 	odd_flag = 0;
3353 	pkt_size = 0;
3354 	flags = (MXGEFW_FLAGS_NO_TSO | MXGEFW_FLAGS_FIRST);
3355 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3356 		dblk_t *dbp;
3357 		mblen = MBLKL(bp);
3358 		if (mblen == 0) {
3359 			/*
3360 			 * we can't simply skip over 0-length mblks
3361 			 * because the hardware can't deal with them,
3362 			 * and we could leak them.
3363 			 */
3364 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_zero_len);
3365 			err = EIO;
3366 			goto pullup;
3367 		}
3368 		/*
3369 		 * There's no advantage to copying most gesballoc
3370 		 * attached blocks, so disable lso copy in that case
3371 		 */
3372 		if (mss && lso_copy == 1 && ((dbp = bp->b_datap) != NULL)) {
3373 			if ((void *)dbp->db_lastfree != myri10ge_db_lastfree) {
3374 				lso_copy = 0;
3375 			}
3376 		}
3377 		pkt_size += mblen;
3378 		count++;
3379 	}
3380 
3381 	/* Try to pull up excessivly long chains */
3382 	if (count >= max_segs) {
3383 		err = myri10ge_pullup(ss, mp);
3384 		if (likely(err == DDI_SUCCESS)) {
3385 			count = 1;
3386 		} else {
3387 			if (count <  MYRI10GE_MAX_SEND_DESC_TSO) {
3388 				/*
3389 				 * just let the h/w send it, it will be
3390 				 * inefficient, but us better than dropping
3391 				 */
3392 				max_segs = MYRI10GE_MAX_SEND_DESC_TSO;
3393 			} else {
3394 				/* drop it */
3395 				MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3396 				freemsg(mp);
3397 				return (0);
3398 			}
3399 		}
3400 	}
3401 
3402 	cum_len = 0;
3403 	maclen = myri10ge_ether_parse_header(mp);
3404 
3405 	if (tx_offload_flags & HCK_PARTIALCKSUM) {
3406 
3407 		cksum_offset = start + maclen;
3408 		pseudo_hdr_offset = htons(stuff + maclen);
3409 		odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
3410 		flags |= MXGEFW_FLAGS_CKSUM;
3411 	}
3412 
3413 	lso_hdr_size = 0; /* -Wunitinialized */
3414 	if (mss) { /* LSO */
3415 		/* this removes any CKSUM flag from before */
3416 		flags = (MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST);
3417 		/*
3418 		 * parse the headers and set cum_len to a negative
3419 		 * value to reflect the offset of the TCP payload
3420 		 */
3421 		lso_hdr_size =  myri10ge_lso_parse_header(mp, maclen);
3422 		cum_len = -lso_hdr_size;
3423 		if ((mss < mgp->tx_boundary) && lso_copy) {
3424 			err = myri10ge_tx_tso_copy(ss, mp, req_list,
3425 			    lso_hdr_size, pkt_size, mss, cksum_offset);
3426 			return (err);
3427 		}
3428 
3429 		/*
3430 		 * for TSO, pseudo_hdr_offset holds mss.  The firmware
3431 		 * figures out where to put the checksum by parsing
3432 		 * the header.
3433 		 */
3434 
3435 		pseudo_hdr_offset = htons(mss);
3436 	} else if (pkt_size <= MXGEFW_SEND_SMALL_SIZE) {
3437 		flags |= MXGEFW_FLAGS_SMALL;
3438 		if (pkt_size < myri10ge_tx_copylen) {
3439 			req->cksum_offset = cksum_offset;
3440 			req->pseudo_hdr_offset = pseudo_hdr_offset;
3441 			req->flags = flags;
3442 			err = myri10ge_tx_copy(ss, mp, req);
3443 			return (err);
3444 		}
3445 		cum_len = 0;
3446 	}
3447 
3448 	/* pull one DMA handle for each bp from our freelist */
3449 	handles = NULL;
3450 	err = myri10ge_alloc_tx_handles(ss, count, &handles);
3451 	if (err != DDI_SUCCESS) {
3452 		err = DDI_FAILURE;
3453 		goto stall;
3454 	}
3455 	count = 0;
3456 	rdma_count = 0;
3457 	for (bp = mp; bp != NULL; bp = bp->b_cont) {
3458 		mblen = MBLKL(bp);
3459 		dma_handle = handles;
3460 		handles = handles->next;
3461 
3462 		rv = ddi_dma_addr_bind_handle(dma_handle->h, NULL,
3463 		    (caddr_t)bp->b_rptr, mblen,
3464 		    DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
3465 		    &cookie, &ncookies);
3466 		if (unlikely(rv != DDI_DMA_MAPPED)) {
3467 			err = EIO;
3468 			try_pullup = 0;
3469 			dma_handle->next = handles;
3470 			handles = dma_handle;
3471 			goto abort_with_handles;
3472 		}
3473 
3474 		/* reserve the slot */
3475 		tx_info[count].m = bp;
3476 		tx_info[count].handle = dma_handle;
3477 
3478 		for (; ; ) {
3479 			low = MYRI10GE_LOWPART_TO_U32(cookie.dmac_laddress);
3480 			high_swapped =
3481 			    htonl(MYRI10GE_HIGHPART_TO_U32(
3482 			    cookie.dmac_laddress));
3483 			len = (uint32_t)cookie.dmac_size;
3484 			while (len) {
3485 				uint8_t flags_next;
3486 				int cum_len_next;
3487 
3488 				boundary = (low + mgp->tx_boundary) &
3489 				    ~(mgp->tx_boundary - 1);
3490 				seglen = boundary - low;
3491 				if (seglen > len)
3492 					seglen = len;
3493 
3494 				flags_next = flags & ~MXGEFW_FLAGS_FIRST;
3495 				cum_len_next = cum_len + seglen;
3496 				if (mss) {
3497 					(req-rdma_count)->rdma_count =
3498 					    rdma_count + 1;
3499 					if (likely(cum_len >= 0)) {
3500 						/* payload */
3501 						int next_is_first, chop;
3502 
3503 						chop = (cum_len_next > mss);
3504 						cum_len_next =
3505 						    cum_len_next % mss;
3506 						next_is_first =
3507 						    (cum_len_next == 0);
3508 						flags |= chop *
3509 						    MXGEFW_FLAGS_TSO_CHOP;
3510 						flags_next |= next_is_first *
3511 						    MXGEFW_FLAGS_FIRST;
3512 						rdma_count |=
3513 						    -(chop | next_is_first);
3514 						rdma_count +=
3515 						    chop & !next_is_first;
3516 					} else if (likely(cum_len_next >= 0)) {
3517 						/* header ends */
3518 						int small;
3519 
3520 						rdma_count = -1;
3521 						cum_len_next = 0;
3522 						seglen = -cum_len;
3523 						small = (mss <=
3524 						    MXGEFW_SEND_SMALL_SIZE);
3525 						flags_next =
3526 						    MXGEFW_FLAGS_TSO_PLD
3527 						    | MXGEFW_FLAGS_FIRST
3528 						    | (small *
3529 						    MXGEFW_FLAGS_SMALL);
3530 					}
3531 				}
3532 				req->addr_high = high_swapped;
3533 				req->addr_low = htonl(low);
3534 				req->pseudo_hdr_offset = pseudo_hdr_offset;
3535 				req->pad = 0; /* complete solid 16-byte block */
3536 				req->rdma_count = 1;
3537 				req->cksum_offset = cksum_offset;
3538 				req->length = htons(seglen);
3539 				req->flags = flags | ((cum_len & 1) * odd_flag);
3540 				if (cksum_offset > seglen)
3541 					cksum_offset -= seglen;
3542 				else
3543 					cksum_offset = 0;
3544 				low += seglen;
3545 				len -= seglen;
3546 				cum_len = cum_len_next;
3547 				count++;
3548 				rdma_count++;
3549 				/*  make sure all the segments will fit */
3550 				if (unlikely(count >= max_segs)) {
3551 					MYRI10GE_ATOMIC_SLICE_STAT_INC(
3552 					    xmit_lowbuf);
3553 					/* may try a pullup */
3554 					err = EBUSY;
3555 					if (try_pullup)
3556 						try_pullup = 2;
3557 					goto abort_with_handles;
3558 				}
3559 				req++;
3560 				req->flags = 0;
3561 				flags = flags_next;
3562 				tx_info[count].m = 0;
3563 			}
3564 			ncookies--;
3565 			if (ncookies == 0)
3566 				break;
3567 			ddi_dma_nextcookie(dma_handle->h, &cookie);
3568 		}
3569 	}
3570 	(req-rdma_count)->rdma_count = (uint8_t)rdma_count;
3571 
3572 	if (mss) {
3573 		do {
3574 			req--;
3575 			req->flags |= MXGEFW_FLAGS_TSO_LAST;
3576 		} while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP |
3577 		    MXGEFW_FLAGS_FIRST)));
3578 	}
3579 
3580 	/* calculate tx stats */
3581 	if (mss) {
3582 		uint16_t opackets;
3583 		int payload;
3584 
3585 		payload = pkt_size - lso_hdr_size;
3586 		opackets = (payload / mss) + ((payload % mss) != 0);
3587 		tx_info[0].stat.un.all = 0;
3588 		tx_info[0].ostat.opackets = opackets;
3589 		tx_info[0].ostat.obytes = (opackets - 1) * lso_hdr_size
3590 		    + pkt_size;
3591 	} else {
3592 		myri10ge_tx_stat(&tx_info[0].stat,
3593 		    (struct ether_header *)(void *)mp->b_rptr, 1, pkt_size);
3594 	}
3595 	mutex_enter(&tx->lock);
3596 
3597 	/* check to see if the slots are really there */
3598 	avail = tx->mask - (tx->req - tx->done);
3599 	if (unlikely(avail <= count)) {
3600 		mutex_exit(&tx->lock);
3601 		err = 0;
3602 		goto late_stall;
3603 	}
3604 
3605 	myri10ge_send_locked(tx, req_list, tx_info, count);
3606 	mutex_exit(&tx->lock);
3607 	return (DDI_SUCCESS);
3608 
3609 late_stall:
3610 	try_pullup = 0;
3611 	atomic_add_32(&tx->stall_late, 1);
3612 
3613 abort_with_handles:
3614 	/* unbind and free handles from previous mblks */
3615 	for (i = 0; i < count; i++) {
3616 		bp = tx_info[i].m;
3617 		tx_info[i].m = 0;
3618 		if (bp) {
3619 			dma_handle = tx_info[i].handle;
3620 			(void) ddi_dma_unbind_handle(dma_handle->h);
3621 			dma_handle->next = handles;
3622 			handles = dma_handle;
3623 			tx_info[i].handle = NULL;
3624 			tx_info[i].m = NULL;
3625 		}
3626 	}
3627 	myri10ge_free_tx_handle_slist(tx, handles);
3628 pullup:
3629 	if (try_pullup) {
3630 		err = myri10ge_pullup(ss, mp);
3631 		if (err != DDI_SUCCESS && try_pullup == 2) {
3632 			/* drop */
3633 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3634 			freemsg(mp);
3635 			return (0);
3636 		}
3637 		try_pullup = 0;
3638 		goto again;
3639 	}
3640 
3641 stall:
3642 	if (err != 0) {
3643 		if (err == EBUSY) {
3644 			atomic_add_32(&tx->stall, 1);
3645 		} else {
3646 			MYRI10GE_ATOMIC_SLICE_STAT_INC(xmit_err);
3647 		}
3648 	}
3649 	return (err);
3650 }
3651 
3652 static mblk_t *
3653 myri10ge_send_wrapper(void *arg, mblk_t *mp)
3654 {
3655 	struct myri10ge_slice_state *ss = arg;
3656 	int err = 0;
3657 	mcp_kreq_ether_send_t *req_list;
3658 #if defined(__i386)
3659 	/*
3660 	 * We need about 2.5KB of scratch space to handle transmits.
3661 	 * i86pc has only 8KB of kernel stack space, so we malloc the
3662 	 * scratch space there rather than keeping it on the stack.
3663 	 */
3664 	size_t req_size, tx_info_size;
3665 	struct myri10ge_tx_buffer_state *tx_info;
3666 	caddr_t req_bytes;
3667 
3668 	req_size = sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3669 	    + 8;
3670 	req_bytes = kmem_alloc(req_size, KM_SLEEP);
3671 	tx_info_size = sizeof (*tx_info) * (MYRI10GE_MAX_SEND_DESC_TSO + 1);
3672 	tx_info = kmem_alloc(tx_info_size, KM_SLEEP);
3673 #else
3674 	char req_bytes[sizeof (*req_list) * (MYRI10GE_MAX_SEND_DESC_TSO + 4)
3675 	    + 8];
3676 	struct myri10ge_tx_buffer_state tx_info[MYRI10GE_MAX_SEND_DESC_TSO + 1];
3677 #endif
3678 
3679 	/* ensure req_list entries are aligned to 8 bytes */
3680 	req_list = (struct mcp_kreq_ether_send *)
3681 	    (((unsigned long)req_bytes + 7UL) & ~7UL);
3682 
3683 	err = myri10ge_send(ss, mp, req_list, tx_info);
3684 
3685 #if defined(__i386)
3686 	kmem_free(tx_info, tx_info_size);
3687 	kmem_free(req_bytes, req_size);
3688 #endif
3689 	if (err)
3690 		return (mp);
3691 	else
3692 		return (NULL);
3693 }
3694 
3695 static int
3696 myri10ge_addmac(void *arg, const uint8_t *mac_addr)
3697 {
3698 	struct myri10ge_priv *mgp = arg;
3699 	int err;
3700 
3701 	if (mac_addr == NULL)
3702 		return (EINVAL);
3703 
3704 	mutex_enter(&mgp->intrlock);
3705 	if (mgp->macaddr_cnt) {
3706 		mutex_exit(&mgp->intrlock);
3707 		return (ENOSPC);
3708 	}
3709 	err = myri10ge_m_unicst(mgp, mac_addr);
3710 	if (!err)
3711 		mgp->macaddr_cnt++;
3712 
3713 	mutex_exit(&mgp->intrlock);
3714 	if (err)
3715 		return (err);
3716 
3717 	bcopy(mac_addr, mgp->mac_addr, sizeof (mgp->mac_addr));
3718 	return (0);
3719 }
3720 
3721 /*ARGSUSED*/
3722 static int
3723 myri10ge_remmac(void *arg, const uint8_t *mac_addr)
3724 {
3725 	struct myri10ge_priv *mgp = arg;
3726 
3727 	mutex_enter(&mgp->intrlock);
3728 	mgp->macaddr_cnt--;
3729 	mutex_exit(&mgp->intrlock);
3730 
3731 	return (0);
3732 }
3733 
3734 /*ARGSUSED*/
3735 static void
3736 myri10ge_fill_group(void *arg, mac_ring_type_t rtype, const int index,
3737     mac_group_info_t *infop, mac_group_handle_t gh)
3738 {
3739 	struct myri10ge_priv *mgp = arg;
3740 
3741 	if (rtype != MAC_RING_TYPE_RX)
3742 		return;
3743 
3744 	infop->mgi_driver = (mac_group_driver_t)mgp;
3745 	infop->mgi_start = NULL;
3746 	infop->mgi_stop = NULL;
3747 	infop->mgi_addmac = myri10ge_addmac;
3748 	infop->mgi_remmac = myri10ge_remmac;
3749 	infop->mgi_count = mgp->num_slices;
3750 }
3751 
3752 static int
3753 myri10ge_ring_start(mac_ring_driver_t rh, uint64_t mr_gen_num)
3754 {
3755 	struct myri10ge_slice_state *ss;
3756 
3757 	ss = (struct myri10ge_slice_state *)rh;
3758 	mutex_enter(&ss->rx_lock);
3759 	ss->rx_gen_num = mr_gen_num;
3760 	mutex_exit(&ss->rx_lock);
3761 	return (0);
3762 }
3763 
3764 static int
3765 myri10ge_rx_ring_intr_disable(mac_intr_handle_t intrh)
3766 {
3767 	struct myri10ge_slice_state *ss;
3768 
3769 	ss = (struct myri10ge_slice_state *)intrh;
3770 	mutex_enter(&ss->poll_lock);
3771 	ss->rx_polling = B_TRUE;
3772 	mutex_exit(&ss->poll_lock);
3773 	return (0);
3774 }
3775 
3776 static int
3777 myri10ge_rx_ring_intr_enable(mac_intr_handle_t intrh)
3778 {
3779 	struct myri10ge_slice_state *ss;
3780 
3781 	ss = (struct myri10ge_slice_state *)intrh;
3782 	mutex_enter(&ss->poll_lock);
3783 	ss->rx_polling = B_FALSE;
3784 	if (ss->rx_token) {
3785 		*ss->irq_claim = BE_32(3);
3786 		ss->rx_token = 0;
3787 	}
3788 	mutex_exit(&ss->poll_lock);
3789 	return (0);
3790 }
3791 
3792 /*ARGSUSED*/
3793 static void
3794 myri10ge_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
3795     const int ring_index, mac_ring_info_t *infop, mac_ring_handle_t rh)
3796 {
3797 	struct myri10ge_priv *mgp = arg;
3798 	struct myri10ge_slice_state *ss;
3799 	mac_intr_t *mintr = &infop->mri_intr;
3800 
3801 	ASSERT((unsigned int)ring_index < mgp->num_slices);
3802 
3803 	ss = &mgp->ss[ring_index];
3804 	switch (rtype) {
3805 	case MAC_RING_TYPE_RX:
3806 		ss->rx_rh = rh;
3807 		infop->mri_driver = (mac_ring_driver_t)ss;
3808 		infop->mri_start = myri10ge_ring_start;
3809 		infop->mri_stop = NULL;
3810 		infop->mri_poll = myri10ge_poll_rx;
3811 		mintr->mi_handle = (mac_intr_handle_t)ss;
3812 		mintr->mi_enable = myri10ge_rx_ring_intr_enable;
3813 		mintr->mi_disable = myri10ge_rx_ring_intr_disable;
3814 		break;
3815 	case MAC_RING_TYPE_TX:
3816 		ss->tx.rh = rh;
3817 		infop->mri_driver = (mac_ring_driver_t)ss;
3818 		infop->mri_start = NULL;
3819 		infop->mri_stop = NULL;
3820 		infop->mri_tx = myri10ge_send_wrapper;
3821 		break;
3822 	default:
3823 		break;
3824 	}
3825 }
3826 
3827 static void
3828 myri10ge_nic_stat_destroy(struct myri10ge_priv *mgp)
3829 {
3830 	if (mgp->ksp_stat == NULL)
3831 		return;
3832 
3833 	kstat_delete(mgp->ksp_stat);
3834 	mgp->ksp_stat = NULL;
3835 }
3836 
3837 static void
3838 myri10ge_slice_stat_destroy(struct myri10ge_slice_state *ss)
3839 {
3840 	if (ss->ksp_stat == NULL)
3841 		return;
3842 
3843 	kstat_delete(ss->ksp_stat);
3844 	ss->ksp_stat = NULL;
3845 }
3846 
3847 static void
3848 myri10ge_info_destroy(struct myri10ge_priv *mgp)
3849 {
3850 	if (mgp->ksp_info == NULL)
3851 		return;
3852 
3853 	kstat_delete(mgp->ksp_info);
3854 	mgp->ksp_info = NULL;
3855 }
3856 
3857 static int
3858 myri10ge_nic_stat_kstat_update(kstat_t *ksp, int rw)
3859 {
3860 	struct myri10ge_nic_stat *ethstat;
3861 	struct myri10ge_priv *mgp;
3862 	mcp_irq_data_t *fw_stats;
3863 
3864 
3865 	if (rw == KSTAT_WRITE)
3866 		return (EACCES);
3867 
3868 	ethstat = (struct myri10ge_nic_stat *)ksp->ks_data;
3869 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3870 	fw_stats = mgp->ss[0].fw_stats;
3871 
3872 	ethstat->dma_read_bw_MBs.value.ul = mgp->read_dma;
3873 	ethstat->dma_write_bw_MBs.value.ul = mgp->write_dma;
3874 	ethstat->dma_read_write_bw_MBs.value.ul = mgp->read_write_dma;
3875 	if (myri10ge_tx_dma_attr.dma_attr_flags & DDI_DMA_FORCE_PHYSICAL)
3876 		ethstat->dma_force_physical.value.ul = 1;
3877 	else
3878 		ethstat->dma_force_physical.value.ul = 0;
3879 	ethstat->lanes.value.ul = mgp->pcie_link_width;
3880 	ethstat->dropped_bad_crc32.value.ul =
3881 	    ntohl(fw_stats->dropped_bad_crc32);
3882 	ethstat->dropped_bad_phy.value.ul =
3883 	    ntohl(fw_stats->dropped_bad_phy);
3884 	ethstat->dropped_link_error_or_filtered.value.ul =
3885 	    ntohl(fw_stats->dropped_link_error_or_filtered);
3886 	ethstat->dropped_link_overflow.value.ul =
3887 	    ntohl(fw_stats->dropped_link_overflow);
3888 	ethstat->dropped_multicast_filtered.value.ul =
3889 	    ntohl(fw_stats->dropped_multicast_filtered);
3890 	ethstat->dropped_no_big_buffer.value.ul =
3891 	    ntohl(fw_stats->dropped_no_big_buffer);
3892 	ethstat->dropped_no_small_buffer.value.ul =
3893 	    ntohl(fw_stats->dropped_no_small_buffer);
3894 	ethstat->dropped_overrun.value.ul =
3895 	    ntohl(fw_stats->dropped_overrun);
3896 	ethstat->dropped_pause.value.ul =
3897 	    ntohl(fw_stats->dropped_pause);
3898 	ethstat->dropped_runt.value.ul =
3899 	    ntohl(fw_stats->dropped_runt);
3900 	ethstat->link_up.value.ul =
3901 	    ntohl(fw_stats->link_up);
3902 	ethstat->dropped_unicast_filtered.value.ul =
3903 	    ntohl(fw_stats->dropped_unicast_filtered);
3904 	return (0);
3905 }
3906 
3907 static int
3908 myri10ge_slice_stat_kstat_update(kstat_t *ksp, int rw)
3909 {
3910 	struct myri10ge_slice_stat *ethstat;
3911 	struct myri10ge_slice_state *ss;
3912 
3913 	if (rw == KSTAT_WRITE)
3914 		return (EACCES);
3915 
3916 	ethstat = (struct myri10ge_slice_stat *)ksp->ks_data;
3917 	ss = (struct myri10ge_slice_state *)ksp->ks_private;
3918 
3919 	ethstat->rx_big.value.ul = ss->j_rx_cnt;
3920 	ethstat->rx_bigbuf_firmware.value.ul = ss->rx_big.cnt - ss->j_rx_cnt;
3921 	ethstat->rx_bigbuf_pool.value.ul =
3922 	    ss->jpool.num_alloc - ss->jbufs_for_smalls;
3923 	ethstat->rx_bigbuf_smalls.value.ul = ss->jbufs_for_smalls;
3924 	ethstat->rx_small.value.ul = ss->rx_small.cnt -
3925 	    (ss->rx_small.mask + 1);
3926 	ethstat->tx_done.value.ul = ss->tx.done;
3927 	ethstat->tx_req.value.ul = ss->tx.req;
3928 	ethstat->tx_activate.value.ul = ss->tx.activate;
3929 	ethstat->xmit_sched.value.ul = ss->tx.sched;
3930 	ethstat->xmit_stall.value.ul = ss->tx.stall;
3931 	ethstat->xmit_stall_early.value.ul = ss->tx.stall_early;
3932 	ethstat->xmit_stall_late.value.ul = ss->tx.stall_late;
3933 	ethstat->xmit_err.value.ul =  MYRI10GE_SLICE_STAT(xmit_err);
3934 	return (0);
3935 }
3936 
3937 static int
3938 myri10ge_info_kstat_update(kstat_t *ksp, int rw)
3939 {
3940 	struct myri10ge_info *info;
3941 	struct myri10ge_priv *mgp;
3942 
3943 
3944 	if (rw == KSTAT_WRITE)
3945 		return (EACCES);
3946 
3947 	info = (struct myri10ge_info *)ksp->ks_data;
3948 	mgp = (struct myri10ge_priv *)ksp->ks_private;
3949 	kstat_named_setstr(&info->driver_version, MYRI10GE_VERSION_STR);
3950 	kstat_named_setstr(&info->firmware_version, mgp->fw_version);
3951 	kstat_named_setstr(&info->firmware_name, mgp->fw_name);
3952 	kstat_named_setstr(&info->interrupt_type, mgp->intr_type);
3953 	kstat_named_setstr(&info->product_code, mgp->pc_str);
3954 	kstat_named_setstr(&info->serial_number, mgp->sn_str);
3955 	return (0);
3956 }
3957 
3958 static struct myri10ge_info myri10ge_info_template = {
3959 	{ "driver_version",	KSTAT_DATA_STRING },
3960 	{ "firmware_version",	KSTAT_DATA_STRING },
3961 	{ "firmware_name",	KSTAT_DATA_STRING },
3962 	{ "interrupt_type",	KSTAT_DATA_STRING },
3963 	{ "product_code",	KSTAT_DATA_STRING },
3964 	{ "serial_number",	KSTAT_DATA_STRING },
3965 };
3966 static kmutex_t myri10ge_info_template_lock;
3967 
3968 
3969 static int
3970 myri10ge_info_init(struct myri10ge_priv *mgp)
3971 {
3972 	struct kstat *ksp;
3973 
3974 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
3975 	    "myri10ge_info", "net", KSTAT_TYPE_NAMED,
3976 	    sizeof (myri10ge_info_template) /
3977 	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
3978 	if (ksp == NULL) {
3979 		cmn_err(CE_WARN,
3980 		    "%s: myri10ge_info_init: kstat_create failed", mgp->name);
3981 		return (DDI_FAILURE);
3982 	}
3983 	mgp->ksp_info = ksp;
3984 	ksp->ks_update = myri10ge_info_kstat_update;
3985 	ksp->ks_private = (void *) mgp;
3986 	ksp->ks_data = &myri10ge_info_template;
3987 	ksp->ks_lock = &myri10ge_info_template_lock;
3988 	if (MYRI10GE_VERSION_STR != NULL)
3989 		ksp->ks_data_size += strlen(MYRI10GE_VERSION_STR) + 1;
3990 	if (mgp->fw_version != NULL)
3991 		ksp->ks_data_size += strlen(mgp->fw_version) + 1;
3992 	ksp->ks_data_size += strlen(mgp->fw_name) + 1;
3993 	ksp->ks_data_size += strlen(mgp->intr_type) + 1;
3994 	if (mgp->pc_str != NULL)
3995 		ksp->ks_data_size += strlen(mgp->pc_str) + 1;
3996 	if (mgp->sn_str != NULL)
3997 		ksp->ks_data_size += strlen(mgp->sn_str) + 1;
3998 
3999 	kstat_install(ksp);
4000 	return (DDI_SUCCESS);
4001 }
4002 
4003 
4004 static int
4005 myri10ge_nic_stat_init(struct myri10ge_priv *mgp)
4006 {
4007 	struct kstat *ksp;
4008 	struct myri10ge_nic_stat *ethstat;
4009 
4010 	ksp = kstat_create("myri10ge", ddi_get_instance(mgp->dip),
4011 	    "myri10ge_nic_stats", "net", KSTAT_TYPE_NAMED,
4012 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4013 	if (ksp == NULL) {
4014 		cmn_err(CE_WARN,
4015 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4016 		return (DDI_FAILURE);
4017 	}
4018 	mgp->ksp_stat = ksp;
4019 	ethstat = (struct myri10ge_nic_stat *)(ksp->ks_data);
4020 
4021 	kstat_named_init(&ethstat->dma_read_bw_MBs,
4022 	    "dma_read_bw_MBs", KSTAT_DATA_ULONG);
4023 	kstat_named_init(&ethstat->dma_write_bw_MBs,
4024 	    "dma_write_bw_MBs", KSTAT_DATA_ULONG);
4025 	kstat_named_init(&ethstat->dma_read_write_bw_MBs,
4026 	    "dma_read_write_bw_MBs", KSTAT_DATA_ULONG);
4027 	kstat_named_init(&ethstat->dma_force_physical,
4028 	    "dma_force_physical", KSTAT_DATA_ULONG);
4029 	kstat_named_init(&ethstat->lanes,
4030 	    "lanes", KSTAT_DATA_ULONG);
4031 	kstat_named_init(&ethstat->dropped_bad_crc32,
4032 	    "dropped_bad_crc32", KSTAT_DATA_ULONG);
4033 	kstat_named_init(&ethstat->dropped_bad_phy,
4034 	    "dropped_bad_phy", KSTAT_DATA_ULONG);
4035 	kstat_named_init(&ethstat->dropped_link_error_or_filtered,
4036 	    "dropped_link_error_or_filtered", KSTAT_DATA_ULONG);
4037 	kstat_named_init(&ethstat->dropped_link_overflow,
4038 	    "dropped_link_overflow", KSTAT_DATA_ULONG);
4039 	kstat_named_init(&ethstat->dropped_multicast_filtered,
4040 	    "dropped_multicast_filtered", KSTAT_DATA_ULONG);
4041 	kstat_named_init(&ethstat->dropped_no_big_buffer,
4042 	    "dropped_no_big_buffer", KSTAT_DATA_ULONG);
4043 	kstat_named_init(&ethstat->dropped_no_small_buffer,
4044 	    "dropped_no_small_buffer", KSTAT_DATA_ULONG);
4045 	kstat_named_init(&ethstat->dropped_overrun,
4046 	    "dropped_overrun", KSTAT_DATA_ULONG);
4047 	kstat_named_init(&ethstat->dropped_pause,
4048 	    "dropped_pause", KSTAT_DATA_ULONG);
4049 	kstat_named_init(&ethstat->dropped_runt,
4050 	    "dropped_runt", KSTAT_DATA_ULONG);
4051 	kstat_named_init(&ethstat->dropped_unicast_filtered,
4052 	    "dropped_unicast_filtered", KSTAT_DATA_ULONG);
4053 	kstat_named_init(&ethstat->dropped_runt, "dropped_runt",
4054 	    KSTAT_DATA_ULONG);
4055 	kstat_named_init(&ethstat->link_up, "link_up", KSTAT_DATA_ULONG);
4056 	kstat_named_init(&ethstat->link_changes, "link_changes",
4057 	    KSTAT_DATA_ULONG);
4058 	ksp->ks_update = myri10ge_nic_stat_kstat_update;
4059 	ksp->ks_private = (void *) mgp;
4060 	kstat_install(ksp);
4061 	return (DDI_SUCCESS);
4062 }
4063 
4064 static int
4065 myri10ge_slice_stat_init(struct myri10ge_slice_state *ss)
4066 {
4067 	struct myri10ge_priv *mgp = ss->mgp;
4068 	struct kstat *ksp;
4069 	struct myri10ge_slice_stat *ethstat;
4070 	int instance;
4071 
4072 	/*
4073 	 * fake an instance so that the same slice numbers from
4074 	 * different instances do not collide
4075 	 */
4076 	instance = (ddi_get_instance(mgp->dip) * 1000) +  (int)(ss - mgp->ss);
4077 	ksp = kstat_create("myri10ge", instance,
4078 	    "myri10ge_slice_stats", "net", KSTAT_TYPE_NAMED,
4079 	    sizeof (*ethstat) / sizeof (kstat_named_t), 0);
4080 	if (ksp == NULL) {
4081 		cmn_err(CE_WARN,
4082 		    "%s: myri10ge_stat_init: kstat_create failed", mgp->name);
4083 		return (DDI_FAILURE);
4084 	}
4085 	ss->ksp_stat = ksp;
4086 	ethstat = (struct myri10ge_slice_stat *)(ksp->ks_data);
4087 	kstat_named_init(&ethstat->lro_bad_csum, "lro_bad_csum",
4088 	    KSTAT_DATA_ULONG);
4089 	kstat_named_init(&ethstat->lro_flushed, "lro_flushed",
4090 	    KSTAT_DATA_ULONG);
4091 	kstat_named_init(&ethstat->lro_queued, "lro_queued",
4092 	    KSTAT_DATA_ULONG);
4093 	kstat_named_init(&ethstat->rx_bigbuf_firmware, "rx_bigbuf_firmware",
4094 	    KSTAT_DATA_ULONG);
4095 	kstat_named_init(&ethstat->rx_bigbuf_pool, "rx_bigbuf_pool",
4096 	    KSTAT_DATA_ULONG);
4097 	kstat_named_init(&ethstat->rx_bigbuf_smalls, "rx_bigbuf_smalls",
4098 	    KSTAT_DATA_ULONG);
4099 	kstat_named_init(&ethstat->rx_copy, "rx_copy",
4100 	    KSTAT_DATA_ULONG);
4101 	kstat_named_init(&ethstat->rx_big_nobuf, "rx_big_nobuf",
4102 	    KSTAT_DATA_ULONG);
4103 	kstat_named_init(&ethstat->rx_small_nobuf, "rx_small_nobuf",
4104 	    KSTAT_DATA_ULONG);
4105 	kstat_named_init(&ethstat->xmit_zero_len, "xmit_zero_len",
4106 	    KSTAT_DATA_ULONG);
4107 	kstat_named_init(&ethstat->xmit_pullup, "xmit_pullup",
4108 	    KSTAT_DATA_ULONG);
4109 	kstat_named_init(&ethstat->xmit_pullup_first, "xmit_pullup_first",
4110 	    KSTAT_DATA_ULONG);
4111 	kstat_named_init(&ethstat->xmit_lowbuf, "xmit_lowbuf",
4112 	    KSTAT_DATA_ULONG);
4113 	kstat_named_init(&ethstat->xmit_lsobadflags, "xmit_lsobadflags",
4114 	    KSTAT_DATA_ULONG);
4115 	kstat_named_init(&ethstat->xmit_sched, "xmit_sched",
4116 	    KSTAT_DATA_ULONG);
4117 	kstat_named_init(&ethstat->xmit_stall, "xmit_stall",
4118 	    KSTAT_DATA_ULONG);
4119 	kstat_named_init(&ethstat->xmit_stall_early, "xmit_stall_early",
4120 	    KSTAT_DATA_ULONG);
4121 	kstat_named_init(&ethstat->xmit_stall_late, "xmit_stall_late",
4122 	    KSTAT_DATA_ULONG);
4123 	kstat_named_init(&ethstat->xmit_err, "xmit_err",
4124 	    KSTAT_DATA_ULONG);
4125 	kstat_named_init(&ethstat->tx_req, "tx_req",
4126 	    KSTAT_DATA_ULONG);
4127 	kstat_named_init(&ethstat->tx_activate, "tx_activate",
4128 	    KSTAT_DATA_ULONG);
4129 	kstat_named_init(&ethstat->tx_done, "tx_done",
4130 	    KSTAT_DATA_ULONG);
4131 	kstat_named_init(&ethstat->tx_handles_alloced, "tx_handles_alloced",
4132 	    KSTAT_DATA_ULONG);
4133 	kstat_named_init(&ethstat->rx_big, "rx_big",
4134 	    KSTAT_DATA_ULONG);
4135 	kstat_named_init(&ethstat->rx_small, "rx_small",
4136 	    KSTAT_DATA_ULONG);
4137 	ksp->ks_update = myri10ge_slice_stat_kstat_update;
4138 	ksp->ks_private = (void *) ss;
4139 	kstat_install(ksp);
4140 	return (DDI_SUCCESS);
4141 }
4142 
4143 
4144 
4145 #if #cpu(i386) || defined __i386 || defined i386 ||	\
4146 	defined __i386__ || #cpu(x86_64) || defined __x86_64__
4147 
4148 #include <vm/hat.h>
4149 void *device_arena_alloc(size_t size, int vm_flag);
4150 void device_arena_free(void *vaddr, size_t size);
4151 
4152 static void
4153 myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
4154 {
4155 	dev_info_t *parent_dip;
4156 	ddi_acc_handle_t handle;
4157 	unsigned long bus_number, dev_number, func_number;
4158 	unsigned long cfg_pa, paddr, base, pgoffset;
4159 	char 		*cvaddr, *ptr;
4160 	uint32_t	*ptr32;
4161 	int 		retval = DDI_FAILURE;
4162 	int dontcare;
4163 	uint16_t read_vid, read_did, vendor_id, device_id;
4164 
4165 	if (!myri10ge_nvidia_ecrc_enable)
4166 		return;
4167 
4168 	parent_dip = ddi_get_parent(mgp->dip);
4169 	if (parent_dip == NULL) {
4170 		cmn_err(CE_WARN, "%s: I'm an orphan?", mgp->name);
4171 		return;
4172 	}
4173 
4174 	if (pci_config_setup(parent_dip, &handle) != DDI_SUCCESS) {
4175 		cmn_err(CE_WARN,
4176 		    "%s: Could not access my parent's registers", mgp->name);
4177 		return;
4178 	}
4179 
4180 	vendor_id = pci_config_get16(handle, PCI_CONF_VENID);
4181 	device_id = pci_config_get16(handle, PCI_CONF_DEVID);
4182 	pci_config_teardown(&handle);
4183 
4184 	if (myri10ge_verbose) {
4185 		unsigned long 	bus_number, dev_number, func_number;
4186 		int 		reg_set, span;
4187 		(void) myri10ge_reg_set(parent_dip, &reg_set, &span,
4188 		    &bus_number, &dev_number, &func_number);
4189 		if (myri10ge_verbose)
4190 			printf("%s: parent at %ld:%ld:%ld\n", mgp->name,
4191 			    bus_number, dev_number, func_number);
4192 	}
4193 
4194 	if (vendor_id !=  0x10de)
4195 		return;
4196 
4197 	if (device_id != 0x005d /* CK804 */ &&
4198 	    (device_id < 0x374 || device_id > 0x378) /* MCP55 */) {
4199 		return;
4200 	}
4201 	(void) myri10ge_reg_set(parent_dip, &dontcare, &dontcare,
4202 	    &bus_number, &dev_number, &func_number);
4203 
4204 	for (cfg_pa = 0xf0000000UL;
4205 	    retval != DDI_SUCCESS && cfg_pa >= 0xe0000000UL;
4206 	    cfg_pa -= 0x10000000UL) {
4207 		/* find the config space address for the nvidia bridge */
4208 		paddr = (cfg_pa + bus_number * 0x00100000UL +
4209 		    (dev_number * 8 + func_number) * 0x00001000UL);
4210 
4211 		base = paddr & (~MMU_PAGEOFFSET);
4212 		pgoffset = paddr & MMU_PAGEOFFSET;
4213 
4214 		/* map it into the kernel */
4215 		cvaddr =  device_arena_alloc(ptob(1), VM_NOSLEEP);
4216 		if (cvaddr == NULL)
4217 			cmn_err(CE_WARN, "%s: failed to map nf4: cvaddr\n",
4218 			    mgp->name);
4219 
4220 		hat_devload(kas.a_hat, cvaddr, mmu_ptob(1), mmu_btop(base),
4221 		    PROT_WRITE|HAT_STRICTORDER, HAT_LOAD_LOCK);
4222 
4223 		ptr = cvaddr + pgoffset;
4224 		read_vid = *(uint16_t *)(void *)(ptr + PCI_CONF_VENID);
4225 		read_did = *(uint16_t *)(void *)(ptr + PCI_CONF_DEVID);
4226 		if (vendor_id ==  read_did || device_id == read_did) {
4227 			ptr32 = (uint32_t *)(void *)(ptr + 0x178);
4228 			if (myri10ge_verbose)
4229 				printf("%s: Enabling ECRC on upstream "
4230 				    "Nvidia bridge (0x%x:0x%x) "
4231 				    "at %ld:%ld:%ld\n", mgp->name,
4232 				    read_vid, read_did, bus_number,
4233 				    dev_number, func_number);
4234 			*ptr32 |= 0x40;
4235 			retval = DDI_SUCCESS;
4236 		}
4237 		hat_unload(kas.a_hat, cvaddr, ptob(1), HAT_UNLOAD_UNLOCK);
4238 		device_arena_free(cvaddr, ptob(1));
4239 	}
4240 }
4241 
4242 #else
/*
 * Stub compiled when the x86 preprocessor test above is false: the
 * request to enable ECRC on an Nvidia bridge is ignored and nothing is
 * touched.
 */
/*ARGSUSED*/
static void
myri10ge_enable_nvidia_ecrc(struct myri10ge_priv *mgp)
{
}
4248 #endif /* i386 */
4249 
4250 
4251 /*
4252  * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
4253  * when the PCI-E Completion packets are aligned on an 8-byte
4254  * boundary.  Some PCI-E chip sets always align Completion packets; on
4255  * the ones that do not, the alignment can be enforced by enabling
4256  * ECRC generation (if supported).
4257  *
4258  * When PCI-E Completion packets are not aligned, it is actually more
4259  * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
4260  *
4261  * If the driver can neither enable ECRC nor verify that it has
4262  * already been enabled, then it must use a firmware image which works
4263  * around unaligned completion packets (ethp_z8e.dat), and it should
4264  * also ensure that it never gives the device a Read-DMA which is
4265  * larger than 2KB by setting the tx.boundary to 2KB.  If ECRC is
4266  * enabled, then the driver should use the aligned (eth_z8e.dat)
4267  * firmware image, and set tx.boundary to 4KB.
4268  */
4269 
4270 
4271 static int
4272 myri10ge_firmware_probe(struct myri10ge_priv *mgp)
4273 {
4274 	int status;
4275 
4276 	mgp->tx_boundary = 4096;
4277 	/*
4278 	 * Verify the max read request size was set to 4KB
4279 	 * before trying the test with 4KB.
4280 	 */
4281 	if (mgp->max_read_request_4k == 0)
4282 		mgp->tx_boundary = 2048;
4283 	/*
4284 	 * load the optimized firmware which assumes aligned PCIe
4285 	 * completions in order to see if it works on this host.
4286 	 */
4287 
4288 	mgp->fw_name = "rss_eth_z8e";
4289 	mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4290 	mgp->eth_z8e_length = rss_eth_z8e_length;
4291 
4292 	status = myri10ge_load_firmware(mgp);
4293 	if (status != 0) {
4294 		return (status);
4295 	}
4296 	/*
4297 	 * Enable ECRC if possible
4298 	 */
4299 	myri10ge_enable_nvidia_ecrc(mgp);
4300 
4301 	/*
4302 	 * Run a DMA test which watches for unaligned completions and
4303 	 * aborts on the first one seen.
4304 	 */
4305 	status = myri10ge_dma_test(mgp, MXGEFW_CMD_UNALIGNED_TEST);
4306 	if (status == 0)
4307 		return (0); /* keep the aligned firmware */
4308 
4309 	if (status != E2BIG)
4310 		cmn_err(CE_WARN, "%s: DMA test failed: %d\n",
4311 		    mgp->name, status);
4312 	if (status == ENOSYS)
4313 		cmn_err(CE_WARN, "%s: Falling back to ethp! "
4314 		    "Please install up to date fw\n", mgp->name);
4315 	return (status);
4316 }
4317 
4318 static int
4319 myri10ge_select_firmware(struct myri10ge_priv *mgp)
4320 {
4321 	int aligned;
4322 
4323 	aligned = 0;
4324 
4325 	if (myri10ge_force_firmware == 1) {
4326 		if (myri10ge_verbose)
4327 			printf("%s: Assuming aligned completions (forced)\n",
4328 			    mgp->name);
4329 		aligned = 1;
4330 		goto done;
4331 	}
4332 
4333 	if (myri10ge_force_firmware == 2) {
4334 		if (myri10ge_verbose)
4335 			printf("%s: Assuming unaligned completions (forced)\n",
4336 			    mgp->name);
4337 		aligned = 0;
4338 		goto done;
4339 	}
4340 
4341 	/* If the width is less than 8, we may used the aligned firmware */
4342 	if (mgp->pcie_link_width != 0 && mgp->pcie_link_width < 8) {
4343 		cmn_err(CE_WARN, "!%s: PCIe link running at x%d\n",
4344 		    mgp->name, mgp->pcie_link_width);
4345 		aligned = 1;
4346 		goto done;
4347 	}
4348 
4349 	if (0 == myri10ge_firmware_probe(mgp))
4350 		return (0);  /* keep optimized firmware */
4351 
4352 done:
4353 	if (aligned) {
4354 		mgp->fw_name = "rss_eth_z8e";
4355 		mgp->eth_z8e = (unsigned char *)rss_eth_z8e;
4356 		mgp->eth_z8e_length = rss_eth_z8e_length;
4357 		mgp->tx_boundary = 4096;
4358 	} else {
4359 		mgp->fw_name = "rss_ethp_z8e";
4360 		mgp->eth_z8e = (unsigned char *)rss_ethp_z8e;
4361 		mgp->eth_z8e_length = rss_ethp_z8e_length;
4362 		mgp->tx_boundary = 2048;
4363 	}
4364 
4365 	return (myri10ge_load_firmware(mgp));
4366 }
4367 
4368 static int
4369 myri10ge_add_intrs(struct myri10ge_priv *mgp, int add_handler)
4370 {
4371 	dev_info_t *devinfo = mgp->dip;
4372 	int count, avail, actual, intr_types;
4373 	int x, y, rc, inum = 0;
4374 
4375 
4376 	rc = ddi_intr_get_supported_types(devinfo, &intr_types);
4377 	if (rc != DDI_SUCCESS) {
4378 		cmn_err(CE_WARN,
4379 		    "!%s: ddi_intr_get_nintrs() failure, rc = %d\n", mgp->name,
4380 		    rc);
4381 		return (DDI_FAILURE);
4382 	}
4383 
4384 	if (!myri10ge_use_msi)
4385 		intr_types &= ~DDI_INTR_TYPE_MSI;
4386 	if (!myri10ge_use_msix)
4387 		intr_types &= ~DDI_INTR_TYPE_MSIX;
4388 
4389 	if (intr_types & DDI_INTR_TYPE_MSIX) {
4390 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSIX;
4391 		mgp->intr_type = "MSI-X";
4392 	} else if (intr_types & DDI_INTR_TYPE_MSI) {
4393 		mgp->ddi_intr_type = DDI_INTR_TYPE_MSI;
4394 		mgp->intr_type = "MSI";
4395 	} else {
4396 		mgp->ddi_intr_type = DDI_INTR_TYPE_FIXED;
4397 		mgp->intr_type = "Legacy";
4398 	}
4399 	/* Get number of interrupts */
4400 	rc = ddi_intr_get_nintrs(devinfo, mgp->ddi_intr_type, &count);
4401 	if ((rc != DDI_SUCCESS) || (count == 0)) {
4402 		cmn_err(CE_WARN, "%s: ddi_intr_get_nintrs() failure, rc: %d, "
4403 		    "count: %d", mgp->name, rc, count);
4404 
4405 		return (DDI_FAILURE);
4406 	}
4407 
4408 	/* Get number of available interrupts */
4409 	rc = ddi_intr_get_navail(devinfo, mgp->ddi_intr_type, &avail);
4410 	if ((rc != DDI_SUCCESS) || (avail == 0)) {
4411 		cmn_err(CE_WARN, "%s: ddi_intr_get_navail() failure, "
4412 		    "rc: %d, avail: %d\n", mgp->name, rc, avail);
4413 		return (DDI_FAILURE);
4414 	}
4415 	if (avail < count) {
4416 		cmn_err(CE_NOTE,
4417 		    "!%s: nintrs() returned %d, navail returned %d",
4418 		    mgp->name, count, avail);
4419 		count = avail;
4420 	}
4421 
4422 	if (count < mgp->num_slices)
4423 		return (DDI_FAILURE);
4424 
4425 	if (count > mgp->num_slices)
4426 		count = mgp->num_slices;
4427 
4428 	/* Allocate memory for MSI interrupts */
4429 	mgp->intr_size = count * sizeof (ddi_intr_handle_t);
4430 	mgp->htable = kmem_alloc(mgp->intr_size, KM_SLEEP);
4431 
4432 	rc = ddi_intr_alloc(devinfo, mgp->htable, mgp->ddi_intr_type, inum,
4433 	    count, &actual, DDI_INTR_ALLOC_NORMAL);
4434 
4435 	if ((rc != DDI_SUCCESS) || (actual == 0)) {
4436 		cmn_err(CE_WARN, "%s: ddi_intr_alloc() failed: %d",
4437 		    mgp->name, rc);
4438 
4439 		kmem_free(mgp->htable, mgp->intr_size);
4440 		mgp->htable = NULL;
4441 		return (DDI_FAILURE);
4442 	}
4443 
4444 	if ((actual < count) && myri10ge_verbose) {
4445 		cmn_err(CE_NOTE, "%s: got %d/%d slices",
4446 		    mgp->name, actual, count);
4447 	}
4448 
4449 	mgp->intr_cnt = actual;
4450 
4451 	/*
4452 	 * Get priority for first irq, assume remaining are all the same
4453 	 */
4454 	if (ddi_intr_get_pri(mgp->htable[0], &mgp->intr_pri)
4455 	    != DDI_SUCCESS) {
4456 		cmn_err(CE_WARN, "%s: ddi_intr_get_pri() failed", mgp->name);
4457 
4458 		/* Free already allocated intr */
4459 		for (y = 0; y < actual; y++) {
4460 			(void) ddi_intr_free(mgp->htable[y]);
4461 		}
4462 
4463 		kmem_free(mgp->htable, mgp->intr_size);
4464 		mgp->htable = NULL;
4465 		return (DDI_FAILURE);
4466 	}
4467 
4468 	mgp->icookie = (void *)(uintptr_t)mgp->intr_pri;
4469 
4470 	if (!add_handler)
4471 		return (DDI_SUCCESS);
4472 
4473 	/* Call ddi_intr_add_handler() */
4474 	for (x = 0; x < actual; x++) {
4475 		if (ddi_intr_add_handler(mgp->htable[x], myri10ge_intr,
4476 		    (caddr_t)&mgp->ss[x], NULL) != DDI_SUCCESS) {
4477 			cmn_err(CE_WARN, "%s: ddi_intr_add_handler() failed",
4478 			    mgp->name);
4479 
4480 			/* Free already allocated intr */
4481 			for (y = 0; y < actual; y++) {
4482 				(void) ddi_intr_free(mgp->htable[y]);
4483 			}
4484 
4485 			kmem_free(mgp->htable, mgp->intr_size);
4486 			mgp->htable = NULL;
4487 			return (DDI_FAILURE);
4488 		}
4489 	}
4490 
4491 	(void) ddi_intr_get_cap(mgp->htable[0], &mgp->intr_cap);
4492 	if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4493 		/* Call ddi_intr_block_enable() for MSI */
4494 		(void) ddi_intr_block_enable(mgp->htable, mgp->intr_cnt);
4495 	} else {
4496 		/* Call ddi_intr_enable() for MSI non block enable */
4497 		for (x = 0; x < mgp->intr_cnt; x++) {
4498 			(void) ddi_intr_enable(mgp->htable[x]);
4499 		}
4500 	}
4501 
4502 	return (DDI_SUCCESS);
4503 }
4504 
4505 static void
4506 myri10ge_rem_intrs(struct myri10ge_priv *mgp, int handler_installed)
4507 {
4508 	int x, err;
4509 
4510 	/* Disable all interrupts */
4511 	if (handler_installed) {
4512 		if (mgp->intr_cap & DDI_INTR_FLAG_BLOCK) {
4513 			/* Call ddi_intr_block_disable() */
4514 			(void) ddi_intr_block_disable(mgp->htable,
4515 			    mgp->intr_cnt);
4516 		} else {
4517 			for (x = 0; x < mgp->intr_cnt; x++) {
4518 				(void) ddi_intr_disable(mgp->htable[x]);
4519 			}
4520 		}
4521 	}
4522 
4523 	for (x = 0; x < mgp->intr_cnt; x++) {
4524 		if (handler_installed) {
4525 		/* Call ddi_intr_remove_handler() */
4526 			err = ddi_intr_remove_handler(mgp->htable[x]);
4527 			if (err != DDI_SUCCESS) {
4528 				cmn_err(CE_WARN,
4529 				    "%s: ddi_intr_remove_handler for"
4530 				    "vec %d returned %d\n", mgp->name,
4531 				    x, err);
4532 			}
4533 		}
4534 		err = ddi_intr_free(mgp->htable[x]);
4535 		if (err != DDI_SUCCESS) {
4536 			cmn_err(CE_WARN,
4537 			    "%s: ddi_intr_free for vec %d returned %d\n",
4538 			    mgp->name, x, err);
4539 		}
4540 	}
4541 	kmem_free(mgp->htable, mgp->intr_size);
4542 	mgp->htable = NULL;
4543 }
4544 
4545 static void
4546 myri10ge_test_physical(dev_info_t *dip)
4547 {
4548 	ddi_dma_handle_t	handle;
4549 	struct myri10ge_dma_stuff dma;
4550 	void *addr;
4551 	int err;
4552 
4553 	/* test #1, sufficient for older sparc systems */
4554 	myri10ge_tx_dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
4555 	err = ddi_dma_alloc_handle(dip, &myri10ge_tx_dma_attr,
4556 	    DDI_DMA_DONTWAIT, NULL, &handle);
4557 	if (err == DDI_DMA_BADATTR)
4558 		goto fail;
4559 	ddi_dma_free_handle(&handle);
4560 
4561 	/* test #2, required on Olympis where the bind is what fails */
4562 	addr = myri10ge_dma_alloc(dip, 128, &myri10ge_tx_dma_attr,
4563 	    &myri10ge_dev_access_attr, DDI_DMA_STREAMING,
4564 	    DDI_DMA_WRITE|DDI_DMA_STREAMING, &dma, 0, DDI_DMA_DONTWAIT);
4565 	if (addr == NULL)
4566 		goto fail;
4567 	myri10ge_dma_free(&dma);
4568 	return;
4569 
4570 fail:
4571 	if (myri10ge_verbose)
4572 		printf("myri10ge%d: DDI_DMA_FORCE_PHYSICAL failed, "
4573 		    "using IOMMU\n", ddi_get_instance(dip));
4574 
4575 	myri10ge_tx_dma_attr.dma_attr_flags &= ~DDI_DMA_FORCE_PHYSICAL;
4576 }
4577 
4578 static void
4579 myri10ge_get_props(dev_info_t *dip)
4580 {
4581 
4582 	myri10ge_flow_control =  ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4583 	    "myri10ge_flow_control", myri10ge_flow_control);
4584 
4585 	myri10ge_intr_coal_delay = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4586 	    "myri10ge_intr_coal_delay", myri10ge_intr_coal_delay);
4587 
4588 #if #cpu(i386) || defined __i386 || defined i386 ||	\
4589 	defined __i386__ || #cpu(x86_64) || defined __x86_64__
4590 	myri10ge_nvidia_ecrc_enable = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4591 	    "myri10ge_nvidia_ecrc_enable", 1);
4592 #endif
4593 
4594 
4595 	myri10ge_use_msi = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4596 	    "myri10ge_use_msi", myri10ge_use_msi);
4597 
4598 	myri10ge_deassert_wait = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4599 	    "myri10ge_deassert_wait",  myri10ge_deassert_wait);
4600 
4601 	myri10ge_verbose = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4602 	    "myri10ge_verbose", myri10ge_verbose);
4603 
4604 	myri10ge_tx_copylen = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4605 	    "myri10ge_tx_copylen", myri10ge_tx_copylen);
4606 
4607 	if (myri10ge_tx_copylen < 60) {
4608 		cmn_err(CE_WARN,
4609 		    "myri10ge_tx_copylen must be >= 60 bytes\n");
4610 		myri10ge_tx_copylen = 60;
4611 	}
4612 
4613 	myri10ge_mtu_override = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4614 	    "myri10ge_mtu_override", myri10ge_mtu_override);
4615 
4616 	if (myri10ge_mtu_override >= 1500 && myri10ge_mtu_override <= 9000)
4617 		myri10ge_mtu = myri10ge_mtu_override +
4618 		    sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ;
4619 	else if (myri10ge_mtu_override != 0) {
4620 		cmn_err(CE_WARN,
4621 		    "myri10ge_mtu_override must be between 1500 and "
4622 		    "9000 bytes\n");
4623 	}
4624 
4625 	myri10ge_bigbufs_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4626 	    "myri10ge_bigbufs_initial", myri10ge_bigbufs_initial);
4627 	myri10ge_bigbufs_max = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4628 	    "myri10ge_bigbufs_max", myri10ge_bigbufs_max);
4629 
4630 	myri10ge_watchdog_reset = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4631 	    "myri10ge_watchdog_reset", myri10ge_watchdog_reset);
4632 
4633 	if (myri10ge_bigbufs_initial < 128) {
4634 		cmn_err(CE_WARN,
4635 		    "myri10ge_bigbufs_initial be at least 128\n");
4636 		myri10ge_bigbufs_initial = 128;
4637 	}
4638 	if (myri10ge_bigbufs_max < 128) {
4639 		cmn_err(CE_WARN,
4640 		    "myri10ge_bigbufs_max be at least 128\n");
4641 		myri10ge_bigbufs_max = 128;
4642 	}
4643 
4644 	if (myri10ge_bigbufs_max < myri10ge_bigbufs_initial) {
4645 		cmn_err(CE_WARN,
4646 		    "myri10ge_bigbufs_max must be >=  "
4647 		    "myri10ge_bigbufs_initial\n");
4648 		myri10ge_bigbufs_max = myri10ge_bigbufs_initial;
4649 	}
4650 
4651 	myri10ge_force_firmware = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4652 	    "myri10ge_force_firmware", myri10ge_force_firmware);
4653 
4654 	myri10ge_max_slices = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4655 	    "myri10ge_max_slices", myri10ge_max_slices);
4656 
4657 	myri10ge_use_msix = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4658 	    "myri10ge_use_msix", myri10ge_use_msix);
4659 
4660 	myri10ge_rss_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4661 	    "myri10ge_rss_hash", myri10ge_rss_hash);
4662 
4663 	if (myri10ge_rss_hash > MXGEFW_RSS_HASH_TYPE_MAX ||
4664 	    myri10ge_rss_hash < MXGEFW_RSS_HASH_TYPE_IPV4) {
4665 		cmn_err(CE_WARN, "myri10ge: Illegal rssh hash type %d\n",
4666 		    myri10ge_rss_hash);
4667 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4668 	}
4669 	myri10ge_lro = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4670 	    "myri10ge_lro", myri10ge_lro);
4671 	myri10ge_lro_cnt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4672 	    "myri10ge_lro_cnt", myri10ge_lro_cnt);
4673 	myri10ge_lro_max_aggr = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4674 	    "myri10ge_lro_max_aggr", myri10ge_lro_max_aggr);
4675 	myri10ge_tx_hash = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4676 	    "myri10ge_tx_hash", myri10ge_tx_hash);
4677 	myri10ge_use_lso = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4678 	    "myri10ge_use_lso", myri10ge_use_lso);
4679 	myri10ge_lso_copy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4680 	    "myri10ge_lso_copy", myri10ge_lso_copy);
4681 	myri10ge_tx_handles_initial = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4682 	    "myri10ge_tx_handles_initial", myri10ge_tx_handles_initial);
4683 	myri10ge_small_bytes = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4684 	    "myri10ge_small_bytes", myri10ge_small_bytes);
4685 	if ((myri10ge_small_bytes + MXGEFW_PAD) & (128 -1)) {
4686 		cmn_err(CE_WARN, "myri10ge: myri10ge_small_bytes (%d)\n",
4687 		    myri10ge_small_bytes);
4688 		cmn_err(CE_WARN, "must be aligned on 128b bndry -2\n");
4689 		myri10ge_small_bytes += 128;
4690 		myri10ge_small_bytes &= ~(128 -1);
4691 		myri10ge_small_bytes -= MXGEFW_PAD;
4692 		cmn_err(CE_WARN, "rounded up to %d\n",
4693 		    myri10ge_small_bytes);
4694 
4695 		myri10ge_rss_hash = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
4696 	}
4697 }
4698 
4699 #ifndef	PCI_EXP_LNKSTA
4700 #define	PCI_EXP_LNKSTA 18
4701 #endif
4702 
4703 static int
4704 myri10ge_find_cap(ddi_acc_handle_t handle, uint8_t *capptr, uint8_t capid)
4705 {
4706 	uint16_t	status;
4707 	uint8_t 	ptr;
4708 
4709 	/* check to see if we have capabilities */
4710 	status = pci_config_get16(handle, PCI_CONF_STAT);
4711 	if (!(status & PCI_STAT_CAP)) {
4712 		cmn_err(CE_WARN, "PCI_STAT_CAP not found\n");
4713 		return (ENXIO);
4714 	}
4715 
4716 	ptr = pci_config_get8(handle, PCI_CONF_CAP_PTR);
4717 
4718 	/* Walk the capabilities list, looking for a PCI Express cap */
4719 	while (ptr != PCI_CAP_NEXT_PTR_NULL) {
4720 		if (pci_config_get8(handle, ptr + PCI_CAP_ID) == capid)
4721 			break;
4722 		ptr = pci_config_get8(handle, ptr + PCI_CAP_NEXT_PTR);
4723 	}
4724 	if (ptr < 64) {
4725 		cmn_err(CE_WARN, "Bad capability offset %d\n", ptr);
4726 		return (ENXIO);
4727 	}
4728 	*capptr = ptr;
4729 	return (0);
4730 }
4731 
4732 static int
4733 myri10ge_set_max_readreq(ddi_acc_handle_t handle)
4734 {
4735 	int err;
4736 	uint16_t	val;
4737 	uint8_t		ptr;
4738 
4739 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4740 	if (err != 0) {
4741 		cmn_err(CE_WARN, "could not find PCIe cap\n");
4742 		return (ENXIO);
4743 	}
4744 
4745 	/* set max read req to 4096 */
4746 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4747 	val = (val & ~PCIE_DEVCTL_MAX_READ_REQ_MASK) |
4748 	    PCIE_DEVCTL_MAX_READ_REQ_4096;
4749 	pci_config_put16(handle, ptr + PCIE_DEVCTL, val);
4750 	val = pci_config_get16(handle, ptr + PCIE_DEVCTL);
4751 	if ((val & (PCIE_DEVCTL_MAX_READ_REQ_4096)) !=
4752 	    PCIE_DEVCTL_MAX_READ_REQ_4096) {
4753 		cmn_err(CE_WARN, "could not set max read req (%x)\n", val);
4754 		return (EINVAL);
4755 	}
4756 	return (0);
4757 }
4758 
4759 static int
4760 myri10ge_read_pcie_link_width(ddi_acc_handle_t handle, int *link)
4761 {
4762 	int err;
4763 	uint16_t	val;
4764 	uint8_t		ptr;
4765 
4766 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_PCI_E);
4767 	if (err != 0) {
4768 		cmn_err(CE_WARN, "could not set max read req\n");
4769 		return (ENXIO);
4770 	}
4771 
4772 	/* read link width */
4773 	val = pci_config_get16(handle, ptr + PCIE_LINKSTS);
4774 	val &= PCIE_LINKSTS_NEG_WIDTH_MASK;
4775 	*link = (val >> 4);
4776 	return (0);
4777 }
4778 
4779 static int
4780 myri10ge_reset_nic(struct myri10ge_priv *mgp)
4781 {
4782 	ddi_acc_handle_t handle = mgp->cfg_hdl;
4783 	uint32_t reboot;
4784 	uint16_t cmd;
4785 	int err;
4786 
4787 	cmd = pci_config_get16(handle, PCI_CONF_COMM);
4788 	if ((cmd & PCI_COMM_ME) == 0) {
4789 		/*
4790 		 * Bus master DMA disabled?  Check to see if the card
4791 		 * rebooted due to a parity error For now, just report
4792 		 * it
4793 		 */
4794 
4795 		/* enter read32 mode */
4796 		pci_config_put8(handle, mgp->vso + 0x10, 0x3);
4797 		/* read REBOOT_STATUS (0xfffffff0) */
4798 		pci_config_put32(handle, mgp->vso + 0x18, 0xfffffff0);
4799 		reboot = pci_config_get16(handle, mgp->vso + 0x14);
4800 		cmn_err(CE_WARN, "%s NIC rebooted 0x%x\n", mgp->name, reboot);
4801 		return (0);
4802 	}
4803 	if (!myri10ge_watchdog_reset) {
4804 		cmn_err(CE_WARN, "%s: not resetting\n", mgp->name);
4805 		return (1);
4806 	}
4807 
4808 	myri10ge_stop_locked(mgp);
4809 	err = myri10ge_start_locked(mgp);
4810 	if (err == DDI_FAILURE) {
4811 		return (0);
4812 	}
4813 	mac_tx_update(mgp->mh);
4814 	return (1);
4815 }
4816 
4817 static inline int
4818 myri10ge_ring_stalled(myri10ge_tx_ring_t *tx)
4819 {
4820 	if (tx->sched != tx->stall &&
4821 	    tx->done == tx->watchdog_done &&
4822 	    tx->watchdog_req != tx->watchdog_done)
4823 		return (1);
4824 	return (0);
4825 }
4826 
4827 static void
4828 myri10ge_watchdog(void *arg)
4829 {
4830 	struct myri10ge_priv *mgp;
4831 	struct myri10ge_slice_state *ss;
4832 	myri10ge_tx_ring_t *tx;
4833 	int nic_ok = 1;
4834 	int slices_stalled, rx_pause, i;
4835 	int add_rx;
4836 
4837 	mgp = arg;
4838 	mutex_enter(&mgp->intrlock);
4839 	if (mgp->running != MYRI10GE_ETH_RUNNING) {
4840 		cmn_err(CE_WARN,
4841 		    "%s not running, not rearming watchdog (%d)\n",
4842 		    mgp->name, mgp->running);
4843 		mutex_exit(&mgp->intrlock);
4844 		return;
4845 	}
4846 
4847 	rx_pause = ntohl(mgp->ss[0].fw_stats->dropped_pause);
4848 
4849 	/*
4850 	 * make sure nic is stalled before we reset the nic, so as to
4851 	 * ensure we don't rip the transmit data structures out from
4852 	 * under a pending transmit
4853 	 */
4854 
4855 	for (slices_stalled = 0, i = 0; i < mgp->num_slices; i++) {
4856 		tx = &mgp->ss[i].tx;
4857 		slices_stalled = myri10ge_ring_stalled(tx);
4858 		if (slices_stalled)
4859 			break;
4860 	}
4861 
4862 	if (slices_stalled) {
4863 		if (mgp->watchdog_rx_pause == rx_pause) {
4864 			cmn_err(CE_WARN,
4865 			    "%s slice %d stalled:(%d, %d, %d, %d, %d %d %d\n)",
4866 			    mgp->name, i, tx->sched, tx->stall,
4867 			    tx->done, tx->watchdog_done, tx->req, tx->pkt_done,
4868 			    (int)ntohl(mgp->ss[i].fw_stats->send_done_count));
4869 			nic_ok = myri10ge_reset_nic(mgp);
4870 		} else {
4871 			cmn_err(CE_WARN,
4872 			    "%s Flow controlled, check link partner\n",
4873 			    mgp->name);
4874 		}
4875 	}
4876 
4877 	if (!nic_ok) {
4878 		cmn_err(CE_WARN,
4879 		    "%s Nic dead, not rearming watchdog\n", mgp->name);
4880 		mutex_exit(&mgp->intrlock);
4881 		return;
4882 	}
4883 	for (i = 0; i < mgp->num_slices; i++) {
4884 		ss = &mgp->ss[i];
4885 		tx = &ss->tx;
4886 		tx->watchdog_done = tx->done;
4887 		tx->watchdog_req = tx->req;
4888 		if (ss->watchdog_rx_copy != MYRI10GE_SLICE_STAT(rx_copy)) {
4889 			ss->watchdog_rx_copy = MYRI10GE_SLICE_STAT(rx_copy);
4890 			add_rx =
4891 			    min(ss->jpool.num_alloc,
4892 			    myri10ge_bigbufs_max -
4893 			    (ss->jpool.num_alloc -
4894 			    ss->jbufs_for_smalls));
4895 			if (add_rx != 0) {
4896 				(void) myri10ge_add_jbufs(ss, add_rx, 0);
4897 				/* now feed them to the firmware */
4898 				mutex_enter(&ss->jpool.mtx);
4899 				myri10ge_restock_jumbos(ss);
4900 				mutex_exit(&ss->jpool.mtx);
4901 			}
4902 		}
4903 	}
4904 	mgp->watchdog_rx_pause = rx_pause;
4905 
4906 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
4907 	    mgp->timer_ticks);
4908 	mutex_exit(&mgp->intrlock);
4909 }
4910 
4911 /*ARGSUSED*/
4912 static int
4913 myri10ge_get_coalesce(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
4914 
4915 {
4916 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4917 	(void) mi_mpprintf(mp, "%d", mgp->intr_coal_delay);
4918 	return (0);
4919 }
4920 
4921 /*ARGSUSED*/
4922 static int
4923 myri10ge_set_coalesce(queue_t *q, mblk_t *mp, char *value,
4924     caddr_t cp, cred_t *credp)
4925 
4926 {
4927 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4928 	char *end;
4929 	size_t new_value;
4930 
4931 	new_value = mi_strtol(value, &end, 10);
4932 	if (end == value)
4933 		return (EINVAL);
4934 
4935 	mutex_enter(&myri10ge_param_lock);
4936 	mgp->intr_coal_delay = (int)new_value;
4937 	*mgp->intr_coal_delay_ptr = htonl(mgp->intr_coal_delay);
4938 	mutex_exit(&myri10ge_param_lock);
4939 	return (0);
4940 }
4941 
4942 /*ARGSUSED*/
4943 static int
4944 myri10ge_get_pauseparam(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
4945 
4946 {
4947 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4948 	(void) mi_mpprintf(mp, "%d", mgp->pause);
4949 	return (0);
4950 }
4951 
4952 /*ARGSUSED*/
4953 static int
4954 myri10ge_set_pauseparam(queue_t *q, mblk_t *mp, char *value,
4955 			caddr_t cp, cred_t *credp)
4956 
4957 {
4958 	struct myri10ge_priv *mgp = (struct myri10ge_priv *)(void *)cp;
4959 	char *end;
4960 	size_t new_value;
4961 	int err = 0;
4962 
4963 	new_value = mi_strtol(value, &end, 10);
4964 	if (end == value)
4965 		return (EINVAL);
4966 	if (new_value != 0)
4967 		new_value = 1;
4968 
4969 	mutex_enter(&myri10ge_param_lock);
4970 	if (new_value != mgp->pause)
4971 		err = myri10ge_change_pause(mgp, new_value);
4972 	mutex_exit(&myri10ge_param_lock);
4973 	return (err);
4974 }
4975 
4976 /*ARGSUSED*/
4977 static int
4978 myri10ge_get_int(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *credp)
4979 
4980 {
4981 	(void) mi_mpprintf(mp, "%d", *(int *)(void *)cp);
4982 	return (0);
4983 }
4984 
4985 /*ARGSUSED*/
4986 static int
4987 myri10ge_set_int(queue_t *q, mblk_t *mp, char *value,
4988     caddr_t cp, cred_t *credp)
4989 
4990 {
4991 	char *end;
4992 	size_t new_value;
4993 
4994 	new_value = mi_strtol(value, &end, 10);
4995 	if (end == value)
4996 		return (EINVAL);
4997 	*(int *)(void *)cp = new_value;
4998 
4999 	return (0);
5000 }
5001 
5002 static void
5003 myri10ge_ndd_init(struct myri10ge_priv *mgp)
5004 {
5005 	mgp->nd_head = NULL;
5006 
5007 	(void) nd_load(&mgp->nd_head, "myri10ge_intr_coal_delay",
5008 	    myri10ge_get_coalesce, myri10ge_set_coalesce, (caddr_t)mgp);
5009 	(void) nd_load(&mgp->nd_head, "myri10ge_flow_control",
5010 	    myri10ge_get_pauseparam, myri10ge_set_pauseparam, (caddr_t)mgp);
5011 	(void) nd_load(&mgp->nd_head, "myri10ge_verbose",
5012 	    myri10ge_get_int, myri10ge_set_int, (caddr_t)&myri10ge_verbose);
5013 	(void) nd_load(&mgp->nd_head, "myri10ge_deassert_wait",
5014 	    myri10ge_get_int, myri10ge_set_int,
5015 	    (caddr_t)&myri10ge_deassert_wait);
5016 	(void) nd_load(&mgp->nd_head, "myri10ge_bigbufs_max",
5017 	    myri10ge_get_int, myri10ge_set_int,
5018 	    (caddr_t)&myri10ge_bigbufs_max);
5019 	(void) nd_load(&mgp->nd_head, "myri10ge_lro",
5020 	    myri10ge_get_int, myri10ge_set_int,
5021 	    (caddr_t)&myri10ge_lro);
5022 	(void) nd_load(&mgp->nd_head, "myri10ge_lro_max_aggr",
5023 	    myri10ge_get_int, myri10ge_set_int,
5024 	    (caddr_t)&myri10ge_lro_max_aggr);
5025 	(void) nd_load(&mgp->nd_head, "myri10ge_tx_hash",
5026 	    myri10ge_get_int, myri10ge_set_int,
5027 	    (caddr_t)&myri10ge_tx_hash);
5028 	(void) nd_load(&mgp->nd_head, "myri10ge_lso_copy",
5029 	    myri10ge_get_int, myri10ge_set_int,
5030 	    (caddr_t)&myri10ge_lso_copy);
5031 }
5032 
5033 static void
5034 myri10ge_ndd_fini(struct myri10ge_priv *mgp)
5035 {
5036 	nd_free(&mgp->nd_head);
5037 }
5038 
5039 static void
5040 myri10ge_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
5041 {
5042 	struct iocblk *iocp;
5043 	struct myri10ge_priv *mgp = arg;
5044 	int cmd, ok, err;
5045 
5046 	iocp = (struct iocblk *)(void *)mp->b_rptr;
5047 	cmd = iocp->ioc_cmd;
5048 
5049 	ok = 0;
5050 	err = 0;
5051 
5052 	switch (cmd) {
5053 	case ND_GET:
5054 	case ND_SET:
5055 		ok = nd_getset(wq, mgp->nd_head, mp);
5056 		break;
5057 	default:
5058 		break;
5059 	}
5060 	if (!ok)
5061 		err = EINVAL;
5062 	else
5063 		err = iocp->ioc_error;
5064 
5065 	if (!err)
5066 		miocack(wq, mp, iocp->ioc_count, err);
5067 	else
5068 		miocnak(wq, mp, 0, err);
5069 }
5070 
5071 static struct myri10ge_priv *mgp_list;
5072 
5073 struct myri10ge_priv *
5074 myri10ge_get_instance(uint_t unit)
5075 {
5076 	struct myri10ge_priv *mgp;
5077 
5078 	mutex_enter(&myri10ge_param_lock);
5079 	for (mgp = mgp_list; mgp != NULL; mgp = mgp->next) {
5080 		if (unit == ddi_get_instance(mgp->dip)) {
5081 			mgp->refcnt++;
5082 			break;
5083 		}
5084 	}
5085 	mutex_exit(&myri10ge_param_lock);
5086 	return (mgp);
5087 }
5088 
5089 void
5090 myri10ge_put_instance(struct myri10ge_priv *mgp)
5091 {
5092 	mutex_enter(&myri10ge_param_lock);
5093 	mgp->refcnt--;
5094 	mutex_exit(&myri10ge_param_lock);
5095 }
5096 
5097 static boolean_t
5098 myri10ge_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
5099 {
5100 	struct myri10ge_priv *mgp = arg;
5101 	uint32_t *cap_hcksum;
5102 	mac_capab_lso_t *cap_lso;
5103 	mac_capab_rings_t *cap_rings;
5104 
5105 	switch (cap) {
5106 	case MAC_CAPAB_HCKSUM:
5107 		cap_hcksum = cap_data;
5108 		*cap_hcksum = HCKSUM_INET_PARTIAL;
5109 		break;
5110 	case MAC_CAPAB_RINGS:
5111 		cap_rings = cap_data;
5112 		switch (cap_rings->mr_type) {
5113 		case MAC_RING_TYPE_RX:
5114 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5115 			cap_rings->mr_rnum = mgp->num_slices;
5116 			cap_rings->mr_gnum = 1;
5117 			cap_rings->mr_rget = myri10ge_fill_ring;
5118 			cap_rings->mr_gget = myri10ge_fill_group;
5119 			break;
5120 		case MAC_RING_TYPE_TX:
5121 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
5122 			cap_rings->mr_rnum = mgp->num_slices;
5123 			cap_rings->mr_gnum = 0;
5124 			cap_rings->mr_rget = myri10ge_fill_ring;
5125 			cap_rings->mr_gget = NULL;
5126 			break;
5127 		default:
5128 			return (B_FALSE);
5129 		}
5130 		break;
5131 	case MAC_CAPAB_LSO:
5132 		cap_lso = cap_data;
5133 		if (!myri10ge_use_lso)
5134 			return (B_FALSE);
5135 		if (!(mgp->features & MYRI10GE_TSO))
5136 			return (B_FALSE);
5137 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
5138 		cap_lso->lso_basic_tcp_ipv4.lso_max = (uint16_t)-1;
5139 		break;
5140 
5141 	default:
5142 		return (B_FALSE);
5143 	}
5144 	return (B_TRUE);
5145 }
5146 
5147 
5148 static int
5149 myri10ge_m_stat(void *arg, uint_t stat, uint64_t *val)
5150 {
5151 	struct myri10ge_priv *mgp = arg;
5152 	struct myri10ge_rx_ring_stats *rstat;
5153 	struct myri10ge_tx_ring_stats *tstat;
5154 	mcp_irq_data_t *fw_stats = mgp->ss[0].fw_stats;
5155 	struct myri10ge_slice_state *ss;
5156 	uint64_t tmp = 0;
5157 	int i;
5158 
5159 	switch (stat) {
5160 	case MAC_STAT_IFSPEED:
5161 		*val = 10ull * 1000ull * 1000000ull;
5162 		break;
5163 
5164 	case MAC_STAT_MULTIRCV:
5165 		for (i = 0; i < mgp->num_slices; i++) {
5166 			rstat = &mgp->ss[i].rx_stats;
5167 			tmp += rstat->multircv;
5168 		}
5169 		*val = tmp;
5170 		break;
5171 
5172 	case MAC_STAT_BRDCSTRCV:
5173 		for (i = 0; i < mgp->num_slices; i++) {
5174 			rstat = &mgp->ss[i].rx_stats;
5175 			tmp += rstat->brdcstrcv;
5176 		}
5177 		*val = tmp;
5178 		break;
5179 
5180 	case MAC_STAT_MULTIXMT:
5181 		for (i = 0; i < mgp->num_slices; i++) {
5182 			tstat = &mgp->ss[i].tx.stats;
5183 			tmp += tstat->multixmt;
5184 		}
5185 		*val = tmp;
5186 		break;
5187 
5188 	case MAC_STAT_BRDCSTXMT:
5189 		for (i = 0; i < mgp->num_slices; i++) {
5190 			tstat = &mgp->ss[i].tx.stats;
5191 			tmp += tstat->brdcstxmt;
5192 		}
5193 		*val = tmp;
5194 		break;
5195 
5196 	case MAC_STAT_NORCVBUF:
5197 		tmp = ntohl(fw_stats->dropped_no_big_buffer);
5198 		tmp += ntohl(fw_stats->dropped_no_small_buffer);
5199 		tmp += ntohl(fw_stats->dropped_link_overflow);
5200 		for (i = 0; i < mgp->num_slices; i++) {
5201 			ss = &mgp->ss[i];
5202 			tmp += MYRI10GE_SLICE_STAT(rx_big_nobuf);
5203 			tmp += MYRI10GE_SLICE_STAT(rx_small_nobuf);
5204 		}
5205 		*val = tmp;
5206 		break;
5207 
5208 	case MAC_STAT_IERRORS:
5209 		tmp += ntohl(fw_stats->dropped_bad_crc32);
5210 		tmp += ntohl(fw_stats->dropped_bad_phy);
5211 		tmp += ntohl(fw_stats->dropped_runt);
5212 		tmp += ntohl(fw_stats->dropped_overrun);
5213 		*val = tmp;
5214 		break;
5215 
5216 	case MAC_STAT_OERRORS:
5217 		for (i = 0; i < mgp->num_slices; i++) {
5218 			ss = &mgp->ss[i];
5219 			tmp += MYRI10GE_SLICE_STAT(xmit_lsobadflags);
5220 			tmp += MYRI10GE_SLICE_STAT(xmit_err);
5221 		}
5222 		*val = tmp;
5223 		break;
5224 
5225 	case MAC_STAT_RBYTES:
5226 		for (i = 0; i < mgp->num_slices; i++) {
5227 			rstat = &mgp->ss[i].rx_stats;
5228 			tmp += rstat->ibytes;
5229 		}
5230 		*val = tmp;
5231 		break;
5232 
5233 	case MAC_STAT_IPACKETS:
5234 		for (i = 0; i < mgp->num_slices; i++) {
5235 			rstat = &mgp->ss[i].rx_stats;
5236 			tmp += rstat->ipackets;
5237 		}
5238 		*val = tmp;
5239 		break;
5240 
5241 	case MAC_STAT_OBYTES:
5242 		for (i = 0; i < mgp->num_slices; i++) {
5243 			tstat = &mgp->ss[i].tx.stats;
5244 			tmp += tstat->obytes;
5245 		}
5246 		*val = tmp;
5247 		break;
5248 
5249 	case MAC_STAT_OPACKETS:
5250 		for (i = 0; i < mgp->num_slices; i++) {
5251 			tstat = &mgp->ss[i].tx.stats;
5252 			tmp += tstat->opackets;
5253 		}
5254 		*val = tmp;
5255 		break;
5256 
5257 	case ETHER_STAT_TOOLONG_ERRORS:
5258 		*val = ntohl(fw_stats->dropped_overrun);
5259 		break;
5260 
5261 #ifdef SOLARIS_S11
5262 	case ETHER_STAT_TOOSHORT_ERRORS:
5263 		*val = ntohl(fw_stats->dropped_runt);
5264 		break;
5265 #endif
5266 
5267 	case ETHER_STAT_LINK_PAUSE:
5268 		*val = mgp->pause;
5269 		break;
5270 
5271 	case ETHER_STAT_LINK_AUTONEG:
5272 		*val = 1;
5273 		break;
5274 
5275 	case ETHER_STAT_LINK_DUPLEX:
5276 		*val = LINK_DUPLEX_FULL;
5277 		break;
5278 
5279 	default:
5280 		return (ENOTSUP);
5281 	}
5282 
5283 	return (0);
5284 }
5285 
5286 static mac_callbacks_t myri10ge_m_callbacks = {
5287 	(MC_IOCTL | MC_GETCAPAB),
5288 	myri10ge_m_stat,
5289 	myri10ge_m_start,
5290 	myri10ge_m_stop,
5291 	myri10ge_m_promisc,
5292 	myri10ge_m_multicst,
5293 	NULL,
5294 	NULL,
5295 	myri10ge_m_ioctl,
5296 	myri10ge_m_getcapab
5297 };
5298 
5299 
5300 static int
5301 myri10ge_probe_slices(struct myri10ge_priv *mgp)
5302 {
5303 	myri10ge_cmd_t cmd;
5304 	int status;
5305 
5306 	mgp->num_slices = 1;
5307 
5308 	/* hit the board with a reset to ensure it is alive */
5309 	(void) memset(&cmd, 0, sizeof (cmd));
5310 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_RESET, &cmd);
5311 	if (status != 0) {
5312 		cmn_err(CE_WARN, "%s: failed reset\n", mgp->name);
5313 		return (ENXIO);
5314 	}
5315 
5316 	if (myri10ge_use_msix == 0)
5317 		return (0);
5318 
5319 	/* tell it the size of the interrupt queues */
5320 	cmd.data0 = mgp->max_intr_slots * sizeof (struct mcp_slot);
5321 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
5322 	if (status != 0) {
5323 		cmn_err(CE_WARN, "%s: failed MXGEFW_CMD_SET_INTRQ_SIZE\n",
5324 		    mgp->name);
5325 		return (ENXIO);
5326 	}
5327 
5328 	/* ask the maximum number of slices it supports */
5329 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_GET_MAX_RSS_QUEUES,
5330 	    &cmd);
5331 	if (status != 0)
5332 		return (0);
5333 
5334 	mgp->num_slices = cmd.data0;
5335 
5336 	/*
5337 	 * if the admin did not specify a limit to how many
5338 	 * slices we should use, cap it automatically to the
5339 	 * number of CPUs currently online
5340 	 */
5341 	if (myri10ge_max_slices == -1)
5342 		myri10ge_max_slices = ncpus;
5343 
5344 	if (mgp->num_slices > myri10ge_max_slices)
5345 		mgp->num_slices = myri10ge_max_slices;
5346 
5347 
5348 	/*
5349 	 * Now try to allocate as many MSI-X vectors as we have
5350 	 * slices. We give up on MSI-X if we can only get a single
5351 	 * vector.
5352 	 */
5353 	while (mgp->num_slices > 1) {
5354 		/* make sure it is a power of two */
5355 		while (mgp->num_slices & (mgp->num_slices - 1))
5356 			mgp->num_slices--;
5357 		if (mgp->num_slices == 1)
5358 			return (0);
5359 
5360 		status = myri10ge_add_intrs(mgp, 0);
5361 		if (status == 0) {
5362 			myri10ge_rem_intrs(mgp, 0);
5363 			if (mgp->intr_cnt == mgp->num_slices) {
5364 				if (myri10ge_verbose)
5365 					printf("Got %d slices!\n",
5366 					    mgp->num_slices);
5367 				return (0);
5368 			}
5369 			mgp->num_slices = mgp->intr_cnt;
5370 		} else {
5371 			mgp->num_slices = mgp->num_slices / 2;
5372 		}
5373 	}
5374 
5375 	if (myri10ge_verbose)
5376 		printf("Got %d slices\n", mgp->num_slices);
5377 	return (0);
5378 }
5379 
5380 static void
5381 myri10ge_lro_free(struct myri10ge_slice_state *ss)
5382 {
5383 	struct lro_entry *lro;
5384 
5385 	while (ss->lro_free != NULL) {
5386 		lro = ss->lro_free;
5387 		ss->lro_free = lro->next;
5388 		kmem_free(lro, sizeof (*lro));
5389 	}
5390 }
5391 
5392 static void
5393 myri10ge_lro_alloc(struct myri10ge_slice_state *ss)
5394 {
5395 	struct lro_entry *lro;
5396 	int idx;
5397 
5398 	ss->lro_free = NULL;
5399 	ss->lro_active = NULL;
5400 
5401 	for (idx = 0; idx < myri10ge_lro_cnt; idx++) {
5402 		lro = kmem_zalloc(sizeof (*lro), KM_SLEEP);
5403 		if (lro == NULL)
5404 			continue;
5405 		lro->next = ss->lro_free;
5406 		ss->lro_free = lro;
5407 	}
5408 }
5409 
5410 static void
5411 myri10ge_free_slices(struct myri10ge_priv *mgp)
5412 {
5413 	struct myri10ge_slice_state *ss;
5414 	size_t bytes;
5415 	int i;
5416 
5417 	if (mgp->ss == NULL)
5418 		return;
5419 
5420 	for (i = 0; i < mgp->num_slices; i++) {
5421 		ss = &mgp->ss[i];
5422 		if (ss->rx_done.entry == NULL)
5423 			continue;
5424 		myri10ge_dma_free(&ss->rx_done.dma);
5425 		ss->rx_done.entry = NULL;
5426 		if (ss->fw_stats == NULL)
5427 			continue;
5428 		myri10ge_dma_free(&ss->fw_stats_dma);
5429 		ss->fw_stats = NULL;
5430 		mutex_destroy(&ss->rx_lock);
5431 		mutex_destroy(&ss->tx.lock);
5432 		mutex_destroy(&ss->tx.handle_lock);
5433 		mutex_destroy(&ss->poll_lock);
5434 		myri10ge_jpool_fini(ss);
5435 		myri10ge_slice_stat_destroy(ss);
5436 		myri10ge_lro_free(ss);
5437 	}
5438 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5439 	kmem_free(mgp->ss, bytes);
5440 	mgp->ss = NULL;
5441 }
5442 
5443 
5444 static int
5445 myri10ge_alloc_slices(struct myri10ge_priv *mgp)
5446 {
5447 	struct myri10ge_slice_state *ss;
5448 	size_t bytes;
5449 	int i;
5450 
5451 	bytes = sizeof (*mgp->ss) * mgp->num_slices;
5452 	mgp->ss = kmem_zalloc(bytes, KM_SLEEP);
5453 	if (mgp->ss == NULL)
5454 		return (ENOMEM);
5455 	for (i = 0; i < mgp->num_slices; i++) {
5456 		ss = &mgp->ss[i];
5457 
5458 		ss->mgp = mgp;
5459 
5460 		/* allocate the per-slice firmware stats */
5461 		bytes = sizeof (*ss->fw_stats);
5462 		ss->fw_stats = (mcp_irq_data_t *)(void *)
5463 		    myri10ge_dma_alloc(mgp->dip, bytes,
5464 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5465 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5466 		    &ss->fw_stats_dma, 1, DDI_DMA_DONTWAIT);
5467 		if (ss->fw_stats == NULL)
5468 			goto abort;
5469 		(void) memset(ss->fw_stats, 0, bytes);
5470 
5471 		/* allocate rx done ring */
5472 		bytes = mgp->max_intr_slots *
5473 		    sizeof (*ss->rx_done.entry);
5474 		ss->rx_done.entry = (mcp_slot_t *)(void *)
5475 		    myri10ge_dma_alloc(mgp->dip, bytes,
5476 		    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5477 		    DDI_DMA_CONSISTENT, DDI_DMA_READ|DDI_DMA_CONSISTENT,
5478 		    &ss->rx_done.dma, 1, DDI_DMA_DONTWAIT);
5479 		if (ss->rx_done.entry == NULL) {
5480 			goto abort;
5481 		}
5482 		(void) memset(ss->rx_done.entry, 0, bytes);
5483 		mutex_init(&ss->rx_lock,   NULL, MUTEX_DEFAULT, mgp->icookie);
5484 		mutex_init(&ss->tx.lock,   NULL, MUTEX_DEFAULT, NULL);
5485 		mutex_init(&ss->tx.handle_lock,   NULL, MUTEX_DEFAULT, NULL);
5486 		mutex_init(&ss->poll_lock,   NULL, MUTEX_DEFAULT, NULL);
5487 		myri10ge_jpool_init(ss);
5488 		(void) myri10ge_slice_stat_init(ss);
5489 		myri10ge_lro_alloc(ss);
5490 	}
5491 
5492 	return (0);
5493 
5494 abort:
5495 	myri10ge_free_slices(mgp);
5496 	return (ENOMEM);
5497 }
5498 
5499 static int
5500 myri10ge_save_msi_state(struct myri10ge_priv *mgp,
5501     ddi_acc_handle_t handle)
5502 {
5503 	uint8_t ptr;
5504 	int err;
5505 
5506 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5507 	if (err != 0) {
5508 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5509 		    mgp->name);
5510 		return (DDI_FAILURE);
5511 	}
5512 	mgp->pci_saved_state.msi_ctrl =
5513 	    pci_config_get16(handle, ptr + PCI_MSI_CTRL);
5514 	mgp->pci_saved_state.msi_addr_low =
5515 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET);
5516 	mgp->pci_saved_state.msi_addr_high =
5517 	    pci_config_get32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4);
5518 	mgp->pci_saved_state.msi_data_32 =
5519 	    pci_config_get16(handle, ptr + PCI_MSI_32BIT_DATA);
5520 	mgp->pci_saved_state.msi_data_64 =
5521 	    pci_config_get16(handle, ptr + PCI_MSI_64BIT_DATA);
5522 	return (DDI_SUCCESS);
5523 }
5524 
5525 static int
5526 myri10ge_restore_msi_state(struct myri10ge_priv *mgp,
5527     ddi_acc_handle_t handle)
5528 {
5529 	uint8_t ptr;
5530 	int err;
5531 
5532 	err = myri10ge_find_cap(handle, &ptr, PCI_CAP_ID_MSI);
5533 	if (err != 0) {
5534 		cmn_err(CE_WARN, "%s: could not find MSI cap\n",
5535 		    mgp->name);
5536 		return (DDI_FAILURE);
5537 	}
5538 
5539 	pci_config_put16(handle, ptr + PCI_MSI_CTRL,
5540 	    mgp->pci_saved_state.msi_ctrl);
5541 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET,
5542 	    mgp->pci_saved_state.msi_addr_low);
5543 	pci_config_put32(handle, ptr + PCI_MSI_ADDR_OFFSET + 4,
5544 	    mgp->pci_saved_state.msi_addr_high);
5545 	pci_config_put16(handle, ptr + PCI_MSI_32BIT_DATA,
5546 	    mgp->pci_saved_state.msi_data_32);
5547 	pci_config_put16(handle, ptr + PCI_MSI_64BIT_DATA,
5548 	    mgp->pci_saved_state.msi_data_64);
5549 
5550 	return (DDI_SUCCESS);
5551 }
5552 
5553 static int
5554 myri10ge_save_pci_state(struct myri10ge_priv *mgp)
5555 {
5556 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5557 	int i;
5558 	int err = DDI_SUCCESS;
5559 
5560 
5561 	/* Save the non-extended PCI config space 32-bits at a time */
5562 	for (i = 0; i < 16; i++)
5563 		mgp->pci_saved_state.base[i] =
5564 		    pci_config_get32(handle, i*4);
5565 
5566 	/* now save MSI interrupt state *, if needed */
5567 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5568 		err = myri10ge_save_msi_state(mgp, handle);
5569 
5570 	return (err);
5571 }
5572 
5573 static int
5574 myri10ge_restore_pci_state(struct myri10ge_priv *mgp)
5575 {
5576 	ddi_acc_handle_t handle = mgp->cfg_hdl;
5577 	int i;
5578 	int err = DDI_SUCCESS;
5579 
5580 
5581 	/* Restore the non-extended PCI config space 32-bits at a time */
5582 	for (i = 15; i >= 0; i--)
5583 		pci_config_put32(handle, i*4, mgp->pci_saved_state.base[i]);
5584 
5585 	/* now restore MSI interrupt state *, if needed */
5586 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_MSI)
5587 		err = myri10ge_restore_msi_state(mgp, handle);
5588 
5589 	if (mgp->max_read_request_4k)
5590 		(void) myri10ge_set_max_readreq(handle);
5591 	return (err);
5592 }
5593 
5594 
5595 static int
5596 myri10ge_suspend(dev_info_t *dip)
5597 {
5598 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5599 	int status;
5600 
5601 	if (mgp == NULL) {
5602 		cmn_err(CE_WARN, "null dip in myri10ge_suspend\n");
5603 		return (DDI_FAILURE);
5604 	}
5605 	if (mgp->dip != dip) {
5606 		cmn_err(CE_WARN, "bad dip in myri10ge_suspend\n");
5607 		return (DDI_FAILURE);
5608 	}
5609 	mutex_enter(&mgp->intrlock);
5610 	if (mgp->running == MYRI10GE_ETH_RUNNING) {
5611 		mgp->running = MYRI10GE_ETH_STOPPING;
5612 		mutex_exit(&mgp->intrlock);
5613 		(void) untimeout(mgp->timer_id);
5614 		mutex_enter(&mgp->intrlock);
5615 		myri10ge_stop_locked(mgp);
5616 		mgp->running = MYRI10GE_ETH_SUSPENDED_RUNNING;
5617 	}
5618 	status = myri10ge_save_pci_state(mgp);
5619 	mutex_exit(&mgp->intrlock);
5620 	return (status);
5621 }
5622 
5623 static int
5624 myri10ge_resume(dev_info_t *dip)
5625 {
5626 	struct myri10ge_priv *mgp = ddi_get_driver_private(dip);
5627 	int status = DDI_SUCCESS;
5628 
5629 	if (mgp == NULL) {
5630 		cmn_err(CE_WARN, "null dip in myri10ge_resume\n");
5631 		return (DDI_FAILURE);
5632 	}
5633 	if (mgp->dip != dip) {
5634 		cmn_err(CE_WARN, "bad dip in myri10ge_resume\n");
5635 		return (DDI_FAILURE);
5636 	}
5637 
5638 	mutex_enter(&mgp->intrlock);
5639 	status = myri10ge_restore_pci_state(mgp);
5640 	if (status == DDI_SUCCESS &&
5641 	    mgp->running == MYRI10GE_ETH_SUSPENDED_RUNNING) {
5642 		status = myri10ge_start_locked(mgp);
5643 	}
5644 	mutex_exit(&mgp->intrlock);
5645 	if (status != DDI_SUCCESS)
5646 		return (status);
5647 
5648 	/* start the watchdog timer */
5649 	mgp->timer_id = timeout(myri10ge_watchdog, mgp,
5650 	    mgp->timer_ticks);
5651 	return (DDI_SUCCESS);
5652 }
5653 
5654 static int
5655 myri10ge_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
5656 {
5657 
5658 	struct myri10ge_priv *mgp;
5659 	mac_register_t *macp, *omacp;
5660 	ddi_acc_handle_t handle;
5661 	uint32_t csr, hdr_offset;
5662 	int status, span, link_width, max_read_request_4k;
5663 	unsigned long bus_number, dev_number, func_number;
5664 	size_t bytes;
5665 	offset_t ss_offset;
5666 	uint8_t vso;
5667 
5668 	if (cmd == DDI_RESUME) {
5669 		return (myri10ge_resume(dip));
5670 	}
5671 
5672 	if (cmd != DDI_ATTACH)
5673 		return (DDI_FAILURE);
5674 	if (pci_config_setup(dip, &handle) != DDI_SUCCESS)
5675 		return (DDI_FAILURE);
5676 
5677 	/* enable busmater and io space access */
5678 	csr = pci_config_get32(handle, PCI_CONF_COMM);
5679 	pci_config_put32(handle, PCI_CONF_COMM,
5680 	    (csr |PCI_COMM_ME|PCI_COMM_MAE));
5681 	status = myri10ge_read_pcie_link_width(handle, &link_width);
5682 	if (status != 0) {
5683 		cmn_err(CE_WARN, "could not read link width!\n");
5684 		link_width = 0;
5685 	}
5686 	max_read_request_4k = !myri10ge_set_max_readreq(handle);
5687 	status = myri10ge_find_cap(handle, &vso, PCI_CAP_ID_VS);
5688 	if (status != 0)
5689 		goto abort_with_cfg_hdl;
5690 	if ((omacp = mac_alloc(MAC_VERSION)) == NULL)
5691 		goto abort_with_cfg_hdl;
5692 	/*
5693 	 * XXXX Hack: mac_register_t grows in newer kernels.  To be
5694 	 * able to write newer fields, such as m_margin, without
5695 	 * writing outside allocated memory, we allocate our own macp
5696 	 * and pass that to mac_register()
5697 	 */
5698 	macp = kmem_zalloc(sizeof (*macp) * 8, KM_SLEEP);
5699 	macp->m_version = omacp->m_version;
5700 
5701 	if ((mgp = (struct myri10ge_priv *)
5702 	    kmem_zalloc(sizeof (*mgp), KM_SLEEP)) == NULL) {
5703 		goto abort_with_macinfo;
5704 	}
5705 	ddi_set_driver_private(dip, mgp);
5706 
5707 	/* setup device name for log messages */
5708 	(void) sprintf(mgp->name, "myri10ge%d", ddi_get_instance(dip));
5709 
5710 	mutex_enter(&myri10ge_param_lock);
5711 	myri10ge_get_props(dip);
5712 	mgp->intr_coal_delay = myri10ge_intr_coal_delay;
5713 	mgp->pause = myri10ge_flow_control;
5714 	mutex_exit(&myri10ge_param_lock);
5715 
5716 	mgp->max_read_request_4k = max_read_request_4k;
5717 	mgp->pcie_link_width = link_width;
5718 	mgp->running = MYRI10GE_ETH_STOPPED;
5719 	mgp->vso = vso;
5720 	mgp->dip = dip;
5721 	mgp->cfg_hdl = handle;
5722 
5723 	mgp->timer_ticks = 5 * drv_usectohz(1000000); /* 5 seconds */
5724 	myri10ge_test_physical(dip);
5725 
5726 	/* allocate command page */
5727 	bytes = sizeof (*mgp->cmd);
5728 	mgp->cmd = (mcp_cmd_response_t *)
5729 	    (void *)myri10ge_dma_alloc(dip, bytes,
5730 	    &myri10ge_misc_dma_attr, &myri10ge_dev_access_attr,
5731 	    DDI_DMA_CONSISTENT,	DDI_DMA_RDWR|DDI_DMA_CONSISTENT,
5732 	    &mgp->cmd_dma, 1, DDI_DMA_DONTWAIT);
5733 	if (mgp->cmd == NULL)
5734 		goto abort_with_mgp;
5735 
5736 	(void) myri10ge_reg_set(dip, &mgp->reg_set, &span, &bus_number,
5737 	    &dev_number, &func_number);
5738 	if (myri10ge_verbose)
5739 		printf("%s at %ld:%ld:%ld attaching\n", mgp->name,
5740 		    bus_number, dev_number, func_number);
5741 	status = ddi_regs_map_setup(dip, mgp->reg_set, (caddr_t *)&mgp->sram,
5742 	    (offset_t)0, (offset_t)span,  &myri10ge_dev_access_attr,
5743 	    &mgp->io_handle);
5744 	if (status != DDI_SUCCESS) {
5745 		cmn_err(CE_WARN, "%s: couldn't map memory space", mgp->name);
5746 		printf("%s: reg_set = %d, span = %d, status = %d",
5747 		    mgp->name, mgp->reg_set, span, status);
5748 		goto abort_with_mgp;
5749 	}
5750 
5751 	hdr_offset = *(uint32_t *)(void*)(mgp->sram +  MCP_HEADER_PTR_OFFSET);
5752 	hdr_offset = ntohl(hdr_offset) & 0xffffc;
5753 	ss_offset = hdr_offset +
5754 	    offsetof(struct mcp_gen_header, string_specs);
5755 	mgp->sram_size = ntohl(*(uint32_t *)(void*)(mgp->sram + ss_offset));
5756 	myri10ge_pio_copy32(mgp->eeprom_strings,
5757 	    (uint32_t *)(void*)((char *)mgp->sram + mgp->sram_size),
5758 	    MYRI10GE_EEPROM_STRINGS_SIZE);
5759 	(void) memset(mgp->eeprom_strings +
5760 	    MYRI10GE_EEPROM_STRINGS_SIZE - 2, 0, 2);
5761 
5762 	status = myri10ge_read_mac_addr(mgp);
5763 	if (status) {
5764 		goto abort_with_mapped;
5765 	}
5766 
5767 	status = myri10ge_select_firmware(mgp);
5768 	if (status != 0) {
5769 		cmn_err(CE_WARN, "%s: failed to load firmware\n", mgp->name);
5770 		goto abort_with_mapped;
5771 	}
5772 
5773 	status = myri10ge_probe_slices(mgp);
5774 	if (status != 0) {
5775 		cmn_err(CE_WARN, "%s: failed to probe slices\n", mgp->name);
5776 		goto abort_with_dummy_rdma;
5777 	}
5778 
5779 	status = myri10ge_alloc_slices(mgp);
5780 	if (status != 0) {
5781 		cmn_err(CE_WARN, "%s: failed to alloc slices\n", mgp->name);
5782 		goto abort_with_dummy_rdma;
5783 	}
5784 
5785 	/* add the interrupt handler */
5786 	status = myri10ge_add_intrs(mgp, 1);
5787 	if (status != 0) {
5788 		cmn_err(CE_WARN, "%s: Failed to add interrupt\n",
5789 		    mgp->name);
5790 		goto abort_with_slices;
5791 	}
5792 
5793 	/* now that we have an iblock_cookie, init the mutexes */
5794 	mutex_init(&mgp->cmd_lock, NULL, MUTEX_DRIVER, mgp->icookie);
5795 	mutex_init(&mgp->intrlock, NULL, MUTEX_DRIVER, mgp->icookie);
5796 
5797 
5798 	status = myri10ge_nic_stat_init(mgp);
5799 	if (status != DDI_SUCCESS)
5800 		goto abort_with_interrupts;
5801 	status = myri10ge_info_init(mgp);
5802 	if (status != DDI_SUCCESS)
5803 		goto abort_with_stats;
5804 
5805 	/*
5806 	 *	Initialize  GLD state
5807 	 */
5808 
5809 	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
5810 	macp->m_driver = mgp;
5811 	macp->m_dip = dip;
5812 	macp->m_src_addr = mgp->mac_addr;
5813 	macp->m_callbacks = &myri10ge_m_callbacks;
5814 	macp->m_min_sdu = 0;
5815 	macp->m_max_sdu = myri10ge_mtu -
5816 	    (sizeof (struct ether_header) + MXGEFW_PAD + VLAN_TAGSZ);
5817 #ifdef SOLARIS_S11
5818 	macp->m_margin = VLAN_TAGSZ;
5819 #endif
5820 	macp->m_v12n = MAC_VIRT_LEVEL1;
5821 	status = mac_register(macp, &mgp->mh);
5822 	if (status != 0) {
5823 		cmn_err(CE_WARN, "%s: mac_register failed with %d\n",
5824 		    mgp->name, status);
5825 		goto abort_with_info;
5826 	}
5827 	myri10ge_ndd_init(mgp);
5828 	if (myri10ge_verbose)
5829 		printf("%s: %s, tx bndry %d, fw %s\n", mgp->name,
5830 		    mgp->intr_type, mgp->tx_boundary, mgp->fw_name);
5831 	mutex_enter(&myri10ge_param_lock);
5832 	mgp->next = mgp_list;
5833 	mgp_list = mgp;
5834 	mutex_exit(&myri10ge_param_lock);
5835 	kmem_free(macp, sizeof (*macp) * 8);
5836 	mac_free(omacp);
5837 	return (DDI_SUCCESS);
5838 
5839 abort_with_info:
5840 	myri10ge_info_destroy(mgp);
5841 
5842 abort_with_stats:
5843 	myri10ge_nic_stat_destroy(mgp);
5844 
5845 abort_with_interrupts:
5846 	mutex_destroy(&mgp->cmd_lock);
5847 	mutex_destroy(&mgp->intrlock);
5848 	myri10ge_rem_intrs(mgp, 1);
5849 
5850 abort_with_slices:
5851 	myri10ge_free_slices(mgp);
5852 
5853 abort_with_dummy_rdma:
5854 	myri10ge_dummy_rdma(mgp, 0);
5855 
5856 abort_with_mapped:
5857 	ddi_regs_map_free(&mgp->io_handle);
5858 
5859 	myri10ge_dma_free(&mgp->cmd_dma);
5860 
5861 abort_with_mgp:
5862 	kmem_free(mgp, sizeof (*mgp));
5863 
5864 abort_with_macinfo:
5865 	kmem_free(macp, sizeof (*macp) * 8);
5866 	mac_free(omacp);
5867 
5868 abort_with_cfg_hdl:
5869 	pci_config_teardown(&handle);
5870 	return (DDI_FAILURE);
5871 
5872 }
5873 
5874 
5875 static int
5876 myri10ge_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
5877 {
5878 	struct myri10ge_priv	*mgp, *tmp;
5879 	int 			status, i, jbufs_alloced;
5880 
5881 	if (cmd == DDI_SUSPEND) {
5882 		status = myri10ge_suspend(dip);
5883 		return (status);
5884 	}
5885 
5886 	if (cmd != DDI_DETACH) {
5887 		return (DDI_FAILURE);
5888 	}
5889 	/* Get the driver private (gld_mac_info_t) structure */
5890 	mgp = ddi_get_driver_private(dip);
5891 
5892 	mutex_enter(&mgp->intrlock);
5893 	jbufs_alloced = 0;
5894 	for (i = 0; i < mgp->num_slices; i++) {
5895 		myri10ge_remove_jbufs(&mgp->ss[i]);
5896 		jbufs_alloced += mgp->ss[i].jpool.num_alloc;
5897 	}
5898 	mutex_exit(&mgp->intrlock);
5899 	if (jbufs_alloced != 0) {
5900 		cmn_err(CE_NOTE, "%s: %d loaned rx buffers remain\n",
5901 		    mgp->name, jbufs_alloced);
5902 		return (DDI_FAILURE);
5903 	}
5904 
5905 	mutex_enter(&myri10ge_param_lock);
5906 	if (mgp->refcnt != 0) {
5907 		mutex_exit(&myri10ge_param_lock);
5908 		cmn_err(CE_NOTE, "%s: %d external refs remain\n",
5909 		    mgp->name, mgp->refcnt);
5910 		return (DDI_FAILURE);
5911 	}
5912 	mutex_exit(&myri10ge_param_lock);
5913 
5914 	status = mac_unregister(mgp->mh);
5915 	if (status != DDI_SUCCESS)
5916 		return (status);
5917 
5918 	myri10ge_ndd_fini(mgp);
5919 	myri10ge_dummy_rdma(mgp, 0);
5920 	myri10ge_nic_stat_destroy(mgp);
5921 	myri10ge_info_destroy(mgp);
5922 
5923 	mutex_destroy(&mgp->cmd_lock);
5924 	mutex_destroy(&mgp->intrlock);
5925 
5926 	myri10ge_rem_intrs(mgp, 1);
5927 
5928 	myri10ge_free_slices(mgp);
5929 	ddi_regs_map_free(&mgp->io_handle);
5930 	myri10ge_dma_free(&mgp->cmd_dma);
5931 	pci_config_teardown(&mgp->cfg_hdl);
5932 
5933 	mutex_enter(&myri10ge_param_lock);
5934 	if (mgp_list == mgp) {
5935 		mgp_list = mgp->next;
5936 	} else {
5937 		tmp = mgp_list;
5938 		while (tmp->next != mgp && tmp->next != NULL)
5939 			tmp = tmp->next;
5940 		if (tmp->next != NULL)
5941 			tmp->next = tmp->next->next;
5942 	}
5943 	kmem_free(mgp, sizeof (*mgp));
5944 	mutex_exit(&myri10ge_param_lock);
5945 	return (DDI_SUCCESS);
5946 }
5947 
/*
 * Helper for quiesce entry point: Interrupt threads are not being
 * scheduled, so we must poll for the confirmation DMA to arrive in
 * the firmware stats block for slice 0.  We're essentially running
 * the guts of the interrupt handler, and just cherry picking the
 * confirmation that the NIC is quiesced (stats->link_down).
 */
5955 
5956 static int
5957 myri10ge_poll_down(struct myri10ge_priv *mgp)
5958 {
5959 	struct myri10ge_slice_state *ss = mgp->ss;
5960 	mcp_irq_data_t *stats = ss->fw_stats;
5961 	int valid;
5962 	int found_down = 0;
5963 
5964 
5965 	/* check for a pending IRQ */
5966 
5967 	if (! *((volatile uint8_t *)& stats->valid))
5968 		return (0);
5969 	valid = stats->valid;
5970 
5971 	/*
5972 	 * Make sure to tell the NIC to lower a legacy IRQ, else
5973 	 * it may have corrupt state after restarting
5974 	 */
5975 
5976 	if (mgp->ddi_intr_type == DDI_INTR_TYPE_FIXED) {
5977 		/* lower legacy IRQ  */
5978 		*mgp->irq_deassert = 0;
5979 		mb();
5980 		/* wait for irq conf DMA */
5981 		while (*((volatile uint8_t *)& stats->valid))
5982 			;
5983 	}
5984 	if (stats->stats_updated && stats->link_down)
5985 		found_down = 1;
5986 
5987 	if (valid & 0x1)
5988 		*ss->irq_claim = BE_32(3);
5989 	*(ss->irq_claim + 1) = BE_32(3);
5990 
5991 	return (found_down);
5992 }
5993 
5994 static int
5995 myri10ge_quiesce(dev_info_t *dip)
5996 {
5997 	struct myri10ge_priv *mgp;
5998 	myri10ge_cmd_t cmd;
5999 	int status, down, i;
6000 
6001 	mgp = ddi_get_driver_private(dip);
6002 	if (mgp == NULL)
6003 		return (DDI_FAILURE);
6004 
6005 	/* if devices was unplumbed, it is guaranteed to be quiescent */
6006 	if (mgp->running == MYRI10GE_ETH_STOPPED)
6007 		return (DDI_SUCCESS);
6008 
6009 	/* send a down CMD to queuesce NIC */
6010 	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
6011 	if (status) {
6012 		cmn_err(CE_WARN, "%s: Couldn't bring down link\n", mgp->name);
6013 		return (DDI_FAILURE);
6014 	}
6015 
6016 	for (i = 0; i < 20; i++) {
6017 		down = myri10ge_poll_down(mgp);
6018 		if (down)
6019 			break;
6020 		delay(drv_usectohz(100000));
6021 		mb();
6022 	}
6023 	if (down)
6024 		return (DDI_SUCCESS);
6025 	return (DDI_FAILURE);
6026 }
6027 
6028 /*
6029  * Distinguish between allocb'ed blocks, and gesballoc'ed attached
6030  * storage.
6031  */
6032 static void
6033 myri10ge_find_lastfree(void)
6034 {
6035 	mblk_t *mp = allocb(1024, 0);
6036 	dblk_t *dbp;
6037 
6038 	if (mp == NULL) {
6039 		cmn_err(CE_WARN, "myri10ge_find_lastfree failed\n");
6040 		return;
6041 	}
6042 	dbp = mp->b_datap;
6043 	myri10ge_db_lastfree = (void *)dbp->db_lastfree;
6044 }
6045 
6046 int
6047 _init(void)
6048 {
6049 	int i;
6050 
6051 	if (myri10ge_verbose)
6052 		cmn_err(CE_NOTE,
6053 		    "Myricom 10G driver (10GbE) version %s loading\n",
6054 		    MYRI10GE_VERSION_STR);
6055 	myri10ge_find_lastfree();
6056 	mac_init_ops(&myri10ge_ops, "myri10ge");
6057 	mutex_init(&myri10ge_param_lock, NULL, MUTEX_DEFAULT, NULL);
6058 	if ((i = mod_install(&modlinkage)) != 0) {
6059 		cmn_err(CE_WARN, "mod_install returned %d\n", i);
6060 		mac_fini_ops(&myri10ge_ops);
6061 		mutex_destroy(&myri10ge_param_lock);
6062 	}
6063 	return (i);
6064 }
6065 
6066 int
6067 _fini(void)
6068 {
6069 	int i;
6070 	i = mod_remove(&modlinkage);
6071 	if (i != 0) {
6072 		return (i);
6073 	}
6074 	mac_fini_ops(&myri10ge_ops);
6075 	mutex_destroy(&myri10ge_param_lock);
6076 	return (0);
6077 }
6078 
6079 int
6080 _info(struct modinfo *modinfop)
6081 {
6082 	return (mod_info(&modlinkage, modinfop));
6083 }
6084 
6085 
6086 /*
6087  *  This file uses MyriGE driver indentation.
6088  *
6089  * Local Variables:
6090  * c-file-style:"sun"
6091  * tab-width:8
6092  * End:
6093  */
6094