xref: /illumos-gate/usr/src/uts/common/io/e1000g/e1000g_tx.c (revision 24fe0b3bf671e123467ce1df0b67cadd3614c8e4)
1 /*
2  * This file is provided under a CDDLv1 license.  When using or
3  * redistributing this file, you may do so under this license.
4  * In redistributing this file this license must be included
5  * and no other modification of this header file is permitted.
6  *
7  * CDDL LICENSE SUMMARY
8  *
9  * Copyright(c) 1999 - 2009 Intel Corporation. All rights reserved.
10  *
11  * The contents of this file are subject to the terms of Version
12  * 1.0 of the Common Development and Distribution License (the "License").
13  *
14  * You should have received a copy of the License with this software.
15  * You can obtain a copy of the License at
16  *	http://www.opensolaris.org/os/licensing.
17  * See the License for the specific language governing permissions
18  * and limitations under the License.
19  */
20 
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * **********************************************************************
28  *									*
29  * Module Name:								*
30  *   e1000g_tx.c							*
31  *									*
32  * Abstract:								*
33  *   This file contains some routines that take care of Transmit,	*
34  *   make the hardware to send the data pointed by the packet out	*
35  *   on to the physical medium.						*
36  *									*
37  * **********************************************************************
38  */
39 
40 #include "e1000g_sw.h"
41 #include "e1000g_debug.h"
42 
/*
 * Forward declarations of the transmit routines that are private to
 * this file.
 */
static boolean_t e1000g_send(struct e1000g *, mblk_t *);
static int e1000g_tx_copy(e1000g_tx_ring_t *,
    p_tx_sw_packet_t, mblk_t *, boolean_t);
static int e1000g_tx_bind(e1000g_tx_ring_t *,
    p_tx_sw_packet_t, mblk_t *);
static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t);
static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *);
static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *,
    context_data_t *);
static void e1000g_fill_context_descriptor(context_data_t *,
    struct e1000_context_desc *);
static int e1000g_fill_tx_desc(e1000g_tx_ring_t *,
    p_tx_sw_packet_t, uint64_t, size_t);
static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length,
    p_desc_array_t desc_array);
static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t);
static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t);
static void e1000g_82547_timeout(void *);
static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *);
static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *);

/*
 * Request inlining of the routines listed below, but only when
 * E1000G_DEBUG is not defined; debug builds keep them out of line.
 */
#ifndef E1000G_DEBUG
#pragma inline(e1000g_tx_copy)
#pragma inline(e1000g_tx_bind)
#pragma inline(e1000g_retrieve_context)
#pragma inline(e1000g_check_context)
#pragma inline(e1000g_fill_tx_ring)
#pragma inline(e1000g_fill_context_descriptor)
#pragma inline(e1000g_fill_tx_desc)
#pragma inline(e1000g_fill_82544_desc)
#pragma inline(e1000g_tx_workaround_PCIX_82544)
#pragma inline(e1000g_tx_workaround_jumbo_82544)
#pragma inline(e1000g_free_tx_swpkt)
#endif
77 
78 /*
79  * e1000g_free_tx_swpkt	- free up the tx sw packet
80  *
81  * Unbind the previously bound DMA handle for a given
82  * transmit sw packet. And reset the sw packet data.
83  */
84 void
85 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet)
86 {
87 	switch (packet->data_transfer_type) {
88 	case USE_BCOPY:
89 		packet->tx_buf->len = 0;
90 		break;
91 #ifdef __sparc
92 	case USE_DVMA:
93 		dvma_unload(packet->tx_dma_handle, 0, -1);
94 		break;
95 #endif
96 	case USE_DMA:
97 		(void) ddi_dma_unbind_handle(packet->tx_dma_handle);
98 		break;
99 	default:
100 		break;
101 	}
102 
103 	/*
104 	 * The mblk has been stripped off the sw packet
105 	 * and will be freed in a triggered soft intr.
106 	 */
107 	ASSERT(packet->mp == NULL);
108 
109 	packet->data_transfer_type = USE_NONE;
110 	packet->num_mblk_frag = 0;
111 	packet->num_desc = 0;
112 }
113 
114 mblk_t *
115 e1000g_m_tx(void *arg, mblk_t *mp)
116 {
117 	struct e1000g *Adapter = (struct e1000g *)arg;
118 	mblk_t *next;
119 
120 	rw_enter(&Adapter->chip_lock, RW_READER);
121 
122 	if ((Adapter->e1000g_state & E1000G_SUSPENDED) ||
123 	    !(Adapter->e1000g_state & E1000G_STARTED) ||
124 	    (Adapter->link_state != LINK_STATE_UP)) {
125 		freemsgchain(mp);
126 		mp = NULL;
127 	}
128 
129 	while (mp != NULL) {
130 		next = mp->b_next;
131 		mp->b_next = NULL;
132 
133 		if (!e1000g_send(Adapter, mp)) {
134 			mp->b_next = next;
135 			break;
136 		}
137 
138 		mp = next;
139 	}
140 
141 	rw_exit(&Adapter->chip_lock);
142 	return (mp);
143 }
144 
145 /*
146  * e1000g_send -  send packets onto the wire
147  *
148  * Called from e1000g_m_tx with an mblk ready to send. this
149  * routine sets up the transmit descriptors and sends data to
150  * the wire. It also pushes the just transmitted packet to
151  * the used tx sw packet list.
152  */
153 static boolean_t
154 e1000g_send(struct e1000g *Adapter, mblk_t *mp)
155 {
156 	p_tx_sw_packet_t packet;
157 	LIST_DESCRIBER pending_list;
158 	size_t len;
159 	size_t msg_size;
160 	uint32_t frag_count;
161 	int desc_count;
162 	uint32_t desc_total;
163 	uint32_t bcopy_thresh;
164 	uint32_t hdr_frag_len;
165 	boolean_t tx_undersize_flag;
166 	mblk_t *nmp;
167 	mblk_t *tmp;
168 	mblk_t *new_mp;
169 	mblk_t *pre_mp;
170 	e1000g_tx_ring_t *tx_ring;
171 	context_data_t cur_context;
172 
173 	tx_ring = Adapter->tx_ring;
174 	bcopy_thresh = Adapter->tx_bcopy_thresh;
175 
176 	/* Get the total size and frags number of the message */
177 	tx_undersize_flag = B_FALSE;
178 	frag_count = 0;
179 	msg_size = 0;
180 	for (nmp = mp; nmp; nmp = nmp->b_cont) {
181 		frag_count++;
182 		msg_size += MBLKL(nmp);
183 	}
184 
185 	/* retrieve and compute information for context descriptor */
186 	if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) {
187 		freemsg(mp);
188 		return (B_TRUE);
189 	}
190 
191 	/*
192 	 * Make sure the packet is less than the allowed size
193 	 */
194 	if (!cur_context.lso_flag &&
195 	    (msg_size > Adapter->max_frame_size - ETHERFCSL)) {
196 		/*
197 		 * For the over size packet, we'll just drop it.
198 		 * So we return B_TRUE here.
199 		 */
200 		E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL,
201 		    "Tx packet out of bound. length = %d \n", msg_size);
202 		E1000G_STAT(tx_ring->stat_over_size);
203 		freemsg(mp);
204 		return (B_TRUE);
205 	}
206 
207 	/*
208 	 * Check and reclaim tx descriptors.
209 	 * This low water mark check should be done all the time as
210 	 * Transmit interrupt delay can produce Transmit interrupts little
211 	 * late and that may cause few problems related to reaping Tx
212 	 * Descriptors... As you may run short of them before getting any
213 	 * transmit interrupt...
214 	 */
215 	if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
216 		(void) e1000g_recycle(tx_ring);
217 		E1000G_DEBUG_STAT(tx_ring->stat_recycle);
218 
219 		if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
220 			E1000G_DEBUG_STAT(tx_ring->stat_lack_desc);
221 			goto tx_no_resource;
222 		}
223 	}
224 
225 	/*
226 	 * If the message size is less than the minimum ethernet packet size,
227 	 * we'll use bcopy to send it, and padd it to 60 bytes later.
228 	 */
229 	if (msg_size < ETHERMIN) {
230 		E1000G_DEBUG_STAT(tx_ring->stat_under_size);
231 		tx_undersize_flag = B_TRUE;
232 	}
233 
234 	/* Initialize variables */
235 	desc_count = 1;	/* The initial value should be greater than 0 */
236 	desc_total = 0;
237 	QUEUE_INIT_LIST(&pending_list);
238 
239 	/* Process each mblk fragment and fill tx descriptors */
240 	/*
241 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
242 	 * to be within one descriptor. Here we reallocate and refill the
243 	 * the header if it's physical memory non-contiguous.
244 	 */
245 	if (cur_context.lso_flag) {
246 		/* find the last fragment of the header */
247 		len = MBLKL(mp);
248 		ASSERT(len > 0);
249 		nmp = mp;
250 		pre_mp = NULL;
251 		while (len < cur_context.hdr_len) {
252 			pre_mp = nmp;
253 			nmp = nmp->b_cont;
254 			len += MBLKL(nmp);
255 		}
256 		/*
257 		 * If the header and the payload are in different mblks,
258 		 * we simply force the header to be copied into pre-allocated
259 		 * page-aligned buffer.
260 		 */
261 		if (len == cur_context.hdr_len)
262 			goto adjust_threshold;
263 
264 		hdr_frag_len = cur_context.hdr_len - (len - MBLKL(nmp));
265 		/*
266 		 * There are two cases we need to reallocate a mblk for the
267 		 * last header fragment:
268 		 * 1. the header is in multiple mblks and the last fragment
269 		 * share the same mblk with the payload
270 		 * 2. the header is in a single mblk shared with the payload
271 		 * and the header is physical memory non-contiguous
272 		 */
273 		if ((nmp != mp) ||
274 		    (P2NPHASE((uintptr_t)nmp->b_rptr, Adapter->sys_page_sz)
275 		    < cur_context.hdr_len)) {
276 			E1000G_DEBUG_STAT(tx_ring->stat_lso_header_fail);
277 			/*
278 			 * reallocate the mblk for the last header fragment,
279 			 * expect to bcopy into pre-allocated page-aligned
280 			 * buffer
281 			 */
282 			new_mp = allocb(hdr_frag_len, NULL);
283 			if (!new_mp)
284 				return (B_FALSE);
285 			bcopy(nmp->b_rptr, new_mp->b_rptr, hdr_frag_len);
286 			/* link the new header fragment with the other parts */
287 			new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
288 			new_mp->b_cont = nmp;
289 			if (pre_mp)
290 				pre_mp->b_cont = new_mp;
291 			else
292 				mp = new_mp;
293 			nmp->b_rptr += hdr_frag_len;
294 			frag_count ++;
295 		}
296 adjust_threshold:
297 		/*
298 		 * adjust the bcopy threshhold to guarantee
299 		 * the header to use bcopy way
300 		 */
301 		if (bcopy_thresh < cur_context.hdr_len)
302 			bcopy_thresh = cur_context.hdr_len;
303 	}
304 
305 	packet = NULL;
306 	nmp = mp;
307 	while (nmp) {
308 		tmp = nmp->b_cont;
309 
310 		len = MBLKL(nmp);
311 		/* Check zero length mblks */
312 		if (len == 0) {
313 			E1000G_DEBUG_STAT(tx_ring->stat_empty_frags);
314 			/*
315 			 * If there're no packet buffers have been used,
316 			 * or we just completed processing a buffer, then
317 			 * skip the empty mblk fragment.
318 			 * Otherwise, there's still a pending buffer that
319 			 * needs to be processed (tx_copy).
320 			 */
321 			if (desc_count > 0) {
322 				nmp = tmp;
323 				continue;
324 			}
325 		}
326 
327 		/*
328 		 * Get a new TxSwPacket to process mblk buffers.
329 		 */
330 		if (desc_count > 0) {
331 			mutex_enter(&tx_ring->freelist_lock);
332 			packet = (p_tx_sw_packet_t)
333 			    QUEUE_POP_HEAD(&tx_ring->free_list);
334 			mutex_exit(&tx_ring->freelist_lock);
335 
336 			if (packet == NULL) {
337 				E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
338 				    "No Tx SwPacket available\n");
339 				E1000G_STAT(tx_ring->stat_no_swpkt);
340 				goto tx_send_failed;
341 			}
342 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
343 		}
344 
345 		ASSERT(packet);
346 		/*
347 		 * If the size of the fragment is less than the tx_bcopy_thresh
348 		 * we'll use bcopy; Otherwise, we'll use DMA binding.
349 		 */
350 		if ((len <= bcopy_thresh) || tx_undersize_flag) {
351 			desc_count =
352 			    e1000g_tx_copy(tx_ring, packet, nmp,
353 			    tx_undersize_flag);
354 			E1000G_DEBUG_STAT(tx_ring->stat_copy);
355 		} else {
356 			desc_count =
357 			    e1000g_tx_bind(tx_ring, packet, nmp);
358 			E1000G_DEBUG_STAT(tx_ring->stat_bind);
359 		}
360 
361 		if (desc_count > 0)
362 			desc_total += desc_count;
363 		else if (desc_count < 0)
364 			goto tx_send_failed;
365 
366 		nmp = tmp;
367 	}
368 
369 	/* Assign the message to the last sw packet */
370 	ASSERT(packet);
371 	ASSERT(packet->mp == NULL);
372 	packet->mp = mp;
373 
374 	/* Try to recycle the tx descriptors again */
375 	if (tx_ring->tbd_avail < (desc_total + 2)) {
376 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry);
377 		(void) e1000g_recycle(tx_ring);
378 	}
379 
380 	mutex_enter(&tx_ring->tx_lock);
381 
382 	/*
383 	 * If the number of available tx descriptors is not enough for transmit
384 	 * (one redundant descriptor and one hw checksum context descriptor are
385 	 * included), then return failure.
386 	 */
387 	if (tx_ring->tbd_avail < (desc_total + 2)) {
388 		E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
389 		    "No Enough Tx descriptors\n");
390 		E1000G_STAT(tx_ring->stat_no_desc);
391 		mutex_exit(&tx_ring->tx_lock);
392 		goto tx_send_failed;
393 	}
394 
395 	desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context);
396 
397 	mutex_exit(&tx_ring->tx_lock);
398 
399 	ASSERT(desc_count > 0);
400 
401 	/* Send successful */
402 	return (B_TRUE);
403 
404 tx_send_failed:
405 	/*
406 	 * Enable Transmit interrupts, so that the interrupt routine can
407 	 * call mac_tx_update() when transmit descriptors become available.
408 	 */
409 	tx_ring->resched_timestamp = ddi_get_lbolt();
410 	tx_ring->resched_needed = B_TRUE;
411 	if (!Adapter->tx_intr_enable)
412 		e1000g_mask_tx_interrupt(Adapter);
413 
414 	/* Free pending TxSwPackets */
415 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
416 	while (packet) {
417 		packet->mp = NULL;
418 		e1000g_free_tx_swpkt(packet);
419 		packet = (p_tx_sw_packet_t)
420 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
421 	}
422 
423 	/* Return pending TxSwPackets to the "Free" list */
424 	mutex_enter(&tx_ring->freelist_lock);
425 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
426 	mutex_exit(&tx_ring->freelist_lock);
427 
428 	E1000G_STAT(tx_ring->stat_send_fail);
429 
430 	/* Message will be scheduled for re-transmit */
431 	return (B_FALSE);
432 
433 tx_no_resource:
434 	/*
435 	 * Enable Transmit interrupts, so that the interrupt routine can
436 	 * call mac_tx_update() when transmit descriptors become available.
437 	 */
438 	tx_ring->resched_timestamp = ddi_get_lbolt();
439 	tx_ring->resched_needed = B_TRUE;
440 	if (!Adapter->tx_intr_enable)
441 		e1000g_mask_tx_interrupt(Adapter);
442 
443 	/* Message will be scheduled for re-transmit */
444 	return (B_FALSE);
445 }
446 
447 static boolean_t
448 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
449     size_t msg_size)
450 {
451 	uintptr_t ip_start;
452 	uintptr_t tcp_start;
453 	mblk_t *nmp;
454 	uint32_t lsoflags;
455 	uint32_t mss;
456 
457 	bzero(cur_context, sizeof (context_data_t));
458 
459 	/* first check lso information */
460 	lso_info_get(mp, &mss, &lsoflags);
461 
462 	/* retrieve checksum info */
463 	hcksum_retrieve(mp, NULL, NULL, &cur_context->cksum_start,
464 	    &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
465 	/* retrieve ethernet header size */
466 	if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid ==
467 	    htons(ETHERTYPE_VLAN))
468 		cur_context->ether_header_size =
469 		    sizeof (struct ether_vlan_header);
470 	else
471 		cur_context->ether_header_size =
472 		    sizeof (struct ether_header);
473 
474 	if (lsoflags & HW_LSO) {
475 		ASSERT(mss != 0);
476 
477 		/* free the invalid packet */
478 		if (mss == 0 ||
479 		    !((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
480 		    (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
481 			return (B_FALSE);
482 		}
483 		cur_context->mss = (uint16_t)mss;
484 		cur_context->lso_flag = B_TRUE;
485 
486 		/*
487 		 * Some fields are cleared for the hardware to fill
488 		 * in. We don't assume Ethernet header, IP header and
489 		 * TCP header are always in the same mblk fragment,
490 		 * while we assume each header is always within one
491 		 * mblk fragment and Ethernet header is always in the
492 		 * first mblk fragment.
493 		 */
494 		nmp = mp;
495 		ip_start = (uintptr_t)(nmp->b_rptr)
496 		    + cur_context->ether_header_size;
497 		if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
498 			ip_start = (uintptr_t)nmp->b_cont->b_rptr
499 			    + (ip_start - (uintptr_t)(nmp->b_wptr));
500 			nmp = nmp->b_cont;
501 		}
502 		tcp_start = ip_start +
503 		    IPH_HDR_LENGTH((ipha_t *)ip_start);
504 		if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
505 			tcp_start = (uintptr_t)nmp->b_cont->b_rptr
506 			    + (tcp_start - (uintptr_t)(nmp->b_wptr));
507 			nmp = nmp->b_cont;
508 		}
509 		cur_context->hdr_len = cur_context->ether_header_size
510 		    + IPH_HDR_LENGTH((ipha_t *)ip_start)
511 		    + TCP_HDR_LENGTH((tcph_t *)tcp_start);
512 		((ipha_t *)ip_start)->ipha_length = 0;
513 		((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
514 		/* calculate the TCP packet payload length */
515 		cur_context->pay_len = msg_size - cur_context->hdr_len;
516 	}
517 	return (B_TRUE);
518 }
519 
520 static boolean_t
521 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context)
522 {
523 	boolean_t context_reload;
524 	context_data_t *pre_context;
525 	struct e1000g *Adapter;
526 
527 	context_reload = B_FALSE;
528 	pre_context = &tx_ring->pre_context;
529 	Adapter = tx_ring->adapter;
530 
531 	/*
532 	 * The following code determine if the context descriptor is
533 	 * needed to be reloaded. The sequence of the conditions is
534 	 * made by their possibilities of changing.
535 	 */
536 	/*
537 	 * workaround for 82546EB, context descriptor must be reloaded
538 	 * per LSO/hw_cksum packet if LSO is enabled.
539 	 */
540 	if (Adapter->lso_premature_issue &&
541 	    Adapter->lso_enable &&
542 	    (cur_context->cksum_flags != 0)) {
543 
544 		context_reload = B_TRUE;
545 	} else if (cur_context->lso_flag) {
546 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
547 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
548 		    (cur_context->pay_len != pre_context->pay_len) ||
549 		    (cur_context->mss != pre_context->mss) ||
550 		    (cur_context->hdr_len != pre_context->hdr_len) ||
551 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
552 		    (cur_context->cksum_start != pre_context->cksum_start) ||
553 		    (cur_context->ether_header_size !=
554 		    pre_context->ether_header_size)) {
555 
556 			context_reload = B_TRUE;
557 		}
558 	} else if (cur_context->cksum_flags != 0) {
559 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
560 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
561 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
562 		    (cur_context->cksum_start != pre_context->cksum_start) ||
563 		    (cur_context->ether_header_size !=
564 		    pre_context->ether_header_size)) {
565 
566 			context_reload = B_TRUE;
567 		}
568 	}
569 
570 	return (context_reload);
571 }
572 
/*
 * e1000g_fill_tx_ring - write the pending sw packets into the tx ring
 *
 * Copies the address/length pairs prepared in every sw packet of
 * pending_list into hardware descriptors starting at tx_ring->tbd_next,
 * preceded by a context descriptor when e1000g_check_context() asks for
 * a reload. The command bits of the descriptors are then completed, the
 * descriptor memory is synced for the device, the tail register is
 * advanced and the sw packets are moved to the used list.
 *
 * e1000g_send() calls this routine with tx_ring->tx_lock held and only
 * after checking that enough descriptors are available.
 *
 * Returns the number of descriptors consumed, including the context
 * descriptor and any descriptor inserted by the workarounds below.
 */
static int
e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
    context_data_t *cur_context)
{
	struct e1000g *Adapter;
	struct e1000_hw *hw;
	p_tx_sw_packet_t first_packet;
	p_tx_sw_packet_t packet;
	p_tx_sw_packet_t previous_packet;
	boolean_t context_reload;
	struct e1000_tx_desc *first_data_desc;
	struct e1000_tx_desc *next_desc;
	struct e1000_tx_desc *descriptor;
	int desc_count;
	boolean_t buff_overrun_flag;
	int i;

	Adapter = tx_ring->adapter;
	hw = &Adapter->shared;

	desc_count = 0;
	first_packet = NULL;
	first_data_desc = NULL;
	descriptor = NULL;
	first_packet = NULL;
	packet = NULL;
	buff_overrun_flag = B_FALSE;

	next_desc = tx_ring->tbd_next;

	/* Context descriptor reload check */
	context_reload = e1000g_check_context(tx_ring, cur_context);

	if (context_reload) {
		/*
		 * Remember the first sw packet: the context descriptor
		 * written here is accounted to it further down.
		 */
		first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);

		descriptor = next_desc;

		e1000g_fill_context_descriptor(cur_context,
		    (struct e1000_context_desc *)descriptor);

		/* Check the wrap-around case */
		if (descriptor == tx_ring->tbd_last)
			next_desc = tx_ring->tbd_first;
		else
			next_desc++;

		desc_count++;
	}

	/* the first data descriptor follows the context descriptor, if any */
	first_data_desc = next_desc;

	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
	while (packet) {
		ASSERT(packet->num_desc);

		/* one hardware descriptor per entry of the sw packet */
		for (i = 0; i < packet->num_desc; i++) {
			ASSERT(tx_ring->tbd_avail > 0);

			descriptor = next_desc;
			descriptor->buffer_addr =
			    packet->desc[i].address;
			descriptor->lower.data =
			    packet->desc[i].length;

			/* Zero out status */
			descriptor->upper.data = 0;

			descriptor->lower.data |=
			    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
			/* must set RS on every outgoing descriptor */
			descriptor->lower.data |=
			    E1000_TXD_CMD_RS;

			if (cur_context->lso_flag)
				descriptor->lower.data |= E1000_TXD_CMD_TSE;

			/* Check the wrap-around case */
			if (descriptor == tx_ring->tbd_last)
				next_desc = tx_ring->tbd_first;
			else
				next_desc++;

			desc_count++;

			/*
			 * workaround for 82546EB errata 33, hang in PCI-X
			 * systems due to 2k Buffer Overrun during Transmit
			 * Operation. The workaround applies to all the Intel
			 * PCI-X chips.
			 *
			 * Only the first data descriptor is checked: when its
			 * length exceeds the threshold it is cut down to the
			 * threshold and the remaining bytes are described by
			 * an additional descriptor inserted right after it.
			 */
			if (hw->bus.type == e1000_bus_type_pcix &&
			    descriptor == first_data_desc &&
			    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK)
			    > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) {
				/* modified the first descriptor */
				descriptor->lower.data &=
				    ~E1000G_TBD_LENGTH_MASK;
				descriptor->lower.flags.length =
				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;

				/* insert a new descriptor */
				ASSERT(tx_ring->tbd_avail > 0);
				next_desc->buffer_addr =
				    packet->desc[0].address +
				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
				next_desc->lower.data =
				    packet->desc[0].length -
				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;

				/* Zero out status */
				next_desc->upper.data = 0;

				next_desc->lower.data |=
				    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
				/* must set RS on every outgoing descriptor */
				next_desc->lower.data |=
				    E1000_TXD_CMD_RS;

				if (cur_context->lso_flag)
					next_desc->lower.data |=
					    E1000_TXD_CMD_TSE;

				descriptor = next_desc;

				/* Check the wrap-around case */
				if (next_desc == tx_ring->tbd_last)
					next_desc = tx_ring->tbd_first;
				else
					next_desc++;

				desc_count++;
				buff_overrun_flag = B_TRUE;
			}
		}

		/*
		 * Account the descriptor inserted by the overrun workaround
		 * to this sw packet. This is done after the loop because
		 * num_desc is the loop bound above.
		 */
		if (buff_overrun_flag) {
			packet->num_desc++;
			buff_overrun_flag = B_FALSE;
		}

		if (first_packet != NULL) {
			/*
			 * Count the checksum context descriptor for
			 * the first SwPacket.
			 */
			first_packet->num_desc++;
			first_packet = NULL;
		}

		/* time stamp later used by e1000g_recycle() to detect a stall */
		packet->tickstamp = lbolt64;

		previous_packet = packet;
		packet = (p_tx_sw_packet_t)
		    QUEUE_GET_NEXT(pending_list, &packet->Link);
	}

	/*
	 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back
	 *
	 * The last descriptor is shortened by 4 bytes and those 4 bytes are
	 * moved into an additional descriptor appended after it.
	 */
	if (Adapter->lso_premature_issue && cur_context->lso_flag &&
	    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) {
		/* modified the previous descriptor */
		descriptor->lower.data -= 4;

		/* insert a new descriptor */
		ASSERT(tx_ring->tbd_avail > 0);
		/* the lower 20 bits of lower.data is the length field */
		next_desc->buffer_addr =
		    descriptor->buffer_addr +
		    (descriptor->lower.data & E1000G_TBD_LENGTH_MASK);
		next_desc->lower.data = 4;

		/* Zero out status */
		next_desc->upper.data = 0;
		/* It must be part of a LSO packet */
		next_desc->lower.data |=
		    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
		    E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE;

		descriptor = next_desc;

		/* Check the wrap-around case */
		if (descriptor == tx_ring->tbd_last)
			next_desc = tx_ring->tbd_first;
		else
			next_desc++;

		desc_count++;
		/* update the number of descriptors */
		previous_packet->num_desc++;
	}

	/* from here on, descriptor is the last descriptor of the message */
	ASSERT(descriptor);

	/* request the checksum offloads on the first data descriptor */
	if (cur_context->cksum_flags) {
		if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
			((struct e1000_data_desc *)first_data_desc)->
			    upper.fields.popts |= E1000_TXD_POPTS_IXSM;
		if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
			((struct e1000_data_desc *)first_data_desc)->
			    upper.fields.popts |= E1000_TXD_POPTS_TXSM;
	}

	/*
	 * Last Descriptor of Packet needs End Of Packet (EOP), Report
	 * Status (RS) set.
	 */
	if (Adapter->tx_intr_delay) {
		descriptor->lower.data |= E1000_TXD_CMD_IDE |
		    E1000_TXD_CMD_EOP;
	} else {
		descriptor->lower.data |= E1000_TXD_CMD_EOP;
	}

	/*
	 * Set append Ethernet CRC (IFCS) bits: on the first data
	 * descriptor for an LSO message, on the last descriptor otherwise.
	 */
	if (cur_context->lso_flag) {
		first_data_desc->lower.data |= E1000_TXD_CMD_IFCS;
	} else {
		descriptor->lower.data |= E1000_TXD_CMD_IFCS;
	}

	/*
	 * Sync the Tx descriptors DMA buffer
	 */
	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
	    0, 0, DDI_DMA_SYNC_FORDEV);

	tx_ring->tbd_next = next_desc;

	/*
	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
	 * FX1000 that this frame is available to transmit.
	 */
	if (hw->mac.type == e1000_82547)
		e1000g_82547_tx_move_tail(tx_ring);
	else
		E1000_WRITE_REG(hw, E1000_TDT(0),
		    (uint32_t)(next_desc - tx_ring->tbd_first));

	/* flag the adapter when the register access handle reports a fault */
	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
		Adapter->e1000g_state |= E1000G_ERROR;
	}

	/* Put the pending SwPackets to the "Used" list */
	mutex_enter(&tx_ring->usedlist_lock);
	QUEUE_APPEND(&tx_ring->used_list, pending_list);
	tx_ring->tbd_avail -= desc_count;
	mutex_exit(&tx_ring->usedlist_lock);

	/* update LSO related data */
	if (context_reload)
		tx_ring->pre_context = *cur_context;

	return (desc_count);
}
830 
831 /*
832  * e1000g_tx_setup - setup tx data structures
833  *
834  * This routine initializes all of the transmit related
835  * structures. This includes the Transmit descriptors,
836  * and the tx_sw_packet structures.
837  */
838 void
839 e1000g_tx_setup(struct e1000g *Adapter)
840 {
841 	struct e1000_hw *hw;
842 	p_tx_sw_packet_t packet;
843 	uint32_t i;
844 	uint32_t buf_high;
845 	uint32_t buf_low;
846 	uint32_t reg_tipg;
847 	uint32_t reg_tctl;
848 	int size;
849 	e1000g_tx_ring_t *tx_ring;
850 
851 	hw = &Adapter->shared;
852 	tx_ring = Adapter->tx_ring;
853 
854 	/* init the lists */
855 	/*
856 	 * Here we don't need to protect the lists using the
857 	 * usedlist_lock and freelist_lock, for they have
858 	 * been protected by the chip_lock.
859 	 */
860 	QUEUE_INIT_LIST(&tx_ring->used_list);
861 	QUEUE_INIT_LIST(&tx_ring->free_list);
862 
863 	/* Go through and set up each SW_Packet */
864 	packet = tx_ring->packet_area;
865 	for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) {
866 		/* Initialize this tx_sw_apcket area */
867 		e1000g_free_tx_swpkt(packet);
868 		/* Add this tx_sw_packet to the free list */
869 		QUEUE_PUSH_TAIL(&tx_ring->free_list,
870 		    &packet->Link);
871 	}
872 
873 	/* Setup TX descriptor pointers */
874 	tx_ring->tbd_next = tx_ring->tbd_first;
875 	tx_ring->tbd_oldest = tx_ring->tbd_first;
876 
877 	/*
878 	 * Setup Hardware TX Registers
879 	 */
880 	/* Setup the Transmit Control Register (TCTL). */
881 	reg_tctl = E1000_READ_REG(hw, E1000_TCTL);
882 	reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN |
883 	    (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) |
884 	    (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) |
885 	    E1000_TCTL_RTLC;
886 
887 	/* Enable the MULR bit */
888 	if (hw->bus.type == e1000_bus_type_pci_express)
889 		reg_tctl |= E1000_TCTL_MULR;
890 
891 	E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl);
892 
893 	/* Setup HW Base and Length of Tx descriptor area */
894 	size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc));
895 	E1000_WRITE_REG(hw, E1000_TDLEN(0), size);
896 	size = E1000_READ_REG(hw, E1000_TDLEN(0));
897 
898 	buf_low = (uint32_t)tx_ring->tbd_dma_addr;
899 	buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32);
900 
901 	/*
902 	 * Write the highest location first and work backward to the lowest.
903 	 * This is necessary for some adapter types to
904 	 * prevent write combining from occurring.
905 	 */
906 	E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high);
907 	E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low);
908 
909 	/* Setup our HW Tx Head & Tail descriptor pointers */
910 	E1000_WRITE_REG(hw, E1000_TDH(0), 0);
911 	E1000_WRITE_REG(hw, E1000_TDT(0), 0);
912 
913 	/* Set the default values for the Tx Inter Packet Gap timer */
914 	if ((hw->mac.type == e1000_82542) &&
915 	    ((hw->revision_id == E1000_REVISION_2) ||
916 	    (hw->revision_id == E1000_REVISION_3))) {
917 		reg_tipg = DEFAULT_82542_TIPG_IPGT;
918 		reg_tipg |=
919 		    DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
920 		reg_tipg |=
921 		    DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
922 	} else if (hw->mac.type == e1000_80003es2lan) {
923 		reg_tipg = DEFAULT_82543_TIPG_IPGR1;
924 		reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
925 		    E1000_TIPG_IPGR2_SHIFT;
926 	} else {
927 		if (hw->phy.media_type == e1000_media_type_fiber)
928 			reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER;
929 		else
930 			reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER;
931 		reg_tipg |=
932 		    DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
933 		reg_tipg |=
934 		    DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
935 	}
936 	E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg);
937 
938 	/* Setup Transmit Interrupt Delay Value */
939 	E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay);
940 	E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
941 	    "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay);
942 
943 	if (hw->mac.type >= e1000_82540) {
944 		E1000_WRITE_REG(&Adapter->shared, E1000_TADV,
945 		    Adapter->tx_intr_abs_delay);
946 		E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
947 		    "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay);
948 	}
949 
950 	tx_ring->tbd_avail = Adapter->tx_desc_num;
951 
952 	/* Initialize stored context information */
953 	bzero(&(tx_ring->pre_context), sizeof (context_data_t));
954 }
955 
956 /*
957  * e1000g_recycle - recycle the tx descriptors and tx sw packets
958  */
959 int
960 e1000g_recycle(e1000g_tx_ring_t *tx_ring)
961 {
962 	struct e1000g *Adapter;
963 	LIST_DESCRIBER pending_list;
964 	p_tx_sw_packet_t packet;
965 	mblk_t *mp;
966 	mblk_t *nmp;
967 	struct e1000_tx_desc *descriptor;
968 	int desc_count;
969 	int64_t delta;
970 
971 	/*
972 	 * This function will examine each TxSwPacket in the 'used' queue
973 	 * if the e1000g is done with it then the associated resources (Tx
974 	 * Descriptors) will be "freed" and the TxSwPacket will be
975 	 * returned to the 'free' queue.
976 	 */
977 	Adapter = tx_ring->adapter;
978 	delta = 0;
979 
980 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list);
981 	if (packet == NULL) {
982 		Adapter->stall_flag = B_FALSE;
983 		return (0);
984 	}
985 
986 	desc_count = 0;
987 	QUEUE_INIT_LIST(&pending_list);
988 
989 	/* Sync the Tx descriptor DMA buffer */
990 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
991 	    0, 0, DDI_DMA_SYNC_FORKERNEL);
992 	if (e1000g_check_dma_handle(
993 	    tx_ring->tbd_dma_handle) != DDI_FM_OK) {
994 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
995 		Adapter->e1000g_state |= E1000G_ERROR;
996 		return (0);
997 	}
998 
999 	/*
1000 	 * While there are still TxSwPackets in the used queue check them
1001 	 */
1002 	mutex_enter(&tx_ring->usedlist_lock);
1003 	while ((packet =
1004 	    (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
1005 
1006 		/*
1007 		 * Get hold of the next descriptor that the e1000g will
1008 		 * report status back to (this will be the last descriptor
1009 		 * of a given sw packet). We only want to free the
1010 		 * sw packet (and it resources) if the e1000g is done
1011 		 * with ALL of the descriptors.  If the e1000g is done
1012 		 * with the last one then it is done with all of them.
1013 		 */
1014 		ASSERT(packet->num_desc);
1015 		descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1);
1016 
1017 		/* Check for wrap case */
1018 		if (descriptor > tx_ring->tbd_last)
1019 			descriptor -= Adapter->tx_desc_num;
1020 
1021 		/*
1022 		 * If the descriptor done bit is set free TxSwPacket and
1023 		 * associated resources
1024 		 */
1025 		if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) {
1026 			QUEUE_POP_HEAD(&tx_ring->used_list);
1027 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
1028 
1029 			if (descriptor == tx_ring->tbd_last)
1030 				tx_ring->tbd_oldest =
1031 				    tx_ring->tbd_first;
1032 			else
1033 				tx_ring->tbd_oldest =
1034 				    descriptor + 1;
1035 
1036 			desc_count += packet->num_desc;
1037 		} else {
1038 			/*
1039 			 * Found a sw packet that the e1000g is not done
1040 			 * with then there is no reason to check the rest
1041 			 * of the queue.
1042 			 */
1043 			delta = lbolt64 - packet->tickstamp;
1044 			break;
1045 		}
1046 	}
1047 
1048 	tx_ring->tbd_avail += desc_count;
1049 	Adapter->tx_pkt_cnt += desc_count;
1050 
1051 	mutex_exit(&tx_ring->usedlist_lock);
1052 
1053 	if (desc_count == 0) {
1054 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_none);
1055 		/*
1056 		 * If the packet hasn't been sent out for seconds,
1057 		 * the transmitter is considered to be stalled.
1058 		 */
1059 		if (delta > Adapter->stall_threshold) {
1060 			Adapter->stall_flag = B_TRUE;
1061 		}
1062 		return (0);
1063 	}
1064 
1065 	Adapter->stall_flag = B_FALSE;
1066 
1067 	mp = NULL;
1068 	nmp = NULL;
1069 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
1070 	ASSERT(packet != NULL);
1071 	while (packet != NULL) {
1072 		if (packet->mp != NULL) {
1073 			ASSERT(packet->mp->b_next == NULL);
1074 			/* Assemble the message chain */
1075 			if (mp == NULL) {
1076 				mp = packet->mp;
1077 				nmp = packet->mp;
1078 			} else {
1079 				nmp->b_next = packet->mp;
1080 				nmp = packet->mp;
1081 			}
1082 			/* Disconnect the message from the sw packet */
1083 			packet->mp = NULL;
1084 		}
1085 
1086 		/* Free the TxSwPackets */
1087 		e1000g_free_tx_swpkt(packet);
1088 
1089 		packet = (p_tx_sw_packet_t)
1090 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
1091 	}
1092 
1093 	/* Return the TxSwPackets back to the FreeList */
1094 	mutex_enter(&tx_ring->freelist_lock);
1095 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
1096 	mutex_exit(&tx_ring->freelist_lock);
1097 
1098 	if (mp != NULL)
1099 		freemsgchain(mp);
1100 
1101 	return (desc_count);
1102 }
1103 /*
1104  * 82544 Coexistence issue workaround:
1105  *    There are 2 issues.
1106  *    1. If a 32 bit split completion happens from P64H2 and another
1107  *	agent drives a 64 bit request/split completion after ONLY
1108  *	1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then
1109  *	82544 has a problem where in to clock all the data in, it
1110  *	looks at REQ64# signal and since it has changed so fast (i.e. 1
1111  *	idle clock turn around), it will fail to clock all the data in.
1112  *	Data coming from certain ending addresses has exposure to this issue.
1113  *
1114  * To detect this issue, following equation can be used...
1115  *	SIZE[3:0] + ADDR[2:0] = SUM[3:0].
1116  *	If SUM[3:0] is in between 1 to 4, we will have this issue.
1117  *
1118  * ROOT CAUSE:
1119  *	The erratum involves the 82544 PCIX elasticity FIFO implementations as
1120  *	64-bit FIFO's and flushing of the final partial-bytes corresponding
1121  *	to the end of a requested read burst. Under a specific burst condition
1122  *	of ending-data alignment and 32-byte split-completions, the final
1123  *	byte(s) of split-completion data require an extra clock cycle to flush
1124  *	into 64-bit FIFO orientation.  An incorrect logic dependency on the
1125  *	REQ64# signal occurring during this clock cycle may cause the
1126  *	residual byte(s) to be lost, thereby rendering the internal DMA client
1127  *	forever awaiting the final byte(s) for an outbound data-fetch.  The
1128  *	erratum is confirmed to *only* occur if certain subsequent external
1129  *	64-bit PCIX bus transactions occur immediately (minimum possible bus
1130  *	turn- around) following the odd-aligned 32-bit split-completion
1131  *	containing the final byte(s).  Intel has confirmed that this has been
1132  *	seen only with chipset/bridges which have the capability to provide
1133  *	32-bit split-completion data, and in the presence of newer PCIX bus
1134  *	agents which fully-optimize the inter-transaction turn-around (zero
1135  *	additional initiator latency when pre-granted bus ownership).
1136  *
1137  *   	This issue does not exist in PCI bus mode, when any agent is operating
1138  *	in 32 bit only mode or on chipsets that do not do 32 bit split
1139  *	completions for 64 bit read requests (Serverworks chipsets). P64H2 does
1140  *	32 bit split completions for any read request that has bit 2 set to 1
1141  *	for the requested address and read request size is more than 8 bytes.
1142  *
1143  *   2. Another issue is related to 82544 driving DACs under the similar
1144  *	scenario (32 bit split completion followed by 64 bit transaction with
1145  *	only 1 cycle turnaround). This issue is still being root caused. We
1146  *	think that both of these issues can be avoided if following workaround
1147  *	is implemented. It seems DAC issues is related to ending addresses being
1148  *	0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity
1149  *	FIFO which does not get flushed due to REQ64# dependency. We will only
1150  *	know the full story after it has been simulated successfully by HW team.
1151  *
1152  * WORKAROUND:
1153  *	Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC)
1154  */
1155 static uint32_t
1156 e1000g_fill_82544_desc(uint64_t address,
1157     size_t length, p_desc_array_t desc_array)
1158 {
1159 	/*
1160 	 * Since issue is sensitive to length and address.
1161 	 * Let us first check the address...
1162 	 */
1163 	uint32_t safe_terminator;
1164 
1165 	if (length <= 4) {
1166 		desc_array->descriptor[0].address = address;
1167 		desc_array->descriptor[0].length = (uint32_t)length;
1168 		desc_array->elements = 1;
1169 		return (desc_array->elements);
1170 	}
1171 	safe_terminator =
1172 	    (uint32_t)((((uint32_t)address & 0x7) +
1173 	    (length & 0xF)) & 0xF);
1174 	/*
1175 	 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then
1176 	 * return
1177 	 */
1178 	if (safe_terminator == 0 ||
1179 	    (safe_terminator > 4 && safe_terminator < 9) ||
1180 	    (safe_terminator > 0xC && safe_terminator <= 0xF)) {
1181 		desc_array->descriptor[0].address = address;
1182 		desc_array->descriptor[0].length = (uint32_t)length;
1183 		desc_array->elements = 1;
1184 		return (desc_array->elements);
1185 	}
1186 
1187 	desc_array->descriptor[0].address = address;
1188 	desc_array->descriptor[0].length = length - 4;
1189 	desc_array->descriptor[1].address = address + (length - 4);
1190 	desc_array->descriptor[1].length = 4;
1191 	desc_array->elements = 2;
1192 	return (desc_array->elements);
1193 }
1194 
1195 static int
1196 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet,
1197     mblk_t *mp, boolean_t tx_undersize_flag)
1198 {
1199 	size_t len;
1200 	size_t len1;
1201 	dma_buffer_t *tx_buf;
1202 	mblk_t *nmp;
1203 	boolean_t finished;
1204 	int desc_count;
1205 
1206 	desc_count = 0;
1207 	tx_buf = packet->tx_buf;
1208 	len = MBLKL(mp);
1209 
1210 	ASSERT((tx_buf->len + len) <= tx_buf->size);
1211 
1212 	if (len > 0) {
1213 		bcopy(mp->b_rptr,
1214 		    tx_buf->address + tx_buf->len,
1215 		    len);
1216 		tx_buf->len += len;
1217 
1218 		packet->num_mblk_frag++;
1219 	}
1220 
1221 	nmp = mp->b_cont;
1222 	if (nmp == NULL) {
1223 		finished = B_TRUE;
1224 	} else {
1225 		len1 = MBLKL(nmp);
1226 		if ((tx_buf->len + len1) > tx_buf->size)
1227 			finished = B_TRUE;
1228 		else if (tx_undersize_flag)
1229 			finished = B_FALSE;
1230 		else if (len1 > tx_ring->adapter->tx_bcopy_thresh)
1231 			finished = B_TRUE;
1232 		else
1233 			finished = B_FALSE;
1234 	}
1235 
1236 	if (finished) {
1237 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy,
1238 		    (tx_buf->len > len));
1239 
1240 		/*
1241 		 * If the packet is smaller than 64 bytes, which is the
1242 		 * minimum ethernet packet size, pad the packet to make
1243 		 * it at least 60 bytes. The hardware will add 4 bytes
1244 		 * for CRC.
1245 		 */
1246 		if (tx_undersize_flag) {
1247 			ASSERT(tx_buf->len < ETHERMIN);
1248 
1249 			bzero(tx_buf->address + tx_buf->len,
1250 			    ETHERMIN - tx_buf->len);
1251 			tx_buf->len = ETHERMIN;
1252 		}
1253 
1254 #ifdef __sparc
1255 		if (packet->dma_type == USE_DVMA)
1256 			dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV);
1257 		else
1258 			(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1259 			    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1260 #else
1261 		(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1262 		    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1263 #endif
1264 
1265 		packet->data_transfer_type = USE_BCOPY;
1266 
1267 		desc_count = e1000g_fill_tx_desc(tx_ring,
1268 		    packet,
1269 		    tx_buf->dma_address,
1270 		    tx_buf->len);
1271 
1272 		if (desc_count <= 0)
1273 			return (-1);
1274 	}
1275 
1276 	return (desc_count);
1277 }
1278 
1279 static int
1280 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp)
1281 {
1282 	int j;
1283 	int mystat;
1284 	size_t len;
1285 	ddi_dma_cookie_t dma_cookie;
1286 	uint_t ncookies;
1287 	int desc_count;
1288 	uint32_t desc_total;
1289 
1290 	desc_total = 0;
1291 	len = MBLKL(mp);
1292 
1293 	/*
1294 	 * ddi_dma_addr_bind_handle() allocates  DMA  resources  for  a
1295 	 * memory  object such that a device can perform DMA to or from
1296 	 * the object.  DMA resources  are  allocated  considering  the
1297 	 * device's  DMA  attributes  as  expressed by ddi_dma_attr(9S)
1298 	 * (see ddi_dma_alloc_handle(9F)).
1299 	 *
1300 	 * ddi_dma_addr_bind_handle() fills in  the  first  DMA  cookie
1301 	 * pointed  to by cookiep with the appropriate address, length,
1302 	 * and bus type. *ccountp is set to the number of DMA  cookies
1303 	 * representing this DMA object. Subsequent DMA cookies must be
1304 	 * retrieved by calling ddi_dma_nextcookie(9F)  the  number  of
1305 	 * times specified by *countp - 1.
1306 	 */
1307 	switch (packet->dma_type) {
1308 #ifdef __sparc
1309 	case USE_DVMA:
1310 		dvma_kaddr_load(packet->tx_dma_handle,
1311 		    (caddr_t)mp->b_rptr, len, 0, &dma_cookie);
1312 
1313 		dvma_sync(packet->tx_dma_handle, 0,
1314 		    DDI_DMA_SYNC_FORDEV);
1315 
1316 		ncookies = 1;
1317 		packet->data_transfer_type = USE_DVMA;
1318 		break;
1319 #endif
1320 	case USE_DMA:
1321 		if ((mystat = ddi_dma_addr_bind_handle(
1322 		    packet->tx_dma_handle, NULL,
1323 		    (caddr_t)mp->b_rptr, len,
1324 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1325 		    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1326 		    &ncookies)) != DDI_DMA_MAPPED) {
1327 
1328 			e1000g_log(tx_ring->adapter, CE_WARN,
1329 			    "Couldn't bind mblk buffer to Tx DMA handle: "
1330 			    "return: %X, Pkt: %X\n",
1331 			    mystat, packet);
1332 			return (-1);
1333 		}
1334 
1335 		/*
1336 		 * An implicit ddi_dma_sync() is done when the
1337 		 * ddi_dma_addr_bind_handle() is called. So we
1338 		 * don't need to explicitly call ddi_dma_sync()
1339 		 * here any more.
1340 		 */
1341 		ASSERT(ncookies);
1342 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie,
1343 		    (ncookies > 1));
1344 
1345 		/*
1346 		 * The data_transfer_type value must be set after the handle
1347 		 * has been bound, for it will be used in e1000g_free_tx_swpkt()
1348 		 * to decide whether we need to unbind the handle.
1349 		 */
1350 		packet->data_transfer_type = USE_DMA;
1351 		break;
1352 	default:
1353 		ASSERT(B_FALSE);
1354 		break;
1355 	}
1356 
1357 	packet->num_mblk_frag++;
1358 
1359 	/*
1360 	 * Each address could span thru multpile cookie..
1361 	 * Each cookie will have one descriptor
1362 	 */
1363 	for (j = ncookies; j != 0; j--) {
1364 
1365 		desc_count = e1000g_fill_tx_desc(tx_ring,
1366 		    packet,
1367 		    dma_cookie.dmac_laddress,
1368 		    dma_cookie.dmac_size);
1369 
1370 		if (desc_count <= 0)
1371 			return (-1);
1372 
1373 		desc_total += desc_count;
1374 
1375 		/*
1376 		 * ddi_dma_nextcookie() retrieves subsequent DMA
1377 		 * cookies for a DMA object.
1378 		 * ddi_dma_nextcookie() fills in the
1379 		 * ddi_dma_cookie(9S) structure pointed to by
1380 		 * cookiep.  The ddi_dma_cookie(9S) structure
1381 		 * must be allocated prior to calling
1382 		 * ddi_dma_nextcookie(). The DMA cookie count
1383 		 * returned by ddi_dma_buf_bind_handle(9F),
1384 		 * ddi_dma_addr_bind_handle(9F), or
1385 		 * ddi_dma_getwin(9F) indicates the number of DMA
1386 		 * cookies a DMA object consists of.  If the
1387 		 * resulting cookie count, N, is larger than 1,
1388 		 * ddi_dma_nextcookie() must be called N-1 times
1389 		 * to retrieve all DMA cookies.
1390 		 */
1391 		if (j > 1) {
1392 			ddi_dma_nextcookie(packet->tx_dma_handle,
1393 			    &dma_cookie);
1394 		}
1395 	}
1396 
1397 	return (desc_total);
1398 }
1399 
1400 static void
1401 e1000g_fill_context_descriptor(context_data_t *cur_context,
1402     struct e1000_context_desc *context_desc)
1403 {
1404 	if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) {
1405 		context_desc->lower_setup.ip_fields.ipcss =
1406 		    cur_context->ether_header_size;
1407 		context_desc->lower_setup.ip_fields.ipcso =
1408 		    cur_context->ether_header_size +
1409 		    offsetof(struct ip, ip_sum);
1410 		context_desc->lower_setup.ip_fields.ipcse =
1411 		    cur_context->ether_header_size +
1412 		    cur_context->cksum_start - 1;
1413 	} else
1414 		context_desc->lower_setup.ip_config = 0;
1415 
1416 	if (cur_context->cksum_flags & HCK_PARTIALCKSUM) {
1417 		/*
1418 		 * The packet with same protocol has the following
1419 		 * stuff and start offset:
1420 		 * |  Protocol  | Stuff  | Start  | Checksum
1421 		 * |		| Offset | Offset | Enable
1422 		 * | IPv4 + TCP |  0x24  |  0x14  |  Yes
1423 		 * | IPv4 + UDP |  0x1A  |  0x14  |  Yes
1424 		 * | IPv6 + TCP |  0x20  |  0x10  |  No
1425 		 * | IPv6 + UDP |  0x14  |  0x10  |  No
1426 		 */
1427 		context_desc->upper_setup.tcp_fields.tucss =
1428 		    cur_context->cksum_start + cur_context->ether_header_size;
1429 		context_desc->upper_setup.tcp_fields.tucso =
1430 		    cur_context->cksum_stuff + cur_context->ether_header_size;
1431 		context_desc->upper_setup.tcp_fields.tucse = 0;
1432 	} else
1433 		context_desc->upper_setup.tcp_config = 0;
1434 
1435 	if (cur_context->lso_flag) {
1436 		context_desc->tcp_seg_setup.fields.mss = cur_context->mss;
1437 		context_desc->tcp_seg_setup.fields.hdr_len =
1438 		    cur_context->hdr_len;
1439 		/*
1440 		 * workaround for 82546EB errata 23, status-writeback
1441 		 * reporting (RS) should not be set on context or
1442 		 * Null descriptors
1443 		 */
1444 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1445 		    | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP
1446 		    | E1000_TXD_DTYP_C | cur_context->pay_len;
1447 	} else {
1448 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1449 		    | E1000_TXD_DTYP_C;
1450 		/*
1451 		 * Zero out the options for TCP Segmentation Offload
1452 		 */
1453 		context_desc->tcp_seg_setup.data = 0;
1454 	}
1455 }
1456 
1457 static int
1458 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring,
1459     p_tx_sw_packet_t packet, uint64_t address, size_t size)
1460 {
1461 	struct e1000_hw *hw = &tx_ring->adapter->shared;
1462 	p_sw_desc_t desc;
1463 
1464 	if (hw->mac.type == e1000_82544) {
1465 		if (hw->bus.type == e1000_bus_type_pcix)
1466 			return (e1000g_tx_workaround_PCIX_82544(packet,
1467 			    address, size));
1468 
1469 		if (size > JUMBO_FRAG_LENGTH)
1470 			return (e1000g_tx_workaround_jumbo_82544(packet,
1471 			    address, size));
1472 	}
1473 
1474 	ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1475 
1476 	desc = &packet->desc[packet->num_desc];
1477 	desc->address = address;
1478 	desc->length = (uint32_t)size;
1479 
1480 	packet->num_desc++;
1481 
1482 	return (1);
1483 }
1484 
1485 static int
1486 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet,
1487     uint64_t address, size_t size)
1488 {
1489 	p_sw_desc_t desc;
1490 	int desc_count;
1491 	long size_left;
1492 	size_t len;
1493 	uint32_t counter;
1494 	uint32_t array_elements;
1495 	desc_array_t desc_array;
1496 
1497 	/*
1498 	 * Coexist Workaround for cordova: RP: 07/04/03
1499 	 *
1500 	 * RP: ERRATA: Workaround ISSUE:
1501 	 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup
1502 	 * Eachbuffer in to 8kb pieces until the
1503 	 * remainder is < 8kb
1504 	 */
1505 	size_left = size;
1506 	desc_count = 0;
1507 
1508 	while (size_left > 0) {
1509 		if (size_left > MAX_TX_BUF_SIZE)
1510 			len = MAX_TX_BUF_SIZE;
1511 		else
1512 			len = size_left;
1513 
1514 		array_elements = e1000g_fill_82544_desc(address,
1515 		    len, &desc_array);
1516 
1517 		for (counter = 0; counter < array_elements; counter++) {
1518 			ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1519 			/*
1520 			 * Put in the buffer address
1521 			 */
1522 			desc = &packet->desc[packet->num_desc];
1523 
1524 			desc->address =
1525 			    desc_array.descriptor[counter].address;
1526 			desc->length =
1527 			    desc_array.descriptor[counter].length;
1528 
1529 			packet->num_desc++;
1530 			desc_count++;
1531 		} /* for */
1532 
1533 		/*
1534 		 * Update the buffer address and length
1535 		 */
1536 		address += MAX_TX_BUF_SIZE;
1537 		size_left -= MAX_TX_BUF_SIZE;
1538 	} /* while */
1539 
1540 	return (desc_count);
1541 }
1542 
1543 static int
1544 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet,
1545     uint64_t address, size_t size)
1546 {
1547 	p_sw_desc_t desc;
1548 	int desc_count;
1549 	long size_left;
1550 	uint32_t offset;
1551 
1552 	/*
1553 	 * Workaround for Jumbo Frames on Cordova
1554 	 * PSD 06/01/2001
1555 	 */
1556 	size_left = size;
1557 	desc_count = 0;
1558 	offset = 0;
1559 	while (size_left > 0) {
1560 		ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1561 
1562 		desc = &packet->desc[packet->num_desc];
1563 
1564 		desc->address = address + offset;
1565 
1566 		if (size_left > JUMBO_FRAG_LENGTH)
1567 			desc->length = JUMBO_FRAG_LENGTH;
1568 		else
1569 			desc->length = (uint32_t)size_left;
1570 
1571 		packet->num_desc++;
1572 		desc_count++;
1573 
1574 		offset += desc->length;
1575 		size_left -= JUMBO_FRAG_LENGTH;
1576 	}
1577 
1578 	return (desc_count);
1579 }
1580 
1581 #pragma inline(e1000g_82547_tx_move_tail_work)
1582 
1583 static void
1584 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring)
1585 {
1586 	struct e1000_hw *hw;
1587 	uint16_t hw_tdt;
1588 	uint16_t sw_tdt;
1589 	struct e1000_tx_desc *tx_desc;
1590 	uint16_t length = 0;
1591 	boolean_t eop = B_FALSE;
1592 	struct e1000g *Adapter;
1593 
1594 	Adapter = tx_ring->adapter;
1595 	hw = &Adapter->shared;
1596 
1597 	hw_tdt = E1000_READ_REG(hw, E1000_TDT(0));
1598 	sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first;
1599 
1600 	while (hw_tdt != sw_tdt) {
1601 		tx_desc = &(tx_ring->tbd_first[hw_tdt]);
1602 		length += tx_desc->lower.flags.length;
1603 		eop = tx_desc->lower.data & E1000_TXD_CMD_EOP;
1604 		if (++hw_tdt == Adapter->tx_desc_num)
1605 			hw_tdt = 0;
1606 
1607 		if (eop) {
1608 			if ((Adapter->link_duplex == HALF_DUPLEX) &&
1609 			    (e1000_fifo_workaround_82547(hw, length)
1610 			    != E1000_SUCCESS)) {
1611 				if (tx_ring->timer_enable_82547) {
1612 					ASSERT(tx_ring->timer_id_82547 == 0);
1613 					tx_ring->timer_id_82547 =
1614 					    timeout(e1000g_82547_timeout,
1615 					    (void *)tx_ring,
1616 					    drv_usectohz(10000));
1617 				}
1618 				return;
1619 
1620 			} else {
1621 				E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt);
1622 				e1000_update_tx_fifo_head_82547(hw, length);
1623 				length = 0;
1624 			}
1625 		}
1626 	}
1627 }
1628 
1629 static void
1630 e1000g_82547_timeout(void *arg)
1631 {
1632 	e1000g_tx_ring_t *tx_ring;
1633 
1634 	tx_ring = (e1000g_tx_ring_t *)arg;
1635 
1636 	mutex_enter(&tx_ring->tx_lock);
1637 
1638 	tx_ring->timer_id_82547 = 0;
1639 	e1000g_82547_tx_move_tail_work(tx_ring);
1640 
1641 	mutex_exit(&tx_ring->tx_lock);
1642 }
1643 
1644 static void
1645 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring)
1646 {
1647 	timeout_id_t tid;
1648 
1649 	ASSERT(MUTEX_HELD(&tx_ring->tx_lock));
1650 
1651 	tid = tx_ring->timer_id_82547;
1652 	tx_ring->timer_id_82547 = 0;
1653 	if (tid != 0) {
1654 		tx_ring->timer_enable_82547 = B_FALSE;
1655 		mutex_exit(&tx_ring->tx_lock);
1656 
1657 		(void) untimeout(tid);
1658 
1659 		mutex_enter(&tx_ring->tx_lock);
1660 	}
1661 	tx_ring->timer_enable_82547 = B_TRUE;
1662 	e1000g_82547_tx_move_tail_work(tx_ring);
1663 }
1664