xref: /titanic_52/usr/src/uts/common/io/e1000g/e1000g_tx.c (revision de22af4e795d4c10cbff9a60ec725aab46c03afe)
1 /*
2  * This file is provided under a CDDLv1 license.  When using or
3  * redistributing this file, you may do so under this license.
4  * In redistributing this file this license must be included
5  * and no other modification of this header file is permitted.
6  *
7  * CDDL LICENSE SUMMARY
8  *
9  * Copyright(c) 1999 - 2009 Intel Corporation. All rights reserved.
10  *
11  * The contents of this file are subject to the terms of Version
12  * 1.0 of the Common Development and Distribution License (the "License").
13  *
14  * You should have received a copy of the License with this software.
15  * You can obtain a copy of the License at
16  *	http://www.opensolaris.org/os/licensing.
17  * See the License for the specific language governing permissions
18  * and limitations under the License.
19  */
20 
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * **********************************************************************
28  *									*
29  * Module Name:								*
30  *   e1000g_tx.c							*
31  *									*
32  * Abstract:								*
 *   This file contains the routines that take care of transmit:	*
 *   they make the hardware send the data pointed to by the		*
 *   packet out onto the physical medium.				*
36  *									*
37  * **********************************************************************
38  */
39 
40 #include "e1000g_sw.h"
41 #include "e1000g_debug.h"
42 
/*
 * Forward declarations of the transmit helpers that are private to
 * this file.
 */
static boolean_t e1000g_send(struct e1000g *, mblk_t *);
static int e1000g_tx_copy(e1000g_tx_ring_t *,
    p_tx_sw_packet_t, mblk_t *, boolean_t);
static int e1000g_tx_bind(e1000g_tx_ring_t *,
    p_tx_sw_packet_t, mblk_t *);
static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t);
static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *);
static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *,
    context_data_t *);
static void e1000g_fill_context_descriptor(context_data_t *,
    struct e1000_context_desc *);
static int e1000g_fill_tx_desc(e1000g_tx_ring_t *,
    p_tx_sw_packet_t, uint64_t, size_t);
static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length,
    p_desc_array_t desc_array);
static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t);
static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t);
static void e1000g_82547_timeout(void *);
static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *);
static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *);

/*
 * Unless E1000G_DEBUG is defined, request inlining of the helpers
 * listed below.
 */
#ifndef E1000G_DEBUG
#pragma inline(e1000g_tx_copy)
#pragma inline(e1000g_tx_bind)
#pragma inline(e1000g_retrieve_context)
#pragma inline(e1000g_check_context)
#pragma inline(e1000g_fill_tx_ring)
#pragma inline(e1000g_fill_context_descriptor)
#pragma inline(e1000g_fill_tx_desc)
#pragma inline(e1000g_fill_82544_desc)
#pragma inline(e1000g_tx_workaround_PCIX_82544)
#pragma inline(e1000g_tx_workaround_jumbo_82544)
#pragma inline(e1000g_free_tx_swpkt)
#endif
77 
78 /*
79  * e1000g_free_tx_swpkt	- free up the tx sw packet
80  *
81  * Unbind the previously bound DMA handle for a given
82  * transmit sw packet. And reset the sw packet data.
83  */
84 void
85 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet)
86 {
87 	switch (packet->data_transfer_type) {
88 	case USE_BCOPY:
89 		packet->tx_buf->len = 0;
90 		break;
91 #ifdef __sparc
92 	case USE_DVMA:
93 		dvma_unload(packet->tx_dma_handle, 0, -1);
94 		break;
95 #endif
96 	case USE_DMA:
97 		(void) ddi_dma_unbind_handle(packet->tx_dma_handle);
98 		break;
99 	default:
100 		break;
101 	}
102 
103 	/*
104 	 * The mblk has been stripped off the sw packet
105 	 * and will be freed in a triggered soft intr.
106 	 */
107 	ASSERT(packet->mp == NULL);
108 
109 	packet->data_transfer_type = USE_NONE;
110 	packet->num_mblk_frag = 0;
111 	packet->num_desc = 0;
112 }
113 
114 mblk_t *
115 e1000g_m_tx(void *arg, mblk_t *mp)
116 {
117 	struct e1000g *Adapter = (struct e1000g *)arg;
118 	mblk_t *next;
119 
120 	rw_enter(&Adapter->chip_lock, RW_READER);
121 
122 	if ((Adapter->e1000g_state & E1000G_SUSPENDED) ||
123 	    !(Adapter->e1000g_state & E1000G_STARTED) ||
124 	    (Adapter->link_state != LINK_STATE_UP)) {
125 		freemsgchain(mp);
126 		mp = NULL;
127 	}
128 
129 	while (mp != NULL) {
130 		next = mp->b_next;
131 		mp->b_next = NULL;
132 
133 		if (!e1000g_send(Adapter, mp)) {
134 			mp->b_next = next;
135 			break;
136 		}
137 
138 		mp = next;
139 	}
140 
141 	rw_exit(&Adapter->chip_lock);
142 	return (mp);
143 }
144 
145 /*
146  * e1000g_send -  send packets onto the wire
147  *
148  * Called from e1000g_m_tx with an mblk ready to send. this
149  * routine sets up the transmit descriptors and sends data to
150  * the wire. It also pushes the just transmitted packet to
151  * the used tx sw packet list.
152  */
153 static boolean_t
154 e1000g_send(struct e1000g *Adapter, mblk_t *mp)
155 {
156 	p_tx_sw_packet_t packet;
157 	LIST_DESCRIBER pending_list;
158 	size_t len;
159 	size_t msg_size;
160 	uint32_t frag_count;
161 	int desc_count;
162 	uint32_t desc_total;
163 	uint32_t bcopy_thresh;
164 	uint32_t hdr_frag_len;
165 	boolean_t tx_undersize_flag;
166 	mblk_t *nmp;
167 	mblk_t *tmp;
168 	mblk_t *new_mp;
169 	mblk_t *pre_mp;
170 	mblk_t *next_mp;
171 	e1000g_tx_ring_t *tx_ring;
172 	context_data_t cur_context;
173 
174 	tx_ring = Adapter->tx_ring;
175 	bcopy_thresh = Adapter->tx_bcopy_thresh;
176 
177 	/* Get the total size and frags number of the message */
178 	tx_undersize_flag = B_FALSE;
179 	frag_count = 0;
180 	msg_size = 0;
181 	for (nmp = mp; nmp; nmp = nmp->b_cont) {
182 		frag_count++;
183 		msg_size += MBLKL(nmp);
184 	}
185 
186 	/* retrieve and compute information for context descriptor */
187 	if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) {
188 		freemsg(mp);
189 		return (B_TRUE);
190 	}
191 
192 	/*
193 	 * Make sure the packet is less than the allowed size
194 	 */
195 	if (!cur_context.lso_flag &&
196 	    (msg_size > Adapter->max_frame_size - ETHERFCSL)) {
197 		/*
198 		 * For the over size packet, we'll just drop it.
199 		 * So we return B_TRUE here.
200 		 */
201 		E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL,
202 		    "Tx packet out of bound. length = %d \n", msg_size);
203 		E1000G_STAT(tx_ring->stat_over_size);
204 		freemsg(mp);
205 		return (B_TRUE);
206 	}
207 
208 	/*
209 	 * Check and reclaim tx descriptors.
210 	 * This low water mark check should be done all the time as
211 	 * Transmit interrupt delay can produce Transmit interrupts little
212 	 * late and that may cause few problems related to reaping Tx
213 	 * Descriptors... As you may run short of them before getting any
214 	 * transmit interrupt...
215 	 */
216 	if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
217 		(void) e1000g_recycle(tx_ring);
218 		E1000G_DEBUG_STAT(tx_ring->stat_recycle);
219 
220 		if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
221 			E1000G_DEBUG_STAT(tx_ring->stat_lack_desc);
222 			goto tx_no_resource;
223 		}
224 	}
225 
226 	/*
227 	 * If the message size is less than the minimum ethernet packet size,
228 	 * we'll use bcopy to send it, and padd it to 60 bytes later.
229 	 */
230 	if (msg_size < ETHERMIN) {
231 		E1000G_DEBUG_STAT(tx_ring->stat_under_size);
232 		tx_undersize_flag = B_TRUE;
233 	}
234 
235 	/* Initialize variables */
236 	desc_count = 1;	/* The initial value should be greater than 0 */
237 	desc_total = 0;
238 	new_mp = NULL;
239 	QUEUE_INIT_LIST(&pending_list);
240 
241 	/* Process each mblk fragment and fill tx descriptors */
242 	/*
243 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
244 	 * to be within one descriptor. Here we reallocate and refill the
245 	 * the header if it's physical memory non-contiguous.
246 	 */
247 	if (cur_context.lso_flag) {
248 		/* find the last fragment of the header */
249 		len = MBLKL(mp);
250 		ASSERT(len > 0);
251 		next_mp = mp;
252 		pre_mp = NULL;
253 		while (len < cur_context.hdr_len) {
254 			pre_mp = next_mp;
255 			next_mp = next_mp->b_cont;
256 			len += MBLKL(next_mp);
257 		}
258 		/*
259 		 * If the header and the payload are in different mblks,
260 		 * we simply force the header to be copied into pre-allocated
261 		 * page-aligned buffer.
262 		 */
263 		if (len == cur_context.hdr_len)
264 			goto adjust_threshold;
265 
266 		hdr_frag_len = cur_context.hdr_len - (len - MBLKL(next_mp));
267 		/*
268 		 * There are three cases we need to reallocate a mblk for the
269 		 * last header fragment:
270 		 *
271 		 * 1. the header is in multiple mblks and the last fragment
272 		 * share the same mblk with the payload
273 		 *
274 		 * 2. the header is in a single mblk shared with the payload
275 		 * and the header is physical memory non-contiguous
276 		 *
277 		 * 3. there is 4 KB boundary within the header and 64 bytes
278 		 * following the end of the header bytes. The case may cause
279 		 * TCP data corruption issue.
280 		 *
281 		 * The workaround for the case #2 and case #3 is:
282 		 *   Assuming standard Ethernet/IP/TCP headers of 54 bytes,
283 		 *   this means that the buffer(containing the headers) should
284 		 *   not start -118 bytes before a 4 KB boundary. For example,
285 		 *   128-byte alignment for this buffer could be used to fulfill
286 		 *   this condition.
287 		 */
288 		if ((next_mp != mp) ||
289 		    (P2NPHASE((uintptr_t)next_mp->b_rptr,
290 		    E1000_LSO_FIRST_DESC_ALIGNMENT_BOUNDARY_4K)
291 		    < E1000_LSO_FIRST_DESC_ALIGNMENT)) {
292 			E1000G_DEBUG_STAT(tx_ring->stat_lso_header_fail);
293 			/*
294 			 * reallocate the mblk for the last header fragment,
295 			 * expect to bcopy into pre-allocated page-aligned
296 			 * buffer
297 			 */
298 			new_mp = allocb(hdr_frag_len, NULL);
299 			if (!new_mp)
300 				return (B_FALSE);
301 			bcopy(next_mp->b_rptr, new_mp->b_rptr, hdr_frag_len);
302 			/* link the new header fragment with the other parts */
303 			new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
304 			new_mp->b_cont = next_mp;
305 			if (pre_mp)
306 				pre_mp->b_cont = new_mp;
307 			else
308 				mp = new_mp;
309 			next_mp->b_rptr += hdr_frag_len;
310 			frag_count++;
311 		}
312 adjust_threshold:
313 		/*
314 		 * adjust the bcopy threshhold to guarantee
315 		 * the header to use bcopy way
316 		 */
317 		if (bcopy_thresh < cur_context.hdr_len)
318 			bcopy_thresh = cur_context.hdr_len;
319 	}
320 
321 	packet = NULL;
322 	nmp = mp;
323 	while (nmp) {
324 		tmp = nmp->b_cont;
325 
326 		len = MBLKL(nmp);
327 		/* Check zero length mblks */
328 		if (len == 0) {
329 			E1000G_DEBUG_STAT(tx_ring->stat_empty_frags);
330 			/*
331 			 * If there're no packet buffers have been used,
332 			 * or we just completed processing a buffer, then
333 			 * skip the empty mblk fragment.
334 			 * Otherwise, there's still a pending buffer that
335 			 * needs to be processed (tx_copy).
336 			 */
337 			if (desc_count > 0) {
338 				nmp = tmp;
339 				continue;
340 			}
341 		}
342 
343 		/*
344 		 * Get a new TxSwPacket to process mblk buffers.
345 		 */
346 		if (desc_count > 0) {
347 			mutex_enter(&tx_ring->freelist_lock);
348 			packet = (p_tx_sw_packet_t)
349 			    QUEUE_POP_HEAD(&tx_ring->free_list);
350 			mutex_exit(&tx_ring->freelist_lock);
351 
352 			if (packet == NULL) {
353 				E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
354 				    "No Tx SwPacket available\n");
355 				E1000G_STAT(tx_ring->stat_no_swpkt);
356 				goto tx_send_failed;
357 			}
358 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
359 		}
360 
361 		ASSERT(packet);
362 		/*
363 		 * If the size of the fragment is less than the tx_bcopy_thresh
364 		 * we'll use bcopy; Otherwise, we'll use DMA binding.
365 		 */
366 		if ((len <= bcopy_thresh) || tx_undersize_flag) {
367 			desc_count =
368 			    e1000g_tx_copy(tx_ring, packet, nmp,
369 			    tx_undersize_flag);
370 			E1000G_DEBUG_STAT(tx_ring->stat_copy);
371 		} else {
372 			desc_count =
373 			    e1000g_tx_bind(tx_ring, packet, nmp);
374 			E1000G_DEBUG_STAT(tx_ring->stat_bind);
375 		}
376 
377 		if (desc_count > 0)
378 			desc_total += desc_count;
379 		else if (desc_count < 0)
380 			goto tx_send_failed;
381 
382 		nmp = tmp;
383 	}
384 
385 	/* Assign the message to the last sw packet */
386 	ASSERT(packet);
387 	ASSERT(packet->mp == NULL);
388 	packet->mp = mp;
389 
390 	/* Try to recycle the tx descriptors again */
391 	if (tx_ring->tbd_avail < (desc_total + 3)) {
392 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry);
393 		(void) e1000g_recycle(tx_ring);
394 	}
395 
396 	mutex_enter(&tx_ring->tx_lock);
397 
398 	/*
399 	 * If the number of available tx descriptors is not enough for transmit
400 	 * (one redundant descriptor and one hw checksum context descriptor are
401 	 * included), then return failure.
402 	 */
403 	if (tx_ring->tbd_avail < (desc_total + 3)) {
404 		E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
405 		    "No Enough Tx descriptors\n");
406 		E1000G_STAT(tx_ring->stat_no_desc);
407 		mutex_exit(&tx_ring->tx_lock);
408 		goto tx_send_failed;
409 	}
410 
411 	desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context);
412 
413 	mutex_exit(&tx_ring->tx_lock);
414 
415 	ASSERT(desc_count > 0);
416 
417 	/* Send successful */
418 	return (B_TRUE);
419 
420 tx_send_failed:
421 	/* Restore mp to original */
422 	if (new_mp) {
423 		if (pre_mp) {
424 			pre_mp->b_cont = next_mp;
425 		}
426 		new_mp->b_cont = NULL;
427 		freemsg(new_mp);
428 
429 		next_mp->b_rptr -= hdr_frag_len;
430 	}
431 
432 	/*
433 	 * Enable Transmit interrupts, so that the interrupt routine can
434 	 * call mac_tx_update() when transmit descriptors become available.
435 	 */
436 	tx_ring->resched_timestamp = ddi_get_lbolt();
437 	tx_ring->resched_needed = B_TRUE;
438 	if (!Adapter->tx_intr_enable)
439 		e1000g_mask_tx_interrupt(Adapter);
440 
441 	/* Free pending TxSwPackets */
442 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
443 	while (packet) {
444 		packet->mp = NULL;
445 		e1000g_free_tx_swpkt(packet);
446 		packet = (p_tx_sw_packet_t)
447 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
448 	}
449 
450 	/* Return pending TxSwPackets to the "Free" list */
451 	mutex_enter(&tx_ring->freelist_lock);
452 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
453 	mutex_exit(&tx_ring->freelist_lock);
454 
455 	E1000G_STAT(tx_ring->stat_send_fail);
456 
457 	/* Message will be scheduled for re-transmit */
458 	return (B_FALSE);
459 
460 tx_no_resource:
461 	/*
462 	 * Enable Transmit interrupts, so that the interrupt routine can
463 	 * call mac_tx_update() when transmit descriptors become available.
464 	 */
465 	tx_ring->resched_timestamp = ddi_get_lbolt();
466 	tx_ring->resched_needed = B_TRUE;
467 	if (!Adapter->tx_intr_enable)
468 		e1000g_mask_tx_interrupt(Adapter);
469 
470 	/* Message will be scheduled for re-transmit */
471 	return (B_FALSE);
472 }
473 
474 static boolean_t
475 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
476     size_t msg_size)
477 {
478 	uintptr_t ip_start;
479 	uintptr_t tcp_start;
480 	mblk_t *nmp;
481 	uint32_t lsoflags;
482 	uint32_t mss;
483 
484 	bzero(cur_context, sizeof (context_data_t));
485 
486 	/* first check lso information */
487 	mac_lso_get(mp, &mss, &lsoflags);
488 
489 	/* retrieve checksum info */
490 	mac_hcksum_get(mp, &cur_context->cksum_start,
491 	    &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
492 	/* retrieve ethernet header size */
493 	if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid ==
494 	    htons(ETHERTYPE_VLAN))
495 		cur_context->ether_header_size =
496 		    sizeof (struct ether_vlan_header);
497 	else
498 		cur_context->ether_header_size =
499 		    sizeof (struct ether_header);
500 
501 	if (lsoflags & HW_LSO) {
502 		ASSERT(mss != 0);
503 
504 		/* free the invalid packet */
505 		if (mss == 0 ||
506 		    !((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
507 		    (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
508 			return (B_FALSE);
509 		}
510 		cur_context->mss = (uint16_t)mss;
511 		cur_context->lso_flag = B_TRUE;
512 
513 		/*
514 		 * Some fields are cleared for the hardware to fill
515 		 * in. We don't assume Ethernet header, IP header and
516 		 * TCP header are always in the same mblk fragment,
517 		 * while we assume each header is always within one
518 		 * mblk fragment and Ethernet header is always in the
519 		 * first mblk fragment.
520 		 */
521 		nmp = mp;
522 		ip_start = (uintptr_t)(nmp->b_rptr)
523 		    + cur_context->ether_header_size;
524 		if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
525 			ip_start = (uintptr_t)nmp->b_cont->b_rptr
526 			    + (ip_start - (uintptr_t)(nmp->b_wptr));
527 			nmp = nmp->b_cont;
528 		}
529 		tcp_start = ip_start +
530 		    IPH_HDR_LENGTH((ipha_t *)ip_start);
531 		if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
532 			tcp_start = (uintptr_t)nmp->b_cont->b_rptr
533 			    + (tcp_start - (uintptr_t)(nmp->b_wptr));
534 			nmp = nmp->b_cont;
535 		}
536 		cur_context->hdr_len = cur_context->ether_header_size
537 		    + IPH_HDR_LENGTH((ipha_t *)ip_start)
538 		    + TCP_HDR_LENGTH((tcph_t *)tcp_start);
539 		((ipha_t *)ip_start)->ipha_length = 0;
540 		((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
541 		/* calculate the TCP packet payload length */
542 		cur_context->pay_len = msg_size - cur_context->hdr_len;
543 	}
544 	return (B_TRUE);
545 }
546 
547 static boolean_t
548 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context)
549 {
550 	boolean_t context_reload;
551 	context_data_t *pre_context;
552 	struct e1000g *Adapter;
553 
554 	context_reload = B_FALSE;
555 	pre_context = &tx_ring->pre_context;
556 	Adapter = tx_ring->adapter;
557 
558 	/*
559 	 * The following code determine if the context descriptor is
560 	 * needed to be reloaded. The sequence of the conditions is
561 	 * made by their possibilities of changing.
562 	 */
563 	/*
564 	 * workaround for 82546EB, context descriptor must be reloaded
565 	 * per LSO/hw_cksum packet if LSO is enabled.
566 	 */
567 	if (Adapter->lso_premature_issue &&
568 	    Adapter->lso_enable &&
569 	    (cur_context->cksum_flags != 0)) {
570 
571 		context_reload = B_TRUE;
572 	} else if (cur_context->lso_flag) {
573 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
574 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
575 		    (cur_context->pay_len != pre_context->pay_len) ||
576 		    (cur_context->mss != pre_context->mss) ||
577 		    (cur_context->hdr_len != pre_context->hdr_len) ||
578 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
579 		    (cur_context->cksum_start != pre_context->cksum_start) ||
580 		    (cur_context->ether_header_size !=
581 		    pre_context->ether_header_size)) {
582 
583 			context_reload = B_TRUE;
584 		}
585 	} else if (cur_context->cksum_flags != 0) {
586 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
587 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
588 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
589 		    (cur_context->cksum_start != pre_context->cksum_start) ||
590 		    (cur_context->ether_header_size !=
591 		    pre_context->ether_header_size)) {
592 
593 			context_reload = B_TRUE;
594 		}
595 	}
596 
597 	return (context_reload);
598 }
599 
/*
 * e1000g_fill_tx_ring - write the descriptors of one message into the
 * transmit descriptor ring and notify the hardware
 *
 * tx_ring	the transmit ring to fill
 * pending_list	the tx sw packets describing the message; before
 *		returning they are appended to the ring's used list
 * cur_context	checksum/LSO context of the message
 *
 * The caller in this file, e1000g_send(), calls this with
 * tx_ring->tx_lock held and after checking that tbd_avail covers the
 * data descriptors plus three spare ones.
 *
 * Returns the number of descriptors consumed, including an optional
 * context descriptor and any descriptors added by the workarounds.
 */
static int
e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
    context_data_t *cur_context)
{
	struct e1000g *Adapter;
	struct e1000_hw *hw;
	p_tx_sw_packet_t first_packet;
	p_tx_sw_packet_t packet;
	p_tx_sw_packet_t previous_packet;
	boolean_t context_reload;
	struct e1000_tx_desc *first_data_desc;
	struct e1000_tx_desc *next_desc;
	struct e1000_tx_desc *descriptor;
	int desc_count;
	boolean_t buff_overrun_flag;
	int i;

	Adapter = tx_ring->adapter;
	hw = &Adapter->shared;

	desc_count = 0;
	first_packet = NULL;
	first_data_desc = NULL;
	descriptor = NULL;
	first_packet = NULL;
	packet = NULL;
	buff_overrun_flag = B_FALSE;

	/* Start filling at the next unused descriptor of the ring */
	next_desc = tx_ring->tbd_next;

	/* Context descriptor reload check */
	context_reload = e1000g_check_context(tx_ring, cur_context);

	if (context_reload) {
		/*
		 * Remember the first sw packet: the context descriptor
		 * is accounted to it further down.
		 */
		first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);

		descriptor = next_desc;

		e1000g_fill_context_descriptor(cur_context,
		    (struct e1000_context_desc *)descriptor);

		/* Check the wrap-around case */
		if (descriptor == tx_ring->tbd_last)
			next_desc = tx_ring->tbd_first;
		else
			next_desc++;

		desc_count++;
	}

	/* The first data descriptor follows the optional context one */
	first_data_desc = next_desc;

	/* Copy the prepared descriptors of every sw packet into the ring */
	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
	while (packet) {
		ASSERT(packet->num_desc);

		for (i = 0; i < packet->num_desc; i++) {
			ASSERT(tx_ring->tbd_avail > 0);

			descriptor = next_desc;
			descriptor->buffer_addr =
			    packet->desc[i].address;
			descriptor->lower.data =
			    packet->desc[i].length;

			/* Zero out status */
			descriptor->upper.data = 0;

			descriptor->lower.data |=
			    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
			/* must set RS on every outgoing descriptor */
			descriptor->lower.data |=
			    E1000_TXD_CMD_RS;

			if (cur_context->lso_flag)
				descriptor->lower.data |= E1000_TXD_CMD_TSE;

			/* Check the wrap-around case */
			if (descriptor == tx_ring->tbd_last)
				next_desc = tx_ring->tbd_first;
			else
				next_desc++;

			desc_count++;

			/*
			 * workaround for 82546EB errata 33, hang in PCI-X
			 * systems due to 2k Buffer Overrun during Transmit
			 * Operation. The workaround applies to all the Intel
			 * PCI-X chips.
			 *
			 * If the first data descriptor is longer than the
			 * threshold, it is cut down to the threshold and
			 * the remainder goes into one extra descriptor.
			 */
			if (hw->bus.type == e1000_bus_type_pcix &&
			    descriptor == first_data_desc &&
			    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK)
			    > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) {
				/* modified the first descriptor */
				descriptor->lower.data &=
				    ~E1000G_TBD_LENGTH_MASK;
				descriptor->lower.flags.length =
				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;

				/* insert a new descriptor */
				ASSERT(tx_ring->tbd_avail > 0);
				next_desc->buffer_addr =
				    packet->desc[0].address +
				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
				next_desc->lower.data =
				    packet->desc[0].length -
				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;

				/* Zero out status */
				next_desc->upper.data = 0;

				next_desc->lower.data |=
				    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
				/* must set RS on every outgoing descriptor */
				next_desc->lower.data |=
				    E1000_TXD_CMD_RS;

				if (cur_context->lso_flag)
					next_desc->lower.data |=
					    E1000_TXD_CMD_TSE;

				descriptor = next_desc;

				/* Check the wrap-around case */
				if (next_desc == tx_ring->tbd_last)
					next_desc = tx_ring->tbd_first;
				else
					next_desc++;

				desc_count++;
				buff_overrun_flag = B_TRUE;
			}
		}

		/* Account the extra descriptor of the errata 33 split */
		if (buff_overrun_flag) {
			packet->num_desc++;
			buff_overrun_flag = B_FALSE;
		}

		if (first_packet != NULL) {
			/*
			 * Count the checksum context descriptor for
			 * the first SwPacket.
			 */
			first_packet->num_desc++;
			first_packet = NULL;
		}

		packet->tickstamp = ddi_get_lbolt64();

		/* Keep track of the last sw packet for the errata 21 fixup */
		previous_packet = packet;
		packet = (p_tx_sw_packet_t)
		    QUEUE_GET_NEXT(pending_list, &packet->Link);
	}

	/*
	 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back
	 *
	 * If the last descriptor is longer than 8 bytes, its final 4
	 * bytes are moved into an additional descriptor of their own.
	 */
	if (Adapter->lso_premature_issue && cur_context->lso_flag &&
	    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) {
		/* modified the previous descriptor */
		descriptor->lower.data -= 4;

		/* insert a new descriptor */
		ASSERT(tx_ring->tbd_avail > 0);
		/* the lower 20 bits of lower.data is the length field */
		next_desc->buffer_addr =
		    descriptor->buffer_addr +
		    (descriptor->lower.data & E1000G_TBD_LENGTH_MASK);
		next_desc->lower.data = 4;

		/* Zero out status */
		next_desc->upper.data = 0;
		/* It must be part of a LSO packet */
		next_desc->lower.data |=
		    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
		    E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE;

		descriptor = next_desc;

		/* Check the wrap-around case */
		if (descriptor == tx_ring->tbd_last)
			next_desc = tx_ring->tbd_first;
		else
			next_desc++;

		desc_count++;
		/* update the number of descriptors */
		previous_packet->num_desc++;
	}

	/* From here on, descriptor is the last descriptor of the message */
	ASSERT(descriptor);

	/* Request the checksum insertions on the first data descriptor */
	if (cur_context->cksum_flags) {
		if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
			((struct e1000_data_desc *)first_data_desc)->
			    upper.fields.popts |= E1000_TXD_POPTS_IXSM;
		if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
			((struct e1000_data_desc *)first_data_desc)->
			    upper.fields.popts |= E1000_TXD_POPTS_TXSM;
	}

	/*
	 * Last Descriptor of Packet needs End Of Packet (EOP), Report
	 * Status (RS) set.
	 * IDE is set as well when a transmit interrupt delay is
	 * configured.
	 */
	if (Adapter->tx_intr_delay) {
		descriptor->lower.data |= E1000_TXD_CMD_IDE |
		    E1000_TXD_CMD_EOP;
	} else {
		descriptor->lower.data |= E1000_TXD_CMD_EOP;
	}

	/*
	 * Set append Ethernet CRC (IFCS) bits: on the first data
	 * descriptor for LSO, otherwise on the last descriptor.
	 */
	if (cur_context->lso_flag) {
		first_data_desc->lower.data |= E1000_TXD_CMD_IFCS;
	} else {
		descriptor->lower.data |= E1000_TXD_CMD_IFCS;
	}

	/*
	 * Sync the Tx descriptors DMA buffer
	 */
	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
	    0, 0, DDI_DMA_SYNC_FORDEV);

	tx_ring->tbd_next = next_desc;

	/*
	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
	 * FX1000 that this frame is available to transmit.
	 * The 82547 moves the tail through its own helper.
	 */
	if (hw->mac.type == e1000_82547)
		e1000g_82547_tx_move_tail(tx_ring);
	else
		E1000_WRITE_REG(hw, E1000_TDT(0),
		    (uint32_t)(next_desc - tx_ring->tbd_first));

	/* Report a register access fault and flag the adapter as in error */
	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
		Adapter->e1000g_state |= E1000G_ERROR;
	}

	/* Put the pending SwPackets to the "Used" list */
	mutex_enter(&tx_ring->usedlist_lock);
	QUEUE_APPEND(&tx_ring->used_list, pending_list);
	tx_ring->tbd_avail -= desc_count;
	mutex_exit(&tx_ring->usedlist_lock);

	/*
	 * update LSO related data: remember the context that was just
	 * loaded, for comparison against the next message
	 */
	if (context_reload)
		tx_ring->pre_context = *cur_context;

	return (desc_count);
}
857 
858 /*
859  * e1000g_tx_setup - setup tx data structures
860  *
861  * This routine initializes all of the transmit related
862  * structures. This includes the Transmit descriptors,
863  * and the tx_sw_packet structures.
864  */
865 void
866 e1000g_tx_setup(struct e1000g *Adapter)
867 {
868 	struct e1000_hw *hw;
869 	p_tx_sw_packet_t packet;
870 	uint32_t i;
871 	uint32_t buf_high;
872 	uint32_t buf_low;
873 	uint32_t reg_tipg;
874 	uint32_t reg_tctl;
875 	int size;
876 	e1000g_tx_ring_t *tx_ring;
877 
878 	hw = &Adapter->shared;
879 	tx_ring = Adapter->tx_ring;
880 
881 	/* init the lists */
882 	/*
883 	 * Here we don't need to protect the lists using the
884 	 * usedlist_lock and freelist_lock, for they have
885 	 * been protected by the chip_lock.
886 	 */
887 	QUEUE_INIT_LIST(&tx_ring->used_list);
888 	QUEUE_INIT_LIST(&tx_ring->free_list);
889 
890 	/* Go through and set up each SW_Packet */
891 	packet = tx_ring->packet_area;
892 	for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) {
893 		/* Initialize this tx_sw_apcket area */
894 		e1000g_free_tx_swpkt(packet);
895 		/* Add this tx_sw_packet to the free list */
896 		QUEUE_PUSH_TAIL(&tx_ring->free_list,
897 		    &packet->Link);
898 	}
899 
900 	/* Setup TX descriptor pointers */
901 	tx_ring->tbd_next = tx_ring->tbd_first;
902 	tx_ring->tbd_oldest = tx_ring->tbd_first;
903 
904 	/*
905 	 * Setup Hardware TX Registers
906 	 */
907 	/* Setup the Transmit Control Register (TCTL). */
908 	reg_tctl = E1000_READ_REG(hw, E1000_TCTL);
909 	reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN |
910 	    (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) |
911 	    (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) |
912 	    E1000_TCTL_RTLC;
913 
914 	/* Enable the MULR bit */
915 	if (hw->bus.type == e1000_bus_type_pci_express)
916 		reg_tctl |= E1000_TCTL_MULR;
917 
918 	E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl);
919 
920 	/* Setup HW Base and Length of Tx descriptor area */
921 	size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc));
922 	E1000_WRITE_REG(hw, E1000_TDLEN(0), size);
923 	size = E1000_READ_REG(hw, E1000_TDLEN(0));
924 
925 	buf_low = (uint32_t)tx_ring->tbd_dma_addr;
926 	buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32);
927 
928 	/*
929 	 * Write the highest location first and work backward to the lowest.
930 	 * This is necessary for some adapter types to
931 	 * prevent write combining from occurring.
932 	 */
933 	E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high);
934 	E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low);
935 
936 	/* Setup our HW Tx Head & Tail descriptor pointers */
937 	E1000_WRITE_REG(hw, E1000_TDH(0), 0);
938 	E1000_WRITE_REG(hw, E1000_TDT(0), 0);
939 
940 	/* Set the default values for the Tx Inter Packet Gap timer */
941 	if ((hw->mac.type == e1000_82542) &&
942 	    ((hw->revision_id == E1000_REVISION_2) ||
943 	    (hw->revision_id == E1000_REVISION_3))) {
944 		reg_tipg = DEFAULT_82542_TIPG_IPGT;
945 		reg_tipg |=
946 		    DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
947 		reg_tipg |=
948 		    DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
949 	} else if (hw->mac.type == e1000_80003es2lan) {
950 		reg_tipg = DEFAULT_82543_TIPG_IPGR1;
951 		reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
952 		    E1000_TIPG_IPGR2_SHIFT;
953 	} else {
954 		if (hw->phy.media_type == e1000_media_type_fiber)
955 			reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER;
956 		else
957 			reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER;
958 		reg_tipg |=
959 		    DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
960 		reg_tipg |=
961 		    DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
962 	}
963 	E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg);
964 
965 	/* Setup Transmit Interrupt Delay Value */
966 	E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay);
967 	E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
968 	    "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay);
969 
970 	if (hw->mac.type >= e1000_82540) {
971 		E1000_WRITE_REG(&Adapter->shared, E1000_TADV,
972 		    Adapter->tx_intr_abs_delay);
973 		E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
974 		    "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay);
975 	}
976 
977 	tx_ring->tbd_avail = Adapter->tx_desc_num;
978 
979 	/* Initialize stored context information */
980 	bzero(&(tx_ring->pre_context), sizeof (context_data_t));
981 }
982 
983 /*
984  * e1000g_recycle - recycle the tx descriptors and tx sw packets
985  */
986 int
987 e1000g_recycle(e1000g_tx_ring_t *tx_ring)
988 {
989 	struct e1000g *Adapter;
990 	LIST_DESCRIBER pending_list;
991 	p_tx_sw_packet_t packet;
992 	mblk_t *mp;
993 	mblk_t *nmp;
994 	struct e1000_tx_desc *descriptor;
995 	int desc_count;
996 	int64_t delta;
997 
998 	/*
999 	 * This function will examine each TxSwPacket in the 'used' queue
1000 	 * if the e1000g is done with it then the associated resources (Tx
1001 	 * Descriptors) will be "freed" and the TxSwPacket will be
1002 	 * returned to the 'free' queue.
1003 	 */
1004 	Adapter = tx_ring->adapter;
1005 	delta = 0;
1006 
1007 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list);
1008 	if (packet == NULL) {
1009 		Adapter->stall_flag = B_FALSE;
1010 		return (0);
1011 	}
1012 
1013 	desc_count = 0;
1014 	QUEUE_INIT_LIST(&pending_list);
1015 
1016 	/* Sync the Tx descriptor DMA buffer */
1017 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
1018 	    0, 0, DDI_DMA_SYNC_FORKERNEL);
1019 	if (e1000g_check_dma_handle(
1020 	    tx_ring->tbd_dma_handle) != DDI_FM_OK) {
1021 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
1022 		Adapter->e1000g_state |= E1000G_ERROR;
1023 		return (0);
1024 	}
1025 
1026 	/*
1027 	 * While there are still TxSwPackets in the used queue check them
1028 	 */
1029 	mutex_enter(&tx_ring->usedlist_lock);
1030 	while ((packet =
1031 	    (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
1032 
1033 		/*
1034 		 * Get hold of the next descriptor that the e1000g will
1035 		 * report status back to (this will be the last descriptor
1036 		 * of a given sw packet). We only want to free the
1037 		 * sw packet (and it resources) if the e1000g is done
1038 		 * with ALL of the descriptors.  If the e1000g is done
1039 		 * with the last one then it is done with all of them.
1040 		 */
1041 		ASSERT(packet->num_desc);
1042 		descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1);
1043 
1044 		/* Check for wrap case */
1045 		if (descriptor > tx_ring->tbd_last)
1046 			descriptor -= Adapter->tx_desc_num;
1047 
1048 		/*
1049 		 * If the descriptor done bit is set free TxSwPacket and
1050 		 * associated resources
1051 		 */
1052 		if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) {
1053 			QUEUE_POP_HEAD(&tx_ring->used_list);
1054 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
1055 
1056 			if (descriptor == tx_ring->tbd_last)
1057 				tx_ring->tbd_oldest =
1058 				    tx_ring->tbd_first;
1059 			else
1060 				tx_ring->tbd_oldest =
1061 				    descriptor + 1;
1062 
1063 			desc_count += packet->num_desc;
1064 		} else {
1065 			/*
1066 			 * Found a sw packet that the e1000g is not done
1067 			 * with then there is no reason to check the rest
1068 			 * of the queue.
1069 			 */
1070 			delta = ddi_get_lbolt64() - packet->tickstamp;
1071 			break;
1072 		}
1073 	}
1074 
1075 	tx_ring->tbd_avail += desc_count;
1076 	Adapter->tx_pkt_cnt += desc_count;
1077 
1078 	mutex_exit(&tx_ring->usedlist_lock);
1079 
1080 	if (desc_count == 0) {
1081 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_none);
1082 		/*
1083 		 * If the packet hasn't been sent out for seconds and
1084 		 * the transmitter is not under paused flowctrl condition,
1085 		 * the transmitter is considered to be stalled.
1086 		 */
1087 		if ((delta > Adapter->stall_threshold) &&
1088 		    !(E1000_READ_REG(&Adapter->shared,
1089 		    E1000_STATUS) & E1000_STATUS_TXOFF)) {
1090 			Adapter->stall_flag = B_TRUE;
1091 		}
1092 		return (0);
1093 	}
1094 
1095 	Adapter->stall_flag = B_FALSE;
1096 
1097 	mp = NULL;
1098 	nmp = NULL;
1099 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
1100 	ASSERT(packet != NULL);
1101 	while (packet != NULL) {
1102 		if (packet->mp != NULL) {
1103 			ASSERT(packet->mp->b_next == NULL);
1104 			/* Assemble the message chain */
1105 			if (mp == NULL) {
1106 				mp = packet->mp;
1107 				nmp = packet->mp;
1108 			} else {
1109 				nmp->b_next = packet->mp;
1110 				nmp = packet->mp;
1111 			}
1112 			/* Disconnect the message from the sw packet */
1113 			packet->mp = NULL;
1114 		}
1115 
1116 		/* Free the TxSwPackets */
1117 		e1000g_free_tx_swpkt(packet);
1118 
1119 		packet = (p_tx_sw_packet_t)
1120 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
1121 	}
1122 
1123 	/* Return the TxSwPackets back to the FreeList */
1124 	mutex_enter(&tx_ring->freelist_lock);
1125 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
1126 	mutex_exit(&tx_ring->freelist_lock);
1127 
1128 	if (mp != NULL)
1129 		freemsgchain(mp);
1130 
1131 	return (desc_count);
1132 }
1133 /*
1134  * 82544 Coexistence issue workaround:
1135  *    There are 2 issues.
1136  *    1. If a 32 bit split completion happens from P64H2 and another
1137  *	agent drives a 64 bit request/split completion after ONLY
1138  *	1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then
1139  *	82544 has a problem where in to clock all the data in, it
1140  *	looks at REQ64# signal and since it has changed so fast (i.e. 1
1141  *	idle clock turn around), it will fail to clock all the data in.
1142  *	Data coming from certain ending addresses has exposure to this issue.
1143  *
1144  * To detect this issue, following equation can be used...
1145  *	SIZE[3:0] + ADDR[2:0] = SUM[3:0].
1146  *	If SUM[3:0] is in between 1 to 4, we will have this issue.
1147  *
1148  * ROOT CAUSE:
1149  *	The erratum involves the 82544 PCIX elasticity FIFO implementations as
1150  *	64-bit FIFO's and flushing of the final partial-bytes corresponding
1151  *	to the end of a requested read burst. Under a specific burst condition
1152  *	of ending-data alignment and 32-byte split-completions, the final
1153  *	byte(s) of split-completion data require an extra clock cycle to flush
1154  *	into 64-bit FIFO orientation.  An incorrect logic dependency on the
1155  *	REQ64# signal occurring during during this clock cycle may cause the
1156  *	residual byte(s) to be lost, thereby rendering the internal DMA client
1157  *	forever awaiting the final byte(s) for an outbound data-fetch.  The
1158  *	erratum is confirmed to *only* occur if certain subsequent external
1159  *	64-bit PCIX bus transactions occur immediately (minimum possible bus
1160  *	turn- around) following the odd-aligned 32-bit split-completion
1161  *	containing the final byte(s).  Intel has confirmed that this has been
1162  *	seen only with chipset/bridges which have the capability to provide
1163  *	32-bit split-completion data, and in the presence of newer PCIX bus
1164  *	agents which fully-optimize the inter-transaction turn-around (zero
1165  *	additional initiator latency when pre-granted bus ownership).
1166  *
1167  *   	This issue does not exist in PCI bus mode, when any agent is operating
1168  *	in 32 bit only mode or on chipsets that do not do 32 bit split
1169  *	completions for 64 bit read requests (Serverworks chipsets). P64H2 does
1170  *	32 bit split completions for any read request that has bit 2 set to 1
1171  *	for the requested address and read request size is more than 8 bytes.
1172  *
1173  *   2. Another issue is related to 82544 driving DACs under the similar
1174  *	scenario (32 bit split completion followed by 64 bit transaction with
1175  *	only 1 cycle turnaround). This issue is still being root caused. We
1176  *	think that both of these issues can be avoided if following workaround
1177  *	is implemented. It seems DAC issues is related to ending addresses being
1178  *	0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity
1179  *	FIFO which does not get flushed due to REQ64# dependency. We will only
1180  *	know the full story after it has been simulated successfully by HW team.
1181  *
1182  * WORKAROUND:
1183  *	Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC)
1184  */
1185 static uint32_t
1186 e1000g_fill_82544_desc(uint64_t address,
1187     size_t length, p_desc_array_t desc_array)
1188 {
1189 	/*
1190 	 * Since issue is sensitive to length and address.
1191 	 * Let us first check the address...
1192 	 */
1193 	uint32_t safe_terminator;
1194 
1195 	if (length <= 4) {
1196 		desc_array->descriptor[0].address = address;
1197 		desc_array->descriptor[0].length = (uint32_t)length;
1198 		desc_array->elements = 1;
1199 		return (desc_array->elements);
1200 	}
1201 	safe_terminator =
1202 	    (uint32_t)((((uint32_t)address & 0x7) +
1203 	    (length & 0xF)) & 0xF);
1204 	/*
1205 	 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then
1206 	 * return
1207 	 */
1208 	if (safe_terminator == 0 ||
1209 	    (safe_terminator > 4 && safe_terminator < 9) ||
1210 	    (safe_terminator > 0xC && safe_terminator <= 0xF)) {
1211 		desc_array->descriptor[0].address = address;
1212 		desc_array->descriptor[0].length = (uint32_t)length;
1213 		desc_array->elements = 1;
1214 		return (desc_array->elements);
1215 	}
1216 
1217 	desc_array->descriptor[0].address = address;
1218 	desc_array->descriptor[0].length = length - 4;
1219 	desc_array->descriptor[1].address = address + (length - 4);
1220 	desc_array->descriptor[1].length = 4;
1221 	desc_array->elements = 2;
1222 	return (desc_array->elements);
1223 }
1224 
1225 static int
1226 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet,
1227     mblk_t *mp, boolean_t tx_undersize_flag)
1228 {
1229 	size_t len;
1230 	size_t len1;
1231 	dma_buffer_t *tx_buf;
1232 	mblk_t *nmp;
1233 	boolean_t finished;
1234 	int desc_count;
1235 
1236 	desc_count = 0;
1237 	tx_buf = packet->tx_buf;
1238 	len = MBLKL(mp);
1239 
1240 	ASSERT((tx_buf->len + len) <= tx_buf->size);
1241 
1242 	if (len > 0) {
1243 		bcopy(mp->b_rptr,
1244 		    tx_buf->address + tx_buf->len,
1245 		    len);
1246 		tx_buf->len += len;
1247 
1248 		packet->num_mblk_frag++;
1249 	}
1250 
1251 	nmp = mp->b_cont;
1252 	if (nmp == NULL) {
1253 		finished = B_TRUE;
1254 	} else {
1255 		len1 = MBLKL(nmp);
1256 		if ((tx_buf->len + len1) > tx_buf->size)
1257 			finished = B_TRUE;
1258 		else if (tx_undersize_flag)
1259 			finished = B_FALSE;
1260 		else if (len1 > tx_ring->adapter->tx_bcopy_thresh)
1261 			finished = B_TRUE;
1262 		else
1263 			finished = B_FALSE;
1264 	}
1265 
1266 	if (finished) {
1267 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy,
1268 		    (tx_buf->len > len));
1269 
1270 		/*
1271 		 * If the packet is smaller than 64 bytes, which is the
1272 		 * minimum ethernet packet size, pad the packet to make
1273 		 * it at least 60 bytes. The hardware will add 4 bytes
1274 		 * for CRC.
1275 		 */
1276 		if (tx_undersize_flag) {
1277 			ASSERT(tx_buf->len < ETHERMIN);
1278 
1279 			bzero(tx_buf->address + tx_buf->len,
1280 			    ETHERMIN - tx_buf->len);
1281 			tx_buf->len = ETHERMIN;
1282 		}
1283 
1284 #ifdef __sparc
1285 		if (packet->dma_type == USE_DVMA)
1286 			dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV);
1287 		else
1288 			(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1289 			    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1290 #else
1291 		(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1292 		    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1293 #endif
1294 
1295 		packet->data_transfer_type = USE_BCOPY;
1296 
1297 		desc_count = e1000g_fill_tx_desc(tx_ring,
1298 		    packet,
1299 		    tx_buf->dma_address,
1300 		    tx_buf->len);
1301 
1302 		if (desc_count <= 0)
1303 			return (-1);
1304 	}
1305 
1306 	return (desc_count);
1307 }
1308 
1309 static int
1310 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp)
1311 {
1312 	int j;
1313 	int mystat;
1314 	size_t len;
1315 	ddi_dma_cookie_t dma_cookie;
1316 	uint_t ncookies;
1317 	int desc_count;
1318 	uint32_t desc_total;
1319 
1320 	desc_total = 0;
1321 	len = MBLKL(mp);
1322 
1323 	/*
1324 	 * ddi_dma_addr_bind_handle() allocates  DMA  resources  for  a
1325 	 * memory  object such that a device can perform DMA to or from
1326 	 * the object.  DMA resources  are  allocated  considering  the
1327 	 * device's  DMA  attributes  as  expressed by ddi_dma_attr(9S)
1328 	 * (see ddi_dma_alloc_handle(9F)).
1329 	 *
1330 	 * ddi_dma_addr_bind_handle() fills in  the  first  DMA  cookie
1331 	 * pointed  to by cookiep with the appropriate address, length,
1332 	 * and bus type. *ccountp is set to the number of DMA  cookies
1333 	 * representing this DMA object. Subsequent DMA cookies must be
1334 	 * retrieved by calling ddi_dma_nextcookie(9F)  the  number  of
1335 	 * times specified by *countp - 1.
1336 	 */
1337 	switch (packet->dma_type) {
1338 #ifdef __sparc
1339 	case USE_DVMA:
1340 		dvma_kaddr_load(packet->tx_dma_handle,
1341 		    (caddr_t)mp->b_rptr, len, 0, &dma_cookie);
1342 
1343 		dvma_sync(packet->tx_dma_handle, 0,
1344 		    DDI_DMA_SYNC_FORDEV);
1345 
1346 		ncookies = 1;
1347 		packet->data_transfer_type = USE_DVMA;
1348 		break;
1349 #endif
1350 	case USE_DMA:
1351 		if ((mystat = ddi_dma_addr_bind_handle(
1352 		    packet->tx_dma_handle, NULL,
1353 		    (caddr_t)mp->b_rptr, len,
1354 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1355 		    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1356 		    &ncookies)) != DDI_DMA_MAPPED) {
1357 
1358 			e1000g_log(tx_ring->adapter, CE_WARN,
1359 			    "Couldn't bind mblk buffer to Tx DMA handle: "
1360 			    "return: %X, Pkt: %X\n",
1361 			    mystat, packet);
1362 			return (-1);
1363 		}
1364 
1365 		/*
1366 		 * An implicit ddi_dma_sync() is done when the
1367 		 * ddi_dma_addr_bind_handle() is called. So we
1368 		 * don't need to explicitly call ddi_dma_sync()
1369 		 * here any more.
1370 		 */
1371 		ASSERT(ncookies);
1372 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie,
1373 		    (ncookies > 1));
1374 
1375 		/*
1376 		 * The data_transfer_type value must be set after the handle
1377 		 * has been bound, for it will be used in e1000g_free_tx_swpkt()
1378 		 * to decide whether we need to unbind the handle.
1379 		 */
1380 		packet->data_transfer_type = USE_DMA;
1381 		break;
1382 	default:
1383 		ASSERT(B_FALSE);
1384 		break;
1385 	}
1386 
1387 	packet->num_mblk_frag++;
1388 
1389 	/*
1390 	 * Each address could span thru multpile cookie..
1391 	 * Each cookie will have one descriptor
1392 	 */
1393 	for (j = ncookies; j != 0; j--) {
1394 
1395 		desc_count = e1000g_fill_tx_desc(tx_ring,
1396 		    packet,
1397 		    dma_cookie.dmac_laddress,
1398 		    dma_cookie.dmac_size);
1399 
1400 		if (desc_count <= 0)
1401 			return (-1);
1402 
1403 		desc_total += desc_count;
1404 
1405 		/*
1406 		 * ddi_dma_nextcookie() retrieves subsequent DMA
1407 		 * cookies for a DMA object.
1408 		 * ddi_dma_nextcookie() fills in the
1409 		 * ddi_dma_cookie(9S) structure pointed to by
1410 		 * cookiep.  The ddi_dma_cookie(9S) structure
1411 		 * must be allocated prior to calling
1412 		 * ddi_dma_nextcookie(). The DMA cookie count
1413 		 * returned by ddi_dma_buf_bind_handle(9F),
1414 		 * ddi_dma_addr_bind_handle(9F), or
1415 		 * ddi_dma_getwin(9F) indicates the number of DMA
1416 		 * cookies a DMA object consists of.  If the
1417 		 * resulting cookie count, N, is larger than 1,
1418 		 * ddi_dma_nextcookie() must be called N-1 times
1419 		 * to retrieve all DMA cookies.
1420 		 */
1421 		if (j > 1) {
1422 			ddi_dma_nextcookie(packet->tx_dma_handle,
1423 			    &dma_cookie);
1424 		}
1425 	}
1426 
1427 	return (desc_total);
1428 }
1429 
1430 static void
1431 e1000g_fill_context_descriptor(context_data_t *cur_context,
1432     struct e1000_context_desc *context_desc)
1433 {
1434 	if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) {
1435 		context_desc->lower_setup.ip_fields.ipcss =
1436 		    cur_context->ether_header_size;
1437 		context_desc->lower_setup.ip_fields.ipcso =
1438 		    cur_context->ether_header_size +
1439 		    offsetof(struct ip, ip_sum);
1440 		context_desc->lower_setup.ip_fields.ipcse =
1441 		    cur_context->ether_header_size +
1442 		    cur_context->cksum_start - 1;
1443 	} else
1444 		context_desc->lower_setup.ip_config = 0;
1445 
1446 	if (cur_context->cksum_flags & HCK_PARTIALCKSUM) {
1447 		/*
1448 		 * The packet with same protocol has the following
1449 		 * stuff and start offset:
1450 		 * |  Protocol  | Stuff  | Start  | Checksum
1451 		 * |		| Offset | Offset | Enable
1452 		 * | IPv4 + TCP |  0x24  |  0x14  |  Yes
1453 		 * | IPv4 + UDP |  0x1A  |  0x14  |  Yes
1454 		 * | IPv6 + TCP |  0x20  |  0x10  |  No
1455 		 * | IPv6 + UDP |  0x14  |  0x10  |  No
1456 		 */
1457 		context_desc->upper_setup.tcp_fields.tucss =
1458 		    cur_context->cksum_start + cur_context->ether_header_size;
1459 		context_desc->upper_setup.tcp_fields.tucso =
1460 		    cur_context->cksum_stuff + cur_context->ether_header_size;
1461 		context_desc->upper_setup.tcp_fields.tucse = 0;
1462 	} else
1463 		context_desc->upper_setup.tcp_config = 0;
1464 
1465 	if (cur_context->lso_flag) {
1466 		context_desc->tcp_seg_setup.fields.mss = cur_context->mss;
1467 		context_desc->tcp_seg_setup.fields.hdr_len =
1468 		    cur_context->hdr_len;
1469 		/*
1470 		 * workaround for 82546EB errata 23, status-writeback
1471 		 * reporting (RS) should not be set on context or
1472 		 * Null descriptors
1473 		 */
1474 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1475 		    | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP
1476 		    | E1000_TXD_DTYP_C | cur_context->pay_len;
1477 	} else {
1478 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1479 		    | E1000_TXD_DTYP_C;
1480 		/*
1481 		 * Zero out the options for TCP Segmentation Offload
1482 		 */
1483 		context_desc->tcp_seg_setup.data = 0;
1484 	}
1485 }
1486 
1487 static int
1488 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring,
1489     p_tx_sw_packet_t packet, uint64_t address, size_t size)
1490 {
1491 	struct e1000_hw *hw = &tx_ring->adapter->shared;
1492 	p_sw_desc_t desc;
1493 
1494 	if (hw->mac.type == e1000_82544) {
1495 		if (hw->bus.type == e1000_bus_type_pcix)
1496 			return (e1000g_tx_workaround_PCIX_82544(packet,
1497 			    address, size));
1498 
1499 		if (size > JUMBO_FRAG_LENGTH)
1500 			return (e1000g_tx_workaround_jumbo_82544(packet,
1501 			    address, size));
1502 	}
1503 
1504 	ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1505 
1506 	desc = &packet->desc[packet->num_desc];
1507 	desc->address = address;
1508 	desc->length = (uint32_t)size;
1509 
1510 	packet->num_desc++;
1511 
1512 	return (1);
1513 }
1514 
1515 static int
1516 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet,
1517     uint64_t address, size_t size)
1518 {
1519 	p_sw_desc_t desc;
1520 	int desc_count;
1521 	long size_left;
1522 	size_t len;
1523 	uint32_t counter;
1524 	uint32_t array_elements;
1525 	desc_array_t desc_array;
1526 
1527 	/*
1528 	 * Coexist Workaround for cordova: RP: 07/04/03
1529 	 *
1530 	 * RP: ERRATA: Workaround ISSUE:
1531 	 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup
1532 	 * Eachbuffer in to 8kb pieces until the
1533 	 * remainder is < 8kb
1534 	 */
1535 	size_left = size;
1536 	desc_count = 0;
1537 
1538 	while (size_left > 0) {
1539 		if (size_left > MAX_TX_BUF_SIZE)
1540 			len = MAX_TX_BUF_SIZE;
1541 		else
1542 			len = size_left;
1543 
1544 		array_elements = e1000g_fill_82544_desc(address,
1545 		    len, &desc_array);
1546 
1547 		for (counter = 0; counter < array_elements; counter++) {
1548 			ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1549 			/*
1550 			 * Put in the buffer address
1551 			 */
1552 			desc = &packet->desc[packet->num_desc];
1553 
1554 			desc->address =
1555 			    desc_array.descriptor[counter].address;
1556 			desc->length =
1557 			    desc_array.descriptor[counter].length;
1558 
1559 			packet->num_desc++;
1560 			desc_count++;
1561 		} /* for */
1562 
1563 		/*
1564 		 * Update the buffer address and length
1565 		 */
1566 		address += MAX_TX_BUF_SIZE;
1567 		size_left -= MAX_TX_BUF_SIZE;
1568 	} /* while */
1569 
1570 	return (desc_count);
1571 }
1572 
1573 static int
1574 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet,
1575     uint64_t address, size_t size)
1576 {
1577 	p_sw_desc_t desc;
1578 	int desc_count;
1579 	long size_left;
1580 	uint32_t offset;
1581 
1582 	/*
1583 	 * Workaround for Jumbo Frames on Cordova
1584 	 * PSD 06/01/2001
1585 	 */
1586 	size_left = size;
1587 	desc_count = 0;
1588 	offset = 0;
1589 	while (size_left > 0) {
1590 		ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1591 
1592 		desc = &packet->desc[packet->num_desc];
1593 
1594 		desc->address = address + offset;
1595 
1596 		if (size_left > JUMBO_FRAG_LENGTH)
1597 			desc->length = JUMBO_FRAG_LENGTH;
1598 		else
1599 			desc->length = (uint32_t)size_left;
1600 
1601 		packet->num_desc++;
1602 		desc_count++;
1603 
1604 		offset += desc->length;
1605 		size_left -= JUMBO_FRAG_LENGTH;
1606 	}
1607 
1608 	return (desc_count);
1609 }
1610 
1611 #pragma inline(e1000g_82547_tx_move_tail_work)
1612 
1613 static void
1614 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring)
1615 {
1616 	struct e1000_hw *hw;
1617 	uint16_t hw_tdt;
1618 	uint16_t sw_tdt;
1619 	struct e1000_tx_desc *tx_desc;
1620 	uint16_t length = 0;
1621 	boolean_t eop = B_FALSE;
1622 	struct e1000g *Adapter;
1623 
1624 	Adapter = tx_ring->adapter;
1625 	hw = &Adapter->shared;
1626 
1627 	hw_tdt = E1000_READ_REG(hw, E1000_TDT(0));
1628 	sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first;
1629 
1630 	while (hw_tdt != sw_tdt) {
1631 		tx_desc = &(tx_ring->tbd_first[hw_tdt]);
1632 		length += tx_desc->lower.flags.length;
1633 		eop = tx_desc->lower.data & E1000_TXD_CMD_EOP;
1634 		if (++hw_tdt == Adapter->tx_desc_num)
1635 			hw_tdt = 0;
1636 
1637 		if (eop) {
1638 			if ((Adapter->link_duplex == HALF_DUPLEX) &&
1639 			    (e1000_fifo_workaround_82547(hw, length)
1640 			    != E1000_SUCCESS)) {
1641 				if (tx_ring->timer_enable_82547) {
1642 					ASSERT(tx_ring->timer_id_82547 == 0);
1643 					tx_ring->timer_id_82547 =
1644 					    timeout(e1000g_82547_timeout,
1645 					    (void *)tx_ring,
1646 					    drv_usectohz(10000));
1647 				}
1648 				return;
1649 
1650 			} else {
1651 				E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt);
1652 				e1000_update_tx_fifo_head_82547(hw, length);
1653 				length = 0;
1654 			}
1655 		}
1656 	}
1657 }
1658 
1659 static void
1660 e1000g_82547_timeout(void *arg)
1661 {
1662 	e1000g_tx_ring_t *tx_ring;
1663 
1664 	tx_ring = (e1000g_tx_ring_t *)arg;
1665 
1666 	mutex_enter(&tx_ring->tx_lock);
1667 
1668 	tx_ring->timer_id_82547 = 0;
1669 	e1000g_82547_tx_move_tail_work(tx_ring);
1670 
1671 	mutex_exit(&tx_ring->tx_lock);
1672 }
1673 
1674 static void
1675 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring)
1676 {
1677 	timeout_id_t tid;
1678 
1679 	ASSERT(MUTEX_HELD(&tx_ring->tx_lock));
1680 
1681 	tid = tx_ring->timer_id_82547;
1682 	tx_ring->timer_id_82547 = 0;
1683 	if (tid != 0) {
1684 		tx_ring->timer_enable_82547 = B_FALSE;
1685 		mutex_exit(&tx_ring->tx_lock);
1686 
1687 		(void) untimeout(tid);
1688 
1689 		mutex_enter(&tx_ring->tx_lock);
1690 	}
1691 	tx_ring->timer_enable_82547 = B_TRUE;
1692 	e1000g_82547_tx_move_tail_work(tx_ring);
1693 }
1694