xref: /titanic_52/usr/src/uts/common/io/e1000g/e1000g_tx.c (revision 2ac302890e472bf0c11db192dd18f12ded6043f6)
1 /*
2  * This file is provided under a CDDLv1 license.  When using or
3  * redistributing this file, you may do so under this license.
4  * In redistributing this file this license must be included
5  * and no other modification of this header file is permitted.
6  *
7  * CDDL LICENSE SUMMARY
8  *
9  * Copyright(c) 1999 - 2009 Intel Corporation. All rights reserved.
10  *
11  * The contents of this file are subject to the terms of Version
12  * 1.0 of the Common Development and Distribution License (the "License").
13  *
14  * You should have received a copy of the License with this software.
15  * You can obtain a copy of the License at
16  *	http://www.opensolaris.org/os/licensing.
17  * See the License for the specific language governing permissions
18  * and limitations under the License.
19  */
20 
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
28  */
29 
30 /*
31  * **********************************************************************
32  *									*
33  * Module Name:								*
34  *   e1000g_tx.c							*
35  *									*
36  * Abstract:								*
37  *   This file contains some routines that take care of Transmit,	*
38  *   make the hardware to send the data pointed by the packet out	*
39  *   on to the physical medium.						*
40  *									*
41  * **********************************************************************
42  */
43 
44 #include "e1000g_sw.h"
45 #include "e1000g_debug.h"
46 
47 static boolean_t e1000g_send(struct e1000g *, mblk_t *);
48 static int e1000g_tx_copy(e1000g_tx_ring_t *,
49     p_tx_sw_packet_t, mblk_t *, boolean_t);
50 static int e1000g_tx_bind(e1000g_tx_ring_t *,
51     p_tx_sw_packet_t, mblk_t *);
52 static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t);
53 static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *);
54 static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *,
55     context_data_t *);
56 static void e1000g_fill_context_descriptor(context_data_t *,
57     struct e1000_context_desc *);
58 static int e1000g_fill_tx_desc(e1000g_tx_ring_t *,
59     p_tx_sw_packet_t, uint64_t, size_t);
60 static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length,
61     p_desc_array_t desc_array);
62 static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t);
63 static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t);
64 static void e1000g_82547_timeout(void *);
65 static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *);
66 static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *);
67 
68 #ifndef E1000G_DEBUG
69 #pragma inline(e1000g_tx_copy)
70 #pragma inline(e1000g_tx_bind)
71 #pragma inline(e1000g_retrieve_context)
72 #pragma inline(e1000g_check_context)
73 #pragma inline(e1000g_fill_tx_ring)
74 #pragma inline(e1000g_fill_context_descriptor)
75 #pragma inline(e1000g_fill_tx_desc)
76 #pragma inline(e1000g_fill_82544_desc)
77 #pragma inline(e1000g_tx_workaround_PCIX_82544)
78 #pragma inline(e1000g_tx_workaround_jumbo_82544)
79 #pragma inline(e1000g_free_tx_swpkt)
80 #endif
81 
82 /*
83  * e1000g_free_tx_swpkt	- free up the tx sw packet
84  *
85  * Unbind the previously bound DMA handle for a given
86  * transmit sw packet. And reset the sw packet data.
87  */
88 void
89 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet)
90 {
91 	switch (packet->data_transfer_type) {
92 	case USE_BCOPY:
93 		packet->tx_buf->len = 0;
94 		break;
95 #ifdef __sparc
96 	case USE_DVMA:
97 		dvma_unload(packet->tx_dma_handle, 0, -1);
98 		break;
99 #endif
100 	case USE_DMA:
101 		(void) ddi_dma_unbind_handle(packet->tx_dma_handle);
102 		break;
103 	default:
104 		break;
105 	}
106 
107 	/*
108 	 * The mblk has been stripped off the sw packet
109 	 * and will be freed in a triggered soft intr.
110 	 */
111 	ASSERT(packet->mp == NULL);
112 
113 	packet->data_transfer_type = USE_NONE;
114 	packet->num_mblk_frag = 0;
115 	packet->num_desc = 0;
116 }
117 
118 mblk_t *
119 e1000g_m_tx(void *arg, mblk_t *mp)
120 {
121 	struct e1000g *Adapter = (struct e1000g *)arg;
122 	mblk_t *next;
123 
124 	rw_enter(&Adapter->chip_lock, RW_READER);
125 
126 	if ((Adapter->e1000g_state & E1000G_SUSPENDED) ||
127 	    !(Adapter->e1000g_state & E1000G_STARTED) ||
128 	    (Adapter->link_state != LINK_STATE_UP)) {
129 		freemsgchain(mp);
130 		mp = NULL;
131 	}
132 
133 	while (mp != NULL) {
134 		next = mp->b_next;
135 		mp->b_next = NULL;
136 
137 		if (!e1000g_send(Adapter, mp)) {
138 			mp->b_next = next;
139 			break;
140 		}
141 
142 		mp = next;
143 	}
144 
145 	rw_exit(&Adapter->chip_lock);
146 	return (mp);
147 }
148 
149 /*
150  * e1000g_send -  send packets onto the wire
151  *
152  * Called from e1000g_m_tx with an mblk ready to send. this
153  * routine sets up the transmit descriptors and sends data to
154  * the wire. It also pushes the just transmitted packet to
155  * the used tx sw packet list.
156  */
157 static boolean_t
158 e1000g_send(struct e1000g *Adapter, mblk_t *mp)
159 {
160 	p_tx_sw_packet_t packet;
161 	LIST_DESCRIBER pending_list;
162 	size_t len;
163 	size_t msg_size;
164 	uint32_t frag_count;
165 	int desc_count;
166 	uint32_t desc_total;
167 	uint32_t bcopy_thresh;
168 	uint32_t hdr_frag_len;
169 	boolean_t tx_undersize_flag;
170 	mblk_t *nmp;
171 	mblk_t *tmp;
172 	mblk_t *new_mp;
173 	mblk_t *pre_mp;
174 	mblk_t *next_mp;
175 	e1000g_tx_ring_t *tx_ring;
176 	context_data_t cur_context;
177 
178 	tx_ring = Adapter->tx_ring;
179 	bcopy_thresh = Adapter->tx_bcopy_thresh;
180 
181 	/* Get the total size and frags number of the message */
182 	tx_undersize_flag = B_FALSE;
183 	frag_count = 0;
184 	msg_size = 0;
185 	for (nmp = mp; nmp; nmp = nmp->b_cont) {
186 		frag_count++;
187 		msg_size += MBLKL(nmp);
188 	}
189 
190 	/* retrieve and compute information for context descriptor */
191 	if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) {
192 		freemsg(mp);
193 		return (B_TRUE);
194 	}
195 
196 	/*
197 	 * Make sure the packet is less than the allowed size
198 	 */
199 	if (!cur_context.lso_flag &&
200 	    (msg_size > Adapter->max_frame_size - ETHERFCSL)) {
201 		/*
202 		 * For the over size packet, we'll just drop it.
203 		 * So we return B_TRUE here.
204 		 */
205 		E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL,
206 		    "Tx packet out of bound. length = %d \n", msg_size);
207 		E1000G_STAT(tx_ring->stat_over_size);
208 		freemsg(mp);
209 		return (B_TRUE);
210 	}
211 
212 	/*
213 	 * Check and reclaim tx descriptors.
214 	 * This low water mark check should be done all the time as
215 	 * Transmit interrupt delay can produce Transmit interrupts little
216 	 * late and that may cause few problems related to reaping Tx
217 	 * Descriptors... As you may run short of them before getting any
218 	 * transmit interrupt...
219 	 */
220 	if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
221 		(void) e1000g_recycle(tx_ring);
222 		E1000G_DEBUG_STAT(tx_ring->stat_recycle);
223 
224 		if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
225 			E1000G_DEBUG_STAT(tx_ring->stat_lack_desc);
226 			goto tx_no_resource;
227 		}
228 	}
229 
230 	/*
231 	 * If the message size is less than the minimum ethernet packet size,
232 	 * we'll use bcopy to send it, and padd it to 60 bytes later.
233 	 */
234 	if (msg_size < ETHERMIN) {
235 		E1000G_DEBUG_STAT(tx_ring->stat_under_size);
236 		tx_undersize_flag = B_TRUE;
237 	}
238 
239 	/* Initialize variables */
240 	desc_count = 1;	/* The initial value should be greater than 0 */
241 	desc_total = 0;
242 	new_mp = NULL;
243 	QUEUE_INIT_LIST(&pending_list);
244 
245 	/* Process each mblk fragment and fill tx descriptors */
246 	/*
247 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
248 	 * to be within one descriptor. Here we reallocate and refill the
249 	 * the header if it's physical memory non-contiguous.
250 	 */
251 	if (cur_context.lso_flag) {
252 		/* find the last fragment of the header */
253 		len = MBLKL(mp);
254 		ASSERT(len > 0);
255 		next_mp = mp;
256 		pre_mp = NULL;
257 		while (len < cur_context.hdr_len) {
258 			pre_mp = next_mp;
259 			next_mp = next_mp->b_cont;
260 			len += MBLKL(next_mp);
261 		}
262 		/*
263 		 * If the header and the payload are in different mblks,
264 		 * we simply force the header to be copied into pre-allocated
265 		 * page-aligned buffer.
266 		 */
267 		if (len == cur_context.hdr_len)
268 			goto adjust_threshold;
269 
270 		hdr_frag_len = cur_context.hdr_len - (len - MBLKL(next_mp));
271 		/*
272 		 * There are three cases we need to reallocate a mblk for the
273 		 * last header fragment:
274 		 *
275 		 * 1. the header is in multiple mblks and the last fragment
276 		 * share the same mblk with the payload
277 		 *
278 		 * 2. the header is in a single mblk shared with the payload
279 		 * and the header is physical memory non-contiguous
280 		 *
281 		 * 3. there is 4 KB boundary within the header and 64 bytes
282 		 * following the end of the header bytes. The case may cause
283 		 * TCP data corruption issue.
284 		 *
285 		 * The workaround for the case #2 and case #3 is:
286 		 *   Assuming standard Ethernet/IP/TCP headers of 54 bytes,
287 		 *   this means that the buffer(containing the headers) should
288 		 *   not start -118 bytes before a 4 KB boundary. For example,
289 		 *   128-byte alignment for this buffer could be used to fulfill
290 		 *   this condition.
291 		 */
292 		if ((next_mp != mp) ||
293 		    (P2NPHASE((uintptr_t)next_mp->b_rptr,
294 		    E1000_LSO_FIRST_DESC_ALIGNMENT_BOUNDARY_4K)
295 		    < E1000_LSO_FIRST_DESC_ALIGNMENT)) {
296 			E1000G_DEBUG_STAT(tx_ring->stat_lso_header_fail);
297 			/*
298 			 * reallocate the mblk for the last header fragment,
299 			 * expect to bcopy into pre-allocated page-aligned
300 			 * buffer
301 			 */
302 			new_mp = allocb(hdr_frag_len, NULL);
303 			if (!new_mp)
304 				return (B_FALSE);
305 			bcopy(next_mp->b_rptr, new_mp->b_rptr, hdr_frag_len);
306 			/* link the new header fragment with the other parts */
307 			new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
308 			new_mp->b_cont = next_mp;
309 			if (pre_mp)
310 				pre_mp->b_cont = new_mp;
311 			else
312 				mp = new_mp;
313 			next_mp->b_rptr += hdr_frag_len;
314 			frag_count++;
315 		}
316 adjust_threshold:
317 		/*
318 		 * adjust the bcopy threshhold to guarantee
319 		 * the header to use bcopy way
320 		 */
321 		if (bcopy_thresh < cur_context.hdr_len)
322 			bcopy_thresh = cur_context.hdr_len;
323 	}
324 
325 	packet = NULL;
326 	nmp = mp;
327 	while (nmp) {
328 		tmp = nmp->b_cont;
329 
330 		len = MBLKL(nmp);
331 		/* Check zero length mblks */
332 		if (len == 0) {
333 			E1000G_DEBUG_STAT(tx_ring->stat_empty_frags);
334 			/*
335 			 * If there're no packet buffers have been used,
336 			 * or we just completed processing a buffer, then
337 			 * skip the empty mblk fragment.
338 			 * Otherwise, there's still a pending buffer that
339 			 * needs to be processed (tx_copy).
340 			 */
341 			if (desc_count > 0) {
342 				nmp = tmp;
343 				continue;
344 			}
345 		}
346 
347 		/*
348 		 * Get a new TxSwPacket to process mblk buffers.
349 		 */
350 		if (desc_count > 0) {
351 			mutex_enter(&tx_ring->freelist_lock);
352 			packet = (p_tx_sw_packet_t)
353 			    QUEUE_POP_HEAD(&tx_ring->free_list);
354 			mutex_exit(&tx_ring->freelist_lock);
355 
356 			if (packet == NULL) {
357 				E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
358 				    "No Tx SwPacket available\n");
359 				E1000G_STAT(tx_ring->stat_no_swpkt);
360 				goto tx_send_failed;
361 			}
362 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
363 		}
364 
365 		ASSERT(packet);
366 		/*
367 		 * If the size of the fragment is less than the tx_bcopy_thresh
368 		 * we'll use bcopy; Otherwise, we'll use DMA binding.
369 		 */
370 		if ((len <= bcopy_thresh) || tx_undersize_flag) {
371 			desc_count =
372 			    e1000g_tx_copy(tx_ring, packet, nmp,
373 			    tx_undersize_flag);
374 			E1000G_DEBUG_STAT(tx_ring->stat_copy);
375 		} else {
376 			desc_count =
377 			    e1000g_tx_bind(tx_ring, packet, nmp);
378 			E1000G_DEBUG_STAT(tx_ring->stat_bind);
379 		}
380 
381 		if (desc_count > 0)
382 			desc_total += desc_count;
383 		else if (desc_count < 0)
384 			goto tx_send_failed;
385 
386 		nmp = tmp;
387 	}
388 
389 	/* Assign the message to the last sw packet */
390 	ASSERT(packet);
391 	ASSERT(packet->mp == NULL);
392 	packet->mp = mp;
393 
394 	/* Try to recycle the tx descriptors again */
395 	if (tx_ring->tbd_avail < (desc_total + 3)) {
396 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry);
397 		(void) e1000g_recycle(tx_ring);
398 	}
399 
400 	mutex_enter(&tx_ring->tx_lock);
401 
402 	/*
403 	 * If the number of available tx descriptors is not enough for transmit
404 	 * (one redundant descriptor and one hw checksum context descriptor are
405 	 * included), then return failure.
406 	 */
407 	if (tx_ring->tbd_avail < (desc_total + 3)) {
408 		E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
409 		    "No Enough Tx descriptors\n");
410 		E1000G_STAT(tx_ring->stat_no_desc);
411 		mutex_exit(&tx_ring->tx_lock);
412 		goto tx_send_failed;
413 	}
414 
415 	desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context);
416 
417 	mutex_exit(&tx_ring->tx_lock);
418 
419 	ASSERT(desc_count > 0);
420 
421 	/* Send successful */
422 	return (B_TRUE);
423 
424 tx_send_failed:
425 	/* Restore mp to original */
426 	if (new_mp) {
427 		if (pre_mp) {
428 			pre_mp->b_cont = next_mp;
429 		}
430 		new_mp->b_cont = NULL;
431 		freemsg(new_mp);
432 
433 		next_mp->b_rptr -= hdr_frag_len;
434 	}
435 
436 	/*
437 	 * Enable Transmit interrupts, so that the interrupt routine can
438 	 * call mac_tx_update() when transmit descriptors become available.
439 	 */
440 	tx_ring->resched_timestamp = ddi_get_lbolt();
441 	tx_ring->resched_needed = B_TRUE;
442 	if (!Adapter->tx_intr_enable)
443 		e1000g_mask_tx_interrupt(Adapter);
444 
445 	/* Free pending TxSwPackets */
446 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
447 	while (packet) {
448 		packet->mp = NULL;
449 		e1000g_free_tx_swpkt(packet);
450 		packet = (p_tx_sw_packet_t)
451 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
452 	}
453 
454 	/* Return pending TxSwPackets to the "Free" list */
455 	mutex_enter(&tx_ring->freelist_lock);
456 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
457 	mutex_exit(&tx_ring->freelist_lock);
458 
459 	E1000G_STAT(tx_ring->stat_send_fail);
460 
461 	/* Message will be scheduled for re-transmit */
462 	return (B_FALSE);
463 
464 tx_no_resource:
465 	/*
466 	 * Enable Transmit interrupts, so that the interrupt routine can
467 	 * call mac_tx_update() when transmit descriptors become available.
468 	 */
469 	tx_ring->resched_timestamp = ddi_get_lbolt();
470 	tx_ring->resched_needed = B_TRUE;
471 	if (!Adapter->tx_intr_enable)
472 		e1000g_mask_tx_interrupt(Adapter);
473 
474 	/* Message will be scheduled for re-transmit */
475 	return (B_FALSE);
476 }
477 
478 static boolean_t
479 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
480     size_t msg_size)
481 {
482 	uintptr_t ip_start;
483 	uintptr_t tcp_start;
484 	mblk_t *nmp;
485 	uint32_t lsoflags;
486 	uint32_t mss;
487 
488 	bzero(cur_context, sizeof (context_data_t));
489 
490 	/* first check lso information */
491 	mac_lso_get(mp, &mss, &lsoflags);
492 
493 	/* retrieve checksum info */
494 	mac_hcksum_get(mp, &cur_context->cksum_start,
495 	    &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
496 	/* retrieve ethernet header size */
497 	if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid ==
498 	    htons(ETHERTYPE_VLAN))
499 		cur_context->ether_header_size =
500 		    sizeof (struct ether_vlan_header);
501 	else
502 		cur_context->ether_header_size =
503 		    sizeof (struct ether_header);
504 
505 	if (lsoflags & HW_LSO) {
506 		ASSERT(mss != 0);
507 
508 		/* free the invalid packet */
509 		if (mss == 0 ||
510 		    !((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
511 		    (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
512 			return (B_FALSE);
513 		}
514 		cur_context->mss = (uint16_t)mss;
515 		cur_context->lso_flag = B_TRUE;
516 
517 		/*
518 		 * Some fields are cleared for the hardware to fill
519 		 * in. We don't assume Ethernet header, IP header and
520 		 * TCP header are always in the same mblk fragment,
521 		 * while we assume each header is always within one
522 		 * mblk fragment and Ethernet header is always in the
523 		 * first mblk fragment.
524 		 */
525 		nmp = mp;
526 		ip_start = (uintptr_t)(nmp->b_rptr)
527 		    + cur_context->ether_header_size;
528 		if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
529 			ip_start = (uintptr_t)nmp->b_cont->b_rptr
530 			    + (ip_start - (uintptr_t)(nmp->b_wptr));
531 			nmp = nmp->b_cont;
532 		}
533 		tcp_start = ip_start +
534 		    IPH_HDR_LENGTH((ipha_t *)ip_start);
535 		if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
536 			tcp_start = (uintptr_t)nmp->b_cont->b_rptr
537 			    + (tcp_start - (uintptr_t)(nmp->b_wptr));
538 			nmp = nmp->b_cont;
539 		}
540 		cur_context->hdr_len = cur_context->ether_header_size
541 		    + IPH_HDR_LENGTH((ipha_t *)ip_start)
542 		    + TCP_HDR_LENGTH((tcph_t *)tcp_start);
543 		((ipha_t *)ip_start)->ipha_length = 0;
544 		((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
545 		/* calculate the TCP packet payload length */
546 		cur_context->pay_len = msg_size - cur_context->hdr_len;
547 	}
548 	return (B_TRUE);
549 }
550 
551 static boolean_t
552 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context)
553 {
554 	boolean_t context_reload;
555 	context_data_t *pre_context;
556 	struct e1000g *Adapter;
557 
558 	context_reload = B_FALSE;
559 	pre_context = &tx_ring->pre_context;
560 	Adapter = tx_ring->adapter;
561 
562 	/*
563 	 * The following code determine if the context descriptor is
564 	 * needed to be reloaded. The sequence of the conditions is
565 	 * made by their possibilities of changing.
566 	 */
567 	/*
568 	 * workaround for 82546EB, context descriptor must be reloaded
569 	 * per LSO/hw_cksum packet if LSO is enabled.
570 	 */
571 	if (Adapter->lso_premature_issue &&
572 	    Adapter->lso_enable &&
573 	    (cur_context->cksum_flags != 0)) {
574 
575 		context_reload = B_TRUE;
576 	} else if (cur_context->lso_flag) {
577 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
578 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
579 		    (cur_context->pay_len != pre_context->pay_len) ||
580 		    (cur_context->mss != pre_context->mss) ||
581 		    (cur_context->hdr_len != pre_context->hdr_len) ||
582 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
583 		    (cur_context->cksum_start != pre_context->cksum_start) ||
584 		    (cur_context->ether_header_size !=
585 		    pre_context->ether_header_size)) {
586 
587 			context_reload = B_TRUE;
588 		}
589 	} else if (cur_context->cksum_flags != 0) {
590 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
591 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
592 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
593 		    (cur_context->cksum_start != pre_context->cksum_start) ||
594 		    (cur_context->ether_header_size !=
595 		    pre_context->ether_header_size)) {
596 
597 			context_reload = B_TRUE;
598 		}
599 	}
600 
601 	return (context_reload);
602 }
603 
604 static int
605 e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
606     context_data_t *cur_context)
607 {
608 	struct e1000g *Adapter;
609 	struct e1000_hw *hw;
610 	p_tx_sw_packet_t first_packet;
611 	p_tx_sw_packet_t packet;
612 	p_tx_sw_packet_t previous_packet;
613 	boolean_t context_reload;
614 	struct e1000_tx_desc *first_data_desc;
615 	struct e1000_tx_desc *next_desc;
616 	struct e1000_tx_desc *descriptor;
617 	struct e1000_data_desc zeroed;
618 	int desc_count;
619 	boolean_t buff_overrun_flag;
620 	int i;
621 
622 	Adapter = tx_ring->adapter;
623 	hw = &Adapter->shared;
624 
625 	desc_count = 0;
626 	first_packet = NULL;
627 	first_data_desc = NULL;
628 	descriptor = NULL;
629 	first_packet = NULL;
630 	packet = NULL;
631 	buff_overrun_flag = B_FALSE;
632 	zeroed.upper.data = 0;
633 
634 	next_desc = tx_ring->tbd_next;
635 
636 	/* Context descriptor reload check */
637 	context_reload = e1000g_check_context(tx_ring, cur_context);
638 
639 	if (context_reload) {
640 		first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
641 
642 		descriptor = next_desc;
643 
644 		e1000g_fill_context_descriptor(cur_context,
645 		    (struct e1000_context_desc *)descriptor);
646 
647 		/* Check the wrap-around case */
648 		if (descriptor == tx_ring->tbd_last)
649 			next_desc = tx_ring->tbd_first;
650 		else
651 			next_desc++;
652 
653 		desc_count++;
654 	}
655 
656 	first_data_desc = next_desc;
657 
658 	/*
659 	 * According to the documentation, the packet options field (POPTS) is
660 	 * "ignored except on the first data descriptor of a packet."  However,
661 	 * there is a bug in QEMU (638955) whereby the POPTS field within a
662 	 * given data descriptor is used to interpret that data descriptor --
663 	 * regardless of whether or not the descriptor is the first in a packet
664 	 * or not.  For a packet that spans multiple descriptors, the (virtual)
665 	 * HW checksum (either TCP/UDP or IP or both) will therefore _not_ be
666 	 * performed on descriptors after the first, resulting in incorrect
667 	 * checksums and mysteriously dropped/retransmitted packets.  Other
668 	 * drivers do not have this issue because they (harmlessly) set the
669 	 * POPTS field on every data descriptor to be the intended options for
670 	 * the entire packet.  To circumvent this QEMU bug, we engage in this
671 	 * same behavior iff the subsystem vendor and device IDs indicate that
672 	 * this is an emulated QEMU device (1af4,1100).
673 	 */
674 	if (hw->subsystem_vendor_id == 0x1af4 &&
675 	    hw->subsystem_device_id == 0x1100 &&
676 	    cur_context->cksum_flags) {
677 		if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
678 			zeroed.upper.fields.popts |= E1000_TXD_POPTS_IXSM;
679 
680 		if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
681 			zeroed.upper.fields.popts |= E1000_TXD_POPTS_TXSM;
682 	}
683 
684 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
685 	while (packet) {
686 		ASSERT(packet->num_desc);
687 
688 		for (i = 0; i < packet->num_desc; i++) {
689 			ASSERT(tx_ring->tbd_avail > 0);
690 
691 			descriptor = next_desc;
692 			descriptor->buffer_addr =
693 			    packet->desc[i].address;
694 			descriptor->lower.data =
695 			    packet->desc[i].length;
696 
697 			/* Zero out status */
698 			descriptor->upper.data = zeroed.upper.data;
699 
700 			descriptor->lower.data |=
701 			    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
702 			/* must set RS on every outgoing descriptor */
703 			descriptor->lower.data |=
704 			    E1000_TXD_CMD_RS;
705 
706 			if (cur_context->lso_flag)
707 				descriptor->lower.data |= E1000_TXD_CMD_TSE;
708 
709 			/* Check the wrap-around case */
710 			if (descriptor == tx_ring->tbd_last)
711 				next_desc = tx_ring->tbd_first;
712 			else
713 				next_desc++;
714 
715 			desc_count++;
716 
717 			/*
718 			 * workaround for 82546EB errata 33, hang in PCI-X
719 			 * systems due to 2k Buffer Overrun during Transmit
720 			 * Operation. The workaround applies to all the Intel
721 			 * PCI-X chips.
722 			 */
723 			if (hw->bus.type == e1000_bus_type_pcix &&
724 			    descriptor == first_data_desc &&
725 			    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK)
726 			    > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) {
727 				/* modified the first descriptor */
728 				descriptor->lower.data &=
729 				    ~E1000G_TBD_LENGTH_MASK;
730 				descriptor->lower.flags.length =
731 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
732 
733 				/* insert a new descriptor */
734 				ASSERT(tx_ring->tbd_avail > 0);
735 				next_desc->buffer_addr =
736 				    packet->desc[0].address +
737 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
738 				next_desc->lower.data =
739 				    packet->desc[0].length -
740 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
741 
742 				/* Zero out status */
743 				next_desc->upper.data = zeroed.upper.data;
744 
745 				next_desc->lower.data |=
746 				    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
747 				/* must set RS on every outgoing descriptor */
748 				next_desc->lower.data |=
749 				    E1000_TXD_CMD_RS;
750 
751 				if (cur_context->lso_flag)
752 					next_desc->lower.data |=
753 					    E1000_TXD_CMD_TSE;
754 
755 				descriptor = next_desc;
756 
757 				/* Check the wrap-around case */
758 				if (next_desc == tx_ring->tbd_last)
759 					next_desc = tx_ring->tbd_first;
760 				else
761 					next_desc++;
762 
763 				desc_count++;
764 				buff_overrun_flag = B_TRUE;
765 			}
766 		}
767 
768 		if (buff_overrun_flag) {
769 			packet->num_desc++;
770 			buff_overrun_flag = B_FALSE;
771 		}
772 
773 		if (first_packet != NULL) {
774 			/*
775 			 * Count the checksum context descriptor for
776 			 * the first SwPacket.
777 			 */
778 			first_packet->num_desc++;
779 			first_packet = NULL;
780 		}
781 
782 		packet->tickstamp = ddi_get_lbolt64();
783 
784 		previous_packet = packet;
785 		packet = (p_tx_sw_packet_t)
786 		    QUEUE_GET_NEXT(pending_list, &packet->Link);
787 	}
788 
789 	/*
790 	 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back
791 	 */
792 	if (Adapter->lso_premature_issue && cur_context->lso_flag &&
793 	    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) {
794 		/* modified the previous descriptor */
795 		descriptor->lower.data -= 4;
796 
797 		/* insert a new descriptor */
798 		ASSERT(tx_ring->tbd_avail > 0);
799 		/* the lower 20 bits of lower.data is the length field */
800 		next_desc->buffer_addr =
801 		    descriptor->buffer_addr +
802 		    (descriptor->lower.data & E1000G_TBD_LENGTH_MASK);
803 		next_desc->lower.data = 4;
804 
805 		/* Zero out status */
806 		next_desc->upper.data = zeroed.upper.data;
807 		/* It must be part of a LSO packet */
808 		next_desc->lower.data |=
809 		    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
810 		    E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE;
811 
812 		descriptor = next_desc;
813 
814 		/* Check the wrap-around case */
815 		if (descriptor == tx_ring->tbd_last)
816 			next_desc = tx_ring->tbd_first;
817 		else
818 			next_desc++;
819 
820 		desc_count++;
821 		/* update the number of descriptors */
822 		previous_packet->num_desc++;
823 	}
824 
825 	ASSERT(descriptor);
826 
827 	if (cur_context->cksum_flags) {
828 		if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
829 			((struct e1000_data_desc *)first_data_desc)->
830 			    upper.fields.popts |= E1000_TXD_POPTS_IXSM;
831 		if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
832 			((struct e1000_data_desc *)first_data_desc)->
833 			    upper.fields.popts |= E1000_TXD_POPTS_TXSM;
834 	}
835 
836 	/*
837 	 * Last Descriptor of Packet needs End Of Packet (EOP), Report
838 	 * Status (RS) set.
839 	 */
840 	if (Adapter->tx_intr_delay) {
841 		descriptor->lower.data |= E1000_TXD_CMD_IDE |
842 		    E1000_TXD_CMD_EOP;
843 	} else {
844 		descriptor->lower.data |= E1000_TXD_CMD_EOP;
845 	}
846 
847 	/* Set append Ethernet CRC (IFCS) bits */
848 	if (cur_context->lso_flag) {
849 		first_data_desc->lower.data |= E1000_TXD_CMD_IFCS;
850 	} else {
851 		descriptor->lower.data |= E1000_TXD_CMD_IFCS;
852 	}
853 
854 	/*
855 	 * Sync the Tx descriptors DMA buffer
856 	 */
857 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
858 	    0, 0, DDI_DMA_SYNC_FORDEV);
859 
860 	tx_ring->tbd_next = next_desc;
861 
862 	/*
863 	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
864 	 * FX1000 that this frame is available to transmit.
865 	 */
866 	if (hw->mac.type == e1000_82547)
867 		e1000g_82547_tx_move_tail(tx_ring);
868 	else
869 		E1000_WRITE_REG(hw, E1000_TDT(0),
870 		    (uint32_t)(next_desc - tx_ring->tbd_first));
871 
872 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
873 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
874 		Adapter->e1000g_state |= E1000G_ERROR;
875 	}
876 
877 	/* Put the pending SwPackets to the "Used" list */
878 	mutex_enter(&tx_ring->usedlist_lock);
879 	QUEUE_APPEND(&tx_ring->used_list, pending_list);
880 	tx_ring->tbd_avail -= desc_count;
881 	mutex_exit(&tx_ring->usedlist_lock);
882 
883 	/* update LSO related data */
884 	if (context_reload)
885 		tx_ring->pre_context = *cur_context;
886 
887 	return (desc_count);
888 }
889 
890 /*
891  * e1000g_tx_setup - setup tx data structures
892  *
893  * This routine initializes all of the transmit related
894  * structures. This includes the Transmit descriptors,
895  * and the tx_sw_packet structures.
896  */
897 void
898 e1000g_tx_setup(struct e1000g *Adapter)
899 {
900 	struct e1000_hw *hw;
901 	p_tx_sw_packet_t packet;
902 	uint32_t i;
903 	uint32_t buf_high;
904 	uint32_t buf_low;
905 	uint32_t reg_tipg;
906 	uint32_t reg_tctl;
907 	int size;
908 	e1000g_tx_ring_t *tx_ring;
909 
910 	hw = &Adapter->shared;
911 	tx_ring = Adapter->tx_ring;
912 
913 	/* init the lists */
914 	/*
915 	 * Here we don't need to protect the lists using the
916 	 * usedlist_lock and freelist_lock, for they have
917 	 * been protected by the chip_lock.
918 	 */
919 	QUEUE_INIT_LIST(&tx_ring->used_list);
920 	QUEUE_INIT_LIST(&tx_ring->free_list);
921 
922 	/* Go through and set up each SW_Packet */
923 	packet = tx_ring->packet_area;
924 	for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) {
925 		/* Initialize this tx_sw_apcket area */
926 		e1000g_free_tx_swpkt(packet);
927 		/* Add this tx_sw_packet to the free list */
928 		QUEUE_PUSH_TAIL(&tx_ring->free_list,
929 		    &packet->Link);
930 	}
931 
932 	/* Setup TX descriptor pointers */
933 	tx_ring->tbd_next = tx_ring->tbd_first;
934 	tx_ring->tbd_oldest = tx_ring->tbd_first;
935 
936 	/*
937 	 * Setup Hardware TX Registers
938 	 */
939 	/* Setup the Transmit Control Register (TCTL). */
940 	reg_tctl = E1000_READ_REG(hw, E1000_TCTL);
941 	reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN |
942 	    (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) |
943 	    (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) |
944 	    E1000_TCTL_RTLC;
945 
946 	/* Enable the MULR bit */
947 	if (hw->bus.type == e1000_bus_type_pci_express)
948 		reg_tctl |= E1000_TCTL_MULR;
949 
950 	E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl);
951 
952 	/* Setup HW Base and Length of Tx descriptor area */
953 	size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc));
954 	E1000_WRITE_REG(hw, E1000_TDLEN(0), size);
955 	size = E1000_READ_REG(hw, E1000_TDLEN(0));
956 
957 	buf_low = (uint32_t)tx_ring->tbd_dma_addr;
958 	buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32);
959 
960 	/*
961 	 * Write the highest location first and work backward to the lowest.
962 	 * This is necessary for some adapter types to
963 	 * prevent write combining from occurring.
964 	 */
965 	E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high);
966 	E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low);
967 
968 	/* Setup our HW Tx Head & Tail descriptor pointers */
969 	E1000_WRITE_REG(hw, E1000_TDH(0), 0);
970 	E1000_WRITE_REG(hw, E1000_TDT(0), 0);
971 
972 	/* Set the default values for the Tx Inter Packet Gap timer */
973 	if ((hw->mac.type == e1000_82542) &&
974 	    ((hw->revision_id == E1000_REVISION_2) ||
975 	    (hw->revision_id == E1000_REVISION_3))) {
976 		reg_tipg = DEFAULT_82542_TIPG_IPGT;
977 		reg_tipg |=
978 		    DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
979 		reg_tipg |=
980 		    DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
981 	} else if (hw->mac.type == e1000_80003es2lan) {
982 		reg_tipg = DEFAULT_82543_TIPG_IPGR1;
983 		reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
984 		    E1000_TIPG_IPGR2_SHIFT;
985 	} else {
986 		if (hw->phy.media_type == e1000_media_type_fiber)
987 			reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER;
988 		else
989 			reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER;
990 		reg_tipg |=
991 		    DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
992 		reg_tipg |=
993 		    DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
994 	}
995 	E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg);
996 
997 	/* Setup Transmit Interrupt Delay Value */
998 	E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay);
999 	E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
1000 	    "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay);
1001 
1002 	if (hw->mac.type >= e1000_82540) {
1003 		E1000_WRITE_REG(&Adapter->shared, E1000_TADV,
1004 		    Adapter->tx_intr_abs_delay);
1005 		E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
1006 		    "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay);
1007 	}
1008 
1009 	tx_ring->tbd_avail = Adapter->tx_desc_num;
1010 
1011 	/* Initialize stored context information */
1012 	bzero(&(tx_ring->pre_context), sizeof (context_data_t));
1013 }
1014 
1015 /*
1016  * e1000g_recycle - recycle the tx descriptors and tx sw packets
1017  */
1018 int
1019 e1000g_recycle(e1000g_tx_ring_t *tx_ring)
1020 {
1021 	struct e1000g *Adapter;
1022 	LIST_DESCRIBER pending_list;
1023 	p_tx_sw_packet_t packet;
1024 	mblk_t *mp;
1025 	mblk_t *nmp;
1026 	struct e1000_tx_desc *descriptor;
1027 	int desc_count;
1028 	int64_t delta;
1029 
1030 	/*
1031 	 * This function will examine each TxSwPacket in the 'used' queue
1032 	 * if the e1000g is done with it then the associated resources (Tx
1033 	 * Descriptors) will be "freed" and the TxSwPacket will be
1034 	 * returned to the 'free' queue.
1035 	 */
1036 	Adapter = tx_ring->adapter;
1037 	delta = 0;
1038 
1039 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list);
1040 	if (packet == NULL) {
1041 		Adapter->stall_flag = B_FALSE;
1042 		return (0);
1043 	}
1044 
1045 	desc_count = 0;
1046 	QUEUE_INIT_LIST(&pending_list);
1047 
1048 	/* Sync the Tx descriptor DMA buffer */
1049 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
1050 	    0, 0, DDI_DMA_SYNC_FORKERNEL);
1051 	if (e1000g_check_dma_handle(
1052 	    tx_ring->tbd_dma_handle) != DDI_FM_OK) {
1053 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
1054 		Adapter->e1000g_state |= E1000G_ERROR;
1055 		return (0);
1056 	}
1057 
1058 	/*
1059 	 * While there are still TxSwPackets in the used queue check them
1060 	 */
1061 	mutex_enter(&tx_ring->usedlist_lock);
1062 	while ((packet =
1063 	    (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
1064 
1065 		/*
1066 		 * Get hold of the next descriptor that the e1000g will
1067 		 * report status back to (this will be the last descriptor
1068 		 * of a given sw packet). We only want to free the
1069 		 * sw packet (and it resources) if the e1000g is done
1070 		 * with ALL of the descriptors.  If the e1000g is done
1071 		 * with the last one then it is done with all of them.
1072 		 */
1073 		ASSERT(packet->num_desc);
1074 		descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1);
1075 
1076 		/* Check for wrap case */
1077 		if (descriptor > tx_ring->tbd_last)
1078 			descriptor -= Adapter->tx_desc_num;
1079 
1080 		/*
1081 		 * If the descriptor done bit is set free TxSwPacket and
1082 		 * associated resources
1083 		 */
1084 		if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) {
1085 			QUEUE_POP_HEAD(&tx_ring->used_list);
1086 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
1087 
1088 			if (descriptor == tx_ring->tbd_last)
1089 				tx_ring->tbd_oldest =
1090 				    tx_ring->tbd_first;
1091 			else
1092 				tx_ring->tbd_oldest =
1093 				    descriptor + 1;
1094 
1095 			desc_count += packet->num_desc;
1096 		} else {
1097 			/*
1098 			 * Found a sw packet that the e1000g is not done
1099 			 * with then there is no reason to check the rest
1100 			 * of the queue.
1101 			 */
1102 			delta = ddi_get_lbolt64() - packet->tickstamp;
1103 			break;
1104 		}
1105 	}
1106 
1107 	tx_ring->tbd_avail += desc_count;
1108 	Adapter->tx_pkt_cnt += desc_count;
1109 
1110 	mutex_exit(&tx_ring->usedlist_lock);
1111 
1112 	if (desc_count == 0) {
1113 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_none);
1114 		/*
1115 		 * If the packet hasn't been sent out for seconds and
1116 		 * the transmitter is not under paused flowctrl condition,
1117 		 * the transmitter is considered to be stalled.
1118 		 */
1119 		if ((delta > Adapter->stall_threshold) &&
1120 		    !(E1000_READ_REG(&Adapter->shared,
1121 		    E1000_STATUS) & E1000_STATUS_TXOFF)) {
1122 			Adapter->stall_flag = B_TRUE;
1123 		}
1124 		return (0);
1125 	}
1126 
1127 	Adapter->stall_flag = B_FALSE;
1128 
1129 	mp = NULL;
1130 	nmp = NULL;
1131 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
1132 	ASSERT(packet != NULL);
1133 	while (packet != NULL) {
1134 		if (packet->mp != NULL) {
1135 			ASSERT(packet->mp->b_next == NULL);
1136 			/* Assemble the message chain */
1137 			if (mp == NULL) {
1138 				mp = packet->mp;
1139 				nmp = packet->mp;
1140 			} else {
1141 				nmp->b_next = packet->mp;
1142 				nmp = packet->mp;
1143 			}
1144 			/* Disconnect the message from the sw packet */
1145 			packet->mp = NULL;
1146 		}
1147 
1148 		/* Free the TxSwPackets */
1149 		e1000g_free_tx_swpkt(packet);
1150 
1151 		packet = (p_tx_sw_packet_t)
1152 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
1153 	}
1154 
1155 	/* Return the TxSwPackets back to the FreeList */
1156 	mutex_enter(&tx_ring->freelist_lock);
1157 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
1158 	mutex_exit(&tx_ring->freelist_lock);
1159 
1160 	if (mp != NULL)
1161 		freemsgchain(mp);
1162 
1163 	return (desc_count);
1164 }
1165 /*
1166  * 82544 Coexistence issue workaround:
1167  *    There are 2 issues.
1168  *    1. If a 32 bit split completion happens from P64H2 and another
1169  *	agent drives a 64 bit request/split completion after ONLY
1170  *	1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then
1171  *	82544 has a problem where in to clock all the data in, it
1172  *	looks at REQ64# signal and since it has changed so fast (i.e. 1
1173  *	idle clock turn around), it will fail to clock all the data in.
1174  *	Data coming from certain ending addresses has exposure to this issue.
1175  *
1176  * To detect this issue, following equation can be used...
1177  *	SIZE[3:0] + ADDR[2:0] = SUM[3:0].
1178  *	If SUM[3:0] is in between 1 to 4, we will have this issue.
1179  *
1180  * ROOT CAUSE:
1181  *	The erratum involves the 82544 PCIX elasticity FIFO implementations as
1182  *	64-bit FIFO's and flushing of the final partial-bytes corresponding
1183  *	to the end of a requested read burst. Under a specific burst condition
1184  *	of ending-data alignment and 32-byte split-completions, the final
1185  *	byte(s) of split-completion data require an extra clock cycle to flush
1186  *	into 64-bit FIFO orientation.  An incorrect logic dependency on the
1187  *	REQ64# signal occurring during this clock cycle may cause the
1188  *	residual byte(s) to be lost, thereby rendering the internal DMA client
1189  *	forever awaiting the final byte(s) for an outbound data-fetch.  The
1190  *	erratum is confirmed to *only* occur if certain subsequent external
1191  *	64-bit PCIX bus transactions occur immediately (minimum possible bus
1192  *	turn- around) following the odd-aligned 32-bit split-completion
1193  *	containing the final byte(s).  Intel has confirmed that this has been
1194  *	seen only with chipset/bridges which have the capability to provide
1195  *	32-bit split-completion data, and in the presence of newer PCIX bus
1196  *	agents which fully-optimize the inter-transaction turn-around (zero
1197  *	additional initiator latency when pre-granted bus ownership).
1198  *
1199  *   	This issue does not exist in PCI bus mode, when any agent is operating
1200  *	in 32 bit only mode or on chipsets that do not do 32 bit split
1201  *	completions for 64 bit read requests (Serverworks chipsets). P64H2 does
1202  *	32 bit split completions for any read request that has bit 2 set to 1
1203  *	for the requested address and read request size is more than 8 bytes.
1204  *
1205  *   2. Another issue is related to 82544 driving DACs under the similar
1206  *	scenario (32 bit split completion followed by 64 bit transaction with
1207  *	only 1 cycle turnaround). This issue is still being root caused. We
1208  *	think that both of these issues can be avoided if following workaround
1209  *	is implemented. It seems DAC issues is related to ending addresses being
1210  *	0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity
1211  *	FIFO which does not get flushed due to REQ64# dependency. We will only
1212  *	know the full story after it has been simulated successfully by HW team.
1213  *
1214  * WORKAROUND:
1215  *	Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC)
1216  */
1217 static uint32_t
1218 e1000g_fill_82544_desc(uint64_t address,
1219     size_t length, p_desc_array_t desc_array)
1220 {
1221 	/*
1222 	 * Since issue is sensitive to length and address.
1223 	 * Let us first check the address...
1224 	 */
1225 	uint32_t safe_terminator;
1226 
1227 	if (length <= 4) {
1228 		desc_array->descriptor[0].address = address;
1229 		desc_array->descriptor[0].length = (uint32_t)length;
1230 		desc_array->elements = 1;
1231 		return (desc_array->elements);
1232 	}
1233 	safe_terminator =
1234 	    (uint32_t)((((uint32_t)address & 0x7) +
1235 	    (length & 0xF)) & 0xF);
1236 	/*
1237 	 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then
1238 	 * return
1239 	 */
1240 	if (safe_terminator == 0 ||
1241 	    (safe_terminator > 4 && safe_terminator < 9) ||
1242 	    (safe_terminator > 0xC && safe_terminator <= 0xF)) {
1243 		desc_array->descriptor[0].address = address;
1244 		desc_array->descriptor[0].length = (uint32_t)length;
1245 		desc_array->elements = 1;
1246 		return (desc_array->elements);
1247 	}
1248 
1249 	desc_array->descriptor[0].address = address;
1250 	desc_array->descriptor[0].length = length - 4;
1251 	desc_array->descriptor[1].address = address + (length - 4);
1252 	desc_array->descriptor[1].length = 4;
1253 	desc_array->elements = 2;
1254 	return (desc_array->elements);
1255 }
1256 
1257 static int
1258 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet,
1259     mblk_t *mp, boolean_t tx_undersize_flag)
1260 {
1261 	size_t len;
1262 	size_t len1;
1263 	dma_buffer_t *tx_buf;
1264 	mblk_t *nmp;
1265 	boolean_t finished;
1266 	int desc_count;
1267 
1268 	desc_count = 0;
1269 	tx_buf = packet->tx_buf;
1270 	len = MBLKL(mp);
1271 
1272 	ASSERT((tx_buf->len + len) <= tx_buf->size);
1273 
1274 	if (len > 0) {
1275 		bcopy(mp->b_rptr,
1276 		    tx_buf->address + tx_buf->len,
1277 		    len);
1278 		tx_buf->len += len;
1279 
1280 		packet->num_mblk_frag++;
1281 	}
1282 
1283 	nmp = mp->b_cont;
1284 	if (nmp == NULL) {
1285 		finished = B_TRUE;
1286 	} else {
1287 		len1 = MBLKL(nmp);
1288 		if ((tx_buf->len + len1) > tx_buf->size)
1289 			finished = B_TRUE;
1290 		else if (tx_undersize_flag)
1291 			finished = B_FALSE;
1292 		else if (len1 > tx_ring->adapter->tx_bcopy_thresh)
1293 			finished = B_TRUE;
1294 		else
1295 			finished = B_FALSE;
1296 	}
1297 
1298 	if (finished) {
1299 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy,
1300 		    (tx_buf->len > len));
1301 
1302 		/*
1303 		 * If the packet is smaller than 64 bytes, which is the
1304 		 * minimum ethernet packet size, pad the packet to make
1305 		 * it at least 60 bytes. The hardware will add 4 bytes
1306 		 * for CRC.
1307 		 */
1308 		if (tx_undersize_flag) {
1309 			ASSERT(tx_buf->len < ETHERMIN);
1310 
1311 			bzero(tx_buf->address + tx_buf->len,
1312 			    ETHERMIN - tx_buf->len);
1313 			tx_buf->len = ETHERMIN;
1314 		}
1315 
1316 #ifdef __sparc
1317 		if (packet->dma_type == USE_DVMA)
1318 			dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV);
1319 		else
1320 			(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1321 			    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1322 #else
1323 		(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1324 		    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1325 #endif
1326 
1327 		packet->data_transfer_type = USE_BCOPY;
1328 
1329 		desc_count = e1000g_fill_tx_desc(tx_ring,
1330 		    packet,
1331 		    tx_buf->dma_address,
1332 		    tx_buf->len);
1333 
1334 		if (desc_count <= 0)
1335 			return (-1);
1336 	}
1337 
1338 	return (desc_count);
1339 }
1340 
1341 static int
1342 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp)
1343 {
1344 	int j;
1345 	int mystat;
1346 	size_t len;
1347 	ddi_dma_cookie_t dma_cookie;
1348 	uint_t ncookies;
1349 	int desc_count;
1350 	uint32_t desc_total;
1351 
1352 	desc_total = 0;
1353 	len = MBLKL(mp);
1354 
1355 	/*
1356 	 * ddi_dma_addr_bind_handle() allocates  DMA  resources  for  a
1357 	 * memory  object such that a device can perform DMA to or from
1358 	 * the object.  DMA resources  are  allocated  considering  the
1359 	 * device's  DMA  attributes  as  expressed by ddi_dma_attr(9S)
1360 	 * (see ddi_dma_alloc_handle(9F)).
1361 	 *
1362 	 * ddi_dma_addr_bind_handle() fills in  the  first  DMA  cookie
1363 	 * pointed  to by cookiep with the appropriate address, length,
1364 	 * and bus type. *ccountp is set to the number of DMA  cookies
1365 	 * representing this DMA object. Subsequent DMA cookies must be
1366 	 * retrieved by calling ddi_dma_nextcookie(9F)  the  number  of
1367 	 * times specified by *countp - 1.
1368 	 */
1369 	switch (packet->dma_type) {
1370 #ifdef __sparc
1371 	case USE_DVMA:
1372 		dvma_kaddr_load(packet->tx_dma_handle,
1373 		    (caddr_t)mp->b_rptr, len, 0, &dma_cookie);
1374 
1375 		dvma_sync(packet->tx_dma_handle, 0,
1376 		    DDI_DMA_SYNC_FORDEV);
1377 
1378 		ncookies = 1;
1379 		packet->data_transfer_type = USE_DVMA;
1380 		break;
1381 #endif
1382 	case USE_DMA:
1383 		if ((mystat = ddi_dma_addr_bind_handle(
1384 		    packet->tx_dma_handle, NULL,
1385 		    (caddr_t)mp->b_rptr, len,
1386 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1387 		    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1388 		    &ncookies)) != DDI_DMA_MAPPED) {
1389 
1390 			e1000g_log(tx_ring->adapter, CE_WARN,
1391 			    "Couldn't bind mblk buffer to Tx DMA handle: "
1392 			    "return: %X, Pkt: %X\n",
1393 			    mystat, packet);
1394 			return (-1);
1395 		}
1396 
1397 		/*
1398 		 * An implicit ddi_dma_sync() is done when the
1399 		 * ddi_dma_addr_bind_handle() is called. So we
1400 		 * don't need to explicitly call ddi_dma_sync()
1401 		 * here any more.
1402 		 */
1403 		ASSERT(ncookies);
1404 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie,
1405 		    (ncookies > 1));
1406 
1407 		/*
1408 		 * The data_transfer_type value must be set after the handle
1409 		 * has been bound, for it will be used in e1000g_free_tx_swpkt()
1410 		 * to decide whether we need to unbind the handle.
1411 		 */
1412 		packet->data_transfer_type = USE_DMA;
1413 		break;
1414 	default:
1415 		ASSERT(B_FALSE);
1416 		break;
1417 	}
1418 
1419 	packet->num_mblk_frag++;
1420 
1421 	/*
1422 	 * Each address could span thru multpile cookie..
1423 	 * Each cookie will have one descriptor
1424 	 */
1425 	for (j = ncookies; j != 0; j--) {
1426 
1427 		desc_count = e1000g_fill_tx_desc(tx_ring,
1428 		    packet,
1429 		    dma_cookie.dmac_laddress,
1430 		    dma_cookie.dmac_size);
1431 
1432 		if (desc_count <= 0)
1433 			return (-1);
1434 
1435 		desc_total += desc_count;
1436 
1437 		/*
1438 		 * ddi_dma_nextcookie() retrieves subsequent DMA
1439 		 * cookies for a DMA object.
1440 		 * ddi_dma_nextcookie() fills in the
1441 		 * ddi_dma_cookie(9S) structure pointed to by
1442 		 * cookiep.  The ddi_dma_cookie(9S) structure
1443 		 * must be allocated prior to calling
1444 		 * ddi_dma_nextcookie(). The DMA cookie count
1445 		 * returned by ddi_dma_buf_bind_handle(9F),
1446 		 * ddi_dma_addr_bind_handle(9F), or
1447 		 * ddi_dma_getwin(9F) indicates the number of DMA
1448 		 * cookies a DMA object consists of.  If the
1449 		 * resulting cookie count, N, is larger than 1,
1450 		 * ddi_dma_nextcookie() must be called N-1 times
1451 		 * to retrieve all DMA cookies.
1452 		 */
1453 		if (j > 1) {
1454 			ddi_dma_nextcookie(packet->tx_dma_handle,
1455 			    &dma_cookie);
1456 		}
1457 	}
1458 
1459 	return (desc_total);
1460 }
1461 
1462 static void
1463 e1000g_fill_context_descriptor(context_data_t *cur_context,
1464     struct e1000_context_desc *context_desc)
1465 {
1466 	if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) {
1467 		context_desc->lower_setup.ip_fields.ipcss =
1468 		    cur_context->ether_header_size;
1469 		context_desc->lower_setup.ip_fields.ipcso =
1470 		    cur_context->ether_header_size +
1471 		    offsetof(struct ip, ip_sum);
1472 		context_desc->lower_setup.ip_fields.ipcse =
1473 		    cur_context->ether_header_size +
1474 		    cur_context->cksum_start - 1;
1475 	} else
1476 		context_desc->lower_setup.ip_config = 0;
1477 
1478 	if (cur_context->cksum_flags & HCK_PARTIALCKSUM) {
1479 		/*
1480 		 * The packet with same protocol has the following
1481 		 * stuff and start offset:
1482 		 * |  Protocol  | Stuff  | Start  | Checksum
1483 		 * |		| Offset | Offset | Enable
1484 		 * | IPv4 + TCP |  0x24  |  0x14  |  Yes
1485 		 * | IPv4 + UDP |  0x1A  |  0x14  |  Yes
1486 		 * | IPv6 + TCP |  0x20  |  0x10  |  No
1487 		 * | IPv6 + UDP |  0x14  |  0x10  |  No
1488 		 */
1489 		context_desc->upper_setup.tcp_fields.tucss =
1490 		    cur_context->cksum_start + cur_context->ether_header_size;
1491 		context_desc->upper_setup.tcp_fields.tucso =
1492 		    cur_context->cksum_stuff + cur_context->ether_header_size;
1493 		context_desc->upper_setup.tcp_fields.tucse = 0;
1494 	} else
1495 		context_desc->upper_setup.tcp_config = 0;
1496 
1497 	if (cur_context->lso_flag) {
1498 		context_desc->tcp_seg_setup.fields.mss = cur_context->mss;
1499 		context_desc->tcp_seg_setup.fields.hdr_len =
1500 		    cur_context->hdr_len;
1501 		/*
1502 		 * workaround for 82546EB errata 23, status-writeback
1503 		 * reporting (RS) should not be set on context or
1504 		 * Null descriptors
1505 		 */
1506 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1507 		    | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP
1508 		    | E1000_TXD_DTYP_C | cur_context->pay_len;
1509 	} else {
1510 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1511 		    | E1000_TXD_DTYP_C;
1512 		/*
1513 		 * Zero out the options for TCP Segmentation Offload
1514 		 */
1515 		context_desc->tcp_seg_setup.data = 0;
1516 	}
1517 }
1518 
1519 static int
1520 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring,
1521     p_tx_sw_packet_t packet, uint64_t address, size_t size)
1522 {
1523 	struct e1000_hw *hw = &tx_ring->adapter->shared;
1524 	p_sw_desc_t desc;
1525 
1526 	if (hw->mac.type == e1000_82544) {
1527 		if (hw->bus.type == e1000_bus_type_pcix)
1528 			return (e1000g_tx_workaround_PCIX_82544(packet,
1529 			    address, size));
1530 
1531 		if (size > JUMBO_FRAG_LENGTH)
1532 			return (e1000g_tx_workaround_jumbo_82544(packet,
1533 			    address, size));
1534 	}
1535 
1536 	ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1537 
1538 	desc = &packet->desc[packet->num_desc];
1539 	desc->address = address;
1540 	desc->length = (uint32_t)size;
1541 
1542 	packet->num_desc++;
1543 
1544 	return (1);
1545 }
1546 
1547 static int
1548 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet,
1549     uint64_t address, size_t size)
1550 {
1551 	p_sw_desc_t desc;
1552 	int desc_count;
1553 	long size_left;
1554 	size_t len;
1555 	uint32_t counter;
1556 	uint32_t array_elements;
1557 	desc_array_t desc_array;
1558 
1559 	/*
1560 	 * Coexist Workaround for cordova: RP: 07/04/03
1561 	 *
1562 	 * RP: ERRATA: Workaround ISSUE:
1563 	 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup
1564 	 * Eachbuffer in to 8kb pieces until the
1565 	 * remainder is < 8kb
1566 	 */
1567 	size_left = size;
1568 	desc_count = 0;
1569 
1570 	while (size_left > 0) {
1571 		if (size_left > MAX_TX_BUF_SIZE)
1572 			len = MAX_TX_BUF_SIZE;
1573 		else
1574 			len = size_left;
1575 
1576 		array_elements = e1000g_fill_82544_desc(address,
1577 		    len, &desc_array);
1578 
1579 		for (counter = 0; counter < array_elements; counter++) {
1580 			ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1581 			/*
1582 			 * Put in the buffer address
1583 			 */
1584 			desc = &packet->desc[packet->num_desc];
1585 
1586 			desc->address =
1587 			    desc_array.descriptor[counter].address;
1588 			desc->length =
1589 			    desc_array.descriptor[counter].length;
1590 
1591 			packet->num_desc++;
1592 			desc_count++;
1593 		} /* for */
1594 
1595 		/*
1596 		 * Update the buffer address and length
1597 		 */
1598 		address += MAX_TX_BUF_SIZE;
1599 		size_left -= MAX_TX_BUF_SIZE;
1600 	} /* while */
1601 
1602 	return (desc_count);
1603 }
1604 
1605 static int
1606 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet,
1607     uint64_t address, size_t size)
1608 {
1609 	p_sw_desc_t desc;
1610 	int desc_count;
1611 	long size_left;
1612 	uint32_t offset;
1613 
1614 	/*
1615 	 * Workaround for Jumbo Frames on Cordova
1616 	 * PSD 06/01/2001
1617 	 */
1618 	size_left = size;
1619 	desc_count = 0;
1620 	offset = 0;
1621 	while (size_left > 0) {
1622 		ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1623 
1624 		desc = &packet->desc[packet->num_desc];
1625 
1626 		desc->address = address + offset;
1627 
1628 		if (size_left > JUMBO_FRAG_LENGTH)
1629 			desc->length = JUMBO_FRAG_LENGTH;
1630 		else
1631 			desc->length = (uint32_t)size_left;
1632 
1633 		packet->num_desc++;
1634 		desc_count++;
1635 
1636 		offset += desc->length;
1637 		size_left -= JUMBO_FRAG_LENGTH;
1638 	}
1639 
1640 	return (desc_count);
1641 }
1642 
1643 #pragma inline(e1000g_82547_tx_move_tail_work)
1644 
1645 static void
1646 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring)
1647 {
1648 	struct e1000_hw *hw;
1649 	uint16_t hw_tdt;
1650 	uint16_t sw_tdt;
1651 	struct e1000_tx_desc *tx_desc;
1652 	uint16_t length = 0;
1653 	boolean_t eop = B_FALSE;
1654 	struct e1000g *Adapter;
1655 
1656 	Adapter = tx_ring->adapter;
1657 	hw = &Adapter->shared;
1658 
1659 	hw_tdt = E1000_READ_REG(hw, E1000_TDT(0));
1660 	sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first;
1661 
1662 	while (hw_tdt != sw_tdt) {
1663 		tx_desc = &(tx_ring->tbd_first[hw_tdt]);
1664 		length += tx_desc->lower.flags.length;
1665 		eop = tx_desc->lower.data & E1000_TXD_CMD_EOP;
1666 		if (++hw_tdt == Adapter->tx_desc_num)
1667 			hw_tdt = 0;
1668 
1669 		if (eop) {
1670 			if ((Adapter->link_duplex == HALF_DUPLEX) &&
1671 			    (e1000_fifo_workaround_82547(hw, length)
1672 			    != E1000_SUCCESS)) {
1673 				if (tx_ring->timer_enable_82547) {
1674 					ASSERT(tx_ring->timer_id_82547 == 0);
1675 					tx_ring->timer_id_82547 =
1676 					    timeout(e1000g_82547_timeout,
1677 					    (void *)tx_ring,
1678 					    drv_usectohz(10000));
1679 				}
1680 				return;
1681 
1682 			} else {
1683 				E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt);
1684 				e1000_update_tx_fifo_head_82547(hw, length);
1685 				length = 0;
1686 			}
1687 		}
1688 	}
1689 }
1690 
1691 static void
1692 e1000g_82547_timeout(void *arg)
1693 {
1694 	e1000g_tx_ring_t *tx_ring;
1695 
1696 	tx_ring = (e1000g_tx_ring_t *)arg;
1697 
1698 	mutex_enter(&tx_ring->tx_lock);
1699 
1700 	tx_ring->timer_id_82547 = 0;
1701 	e1000g_82547_tx_move_tail_work(tx_ring);
1702 
1703 	mutex_exit(&tx_ring->tx_lock);
1704 }
1705 
1706 static void
1707 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring)
1708 {
1709 	timeout_id_t tid;
1710 
1711 	ASSERT(MUTEX_HELD(&tx_ring->tx_lock));
1712 
1713 	tid = tx_ring->timer_id_82547;
1714 	tx_ring->timer_id_82547 = 0;
1715 	if (tid != 0) {
1716 		tx_ring->timer_enable_82547 = B_FALSE;
1717 		mutex_exit(&tx_ring->tx_lock);
1718 
1719 		(void) untimeout(tid);
1720 
1721 		mutex_enter(&tx_ring->tx_lock);
1722 	}
1723 	tx_ring->timer_enable_82547 = B_TRUE;
1724 	e1000g_82547_tx_move_tail_work(tx_ring);
1725 }
1726