xref: /illumos-gate/usr/src/uts/common/io/e1000g/e1000g_tx.c (revision b424305435881ac456a9343be2898f1f86440f31)
1 /*
2  * This file is provided under a CDDLv1 license.  When using or
3  * redistributing this file, you may do so under this license.
4  * In redistributing this file this license must be included
5  * and no other modification of this header file is permitted.
6  *
7  * CDDL LICENSE SUMMARY
8  *
9  * Copyright(c) 1999 - 2008 Intel Corporation. All rights reserved.
10  *
11  * The contents of this file are subject to the terms of Version
12  * 1.0 of the Common Development and Distribution License (the "License").
13  *
14  * You should have received a copy of the License with this software.
15  * You can obtain a copy of the License at
16  *	http://www.opensolaris.org/os/licensing.
17  * See the License for the specific language governing permissions
18  * and limitations under the License.
19  */
20 
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * **********************************************************************
28  *									*
29  * Module Name:								*
30  *   e1000g_tx.c							*
31  *									*
32  * Abstract:								*
33  *   This file contains some routines that take care of Transmit,	*
34  *   make the hardware to send the data pointed by the packet out	*
35  *   on to the physical medium.						*
36  *									*
37  * **********************************************************************
38  */
39 
40 #include "e1000g_sw.h"
41 #include "e1000g_debug.h"
42 
43 static boolean_t e1000g_send(struct e1000g *, mblk_t *);
44 static int e1000g_tx_copy(e1000g_tx_ring_t *,
45     p_tx_sw_packet_t, mblk_t *, boolean_t);
46 static int e1000g_tx_bind(e1000g_tx_ring_t *,
47     p_tx_sw_packet_t, mblk_t *);
48 static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t);
49 static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *);
50 static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *,
51     context_data_t *);
52 static void e1000g_fill_context_descriptor(context_data_t *,
53     struct e1000_context_desc *);
54 static int e1000g_fill_tx_desc(e1000g_tx_ring_t *,
55     p_tx_sw_packet_t, uint64_t, size_t);
56 static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length,
57     p_desc_array_t desc_array);
58 static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t);
59 static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t);
60 static void e1000g_82547_timeout(void *);
61 static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *);
62 static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *);
63 
64 #ifndef E1000G_DEBUG
65 #pragma inline(e1000g_tx_copy)
66 #pragma inline(e1000g_tx_bind)
67 #pragma inline(e1000g_retrieve_context)
68 #pragma inline(e1000g_check_context)
69 #pragma inline(e1000g_fill_tx_ring)
70 #pragma inline(e1000g_fill_context_descriptor)
71 #pragma inline(e1000g_fill_tx_desc)
72 #pragma inline(e1000g_fill_82544_desc)
73 #pragma inline(e1000g_tx_workaround_PCIX_82544)
74 #pragma inline(e1000g_tx_workaround_jumbo_82544)
75 #pragma inline(e1000g_free_tx_swpkt)
76 #endif
77 
78 /*
79  * e1000g_free_tx_swpkt	- free up the tx sw packet
80  *
81  * Unbind the previously bound DMA handle for a given
82  * transmit sw packet. And reset the sw packet data.
83  */
84 void
85 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet)
86 {
87 	switch (packet->data_transfer_type) {
88 	case USE_BCOPY:
89 		packet->tx_buf->len = 0;
90 		break;
91 #ifdef __sparc
92 	case USE_DVMA:
93 		dvma_unload(packet->tx_dma_handle, 0, -1);
94 		break;
95 #endif
96 	case USE_DMA:
97 		(void) ddi_dma_unbind_handle(packet->tx_dma_handle);
98 		break;
99 	default:
100 		break;
101 	}
102 
103 	/*
104 	 * The mblk has been stripped off the sw packet
105 	 * and will be freed in a triggered soft intr.
106 	 */
107 	ASSERT(packet->mp == NULL);
108 
109 	packet->data_transfer_type = USE_NONE;
110 	packet->num_mblk_frag = 0;
111 	packet->num_desc = 0;
112 }
113 
114 mblk_t *
115 e1000g_m_tx(void *arg, mblk_t *mp)
116 {
117 	struct e1000g *Adapter = (struct e1000g *)arg;
118 	mblk_t *next;
119 
120 	rw_enter(&Adapter->chip_lock, RW_READER);
121 
122 	if ((Adapter->chip_state != E1000G_START) ||
123 	    (Adapter->link_state != LINK_STATE_UP)) {
124 		freemsgchain(mp);
125 		mp = NULL;
126 	}
127 
128 	while (mp != NULL) {
129 		next = mp->b_next;
130 		mp->b_next = NULL;
131 
132 		if (!e1000g_send(Adapter, mp)) {
133 			mp->b_next = next;
134 			break;
135 		}
136 
137 		mp = next;
138 	}
139 
140 	rw_exit(&Adapter->chip_lock);
141 	return (mp);
142 }
143 
144 /*
145  * e1000g_send -  send packets onto the wire
146  *
147  * Called from e1000g_m_tx with an mblk ready to send. this
148  * routine sets up the transmit descriptors and sends data to
149  * the wire. It also pushes the just transmitted packet to
150  * the used tx sw packet list.
151  */
152 static boolean_t
153 e1000g_send(struct e1000g *Adapter, mblk_t *mp)
154 {
155 	p_tx_sw_packet_t packet;
156 	LIST_DESCRIBER pending_list;
157 	size_t len;
158 	size_t msg_size;
159 	uint32_t frag_count;
160 	int desc_count;
161 	uint32_t desc_total;
162 	uint32_t bcopy_thresh;
163 	uint32_t hdr_frag_len;
164 	boolean_t tx_undersize_flag;
165 	mblk_t *nmp;
166 	mblk_t *tmp;
167 	mblk_t *new_mp;
168 	mblk_t *pre_mp;
169 	e1000g_tx_ring_t *tx_ring;
170 	context_data_t cur_context;
171 
172 	tx_ring = Adapter->tx_ring;
173 	bcopy_thresh = Adapter->tx_bcopy_thresh;
174 
175 	/* Get the total size and frags number of the message */
176 	tx_undersize_flag = B_FALSE;
177 	frag_count = 0;
178 	msg_size = 0;
179 	for (nmp = mp; nmp; nmp = nmp->b_cont) {
180 		frag_count++;
181 		msg_size += MBLKL(nmp);
182 	}
183 
184 	/* retrieve and compute information for context descriptor */
185 	if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) {
186 		freemsg(mp);
187 		return (B_TRUE);
188 	}
189 
190 	/*
191 	 * Make sure the packet is less than the allowed size
192 	 */
193 	if (!cur_context.lso_flag &&
194 	    (msg_size > Adapter->max_frame_size - ETHERFCSL)) {
195 		/*
196 		 * For the over size packet, we'll just drop it.
197 		 * So we return B_TRUE here.
198 		 */
199 		E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL,
200 		    "Tx packet out of bound. length = %d \n", msg_size);
201 		E1000G_STAT(tx_ring->stat_over_size);
202 		freemsg(mp);
203 		return (B_TRUE);
204 	}
205 
206 	/*
207 	 * Check and reclaim tx descriptors.
208 	 * This low water mark check should be done all the time as
209 	 * Transmit interrupt delay can produce Transmit interrupts little
210 	 * late and that may cause few problems related to reaping Tx
211 	 * Descriptors... As you may run short of them before getting any
212 	 * transmit interrupt...
213 	 */
214 	if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
215 		(void) e1000g_recycle(tx_ring);
216 		E1000G_DEBUG_STAT(tx_ring->stat_recycle);
217 
218 		if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
219 			E1000G_DEBUG_STAT(tx_ring->stat_lack_desc);
220 			goto tx_no_resource;
221 		}
222 	}
223 
224 	/*
225 	 * If the message size is less than the minimum ethernet packet size,
226 	 * we'll use bcopy to send it, and padd it to 60 bytes later.
227 	 */
228 	if (msg_size < ETHERMIN) {
229 		E1000G_DEBUG_STAT(tx_ring->stat_under_size);
230 		tx_undersize_flag = B_TRUE;
231 	}
232 
233 	/* Initialize variables */
234 	desc_count = 1;	/* The initial value should be greater than 0 */
235 	desc_total = 0;
236 	QUEUE_INIT_LIST(&pending_list);
237 
238 	/* Process each mblk fragment and fill tx descriptors */
239 	/*
240 	 * The software should guarantee LSO packet header(MAC+IP+TCP)
241 	 * to be within one descriptor. Here we reallocate and refill the
242 	 * the header if it's physical memory non-contiguous.
243 	 */
244 	if (cur_context.lso_flag) {
245 		/* find the last fragment of the header */
246 		len = MBLKL(mp);
247 		ASSERT(len > 0);
248 		nmp = mp;
249 		pre_mp = NULL;
250 		while (len < cur_context.hdr_len) {
251 			pre_mp = nmp;
252 			nmp = nmp->b_cont;
253 			len += MBLKL(nmp);
254 		}
255 		/*
256 		 * If the header and the payload are in different mblks,
257 		 * we simply force the header to be copied into pre-allocated
258 		 * page-aligned buffer.
259 		 */
260 		if (len == cur_context.hdr_len)
261 			goto adjust_threshold;
262 
263 		hdr_frag_len = cur_context.hdr_len - (len - MBLKL(nmp));
264 		/*
265 		 * There are two cases we need to reallocate a mblk for the
266 		 * last header fragment:
267 		 * 1. the header is in multiple mblks and the last fragment
268 		 * share the same mblk with the payload
269 		 * 2. the header is in a single mblk shared with the payload
270 		 * and the header is physical memory non-contiguous
271 		 */
272 		if ((nmp != mp) ||
273 		    (P2NPHASE((uintptr_t)nmp->b_rptr, Adapter->sys_page_sz)
274 		    < len)) {
275 			E1000G_DEBUG_STAT(tx_ring->stat_lso_header_fail);
276 			/*
277 			 * reallocate the mblk for the last header fragment,
278 			 * expect to bcopy into pre-allocated page-aligned
279 			 * buffer
280 			 */
281 			new_mp = allocb(hdr_frag_len, NULL);
282 			if (!new_mp)
283 				return (B_FALSE);
284 			bcopy(nmp->b_rptr, new_mp->b_rptr, hdr_frag_len);
285 			/* link the new header fragment with the other parts */
286 			new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
287 			new_mp->b_cont = nmp;
288 			if (pre_mp)
289 				pre_mp->b_cont = new_mp;
290 			nmp->b_rptr += hdr_frag_len;
291 			if (hdr_frag_len == cur_context.hdr_len)
292 				mp = new_mp;
293 			frag_count ++;
294 		}
295 adjust_threshold:
296 		/*
297 		 * adjust the bcopy threshhold to guarantee
298 		 * the header to use bcopy way
299 		 */
300 		if (bcopy_thresh < cur_context.hdr_len)
301 			bcopy_thresh = cur_context.hdr_len;
302 	}
303 
304 	packet = NULL;
305 	nmp = mp;
306 	while (nmp) {
307 		tmp = nmp->b_cont;
308 
309 		len = MBLKL(nmp);
310 		/* Check zero length mblks */
311 		if (len == 0) {
312 			E1000G_DEBUG_STAT(tx_ring->stat_empty_frags);
313 			/*
314 			 * If there're no packet buffers have been used,
315 			 * or we just completed processing a buffer, then
316 			 * skip the empty mblk fragment.
317 			 * Otherwise, there's still a pending buffer that
318 			 * needs to be processed (tx_copy).
319 			 */
320 			if (desc_count > 0) {
321 				nmp = tmp;
322 				continue;
323 			}
324 		}
325 
326 		/*
327 		 * Get a new TxSwPacket to process mblk buffers.
328 		 */
329 		if (desc_count > 0) {
330 			mutex_enter(&tx_ring->freelist_lock);
331 			packet = (p_tx_sw_packet_t)
332 			    QUEUE_POP_HEAD(&tx_ring->free_list);
333 			mutex_exit(&tx_ring->freelist_lock);
334 
335 			if (packet == NULL) {
336 				E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
337 				    "No Tx SwPacket available\n");
338 				E1000G_STAT(tx_ring->stat_no_swpkt);
339 				goto tx_send_failed;
340 			}
341 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
342 		}
343 
344 		ASSERT(packet);
345 		/*
346 		 * If the size of the fragment is less than the tx_bcopy_thresh
347 		 * we'll use bcopy; Otherwise, we'll use DMA binding.
348 		 */
349 		if ((len <= bcopy_thresh) || tx_undersize_flag) {
350 			desc_count =
351 			    e1000g_tx_copy(tx_ring, packet, nmp,
352 			    tx_undersize_flag);
353 			E1000G_DEBUG_STAT(tx_ring->stat_copy);
354 		} else {
355 			desc_count =
356 			    e1000g_tx_bind(tx_ring, packet, nmp);
357 			E1000G_DEBUG_STAT(tx_ring->stat_bind);
358 		}
359 
360 		if (desc_count > 0)
361 			desc_total += desc_count;
362 		else if (desc_count < 0)
363 			goto tx_send_failed;
364 
365 		nmp = tmp;
366 	}
367 
368 	/* Assign the message to the last sw packet */
369 	ASSERT(packet);
370 	ASSERT(packet->mp == NULL);
371 	packet->mp = mp;
372 
373 	/* Try to recycle the tx descriptors again */
374 	if (tx_ring->tbd_avail < (desc_total + 2)) {
375 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry);
376 		(void) e1000g_recycle(tx_ring);
377 	}
378 
379 	mutex_enter(&tx_ring->tx_lock);
380 
381 	/*
382 	 * If the number of available tx descriptors is not enough for transmit
383 	 * (one redundant descriptor and one hw checksum context descriptor are
384 	 * included), then return failure.
385 	 */
386 	if (tx_ring->tbd_avail < (desc_total + 2)) {
387 		E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
388 		    "No Enough Tx descriptors\n");
389 		E1000G_STAT(tx_ring->stat_no_desc);
390 		mutex_exit(&tx_ring->tx_lock);
391 		goto tx_send_failed;
392 	}
393 
394 	desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context);
395 
396 	mutex_exit(&tx_ring->tx_lock);
397 
398 	ASSERT(desc_count > 0);
399 
400 	/* Send successful */
401 	return (B_TRUE);
402 
403 tx_send_failed:
404 	/*
405 	 * Enable Transmit interrupts, so that the interrupt routine can
406 	 * call mac_tx_update() when transmit descriptors become available.
407 	 */
408 	tx_ring->resched_timestamp = ddi_get_lbolt();
409 	tx_ring->resched_needed = B_TRUE;
410 	if (!Adapter->tx_intr_enable)
411 		e1000g_mask_tx_interrupt(Adapter);
412 
413 	/* Free pending TxSwPackets */
414 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
415 	while (packet) {
416 		packet->mp = NULL;
417 		e1000g_free_tx_swpkt(packet);
418 		packet = (p_tx_sw_packet_t)
419 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
420 	}
421 
422 	/* Return pending TxSwPackets to the "Free" list */
423 	mutex_enter(&tx_ring->freelist_lock);
424 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
425 	mutex_exit(&tx_ring->freelist_lock);
426 
427 	E1000G_STAT(tx_ring->stat_send_fail);
428 
429 	/* Message will be scheduled for re-transmit */
430 	return (B_FALSE);
431 
432 tx_no_resource:
433 	/*
434 	 * Enable Transmit interrupts, so that the interrupt routine can
435 	 * call mac_tx_update() when transmit descriptors become available.
436 	 */
437 	tx_ring->resched_timestamp = ddi_get_lbolt();
438 	tx_ring->resched_needed = B_TRUE;
439 	if (!Adapter->tx_intr_enable)
440 		e1000g_mask_tx_interrupt(Adapter);
441 
442 	/* Message will be scheduled for re-transmit */
443 	return (B_FALSE);
444 }
445 
446 static boolean_t
447 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
448     size_t msg_size)
449 {
450 	uintptr_t ip_start;
451 	uintptr_t tcp_start;
452 	mblk_t *nmp;
453 	uint32_t lsoflags;
454 	uint32_t mss;
455 
456 	bzero(cur_context, sizeof (context_data_t));
457 
458 	/* first check lso information */
459 	lso_info_get(mp, &mss, &lsoflags);
460 
461 	/* retrieve checksum info */
462 	hcksum_retrieve(mp, NULL, NULL, &cur_context->cksum_start,
463 	    &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
464 	/* retrieve ethernet header size */
465 	if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid ==
466 	    htons(ETHERTYPE_VLAN))
467 		cur_context->ether_header_size =
468 		    sizeof (struct ether_vlan_header);
469 	else
470 		cur_context->ether_header_size =
471 		    sizeof (struct ether_header);
472 
473 	if (lsoflags & HW_LSO) {
474 		ASSERT(mss != 0);
475 
476 		/* free the invalid packet */
477 		if (mss == 0 ||
478 		    !((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
479 		    (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
480 			return (B_FALSE);
481 		}
482 		cur_context->mss = (uint16_t)mss;
483 		cur_context->lso_flag = B_TRUE;
484 
485 		/*
486 		 * Some fields are cleared for the hardware to fill
487 		 * in. We don't assume Ethernet header, IP header and
488 		 * TCP header are always in the same mblk fragment,
489 		 * while we assume each header is always within one
490 		 * mblk fragment and Ethernet header is always in the
491 		 * first mblk fragment.
492 		 */
493 		nmp = mp;
494 		ip_start = (uintptr_t)(nmp->b_rptr)
495 		    + cur_context->ether_header_size;
496 		if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
497 			ip_start = (uintptr_t)nmp->b_cont->b_rptr
498 			    + (ip_start - (uintptr_t)(nmp->b_wptr));
499 			nmp = nmp->b_cont;
500 		}
501 		tcp_start = ip_start +
502 		    IPH_HDR_LENGTH((ipha_t *)ip_start);
503 		if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
504 			tcp_start = (uintptr_t)nmp->b_cont->b_rptr
505 			    + (tcp_start - (uintptr_t)(nmp->b_wptr));
506 			nmp = nmp->b_cont;
507 		}
508 		cur_context->hdr_len = cur_context->ether_header_size
509 		    + IPH_HDR_LENGTH((ipha_t *)ip_start)
510 		    + TCP_HDR_LENGTH((tcph_t *)tcp_start);
511 		((ipha_t *)ip_start)->ipha_length = 0;
512 		((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
513 		/* calculate the TCP packet payload length */
514 		cur_context->pay_len = msg_size - cur_context->hdr_len;
515 	}
516 	return (B_TRUE);
517 }
518 
519 static boolean_t
520 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context)
521 {
522 	boolean_t context_reload;
523 	context_data_t *pre_context;
524 	struct e1000g *Adapter;
525 
526 	context_reload = B_FALSE;
527 	pre_context = &tx_ring->pre_context;
528 	Adapter = tx_ring->adapter;
529 
530 	/*
531 	 * The following code determine if the context descriptor is
532 	 * needed to be reloaded. The sequence of the conditions is
533 	 * made by their possibilities of changing.
534 	 */
535 	/*
536 	 * workaround for 82546EB, context descriptor must be reloaded
537 	 * per LSO/hw_cksum packet if LSO is enabled.
538 	 */
539 	if (Adapter->lso_premature_issue &&
540 	    Adapter->lso_enable &&
541 	    (cur_context->cksum_flags != 0)) {
542 
543 		context_reload = B_TRUE;
544 	} else if (cur_context->lso_flag) {
545 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
546 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
547 		    (cur_context->pay_len != pre_context->pay_len) ||
548 		    (cur_context->mss != pre_context->mss) ||
549 		    (cur_context->hdr_len != pre_context->hdr_len) ||
550 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
551 		    (cur_context->cksum_start != pre_context->cksum_start) ||
552 		    (cur_context->ether_header_size !=
553 		    pre_context->ether_header_size)) {
554 
555 			context_reload = B_TRUE;
556 		}
557 	} else if (cur_context->cksum_flags != 0) {
558 		if ((cur_context->lso_flag != pre_context->lso_flag) ||
559 		    (cur_context->cksum_flags != pre_context->cksum_flags) ||
560 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
561 		    (cur_context->cksum_start != pre_context->cksum_start) ||
562 		    (cur_context->ether_header_size !=
563 		    pre_context->ether_header_size)) {
564 
565 			context_reload = B_TRUE;
566 		}
567 	}
568 
569 	return (context_reload);
570 }
571 
572 static int
573 e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
574     context_data_t *cur_context)
575 {
576 	struct e1000g *Adapter;
577 	struct e1000_hw *hw;
578 	p_tx_sw_packet_t first_packet;
579 	p_tx_sw_packet_t packet;
580 	p_tx_sw_packet_t previous_packet;
581 	boolean_t context_reload;
582 	struct e1000_tx_desc *first_data_desc;
583 	struct e1000_tx_desc *next_desc;
584 	struct e1000_tx_desc *descriptor;
585 	int desc_count;
586 	boolean_t buff_overrun_flag;
587 	int i;
588 
589 	Adapter = tx_ring->adapter;
590 	hw = &Adapter->shared;
591 
592 	desc_count = 0;
593 	first_packet = NULL;
594 	first_data_desc = NULL;
595 	descriptor = NULL;
596 	first_packet = NULL;
597 	packet = NULL;
598 	buff_overrun_flag = B_FALSE;
599 
600 	next_desc = tx_ring->tbd_next;
601 
602 	/* Context descriptor reload check */
603 	context_reload = e1000g_check_context(tx_ring, cur_context);
604 
605 	if (context_reload) {
606 		first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
607 
608 		descriptor = next_desc;
609 
610 		e1000g_fill_context_descriptor(cur_context,
611 		    (struct e1000_context_desc *)descriptor);
612 
613 		/* Check the wrap-around case */
614 		if (descriptor == tx_ring->tbd_last)
615 			next_desc = tx_ring->tbd_first;
616 		else
617 			next_desc++;
618 
619 		desc_count++;
620 	}
621 
622 	first_data_desc = next_desc;
623 
624 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
625 	while (packet) {
626 		ASSERT(packet->num_desc);
627 
628 		for (i = 0; i < packet->num_desc; i++) {
629 			ASSERT(tx_ring->tbd_avail > 0);
630 
631 			descriptor = next_desc;
632 			descriptor->buffer_addr =
633 			    packet->desc[i].address;
634 			descriptor->lower.data =
635 			    packet->desc[i].length;
636 
637 			/* Zero out status */
638 			descriptor->upper.data = 0;
639 
640 			descriptor->lower.data |=
641 			    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
642 			/* must set RS on every outgoing descriptor */
643 			descriptor->lower.data |=
644 			    E1000_TXD_CMD_RS;
645 
646 			if (cur_context->lso_flag)
647 				descriptor->lower.data |= E1000_TXD_CMD_TSE;
648 
649 			/* Check the wrap-around case */
650 			if (descriptor == tx_ring->tbd_last)
651 				next_desc = tx_ring->tbd_first;
652 			else
653 				next_desc++;
654 
655 			desc_count++;
656 
657 			/*
658 			 * workaround for 82546EB errata 33, hang in PCI-X
659 			 * systems due to 2k Buffer Overrun during Transmit
660 			 * Operation. The workaround applies to all the Intel
661 			 * PCI-X chips.
662 			 */
663 			if (hw->bus.type == e1000_bus_type_pcix &&
664 			    descriptor == first_data_desc &&
665 			    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK)
666 			    > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) {
667 				/* modified the first descriptor */
668 				descriptor->lower.data &=
669 				    ~E1000G_TBD_LENGTH_MASK;
670 				descriptor->lower.flags.length =
671 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
672 
673 				/* insert a new descriptor */
674 				ASSERT(tx_ring->tbd_avail > 0);
675 				next_desc->buffer_addr =
676 				    packet->desc[0].address +
677 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
678 				next_desc->lower.data =
679 				    packet->desc[0].length -
680 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
681 
682 				/* Zero out status */
683 				next_desc->upper.data = 0;
684 
685 				next_desc->lower.data |=
686 				    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
687 				/* must set RS on every outgoing descriptor */
688 				next_desc->lower.data |=
689 				    E1000_TXD_CMD_RS;
690 
691 				if (cur_context->lso_flag)
692 					next_desc->lower.data |=
693 					    E1000_TXD_CMD_TSE;
694 
695 				descriptor = next_desc;
696 
697 				/* Check the wrap-around case */
698 				if (next_desc == tx_ring->tbd_last)
699 					next_desc = tx_ring->tbd_first;
700 				else
701 					next_desc++;
702 
703 				desc_count++;
704 				buff_overrun_flag = B_TRUE;
705 			}
706 		}
707 
708 		if (buff_overrun_flag) {
709 			packet->num_desc++;
710 			buff_overrun_flag = B_FALSE;
711 		}
712 
713 		if (first_packet != NULL) {
714 			/*
715 			 * Count the checksum context descriptor for
716 			 * the first SwPacket.
717 			 */
718 			first_packet->num_desc++;
719 			first_packet = NULL;
720 		}
721 
722 		previous_packet = packet;
723 		packet = (p_tx_sw_packet_t)
724 		    QUEUE_GET_NEXT(pending_list, &packet->Link);
725 	}
726 
727 	/*
728 	 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back
729 	 */
730 	if (Adapter->lso_premature_issue && cur_context->lso_flag &&
731 	    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) {
732 		/* modified the previous descriptor */
733 		descriptor->lower.data -= 4;
734 
735 		/* insert a new descriptor */
736 		ASSERT(tx_ring->tbd_avail > 0);
737 		/* the lower 20 bits of lower.data is the length field */
738 		next_desc->buffer_addr =
739 		    descriptor->buffer_addr +
740 		    (descriptor->lower.data & E1000G_TBD_LENGTH_MASK);
741 		next_desc->lower.data = 4;
742 
743 		/* Zero out status */
744 		next_desc->upper.data = 0;
745 		/* It must be part of a LSO packet */
746 		next_desc->lower.data |=
747 		    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
748 		    E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE;
749 
750 		descriptor = next_desc;
751 
752 		/* Check the wrap-around case */
753 		if (descriptor == tx_ring->tbd_last)
754 			next_desc = tx_ring->tbd_first;
755 		else
756 			next_desc++;
757 
758 		desc_count++;
759 		/* update the number of descriptors */
760 		previous_packet->num_desc++;
761 	}
762 
763 	ASSERT(descriptor);
764 
765 	if (cur_context->cksum_flags) {
766 		if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
767 			((struct e1000_data_desc *)first_data_desc)->
768 			    upper.fields.popts |= E1000_TXD_POPTS_IXSM;
769 		if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
770 			((struct e1000_data_desc *)first_data_desc)->
771 			    upper.fields.popts |= E1000_TXD_POPTS_TXSM;
772 	}
773 
774 	/*
775 	 * Last Descriptor of Packet needs End Of Packet (EOP), Report
776 	 * Status (RS) set.
777 	 */
778 	if (Adapter->tx_intr_delay) {
779 		descriptor->lower.data |= E1000_TXD_CMD_IDE |
780 		    E1000_TXD_CMD_EOP;
781 	} else {
782 		descriptor->lower.data |= E1000_TXD_CMD_EOP;
783 	}
784 
785 	/* Set append Ethernet CRC (IFCS) bits */
786 	if (cur_context->lso_flag) {
787 		first_data_desc->lower.data |= E1000_TXD_CMD_IFCS;
788 	} else {
789 		descriptor->lower.data |= E1000_TXD_CMD_IFCS;
790 	}
791 
792 	/*
793 	 * Sync the Tx descriptors DMA buffer
794 	 */
795 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
796 	    0, 0, DDI_DMA_SYNC_FORDEV);
797 
798 	tx_ring->tbd_next = next_desc;
799 
800 	/*
801 	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
802 	 * FX1000 that this frame is available to transmit.
803 	 */
804 	if (hw->mac.type == e1000_82547)
805 		e1000g_82547_tx_move_tail(tx_ring);
806 	else
807 		E1000_WRITE_REG(hw, E1000_TDT(0),
808 		    (uint32_t)(next_desc - tx_ring->tbd_first));
809 
810 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
811 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
812 		Adapter->chip_state = E1000G_ERROR;
813 	}
814 
815 	/* Put the pending SwPackets to the "Used" list */
816 	mutex_enter(&tx_ring->usedlist_lock);
817 	QUEUE_APPEND(&tx_ring->used_list, pending_list);
818 	tx_ring->tbd_avail -= desc_count;
819 	mutex_exit(&tx_ring->usedlist_lock);
820 
821 	/* update LSO related data */
822 	if (context_reload)
823 		tx_ring->pre_context = *cur_context;
824 
825 	return (desc_count);
826 }
827 
828 /*
829  * e1000g_tx_setup - setup tx data structures
830  *
831  * This routine initializes all of the transmit related
832  * structures. This includes the Transmit descriptors,
833  * and the tx_sw_packet structures.
834  */
835 void
836 e1000g_tx_setup(struct e1000g *Adapter)
837 {
838 	struct e1000_hw *hw;
839 	p_tx_sw_packet_t packet;
840 	uint32_t i;
841 	uint32_t buf_high;
842 	uint32_t buf_low;
843 	uint32_t reg_tipg;
844 	uint32_t reg_tctl;
845 	int size;
846 	e1000g_tx_ring_t *tx_ring;
847 
848 	hw = &Adapter->shared;
849 	tx_ring = Adapter->tx_ring;
850 
851 	/* init the lists */
852 	/*
853 	 * Here we don't need to protect the lists using the
854 	 * usedlist_lock and freelist_lock, for they have
855 	 * been protected by the chip_lock.
856 	 */
857 	QUEUE_INIT_LIST(&tx_ring->used_list);
858 	QUEUE_INIT_LIST(&tx_ring->free_list);
859 
860 	/* Go through and set up each SW_Packet */
861 	packet = tx_ring->packet_area;
862 	for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) {
863 		/* Initialize this tx_sw_apcket area */
864 		e1000g_free_tx_swpkt(packet);
865 		/* Add this tx_sw_packet to the free list */
866 		QUEUE_PUSH_TAIL(&tx_ring->free_list,
867 		    &packet->Link);
868 	}
869 
870 	/* Setup TX descriptor pointers */
871 	tx_ring->tbd_next = tx_ring->tbd_first;
872 	tx_ring->tbd_oldest = tx_ring->tbd_first;
873 
874 	/*
875 	 * Setup Hardware TX Registers
876 	 */
877 	/* Setup the Transmit Control Register (TCTL). */
878 	reg_tctl = E1000_READ_REG(hw, E1000_TCTL);
879 	reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN |
880 	    (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) |
881 	    (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) |
882 	    E1000_TCTL_RTLC;
883 
884 	/* Enable the MULR bit */
885 	if (hw->bus.type == e1000_bus_type_pci_express)
886 		reg_tctl |= E1000_TCTL_MULR;
887 
888 	E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl);
889 
890 	/* Setup HW Base and Length of Tx descriptor area */
891 	size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc));
892 	E1000_WRITE_REG(hw, E1000_TDLEN(0), size);
893 	size = E1000_READ_REG(hw, E1000_TDLEN(0));
894 
895 	buf_low = (uint32_t)tx_ring->tbd_dma_addr;
896 	buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32);
897 
898 	/*
899 	 * Write the highest location first and work backward to the lowest.
900 	 * This is necessary for some adapter types to
901 	 * prevent write combining from occurring.
902 	 */
903 	E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high);
904 	E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low);
905 
906 	/* Setup our HW Tx Head & Tail descriptor pointers */
907 	E1000_WRITE_REG(hw, E1000_TDH(0), 0);
908 	E1000_WRITE_REG(hw, E1000_TDT(0), 0);
909 
910 	/* Set the default values for the Tx Inter Packet Gap timer */
911 	if ((hw->mac.type == e1000_82542) &&
912 	    ((hw->revision_id == E1000_REVISION_2) ||
913 	    (hw->revision_id == E1000_REVISION_3))) {
914 		reg_tipg = DEFAULT_82542_TIPG_IPGT;
915 		reg_tipg |=
916 		    DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
917 		reg_tipg |=
918 		    DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
919 	} else if (hw->mac.type == e1000_80003es2lan) {
920 		reg_tipg = DEFAULT_82543_TIPG_IPGR1;
921 		reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
922 		    E1000_TIPG_IPGR2_SHIFT;
923 	} else {
924 		if (hw->phy.media_type == e1000_media_type_fiber)
925 			reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER;
926 		else
927 			reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER;
928 		reg_tipg |=
929 		    DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
930 		reg_tipg |=
931 		    DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
932 	}
933 	E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg);
934 
935 	/* Setup Transmit Interrupt Delay Value */
936 	E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay);
937 	E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
938 	    "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay);
939 
940 	if (hw->mac.type >= e1000_82540) {
941 		E1000_WRITE_REG(&Adapter->shared, E1000_TADV,
942 		    Adapter->tx_intr_abs_delay);
943 		E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
944 		    "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay);
945 	}
946 
947 	tx_ring->tbd_avail = Adapter->tx_desc_num;
948 
949 	/* Initialize stored context information */
950 	bzero(&(tx_ring->pre_context), sizeof (context_data_t));
951 }
952 
953 /*
954  * e1000g_recycle - recycle the tx descriptors and tx sw packets
955  */
956 int
957 e1000g_recycle(e1000g_tx_ring_t *tx_ring)
958 {
959 	struct e1000g *Adapter;
960 	LIST_DESCRIBER pending_list;
961 	p_tx_sw_packet_t packet;
962 	mblk_t *mp;
963 	mblk_t *nmp;
964 	struct e1000_tx_desc *descriptor;
965 	int desc_count;
966 
967 	/*
968 	 * This function will examine each TxSwPacket in the 'used' queue
969 	 * if the e1000g is done with it then the associated resources (Tx
970 	 * Descriptors) will be "freed" and the TxSwPacket will be
971 	 * returned to the 'free' queue.
972 	 */
973 	Adapter = tx_ring->adapter;
974 
975 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list);
976 	if (packet == NULL) {
977 		tx_ring->recycle_fail = 0;
978 		tx_ring->stall_watchdog = 0;
979 		return (0);
980 	}
981 
982 	desc_count = 0;
983 	QUEUE_INIT_LIST(&pending_list);
984 
985 	/* Sync the Tx descriptor DMA buffer */
986 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
987 	    0, 0, DDI_DMA_SYNC_FORKERNEL);
988 	if (e1000g_check_dma_handle(
989 	    tx_ring->tbd_dma_handle) != DDI_FM_OK) {
990 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
991 		Adapter->chip_state = E1000G_ERROR;
992 		return (0);
993 	}
994 
995 	/*
996 	 * While there are still TxSwPackets in the used queue check them
997 	 */
998 	mutex_enter(&tx_ring->usedlist_lock);
999 	while ((packet =
1000 	    (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
1001 
1002 		/*
1003 		 * Get hold of the next descriptor that the e1000g will
1004 		 * report status back to (this will be the last descriptor
1005 		 * of a given sw packet). We only want to free the
1006 		 * sw packet (and it resources) if the e1000g is done
1007 		 * with ALL of the descriptors.  If the e1000g is done
1008 		 * with the last one then it is done with all of them.
1009 		 */
1010 		ASSERT(packet->num_desc);
1011 		descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1);
1012 
1013 		/* Check for wrap case */
1014 		if (descriptor > tx_ring->tbd_last)
1015 			descriptor -= Adapter->tx_desc_num;
1016 
1017 		/*
1018 		 * If the descriptor done bit is set free TxSwPacket and
1019 		 * associated resources
1020 		 */
1021 		if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) {
1022 			QUEUE_POP_HEAD(&tx_ring->used_list);
1023 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
1024 
1025 			if (descriptor == tx_ring->tbd_last)
1026 				tx_ring->tbd_oldest =
1027 				    tx_ring->tbd_first;
1028 			else
1029 				tx_ring->tbd_oldest =
1030 				    descriptor + 1;
1031 
1032 			desc_count += packet->num_desc;
1033 		} else {
1034 			/*
1035 			 * Found a sw packet that the e1000g is not done
1036 			 * with then there is no reason to check the rest
1037 			 * of the queue.
1038 			 */
1039 			break;
1040 		}
1041 	}
1042 
1043 	tx_ring->tbd_avail += desc_count;
1044 	Adapter->tx_pkt_cnt += desc_count;
1045 
1046 	mutex_exit(&tx_ring->usedlist_lock);
1047 
1048 	if (desc_count == 0) {
1049 		tx_ring->recycle_fail++;
1050 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_none);
1051 		return (0);
1052 	}
1053 
1054 	tx_ring->recycle_fail = 0;
1055 	tx_ring->stall_watchdog = 0;
1056 
1057 	mp = NULL;
1058 	nmp = NULL;
1059 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
1060 	ASSERT(packet != NULL);
1061 	while (packet != NULL) {
1062 		if (packet->mp != NULL) {
1063 			ASSERT(packet->mp->b_next == NULL);
1064 			/* Assemble the message chain */
1065 			if (mp == NULL) {
1066 				mp = packet->mp;
1067 				nmp = packet->mp;
1068 			} else {
1069 				nmp->b_next = packet->mp;
1070 				nmp = packet->mp;
1071 			}
1072 			/* Disconnect the message from the sw packet */
1073 			packet->mp = NULL;
1074 		}
1075 
1076 		/* Free the TxSwPackets */
1077 		e1000g_free_tx_swpkt(packet);
1078 
1079 		packet = (p_tx_sw_packet_t)
1080 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
1081 	}
1082 
1083 	/* Return the TxSwPackets back to the FreeList */
1084 	mutex_enter(&tx_ring->freelist_lock);
1085 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
1086 	mutex_exit(&tx_ring->freelist_lock);
1087 
1088 	if (mp != NULL)
1089 		freemsgchain(mp);
1090 
1091 	return (desc_count);
1092 }
1093 /*
1094  * 82544 Coexistence issue workaround:
1095  *    There are 2 issues.
1096  *    1. If a 32 bit split completion happens from P64H2 and another
1097  *	agent drives a 64 bit request/split completion after ONLY
1098  *	1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then
1099  *	82544 has a problem where in to clock all the data in, it
1100  *	looks at REQ64# signal and since it has changed so fast (i.e. 1
1101  *	idle clock turn around), it will fail to clock all the data in.
1102  *	Data coming from certain ending addresses has exposure to this issue.
1103  *
1104  * To detect this issue, following equation can be used...
1105  *	SIZE[3:0] + ADDR[2:0] = SUM[3:0].
1106  *	If SUM[3:0] is in between 1 to 4, we will have this issue.
1107  *
1108  * ROOT CAUSE:
1109  *	The erratum involves the 82544 PCIX elasticity FIFO implementations as
1110  *	64-bit FIFO's and flushing of the final partial-bytes corresponding
1111  *	to the end of a requested read burst. Under a specific burst condition
1112  *	of ending-data alignment and 32-byte split-completions, the final
1113  *	byte(s) of split-completion data require an extra clock cycle to flush
1114  *	into 64-bit FIFO orientation.  An incorrect logic dependency on the
1115  *	REQ64# signal occurring during this clock cycle may cause the
1116  *	residual byte(s) to be lost, thereby rendering the internal DMA client
1117  *	forever awaiting the final byte(s) for an outbound data-fetch.  The
1118  *	erratum is confirmed to *only* occur if certain subsequent external
1119  *	64-bit PCIX bus transactions occur immediately (minimum possible bus
1120  *	turn-around) following the odd-aligned 32-bit split-completion
1121  *	containing the final byte(s).  Intel has confirmed that this has been
1122  *	seen only with chipset/bridges which have the capability to provide
1123  *	32-bit split-completion data, and in the presence of newer PCIX bus
1124  *	agents which fully-optimize the inter-transaction turn-around (zero
1125  *	additional initiator latency when pre-granted bus ownership).
1126  *
1127  *   	This issue does not exist in PCI bus mode, when any agent is operating
1128  *	in 32 bit only mode or on chipsets that do not do 32 bit split
1129  *	completions for 64 bit read requests (Serverworks chipsets). P64H2 does
1130  *	32 bit split completions for any read request that has bit 2 set to 1
1131  *	for the requested address and read request size is more than 8 bytes.
1132  *
1133  *   2. Another issue is related to 82544 driving DACs under the similar
1134  *	scenario (32 bit split completion followed by 64 bit transaction with
1135  *	only 1 cycle turnaround). This issue is still being root caused. We
1136  *	think that both of these issues can be avoided if following workaround
1137  *	is implemented. It seems DAC issues is related to ending addresses being
1138  *	0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity
1139  *	FIFO which does not get flushed due to REQ64# dependency. We will only
1140  *	know the full story after it has been simulated successfully by HW team.
1141  *
1142  * WORKAROUND:
1143  *	Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC)
1144  */
1145 static uint32_t
1146 e1000g_fill_82544_desc(uint64_t address,
1147     size_t length, p_desc_array_t desc_array)
1148 {
1149 	/*
1150 	 * Since issue is sensitive to length and address.
1151 	 * Let us first check the address...
1152 	 */
1153 	uint32_t safe_terminator;
1154 
1155 	if (length <= 4) {
1156 		desc_array->descriptor[0].address = address;
1157 		desc_array->descriptor[0].length = (uint32_t)length;
1158 		desc_array->elements = 1;
1159 		return (desc_array->elements);
1160 	}
1161 	safe_terminator =
1162 	    (uint32_t)((((uint32_t)address & 0x7) +
1163 	    (length & 0xF)) & 0xF);
1164 	/*
1165 	 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then
1166 	 * return
1167 	 */
1168 	if (safe_terminator == 0 ||
1169 	    (safe_terminator > 4 && safe_terminator < 9) ||
1170 	    (safe_terminator > 0xC && safe_terminator <= 0xF)) {
1171 		desc_array->descriptor[0].address = address;
1172 		desc_array->descriptor[0].length = (uint32_t)length;
1173 		desc_array->elements = 1;
1174 		return (desc_array->elements);
1175 	}
1176 
1177 	desc_array->descriptor[0].address = address;
1178 	desc_array->descriptor[0].length = length - 4;
1179 	desc_array->descriptor[1].address = address + (length - 4);
1180 	desc_array->descriptor[1].length = 4;
1181 	desc_array->elements = 2;
1182 	return (desc_array->elements);
1183 }
1184 
1185 static int
1186 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet,
1187     mblk_t *mp, boolean_t tx_undersize_flag)
1188 {
1189 	size_t len;
1190 	size_t len1;
1191 	dma_buffer_t *tx_buf;
1192 	mblk_t *nmp;
1193 	boolean_t finished;
1194 	int desc_count;
1195 
1196 	desc_count = 0;
1197 	tx_buf = packet->tx_buf;
1198 	len = MBLKL(mp);
1199 
1200 	ASSERT((tx_buf->len + len) <= tx_buf->size);
1201 
1202 	if (len > 0) {
1203 		bcopy(mp->b_rptr,
1204 		    tx_buf->address + tx_buf->len,
1205 		    len);
1206 		tx_buf->len += len;
1207 
1208 		packet->num_mblk_frag++;
1209 	}
1210 
1211 	nmp = mp->b_cont;
1212 	if (nmp == NULL) {
1213 		finished = B_TRUE;
1214 	} else {
1215 		len1 = MBLKL(nmp);
1216 		if ((tx_buf->len + len1) > tx_buf->size)
1217 			finished = B_TRUE;
1218 		else if (tx_undersize_flag)
1219 			finished = B_FALSE;
1220 		else if (len1 > tx_ring->adapter->tx_bcopy_thresh)
1221 			finished = B_TRUE;
1222 		else
1223 			finished = B_FALSE;
1224 	}
1225 
1226 	if (finished) {
1227 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy,
1228 		    (tx_buf->len > len));
1229 
1230 		/*
1231 		 * If the packet is smaller than 64 bytes, which is the
1232 		 * minimum ethernet packet size, pad the packet to make
1233 		 * it at least 60 bytes. The hardware will add 4 bytes
1234 		 * for CRC.
1235 		 */
1236 		if (tx_undersize_flag) {
1237 			ASSERT(tx_buf->len < ETHERMIN);
1238 
1239 			bzero(tx_buf->address + tx_buf->len,
1240 			    ETHERMIN - tx_buf->len);
1241 			tx_buf->len = ETHERMIN;
1242 		}
1243 
1244 #ifdef __sparc
1245 		if (packet->dma_type == USE_DVMA)
1246 			dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV);
1247 		else
1248 			(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1249 			    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1250 #else
1251 		(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1252 		    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1253 #endif
1254 
1255 		packet->data_transfer_type = USE_BCOPY;
1256 
1257 		desc_count = e1000g_fill_tx_desc(tx_ring,
1258 		    packet,
1259 		    tx_buf->dma_address,
1260 		    tx_buf->len);
1261 
1262 		if (desc_count <= 0)
1263 			return (-1);
1264 	}
1265 
1266 	return (desc_count);
1267 }
1268 
1269 static int
1270 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp)
1271 {
1272 	int j;
1273 	int mystat;
1274 	size_t len;
1275 	ddi_dma_cookie_t dma_cookie;
1276 	uint_t ncookies;
1277 	int desc_count;
1278 	uint32_t desc_total;
1279 
1280 	desc_total = 0;
1281 	len = MBLKL(mp);
1282 
1283 	/*
1284 	 * ddi_dma_addr_bind_handle() allocates  DMA  resources  for  a
1285 	 * memory  object such that a device can perform DMA to or from
1286 	 * the object.  DMA resources  are  allocated  considering  the
1287 	 * device's  DMA  attributes  as  expressed by ddi_dma_attr(9S)
1288 	 * (see ddi_dma_alloc_handle(9F)).
1289 	 *
1290 	 * ddi_dma_addr_bind_handle() fills in  the  first  DMA  cookie
1291 	 * pointed  to by cookiep with the appropriate address, length,
1292 	 * and bus type. *ccountp is set to the number of DMA  cookies
1293 	 * representing this DMA object. Subsequent DMA cookies must be
1294 	 * retrieved by calling ddi_dma_nextcookie(9F)  the  number  of
1295 	 * times specified by *countp - 1.
1296 	 */
1297 	switch (packet->dma_type) {
1298 #ifdef __sparc
1299 	case USE_DVMA:
1300 		dvma_kaddr_load(packet->tx_dma_handle,
1301 		    (caddr_t)mp->b_rptr, len, 0, &dma_cookie);
1302 
1303 		dvma_sync(packet->tx_dma_handle, 0,
1304 		    DDI_DMA_SYNC_FORDEV);
1305 
1306 		ncookies = 1;
1307 		packet->data_transfer_type = USE_DVMA;
1308 		break;
1309 #endif
1310 	case USE_DMA:
1311 		if ((mystat = ddi_dma_addr_bind_handle(
1312 		    packet->tx_dma_handle, NULL,
1313 		    (caddr_t)mp->b_rptr, len,
1314 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1315 		    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1316 		    &ncookies)) != DDI_DMA_MAPPED) {
1317 
1318 			e1000g_log(tx_ring->adapter, CE_WARN,
1319 			    "Couldn't bind mblk buffer to Tx DMA handle: "
1320 			    "return: %X, Pkt: %X\n",
1321 			    mystat, packet);
1322 			return (-1);
1323 		}
1324 
1325 		/*
1326 		 * An implicit ddi_dma_sync() is done when the
1327 		 * ddi_dma_addr_bind_handle() is called. So we
1328 		 * don't need to explicitly call ddi_dma_sync()
1329 		 * here any more.
1330 		 */
1331 		ASSERT(ncookies);
1332 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie,
1333 		    (ncookies > 1));
1334 
1335 		/*
1336 		 * The data_transfer_type value must be set after the handle
1337 		 * has been bound, for it will be used in e1000g_free_tx_swpkt()
1338 		 * to decide whether we need to unbind the handle.
1339 		 */
1340 		packet->data_transfer_type = USE_DMA;
1341 		break;
1342 	default:
1343 		ASSERT(B_FALSE);
1344 		break;
1345 	}
1346 
1347 	packet->num_mblk_frag++;
1348 
1349 	/*
1350 	 * Each address could span thru multpile cookie..
1351 	 * Each cookie will have one descriptor
1352 	 */
1353 	for (j = ncookies; j != 0; j--) {
1354 
1355 		desc_count = e1000g_fill_tx_desc(tx_ring,
1356 		    packet,
1357 		    dma_cookie.dmac_laddress,
1358 		    dma_cookie.dmac_size);
1359 
1360 		if (desc_count <= 0)
1361 			return (-1);
1362 
1363 		desc_total += desc_count;
1364 
1365 		/*
1366 		 * ddi_dma_nextcookie() retrieves subsequent DMA
1367 		 * cookies for a DMA object.
1368 		 * ddi_dma_nextcookie() fills in the
1369 		 * ddi_dma_cookie(9S) structure pointed to by
1370 		 * cookiep.  The ddi_dma_cookie(9S) structure
1371 		 * must be allocated prior to calling
1372 		 * ddi_dma_nextcookie(). The DMA cookie count
1373 		 * returned by ddi_dma_buf_bind_handle(9F),
1374 		 * ddi_dma_addr_bind_handle(9F), or
1375 		 * ddi_dma_getwin(9F) indicates the number of DMA
1376 		 * cookies a DMA object consists of.  If the
1377 		 * resulting cookie count, N, is larger than 1,
1378 		 * ddi_dma_nextcookie() must be called N-1 times
1379 		 * to retrieve all DMA cookies.
1380 		 */
1381 		if (j > 1) {
1382 			ddi_dma_nextcookie(packet->tx_dma_handle,
1383 			    &dma_cookie);
1384 		}
1385 	}
1386 
1387 	return (desc_total);
1388 }
1389 
1390 static void
1391 e1000g_fill_context_descriptor(context_data_t *cur_context,
1392     struct e1000_context_desc *context_desc)
1393 {
1394 	if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) {
1395 		context_desc->lower_setup.ip_fields.ipcss =
1396 		    cur_context->ether_header_size;
1397 		context_desc->lower_setup.ip_fields.ipcso =
1398 		    cur_context->ether_header_size +
1399 		    offsetof(struct ip, ip_sum);
1400 		context_desc->lower_setup.ip_fields.ipcse =
1401 		    cur_context->ether_header_size +
1402 		    cur_context->cksum_start - 1;
1403 	} else
1404 		context_desc->lower_setup.ip_config = 0;
1405 
1406 	if (cur_context->cksum_flags & HCK_PARTIALCKSUM) {
1407 		/*
1408 		 * The packet with same protocol has the following
1409 		 * stuff and start offset:
1410 		 * |  Protocol  | Stuff  | Start  | Checksum
1411 		 * |		| Offset | Offset | Enable
1412 		 * | IPv4 + TCP |  0x24  |  0x14  |  Yes
1413 		 * | IPv4 + UDP |  0x1A  |  0x14  |  Yes
1414 		 * | IPv6 + TCP |  0x20  |  0x10  |  No
1415 		 * | IPv6 + UDP |  0x14  |  0x10  |  No
1416 		 */
1417 		context_desc->upper_setup.tcp_fields.tucss =
1418 		    cur_context->cksum_start + cur_context->ether_header_size;
1419 		context_desc->upper_setup.tcp_fields.tucso =
1420 		    cur_context->cksum_stuff + cur_context->ether_header_size;
1421 		context_desc->upper_setup.tcp_fields.tucse = 0;
1422 	} else
1423 		context_desc->upper_setup.tcp_config = 0;
1424 
1425 	if (cur_context->lso_flag) {
1426 		context_desc->tcp_seg_setup.fields.mss = cur_context->mss;
1427 		context_desc->tcp_seg_setup.fields.hdr_len =
1428 		    cur_context->hdr_len;
1429 		/*
1430 		 * workaround for 82546EB errata 23, status-writeback
1431 		 * reporting (RS) should not be set on context or
1432 		 * Null descriptors
1433 		 */
1434 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1435 		    | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP
1436 		    | E1000_TXD_DTYP_C | cur_context->pay_len;
1437 	} else {
1438 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1439 		    | E1000_TXD_DTYP_C;
1440 		/*
1441 		 * Zero out the options for TCP Segmentation Offload
1442 		 */
1443 		context_desc->tcp_seg_setup.data = 0;
1444 	}
1445 }
1446 
1447 static int
1448 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring,
1449     p_tx_sw_packet_t packet, uint64_t address, size_t size)
1450 {
1451 	struct e1000_hw *hw = &tx_ring->adapter->shared;
1452 	p_sw_desc_t desc;
1453 
1454 	if (hw->mac.type == e1000_82544) {
1455 		if (hw->bus.type == e1000_bus_type_pcix)
1456 			return (e1000g_tx_workaround_PCIX_82544(packet,
1457 			    address, size));
1458 
1459 		if (size > JUMBO_FRAG_LENGTH)
1460 			return (e1000g_tx_workaround_jumbo_82544(packet,
1461 			    address, size));
1462 	}
1463 
1464 	ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1465 
1466 	desc = &packet->desc[packet->num_desc];
1467 	desc->address = address;
1468 	desc->length = (uint32_t)size;
1469 
1470 	packet->num_desc++;
1471 
1472 	return (1);
1473 }
1474 
1475 static int
1476 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet,
1477     uint64_t address, size_t size)
1478 {
1479 	p_sw_desc_t desc;
1480 	int desc_count;
1481 	long size_left;
1482 	size_t len;
1483 	uint32_t counter;
1484 	uint32_t array_elements;
1485 	desc_array_t desc_array;
1486 
1487 	/*
1488 	 * Coexist Workaround for cordova: RP: 07/04/03
1489 	 *
1490 	 * RP: ERRATA: Workaround ISSUE:
1491 	 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup
1492 	 * Eachbuffer in to 8kb pieces until the
1493 	 * remainder is < 8kb
1494 	 */
1495 	size_left = size;
1496 	desc_count = 0;
1497 
1498 	while (size_left > 0) {
1499 		if (size_left > MAX_TX_BUF_SIZE)
1500 			len = MAX_TX_BUF_SIZE;
1501 		else
1502 			len = size_left;
1503 
1504 		array_elements = e1000g_fill_82544_desc(address,
1505 		    len, &desc_array);
1506 
1507 		for (counter = 0; counter < array_elements; counter++) {
1508 			ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1509 			/*
1510 			 * Put in the buffer address
1511 			 */
1512 			desc = &packet->desc[packet->num_desc];
1513 
1514 			desc->address =
1515 			    desc_array.descriptor[counter].address;
1516 			desc->length =
1517 			    desc_array.descriptor[counter].length;
1518 
1519 			packet->num_desc++;
1520 			desc_count++;
1521 		} /* for */
1522 
1523 		/*
1524 		 * Update the buffer address and length
1525 		 */
1526 		address += MAX_TX_BUF_SIZE;
1527 		size_left -= MAX_TX_BUF_SIZE;
1528 	} /* while */
1529 
1530 	return (desc_count);
1531 }
1532 
1533 static int
1534 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet,
1535     uint64_t address, size_t size)
1536 {
1537 	p_sw_desc_t desc;
1538 	int desc_count;
1539 	long size_left;
1540 	uint32_t offset;
1541 
1542 	/*
1543 	 * Workaround for Jumbo Frames on Cordova
1544 	 * PSD 06/01/2001
1545 	 */
1546 	size_left = size;
1547 	desc_count = 0;
1548 	offset = 0;
1549 	while (size_left > 0) {
1550 		ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1551 
1552 		desc = &packet->desc[packet->num_desc];
1553 
1554 		desc->address = address + offset;
1555 
1556 		if (size_left > JUMBO_FRAG_LENGTH)
1557 			desc->length = JUMBO_FRAG_LENGTH;
1558 		else
1559 			desc->length = (uint32_t)size_left;
1560 
1561 		packet->num_desc++;
1562 		desc_count++;
1563 
1564 		offset += desc->length;
1565 		size_left -= JUMBO_FRAG_LENGTH;
1566 	}
1567 
1568 	return (desc_count);
1569 }
1570 
1571 #pragma inline(e1000g_82547_tx_move_tail_work)
1572 
1573 static void
1574 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring)
1575 {
1576 	struct e1000_hw *hw;
1577 	uint16_t hw_tdt;
1578 	uint16_t sw_tdt;
1579 	struct e1000_tx_desc *tx_desc;
1580 	uint16_t length = 0;
1581 	boolean_t eop = B_FALSE;
1582 	struct e1000g *Adapter;
1583 
1584 	Adapter = tx_ring->adapter;
1585 	hw = &Adapter->shared;
1586 
1587 	hw_tdt = E1000_READ_REG(hw, E1000_TDT(0));
1588 	sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first;
1589 
1590 	while (hw_tdt != sw_tdt) {
1591 		tx_desc = &(tx_ring->tbd_first[hw_tdt]);
1592 		length += tx_desc->lower.flags.length;
1593 		eop = tx_desc->lower.data & E1000_TXD_CMD_EOP;
1594 		if (++hw_tdt == Adapter->tx_desc_num)
1595 			hw_tdt = 0;
1596 
1597 		if (eop) {
1598 			if ((Adapter->link_duplex == HALF_DUPLEX) &&
1599 			    (e1000_fifo_workaround_82547(hw, length)
1600 			    != E1000_SUCCESS)) {
1601 				if (tx_ring->timer_enable_82547) {
1602 					ASSERT(tx_ring->timer_id_82547 == 0);
1603 					tx_ring->timer_id_82547 =
1604 					    timeout(e1000g_82547_timeout,
1605 					    (void *)tx_ring,
1606 					    drv_usectohz(10000));
1607 				}
1608 				return;
1609 
1610 			} else {
1611 				E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt);
1612 				e1000_update_tx_fifo_head_82547(hw, length);
1613 				length = 0;
1614 			}
1615 		}
1616 	}
1617 }
1618 
1619 static void
1620 e1000g_82547_timeout(void *arg)
1621 {
1622 	e1000g_tx_ring_t *tx_ring;
1623 
1624 	tx_ring = (e1000g_tx_ring_t *)arg;
1625 
1626 	mutex_enter(&tx_ring->tx_lock);
1627 
1628 	tx_ring->timer_id_82547 = 0;
1629 	e1000g_82547_tx_move_tail_work(tx_ring);
1630 
1631 	mutex_exit(&tx_ring->tx_lock);
1632 }
1633 
1634 static void
1635 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring)
1636 {
1637 	timeout_id_t tid;
1638 
1639 	ASSERT(MUTEX_HELD(&tx_ring->tx_lock));
1640 
1641 	tid = tx_ring->timer_id_82547;
1642 	tx_ring->timer_id_82547 = 0;
1643 	if (tid != 0) {
1644 		tx_ring->timer_enable_82547 = B_FALSE;
1645 		mutex_exit(&tx_ring->tx_lock);
1646 
1647 		(void) untimeout(tid);
1648 
1649 		mutex_enter(&tx_ring->tx_lock);
1650 	}
1651 	tx_ring->timer_enable_82547 = B_TRUE;
1652 	e1000g_82547_tx_move_tail_work(tx_ring);
1653 }
1654