xref: /titanic_50/usr/src/uts/common/io/e1000g/e1000g_tx.c (revision 924965c71c59efd981e0c32ba257aeb4d9ac51d6)
1 /*
2  * This file is provided under a CDDLv1 license.  When using or
3  * redistributing this file, you may do so under this license.
4  * In redistributing this file this license must be included
5  * and no other modification of this header file is permitted.
6  *
7  * CDDL LICENSE SUMMARY
8  *
9  * Copyright(c) 1999 - 2008 Intel Corporation. All rights reserved.
10  *
11  * The contents of this file are subject to the terms of Version
12  * 1.0 of the Common Development and Distribution License (the "License").
13  *
14  * You should have received a copy of the License with this software.
15  * You can obtain a copy of the License at
16  *	http://www.opensolaris.org/os/licensing.
17  * See the License for the specific language governing permissions
18  * and limitations under the License.
19  */
20 
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms of the CDDLv1.
24  */
25 
26 /*
27  * **********************************************************************
28  *									*
29  * Module Name:								*
30  *   e1000g_tx.c							*
31  *									*
32  * Abstract:								*
33  *   This file contains some routines that take care of Transmit,	*
34  *   make the hardware to send the data pointed by the packet out	*
35  *   on to the physical medium.						*
36  *									*
37  * **********************************************************************
38  */
39 
40 #include "e1000g_sw.h"
41 #include "e1000g_debug.h"
42 
43 static boolean_t e1000g_send(struct e1000g *, mblk_t *);
44 static int e1000g_tx_copy(e1000g_tx_ring_t *,
45     p_tx_sw_packet_t, mblk_t *, boolean_t);
46 static int e1000g_tx_bind(e1000g_tx_ring_t *,
47     p_tx_sw_packet_t, mblk_t *);
48 static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t);
49 static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *);
50 static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *,
51     context_data_t *);
52 static void e1000g_fill_context_descriptor(context_data_t *,
53     struct e1000_context_desc *);
54 static int e1000g_fill_tx_desc(e1000g_tx_ring_t *,
55     p_tx_sw_packet_t, uint64_t, size_t);
56 static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length,
57     p_desc_array_t desc_array);
58 static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t);
59 static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t);
60 static void e1000g_82547_timeout(void *);
61 static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *);
62 static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *);
63 
64 #ifndef E1000G_DEBUG
65 #pragma inline(e1000g_tx_copy)
66 #pragma inline(e1000g_tx_bind)
67 #pragma inline(e1000g_retrieve_context)
68 #pragma inline(e1000g_check_context)
69 #pragma inline(e1000g_fill_tx_ring)
70 #pragma inline(e1000g_fill_context_descriptor)
71 #pragma inline(e1000g_fill_tx_desc)
72 #pragma inline(e1000g_fill_82544_desc)
73 #pragma inline(e1000g_tx_workaround_PCIX_82544)
74 #pragma inline(e1000g_tx_workaround_jumbo_82544)
75 #pragma inline(e1000g_free_tx_swpkt)
76 #endif
77 
78 /*
79  * e1000g_free_tx_swpkt	- free up the tx sw packet
80  *
81  * Unbind the previously bound DMA handle for a given
82  * transmit sw packet. And reset the sw packet data.
83  */
84 void
85 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet)
86 {
87 	switch (packet->data_transfer_type) {
88 	case USE_BCOPY:
89 		packet->tx_buf->len = 0;
90 		break;
91 #ifdef __sparc
92 	case USE_DVMA:
93 		dvma_unload(packet->tx_dma_handle, 0, -1);
94 		break;
95 #endif
96 	case USE_DMA:
97 		(void) ddi_dma_unbind_handle(packet->tx_dma_handle);
98 		break;
99 	default:
100 		break;
101 	}
102 
103 	/*
104 	 * The mblk has been stripped off the sw packet
105 	 * and will be freed in a triggered soft intr.
106 	 */
107 	ASSERT(packet->mp == NULL);
108 
109 	packet->data_transfer_type = USE_NONE;
110 	packet->num_mblk_frag = 0;
111 	packet->num_desc = 0;
112 }
113 
114 mblk_t *
115 e1000g_m_tx(void *arg, mblk_t *mp)
116 {
117 	struct e1000g *Adapter = (struct e1000g *)arg;
118 	mblk_t *next;
119 
120 	rw_enter(&Adapter->chip_lock, RW_READER);
121 
122 	if ((Adapter->chip_state != E1000G_START) ||
123 	    (Adapter->link_state != LINK_STATE_UP)) {
124 		freemsgchain(mp);
125 		mp = NULL;
126 	}
127 
128 	while (mp != NULL) {
129 		next = mp->b_next;
130 		mp->b_next = NULL;
131 
132 		if (!e1000g_send(Adapter, mp)) {
133 			mp->b_next = next;
134 			break;
135 		}
136 
137 		mp = next;
138 	}
139 
140 	rw_exit(&Adapter->chip_lock);
141 	return (mp);
142 }
143 
144 /*
145  * e1000g_send -  send packets onto the wire
146  *
147  * Called from e1000g_m_tx with an mblk ready to send. this
148  * routine sets up the transmit descriptors and sends data to
149  * the wire. It also pushes the just transmitted packet to
150  * the used tx sw packet list.
151  */
152 static boolean_t
153 e1000g_send(struct e1000g *Adapter, mblk_t *mp)
154 {
155 	p_tx_sw_packet_t packet;
156 	LIST_DESCRIBER pending_list;
157 	size_t len;
158 	size_t msg_size;
159 	uint32_t frag_count;
160 	int desc_count;
161 	uint32_t desc_total;
162 	boolean_t tx_undersize_flag;
163 	mblk_t *nmp;
164 	mblk_t *tmp;
165 	e1000g_tx_ring_t *tx_ring;
166 	context_data_t cur_context;
167 
168 	tx_ring = Adapter->tx_ring;
169 
170 	/* Get the total size and frags number of the message */
171 	tx_undersize_flag = B_FALSE;
172 	frag_count = 0;
173 	msg_size = 0;
174 	for (nmp = mp; nmp; nmp = nmp->b_cont) {
175 		frag_count++;
176 		msg_size += MBLKL(nmp);
177 	}
178 
179 	/* retrieve and compute information for context descriptor */
180 	if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) {
181 		freemsg(mp);
182 		return (B_TRUE);
183 	}
184 
185 	/*
186 	 * Make sure the packet is less than the allowed size
187 	 */
188 	if (!cur_context.lso_flag &&
189 	    (msg_size > Adapter->max_frame_size - ETHERFCSL)) {
190 		/*
191 		 * For the over size packet, we'll just drop it.
192 		 * So we return B_TRUE here.
193 		 */
194 		E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL,
195 		    "Tx packet out of bound. length = %d \n", msg_size);
196 		E1000G_STAT(tx_ring->stat_over_size);
197 		freemsg(mp);
198 		return (B_TRUE);
199 	}
200 
201 	/*
202 	 * Check and reclaim tx descriptors.
203 	 * This low water mark check should be done all the time as
204 	 * Transmit interrupt delay can produce Transmit interrupts little
205 	 * late and that may cause few problems related to reaping Tx
206 	 * Descriptors... As you may run short of them before getting any
207 	 * transmit interrupt...
208 	 */
209 	if (tx_ring->resched_needed ||
210 	    (tx_ring->tbd_avail < Adapter->tx_recycle_thresh)) {
211 		(void) e1000g_recycle(tx_ring);
212 		E1000G_DEBUG_STAT(tx_ring->stat_recycle);
213 
214 		if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
215 			E1000G_DEBUG_STAT(tx_ring->stat_lack_desc);
216 			goto tx_no_resource;
217 		}
218 	}
219 
220 	/*
221 	 * If the message size is less than the minimum ethernet packet size,
222 	 * we'll use bcopy to send it, and padd it to 60 bytes later.
223 	 */
224 	if (msg_size < ETHERMIN) {
225 		E1000G_DEBUG_STAT(tx_ring->stat_under_size);
226 		tx_undersize_flag = B_TRUE;
227 	}
228 
229 	/* Initialize variables */
230 	desc_count = 1;	/* The initial value should be greater than 0 */
231 	desc_total = 0;
232 	QUEUE_INIT_LIST(&pending_list);
233 
234 	/* Process each mblk fragment and fill tx descriptors */
235 	packet = NULL;
236 	nmp = mp;
237 	while (nmp) {
238 		tmp = nmp->b_cont;
239 
240 		len = MBLKL(nmp);
241 		/* Check zero length mblks */
242 		if (len == 0) {
243 			E1000G_DEBUG_STAT(tx_ring->stat_empty_frags);
244 			/*
245 			 * If there're no packet buffers have been used,
246 			 * or we just completed processing a buffer, then
247 			 * skip the empty mblk fragment.
248 			 * Otherwise, there's still a pending buffer that
249 			 * needs to be processed (tx_copy).
250 			 */
251 			if (desc_count > 0) {
252 				nmp = tmp;
253 				continue;
254 			}
255 		}
256 
257 		/*
258 		 * Get a new TxSwPacket to process mblk buffers.
259 		 */
260 		if (desc_count > 0) {
261 			mutex_enter(&tx_ring->freelist_lock);
262 			packet = (p_tx_sw_packet_t)
263 			    QUEUE_POP_HEAD(&tx_ring->free_list);
264 			mutex_exit(&tx_ring->freelist_lock);
265 
266 			if (packet == NULL) {
267 				E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
268 				    "No Tx SwPacket available\n");
269 				E1000G_STAT(tx_ring->stat_no_swpkt);
270 				goto tx_send_failed;
271 			}
272 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
273 		}
274 
275 		ASSERT(packet);
276 		/*
277 		 * If the size of the fragment is less than the tx_bcopy_thresh
278 		 * we'll use bcopy; Otherwise, we'll use DMA binding.
279 		 */
280 		if ((len <= Adapter->tx_bcopy_thresh) || tx_undersize_flag) {
281 			desc_count =
282 			    e1000g_tx_copy(tx_ring, packet, nmp,
283 			    tx_undersize_flag);
284 			E1000G_DEBUG_STAT(tx_ring->stat_copy);
285 		} else {
286 			desc_count =
287 			    e1000g_tx_bind(tx_ring, packet, nmp);
288 			E1000G_DEBUG_STAT(tx_ring->stat_bind);
289 		}
290 
291 		if (desc_count > 0)
292 			desc_total += desc_count;
293 		else if (desc_count < 0)
294 			goto tx_send_failed;
295 
296 		nmp = tmp;
297 	}
298 
299 	/* Assign the message to the last sw packet */
300 	ASSERT(packet);
301 	ASSERT(packet->mp == NULL);
302 	packet->mp = mp;
303 
304 	/* Try to recycle the tx descriptors again */
305 	if (tx_ring->tbd_avail < (desc_total + 2)) {
306 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry);
307 		(void) e1000g_recycle(tx_ring);
308 	}
309 
310 	mutex_enter(&tx_ring->tx_lock);
311 
312 	/*
313 	 * If the number of available tx descriptors is not enough for transmit
314 	 * (one redundant descriptor and one hw checksum context descriptor are
315 	 * included), then return failure.
316 	 */
317 	if (tx_ring->tbd_avail < (desc_total + 2)) {
318 		E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
319 		    "No Enough Tx descriptors\n");
320 		E1000G_STAT(tx_ring->stat_no_desc);
321 		mutex_exit(&tx_ring->tx_lock);
322 		goto tx_send_failed;
323 	}
324 
325 	desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context);
326 
327 	mutex_exit(&tx_ring->tx_lock);
328 
329 	ASSERT(desc_count > 0);
330 
331 	/* Send successful */
332 	return (B_TRUE);
333 
334 tx_send_failed:
335 	/*
336 	 * Enable Transmit interrupts, so that the interrupt routine can
337 	 * call mac_tx_update() when transmit descriptors become available.
338 	 */
339 	tx_ring->resched_needed = B_TRUE;
340 	if (!Adapter->tx_intr_enable)
341 		e1000g_mask_tx_interrupt(Adapter);
342 
343 	/* Free pending TxSwPackets */
344 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
345 	while (packet) {
346 		packet->mp = NULL;
347 		e1000g_free_tx_swpkt(packet);
348 		packet = (p_tx_sw_packet_t)
349 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
350 	}
351 
352 	/* Return pending TxSwPackets to the "Free" list */
353 	mutex_enter(&tx_ring->freelist_lock);
354 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
355 	mutex_exit(&tx_ring->freelist_lock);
356 
357 	E1000G_STAT(tx_ring->stat_send_fail);
358 
359 	/* Message will be scheduled for re-transmit */
360 	return (B_FALSE);
361 
362 tx_no_resource:
363 	/*
364 	 * Enable Transmit interrupts, so that the interrupt routine can
365 	 * call mac_tx_update() when transmit descriptors become available.
366 	 */
367 	tx_ring->resched_needed = B_TRUE;
368 	if (!Adapter->tx_intr_enable)
369 		e1000g_mask_tx_interrupt(Adapter);
370 
371 	/* Message will be scheduled for re-transmit */
372 	return (B_FALSE);
373 }
374 
375 static boolean_t
376 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
377     size_t msg_size)
378 {
379 	uintptr_t ip_start;
380 	uintptr_t tcp_start;
381 	mblk_t *nmp;
382 
383 	bzero(cur_context, sizeof (context_data_t));
384 
385 	/* retrieve checksum info */
386 	hcksum_retrieve(mp, NULL, NULL, &cur_context->cksum_start,
387 	    &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
388 	/* retrieve ethernet header size */
389 	if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid ==
390 	    htons(ETHERTYPE_VLAN))
391 		cur_context->ether_header_size =
392 		    sizeof (struct ether_vlan_header);
393 	else
394 		cur_context->ether_header_size =
395 		    sizeof (struct ether_header);
396 
397 	if (cur_context->cksum_flags & HW_LSO) {
398 		if ((cur_context->mss = DB_LSOMSS(mp)) != 0) {
399 			/* free the invaid packet */
400 			if (!((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
401 			    (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
402 				return (B_FALSE);
403 			}
404 			cur_context->lso_flag = B_TRUE;
405 			/*
406 			 * Some fields are cleared for the hardware to fill
407 			 * in. We don't assume Ethernet header, IP header and
408 			 * TCP header are always in the same mblk fragment,
409 			 * while we assume each header is always within one
410 			 * mblk fragment and Ethernet header is always in the
411 			 * first mblk fragment.
412 			 */
413 			nmp = mp;
414 			ip_start = (uintptr_t)(nmp->b_rptr)
415 			    + cur_context->ether_header_size;
416 			if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
417 				ip_start = (uintptr_t)nmp->b_cont->b_rptr
418 				    + (ip_start - (uintptr_t)(nmp->b_wptr));
419 				nmp = nmp->b_cont;
420 			}
421 			tcp_start = ip_start +
422 			    IPH_HDR_LENGTH((ipha_t *)ip_start);
423 			if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
424 				tcp_start = (uintptr_t)nmp->b_cont->b_rptr
425 				    + (tcp_start - (uintptr_t)(nmp->b_wptr));
426 				nmp = nmp->b_cont;
427 			}
428 			cur_context->hdr_len = cur_context->ether_header_size
429 			    + IPH_HDR_LENGTH((ipha_t *)ip_start)
430 			    + TCP_HDR_LENGTH((tcph_t *)tcp_start);
431 			((ipha_t *)ip_start)->ipha_length = 0;
432 			((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
433 			/* calculate the TCP packet payload length */
434 			cur_context->pay_len = msg_size - cur_context->hdr_len;
435 		}
436 	}
437 	return (B_TRUE);
438 }
439 
440 static boolean_t
441 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context)
442 {
443 	boolean_t context_reload;
444 	context_data_t *pre_context;
445 	struct e1000g *Adapter;
446 
447 	context_reload = B_FALSE;
448 	pre_context = &tx_ring->pre_context;
449 	Adapter = tx_ring->adapter;
450 
451 	/*
452 	 * The following code determine if the context descriptor is
453 	 * needed to be reloaded. The sequence of the conditions is
454 	 * made by their possibilities of changing.
455 	 */
456 	/*
457 	 * workaround for 82546EB, context descriptor must be reloaded
458 	 * per LSO/hw_cksum packet if LSO is enabled.
459 	 */
460 	if (Adapter->lso_premature_issue &&
461 	    Adapter->lso_enable &&
462 	    (cur_context->cksum_flags != 0)) {
463 
464 		context_reload = B_TRUE;
465 	} else if (cur_context->lso_flag) {
466 		if ((cur_context->cksum_flags != pre_context->cksum_flags) ||
467 		    (cur_context->pay_len != pre_context->pay_len) ||
468 		    (cur_context->mss != pre_context->mss) ||
469 		    (cur_context->hdr_len != pre_context->hdr_len) ||
470 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
471 		    (cur_context->cksum_start != pre_context->cksum_start) ||
472 		    (cur_context->ether_header_size !=
473 		    pre_context->ether_header_size)) {
474 
475 			context_reload = B_TRUE;
476 		}
477 	} else if (cur_context->cksum_flags != 0) {
478 		if ((cur_context->cksum_flags != pre_context->cksum_flags) ||
479 		    (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
480 		    (cur_context->cksum_start != pre_context->cksum_start) ||
481 		    (cur_context->ether_header_size !=
482 		    pre_context->ether_header_size)) {
483 
484 			context_reload = B_TRUE;
485 		}
486 	}
487 
488 	return (context_reload);
489 }
490 
491 static int
492 e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
493     context_data_t *cur_context)
494 {
495 	struct e1000g *Adapter;
496 	struct e1000_hw *hw;
497 	p_tx_sw_packet_t first_packet;
498 	p_tx_sw_packet_t packet;
499 	p_tx_sw_packet_t previous_packet;
500 	boolean_t context_reload;
501 	struct e1000_tx_desc *first_data_desc;
502 	struct e1000_tx_desc *next_desc;
503 	struct e1000_tx_desc *descriptor;
504 	int desc_count;
505 	boolean_t buff_overrun_flag;
506 	int i;
507 
508 	Adapter = tx_ring->adapter;
509 	hw = &Adapter->shared;
510 
511 	desc_count = 0;
512 	first_packet = NULL;
513 	first_data_desc = NULL;
514 	descriptor = NULL;
515 	first_packet = NULL;
516 	packet = NULL;
517 	buff_overrun_flag = B_FALSE;
518 
519 	next_desc = tx_ring->tbd_next;
520 
521 	/* Context descriptor reload check */
522 	context_reload = e1000g_check_context(tx_ring, cur_context);
523 
524 	if (context_reload) {
525 		first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
526 
527 		descriptor = next_desc;
528 
529 		e1000g_fill_context_descriptor(cur_context,
530 		    (struct e1000_context_desc *)descriptor);
531 
532 		/* Check the wrap-around case */
533 		if (descriptor == tx_ring->tbd_last)
534 			next_desc = tx_ring->tbd_first;
535 		else
536 			next_desc++;
537 
538 		desc_count++;
539 	}
540 
541 	first_data_desc = next_desc;
542 
543 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
544 	while (packet) {
545 		ASSERT(packet->num_desc);
546 
547 		for (i = 0; i < packet->num_desc; i++) {
548 			ASSERT(tx_ring->tbd_avail > 0);
549 
550 			descriptor = next_desc;
551 			descriptor->buffer_addr =
552 			    packet->desc[i].address;
553 			descriptor->lower.data =
554 			    packet->desc[i].length;
555 
556 			/* Zero out status */
557 			descriptor->upper.data = 0;
558 
559 			descriptor->lower.data |=
560 			    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
561 			/* must set RS on every outgoing descriptor */
562 			descriptor->lower.data |=
563 			    E1000_TXD_CMD_RS;
564 
565 			if (cur_context->lso_flag)
566 				descriptor->lower.data |= E1000_TXD_CMD_TSE;
567 
568 			/* Check the wrap-around case */
569 			if (descriptor == tx_ring->tbd_last)
570 				next_desc = tx_ring->tbd_first;
571 			else
572 				next_desc++;
573 
574 			desc_count++;
575 
576 			/*
577 			 * workaround for 82546EB errata 33, hang in PCI-X
578 			 * systems due to 2k Buffer Overrun during Transmit
579 			 * Operation. The workaround applies to all the Intel
580 			 * PCI-X chips.
581 			 */
582 			if (hw->bus.type == e1000_bus_type_pcix &&
583 			    descriptor == first_data_desc &&
584 			    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK)
585 			    > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) {
586 				/* modified the first descriptor */
587 				descriptor->lower.data &=
588 				    ~E1000G_TBD_LENGTH_MASK;
589 				descriptor->lower.flags.length =
590 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
591 
592 				/* insert a new descriptor */
593 				ASSERT(tx_ring->tbd_avail > 0);
594 				next_desc->buffer_addr =
595 				    packet->desc[0].address +
596 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
597 				next_desc->lower.data =
598 				    packet->desc[0].length -
599 				    E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
600 
601 				/* Zero out status */
602 				next_desc->upper.data = 0;
603 
604 				next_desc->lower.data |=
605 				    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
606 				/* must set RS on every outgoing descriptor */
607 				next_desc->lower.data |=
608 				    E1000_TXD_CMD_RS;
609 
610 				if (cur_context->lso_flag)
611 					next_desc->lower.data |=
612 					    E1000_TXD_CMD_TSE;
613 
614 				descriptor = next_desc;
615 
616 				/* Check the wrap-around case */
617 				if (next_desc == tx_ring->tbd_last)
618 					next_desc = tx_ring->tbd_first;
619 				else
620 					next_desc++;
621 
622 				desc_count++;
623 				buff_overrun_flag = B_TRUE;
624 			}
625 		}
626 
627 		if (buff_overrun_flag) {
628 			packet->num_desc++;
629 			buff_overrun_flag = B_FALSE;
630 		}
631 
632 		if (first_packet != NULL) {
633 			/*
634 			 * Count the checksum context descriptor for
635 			 * the first SwPacket.
636 			 */
637 			first_packet->num_desc++;
638 			first_packet = NULL;
639 		}
640 
641 		previous_packet = packet;
642 		packet = (p_tx_sw_packet_t)
643 		    QUEUE_GET_NEXT(pending_list, &packet->Link);
644 	}
645 
646 	/*
647 	 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back
648 	 */
649 	if (Adapter->lso_premature_issue && cur_context->lso_flag &&
650 	    ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) {
651 		/* modified the previous descriptor */
652 		descriptor->lower.data -= 4;
653 
654 		/* insert a new descriptor */
655 		ASSERT(tx_ring->tbd_avail > 0);
656 		/* the lower 20 bits of lower.data is the length field */
657 		next_desc->buffer_addr =
658 		    descriptor->buffer_addr +
659 		    (descriptor->lower.data & E1000G_TBD_LENGTH_MASK);
660 		next_desc->lower.data = 4;
661 
662 		/* Zero out status */
663 		next_desc->upper.data = 0;
664 		/* It must be part of a LSO packet */
665 		next_desc->lower.data |=
666 		    E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
667 		    E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE;
668 
669 		descriptor = next_desc;
670 
671 		/* Check the wrap-around case */
672 		if (descriptor == tx_ring->tbd_last)
673 			next_desc = tx_ring->tbd_first;
674 		else
675 			next_desc++;
676 
677 		desc_count++;
678 		/* update the number of descriptors */
679 		previous_packet->num_desc++;
680 	}
681 
682 	ASSERT(descriptor);
683 
684 	if (cur_context->cksum_flags) {
685 		if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
686 			((struct e1000_data_desc *)first_data_desc)->
687 			    upper.fields.popts |= E1000_TXD_POPTS_IXSM;
688 		if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
689 			((struct e1000_data_desc *)first_data_desc)->
690 			    upper.fields.popts |= E1000_TXD_POPTS_TXSM;
691 	}
692 
693 	/*
694 	 * Last Descriptor of Packet needs End Of Packet (EOP), Report
695 	 * Status (RS) set.
696 	 */
697 	if (Adapter->tx_intr_delay) {
698 		descriptor->lower.data |= E1000_TXD_CMD_IDE |
699 		    E1000_TXD_CMD_EOP;
700 	} else {
701 		descriptor->lower.data |= E1000_TXD_CMD_EOP;
702 	}
703 
704 	/* Set append Ethernet CRC (IFCS) bits */
705 	if (cur_context->lso_flag) {
706 		first_data_desc->lower.data |= E1000_TXD_CMD_IFCS;
707 	} else {
708 		descriptor->lower.data |= E1000_TXD_CMD_IFCS;
709 	}
710 
711 	/*
712 	 * Sync the Tx descriptors DMA buffer
713 	 */
714 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
715 	    0, 0, DDI_DMA_SYNC_FORDEV);
716 
717 	tx_ring->tbd_next = next_desc;
718 
719 	/*
720 	 * Advance the Transmit Descriptor Tail (Tdt), this tells the
721 	 * FX1000 that this frame is available to transmit.
722 	 */
723 	if (hw->mac.type == e1000_82547)
724 		e1000g_82547_tx_move_tail(tx_ring);
725 	else
726 		E1000_WRITE_REG(hw, E1000_TDT(0),
727 		    (uint32_t)(next_desc - tx_ring->tbd_first));
728 
729 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
730 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
731 		Adapter->chip_state = E1000G_ERROR;
732 	}
733 
734 	/* Put the pending SwPackets to the "Used" list */
735 	mutex_enter(&tx_ring->usedlist_lock);
736 	QUEUE_APPEND(&tx_ring->used_list, pending_list);
737 	tx_ring->tbd_avail -= desc_count;
738 	mutex_exit(&tx_ring->usedlist_lock);
739 
740 	/* update LSO related data */
741 	if (context_reload)
742 		tx_ring->pre_context = *cur_context;
743 
744 	return (desc_count);
745 }
746 
747 
748 /*
749  * e1000g_tx_setup - setup tx data structures
750  *
751  * This routine initializes all of the transmit related
752  * structures. This includes the Transmit descriptors,
753  * and the tx_sw_packet structures.
754  */
755 void
756 e1000g_tx_setup(struct e1000g *Adapter)
757 {
758 	struct e1000_hw *hw;
759 	p_tx_sw_packet_t packet;
760 	uint32_t i;
761 	uint32_t buf_high;
762 	uint32_t buf_low;
763 	uint32_t reg_tipg;
764 	uint32_t reg_tctl;
765 	int size;
766 	e1000g_tx_ring_t *tx_ring;
767 
768 	hw = &Adapter->shared;
769 	tx_ring = Adapter->tx_ring;
770 
771 	/* init the lists */
772 	/*
773 	 * Here we don't need to protect the lists using the
774 	 * usedlist_lock and freelist_lock, for they have
775 	 * been protected by the chip_lock.
776 	 */
777 	QUEUE_INIT_LIST(&tx_ring->used_list);
778 	QUEUE_INIT_LIST(&tx_ring->free_list);
779 
780 	/* Go through and set up each SW_Packet */
781 	packet = tx_ring->packet_area;
782 	for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) {
783 		/* Initialize this tx_sw_apcket area */
784 		e1000g_free_tx_swpkt(packet);
785 		/* Add this tx_sw_packet to the free list */
786 		QUEUE_PUSH_TAIL(&tx_ring->free_list,
787 		    &packet->Link);
788 	}
789 
790 	/* Setup TX descriptor pointers */
791 	tx_ring->tbd_next = tx_ring->tbd_first;
792 	tx_ring->tbd_oldest = tx_ring->tbd_first;
793 
794 	/*
795 	 * Setup Hardware TX Registers
796 	 */
797 	/* Setup the Transmit Control Register (TCTL). */
798 	reg_tctl = E1000_READ_REG(hw, E1000_TCTL);
799 	reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN |
800 	    (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) |
801 	    (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) |
802 	    E1000_TCTL_RTLC;
803 
804 	/* Enable the MULR bit */
805 	if (hw->bus.type == e1000_bus_type_pci_express)
806 		reg_tctl |= E1000_TCTL_MULR;
807 
808 	E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl);
809 
810 	/* Setup HW Base and Length of Tx descriptor area */
811 	size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc));
812 	E1000_WRITE_REG(hw, E1000_TDLEN(0), size);
813 	size = E1000_READ_REG(hw, E1000_TDLEN(0));
814 
815 	buf_low = (uint32_t)tx_ring->tbd_dma_addr;
816 	buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32);
817 
818 	E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low);
819 	E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high);
820 
821 	/* Setup our HW Tx Head & Tail descriptor pointers */
822 	E1000_WRITE_REG(hw, E1000_TDH(0), 0);
823 	E1000_WRITE_REG(hw, E1000_TDT(0), 0);
824 
825 	/* Set the default values for the Tx Inter Packet Gap timer */
826 	if ((hw->mac.type == e1000_82542) &&
827 	    ((hw->revision_id == E1000_REVISION_2) ||
828 	    (hw->revision_id == E1000_REVISION_3))) {
829 		reg_tipg = DEFAULT_82542_TIPG_IPGT;
830 		reg_tipg |=
831 		    DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
832 		reg_tipg |=
833 		    DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
834 	} else if (hw->mac.type == e1000_80003es2lan) {
835 		reg_tipg = DEFAULT_82543_TIPG_IPGR1;
836 		reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
837 		    E1000_TIPG_IPGR2_SHIFT;
838 	} else {
839 		if (hw->phy.media_type == e1000_media_type_fiber)
840 			reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER;
841 		else
842 			reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER;
843 		reg_tipg |=
844 		    DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
845 		reg_tipg |=
846 		    DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
847 	}
848 	E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg);
849 
850 	/* Setup Transmit Interrupt Delay Value */
851 	E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay);
852 	E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
853 	    "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay);
854 
855 	if (hw->mac.type >= e1000_82540) {
856 		E1000_WRITE_REG(&Adapter->shared, E1000_TADV,
857 		    Adapter->tx_intr_abs_delay);
858 		E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
859 		    "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay);
860 	}
861 
862 	tx_ring->tbd_avail = Adapter->tx_desc_num;
863 
864 	/* Initialize stored context information */
865 	bzero(&(tx_ring->pre_context), sizeof (context_data_t));
866 }
867 
868 /*
869  * e1000g_recycle - recycle the tx descriptors and tx sw packets
870  */
871 int
872 e1000g_recycle(e1000g_tx_ring_t *tx_ring)
873 {
874 	struct e1000g *Adapter;
875 	LIST_DESCRIBER pending_list;
876 	p_tx_sw_packet_t packet;
877 	mblk_t *mp;
878 	mblk_t *nmp;
879 	struct e1000_tx_desc *descriptor;
880 	int desc_count;
881 	int is_intr;
882 
883 	/*
884 	 * This function will examine each TxSwPacket in the 'used' queue
885 	 * if the e1000g is done with it then the associated resources (Tx
886 	 * Descriptors) will be "freed" and the TxSwPacket will be
887 	 * returned to the 'free' queue.
888 	 */
889 	Adapter = tx_ring->adapter;
890 
891 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list);
892 	if (packet == NULL) {
893 		tx_ring->recycle_fail = 0;
894 		tx_ring->stall_watchdog = 0;
895 		return (0);
896 	}
897 
898 	is_intr = servicing_interrupt();
899 
900 	if (is_intr)
901 		mutex_enter(&tx_ring->usedlist_lock);
902 	else if (mutex_tryenter(&tx_ring->usedlist_lock) == 0)
903 		return (0);
904 
905 	desc_count = 0;
906 	QUEUE_INIT_LIST(&pending_list);
907 
908 	/* Sync the Tx descriptor DMA buffer */
909 	(void) ddi_dma_sync(tx_ring->tbd_dma_handle,
910 	    0, 0, DDI_DMA_SYNC_FORKERNEL);
911 	if (e1000g_check_dma_handle(
912 	    tx_ring->tbd_dma_handle) != DDI_FM_OK) {
913 		mutex_exit(&tx_ring->usedlist_lock);
914 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
915 		Adapter->chip_state = E1000G_ERROR;
916 		return (0);
917 	}
918 
919 	/*
920 	 * While there are still TxSwPackets in the used queue check them
921 	 */
922 	while ((packet =
923 	    (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
924 
925 		/*
926 		 * Get hold of the next descriptor that the e1000g will
927 		 * report status back to (this will be the last descriptor
928 		 * of a given sw packet). We only want to free the
929 		 * sw packet (and it resources) if the e1000g is done
930 		 * with ALL of the descriptors.  If the e1000g is done
931 		 * with the last one then it is done with all of them.
932 		 */
933 		ASSERT(packet->num_desc);
934 		descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1);
935 
936 		/* Check for wrap case */
937 		if (descriptor > tx_ring->tbd_last)
938 			descriptor -= Adapter->tx_desc_num;
939 
940 		/*
941 		 * If the descriptor done bit is set free TxSwPacket and
942 		 * associated resources
943 		 */
944 		if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) {
945 			QUEUE_POP_HEAD(&tx_ring->used_list);
946 			QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
947 
948 			if (descriptor == tx_ring->tbd_last)
949 				tx_ring->tbd_oldest =
950 				    tx_ring->tbd_first;
951 			else
952 				tx_ring->tbd_oldest =
953 				    descriptor + 1;
954 
955 			desc_count += packet->num_desc;
956 
957 			if (is_intr && (desc_count >= Adapter->tx_recycle_num))
958 				break;
959 		} else {
960 			/*
961 			 * Found a sw packet that the e1000g is not done
962 			 * with then there is no reason to check the rest
963 			 * of the queue.
964 			 */
965 			break;
966 		}
967 	}
968 
969 	tx_ring->tbd_avail += desc_count;
970 	Adapter->tx_pkt_cnt += desc_count;
971 
972 	mutex_exit(&tx_ring->usedlist_lock);
973 
974 	if (desc_count == 0) {
975 		tx_ring->recycle_fail++;
976 		E1000G_DEBUG_STAT(tx_ring->stat_recycle_none);
977 		return (0);
978 	}
979 
980 	tx_ring->recycle_fail = 0;
981 	tx_ring->stall_watchdog = 0;
982 
983 	mp = NULL;
984 	nmp = NULL;
985 	packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
986 	ASSERT(packet != NULL);
987 	while (packet != NULL) {
988 		if (packet->mp != NULL) {
989 			ASSERT(packet->mp->b_next == NULL);
990 			/* Assemble the message chain */
991 			if (mp == NULL) {
992 				mp = packet->mp;
993 				nmp = packet->mp;
994 			} else {
995 				nmp->b_next = packet->mp;
996 				nmp = packet->mp;
997 			}
998 			/* Disconnect the message from the sw packet */
999 			packet->mp = NULL;
1000 		}
1001 
1002 		/* Free the TxSwPackets */
1003 		e1000g_free_tx_swpkt(packet);
1004 
1005 		packet = (p_tx_sw_packet_t)
1006 		    QUEUE_GET_NEXT(&pending_list, &packet->Link);
1007 	}
1008 
1009 	/* Return the TxSwPackets back to the FreeList */
1010 	mutex_enter(&tx_ring->freelist_lock);
1011 	QUEUE_APPEND(&tx_ring->free_list, &pending_list);
1012 	mutex_exit(&tx_ring->freelist_lock);
1013 
1014 	if (mp != NULL)
1015 		freemsgchain(mp);
1016 
1017 	return (desc_count);
1018 }
1019 /*
1020  * 82544 Coexistence issue workaround:
1021  *    There are 2 issues.
1022  *    1. If a 32 bit split completion happens from P64H2 and another
1023  *	agent drives a 64 bit request/split completion after ONLY
1024  *	1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then
1025  *	82544 has a problem where in to clock all the data in, it
1026  *	looks at REQ64# signal and since it has changed so fast (i.e. 1
1027  *	idle clock turn around), it will fail to clock all the data in.
1028  *	Data coming from certain ending addresses has exposure to this issue.
1029  *
1030  * To detect this issue, following equation can be used...
1031  *	SIZE[3:0] + ADDR[2:0] = SUM[3:0].
1032  *	If SUM[3:0] is in between 1 to 4, we will have this issue.
1033  *
1034  * ROOT CAUSE:
1035  *	The erratum involves the 82544 PCIX elasticity FIFO implementations as
1036  *	64-bit FIFO's and flushing of the final partial-bytes corresponding
1037  *	to the end of a requested read burst. Under a specific burst condition
1038  *	of ending-data alignment and 32-byte split-completions, the final
1039  *	byte(s) of split-completion data require an extra clock cycle to flush
1040  *	into 64-bit FIFO orientation.  An incorrect logic dependency on the
1041  *	REQ64# signal occurring during this clock cycle may cause the
1042  *	residual byte(s) to be lost, thereby rendering the internal DMA client
1043  *	forever awaiting the final byte(s) for an outbound data-fetch.  The
1044  *	erratum is confirmed to *only* occur if certain subsequent external
1045  *	64-bit PCIX bus transactions occur immediately (minimum possible bus
1046  *	turn-around) following the odd-aligned 32-bit split-completion
1047  *	containing the final byte(s).  Intel has confirmed that this has been
1048  *	seen only with chipset/bridges which have the capability to provide
1049  *	32-bit split-completion data, and in the presence of newer PCIX bus
1050  *	agents which fully-optimize the inter-transaction turn-around (zero
1051  *	additional initiator latency when pre-granted bus ownership).
1052  *
1053  *   	This issue does not exist in PCI bus mode, when any agent is operating
1054  *	in 32 bit only mode or on chipsets that do not do 32 bit split
1055  *	completions for 64 bit read requests (Serverworks chipsets). P64H2 does
1056  *	32 bit split completions for any read request that has bit 2 set to 1
1057  *	for the requested address and read request size is more than 8 bytes.
1058  *
1059  *   2. Another issue is related to 82544 driving DACs under the similar
1060  *	scenario (32 bit split completion followed by 64 bit transaction with
1061  *	only 1 cycle turnaround). This issue is still being root caused. We
1062  *	think that both of these issues can be avoided if following workaround
1063  *	is implemented. It seems DAC issues is related to ending addresses being
1064  *	0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity
1065  *	FIFO which does not get flushed due to REQ64# dependency. We will only
1066  *	know the full story after it has been simulated successfully by HW team.
1067  *
1068  * WORKAROUND:
1069  *	Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC)
1070  */
1071 static uint32_t
1072 e1000g_fill_82544_desc(uint64_t address,
1073     size_t length, p_desc_array_t desc_array)
1074 {
1075 	/*
1076 	 * Since issue is sensitive to length and address.
1077 	 * Let us first check the address...
1078 	 */
1079 	uint32_t safe_terminator;
1080 
1081 	if (length <= 4) {
1082 		desc_array->descriptor[0].address = address;
1083 		desc_array->descriptor[0].length = (uint32_t)length;
1084 		desc_array->elements = 1;
1085 		return (desc_array->elements);
1086 	}
1087 	safe_terminator =
1088 	    (uint32_t)((((uint32_t)address & 0x7) +
1089 	    (length & 0xF)) & 0xF);
1090 	/*
1091 	 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then
1092 	 * return
1093 	 */
1094 	if (safe_terminator == 0 ||
1095 	    (safe_terminator > 4 && safe_terminator < 9) ||
1096 	    (safe_terminator > 0xC && safe_terminator <= 0xF)) {
1097 		desc_array->descriptor[0].address = address;
1098 		desc_array->descriptor[0].length = (uint32_t)length;
1099 		desc_array->elements = 1;
1100 		return (desc_array->elements);
1101 	}
1102 
1103 	desc_array->descriptor[0].address = address;
1104 	desc_array->descriptor[0].length = length - 4;
1105 	desc_array->descriptor[1].address = address + (length - 4);
1106 	desc_array->descriptor[1].length = 4;
1107 	desc_array->elements = 2;
1108 	return (desc_array->elements);
1109 }
1110 
1111 static int
1112 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet,
1113     mblk_t *mp, boolean_t tx_undersize_flag)
1114 {
1115 	size_t len;
1116 	size_t len1;
1117 	dma_buffer_t *tx_buf;
1118 	mblk_t *nmp;
1119 	boolean_t finished;
1120 	int desc_count;
1121 
1122 	desc_count = 0;
1123 	tx_buf = packet->tx_buf;
1124 	len = MBLKL(mp);
1125 
1126 	ASSERT((tx_buf->len + len) <= tx_buf->size);
1127 
1128 	if (len > 0) {
1129 		bcopy(mp->b_rptr,
1130 		    tx_buf->address + tx_buf->len,
1131 		    len);
1132 		tx_buf->len += len;
1133 
1134 		packet->num_mblk_frag++;
1135 	}
1136 
1137 	nmp = mp->b_cont;
1138 	if (nmp == NULL) {
1139 		finished = B_TRUE;
1140 	} else {
1141 		len1 = MBLKL(nmp);
1142 		if ((tx_buf->len + len1) > tx_buf->size)
1143 			finished = B_TRUE;
1144 		else if (tx_undersize_flag)
1145 			finished = B_FALSE;
1146 		else if (len1 > tx_ring->adapter->tx_bcopy_thresh)
1147 			finished = B_TRUE;
1148 		else
1149 			finished = B_FALSE;
1150 	}
1151 
1152 	if (finished) {
1153 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy,
1154 		    (tx_buf->len > len));
1155 
1156 		/*
1157 		 * If the packet is smaller than 64 bytes, which is the
1158 		 * minimum ethernet packet size, pad the packet to make
1159 		 * it at least 60 bytes. The hardware will add 4 bytes
1160 		 * for CRC.
1161 		 */
1162 		if (tx_undersize_flag) {
1163 			ASSERT(tx_buf->len < ETHERMIN);
1164 
1165 			bzero(tx_buf->address + tx_buf->len,
1166 			    ETHERMIN - tx_buf->len);
1167 			tx_buf->len = ETHERMIN;
1168 		}
1169 
1170 #ifdef __sparc
1171 		if (packet->dma_type == USE_DVMA)
1172 			dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV);
1173 		else
1174 			(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1175 			    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1176 #else
1177 		(void) ddi_dma_sync(tx_buf->dma_handle, 0,
1178 		    tx_buf->len, DDI_DMA_SYNC_FORDEV);
1179 #endif
1180 
1181 		packet->data_transfer_type = USE_BCOPY;
1182 
1183 		desc_count = e1000g_fill_tx_desc(tx_ring,
1184 		    packet,
1185 		    tx_buf->dma_address,
1186 		    tx_buf->len);
1187 
1188 		if (desc_count <= 0)
1189 			return (-1);
1190 	}
1191 
1192 	return (desc_count);
1193 }
1194 
1195 static int
1196 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp)
1197 {
1198 	int j;
1199 	int mystat;
1200 	size_t len;
1201 	ddi_dma_cookie_t dma_cookie;
1202 	uint_t ncookies;
1203 	int desc_count;
1204 	uint32_t desc_total;
1205 
1206 	desc_total = 0;
1207 	len = MBLKL(mp);
1208 
1209 	/*
1210 	 * ddi_dma_addr_bind_handle() allocates  DMA  resources  for  a
1211 	 * memory  object such that a device can perform DMA to or from
1212 	 * the object.  DMA resources  are  allocated  considering  the
1213 	 * device's  DMA  attributes  as  expressed by ddi_dma_attr(9S)
1214 	 * (see ddi_dma_alloc_handle(9F)).
1215 	 *
1216 	 * ddi_dma_addr_bind_handle() fills in  the  first  DMA  cookie
1217 	 * pointed  to by cookiep with the appropriate address, length,
1218 	 * and bus type. *ccountp is set to the number of DMA  cookies
1219 	 * representing this DMA object. Subsequent DMA cookies must be
1220 	 * retrieved by calling ddi_dma_nextcookie(9F)  the  number  of
1221 	 * times specified by *countp - 1.
1222 	 */
1223 	switch (packet->dma_type) {
1224 #ifdef __sparc
1225 	case USE_DVMA:
1226 		dvma_kaddr_load(packet->tx_dma_handle,
1227 		    (caddr_t)mp->b_rptr, len, 0, &dma_cookie);
1228 
1229 		dvma_sync(packet->tx_dma_handle, 0,
1230 		    DDI_DMA_SYNC_FORDEV);
1231 
1232 		ncookies = 1;
1233 		packet->data_transfer_type = USE_DVMA;
1234 		break;
1235 #endif
1236 	case USE_DMA:
1237 		if ((mystat = ddi_dma_addr_bind_handle(
1238 		    packet->tx_dma_handle, NULL,
1239 		    (caddr_t)mp->b_rptr, len,
1240 		    DDI_DMA_WRITE | DDI_DMA_STREAMING,
1241 		    DDI_DMA_DONTWAIT, 0, &dma_cookie,
1242 		    &ncookies)) != DDI_DMA_MAPPED) {
1243 
1244 			e1000g_log(tx_ring->adapter, CE_WARN,
1245 			    "Couldn't bind mblk buffer to Tx DMA handle: "
1246 			    "return: %X, Pkt: %X\n",
1247 			    mystat, packet);
1248 			return (-1);
1249 		}
1250 
1251 		/*
1252 		 * An implicit ddi_dma_sync() is done when the
1253 		 * ddi_dma_addr_bind_handle() is called. So we
1254 		 * don't need to explicitly call ddi_dma_sync()
1255 		 * here any more.
1256 		 */
1257 		ASSERT(ncookies);
1258 		E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie,
1259 		    (ncookies > 1));
1260 
1261 		/*
1262 		 * The data_transfer_type value must be set after the handle
1263 		 * has been bound, for it will be used in e1000g_free_tx_swpkt()
1264 		 * to decide whether we need to unbind the handle.
1265 		 */
1266 		packet->data_transfer_type = USE_DMA;
1267 		break;
1268 	default:
1269 		ASSERT(B_FALSE);
1270 		break;
1271 	}
1272 
1273 	packet->num_mblk_frag++;
1274 
1275 	/*
1276 	 * Each address could span thru multpile cookie..
1277 	 * Each cookie will have one descriptor
1278 	 */
1279 	for (j = ncookies; j != 0; j--) {
1280 
1281 		desc_count = e1000g_fill_tx_desc(tx_ring,
1282 		    packet,
1283 		    dma_cookie.dmac_laddress,
1284 		    dma_cookie.dmac_size);
1285 
1286 		if (desc_count <= 0)
1287 			return (-1);
1288 
1289 		desc_total += desc_count;
1290 
1291 		/*
1292 		 * ddi_dma_nextcookie() retrieves subsequent DMA
1293 		 * cookies for a DMA object.
1294 		 * ddi_dma_nextcookie() fills in the
1295 		 * ddi_dma_cookie(9S) structure pointed to by
1296 		 * cookiep.  The ddi_dma_cookie(9S) structure
1297 		 * must be allocated prior to calling
1298 		 * ddi_dma_nextcookie(). The DMA cookie count
1299 		 * returned by ddi_dma_buf_bind_handle(9F),
1300 		 * ddi_dma_addr_bind_handle(9F), or
1301 		 * ddi_dma_getwin(9F) indicates the number of DMA
1302 		 * cookies a DMA object consists of.  If the
1303 		 * resulting cookie count, N, is larger than 1,
1304 		 * ddi_dma_nextcookie() must be called N-1 times
1305 		 * to retrieve all DMA cookies.
1306 		 */
1307 		if (j > 1) {
1308 			ddi_dma_nextcookie(packet->tx_dma_handle,
1309 			    &dma_cookie);
1310 		}
1311 	}
1312 
1313 	return (desc_total);
1314 }
1315 
1316 static void
1317 e1000g_fill_context_descriptor(context_data_t *cur_context,
1318     struct e1000_context_desc *context_desc)
1319 {
1320 	if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) {
1321 		context_desc->lower_setup.ip_fields.ipcss =
1322 		    cur_context->ether_header_size;
1323 		context_desc->lower_setup.ip_fields.ipcso =
1324 		    cur_context->ether_header_size +
1325 		    offsetof(struct ip, ip_sum);
1326 		context_desc->lower_setup.ip_fields.ipcse =
1327 		    cur_context->ether_header_size +
1328 		    cur_context->cksum_start - 1;
1329 	} else
1330 		context_desc->lower_setup.ip_config = 0;
1331 
1332 	if (cur_context->cksum_flags & HCK_PARTIALCKSUM) {
1333 		/*
1334 		 * The packet with same protocol has the following
1335 		 * stuff and start offset:
1336 		 * |  Protocol  | Stuff  | Start  | Checksum
1337 		 * |		| Offset | Offset | Enable
1338 		 * | IPv4 + TCP |  0x24  |  0x14  |  Yes
1339 		 * | IPv4 + UDP |  0x1A  |  0x14  |  Yes
1340 		 * | IPv6 + TCP |  0x20  |  0x10  |  No
1341 		 * | IPv6 + UDP |  0x14  |  0x10  |  No
1342 		 */
1343 		context_desc->upper_setup.tcp_fields.tucss =
1344 		    cur_context->cksum_start + cur_context->ether_header_size;
1345 		context_desc->upper_setup.tcp_fields.tucso =
1346 		    cur_context->cksum_stuff + cur_context->ether_header_size;
1347 		context_desc->upper_setup.tcp_fields.tucse = 0;
1348 	} else
1349 		context_desc->upper_setup.tcp_config = 0;
1350 
1351 	if (cur_context->lso_flag) {
1352 		context_desc->tcp_seg_setup.fields.mss = cur_context->mss;
1353 		context_desc->tcp_seg_setup.fields.hdr_len =
1354 		    cur_context->hdr_len;
1355 		/*
1356 		 * workaround for 82546EB errata 23, status-writeback
1357 		 * reporting (RS) should not be set on context or
1358 		 * Null descriptors
1359 		 */
1360 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1361 		    | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP
1362 		    | E1000_TXD_DTYP_C | cur_context->pay_len;
1363 	} else {
1364 		context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1365 		    | E1000_TXD_DTYP_C;
1366 		/*
1367 		 * Zero out the options for TCP Segmentation Offload
1368 		 */
1369 		context_desc->tcp_seg_setup.data = 0;
1370 	}
1371 }
1372 
1373 static int
1374 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring,
1375     p_tx_sw_packet_t packet, uint64_t address, size_t size)
1376 {
1377 	struct e1000_hw *hw = &tx_ring->adapter->shared;
1378 	p_sw_desc_t desc;
1379 
1380 	if (hw->mac.type == e1000_82544) {
1381 		if (hw->bus.type == e1000_bus_type_pcix)
1382 			return (e1000g_tx_workaround_PCIX_82544(packet,
1383 			    address, size));
1384 
1385 		if (size > JUMBO_FRAG_LENGTH)
1386 			return (e1000g_tx_workaround_jumbo_82544(packet,
1387 			    address, size));
1388 	}
1389 
1390 	ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1391 
1392 	desc = &packet->desc[packet->num_desc];
1393 	desc->address = address;
1394 	desc->length = (uint32_t)size;
1395 
1396 	packet->num_desc++;
1397 
1398 	return (1);
1399 }
1400 
1401 static int
1402 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet,
1403     uint64_t address, size_t size)
1404 {
1405 	p_sw_desc_t desc;
1406 	int desc_count;
1407 	long size_left;
1408 	size_t len;
1409 	uint32_t counter;
1410 	uint32_t array_elements;
1411 	desc_array_t desc_array;
1412 
1413 	/*
1414 	 * Coexist Workaround for cordova: RP: 07/04/03
1415 	 *
1416 	 * RP: ERRATA: Workaround ISSUE:
1417 	 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup
1418 	 * Eachbuffer in to 8kb pieces until the
1419 	 * remainder is < 8kb
1420 	 */
1421 	size_left = size;
1422 	desc_count = 0;
1423 
1424 	while (size_left > 0) {
1425 		if (size_left > MAX_TX_BUF_SIZE)
1426 			len = MAX_TX_BUF_SIZE;
1427 		else
1428 			len = size_left;
1429 
1430 		array_elements = e1000g_fill_82544_desc(address,
1431 		    len, &desc_array);
1432 
1433 		for (counter = 0; counter < array_elements; counter++) {
1434 			ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1435 			/*
1436 			 * Put in the buffer address
1437 			 */
1438 			desc = &packet->desc[packet->num_desc];
1439 
1440 			desc->address =
1441 			    desc_array.descriptor[counter].address;
1442 			desc->length =
1443 			    desc_array.descriptor[counter].length;
1444 
1445 			packet->num_desc++;
1446 			desc_count++;
1447 		} /* for */
1448 
1449 		/*
1450 		 * Update the buffer address and length
1451 		 */
1452 		address += MAX_TX_BUF_SIZE;
1453 		size_left -= MAX_TX_BUF_SIZE;
1454 	} /* while */
1455 
1456 	return (desc_count);
1457 }
1458 
1459 static int
1460 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet,
1461     uint64_t address, size_t size)
1462 {
1463 	p_sw_desc_t desc;
1464 	int desc_count;
1465 	long size_left;
1466 	uint32_t offset;
1467 
1468 	/*
1469 	 * Workaround for Jumbo Frames on Cordova
1470 	 * PSD 06/01/2001
1471 	 */
1472 	size_left = size;
1473 	desc_count = 0;
1474 	offset = 0;
1475 	while (size_left > 0) {
1476 		ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1477 
1478 		desc = &packet->desc[packet->num_desc];
1479 
1480 		desc->address = address + offset;
1481 
1482 		if (size_left > JUMBO_FRAG_LENGTH)
1483 			desc->length = JUMBO_FRAG_LENGTH;
1484 		else
1485 			desc->length = (uint32_t)size_left;
1486 
1487 		packet->num_desc++;
1488 		desc_count++;
1489 
1490 		offset += desc->length;
1491 		size_left -= JUMBO_FRAG_LENGTH;
1492 	}
1493 
1494 	return (desc_count);
1495 }
1496 
1497 #pragma inline(e1000g_82547_tx_move_tail_work)
1498 
1499 static void
1500 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring)
1501 {
1502 	struct e1000_hw *hw;
1503 	uint16_t hw_tdt;
1504 	uint16_t sw_tdt;
1505 	struct e1000_tx_desc *tx_desc;
1506 	uint16_t length = 0;
1507 	boolean_t eop = B_FALSE;
1508 	struct e1000g *Adapter;
1509 
1510 	Adapter = tx_ring->adapter;
1511 	hw = &Adapter->shared;
1512 
1513 	hw_tdt = E1000_READ_REG(hw, E1000_TDT(0));
1514 	sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first;
1515 
1516 	while (hw_tdt != sw_tdt) {
1517 		tx_desc = &(tx_ring->tbd_first[hw_tdt]);
1518 		length += tx_desc->lower.flags.length;
1519 		eop = tx_desc->lower.data & E1000_TXD_CMD_EOP;
1520 		if (++hw_tdt == Adapter->tx_desc_num)
1521 			hw_tdt = 0;
1522 
1523 		if (eop) {
1524 			if ((Adapter->link_duplex == HALF_DUPLEX) &&
1525 			    (e1000_fifo_workaround_82547(hw, length)
1526 			    != E1000_SUCCESS)) {
1527 				if (tx_ring->timer_enable_82547) {
1528 					ASSERT(tx_ring->timer_id_82547 == 0);
1529 					tx_ring->timer_id_82547 =
1530 					    timeout(e1000g_82547_timeout,
1531 					    (void *)tx_ring,
1532 					    drv_usectohz(10000));
1533 				}
1534 				return;
1535 
1536 			} else {
1537 				E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt);
1538 				e1000_update_tx_fifo_head_82547(hw, length);
1539 				length = 0;
1540 			}
1541 		}
1542 	}
1543 }
1544 
1545 static void
1546 e1000g_82547_timeout(void *arg)
1547 {
1548 	e1000g_tx_ring_t *tx_ring;
1549 
1550 	tx_ring = (e1000g_tx_ring_t *)arg;
1551 
1552 	mutex_enter(&tx_ring->tx_lock);
1553 
1554 	tx_ring->timer_id_82547 = 0;
1555 	e1000g_82547_tx_move_tail_work(tx_ring);
1556 
1557 	mutex_exit(&tx_ring->tx_lock);
1558 }
1559 
1560 static void
1561 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring)
1562 {
1563 	timeout_id_t tid;
1564 
1565 	ASSERT(MUTEX_HELD(&tx_ring->tx_lock));
1566 
1567 	tid = tx_ring->timer_id_82547;
1568 	tx_ring->timer_id_82547 = 0;
1569 	if (tid != 0) {
1570 		tx_ring->timer_enable_82547 = B_FALSE;
1571 		mutex_exit(&tx_ring->tx_lock);
1572 
1573 		(void) untimeout(tid);
1574 
1575 		mutex_enter(&tx_ring->tx_lock);
1576 	}
1577 	tx_ring->timer_enable_82547 = B_TRUE;
1578 	e1000g_82547_tx_move_tail_work(tx_ring);
1579 }
1580