xref: /titanic_44/usr/src/uts/common/io/e1000g/e1000g_rx.c (revision 3f1e69bef33050bee99ea1e9992af13fc467281f)
1 /*
2  * This file is provided under a CDDLv1 license.  When using or
3  * redistributing this file, you may do so under this license.
4  * In redistributing this file this license must be included
5  * and no other modification of this header file is permitted.
6  *
7  * CDDL LICENSE SUMMARY
8  *
9  * Copyright(c) 1999 - 2008 Intel Corporation. All rights reserved.
10  *
11  * The contents of this file are subject to the terms of Version
12  * 1.0 of the Common Development and Distribution License (the "License").
13  *
14  * You should have received a copy of the License with this software.
15  * You can obtain a copy of the License at
16  *	http://www.opensolaris.org/os/licensing.
17  * See the License for the specific language governing permissions
18  * and limitations under the License.
19  */
20 
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms of the CDDLv1.
24  */
25 
26 /*
27  * **********************************************************************
28  *									*
29  * Module Name:								*
30  *   e1000g_rx.c							*
31  *									*
32  * Abstract:								*
33  *   This file contains some routines that take care of Receive		*
34  *   interrupt and also for the received packets it sends up to		*
35  *   upper layer.							*
36  *   It tries to do a zero copy if free buffers are available in	*
37  *   the pool.								*
38  *									*
39  * **********************************************************************
40  */
41 
42 #include "e1000g_sw.h"
43 #include "e1000g_debug.h"
44 
45 static p_rx_sw_packet_t e1000g_get_buf(e1000g_rx_ring_t *rx_ring);
46 #pragma	inline(e1000g_get_buf)
47 static void e1000g_priv_devi_list_clean();
48 
49 /*
50  * e1000g_rxfree_func - the call-back function to reclaim rx buffer
51  *
52  * This function is called when an mp is freed by the user thru
53  * freeb call (Only for mp constructed through desballoc call)
54  * It returns back the freed buffer to the freelist
55  */
56 void
57 e1000g_rxfree_func(p_rx_sw_packet_t packet)
58 {
59 	e1000g_rx_ring_t *rx_ring;
60 
61 	rx_ring = (e1000g_rx_ring_t *)(uintptr_t)packet->rx_ring;
62 
63 	/*
64 	 * Here the rx recycling processes different rx packets in different
65 	 * threads, so we protect it with RW_READER to ensure it won't block
66 	 * other rx recycling threads.
67 	 */
68 	rw_enter(&e1000g_rx_detach_lock, RW_READER);
69 
70 	if (packet->flag == E1000G_RX_SW_FREE) {
71 		rw_exit(&e1000g_rx_detach_lock);
72 		return;
73 	}
74 
75 	if (packet->flag == E1000G_RX_SW_STOP) {
76 		packet->flag = E1000G_RX_SW_FREE;
77 		rw_exit(&e1000g_rx_detach_lock);
78 
79 		rw_enter(&e1000g_rx_detach_lock, RW_WRITER);
80 		rx_ring->pending_count--;
81 		e1000g_mblks_pending--;
82 
83 		if (rx_ring->pending_count == 0) {
84 			while (rx_ring->pending_list != NULL) {
85 				packet = rx_ring->pending_list;
86 				rx_ring->pending_list =
87 				    rx_ring->pending_list->next;
88 
89 				ASSERT(packet->mp == NULL);
90 				e1000g_free_rx_sw_packet(packet);
91 			}
92 		}
93 
94 		/*
95 		 * If e1000g_force_detach is enabled, we need to clean up
96 		 * the idle priv_dip entries in the private dip list while
97 		 * e1000g_mblks_pending is zero.
98 		 */
99 		if (e1000g_force_detach && (e1000g_mblks_pending == 0))
100 			e1000g_priv_devi_list_clean();
101 		rw_exit(&e1000g_rx_detach_lock);
102 		return;
103 	}
104 
105 	if (packet->flag == E1000G_RX_SW_DETACH) {
106 		packet->flag = E1000G_RX_SW_FREE;
107 		rw_exit(&e1000g_rx_detach_lock);
108 
109 		ASSERT(packet->mp == NULL);
110 		e1000g_free_rx_sw_packet(packet);
111 
112 		/*
113 		 * Here the e1000g_mblks_pending may be modified by different
114 		 * rx recycling threads simultaneously, so we need to protect
115 		 * it with RW_WRITER.
116 		 */
117 		rw_enter(&e1000g_rx_detach_lock, RW_WRITER);
118 		e1000g_mblks_pending--;
119 
120 		/*
121 		 * If e1000g_force_detach is enabled, we need to clean up
122 		 * the idle priv_dip entries in the private dip list while
123 		 * e1000g_mblks_pending is zero.
124 		 */
125 		if (e1000g_force_detach && (e1000g_mblks_pending == 0))
126 			e1000g_priv_devi_list_clean();
127 		rw_exit(&e1000g_rx_detach_lock);
128 		return;
129 	}
130 
131 	packet->flag = E1000G_RX_SW_FREE;
132 
133 	if (packet->mp == NULL) {
134 		/*
135 		 * Allocate a mblk that binds to the data buffer
136 		 */
137 		packet->mp = desballoc((unsigned char *)
138 		    packet->rx_buf->address - E1000G_IPALIGNROOM,
139 		    packet->rx_buf->size + E1000G_IPALIGNROOM,
140 		    BPRI_MED, &packet->free_rtn);
141 
142 		if (packet->mp != NULL) {
143 			packet->mp->b_rptr += E1000G_IPALIGNROOM;
144 			packet->mp->b_wptr += E1000G_IPALIGNROOM;
145 		} else {
146 			E1000G_STAT(rx_ring->stat_esballoc_fail);
147 		}
148 	}
149 
150 	mutex_enter(&rx_ring->freelist_lock);
151 	QUEUE_PUSH_TAIL(&rx_ring->free_list, &packet->Link);
152 	rx_ring->avail_freepkt++;
153 	mutex_exit(&rx_ring->freelist_lock);
154 
155 	rw_exit(&e1000g_rx_detach_lock);
156 }
157 
158 /*
159  * e1000g_priv_devi_list_clean - clean up e1000g_private_devi_list
160  *
161  * We will walk the e1000g_private_devi_list to free the entry marked
162  * with the E1000G_PRIV_DEVI_DETACH flag.
163  */
164 static void
165 e1000g_priv_devi_list_clean()
166 {
167 	private_devi_list_t *devi_node, *devi_del;
168 
169 	if (e1000g_private_devi_list == NULL)
170 		return;
171 
172 	devi_node = e1000g_private_devi_list;
173 	while ((devi_node != NULL) &&
174 	    (devi_node->flag == E1000G_PRIV_DEVI_DETACH)) {
175 		e1000g_private_devi_list = devi_node->next;
176 		kmem_free(devi_node->priv_dip,
177 		    sizeof (struct dev_info));
178 		kmem_free(devi_node,
179 		    sizeof (private_devi_list_t));
180 		devi_node = e1000g_private_devi_list;
181 	}
182 	if (e1000g_private_devi_list == NULL)
183 		return;
184 	while (devi_node->next != NULL) {
185 		if (devi_node->next->flag == E1000G_PRIV_DEVI_DETACH) {
186 			devi_del = devi_node->next;
187 			devi_node->next = devi_del->next;
188 			kmem_free(devi_del->priv_dip,
189 			    sizeof (struct dev_info));
190 			kmem_free(devi_del,
191 			    sizeof (private_devi_list_t));
192 		} else {
193 			devi_node = devi_node->next;
194 		}
195 	}
196 }
197 
198 /*
199  * e1000g_rx_setup - setup rx data structures
200  *
201  * This routine initializes all of the receive related
202  * structures. This includes the receive descriptors, the
203  * actual receive buffers, and the rx_sw_packet software
204  * structures.
205  */
206 void
207 e1000g_rx_setup(struct e1000g *Adapter)
208 {
209 	struct e1000_hw *hw;
210 	p_rx_sw_packet_t packet;
211 	struct e1000_rx_desc *descriptor;
212 	uint32_t buf_low;
213 	uint32_t buf_high;
214 	uint32_t reg_val;
215 	uint32_t rctl;
216 	uint32_t rxdctl;
217 	uint32_t ert;
218 	int i;
219 	int size;
220 	e1000g_rx_ring_t *rx_ring;
221 
222 	hw = &Adapter->shared;
223 	rx_ring = Adapter->rx_ring;
224 
225 	/*
226 	 * zero out all of the receive buffer descriptor memory
227 	 * assures any previous data or status is erased
228 	 */
229 	bzero(rx_ring->rbd_area,
230 	    sizeof (struct e1000_rx_desc) * Adapter->rx_desc_num);
231 
232 	if (!Adapter->rx_buffer_setup) {
233 		/* Init the list of "Receive Buffer" */
234 		QUEUE_INIT_LIST(&rx_ring->recv_list);
235 
236 		/* Init the list of "Free Receive Buffer" */
237 		QUEUE_INIT_LIST(&rx_ring->free_list);
238 
239 		/*
240 		 * Setup Receive list and the Free list. Note that
241 		 * the both were allocated in one packet area.
242 		 */
243 		packet = rx_ring->packet_area;
244 		descriptor = rx_ring->rbd_first;
245 
246 		for (i = 0; i < Adapter->rx_desc_num;
247 		    i++, packet = packet->next, descriptor++) {
248 			ASSERT(packet != NULL);
249 			ASSERT(descriptor != NULL);
250 			descriptor->buffer_addr =
251 			    packet->rx_buf->dma_address;
252 
253 			/* Add this rx_sw_packet to the receive list */
254 			QUEUE_PUSH_TAIL(&rx_ring->recv_list,
255 			    &packet->Link);
256 		}
257 
258 		for (i = 0; i < Adapter->rx_freelist_num;
259 		    i++, packet = packet->next) {
260 			ASSERT(packet != NULL);
261 			/* Add this rx_sw_packet to the free list */
262 			QUEUE_PUSH_TAIL(&rx_ring->free_list,
263 			    &packet->Link);
264 		}
265 		rx_ring->avail_freepkt = Adapter->rx_freelist_num;
266 
267 		Adapter->rx_buffer_setup = B_TRUE;
268 	} else {
269 		/* Setup the initial pointer to the first rx descriptor */
270 		packet = (p_rx_sw_packet_t)
271 		    QUEUE_GET_HEAD(&rx_ring->recv_list);
272 		descriptor = rx_ring->rbd_first;
273 
274 		for (i = 0; i < Adapter->rx_desc_num; i++) {
275 			ASSERT(packet != NULL);
276 			ASSERT(descriptor != NULL);
277 			descriptor->buffer_addr =
278 			    packet->rx_buf->dma_address;
279 
280 			/* Get next rx_sw_packet */
281 			packet = (p_rx_sw_packet_t)
282 			    QUEUE_GET_NEXT(&rx_ring->recv_list, &packet->Link);
283 			descriptor++;
284 		}
285 	}
286 
287 	E1000_WRITE_REG(&Adapter->shared, E1000_RDTR, Adapter->rx_intr_delay);
288 	E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
289 	    "E1000_RDTR: 0x%x\n", Adapter->rx_intr_delay);
290 	if (hw->mac.type >= e1000_82540) {
291 		E1000_WRITE_REG(&Adapter->shared, E1000_RADV,
292 		    Adapter->rx_intr_abs_delay);
293 		E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
294 		    "E1000_RADV: 0x%x\n", Adapter->rx_intr_abs_delay);
295 	}
296 
297 	/*
298 	 * Setup our descriptor pointers
299 	 */
300 	rx_ring->rbd_next = rx_ring->rbd_first;
301 
302 	size = Adapter->rx_desc_num * sizeof (struct e1000_rx_desc);
303 	E1000_WRITE_REG(hw, E1000_RDLEN(0), size);
304 	size = E1000_READ_REG(hw, E1000_RDLEN(0));
305 
306 	/* To get lower order bits */
307 	buf_low = (uint32_t)rx_ring->rbd_dma_addr;
308 	/* To get the higher order bits */
309 	buf_high = (uint32_t)(rx_ring->rbd_dma_addr >> 32);
310 
311 	E1000_WRITE_REG(hw, E1000_RDBAH(0), buf_high);
312 	E1000_WRITE_REG(hw, E1000_RDBAL(0), buf_low);
313 
314 	/*
315 	 * Setup our HW Rx Head & Tail descriptor pointers
316 	 */
317 	E1000_WRITE_REG(hw, E1000_RDT(0),
318 	    (uint32_t)(rx_ring->rbd_last - rx_ring->rbd_first));
319 	E1000_WRITE_REG(hw, E1000_RDH(0), 0);
320 
321 	/*
322 	 * Setup the Receive Control Register (RCTL), and ENABLE the
323 	 * receiver. The initial configuration is to: Enable the receiver,
324 	 * accept broadcasts, discard bad packets (and long packets),
325 	 * disable VLAN filter checking, set the receive descriptor
326 	 * minimum threshold size to 1/2, and the receive buffer size to
327 	 * 2k.
328 	 */
329 	rctl = E1000_RCTL_EN |		/* Enable Receive Unit */
330 	    E1000_RCTL_BAM |		/* Accept Broadcast Packets */
331 	    E1000_RCTL_LPE |		/* Large Packet Enable bit */
332 	    (hw->mac.mc_filter_type << E1000_RCTL_MO_SHIFT) |
333 	    E1000_RCTL_RDMTS_HALF |
334 	    E1000_RCTL_LBM_NO;		/* Loopback Mode = none */
335 
336 	if (Adapter->strip_crc)
337 		rctl |= E1000_RCTL_SECRC;	/* Strip Ethernet CRC */
338 
339 	if ((hw->mac.type == e1000_82545) ||
340 	    (hw->mac.type == e1000_82546) ||
341 	    (hw->mac.type == e1000_82546_rev_3)) {
342 		rctl |= E1000_RCTL_SZ_2048;
343 	} else {
344 		if ((Adapter->max_frame_size > FRAME_SIZE_UPTO_2K) &&
345 		    (Adapter->max_frame_size <= FRAME_SIZE_UPTO_4K))
346 			rctl |= E1000_RCTL_SZ_4096 | E1000_RCTL_BSEX;
347 		else if ((Adapter->max_frame_size > FRAME_SIZE_UPTO_4K) &&
348 		    (Adapter->max_frame_size <= FRAME_SIZE_UPTO_8K))
349 			rctl |= E1000_RCTL_SZ_8192 | E1000_RCTL_BSEX;
350 		else if ((Adapter->max_frame_size > FRAME_SIZE_UPTO_8K) &&
351 		    (Adapter->max_frame_size <= FRAME_SIZE_UPTO_16K))
352 			rctl |= E1000_RCTL_SZ_16384 | E1000_RCTL_BSEX;
353 		else
354 			rctl |= E1000_RCTL_SZ_2048;
355 	}
356 
357 	if (e1000_tbi_sbp_enabled_82543(hw))
358 		rctl |= E1000_RCTL_SBP;
359 
360 	/*
361 	 * Enable early receives on supported devices, only takes effect when
362 	 * packet size is equal or larger than the specified value (in 8 byte
363 	 * units), e.g. using jumbo frames when setting to E1000_ERT_2048
364 	 */
365 	if ((hw->mac.type == e1000_82573) ||
366 	    (hw->mac.type == e1000_82574) ||
367 	    (hw->mac.type == e1000_ich9lan) ||
368 	    (hw->mac.type == e1000_ich10lan)) {
369 
370 		ert = E1000_ERT_2048;
371 
372 		/*
373 		 * Special modification when ERT and
374 		 * jumbo frames are enabled
375 		 */
376 		if (Adapter->default_mtu > ETHERMTU) {
377 			rxdctl = E1000_READ_REG(hw, E1000_RXDCTL(0));
378 			E1000_WRITE_REG(hw, E1000_RXDCTL(0), rxdctl | 0x3);
379 			ert |= (1 << 13);
380 		}
381 
382 		E1000_WRITE_REG(hw, E1000_ERT, ert);
383 	}
384 
385 	reg_val =
386 	    E1000_RXCSUM_TUOFL |	/* TCP/UDP checksum offload Enable */
387 	    E1000_RXCSUM_IPOFL;		/* IP checksum offload Enable */
388 
389 	E1000_WRITE_REG(hw, E1000_RXCSUM, reg_val);
390 
391 	/*
392 	 * Workaround: Set bit 16 (IPv6_ExDIS) to disable the
393 	 * processing of received IPV6 extension headers
394 	 */
395 	if ((hw->mac.type == e1000_82571) || (hw->mac.type == e1000_82572)) {
396 		reg_val = E1000_READ_REG(hw, E1000_RFCTL);
397 		reg_val |= (E1000_RFCTL_IPV6_EX_DIS |
398 		    E1000_RFCTL_NEW_IPV6_EXT_DIS);
399 		E1000_WRITE_REG(hw, E1000_RFCTL, reg_val);
400 	}
401 
402 	/* Write to enable the receive unit */
403 	E1000_WRITE_REG(hw, E1000_RCTL, rctl);
404 }
405 
406 /*
407  * e1000g_get_buf - get an rx sw packet from the free_list
408  */
409 static p_rx_sw_packet_t
410 e1000g_get_buf(e1000g_rx_ring_t *rx_ring)
411 {
412 	p_rx_sw_packet_t packet;
413 
414 	mutex_enter(&rx_ring->freelist_lock);
415 	packet = (p_rx_sw_packet_t)
416 	    QUEUE_POP_HEAD(&rx_ring->free_list);
417 	if (packet != NULL)
418 		rx_ring->avail_freepkt--;
419 	mutex_exit(&rx_ring->freelist_lock);
420 
421 	return (packet);
422 }
423 
424 /*
425  * e1000g_receive - main receive routine
426  *
427  * This routine will process packets received in an interrupt
428  */
429 mblk_t *
430 e1000g_receive(struct e1000g *Adapter)
431 {
432 	struct e1000_hw *hw;
433 	mblk_t *nmp;
434 	mblk_t *ret_mp;
435 	mblk_t *ret_nmp;
436 	struct e1000_rx_desc *current_desc;
437 	struct e1000_rx_desc *last_desc;
438 	p_rx_sw_packet_t packet;
439 	p_rx_sw_packet_t newpkt;
440 	uint16_t length;
441 	uint32_t pkt_count;
442 	uint32_t desc_count;
443 	boolean_t accept_frame;
444 	boolean_t end_of_packet;
445 	boolean_t need_copy;
446 	e1000g_rx_ring_t *rx_ring;
447 	dma_buffer_t *rx_buf;
448 	uint16_t cksumflags;
449 
450 	ret_mp = NULL;
451 	ret_nmp = NULL;
452 	pkt_count = 0;
453 	desc_count = 0;
454 	cksumflags = 0;
455 
456 	hw = &Adapter->shared;
457 	rx_ring = Adapter->rx_ring;
458 
459 	/* Sync the Rx descriptor DMA buffers */
460 	(void) ddi_dma_sync(rx_ring->rbd_dma_handle,
461 	    0, 0, DDI_DMA_SYNC_FORKERNEL);
462 
463 	if (e1000g_check_dma_handle(rx_ring->rbd_dma_handle) != DDI_FM_OK) {
464 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
465 		Adapter->chip_state = E1000G_ERROR;
466 	}
467 
468 	current_desc = rx_ring->rbd_next;
469 	if (!(current_desc->status & E1000_RXD_STAT_DD)) {
470 		/*
471 		 * don't send anything up. just clear the RFD
472 		 */
473 		E1000G_DEBUG_STAT(rx_ring->stat_none);
474 		return (ret_mp);
475 	}
476 
477 	/*
478 	 * Loop through the receive descriptors starting at the last known
479 	 * descriptor owned by the hardware that begins a packet.
480 	 */
481 	while ((current_desc->status & E1000_RXD_STAT_DD) &&
482 	    (pkt_count < Adapter->rx_limit_onintr)) {
483 
484 		desc_count++;
485 		/*
486 		 * Now this can happen in Jumbo frame situation.
487 		 */
488 		if (current_desc->status & E1000_RXD_STAT_EOP) {
489 			/* packet has EOP set */
490 			end_of_packet = B_TRUE;
491 		} else {
492 			/*
493 			 * If this received buffer does not have the
494 			 * End-Of-Packet bit set, the received packet
495 			 * will consume multiple buffers. We won't send this
496 			 * packet upstack till we get all the related buffers.
497 			 */
498 			end_of_packet = B_FALSE;
499 		}
500 
501 		/*
502 		 * Get a pointer to the actual receive buffer
503 		 * The mp->b_rptr is mapped to The CurrentDescriptor
504 		 * Buffer Address.
505 		 */
506 		packet =
507 		    (p_rx_sw_packet_t)QUEUE_GET_HEAD(&rx_ring->recv_list);
508 		ASSERT(packet != NULL);
509 
510 		rx_buf = packet->rx_buf;
511 
512 		length = current_desc->length;
513 
514 #ifdef __sparc
515 		if (packet->dma_type == USE_DVMA)
516 			dvma_sync(rx_buf->dma_handle, 0,
517 			    DDI_DMA_SYNC_FORKERNEL);
518 		else
519 			(void) ddi_dma_sync(rx_buf->dma_handle,
520 			    E1000G_IPALIGNROOM, length,
521 			    DDI_DMA_SYNC_FORKERNEL);
522 #else
523 		(void) ddi_dma_sync(rx_buf->dma_handle,
524 		    E1000G_IPALIGNROOM, length,
525 		    DDI_DMA_SYNC_FORKERNEL);
526 #endif
527 
528 		if (e1000g_check_dma_handle(
529 		    rx_buf->dma_handle) != DDI_FM_OK) {
530 			ddi_fm_service_impact(Adapter->dip,
531 			    DDI_SERVICE_DEGRADED);
532 			Adapter->chip_state = E1000G_ERROR;
533 		}
534 
535 		accept_frame = (current_desc->errors == 0) ||
536 		    ((current_desc->errors &
537 		    (E1000_RXD_ERR_TCPE | E1000_RXD_ERR_IPE)) != 0);
538 
539 		if (hw->mac.type == e1000_82543) {
540 			unsigned char last_byte;
541 
542 			last_byte =
543 			    *((unsigned char *)rx_buf->address + length - 1);
544 
545 			if (TBI_ACCEPT(hw,
546 			    current_desc->status, current_desc->errors,
547 			    current_desc->length, last_byte,
548 			    Adapter->min_frame_size, Adapter->max_frame_size)) {
549 
550 				e1000_tbi_adjust_stats(Adapter,
551 				    length, hw->mac.addr);
552 
553 				length--;
554 				accept_frame = B_TRUE;
555 			} else if (e1000_tbi_sbp_enabled_82543(hw) &&
556 			    (current_desc->errors == E1000_RXD_ERR_CE)) {
557 				accept_frame = B_TRUE;
558 			}
559 		}
560 
561 		/*
562 		 * Indicate the packet to the NOS if it was good.
563 		 * Normally, hardware will discard bad packets for us.
564 		 * Check for the packet to be a valid Ethernet packet
565 		 */
566 		if (!accept_frame) {
567 			/*
568 			 * error in incoming packet, either the packet is not a
569 			 * ethernet size packet, or the packet has an error. In
570 			 * either case, the packet will simply be discarded.
571 			 */
572 			E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
573 			    "Process Receive Interrupts: Error in Packet\n");
574 
575 			E1000G_STAT(rx_ring->stat_error);
576 			/*
577 			 * Returning here as we are done here. There is
578 			 * no point in waiting for while loop to elapse
579 			 * and the things which were done. More efficient
580 			 * and less error prone...
581 			 */
582 			goto rx_drop;
583 		}
584 
585 		/*
586 		 * If the Ethernet CRC is not stripped by the hardware,
587 		 * we need to strip it before sending it up to the stack.
588 		 */
589 		if (end_of_packet && !Adapter->strip_crc) {
590 			if (length > ETHERFCSL) {
591 				length -= ETHERFCSL;
592 			} else {
593 				/*
594 				 * If the fragment is smaller than the CRC,
595 				 * drop this fragment, do the processing of
596 				 * the end of the packet.
597 				 */
598 				ASSERT(rx_ring->rx_mblk_tail != NULL);
599 				rx_ring->rx_mblk_tail->b_wptr -=
600 				    ETHERFCSL - length;
601 				rx_ring->rx_mblk_len -=
602 				    ETHERFCSL - length;
603 
604 				QUEUE_POP_HEAD(&rx_ring->recv_list);
605 
606 				goto rx_end_of_packet;
607 			}
608 		}
609 
610 		need_copy = B_TRUE;
611 
612 		if (length <= Adapter->rx_bcopy_thresh)
613 			goto rx_copy;
614 
615 		/*
616 		 * Get the pre-constructed mblk that was associated
617 		 * to the receive data buffer.
618 		 */
619 		if (packet->mp == NULL) {
620 			packet->mp = desballoc((unsigned char *)
621 			    rx_buf->address - E1000G_IPALIGNROOM,
622 			    length + E1000G_IPALIGNROOM,
623 			    BPRI_MED, &packet->free_rtn);
624 
625 			if (packet->mp != NULL) {
626 				packet->mp->b_rptr += E1000G_IPALIGNROOM;
627 				packet->mp->b_wptr += E1000G_IPALIGNROOM;
628 			} else {
629 				E1000G_STAT(rx_ring->stat_esballoc_fail);
630 			}
631 		}
632 
633 		if (packet->mp != NULL) {
634 			/*
635 			 * We have two sets of buffer pool. One associated with
636 			 * the Rxdescriptors and other a freelist buffer pool.
637 			 * Each time we get a good packet, Try to get a buffer
638 			 * from the freelist pool using e1000g_get_buf. If we
639 			 * get free buffer, then replace the descriptor buffer
640 			 * address with the free buffer we just got, and pass
641 			 * the pre-constructed mblk upstack. (note no copying)
642 			 *
643 			 * If we failed to get a free buffer, then try to
644 			 * allocate a new buffer(mp) and copy the recv buffer
645 			 * content to our newly allocated buffer(mp). Don't
646 			 * disturb the desriptor buffer address. (note copying)
647 			 */
648 			newpkt = e1000g_get_buf(rx_ring);
649 
650 			if (newpkt != NULL) {
651 				/*
652 				 * Get the mblk associated to the data,
653 				 * and strip it off the sw packet.
654 				 */
655 				nmp = packet->mp;
656 				packet->mp = NULL;
657 				packet->flag = E1000G_RX_SW_SENDUP;
658 
659 				/*
660 				 * Now replace old buffer with the new
661 				 * one we got from free list
662 				 * Both the RxSwPacket as well as the
663 				 * Receive Buffer Descriptor will now
664 				 * point to this new packet.
665 				 */
666 				packet = newpkt;
667 
668 				current_desc->buffer_addr =
669 				    newpkt->rx_buf->dma_address;
670 
671 				need_copy = B_FALSE;
672 			} else {
673 				E1000G_DEBUG_STAT(rx_ring->stat_no_freepkt);
674 			}
675 		}
676 
677 rx_copy:
678 		if (need_copy) {
679 			/*
680 			 * No buffers available on free list,
681 			 * bcopy the data from the buffer and
682 			 * keep the original buffer. Dont want to
683 			 * do this.. Yack but no other way
684 			 */
685 			if ((nmp = allocb(length + E1000G_IPALIGNROOM,
686 			    BPRI_MED)) == NULL) {
687 				/*
688 				 * The system has no buffers available
689 				 * to send up the incoming packet, hence
690 				 * the packet will have to be processed
691 				 * when there're more buffers available.
692 				 */
693 				E1000G_STAT(rx_ring->stat_allocb_fail);
694 				goto rx_drop;
695 			}
696 			nmp->b_rptr += E1000G_IPALIGNROOM;
697 			nmp->b_wptr += E1000G_IPALIGNROOM;
698 			/*
699 			 * The free list did not have any buffers
700 			 * available, so, the received packet will
701 			 * have to be copied into a mp and the original
702 			 * buffer will have to be retained for future
703 			 * packet reception.
704 			 */
705 			bcopy(rx_buf->address, nmp->b_wptr, length);
706 		}
707 
708 		/*
709 		 * The rx_sw_packet MUST be popped off the
710 		 * RxSwPacketList before either a putnext or freemsg
711 		 * is done on the mp that has now been created by the
712 		 * desballoc. If not, it is possible that the free
713 		 * routine will get called from the interrupt context
714 		 * and try to put this packet on the free list
715 		 */
716 		(p_rx_sw_packet_t)QUEUE_POP_HEAD(&rx_ring->recv_list);
717 
718 		ASSERT(nmp != NULL);
719 		nmp->b_wptr += length;
720 
721 		if (rx_ring->rx_mblk == NULL) {
722 			/*
723 			 *  TCP/UDP checksum offload and
724 			 *  IP checksum offload
725 			 */
726 			if (!(current_desc->status & E1000_RXD_STAT_IXSM)) {
727 				/*
728 				 * Check TCP/UDP checksum
729 				 */
730 				if ((current_desc->status &
731 				    E1000_RXD_STAT_TCPCS) &&
732 				    !(current_desc->errors &
733 				    E1000_RXD_ERR_TCPE))
734 					cksumflags |= HCK_FULLCKSUM |
735 					    HCK_FULLCKSUM_OK;
736 				/*
737 				 * Check IP Checksum
738 				 */
739 				if ((current_desc->status &
740 				    E1000_RXD_STAT_IPCS) &&
741 				    !(current_desc->errors &
742 				    E1000_RXD_ERR_IPE))
743 					cksumflags |= HCK_IPV4_HDRCKSUM;
744 			}
745 		}
746 
747 		/*
748 		 * We need to maintain our packet chain in the global
749 		 * Adapter structure, for the Rx processing can end
750 		 * with a fragment that has no EOP set.
751 		 */
752 		if (rx_ring->rx_mblk == NULL) {
753 			/* Get the head of the message chain */
754 			rx_ring->rx_mblk = nmp;
755 			rx_ring->rx_mblk_tail = nmp;
756 			rx_ring->rx_mblk_len = length;
757 		} else {	/* Not the first packet */
758 			/* Continue adding buffers */
759 			rx_ring->rx_mblk_tail->b_cont = nmp;
760 			rx_ring->rx_mblk_tail = nmp;
761 			rx_ring->rx_mblk_len += length;
762 		}
763 		ASSERT(rx_ring->rx_mblk != NULL);
764 		ASSERT(rx_ring->rx_mblk_tail != NULL);
765 		ASSERT(rx_ring->rx_mblk_tail->b_cont == NULL);
766 
767 		/*
768 		 * Now this MP is ready to travel upwards but some more
769 		 * fragments are coming.
770 		 * We will send packet upwards as soon as we get EOP
771 		 * set on the packet.
772 		 */
773 		if (!end_of_packet) {
774 			/*
775 			 * continue to get the next descriptor,
776 			 * Tail would be advanced at the end
777 			 */
778 			goto rx_next_desc;
779 		}
780 
781 rx_end_of_packet:
782 		/*
783 		 * Found packet with EOP
784 		 * Process the last fragment.
785 		 */
786 		if (cksumflags != 0) {
787 			(void) hcksum_assoc(rx_ring->rx_mblk,
788 			    NULL, NULL, 0, 0, 0, 0, cksumflags, 0);
789 			cksumflags = 0;
790 		}
791 
792 		/*
793 		 * Count packets that span multi-descriptors
794 		 */
795 		E1000G_DEBUG_STAT_COND(rx_ring->stat_multi_desc,
796 		    (rx_ring->rx_mblk->b_cont != NULL));
797 
798 		/*
799 		 * Append to list to send upstream
800 		 */
801 		if (ret_mp == NULL) {
802 			ret_mp = ret_nmp = rx_ring->rx_mblk;
803 		} else {
804 			ret_nmp->b_next = rx_ring->rx_mblk;
805 			ret_nmp = rx_ring->rx_mblk;
806 		}
807 		ret_nmp->b_next = NULL;
808 
809 		rx_ring->rx_mblk = NULL;
810 		rx_ring->rx_mblk_tail = NULL;
811 		rx_ring->rx_mblk_len = 0;
812 
813 		pkt_count++;
814 
815 rx_next_desc:
816 		/*
817 		 * Zero out the receive descriptors status
818 		 */
819 		current_desc->status = 0;
820 
821 		if (current_desc == rx_ring->rbd_last)
822 			rx_ring->rbd_next = rx_ring->rbd_first;
823 		else
824 			rx_ring->rbd_next++;
825 
826 		last_desc = current_desc;
827 		current_desc = rx_ring->rbd_next;
828 
829 		/*
830 		 * Put the buffer that we just indicated back
831 		 * at the end of our list
832 		 */
833 		QUEUE_PUSH_TAIL(&rx_ring->recv_list,
834 		    &packet->Link);
835 	}	/* while loop */
836 
837 	/* Sync the Rx descriptor DMA buffers */
838 	(void) ddi_dma_sync(rx_ring->rbd_dma_handle,
839 	    0, 0, DDI_DMA_SYNC_FORDEV);
840 
841 	/*
842 	 * Advance the E1000's Receive Queue #0 "Tail Pointer".
843 	 */
844 	E1000_WRITE_REG(hw, E1000_RDT(0),
845 	    (uint32_t)(last_desc - rx_ring->rbd_first));
846 
847 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
848 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
849 		Adapter->chip_state = E1000G_ERROR;
850 	}
851 
852 	Adapter->rx_pkt_cnt = pkt_count;
853 
854 	return (ret_mp);
855 
856 rx_drop:
857 	/*
858 	 * Zero out the receive descriptors status
859 	 */
860 	current_desc->status = 0;
861 
862 	/* Sync the Rx descriptor DMA buffers */
863 	(void) ddi_dma_sync(rx_ring->rbd_dma_handle,
864 	    0, 0, DDI_DMA_SYNC_FORDEV);
865 
866 	if (current_desc == rx_ring->rbd_last)
867 		rx_ring->rbd_next = rx_ring->rbd_first;
868 	else
869 		rx_ring->rbd_next++;
870 
871 	last_desc = current_desc;
872 
873 	(p_rx_sw_packet_t)QUEUE_POP_HEAD(&rx_ring->recv_list);
874 
875 	QUEUE_PUSH_TAIL(&rx_ring->recv_list, &packet->Link);
876 	/*
877 	 * Reclaim all old buffers already allocated during
878 	 * Jumbo receives.....for incomplete reception
879 	 */
880 	if (rx_ring->rx_mblk != NULL) {
881 		freemsg(rx_ring->rx_mblk);
882 		rx_ring->rx_mblk = NULL;
883 		rx_ring->rx_mblk_tail = NULL;
884 		rx_ring->rx_mblk_len = 0;
885 	}
886 	/*
887 	 * Advance the E1000's Receive Queue #0 "Tail Pointer".
888 	 */
889 	E1000_WRITE_REG(hw, E1000_RDT(0),
890 	    (uint32_t)(last_desc - rx_ring->rbd_first));
891 
892 	if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
893 		ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
894 		Adapter->chip_state = E1000G_ERROR;
895 	}
896 
897 	return (ret_mp);
898 }
899