/*
 * CDDL HEADER START
 *
 * Copyright(c) 2007-2009 Intel Corporation. All rights reserved.
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include "igb_sw.h"

/* function prototypes */
static mblk_t *igb_rx_bind(igb_rx_data_t *, uint32_t, uint32_t);
static mblk_t *igb_rx_copy(igb_rx_data_t *, uint32_t, uint32_t);
static void igb_rx_assoc_hcksum(mblk_t *, uint32_t);

/*
 * igb_rx_recycle - the call-back function to reclaim rx buffer
 *
 * This function is called when an mblk is freed by the upper layer
 * through a freeb() call (only for mblks constructed via desballoc()).
 * It returns the freed buffer to the free list.
 */
void
igb_rx_recycle(caddr_t arg)
{
	igb_t *igb;
	igb_rx_ring_t *rx_ring;
	igb_rx_data_t	*rx_data;
	rx_control_block_t *recycle_rcb;
	uint32_t free_index;
	uint32_t ref_cnt;

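	/*
	 * The arg is the free_rtn argument registered with desballoc();
	 * it points to the rx control block that owns the buffer.
	 */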
	recycle_rcb = (rx_control_block_t *)(uintptr_t)arg;
	rx_data = recycle_rcb->rx_data;
	rx_ring = rx_data->rx_ring;
	igb = rx_ring->igb;

	if (recycle_rcb->ref_cnt == 0) {
		/*
		 * This case only happens when rx buffers are being freed
		 * in igb_stop() and freemsg() is called.
		 */
		return;
	}

	ASSERT(recycle_rcb->mp == NULL);

	/*
	 * Using the recycled data buffer to generate a new mblk
	 */
	recycle_rcb->mp = desballoc((unsigned char *)
	    recycle_rcb->rx_buf.address,
	    recycle_rcb->rx_buf.size,
	    0, &recycle_rcb->free_rtn);
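	/*
	 * desballoc() may fail, leaving mp NULL; in that case
	 * igb_rx_bind() retries the allocation when this control
	 * block is next used.
	 */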

	/*
	 * Put the recycled rx control block into free list
	 */
	mutex_enter(&rx_data->recycle_lock);

	free_index = rx_data->rcb_tail;
	ASSERT(rx_data->free_list[free_index] == NULL);

	rx_data->free_list[free_index] = recycle_rcb;
	rx_data->rcb_tail = NEXT_INDEX(free_index, 1, rx_data->free_list_size);

	mutex_exit(&rx_data->recycle_lock);

	/*
	 * The atomic operation on the number of available rx control
	 * blocks in the free list is used to make recycling mutually
	 * exclusive with receiving.
	 */
	atomic_inc_32(&rx_data->rcb_free);
	ASSERT(rx_data->rcb_free <= rx_data->free_list_size);

	/*
	 * Consider the case where the interface is unplumbed while some
	 * buffers are still held by the upper layer. When such a buffer
	 * is finally returned, we need to free it.
	 */
	ref_cnt = atomic_dec_32_nv(&recycle_rcb->ref_cnt);
	if (ref_cnt == 0) {
		if (recycle_rcb->mp != NULL) {
			freemsg(recycle_rcb->mp);
			recycle_rcb->mp = NULL;
		}

		igb_free_dma_buffer(&recycle_rcb->rx_buf);

		mutex_enter(&igb->rx_pending_lock);
		atomic_dec_32(&rx_data->rcb_pending);
		atomic_dec_32(&igb->rcb_pending);

		/*
		 * When no buffer belonging to this rx_data is still held
		 * by the upper layer, the rx_data itself can be freed.
		 */
		if ((rx_data->flag & IGB_RX_STOPPED) &&
		    (rx_data->rcb_pending == 0))
			igb_free_rx_ring_data(rx_data);

		mutex_exit(&igb->rx_pending_lock);
	}
}

/*
 * igb_rx_copy - Use copy to process the received packet
 *
 * This function uses bcopy() to copy the received packet into a
 * newly allocated mblk, which is then sent upstream.
 */
static mblk_t *
igb_rx_copy(igb_rx_data_t *rx_data, uint32_t index, uint32_t pkt_len)
{
	rx_control_block_t *current_rcb;
	mblk_t *mp;
	igb_t *igb = rx_data->rx_ring->igb;

	current_rcb = rx_data->work_list[index];

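	/*
	 * Sync the DMA buffer so the CPU sees the data just written
	 * by the hardware.
	 */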
	DMA_SYNC(&current_rcb->rx_buf, DDI_DMA_SYNC_FORKERNEL);

	if (igb_check_dma_handle(
	    current_rcb->rx_buf.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&igb->igb_state, IGB_ERROR);
		return (NULL);
	}

	/*
	 * Allocate buffer to receive this packet
	 */
	mp = allocb(pkt_len + IPHDR_ALIGN_ROOM, 0);
	if (mp == NULL) {
		igb_log(igb, IGB_LOG_INFO,
		    "igb_rx_copy: allocate buffer failed");
		return (NULL);
	}

	/*
	 * Copy the received data into the new mblk, advancing b_rptr by
	 * IPHDR_ALIGN_ROOM so that the IP header ends up aligned.
	 */
	mp->b_rptr += IPHDR_ALIGN_ROOM;
	bcopy(current_rcb->rx_buf.address, mp->b_rptr, pkt_len);
	mp->b_wptr = mp->b_rptr + pkt_len;

	return (mp);
}

/*
 * igb_rx_bind - Use existing DMA buffer to build mblk for receiving
 *
 * This function uses the pre-bound DMA buffer to receive the packet
 * and builds an mblk that will be sent upstream.
 */
static mblk_t *
igb_rx_bind(igb_rx_data_t *rx_data, uint32_t index, uint32_t pkt_len)
{
	rx_control_block_t *current_rcb;
	rx_control_block_t *free_rcb;
	uint32_t free_index;
	mblk_t *mp;
	igb_t *igb = rx_data->rx_ring->igb;

	/*
	 * If the free list is empty, we cannot proceed to send
	 * the current DMA buffer upstream. We'll have to return
	 * and use bcopy to process the packet.
	 */
	if (igb_atomic_reserve(&rx_data->rcb_free, 1) < 0)
		return (NULL);

	current_rcb = rx_data->work_list[index];
	/*
	 * If the mp of the rx control block is NULL, try to do
	 * desballoc again.
	 */
	if (current_rcb->mp == NULL) {
		current_rcb->mp = desballoc((unsigned char *)
		    current_rcb->rx_buf.address,
		    current_rcb->rx_buf.size,
		    0, &current_rcb->free_rtn);
		/*
		 * If building an mblk from the current DMA buffer fails,
		 * we have to return and use bcopy to process the packet.
		 */
		if (current_rcb->mp == NULL) {
			atomic_inc_32(&rx_data->rcb_free);
			return (NULL);
		}
	}
	/*
	 * Sync up the data received
	 */
	DMA_SYNC(&current_rcb->rx_buf, DDI_DMA_SYNC_FORKERNEL);

	if (igb_check_dma_handle(
	    current_rcb->rx_buf.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&igb->igb_state, IGB_ERROR);
		atomic_inc_32(&rx_data->rcb_free);
		return (NULL);
	}

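	/*
	 * Hand off the pre-built mblk and bump ref_cnt to record that
	 * the DMA buffer is now held by the upper layer; the reference
	 * is dropped in igb_rx_recycle() when the mblk is freed.
	 */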
	mp = current_rcb->mp;
	current_rcb->mp = NULL;
	atomic_inc_32(&current_rcb->ref_cnt);

	mp->b_wptr = mp->b_rptr + pkt_len;
	mp->b_next = mp->b_cont = NULL;

	/*
	 * Strip off one free rx control block from the free list
	 */
	free_index = rx_data->rcb_head;
	free_rcb = rx_data->free_list[free_index];
	ASSERT(free_rcb != NULL);
	rx_data->free_list[free_index] = NULL;
	rx_data->rcb_head = NEXT_INDEX(free_index, 1, rx_data->free_list_size);

	/*
	 * Put the free rx control block on the work list
	 */
	rx_data->work_list[index] = free_rcb;

	return (mp);
}

/*
 * igb_rx_assoc_hcksum
 *
 * Check the rx hardware checksum status and associate the hcksum flags
 */
static void
igb_rx_assoc_hcksum(mblk_t *mp, uint32_t status_error)
{
	uint32_t hcksum_flags = 0;

	/* Ignore Checksum Indication */
	if (status_error & E1000_RXD_STAT_IXSM)
		return;

	/*
	 * Check TCP/UDP checksum
	 */
	if (((status_error & E1000_RXD_STAT_TCPCS) ||
	    (status_error & E1000_RXD_STAT_UDPCS)) &&
	    !(status_error & E1000_RXDEXT_STATERR_TCPE))
		hcksum_flags |= HCK_FULLCKSUM_OK;

	/*
	 * Check IP Checksum
	 */
	if ((status_error & E1000_RXD_STAT_IPCS) &&
	    !(status_error & E1000_RXDEXT_STATERR_IPE))
		hcksum_flags |= HCK_IPV4_HDRCKSUM_OK;

	if (hcksum_flags != 0) {
		mac_hcksum_set(mp, 0, 0, 0, 0, hcksum_flags);
	}
}

mblk_t *
igb_rx_ring_poll(void *arg, int bytes)
{
	igb_rx_ring_t *rx_ring = (igb_rx_ring_t *)arg;
	mblk_t *mp = NULL;

	ASSERT(bytes >= 0);

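	/*
	 * Receive nothing if no byte budget was given, or if the
	 * interface is suspended or not yet started.
	 */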
	if ((bytes == 0) || (rx_ring->igb->igb_state & IGB_SUSPENDED) ||
	    !(rx_ring->igb->igb_state & IGB_STARTED))
		return (NULL);

	mutex_enter(&rx_ring->rx_lock);
	mp = igb_rx(rx_ring, bytes);
	mutex_exit(&rx_ring->rx_lock);

	return (mp);
}

/*
 * igb_rx - Receive the data of one ring
 *
 * This function goes through the h/w descriptors of the specified rx
 * ring and receives the data if the descriptor status shows the data
 * is ready. It returns a chain of mblks containing the received data,
 * to be passed up to mac_rx().
 */
mblk_t *
igb_rx(igb_rx_ring_t *rx_ring, int poll_bytes)
{
	union e1000_adv_rx_desc *current_rbd;
	rx_control_block_t *current_rcb;
	mblk_t *mp;
	mblk_t *mblk_head;
	mblk_t **mblk_tail;
	uint32_t rx_next;
	uint32_t rx_tail;
	uint32_t pkt_len;
	uint32_t status_error;
	uint32_t pkt_num;
	uint32_t total_bytes;
	igb_t *igb = rx_ring->igb;
	igb_rx_data_t *rx_data = rx_ring->rx_data;

	mblk_head = NULL;
	mblk_tail = &mblk_head;

	if (igb->igb_state & IGB_ERROR)
		return (NULL);

	/*
	 * Sync the receive descriptors before
	 * accepting the packets
	 */
	DMA_SYNC(&rx_data->rbd_area, DDI_DMA_SYNC_FORKERNEL);

	if (igb_check_dma_handle(
	    rx_data->rbd_area.dma_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&igb->igb_state, IGB_ERROR);
		return (NULL);
	}

	/*
	 * Get the start point of the rx bd ring to be examined
	 * during this cycle.
	 */
	rx_next = rx_data->rbd_next;

	current_rbd = &rx_data->rbd_ring[rx_next];
	pkt_num = 0;
	total_bytes = 0;
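	/*
	 * Process descriptors until one is found without the DD
	 * (descriptor done) bit set.
	 */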
	status_error = current_rbd->wb.upper.status_error;
	while (status_error & E1000_RXD_STAT_DD) {
		/*
		 * If the hardware has flagged an error but it is only a
		 * checksum error, do not discard the packet here; let the
		 * upper layer recompute the checksum. Otherwise discard
		 * the packet.
		 */
		if ((status_error & E1000_RXDEXT_ERR_FRAME_ERR_MASK) ||
		    !(status_error & E1000_RXD_STAT_EOP)) {
			IGB_DEBUG_STAT(rx_ring->stat_frame_error);
			goto rx_discard;
		}

		IGB_DEBUG_STAT_COND(rx_ring->stat_cksum_error,
		    (status_error & E1000_RXDEXT_STATERR_TCPE) ||
		    (status_error & E1000_RXDEXT_STATERR_IPE));

		pkt_len = current_rbd->wb.upper.length;

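		/*
		 * In polling mode, stop before the byte budget passed in
		 * from the mac layer would be exceeded.
		 */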
		if ((poll_bytes != IGB_NO_POLL) &&
		    ((pkt_len + total_bytes) > poll_bytes))
			break;

		IGB_DEBUG_STAT(rx_ring->stat_pkt_cnt);
		total_bytes += pkt_len;

		mp = NULL;
		/*
		 * For packets longer than the copy threshold, we first
		 * try to build an mblk around the existing DMA buffer
		 * and send that mblk upstream.
		 *
		 * If that fails, or the packet length is below the copy
		 * threshold, we allocate a new mblk and copy the packet
		 * data into it.
		 */
		if (pkt_len > igb->rx_copy_thresh)
			mp = igb_rx_bind(rx_data, rx_next, pkt_len);

		if (mp == NULL)
			mp = igb_rx_copy(rx_data, rx_next, pkt_len);

		if (mp != NULL) {
			/*
			 * Check h/w checksum offload status
			 */
			if (igb->rx_hcksum_enable)
				igb_rx_assoc_hcksum(mp, status_error);

			*mblk_tail = mp;
			mblk_tail = &mp->b_next;
		}

		/* Update per-ring rx statistics */
		rx_ring->rx_pkts++;
		rx_ring->rx_bytes += pkt_len;

rx_discard:
		/*
		 * Reset the descriptor to its read format, restoring the
		 * buffer address so the hardware can reuse the descriptor.
		 */
		current_rcb = rx_data->work_list[rx_next];
		current_rbd->read.pkt_addr = current_rcb->rx_buf.dma_address;
		current_rbd->read.hdr_addr = 0;

		rx_next = NEXT_INDEX(rx_next, 1, rx_data->ring_size);

		/*
		 * The receive function runs in interrupt context, so
		 * rx_limit_per_intr is used to avoid spending too long
		 * receiving within a single interrupt.
		 */
		if (++pkt_num > igb->rx_limit_per_intr) {
			IGB_DEBUG_STAT(rx_ring->stat_exceed_pkt);
			break;
		}

		current_rbd = &rx_data->rbd_ring[rx_next];
		status_error = current_rbd->wb.upper.status_error;
	}

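	/*
	 * Flush the reinitialized descriptors back to the device before
	 * moving the hardware tail.
	 */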
	DMA_SYNC(&rx_data->rbd_area, DDI_DMA_SYNC_FORDEV);

	rx_data->rbd_next = rx_next;

	/*
	 * Update the h/w tail accordingly
	 */
	rx_tail = PREV_INDEX(rx_next, 1, rx_data->ring_size);

	E1000_WRITE_REG(&igb->hw, E1000_RDT(rx_ring->index), rx_tail);

	if (igb_check_acc_handle(igb->osdep.reg_handle) != DDI_FM_OK) {
		ddi_fm_service_impact(igb->dip, DDI_SERVICE_DEGRADED);
		atomic_or_32(&igb->igb_state, IGB_ERROR);
	}

	return (mblk_head);
}