xref: /illumos-gate/usr/src/uts/common/io/ena/ena_rx.c (revision e98897e3ff64095f1d7afa3f16c8c3dcbd805e3e)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2021 Oxide Computer Company
14  */
15 #include "ena.h"
16 
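/*
 * Rx path for the ENA driver, as implemented below (a brief summary of
 * this file, not taken from the hardware documentation):
 *
 *   - Each Rx queue pairs a Submission Queue (SQ) of enahw_rx_desc_t
 *     with a Completion Queue (CQ) of enahw_rx_cdesc_t.  Ring indices
 *     are free-running counters masked by (num_descs - 1), which
 *     assumes power-of-two ring sizes.
 *
 *   - ena_refill_rx() posts free Rx buffers to the device.  Each SQ
 *     descriptor points at a pre-allocated DMA buffer tracked by an
 *     ena_rx_ctrl_block_t and carries a req_id that the device echoes
 *     back in the corresponding completion.
 *
 *   - ena_ring_rx() walks the CQ, copies each received frame out of
 *     its DMA buffer into a freshly allocated mblk, and chains the
 *     mblks for delivery to mac.  Buffer loaning (desballoc) is not
 *     used yet; see the comments in ena_ring_rx().
 *
 *   - Ownership of ring entries is tracked with a phase bit that is
 *     toggled each time an index wraps around the ring.
 */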
17 static void
18 ena_refill_rx(ena_rxq_t *rxq, uint16_t num)
19 {
20 	VERIFY3P(rxq, !=, NULL);
21 	ASSERT(MUTEX_HELD(&rxq->er_lock));
22 	ASSERT3U(num, <=, rxq->er_sq_num_descs);
23 	uint16_t tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1);
24 
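	/*
	 * An illustrative sketch (not from the device documentation):
	 * the ring indices are free-running counters, and the masking
	 * above assumes er_sq_num_descs is a power of two.  For a
	 * hypothetical 8-entry ring the refill loop walks:
	 *
	 *	er_sq_tail_idx:	0 1 2 ... 7 8 9 ...
	 *	tail_mod:	0 1 2 ... 7 0 1 ...
	 *	er_sq_phase:	1 1 1 ... 1 0 0 ...
	 *
	 * The phase bit flips each time tail_mod wraps, which is what
	 * lets newly written descriptors be told apart from those left
	 * over from the previous pass around the ring.
	 */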
25 	while (num != 0) {
26 		enahw_rx_desc_t *desc = &rxq->er_sq_descs[tail_mod];
27 		ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[tail_mod];
28 		uint16_t phase = rxq->er_sq_phase;
29 
30 		VERIFY3U(tail_mod, <, rxq->er_sq_num_descs);
31 		VERIFY3P(desc, !=, NULL);
32 		VERIFY3P(rcb, !=, NULL);
33 		VERIFY3P(desc, >=, rxq->er_sq_descs);
34 		VERIFY3P(desc, <=,
35 		    (rxq->er_sq_descs + rxq->er_sq_num_descs - 1));
36 
37 		desc->erd_length = rcb->ercb_dma.edb_len;
38 		desc->erd_req_id = tail_mod;
39 		VERIFY3P(rcb->ercb_dma.edb_cookie, !=, NULL);
40 		ena_set_dma_addr_values(rxq->er_ena,
41 		    rcb->ercb_dma.edb_cookie->dmac_laddress,
42 		    &desc->erd_buff_addr_lo, &desc->erd_buff_addr_hi);
43 		ENAHW_RX_DESC_SET_PHASE(desc, phase);
44 		ENAHW_RX_DESC_SET_FIRST(desc);
45 		ENAHW_RX_DESC_SET_LAST(desc);
46 		ENAHW_RX_DESC_SET_COMP_REQ(desc);
47 		DTRACE_PROBE1(ena__refill__rx, enahw_rx_desc_t *, desc);
48 		rxq->er_sq_tail_idx++;
49 		tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1);
50 
51 		if (tail_mod == 0) {
52 			rxq->er_sq_phase = !rxq->er_sq_phase;
53 		}
54 
55 		num--;
56 	}
57 
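	/*
	 * Make the descriptor writes visible to the device before
	 * ringing the doorbell with the new tail index; the doorbell
	 * write is what notifies the device that fresh Rx buffers are
	 * available.
	 */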
58 	ENA_DMA_SYNC(rxq->er_sq_dma, DDI_DMA_SYNC_FORDEV);
59 	ena_hw_abs_write32(rxq->er_ena, rxq->er_sq_db_addr,
60 	    rxq->er_sq_tail_idx);
61 }
62 
63 void
64 ena_free_rx_dma(ena_rxq_t *rxq)
65 {
66 	if (rxq->er_rcbs != NULL) {
67 		for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
68 			ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
69 			ena_dma_free(&rcb->ercb_dma);
70 		}
71 
72 		kmem_free(rxq->er_rcbs,
73 		    sizeof (*rxq->er_rcbs) * rxq->er_sq_num_descs);
74 
75 		rxq->er_rcbs = NULL;
76 	}
77 
78 	ena_dma_free(&rxq->er_cq_dma);
79 	rxq->er_cq_descs = NULL;
80 	rxq->er_cq_num_descs = 0;
81 
82 	ena_dma_free(&rxq->er_sq_dma);
83 	rxq->er_sq_descs = NULL;
84 	rxq->er_sq_num_descs = 0;
85 
86 	rxq->er_state &= ~ENA_RXQ_STATE_HOST_ALLOC;
87 }
88 
89 static int
90 ena_alloc_rx_dma(ena_rxq_t *rxq)
91 {
92 	ena_t *ena = rxq->er_ena;
93 	size_t cq_descs_sz;
94 	size_t sq_descs_sz;
95 	ena_dma_conf_t conf;
96 	int err = 0;
97 
98 	cq_descs_sz = rxq->er_cq_num_descs * sizeof (*rxq->er_cq_descs);
99 	sq_descs_sz = rxq->er_sq_num_descs * sizeof (*rxq->er_sq_descs);
100 	conf = (ena_dma_conf_t) {
101 		.edc_size = sq_descs_sz,
102 		.edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT,
103 		.edc_sgl = 1,
104 		.edc_endian = DDI_NEVERSWAP_ACC,
105 		.edc_stream = B_FALSE,
106 	};
107 
108 	if (!ena_dma_alloc(ena, &rxq->er_sq_dma, &conf, sq_descs_sz)) {
109 		return (ENOMEM);
110 	}
111 
112 	rxq->er_sq_descs = (void *)rxq->er_sq_dma.edb_va;
113 	rxq->er_rcbs = kmem_zalloc(sizeof (*rxq->er_rcbs) *
114 	    rxq->er_sq_num_descs, KM_SLEEP);
115 
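	/*
	 * One Rx control block, and one DMA data buffer, is allocated
	 * per SQ descriptor.  The req_id that ena_refill_rx() writes
	 * into a descriptor is simply its ring offset, so a
	 * completion's erc_req_id maps straight back into this array.
	 */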
116 	for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
117 		ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
118 		ena_dma_conf_t buf_conf = {
119 			.edc_size = ena->ena_rx_buf_sz,
120 			.edc_align = 1,
121 			.edc_sgl = ena->ena_rx_sgl_max_sz,
122 			.edc_endian = DDI_NEVERSWAP_ACC,
123 			.edc_stream = B_TRUE,
124 		};
125 
126 		if (!ena_dma_alloc(ena, &rcb->ercb_dma, &buf_conf,
127 		    ena->ena_rx_buf_sz)) {
128 			err = ENOMEM;
129 			goto error;
130 		}
131 	}
132 
133 	conf = (ena_dma_conf_t) {
134 		.edc_size = cq_descs_sz,
135 		.edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT,
136 		.edc_sgl = 1,
137 		.edc_endian = DDI_NEVERSWAP_ACC,
138 		.edc_stream = B_FALSE,
139 	};
140 
141 	if (!ena_dma_alloc(ena, &rxq->er_cq_dma, &conf, cq_descs_sz)) {
142 		err = ENOMEM;
143 		goto error;
144 	}
145 
146 	rxq->er_cq_descs = (void *)rxq->er_cq_dma.edb_va;
147 	rxq->er_state |= ENA_RXQ_STATE_HOST_ALLOC;
148 	return (0);
149 
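	/*
	 * The error path frees whatever has been allocated so far, so
	 * ena_free_rx_dma() must tolerate a partially constructed
	 * queue (e.g. buffers that were never allocated).
	 */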
150 error:
151 	ena_free_rx_dma(rxq);
152 	return (err);
153 }
154 
155 boolean_t
156 ena_alloc_rxq(ena_rxq_t *rxq)
157 {
158 	int ret = 0;
159 	ena_t *ena = rxq->er_ena;
160 	uint16_t cq_hw_idx, sq_hw_idx;
161 	uint32_t *cq_unmask_addr, *cq_headdb, *cq_numanode;
162 	uint32_t *sq_db_addr;
163 
164 	/*
165 	 * First, allocate the Rx data buffers.
166 	 */
167 	if ((ret = ena_alloc_rx_dma(rxq)) != 0) {
168 		ena_err(ena, "failed to allocate Rx queue %u data buffers: %d",
169 		    rxq->er_rxqs_idx, ret);
170 		return (B_FALSE);
171 	}
172 
173 	ASSERT(rxq->er_state & ENA_RXQ_STATE_HOST_ALLOC);
174 
175 	/*
176 	 * Second, create the Completion Queue.
177 	 */
178 	ret = ena_create_cq(ena, rxq->er_cq_num_descs,
179 	    rxq->er_cq_dma.edb_cookie->dmac_laddress, B_FALSE,
180 	    rxq->er_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_headdb,
181 	    &cq_numanode);
182 
183 	if (ret != 0) {
184 		ena_err(ena, "failed to create Rx CQ %u: %d", rxq->er_rxqs_idx,
185 		    ret);
186 		return (B_FALSE);
187 	}
188 
189 	/* The phase must always start at 1. */
190 	rxq->er_cq_phase = 1;
191 	rxq->er_cq_head_idx = 0;
192 	rxq->er_cq_hw_idx = cq_hw_idx;
193 	rxq->er_cq_unmask_addr = cq_unmask_addr;
194 	rxq->er_cq_head_db_addr = cq_headdb;
195 	rxq->er_cq_numa_addr = cq_numanode;
196 	rxq->er_state |= ENA_RXQ_STATE_CQ_CREATED;
197 
198 	/*
199 	 * Third, create the Submission Queue to match with the above
200 	 * CQ. At this time we force the SQ and CQ to have the same
201 	 * number of descriptors as we only use a 1:1 completion
202 	 * policy. However, in the future, we could loosen this and
203 	 * use an on-demand completion policy and the two could have a
204 	 * different number of descriptors.
205 	 */
206 	ASSERT3U(rxq->er_sq_num_descs, ==, rxq->er_cq_num_descs);
207 	ret = ena_create_sq(ena, rxq->er_sq_num_descs,
208 	    rxq->er_sq_dma.edb_cookie->dmac_laddress, B_FALSE, cq_hw_idx,
209 	    &sq_hw_idx, &sq_db_addr);
210 
211 	if (ret != 0) {
212 		ena_err(ena, "failed to create Rx SQ %u: %d", rxq->er_rxqs_idx,
213 		    ret);
214 		return (B_FALSE);
215 	}
216 
217 	ASSERT3P(sq_db_addr, !=, NULL);
218 	rxq->er_sq_hw_idx = sq_hw_idx;
219 	rxq->er_sq_db_addr = sq_db_addr;
220 	/* The phase must always start at 1. */
221 	rxq->er_sq_phase = 1;
222 	rxq->er_sq_tail_idx = 0;
223 	rxq->er_sq_avail_descs = rxq->er_sq_num_descs;
224 	rxq->er_mode = ENA_RXQ_MODE_INTR;
225 	rxq->er_state |= ENA_RXQ_STATE_SQ_CREATED;
226 
227 	return (B_TRUE);
228 }
229 
230 void
231 ena_cleanup_rxq(ena_rxq_t *rxq)
232 {
233 	int ret = 0;
234 	ena_t *ena = rxq->er_ena;
235 
236 	if ((rxq->er_state & ENA_RXQ_STATE_SQ_CREATED) != 0) {
237 		ret = ena_destroy_sq(ena, rxq->er_sq_hw_idx, B_FALSE);
238 
239 		if (ret != 0) {
240 			ena_err(ena, "failed to destroy Rx SQ %u: %d",
241 			    rxq->er_rxqs_idx, ret);
242 		}
243 
244 		rxq->er_sq_hw_idx = 0;
245 		rxq->er_sq_db_addr = NULL;
246 		rxq->er_sq_tail_idx = 0;
247 		rxq->er_sq_phase = 0;
248 		rxq->er_state &= ~ENA_RXQ_STATE_SQ_CREATED;
249 	}
250 
251 	if ((rxq->er_state & ENA_RXQ_STATE_CQ_CREATED) != 0) {
252 		ret = ena_destroy_cq(ena, rxq->er_cq_hw_idx);
253 
254 		if (ret != 0) {
255 			ena_err(ena, "failed to destroy Rx CQ %u: %d",
256 			    rxq->er_rxqs_idx, ret);
257 		}
258 
259 		rxq->er_cq_hw_idx = 0;
260 		rxq->er_cq_head_idx = 0;
261 		rxq->er_cq_phase = 0;
262 		rxq->er_cq_head_db_addr = NULL;
263 		rxq->er_cq_unmask_addr = NULL;
264 		rxq->er_cq_numa_addr = NULL;
265 		rxq->er_state &= ~ENA_RXQ_STATE_CQ_CREATED;
266 	}
267 
268 	ena_free_rx_dma(rxq);
269 	ASSERT3S(rxq->er_state, ==, ENA_RXQ_STATE_NONE);
270 }
271 
272 void
273 ena_ring_rx_stop(mac_ring_driver_t rh)
274 {
275 	ena_rxq_t *rxq = (ena_rxq_t *)rh;
276 	uint32_t intr_ctrl;
277 
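	/*
	 * Mask this queue's interrupt by setting the mask bit in the
	 * CQ's unmask register (er_cq_unmask_addr); the device will
	 * then stop raising Rx interrupts for this ring while it is
	 * stopped.
	 */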
278 	intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr);
279 	ENAHW_REG_INTR_MASK(intr_ctrl);
280 	ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl);
281 
282 	rxq->er_state &= ~ENA_RXQ_STATE_RUNNING;
283 	rxq->er_state &= ~ENA_RXQ_STATE_READY;
284 }
285 
286 int
287 ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num)
288 {
289 	ena_rxq_t *rxq = (ena_rxq_t *)rh;
290 	ena_t *ena = rxq->er_ena;
291 	uint32_t intr_ctrl;
292 
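	/*
	 * Post a full ring of Rx buffers before unmasking the
	 * interrupt so the device has descriptors to fill as soon as
	 * traffic arrives.
	 */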
293 	mutex_enter(&rxq->er_lock);
294 	ena_refill_rx(rxq, rxq->er_sq_num_descs);
295 	rxq->er_m_gen_num = gen_num;
296 	rxq->er_intr_limit = ena->ena_rxq_intr_limit;
297 	mutex_exit(&rxq->er_lock);
298 
299 	rxq->er_state |= ENA_RXQ_STATE_READY;
300 
301 	intr_ctrl = ena_hw_abs_read32(ena, rxq->er_cq_unmask_addr);
302 	ENAHW_REG_INTR_UNMASK(intr_ctrl);
303 	ena_hw_abs_write32(ena, rxq->er_cq_unmask_addr, intr_ctrl);
304 	rxq->er_state |= ENA_RXQ_STATE_RUNNING;
305 	return (0);
306 }
307 
308 mblk_t *
309 ena_ring_rx(ena_rxq_t *rxq, int poll_bytes)
310 {
311 	ena_t *ena = rxq->er_ena;
312 	uint16_t head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1);
313 	uint64_t total_bytes = 0;
314 	uint64_t num_frames = 0;
315 	enahw_rx_cdesc_t *cdesc;
316 	boolean_t polling = B_TRUE;
317 	mblk_t *head = NULL;
318 	mblk_t *tail = NULL;
319 
320 	ASSERT(MUTEX_HELD(&rxq->er_lock));
321 	ENA_DMA_SYNC(rxq->er_cq_dma, DDI_DMA_SYNC_FORKERNEL);
322 
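	/*
	 * A poll_bytes of ENA_INTERRUPT_MODE is the sentinel passed by
	 * ena_rx_intr_work(); in that case the loop below is bounded
	 * by er_intr_limit (in frames) rather than by a byte budget.
	 */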
323 	if (poll_bytes == ENA_INTERRUPT_MODE) {
324 		polling = B_FALSE;
325 	}
326 
327 	cdesc = &rxq->er_cq_descs[head_mod];
328 	VERIFY3P(cdesc, >=, rxq->er_cq_descs);
329 	VERIFY3P(cdesc, <=, (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));
330 
331 	while (ENAHW_RX_CDESC_PHASE(cdesc) == rxq->er_cq_phase) {
332 		boolean_t first, last;
333 		ena_rx_ctrl_block_t *rcb;
334 		uint16_t req_id;
335 		mblk_t *mp;
336 		enahw_io_l3_proto_t l3proto;
337 		enahw_io_l4_proto_t l4proto;
338 		boolean_t l4csum_checked;
339 		uint32_t hflags = 0;
340 
341 		VERIFY3U(head_mod, <, rxq->er_cq_num_descs);
342 		/*
343 		 * Currently, all incoming frames fit in a single Rx
344 		 * buffer (erd_length > total frame size). In the
345 		 * future, if we decide to loan buffers which are
346 		 * smaller, we will need to modify this code to read
347 		 * one or more descriptors (based on frame size).
348 		 *
349 		 * For this reason we do not expect any frame to span
350 		 * multiple descriptors. Therefore, we drop any frame
351 		 * that is not delivered in a single descriptor, i.e.,
352 		 * where 'first' and 'last' are not both set.
353 		 */
354 		first = ENAHW_RX_CDESC_FIRST(cdesc);
355 		last = ENAHW_RX_CDESC_LAST(cdesc);
356 
357 		if (!first || !last) {
358 			mutex_enter(&rxq->er_stat_lock);
359 			rxq->er_stat.ers_multi_desc.value.ui64++;
360 			mutex_exit(&rxq->er_stat_lock);
361 			goto next_desc;
362 		}
363 
364 		req_id = cdesc->erc_req_id;
365 		VERIFY3U(req_id, <, rxq->er_cq_num_descs);
366 		rcb = &rxq->er_rcbs[req_id];
367 		rcb->ercb_offset = cdesc->erc_offset;
368 		rcb->ercb_length = cdesc->erc_length;
369 		ASSERT3U(rcb->ercb_length, <=, ena->ena_max_frame_total);
370 		mp = allocb(rcb->ercb_length + ENA_RX_BUF_IPHDR_ALIGNMENT, 0);
371 
372 		/*
373 		 * If we can't allocate an mblk, things are looking
374 		 * grim. Forget about this frame and move on.
375 		 */
376 		if (mp == NULL) {
377 			mutex_enter(&rxq->er_stat_lock);
378 			rxq->er_stat.ers_allocb_fail.value.ui64++;
379 			mutex_exit(&rxq->er_stat_lock);
380 			goto next_desc;
381 		}
382 
383 		/*
384 		 * As we pull frames we need to link them together as
385 		 * one chain to be delivered up to mac.
386 		 */
387 		if (head == NULL) {
388 			head = mp;
389 		} else {
390 			tail->b_next = mp;
391 		}
392 
393 		tail = mp;
394 
395 		/*
396 		 * We need to make sure the bytes are copied to the
397 		 * correct offset to achieve 4-byte IP header
398 		 * alignment.
399 		 *
400 		 * If we start using desballoc on the buffers, then we
401 		 * will need to make sure to apply this offset to the
402 		 * DMA buffers as well. Though it may be the case the
403 		 * device does this implicitly and that's what
404 		 * cdesc->erc_offset is for; we don't know because
405 		 * it's not documented.
406 		 */
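		/*
		 * A concrete sketch (assuming the customary 2-byte
		 * pad): advancing rptr/wptr by
		 * ENA_RX_BUF_IPHDR_ALIGNMENT ahead of a 14-byte
		 * Ethernet header leaves the IP header on a 4-byte
		 * boundary within the allocb'd buffer.
		 */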
407 		mp->b_wptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
408 		mp->b_rptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
409 		bcopy(rcb->ercb_dma.edb_va + rcb->ercb_offset, mp->b_wptr,
410 		    rcb->ercb_length);
411 		mp->b_wptr += rcb->ercb_length;
412 		total_bytes += rcb->ercb_length;
413 		VERIFY3P(mp->b_wptr, >, mp->b_rptr);
414 		VERIFY3P(mp->b_wptr, <=, mp->b_datap->db_lim);
415 
416 		l3proto = ENAHW_RX_CDESC_L3_PROTO(cdesc);
417 		l4proto = ENAHW_RX_CDESC_L4_PROTO(cdesc);
418 
419 		/*
420 		 * When it comes to bad TCP/IP checksums we do not
421 		 * discard the packet at this level. Instead, we let
422 		 * it percolate up for further processing and tracking
423 		 * by the upstream TCP/IP stack.
424 		 */
425 		if (ena->ena_rx_l3_ipv4_csum &&
426 		    l3proto == ENAHW_IO_L3_PROTO_IPV4) {
427 			boolean_t l3_csum_err =
428 			    ENAHW_RX_CDESC_L3_CSUM_ERR(cdesc);
429 
430 			if (l3_csum_err) {
431 				mutex_enter(&rxq->er_stat_lock);
432 				rxq->er_stat.ers_hck_ipv4_err.value.ui64++;
433 				mutex_exit(&rxq->er_stat_lock);
434 			} else {
435 				hflags |= HCK_IPV4_HDRCKSUM_OK;
436 			}
437 		}
438 
439 		l4csum_checked = ENAHW_RX_CDESC_L4_CSUM_CHECKED(cdesc);
440 
441 		if (ena->ena_rx_l4_ipv4_csum && l4csum_checked &&
442 		    l4proto == ENAHW_IO_L4_PROTO_TCP) {
443 			boolean_t l4_csum_err =
444 			    ENAHW_RX_CDESC_L4_CSUM_ERR(cdesc);
445 
446 			if (l4_csum_err) {
447 				mutex_enter(&rxq->er_stat_lock);
448 				rxq->er_stat.ers_hck_l4_err.value.ui64++;
449 				mutex_exit(&rxq->er_stat_lock);
450 			} else {
451 				hflags |= HCK_FULLCKSUM_OK;
452 			}
453 		}
454 
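		/*
		 * HCK_IPV4_HDRCKSUM_OK and HCK_FULLCKSUM_OK indicate
		 * to the stack that the corresponding checksum was
		 * already verified by the device, so software does not
		 * need to recompute it.
		 */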
455 		if (hflags != 0) {
456 			mac_hcksum_set(mp, 0, 0, 0, 0, hflags);
457 		}
458 
459 next_desc:
460 		/*
461 		 * Technically, if we arrived here due to a failure,
462 		 * then we did not read a new frame. However, we count
463 		 * it all the same, however, so that it contributes
464 		 * towards the interrupt work limit. The failure
465 		 * stats will allow us to differentiate good frames
466 		 * from bad.
467 		 */
468 		num_frames++;
469 		rxq->er_cq_head_idx++;
470 		head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1);
471 
472 		if (head_mod == 0) {
473 			rxq->er_cq_phase = !rxq->er_cq_phase;
474 		}
475 
476 		if (polling && (total_bytes > poll_bytes)) {
477 			break;
478 		} else if (!polling && (num_frames >= rxq->er_intr_limit)) {
479 			mutex_enter(&rxq->er_stat_lock);
480 			rxq->er_stat.ers_intr_limit.value.ui64++;
481 			mutex_exit(&rxq->er_stat_lock);
482 			break;
483 		}
484 
485 		cdesc = &rxq->er_cq_descs[head_mod];
486 		VERIFY3P(cdesc, >=, rxq->er_cq_descs);
487 		VERIFY3P(cdesc, <=,
488 		    (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));
489 	}
490 
491 	mutex_enter(&rxq->er_stat_lock);
492 	rxq->er_stat.ers_packets.value.ui64 += num_frames;
493 	rxq->er_stat.ers_bytes.value.ui64 += total_bytes;
494 	mutex_exit(&rxq->er_stat_lock);
495 
496 	DTRACE_PROBE4(rx__frames, mblk_t *, head, boolean_t, polling, uint64_t,
497 	    num_frames, uint64_t, total_bytes);
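	/*
	 * Return the descriptors we just consumed back to the device.
	 * As an aside, the rx__frames probe above should be visible to
	 * DTrace as the SDT probe "rx-frames" (double underscores in
	 * DTRACE_PROBE names become dashes), e.g.:
	 *
	 *	dtrace -n 'sdt:::rx-frames { trace(arg2); }'
	 *
	 * where arg2 is the num_frames argument.
	 */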
498 	ena_refill_rx(rxq, num_frames);
499 	return (head);
500 }
501 
502 void
503 ena_rx_intr_work(ena_rxq_t *rxq)
504 {
505 	mblk_t *mp;
506 
507 	mutex_enter(&rxq->er_lock);
508 	mp = ena_ring_rx(rxq, ENA_INTERRUPT_MODE);
509 	mutex_exit(&rxq->er_lock);
510 
511 	if (mp == NULL) {
512 		return;
513 	}
514 
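	/*
	 * Deliver the chain along with the ring's generation number so
	 * that mac can discard it if the ring has been restarted since
	 * the frames were collected.
	 */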
515 	mac_rx_ring(rxq->er_ena->ena_mh, rxq->er_mrh, mp, rxq->er_m_gen_num);
516 }
517 
518 mblk_t *
519 ena_ring_rx_poll(void *rh, int poll_bytes)
520 {
521 	ena_rxq_t *rxq = rh;
522 	mblk_t *mp;
523 
524 	ASSERT3S(poll_bytes, >, 0);
525 
526 	mutex_enter(&rxq->er_lock);
527 	mp = ena_ring_rx(rxq, poll_bytes);
528 	mutex_exit(&rxq->er_lock);
529 
530 	return (mp);
531 }
532