/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

#include "ena.h"

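/*
 * Refill the Rx submission queue with up to `num` descriptors. Each
 * descriptor is pointed at the DMA buffer of its corresponding Rx
 * control block, stamped with the current phase bit, and marked to
 * request a completion. The phase flips each time the tail wraps
 * around the ring. Once the descriptors are written we sync the SQ
 * for the device and ring the doorbell with the new tail index.
 */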
static void
ena_refill_rx(ena_rxq_t *rxq, uint16_t num)
{
        VERIFY3P(rxq, !=, NULL);
        ASSERT(MUTEX_HELD(&rxq->er_lock));
        ASSERT3U(num, <=, rxq->er_sq_num_descs);

        const uint16_t modulo_mask = rxq->er_sq_num_descs - 1;
        uint16_t tail_mod = rxq->er_sq_tail_idx & modulo_mask;

        while (num != 0) {
                enahw_rx_desc_t *desc = &rxq->er_sq_descs[tail_mod];
                ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[tail_mod];
                uint16_t phase = rxq->er_sq_phase;

                VERIFY3U(tail_mod, <, rxq->er_sq_num_descs);
                VERIFY3P(desc, !=, NULL);
                VERIFY3P(rcb, !=, NULL);
                VERIFY3P(desc, >=, rxq->er_sq_descs);
                VERIFY3P(desc, <=,
                    (rxq->er_sq_descs + rxq->er_sq_num_descs - 1));

                desc->erd_length = rcb->ercb_dma.edb_len;
                desc->erd_req_id = tail_mod;
                VERIFY3P(rcb->ercb_dma.edb_cookie, !=, NULL);
                ena_set_dma_addr_values(rxq->er_ena,
                    rcb->ercb_dma.edb_cookie->dmac_laddress,
                    &desc->erd_buff_addr_lo, &desc->erd_buff_addr_hi);

                ENAHW_RX_DESC_CLEAR_CTRL(desc);
                ENAHW_RX_DESC_SET_PHASE(desc, phase);
                ENAHW_RX_DESC_SET_FIRST(desc);
                ENAHW_RX_DESC_SET_LAST(desc);
                ENAHW_RX_DESC_SET_COMP_REQ(desc);
                DTRACE_PROBE1(ena__refill__rx, enahw_rx_desc_t *, desc);
                rxq->er_sq_tail_idx++;
                tail_mod = rxq->er_sq_tail_idx & modulo_mask;

                if (tail_mod == 0)
                        rxq->er_sq_phase ^= 1;

                num--;
        }

        ENA_DMA_SYNC(rxq->er_sq_dma, DDI_DMA_SYNC_FORDEV);
        ena_hw_abs_write32(rxq->er_ena, rxq->er_sq_db_addr,
            rxq->er_sq_tail_idx);
}

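/*
 * Free all host-side DMA resources for the Rx queue: the per-descriptor
 * data buffers, the Rx control block array, and the CQ and SQ
 * descriptor rings. Clears the HOST_ALLOC state flag when done.
 */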
void
ena_free_rx_dma(ena_rxq_t *rxq)
{
        if (rxq->er_rcbs != NULL) {
                for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
                        ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
                        ena_dma_free(&rcb->ercb_dma);
                }

                kmem_free(rxq->er_rcbs,
                    sizeof (*rxq->er_rcbs) * rxq->er_sq_num_descs);

                rxq->er_rcbs = NULL;
        }

        ena_dma_free(&rxq->er_cq_dma);
        rxq->er_cq_descs = NULL;
        rxq->er_cq_num_descs = 0;

        ena_dma_free(&rxq->er_sq_dma);
        rxq->er_sq_descs = NULL;
        rxq->er_sq_num_descs = 0;

        rxq->er_state &= ~ENA_RXQ_STATE_HOST_ALLOC;
}

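/*
 * Allocate the host-side DMA resources for the Rx queue: the SQ
 * descriptor ring, one DMA data buffer per SQ descriptor (tracked by
 * the Rx control blocks), and the CQ descriptor ring. On failure any
 * partial allocations are released via ena_free_rx_dma().
 */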
static int
ena_alloc_rx_dma(ena_rxq_t *rxq)
{
        ena_t *ena = rxq->er_ena;
        size_t cq_descs_sz;
        size_t sq_descs_sz;
        int err = 0;

        cq_descs_sz = rxq->er_cq_num_descs * sizeof (*rxq->er_cq_descs);
        sq_descs_sz = rxq->er_sq_num_descs * sizeof (*rxq->er_sq_descs);

        ena_dma_conf_t sq_conf = {
                .edc_size = sq_descs_sz,
                .edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT,
                .edc_sgl = 1,
                .edc_endian = DDI_NEVERSWAP_ACC,
                .edc_stream = false,
        };

        if (!ena_dma_alloc(ena, &rxq->er_sq_dma, &sq_conf, sq_descs_sz)) {
                return (ENOMEM);
        }

        rxq->er_sq_descs = (void *)rxq->er_sq_dma.edb_va;
        rxq->er_rcbs = kmem_zalloc(sizeof (*rxq->er_rcbs) *
            rxq->er_sq_num_descs, KM_SLEEP);

        for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
                ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
                ena_dma_conf_t buf_conf = {
                        .edc_size = ena->ena_rx_buf_sz,
                        .edc_align = 1,
                        .edc_sgl = ena->ena_rx_sgl_max_sz,
                        .edc_endian = DDI_NEVERSWAP_ACC,
                        .edc_stream = true,
                };

                if (!ena_dma_alloc(ena, &rcb->ercb_dma, &buf_conf,
                    ena->ena_rx_buf_sz)) {
                        err = ENOMEM;
                        goto error;
                }
        }

        ena_dma_conf_t cq_conf = {
                .edc_size = cq_descs_sz,
                .edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT,
                .edc_sgl = 1,
                .edc_endian = DDI_NEVERSWAP_ACC,
                .edc_stream = false,
        };

        if (!ena_dma_alloc(ena, &rxq->er_cq_dma, &cq_conf, cq_descs_sz)) {
                err = ENOMEM;
                goto error;
        }

        rxq->er_cq_descs = (void *)rxq->er_cq_dma.edb_va;
        rxq->er_state |= ENA_RXQ_STATE_HOST_ALLOC;
        return (0);

error:
        ena_free_rx_dma(rxq);
        return (err);
}

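/*
 * Bring up an Rx queue: allocate its DMA resources, then ask the
 * device to create the completion queue and the matching submission
 * queue, recording the hardware indexes, doorbell and unmask
 * addresses, and resetting the phase and index tracking. Returns
 * true on success.
 */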
bool
ena_alloc_rxq(ena_rxq_t *rxq)
{
        int ret = 0;
        ena_t *ena = rxq->er_ena;
        uint16_t cq_hw_idx, sq_hw_idx;
        uint32_t *cq_unmask_addr, *cq_numanode;
        uint32_t *sq_db_addr;

        /*
         * First, allocate the Rx data buffers.
         */
        if ((ret = ena_alloc_rx_dma(rxq)) != 0) {
                ena_err(ena, "failed to allocate Rx queue %u data buffers: %d",
                    rxq->er_rxqs_idx, ret);
                return (false);
        }

        ASSERT(rxq->er_state & ENA_RXQ_STATE_HOST_ALLOC);

        /*
         * Second, create the Completion Queue.
         */
        ret = ena_create_cq(ena, rxq->er_cq_num_descs,
            rxq->er_cq_dma.edb_cookie->dmac_laddress, false,
            rxq->er_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_numanode);

        if (ret != 0) {
                ena_err(ena, "failed to create Rx CQ %u: %d", rxq->er_rxqs_idx,
                    ret);
                return (false);
        }

        /* The phase must always start on 1. */
        rxq->er_cq_phase = 1;
        rxq->er_cq_head_idx = 0;
        rxq->er_cq_hw_idx = cq_hw_idx;
        rxq->er_cq_unmask_addr = cq_unmask_addr;
        rxq->er_cq_numa_addr = cq_numanode;
        rxq->er_state |= ENA_RXQ_STATE_CQ_CREATED;

        /*
         * Third, create the Submission Queue to match with the above
         * CQ. At this time we force the SQ and CQ to have the same
         * number of descriptors as we only use a 1:1 completion
         * policy. However, in the future, we could loosen this and
         * use an on-demand completion policy and the two could have a
         * different number of descriptors.
         */
        ASSERT3U(rxq->er_sq_num_descs, ==, rxq->er_cq_num_descs);
        ret = ena_create_sq(ena, rxq->er_sq_num_descs,
            rxq->er_sq_dma.edb_cookie->dmac_laddress, false, cq_hw_idx,
            &sq_hw_idx, &sq_db_addr);

        if (ret != 0) {
                ena_err(ena, "failed to create Rx SQ %u: %d", rxq->er_rxqs_idx,
                    ret);
                return (false);
        }

        ASSERT3P(sq_db_addr, !=, NULL);
        rxq->er_sq_hw_idx = sq_hw_idx;
        rxq->er_sq_db_addr = sq_db_addr;
        /* The phase must always start on 1. */
        rxq->er_sq_phase = 1;
        rxq->er_sq_tail_idx = 0;
        rxq->er_sq_avail_descs = rxq->er_sq_num_descs;
        rxq->er_mode = ENA_RXQ_MODE_INTR;
        rxq->er_state |= ENA_RXQ_STATE_SQ_CREATED;

        return (true);
}

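/*
 * Tear down an Rx queue. When the device is being reset, the admin
 * commands to destroy the SQ and CQ are skipped; otherwise they are
 * issued here. In both cases the host-side queue state is cleared and
 * the DMA resources are released.
 */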
void
ena_cleanup_rxq(ena_rxq_t *rxq, bool resetting)
{
        int ret = 0;
        ena_t *ena = rxq->er_ena;

        if ((rxq->er_state & ENA_RXQ_STATE_SQ_CREATED) != 0) {
                if (!resetting) {
                        ret = ena_destroy_sq(ena, rxq->er_sq_hw_idx, false);

                        if (ret != 0) {
                                ena_err(ena, "failed to destroy Rx SQ %u: %d",
                                    rxq->er_rxqs_idx, ret);
                        }
                }

                rxq->er_sq_hw_idx = 0;
                rxq->er_sq_db_addr = NULL;
                rxq->er_sq_tail_idx = 0;
                rxq->er_sq_phase = 0;
                rxq->er_state &= ~ENA_RXQ_STATE_SQ_CREATED;
                rxq->er_state &= ~ENA_RXQ_STATE_SQ_FILLED;
        }

        if ((rxq->er_state & ENA_RXQ_STATE_CQ_CREATED) != 0) {
                if (!resetting) {
                        ret = ena_destroy_cq(ena, rxq->er_cq_hw_idx);

                        if (ret != 0) {
                                ena_err(ena, "failed to destroy Rx CQ %u: %d",
                                    rxq->er_rxqs_idx, ret);
                        }
                }

                rxq->er_cq_hw_idx = 0;
                rxq->er_cq_head_idx = 0;
                rxq->er_cq_phase = 0;
                rxq->er_cq_unmask_addr = NULL;
                rxq->er_cq_numa_addr = NULL;
                rxq->er_state &= ~ENA_RXQ_STATE_CQ_CREATED;
        }

        ena_free_rx_dma(rxq);
        ASSERT3S(rxq->er_state, ==, ENA_RXQ_STATE_NONE);
}

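/*
 * mac ring stop entry point: mask the queue's interrupt and clear the
 * RUNNING and READY state flags.
 */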
void
ena_ring_rx_stop(mac_ring_driver_t rh)
{
        ena_rxq_t *rxq = (ena_rxq_t *)rh;
        uint32_t intr_ctrl;

        intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr);
        ENAHW_REG_INTR_MASK(intr_ctrl);
        ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl);

        rxq->er_state &= ~ENA_RXQ_STATE_RUNNING;
        rxq->er_state &= ~ENA_RXQ_STATE_READY;
}

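/*
 * mac ring start entry point: fill the Rx SQ on the first start only
 * (see the comment below), record the mac generation number, unmask
 * the queue's interrupt, and mark the ring READY and RUNNING.
 */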
int
ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num)
{
        ena_rxq_t *rxq = (ena_rxq_t *)rh;
        ena_t *ena = rxq->er_ena;
        uint32_t intr_ctrl;

        ena_dbg(ena, "ring_rx_start %p: state 0x%x", rxq, rxq->er_state);

        mutex_enter(&rxq->er_lock);
        if ((rxq->er_state & ENA_RXQ_STATE_SQ_FILLED) == 0) {
                /*
                 * The ENA controller gets upset and sets the fatal error bit
                 * in its status register if we write a value to an RX SQ's
                 * doorbell that is past its current head. This makes sense as
                 * it would represent there being more descriptors available
                 * than can fit in the ring. For this reason, we make sure that
                 * we only fill the ring once, even if it is started multiple
                 * times.
                 *
                 * The `- 1` below is harder to explain. If we completely fill
                 * the SQ ring, then at some time later that seems to be
                 * independent of how many times we've been around the ring,
                 * the ENA controller will set the fatal error bit and stop
                 * responding. Leaving a gap prevents this somehow and it is
                 * what the other open source drivers do.
                 */
                ena_refill_rx(rxq, rxq->er_sq_num_descs - 1);
                rxq->er_state |= ENA_RXQ_STATE_SQ_FILLED;
        }
        rxq->er_m_gen_num = gen_num;
        rxq->er_intr_limit = ena->ena_rxq_intr_limit;
        mutex_exit(&rxq->er_lock);

        rxq->er_state |= ENA_RXQ_STATE_READY;

        intr_ctrl = ena_hw_abs_read32(ena, rxq->er_cq_unmask_addr);
        ENAHW_REG_INTR_UNMASK(intr_ctrl);
        ena_hw_abs_write32(ena, rxq->er_cq_unmask_addr, intr_ctrl);
        rxq->er_state |= ENA_RXQ_STATE_RUNNING;
        return (0);
}

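/*
 * Consume completed Rx descriptors from the CQ and build an mblk chain
 * for mac. A completion is valid while its phase bit matches the
 * queue's current phase; each frame is copied out of its DMA buffer
 * into a freshly allocated mblk with the IP header 4-byte aligned, and
 * hardware checksum results are attached when the device validated
 * them. The loop stops at the poll byte limit (polling mode) or the
 * per-interrupt frame limit (interrupt mode), after which the consumed
 * descriptors are handed back to the SQ via ena_refill_rx().
 */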
mblk_t *
ena_ring_rx(ena_rxq_t *rxq, int poll_bytes)
{
        ena_t *ena = rxq->er_ena;
        const uint16_t modulo_mask = rxq->er_cq_num_descs - 1;
        uint16_t head_mod = rxq->er_cq_head_idx & modulo_mask;
        uint64_t total_bytes = 0;
        uint64_t num_frames = 0;
        enahw_rx_cdesc_t *cdesc;
        bool polling = true;
        mblk_t *head = NULL;
        mblk_t *tail = NULL;

        ASSERT(MUTEX_HELD(&rxq->er_lock));
        ENA_DMA_SYNC(rxq->er_cq_dma, DDI_DMA_SYNC_FORKERNEL);

        if (poll_bytes == ENA_INTERRUPT_MODE) {
                polling = false;
        }

        cdesc = &rxq->er_cq_descs[head_mod];
        VERIFY3P(cdesc, >=, rxq->er_cq_descs);
        VERIFY3P(cdesc, <=, (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));

        while (ENAHW_RX_CDESC_PHASE(cdesc) == rxq->er_cq_phase) {
                bool first, last;
                ena_rx_ctrl_block_t *rcb;
                uint16_t req_id;
                mblk_t *mp;
                enahw_io_l3_proto_t l3proto;
                enahw_io_l4_proto_t l4proto;
                bool l4csum_checked;
                uint32_t hflags = 0;

                VERIFY3U(head_mod, <, rxq->er_cq_num_descs);
                /*
                 * Currently, all incoming frames fit in a single Rx
                 * buffer (erd_length > total frame size). In the
                 * future, if we decide to loan buffers which are
                 * smaller, we will need to modify this code to read
                 * one or more descriptors (based on frame size).
                 *
                 * For this reason we do not expect any frame to span
                 * multiple descriptors. Therefore, we drop any data
                 * not delivered as a single descriptor, i.e., where
                 * 'first' and 'last' are both true.
                 */
                first = ENAHW_RX_CDESC_FIRST(cdesc);
                last = ENAHW_RX_CDESC_LAST(cdesc);

                if (!first || !last) {
                        mutex_enter(&rxq->er_stat_lock);
                        rxq->er_stat.ers_multi_desc.value.ui64++;
                        mutex_exit(&rxq->er_stat_lock);
                        goto next_desc;
                }

                req_id = cdesc->erc_req_id;
                VERIFY3U(req_id, <, rxq->er_cq_num_descs);
                rcb = &rxq->er_rcbs[req_id];
                rcb->ercb_offset = cdesc->erc_offset;
                rcb->ercb_length = cdesc->erc_length;
                ASSERT3U(rcb->ercb_length, <=, ena->ena_max_frame_total);
                mp = allocb(rcb->ercb_length + ENA_RX_BUF_IPHDR_ALIGNMENT, 0);

                /*
                 * If we can't allocate an mblk, things are looking
                 * grim. Forget about this frame and move on.
                 */
                if (mp == NULL) {
                        mutex_enter(&rxq->er_stat_lock);
                        rxq->er_stat.ers_allocb_fail.value.ui64++;
                        mutex_exit(&rxq->er_stat_lock);
                        goto next_desc;
                }

                /*
                 * As we pull frames we need to link them together as
                 * one chain to be delivered up to mac.
                 */
                if (head == NULL) {
                        head = mp;
                } else {
                        tail->b_next = mp;
                }

                tail = mp;

                /*
                 * We need to make sure the bytes are copied to the
                 * correct offset to achieve 4-byte IP header
                 * alignment.
                 *
                 * If we start using desballoc on the buffers, then we
                 * will need to make sure to apply this offset to the
                 * DMA buffers as well. Though it may be the case the
                 * device does this implicitly and that's what
                 * cdesc->erc_offset is for; we don't know because
                 * it's not documented.
                 */
                mp->b_wptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
                mp->b_rptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
                bcopy(rcb->ercb_dma.edb_va + rcb->ercb_offset, mp->b_wptr,
                    rcb->ercb_length);
                mp->b_wptr += rcb->ercb_length;
                total_bytes += rcb->ercb_length;
                VERIFY3P(mp->b_wptr, >, mp->b_rptr);
                VERIFY3P(mp->b_wptr, <=, mp->b_datap->db_lim);

                l3proto = ENAHW_RX_CDESC_L3_PROTO(cdesc);
                l4proto = ENAHW_RX_CDESC_L4_PROTO(cdesc);

                /*
                 * When it comes to bad TCP/IP checksums we do not
                 * discard the packet at this level. Instead, we let
                 * it percolate up for further processing and tracking
                 * by the upstream TCP/IP stack.
                 */
                if (ena->ena_rx_l3_ipv4_csum &&
                    l3proto == ENAHW_IO_L3_PROTO_IPV4) {
                        bool l3_csum_err =
                            ENAHW_RX_CDESC_L3_CSUM_ERR(cdesc);

                        if (l3_csum_err) {
                                mutex_enter(&rxq->er_stat_lock);
                                rxq->er_stat.ers_hck_ipv4_err.value.ui64++;
                                mutex_exit(&rxq->er_stat_lock);
                        } else {
                                hflags |= HCK_IPV4_HDRCKSUM_OK;
                        }
                }

                l4csum_checked = ENAHW_RX_CDESC_L4_CSUM_CHECKED(cdesc);

                if (ena->ena_rx_l4_ipv4_csum && l4csum_checked &&
                    l4proto == ENAHW_IO_L4_PROTO_TCP) {
                        bool l4_csum_err =
                            ENAHW_RX_CDESC_L4_CSUM_ERR(cdesc);

                        if (l4_csum_err) {
                                mutex_enter(&rxq->er_stat_lock);
                                rxq->er_stat.ers_hck_l4_err.value.ui64++;
                                mutex_exit(&rxq->er_stat_lock);
                        } else {
                                hflags |= HCK_FULLCKSUM_OK;
                        }
                }

                if (hflags != 0) {
                        mac_hcksum_set(mp, 0, 0, 0, 0, hflags);
                }

next_desc:
                /*
                 * Technically, if we arrived here due to a failure,
                 * then we did not read a new frame. However, we count
                 * it all the same anyways in order to count it as
                 * progress to the interrupt work limit. The failure
                 * stats will allow us to differentiate good frames
                 * from bad.
                 */
                num_frames++;
                rxq->er_cq_head_idx++;
                head_mod = rxq->er_cq_head_idx & modulo_mask;
                if (head_mod == 0)
                        rxq->er_cq_phase ^= 1;

                if (polling && total_bytes > poll_bytes) {
                        break;
                } else if (!polling && num_frames >= rxq->er_intr_limit) {
                        mutex_enter(&rxq->er_stat_lock);
                        rxq->er_stat.ers_intr_limit.value.ui64++;
                        mutex_exit(&rxq->er_stat_lock);
                        break;
                }

                cdesc = &rxq->er_cq_descs[head_mod];
                VERIFY3P(cdesc, >=, rxq->er_cq_descs);
                VERIFY3P(cdesc, <=,
                    (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));
        }

        if (num_frames > 0) {
                mutex_enter(&rxq->er_stat_lock);
                rxq->er_stat.ers_packets.value.ui64 += num_frames;
                rxq->er_stat.ers_bytes.value.ui64 += total_bytes;
                mutex_exit(&rxq->er_stat_lock);

                DTRACE_PROBE5(rx__frames, ena_rxq_t *, rxq, mblk_t *, head,
                    bool, polling, uint64_t, num_frames, uint64_t, total_bytes);
                ena_refill_rx(rxq, num_frames);
        }

        return (head);
}

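/*
 * Interrupt-driven Rx: process completions without a byte budget and
 * pass any resulting mblk chain up to mac along with the ring's
 * generation number.
 */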
void
ena_rx_intr_work(ena_rxq_t *rxq)
{
        mblk_t *mp;

        mutex_enter(&rxq->er_lock);
        mp = ena_ring_rx(rxq, ENA_INTERRUPT_MODE);
        mutex_exit(&rxq->er_lock);

        if (mp == NULL) {
                return;
        }

        mac_rx_ring(rxq->er_ena->ena_mh, rxq->er_mrh, mp, rxq->er_m_gen_num);
}

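/*
 * mac polling entry point: process completions up to the requested
 * byte budget and return the mblk chain (or NULL) to the caller.
 */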
mblk_t *
ena_ring_rx_poll(void *rh, int poll_bytes)
{
        ena_rxq_t *rxq = rh;
        mblk_t *mp;

        ASSERT3S(poll_bytes, >, 0);

        mutex_enter(&rxq->er_lock);
        mp = ena_ring_rx(rxq, poll_bytes);
        mutex_exit(&rxq->er_lock);

        return (mp);
}