/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2024 Oxide Computer Company
 */

#include "ena.h"

static void
ena_refill_rx(ena_rxq_t *rxq, uint16_t num)
{
	VERIFY3P(rxq, !=, NULL);
	ASSERT(MUTEX_HELD(&rxq->er_lock));
	ASSERT3U(num, <=, rxq->er_sq_num_descs);

	uint16_t tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1);

	while (num != 0) {
		enahw_rx_desc_t *desc = &rxq->er_sq_descs[tail_mod];
		ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[tail_mod];
		uint16_t phase = rxq->er_sq_phase;

		VERIFY3U(tail_mod, <, rxq->er_sq_num_descs);
		VERIFY3P(desc, !=, NULL);
		VERIFY3P(rcb, !=, NULL);
		VERIFY3P(desc, >=, rxq->er_sq_descs);
		VERIFY3P(desc, <=,
		    (rxq->er_sq_descs + rxq->er_sq_num_descs - 1));

		desc->erd_length = rcb->ercb_dma.edb_len;
		desc->erd_req_id = tail_mod;
		VERIFY3P(rcb->ercb_dma.edb_cookie, !=, NULL);
		ena_set_dma_addr_values(rxq->er_ena,
		    rcb->ercb_dma.edb_cookie->dmac_laddress,
		    &desc->erd_buff_addr_lo, &desc->erd_buff_addr_hi);

		ENAHW_RX_DESC_CLEAR_CTRL(desc);
		ENAHW_RX_DESC_SET_PHASE(desc, phase);
		ENAHW_RX_DESC_SET_FIRST(desc);
		ENAHW_RX_DESC_SET_LAST(desc);
		ENAHW_RX_DESC_SET_COMP_REQ(desc);
		DTRACE_PROBE1(ena__refill__rx, enahw_rx_desc_t *, desc);

		rxq->er_sq_tail_idx++;
		tail_mod = rxq->er_sq_tail_idx & (rxq->er_sq_num_descs - 1);

		if (tail_mod == 0) {
			rxq->er_sq_phase ^= 1;
		}

		num--;
	}

	ENA_DMA_SYNC(rxq->er_sq_dma, DDI_DMA_SYNC_FORDEV);
	ena_hw_abs_write32(rxq->er_ena, rxq->er_sq_db_addr,
	    rxq->er_sq_tail_idx);
}

void
ena_free_rx_dma(ena_rxq_t *rxq)
{
	if (rxq->er_rcbs != NULL) {
		for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
			ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
			ena_dma_free(&rcb->ercb_dma);
		}

		kmem_free(rxq->er_rcbs,
		    sizeof (*rxq->er_rcbs) * rxq->er_sq_num_descs);
		rxq->er_rcbs = NULL;
	}

	ena_dma_free(&rxq->er_cq_dma);
	rxq->er_cq_descs = NULL;
	rxq->er_cq_num_descs = 0;

	ena_dma_free(&rxq->er_sq_dma);
	rxq->er_sq_descs = NULL;
	rxq->er_sq_num_descs = 0;

	rxq->er_state &= ~ENA_RXQ_STATE_HOST_ALLOC;
}

static int
ena_alloc_rx_dma(ena_rxq_t *rxq)
{
	ena_t *ena = rxq->er_ena;
	size_t cq_descs_sz;
	size_t sq_descs_sz;
	ena_dma_conf_t conf;
	int err = 0;

	cq_descs_sz = rxq->er_cq_num_descs * sizeof (*rxq->er_cq_descs);
	sq_descs_sz = rxq->er_sq_num_descs * sizeof (*rxq->er_sq_descs);

	/* BEGIN CSTYLED */
	conf = (ena_dma_conf_t) {
		.edc_size = sq_descs_sz,
		.edc_align = ENAHW_IO_SQ_DESC_BUF_ALIGNMENT,
		.edc_sgl = 1,
		.edc_endian = DDI_NEVERSWAP_ACC,
		.edc_stream = B_FALSE,
	};
	/* END CSTYLED */

	if (!ena_dma_alloc(ena, &rxq->er_sq_dma, &conf, sq_descs_sz)) {
		return (ENOMEM);
	}

	rxq->er_sq_descs = (void *)rxq->er_sq_dma.edb_va;
	rxq->er_rcbs = kmem_zalloc(sizeof (*rxq->er_rcbs) *
	    rxq->er_sq_num_descs, KM_SLEEP);

	for (uint_t i = 0; i < rxq->er_sq_num_descs; i++) {
		ena_rx_ctrl_block_t *rcb = &rxq->er_rcbs[i];
		ena_dma_conf_t buf_conf = {
			.edc_size = ena->ena_rx_buf_sz,
			.edc_align = 1,
			.edc_sgl = ena->ena_rx_sgl_max_sz,
			.edc_endian = DDI_NEVERSWAP_ACC,
			.edc_stream = B_TRUE,
		};

		if (!ena_dma_alloc(ena, &rcb->ercb_dma, &buf_conf,
		    ena->ena_rx_buf_sz)) {
			err = ENOMEM;
			goto error;
		}
	}
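
	/*
	 * Finally, allocate the ring of completion descriptors. Like
	 * the submission ring above, this is a single, non-streaming
	 * DMA allocation.
	 */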
	/* BEGIN CSTYLED */
	conf = (ena_dma_conf_t) {
		.edc_size = cq_descs_sz,
		.edc_align = ENAHW_IO_CQ_DESC_BUF_ALIGNMENT,
		.edc_sgl = 1,
		.edc_endian = DDI_NEVERSWAP_ACC,
		.edc_stream = B_FALSE,
	};
	/* END CSTYLED */

	if (!ena_dma_alloc(ena, &rxq->er_cq_dma, &conf, cq_descs_sz)) {
		err = ENOMEM;
		goto error;
	}

	rxq->er_cq_descs = (void *)rxq->er_cq_dma.edb_va;
	rxq->er_state |= ENA_RXQ_STATE_HOST_ALLOC;
	return (0);

error:
	ena_free_rx_dma(rxq);
	return (err);
}

boolean_t
ena_alloc_rxq(ena_rxq_t *rxq)
{
	int ret = 0;
	ena_t *ena = rxq->er_ena;
	uint16_t cq_hw_idx, sq_hw_idx;
	uint32_t *cq_unmask_addr, *cq_numanode;
	uint32_t *sq_db_addr;

	/*
	 * First, allocate the Rx data buffers.
	 */
	if ((ret = ena_alloc_rx_dma(rxq)) != 0) {
		ena_err(ena, "failed to allocate Rx queue %u data buffers: %d",
		    rxq->er_rxqs_idx, ret);
		return (B_FALSE);
	}

	ASSERT(rxq->er_state & ENA_RXQ_STATE_HOST_ALLOC);

	/*
	 * Second, create the Completion Queue.
	 */
	ret = ena_create_cq(ena, rxq->er_cq_num_descs,
	    rxq->er_cq_dma.edb_cookie->dmac_laddress, B_FALSE,
	    rxq->er_intr_vector, &cq_hw_idx, &cq_unmask_addr, &cq_numanode);

	if (ret != 0) {
		ena_err(ena, "failed to create Rx CQ %u: %d",
		    rxq->er_rxqs_idx, ret);
		return (B_FALSE);
	}

	/* The phase must always start on 1. */
	rxq->er_cq_phase = 1;
	rxq->er_cq_head_idx = 0;
	rxq->er_cq_hw_idx = cq_hw_idx;
	rxq->er_cq_unmask_addr = cq_unmask_addr;
	rxq->er_cq_numa_addr = cq_numanode;
	rxq->er_state |= ENA_RXQ_STATE_CQ_CREATED;

	/*
	 * Third, create the Submission Queue to match with the above
	 * CQ. At this time we force the SQ and CQ to have the same
	 * number of descriptors as we only use a 1:1 completion
	 * policy. However, in the future, we could loosen this and
	 * use an on-demand completion policy and the two could have a
	 * different number of descriptors.
	 */
	ASSERT3U(rxq->er_sq_num_descs, ==, rxq->er_cq_num_descs);
	ret = ena_create_sq(ena, rxq->er_sq_num_descs,
	    rxq->er_sq_dma.edb_cookie->dmac_laddress, B_FALSE, cq_hw_idx,
	    &sq_hw_idx, &sq_db_addr);

	if (ret != 0) {
		ena_err(ena, "failed to create Rx SQ %u: %d",
		    rxq->er_rxqs_idx, ret);
		return (B_FALSE);
	}

	ASSERT3P(sq_db_addr, !=, NULL);
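
	/*
	 * Record the hardware-assigned SQ index and doorbell address,
	 * then reset our software view of the ring to empty.
	 */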
	rxq->er_sq_hw_idx = sq_hw_idx;
	rxq->er_sq_db_addr = sq_db_addr;
	/* The phase must always start on 1. */
	rxq->er_sq_phase = 1;
	rxq->er_sq_tail_idx = 0;
	rxq->er_sq_avail_descs = rxq->er_sq_num_descs;
	rxq->er_mode = ENA_RXQ_MODE_INTR;
	rxq->er_state |= ENA_RXQ_STATE_SQ_CREATED;

	return (B_TRUE);
}

void
ena_cleanup_rxq(ena_rxq_t *rxq)
{
	int ret = 0;
	ena_t *ena = rxq->er_ena;

	if ((rxq->er_state & ENA_RXQ_STATE_SQ_CREATED) != 0) {
		ret = ena_destroy_sq(ena, rxq->er_sq_hw_idx, B_FALSE);

		if (ret != 0) {
			ena_err(ena, "failed to destroy Rx SQ %u: %d",
			    rxq->er_rxqs_idx, ret);
		}

		rxq->er_sq_hw_idx = 0;
		rxq->er_sq_db_addr = NULL;
		rxq->er_sq_tail_idx = 0;
		rxq->er_sq_phase = 0;
		rxq->er_state &= ~ENA_RXQ_STATE_SQ_CREATED;
		rxq->er_state &= ~ENA_RXQ_STATE_SQ_FILLED;
	}

	if ((rxq->er_state & ENA_RXQ_STATE_CQ_CREATED) != 0) {
		ret = ena_destroy_cq(ena, rxq->er_cq_hw_idx);

		if (ret != 0) {
			ena_err(ena, "failed to destroy Rx CQ %u: %d",
			    rxq->er_rxqs_idx, ret);
		}

		rxq->er_cq_hw_idx = 0;
		rxq->er_cq_head_idx = 0;
		rxq->er_cq_phase = 0;
		rxq->er_cq_unmask_addr = NULL;
		rxq->er_cq_numa_addr = NULL;
		rxq->er_state &= ~ENA_RXQ_STATE_CQ_CREATED;
	}

	ena_free_rx_dma(rxq);
	ASSERT3S(rxq->er_state, ==, ENA_RXQ_STATE_NONE);
}

void
ena_ring_rx_stop(mac_ring_driver_t rh)
{
	ena_rxq_t *rxq = (ena_rxq_t *)rh;
	uint32_t intr_ctrl;

	intr_ctrl = ena_hw_abs_read32(rxq->er_ena, rxq->er_cq_unmask_addr);
	ENAHW_REG_INTR_MASK(intr_ctrl);
	ena_hw_abs_write32(rxq->er_ena, rxq->er_cq_unmask_addr, intr_ctrl);

	rxq->er_state &= ~ENA_RXQ_STATE_RUNNING;
	rxq->er_state &= ~ENA_RXQ_STATE_READY;
}
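
/*
 * Start callback for an Rx ring: fill the Rx SQ with buffers (only once
 * per SQ creation), record the mac generation number and interrupt work
 * limit, and unmask this queue's interrupt.
 */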
int
ena_ring_rx_start(mac_ring_driver_t rh, uint64_t gen_num)
{
	ena_rxq_t *rxq = (ena_rxq_t *)rh;
	ena_t *ena = rxq->er_ena;
	uint32_t intr_ctrl;

	ena_dbg(ena, "ring_rx_start %p: state %x", rxq, rxq->er_state);

	mutex_enter(&rxq->er_lock);
	if ((rxq->er_state & ENA_RXQ_STATE_SQ_FILLED) == 0) {
		/*
		 * The ENA controller gets upset and sets the fatal error bit
		 * in its status register if we write a value to an RX SQ's
		 * doorbell that is past its current head. This makes sense as
		 * it would represent there being more descriptors available
		 * than can fit in the ring. For this reason, we make sure that
		 * we only fill the ring once, even if it is started multiple
		 * times.
		 * The `- 1` below is harder to explain. If we completely fill
		 * the SQ ring, then at some time later that seems to be
		 * independent of how many times we've been around the ring,
		 * the ENA controller will set the fatal error bit and stop
		 * responding. Leaving a gap prevents this somehow and it is
		 * what the other open source drivers do.
		 */
		ena_refill_rx(rxq, rxq->er_sq_num_descs - 1);
		rxq->er_state |= ENA_RXQ_STATE_SQ_FILLED;
	}
	rxq->er_m_gen_num = gen_num;
	rxq->er_intr_limit = ena->ena_rxq_intr_limit;
	mutex_exit(&rxq->er_lock);

	rxq->er_state |= ENA_RXQ_STATE_READY;

	intr_ctrl = ena_hw_abs_read32(ena, rxq->er_cq_unmask_addr);
	ENAHW_REG_INTR_UNMASK(intr_ctrl);
	ena_hw_abs_write32(ena, rxq->er_cq_unmask_addr, intr_ctrl);
	rxq->er_state |= ENA_RXQ_STATE_RUNNING;
	return (0);
}

mblk_t *
ena_ring_rx(ena_rxq_t *rxq, int poll_bytes)
{
	ena_t *ena = rxq->er_ena;
	uint16_t head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1);
	uint64_t total_bytes = 0;
	uint64_t num_frames = 0;
	enahw_rx_cdesc_t *cdesc;
	boolean_t polling = B_TRUE;
	mblk_t *head = NULL;
	mblk_t *tail = NULL;

	ASSERT(MUTEX_HELD(&rxq->er_lock));
	ENA_DMA_SYNC(rxq->er_cq_dma, DDI_DMA_SYNC_FORKERNEL);

	if (poll_bytes == ENA_INTERRUPT_MODE) {
		polling = B_FALSE;
	}

	cdesc = &rxq->er_cq_descs[head_mod];
	VERIFY3P(cdesc, >=, rxq->er_cq_descs);
	VERIFY3P(cdesc, <=, (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));

	while (ENAHW_RX_CDESC_PHASE(cdesc) == rxq->er_cq_phase) {
		boolean_t first, last;
		ena_rx_ctrl_block_t *rcb;
		uint16_t req_id;
		mblk_t *mp;
		enahw_io_l3_proto_t l3proto;
		enahw_io_l4_proto_t l4proto;
		boolean_t l4csum_checked;
		uint32_t hflags = 0;

		VERIFY3U(head_mod, <, rxq->er_cq_num_descs);

		/*
		 * Currently, all incoming frames fit in a single Rx
		 * buffer (erd_length > total frame size). In the
		 * future, if we decide to loan buffers which are
		 * smaller, we will need to modify this code to read
		 * one or more descriptors (based on frame size).
		 *
		 * For this reason we do not expect any frame to span
		 * multiple descriptors. Therefore, we drop any data
		 * not delivered as a single descriptor, i.e., where
		 * 'first' and 'last' are both true.
		 */
		first = ENAHW_RX_CDESC_FIRST(cdesc);
		last = ENAHW_RX_CDESC_LAST(cdesc);
		if (!first || !last) {
			mutex_enter(&rxq->er_stat_lock);
			rxq->er_stat.ers_multi_desc.value.ui64++;
			mutex_exit(&rxq->er_stat_lock);
			goto next_desc;
		}

		req_id = cdesc->erc_req_id;
		VERIFY3U(req_id, <, rxq->er_cq_num_descs);
		rcb = &rxq->er_rcbs[req_id];
		rcb->ercb_offset = cdesc->erc_offset;
		rcb->ercb_length = cdesc->erc_length;
		ASSERT3U(rcb->ercb_length, <=, ena->ena_max_frame_total);
		mp = allocb(rcb->ercb_length + ENA_RX_BUF_IPHDR_ALIGNMENT, 0);

		/*
		 * If we can't allocate an mblk, things are looking
		 * grim. Forget about this frame and move on.
		 */
		if (mp == NULL) {
			mutex_enter(&rxq->er_stat_lock);
			rxq->er_stat.ers_allocb_fail.value.ui64++;
			mutex_exit(&rxq->er_stat_lock);
			goto next_desc;
		}

		/*
		 * As we pull frames we need to link them together as
		 * one chain to be delivered up to mac.
		 */
		if (head == NULL) {
			head = mp;
		} else {
			tail->b_next = mp;
		}

		tail = mp;

		/*
		 * We need to make sure the bytes are copied to the
		 * correct offset to achieve 4-byte IP header
		 * alignment.
		 *
		 * If we start using desballoc on the buffers, then we
		 * will need to make sure to apply this offset to the
		 * DMA buffers as well. Though it may be the case the
		 * device does this implicitly and that's what
		 * cdesc->erc_offset is for; we don't know because
		 * it's not documented.
		 */
		mp->b_wptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
		mp->b_rptr += ENA_RX_BUF_IPHDR_ALIGNMENT;
		bcopy(rcb->ercb_dma.edb_va + rcb->ercb_offset, mp->b_wptr,
		    rcb->ercb_length);
		mp->b_wptr += rcb->ercb_length;
		total_bytes += rcb->ercb_length;
		VERIFY3P(mp->b_wptr, >, mp->b_rptr);
		VERIFY3P(mp->b_wptr, <=, mp->b_datap->db_lim);

		l3proto = ENAHW_RX_CDESC_L3_PROTO(cdesc);
		l4proto = ENAHW_RX_CDESC_L4_PROTO(cdesc);

		/*
		 * When it comes to bad TCP/IP checksums we do not
		 * discard the packet at this level. Instead, we let
		 * it percolate up for further processing and tracking
		 * by the upstream TCP/IP stack.
		 */
		if (ena->ena_rx_l3_ipv4_csum &&
		    l3proto == ENAHW_IO_L3_PROTO_IPV4) {
			boolean_t l3_csum_err =
			    ENAHW_RX_CDESC_L3_CSUM_ERR(cdesc);

			if (l3_csum_err) {
				mutex_enter(&rxq->er_stat_lock);
				rxq->er_stat.ers_hck_ipv4_err.value.ui64++;
				mutex_exit(&rxq->er_stat_lock);
			} else {
				hflags |= HCK_IPV4_HDRCKSUM_OK;
			}
		}

		l4csum_checked = ENAHW_RX_CDESC_L4_CSUM_CHECKED(cdesc);

		if (ena->ena_rx_l4_ipv4_csum && l4csum_checked &&
		    l4proto == ENAHW_IO_L4_PROTO_TCP) {
			boolean_t l4_csum_err =
			    ENAHW_RX_CDESC_L4_CSUM_ERR(cdesc);

			if (l4_csum_err) {
				mutex_enter(&rxq->er_stat_lock);
				rxq->er_stat.ers_hck_l4_err.value.ui64++;
				mutex_exit(&rxq->er_stat_lock);
			} else {
				hflags |= HCK_FULLCKSUM_OK;
			}
		}

		if (hflags != 0) {
			mac_hcksum_set(mp, 0, 0, 0, 0, hflags);
		}

next_desc:
		/*
		 * Technically, if we arrived here due to a failure,
		 * then we did not read a new frame. However, we count
		 * it all the same anyways in order to count it as
		 * progress to the interrupt work limit. The failure
		 * stats will allow us to differentiate good frames
		 * from bad.
		 */
		num_frames++;
		rxq->er_cq_head_idx++;
		head_mod = rxq->er_cq_head_idx & (rxq->er_cq_num_descs - 1);

		if (head_mod == 0) {
			rxq->er_cq_phase ^= 1;
		}

		if (polling && (total_bytes > poll_bytes)) {
			break;
		} else if (!polling && (num_frames >= rxq->er_intr_limit)) {
			mutex_enter(&rxq->er_stat_lock);
			rxq->er_stat.ers_intr_limit.value.ui64++;
			mutex_exit(&rxq->er_stat_lock);
			break;
		}

		cdesc = &rxq->er_cq_descs[head_mod];
		VERIFY3P(cdesc, >=, rxq->er_cq_descs);
		VERIFY3P(cdesc, <=,
		    (rxq->er_cq_descs + rxq->er_cq_num_descs - 1));
	}

	if (num_frames > 0) {
		mutex_enter(&rxq->er_stat_lock);
		rxq->er_stat.ers_packets.value.ui64 += num_frames;
		rxq->er_stat.ers_bytes.value.ui64 += total_bytes;
		mutex_exit(&rxq->er_stat_lock);

		DTRACE_PROBE4(rx__frames, mblk_t *, head, boolean_t, polling,
		    uint64_t, num_frames, uint64_t, total_bytes);
		ena_refill_rx(rxq, num_frames);
	}

	return (head);
}

void
ena_rx_intr_work(ena_rxq_t *rxq)
{
	mblk_t *mp;

	mutex_enter(&rxq->er_lock);
	mp = ena_ring_rx(rxq, ENA_INTERRUPT_MODE);
	mutex_exit(&rxq->er_lock);

	if (mp == NULL) {
		return;
	}

	mac_rx_ring(rxq->er_ena->ena_mh, rxq->er_mrh, mp, rxq->er_m_gen_num);
}

mblk_t *
ena_ring_rx_poll(void *rh, int poll_bytes)
{
	ena_rxq_t *rxq = rh;
	mblk_t *mp;

	ASSERT3S(poll_bytes, >, 0);

	mutex_enter(&rxq->er_lock);
	mp = ena_ring_rx(rxq, poll_bytes);
	mutex_exit(&rxq->er_lock);

	return (mp);
}