1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * tavor_cq.c
29 * Tavor Completion Queue Processing Routines
30 *
31 * Implements all the routines necessary for allocating, freeing, resizing,
32 * and handling the completion type events that the Tavor hardware can
33 * generate.
34 */
35
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/bitmap.h>
42 #include <sys/sysmacros.h>
43
44 #include <sys/ib/adapters/tavor/tavor.h>
45
46 static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
47 uint32_t cqn, uint32_t cq_param);
48 #pragma inline(tavor_cq_doorbell)
49 static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
50 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
51 static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
52 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
53 static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
54 uint_t flag);
55 static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
56 uint32_t old_cons_indx, uint32_t num_newcqe);
57
58 /*
59 * tavor_cq_alloc()
60 * Context: Can be called only from user or kernel context.
61 */
62 int
63 tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
64 ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
65 uint_t sleepflag)
66 {
67 tavor_rsrc_t *cqc, *rsrc;
68 tavor_umap_db_entry_t *umapdb;
69 tavor_hw_cqc_t cqc_entry;
70 tavor_cqhdl_t cq;
71 ibt_mr_attr_t mr_attr;
72 tavor_mr_options_t op;
73 tavor_pdhdl_t pd;
74 tavor_mrhdl_t mr;
75 tavor_hw_cqe_t *buf;
76 uint64_t addr, value;
77 uint32_t log_cq_size, lkey, uarpg;
78 uint_t dma_xfer_mode, cq_sync, cq_is_umap;
79 int status, i, flag;
80
81 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))
82
83 /*
84 * Determine whether CQ is being allocated for userland access or
85 * whether it is being allocated for kernel access. If the CQ is
86 * being allocated for userland access, then lookup the UAR doorbell
87 * page number for the current process. Note: If this is not found
88 * (e.g. if the process has not previously open()'d the Tavor driver),
89 * then an error is returned.
90 */
91 cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
92 if (cq_is_umap) {
93 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
94 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
95 if (status != DDI_SUCCESS) {
96 goto cqalloc_fail;
97 }
98 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
99 }
100
101 /* Use the internal protection domain (PD) for setting up CQs */
102 pd = state->ts_pdhdl_internal;
103
104 /* Increment the reference count on the protection domain (PD) */
105 tavor_pd_refcnt_inc(pd);
106
107 /*
108 * Allocate a CQ context entry. This will be filled in with all
109 * the necessary parameters to define the Completion Queue. And then
110 * ownership will be passed to the hardware in the final step
111 * below. If we fail here, we must undo the protection domain
112 * reference count.
113 */
114 status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
115 if (status != DDI_SUCCESS) {
116 goto cqalloc_fail1;
117 }
118
119 /*
120 * Allocate the software structure for tracking the completion queue
121 * (i.e. the Tavor Completion Queue handle). If we fail here, we must
122 * undo the protection domain reference count and the previous
123 * resource allocation.
124 */
125 status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
126 if (status != DDI_SUCCESS) {
127 goto cqalloc_fail2;
128 }
129 cq = (tavor_cqhdl_t)rsrc->tr_addr;
130 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
131 cq->cq_is_umap = cq_is_umap;
132
133 /* Use the index as CQ number */
134 cq->cq_cqnum = cqc->tr_indx;
135
136 /*
137 * If this will be a user-mappable CQ, then allocate an entry for
138 * the "userland resources database". This will later be added to
139 * the database (after all further CQ operations are successful).
140 * If we fail here, we must undo the reference counts and the
141 * previous resource allocation.
142 */
143 if (cq->cq_is_umap) {
144 umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
145 MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
146 if (umapdb == NULL) {
147 goto cqalloc_fail3;
148 }
149 }
150
151 /*
152 * Calculate the appropriate size for the completion queue.
153 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
154 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
155 * to round the requested size up to the next highest power-of-2
156 */
157 cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
158 log_cq_size = highbit(cq_attr->cq_size);
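/*
 * For illustration (hypothetical request sizes): a request for 1000
 * CQEs gives highbit(1000) == 10, so 1024 entries are allocated and
 * the usable size reported back to the caller (below) is 1023.  A
 * request for exactly 1024 rounds up to 2048 entries (reported as
 * 2047), since only a power-of-2 minus 1 is usable.
 */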
159
160 /*
161 * Next we verify that the rounded-up size is valid (i.e. consistent
162 * with the device limits and/or software-configured limits)
163 */
164 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
165 goto cqalloc_fail4;
166 }
167
168 /*
169 * Allocate the memory for Completion Queue.
170 *
171 * Note: Although we use the common queue allocation routine, we
172 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
173 * kernel system memory) for kernel CQs because it would be
174 * inefficient to have CQs located in DDR memory. This is primarily
175 * because CQs are read from (by software) more than they are written
176 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
177 * user-mappable CQs for a similar reason.)
178 * It is also worth noting that, unlike Tavor QP work queues,
179 * completion queues do not have the same strict alignment
180 * requirements. It is sufficient for the CQ memory to be both
181 * aligned to and bound to addresses which are a multiple of CQE size.
182 */
183 cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
184 cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
185 cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
186 if (cq->cq_is_umap) {
187 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
188 } else {
189 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
190 }
191 status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
192 if (status != DDI_SUCCESS) {
193 goto cqalloc_fail4;
194 }
195 buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
196 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
197
198 /*
199 * Initialize each of the Completion Queue Entries (CQE) by setting
200 * their ownership to hardware ("owner" bit set to HW). This is in
201 * preparation for the final transfer of ownership (below) of the
202 * CQ context itself.
203 */
204 for (i = 0; i < (1 << log_cq_size); i++) {
205 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
206 }
207
208 /*
209 * Register the memory for the CQ. The memory for the CQ must
210 * be registered in the Tavor TPT tables. This gives us the LKey
211 * to specify in the CQ context below. Note: If this is a user-
212 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
213 */
214 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
215 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
216 mr_attr.mr_len = cq->cq_cqinfo.qa_size;
217 mr_attr.mr_as = NULL;
218 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
219 if (cq->cq_is_umap) {
220 dma_xfer_mode = DDI_DMA_CONSISTENT;
221 } else {
222 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
223 }
224 if (dma_xfer_mode == DDI_DMA_STREAMING) {
225 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
226 }
227 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
228 op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
229 op.mro_bind_override_addr = 0;
230 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
231 if (status != DDI_SUCCESS) {
232 goto cqalloc_fail5;
233 }
234 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
235 addr = mr->mr_bindinfo.bi_addr;
236 lkey = mr->mr_lkey;
237
238 /* Determine if later ddi_dma_sync will be necessary */
239 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);
240
241 /* Sync entire CQ for use by the hardware (if necessary). */
242 if (cq_sync) {
243 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
244 cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
245 }
246
247 /*
248 * Fill in the CQC entry. This is the final step before passing
249 * ownership of the CQC entry to the Tavor hardware. We use all of
250 * the information collected/calculated above to fill in the
251 * requisite portions of the CQC. Note: If this CQ is going to be
252 * used for userland access, then we need to set the UAR page number
253 * appropriately (otherwise it's a "don't care")
254 */
255 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
256 cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
257 cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
258 cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED;
259 cqc_entry.state = TAVOR_CQ_DISARMED;
260 cqc_entry.start_addr_h = (addr >> 32);
261 cqc_entry.start_addr_l = (addr & 0xFFFFFFFF);
262 cqc_entry.log_cq_sz = log_cq_size;
263 if (cq->cq_is_umap) {
264 cqc_entry.usr_page = uarpg;
265 } else {
266 cqc_entry.usr_page = 0;
267 }
268 cqc_entry.pd = pd->pd_pdnum;
269 cqc_entry.lkey = lkey;
270 cqc_entry.e_eqn = cq->cq_erreqnum;
271 cqc_entry.c_eqn = cq->cq_eqnum;
272 cqc_entry.cqn = cq->cq_cqnum;
273
274 /*
275 * Write the CQC entry to hardware. Lastly, we pass ownership of
276 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
277 * command). Note: In general, this operation shouldn't fail. But
278 * if it does, we have to undo everything we've done above before
279 * returning error.
280 */
281 status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
282 sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
283 if (status != TAVOR_CMD_SUCCESS) {
284 cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
285 status);
286 goto cqalloc_fail6;
287 }
288
289 /*
290 * Fill in the rest of the Tavor Completion Queue handle. Having
291 * successfully transferred ownership of the CQC, we can update the
292 * following fields for use in further operations on the CQ.
293 */
294 cq->cq_cqcrsrcp = cqc;
295 cq->cq_rsrcp = rsrc;
296 cq->cq_consindx = 0;
297 cq->cq_buf = buf;
298 cq->cq_bufsz = (1 << log_cq_size);
299 cq->cq_mrhdl = mr;
300 cq->cq_sync = cq_sync;
301 cq->cq_refcnt = 0;
302 cq->cq_is_special = 0;
303 cq->cq_uarpg = uarpg;
304 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
305 avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
306 sizeof (struct tavor_workq_hdr_s),
307 offsetof(struct tavor_workq_hdr_s, wq_avl_link));
308
309 cq->cq_wrid_reap_head = NULL;
310 cq->cq_wrid_reap_tail = NULL;
311 cq->cq_hdlrarg = (void *)ibt_cqhdl;
312
313 /*
314 * Put CQ handle in Tavor CQNum-to-CQHdl list. Then fill in the
315 * "actual_size" and "cqhdl" and return success
316 */
317 ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
318 state->ts_cqhdl[cqc->tr_indx] = cq;
319
320 /*
321 * If this is a user-mappable CQ, then we need to insert the previously
322 * allocated entry into the "userland resources database". This will
323 * allow for later lookup during devmap() (i.e. mmap()) calls.
324 */
325 if (cq->cq_is_umap) {
326 tavor_umap_db_add(umapdb);
327 }
328
329 /*
330 * Fill in the return arguments (if necessary). This includes the
331 * real completion queue size.
332 */
333 if (actual_size != NULL) {
334 *actual_size = (1 << log_cq_size) - 1;
335 }
336 *cqhdl = cq;
337
338 return (DDI_SUCCESS);
339
340 /*
341 * The following is cleanup for all possible failure cases in this routine
342 */
343 cqalloc_fail6:
344 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
345 sleepflag) != DDI_SUCCESS) {
346 TAVOR_WARNING(state, "failed to deregister CQ memory");
347 }
348 cqalloc_fail5:
349 tavor_queue_free(state, &cq->cq_cqinfo);
350 cqalloc_fail4:
351 if (cq_is_umap) {
352 tavor_umap_db_free(umapdb);
353 }
354 cqalloc_fail3:
355 tavor_rsrc_free(state, &rsrc);
356 cqalloc_fail2:
357 tavor_rsrc_free(state, &cqc);
358 cqalloc_fail1:
359 tavor_pd_refcnt_dec(pd);
360 cqalloc_fail:
361 return (status);
362 }
363
364
365 /*
366 * tavor_cq_free()
367 * Context: Can be called only from user or kernel context.
368 */
369 /* ARGSUSED */
370 int
371 tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
372 {
373 tavor_rsrc_t *cqc, *rsrc;
374 tavor_umap_db_entry_t *umapdb;
375 tavor_hw_cqc_t cqc_entry;
376 tavor_pdhdl_t pd;
377 tavor_mrhdl_t mr;
378 tavor_cqhdl_t cq;
379 uint32_t cqnum;
380 uint64_t value;
381 uint_t maxprot;
382 int status;
383
384 /*
385 * Pull all the necessary information from the Tavor Completion Queue
386 * handle. This is necessary here because the resource for the
387 * CQ handle is going to be freed up as part of this operation.
388 */
389 cq = *cqhdl;
390 mutex_enter(&cq->cq_lock);
391 cqc = cq->cq_cqcrsrcp;
392 rsrc = cq->cq_rsrcp;
393 pd = state->ts_pdhdl_internal;
394 mr = cq->cq_mrhdl;
395 cqnum = cq->cq_cqnum;
396
397 /*
398 * If there are work queues still associated with the CQ, then return
399 * an error. Otherwise, we will be holding the CQ lock.
400 */
401 if (cq->cq_refcnt != 0) {
402 mutex_exit(&cq->cq_lock);
403 return (IBT_CQ_BUSY);
404 }
405
406 /*
407 * If this was a user-mappable CQ, then we need to remove its entry
408 * from the "userland resources database". If it is also currently
409 * mmap()'d out to a user process, then we need to call
410 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
411 * We also need to invalidate the CQ tracking information for the
412 * user mapping.
413 */
414 if (cq->cq_is_umap) {
415 status = tavor_umap_db_find(state->ts_instance, cqnum,
416 MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
417 &umapdb);
418 if (status != DDI_SUCCESS) {
419 mutex_exit(&cq->cq_lock);
420 TAVOR_WARNING(state, "failed to find in database");
421 return (ibc_get_ci_failure(0));
422 }
423 tavor_umap_db_free(umapdb);
424 if (cq->cq_umap_dhp != NULL) {
425 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
426 status = devmap_devmem_remap(cq->cq_umap_dhp,
427 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
428 maxprot, DEVMAP_MAPPING_INVALID, NULL);
429 if (status != DDI_SUCCESS) {
430 mutex_exit(&cq->cq_lock);
431 TAVOR_WARNING(state, "failed in CQ memory "
432 "devmap_devmem_remap()");
433 return (ibc_get_ci_failure(0));
434 }
435 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
436 }
437 }
438
439 /*
440 * Put NULL into the Tavor CQNum-to-CQHdl list. This will allow any
441 * in-progress events to detect that the CQ corresponding to this
442 * number has been freed.
443 */
444 state->ts_cqhdl[cqc->tr_indx] = NULL;
445
446 /*
447 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
448 * list. This cleans up all the structures associated with the WRID
449 * processing for this CQ. Once we complete, drop the lock and finish
450 * the deallocation of the CQ.
451 */
452 tavor_wrid_cq_force_reap(cq);
453
454 mutex_exit(&cq->cq_lock);
455 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
456
457 /*
458 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
459 * firmware command). If the ownership transfer fails for any reason,
460 * then it is an indication that something (either in HW or SW) has
461 * gone seriously wrong.
462 */
463 status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
464 sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
465 if (status != TAVOR_CMD_SUCCESS) {
466 TAVOR_WARNING(state, "failed to reclaim CQC ownership");
467 cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
468 status);
469 return (ibc_get_ci_failure(0));
470 }
471
472 /*
473 * Deregister the memory for the Completion Queue. If this fails
474 * for any reason, then it is an indication that something (either
475 * in HW or SW) has gone seriously wrong. So we print a warning
476 * message and return.
477 */
478 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
479 sleepflag);
480 if (status != DDI_SUCCESS) {
481 TAVOR_WARNING(state, "failed to deregister CQ memory");
482 return (ibc_get_ci_failure(0));
483 }
484
485 /* Free the memory for the CQ */
486 tavor_queue_free(state, &cq->cq_cqinfo);
487
488 /* Free the Tavor Completion Queue handle */
489 tavor_rsrc_free(state, &rsrc);
490
491 /* Free up the CQC entry resource */
492 tavor_rsrc_free(state, &cqc);
493
494 /* Decrement the reference count on the protection domain (PD) */
495 tavor_pd_refcnt_dec(pd);
496
497 /* Set the cqhdl pointer to NULL and return success */
498 *cqhdl = NULL;
499
500 return (DDI_SUCCESS);
501 }
502
503
504 /*
505 * tavor_cq_resize()
506 * Context: Can be called only from user or kernel context.
507 */
508 int
509 tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
510 uint_t *actual_size, uint_t sleepflag)
511 {
512 tavor_hw_cqc_t cqc_entry;
513 tavor_qalloc_info_t new_cqinfo, old_cqinfo;
514 ibt_mr_attr_t mr_attr;
515 tavor_mr_options_t op;
516 tavor_pdhdl_t pd;
517 tavor_mrhdl_t mr, mr_old;
518 tavor_hw_cqe_t *buf;
519 uint32_t new_prod_indx, old_cons_indx;
520 uint_t dma_xfer_mode, cq_sync, log_cq_size, maxprot;
521 int status, i, flag;
522
523 /* Use the internal protection domain (PD) for CQs */
524 pd = state->ts_pdhdl_internal;
525
526 /*
527 * Calculate the appropriate size for the new resized completion queue.
528 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
529 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
530 * to round the requested size up to the next highest power-of-2
531 */
532 req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
533 log_cq_size = highbit(req_size);
534
535 /*
536 * Next we verify that the rounded-up size is valid (i.e. consistent
537 * with the device limits and/or software-configured limits)
538 */
539 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
540 goto cqresize_fail;
541 }
542
543 /*
544 * Allocate the memory for newly resized Completion Queue.
545 *
546 * Note: Although we use the common queue allocation routine, we
547 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
548 * kernel system memory) for kernel CQs because it would be
549 * inefficient to have CQs located in DDR memory. This is the same
550 * as we do when we first allocate completion queues primarily
551 * because CQs are read from (by software) more than they are written
552 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
553 * user-mappable CQs for a similar reason.)
554 * It is also worth noting that, unlike Tavor QP work queues,
555 * completion queues do not have the same strict alignment
556 * requirements. It is sufficient for the CQ memory to be both
557 * aligned to and bound to addresses which are a multiple of CQE size.
558 */
559 new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
560 new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
561 new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
562 if (cq->cq_is_umap) {
563 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
564 } else {
565 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
566 }
567 status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
568 if (status != DDI_SUCCESS) {
569 goto cqresize_fail;
570 }
571 buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
572 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
573
574 /*
575 * Initialize each of the Completion Queue Entries (CQE) by setting
576 * their ownership to hardware ("owner" bit set to HW). This is in
577 * preparation for the final resize operation (below).
578 */
579 for (i = 0; i < (1 << log_cq_size); i++) {
580 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
581 }
582
583 /*
584 * Register the memory for the CQ. The memory for the CQ must
585 * be registered in the Tavor TPT tables. This gives us the LKey
586 * to specify in the CQ context below.
587 */
588 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
589 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
590 mr_attr.mr_len = new_cqinfo.qa_size;
591 mr_attr.mr_as = NULL;
592 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
593 if (cq->cq_is_umap) {
594 dma_xfer_mode = DDI_DMA_CONSISTENT;
595 } else {
596 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
597 }
598 if (dma_xfer_mode == DDI_DMA_STREAMING) {
599 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
600 }
601 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
602 op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
603 op.mro_bind_override_addr = 0;
604 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
605 if (status != DDI_SUCCESS) {
606 tavor_queue_free(state, &new_cqinfo);
607 goto cqresize_fail;
608 }
609 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
610
611 /* Determine if later ddi_dma_sync will be necessary */
612 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);
613
614 /* Sync entire "new" CQ for use by hardware (if necessary) */
615 if (cq_sync) {
616 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
617 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
618 }
619
620 /*
621 * Now we grab the CQ lock. Since we will be updating the actual
622 * CQ location and the producer/consumer indexes, we should hold
623 * the lock.
624 *
625 * We do a TAVOR_NOSLEEP here (and below), though, because we are
626 * holding the "cq_lock" and if we got raised to interrupt level
627 * by priority inversion, we would not want to block in this routine
628 * waiting for success.
629 */
630 mutex_enter(&cq->cq_lock);
631
632 /*
633 * Determine the current CQ "consumer index".
634 *
635 * Note: This will depend on whether the CQ had previously been
636 * mapped for user access or whether it is a kernel CQ. If this
637 * is a kernel CQ, then all PollCQ() operations have come through
638 * the IBTF and, hence, the driver's CQ state structure will
639 * contain the current consumer index. If, however, the user has
640 * accessed this CQ by bypassing the driver (OS-bypass), then we
641 * need to query the firmware to determine the current CQ consumer
642 * index. This also assumes that the user process will not continue
643 * to consume entries while at the same time doing the ResizeCQ()
644 * operation. If the user process does not guarantee this, then it
645 * may see duplicate or missed completions. But under no
646 * circumstances should this panic the system.
647 */
648 if (cq->cq_is_umap) {
649 status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
650 cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
651 TAVOR_NOSLEEP);
652 if (status != TAVOR_CMD_SUCCESS) {
653 /* Query CQ has failed, drop CQ lock and cleanup */
654 mutex_exit(&cq->cq_lock);
655 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
656 sleepflag) != DDI_SUCCESS) {
657 TAVOR_WARNING(state, "failed to deregister "
658 "CQ memory");
659 }
660 tavor_queue_free(state, &new_cqinfo);
661 TAVOR_WARNING(state, "failed to query CQ context");
662
663 goto cqresize_fail;
664 }
665 old_cons_indx = cqc_entry.cons_indx;
666 } else {
667 old_cons_indx = cq->cq_consindx;
668 }
669
670 /*
671 * Fill in the CQC entry. For the resize operation this is the
672 * final step before attempting the resize operation on the CQC entry.
673 * We use all of the information collected/calculated above to fill
674 * in the requisite portions of the CQC.
675 */
676 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
677 cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32);
678 cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
679 cqc_entry.log_cq_sz = log_cq_size;
680 cqc_entry.lkey = mr->mr_lkey;
681
682 /*
683 * Write the CQC entry to hardware. Lastly, we pass ownership of
684 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
685 * command). Note: In general, this operation shouldn't fail. But
686 * if it does, we have to undo everything we've done above before
687 * returning error. Also note that the status returned may indicate
688 * the code to return to the IBTF.
689 */
690 status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
691 &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
692 if (status != TAVOR_CMD_SUCCESS) {
693 /* Resize attempt has failed, drop CQ lock and cleanup */
694 mutex_exit(&cq->cq_lock);
695 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
696 sleepflag) != DDI_SUCCESS) {
697 TAVOR_WARNING(state, "failed to deregister CQ memory");
698 }
699 tavor_queue_free(state, &new_cqinfo);
700 if (status == TAVOR_CMD_BAD_SIZE) {
701 return (IBT_CQ_SZ_INSUFFICIENT);
702 } else {
703 cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
704 "%08x\n", status);
705 return (ibc_get_ci_failure(0));
706 }
707 }
708
709 /*
710 * The CQ resize attempt was successful. Before dropping the CQ lock,
711 * copy all of the CQEs from the "old" CQ into the "new" CQ. Note:
712 * the Tavor firmware guarantees us that sufficient space is set aside
713 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
714 * The two parameters to this helper function ("old_cons_indx" and
715 * "new_prod_indx") essentially indicate the starting index and number
716 * of any CQEs that might remain in the "old" CQ memory.
717 */
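/*
 * For illustration (hypothetical indices): if the "old" CQ's consumer
 * index was 10 and the RESIZE_CQ command reported a new producer index
 * of 3, the helper copies the three un-polled CQEs at old indices 10,
 * 11, and 12 into slots 0, 1, and 2 of the new buffer; the consumer
 * index is then restarted at 0 (below).
 */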
718 tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);
719
720 /* Sync entire "new" CQ for use by hardware (if necessary) */
721 if (cq_sync) {
722 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
723 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
724 }
725
726 /*
727 * Update the Tavor Completion Queue handle with all the new
728 * information. At the same time, save away all the necessary
729 * information for freeing up the old resources
730 */
731 mr_old = cq->cq_mrhdl;
732 old_cqinfo = cq->cq_cqinfo;
733 cq->cq_cqinfo = new_cqinfo;
734 cq->cq_consindx = 0;
735 cq->cq_buf = buf;
736 cq->cq_bufsz = (1 << log_cq_size);
737 cq->cq_mrhdl = mr;
738 cq->cq_sync = cq_sync;
739
740 /*
741 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
742 * to a user process, then we need to call devmap_devmem_remap() to
743 * invalidate the mapping to the CQ memory. We also need to
744 * invalidate the CQ tracking information for the user mapping.
745 */
746 if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
747 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
748 status = devmap_devmem_remap(cq->cq_umap_dhp,
749 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
750 DEVMAP_MAPPING_INVALID, NULL);
751 if (status != DDI_SUCCESS) {
752 mutex_exit(&cq->cq_lock);
753 TAVOR_WARNING(state, "failed in CQ memory "
754 "devmap_devmem_remap()");
755 return (ibc_get_ci_failure(0));
756 }
757 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
758 }
759
760 /*
761 * Drop the CQ lock now. The only thing left to do is to free up
762 * the old resources.
763 */
764 mutex_exit(&cq->cq_lock);
765
766 /*
767 * Deregister the memory for the old Completion Queue. Note: We
768 * really can't return error here because we have no good way to
769 * cleanup. Plus, a deregistration failure really shouldn't ever happen.
770 * So, if it does, it is an indication that something has gone
771 * seriously wrong. So we print a warning message and return error
772 * (knowing, of course, that the "old" CQ memory will be leaked)
773 */
774 status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
775 sleepflag);
776 if (status != DDI_SUCCESS) {
777 TAVOR_WARNING(state, "failed to deregister old CQ memory");
778 goto cqresize_fail;
779 }
780
781 /* Free the memory for the old CQ */
782 tavor_queue_free(state, &old_cqinfo);
783
784 /*
785 * Fill in the return arguments (if necessary). This includes the
786 * real new completion queue size.
787 */
788 if (actual_size != NULL) {
789 *actual_size = (1 << log_cq_size) - 1;
790 }
791
792 return (DDI_SUCCESS);
793
794 cqresize_fail:
795 return (status);
796 }
797
798
799 /*
800 * tavor_cq_notify()
801 * Context: Can be called from interrupt or base context.
802 */
803 int
804 tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
805 ibt_cq_notify_flags_t flags)
806 {
807 uint_t cqnum;
808
809 /*
810 * Determine if we are trying to get the next completion or the next
811 * "solicited" completion. Then hit the appropriate doorbell.
812 *
813 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
814 * regarding why we do not have to do an extra PIO read here, and we
815 * will not lose an event after writing this doorbell.
816 */
817 cqnum = cq->cq_cqnum;
818 if (flags == IBT_NEXT_COMPLETION) {
819 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
820 TAVOR_CQDB_DEFAULT_PARAM);
821
822 } else if (flags == IBT_NEXT_SOLICITED) {
823 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
824 cqnum, TAVOR_CQDB_DEFAULT_PARAM);
825
826 } else {
827 return (IBT_CQ_NOTIFY_TYPE_INVALID);
828 }
829
830 return (DDI_SUCCESS);
831 }
832
833
834 /*
835 * tavor_cq_poll()
836 * Context: Can be called from interrupt or base context.
837 */
838 int
839 tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
840 uint_t num_wc, uint_t *num_polled)
841 {
842 tavor_hw_cqe_t *cqe;
843 uint32_t cons_indx, wrap_around_mask;
844 uint32_t polled_cnt, num_to_increment;
845 int status;
846
847 /*
848 * Check for user-mappable CQ memory. Note: We do not allow kernel
849 * clients to poll CQ memory that is accessible directly by the user.
850 * If the CQ memory is user accessible, then return an error.
851 */
852 if (cq->cq_is_umap) {
853 return (IBT_CQ_HDL_INVALID);
854 }
855
856 mutex_enter(&cq->cq_lock);
857
858 /* Get the consumer index */
859 cons_indx = cq->cq_consindx;
860
861 /*
862 * Calculate the wrap around mask. Note: This operation only works
863 * because all Tavor completion queues have power-of-2 sizes
864 */
865 wrap_around_mask = (cq->cq_bufsz - 1);
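/*
 * For example, with a cq_bufsz of 64 the mask is 0x3F, so incrementing
 * the consumer index from 63 yields (63 + 1) & 0x3F == 0.
 */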
866
867 /* Calculate the pointer to the first CQ entry */
868 cqe = &cq->cq_buf[cons_indx];
869
870 /* Sync the current CQE to read */
871 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
872
873 /*
874 * Keep pulling entries from the CQ until we find an entry owned by
875 * the hardware. As long as the CQEs are owned by SW, process
876 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
877 * consumer index. Note: We only update the consumer index if
878 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB. Otherwise,
879 * it indicates that we are going to "recycle" the CQE (probably
880 * because it is an error CQE and corresponds to more than one
881 * completion).
882 */
883 polled_cnt = 0;
884 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
885 status = tavor_cq_cqe_consume(state, cq, cqe,
886 &wc_p[polled_cnt++]);
887 if (status == TAVOR_CQ_SYNC_AND_DB) {
888 /* Reset entry to hardware ownership */
889 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
890
891 /* Sync the current CQE for device */
892 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);
893
894 /* Increment the consumer index */
895 cons_indx = (cons_indx + 1) & wrap_around_mask;
896
897 /* Update the pointer to the next CQ entry */
898 cqe = &cq->cq_buf[cons_indx];
899
900 /* Sync the next CQE to read */
901 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
902 }
903
904 /*
905 * If we have run out of space to store work completions,
906 * then stop and return the ones we have pulled off the CQ.
907 */
908 if (polled_cnt >= num_wc) {
909 break;
910 }
911 }
912
913 /*
914 * Now we only ring the doorbell (to update the consumer index) if
915 * we've actually consumed a CQ entry. If we have, for example,
916 * pulled from a CQE that we are still in the process of "recycling"
917 * for error purposes, then we would not update the consumer index.
918 */
919 if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
920 /*
921 * Post doorbell to update the consumer index. Doorbell
922 * value indicates number of entries consumed (minus 1)
923 */
924 if (cons_indx > cq->cq_consindx) {
925 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
926 } else {
927 num_to_increment = ((cons_indx + cq->cq_bufsz) -
928 cq->cq_consindx) - 1;
929 }
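/*
 * For example (hypothetical values): with cq_bufsz of 64, a previous
 * cq_consindx of 60, and a new cons_indx of 2, six entries (60-63 and
 * 0-1) were consumed and the doorbell parameter is
 * ((2 + 64) - 60) - 1 == 5.
 */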
930 cq->cq_consindx = cons_indx;
931 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
932 cq->cq_cqnum, num_to_increment);
933
934 } else if (polled_cnt == 0) {
935 /*
936 * If the CQ is empty, we can try to free up some of the WRID
937 * list containers. See tavor_wr.c for more details on this
938 * operation.
939 */
940 tavor_wrid_cq_reap(cq);
941 }
942
943 mutex_exit(&cq->cq_lock);
944
945 /* Set "num_polled" (if necessary) */
946 if (num_polled != NULL) {
947 *num_polled = polled_cnt;
948 }
949
950 /* Set CQ_EMPTY condition if needed, otherwise return success */
951 if (polled_cnt == 0) {
952 status = IBT_CQ_EMPTY;
953 } else {
954 status = DDI_SUCCESS;
955 }
956
957 /*
958 * Check if the system is currently panicking. If it is, then call
959 * the Tavor interrupt service routine. This step is necessary here
960 * because we might be in a polled I/O mode and without the call to
961 * tavor_isr() - and its subsequent calls to poll and rearm each
962 * event queue - we might overflow our EQs and render the system
963 * unable to sync/dump.
964 */
965 if (ddi_in_panic() != 0) {
966 (void) tavor_isr((caddr_t)state, (caddr_t)NULL);
967 }
968
969 return (status);
970 }
971
972
973 /*
974 * tavor_cq_handler()
975 * Context: Only called from interrupt context
976 */
977 int
978 tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
979 tavor_hw_eqe_t *eqe)
980 {
981 tavor_cqhdl_t cq;
982 uint_t cqnum;
983 uint_t eqe_evttype;
984
985 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
986
987 ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
988 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
989
990 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
991 tavor_eq_overflow_handler(state, eq, eqe);
992
993 return (DDI_FAILURE);
994 }
995
996
997 /* Get the CQ handle from CQ number in event descriptor */
998 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
999 cq = tavor_cqhdl_from_cqnum(state, cqnum);
1000
1001 /*
1002 * Post the EQ doorbell to move the CQ to the "disarmed" state.
1003 * This operation is to enable subsequent CQ doorbells (e.g. those
1004 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
1005 */
1006 tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);
1007
1008 /*
1009 * If the CQ handle is NULL, this is probably an indication
1010 * that the CQ has been freed already. In which case, we
1011 * should not deliver this event.
1012 *
1013 * We also check that the CQ number in the handle is the
1014 * same as the CQ number in the event queue entry. This
1015 * extra check allows us to handle the case where a CQ was
1016 * freed and then allocated again in the time it took to
1017 * handle the event queue processing. By constantly incrementing
1018 * the non-constrained portion of the CQ number every time
1019 * a new CQ is allocated, we mitigate (somewhat) the chance
1020 * that a stale event could be passed to the client's CQ
1021 * handler.
1022 *
1023 * Lastly, we check if "ts_ibtfpriv" is NULL. If it is then it
1024 * means that we have either received this event before we
1025 * finished attaching to the IBTF or we've received it while we
1026 * are in the process of detaching.
1027 */
1028 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1029 (state->ts_ibtfpriv != NULL)) {
1030 TAVOR_DO_IBTF_CQ_CALLB(state, cq);
1031 }
1032
1033 return (DDI_SUCCESS);
1034 }
1035
1036
1037 /*
1038 * tavor_cq_err_handler()
1039 * Context: Only called from interrupt context
1040 */
1041 int
1042 tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
1043 tavor_hw_eqe_t *eqe)
1044 {
1045 tavor_cqhdl_t cq;
1046 uint_t cqnum;
1047 ibc_async_event_t event;
1048 ibt_async_code_t type;
1049 uint_t eqe_evttype;
1050
1051 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
1052
1053 ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
1054 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
1055
1056 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
1057 tavor_eq_overflow_handler(state, eq, eqe);
1058
1059 return (DDI_FAILURE);
1060 }
1061
1062 /* cmn_err(CE_CONT, "CQ Error handler\n"); */
1063
1064 /* Get the CQ handle from CQ number in event descriptor */
1065 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
1066 cq = tavor_cqhdl_from_cqnum(state, cqnum);
1067
1068 /*
1069 * If the CQ handle is NULL, this is probably an indication
1070 * that the CQ has been freed already. In which case, we
1071 * should not deliver this event.
1072 *
1073 * We also check that the CQ number in the handle is the
1074 * same as the CQ number in the event queue entry. This
1075 * extra check allows us to handle the case where a CQ was
1076 * freed and then allocated again in the time it took to
1077 * handle the event queue processing. By constantly incrementing
1078 * the non-constrained portion of the CQ number every time
1079 * a new CQ is allocated, we mitigate (somewhat) the chance
1080 * that a stale event could be passed to the client's CQ
1081 * handler.
1082 *
1083 * And then we check if "ts_ibtfpriv" is NULL. If it is then it
1084 * means that we have either received this event before we
1085 * finished attaching to the IBTF or we've received it while we
1086 * are in the process of detaching.
1087 */
1088 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1089 (state->ts_ibtfpriv != NULL)) {
1090 event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
1091 type = IBT_ERROR_CQ;
1092
1093 TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
1094 }
1095
1096 return (DDI_SUCCESS);
1097 }
1098
1099
1100 /*
1101 * tavor_cq_refcnt_inc()
1102 * Context: Can be called from interrupt or base context.
1103 */
1104 int
1105 tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
1106 {
1107 /*
1108 * Increment the completion queue's reference count. Note: In order
1109 * to ensure compliance with IBA C11-15, we must ensure that a given
1110 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
1111 * This is accomplished here by keeping track of how the referenced
1112 * CQ is being used.
1113 */
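/*
 * For example, a CQ whose first reference came from a special (SMI/GSI)
 * QP cannot later be attached to a normal QP (and vice versa); the
 * mismatched attach attempt fails with DDI_FAILURE below.
 */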
1114 mutex_enter(&cq->cq_lock);
1115 if (cq->cq_refcnt == 0) {
1116 cq->cq_is_special = is_special;
1117 } else {
1118 if (cq->cq_is_special != is_special) {
1119 mutex_exit(&cq->cq_lock);
1120 return (DDI_FAILURE);
1121 }
1122 }
1123 cq->cq_refcnt++;
1124 mutex_exit(&cq->cq_lock);
1125 return (DDI_SUCCESS);
1126 }
1127
1128
1129 /*
1130 * tavor_cq_refcnt_dec()
1131 * Context: Can be called from interrupt or base context.
1132 */
1133 void
1134 tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
1135 {
1136 /* Decrement the completion queue's reference count */
1137 mutex_enter(&cq->cq_lock);
1138 cq->cq_refcnt--;
1139 mutex_exit(&cq->cq_lock);
1140 }
1141
1142
1143 /*
1144 * tavor_cq_doorbell()
1145 * Context: Can be called from interrupt or base context.
1146 */
1147 static void
1148 tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
1149 uint32_t cq_param)
1150 {
1151 uint64_t doorbell = 0;
1152
1153 /* Build the doorbell from the parameters */
1154 doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
1155 ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
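/*
 * Note: the doorbell value is just the command and CQ number shifted
 * into their respective fields and OR'd with the parameter (e.g. the
 * default parameter for a notify request, or a consumer index
 * increment count), posted to the UAR in a single 64-bit write below.
 */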
1156
1157 /* Write the doorbell to UAR */
1158 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
1159 doorbell);
1160 }
1161
1162
1163 /*
1164 * tavor_cqhdl_from_cqnum()
1165 * Context: Can be called from interrupt or base context.
1166 *
1167 * This routine is important because changing the unconstrained
1168 * portion of the CQ number is critical to the detection of a
1169 * potential race condition in the CQ handler code (i.e. the case
1170 * where a CQ is freed and alloc'd again before an event for the
1171 * "old" CQ can be handled).
1172 *
1173 * While this is not a perfect solution (not sure that one exists)
1174 * it does help to mitigate the chance that this race condition will
1175 * cause us to deliver a "stale" event to the new CQ owner. Note:
1176 * this solution does not scale well because the number of constrained
1177 * bits increases (and, hence, the number of unconstrained bits
1178 * decreases) as the number of supported CQs grows. For small and
1179 * intermediate values, it should hopefully provide sufficient
1180 * protection.
1181 */
1182 tavor_cqhdl_t
1183 tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
1184 {
1185 uint_t cqindx, cqmask;
1186
1187 /* Calculate the CQ table index from the cqnum */
1188 cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
1189 cqindx = cqnum & cqmask;
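/*
 * For example, with cp_log_num_cq configured as 16 (a hypothetical
 * value), cqmask is 0xFFFF and a cqnum of 0x12345 selects table index
 * 0x2345; the upper (unconstrained) bits serve only to distinguish
 * successive users of the same table slot.
 */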
1190 return (state->ts_cqhdl[cqindx]);
1191 }
1192
1193
1194 /*
1195 * tavor_cq_cqe_consume()
1196 * Context: Can be called from interrupt or base context.
1197 */
1198 static int
1199 tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1200 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1201 {
1202 uint_t flags, type, opcode, qpnum, qp1_indx;
1203 int status;
1204
1205 /*
1206 * Determine if this is an "error" CQE by examining "opcode". If it
1207 * is an error CQE, then call tavor_cq_errcqe_consume() and return
1208 * whatever status it returns. Otherwise, this is a successful
1209 * completion.
1210 */
1211 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
1212 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1213 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1214 status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
1215 return (status);
1216 }
1217
1218 /*
1219 * Fetch the Work Request ID using the information in the CQE.
1220 * See tavor_wr.c for more details.
1221 */
1222 wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);
1223
1224 /*
1225 * Parse the CQE opcode to determine completion type. This will set
1226 * not only the type of the completion, but also any flags that might
1227 * be associated with it (e.g. whether immediate data is present).
1228 */
1229 flags = IBT_WC_NO_FLAGS;
1230 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {
1231
1232 /* Send CQE */
1233 switch (opcode) {
1234 case TAVOR_CQE_SND_RDMAWR_IMM:
1235 flags |= IBT_WC_IMMED_DATA_PRESENT;
1236 /* FALLTHROUGH */
1237 case TAVOR_CQE_SND_RDMAWR:
1238 type = IBT_WRC_RDMAW;
1239 break;
1240
1241 case TAVOR_CQE_SND_SEND_IMM:
1242 flags |= IBT_WC_IMMED_DATA_PRESENT;
1243 /* FALLTHROUGH */
1244 case TAVOR_CQE_SND_SEND:
1245 type = IBT_WRC_SEND;
1246 break;
1247
1248 case TAVOR_CQE_SND_RDMARD:
1249 type = IBT_WRC_RDMAR;
1250 break;
1251
1252 case TAVOR_CQE_SND_ATOMIC_CS:
1253 type = IBT_WRC_CSWAP;
1254 break;
1255
1256 case TAVOR_CQE_SND_ATOMIC_FA:
1257 type = IBT_WRC_FADD;
1258 break;
1259
1260 case TAVOR_CQE_SND_BIND_MW:
1261 type = IBT_WRC_BIND;
1262 break;
1263
1264 default:
1265 TAVOR_WARNING(state, "unknown send CQE type");
1266 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1267 return (TAVOR_CQ_SYNC_AND_DB);
1268 }
1269 } else {
1270
1271 /* Receive CQE */
1272 switch (opcode & 0x1F) {
1273 case TAVOR_CQE_RCV_RECV_IMM:
1274 /* FALLTHROUGH */
1275 case TAVOR_CQE_RCV_RECV_IMM2:
1276 /*
1277 * Note: According to the Tavor PRM, all QP1 recv
1278 * completions look like the result of a Send with
1279 * Immediate. They are not, however, (MADs are Send
1280 * Only) so we need to check the QP number and set
1281 * the flag only if it is non-QP1.
1282 */
1283 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1284 qp1_indx = state->ts_spec_qp1->tr_indx;
1285 if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
1286 flags |= IBT_WC_IMMED_DATA_PRESENT;
1287 }
1288 /* FALLTHROUGH */
1289 case TAVOR_CQE_RCV_RECV:
1290 /* FALLTHROUGH */
1291 case TAVOR_CQE_RCV_RECV2:
1292 type = IBT_WRC_RECV;
1293 break;
1294
1295 case TAVOR_CQE_RCV_RDMAWR_IMM:
1296 /* FALLTHROUGH */
1297 case TAVOR_CQE_RCV_RDMAWR_IMM2:
1298 flags |= IBT_WC_IMMED_DATA_PRESENT;
1299 type = IBT_WRC_RECV_RDMAWI;
1300 break;
1301
1302 default:
1303 TAVOR_WARNING(state, "unknown recv CQE type");
1304 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1305 return (TAVOR_CQ_SYNC_AND_DB);
1306 }
1307 }
1308 wc->wc_type = type;
1309
1310 /*
1311 * Check for GRH, update the flags, then fill in "wc_flags" field
1312 * in the work completion
1313 */
1314 if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
1315 flags |= IBT_WC_GRH_PRESENT;
1316 }
1317 wc->wc_flags = flags;
1318
1319 /* If we got here, completion status must be success */
1320 wc->wc_status = IBT_WC_SUCCESS;
1321
1322 /*
1323 * Parse the remaining contents of the CQE into the work completion.
1324 * This means filling in SL, QP number, SLID, immediate data, etc.
1325 * Note: Not all of these fields are valid in a given completion.
1326 * Many of them depend on the actual type of completion. So we fill
1327 * in all of the fields and leave it up to the IBTF and consumer to
1328 * sort out which are valid based on their context.
1329 */
1330 wc->wc_sl = TAVOR_CQE_SL_GET(cq, cqe);
1331 wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1332 wc->wc_qpn = TAVOR_CQE_DQPN_GET(cq, cqe);
1333 wc->wc_res_hash = 0;
1334 wc->wc_slid = TAVOR_CQE_DLID_GET(cq, cqe);
1335 wc->wc_ethertype = (wc->wc_immed_data & 0xFFFF);
1336 wc->wc_pkey_ix = (wc->wc_immed_data >> 16);
1337
1338 /*
1339 * Depending on whether the completion was a receive or a send
1340 * completion, fill in "bytes transferred" as appropriate. Also,
1341 * if necessary, fill in the "path bits" field.
1342 */
1343 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
1344 wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
1345 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1346
1347 } else if ((wc->wc_type == IBT_WRC_RDMAR) ||
1348 (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
1349 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1350 }
1351
1352 return (TAVOR_CQ_SYNC_AND_DB);
1353 }
1354
1355
1356 /*
1357 * tavor_cq_errcqe_consume()
1358 * Context: Can be called from interrupt or base context.
1359 */
1360 static int
1361 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1362 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1363 {
1364 uint64_t next_wqeaddr;
1365 uint32_t imm_eth_pkey_cred;
1366 uint_t nextwqesize, dbd;
1367 uint_t doorbell_cnt, status;
1368 tavor_wrid_entry_t wre;
1369
1370 /*
1371 * Fetch the Work Request ID using the information in the CQE.
1372 * See tavor_wr.c for more details.
1373 */
1374 wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1375
1376 /*
1377 * Parse the CQE opcode to determine completion type. We know that
1378 * the CQE is an error completion, so we extract only the completion
1379 * status here.
1380 */
1381 imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1382 status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1383 switch (status) {
1384 case TAVOR_CQE_LOC_LEN_ERR:
1385 status = IBT_WC_LOCAL_LEN_ERR;
1386 break;
1387
1388 case TAVOR_CQE_LOC_OP_ERR:
1389 status = IBT_WC_LOCAL_QP_OP_ERR;
1390 break;
1391
1392 case TAVOR_CQE_LOC_PROT_ERR:
1393 status = IBT_WC_LOCAL_PROTECT_ERR;
1394 break;
1395
1396 case TAVOR_CQE_WR_FLUSHED_ERR:
1397 status = IBT_WC_WR_FLUSHED_ERR;
1398 break;
1399
1400 case TAVOR_CQE_MW_BIND_ERR:
1401 status = IBT_WC_MEM_WIN_BIND_ERR;
1402 break;
1403
1404 case TAVOR_CQE_BAD_RESPONSE_ERR:
1405 status = IBT_WC_BAD_RESPONSE_ERR;
1406 break;
1407
1408 case TAVOR_CQE_LOCAL_ACCESS_ERR:
1409 status = IBT_WC_LOCAL_ACCESS_ERR;
1410 break;
1411
1412 case TAVOR_CQE_REM_INV_REQ_ERR:
1413 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1414 break;
1415
1416 case TAVOR_CQE_REM_ACC_ERR:
1417 status = IBT_WC_REMOTE_ACCESS_ERR;
1418 break;
1419
1420 case TAVOR_CQE_REM_OP_ERR:
1421 status = IBT_WC_REMOTE_OP_ERR;
1422 break;
1423
1424 case TAVOR_CQE_TRANS_TO_ERR:
1425 status = IBT_WC_TRANS_TIMEOUT_ERR;
1426 break;
1427
1428 case TAVOR_CQE_RNRNAK_TO_ERR:
1429 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1430 break;
1431
1432 /*
1433 * The following error codes are not supported in the Tavor driver
1434 * as they relate only to Reliable Datagram completion statuses:
1435 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1436 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1437 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1438 * case TAVOR_CQE_INV_EEC_NUM_ERR:
1439 * case TAVOR_CQE_INV_EEC_STATE_ERR:
1440 * case TAVOR_CQE_LOC_EEC_ERR:
1441 */
1442
1443 default:
1444 TAVOR_WARNING(state, "unknown error CQE status");
1445 status = IBT_WC_LOCAL_QP_OP_ERR;
1446 break;
1447 }
1448 wc->wc_status = status;
1449
1450 /*
1451 * Now we do all the checking that's necessary to handle completion
1452 * queue entry "recycling"
1453 *
1454 * It is not necessary here to try to sync the WQE as we are only
1455 * attempting to read from the Work Queue (and hardware does not
1456 * write to it).
1457 */
1458
1459 /*
1460 * We can get doorbell info, WQE address, size for the next WQE
1461 * from the "wre" (which was filled in above in the call to the
1462 * tavor_wrid_get_entry() routine)
1463 */
1464 dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1465 next_wqeaddr = wre.wr_wqeaddrsz;
1466 nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
1467
1468 /*
1469 * Get the doorbell count from the CQE. This indicates how many
1470 * completions this one CQE represents.
1471 */
1472 doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1473
1474 /*
1475 * Determine if we're ready to consume this CQE yet or not. If the
1476 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1477 * is down to zero, then this is the last/only completion represented
1478 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB). Otherwise, the
1479 * current CQE needs to be recycled (see below).
1480 */
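/*
 * For illustration: an error CQE carrying a doorbell count of 2, where
 * each WQE in the chain was doorbelled (dbd == 1) and has a successor,
 * is recycled twice (the stored count dropping to 1, then 0) and is
 * finally consumed on the third poll.
 */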
1481 if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1482 /*
1483 * Consume the CQE
1484 * Return status to indicate that doorbell and sync may be
1485 * necessary.
1486 */
1487 return (TAVOR_CQ_SYNC_AND_DB);
1488
1489 } else {
1490 /*
1491 * Recycle the CQE for use in the next PollCQ() call
1492 * Decrement the doorbell count, modify the error status,
1493 * and update the WQE address and size (to point to the
1494 * next WQE on the chain). Put these updated entries back
1495 * into the CQE.
1496 * Despite the fact that we have updated the CQE, it is not
1497 * necessary for us to attempt to sync this entry just yet
1498 * as we have not changed the "hardware's view" of the
1499 * entry (i.e. we have not modified the "owner" bit - which
1500 * is all that the Tavor hardware really cares about).
1501 */
1502 doorbell_cnt = doorbell_cnt - dbd;
1503 TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1504 ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1505 (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1506 TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1507 TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1508
1509 return (TAVOR_CQ_RECYCLE_ENTRY);
1510 }
1511 }
1512
1513
1514 /*
1515 * tavor_cqe_sync()
1516 * Context: Can be called from interrupt or base context.
1517 */
1518 static void
1519 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1520 {
1521 ddi_dma_handle_t dmahdl;
1522 off_t offset;
1523
1524 /* Determine if CQ needs to be synced or not */
1525 if (cq->cq_sync == 0)
1526 return;
1527
1528 /* Get the DMA handle from CQ context */
1529 dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1530
1531 /* Calculate the byte offset of this CQE within the CQ buffer */
1532 offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
1533 (void) ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1534 }
1535
1536
1537 /*
1538 * tavor_cq_resize_helper()
1539 * Context: Can be called only from user or kernel context.
1540 */
1541 static void
1542 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1543 uint32_t old_cons_indx, uint32_t num_newcqe)
1544 {
1545 tavor_hw_cqe_t *old_cqe, *new_cqe;
1546 uint32_t new_cons_indx, wrap_around_mask;
1547 int i;
1548
1549 ASSERT(MUTEX_HELD(&cq->cq_lock));
1550
1551 /* Get the consumer index */
1552 new_cons_indx = 0;
1553
1554 /*
1555 * Calculate the wrap around mask. Note: This operation only works
1556 * because all Tavor completion queues have power-of-2 sizes
1557 */
1558 wrap_around_mask = (cq->cq_bufsz - 1);
1559
1560 /*
1561 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1562 * and the first CQ entry in the "new" CQ
1563 */
1564 old_cqe = &cq->cq_buf[old_cons_indx];
1565 new_cqe = &new_cqbuf[new_cons_indx];
1566
1567 /* Sync entire "old" CQ for use by software (if necessary). */
1568 if (cq->cq_sync) {
1569 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1570 0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1571 }
1572
1573 /*
1574 * Keep pulling entries from the "old" CQ until we find an entry owned
1575 * by the hardware. Process each entry by copying it into the "new"
1576 * CQ and updating respective indices and pointers in the "old" CQ.
1577 */
1578 for (i = 0; i < num_newcqe; i++) {
1579
1580 /* Copy this old CQE into the "new_cqe" pointer */
1581 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1582
1583 /* Increment the consumer index (for both CQs) */
1584 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1585 new_cons_indx = (new_cons_indx + 1);
1586
1587 /* Update the pointer to the next CQ entry */
1588 old_cqe = &cq->cq_buf[old_cons_indx];
1589 new_cqe = &new_cqbuf[new_cons_indx];
1590 }
1591 }
1592
1593 /*
1594 * tavor_cq_srq_entries_flush()
1595 * Context: Can be called from interrupt or base context.
1596 */
1597 void
1598 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1599 {
1600 tavor_cqhdl_t cq;
1601 tavor_workq_hdr_t *wqhdr;
1602 tavor_hw_cqe_t *cqe;
1603 tavor_hw_cqe_t *next_cqe;
1604 uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
1605 uint32_t new_indx, check_indx, indx;
1606 uint32_t num_to_increment;
1607 int cqe_qpnum, cqe_type;
1608 int outstanding_cqes, removed_cqes;
1609 int i;
1610
1611 ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1612
1613 cq = qp->qp_rq_cqhdl;
1614 wqhdr = qp->qp_rq_wqhdr;
1615
1616 ASSERT(wqhdr->wq_wrid_post != NULL);
1617 ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1618
1619 /*
1620 * Check for user-mapped CQ memory. Note: We do not allow kernel
1621 * clients to modify any user-mapped CQ. If the CQ is
1622 * user-mapped, then we simply return here, and this "flush" function
1623 * becomes a NO-OP in this case.
1624 */
1625 if (cq->cq_is_umap) {
1626 return;
1627 }
1628
1629 /* Get the consumer index */
1630 cons_indx = cq->cq_consindx;
1631
1632 /*
1633 * Calculate the wrap around mask. Note: This operation only works
1634 * because all Tavor completion queues have power-of-2 sizes
1635 */
1636 wrap_around_mask = (cq->cq_bufsz - 1);
1637
1638 /* Calculate the pointer to the first CQ entry */
1639 cqe = &cq->cq_buf[cons_indx];
1640
1641 /* Sync the current CQE to read */
1642 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1643
1644 /*
1645 * Loop through the CQ looking for entries owned by software. If an
1646 * entry is owned by software then we increment an 'outstanding_cqes'
1647 * count to know how many entries total we have on our CQ. We use this
1648 * value further down to know how many entries to loop through looking
1649 * for our same QP number.
1650 */
1651 outstanding_cqes = 0;
1652 tail_cons_indx = cons_indx;
1653 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1654 /* increment total cqes count */
1655 outstanding_cqes++;
1656
1657 /* increment the consumer index */
1658 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1659
1660 /* update the pointer to the next cq entry */
1661 cqe = &cq->cq_buf[tail_cons_indx];
1662
1663 /* sync the next cqe to read */
1664 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1665 }
1666
1667 /*
1668 * Using the 'tail_cons_indx' that was just set, we now know how many
1669 * total CQEs possible there are. Set the 'check_indx' and the
1670 * 'new_indx' to the last entry identified by 'tail_cons_indx'
1671 */
1672 check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
1673
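/*
 * For illustration (hypothetical indices): with cons_indx == 0 and four
 * SW-owned CQEs at indices 0-3, where the CQEs at indices 1 and 2
 * belong to the QP being flushed, the backward scan below leaves the
 * CQE at index 3 in place, frees 2 and 1 back to the SRQ, and copies
 * the CQE from index 0 into index 2.  Indices 0 and 1 are then handed
 * back to HW ownership and the consumer index is moved to 2.
 */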
1674 for (i = 0; i < outstanding_cqes; i++) {
1675 cqe = &cq->cq_buf[check_indx];
1676
1677 /* Grab QP number from CQE */
1678 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1679 cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1680
1681 /*
1682 * If the QP number is the same in the CQE as the QP that we
1683 * have on this SRQ, then we must free up the entry off the
1684 * SRQ. We also make sure that the completion type is of the
1685 * 'TAVOR_COMPLETION_RECV' type. So any send completions on
1686 * this CQ will be left as-is. The handling of returning
1687 * entries back to HW ownership happens further down.
1688 */
1689 if (cqe_qpnum == qp->qp_qpnum &&
1690 cqe_type == TAVOR_COMPLETION_RECV) {
1691
1692 /* Add back to SRQ free list */
1693 (void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1694 cq, cqe);
1695 } else {
1696 /* Do Copy */
1697 if (check_indx != new_indx) {
1698 next_cqe = &cq->cq_buf[new_indx];
1699
1700 /*
1701 * Copy the CQE into the "next_cqe"
1702 * pointer.
1703 */
1704 bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1705 }
1706 new_indx = (new_indx - 1) & wrap_around_mask;
1707 }
1708 /* Move index to next CQE to check */
1709 check_indx = (check_indx - 1) & wrap_around_mask;
1710 }
1711
1712 /* Initialize removed cqes count */
1713 removed_cqes = 0;
1714
1715 /* If an entry was removed */
1716 if (check_indx != new_indx) {
1717
1718 /*
1719 * Set current pointer back to the beginning consumer index.
1720 * At this point, all unclaimed entries have been copied to the
1721 * index specified by 'new_indx'. This 'new_indx' will be used
1722 * as the new consumer index after we mark all freed entries as
1723 * having HW ownership. We do that here.
1724 */
1725
1726 /* Loop through all entries until we reach our new pointer */
1727 for (indx = cons_indx; indx <= new_indx;
1728 indx = (indx + 1) & wrap_around_mask) {
1729 removed_cqes++;
1730 cqe = &cq->cq_buf[indx];
1731
1732 /* Reset entry to hardware ownership */
1733 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1734 }
1735 }
1736
1737 /*
1738 * Update consumer index to be the 'new_indx'. This moves it past all
1739 * removed entries. Because 'new_indx' is pointing to the last
1740 * previously valid SW owned entry, we add 1 to point the cons_indx to
1741 * the first HW owned entry.
1742 */
1743 cons_indx = (new_indx + 1) & wrap_around_mask;
1744
1745 /*
1746 * Now we only ring the doorbell (to update the consumer index) if
1747 * we've actually consumed a CQ entry. If we found no QP number
1748 * matches above, then we would not have removed anything. So only if
1749 * something was removed do we ring the doorbell.
1750 */
1751 if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1752 /*
1753 * Post doorbell to update the consumer index. Doorbell
1754 * value indicates number of entries consumed (minus 1)
1755 */
1756 if (cons_indx > cq->cq_consindx) {
1757 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1758 } else {
1759 num_to_increment = ((cons_indx + cq->cq_bufsz) -
1760 cq->cq_consindx) - 1;
1761 }
1762 cq->cq_consindx = cons_indx;
1763
1764 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1765 cq->cq_cqnum, num_to_increment);
1766 }
1767 }
1768