xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_cq.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);

/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_cqhdl_t		cq;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_hw_cqe_t		*buf;
	uint64_t		addr, value;
	uint32_t		log_cq_size, lkey, uarpg;
	uint_t			dma_xfer_mode, cq_sync, cq_is_umap;
	int			status, i, flag;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

	/*
	 * Determine whether CQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the CQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
	if (cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			goto cqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Use the internal protection domain (PD) for setting up CQs */
	pd = state->ts_pdhdl_internal;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate a CQ context entry.  This will be filled in with all
	 * the necessary parameters to define the Completion Queue.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the completion queue
	 * (i.e. the Tavor Completion Queue handle).  If we fail here, we must
	 * undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail2;
	}
	cq = (tavor_cqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
	cq->cq_is_umap = cq_is_umap;

	/* Use the index as CQ number */
	cq->cq_cqnum = cqc->tr_indx;

	/*
	 * If this will be a user-mappable CQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further CQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (cq->cq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
		    MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			goto cqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(cq_attr->cq_size);
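	/*
	 * A worked example of the rounding:  a request for 1000 CQEs gives
	 * highbit(1000) == 10, so 2^10 == 1024 entries are allocated below
	 * and 1023 of them are reported back as the usable "actual_size".
	 */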

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		goto cqalloc_fail4;
	}

	/*
	 * Allocate the memory for Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	cq->cq_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail4;
	}
	buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final transfer of ownership (below) of the
	 * CQ context itself.
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.  Note: If this is a user-
	 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ?  IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = cq->cq_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		goto cqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

	/* Sync entire CQ for use by the hardware (if necessary). */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Fill in the CQC entry.  This is the final step before passing
	 * ownership of the CQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the CQC.  Note: If this CQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cq->cq_eqnum		= TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
	cq->cq_erreqnum		= TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
	cqc_entry.xlat		= TAVOR_VA2PA_XLAT_ENABLED;
	cqc_entry.state		= TAVOR_CQ_DISARMED;
	cqc_entry.start_addr_h	= (addr >> 32);
	cqc_entry.start_addr_l	= (addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	if (cq->cq_is_umap) {
		cqc_entry.usr_page = uarpg;
	} else {
		cqc_entry.usr_page = 0;
	}
	cqc_entry.pd		= pd->pd_pdnum;
	cqc_entry.lkey		= lkey;
	cqc_entry.e_eqn		= cq->cq_erreqnum;
	cqc_entry.c_eqn		= cq->cq_eqnum;
	cqc_entry.cqn		= cq->cq_cqnum;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
		    status);
		goto cqalloc_fail6;
	}

	/*
	 * Fill in the rest of the Tavor Completion Queue handle.  Having
	 * successfully transferred ownership of the CQC, we can update the
	 * following fields for use in further operations on the CQ.
	 */
	cq->cq_cqcrsrcp	  = cqc;
	cq->cq_rsrcp	  = rsrc;
	cq->cq_consindx	  = 0;
	cq->cq_buf	  = buf;
	cq->cq_bufsz	  = (1 << log_cq_size);
	cq->cq_mrhdl	  = mr;
	cq->cq_sync	  = cq_sync;
	cq->cq_refcnt	  = 0;
	cq->cq_is_special = 0;
	cq->cq_uarpg	  = uarpg;
	cq->cq_umap_dhp	  = (devmap_cookie_t)NULL;
	avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
	    sizeof (struct tavor_workq_hdr_s),
	    offsetof(struct tavor_workq_hdr_s, wq_avl_link));

	cq->cq_wrid_reap_head  = NULL;
	cq->cq_wrid_reap_tail  = NULL;
	cq->cq_hdlrarg	  = (void *)ibt_cqhdl;

	/*
	 * Put CQ handle in Tavor CQNum-to-CQHdl list.  Then fill in the
	 * "actual_size" and "cqhdl" and return success
	 */
	ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
	state->ts_cqhdl[cqc->tr_indx] = cq;

	/*
	 * If this is a user-mappable CQ, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (cq->cq_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}
	*cqhdl = cq;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
	}
cqalloc_fail5:
	tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
	if (cq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
cqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
	tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
cqalloc_fail:
	return (status);
}
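
/*
 * A minimal usage sketch for the above (illustrative only: the attribute
 * values are assumptions, and error handling plus the IBTF plumbing that
 * normally surrounds this call are omitted):
 *
 *	ibt_cq_attr_t	cq_attr;
 *	tavor_cqhdl_t	cqhdl;
 *	uint_t		real_size;
 *
 *	cq_attr.cq_size  = 1000;
 *	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
 *	if (tavor_cq_alloc(state, ibt_cqhdl, &cq_attr, &real_size, &cqhdl,
 *	    TAVOR_SLEEP) == DDI_SUCCESS) {
 *		// real_size is now 1023 (2^10 - 1, see above)
 *	}
 */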


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*cqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_cqc_t		cqc_entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		cq;
	uint32_t		cqnum;
	uint64_t		value;
	uint_t			maxprot;
	int			status;

	/*
	 * Pull all the necessary information from the Tavor Completion Queue
	 * handle.  This is necessary here because the resource for the
	 * CQ handle is going to be freed up as part of this operation.
	 */
	cq	= *cqhdl;
	mutex_enter(&cq->cq_lock);
	cqc	= cq->cq_cqcrsrcp;
	rsrc	= cq->cq_rsrcp;
	pd	= state->ts_pdhdl_internal;
	mr	= cq->cq_mrhdl;
	cqnum	= cq->cq_cqnum;

	/*
	 * If there are work queues still associated with the CQ, then return
	 * an error.  Otherwise, we will be holding the CQ lock.
	 */
	if (cq->cq_refcnt != 0) {
		mutex_exit(&cq->cq_lock);
		return (IBT_CQ_BUSY);
	}

	/*
	 * If this was a user-mappable CQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
	 * We also need to invalidate the CQ tracking information for the
	 * user mapping.
	 */
	if (cq->cq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, cqnum,
		    MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (cq->cq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(cq->cq_umap_dhp,
			    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&cq->cq_lock);
				TAVOR_WARNING(state, "failed in CQ memory "
				    "devmap_devmem_remap()");
				return (ibc_get_ci_failure(0));
			}
			cq->cq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor CQNum-to-CQHdl list.  This will allow any
	 * in-progress events to detect that the CQ corresponding to this
	 * number has been freed.
	 */
	state->ts_cqhdl[cqc->tr_indx] = NULL;

	/*
	 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
	 * list.  This cleans up all the structures associated with the WRID
	 * processing for this CQ.  Once we complete, drop the lock and finish
	 * the deallocation of the CQ.
	 */
	tavor_wrid_cq_force_reap(cq);

	mutex_exit(&cq->cq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

	/*
	 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
	    sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim CQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
		    status);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Deregister the memory for the Completion Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister CQ memory");
		return (ibc_get_ci_failure(0));
	}

	/* Free the memory for the CQ */
	tavor_queue_free(state, &cq->cq_cqinfo);

	/* Free the Tavor Completion Queue handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the CQC entry resource */
	tavor_rsrc_free(state, &cqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the cqhdl pointer to NULL and return success */
	*cqhdl = NULL;

	return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
	tavor_hw_cqc_t		cqc_entry;
	tavor_qalloc_info_t	new_cqinfo, old_cqinfo;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	op;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr, mr_old;
	tavor_hw_cqe_t		*buf;
	uint32_t		new_prod_indx, old_cons_indx;
	uint_t			dma_xfer_mode, cq_sync, log_cq_size, maxprot;
	int			status, i, flag;

	/* Use the internal protection domain (PD) for CQs */
	pd = state->ts_pdhdl_internal;

	/*
	 * Calculate the appropriate size for the new resized completion queue.
	 * Note:  All Tavor CQs must be a power-of-2 minus 1 in size.  Also
	 * they may not be any smaller than TAVOR_CQ_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2
	 */
	req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
	log_cq_size = highbit(req_size);

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits)
	 */
	if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
		goto cqresize_fail;
	}

	/*
	 * Allocate the memory for newly resized Completion Queue.
	 *
	 * Note: Although we use the common queue allocation routine, we
	 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
	 * kernel system memory) for kernel CQs because it would be
	 * inefficient to have CQs located in DDR memory.  This is the same
	 * as we do when we first allocate completion queues primarily
	 * because CQs are read from (by software) more than they are written
	 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
	 * user-mappable CQs for a similar reason.)
	 * It is also worth noting that, unlike Tavor QP work queues,
	 * completion queues do not have the same strict alignment
	 * requirements.  It is sufficient for the CQ memory to be both
	 * aligned to and bound to addresses which are a multiple of CQE size.
	 */
	new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
	new_cqinfo.qa_bind_align  = sizeof (tavor_hw_cqe_t);
	if (cq->cq_is_umap) {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
	}
	status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		goto cqresize_fail;
	}
	buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Initialize each of the Completion Queue Entries (CQE) by setting
	 * their ownership to hardware ("owner" bit set to HW).  This is in
	 * preparation for the final resize operation (below).
	 */
	for (i = 0; i < (1 << log_cq_size); i++) {
		TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
	}

	/*
	 * Register the memory for the CQ.  The memory for the CQ must
	 * be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the CQ context below.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len	 = new_cqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (cq->cq_is_umap) {
		dma_xfer_mode = DDI_DMA_CONSISTENT;
	} else {
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
	}
	if (dma_xfer_mode == DDI_DMA_STREAMING) {
		mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
	}
	op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
	op.mro_bind_override_addr = 0;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
	if (status != DDI_SUCCESS) {
		tavor_queue_free(state, &new_cqinfo);
		goto cqresize_fail;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/* Determine if later ddi_dma_sync will be necessary */
	cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Now we grab the CQ lock.  Since we will be updating the actual
	 * CQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "cq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&cq->cq_lock);

	/*
	 * Determine the current CQ "consumer index".
	 *
	 * Note:  This will depend on whether the CQ had previously been
	 * mapped for user access or whether it is a kernel CQ.  If this
	 * is a kernel CQ, then all PollCQ() operations have come through
	 * the IBTF and, hence, the driver's CQ state structure will
	 * contain the current consumer index.  If, however, the user has
	 * accessed this CQ by bypassing the driver (OS-bypass), then we
	 * need to query the firmware to determine the current CQ consumer
	 * index.  This also assumes that the user process will not continue
	 * to consume entries while at the same time doing the ResizeCQ()
	 * operation.  If the user process does not guarantee this, then it
	 * may see duplicate or missed completions.  But under no
	 * circumstances should this panic the system.
	 */
	if (cq->cq_is_umap) {
		status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
		    cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
		    TAVOR_NOSLEEP);
		if (status != TAVOR_CMD_SUCCESS) {
			/* Query CQ has failed, drop CQ lock and cleanup */
			mutex_exit(&cq->cq_lock);
			if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
			    sleepflag) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "CQ memory");
			}
			tavor_queue_free(state, &new_cqinfo);
			TAVOR_WARNING(state, "failed to find in database");

			goto cqresize_fail;
		}
		old_cons_indx = cqc_entry.cons_indx;
	} else {
		old_cons_indx = cq->cq_consindx;
	}

	/*
	 * Fill in the CQC entry.  For the resize operation this is the
	 * final step before attempting the resize operation on the CQC entry.
	 * We use all of the information collected/calculated above to fill
	 * in the requisite portions of the CQC.
	 */
	bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
	cqc_entry.start_addr_h	= (mr->mr_bindinfo.bi_addr >> 32);
	cqc_entry.start_addr_l	= (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
	cqc_entry.log_cq_sz	= log_cq_size;
	cqc_entry.lkey		= mr->mr_lkey;

	/*
	 * Write the CQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.  Also note that the status returned may indicate
	 * the code to return to the IBTF.
	 */
	status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
	    &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		/* Resize attempt has failed, drop CQ lock and cleanup */
		mutex_exit(&cq->cq_lock);
		if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
		    sleepflag) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister CQ memory");
		}
		tavor_queue_free(state, &new_cqinfo);
		if (status == TAVOR_CMD_BAD_SIZE) {
			return (IBT_CQ_SZ_INSUFFICIENT);
		} else {
			cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
			    "%08x\n", status);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * The CQ resize attempt was successful.  Before dropping the CQ lock,
	 * copy all of the CQEs from the "old" CQ into the "new" CQ.  Note:
	 * the Tavor firmware guarantees us that sufficient space is set aside
	 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
	 * The two parameters to this helper function ("old_cons_indx" and
	 * "new_prod_indx") essentially indicate the starting index and number
	 * of any CQEs that might remain in the "old" CQ memory.
	 */
	tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

	/* Sync entire "new" CQ for use by hardware (if necessary) */
	if (cq_sync) {
		(void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
		    new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Update the Tavor Completion Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	mr_old		 = cq->cq_mrhdl;
	old_cqinfo	 = cq->cq_cqinfo;
	cq->cq_cqinfo	 = new_cqinfo;
	cq->cq_consindx	 = 0;
	cq->cq_buf	 = buf;
	cq->cq_bufsz	 = (1 << log_cq_size);
	cq->cq_mrhdl	 = mr;
	cq->cq_sync	 = cq_sync;

	/*
	 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the CQ memory.  We also need to
	 * invalidate the CQ tracking information for the user mapping.
	 */
	if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(cq->cq_umap_dhp,
		    state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&cq->cq_lock);
			TAVOR_WARNING(state, "failed in CQ memory "
			    "devmap_devmem_remap()");
			return (ibc_get_ci_failure(0));
		}
		cq->cq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the CQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&cq->cq_lock);

	/*
	 * Deregister the memory for the old Completion Queue.  Note: We
	 * really can't return error here because we have no good way to
	 * cleanup.  Plus, the deregistration really shouldn't ever fail.
	 * So, if it does, it is an indication that something has gone
	 * seriously wrong.  So we print a warning message and return error
	 * (knowing, of course, that the "old" CQ memory will be leaked)
	 */
	status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister old CQ memory");
		goto cqresize_fail;
	}

	/* Free the memory for the old CQ */
	tavor_queue_free(state, &old_cqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new completion queue size.
	 */
	if (actual_size != NULL) {
		*actual_size = (1 << log_cq_size) - 1;
	}

	return (DDI_SUCCESS);

cqresize_fail:
	return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
	uint_t		cqnum;

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 *
	 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
	 * regarding why we do not have to do an extra PIO read here, and we
	 * will not lose an event after writing this doorbell.
	 */
	cqnum = cq->cq_cqnum;
	if (flags == IBT_NEXT_COMPLETION) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
		    TAVOR_CQDB_DEFAULT_PARAM);

	} else if (flags == IBT_NEXT_SOLICITED) {
		tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
		    cqnum, TAVOR_CQDB_DEFAULT_PARAM);

	} else {
		return (IBT_CQ_NOTIFY_TYPE_INVALID);
	}

	return (DDI_SUCCESS);
}
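
/*
 * For example, a client that wants another completion event would rearm
 * the CQ with (a sketch; return value checking omitted):
 *
 *	(void) tavor_cq_notify(state, cq, IBT_NEXT_COMPLETION);
 *
 * after which the next CQE added to this CQ generates an event.
 */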


/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
	tavor_hw_cqe_t	*cqe;
	uint32_t	cons_indx, wrap_around_mask;
	uint32_t	polled_cnt, num_to_increment;
	int		status;

	/*
	 * Check for user-mappable CQ memory.  Note:  We do not allow kernel
	 * clients to poll CQ memory that is accessible directly by the user.
	 * If the CQ memory is user accessible, then return an error.
	 */
	if (cq->cq_is_umap) {
		return (IBT_CQ_HDL_INVALID);
	}

	mutex_enter(&cq->cq_lock);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_bufsz - 1);
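	/*
	 * E.g. a 1024-entry CQ yields a mask of 0x3FF, so advancing the
	 * consumer index past 1023 wraps it back around to entry 0.
	 */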

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_buf[cons_indx];

	/* Sync the current CQE to read */
	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
	 * consumer index.  Note:  We only update the consumer index if
	 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.  Otherwise,
	 * it indicates that we are going to "recycle" the CQE (probably
	 * because it is an error CQE and corresponds to more than one
	 * completion).
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
		status = tavor_cq_cqe_consume(state, cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cq, cqe);

			/* Sync the current CQE for device */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_buf[cons_indx];

			/* Sync the next CQE to read */
			tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Post doorbell to update the consumer index.  Doorbell
		 * value indicates number of entries consumed (minus 1)
		 */
		if (cons_indx > cq->cq_consindx) {
			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
		} else {
			num_to_increment = ((cons_indx + cq->cq_bufsz) -
			    cq->cq_consindx) - 1;
		}
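		/*
		 * E.g. with cq_consindx == 1020, cons_indx == 4, and a
		 * 1024-entry CQ, this computes (4 + 1024 - 1020) - 1 == 7,
		 * telling the hardware that 8 entries were consumed.
		 */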
		cq->cq_consindx = cons_indx;
		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
		    cq->cq_cqnum, num_to_increment);

	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.  See tavor_wr.c for more details on this
		 * operation.
		 */
		tavor_wrid_cq_reap(cq);
	}

	mutex_exit(&cq->cq_lock);

	/* Set "num_polled" (if necessary) */
	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	/* Set CQ_EMPTY condition if needed, otherwise return success */
	if (polled_cnt == 0) {
		status = IBT_CQ_EMPTY;
	} else {
		status = DDI_SUCCESS;
	}

	/*
	 * Check if the system is currently panicking.  If it is, then call
	 * the Tavor interrupt service routine.  This step is necessary here
	 * because we might be in a polled I/O mode and without the call to
	 * tavor_isr() - and its subsequent calls to poll and rearm each
	 * event queue - we might overflow our EQs and render the system
	 * unable to sync/dump.
	 */
	if (ddi_in_panic() != 0) {
		(void) tavor_isr((caddr_t)state, (caddr_t)NULL);
	}

	return (status);
}
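
/*
 * A typical kernel consumer drains the CQ roughly as follows (a sketch;
 * the array size and the handling of each work completion are
 * illustrative assumptions):
 *
 *	ibt_wc_t	wc[8];
 *	uint_t		i, polled;
 *
 *	while (tavor_cq_poll(state, cq, wc, 8, &polled) == DDI_SUCCESS) {
 *		for (i = 0; i < polled; i++) {
 *			// process wc[i].wc_status, wc[i].wc_id, etc.
 *		}
 *	}
 */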


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	uint_t			eqe_evttype;

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		tavor_eq_overflow_handler(state, eq, eqe);

		return (DDI_FAILURE);
	}


	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * Post the EQ doorbell to move the CQ to the "disarmed" state.
	 * This operation is to enable subsequent CQ doorbells (e.g. those
	 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
	 */
	tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * Lastly, we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we've either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		TAVOR_DO_IBTF_CQ_CALLB(state, cq);
	}

	return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
	tavor_cqhdl_t		cq;
	uint_t			cqnum;
	ibc_async_event_t	event;
	ibt_async_code_t	type;
	uint_t			eqe_evttype;

	eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

	ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
	    eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

	if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
		tavor_eq_overflow_handler(state, eq, eqe);

		return (DDI_FAILURE);
	}

	/* cmn_err(CE_CONT, "CQ Error handler\n"); */

	/* Get the CQ handle from CQ number in event descriptor */
	cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
	cq = tavor_cqhdl_from_cqnum(state, cqnum);

	/*
	 * If the CQ handle is NULL, this is probably an indication
	 * that the CQ has been freed already.  In which case, we
	 * should not deliver this event.
	 *
	 * We also check that the CQ number in the handle is the
	 * same as the CQ number in the event queue entry.  This
	 * extra check allows us to handle the case where a CQ was
	 * freed and then allocated again in the time it took to
	 * handle the event queue processing.  By constantly incrementing
	 * the non-constrained portion of the CQ number every time
	 * a new CQ is allocated, we mitigate (somewhat) the chance
	 * that a stale event could be passed to the client's CQ
	 * handler.
	 *
	 * And then we check if "ts_ibtfpriv" is NULL.  If it is then it
	 * means that we've either received this event before we
	 * finished attaching to the IBTF or we've received it while we
	 * are in the process of detaching.
	 */
	if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
	    (state->ts_ibtfpriv != NULL)) {
		event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
		type		= IBT_ERROR_CQ;

		TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
	}

	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
	/*
	 * Increment the completion queue's reference count.  Note: In order
	 * to ensure compliance with IBA C11-15, we must ensure that a given
	 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
	 * This is accomplished here by keeping track of how the referenced
	 * CQ is being used.
	 */
	mutex_enter(&cq->cq_lock);
	if (cq->cq_refcnt == 0) {
		cq->cq_is_special = is_special;
	} else {
		if (cq->cq_is_special != is_special) {
			mutex_exit(&cq->cq_lock);
			return (DDI_FAILURE);
		}
	}
	cq->cq_refcnt++;
	mutex_exit(&cq->cq_lock);
	return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
	/* Decrement the completion queue's reference count */
	mutex_enter(&cq->cq_lock);
	cq->cq_refcnt--;
	mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
	uint64_t	doorbell = 0;

	/* Build the doorbell from the parameters */
	doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
	    ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
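	/*
	 * The doorbell is thus a single 64-bit word packing {command,
	 * CQ number, parameter} from most- to least-significant bits;
	 * the exact shift values come from the tavor header files.
	 */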

	/* Write the doorbell to UAR */
	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
	    doorbell);
}


/*
 * tavor_cqhdl_from_cqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the CQ number is critical to the detection of a
 *    potential race condition in the CQ handler code (i.e. the case
 *    where a CQ is freed and alloc'd again before an event for the
 *    "old" CQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
	uint_t	cqindx, cqmask;

	/* Calculate the CQ table index from the cqnum */
	cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
	cqindx = cqnum & cqmask;
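	/*
	 * E.g. with cp_log_num_cq == 16 the mask is 0xFFFF, so CQ numbers
	 * 0x12345 and 0x02345 both resolve to table index 0x2345; only
	 * the unconstrained high-order bits differ between CQ reuses.
	 */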
	return (state->ts_cqhdl[cqindx]);
}


/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint_t		flags, type, opcode, qpnum, qp1_indx;
	int		status;

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call tavor_cq_errcqe_consume() and return
	 * whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

		/* Send CQE */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND_IMM:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			/* FALLTHROUGH */
		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			break;

		case TAVOR_CQE_SND_ATOMIC_CS:
			type = IBT_WRC_CSWAP;
			break;

		case TAVOR_CQE_SND_ATOMIC_FA:
			type = IBT_WRC_FADD;
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			TAVOR_WARNING(state, "unknown send CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/* Receive CQE */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV_IMM2:
			/*
			 * Note:  According to the Tavor PRM, all QP1 recv
			 * completions look like the result of a Send with
			 * Immediate.  They are not, however (MADs are Send
			 * Only), so we need to check the QP number and set
			 * the flag only if it is non-QP1.
			 */
			qpnum	 = TAVOR_CQE_QPNUM_GET(cq, cqe);
			qp1_indx = state->ts_spec_qp1->tr_indx;
			if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
				flags |= IBT_WC_IMMED_DATA_PRESENT;
			}
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			break;

		case TAVOR_CQE_RCV_RDMAWR_IMM:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RDMAWR_IMM2:
			flags |= IBT_WC_IMMED_DATA_PRESENT;
			type = IBT_WRC_RECV_RDMAWI;
			break;

		default:
			TAVOR_WARNING(state, "unknown recv CQE type");
			wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;

	/*
	 * Check for GRH, update the flags, then fill in "wc_flags" field
	 * in the work completion
	 */
	if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
		flags |= IBT_WC_GRH_PRESENT;
	}
	wc->wc_flags = flags;

	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	/*
	 * Parse the remaining contents of the CQE into the work completion.
	 * This means filling in SL, QP number, SLID, immediate data, etc.
	 * Note:  Not all of these fields are valid in a given completion.
	 * Many of them depend on the actual type of completion.  So we fill
	 * in all of the fields and leave it up to the IBTF and consumer to
	 * sort out which are valid based on their context.
	 */
	wc->wc_sl	  = TAVOR_CQE_SL_GET(cq, cqe);
	wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	wc->wc_qpn	  = TAVOR_CQE_DQPN_GET(cq, cqe);
	wc->wc_res_hash	  = 0;
	wc->wc_slid	  = TAVOR_CQE_DLID_GET(cq, cqe);
	wc->wc_ethertype  = (wc->wc_immed_data & 0xFFFF);
	wc->wc_pkey_ix	  = (wc->wc_immed_data >> 16);

	/*
	 * Depending on whether the completion was a receive or a send
	 * completion, fill in "bytes transferred" as appropriate.  Also,
	 * if necessary, fill in the "path bits" field.
	 */
	if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
		wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

	} else if ((wc->wc_type == IBT_WRC_RDMAR) ||
	    (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
		wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
	}

	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * tavor_cq_errcqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
	uint64_t		next_wqeaddr;
	uint32_t		imm_eth_pkey_cred;
	uint_t			nextwqesize, dbd;
	uint_t			doorbell_cnt, status;
	tavor_wrid_entry_t	wre;

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		TAVOR_WARNING(state, "unknown error CQE status");
		status = IBT_WC_LOCAL_QP_OP_ERR;
		break;
	}
	wc->wc_status = status;

	/*
	 * Now we do all the checking that's necessary to handle completion
	 * queue entry "recycling"
	 *
	 * It is not necessary here to try to sync the WQE as we are only
	 * attempting to read from the Work Queue (and hardware does not
	 * write to it).
	 */

	/*
	 * We can get doorbell info, WQE address, size for the next WQE
	 * from the "wre" (which was filled in above in the call to the
	 * tavor_wrid_get_entry() routine)
	 */
	dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
	next_wqeaddr = wre.wr_wqeaddrsz;
	nextwqesize  = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

	/*
	 * Get the doorbell count from the CQE.  This indicates how many
	 * completions this one CQE represents.
	 */
	doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
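	/*
	 * E.g. a CQE carrying a doorbell count of 3 stands in for several
	 * flushed work requests; it is recycled (below) with the count
	 * decremented until it reaches zero, at which point the CQE can
	 * finally be consumed.
	 */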

	/*
	 * Determine if we're ready to consume this CQE yet or not.  If the
	 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
	 * is down to zero, then this is the last/only completion represented
	 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
	 * current CQE needs to be recycled (see below).
	 */
	if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
		/*
		 * Consume the CQE
		 *    Return status to indicate that doorbell and sync may be
		 *    necessary.
		 */
		return (TAVOR_CQ_SYNC_AND_DB);

	} else {
		/*
		 * Recycle the CQE for use in the next PollCQ() call
		 *    Decrement the doorbell count, modify the error status,
		 *    and update the WQE address and size (to point to the
		 *    next WQE on the chain).  Put these updated entries back
		 *    into the CQE.
		 *    Despite the fact that we have updated the CQE, it is not
		 *    necessary for us to attempt to sync this entry just yet
		 *    as we have not changed the "hardware's view" of the
		 *    entry (i.e. we have not modified the "owner" bit - which
		 *    is all that the Tavor hardware really cares about).
		 */
		doorbell_cnt = doorbell_cnt - dbd;
		TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
		    ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
		    (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
		TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
		    TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));

		return (TAVOR_CQ_RECYCLE_ENTRY);
	}
}


/*
 * tavor_cqe_sync()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
{
	ddi_dma_handle_t	dmahdl;
	off_t			offset;

	/* Determine if CQ needs to be synced or not */
	if (cq->cq_sync == 0)
		return;

	/* Get the DMA handle from CQ context */
	dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;

	/* Calculate the offset of the CQE to be synced */
	offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
	(void) ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
}
1534 
1535 
1536 /*
1537  * tavor_cq_resize_helper()
1538  *    Context: Can be called only from user or kernel context.
1539  */
1540 static void
tavor_cq_resize_helper(tavor_cqhdl_t cq,tavor_hw_cqe_t * new_cqbuf,uint32_t old_cons_indx,uint32_t num_newcqe)1541 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1542     uint32_t old_cons_indx, uint32_t num_newcqe)
1543 {
1544 	tavor_hw_cqe_t	*old_cqe, *new_cqe;
1545 	uint32_t	new_cons_indx, wrap_around_mask;
1546 	int		i;
1547 
1548 	ASSERT(MUTEX_HELD(&cq->cq_lock));
1549 
1550 	/* Get the consumer index */
1551 	new_cons_indx = 0;
1552 
1553 	/*
1554 	 * Calculate the wrap around mask.  Note: This operation only works
1555 	 * because all Tavor completion queues have power-of-2 sizes
1556 	 */
1557 	wrap_around_mask = (cq->cq_bufsz - 1);
1558 
1559 	/*
1560 	 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1561 	 * and the first CQ entry in the "new" CQ
1562 	 */
1563 	old_cqe = &cq->cq_buf[old_cons_indx];
1564 	new_cqe = &new_cqbuf[new_cons_indx];
1565 
1566 	/* Sync entire "old" CQ for use by software (if necessary). */
1567 	if (cq->cq_sync) {
1568 		(void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1569 		    0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1570 	}
1571 
1572 	/*
1573 	 * Copy each of the 'num_newcqe' outstanding entries from the "old"
1574 	 * CQ into the "new" CQ, updating the consumer indices and CQE
1575 	 * pointers for both CQs as we go.
1576 	 */
1577 	for (i = 0; i < num_newcqe; i++) {
1578 
1579 		/* Copy this old CQE into the "new_cqe" pointer */
1580 		bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1581 
1582 		/* Increment the consumer index (for both CQs) */
1583 		old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1584 		new_cons_indx = (new_cons_indx + 1);
1585 
1586 		/* Update the pointer to the next CQ entry */
1587 		old_cqe = &cq->cq_buf[old_cons_indx];
1588 		new_cqe = &new_cqbuf[new_cons_indx];
1589 	}
1590 }
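
/*
 * A worked example of the copy loop above (values are illustrative):
 * with an 8-entry "old" CQ, old_cons_indx == 6, and num_newcqe == 4,
 * old entries 6, 7, 0, and 1 (wrapping via the mask) are copied into
 * new entries 0 through 3, ready to be consumed from the head of the
 * "new" CQ.
 */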
1591 
1592 /*
1593  * tavor_cq_srq_entries_flush()
1594  *    Context: Can be called from interrupt or base context.
1595  */
1596 void
1597 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1598 {
1599 	tavor_cqhdl_t		cq;
1600 	tavor_workq_hdr_t	*wqhdr;
1601 	tavor_hw_cqe_t		*cqe;
1602 	tavor_hw_cqe_t		*next_cqe;
1603 	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
1604 	uint32_t		new_indx, check_indx, indx;
1605 	uint32_t		num_to_increment;
1606 	int			cqe_qpnum, cqe_type;
1607 	int			outstanding_cqes, removed_cqes;
1608 	int			i;
1609 
1610 	ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1611 
1612 	cq = qp->qp_rq_cqhdl;
1613 	wqhdr = qp->qp_rq_wqhdr;
1614 
1615 	ASSERT(wqhdr->wq_wrid_post != NULL);
1616 	ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1617 
1618 	/*
1619 	 * Check for user-mapped CQ memory.  Note:  We do not allow kernel
1620 	 * clients to modify any user-mapped CQ.  If the CQ is
1621 	 * user-mapped, then we simply return here, and this "flush"
1622 	 * function becomes a NO-OP.
1623 	 */
1624 	if (cq->cq_is_umap) {
1625 		return;
1626 	}
1627 
1628 	/* Get the consumer index */
1629 	cons_indx = cq->cq_consindx;
1630 
1631 	/*
1632 	 * Calculate the wrap around mask.  Note: This operation only works
1633 	 * because all Tavor completion queues have power-of-2 sizes
1634 	 */
1635 	wrap_around_mask = (cq->cq_bufsz - 1);
1636 
1637 	/* Calculate the pointer to the first CQ entry */
1638 	cqe = &cq->cq_buf[cons_indx];
1639 
1640 	/* Sync the current CQE to read */
1641 	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1642 
1643 	/*
1644 	 * Loop through the CQ looking for entries owned by software.  If an
1645 	 * entry is owned by software, we increment an 'outstanding_cqes'
1646 	 * count so that we know how many entries in total are on our CQ.
1647 	 * We use this value further down to bound the loop that searches
1648 	 * for entries with our QP number.
1649 	 */
1650 	outstanding_cqes = 0;
1651 	tail_cons_indx = cons_indx;
1652 	while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1653 		/* increment total cqes count */
1654 		outstanding_cqes++;
1655 
1656 		/* increment the consumer index */
1657 		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1658 
1659 		/* update the pointer to the next cq entry */
1660 		cqe = &cq->cq_buf[tail_cons_indx];
1661 
1662 		/* sync the next cqe to read */
1663 		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1664 	}
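
	/*
	 * For example (illustrative, no wrap):  if cons_indx == 4 and
	 * entries 4, 5, and 6 are software-owned, the loop above exits
	 * with outstanding_cqes == 3 and tail_cons_indx == 7.
	 */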
1665 
1666 	/*
1667 	 * Using the 'tail_cons_indx' that was just set, we now know the
1668 	 * total number of CQEs to examine.  Set both 'check_indx' and
1669 	 * 'new_indx' to the last entry identified by 'tail_cons_indx'.
1670 	 */
1671 	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
1672 
1673 	for (i = 0; i < outstanding_cqes; i++) {
1674 		cqe = &cq->cq_buf[check_indx];
1675 
1676 		/* Grab QP number from CQE */
1677 		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1678 		cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1679 
1680 		/*
1681 		 * If the QP number is the same in the CQE as the QP that we
1682 		 * have on this SRQ, then we must free up the entry off the
1683 		 * SRQ.  We also make sure that the completion type is of the
1684 		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
1685 		 * this CQ will be left as-is.  The handling of returning
1686 		 * entries back to HW ownership happens further down.
1687 		 */
1688 		if (cqe_qpnum == qp->qp_qpnum &&
1689 		    cqe_type == TAVOR_COMPLETION_RECV) {
1690 
1691 			/* Add back to SRQ free list */
1692 			(void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1693 			    cq, cqe);
1694 		} else {
1695 			/* Do Copy */
1696 			if (check_indx != new_indx) {
1697 				next_cqe = &cq->cq_buf[new_indx];
1698 
1699 				/*
1700 				 * Copy the CQE into the "next_cqe"
1701 				 * pointer.
1702 				 */
1703 				bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1704 			}
1705 			new_indx = (new_indx - 1) & wrap_around_mask;
1706 		}
1707 		/* Move index to next CQE to check */
1708 		check_indx = (check_indx - 1) & wrap_around_mask;
1709 	}
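
	/*
	 * A worked example of the backward scan above (QP numbers are
	 * illustrative):  suppose outstanding_cqes == 4 and the entries,
	 * from cons_indx forward, are [A:other, B:srq-qp, C:other,
	 * D:srq-qp].  Scanning from the tail:  D matches and is freed to
	 * the SRQ; C is kept and copied into D's slot; B matches and is
	 * freed; A is kept and copied into C's old slot.  The loop ends
	 * with new_indx one slot below the two survivors, so the stale
	 * entries run from cons_indx through new_indx.
	 */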
1710 
1711 	/* Initialize removed cqes count */
1712 	removed_cqes = 0;
1713 
1714 	/* If an entry was removed */
1715 	if (check_indx != new_indx) {
1716 
1717 		/*
1718 		 * At this point, all entries being kept have been copied to
1719 		 * the slots just past 'new_indx' (toward the CQ's tail).
1720 		 * 'new_indx' + 1 will become the new consumer index, but
1721 		 * first every freed entry, from the old consumer index up
1722 		 * through 'new_indx', must be marked as having HW ownership.
1723 		 */
1724 
1725 		/* Loop through all entries until we reach our new pointer */
1726 		for (indx = cons_indx; indx <= new_indx;
1727 		    indx = (indx + 1) & wrap_around_mask) {
1728 			removed_cqes++;
1729 			cqe = &cq->cq_buf[indx];
1730 
1731 			/* Reset entry to hardware ownership */
1732 			TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1733 		}
1734 	}
1735 
1736 	/*
1737 	 * Update the consumer index to move it past all removed entries.
1738 	 * Because 'new_indx' points to the last of the freed (now HW-owned)
1739 	 * slots, we add 1 so that cons_indx points to the first entry still
1740 	 * owned by software.
1741 	 */
1742 	cons_indx = (new_indx + 1) & wrap_around_mask;
1743 
1744 	/*
1745 	 * Now we ring the doorbell (to update the consumer index) only if
1746 	 * we've actually consumed a CQ entry.  If we found no QP number
1747 	 * matches above, then nothing was removed, and there is no need
1748 	 * to ring the doorbell.
1749 	 */
1750 	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1751 		/*
1752 		 * Post doorbell to update the consumer index.  Doorbell
1753 		 * value indicates number of entries consumed (minus 1)
1754 		 */
1755 		if (cons_indx > cq->cq_consindx) {
1756 			num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1757 		} else {
1758 			num_to_increment = ((cons_indx + cq->cq_bufsz) -
1759 			    cq->cq_consindx) - 1;
1760 		}
1761 		cq->cq_consindx = cons_indx;
1762 
1763 		tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1764 		    cq->cq_cqnum, num_to_increment);
1765 	}
1766 }
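
/*
 * A minimal sketch (illustrative only, not part of the driver) of the
 * wrap-aware doorbell count computed above.  The doorbell carries the
 * number of newly consumed entries minus one; e.g. with cq_bufsz == 64,
 * an old consumer index of 60, and a new one of 2, the count is
 * ((2 + 64) - 60) - 1 == 5.
 */
#ifdef TAVOR_CQ_EXAMPLES	/* hypothetical guard, never defined */
static uint32_t
tavor_cq_db_count_sketch(uint32_t old_indx, uint32_t new_indx,
    uint32_t bufsz)
{
	/* No wrap: simple distance between the two indices, minus one */
	if (new_indx > old_indx)
		return ((new_indx - old_indx) - 1);

	/* Wrapped: add the (power-of-2) buffer size before subtracting */
	return (((new_indx + bufsz) - old_indx) - 1);
}
#endif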
1767