1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * tavor_cq.c
29 * Tavor Completion Queue Processing Routines
30 *
31 * Implements all the routines necessary for allocating, freeing, resizing,
32 * and handling the completion type events that the Tavor hardware can
33 * generate.
34 */
35
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/bitmap.h>
42 #include <sys/sysmacros.h>
43
44 #include <sys/ib/adapters/tavor/tavor.h>
45
46 static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
47 uint32_t cqn, uint32_t cq_param);
48 static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
49 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
50 static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
51 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
52 static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
53 uint_t flag);
54 static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
55 uint32_t old_cons_indx, uint32_t num_newcqe);
56
57 /*
58 * tavor_cq_alloc()
59 * Context: Can be called only from user or kernel context.
60 */
61 int
62 tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
63 ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
64 uint_t sleepflag)
65 {
66 tavor_rsrc_t *cqc, *rsrc;
67 tavor_umap_db_entry_t *umapdb;
68 tavor_hw_cqc_t cqc_entry;
69 tavor_cqhdl_t cq;
70 ibt_mr_attr_t mr_attr;
71 tavor_mr_options_t op;
72 tavor_pdhdl_t pd;
73 tavor_mrhdl_t mr;
74 tavor_hw_cqe_t *buf;
75 uint64_t addr, value;
76 uint32_t log_cq_size, lkey, uarpg;
77 uint_t dma_xfer_mode, cq_sync, cq_is_umap;
78 int status, i, flag;
79
80 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))
81
82 /*
83 * Determine whether CQ is being allocated for userland access or
84 * whether it is being allocated for kernel access. If the CQ is
85 * being allocated for userland access, then lookup the UAR doorbell
86 * page number for the current process. Note: If this is not found
87 * (e.g. if the process has not previously open()'d the Tavor driver),
88 * then an error is returned.
89 */
90 cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
91 if (cq_is_umap) {
92 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
93 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
94 if (status != DDI_SUCCESS) {
95 goto cqalloc_fail;
96 }
97 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
98 }
99
100 /* Use the internal protection domain (PD) for setting up CQs */
101 pd = state->ts_pdhdl_internal;
102
103 /* Increment the reference count on the protection domain (PD) */
104 tavor_pd_refcnt_inc(pd);
105
106 /*
107 * Allocate a CQ context entry. This will be filled in with all
108 * the necessary parameters to define the Completion Queue. And then
109 * ownership will be passed to the hardware in the final step
110 * below. If we fail here, we must undo the protection domain
111 * reference count.
112 */
113 status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
114 if (status != DDI_SUCCESS) {
115 goto cqalloc_fail1;
116 }
117
118 /*
119 * Allocate the software structure for tracking the completion queue
120 * (i.e. the Tavor Completion Queue handle). If we fail here, we must
121 * undo the protection domain reference count and the previous
122 * resource allocation.
123 */
124 status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
125 if (status != DDI_SUCCESS) {
126 goto cqalloc_fail2;
127 }
128 cq = (tavor_cqhdl_t)rsrc->tr_addr;
129 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
130 cq->cq_is_umap = cq_is_umap;
131
132 /* Use the index as CQ number */
133 cq->cq_cqnum = cqc->tr_indx;
134
135 /*
136 * If this will be a user-mappable CQ, then allocate an entry for
137 * the "userland resources database". This will later be added to
138 * the database (after all further CQ operations are successful).
139 * If we fail here, we must undo the reference counts and the
140 * previous resource allocation.
141 */
142 if (cq->cq_is_umap) {
143 umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
144 MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
145 if (umapdb == NULL) {
146 goto cqalloc_fail3;
147 }
148 }
149
150 /*
151 * Calculate the appropriate size for the completion queue.
152 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
153 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
154 * to round the requested size up to the next highest power-of-2
155 */
156 cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
157 log_cq_size = highbit(cq_attr->cq_size);
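/*
 * For example, a request for 8 CQEs gives highbit(8) == 4, so a
 * 16-entry queue is allocated and later reported to the consumer
 * as 15 usable entries; a request for 7 gives an 8-entry queue
 * reported as exactly 7.
 */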
158
159 /*
160 * Next we verify that the rounded-up size is valid (i.e. consistent
161 * with the device limits and/or software-configured limits)
162 */
163 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
164 goto cqalloc_fail4;
165 }
166
167 /*
168 * Allocate the memory for Completion Queue.
169 *
170 * Note: Although we use the common queue allocation routine, we
171 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
172 * kernel system memory) for kernel CQs because it would be
173 * inefficient to have CQs located in DDR memory. This is primarily
174 * because CQs are read from (by software) more than they are written
175 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
176 * user-mappable CQs for a similar reason.)
177 * It is also worth noting that, unlike Tavor QP work queues,
178 * completion queues do not have the same strict alignment
179 * requirements. It is sufficient for the CQ memory to be both
180 * aligned to and bound to addresses which are a multiple of CQE size.
181 */
182 cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
183 cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
184 cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
185 if (cq->cq_is_umap) {
186 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
187 } else {
188 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
189 }
190 status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
191 if (status != DDI_SUCCESS) {
192 goto cqalloc_fail4;
193 }
194 buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
195 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
196
197 /*
198 * Initialize each of the Completion Queue Entries (CQE) by setting
199 * their ownership to hardware ("owner" bit set to HW). This is in
200 * preparation for the final transfer of ownership (below) of the
201 * CQ context itself.
202 */
203 for (i = 0; i < (1 << log_cq_size); i++) {
204 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
205 }
206
207 /*
208 * Register the memory for the CQ. The memory for the CQ must
209 * be registered in the Tavor TPT tables. This gives us the LKey
210 * to specify in the CQ context below. Note: If this is a user-
211 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
212 */
213 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
214 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
215 mr_attr.mr_len = cq->cq_cqinfo.qa_size;
216 mr_attr.mr_as = NULL;
217 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
218 if (cq->cq_is_umap) {
219 dma_xfer_mode = DDI_DMA_CONSISTENT;
220 } else {
221 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
222 }
223 if (dma_xfer_mode == DDI_DMA_STREAMING) {
224 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
225 }
226 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
227 op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
228 op.mro_bind_override_addr = 0;
229 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
230 if (status != DDI_SUCCESS) {
231 goto cqalloc_fail5;
232 }
233 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
234 addr = mr->mr_bindinfo.bi_addr;
235 lkey = mr->mr_lkey;
236
237 /* Determine if later ddi_dma_sync will be necessary */
238 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);
239
240 /* Sync entire CQ for use by the hardware (if necessary). */
241 if (cq_sync) {
242 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
243 cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
244 }
245
246 /*
247 * Fill in the CQC entry. This is the final step before passing
248 * ownership of the CQC entry to the Tavor hardware. We use all of
249 * the information collected/calculated above to fill in the
250 * requisite portions of the CQC. Note: If this CQ is going to be
251 * used for userland access, then we need to set the UAR page number
252 * appropriately (otherwise it's a "don't care")
253 */
254 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
255 cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
256 cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
257 cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED;
258 cqc_entry.state = TAVOR_CQ_DISARMED;
259 cqc_entry.start_addr_h = (addr >> 32);
260 cqc_entry.start_addr_l = (addr & 0xFFFFFFFF);
261 cqc_entry.log_cq_sz = log_cq_size;
262 if (cq->cq_is_umap) {
263 cqc_entry.usr_page = uarpg;
264 } else {
265 cqc_entry.usr_page = 0;
266 }
267 cqc_entry.pd = pd->pd_pdnum;
268 cqc_entry.lkey = lkey;
269 cqc_entry.e_eqn = cq->cq_erreqnum;
270 cqc_entry.c_eqn = cq->cq_eqnum;
271 cqc_entry.cqn = cq->cq_cqnum;
272
273 /*
274 * Write the CQC entry to hardware. Lastly, we pass ownership of
275 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
276 * command). Note: In general, this operation shouldn't fail. But
277 * if it does, we have to undo everything we've done above before
278 * returning error.
279 */
280 status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
281 sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
282 if (status != TAVOR_CMD_SUCCESS) {
283 cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
284 status);
285 goto cqalloc_fail6;
286 }
287
288 /*
289 * Fill in the rest of the Tavor Completion Queue handle. Having
290 * successfully transferred ownership of the CQC, we can update the
291 * following fields for use in further operations on the CQ.
292 */
293 cq->cq_cqcrsrcp = cqc;
294 cq->cq_rsrcp = rsrc;
295 cq->cq_consindx = 0;
296 cq->cq_buf = buf;
297 cq->cq_bufsz = (1 << log_cq_size);
298 cq->cq_mrhdl = mr;
299 cq->cq_sync = cq_sync;
300 cq->cq_refcnt = 0;
301 cq->cq_is_special = 0;
302 cq->cq_uarpg = uarpg;
303 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
304 avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
305 sizeof (struct tavor_workq_hdr_s),
306 offsetof(struct tavor_workq_hdr_s, wq_avl_link));
307
308 cq->cq_wrid_reap_head = NULL;
309 cq->cq_wrid_reap_tail = NULL;
310 cq->cq_hdlrarg = (void *)ibt_cqhdl;
311
312 /*
313 * Put CQ handle in Tavor CQNum-to-CQHdl list. Then fill in the
314 * "actual_size" and "cqhdl" and return success
315 */
316 ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
317 state->ts_cqhdl[cqc->tr_indx] = cq;
318
319 /*
320 * If this is a user-mappable CQ, then we need to insert the previously
321 * allocated entry into the "userland resources database". This will
322 * allow for later lookup during devmap() (i.e. mmap()) calls.
323 */
324 if (cq->cq_is_umap) {
325 tavor_umap_db_add(umapdb);
326 }
327
328 /*
329 * Fill in the return arguments (if necessary). This includes the
330 * real completion queue size.
331 */
332 if (actual_size != NULL) {
333 *actual_size = (1 << log_cq_size) - 1;
334 }
335 *cqhdl = cq;
336
337 return (DDI_SUCCESS);
338
339 /*
340 * The following is cleanup for all possible failure cases in this routine
341 */
342 cqalloc_fail6:
343 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
344 sleepflag) != DDI_SUCCESS) {
345 TAVOR_WARNING(state, "failed to deregister CQ memory");
346 }
347 cqalloc_fail5:
348 tavor_queue_free(state, &cq->cq_cqinfo);
349 cqalloc_fail4:
350 if (cq_is_umap) {
351 tavor_umap_db_free(umapdb);
352 }
353 cqalloc_fail3:
354 tavor_rsrc_free(state, &rsrc);
355 cqalloc_fail2:
356 tavor_rsrc_free(state, &cqc);
357 cqalloc_fail1:
358 tavor_pd_refcnt_dec(pd);
359 cqalloc_fail:
360 return (status);
361 }
362
363
364 /*
365 * tavor_cq_free()
366 * Context: Can be called only from user or kernel context.
367 */
368 /* ARGSUSED */
369 int
370 tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
371 {
372 tavor_rsrc_t *cqc, *rsrc;
373 tavor_umap_db_entry_t *umapdb;
374 tavor_hw_cqc_t cqc_entry;
375 tavor_pdhdl_t pd;
376 tavor_mrhdl_t mr;
377 tavor_cqhdl_t cq;
378 uint32_t cqnum;
379 uint64_t value;
380 uint_t maxprot;
381 int status;
382
383 /*
384 * Pull all the necessary information from the Tavor Completion Queue
385 * handle. This is necessary here because the resource for the
386 * CQ handle is going to be freed up as part of this operation.
387 */
388 cq = *cqhdl;
389 mutex_enter(&cq->cq_lock);
390 cqc = cq->cq_cqcrsrcp;
391 rsrc = cq->cq_rsrcp;
392 pd = state->ts_pdhdl_internal;
393 mr = cq->cq_mrhdl;
394 cqnum = cq->cq_cqnum;
395
396 /*
397 * If there are work queues still associated with the CQ, then return
398 * an error. Otherwise, we will be holding the CQ lock.
399 */
400 if (cq->cq_refcnt != 0) {
401 mutex_exit(&cq->cq_lock);
402 return (IBT_CQ_BUSY);
403 }
404
405 /*
406 * If this was a user-mappable CQ, then we need to remove its entry
407 * from the "userland resources database". If it is also currently
408 * mmap()'d out to a user process, then we need to call
409 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
410 * We also need to invalidate the CQ tracking information for the
411 * user mapping.
412 */
413 if (cq->cq_is_umap) {
414 status = tavor_umap_db_find(state->ts_instance, cqnum,
415 MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
416 &umapdb);
417 if (status != DDI_SUCCESS) {
418 mutex_exit(&cq->cq_lock);
419 TAVOR_WARNING(state, "failed to find in database");
420 return (ibc_get_ci_failure(0));
421 }
422 tavor_umap_db_free(umapdb);
423 if (cq->cq_umap_dhp != NULL) {
424 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
425 status = devmap_devmem_remap(cq->cq_umap_dhp,
426 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
427 maxprot, DEVMAP_MAPPING_INVALID, NULL);
428 if (status != DDI_SUCCESS) {
429 mutex_exit(&cq->cq_lock);
430 TAVOR_WARNING(state, "failed in CQ memory "
431 "devmap_devmem_remap()");
432 return (ibc_get_ci_failure(0));
433 }
434 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
435 }
436 }
437
438 /*
439 * Put NULL into the Tavor CQNum-to-CQHdl list. This will allow any
440 * in-progress events to detect that the CQ corresponding to this
441 * number has been freed.
442 */
443 state->ts_cqhdl[cqc->tr_indx] = NULL;
444
445 /*
446 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
447 * list. This cleans up all the structures associated with the WRID
448 * processing for this CQ. Once we complete, drop the lock and finish
449 * the deallocation of the CQ.
450 */
451 tavor_wrid_cq_force_reap(cq);
452
453 mutex_exit(&cq->cq_lock);
454 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
455
456 /*
457 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
458 * firmware command). If the ownership transfer fails for any reason,
459 * then it is an indication that something (either in HW or SW) has
460 * gone seriously wrong.
461 */
462 status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
463 sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
464 if (status != TAVOR_CMD_SUCCESS) {
465 TAVOR_WARNING(state, "failed to reclaim CQC ownership");
466 cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
467 status);
468 return (ibc_get_ci_failure(0));
469 }
470
471 /*
472 * Deregister the memory for the Completion Queue. If this fails
473 * for any reason, then it is an indication that something (either
474 * in HW or SW) has gone seriously wrong. So we print a warning
475 * message and return.
476 */
477 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
478 sleepflag);
479 if (status != DDI_SUCCESS) {
480 TAVOR_WARNING(state, "failed to deregister CQ memory");
481 return (ibc_get_ci_failure(0));
482 }
483
484 /* Free the memory for the CQ */
485 tavor_queue_free(state, &cq->cq_cqinfo);
486
487 /* Free the Tavor Completion Queue handle */
488 tavor_rsrc_free(state, &rsrc);
489
490 /* Free up the CQC entry resource */
491 tavor_rsrc_free(state, &cqc);
492
493 /* Decrement the reference count on the protection domain (PD) */
494 tavor_pd_refcnt_dec(pd);
495
496 /* Set the cqhdl pointer to NULL and return success */
497 *cqhdl = NULL;
498
499 return (DDI_SUCCESS);
500 }
501
502
503 /*
504 * tavor_cq_resize()
505 * Context: Can be called only from user or kernel context.
506 */
507 int
508 tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
509 uint_t *actual_size, uint_t sleepflag)
510 {
511 tavor_hw_cqc_t cqc_entry;
512 tavor_qalloc_info_t new_cqinfo, old_cqinfo;
513 ibt_mr_attr_t mr_attr;
514 tavor_mr_options_t op;
515 tavor_pdhdl_t pd;
516 tavor_mrhdl_t mr, mr_old;
517 tavor_hw_cqe_t *buf;
518 uint32_t new_prod_indx, old_cons_indx;
519 uint_t dma_xfer_mode, cq_sync, log_cq_size, maxprot;
520 int status, i, flag;
521
522 /* Use the internal protection domain (PD) for CQs */
523 pd = state->ts_pdhdl_internal;
524
525 /*
526 * Calculate the appropriate size for the new resized completion queue.
527 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
528 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
529 * to round the requested size up to the next highest power-of-2
530 */
531 req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
532 log_cq_size = highbit(req_size);
533
534 /*
535 * Next we verify that the rounded-up size is valid (i.e. consistent
536 * with the device limits and/or software-configured limits)
537 */
538 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
539 goto cqresize_fail;
540 }
541
542 /*
543 * Allocate the memory for newly resized Completion Queue.
544 *
545 * Note: Although we use the common queue allocation routine, we
546 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
547 * kernel system memory) for kernel CQs because it would be
548 * inefficient to have CQs located in DDR memory. This is the same
549 * as we do when we first allocate completion queues primarily
550 * because CQs are read from (by software) more than they are written
551 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
552 * user-mappable CQs for a similar reason.)
553 * It is also worth noting that, unlike Tavor QP work queues,
554 * completion queues do not have the same strict alignment
555 * requirements. It is sufficient for the CQ memory to be both
556 * aligned to and bound to addresses which are a multiple of CQE size.
557 */
558 new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
559 new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
560 new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
561 if (cq->cq_is_umap) {
562 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
563 } else {
564 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
565 }
566 status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
567 if (status != DDI_SUCCESS) {
568 goto cqresize_fail;
569 }
570 buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
571 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
572
573 /*
574 * Initialize each of the Completion Queue Entries (CQE) by setting
575 * their ownership to hardware ("owner" bit set to HW). This is in
576 * preparation for the final resize operation (below).
577 */
578 for (i = 0; i < (1 << log_cq_size); i++) {
579 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
580 }
581
582 /*
583 * Register the memory for the CQ. The memory for the CQ must
584 * be registered in the Tavor TPT tables. This gives us the LKey
585 * to specify in the CQ context below.
586 */
587 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
588 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
589 mr_attr.mr_len = new_cqinfo.qa_size;
590 mr_attr.mr_as = NULL;
591 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
592 if (cq->cq_is_umap) {
593 dma_xfer_mode = DDI_DMA_CONSISTENT;
594 } else {
595 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
596 }
597 if (dma_xfer_mode == DDI_DMA_STREAMING) {
598 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
599 }
600 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
601 op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
602 op.mro_bind_override_addr = 0;
603 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
604 if (status != DDI_SUCCESS) {
605 tavor_queue_free(state, &new_cqinfo);
606 goto cqresize_fail;
607 }
608 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
609
610 /* Determine if later ddi_dma_sync will be necessary */
611 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);
612
613 /* Sync entire "new" CQ for use by hardware (if necessary) */
614 if (cq_sync) {
615 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
616 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
617 }
618
619 /*
620 * Now we grab the CQ lock. Since we will be updating the actual
621 * CQ location and the producer/consumer indexes, we should hold
622 * the lock.
623 *
624 * We do a TAVOR_NOSLEEP here (and below), though, because we are
625 * holding the "cq_lock" and if we got raised to interrupt level
626 * by priority inversion, we would not want to block in this routine
627 * waiting for success.
628 */
629 mutex_enter(&cq->cq_lock);
630
631 /*
632 * Determine the current CQ "consumer index".
633 *
634 * Note: This will depend on whether the CQ had previously been
635 * mapped for user access or whether it is a kernel CQ. If this
636 * is a kernel CQ, then all PollCQ() operations have come through
637 * the IBTF and, hence, the driver's CQ state structure will
638 * contain the current consumer index. If, however, the user has
639 * accessed this CQ by bypassing the driver (OS-bypass), then we
640 * need to query the firmware to determine the current CQ consumer
641 * index. This also assumes that the user process will not continue
642 * to consume entries while at the same time doing the ResizeCQ()
643 * operation. If the user process does not guarantee this, then it
644 * may see duplicate or missed completions. But under no
645 * circumstances should this panic the system.
646 */
647 if (cq->cq_is_umap) {
648 status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
649 cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
650 TAVOR_NOSLEEP);
651 if (status != TAVOR_CMD_SUCCESS) {
652 /* Query CQ has failed, drop CQ lock and cleanup */
653 mutex_exit(&cq->cq_lock);
654 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
655 sleepflag) != DDI_SUCCESS) {
656 TAVOR_WARNING(state, "failed to deregister "
657 "CQ memory");
658 }
659 tavor_queue_free(state, &new_cqinfo);
660 TAVOR_WARNING(state, "failed to query CQ context");
661
662 goto cqresize_fail;
663 }
664 old_cons_indx = cqc_entry.cons_indx;
665 } else {
666 old_cons_indx = cq->cq_consindx;
667 }
668
669 /*
670 * Fill in the CQC entry. For the resize operation this is the
671 * final step before attempting the resize operation on the CQC entry.
672 * We use all of the information collected/calculated above to fill
673 * in the requisite portions of the CQC.
674 */
675 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
676 cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32);
677 cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
678 cqc_entry.log_cq_sz = log_cq_size;
679 cqc_entry.lkey = mr->mr_lkey;
680
681 /*
682 * Write the CQC entry to hardware. Lastly, we pass ownership of
683 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
684 * command). Note: In general, this operation shouldn't fail. But
685 * if it does, we have to undo everything we've done above before
686 * returning error. Also note that the status returned may indicate
687 * the code to return to the IBTF.
688 */
689 status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
690 &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
691 if (status != TAVOR_CMD_SUCCESS) {
692 /* Resize attempt has failed, drop CQ lock and cleanup */
693 mutex_exit(&cq->cq_lock);
694 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
695 sleepflag) != DDI_SUCCESS) {
696 TAVOR_WARNING(state, "failed to deregister CQ memory");
697 }
698 tavor_queue_free(state, &new_cqinfo);
699 if (status == TAVOR_CMD_BAD_SIZE) {
700 return (IBT_CQ_SZ_INSUFFICIENT);
701 } else {
702 cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
703 "%08x\n", status);
704 return (ibc_get_ci_failure(0));
705 }
706 }
707
708 /*
709 * The CQ resize attempt was successful. Before dropping the CQ lock,
710 * copy all of the CQEs from the "old" CQ into the "new" CQ. Note:
711 * the Tavor firmware guarantees us that sufficient space is set aside
712 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
713 * The two parameters to this helper function ("old_cons_indx" and
714 * "new_prod_indx") essentially indicate the starting index and number
715 * of any CQEs that might remain in the "old" CQ memory.
716 */
717 tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);
718
719 /* Sync entire "new" CQ for use by hardware (if necessary) */
720 if (cq_sync) {
721 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
722 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
723 }
724
725 /*
726 * Update the Tavor Completion Queue handle with all the new
727 * information. At the same time, save away all the necessary
728 * information for freeing up the old resources
729 */
730 mr_old = cq->cq_mrhdl;
731 old_cqinfo = cq->cq_cqinfo;
732 cq->cq_cqinfo = new_cqinfo;
733 cq->cq_consindx = 0;
734 cq->cq_buf = buf;
735 cq->cq_bufsz = (1 << log_cq_size);
736 cq->cq_mrhdl = mr;
737 cq->cq_sync = cq_sync;
738
739 /*
740 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
741 * to a user process, then we need to call devmap_devmem_remap() to
742 * invalidate the mapping to the CQ memory. We also need to
743 * invalidate the CQ tracking information for the user mapping.
744 */
745 if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
746 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
747 status = devmap_devmem_remap(cq->cq_umap_dhp,
748 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
749 DEVMAP_MAPPING_INVALID, NULL);
750 if (status != DDI_SUCCESS) {
751 mutex_exit(&cq->cq_lock);
752 TAVOR_WARNING(state, "failed in CQ memory "
753 "devmap_devmem_remap()");
754 return (ibc_get_ci_failure(0));
755 }
756 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
757 }
758
759 /*
760 * Drop the CQ lock now. The only thing left to do is to free up
761 * the old resources.
762 */
763 mutex_exit(&cq->cq_lock);
764
765 /*
766 * Deregister the memory for the old Completion Queue. Note: We
767 * really can't return error here because we have no good way to
768 * cleanup. Plus, the deregistration really shouldn't ever fail.
769 * So, if it does, it is an indication that something has gone
770 * seriously wrong. So we print a warning message and return error
771 * (knowing, of course, that the "old" CQ memory will be leaked)
772 */
773 status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
774 sleepflag);
775 if (status != DDI_SUCCESS) {
776 TAVOR_WARNING(state, "failed to deregister old CQ memory");
777 goto cqresize_fail;
778 }
779
780 /* Free the memory for the old CQ */
781 tavor_queue_free(state, &old_cqinfo);
782
783 /*
784 * Fill in the return arguments (if necessary). This includes the
785 * real new completion queue size.
786 */
787 if (actual_size != NULL) {
788 *actual_size = (1 << log_cq_size) - 1;
789 }
790
791 return (DDI_SUCCESS);
792
793 cqresize_fail:
794 return (status);
795 }
796
797
798 /*
799 * tavor_cq_notify()
800 * Context: Can be called from interrupt or base context.
801 */
802 int
803 tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
804 ibt_cq_notify_flags_t flags)
805 {
806 uint_t cqnum;
807
808 /*
809 * Determine if we are trying to get the next completion or the next
810 * "solicited" completion. Then hit the appropriate doorbell.
811 *
812 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
813 * regarding why we do not have to do an extra PIO read here, and we
814 * will not lose an event after writing this doorbell.
815 */
816 cqnum = cq->cq_cqnum;
817 if (flags == IBT_NEXT_COMPLETION) {
818 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
819 TAVOR_CQDB_DEFAULT_PARAM);
820
821 } else if (flags == IBT_NEXT_SOLICITED) {
822 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
823 cqnum, TAVOR_CQDB_DEFAULT_PARAM);
824
825 } else {
826 return (IBT_CQ_NOTIFY_TYPE_INVALID);
827 }
828
829 return (DDI_SUCCESS);
830 }
831
832
833 /*
834 * tavor_cq_poll()
835 * Context: Can be called from interrupt or base context.
836 */
837 int
838 tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
839 uint_t num_wc, uint_t *num_polled)
840 {
841 tavor_hw_cqe_t *cqe;
842 uint32_t cons_indx, wrap_around_mask;
843 uint32_t polled_cnt, num_to_increment;
844 int status;
845
846 /*
847 * Check for user-mappable CQ memory. Note: We do not allow kernel
848 * clients to poll CQ memory that is accessible directly by the user.
849 * If the CQ memory is user accessible, then return an error.
850 */
851 if (cq->cq_is_umap) {
852 return (IBT_CQ_HDL_INVALID);
853 }
854
855 mutex_enter(&cq->cq_lock);
856
857 /* Get the consumer index */
858 cons_indx = cq->cq_consindx;
859
860 /*
861 * Calculate the wrap around mask. Note: This operation only works
862 * because all Tavor completion queues have power-of-2 sizes
863 */
864 wrap_around_mask = (cq->cq_bufsz - 1);
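/*
 * For example, with cq_bufsz == 8 the mask is 0x7, so incrementing
 * the consumer index past entry 7 wraps it back around to entry 0.
 */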
865
866 /* Calculate the pointer to the first CQ entry */
867 cqe = &cq->cq_buf[cons_indx];
868
869 /* Sync the current CQE to read */
870 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
871
872 /*
873 * Keep pulling entries from the CQ until we find an entry owned by
874 * the hardware. As long as the CQEs are owned by SW, process
875 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
876 * consumer index. Note: We only update the consumer index if
877 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB. Otherwise,
878 * it indicates that we are going to "recycle" the CQE (probably
879 * because it is an error CQE and corresponds to more than one
880 * completion).
881 */
882 polled_cnt = 0;
883 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
884 status = tavor_cq_cqe_consume(state, cq, cqe,
885 &wc_p[polled_cnt++]);
886 if (status == TAVOR_CQ_SYNC_AND_DB) {
887 /* Reset entry to hardware ownership */
888 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
889
890 /* Sync the current CQE for device */
891 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);
892
893 /* Increment the consumer index */
894 cons_indx = (cons_indx + 1) & wrap_around_mask;
895
896 /* Update the pointer to the next CQ entry */
897 cqe = &cq->cq_buf[cons_indx];
898
899 /* Sync the next CQE to read */
900 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
901 }
902
903 /*
904 * If we have run out of space to store work completions,
905 * then stop and return the ones we have pulled off the CQ.
906 */
907 if (polled_cnt >= num_wc) {
908 break;
909 }
910 }
911
912 /*
913 * Now we only ring the doorbell (to update the consumer index) if
914 * we've actually consumed a CQ entry. If we have, for example,
915 * pulled from a CQE that we are still in the process of "recycling"
916 * for error purposes, then we would not update the consumer index.
917 */
918 if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
919 /*
920 * Post doorbell to update the consumer index. Doorbell
921 * value indicates number of entries consumed (minus 1)
922 */
923 if (cons_indx > cq->cq_consindx) {
924 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
925 } else {
926 num_to_increment = ((cons_indx + cq->cq_bufsz) -
927 cq->cq_consindx) - 1;
928 }
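/*
 * For example, with cq_bufsz == 8, cq_consindx == 6 and a new
 * cons_indx of 2 (four CQEs consumed, index wrapped), the doorbell
 * value is ((2 + 8) - 6) - 1 == 3, i.e. "consumed minus one".
 */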
929 cq->cq_consindx = cons_indx;
930 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
931 cq->cq_cqnum, num_to_increment);
932
933 } else if (polled_cnt == 0) {
934 /*
935 * If the CQ is empty, we can try to free up some of the WRID
936 * list containers. See tavor_wr.c for more details on this
937 * operation.
938 */
939 tavor_wrid_cq_reap(cq);
940 }
941
942 mutex_exit(&cq->cq_lock);
943
944 /* Set "num_polled" (if necessary) */
945 if (num_polled != NULL) {
946 *num_polled = polled_cnt;
947 }
948
949 /* Set CQ_EMPTY condition if needed, otherwise return success */
950 if (polled_cnt == 0) {
951 status = IBT_CQ_EMPTY;
952 } else {
953 status = DDI_SUCCESS;
954 }
955
956 /*
957 * Check if the system is currently panicking. If it is, then call
958 * the Tavor interrupt service routine. This step is necessary here
959 * because we might be in a polled I/O mode and without the call to
960 * tavor_isr() - and its subsequent calls to poll and rearm each
961 * event queue - we might overflow our EQs and render the system
962 * unable to sync/dump.
963 */
964 if (ddi_in_panic() != 0) {
965 (void) tavor_isr((caddr_t)state, (caddr_t)NULL);
966 }
967
968 return (status);
969 }
970
971
972 /*
973 * tavor_cq_handler()
974 * Context: Only called from interrupt context
975 */
976 int
977 tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
978 tavor_hw_eqe_t *eqe)
979 {
980 tavor_cqhdl_t cq;
981 uint_t cqnum;
982 uint_t eqe_evttype;
983
984 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
985
986 ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
987 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
988
989 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
990 tavor_eq_overflow_handler(state, eq, eqe);
991
992 return (DDI_FAILURE);
993 }
994
995
996 /* Get the CQ handle from CQ number in event descriptor */
997 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
998 cq = tavor_cqhdl_from_cqnum(state, cqnum);
999
1000 /*
1001 * Post the EQ doorbell to move the CQ to the "disarmed" state.
1002 * This operation is to enable subsequent CQ doorbells (e.g. those
1003 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
1004 */
1005 tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);
1006
1007 /*
1008 * If the CQ handle is NULL, this is probably an indication
1009 * that the CQ has been freed already. In which case, we
1010 * should not deliver this event.
1011 *
1012 * We also check that the CQ number in the handle is the
1013 * same as the CQ number in the event queue entry. This
1014 * extra check allows us to handle the case where a CQ was
1015 * freed and then allocated again in the time it took to
1016 * handle the event queue processing. By constantly incrementing
1017 * the non-constrained portion of the CQ number every time
1018 * a new CQ is allocated, we mitigate (somewhat) the chance
1019 * that a stale event could be passed to the client's CQ
1020 * handler.
1021 *
1022 * Lastly, we check if "ts_ibtfpriv" is NULL. If it is then it
1023 * means that we have either received this event before we
1024 * finished attaching to the IBTF or we've received it while we
1025 * are in the process of detaching.
1026 */
1027 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1028 (state->ts_ibtfpriv != NULL)) {
1029 TAVOR_DO_IBTF_CQ_CALLB(state, cq);
1030 }
1031
1032 return (DDI_SUCCESS);
1033 }
1034
1035
1036 /*
1037 * tavor_cq_err_handler()
1038 * Context: Only called from interrupt context
1039 */
1040 int
1041 tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
1042 tavor_hw_eqe_t *eqe)
1043 {
1044 tavor_cqhdl_t cq;
1045 uint_t cqnum;
1046 ibc_async_event_t event;
1047 ibt_async_code_t type;
1048 uint_t eqe_evttype;
1049
1050 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
1051
1052 ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
1053 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
1054
1055 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
1056 tavor_eq_overflow_handler(state, eq, eqe);
1057
1058 return (DDI_FAILURE);
1059 }
1060
1061 /* cmn_err(CE_CONT, "CQ Error handler\n"); */
1062
1063 /* Get the CQ handle from CQ number in event descriptor */
1064 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
1065 cq = tavor_cqhdl_from_cqnum(state, cqnum);
1066
1067 /*
1068 * If the CQ handle is NULL, this is probably an indication
1069 * that the CQ has been freed already. In which case, we
1070 * should not deliver this event.
1071 *
1072 * We also check that the CQ number in the handle is the
1073 * same as the CQ number in the event queue entry. This
1074 * extra check allows us to handle the case where a CQ was
1075 * freed and then allocated again in the time it took to
1076 * handle the event queue processing. By constantly incrementing
1077 * the non-constrained portion of the CQ number every time
1078 * a new CQ is allocated, we mitigate (somewhat) the chance
1079 * that a stale event could be passed to the client's CQ
1080 * handler.
1081 *
1082 * And then we check if "ts_ibtfpriv" is NULL. If it is then it
1083 * means that we have either received this event before we
1084 * finished attaching to the IBTF or we've received it while we
1085 * are in the process of detaching.
1086 */
1087 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1088 (state->ts_ibtfpriv != NULL)) {
1089 event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
1090 type = IBT_ERROR_CQ;
1091
1092 TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
1093 }
1094
1095 return (DDI_SUCCESS);
1096 }
1097
1098
1099 /*
1100 * tavor_cq_refcnt_inc()
1101 * Context: Can be called from interrupt or base context.
1102 */
1103 int
1104 tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
1105 {
1106 /*
1107 * Increment the completion queue's reference count. Note: In order
1108 * to ensure compliance with IBA C11-15, we must ensure that a given
1109 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
1110 * This is accomplished here by keeping track of how the referenced
1111 * CQ is being used.
1112 */
1113 mutex_enter(&cq->cq_lock);
1114 if (cq->cq_refcnt == 0) {
1115 cq->cq_is_special = is_special;
1116 } else {
1117 if (cq->cq_is_special != is_special) {
1118 mutex_exit(&cq->cq_lock);
1119 return (DDI_FAILURE);
1120 }
1121 }
1122 cq->cq_refcnt++;
1123 mutex_exit(&cq->cq_lock);
1124 return (DDI_SUCCESS);
1125 }
1126
1127
1128 /*
1129 * tavor_cq_refcnt_dec()
1130 * Context: Can be called from interrupt or base context.
1131 */
1132 void
1133 tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
1134 {
1135 /* Decrement the completion queue's reference count */
1136 mutex_enter(&cq->cq_lock);
1137 cq->cq_refcnt--;
1138 mutex_exit(&cq->cq_lock);
1139 }
1140
1141
1142 /*
1143 * tavor_cq_doorbell()
1144 * Context: Can be called from interrupt or base context.
1145 */
1146 static void
1147 tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
1148 uint32_t cq_param)
1149 {
1150 uint64_t doorbell = 0;
1151
1152 /* Build the doorbell from the parameters */
1153 doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
1154 ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
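/*
 * Note: the callers in this file pass either TAVOR_CQDB_DEFAULT_PARAM
 * (for the notify/arm requests) or the number of newly consumed CQEs
 * minus one (for consumer index updates) as "cq_param".
 */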
1155
1156 /* Write the doorbell to UAR */
1157 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
1158 doorbell);
1159 }
1160
1161
1162 /*
1163 * tavor_cqhdl_from_cqnum()
1164 * Context: Can be called from interrupt or base context.
1165 *
1166 * This routine is important because changing the unconstrained
1167 * portion of the CQ number is critical to the detection of a
1168 * potential race condition in the CQ handler code (i.e. the case
1169 * where a CQ is freed and alloc'd again before an event for the
1170 * "old" CQ can be handled).
1171 *
1172 * While this is not a perfect solution (not sure that one exists)
1173 * it does help to mitigate the chance that this race condition will
1174 * cause us to deliver a "stale" event to the new CQ owner. Note:
1175 * this solution does not scale well because the number of constrained
1176 * bits increases (and, hence, the number of unconstrained bits
1177 * decreases) as the number of supported CQs grows. For small and
1178 * intermediate values, it should hopefully provide sufficient
1179 * protection.
1180 */
1181 tavor_cqhdl_t
1182 tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
1183 {
1184 uint_t cqindx, cqmask;
1185
1186 /* Calculate the CQ table index from the cqnum */
1187 cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
1188 cqindx = cqnum & cqmask;
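/*
 * For example, with cp_log_num_cq == 16 the mask is 0xFFFF, so only
 * the low 16 bits of "cqnum" select the ts_cqhdl[] slot; any higher
 * bits are the unconstrained portion described above.
 */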
1189 return (state->ts_cqhdl[cqindx]);
1190 }
1191
1192
1193 /*
1194 * tavor_cq_cqe_consume()
1195 * Context: Can be called from interrupt or base context.
1196 */
1197 static int
1198 tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1199 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1200 {
1201 uint_t flags, type, opcode, qpnum, qp1_indx;
1202 int status;
1203
1204 /*
1205 * Determine if this is an "error" CQE by examining "opcode". If it
1206 * is an error CQE, then call tavor_cq_errcqe_consume() and return
1207 * whatever status it returns. Otherwise, this is a successful
1208 * completion.
1209 */
1210 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
1211 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1212 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1213 status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
1214 return (status);
1215 }
1216
1217 /*
1218 * Fetch the Work Request ID using the information in the CQE.
1219 * See tavor_wr.c for more details.
1220 */
1221 wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);
1222
1223 /*
1224 * Parse the CQE opcode to determine completion type. This will set
1225 * not only the type of the completion, but also any flags that might
1226 * be associated with it (e.g. whether immediate data is present).
1227 */
1228 flags = IBT_WC_NO_FLAGS;
1229 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {
1230
1231 /* Send CQE */
1232 switch (opcode) {
1233 case TAVOR_CQE_SND_RDMAWR_IMM:
1234 flags |= IBT_WC_IMMED_DATA_PRESENT;
1235 /* FALLTHROUGH */
1236 case TAVOR_CQE_SND_RDMAWR:
1237 type = IBT_WRC_RDMAW;
1238 break;
1239
1240 case TAVOR_CQE_SND_SEND_IMM:
1241 flags |= IBT_WC_IMMED_DATA_PRESENT;
1242 /* FALLTHROUGH */
1243 case TAVOR_CQE_SND_SEND:
1244 type = IBT_WRC_SEND;
1245 break;
1246
1247 case TAVOR_CQE_SND_RDMARD:
1248 type = IBT_WRC_RDMAR;
1249 break;
1250
1251 case TAVOR_CQE_SND_ATOMIC_CS:
1252 type = IBT_WRC_CSWAP;
1253 break;
1254
1255 case TAVOR_CQE_SND_ATOMIC_FA:
1256 type = IBT_WRC_FADD;
1257 break;
1258
1259 case TAVOR_CQE_SND_BIND_MW:
1260 type = IBT_WRC_BIND;
1261 break;
1262
1263 default:
1264 TAVOR_WARNING(state, "unknown send CQE type");
1265 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1266 return (TAVOR_CQ_SYNC_AND_DB);
1267 }
1268 } else {
1269
1270 /* Receive CQE */
1271 switch (opcode & 0x1F) {
1272 case TAVOR_CQE_RCV_RECV_IMM:
1273 /* FALLTHROUGH */
1274 case TAVOR_CQE_RCV_RECV_IMM2:
1275 /*
1276 * Note: According to the Tavor PRM, all QP1 recv
1277 * completions look like the result of a Send with
1278 * Immediate. They are not, however (MADs are Send
1279 * Only), so we need to check the QP number and set
1280 * the flag only if it is non-QP1.
1281 */
1282 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1283 qp1_indx = state->ts_spec_qp1->tr_indx;
1284 if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
1285 flags |= IBT_WC_IMMED_DATA_PRESENT;
1286 }
1287 /* FALLTHROUGH */
1288 case TAVOR_CQE_RCV_RECV:
1289 /* FALLTHROUGH */
1290 case TAVOR_CQE_RCV_RECV2:
1291 type = IBT_WRC_RECV;
1292 break;
1293
1294 case TAVOR_CQE_RCV_RDMAWR_IMM:
1295 /* FALLTHROUGH */
1296 case TAVOR_CQE_RCV_RDMAWR_IMM2:
1297 flags |= IBT_WC_IMMED_DATA_PRESENT;
1298 type = IBT_WRC_RECV_RDMAWI;
1299 break;
1300
1301 default:
1302 TAVOR_WARNING(state, "unknown recv CQE type");
1303 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1304 return (TAVOR_CQ_SYNC_AND_DB);
1305 }
1306 }
1307 wc->wc_type = type;
1308
1309 /*
1310 * Check for GRH, update the flags, then fill in "wc_flags" field
1311 * in the work completion
1312 */
1313 if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
1314 flags |= IBT_WC_GRH_PRESENT;
1315 }
1316 wc->wc_flags = flags;
1317
1318 /* If we got here, completion status must be success */
1319 wc->wc_status = IBT_WC_SUCCESS;
1320
1321 /*
1322 * Parse the remaining contents of the CQE into the work completion.
1323 * This means filling in SL, QP number, SLID, immediate data, etc.
1324 * Note: Not all of these fields are valid in a given completion.
1325 * Many of them depend on the actual type of completion. So we fill
1326 * in all of the fields and leave it up to the IBTF and consumer to
1327 * sort out which are valid based on their context.
1328 */
1329 wc->wc_sl = TAVOR_CQE_SL_GET(cq, cqe);
1330 wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1331 wc->wc_qpn = TAVOR_CQE_DQPN_GET(cq, cqe);
1332 wc->wc_res_hash = 0;
1333 wc->wc_slid = TAVOR_CQE_DLID_GET(cq, cqe);
1334 wc->wc_ethertype = (wc->wc_immed_data & 0xFFFF);
1335 wc->wc_pkey_ix = (wc->wc_immed_data >> 16);
1336
1337 /*
1338 * Depending on whether the completion was a receive or a send
1339 * completion, fill in "bytes transferred" as appropriate. Also,
1340 * if necessary, fill in the "path bits" field.
1341 */
1342 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
1343 wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
1344 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1345
1346 } else if ((wc->wc_type == IBT_WRC_RDMAR) ||
1347 (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
1348 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1349 }
1350
1351 return (TAVOR_CQ_SYNC_AND_DB);
1352 }
1353
1354
1355 /*
1356 * tavor_cq_errcqe_consume()
1357 * Context: Can be called from interrupt or base context.
1358 */
1359 static int
1360 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1361 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1362 {
1363 uint64_t next_wqeaddr;
1364 uint32_t imm_eth_pkey_cred;
1365 uint_t nextwqesize, dbd;
1366 uint_t doorbell_cnt, status;
1367 tavor_wrid_entry_t wre;
1368
1369 /*
1370 * Fetch the Work Request ID using the information in the CQE.
1371 * See tavor_wr.c for more details.
1372 */
1373 wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1374
1375 /*
1376 * Parse the CQE opcode to determine completion type. We know that
1377 * the CQE is an error completion, so we extract only the completion
1378 * status here.
1379 */
1380 imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1381 status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1382 switch (status) {
1383 case TAVOR_CQE_LOC_LEN_ERR:
1384 status = IBT_WC_LOCAL_LEN_ERR;
1385 break;
1386
1387 case TAVOR_CQE_LOC_OP_ERR:
1388 status = IBT_WC_LOCAL_QP_OP_ERR;
1389 break;
1390
1391 case TAVOR_CQE_LOC_PROT_ERR:
1392 status = IBT_WC_LOCAL_PROTECT_ERR;
1393 break;
1394
1395 case TAVOR_CQE_WR_FLUSHED_ERR:
1396 status = IBT_WC_WR_FLUSHED_ERR;
1397 break;
1398
1399 case TAVOR_CQE_MW_BIND_ERR:
1400 status = IBT_WC_MEM_WIN_BIND_ERR;
1401 break;
1402
1403 case TAVOR_CQE_BAD_RESPONSE_ERR:
1404 status = IBT_WC_BAD_RESPONSE_ERR;
1405 break;
1406
1407 case TAVOR_CQE_LOCAL_ACCESS_ERR:
1408 status = IBT_WC_LOCAL_ACCESS_ERR;
1409 break;
1410
1411 case TAVOR_CQE_REM_INV_REQ_ERR:
1412 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1413 break;
1414
1415 case TAVOR_CQE_REM_ACC_ERR:
1416 status = IBT_WC_REMOTE_ACCESS_ERR;
1417 break;
1418
1419 case TAVOR_CQE_REM_OP_ERR:
1420 status = IBT_WC_REMOTE_OP_ERR;
1421 break;
1422
1423 case TAVOR_CQE_TRANS_TO_ERR:
1424 status = IBT_WC_TRANS_TIMEOUT_ERR;
1425 break;
1426
1427 case TAVOR_CQE_RNRNAK_TO_ERR:
1428 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1429 break;
1430
1431 /*
1432 * The following error codes are not supported in the Tavor driver
1433 * as they relate only to Reliable Datagram completion statuses:
1434 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1435 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1436 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1437 * case TAVOR_CQE_INV_EEC_NUM_ERR:
1438 * case TAVOR_CQE_INV_EEC_STATE_ERR:
1439 * case TAVOR_CQE_LOC_EEC_ERR:
1440 */
1441
1442 default:
1443 TAVOR_WARNING(state, "unknown error CQE status");
1444 status = IBT_WC_LOCAL_QP_OP_ERR;
1445 break;
1446 }
1447 wc->wc_status = status;
1448
1449 /*
1450 * Now we do all the checking that's necessary to handle completion
1451 * queue entry "recycling"
1452 *
1453 * It is not necessary here to try to sync the WQE as we are only
1454 * attempting to read from the Work Queue (and hardware does not
1455 * write to it).
1456 */
1457
1458 /*
1459 * We can get doorbell info, WQE address, size for the next WQE
1460 * from the "wre" (which was filled in above in the call to the
1461 * tavor_wrid_get_entry() routine)
1462 */
1463 dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1464 next_wqeaddr = wre.wr_wqeaddrsz;
1465 nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
1466
1467 /*
1468 * Get the doorbell count from the CQE. This indicates how many
1469 * completions this one CQE represents.
1470 */
1471 doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1472
1473 /*
1474 * Determine if we're ready to consume this CQE yet or not. If the
1475 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1476 * is down to zero, then this is the last/only completion represented
1477 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB). Otherwise, the
1478 * current CQE needs to be recycled (see below).
1479 */
1480 if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1481 /*
1482 * Consume the CQE
1483 * Return status to indicate that doorbell and sync may be
1484 * necessary.
1485 */
1486 return (TAVOR_CQ_SYNC_AND_DB);
1487
1488 } else {
1489 /*
1490 * Recycle the CQE for use in the next PollCQ() call
1491 * Decrement the doorbell count, modify the error status,
1492 * and update the WQE address and size (to point to the
1493 * next WQE on the chain). Put these updated entries back
1494 * into the CQE.
1495 * Despite the fact that we have updated the CQE, it is not
1496 * necessary for us to attempt to sync this entry just yet
1497 * as we have not changed the "hardware's view" of the
1498 * entry (i.e. we have not modified the "owner" bit - which
1499 * is all that the Tavor hardware really cares about).
1500 */
1501 doorbell_cnt = doorbell_cnt - dbd;
1502 TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1503 ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1504 (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1505 TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1506 TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1507
1508 return (TAVOR_CQ_RECYCLE_ENTRY);
1509 }
1510 }
1511
1512
1513 /*
1514 * tavor_cqe_sync()
1515 * Context: Can be called from interrupt or base context.
1516 */
1517 static void
1518 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1519 {
1520 ddi_dma_handle_t dmahdl;
1521 off_t offset;
1522
1523 /* Determine if CQ needs to be synced or not */
1524 if (cq->cq_sync == 0)
1525 return;
1526
1527 /* Get the DMA handle from CQ context */
1528 dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1529
1530 /* Calculate offset of next CQE */
1531 offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
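/*
 * For example, if CQEs are 32 bytes each, syncing the CQE at index 4
 * covers bytes 128 through 159 of the CQ buffer.
 */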
1532 (void) ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1533 }
1534
1535
1536 /*
1537 * tavor_cq_resize_helper()
1538 * Context: Can be called only from user or kernel context.
1539 */
1540 static void
1541 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1542 uint32_t old_cons_indx, uint32_t num_newcqe)
1543 {
1544 tavor_hw_cqe_t *old_cqe, *new_cqe;
1545 uint32_t new_cons_indx, wrap_around_mask;
1546 int i;
1547
1548 ASSERT(MUTEX_HELD(&cq->cq_lock));
1549
1550 /* Get the consumer index */
1551 new_cons_indx = 0;
1552
1553 /*
1554 * Calculate the wrap around mask. Note: This operation only works
1555 * because all Tavor completion queues have power-of-2 sizes
1556 */
1557 wrap_around_mask = (cq->cq_bufsz - 1);
1558
1559 /*
1560 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1561 * and the first CQ entry in the "new" CQ
1562 */
1563 old_cqe = &cq->cq_buf[old_cons_indx];
1564 new_cqe = &new_cqbuf[new_cons_indx];
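/*
 * Note: only the "old" consumer index needs the wraparound mask in
 * the copy loop below. The firmware guarantees that the resized CQ
 * can hold every un-polled CQE (see tavor_cq_resize() above), so the
 * "new" index simply counts up from zero without wrapping.
 */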
1565
1566 /* Sync entire "old" CQ for use by software (if necessary). */
1567 if (cq->cq_sync) {
1568 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1569 0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1570 }
1571
1572 /*
1573 * Keep pulling entries from the "old" CQ until we find an entry owned
1574 * by the hardware. Process each entry by copying it into the "new"
1575 * CQ and updating respective indices and pointers in the "old" CQ.
1576 */
1577 for (i = 0; i < num_newcqe; i++) {
1578
1579 /* Copy this old CQE into the "new_cqe" pointer */
1580 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1581
1582 /* Increment the consumer index (for both CQs) */
1583 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1584 new_cons_indx = (new_cons_indx + 1);
1585
1586 /* Update the pointer to the next CQ entry */
1587 old_cqe = &cq->cq_buf[old_cons_indx];
1588 new_cqe = &new_cqbuf[new_cons_indx];
1589 }
1590 }
1591
1592 /*
1593 * tavor_cq_srq_entries_flush()
1594 * Context: Can be called from interrupt or base context.
1595 */
1596 void
1597 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1598 {
1599 tavor_cqhdl_t cq;
1600 tavor_workq_hdr_t *wqhdr;
1601 tavor_hw_cqe_t *cqe;
1602 tavor_hw_cqe_t *next_cqe;
1603 uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
1604 uint32_t new_indx, check_indx, indx;
1605 uint32_t num_to_increment;
1606 int cqe_qpnum, cqe_type;
1607 int outstanding_cqes, removed_cqes;
1608 int i;
1609
1610 ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1611
1612 cq = qp->qp_rq_cqhdl;
1613 wqhdr = qp->qp_rq_wqhdr;
1614
1615 ASSERT(wqhdr->wq_wrid_post != NULL);
1616 ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1617
1618 /*
1619 * Check for user-mapped CQ memory. Note: We do not allow kernel
1620 * clients to modify any user-mapped CQ. If the CQ is
1621 * user-mapped, then we simply return here, and this "flush" function
1622 * becomes a NO-OP in this case.
1623 */
1624 if (cq->cq_is_umap) {
1625 return;
1626 }
1627
1628 /* Get the consumer index */
1629 cons_indx = cq->cq_consindx;
1630
1631 /*
1632 * Calculate the wrap around mask. Note: This operation only works
1633 * because all Tavor completion queues have power-of-2 sizes
1634 */
1635 wrap_around_mask = (cq->cq_bufsz - 1);
1636
1637 /* Calculate the pointer to the first CQ entry */
1638 cqe = &cq->cq_buf[cons_indx];
1639
1640 /* Sync the current CQE to read */
1641 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1642
1643 /*
1644 * Loop through the CQ looking for entries owned by software. If an
1645 * entry is owned by software then we increment an 'outstanding_cqes'
1646 * count to know how many entries total we have on our CQ. We use this
1647 * value further down to know how many entries to loop through looking
1648 * for our same QP number.
1649 */
1650 outstanding_cqes = 0;
1651 tail_cons_indx = cons_indx;
1652 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1653 /* increment total cqes count */
1654 outstanding_cqes++;
1655
1656 /* increment the consumer index */
1657 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1658
1659 /* update the pointer to the next cq entry */
1660 cqe = &cq->cq_buf[tail_cons_indx];
1661
1662 /* sync the next cqe to read */
1663 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1664 }
1665
1666 /*
1667 * Using the 'tail_cons_indx' that was just set, we now know how many
1668 * total CQEs possible there are. Set the 'check_indx' and the
1669 * 'new_indx' to the last entry identified by 'tail_cons_indx'
1670 */
1671 check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
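/*
 * The loop below walks backward from the newest software-owned CQE
 * toward the consumer index. CQEs that belong to this QP's SRQ
 * receives are released back to the SRQ free list; every other CQE is
 * compacted toward the tail by copying it to 'new_indx'. Surviving
 * completions therefore keep their original order, and the vacated
 * slots collect next to the consumer index, where they are handed
 * back to hardware ownership below.
 */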
1672
1673 for (i = 0; i < outstanding_cqes; i++) {
1674 cqe = &cq->cq_buf[check_indx];
1675
1676 /* Grab QP number from CQE */
1677 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1678 cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1679
1680 /*
1681 * If the QP number is the same in the CQE as the QP that we
1682 * have on this SRQ, then we must free up the entry off the
1683 * SRQ. We also make sure that the completion type is of the
1684 * 'TAVOR_COMPLETION_RECV' type. So any send completions on
1685 * this CQ will be left as-is. The handling of returning
1686 * entries back to HW ownership happens further down.
1687 */
1688 if (cqe_qpnum == qp->qp_qpnum &&
1689 cqe_type == TAVOR_COMPLETION_RECV) {
1690
1691 /* Add back to SRQ free list */
1692 (void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1693 cq, cqe);
1694 } else {
1695 /* Do Copy */
1696 if (check_indx != new_indx) {
1697 next_cqe = &cq->cq_buf[new_indx];
1698
1699 /*
1700 * Copy the CQE into the "next_cqe"
1701 * pointer.
1702 */
1703 bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1704 }
1705 new_indx = (new_indx - 1) & wrap_around_mask;
1706 }
1707 /* Move index to next CQE to check */
1708 check_indx = (check_indx - 1) & wrap_around_mask;
1709 }
1710
1711 /* Initialize removed cqes count */
1712 removed_cqes = 0;
1713
1714 /* If an entry was removed */
1715 if (check_indx != new_indx) {
1716
1717 /*
1718 * Set current pointer back to the beginning consumer index.
1719 * At this point, all unclaimed entries have been copied to the
1720 * index specified by 'new_indx'. This 'new_indx' will be used
1721 * as the new consumer index after we mark all freed entries as
1722 * having HW ownership. We do that here.
1723 */
1724
1725 /* Loop through all entries until we reach our new pointer */
1726 for (indx = cons_indx; indx <= new_indx;
1727 indx = (indx + 1) & wrap_around_mask) {
1728 removed_cqes++;
1729 cqe = &cq->cq_buf[indx];
1730
1731 /* Reset entry to hardware ownership */
1732 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1733 }
1734 }
1735
1736 /*
1737 * Update consumer index to be the 'new_indx'. This moves it past all
1738 * removed entries. Because 'new_indx' points to the last slot just
1739 * handed back to hardware, we add 1 so that cons_indx points to the
1740 * oldest completion still owned by software.
1741 */
1742 cons_indx = (new_indx + 1) & wrap_around_mask;
1743
1744 /*
1745 * Now we only ring the doorbell (to update the consumer index) if
1746 * we've actually consumed a CQ entry. If we found no QP number
1747 * matches above, then we would not have removed anything. So only if
1748 * something was removed do we ring the doorbell.
1749 */
1750 if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1751 /*
1752 * Post doorbell to update the consumer index. Doorbell
1753 * value indicates number of entries consumed (minus 1)
1754 */
1755 if (cons_indx > cq->cq_consindx) {
1756 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1757 } else {
1758 num_to_increment = ((cons_indx + cq->cq_bufsz) -
1759 cq->cq_consindx) - 1;
1760 }
1761 cq->cq_consindx = cons_indx;
1762
1763 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1764 cq->cq_cqnum, num_to_increment);
1765 }
1766 }
1767