1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * tavor_cq.c
29 * Tavor Completion Queue Processing Routines
30 *
31 * Implements all the routines necessary for allocating, freeing, resizing,
32 * and handling the completion type events that the Tavor hardware can
33 * generate.
34 */
35
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/bitmap.h>
42 #include <sys/sysmacros.h>
43
44 #include <sys/ib/adapters/tavor/tavor.h>
45
46 static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
47 uint32_t cqn, uint32_t cq_param);
48 #pragma inline(tavor_cq_doorbell)
49 static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
50 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
51 static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
52 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
53 static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
54 uint_t flag);
55 static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
56 uint32_t old_cons_indx, uint32_t num_newcqe);
57
58 /*
59 * tavor_cq_alloc()
60 * Context: Can be called only from user or kernel context.
61 */
62 int
63 tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
64 ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
65 uint_t sleepflag)
66 {
67 tavor_rsrc_t *cqc, *rsrc;
68 tavor_umap_db_entry_t *umapdb;
69 tavor_hw_cqc_t cqc_entry;
70 tavor_cqhdl_t cq;
71 ibt_mr_attr_t mr_attr;
72 tavor_mr_options_t op;
73 tavor_pdhdl_t pd;
74 tavor_mrhdl_t mr;
75 tavor_hw_cqe_t *buf;
76 uint64_t addr, value;
77 uint32_t log_cq_size, lkey, uarpg;
78 uint_t dma_xfer_mode, cq_sync, cq_is_umap;
79 int status, i, flag;
80
81 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))
82
83 /*
84 * Determine whether CQ is being allocated for userland access or
85 * whether it is being allocated for kernel access. If the CQ is
86 * being allocated for userland access, then lookup the UAR doorbell
87 * page number for the current process. Note: If this is not found
88 * (e.g. if the process has not previously open()'d the Tavor driver),
89 * then an error is returned.
90 */
91 cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
92 if (cq_is_umap) {
93 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
94 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
95 if (status != DDI_SUCCESS) {
96 goto cqalloc_fail;
97 }
98 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
99 }
100
101 /* Use the internal protection domain (PD) for setting up CQs */
102 pd = state->ts_pdhdl_internal;
103
104 /* Increment the reference count on the protection domain (PD) */
105 tavor_pd_refcnt_inc(pd);
106
107 /*
108 * Allocate a CQ context entry. This will be filled in with all
109 * the necessary parameters to define the Completion Queue. And then
110 * ownership will be passed to the hardware in the final step
111 * below. If we fail here, we must undo the protection domain
112 * reference count.
113 */
114 status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
115 if (status != DDI_SUCCESS) {
116 goto cqalloc_fail1;
117 }
118
119 /*
120 * Allocate the software structure for tracking the completion queue
121 * (i.e. the Tavor Completion Queue handle). If we fail here, we must
122 * undo the protection domain reference count and the previous
123 * resource allocation.
124 */
125 status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
126 if (status != DDI_SUCCESS) {
127 goto cqalloc_fail2;
128 }
129 cq = (tavor_cqhdl_t)rsrc->tr_addr;
130 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
131 cq->cq_is_umap = cq_is_umap;
132
133 /* Use the index as CQ number */
134 cq->cq_cqnum = cqc->tr_indx;
135
136 /*
137 * If this will be a user-mappable CQ, then allocate an entry for
138 * the "userland resources database". This will later be added to
139 * the database (after all further CQ operations are successful).
140 * If we fail here, we must undo the reference counts and the
141 * previous resource allocation.
142 */
143 if (cq->cq_is_umap) {
144 umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
145 MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
146 if (umapdb == NULL) {
147 goto cqalloc_fail3;
148 }
149 }
150
151 /*
152 * Calculate the appropriate size for the completion queue.
153 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
154 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
155 * to round the requested size up to the next highest power-of-2
156 */
157 cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
158 log_cq_size = highbit(cq_attr->cq_size);
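/*
 * For illustration (hypothetical request sizes): a request for 1000
 * CQEs gives highbit(1000) == 10, so 1024 entries are allocated and
 * the usable size reported back to the caller (below) is 1023.  A
 * request for exactly 1024 rounds up to 2048 entries (reported as
 * 2047), since only a power-of-2 minus 1 is usable.
 */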
159
160 /*
161 * Next we verify that the rounded-up size is valid (i.e. consistent
162 * with the device limits and/or software-configured limits)
163 */
164 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
165 goto cqalloc_fail4;
166 }
167
168 /*
169 * Allocate the memory for Completion Queue.
170 *
171 * Note: Although we use the common queue allocation routine, we
172 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
173 * kernel system memory) for kernel CQs because it would be
174 * inefficient to have CQs located in DDR memory. This is primarily
175 * because CQs are read from (by software) more than they are written
176 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
177 * user-mappable CQs for a similar reason.)
178 * It is also worth noting that, unlike Tavor QP work queues,
179 * completion queues do not have the same strict alignment
180 * requirements. It is sufficient for the CQ memory to be both
181 * aligned to and bound to addresses which are a multiple of CQE size.
182 */
183 cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
184 cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
185 cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
186 if (cq->cq_is_umap) {
187 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
188 } else {
189 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
190 }
191 status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
192 if (status != DDI_SUCCESS) {
193 goto cqalloc_fail4;
194 }
195 buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
196 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
197
198 /*
199 * Initialize each of the Completion Queue Entries (CQE) by setting
200 * their ownership to hardware ("owner" bit set to HW). This is in
201 * preparation for the final transfer of ownership (below) of the
202 * CQ context itself.
203 */
204 for (i = 0; i < (1 << log_cq_size); i++) {
205 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
206 }
207
208 /*
209 * Register the memory for the CQ. The memory for the CQ must
210 * be registered in the Tavor TPT tables. This gives us the LKey
211 * to specify in the CQ context below. Note: If this is a user-
212 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
213 */
214 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
215 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
216 mr_attr.mr_len = cq->cq_cqinfo.qa_size;
217 mr_attr.mr_as = NULL;
218 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
219 if (cq->cq_is_umap) {
220 dma_xfer_mode = DDI_DMA_CONSISTENT;
221 } else {
222 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
223 }
224 if (dma_xfer_mode == DDI_DMA_STREAMING) {
225 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
226 }
227 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
228 op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
229 op.mro_bind_override_addr = 0;
230 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
231 if (status != DDI_SUCCESS) {
232 goto cqalloc_fail5;
233 }
234 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
235 addr = mr->mr_bindinfo.bi_addr;
236 lkey = mr->mr_lkey;
237
238 /* Determine if later ddi_dma_sync will be necessary */
239 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);
240
241 /* Sync entire CQ for use by the hardware (if necessary). */
242 if (cq_sync) {
243 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
244 cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
245 }
246
247 /*
248 * Fill in the CQC entry. This is the final step before passing
249 * ownership of the CQC entry to the Tavor hardware. We use all of
250 * the information collected/calculated above to fill in the
251 * requisite portions of the CQC. Note: If this CQ is going to be
252 * used for userland access, then we need to set the UAR page number
253 * appropriately (otherwise it's a "don't care")
254 */
255 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
256 cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
257 cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
258 cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED;
259 cqc_entry.state = TAVOR_CQ_DISARMED;
260 cqc_entry.start_addr_h = (addr >> 32);
261 cqc_entry.start_addr_l = (addr & 0xFFFFFFFF);
262 cqc_entry.log_cq_sz = log_cq_size;
263 if (cq->cq_is_umap) {
264 cqc_entry.usr_page = uarpg;
265 } else {
266 cqc_entry.usr_page = 0;
267 }
268 cqc_entry.pd = pd->pd_pdnum;
269 cqc_entry.lkey = lkey;
270 cqc_entry.e_eqn = cq->cq_erreqnum;
271 cqc_entry.c_eqn = cq->cq_eqnum;
272 cqc_entry.cqn = cq->cq_cqnum;
273
274 /*
275 * Write the CQC entry to hardware. Lastly, we pass ownership of
276 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
277 * command). Note: In general, this operation shouldn't fail. But
278 * if it does, we have to undo everything we've done above before
279 * returning error.
280 */
281 status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
282 sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
283 if (status != TAVOR_CMD_SUCCESS) {
284 cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
285 status);
286 goto cqalloc_fail6;
287 }
288
289 /*
290 * Fill in the rest of the Tavor Completion Queue handle. Having
291 * successfully transferred ownership of the CQC, we can update the
292 * following fields for use in further operations on the CQ.
293 */
294 cq->cq_cqcrsrcp = cqc;
295 cq->cq_rsrcp = rsrc;
296 cq->cq_consindx = 0;
297 cq->cq_buf = buf;
298 cq->cq_bufsz = (1 << log_cq_size);
299 cq->cq_mrhdl = mr;
300 cq->cq_sync = cq_sync;
301 cq->cq_refcnt = 0;
302 cq->cq_is_special = 0;
303 cq->cq_uarpg = uarpg;
304 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
305 avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
306 sizeof (struct tavor_workq_hdr_s),
307 offsetof(struct tavor_workq_hdr_s, wq_avl_link));
308
309 cq->cq_wrid_reap_head = NULL;
310 cq->cq_wrid_reap_tail = NULL;
311 cq->cq_hdlrarg = (void *)ibt_cqhdl;
312
313 /*
314 * Put CQ handle in Tavor CQNum-to-CQHdl list. Then fill in the
315 * "actual_size" and "cqhdl" and return success
316 */
317 ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
318 state->ts_cqhdl[cqc->tr_indx] = cq;
319
320 /*
321 * If this is a user-mappable CQ, then we need to insert the previously
322 * allocated entry into the "userland resources database". This will
323 * allow for later lookup during devmap() (i.e. mmap()) calls.
324 */
325 if (cq->cq_is_umap) {
326 tavor_umap_db_add(umapdb);
327 }
328
329 /*
330 * Fill in the return arguments (if necessary). This includes the
331 * real completion queue size.
332 */
333 if (actual_size != NULL) {
334 *actual_size = (1 << log_cq_size) - 1;
335 }
336 *cqhdl = cq;
337
338 return (DDI_SUCCESS);
339
340 /*
341 * The following is cleanup for all possible failure cases in this routine
342 */
343 cqalloc_fail6:
344 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
345 sleepflag) != DDI_SUCCESS) {
346 TAVOR_WARNING(state, "failed to deregister CQ memory");
347 }
348 cqalloc_fail5:
349 tavor_queue_free(state, &cq->cq_cqinfo);
350 cqalloc_fail4:
351 if (cq_is_umap) {
352 tavor_umap_db_free(umapdb);
353 }
354 cqalloc_fail3:
355 tavor_rsrc_free(state, &rsrc);
356 cqalloc_fail2:
357 tavor_rsrc_free(state, &cqc);
358 cqalloc_fail1:
359 tavor_pd_refcnt_dec(pd);
360 cqalloc_fail:
361 return (status);
362 }
363
364
365 /*
366 * tavor_cq_free()
367 * Context: Can be called only from user or kernel context.
368 */
369 /* ARGSUSED */
370 int
371 tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
372 {
373 tavor_rsrc_t *cqc, *rsrc;
374 tavor_umap_db_entry_t *umapdb;
375 tavor_hw_cqc_t cqc_entry;
376 tavor_pdhdl_t pd;
377 tavor_mrhdl_t mr;
378 tavor_cqhdl_t cq;
379 uint32_t cqnum;
380 uint64_t value;
381 uint_t maxprot;
382 int status;
383
384 /*
385 * Pull all the necessary information from the Tavor Completion Queue
386 * handle. This is necessary here because the resource for the
387 * CQ handle is going to be freed up as part of this operation.
388 */
389 cq = *cqhdl;
390 mutex_enter(&cq->cq_lock);
391 cqc = cq->cq_cqcrsrcp;
392 rsrc = cq->cq_rsrcp;
393 pd = state->ts_pdhdl_internal;
394 mr = cq->cq_mrhdl;
395 cqnum = cq->cq_cqnum;
396
397 /*
398 * If there are work queues still associated with the CQ, then return
399 * an error. Otherwise, we will be holding the CQ lock.
400 */
401 if (cq->cq_refcnt != 0) {
402 mutex_exit(&cq->cq_lock);
403 return (IBT_CQ_BUSY);
404 }
405
406 /*
407 * If this was a user-mappable CQ, then we need to remove its entry
408 * from the "userland resources database". If it is also currently
409 * mmap()'d out to a user process, then we need to call
410 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
411 * We also need to invalidate the CQ tracking information for the
412 * user mapping.
413 */
414 if (cq->cq_is_umap) {
415 status = tavor_umap_db_find(state->ts_instance, cqnum,
416 MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
417 &umapdb);
418 if (status != DDI_SUCCESS) {
419 mutex_exit(&cq->cq_lock);
420 TAVOR_WARNING(state, "failed to find in database");
421 return (ibc_get_ci_failure(0));
422 }
423 tavor_umap_db_free(umapdb);
424 if (cq->cq_umap_dhp != NULL) {
425 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
426 status = devmap_devmem_remap(cq->cq_umap_dhp,
427 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
428 maxprot, DEVMAP_MAPPING_INVALID, NULL);
429 if (status != DDI_SUCCESS) {
430 mutex_exit(&cq->cq_lock);
431 TAVOR_WARNING(state, "failed in CQ memory "
432 "devmap_devmem_remap()");
433 return (ibc_get_ci_failure(0));
434 }
435 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
436 }
437 }
438
439 /*
440 * Put NULL into the Tavor CQNum-to-CQHdl list. This will allow any
441 * in-progress events to detect that the CQ corresponding to this
442 * number has been freed.
443 */
444 state->ts_cqhdl[cqc->tr_indx] = NULL;
445
446 /*
447 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
448 * list. This cleans up all the structures associated with the WRID
449 * processing for this CQ. Once we complete, drop the lock and finish
450 * the deallocation of the CQ.
451 */
452 tavor_wrid_cq_force_reap(cq);
453
454 mutex_exit(&cq->cq_lock);
455 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
456
457 /*
458 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
459 * firmware command). If the ownership transfer fails for any reason,
460 * then it is an indication that something (either in HW or SW) has
461 * gone seriously wrong.
462 */
463 status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
464 sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
465 if (status != TAVOR_CMD_SUCCESS) {
466 TAVOR_WARNING(state, "failed to reclaim CQC ownership");
467 cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
468 status);
469 return (ibc_get_ci_failure(0));
470 }
471
472 /*
473 * Deregister the memory for the Completion Queue. If this fails
474 * for any reason, then it is an indication that something (either
475 * in HW or SW) has gone seriously wrong. So we print a warning
476 * message and return.
477 */
478 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
479 sleepflag);
480 if (status != DDI_SUCCESS) {
481 TAVOR_WARNING(state, "failed to deregister CQ memory");
482 return (ibc_get_ci_failure(0));
483 }
484
485 /* Free the memory for the CQ */
486 tavor_queue_free(state, &cq->cq_cqinfo);
487
488 /* Free the Tavor Completion Queue handle */
489 tavor_rsrc_free(state, &rsrc);
490
491 /* Free up the CQC entry resource */
492 tavor_rsrc_free(state, &cqc);
493
494 /* Decrement the reference count on the protection domain (PD) */
495 tavor_pd_refcnt_dec(pd);
496
497 /* Set the cqhdl pointer to NULL and return success */
498 *cqhdl = NULL;
499
500 return (DDI_SUCCESS);
501 }
502
503
504 /*
505 * tavor_cq_resize()
506 * Context: Can be called only from user or kernel context.
507 */
508 int
509 tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
510 uint_t *actual_size, uint_t sleepflag)
511 {
512 tavor_hw_cqc_t cqc_entry;
513 tavor_qalloc_info_t new_cqinfo, old_cqinfo;
514 ibt_mr_attr_t mr_attr;
515 tavor_mr_options_t op;
516 tavor_pdhdl_t pd;
517 tavor_mrhdl_t mr, mr_old;
518 tavor_hw_cqe_t *buf;
519 uint32_t new_prod_indx, old_cons_indx;
520 uint_t dma_xfer_mode, cq_sync, log_cq_size, maxprot;
521 int status, i, flag;
522
523 /* Use the internal protection domain (PD) for CQs */
524 pd = state->ts_pdhdl_internal;
525
526 /*
527 * Calculate the appropriate size for the new resized completion queue.
528 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
529 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
530 * to round the requested size up to the next highest power-of-2
531 */
532 req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
533 log_cq_size = highbit(req_size);
534
535 /*
536 * Next we verify that the rounded-up size is valid (i.e. consistent
537 * with the device limits and/or software-configured limits)
538 */
539 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
540 goto cqresize_fail;
541 }
542
543 /*
544 * Allocate the memory for newly resized Completion Queue.
545 *
546 * Note: Although we use the common queue allocation routine, we
547 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
548 * kernel system memory) for kernel CQs because it would be
549 * inefficient to have CQs located in DDR memory. This is the same
550 * as we do when we first allocate completion queues primarily
551 * because CQs are read from (by software) more than they are written
552 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
553 * user-mappable CQs for a similar reason.)
554 * It is also worth noting that, unlike Tavor QP work queues,
555 * completion queues do not have the same strict alignment
556 * requirements. It is sufficient for the CQ memory to be both
557 * aligned to and bound to addresses which are a multiple of CQE size.
558 */
559 new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
560 new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
561 new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
562 if (cq->cq_is_umap) {
563 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
564 } else {
565 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
566 }
567 status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
568 if (status != DDI_SUCCESS) {
569 goto cqresize_fail;
570 }
571 buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
572 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
573
574 /*
575 * Initialize each of the Completion Queue Entries (CQE) by setting
576 * their ownership to hardware ("owner" bit set to HW). This is in
577 * preparation for the final resize operation (below).
578 */
579 for (i = 0; i < (1 << log_cq_size); i++) {
580 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
581 }
582
583 /*
584 * Register the memory for the CQ. The memory for the CQ must
585 * be registered in the Tavor TPT tables. This gives us the LKey
586 * to specify in the CQ context below.
587 */
588 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
589 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
590 mr_attr.mr_len = new_cqinfo.qa_size;
591 mr_attr.mr_as = NULL;
592 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
593 if (cq->cq_is_umap) {
594 dma_xfer_mode = DDI_DMA_CONSISTENT;
595 } else {
596 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
597 }
598 if (dma_xfer_mode == DDI_DMA_STREAMING) {
599 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
600 }
601 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
602 op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
603 op.mro_bind_override_addr = 0;
604 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
605 if (status != DDI_SUCCESS) {
606 tavor_queue_free(state, &new_cqinfo);
607 goto cqresize_fail;
608 }
609 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
610
611 /* Determine if later ddi_dma_sync will be necessary */
612 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);
613
614 /* Sync entire "new" CQ for use by hardware (if necessary) */
615 if (cq_sync) {
616 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
617 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
618 }
619
620 /*
621 * Now we grab the CQ lock. Since we will be updating the actual
622 * CQ location and the producer/consumer indexes, we should hold
623 * the lock.
624 *
625 * We do a TAVOR_NOSLEEP here (and below), though, because we are
626 * holding the "cq_lock" and if we got raised to interrupt level
627 * by priority inversion, we would not want to block in this routine
628 * waiting for success.
629 */
630 mutex_enter(&cq->cq_lock);
631
632 /*
633 * Determine the current CQ "consumer index".
634 *
635 * Note: This will depend on whether the CQ had previously been
636 * mapped for user access or whether it is a kernel CQ. If this
637 * is a kernel CQ, then all PollCQ() operations have come through
638 * the IBTF and, hence, the driver's CQ state structure will
639 * contain the current consumer index. If, however, the user has
640 * accessed this CQ by bypassing the driver (OS-bypass), then we
641 * need to query the firmware to determine the current CQ consumer
642 * index. This also assumes that the user process will not continue
643 * to consume entries while at the same time doing the ResizeCQ()
644 * operation. If the user process does not guarantee this, then it
645 * may see duplicate or missed completions. But under no
646 * circumstances should this panic the system.
647 */
648 if (cq->cq_is_umap) {
649 status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
650 cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
651 TAVOR_NOSLEEP);
652 if (status != TAVOR_CMD_SUCCESS) {
653 /* Query CQ has failed, drop CQ lock and cleanup */
654 mutex_exit(&cq->cq_lock);
655 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
656 sleepflag) != DDI_SUCCESS) {
657 TAVOR_WARNING(state, "failed to deregister "
658 "CQ memory");
659 }
660 tavor_queue_free(state, &new_cqinfo);
661 TAVOR_WARNING(state, "failed to query CQ context");
662
663 goto cqresize_fail;
664 }
665 old_cons_indx = cqc_entry.cons_indx;
666 } else {
667 old_cons_indx = cq->cq_consindx;
668 }
669
670 /*
671 * Fill in the CQC entry. For the resize operation this is the
672 * final step before attempting the resize operation on the CQC entry.
673 * We use all of the information collected/calculated above to fill
674 * in the requisite portions of the CQC.
675 */
676 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
677 cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32);
678 cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
679 cqc_entry.log_cq_sz = log_cq_size;
680 cqc_entry.lkey = mr->mr_lkey;
681
682 /*
683 * Write the CQC entry to hardware. Lastly, we pass ownership of
684 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
685 * command). Note: In general, this operation shouldn't fail. But
686 * if it does, we have to undo everything we've done above before
687 * returning error. Also note that the status returned may indicate
688 * the code to return to the IBTF.
689 */
690 status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
691 &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
692 if (status != TAVOR_CMD_SUCCESS) {
693 /* Resize attempt has failed, drop CQ lock and cleanup */
694 mutex_exit(&cq->cq_lock);
695 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
696 sleepflag) != DDI_SUCCESS) {
697 TAVOR_WARNING(state, "failed to deregister CQ memory");
698 }
699 tavor_queue_free(state, &new_cqinfo);
700 if (status == TAVOR_CMD_BAD_SIZE) {
701 return (IBT_CQ_SZ_INSUFFICIENT);
702 } else {
703 cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
704 "%08x\n", status);
705 return (ibc_get_ci_failure(0));
706 }
707 }
708
709 /*
710 * The CQ resize attempt was successful. Before dropping the CQ lock,
711 * copy all of the CQEs from the "old" CQ into the "new" CQ. Note:
712 * the Tavor firmware guarantees us that sufficient space is set aside
713 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
714 * The two parameters to this helper function ("old_cons_indx" and
715 * "new_prod_indx") essentially indicate the starting index and number
716 * of any CQEs that might remain in the "old" CQ memory.
717 */
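/*
 * For illustration (hypothetical indices): if the "old" CQ's consumer
 * index was 10 and the RESIZE_CQ command reported a new producer index
 * of 3, the helper copies the three un-polled CQEs at old indices 10,
 * 11, and 12 into slots 0, 1, and 2 of the new buffer; the consumer
 * index is then restarted at 0 (below).
 */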
718 tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);
719
720 /* Sync entire "new" CQ for use by hardware (if necessary) */
721 if (cq_sync) {
722 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
723 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
724 }
725
726 /*
727 * Update the Tavor Completion Queue handle with all the new
728 * information. At the same time, save away all the necessary
729 * information for freeing up the old resources
730 */
731 mr_old = cq->cq_mrhdl;
732 old_cqinfo = cq->cq_cqinfo;
733 cq->cq_cqinfo = new_cqinfo;
734 cq->cq_consindx = 0;
735 cq->cq_buf = buf;
736 cq->cq_bufsz = (1 << log_cq_size);
737 cq->cq_mrhdl = mr;
738 cq->cq_sync = cq_sync;
739
740 /*
741 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
742 * to a user process, then we need to call devmap_devmem_remap() to
743 * invalidate the mapping to the CQ memory. We also need to
744 * invalidate the CQ tracking information for the user mapping.
745 */
746 if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
747 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
748 status = devmap_devmem_remap(cq->cq_umap_dhp,
749 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
750 DEVMAP_MAPPING_INVALID, NULL);
751 if (status != DDI_SUCCESS) {
752 mutex_exit(&cq->cq_lock);
753 TAVOR_WARNING(state, "failed in CQ memory "
754 "devmap_devmem_remap()");
755 return (ibc_get_ci_failure(0));
756 }
757 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
758 }
759
760 /*
761 * Drop the CQ lock now. The only thing left to do is to free up
762 * the old resources.
763 */
764 mutex_exit(&cq->cq_lock);
765
766 /*
767 * Deregister the memory for the old Completion Queue. Note: We
768 * really can't return error here because we have no good way to
769 * cleanup. Plus, a deregistration failure really shouldn't ever happen.
770 * So, if it does, it is an indication that something has gone
771 * seriously wrong. So we print a warning message and return error
772 * (knowing, of course, that the "old" CQ memory will be leaked)
773 */
774 status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
775 sleepflag);
776 if (status != DDI_SUCCESS) {
777 TAVOR_WARNING(state, "failed to deregister old CQ memory");
778 goto cqresize_fail;
779 }
780
781 /* Free the memory for the old CQ */
782 tavor_queue_free(state, &old_cqinfo);
783
784 /*
785 * Fill in the return arguments (if necessary). This includes the
786 * real new completion queue size.
787 */
788 if (actual_size != NULL) {
789 *actual_size = (1 << log_cq_size) - 1;
790 }
791
792 return (DDI_SUCCESS);
793
794 cqresize_fail:
795 return (status);
796 }
797
798
799 /*
800 * tavor_cq_notify()
801 * Context: Can be called from interrupt or base context.
802 */
803 int
804 tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
805 ibt_cq_notify_flags_t flags)
806 {
807 uint_t cqnum;
808
809 /*
810 * Determine if we are trying to get the next completion or the next
811 * "solicited" completion. Then hit the appropriate doorbell.
812 *
813 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
814 * regarding why we do not have to do an extra PIO read here, and we
815 * will not lose an event after writing this doorbell.
816 */
817 cqnum = cq->cq_cqnum;
818 if (flags == IBT_NEXT_COMPLETION) {
819 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
820 TAVOR_CQDB_DEFAULT_PARAM);
821
822 } else if (flags == IBT_NEXT_SOLICITED) {
823 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
824 cqnum, TAVOR_CQDB_DEFAULT_PARAM);
825
826 } else {
827 return (IBT_CQ_NOTIFY_TYPE_INVALID);
828 }
829
830 return (DDI_SUCCESS);
831 }
832
833
834 /*
835 * tavor_cq_poll()
836 * Context: Can be called from interrupt or base context.
837 */
838 int
839 tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
840 uint_t num_wc, uint_t *num_polled)
841 {
842 tavor_hw_cqe_t *cqe;
843 uint32_t cons_indx, wrap_around_mask;
844 uint32_t polled_cnt, num_to_increment;
845 int status;
846
847 /*
848 * Check for user-mappable CQ memory. Note: We do not allow kernel
849 * clients to poll CQ memory that is accessible directly by the user.
850 * If the CQ memory is user accessible, then return an error.
851 */
852 if (cq->cq_is_umap) {
853 return (IBT_CQ_HDL_INVALID);
854 }
855
856 mutex_enter(&cq->cq_lock);
857
858 /* Get the consumer index */
859 cons_indx = cq->cq_consindx;
860
861 /*
862 * Calculate the wrap around mask. Note: This operation only works
863 * because all Tavor completion queues have power-of-2 sizes
864 */
865 wrap_around_mask = (cq->cq_bufsz - 1);
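/*
 * For example, with a cq_bufsz of 64 the mask is 0x3F, so incrementing
 * the consumer index from 63 yields (63 + 1) & 0x3F == 0.
 */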
866
867 /* Calculate the pointer to the first CQ entry */
868 cqe = &cq->cq_buf[cons_indx];
869
870 /* Sync the current CQE to read */
871 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
872
873 /*
874 * Keep pulling entries from the CQ until we find an entry owned by
875 * the hardware. As long as the CQEs are owned by SW, process
876 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
877 * consumer index. Note: We only update the consumer index if
878 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB. Otherwise,
879 * it indicates that we are going to "recycle" the CQE (probably
880 * because it is an error CQE and corresponds to more than one
881 * completion).
882 */
883 polled_cnt = 0;
884 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
885 status = tavor_cq_cqe_consume(state, cq, cqe,
886 &wc_p[polled_cnt++]);
887 if (status == TAVOR_CQ_SYNC_AND_DB) {
888 /* Reset entry to hardware ownership */
889 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
890
891 /* Sync the current CQE for device */
892 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);
893
894 /* Increment the consumer index */
895 cons_indx = (cons_indx + 1) & wrap_around_mask;
896
897 /* Update the pointer to the next CQ entry */
898 cqe = &cq->cq_buf[cons_indx];
899
900 /* Sync the next CQE to read */
901 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
902 }
903
904 /*
905 * If we have run out of space to store work completions,
906 * then stop and return the ones we have pulled off the CQ.
907 */
908 if (polled_cnt >= num_wc) {
909 break;
910 }
911 }
912
913 /*
914 * Now we only ring the doorbell (to update the consumer index) if
915 * we've actually consumed a CQ entry. If we have, for example,
916 * pulled from a CQE that we are still in the process of "recycling"
917 * for error purposes, then we would not update the consumer index.
918 */
919 if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
920 /*
921 * Post doorbell to update the consumer index. Doorbell
922 * value indicates number of entries consumed (minus 1)
923 */
924 if (cons_indx > cq->cq_consindx) {
925 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
926 } else {
927 num_to_increment = ((cons_indx + cq->cq_bufsz) -
928 cq->cq_consindx) - 1;
929 }
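/*
 * For example (hypothetical values): with cq_bufsz of 64, a previous
 * cq_consindx of 60, and a new cons_indx of 2, six entries (60-63 and
 * 0-1) were consumed and the doorbell parameter is
 * ((2 + 64) - 60) - 1 == 5.
 */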
930 cq->cq_consindx = cons_indx;
931 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
932 cq->cq_cqnum, num_to_increment);
933
934 } else if (polled_cnt == 0) {
935 /*
936 * If the CQ is empty, we can try to free up some of the WRID
937 * list containers. See tavor_wr.c for more details on this
938 * operation.
939 */
940 tavor_wrid_cq_reap(cq);
941 }
942
943 mutex_exit(&cq->cq_lock);
944
945 /* Set "num_polled" (if necessary) */
946 if (num_polled != NULL) {
947 *num_polled = polled_cnt;
948 }
949
950 /* Set CQ_EMPTY condition if needed, otherwise return success */
951 if (polled_cnt == 0) {
952 status = IBT_CQ_EMPTY;
953 } else {
954 status = DDI_SUCCESS;
955 }
956
957 /*
958 * Check if the system is currently panicking. If it is, then call
959 * the Tavor interrupt service routine. This step is necessary here
960 * because we might be in a polled I/O mode and without the call to
961 * tavor_isr() - and its subsequent calls to poll and rearm each
962 * event queue - we might overflow our EQs and render the system
963 * unable to sync/dump.
964 */
965 if (ddi_in_panic() != 0) {
966 (void) tavor_isr((caddr_t)state, (caddr_t)NULL);
967 }
968
969 return (status);
970 }
971
972
973 /*
974 * tavor_cq_handler()
975 * Context: Only called from interrupt context
976 */
977 int
978 tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
979 tavor_hw_eqe_t *eqe)
980 {
981 tavor_cqhdl_t cq;
982 uint_t cqnum;
983 uint_t eqe_evttype;
984
985 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
986
987 ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
988 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
989
990 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
991 tavor_eq_overflow_handler(state, eq, eqe);
992
993 return (DDI_FAILURE);
994 }
995
996
997 /* Get the CQ handle from CQ number in event descriptor */
998 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
999 cq = tavor_cqhdl_from_cqnum(state, cqnum);
1000
1001 /*
1002 * Post the EQ doorbell to move the CQ to the "disarmed" state.
1003 * This operation is to enable subsequent CQ doorbells (e.g. those
1004 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
1005 */
1006 tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);
1007
1008 /*
1009 * If the CQ handle is NULL, this is probably an indication
1010 * that the CQ has been freed already. In which case, we
1011 * should not deliver this event.
1012 *
1013 * We also check that the CQ number in the handle is the
1014 * same as the CQ number in the event queue entry. This
1015 * extra check allows us to handle the case where a CQ was
1016 * freed and then allocated again in the time it took to
1017 * handle the event queue processing. By constantly incrementing
1018 * the non-constrained portion of the CQ number every time
1019 * a new CQ is allocated, we mitigate (somewhat) the chance
1020 * that a stale event could be passed to the client's CQ
1021 * handler.
1022 *
1023 * Lastly, we check if "ts_ibtfpriv" is NULL. If it is then it
1024 * means that we have either received this event before we
1025 * finished attaching to the IBTF or we've received it while we
1026 * are in the process of detaching.
1027 */
1028 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1029 (state->ts_ibtfpriv != NULL)) {
1030 TAVOR_DO_IBTF_CQ_CALLB(state, cq);
1031 }
1032
1033 return (DDI_SUCCESS);
1034 }
1035
1036
1037 /*
1038 * tavor_cq_err_handler()
1039 * Context: Only called from interrupt context
1040 */
1041 int
1042 tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
1043 tavor_hw_eqe_t *eqe)
1044 {
1045 tavor_cqhdl_t cq;
1046 uint_t cqnum;
1047 ibc_async_event_t event;
1048 ibt_async_code_t type;
1049 uint_t eqe_evttype;
1050
1051 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
1052
1053 ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
1054 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
1055
1056 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
1057 tavor_eq_overflow_handler(state, eq, eqe);
1058
1059 return (DDI_FAILURE);
1060 }
1061
1062 /* cmn_err(CE_CONT, "CQ Error handler\n"); */
1063
1064 /* Get the CQ handle from CQ number in event descriptor */
1065 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
1066 cq = tavor_cqhdl_from_cqnum(state, cqnum);
1067
1068 /*
1069 * If the CQ handle is NULL, this is probably an indication
1070 * that the CQ has been freed already. In which case, we
1071 * should not deliver this event.
1072 *
1073 * We also check that the CQ number in the handle is the
1074 * same as the CQ number in the event queue entry. This
1075 * extra check allows us to handle the case where a CQ was
1076 * freed and then allocated again in the time it took to
1077 * handle the event queue processing. By constantly incrementing
1078 * the non-constrained portion of the CQ number every time
1079 * a new CQ is allocated, we mitigate (somewhat) the chance
1080 * that a stale event could be passed to the client's CQ
1081 * handler.
1082 *
1083 * And then we check if "ts_ibtfpriv" is NULL. If it is then it
1084 * means that we have either received this event before we
1085 * finished attaching to the IBTF or we've received it while we
1086 * are in the process of detaching.
1087 */
1088 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1089 (state->ts_ibtfpriv != NULL)) {
1090 event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
1091 type = IBT_ERROR_CQ;
1092
1093 TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
1094 }
1095
1096 return (DDI_SUCCESS);
1097 }
1098
1099
1100 /*
1101 * tavor_cq_refcnt_inc()
1102 * Context: Can be called from interrupt or base context.
1103 */
1104 int
1105 tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
1106 {
1107 /*
1108 * Increment the completion queue's reference count. Note: In order
1109 * to ensure compliance with IBA C11-15, we must ensure that a given
1110 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
1111 * This is accomplished here by keeping track of how the referenced
1112 * CQ is being used.
1113 */
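/*
 * For example, a CQ whose first reference came from a special (SMI/GSI)
 * QP cannot later be attached to a normal QP (and vice versa); the
 * mismatched attach attempt fails with DDI_FAILURE below.
 */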
1114 mutex_enter(&cq->cq_lock);
1115 if (cq->cq_refcnt == 0) {
1116 cq->cq_is_special = is_special;
1117 } else {
1118 if (cq->cq_is_special != is_special) {
1119 mutex_exit(&cq->cq_lock);
1120 return (DDI_FAILURE);
1121 }
1122 }
1123 cq->cq_refcnt++;
1124 mutex_exit(&cq->cq_lock);
1125 return (DDI_SUCCESS);
1126 }
1127
1128
1129 /*
1130 * tavor_cq_refcnt_dec()
1131 * Context: Can be called from interrupt or base context.
1132 */
1133 void
1134 tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
1135 {
1136 /* Decrement the completion queue's reference count */
1137 mutex_enter(&cq->cq_lock);
1138 cq->cq_refcnt--;
1139 mutex_exit(&cq->cq_lock);
1140 }
1141
1142
1143 /*
1144 * tavor_cq_doorbell()
1145 * Context: Can be called from interrupt or base context.
1146 */
1147 static void
1148 tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
1149 uint32_t cq_param)
1150 {
1151 uint64_t doorbell = 0;
1152
1153 /* Build the doorbell from the parameters */
1154 doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
1155 ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
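/*
 * Note: the doorbell value is just the command and CQ number shifted
 * into their respective fields and OR'd with the parameter (e.g. the
 * default parameter for a notify request, or a consumer index
 * increment count), posted to the UAR in a single 64-bit write below.
 */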
1156
1157 /* Write the doorbell to UAR */
1158 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
1159 doorbell);
1160 }
1161
1162
1163 /*
1164 * tavor_cqhdl_from_cqnum()
1165 * Context: Can be called from interrupt or base context.
1166 *
1167 * This routine is important because changing the unconstrained
1168 * portion of the CQ number is critical to the detection of a
1169 * potential race condition in the CQ handler code (i.e. the case
1170 * where a CQ is freed and alloc'd again before an event for the
1171 * "old" CQ can be handled).
1172 *
1173 * While this is not a perfect solution (not sure that one exists)
1174 * it does help to mitigate the chance that this race condition will
1175 * cause us to deliver a "stale" event to the new CQ owner. Note:
1176 * this solution does not scale well because the number of constrained
1177 * bits increases (and, hence, the number of unconstrained bits
1178 * decreases) as the number of supported CQs grows. For small and
1179 * intermediate values, it should hopefully provide sufficient
1180 * protection.
1181 */
1182 tavor_cqhdl_t
1183 tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
1184 {
1185 uint_t cqindx, cqmask;
1186
1187 /* Calculate the CQ table index from the cqnum */
1188 cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
1189 cqindx = cqnum & cqmask;
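/*
 * For example, with cp_log_num_cq configured as 16 (a hypothetical
 * value), cqmask is 0xFFFF and a cqnum of 0x12345 selects table index
 * 0x2345; the upper (unconstrained) bits serve only to distinguish
 * successive users of the same table slot.
 */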
1190 return (state->ts_cqhdl[cqindx]);
1191 }
1192
1193
1194 /*
1195 * tavor_cq_cqe_consume()
1196 * Context: Can be called from interrupt or base context.
1197 */
1198 static int
1199 tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1200 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1201 {
1202 uint_t flags, type, opcode, qpnum, qp1_indx;
1203 int status;
1204
1205 /*
1206 * Determine if this is an "error" CQE by examining "opcode". If it
1207 * is an error CQE, then call tavor_cq_errcqe_consume() and return
1208 * whatever status it returns. Otherwise, this is a successful
1209 * completion.
1210 */
1211 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
1212 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1213 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1214 status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
1215 return (status);
1216 }
1217
1218 /*
1219 * Fetch the Work Request ID using the information in the CQE.
1220 * See tavor_wr.c for more details.
1221 */
1222 wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);
1223
1224 /*
1225 * Parse the CQE opcode to determine completion type. This will set
1226 * not only the type of the completion, but also any flags that might
1227 * be associated with it (e.g. whether immediate data is present).
1228 */
1229 flags = IBT_WC_NO_FLAGS;
1230 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {
1231
1232 /* Send CQE */
1233 switch (opcode) {
1234 case TAVOR_CQE_SND_RDMAWR_IMM:
1235 flags |= IBT_WC_IMMED_DATA_PRESENT;
1236 /* FALLTHROUGH */
1237 case TAVOR_CQE_SND_RDMAWR:
1238 type = IBT_WRC_RDMAW;
1239 break;
1240
1241 case TAVOR_CQE_SND_SEND_IMM:
1242 flags |= IBT_WC_IMMED_DATA_PRESENT;
1243 /* FALLTHROUGH */
1244 case TAVOR_CQE_SND_SEND:
1245 type = IBT_WRC_SEND;
1246 break;
1247
1248 case TAVOR_CQE_SND_RDMARD:
1249 type = IBT_WRC_RDMAR;
1250 break;
1251
1252 case TAVOR_CQE_SND_ATOMIC_CS:
1253 type = IBT_WRC_CSWAP;
1254 break;
1255
1256 case TAVOR_CQE_SND_ATOMIC_FA:
1257 type = IBT_WRC_FADD;
1258 break;
1259
1260 case TAVOR_CQE_SND_BIND_MW:
1261 type = IBT_WRC_BIND;
1262 break;
1263
1264 default:
1265 TAVOR_WARNING(state, "unknown send CQE type");
1266 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1267 return (TAVOR_CQ_SYNC_AND_DB);
1268 }
1269 } else {
1270
1271 /* Receive CQE */
1272 switch (opcode & 0x1F) {
1273 case TAVOR_CQE_RCV_RECV_IMM:
1274 /* FALLTHROUGH */
1275 case TAVOR_CQE_RCV_RECV_IMM2:
1276 /*
1277 * Note: According to the Tavor PRM, all QP1 recv
1278 * completions look like the result of a Send with
1279 * Immediate. They are not, however, (MADs are Send
1280 * Only) so we need to check the QP number and set
1281 * the flag only if it is non-QP1.
1282 */
1283 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1284 qp1_indx = state->ts_spec_qp1->tr_indx;
1285 if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
1286 flags |= IBT_WC_IMMED_DATA_PRESENT;
1287 }
1288 /* FALLTHROUGH */
1289 case TAVOR_CQE_RCV_RECV:
1290 /* FALLTHROUGH */
1291 case TAVOR_CQE_RCV_RECV2:
1292 type = IBT_WRC_RECV;
1293 break;
1294
1295 case TAVOR_CQE_RCV_RDMAWR_IMM:
1296 /* FALLTHROUGH */
1297 case TAVOR_CQE_RCV_RDMAWR_IMM2:
1298 flags |= IBT_WC_IMMED_DATA_PRESENT;
1299 type = IBT_WRC_RECV_RDMAWI;
1300 break;
1301
1302 default:
1303 TAVOR_WARNING(state, "unknown recv CQE type");
1304 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1305 return (TAVOR_CQ_SYNC_AND_DB);
1306 }
1307 }
1308 wc->wc_type = type;
1309
1310 /*
1311 * Check for GRH, update the flags, then fill in "wc_flags" field
1312 * in the work completion
1313 */
1314 if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
1315 flags |= IBT_WC_GRH_PRESENT;
1316 }
1317 wc->wc_flags = flags;
1318
1319 /* If we got here, completion status must be success */
1320 wc->wc_status = IBT_WC_SUCCESS;
1321
1322 /*
1323 * Parse the remaining contents of the CQE into the work completion.
1324 * This means filling in SL, QP number, SLID, immediate data, etc.
1325 * Note: Not all of these fields are valid in a given completion.
1326 * Many of them depend on the actual type of completion. So we fill
1327 * in all of the fields and leave it up to the IBTF and consumer to
1328 * sort out which are valid based on their context.
1329 */
1330 wc->wc_sl = TAVOR_CQE_SL_GET(cq, cqe);
1331 wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1332 wc->wc_qpn = TAVOR_CQE_DQPN_GET(cq, cqe);
1333 wc->wc_res_hash = 0;
1334 wc->wc_slid = TAVOR_CQE_DLID_GET(cq, cqe);
1335 wc->wc_ethertype = (wc->wc_immed_data & 0xFFFF);
1336 wc->wc_pkey_ix = (wc->wc_immed_data >> 16);
1337
1338 /*
1339 * Depending on whether the completion was a receive or a send
1340 * completion, fill in "bytes transferred" as appropriate. Also,
1341 * if necessary, fill in the "path bits" field.
1342 */
1343 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
1344 wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
1345 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1346
1347 } else if ((wc->wc_type == IBT_WRC_RDMAR) ||
1348 (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
1349 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1350 }
1351
1352 return (TAVOR_CQ_SYNC_AND_DB);
1353 }
1354
1355
1356 /*
1357 * tavor_cq_errcqe_consume()
1358 * Context: Can be called from interrupt or base context.
1359 */
1360 static int
1361 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1362 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1363 {
1364 uint64_t next_wqeaddr;
1365 uint32_t imm_eth_pkey_cred;
1366 uint_t nextwqesize, dbd;
1367 uint_t doorbell_cnt, status;
1368 tavor_wrid_entry_t wre;
1369
1370 /*
1371 * Fetch the Work Request ID using the information in the CQE.
1372 * See tavor_wr.c for more details.
1373 */
1374 wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1375
1376 /*
1377 * Parse the CQE opcode to determine completion type. We know that
1378 * the CQE is an error completion, so we extract only the completion
1379 * status here.
1380 */
1381 imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1382 status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1383 switch (status) {
1384 case TAVOR_CQE_LOC_LEN_ERR:
1385 status = IBT_WC_LOCAL_LEN_ERR;
1386 break;
1387
1388 case TAVOR_CQE_LOC_OP_ERR:
1389 status = IBT_WC_LOCAL_QP_OP_ERR;
1390 break;
1391
1392 case TAVOR_CQE_LOC_PROT_ERR:
1393 status = IBT_WC_LOCAL_PROTECT_ERR;
1394 break;
1395
1396 case TAVOR_CQE_WR_FLUSHED_ERR:
1397 status = IBT_WC_WR_FLUSHED_ERR;
1398 break;
1399
1400 case TAVOR_CQE_MW_BIND_ERR:
1401 status = IBT_WC_MEM_WIN_BIND_ERR;
1402 break;
1403
1404 case TAVOR_CQE_BAD_RESPONSE_ERR:
1405 status = IBT_WC_BAD_RESPONSE_ERR;
1406 break;
1407
1408 case TAVOR_CQE_LOCAL_ACCESS_ERR:
1409 status = IBT_WC_LOCAL_ACCESS_ERR;
1410 break;
1411
1412 case TAVOR_CQE_REM_INV_REQ_ERR:
1413 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1414 break;
1415
1416 case TAVOR_CQE_REM_ACC_ERR:
1417 status = IBT_WC_REMOTE_ACCESS_ERR;
1418 break;
1419
1420 case TAVOR_CQE_REM_OP_ERR:
1421 status = IBT_WC_REMOTE_OP_ERR;
1422 break;
1423
1424 case TAVOR_CQE_TRANS_TO_ERR:
1425 status = IBT_WC_TRANS_TIMEOUT_ERR;
1426 break;
1427
1428 case TAVOR_CQE_RNRNAK_TO_ERR:
1429 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1430 break;
1431
1432 /*
1433 * The following error codes are not supported in the Tavor driver
1434 * as they relate only to Reliable Datagram completion statuses:
1435 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1436 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1437 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1438 * case TAVOR_CQE_INV_EEC_NUM_ERR:
1439 * case TAVOR_CQE_INV_EEC_STATE_ERR:
1440 * case TAVOR_CQE_LOC_EEC_ERR:
1441 */
1442
1443 default:
1444 TAVOR_WARNING(state, "unknown error CQE status");
1445 status = IBT_WC_LOCAL_QP_OP_ERR;
1446 break;
1447 }
1448 wc->wc_status = status;
1449
1450 /*
1451 * Now we do all the checking that's necessary to handle completion
1452 * queue entry "recycling"
1453 *
1454 * It is not necessary here to try to sync the WQE as we are only
1455 * attempting to read from the Work Queue (and hardware does not
1456 * write to it).
1457 */
1458
1459 /*
1460 * We can get doorbell info, WQE address, size for the next WQE
1461 * from the "wre" (which was filled in above in the call to the
1462 * tavor_wrid_get_entry() routine)
1463 */
1464 dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1465 next_wqeaddr = wre.wr_wqeaddrsz;
1466 nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
1467
1468 /*
1469 * Get the doorbell count from the CQE. This indicates how many
1470 * completions this one CQE represents.
1471 */
1472 doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1473
1474 /*
1475 * Determine if we're ready to consume this CQE yet or not. If the
1476 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1477 * is down to zero, then this is the last/only completion represented
1478 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB). Otherwise, the
1479 * current CQE needs to be recycled (see below).
1480 */
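/*
 * For illustration: an error CQE carrying a doorbell count of 2, where
 * each WQE in the chain was doorbelled (dbd == 1) and has a successor,
 * is recycled twice (the stored count dropping to 1, then 0) and is
 * finally consumed on the third poll.
 */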
1481 if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1482 /*
1483 * Consume the CQE
1484 * Return status to indicate that doorbell and sync may be
1485 * necessary.
1486 */
1487 return (TAVOR_CQ_SYNC_AND_DB);
1488
1489 } else {
1490 /*
1491 * Recycle the CQE for use in the next PollCQ() call
1492 * Decrement the doorbell count, modify the error status,
1493 * and update the WQE address and size (to point to the
1494 * next WQE on the chain). Put these updated entries back
1495 * into the CQE.
1496 * Despite the fact that we have updated the CQE, it is not
1497 * necessary for us to attempt to sync this entry just yet
1498 * as we have not changed the "hardware's view" of the
1499 * entry (i.e. we have not modified the "owner" bit - which
1500 * is all that the Tavor hardware really cares about).
1501 */
1502 doorbell_cnt = doorbell_cnt - dbd;
1503 TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1504 ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1505 (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1506 TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1507 TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1508
1509 return (TAVOR_CQ_RECYCLE_ENTRY);
1510 }
1511 }
1512
1513
1514 /*
1515 * tavor_cqe_sync()
1516 * Context: Can be called from interrupt or base context.
1517 */
1518 static void
1519 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1520 {
1521 ddi_dma_handle_t dmahdl;
1522 off_t offset;
1523
1524 /* Determine if CQ needs to be synced or not */
1525 if (cq->cq_sync == 0)
1526 return;
1527
1528 /* Get the DMA handle from CQ context */
1529 dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1530
1531 /* Calculate the byte offset of this CQE within the CQ buffer */
1532 offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
1533 (void) ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1534 }
1535
1536
1537 /*
1538 * tavor_cq_resize_helper()
1539 * Context: Can be called only from user or kernel context.
1540 */
1541 static void
1542 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1543 uint32_t old_cons_indx, uint32_t num_newcqe)
1544 {
1545 tavor_hw_cqe_t *old_cqe, *new_cqe;
1546 uint32_t new_cons_indx, wrap_around_mask;
1547 int i;
1548
1549 ASSERT(MUTEX_HELD(&cq->cq_lock));
1550
1551 /* Get the consumer index */
1552 new_cons_indx = 0;
1553
1554 /*
1555 * Calculate the wrap around mask. Note: This operation only works
1556 * because all Tavor completion queues have power-of-2 sizes
1557 */
1558 wrap_around_mask = (cq->cq_bufsz - 1);
1559
1560 /*
1561 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1562 * and the first CQ entry in the "new" CQ
1563 */
1564 old_cqe = &cq->cq_buf[old_cons_indx];
1565 new_cqe = &new_cqbuf[new_cons_indx];
1566
1567 /* Sync entire "old" CQ for use by software (if necessary). */
1568 if (cq->cq_sync) {
1569 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1570 0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1571 }
1572
1573 /*
1574 * Keep pulling entries from the "old" CQ until we find an entry owned
1575 * by the hardware. Process each entry by copying it into the "new"
1576 * CQ and updating respective indices and pointers in the "old" CQ.
1577 */
1578 for (i = 0; i < num_newcqe; i++) {
1579
1580 /* Copy this old CQE into the "new_cqe" pointer */
1581 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1582
1583 /* Increment the consumer index (for both CQs) */
1584 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1585 new_cons_indx = (new_cons_indx + 1);
1586
1587 /* Update the pointer to the next CQ entry */
1588 old_cqe = &cq->cq_buf[old_cons_indx];
1589 new_cqe = &new_cqbuf[new_cons_indx];
1590 }
1591 }
1592
1593 /*
1594 * tavor_cq_srq_entries_flush()
1595 * Context: Can be called from interrupt or base context.
1596 */
1597 void
1598 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1599 {
1600 tavor_cqhdl_t cq;
1601 tavor_workq_hdr_t *wqhdr;
1602 tavor_hw_cqe_t *cqe;
1603 tavor_hw_cqe_t *next_cqe;
1604 uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
1605 uint32_t new_indx, check_indx, indx;
1606 uint32_t num_to_increment;
1607 int cqe_qpnum, cqe_type;
1608 int outstanding_cqes, removed_cqes;
1609 int i;
1610
1611 ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1612
1613 cq = qp->qp_rq_cqhdl;
1614 wqhdr = qp->qp_rq_wqhdr;
1615
1616 ASSERT(wqhdr->wq_wrid_post != NULL);
1617 ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1618
1619 /*
1620 * Check for user-mapped CQ memory. Note: We do not allow kernel
1621 * clients to modify any user-mapped CQ. If the CQ is
1622 * user-mapped, then we simply return here, and this "flush" function
1623 * becomes a NO-OP in this case.
1624 */
1625 if (cq->cq_is_umap) {
1626 return;
1627 }
1628
1629 /* Get the consumer index */
1630 cons_indx = cq->cq_consindx;
1631
1632 /*
1633 * Calculate the wrap around mask. Note: This operation only works
1634 * because all Tavor completion queues have power-of-2 sizes
1635 */
1636 wrap_around_mask = (cq->cq_bufsz - 1);
1637
1638 /* Calculate the pointer to the first CQ entry */
1639 cqe = &cq->cq_buf[cons_indx];
1640
1641 /* Sync the current CQE to read */
1642 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1643
1644 /*
1645 * Loop through the CQ looking for entries owned by software. If an
1646 * entry is owned by software then we increment an 'outstanding_cqes'
1647 * count to know how many entries total we have on our CQ. We use this
1648 * value further down to know how many entries to loop through looking
1649 * for our same QP number.
1650 */
1651 outstanding_cqes = 0;
1652 tail_cons_indx = cons_indx;
1653 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1654 /* increment total cqes count */
1655 outstanding_cqes++;
1656
1657 /* increment the consumer index */
1658 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1659
1660 /* update the pointer to the next cq entry */
1661 cqe = &cq->cq_buf[tail_cons_indx];
1662
1663 /* sync the next cqe to read */
1664 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1665 }
1666
1667 /*
1668 * Using the 'tail_cons_indx' that was just set, we now know how many
1669 * total CQEs possible there are. Set the 'check_indx' and the
1670 * 'new_indx' to the last entry identified by 'tail_cons_indx'
1671 */
1672 check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
1673
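/*
 * For illustration (hypothetical indices): with cons_indx == 0 and four
 * SW-owned CQEs at indices 0-3, where the CQEs at indices 1 and 2
 * belong to the QP being flushed, the backward scan below leaves the
 * CQE at index 3 in place, frees 2 and 1 back to the SRQ, and copies
 * the CQE from index 0 into index 2.  Indices 0 and 1 are then handed
 * back to HW ownership and the consumer index is moved to 2.
 */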
1674 for (i = 0; i < outstanding_cqes; i++) {
1675 cqe = &cq->cq_buf[check_indx];
1676
1677 /* Grab QP number from CQE */
1678 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1679 cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1680
1681 /*
1682 * If the QP number is the same in the CQE as the QP that we
1683 * have on this SRQ, then we must free up the entry off the
1684 * SRQ. We also make sure that the completion type is of the
1685 * 'TAVOR_COMPLETION_RECV' type. So any send completions on
1686 * this CQ will be left as-is. The handling of returning
1687 * entries back to HW ownership happens further down.
1688 */
1689 if (cqe_qpnum == qp->qp_qpnum &&
1690 cqe_type == TAVOR_COMPLETION_RECV) {
1691
1692 /* Add back to SRQ free list */
1693 (void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1694 cq, cqe);
1695 } else {
1696 /* Do Copy */
1697 if (check_indx != new_indx) {
1698 next_cqe = &cq->cq_buf[new_indx];
1699
1700 /*
1701 * Copy the CQE into the "next_cqe"
1702 * pointer.
1703 */
1704 bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1705 }
1706 new_indx = (new_indx - 1) & wrap_around_mask;
1707 }
1708 /* Move index to next CQE to check */
1709 check_indx = (check_indx - 1) & wrap_around_mask;
1710 }
1711
1712 /* Initialize removed cqes count */
1713 removed_cqes = 0;
1714
1715 /* If an entry was removed */
1716 if (check_indx != new_indx) {
1717
1718 /*
1719 * Set current pointer back to the beginning consumer index.
1720 * At this point, all unclaimed entries have been copied to the
1721 * index specified by 'new_indx'. This 'new_indx' will be used
1722 * as the new consumer index after we mark all freed entries as
1723 * having HW ownership. We do that here.
1724 */
1725
1726 /* Loop through all entries until we reach our new pointer */
1727 for (indx = cons_indx; indx <= new_indx;
1728 indx = (indx + 1) & wrap_around_mask) {
1729 removed_cqes++;
1730 cqe = &cq->cq_buf[indx];
1731
1732 /* Reset entry to hardware ownership */
1733 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1734 }
1735 }
1736
1737 /*
1738 * Update consumer index to be the 'new_indx'. This moves it past all
1739 * removed entries. Because 'new_indx' is pointing to the last
1740 * previously valid SW owned entry, we add 1 to point the cons_indx to
1741 * the first HW owned entry.
1742 */
1743 cons_indx = (new_indx + 1) & wrap_around_mask;
1744
1745 /*
1746 * Now we only ring the doorbell (to update the consumer index) if
1747 * we've actually consumed a CQ entry. If we found no QP number
1748 * matches above, then we would not have removed anything. So only if
1749 * something was removed do we ring the doorbell.
1750 */
1751 if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1752 /*
1753 * Post doorbell to update the consumer index. Doorbell
1754 * value indicates number of entries consumed (minus 1)
1755 */
1756 if (cons_indx > cq->cq_consindx) {
1757 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1758 } else {
1759 num_to_increment = ((cons_indx + cq->cq_bufsz) -
1760 cq->cq_consindx) - 1;
1761 }
1762 cq->cq_consindx = cons_indx;
1763
1764 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1765 cq->cq_cqnum, num_to_increment);
1766 }
1767 }
1768