1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * tavor_cq.c
29 * Tavor Completion Queue Processing Routines
30 *
31 * Implements all the routines necessary for allocating, freeing, resizing,
32 * and handling the completion type events that the Tavor hardware can
33 * generate.
34 */
35
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/bitmap.h>
42 #include <sys/sysmacros.h>
43
44 #include <sys/ib/adapters/tavor/tavor.h>
45
46 static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
47 uint32_t cqn, uint32_t cq_param);
48 #pragma inline(tavor_cq_doorbell)
49 static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
50 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
51 static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
52 tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
53 static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
54 uint_t flag);
55 static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
56 uint32_t old_cons_indx, uint32_t num_newcqe);
57
58 /*
59 * tavor_cq_alloc()
60 * Context: Can be called only from user or kernel context.
61 */
62 int
63 tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
64 ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
65 uint_t sleepflag)
66 {
67 tavor_rsrc_t *cqc, *rsrc;
68 tavor_umap_db_entry_t *umapdb;
69 tavor_hw_cqc_t cqc_entry;
70 tavor_cqhdl_t cq;
71 ibt_mr_attr_t mr_attr;
72 tavor_mr_options_t op;
73 tavor_pdhdl_t pd;
74 tavor_mrhdl_t mr;
75 tavor_hw_cqe_t *buf;
76 uint64_t addr, value;
77 uint32_t log_cq_size, lkey, uarpg;
78 uint_t dma_xfer_mode, cq_sync, cq_is_umap;
79 int status, i, flag;
80 char *errormsg;
81
82 TAVOR_TNF_ENTER(tavor_cq_alloc);
83
84 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))
85
86 /*
87 * Determine whether CQ is being allocated for userland access or
88 * whether it is being allocated for kernel access. If the CQ is
89 * being allocated for userland access, then lookup the UAR doorbell
90 * page number for the current process. Note: If this is not found
91 * (e.g. if the process has not previously open()'d the Tavor driver),
92 * then an error is returned.
93 */
94 cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
95 if (cq_is_umap) {
96 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
97 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
98 if (status != DDI_SUCCESS) {
99 /* Set "status" and "errormsg" and goto failure */
100 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
101 goto cqalloc_fail;
102 }
103 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
104 }
105
106 /* Use the internal protection domain (PD) for setting up CQs */
107 pd = state->ts_pdhdl_internal;
108
109 /* Increment the reference count on the protection domain (PD) */
110 tavor_pd_refcnt_inc(pd);
111
112 /*
113	 * Allocate a CQ context entry.  This will be filled in with all
114 * the necessary parameters to define the Completion Queue. And then
115 * ownership will be passed to the hardware in the final step
116 * below. If we fail here, we must undo the protection domain
117 * reference count.
118 */
119 status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
120 if (status != DDI_SUCCESS) {
121 /* Set "status" and "errormsg" and goto failure */
122 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
123 goto cqalloc_fail1;
124 }
125
126 /*
127 * Allocate the software structure for tracking the completion queue
128 * (i.e. the Tavor Completion Queue handle). If we fail here, we must
129 * undo the protection domain reference count and the previous
130 * resource allocation.
131 */
132 status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
133 if (status != DDI_SUCCESS) {
134 /* Set "status" and "errormsg" and goto failure */
135 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
136 goto cqalloc_fail2;
137 }
138 cq = (tavor_cqhdl_t)rsrc->tr_addr;
139 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
140 cq->cq_is_umap = cq_is_umap;
141
142 /* Use the index as CQ number */
143 cq->cq_cqnum = cqc->tr_indx;
144
145 /*
146 * If this will be a user-mappable CQ, then allocate an entry for
147 * the "userland resources database". This will later be added to
148 * the database (after all further CQ operations are successful).
149 * If we fail here, we must undo the reference counts and the
150 * previous resource allocation.
151 */
152 if (cq->cq_is_umap) {
153 umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
154 MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
155 if (umapdb == NULL) {
156 /* Set "status" and "errormsg" and goto failure */
157 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
158 goto cqalloc_fail3;
159 }
160 }
161
162 /*
163 * Calculate the appropriate size for the completion queue.
164 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
165 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
166 * to round the requested size up to the next highest power-of-2
167 */
168 cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
169 log_cq_size = highbit(cq_attr->cq_size);
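	/*
	 * Note: highbit() returns the one-based position of the most
	 * significant bit set, so even a request that is already an exact
	 * power-of-2 is rounded up to the next power-of-2 here (the usable
	 * depth reported back to the consumer is 2^n - 1).
	 */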
170
171 /*
172 * Next we verify that the rounded-up size is valid (i.e. consistent
173 * with the device limits and/or software-configured limits)
174 */
175 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
176 /* Set "status" and "errormsg" and goto failure */
177 TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
178 goto cqalloc_fail4;
179 }
180
181 /*
182 * Allocate the memory for Completion Queue.
183 *
184 * Note: Although we use the common queue allocation routine, we
185 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
186 * kernel system memory) for kernel CQs because it would be
187 * inefficient to have CQs located in DDR memory. This is primarily
188 * because CQs are read from (by software) more than they are written
189 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
190 * user-mappable CQs for a similar reason.)
191 * It is also worth noting that, unlike Tavor QP work queues,
192 * completion queues do not have the same strict alignment
193 * requirements. It is sufficient for the CQ memory to be both
194 * aligned to and bound to addresses which are a multiple of CQE size.
195 */
196 cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
197 cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
198 cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
199 if (cq->cq_is_umap) {
200 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
201 } else {
202 cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
203 }
204 status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
205 if (status != DDI_SUCCESS) {
206 /* Set "status" and "errormsg" and goto failure */
207 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
208 goto cqalloc_fail4;
209 }
210 buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
211 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
212
213 /*
214 * Initialize each of the Completion Queue Entries (CQE) by setting
215 * their ownership to hardware ("owner" bit set to HW). This is in
216 * preparation for the final transfer of ownership (below) of the
217 * CQ context itself.
218 */
219 for (i = 0; i < (1 << log_cq_size); i++) {
220 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
221 }
222
223 /*
224 * Register the memory for the CQ. The memory for the CQ must
225 * be registered in the Tavor TPT tables. This gives us the LKey
226 * to specify in the CQ context below. Note: If this is a user-
227 * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
228 */
229 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
230 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
231 mr_attr.mr_len = cq->cq_cqinfo.qa_size;
232 mr_attr.mr_as = NULL;
233 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
234 if (cq->cq_is_umap) {
235 dma_xfer_mode = DDI_DMA_CONSISTENT;
236 } else {
237 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
238 }
239 if (dma_xfer_mode == DDI_DMA_STREAMING) {
240 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
241 }
242 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
243 op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
244 op.mro_bind_override_addr = 0;
245 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
246 if (status != DDI_SUCCESS) {
247 /* Set "status" and "errormsg" and goto failure */
248 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
249 goto cqalloc_fail5;
250 }
251 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
252 addr = mr->mr_bindinfo.bi_addr;
253 lkey = mr->mr_lkey;
254
255 /* Determine if later ddi_dma_sync will be necessary */
256 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);
257
258 /* Sync entire CQ for use by the hardware (if necessary). */
259 if (cq_sync) {
260 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
261 cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
262 }
263
264 /*
265 * Fill in the CQC entry. This is the final step before passing
266 * ownership of the CQC entry to the Tavor hardware. We use all of
267 * the information collected/calculated above to fill in the
268 * requisite portions of the CQC. Note: If this CQ is going to be
269 * used for userland access, then we need to set the UAR page number
270 * appropriately (otherwise it's a "don't care")
271 */
272 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
273 cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
274 cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
275 cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED;
276 cqc_entry.state = TAVOR_CQ_DISARMED;
277 cqc_entry.start_addr_h = (addr >> 32);
278 cqc_entry.start_addr_l = (addr & 0xFFFFFFFF);
279 cqc_entry.log_cq_sz = log_cq_size;
280 if (cq->cq_is_umap) {
281 cqc_entry.usr_page = uarpg;
282 } else {
283 cqc_entry.usr_page = 0;
284 }
285 cqc_entry.pd = pd->pd_pdnum;
286 cqc_entry.lkey = lkey;
287 cqc_entry.e_eqn = cq->cq_erreqnum;
288 cqc_entry.c_eqn = cq->cq_eqnum;
289 cqc_entry.cqn = cq->cq_cqnum;
290
291 /*
292 * Write the CQC entry to hardware. Lastly, we pass ownership of
293 * the entry to the hardware (using the Tavor SW2HW_CQ firmware
294 * command). Note: In general, this operation shouldn't fail. But
295 * if it does, we have to undo everything we've done above before
296 * returning error.
297 */
298 status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
299 sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
300 if (status != TAVOR_CMD_SUCCESS) {
301 cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
302 status);
303 TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
304 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
305 /* Set "status" and "errormsg" and goto failure */
306 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
307 goto cqalloc_fail6;
308 }
309
310 /*
311 * Fill in the rest of the Tavor Completion Queue handle. Having
312 * successfully transferred ownership of the CQC, we can update the
313 * following fields for use in further operations on the CQ.
314 */
315 cq->cq_cqcrsrcp = cqc;
316 cq->cq_rsrcp = rsrc;
317 cq->cq_consindx = 0;
318 cq->cq_buf = buf;
319 cq->cq_bufsz = (1 << log_cq_size);
320 cq->cq_mrhdl = mr;
321 cq->cq_sync = cq_sync;
322 cq->cq_refcnt = 0;
323 cq->cq_is_special = 0;
324 cq->cq_uarpg = uarpg;
325 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
326 avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
327 sizeof (struct tavor_workq_hdr_s),
328 offsetof(struct tavor_workq_hdr_s, wq_avl_link));
329
330 cq->cq_wrid_reap_head = NULL;
331 cq->cq_wrid_reap_tail = NULL;
332 cq->cq_hdlrarg = (void *)ibt_cqhdl;
333
334 /*
335 * Put CQ handle in Tavor CQNum-to-CQHdl list. Then fill in the
336 * "actual_size" and "cqhdl" and return success
337 */
338 ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
339 state->ts_cqhdl[cqc->tr_indx] = cq;
340
341 /*
342 * If this is a user-mappable CQ, then we need to insert the previously
343 * allocated entry into the "userland resources database". This will
344 * allow for later lookup during devmap() (i.e. mmap()) calls.
345 */
346 if (cq->cq_is_umap) {
347 tavor_umap_db_add(umapdb);
348 }
349
350 /*
351 * Fill in the return arguments (if necessary). This includes the
352 * real completion queue size.
353 */
354 if (actual_size != NULL) {
355 *actual_size = (1 << log_cq_size) - 1;
356 }
357 *cqhdl = cq;
358
359 TAVOR_TNF_EXIT(tavor_cq_alloc);
360 return (DDI_SUCCESS);
361
362 /*
363 * The following is cleanup for all possible failure cases in this routine
364 */
365 cqalloc_fail6:
366 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
367 sleepflag) != DDI_SUCCESS) {
368 TAVOR_WARNING(state, "failed to deregister CQ memory");
369 }
370 cqalloc_fail5:
371 tavor_queue_free(state, &cq->cq_cqinfo);
372 cqalloc_fail4:
373 if (cq_is_umap) {
374 tavor_umap_db_free(umapdb);
375 }
376 cqalloc_fail3:
377 tavor_rsrc_free(state, &rsrc);
378 cqalloc_fail2:
379 tavor_rsrc_free(state, &cqc);
380 cqalloc_fail1:
381 tavor_pd_refcnt_dec(pd);
382 cqalloc_fail:
383 TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
384 tnf_string, msg, errormsg);
385 TAVOR_TNF_EXIT(tavor_cq_alloc);
386 return (status);
387 }
388
389
390 /*
391 * tavor_cq_free()
392 * Context: Can be called only from user or kernel context.
393 */
394 /* ARGSUSED */
395 int
396 tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
397 {
398 tavor_rsrc_t *cqc, *rsrc;
399 tavor_umap_db_entry_t *umapdb;
400 tavor_hw_cqc_t cqc_entry;
401 tavor_pdhdl_t pd;
402 tavor_mrhdl_t mr;
403 tavor_cqhdl_t cq;
404 uint32_t cqnum;
405 uint64_t value;
406 uint_t maxprot;
407 int status;
408
409 TAVOR_TNF_ENTER(tavor_cq_free);
410
411 /*
412 * Pull all the necessary information from the Tavor Completion Queue
413 * handle. This is necessary here because the resource for the
414 * CQ handle is going to be freed up as part of this operation.
415 */
416 cq = *cqhdl;
417 mutex_enter(&cq->cq_lock);
418 cqc = cq->cq_cqcrsrcp;
419 rsrc = cq->cq_rsrcp;
420 pd = state->ts_pdhdl_internal;
421 mr = cq->cq_mrhdl;
422 cqnum = cq->cq_cqnum;
423
424 /*
425 * If there are work queues still associated with the CQ, then return
426 * an error. Otherwise, we will be holding the CQ lock.
427 */
428 if (cq->cq_refcnt != 0) {
429 mutex_exit(&cq->cq_lock);
430 TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
431 tnf_int, refcnt, cq->cq_refcnt);
432 TAVOR_TNF_EXIT(tavor_cq_free);
433 return (IBT_CQ_BUSY);
434 }
435
436 /*
437 * If this was a user-mappable CQ, then we need to remove its entry
438 * from the "userland resources database". If it is also currently
439 * mmap()'d out to a user process, then we need to call
440 * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
441 * We also need to invalidate the CQ tracking information for the
442 * user mapping.
443 */
444 if (cq->cq_is_umap) {
445 status = tavor_umap_db_find(state->ts_instance, cqnum,
446 MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
447 &umapdb);
448 if (status != DDI_SUCCESS) {
449 mutex_exit(&cq->cq_lock);
450 TAVOR_WARNING(state, "failed to find in database");
451 TAVOR_TNF_EXIT(tavor_cq_free);
452 return (ibc_get_ci_failure(0));
453 }
454 tavor_umap_db_free(umapdb);
455 if (cq->cq_umap_dhp != NULL) {
456 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
457 status = devmap_devmem_remap(cq->cq_umap_dhp,
458 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
459 maxprot, DEVMAP_MAPPING_INVALID, NULL);
460 if (status != DDI_SUCCESS) {
461 mutex_exit(&cq->cq_lock);
462 TAVOR_WARNING(state, "failed in CQ memory "
463 "devmap_devmem_remap()");
464 TAVOR_TNF_EXIT(tavor_cq_free);
465 return (ibc_get_ci_failure(0));
466 }
467 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
468 }
469 }
470
471 /*
472 * Put NULL into the Tavor CQNum-to-CQHdl list. This will allow any
473 * in-progress events to detect that the CQ corresponding to this
474 * number has been freed.
475 */
476 state->ts_cqhdl[cqc->tr_indx] = NULL;
477
478 /*
479 * While we hold the CQ lock, do a "forced reap" of the workQ WRID
480 * list. This cleans up all the structures associated with the WRID
481 * processing for this CQ. Once we complete, drop the lock and finish
482 * the deallocation of the CQ.
483 */
484 tavor_wrid_cq_force_reap(cq);
485
486 mutex_exit(&cq->cq_lock);
487 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
488
489 /*
490 * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
491 * firmware command). If the ownership transfer fails for any reason,
492 * then it is an indication that something (either in HW or SW) has
493 * gone seriously wrong.
494 */
495 status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
496 sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
497 if (status != TAVOR_CMD_SUCCESS) {
498 TAVOR_WARNING(state, "failed to reclaim CQC ownership");
499 cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
500 status);
501 TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
502 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
503 TAVOR_TNF_EXIT(tavor_cq_free);
504 return (ibc_get_ci_failure(0));
505 }
506
507 /*
508 * Deregister the memory for the Completion Queue. If this fails
509 * for any reason, then it is an indication that something (either
510 * in HW or SW) has gone seriously wrong. So we print a warning
511 * message and return.
512 */
513 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
514 sleepflag);
515 if (status != DDI_SUCCESS) {
516 TAVOR_WARNING(state, "failed to deregister CQ memory");
517 TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
518 TAVOR_TNF_EXIT(tavor_cq_free);
519 return (ibc_get_ci_failure(0));
520 }
521
522 /* Free the memory for the CQ */
523 tavor_queue_free(state, &cq->cq_cqinfo);
524
525 /* Free the Tavor Completion Queue handle */
526 tavor_rsrc_free(state, &rsrc);
527
528 /* Free up the CQC entry resource */
529 tavor_rsrc_free(state, &cqc);
530
531 /* Decrement the reference count on the protection domain (PD) */
532 tavor_pd_refcnt_dec(pd);
533
534 /* Set the cqhdl pointer to NULL and return success */
535 *cqhdl = NULL;
536
537 TAVOR_TNF_EXIT(tavor_cq_free);
538 return (DDI_SUCCESS);
539 }
540
541
542 /*
543 * tavor_cq_resize()
544 * Context: Can be called only from user or kernel context.
545 */
546 int
547 tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
548 uint_t *actual_size, uint_t sleepflag)
549 {
550 tavor_hw_cqc_t cqc_entry;
551 tavor_qalloc_info_t new_cqinfo, old_cqinfo;
552 ibt_mr_attr_t mr_attr;
553 tavor_mr_options_t op;
554 tavor_pdhdl_t pd;
555 tavor_mrhdl_t mr, mr_old;
556 tavor_hw_cqe_t *buf;
557 uint32_t new_prod_indx, old_cons_indx;
558 uint_t dma_xfer_mode, cq_sync, log_cq_size, maxprot;
559 int status, i, flag;
560 char *errormsg;
561
562 TAVOR_TNF_ENTER(tavor_cq_resize);
563
564 /* Use the internal protection domain (PD) for CQs */
565 pd = state->ts_pdhdl_internal;
566
567 /*
568 * Calculate the appropriate size for the new resized completion queue.
569 * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
570 * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
571 * to round the requested size up to the next highest power-of-2
572 */
573 req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
574 log_cq_size = highbit(req_size);
575
576 /*
577 * Next we verify that the rounded-up size is valid (i.e. consistent
578 * with the device limits and/or software-configured limits)
579 */
580 if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
581 /* Set "status" and "errormsg" and goto failure */
582 TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
583 goto cqresize_fail;
584 }
585
586 /*
587 * Allocate the memory for newly resized Completion Queue.
588 *
589 * Note: Although we use the common queue allocation routine, we
590 * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
591 * kernel system memory) for kernel CQs because it would be
592 * inefficient to have CQs located in DDR memory. This is the same
593 * as we do when we first allocate completion queues primarily
594 * because CQs are read from (by software) more than they are written
595 * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
596 * user-mappable CQs for a similar reason.)
597 * It is also worth noting that, unlike Tavor QP work queues,
598 * completion queues do not have the same strict alignment
599 * requirements. It is sufficient for the CQ memory to be both
600 * aligned to and bound to addresses which are a multiple of CQE size.
601 */
602 new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
603 new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
604 new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
605 if (cq->cq_is_umap) {
606 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
607 } else {
608 new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
609 }
610 status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
611 if (status != DDI_SUCCESS) {
612 /* Set "status" and "errormsg" and goto failure */
613 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
614 goto cqresize_fail;
615 }
616 buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
617 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
618
619 /*
620 * Initialize each of the Completion Queue Entries (CQE) by setting
621 * their ownership to hardware ("owner" bit set to HW). This is in
622 * preparation for the final resize operation (below).
623 */
624 for (i = 0; i < (1 << log_cq_size); i++) {
625 TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
626 }
627
628 /*
629 * Register the memory for the CQ. The memory for the CQ must
630 * be registered in the Tavor TPT tables. This gives us the LKey
631 * to specify in the CQ context below.
632 */
633 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
634 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
635 mr_attr.mr_len = new_cqinfo.qa_size;
636 mr_attr.mr_as = NULL;
637 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
638 if (cq->cq_is_umap) {
639 dma_xfer_mode = DDI_DMA_CONSISTENT;
640 } else {
641 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
642 }
643 if (dma_xfer_mode == DDI_DMA_STREAMING) {
644 mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
645 }
646 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
647 op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
648 op.mro_bind_override_addr = 0;
649 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
650 if (status != DDI_SUCCESS) {
651 tavor_queue_free(state, &new_cqinfo);
652 /* Set "status" and "errormsg" and goto failure */
653 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
654 goto cqresize_fail;
655 }
656 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
657
658 /* Determine if later ddi_dma_sync will be necessary */
659 cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);
660
661 /* Sync entire "new" CQ for use by hardware (if necessary) */
662 if (cq_sync) {
663 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
664 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
665 }
666
667 /*
668 * Now we grab the CQ lock. Since we will be updating the actual
669 * CQ location and the producer/consumer indexes, we should hold
670 * the lock.
671 *
672 * We do a TAVOR_NOSLEEP here (and below), though, because we are
673 * holding the "cq_lock" and if we got raised to interrupt level
674 * by priority inversion, we would not want to block in this routine
675 * waiting for success.
676 */
677 mutex_enter(&cq->cq_lock);
678
679 /*
680 * Determine the current CQ "consumer index".
681 *
682 * Note: This will depend on whether the CQ had previously been
683 * mapped for user access or whether it is a kernel CQ. If this
684 * is a kernel CQ, then all PollCQ() operations have come through
685 * the IBTF and, hence, the driver's CQ state structure will
686 * contain the current consumer index. If, however, the user has
687 * accessed this CQ by bypassing the driver (OS-bypass), then we
688 * need to query the firmware to determine the current CQ consumer
689 * index. This also assumes that the user process will not continue
690 * to consume entries while at the same time doing the ResizeCQ()
691 * operation. If the user process does not guarantee this, then it
692 * may see duplicate or missed completions. But under no
693 * circumstances should this panic the system.
694 */
695 if (cq->cq_is_umap) {
696 status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
697 cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
698 TAVOR_NOSLEEP);
699 if (status != TAVOR_CMD_SUCCESS) {
700 /* Query CQ has failed, drop CQ lock and cleanup */
701 mutex_exit(&cq->cq_lock);
702 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
703 sleepflag) != DDI_SUCCESS) {
704 TAVOR_WARNING(state, "failed to deregister "
705 "CQ memory");
706 }
707 tavor_queue_free(state, &new_cqinfo);
708			TAVOR_WARNING(state, "failed to query CQ context");
709
710 /* Set "status" and "errormsg" and goto failure */
711 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
712			    "failed CQ query");
713 goto cqresize_fail;
714 }
715 old_cons_indx = cqc_entry.cons_indx;
716 } else {
717 old_cons_indx = cq->cq_consindx;
718 }
719
720 /*
721 * Fill in the CQC entry. For the resize operation this is the
722 * final step before attempting the resize operation on the CQC entry.
723 * We use all of the information collected/calculated above to fill
724 * in the requisite portions of the CQC.
725 */
726 bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
727 cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32);
728 cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
729 cqc_entry.log_cq_sz = log_cq_size;
730 cqc_entry.lkey = mr->mr_lkey;
731
732 /*
733 * Write the CQC entry to hardware. Lastly, we pass ownership of
734 * the entry to the hardware (using the Tavor RESIZE_CQ firmware
735 * command). Note: In general, this operation shouldn't fail. But
736 * if it does, we have to undo everything we've done above before
737 * returning error. Also note that the status returned may indicate
738 * the code to return to the IBTF.
739 */
740 status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
741 &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
742 if (status != TAVOR_CMD_SUCCESS) {
743 /* Resize attempt has failed, drop CQ lock and cleanup */
744 mutex_exit(&cq->cq_lock);
745 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
746 sleepflag) != DDI_SUCCESS) {
747 TAVOR_WARNING(state, "failed to deregister CQ memory");
748 }
749 tavor_queue_free(state, &new_cqinfo);
750 if (status == TAVOR_CMD_BAD_SIZE) {
751 TAVOR_TNF_EXIT(tavor_cq_resize);
752 return (IBT_CQ_SZ_INSUFFICIENT);
753 } else {
754 cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
755 "%08x\n", status);
756 TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
757 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
758 TAVOR_TNF_EXIT(tavor_cq_resize);
759 return (ibc_get_ci_failure(0));
760 }
761 }
762
763 /*
764 * The CQ resize attempt was successful. Before dropping the CQ lock,
765 * copy all of the CQEs from the "old" CQ into the "new" CQ. Note:
766 * the Tavor firmware guarantees us that sufficient space is set aside
767 * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
768 * The two parameters to this helper function ("old_cons_indx" and
769 * "new_prod_indx") essentially indicate the starting index and number
770 * of any CQEs that might remain in the "old" CQ memory.
771 */
772 tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);
773
774 /* Sync entire "new" CQ for use by hardware (if necessary) */
775 if (cq_sync) {
776 (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
777 new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
778 }
779
780 /*
781 * Update the Tavor Completion Queue handle with all the new
782 * information. At the same time, save away all the necessary
783 * information for freeing up the old resources
784 */
785 mr_old = cq->cq_mrhdl;
786 old_cqinfo = cq->cq_cqinfo;
787 cq->cq_cqinfo = new_cqinfo;
788 cq->cq_consindx = 0;
789 cq->cq_buf = buf;
790 cq->cq_bufsz = (1 << log_cq_size);
791 cq->cq_mrhdl = mr;
792 cq->cq_sync = cq_sync;
793
794 /*
795 * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
796 * to a user process, then we need to call devmap_devmem_remap() to
797 * invalidate the mapping to the CQ memory. We also need to
798 * invalidate the CQ tracking information for the user mapping.
799 */
800 if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
801 maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
802 status = devmap_devmem_remap(cq->cq_umap_dhp,
803 state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
804 DEVMAP_MAPPING_INVALID, NULL);
805 if (status != DDI_SUCCESS) {
806 mutex_exit(&cq->cq_lock);
807 TAVOR_WARNING(state, "failed in CQ memory "
808 "devmap_devmem_remap()");
809 TAVOR_TNF_EXIT(tavor_cq_free);
810 return (ibc_get_ci_failure(0));
811 }
812 cq->cq_umap_dhp = (devmap_cookie_t)NULL;
813 }
814
815 /*
816 * Drop the CQ lock now. The only thing left to do is to free up
817 * the old resources.
818 */
819 mutex_exit(&cq->cq_lock);
820
821 /*
822 * Deregister the memory for the old Completion Queue. Note: We
823 * really can't return error here because we have no good way to
824	 * cleanup.  Plus, the deregistration really shouldn't ever fail.
825 * So, if it does, it is an indication that something has gone
826 * seriously wrong. So we print a warning message and return error
827 * (knowing, of course, that the "old" CQ memory will be leaked)
828 */
829 status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
830 sleepflag);
831 if (status != DDI_SUCCESS) {
832 TAVOR_WARNING(state, "failed to deregister old CQ memory");
833 /* Set "status" and "errormsg" and goto failure */
834 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
835 "failed deregister mr (old)");
836 goto cqresize_fail;
837 }
838
839 /* Free the memory for the old CQ */
840 tavor_queue_free(state, &old_cqinfo);
841
842 /*
843 * Fill in the return arguments (if necessary). This includes the
844 * real new completion queue size.
845 */
846 if (actual_size != NULL) {
847 *actual_size = (1 << log_cq_size) - 1;
848 }
849
850 TAVOR_TNF_EXIT(tavor_cq_resize);
851 return (DDI_SUCCESS);
852
853 cqresize_fail:
854 TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
855 tnf_string, msg, errormsg);
856 TAVOR_TNF_EXIT(tavor_cq_resize);
857 return (status);
858 }
859
860
861 /*
862 * tavor_cq_notify()
863 * Context: Can be called from interrupt or base context.
864 */
865 int
866 tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
867 ibt_cq_notify_flags_t flags)
868 {
869 uint_t cqnum;
870
871 TAVOR_TNF_ENTER(tavor_cq_notify);
872
873 /*
874 * Determine if we are trying to get the next completion or the next
875 * "solicited" completion. Then hit the appropriate doorbell.
876 *
877 * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
878 * regarding why we do not have to do an extra PIO read here, and we
879 * will not lose an event after writing this doorbell.
880 */
881 cqnum = cq->cq_cqnum;
882 if (flags == IBT_NEXT_COMPLETION) {
883 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
884 TAVOR_CQDB_DEFAULT_PARAM);
885
886 } else if (flags == IBT_NEXT_SOLICITED) {
887 tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
888 cqnum, TAVOR_CQDB_DEFAULT_PARAM);
889
890 } else {
891 TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
892 tnf_int, flags, flags);
893 TAVOR_TNF_EXIT(tavor_cq_notify);
894 return (IBT_CQ_NOTIFY_TYPE_INVALID);
895 }
896
897 TAVOR_TNF_EXIT(tavor_cq_notify);
898 return (DDI_SUCCESS);
899 }
900
901
902 /*
903 * tavor_cq_poll()
904 * Context: Can be called from interrupt or base context.
905 */
906 int
907 tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
908 uint_t num_wc, uint_t *num_polled)
909 {
910 tavor_hw_cqe_t *cqe;
911 uint32_t cons_indx, wrap_around_mask;
912 uint32_t polled_cnt, num_to_increment;
913 int status;
914
915 TAVOR_TNF_ENTER(tavor_cq_poll);
916
917 /*
918 * Check for user-mappable CQ memory. Note: We do not allow kernel
919 * clients to poll CQ memory that is accessible directly by the user.
920 * If the CQ memory is user accessible, then return an error.
921 */
922 if (cq->cq_is_umap) {
923 TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
924 TAVOR_TNF_ERROR, "");
925 TAVOR_TNF_EXIT(tavor_cq_poll);
926 return (IBT_CQ_HDL_INVALID);
927 }
928
929 mutex_enter(&cq->cq_lock);
930
931 /* Get the consumer index */
932 cons_indx = cq->cq_consindx;
933
934 /*
935 * Calculate the wrap around mask. Note: This operation only works
936 * because all Tavor completion queues have power-of-2 sizes
937 */
938 wrap_around_mask = (cq->cq_bufsz - 1);
939
940 /* Calculate the pointer to the first CQ entry */
941 cqe = &cq->cq_buf[cons_indx];
942
943 /* Sync the current CQE to read */
944 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
945
946 /*
947 * Keep pulling entries from the CQ until we find an entry owned by
948	 * the hardware.  As long as the CQEs are owned by SW, process
949 * each entry by calling tavor_cq_cqe_consume() and updating the CQ
950 * consumer index. Note: We only update the consumer index if
951 * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB. Otherwise,
952 * it indicates that we are going to "recycle" the CQE (probably
953	 * because it is an error CQE and corresponds to more than one
954 * completion).
955 */
956 polled_cnt = 0;
957 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
958 status = tavor_cq_cqe_consume(state, cq, cqe,
959 &wc_p[polled_cnt++]);
960 if (status == TAVOR_CQ_SYNC_AND_DB) {
961 /* Reset entry to hardware ownership */
962 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
963
964 /* Sync the current CQE for device */
965 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);
966
967 /* Increment the consumer index */
968 cons_indx = (cons_indx + 1) & wrap_around_mask;
969
970 /* Update the pointer to the next CQ entry */
971 cqe = &cq->cq_buf[cons_indx];
972
973 /* Sync the next CQE to read */
974 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
975 }
976
977 /*
978 * If we have run out of space to store work completions,
979		 * then stop and return the ones we have pulled off the CQ.
980 */
981 if (polled_cnt >= num_wc) {
982 break;
983 }
984 }
985
986 /*
987 * Now we only ring the doorbell (to update the consumer index) if
988 * we've actually consumed a CQ entry. If we have, for example,
989 * pulled from a CQE that we are still in the process of "recycling"
990 * for error purposes, then we would not update the consumer index.
991 */
992 if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
993 /*
994 * Post doorbell to update the consumer index. Doorbell
995 * value indicates number of entries consumed (minus 1)
996 */
997 if (cons_indx > cq->cq_consindx) {
998 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
999 } else {
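			/* Consumer index wrapped past the end of the CQ */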
1000 num_to_increment = ((cons_indx + cq->cq_bufsz) -
1001 cq->cq_consindx) - 1;
1002 }
1003 cq->cq_consindx = cons_indx;
1004 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1005 cq->cq_cqnum, num_to_increment);
1006
1007 } else if (polled_cnt == 0) {
1008 /*
1009 * If the CQ is empty, we can try to free up some of the WRID
1010 * list containers. See tavor_wr.c for more details on this
1011 * operation.
1012 */
1013 tavor_wrid_cq_reap(cq);
1014 }
1015
1016 mutex_exit(&cq->cq_lock);
1017
1018 /* Set "num_polled" (if necessary) */
1019 if (num_polled != NULL) {
1020 *num_polled = polled_cnt;
1021 }
1022
1023 /* Set CQ_EMPTY condition if needed, otherwise return success */
1024 if (polled_cnt == 0) {
1025 status = IBT_CQ_EMPTY;
1026 } else {
1027 status = DDI_SUCCESS;
1028 }
1029
1030 /*
1031 * Check if the system is currently panicking. If it is, then call
1032 * the Tavor interrupt service routine. This step is necessary here
1033 * because we might be in a polled I/O mode and without the call to
1034 * tavor_isr() - and its subsequent calls to poll and rearm each
1035 * event queue - we might overflow our EQs and render the system
1036 * unable to sync/dump.
1037 */
1038 if (ddi_in_panic() != 0) {
1039 (void) tavor_isr((caddr_t)state, (caddr_t)NULL);
1040 }
1041
1042 TAVOR_TNF_EXIT(tavor_cq_poll);
1043 return (status);
1044 }
1045
1046
1047 /*
1048 * tavor_cq_handler()
1049 * Context: Only called from interrupt context
1050 */
1051 int
1052 tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
1053 tavor_hw_eqe_t *eqe)
1054 {
1055 tavor_cqhdl_t cq;
1056 uint_t cqnum;
1057 uint_t eqe_evttype;
1058
1059 TAVOR_TNF_ENTER(tavor_cq_handler);
1060
1061 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
1062
1063 ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
1064 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
1065
1066 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
1067 TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
1068 TAVOR_TNF_ERROR, "");
1069 tavor_eq_overflow_handler(state, eq, eqe);
1070
1071 TAVOR_TNF_EXIT(tavor_cq_handler);
1072 return (DDI_FAILURE);
1073 }
1074
1075
1076 /* Get the CQ handle from CQ number in event descriptor */
1077 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
1078 cq = tavor_cqhdl_from_cqnum(state, cqnum);
1079
1080 /*
1081 * Post the EQ doorbell to move the CQ to the "disarmed" state.
1082 * This operation is to enable subsequent CQ doorbells (e.g. those
1083 * that can be rung by tavor_cq_notify() above) to rearm the CQ.
1084 */
1085 tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);
1086
1087 /*
1088 * If the CQ handle is NULL, this is probably an indication
1089 * that the CQ has been freed already. In which case, we
1090 * should not deliver this event.
1091 *
1092 * We also check that the CQ number in the handle is the
1093 * same as the CQ number in the event queue entry. This
1094 * extra check allows us to handle the case where a CQ was
1095 * freed and then allocated again in the time it took to
1096 * handle the event queue processing. By constantly incrementing
1097 * the non-constrained portion of the CQ number every time
1098 * a new CQ is allocated, we mitigate (somewhat) the chance
1099 * that a stale event could be passed to the client's CQ
1100 * handler.
1101 *
1102 * Lastly, we check if "ts_ibtfpriv" is NULL. If it is then it
1103	 * means that we have either received this event before we
1104 * finished attaching to the IBTF or we've received it while we
1105 * are in the process of detaching.
1106 */
1107 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1108 (state->ts_ibtfpriv != NULL)) {
1109 TAVOR_DO_IBTF_CQ_CALLB(state, cq);
1110 } else {
1111 TNF_PROBE_2(tavor_cq_handler_dropped_event,
1112 TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
1113 tnf_uint, hdl_cqnum, cqnum);
1114 }
1115
1116 TAVOR_TNF_EXIT(tavor_cq_handler);
1117 return (DDI_SUCCESS);
1118 }
1119
1120
1121 /*
1122 * tavor_cq_err_handler()
1123 * Context: Only called from interrupt context
1124 */
1125 int
1126 tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
1127 tavor_hw_eqe_t *eqe)
1128 {
1129 tavor_cqhdl_t cq;
1130 uint_t cqnum;
1131 ibc_async_event_t event;
1132 ibt_async_code_t type;
1133 uint_t eqe_evttype;
1134
1135 TAVOR_TNF_ENTER(tavor_cq_err_handler);
1136
1137 eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);
1138
1139 ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
1140 eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);
1141
1142 if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
1143 TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
1144 TAVOR_TNF_ERROR, "");
1145 tavor_eq_overflow_handler(state, eq, eqe);
1146
1147 TAVOR_TNF_EXIT(tavor_cq_err_handler);
1148 return (DDI_FAILURE);
1149 }
1150
1151 /* cmn_err(CE_CONT, "CQ Error handler\n"); */
1152
1153 /* Get the CQ handle from CQ number in event descriptor */
1154 cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
1155 cq = tavor_cqhdl_from_cqnum(state, cqnum);
1156
1157 /*
1158 * If the CQ handle is NULL, this is probably an indication
1159 * that the CQ has been freed already. In which case, we
1160 * should not deliver this event.
1161 *
1162 * We also check that the CQ number in the handle is the
1163 * same as the CQ number in the event queue entry. This
1164 * extra check allows us to handle the case where a CQ was
1165 * freed and then allocated again in the time it took to
1166 * handle the event queue processing. By constantly incrementing
1167 * the non-constrained portion of the CQ number every time
1168 * a new CQ is allocated, we mitigate (somewhat) the chance
1169 * that a stale event could be passed to the client's CQ
1170 * handler.
1171 *
1172 * And then we check if "ts_ibtfpriv" is NULL. If it is then it
1173	 * means that we have either received this event before we
1174 * finished attaching to the IBTF or we've received it while we
1175 * are in the process of detaching.
1176 */
1177 if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
1178 (state->ts_ibtfpriv != NULL)) {
1179 event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
1180 type = IBT_ERROR_CQ;
1181
1182 TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
1183 } else {
1184 TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
1185 TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
1186 tnf_uint, hdl_cqnum, cqnum);
1187 }
1188
1189 TAVOR_TNF_EXIT(tavor_cq_err_handler);
1190 return (DDI_SUCCESS);
1191 }
1192
1193
1194 /*
1195 * tavor_cq_refcnt_inc()
1196 * Context: Can be called from interrupt or base context.
1197 */
1198 int
1199 tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
1200 {
1201 /*
1202 * Increment the completion queue's reference count. Note: In order
1203 * to ensure compliance with IBA C11-15, we must ensure that a given
1204 * CQ is not used for both special (SMI/GSI) QP and non-special QP.
1205 * This is accomplished here by keeping track of how the referenced
1206 * CQ is being used.
1207 */
1208 mutex_enter(&cq->cq_lock);
1209 TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
1210 tnf_uint, refcnt, cq->cq_refcnt);
1211 if (cq->cq_refcnt == 0) {
1212 cq->cq_is_special = is_special;
1213 } else {
1214 if (cq->cq_is_special != is_special) {
1215 mutex_exit(&cq->cq_lock);
1216 return (DDI_FAILURE);
1217 }
1218 }
1219 cq->cq_refcnt++;
1220 mutex_exit(&cq->cq_lock);
1221 return (DDI_SUCCESS);
1222 }
1223
1224
1225 /*
1226 * tavor_cq_refcnt_dec()
1227 * Context: Can be called from interrupt or base context.
1228 */
1229 void
1230 tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
1231 {
1232 /* Decrement the completion queue's reference count */
1233 mutex_enter(&cq->cq_lock);
1234 cq->cq_refcnt--;
1235 TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
1236 tnf_uint, refcnt, cq->cq_refcnt);
1237 mutex_exit(&cq->cq_lock);
1238 }
1239
1240
1241 /*
1242 * tavor_cq_doorbell()
1243 * Context: Can be called from interrupt or base context.
1244 */
1245 static void
1246 tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
1247 uint32_t cq_param)
1248 {
1249 uint64_t doorbell = 0;
1250
1251 /* Build the doorbell from the parameters */
1252 doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
1253 ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;
1254
1255 TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
1256 tnf_ulong, doorbell, doorbell);
1257
1258 /* Write the doorbell to UAR */
1259 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
1260 doorbell);
1261 }
1262
1263
1264 /*
1265 * tavor_cqhdl_from_cqnum()
1266 * Context: Can be called from interrupt or base context.
1267 *
1268 * This routine is important because changing the unconstrained
1269 * portion of the CQ number is critical to the detection of a
1270 * potential race condition in the CQ handler code (i.e. the case
1271 * where a CQ is freed and alloc'd again before an event for the
1272 * "old" CQ can be handled).
1273 *
1274 * While this is not a perfect solution (not sure that one exists)
1275 * it does help to mitigate the chance that this race condition will
1276 * cause us to deliver a "stale" event to the new CQ owner. Note:
1277 * this solution does not scale well because the number of constrained
1278 * bits increases (and, hence, the number of unconstrained bits
1279 * decreases) as the number of supported CQs grows. For small and
1280 * intermediate values, it should hopefully provide sufficient
1281 * protection.
1282 */
1283 tavor_cqhdl_t
1284 tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
1285 {
1286 uint_t cqindx, cqmask;
1287
1288 /* Calculate the CQ table index from the cqnum */
1289 cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
1290 cqindx = cqnum & cqmask;
1291 return (state->ts_cqhdl[cqindx]);
1292 }
1293
1294
1295 /*
1296 * tavor_cq_cqe_consume()
1297 * Context: Can be called from interrupt or base context.
1298 */
1299 static int
1300 tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1301 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1302 {
1303 uint_t flags, type, opcode, qpnum, qp1_indx;
1304 int status;
1305
1306 TAVOR_TNF_ENTER(tavor_cq_cqe_consume);
1307
1308 /*
1309 * Determine if this is an "error" CQE by examining "opcode". If it
1310 * is an error CQE, then call tavor_cq_errcqe_consume() and return
1311 * whatever status it returns. Otherwise, this is a successful
1312 * completion.
1313 */
1314 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
1315 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
1316 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
1317 status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
1318 TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1319 return (status);
1320 }
1321
1322 /*
1323 * Fetch the Work Request ID using the information in the CQE.
1324 * See tavor_wr.c for more details.
1325 */
1326 wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);
1327
1328 /*
1329 * Parse the CQE opcode to determine completion type. This will set
1330 * not only the type of the completion, but also any flags that might
1331 * be associated with it (e.g. whether immediate data is present).
1332 */
1333 flags = IBT_WC_NO_FLAGS;
1334 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {
1335
1336 /* Send CQE */
1337 switch (opcode) {
1338 case TAVOR_CQE_SND_RDMAWR_IMM:
1339 flags |= IBT_WC_IMMED_DATA_PRESENT;
1340 /* FALLTHROUGH */
1341 case TAVOR_CQE_SND_RDMAWR:
1342 type = IBT_WRC_RDMAW;
1343 break;
1344
1345 case TAVOR_CQE_SND_SEND_IMM:
1346 flags |= IBT_WC_IMMED_DATA_PRESENT;
1347 /* FALLTHROUGH */
1348 case TAVOR_CQE_SND_SEND:
1349 type = IBT_WRC_SEND;
1350 break;
1351
1352 case TAVOR_CQE_SND_RDMARD:
1353 type = IBT_WRC_RDMAR;
1354 break;
1355
1356 case TAVOR_CQE_SND_ATOMIC_CS:
1357 type = IBT_WRC_CSWAP;
1358 break;
1359
1360 case TAVOR_CQE_SND_ATOMIC_FA:
1361 type = IBT_WRC_FADD;
1362 break;
1363
1364 case TAVOR_CQE_SND_BIND_MW:
1365 type = IBT_WRC_BIND;
1366 break;
1367
1368 default:
1369 TAVOR_WARNING(state, "unknown send CQE type");
1370 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1371 TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
1372 TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1373 TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1374 return (TAVOR_CQ_SYNC_AND_DB);
1375 }
1376 } else {
1377
1378 /* Receive CQE */
1379 switch (opcode & 0x1F) {
1380 case TAVOR_CQE_RCV_RECV_IMM:
1381 /* FALLTHROUGH */
1382 case TAVOR_CQE_RCV_RECV_IMM2:
1383 /*
1384 * Note: According to the Tavor PRM, all QP1 recv
1385 * completions look like the result of a Send with
1386 * Immediate. They are not, however, (MADs are Send
1387 * Only) so we need to check the QP number and set
1388 * the flag only if it is non-QP1.
1389 */
1390 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1391 qp1_indx = state->ts_spec_qp1->tr_indx;
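			/*
			 * Note: qp1_indx and (qp1_indx + 1) are taken to be
			 * the QP1 queue pairs for the HCA's two ports, so
			 * the immediate data flag is set for non-QP1 QPs only.
			 */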
1392 if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
1393 flags |= IBT_WC_IMMED_DATA_PRESENT;
1394 }
1395 /* FALLTHROUGH */
1396 case TAVOR_CQE_RCV_RECV:
1397 /* FALLTHROUGH */
1398 case TAVOR_CQE_RCV_RECV2:
1399 type = IBT_WRC_RECV;
1400 break;
1401
1402 case TAVOR_CQE_RCV_RDMAWR_IMM:
1403 /* FALLTHROUGH */
1404 case TAVOR_CQE_RCV_RDMAWR_IMM2:
1405 flags |= IBT_WC_IMMED_DATA_PRESENT;
1406 type = IBT_WRC_RECV_RDMAWI;
1407 break;
1408
1409 default:
1410 TAVOR_WARNING(state, "unknown recv CQE type");
1411 wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
1412 TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
1413 TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
1414 TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1415 return (TAVOR_CQ_SYNC_AND_DB);
1416 }
1417 }
1418 wc->wc_type = type;
1419
1420 /*
1421 * Check for GRH, update the flags, then fill in "wc_flags" field
1422 * in the work completion
1423 */
1424 if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
1425 flags |= IBT_WC_GRH_PRESENT;
1426 }
1427 wc->wc_flags = flags;
1428
1429 /* If we got here, completion status must be success */
1430 wc->wc_status = IBT_WC_SUCCESS;
1431
1432 /*
1433 * Parse the remaining contents of the CQE into the work completion.
1434 * This means filling in SL, QP number, SLID, immediate data, etc.
1435 * Note: Not all of these fields are valid in a given completion.
1436 * Many of them depend on the actual type of completion. So we fill
1437 * in all of the fields and leave it up to the IBTF and consumer to
1438 * sort out which are valid based on their context.
1439 */
1440 wc->wc_sl = TAVOR_CQE_SL_GET(cq, cqe);
1441 wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1442 wc->wc_qpn = TAVOR_CQE_DQPN_GET(cq, cqe);
1443 wc->wc_res_hash = 0;
1444 wc->wc_slid = TAVOR_CQE_DLID_GET(cq, cqe);
1445 wc->wc_ethertype = (wc->wc_immed_data & 0xFFFF);
1446 wc->wc_pkey_ix = (wc->wc_immed_data >> 16);
1447
1448 /*
1449 * Depending on whether the completion was a receive or a send
1450 * completion, fill in "bytes transferred" as appropriate. Also,
1451 * if necessary, fill in the "path bits" field.
1452 */
1453 if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
1454 wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
1455 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1456
1457 } else if ((wc->wc_type == IBT_WRC_RDMAR) ||
1458 (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
1459 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
1460 }
1461
1462 TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
1463 return (TAVOR_CQ_SYNC_AND_DB);
1464 }
1465
1466
1467 /*
1468 * tavor_cq_errcqe_consume()
1469 * Context: Can be called from interrupt or base context.
1470 */
1471 static int
1472 tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
1473 tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
1474 {
1475 uint64_t next_wqeaddr;
1476 uint32_t imm_eth_pkey_cred;
1477 uint_t nextwqesize, dbd;
1478 uint_t doorbell_cnt, status;
1479 tavor_wrid_entry_t wre;
1480
1481 TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);
1482
1483 /*
1484 * Fetch the Work Request ID using the information in the CQE.
1485 * See tavor_wr.c for more details.
1486 */
1487 wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);
1488
1489 /*
1490 * Parse the CQE opcode to determine completion type. We know that
1491 * the CQE is an error completion, so we extract only the completion
1492 * status here.
1493 */
1494 imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
1495 status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
1496 switch (status) {
1497 case TAVOR_CQE_LOC_LEN_ERR:
1498 status = IBT_WC_LOCAL_LEN_ERR;
1499 break;
1500
1501 case TAVOR_CQE_LOC_OP_ERR:
1502 status = IBT_WC_LOCAL_QP_OP_ERR;
1503 break;
1504
1505 case TAVOR_CQE_LOC_PROT_ERR:
1506 status = IBT_WC_LOCAL_PROTECT_ERR;
1507 break;
1508
1509 case TAVOR_CQE_WR_FLUSHED_ERR:
1510 status = IBT_WC_WR_FLUSHED_ERR;
1511 break;
1512
1513 case TAVOR_CQE_MW_BIND_ERR:
1514 status = IBT_WC_MEM_WIN_BIND_ERR;
1515 break;
1516
1517 case TAVOR_CQE_BAD_RESPONSE_ERR:
1518 status = IBT_WC_BAD_RESPONSE_ERR;
1519 break;
1520
1521 case TAVOR_CQE_LOCAL_ACCESS_ERR:
1522 status = IBT_WC_LOCAL_ACCESS_ERR;
1523 break;
1524
1525 case TAVOR_CQE_REM_INV_REQ_ERR:
1526 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
1527 break;
1528
1529 case TAVOR_CQE_REM_ACC_ERR:
1530 status = IBT_WC_REMOTE_ACCESS_ERR;
1531 break;
1532
1533 case TAVOR_CQE_REM_OP_ERR:
1534 status = IBT_WC_REMOTE_OP_ERR;
1535 break;
1536
1537 case TAVOR_CQE_TRANS_TO_ERR:
1538 status = IBT_WC_TRANS_TIMEOUT_ERR;
1539 break;
1540
1541 case TAVOR_CQE_RNRNAK_TO_ERR:
1542 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
1543 break;
1544
1545 /*
1546 * The following error codes are not supported in the Tavor driver
1547 * as they relate only to Reliable Datagram completion statuses:
1548 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
1549 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
1550 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
1551 * case TAVOR_CQE_INV_EEC_NUM_ERR:
1552 * case TAVOR_CQE_INV_EEC_STATE_ERR:
1553 * case TAVOR_CQE_LOC_EEC_ERR:
1554 */
1555
1556 default:
1557 TAVOR_WARNING(state, "unknown error CQE status");
1558 status = IBT_WC_LOCAL_QP_OP_ERR;
1559 TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
1560 TAVOR_TNF_ERROR, "", tnf_uint, status, status);
1561 break;
1562 }
1563 wc->wc_status = status;
1564
1565 /*
1566 * Now we do all the checking that's necessary to handle completion
1567 * queue entry "recycling"
1568 *
1569 * It is not necessary here to try to sync the WQE as we are only
1570 * attempting to read from the Work Queue (and hardware does not
1571 * write to it).
1572 */
1573
1574 /*
1575 * We can get doorbell info, WQE address, size for the next WQE
1576 * from the "wre" (which was filled in above in the call to the
1577 * tavor_wrid_get_entry() routine)
1578 */
1579 dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
1580 next_wqeaddr = wre.wr_wqeaddrsz;
1581 nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;
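	/*
	 * Note: "wr_wqeaddrsz" packs both the next WQE's address and its
	 * size (the NDS field masked off above) into a single word; the two
	 * are recombined with TAVOR_QP_WQEADDRSZ() below if the CQE must be
	 * recycled.
	 */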
1582
1583 /*
1584 * Get the doorbell count from the CQE. This indicates how many
1585 * completions this one CQE represents.
1586 */
1587 doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
1588
1589 /*
1590 * Determine if we're ready to consume this CQE yet or not. If the
1591 * next WQE has size zero (i.e. no next WQE) or if the doorbell count
1592 * is down to zero, then this is the last/only completion represented
1593 * by the current CQE (return TAVOR_CQ_SYNC_AND_DB). Otherwise, the
1594 * current CQE needs to be recycled (see below).
1595 */
1596 if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
1597 /*
1598 * Consume the CQE
1599 * Return status to indicate that doorbell and sync may be
1600 * necessary.
1601 */
1602 TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1603 return (TAVOR_CQ_SYNC_AND_DB);
1604
1605 } else {
1606 /*
1607 * Recycle the CQE for use in the next PollCQ() call
1608 * Decrement the doorbell count, modify the error status,
1609 * and update the WQE address and size (to point to the
1610		 * next WQE on the chain).  Put these updated entries back
1611 * into the CQE.
1612 * Despite the fact that we have updated the CQE, it is not
1613 * necessary for us to attempt to sync this entry just yet
1614 * as we have not changed the "hardware's view" of the
1615 * entry (i.e. we have not modified the "owner" bit - which
1616		 * is all that the Tavor hardware really cares about).
1617 */
1618 doorbell_cnt = doorbell_cnt - dbd;
1619 TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
1620 ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
1621 (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
1622 TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
1623 TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
1624
1625 TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
1626 return (TAVOR_CQ_RECYCLE_ENTRY);
1627 }
1628 }
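
/*
 * A minimal sketch (an assumption for illustration, not the actual polling
 * code) of how a caller of tavor_cq_errcqe_consume() might act on its
 * return value: TAVOR_CQ_SYNC_AND_DB means the CQE has been fully consumed
 * (hand it back to hardware, sync it, and advance the consumer index),
 * while TAVOR_CQ_RECYCLE_ENTRY means the same CQE should be examined again
 * on the next poll:
 *
 *	status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
 *	if (status == TAVOR_CQ_SYNC_AND_DB) {
 *		TAVOR_CQE_OWNER_SET_HW(cq, cqe);
 *		tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);
 *		cons_indx = (cons_indx + 1) & wrap_around_mask;
 *	}
 *	(for TAVOR_CQ_RECYCLE_ENTRY, leave the consumer index alone and
 *	re-read the recycled CQE on the next call)
 */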
1629
1630
1631 /*
1632 * tavor_cqe_sync()
1633 * Context: Can be called from interrupt or base context.
1634 */
1635 static void
1636 tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
1637 {
1638 ddi_dma_handle_t dmahdl;
1639 off_t offset;
1640 int status;
1641
1642 TAVOR_TNF_ENTER(tavor_cqe_sync);
1643
1644 /* Determine if CQ needs to be synced or not */
1645 if (cq->cq_sync == 0) {
1646 TAVOR_TNF_EXIT(tavor_cqe_sync);
1647 return;
1648 }
1649
1650 /* Get the DMA handle from CQ context */
1651 dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;
1652
1653	/* Calculate the offset of this CQE within the CQ buffer */
1654 offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
1655 status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
1656 if (status != DDI_SUCCESS) {
1657 TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
1658 TAVOR_TNF_ERROR, "");
1659 TAVOR_TNF_EXIT(tavor_cqe_sync);
1660 return;
1661 }
1662
1663 TAVOR_TNF_EXIT(tavor_cqe_sync);
1664 }
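
/*
 * Usage sketch (illustrative assumption, not taken verbatim from a caller):
 * a CQE is synced for the CPU before software reads it and synced for the
 * device after software hands it back to the hardware, e.g.:
 *
 *	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
 *	... examine the CQE contents ...
 *	TAVOR_CQE_OWNER_SET_HW(cq, cqe);
 *	tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);
 */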
1665
1666
1667 /*
1668 * tavor_cq_resize_helper()
1669 * Context: Can be called only from user or kernel context.
1670 */
1671 static void
1672 tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
1673 uint32_t old_cons_indx, uint32_t num_newcqe)
1674 {
1675 tavor_hw_cqe_t *old_cqe, *new_cqe;
1676 uint32_t new_cons_indx, wrap_around_mask;
1677 int i;
1678
1679 TAVOR_TNF_ENTER(tavor_cq_resize_helper);
1680
1681 ASSERT(MUTEX_HELD(&cq->cq_lock));
1682
1683	/* Start the "new" CQ consumer index at zero */
1684 new_cons_indx = 0;
1685
1686 /*
1687 * Calculate the wrap around mask. Note: This operation only works
1688 * because all Tavor completion queues have power-of-2 sizes
1689 */
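	/*
	 * Worked example (illustrative only): for a CQ of 256 entries the
	 * mask is 0xFF, so incrementing an index past the end of the
	 * buffer wraps it back around to zero:
	 *
	 *	wrap_around_mask = 256 - 1;		(0xFF)
	 *	(255 + 1) & wrap_around_mask == 0
	 */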
1690 wrap_around_mask = (cq->cq_bufsz - 1);
1691
1692 /*
1693 * Calculate the pointers to the first CQ entry (in the "old" CQ)
1694 * and the first CQ entry in the "new" CQ
1695 */
1696 old_cqe = &cq->cq_buf[old_cons_indx];
1697 new_cqe = &new_cqbuf[new_cons_indx];
1698
1699 /* Sync entire "old" CQ for use by software (if necessary). */
1700 if (cq->cq_sync) {
1701 (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
1702 0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
1703 }
1704
1705 /*
1706	 * Copy the outstanding entries (the number of which, "num_newcqe",
1707	 * was determined by the caller) from the "old" CQ into the "new"
1708	 * CQ, updating the respective indices and pointers in both CQs.
1709 */
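	/*
	 * Worked example (illustrative only): with an old CQ of 8 entries,
	 * old_cons_indx == 6 and num_newcqe == 4, the loop below copies
	 *
	 *	old entry 6 -> new entry 0
	 *	old entry 7 -> new entry 1
	 *	old entry 0 -> new entry 2	(old index wraps via the mask)
	 *	old entry 1 -> new entry 3
	 */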
1710 for (i = 0; i < num_newcqe; i++) {
1711
1712 /* Copy this old CQE into the "new_cqe" pointer */
1713 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t));
1714
1715 /* Increment the consumer index (for both CQs) */
1716 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask;
1717 new_cons_indx = (new_cons_indx + 1);
1718
1719		/* Update the pointers to the next CQ entries */
1720 old_cqe = &cq->cq_buf[old_cons_indx];
1721 new_cqe = &new_cqbuf[new_cons_indx];
1722 }
1723
1724 TAVOR_TNF_EXIT(tavor_cq_resize_helper);
1725 }
1726
1727 /*
1728 * tavor_cq_srq_entries_flush()
1729 * Context: Can be called from interrupt or base context.
1730 */
1731 void
1732 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp)
1733 {
1734 tavor_cqhdl_t cq;
1735 tavor_workq_hdr_t *wqhdr;
1736 tavor_hw_cqe_t *cqe;
1737 tavor_hw_cqe_t *next_cqe;
1738 uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
1739 uint32_t new_indx, check_indx, indx;
1740 uint32_t num_to_increment;
1741 int cqe_qpnum, cqe_type;
1742 int outstanding_cqes, removed_cqes;
1743 int i;
1744
1745 ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock));
1746
1747 cq = qp->qp_rq_cqhdl;
1748 wqhdr = qp->qp_rq_wqhdr;
1749
1750 ASSERT(wqhdr->wq_wrid_post != NULL);
1751 ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0);
1752
1753 /*
1754	 * Check for user-mapped CQ memory.  Note: We do not allow kernel
1755	 * clients to modify any userland-mapped CQ.  If the CQ is
1756	 * user-mapped, then we simply return here, and this "flush"
1757	 * function becomes a no-op.
1758 */
1759 if (cq->cq_is_umap) {
1760 return;
1761 }
1762
1763 /* Get the consumer index */
1764 cons_indx = cq->cq_consindx;
1765
1766 /*
1767 * Calculate the wrap around mask. Note: This operation only works
1768 * because all Tavor completion queues have power-of-2 sizes
1769 */
1770 wrap_around_mask = (cq->cq_bufsz - 1);
1771
1772 /* Calculate the pointer to the first CQ entry */
1773 cqe = &cq->cq_buf[cons_indx];
1774
1775 /* Sync the current CQE to read */
1776 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1777
1778 /*
1779 * Loop through the CQ looking for entries owned by software. If an
1780 * entry is owned by software then we increment an 'outstanding_cqes'
1781 * count to know how many entries total we have on our CQ. We use this
1782 * value further down to know how many entries to loop through looking
1783 * for our same QP number.
1784 */
1785 outstanding_cqes = 0;
1786 tail_cons_indx = cons_indx;
1787 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
1788 /* increment total cqes count */
1789 outstanding_cqes++;
1790
1791 /* increment the consumer index */
1792 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1793
1794 /* update the pointer to the next cq entry */
1795 cqe = &cq->cq_buf[tail_cons_indx];
1796
1797 /* sync the next cqe to read */
1798 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
1799 }
1800
1801 /*
1802	 * Using the 'tail_cons_indx' that was just set, we now know how
1803	 * many outstanding CQEs there are in total.  Set 'check_indx' and
1804	 * 'new_indx' to the last entry identified by 'tail_cons_indx'.
1805 */
1806 check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
1807
1808 for (i = 0; i < outstanding_cqes; i++) {
1809 cqe = &cq->cq_buf[check_indx];
1810
1811 /* Grab QP number from CQE */
1812 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
1813 cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe);
1814
1815 /*
1816	 * If the QP number in the CQE matches the QP being flushed,
1817	 * then we must return the corresponding entry to the SRQ
1818	 * free list.  We also make sure that the completion type is
1819	 * 'TAVOR_COMPLETION_RECV', so any send completions on
1820	 * this CQ will be left as-is.  The handling of returning
1821	 * entries back to HW ownership happens further down.
1822 */
1823 if (cqe_qpnum == qp->qp_qpnum &&
1824 cqe_type == TAVOR_COMPLETION_RECV) {
1825
1826 /* Add back to SRQ free list */
1827 (void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post,
1828 cq, cqe);
1829 } else {
1830 /* Do Copy */
1831 if (check_indx != new_indx) {
1832 next_cqe = &cq->cq_buf[new_indx];
1833
1834 /*
1835 * Copy the CQE into the "next_cqe"
1836 * pointer.
1837 */
1838 bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t));
1839 }
1840 new_indx = (new_indx - 1) & wrap_around_mask;
1841 }
1842 /* Move index to next CQE to check */
1843 check_indx = (check_indx - 1) & wrap_around_mask;
1844 }
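	/*
	 * Worked example (illustrative only): suppose four software-owned
	 * CQEs sit at indices 0..3 (cons_indx == 0, tail_cons_indx == 4)
	 * and only the CQE at index 2 belongs to the QP being flushed.
	 * The scan above runs backward from index 3:
	 *
	 *	indx 3: keep	(no copy needed, check_indx == new_indx)
	 *	indx 2: flush	(entry returned to the SRQ free list)
	 *	indx 1: keep	(copied to index 2)
	 *	indx 0: keep	(copied to index 1)
	 *
	 * leaving the kept entries at indices 1..3 and new_indx == 0, so
	 * slot 0 is handed back to the hardware below and the consumer
	 * index becomes 1.
	 */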
1845
1846 /* Initialize removed cqes count */
1847 removed_cqes = 0;
1848
1849 /* If an entry was removed */
1850 if (check_indx != new_indx) {
1851
1852 /*
1853	 * At this point, all of the entries being kept have been
1854	 * compacted above 'new_indx'; the slots from the original
1855	 * consumer index up through 'new_indx' no longer hold valid
1856	 * entries.  Walk those freed slots and mark each one as
1857	 * having HW ownership.  We do that here.
1858 */
1859
1860 /* Loop through all entries until we reach our new pointer */
1861 for (indx = cons_indx; indx <= new_indx;
1862 indx = (indx + 1) & wrap_around_mask) {
1863 removed_cqes++;
1864 cqe = &cq->cq_buf[indx];
1865
1866 /* Reset entry to hardware ownership */
1867 TAVOR_CQE_OWNER_SET_HW(cq, cqe);
1868 }
1869 }
1870
1871 /*
1872	 * Update the consumer index to move it past all of the removed
1873	 * entries.  Because 'new_indx' points to the last of the freed
1874	 * slots (or, when nothing was removed, to the slot just before the
1875	 * original consumer index), we add 1 to get the new cons_indx.
1876 */
1877 cons_indx = (new_indx + 1) & wrap_around_mask;
1878
1879 /*
1880	 * Now we only ring the doorbell (to update the consumer index) if
1881	 * we've actually consumed a CQ entry.  If we found no QP number
1882	 * matches above, then we would not have removed anything, and in
1883	 * that case there is no need to ring the doorbell.
1884 */
1885 if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1886 /*
1887 * Post doorbell to update the consumer index. Doorbell
1888 * value indicates number of entries consumed (minus 1)
1889 */
1890 if (cons_indx > cq->cq_consindx) {
1891 num_to_increment = (cons_indx - cq->cq_consindx) - 1;
1892 } else {
1893 num_to_increment = ((cons_indx + cq->cq_bufsz) -
1894 cq->cq_consindx) - 1;
1895 }
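		/*
		 * Worked example (illustrative only): with cq_bufsz == 256,
		 * an old consumer index of 250 and a new one of 4, the index
		 * has wrapped, so
		 *
		 *	num_to_increment = ((4 + 256) - 250) - 1 = 9
		 *
		 * i.e. ten entries were consumed and the doorbell is given
		 * that count minus one.
		 */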
1896 cq->cq_consindx = cons_indx;
1897
1898 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
1899 cq->cq_cqnum, num_to_increment);
1900 }
1901 }
1902