/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_cq.c
 *    Tavor Completion Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, resizing,
 *    and handling the completion type events that the Tavor hardware can
 *    generate.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd,
    uint32_t cqn, uint32_t cq_param);
#pragma inline(tavor_cq_doorbell)
static int tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static int tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc);
static void tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
    uint_t flag);
static void tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe);


/*
 * tavor_cq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_alloc(tavor_state_t *state, ibt_cq_hdl_t ibt_cqhdl,
    ibt_cq_attr_t *cq_attr, uint_t *actual_size, tavor_cqhdl_t *cqhdl,
    uint_t sleepflag)
{
    tavor_rsrc_t *cqc, *rsrc;
    tavor_umap_db_entry_t *umapdb;
    tavor_hw_cqc_t cqc_entry;
    tavor_cqhdl_t cq;
    ibt_mr_attr_t mr_attr;
    tavor_mr_options_t op;
    tavor_pdhdl_t pd;
    tavor_mrhdl_t mr;
    tavor_hw_cqe_t *buf;
    uint64_t addr, value;
    uint32_t log_cq_size, lkey, uarpg;
    uint_t dma_xfer_mode, cq_sync, cq_is_umap;
    int status, i, flag;
    char *errormsg;

    TAVOR_TNF_ENTER(tavor_cq_alloc);

    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq_attr))

    /*
     * Determine whether CQ is being allocated for userland access or
     * whether it is being allocated for kernel access. If the CQ is
     * being allocated for userland access, then lookup the UAR doorbell
     * page number for the current process. Note: If this is not found
     * (e.g. if the process has not previously open()'d the Tavor driver),
     * then an error is returned.
     */
    cq_is_umap = (cq_attr->cq_flags & IBT_CQ_USER_MAP) ? 1 : 0;
    if (cq_is_umap) {
        status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
            MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
        if (status != DDI_SUCCESS) {
            /* Set "status" and "errormsg" and goto failure */
            TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
            goto cqalloc_fail;
        }
        uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
    }

    /* Use the internal protection domain (PD) for setting up CQs */
    pd = state->ts_pdhdl_internal;

    /* Increment the reference count on the protection domain (PD) */
    tavor_pd_refcnt_inc(pd);

    /*
     * Allocate a CQ context entry. This will be filled in with all
     * the necessary parameters to define the Completion Queue. And then
     * ownership will be passed to the hardware in the final step
     * below. If we fail here, we must undo the protection domain
     * reference count.
     */
    status = tavor_rsrc_alloc(state, TAVOR_CQC, 1, sleepflag, &cqc);
    if (status != DDI_SUCCESS) {
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ context");
        goto cqalloc_fail1;
    }

    /*
     * Allocate the software structure for tracking the completion queue
     * (i.e. the Tavor Completion Queue handle). If we fail here, we must
     * undo the protection domain reference count and the previous
     * resource allocation.
     */
    status = tavor_rsrc_alloc(state, TAVOR_CQHDL, 1, sleepflag, &rsrc);
    if (status != DDI_SUCCESS) {
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed CQ handle");
        goto cqalloc_fail2;
    }
    cq = (tavor_cqhdl_t)rsrc->tr_addr;
    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))
    cq->cq_is_umap = cq_is_umap;

    /* Use the index as CQ number */
    cq->cq_cqnum = cqc->tr_indx;

    /*
     * If this will be a user-mappable CQ, then allocate an entry for
     * the "userland resources database". This will later be added to
     * the database (after all further CQ operations are successful).
     * If we fail here, we must undo the reference counts and the
     * previous resource allocation.
     */
    if (cq->cq_is_umap) {
        umapdb = tavor_umap_db_alloc(state->ts_instance, cq->cq_cqnum,
            MLNX_UMAP_CQMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
        if (umapdb == NULL) {
            /* Set "status" and "errormsg" and goto failure */
            TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
            goto cqalloc_fail3;
        }
    }

    /*
     * Calculate the appropriate size for the completion queue.
     * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
     * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
     * to round the requested size up to the next highest power-of-2
     */
    cq_attr->cq_size = max(cq_attr->cq_size, TAVOR_CQ_MIN_SIZE);
    log_cq_size = highbit(cq_attr->cq_size);

    /*
     * Next we verify that the rounded-up size is valid (i.e. consistent
     * with the device limits and/or software-configured limits)
     */
    if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
        goto cqalloc_fail4;
    }

    /*
     * Allocate the memory for Completion Queue.
     *
     * Note: Although we use the common queue allocation routine, we
     * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
     * kernel system memory) for kernel CQs because it would be
     * inefficient to have CQs located in DDR memory. This is primarily
     * because CQs are read from (by software) more than they are written
     * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
     * user-mappable CQs for a similar reason.)
     * It is also worth noting that, unlike Tavor QP work queues,
     * completion queues do not have the same strict alignment
     * requirements. It is sufficient for the CQ memory to be both
     * aligned to and bound to addresses which are a multiple of CQE size.
     */
    cq->cq_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
    cq->cq_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
    cq->cq_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
    if (cq->cq_is_umap) {
        cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
    } else {
        cq->cq_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
    }
    status = tavor_queue_alloc(state, &cq->cq_cqinfo, sleepflag);
    if (status != DDI_SUCCESS) {
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
        goto cqalloc_fail4;
    }
    buf = (tavor_hw_cqe_t *)cq->cq_cqinfo.qa_buf_aligned;
    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

    /*
     * Initialize each of the Completion Queue Entries (CQE) by setting
     * their ownership to hardware ("owner" bit set to HW). This is in
     * preparation for the final transfer of ownership (below) of the
     * CQ context itself.
     */
    for (i = 0; i < (1 << log_cq_size); i++) {
        TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
    }

    /*
     * Register the memory for the CQ. The memory for the CQ must
     * be registered in the Tavor TPT tables. This gives us the LKey
     * to specify in the CQ context below. Note: If this is a user-
     * mappable CQ, then we will force DDI_DMA_CONSISTENT mapping.
     */
    flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
    mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
    mr_attr.mr_len = cq->cq_cqinfo.qa_size;
    mr_attr.mr_as = NULL;
    mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
    if (cq->cq_is_umap) {
        dma_xfer_mode = DDI_DMA_CONSISTENT;
    } else {
        dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
    }
    if (dma_xfer_mode == DDI_DMA_STREAMING) {
        mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
    }
    op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
    op.mro_bind_dmahdl = cq->cq_cqinfo.qa_dmahdl;
    op.mro_bind_override_addr = 0;
    status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
    if (status != DDI_SUCCESS) {
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
        goto cqalloc_fail5;
    }
    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
    addr = mr->mr_bindinfo.bi_addr;
    lkey = mr->mr_lkey;

    /* Determine if later ddi_dma_sync will be necessary */
    cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, cq->cq_cqinfo);

    /* Sync entire CQ for use by the hardware (if necessary). */
    if (cq_sync) {
        (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
            cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
    }

    /*
     * Fill in the CQC entry. This is the final step before passing
     * ownership of the CQC entry to the Tavor hardware. We use all of
     * the information collected/calculated above to fill in the
     * requisite portions of the CQC. Note: If this CQ is going to be
     * used for userland access, then we need to set the UAR page number
     * appropriately (otherwise it's a "don't care")
     */
    bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
    cq->cq_eqnum = TAVOR_CQ_EQNUM_GET(cq->cq_cqnum);
    cq->cq_erreqnum = TAVOR_CQ_ERREQNUM_GET(cq->cq_cqnum);
    cqc_entry.xlat = TAVOR_VA2PA_XLAT_ENABLED;
    cqc_entry.state = TAVOR_CQ_DISARMED;
    cqc_entry.start_addr_h = (addr >> 32);
    cqc_entry.start_addr_l = (addr & 0xFFFFFFFF);
    cqc_entry.log_cq_sz = log_cq_size;
    if (cq->cq_is_umap) {
        cqc_entry.usr_page = uarpg;
    } else {
        cqc_entry.usr_page = 0;
    }
    cqc_entry.pd = pd->pd_pdnum;
    cqc_entry.lkey = lkey;
    cqc_entry.e_eqn = cq->cq_erreqnum;
    cqc_entry.c_eqn = cq->cq_eqnum;
    cqc_entry.cqn = cq->cq_cqnum;

    /*
     * Write the CQC entry to hardware. Lastly, we pass ownership of
     * the entry to the hardware (using the Tavor SW2HW_CQ firmware
     * command). Note: In general, this operation shouldn't fail. But
     * if it does, we have to undo everything we've done above before
     * returning error.
     */
    status = tavor_cmn_ownership_cmd_post(state, SW2HW_CQ, &cqc_entry,
        sizeof (tavor_hw_cqc_t), cq->cq_cqnum, sleepflag);
    if (status != TAVOR_CMD_SUCCESS) {
        cmn_err(CE_CONT, "Tavor: SW2HW_CQ command failed: %08x\n",
            status);
        TNF_PROBE_1(tavor_cq_alloc_sw2hw_cq_cmd_fail,
            TAVOR_TNF_ERROR, "", tnf_uint, status, status);
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "tavor SW2HW_CQ command");
        goto cqalloc_fail6;
    }

    /*
     * Fill in the rest of the Tavor Completion Queue handle. Having
     * successfully transferred ownership of the CQC, we can update the
     * following fields for use in further operations on the CQ.
     */
    cq->cq_cqcrsrcp = cqc;
    cq->cq_rsrcp = rsrc;
    cq->cq_consindx = 0;
    cq->cq_buf = buf;
    cq->cq_bufsz = (1 << log_cq_size);
    cq->cq_mrhdl = mr;
    cq->cq_sync = cq_sync;
    cq->cq_refcnt = 0;
    cq->cq_is_special = 0;
    cq->cq_uarpg = uarpg;
    cq->cq_umap_dhp = (devmap_cookie_t)NULL;
    avl_create(&cq->cq_wrid_wqhdr_avl_tree, tavor_wrid_wqhdr_compare,
        sizeof (struct tavor_workq_hdr_s),
        offsetof(struct tavor_workq_hdr_s, wq_avl_link));

    cq->cq_wrid_reap_head = NULL;
    cq->cq_wrid_reap_tail = NULL;
    cq->cq_hdlrarg = (void *)ibt_cqhdl;

    /*
     * Put CQ handle in Tavor CQNum-to-CQHdl list. Then fill in the
     * "actual_size" and "cqhdl" and return success
     */
    ASSERT(state->ts_cqhdl[cqc->tr_indx] == NULL);
    state->ts_cqhdl[cqc->tr_indx] = cq;

    /*
     * If this is a user-mappable CQ, then we need to insert the previously
     * allocated entry into the "userland resources database". This will
     * allow for later lookup during devmap() (i.e. mmap()) calls.
     */
    if (cq->cq_is_umap) {
        tavor_umap_db_add(umapdb);
    }

    /*
     * Fill in the return arguments (if necessary). This includes the
     * real completion queue size.
     */
    if (actual_size != NULL) {
        *actual_size = (1 << log_cq_size) - 1;
    }
    *cqhdl = cq;

    TAVOR_TNF_EXIT(tavor_cq_alloc);
    return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
cqalloc_fail6:
    if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
        sleepflag) != DDI_SUCCESS) {
        TAVOR_WARNING(state, "failed to deregister CQ memory");
    }
cqalloc_fail5:
    tavor_queue_free(state, &cq->cq_cqinfo);
cqalloc_fail4:
    if (cq_is_umap) {
        tavor_umap_db_free(umapdb);
    }
cqalloc_fail3:
    tavor_rsrc_free(state, &rsrc);
cqalloc_fail2:
    tavor_rsrc_free(state, &cqc);
cqalloc_fail1:
    tavor_pd_refcnt_dec(pd);
cqalloc_fail:
    TNF_PROBE_1(tavor_cq_alloc_fail, TAVOR_TNF_ERROR, "",
        tnf_string, msg, errormsg);
    TAVOR_TNF_EXIT(tavor_cq_alloc);
    return (status);
}


/*
 * tavor_cq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_cq_free(tavor_state_t *state, tavor_cqhdl_t *cqhdl, uint_t sleepflag)
{
    tavor_rsrc_t *cqc, *rsrc;
    tavor_umap_db_entry_t *umapdb;
    tavor_hw_cqc_t cqc_entry;
    tavor_pdhdl_t pd;
    tavor_mrhdl_t mr;
    tavor_cqhdl_t cq;
    uint32_t cqnum;
    uint64_t value;
    uint_t maxprot;
    int status;

    TAVOR_TNF_ENTER(tavor_cq_free);

    /*
     * Pull all the necessary information from the Tavor Completion Queue
     * handle. This is necessary here because the resource for the
     * CQ handle is going to be freed up as part of this operation.
     */
    cq = *cqhdl;
    mutex_enter(&cq->cq_lock);
    cqc = cq->cq_cqcrsrcp;
    rsrc = cq->cq_rsrcp;
    pd = state->ts_pdhdl_internal;
    mr = cq->cq_mrhdl;
    cqnum = cq->cq_cqnum;

    /*
     * If there are work queues still associated with the CQ, then return
     * an error. Otherwise, we will be holding the CQ lock.
     */
    if (cq->cq_refcnt != 0) {
        mutex_exit(&cq->cq_lock);
        TNF_PROBE_1(tavor_cq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
            tnf_int, refcnt, cq->cq_refcnt);
        TAVOR_TNF_EXIT(tavor_cq_free);
        return (IBT_CQ_BUSY);
    }

    /*
     * If this was a user-mappable CQ, then we need to remove its entry
     * from the "userland resources database". If it is also currently
     * mmap()'d out to a user process, then we need to call
     * devmap_devmem_remap() to remap the CQ memory to an invalid mapping.
     * We also need to invalidate the CQ tracking information for the
     * user mapping.
     */
    if (cq->cq_is_umap) {
        status = tavor_umap_db_find(state->ts_instance, cqnum,
            MLNX_UMAP_CQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
            &umapdb);
        if (status != DDI_SUCCESS) {
            mutex_exit(&cq->cq_lock);
            TAVOR_WARNING(state, "failed to find in database");
            TAVOR_TNF_EXIT(tavor_cq_free);
            return (ibc_get_ci_failure(0));
        }
        tavor_umap_db_free(umapdb);
        if (cq->cq_umap_dhp != NULL) {
            maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
            status = devmap_devmem_remap(cq->cq_umap_dhp,
                state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size,
                maxprot, DEVMAP_MAPPING_INVALID, NULL);
            if (status != DDI_SUCCESS) {
                mutex_exit(&cq->cq_lock);
                TAVOR_WARNING(state, "failed in CQ memory "
                    "devmap_devmem_remap()");
                TAVOR_TNF_EXIT(tavor_cq_free);
                return (ibc_get_ci_failure(0));
            }
            cq->cq_umap_dhp = (devmap_cookie_t)NULL;
        }
    }

    /*
     * Put NULL into the Tavor CQNum-to-CQHdl list. This will allow any
     * in-progress events to detect that the CQ corresponding to this
     * number has been freed.
     */
    state->ts_cqhdl[cqc->tr_indx] = NULL;

    /*
     * While we hold the CQ lock, do a "forced reap" of the workQ WRID
     * list. This cleans up all the structures associated with the WRID
     * processing for this CQ. Once we complete, drop the lock and finish
     * the deallocation of the CQ.
     */
    tavor_wrid_cq_force_reap(cq);

    mutex_exit(&cq->cq_lock);
    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cq))

    /*
     * Reclaim CQC entry from hardware (using the Tavor HW2SW_CQ
     * firmware command). If the ownership transfer fails for any reason,
     * then it is an indication that something (either in HW or SW) has
     * gone seriously wrong.
     */
    status = tavor_cmn_ownership_cmd_post(state, HW2SW_CQ, &cqc_entry,
        sizeof (tavor_hw_cqc_t), cqnum, sleepflag);
    if (status != TAVOR_CMD_SUCCESS) {
        TAVOR_WARNING(state, "failed to reclaim CQC ownership");
        cmn_err(CE_CONT, "Tavor: HW2SW_CQ command failed: %08x\n",
            status);
        TNF_PROBE_1(tavor_cq_free_hw2sw_cq_cmd_fail,
            TAVOR_TNF_ERROR, "", tnf_uint, status, status);
        TAVOR_TNF_EXIT(tavor_cq_free);
        return (ibc_get_ci_failure(0));
    }

    /*
     * Deregister the memory for the Completion Queue. If this fails
     * for any reason, then it is an indication that something (either
     * in HW or SW) has gone seriously wrong. So we print a warning
     * message and return.
     */
    status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
        sleepflag);
    if (status != DDI_SUCCESS) {
        TAVOR_WARNING(state, "failed to deregister CQ memory");
        TNF_PROBE_0(tavor_cq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_cq_free);
        return (ibc_get_ci_failure(0));
    }

    /* Free the memory for the CQ */
    tavor_queue_free(state, &cq->cq_cqinfo);

    /* Free the Tavor Completion Queue handle */
    tavor_rsrc_free(state, &rsrc);

    /* Free up the CQC entry resource */
    tavor_rsrc_free(state, &cqc);

    /* Decrement the reference count on the protection domain (PD) */
    tavor_pd_refcnt_dec(pd);

    /* Set the cqhdl pointer to NULL and return success */
    *cqhdl = NULL;

    TAVOR_TNF_EXIT(tavor_cq_free);
    return (DDI_SUCCESS);
}


/*
 * tavor_cq_resize()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_cq_resize(tavor_state_t *state, tavor_cqhdl_t cq, uint_t req_size,
    uint_t *actual_size, uint_t sleepflag)
{
    tavor_hw_cqc_t cqc_entry;
    tavor_qalloc_info_t new_cqinfo, old_cqinfo;
    ibt_mr_attr_t mr_attr;
    tavor_mr_options_t op;
    tavor_pdhdl_t pd;
    tavor_mrhdl_t mr, mr_old;
    tavor_hw_cqe_t *buf;
    uint32_t new_prod_indx, old_cons_indx;
    uint_t dma_xfer_mode, cq_sync, log_cq_size, maxprot;
    int status, i, flag;
    char *errormsg;

    TAVOR_TNF_ENTER(tavor_cq_resize);

    /* Use the internal protection domain (PD) for CQs */
    pd = state->ts_pdhdl_internal;

    /*
     * Calculate the appropriate size for the new resized completion queue.
     * Note: All Tavor CQs must be a power-of-2 minus 1 in size. Also
     * they may not be any smaller than TAVOR_CQ_MIN_SIZE. This step is
     * to round the requested size up to the next highest power-of-2
     */
    req_size = max(req_size, TAVOR_CQ_MIN_SIZE);
    log_cq_size = highbit(req_size);

    /*
     * Next we verify that the rounded-up size is valid (i.e. consistent
     * with the device limits and/or software-configured limits)
     */
    if (log_cq_size > state->ts_cfg_profile->cp_log_max_cq_sz) {
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_HCA_CQ_EXCEEDED, "max CQ size");
        goto cqresize_fail;
    }

    /*
     * Allocate the memory for newly resized Completion Queue.
     *
     * Note: Although we use the common queue allocation routine, we
     * always specify TAVOR_QUEUE_LOCATION_NORMAL (i.e. CQ located in
     * kernel system memory) for kernel CQs because it would be
     * inefficient to have CQs located in DDR memory. This is the same
     * as we do when we first allocate completion queues primarily
     * because CQs are read from (by software) more than they are written
     * to. (We always specify TAVOR_QUEUE_LOCATION_USERLAND for all
     * user-mappable CQs for a similar reason.)
     * It is also worth noting that, unlike Tavor QP work queues,
     * completion queues do not have the same strict alignment
     * requirements. It is sufficient for the CQ memory to be both
     * aligned to and bound to addresses which are a multiple of CQE size.
     */
    new_cqinfo.qa_size = (1 << log_cq_size) * sizeof (tavor_hw_cqe_t);
    new_cqinfo.qa_alloc_align = sizeof (tavor_hw_cqe_t);
    new_cqinfo.qa_bind_align = sizeof (tavor_hw_cqe_t);
    if (cq->cq_is_umap) {
        new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
    } else {
        new_cqinfo.qa_location = TAVOR_QUEUE_LOCATION_NORMAL;
    }
    status = tavor_queue_alloc(state, &new_cqinfo, sleepflag);
    if (status != DDI_SUCCESS) {
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed completion queue");
        goto cqresize_fail;
    }
    buf = (tavor_hw_cqe_t *)new_cqinfo.qa_buf_aligned;
    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

    /*
     * Initialize each of the Completion Queue Entries (CQE) by setting
     * their ownership to hardware ("owner" bit set to HW). This is in
     * preparation for the final resize operation (below).
     */
    for (i = 0; i < (1 << log_cq_size); i++) {
        TAVOR_CQE_OWNER_SET_HW(cq, &buf[i]);
    }

    /*
     * Register the memory for the CQ. The memory for the CQ must
     * be registered in the Tavor TPT tables. This gives us the LKey
     * to specify in the CQ context below.
     */
    flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP;
    mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
    mr_attr.mr_len = new_cqinfo.qa_size;
    mr_attr.mr_as = NULL;
    mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
    if (cq->cq_is_umap) {
        dma_xfer_mode = DDI_DMA_CONSISTENT;
    } else {
        dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
    }
    if (dma_xfer_mode == DDI_DMA_STREAMING) {
        mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
    }
    op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
    op.mro_bind_dmahdl = new_cqinfo.qa_dmahdl;
    op.mro_bind_override_addr = 0;
    status = tavor_mr_register(state, pd, &mr_attr, &mr, &op);
    if (status != DDI_SUCCESS) {
        tavor_queue_free(state, &new_cqinfo);
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
        goto cqresize_fail;
    }
    _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

    /* Determine if later ddi_dma_sync will be necessary */
    cq_sync = TAVOR_CQ_IS_SYNC_REQ(state, new_cqinfo);

    /* Sync entire "new" CQ for use by hardware (if necessary) */
    if (cq_sync) {
        (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
            new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
    }

    /*
     * Now we grab the CQ lock. Since we will be updating the actual
     * CQ location and the producer/consumer indexes, we should hold
     * the lock.
     *
     * We do a TAVOR_NOSLEEP here (and below), though, because we are
     * holding the "cq_lock" and if we got raised to interrupt level
     * by priority inversion, we would not want to block in this routine
     * waiting for success.
     */
    mutex_enter(&cq->cq_lock);

    /*
     * Determine the current CQ "consumer index".
     *
     * Note: This will depend on whether the CQ had previously been
     * mapped for user access or whether it is a kernel CQ. If this
     * is a kernel CQ, then all PollCQ() operations have come through
     * the IBTF and, hence, the driver's CQ state structure will
     * contain the current consumer index. If, however, the user has
     * accessed this CQ by bypassing the driver (OS-bypass), then we
     * need to query the firmware to determine the current CQ consumer
     * index. This also assumes that the user process will not continue
     * to consume entries while at the same time doing the ResizeCQ()
     * operation. If the user process does not guarantee this, then it
     * may see duplicate or missed completions. But under no
     * circumstances should this panic the system.
     */
    if (cq->cq_is_umap) {
        status = tavor_cmn_query_cmd_post(state, QUERY_CQ,
            cq->cq_cqnum, &cqc_entry, sizeof (tavor_hw_cqc_t),
            TAVOR_NOSLEEP);
        if (status != TAVOR_CMD_SUCCESS) {
            /* Query CQ has failed, drop CQ lock and cleanup */
            mutex_exit(&cq->cq_lock);
            if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
                sleepflag) != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister "
                    "CQ memory");
            }
            tavor_queue_free(state, &new_cqinfo);
            TAVOR_WARNING(state, "failed to find in database");

            /* Set "status" and "errormsg" and goto failure */
            TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                "failed umap lookup");
            goto cqresize_fail;
        }
        old_cons_indx = cqc_entry.cons_indx;
    } else {
        old_cons_indx = cq->cq_consindx;
    }

    /*
     * Fill in the CQC entry. For the resize operation this is the
     * final step before attempting the resize operation on the CQC entry.
     * We use all of the information collected/calculated above to fill
     * in the requisite portions of the CQC.
     */
    bzero(&cqc_entry, sizeof (tavor_hw_cqc_t));
    cqc_entry.start_addr_h = (mr->mr_bindinfo.bi_addr >> 32);
    cqc_entry.start_addr_l = (mr->mr_bindinfo.bi_addr & 0xFFFFFFFF);
    cqc_entry.log_cq_sz = log_cq_size;
    cqc_entry.lkey = mr->mr_lkey;

    /*
     * Write the CQC entry to hardware. Lastly, we pass ownership of
     * the entry to the hardware (using the Tavor RESIZE_CQ firmware
     * command). Note: In general, this operation shouldn't fail. But
     * if it does, we have to undo everything we've done above before
     * returning error. Also note that the status returned may indicate
     * the code to return to the IBTF.
     */
    status = tavor_resize_cq_cmd_post(state, &cqc_entry, cq->cq_cqnum,
        &new_prod_indx, TAVOR_CMD_NOSLEEP_SPIN);
    if (status != TAVOR_CMD_SUCCESS) {
        /* Resize attempt has failed, drop CQ lock and cleanup */
        mutex_exit(&cq->cq_lock);
        if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            sleepflag) != DDI_SUCCESS) {
            TAVOR_WARNING(state, "failed to deregister CQ memory");
        }
        tavor_queue_free(state, &new_cqinfo);
        if (status == TAVOR_CMD_BAD_SIZE) {
            TAVOR_TNF_EXIT(tavor_cq_resize);
            return (IBT_CQ_SZ_INSUFFICIENT);
        } else {
            cmn_err(CE_CONT, "Tavor: RESIZE_CQ command failed: "
                "%08x\n", status);
            TNF_PROBE_1(tavor_cq_resize_cq_cmd_fail,
                TAVOR_TNF_ERROR, "", tnf_uint, status, status);
            TAVOR_TNF_EXIT(tavor_cq_resize);
            return (ibc_get_ci_failure(0));
        }
    }

    /*
     * The CQ resize attempt was successful. Before dropping the CQ lock,
     * copy all of the CQEs from the "old" CQ into the "new" CQ. Note:
     * the Tavor firmware guarantees us that sufficient space is set aside
     * in the "new" CQ to handle any un-polled CQEs from the "old" CQ.
     * The two parameters to this helper function ("old_cons_indx" and
     * "new_prod_indx") essentially indicate the starting index and number
     * of any CQEs that might remain in the "old" CQ memory.
     */
    tavor_cq_resize_helper(cq, buf, old_cons_indx, new_prod_indx);

    /* Sync entire "new" CQ for use by hardware (if necessary) */
    if (cq_sync) {
        (void) ddi_dma_sync(mr->mr_bindinfo.bi_dmahdl, 0,
            new_cqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
    }

    /*
     * Update the Tavor Completion Queue handle with all the new
     * information. At the same time, save away all the necessary
     * information for freeing up the old resources
     */
    mr_old = cq->cq_mrhdl;
    old_cqinfo = cq->cq_cqinfo;
    cq->cq_cqinfo = new_cqinfo;
    cq->cq_consindx = 0;
    cq->cq_buf = buf;
    cq->cq_bufsz = (1 << log_cq_size);
    cq->cq_mrhdl = mr;
    cq->cq_sync = cq_sync;

    /*
     * If "old" CQ was a user-mappable CQ that is currently mmap()'d out
     * to a user process, then we need to call devmap_devmem_remap() to
     * invalidate the mapping to the CQ memory. We also need to
     * invalidate the CQ tracking information for the user mapping.
     */
    if ((cq->cq_is_umap) && (cq->cq_umap_dhp != NULL)) {
        maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
        status = devmap_devmem_remap(cq->cq_umap_dhp,
            state->ts_dip, 0, 0, cq->cq_cqinfo.qa_size, maxprot,
            DEVMAP_MAPPING_INVALID, NULL);
        if (status != DDI_SUCCESS) {
            mutex_exit(&cq->cq_lock);
            TAVOR_WARNING(state, "failed in CQ memory "
                "devmap_devmem_remap()");
            TAVOR_TNF_EXIT(tavor_cq_free);
            return (ibc_get_ci_failure(0));
        }
        cq->cq_umap_dhp = (devmap_cookie_t)NULL;
    }

    /*
     * Drop the CQ lock now. The only thing left to do is to free up
     * the old resources.
     */
    mutex_exit(&cq->cq_lock);

    /*
     * Deregister the memory for the old Completion Queue. Note: We
     * really can't return error here because we have no good way to
     * cleanup. Plus, the deregistration really shouldn't ever fail.
     * So, if it does, it is an indication that something has gone
     * seriously wrong. So we print a warning message and return error
     * (knowing, of course, that the "old" CQ memory will be leaked)
     */
    status = tavor_mr_deregister(state, &mr_old, TAVOR_MR_DEREG_ALL,
        sleepflag);
    if (status != DDI_SUCCESS) {
        TAVOR_WARNING(state, "failed to deregister old CQ memory");
        /* Set "status" and "errormsg" and goto failure */
        TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
            "failed deregister mr (old)");
        goto cqresize_fail;
    }

    /* Free the memory for the old CQ */
    tavor_queue_free(state, &old_cqinfo);

    /*
     * Fill in the return arguments (if necessary). This includes the
     * real new completion queue size.
     */
    if (actual_size != NULL) {
        *actual_size = (1 << log_cq_size) - 1;
    }

    TAVOR_TNF_EXIT(tavor_cq_resize);
    return (DDI_SUCCESS);

cqresize_fail:
    TNF_PROBE_1(tavor_cq_resize_fail, TAVOR_TNF_ERROR, "",
        tnf_string, msg, errormsg);
    TAVOR_TNF_EXIT(tavor_cq_resize);
    return (status);
}


/*
 * tavor_cq_notify()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_notify(tavor_state_t *state, tavor_cqhdl_t cq,
    ibt_cq_notify_flags_t flags)
{
    uint_t cqnum;

    TAVOR_TNF_ENTER(tavor_cq_notify);

    /*
     * Determine if we are trying to get the next completion or the next
     * "solicited" completion. Then hit the appropriate doorbell.
     *
     * NOTE: Please see the comment in tavor_event.c:tavor_eq_poll
     * regarding why we do not have to do an extra PIO read here, and we
     * will not lose an event after writing this doorbell.
     */
    cqnum = cq->cq_cqnum;
    if (flags == IBT_NEXT_COMPLETION) {
        tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ, cqnum,
            TAVOR_CQDB_DEFAULT_PARAM);

    } else if (flags == IBT_NEXT_SOLICITED) {
        tavor_cq_doorbell(state, TAVOR_CQDB_NOTIFY_CQ_SOLICIT,
            cqnum, TAVOR_CQDB_DEFAULT_PARAM);

    } else {
        TNF_PROBE_1(tavor_cq_notify_invflags_fail, TAVOR_TNF_ERROR, "",
            tnf_int, flags, flags);
        TAVOR_TNF_EXIT(tavor_cq_notify);
        return (IBT_CQ_NOTIFY_TYPE_INVALID);
    }

    TAVOR_TNF_EXIT(tavor_cq_notify);
    return (DDI_SUCCESS);
}

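/*
 * Worked example (illustrative only) of the consumer index doorbell
 * arithmetic used by tavor_cq_poll() below, where the doorbell value
 * indicates the number of entries consumed minus one: with cq_bufsz = 8,
 * an old cq_consindx of 6 and a new consumer index of 2, the wrap-around
 * case computes ((2 + 8) - 6) - 1 = 3, i.e. four CQEs were consumed.
 */
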
/*
 * tavor_cq_poll()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_poll(tavor_state_t *state, tavor_cqhdl_t cq, ibt_wc_t *wc_p,
    uint_t num_wc, uint_t *num_polled)
{
    tavor_hw_cqe_t *cqe;
    uint32_t cons_indx, wrap_around_mask;
    uint32_t polled_cnt, num_to_increment;
    int status;

    TAVOR_TNF_ENTER(tavor_cq_poll);

    /*
     * Check for user-mappable CQ memory. Note: We do not allow kernel
     * clients to poll CQ memory that is accessible directly by the user.
     * If the CQ memory is user accessible, then return an error.
     */
    if (cq->cq_is_umap) {
        TNF_PROBE_0(tavor_cq_poll_inv_usrmapped_type,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_cq_poll);
        return (IBT_CQ_HDL_INVALID);
    }

    mutex_enter(&cq->cq_lock);

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask. Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_bufsz - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_buf[cons_indx];

    /* Sync the current CQE to read */
    tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);

    /*
     * Keep pulling entries from the CQ until we find an entry owned by
     * the hardware. As long as the CQEs are owned by SW, process
     * each entry by calling tavor_cq_cqe_consume() and updating the CQ
     * consumer index. Note: We only update the consumer index if
     * tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB. Otherwise,
     * it indicates that we are going to "recycle" the CQE (probably
     * because it is an error CQE and corresponds to more than one
     * completion).
     */
    polled_cnt = 0;
    while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) {
        status = tavor_cq_cqe_consume(state, cq, cqe,
            &wc_p[polled_cnt++]);
        if (status == TAVOR_CQ_SYNC_AND_DB) {
            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cq, cqe);

            /* Sync the current CQE for device */
            tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORDEV);

            /* Increment the consumer index */
            cons_indx = (cons_indx + 1) & wrap_around_mask;

            /* Update the pointer to the next CQ entry */
            cqe = &cq->cq_buf[cons_indx];

            /* Sync the next CQE to read */
            tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU);
        }

        /*
         * If we have run out of space to store work completions,
         * then stop and return the ones we have pulled off the CQ.
         */
        if (polled_cnt >= num_wc) {
            break;
        }
    }

    /*
     * Now we only ring the doorbell (to update the consumer index) if
     * we've actually consumed a CQ entry. If we have, for example,
     * pulled from a CQE that we are still in the process of "recycling"
     * for error purposes, then we would not update the consumer index.
     */
    if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
        /*
         * Post doorbell to update the consumer index. Doorbell
         * value indicates number of entries consumed (minus 1)
         */
        if (cons_indx > cq->cq_consindx) {
            num_to_increment = (cons_indx - cq->cq_consindx) - 1;
        } else {
            num_to_increment = ((cons_indx + cq->cq_bufsz) -
                cq->cq_consindx) - 1;
        }
        cq->cq_consindx = cons_indx;
        tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX,
            cq->cq_cqnum, num_to_increment);

    } else if (polled_cnt == 0) {
        /*
         * If the CQ is empty, we can try to free up some of the WRID
         * list containers. See tavor_wr.c for more details on this
         * operation.
         */
        tavor_wrid_cq_reap(cq);
    }

    mutex_exit(&cq->cq_lock);

    /* Set "num_polled" (if necessary) */
    if (num_polled != NULL) {
        *num_polled = polled_cnt;
    }

    /* Set CQ_EMPTY condition if needed, otherwise return success */
    if (polled_cnt == 0) {
        status = IBT_CQ_EMPTY;
    } else {
        status = DDI_SUCCESS;
    }

    /*
     * Check if the system is currently panicking. If it is, then call
     * the Tavor interrupt service routine. This step is necessary here
     * because we might be in a polled I/O mode and without the call to
     * tavor_isr() - and its subsequent calls to poll and rearm each
     * event queue - we might overflow our EQs and render the system
     * unable to sync/dump.
     */
    if (ddi_in_panic() != 0) {
        (void) tavor_isr((caddr_t)state, (caddr_t)NULL);
    }

    TAVOR_TNF_EXIT(tavor_cq_poll);
    return (status);
}


/*
 * tavor_cq_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
    tavor_cqhdl_t cq;
    uint_t cqnum;
    uint_t eqe_evttype;

    TAVOR_TNF_ENTER(tavor_cq_handler);

    eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

    ASSERT(eqe_evttype == TAVOR_EVT_COMPLETION ||
        eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

    if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
        TNF_PROBE_0(tavor_cq_handler_eq_overflow_condition,
            TAVOR_TNF_ERROR, "");
        tavor_eq_overflow_handler(state, eq, eqe);

        TAVOR_TNF_EXIT(tavor_cq_handler);
        return (DDI_FAILURE);
    }


    /* Get the CQ handle from CQ number in event descriptor */
    cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
    cq = tavor_cqhdl_from_cqnum(state, cqnum);

    /*
     * Post the EQ doorbell to move the CQ to the "disarmed" state.
     * This operation is to enable subsequent CQ doorbells (e.g. those
     * that can be rung by tavor_cq_notify() above) to rearm the CQ.
     */
    tavor_eq_doorbell(state, TAVOR_EQDB_DISARM_CQ, eq->eq_eqnum, cqnum);

    /*
     * If the CQ handle is NULL, this is probably an indication
     * that the CQ has been freed already. In which case, we
     * should not deliver this event.
     *
     * We also check that the CQ number in the handle is the
     * same as the CQ number in the event queue entry. This
     * extra check allows us to handle the case where a CQ was
     * freed and then allocated again in the time it took to
     * handle the event queue processing. By constantly incrementing
     * the non-constrained portion of the CQ number every time
     * a new CQ is allocated, we mitigate (somewhat) the chance
     * that a stale event could be passed to the client's CQ
     * handler.
     *
     * Lastly, we check if "ts_ibtfpriv" is NULL. If it is then it
     * means that we have either received this event before we
     * finished attaching to the IBTF or we've received it while we
     * are in the process of detaching.
     */
    if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
        (state->ts_ibtfpriv != NULL)) {
        TAVOR_DO_IBTF_CQ_CALLB(state, cq);
    } else {
        TNF_PROBE_2(tavor_cq_handler_dropped_event,
            TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
            tnf_uint, hdl_cqnum, cqnum);
    }

    TAVOR_TNF_EXIT(tavor_cq_handler);
    return (DDI_SUCCESS);
}


/*
 * tavor_cq_err_handler()
 *    Context: Only called from interrupt context
 */
int
tavor_cq_err_handler(tavor_state_t *state, tavor_eqhdl_t eq,
    tavor_hw_eqe_t *eqe)
{
    tavor_cqhdl_t cq;
    uint_t cqnum;
    ibc_async_event_t event;
    ibt_async_code_t type;
    uint_t eqe_evttype;

    TAVOR_TNF_ENTER(tavor_cq_err_handler);

    eqe_evttype = TAVOR_EQE_EVTTYPE_GET(eq, eqe);

    ASSERT(eqe_evttype == TAVOR_EVT_CQ_ERRORS ||
        eqe_evttype == TAVOR_EVT_EQ_OVERFLOW);

    if (eqe_evttype == TAVOR_EVT_EQ_OVERFLOW) {
        TNF_PROBE_0(tavor_cq_err_handler_eq_overflow_condition,
            TAVOR_TNF_ERROR, "");
        tavor_eq_overflow_handler(state, eq, eqe);

        TAVOR_TNF_EXIT(tavor_cq_err_handler);
        return (DDI_FAILURE);
    }

    /* cmn_err(CE_CONT, "CQ Error handler\n"); */

    /* Get the CQ handle from CQ number in event descriptor */
    cqnum = TAVOR_EQE_CQNUM_GET(eq, eqe);
    cq = tavor_cqhdl_from_cqnum(state, cqnum);

    /*
     * If the CQ handle is NULL, this is probably an indication
     * that the CQ has been freed already. In which case, we
     * should not deliver this event.
     *
     * We also check that the CQ number in the handle is the
     * same as the CQ number in the event queue entry. This
     * extra check allows us to handle the case where a CQ was
     * freed and then allocated again in the time it took to
     * handle the event queue processing. By constantly incrementing
     * the non-constrained portion of the CQ number every time
     * a new CQ is allocated, we mitigate (somewhat) the chance
     * that a stale event could be passed to the client's CQ
     * handler.
     *
     * And then we check if "ts_ibtfpriv" is NULL. If it is then it
     * means that we have either received this event before we
     * finished attaching to the IBTF or we've received it while we
     * are in the process of detaching.
     */
    if ((cq != NULL) && (cq->cq_cqnum == cqnum) &&
        (state->ts_ibtfpriv != NULL)) {
        event.ev_cq_hdl = (ibt_cq_hdl_t)cq->cq_hdlrarg;
        type = IBT_ERROR_CQ;

        TAVOR_DO_IBTF_ASYNC_CALLB(state, type, &event);
    } else {
        TNF_PROBE_2(tavor_cq_err_handler_dropped_event,
            TAVOR_TNF_ERROR, "", tnf_uint, ev_cqnum, cqnum,
            tnf_uint, hdl_cqnum, cqnum);
    }

    TAVOR_TNF_EXIT(tavor_cq_err_handler);
    return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_cq_refcnt_inc(tavor_cqhdl_t cq, uint_t is_special)
{
    /*
     * Increment the completion queue's reference count. Note: In order
     * to ensure compliance with IBA C11-15, we must ensure that a given
     * CQ is not used for both special (SMI/GSI) QP and non-special QP.
     * This is accomplished here by keeping track of how the referenced
     * CQ is being used.
     */
    mutex_enter(&cq->cq_lock);
    TNF_PROBE_1_DEBUG(tavor_cq_refcnt_inc, TAVOR_TNF_TRACE, "",
        tnf_uint, refcnt, cq->cq_refcnt);
    if (cq->cq_refcnt == 0) {
        cq->cq_is_special = is_special;
    } else {
        if (cq->cq_is_special != is_special) {
            mutex_exit(&cq->cq_lock);
            return (DDI_FAILURE);
        }
    }
    cq->cq_refcnt++;
    mutex_exit(&cq->cq_lock);
    return (DDI_SUCCESS);
}


/*
 * tavor_cq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_cq_refcnt_dec(tavor_cqhdl_t cq)
{
    /* Decrement the completion queue's reference count */
    mutex_enter(&cq->cq_lock);
    cq->cq_refcnt--;
    TNF_PROBE_1_DEBUG(tavor_cq_refcnt_dec, TAVOR_TNF_TRACE, "",
        tnf_uint, refcnt, cq->cq_refcnt);
    mutex_exit(&cq->cq_lock);
}


/*
 * tavor_cq_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_doorbell(tavor_state_t *state, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
    uint64_t doorbell = 0;

    /* Build the doorbell from the parameters */
    doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
        ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

    TNF_PROBE_1_DEBUG(tavor_cq_doorbell, TAVOR_TNF_TRACE, "",
        tnf_ulong, doorbell, doorbell);

    /* Write the doorbell to UAR */
    TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->cq,
        doorbell);
}


/*
 * tavor_cqhdl_from_cqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the CQ number is critical to the detection of a
 *    potential race condition in the CQ handler code (i.e. the case
 *    where a CQ is freed and alloc'd again before an event for the
 *    "old" CQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new CQ owner. Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported CQs grows. For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_cqhdl_t
tavor_cqhdl_from_cqnum(tavor_state_t *state, uint_t cqnum)
{
    uint_t cqindx, cqmask;

    /* Calculate the CQ table index from the cqnum */
    cqmask = (1 << state->ts_cfg_profile->cp_log_num_cq) - 1;
    cqindx = cqnum & cqmask;
    return (state->ts_cqhdl[cqindx]);
}


/*
 * tavor_cq_cqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_cqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
    uint_t flags, type, opcode, qpnum, qp1_indx;
    int status;

    TAVOR_TNF_ENTER(tavor_cq_cqe_consume);

    /*
     * Determine if this is an "error" CQE by examining "opcode". If it
     * is an error CQE, then call tavor_cq_errcqe_consume() and return
     * whatever status it returns. Otherwise, this is a successful
     * completion.
     */
    opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
    if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
        (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
        status = tavor_cq_errcqe_consume(state, cq, cqe, wc);
        TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
        return (status);
    }

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = tavor_wrid_get_entry(cq, cqe, NULL);

    /*
     * Parse the CQE opcode to determine completion type. This will set
     * not only the type of the completion, but also any flags that might
     * be associated with it (e.g. whether immediate data is present).
     */
    flags = IBT_WC_NO_FLAGS;
    if (TAVOR_CQE_SENDRECV_GET(cq, cqe) != TAVOR_COMPLETION_RECV) {

        /* Send CQE */
        switch (opcode) {
        case TAVOR_CQE_SND_RDMAWR_IMM:
            flags |= IBT_WC_IMMED_DATA_PRESENT;
            /* FALLTHROUGH */
        case TAVOR_CQE_SND_RDMAWR:
            type = IBT_WRC_RDMAW;
            break;

        case TAVOR_CQE_SND_SEND_IMM:
            flags |= IBT_WC_IMMED_DATA_PRESENT;
            /* FALLTHROUGH */
        case TAVOR_CQE_SND_SEND:
            type = IBT_WRC_SEND;
            break;

        case TAVOR_CQE_SND_RDMARD:
            type = IBT_WRC_RDMAR;
            break;

        case TAVOR_CQE_SND_ATOMIC_CS:
            type = IBT_WRC_CSWAP;
            break;

        case TAVOR_CQE_SND_ATOMIC_FA:
            type = IBT_WRC_FADD;
            break;

        case TAVOR_CQE_SND_BIND_MW:
            type = IBT_WRC_BIND;
            break;

        default:
            TAVOR_WARNING(state, "unknown send CQE type");
            wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
            TNF_PROBE_1(tavor_cq_cqe_consume_unknown_send_type,
                TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
            TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    } else {

        /* Receive CQE */
        switch (opcode & 0x1F) {
        case TAVOR_CQE_RCV_RECV_IMM:
            /* FALLTHROUGH */
        case TAVOR_CQE_RCV_RECV_IMM2:
            /*
             * Note: According to the Tavor PRM, all QP1 recv
             * completions look like the result of a Send with
             * Immediate. They are not, however (MADs are Send
             * Only), so we need to check the QP number and set
             * the flag only if it is non-QP1.
             */
            qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
            qp1_indx = state->ts_spec_qp1->tr_indx;
            if ((qpnum < qp1_indx) || (qpnum > qp1_indx + 1)) {
                flags |= IBT_WC_IMMED_DATA_PRESENT;
            }
            /* FALLTHROUGH */
        case TAVOR_CQE_RCV_RECV:
            /* FALLTHROUGH */
        case TAVOR_CQE_RCV_RECV2:
            type = IBT_WRC_RECV;
            break;

        case TAVOR_CQE_RCV_RDMAWR_IMM:
            /* FALLTHROUGH */
        case TAVOR_CQE_RCV_RDMAWR_IMM2:
            flags |= IBT_WC_IMMED_DATA_PRESENT;
            type = IBT_WRC_RECV_RDMAWI;
            break;

        default:
            TAVOR_WARNING(state, "unknown recv CQE type");
            wc->wc_status = IBT_WC_LOCAL_QP_OP_ERR;
            TNF_PROBE_1(tavor_cq_cqe_consume_unknown_rcv_type,
                TAVOR_TNF_ERROR, "", tnf_uint, opcode, opcode);
            TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    }
    wc->wc_type = type;

    /*
     * Check for GRH, update the flags, then fill in "wc_flags" field
     * in the work completion
     */
    if (TAVOR_CQE_GRH_GET(cq, cqe) != 0) {
        flags |= IBT_WC_GRH_PRESENT;
    }
    wc->wc_flags = flags;

    /* If we got here, completion status must be success */
    wc->wc_status = IBT_WC_SUCCESS;

    /*
     * Parse the remaining contents of the CQE into the work completion.
     * This means filling in SL, QP number, SLID, immediate data, etc.
     * Note: Not all of these fields are valid in a given completion.
     * Many of them depend on the actual type of completion. So we fill
     * in all of the fields and leave it up to the IBTF and consumer to
     * sort out which are valid based on their context.
     */
    wc->wc_sl = TAVOR_CQE_SL_GET(cq, cqe);
    wc->wc_immed_data = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
    wc->wc_qpn = TAVOR_CQE_DQPN_GET(cq, cqe);
    wc->wc_res_hash = 0;
    wc->wc_slid = TAVOR_CQE_DLID_GET(cq, cqe);
    wc->wc_ethertype = (wc->wc_immed_data & 0xFFFF);
    wc->wc_pkey_ix = (wc->wc_immed_data >> 16);

    /*
     * Depending on whether the completion was a receive or a send
     * completion, fill in "bytes transferred" as appropriate. Also,
     * if necessary, fill in the "path bits" field.
     */
    if (TAVOR_CQE_SENDRECV_GET(cq, cqe) == TAVOR_COMPLETION_RECV) {
        wc->wc_path_bits = TAVOR_CQE_PATHBITS_GET(cq, cqe);
        wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);

    } else if ((wc->wc_type == IBT_WRC_RDMAR) ||
        (wc->wc_type == IBT_WRC_CSWAP) || (wc->wc_type == IBT_WRC_FADD)) {
        wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cq, cqe);
    }

    TAVOR_TNF_EXIT(tavor_cq_cqe_consume);
    return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * tavor_cq_errcqe_consume()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_cq_errcqe_consume(tavor_state_t *state, tavor_cqhdl_t cq,
    tavor_hw_cqe_t *cqe, ibt_wc_t *wc)
{
    uint64_t next_wqeaddr;
    uint32_t imm_eth_pkey_cred;
    uint_t nextwqesize, dbd;
    uint_t doorbell_cnt, status;
    tavor_wrid_entry_t wre;

    TAVOR_TNF_ENTER(tavor_cq_errcqe_consume);

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = tavor_wrid_get_entry(cq, cqe, &wre);

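    /*
     * Note: in addition to the work request ID, the "wre" filled in by
     * the call above carries the signaled/doorbelled state and the next
     * WQE address/size. These are used below to decide whether this
     * error CQE represents multiple completions and must be recycled.
     */
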
    /*
     * Parse the CQE opcode to determine completion type. We know that
     * the CQE is an error completion, so we extract only the completion
     * status here.
     */
    imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cq, cqe);
    status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
    switch (status) {
    case TAVOR_CQE_LOC_LEN_ERR:
        status = IBT_WC_LOCAL_LEN_ERR;
        break;

    case TAVOR_CQE_LOC_OP_ERR:
        status = IBT_WC_LOCAL_QP_OP_ERR;
        break;

    case TAVOR_CQE_LOC_PROT_ERR:
        status = IBT_WC_LOCAL_PROTECT_ERR;
        break;

    case TAVOR_CQE_WR_FLUSHED_ERR:
        status = IBT_WC_WR_FLUSHED_ERR;
        break;

    case TAVOR_CQE_MW_BIND_ERR:
        status = IBT_WC_MEM_WIN_BIND_ERR;
        break;

    case TAVOR_CQE_BAD_RESPONSE_ERR:
        status = IBT_WC_BAD_RESPONSE_ERR;
        break;

    case TAVOR_CQE_LOCAL_ACCESS_ERR:
        status = IBT_WC_LOCAL_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_INV_REQ_ERR:
        status = IBT_WC_REMOTE_INVALID_REQ_ERR;
        break;

    case TAVOR_CQE_REM_ACC_ERR:
        status = IBT_WC_REMOTE_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_OP_ERR:
        status = IBT_WC_REMOTE_OP_ERR;
        break;

    case TAVOR_CQE_TRANS_TO_ERR:
        status = IBT_WC_TRANS_TIMEOUT_ERR;
        break;

    case TAVOR_CQE_RNRNAK_TO_ERR:
        status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
        break;

    /*
     * The following error codes are not supported in the Tavor driver
     * as they relate only to Reliable Datagram completion statuses:
     *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
     *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
     *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
     *    case TAVOR_CQE_INV_EEC_NUM_ERR:
     *    case TAVOR_CQE_INV_EEC_STATE_ERR:
     *    case TAVOR_CQE_LOC_EEC_ERR:
     */

    default:
        TAVOR_WARNING(state, "unknown error CQE status");
        status = IBT_WC_LOCAL_QP_OP_ERR;
        TNF_PROBE_1(tavor_cq_errcqe_consume_unknown_status,
            TAVOR_TNF_ERROR, "", tnf_uint, status, status);
        break;
    }
    wc->wc_status = status;

    /*
     * Now we do all the checking that's necessary to handle completion
     * queue entry "recycling"
     *
     * It is not necessary here to try to sync the WQE as we are only
     * attempting to read from the Work Queue (and hardware does not
     * write to it).
     */

    /*
     * We can get doorbell info, WQE address, size for the next WQE
     * from the "wre" (which was filled in above in the call to the
     * tavor_wrid_get_entry() routine)
     */
    dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
    next_wqeaddr = wre.wr_wqeaddrsz;
    nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

    /*
     * Get the doorbell count from the CQE. This indicates how many
     * completions this one CQE represents.
     */
    doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

    /*
     * Determine if we're ready to consume this CQE yet or not. If the
     * next WQE has size zero (i.e. no next WQE) or if the doorbell count
     * is down to zero, then this is the last/only completion represented
     * by the current CQE (return TAVOR_CQ_SYNC_AND_DB). Otherwise, the
     * current CQE needs to be recycled (see below).
     */
    if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
        /*
         * Consume the CQE
         *    Return status to indicate that doorbell and sync may be
         *    necessary.
         */
        TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
        return (TAVOR_CQ_SYNC_AND_DB);

    } else {
        /*
         * Recycle the CQE for use in the next PollCQ() call
         *    Decrement the doorbell count, modify the error status,
         *    and update the WQE address and size (to point to the
         *    next WQE on the chain). Put these updated entries back
         *    into the CQE.
         *    Despite the fact that we have updated the CQE, it is not
         *    necessary for us to attempt to sync this entry just yet
         *    as we have not changed the "hardware's view" of the
         *    entry (i.e. we have not modified the "owner" bit - which
         *    is all that the Tavor hardware really cares about).
         */
        doorbell_cnt = doorbell_cnt - dbd;
        TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cq, cqe,
            ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
            (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
        TAVOR_CQE_WQEADDRSZ_SET(cq, cqe,
            TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));

        TAVOR_TNF_EXIT(tavor_cq_errcqe_consume);
        return (TAVOR_CQ_RECYCLE_ENTRY);
    }
}


/*
 * tavor_cqe_sync()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cqe_sync(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, uint_t flag)
{
    ddi_dma_handle_t dmahdl;
    off_t offset;
    int status;

    TAVOR_TNF_ENTER(tavor_cqe_sync);

    /* Determine if CQ needs to be synced or not */
    if (cq->cq_sync == 0) {
        TAVOR_TNF_EXIT(tavor_cqe_sync);
        return;
    }

    /* Get the DMA handle from CQ context */
    dmahdl = cq->cq_mrhdl->mr_bindinfo.bi_dmahdl;

    /* Calculate offset of next CQE */
    offset = (off_t)((uintptr_t)cqe - (uintptr_t)&cq->cq_buf[0]);
    status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_cqe_t), flag);
    if (status != DDI_SUCCESS) {
        TNF_PROBE_0(tavor_cqe_sync_getnextentry_fail,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_cqe_sync);
        return;
    }

    TAVOR_TNF_EXIT(tavor_cqe_sync);
}


/*
 * tavor_cq_resize_helper()
 *    Context: Can be called only from user or kernel context.
 */
static void
tavor_cq_resize_helper(tavor_cqhdl_t cq, tavor_hw_cqe_t *new_cqbuf,
    uint32_t old_cons_indx, uint32_t num_newcqe)
{
    tavor_hw_cqe_t *old_cqe, *new_cqe;
    uint32_t new_cons_indx, wrap_around_mask;
    int i;

    TAVOR_TNF_ENTER(tavor_cq_resize_helper);

    ASSERT(MUTEX_HELD(&cq->cq_lock));

    /* Get the consumer index */
    new_cons_indx = 0;

    /*
     * Calculate the wrap around mask. Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_bufsz - 1);

    /*
     * Calculate the pointers to the first CQ entry (in the "old" CQ)
     * and the first CQ entry in the "new" CQ
     */
    old_cqe = &cq->cq_buf[old_cons_indx];
    new_cqe = &new_cqbuf[new_cons_indx];

    /* Sync entire "old" CQ for use by software (if necessary). */
    if (cq->cq_sync) {
        (void) ddi_dma_sync(cq->cq_mrhdl->mr_bindinfo.bi_dmahdl,
            0, cq->cq_cqinfo.qa_size, DDI_DMA_SYNC_FORCPU);
    }

    /*
     * Keep pulling entries from the "old" CQ until we find an entry owned
     * by the hardware. Process each entry by copying it into the "new"
     * CQ and updating respective indices and pointers in the "old" CQ.
1709 */ 1710 for (i = 0; i < num_newcqe; i++) { 1711 1712 /* Copy this old CQE into the "new_cqe" pointer */ 1713 bcopy(old_cqe, new_cqe, sizeof (tavor_hw_cqe_t)); 1714 1715 /* Increment the consumer index (for both CQs) */ 1716 old_cons_indx = (old_cons_indx + 1) & wrap_around_mask; 1717 new_cons_indx = (new_cons_indx + 1); 1718 1719 /* Update the pointer to the next CQ entry */ 1720 old_cqe = &cq->cq_buf[old_cons_indx]; 1721 new_cqe = &new_cqbuf[new_cons_indx]; 1722 } 1723 1724 TAVOR_TNF_EXIT(tavor_cq_resize_helper); 1725 } 1726 1727 /* 1728 * tavor_cq_srq_entries_flush() 1729 * Context: Can be called from interrupt or base context. 1730 */ 1731 void 1732 tavor_cq_srq_entries_flush(tavor_state_t *state, tavor_qphdl_t qp) 1733 { 1734 tavor_cqhdl_t cq; 1735 tavor_workq_hdr_t *wqhdr; 1736 tavor_hw_cqe_t *cqe; 1737 tavor_hw_cqe_t *next_cqe; 1738 uint32_t cons_indx, tail_cons_indx, wrap_around_mask; 1739 uint32_t new_indx, check_indx, indx; 1740 uint32_t num_to_increment; 1741 int cqe_qpnum, cqe_type; 1742 int outstanding_cqes, removed_cqes; 1743 int i; 1744 1745 ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); 1746 1747 cq = qp->qp_rq_cqhdl; 1748 wqhdr = qp->qp_rq_wqhdr; 1749 1750 ASSERT(wqhdr->wq_wrid_post != NULL); 1751 ASSERT(wqhdr->wq_wrid_post->wl_srq_en != 0); 1752 1753 /* 1754 * Check for user-mapped CQ memory. Note: We do not allow kernel 1755 * clients to modify any user-mapped CQ. If the CQ is 1756 * user-mapped, then we simply return here, and this "flush" function 1757 * becomes a NO-OP in this case. 1758 */ 1759 if (cq->cq_is_umap) { 1760 return; 1761 } 1762 1763 /* Get the consumer index */ 1764 cons_indx = cq->cq_consindx; 1765 1766 /* 1767 * Calculate the wrap around mask. Note: This operation only works 1768 * because all Tavor completion queues have power-of-2 sizes 1769 */ 1770 wrap_around_mask = (cq->cq_bufsz - 1); 1771 1772 /* Calculate the pointer to the first CQ entry */ 1773 cqe = &cq->cq_buf[cons_indx]; 1774 1775 /* Sync the current CQE to read */ 1776 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU); 1777 1778 /* 1779 * Loop through the CQ looking for entries owned by software. If an 1780 * entry is owned by software, we increment an 'outstanding_cqes' 1781 * count to know how many entries in total are on our CQ. We use this 1782 * value further down to know how many entries to loop through looking 1783 * for our same QP number. 1784 */ 1785 outstanding_cqes = 0; 1786 tail_cons_indx = cons_indx; 1787 while (TAVOR_CQE_OWNER_IS_SW(cq, cqe)) { 1788 /* increment total cqes count */ 1789 outstanding_cqes++; 1790 1791 /* increment the consumer index */ 1792 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask; 1793 1794 /* update the pointer to the next cq entry */ 1795 cqe = &cq->cq_buf[tail_cons_indx]; 1796 1797 /* sync the next cqe to read */ 1798 tavor_cqe_sync(cq, cqe, DDI_DMA_SYNC_FORCPU); 1799 } 1800 1801 /* 1802 * Using the 'tail_cons_indx' that was just set, we now know how many 1803 * CQEs in total are outstanding.
Set the 'check_indx' and the 1804 * 'new_indx' to the last entry identified by 'tail_cons_indx' 1805 */ 1806 check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask; 1807 1808 for (i = 0; i < outstanding_cqes; i++) { 1809 cqe = &cq->cq_buf[check_indx]; 1810 1811 /* Grab QP number from CQE */ 1812 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe); 1813 cqe_type = TAVOR_CQE_SENDRECV_GET(cq, cqe); 1814 1815 /* 1816 * If the QP number is the same in the CQE as the QP that we 1817 * have on this SRQ, then we must free up the entry off the 1818 * SRQ. We also make sure that the completion type is of the 1819 * 'TAVOR_COMPLETION_RECV' type. So any send completions on 1820 * this CQ will be left as-is. The handling of returning 1821 * entries back to HW ownership happens further down. 1822 */ 1823 if (cqe_qpnum == qp->qp_qpnum && 1824 cqe_type == TAVOR_COMPLETION_RECV) { 1825 1826 /* Add back to SRQ free list */ 1827 (void) tavor_wrid_find_match_srq(wqhdr->wq_wrid_post, 1828 cq, cqe); 1829 } else { 1830 /* Do Copy */ 1831 if (check_indx != new_indx) { 1832 next_cqe = &cq->cq_buf[new_indx]; 1833 1834 /* 1835 * Copy the CQE into the "next_cqe" 1836 * pointer. 1837 */ 1838 bcopy(cqe, next_cqe, sizeof (tavor_hw_cqe_t)); 1839 } 1840 new_indx = (new_indx - 1) & wrap_around_mask; 1841 } 1842 /* Move index to next CQE to check */ 1843 check_indx = (check_indx - 1) & wrap_around_mask; 1844 } 1845 1846 /* Initialize removed cqes count */ 1847 removed_cqes = 0; 1848 1849 /* If an entry was removed */ 1850 if (check_indx != new_indx) { 1851 1852 /* 1853 * Set current pointer back to the beginning consumer index. 1854 * At this point, all unclaimed entries have been copied to the 1855 * index specified by 'new_indx'. This 'new_indx' will be used 1856 * as the new consumer index after we mark all freed entries as 1857 * having HW ownership. We do that here. 1858 */ 1859 1860 /* Loop through all entries until we reach our new pointer */ 1861 for (indx = cons_indx; indx <= new_indx; 1862 indx = (indx + 1) & wrap_around_mask) { 1863 removed_cqes++; 1864 cqe = &cq->cq_buf[indx]; 1865 1866 /* Reset entry to hardware ownership */ 1867 TAVOR_CQE_OWNER_SET_HW(cq, cqe); 1868 } 1869 } 1870 1871 /* 1872 * Update consumer index to be the 'new_indx'. This moves it past all 1873 * removed entries. Because 'new_indx' is pointing to the last 1874 * previously valid SW owned entry, we add 1 to point the cons_indx to 1875 * the first HW owned entry. 1876 */ 1877 cons_indx = (new_indx + 1) & wrap_around_mask; 1878 1879 /* 1880 * Now we only ring the doorbell (to update the consumer index) if 1881 * we've actually consumed a CQ entry. If we found no QP number 1882 * matches above, then we would not have removed anything. So only if 1883 * something was removed do we ring the doorbell. 1884 */ 1885 if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) { 1886 /* 1887 * Post doorbell to update the consumer index. Doorbell 1888 * value indicates number of entries consumed (minus 1) 1889 */ 1890 if (cons_indx > cq->cq_consindx) { 1891 num_to_increment = (cons_indx - cq->cq_consindx) - 1; 1892 } else { 1893 num_to_increment = ((cons_indx + cq->cq_bufsz) - 1894 cq->cq_consindx) - 1; 1895 } 1896 cq->cq_consindx = cons_indx; 1897 1898 tavor_cq_doorbell(state, TAVOR_CQDB_INCR_CONSINDX, 1899 cq->cq_cqnum, num_to_increment); 1900 } 1901 } 1902
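/*
 * Illustrative sketch (not part of the driver source): the doorbell rung at
 * the end of tavor_cq_srq_entries_flush() above carries the number of CQ
 * entries consumed minus one, computed with wrap-around over the power-of-2
 * CQ buffer size.  The standalone helper below restates that arithmetic for
 * clarity; the name cq_doorbell_delta() and the TAVOR_CQ_ILLUSTRATION guard
 * are hypothetical and appear nowhere else in this driver.  For example,
 * with a buffer size of 8, an old consumer index of 6, and a new consumer
 * index of 1, three entries (6, 7, and 0) were consumed and the doorbell
 * value is 2.
 */
#ifdef	TAVOR_CQ_ILLUSTRATION
static uint32_t
cq_doorbell_delta(uint32_t old_consindx, uint32_t new_consindx,
    uint32_t cq_bufsz)
{
	/*
	 * If the new consumer index has not wrapped past the end of the
	 * buffer, the number of consumed entries is a simple difference;
	 * otherwise, add the buffer size before subtracting.  Either way,
	 * the doorbell parameter is "entries consumed minus one".
	 */
	if (new_consindx > old_consindx)
		return ((new_consindx - old_consindx) - 1);
	else
		return (((new_consindx + cq_bufsz) - old_consindx) - 1);
}
#endif	/* TAVOR_CQ_ILLUSTRATION */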