1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * tavor_qp.c 29 * Tavor Queue Pair Processing Routines 30 * 31 * Implements all the routines necessary for allocating, freeing, and 32 * querying the Tavor queue pairs. 33 */ 34 35 #include <sys/types.h> 36 #include <sys/conf.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/modctl.h> 40 #include <sys/bitmap.h> 41 #include <sys/sysmacros.h> 42 43 #include <sys/ib/adapters/tavor/tavor.h> 44 #include <sys/ib/ib_pkt_hdrs.h> 45 46 static int tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp, 47 tavor_rsrc_t *qpc); 48 static int tavor_qpn_avl_compare(const void *q, const void *e); 49 static int tavor_special_qp_rsrc_alloc(tavor_state_t *state, 50 ibt_sqp_type_t type, uint_t port, tavor_rsrc_t **qp_rsrc); 51 static int tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type, 52 uint_t port); 53 static void tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl, 54 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl); 55 56 /* 57 * tavor_qp_alloc() 58 * Context: Can be called only from user or kernel context. 59 */ 60 int 61 tavor_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo, 62 uint_t sleepflag, tavor_qp_options_t *op) 63 { 64 tavor_rsrc_pool_info_t *rsrc_pool; 65 tavor_rsrc_t *qpc, *rsrc, *rdb; 66 tavor_umap_db_entry_t *umapdb; 67 tavor_qphdl_t qp; 68 ibt_qp_alloc_attr_t *attr_p; 69 ibt_qp_type_t type; 70 ibtl_qp_hdl_t ibt_qphdl; 71 ibt_chan_sizes_t *queuesz_p; 72 ib_qpn_t *qpn; 73 tavor_qphdl_t *qphdl; 74 ibt_mr_attr_t mr_attr; 75 tavor_mr_options_t mr_op; 76 tavor_srqhdl_t srq; 77 tavor_pdhdl_t pd; 78 tavor_cqhdl_t sq_cq, rq_cq; 79 tavor_mrhdl_t mr; 80 uint64_t value, qp_desc_off; 81 uint32_t *sq_buf, *rq_buf; 82 uint32_t log_qp_sq_size, log_qp_rq_size; 83 uint32_t sq_size, rq_size; 84 uint32_t sq_wqe_size, rq_wqe_size; 85 uint32_t max_rdb, max_sgl, uarpg; 86 uint_t wq_location, dma_xfer_mode, qp_is_umap; 87 uint_t qp_srq_en; 88 int status, flag; 89 char *errormsg; 90 91 TAVOR_TNF_ENTER(tavor_qp_alloc); 92 93 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p, *queuesz_p)) 94 95 /* 96 * Check the "options" flag. Currently this flag tells the driver 97 * whether or not the QP's work queues should be come from normal 98 * system memory or whether they should be allocated from DDR memory. 99 */ 100 if (op == NULL) { 101 wq_location = TAVOR_QUEUE_LOCATION_NORMAL; 102 } else { 103 wq_location = op->qpo_wq_loc; 104 } 105 106 /* 107 * Extract the necessary info from the tavor_qp_info_t structure 108 */ 109 attr_p = qpinfo->qpi_attrp; 110 type = qpinfo->qpi_type; 111 ibt_qphdl = qpinfo->qpi_ibt_qphdl; 112 queuesz_p = qpinfo->qpi_queueszp; 113 qpn = qpinfo->qpi_qpn; 114 qphdl = &qpinfo->qpi_qphdl; 115 116 /* 117 * Determine whether QP is being allocated for userland access or 118 * whether it is being allocated for kernel access. If the QP is 119 * being allocated for userland access, then lookup the UAR doorbell 120 * page number for the current process. Note: If this is not found 121 * (e.g. if the process has not previously open()'d the Tavor driver), 122 * then an error is returned. 123 */ 124 qp_is_umap = (attr_p->qp_alloc_flags & IBT_QP_USER_MAP) ? 1 : 0; 125 if (qp_is_umap) { 126 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(), 127 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL); 128 if (status != DDI_SUCCESS) { 129 /* Set "status" and "errormsg" and goto failure */ 130 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page"); 131 goto qpalloc_fail; 132 } 133 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx; 134 } 135 136 /* 137 * Determine whether QP is being associated with an SRQ 138 */ 139 qp_srq_en = (attr_p->qp_alloc_flags & IBT_QP_USES_SRQ) ? 1 : 0; 140 if (qp_srq_en) { 141 /* 142 * Check for valid SRQ handle pointers 143 */ 144 if (attr_p->qp_ibc_srq_hdl == NULL) { 145 /* Set "status" and "errormsg" and goto failure */ 146 TAVOR_TNF_FAIL(IBT_SRQ_HDL_INVALID, 147 "invalid SRQ handle"); 148 goto qpalloc_fail; 149 } 150 srq = (tavor_srqhdl_t)attr_p->qp_ibc_srq_hdl; 151 } 152 153 /* 154 * Check for valid QP service type (only UD/RC/UC supported) 155 */ 156 if (((type != IBT_UD_RQP) && (type != IBT_RC_RQP) && 157 (type != IBT_UC_RQP))) { 158 /* Set "status" and "errormsg" and goto failure */ 159 TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid serv type"); 160 goto qpalloc_fail; 161 } 162 163 /* 164 * Only RC is supported on an SRQ -- This is a Tavor hardware 165 * limitation. Arbel native mode will not have this shortcoming. 166 */ 167 if (qp_srq_en && type != IBT_RC_RQP) { 168 /* Set "status" and "errormsg" and goto failure */ 169 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid serv type with SRQ"); 170 goto qpalloc_fail; 171 } 172 173 /* 174 * Check for valid PD handle pointer 175 */ 176 if (attr_p->qp_pd_hdl == NULL) { 177 /* Set "status" and "errormsg" and goto failure */ 178 TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle"); 179 goto qpalloc_fail; 180 } 181 pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl; 182 183 /* 184 * If on an SRQ, check to make sure the PD is the same 185 */ 186 if (qp_srq_en && (pd->pd_pdnum != srq->srq_pdhdl->pd_pdnum)) { 187 /* Set "status" and "errormsg" and goto failure */ 188 TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle"); 189 goto qpalloc_fail; 190 } 191 192 /* Increment the reference count on the protection domain (PD) */ 193 tavor_pd_refcnt_inc(pd); 194 195 /* 196 * Check for valid CQ handle pointers 197 */ 198 if ((attr_p->qp_ibc_scq_hdl == NULL) || 199 (attr_p->qp_ibc_rcq_hdl == NULL)) { 200 /* Set "status" and "errormsg" and goto failure */ 201 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle"); 202 goto qpalloc_fail1; 203 } 204 sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl; 205 rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl; 206 207 /* 208 * Increment the reference count on the CQs. One or both of these 209 * could return error if we determine that the given CQ is already 210 * being used with a special (SMI/GSI) QP. 211 */ 212 status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_NORMAL); 213 if (status != DDI_SUCCESS) { 214 /* Set "status" and "errormsg" and goto failure */ 215 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle"); 216 goto qpalloc_fail1; 217 } 218 status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_NORMAL); 219 if (status != DDI_SUCCESS) { 220 /* Set "status" and "errormsg" and goto failure */ 221 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle"); 222 goto qpalloc_fail2; 223 } 224 225 /* 226 * Allocate an QP context entry. This will be filled in with all 227 * the necessary parameters to define the Queue Pair. Unlike 228 * other Tavor hardware resources, ownership is not immediately 229 * given to hardware in the final step here. Instead, we must 230 * wait until the QP is later transitioned to the "Init" state before 231 * passing the QP to hardware. If we fail here, we must undo all 232 * the reference count (CQ and PD). 233 */ 234 status = tavor_rsrc_alloc(state, TAVOR_QPC, 1, sleepflag, &qpc); 235 if (status != DDI_SUCCESS) { 236 /* Set "status" and "errormsg" and goto failure */ 237 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP context"); 238 goto qpalloc_fail3; 239 } 240 241 /* 242 * Allocate the software structure for tracking the queue pair 243 * (i.e. the Tavor Queue Pair handle). If we fail here, we must 244 * undo the reference counts and the previous resource allocation. 245 */ 246 status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc); 247 if (status != DDI_SUCCESS) { 248 /* Set "status" and "errormsg" and goto failure */ 249 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle"); 250 goto qpalloc_fail4; 251 } 252 qp = (tavor_qphdl_t)rsrc->tr_addr; 253 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp)) 254 255 /* 256 * Calculate the QP number from QPC index. This routine handles 257 * all of the operations necessary to keep track of used, unused, 258 * and released QP numbers. 259 */ 260 status = tavor_qp_create_qpn(state, qp, qpc); 261 if (status != DDI_SUCCESS) { 262 /* Set "status" and "errormsg" and goto failure */ 263 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QPN create"); 264 goto qpalloc_fail5; 265 } 266 267 /* 268 * If this will be a user-mappable QP, then allocate an entry for 269 * the "userland resources database". This will later be added to 270 * the database (after all further QP operations are successful). 271 * If we fail here, we must undo the reference counts and the 272 * previous resource allocation. 273 */ 274 if (qp_is_umap) { 275 umapdb = tavor_umap_db_alloc(state->ts_instance, qp->qp_qpnum, 276 MLNX_UMAP_QPMEM_RSRC, (uint64_t)(uintptr_t)rsrc); 277 if (umapdb == NULL) { 278 /* Set "status" and "errormsg" and goto failure */ 279 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add"); 280 goto qpalloc_fail6; 281 } 282 } 283 284 /* 285 * If this is an RC QP, then pre-allocate the maximum number of RDB 286 * entries. This allows us to ensure that we can later cover all 287 * the resources needed by hardware for handling multiple incoming 288 * RDMA Reads. Note: These resources are obviously not always 289 * necessary. They are allocated here anyway. Someday maybe this 290 * can be modified to allocate these on-the-fly (i.e. only if RDMA 291 * Read or Atomic operations are enabled) XXX 292 * If we fail here, we have a bunch of resource and reference count 293 * cleanup to do. 294 */ 295 if (type == IBT_RC_RQP) { 296 max_rdb = state->ts_cfg_profile->cp_hca_max_rdma_in_qp; 297 status = tavor_rsrc_alloc(state, TAVOR_RDB, max_rdb, 298 sleepflag, &rdb); 299 if (status != DDI_SUCCESS) { 300 /* Set "status" and "errormsg" and goto failure */ 301 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed RDB"); 302 goto qpalloc_fail7; 303 } 304 qp->qp_rdbrsrcp = rdb; 305 /* Calculate offset (into DDR memory) of RDB entries */ 306 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_RDB]; 307 qp->qp_rdb_ddraddr = (uintptr_t)rsrc_pool->rsrc_ddr_offset + 308 (rdb->tr_indx << TAVOR_RDB_SIZE_SHIFT); 309 } 310 311 /* 312 * Calculate the appropriate size for the work queues. 313 * Note: All Tavor QP work queues must be a power-of-2 in size. Also 314 * they may not be any smaller than TAVOR_QP_MIN_SIZE. This step is 315 * to round the requested size up to the next highest power-of-2 316 */ 317 attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE); 318 attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE); 319 log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq); 320 if (ISP2(attr_p->qp_sizes.cs_sq)) { 321 log_qp_sq_size = log_qp_sq_size - 1; 322 } 323 log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq); 324 if (ISP2(attr_p->qp_sizes.cs_rq)) { 325 log_qp_rq_size = log_qp_rq_size - 1; 326 } 327 328 /* 329 * Next we verify that the rounded-up size is valid (i.e. consistent 330 * with the device limits and/or software-configured limits). If not, 331 * then obviously we have a lot of cleanup to do before returning. 332 */ 333 if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) || 334 (!qp_srq_en && (log_qp_rq_size > 335 state->ts_cfg_profile->cp_log_max_qp_sz))) { 336 /* Set "status" and "errormsg" and goto failure */ 337 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size"); 338 goto qpalloc_fail8; 339 } 340 341 /* 342 * Next we verify that the requested number of SGL is valid (i.e. 343 * consistent with the device limits and/or software-configured 344 * limits). If not, then obviously the same cleanup needs to be done. 345 */ 346 max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl; 347 if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) || 348 (!qp_srq_en && (attr_p->qp_sizes.cs_rq_sgl > max_sgl))) { 349 /* Set "status" and "errormsg" and goto failure */ 350 TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL"); 351 goto qpalloc_fail8; 352 } 353 354 /* 355 * Determine this QP's WQE sizes (for both the Send and Recv WQEs). 356 * This will depend on the requested number of SGLs. Note: this 357 * has the side-effect of also calculating the real number of SGLs 358 * (for the calculated WQE size). 359 * 360 * For QP's on an SRQ, we set these to 0. 361 */ 362 if (qp_srq_en) { 363 qp->qp_rq_log_wqesz = 0; 364 qp->qp_rq_sgl = 0; 365 } else { 366 tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl, 367 TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz, 368 &qp->qp_rq_sgl); 369 } 370 tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl, 371 TAVOR_QP_WQ_TYPE_SENDQ, &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl); 372 373 /* 374 * Allocate the memory for QP work queues. Note: The location from 375 * which we will allocate these work queues has been passed in 376 * through the tavor_qp_options_t structure. Since Tavor work queues 377 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of 378 * the work queue memory is very important. We used to allocate 379 * work queues (the combined receive and send queues) so that they 380 * would be aligned on their combined size. That alignment guaranteed 381 * that they would never cross the 4GB boundary (Tavor work queues 382 * are on the order of MBs at maximum). Now we are able to relax 383 * this alignment constraint by ensuring that the IB address assigned 384 * to the queue memory (as a result of the tavor_mr_register() call) 385 * is offset from zero. 386 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to 387 * guarantee the alignment, but when attempting to use IOMMU bypass 388 * mode we found that we were not allowed to specify any alignment 389 * that was more restrictive than the system page size. 390 * So we avoided this constraint by passing two alignment values, 391 * one for the memory allocation itself and the other for the DMA 392 * handle (for later bind). This used to cause more memory than 393 * necessary to be allocated (in order to guarantee the more 394 * restrictive alignment contraint). But be guaranteeing the 395 * zero-based IB virtual address for the queue, we are able to 396 * conserve this memory. 397 * Note: If QP is not user-mappable, then it may come from either 398 * kernel system memory or from HCA-attached local DDR memory. 399 */ 400 sq_wqe_size = 1 << qp->qp_sq_log_wqesz; 401 sq_size = (1 << log_qp_sq_size) * sq_wqe_size; 402 403 /* QP on SRQ sets these to 0 */ 404 if (qp_srq_en) { 405 rq_wqe_size = 0; 406 rq_size = 0; 407 } else { 408 rq_wqe_size = 1 << qp->qp_rq_log_wqesz; 409 rq_size = (1 << log_qp_rq_size) * rq_wqe_size; 410 } 411 412 qp->qp_wqinfo.qa_size = sq_size + rq_size; 413 qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size); 414 qp->qp_wqinfo.qa_bind_align = max(sq_wqe_size, rq_wqe_size); 415 if (qp_is_umap) { 416 qp->qp_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND; 417 } else { 418 qp->qp_wqinfo.qa_location = wq_location; 419 } 420 status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag); 421 if (status != DDI_SUCCESS) { 422 /* Set "status" and "errormsg" and goto failure */ 423 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue"); 424 goto qpalloc_fail8; 425 } 426 if (sq_wqe_size > rq_wqe_size) { 427 sq_buf = qp->qp_wqinfo.qa_buf_aligned; 428 429 /* 430 * If QP's on an SRQ, we set the rq_buf to NULL 431 */ 432 if (qp_srq_en) 433 rq_buf = NULL; 434 else 435 rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size); 436 } else { 437 rq_buf = qp->qp_wqinfo.qa_buf_aligned; 438 sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size); 439 } 440 441 /* 442 * Register the memory for the QP work queues. The memory for the 443 * QP must be registered in the Tavor TPT tables. This gives us the 444 * LKey to specify in the QP context later. Note: The memory for 445 * Tavor work queues (both Send and Recv) must be contiguous and 446 * registered as a single memory region. Note also: If the work 447 * queue is to be allocated from DDR memory, then only a "bypass" 448 * mapping is appropriate. And if the QP memory is user-mappable, 449 * then we force DDI_DMA_CONSISTENT mapping. 450 * Also, in order to meet the alignment restriction, we pass the 451 * "mro_bind_override_addr" flag in the call to tavor_mr_register(). 452 * This guarantees that the resulting IB vaddr will be zero-based 453 * (modulo the offset into the first page). 454 * If we fail here, we still have the bunch of resource and reference 455 * count cleanup to do. 456 */ 457 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : 458 IBT_MR_NOSLEEP; 459 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned; 460 mr_attr.mr_len = qp->qp_wqinfo.qa_size; 461 mr_attr.mr_as = NULL; 462 mr_attr.mr_flags = flag; 463 if (qp_is_umap) { 464 mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass; 465 } else { 466 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) { 467 mr_op.mro_bind_type = 468 state->ts_cfg_profile->cp_iommu_bypass; 469 dma_xfer_mode = 470 state->ts_cfg_profile->cp_streaming_consistent; 471 if (dma_xfer_mode == DDI_DMA_STREAMING) { 472 mr_attr.mr_flags |= IBT_MR_NONCOHERENT; 473 } 474 } else { 475 mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS; 476 } 477 } 478 mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl; 479 mr_op.mro_bind_override_addr = 1; 480 status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op); 481 if (status != DDI_SUCCESS) { 482 /* Set "status" and "errormsg" and goto failure */ 483 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr"); 484 goto qpalloc_fail9; 485 } 486 487 /* 488 * Calculate the offset between the kernel virtual address space 489 * and the IB virtual address space. This will be used when 490 * posting work requests to properly initialize each WQE. 491 */ 492 qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned - 493 (uint64_t)mr->mr_bindinfo.bi_addr; 494 495 /* 496 * Fill in all the return arguments (if necessary). This includes 497 * real work queue sizes, real SGLs, and QP number 498 */ 499 if (queuesz_p != NULL) { 500 queuesz_p->cs_sq = (1 << log_qp_sq_size); 501 queuesz_p->cs_sq_sgl = qp->qp_sq_sgl; 502 503 /* QP on an SRQ set these to 0 */ 504 if (qp_srq_en) { 505 queuesz_p->cs_rq = 0; 506 queuesz_p->cs_rq_sgl = 0; 507 } else { 508 queuesz_p->cs_rq = (1 << log_qp_rq_size); 509 queuesz_p->cs_rq_sgl = qp->qp_rq_sgl; 510 } 511 } 512 if (qpn != NULL) { 513 *qpn = (ib_qpn_t)qp->qp_qpnum; 514 } 515 516 /* 517 * Fill in the rest of the Tavor Queue Pair handle. We can update 518 * the following fields for use in further operations on the QP. 519 */ 520 qp->qp_qpcrsrcp = qpc; 521 qp->qp_rsrcp = rsrc; 522 qp->qp_state = TAVOR_QP_RESET; 523 qp->qp_pdhdl = pd; 524 qp->qp_mrhdl = mr; 525 qp->qp_sq_sigtype = (attr_p->qp_flags & IBT_WR_SIGNALED) ? 526 TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED; 527 qp->qp_is_special = 0; 528 qp->qp_is_umap = qp_is_umap; 529 qp->qp_uarpg = (qp->qp_is_umap) ? uarpg : 0; 530 qp->qp_umap_dhp = (devmap_cookie_t)NULL; 531 qp->qp_sq_cqhdl = sq_cq; 532 qp->qp_sq_lastwqeaddr = NULL; 533 qp->qp_sq_bufsz = (1 << log_qp_sq_size); 534 qp->qp_sq_buf = sq_buf; 535 qp->qp_desc_off = qp_desc_off; 536 qp->qp_rq_cqhdl = rq_cq; 537 qp->qp_rq_lastwqeaddr = NULL; 538 qp->qp_rq_buf = rq_buf; 539 540 /* QP on an SRQ sets this to 0 */ 541 if (qp_srq_en) { 542 qp->qp_rq_bufsz = 0; 543 } else { 544 qp->qp_rq_bufsz = (1 << log_qp_rq_size); 545 } 546 547 qp->qp_forward_sqd_event = 0; 548 qp->qp_sqd_still_draining = 0; 549 qp->qp_hdlrarg = (void *)ibt_qphdl; 550 qp->qp_mcg_refcnt = 0; 551 552 /* 553 * If this QP is to be associated with an SRQ, then set the SRQ handle 554 * appropriately. 555 */ 556 if (qp_srq_en) { 557 qp->qp_srqhdl = srq; 558 qp->qp_srq_en = TAVOR_QP_SRQ_ENABLED; 559 tavor_srq_refcnt_inc(qp->qp_srqhdl); 560 } else { 561 qp->qp_srqhdl = NULL; 562 qp->qp_srq_en = TAVOR_QP_SRQ_DISABLED; 563 } 564 565 /* Determine if later ddi_dma_sync will be necessary */ 566 qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo); 567 568 /* Determine the QP service type */ 569 if (type == IBT_RC_RQP) { 570 qp->qp_serv_type = TAVOR_QP_RC; 571 } else if (type == IBT_UD_RQP) { 572 qp->qp_serv_type = TAVOR_QP_UD; 573 } else { 574 qp->qp_serv_type = TAVOR_QP_UC; 575 } 576 577 /* Zero out the QP context */ 578 bzero(&qp->qpc, sizeof (tavor_hw_qpc_t)); 579 580 /* 581 * Put QP handle in Tavor QPNum-to-QPHdl list. Then fill in the 582 * "qphdl" and return success 583 */ 584 ASSERT(state->ts_qphdl[qpc->tr_indx] == NULL); 585 state->ts_qphdl[qpc->tr_indx] = qp; 586 587 /* 588 * If this is a user-mappable QP, then we need to insert the previously 589 * allocated entry into the "userland resources database". This will 590 * allow for later lookup during devmap() (i.e. mmap()) calls. 591 */ 592 if (qp_is_umap) { 593 tavor_umap_db_add(umapdb); 594 } 595 596 *qphdl = qp; 597 598 TAVOR_TNF_EXIT(tavor_qp_alloc); 599 return (DDI_SUCCESS); 600 601 /* 602 * The following is cleanup for all possible failure cases in this routine 603 */ 604 qpalloc_fail9: 605 tavor_queue_free(state, &qp->qp_wqinfo); 606 qpalloc_fail8: 607 if (type == IBT_RC_RQP) { 608 tavor_rsrc_free(state, &rdb); 609 } 610 qpalloc_fail7: 611 if (qp_is_umap) { 612 tavor_umap_db_free(umapdb); 613 } 614 qpalloc_fail6: 615 /* 616 * Releasing the QPN will also free up the QPC context. Update 617 * the QPC context pointer to indicate this. 618 */ 619 tavor_qp_release_qpn(state, qp->qp_qpn_hdl, TAVOR_QPN_RELEASE); 620 qpc = NULL; 621 qpalloc_fail5: 622 tavor_rsrc_free(state, &rsrc); 623 qpalloc_fail4: 624 if (qpc) { 625 tavor_rsrc_free(state, &qpc); 626 } 627 qpalloc_fail3: 628 tavor_cq_refcnt_dec(rq_cq); 629 qpalloc_fail2: 630 tavor_cq_refcnt_dec(sq_cq); 631 qpalloc_fail1: 632 tavor_pd_refcnt_dec(pd); 633 qpalloc_fail: 634 TNF_PROBE_1(tavor_qp_alloc_fail, TAVOR_TNF_ERROR, "", 635 tnf_string, msg, errormsg); 636 TAVOR_TNF_EXIT(tavor_qp_alloc); 637 return (status); 638 } 639 640 641 642 /* 643 * tavor_special_qp_alloc() 644 * Context: Can be called only from user or kernel context. 645 */ 646 int 647 tavor_special_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo, 648 uint_t sleepflag, tavor_qp_options_t *op) 649 { 650 tavor_rsrc_t *qpc, *rsrc; 651 tavor_qphdl_t qp; 652 ibt_qp_alloc_attr_t *attr_p; 653 ibt_sqp_type_t type; 654 uint8_t port; 655 ibtl_qp_hdl_t ibt_qphdl; 656 ibt_chan_sizes_t *queuesz_p; 657 tavor_qphdl_t *qphdl; 658 ibt_mr_attr_t mr_attr; 659 tavor_mr_options_t mr_op; 660 tavor_pdhdl_t pd; 661 tavor_cqhdl_t sq_cq, rq_cq; 662 tavor_mrhdl_t mr; 663 uint64_t qp_desc_off; 664 uint32_t *sq_buf, *rq_buf; 665 uint32_t log_qp_sq_size, log_qp_rq_size; 666 uint32_t sq_size, rq_size, max_sgl; 667 uint32_t sq_wqe_size, rq_wqe_size; 668 uint_t wq_location, dma_xfer_mode; 669 int status, flag; 670 char *errormsg; 671 672 TAVOR_TNF_ENTER(tavor_special_qp_alloc); 673 674 /* 675 * Check the "options" flag. Currently this flag tells the driver 676 * whether or not the QP's work queues should be come from normal 677 * system memory or whether they should be allocated from DDR memory. 678 */ 679 if (op == NULL) { 680 wq_location = TAVOR_QUEUE_LOCATION_NORMAL; 681 } else { 682 wq_location = op->qpo_wq_loc; 683 } 684 685 /* 686 * Extract the necessary info from the tavor_qp_info_t structure 687 */ 688 attr_p = qpinfo->qpi_attrp; 689 type = qpinfo->qpi_type; 690 port = qpinfo->qpi_port; 691 ibt_qphdl = qpinfo->qpi_ibt_qphdl; 692 queuesz_p = qpinfo->qpi_queueszp; 693 qphdl = &qpinfo->qpi_qphdl; 694 695 /* 696 * Check for valid special QP type (only SMI & GSI supported) 697 */ 698 if ((type != IBT_SMI_SQP) && (type != IBT_GSI_SQP)) { 699 /* Set "status" and "errormsg" and goto failure */ 700 TAVOR_TNF_FAIL(IBT_QP_SPECIAL_TYPE_INVALID, "invalid QP type"); 701 goto spec_qpalloc_fail; 702 } 703 704 /* 705 * Check for valid port number 706 */ 707 if (!tavor_portnum_is_valid(state, port)) { 708 /* Set "status" and "errormsg" and goto failure */ 709 TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num"); 710 goto spec_qpalloc_fail; 711 } 712 port = port - 1; 713 714 /* 715 * Check for valid PD handle pointer 716 */ 717 if (attr_p->qp_pd_hdl == NULL) { 718 /* Set "status" and "errormsg" and goto failure */ 719 TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle"); 720 goto spec_qpalloc_fail; 721 } 722 pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl; 723 724 /* Increment the reference count on the PD */ 725 tavor_pd_refcnt_inc(pd); 726 727 /* 728 * Check for valid CQ handle pointers 729 */ 730 if ((attr_p->qp_ibc_scq_hdl == NULL) || 731 (attr_p->qp_ibc_rcq_hdl == NULL)) { 732 /* Set "status" and "errormsg" and goto failure */ 733 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle"); 734 goto spec_qpalloc_fail1; 735 } 736 sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl; 737 rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl; 738 739 /* 740 * Increment the reference count on the CQs. One or both of these 741 * could return error if we determine that the given CQ is already 742 * being used with a non-special QP (i.e. a normal QP). 743 */ 744 status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_SPECIAL); 745 if (status != DDI_SUCCESS) { 746 /* Set "status" and "errormsg" and goto failure */ 747 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle"); 748 goto spec_qpalloc_fail1; 749 } 750 status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_SPECIAL); 751 if (status != DDI_SUCCESS) { 752 /* Set "status" and "errormsg" and goto failure */ 753 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle"); 754 goto spec_qpalloc_fail2; 755 } 756 757 /* 758 * Allocate the special QP resources. Essentially, this allocation 759 * amounts to checking if the request special QP has already been 760 * allocated. If successful, the QP context return is an actual 761 * QP context that has been "aliased" to act as a special QP of the 762 * appropriate type (and for the appropriate port). Just as in 763 * tavor_qp_alloc() above, ownership for this QP context is not 764 * immediately given to hardware in the final step here. Instead, we 765 * wait until the QP is later transitioned to the "Init" state before 766 * passing the QP to hardware. If we fail here, we must undo all 767 * the reference count (CQ and PD). 768 */ 769 status = tavor_special_qp_rsrc_alloc(state, type, port, &qpc); 770 if (status != DDI_SUCCESS) { 771 /* Set "status" and "errormsg" and goto failure */ 772 TAVOR_TNF_FAIL(status, "failed special QP rsrc"); 773 goto spec_qpalloc_fail3; 774 } 775 776 /* 777 * Allocate the software structure for tracking the special queue 778 * pair (i.e. the Tavor Queue Pair handle). If we fail here, we 779 * must undo the reference counts and the previous resource allocation. 780 */ 781 status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc); 782 if (status != DDI_SUCCESS) { 783 /* Set "status" and "errormsg" and goto failure */ 784 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle"); 785 goto spec_qpalloc_fail4; 786 } 787 qp = (tavor_qphdl_t)rsrc->tr_addr; 788 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp)) 789 790 /* 791 * Actual QP number is a combination of the index of the QPC and 792 * the port number. This is because the special QP contexts must 793 * be allocated two-at-a-time. 794 */ 795 qp->qp_qpnum = qpc->tr_indx + port; 796 797 /* 798 * Calculate the appropriate size for the work queues. 799 * Note: All Tavor QP work queues must be a power-of-2 in size. Also 800 * they may not be any smaller than TAVOR_QP_MIN_SIZE. This step is 801 * to round the requested size up to the next highest power-of-2 802 */ 803 attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE); 804 attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE); 805 log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq); 806 if (ISP2(attr_p->qp_sizes.cs_sq)) { 807 log_qp_sq_size = log_qp_sq_size - 1; 808 } 809 log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq); 810 if (ISP2(attr_p->qp_sizes.cs_rq)) { 811 log_qp_rq_size = log_qp_rq_size - 1; 812 } 813 814 /* 815 * Next we verify that the rounded-up size is valid (i.e. consistent 816 * with the device limits and/or software-configured limits). If not, 817 * then obviously we have a bit of cleanup to do before returning. 818 */ 819 if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) || 820 (log_qp_rq_size > state->ts_cfg_profile->cp_log_max_qp_sz)) { 821 /* Set "status" and "errormsg" and goto failure */ 822 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size"); 823 goto spec_qpalloc_fail5; 824 } 825 826 /* 827 * Next we verify that the requested number of SGL is valid (i.e. 828 * consistent with the device limits and/or software-configured 829 * limits). If not, then obviously the same cleanup needs to be done. 830 */ 831 max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl; 832 if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) || 833 (attr_p->qp_sizes.cs_rq_sgl > max_sgl)) { 834 /* Set "status" and "errormsg" and goto failure */ 835 TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL"); 836 goto spec_qpalloc_fail5; 837 } 838 839 /* 840 * Determine this QP's WQE sizes (for both the Send and Recv WQEs). 841 * This will depend on the requested number of SGLs. Note: this 842 * has the side-effect of also calculating the real number of SGLs 843 * (for the calculated WQE size). 844 */ 845 tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl, 846 TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl); 847 if (type == IBT_SMI_SQP) { 848 tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl, 849 TAVOR_QP_WQ_TYPE_SENDMLX_QP0, &qp->qp_sq_log_wqesz, 850 &qp->qp_sq_sgl); 851 } else { 852 tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl, 853 TAVOR_QP_WQ_TYPE_SENDMLX_QP1, &qp->qp_sq_log_wqesz, 854 &qp->qp_sq_sgl); 855 } 856 857 /* 858 * Allocate the memory for QP work queues. Note: The location from 859 * which we will allocate these work queues has been passed in 860 * through the tavor_qp_options_t structure. Since Tavor work queues 861 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of 862 * the work queue memory is very important. We used to allocate 863 * work queues (the combined receive and send queues) so that they 864 * would be aligned on their combined size. That alignment guaranteed 865 * that they would never cross the 4GB boundary (Tavor work queues 866 * are on the order of MBs at maximum). Now we are able to relax 867 * this alignment constraint by ensuring that the IB address assigned 868 * to the queue memory (as a result of the tavor_mr_register() call) 869 * is offset from zero. 870 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to 871 * guarantee the alignment, but when attempting to use IOMMU bypass 872 * mode we found that we were not allowed to specify any alignment 873 * that was more restrictive than the system page size. 874 * So we avoided this constraint by passing two alignment values, 875 * one for the memory allocation itself and the other for the DMA 876 * handle (for later bind). This used to cause more memory than 877 * necessary to be allocated (in order to guarantee the more 878 * restrictive alignment contraint). But be guaranteeing the 879 * zero-based IB virtual address for the queue, we are able to 880 * conserve this memory. 881 */ 882 sq_wqe_size = 1 << qp->qp_sq_log_wqesz; 883 rq_wqe_size = 1 << qp->qp_rq_log_wqesz; 884 sq_size = (1 << log_qp_sq_size) * sq_wqe_size; 885 rq_size = (1 << log_qp_rq_size) * rq_wqe_size; 886 qp->qp_wqinfo.qa_size = sq_size + rq_size; 887 qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size); 888 qp->qp_wqinfo.qa_bind_align = max(sq_wqe_size, rq_wqe_size); 889 qp->qp_wqinfo.qa_location = wq_location; 890 status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag); 891 if (status != NULL) { 892 /* Set "status" and "errormsg" and goto failure */ 893 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue"); 894 goto spec_qpalloc_fail5; 895 } 896 if (sq_wqe_size > rq_wqe_size) { 897 sq_buf = qp->qp_wqinfo.qa_buf_aligned; 898 rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size); 899 } else { 900 rq_buf = qp->qp_wqinfo.qa_buf_aligned; 901 sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size); 902 } 903 904 /* 905 * Register the memory for the special QP work queues. The memory for 906 * the special QP must be registered in the Tavor TPT tables. This 907 * gives us the LKey to specify in the QP context later. Note: The 908 * memory for Tavor work queues (both Send and Recv) must be contiguous 909 * and registered as a single memory region. Note also: If the work 910 * queue is to be allocated from DDR memory, then only a "bypass" 911 * mapping is appropriate. 912 * Also, in order to meet the alignment restriction, we pass the 913 * "mro_bind_override_addr" flag in the call to tavor_mr_register(). 914 * This guarantees that the resulting IB vaddr will be zero-based 915 * (modulo the offset into the first page). 916 * If we fail here, we have a bunch of resource and reference count 917 * cleanup to do. 918 */ 919 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : 920 IBT_MR_NOSLEEP; 921 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned; 922 mr_attr.mr_len = qp->qp_wqinfo.qa_size; 923 mr_attr.mr_as = NULL; 924 mr_attr.mr_flags = flag; 925 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) { 926 mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass; 927 928 dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent; 929 if (dma_xfer_mode == DDI_DMA_STREAMING) { 930 mr_attr.mr_flags |= IBT_MR_NONCOHERENT; 931 } 932 } else { 933 mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS; 934 } 935 mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl; 936 mr_op.mro_bind_override_addr = 1; 937 status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op); 938 if (status != DDI_SUCCESS) { 939 /* Set "status" and "errormsg" and goto failure */ 940 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr"); 941 goto spec_qpalloc_fail6; 942 } 943 944 /* 945 * Calculate the offset between the kernel virtual address space 946 * and the IB virtual address space. This will be used when 947 * posting work requests to properly initialize each WQE. 948 */ 949 qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned - 950 (uint64_t)mr->mr_bindinfo.bi_addr; 951 952 /* 953 * Fill in all the return arguments (if necessary). This includes 954 * real work queue sizes, real SGLs, and QP number (which will be 955 * either zero or one, depending on the special QP type) 956 */ 957 if (queuesz_p != NULL) { 958 queuesz_p->cs_sq = (1 << log_qp_sq_size); 959 queuesz_p->cs_sq_sgl = qp->qp_sq_sgl; 960 queuesz_p->cs_rq = (1 << log_qp_rq_size); 961 queuesz_p->cs_rq_sgl = qp->qp_rq_sgl; 962 } 963 964 /* 965 * Fill in the rest of the Tavor Queue Pair handle. We can update 966 * the following fields for use in further operations on the QP. 967 */ 968 qp->qp_qpcrsrcp = qpc; 969 qp->qp_rsrcp = rsrc; 970 qp->qp_state = TAVOR_QP_RESET; 971 qp->qp_pdhdl = pd; 972 qp->qp_mrhdl = mr; 973 qp->qp_sq_sigtype = (attr_p->qp_flags & IBT_WR_SIGNALED) ? 974 TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED; 975 qp->qp_is_special = (type == IBT_SMI_SQP) ? 976 TAVOR_QP_SMI : TAVOR_QP_GSI; 977 qp->qp_is_umap = 0; 978 qp->qp_uarpg = 0; 979 qp->qp_sq_cqhdl = sq_cq; 980 qp->qp_sq_lastwqeaddr = NULL; 981 qp->qp_sq_bufsz = (1 << log_qp_sq_size); 982 qp->qp_sq_buf = sq_buf; 983 qp->qp_desc_off = qp_desc_off; 984 qp->qp_rq_cqhdl = rq_cq; 985 qp->qp_rq_lastwqeaddr = NULL; 986 qp->qp_rq_bufsz = (1 << log_qp_rq_size); 987 qp->qp_rq_buf = rq_buf; 988 qp->qp_portnum = port; 989 qp->qp_pkeyindx = 0; 990 qp->qp_hdlrarg = (void *)ibt_qphdl; 991 qp->qp_mcg_refcnt = 0; 992 qp->qp_srq_en = 0; 993 qp->qp_srqhdl = NULL; 994 995 /* Determine if later ddi_dma_sync will be necessary */ 996 qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo); 997 998 /* All special QPs are UD QP service type */ 999 qp->qp_serv_type = TAVOR_QP_UD; 1000 1001 /* Zero out the QP context */ 1002 bzero(&qp->qpc, sizeof (tavor_hw_qpc_t)); 1003 1004 /* 1005 * Put QP handle in Tavor QPNum-to-QPHdl list. Then fill in the 1006 * "qphdl" and return success 1007 */ 1008 ASSERT(state->ts_qphdl[qpc->tr_indx + port] == NULL); 1009 state->ts_qphdl[qpc->tr_indx + port] = qp; 1010 1011 *qphdl = qp; 1012 1013 TAVOR_TNF_EXIT(tavor_special_qp_alloc); 1014 return (DDI_SUCCESS); 1015 1016 /* 1017 * The following is cleanup for all possible failure cases in this routine 1018 */ 1019 spec_qpalloc_fail6: 1020 tavor_queue_free(state, &qp->qp_wqinfo); 1021 spec_qpalloc_fail5: 1022 tavor_rsrc_free(state, &rsrc); 1023 spec_qpalloc_fail4: 1024 if (tavor_special_qp_rsrc_free(state, type, port) != DDI_SUCCESS) { 1025 TAVOR_WARNING(state, "failed to free special QP rsrc"); 1026 } 1027 spec_qpalloc_fail3: 1028 tavor_cq_refcnt_dec(rq_cq); 1029 spec_qpalloc_fail2: 1030 tavor_cq_refcnt_dec(sq_cq); 1031 spec_qpalloc_fail1: 1032 tavor_pd_refcnt_dec(pd); 1033 spec_qpalloc_fail: 1034 TNF_PROBE_1(tavor_special_qp_alloc_fail, TAVOR_TNF_ERROR, "", 1035 tnf_string, msg, errormsg); 1036 TAVOR_TNF_EXIT(tavor_special_qp_alloc); 1037 return (status); 1038 } 1039 1040 1041 /* 1042 * tavor_qp_free() 1043 * This function frees up the QP resources. Depending on the value 1044 * of the "free_qp_flags", the QP number may not be released until 1045 * a subsequent call to tavor_qp_release_qpn(). 1046 * 1047 * Context: Can be called only from user or kernel context. 1048 */ 1049 /* ARGSUSED */ 1050 int 1051 tavor_qp_free(tavor_state_t *state, tavor_qphdl_t *qphdl, 1052 ibc_free_qp_flags_t free_qp_flags, ibc_qpn_hdl_t *qpnh, 1053 uint_t sleepflag) 1054 { 1055 tavor_rsrc_t *qpc, *rdb, *rsrc; 1056 tavor_umap_db_entry_t *umapdb; 1057 tavor_qpn_entry_t *entry; 1058 tavor_pdhdl_t pd; 1059 tavor_mrhdl_t mr; 1060 tavor_cqhdl_t sq_cq, rq_cq; 1061 tavor_srqhdl_t srq; 1062 tavor_qphdl_t qp; 1063 uint64_t value; 1064 uint_t type, port; 1065 uint_t maxprot; 1066 uint_t qp_srq_en; 1067 int status; 1068 char *errormsg; 1069 1070 TAVOR_TNF_ENTER(tavor_qp_free); 1071 1072 /* 1073 * Pull all the necessary information from the Tavor Queue Pair 1074 * handle. This is necessary here because the resource for the 1075 * QP handle is going to be freed up as part of this operation. 1076 */ 1077 qp = *qphdl; 1078 mutex_enter(&qp->qp_lock); 1079 qpc = qp->qp_qpcrsrcp; 1080 rsrc = qp->qp_rsrcp; 1081 pd = qp->qp_pdhdl; 1082 srq = qp->qp_srqhdl; 1083 mr = qp->qp_mrhdl; 1084 rq_cq = qp->qp_rq_cqhdl; 1085 sq_cq = qp->qp_sq_cqhdl; 1086 rdb = qp->qp_rdbrsrcp; 1087 port = qp->qp_portnum; 1088 qp_srq_en = qp->qp_srq_en; 1089 1090 /* 1091 * If the QP is part of an MCG, then we fail the qp_free 1092 */ 1093 if (qp->qp_mcg_refcnt != 0) { 1094 mutex_exit(&qp->qp_lock); 1095 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "QP part of MCG on free"); 1096 goto qpfree_fail; 1097 } 1098 1099 /* 1100 * If the QP is not already in "Reset" state, then transition to 1101 * "Reset". This is necessary because software does not reclaim 1102 * ownership of the QP context until the QP is in the "Reset" state. 1103 * If the ownership transfer fails for any reason, then it is an 1104 * indication that something (either in HW or SW) has gone seriously 1105 * wrong. So we print a warning message and return. 1106 */ 1107 if (qp->qp_state != TAVOR_QP_RESET) { 1108 if (tavor_qp_to_reset(state, qp) != DDI_SUCCESS) { 1109 mutex_exit(&qp->qp_lock); 1110 TAVOR_WARNING(state, "failed to reset QP context"); 1111 /* Set "status" and "errormsg" and goto failure */ 1112 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), 1113 "reset QP context"); 1114 goto qpfree_fail; 1115 } 1116 qp->qp_state = TAVOR_QP_RESET; 1117 1118 /* 1119 * Do any additional handling necessary for the transition 1120 * to the "Reset" state (e.g. update the WRID lists) 1121 */ 1122 tavor_wrid_to_reset_handling(state, qp); 1123 } 1124 1125 /* 1126 * If this was a user-mappable QP, then we need to remove its entry 1127 * from the "userland resources database". If it is also currently 1128 * mmap()'d out to a user process, then we need to call 1129 * devmap_devmem_remap() to remap the QP memory to an invalid mapping. 1130 * We also need to invalidate the QP tracking information for the 1131 * user mapping. 1132 */ 1133 if (qp->qp_is_umap) { 1134 status = tavor_umap_db_find(state->ts_instance, qp->qp_qpnum, 1135 MLNX_UMAP_QPMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE, 1136 &umapdb); 1137 if (status != DDI_SUCCESS) { 1138 mutex_exit(&qp->qp_lock); 1139 TAVOR_WARNING(state, "failed to find in database"); 1140 TAVOR_TNF_EXIT(tavor_qp_free); 1141 return (ibc_get_ci_failure(0)); 1142 } 1143 tavor_umap_db_free(umapdb); 1144 if (qp->qp_umap_dhp != NULL) { 1145 maxprot = (PROT_READ | PROT_WRITE | PROT_USER); 1146 status = devmap_devmem_remap(qp->qp_umap_dhp, 1147 state->ts_dip, 0, 0, qp->qp_wqinfo.qa_size, 1148 maxprot, DEVMAP_MAPPING_INVALID, NULL); 1149 if (status != DDI_SUCCESS) { 1150 mutex_exit(&qp->qp_lock); 1151 TAVOR_WARNING(state, "failed in QP memory " 1152 "devmap_devmem_remap()"); 1153 TAVOR_TNF_EXIT(tavor_qp_free); 1154 return (ibc_get_ci_failure(0)); 1155 } 1156 qp->qp_umap_dhp = (devmap_cookie_t)NULL; 1157 } 1158 } 1159 1160 /* 1161 * Put NULL into the Tavor QPNum-to-QPHdl list. This will allow any 1162 * in-progress events to detect that the QP corresponding to this 1163 * number has been freed. Note: it does depend in whether we are 1164 * freeing a special QP or not. 1165 */ 1166 if (qp->qp_is_special) { 1167 state->ts_qphdl[qpc->tr_indx + port] = NULL; 1168 } else { 1169 state->ts_qphdl[qpc->tr_indx] = NULL; 1170 } 1171 1172 /* 1173 * Drop the QP lock 1174 * At this point the lock is no longer necessary. We cannot 1175 * protect from multiple simultaneous calls to free the same QP. 1176 * In addition, since the QP lock is contained in the QP "software 1177 * handle" resource, which we will free (see below), it is 1178 * important that we have no further references to that memory. 1179 */ 1180 mutex_exit(&qp->qp_lock); 1181 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp)) 1182 1183 /* 1184 * Free the QP resources 1185 * Start by deregistering and freeing the memory for work queues. 1186 * Next free any previously allocated context information 1187 * (depending on QP type) 1188 * Finally, decrement the necessary reference counts. 1189 * If this fails for any reason, then it is an indication that 1190 * something (either in HW or SW) has gone seriously wrong. So we 1191 * print a warning message and return. 1192 */ 1193 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 1194 sleepflag); 1195 if (status != DDI_SUCCESS) { 1196 TAVOR_WARNING(state, "failed to deregister QP memory"); 1197 /* Set "status" and "errormsg" and goto failure */ 1198 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed deregister mr"); 1199 goto qpfree_fail; 1200 } 1201 1202 /* Free the memory for the QP */ 1203 tavor_queue_free(state, &qp->qp_wqinfo); 1204 1205 /* 1206 * Free up the remainder of the QP resources. Note: we have a few 1207 * different resources to free up depending on whether the QP is a 1208 * special QP or not. As described above, if any of these fail for 1209 * any reason it is an indication that something (either in HW or SW) 1210 * has gone seriously wrong. So we print a warning message and 1211 * return. 1212 */ 1213 if (qp->qp_is_special) { 1214 type = (qp->qp_is_special == TAVOR_QP_SMI) ? 1215 IBT_SMI_SQP : IBT_GSI_SQP; 1216 1217 /* Free up resources for the special QP */ 1218 status = tavor_special_qp_rsrc_free(state, type, port); 1219 if (status != DDI_SUCCESS) { 1220 TAVOR_WARNING(state, "failed to free special QP rsrc"); 1221 /* Set "status" and "errormsg" and goto failure */ 1222 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), 1223 "failed special QP rsrc"); 1224 goto qpfree_fail; 1225 } 1226 1227 } else { 1228 type = qp->qp_serv_type; 1229 1230 /* Free up the RDB entries resource */ 1231 if (type == TAVOR_QP_RC) { 1232 tavor_rsrc_free(state, &rdb); 1233 } 1234 1235 /* 1236 * Check the flags and determine whether to release the 1237 * QPN or not, based on their value. 1238 */ 1239 if (free_qp_flags == IBC_FREE_QP_ONLY) { 1240 entry = qp->qp_qpn_hdl; 1241 tavor_qp_release_qpn(state, qp->qp_qpn_hdl, 1242 TAVOR_QPN_FREE_ONLY); 1243 *qpnh = (ibc_qpn_hdl_t)entry; 1244 } else { 1245 tavor_qp_release_qpn(state, qp->qp_qpn_hdl, 1246 TAVOR_QPN_RELEASE); 1247 } 1248 } 1249 1250 /* Free the Tavor Queue Pair handle */ 1251 tavor_rsrc_free(state, &rsrc); 1252 1253 /* Decrement the reference counts on CQs, PD and SRQ (if needed) */ 1254 tavor_cq_refcnt_dec(rq_cq); 1255 tavor_cq_refcnt_dec(sq_cq); 1256 tavor_pd_refcnt_dec(pd); 1257 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 1258 tavor_srq_refcnt_dec(srq); 1259 } 1260 1261 /* Set the qphdl pointer to NULL and return success */ 1262 *qphdl = NULL; 1263 1264 TAVOR_TNF_EXIT(tavor_qp_free); 1265 return (DDI_SUCCESS); 1266 1267 qpfree_fail: 1268 TNF_PROBE_1(tavor_qp_free_fail, TAVOR_TNF_ERROR, "", 1269 tnf_string, msg, errormsg); 1270 TAVOR_TNF_EXIT(tavor_qp_free); 1271 return (status); 1272 } 1273 1274 1275 /* 1276 * tavor_qp_query() 1277 * Context: Can be called from interrupt or base context. 1278 */ 1279 int 1280 tavor_qp_query(tavor_state_t *state, tavor_qphdl_t qp, 1281 ibt_qp_query_attr_t *attr_p) 1282 { 1283 ibt_cep_state_t qp_state; 1284 ibt_qp_ud_attr_t *ud; 1285 ibt_qp_rc_attr_t *rc; 1286 ibt_qp_uc_attr_t *uc; 1287 ibt_cep_flags_t enable_flags; 1288 tavor_hw_addr_path_t *qpc_path, *qpc_alt_path; 1289 ibt_cep_path_t *path_ptr, *alt_path_ptr; 1290 tavor_hw_qpc_t *qpc; 1291 int status; 1292 1293 TAVOR_TNF_ENTER(tavor_qp_query); 1294 1295 mutex_enter(&qp->qp_lock); 1296 1297 /* 1298 * Grab the temporary QPC entry from QP software state 1299 */ 1300 qpc = &qp->qpc; 1301 1302 /* Convert the current Tavor QP state to IBTF QP state */ 1303 switch (qp->qp_state) { 1304 case TAVOR_QP_RESET: 1305 qp_state = IBT_STATE_RESET; /* "Reset" */ 1306 break; 1307 case TAVOR_QP_INIT: 1308 qp_state = IBT_STATE_INIT; /* Initialized */ 1309 break; 1310 case TAVOR_QP_RTR: 1311 qp_state = IBT_STATE_RTR; /* Ready to Receive */ 1312 break; 1313 case TAVOR_QP_RTS: 1314 qp_state = IBT_STATE_RTS; /* Ready to Send */ 1315 break; 1316 case TAVOR_QP_SQERR: 1317 qp_state = IBT_STATE_SQE; /* Send Queue Error */ 1318 break; 1319 case TAVOR_QP_SQD: 1320 if (qp->qp_sqd_still_draining) { 1321 qp_state = IBT_STATE_SQDRAIN; /* SQ Draining */ 1322 } else { 1323 qp_state = IBT_STATE_SQD; /* SQ Drained */ 1324 } 1325 break; 1326 case TAVOR_QP_ERR: 1327 qp_state = IBT_STATE_ERROR; /* Error */ 1328 break; 1329 default: 1330 mutex_exit(&qp->qp_lock); 1331 TNF_PROBE_1(tavor_qp_query_inv_qpstate_fail, 1332 TAVOR_TNF_ERROR, "", tnf_uint, qpstate, qp->qp_state); 1333 TAVOR_TNF_EXIT(tavor_qp_query); 1334 return (ibc_get_ci_failure(0)); 1335 } 1336 attr_p->qp_info.qp_state = qp_state; 1337 1338 /* SRQ Hook. */ 1339 attr_p->qp_srq = NULL; 1340 1341 /* 1342 * The following QP information is always returned, regardless of 1343 * the current QP state. Note: Some special handling is necessary 1344 * for calculating the QP number on special QP (QP0 and QP1). 1345 */ 1346 attr_p->qp_sq_cq = qp->qp_sq_cqhdl->cq_hdlrarg; 1347 attr_p->qp_rq_cq = qp->qp_rq_cqhdl->cq_hdlrarg; 1348 if (qp->qp_is_special) { 1349 attr_p->qp_qpn = (qp->qp_is_special == TAVOR_QP_SMI) ? 0 : 1; 1350 } else { 1351 attr_p->qp_qpn = (ib_qpn_t)qp->qp_qpnum; 1352 } 1353 attr_p->qp_sq_sgl = qp->qp_sq_sgl; 1354 attr_p->qp_rq_sgl = qp->qp_rq_sgl; 1355 attr_p->qp_info.qp_sq_sz = qp->qp_sq_bufsz; 1356 attr_p->qp_info.qp_rq_sz = qp->qp_rq_bufsz; 1357 1358 /* 1359 * If QP is currently in the "Reset" state, then only the above are 1360 * returned 1361 */ 1362 if (qp_state == IBT_STATE_RESET) { 1363 mutex_exit(&qp->qp_lock); 1364 TAVOR_TNF_EXIT(tavor_qp_query); 1365 return (DDI_SUCCESS); 1366 } 1367 1368 /* 1369 * Post QUERY_QP command to firmware 1370 * 1371 * We do a TAVOR_NOSLEEP here because we are holding the "qp_lock". 1372 * Since we may be in the interrupt context (or subsequently raised 1373 * to interrupt level by priority inversion), we do not want to block 1374 * in this routine waiting for success. 1375 */ 1376 status = tavor_cmn_query_cmd_post(state, QUERY_QP, qp->qp_qpnum, 1377 qpc, sizeof (tavor_hw_qpc_t), TAVOR_CMD_NOSLEEP_SPIN); 1378 if (status != TAVOR_CMD_SUCCESS) { 1379 mutex_exit(&qp->qp_lock); 1380 cmn_err(CE_CONT, "Tavor: QUERY_QP command failed: %08x\n", 1381 status); 1382 TNF_PROBE_1(tavor_qp_query_cmd_fail, TAVOR_TNF_ERROR, "", 1383 tnf_uint, status, status); 1384 TAVOR_TNF_EXIT(tavor_qp_query); 1385 return (ibc_get_ci_failure(0)); 1386 } 1387 1388 /* 1389 * Fill in the additional QP info based on the QP's transport type. 1390 */ 1391 if (qp->qp_serv_type == TAVOR_QP_UD) { 1392 1393 /* Fill in the UD-specific info */ 1394 ud = &attr_p->qp_info.qp_transport.ud; 1395 ud->ud_qkey = (ib_qkey_t)qpc->qkey; 1396 ud->ud_sq_psn = qpc->next_snd_psn; 1397 ud->ud_pkey_ix = qpc->pri_addr_path.pkey_indx; 1398 ud->ud_port = qpc->pri_addr_path.portnum; 1399 1400 attr_p->qp_info.qp_trans = IBT_UD_SRV; 1401 1402 } else if (qp->qp_serv_type == TAVOR_QP_RC) { 1403 1404 /* Fill in the RC-specific info */ 1405 rc = &attr_p->qp_info.qp_transport.rc; 1406 rc->rc_sq_psn = qpc->next_snd_psn; 1407 rc->rc_rq_psn = qpc->next_rcv_psn; 1408 rc->rc_dst_qpn = qpc->rem_qpn; 1409 1410 /* Grab the path migration state information */ 1411 if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) { 1412 rc->rc_mig_state = IBT_STATE_MIGRATED; 1413 } else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) { 1414 rc->rc_mig_state = IBT_STATE_REARMED; 1415 } else { 1416 rc->rc_mig_state = IBT_STATE_ARMED; 1417 } 1418 rc->rc_rdma_ra_out = (1 << qpc->sra_max); 1419 rc->rc_rdma_ra_in = (1 << qpc->rra_max); 1420 rc->rc_min_rnr_nak = qpc->min_rnr_nak; 1421 rc->rc_path_mtu = qpc->mtu; 1422 rc->rc_retry_cnt = qpc->retry_cnt; 1423 1424 /* Get the common primary address path fields */ 1425 qpc_path = &qpc->pri_addr_path; 1426 path_ptr = &rc->rc_path; 1427 tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect, 1428 TAVOR_ADDRPATH_QP, qp); 1429 1430 /* Fill in the additional primary address path fields */ 1431 path_ptr->cep_pkey_ix = qpc_path->pkey_indx; 1432 path_ptr->cep_hca_port_num = qpc_path->portnum; 1433 path_ptr->cep_timeout = qpc_path->ack_timeout; 1434 1435 /* Get the common alternate address path fields */ 1436 qpc_alt_path = &qpc->alt_addr_path; 1437 alt_path_ptr = &rc->rc_alt_path; 1438 tavor_get_addr_path(state, qpc_alt_path, 1439 &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp); 1440 1441 /* Fill in the additional alternate address path fields */ 1442 alt_path_ptr->cep_pkey_ix = qpc_alt_path->pkey_indx; 1443 alt_path_ptr->cep_hca_port_num = qpc_alt_path->portnum; 1444 alt_path_ptr->cep_timeout = qpc_alt_path->ack_timeout; 1445 1446 /* Get the RNR retry time from primary path */ 1447 rc->rc_rnr_retry_cnt = qpc_path->rnr_retry; 1448 1449 /* Set the enable flags based on RDMA/Atomic enable bits */ 1450 enable_flags = IBT_CEP_NO_FLAGS; 1451 enable_flags |= ((qpc->rre == 0) ? 0 : IBT_CEP_RDMA_RD); 1452 enable_flags |= ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR); 1453 enable_flags |= ((qpc->rae == 0) ? 0 : IBT_CEP_ATOMIC); 1454 attr_p->qp_info.qp_flags = enable_flags; 1455 1456 attr_p->qp_info.qp_trans = IBT_RC_SRV; 1457 1458 } else if (qp->qp_serv_type == TAVOR_QP_UC) { 1459 1460 /* Fill in the UC-specific info */ 1461 uc = &attr_p->qp_info.qp_transport.uc; 1462 uc->uc_sq_psn = qpc->next_snd_psn; 1463 uc->uc_rq_psn = qpc->next_rcv_psn; 1464 uc->uc_dst_qpn = qpc->rem_qpn; 1465 1466 /* Grab the path migration state information */ 1467 if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) { 1468 uc->uc_mig_state = IBT_STATE_MIGRATED; 1469 } else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) { 1470 uc->uc_mig_state = IBT_STATE_REARMED; 1471 } else { 1472 uc->uc_mig_state = IBT_STATE_ARMED; 1473 } 1474 uc->uc_path_mtu = qpc->mtu; 1475 1476 /* Get the common primary address path fields */ 1477 qpc_path = &qpc->pri_addr_path; 1478 path_ptr = &uc->uc_path; 1479 tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect, 1480 TAVOR_ADDRPATH_QP, qp); 1481 1482 /* Fill in the additional primary address path fields */ 1483 path_ptr->cep_pkey_ix = qpc_path->pkey_indx; 1484 path_ptr->cep_hca_port_num = qpc_path->portnum; 1485 1486 /* Get the common alternate address path fields */ 1487 qpc_alt_path = &qpc->alt_addr_path; 1488 alt_path_ptr = &uc->uc_alt_path; 1489 tavor_get_addr_path(state, qpc_alt_path, 1490 &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp); 1491 1492 /* Fill in the additional alternate address path fields */ 1493 alt_path_ptr->cep_pkey_ix = qpc_alt_path->pkey_indx; 1494 alt_path_ptr->cep_hca_port_num = qpc_alt_path->portnum; 1495 1496 /* 1497 * Set the enable flags based on RDMA enable bits (by 1498 * definition UC doesn't support Atomic or RDMA Read) 1499 */ 1500 enable_flags = ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR); 1501 attr_p->qp_info.qp_flags = enable_flags; 1502 1503 attr_p->qp_info.qp_trans = IBT_UC_SRV; 1504 1505 } else { 1506 TAVOR_WARNING(state, "unexpected QP transport type"); 1507 mutex_exit(&qp->qp_lock); 1508 return (ibc_get_ci_failure(0)); 1509 } 1510 1511 /* 1512 * Under certain circumstances it is possible for the Tavor hardware 1513 * to transition to one of the error states without software directly 1514 * knowing about it. The QueryQP() call is the one place where we 1515 * have an opportunity to sample and update our view of the QP state. 1516 */ 1517 if (qpc->state == TAVOR_QP_SQERR) { 1518 attr_p->qp_info.qp_state = IBT_STATE_SQE; 1519 qp->qp_state = TAVOR_QP_SQERR; 1520 } 1521 if (qpc->state == TAVOR_QP_ERR) { 1522 attr_p->qp_info.qp_state = IBT_STATE_ERROR; 1523 qp->qp_state = TAVOR_QP_ERR; 1524 } 1525 mutex_exit(&qp->qp_lock); 1526 1527 TAVOR_TNF_EXIT(tavor_qp_query); 1528 return (DDI_SUCCESS); 1529 } 1530 1531 1532 /* 1533 * tavor_qp_create_qpn() 1534 * Context: Can be called from interrupt or base context. 1535 */ 1536 static int 1537 tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp, tavor_rsrc_t *qpc) 1538 { 1539 tavor_qpn_entry_t query; 1540 tavor_qpn_entry_t *entry; 1541 avl_index_t where; 1542 1543 TAVOR_TNF_ENTER(tavor_qp_create_qpn); 1544 1545 /* 1546 * Build a query (for the AVL tree lookup) and attempt to find 1547 * a previously added entry that has a matching QPC index. If 1548 * no matching entry is found, then allocate, initialize, and 1549 * add an entry to the AVL tree. 1550 * If a matching entry is found, then increment its QPN counter 1551 * and reference counter. 1552 */ 1553 query.qpn_indx = qpc->tr_indx; 1554 mutex_enter(&state->ts_qpn_avl_lock); 1555 entry = (tavor_qpn_entry_t *)avl_find(&state->ts_qpn_avl, 1556 &query, &where); 1557 if (entry == NULL) { 1558 /* 1559 * Allocate and initialize a QPN entry, then insert 1560 * it into the AVL tree. 1561 */ 1562 entry = (tavor_qpn_entry_t *)kmem_zalloc( 1563 sizeof (tavor_qpn_entry_t), KM_NOSLEEP); 1564 if (entry == NULL) { 1565 mutex_exit(&state->ts_qpn_avl_lock); 1566 TAVOR_TNF_EXIT(tavor_qp_create_qpn); 1567 return (DDI_FAILURE); 1568 } 1569 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*entry)) 1570 1571 entry->qpn_indx = qpc->tr_indx; 1572 entry->qpn_refcnt = 0; 1573 entry->qpn_counter = 0; 1574 1575 avl_insert(&state->ts_qpn_avl, entry, where); 1576 } 1577 1578 /* 1579 * Make the AVL tree entry point to the QP context resource that 1580 * it will be responsible for tracking 1581 */ 1582 entry->qpn_qpc = qpc; 1583 1584 /* 1585 * Setup the QP handle to point to the AVL tree entry. Then 1586 * generate the new QP number from the entry's QPN counter value 1587 * and the hardware's QP context table index. 1588 */ 1589 qp->qp_qpn_hdl = entry; 1590 qp->qp_qpnum = ((entry->qpn_counter << 1591 state->ts_cfg_profile->cp_log_num_qp) | qpc->tr_indx) & 1592 TAVOR_QP_MAXNUMBER_MSK; 1593 1594 /* 1595 * Increment the reference counter and QPN counter. The QPN 1596 * counter always indicates the next available number for use. 1597 */ 1598 entry->qpn_counter++; 1599 entry->qpn_refcnt++; 1600 1601 mutex_exit(&state->ts_qpn_avl_lock); 1602 TAVOR_TNF_EXIT(tavor_qp_create_qpn); 1603 return (DDI_SUCCESS); 1604 } 1605 1606 1607 /* 1608 * tavor_qp_release_qpn() 1609 * Context: Can be called only from user or kernel context. 1610 */ 1611 void 1612 tavor_qp_release_qpn(tavor_state_t *state, tavor_qpn_entry_t *entry, int flags) 1613 { 1614 TAVOR_TNF_ENTER(tavor_qp_release_qpn); 1615 1616 ASSERT(entry != NULL); 1617 1618 mutex_enter(&state->ts_qpn_avl_lock); 1619 1620 /* 1621 * If we are releasing the QP number here, then we decrement the 1622 * reference count and check for zero references. If there are 1623 * zero references, then we free the QPC context (if it hadn't 1624 * already been freed during a TAVOR_QPN_FREE_ONLY free, i.e. for 1625 * reuse with another similar QP number) and remove the tracking 1626 * structure from the QP number AVL tree and free the structure. 1627 * If we are not releasing the QP number here, then, as long as we 1628 * have not exhausted the usefulness of the QPC context (that is, 1629 * re-used it too many times without the reference count having 1630 * gone to zero), we free up the QPC context for use by another 1631 * thread (which will use it to construct a different QP number 1632 * from the same QPC table index). 1633 */ 1634 if (flags == TAVOR_QPN_RELEASE) { 1635 entry->qpn_refcnt--; 1636 1637 /* 1638 * If the reference count is zero, then we free the QPC 1639 * context (if it hadn't already been freed in an early 1640 * step, e.g. TAVOR_QPN_FREE_ONLY) and remove/free the 1641 * tracking structure from the QP number AVL tree. 1642 */ 1643 if (entry->qpn_refcnt == 0) { 1644 if (entry->qpn_qpc != NULL) { 1645 tavor_rsrc_free(state, &entry->qpn_qpc); 1646 } 1647 1648 /* 1649 * If the current entry has served it's useful 1650 * purpose (i.e. been reused the maximum allowable 1651 * number of times), then remove it from QP number 1652 * AVL tree and free it up. 1653 */ 1654 if (entry->qpn_counter >= (1 << 1655 (24 - state->ts_cfg_profile->cp_log_num_qp))) { 1656 avl_remove(&state->ts_qpn_avl, entry); 1657 kmem_free(entry, sizeof (tavor_qpn_entry_t)); 1658 } 1659 } 1660 1661 } else if (flags == TAVOR_QPN_FREE_ONLY) { 1662 /* 1663 * Even if we are not freeing the QP number, that will not 1664 * always prevent us from releasing the QPC context. In fact, 1665 * since the QPC context only forms part of the whole QPN, 1666 * we want to free it up for use by other consumers. But 1667 * if the reference count is non-zero (which it will always 1668 * be when we are doing TAVOR_QPN_FREE_ONLY) and the counter 1669 * has reached its maximum value, then we cannot reuse the 1670 * QPC context until the reference count eventually reaches 1671 * zero (in TAVOR_QPN_RELEASE, above). 1672 */ 1673 if (entry->qpn_counter < (1 << 1674 (24 - state->ts_cfg_profile->cp_log_num_qp))) { 1675 tavor_rsrc_free(state, &entry->qpn_qpc); 1676 } 1677 } 1678 mutex_exit(&state->ts_qpn_avl_lock); 1679 1680 TAVOR_TNF_EXIT(tavor_qp_release_qpn); 1681 } 1682 1683 1684 /* 1685 * tavor_qpn_db_compare() 1686 * Context: Can be called from user or kernel context. 1687 */ 1688 static int 1689 tavor_qpn_avl_compare(const void *q, const void *e) 1690 { 1691 tavor_qpn_entry_t *entry, *query; 1692 1693 TAVOR_TNF_ENTER(tavor_qpn_avl_compare); 1694 1695 entry = (tavor_qpn_entry_t *)e; 1696 query = (tavor_qpn_entry_t *)q; 1697 1698 if (query->qpn_indx < entry->qpn_indx) { 1699 TAVOR_TNF_EXIT(tavor_qpn_avl_compare); 1700 return (-1); 1701 } else if (query->qpn_indx > entry->qpn_indx) { 1702 TAVOR_TNF_EXIT(tavor_qpn_avl_compare); 1703 return (+1); 1704 } else { 1705 TAVOR_TNF_EXIT(tavor_qpn_avl_compare); 1706 return (0); 1707 } 1708 } 1709 1710 1711 /* 1712 * tavor_qpn_avl_init() 1713 * Context: Only called from attach() path context 1714 */ 1715 void 1716 tavor_qpn_avl_init(tavor_state_t *state) 1717 { 1718 TAVOR_TNF_ENTER(tavor_qpn_avl_init); 1719 1720 /* Initialize the lock used for QP number (QPN) AVL tree access */ 1721 mutex_init(&state->ts_qpn_avl_lock, NULL, MUTEX_DRIVER, 1722 DDI_INTR_PRI(state->ts_intrmsi_pri)); 1723 1724 /* Initialize the AVL tree for the QP number (QPN) storage */ 1725 avl_create(&state->ts_qpn_avl, tavor_qpn_avl_compare, 1726 sizeof (tavor_qpn_entry_t), 1727 offsetof(tavor_qpn_entry_t, qpn_avlnode)); 1728 1729 TAVOR_TNF_EXIT(tavor_qpn_avl_init); 1730 } 1731 1732 1733 /* 1734 * tavor_qpn_avl_fini() 1735 * Context: Only called from attach() and/or detach() path contexts 1736 */ 1737 void 1738 tavor_qpn_avl_fini(tavor_state_t *state) 1739 { 1740 tavor_qpn_entry_t *entry; 1741 void *cookie; 1742 1743 TAVOR_TNF_ENTER(tavor_qpn_avl_fini); 1744 1745 /* 1746 * Empty all entries (if necessary) and destroy the AVL tree 1747 * that was used for QP number (QPN) tracking. 1748 */ 1749 cookie = NULL; 1750 while ((entry = (tavor_qpn_entry_t *)avl_destroy_nodes( 1751 &state->ts_qpn_avl, &cookie)) != NULL) { 1752 kmem_free(entry, sizeof (tavor_qpn_entry_t)); 1753 } 1754 avl_destroy(&state->ts_qpn_avl); 1755 1756 /* Destroy the lock used for QP number (QPN) AVL tree access */ 1757 mutex_destroy(&state->ts_qpn_avl_lock); 1758 1759 TAVOR_TNF_EXIT(tavor_qpn_avl_fini); 1760 } 1761 1762 1763 /* 1764 * tavor_qphdl_from_qpnum() 1765 * Context: Can be called from interrupt or base context. 1766 * 1767 * This routine is important because changing the unconstrained 1768 * portion of the QP number is critical to the detection of a 1769 * potential race condition in the QP event handler code (i.e. the case 1770 * where a QP is freed and alloc'd again before an event for the 1771 * "old" QP can be handled). 1772 * 1773 * While this is not a perfect solution (not sure that one exists) 1774 * it does help to mitigate the chance that this race condition will 1775 * cause us to deliver a "stale" event to the new QP owner. Note: 1776 * this solution does not scale well because the number of constrained 1777 * bits increases (and, hence, the number of unconstrained bits 1778 * decreases) as the number of supported QPs grows. For small and 1779 * intermediate values, it should hopefully provide sufficient 1780 * protection. 1781 */ 1782 tavor_qphdl_t 1783 tavor_qphdl_from_qpnum(tavor_state_t *state, uint_t qpnum) 1784 { 1785 uint_t qpindx, qpmask; 1786 1787 /* Calculate the QP table index from the qpnum */ 1788 qpmask = (1 << state->ts_cfg_profile->cp_log_num_qp) - 1; 1789 qpindx = qpnum & qpmask; 1790 return (state->ts_qphdl[qpindx]); 1791 } 1792 1793 1794 /* 1795 * tavor_special_qp_rsrc_alloc 1796 * Context: Can be called from interrupt or base context. 1797 */ 1798 static int 1799 tavor_special_qp_rsrc_alloc(tavor_state_t *state, ibt_sqp_type_t type, 1800 uint_t port, tavor_rsrc_t **qp_rsrc) 1801 { 1802 uint_t mask, flags; 1803 int status; 1804 1805 TAVOR_TNF_ENTER(tavor_special_qp_rsrc_alloc); 1806 1807 mutex_enter(&state->ts_spec_qplock); 1808 flags = state->ts_spec_qpflags; 1809 if (type == IBT_SMI_SQP) { 1810 /* 1811 * Check here to see if the driver has been configured 1812 * to instruct the Tavor firmware to handle all incoming 1813 * SMP messages (i.e. messages sent to SMA). If so, 1814 * then we will treat QP0 as if it has already been 1815 * allocated (for internal use). Otherwise, if we allow 1816 * the allocation to happen, it will cause unexpected 1817 * behaviors (e.g. Tavor SMA becomes unresponsive). 1818 */ 1819 if (state->ts_cfg_profile->cp_qp0_agents_in_fw != 0) { 1820 mutex_exit(&state->ts_spec_qplock); 1821 TNF_PROBE_0(tavor_special_qp0_alloc_already_in_fw, 1822 TAVOR_TNF_ERROR, ""); 1823 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc); 1824 return (IBT_QP_IN_USE); 1825 } 1826 1827 /* 1828 * If this is the first QP0 allocation, then post 1829 * a CONF_SPECIAL_QP firmware command 1830 */ 1831 if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) { 1832 status = tavor_conf_special_qp_cmd_post(state, 1833 state->ts_spec_qp0->tr_indx, TAVOR_CMD_QP_SMI, 1834 TAVOR_CMD_NOSLEEP_SPIN); 1835 if (status != TAVOR_CMD_SUCCESS) { 1836 mutex_exit(&state->ts_spec_qplock); 1837 cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP " 1838 "command failed: %08x\n", status); 1839 TNF_PROBE_1(tavor_conf_special_qp_cmd_fail, 1840 TAVOR_TNF_ERROR, "", tnf_uint, status, 1841 status); 1842 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc); 1843 return (IBT_INSUFF_RESOURCE); 1844 } 1845 } 1846 1847 /* 1848 * Now check (and, if necessary, modify) the flags to indicate 1849 * whether the allocation was successful 1850 */ 1851 mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port)); 1852 if (flags & mask) { 1853 mutex_exit(&state->ts_spec_qplock); 1854 TNF_PROBE_1(tavor_ts_spec_qp0_alloc_already, 1855 TAVOR_TNF_ERROR, "", tnf_uint, port, port); 1856 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc); 1857 return (IBT_QP_IN_USE); 1858 } 1859 state->ts_spec_qpflags |= mask; 1860 *qp_rsrc = state->ts_spec_qp0; 1861 1862 } else { 1863 /* 1864 * If this is the first QP1 allocation, then post 1865 * a CONF_SPECIAL_QP firmware command 1866 */ 1867 if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) { 1868 status = tavor_conf_special_qp_cmd_post(state, 1869 state->ts_spec_qp1->tr_indx, TAVOR_CMD_QP_GSI, 1870 TAVOR_CMD_NOSLEEP_SPIN); 1871 if (status != TAVOR_CMD_SUCCESS) { 1872 mutex_exit(&state->ts_spec_qplock); 1873 cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP " 1874 "command failed: %08x\n", status); 1875 TNF_PROBE_1(tavor_conf_special_qp_cmd_fail, 1876 TAVOR_TNF_ERROR, "", tnf_uint, status, 1877 status); 1878 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc); 1879 return (IBT_INSUFF_RESOURCE); 1880 } 1881 } 1882 1883 /* 1884 * Now check (and, if necessary, modify) the flags to indicate 1885 * whether the allocation was successful 1886 */ 1887 mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port)); 1888 if (flags & mask) { 1889 mutex_exit(&state->ts_spec_qplock); 1890 TNF_PROBE_0(tavor_ts_spec_qp1_alloc_already, 1891 TAVOR_TNF_ERROR, ""); 1892 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc); 1893 return (IBT_QP_IN_USE); 1894 } 1895 state->ts_spec_qpflags |= mask; 1896 *qp_rsrc = state->ts_spec_qp1; 1897 } 1898 1899 mutex_exit(&state->ts_spec_qplock); 1900 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc); 1901 return (DDI_SUCCESS); 1902 } 1903 1904 1905 /* 1906 * tavor_special_qp_rsrc_free 1907 * Context: Can be called from interrupt or base context. 1908 */ 1909 static int 1910 tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type, 1911 uint_t port) 1912 { 1913 uint_t mask, flags; 1914 int status; 1915 1916 TAVOR_TNF_ENTER(tavor_special_qp_rsrc_free); 1917 1918 mutex_enter(&state->ts_spec_qplock); 1919 if (type == IBT_SMI_SQP) { 1920 mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port)); 1921 state->ts_spec_qpflags &= ~mask; 1922 flags = state->ts_spec_qpflags; 1923 1924 /* 1925 * If this is the last QP0 free, then post a CONF_SPECIAL_QP 1926 * firmware command 1927 */ 1928 if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) { 1929 status = tavor_conf_special_qp_cmd_post(state, 0, 1930 TAVOR_CMD_QP_SMI, TAVOR_CMD_NOSLEEP_SPIN); 1931 if (status != TAVOR_CMD_SUCCESS) { 1932 mutex_exit(&state->ts_spec_qplock); 1933 cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP " 1934 "command failed: %08x\n", status); 1935 TNF_PROBE_1(tavor_conf_special_qp_cmd_fail, 1936 TAVOR_TNF_ERROR, "", tnf_uint, status, 1937 status); 1938 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free); 1939 return (ibc_get_ci_failure(0)); 1940 } 1941 } 1942 } else { 1943 mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port)); 1944 state->ts_spec_qpflags &= ~mask; 1945 flags = state->ts_spec_qpflags; 1946 1947 /* 1948 * If this is the last QP1 free, then post a CONF_SPECIAL_QP 1949 * firmware command 1950 */ 1951 if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) { 1952 status = tavor_conf_special_qp_cmd_post(state, 0, 1953 TAVOR_CMD_QP_GSI, TAVOR_CMD_NOSLEEP_SPIN); 1954 if (status != TAVOR_CMD_SUCCESS) { 1955 mutex_exit(&state->ts_spec_qplock); 1956 cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP " 1957 "command failed: %08x\n", status); 1958 TNF_PROBE_1(tavor_conf_special_qp_cmd_fail, 1959 TAVOR_TNF_ERROR, "", tnf_uint, status, 1960 status); 1961 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free); 1962 return (ibc_get_ci_failure(0)); 1963 } 1964 } 1965 } 1966 1967 mutex_exit(&state->ts_spec_qplock); 1968 TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free); 1969 return (DDI_SUCCESS); 1970 } 1971 1972 1973 /* 1974 * tavor_qp_sgl_to_logwqesz() 1975 * Context: Can be called from interrupt or base context. 1976 */ 1977 static void 1978 tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl, 1979 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl) 1980 { 1981 uint_t max_size, log2, actual_sgl; 1982 1983 TAVOR_TNF_ENTER(tavor_qp_sgl_to_logwqesz); 1984 1985 switch (wq_type) { 1986 case TAVOR_QP_WQ_TYPE_SENDQ: 1987 /* 1988 * Use requested maximum SGL to calculate max descriptor size 1989 * (while guaranteeing that the descriptor size is a 1990 * power-of-2 cachelines). 1991 */ 1992 max_size = (TAVOR_QP_WQE_MLX_SND_HDRS + (num_sgl << 4)); 1993 log2 = highbit(max_size); 1994 if (ISP2(max_size)) { 1995 log2 = log2 - 1; 1996 } 1997 1998 /* Make sure descriptor is at least the minimum size */ 1999 log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM); 2000 2001 /* Calculate actual number of SGL (given WQE size) */ 2002 actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_SND_HDRS) >> 4; 2003 break; 2004 2005 case TAVOR_QP_WQ_TYPE_RECVQ: 2006 /* 2007 * Same as above (except for Recv WQEs) 2008 */ 2009 max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4)); 2010 log2 = highbit(max_size); 2011 if (ISP2(max_size)) { 2012 log2 = log2 - 1; 2013 } 2014 2015 /* Make sure descriptor is at least the minimum size */ 2016 log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM); 2017 2018 /* Calculate actual number of SGL (given WQE size) */ 2019 actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4; 2020 break; 2021 2022 case TAVOR_QP_WQ_TYPE_SENDMLX_QP0: 2023 /* 2024 * Same as above (except for MLX transport WQEs). For these 2025 * WQEs we have to account for the space consumed by the 2026 * "inline" packet headers. (This is smaller than for QP1 2027 * below because QP0 is not allowed to send packets with a GRH. 2028 */ 2029 max_size = (TAVOR_QP_WQE_MLX_QP0_HDRS + (num_sgl << 4)); 2030 log2 = highbit(max_size); 2031 if (ISP2(max_size)) { 2032 log2 = log2 - 1; 2033 } 2034 2035 /* Make sure descriptor is at least the minimum size */ 2036 log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM); 2037 2038 /* Calculate actual number of SGL (given WQE size) */ 2039 actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP0_HDRS) >> 4; 2040 break; 2041 2042 case TAVOR_QP_WQ_TYPE_SENDMLX_QP1: 2043 /* 2044 * Same as above. For these WQEs we again have to account for 2045 * the space consumed by the "inline" packet headers. (This 2046 * is larger than for QP0 above because we have to account for 2047 * the possibility of a GRH in each packet - and this 2048 * introduces an alignment issue that causes us to consume 2049 * an additional 8 bytes). 2050 */ 2051 max_size = (TAVOR_QP_WQE_MLX_QP1_HDRS + (num_sgl << 4)); 2052 log2 = highbit(max_size); 2053 if (ISP2(max_size)) { 2054 log2 = log2 - 1; 2055 } 2056 2057 /* Make sure descriptor is at least the minimum size */ 2058 log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM); 2059 2060 /* Calculate actual number of SGL (given WQE size) */ 2061 actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP1_HDRS) >> 4; 2062 break; 2063 2064 default: 2065 TAVOR_WARNING(state, "unexpected work queue type"); 2066 TNF_PROBE_0(tavor_qp_sgl_to_logwqesz_inv_wqtype_fail, 2067 TAVOR_TNF_ERROR, ""); 2068 break; 2069 } 2070 2071 /* Fill in the return values */ 2072 *logwqesz = log2; 2073 *max_sgl = min(state->ts_cfg_profile->cp_wqe_real_max_sgl, actual_sgl); 2074 2075 TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz); 2076 } 2077