1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * tavor_srq.c 29 * Tavor Shared Receive Queue Processing Routines 30 * 31 * Implements all the routines necessary for allocating, freeing, querying, 32 * modifying and posting shared receive queues. 33 */ 34 35 #include <sys/sysmacros.h> 36 #include <sys/types.h> 37 #include <sys/conf.h> 38 #include <sys/ddi.h> 39 #include <sys/sunddi.h> 40 #include <sys/modctl.h> 41 #include <sys/bitmap.h> 42 43 #include <sys/ib/adapters/tavor/tavor.h> 44 45 static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl, 46 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl); 47 48 /* 49 * tavor_srq_alloc() 50 * Context: Can be called only from user or kernel context. 51 */ 52 int 53 tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo, 54 uint_t sleepflag, tavor_srq_options_t *op) 55 { 56 ibt_srq_hdl_t ibt_srqhdl; 57 tavor_pdhdl_t pd; 58 ibt_srq_sizes_t *sizes; 59 ibt_srq_sizes_t *real_sizes; 60 tavor_srqhdl_t *srqhdl; 61 ibt_srq_flags_t flags; 62 tavor_rsrc_t *srqc, *rsrc; 63 tavor_hw_srqc_t srqc_entry; 64 uint32_t *buf; 65 tavor_srqhdl_t srq; 66 tavor_umap_db_entry_t *umapdb; 67 ibt_mr_attr_t mr_attr; 68 tavor_mr_options_t mr_op; 69 tavor_mrhdl_t mr; 70 uint64_t addr; 71 uint64_t value, srq_desc_off; 72 uint32_t lkey; 73 uint32_t log_srq_size; 74 uint32_t uarpg; 75 uint_t wq_location, dma_xfer_mode, srq_is_umap; 76 int flag, status; 77 uint_t max_sgl; 78 uint_t wqesz; 79 80 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes)) 81 82 /* 83 * Check the "options" flag. Currently this flag tells the driver 84 * whether or not the SRQ's work queues should be come from normal 85 * system memory or whether they should be allocated from DDR memory. 86 */ 87 if (op == NULL) { 88 wq_location = TAVOR_QUEUE_LOCATION_NORMAL; 89 } else { 90 wq_location = op->srqo_wq_loc; 91 } 92 93 /* 94 * Extract the necessary info from the tavor_srq_info_t structure 95 */ 96 real_sizes = srqinfo->srqi_real_sizes; 97 sizes = srqinfo->srqi_sizes; 98 pd = srqinfo->srqi_pd; 99 ibt_srqhdl = srqinfo->srqi_ibt_srqhdl; 100 flags = srqinfo->srqi_flags; 101 srqhdl = srqinfo->srqi_srqhdl; 102 103 /* 104 * Determine whether SRQ is being allocated for userland access or 105 * whether it is being allocated for kernel access. If the SRQ is 106 * being allocated for userland access, then lookup the UAR doorbell 107 * page number for the current process. Note: If this is not found 108 * (e.g. if the process has not previously open()'d the Tavor driver), 109 * then an error is returned. 110 */ 111 srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0; 112 if (srq_is_umap) { 113 status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(), 114 MLNX_UMAP_UARPG_RSRC, &value, 0, NULL); 115 if (status != DDI_SUCCESS) { 116 goto srqalloc_fail3; 117 } 118 uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx; 119 } 120 121 /* Increase PD refcnt */ 122 tavor_pd_refcnt_inc(pd); 123 124 /* Allocate an SRQ context entry */ 125 status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc); 126 if (status != DDI_SUCCESS) { 127 goto srqalloc_fail1; 128 } 129 130 /* Allocate the SRQ Handle entry */ 131 status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc); 132 if (status != DDI_SUCCESS) { 133 goto srqalloc_fail2; 134 } 135 136 srq = (tavor_srqhdl_t)rsrc->tr_addr; 137 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq)) 138 139 srq->srq_srqnum = srqc->tr_indx; /* just use index */ 140 141 /* 142 * If this will be a user-mappable SRQ, then allocate an entry for 143 * the "userland resources database". This will later be added to 144 * the database (after all further SRQ operations are successful). 145 * If we fail here, we must undo the reference counts and the 146 * previous resource allocation. 147 */ 148 if (srq_is_umap) { 149 umapdb = tavor_umap_db_alloc(state->ts_instance, 150 srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC, 151 (uint64_t)(uintptr_t)rsrc); 152 if (umapdb == NULL) { 153 goto srqalloc_fail3; 154 } 155 } 156 157 /* 158 * Calculate the appropriate size for the SRQ. 159 * Note: All Tavor SRQs must be a power-of-2 in size. Also 160 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step 161 * is to round the requested size up to the next highest power-of-2 162 */ 163 sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE); 164 log_srq_size = highbit(sizes->srq_wr_sz); 165 if (ISP2(sizes->srq_wr_sz)) { 166 log_srq_size = log_srq_size - 1; 167 } 168 169 /* 170 * Next we verify that the rounded-up size is valid (i.e. consistent 171 * with the device limits and/or software-configured limits). If not, 172 * then obviously we have a lot of cleanup to do before returning. 173 */ 174 if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) { 175 goto srqalloc_fail4; 176 } 177 178 /* 179 * Next we verify that the requested number of SGL is valid (i.e. 180 * consistent with the device limits and/or software-configured 181 * limits). If not, then obviously the same cleanup needs to be done. 182 */ 183 max_sgl = state->ts_cfg_profile->cp_srq_max_sgl; 184 if (sizes->srq_sgl_sz > max_sgl) { 185 goto srqalloc_fail4; 186 } 187 188 /* 189 * Determine the SRQ's WQE sizes. This depends on the requested 190 * number of SGLs. Note: This also has the side-effect of 191 * calculating the real number of SGLs (for the calculated WQE size) 192 */ 193 tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz, 194 TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz, 195 &srq->srq_wq_sgl); 196 197 /* 198 * Allocate the memory for SRQ work queues. Note: The location from 199 * which we will allocate these work queues has been passed in through 200 * the tavor_qp_options_t structure. Since Tavor work queues are not 201 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work 202 * queue memory is very important. We used to allocate work queues 203 * (the combined receive and send queues) so that they would be aligned 204 * on their combined size. That alignment guaranteed that they would 205 * never cross the 4GB boundary (Tavor work queues are on the order of 206 * MBs at maximum). Now we are able to relax this alignment constraint 207 * by ensuring that the IB address assigned to the queue memory (as a 208 * result of the tavor_mr_register() call) is offset from zero. 209 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to 210 * guarantee the alignment, but when attempting to use IOMMU bypass 211 * mode we found that we were not allowed to specify any alignment that 212 * was more restrictive than the system page size. So we avoided this 213 * constraint by passing two alignment values, one for the memory 214 * allocation itself and the other for the DMA handle (for later bind). 215 * This used to cause more memory than necessary to be allocated (in 216 * order to guarantee the more restrictive alignment contraint). But 217 * be guaranteeing the zero-based IB virtual address for the queue, we 218 * are able to conserve this memory. 219 * 220 * Note: If SRQ is not user-mappable, then it may come from either 221 * kernel system memory or from HCA-attached local DDR memory. 222 * 223 * Note2: We align this queue on a pagesize boundary. This is required 224 * to make sure that all the resulting IB addresses will start at 0, for 225 * a zero-based queue. By making sure we are aligned on at least a 226 * page, any offset we use into our queue will be the same as when we 227 * perform tavor_srq_modify() operations later. 228 */ 229 wqesz = (1 << srq->srq_wq_log_wqesz); 230 srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz; 231 srq->srq_wqinfo.qa_alloc_align = PAGESIZE; 232 srq->srq_wqinfo.qa_bind_align = PAGESIZE; 233 if (srq_is_umap) { 234 srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND; 235 } else { 236 srq->srq_wqinfo.qa_location = wq_location; 237 } 238 status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag); 239 if (status != DDI_SUCCESS) { 240 goto srqalloc_fail4; 241 } 242 buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned; 243 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf)) 244 245 /* 246 * Register the memory for the SRQ work queues. The memory for the SRQ 247 * must be registered in the Tavor TPT tables. This gives us the LKey 248 * to specify in the SRQ context later. Note: If the work queue is to 249 * be allocated from DDR memory, then only a "bypass" mapping is 250 * appropriate. And if the SRQ memory is user-mappable, then we force 251 * DDI_DMA_CONSISTENT mapping. Also, in order to meet the alignment 252 * restriction, we pass the "mro_bind_override_addr" flag in the call 253 * to tavor_mr_register(). This guarantees that the resulting IB vaddr 254 * will be zero-based (modulo the offset into the first page). If we 255 * fail here, we still have the bunch of resource and reference count 256 * cleanup to do. 257 */ 258 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : 259 IBT_MR_NOSLEEP; 260 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf; 261 mr_attr.mr_len = srq->srq_wqinfo.qa_size; 262 mr_attr.mr_as = NULL; 263 mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE; 264 if (srq_is_umap) { 265 mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass; 266 } else { 267 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) { 268 mr_op.mro_bind_type = 269 state->ts_cfg_profile->cp_iommu_bypass; 270 dma_xfer_mode = 271 state->ts_cfg_profile->cp_streaming_consistent; 272 if (dma_xfer_mode == DDI_DMA_STREAMING) { 273 mr_attr.mr_flags |= IBT_MR_NONCOHERENT; 274 } 275 } else { 276 mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS; 277 } 278 } 279 mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl; 280 mr_op.mro_bind_override_addr = 1; 281 status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op); 282 if (status != DDI_SUCCESS) { 283 goto srqalloc_fail5; 284 } 285 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr)) 286 addr = mr->mr_bindinfo.bi_addr; 287 lkey = mr->mr_lkey; 288 289 /* 290 * Calculate the offset between the kernel virtual address space 291 * and the IB virtual address space. This will be used when 292 * posting work requests to properly initialize each WQE. 293 */ 294 srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned - 295 (uint64_t)mr->mr_bindinfo.bi_addr; 296 297 /* 298 * Create WQL and Wridlist for use by this SRQ 299 */ 300 srq->srq_wrid_wql = tavor_wrid_wql_create(state); 301 if (srq->srq_wrid_wql == NULL) { 302 goto srqalloc_fail6; 303 } 304 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql))) 305 306 srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size); 307 if (srq->srq_wridlist == NULL) { 308 goto srqalloc_fail7; 309 } 310 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist))) 311 312 srq->srq_wridlist->wl_srq_en = 1; 313 srq->srq_wridlist->wl_free_list_indx = -1; 314 315 /* 316 * Fill in all the return arguments (if necessary). This includes 317 * real queue size and real SGLs. 318 */ 319 if (real_sizes != NULL) { 320 real_sizes->srq_wr_sz = (1 << log_srq_size); 321 real_sizes->srq_sgl_sz = srq->srq_wq_sgl; 322 } 323 324 /* 325 * Fill in the SRQC entry. This is the final step before passing 326 * ownership of the SRQC entry to the Tavor hardware. We use all of 327 * the information collected/calculated above to fill in the 328 * requisite portions of the SRQC. Note: If this SRQ is going to be 329 * used for userland access, then we need to set the UAR page number 330 * appropriately (otherwise it's a "don't care") 331 */ 332 bzero(&srqc_entry, sizeof (tavor_hw_srqc_t)); 333 srqc_entry.wqe_addr_h = (addr >> 32); 334 srqc_entry.next_wqe_addr_l = 0; 335 srqc_entry.ds = (wqesz >> 4); 336 srqc_entry.state = TAVOR_SRQ_STATE_HW_OWNER; 337 srqc_entry.pd = pd->pd_pdnum; 338 srqc_entry.lkey = lkey; 339 srqc_entry.wqe_cnt = 0; 340 if (srq_is_umap) { 341 srqc_entry.uar = uarpg; 342 } else { 343 srqc_entry.uar = 0; 344 } 345 346 /* 347 * Write the SRQC entry to hardware. Lastly, we pass ownership of 348 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware 349 * command). Note: In general, this operation shouldn't fail. But 350 * if it does, we have to undo everything we've done above before 351 * returning error. 352 */ 353 status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry, 354 sizeof (tavor_hw_srqc_t), srq->srq_srqnum, 355 sleepflag); 356 if (status != TAVOR_CMD_SUCCESS) { 357 cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n", 358 status); 359 goto srqalloc_fail8; 360 } 361 362 /* 363 * Fill in the rest of the Tavor SRQ handle. We can update 364 * the following fields for use in further operations on the SRQ. 365 */ 366 srq->srq_srqcrsrcp = srqc; 367 srq->srq_rsrcp = rsrc; 368 srq->srq_mrhdl = mr; 369 srq->srq_refcnt = 0; 370 srq->srq_is_umap = srq_is_umap; 371 srq->srq_uarpg = (srq->srq_is_umap) ? uarpg : 0; 372 srq->srq_umap_dhp = (devmap_cookie_t)NULL; 373 srq->srq_pdhdl = pd; 374 srq->srq_wq_lastwqeindx = -1; 375 srq->srq_wq_bufsz = (1 << log_srq_size); 376 srq->srq_wq_buf = buf; 377 srq->srq_desc_off = srq_desc_off; 378 srq->srq_hdlrarg = (void *)ibt_srqhdl; 379 srq->srq_state = 0; 380 srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size); 381 srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl; 382 383 /* Determine if later ddi_dma_sync will be necessary */ 384 srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo); 385 386 /* 387 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list. Then fill in the 388 * "srqhdl" and return success 389 */ 390 ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL); 391 state->ts_srqhdl[srqc->tr_indx] = srq; 392 393 /* 394 * If this is a user-mappable SRQ, then we need to insert the 395 * previously allocated entry into the "userland resources database". 396 * This will allow for later lookup during devmap() (i.e. mmap()) 397 * calls. 398 */ 399 if (srq->srq_is_umap) { 400 tavor_umap_db_add(umapdb); 401 } else { 402 mutex_enter(&srq->srq_wrid_wql->wql_lock); 403 tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0); 404 mutex_exit(&srq->srq_wrid_wql->wql_lock); 405 } 406 407 *srqhdl = srq; 408 409 return (status); 410 411 /* 412 * The following is cleanup for all possible failure cases in this routine 413 */ 414 srqalloc_fail8: 415 kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size * 416 sizeof (tavor_wrid_entry_t)); 417 kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t)); 418 srqalloc_fail7: 419 tavor_wql_refcnt_dec(srq->srq_wrid_wql); 420 srqalloc_fail6: 421 if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 422 TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) { 423 TAVOR_WARNING(state, "failed to deregister SRQ memory"); 424 } 425 srqalloc_fail5: 426 tavor_queue_free(state, &srq->srq_wqinfo); 427 srqalloc_fail4: 428 if (srq_is_umap) { 429 tavor_umap_db_free(umapdb); 430 } 431 srqalloc_fail3: 432 tavor_rsrc_free(state, &rsrc); 433 srqalloc_fail2: 434 tavor_rsrc_free(state, &srqc); 435 srqalloc_fail1: 436 tavor_pd_refcnt_dec(pd); 437 srqalloc_fail: 438 return (status); 439 } 440 441 442 /* 443 * tavor_srq_free() 444 * Context: Can be called only from user or kernel context. 445 */ 446 /* ARGSUSED */ 447 int 448 tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag) 449 { 450 tavor_rsrc_t *srqc, *rsrc; 451 tavor_umap_db_entry_t *umapdb; 452 uint64_t value; 453 tavor_srqhdl_t srq; 454 tavor_mrhdl_t mr; 455 tavor_pdhdl_t pd; 456 tavor_hw_srqc_t srqc_entry; 457 uint32_t srqnum; 458 uint32_t size; 459 uint_t maxprot; 460 int status; 461 462 /* 463 * Pull all the necessary information from the Tavor Shared Receive 464 * Queue handle. This is necessary here because the resource for the 465 * SRQ handle is going to be freed up as part of this operation. 466 */ 467 srq = *srqhdl; 468 mutex_enter(&srq->srq_lock); 469 srqc = srq->srq_srqcrsrcp; 470 rsrc = srq->srq_rsrcp; 471 pd = srq->srq_pdhdl; 472 mr = srq->srq_mrhdl; 473 srqnum = srq->srq_srqnum; 474 475 /* 476 * If there are work queues still associated with the SRQ, then return 477 * an error. Otherwise, we will be holding the SRQ lock. 478 */ 479 if (srq->srq_refcnt != 0) { 480 mutex_exit(&srq->srq_lock); 481 return (IBT_SRQ_IN_USE); 482 } 483 484 /* 485 * If this was a user-mappable SRQ, then we need to remove its entry 486 * from the "userland resources database". If it is also currently 487 * mmap()'d out to a user process, then we need to call 488 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping. 489 * We also need to invalidate the SRQ tracking information for the 490 * user mapping. 491 */ 492 if (srq->srq_is_umap) { 493 status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum, 494 MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE, 495 &umapdb); 496 if (status != DDI_SUCCESS) { 497 mutex_exit(&srq->srq_lock); 498 TAVOR_WARNING(state, "failed to find in database"); 499 return (ibc_get_ci_failure(0)); 500 } 501 tavor_umap_db_free(umapdb); 502 if (srq->srq_umap_dhp != NULL) { 503 maxprot = (PROT_READ | PROT_WRITE | PROT_USER); 504 status = devmap_devmem_remap(srq->srq_umap_dhp, 505 state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, 506 maxprot, DEVMAP_MAPPING_INVALID, NULL); 507 if (status != DDI_SUCCESS) { 508 mutex_exit(&srq->srq_lock); 509 TAVOR_WARNING(state, "failed in SRQ memory " 510 "devmap_devmem_remap()"); 511 return (ibc_get_ci_failure(0)); 512 } 513 srq->srq_umap_dhp = (devmap_cookie_t)NULL; 514 } 515 } 516 517 /* 518 * Put NULL into the Tavor SRQNum-to-SRQHdl list. This will allow any 519 * in-progress events to detect that the SRQ corresponding to this 520 * number has been freed. 521 */ 522 state->ts_srqhdl[srqc->tr_indx] = NULL; 523 524 mutex_exit(&srq->srq_lock); 525 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq)); 526 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist)); 527 528 /* 529 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ 530 * firmware command). If the ownership transfer fails for any reason, 531 * then it is an indication that something (either in HW or SW) has 532 * gone seriously wrong. 533 */ 534 status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry, 535 sizeof (tavor_hw_srqc_t), srqnum, sleepflag); 536 if (status != TAVOR_CMD_SUCCESS) { 537 TAVOR_WARNING(state, "failed to reclaim SRQC ownership"); 538 cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n", 539 status); 540 return (IBT_FAILURE); 541 } 542 543 /* 544 * Deregister the memory for the Shared Receive Queue. If this fails 545 * for any reason, then it is an indication that something (either 546 * in HW or SW) has gone seriously wrong. So we print a warning 547 * message and return. 548 */ 549 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 550 sleepflag); 551 if (status != DDI_SUCCESS) { 552 TAVOR_WARNING(state, "failed to deregister SRQ memory"); 553 return (IBT_FAILURE); 554 } 555 556 /* Calculate the size and free the wridlist container */ 557 if (srq->srq_wridlist != NULL) { 558 size = (srq->srq_wridlist->wl_size * 559 sizeof (tavor_wrid_entry_t)); 560 kmem_free(srq->srq_wridlist->wl_wre, size); 561 kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t)); 562 563 /* 564 * Release reference to WQL; If this is the last reference, 565 * this call also has the side effect of freeing up the 566 * 'srq_wrid_wql' memory. 567 */ 568 tavor_wql_refcnt_dec(srq->srq_wrid_wql); 569 } 570 571 /* Free the memory for the SRQ */ 572 tavor_queue_free(state, &srq->srq_wqinfo); 573 574 /* Free the Tavor SRQ Handle */ 575 tavor_rsrc_free(state, &rsrc); 576 577 /* Free the SRQC entry resource */ 578 tavor_rsrc_free(state, &srqc); 579 580 /* Decrement the reference count on the protection domain (PD) */ 581 tavor_pd_refcnt_dec(pd); 582 583 /* Set the srqhdl pointer to NULL and return success */ 584 *srqhdl = NULL; 585 586 return (DDI_SUCCESS); 587 } 588 589 590 /* 591 * tavor_srq_modify() 592 * Context: Can be called only from user or kernel context. 593 */ 594 int 595 tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size, 596 uint_t *real_size, uint_t sleepflag) 597 { 598 tavor_qalloc_info_t new_srqinfo, old_srqinfo; 599 tavor_rsrc_t *mtt, *mpt, *old_mtt; 600 tavor_bind_info_t bind; 601 tavor_bind_info_t old_bind; 602 tavor_rsrc_pool_info_t *rsrc_pool; 603 tavor_mrhdl_t mr; 604 tavor_hw_mpt_t mpt_entry; 605 tavor_wrid_entry_t *wre_new, *wre_old; 606 uint64_t mtt_ddrbaseaddr, mtt_addr; 607 uint64_t srq_desc_off; 608 uint32_t *buf, srq_old_bufsz; 609 uint32_t wqesz; 610 uint_t max_srq_size; 611 uint_t dma_xfer_mode, mtt_pgsize_bits; 612 uint_t srq_sync, log_srq_size, maxprot; 613 uint_t wq_location; 614 int status; 615 616 /* 617 * Check the "inddr" flag. This flag tells the driver whether or not 618 * the SRQ's work queues should be come from normal system memory or 619 * whether they should be allocated from DDR memory. 620 */ 621 wq_location = state->ts_cfg_profile->cp_srq_wq_inddr; 622 623 /* 624 * If size requested is larger than device capability, return 625 * Insufficient Resources 626 */ 627 max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz); 628 if (size > max_srq_size) { 629 return (IBT_HCA_WR_EXCEEDED); 630 } 631 632 /* 633 * Calculate the appropriate size for the SRQ. 634 * Note: All Tavor SRQs must be a power-of-2 in size. Also 635 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step 636 * is to round the requested size up to the next highest power-of-2 637 */ 638 size = max(size, TAVOR_SRQ_MIN_SIZE); 639 log_srq_size = highbit(size); 640 if (ISP2(size)) { 641 log_srq_size = log_srq_size - 1; 642 } 643 644 /* 645 * Next we verify that the rounded-up size is valid (i.e. consistent 646 * with the device limits and/or software-configured limits). 647 */ 648 if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) { 649 goto srqmodify_fail; 650 } 651 652 /* 653 * Allocate the memory for newly resized Shared Receive Queue. 654 * 655 * Note: If SRQ is not user-mappable, then it may come from either 656 * kernel system memory or from HCA-attached local DDR memory. 657 * 658 * Note2: We align this queue on a pagesize boundary. This is required 659 * to make sure that all the resulting IB addresses will start at 0, 660 * for a zero-based queue. By making sure we are aligned on at least a 661 * page, any offset we use into our queue will be the same as it was 662 * when we allocated it at tavor_srq_alloc() time. 663 */ 664 wqesz = (1 << srq->srq_wq_log_wqesz); 665 new_srqinfo.qa_size = (1 << log_srq_size) * wqesz; 666 new_srqinfo.qa_alloc_align = PAGESIZE; 667 new_srqinfo.qa_bind_align = PAGESIZE; 668 if (srq->srq_is_umap) { 669 new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND; 670 } else { 671 new_srqinfo.qa_location = wq_location; 672 } 673 status = tavor_queue_alloc(state, &new_srqinfo, sleepflag); 674 if (status != DDI_SUCCESS) { 675 goto srqmodify_fail; 676 } 677 buf = (uint32_t *)new_srqinfo.qa_buf_aligned; 678 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf)) 679 680 /* 681 * Allocate the memory for the new WRE list. This will be used later 682 * when we resize the wridlist based on the new SRQ size. 683 */ 684 wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) * 685 sizeof (tavor_wrid_entry_t), sleepflag); 686 if (wre_new == NULL) { 687 goto srqmodify_fail; 688 } 689 690 /* 691 * Fill in the "bind" struct. This struct provides the majority 692 * of the information that will be used to distinguish between an 693 * "addr" binding (as is the case here) and a "buf" binding (see 694 * below). The "bind" struct is later passed to tavor_mr_mem_bind() 695 * which does most of the "heavy lifting" for the Tavor memory 696 * registration routines. 697 */ 698 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind)) 699 bzero(&bind, sizeof (tavor_bind_info_t)); 700 bind.bi_type = TAVOR_BINDHDL_VADDR; 701 bind.bi_addr = (uint64_t)(uintptr_t)buf; 702 bind.bi_len = new_srqinfo.qa_size; 703 bind.bi_as = NULL; 704 bind.bi_flags = sleepflag == TAVOR_SLEEP ? IBT_MR_SLEEP : 705 IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 706 if (srq->srq_is_umap) { 707 bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass; 708 } else { 709 if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) { 710 bind.bi_bypass = 711 state->ts_cfg_profile->cp_iommu_bypass; 712 dma_xfer_mode = 713 state->ts_cfg_profile->cp_streaming_consistent; 714 if (dma_xfer_mode == DDI_DMA_STREAMING) { 715 bind.bi_flags |= IBT_MR_NONCOHERENT; 716 } 717 } else { 718 bind.bi_bypass = TAVOR_BINDMEM_BYPASS; 719 } 720 } 721 status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt, 722 &mtt_pgsize_bits); 723 if (status != DDI_SUCCESS) { 724 kmem_free(wre_new, srq->srq_wq_bufsz * 725 sizeof (tavor_wrid_entry_t)); 726 tavor_queue_free(state, &new_srqinfo); 727 goto srqmodify_fail; 728 } 729 730 /* 731 * Calculate the offset between the kernel virtual address space 732 * and the IB virtual address space. This will be used when 733 * posting work requests to properly initialize each WQE. 734 * 735 * Note: bind addr is zero-based (from alloc) so we calculate the 736 * correct new offset here. 737 */ 738 bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1); 739 srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned - 740 (uint64_t)bind.bi_addr; 741 742 /* 743 * Get the base address for the MTT table. This will be necessary 744 * below when we are modifying the MPT entry. 745 */ 746 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT]; 747 mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset; 748 749 /* 750 * Fill in the MPT entry. This is the final step before passing 751 * ownership of the MPT entry to the Tavor hardware. We use all of 752 * the information collected/calculated above to fill in the 753 * requisite portions of the MPT. 754 */ 755 bzero(&mpt_entry, sizeof (tavor_hw_mpt_t)); 756 mpt_entry.reg_win_len = bind.bi_len; 757 mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT); 758 mpt_entry.mttseg_addr_h = mtt_addr >> 32; 759 mpt_entry.mttseg_addr_l = mtt_addr >> 6; 760 761 /* 762 * Now we grab the SRQ lock. Since we will be updating the actual 763 * SRQ location and the producer/consumer indexes, we should hold 764 * the lock. 765 * 766 * We do a TAVOR_NOSLEEP here (and below), though, because we are 767 * holding the "srq_lock" and if we got raised to interrupt level 768 * by priority inversion, we would not want to block in this routine 769 * waiting for success. 770 */ 771 mutex_enter(&srq->srq_lock); 772 773 /* 774 * Copy old entries to new buffer 775 */ 776 srq_old_bufsz = srq->srq_wq_bufsz; 777 bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz); 778 779 /* Determine if later ddi_dma_sync will be necessary */ 780 srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo); 781 782 /* Sync entire "new" SRQ for use by hardware (if necessary) */ 783 if (srq_sync) { 784 (void) ddi_dma_sync(bind.bi_dmahdl, 0, 785 new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV); 786 } 787 788 /* 789 * Setup MPT information for use in the MODIFY_MPT command 790 */ 791 mr = srq->srq_mrhdl; 792 mutex_enter(&mr->mr_lock); 793 mpt = srq->srq_mrhdl->mr_mptrsrcp; 794 795 /* 796 * MODIFY_MPT 797 * 798 * If this fails for any reason, then it is an indication that 799 * something (either in HW or SW) has gone seriously wrong. So we 800 * print a warning message and return. 801 */ 802 status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx, 803 TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag); 804 if (status != TAVOR_CMD_SUCCESS) { 805 cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n", 806 status); 807 (void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo, 808 srq->srq_mrhdl->mr_mttrsrcp); 809 kmem_free(wre_new, srq->srq_wq_bufsz * 810 sizeof (tavor_wrid_entry_t)); 811 tavor_queue_free(state, &new_srqinfo); 812 mutex_exit(&mr->mr_lock); 813 mutex_exit(&srq->srq_lock); 814 return (ibc_get_ci_failure(0)); 815 } 816 817 /* 818 * Update the Tavor Shared Receive Queue handle with all the new 819 * information. At the same time, save away all the necessary 820 * information for freeing up the old resources 821 */ 822 old_srqinfo = srq->srq_wqinfo; 823 old_mtt = srq->srq_mrhdl->mr_mttrsrcp; 824 bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind, 825 sizeof (tavor_bind_info_t)); 826 827 /* Now set the new info */ 828 srq->srq_wqinfo = new_srqinfo; 829 srq->srq_wq_buf = buf; 830 srq->srq_wq_bufsz = (1 << log_srq_size); 831 bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t)); 832 srq->srq_mrhdl->mr_mttrsrcp = mtt; 833 srq->srq_desc_off = srq_desc_off; 834 srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size); 835 836 /* Update MR mtt pagesize */ 837 mr->mr_logmttpgsz = mtt_pgsize_bits; 838 mutex_exit(&mr->mr_lock); 839 840 #ifdef __lock_lint 841 mutex_enter(&srq->srq_wrid_wql->wql_lock); 842 #else 843 if (srq->srq_wrid_wql != NULL) { 844 mutex_enter(&srq->srq_wrid_wql->wql_lock); 845 } 846 #endif 847 848 /* 849 * Initialize new wridlist, if needed. 850 * 851 * If a wridlist already is setup on an SRQ (the QP associated with an 852 * SRQ has moved "from_reset") then we must update this wridlist based 853 * on the new SRQ size. We allocate the new size of Work Request ID 854 * Entries, copy over the old entries to the new list, and 855 * re-initialize the srq wridlist in non-umap case 856 */ 857 wre_old = NULL; 858 if (srq->srq_wridlist != NULL) { 859 wre_old = srq->srq_wridlist->wl_wre; 860 861 bcopy(wre_old, wre_new, srq_old_bufsz * 862 sizeof (tavor_wrid_entry_t)); 863 864 /* Setup new sizes in wre */ 865 srq->srq_wridlist->wl_wre = wre_new; 866 srq->srq_wridlist->wl_size = srq->srq_wq_bufsz; 867 868 if (!srq->srq_is_umap) { 869 tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 870 srq_old_bufsz); 871 } 872 } 873 874 #ifdef __lock_lint 875 mutex_exit(&srq->srq_wrid_wql->wql_lock); 876 #else 877 if (srq->srq_wrid_wql != NULL) { 878 mutex_exit(&srq->srq_wrid_wql->wql_lock); 879 } 880 #endif 881 882 /* 883 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out 884 * to a user process, then we need to call devmap_devmem_remap() to 885 * invalidate the mapping to the SRQ memory. We also need to 886 * invalidate the SRQ tracking information for the user mapping. 887 * 888 * Note: On failure, the remap really shouldn't ever happen. So, if it 889 * does, it is an indication that something has gone seriously wrong. 890 * So we print a warning message and return error (knowing, of course, 891 * that the "old" SRQ memory will be leaked) 892 */ 893 if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) { 894 maxprot = (PROT_READ | PROT_WRITE | PROT_USER); 895 status = devmap_devmem_remap(srq->srq_umap_dhp, 896 state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot, 897 DEVMAP_MAPPING_INVALID, NULL); 898 if (status != DDI_SUCCESS) { 899 mutex_exit(&srq->srq_lock); 900 TAVOR_WARNING(state, "failed in SRQ memory " 901 "devmap_devmem_remap()"); 902 /* We can, however, free the memory for old wre */ 903 if (wre_old != NULL) { 904 kmem_free(wre_old, srq_old_bufsz * 905 sizeof (tavor_wrid_entry_t)); 906 } 907 return (ibc_get_ci_failure(0)); 908 } 909 srq->srq_umap_dhp = (devmap_cookie_t)NULL; 910 } 911 912 /* 913 * Drop the SRQ lock now. The only thing left to do is to free up 914 * the old resources. 915 */ 916 mutex_exit(&srq->srq_lock); 917 918 /* 919 * Unbind the MTT entries. 920 */ 921 status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt); 922 if (status != DDI_SUCCESS) { 923 TAVOR_WARNING(state, "failed to unbind old SRQ memory"); 924 goto srqmodify_fail; 925 } 926 927 /* Free the memory for old wre */ 928 if (wre_old != NULL) { 929 kmem_free(wre_old, srq_old_bufsz * 930 sizeof (tavor_wrid_entry_t)); 931 } 932 933 /* Free the memory for the old SRQ */ 934 tavor_queue_free(state, &old_srqinfo); 935 936 /* 937 * Fill in the return arguments (if necessary). This includes the 938 * real new completion queue size. 939 */ 940 if (real_size != NULL) { 941 *real_size = (1 << log_srq_size); 942 } 943 944 return (DDI_SUCCESS); 945 946 srqmodify_fail: 947 return (status); 948 } 949 950 951 /* 952 * tavor_srq_refcnt_inc() 953 * Context: Can be called from interrupt or base context. 954 */ 955 void 956 tavor_srq_refcnt_inc(tavor_srqhdl_t srq) 957 { 958 mutex_enter(&srq->srq_lock); 959 srq->srq_refcnt++; 960 mutex_exit(&srq->srq_lock); 961 } 962 963 964 /* 965 * tavor_srq_refcnt_dec() 966 * Context: Can be called from interrupt or base context. 967 */ 968 void 969 tavor_srq_refcnt_dec(tavor_srqhdl_t srq) 970 { 971 mutex_enter(&srq->srq_lock); 972 srq->srq_refcnt--; 973 mutex_exit(&srq->srq_lock); 974 } 975 976 977 /* 978 * tavor_srqhdl_from_srqnum() 979 * Context: Can be called from interrupt or base context. 980 * 981 * This routine is important because changing the unconstrained 982 * portion of the SRQ number is critical to the detection of a 983 * potential race condition in the SRQ handler code (i.e. the case 984 * where a SRQ is freed and alloc'd again before an event for the 985 * "old" SRQ can be handled). 986 * 987 * While this is not a perfect solution (not sure that one exists) 988 * it does help to mitigate the chance that this race condition will 989 * cause us to deliver a "stale" event to the new SRQ owner. Note: 990 * this solution does not scale well because the number of constrained 991 * bits increases (and, hence, the number of unconstrained bits 992 * decreases) as the number of supported SRQ grows. For small and 993 * intermediate values, it should hopefully provide sufficient 994 * protection. 995 */ 996 tavor_srqhdl_t 997 tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum) 998 { 999 uint_t srqindx, srqmask; 1000 1001 /* Calculate the SRQ table index from the srqnum */ 1002 srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1; 1003 srqindx = srqnum & srqmask; 1004 return (state->ts_srqhdl[srqindx]); 1005 } 1006 1007 1008 /* 1009 * tavor_srq_sgl_to_logwqesz() 1010 * Context: Can be called from interrupt or base context. 1011 */ 1012 static void 1013 tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl, 1014 tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl) 1015 { 1016 uint_t max_size, log2, actual_sgl; 1017 1018 switch (wq_type) { 1019 case TAVOR_QP_WQ_TYPE_RECVQ: 1020 /* 1021 * Use requested maximum SGL to calculate max descriptor size 1022 * (while guaranteeing that the descriptor size is a 1023 * power-of-2 cachelines). 1024 */ 1025 max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4)); 1026 log2 = highbit(max_size); 1027 if (ISP2(max_size)) { 1028 log2 = log2 - 1; 1029 } 1030 1031 /* Make sure descriptor is at least the minimum size */ 1032 log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM); 1033 1034 /* Calculate actual number of SGL (given WQE size) */ 1035 actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4; 1036 break; 1037 1038 default: 1039 TAVOR_WARNING(state, "unexpected work queue type"); 1040 break; 1041 } 1042 1043 /* Fill in the return values */ 1044 *logwqesz = log2; 1045 *max_sgl = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl); 1046 } 1047