1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * tavor_mr.c 29 * Tavor Memory Region/Window Routines 30 * 31 * Implements all the routines necessary to provide the requisite memory 32 * registration verbs. These include operations like RegisterMemRegion(), 33 * DeregisterMemRegion(), ReregisterMemRegion, RegisterSharedMemRegion, 34 * etc., that affect Memory Regions. It also includes the verbs that 35 * affect Memory Windows, including AllocMemWindow(), FreeMemWindow(), 36 * and QueryMemWindow(). 
37 */ 38 39 #include <sys/types.h> 40 #include <sys/conf.h> 41 #include <sys/ddi.h> 42 #include <sys/sunddi.h> 43 #include <sys/modctl.h> 44 #include <sys/esunddi.h> 45 46 #include <sys/ib/adapters/tavor/tavor.h> 47 48 49 /* 50 * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion 51 * of Tavor memory keys (LKeys and RKeys) 52 */ 53 static uint_t tavor_debug_memkey_cnt = 0x00000000; 54 55 static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd, 56 tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op); 57 static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr, 58 tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new, 59 tavor_mr_options_t *op); 60 static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr, 61 tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr, 62 uint_t sleep, uint_t *dereg_level); 63 static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state, 64 tavor_bind_info_t *bind, uint_t *mtt_pgsize); 65 static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind, 66 ddi_dma_handle_t dmahdl, uint_t sleep); 67 static void tavor_mr_mem_unbind(tavor_state_t *state, 68 tavor_bind_info_t *bind); 69 static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind, 70 uint32_t mtt_pgsize_bits); 71 static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc); 72 static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc); 73 74 /* 75 * The Tavor umem_lockmemory() callback ops. When userland memory is 76 * registered, these callback ops are specified. The tavor_umap_umemlock_cb() 77 * callback will be called whenever the memory for the corresponding 78 * ddi_umem_cookie_t is being freed. 79 */ 80 static struct umem_callback_ops tavor_umem_cbops = { 81 UMEM_CALLBACK_VERSION, 82 tavor_umap_umemlock_cb, 83 }; 84 85 86 /* 87 * tavor_mr_register() 88 * Context: Can be called from interrupt or base context. 
89 */ 90 int 91 tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd, 92 ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op) 93 { 94 tavor_bind_info_t bind; 95 int status; 96 97 TAVOR_TNF_ENTER(tavor_mr_register); 98 99 /* 100 * Fill in the "bind" struct. This struct provides the majority 101 * of the information that will be used to distinguish between an 102 * "addr" binding (as is the case here) and a "buf" binding (see 103 * below). The "bind" struct is later passed to tavor_mr_mem_bind() 104 * which does most of the "heavy lifting" for the Tavor memory 105 * registration routines. 106 */ 107 bind.bi_type = TAVOR_BINDHDL_VADDR; 108 bind.bi_addr = mr_attr->mr_vaddr; 109 bind.bi_len = mr_attr->mr_len; 110 bind.bi_as = mr_attr->mr_as; 111 bind.bi_flags = mr_attr->mr_flags; 112 status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op); 113 if (status != DDI_SUCCESS) { 114 TNF_PROBE_0(tavor_mr_register_cmnreg_fail, 115 TAVOR_TNF_ERROR, ""); 116 TAVOR_TNF_EXIT(tavor_mr_register); 117 return (status); 118 } 119 120 TAVOR_TNF_EXIT(tavor_mr_register); 121 return (DDI_SUCCESS); 122 } 123 124 125 /* 126 * tavor_mr_register_buf() 127 * Context: Can be called from interrupt or base context. 128 */ 129 int 130 tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd, 131 ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl, 132 tavor_mr_options_t *op) 133 { 134 tavor_bind_info_t bind; 135 int status; 136 137 TAVOR_TNF_ENTER(tavor_mr_register_buf); 138 139 /* 140 * Fill in the "bind" struct. This struct provides the majority 141 * of the information that will be used to distinguish between an 142 * "addr" binding (see above) and a "buf" binding (as is the case 143 * here). The "bind" struct is later passed to tavor_mr_mem_bind() 144 * which does most of the "heavy lifting" for the Tavor memory 145 * registration routines. Note: We have chosen to provide 146 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is 147 * not set). 
It is not critical what value we choose here as it need 148 * only be unique for the given RKey (which will happen by default), 149 * so the choice here is somewhat arbitrary. 150 */ 151 bind.bi_type = TAVOR_BINDHDL_BUF; 152 bind.bi_buf = buf; 153 if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) { 154 bind.bi_addr = mr_attr->mr_vaddr; 155 } else { 156 bind.bi_addr = (uint64_t)(uintptr_t)buf->b_un.b_addr; 157 } 158 bind.bi_as = NULL; 159 bind.bi_len = (uint64_t)buf->b_bcount; 160 bind.bi_flags = mr_attr->mr_flags; 161 status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op); 162 if (status != DDI_SUCCESS) { 163 TNF_PROBE_0(tavor_mr_register_buf_cmnreg_fail, 164 TAVOR_TNF_ERROR, ""); 165 TAVOR_TNF_EXIT(tavor_mr_register_buf); 166 return (status); 167 } 168 169 TAVOR_TNF_EXIT(tavor_mr_register_buf); 170 return (DDI_SUCCESS); 171 } 172 173 174 /* 175 * tavor_mr_register_shared() 176 * Context: Can be called from interrupt or base context. 177 */ 178 int 179 tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl, 180 tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new) 181 { 182 tavor_rsrc_pool_info_t *rsrc_pool; 183 tavor_rsrc_t *mpt, *mtt, *rsrc; 184 tavor_umap_db_entry_t *umapdb; 185 tavor_hw_mpt_t mpt_entry; 186 tavor_mrhdl_t mr; 187 tavor_bind_info_t *bind; 188 ddi_umem_cookie_t umem_cookie; 189 size_t umem_len; 190 caddr_t umem_addr; 191 uint64_t mtt_addr, mtt_ddrbaseaddr, pgsize_msk; 192 uint_t sleep, mr_is_umem; 193 int status, umem_flags; 194 char *errormsg; 195 196 TAVOR_TNF_ENTER(tavor_mr_register_shared); 197 198 /* 199 * Check the sleep flag. Ensure that it is consistent with the 200 * current thread context (i.e. if we are currently in the interrupt 201 * context, then we shouldn't be attempting to sleep). 202 */ 203 sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? 
TAVOR_NOSLEEP : 204 TAVOR_SLEEP; 205 if ((sleep == TAVOR_SLEEP) && 206 (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) { 207 /* Set "status" and "errormsg" and goto failure */ 208 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags"); 209 goto mrshared_fail; 210 } 211 212 /* Increment the reference count on the protection domain (PD) */ 213 tavor_pd_refcnt_inc(pd); 214 215 /* 216 * Allocate an MPT entry. This will be filled in with all the 217 * necessary parameters to define the shared memory region. 218 * Specifically, it will be made to reference the currently existing 219 * MTT entries and ownership of the MPT will be passed to the hardware 220 * in the last step below. If we fail here, we must undo the 221 * protection domain reference count. 222 */ 223 status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt); 224 if (status != DDI_SUCCESS) { 225 /* Set "status" and "errormsg" and goto failure */ 226 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT"); 227 goto mrshared_fail1; 228 } 229 230 /* 231 * Allocate the software structure for tracking the shared memory 232 * region (i.e. the Tavor Memory Region handle). If we fail here, we 233 * must undo the protection domain reference count and the previous 234 * resource allocation. 235 */ 236 status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc); 237 if (status != DDI_SUCCESS) { 238 /* Set "status" and "errormsg" and goto failure */ 239 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle"); 240 goto mrshared_fail2; 241 } 242 mr = (tavor_mrhdl_t)rsrc->tr_addr; 243 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr)) 244 245 /* 246 * Setup and validate the memory region access flags. This means 247 * translating the IBTF's enable flags into the access flags that 248 * will be used in later operations. 
249 */ 250 mr->mr_accflag = 0; 251 if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND) 252 mr->mr_accflag |= IBT_MR_WINDOW_BIND; 253 if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE) 254 mr->mr_accflag |= IBT_MR_LOCAL_WRITE; 255 if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ) 256 mr->mr_accflag |= IBT_MR_REMOTE_READ; 257 if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE) 258 mr->mr_accflag |= IBT_MR_REMOTE_WRITE; 259 if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC) 260 mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC; 261 262 /* 263 * Calculate keys (Lkey, Rkey) from MPT index. Each key is formed 264 * from a certain number of "constrained" bits (the least significant 265 * bits) and some number of "unconstrained" bits. The constrained 266 * bits must be set to the index of the entry in the MPT table, but 267 * the unconstrained bits can be set to any value we wish. Note: 268 * if no remote access is required, then the RKey value is not filled 269 * in. Otherwise both Rkey and LKey are given the same value. 270 */ 271 tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey); 272 if ((mr->mr_accflag & IBT_MR_REMOTE_READ) || 273 (mr->mr_accflag & IBT_MR_REMOTE_WRITE) || 274 (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) { 275 mr->mr_rkey = mr->mr_lkey; 276 } 277 278 /* Grab the MR lock for the current memory region */ 279 mutex_enter(&mrhdl->mr_lock); 280 281 /* 282 * Check here to see if the memory region has already been partially 283 * deregistered as a result of a tavor_umap_umemlock_cb() callback. 284 * If so, this is an error, return failure. 285 */ 286 if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) { 287 mutex_exit(&mrhdl->mr_lock); 288 /* Set "status" and "errormsg" and goto failure */ 289 TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl"); 290 goto mrshared_fail3; 291 } 292 293 /* 294 * Determine if the original memory was from userland and, if so, pin 295 * the pages (again) with umem_lockmemory(). 
This will guarantee a 296 * separate callback for each of this shared region's MR handles. 297 * If this is userland memory, then allocate an entry in the 298 * "userland resources database". This will later be added to 299 * the database (after all further memory registration operations are 300 * successful). If we fail here, we must undo all the above setup. 301 */ 302 mr_is_umem = mrhdl->mr_is_umem; 303 if (mr_is_umem) { 304 umem_len = ptob(btopr(mrhdl->mr_bindinfo.bi_len + 305 ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET))); 306 umem_addr = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr & 307 ~PAGEOFFSET); 308 umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ | 309 DDI_UMEMLOCK_LONGTERM); 310 status = umem_lockmemory(umem_addr, umem_len, umem_flags, 311 &umem_cookie, &tavor_umem_cbops, NULL); 312 if (status != 0) { 313 mutex_exit(&mrhdl->mr_lock); 314 /* Set "status" and "errormsg" and goto failure */ 315 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin"); 316 goto mrshared_fail3; 317 } 318 319 umapdb = tavor_umap_db_alloc(state->ts_instance, 320 (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC, 321 (uint64_t)(uintptr_t)rsrc); 322 if (umapdb == NULL) { 323 mutex_exit(&mrhdl->mr_lock); 324 /* Set "status" and "errormsg" and goto failure */ 325 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add"); 326 goto mrshared_fail4; 327 } 328 } 329 330 /* 331 * Copy the MTT resource pointer (and additional parameters) from 332 * the original Tavor Memory Region handle. Note: this is normally 333 * where the tavor_mr_mem_bind() routine would be called, but because 334 * we already have bound and filled-in MTT entries it is simply a 335 * matter here of managing the MTT reference count and grabbing the 336 * address of the MTT table entries (for filling in the shared region's 337 * MPT entry). 
338 */ 339 mr->mr_mttrsrcp = mrhdl->mr_mttrsrcp; 340 mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz; 341 mr->mr_bindinfo = mrhdl->mr_bindinfo; 342 mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp; 343 mutex_exit(&mrhdl->mr_lock); 344 bind = &mr->mr_bindinfo; 345 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind)) 346 mtt = mr->mr_mttrsrcp; 347 348 /* 349 * Increment the MTT reference count (to reflect the fact that 350 * the MTT is now shared) 351 */ 352 (void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp); 353 354 /* 355 * Update the new "bind" virtual address. Do some extra work here 356 * to ensure proper alignment. That is, make sure that the page 357 * offset for the beginning of the old range is the same as the 358 * offset for this new mapping 359 */ 360 pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1); 361 bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) | 362 (mr->mr_bindinfo.bi_addr & pgsize_msk)); 363 364 /* 365 * Get the base address for the MTT table. This will be necessary 366 * in the next step when we are setting up the MPT entry. 367 */ 368 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT]; 369 mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset; 370 371 /* 372 * Fill in the MPT entry. This is the final step before passing 373 * ownership of the MPT entry to the Tavor hardware. We use all of 374 * the information collected/calculated above to fill in the 375 * requisite portions of the MPT. 376 */ 377 bzero(&mpt_entry, sizeof (tavor_hw_mpt_t)); 378 mpt_entry.m_io = TAVOR_MEM_CYCLE_GENERATE; 379 mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND) ? 1 : 0; 380 mpt_entry.atomic = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0; 381 mpt_entry.rw = (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ? 1 : 0; 382 mpt_entry.rr = (mr->mr_accflag & IBT_MR_REMOTE_READ) ? 1 : 0; 383 mpt_entry.lw = (mr->mr_accflag & IBT_MR_LOCAL_WRITE) ? 
1 : 0; 384 mpt_entry.lr = 1; 385 mpt_entry.reg_win = TAVOR_MPT_IS_REGION; 386 mpt_entry.page_sz = mr->mr_logmttpgsz - 0xC; 387 mpt_entry.mem_key = mr->mr_lkey; 388 mpt_entry.pd = pd->pd_pdnum; 389 mpt_entry.start_addr = bind->bi_addr; 390 mpt_entry.reg_win_len = bind->bi_len; 391 mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND; 392 mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT); 393 mpt_entry.mttseg_addr_h = mtt_addr >> 32; 394 mpt_entry.mttseg_addr_l = mtt_addr >> 6; 395 396 /* 397 * Write the MPT entry to hardware. Lastly, we pass ownership of 398 * the entry to the hardware. Note: in general, this operation 399 * shouldn't fail. But if it does, we have to undo everything we've 400 * done above before returning error. 401 */ 402 status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry, 403 sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep); 404 if (status != TAVOR_CMD_SUCCESS) { 405 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n", 406 status); 407 TNF_PROBE_1(tavor_mr_register_shared_sw2hw_mpt_cmd_fail, 408 TAVOR_TNF_ERROR, "", tnf_uint, status, status); 409 /* Set "status" and "errormsg" and goto failure */ 410 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), 411 "tavor SW2HW_MPT command"); 412 goto mrshared_fail5; 413 } 414 415 /* 416 * Fill in the rest of the Tavor Memory Region handle. Having 417 * successfully transferred ownership of the MPT, we can update the 418 * following fields for use in further operations on the MR. 419 */ 420 mr->mr_mptrsrcp = mpt; 421 mr->mr_mttrsrcp = mtt; 422 mr->mr_pdhdl = pd; 423 mr->mr_rsrcp = rsrc; 424 mr->mr_is_umem = mr_is_umem; 425 mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL; 426 mr->mr_umem_cbfunc = NULL; 427 mr->mr_umem_cbarg1 = NULL; 428 mr->mr_umem_cbarg2 = NULL; 429 430 /* 431 * If this is userland memory, then we need to insert the previously 432 * allocated entry into the "userland resources database". 
This will 433 * allow for later coordination between the tavor_umap_umemlock_cb() 434 * callback and tavor_mr_deregister(). 435 */ 436 if (mr_is_umem) { 437 tavor_umap_db_add(umapdb); 438 } 439 440 *mrhdl_new = mr; 441 442 TAVOR_TNF_EXIT(tavor_mr_register_shared); 443 return (DDI_SUCCESS); 444 445 /* 446 * The following is cleanup for all possible failure cases in this routine 447 */ 448 mrshared_fail5: 449 (void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp); 450 if (mr_is_umem) { 451 tavor_umap_db_free(umapdb); 452 } 453 mrshared_fail4: 454 if (mr_is_umem) { 455 ddi_umem_unlock(umem_cookie); 456 } 457 mrshared_fail3: 458 tavor_rsrc_free(state, &rsrc); 459 mrshared_fail2: 460 tavor_rsrc_free(state, &mpt); 461 mrshared_fail1: 462 tavor_pd_refcnt_dec(pd); 463 mrshared_fail: 464 TNF_PROBE_1(tavor_mr_register_shared_fail, TAVOR_TNF_ERROR, "", 465 tnf_string, msg, errormsg); 466 TAVOR_TNF_EXIT(tavor_mr_register_shared); 467 return (status); 468 } 469 470 471 /* 472 * tavor_mr_deregister() 473 * Context: Can be called from interrupt or base context. 474 */ 475 /* ARGSUSED */ 476 int 477 tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level, 478 uint_t sleep) 479 { 480 tavor_rsrc_t *mpt, *mtt, *rsrc, *mtt_refcnt; 481 tavor_umap_db_entry_t *umapdb; 482 tavor_pdhdl_t pd; 483 tavor_mrhdl_t mr; 484 tavor_bind_info_t *bind; 485 uint64_t value; 486 int status, shared_mtt; 487 char *errormsg; 488 489 TAVOR_TNF_ENTER(tavor_mr_deregister); 490 491 /* 492 * Check the sleep flag. Ensure that it is consistent with the 493 * current thread context (i.e. if we are currently in the interrupt 494 * context, then we shouldn't be attempting to sleep). 
495 */ 496 if ((sleep == TAVOR_SLEEP) && 497 (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) { 498 /* Set "status" and "errormsg" and goto failure */ 499 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags"); 500 TNF_PROBE_1(tavor_mr_deregister_fail, TAVOR_TNF_ERROR, "", 501 tnf_string, msg, errormsg); 502 TAVOR_TNF_EXIT(tavor_mr_deregister); 503 return (status); 504 } 505 506 /* 507 * Pull all the necessary information from the Tavor Memory Region 508 * handle. This is necessary here because the resource for the 509 * MR handle is going to be freed up as part of the this 510 * deregistration 511 */ 512 mr = *mrhdl; 513 mutex_enter(&mr->mr_lock); 514 mpt = mr->mr_mptrsrcp; 515 mtt = mr->mr_mttrsrcp; 516 mtt_refcnt = mr->mr_mttrefcntp; 517 rsrc = mr->mr_rsrcp; 518 pd = mr->mr_pdhdl; 519 bind = &mr->mr_bindinfo; 520 521 /* 522 * Check here to see if the memory region has already been partially 523 * deregistered as a result of the tavor_umap_umemlock_cb() callback. 524 * If so, then jump to the end and free the remaining resources. 525 */ 526 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) { 527 goto mrdereg_finish_cleanup; 528 } 529 530 /* 531 * We must drop the "mr_lock" here to ensure that both SLEEP and 532 * NOSLEEP calls into the firmware work as expected. Also, if two 533 * threads are attemping to access this MR (via de-register, 534 * re-register, or otherwise), then we allow the firmware to enforce 535 * the checking, that only one deregister is valid. 536 */ 537 mutex_exit(&mr->mr_lock); 538 539 /* 540 * Reclaim MPT entry from hardware (if necessary). Since the 541 * tavor_mr_deregister() routine is used in the memory region 542 * reregistration process as well, it is possible that we will 543 * not always wish to reclaim ownership of the MPT. Check the 544 * "level" arg and, if necessary, attempt to reclaim it. If 545 * the ownership transfer fails for any reason, we check to see 546 * what command status was returned from the hardware. 
The only 547 * "expected" error status is the one that indicates an attempt to 548 * deregister a memory region that has memory windows bound to it 549 */ 550 if (level >= TAVOR_MR_DEREG_ALL) { 551 status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, 552 NULL, 0, mpt->tr_indx, sleep); 553 if (status != TAVOR_CMD_SUCCESS) { 554 if (status == TAVOR_CMD_REG_BOUND) { 555 TAVOR_TNF_EXIT(tavor_mr_deregister); 556 return (IBT_MR_IN_USE); 557 } else { 558 cmn_err(CE_CONT, "Tavor: HW2SW_MPT command " 559 "failed: %08x\n", status); 560 TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, 561 TAVOR_TNF_ERROR, "", tnf_uint, status, 562 status); 563 TAVOR_TNF_EXIT(tavor_mr_deregister); 564 return (IBT_INVALID_PARAM); 565 } 566 } 567 } 568 569 /* 570 * Re-grab the mr_lock here. Since further access to the protected 571 * 'mr' structure is needed, and we would have returned previously for 572 * the multiple deregistration case, we can safely grab the lock here. 573 */ 574 mutex_enter(&mr->mr_lock); 575 576 /* 577 * If the memory had come from userland, then we do a lookup in the 578 * "userland resources database". On success, we free the entry, call 579 * ddi_umem_unlock(), and continue the cleanup. On failure (which is 580 * an indication that the umem_lockmemory() callback has called 581 * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate 582 * the "mr_umemcookie" field in the MR handle (this will be used 583 * later to detect that only partial cleaup still remains to be done 584 * on the MR handle). 585 */ 586 if (mr->mr_is_umem) { 587 status = tavor_umap_db_find(state->ts_instance, 588 (uint64_t)(uintptr_t)mr->mr_umemcookie, 589 MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE, 590 &umapdb); 591 if (status == DDI_SUCCESS) { 592 tavor_umap_db_free(umapdb); 593 ddi_umem_unlock(mr->mr_umemcookie); 594 } else { 595 ddi_umem_unlock(mr->mr_umemcookie); 596 mr->mr_umemcookie = NULL; 597 } 598 } 599 600 /* 601 * Decrement the MTT reference count. 
Since the MTT resource 602 * may be shared between multiple memory regions (as a result 603 * of a "RegisterSharedMR" verb) it is important that we not 604 * free up or unbind resources prematurely. If it's not shared (as 605 * indicated by the return status), then free the resource. 606 */ 607 shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt); 608 if (!shared_mtt) { 609 tavor_rsrc_free(state, &mtt_refcnt); 610 } 611 612 /* 613 * Free up the MTT entries and unbind the memory. Here, as above, we 614 * attempt to free these resources only if it is appropriate to do so. 615 */ 616 if (!shared_mtt) { 617 if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) { 618 tavor_mr_mem_unbind(state, bind); 619 } 620 tavor_rsrc_free(state, &mtt); 621 } 622 623 /* 624 * If the MR handle has been invalidated, then drop the 625 * lock and return success. Note: This only happens because 626 * the umem_lockmemory() callback has been triggered. The 627 * cleanup here is partial, and further cleanup (in a 628 * subsequent tavor_mr_deregister() call) will be necessary. 629 */ 630 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) { 631 mutex_exit(&mr->mr_lock); 632 TAVOR_TNF_EXIT(tavor_mr_deregister); 633 return (DDI_SUCCESS); 634 } 635 636 mrdereg_finish_cleanup: 637 mutex_exit(&mr->mr_lock); 638 639 /* Free the Tavor Memory Region handle */ 640 tavor_rsrc_free(state, &rsrc); 641 642 /* Free up the MPT entry resource */ 643 tavor_rsrc_free(state, &mpt); 644 645 /* Decrement the reference count on the protection domain (PD) */ 646 tavor_pd_refcnt_dec(pd); 647 648 /* Set the mrhdl pointer to NULL and return success */ 649 *mrhdl = NULL; 650 651 TAVOR_TNF_EXIT(tavor_mr_deregister); 652 return (DDI_SUCCESS); 653 } 654 655 656 /* 657 * tavor_mr_query() 658 * Context: Can be called from interrupt or base context. 
659 */ 660 /* ARGSUSED */ 661 int 662 tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr, 663 ibt_mr_query_attr_t *attr) 664 { 665 TAVOR_TNF_ENTER(tavor_mr_query); 666 667 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr)) 668 669 mutex_enter(&mr->mr_lock); 670 671 /* 672 * Check here to see if the memory region has already been partially 673 * deregistered as a result of a tavor_umap_umemlock_cb() callback. 674 * If so, this is an error, return failure. 675 */ 676 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) { 677 mutex_exit(&mr->mr_lock); 678 TNF_PROBE_0(tavor_mr_query_inv_mrhdl_fail, TAVOR_TNF_ERROR, ""); 679 TAVOR_TNF_EXIT(tavor_mr_query); 680 return (IBT_MR_HDL_INVALID); 681 } 682 683 /* Fill in the queried attributes */ 684 attr->mr_attr_flags = mr->mr_accflag; 685 attr->mr_pd = (ibt_pd_hdl_t)mr->mr_pdhdl; 686 687 /* Fill in the "local" attributes */ 688 attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey; 689 attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr; 690 attr->mr_lbounds.pb_len = (size_t)mr->mr_bindinfo.bi_len; 691 692 /* 693 * Fill in the "remote" attributes (if necessary). Note: the 694 * remote attributes are only valid if the memory region has one 695 * or more of the remote access flags set. 696 */ 697 if ((mr->mr_accflag & IBT_MR_REMOTE_READ) || 698 (mr->mr_accflag & IBT_MR_REMOTE_WRITE) || 699 (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) { 700 attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey; 701 attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr; 702 attr->mr_rbounds.pb_len = (size_t)mr->mr_bindinfo.bi_len; 703 } 704 705 /* 706 * If region is mapped for streaming (i.e. noncoherent), then set sync 707 * is required 708 */ 709 attr->mr_sync_required = (mr->mr_bindinfo.bi_flags & 710 IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE; 711 712 mutex_exit(&mr->mr_lock); 713 TAVOR_TNF_EXIT(tavor_mr_query); 714 return (DDI_SUCCESS); 715 } 716 717 718 /* 719 * tavor_mr_reregister() 720 * Context: Can be called from interrupt or base context. 
721 */ 722 int 723 tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr, 724 tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new, 725 tavor_mr_options_t *op) 726 { 727 tavor_bind_info_t bind; 728 int status; 729 730 TAVOR_TNF_ENTER(tavor_mr_reregister); 731 732 /* 733 * Fill in the "bind" struct. This struct provides the majority 734 * of the information that will be used to distinguish between an 735 * "addr" binding (as is the case here) and a "buf" binding (see 736 * below). The "bind" struct is later passed to tavor_mr_mem_bind() 737 * which does most of the "heavy lifting" for the Tavor memory 738 * registration (and reregistration) routines. 739 */ 740 bind.bi_type = TAVOR_BINDHDL_VADDR; 741 bind.bi_addr = mr_attr->mr_vaddr; 742 bind.bi_len = mr_attr->mr_len; 743 bind.bi_as = mr_attr->mr_as; 744 bind.bi_flags = mr_attr->mr_flags; 745 status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op); 746 if (status != DDI_SUCCESS) { 747 TNF_PROBE_0(tavor_mr_reregister_cmnreg_fail, 748 TAVOR_TNF_ERROR, ""); 749 TAVOR_TNF_EXIT(tavor_mr_reregister); 750 return (status); 751 } 752 753 TAVOR_TNF_EXIT(tavor_mr_reregister); 754 return (DDI_SUCCESS); 755 } 756 757 758 /* 759 * tavor_mr_reregister_buf() 760 * Context: Can be called from interrupt or base context. 761 */ 762 int 763 tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr, 764 tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf, 765 tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op) 766 { 767 tavor_bind_info_t bind; 768 int status; 769 770 TAVOR_TNF_ENTER(tavor_mr_reregister_buf); 771 772 /* 773 * Fill in the "bind" struct. This struct provides the majority 774 * of the information that will be used to distinguish between an 775 * "addr" binding (see above) and a "buf" binding (as is the case 776 * here). The "bind" struct is later passed to tavor_mr_mem_bind() 777 * which does most of the "heavy lifting" for the Tavor memory 778 * registration routines. 
Note: We have chosen to provide 779 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is 780 * not set). It is not critical what value we choose here as it need 781 * only be unique for the given RKey (which will happen by default), 782 * so the choice here is somewhat arbitrary. 783 */ 784 bind.bi_type = TAVOR_BINDHDL_BUF; 785 bind.bi_buf = buf; 786 if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) { 787 bind.bi_addr = mr_attr->mr_vaddr; 788 } else { 789 bind.bi_addr = (uint64_t)(uintptr_t)buf->b_un.b_addr; 790 } 791 bind.bi_len = (uint64_t)buf->b_bcount; 792 bind.bi_flags = mr_attr->mr_flags; 793 bind.bi_as = NULL; 794 status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op); 795 if (status != DDI_SUCCESS) { 796 TNF_PROBE_0(tavor_mr_reregister_buf_cmnreg_fail, 797 TAVOR_TNF_ERROR, ""); 798 TAVOR_TNF_EXIT(tavor_mr_reregister_buf); 799 return (status); 800 } 801 802 TAVOR_TNF_EXIT(tavor_mr_reregister_buf); 803 return (DDI_SUCCESS); 804 } 805 806 807 /* 808 * tavor_mr_sync() 809 * Context: Can be called from interrupt or base context. 810 */ 811 /* ARGSUSED */ 812 int 813 tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs) 814 { 815 tavor_mrhdl_t mrhdl; 816 uint64_t seg_vaddr, seg_len, seg_end; 817 uint64_t mr_start, mr_end; 818 uint_t type; 819 int status, i; 820 char *errormsg; 821 822 TAVOR_TNF_ENTER(tavor_mr_sync); 823 824 /* Process each of the ibt_mr_sync_t's */ 825 for (i = 0; i < num_segs; i++) { 826 mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle; 827 828 /* Check for valid memory region handle */ 829 if (mrhdl == NULL) { 830 /* Set "status" and "errormsg" and goto failure */ 831 TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl"); 832 goto mrsync_fail; 833 } 834 835 mutex_enter(&mrhdl->mr_lock); 836 837 /* 838 * Check here to see if the memory region has already been 839 * partially deregistered as a result of a 840 * tavor_umap_umemlock_cb() callback. If so, this is an 841 * error, return failure. 
842 */ 843 if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) { 844 mutex_exit(&mrhdl->mr_lock); 845 /* Set "status" and "errormsg" and goto failure */ 846 TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl2"); 847 goto mrsync_fail; 848 } 849 850 /* Check for valid bounds on sync request */ 851 seg_vaddr = mr_segs[i].ms_vaddr; 852 seg_len = mr_segs[i].ms_len; 853 seg_end = seg_vaddr + seg_len - 1; 854 mr_start = mrhdl->mr_bindinfo.bi_addr; 855 mr_end = mr_start + mrhdl->mr_bindinfo.bi_len - 1; 856 if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) { 857 mutex_exit(&mrhdl->mr_lock); 858 /* Set "status" and "errormsg" and goto failure */ 859 TAVOR_TNF_FAIL(IBT_MR_VA_INVALID, "invalid vaddr"); 860 goto mrsync_fail; 861 } 862 if ((seg_end < mr_start) || (seg_end > mr_end)) { 863 mutex_exit(&mrhdl->mr_lock); 864 /* Set "status" and "errormsg" and goto failure */ 865 TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length"); 866 goto mrsync_fail; 867 } 868 869 /* Determine what type (i.e. direction) for sync */ 870 if (mr_segs[i].ms_flags & IBT_SYNC_READ) { 871 type = DDI_DMA_SYNC_FORDEV; 872 } else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) { 873 type = DDI_DMA_SYNC_FORCPU; 874 } else { 875 mutex_exit(&mrhdl->mr_lock); 876 /* Set "status" and "errormsg" and goto failure */ 877 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sync type"); 878 goto mrsync_fail; 879 } 880 881 (void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl, 882 (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type); 883 mutex_exit(&mrhdl->mr_lock); 884 } 885 886 TAVOR_TNF_EXIT(tavor_mr_sync); 887 return (DDI_SUCCESS); 888 889 mrsync_fail: 890 TNF_PROBE_1(tavor_mr_sync_fail, TAVOR_TNF_ERROR, "", tnf_string, msg, 891 errormsg); 892 TAVOR_TNF_EXIT(tavor_mr_sync); 893 return (status); 894 } 895 896 897 /* 898 * tavor_mw_alloc() 899 * Context: Can be called from interrupt or base context. 
 *
 *    Allocates one MPT entry for use as a memory window (the Tavor MPT
 *    table is shared between memory regions and memory windows), ties it
 *    to the protection domain "pd", and passes ownership of the entry to
 *    the hardware.  On success the new window handle is returned through
 *    "mwhdl".  Returns DDI_SUCCESS or an IBTF error status.
 */
int
tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
    tavor_mwhdl_t *mwhdl)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mwhdl_t		mw;
	uint_t			sleep;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mw_alloc);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mwalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry (for use as a memory window).  Since the
	 * Tavor hardware uses the MPT entry for memory regions and for
	 * memory windows, we will fill in this MPT with all the necessary
	 * parameters for the memory window.  And then (just as we do for
	 * memory regions) ownership will be passed to the hardware in the
	 * final step below.  If we fail here, we must undo the protection
	 * domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT");
		goto mwalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory window (i.e.
	 * the Tavor Memory Window handle).  Note: This is actually the same
	 * software structure used for tracking memory regions, but since many
	 * of the same properties are needed, only a single structure is
	 * necessary.  If we fail here, we must undo the protection domain
	 * reference count and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle");
		goto mwalloc_fail2;
	}
	mw = (tavor_mwhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Calculate an "unbound" RKey from MPT index.  In much the same way
	 * as we do for memory regions (above), this key is constructed from
	 * a "constrained" (which depends on the MPT index) and an
	 * "unconstrained" portion (which may be arbitrarily chosen).
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.  Note: fewer entries in the MPT
	 * entry are necessary to allocate a memory window.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win	= TAVOR_MPT_IS_WINDOW;
	mpt_entry.mem_key	= mw->mr_rkey;
	mpt_entry.pd		= pd->pd_pdnum;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mw_alloc_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "tavor SW2HW_MPT command");
		goto mwalloc_fail3;
	}

	/*
	 * Fill in the rest of the Tavor Memory Window handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MW.
	 */
	mw->mr_mptrsrcp	= mpt;
	mw->mr_pdhdl	= pd;
	mw->mr_rsrcp	= rsrc;
	*mwhdl = mw;

	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (DDI_SUCCESS);

/*
 * Cleanup for the failure cases above: each label undoes everything
 * acquired before the corresponding "goto" (handle, MPT entry, PD ref).
 */
mwalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
mwalloc_fail2:
	tavor_rsrc_free(state, &mpt);
mwalloc_fail1:
	tavor_pd_refcnt_dec(pd);
mwalloc_fail:
	TNF_PROBE_1(tavor_mw_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_mw_alloc);
	return (status);
}


/*
 * tavor_mw_free()
 *    Context: Can be called from interrupt or base context.
 *
 *    Frees the memory window "*mwhdl": reclaims the MPT entry from the
 *    hardware, releases the handle and MPT resources, and drops the
 *    reference on the associated protection domain.  On success the
 *    caller's handle pointer is cleared.
 */
int
tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_mwhdl_t		mw;
	int			status;
	char			*errormsg;
	tavor_pdhdl_t		pd;

	TAVOR_TNF_ENTER(tavor_mw_free);

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid sleep flags");
		TNF_PROBE_1(tavor_mw_free_fail, TAVOR_TNF_ERROR, "",
		    tnf_string, msg, errormsg);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (status);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Window
	 * handle.  This is necessary here because the resource for the
	 * MW handle is going to be freed up as part of this operation.
	 */
	mw	= *mwhdl;
	mutex_enter(&mw->mr_lock);
	mpt	= mw->mr_mptrsrcp;
	rsrc	= mw->mr_rsrcp;
	pd	= mw->mr_pdhdl;
	mutex_exit(&mw->mr_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Reclaim the MPT entry from hardware.  Note: in general, it is
	 * unexpected for this operation to return an error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
	    0, mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_hw2sw_mpt_cmd_fail, TAVOR_TNF_ERROR, "",
		    tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_mw_free);
		return (IBT_INVALID_PARAM);
	}

	/* Free the Tavor Memory Window handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mwhdl pointer to NULL and return success */
	*mwhdl = NULL;

	TAVOR_TNF_EXIT(tavor_mw_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_keycalc()
 *    Context: Can be called from interrupt or base context.
1107 */ 1108 void 1109 tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key) 1110 { 1111 uint32_t tmp, log_num_mpt; 1112 1113 /* 1114 * Generate a simple key from counter. Note: We increment this 1115 * static variable _intentionally_ without any kind of mutex around 1116 * it. First, single-threading all operations through a single lock 1117 * would be a bad idea (from a performance point-of-view). Second, 1118 * the upper "unconstrained" bits don't really have to be unique 1119 * because the lower bits are guaranteed to be (although we do make a 1120 * best effort to ensure that they are). Third, the window for the 1121 * race (where both threads read and update the counter at the same 1122 * time) is incredibly small. 1123 * And, lastly, we'd like to make this into a "random" key XXX 1124 */ 1125 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_memkey_cnt)) 1126 log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt; 1127 tmp = (tavor_debug_memkey_cnt++) << log_num_mpt; 1128 *key = tmp | indx; 1129 } 1130 1131 1132 /* 1133 * tavor_mr_common_reg() 1134 * Context: Can be called from interrupt or base context. 1135 */ 1136 static int 1137 tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd, 1138 tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op) 1139 { 1140 tavor_rsrc_pool_info_t *rsrc_pool; 1141 tavor_rsrc_t *mpt, *mtt, *rsrc, *mtt_refcnt; 1142 tavor_umap_db_entry_t *umapdb; 1143 tavor_sw_refcnt_t *swrc_tmp; 1144 tavor_hw_mpt_t mpt_entry; 1145 tavor_mrhdl_t mr; 1146 ibt_mr_flags_t flags; 1147 tavor_bind_info_t *bh; 1148 ddi_dma_handle_t bind_dmahdl; 1149 ddi_umem_cookie_t umem_cookie; 1150 size_t umem_len; 1151 caddr_t umem_addr; 1152 uint64_t mtt_addr, mtt_ddrbaseaddr, max_sz; 1153 uint_t sleep, mtt_pgsize_bits, bind_type, mr_is_umem; 1154 int status, umem_flags, bind_override_addr; 1155 char *errormsg; 1156 1157 TAVOR_TNF_ENTER(tavor_mr_common_reg); 1158 1159 /* 1160 * Check the "options" flag. 
Currently this flag tells the driver 1161 * whether or not the region should be bound normally (i.e. with 1162 * entries written into the PCI IOMMU), whether it should be 1163 * registered to bypass the IOMMU, and whether or not the resulting 1164 * address should be "zero-based" (to aid the alignment restrictions 1165 * for QPs). 1166 */ 1167 if (op == NULL) { 1168 bind_type = TAVOR_BINDMEM_NORMAL; 1169 bind_dmahdl = NULL; 1170 bind_override_addr = 0; 1171 } else { 1172 bind_type = op->mro_bind_type; 1173 bind_dmahdl = op->mro_bind_dmahdl; 1174 bind_override_addr = op->mro_bind_override_addr; 1175 } 1176 1177 /* Extract the flags field from the tavor_bind_info_t */ 1178 flags = bind->bi_flags; 1179 1180 /* 1181 * Check for invalid length. Check is the length is zero or if the 1182 * length is larger than the maximum configured value. Return error 1183 * if it is. 1184 */ 1185 max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz); 1186 if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) { 1187 /* Set "status" and "errormsg" and goto failure */ 1188 TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length"); 1189 goto mrcommon_fail; 1190 } 1191 1192 /* 1193 * Check the sleep flag. Ensure that it is consistent with the 1194 * current thread context (i.e. if we are currently in the interrupt 1195 * context, then we shouldn't be attempting to sleep). 1196 */ 1197 sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP; 1198 if ((sleep == TAVOR_SLEEP) && 1199 (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) { 1200 /* Set "status" and "errormsg" and goto failure */ 1201 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags"); 1202 goto mrcommon_fail; 1203 } 1204 1205 /* 1206 * Get the base address for the MTT table. This will be necessary 1207 * below when we are setting up the MPT entry. 
1208 */ 1209 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT]; 1210 mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset; 1211 1212 /* Increment the reference count on the protection domain (PD) */ 1213 tavor_pd_refcnt_inc(pd); 1214 1215 /* 1216 * Allocate an MPT entry. This will be filled in with all the 1217 * necessary parameters to define the memory region. And then 1218 * ownership will be passed to the hardware in the final step 1219 * below. If we fail here, we must undo the protection domain 1220 * reference count. 1221 */ 1222 status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt); 1223 if (status != DDI_SUCCESS) { 1224 /* Set "status" and "errormsg" and goto failure */ 1225 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MPT"); 1226 goto mrcommon_fail1; 1227 } 1228 1229 /* 1230 * Allocate the software structure for tracking the memory region (i.e. 1231 * the Tavor Memory Region handle). If we fail here, we must undo 1232 * the protection domain reference count and the previous resource 1233 * allocation. 1234 */ 1235 status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc); 1236 if (status != DDI_SUCCESS) { 1237 /* Set "status" and "errormsg" and goto failure */ 1238 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MR handle"); 1239 goto mrcommon_fail2; 1240 } 1241 mr = (tavor_mrhdl_t)rsrc->tr_addr; 1242 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr)) 1243 1244 /* 1245 * Setup and validate the memory region access flags. This means 1246 * translating the IBTF's enable flags into the access flags that 1247 * will be used in later operations. 
1248 */ 1249 mr->mr_accflag = 0; 1250 if (flags & IBT_MR_ENABLE_WINDOW_BIND) 1251 mr->mr_accflag |= IBT_MR_WINDOW_BIND; 1252 if (flags & IBT_MR_ENABLE_LOCAL_WRITE) 1253 mr->mr_accflag |= IBT_MR_LOCAL_WRITE; 1254 if (flags & IBT_MR_ENABLE_REMOTE_READ) 1255 mr->mr_accflag |= IBT_MR_REMOTE_READ; 1256 if (flags & IBT_MR_ENABLE_REMOTE_WRITE) 1257 mr->mr_accflag |= IBT_MR_REMOTE_WRITE; 1258 if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC) 1259 mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC; 1260 1261 /* 1262 * Calculate keys (Lkey, Rkey) from MPT index. Each key is formed 1263 * from a certain number of "constrained" bits (the least significant 1264 * bits) and some number of "unconstrained" bits. The constrained 1265 * bits must be set to the index of the entry in the MPT table, but 1266 * the unconstrained bits can be set to any value we wish. Note: 1267 * if no remote access is required, then the RKey value is not filled 1268 * in. Otherwise both Rkey and LKey are given the same value. 1269 */ 1270 tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey); 1271 if ((mr->mr_accflag & IBT_MR_REMOTE_READ) || 1272 (mr->mr_accflag & IBT_MR_REMOTE_WRITE) || 1273 (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) { 1274 mr->mr_rkey = mr->mr_lkey; 1275 } 1276 1277 /* 1278 * Determine if the memory is from userland and pin the pages 1279 * with umem_lockmemory() if necessary. 1280 * Then, if this is userland memory, allocate an entry in the 1281 * "userland resources database". This will later be added to 1282 * the database (after all further memory registration operations are 1283 * successful). If we fail here, we must undo the reference counts 1284 * and the previous resource allocations. 1285 */ 1286 mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 
1 : 0); 1287 if (mr_is_umem) { 1288 umem_len = ptob(btopr(bind->bi_len + 1289 ((uintptr_t)bind->bi_addr & PAGEOFFSET))); 1290 umem_addr = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET); 1291 umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ | 1292 DDI_UMEMLOCK_LONGTERM); 1293 status = umem_lockmemory(umem_addr, umem_len, umem_flags, 1294 &umem_cookie, &tavor_umem_cbops, NULL); 1295 if (status != 0) { 1296 /* Set "status" and "errormsg" and goto failure */ 1297 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umem pin"); 1298 goto mrcommon_fail3; 1299 } 1300 1301 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind)) 1302 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf)) 1303 1304 bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len, 1305 B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP); 1306 if (bind->bi_buf == NULL) { 1307 /* Set "status" and "errormsg" and goto failure */ 1308 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed iosetup"); 1309 goto mrcommon_fail3; 1310 } 1311 bind->bi_type = TAVOR_BINDHDL_UBUF; 1312 bind->bi_buf->b_flags |= B_READ; 1313 1314 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf)) 1315 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind)) 1316 1317 umapdb = tavor_umap_db_alloc(state->ts_instance, 1318 (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC, 1319 (uint64_t)(uintptr_t)rsrc); 1320 if (umapdb == NULL) { 1321 /* Set "status" and "errormsg" and goto failure */ 1322 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add"); 1323 goto mrcommon_fail4; 1324 } 1325 } 1326 1327 /* 1328 * Setup the bindinfo for the mtt bind call 1329 */ 1330 bh = &mr->mr_bindinfo; 1331 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh)) 1332 bcopy(bind, bh, sizeof (tavor_bind_info_t)); 1333 bh->bi_bypass = bind_type; 1334 status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt, 1335 &mtt_pgsize_bits); 1336 if (status != DDI_SUCCESS) { 1337 /* Set "status" and "errormsg" and goto failure */ 1338 TAVOR_TNF_FAIL(status, "failed mtt bind"); 1339 /* 1340 * When mtt_bind fails, freerbuf has already 
been done, 1341 * so make sure not to call it again. 1342 */ 1343 bind->bi_type = bh->bi_type; 1344 goto mrcommon_fail5; 1345 } 1346 mr->mr_logmttpgsz = mtt_pgsize_bits; 1347 1348 /* 1349 * Allocate MTT reference count (to track shared memory regions). 1350 * This reference count resource may never be used on the given 1351 * memory region, but if it is ever later registered as "shared" 1352 * memory region then this resource will be necessary. If we fail 1353 * here, we do pretty much the same as above to clean up. 1354 */ 1355 status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep, 1356 &mtt_refcnt); 1357 if (status != DDI_SUCCESS) { 1358 /* Set "status" and "errormsg" and goto failure */ 1359 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed refence count"); 1360 goto mrcommon_fail6; 1361 } 1362 mr->mr_mttrefcntp = mtt_refcnt; 1363 swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr; 1364 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp)) 1365 TAVOR_MTT_REFCNT_INIT(swrc_tmp); 1366 1367 /* 1368 * Fill in the MPT entry. This is the final step before passing 1369 * ownership of the MPT entry to the Tavor hardware. We use all of 1370 * the information collected/calculated above to fill in the 1371 * requisite portions of the MPT. 1372 */ 1373 bzero(&mpt_entry, sizeof (tavor_hw_mpt_t)); 1374 mpt_entry.m_io = TAVOR_MEM_CYCLE_GENERATE; 1375 mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND) ? 1 : 0; 1376 mpt_entry.atomic = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0; 1377 mpt_entry.rw = (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ? 1 : 0; 1378 mpt_entry.rr = (mr->mr_accflag & IBT_MR_REMOTE_READ) ? 1 : 0; 1379 mpt_entry.lw = (mr->mr_accflag & IBT_MR_LOCAL_WRITE) ? 
1 : 0; 1380 mpt_entry.lr = 1; 1381 mpt_entry.reg_win = TAVOR_MPT_IS_REGION; 1382 mpt_entry.page_sz = mr->mr_logmttpgsz - 0xC; 1383 mpt_entry.mem_key = mr->mr_lkey; 1384 mpt_entry.pd = pd->pd_pdnum; 1385 if (bind_override_addr == 0) { 1386 mpt_entry.start_addr = bh->bi_addr; 1387 } else { 1388 bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1); 1389 mpt_entry.start_addr = bh->bi_addr; 1390 } 1391 mpt_entry.reg_win_len = bh->bi_len; 1392 mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND; 1393 mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT); 1394 mpt_entry.mttseg_addr_h = mtt_addr >> 32; 1395 mpt_entry.mttseg_addr_l = mtt_addr >> 6; 1396 1397 /* 1398 * Write the MPT entry to hardware. Lastly, we pass ownership of 1399 * the entry to the hardware. Note: in general, this operation 1400 * shouldn't fail. But if it does, we have to undo everything we've 1401 * done above before returning error. 1402 */ 1403 status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry, 1404 sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep); 1405 if (status != TAVOR_CMD_SUCCESS) { 1406 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n", 1407 status); 1408 TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail, 1409 TAVOR_TNF_ERROR, "", tnf_uint, status, status); 1410 /* Set "status" and "errormsg" and goto failure */ 1411 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), 1412 "tavor SW2HW_MPT command"); 1413 goto mrcommon_fail7; 1414 } 1415 1416 /* 1417 * Fill in the rest of the Tavor Memory Region handle. Having 1418 * successfully transferred ownership of the MPT, we can update the 1419 * following fields for use in further operations on the MR. 1420 */ 1421 mr->mr_mptrsrcp = mpt; 1422 mr->mr_mttrsrcp = mtt; 1423 mr->mr_pdhdl = pd; 1424 mr->mr_rsrcp = rsrc; 1425 mr->mr_is_umem = mr_is_umem; 1426 mr->mr_umemcookie = (mr_is_umem != 0) ? 
umem_cookie : NULL; 1427 mr->mr_umem_cbfunc = NULL; 1428 mr->mr_umem_cbarg1 = NULL; 1429 mr->mr_umem_cbarg2 = NULL; 1430 1431 /* 1432 * If this is userland memory, then we need to insert the previously 1433 * allocated entry into the "userland resources database". This will 1434 * allow for later coordination between the tavor_umap_umemlock_cb() 1435 * callback and tavor_mr_deregister(). 1436 */ 1437 if (mr_is_umem) { 1438 tavor_umap_db_add(umapdb); 1439 } 1440 1441 *mrhdl = mr; 1442 1443 TAVOR_TNF_EXIT(tavor_mr_common_reg); 1444 return (DDI_SUCCESS); 1445 1446 /* 1447 * The following is cleanup for all possible failure cases in this routine 1448 */ 1449 mrcommon_fail7: 1450 tavor_rsrc_free(state, &mtt_refcnt); 1451 mrcommon_fail6: 1452 tavor_rsrc_free(state, &mtt); 1453 tavor_mr_mem_unbind(state, bh); 1454 bind->bi_type = bh->bi_type; 1455 mrcommon_fail5: 1456 if (mr_is_umem) { 1457 tavor_umap_db_free(umapdb); 1458 } 1459 mrcommon_fail4: 1460 if (mr_is_umem) { 1461 /* 1462 * Free up the memory ddi_umem_iosetup() allocates 1463 * internally. 1464 */ 1465 if (bind->bi_type == TAVOR_BINDHDL_UBUF) { 1466 freerbuf(bind->bi_buf); 1467 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind)) 1468 bind->bi_type = TAVOR_BINDHDL_NONE; 1469 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind)) 1470 } 1471 ddi_umem_unlock(umem_cookie); 1472 } 1473 mrcommon_fail3: 1474 tavor_rsrc_free(state, &rsrc); 1475 mrcommon_fail2: 1476 tavor_rsrc_free(state, &mpt); 1477 mrcommon_fail1: 1478 tavor_pd_refcnt_dec(pd); 1479 mrcommon_fail: 1480 TNF_PROBE_1(tavor_mr_common_reg_fail, TAVOR_TNF_ERROR, "", 1481 tnf_string, msg, errormsg); 1482 TAVOR_TNF_EXIT(tavor_mr_common_reg); 1483 return (status); 1484 } 1485 1486 /* 1487 * tavor_mr_mtt_bind() 1488 * Context: Can be called from interrupt or base context. 
1489 */ 1490 int 1491 tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind, 1492 ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits) 1493 { 1494 uint64_t nummtt; 1495 uint_t sleep; 1496 int status; 1497 char *errormsg; 1498 1499 TAVOR_TNF_ENTER(tavor_mr_common_reg); 1500 1501 /* 1502 * Check the sleep flag. Ensure that it is consistent with the 1503 * current thread context (i.e. if we are currently in the interrupt 1504 * context, then we shouldn't be attempting to sleep). 1505 */ 1506 sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP; 1507 if ((sleep == TAVOR_SLEEP) && 1508 (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) { 1509 /* Set "status" and "errormsg" and goto failure */ 1510 TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags"); 1511 goto mrmttbind_fail; 1512 } 1513 1514 /* 1515 * Bind the memory and determine the mapped addresses. This is 1516 * the first of two routines that do all the "heavy lifting" for 1517 * the Tavor memory registration routines. The tavor_mr_mem_bind() 1518 * routine takes the "bind" struct with all its fields filled 1519 * in and returns a list of DMA cookies (for the PCI mapped addresses 1520 * corresponding to the specified address region) which are used by 1521 * the tavor_mr_fast_mtt_write() routine below. If we fail here, we 1522 * must undo all the previous resource allocation (and PD reference 1523 * count). 1524 */ 1525 status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep); 1526 if (status != DDI_SUCCESS) { 1527 /* Set "status" and "errormsg" and goto failure */ 1528 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind"); 1529 goto mrmttbind_fail; 1530 } 1531 1532 /* 1533 * Determine number of pages spanned. This routine uses the 1534 * information in the "bind" struct to determine the required 1535 * number of MTT entries needed (and returns the suggested page size - 1536 * as a "power-of-2" - for each MTT entry). 
1537 */ 1538 nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits); 1539 1540 /* 1541 * Allocate the MTT entries. Use the calculations performed above to 1542 * allocate the required number of MTT entries. Note: MTT entries are 1543 * allocated in "MTT segments" which consist of complete cachelines 1544 * (i.e. 8 entries, 16 entries, etc.) So the TAVOR_NUMMTT_TO_MTTSEG() 1545 * macro is used to do the proper conversion. If we fail here, we 1546 * must not only undo all the previous resource allocation (and PD 1547 * reference count), but we must also unbind the memory. 1548 */ 1549 status = tavor_rsrc_alloc(state, TAVOR_MTT, 1550 TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt); 1551 if (status != DDI_SUCCESS) { 1552 /* Set "status" and "errormsg" and goto failure */ 1553 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT"); 1554 goto mrmttbind_fail2; 1555 } 1556 1557 /* 1558 * Write the mapped addresses into the MTT entries. This is part two 1559 * of the "heavy lifting" routines that we talked about above. Note: 1560 * we pass the suggested page size from the earlier operation here. 1561 * And if we fail here, we again do pretty much the same huge clean up. 
1562 */ 1563 status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits); 1564 if (status != DDI_SUCCESS) { 1565 /* Set "status" and "errormsg" and goto failure */ 1566 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed write mtt"); 1567 goto mrmttbind_fail3; 1568 } 1569 TAVOR_TNF_EXIT(tavor_mr_mtt_bind); 1570 return (DDI_SUCCESS); 1571 1572 /* 1573 * The following is cleanup for all possible failure cases in this routine 1574 */ 1575 mrmttbind_fail3: 1576 tavor_rsrc_free(state, mtt); 1577 mrmttbind_fail2: 1578 tavor_mr_mem_unbind(state, bind); 1579 mrmttbind_fail: 1580 TNF_PROBE_1(tavor_mr_mtt_bind_fail, TAVOR_TNF_ERROR, "", 1581 tnf_string, msg, errormsg); 1582 TAVOR_TNF_EXIT(tavor_mr_mtt_bind); 1583 return (status); 1584 } 1585 1586 1587 /* 1588 * tavor_mr_mtt_unbind() 1589 * Context: Can be called from interrupt or base context. 1590 */ 1591 int 1592 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind, 1593 tavor_rsrc_t *mtt) 1594 { 1595 TAVOR_TNF_ENTER(tavor_mr_mtt_unbind); 1596 1597 /* 1598 * Free up the MTT entries and unbind the memory. Here, as above, we 1599 * attempt to free these resources only if it is appropriate to do so. 1600 */ 1601 tavor_mr_mem_unbind(state, bind); 1602 tavor_rsrc_free(state, &mtt); 1603 1604 TAVOR_TNF_EXIT(tavor_mr_mtt_unbind); 1605 return (DDI_SUCCESS); 1606 } 1607 1608 1609 /* 1610 * tavor_mr_common_rereg() 1611 * Context: Can be called from interrupt or base context. 
 *
 *    Common reregistration path: temporarily reclaims the MPT entry from
 *    the hardware, applies the requested PD/access/translation changes,
 *    and writes the updated MPT entry back.  Note that on most failure
 *    paths the region is deregistered entirely (the handle is freed), so
 *    callers must not reuse "mr" after an error return.
 */
static int
tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op)
{
	tavor_rsrc_t		*mpt;
	ibt_mr_attr_flags_t	acc_flags_to_use;
	ibt_mr_flags_t		flags;
	tavor_pdhdl_t		pd_to_use;
	tavor_hw_mpt_t		mpt_entry;
	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
	uint_t			sleep, dereg_level;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_mr_common_rereg);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))

	/*
	 * Check here to see if the memory region corresponds to a userland
	 * mapping.  Reregistration of userland memory regions is not
	 * currently supported.  Return failure. XXX
	 */
	if (mr->mr_is_umem) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_MR_HDL_INVALID, "invalid mrhdl");
		goto mrrereg_fail;
	}

	mutex_enter(&mr->mr_lock);

	/* Pull MPT resource pointer from the Tavor Memory Region handle */
	mpt = mr->mr_mptrsrcp;

	/* Extract the flags field from the tavor_bind_info_t */
	flags = bind->bi_flags;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		mutex_exit(&mr->mr_lock);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid flags");
		goto mrrereg_fail;
	}

	/*
	 * First step is to temporarily invalidate the MPT entry.  This
	 * regains ownership from the hardware, and gives us the opportunity
	 * to modify the entry.  Note: The HW2SW_MPT command returns the
	 * current MPT entry contents.  These are saved away here because
	 * they will be reused in a later step below.  If the region has
	 * bound memory windows that we fail returning an "in use" error code.
	 * Otherwise, this is an unexpected error and we deregister the
	 * memory region and return error.
	 *
	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
	 * against holding the lock around this rereg call in all contexts.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		if (status == TAVOR_CMD_REG_BOUND) {
			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
			return (IBT_MR_IN_USE);
		} else {
			cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
			    "%08x\n", status);

			/*
			 * Call deregister and ensure that all current
			 * resources get freed up
			 */
			if (tavor_mr_deregister(state, &mr,
			    TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}
			TNF_PROBE_1(tavor_mr_common_rereg_hw2sw_mpt_cmd_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
			TAVOR_TNF_EXIT(tavor_mr_common_rereg);
			return (ibc_get_ci_failure(0));
		}
	}

	/*
	 * If we're changing the protection domain, then validate the new one
	 */
	if (flags & IBT_MR_CHANGE_PD) {

		/* Check for valid PD handle pointer */
		if (pd == NULL) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all current
			 * resources get properly freed up.  Unnecessary
			 * here to attempt to regain software ownership
			 * of the MPT entry as that has already been
			 * done above.
			 */
			if (tavor_mr_deregister(state, &mr,
			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
			    DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
			goto mrrereg_fail;
		}

		/* Use the new PD handle in all operations below */
		pd_to_use = pd;

	} else {
		/* Use the current PD handle in all operations below */
		pd_to_use = mr->mr_pdhdl;
	}

	/*
	 * If we're changing access permissions, then validate the new ones
	 */
	if (flags & IBT_MR_CHANGE_ACCESS) {
		/*
		 * Validate the access flags.  Both remote write and remote
		 * atomic require the local write flag to be set
		 */
		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all current
			 * resources get properly freed up.  Unnecessary
			 * here to attempt to regain software ownership
			 * of the MPT entry as that has already been
			 * done above.
			 */
			if (tavor_mr_deregister(state, &mr,
			    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
			    DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_MR_ACCESS_REQ_INVALID,
			    "invalid access flags");
			goto mrrereg_fail;
		}

		/*
		 * Setup and validate the memory region access flags.  This
		 * means translating the IBTF's enable flags into the access
		 * flags that will be used in later operations.
		 */
		acc_flags_to_use = 0;
		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
		if (flags & IBT_MR_ENABLE_REMOTE_READ)
			acc_flags_to_use |= IBT_MR_REMOTE_READ;
		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;

	} else {
		/* No access change requested: keep the current flags */
		acc_flags_to_use = mr->mr_accflag;
	}

	/*
	 * If we're modifying the translation, then figure out whether
	 * we can reuse the current MTT resources.  This means calling
	 * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
	 * for the reregistration.  If the current memory region contains
	 * sufficient MTT entries for the new regions, then it will be
	 * reused and filled in.  Otherwise, new entries will be allocated,
	 * the old ones will be freed, and the new entries will be filled
	 * in.  Note: If we're not modifying the translation, then we
	 * should already have all the information we need to update the MPT.
	 * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
	 * a "dereg_level" which is the level of cleanup that needs to be
	 * passed to tavor_mr_deregister() to finish the cleanup.
	 */
	if (flags & IBT_MR_CHANGE_TRANSLATION) {
		status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
		    &mtt_addr_to_use, sleep, &dereg_level);
		if (status != DDI_SUCCESS) {
			mutex_exit(&mr->mr_lock);
			/*
			 * Call deregister and ensure that all resources get
			 * properly freed up.
			 */
			if (tavor_mr_deregister(state, &mr, dereg_level,
			    sleep) != DDI_SUCCESS) {
				TAVOR_WARNING(state, "failed to deregister "
				    "memory region");
			}

			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(status, "failed rereg helper");
			goto mrrereg_fail;
		}
		vaddr_to_use = mr->mr_bindinfo.bi_addr;
		len_to_use   = mr->mr_bindinfo.bi_len;
	} else {
		/*
		 * Reconstruct the MTT address from the saved MPT contents
		 * returned by the HW2SW_MPT command above.
		 */
		mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
		    ((uint64_t)mpt_entry.mttseg_addr_l << 6));
		vaddr_to_use = mr->mr_bindinfo.bi_addr;
		len_to_use   = mr->mr_bindinfo.bi_len;
	}

	/*
	 * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
	 * when the region was first registered, each key is formed from
	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
	 * access is required, then the RKey value is not filled in.  Otherwise
	 * both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/*
	 * Update the MPT entry with the new information.  Some of this
	 * information is retained from the previous operation, some of
	 * it is new based on request.
	 */
	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.page_sz	= mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key	= mr->mr_lkey;
	mpt_entry.pd		= pd_to_use->pd_pdnum;
	mpt_entry.start_addr	= vaddr_to_use;
	mpt_entry.reg_win_len	= len_to_use;
	mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;

	/*
	 * Write the updated MPT entry to hardware
	 *
	 * We use TAVOR_CMD_NOSLEEP_SPIN here always because we must protect
	 * against holding the lock around this rereg call in all contexts.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		mutex_exit(&mr->mr_lock);
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		/*
		 * Call deregister and ensure that all current resources get
		 * properly freed up.  Unnecessary here to attempt to regain
		 * software ownership of the MPT entry as that has already
		 * been done above.
		 */
		if (tavor_mr_deregister(state, &mr,
		    TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to deregister memory "
			    "region");
		}
		TNF_PROBE_1(tavor_mr_common_rereg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_mr_common_rereg);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * If we're changing PD, then update their reference counts now.
	 * This means decrementing the reference count on the old PD and
	 * incrementing the reference count on the new PD.
	 */
	if (flags & IBT_MR_CHANGE_PD) {
		tavor_pd_refcnt_dec(mr->mr_pdhdl);
		tavor_pd_refcnt_inc(pd);
	}

	/*
	 * Update the contents of the Tavor Memory Region handle to reflect
	 * what has been changed.
1907 */ 1908 mr->mr_pdhdl = pd_to_use; 1909 mr->mr_accflag = acc_flags_to_use; 1910 mr->mr_is_umem = 0; 1911 mr->mr_umemcookie = NULL; 1912 1913 /* New MR handle is same as the old */ 1914 *mrhdl_new = mr; 1915 mutex_exit(&mr->mr_lock); 1916 1917 TAVOR_TNF_EXIT(tavor_mr_common_rereg); 1918 return (DDI_SUCCESS); 1919 1920 mrrereg_fail: 1921 TNF_PROBE_1(tavor_mr_common_rereg_fail, TAVOR_TNF_ERROR, "", 1922 tnf_string, msg, errormsg); 1923 TAVOR_TNF_EXIT(tavor_mr_common_rereg); 1924 return (status); 1925 } 1926 1927 1928 /* 1929 * tavor_mr_rereg_xlat_helper 1930 * Context: Can be called from interrupt or base context. 1931 * Note: This routine expects the "mr_lock" to be held when it 1932 * is called. Upon returning failure, this routine passes information 1933 * about what "dereg_level" should be passed to tavor_mr_deregister(). 1934 */ 1935 static int 1936 tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr, 1937 tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr, 1938 uint_t sleep, uint_t *dereg_level) 1939 { 1940 tavor_rsrc_pool_info_t *rsrc_pool; 1941 tavor_rsrc_t *mtt, *mtt_refcnt; 1942 tavor_sw_refcnt_t *swrc_old, *swrc_new; 1943 ddi_dma_handle_t dmahdl; 1944 uint64_t nummtt_needed, nummtt_in_currrsrc, max_sz; 1945 uint64_t mtt_ddrbaseaddr; 1946 uint_t mtt_pgsize_bits, bind_type, reuse_dmahdl; 1947 int status; 1948 char *errormsg; 1949 1950 TAVOR_TNF_ENTER(tavor_mr_rereg_xlat_helper); 1951 1952 ASSERT(MUTEX_HELD(&mr->mr_lock)); 1953 1954 /* 1955 * Check the "options" flag. Currently this flag tells the driver 1956 * whether or not the region should be bound normally (i.e. with 1957 * entries written into the PCI IOMMU) or whether it should be 1958 * registered to bypass the IOMMU. 1959 */ 1960 if (op == NULL) { 1961 bind_type = TAVOR_BINDMEM_NORMAL; 1962 } else { 1963 bind_type = op->mro_bind_type; 1964 } 1965 1966 /* 1967 * Check for invalid length. 
Check is the length is zero or if the 1968 * length is larger than the maximum configured value. Return error 1969 * if it is. 1970 */ 1971 max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz); 1972 if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) { 1973 /* 1974 * Deregister will be called upon returning failure from this 1975 * routine. This will ensure that all current resources get 1976 * properly freed up. Unnecessary to attempt to regain 1977 * software ownership of the MPT entry as that has already 1978 * been done above (in tavor_mr_reregister()) 1979 */ 1980 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT; 1981 1982 /* Set "status" and "errormsg" and goto failure */ 1983 TAVOR_TNF_FAIL(IBT_MR_LEN_INVALID, "invalid length"); 1984 goto mrrereghelp_fail; 1985 } 1986 1987 /* 1988 * Determine the number of pages necessary for new region and the 1989 * number of pages supported by the current MTT resources 1990 */ 1991 nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits); 1992 nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT; 1993 1994 /* 1995 * Depending on whether we have enough pages or not, the next step is 1996 * to fill in a set of MTT entries that reflect the new mapping. In 1997 * the first case below, we already have enough entries. This means 1998 * we need to unbind the memory from the previous mapping, bind the 1999 * memory for the new mapping, write the new MTT entries, and update 2000 * the mr to reflect the changes. 2001 * In the second case below, we do not have enough entries in the 2002 * current mapping. So, in this case, we need not only to unbind the 2003 * current mapping, but we need to free up the MTT resources associated 2004 * with that mapping. After we've successfully done that, we continue 2005 * by binding the new memory, allocating new MTT entries, writing the 2006 * new MTT entries, and updating the mr to reflect the changes. 
2007 */ 2008 2009 /* 2010 * If this region is being shared (i.e. MTT refcount != 1), then we 2011 * can't reuse the current MTT resources regardless of their size. 2012 * Instead we'll need to alloc new ones (below) just as if there 2013 * hadn't been enough room in the current entries. 2014 */ 2015 swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr; 2016 if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) && 2017 (nummtt_needed <= nummtt_in_currrsrc)) { 2018 2019 /* 2020 * Unbind the old mapping for this memory region, but retain 2021 * the ddi_dma_handle_t (if possible) for reuse in the bind 2022 * operation below. Note: If original memory region was 2023 * bound for IOMMU bypass and the new region can not use 2024 * bypass, then a new DMA handle will be necessary. 2025 */ 2026 if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) { 2027 mr->mr_bindinfo.bi_free_dmahdl = 0; 2028 tavor_mr_mem_unbind(state, &mr->mr_bindinfo); 2029 dmahdl = mr->mr_bindinfo.bi_dmahdl; 2030 reuse_dmahdl = 1; 2031 } else { 2032 tavor_mr_mem_unbind(state, &mr->mr_bindinfo); 2033 dmahdl = NULL; 2034 reuse_dmahdl = 0; 2035 } 2036 2037 /* 2038 * Bind the new memory and determine the mapped addresses. 2039 * As described, this routine and tavor_mr_fast_mtt_write() 2040 * do the majority of the work for the memory registration 2041 * operations. Note: When we successfully finish the binding, 2042 * we will set the "bi_free_dmahdl" flag to indicate that 2043 * even though we may have reused the ddi_dma_handle_t we do 2044 * wish it to be freed up at some later time. Note also that 2045 * if we fail, we may need to cleanup the ddi_dma_handle_t. 2046 */ 2047 bind->bi_bypass = bind_type; 2048 status = tavor_mr_mem_bind(state, bind, dmahdl, sleep); 2049 if (status != DDI_SUCCESS) { 2050 if (reuse_dmahdl) { 2051 ddi_dma_free_handle(&dmahdl); 2052 } 2053 2054 /* 2055 * Deregister will be called upon returning failure 2056 * from this routine. 
This will ensure that all 2057 * current resources get properly freed up. 2058 * Unnecessary to attempt to regain software ownership 2059 * of the MPT entry as that has already been done 2060 * above (in tavor_mr_reregister()). Also unnecessary 2061 * to attempt to unbind the memory. 2062 */ 2063 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND; 2064 2065 /* Set "status" and "errormsg" and goto failure */ 2066 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind"); 2067 goto mrrereghelp_fail; 2068 } 2069 if (reuse_dmahdl) { 2070 bind->bi_free_dmahdl = 1; 2071 } 2072 2073 /* 2074 * Using the new mapping, but reusing the current MTT 2075 * resources, write the updated entries to MTT 2076 */ 2077 mtt = mr->mr_mttrsrcp; 2078 status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits); 2079 if (status != DDI_SUCCESS) { 2080 /* 2081 * Deregister will be called upon returning failure 2082 * from this routine. This will ensure that all 2083 * current resources get properly freed up. 2084 * Unnecessary to attempt to regain software ownership 2085 * of the MPT entry as that has already been done 2086 * above (in tavor_mr_reregister()). Also unnecessary 2087 * to attempt to unbind the memory. 2088 * 2089 * But we do need to unbind the newly bound memory 2090 * before returning. 2091 */ 2092 tavor_mr_mem_unbind(state, bind); 2093 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND; 2094 2095 /* Set "status" and "errormsg" and goto failure */ 2096 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), 2097 "failed write mtt"); 2098 goto mrrereghelp_fail; 2099 } 2100 2101 /* Put the updated information into the Mem Region handle */ 2102 mr->mr_bindinfo = *bind; 2103 mr->mr_logmttpgsz = mtt_pgsize_bits; 2104 2105 } else { 2106 /* 2107 * Check if the memory region MTT is shared by any other MRs. 2108 * Since the resource may be shared between multiple memory 2109 * regions (as a result of a "RegisterSharedMR()" verb) it is 2110 * important that we not unbind any resources prematurely. 
2111 */ 2112 if (!TAVOR_MTT_IS_SHARED(swrc_old)) { 2113 /* 2114 * Unbind the old mapping for this memory region, but 2115 * retain the ddi_dma_handle_t for reuse in the bind 2116 * operation below. Note: This can only be done here 2117 * because the region being reregistered is not 2118 * currently shared. Also if original memory region 2119 * was bound for IOMMU bypass and the new region can 2120 * not use bypass, then a new DMA handle will be 2121 * necessary. 2122 */ 2123 if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) { 2124 mr->mr_bindinfo.bi_free_dmahdl = 0; 2125 tavor_mr_mem_unbind(state, &mr->mr_bindinfo); 2126 dmahdl = mr->mr_bindinfo.bi_dmahdl; 2127 reuse_dmahdl = 1; 2128 } else { 2129 tavor_mr_mem_unbind(state, &mr->mr_bindinfo); 2130 dmahdl = NULL; 2131 reuse_dmahdl = 0; 2132 } 2133 } else { 2134 dmahdl = NULL; 2135 reuse_dmahdl = 0; 2136 } 2137 2138 /* 2139 * Bind the new memory and determine the mapped addresses. 2140 * As described, this routine and tavor_mr_fast_mtt_write() 2141 * do the majority of the work for the memory registration 2142 * operations. Note: When we successfully finish the binding, 2143 * we will set the "bi_free_dmahdl" flag to indicate that 2144 * even though we may have reused the ddi_dma_handle_t we do 2145 * wish it to be freed up at some later time. Note also that 2146 * if we fail, we may need to cleanup the ddi_dma_handle_t. 2147 */ 2148 bind->bi_bypass = bind_type; 2149 status = tavor_mr_mem_bind(state, bind, dmahdl, sleep); 2150 if (status != DDI_SUCCESS) { 2151 if (reuse_dmahdl) { 2152 ddi_dma_free_handle(&dmahdl); 2153 } 2154 2155 /* 2156 * Deregister will be called upon returning failure 2157 * from this routine. This will ensure that all 2158 * current resources get properly freed up. 2159 * Unnecessary to attempt to regain software ownership 2160 * of the MPT entry as that has already been done 2161 * above (in tavor_mr_reregister()). Also unnecessary 2162 * to attempt to unbind the memory. 
2163 */ 2164 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND; 2165 2166 /* Set "status" and "errormsg" and goto failure */ 2167 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed mem bind"); 2168 goto mrrereghelp_fail; 2169 } 2170 if (reuse_dmahdl) { 2171 bind->bi_free_dmahdl = 1; 2172 } 2173 2174 /* 2175 * Allocate the new MTT entries resource 2176 */ 2177 status = tavor_rsrc_alloc(state, TAVOR_MTT, 2178 TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt); 2179 if (status != DDI_SUCCESS) { 2180 /* 2181 * Deregister will be called upon returning failure 2182 * from this routine. This will ensure that all 2183 * current resources get properly freed up. 2184 * Unnecessary to attempt to regain software ownership 2185 * of the MPT entry as that has already been done 2186 * above (in tavor_mr_reregister()). Also unnecessary 2187 * to attempt to unbind the memory. 2188 * 2189 * But we do need to unbind the newly bound memory 2190 * before returning. 2191 */ 2192 tavor_mr_mem_unbind(state, bind); 2193 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND; 2194 2195 /* Set "status" and "errormsg" and goto failure */ 2196 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed MTT"); 2197 goto mrrereghelp_fail; 2198 } 2199 2200 /* 2201 * Allocate MTT reference count (to track shared memory 2202 * regions). As mentioned elsewhere above, this reference 2203 * count resource may never be used on the given memory region, 2204 * but if it is ever later registered as a "shared" memory 2205 * region then this resource will be necessary. Note: This 2206 * is only necessary here if the existing memory region is 2207 * already being shared (because otherwise we already have 2208 * a useable reference count resource). 2209 */ 2210 if (TAVOR_MTT_IS_SHARED(swrc_old)) { 2211 status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, 2212 sleep, &mtt_refcnt); 2213 if (status != DDI_SUCCESS) { 2214 /* 2215 * Deregister will be called upon returning 2216 * failure from this routine. 
This will ensure 2217 * that all current resources get properly 2218 * freed up. Unnecessary to attempt to regain 2219 * software ownership of the MPT entry as that 2220 * has already been done above (in 2221 * tavor_mr_reregister()). Also unnecessary 2222 * to attempt to unbind the memory. 2223 * 2224 * But we need to unbind the newly bound 2225 * memory and free up the newly allocated MTT 2226 * entries before returning. 2227 */ 2228 tavor_mr_mem_unbind(state, bind); 2229 tavor_rsrc_free(state, &mtt); 2230 *dereg_level = 2231 TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND; 2232 2233 /* Set "status"/"errormsg", goto failure */ 2234 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, 2235 "failed reference count"); 2236 goto mrrereghelp_fail; 2237 } 2238 swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr; 2239 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new)) 2240 TAVOR_MTT_REFCNT_INIT(swrc_new); 2241 } else { 2242 mtt_refcnt = mr->mr_mttrefcntp; 2243 } 2244 2245 /* 2246 * Using the new mapping and the new MTT resources, write the 2247 * updated entries to MTT 2248 */ 2249 status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits); 2250 if (status != DDI_SUCCESS) { 2251 /* 2252 * Deregister will be called upon returning failure 2253 * from this routine. This will ensure that all 2254 * current resources get properly freed up. 2255 * Unnecessary to attempt to regain software ownership 2256 * of the MPT entry as that has already been done 2257 * above (in tavor_mr_reregister()). Also unnecessary 2258 * to attempt to unbind the memory. 2259 * 2260 * But we need to unbind the newly bound memory, 2261 * free up the newly allocated MTT entries, and 2262 * (possibly) free the new MTT reference count 2263 * resource before returning. 
2264 */ 2265 if (TAVOR_MTT_IS_SHARED(swrc_old)) { 2266 tavor_rsrc_free(state, &mtt_refcnt); 2267 } 2268 tavor_mr_mem_unbind(state, bind); 2269 tavor_rsrc_free(state, &mtt); 2270 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND; 2271 2272 /* Set "status" and "errormsg" and goto failure */ 2273 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed write mtt"); 2274 goto mrrereghelp_fail; 2275 } 2276 2277 /* 2278 * Check if the memory region MTT is shared by any other MRs. 2279 * Since the resource may be shared between multiple memory 2280 * regions (as a result of a "RegisterSharedMR()" verb) it is 2281 * important that we not free up any resources prematurely. 2282 */ 2283 if (TAVOR_MTT_IS_SHARED(swrc_old)) { 2284 /* Decrement MTT reference count for "old" region */ 2285 (void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp); 2286 } else { 2287 /* Free up the old MTT entries resource */ 2288 tavor_rsrc_free(state, &mr->mr_mttrsrcp); 2289 } 2290 2291 /* Put the updated information into the mrhdl */ 2292 mr->mr_bindinfo = *bind; 2293 mr->mr_logmttpgsz = mtt_pgsize_bits; 2294 mr->mr_mttrsrcp = mtt; 2295 mr->mr_mttrefcntp = mtt_refcnt; 2296 } 2297 2298 /* 2299 * Calculate and return the updated MTT address (in the DDR address 2300 * space). This will be used by the caller (tavor_mr_reregister) in 2301 * the updated MPT entry 2302 */ 2303 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT]; 2304 mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset; 2305 *mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << 2306 TAVOR_MTT_SIZE_SHIFT); 2307 2308 TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper); 2309 return (DDI_SUCCESS); 2310 2311 mrrereghelp_fail: 2312 TNF_PROBE_1(tavor_mr_rereg_xlat_helper_fail, TAVOR_TNF_ERROR, "", 2313 tnf_string, msg, errormsg); 2314 TAVOR_TNF_EXIT(tavor_mr_rereg_xlat_helper); 2315 return (status); 2316 } 2317 2318 2319 /* 2320 * tavor_mr_nummtt_needed() 2321 * Context: Can be called from interrupt or base context. 
2322 */ 2323 /* ARGSUSED */ 2324 static uint64_t 2325 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind, 2326 uint_t *mtt_pgsize_bits) 2327 { 2328 uint64_t pg_offset_mask; 2329 uint64_t pg_offset, tmp_length; 2330 2331 /* 2332 * For now we specify the page size as 8Kb (the default page size for 2333 * the sun4u architecture), or 4Kb for x86. Figure out optimal page 2334 * size by examining the dmacookies XXX 2335 */ 2336 *mtt_pgsize_bits = PAGESHIFT; 2337 2338 pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1; 2339 pg_offset = bind->bi_addr & pg_offset_mask; 2340 tmp_length = pg_offset + (bind->bi_len - 1); 2341 return ((tmp_length >> *mtt_pgsize_bits) + 1); 2342 } 2343 2344 2345 /* 2346 * tavor_mr_mem_bind() 2347 * Context: Can be called from interrupt or base context. 2348 */ 2349 static int 2350 tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind, 2351 ddi_dma_handle_t dmahdl, uint_t sleep) 2352 { 2353 ddi_dma_attr_t dma_attr; 2354 int (*callback)(caddr_t); 2355 uint_t dma_xfer_mode; 2356 int status; 2357 2358 /* bi_type must be set to a meaningful value to get a bind handle */ 2359 ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR || 2360 bind->bi_type == TAVOR_BINDHDL_BUF || 2361 bind->bi_type == TAVOR_BINDHDL_UBUF); 2362 2363 TAVOR_TNF_ENTER(tavor_mr_mem_bind); 2364 2365 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind)) 2366 2367 /* Set the callback flag appropriately */ 2368 callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT; 2369 2370 /* Determine whether to map STREAMING or CONSISTENT */ 2371 dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ? 2372 DDI_DMA_STREAMING : DDI_DMA_CONSISTENT; 2373 2374 /* 2375 * Initialize many of the default DMA attributes. Then, if we're 2376 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag. 
2377 */ 2378 if (dmahdl == NULL) { 2379 tavor_dma_attr_init(&dma_attr); 2380 #ifdef __sparc 2381 /* 2382 * First, disable streaming and switch to consistent if 2383 * configured to do so and IOMMU BYPASS is enabled. 2384 */ 2385 if (state->ts_cfg_profile->cp_disable_streaming_on_bypass && 2386 dma_xfer_mode == DDI_DMA_STREAMING && 2387 bind->bi_bypass == TAVOR_BINDMEM_BYPASS) { 2388 dma_xfer_mode = DDI_DMA_CONSISTENT; 2389 } 2390 2391 /* 2392 * Then, if streaming is still specified, then "bypass" is not 2393 * allowed. 2394 */ 2395 if ((dma_xfer_mode == DDI_DMA_CONSISTENT) && 2396 (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) { 2397 dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL; 2398 } 2399 #endif 2400 /* Allocate a DMA handle for the binding */ 2401 status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr, 2402 callback, NULL, &bind->bi_dmahdl); 2403 if (status != DDI_SUCCESS) { 2404 TNF_PROBE_0(tavor_mr_mem_bind_dmahdl_fail, 2405 TAVOR_TNF_ERROR, ""); 2406 TAVOR_TNF_EXIT(tavor_mr_mem_bind); 2407 return (status); 2408 } 2409 bind->bi_free_dmahdl = 1; 2410 2411 } else { 2412 bind->bi_dmahdl = dmahdl; 2413 bind->bi_free_dmahdl = 0; 2414 } 2415 2416 /* 2417 * Bind the memory to get the PCI mapped addresses. The decision 2418 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle() 2419 * is determined by the "bi_type" flag. Note: if the bind operation 2420 * fails then we have to free up the DMA handle and return error. 
2421 */ 2422 if (bind->bi_type == TAVOR_BINDHDL_VADDR) { 2423 status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL, 2424 (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len, 2425 (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL, 2426 &bind->bi_dmacookie, &bind->bi_cookiecnt); 2427 } else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */ 2428 status = ddi_dma_buf_bind_handle(bind->bi_dmahdl, 2429 bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback, 2430 NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt); 2431 } 2432 2433 if (status != DDI_DMA_MAPPED) { 2434 if (bind->bi_free_dmahdl != 0) { 2435 ddi_dma_free_handle(&bind->bi_dmahdl); 2436 } 2437 TNF_PROBE_0(tavor_mr_mem_bind_dmabind_fail, TAVOR_TNF_ERROR, 2438 ""); 2439 TAVOR_TNF_EXIT(tavor_mr_mem_bind); 2440 return (status); 2441 } 2442 2443 TAVOR_TNF_EXIT(tavor_mr_mem_bind); 2444 return (DDI_SUCCESS); 2445 } 2446 2447 2448 /* 2449 * tavor_mr_mem_unbind() 2450 * Context: Can be called from interrupt or base context. 2451 */ 2452 static void 2453 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind) 2454 { 2455 int status; 2456 2457 TAVOR_TNF_ENTER(tavor_mr_mem_unbind); 2458 2459 /* 2460 * In case of TAVOR_BINDHDL_UBUF, the memory bi_buf points to 2461 * is actually allocated by ddi_umem_iosetup() internally, then 2462 * it's required to free it here. Reset bi_type to TAVOR_BINDHDL_NONE 2463 * not to free it again later. 2464 */ 2465 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind)) 2466 if (bind->bi_type == TAVOR_BINDHDL_UBUF) { 2467 freerbuf(bind->bi_buf); 2468 bind->bi_type = TAVOR_BINDHDL_NONE; 2469 } 2470 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind)) 2471 2472 /* 2473 * Unbind the DMA memory for the region 2474 * 2475 * Note: The only way ddi_dma_unbind_handle() currently 2476 * can return an error is if the handle passed in is invalid. 2477 * Since this should never happen, we choose to return void 2478 * from this function! 
If this does return an error, however, 2479 * then we print a warning message to the console. 2480 */ 2481 status = ddi_dma_unbind_handle(bind->bi_dmahdl); 2482 if (status != DDI_SUCCESS) { 2483 TAVOR_WARNING(state, "failed to unbind DMA mapping"); 2484 TNF_PROBE_0(tavor_mr_mem_unbind_dmaunbind_fail, 2485 TAVOR_TNF_ERROR, ""); 2486 TAVOR_TNF_EXIT(tavor_mr_mem_unbind); 2487 return; 2488 } 2489 2490 /* Free up the DMA handle */ 2491 if (bind->bi_free_dmahdl != 0) { 2492 ddi_dma_free_handle(&bind->bi_dmahdl); 2493 } 2494 2495 TAVOR_TNF_EXIT(tavor_mr_mem_unbind); 2496 } 2497 2498 2499 /* 2500 * tavor_mr_fast_mtt_write() 2501 * Context: Can be called from interrupt or base context. 2502 */ 2503 static int 2504 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind, 2505 uint32_t mtt_pgsize_bits) 2506 { 2507 ddi_dma_cookie_t dmacookie; 2508 uint_t cookie_cnt; 2509 uint64_t *mtt_table; 2510 uint64_t mtt_entry; 2511 uint64_t addr, endaddr; 2512 uint64_t pagesize; 2513 int i; 2514 2515 TAVOR_TNF_ENTER(tavor_mr_fast_mtt_write); 2516 2517 /* Calculate page size from the suggested value passed in */ 2518 pagesize = ((uint64_t)1 << mtt_pgsize_bits); 2519 2520 /* 2521 * Walk the "cookie list" and fill in the MTT table entries 2522 */ 2523 i = 0; 2524 mtt_table = (uint64_t *)mtt->tr_addr; 2525 dmacookie = bind->bi_dmacookie; 2526 cookie_cnt = bind->bi_cookiecnt; 2527 while (cookie_cnt-- > 0) { 2528 addr = dmacookie.dmac_laddress; 2529 endaddr = addr + (dmacookie.dmac_size - 1); 2530 addr = addr & ~((uint64_t)pagesize - 1); 2531 while (addr <= endaddr) { 2532 /* 2533 * Fill in the mapped addresses (calculated above) and 2534 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry. 
2535 */ 2536 mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET; 2537 ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry); 2538 addr += pagesize; 2539 i++; 2540 2541 if (addr == 0) { 2542 static int do_once = 1; 2543 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", 2544 do_once)) 2545 if (do_once) { 2546 do_once = 0; 2547 cmn_err(CE_NOTE, "probable error in " 2548 "dma_cookie address from caller\n"); 2549 } 2550 break; 2551 } 2552 } 2553 2554 /* 2555 * When we've reached the end of the current DMA cookie, 2556 * jump to the next cookie (if there are more) 2557 */ 2558 if (cookie_cnt != 0) { 2559 ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie); 2560 } 2561 } 2562 2563 TAVOR_TNF_EXIT(tavor_mr_fast_mtt_write); 2564 return (DDI_SUCCESS); 2565 } 2566 2567 /* 2568 * tavor_mtt_refcnt_inc() 2569 * Context: Can be called from interrupt or base context. 2570 */ 2571 static int 2572 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc) 2573 { 2574 tavor_sw_refcnt_t *rc; 2575 uint32_t cnt; 2576 2577 rc = (tavor_sw_refcnt_t *)rsrc->tr_addr; 2578 2579 /* Increment the MTT's reference count */ 2580 mutex_enter(&rc->swrc_lock); 2581 TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_inc, TAVOR_TNF_TRACE, "", 2582 tnf_uint, refcnt, rc->swrc_refcnt); 2583 cnt = rc->swrc_refcnt++; 2584 mutex_exit(&rc->swrc_lock); 2585 2586 return (cnt); 2587 } 2588 2589 2590 /* 2591 * tavor_mtt_refcnt_dec() 2592 * Context: Can be called from interrupt or base context. 2593 */ 2594 static int 2595 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc) 2596 { 2597 tavor_sw_refcnt_t *rc; 2598 uint32_t cnt; 2599 2600 rc = (tavor_sw_refcnt_t *)rsrc->tr_addr; 2601 2602 /* Decrement the MTT's reference count */ 2603 mutex_enter(&rc->swrc_lock); 2604 cnt = --rc->swrc_refcnt; 2605 TNF_PROBE_1_DEBUG(tavor_mtt_refcnt_dec, TAVOR_TNF_TRACE, "", 2606 tnf_uint, refcnt, rc->swrc_refcnt); 2607 mutex_exit(&rc->swrc_lock); 2608 2609 return (cnt); 2610 } 2611