1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * tavor_misc.c 29 * Tavor Miscellaneous routines - Address Handle, Multicast, Protection 30 * Domain, and port-related operations 31 * 32 * Implements all the routines necessary for allocating, freeing, querying 33 * and modifying Address Handles and Protection Domains. Also implements 34 * all the routines necessary for adding and removing Queue Pairs to/from 35 * Multicast Groups. Lastly, it implements the routines necessary for 36 * port-related query and modify operations. 
37 */ 38 39 #include <sys/types.h> 40 #include <sys/conf.h> 41 #include <sys/ddi.h> 42 #include <sys/sunddi.h> 43 #include <sys/modctl.h> 44 #include <sys/bitmap.h> 45 #include <sys/sysmacros.h> 46 47 #include <sys/ib/adapters/tavor/tavor.h> 48 49 static void tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav, 50 uint_t flag); 51 static int tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg, 52 tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp, uint_t *qp_found); 53 static int tavor_mcg_qplist_remove(tavor_mcghdl_t mcg, 54 tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp); 55 static void tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp); 56 static void tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp); 57 static uint_t tavor_mcg_walk_mgid_hash(tavor_state_t *state, 58 uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx); 59 static void tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg, 60 tavor_hw_mcg_t *mcg_hdr, ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc); 61 static int tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx, 62 uint_t prev_indx, tavor_hw_mcg_t *mcg_entry); 63 static int tavor_mcg_entry_invalidate(tavor_state_t *state, 64 tavor_hw_mcg_t *mcg_entry, uint_t indx); 65 static int tavor_mgid_is_valid(ib_gid_t gid); 66 static int tavor_mlid_is_valid(ib_lid_t lid); 67 68 69 /* 70 * tavor_ah_alloc() 71 * Context: Can be called only from user or kernel context. 72 */ 73 int 74 tavor_ah_alloc(tavor_state_t *state, tavor_pdhdl_t pd, 75 ibt_adds_vect_t *attr_p, tavor_ahhdl_t *ahhdl, uint_t sleepflag) 76 { 77 tavor_rsrc_t *udav, *rsrc; 78 tavor_hw_udav_t udav_entry; 79 tavor_ahhdl_t ah; 80 ibt_mr_attr_t mr_attr; 81 tavor_mr_options_t op; 82 tavor_mrhdl_t mr; 83 uint64_t data; 84 uint32_t size; 85 int status, i, flag; 86 87 /* 88 * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to 89 * indicate that we wish to allocate an "invalid" (i.e. 
empty) 90 * address handle XXX 91 */ 92 93 /* Validate that specified port number is legal */ 94 if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) { 95 goto ahalloc_fail; 96 } 97 98 /* 99 * Allocate a UDAV entry. This will be filled in with all the 100 * necessary parameters to define the Address Handle. Unlike the 101 * other hardware resources no ownership transfer takes place as 102 * these UDAV entries are always owned by hardware. 103 */ 104 status = tavor_rsrc_alloc(state, TAVOR_UDAV, 1, sleepflag, &udav); 105 if (status != DDI_SUCCESS) { 106 goto ahalloc_fail; 107 } 108 109 /* 110 * Allocate the software structure for tracking the address handle 111 * (i.e. the Tavor Address Handle struct). If we fail here, we must 112 * undo the previous resource allocation. 113 */ 114 status = tavor_rsrc_alloc(state, TAVOR_AHHDL, 1, sleepflag, &rsrc); 115 if (status != DDI_SUCCESS) { 116 goto ahalloc_fail1; 117 } 118 ah = (tavor_ahhdl_t)rsrc->tr_addr; 119 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah)) 120 121 /* Increment the reference count on the protection domain (PD) */ 122 tavor_pd_refcnt_inc(pd); 123 124 /* 125 * Fill in the UDAV entry. Note: We are only filling in a temporary 126 * copy here, which we will later copy into the actual entry in 127 * Tavor DDR memory. This starts be zeroing out the temporary copy 128 * and then calling tavor_set_addr_path() to fill in the common 129 * portions that can be pulled from the "ibt_adds_vect_t" passed in 130 */ 131 bzero(&udav_entry, sizeof (tavor_hw_udav_t)); 132 status = tavor_set_addr_path(state, attr_p, 133 (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL); 134 if (status != DDI_SUCCESS) { 135 tavor_pd_refcnt_dec(pd); 136 tavor_rsrc_free(state, &rsrc); 137 tavor_rsrc_free(state, &udav); 138 goto ahalloc_fail; 139 } 140 udav_entry.pd = pd->pd_pdnum; 141 udav_entry.msg_sz = state->ts_cfg_profile->cp_max_mtu - 1; 142 143 /* 144 * Register the memory for the UDAV. 
The memory for the UDAV must 145 * be registered in the Tavor TPT tables. This gives us the LKey 146 * that we will need when we later post a UD work request that 147 * uses this address handle. 148 * We might be able to pre-register all the memory for the UDAV XXX 149 */ 150 flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP : IBT_MR_NOSLEEP; 151 mr_attr.mr_vaddr = (uint64_t)(uintptr_t)udav->tr_addr; 152 mr_attr.mr_len = udav->tr_len; 153 mr_attr.mr_as = NULL; 154 mr_attr.mr_flags = flag; 155 op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass; 156 op.mro_bind_dmahdl = NULL; 157 op.mro_bind_override_addr = 0; 158 status = tavor_mr_register(state, pd, &mr_attr, &mr, &op); 159 if (status != DDI_SUCCESS) { 160 goto ahalloc_fail2; 161 } 162 163 /* 164 * Fill in the UDAV entry. Here we copy all the information from 165 * the temporary UDAV into the DDR memory for the real UDAV entry. 166 * Note that we copy everything but the first 64-bit word. This 167 * is where the PD number for the address handle resides. 168 * By filling everything except the PD and then writing the PD in 169 * a separate step below, we can ensure that the UDAV is not 170 * accessed while there are partially written values in it (something 171 * which really should not happen anyway). This is guaranteed 172 * because we take measures to ensure that the PD number is zero for 173 * all unused UDAV (and because PD#0 is reserved for Tavor). 174 */ 175 size = sizeof (tavor_hw_udav_t) >> 3; 176 for (i = 1; i < size; i++) { 177 data = ((uint64_t *)&udav_entry)[i]; 178 ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i), 179 data); 180 } 181 data = ((uint64_t *)&udav_entry)[0]; 182 ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, data); 183 184 /* 185 * Fill in the rest of the Tavor Address Handle struct. Having 186 * successfully copied the UDAV into the hardware, we update the 187 * following fields for use in further operations on the AH. 
188 * 189 * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field 190 * here because we may need to return it later to the IBTF (as a 191 * result of a subsequent query operation). Unlike the other UDAV 192 * parameters, the value of "av_dgid.gid_guid" is not always preserved 193 * by being written to hardware. The reason for this is described in 194 * tavor_set_addr_path(). 195 */ 196 ah->ah_udavrsrcp = udav; 197 ah->ah_rsrcp = rsrc; 198 ah->ah_pdhdl = pd; 199 ah->ah_mrhdl = mr; 200 ah->ah_save_guid = attr_p->av_dgid.gid_guid; 201 ah->ah_save_srate = attr_p->av_srate; 202 *ahhdl = ah; 203 204 /* Determine if later ddi_dma_sync will be necessary */ 205 ah->ah_sync = TAVOR_UDAV_IS_SYNC_REQ(state); 206 207 /* Sync the UDAV for use by the hardware */ 208 tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV); 209 210 return (DDI_SUCCESS); 211 212 ahalloc_fail2: 213 tavor_pd_refcnt_dec(pd); 214 tavor_rsrc_free(state, &rsrc); 215 ahalloc_fail1: 216 tavor_rsrc_free(state, &udav); 217 ahalloc_fail: 218 return (status); 219 } 220 221 222 /* 223 * tavor_ah_free() 224 * Context: Can be called only from user or kernel context. 225 */ 226 /* ARGSUSED */ 227 int 228 tavor_ah_free(tavor_state_t *state, tavor_ahhdl_t *ahhdl, uint_t sleepflag) 229 { 230 tavor_rsrc_t *udav, *rsrc; 231 tavor_pdhdl_t pd; 232 tavor_mrhdl_t mr; 233 tavor_ahhdl_t ah; 234 int status; 235 236 /* 237 * Pull all the necessary information from the Tavor Address Handle 238 * struct. This is necessary here because the resource for the 239 * AH is going to be freed up as part of this operation. 240 */ 241 ah = *ahhdl; 242 mutex_enter(&ah->ah_lock); 243 udav = ah->ah_udavrsrcp; 244 rsrc = ah->ah_rsrcp; 245 pd = ah->ah_pdhdl; 246 mr = ah->ah_mrhdl; 247 mutex_exit(&ah->ah_lock); 248 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah)) 249 250 /* 251 * Deregister the memory for the UDAV. If this fails for any reason, 252 * then it is an indication that something (either in HW or SW) has 253 * gone seriously wrong. 
So we print a warning message and return 254 * failure. 255 */ 256 status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL, 257 sleepflag); 258 if (status != DDI_SUCCESS) { 259 return (ibc_get_ci_failure(0)); 260 } 261 262 /* 263 * Write zero to the first 64-bit word in the UDAV entry. As 264 * described above (in tavor_ah_alloc), the PD number is stored in 265 * the first 64-bits of each UDAV and setting this to zero is 266 * guaranteed to invalidate the entry. 267 */ 268 ddi_put64(udav->tr_acchdl, (uint64_t *)udav->tr_addr, 0); 269 270 /* Sync the UDAV for use by the hardware */ 271 tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV); 272 273 /* Decrement the reference count on the protection domain (PD) */ 274 tavor_pd_refcnt_dec(pd); 275 276 /* Free the Tavor Address Handle structure */ 277 tavor_rsrc_free(state, &rsrc); 278 279 /* Free up the UDAV entry resource */ 280 tavor_rsrc_free(state, &udav); 281 282 /* Set the ahhdl pointer to NULL and return success */ 283 *ahhdl = NULL; 284 285 return (DDI_SUCCESS); 286 } 287 288 289 /* 290 * tavor_ah_query() 291 * Context: Can be called from interrupt or base context. 292 */ 293 /* ARGSUSED */ 294 int 295 tavor_ah_query(tavor_state_t *state, tavor_ahhdl_t ah, tavor_pdhdl_t *pd, 296 ibt_adds_vect_t *attr_p) 297 { 298 tavor_hw_udav_t udav_entry; 299 tavor_rsrc_t *udav; 300 uint64_t data; 301 uint32_t size; 302 int i; 303 304 mutex_enter(&ah->ah_lock); 305 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p)) 306 307 /* 308 * Pull all the necessary information from the Tavor Address Handle 309 * structure 310 */ 311 udav = ah->ah_udavrsrcp; 312 *pd = ah->ah_pdhdl; 313 314 /* 315 * Copy the UDAV entry into the temporary copy. Here we copy all 316 * the information from the UDAV entry in DDR memory into the 317 * temporary UDAV. Note: We don't need to sync the UDAV for 318 * reading by software because Tavor HW never modifies the entry. 
319 */ 320 size = sizeof (tavor_hw_udav_t) >> 3; 321 for (i = 0; i < size; i++) { 322 data = ddi_get64(udav->tr_acchdl, 323 ((uint64_t *)udav->tr_addr + i)); 324 ((uint64_t *)&udav_entry)[i] = data; 325 } 326 327 /* 328 * Fill in "ibt_adds_vect_t". We call tavor_get_addr_path() to fill 329 * the common portions that can be pulled from the UDAV we pass in. 330 * 331 * NOTE: We will also fill the "av_dgid.gid_guid" field from the 332 * "ah_save_guid" field we have previously saved away. The reason 333 * for this is described in tavor_ah_alloc() and tavor_ah_modify(). 334 */ 335 tavor_get_addr_path(state, (tavor_hw_addr_path_t *)&udav_entry, 336 attr_p, TAVOR_ADDRPATH_UDAV, NULL); 337 338 attr_p->av_dgid.gid_guid = ah->ah_save_guid; 339 attr_p->av_srate = ah->ah_save_srate; 340 341 mutex_exit(&ah->ah_lock); 342 return (DDI_SUCCESS); 343 } 344 345 346 /* 347 * tavor_ah_modify() 348 * Context: Can be called from interrupt or base context. 349 */ 350 /* ARGSUSED */ 351 int 352 tavor_ah_modify(tavor_state_t *state, tavor_ahhdl_t ah, 353 ibt_adds_vect_t *attr_p) 354 { 355 tavor_hw_udav_t udav_entry; 356 tavor_rsrc_t *udav; 357 uint64_t data_new, data_old; 358 uint32_t udav_pd, size, portnum_new; 359 int i, status; 360 361 /* Validate that specified port number is legal */ 362 if (!tavor_portnum_is_valid(state, attr_p->av_port_num)) { 363 return (IBT_HCA_PORT_INVALID); 364 } 365 366 mutex_enter(&ah->ah_lock); 367 368 /* 369 * Pull all the necessary information from the Tavor Address Handle 370 * structure 371 */ 372 udav = ah->ah_udavrsrcp; 373 374 /* 375 * Fill in the UDAV entry. Note: we are only filling in a temporary 376 * copy here, which we will later copy into the actual entry in 377 * Tavor DDR memory. 
This starts be zeroing out the temporary copy 378 * and then calling tavor_set_addr_path() to fill in the common 379 * portions that can be pulled from the "ibt_adds_vect_t" passed in 380 * 381 * NOTE: We also need to save away a copy of the "av_dgid.gid_guid" 382 * field here (just as we did during tavor_ah_alloc()) because we 383 * may need to return it later to the IBTF (as a result of a 384 * subsequent query operation). As explained in tavor_ah_alloc(), 385 * unlike the other UDAV parameters, the value of "av_dgid.gid_guid" 386 * is not always preserved by being written to hardware. The reason 387 * for this is described in tavor_set_addr_path(). 388 */ 389 bzero(&udav_entry, sizeof (tavor_hw_udav_t)); 390 status = tavor_set_addr_path(state, attr_p, 391 (tavor_hw_addr_path_t *)&udav_entry, TAVOR_ADDRPATH_UDAV, NULL); 392 if (status != DDI_SUCCESS) { 393 mutex_exit(&ah->ah_lock); 394 return (status); 395 } 396 ah->ah_save_guid = attr_p->av_dgid.gid_guid; 397 ah->ah_save_srate = attr_p->av_srate; 398 399 /* 400 * Save away the current PD number for this UDAV. Then temporarily 401 * invalidate the entry (by setting the PD to zero). Note: Since 402 * the first 32 bits of the UDAV actually contain the current port 403 * number _and_ current PD number, we need to mask off some bits. 404 */ 405 udav_pd = ddi_get32(udav->tr_acchdl, (uint32_t *)udav->tr_addr); 406 udav_pd = udav_pd & 0xFFFFFF; 407 ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, 0); 408 409 /* Sync the UDAV for use by the hardware */ 410 tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV); 411 412 /* 413 * Copy UDAV structure to the entry 414 * Note: We copy in 64-bit chunks. For the first two of these 415 * chunks it is necessary to read the current contents of the 416 * UDAV, mask off the modifiable portions (maintaining any 417 * of the "reserved" portions), and then mask on the new data. 
418 */ 419 size = sizeof (tavor_hw_udav_t) >> 3; 420 for (i = 0; i < size; i++) { 421 data_new = ((uint64_t *)&udav_entry)[i]; 422 data_old = ddi_get64(udav->tr_acchdl, 423 ((uint64_t *)udav->tr_addr + i)); 424 425 /* 426 * Apply mask to change only the relevant values. Note: We 427 * extract the new portnum from the address handle here 428 * because the "PD" and "portnum" fields are in the same 429 * 32-bit word in the UDAV. We will use the (new) port 430 * number extracted here when we write the valid PD number 431 * in the last step below. 432 */ 433 if (i == 0) { 434 data_old = data_old & TAVOR_UDAV_MODIFY_MASK0; 435 portnum_new = data_new >> 56; 436 } else if (i == 1) { 437 data_old = data_old & TAVOR_UDAV_MODIFY_MASK1; 438 } else { 439 data_old = 0; 440 } 441 442 /* Write the updated values to the UDAV (in DDR) */ 443 data_new = data_old | data_new; 444 ddi_put64(udav->tr_acchdl, ((uint64_t *)udav->tr_addr + i), 445 data_new); 446 } 447 448 /* 449 * Sync the body of the UDAV for use by the hardware. After we 450 * have updated the PD number (to make the UDAV valid), we sync 451 * again to push the entire entry out for hardware access. 452 */ 453 tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV); 454 455 /* 456 * Put the valid PD number back into UDAV entry. Note: Because port 457 * number and PD number are in the same word, we must mask the 458 * new port number with the old PD number before writing it back 459 * to the UDAV entry 460 */ 461 udav_pd = ((portnum_new << 24) | udav_pd); 462 ddi_put32(udav->tr_acchdl, (uint32_t *)udav->tr_addr, udav_pd); 463 464 /* Sync the rest of the UDAV for use by the hardware */ 465 tavor_udav_sync(ah, udav->tr_addr, DDI_DMA_SYNC_FORDEV); 466 467 mutex_exit(&ah->ah_lock); 468 return (DDI_SUCCESS); 469 } 470 471 472 /* 473 * tavor_udav_sync() 474 * Context: Can be called from interrupt or base context. 
475 */ 476 /* ARGSUSED */ 477 static void 478 tavor_udav_sync(tavor_ahhdl_t ah, tavor_hw_udav_t *udav, uint_t flag) 479 { 480 ddi_dma_handle_t dmahdl; 481 off_t offset; 482 int status; 483 484 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah)) 485 486 /* Determine if AH needs to be synced or not */ 487 if (ah->ah_sync == 0) { 488 return; 489 } 490 491 /* Get the DMA handle from AH handle */ 492 dmahdl = ah->ah_mrhdl->mr_bindinfo.bi_dmahdl; 493 494 /* Calculate offset into address handle */ 495 offset = (off_t)0; 496 status = ddi_dma_sync(dmahdl, offset, sizeof (tavor_hw_udav_t), flag); 497 if (status != DDI_SUCCESS) { 498 return; 499 } 500 } 501 502 503 /* 504 * tavor_mcg_attach() 505 * Context: Can be called only from user or kernel context. 506 */ 507 int 508 tavor_mcg_attach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid, 509 ib_lid_t lid) 510 { 511 tavor_rsrc_t *rsrc; 512 tavor_hw_mcg_t *mcg_entry; 513 tavor_hw_mcg_qp_list_t *mcg_entry_qplist; 514 tavor_mcghdl_t mcg, newmcg; 515 uint64_t mgid_hash; 516 uint32_t end_indx; 517 int status; 518 uint_t qp_found; 519 520 /* 521 * It is only allowed to attach MCG to UD queue pairs. Verify 522 * that the intended QP is of the appropriate transport type 523 */ 524 if (qp->qp_serv_type != TAVOR_QP_UD) { 525 goto mcgattach_fail; 526 } 527 528 /* 529 * Check for invalid Multicast DLID. Specifically, all Multicast 530 * LIDs should be within a well defined range. If the specified LID 531 * is outside of that range, then return an error. 532 */ 533 if (tavor_mlid_is_valid(lid) == 0) { 534 goto mcgattach_fail; 535 } 536 /* 537 * Check for invalid Multicast GID. All Multicast GIDs should have 538 * a well-defined pattern of bits and flags that are allowable. If 539 * the specified GID does not meet the criteria, then return an error. 540 */ 541 if (tavor_mgid_is_valid(gid) == 0) { 542 goto mcgattach_fail; 543 } 544 545 /* 546 * Compute the MGID hash value. 
Since the MCG table is arranged as 547 * a number of separate hash chains, this operation converts the 548 * specified MGID into the starting index of an entry in the hash 549 * table (i.e. the index for the start of the appropriate hash chain). 550 * Subsequent operations below will walk the chain searching for the 551 * right place to add this new QP. 552 */ 553 status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid, 554 &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT()); 555 if (status != TAVOR_CMD_SUCCESS) { 556 cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n", 557 status); 558 return (ibc_get_ci_failure(0)); 559 } 560 561 /* 562 * Grab the multicast group mutex. Then grab the pre-allocated 563 * temporary buffer used for holding and/or modifying MCG entries. 564 * Zero out the temporary MCG entry before we begin. 565 */ 566 mutex_enter(&state->ts_mcglock); 567 mcg_entry = state->ts_mcgtmp; 568 mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry); 569 bzero(mcg_entry, TAVOR_MCGMEM_SZ(state)); 570 571 /* 572 * Walk through the array of MCG entries starting at "mgid_hash". 573 * Try to find the appropriate place for this new QP to be added. 574 * This could happen when the first entry of the chain has MGID == 0 575 * (which means that the hash chain is empty), or because we find 576 * an entry with the same MGID (in which case we'll add the QP to 577 * that MCG), or because we come to the end of the chain (in which 578 * case this is the first QP being added to the multicast group that 579 * corresponds to the MGID. The tavor_mcg_walk_mgid_hash() routine 580 * walks the list and returns an index into the MCG table. The entry 581 * at this index is then checked to determine which case we have 582 * fallen into (see below). Note: We are using the "shadow" MCG 583 * list (of tavor_mcg_t structs) for this lookup because the real 584 * MCG entries are in hardware (and the lookup process would be much 585 * more time consuming). 
586 */ 587 end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL); 588 mcg = &state->ts_mcghdl[end_indx]; 589 590 /* 591 * If MGID == 0, then the hash chain is empty. Just fill in the 592 * current entry. Note: No need to allocate an MCG table entry 593 * as all the hash chain "heads" are already preallocated. 594 */ 595 if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) { 596 597 /* Fill in the current entry in the "shadow" MCG list */ 598 tavor_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL); 599 600 /* 601 * Try to add the new QP number to the list. This (and the 602 * above) routine fills in a temporary MCG. The "mcg_entry" 603 * and "mcg_entry_qplist" pointers simply point to different 604 * offsets within the same temporary copy of the MCG (for 605 * convenience). Note: If this fails, we need to invalidate 606 * the entries we've already put into the "shadow" list entry 607 * above. 608 */ 609 status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp, 610 &qp_found); 611 if (status != DDI_SUCCESS) { 612 bzero(mcg, sizeof (struct tavor_sw_mcg_list_s)); 613 mutex_exit(&state->ts_mcglock); 614 goto mcgattach_fail; 615 } 616 617 /* 618 * Once the temporary MCG has been filled in, write the entry 619 * into the appropriate location in the Tavor MCG entry table. 620 * If it's successful, then drop the lock and return success. 621 * Note: In general, this operation shouldn't fail. If it 622 * does, then it is an indication that something (probably in 623 * HW, but maybe in SW) has gone seriously wrong. We still 624 * want to zero out the entries that we've filled in above 625 * (in the tavor_mcg_setup_new_hdr() routine). 
626 */ 627 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx, 628 TAVOR_CMD_NOSLEEP_SPIN); 629 if (status != TAVOR_CMD_SUCCESS) { 630 bzero(mcg, sizeof (struct tavor_sw_mcg_list_s)); 631 mutex_exit(&state->ts_mcglock); 632 TAVOR_WARNING(state, "failed to write MCG entry"); 633 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: " 634 "%08x\n", status); 635 return (ibc_get_ci_failure(0)); 636 } 637 638 /* 639 * Now that we know all the Tavor firmware accesses have been 640 * successful, we update the "shadow" MCG entry by incrementing 641 * the "number of attached QPs" count. 642 * 643 * We increment only if the QP is not already part of the 644 * MCG by checking the 'qp_found' flag returned from the 645 * qplist_add above. 646 */ 647 if (!qp_found) { 648 mcg->mcg_num_qps++; 649 650 /* 651 * Increment the refcnt for this QP. Because the QP 652 * was added to this MCG, the refcnt must be 653 * incremented. 654 */ 655 tavor_qp_mcg_refcnt_inc(qp); 656 } 657 658 /* 659 * We drop the lock and return success. 660 */ 661 mutex_exit(&state->ts_mcglock); 662 return (DDI_SUCCESS); 663 } 664 665 /* 666 * If the specified MGID matches the MGID in the current entry, then 667 * we need to try to add the QP to the current MCG entry. In this 668 * case, it means that we need to read the existing MCG entry (into 669 * the temporary MCG), add the new QP number to the temporary entry 670 * (using the same method we used above), and write the entry back 671 * to the hardware (same as above). 672 */ 673 if ((mcg->mcg_mgid_h == gid.gid_prefix) && 674 (mcg->mcg_mgid_l == gid.gid_guid)) { 675 676 /* 677 * Read the current MCG entry into the temporary MCG. Note: 678 * In general, this operation shouldn't fail. If it does, 679 * then it is an indication that something (probably in HW, 680 * but maybe in SW) has gone seriously wrong. 
681 */ 682 status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx, 683 TAVOR_CMD_NOSLEEP_SPIN); 684 if (status != TAVOR_CMD_SUCCESS) { 685 mutex_exit(&state->ts_mcglock); 686 TAVOR_WARNING(state, "failed to read MCG entry"); 687 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: " 688 "%08x\n", status); 689 return (ibc_get_ci_failure(0)); 690 } 691 692 /* 693 * Try to add the new QP number to the list. This routine 694 * fills in the necessary pieces of the temporary MCG. The 695 * "mcg_entry_qplist" pointer is used to point to the portion 696 * of the temporary MCG that holds the QP numbers. 697 * 698 * Note: tavor_mcg_qplist_add() returns SUCCESS if it 699 * already found the QP in the list. In this case, the QP is 700 * not added on to the list again. Check the flag 'qp_found' 701 * if this value is needed to be known. 702 * 703 */ 704 status = tavor_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp, 705 &qp_found); 706 if (status != DDI_SUCCESS) { 707 mutex_exit(&state->ts_mcglock); 708 /* Set "status" and "errormsg" and goto failure */ 709 goto mcgattach_fail; 710 } 711 712 /* 713 * Once the temporary MCG has been updated, write the entry 714 * into the appropriate location in the Tavor MCG entry table. 715 * If it's successful, then drop the lock and return success. 716 * Note: In general, this operation shouldn't fail. If it 717 * does, then it is an indication that something (probably in 718 * HW, but maybe in SW) has gone seriously wrong. 
719 */ 720 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx, 721 TAVOR_CMD_NOSLEEP_SPIN); 722 if (status != TAVOR_CMD_SUCCESS) { 723 mutex_exit(&state->ts_mcglock); 724 TAVOR_WARNING(state, "failed to write MCG entry"); 725 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: " 726 "%08x\n", status); 727 return (ibc_get_ci_failure(0)); 728 } 729 730 /* 731 * Now that we know all the Tavor firmware accesses have been 732 * successful, we update the current "shadow" MCG entry by 733 * incrementing the "number of attached QPs" count. 734 * 735 * We increment only if the QP is not already part of the 736 * MCG by checking the 'qp_found' flag returned from the 737 * qplist_add above. 738 */ 739 if (!qp_found) { 740 mcg->mcg_num_qps++; 741 742 /* 743 * Increment the refcnt for this QP. Because the QP 744 * was added to this MCG, the refcnt must be 745 * incremented. 746 */ 747 tavor_qp_mcg_refcnt_inc(qp); 748 } 749 750 /* 751 * We drop the lock and return success. 752 */ 753 mutex_exit(&state->ts_mcglock); 754 return (DDI_SUCCESS); 755 } 756 757 /* 758 * If we've reached here, then we're at the end of the hash chain. 759 * We need to allocate a new MCG entry, fill it in, write it to Tavor, 760 * and update the previous entry to link the new one to the end of the 761 * chain. 762 */ 763 764 /* 765 * Allocate an MCG table entry. This will be filled in with all 766 * the necessary parameters to define the multicast group. Then it 767 * will be written to the hardware in the next-to-last step below. 768 */ 769 status = tavor_rsrc_alloc(state, TAVOR_MCG, 1, TAVOR_NOSLEEP, &rsrc); 770 if (status != DDI_SUCCESS) { 771 mutex_exit(&state->ts_mcglock); 772 goto mcgattach_fail; 773 } 774 775 /* 776 * Fill in the new entry in the "shadow" MCG list. 
Note: Just as 777 * it does above, tavor_mcg_setup_new_hdr() also fills in a portion 778 * of the temporary MCG entry (the rest of which will be filled in by 779 * tavor_mcg_qplist_add() below) 780 */ 781 newmcg = &state->ts_mcghdl[rsrc->tr_indx]; 782 tavor_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc); 783 784 /* 785 * Try to add the new QP number to the list. This routine fills in 786 * the final necessary pieces of the temporary MCG. The 787 * "mcg_entry_qplist" pointer is used to point to the portion of the 788 * temporary MCG that holds the QP numbers. If we fail here, we 789 * must undo the previous resource allocation. 790 * 791 * Note: tavor_mcg_qplist_add() can we return SUCCESS if it already 792 * found the QP in the list. In this case, the QP is not added on to 793 * the list again. Check the flag 'qp_found' if this value is needed 794 * to be known. 795 */ 796 status = tavor_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp, 797 &qp_found); 798 if (status != DDI_SUCCESS) { 799 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s)); 800 tavor_rsrc_free(state, &rsrc); 801 mutex_exit(&state->ts_mcglock); 802 goto mcgattach_fail; 803 } 804 805 /* 806 * Once the temporary MCG has been updated, write the entry into the 807 * appropriate location in the Tavor MCG entry table. If this is 808 * successful, then we need to chain the previous entry to this one. 809 * Note: In general, this operation shouldn't fail. If it does, then 810 * it is an indication that something (probably in HW, but maybe in 811 * SW) has gone seriously wrong. 
812 */ 813 status = tavor_write_mgm_cmd_post(state, mcg_entry, rsrc->tr_indx, 814 TAVOR_CMD_NOSLEEP_SPIN); 815 if (status != TAVOR_CMD_SUCCESS) { 816 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s)); 817 tavor_rsrc_free(state, &rsrc); 818 mutex_exit(&state->ts_mcglock); 819 TAVOR_WARNING(state, "failed to write MCG entry"); 820 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n", 821 status); 822 return (ibc_get_ci_failure(0)); 823 } 824 825 /* 826 * Now read the current MCG entry (the one previously at the end of 827 * hash chain) into the temporary MCG. We are going to update its 828 * "next_gid_indx" now and write the entry back to the MCG table. 829 * Note: In general, this operation shouldn't fail. If it does, then 830 * it is an indication that something (probably in HW, but maybe in SW) 831 * has gone seriously wrong. We will free up the MCG entry resource, 832 * but we will not undo the previously written MCG entry in the HW. 833 * This is OK, though, because the MCG entry is not currently attached 834 * to any hash chain. 835 */ 836 status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx, 837 TAVOR_CMD_NOSLEEP_SPIN); 838 if (status != TAVOR_CMD_SUCCESS) { 839 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s)); 840 tavor_rsrc_free(state, &rsrc); 841 mutex_exit(&state->ts_mcglock); 842 TAVOR_WARNING(state, "failed to read MCG entry"); 843 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n", 844 status); 845 return (ibc_get_ci_failure(0)); 846 } 847 848 /* 849 * Finally, we update the "next_gid_indx" field in the temporary MCG 850 * and attempt to write the entry back into the Tavor MCG table. If 851 * this succeeds, then we update the "shadow" list to reflect the 852 * change, drop the lock, and return success. Note: In general, this 853 * operation shouldn't fail. If it does, then it is an indication 854 * that something (probably in HW, but maybe in SW) has gone seriously 855 * wrong. 
Just as we do above, we will free up the MCG entry resource, 856 * but we will not try to undo the previously written MCG entry. This 857 * is OK, though, because (since we failed here to update the end of 858 * the chain) that other entry is not currently attached to any chain. 859 */ 860 mcg_entry->next_gid_indx = rsrc->tr_indx; 861 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx, 862 TAVOR_CMD_NOSLEEP_SPIN); 863 if (status != TAVOR_CMD_SUCCESS) { 864 bzero(newmcg, sizeof (struct tavor_sw_mcg_list_s)); 865 tavor_rsrc_free(state, &rsrc); 866 mutex_exit(&state->ts_mcglock); 867 TAVOR_WARNING(state, "failed to write MCG entry"); 868 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n", 869 status); 870 return (ibc_get_ci_failure(0)); 871 } 872 mcg = &state->ts_mcghdl[end_indx]; 873 mcg->mcg_next_indx = rsrc->tr_indx; 874 875 /* 876 * Now that we know all the Tavor firmware accesses have been 877 * successful, we update the new "shadow" MCG entry by incrementing 878 * the "number of attached QPs" count. Then we drop the lock and 879 * return success. 880 */ 881 newmcg->mcg_num_qps++; 882 883 /* 884 * Increment the refcnt for this QP. Because the QP 885 * was added to this MCG, the refcnt must be 886 * incremented. 887 */ 888 tavor_qp_mcg_refcnt_inc(qp); 889 890 mutex_exit(&state->ts_mcglock); 891 return (DDI_SUCCESS); 892 893 mcgattach_fail: 894 return (status); 895 } 896 897 898 /* 899 * tavor_mcg_detach() 900 * Context: Can be called only from user or kernel context. 901 */ 902 int 903 tavor_mcg_detach(tavor_state_t *state, tavor_qphdl_t qp, ib_gid_t gid, 904 ib_lid_t lid) 905 { 906 tavor_hw_mcg_t *mcg_entry; 907 tavor_hw_mcg_qp_list_t *mcg_entry_qplist; 908 tavor_mcghdl_t mcg; 909 uint64_t mgid_hash; 910 uint32_t end_indx, prev_indx; 911 int status; 912 913 /* 914 * Check for invalid Multicast DLID. Specifically, all Multicast 915 * LIDs should be within a well defined range. 
If the specified LID 916 * is outside of that range, then return an error. 917 */ 918 if (tavor_mlid_is_valid(lid) == 0) { 919 return (IBT_MC_MLID_INVALID); 920 } 921 922 /* 923 * Compute the MGID hash value. As described above, the MCG table is 924 * arranged as a number of separate hash chains. This operation 925 * converts the specified MGID into the starting index of an entry in 926 * the hash table (i.e. the index for the start of the appropriate 927 * hash chain). Subsequent operations below will walk the chain 928 * searching for a matching entry from which to attempt to remove 929 * the specified QP. 930 */ 931 status = tavor_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid, 932 &mgid_hash, TAVOR_SLEEPFLAG_FOR_CONTEXT()); 933 if (status != TAVOR_CMD_SUCCESS) { 934 cmn_err(CE_CONT, "Tavor: MGID_HASH command failed: %08x\n", 935 status); 936 return (ibc_get_ci_failure(0)); 937 } 938 939 /* 940 * Grab the multicast group mutex. Then grab the pre-allocated 941 * temporary buffer used for holding and/or modifying MCG entries. 942 */ 943 mutex_enter(&state->ts_mcglock); 944 mcg_entry = state->ts_mcgtmp; 945 mcg_entry_qplist = TAVOR_MCG_GET_QPLIST_PTR(mcg_entry); 946 947 /* 948 * Walk through the array of MCG entries starting at "mgid_hash". 949 * Try to find an MCG entry with a matching MGID. The 950 * tavor_mcg_walk_mgid_hash() routine walks the list and returns an 951 * index into the MCG table. The entry at this index is checked to 952 * determine whether it is a match or not. If it is a match, then 953 * we continue on to attempt to remove the QP from the MCG. If it 954 * is not a match (or not a valid MCG entry), then we return an error. 
955 */ 956 end_indx = tavor_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx); 957 mcg = &state->ts_mcghdl[end_indx]; 958 959 /* 960 * If MGID == 0 (the hash chain is empty) or if the specified MGID 961 * does not match the MGID in the current entry, then return 962 * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not 963 * valid). 964 */ 965 if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) || 966 ((mcg->mcg_mgid_h != gid.gid_prefix) || 967 (mcg->mcg_mgid_l != gid.gid_guid))) { 968 mutex_exit(&state->ts_mcglock); 969 return (IBT_MC_MGID_INVALID); 970 } 971 972 /* 973 * Read the current MCG entry into the temporary MCG. Note: In 974 * general, this operation shouldn't fail. If it does, then it is 975 * an indication that something (probably in HW, but maybe in SW) 976 * has gone seriously wrong. 977 */ 978 status = tavor_read_mgm_cmd_post(state, mcg_entry, end_indx, 979 TAVOR_CMD_NOSLEEP_SPIN); 980 if (status != TAVOR_CMD_SUCCESS) { 981 mutex_exit(&state->ts_mcglock); 982 TAVOR_WARNING(state, "failed to read MCG entry"); 983 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n", 984 status); 985 return (ibc_get_ci_failure(0)); 986 } 987 988 /* 989 * Search the QP number list for a match. If a match is found, then 990 * remove the entry from the QP list. Otherwise, if no match is found, 991 * return an error. 992 */ 993 status = tavor_mcg_qplist_remove(mcg, mcg_entry_qplist, qp); 994 if (status != DDI_SUCCESS) { 995 mutex_exit(&state->ts_mcglock); 996 return (status); 997 } 998 999 /* 1000 * Decrement the MCG count for this QP. When the 'qp_mcg' 1001 * field becomes 0, then this QP is no longer a member of any 1002 * MCG. 1003 */ 1004 tavor_qp_mcg_refcnt_dec(qp); 1005 1006 /* 1007 * If the current MCG's QP number list is about to be made empty 1008 * ("mcg_num_qps" == 1), then remove the entry itself from the hash 1009 * chain. Otherwise, just write the updated MCG entry back to the 1010 * hardware. 
In either case, once we successfully update the hardware 1011 * chain, then we decrement the "shadow" list entry's "mcg_num_qps" 1012 * count (or zero out the entire "shadow" list entry) before returning 1013 * success. Note: Zeroing out the "shadow" list entry is done 1014 * inside of tavor_mcg_hash_list_remove(). 1015 */ 1016 if (mcg->mcg_num_qps == 1) { 1017 1018 /* Remove an MCG entry from the hash chain */ 1019 status = tavor_mcg_hash_list_remove(state, end_indx, prev_indx, 1020 mcg_entry); 1021 if (status != DDI_SUCCESS) { 1022 mutex_exit(&state->ts_mcglock); 1023 return (status); 1024 } 1025 1026 } else { 1027 /* 1028 * Write the updated MCG entry back to the Tavor MCG table. 1029 * If this succeeds, then we update the "shadow" list to 1030 * reflect the change (i.e. decrement the "mcg_num_qps"), 1031 * drop the lock, and return success. Note: In general, 1032 * this operation shouldn't fail. If it does, then it is an 1033 * indication that something (probably in HW, but maybe in SW) 1034 * has gone seriously wrong. 1035 */ 1036 status = tavor_write_mgm_cmd_post(state, mcg_entry, end_indx, 1037 TAVOR_CMD_NOSLEEP_SPIN); 1038 if (status != TAVOR_CMD_SUCCESS) { 1039 mutex_exit(&state->ts_mcglock); 1040 TAVOR_WARNING(state, "failed to write MCG entry"); 1041 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: " 1042 "%08x\n", status); 1043 return (ibc_get_ci_failure(0)); 1044 } 1045 mcg->mcg_num_qps--; 1046 } 1047 1048 mutex_exit(&state->ts_mcglock); 1049 return (DDI_SUCCESS); 1050 } 1051 1052 /* 1053 * tavor_qp_mcg_refcnt_inc() 1054 * Context: Can be called from interrupt or base context. 1055 */ 1056 static void 1057 tavor_qp_mcg_refcnt_inc(tavor_qphdl_t qp) 1058 { 1059 /* Increment the QP's MCG reference count */ 1060 mutex_enter(&qp->qp_lock); 1061 qp->qp_mcg_refcnt++; 1062 mutex_exit(&qp->qp_lock); 1063 } 1064 1065 1066 /* 1067 * tavor_qp_mcg_refcnt_dec() 1068 * Context: Can be called from interrupt or base context. 
1069 */ 1070 static void 1071 tavor_qp_mcg_refcnt_dec(tavor_qphdl_t qp) 1072 { 1073 /* Decrement the QP's MCG reference count */ 1074 mutex_enter(&qp->qp_lock); 1075 qp->qp_mcg_refcnt--; 1076 mutex_exit(&qp->qp_lock); 1077 } 1078 1079 1080 /* 1081 * tavor_mcg_qplist_add() 1082 * Context: Can be called from interrupt or base context. 1083 */ 1084 static int 1085 tavor_mcg_qplist_add(tavor_state_t *state, tavor_mcghdl_t mcg, 1086 tavor_hw_mcg_qp_list_t *mcg_qplist, tavor_qphdl_t qp, 1087 uint_t *qp_found) 1088 { 1089 uint_t qplist_indx; 1090 1091 ASSERT(MUTEX_HELD(&state->ts_mcglock)); 1092 1093 qplist_indx = mcg->mcg_num_qps; 1094 1095 /* 1096 * Determine if we have exceeded the maximum number of QP per 1097 * multicast group. If we have, then return an error 1098 */ 1099 if (qplist_indx >= state->ts_cfg_profile->cp_num_qp_per_mcg) { 1100 return (IBT_HCA_MCG_QP_EXCEEDED); 1101 } 1102 1103 /* 1104 * Determine if the QP is already attached to this MCG table. If it 1105 * is, then we break out and treat this operation as a NO-OP 1106 */ 1107 for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps; 1108 qplist_indx++) { 1109 if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) { 1110 break; 1111 } 1112 } 1113 1114 /* 1115 * If the QP was already on the list, set 'qp_found' to TRUE. We still 1116 * return SUCCESS in this case, but the qplist will not have been 1117 * updated because the QP was already on the list. 1118 */ 1119 if (qplist_indx < mcg->mcg_num_qps) { 1120 *qp_found = 1; 1121 } else { 1122 /* 1123 * Otherwise, append the new QP number to the end of the 1124 * current QP list. Note: We will increment the "mcg_num_qps" 1125 * field on the "shadow" MCG list entry later (after we know 1126 * that all necessary Tavor firmware accesses have been 1127 * successful). 1128 * 1129 * Set 'qp_found' to 0 so we know the QP was added on to the 1130 * list for sure. 
1131 */ 1132 mcg_qplist[qplist_indx].q = TAVOR_MCG_QPN_VALID; 1133 mcg_qplist[qplist_indx].qpn = qp->qp_qpnum; 1134 *qp_found = 0; 1135 } 1136 1137 return (DDI_SUCCESS); 1138 } 1139 1140 1141 1142 /* 1143 * tavor_mcg_qplist_remove() 1144 * Context: Can be called from interrupt or base context. 1145 */ 1146 static int 1147 tavor_mcg_qplist_remove(tavor_mcghdl_t mcg, tavor_hw_mcg_qp_list_t *mcg_qplist, 1148 tavor_qphdl_t qp) 1149 { 1150 uint_t i, qplist_indx; 1151 1152 /* 1153 * Search the MCG QP list for a matching QPN. When 1154 * it's found, we swap the last entry with the current 1155 * one, set the last entry to zero, decrement the last 1156 * entry, and return. If it's not found, then it's 1157 * and error. 1158 */ 1159 qplist_indx = mcg->mcg_num_qps; 1160 for (i = 0; i < qplist_indx; i++) { 1161 if (mcg_qplist[i].qpn == qp->qp_qpnum) { 1162 mcg_qplist[i] = mcg_qplist[qplist_indx - 1]; 1163 mcg_qplist[qplist_indx - 1].q = TAVOR_MCG_QPN_INVALID; 1164 mcg_qplist[qplist_indx - 1].qpn = 0; 1165 1166 return (DDI_SUCCESS); 1167 } 1168 } 1169 1170 return (IBT_QP_HDL_INVALID); 1171 } 1172 1173 1174 /* 1175 * tavor_mcg_walk_mgid_hash() 1176 * Context: Can be called from interrupt or base context. 
1177 */ 1178 static uint_t 1179 tavor_mcg_walk_mgid_hash(tavor_state_t *state, uint64_t start_indx, 1180 ib_gid_t mgid, uint_t *p_indx) 1181 { 1182 tavor_mcghdl_t curr_mcghdl; 1183 uint_t curr_indx, prev_indx; 1184 1185 ASSERT(MUTEX_HELD(&state->ts_mcglock)); 1186 1187 /* Start at the head of the hash chain */ 1188 curr_indx = start_indx; 1189 prev_indx = curr_indx; 1190 curr_mcghdl = &state->ts_mcghdl[curr_indx]; 1191 1192 /* If the first entry in the chain has MGID == 0, then stop */ 1193 if ((curr_mcghdl->mcg_mgid_h == 0) && 1194 (curr_mcghdl->mcg_mgid_l == 0)) { 1195 goto end_mgid_hash_walk; 1196 } 1197 1198 /* If the first entry in the chain matches the MGID, then stop */ 1199 if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) && 1200 (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) { 1201 goto end_mgid_hash_walk; 1202 } 1203 1204 /* Otherwise, walk the hash chain looking for a match */ 1205 while (curr_mcghdl->mcg_next_indx != 0) { 1206 prev_indx = curr_indx; 1207 curr_indx = curr_mcghdl->mcg_next_indx; 1208 curr_mcghdl = &state->ts_mcghdl[curr_indx]; 1209 1210 if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) && 1211 (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) { 1212 break; 1213 } 1214 } 1215 1216 end_mgid_hash_walk: 1217 /* 1218 * If necessary, return the index of the previous entry too. This 1219 * is primarily used for detaching a QP from a multicast group. It 1220 * may be necessary, in that case, to delete an MCG entry from the 1221 * hash chain and having the index of the previous entry is helpful. 1222 */ 1223 if (p_indx != NULL) { 1224 *p_indx = prev_indx; 1225 } 1226 return (curr_indx); 1227 } 1228 1229 1230 /* 1231 * tavor_mcg_setup_new_hdr() 1232 * Context: Can be called from interrupt or base context. 
1233 */ 1234 static void 1235 tavor_mcg_setup_new_hdr(tavor_mcghdl_t mcg, tavor_hw_mcg_t *mcg_hdr, 1236 ib_gid_t mgid, tavor_rsrc_t *mcg_rsrc) 1237 { 1238 /* 1239 * Fill in the fields of the "shadow" entry used by software 1240 * to track MCG hardware entry 1241 */ 1242 mcg->mcg_mgid_h = mgid.gid_prefix; 1243 mcg->mcg_mgid_l = mgid.gid_guid; 1244 mcg->mcg_rsrcp = mcg_rsrc; 1245 mcg->mcg_next_indx = 0; 1246 mcg->mcg_num_qps = 0; 1247 1248 /* 1249 * Fill the header fields of the MCG entry (in the temporary copy) 1250 */ 1251 mcg_hdr->mgid_h = mgid.gid_prefix; 1252 mcg_hdr->mgid_l = mgid.gid_guid; 1253 mcg_hdr->next_gid_indx = 0; 1254 } 1255 1256 1257 /* 1258 * tavor_mcg_hash_list_remove() 1259 * Context: Can be called only from user or kernel context. 1260 */ 1261 static int 1262 tavor_mcg_hash_list_remove(tavor_state_t *state, uint_t curr_indx, 1263 uint_t prev_indx, tavor_hw_mcg_t *mcg_entry) 1264 { 1265 tavor_mcghdl_t curr_mcg, prev_mcg, next_mcg; 1266 uint_t next_indx; 1267 int status; 1268 1269 /* Get the pointer to "shadow" list for current entry */ 1270 curr_mcg = &state->ts_mcghdl[curr_indx]; 1271 1272 /* 1273 * If this is the first entry on a hash chain, then attempt to replace 1274 * the entry with the next entry on the chain. If there are no 1275 * subsequent entries on the chain, then this is the only entry and 1276 * should be invalidated. 1277 */ 1278 if (curr_indx == prev_indx) { 1279 1280 /* 1281 * If this is the only entry on the chain, then invalidate it. 1282 * Note: Invalidating an MCG entry means writing all zeros 1283 * to the entry. This is only necessary for those MCG 1284 * entries that are the "head" entries of the individual hash 1285 * chains. Regardless of whether this operation returns 1286 * success or failure, return that result to the caller. 
1287 */ 1288 next_indx = curr_mcg->mcg_next_indx; 1289 if (next_indx == 0) { 1290 status = tavor_mcg_entry_invalidate(state, mcg_entry, 1291 curr_indx); 1292 bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s)); 1293 return (status); 1294 } 1295 1296 /* 1297 * Otherwise, this is just the first entry on the chain, so 1298 * grab the next one 1299 */ 1300 next_mcg = &state->ts_mcghdl[next_indx]; 1301 1302 /* 1303 * Read the next MCG entry into the temporary MCG. Note: 1304 * In general, this operation shouldn't fail. If it does, 1305 * then it is an indication that something (probably in HW, 1306 * but maybe in SW) has gone seriously wrong. 1307 */ 1308 status = tavor_read_mgm_cmd_post(state, mcg_entry, next_indx, 1309 TAVOR_CMD_NOSLEEP_SPIN); 1310 if (status != TAVOR_CMD_SUCCESS) { 1311 TAVOR_WARNING(state, "failed to read MCG entry"); 1312 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: " 1313 "%08x\n", status); 1314 return (ibc_get_ci_failure(0)); 1315 } 1316 1317 /* 1318 * Copy/Write the temporary MCG back to the hardware MCG list 1319 * using the current index. This essentially removes the 1320 * current MCG entry from the list by writing over it with 1321 * the next one. If this is successful, then we can do the 1322 * same operation for the "shadow" list. And we can also 1323 * free up the Tavor MCG entry resource that was associated 1324 * with the (old) next entry. Note: In general, this 1325 * operation shouldn't fail. If it does, then it is an 1326 * indication that something (probably in HW, but maybe in SW) 1327 * has gone seriously wrong. 
1328 */ 1329 status = tavor_write_mgm_cmd_post(state, mcg_entry, curr_indx, 1330 TAVOR_CMD_NOSLEEP_SPIN); 1331 if (status != TAVOR_CMD_SUCCESS) { 1332 TAVOR_WARNING(state, "failed to write MCG entry"); 1333 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: " 1334 "%08x\n", status); 1335 return (ibc_get_ci_failure(0)); 1336 } 1337 1338 /* 1339 * Copy all the software tracking information from the next 1340 * entry on the "shadow" MCG list into the current entry on 1341 * the list. Then invalidate (zero out) the other "shadow" 1342 * list entry. 1343 */ 1344 bcopy(next_mcg, curr_mcg, sizeof (struct tavor_sw_mcg_list_s)); 1345 bzero(next_mcg, sizeof (struct tavor_sw_mcg_list_s)); 1346 1347 /* 1348 * Free up the Tavor MCG entry resource used by the "next" 1349 * MCG entry. That resource is no longer needed by any 1350 * MCG entry which is first on a hash chain (like the "next" 1351 * entry has just become). 1352 */ 1353 tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp); 1354 1355 return (DDI_SUCCESS); 1356 } 1357 1358 /* 1359 * Else if this is the last entry on the hash chain (or a middle 1360 * entry, then we update the previous entry's "next_gid_index" field 1361 * to make it point instead to the next entry on the chain. By 1362 * skipping over the removed entry in this way, we can then free up 1363 * any resources associated with the current entry. Note: We don't 1364 * need to invalidate the "skipped over" hardware entry because it 1365 * will no be longer connected to any hash chains, and if/when it is 1366 * finally re-used, it will be written with entirely new values. 1367 */ 1368 1369 /* 1370 * Read the next MCG entry into the temporary MCG. Note: In general, 1371 * this operation shouldn't fail. If it does, then it is an 1372 * indication that something (probably in HW, but maybe in SW) has 1373 * gone seriously wrong. 
1374 */ 1375 status = tavor_read_mgm_cmd_post(state, mcg_entry, prev_indx, 1376 TAVOR_CMD_NOSLEEP_SPIN); 1377 if (status != TAVOR_CMD_SUCCESS) { 1378 TAVOR_WARNING(state, "failed to read MCG entry"); 1379 cmn_err(CE_CONT, "Tavor: READ_MGM command failed: %08x\n", 1380 status); 1381 return (ibc_get_ci_failure(0)); 1382 } 1383 1384 /* 1385 * Finally, we update the "next_gid_indx" field in the temporary MCG 1386 * and attempt to write the entry back into the Tavor MCG table. If 1387 * this succeeds, then we update the "shadow" list to reflect the 1388 * change, free up the Tavor MCG entry resource that was associated 1389 * with the current entry, and return success. Note: In general, 1390 * this operation shouldn't fail. If it does, then it is an indication 1391 * that something (probably in HW, but maybe in SW) has gone seriously 1392 * wrong. 1393 */ 1394 mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx; 1395 status = tavor_write_mgm_cmd_post(state, mcg_entry, prev_indx, 1396 TAVOR_CMD_NOSLEEP_SPIN); 1397 if (status != TAVOR_CMD_SUCCESS) { 1398 TAVOR_WARNING(state, "failed to write MCG entry"); 1399 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n", 1400 status); 1401 return (ibc_get_ci_failure(0)); 1402 } 1403 1404 /* 1405 * Get the pointer to the "shadow" MCG list entry for the previous 1406 * MCG. Update its "mcg_next_indx" to point to the next entry 1407 * the one after the current entry. Note: This next index may be 1408 * zero, indicating the end of the list. 1409 */ 1410 prev_mcg = &state->ts_mcghdl[prev_indx]; 1411 prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx; 1412 1413 /* 1414 * Free up the Tavor MCG entry resource used by the current entry. 1415 * This resource is no longer needed because the chain now skips over 1416 * the current entry. Then invalidate (zero out) the current "shadow" 1417 * list entry. 
1418 */ 1419 tavor_rsrc_free(state, &curr_mcg->mcg_rsrcp); 1420 bzero(curr_mcg, sizeof (struct tavor_sw_mcg_list_s)); 1421 1422 return (DDI_SUCCESS); 1423 } 1424 1425 1426 /* 1427 * tavor_mcg_entry_invalidate() 1428 * Context: Can be called only from user or kernel context. 1429 */ 1430 static int 1431 tavor_mcg_entry_invalidate(tavor_state_t *state, tavor_hw_mcg_t *mcg_entry, 1432 uint_t indx) 1433 { 1434 int status; 1435 1436 /* 1437 * Invalidate the hardware MCG entry by zeroing out this temporary 1438 * MCG and writing it the the hardware. Note: In general, this 1439 * operation shouldn't fail. If it does, then it is an indication 1440 * that something (probably in HW, but maybe in SW) has gone seriously 1441 * wrong. 1442 */ 1443 bzero(mcg_entry, TAVOR_MCGMEM_SZ(state)); 1444 status = tavor_write_mgm_cmd_post(state, mcg_entry, indx, 1445 TAVOR_CMD_NOSLEEP_SPIN); 1446 if (status != TAVOR_CMD_SUCCESS) { 1447 TAVOR_WARNING(state, "failed to write MCG entry"); 1448 cmn_err(CE_CONT, "Tavor: WRITE_MGM command failed: %08x\n", 1449 status); 1450 return (ibc_get_ci_failure(0)); 1451 } 1452 1453 return (DDI_SUCCESS); 1454 } 1455 1456 1457 /* 1458 * tavor_mgid_is_valid() 1459 * Context: Can be called from interrupt or base context. 1460 */ 1461 static int 1462 tavor_mgid_is_valid(ib_gid_t gid) 1463 { 1464 uint_t topbits, flags, scope; 1465 1466 /* 1467 * According to IBA 1.1 specification (section 4.1.1) a valid 1468 * "multicast GID" must have its top eight bits set to all ones 1469 */ 1470 topbits = (gid.gid_prefix >> TAVOR_MCG_TOPBITS_SHIFT) & 1471 TAVOR_MCG_TOPBITS_MASK; 1472 if (topbits != TAVOR_MCG_TOPBITS) { 1473 return (0); 1474 } 1475 1476 /* 1477 * The next 4 bits are the "flag" bits. These are valid only 1478 * if they are "0" (which correspond to permanently assigned/ 1479 * "well-known" multicast GIDs) or "1" (for so-called "transient" 1480 * multicast GIDs). All other values are reserved. 
1481 */ 1482 flags = (gid.gid_prefix >> TAVOR_MCG_FLAGS_SHIFT) & 1483 TAVOR_MCG_FLAGS_MASK; 1484 if (!((flags == TAVOR_MCG_FLAGS_PERM) || 1485 (flags == TAVOR_MCG_FLAGS_NONPERM))) { 1486 return (0); 1487 } 1488 1489 /* 1490 * The next 4 bits are the "scope" bits. These are valid only 1491 * if they are "2" (Link-local), "5" (Site-local), "8" 1492 * (Organization-local) or "E" (Global). All other values 1493 * are reserved (or currently unassigned). 1494 */ 1495 scope = (gid.gid_prefix >> TAVOR_MCG_SCOPE_SHIFT) & 1496 TAVOR_MCG_SCOPE_MASK; 1497 if (!((scope == TAVOR_MCG_SCOPE_LINKLOC) || 1498 (scope == TAVOR_MCG_SCOPE_SITELOC) || 1499 (scope == TAVOR_MCG_SCOPE_ORGLOC) || 1500 (scope == TAVOR_MCG_SCOPE_GLOBAL))) { 1501 return (0); 1502 } 1503 1504 /* 1505 * If it passes all of the above checks, then we will consider it 1506 * a valid multicast GID. 1507 */ 1508 return (1); 1509 } 1510 1511 1512 /* 1513 * tavor_mlid_is_valid() 1514 * Context: Can be called from interrupt or base context. 1515 */ 1516 static int 1517 tavor_mlid_is_valid(ib_lid_t lid) 1518 { 1519 /* 1520 * According to IBA 1.1 specification (section 4.1.1) a valid 1521 * "multicast DLID" must be between 0xC000 and 0xFFFE. 1522 */ 1523 if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) { 1524 return (0); 1525 } 1526 1527 return (1); 1528 } 1529 1530 1531 /* 1532 * tavor_pd_alloc() 1533 * Context: Can be called only from user or kernel context. 1534 */ 1535 int 1536 tavor_pd_alloc(tavor_state_t *state, tavor_pdhdl_t *pdhdl, uint_t sleepflag) 1537 { 1538 tavor_rsrc_t *rsrc; 1539 tavor_pdhdl_t pd; 1540 int status; 1541 1542 /* 1543 * Allocate the software structure for tracking the protection domain 1544 * (i.e. the Tavor Protection Domain handle). By default each PD 1545 * structure will have a unique PD number assigned to it. All that 1546 * is necessary is for software to initialize the PD reference count 1547 * (to zero) and return success. 
1548 */ 1549 status = tavor_rsrc_alloc(state, TAVOR_PDHDL, 1, sleepflag, &rsrc); 1550 if (status != DDI_SUCCESS) { 1551 return (IBT_INSUFF_RESOURCE); 1552 } 1553 pd = (tavor_pdhdl_t)rsrc->tr_addr; 1554 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd)) 1555 1556 pd->pd_refcnt = 0; 1557 *pdhdl = pd; 1558 1559 return (DDI_SUCCESS); 1560 } 1561 1562 1563 /* 1564 * tavor_pd_free() 1565 * Context: Can be called only from user or kernel context. 1566 */ 1567 int 1568 tavor_pd_free(tavor_state_t *state, tavor_pdhdl_t *pdhdl) 1569 { 1570 tavor_rsrc_t *rsrc; 1571 tavor_pdhdl_t pd; 1572 1573 /* 1574 * Pull all the necessary information from the Tavor Protection Domain 1575 * handle. This is necessary here because the resource for the 1576 * PD is going to be freed up as part of this operation. 1577 */ 1578 pd = *pdhdl; 1579 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd)) 1580 rsrc = pd->pd_rsrcp; 1581 1582 /* 1583 * Check the PD reference count. If the reference count is non-zero, 1584 * then it means that this protection domain is still referenced by 1585 * some memory region, queue pair, address handle, or other IB object 1586 * If it is non-zero, then return an error. Otherwise, free the 1587 * Tavor resource and return success. 1588 */ 1589 if (pd->pd_refcnt != 0) { 1590 return (IBT_PD_IN_USE); 1591 } 1592 1593 /* Free the Tavor Protection Domain handle */ 1594 tavor_rsrc_free(state, &rsrc); 1595 1596 /* Set the pdhdl pointer to NULL and return success */ 1597 *pdhdl = (tavor_pdhdl_t)NULL; 1598 1599 return (DDI_SUCCESS); 1600 } 1601 1602 1603 /* 1604 * tavor_pd_refcnt_inc() 1605 * Context: Can be called from interrupt or base context. 1606 */ 1607 void 1608 tavor_pd_refcnt_inc(tavor_pdhdl_t pd) 1609 { 1610 /* Increment the protection domain's reference count */ 1611 mutex_enter(&pd->pd_lock); 1612 pd->pd_refcnt++; 1613 mutex_exit(&pd->pd_lock); 1614 1615 } 1616 1617 1618 /* 1619 * tavor_pd_refcnt_dec() 1620 * Context: Can be called from interrupt or base context. 
1621 */ 1622 void 1623 tavor_pd_refcnt_dec(tavor_pdhdl_t pd) 1624 { 1625 /* Decrement the protection domain's reference count */ 1626 mutex_enter(&pd->pd_lock); 1627 pd->pd_refcnt--; 1628 mutex_exit(&pd->pd_lock); 1629 1630 } 1631 1632 1633 /* 1634 * tavor_port_query() 1635 * Context: Can be called only from user or kernel context. 1636 */ 1637 int 1638 tavor_port_query(tavor_state_t *state, uint_t port, ibt_hca_portinfo_t *pi) 1639 { 1640 sm_portinfo_t portinfo; 1641 sm_guidinfo_t guidinfo; 1642 sm_pkey_table_t pkeytable; 1643 ib_gid_t *sgid; 1644 uint_t sgid_max, pkey_max, tbl_size; 1645 int i, j, indx, status; 1646 1647 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi)) 1648 1649 /* Validate that specified port number is legal */ 1650 if (!tavor_portnum_is_valid(state, port)) { 1651 return (IBT_HCA_PORT_INVALID); 1652 } 1653 1654 /* 1655 * We use the Tavor MAD_IFC command to post a GetPortInfo MAD 1656 * to the firmware (for the specified port number). This returns 1657 * a full PortInfo MAD (in "portinfo") which we subsequently 1658 * parse to fill in the "ibt_hca_portinfo_t" structure returned 1659 * to the IBTF. 
1660 */ 1661 status = tavor_getportinfo_cmd_post(state, port, 1662 TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo); 1663 if (status != TAVOR_CMD_SUCCESS) { 1664 cmn_err(CE_CONT, "Tavor: GetPortInfo (port %02d) command " 1665 "failed: %08x\n", port, status); 1666 return (ibc_get_ci_failure(0)); 1667 } 1668 1669 /* 1670 * Parse the PortInfo MAD and fill in the IBTF structure 1671 */ 1672 pi->p_base_lid = portinfo.LID; 1673 pi->p_qkey_violations = portinfo.Q_KeyViolations; 1674 pi->p_pkey_violations = portinfo.P_KeyViolations; 1675 pi->p_sm_sl = portinfo.MasterSMSL; 1676 pi->p_sm_lid = portinfo.MasterSMLID; 1677 pi->p_linkstate = portinfo.PortState; 1678 pi->p_port_num = portinfo.LocalPortNum; 1679 pi->p_phys_state = portinfo.PortPhysicalState; 1680 pi->p_width_supported = portinfo.LinkWidthSupported; 1681 pi->p_width_enabled = portinfo.LinkWidthEnabled; 1682 pi->p_width_active = portinfo.LinkWidthActive; 1683 pi->p_speed_supported = portinfo.LinkSpeedSupported; 1684 pi->p_speed_enabled = portinfo.LinkSpeedEnabled; 1685 pi->p_speed_active = portinfo.LinkSpeedActive; 1686 pi->p_mtu = portinfo.MTUCap; 1687 pi->p_lmc = portinfo.LMC; 1688 pi->p_max_vl = portinfo.VLCap; 1689 pi->p_subnet_timeout = portinfo.SubnetTimeOut; 1690 pi->p_msg_sz = ((uint32_t)1 << TAVOR_QP_LOG_MAX_MSGSZ); 1691 tbl_size = state->ts_cfg_profile->cp_log_max_gidtbl; 1692 pi->p_sgid_tbl_sz = (1 << tbl_size); 1693 tbl_size = state->ts_cfg_profile->cp_log_max_pkeytbl; 1694 pi->p_pkey_tbl_sz = (1 << tbl_size); 1695 1696 /* 1697 * Convert InfiniBand-defined port capability flags to the format 1698 * specified by the IBTF 1699 */ 1700 if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM) 1701 pi->p_capabilities |= IBT_PORT_CAP_SM; 1702 if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED) 1703 pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED; 1704 if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD) 1705 pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL; 1706 if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD) 
1707 pi->p_capabilities |= IBT_PORT_CAP_DM; 1708 if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD) 1709 pi->p_capabilities |= IBT_PORT_CAP_VENDOR; 1710 1711 /* 1712 * Fill in the SGID table. Since the only access to the Tavor 1713 * GID tables is through the firmware's MAD_IFC interface, we 1714 * post as many GetGUIDInfo MADs as necessary to read in the entire 1715 * contents of the SGID table (for the specified port). Note: The 1716 * GetGUIDInfo command only gets eight GUIDs per operation. These 1717 * GUIDs are then appended to the GID prefix for the port (from the 1718 * GetPortInfo above) to form the entire SGID table. 1719 */ 1720 for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) { 1721 status = tavor_getguidinfo_cmd_post(state, port, i >> 3, 1722 TAVOR_SLEEPFLAG_FOR_CONTEXT(), &guidinfo); 1723 if (status != TAVOR_CMD_SUCCESS) { 1724 cmn_err(CE_CONT, "Tavor: GetGUIDInfo (port %02d) " 1725 "command failed: %08x\n", port, status); 1726 return (ibc_get_ci_failure(0)); 1727 } 1728 1729 /* Figure out how many of the entries are valid */ 1730 sgid_max = min((pi->p_sgid_tbl_sz - i), 8); 1731 for (j = 0; j < sgid_max; j++) { 1732 indx = (i + j); 1733 sgid = &pi->p_sgid_tbl[indx]; 1734 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid)) 1735 sgid->gid_prefix = portinfo.GidPrefix; 1736 sgid->gid_guid = guidinfo.GUIDBlocks[j]; 1737 } 1738 } 1739 1740 /* 1741 * Fill in the PKey table. Just as for the GID tables above, the 1742 * only access to the Tavor PKey tables is through the firmware's 1743 * MAD_IFC interface. We post as many GetPKeyTable MADs as necessary 1744 * to read in the entire contents of the PKey table (for the specified 1745 * port). Note: The GetPKeyTable command only gets 32 PKeys per 1746 * operation. 
1747 */ 1748 for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) { 1749 status = tavor_getpkeytable_cmd_post(state, port, i, 1750 TAVOR_SLEEPFLAG_FOR_CONTEXT(), &pkeytable); 1751 if (status != TAVOR_CMD_SUCCESS) { 1752 cmn_err(CE_CONT, "Tavor: GetPKeyTable (port %02d) " 1753 "command failed: %08x\n", port, status); 1754 return (ibc_get_ci_failure(0)); 1755 } 1756 1757 /* Figure out how many of the entries are valid */ 1758 pkey_max = min((pi->p_pkey_tbl_sz - i), 32); 1759 for (j = 0; j < pkey_max; j++) { 1760 indx = (i + j); 1761 pi->p_pkey_tbl[indx] = pkeytable.P_KeyTableBlocks[j]; 1762 } 1763 } 1764 1765 return (DDI_SUCCESS); 1766 } 1767 1768 1769 /* 1770 * tavor_port_modify() 1771 * Context: Can be called only from user or kernel context. 1772 */ 1773 /* ARGSUSED */ 1774 int 1775 tavor_port_modify(tavor_state_t *state, uint8_t port, 1776 ibt_port_modify_flags_t flags, uint8_t init_type) 1777 { 1778 sm_portinfo_t portinfo; 1779 uint32_t capmask, reset_qkey; 1780 int status; 1781 1782 /* 1783 * Return an error if either of the unsupported flags are set 1784 */ 1785 if ((flags & IBT_PORT_SHUTDOWN) || 1786 (flags & IBT_PORT_SET_INIT_TYPE)) { 1787 return (IBT_NOT_SUPPORTED); 1788 } 1789 1790 /* 1791 * Determine whether we are trying to reset the QKey counter 1792 */ 1793 reset_qkey = (flags & IBT_PORT_RESET_QKEY) ? 1 : 0; 1794 1795 /* Validate that specified port number is legal */ 1796 if (!tavor_portnum_is_valid(state, port)) { 1797 return (IBT_HCA_PORT_INVALID); 1798 } 1799 1800 /* 1801 * Use the Tavor MAD_IFC command to post a GetPortInfo MAD to the 1802 * firmware (for the specified port number). This returns a full 1803 * PortInfo MAD (in "portinfo") from which we pull the current 1804 * capability mask. We then modify the capability mask as directed 1805 * by the "pmod_flags" field, and write the updated capability mask 1806 * using the Tavor SET_IB command (below). 
1807 */ 1808 status = tavor_getportinfo_cmd_post(state, port, 1809 TAVOR_SLEEPFLAG_FOR_CONTEXT(), &portinfo); 1810 if (status != TAVOR_CMD_SUCCESS) { 1811 return (ibc_get_ci_failure(0)); 1812 } 1813 1814 /* 1815 * Convert InfiniBand-defined port capability flags to the format 1816 * specified by the IBTF. Specifically, we modify the capability 1817 * mask based on the specified values. 1818 */ 1819 capmask = portinfo.CapabilityMask; 1820 1821 if (flags & IBT_PORT_RESET_SM) 1822 capmask &= ~SM_CAP_MASK_IS_SM; 1823 else if (flags & IBT_PORT_SET_SM) 1824 capmask |= SM_CAP_MASK_IS_SM; 1825 1826 if (flags & IBT_PORT_RESET_SNMP) 1827 capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD; 1828 else if (flags & IBT_PORT_SET_SNMP) 1829 capmask |= SM_CAP_MASK_IS_SNMP_SUPPD; 1830 1831 if (flags & IBT_PORT_RESET_DEVMGT) 1832 capmask &= ~SM_CAP_MASK_IS_DM_SUPPD; 1833 else if (flags & IBT_PORT_SET_DEVMGT) 1834 capmask |= SM_CAP_MASK_IS_DM_SUPPD; 1835 1836 if (flags & IBT_PORT_RESET_VENDOR) 1837 capmask &= ~SM_CAP_MASK_IS_VM_SUPPD; 1838 else if (flags & IBT_PORT_SET_VENDOR) 1839 capmask |= SM_CAP_MASK_IS_VM_SUPPD; 1840 1841 /* 1842 * Use the Tavor SET_IB command to update the capability mask and 1843 * (possibly) reset the QKey violation counter for the specified port. 1844 * Note: In general, this operation shouldn't fail. If it does, then 1845 * it is an indication that something (probably in HW, but maybe in 1846 * SW) has gone seriously wrong. 1847 */ 1848 status = tavor_set_ib_cmd_post(state, capmask, port, reset_qkey, 1849 TAVOR_SLEEPFLAG_FOR_CONTEXT()); 1850 if (status != TAVOR_CMD_SUCCESS) { 1851 TAVOR_WARNING(state, "failed to modify port capabilities"); 1852 cmn_err(CE_CONT, "Tavor: SET_IB (port %02d) command failed: " 1853 "%08x\n", port, status); 1854 return (ibc_get_ci_failure(0)); 1855 } 1856 1857 return (DDI_SUCCESS); 1858 } 1859 1860 1861 /* 1862 * tavor_set_addr_path() 1863 * Context: Can be called from interrupt or base context. 
1864 * 1865 * Note: This routine is used for two purposes. It is used to fill in the 1866 * Tavor UDAV fields, and it is used to fill in the address path information 1867 * for QPs. Because the two Tavor structures are similar, common fields can 1868 * be filled in here. Because they are slightly different, however, we pass 1869 * an additional flag to indicate which type is being filled. 1870 */ 1871 int 1872 tavor_set_addr_path(tavor_state_t *state, ibt_adds_vect_t *av, 1873 tavor_hw_addr_path_t *path, uint_t type, tavor_qphdl_t qp) 1874 { 1875 uint_t gidtbl_sz; 1876 1877 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av)) 1878 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path)) 1879 1880 path->ml_path = av->av_src_path; 1881 path->rlid = av->av_dlid; 1882 path->sl = av->av_srvl; 1883 1884 /* Port number only valid (in "av_port_num") if this is a UDAV */ 1885 if (type == TAVOR_ADDRPATH_UDAV) { 1886 path->portnum = av->av_port_num; 1887 } 1888 1889 /* 1890 * Validate (and fill in) static rate. 1891 * 1892 * The stat_rate_sup is used to decide how to set the rate and 1893 * if it is zero, the driver uses the old interface. 
1894 */ 1895 if (state->ts_devlim.stat_rate_sup) { 1896 if (av->av_srate == IBT_SRATE_20) { 1897 path->max_stat_rate = 0; /* 4x@DDR injection rate */ 1898 } else if (av->av_srate == IBT_SRATE_5) { 1899 path->max_stat_rate = 3; /* 1x@DDR injection rate */ 1900 } else if (av->av_srate == IBT_SRATE_10) { 1901 path->max_stat_rate = 2; /* 4x@SDR injection rate */ 1902 } else if (av->av_srate == IBT_SRATE_2) { 1903 path->max_stat_rate = 1; /* 1x@SDR injection rate */ 1904 } else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) { 1905 path->max_stat_rate = 0; /* Max */ 1906 } else { 1907 return (IBT_STATIC_RATE_INVALID); 1908 } 1909 } else { 1910 if (av->av_srate == IBT_SRATE_10) { 1911 path->max_stat_rate = 0; /* 4x@SDR injection rate */ 1912 } else if (av->av_srate == IBT_SRATE_2) { 1913 path->max_stat_rate = 1; /* 1x@SDR injection rate */ 1914 } else if (av->av_srate == IBT_SRATE_NOT_SPECIFIED) { 1915 path->max_stat_rate = 0; /* Max */ 1916 } else { 1917 return (IBT_STATIC_RATE_INVALID); 1918 } 1919 } 1920 1921 /* 1922 * If this is a QP operation save asoft copy. 1923 */ 1924 if (qp) { 1925 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate)) 1926 qp->qp_save_srate = av->av_srate; 1927 } 1928 1929 /* If "grh" flag is set, then check for valid SGID index too */ 1930 gidtbl_sz = (1 << state->ts_devlim.log_max_gid); 1931 if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) { 1932 return (IBT_SGID_INVALID); 1933 } 1934 1935 /* 1936 * Fill in all "global" values regardless of the value in the GRH 1937 * flag. Because "grh" is not set unless "av_send_grh" is set, the 1938 * hardware will ignore the other "global" values as necessary. Note: 1939 * SW does this here to enable later query operations to return 1940 * exactly the same params that were passed when the addr path was 1941 * last written. 
1942 */ 1943 path->grh = av->av_send_grh; 1944 if (type == TAVOR_ADDRPATH_QP) { 1945 path->mgid_index = av->av_sgid_ix; 1946 } else { 1947 /* 1948 * For Tavor UDAV, the "mgid_index" field is the index into 1949 * a combined table (not a per-port table). So some extra 1950 * calculations are necessary. 1951 */ 1952 path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) + 1953 av->av_sgid_ix; 1954 } 1955 path->flow_label = av->av_flow; 1956 path->tclass = av->av_tclass; 1957 path->hop_limit = av->av_hop; 1958 path->rgid_h = av->av_dgid.gid_prefix; 1959 1960 /* 1961 * According to Tavor PRM, the (31:0) part of rgid_l must be set to 1962 * "0x2" if the 'grh' or 'g' bit is cleared. It also says that we 1963 * only need to do it for UDAV's. So we enforce that here. 1964 * 1965 * NOTE: The entire 64 bits worth of GUID info is actually being 1966 * preserved (for UDAVs) by the callers of this function 1967 * (tavor_ah_alloc() and tavor_ah_modify()) and as long as the 1968 * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are 1969 * "don't care". 1970 */ 1971 if ((path->grh) || (type == TAVOR_ADDRPATH_QP)) { 1972 path->rgid_l = av->av_dgid.gid_guid; 1973 } else { 1974 path->rgid_l = 0x2; 1975 } 1976 1977 return (DDI_SUCCESS); 1978 } 1979 1980 1981 /* 1982 * tavor_get_addr_path() 1983 * Context: Can be called from interrupt or base context. 1984 * 1985 * Note: Just like tavor_set_addr_path() above, this routine is used for two 1986 * purposes. It is used to read in the Tavor UDAV fields, and it is used to 1987 * read in the address path information for QPs. Because the two Tavor 1988 * structures are similar, common fields can be read in here. But because 1989 * they are slightly different, we pass an additional flag to indicate which 1990 * type is being read. 
 */
void
tavor_get_addr_path(tavor_state_t *state, tavor_hw_addr_path_t *path,
    ibt_adds_vect_t *av, uint_t type, tavor_qphdl_t qp)
{
	uint_t		gidtbl_sz;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))

	av->av_src_path = path->ml_path;
	av->av_port_num = path->portnum;
	av->av_dlid = path->rlid;
	av->av_srvl = path->sl;

	/*
	 * Set "av_srate" value from max_stat_rate (inverse of the mapping
	 * done in tavor_set_addr_path()).
	 */
	if (qp) {
		/*
		 * If a QP operation use the soft copy
		 */
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(qp->qp_save_srate))
		av->av_srate = qp->qp_save_srate;
	} else {
		/*
		 * The stat_rate_sup is used to decide how the srate value is
		 * set and if it is zero, the driver uses the old interface.
		 */
		if (state->ts_devlim.stat_rate_sup) {
			if (path->max_stat_rate == 0) {
				av->av_srate = IBT_SRATE_20; /* 4x@DDR rate */
			} else if (path->max_stat_rate == 1) {
				av->av_srate = IBT_SRATE_2; /* 1x@SDR rate */
			} else if (path->max_stat_rate == 2) {
				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
			} else if (path->max_stat_rate == 3) {
				av->av_srate = IBT_SRATE_5; /* 1x@DDR rate */
			}
		} else {
			if (path->max_stat_rate == 0) {
				av->av_srate = IBT_SRATE_10; /* 4x@SDR rate */
			} else if (path->max_stat_rate == 1) {
				av->av_srate = IBT_SRATE_2; /* 1x@SDR rate */
			}
		}
	}

	/*
	 * Extract all "global" values regardless of the value in the GRH
	 * flag.  Because "av_send_grh" is set only if "grh" is set, software
	 * knows to ignore the other "global" values as necessary.  Note: SW
	 * does it this way to enable these query operations to return exactly
	 * the same params that were passed when the addr path was last written.
	 */
	av->av_send_grh = path->grh;
	if (type == TAVOR_ADDRPATH_QP) {
		av->av_sgid_ix = path->mgid_index;
	} else {
		/*
		 * For Tavor UDAV, the "mgid_index" field is the index into
		 * a combined table (not a per-port table).  So some extra
		 * calculations are necessary.
		 */
		gidtbl_sz = (1 << state->ts_devlim.log_max_gid);
		av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
		    gidtbl_sz);
	}
	av->av_flow = path->flow_label;
	av->av_tclass = path->tclass;
	av->av_hop = path->hop_limit;
	av->av_dgid.gid_prefix = path->rgid_h;
	av->av_dgid.gid_guid = path->rgid_l;
}


/*
 * tavor_portnum_is_valid()
 *    Context: Can be called from interrupt or base context.
 *
 *    Returns nonzero iff "portnum" is in the range [1, cp_num_ports]
 *    (IB port numbers are 1-based).
 */
int
tavor_portnum_is_valid(tavor_state_t *state, uint_t portnum)
{
	uint_t	max_port;

	max_port = state->ts_cfg_profile->cp_num_ports;
	if ((portnum <= max_port) && (portnum != 0)) {
		return (1);
	} else {
		return (0);
	}
}


/*
 * tavor_pkeyindex_is_valid()
 *    Context: Can be called from interrupt or base context.
 *
 *    Returns nonzero iff "pkeyindx" is less than the configured PKey
 *    table size (2^cp_log_max_pkeytbl).
 */
int
tavor_pkeyindex_is_valid(tavor_state_t *state, uint_t pkeyindx)
{
	uint_t	max_pkeyindx;

	max_pkeyindx = 1 << state->ts_cfg_profile->cp_log_max_pkeytbl;
	if (pkeyindx < max_pkeyindx) {
		return (1);
	} else {
		return (0);
	}
}


/*
 * tavor_queue_alloc()
 *    Context: Can be called from interrupt or base context.
 *
 *    Allocates DMA-able queue memory as described by "qa_info" from one of
 *    three sources (system memory, userland-mappable memory, or on-board
 *    DDR memory), and applies any extra "alloc" alignment on the result.
 */
int
tavor_queue_alloc(tavor_state_t *state, tavor_qalloc_info_t *qa_info,
    uint_t sleepflag)
{
	ddi_dma_attr_t		dma_attr;
	int			(*callback)(caddr_t);
	uint64_t		realsize, alloc_mask;
	uint_t			dma_xfer_mode, type;
	int			flag, status;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))

	/* Set the callback flag appropriately */
	callback = (sleepflag == TAVOR_SLEEP) ?
	    DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;

	/*
	 * Initialize many of the default DMA attributes.  Then set additional
	 * alignment restrictions as necessary for the queue memory.  Also
	 * respect the configured value for IOMMU bypass
	 */
	tavor_dma_attr_init(&dma_attr);
	dma_attr.dma_attr_align = qa_info->qa_bind_align;
	type = state->ts_cfg_profile->cp_iommu_bypass;
	if (type == TAVOR_BINDMEM_BYPASS) {
		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
	}

	/* Allocate a DMA handle */
	status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr, callback, NULL,
	    &qa_info->qa_dmahdl);
	if (status != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	/*
	 * Determine the amount of memory to allocate, depending on the values
	 * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
	 * to solve here is that allocating a DMA handle with IOMMU bypass
	 * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
	 * that are less than the page size.  Since we may need stricter
	 * alignments on the memory allocated by ddi_dma_mem_alloc() (e.g. in
	 * Tavor QP work queue memory allocation), we use the following method
	 * to calculate how much additional memory to request, and we enforce
	 * our own alignment on the allocated result.
	 */
	alloc_mask = qa_info->qa_alloc_align - 1;
	if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
		realsize = qa_info->qa_size;
	} else {
		/* Over-allocate so we can align the pointer ourselves below */
		realsize = qa_info->qa_size + alloc_mask;
	}

	/*
	 * If we are to allocate the queue from system memory, then use
	 * ddi_dma_mem_alloc() to find the space.  Otherwise, if we are to
	 * allocate the queue from locally-attached DDR memory, then use the
	 * vmem allocator to find the space.  In either case, return a pointer
	 * to the memory range allocated (including any necessary alignment
	 * adjustments), the "real" memory pointer, the "real" size, and a
	 * ddi_acc_handle_t to use when reading from/writing to the memory.
	 */
	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {

		/*
		 * Determine whether to map STREAMING or CONSISTENT.  This is
		 * based on the value set in the configuration profile at
		 * attach time.
		 */
		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;

		/* Allocate system memory for the queue */
		status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
		    &state->ts_reg_accattr, dma_xfer_mode, callback, NULL,
		    (caddr_t *)&qa_info->qa_buf_real,
		    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
		if (status != DDI_SUCCESS) {
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			return (DDI_FAILURE);
		}

		/*
		 * Save temporary copy of the real pointer.  (This may be
		 * modified in the last step below).
		 */
		qa_info->qa_buf_aligned = qa_info->qa_buf_real;

	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {

		/* Allocate userland mappable memory for the queue */
		flag = (sleepflag == TAVOR_SLEEP) ? DDI_UMEM_SLEEP :
		    DDI_UMEM_NOSLEEP;
		qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
		    &qa_info->qa_umemcookie);
		if (qa_info->qa_buf_real == NULL) {
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			return (DDI_FAILURE);
		}

		/*
		 * Save temporary copy of the real pointer.  (This may be
		 * modified in the last step below).
		 *
		 * NOTE(review): unlike the other two branches, neither
		 * "qa_buf_realsz" nor "qa_acchdl" is set here; this path
		 * appears to rely on the umem cookie instead (see
		 * ddi_umem_free() in tavor_queue_free()) -- confirm callers
		 * do not read those fields for USERLAND queues.
		 */
		qa_info->qa_buf_aligned = qa_info->qa_buf_real;

	} else { /* TAVOR_QUEUE_LOCATION_INDDR */

		/* Allocate DDR memory for the queue */
		flag = (sleepflag == TAVOR_SLEEP) ? VM_SLEEP : VM_NOSLEEP;
		qa_info->qa_buf_real = (uint32_t *)vmem_xalloc(
		    state->ts_ddrvmem, realsize, qa_info->qa_bind_align, 0, 0,
		    NULL, NULL, flag);
		if (qa_info->qa_buf_real == NULL) {
			ddi_dma_free_handle(&qa_info->qa_dmahdl);
			return (DDI_FAILURE);
		}

		/*
		 * Since "qa_buf_real" will be a PCI address (the offset into
		 * the DDR memory), we first need to do some calculations to
		 * convert it to its kernel mapped address.  (Note: This may
		 * be modified again below, when any additional "alloc"
		 * alignment constraint is applied).
		 */
		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
		    state->ts_reg_ddr_baseaddr) + ((uintptr_t)
		    qa_info->qa_buf_real - state->ts_ddr.ddr_baseaddr));
		qa_info->qa_buf_realsz = realsize;
		qa_info->qa_acchdl = state->ts_reg_ddrhdl;
	}

	/*
	 * The last step is to ensure that the final address ("qa_buf_aligned")
	 * has the appropriate "alloc" alignment restriction applied to it
	 * (if necessary).
	 */
	if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
		    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
	}

	return (DDI_SUCCESS);
}


/*
 * tavor_queue_free()
 *    Context: Can be called from interrupt or base context.
 *
 *    Releases queue memory previously allocated by tavor_queue_alloc(),
 *    then frees the associated DMA handle.
 */
void
tavor_queue_free(tavor_state_t *state, tavor_qalloc_info_t *qa_info)
{
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))

	/*
	 * Depending on how (i.e. from where) we allocated the memory for
	 * this queue, we choose the appropriate method for releasing the
	 * resources.
	 */
	if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_NORMAL) {

		ddi_dma_mem_free(&qa_info->qa_acchdl);

	} else if (qa_info->qa_location == TAVOR_QUEUE_LOCATION_USERLAND) {

		ddi_umem_free(qa_info->qa_umemcookie);

	} else { /* TAVOR_QUEUE_LOCATION_INDDR */

		vmem_xfree(state->ts_ddrvmem, qa_info->qa_buf_real,
		    qa_info->qa_buf_realsz);
	}

	/* Always free the dma handle */
	ddi_dma_free_handle(&qa_info->qa_dmahdl);
}


/*
 * tavor_dma_attr_init()
 *    Context: Can be called from interrupt or base context.
 *
 *    Fills in the given ddi_dma_attr_t with the driver defaults: full
 *    64-bit address range, byte alignment (callers override
 *    dma_attr_align as needed -- see tavor_queue_alloc()), and no flags.
 */
void
tavor_dma_attr_init(ddi_dma_attr_t *dma_attr)
{
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*dma_attr))

	dma_attr->dma_attr_version = DMA_ATTR_V0;
	dma_attr->dma_attr_addr_lo = 0;
	dma_attr->dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_count_max = 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_align = 1;
	dma_attr->dma_attr_burstsizes = 0x3FF;
	dma_attr->dma_attr_minxfer = 1;
	dma_attr->dma_attr_maxxfer = 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_seg = 0xFFFFFFFFFFFFFFFFull;
	dma_attr->dma_attr_sgllen = 0x7FFFFFFF;
	dma_attr->dma_attr_granular = 1;
	dma_attr->dma_attr_flags = 0;
}