1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * tavor_wr.c 29 * Tavor Work Request Processing Routines 30 * 31 * Implements all the routines necessary to provide the PostSend(), 32 * PostRecv() and PostSRQ() verbs. Also contains all the code 33 * necessary to implement the Tavor WRID tracking mechanism. 34 */ 35 36 #include <sys/types.h> 37 #include <sys/conf.h> 38 #include <sys/ddi.h> 39 #include <sys/sunddi.h> 40 #include <sys/modctl.h> 41 #include <sys/avl.h> 42 43 #include <sys/ib/adapters/tavor/tavor.h> 44 45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, 46 uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode); 47 #pragma inline(tavor_qp_send_doorbell) 48 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, 49 uint32_t nds, uint32_t qpn, uint32_t credits); 50 #pragma inline(tavor_qp_recv_doorbell) 51 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr); 52 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr); 53 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp, 54 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size); 55 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, 56 ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz, 57 uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp); 58 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp, 59 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size); 60 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc, 61 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, 62 tavor_qphdl_t qp); 63 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp, 64 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size); 65 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz, 66 uint64_t *prev, tavor_qphdl_t qp); 67 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq, 68 ibt_recv_wr_t *wr, uint64_t *desc); 69 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev, 70 tavor_srqhdl_t srq); 71 static void tavor_wqe_sync(void *hdl, uint_t sync_from, 72 uint_t sync_to, uint_t sync_type, uint_t flag); 73 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq, 74 tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe); 75 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq); 76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, 77 uint_t send_or_recv); 78 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state, 79 tavor_cqhdl_t cq, uint_t qpn, uint_t 
wq_type, uint_t create_wql); 80 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq); 81 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr, 82 tavor_wrid_list_hdr_t *wrid_list); 83 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr, 84 tavor_wrid_list_hdr_t *wrid_list); 85 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq); 86 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp); 87 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp); 88 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr); 89 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr); 90 91 /* 92 * tavor_post_send() 93 * Context: Can be called from interrupt or base context. 94 */ 95 int 96 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp, 97 ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted) 98 { 99 tavor_sw_wqe_dbinfo_t dbinfo; 100 tavor_wrid_list_hdr_t *wridlist; 101 tavor_wrid_entry_t *wre_last; 102 uint64_t *desc, *prev, *first; 103 uint32_t desc_sz, first_sz; 104 uint32_t wqeaddrsz, signaled_dbd; 105 uint32_t head, tail, next_tail, qsize_msk; 106 uint32_t sync_from, sync_to; 107 uint_t currindx, wrindx, numremain; 108 uint_t chainlen, chainbegin, posted_cnt; 109 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB; 110 int status; 111 112 TAVOR_TNF_ENTER(tavor_post_send); 113 114 /* 115 * Check for user-mappable QP memory. Note: We do not allow kernel 116 * clients to post to QP memory that is accessible directly by the 117 * user. If the QP memory is user accessible, then return an error. 118 */ 119 if (qp->qp_is_umap) { 120 TNF_PROBE_0(tavor_post_send_inv_usrmapped_type, 121 TAVOR_TNF_ERROR, ""); 122 TAVOR_TNF_EXIT(tavor_post_send); 123 return (IBT_QP_HDL_INVALID); 124 } 125 126 /* Initialize posted_cnt */ 127 posted_cnt = 0; 128 129 mutex_enter(&qp->qp_lock); 130 131 /* 132 * Check QP state. Can not post Send requests from the "Reset", 133 * "Init", or "RTR" states 134 */ 135 if ((qp->qp_state == TAVOR_QP_RESET) || 136 (qp->qp_state == TAVOR_QP_INIT) || 137 (qp->qp_state == TAVOR_QP_RTR)) { 138 mutex_exit(&qp->qp_lock); 139 TNF_PROBE_0(tavor_post_send_inv_qpstate_fail, 140 TAVOR_TNF_ERROR, ""); 141 TAVOR_TNF_EXIT(tavor_post_send); 142 return (IBT_QP_STATE_INVALID); 143 } 144 145 /* Grab the lock for the WRID list */ 146 mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock); 147 wridlist = qp->qp_sq_wqhdr->wq_wrid_post; 148 149 /* Save away some initial QP state */ 150 qsize_msk = qp->qp_sq_wqhdr->wq_size - 1; 151 tail = qp->qp_sq_wqhdr->wq_tail; 152 head = qp->qp_sq_wqhdr->wq_head; 153 154 /* 155 * For each ibt_send_wr_t in the wr[] list passed in, parse the 156 * request and build a Send WQE. Note: Because we are potentially 157 * building a chain of WQEs, we want to link them all together. 158 * However, we do not want to link the first one to the previous 159 * WQE until the entire chain has been linked. Then in the last 160 * step we ring the appropriate doorbell. Note: It is possible for 161 * more Work Requests to be posted than the HW will support at one 162 * shot. If this happens, we need to be able to post and ring 163 * several chains here until the the entire request is complete. 164 */ 165 wrindx = 0; 166 numremain = num_wr; 167 status = DDI_SUCCESS; 168 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) { 169 /* 170 * For the first WQE on a new chain we need "prev" to point 171 * to the current descriptor. 
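 *
 * (As a sketch of the ring bookkeeping done in the loop below, using
 * the same local names rather than the exact code:
 *
 *     chainlen  = min(numremain, TAVOR_QP_MAXDESC_PER_DB)
 *     next_tail = (tail + 1) & qsize_msk      -- power-of-two ring
 *
 * and the queue is considered "full" when next_tail == head.)
 *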
As we begin to process 172 * further, "prev" will be updated to point to the previous 173 * WQE on the current chain (see below). 174 */ 175 prev = TAVOR_QP_SQ_ENTRY(qp, tail); 176 177 /* 178 * Before we begin, save the current "tail index" for later 179 * DMA sync 180 */ 181 sync_from = tail; 182 183 /* 184 * Break the request up into chains that are less than or 185 * equal to the maximum number of WQEs that can be posted 186 * per doorbell ring 187 */ 188 chainlen = (numremain > maxdb) ? maxdb : numremain; 189 numremain -= chainlen; 190 chainbegin = wrindx; 191 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) { 192 /* 193 * Check for "queue full" condition. If the queue 194 * is already full, then no more WQEs can be posted. 195 * So break out, ring a doorbell (if necessary) and 196 * return an error 197 */ 198 if (qp->qp_sq_wqhdr->wq_full != 0) { 199 status = IBT_QP_FULL; 200 TNF_PROBE_0_DEBUG(tavor_post_send_sqfull, 201 TAVOR_TNF_TRACE, ""); 202 break; 203 } 204 205 /* 206 * Increment the "tail index" and check for "queue 207 * full" condition. If we detect that the current 208 * work request is going to fill the work queue, then 209 * we mark this condition and continue. 210 */ 211 next_tail = (tail + 1) & qsize_msk; 212 if (next_tail == head) { 213 qp->qp_sq_wqhdr->wq_full = 1; 214 } 215 216 /* 217 * Get the address of the location where the next 218 * Send WQE should be built 219 */ 220 desc = TAVOR_QP_SQ_ENTRY(qp, tail); 221 222 /* 223 * Call tavor_wqe_send_build() to build the WQE 224 * at the given address. This routine uses the 225 * information in the ibt_send_wr_t list (wr[]) and 226 * returns the size of the WQE when it returns. 227 */ 228 status = tavor_wqe_send_build(state, qp, 229 &wr[wrindx], desc, &desc_sz); 230 if (status != DDI_SUCCESS) { 231 TNF_PROBE_0(tavor_post_send_bldwqe_fail, 232 TAVOR_TNF_ERROR, ""); 233 break; 234 } 235 236 /* 237 * Add a WRID entry to the WRID list. Need to 238 * calculate the "wqeaddrsz" and "signaled_dbd" 239 * values to pass to tavor_wrid_add_entry() 240 */ 241 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t) 242 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off), 243 desc_sz); 244 if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) || 245 (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) { 246 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED; 247 } else { 248 signaled_dbd = 0; 249 } 250 tavor_wrid_add_entry(qp->qp_sq_wqhdr, 251 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd); 252 253 /* 254 * If this is not the first descriptor on the current 255 * chain, then link it to the previous WQE. Otherwise, 256 * save the address and size of this descriptor (in 257 * "first" and "first_sz" respectively) and continue. 258 * Note: Linking a WQE to the the previous one will 259 * depend on whether the two WQEs are from "special 260 * QPs" (i.e. MLX transport WQEs) or whether they are 261 * normal Send WQEs. 
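 * Roughly: special (MLX) QPs are linked with tavor_wqe_mlx_linknext()
 * while all others use tavor_wqe_send_linknext(); in both cases the
 * NULL "dbinfo" argument indicates that no doorbell information needs
 * to be collected for these intermediate links.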
262 */ 263 if (currindx != 0) { 264 if (qp->qp_is_special) { 265 tavor_wqe_mlx_linknext(&wr[wrindx - 1], 266 desc, desc_sz, prev, NULL, qp); 267 } else { 268 tavor_wqe_send_linknext(&wr[wrindx], 269 &wr[wrindx - 1], desc, desc_sz, 270 prev, NULL, qp); 271 } 272 prev = desc; 273 } else { 274 first = desc; 275 first_sz = desc_sz; 276 } 277 278 /* 279 * Update the current "tail index" and increment 280 * "posted_cnt" 281 */ 282 tail = next_tail; 283 posted_cnt++; 284 } 285 286 /* 287 * If we reach here and there are one or more WQEs which have 288 * been successfully chained together, then we need to link 289 * the current chain to the previously executing chain of 290 * descriptor (if there is one) and ring the doorbell for the 291 * send work queue. 292 */ 293 if (currindx != 0) { 294 /* 295 * Before we link the chain, we need to ensure that the 296 * "next" field on the last WQE is set to NULL (to 297 * indicate the end of the chain). Note: Just as it 298 * did above, the format for the "next" fields in a 299 * given WQE depend on whether the WQE is MLX 300 * transport or not. 301 */ 302 if (qp->qp_is_special) { 303 tavor_wqe_mlx_linknext(&wr[chainbegin + 304 currindx - 1], NULL, 0, prev, NULL, qp); 305 } else { 306 tavor_wqe_send_linknext(NULL, 307 &wr[chainbegin + currindx - 1], NULL, 0, 308 prev, NULL, qp); 309 } 310 311 /* Save away updated "tail index" for the DMA sync */ 312 sync_to = tail; 313 314 /* Do a DMA sync for current send WQE(s) */ 315 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND, 316 DDI_DMA_SYNC_FORDEV); 317 318 /* 319 * Now link the chain to the old chain (if there was 320 * one. Note: still need to pay attention to whether 321 * the QP used MLX transport WQEs or not. 322 */ 323 if (qp->qp_is_special) { 324 tavor_wqe_mlx_linknext(NULL, first, first_sz, 325 qp->qp_sq_lastwqeaddr, &dbinfo, qp); 326 } else { 327 tavor_wqe_send_linknext(&wr[chainbegin], NULL, 328 first, first_sz, qp->qp_sq_lastwqeaddr, 329 &dbinfo, qp); 330 } 331 332 /* 333 * If there was a valid previous WQE (i.e. non-NULL), 334 * then sync it too. This is because we have updated 335 * its "next" fields and we want to ensure that the 336 * hardware can see the changes. 337 */ 338 if (qp->qp_sq_lastwqeaddr != NULL) { 339 sync_to = sync_from; 340 sync_from = (sync_from - 1) & qsize_msk; 341 tavor_wqe_sync(qp, sync_from, sync_to, 342 TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV); 343 } 344 345 /* 346 * Now if the WRID tail entry is non-NULL, then this 347 * represents the entry to which we are chaining the 348 * new entries. Since we are going to ring the 349 * doorbell for this WQE, we want set its "dbd" bit. 350 * 351 * On the other hand, if the tail is NULL, even though 352 * we will have rung the doorbell for the previous WQE 353 * (for the hardware's sake) it is irrelevant to our 354 * purposes (for tracking WRIDs) because we know the 355 * request must have already completed. 356 */ 357 wre_last = wridlist->wl_wre_old_tail; 358 if (wre_last != NULL) { 359 wre_last->wr_signaled_dbd |= 360 TAVOR_WRID_ENTRY_DOORBELLED; 361 } 362 363 /* Update some of the state in the QP */ 364 qp->qp_sq_lastwqeaddr = desc; 365 qp->qp_sq_wqhdr->wq_tail = tail; 366 367 /* Ring the doorbell */ 368 tavor_qp_send_doorbell(state, 369 (uint32_t)((uintptr_t)first - qp->qp_desc_off), 370 first_sz, qp->qp_qpnum, dbinfo.db_fence, 371 dbinfo.db_nopcode); 372 } 373 } 374 375 /* 376 * Update the "num_posted" return value (if necessary). Then drop 377 * the locks and return success. 
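 * Note that the locks are dropped in the reverse of the order in which
 * they were acquired above: first the WRID list lock
 * (wq_wrid_wql->wql_lock), then the qp_lock.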
378 */ 379 if (num_posted != NULL) { 380 *num_posted = posted_cnt; 381 } 382 383 mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock); 384 mutex_exit(&qp->qp_lock); 385 386 TAVOR_TNF_EXIT(tavor_post_send); 387 return (status); 388 } 389 390 391 /* 392 * tavor_post_recv() 393 * Context: Can be called from interrupt or base context. 394 */ 395 int 396 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp, 397 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted) 398 { 399 uint64_t *desc, *prev, *first; 400 uint32_t desc_sz, first_sz; 401 uint32_t wqeaddrsz, signaled_dbd; 402 uint32_t head, tail, next_tail, qsize_msk; 403 uint32_t sync_from, sync_to; 404 uint_t currindx, wrindx, numremain; 405 uint_t chainlen, posted_cnt; 406 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB; 407 int status; 408 409 TAVOR_TNF_ENTER(tavor_post_recv); 410 411 /* 412 * Check for user-mappable QP memory. Note: We do not allow kernel 413 * clients to post to QP memory that is accessible directly by the 414 * user. If the QP memory is user accessible, then return an error. 415 */ 416 if (qp->qp_is_umap) { 417 TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type, 418 TAVOR_TNF_ERROR, ""); 419 TAVOR_TNF_EXIT(tavor_post_recv); 420 return (IBT_QP_HDL_INVALID); 421 } 422 423 /* Initialize posted_cnt */ 424 posted_cnt = 0; 425 426 mutex_enter(&qp->qp_lock); 427 428 /* 429 * Check if QP is associated with an SRQ 430 */ 431 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 432 mutex_exit(&qp->qp_lock); 433 TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq, 434 TAVOR_TNF_ERROR, ""); 435 TAVOR_TNF_EXIT(tavor_post_recv); 436 return (IBT_SRQ_IN_USE); 437 } 438 439 /* 440 * Check QP state. Can not post Recv requests from the "Reset" state 441 */ 442 if (qp->qp_state == TAVOR_QP_RESET) { 443 mutex_exit(&qp->qp_lock); 444 TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail, 445 TAVOR_TNF_ERROR, ""); 446 TAVOR_TNF_EXIT(tavor_post_recv); 447 return (IBT_QP_STATE_INVALID); 448 } 449 450 /* Grab the lock for the WRID list */ 451 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock); 452 453 /* Save away some initial QP state */ 454 qsize_msk = qp->qp_rq_wqhdr->wq_size - 1; 455 tail = qp->qp_rq_wqhdr->wq_tail; 456 head = qp->qp_rq_wqhdr->wq_head; 457 458 /* 459 * For each ibt_recv_wr_t in the wr[] list passed in, parse the 460 * request and build a Recv WQE. Note: Because we are potentially 461 * building a chain of WQEs, we want to link them all together. 462 * However, we do not want to link the first one to the previous 463 * WQE until the entire chain has been linked. Then in the last 464 * step we ring the appropriate doorbell. Note: It is possible for 465 * more Work Requests to be posted than the HW will support at one 466 * shot. If this happens, we need to be able to post and ring 467 * several chains here until the the entire request is complete. 468 */ 469 wrindx = 0; 470 numremain = num_wr; 471 status = DDI_SUCCESS; 472 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) { 473 /* 474 * For the first WQE on a new chain we need "prev" to point 475 * to the current descriptor. As we begin to process 476 * further, "prev" will be updated to point to the previous 477 * WQE on the current chain (see below). 
478 */ 479 prev = TAVOR_QP_RQ_ENTRY(qp, tail); 480 481 /* 482 * Before we begin, save the current "tail index" for later 483 * DMA sync 484 */ 485 sync_from = tail; 486 487 /* 488 * Break the request up into chains that are less than or 489 * equal to the maximum number of WQEs that can be posted 490 * per doorbell ring 491 */ 492 chainlen = (numremain > maxdb) ? maxdb : numremain; 493 numremain -= chainlen; 494 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) { 495 /* 496 * Check for "queue full" condition. If the queue 497 * is already full, then no more WQEs can be posted. 498 * So break out, ring a doorbell (if necessary) and 499 * return an error 500 */ 501 if (qp->qp_rq_wqhdr->wq_full != 0) { 502 status = IBT_QP_FULL; 503 TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull, 504 TAVOR_TNF_TRACE, ""); 505 break; 506 } 507 508 /* 509 * Increment the "tail index" and check for "queue 510 * full" condition. If we detect that the current 511 * work request is going to fill the work queue, then 512 * we mark this condition and continue. 513 */ 514 next_tail = (tail + 1) & qsize_msk; 515 if (next_tail == head) { 516 qp->qp_rq_wqhdr->wq_full = 1; 517 } 518 519 /* 520 * Get the address of the location where the next 521 * Recv WQE should be built 522 */ 523 desc = TAVOR_QP_RQ_ENTRY(qp, tail); 524 525 /* 526 * Call tavor_wqe_recv_build() to build the WQE 527 * at the given address. This routine uses the 528 * information in the ibt_recv_wr_t list (wr[]) and 529 * returns the size of the WQE when it returns. 530 */ 531 status = tavor_wqe_recv_build(state, qp, &wr[wrindx], 532 desc, &desc_sz); 533 if (status != DDI_SUCCESS) { 534 TNF_PROBE_0(tavor_post_recv_bldwqe_fail, 535 TAVOR_TNF_ERROR, ""); 536 break; 537 } 538 539 /* 540 * Add a WRID entry to the WRID list. Need to 541 * calculate the "wqeaddrsz" and "signaled_dbd" 542 * values to pass to tavor_wrid_add_entry(). Note: 543 * all Recv WQEs are essentially "signaled" and 544 * "doorbelled" (since Tavor HW requires all 545 * RecvWQE's to have their "DBD" bits set). 546 */ 547 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t) 548 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off), 549 desc_sz); 550 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED | 551 TAVOR_WRID_ENTRY_DOORBELLED; 552 tavor_wrid_add_entry(qp->qp_rq_wqhdr, 553 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd); 554 555 /* 556 * If this is not the first descriptor on the current 557 * chain, then link it to the previous WQE. Otherwise, 558 * save the address and size of this descriptor (in 559 * "first" and "first_sz" respectively) and continue. 560 */ 561 if (currindx != 0) { 562 tavor_wqe_recv_linknext(desc, desc_sz, prev, 563 qp); 564 prev = desc; 565 } else { 566 first = desc; 567 first_sz = desc_sz; 568 } 569 570 /* 571 * Update the current "tail index" and increment 572 * "posted_cnt" 573 */ 574 tail = next_tail; 575 posted_cnt++; 576 } 577 578 /* 579 * If we reach here and there are one or more WQEs which have 580 * been successfully chained together, then we need to link 581 * the current chain to the previously executing chain of 582 * descriptor (if there is one) and ring the doorbell for the 583 * recv work queue. 584 */ 585 if (currindx != 0) { 586 /* 587 * Before we link the chain, we need to ensure that the 588 * "next" field on the last WQE is set to NULL (to 589 * indicate the end of the chain). 
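 * Unlike the send path above, no "dbinfo" is collected here; the
 * receive doorbell (see tavor_qp_recv_doorbell) carries only the next
 * WQE address/size, the QPN and a credit count, so there is no opcode
 * or fence information to pass back from the link step.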
590 */ 591 tavor_wqe_recv_linknext(NULL, 0, prev, qp); 592 593 /* Save away updated "tail index" for the DMA sync */ 594 sync_to = tail; 595 596 /* Do a DMA sync for current recv WQE(s) */ 597 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV, 598 DDI_DMA_SYNC_FORDEV); 599 600 /* 601 * Now link the chain to the old chain (if there was 602 * one. 603 */ 604 tavor_wqe_recv_linknext(first, first_sz, 605 qp->qp_rq_lastwqeaddr, qp); 606 607 /* 608 * If there was a valid previous WQE (i.e. non-NULL), 609 * then sync it too. This is because we have updated 610 * its "next" fields and we want to ensure that the 611 * hardware can see the changes. 612 */ 613 if (qp->qp_rq_lastwqeaddr != NULL) { 614 sync_to = sync_from; 615 sync_from = (sync_from - 1) & qsize_msk; 616 tavor_wqe_sync(qp, sync_from, sync_to, 617 TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV); 618 } 619 620 /* Update some of the state in the QP */ 621 qp->qp_rq_lastwqeaddr = desc; 622 qp->qp_rq_wqhdr->wq_tail = tail; 623 624 /* Ring the doorbell */ 625 tavor_qp_recv_doorbell(state, 626 (uint32_t)((uintptr_t)first - qp->qp_desc_off), 627 first_sz, qp->qp_qpnum, (chainlen % maxdb)); 628 } 629 } 630 631 /* 632 * Update the "num_posted" return value (if necessary). Then drop 633 * the locks and return success. 634 */ 635 if (num_posted != NULL) { 636 *num_posted = posted_cnt; 637 } 638 639 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock); 640 mutex_exit(&qp->qp_lock); 641 642 TAVOR_TNF_EXIT(tavor_post_recv); 643 return (status); 644 } 645 646 /* 647 * tavor_post_srq() 648 * Context: Can be called from interrupt or base context. 649 */ 650 int 651 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq, 652 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted) 653 { 654 uint64_t *desc, *prev, *first, *last_wqe_addr; 655 uint32_t signaled_dbd; 656 uint32_t sync_indx; 657 uint_t currindx, wrindx, numremain; 658 uint_t chainlen, posted_cnt; 659 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB; 660 int status; 661 662 TAVOR_TNF_ENTER(tavor_post_srq); 663 664 /* 665 * Check for user-mappable QP memory. Note: We do not allow kernel 666 * clients to post to QP memory that is accessible directly by the 667 * user. If the QP memory is user accessible, then return an error. 668 */ 669 if (srq->srq_is_umap) { 670 TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type, 671 TAVOR_TNF_ERROR, ""); 672 TAVOR_TNF_EXIT(tavor_post_srq); 673 return (IBT_SRQ_HDL_INVALID); 674 } 675 676 /* Initialize posted_cnt */ 677 posted_cnt = 0; 678 679 mutex_enter(&srq->srq_lock); 680 681 /* 682 * Check SRQ state. Can not post Recv requests when SRQ is in error 683 */ 684 if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) { 685 mutex_exit(&srq->srq_lock); 686 TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail, 687 TAVOR_TNF_ERROR, ""); 688 TAVOR_TNF_EXIT(tavor_post_srq); 689 return (IBT_QP_STATE_INVALID); 690 } 691 692 /* Grab the lock for the WRID list */ 693 mutex_enter(&srq->srq_wrid_wql->wql_lock); 694 695 /* 696 * For each ibt_recv_wr_t in the wr[] list passed in, parse the 697 * request and build a Recv WQE. Note: Because we are potentially 698 * building a chain of WQEs, we want to link them all together. 699 * However, we do not want to link the first one to the previous 700 * WQE until the entire chain has been linked. Then in the last 701 * step we ring the appropriate doorbell. Note: It is possible for 702 * more Work Requests to be posted than the HW will support at one 703 * shot. 
If this happens, we need to be able to post and ring 704 * several chains here until the the entire request is complete. 705 */ 706 wrindx = 0; 707 numremain = num_wr; 708 status = DDI_SUCCESS; 709 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) { 710 /* 711 * For the first WQE on a new chain we need "prev" to point 712 * to the current descriptor. As we begin to process 713 * further, "prev" will be updated to point to the previous 714 * WQE on the current chain (see below). 715 */ 716 if (srq->srq_wq_lastwqeindx == -1) { 717 prev = NULL; 718 } else { 719 prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx); 720 } 721 722 /* 723 * Break the request up into chains that are less than or 724 * equal to the maximum number of WQEs that can be posted 725 * per doorbell ring 726 */ 727 chainlen = (numremain > maxdb) ? maxdb : numremain; 728 numremain -= chainlen; 729 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) { 730 731 /* 732 * Check for "queue full" condition. If the queue 733 * is already full, then no more WQEs can be posted. 734 * So break out, ring a doorbell (if necessary) and 735 * return an error 736 */ 737 if (srq->srq_wridlist->wl_free_list_indx == -1) { 738 status = IBT_QP_FULL; 739 TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull, 740 TAVOR_TNF_TRACE, ""); 741 break; 742 } 743 744 /* 745 * Get the address of the location where the next 746 * Recv WQE should be built 747 */ 748 desc = TAVOR_SRQ_WQE_ADDR(srq, 749 srq->srq_wridlist->wl_free_list_indx); 750 751 /* 752 * Add a WRID entry to the WRID list. Need to 753 * set the "signaled_dbd" values to pass to 754 * tavor_wrid_add_entry(). Note: all Recv WQEs are 755 * essentially "signaled" 756 * 757 * The 'size' is stored at srq_alloc time, in the 758 * srq_wq_stride. This is a constant value required 759 * for SRQ. 760 */ 761 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED; 762 tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id, 763 signaled_dbd); 764 765 /* 766 * Call tavor_wqe_srq_build() to build the WQE 767 * at the given address. This routine uses the 768 * information in the ibt_recv_wr_t list (wr[]) and 769 * returns the size of the WQE when it returns. 770 */ 771 status = tavor_wqe_srq_build(state, srq, &wr[wrindx], 772 desc); 773 if (status != DDI_SUCCESS) { 774 TNF_PROBE_0(tavor_post_recv_bldwqe_fail, 775 TAVOR_TNF_ERROR, ""); 776 break; 777 } 778 779 /* 780 * If this is not the first descriptor on the current 781 * chain, then link it to the previous WQE. Otherwise, 782 * save the address of this descriptor (in "first") and 783 * continue. 784 */ 785 if (currindx != 0) { 786 tavor_wqe_srq_linknext(desc, prev, srq); 787 sync_indx = TAVOR_SRQ_WQE_INDEX( 788 srq->srq_wq_buf, prev, 789 srq->srq_wq_log_wqesz); 790 791 /* Do a DMA sync for previous recv WQE */ 792 tavor_wqe_sync(srq, sync_indx, sync_indx+1, 793 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV); 794 795 prev = desc; 796 } else { 797 798 /* 799 * In this case, the last WQE on the chain is 800 * also considered 'first'. So set prev to 801 * first, here. 802 */ 803 first = prev = desc; 804 } 805 806 /* 807 * Increment "posted_cnt" 808 */ 809 posted_cnt++; 810 } 811 812 /* 813 * If we reach here and there are one or more WQEs which have 814 * been successfully chained together, then we need to link 815 * the current chain to the previously executing chain of 816 * descriptor (if there is one) and ring the doorbell for the 817 * recv work queue. 
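 * Note that, unlike the QP receive queue (which remembers
 * qp_rq_lastwqeaddr as a pointer), the SRQ remembers its last WQE by
 * index.  Roughly:
 *
 *     addr  = TAVOR_SRQ_WQE_ADDR(srq, srq_wq_lastwqeindx)
 *     index = TAVOR_SRQ_WQE_INDEX(srq_wq_buf, desc, srq_wq_log_wqesz)
 *
 * with an index of -1 used to indicate "no previous WQE".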
818 */ 819 if (currindx != 0) { 820 /* 821 * Before we link the chain, we need to ensure that the 822 * "next" field on the last WQE is set to NULL (to 823 * indicate the end of the chain). 824 */ 825 tavor_wqe_srq_linknext(NULL, prev, srq); 826 827 sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev, 828 srq->srq_wq_log_wqesz); 829 830 /* Do a DMA sync for current recv WQE */ 831 tavor_wqe_sync(srq, sync_indx, sync_indx+1, 832 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV); 833 834 /* 835 * Now link the chain to the old chain (if there was 836 * one). 837 */ 838 if (srq->srq_wq_lastwqeindx == -1) { 839 last_wqe_addr = NULL; 840 } else { 841 last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq, 842 srq->srq_wq_lastwqeindx); 843 } 844 tavor_wqe_srq_linknext(first, last_wqe_addr, srq); 845 846 /* 847 * If there was a valid previous WQE (i.e. valid index), 848 * then sync it too. This is because we have updated 849 * its "next" fields and we want to ensure that the 850 * hardware can see the changes. 851 */ 852 if (srq->srq_wq_lastwqeindx != -1) { 853 sync_indx = srq->srq_wq_lastwqeindx; 854 tavor_wqe_sync(srq, sync_indx, sync_indx+1, 855 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV); 856 } 857 858 /* Update some of the state in the QP */ 859 srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX( 860 srq->srq_wq_buf, desc, 861 srq->srq_wq_log_wqesz); 862 863 /* Ring the doorbell */ 864 /* SRQ needs NDS of 0 */ 865 tavor_qp_recv_doorbell(state, 866 (uint32_t)((uintptr_t)first - srq->srq_desc_off), 867 0, srq->srq_srqnum, (chainlen % maxdb)); 868 } 869 } 870 871 /* 872 * Update the "num_posted" return value (if necessary). Then drop 873 * the locks and return success. 874 */ 875 if (num_posted != NULL) { 876 *num_posted = posted_cnt; 877 } 878 879 mutex_exit(&srq->srq_wrid_wql->wql_lock); 880 mutex_exit(&srq->srq_lock); 881 882 TAVOR_TNF_EXIT(tavor_post_srq); 883 return (status); 884 } 885 886 887 /* 888 * tavor_qp_send_doorbell() 889 * Context: Can be called from interrupt or base context. 890 */ 891 static void 892 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds, 893 uint32_t qpn, uint32_t fence, uint32_t nopcode) 894 { 895 uint64_t doorbell = 0; 896 897 /* Build the doorbell from the parameters */ 898 doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) << 899 TAVOR_QPSNDDB_NDA_SHIFT) | 900 ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) | 901 ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) | 902 ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds; 903 904 TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "", 905 tnf_ulong, doorbell, doorbell); 906 907 /* Write the doorbell to UAR */ 908 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send, 909 doorbell); 910 } 911 912 913 /* 914 * tavor_qp_recv_doorbell() 915 * Context: Can be called from interrupt or base context. 
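 *
 * As with the send doorbell above, the value written to the UAR is a
 * single 64-bit word OR-ed together from its component fields.  As a
 * rough sketch (see the masks and shifts in tavor_hw.h for the
 * authoritative layout):
 *
 *     send:  NDA, fence, nopcode, QPN, NDS
 *     recv:  NDA, NDS, QPN, credits
 *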
916 */ 917 static void 918 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds, 919 uint32_t qpn, uint32_t credits) 920 { 921 uint64_t doorbell = 0; 922 923 /* Build the doorbell from the parameters */ 924 doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) << 925 TAVOR_QPRCVDB_NDA_SHIFT) | 926 ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) | 927 ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits; 928 929 TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "", 930 tnf_ulong, doorbell, doorbell); 931 932 /* Write the doorbell to UAR */ 933 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv, 934 doorbell); 935 } 936 937 938 /* 939 * tavor_wqe_send_build() 940 * Context: Can be called from interrupt or base context. 941 */ 942 static int 943 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp, 944 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size) 945 { 946 tavor_hw_snd_wqe_ud_t *ud; 947 tavor_hw_snd_wqe_remaddr_t *rc; 948 tavor_hw_snd_wqe_atomic_t *at; 949 tavor_hw_snd_wqe_remaddr_t *uc; 950 tavor_hw_snd_wqe_bind_t *bn; 951 tavor_hw_wqe_sgl_t *ds; 952 ibt_wr_ds_t *sgl; 953 tavor_ahhdl_t ah; 954 uint32_t nds; 955 int i, num_ds, status; 956 957 TAVOR_TNF_ENTER(tavor_wqe_send_build); 958 959 ASSERT(MUTEX_HELD(&qp->qp_lock)); 960 961 /* Initialize the information for the Data Segments */ 962 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc + 963 sizeof (tavor_hw_snd_wqe_nextctrl_t)); 964 nds = wr->wr_nds; 965 sgl = wr->wr_sgl; 966 num_ds = 0; 967 968 /* 969 * Build a Send WQE depends first and foremost on the transport 970 * type of Work Request (i.e. UD, RC, or UC) 971 */ 972 switch (wr->wr_trans) { 973 case IBT_UD_SRV: 974 /* Ensure that work request transport type matches QP type */ 975 if (qp->qp_serv_type != TAVOR_QP_UD) { 976 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail, 977 TAVOR_TNF_ERROR, ""); 978 TAVOR_TNF_EXIT(tavor_wqe_send_build); 979 return (IBT_QP_SRV_TYPE_INVALID); 980 } 981 982 /* 983 * Validate the operation type. For UD requests, only the 984 * "Send" operation is valid 985 */ 986 if (wr->wr_opcode != IBT_WRC_SEND) { 987 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail, 988 TAVOR_TNF_ERROR, ""); 989 TAVOR_TNF_EXIT(tavor_wqe_send_build); 990 return (IBT_QP_OP_TYPE_INVALID); 991 } 992 993 /* 994 * If this is a Special QP (QP0 or QP1), then we need to 995 * build MLX WQEs instead. So jump to tavor_wqe_mlx_build() 996 * and return whatever status it returns 997 */ 998 if (qp->qp_is_special) { 999 status = tavor_wqe_mlx_build(state, qp, wr, desc, size); 1000 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1001 return (status); 1002 } 1003 1004 /* 1005 * Otherwise, if this is a normal UD Send request, then fill 1006 * all the fields in the Tavor UD header for the WQE. Note: 1007 * to do this we'll need to extract some information from the 1008 * Address Handle passed with the work request. 1009 */ 1010 ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc + 1011 sizeof (tavor_hw_snd_wqe_nextctrl_t)); 1012 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah; 1013 if (ah == NULL) { 1014 TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail, 1015 TAVOR_TNF_ERROR, ""); 1016 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1017 return (IBT_AH_HDL_INVALID); 1018 } 1019 1020 /* 1021 * Build the Unreliable Datagram Segment for the WQE, using 1022 * the information from the address handle and the work 1023 * request. 
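 * The address handle lock is held across the segment build so that the
 * UDAV contents cannot change underneath us while they are being used
 * to fill in the segment.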
1024 */ 1025 mutex_enter(&ah->ah_lock); 1026 TAVOR_WQE_BUILD_UD(qp, ud, ah, wr); 1027 mutex_exit(&ah->ah_lock); 1028 1029 /* Update "ds" for filling in Data Segments (below) */ 1030 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud + 1031 sizeof (tavor_hw_snd_wqe_ud_t)); 1032 break; 1033 1034 case IBT_RC_SRV: 1035 /* Ensure that work request transport type matches QP type */ 1036 if (qp->qp_serv_type != TAVOR_QP_RC) { 1037 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail, 1038 TAVOR_TNF_ERROR, ""); 1039 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1040 return (IBT_QP_SRV_TYPE_INVALID); 1041 } 1042 1043 /* 1044 * Validate the operation type. For RC requests, we allow 1045 * "Send", "RDMA Read", "RDMA Write", various "Atomic" 1046 * operations, and memory window "Bind" 1047 */ 1048 if ((wr->wr_opcode != IBT_WRC_SEND) && 1049 (wr->wr_opcode != IBT_WRC_RDMAR) && 1050 (wr->wr_opcode != IBT_WRC_RDMAW) && 1051 (wr->wr_opcode != IBT_WRC_CSWAP) && 1052 (wr->wr_opcode != IBT_WRC_FADD) && 1053 (wr->wr_opcode != IBT_WRC_BIND)) { 1054 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail, 1055 TAVOR_TNF_ERROR, ""); 1056 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1057 return (IBT_QP_OP_TYPE_INVALID); 1058 } 1059 1060 /* 1061 * If this is a Send request, then all we need to do is break 1062 * out and here and begin the Data Segment processing below 1063 */ 1064 if (wr->wr_opcode == IBT_WRC_SEND) { 1065 break; 1066 } 1067 1068 /* 1069 * If this is an RDMA Read or RDMA Write request, then fill 1070 * in the "Remote Address" header fields. 1071 */ 1072 if ((wr->wr_opcode == IBT_WRC_RDMAR) || 1073 (wr->wr_opcode == IBT_WRC_RDMAW)) { 1074 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc + 1075 sizeof (tavor_hw_snd_wqe_nextctrl_t)); 1076 1077 /* 1078 * Build the Remote Address Segment for the WQE, using 1079 * the information from the RC work request. 1080 */ 1081 TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma); 1082 1083 /* Update "ds" for filling in Data Segments (below) */ 1084 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc + 1085 sizeof (tavor_hw_snd_wqe_remaddr_t)); 1086 break; 1087 } 1088 1089 /* 1090 * If this is one of the Atomic type operations (i.e 1091 * Compare-Swap or Fetch-Add), then fill in both the "Remote 1092 * Address" header fields and the "Atomic" header fields. 1093 */ 1094 if ((wr->wr_opcode == IBT_WRC_CSWAP) || 1095 (wr->wr_opcode == IBT_WRC_FADD)) { 1096 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc + 1097 sizeof (tavor_hw_snd_wqe_nextctrl_t)); 1098 at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc + 1099 sizeof (tavor_hw_snd_wqe_remaddr_t)); 1100 1101 /* 1102 * Build the Remote Address and Atomic Segments for 1103 * the WQE, using the information from the RC Atomic 1104 * work request. 1105 */ 1106 TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr); 1107 TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic); 1108 1109 /* Update "ds" for filling in Data Segments (below) */ 1110 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at + 1111 sizeof (tavor_hw_snd_wqe_atomic_t)); 1112 1113 /* 1114 * Update "nds" and "sgl" because Atomic requests have 1115 * only a single Data Segment (and they are encoded 1116 * somewhat differently in the work request. 1117 */ 1118 nds = 1; 1119 sgl = wr->wr_sgl; 1120 break; 1121 } 1122 1123 /* 1124 * If this is memory window Bind operation, then we call the 1125 * tavor_wr_bind_check() routine to validate the request and 1126 * to generate the updated RKey. If this is successful, then 1127 * we fill in the WQE's "Bind" header fields. 
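 *
 * To summarize the RC cases handled above and below, the headers that
 * precede the Data Segments are, roughly:
 *
 *     Send                      none (nextctrl only)
 *     RDMA Read/Write           Remote Address segment
 *     Atomic (CSwap/FetchAdd)   Remote Address + Atomic segments
 *     Bind                      Bind segment (and no SGLs)
 *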
1128 */ 1129 if (wr->wr_opcode == IBT_WRC_BIND) { 1130 status = tavor_wr_bind_check(state, wr); 1131 if (status != DDI_SUCCESS) { 1132 TNF_PROBE_0(tavor_wqe_send_build_bind_fail, 1133 TAVOR_TNF_ERROR, ""); 1134 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1135 return (status); 1136 } 1137 1138 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc + 1139 sizeof (tavor_hw_snd_wqe_nextctrl_t)); 1140 1141 /* 1142 * Build the Bind Memory Window Segments for the WQE, 1143 * using the information from the RC Bind memory 1144 * window work request. 1145 */ 1146 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind); 1147 1148 /* 1149 * Update the "ds" pointer. Even though the "bind" 1150 * operation requires no SGLs, this is necessary to 1151 * facilitate the correct descriptor size calculations 1152 * (below). 1153 */ 1154 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn + 1155 sizeof (tavor_hw_snd_wqe_bind_t)); 1156 nds = 0; 1157 } 1158 break; 1159 1160 case IBT_UC_SRV: 1161 /* Ensure that work request transport type matches QP type */ 1162 if (qp->qp_serv_type != TAVOR_QP_UC) { 1163 TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail, 1164 TAVOR_TNF_ERROR, ""); 1165 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1166 return (IBT_QP_SRV_TYPE_INVALID); 1167 } 1168 1169 /* 1170 * Validate the operation type. For UC requests, we only 1171 * allow "Send", "RDMA Write", and memory window "Bind". 1172 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic" 1173 * operations 1174 */ 1175 if ((wr->wr_opcode != IBT_WRC_SEND) && 1176 (wr->wr_opcode != IBT_WRC_RDMAW) && 1177 (wr->wr_opcode != IBT_WRC_BIND)) { 1178 TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail, 1179 TAVOR_TNF_ERROR, ""); 1180 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1181 return (IBT_QP_OP_TYPE_INVALID); 1182 } 1183 1184 /* 1185 * If this is a Send request, then all we need to do is break 1186 * out and here and begin the Data Segment processing below 1187 */ 1188 if (wr->wr_opcode == IBT_WRC_SEND) { 1189 break; 1190 } 1191 1192 /* 1193 * If this is an RDMA Write request, then fill in the "Remote 1194 * Address" header fields. 1195 */ 1196 if (wr->wr_opcode == IBT_WRC_RDMAW) { 1197 uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc + 1198 sizeof (tavor_hw_snd_wqe_nextctrl_t)); 1199 1200 /* 1201 * Build the Remote Address Segment for the WQE, using 1202 * the information from the UC work request. 1203 */ 1204 TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma); 1205 1206 /* Update "ds" for filling in Data Segments (below) */ 1207 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc + 1208 sizeof (tavor_hw_snd_wqe_remaddr_t)); 1209 break; 1210 } 1211 1212 /* 1213 * If this is memory window Bind operation, then we call the 1214 * tavor_wr_bind_check() routine to validate the request and 1215 * to generate the updated RKey. If this is successful, then 1216 * we fill in the WQE's "Bind" header fields. 1217 */ 1218 if (wr->wr_opcode == IBT_WRC_BIND) { 1219 status = tavor_wr_bind_check(state, wr); 1220 if (status != DDI_SUCCESS) { 1221 TNF_PROBE_0(tavor_wqe_send_build_bind_fail, 1222 TAVOR_TNF_ERROR, ""); 1223 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1224 return (status); 1225 } 1226 1227 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc + 1228 sizeof (tavor_hw_snd_wqe_nextctrl_t)); 1229 1230 /* 1231 * Build the Bind Memory Window Segments for the WQE, 1232 * using the information from the UC Bind memory 1233 * window work request. 1234 */ 1235 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind); 1236 1237 /* 1238 * Update the "ds" pointer. 
Even though the "bind" 1239 * operation requires no SGLs, this is necessary to 1240 * facilitate the correct descriptor size calculations 1241 * (below). 1242 */ 1243 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn + 1244 sizeof (tavor_hw_snd_wqe_bind_t)); 1245 nds = 0; 1246 } 1247 break; 1248 1249 default: 1250 TNF_PROBE_0(tavor_wqe_send_build_inv_tranport_fail, 1251 TAVOR_TNF_ERROR, ""); 1252 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1253 return (IBT_QP_SRV_TYPE_INVALID); 1254 } 1255 1256 /* 1257 * Now fill in the Data Segments (SGL) for the Send WQE based on 1258 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer 1259 * Start by checking for a valid number of SGL entries 1260 */ 1261 if (nds > qp->qp_sq_sgl) { 1262 TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail, 1263 TAVOR_TNF_ERROR, ""); 1264 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1265 return (IBT_QP_SGL_LEN_INVALID); 1266 } 1267 1268 /* 1269 * For each SGL in the Send Work Request, fill in the Send WQE's data 1270 * segments. Note: We skip any SGL with zero size because Tavor 1271 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually 1272 * the encoding for zero means a 2GB transfer. Because of this special 1273 * encoding in the hardware, we mask the requested length with 1274 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as 1275 * zero.) 1276 */ 1277 for (i = 0; i < nds; i++) { 1278 if (sgl[i].ds_len == 0) { 1279 continue; 1280 } 1281 1282 /* 1283 * Fill in the Data Segment(s) for the current WQE, using the 1284 * information contained in the scatter-gather list of the 1285 * work request. 1286 */ 1287 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]); 1288 num_ds++; 1289 } 1290 1291 /* Return the size of descriptor (in 16-byte chunks) */ 1292 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4; 1293 1294 TAVOR_TNF_EXIT(tavor_wqe_send_build); 1295 return (DDI_SUCCESS); 1296 } 1297 1298 1299 /* 1300 * tavor_wqe_send_linknext() 1301 * Context: Can be called from interrupt or base context. 1302 */ 1303 static void 1304 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr, 1305 uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc, 1306 tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp) 1307 { 1308 uint64_t next, ctrl; 1309 uint32_t nopcode, fence; 1310 1311 /* 1312 * Calculate the "next" field of the descriptor. This amounts to 1313 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds" 1314 * fields (see tavor_hw.h for more). Note: If there is no next 1315 * descriptor (i.e. if the current descriptor is the last WQE on 1316 * the chain), then set "next" to zero. 
1317 */ 1318 if (curr_desc != NULL) { 1319 /* 1320 * Determine the value for the Tavor WQE "nopcode" field 1321 * by using the IBTF opcode from the work request 1322 */ 1323 switch (curr_wr->wr_opcode) { 1324 case IBT_WRC_RDMAW: 1325 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) { 1326 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI; 1327 } else { 1328 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW; 1329 } 1330 break; 1331 1332 case IBT_WRC_SEND: 1333 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) { 1334 nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI; 1335 } else { 1336 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND; 1337 } 1338 break; 1339 1340 case IBT_WRC_RDMAR: 1341 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR; 1342 break; 1343 1344 case IBT_WRC_CSWAP: 1345 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS; 1346 break; 1347 1348 case IBT_WRC_FADD: 1349 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA; 1350 break; 1351 1352 case IBT_WRC_BIND: 1353 nopcode = TAVOR_WQE_SEND_NOPCODE_BIND; 1354 break; 1355 } 1356 1357 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc 1358 - qp->qp_desc_off); 1359 next = ((uint64_t)(uintptr_t)curr_desc & 1360 TAVOR_WQE_NDA_MASK) << 32; 1361 next = next | ((uint64_t)nopcode << 32); 1362 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0; 1363 if (fence) { 1364 next = next | TAVOR_WQE_SEND_FENCE_MASK; 1365 } 1366 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK); 1367 1368 /* 1369 * If a send queue doorbell will be rung for the next 1370 * WQE on the chain, then set the current WQE's "dbd" bit. 1371 * Note: We also update the "dbinfo" structure here to pass 1372 * back information about what should (later) be included 1373 * in the send queue doorbell. 1374 */ 1375 if (dbinfo) { 1376 next = next | TAVOR_WQE_DBD_MASK; 1377 dbinfo->db_nopcode = nopcode; 1378 dbinfo->db_fence = fence; 1379 } 1380 } else { 1381 next = 0; 1382 } 1383 1384 /* 1385 * If this WQE is supposed to be linked to the previous descriptor, 1386 * then we need to update not only the previous WQE's "next" fields 1387 * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e", 1388 * "s", "i" and "immediate" fields - see tavor_hw.h for more). Note: 1389 * the "e" bit is always hardcoded to zero. 1390 */ 1391 if (prev_desc != NULL) { 1392 /* 1393 * If a send queue doorbell will be rung for the next WQE on 1394 * the chain, then update the current WQE's "next" field and 1395 * return. 1396 * Note: We don't want to modify the "ctrl" field here because 1397 * that portion of the previous WQE has already been set 1398 * correctly at some previous point in time. 1399 */ 1400 if (dbinfo) { 1401 TAVOR_WQE_LINKFIRST(qp, prev_desc, next); 1402 return; 1403 } 1404 1405 ctrl = 0; 1406 1407 /* Set the "c" (i.e. "signaled") bit appropriately */ 1408 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) { 1409 ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK; 1410 } 1411 1412 /* Set the "s" (i.e. "solicited") bit appropriately */ 1413 if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) { 1414 ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK; 1415 } 1416 1417 /* Set the "i" bit and the immediate data appropriately */ 1418 if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) { 1419 ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK; 1420 ctrl = ctrl | tavor_wr_get_immediate(prev_wr); 1421 } 1422 1423 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next); 1424 } 1425 } 1426 1427 1428 /* 1429 * tavor_wqe_mlx_build() 1430 * Context: Can be called from interrupt or base context. 
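 *
 * For the special QPs (QP0/QP1) the IB packet headers are supplied by
 * the driver rather than generated by the hardware, so this routine
 * builds them inline in the descriptor.  Roughly, the resulting WQE
 * layout is:
 *
 *     nextctrl, inline headers (LRH [GRH] BTH DETH), SGLs, inline ICRC
 *
 * where the GRH is present only if the "grh" bit is set in the UDAV.
 *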
1431 */ 1432 static int 1433 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp, 1434 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size) 1435 { 1436 tavor_hw_udav_t udav; 1437 tavor_ahhdl_t ah; 1438 ib_lrh_hdr_t *lrh; 1439 ib_grh_t *grh; 1440 ib_bth_hdr_t *bth; 1441 ib_deth_hdr_t *deth; 1442 tavor_hw_wqe_sgl_t *ds; 1443 ibt_wr_ds_t *sgl; 1444 uint8_t *mgmtclass, *hpoint, *hcount; 1445 uint64_t data; 1446 uint32_t nds, offset, pktlen; 1447 uint32_t desc_sz, udav_sz; 1448 int i, num_ds; 1449 1450 TAVOR_TNF_ENTER(tavor_wqe_mlx_build); 1451 1452 ASSERT(MUTEX_HELD(&qp->qp_lock)); 1453 1454 /* Initialize the information for the Data Segments */ 1455 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc + 1456 sizeof (tavor_hw_mlx_wqe_nextctrl_t)); 1457 1458 /* 1459 * Pull the address handle from the work request and read in 1460 * the contents of the UDAV. This will be used to answer some 1461 * questions about the request. 1462 */ 1463 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah; 1464 if (ah == NULL) { 1465 TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail, 1466 TAVOR_TNF_ERROR, ""); 1467 TAVOR_TNF_EXIT(tavor_wqe_mlx_build); 1468 return (IBT_AH_HDL_INVALID); 1469 } 1470 mutex_enter(&ah->ah_lock); 1471 udav_sz = sizeof (tavor_hw_udav_t) >> 3; 1472 for (i = 0; i < udav_sz; i++) { 1473 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl, 1474 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i)); 1475 ((uint64_t *)&udav)[i] = data; 1476 } 1477 mutex_exit(&ah->ah_lock); 1478 1479 /* 1480 * If the request is for QP1 and the destination LID is equal to 1481 * the Permissive LID, then return an error. This combination is 1482 * not allowed 1483 */ 1484 if ((udav.rlid == IB_LID_PERMISSIVE) && 1485 (qp->qp_is_special == TAVOR_QP_GSI)) { 1486 TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail, 1487 TAVOR_TNF_ERROR, ""); 1488 TAVOR_TNF_EXIT(tavor_wqe_mlx_build); 1489 return (IBT_AH_HDL_INVALID); 1490 } 1491 1492 /* 1493 * Calculate the size of the packet headers, including the GRH 1494 * (if necessary) 1495 */ 1496 desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) + 1497 sizeof (ib_deth_hdr_t); 1498 if (udav.grh) { 1499 desc_sz += sizeof (ib_grh_t); 1500 } 1501 1502 /* 1503 * Begin to build the first "inline" data segment for the packet 1504 * headers. Note: By specifying "inline" we can build the contents 1505 * of the MAD packet headers directly into the work queue (as part 1506 * descriptor). This has the advantage of both speeding things up 1507 * and of not requiring the driver to allocate/register any additional 1508 * memory for the packet headers. 1509 */ 1510 TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz); 1511 desc_sz += 4; 1512 1513 /* 1514 * Build Local Route Header (LRH) 1515 * We start here by building the LRH into a temporary location. 1516 * When we have finished we copy the LRH data into the descriptor. 1517 * 1518 * Notice that the VL values are hardcoded. This is not a problem 1519 * because VL15 is decided later based on the value in the MLX 1520 * transport "next/ctrl" header (see the "vl15" bit below), and it 1521 * is otherwise (meaning for QP1) chosen from the SL-to-VL table 1522 * values. This rule does not hold for loopback packets however 1523 * (all of which bypass the SL-to-VL tables) and it is the reason 1524 * that non-QP0 MADs are setup with VL hardcoded to zero below. 1525 * 1526 * Notice also that Source LID is hardcoded to the Permissive LID 1527 * (0xFFFF). 
This is also not a problem because if the Destination 1528 * LID is not the Permissive LID, then the "slr" value in the MLX 1529 * transport "next/ctrl" header will be set to zero and the hardware 1530 * will pull the LID from value in the port. 1531 */ 1532 lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4); 1533 pktlen = (desc_sz + 0x100) >> 2; 1534 TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen); 1535 1536 /* 1537 * Build Global Route Header (GRH) 1538 * This is only built if necessary as defined by the "grh" bit in 1539 * the address vector. Note: We also calculate the offset to the 1540 * next header (BTH) based on whether or not the "grh" bit is set. 1541 */ 1542 if (udav.grh) { 1543 /* 1544 * If the request is for QP0, then return an error. The 1545 * combination of global routine (GRH) and QP0 is not allowed. 1546 */ 1547 if (qp->qp_is_special == TAVOR_QP_SMI) { 1548 TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail, 1549 TAVOR_TNF_ERROR, ""); 1550 TAVOR_TNF_EXIT(tavor_wqe_mlx_build); 1551 return (IBT_AH_HDL_INVALID); 1552 } 1553 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t)); 1554 TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen); 1555 1556 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t)); 1557 } else { 1558 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t)); 1559 } 1560 1561 1562 /* 1563 * Build Base Transport Header (BTH) 1564 * Notice that the M, PadCnt, and TVer fields are all set 1565 * to zero implicitly. This is true for all Management Datagrams 1566 * MADs whether GSI are SMI. 1567 */ 1568 TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr); 1569 1570 /* 1571 * Build Datagram Extended Transport Header (DETH) 1572 */ 1573 deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t)); 1574 TAVOR_WQE_BUILD_MLX_DETH(deth, qp); 1575 1576 /* Ensure that the Data Segment is aligned on a 16-byte boundary */ 1577 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t)); 1578 ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF); 1579 nds = wr->wr_nds; 1580 sgl = wr->wr_sgl; 1581 num_ds = 0; 1582 1583 /* 1584 * Now fill in the Data Segments (SGL) for the MLX WQE based on the 1585 * values set up above (i.e. "sgl", "nds", and the "ds" pointer 1586 * Start by checking for a valid number of SGL entries 1587 */ 1588 if (nds > qp->qp_sq_sgl) { 1589 TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail, 1590 TAVOR_TNF_ERROR, ""); 1591 TAVOR_TNF_EXIT(tavor_wqe_mlx_build); 1592 return (IBT_QP_SGL_LEN_INVALID); 1593 } 1594 1595 /* 1596 * For each SGL in the Send Work Request, fill in the MLX WQE's data 1597 * segments. Note: We skip any SGL with zero size because Tavor 1598 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually 1599 * the encoding for zero means a 2GB transfer. Because of this special 1600 * encoding in the hardware, we mask the requested length with 1601 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as 1602 * zero.) 1603 */ 1604 mgmtclass = hpoint = hcount = NULL; 1605 offset = 0; 1606 for (i = 0; i < nds; i++) { 1607 if (sgl[i].ds_len == 0) { 1608 continue; 1609 } 1610 1611 /* 1612 * Fill in the Data Segment(s) for the MLX send WQE, using 1613 * the information contained in the scatter-gather list of 1614 * the work request. 1615 */ 1616 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]); 1617 1618 /* 1619 * Search through the contents of all MADs posted to QP0 to 1620 * initialize pointers to the places where Directed Route "hop 1621 * pointer", "hop count", and "mgmtclass" would be. 
Tavor 1622 * needs these updated (i.e. incremented or decremented, as 1623 * necessary) by software. 1624 */ 1625 if (qp->qp_is_special == TAVOR_QP_SMI) { 1626 1627 TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass, 1628 offset, sgl[i].ds_va, sgl[i].ds_len); 1629 1630 TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint, 1631 offset, sgl[i].ds_va, sgl[i].ds_len); 1632 1633 TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount, 1634 offset, sgl[i].ds_va, sgl[i].ds_len); 1635 1636 offset += sgl[i].ds_len; 1637 } 1638 num_ds++; 1639 } 1640 1641 /* 1642 * Tavor's Directed Route MADs need to have the "hop pointer" 1643 * incremented/decremented (as necessary) depending on whether it is 1644 * currently less than or greater than the "hop count" (i.e. whether 1645 * the MAD is a request or a response.) 1646 */ 1647 if (qp->qp_is_special == TAVOR_QP_SMI) { 1648 TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass, 1649 *hpoint, *hcount); 1650 } 1651 1652 /* 1653 * Now fill in the ICRC Data Segment. This data segment is inlined 1654 * just like the packets headers above, but it is only four bytes and 1655 * set to zero (to indicate that we wish the hardware to generate ICRC. 1656 */ 1657 TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0); 1658 num_ds++; 1659 1660 /* Return the size of descriptor (in 16-byte chunks) */ 1661 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4; 1662 1663 TAVOR_TNF_EXIT(tavor_wqe_mlx_build); 1664 return (DDI_SUCCESS); 1665 } 1666 1667 1668 /* 1669 * tavor_wqe_mlx_linknext() 1670 * Context: Can be called from interrupt or base context. 1671 */ 1672 static void 1673 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc, 1674 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, 1675 tavor_qphdl_t qp) 1676 { 1677 tavor_hw_udav_t udav; 1678 tavor_ahhdl_t ah; 1679 uint64_t next, ctrl, data; 1680 uint_t nopcode; 1681 uint_t udav_sz; 1682 int i; 1683 1684 /* 1685 * Calculate the "next" field of the descriptor. This amounts to 1686 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see 1687 * tavor_hw.h for more). Note: If there is no next descriptor (i.e. 1688 * if the current descriptor is the last WQE on the chain), then set 1689 * "next" to zero. 1690 */ 1691 if (curr_desc != NULL) { 1692 /* 1693 * The only valid Tavor WQE "nopcode" for MLX transport 1694 * requests is the "Send" code. 1695 */ 1696 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND; 1697 curr_desc = (uint64_t *)(uintptr_t)((uint64_t) 1698 (uintptr_t)curr_desc - qp->qp_desc_off); 1699 next = (uint64_t)((uintptr_t)curr_desc & 1700 TAVOR_WQE_NDA_MASK) << 32; 1701 next = next | ((uint64_t)nopcode << 32); 1702 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK); 1703 1704 /* 1705 * If a send queue doorbell will be rung for the next 1706 * WQE on the chain, then set the current WQE's "dbd" bit. 1707 * Note: We also update the "dbinfo" structure here to pass 1708 * back information about what should (later) be included 1709 * in the send queue doorbell. 1710 */ 1711 if (dbinfo) { 1712 next = next | TAVOR_WQE_DBD_MASK; 1713 dbinfo->db_nopcode = nopcode; 1714 dbinfo->db_fence = 0; 1715 } 1716 } else { 1717 next = 0; 1718 } 1719 1720 /* 1721 * If this WQE is supposed to be linked to the previous descriptor, 1722 * then we need to update not only the previous WQE's "next" fields 1723 * but we must also update this WQE's "ctrl" fields (i.e. 
the "vl15", 1724 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields - 1725 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are 1726 * always hardcoded to zero. 1727 */ 1728 if (prev_desc != NULL) { 1729 /* 1730 * If a send queue doorbell will be rung for the next WQE on 1731 * the chain, then update the current WQE's "next" field and 1732 * return. 1733 * Note: We don't want to modify the "ctrl" field here because 1734 * that portion of the previous WQE has already been set 1735 * correctly at some previous point in time. 1736 */ 1737 if (dbinfo) { 1738 TAVOR_WQE_LINKFIRST(qp, prev_desc, next); 1739 return; 1740 } 1741 1742 /* 1743 * Pull the address handle from the work request and read in 1744 * the contents of the UDAV. This will be used to answer some 1745 * questions about the request. 1746 */ 1747 ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah; 1748 mutex_enter(&ah->ah_lock); 1749 udav_sz = sizeof (tavor_hw_udav_t) >> 3; 1750 for (i = 0; i < udav_sz; i++) { 1751 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl, 1752 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i)); 1753 ((uint64_t *)&udav)[i] = data; 1754 } 1755 mutex_exit(&ah->ah_lock); 1756 1757 ctrl = 0; 1758 1759 /* Only QP0 uses VL15, otherwise use VL in the packet */ 1760 if (qp->qp_is_special == TAVOR_QP_SMI) { 1761 ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK; 1762 } 1763 1764 /* 1765 * The SLR (Source LID Replace) bit determines whether the 1766 * source LID for an outgoing MLX packet should come from the 1767 * PortInfo (SLR = 0) or should be left as it is in the 1768 * descriptor (SLR = 1). The latter is necessary for packets 1769 * to be sent with the Permissive LID. 1770 */ 1771 if (udav.rlid == IB_LID_PERMISSIVE) { 1772 ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK; 1773 } 1774 1775 /* Fill in the max static rate from the address handle */ 1776 ctrl = ctrl | ((uint64_t)udav.max_stat_rate << 1777 TAVOR_WQE_MLXHDR_SRATE_SHIFT); 1778 1779 /* All VL15 (i.e. SMI) traffic is required to use SL 0 */ 1780 if (qp->qp_is_special != TAVOR_QP_SMI) { 1781 ctrl = ctrl | ((uint64_t)udav.sl << 1782 TAVOR_WQE_MLXHDR_SL_SHIFT); 1783 } 1784 1785 /* Set the "c" (i.e. "signaled") bit appropriately */ 1786 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) { 1787 ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK; 1788 } 1789 1790 /* Fill in the destination LID from the address handle */ 1791 ctrl = ctrl | ((uint64_t)udav.rlid << 1792 TAVOR_WQE_MLXHDR_RLID_SHIFT); 1793 1794 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next); 1795 } 1796 } 1797 1798 1799 /* 1800 * tavor_wqe_recv_build() 1801 * Context: Can be called from interrupt or base context. 
1802 */ 1803 /* ARGSUSED */ 1804 static int 1805 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp, 1806 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size) 1807 { 1808 tavor_hw_wqe_sgl_t *ds; 1809 int i, num_ds; 1810 1811 TAVOR_TNF_ENTER(tavor_wqe_recv_build); 1812 1813 ASSERT(MUTEX_HELD(&qp->qp_lock)); 1814 1815 /* Check that work request transport type is valid */ 1816 if ((qp->qp_serv_type != TAVOR_QP_UD) && 1817 (qp->qp_serv_type != TAVOR_QP_RC) && 1818 (qp->qp_serv_type != TAVOR_QP_UC)) { 1819 TNF_PROBE_0(tavor_build_recv_wqe_inv_servtype_fail, 1820 TAVOR_TNF_ERROR, ""); 1821 TAVOR_TNF_EXIT(tavor_build_recv_wqe); 1822 return (IBT_QP_SRV_TYPE_INVALID); 1823 } 1824 1825 /* Fill in the Data Segments (SGL) for the Recv WQE */ 1826 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc + 1827 sizeof (tavor_hw_rcv_wqe_nextctrl_t)); 1828 num_ds = 0; 1829 1830 /* Check for valid number of SGL entries */ 1831 if (wr->wr_nds > qp->qp_rq_sgl) { 1832 TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail, 1833 TAVOR_TNF_ERROR, ""); 1834 TAVOR_TNF_EXIT(tavor_wqe_recv_build); 1835 return (IBT_QP_SGL_LEN_INVALID); 1836 } 1837 1838 /* 1839 * For each SGL in the Recv Work Request, fill in the Recv WQE's data 1840 * segments. Note: We skip any SGL with zero size because Tavor 1841 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually 1842 * the encoding for zero means a 2GB transfer. Because of this special 1843 * encoding in the hardware, we mask the requested length with 1844 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as 1845 * zero.) 1846 */ 1847 for (i = 0; i < wr->wr_nds; i++) { 1848 if (wr->wr_sgl[i].ds_len == 0) { 1849 continue; 1850 } 1851 1852 /* 1853 * Fill in the Data Segment(s) for the receive WQE, using the 1854 * information contained in the scatter-gather list of the 1855 * work request. 1856 */ 1857 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]); 1858 num_ds++; 1859 } 1860 1861 /* Return the size of descriptor (in 16-byte chunks) */ 1862 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4; 1863 1864 TAVOR_TNF_EXIT(tavor_wqe_recv_build); 1865 return (DDI_SUCCESS); 1866 } 1867 1868 1869 /* 1870 * tavor_wqe_recv_linknext() 1871 * Context: Can be called from interrupt or base context. 1872 */ 1873 static void 1874 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz, 1875 uint64_t *prev_desc, tavor_qphdl_t qp) 1876 { 1877 uint64_t next; 1878 1879 /* 1880 * Calculate the "next" field of the descriptor. This amounts to 1881 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see 1882 * tavor_hw.h for more). Note: If there is no next descriptor (i.e. 1883 * if the current descriptor is the last WQE on the chain), then set 1884 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor 1885 * hardware requires the "dbd" bit to be set to one for all Recv WQEs. 1886 * In either case, we must add a single bit in the "reserved" field 1887 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the 1888 * workaround for a known Tavor errata that can cause Recv WQEs with 1889 * zero in the NDA field to behave improperly. 
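 *
 * In condensed form (an illustrative restatement of the code below, not
 * a separate implementation), the two cases reduce to:
 *
 *	with a next descriptor:
 *		next = ((uint64_t)(nda & TAVOR_WQE_NDA_MASK) << 32) |
 *		    (nds & TAVOR_WQE_NDS_MASK) |
 *		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
 *	with no next descriptor:
 *		next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
 *
 * where "nda" is the queue-relative descriptor address (the descriptor
 * pointer less "qp_desc_off") and "nds" is the descriptor size in
 * 16-byte chunks.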
1890 */ 1891 if (curr_desc != NULL) { 1892 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc - 1893 qp->qp_desc_off); 1894 next = (uint64_t)((uintptr_t)curr_desc & 1895 TAVOR_WQE_NDA_MASK) << 32; 1896 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) | 1897 TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK; 1898 } else { 1899 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK; 1900 } 1901 1902 /* 1903 * If this WQE is supposed to be linked to the previous descriptor, 1904 * then we need to update not only the previous WQE's "next" fields 1905 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and 1906 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e" 1907 * bits are always hardcoded to zero. 1908 */ 1909 if (prev_desc != NULL) { 1910 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next); 1911 } 1912 } 1913 1914 1915 /* 1916 * tavor_wqe_srq_build() 1917 * Context: Can be called from interrupt or base context. 1918 */ 1919 /* ARGSUSED */ 1920 static int 1921 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq, 1922 ibt_recv_wr_t *wr, uint64_t *desc) 1923 { 1924 tavor_hw_wqe_sgl_t *ds; 1925 ibt_wr_ds_t end_sgl; 1926 int i, num_ds; 1927 1928 TAVOR_TNF_ENTER(tavor_wqe_recv_build); 1929 1930 ASSERT(MUTEX_HELD(&srq->srq_lock)); 1931 1932 /* Fill in the Data Segments (SGL) for the Recv WQE */ 1933 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc + 1934 sizeof (tavor_hw_rcv_wqe_nextctrl_t)); 1935 num_ds = 0; 1936 1937 /* Check for valid number of SGL entries */ 1938 if (wr->wr_nds > srq->srq_wq_sgl) { 1939 TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail, 1940 TAVOR_TNF_ERROR, ""); 1941 TAVOR_TNF_EXIT(tavor_wqe_srq_build); 1942 return (IBT_QP_SGL_LEN_INVALID); 1943 } 1944 1945 /* 1946 * For each SGL in the Recv Work Request, fill in the Recv WQE's data 1947 * segments. Note: We skip any SGL with zero size because Tavor 1948 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually 1949 * the encoding for zero means a 2GB transfer. Because of this special 1950 * encoding in the hardware, we mask the requested length with 1951 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as 1952 * zero.) 1953 */ 1954 for (i = 0; i < wr->wr_nds; i++) { 1955 if (wr->wr_sgl[i].ds_len == 0) { 1956 continue; 1957 } 1958 1959 /* 1960 * Fill in the Data Segment(s) for the receive WQE, using the 1961 * information contained in the scatter-gather list of the 1962 * work request. 1963 */ 1964 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]); 1965 num_ds++; 1966 } 1967 1968 /* 1969 * For SRQ, if the number of data segments is less than the maximum 1970 * specified at alloc, then we have to fill in a special "key" entry in 1971 * the sgl entry after the last valid one in this post request. We do 1972 * that here. 1973 */ 1974 if (num_ds < srq->srq_wq_sgl) { 1975 end_sgl.ds_va = 0; 1976 end_sgl.ds_len = 0; 1977 end_sgl.ds_key = 0x1; 1978 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl); 1979 } 1980 1981 TAVOR_TNF_EXIT(tavor_wqe_srq_build); 1982 return (DDI_SUCCESS); 1983 } 1984 1985 1986 /* 1987 * tavor_wqe_srq_linknext() 1988 * Context: Can be called from interrupt or base context. 1989 */ 1990 static void 1991 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc, 1992 tavor_srqhdl_t srq) 1993 { 1994 uint64_t next; 1995 1996 /* 1997 * Calculate the "next" field of the descriptor. This amounts to 1998 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see 1999 * tavor_hw.h for more). Note: If there is no next descriptor (i.e. 
2000 * if the current descriptor is the last WQE on the chain), then set 2001 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor 2002 * hardware requires the "dbd" bit to be set to one for all Recv WQEs. 2003 * In either case, we must add a single bit in the "reserved" field 2004 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the 2005 * workaround for a known Tavor errata that can cause Recv WQEs with 2006 * zero in the NDA field to behave improperly. 2007 */ 2008 if (curr_desc != NULL) { 2009 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc - 2010 srq->srq_desc_off); 2011 next = (uint64_t)((uintptr_t)curr_desc & 2012 TAVOR_WQE_NDA_MASK) << 32; 2013 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK; 2014 } else { 2015 next = TAVOR_RCV_WQE_NDA0_WA_MASK; 2016 } 2017 2018 /* 2019 * If this WQE is supposed to be linked to the previous descriptor, 2020 * then we need to update not only the previous WQE's "next" fields 2021 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and 2022 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e" 2023 * bits are always hardcoded to zero. 2024 */ 2025 if (prev_desc != NULL) { 2026 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next); 2027 } 2028 } 2029 2030 2031 /* 2032 * tavor_wr_get_immediate() 2033 * Context: Can be called from interrupt or base context. 2034 */ 2035 static uint32_t 2036 tavor_wr_get_immediate(ibt_send_wr_t *wr) 2037 { 2038 /* 2039 * This routine extracts the "immediate data" from the appropriate 2040 * location in the IBTF work request. Because of the way the 2041 * work request structure is defined, the location for this data 2042 * depends on the actual work request operation type. 2043 */ 2044 2045 /* For RDMA Write, test if RC or UC */ 2046 if (wr->wr_opcode == IBT_WRC_RDMAW) { 2047 if (wr->wr_trans == IBT_RC_SRV) { 2048 return (wr->wr.rc.rcwr.rdma.rdma_immed); 2049 } else { /* IBT_UC_SRV */ 2050 return (wr->wr.uc.ucwr.rdma.rdma_immed); 2051 } 2052 } 2053 2054 /* For Send, test if RC, UD, or UC */ 2055 if (wr->wr_opcode == IBT_WRC_SEND) { 2056 if (wr->wr_trans == IBT_RC_SRV) { 2057 return (wr->wr.rc.rcwr.send_immed); 2058 } else if (wr->wr_trans == IBT_UD_SRV) { 2059 return (wr->wr.ud.udwr_immed); 2060 } else { /* IBT_UC_SRV */ 2061 return (wr->wr.uc.ucwr.send_immed); 2062 } 2063 } 2064 2065 /* 2066 * If any other type of request, then immediate is undefined 2067 */ 2068 return (0); 2069 } 2070 2071 2072 /* 2073 * tavor_wqe_sync() 2074 * Context: Can be called from interrupt or base context. 
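 *
 * (A quick reference for tavor_wr_get_immediate() above; the location of
 * the immediate data within the IBTF work request depends on the opcode
 * and transport type:
 *
 *	IBT_WRC_RDMAW, IBT_RC_SRV:	wr->wr.rc.rcwr.rdma.rdma_immed
 *	IBT_WRC_RDMAW, IBT_UC_SRV:	wr->wr.uc.ucwr.rdma.rdma_immed
 *	IBT_WRC_SEND,  IBT_RC_SRV:	wr->wr.rc.rcwr.send_immed
 *	IBT_WRC_SEND,  IBT_UD_SRV:	wr->wr.ud.udwr_immed
 *	IBT_WRC_SEND,  IBT_UC_SRV:	wr->wr.uc.ucwr.send_immed
 *
 * Any other combination has no defined immediate data and yields zero.)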
2075 */ 2076 static void 2077 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to, 2078 uint_t sync_type, uint_t flag) 2079 { 2080 tavor_qphdl_t qp; 2081 tavor_srqhdl_t srq; 2082 uint_t is_sync_req; 2083 uint64_t *wqe_from, *wqe_to, *wqe_base, *wqe_top; 2084 ddi_dma_handle_t dmahdl; 2085 off_t offset; 2086 size_t length; 2087 uint32_t qsize; 2088 int status; 2089 2090 TAVOR_TNF_ENTER(tavor_wqe_sync); 2091 2092 if (sync_type == TAVOR_WR_SRQ) { 2093 srq = (tavor_srqhdl_t)hdl; 2094 is_sync_req = srq->srq_sync; 2095 /* Get the DMA handle from SRQ context */ 2096 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl; 2097 } else { 2098 qp = (tavor_qphdl_t)hdl; 2099 is_sync_req = qp->qp_sync; 2100 /* Get the DMA handle from QP context */ 2101 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl; 2102 } 2103 2104 /* Determine if the work queues need to be synced or not */ 2105 if (is_sync_req == 0) { 2106 TAVOR_TNF_EXIT(tavor_wqe_sync); 2107 return; 2108 } 2109 2110 /* 2111 * Depending on the type of the work queue, we grab information 2112 * about the address ranges we need to DMA sync. 2113 */ 2114 if (sync_type == TAVOR_WR_SEND) { 2115 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from); 2116 wqe_to = TAVOR_QP_SQ_ENTRY(qp, sync_to); 2117 qsize = qp->qp_sq_bufsz; 2118 2119 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0); 2120 wqe_top = TAVOR_QP_SQ_ENTRY(qp, qsize); 2121 } else if (sync_type == TAVOR_WR_RECV) { 2122 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from); 2123 wqe_to = TAVOR_QP_RQ_ENTRY(qp, sync_to); 2124 qsize = qp->qp_rq_bufsz; 2125 2126 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0); 2127 wqe_top = TAVOR_QP_RQ_ENTRY(qp, qsize); 2128 } else { 2129 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from); 2130 wqe_to = TAVOR_SRQ_WQ_ENTRY(srq, sync_to); 2131 qsize = srq->srq_wq_bufsz; 2132 2133 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0); 2134 wqe_top = TAVOR_SRQ_WQ_ENTRY(srq, qsize); 2135 } 2136 2137 /* 2138 * There are two possible cases for the beginning and end of the WQE 2139 * chain we are trying to sync. Either this is the simple case, where 2140 * the end of the chain is below the beginning of the chain, or it is 2141 * the "wrap-around" case, where the end of the chain has wrapped over 2142 * the end of the queue. In the former case, we simply need to 2143 * calculate the span from beginning to end and sync it. In the latter 2144 * case, however, we need to calculate the span from the top of the 2145 * work queue to the end of the chain and sync that, and then we need 2146 * to find the other portion (from beginning of chain to end of queue) 2147 * and sync that as well. Note: if the "top to end" span is actually 2148 * zero length, then we don't do a DMA sync because a zero length DMA 2149 * sync unnecessarily syncs the entire work queue. 
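 *
 * Schematically, with the error handling and TNF probes stripped out,
 * the logic below is just (illustrative sketch only, pointer arithmetic
 * shown without the uintptr_t casts used in the real code):
 *
 *	if (wqe_to > wqe_from) {
 *		(void) ddi_dma_sync(dmahdl, wqe_from - wqe_base,
 *		    wqe_to - wqe_from, flag);
 *	} else {
 *		if (wqe_to != wqe_base)
 *			(void) ddi_dma_sync(dmahdl, 0,
 *			    wqe_to - wqe_base, flag);
 *		(void) ddi_dma_sync(dmahdl, wqe_from - wqe_base,
 *		    wqe_top - wqe_from, flag);
 *	}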
2150 */ 2151 if (wqe_to > wqe_from) { 2152 /* "From Beginning to End" */ 2153 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base); 2154 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from); 2155 2156 status = ddi_dma_sync(dmahdl, offset, length, flag); 2157 if (status != DDI_SUCCESS) { 2158 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, ""); 2159 TAVOR_TNF_EXIT(tavor_wqe_sync); 2160 return; 2161 } 2162 } else { 2163 /* "From Top to End" */ 2164 offset = (off_t)0; 2165 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base); 2166 if (length) { 2167 status = ddi_dma_sync(dmahdl, offset, length, flag); 2168 if (status != DDI_SUCCESS) { 2169 TNF_PROBE_0(tavor_wqe_sync_fail, 2170 TAVOR_TNF_ERROR, ""); 2171 TAVOR_TNF_EXIT(tavor_wqe_sync); 2172 return; 2173 } 2174 } 2175 2176 /* "From Beginning to Bottom" */ 2177 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base); 2178 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from); 2179 status = ddi_dma_sync(dmahdl, offset, length, flag); 2180 if (status != DDI_SUCCESS) { 2181 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, ""); 2182 TAVOR_TNF_EXIT(tavor_wqe_sync); 2183 return; 2184 } 2185 } 2186 2187 TAVOR_TNF_EXIT(tavor_wqe_sync); 2188 } 2189 2190 2191 /* 2192 * tavor_wr_bind_check() 2193 * Context: Can be called from interrupt or base context. 2194 */ 2195 static int 2196 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr) 2197 { 2198 ibt_bind_flags_t bind_flags; 2199 uint64_t vaddr, len; 2200 uint64_t reg_start_addr, reg_end_addr; 2201 tavor_mwhdl_t mw; 2202 tavor_mrhdl_t mr; 2203 tavor_rsrc_t *mpt; 2204 uint32_t new_rkey; 2205 2206 TAVOR_TNF_ENTER(tavor_wr_bind_check); 2207 2208 /* Check for a valid Memory Window handle in the WR */ 2209 mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl; 2210 if (mw == NULL) { 2211 TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail, 2212 TAVOR_TNF_ERROR, ""); 2213 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2214 return (IBT_MW_HDL_INVALID); 2215 } 2216 2217 /* Check for a valid Memory Region handle in the WR */ 2218 mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl; 2219 if (mr == NULL) { 2220 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail, 2221 TAVOR_TNF_ERROR, ""); 2222 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2223 return (IBT_MR_HDL_INVALID); 2224 } 2225 2226 mutex_enter(&mr->mr_lock); 2227 mutex_enter(&mw->mr_lock); 2228 2229 /* 2230 * Check here to see if the memory region has already been partially 2231 * deregistered as a result of a tavor_umap_umemlock_cb() callback. 2232 * If so, this is an error, return failure. 2233 */ 2234 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) { 2235 mutex_exit(&mr->mr_lock); 2236 mutex_exit(&mw->mr_lock); 2237 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail, 2238 TAVOR_TNF_ERROR, ""); 2239 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2240 return (IBT_MR_HDL_INVALID); 2241 } 2242 2243 /* Check for a valid Memory Window RKey (i.e. a matching RKey) */ 2244 if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) { 2245 mutex_exit(&mr->mr_lock); 2246 mutex_exit(&mw->mr_lock); 2247 TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail, 2248 TAVOR_TNF_ERROR, ""); 2249 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2250 return (IBT_MR_RKEY_INVALID); 2251 } 2252 2253 /* Check for a valid Memory Region LKey (i.e. 
a matching LKey) */ 2254 if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) { 2255 mutex_exit(&mr->mr_lock); 2256 mutex_exit(&mw->mr_lock); 2257 TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail, 2258 TAVOR_TNF_ERROR, ""); 2259 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2260 return (IBT_MR_LKEY_INVALID); 2261 } 2262 2263 /* 2264 * Now check for valid "vaddr" and "len". Note: We don't check the 2265 * "vaddr" range when "len == 0" (i.e. on unbind operations) 2266 */ 2267 len = wr->wr.rc.rcwr.bind->bind_len; 2268 if (len != 0) { 2269 vaddr = wr->wr.rc.rcwr.bind->bind_va; 2270 reg_start_addr = mr->mr_bindinfo.bi_addr; 2271 reg_end_addr = mr->mr_bindinfo.bi_addr + 2272 (mr->mr_bindinfo.bi_len - 1); 2273 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) { 2274 mutex_exit(&mr->mr_lock); 2275 mutex_exit(&mw->mr_lock); 2276 TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail, 2277 TAVOR_TNF_ERROR, ""); 2278 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2279 return (IBT_MR_VA_INVALID); 2280 } 2281 vaddr = (vaddr + len) - 1; 2282 if (vaddr > reg_end_addr) { 2283 mutex_exit(&mr->mr_lock); 2284 mutex_exit(&mw->mr_lock); 2285 TNF_PROBE_0(tavor_wr_bind_check_invlen_fail, 2286 TAVOR_TNF_ERROR, ""); 2287 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2288 return (IBT_MR_LEN_INVALID); 2289 } 2290 } 2291 2292 /* 2293 * Validate the bind access flags. Remote Write and Atomic access for 2294 * the Memory Window require that Local Write access be set in the 2295 * corresponding Memory Region. 2296 */ 2297 bind_flags = wr->wr.rc.rcwr.bind->bind_flags; 2298 if (((bind_flags & IBT_WR_BIND_WRITE) || 2299 (bind_flags & IBT_WR_BIND_ATOMIC)) && 2300 !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) { 2301 mutex_exit(&mr->mr_lock); 2302 mutex_exit(&mw->mr_lock); 2303 TNF_PROBE_0(tavor_wr_bind_check_invflags_fail, 2304 TAVOR_TNF_ERROR, ""); 2305 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2306 return (IBT_MR_ACCESS_REQ_INVALID); 2307 } 2308 2309 /* Calculate the new RKey for the Memory Window */ 2310 mpt = mw->mr_mptrsrcp; 2311 tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey); 2312 2313 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey; 2314 mw->mr_rkey = new_rkey; 2315 2316 mutex_exit(&mr->mr_lock); 2317 mutex_exit(&mw->mr_lock); 2318 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2319 return (DDI_SUCCESS); 2320 } 2321 2322 2323 /* 2324 * tavor_wrid_from_reset_handling() 2325 * Context: Can be called from interrupt or base context. 2326 */ 2327 int 2328 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp) 2329 { 2330 tavor_workq_hdr_t *swq, *rwq; 2331 tavor_wrid_list_hdr_t *s_wridlist, *r_wridlist; 2332 uint_t create_new_swq = 0, create_new_rwq = 0; 2333 uint_t create_wql = 0; 2334 uint_t qp_srq_en; 2335 2336 TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling); 2337 2338 /* 2339 * For each of this QP's Work Queues, make sure we have a (properly 2340 * initialized) Work Request ID list attached to the relevant 2341 * completion queue. Grab the CQ lock(s) before manipulating the 2342 * lists. 2343 */ 2344 tavor_wrid_wqhdr_lock_both(qp); 2345 swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum, 2346 TAVOR_WR_SEND); 2347 if (swq == NULL) { 2348 /* Couldn't find matching work queue header, create it */ 2349 create_new_swq = create_wql = 1; 2350 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl, 2351 qp->qp_qpnum, TAVOR_WR_SEND, create_wql); 2352 if (swq == NULL) { 2353 /* 2354 * If we couldn't find/allocate space for the workq 2355 * header, then drop the lock(s) and return failure. 
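 *
 * (As an aside, the address range validation performed by
 * tavor_wr_bind_check() above reduces to the following illustrative
 * sketch, using the "bind_va"/"bind_len" fields from the work request
 * and the "bi_addr"/"bi_len" fields from the memory region; "reg_end"
 * and "ok" are placeholders:
 *
 *	reg_end = bi_addr + (bi_len - 1);
 *	ok = (bind_len == 0) ||
 *	    ((bind_va >= bi_addr) && (bind_va <= reg_end) &&
 *	    ((bind_va + bind_len - 1) <= reg_end));
 *
 * that is, a zero-length bind (an unbind) skips the range check, and a
 * non-zero-length bind must begin and end inside the registered region.)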
2356 */ 2357 tavor_wrid_wqhdr_unlock_both(qp); 2358 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail, 2359 TAVOR_TNF_ERROR, ""); 2360 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling); 2361 return (ibc_get_ci_failure(0)); 2362 } 2363 } 2364 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq)) 2365 qp->qp_sq_wqhdr = swq; 2366 swq->wq_size = qp->qp_sq_bufsz; 2367 swq->wq_head = 0; 2368 swq->wq_tail = 0; 2369 swq->wq_full = 0; 2370 2371 /* 2372 * Allocate space for the tavor_wrid_entry_t container 2373 */ 2374 s_wridlist = tavor_wrid_get_list(swq->wq_size); 2375 if (s_wridlist == NULL) { 2376 /* 2377 * If we couldn't allocate space for tracking the WRID 2378 * entries, then cleanup the workq header from above (if 2379 * necessary, i.e. if we created the workq header). Then 2380 * drop the lock(s) and return failure. 2381 */ 2382 if (create_new_swq) { 2383 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq); 2384 } 2385 2386 tavor_wrid_wqhdr_unlock_both(qp); 2387 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail, 2388 TAVOR_TNF_ERROR, ""); 2389 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling); 2390 return (ibc_get_ci_failure(0)); 2391 } 2392 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist)) 2393 s_wridlist->wl_wqhdr = swq; 2394 2395 /* Chain the new WRID list container to the workq hdr list */ 2396 mutex_enter(&swq->wq_wrid_wql->wql_lock); 2397 tavor_wrid_wqhdr_add(swq, s_wridlist); 2398 mutex_exit(&swq->wq_wrid_wql->wql_lock); 2399 2400 qp_srq_en = qp->qp_srq_en; 2401 2402 #ifdef __lock_lint 2403 mutex_enter(&qp->qp_srqhdl->srq_lock); 2404 #else 2405 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2406 mutex_enter(&qp->qp_srqhdl->srq_lock); 2407 } 2408 #endif 2409 /* 2410 * Now we repeat all the above operations for the receive work queue, 2411 * or shared receive work queue. 2412 * 2413 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case. 2414 */ 2415 rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum, 2416 TAVOR_WR_RECV); 2417 if (rwq == NULL) { 2418 create_new_rwq = create_wql = 1; 2419 2420 /* 2421 * If this QP is associated with an SRQ, and this isn't the 2422 * first QP on the SRQ, then the 'srq_wrid_wql' will already be 2423 * created. Since the WQL is created at 'wqhdr_create' time we 2424 * pass in the flag 'create_wql' here to be 0 if we have 2425 * already created it. And later on below we then next setup 2426 * the WQL and rwq information based off the existing SRQ info. 2427 */ 2428 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED && 2429 qp->qp_srqhdl->srq_wrid_wql != NULL) { 2430 create_wql = 0; 2431 } 2432 2433 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl, 2434 qp->qp_qpnum, TAVOR_WR_RECV, create_wql); 2435 if (rwq == NULL) { 2436 /* 2437 * If we couldn't find/allocate space for the workq 2438 * header, then free all the send queue resources we 2439 * just allocated and setup (above), drop the lock(s) 2440 * and return failure. 
2441 */ 2442 mutex_enter(&swq->wq_wrid_wql->wql_lock); 2443 tavor_wrid_wqhdr_remove(swq, s_wridlist); 2444 mutex_exit(&swq->wq_wrid_wql->wql_lock); 2445 if (create_new_swq) { 2446 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, 2447 swq); 2448 } 2449 2450 #ifdef __lock_lint 2451 mutex_exit(&qp->qp_srqhdl->srq_lock); 2452 #else 2453 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2454 mutex_exit(&qp->qp_srqhdl->srq_lock); 2455 } 2456 #endif 2457 2458 tavor_wrid_wqhdr_unlock_both(qp); 2459 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail, 2460 TAVOR_TNF_ERROR, ""); 2461 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling); 2462 return (ibc_get_ci_failure(0)); 2463 } 2464 } 2465 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq)) 2466 2467 /* 2468 * Setup receive workq hdr 2469 * 2470 * If the QP is on an SRQ, we setup the SRQ specific fields, 2471 * keeping a copy of the rwq pointer, setting the rwq bufsize 2472 * appropriately, and initializing our part of the WQLock. 2473 * 2474 * In the normal QP case, the QP recv queue bufsize is used. 2475 */ 2476 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2477 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz; 2478 if (qp->qp_srqhdl->srq_wrid_wql == NULL) { 2479 qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql; 2480 } else { 2481 rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql; 2482 } 2483 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql); 2484 2485 } else { 2486 rwq->wq_size = qp->qp_rq_bufsz; 2487 } 2488 2489 qp->qp_rq_wqhdr = rwq; 2490 rwq->wq_head = 0; 2491 rwq->wq_tail = 0; 2492 rwq->wq_full = 0; 2493 2494 /* 2495 * Allocate space for the tavor_wrid_entry_t container. 2496 * 2497 * If QP is on an SRQ, and the srq_wridlist is NULL then we must 2498 * allocate the wridlist normally. However, if the srq_wridlist is != 2499 * NULL, then we know this SRQ has already been initialized, thus the 2500 * wridlist has already been initialized. So we re-use the 2501 * srq_wridlist as the r_wridlist for this QP in this case. 2502 */ 2503 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED && 2504 qp->qp_srqhdl->srq_wridlist != NULL) { 2505 /* Use existing srq_wridlist pointer */ 2506 r_wridlist = qp->qp_srqhdl->srq_wridlist; 2507 ASSERT(r_wridlist != NULL); 2508 } else { 2509 /* Allocate memory for the r_wridlist */ 2510 r_wridlist = tavor_wrid_get_list(rwq->wq_size); 2511 } 2512 2513 /* 2514 * If the memory allocation failed for r_wridlist (or the SRQ pointer 2515 * is mistakenly NULL), we cleanup our previous swq allocation from 2516 * above 2517 */ 2518 if (r_wridlist == NULL) { 2519 /* 2520 * If we couldn't allocate space for tracking the WRID 2521 * entries, then cleanup all the stuff from above. Then 2522 * drop the lock(s) and return failure. 
2523 */ 2524 mutex_enter(&swq->wq_wrid_wql->wql_lock); 2525 tavor_wrid_wqhdr_remove(swq, s_wridlist); 2526 mutex_exit(&swq->wq_wrid_wql->wql_lock); 2527 if (create_new_swq) { 2528 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq); 2529 } 2530 if (create_new_rwq) { 2531 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq); 2532 } 2533 2534 #ifdef __lock_lint 2535 mutex_exit(&qp->qp_srqhdl->srq_lock); 2536 #else 2537 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2538 mutex_exit(&qp->qp_srqhdl->srq_lock); 2539 } 2540 #endif 2541 2542 tavor_wrid_wqhdr_unlock_both(qp); 2543 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail, 2544 TAVOR_TNF_ERROR, ""); 2545 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling); 2546 return (ibc_get_ci_failure(0)); 2547 } 2548 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist)) 2549 2550 /* 2551 * Initialize the wridlist 2552 * 2553 * In the normal QP case, there is no special initialization needed. 2554 * We simply setup the wridlist backpointer to be the receive wqhdr 2555 * (rwq). 2556 * 2557 * But in the SRQ case, there is no backpointer to the wqhdr possible. 2558 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ 2559 * and thus potentially shared across multiple QPs with the SRQ. We 2560 * also setup the srq_wridlist pointer to be the r_wridlist, and 2561 * initialize the freelist to an invalid index. This srq_wridlist 2562 * pointer is used above on future moves from_reset to let us know that 2563 * the srq_wridlist has been initialized already. 2564 * 2565 * And finally, if we are in a non-UMAP case, we setup the srq wrid 2566 * free list. 2567 */ 2568 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED && 2569 qp->qp_srqhdl->srq_wridlist == NULL) { 2570 r_wridlist->wl_srq_en = 1; 2571 r_wridlist->wl_free_list_indx = -1; 2572 qp->qp_srqhdl->srq_wridlist = r_wridlist; 2573 2574 /* Initialize srq wrid free list */ 2575 if (qp->qp_srqhdl->srq_is_umap == 0) { 2576 mutex_enter(&rwq->wq_wrid_wql->wql_lock); 2577 tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0); 2578 mutex_exit(&rwq->wq_wrid_wql->wql_lock); 2579 } 2580 } else { 2581 r_wridlist->wl_wqhdr = rwq; 2582 } 2583 2584 /* Chain the WRID list "container" to the workq hdr list */ 2585 mutex_enter(&rwq->wq_wrid_wql->wql_lock); 2586 tavor_wrid_wqhdr_add(rwq, r_wridlist); 2587 mutex_exit(&rwq->wq_wrid_wql->wql_lock); 2588 2589 #ifdef __lock_lint 2590 mutex_exit(&qp->qp_srqhdl->srq_lock); 2591 #else 2592 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2593 mutex_exit(&qp->qp_srqhdl->srq_lock); 2594 } 2595 #endif 2596 2597 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist)) 2598 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq)) 2599 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist)) 2600 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq)) 2601 2602 tavor_wrid_wqhdr_unlock_both(qp); 2603 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling); 2604 return (DDI_SUCCESS); 2605 } 2606 2607 2608 /* 2609 * tavor_wrid_to_reset_handling() 2610 * Context: Can be called from interrupt or base context. 2611 */ 2612 void 2613 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp) 2614 { 2615 uint_t free_wqhdr = 0; 2616 2617 TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling); 2618 2619 /* 2620 * For each of this QP's Work Queues, move the WRID "container" to 2621 * the "reapable" list. Although there may still be unpolled 2622 * entries in these containers, it is not a big deal. We will not 2623 * reap the list until either the Poll CQ command detects an empty 2624 * condition or the CQ itself is freed. Grab the CQ lock(s) before 2625 * manipulating the lists. 
2626 */ 2627 mutex_enter(&qp->qp_rq_cqhdl->cq_lock); 2628 tavor_wrid_wqhdr_lock_both(qp); 2629 tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr); 2630 2631 /* 2632 * Add the receive work queue header on to the reaplist. But if we are 2633 * on an SRQ, then don't add anything to the reaplist. Instead we flush 2634 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the 2635 * WQHDR (if needed). We must hold the WQL for these operations, yet 2636 * the call to tavor_cq_wqhdr_remove grabs the WQL internally. So we 2637 * drop the WQL before that call. Then release the CQ WQHDR locks and the 2638 * CQ lock and return. 2639 */ 2640 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2641 2642 /* 2643 * Pull off all (if any) entries for this QP from CQ. This 2644 * only includes entries that have not yet been polled 2645 */ 2646 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock); 2647 tavor_cq_srq_entries_flush(state, qp); 2648 2649 /* Remove wridlist from WQHDR */ 2650 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr, 2651 qp->qp_rq_wqhdr->wq_wrid_post); 2652 2653 /* If wridlist chain is now empty, remove the wqhdr as well */ 2654 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) { 2655 free_wqhdr = 1; 2656 } else { 2657 free_wqhdr = 0; 2658 } 2659 2660 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock); 2661 2662 /* Free the WQHDR */ 2663 if (free_wqhdr) { 2664 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr); 2665 } 2666 } else { 2667 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr); 2668 } 2669 tavor_wrid_wqhdr_unlock_both(qp); 2670 mutex_exit(&qp->qp_rq_cqhdl->cq_lock); 2671 2672 TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling); 2673 } 2674 2675 2676 /* 2677 * tavor_wrid_add_entry() 2678 * Context: Can be called from interrupt or base context. 2679 */ 2680 void 2681 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz, 2682 uint_t signaled_dbd) 2683 { 2684 tavor_wrid_entry_t *wre_tmp; 2685 uint32_t head, tail, size; 2686 2687 TAVOR_TNF_ENTER(tavor_wrid_add_entry); 2688 2689 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock)); 2690 2691 /* 2692 * Find the entry in the container pointed to by the "tail" index. 2693 * Add all of the relevant information to that entry, including WRID, 2694 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled 2695 * and/or doorbelled. 2696 */ 2697 head = wq->wq_wrid_post->wl_head; 2698 tail = wq->wq_wrid_post->wl_tail; 2699 size = wq->wq_wrid_post->wl_size; 2700 wre_tmp = &wq->wq_wrid_post->wl_wre[tail]; 2701 wre_tmp->wr_wrid = wrid; 2702 wre_tmp->wr_wqeaddrsz = wqeaddrsz; 2703 wre_tmp->wr_signaled_dbd = signaled_dbd; 2704 2705 /* 2706 * Update the "wrid_old_tail" pointer to point to the entry we just 2707 * inserted into the queue. By tracking this pointer (the pointer to 2708 * the most recently inserted entry) it will be possible later in the 2709 * PostSend() and PostRecv() code paths to find the entry that needs 2710 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or 2711 * tavor_post_send()). 2712 */ 2713 wq->wq_wrid_post->wl_wre_old_tail = wre_tmp; 2714 2715 /* Update the tail index */ 2716 tail = ((tail + 1) & (size - 1)); 2717 wq->wq_wrid_post->wl_tail = tail; 2718 2719 /* 2720 * If the "tail" index has just wrapped over into the "head" index, 2721 * then we have filled the container. We use the "full" flag to 2722 * indicate this condition and to distinguish it from the "empty" 2723 * condition (where head and tail are also equal). 
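 *
 * Schematically (a condensed sketch of the code just below, with "size"
 * always a power of two):
 *
 *	wl_wre[tail] = new entry;
 *	tail = (tail + 1) & (size - 1);
 *	if (head == tail)
 *		wl_full = 1;
 *
 * so "head == tail" by itself is ambiguous, and it is the explicit
 * "wl_full" flag that tells the full and empty cases apart.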
2724 */ 2725 if (head == tail) { 2726 wq->wq_wrid_post->wl_full = 1; 2727 } 2728 TAVOR_TNF_EXIT(tavor_wrid_add_entry); 2729 } 2730 2731 /* 2732 * tavor_wrid_add_entry_srq() 2733 * Context: Can be called from interrupt or base context 2734 */ 2735 void 2736 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd) 2737 { 2738 tavor_wrid_entry_t *wre; 2739 uint64_t *wl_wqe; 2740 uint32_t wqe_index; 2741 2742 TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq); 2743 2744 /* 2745 * Find the next available WQE from the SRQ free_list. Then update the 2746 * free_list to point to the next entry 2747 */ 2748 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx); 2749 2750 wqe_index = srq->srq_wridlist->wl_free_list_indx; 2751 2752 /* ASSERT on impossible wqe_index values */ 2753 ASSERT(wqe_index < srq->srq_wq_bufsz); 2754 2755 /* 2756 * Setup the WRE. 2757 * 2758 * Given the 'wqe_index' value, we store the WRID at this WRE offset. 2759 * And we set the WRE to be signaled_dbd so that on poll CQ we can find 2760 * this information and associate the WRID to the WQE found on the CQE. 2761 */ 2762 wre = &srq->srq_wridlist->wl_wre[wqe_index]; 2763 wre->wr_wrid = wrid; 2764 wre->wr_signaled_dbd = signaled_dbd; 2765 2766 /* Update the free list index */ 2767 srq->srq_wridlist->wl_free_list_indx = ddi_get32( 2768 srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe); 2769 2770 TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq); 2771 } 2772 2773 2774 /* 2775 * tavor_wrid_get_entry() 2776 * Context: Can be called from interrupt or base context. 2777 */ 2778 uint64_t 2779 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, 2780 tavor_wrid_entry_t *wre) 2781 { 2782 tavor_workq_hdr_t *wq; 2783 tavor_wrid_entry_t *wre_tmp; 2784 uint64_t wrid; 2785 uint_t send_or_recv, qpnum, error, opcode; 2786 2787 TAVOR_TNF_ENTER(tavor_wrid_get_entry); 2788 2789 /* Lock the list of work queues associated with this CQ */ 2790 mutex_enter(&cq->cq_wrid_wqhdr_lock); 2791 2792 /* 2793 * Determine whether this CQE is a send or receive completion (and 2794 * whether it was a "successful" completion or not) 2795 */ 2796 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe); 2797 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) || 2798 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) { 2799 error = 1; 2800 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? 2801 TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV; 2802 } else { 2803 error = 0; 2804 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe); 2805 } 2806 2807 /* Find the work queue for this QP number (send or receive side) */ 2808 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe); 2809 wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv); 2810 ASSERT(wq != NULL); 2811 2812 /* 2813 * Regardless of whether the completion is the result of a "success" 2814 * or a "failure", we lock the list of "containers" and attempt to 2815 * search for the first matching completion (i.e. the first WR 2816 * with a matching WQE addr and size). Once we find it, we pull out 2817 * the "wrid" field and return it (see below). Note: One possible 2818 * future enhancement would be to enable this routine to skip over 2819 * any "unsignaled" completions to go directly to the next "signaled" 2820 * entry on success. XXX 2821 */ 2822 mutex_enter(&wq->wq_wrid_wql->wql_lock); 2823 wre_tmp = tavor_wrid_find_match(wq, cq, cqe); 2824 2825 /* 2826 * If this is a "successful" completion, then we assert that this 2827 * completion must be a "signaled" completion. 
2828 */ 2829 ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED)); 2830 2831 /* 2832 * If the completion is a "failed" completion, then we save away the 2833 * contents of the entry (into the "wre" field passed in) for use 2834 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz() 2835 * function to grab "wqeaddrsz" from the next entry in the container. 2836 * This is required for error processing (where updating these fields 2837 * properly is necessary to correct handling of the "error" CQE) 2838 */ 2839 if (error && (wre != NULL)) { 2840 *wre = *wre_tmp; 2841 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq); 2842 } 2843 2844 /* Pull out the WRID and return it */ 2845 wrid = wre_tmp->wr_wrid; 2846 2847 mutex_exit(&wq->wq_wrid_wql->wql_lock); 2848 mutex_exit(&cq->cq_wrid_wqhdr_lock); 2849 2850 TAVOR_TNF_EXIT(tavor_wrid_get_entry); 2851 return (wrid); 2852 } 2853 2854 2855 /* 2856 * tavor_wrid_find_match() 2857 * Context: Can be called from interrupt or base context. 2858 */ 2859 static tavor_wrid_entry_t * 2860 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq, 2861 tavor_hw_cqe_t *cqe) 2862 { 2863 tavor_wrid_entry_t *curr = NULL; 2864 tavor_wrid_list_hdr_t *container; 2865 uint32_t wqeaddr_size; 2866 uint32_t head, tail, size; 2867 int found = 0, last_container; 2868 2869 TAVOR_TNF_ENTER(tavor_wrid_find_match); 2870 2871 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock)); 2872 2873 /* Pull the "wqeaddrsz" information from the CQE */ 2874 wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe); 2875 2876 /* 2877 * Walk the "containers" list(s), find first WR with a matching WQE 2878 * addr. If the current "container" is not the last one on the list, 2879 * i.e. not the current one to which we are posting new WRID entries, 2880 * then we do not attempt to update the "q_head", "q_tail", and 2881 * "q_full" indicators on the main work queue header. We do, however, 2882 * update the "head" and "full" indicators on the individual containers 2883 * as we go. This is imperative because we need to be able to 2884 * determine when the current container has been emptied (so that we 2885 * can move on to the next container). 2886 */ 2887 container = wq->wq_wrid_poll; 2888 while (container != NULL) { 2889 /* Is this the last/only "container" on the list */ 2890 last_container = (container != wq->wq_wrid_post) ? 0 : 1; 2891 2892 /* 2893 * First check if we are on an SRQ. If so, we grab the entry 2894 * and break out. Since SRQ wridlist's are never added to 2895 * reaplist, they can only be the last container. 2896 */ 2897 if (container->wl_srq_en) { 2898 ASSERT(last_container == 1); 2899 curr = tavor_wrid_find_match_srq(container, cq, cqe); 2900 break; 2901 } 2902 2903 /* 2904 * Grab the current "head", "tail" and "size" fields before 2905 * walking the list in the current container. Note: the "size" 2906 * field here must always be a power-of-2. The "full" 2907 * parameter is checked (and updated) here to distinguish the 2908 * "queue full" condition from "queue empty". 2909 */ 2910 head = container->wl_head; 2911 tail = container->wl_tail; 2912 size = container->wl_size; 2913 while ((head != tail) || (container->wl_full)) { 2914 container->wl_full = 0; 2915 curr = &container->wl_wre[head]; 2916 head = ((head + 1) & (size - 1)); 2917 2918 /* 2919 * If the current entry's "wqeaddrsz" matches the one 2920 * we're searching for, then this must correspond to 2921 * the work request that caused the completion. Set 2922 * the "found" flag and bail out. 
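 *
 * (For reference, the "wr_wqeaddrsz" being compared against was
 * recorded at post time by tavor_wrid_add_entry(), essentially:
 *
 *	wre->wr_wrid      = wrid;
 *	wre->wr_wqeaddrsz = wqeaddrsz;
 *
 * so an entry matches exactly when the hardware reports that same
 * WQE address/size encoding back in the CQE.)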
2923 */ 2924 if (curr->wr_wqeaddrsz == wqeaddr_size) { 2925 found = 1; 2926 break; 2927 } 2928 } 2929 2930 /* 2931 * If the current container is empty (having reached here the 2932 * "head == tail" condition can only mean that the container 2933 * is empty), then NULL out the "wrid_old_tail" field (see 2934 * tavor_post_send() and tavor_post_recv() for more details) 2935 * and (potentially) remove the current container from future 2936 * searches. 2937 */ 2938 if (head == tail) { 2939 2940 container->wl_wre_old_tail = NULL; 2941 /* 2942 * If this wasn't the last "container" on the chain, 2943 * i.e. the one to which new WRID entries will be 2944 * added, then remove it from the list. 2945 * Note: we don't "lose" the memory pointed to by this 2946 * because we should have already put this container 2947 * on the "reapable" list (from where it will later be 2948 * pulled). 2949 */ 2950 if (!last_container) { 2951 wq->wq_wrid_poll = container->wl_next; 2952 } 2953 } 2954 2955 /* Update the head index for the container */ 2956 container->wl_head = head; 2957 2958 /* 2959 * If the entry was found in this container, then continue to 2960 * bail out. Else reset the "curr" pointer and move on to the 2961 * next container (if there is one). Note: the only real 2962 * reason for setting "curr = NULL" here is so that the ASSERT 2963 * below can catch the case where no matching entry was found 2964 * on any of the lists. 2965 */ 2966 if (found) { 2967 break; 2968 } else { 2969 curr = NULL; 2970 container = container->wl_next; 2971 } 2972 } 2973 2974 /* 2975 * Update work queue header's "head" and "full" conditions to match 2976 * the last entry on the container list. (Note: Only if we're pulling 2977 * entries from the last work queue portion of the list, i.e. not from 2978 * the previous portions that may be the "reapable" list.) 2979 */ 2980 if (last_container) { 2981 wq->wq_head = wq->wq_wrid_post->wl_head; 2982 wq->wq_full = wq->wq_wrid_post->wl_full; 2983 } 2984 2985 /* Ensure that we've actually found what we were searching for */ 2986 ASSERT(curr != NULL); 2987 2988 TAVOR_TNF_EXIT(tavor_wrid_find_match); 2989 return (curr); 2990 } 2991 2992 2993 /* 2994 * tavor_wrid_find_match_srq() 2995 * Context: Can be called from interrupt or base context. 2996 */ 2997 tavor_wrid_entry_t * 2998 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq, 2999 tavor_hw_cqe_t *cqe) 3000 { 3001 tavor_wrid_entry_t *wre; 3002 uint64_t *wl_wqe; 3003 uint32_t wqe_index; 3004 uint64_t wqe_addr; 3005 uint32_t cqe_wqe_addr; 3006 3007 /* Grab the WQE addr out of the CQE */ 3008 cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0; 3009 3010 /* 3011 * Use the WQE addr as the lower 32-bit, we add back on the 3012 * 'wl_srq_desc_off' because we have a zero-based queue. Then the 3013 * upper 32-bit of the 'wl_srq_wq_buf' OR'd on gives us the WQE addr in 3014 * the SRQ Work Queue itself. We use this address as the index to find 3015 * out which Work Queue Entry this CQE corresponds with. 3016 * 3017 * We also use this address below to add the WQE back on to the free 3018 * list. 3019 */ 3020 wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) | 3021 (cqe_wqe_addr + wl->wl_srq_desc_off); 3022 3023 /* 3024 * Given the 'wqe_addr' just calculated and the srq buf address, we 3025 * find the 'wqe_index'. The 'wre' returned below contains the WRID 3026 * that we are looking for. This indexes into the wre_list for this 3027 * specific WQE. 
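 *
 * (Illustrative sketch of that index calculation, assuming, as the
 * usage here suggests, that TAVOR_SRQ_WQE_INDEX() is the usual
 * offset-and-shift computation:
 *
 *	wqe_index = (wqe_addr - (uintptr_t)wl->wl_srq_wq_buf) >>
 *	    wl->wl_srq_log_wqesz;
 *
 * i.e. the byte offset of the WQE within the SRQ work queue buffer
 * divided by the WQE stride.)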
3028 */ 3029 wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr, 3030 wl->wl_srq_log_wqesz); 3031 3032 /* ASSERT on impossible wqe_index values */ 3033 ASSERT(wqe_index < wl->wl_srq_wq_bufsz); 3034 3035 /* Get the pointer to this WQE */ 3036 wl_wqe = (uint64_t *)(uintptr_t)wqe_addr; 3037 3038 /* Put this WQE index back on the free list */ 3039 ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx); 3040 wl->wl_free_list_indx = wqe_index; 3041 3042 /* Using the index, return the Work Request ID Entry (wre) */ 3043 wre = &wl->wl_wre[wqe_index]; 3044 3045 return (wre); 3046 } 3047 3048 3049 /* 3050 * tavor_wrid_cq_reap() 3051 * Context: Can be called from interrupt or base context. 3052 */ 3053 void 3054 tavor_wrid_cq_reap(tavor_cqhdl_t cq) 3055 { 3056 tavor_workq_hdr_t *consume_wqhdr; 3057 tavor_wrid_list_hdr_t *container, *to_free; 3058 3059 ASSERT(MUTEX_HELD(&cq->cq_lock)); 3060 3061 TAVOR_TNF_ENTER(tavor_wrid_cq_reap); 3062 3063 /* Lock the list of work queues associated with this CQ */ 3064 mutex_enter(&cq->cq_wrid_wqhdr_lock); 3065 3066 /* Walk the "reapable" list and free up containers */ 3067 container = cq->cq_wrid_reap_head; 3068 while (container != NULL) { 3069 to_free = container; 3070 container = container->wl_reap_next; 3071 /* 3072 * If reaping the WRID list containers pulls the last 3073 * container from the given work queue header, then we free 3074 * the work queue header as well. 3075 */ 3076 consume_wqhdr = tavor_wrid_list_reap(to_free); 3077 if (consume_wqhdr != NULL) { 3078 tavor_cq_wqhdr_remove(cq, consume_wqhdr); 3079 } 3080 } 3081 3082 /* Once finished reaping, we reset the CQ's reap list */ 3083 cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL; 3084 3085 mutex_exit(&cq->cq_wrid_wqhdr_lock); 3086 TAVOR_TNF_EXIT(tavor_wrid_cq_reap); 3087 } 3088 3089 3090 /* 3091 * tavor_wrid_cq_force_reap() 3092 * Context: Can be called from interrupt or base context. 3093 */ 3094 void 3095 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq) 3096 { 3097 tavor_workq_hdr_t *curr; 3098 tavor_wrid_list_hdr_t *container, *to_free; 3099 avl_tree_t *treep; 3100 void *cookie = NULL; 3101 3102 ASSERT(MUTEX_HELD(&cq->cq_lock)); 3103 3104 TAVOR_TNF_ENTER(tavor_wrid_cq_reap); 3105 3106 /* 3107 * The first step is to walk the "reapable" list and free up those 3108 * containers. This is necessary because the containers on the 3109 * reapable list are not otherwise connected to the work queue headers 3110 * anymore. 3111 */ 3112 tavor_wrid_cq_reap(cq); 3113 3114 /* Now lock the list of work queues associated with this CQ */ 3115 mutex_enter(&cq->cq_wrid_wqhdr_lock); 3116 3117 /* 3118 * Walk the list of work queue headers and free up all the WRID list 3119 * containers chained to it. Note: We don't need to grab the locks 3120 * for each of the individual WRID lists here because the only way 3121 * things can be added or removed from the list at this point would be 3122 * through posting a work request to a QP. But if we've come this far, 3123 * then we can be assured that there are no longer any QPs associated 3124 * with the CQ that we are trying to free. 
3125 */ 3126 #ifdef __lock_lint 3127 tavor_wrid_wqhdr_compare(NULL, NULL); 3128 #endif 3129 treep = &cq->cq_wrid_wqhdr_avl_tree; 3130 while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) { 3131 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr)) 3132 container = curr->wq_wrid_poll; 3133 while (container != NULL) { 3134 to_free = container; 3135 container = container->wl_next; 3136 /* 3137 * If reaping the WRID list containers pulls the last 3138 * container from the given work queue header, then 3139 * we free the work queue header as well. Note: we 3140 * ignore the return value because we know that the 3141 * work queue header should always be freed once the 3142 * list of containers has come to an end. 3143 */ 3144 (void) tavor_wrid_list_reap(to_free); 3145 if (container == NULL) { 3146 tavor_cq_wqhdr_remove(cq, curr); 3147 } 3148 } 3149 } 3150 avl_destroy(treep); 3151 3152 mutex_exit(&cq->cq_wrid_wqhdr_lock); 3153 TAVOR_TNF_EXIT(tavor_wrid_cq_reap); 3154 } 3155 3156 3157 /* 3158 * tavor_wrid_get_list() 3159 * Context: Can be called from interrupt or base context. 3160 */ 3161 tavor_wrid_list_hdr_t * 3162 tavor_wrid_get_list(uint32_t qsize) 3163 { 3164 tavor_wrid_list_hdr_t *wridlist; 3165 uint32_t size; 3166 3167 /* 3168 * The WRID list "container" consists of the tavor_wrid_list_hdr_t, 3169 * which holds the pointers necessary for maintaining the "reapable" 3170 * list, chaining together multiple "containers" old and new, and 3171 * tracking the head, tail, size, etc. for each container. 3172 * 3173 * The "container" also holds all the tavor_wrid_entry_t's, which is 3174 * allocated separately, one for each entry on the corresponding work 3175 * queue. 3176 */ 3177 size = sizeof (tavor_wrid_list_hdr_t); 3178 3179 /* 3180 * Note that this allocation has to be a NOSLEEP operation here 3181 * because we are holding the "wqhdr_list_lock" and, therefore, 3182 * could get raised to the interrupt level. 
3183 */ 3184 wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP); 3185 if (wridlist == NULL) { 3186 return (NULL); 3187 } 3188 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist)) 3189 3190 /* Complete the "container" initialization */ 3191 wridlist->wl_size = qsize; 3192 wridlist->wl_full = 0; 3193 wridlist->wl_head = 0; 3194 wridlist->wl_tail = 0; 3195 wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize * 3196 sizeof (tavor_wrid_entry_t), KM_NOSLEEP); 3197 if (wridlist->wl_wre == NULL) { 3198 kmem_free(wridlist, size); 3199 return (NULL); 3200 } 3201 wridlist->wl_wre_old_tail = NULL; 3202 wridlist->wl_reap_next = NULL; 3203 wridlist->wl_next = NULL; 3204 wridlist->wl_prev = NULL; 3205 wridlist->wl_srq_en = 0; 3206 3207 return (wridlist); 3208 } 3209 3210 /* 3211 * tavor_wrid_list_srq_init() 3212 * Context: Can be called from interrupt or base context 3213 */ 3214 void 3215 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq, 3216 uint_t wq_start) 3217 { 3218 uint64_t *wl_wqe; 3219 int wqe_index; 3220 3221 ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock)); 3222 3223 /* Setup pointers for use later when we are polling the CQ */ 3224 wridlist->wl_srq_wq_buf = srq->srq_wq_buf; 3225 wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz; 3226 wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz; 3227 wridlist->wl_srq_desc_off = srq->srq_desc_off; 3228 wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl; 3229 3230 /* Given wq_start to start initializing buf at, verify sanity */ 3231 ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz); 3232 3233 /* 3234 * Initialize wridlist free list 3235 * 3236 * For each WQ up to the size of our queue, we store an index in the WQ 3237 * memory itself, representing the next available free entry. The 3238 * 'wl_free_list_indx' always holds the index of the next available 3239 * free entry in the WQ. If 'wl_free_list_indx' is -1, then we are 3240 * completely full. This gives us the advantage of being able to have 3241 * entries complete or be polled off the WQ out-of-order. 3242 * 3243 * For now, we write the free_list entries inside the WQ itself. It 3244 * may be useful in the future to store this information in a separate 3245 * structure for debugging purposes. 3246 */ 3247 for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) { 3248 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index); 3249 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe, 3250 wridlist->wl_free_list_indx); 3251 wridlist->wl_free_list_indx = wqe_index; 3252 } 3253 } 3254 3255 3256 /* 3257 * tavor_wrid_reaplist_add() 3258 * Context: Can be called from interrupt or base context. 
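 *
 * (For reference, the SRQ free list that tavor_wrid_list_srq_init()
 * above sets up is a simple index stack kept in the currently-unused
 * WQE memory itself; condensed from the surrounding code:
 *
 *	push, done at init time and again when a WQE completes:
 *		ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe,
 *		    wl->wl_free_list_indx);
 *		wl->wl_free_list_indx = wqe_index;
 *
 *	pop, done in tavor_wrid_add_entry_srq() when a WR is posted:
 *		wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wl->wl_free_list_indx);
 *		wqe_index = wl->wl_free_list_indx;
 *		wl->wl_free_list_indx = ddi_get32(wl->wl_acchdl,
 *		    (uint32_t *)wl_wqe);
 *
 * with an index of -1 meaning there are no free entries left.)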
3259 */ 3260 static void 3261 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq) 3262 { 3263 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3264 3265 TAVOR_TNF_ENTER(tavor_wrid_reaplist_add); 3266 3267 mutex_enter(&wq->wq_wrid_wql->wql_lock); 3268 3269 /* 3270 * Add the "post" container (the last one on the current chain) to 3271 * the CQ's "reapable" list 3272 */ 3273 if ((cq->cq_wrid_reap_head == NULL) && 3274 (cq->cq_wrid_reap_tail == NULL)) { 3275 cq->cq_wrid_reap_head = wq->wq_wrid_post; 3276 cq->cq_wrid_reap_tail = wq->wq_wrid_post; 3277 } else { 3278 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post; 3279 cq->cq_wrid_reap_tail = wq->wq_wrid_post; 3280 } 3281 3282 mutex_exit(&wq->wq_wrid_wql->wql_lock); 3283 } 3284 3285 3286 int 3287 tavor_wrid_wqhdr_compare(const void *p1, const void *p2) 3288 { 3289 tavor_workq_compare_t *cmpp; 3290 tavor_workq_hdr_t *curr; 3291 3292 cmpp = (tavor_workq_compare_t *)p1; 3293 curr = (tavor_workq_hdr_t *)p2; 3294 3295 if (cmpp->cmp_qpn < curr->wq_qpn) 3296 return (-1); 3297 else if (cmpp->cmp_qpn > curr->wq_qpn) 3298 return (+1); 3299 else if (cmpp->cmp_type < curr->wq_type) 3300 return (-1); 3301 else if (cmpp->cmp_type > curr->wq_type) 3302 return (+1); 3303 else 3304 return (0); 3305 } 3306 3307 3308 /* 3309 * tavor_wrid_wqhdr_find() 3310 * Context: Can be called from interrupt or base context. 3311 */ 3312 static tavor_workq_hdr_t * 3313 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type) 3314 { 3315 tavor_workq_hdr_t *curr; 3316 tavor_workq_compare_t cmp; 3317 3318 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find); 3319 3320 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3321 3322 /* 3323 * Walk the CQ's work queue list, trying to find a send or recv queue 3324 * with the same QP number. We do this even if we are going to later 3325 * create a new entry because it helps us easily find the end of the 3326 * list. 3327 */ 3328 cmp.cmp_qpn = qpn; 3329 cmp.cmp_type = wq_type; 3330 #ifdef __lock_lint 3331 tavor_wrid_wqhdr_compare(NULL, NULL); 3332 #endif 3333 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL); 3334 3335 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find); 3336 return (curr); 3337 } 3338 3339 3340 /* 3341 * tavor_wrid_wqhdr_create() 3342 * Context: Can be called from interrupt or base context. 3343 */ 3344 static tavor_workq_hdr_t * 3345 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn, 3346 uint_t wq_type, uint_t create_wql) 3347 { 3348 tavor_workq_hdr_t *wqhdr_tmp; 3349 3350 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create); 3351 3352 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3353 3354 /* 3355 * Allocate space for a work queue header structure and initialize it. 3356 * Each work queue header structure includes a "wq_wrid_wql" 3357 * which needs to be initialized. Note that this allocation has to be 3358 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock" 3359 * and, therefore, could get raised to the interrupt level. 
3360 */ 3361 wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc( 3362 sizeof (tavor_workq_hdr_t), KM_NOSLEEP); 3363 if (wqhdr_tmp == NULL) { 3364 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create); 3365 return (NULL); 3366 } 3367 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp)) 3368 wqhdr_tmp->wq_qpn = qpn; 3369 wqhdr_tmp->wq_type = wq_type; 3370 3371 if (create_wql) { 3372 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state); 3373 if (wqhdr_tmp->wq_wrid_wql == NULL) { 3374 kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t)); 3375 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create); 3376 return (NULL); 3377 } 3378 } 3379 3380 wqhdr_tmp->wq_wrid_poll = NULL; 3381 wqhdr_tmp->wq_wrid_post = NULL; 3382 3383 /* Chain the newly allocated work queue header to the CQ's list */ 3384 tavor_cq_wqhdr_add(cq, wqhdr_tmp); 3385 3386 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create); 3387 return (wqhdr_tmp); 3388 } 3389 3390 3391 /* 3392 * tavor_wrid_wql_create() 3393 * Context: Can be called from interrupt or base context. 3394 */ 3395 tavor_wq_lock_t * 3396 tavor_wrid_wql_create(tavor_state_t *state) 3397 { 3398 tavor_wq_lock_t *wql; 3399 3400 TAVOR_TNF_ENTER(tavor_wrid_wql_create); 3401 3402 /* 3403 * Allocate the WQL and initialize it. 3404 */ 3405 wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP); 3406 if (wql == NULL) { 3407 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create); 3408 return (NULL); 3409 } 3410 3411 mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER, 3412 DDI_INTR_PRI(state->ts_intrmsi_pri)); 3413 3414 /* Add refcount to WQL */ 3415 tavor_wql_refcnt_inc(wql); 3416 3417 TAVOR_TNF_EXIT(tavor_wrid_wql_create); 3418 return (wql); 3419 } 3420 3421 3422 /* 3423 * tavor_wrid_get_wqeaddrsz() 3424 * Context: Can be called from interrupt or base context. 3425 */ 3426 static uint32_t 3427 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq) 3428 { 3429 tavor_wrid_entry_t *wre; 3430 uint32_t wqeaddrsz; 3431 uint32_t head; 3432 3433 /* 3434 * If the container is empty, then there is no next entry. So just 3435 * return zero. Note: the "head == tail" condition here can only 3436 * mean that the container is empty because we have previously pulled 3437 * something from the container. 3438 * 3439 * If the container is not empty, then find the next entry and return 3440 * the contents of its "wqeaddrsz" field. 3441 */ 3442 if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) { 3443 wqeaddrsz = 0; 3444 } else { 3445 /* 3446 * We don't need to calculate the "next" head pointer here 3447 * because "head" should already point to the next entry on 3448 * the list (since we just pulled something off - in 3449 * tavor_wrid_find_match() - and moved the head index forward.) 3450 */ 3451 head = wq->wq_wrid_poll->wl_head; 3452 wre = &wq->wq_wrid_poll->wl_wre[head]; 3453 wqeaddrsz = wre->wr_wqeaddrsz; 3454 } 3455 return (wqeaddrsz); 3456 } 3457 3458 3459 /* 3460 * tavor_wrid_wqhdr_add() 3461 * Context: Can be called from interrupt or base context. 
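 *
 * (A note on the WQL lifetime, condensed from the routines above and
 * below:
 *
 *	wql = tavor_wrid_wql_create(state);	refcnt == 1
 *	tavor_wql_refcnt_inc(wql);		refcnt == 2 (now shared,
 *						e.g. a QP attached to an SRQ)
 *	tavor_wql_refcnt_dec(wql);		refcnt == 1
 *	tavor_wql_refcnt_dec(wql);		refcnt == 0, mutex destroyed
 *						and WQL freed
 *
 * The last reference holder is therefore the one that frees the lock.)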
3462 */ 3463 static void 3464 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr, 3465 tavor_wrid_list_hdr_t *wridlist) 3466 { 3467 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock)); 3468 3469 /* Chain the new WRID list "container" to the work queue list */ 3470 if ((wqhdr->wq_wrid_post == NULL) && 3471 (wqhdr->wq_wrid_poll == NULL)) { 3472 wqhdr->wq_wrid_poll = wridlist; 3473 wqhdr->wq_wrid_post = wridlist; 3474 } else { 3475 wqhdr->wq_wrid_post->wl_next = wridlist; 3476 wridlist->wl_prev = wqhdr->wq_wrid_post; 3477 wqhdr->wq_wrid_post = wridlist; 3478 } 3479 } 3480 3481 3482 /* 3483 * tavor_wrid_wqhdr_remove() 3484 * Context: Can be called from interrupt or base context. 3485 * 3486 * Note: this is only called to remove the most recently added WRID list 3487 * container (i.e. in tavor_from_reset() above) 3488 */ 3489 static void 3490 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr, 3491 tavor_wrid_list_hdr_t *wridlist) 3492 { 3493 tavor_wrid_list_hdr_t *prev, *next; 3494 3495 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock)); 3496 3497 /* Unlink the WRID list "container" from the work queue list */ 3498 prev = wridlist->wl_prev; 3499 next = wridlist->wl_next; 3500 if (prev != NULL) { 3501 prev->wl_next = next; 3502 } 3503 if (next != NULL) { 3504 next->wl_prev = prev; 3505 } 3506 3507 /* 3508 * Update any pointers in the work queue hdr that may point to this 3509 * WRID list container 3510 */ 3511 if (wqhdr->wq_wrid_post == wridlist) { 3512 wqhdr->wq_wrid_post = prev; 3513 } 3514 if (wqhdr->wq_wrid_poll == wridlist) { 3515 wqhdr->wq_wrid_poll = NULL; 3516 } 3517 } 3518 3519 3520 /* 3521 * tavor_wrid_list_reap() 3522 * Context: Can be called from interrupt or base context. 3523 * Note: The "wqhdr_list_lock" must be held. 3524 */ 3525 static tavor_workq_hdr_t * 3526 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist) 3527 { 3528 tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL; 3529 tavor_wrid_list_hdr_t *prev, *next; 3530 uint32_t size; 3531 3532 TAVOR_TNF_ENTER(tavor_wrid_list_reap); 3533 3534 /* Get the back pointer to the work queue header (see below) */ 3535 wqhdr = wridlist->wl_wqhdr; 3536 mutex_enter(&wqhdr->wq_wrid_wql->wql_lock); 3537 3538 /* Unlink the WRID list "container" from the work queue list */ 3539 prev = wridlist->wl_prev; 3540 next = wridlist->wl_next; 3541 if (prev != NULL) { 3542 prev->wl_next = next; 3543 } 3544 if (next != NULL) { 3545 next->wl_prev = prev; 3546 } 3547 3548 /* 3549 * If the back pointer to the work queue header shows that it 3550 * was pointing to the entry we are about to remove, then the work 3551 * queue header is reapable as well. 3552 */ 3553 if ((wqhdr->wq_wrid_poll == wridlist) && 3554 (wqhdr->wq_wrid_post == wridlist)) { 3555 consume_wqhdr = wqhdr; 3556 } 3557 3558 /* Be sure to update the "poll" and "post" container pointers */ 3559 if (wqhdr->wq_wrid_poll == wridlist) { 3560 wqhdr->wq_wrid_poll = next; 3561 } 3562 if (wqhdr->wq_wrid_post == wridlist) { 3563 wqhdr->wq_wrid_post = NULL; 3564 } 3565 3566 /* Calculate the size and free the container */ 3567 size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t)); 3568 kmem_free(wridlist->wl_wre, size); 3569 kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t)); 3570 3571 mutex_exit(&wqhdr->wq_wrid_wql->wql_lock); 3572 3573 TAVOR_TNF_EXIT(tavor_wrid_list_reap); 3574 return (consume_wqhdr); 3575 } 3576 3577 3578 /* 3579 * tavor_wrid_wqhdr_lock_both() 3580 * Context: Can be called from interrupt or base context. 


/*
 * tavor_wrid_wqhdr_lock_both()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
{
	tavor_cqhdl_t	sq_cq, rq_cq;

	sq_cq = qp->qp_sq_cqhdl;
	rq_cq = qp->qp_rq_cqhdl;

	_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
	_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))

	/*
	 * If both work queues (send and recv) share a completion queue, then
	 * grab the common lock.  If they use different CQs (hence different
	 * "cq_wrid_wqhdr_lock" mutexes), then grab the send one first, then
	 * the receive.  We acquire in this order consistently (and release
	 * in the reverse order in tavor_wrid_wqhdr_unlock_both() below) to
	 * avoid introducing any kind of deadlock condition.  Note:  We add
	 * the "__lock_lint" code here to fake out warlock into thinking we've
	 * grabbed both locks (when, in fact, we only needed the one).
	 */
	if (sq_cq == rq_cq) {
		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
#ifdef __lock_lint
		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
#endif
	} else {
		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
	}
}

/*
 * tavor_wrid_wqhdr_unlock_both()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
{
	tavor_cqhdl_t	sq_cq, rq_cq;

	sq_cq = qp->qp_sq_cqhdl;
	rq_cq = qp->qp_rq_cqhdl;

	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))

	/*
	 * See tavor_wrid_wqhdr_lock_both() above for more detail
	 */
	if (sq_cq == rq_cq) {
#ifdef __lock_lint
		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
#endif
		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
	} else {
		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
	}
}


/*
 * tavor_cq_wqhdr_add()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
{
	tavor_workq_compare_t	cmp;
	avl_index_t		where;

	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));

	cmp.cmp_qpn = wqhdr->wq_qpn;
	cmp.cmp_type = wqhdr->wq_type;
#ifdef __lock_lint
	tavor_wrid_wqhdr_compare(NULL, NULL);
#endif
	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
	/*
	 * Insert the new work queue header into the CQ's AVL tree of work
	 * queue headers, at the insertion point returned by avl_find() above.
	 */
	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
}
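
/*
 * For reference, a minimal sketch of the AVL lookup that pairs with
 * tavor_cq_wqhdr_add() above.  Work queue headers are keyed by QP number
 * and work queue type, so a later lookup builds the same
 * tavor_workq_compare_t key and calls avl_find() without asking for an
 * insertion point ("cq", "qpn" and "wq_type" are assumed to be in scope,
 * and the CQ's "cq_wrid_wqhdr_lock" held):
 *
 *	tavor_workq_compare_t	cmp;
 *	tavor_workq_hdr_t	*wqhdr;
 *
 *	cmp.cmp_qpn = qpn;
 *	cmp.cmp_type = wq_type;
 *	wqhdr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
 *
 * avl_find() returns NULL when no header with that key has been added yet.
 */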


/*
 * tavor_cq_wqhdr_remove()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
{
	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));

#ifdef __lock_lint
	tavor_wrid_wqhdr_compare(NULL, NULL);
#endif
	/* Remove "wqhdr" from the work queue header list on "cq" */
	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);

	/*
	 * Release the reference to the WQL.  If this is the last reference,
	 * this call also has the side effect of freeing up the 'wq_wrid_wql'
	 * memory.
	 */
	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);

	/* Free the memory associated with "wqhdr" */
	kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
}


/*
 * tavor_wql_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
{
	ASSERT(wql != NULL);

	mutex_enter(&wql->wql_lock);
	wql->wql_refcnt++;
	mutex_exit(&wql->wql_lock);
}

/*
 * tavor_wql_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
{
	int	refcnt;

	ASSERT(wql != NULL);

	mutex_enter(&wql->wql_lock);
	wql->wql_refcnt--;
	refcnt = wql->wql_refcnt;
	mutex_exit(&wql->wql_lock);

	/*
	 * Free up the WQL memory if we were holding the last reference
	 * associated with this structure.
	 */
	if (refcnt == 0) {
		mutex_destroy(&wql->wql_lock);
		kmem_free(wql, sizeof (tavor_wq_lock_t));
	}
}
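
/*
 * For reference, a minimal sketch of the WQL reference counting convention
 * implemented above.  The real users are the work queue header setup and
 * teardown paths in this file; "state" here is simply assumed to be a valid
 * tavor_state_t pointer, and the error handling is elided:
 *
 *	tavor_wq_lock_t	*wql;
 *
 *	wql = tavor_wrid_wql_create(state);
 *	tavor_wql_refcnt_inc(wql);
 *	...
 *	tavor_wql_refcnt_dec(wql);
 *	tavor_wql_refcnt_dec(wql);
 *
 * tavor_wrid_wql_create() returns with "wql_refcnt" already at one, so a
 * second consumer that shares the lock takes its own reference with
 * tavor_wql_refcnt_inc().  Each consumer later drops its reference with
 * tavor_wql_refcnt_dec(); the call that takes the count to zero destroys
 * the mutex and frees the WQL memory.
 */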