/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_wr.c
 *    Tavor Work Request Processing Routines
 *
 *    Implements all the routines necessary to provide the PostSend(),
 *    PostRecv() and PostSRQ() verbs.  Also contains all the code
 *    necessary to implement the Tavor WRID tracking mechanism.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/avl.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
#pragma inline(tavor_qp_send_doorbell)
static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits);
#pragma inline(tavor_qp_recv_doorbell)
static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
    ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
    uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
    uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
    tavor_qphdl_t qp);
static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
    uint64_t *prev, tavor_qphdl_t qp);
static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint64_t *desc);
static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
    tavor_srqhdl_t srq);
static void tavor_wqe_sync(void *hdl, uint_t sync_from,
    uint_t sync_to, uint_t sync_type, uint_t flag);
static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
    tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
    uint_t send_or_recv);
static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
    tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
    tavor_wrid_list_hdr_t *wrid_list);
static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
    tavor_wrid_list_hdr_t *wrid_list);
static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);

/*
 * tavor_post_send()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
    tavor_sw_wqe_dbinfo_t dbinfo;
    tavor_wrid_list_hdr_t *wridlist;
    tavor_wrid_entry_t *wre_last;
    uint64_t *desc, *prev, *first;
    uint32_t desc_sz, first_sz;
    uint32_t wqeaddrsz, signaled_dbd;
    uint32_t head, tail, next_tail, qsize_msk;
    uint32_t sync_from, sync_to;
    uint_t currindx, wrindx, numremain;
    uint_t chainlen, chainbegin, posted_cnt;
    uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
    int status;

    TAVOR_TNF_ENTER(tavor_post_send);

    /*
     * Check for user-mappable QP memory.  Note: We do not allow kernel
     * clients to post to QP memory that is accessible directly by the
     * user.  If the QP memory is user accessible, then return an error.
     */
    if (qp->qp_is_umap) {
        TNF_PROBE_0(tavor_post_send_inv_usrmapped_type,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_post_send);
        return (IBT_QP_HDL_INVALID);
    }

    /* Initialize posted_cnt */
    posted_cnt = 0;

    mutex_enter(&qp->qp_lock);

    /*
     * Check QP state.  Can not post Send requests from the "Reset",
     * "Init", or "RTR" states
     */
    if ((qp->qp_state == TAVOR_QP_RESET) ||
        (qp->qp_state == TAVOR_QP_INIT) ||
        (qp->qp_state == TAVOR_QP_RTR)) {
        mutex_exit(&qp->qp_lock);
        TNF_PROBE_0(tavor_post_send_inv_qpstate_fail,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_post_send);
        return (IBT_QP_STATE_INVALID);
    }

    /* Grab the lock for the WRID list */
    mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
    wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
    tail = qp->qp_sq_wqhdr->wq_tail;
    head = qp->qp_sq_wqhdr->wq_head;

    /*
     * For each ibt_send_wr_t in the wr[] list passed in, parse the
     * request and build a Send WQE.  Note: Because we are potentially
     * building a chain of WQEs, we want to link them all together.
     * However, we do not want to link the first one to the previous
     * WQE until the entire chain has been linked.  Then in the last
     * step we ring the appropriate doorbell.  Note: It is possible for
     * more Work Requests to be posted than the HW will support at one
     * shot.  If this happens, we need to be able to post and ring
     * several chains here until the entire request is complete.
     */
    wrindx = 0;
    numremain = num_wr;
    status = DDI_SUCCESS;
    while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
        /*
         * For the first WQE on a new chain we need "prev" to point
         * to the current descriptor.  As we begin to process
         * further, "prev" will be updated to point to the previous
         * WQE on the current chain (see below).
         */
        prev = TAVOR_QP_SQ_ENTRY(qp, tail);

        /*
         * Before we begin, save the current "tail index" for later
         * DMA sync
         */
        sync_from = tail;

        /*
         * Break the request up into chains that are less than or
         * equal to the maximum number of WQEs that can be posted
         * per doorbell ring
         */
        chainlen = (numremain > maxdb) ? maxdb : numremain;
        numremain -= chainlen;
        chainbegin = wrindx;
        for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
            /*
             * Check for "queue full" condition.  If the queue
             * is already full, then no more WQEs can be posted.
             * So break out, ring a doorbell (if necessary) and
             * return an error
             */
            if (qp->qp_sq_wqhdr->wq_full != 0) {
                status = IBT_QP_FULL;
                TNF_PROBE_0_DEBUG(tavor_post_send_sqfull,
                    TAVOR_TNF_TRACE, "");
                break;
            }

            /*
             * Increment the "tail index" and check for "queue
             * full" condition.  If we detect that the current
             * work request is going to fill the work queue, then
             * we mark this condition and continue.
             */
            next_tail = (tail + 1) & qsize_msk;
            if (next_tail == head) {
                qp->qp_sq_wqhdr->wq_full = 1;
            }
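
            /*
             * (Illustration, not in the original source) These
             * indices wrap with a power-of-two mask.  With
             * wq_size = 8 (qsize_msk = 7), tail = 7 and head = 0:
             * next_tail = (7 + 1) & 7 = 0, which equals head, so
             * this WQE is the one that fills the queue and
             * wq_full is set; the entry itself is still built
             * and posted below.
             */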

            /*
             * Get the address of the location where the next
             * Send WQE should be built
             */
            desc = TAVOR_QP_SQ_ENTRY(qp, tail);

            /*
             * Call tavor_wqe_send_build() to build the WQE
             * at the given address.  This routine uses the
             * information in the ibt_send_wr_t list (wr[]) and
             * returns the size of the WQE when it returns.
             */
            status = tavor_wqe_send_build(state, qp,
                &wr[wrindx], desc, &desc_sz);
            if (status != DDI_SUCCESS) {
                TNF_PROBE_0(tavor_post_send_bldwqe_fail,
                    TAVOR_TNF_ERROR, "");
                break;
            }

            /*
             * Add a WRID entry to the WRID list.  Need to
             * calculate the "wqeaddrsz" and "signaled_dbd"
             * values to pass to tavor_wrid_add_entry()
             */
            wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
                ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
                desc_sz);
            if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
                (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
                signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
            } else {
                signaled_dbd = 0;
            }
            tavor_wrid_add_entry(qp->qp_sq_wqhdr,
                wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);

            /*
             * If this is not the first descriptor on the current
             * chain, then link it to the previous WQE.  Otherwise,
             * save the address and size of this descriptor (in
             * "first" and "first_sz" respectively) and continue.
             * Note: Linking a WQE to the previous one will
             * depend on whether the two WQEs are from "special
             * QPs" (i.e. MLX transport WQEs) or whether they are
             * normal Send WQEs.
             */
            if (currindx != 0) {
                if (qp->qp_is_special) {
                    tavor_wqe_mlx_linknext(&wr[wrindx - 1],
                        desc, desc_sz, prev, NULL, qp);
                } else {
                    tavor_wqe_send_linknext(&wr[wrindx],
                        &wr[wrindx - 1], desc, desc_sz,
                        prev, NULL, qp);
                }
                prev = desc;
            } else {
                first = desc;
                first_sz = desc_sz;
            }

            /*
             * Update the current "tail index" and increment
             * "posted_cnt"
             */
            tail = next_tail;
            posted_cnt++;
        }

        /*
         * If we reach here and there are one or more WQEs which have
         * been successfully chained together, then we need to link
         * the current chain to the previously executing chain of
         * descriptors (if there is one) and ring the doorbell for the
         * send work queue.
         */
        if (currindx != 0) {
            /*
             * Before we link the chain, we need to ensure that the
             * "next" field on the last WQE is set to NULL (to
             * indicate the end of the chain).  Note: Just as it
             * did above, the format for the "next" fields in a
             * given WQE depends on whether the WQE is MLX
             * transport or not.
             */
            if (qp->qp_is_special) {
                tavor_wqe_mlx_linknext(&wr[chainbegin +
                    currindx - 1], NULL, 0, prev, NULL, qp);
            } else {
                tavor_wqe_send_linknext(NULL,
                    &wr[chainbegin + currindx - 1], NULL, 0,
                    prev, NULL, qp);
            }

            /* Save away updated "tail index" for the DMA sync */
            sync_to = tail;

            /* Do a DMA sync for current send WQE(s) */
            tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
                DDI_DMA_SYNC_FORDEV);

            /*
             * Now link the chain to the old chain (if there was
             * one).  Note: still need to pay attention to whether
             * the QP used MLX transport WQEs or not.
             */
            if (qp->qp_is_special) {
                tavor_wqe_mlx_linknext(NULL, first, first_sz,
                    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
            } else {
                tavor_wqe_send_linknext(&wr[chainbegin], NULL,
                    first, first_sz, qp->qp_sq_lastwqeaddr,
                    &dbinfo, qp);
            }

            /*
             * If there was a valid previous WQE (i.e. non-NULL),
             * then sync it too.  This is because we have updated
             * its "next" fields and we want to ensure that the
             * hardware can see the changes.
             */
            if (qp->qp_sq_lastwqeaddr != NULL) {
                sync_to = sync_from;
                sync_from = (sync_from - 1) & qsize_msk;
                tavor_wqe_sync(qp, sync_from, sync_to,
                    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
            }

            /*
             * Now if the WRID tail entry is non-NULL, then this
             * represents the entry to which we are chaining the
             * new entries.  Since we are going to ring the
             * doorbell for this WQE, we want to set its "dbd" bit.
             *
             * On the other hand, if the tail is NULL, even though
             * we will have rung the doorbell for the previous WQE
             * (for the hardware's sake) it is irrelevant to our
             * purposes (for tracking WRIDs) because we know the
             * request must have already completed.
             */
            wre_last = wridlist->wl_wre_old_tail;
            if (wre_last != NULL) {
                wre_last->wr_signaled_dbd |=
                    TAVOR_WRID_ENTRY_DOORBELLED;
            }

            /* Update some of the state in the QP */
            qp->qp_sq_lastwqeaddr = desc;
            qp->qp_sq_wqhdr->wq_tail = tail;

            /* Ring the doorbell */
            tavor_qp_send_doorbell(state,
                (uint32_t)((uintptr_t)first - qp->qp_desc_off),
                first_sz, qp->qp_qpnum, dbinfo.db_fence,
                dbinfo.db_nopcode);
        }
    }

    /*
     * Update the "num_posted" return value (if necessary).  Then drop
     * the locks and return success.
     */
    if (num_posted != NULL) {
        *num_posted = posted_cnt;
    }

    mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
    mutex_exit(&qp->qp_lock);

    TAVOR_TNF_EXIT(tavor_post_send);
    return (status);
}
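
/*
 * (Summary, restating the flow above; not part of the original source)
 * Each chain of send WQEs is made visible to the hardware in five steps:
 *
 *   1. Build every WQE in the chain, linking each to its predecessor
 *      (tavor_wqe_send_linknext()/tavor_wqe_mlx_linknext()).
 *   2. Write a terminating "next" (NULL) into the chain's last WQE.
 *   3. DMA-sync the newly built WQEs for the device.
 *   4. Link the chain onto the previously posted WQE, if any, sync
 *      that WQE as well, and mark its WRID entry as "doorbelled".
 *   5. Ring the send doorbell with the address and size of the chain's
 *      first WQE.
 *
 * The first WQE is deliberately linked in last (step 4): until then the
 * hardware can never follow a "next" pointer into a half-built chain.
 */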

/*
 * tavor_post_recv()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
    tavor_wrid_list_hdr_t *wridlist;
    tavor_wrid_entry_t *wre_last;
    uint64_t *desc, *prev, *first;
    uint32_t desc_sz, first_sz;
    uint32_t wqeaddrsz, signaled_dbd;
    uint32_t head, tail, next_tail, qsize_msk;
    uint32_t sync_from, sync_to;
    uint_t currindx, wrindx, numremain;
    uint_t chainlen, posted_cnt;
    uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
    int status;

    TAVOR_TNF_ENTER(tavor_post_recv);

    /*
     * Check for user-mappable QP memory.  Note: We do not allow kernel
     * clients to post to QP memory that is accessible directly by the
     * user.  If the QP memory is user accessible, then return an error.
     */
    if (qp->qp_is_umap) {
        TNF_PROBE_0(tavor_post_recv_inv_usrmapped_type,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_post_recv);
        return (IBT_QP_HDL_INVALID);
    }

    /* Initialize posted_cnt */
    posted_cnt = 0;

    mutex_enter(&qp->qp_lock);

    /*
     * Check if QP is associated with an SRQ
     */
    if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
        mutex_exit(&qp->qp_lock);
        TNF_PROBE_0(tavor_post_recv_fail_qp_on_srq,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_post_recv);
        return (IBT_SRQ_IN_USE);
    }

    /*
     * Check QP state.  Can not post Recv requests from the "Reset" state
     */
    if (qp->qp_state == TAVOR_QP_RESET) {
        mutex_exit(&qp->qp_lock);
        TNF_PROBE_0(tavor_post_recv_inv_qpstate_fail,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_post_recv);
        return (IBT_QP_STATE_INVALID);
    }

    /* Grab the lock for the WRID list */
    mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
    wridlist = qp->qp_rq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
    tail = qp->qp_rq_wqhdr->wq_tail;
    head = qp->qp_rq_wqhdr->wq_head;

    /*
     * For each ibt_recv_wr_t in the wr[] list passed in, parse the
     * request and build a Recv WQE.  Note: Because we are potentially
     * building a chain of WQEs, we want to link them all together.
     * However, we do not want to link the first one to the previous
     * WQE until the entire chain has been linked.  Then in the last
     * step we ring the appropriate doorbell.  Note: It is possible for
     * more Work Requests to be posted than the HW will support at one
     * shot.  If this happens, we need to be able to post and ring
     * several chains here until the entire request is complete.
     */
    wrindx = 0;
    numremain = num_wr;
    status = DDI_SUCCESS;
    while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
        /*
         * For the first WQE on a new chain we need "prev" to point
         * to the current descriptor.  As we begin to process
         * further, "prev" will be updated to point to the previous
         * WQE on the current chain (see below).
         */
        prev = TAVOR_QP_RQ_ENTRY(qp, tail);

        /*
         * Before we begin, save the current "tail index" for later
         * DMA sync
         */
        sync_from = tail;

        /*
         * Break the request up into chains that are less than or
         * equal to the maximum number of WQEs that can be posted
         * per doorbell ring
         */
        chainlen = (numremain > maxdb) ? maxdb : numremain;
        numremain -= chainlen;
        for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
            /*
             * Check for "queue full" condition.  If the queue
             * is already full, then no more WQEs can be posted.
             * So break out, ring a doorbell (if necessary) and
             * return an error
             */
            if (qp->qp_rq_wqhdr->wq_full != 0) {
                status = IBT_QP_FULL;
                TNF_PROBE_0_DEBUG(tavor_post_recv_rqfull,
                    TAVOR_TNF_TRACE, "");
                break;
            }

            /*
             * Increment the "tail index" and check for "queue
             * full" condition.  If we detect that the current
             * work request is going to fill the work queue, then
             * we mark this condition and continue.
             */
            next_tail = (tail + 1) & qsize_msk;
            if (next_tail == head) {
                qp->qp_rq_wqhdr->wq_full = 1;
            }

            /*
             * Get the address of the location where the next
             * Recv WQE should be built
             */
            desc = TAVOR_QP_RQ_ENTRY(qp, tail);

            /*
             * Call tavor_wqe_recv_build() to build the WQE
             * at the given address.  This routine uses the
             * information in the ibt_recv_wr_t list (wr[]) and
             * returns the size of the WQE when it returns.
             */
            status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
                desc, &desc_sz);
            if (status != DDI_SUCCESS) {
                TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
                    TAVOR_TNF_ERROR, "");
                break;
            }

            /*
             * Add a WRID entry to the WRID list.  Need to
             * calculate the "wqeaddrsz" and "signaled_dbd"
             * values to pass to tavor_wrid_add_entry().  Note:
             * all Recv WQEs are essentially "signaled"
             */
            wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
                ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
                desc_sz);
            signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
            tavor_wrid_add_entry(qp->qp_rq_wqhdr,
                wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);

            /*
             * If this is not the first descriptor on the current
             * chain, then link it to the previous WQE.  Otherwise,
             * save the address and size of this descriptor (in
             * "first" and "first_sz" respectively) and continue.
             */
            if (currindx != 0) {
                tavor_wqe_recv_linknext(desc, desc_sz, prev,
                    qp);
                prev = desc;
            } else {
                first = desc;
                first_sz = desc_sz;
            }

            /*
             * Update the current "tail index" and increment
             * "posted_cnt"
             */
            tail = next_tail;
            posted_cnt++;
        }

        /*
         * If we reach here and there are one or more WQEs which have
         * been successfully chained together, then we need to link
         * the current chain to the previously executing chain of
         * descriptors (if there is one) and ring the doorbell for the
         * recv work queue.
         */
        if (currindx != 0) {
            /*
             * Before we link the chain, we need to ensure that the
             * "next" field on the last WQE is set to NULL (to
             * indicate the end of the chain).
             */
            tavor_wqe_recv_linknext(NULL, 0, prev, qp);

            /* Save away updated "tail index" for the DMA sync */
            sync_to = tail;

            /* Do a DMA sync for current recv WQE(s) */
            tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
                DDI_DMA_SYNC_FORDEV);

            /*
             * Now link the chain to the old chain (if there was
             * one).
             */
            tavor_wqe_recv_linknext(first, first_sz,
                qp->qp_rq_lastwqeaddr, qp);

            /*
             * If there was a valid previous WQE (i.e. non-NULL),
             * then sync it too.  This is because we have updated
             * its "next" fields and we want to ensure that the
             * hardware can see the changes.
             */
            if (qp->qp_rq_lastwqeaddr != NULL) {
                sync_to = sync_from;
                sync_from = (sync_from - 1) & qsize_msk;
                tavor_wqe_sync(qp, sync_from, sync_to,
                    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
            }

            /*
             * Now if the WRID tail entry is non-NULL, then this
             * represents the entry to which we are chaining the
             * new entries.  Since we are going to ring the
             * doorbell for this WQE, we want to set its "dbd" bit.
             *
             * On the other hand, if the tail is NULL, even though
             * we will have rung the doorbell for the previous WQE
             * (for the hardware's sake) it is irrelevant to our
             * purposes (for tracking WRIDs) because we know the
             * request must have already completed.
             */
            wre_last = wridlist->wl_wre_old_tail;
            if (wre_last != NULL) {
                wre_last->wr_signaled_dbd |=
                    TAVOR_WRID_ENTRY_DOORBELLED;
            }

            /* Update some of the state in the QP */
            qp->qp_rq_lastwqeaddr = desc;
            qp->qp_rq_wqhdr->wq_tail = tail;

            /* Ring the doorbell */
            tavor_qp_recv_doorbell(state,
                (uint32_t)((uintptr_t)first - qp->qp_desc_off),
                first_sz, qp->qp_qpnum, (chainlen % maxdb));
        }
    }

    /*
     * Update the "num_posted" return value (if necessary).  Then drop
     * the locks and return success.
     */
    if (num_posted != NULL) {
        *num_posted = posted_cnt;
    }

    mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
    mutex_exit(&qp->qp_lock);

    TAVOR_TNF_EXIT(tavor_post_recv);
    return (status);
}

/*
 * tavor_post_srq()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
    uint64_t *desc, *prev, *first, *last_wqe_addr;
    uint32_t signaled_dbd;
    uint32_t sync_indx;
    uint_t currindx, wrindx, numremain;
    uint_t chainlen, posted_cnt;
    uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
    int status;

    TAVOR_TNF_ENTER(tavor_post_srq);

    /*
     * Check for user-mappable QP memory.  Note: We do not allow kernel
     * clients to post to QP memory that is accessible directly by the
     * user.  If the QP memory is user accessible, then return an error.
     */
    if (srq->srq_is_umap) {
        TNF_PROBE_0(tavor_post_srq_inv_usrmapped_type,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_post_srq);
        return (IBT_SRQ_HDL_INVALID);
    }

    /* Initialize posted_cnt */
    posted_cnt = 0;

    mutex_enter(&srq->srq_lock);

    /*
     * Check SRQ state.  Can not post Recv requests when SRQ is in error
     */
    if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
        mutex_exit(&srq->srq_lock);
        TNF_PROBE_0(tavor_post_srq_inv_srqstate_fail,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_post_srq);
        return (IBT_QP_STATE_INVALID);
    }

    /* Grab the lock for the WRID list */
    mutex_enter(&srq->srq_wrid_wql->wql_lock);

    /*
     * For each ibt_recv_wr_t in the wr[] list passed in, parse the
     * request and build a Recv WQE.  Note: Because we are potentially
     * building a chain of WQEs, we want to link them all together.
     * However, we do not want to link the first one to the previous
     * WQE until the entire chain has been linked.  Then in the last
     * step we ring the appropriate doorbell.  Note: It is possible for
     * more Work Requests to be posted than the HW will support at one
     * shot.  If this happens, we need to be able to post and ring
     * several chains here until the entire request is complete.
     */
    wrindx = 0;
    numremain = num_wr;
    status = DDI_SUCCESS;
    while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
        /*
         * For the first WQE on a new chain we need "prev" to point
         * to the current descriptor.  As we begin to process
         * further, "prev" will be updated to point to the previous
         * WQE on the current chain (see below).
         */
        if (srq->srq_wq_lastwqeindx == -1) {
            prev = NULL;
        } else {
            prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
        }

        /*
         * Break the request up into chains that are less than or
         * equal to the maximum number of WQEs that can be posted
         * per doorbell ring
         */
        chainlen = (numremain > maxdb) ? maxdb : numremain;
        numremain -= chainlen;
        for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {

            /*
             * Check for "queue full" condition.  If the queue
             * is already full, then no more WQEs can be posted.
             * So break out, ring a doorbell (if necessary) and
             * return an error
             */
            if (srq->srq_wridlist->wl_free_list_indx == -1) {
                status = IBT_QP_FULL;
                TNF_PROBE_0_DEBUG(tavor_post_srq_wqfull,
                    TAVOR_TNF_TRACE, "");
                break;
            }

            /*
             * Get the address of the location where the next
             * Recv WQE should be built
             */
            desc = TAVOR_SRQ_WQE_ADDR(srq,
                srq->srq_wridlist->wl_free_list_indx);

            /*
             * Add a WRID entry to the WRID list.  Need to
             * set the "signaled_dbd" values to pass to
             * tavor_wrid_add_entry().  Note: all Recv WQEs are
             * essentially "signaled"
             *
             * The 'size' is stored at srq_alloc time, in the
             * srq_wq_stride.  This is a constant value required
             * for SRQ.
             */
            signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
            tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
                signaled_dbd);

            /*
             * Call tavor_wqe_srq_build() to build the WQE
             * at the given address.  This routine uses the
             * information in the ibt_recv_wr_t list (wr[]) and
             * returns the size of the WQE when it returns.
             */
            status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
                desc);
            if (status != DDI_SUCCESS) {
                TNF_PROBE_0(tavor_post_recv_bldwqe_fail,
                    TAVOR_TNF_ERROR, "");
                break;
            }

            /*
             * If this is not the first descriptor on the current
             * chain, then link it to the previous WQE.  Otherwise,
             * save the address of this descriptor (in "first") and
             * continue.
             */
            if (currindx != 0) {
                tavor_wqe_srq_linknext(desc, prev, srq);
                sync_indx = TAVOR_SRQ_WQE_INDEX(
                    srq->srq_wq_buf, prev,
                    srq->srq_wq_log_wqesz);

                /* Do a DMA sync for previous recv WQE */
                tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);

                prev = desc;
            } else {

                /*
                 * In this case, the last WQE on the chain is
                 * also considered 'first'.  So set prev to
                 * first, here.
                 */
                first = prev = desc;
            }

            /*
             * Increment "posted_cnt"
             */
            posted_cnt++;
        }

        /*
         * If we reach here and there are one or more WQEs which have
         * been successfully chained together, then we need to link
         * the current chain to the previously executing chain of
         * descriptors (if there is one) and ring the doorbell for the
         * recv work queue.
         */
        if (currindx != 0) {
            /*
             * Before we link the chain, we need to ensure that the
             * "next" field on the last WQE is set to NULL (to
             * indicate the end of the chain).
             */
            tavor_wqe_srq_linknext(NULL, prev, srq);

            sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
                srq->srq_wq_log_wqesz);

            /* Do a DMA sync for current recv WQE */
            tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);

            /*
             * Now link the chain to the old chain (if there was
             * one).
             */
            if (srq->srq_wq_lastwqeindx == -1) {
                last_wqe_addr = NULL;
            } else {
                last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
                    srq->srq_wq_lastwqeindx);
            }
            tavor_wqe_srq_linknext(first, last_wqe_addr, srq);

            /*
             * If there was a valid previous WQE (i.e. valid index),
             * then sync it too.  This is because we have updated
             * its "next" fields and we want to ensure that the
             * hardware can see the changes.
             */
            if (srq->srq_wq_lastwqeindx != -1) {
                sync_indx = srq->srq_wq_lastwqeindx;
                tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
            }

            /* Update some of the state in the SRQ */
            srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
                srq->srq_wq_buf, desc,
                srq->srq_wq_log_wqesz);

            /* Ring the doorbell */
            /* SRQ needs NDS of 0 */
            tavor_qp_recv_doorbell(state,
                (uint32_t)((uintptr_t)first - srq->srq_desc_off),
                0, srq->srq_srqnum, (chainlen % maxdb));
        }
    }

    /*
     * Update the "num_posted" return value (if necessary).  Then drop
     * the locks and return success.
     */
    if (num_posted != NULL) {
        *num_posted = posted_cnt;
    }

    mutex_exit(&srq->srq_wrid_wql->wql_lock);
    mutex_exit(&srq->srq_lock);

    TAVOR_TNF_EXIT(tavor_post_srq);
    return (status);
}
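
/*
 * (Note on the SRQ path above; not part of the original source) Unlike
 * tavor_post_send() and tavor_post_recv(), which advance head/tail
 * indices through a ring, tavor_post_srq() hands out WQEs from a free
 * list: wl_free_list_indx names the next available WQE (-1 means the
 * SRQ is full), and entries are recycled as completions are reaped
 * elsewhere in the WRID code.  This is also why the previously posted
 * WQE is remembered by index (srq_wq_lastwqeindx) rather than by
 * address.  The doorbell itself is rung with an NDS of zero, since SRQ
 * WQE sizes are fixed by srq_wq_stride at srq_alloc time.
 */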

/*
 * tavor_qp_send_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
    uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
    uint64_t doorbell = 0;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
        TAVOR_QPSNDDB_NDA_SHIFT) |
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;

    TNF_PROBE_1_DEBUG(tavor_qp_send_doorbell, TAVOR_TNF_TRACE, "",
        tnf_ulong, doorbell, doorbell);

    /* Write the doorbell to UAR */
    TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
        doorbell);
}

/*
 * tavor_qp_recv_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
    uint32_t qpn, uint32_t credits)
{
    uint64_t doorbell = 0;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
        TAVOR_QPRCVDB_NDA_SHIFT) |
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

    TNF_PROBE_1_DEBUG(tavor_qp_recv_doorbell, TAVOR_TNF_TRACE, "",
        tnf_ulong, doorbell, doorbell);

    /* Write the doorbell to UAR */
    TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
        doorbell);
}
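
/*
 * (Illustration, not in the original source) Both doorbell words above
 * are assembled the same way: each parameter is masked and shifted into
 * its field of a single uint64_t (per the TAVOR_QPSNDDB_* and
 * TAVOR_QPRCVDB_* definitions in tavor_hw.h), with the descriptor size
 * (NDS, in 16-byte chunks) OR-ed into the low-order bits unshifted.
 * The assembled word is then written to the hardware's UAR page with a
 * single 64-bit store, so the first descriptor's address, size, and
 * opcode (or credit count) arrive at the device as one unit.
 */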

/*
 * tavor_wqe_send_build()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
    tavor_hw_snd_wqe_ud_t *ud;
    tavor_hw_snd_wqe_remaddr_t *rc;
    tavor_hw_snd_wqe_atomic_t *at;
    tavor_hw_snd_wqe_remaddr_t *uc;
    tavor_hw_snd_wqe_bind_t *bn;
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t *sgl;
    tavor_ahhdl_t ah;
    uint32_t nds;
    int i, num_ds, status;

    TAVOR_TNF_ENTER(tavor_wqe_send_build);

    ASSERT(MUTEX_HELD(&qp->qp_lock));

    /* Initialize the information for the Data Segments */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
        sizeof (tavor_hw_snd_wqe_nextctrl_t));
    nds = wr->wr_nds;
    sgl = wr->wr_sgl;
    num_ds = 0;

    /*
     * Building a Send WQE depends first and foremost on the transport
     * type of the Work Request (i.e. UD, RC, or UC)
     */
    switch (wr->wr_trans) {
    case IBT_UD_SRV:
        /* Ensure that work request transport type matches QP type */
        if (qp->qp_serv_type != TAVOR_QP_UD) {
            TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (IBT_QP_SRV_TYPE_INVALID);
        }

        /*
         * Validate the operation type.  For UD requests, only the
         * "Send" operation is valid
         */
        if (wr->wr_opcode != IBT_WRC_SEND) {
            TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (IBT_QP_OP_TYPE_INVALID);
        }

        /*
         * If this is a Special QP (QP0 or QP1), then we need to
         * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
         * and return whatever status it returns
         */
        if (qp->qp_is_special) {
            status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (status);
        }

        /*
         * Otherwise, if this is a normal UD Send request, then fill
         * all the fields in the Tavor UD header for the WQE.  Note:
         * to do this we'll need to extract some information from the
         * Address Handle passed with the work request.
         */
        ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));
        ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
        if (ah == NULL) {
            TNF_PROBE_0(tavor_wqe_send_build_invahhdl_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (IBT_AH_HDL_INVALID);
        }

        /*
         * Build the Unreliable Datagram Segment for the WQE, using
         * the information from the address handle and the work
         * request.
         */
        mutex_enter(&ah->ah_lock);
        TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
        mutex_exit(&ah->ah_lock);

        /* Update "ds" for filling in Data Segments (below) */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
            sizeof (tavor_hw_snd_wqe_ud_t));
        break;

    case IBT_RC_SRV:
        /* Ensure that work request transport type matches QP type */
        if (qp->qp_serv_type != TAVOR_QP_RC) {
            TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (IBT_QP_SRV_TYPE_INVALID);
        }

        /*
         * Validate the operation type.  For RC requests, we allow
         * "Send", "RDMA Read", "RDMA Write", various "Atomic"
         * operations, and memory window "Bind"
         */
        if ((wr->wr_opcode != IBT_WRC_SEND) &&
            (wr->wr_opcode != IBT_WRC_RDMAR) &&
            (wr->wr_opcode != IBT_WRC_RDMAW) &&
            (wr->wr_opcode != IBT_WRC_CSWAP) &&
            (wr->wr_opcode != IBT_WRC_FADD) &&
            (wr->wr_opcode != IBT_WRC_BIND)) {
            TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (IBT_QP_OP_TYPE_INVALID);
        }

        /*
         * If this is a Send request, then all we need to do is break
         * out here and begin the Data Segment processing below
         */
        if (wr->wr_opcode == IBT_WRC_SEND) {
            break;
        }

        /*
         * If this is an RDMA Read or RDMA Write request, then fill
         * in the "Remote Address" header fields.
         */
        if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
            (wr->wr_opcode == IBT_WRC_RDMAW)) {
            rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Remote Address Segment for the WQE, using
             * the information from the RC work request.
             */
            TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);

            /* Update "ds" for filling in Data Segments (below) */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
                sizeof (tavor_hw_snd_wqe_remaddr_t));
            break;
        }

        /*
         * If this is one of the Atomic type operations (i.e.
         * Compare-Swap or Fetch-Add), then fill in both the "Remote
         * Address" header fields and the "Atomic" header fields.
         */
        if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
            (wr->wr_opcode == IBT_WRC_FADD)) {
            rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));
            at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
                sizeof (tavor_hw_snd_wqe_remaddr_t));

            /*
             * Build the Remote Address and Atomic Segments for
             * the WQE, using the information from the RC Atomic
             * work request.
             */
            TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
            TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);

            /* Update "ds" for filling in Data Segments (below) */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
                sizeof (tavor_hw_snd_wqe_atomic_t));

            /*
             * Update "nds" and "sgl" because Atomic requests have
             * only a single Data Segment (and they are encoded
             * somewhat differently in the work request).
             */
            nds = 1;
            sgl = wr->wr_sgl;
            break;
        }

        /*
         * If this is a memory window Bind operation, then we call the
         * tavor_wr_bind_check() routine to validate the request and
         * to generate the updated RKey.  If this is successful, then
         * we fill in the WQE's "Bind" header fields.
         */
        if (wr->wr_opcode == IBT_WRC_BIND) {
            status = tavor_wr_bind_check(state, wr);
            if (status != DDI_SUCCESS) {
                TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_wqe_send_build);
                return (status);
            }

            bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Bind Memory Window Segments for the WQE,
             * using the information from the RC Bind memory
             * window work request.
             */
            TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);

            /*
             * Update the "ds" pointer.  Even though the "bind"
             * operation requires no SGLs, this is necessary to
             * facilitate the correct descriptor size calculations
             * (below).
             */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
                sizeof (tavor_hw_snd_wqe_bind_t));
            nds = 0;
        }
        break;

    case IBT_UC_SRV:
        /* Ensure that work request transport type matches QP type */
        if (qp->qp_serv_type != TAVOR_QP_UC) {
            TNF_PROBE_0(tavor_wqe_send_build_inv_servtype_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (IBT_QP_SRV_TYPE_INVALID);
        }

        /*
         * Validate the operation type.  For UC requests, we only
         * allow "Send", "RDMA Write", and memory window "Bind".
         * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
         * operations
         */
        if ((wr->wr_opcode != IBT_WRC_SEND) &&
            (wr->wr_opcode != IBT_WRC_RDMAW) &&
            (wr->wr_opcode != IBT_WRC_BIND)) {
            TNF_PROBE_0(tavor_wqe_send_build_inv_optype_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_send_build);
            return (IBT_QP_OP_TYPE_INVALID);
        }

        /*
         * If this is a Send request, then all we need to do is break
         * out here and begin the Data Segment processing below
         */
        if (wr->wr_opcode == IBT_WRC_SEND) {
            break;
        }

        /*
         * If this is an RDMA Write request, then fill in the "Remote
         * Address" header fields.
         */
        if (wr->wr_opcode == IBT_WRC_RDMAW) {
            uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Remote Address Segment for the WQE, using
             * the information from the UC work request.
             */
            TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);

            /* Update "ds" for filling in Data Segments (below) */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
                sizeof (tavor_hw_snd_wqe_remaddr_t));
            break;
        }

        /*
         * If this is a memory window Bind operation, then we call the
         * tavor_wr_bind_check() routine to validate the request and
         * to generate the updated RKey.  If this is successful, then
         * we fill in the WQE's "Bind" header fields.
         */
        if (wr->wr_opcode == IBT_WRC_BIND) {
            status = tavor_wr_bind_check(state, wr);
            if (status != DDI_SUCCESS) {
                TNF_PROBE_0(tavor_wqe_send_build_bind_fail,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_wqe_send_build);
                return (status);
            }

            bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Bind Memory Window Segments for the WQE,
             * using the information from the UC Bind memory
             * window work request.
             */
            TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);

            /*
             * Update the "ds" pointer.  Even though the "bind"
             * operation requires no SGLs, this is necessary to
             * facilitate the correct descriptor size calculations
             * (below).
             */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
                sizeof (tavor_hw_snd_wqe_bind_t));
            nds = 0;
        }
        break;

    default:
        TNF_PROBE_0(tavor_wqe_send_build_inv_tranport_fail,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_wqe_send_build);
        return (IBT_QP_SRV_TYPE_INVALID);
    }

    /*
     * Now fill in the Data Segments (SGL) for the Send WQE based on
     * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
     * Start by checking for a valid number of SGL entries
     */
    if (nds > qp->qp_sq_sgl) {
        TNF_PROBE_0(tavor_wqe_send_build_toomanysgl_fail,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_wqe_send_build);
        return (IBT_QP_SGL_LEN_INVALID);
    }

    /*
     * For each SGL in the Send Work Request, fill in the Send WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < nds; i++) {
        if (sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the current WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
        num_ds++;
    }

    /* Return the size of descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;

    TAVOR_TNF_EXIT(tavor_wqe_send_build);
    return (DDI_SUCCESS);
}
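
/*
 * (Worked example, not in the original source) For a normal UD send
 * with two non-zero-length SGL entries, the descriptor laid out above
 * is the "next/ctrl" header, the UD address segment, and two data
 * segments, so the final size computation yields, in 16-byte chunks:
 *
 *     *size = (sizeof (tavor_hw_snd_wqe_nextctrl_t) +
 *         sizeof (tavor_hw_snd_wqe_ud_t) +
 *         2 * sizeof (tavor_hw_wqe_sgl_t)) >> 4;
 *
 * This is the same NDS value that is later packed into the "next" word
 * of the preceding WQE and into the send doorbell.
 */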

/*
 * tavor_wqe_send_linknext()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
    uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
    tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
{
    uint64_t next, ctrl;
    uint32_t nopcode, fence;

    /*
     * Calculate the "next" field of the descriptor.  This amounts to
     * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
     * fields (see tavor_hw.h for more).  Note: If there is no next
     * descriptor (i.e. if the current descriptor is the last WQE on
     * the chain), then set "next" to zero.
     */
    if (curr_desc != NULL) {
        /*
         * Determine the value for the Tavor WQE "nopcode" field
         * by using the IBTF opcode from the work request
         */
        switch (curr_wr->wr_opcode) {
        case IBT_WRC_RDMAW:
            if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
                nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
            } else {
                nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
            }
            break;

        case IBT_WRC_SEND:
            if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
                nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
            } else {
                nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
            }
            break;

        case IBT_WRC_RDMAR:
            nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
            break;

        case IBT_WRC_CSWAP:
            nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
            break;

        case IBT_WRC_FADD:
            nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
            break;

        case IBT_WRC_BIND:
            nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
            break;
        }

        curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
            - qp->qp_desc_off);
        next = ((uint64_t)(uintptr_t)curr_desc &
            TAVOR_WQE_NDA_MASK) << 32;
        next = next | ((uint64_t)nopcode << 32);
        fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
        if (fence) {
            next = next | TAVOR_WQE_SEND_FENCE_MASK;
        }
        next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);

        /*
         * If a send queue doorbell will be rung for the next
         * WQE on the chain, then set the current WQE's "dbd" bit.
         * Note: We also update the "dbinfo" structure here to pass
         * back information about what should (later) be included
         * in the send queue doorbell.
         */
        if (dbinfo) {
            next = next | TAVOR_WQE_DBD_MASK;
            dbinfo->db_nopcode = nopcode;
            dbinfo->db_fence = fence;
        }
    } else {
        next = 0;
    }

    /*
     * If this WQE is supposed to be linked to the previous descriptor,
     * then we need to update not only the previous WQE's "next" fields
     * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
     * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
     * the "e" bit is always hardcoded to zero.
     */
    if (prev_desc != NULL) {
        /*
         * If a send queue doorbell will be rung for the next WQE on
         * the chain, then update the current WQE's "next" field and
         * return.
         * Note: We don't want to modify the "ctrl" field here because
         * that portion of the previous WQE has already been set
         * correctly at some previous point in time.
         */
        if (dbinfo) {
            TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
            return;
        }

        ctrl = 0;

        /* Set the "c" (i.e. "signaled") bit appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
            ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
        }

        /* Set the "s" (i.e. "solicited") bit appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
            ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
        }

        /* Set the "i" bit and the immediate data appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
            ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
            ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
        }

        TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
    }
}
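
/*
 * (Illustration, not in the original source) tavor_wqe_send_linknext()
 * is called in two distinct ways by tavor_post_send():
 *
 *   - dbinfo == NULL: a WQE is being appended within a chain, so the
 *     previous WQE gets both its "next" word and its "ctrl" word
 *     (signaled/solicited/immediate bits) written via
 *     TAVOR_WQE_LINKNEXT().
 *
 *   - dbinfo != NULL: the completed chain is being hooked onto the
 *     last WQE of the previous posting.  Only that WQE's "next" word
 *     may be rewritten (TAVOR_WQE_LINKFIRST()), since its "ctrl" word
 *     was finalized when it was originally posted; the opcode and
 *     fence bit are instead handed back through "dbinfo" for use in
 *     the doorbell.
 */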
"solicited") bit appropriately */ 1431 if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) { 1432 ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK; 1433 } 1434 1435 /* Set the "i" bit and the immediate data appropriately */ 1436 if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) { 1437 ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK; 1438 ctrl = ctrl | tavor_wr_get_immediate(prev_wr); 1439 } 1440 1441 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next); 1442 } 1443 } 1444 1445 1446 /* 1447 * tavor_wqe_mlx_build() 1448 * Context: Can be called from interrupt or base context. 1449 */ 1450 static int 1451 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp, 1452 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size) 1453 { 1454 tavor_hw_udav_t udav; 1455 tavor_ahhdl_t ah; 1456 ib_lrh_hdr_t *lrh; 1457 ib_grh_t *grh; 1458 ib_bth_hdr_t *bth; 1459 ib_deth_hdr_t *deth; 1460 tavor_hw_wqe_sgl_t *ds; 1461 ibt_wr_ds_t *sgl; 1462 uint8_t *mgmtclass, *hpoint, *hcount; 1463 uint64_t data; 1464 uint32_t nds, offset, pktlen; 1465 uint32_t desc_sz, udav_sz; 1466 int i, num_ds; 1467 1468 TAVOR_TNF_ENTER(tavor_wqe_mlx_build); 1469 1470 ASSERT(MUTEX_HELD(&qp->qp_lock)); 1471 1472 /* Initialize the information for the Data Segments */ 1473 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc + 1474 sizeof (tavor_hw_mlx_wqe_nextctrl_t)); 1475 1476 /* 1477 * Pull the address handle from the work request and read in 1478 * the contents of the UDAV. This will be used to answer some 1479 * questions about the request. 1480 */ 1481 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah; 1482 if (ah == NULL) { 1483 TNF_PROBE_0(tavor_wqe_mlx_build_invahhdl_fail, 1484 TAVOR_TNF_ERROR, ""); 1485 TAVOR_TNF_EXIT(tavor_wqe_mlx_build); 1486 return (IBT_AH_HDL_INVALID); 1487 } 1488 mutex_enter(&ah->ah_lock); 1489 udav_sz = sizeof (tavor_hw_udav_t) >> 3; 1490 for (i = 0; i < udav_sz; i++) { 1491 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl, 1492 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i)); 1493 ((uint64_t *)&udav)[i] = data; 1494 } 1495 mutex_exit(&ah->ah_lock); 1496 1497 /* 1498 * If the request is for QP1 and the destination LID is equal to 1499 * the Permissive LID, then return an error. This combination is 1500 * not allowed 1501 */ 1502 if ((udav.rlid == IB_LID_PERMISSIVE) && 1503 (qp->qp_is_special == TAVOR_QP_GSI)) { 1504 TNF_PROBE_0(tavor_wqe_mlx_build_permissiveLIDonQP1_fail, 1505 TAVOR_TNF_ERROR, ""); 1506 TAVOR_TNF_EXIT(tavor_wqe_mlx_build); 1507 return (IBT_AH_HDL_INVALID); 1508 } 1509 1510 /* 1511 * Calculate the size of the packet headers, including the GRH 1512 * (if necessary) 1513 */ 1514 desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) + 1515 sizeof (ib_deth_hdr_t); 1516 if (udav.grh) { 1517 desc_sz += sizeof (ib_grh_t); 1518 } 1519 1520 /* 1521 * Begin to build the first "inline" data segment for the packet 1522 * headers. Note: By specifying "inline" we can build the contents 1523 * of the MAD packet headers directly into the work queue (as part 1524 * descriptor). This has the advantage of both speeding things up 1525 * and of not requiring the driver to allocate/register any additional 1526 * memory for the packet headers. 1527 */ 1528 TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz); 1529 desc_sz += 4; 1530 1531 /* 1532 * Build Local Route Header (LRH) 1533 * We start here by building the LRH into a temporary location. 1534 * When we have finished we copy the LRH data into the descriptor. 1535 * 1536 * Notice that the VL values are hardcoded. 
     *    because VL15 is decided later based on the value in the MLX
     *    transport "next/ctrl" header (see the "vl15" bit below), and it
     *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
     *    values.  This rule does not hold for loopback packets however
     *    (all of which bypass the SL-to-VL tables) and it is the reason
     *    that non-QP0 MADs are set up with VL hardcoded to zero below.
     *
     *    Notice also that the Source LID is hardcoded to the Permissive
     *    LID (0xFFFF).  This is also not a problem because if the
     *    Destination LID is not the Permissive LID, then the "slr" value
     *    in the MLX transport "next/ctrl" header will be set to zero and
     *    the hardware will pull the LID from the value in the port.
     */
    lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
    pktlen = (desc_sz + 0x100) >> 2;
    TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);

    /*
     * Build Global Route Header (GRH)
     *    This is only built if necessary as defined by the "grh" bit in
     *    the address vector.  Note: We also calculate the offset to the
     *    next header (BTH) based on whether or not the "grh" bit is set.
     */
    if (udav.grh) {
        /*
         * If the request is for QP0, then return an error.  The
         * combination of global routing (GRH) and QP0 is not allowed.
         */
        if (qp->qp_is_special == TAVOR_QP_SMI) {
            TNF_PROBE_0(tavor_wqe_mlx_build_GRHonQP0_fail,
                TAVOR_TNF_ERROR, "");
            TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
            return (IBT_AH_HDL_INVALID);
        }
        grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
        TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);

        bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
    } else {
        bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
    }

    /*
     * Build Base Transport Header (BTH)
     *    Notice that the M, PadCnt, and TVer fields are all set
     *    to zero implicitly.  This is true for all Management
     *    Datagrams (MADs), whether GSI or SMI.
     */
    TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);

    /*
     * Build Datagram Extended Transport Header (DETH)
     */
    deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
    TAVOR_WQE_BUILD_MLX_DETH(deth, qp);

    /* Ensure that the Data Segment is aligned on a 16-byte boundary */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
    ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
    nds = wr->wr_nds;
    sgl = wr->wr_sgl;
    num_ds = 0;

    /*
     * Now fill in the Data Segments (SGL) for the MLX WQE based on the
     * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
     * Start by checking for a valid number of SGL entries
     */
    if (nds > qp->qp_sq_sgl) {
        TNF_PROBE_0(tavor_wqe_mlx_build_toomanysgl_fail,
            TAVOR_TNF_ERROR, "");
        TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
        return (IBT_QP_SGL_LEN_INVALID);
    }

    /*
     * For each SGL in the Send Work Request, fill in the MLX WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    mgmtclass = hpoint = hcount = NULL;
    offset = 0;
    for (i = 0; i < nds; i++) {
        if (sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the MLX send WQE, using
         * the information contained in the scatter-gather list of
         * the work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);

        /*
         * Search through the contents of all MADs posted to QP0 to
         * initialize pointers to the places where Directed Route "hop
         * pointer", "hop count", and "mgmtclass" would be.  Tavor
         * needs these updated (i.e. incremented or decremented, as
         * necessary) by software.
         */
        if (qp->qp_is_special == TAVOR_QP_SMI) {

            TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
                offset, sgl[i].ds_va, sgl[i].ds_len);

            TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
                offset, sgl[i].ds_va, sgl[i].ds_len);

            TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
                offset, sgl[i].ds_va, sgl[i].ds_len);

            offset += sgl[i].ds_len;
        }
        num_ds++;
    }

    /*
     * Tavor's Directed Route MADs need to have the "hop pointer"
     * incremented/decremented (as necessary) depending on whether it is
     * currently less than or greater than the "hop count" (i.e. whether
     * the MAD is a request or a response.)
     */
    if (qp->qp_is_special == TAVOR_QP_SMI) {
        TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
            *hpoint, *hcount);
    }

    /*
     * Now fill in the ICRC Data Segment.  This data segment is inlined
     * just like the packet headers above, but it is only four bytes and
     * set to zero (to indicate that we wish the hardware to generate
     * ICRC).
     */
    TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
    num_ds++;

    /* Return the size of descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;

    TAVOR_TNF_EXIT(tavor_wqe_mlx_build);
    return (DDI_SUCCESS);
}
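
/*
 * (Illustration, not in the original source) The MLX WQE built above
 * is laid out in the work queue with the raw IB headers inlined ahead
 * of the payload:
 *
 *   +---------------------+  <- desc
 *   | next/ctrl           |
 *   +---------------------+  <- ds[0], inline segment
 *   | LRH [GRH] BTH DETH  |     (GRH only if udav.grh is set)
 *   +---------------------+  <- rounded up to 16-byte alignment
 *   | data segment(s)     |     (payload SGLs from the work request)
 *   +---------------------+
 *   | inline ICRC = 0     |     (tells hardware to generate the ICRC)
 *   +---------------------+
 *
 * The hardware transmits the packet essentially as constructed here,
 * which is what lets QP0/QP1 traffic carry hand-built headers.
 */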

/*
 * tavor_wqe_mlx_linknext()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
    uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
    tavor_qphdl_t qp)
{
    tavor_hw_udav_t udav;
    tavor_ahhdl_t ah;
    uint64_t next, ctrl, data;
    uint_t nopcode;
    uint_t udav_sz;
    int i;

    /*
     * Calculate the "next" field of the descriptor.  This amounts to
     * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
     * tavor_hw.h for more).  Note: If there is no next descriptor (i.e.
     * if the current descriptor is the last WQE on the chain), then set
     * "next" to zero.
     */
    if (curr_desc != NULL) {
        /*
         * The only valid Tavor WQE "nopcode" for MLX transport
         * requests is the "Send" code.
         */
        nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
        curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
            (uintptr_t)curr_desc - qp->qp_desc_off);
        next = (uint64_t)((uintptr_t)curr_desc &
            TAVOR_WQE_NDA_MASK) << 32;
        next = next | ((uint64_t)nopcode << 32);
        next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);

        /*
         * If a send queue doorbell will be rung for the next
         * WQE on the chain, then set the current WQE's "dbd" bit.
         * Note: We also update the "dbinfo" structure here to pass
         * back information about what should (later) be included
         * in the send queue doorbell.
         */
        if (dbinfo) {
            next = next | TAVOR_WQE_DBD_MASK;
            dbinfo->db_nopcode = nopcode;
            dbinfo->db_fence = 0;
        }
    } else {
        next = 0;
    }

    /*
     * If this WQE is supposed to be linked to the previous descriptor,
     * then we need to update not only the previous WQE's "next" fields
     * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
     * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
     * see tavor_hw.h for more).  Note: the "e" bit and "vcrc" fields are
     * always hardcoded to zero.
     */
    if (prev_desc != NULL) {
        /*
         * If a send queue doorbell will be rung for the next WQE on
         * the chain, then update the current WQE's "next" field and
         * return.
         * Note: We don't want to modify the "ctrl" field here because
         * that portion of the previous WQE has already been set
         * correctly at some previous point in time.
         */
        if (dbinfo) {
            TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
            return;
        }

        /*
         * Pull the address handle from the work request and read in
         * the contents of the UDAV.  This will be used to answer some
         * questions about the request.
         */
        ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
        mutex_enter(&ah->ah_lock);
        udav_sz = sizeof (tavor_hw_udav_t) >> 3;
        for (i = 0; i < udav_sz; i++) {
            data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
                ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
            ((uint64_t *)&udav)[i] = data;
        }
        mutex_exit(&ah->ah_lock);

        ctrl = 0;

        /* Only QP0 uses VL15, otherwise use VL in the packet */
        if (qp->qp_is_special == TAVOR_QP_SMI) {
            ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
        }

        /*
         * The SLR (Source LID Replace) bit determines whether the
         * source LID for an outgoing MLX packet should come from the
         * PortInfo (SLR = 0) or should be left as it is in the
         * descriptor (SLR = 1).  The latter is necessary for packets
         * to be sent with the Permissive LID.
         */
        if (udav.rlid == IB_LID_PERMISSIVE) {
            ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
        }

        /* Fill in the max static rate from the address handle */
        ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
            TAVOR_WQE_MLXHDR_SRATE_SHIFT);

        /* All VL15 (i.e. SMI) traffic is required to use SL 0 */
        if (qp->qp_is_special != TAVOR_QP_SMI) {
            ctrl = ctrl | ((uint64_t)udav.sl <<
                TAVOR_WQE_MLXHDR_SL_SHIFT);
        }

        /* Set the "c" (i.e. "signaled") bit appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
            ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
        }

        /* Fill in the destination LID from the address handle */
        ctrl = ctrl | ((uint64_t)udav.rlid <<
            TAVOR_WQE_MLXHDR_RLID_SHIFT);

        TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
    }
}
1815
1816
1817 /*
1818 * tavor_wqe_recv_build()
1819 * Context: Can be called from interrupt or base context.
1820 */
1821 /* ARGSUSED */
1822 static int
1823 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1824 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1825 {
1826 tavor_hw_wqe_sgl_t *ds;
1827 int i, num_ds;
1828
1829 TAVOR_TNF_ENTER(tavor_wqe_recv_build);
1830
1831 ASSERT(MUTEX_HELD(&qp->qp_lock));
1832
1833 /* Check that work request transport type is valid */
1834 if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1835 (qp->qp_serv_type != TAVOR_QP_RC) &&
1836 (qp->qp_serv_type != TAVOR_QP_UC)) {
1837 TNF_PROBE_0(tavor_wqe_recv_build_inv_servtype_fail,
1838 TAVOR_TNF_ERROR, "");
1839 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1840 return (IBT_QP_SRV_TYPE_INVALID);
1841 }
1842
1843 /* Fill in the Data Segments (SGL) for the Recv WQE */
1844 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1845 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1846 num_ds = 0;
1847
1848 /* Check for valid number of SGL entries */
1849 if (wr->wr_nds > qp->qp_rq_sgl) {
1850 TNF_PROBE_0(tavor_wqe_recv_build_toomanysgl_fail,
1851 TAVOR_TNF_ERROR, "");
1852 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1853 return (IBT_QP_SGL_LEN_INVALID);
1854 }
1855
1856 /*
1857 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1858 * segments. Note: We skip any SGL with zero size because Tavor
1859 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually,
1860 * the encoding for zero means a 2GB transfer. Because of this special
1861 * encoding in the hardware, we mask the requested length with
1862 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1863 * zero).
1864 */
1865 for (i = 0; i < wr->wr_nds; i++) {
1866 if (wr->wr_sgl[i].ds_len == 0) {
1867 continue;
1868 }
1869
1870 /*
1871 * Fill in the Data Segment(s) for the receive WQE, using the
1872 * information contained in the scatter-gather list of the
1873 * work request.
1874 */
1875 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1876 num_ds++;
1877 }
1878
1879 /* Return the size of descriptor (in 16-byte chunks) */
1880 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1881
1882 TAVOR_TNF_EXIT(tavor_wqe_recv_build);
1883 return (DDI_SUCCESS);
1884 }
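/*
 * Illustrative sketch (not compiled into the driver): why zero-length SGL
 * entries are skipped above. On Tavor a "byte_cnt" of zero in a data
 * segment encodes a 2GB transfer, so a zero-length request must never be
 * written into a WQE, and a full 2GB length is masked so that it lands on
 * the special zero encoding. The mask value below illustrates the idea
 * and is an assumption, not necessarily the header's exact definition.
 */
#if 0
#define	EX_BYTE_CNT_MASK	0x7FFFFFFF	/* 2GB == encoded zero */

static uint32_t
ex_encode_byte_cnt(uint32_t ds_len)
{
	ASSERT(ds_len != 0);	/* callers skip zero-length entries */
	return (ds_len & EX_BYTE_CNT_MASK);
}
#endif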
1885
1886
1887 /*
1888 * tavor_wqe_recv_linknext()
1889 * Context: Can be called from interrupt or base context.
1890 */
1891 static void
1892 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1893 uint64_t *prev_desc, tavor_qphdl_t qp)
1894 {
1895 uint64_t next;
1896
1897 /*
1898 * Calculate the "next" field of the descriptor. This amounts to
1899 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1900 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1901 * if the current descriptor is the last WQE on the chain), then set
1902 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
1903 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1904 * In either case, we must add a single bit in the "reserved" field
1905 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
1906 * workaround for a known Tavor errata that can cause Recv WQEs with
1907 * zero in the NDA field to behave improperly.
1908 */
1909 if (curr_desc != NULL) {
1910 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1911 qp->qp_desc_off);
1912 next = (uint64_t)((uintptr_t)curr_desc &
1913 TAVOR_WQE_NDA_MASK) << 32;
1914 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1915 TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1916 } else {
1917 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1918 }
1919
1920 /*
1921 * If this WQE is supposed to be linked to the previous descriptor,
1922 * then we need to update not only the previous WQE's "next" fields
1923 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1924 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
1925 * bits are always hardcoded to zero.
1926 */
1927 if (prev_desc != NULL) {
1928 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1929 }
1930 }
1931
1932
1933 /*
1934 * tavor_wqe_srq_build()
1935 * Context: Can be called from interrupt or base context.
1936 */
1937 /* ARGSUSED */
1938 static int
1939 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1940 ibt_recv_wr_t *wr, uint64_t *desc)
1941 {
1942 tavor_hw_wqe_sgl_t *ds;
1943 ibt_wr_ds_t end_sgl;
1944 int i, num_ds;
1945
1946 TAVOR_TNF_ENTER(tavor_wqe_srq_build);
1947
1948 ASSERT(MUTEX_HELD(&srq->srq_lock));
1949
1950 /* Fill in the Data Segments (SGL) for the Recv WQE */
1951 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1952 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1953 num_ds = 0;
1954
1955 /* Check for valid number of SGL entries */
1956 if (wr->wr_nds > srq->srq_wq_sgl) {
1957 TNF_PROBE_0(tavor_wqe_srq_build_toomanysgl_fail,
1958 TAVOR_TNF_ERROR, "");
1959 TAVOR_TNF_EXIT(tavor_wqe_srq_build);
1960 return (IBT_QP_SGL_LEN_INVALID);
1961 }
1962
1963 /*
1964 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1965 * segments. Note: We skip any SGL with zero size because Tavor
1966 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually,
1967 * the encoding for zero means a 2GB transfer. Because of this special
1968 * encoding in the hardware, we mask the requested length with
1969 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1970 * zero).
1971 */
1972 for (i = 0; i < wr->wr_nds; i++) {
1973 if (wr->wr_sgl[i].ds_len == 0) {
1974 continue;
1975 }
1976
1977 /*
1978 * Fill in the Data Segment(s) for the receive WQE, using the
1979 * information contained in the scatter-gather list of the
1980 * work request.
1981 */
1982 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1983 num_ds++;
1984 }
1985
1986 /*
1987 * For SRQ, if the number of data segments is less than the maximum
1988 * specified at alloc, then we have to fill in a special "key" entry in
1989 * the sgl entry after the last valid one in this post request. We do
1990 * that here.
1991 */
1992 if (num_ds < srq->srq_wq_sgl) {
1993 end_sgl.ds_va = 0;
1994 end_sgl.ds_len = 0;
1995 end_sgl.ds_key = 0x1;
1996 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1997 }
1998
1999 TAVOR_TNF_EXIT(tavor_wqe_srq_build);
2000 return (DDI_SUCCESS);
2001 }
2002
2003
2004 /*
2005 * tavor_wqe_srq_linknext()
2006 * Context: Can be called from interrupt or base context.
2007 */
2008 static void
2009 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
2010 tavor_srqhdl_t srq)
2011 {
2012 uint64_t next;
2013
2014 /*
2015 * Calculate the "next" field of the descriptor. This amounts to
2016 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
2017 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
2018 * if the current descriptor is the last WQE on the chain), then set 2019 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor 2020 * hardware requires the "dbd" bit to be set to one for all Recv WQEs. 2021 * In either case, we must add a single bit in the "reserved" field 2022 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the 2023 * workaround for a known Tavor errata that can cause Recv WQEs with 2024 * zero in the NDA field to behave improperly. 2025 */ 2026 if (curr_desc != NULL) { 2027 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc - 2028 srq->srq_desc_off); 2029 next = (uint64_t)((uintptr_t)curr_desc & 2030 TAVOR_WQE_NDA_MASK) << 32; 2031 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK; 2032 } else { 2033 next = TAVOR_RCV_WQE_NDA0_WA_MASK; 2034 } 2035 2036 /* 2037 * If this WQE is supposed to be linked to the previous descriptor, 2038 * then we need to update not only the previous WQE's "next" fields 2039 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and 2040 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e" 2041 * bits are always hardcoded to zero. 2042 */ 2043 if (prev_desc != NULL) { 2044 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next); 2045 } 2046 } 2047 2048 2049 /* 2050 * tavor_wr_get_immediate() 2051 * Context: Can be called from interrupt or base context. 2052 */ 2053 static uint32_t 2054 tavor_wr_get_immediate(ibt_send_wr_t *wr) 2055 { 2056 /* 2057 * This routine extracts the "immediate data" from the appropriate 2058 * location in the IBTF work request. Because of the way the 2059 * work request structure is defined, the location for this data 2060 * depends on the actual work request operation type. 2061 */ 2062 2063 /* For RDMA Write, test if RC or UC */ 2064 if (wr->wr_opcode == IBT_WRC_RDMAW) { 2065 if (wr->wr_trans == IBT_RC_SRV) { 2066 return (wr->wr.rc.rcwr.rdma.rdma_immed); 2067 } else { /* IBT_UC_SRV */ 2068 return (wr->wr.uc.ucwr.rdma.rdma_immed); 2069 } 2070 } 2071 2072 /* For Send, test if RC, UD, or UC */ 2073 if (wr->wr_opcode == IBT_WRC_SEND) { 2074 if (wr->wr_trans == IBT_RC_SRV) { 2075 return (wr->wr.rc.rcwr.send_immed); 2076 } else if (wr->wr_trans == IBT_UD_SRV) { 2077 return (wr->wr.ud.udwr_immed); 2078 } else { /* IBT_UC_SRV */ 2079 return (wr->wr.uc.ucwr.send_immed); 2080 } 2081 } 2082 2083 /* 2084 * If any other type of request, then immediate is undefined 2085 */ 2086 return (0); 2087 } 2088 2089 2090 /* 2091 * tavor_wqe_sync() 2092 * Context: Can be called from interrupt or base context. 
2093 */ 2094 static void 2095 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to, 2096 uint_t sync_type, uint_t flag) 2097 { 2098 tavor_qphdl_t qp; 2099 tavor_srqhdl_t srq; 2100 uint_t is_sync_req; 2101 uint64_t *wqe_from, *wqe_to, *wqe_base, *wqe_top; 2102 ddi_dma_handle_t dmahdl; 2103 off_t offset; 2104 size_t length; 2105 uint32_t qsize; 2106 int status; 2107 2108 TAVOR_TNF_ENTER(tavor_wqe_sync); 2109 2110 if (sync_type == TAVOR_WR_SRQ) { 2111 srq = (tavor_srqhdl_t)hdl; 2112 is_sync_req = srq->srq_sync; 2113 /* Get the DMA handle from SRQ context */ 2114 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl; 2115 } else { 2116 qp = (tavor_qphdl_t)hdl; 2117 is_sync_req = qp->qp_sync; 2118 /* Get the DMA handle from QP context */ 2119 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl; 2120 } 2121 2122 /* Determine if the work queues need to be synced or not */ 2123 if (is_sync_req == 0) { 2124 TAVOR_TNF_EXIT(tavor_wqe_sync); 2125 return; 2126 } 2127 2128 /* 2129 * Depending on the type of the work queue, we grab information 2130 * about the address ranges we need to DMA sync. 2131 */ 2132 if (sync_type == TAVOR_WR_SEND) { 2133 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from); 2134 wqe_to = TAVOR_QP_SQ_ENTRY(qp, sync_to); 2135 qsize = qp->qp_sq_bufsz; 2136 2137 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0); 2138 wqe_top = TAVOR_QP_SQ_ENTRY(qp, qsize); 2139 } else if (sync_type == TAVOR_WR_RECV) { 2140 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from); 2141 wqe_to = TAVOR_QP_RQ_ENTRY(qp, sync_to); 2142 qsize = qp->qp_rq_bufsz; 2143 2144 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0); 2145 wqe_top = TAVOR_QP_RQ_ENTRY(qp, qsize); 2146 } else { 2147 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from); 2148 wqe_to = TAVOR_SRQ_WQ_ENTRY(srq, sync_to); 2149 qsize = srq->srq_wq_bufsz; 2150 2151 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0); 2152 wqe_top = TAVOR_SRQ_WQ_ENTRY(srq, qsize); 2153 } 2154 2155 /* 2156 * There are two possible cases for the beginning and end of the WQE 2157 * chain we are trying to sync. Either this is the simple case, where 2158 * the end of the chain is below the beginning of the chain, or it is 2159 * the "wrap-around" case, where the end of the chain has wrapped over 2160 * the end of the queue. In the former case, we simply need to 2161 * calculate the span from beginning to end and sync it. In the latter 2162 * case, however, we need to calculate the span from the top of the 2163 * work queue to the end of the chain and sync that, and then we need 2164 * to find the other portion (from beginning of chain to end of queue) 2165 * and sync that as well. Note: if the "top to end" span is actually 2166 * zero length, then we don't do a DMA sync because a zero length DMA 2167 * sync unnecessarily syncs the entire work queue. 
2168 */ 2169 if (wqe_to > wqe_from) { 2170 /* "From Beginning to End" */ 2171 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base); 2172 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from); 2173 2174 status = ddi_dma_sync(dmahdl, offset, length, flag); 2175 if (status != DDI_SUCCESS) { 2176 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, ""); 2177 TAVOR_TNF_EXIT(tavor_wqe_sync); 2178 return; 2179 } 2180 } else { 2181 /* "From Top to End" */ 2182 offset = (off_t)0; 2183 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base); 2184 if (length) { 2185 status = ddi_dma_sync(dmahdl, offset, length, flag); 2186 if (status != DDI_SUCCESS) { 2187 TNF_PROBE_0(tavor_wqe_sync_fail, 2188 TAVOR_TNF_ERROR, ""); 2189 TAVOR_TNF_EXIT(tavor_wqe_sync); 2190 return; 2191 } 2192 } 2193 2194 /* "From Beginning to Bottom" */ 2195 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base); 2196 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from); 2197 status = ddi_dma_sync(dmahdl, offset, length, flag); 2198 if (status != DDI_SUCCESS) { 2199 TNF_PROBE_0(tavor_wqe_sync_fail, TAVOR_TNF_ERROR, ""); 2200 TAVOR_TNF_EXIT(tavor_wqe_sync); 2201 return; 2202 } 2203 } 2204 2205 TAVOR_TNF_EXIT(tavor_wqe_sync); 2206 } 2207 2208 2209 /* 2210 * tavor_wr_bind_check() 2211 * Context: Can be called from interrupt or base context. 2212 */ 2213 static int 2214 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr) 2215 { 2216 ibt_bind_flags_t bind_flags; 2217 uint64_t vaddr, len; 2218 uint64_t reg_start_addr, reg_end_addr; 2219 tavor_mwhdl_t mw; 2220 tavor_mrhdl_t mr; 2221 tavor_rsrc_t *mpt; 2222 uint32_t new_rkey; 2223 2224 TAVOR_TNF_ENTER(tavor_wr_bind_check); 2225 2226 /* Check for a valid Memory Window handle in the WR */ 2227 mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl; 2228 if (mw == NULL) { 2229 TNF_PROBE_0(tavor_wr_bind_check_invmwhdl_fail, 2230 TAVOR_TNF_ERROR, ""); 2231 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2232 return (IBT_MW_HDL_INVALID); 2233 } 2234 2235 /* Check for a valid Memory Region handle in the WR */ 2236 mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl; 2237 if (mr == NULL) { 2238 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl_fail, 2239 TAVOR_TNF_ERROR, ""); 2240 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2241 return (IBT_MR_HDL_INVALID); 2242 } 2243 2244 mutex_enter(&mr->mr_lock); 2245 mutex_enter(&mw->mr_lock); 2246 2247 /* 2248 * Check here to see if the memory region has already been partially 2249 * deregistered as a result of a tavor_umap_umemlock_cb() callback. 2250 * If so, this is an error, return failure. 2251 */ 2252 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) { 2253 mutex_exit(&mr->mr_lock); 2254 mutex_exit(&mw->mr_lock); 2255 TNF_PROBE_0(tavor_wr_bind_check_invmrhdl2_fail, 2256 TAVOR_TNF_ERROR, ""); 2257 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2258 return (IBT_MR_HDL_INVALID); 2259 } 2260 2261 /* Check for a valid Memory Window RKey (i.e. a matching RKey) */ 2262 if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) { 2263 mutex_exit(&mr->mr_lock); 2264 mutex_exit(&mw->mr_lock); 2265 TNF_PROBE_0(tavor_wr_bind_check_invrkey_fail, 2266 TAVOR_TNF_ERROR, ""); 2267 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2268 return (IBT_MR_RKEY_INVALID); 2269 } 2270 2271 /* Check for a valid Memory Region LKey (i.e. 
a matching LKey) */ 2272 if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) { 2273 mutex_exit(&mr->mr_lock); 2274 mutex_exit(&mw->mr_lock); 2275 TNF_PROBE_0(tavor_wr_bind_check_invlkey_fail, 2276 TAVOR_TNF_ERROR, ""); 2277 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2278 return (IBT_MR_LKEY_INVALID); 2279 } 2280 2281 /* 2282 * Now check for valid "vaddr" and "len". Note: We don't check the 2283 * "vaddr" range when "len == 0" (i.e. on unbind operations) 2284 */ 2285 len = wr->wr.rc.rcwr.bind->bind_len; 2286 if (len != 0) { 2287 vaddr = wr->wr.rc.rcwr.bind->bind_va; 2288 reg_start_addr = mr->mr_bindinfo.bi_addr; 2289 reg_end_addr = mr->mr_bindinfo.bi_addr + 2290 (mr->mr_bindinfo.bi_len - 1); 2291 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) { 2292 mutex_exit(&mr->mr_lock); 2293 mutex_exit(&mw->mr_lock); 2294 TNF_PROBE_0(tavor_wr_bind_check_inv_vaddr_fail, 2295 TAVOR_TNF_ERROR, ""); 2296 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2297 return (IBT_MR_VA_INVALID); 2298 } 2299 vaddr = (vaddr + len) - 1; 2300 if (vaddr > reg_end_addr) { 2301 mutex_exit(&mr->mr_lock); 2302 mutex_exit(&mw->mr_lock); 2303 TNF_PROBE_0(tavor_wr_bind_check_invlen_fail, 2304 TAVOR_TNF_ERROR, ""); 2305 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2306 return (IBT_MR_LEN_INVALID); 2307 } 2308 } 2309 2310 /* 2311 * Validate the bind access flags. Remote Write and Atomic access for 2312 * the Memory Window require that Local Write access be set in the 2313 * corresponding Memory Region. 2314 */ 2315 bind_flags = wr->wr.rc.rcwr.bind->bind_flags; 2316 if (((bind_flags & IBT_WR_BIND_WRITE) || 2317 (bind_flags & IBT_WR_BIND_ATOMIC)) && 2318 !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) { 2319 mutex_exit(&mr->mr_lock); 2320 mutex_exit(&mw->mr_lock); 2321 TNF_PROBE_0(tavor_wr_bind_check_invflags_fail, 2322 TAVOR_TNF_ERROR, ""); 2323 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2324 return (IBT_MR_ACCESS_REQ_INVALID); 2325 } 2326 2327 /* Calculate the new RKey for the Memory Window */ 2328 mpt = mw->mr_mptrsrcp; 2329 tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey); 2330 2331 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey; 2332 mw->mr_rkey = new_rkey; 2333 2334 mutex_exit(&mr->mr_lock); 2335 mutex_exit(&mw->mr_lock); 2336 TAVOR_TNF_EXIT(tavor_wr_bind_check); 2337 return (DDI_SUCCESS); 2338 } 2339 2340 2341 /* 2342 * tavor_wrid_from_reset_handling() 2343 * Context: Can be called from interrupt or base context. 2344 */ 2345 int 2346 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp) 2347 { 2348 tavor_workq_hdr_t *swq, *rwq; 2349 tavor_wrid_list_hdr_t *s_wridlist, *r_wridlist; 2350 uint_t create_new_swq = 0, create_new_rwq = 0; 2351 uint_t create_wql = 0; 2352 uint_t qp_srq_en; 2353 2354 TAVOR_TNF_ENTER(tavor_wrid_from_reset_handling); 2355 2356 /* 2357 * For each of this QP's Work Queues, make sure we have a (properly 2358 * initialized) Work Request ID list attached to the relevant 2359 * completion queue. Grab the CQ lock(s) before manipulating the 2360 * lists. 2361 */ 2362 tavor_wrid_wqhdr_lock_both(qp); 2363 swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum, 2364 TAVOR_WR_SEND); 2365 if (swq == NULL) { 2366 /* Couldn't find matching work queue header, create it */ 2367 create_new_swq = create_wql = 1; 2368 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl, 2369 qp->qp_qpnum, TAVOR_WR_SEND, create_wql); 2370 if (swq == NULL) { 2371 /* 2372 * If we couldn't find/allocate space for the workq 2373 * header, then drop the lock(s) and return failure. 
2374 */
2375 tavor_wrid_wqhdr_unlock_both(qp);
2376 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2377 TAVOR_TNF_ERROR, "");
2378 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2379 return (ibc_get_ci_failure(0));
2380 }
2381 }
2382 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2383 qp->qp_sq_wqhdr = swq;
2384 swq->wq_size = qp->qp_sq_bufsz;
2385 swq->wq_head = 0;
2386 swq->wq_tail = 0;
2387 swq->wq_full = 0;
2388
2389 /*
2390 * Allocate space for the tavor_wrid_entry_t container
2391 */
2392 s_wridlist = tavor_wrid_get_list(swq->wq_size);
2393 if (s_wridlist == NULL) {
2394 /*
2395 * If we couldn't allocate space for tracking the WRID
2396 * entries, then clean up the workq header from above (if
2397 * necessary, i.e. if we created the workq header). Then
2398 * drop the lock(s) and return failure.
2399 */
2400 if (create_new_swq) {
2401 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2402 }
2403
2404 tavor_wrid_wqhdr_unlock_both(qp);
2405 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2406 TAVOR_TNF_ERROR, "");
2407 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2408 return (ibc_get_ci_failure(0));
2409 }
2410 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2411 s_wridlist->wl_wqhdr = swq;
2412
2413 /* Chain the new WRID list container to the workq hdr list */
2414 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2415 tavor_wrid_wqhdr_add(swq, s_wridlist);
2416 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2417
2418 qp_srq_en = qp->qp_srq_en;
2419
2420 #ifdef __lock_lint
2421 mutex_enter(&qp->qp_srqhdl->srq_lock);
2422 #else
2423 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2424 mutex_enter(&qp->qp_srqhdl->srq_lock);
2425 }
2426 #endif
2427 /*
2428 * Now we repeat all the above operations for the receive work queue,
2429 * or shared receive work queue.
2430 *
2431 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2432 */
2433 rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2434 TAVOR_WR_RECV);
2435 if (rwq == NULL) {
2436 create_new_rwq = create_wql = 1;
2437
2438 /*
2439 * If this QP is associated with an SRQ, and this isn't the
2440 * first QP on the SRQ, then the 'srq_wrid_wql' will already
2441 * have been created. Since the WQL is created at
2442 * 'wqhdr_create' time, we pass the 'create_wql' flag in as 0
2443 * here if it has already been created. Later on below, we set
2444 * up the WQL and rwq information based on the existing SRQ info.
2445 */
2446 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2447 qp->qp_srqhdl->srq_wrid_wql != NULL) {
2448 create_wql = 0;
2449 }
2450
2451 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2452 qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2453 if (rwq == NULL) {
2454 /*
2455 * If we couldn't find/allocate space for the workq
2456 * header, then free all the send queue resources we
2457 * just allocated and set up (above), drop the lock(s)
2458 * and return failure.
2459 */
2460 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2461 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2462 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2463 if (create_new_swq) {
2464 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2465 swq);
2466 }
2467
2468 #ifdef __lock_lint
2469 mutex_exit(&qp->qp_srqhdl->srq_lock);
2470 #else
2471 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2472 mutex_exit(&qp->qp_srqhdl->srq_lock);
2473 }
2474 #endif
2475
2476 tavor_wrid_wqhdr_unlock_both(qp);
2477 TNF_PROBE_0(tavor_wrid_from_reset_handling_wqhdr_fail,
2478 TAVOR_TNF_ERROR, "");
2479 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2480 return (ibc_get_ci_failure(0));
2481 }
2482 }
2483 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2484
2485 /*
2486 * Set up the receive workq hdr.
2487 *
2488 * If the QP is on an SRQ, we set up the SRQ-specific fields:
2489 * keeping a copy of the rwq pointer, setting the rwq bufsize
2490 * appropriately, and initializing our part of the WQLock.
2491 *
2492 * In the normal QP case, the QP recv queue bufsize is used.
2493 */
2494 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2495 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2496 if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2497 qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2498 } else {
2499 rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2500 }
2501 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2502
2503 } else {
2504 rwq->wq_size = qp->qp_rq_bufsz;
2505 }
2506
2507 qp->qp_rq_wqhdr = rwq;
2508 rwq->wq_head = 0;
2509 rwq->wq_tail = 0;
2510 rwq->wq_full = 0;
2511
2512 /*
2513 * Allocate space for the tavor_wrid_entry_t container.
2514 *
2515 * If the QP is on an SRQ, and the srq_wridlist is NULL, then we must
2516 * allocate the wridlist normally. However, if the srq_wridlist is
2517 * not NULL, then we know this SRQ has already been initialized; thus
2518 * the wridlist has already been initialized. So we re-use the
2519 * srq_wridlist as the r_wridlist for this QP in this case.
2520 */
2521 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2522 qp->qp_srqhdl->srq_wridlist != NULL) {
2523 /* Use existing srq_wridlist pointer */
2524 r_wridlist = qp->qp_srqhdl->srq_wridlist;
2525 ASSERT(r_wridlist != NULL);
2526 } else {
2527 /* Allocate memory for the r_wridlist */
2528 r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2529 }
2530
2531 /*
2532 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2533 * is mistakenly NULL), we clean up our previous swq allocation from
2534 * above.
2535 */
2536 if (r_wridlist == NULL) {
2537 /*
2538 * If we couldn't allocate space for tracking the WRID
2539 * entries, then clean up all the state from above. Then
2540 * drop the lock(s) and return failure.
2541 */
2542 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2543 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2544 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2545 if (create_new_swq) {
2546 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2547 }
2548 if (create_new_rwq) {
2549 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2550 }
2551
2552 #ifdef __lock_lint
2553 mutex_exit(&qp->qp_srqhdl->srq_lock);
2554 #else
2555 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2556 mutex_exit(&qp->qp_srqhdl->srq_lock);
2557 }
2558 #endif
2559
2560 tavor_wrid_wqhdr_unlock_both(qp);
2561 TNF_PROBE_0(tavor_wrid_from_reset_handling_wridlist_fail,
2562 TAVOR_TNF_ERROR, "");
2563 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2564 return (ibc_get_ci_failure(0));
2565 }
2566 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2567
2568 /*
2569 * Initialize the wridlist.
2570 *
2571 * In the normal QP case, there is no special initialization needed.
2572 * We simply set up the wridlist backpointer to be the receive wqhdr
2573 * (rwq).
2574 *
2575 * But in the SRQ case, no backpointer to the wqhdr is possible.
2576 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2577 * and thus potentially shared across multiple QPs with the SRQ. We
2578 * also set up the srq_wridlist pointer to be the r_wridlist, and
2579 * initialize the freelist to an invalid index. This srq_wridlist
2580 * pointer is checked (above) on future moves from reset to let us
2581 * know that the srq_wridlist has already been initialized.
2582 *
2583 * And finally, if we are in a non-UMAP case, we set up the srq wrid
2584 * free list.
2585 */
2586 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2587 qp->qp_srqhdl->srq_wridlist == NULL) {
2588 r_wridlist->wl_srq_en = 1;
2589 r_wridlist->wl_free_list_indx = -1;
2590 qp->qp_srqhdl->srq_wridlist = r_wridlist;
2591
2592 /* Initialize srq wrid free list */
2593 if (qp->qp_srqhdl->srq_is_umap == 0) {
2594 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2595 tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2596 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2597 }
2598 } else {
2599 r_wridlist->wl_wqhdr = rwq;
2600 }
2601
2602 /* Chain the WRID list "container" to the workq hdr list */
2603 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2604 tavor_wrid_wqhdr_add(rwq, r_wridlist);
2605 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2606
2607 #ifdef __lock_lint
2608 mutex_exit(&qp->qp_srqhdl->srq_lock);
2609 #else
2610 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2611 mutex_exit(&qp->qp_srqhdl->srq_lock);
2612 }
2613 #endif
2614
2615 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2616 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2617 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2618 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2619
2620 tavor_wrid_wqhdr_unlock_both(qp);
2621 TAVOR_TNF_EXIT(tavor_wrid_from_reset_handling);
2622 return (DDI_SUCCESS);
2623 }
2624
2625
2626 /*
2627 * tavor_wrid_to_reset_handling()
2628 * Context: Can be called from interrupt or base context.
2629 */
2630 void
2631 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2632 {
2633 uint_t free_wqhdr = 0;
2634
2635 TAVOR_TNF_ENTER(tavor_wrid_to_reset_handling);
2636
2637 /*
2638 * For each of this QP's Work Queues, move the WRID "container" to
2639 * the "reapable" list. Although there may still be unpolled
2640 * entries in these containers, it is not a big deal. We will not
2641 * reap the list until either the Poll CQ command detects an empty
2642 * condition or the CQ itself is freed. Grab the CQ lock(s) before
2643 * manipulating the lists.
2644 */
2645 mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2646 tavor_wrid_wqhdr_lock_both(qp);
2647 tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2648
2649 /*
2650 * Add the receive work queue header on to the reaplist. But if we
2651 * are on an SRQ, then don't add anything to the reaplist. Instead
2652 * we flush the SRQ entries on the CQ, remove the wridlist from the
2653 * WQHDR, and free the WQHDR (if needed). We must hold the WQL for
2654 * these operations, yet the call to tavor_cq_wqhdr_remove grabs the
2655 * WQL internally. So we drop the WQL before that call. Then release
2656 * the CQ WQHDR locks and the CQ lock and return.
2657 */
2658 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2659
2660 /*
2661 * Pull off all (if any) entries for this QP from CQ. This
2662 * only includes entries that have not yet been polled.
2663 */
2664 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2665 tavor_cq_srq_entries_flush(state, qp);
2666
2667 /* Remove wridlist from WQHDR */
2668 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2669 qp->qp_rq_wqhdr->wq_wrid_post);
2670
2671 /* If wridlist chain is now empty, remove the wqhdr as well */
2672 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2673 free_wqhdr = 1;
2674 } else {
2675 free_wqhdr = 0;
2676 }
2677
2678 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2679
2680 /* Free the WQHDR */
2681 if (free_wqhdr) {
2682 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2683 }
2684 } else {
2685 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2686 }
2687 tavor_wrid_wqhdr_unlock_both(qp);
2688 mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2689
2690 TAVOR_TNF_EXIT(tavor_wrid_to_reset_handling);
2691 }
2692
2693
2694 /*
2695 * tavor_wrid_add_entry()
2696 * Context: Can be called from interrupt or base context.
2697 */
2698 void
2699 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2700 uint_t signaled_dbd)
2701 {
2702 tavor_wrid_entry_t *wre_tmp;
2703 uint32_t head, tail, size;
2704
2705 TAVOR_TNF_ENTER(tavor_wrid_add_entry);
2706
2707 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2708
2709 /*
2710 * Find the entry in the container pointed to by the "tail" index.
2711 * Add all of the relevant information to that entry, including WRID,
2712 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2713 * and/or doorbelled.
2714 */
2715 head = wq->wq_wrid_post->wl_head;
2716 tail = wq->wq_wrid_post->wl_tail;
2717 size = wq->wq_wrid_post->wl_size;
2718 wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2719 wre_tmp->wr_wrid = wrid;
2720 wre_tmp->wr_wqeaddrsz = wqeaddrsz;
2721 wre_tmp->wr_signaled_dbd = signaled_dbd;
2722
2723 /*
2724 * Update the "wrid_old_tail" pointer to point to the entry we just
2725 * inserted into the queue. By tracking this pointer (the pointer to
2726 * the most recently inserted entry) it will be possible later in the
2727 * PostSend() and PostRecv() code paths to find the entry that needs
2728 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2729 * tavor_post_send()).
2730 */
2731 wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2732
2733 /* Update the tail index */
2734 tail = ((tail + 1) & (size - 1));
2735 wq->wq_wrid_post->wl_tail = tail;
2736
2737 /*
2738 * If the "tail" index has just wrapped over into the "head" index,
2739 * then we have filled the container. We use the "full" flag to
2740 * indicate this condition and to distinguish it from the "empty"
2741 * condition (where head and tail are also equal).
2742 */
2743 if (head == tail) {
2744 wq->wq_wrid_post->wl_full = 1;
2745 }
2746 TAVOR_TNF_EXIT(tavor_wrid_add_entry);
2747 }
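/*
 * Illustrative sketch (not compiled into the driver): the WRID containers
 * above are power-of-two rings in which "head == tail" is ambiguous, so an
 * explicit "full" flag distinguishes a full ring from an empty one, exactly
 * as tavor_wrid_add_entry() does. The types and names below are simplified
 * stand-ins for exposition only.
 */
#if 0
typedef struct {
	uint32_t	head;	/* next entry to consume */
	uint32_t	tail;	/* next entry to fill */
	uint32_t	size;	/* must be a power of two */
	int		full;
} ex_ring_t;

static void
ex_ring_push(ex_ring_t *r)
{
	r->tail = (r->tail + 1) & (r->size - 1);
	if (r->tail == r->head)
		r->full = 1;	/* tail wrapped onto head: ring is full */
}

static int
ex_ring_empty(ex_ring_t *r)
{
	/* head == tail means empty only while the full flag is clear */
	return (r->head == r->tail && !r->full);
}
#endif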
2748
2749 /*
2750 * tavor_wrid_add_entry_srq()
2751 * Context: Can be called from interrupt or base context.
2752 */
2753 void
2754 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2755 {
2756 tavor_wrid_entry_t *wre;
2757 uint64_t *wl_wqe;
2758 uint32_t wqe_index;
2759
2760 TAVOR_TNF_ENTER(tavor_wrid_add_entry_srq);
2761
2762 /*
2763 * Find the next available WQE from the SRQ free_list. Then update the
2764 * free_list to point to the next entry.
2765 */
2766 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2767
2768 wqe_index = srq->srq_wridlist->wl_free_list_indx;
2769
2770 /* ASSERT on impossible wqe_index values */
2771 ASSERT(wqe_index < srq->srq_wq_bufsz);
2772
2773 /*
2774 * Set up the WRE.
2775 *
2776 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2777 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2778 * this information and associate the WRID to the WQE found on the CQE.
2779 */
2780 wre = &srq->srq_wridlist->wl_wre[wqe_index];
2781 wre->wr_wrid = wrid;
2782 wre->wr_signaled_dbd = signaled_dbd;
2783
2784 /* Update the free list index */
2785 srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2786 srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2787
2788 TAVOR_TNF_EXIT(tavor_wrid_add_entry_srq);
2789 }
2790
2791
2792 /*
2793 * tavor_wrid_get_entry()
2794 * Context: Can be called from interrupt or base context.
2795 */
2796 uint64_t
2797 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2798 tavor_wrid_entry_t *wre)
2799 {
2800 tavor_workq_hdr_t *wq;
2801 tavor_wrid_entry_t *wre_tmp;
2802 uint64_t wrid;
2803 uint_t send_or_recv, qpnum, error, opcode;
2804
2805 TAVOR_TNF_ENTER(tavor_wrid_get_entry);
2806
2807 /* Lock the list of work queues associated with this CQ */
2808 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2809
2810 /*
2811 * Determine whether this CQE is a send or receive completion (and
2812 * whether it was a "successful" completion or not).
2813 */
2814 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2815 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2816 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2817 error = 1;
2818 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2819 TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2820 } else {
2821 error = 0;
2822 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2823 }
2824
2825 /* Find the work queue for this QP number (send or receive side) */
2826 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2827 wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2828 ASSERT(wq != NULL);
2829
2830 /*
2831 * Regardless of whether the completion is the result of a "success"
2832 * or a "failure", we lock the list of "containers" and attempt to
2833 * search for the first matching completion (i.e. the first WR
2834 * with a matching WQE addr and size). Once we find it, we pull out
2835 * the "wrid" field and return it (see below). Note: One possible
2836 * future enhancement would be to enable this routine to skip over
2837 * any "unsignaled" completions to go directly to the next "signaled"
2838 * entry on success. XXX
2839 */
2840 mutex_enter(&wq->wq_wrid_wql->wql_lock);
2841 wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2842
2843 /*
2844 * If this is a "successful" completion, then we assert that this
2845 * completion must be a "signaled" completion.
2846 */ 2847 ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED)); 2848 2849 /* 2850 * If the completion is a "failed" completion, then we save away the 2851 * contents of the entry (into the "wre" field passed in) for use 2852 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz() 2853 * function to grab "wqeaddrsz" from the next entry in the container. 2854 * This is required for error processing (where updating these fields 2855 * properly is necessary to correct handling of the "error" CQE) 2856 */ 2857 if (error && (wre != NULL)) { 2858 *wre = *wre_tmp; 2859 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq); 2860 } 2861 2862 /* Pull out the WRID and return it */ 2863 wrid = wre_tmp->wr_wrid; 2864 2865 mutex_exit(&wq->wq_wrid_wql->wql_lock); 2866 mutex_exit(&cq->cq_wrid_wqhdr_lock); 2867 2868 TAVOR_TNF_EXIT(tavor_wrid_get_entry); 2869 return (wrid); 2870 } 2871 2872 2873 /* 2874 * tavor_wrid_find_match() 2875 * Context: Can be called from interrupt or base context. 2876 */ 2877 static tavor_wrid_entry_t * 2878 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq, 2879 tavor_hw_cqe_t *cqe) 2880 { 2881 tavor_wrid_entry_t *curr = NULL; 2882 tavor_wrid_list_hdr_t *container; 2883 uint32_t wqeaddr_size; 2884 uint32_t head, tail, size; 2885 int found = 0, last_container; 2886 2887 TAVOR_TNF_ENTER(tavor_wrid_find_match); 2888 2889 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock)); 2890 2891 /* Pull the "wqeaddrsz" information from the CQE */ 2892 wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe); 2893 2894 /* 2895 * Walk the "containers" list(s), find first WR with a matching WQE 2896 * addr. If the current "container" is not the last one on the list, 2897 * i.e. not the current one to which we are posting new WRID entries, 2898 * then we do not attempt to update the "q_head", "q_tail", and 2899 * "q_full" indicators on the main work queue header. We do, however, 2900 * update the "head" and "full" indicators on the individual containers 2901 * as we go. This is imperative because we need to be able to 2902 * determine when the current container has been emptied (so that we 2903 * can move on to the next container). 2904 */ 2905 container = wq->wq_wrid_poll; 2906 while (container != NULL) { 2907 /* Is this the last/only "container" on the list */ 2908 last_container = (container != wq->wq_wrid_post) ? 0 : 1; 2909 2910 /* 2911 * First check if we are on an SRQ. If so, we grab the entry 2912 * and break out. Since SRQ wridlist's are never added to 2913 * reaplist, they can only be the last container. 2914 */ 2915 if (container->wl_srq_en) { 2916 ASSERT(last_container == 1); 2917 curr = tavor_wrid_find_match_srq(container, cq, cqe); 2918 break; 2919 } 2920 2921 /* 2922 * Grab the current "head", "tail" and "size" fields before 2923 * walking the list in the current container. Note: the "size" 2924 * field here must always be a power-of-2. The "full" 2925 * parameter is checked (and updated) here to distinguish the 2926 * "queue full" condition from "queue empty". 2927 */ 2928 head = container->wl_head; 2929 tail = container->wl_tail; 2930 size = container->wl_size; 2931 while ((head != tail) || (container->wl_full)) { 2932 container->wl_full = 0; 2933 curr = &container->wl_wre[head]; 2934 head = ((head + 1) & (size - 1)); 2935 2936 /* 2937 * If the current entry's "wqeaddrsz" matches the one 2938 * we're searching for, then this must correspond to 2939 * the work request that caused the completion. Set 2940 * the "found" flag and bail out. 
2941 */
2942 if (curr->wr_wqeaddrsz == wqeaddr_size) {
2943 found = 1;
2944 break;
2945 }
2946 }
2947
2948 /*
2949 * If the current container is empty (having reached here the
2950 * "head == tail" condition can only mean that the container
2951 * is empty), then NULL out the "wrid_old_tail" field (see
2952 * tavor_post_send() and tavor_post_recv() for more details)
2953 * and (potentially) remove the current container from future
2954 * searches.
2955 */
2956 if (head == tail) {
2957
2958 container->wl_wre_old_tail = NULL;
2959 /*
2960 * If this wasn't the last "container" on the chain,
2961 * i.e. the one to which new WRID entries will be
2962 * added, then remove it from the list.
2963 * Note: we don't "lose" the memory pointed to by this
2964 * container because we should have already put it
2965 * on the "reapable" list (from where it will later be
2966 * pulled).
2967 */
2968 if (!last_container) {
2969 wq->wq_wrid_poll = container->wl_next;
2970 }
2971 }
2972
2973 /* Update the head index for the container */
2974 container->wl_head = head;
2975
2976 /*
2977 * If the entry was found in this container, then go ahead and
2978 * bail out. Else reset the "curr" pointer and move on to the
2979 * next container (if there is one). Note: the only real
2980 * reason for setting "curr = NULL" here is so that the ASSERT
2981 * below can catch the case where no matching entry was found
2982 * on any of the lists.
2983 */
2984 if (found) {
2985 break;
2986 } else {
2987 curr = NULL;
2988 container = container->wl_next;
2989 }
2990 }
2991
2992 /*
2993 * Update work queue header's "head" and "full" conditions to match
2994 * the last entry on the container list. (Note: Only if we're pulling
2995 * entries from the last work queue portion of the list, i.e. not from
2996 * the previous portions that may be the "reapable" list.)
2997 */
2998 if (last_container) {
2999 wq->wq_head = wq->wq_wrid_post->wl_head;
3000 wq->wq_full = wq->wq_wrid_post->wl_full;
3001 }
3002
3003 /* Ensure that we've actually found what we were searching for */
3004 ASSERT(curr != NULL);
3005
3006 TAVOR_TNF_EXIT(tavor_wrid_find_match);
3007 return (curr);
3008 }
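/*
 * Illustrative sketch (not compiled into the driver): how a WQE index is
 * recovered from a WQE address when every entry has a power-of-two size,
 * as TAVOR_SRQ_WQE_INDEX does in tavor_wrid_find_match_srq() below. A
 * log2 entry size turns the division into a simple shift. The helper name
 * is hypothetical.
 */
#if 0
static uint32_t
ex_wqe_index(uintptr_t wq_base, uintptr_t wqe_addr, uint32_t log_wqesz)
{
	/* byte offset into the queue, divided by the entry size */
	return ((uint32_t)((wqe_addr - wq_base) >> log_wqesz));
}
#endif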
3009
3010
3011 /*
3012 * tavor_wrid_find_match_srq()
3013 * Context: Can be called from interrupt or base context.
3014 */
3015 tavor_wrid_entry_t *
3016 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
3017 tavor_hw_cqe_t *cqe)
3018 {
3019 tavor_wrid_entry_t *wre;
3020 uint64_t *wl_wqe;
3021 uint32_t wqe_index;
3022 uint64_t wqe_addr;
3023 uint32_t cqe_wqe_addr;
3024
3025 /* Grab the WQE addr out of the CQE */
3026 cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
3027
3028 /*
3029 * Use the WQE addr as the lower 32 bits; we add back the
3030 * 'wl_srq_desc_off' because we have a zero-based queue. Then OR'ing
3031 * on the upper 32 bits of the 'wl_srq_wq_buf' gives us the WQE addr
3032 * in the SRQ Work Queue itself. We use this address to find
3033 * out which Work Queue Entry this CQE corresponds with.
3034 *
3035 * We also use this address below to add the WQE back on to the free
3036 * list.
3037 */
3038 wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
3039 (cqe_wqe_addr + wl->wl_srq_desc_off);
3040
3041 /*
3042 * Given the 'wqe_addr' just calculated and the srq buf address, we
3043 * find the 'wqe_index'. The 'wre' returned below contains the WRID
3044 * that we are looking for. This indexes into the wre_list for this
3045 * specific WQE.
3046 */
3047 wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
3048 wl->wl_srq_log_wqesz);
3049
3050 /* ASSERT on impossible wqe_index values */
3051 ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
3052
3053 /* Get the pointer to this WQE */
3054 wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
3055
3056 /* Put this WQE index back on the free list */
3057 ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
3058 wl->wl_free_list_indx = wqe_index;
3059
3060 /* Using the index, return the Work Request ID Entry (wre) */
3061 wre = &wl->wl_wre[wqe_index];
3062
3063 return (wre);
3064 }
3065
3066
3067 /*
3068 * tavor_wrid_cq_reap()
3069 * Context: Can be called from interrupt or base context.
3070 */
3071 void
3072 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
3073 {
3074 tavor_workq_hdr_t *consume_wqhdr;
3075 tavor_wrid_list_hdr_t *container, *to_free;
3076
3077 ASSERT(MUTEX_HELD(&cq->cq_lock));
3078
3079 TAVOR_TNF_ENTER(tavor_wrid_cq_reap);
3080
3081 /* Lock the list of work queues associated with this CQ */
3082 mutex_enter(&cq->cq_wrid_wqhdr_lock);
3083
3084 /* Walk the "reapable" list and free up containers */
3085 container = cq->cq_wrid_reap_head;
3086 while (container != NULL) {
3087 to_free = container;
3088 container = container->wl_reap_next;
3089 /*
3090 * If reaping the WRID list containers pulls the last
3091 * container from the given work queue header, then we free
3092 * the work queue header as well.
3093 */
3094 consume_wqhdr = tavor_wrid_list_reap(to_free);
3095 if (consume_wqhdr != NULL) {
3096 tavor_cq_wqhdr_remove(cq, consume_wqhdr);
3097 }
3098 }
3099
3100 /* Once finished reaping, we reset the CQ's reap list */
3101 cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
3102
3103 mutex_exit(&cq->cq_wrid_wqhdr_lock);
3104 TAVOR_TNF_EXIT(tavor_wrid_cq_reap);
3105 }
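/*
 * Illustrative sketch (not compiled into the driver): the deferred
 * reclamation pattern behind the "reapable" list above. Containers are
 * unlinked and queued while a lock is held, then freed in one pass, so
 * the expensive frees happen once rather than on every state change.
 * The node type and helper below are simplified stand-ins.
 */
#if 0
typedef struct ex_node {
	struct ex_node	*next;
} ex_node_t;

static void
ex_reap_all(ex_node_t **reap_head)
{
	ex_node_t *node, *to_free;

	node = *reap_head;
	while (node != NULL) {
		to_free = node;
		node = node->next;
		kmem_free(to_free, sizeof (ex_node_t));
	}
	*reap_head = NULL;	/* reset the reap list once drained */
}
#endif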
3106
3107
3108 /*
3109 * tavor_wrid_cq_force_reap()
3110 * Context: Can be called from interrupt or base context.
3111 */
3112 void
3113 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
3114 {
3115 tavor_workq_hdr_t *curr;
3116 tavor_wrid_list_hdr_t *container, *to_free;
3117 avl_tree_t *treep;
3118 void *cookie = NULL;
3119
3120 ASSERT(MUTEX_HELD(&cq->cq_lock));
3121
3122 TAVOR_TNF_ENTER(tavor_wrid_cq_force_reap);
3123
3124 /*
3125 * The first step is to walk the "reapable" list and free up those
3126 * containers. This is necessary because the containers on the
3127 * reapable list are not otherwise connected to the work queue headers
3128 * anymore.
3129 */
3130 tavor_wrid_cq_reap(cq);
3131
3132 /* Now lock the list of work queues associated with this CQ */
3133 mutex_enter(&cq->cq_wrid_wqhdr_lock);
3134
3135 /*
3136 * Walk the list of work queue headers and free up all the WRID list
3137 * containers chained to each. Note: We don't need to grab the locks
3138 * for each of the individual WRID lists here because the only way
3139 * things can be added or removed from the list at this point would be
3140 * through posting a work request to a QP. But if we've come this far,
3141 * then we can be assured that there are no longer any QPs associated
3142 * with the CQ that we are trying to free.
3143 */
3144 #ifdef __lock_lint
3145 tavor_wrid_wqhdr_compare(NULL, NULL);
3146 #endif
3147 treep = &cq->cq_wrid_wqhdr_avl_tree;
3148 while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
3149 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
3150 container = curr->wq_wrid_poll;
3151 while (container != NULL) {
3152 to_free = container;
3153 container = container->wl_next;
3154 /*
3155 * If reaping the WRID list containers pulls the last
3156 * container from the given work queue header, then
3157 * we free the work queue header as well. Note: we
3158 * ignore the return value because we know that the
3159 * work queue header should always be freed once the
3160 * list of containers has come to an end.
3161 */
3162 (void) tavor_wrid_list_reap(to_free);
3163 if (container == NULL) {
3164 tavor_cq_wqhdr_remove(cq, curr);
3165 }
3166 }
3167 }
3168 avl_destroy(treep);
3169
3170 mutex_exit(&cq->cq_wrid_wqhdr_lock);
3171 TAVOR_TNF_EXIT(tavor_wrid_cq_force_reap);
3172 }
3173
3174
3175 /*
3176 * tavor_wrid_get_list()
3177 * Context: Can be called from interrupt or base context.
3178 */
3179 tavor_wrid_list_hdr_t *
3180 tavor_wrid_get_list(uint32_t qsize)
3181 {
3182 tavor_wrid_list_hdr_t *wridlist;
3183 uint32_t size;
3184
3185 /*
3186 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
3187 * which holds the pointers necessary for maintaining the "reapable"
3188 * list, chaining together multiple "containers", old and new, and
3189 * tracking the head, tail, size, etc. for each container.
3190 *
3191 * The "container" also holds all the tavor_wrid_entry_t's, which are
3192 * allocated separately, one for each entry on the corresponding work
3193 * queue.
3194 */
3195 size = sizeof (tavor_wrid_list_hdr_t);
3196
3197 /*
3198 * Note that this allocation has to be a NOSLEEP operation here
3199 * because we are holding the "wqhdr_list_lock" and, therefore,
3200 * could get raised to the interrupt level.
3201 */
3202 wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
3203 if (wridlist == NULL) {
3204 return (NULL);
3205 }
3206 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
3207
3208 /* Complete the "container" initialization */
3209 wridlist->wl_size = qsize;
3210 wridlist->wl_full = 0;
3211 wridlist->wl_head = 0;
3212 wridlist->wl_tail = 0;
3213 wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3214 sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3215 if (wridlist->wl_wre == NULL) {
3216 kmem_free(wridlist, size);
3217 return (NULL);
3218 }
3219 wridlist->wl_wre_old_tail = NULL;
3220 wridlist->wl_reap_next = NULL;
3221 wridlist->wl_next = NULL;
3222 wridlist->wl_prev = NULL;
3223 wridlist->wl_srq_en = 0;
3224
3225 return (wridlist);
3226 }
3227
3228 /*
3229 * tavor_wrid_list_srq_init()
3230 * Context: Can be called from interrupt or base context.
3231 */
3232 void
3233 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3234 uint_t wq_start)
3235 {
3236 uint64_t *wl_wqe;
3237 int wqe_index;
3238
3239 ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3240
3241 /* Set up pointers for use later when we are polling the CQ */
3242 wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3243 wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3244 wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3245 wridlist->wl_srq_desc_off = srq->srq_desc_off;
3246 wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3247
3248 /* Verify that 'wq_start', where buf initialization begins, is sane */
3249 ASSERT(wq_start < srq->srq_wq_bufsz);
3250
3251 /*
3252 * Initialize wridlist free list
3253 *
3254 * For each WQ up to the size of our queue, we store an index in the WQ
3255 * memory itself, representing the next available free entry. The
3256 * 'wl_free_list_indx' always holds the index of the next available
3257 * free entry in the WQ. If 'wl_free_list_indx' is -1, then we are
3258 * completely full. This gives us the advantage of being able to have
3259 * entries complete or be polled off the WQ out-of-order.
3260 *
3261 * For now, we write the free_list entries inside the WQ itself. It
3262 * may be useful in the future to store this information in a separate
3263 * structure for debugging purposes.
3264 */
3265 for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3266 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3267 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3268 wridlist->wl_free_list_indx);
3269 wridlist->wl_free_list_indx = wqe_index;
3270 }
3271 }
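/*
 * Illustrative sketch (not compiled into the driver): the SRQ free list
 * initialized above threads the "next free index" through the free WQEs'
 * own memory, so no separate free-list array is needed and entries can
 * complete out of order. This sketch uses plain memory instead of the
 * driver's DDI access handle; names and the stride parameter are
 * hypothetical.
 */
#if 0
static void
ex_free_list_init(uint32_t *wq, uint32_t nwqe, uint32_t stride_words,
    int32_t *free_indx)
{
	uint32_t i;

	*free_indx = -1;		/* -1 means "list exhausted" */
	for (i = 0; i < nwqe; i++) {
		/* each free WQE stores the previous head of the list */
		wq[i * stride_words] = (uint32_t)*free_indx;
		*free_indx = (int32_t)i;
	}
}
#endif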
3272
3273
3274 /*
3275 * tavor_wrid_reaplist_add()
3276 * Context: Can be called from interrupt or base context.
3277 */
3278 static void
3279 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3280 {
3281 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3282
3283 TAVOR_TNF_ENTER(tavor_wrid_reaplist_add);
3284
3285 mutex_enter(&wq->wq_wrid_wql->wql_lock);
3286
3287 /*
3288 * Add the "post" container (the last one on the current chain) to
3289 * the CQ's "reapable" list.
3290 */
3291 if ((cq->cq_wrid_reap_head == NULL) &&
3292 (cq->cq_wrid_reap_tail == NULL)) {
3293 cq->cq_wrid_reap_head = wq->wq_wrid_post;
3294 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3295 } else {
3296 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3297 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3298 }
3299
3300 mutex_exit(&wq->wq_wrid_wql->wql_lock);
3301 }
3302
3303
3304 int
3305 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3306 {
3307 tavor_workq_compare_t *cmpp;
3308 tavor_workq_hdr_t *curr;
3309
3310 cmpp = (tavor_workq_compare_t *)p1;
3311 curr = (tavor_workq_hdr_t *)p2;
3312
3313 if (cmpp->cmp_qpn < curr->wq_qpn)
3314 return (-1);
3315 else if (cmpp->cmp_qpn > curr->wq_qpn)
3316 return (+1);
3317 else if (cmpp->cmp_type < curr->wq_type)
3318 return (-1);
3319 else if (cmpp->cmp_type > curr->wq_type)
3320 return (+1);
3321 else
3322 return (0);
3323 }
3324
3325
3326 /*
3327 * tavor_wrid_wqhdr_find()
3328 * Context: Can be called from interrupt or base context.
3329 */
3330 static tavor_workq_hdr_t *
3331 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3332 {
3333 tavor_workq_hdr_t *curr;
3334 tavor_workq_compare_t cmp;
3335
3336 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_find);
3337
3338 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3339
3340 /*
3341 * Walk the CQ's work queue list, trying to find a send or recv queue
3342 * with the same QP number. We do this even if we are going to later
3343 * create a new entry because it helps us easily find the end of the
3344 * list.
3345 */
3346 cmp.cmp_qpn = qpn;
3347 cmp.cmp_type = wq_type;
3348 #ifdef __lock_lint
3349 tavor_wrid_wqhdr_compare(NULL, NULL);
3350 #endif
3351 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3352
3353 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_find);
3354 return (curr);
3355 }
3356
3357
3358 /*
3359 * tavor_wrid_wqhdr_create()
3360 * Context: Can be called from interrupt or base context.
3361 */
3362 static tavor_workq_hdr_t *
3363 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3364 uint_t wq_type, uint_t create_wql)
3365 {
3366 tavor_workq_hdr_t *wqhdr_tmp;
3367
3368 TAVOR_TNF_ENTER(tavor_wrid_wqhdr_create);
3369
3370 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3371
3372 /*
3373 * Allocate space for a work queue header structure and initialize it.
3374 * Each work queue header structure includes a "wq_wrid_wql"
3375 * which needs to be initialized. Note that this allocation has to be
3376 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3377 * and, therefore, could get raised to the interrupt level.
3378 */
3379 wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3380 sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3381 if (wqhdr_tmp == NULL) {
3382 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3383 return (NULL);
3384 }
3385 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3386 wqhdr_tmp->wq_qpn = qpn;
3387 wqhdr_tmp->wq_type = wq_type;
3388
3389 if (create_wql) {
3390 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3391 if (wqhdr_tmp->wq_wrid_wql == NULL) {
3392 kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3393 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3394 return (NULL);
3395 }
3396 }
3397
3398 wqhdr_tmp->wq_wrid_poll = NULL;
3399 wqhdr_tmp->wq_wrid_post = NULL;
3400
3401 /* Chain the newly allocated work queue header to the CQ's list */
3402 tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3403
3404 TAVOR_TNF_EXIT(tavor_wrid_wqhdr_create);
3405 return (wqhdr_tmp);
3406 }
3407
3408
3409 /*
3410 * tavor_wrid_wql_create()
3411 * Context: Can be called from interrupt or base context.
3412 */
3413 tavor_wq_lock_t *
3414 tavor_wrid_wql_create(tavor_state_t *state)
3415 {
3416 tavor_wq_lock_t *wql;
3417
3418 TAVOR_TNF_ENTER(tavor_wrid_wql_create);
3419
3420 /*
3421 * Allocate the WQL and initialize it.
3422 */
3423 wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3424 if (wql == NULL) {
3425 TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3426 return (NULL);
3427 }
3428
3429 mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3430 DDI_INTR_PRI(state->ts_intrmsi_pri));
3431
3432 /* Add refcount to WQL */
3433 tavor_wql_refcnt_inc(wql);
3434
3435 TAVOR_TNF_EXIT(tavor_wrid_wql_create);
3436 return (wql);
3437 }
3438
3439
3440 /*
3441 * tavor_wrid_get_wqeaddrsz()
3442 * Context: Can be called from interrupt or base context.
3443 */
3444 static uint32_t
3445 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3446 {
3447 tavor_wrid_entry_t *wre;
3448 uint32_t wqeaddrsz;
3449 uint32_t head;
3450
3451 /*
3452 * If the container is empty, then there is no next entry. So just
3453 * return zero. Note: the "head == tail" condition here can only
3454 * mean that the container is empty because we have previously pulled
3455 * something from the container.
3456 *
3457 * If the container is not empty, then find the next entry and return
3458 * the contents of its "wqeaddrsz" field.
3459 */
3460 if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3461 wqeaddrsz = 0;
3462 } else {
3463 /*
3464 * We don't need to calculate the "next" head pointer here
3465 * because "head" should already point to the next entry on
3466 * the list (since we just pulled something off - in
3467 * tavor_wrid_find_match() - and moved the head index forward.)
3468 */
3469 head = wq->wq_wrid_poll->wl_head;
3470 wre = &wq->wq_wrid_poll->wl_wre[head];
3471 wqeaddrsz = wre->wr_wqeaddrsz;
3472 }
3473 return (wqeaddrsz);
3474 }
3475
3476
3477 /*
3478 * tavor_wrid_wqhdr_add()
3479 * Context: Can be called from interrupt or base context.
3480 */
3481 static void
3482 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3483 tavor_wrid_list_hdr_t *wridlist)
3484 {
3485 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3486
3487 /* Chain the new WRID list "container" to the work queue list */
3488 if ((wqhdr->wq_wrid_post == NULL) &&
3489 (wqhdr->wq_wrid_poll == NULL)) {
3490 wqhdr->wq_wrid_poll = wridlist;
3491 wqhdr->wq_wrid_post = wridlist;
3492 } else {
3493 wqhdr->wq_wrid_post->wl_next = wridlist;
3494 wridlist->wl_prev = wqhdr->wq_wrid_post;
3495 wqhdr->wq_wrid_post = wridlist;
3496 }
3497 }
3498
3499
3500 /*
3501 * tavor_wrid_wqhdr_remove()
3502 * Context: Can be called from interrupt or base context.
3503 *
3504 * Note: this is only called to remove the most recently added WRID list
3505 * container (i.e. in tavor_wrid_from_reset_handling() above).
3506 */
3507 static void
3508 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3509 tavor_wrid_list_hdr_t *wridlist)
3510 {
3511 tavor_wrid_list_hdr_t *prev, *next;
3512
3513 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3514
3515 /* Unlink the WRID list "container" from the work queue list */
3516 prev = wridlist->wl_prev;
3517 next = wridlist->wl_next;
3518 if (prev != NULL) {
3519 prev->wl_next = next;
3520 }
3521 if (next != NULL) {
3522 next->wl_prev = prev;
3523 }
3524
3525 /*
3526 * Update any pointers in the work queue hdr that may point to this
3527 * WRID list container
3528 */
3529 if (wqhdr->wq_wrid_post == wridlist) {
3530 wqhdr->wq_wrid_post = prev;
3531 }
3532 if (wqhdr->wq_wrid_poll == wridlist) {
3533 wqhdr->wq_wrid_poll = NULL;
3534 }
3535 }
3536
3537
3538 /*
3539 * tavor_wrid_list_reap()
3540 * Context: Can be called from interrupt or base context.
3541 * Note: The "wqhdr_list_lock" must be held.
3542 */
3543 static tavor_workq_hdr_t *
3544 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3545 {
3546 tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL;
3547 tavor_wrid_list_hdr_t *prev, *next;
3548 uint32_t size;
3549
3550 TAVOR_TNF_ENTER(tavor_wrid_list_reap);
3551
3552 /* Get the back pointer to the work queue header (see below) */
3553 wqhdr = wridlist->wl_wqhdr;
3554 mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3555
3556 /* Unlink the WRID list "container" from the work queue list */
3557 prev = wridlist->wl_prev;
3558 next = wridlist->wl_next;
3559 if (prev != NULL) {
3560 prev->wl_next = next;
3561 }
3562 if (next != NULL) {
3563 next->wl_prev = prev;
3564 }
3565
3566 /*
3567 * If the back pointer to the work queue header shows that it
3568 * was pointing to the entry we are about to remove, then the work
3569 * queue header is reapable as well.
3570 */
3571 if ((wqhdr->wq_wrid_poll == wridlist) &&
3572 (wqhdr->wq_wrid_post == wridlist)) {
3573 consume_wqhdr = wqhdr;
3574 }
3575
3576 /* Be sure to update the "poll" and "post" container pointers */
3577 if (wqhdr->wq_wrid_poll == wridlist) {
3578 wqhdr->wq_wrid_poll = next;
3579 }
3580 if (wqhdr->wq_wrid_post == wridlist) {
3581 wqhdr->wq_wrid_post = NULL;
3582 }
3583
3584 /* Calculate the size and free the container */
3585 size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3586 kmem_free(wridlist->wl_wre, size);
3587 kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3588
3589 mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3590
3591 TAVOR_TNF_EXIT(tavor_wrid_list_reap);
3592 return (consume_wqhdr);
3593 }
3594
3595
3596 /*
3597 * tavor_wrid_wqhdr_lock_both()
3598 * Context: Can be called from interrupt or base context.


/*
 * tavor_wrid_wqhdr_lock_both()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
{
	tavor_cqhdl_t	sq_cq, rq_cq;

	sq_cq = qp->qp_sq_cqhdl;
	rq_cq = qp->qp_rq_cqhdl;

	_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
	_NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))

	/*
	 * If both work queues (send and recv) share a completion queue, then
	 * grab the common lock.  If they use different CQs (hence different
	 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then
	 * the receive.  We release the locks in the opposite order in
	 * tavor_wrid_wqhdr_unlock_both() below; acquiring and releasing in
	 * this consistent order avoids introducing any kind of deadlock
	 * condition.  Note: we add the "__lock_lint" code here to fake out
	 * warlock into thinking we've grabbed both locks (when, in fact, we
	 * only needed the one).
	 */
	if (sq_cq == rq_cq) {
		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
#ifdef __lock_lint
		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
#endif
	} else {
		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
	}
}

/*
 * tavor_wrid_wqhdr_unlock_both()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
{
	tavor_cqhdl_t	sq_cq, rq_cq;

	sq_cq = qp->qp_sq_cqhdl;
	rq_cq = qp->qp_rq_cqhdl;

	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))

	/*
	 * See tavor_wrid_wqhdr_lock_both() above for more detail
	 */
	if (sq_cq == rq_cq) {
#ifdef __lock_lint
		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
#endif
		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
	} else {
		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
	}
}


/*
 * tavor_cq_wqhdr_add()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
{
	tavor_workq_compare_t	cmp;
	avl_index_t		where;

	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));

	cmp.cmp_qpn = wqhdr->wq_qpn;
	cmp.cmp_type = wqhdr->wq_type;
#ifdef __lock_lint
	tavor_wrid_wqhdr_compare(NULL, NULL);
#endif
	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
	/*
	 * Insert the new work queue header into the CQ's AVL tree of work
	 * queue headers, at the insertion point returned by the avl_find()
	 * lookup above.
	 */
	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
}
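

/*
 * Illustration: the lock/unlock pair above avoids deadlock by imposing a
 * fixed acquisition order (send CQ lock before receive CQ lock) and by
 * handling the case where both work queues share a single CQ.  The
 * disabled sketch below shows the same discipline with ordinary POSIX
 * mutexes; it is a userland example only, not driver code, and the
 * "example_" names are hypothetical.
 */
#if 0
#include <pthread.h>

/* Acquire two locks in a fixed order, tolerating a shared lock */
static void
example_lock_pair(pthread_mutex_t *first, pthread_mutex_t *second)
{
	(void) pthread_mutex_lock(first);
	if (second != first) {
		(void) pthread_mutex_lock(second);
	}
}

/* Release in the opposite order of acquisition */
static void
example_unlock_pair(pthread_mutex_t *first, pthread_mutex_t *second)
{
	if (second != first) {
		(void) pthread_mutex_unlock(second);
	}
	(void) pthread_mutex_unlock(first);
}
#endif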


/*
 * tavor_cq_wqhdr_remove()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
{
	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));

#ifdef __lock_lint
	tavor_wrid_wqhdr_compare(NULL, NULL);
#endif
	/* Remove "wqhdr" from the work queue header list on "cq" */
	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);

	/*
	 * Release the reference to the WQL.  If this is the last reference,
	 * this call also has the side effect of freeing up the "wq_wrid_wql"
	 * memory.
	 */
	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);

	/* Free the memory associated with "wqhdr" */
	kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
}


/*
 * tavor_wql_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
{
	ASSERT(wql != NULL);

	mutex_enter(&wql->wql_lock);
	wql->wql_refcnt++;
	mutex_exit(&wql->wql_lock);
}

/*
 * tavor_wql_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
{
	int	refcnt;

	ASSERT(wql != NULL);

	mutex_enter(&wql->wql_lock);
	wql->wql_refcnt--;
	refcnt = wql->wql_refcnt;
	mutex_exit(&wql->wql_lock);

	/*
	 * Free up the WQL memory if we were the last reference associated
	 * with this structure.
	 */
	if (refcnt == 0) {
		mutex_destroy(&wql->wql_lock);
		kmem_free(wql, sizeof (tavor_wq_lock_t));
	}
}
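

/*
 * Illustration: the reference counting lifecycle implemented by the two
 * routines above.  The creator takes the first reference inside
 * tavor_wrid_wql_create(), each additional sharer of the lock increments
 * it, and the final decrement destroys the mutex and frees the WQL.  The
 * disabled sketch below walks through that lifecycle; the caller name is
 * hypothetical, and a real caller would hold each reference only for as
 * long as the corresponding work queue header exists.
 */
#if 0
static void
example_wql_lifecycle(tavor_state_t *state)
{
	tavor_wq_lock_t	*wql;

	/* Creation returns with wql_refcnt == 1 */
	wql = tavor_wrid_wql_create(state);
	if (wql == NULL) {
		return;
	}

	/* A second work queue header sharing the lock takes a reference */
	tavor_wql_refcnt_inc(wql);		/* wql_refcnt is now 2 */

	/* Each teardown path drops its reference... */
	tavor_wql_refcnt_dec(wql);		/* wql_refcnt is now 1 */

	/* ...and the last decrement destroys the mutex and frees "wql" */
	tavor_wql_refcnt_dec(wql);
}
#endif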