/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_wr.c
 *    Tavor Work Request Processing Routines
 *
 *    Implements all the routines necessary to provide the PostSend(),
 *    PostRecv() and PostSRQ() verbs.  Also contains all the code
 *    necessary to implement the Tavor WRID tracking mechanism.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/avl.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
#pragma inline(tavor_qp_send_doorbell)
static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits);
#pragma inline(tavor_qp_recv_doorbell)
static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
    ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
    uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
    uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
    tavor_qphdl_t qp);
static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
    uint64_t *prev, tavor_qphdl_t qp);
static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint64_t *desc);
static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
    tavor_srqhdl_t srq);
static void tavor_wqe_sync(void *hdl, uint_t sync_from,
    uint_t sync_to, uint_t sync_type, uint_t flag);
static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
    tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
    uint_t send_or_recv);
static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
    tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
    tavor_wrid_list_hdr_t *wrid_list);
static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
    tavor_wrid_list_hdr_t *wrid_list);
static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);

/*
 * tavor_post_send()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
    tavor_sw_wqe_dbinfo_t dbinfo;
    tavor_wrid_list_hdr_t *wridlist;
    tavor_wrid_entry_t *wre_last;
    uint64_t *desc, *prev, *first;
    uint32_t desc_sz, first_sz;
    uint32_t wqeaddrsz, signaled_dbd;
    uint32_t head, tail, next_tail, qsize_msk;
    uint32_t sync_from, sync_to;
    uint_t currindx, wrindx, numremain;
    uint_t chainlen, chainbegin, posted_cnt;
    uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
    int status;

    /*
     * Check for user-mappable QP memory.  Note:  We do not allow kernel
     * clients to post to QP memory that is accessible directly by the
     * user.  If the QP memory is user accessible, then return an error.
     */
    if (qp->qp_is_umap) {
        return (IBT_QP_HDL_INVALID);
    }

    /* Initialize posted_cnt */
    posted_cnt = 0;

    mutex_enter(&qp->qp_lock);

    /*
     * Check QP state.  Can not post Send requests from the "Reset",
     * "Init", or "RTR" states
     */
    if ((qp->qp_state == TAVOR_QP_RESET) ||
        (qp->qp_state == TAVOR_QP_INIT) ||
        (qp->qp_state == TAVOR_QP_RTR)) {
        mutex_exit(&qp->qp_lock);
        return (IBT_QP_STATE_INVALID);
    }

    /* Grab the lock for the WRID list */
    mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
    wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
    tail = qp->qp_sq_wqhdr->wq_tail;
    head = qp->qp_sq_wqhdr->wq_head;

    /*
     * For each ibt_send_wr_t in the wr[] list passed in, parse the
     * request and build a Send WQE.  Note:  Because we are potentially
     * building a chain of WQEs, we want to link them all together.
     * However, we do not want to link the first one to the previous
     * WQE until the entire chain has been linked.  Then in the last
     * step we ring the appropriate doorbell.  Note:  It is possible for
     * more Work Requests to be posted than the HW will support at one
     * shot.  If this happens, we need to be able to post and ring
     * several chains here until the entire request is complete.
     */
    wrindx = 0;
    numremain = num_wr;
    status = DDI_SUCCESS;
    while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
        /*
         * For the first WQE on a new chain we need "prev" to point
         * to the current descriptor.  As we begin to process
         * further, "prev" will be updated to point to the previous
         * WQE on the current chain (see below).
         */
        prev = TAVOR_QP_SQ_ENTRY(qp, tail);

        /*
         * Before we begin, save the current "tail index" for later
         * DMA sync
         */
        sync_from = tail;

        /*
         * Break the request up into chains that are less than or
         * equal to the maximum number of WQEs that can be posted
         * per doorbell ring
         */
        chainlen = (numremain > maxdb) ? maxdb : numremain;
        numremain -= chainlen;
        chainbegin = wrindx;
        for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
            /*
             * Check for "queue full" condition.  If the queue
             * is already full, then no more WQEs can be posted.
             * So break out, ring a doorbell (if necessary) and
             * return an error
             */
            if (qp->qp_sq_wqhdr->wq_full != 0) {
                status = IBT_QP_FULL;
                break;
            }

            /*
             * Increment the "tail index" and check for "queue
             * full" condition.  If we detect that the current
             * work request is going to fill the work queue, then
             * we mark this condition and continue.
             */
            next_tail = (tail + 1) & qsize_msk;
            if (next_tail == head) {
                qp->qp_sq_wqhdr->wq_full = 1;
            }

            /*
             * Get the address of the location where the next
             * Send WQE should be built
             */
            desc = TAVOR_QP_SQ_ENTRY(qp, tail);

            /*
             * Call tavor_wqe_send_build() to build the WQE
             * at the given address.  This routine uses the
             * information in the ibt_send_wr_t list (wr[]) and
             * returns the size of the WQE when it returns.
             */
            status = tavor_wqe_send_build(state, qp,
                &wr[wrindx], desc, &desc_sz);
            if (status != DDI_SUCCESS) {
                break;
            }

            /*
             * Add a WRID entry to the WRID list.  Need to
             * calculate the "wqeaddrsz" and "signaled_dbd"
             * values to pass to tavor_wrid_add_entry()
             */
            wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
                ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
                desc_sz);
            if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
                (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
                signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
            } else {
                signaled_dbd = 0;
            }
            tavor_wrid_add_entry(qp->qp_sq_wqhdr,
                wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);

            /*
             * If this is not the first descriptor on the current
             * chain, then link it to the previous WQE.  Otherwise,
             * save the address and size of this descriptor (in
             * "first" and "first_sz" respectively) and continue.
             * Note: Linking a WQE to the previous one will
             * depend on whether the two WQEs are from "special
             * QPs" (i.e. MLX transport WQEs) or whether they are
             * normal Send WQEs.
             */
            if (currindx != 0) {
                if (qp->qp_is_special) {
                    tavor_wqe_mlx_linknext(&wr[wrindx - 1],
                        desc, desc_sz, prev, NULL, qp);
                } else {
                    tavor_wqe_send_linknext(&wr[wrindx],
                        &wr[wrindx - 1], desc, desc_sz,
                        prev, NULL, qp);
                }
                prev = desc;
            } else {
                first = desc;
                first_sz = desc_sz;
            }

            /*
             * Update the current "tail index" and increment
             * "posted_cnt"
             */
            tail = next_tail;
            posted_cnt++;
        }

        /*
         * If we reach here and there are one or more WQEs which have
         * been successfully chained together, then we need to link
         * the current chain to the previously executing chain of
         * descriptors (if there is one) and ring the doorbell for the
         * send work queue.
         */
        if (currindx != 0) {
            /*
             * Before we link the chain, we need to ensure that the
             * "next" field on the last WQE is set to NULL (to
             * indicate the end of the chain).  Note: Just as it
             * did above, the format for the "next" fields in a
             * given WQE depends on whether the WQE is MLX
             * transport or not.
             */
            if (qp->qp_is_special) {
                tavor_wqe_mlx_linknext(&wr[chainbegin +
                    currindx - 1], NULL, 0, prev, NULL, qp);
            } else {
                tavor_wqe_send_linknext(NULL,
                    &wr[chainbegin + currindx - 1], NULL, 0,
                    prev, NULL, qp);
            }

            /* Save away updated "tail index" for the DMA sync */
            sync_to = tail;

            /* Do a DMA sync for current send WQE(s) */
            tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
                DDI_DMA_SYNC_FORDEV);

            /*
             * Now link the chain to the old chain (if there was
             * one).  Note: we still need to pay attention to
             * whether the QP used MLX transport WQEs or not.
             */
            if (qp->qp_is_special) {
                tavor_wqe_mlx_linknext(NULL, first, first_sz,
                    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
            } else {
                tavor_wqe_send_linknext(&wr[chainbegin], NULL,
                    first, first_sz, qp->qp_sq_lastwqeaddr,
                    &dbinfo, qp);
            }

            /*
             * If there was a valid previous WQE (i.e. non-NULL),
             * then sync it too.  This is because we have updated
             * its "next" fields and we want to ensure that the
             * hardware can see the changes.
             */
            if (qp->qp_sq_lastwqeaddr != NULL) {
                sync_to = sync_from;
                sync_from = (sync_from - 1) & qsize_msk;
                tavor_wqe_sync(qp, sync_from, sync_to,
                    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
            }

            /*
             * Now if the WRID tail entry is non-NULL, then this
             * represents the entry to which we are chaining the
             * new entries.  Since we are going to ring the
             * doorbell for this WQE, we want to set its "dbd" bit.
             *
             * On the other hand, if the tail is NULL, even though
             * we will have rung the doorbell for the previous WQE
             * (for the hardware's sake) it is irrelevant to our
             * purposes (for tracking WRIDs) because we know the
             * request must have already completed.
             */
            wre_last = wridlist->wl_wre_old_tail;
            if (wre_last != NULL) {
                wre_last->wr_signaled_dbd |=
                    TAVOR_WRID_ENTRY_DOORBELLED;
            }

            /* Update some of the state in the QP */
            qp->qp_sq_lastwqeaddr = desc;
            qp->qp_sq_wqhdr->wq_tail = tail;

            /* Ring the doorbell */
            tavor_qp_send_doorbell(state,
                (uint32_t)((uintptr_t)first - qp->qp_desc_off),
                first_sz, qp->qp_qpnum, dbinfo.db_fence,
                dbinfo.db_nopcode);
        }
    }

    /*
     * Update the "num_posted" return value (if necessary).  Then drop
     * the locks and return success.
     */
    if (num_posted != NULL) {
        *num_posted = posted_cnt;
    }

    mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
    mutex_exit(&qp->qp_lock);

    return (status);
}

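/*
 * Usage sketch for the routine above (illustrative only; "mywr" and
 * "nposted" are hypothetical caller variables, not part of this
 * driver).  A kernel client posting two Sends in one call would do
 * roughly:
 *
 *     ibt_send_wr_t   mywr[2];
 *     uint_t          nposted;
 *
 *     ...initialize wr_id, wr_trans, wr_opcode, wr_nds/wr_sgl...
 *     status = tavor_post_send(state, qp, mywr, 2, &nposted);
 *
 * On success "nposted" is 2.  If the queue fills partway through,
 * "nposted" reports how many WQEs actually made it onto the send
 * queue before IBT_QP_FULL was returned.
 */
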
/*
 * tavor_post_recv()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
    uint64_t *desc, *prev, *first;
    uint32_t desc_sz, first_sz;
    uint32_t wqeaddrsz, signaled_dbd;
    uint32_t head, tail, next_tail, qsize_msk;
    uint32_t sync_from, sync_to;
    uint_t currindx, wrindx, numremain;
    uint_t chainlen, posted_cnt;
    uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
    int status;

    /*
     * Check for user-mappable QP memory.  Note:  We do not allow kernel
     * clients to post to QP memory that is accessible directly by the
     * user.  If the QP memory is user accessible, then return an error.
     */
    if (qp->qp_is_umap) {
        return (IBT_QP_HDL_INVALID);
    }

    /* Initialize posted_cnt */
    posted_cnt = 0;

    mutex_enter(&qp->qp_lock);

    /*
     * Check if QP is associated with an SRQ
     */
    if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
        mutex_exit(&qp->qp_lock);
        return (IBT_SRQ_IN_USE);
    }

    /*
     * Check QP state.  Can not post Recv requests from the "Reset" state
     */
    if (qp->qp_state == TAVOR_QP_RESET) {
        mutex_exit(&qp->qp_lock);
        return (IBT_QP_STATE_INVALID);
    }

    /* Grab the lock for the WRID list */
    mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);

    /* Save away some initial QP state */
    qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
    tail = qp->qp_rq_wqhdr->wq_tail;
    head = qp->qp_rq_wqhdr->wq_head;

    /*
     * For each ibt_recv_wr_t in the wr[] list passed in, parse the
     * request and build a Recv WQE.  Note:  Because we are potentially
     * building a chain of WQEs, we want to link them all together.
     * However, we do not want to link the first one to the previous
     * WQE until the entire chain has been linked.  Then in the last
     * step we ring the appropriate doorbell.  Note:  It is possible for
     * more Work Requests to be posted than the HW will support at one
     * shot.  If this happens, we need to be able to post and ring
     * several chains here until the entire request is complete.
     */
    wrindx = 0;
    numremain = num_wr;
    status = DDI_SUCCESS;
    while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
        /*
         * For the first WQE on a new chain we need "prev" to point
         * to the current descriptor.  As we begin to process
         * further, "prev" will be updated to point to the previous
         * WQE on the current chain (see below).
         */
        prev = TAVOR_QP_RQ_ENTRY(qp, tail);

        /*
         * Before we begin, save the current "tail index" for later
         * DMA sync
         */
        sync_from = tail;

        /*
         * Break the request up into chains that are less than or
         * equal to the maximum number of WQEs that can be posted
         * per doorbell ring
         */
        chainlen = (numremain > maxdb) ? maxdb : numremain;
        numremain -= chainlen;
        for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
            /*
             * Check for "queue full" condition.  If the queue
             * is already full, then no more WQEs can be posted.
             * So break out, ring a doorbell (if necessary) and
             * return an error
             */
            if (qp->qp_rq_wqhdr->wq_full != 0) {
                status = IBT_QP_FULL;
                break;
            }

            /*
             * Increment the "tail index" and check for "queue
             * full" condition.  If we detect that the current
             * work request is going to fill the work queue, then
             * we mark this condition and continue.
             */
            next_tail = (tail + 1) & qsize_msk;
            if (next_tail == head) {
                qp->qp_rq_wqhdr->wq_full = 1;
            }

            /*
             * Get the address of the location where the next
             * Recv WQE should be built
             */
            desc = TAVOR_QP_RQ_ENTRY(qp, tail);

            /*
             * Call tavor_wqe_recv_build() to build the WQE
             * at the given address.  This routine uses the
             * information in the ibt_recv_wr_t list (wr[]) and
             * returns the size of the WQE when it returns.
             */
            status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
                desc, &desc_sz);
            if (status != DDI_SUCCESS) {
                break;
            }

            /*
             * Add a WRID entry to the WRID list.  Need to
             * calculate the "wqeaddrsz" and "signaled_dbd"
             * values to pass to tavor_wrid_add_entry().  Note:
             * all Recv WQEs are essentially "signaled" and
             * "doorbelled" (since Tavor HW requires all
             * RecvWQE's to have their "DBD" bits set).
             */
            wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
                ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
                desc_sz);
            signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
                TAVOR_WRID_ENTRY_DOORBELLED;
            tavor_wrid_add_entry(qp->qp_rq_wqhdr,
                wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);

            /*
             * If this is not the first descriptor on the current
             * chain, then link it to the previous WQE.  Otherwise,
             * save the address and size of this descriptor (in
             * "first" and "first_sz" respectively) and continue.
             */
            if (currindx != 0) {
                tavor_wqe_recv_linknext(desc, desc_sz, prev,
                    qp);
                prev = desc;
            } else {
                first = desc;
                first_sz = desc_sz;
            }

            /*
             * Update the current "tail index" and increment
             * "posted_cnt"
             */
            tail = next_tail;
            posted_cnt++;
        }

        /*
         * If we reach here and there are one or more WQEs which have
         * been successfully chained together, then we need to link
         * the current chain to the previously executing chain of
         * descriptors (if there is one) and ring the doorbell for the
         * recv work queue.
         */
        if (currindx != 0) {
            /*
             * Before we link the chain, we need to ensure that the
             * "next" field on the last WQE is set to NULL (to
             * indicate the end of the chain).
             */
            tavor_wqe_recv_linknext(NULL, 0, prev, qp);

            /* Save away updated "tail index" for the DMA sync */
            sync_to = tail;

            /* Do a DMA sync for current recv WQE(s) */
            tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
                DDI_DMA_SYNC_FORDEV);

            /*
             * Now link the chain to the old chain (if there was
             * one).
             */
            tavor_wqe_recv_linknext(first, first_sz,
                qp->qp_rq_lastwqeaddr, qp);

            /*
             * If there was a valid previous WQE (i.e. non-NULL),
             * then sync it too.  This is because we have updated
             * its "next" fields and we want to ensure that the
             * hardware can see the changes.
             */
            if (qp->qp_rq_lastwqeaddr != NULL) {
                sync_to = sync_from;
                sync_from = (sync_from - 1) & qsize_msk;
                tavor_wqe_sync(qp, sync_from, sync_to,
                    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
            }

            /* Update some of the state in the QP */
            qp->qp_rq_lastwqeaddr = desc;
            qp->qp_rq_wqhdr->wq_tail = tail;

            /* Ring the doorbell */
            tavor_qp_recv_doorbell(state,
                (uint32_t)((uintptr_t)first - qp->qp_desc_off),
                first_sz, qp->qp_qpnum, (chainlen % maxdb));
        }
    }

    /*
     * Update the "num_posted" return value (if necessary).  Then drop
     * the locks and return success.
     */
    if (num_posted != NULL) {
        *num_posted = posted_cnt;
    }

    mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
    mutex_exit(&qp->qp_lock);

    return (status);
}

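/*
 * A worked example of the "queue full" test used in both post
 * routines above (assuming a work queue of size 8, so qsize_msk ==
 * 0x7): with head == 2 and tail == 1, incrementing gives
 * next_tail == (1 + 1) & 0x7 == 2 == head, so this WQE is the last
 * one that fits; wq_full is set and the following iteration breaks
 * out with IBT_QP_FULL.  The classic full/empty ambiguity of a
 * circular buffer is resolved here with the explicit wq_full flag
 * rather than by leaving one entry unused.
 */
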
/*
 * tavor_post_srq()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
    ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
{
    uint64_t *desc, *prev, *first, *last_wqe_addr;
    uint32_t signaled_dbd;
    uint32_t sync_indx;
    uint_t currindx, wrindx, numremain;
    uint_t chainlen, posted_cnt;
    uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
    int status;

    /*
     * Check for user-mappable QP memory.  Note:  We do not allow kernel
     * clients to post to QP memory that is accessible directly by the
     * user.  If the QP memory is user accessible, then return an error.
     */
    if (srq->srq_is_umap) {
        return (IBT_SRQ_HDL_INVALID);
    }

    /* Initialize posted_cnt */
    posted_cnt = 0;

    mutex_enter(&srq->srq_lock);

    /*
     * Check SRQ state.  Can not post Recv requests when SRQ is in error
     */
    if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
        mutex_exit(&srq->srq_lock);
        return (IBT_QP_STATE_INVALID);
    }

    /* Grab the lock for the WRID list */
    mutex_enter(&srq->srq_wrid_wql->wql_lock);

    /*
     * For each ibt_recv_wr_t in the wr[] list passed in, parse the
     * request and build a Recv WQE.  Note:  Because we are potentially
     * building a chain of WQEs, we want to link them all together.
     * However, we do not want to link the first one to the previous
     * WQE until the entire chain has been linked.  Then in the last
     * step we ring the appropriate doorbell.  Note:  It is possible for
     * more Work Requests to be posted than the HW will support at one
     * shot.  If this happens, we need to be able to post and ring
     * several chains here until the entire request is complete.
     */
    wrindx = 0;
    numremain = num_wr;
    status = DDI_SUCCESS;
    while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
        /*
         * For the first WQE on a new chain we need "prev" to point
         * to the current descriptor.  As we begin to process
         * further, "prev" will be updated to point to the previous
         * WQE on the current chain (see below).
         */
        if (srq->srq_wq_lastwqeindx == -1) {
            prev = NULL;
        } else {
            prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
        }

        /*
         * Break the request up into chains that are less than or
         * equal to the maximum number of WQEs that can be posted
         * per doorbell ring
         */
        chainlen = (numremain > maxdb) ? maxdb : numremain;
        numremain -= chainlen;
        for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {

            /*
             * Check for "queue full" condition.  If the queue
             * is already full, then no more WQEs can be posted.
             * So break out, ring a doorbell (if necessary) and
             * return an error
             */
            if (srq->srq_wridlist->wl_free_list_indx == -1) {
                status = IBT_QP_FULL;
                break;
            }

            /*
             * Get the address of the location where the next
             * Recv WQE should be built
             */
            desc = TAVOR_SRQ_WQE_ADDR(srq,
                srq->srq_wridlist->wl_free_list_indx);

            /*
             * Add a WRID entry to the WRID list.  Need to
             * set the "signaled_dbd" values to pass to
             * tavor_wrid_add_entry_srq().  Note: all Recv WQEs
             * are essentially "signaled".
             *
             * The 'size' is stored at srq_alloc time, in the
             * srq_wq_stride.  This is a constant value required
             * for SRQ.
             */
            signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
            tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
                signaled_dbd);

            /*
             * Call tavor_wqe_srq_build() to build the WQE
             * at the given address.  This routine uses the
             * information in the ibt_recv_wr_t list (wr[]) and
             * returns the size of the WQE when it returns.
             */
            status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
                desc);
            if (status != DDI_SUCCESS) {
                break;
            }

            /*
             * If this is not the first descriptor on the current
             * chain, then link it to the previous WQE.  Otherwise,
             * save the address of this descriptor (in "first") and
             * continue.
             */
            if (currindx != 0) {
                tavor_wqe_srq_linknext(desc, prev, srq);
                sync_indx = TAVOR_SRQ_WQE_INDEX(
                    srq->srq_wq_buf, prev,
                    srq->srq_wq_log_wqesz);

                /* Do a DMA sync for previous recv WQE */
                tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);

                prev = desc;
            } else {

                /*
                 * In this case, the last WQE on the chain is
                 * also considered 'first'.  So set prev to
                 * first, here.
                 */
                first = prev = desc;
            }

            /*
             * Increment "posted_cnt"
             */
            posted_cnt++;
        }

        /*
         * If we reach here and there are one or more WQEs which have
         * been successfully chained together, then we need to link
         * the current chain to the previously executing chain of
         * descriptors (if there is one) and ring the doorbell for the
         * recv work queue.
         */
        if (currindx != 0) {
            /*
             * Before we link the chain, we need to ensure that the
             * "next" field on the last WQE is set to NULL (to
             * indicate the end of the chain).
             */
            tavor_wqe_srq_linknext(NULL, prev, srq);

            sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
                srq->srq_wq_log_wqesz);

            /* Do a DMA sync for current recv WQE */
            tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);

            /*
             * Now link the chain to the old chain (if there was
             * one).
             */
            if (srq->srq_wq_lastwqeindx == -1) {
                last_wqe_addr = NULL;
            } else {
                last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
                    srq->srq_wq_lastwqeindx);
            }
            tavor_wqe_srq_linknext(first, last_wqe_addr, srq);

            /*
             * If there was a valid previous WQE (i.e. valid index),
             * then sync it too.  This is because we have updated
             * its "next" fields and we want to ensure that the
             * hardware can see the changes.
             */
            if (srq->srq_wq_lastwqeindx != -1) {
                sync_indx = srq->srq_wq_lastwqeindx;
                tavor_wqe_sync(srq, sync_indx, sync_indx+1,
                    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
            }

            /* Update some of the state in the SRQ */
            srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
                srq->srq_wq_buf, desc,
                srq->srq_wq_log_wqesz);

            /* Ring the doorbell */
            /* SRQ needs NDS of 0 */
            tavor_qp_recv_doorbell(state,
                (uint32_t)((uintptr_t)first - srq->srq_desc_off),
                0, srq->srq_srqnum, (chainlen % maxdb));
        }
    }

    /*
     * Update the "num_posted" return value (if necessary).  Then drop
     * the locks and return success.
     */
    if (num_posted != NULL) {
        *num_posted = posted_cnt;
    }

    mutex_exit(&srq->srq_wrid_wql->wql_lock);
    mutex_exit(&srq->srq_lock);

    return (status);
}

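/*
 * A note on the SRQ path above: unlike the QP send and receive
 * queues, which track a head/tail pair, the SRQ work queue is driven
 * off the WRID free list (wl_free_list_indx).  The value -1 is the
 * "empty" sentinel in both places it is tested: for
 * srq_wq_lastwqeindx it means "no previously posted WQE to link to",
 * and for wl_free_list_indx it means "no free WQE slots remain",
 * which is why it maps to IBT_QP_FULL.
 */
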
/*
 * tavor_qp_send_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
    uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
    uint64_t doorbell = 0;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
        TAVOR_QPSNDDB_NDA_SHIFT) |
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;

    /* Write the doorbell to UAR */
    TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
        doorbell);
}


/*
 * tavor_qp_recv_doorbell()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
    uint32_t qpn, uint32_t credits)
{
    uint64_t doorbell = 0;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
        TAVOR_QPRCVDB_NDA_SHIFT) |
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

    /* Write the doorbell to UAR */
    TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
        doorbell);
}

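/*
 * Doorbell composition sketch (illustrative values only): a send
 * doorbell for a 4-chunk WQE at offset 0x1000 on QP number 0x41,
 * unfenced, would be assembled by tavor_qp_send_doorbell() above as
 *
 *     doorbell = (((uint64_t)0x1000 & TAVOR_QPSNDDB_NDA_MASK) <<
 *         TAVOR_QPSNDDB_NDA_SHIFT) |
 *         ((uint64_t)0 << TAVOR_QPSNDDB_F_SHIFT) |
 *         ((uint64_t)TAVOR_WQE_SEND_NOPCODE_SEND <<
 *         TAVOR_QPSNDDB_NOPCODE_SHIFT) |
 *         ((uint64_t)0x41 << TAVOR_QPSNDDB_QPN_SHIFT) | 4;
 *
 * and written to the UAR "send" register as a single 64-bit store.
 * Note that "nda" is a QP-relative offset (the callers subtract
 * qp_desc_off before calling), not a raw kernel virtual address.
 */
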
/*
 * tavor_wqe_send_build()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
    tavor_hw_snd_wqe_ud_t *ud;
    tavor_hw_snd_wqe_remaddr_t *rc;
    tavor_hw_snd_wqe_atomic_t *at;
    tavor_hw_snd_wqe_remaddr_t *uc;
    tavor_hw_snd_wqe_bind_t *bn;
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t *sgl;
    tavor_ahhdl_t ah;
    uint32_t nds;
    int i, num_ds, status;

    ASSERT(MUTEX_HELD(&qp->qp_lock));

    /* Initialize the information for the Data Segments */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
        sizeof (tavor_hw_snd_wqe_nextctrl_t));
    nds = wr->wr_nds;
    sgl = wr->wr_sgl;
    num_ds = 0;

    /*
     * Building a Send WQE depends first and foremost on the transport
     * type of the Work Request (i.e. UD, RC, or UC)
     */
    switch (wr->wr_trans) {
    case IBT_UD_SRV:
        /* Ensure that work request transport type matches QP type */
        if (qp->qp_serv_type != TAVOR_QP_UD) {
            return (IBT_QP_SRV_TYPE_INVALID);
        }

        /*
         * Validate the operation type.  For UD requests, only the
         * "Send" operation is valid
         */
        if (wr->wr_opcode != IBT_WRC_SEND) {
            return (IBT_QP_OP_TYPE_INVALID);
        }

        /*
         * If this is a Special QP (QP0 or QP1), then we need to
         * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
         * and return whatever status it returns
         */
        if (qp->qp_is_special) {
            status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
            return (status);
        }

        /*
         * Otherwise, if this is a normal UD Send request, then fill
         * all the fields in the Tavor UD header for the WQE.  Note:
         * to do this we'll need to extract some information from the
         * Address Handle passed with the work request.
         */
        ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));
        ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
        if (ah == NULL) {
            return (IBT_AH_HDL_INVALID);
        }

        /*
         * Build the Unreliable Datagram Segment for the WQE, using
         * the information from the address handle and the work
         * request.
         */
        mutex_enter(&ah->ah_lock);
        TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
        mutex_exit(&ah->ah_lock);

        /* Update "ds" for filling in Data Segments (below) */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
            sizeof (tavor_hw_snd_wqe_ud_t));
        break;

    case IBT_RC_SRV:
        /* Ensure that work request transport type matches QP type */
        if (qp->qp_serv_type != TAVOR_QP_RC) {
            return (IBT_QP_SRV_TYPE_INVALID);
        }

        /*
         * Validate the operation type.  For RC requests, we allow
         * "Send", "RDMA Read", "RDMA Write", various "Atomic"
         * operations, and memory window "Bind"
         */
        if ((wr->wr_opcode != IBT_WRC_SEND) &&
            (wr->wr_opcode != IBT_WRC_RDMAR) &&
            (wr->wr_opcode != IBT_WRC_RDMAW) &&
            (wr->wr_opcode != IBT_WRC_CSWAP) &&
            (wr->wr_opcode != IBT_WRC_FADD) &&
            (wr->wr_opcode != IBT_WRC_BIND)) {
            return (IBT_QP_OP_TYPE_INVALID);
        }

        /*
         * If this is a Send request, then all we need to do is break
         * out here and begin the Data Segment processing below
         */
        if (wr->wr_opcode == IBT_WRC_SEND) {
            break;
        }

        /*
         * If this is an RDMA Read or RDMA Write request, then fill
         * in the "Remote Address" header fields.
         */
        if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
            (wr->wr_opcode == IBT_WRC_RDMAW)) {
            rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Remote Address Segment for the WQE, using
             * the information from the RC work request.
             */
            TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);

            /* Update "ds" for filling in Data Segments (below) */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
                sizeof (tavor_hw_snd_wqe_remaddr_t));
            break;
        }

        /*
         * If this is one of the Atomic type operations (i.e.
         * Compare-Swap or Fetch-Add), then fill in both the "Remote
         * Address" header fields and the "Atomic" header fields.
         */
        if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
            (wr->wr_opcode == IBT_WRC_FADD)) {
            rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));
            at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
                sizeof (tavor_hw_snd_wqe_remaddr_t));

            /*
             * Build the Remote Address and Atomic Segments for
             * the WQE, using the information from the RC Atomic
             * work request.
             */
            TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
            TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);

            /* Update "ds" for filling in Data Segments (below) */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
                sizeof (tavor_hw_snd_wqe_atomic_t));

            /*
             * Update "nds" and "sgl" because Atomic requests have
             * only a single Data Segment (and they are encoded
             * somewhat differently in the work request).
             */
            nds = 1;
            sgl = wr->wr_sgl;
            break;
        }

        /*
         * If this is a memory window Bind operation, then we call the
         * tavor_wr_bind_check() routine to validate the request and
         * to generate the updated RKey.  If this is successful, then
         * we fill in the WQE's "Bind" header fields.
         */
        if (wr->wr_opcode == IBT_WRC_BIND) {
            status = tavor_wr_bind_check(state, wr);
            if (status != DDI_SUCCESS) {
                return (status);
            }

            bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Bind Memory Window Segments for the WQE,
             * using the information from the RC Bind memory
             * window work request.
             */
            TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);

            /*
             * Update the "ds" pointer.  Even though the "bind"
             * operation requires no SGLs, this is necessary to
             * facilitate the correct descriptor size calculations
             * (below).
             */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
                sizeof (tavor_hw_snd_wqe_bind_t));
            nds = 0;
        }
        break;

    case IBT_UC_SRV:
        /* Ensure that work request transport type matches QP type */
        if (qp->qp_serv_type != TAVOR_QP_UC) {
            return (IBT_QP_SRV_TYPE_INVALID);
        }

        /*
         * Validate the operation type.  For UC requests, we only
         * allow "Send", "RDMA Write", and memory window "Bind".
         * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
         * operations
         */
        if ((wr->wr_opcode != IBT_WRC_SEND) &&
            (wr->wr_opcode != IBT_WRC_RDMAW) &&
            (wr->wr_opcode != IBT_WRC_BIND)) {
            return (IBT_QP_OP_TYPE_INVALID);
        }

        /*
         * If this is a Send request, then all we need to do is break
         * out here and begin the Data Segment processing below
         */
        if (wr->wr_opcode == IBT_WRC_SEND) {
            break;
        }

        /*
         * If this is an RDMA Write request, then fill in the "Remote
         * Address" header fields.
         */
        if (wr->wr_opcode == IBT_WRC_RDMAW) {
            uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Remote Address Segment for the WQE, using
             * the information from the UC work request.
             */
            TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);

            /* Update "ds" for filling in Data Segments (below) */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
                sizeof (tavor_hw_snd_wqe_remaddr_t));
            break;
        }

        /*
         * If this is a memory window Bind operation, then we call the
         * tavor_wr_bind_check() routine to validate the request and
         * to generate the updated RKey.  If this is successful, then
         * we fill in the WQE's "Bind" header fields.
         */
        if (wr->wr_opcode == IBT_WRC_BIND) {
            status = tavor_wr_bind_check(state, wr);
            if (status != DDI_SUCCESS) {
                return (status);
            }

            bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
                sizeof (tavor_hw_snd_wqe_nextctrl_t));

            /*
             * Build the Bind Memory Window Segments for the WQE,
             * using the information from the UC Bind memory
             * window work request.
             */
            TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);

            /*
             * Update the "ds" pointer.  Even though the "bind"
             * operation requires no SGLs, this is necessary to
             * facilitate the correct descriptor size calculations
             * (below).
             */
            ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
                sizeof (tavor_hw_snd_wqe_bind_t));
            nds = 0;
        }
        break;

    default:
        return (IBT_QP_SRV_TYPE_INVALID);
    }

    /*
     * Now fill in the Data Segments (SGL) for the Send WQE based on
     * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
     * Start by checking for a valid number of SGL entries
     */
    if (nds > qp->qp_sq_sgl) {
        return (IBT_QP_SGL_LEN_INVALID);
    }

    /*
     * For each SGL in the Send Work Request, fill in the Send WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < nds; i++) {
        if (sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the current WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
        num_ds++;
    }

    /* Return the size of the descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;

    return (DDI_SUCCESS);
}

/*
 * tavor_wqe_send_linknext()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
    uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
    tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
{
    uint64_t next, ctrl;
    uint32_t nopcode, fence;

    /*
     * Calculate the "next" field of the descriptor.  This amounts to
     * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
     * fields (see tavor_hw.h for more).  Note: If there is no next
     * descriptor (i.e. if the current descriptor is the last WQE on
     * the chain), then set "next" to zero.
     */
    if (curr_desc != NULL) {
        /*
         * Determine the value for the Tavor WQE "nopcode" field
         * by using the IBTF opcode from the work request
         */
        switch (curr_wr->wr_opcode) {
        case IBT_WRC_RDMAW:
            if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
                nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
            } else {
                nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
            }
            break;

        case IBT_WRC_SEND:
            if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
                nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
            } else {
                nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
            }
            break;

        case IBT_WRC_RDMAR:
            nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
            break;

        case IBT_WRC_CSWAP:
            nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
            break;

        case IBT_WRC_FADD:
            nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
            break;

        case IBT_WRC_BIND:
            nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
            break;
        }

        curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
            - qp->qp_desc_off);
        next = ((uint64_t)(uintptr_t)curr_desc &
            TAVOR_WQE_NDA_MASK) << 32;
        next = next | ((uint64_t)nopcode << 32);
        fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
        if (fence) {
            next = next | TAVOR_WQE_SEND_FENCE_MASK;
        }
        next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);

        /*
         * If a send queue doorbell will be rung for the next
         * WQE on the chain, then set the current WQE's "dbd" bit.
         * Note: We also update the "dbinfo" structure here to pass
         * back information about what should (later) be included
         * in the send queue doorbell.
         */
        if (dbinfo) {
            next = next | TAVOR_WQE_DBD_MASK;
            dbinfo->db_nopcode = nopcode;
            dbinfo->db_fence = fence;
        }
    } else {
        next = 0;
    }

    /*
     * If this WQE is supposed to be linked to the previous descriptor,
     * then we need to update not only the previous WQE's "next" fields
     * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
     * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
     * the "e" bit is always hardcoded to zero.
     */
    if (prev_desc != NULL) {
        /*
         * If a send queue doorbell will be rung for the next WQE on
         * the chain, then update the current WQE's "next" field and
         * return.
         * Note: We don't want to modify the "ctrl" field here because
         * that portion of the previous WQE has already been set
         * correctly at some previous point in time.
         */
        if (dbinfo) {
            TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
            return;
        }

        ctrl = 0;

        /* Set the "c" (i.e. "signaled") bit appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
            ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
        }

        /* Set the "s" (i.e. "solicited") bit appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
            ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
        }

        /* Set the "i" bit and the immediate data appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
            ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
            ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
        }

        TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
    }
}

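/*
 * A note on the two-step linking protocol implemented above: while a
 * chain is being built, each WQE is linked to its predecessor with a
 * full TAVOR_WQE_LINKNEXT (writing both the "ctrl" and "next" words),
 * but when the completed chain is finally attached to the WQE from
 * the previous post (the dbinfo != NULL case), only
 * TAVOR_WQE_LINKFIRST is used.  The previous WQE's "ctrl" word was
 * already written correctly when it was posted, so rewriting it here
 * could clobber state the hardware may have begun to consume.
 */
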
/*
 * tavor_wqe_mlx_build()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
{
    tavor_hw_udav_t udav;
    tavor_ahhdl_t ah;
    ib_lrh_hdr_t *lrh;
    ib_grh_t *grh;
    ib_bth_hdr_t *bth;
    ib_deth_hdr_t *deth;
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t *sgl;
    uint8_t *mgmtclass, *hpoint, *hcount;
    uint64_t data;
    uint32_t nds, offset, pktlen;
    uint32_t desc_sz, udav_sz;
    int i, num_ds;

    ASSERT(MUTEX_HELD(&qp->qp_lock));

    /* Initialize the information for the Data Segments */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
        sizeof (tavor_hw_mlx_wqe_nextctrl_t));

    /*
     * Pull the address handle from the work request and read in
     * the contents of the UDAV.  This will be used to answer some
     * questions about the request.
     */
    ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
    if (ah == NULL) {
        return (IBT_AH_HDL_INVALID);
    }
    mutex_enter(&ah->ah_lock);
    udav_sz = sizeof (tavor_hw_udav_t) >> 3;
    for (i = 0; i < udav_sz; i++) {
        data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
            ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
        ((uint64_t *)&udav)[i] = data;
    }
    mutex_exit(&ah->ah_lock);

    /*
     * If the request is for QP1 and the destination LID is equal to
     * the Permissive LID, then return an error.  This combination is
     * not allowed
     */
    if ((udav.rlid == IB_LID_PERMISSIVE) &&
        (qp->qp_is_special == TAVOR_QP_GSI)) {
        return (IBT_AH_HDL_INVALID);
    }

    /*
     * Calculate the size of the packet headers, including the GRH
     * (if necessary)
     */
    desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
        sizeof (ib_deth_hdr_t);
    if (udav.grh) {
        desc_sz += sizeof (ib_grh_t);
    }

    /*
     * Begin to build the first "inline" data segment for the packet
     * headers.  Note: By specifying "inline" we can build the contents
     * of the MAD packet headers directly into the work queue (as part
     * of the descriptor).  This has the advantage of both speeding
     * things up and of not requiring the driver to allocate/register
     * any additional memory for the packet headers.
     */
    TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
    desc_sz += 4;

    /*
     * Build Local Route Header (LRH)
     *    We start here by building the LRH into a temporary location.
     *    When we have finished we copy the LRH data into the descriptor.
     *
     *    Notice that the VL values are hardcoded.  This is not a problem
     *    because VL15 is decided later based on the value in the MLX
     *    transport "next/ctrl" header (see the "vl15" bit below), and it
     *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
     *    values.  This rule does not hold for loopback packets, however
     *    (all of which bypass the SL-to-VL tables), and it is the reason
     *    that non-QP0 MADs are set up with VL hardcoded to zero below.
     *
     *    Notice also that the Source LID is hardcoded to the Permissive
     *    LID (0xFFFF).  This is also not a problem because if the
     *    Destination LID is not the Permissive LID, then the "slr" value
     *    in the MLX transport "next/ctrl" header will be set to zero and
     *    the hardware will pull the LID from the value in the port.
     */
    lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
    pktlen = (desc_sz + 0x100) >> 2;
    TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);

    /*
     * Build Global Route Header (GRH)
     *    This is only built if necessary as defined by the "grh" bit in
     *    the address vector.  Note: We also calculate the offset to the
     *    next header (BTH) based on whether or not the "grh" bit is set.
     */
    if (udav.grh) {
        /*
         * If the request is for QP0, then return an error.  The
         * combination of global routing (GRH) and QP0 is not allowed.
         */
        if (qp->qp_is_special == TAVOR_QP_SMI) {
            return (IBT_AH_HDL_INVALID);
        }
        grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
        TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);

        bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
    } else {
        bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
    }

    /*
     * Build Base Transport Header (BTH)
     *    Notice that the M, PadCnt, and TVer fields are all set
     *    to zero implicitly.  This is true for all Management
     *    Datagrams (MADs), whether GSI or SMI.
     */
    TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);

    /*
     * Build Datagram Extended Transport Header (DETH)
     */
    deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
    TAVOR_WQE_BUILD_MLX_DETH(deth, qp);

    /* Ensure that the Data Segment is aligned on a 16-byte boundary */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
    ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
    nds = wr->wr_nds;
    sgl = wr->wr_sgl;
    num_ds = 0;

    /*
     * Now fill in the Data Segments (SGL) for the MLX WQE based on the
     * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
     * Start by checking for a valid number of SGL entries
     */
    if (nds > qp->qp_sq_sgl) {
        return (IBT_QP_SGL_LEN_INVALID);
    }

    /*
     * For each SGL in the Send Work Request, fill in the MLX WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    mgmtclass = hpoint = hcount = NULL;
    offset = 0;
    for (i = 0; i < nds; i++) {
        if (sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the MLX send WQE, using
         * the information contained in the scatter-gather list of
         * the work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);

        /*
         * Search through the contents of all MADs posted to QP0 to
         * initialize pointers to the places where Directed Route "hop
         * pointer", "hop count", and "mgmtclass" would be.  Tavor
         * needs these updated (i.e. incremented or decremented, as
         * necessary) by software.
         */
        if (qp->qp_is_special == TAVOR_QP_SMI) {

            TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
                offset, sgl[i].ds_va, sgl[i].ds_len);

            TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
                offset, sgl[i].ds_va, sgl[i].ds_len);

            TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
                offset, sgl[i].ds_va, sgl[i].ds_len);

            offset += sgl[i].ds_len;
        }
        num_ds++;
    }

    /*
     * Tavor's Directed Route MADs need to have the "hop pointer"
     * incremented/decremented (as necessary) depending on whether it is
     * currently less than or greater than the "hop count" (i.e. whether
     * the MAD is a request or a response.)
     */
    if (qp->qp_is_special == TAVOR_QP_SMI) {
        TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
            *hpoint, *hcount);
    }

    /*
     * Now fill in the ICRC Data Segment.  This data segment is inlined
     * just like the packet headers above, but it is only four bytes and
     * set to zero (to indicate that we wish the hardware to generate
     * the ICRC).
     */
    TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
    num_ds++;

    /* Return the size of the descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;

    return (DDI_SUCCESS);
}

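/*
 * Layout sketch of the MLX WQE assembled by tavor_wqe_mlx_build()
 * above (a reading of the code, not an authoritative hardware
 * diagram):
 *
 *     +-------------------------------------------+
 *     | next/ctrl (tavor_hw_mlx_wqe_nextctrl_t)   |
 *     | inline segment: LRH [GRH] BTH DETH        |
 *     | data segment(s) built from wr_sgl[]       |
 *     | inline ICRC segment (4 bytes of zero)     |
 *     +-------------------------------------------+
 *
 * Because the IB headers are built inline in the descriptor itself,
 * only the MAD payload needs to come from the caller's registered
 * memory via the SGL.
 */
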
/*
 * tavor_wqe_mlx_linknext()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
    uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
    tavor_qphdl_t qp)
{
    tavor_hw_udav_t udav;
    tavor_ahhdl_t ah;
    uint64_t next, ctrl, data;
    uint_t nopcode;
    uint_t udav_sz;
    int i;

    /*
     * Calculate the "next" field of the descriptor.  This amounts to
     * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
     * tavor_hw.h for more).  Note: If there is no next descriptor (i.e.
     * if the current descriptor is the last WQE on the chain), then set
     * "next" to zero.
     */
    if (curr_desc != NULL) {
        /*
         * The only valid Tavor WQE "nopcode" for MLX transport
         * requests is the "Send" code.
         */
        nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
        curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
            (uintptr_t)curr_desc - qp->qp_desc_off);
        next = (uint64_t)((uintptr_t)curr_desc &
            TAVOR_WQE_NDA_MASK) << 32;
        next = next | ((uint64_t)nopcode << 32);
        next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);

        /*
         * If a send queue doorbell will be rung for the next
         * WQE on the chain, then set the current WQE's "dbd" bit.
         * Note: We also update the "dbinfo" structure here to pass
         * back information about what should (later) be included
         * in the send queue doorbell.
         */
        if (dbinfo) {
            next = next | TAVOR_WQE_DBD_MASK;
            dbinfo->db_nopcode = nopcode;
            dbinfo->db_fence = 0;
        }
    } else {
        next = 0;
    }

    /*
     * If this WQE is supposed to be linked to the previous descriptor,
     * then we need to update not only the previous WQE's "next" fields
     * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
     * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
     * see tavor_hw.h for more).  Note: the "e" bit and "vcrc" fields are
     * always hardcoded to zero.
     */
    if (prev_desc != NULL) {
        /*
         * If a send queue doorbell will be rung for the next WQE on
         * the chain, then update the current WQE's "next" field and
         * return.
         * Note: We don't want to modify the "ctrl" field here because
         * that portion of the previous WQE has already been set
         * correctly at some previous point in time.
         */
        if (dbinfo) {
            TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
            return;
        }

        /*
         * Pull the address handle from the work request and read in
         * the contents of the UDAV.  This will be used to answer some
         * questions about the request.
         */
        ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
        mutex_enter(&ah->ah_lock);
        udav_sz = sizeof (tavor_hw_udav_t) >> 3;
        for (i = 0; i < udav_sz; i++) {
            data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
                ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
            ((uint64_t *)&udav)[i] = data;
        }
        mutex_exit(&ah->ah_lock);

        ctrl = 0;

        /* Only QP0 uses VL15, otherwise use VL in the packet */
        if (qp->qp_is_special == TAVOR_QP_SMI) {
            ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
        }

        /*
         * The SLR (Source LID Replace) bit determines whether the
         * source LID for an outgoing MLX packet should come from the
         * PortInfo (SLR = 0) or should be left as it is in the
         * descriptor (SLR = 1).  The latter is necessary for packets
         * to be sent with the Permissive LID.
         */
        if (udav.rlid == IB_LID_PERMISSIVE) {
            ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
        }

        /* Fill in the max static rate from the address handle */
        ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
            TAVOR_WQE_MLXHDR_SRATE_SHIFT);

        /* All VL15 (i.e. SMI) traffic is required to use SL 0 */
        if (qp->qp_is_special != TAVOR_QP_SMI) {
            ctrl = ctrl | ((uint64_t)udav.sl <<
                TAVOR_WQE_MLXHDR_SL_SHIFT);
        }

        /* Set the "c" (i.e. "signaled") bit appropriately */
        if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
            ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
        }

        /* Fill in the destination LID from the address handle */
        ctrl = ctrl | ((uint64_t)udav.rlid <<
            TAVOR_WQE_MLXHDR_RLID_SHIFT);

        TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
    }
}


/*
 * tavor_wqe_recv_build()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
static int
tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
{
    tavor_hw_wqe_sgl_t *ds;
    int i, num_ds;

    ASSERT(MUTEX_HELD(&qp->qp_lock));

    /* Check that work request transport type is valid */
    if ((qp->qp_serv_type != TAVOR_QP_UD) &&
        (qp->qp_serv_type != TAVOR_QP_RC) &&
        (qp->qp_serv_type != TAVOR_QP_UC)) {
        return (IBT_QP_SRV_TYPE_INVALID);
    }

    /* Fill in the Data Segments (SGL) for the Recv WQE */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
        sizeof (tavor_hw_rcv_wqe_nextctrl_t));
    num_ds = 0;

    /* Check for valid number of SGL entries */
    if (wr->wr_nds > qp->qp_rq_sgl) {
        return (IBT_QP_SGL_LEN_INVALID);
    }

    /*
     * For each SGL in the Recv Work Request, fill in the Recv WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < wr->wr_nds; i++) {
        if (wr->wr_sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the receive WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
        num_ds++;
    }

    /* Return the size of the descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;

    return (DDI_SUCCESS);
}

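/*
 * Example of the "byte_cnt" encoding described in the build routines
 * above: a requested ds_len of exactly 2GB (0x80000000) masked with
 * TAVOR_WQE_SGL_BYTE_CNT_MASK yields zero, which is exactly the
 * encoding the hardware interprets as a 2GB transfer.  A ds_len of
 * zero, by contrast, is never written into a data segment at all;
 * the build loops simply skip such SGL entries.
 */
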
1777 * In either case, we must add a single bit in the "reserved" field 1778 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the 1779 * workaround for a known Tavor errata that can cause Recv WQEs with 1780 * zero in the NDA field to behave improperly. 1781 */ 1782 if (curr_desc != NULL) { 1783 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc - 1784 qp->qp_desc_off); 1785 next = (uint64_t)((uintptr_t)curr_desc & 1786 TAVOR_WQE_NDA_MASK) << 32; 1787 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) | 1788 TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK; 1789 } else { 1790 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK; 1791 } 1792 1793 /* 1794 * If this WQE is supposed to be linked to the previous descriptor, 1795 * then we need to update not only the previous WQE's "next" fields 1796 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and 1797 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e" 1798 * bits are always hardcoded to zero. 1799 */ 1800 if (prev_desc != NULL) { 1801 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next); 1802 } 1803 } 1804 1805 1806 /* 1807 * tavor_wqe_srq_build() 1808 * Context: Can be called from interrupt or base context. 1809 */ 1810 /* ARGSUSED */ 1811 static int 1812 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq, 1813 ibt_recv_wr_t *wr, uint64_t *desc) 1814 { 1815 tavor_hw_wqe_sgl_t *ds; 1816 ibt_wr_ds_t end_sgl; 1817 int i, num_ds; 1818 1819 ASSERT(MUTEX_HELD(&srq->srq_lock)); 1820 1821 /* Fill in the Data Segments (SGL) for the Recv WQE */ 1822 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc + 1823 sizeof (tavor_hw_rcv_wqe_nextctrl_t)); 1824 num_ds = 0; 1825 1826 /* Check for valid number of SGL entries */ 1827 if (wr->wr_nds > srq->srq_wq_sgl) { 1828 return (IBT_QP_SGL_LEN_INVALID); 1829 } 1830 1831 /* 1832 * For each SGL in the Recv Work Request, fill in the Recv WQE's data 1833 * segments. Note: We skip any SGL with zero size because Tavor 1834 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually 1835 * the encoding for zero means a 2GB transfer. Because of this special 1836 * encoding in the hardware, we mask the requested length with 1837 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as 1838 * zero.) 1839 */ 1840 for (i = 0; i < wr->wr_nds; i++) { 1841 if (wr->wr_sgl[i].ds_len == 0) { 1842 continue; 1843 } 1844 1845 /* 1846 * Fill in the Data Segment(s) for the receive WQE, using the 1847 * information contained in the scatter-gather list of the 1848 * work request. 1849 */ 1850 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]); 1851 num_ds++; 1852 } 1853 1854 /* 1855 * For SRQ, if the number of data segments is less than the maximum 1856 * specified at alloc, then we have to fill in a special "key" entry in 1857 * the sgl entry after the last valid one in this post request. We do 1858 * that here. 1859 */ 1860 if (num_ds < srq->srq_wq_sgl) { 1861 end_sgl.ds_va = 0; 1862 end_sgl.ds_len = 0; 1863 end_sgl.ds_key = 0x1; 1864 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl); 1865 } 1866 1867 return (DDI_SUCCESS); 1868 } 1869 1870 1871 /* 1872 * tavor_wqe_srq_linknext() 1873 * Context: Can be called from interrupt or base context. 1874 */ 1875 static void 1876 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc, 1877 tavor_srqhdl_t srq) 1878 { 1879 uint64_t next; 1880 1881 /* 1882 * Calculate the "next" field of the descriptor. This amounts to 1883 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see 1884 * tavor_hw.h for more). 
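 * (Note, though, that unlike tavor_wqe_recv_linknext() above, no "nds"
 * value is actually OR'd in below; presumably this is because all WQEs
 * on a given SRQ share the fixed size established when the SRQ was
 * allocated.)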
Note: If there is no next descriptor (i.e. 1885 * if the current descriptor is the last WQE on the chain), then set 1886 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor 1887 * hardware requires the "dbd" bit to be set to one for all Recv WQEs. 1888 * In either case, we must add a single bit in the "reserved" field 1889 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the 1890 * workaround for a known Tavor errata that can cause Recv WQEs with 1891 * zero in the NDA field to behave improperly. 1892 */ 1893 if (curr_desc != NULL) { 1894 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc - 1895 srq->srq_desc_off); 1896 next = (uint64_t)((uintptr_t)curr_desc & 1897 TAVOR_WQE_NDA_MASK) << 32; 1898 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK; 1899 } else { 1900 next = TAVOR_RCV_WQE_NDA0_WA_MASK; 1901 } 1902 1903 /* 1904 * If this WQE is supposed to be linked to the previous descriptor, 1905 * then we need to update not only the previous WQE's "next" fields 1906 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and 1907 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e" 1908 * bits are always hardcoded to zero. 1909 */ 1910 if (prev_desc != NULL) { 1911 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next); 1912 } 1913 } 1914 1915 1916 /* 1917 * tavor_wr_get_immediate() 1918 * Context: Can be called from interrupt or base context. 1919 */ 1920 static uint32_t 1921 tavor_wr_get_immediate(ibt_send_wr_t *wr) 1922 { 1923 /* 1924 * This routine extracts the "immediate data" from the appropriate 1925 * location in the IBTF work request. Because of the way the 1926 * work request structure is defined, the location for this data 1927 * depends on the actual work request operation type. 1928 */ 1929 1930 /* For RDMA Write, test if RC or UC */ 1931 if (wr->wr_opcode == IBT_WRC_RDMAW) { 1932 if (wr->wr_trans == IBT_RC_SRV) { 1933 return (wr->wr.rc.rcwr.rdma.rdma_immed); 1934 } else { /* IBT_UC_SRV */ 1935 return (wr->wr.uc.ucwr.rdma.rdma_immed); 1936 } 1937 } 1938 1939 /* For Send, test if RC, UD, or UC */ 1940 if (wr->wr_opcode == IBT_WRC_SEND) { 1941 if (wr->wr_trans == IBT_RC_SRV) { 1942 return (wr->wr.rc.rcwr.send_immed); 1943 } else if (wr->wr_trans == IBT_UD_SRV) { 1944 return (wr->wr.ud.udwr_immed); 1945 } else { /* IBT_UC_SRV */ 1946 return (wr->wr.uc.ucwr.send_immed); 1947 } 1948 } 1949 1950 /* 1951 * If any other type of request, then immediate is undefined 1952 */ 1953 return (0); 1954 } 1955 1956 1957 /* 1958 * tavor_wqe_sync() 1959 * Context: Can be called from interrupt or base context. 
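 * Note: "sync_from" and "sync_to" are WQE indexes into the work queue
 * in question; the (possibly wrapping) span between them is what gets
 * handed to ddi_dma_sync() below.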
1960 */ 1961 static void 1962 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to, 1963 uint_t sync_type, uint_t flag) 1964 { 1965 tavor_qphdl_t qp; 1966 tavor_srqhdl_t srq; 1967 uint_t is_sync_req; 1968 uint64_t *wqe_from, *wqe_to, *wqe_base, *wqe_top; 1969 ddi_dma_handle_t dmahdl; 1970 off_t offset; 1971 size_t length; 1972 uint32_t qsize; 1973 int status; 1974 1975 if (sync_type == TAVOR_WR_SRQ) { 1976 srq = (tavor_srqhdl_t)hdl; 1977 is_sync_req = srq->srq_sync; 1978 /* Get the DMA handle from SRQ context */ 1979 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl; 1980 } else { 1981 qp = (tavor_qphdl_t)hdl; 1982 is_sync_req = qp->qp_sync; 1983 /* Get the DMA handle from QP context */ 1984 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl; 1985 } 1986 1987 /* Determine if the work queues need to be synced or not */ 1988 if (is_sync_req == 0) { 1989 return; 1990 } 1991 1992 /* 1993 * Depending on the type of the work queue, we grab information 1994 * about the address ranges we need to DMA sync. 1995 */ 1996 if (sync_type == TAVOR_WR_SEND) { 1997 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from); 1998 wqe_to = TAVOR_QP_SQ_ENTRY(qp, sync_to); 1999 qsize = qp->qp_sq_bufsz; 2000 2001 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0); 2002 wqe_top = TAVOR_QP_SQ_ENTRY(qp, qsize); 2003 } else if (sync_type == TAVOR_WR_RECV) { 2004 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from); 2005 wqe_to = TAVOR_QP_RQ_ENTRY(qp, sync_to); 2006 qsize = qp->qp_rq_bufsz; 2007 2008 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0); 2009 wqe_top = TAVOR_QP_RQ_ENTRY(qp, qsize); 2010 } else { 2011 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from); 2012 wqe_to = TAVOR_SRQ_WQ_ENTRY(srq, sync_to); 2013 qsize = srq->srq_wq_bufsz; 2014 2015 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0); 2016 wqe_top = TAVOR_SRQ_WQ_ENTRY(srq, qsize); 2017 } 2018 2019 /* 2020 * There are two possible cases for the beginning and end of the WQE 2021 * chain we are trying to sync. Either this is the simple case, where 2022 * the end of the chain lies at a higher address than its beginning, or it is 2023 * the "wrap-around" case, where the end of the chain has wrapped over 2024 * the end of the queue. In the former case, we simply need to 2025 * calculate the span from beginning to end and sync it. In the latter 2026 * case, however, we need to calculate the span from the top of the 2027 * work queue to the end of the chain and sync that, and then we need 2028 * to find the other portion (from beginning of chain to end of queue) 2029 * and sync that as well. Note: if the "top to end" span is actually 2030 * zero length, then we don't do a DMA sync because a zero length DMA 2031 * sync unnecessarily syncs the entire work queue.
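 * For example (illustrative values only): with a queue of eight WQEs,
 * sync_from = 6, and sync_to = 2, we would first sync entries 0 and 1
 * ("top to end") and then entries 6 and 7 ("beginning to bottom").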
2032 */ 2033 if (wqe_to > wqe_from) { 2034 /* "From Beginning to End" */ 2035 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base); 2036 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from); 2037 2038 status = ddi_dma_sync(dmahdl, offset, length, flag); 2039 if (status != DDI_SUCCESS) { 2040 return; 2041 } 2042 } else { 2043 /* "From Top to End" */ 2044 offset = (off_t)0; 2045 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base); 2046 if (length) { 2047 status = ddi_dma_sync(dmahdl, offset, length, flag); 2048 if (status != DDI_SUCCESS) { 2049 return; 2050 } 2051 } 2052 2053 /* "From Beginning to Bottom" */ 2054 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base); 2055 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from); 2056 status = ddi_dma_sync(dmahdl, offset, length, flag); 2057 if (status != DDI_SUCCESS) { 2058 return; 2059 } 2060 } 2061 } 2062 2063 2064 /* 2065 * tavor_wr_bind_check() 2066 * Context: Can be called from interrupt or base context. 2067 */ 2068 static int 2069 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr) 2070 { 2071 ibt_bind_flags_t bind_flags; 2072 uint64_t vaddr, len; 2073 uint64_t reg_start_addr, reg_end_addr; 2074 tavor_mwhdl_t mw; 2075 tavor_mrhdl_t mr; 2076 tavor_rsrc_t *mpt; 2077 uint32_t new_rkey; 2078 2079 /* Check for a valid Memory Window handle in the WR */ 2080 mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl; 2081 if (mw == NULL) { 2082 return (IBT_MW_HDL_INVALID); 2083 } 2084 2085 /* Check for a valid Memory Region handle in the WR */ 2086 mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl; 2087 if (mr == NULL) { 2088 return (IBT_MR_HDL_INVALID); 2089 } 2090 2091 mutex_enter(&mr->mr_lock); 2092 mutex_enter(&mw->mr_lock); 2093 2094 /* 2095 * Check here to see if the memory region has already been partially 2096 * deregistered as a result of a tavor_umap_umemlock_cb() callback. 2097 * If so, this is an error, return failure. 2098 */ 2099 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) { 2100 mutex_exit(&mr->mr_lock); 2101 mutex_exit(&mw->mr_lock); 2102 return (IBT_MR_HDL_INVALID); 2103 } 2104 2105 /* Check for a valid Memory Window RKey (i.e. a matching RKey) */ 2106 if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) { 2107 mutex_exit(&mr->mr_lock); 2108 mutex_exit(&mw->mr_lock); 2109 return (IBT_MR_RKEY_INVALID); 2110 } 2111 2112 /* Check for a valid Memory Region LKey (i.e. a matching LKey) */ 2113 if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) { 2114 mutex_exit(&mr->mr_lock); 2115 mutex_exit(&mw->mr_lock); 2116 return (IBT_MR_LKEY_INVALID); 2117 } 2118 2119 /* 2120 * Now check for valid "vaddr" and "len". Note: We don't check the 2121 * "vaddr" range when "len == 0" (i.e. on unbind operations) 2122 */ 2123 len = wr->wr.rc.rcwr.bind->bind_len; 2124 if (len != 0) { 2125 vaddr = wr->wr.rc.rcwr.bind->bind_va; 2126 reg_start_addr = mr->mr_bindinfo.bi_addr; 2127 reg_end_addr = mr->mr_bindinfo.bi_addr + 2128 (mr->mr_bindinfo.bi_len - 1); 2129 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) { 2130 mutex_exit(&mr->mr_lock); 2131 mutex_exit(&mw->mr_lock); 2132 return (IBT_MR_VA_INVALID); 2133 } 2134 vaddr = (vaddr + len) - 1; 2135 if (vaddr > reg_end_addr) { 2136 mutex_exit(&mr->mr_lock); 2137 mutex_exit(&mw->mr_lock); 2138 return (IBT_MR_LEN_INVALID); 2139 } 2140 } 2141 2142 /* 2143 * Validate the bind access flags. Remote Write and Atomic access for 2144 * the Memory Window require that Local Write access be set in the 2145 * corresponding Memory Region. 
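 * (This mirrors the IBA requirement that a Memory Window granting
 * remote write or atomic access may only be bound to a Memory Region
 * for which the HCA itself has Local Write permission.)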
2146 */ 2147 bind_flags = wr->wr.rc.rcwr.bind->bind_flags; 2148 if (((bind_flags & IBT_WR_BIND_WRITE) || 2149 (bind_flags & IBT_WR_BIND_ATOMIC)) && 2150 !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) { 2151 mutex_exit(&mr->mr_lock); 2152 mutex_exit(&mw->mr_lock); 2153 return (IBT_MR_ACCESS_REQ_INVALID); 2154 } 2155 2156 /* Calculate the new RKey for the Memory Window */ 2157 mpt = mw->mr_mptrsrcp; 2158 tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey); 2159 2160 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey; 2161 mw->mr_rkey = new_rkey; 2162 2163 mutex_exit(&mr->mr_lock); 2164 mutex_exit(&mw->mr_lock); 2165 return (DDI_SUCCESS); 2166 } 2167 2168 2169 /* 2170 * tavor_wrid_from_reset_handling() 2171 * Context: Can be called from interrupt or base context. 2172 */ 2173 int 2174 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp) 2175 { 2176 tavor_workq_hdr_t *swq, *rwq; 2177 tavor_wrid_list_hdr_t *s_wridlist, *r_wridlist; 2178 uint_t create_new_swq = 0, create_new_rwq = 0; 2179 uint_t create_wql = 0; 2180 uint_t qp_srq_en; 2181 2182 /* 2183 * For each of this QP's Work Queues, make sure we have a (properly 2184 * initialized) Work Request ID list attached to the relevant 2185 * completion queue. Grab the CQ lock(s) before manipulating the 2186 * lists. 2187 */ 2188 tavor_wrid_wqhdr_lock_both(qp); 2189 swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum, 2190 TAVOR_WR_SEND); 2191 if (swq == NULL) { 2192 /* Couldn't find matching work queue header, create it */ 2193 create_new_swq = create_wql = 1; 2194 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl, 2195 qp->qp_qpnum, TAVOR_WR_SEND, create_wql); 2196 if (swq == NULL) { 2197 /* 2198 * If we couldn't find/allocate space for the workq 2199 * header, then drop the lock(s) and return failure. 2200 */ 2201 tavor_wrid_wqhdr_unlock_both(qp); 2202 return (ibc_get_ci_failure(0)); 2203 } 2204 } 2205 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq)) 2206 qp->qp_sq_wqhdr = swq; 2207 swq->wq_size = qp->qp_sq_bufsz; 2208 swq->wq_head = 0; 2209 swq->wq_tail = 0; 2210 swq->wq_full = 0; 2211 2212 /* 2213 * Allocate space for the tavor_wrid_entry_t container 2214 */ 2215 s_wridlist = tavor_wrid_get_list(swq->wq_size); 2216 if (s_wridlist == NULL) { 2217 /* 2218 * If we couldn't allocate space for tracking the WRID 2219 * entries, then cleanup the workq header from above (if 2220 * necessary, i.e. if we created the workq header). Then 2221 * drop the lock(s) and return failure. 2222 */ 2223 if (create_new_swq) { 2224 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq); 2225 } 2226 2227 tavor_wrid_wqhdr_unlock_both(qp); 2228 return (ibc_get_ci_failure(0)); 2229 } 2230 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist)) 2231 s_wridlist->wl_wqhdr = swq; 2232 2233 /* Chain the new WRID list container to the workq hdr list */ 2234 mutex_enter(&swq->wq_wrid_wql->wql_lock); 2235 tavor_wrid_wqhdr_add(swq, s_wridlist); 2236 mutex_exit(&swq->wq_wrid_wql->wql_lock); 2237 2238 qp_srq_en = qp->qp_srq_en; 2239 2240 #ifdef __lock_lint 2241 mutex_enter(&qp->qp_srqhdl->srq_lock); 2242 #else 2243 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2244 mutex_enter(&qp->qp_srqhdl->srq_lock); 2245 } 2246 #endif 2247 /* 2248 * Now we repeat all the above operations for the receive work queue, 2249 * or shared receive work queue. 2250 * 2251 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case. 
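 * (Completions for a QP attached to an SRQ are still delivered to that
 * QP's own receive CQ, so that CQ is where the receive work queue
 * header must be found or created.)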
2252 */ 2253 rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum, 2254 TAVOR_WR_RECV); 2255 if (rwq == NULL) { 2256 create_new_rwq = create_wql = 1; 2257 2258 /* 2259 * If this QP is associated with an SRQ, and this isn't the 2260 * first QP on the SRQ, then the 'srq_wrid_wql' will already be 2261 * created. Since the WQL is created at 'wqhdr_create' time, we 2262 * pass in the 'create_wql' flag here as 0 if we have 2263 * already created it. Later on below we then set up 2264 * the WQL and rwq information based on the existing SRQ info. 2265 */ 2266 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED && 2267 qp->qp_srqhdl->srq_wrid_wql != NULL) { 2268 create_wql = 0; 2269 } 2270 2271 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl, 2272 qp->qp_qpnum, TAVOR_WR_RECV, create_wql); 2273 if (rwq == NULL) { 2274 /* 2275 * If we couldn't find/allocate space for the workq 2276 * header, then free all the send queue resources we 2277 * just allocated and setup (above), drop the lock(s) 2278 * and return failure. 2279 */ 2280 mutex_enter(&swq->wq_wrid_wql->wql_lock); 2281 tavor_wrid_wqhdr_remove(swq, s_wridlist); 2282 mutex_exit(&swq->wq_wrid_wql->wql_lock); 2283 if (create_new_swq) { 2284 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, 2285 swq); 2286 } 2287 2288 #ifdef __lock_lint 2289 mutex_exit(&qp->qp_srqhdl->srq_lock); 2290 #else 2291 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2292 mutex_exit(&qp->qp_srqhdl->srq_lock); 2293 } 2294 #endif 2295 2296 tavor_wrid_wqhdr_unlock_both(qp); 2297 return (ibc_get_ci_failure(0)); 2298 } 2299 } 2300 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq)) 2301 2302 /* 2303 * Setup receive workq hdr 2304 * 2305 * If the QP is on an SRQ, we set up the SRQ-specific fields, 2306 * keeping a copy of the rwq pointer, setting the rwq bufsize 2307 * appropriately, and initializing our part of the WQLock. 2308 * 2309 * In the normal QP case, the QP recv queue bufsize is used. 2310 */ 2311 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2312 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz; 2313 if (qp->qp_srqhdl->srq_wrid_wql == NULL) { 2314 qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql; 2315 } else { 2316 rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql; 2317 } 2318 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql); 2319 2320 } else { 2321 rwq->wq_size = qp->qp_rq_bufsz; 2322 } 2323 2324 qp->qp_rq_wqhdr = rwq; 2325 rwq->wq_head = 0; 2326 rwq->wq_tail = 0; 2327 rwq->wq_full = 0; 2328 2329 /* 2330 * Allocate space for the tavor_wrid_entry_t container. 2331 * 2332 * If the QP is on an SRQ and the srq_wridlist is NULL, then we must 2333 * allocate the wridlist normally. However, if the srq_wridlist is not 2334 * NULL, then we know this SRQ (and hence its wridlist) has already 2335 * been initialized. So we re-use the 2336 * srq_wridlist as the r_wridlist for this QP in this case. 2337 */ 2338 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED && 2339 qp->qp_srqhdl->srq_wridlist != NULL) { 2340 /* Use existing srq_wridlist pointer */ 2341 r_wridlist = qp->qp_srqhdl->srq_wridlist; 2342 ASSERT(r_wridlist != NULL); 2343 } else { 2344 /* Allocate memory for the r_wridlist */ 2345 r_wridlist = tavor_wrid_get_list(rwq->wq_size); 2346 } 2347 2348 /* 2349 * If the memory allocation failed for r_wridlist (or the SRQ pointer 2350 * is mistakenly NULL), we clean up our previous swq allocation from 2351 * above 2352 */ 2353 if (r_wridlist == NULL) { 2354 /* 2355 * If we couldn't allocate space for tracking the WRID 2356 * entries, then clean up all the stuff from above.
Then 2357 * drop the lock(s) and return failure. 2358 */ 2359 mutex_enter(&swq->wq_wrid_wql->wql_lock); 2360 tavor_wrid_wqhdr_remove(swq, s_wridlist); 2361 mutex_exit(&swq->wq_wrid_wql->wql_lock); 2362 if (create_new_swq) { 2363 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq); 2364 } 2365 if (create_new_rwq) { 2366 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq); 2367 } 2368 2369 #ifdef __lock_lint 2370 mutex_exit(&qp->qp_srqhdl->srq_lock); 2371 #else 2372 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2373 mutex_exit(&qp->qp_srqhdl->srq_lock); 2374 } 2375 #endif 2376 2377 tavor_wrid_wqhdr_unlock_both(qp); 2378 return (ibc_get_ci_failure(0)); 2379 } 2380 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist)) 2381 2382 /* 2383 * Initialize the wridlist 2384 * 2385 * In the normal QP case, there is no special initialization needed. 2386 * We simply set up the wridlist backpointer to be the receive wqhdr 2387 * (rwq). 2388 * 2389 * But in the SRQ case, no backpointer to the wqhdr is possible. 2390 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ 2391 * and thus potentially shared across multiple QPs with the SRQ. We 2392 * also set the srq_wridlist pointer to be the r_wridlist, and 2393 * initialize the freelist to an invalid index. This srq_wridlist 2394 * pointer is checked above on future moves from the Reset state to let us know that 2395 * the srq_wridlist has already been initialized. 2396 * 2397 * And finally, if we are in a non-UMAP case, we set up the srq wrid 2398 * free list. 2399 */ 2400 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED && 2401 qp->qp_srqhdl->srq_wridlist == NULL) { 2402 r_wridlist->wl_srq_en = 1; 2403 r_wridlist->wl_free_list_indx = -1; 2404 qp->qp_srqhdl->srq_wridlist = r_wridlist; 2405 2406 /* Initialize srq wrid free list */ 2407 if (qp->qp_srqhdl->srq_is_umap == 0) { 2408 mutex_enter(&rwq->wq_wrid_wql->wql_lock); 2409 tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0); 2410 mutex_exit(&rwq->wq_wrid_wql->wql_lock); 2411 } 2412 } else { 2413 r_wridlist->wl_wqhdr = rwq; 2414 } 2415 2416 /* Chain the WRID list "container" to the workq hdr list */ 2417 mutex_enter(&rwq->wq_wrid_wql->wql_lock); 2418 tavor_wrid_wqhdr_add(rwq, r_wridlist); 2419 mutex_exit(&rwq->wq_wrid_wql->wql_lock); 2420 2421 #ifdef __lock_lint 2422 mutex_exit(&qp->qp_srqhdl->srq_lock); 2423 #else 2424 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2425 mutex_exit(&qp->qp_srqhdl->srq_lock); 2426 } 2427 #endif 2428 2429 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist)) 2430 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq)) 2431 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist)) 2432 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq)) 2433 2434 tavor_wrid_wqhdr_unlock_both(qp); 2435 return (DDI_SUCCESS); 2436 } 2437 2438 2439 /* 2440 * tavor_wrid_to_reset_handling() 2441 * Context: Can be called from interrupt or base context. 2442 */ 2443 void 2444 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp) 2445 { 2446 uint_t free_wqhdr = 0; 2447 2448 /* 2449 * For each of this QP's Work Queues, move the WRID "container" to 2450 * the "reapable" list. Although there may still be unpolled 2451 * entries in these containers, it is not a big deal. We will not 2452 * reap the list until either the Poll CQ command detects an empty 2453 * condition or the CQ itself is freed. Grab the CQ lock(s) before 2454 * manipulating the lists.
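 * Note: the receive CQ's "cq_lock" is taken below as well, since in the
 * SRQ case tavor_cq_srq_entries_flush() operates on the CQ itself.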
2455 */ 2456 mutex_enter(&qp->qp_rq_cqhdl->cq_lock); 2457 tavor_wrid_wqhdr_lock_both(qp); 2458 tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr); 2459 2460 /* 2461 * Add the receive work queue header on to the reaplist. But if we are 2462 * on an SRQ, then don't add anything to the reaplist. Instead we flush 2463 * the SRQ entries on the CQ, remove the wridlist from the WQHDR, and free the 2464 * WQHDR (if needed). We must hold the WQL for these operations, yet 2465 * the call to tavor_cq_wqhdr_remove grabs the WQL internally. So we 2466 * drop the WQL before that call. Then release the CQ WQHDR locks and the 2467 * CQ lock and return. 2468 */ 2469 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) { 2470 2471 /* 2472 * Pull off all (if any) entries for this QP from CQ. This 2473 * only includes entries that have not yet been polled 2474 */ 2475 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock); 2476 tavor_cq_srq_entries_flush(state, qp); 2477 2478 /* Remove wridlist from WQHDR */ 2479 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr, 2480 qp->qp_rq_wqhdr->wq_wrid_post); 2481 2482 /* If wridlist chain is now empty, remove the wqhdr as well */ 2483 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) { 2484 free_wqhdr = 1; 2485 } else { 2486 free_wqhdr = 0; 2487 } 2488 2489 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock); 2490 2491 /* Free the WQHDR */ 2492 if (free_wqhdr) { 2493 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr); 2494 } 2495 } else { 2496 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr); 2497 } 2498 tavor_wrid_wqhdr_unlock_both(qp); 2499 mutex_exit(&qp->qp_rq_cqhdl->cq_lock); 2500 } 2501 2502 2503 /* 2504 * tavor_wrid_add_entry() 2505 * Context: Can be called from interrupt or base context. 2506 */ 2507 void 2508 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz, 2509 uint_t signaled_dbd) 2510 { 2511 tavor_wrid_entry_t *wre_tmp; 2512 uint32_t head, tail, size; 2513 2514 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock)); 2515 2516 /* 2517 * Find the entry in the container pointed to by the "tail" index. 2518 * Add all of the relevant information to that entry, including WRID, 2519 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled 2520 * and/or doorbelled. 2521 */ 2522 head = wq->wq_wrid_post->wl_head; 2523 tail = wq->wq_wrid_post->wl_tail; 2524 size = wq->wq_wrid_post->wl_size; 2525 wre_tmp = &wq->wq_wrid_post->wl_wre[tail]; 2526 wre_tmp->wr_wrid = wrid; 2527 wre_tmp->wr_wqeaddrsz = wqeaddrsz; 2528 wre_tmp->wr_signaled_dbd = signaled_dbd; 2529 2530 /* 2531 * Update the "wrid_old_tail" pointer to point to the entry we just 2532 * inserted into the queue. By tracking this pointer (the pointer to 2533 * the most recently inserted entry) it will be possible later in the 2534 * PostSend() and PostRecv() code paths to find the entry that needs 2535 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or 2536 * tavor_post_send()). 2537 */ 2538 wq->wq_wrid_post->wl_wre_old_tail = wre_tmp; 2539 2540 /* Update the tail index */ 2541 tail = ((tail + 1) & (size - 1)); 2542 wq->wq_wrid_post->wl_tail = tail; 2543 2544 /* 2545 * If the "tail" index has just wrapped over into the "head" index, 2546 * then we have filled the container. We use the "full" flag to 2547 * indicate this condition and to distinguish it from the "empty" 2548 * condition (where head and tail are also equal).
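 * Note: the "full" flag is cleared again on the consumer side, in
 * tavor_wrid_find_match(), as soon as an entry is pulled back out of
 * the container.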
2549 */ 2550 if (head == tail) { 2551 wq->wq_wrid_post->wl_full = 1; 2552 } 2553 } 2554 2555 /* 2556 * tavor_wrid_add_entry_srq() 2557 * Context: Can be called from interrupt or base context. 2558 */ 2559 void 2560 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd) 2561 { 2562 tavor_wrid_entry_t *wre; 2563 uint64_t *wl_wqe; 2564 uint32_t wqe_index; 2565 2566 /* 2567 * Find the next available WQE from the SRQ free_list. Then update the 2568 * free_list to point to the next entry. 2569 */ 2570 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx); 2571 2572 wqe_index = srq->srq_wridlist->wl_free_list_indx; 2573 2574 /* ASSERT on impossible wqe_index values */ 2575 ASSERT(wqe_index < srq->srq_wq_bufsz); 2576 2577 /* 2578 * Set up the WRE. 2579 * 2580 * Given the 'wqe_index' value, we store the WRID at this WRE offset. 2581 * And we set the WRE to be signaled_dbd so that on poll CQ we can find 2582 * this information and associate the WRID to the WQE found on the CQE. 2583 */ 2584 wre = &srq->srq_wridlist->wl_wre[wqe_index]; 2585 wre->wr_wrid = wrid; 2586 wre->wr_signaled_dbd = signaled_dbd; 2587 2588 /* Update the free list index */ 2589 srq->srq_wridlist->wl_free_list_indx = ddi_get32( 2590 srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe); 2591 } 2592 2593 2594 /* 2595 * tavor_wrid_get_entry() 2596 * Context: Can be called from interrupt or base context. 2597 */ 2598 uint64_t 2599 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe, 2600 tavor_wrid_entry_t *wre) 2601 { 2602 tavor_workq_hdr_t *wq; 2603 tavor_wrid_entry_t *wre_tmp; 2604 uint64_t wrid; 2605 uint_t send_or_recv, qpnum, error, opcode; 2606 2607 /* Lock the list of work queues associated with this CQ */ 2608 mutex_enter(&cq->cq_wrid_wqhdr_lock); 2609 2610 /* 2611 * Determine whether this CQE is a send or receive completion (and 2612 * whether it was a "successful" completion or not) 2613 */ 2614 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe); 2615 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) || 2616 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) { 2617 error = 1; 2618 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? 2619 TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV; 2620 } else { 2621 error = 0; 2622 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe); 2623 } 2624 2625 /* Find the work queue for this QP number (send or receive side) */ 2626 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe); 2627 wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv); 2628 ASSERT(wq != NULL); 2629 2630 /* 2631 * Regardless of whether the completion is the result of a "success" 2632 * or a "failure", we lock the list of "containers" and attempt to 2633 * search for the first matching completion (i.e. the first WR 2634 * with a matching WQE addr and size). Once we find it, we pull out 2635 * the "wrid" field and return it (see below). Note: One possible 2636 * future enhancement would be to enable this routine to skip over 2637 * any "unsignaled" completions to go directly to the next "signaled" 2638 * entry on success. XXX 2639 */ 2640 mutex_enter(&wq->wq_wrid_wql->wql_lock); 2641 wre_tmp = tavor_wrid_find_match(wq, cq, cqe); 2642 2643 /* 2644 * If this is a "successful" completion, then we assert that this 2645 * completion must be a "signaled" completion. 2646 */ 2647 ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED)); 2648 2649 /* 2650 * If the completion is a "failed" completion, then we save away the 2651 * contents of the entry (into the "wre" field passed in) for use 2652 * in later CQE processing.
Note: We use the tavor_wrid_get_wqeaddrsz() 2653 * function to grab "wqeaddrsz" from the next entry in the container. 2654 * This is required for error processing (where updating these fields 2655 * properly is necessary for correct handling of the "error" CQE). 2656 */ 2657 if (error && (wre != NULL)) { 2658 *wre = *wre_tmp; 2659 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq); 2660 } 2661 2662 /* Pull out the WRID and return it */ 2663 wrid = wre_tmp->wr_wrid; 2664 2665 mutex_exit(&wq->wq_wrid_wql->wql_lock); 2666 mutex_exit(&cq->cq_wrid_wqhdr_lock); 2667 2668 return (wrid); 2669 } 2670 2671 2672 /* 2673 * tavor_wrid_find_match() 2674 * Context: Can be called from interrupt or base context. 2675 */ 2676 static tavor_wrid_entry_t * 2677 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq, 2678 tavor_hw_cqe_t *cqe) 2679 { 2680 tavor_wrid_entry_t *curr = NULL; 2681 tavor_wrid_list_hdr_t *container; 2682 uint32_t wqeaddr_size; 2683 uint32_t head, tail, size; 2684 int found = 0, last_container; 2685 2686 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock)); 2687 2688 /* Pull the "wqeaddrsz" information from the CQE */ 2689 wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe); 2690 2691 /* 2692 * Walk the "containers" list(s), find the first WR with a matching WQE 2693 * addr. If the current "container" is not the last one on the list, 2694 * i.e. not the current one to which we are posting new WRID entries, 2695 * then we do not attempt to update the "q_head", "q_tail", and 2696 * "q_full" indicators on the main work queue header. We do, however, 2697 * update the "head" and "full" indicators on the individual containers 2698 * as we go. This is imperative because we need to be able to 2699 * determine when the current container has been emptied (so that we 2700 * can move on to the next container). 2701 */ 2702 container = wq->wq_wrid_poll; 2703 while (container != NULL) { 2704 /* Is this the last/only "container" on the list */ 2705 last_container = (container != wq->wq_wrid_post) ? 0 : 1; 2706 2707 /* 2708 * First check if we are on an SRQ. If so, we grab the entry 2709 * and break out. Since SRQ wridlists are never added to the 2710 * reaplist, they can only be the last container. 2711 */ 2712 if (container->wl_srq_en) { 2713 ASSERT(last_container == 1); 2714 curr = tavor_wrid_find_match_srq(container, cq, cqe); 2715 break; 2716 } 2717 2718 /* 2719 * Grab the current "head", "tail" and "size" fields before 2720 * walking the list in the current container. Note: the "size" 2721 * field here must always be a power-of-2. The "full" 2722 * parameter is checked (and updated) here to distinguish the 2723 * "queue full" condition from "queue empty". 2724 */ 2725 head = container->wl_head; 2726 tail = container->wl_tail; 2727 size = container->wl_size; 2728 while ((head != tail) || (container->wl_full)) { 2729 container->wl_full = 0; 2730 curr = &container->wl_wre[head]; 2731 head = ((head + 1) & (size - 1)); 2732 2733 /* 2734 * If the current entry's "wqeaddrsz" matches the one 2735 * we're searching for, then this must correspond to 2736 * the work request that caused the completion. Set 2737 * the "found" flag and bail out.
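 * (The "wqeaddrsz" value - the WQE's address combined with its size -
 * is assumed to be unique among the outstanding entries on a given
 * work queue; that assumption is what makes this first-match search
 * safe.)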
2738 */ 2739 if (curr->wr_wqeaddrsz == wqeaddr_size) { 2740 found = 1; 2741 break; 2742 } 2743 } 2744 2745 /* 2746 * If the current container is empty (having reached here the 2747 * "head == tail" condition can only mean that the container 2748 * is empty), then NULL out the "wrid_old_tail" field (see 2749 * tavor_post_send() and tavor_post_recv() for more details) 2750 * and (potentially) remove the current container from future 2751 * searches. 2752 */ 2753 if (head == tail) { 2754 2755 container->wl_wre_old_tail = NULL; 2756 /* 2757 * If this wasn't the last "container" on the chain, 2758 * i.e. the one to which new WRID entries will be 2759 * added, then remove it from the list. 2760 * Note: we don't "lose" the memory pointed to by this 2761 * container because we should have already put it 2762 * on the "reapable" list (from where it will later be 2763 * pulled). 2764 */ 2765 if (!last_container) { 2766 wq->wq_wrid_poll = container->wl_next; 2767 } 2768 } 2769 2770 /* Update the head index for the container */ 2771 container->wl_head = head; 2772 2773 /* 2774 * If the entry was found in this container, then we are done and 2775 * can bail out. Otherwise reset the "curr" pointer and move on to the 2776 * next container (if there is one). Note: the only real 2777 * reason for setting "curr = NULL" here is so that the ASSERT 2778 * below can catch the case where no matching entry was found 2779 * on any of the lists. 2780 */ 2781 if (found) { 2782 break; 2783 } else { 2784 curr = NULL; 2785 container = container->wl_next; 2786 } 2787 } 2788 2789 /* 2790 * Update work queue header's "head" and "full" conditions to match 2791 * the last entry on the container list. (Note: only if we're pulling 2792 * entries from the last portion of the list, i.e. not from 2793 * earlier portions that may be on the "reapable" list.) 2794 */ 2795 if (last_container) { 2796 wq->wq_head = wq->wq_wrid_post->wl_head; 2797 wq->wq_full = wq->wq_wrid_post->wl_full; 2798 } 2799 2800 /* Ensure that we've actually found what we were searching for */ 2801 ASSERT(curr != NULL); 2802 2803 return (curr); 2804 } 2805 2806 2807 /* 2808 * tavor_wrid_find_match_srq() 2809 * Context: Can be called from interrupt or base context. 2810 */ 2811 tavor_wrid_entry_t * 2812 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq, 2813 tavor_hw_cqe_t *cqe) 2814 { 2815 tavor_wrid_entry_t *wre; 2816 uint64_t *wl_wqe; 2817 uint32_t wqe_index; 2818 uint64_t wqe_addr; 2819 uint32_t cqe_wqe_addr; 2820 2821 /* Grab the WQE addr out of the CQE */ 2822 cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0; 2823 2824 /* 2825 * Use the WQE addr from the CQE as the lower 32 bits; we add back on the 2826 * 'wl_srq_desc_off' because we have a zero-based queue. OR-ing on the 2827 * upper 32 bits of 'wl_srq_wq_buf' then gives us the WQE addr in 2828 * the SRQ Work Queue itself. We use this address as the index to find 2829 * out which Work Queue Entry this CQE corresponds with. 2830 * 2831 * We also use this address below to add the WQE back on to the free 2832 * list. 2833 */ 2834 wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) | 2835 (cqe_wqe_addr + wl->wl_srq_desc_off); 2836 2837 /* 2838 * Given the 'wqe_addr' just calculated and the srq buf address, we 2839 * find the 'wqe_index'. The 'wre' returned below contains the WRID 2840 * that we are looking for. This indexes into the wre_list for this 2841 * specific WQE.
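 * (Sketch of the index math, assuming TAVOR_SRQ_WQE_INDEX is the usual
 * shift-based calculation:
 *     wqe_index = (wqe_addr - (uintptr_t)wl_srq_wq_buf) >>
 *         wl_srq_log_wqesz;
 * The ASSERT below then guards against impossible results.)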
2842 */ 2843 wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr, 2844 wl->wl_srq_log_wqesz); 2845 2846 /* ASSERT on impossible wqe_index values */ 2847 ASSERT(wqe_index < wl->wl_srq_wq_bufsz); 2848 2849 /* Get the pointer to this WQE */ 2850 wl_wqe = (uint64_t *)(uintptr_t)wqe_addr; 2851 2852 /* Put this WQE index back on the free list */ 2853 ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx); 2854 wl->wl_free_list_indx = wqe_index; 2855 2856 /* Using the index, return the Work Request ID Entry (wre) */ 2857 wre = &wl->wl_wre[wqe_index]; 2858 2859 return (wre); 2860 } 2861 2862 2863 /* 2864 * tavor_wrid_cq_reap() 2865 * Context: Can be called from interrupt or base context. 2866 */ 2867 void 2868 tavor_wrid_cq_reap(tavor_cqhdl_t cq) 2869 { 2870 tavor_workq_hdr_t *consume_wqhdr; 2871 tavor_wrid_list_hdr_t *container, *to_free; 2872 2873 ASSERT(MUTEX_HELD(&cq->cq_lock)); 2874 2875 /* Lock the list of work queues associated with this CQ */ 2876 mutex_enter(&cq->cq_wrid_wqhdr_lock); 2877 2878 /* Walk the "reapable" list and free up containers */ 2879 container = cq->cq_wrid_reap_head; 2880 while (container != NULL) { 2881 to_free = container; 2882 container = container->wl_reap_next; 2883 /* 2884 * If reaping the WRID list containers pulls the last 2885 * container from the given work queue header, then we free 2886 * the work queue header as well. 2887 */ 2888 consume_wqhdr = tavor_wrid_list_reap(to_free); 2889 if (consume_wqhdr != NULL) { 2890 tavor_cq_wqhdr_remove(cq, consume_wqhdr); 2891 } 2892 } 2893 2894 /* Once finished reaping, we reset the CQ's reap list */ 2895 cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL; 2896 2897 mutex_exit(&cq->cq_wrid_wqhdr_lock); 2898 } 2899 2900 2901 /* 2902 * tavor_wrid_cq_force_reap() 2903 * Context: Can be called from interrupt or base context. 2904 */ 2905 void 2906 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq) 2907 { 2908 tavor_workq_hdr_t *curr; 2909 tavor_wrid_list_hdr_t *container, *to_free; 2910 avl_tree_t *treep; 2911 void *cookie = NULL; 2912 2913 ASSERT(MUTEX_HELD(&cq->cq_lock)); 2914 2915 /* 2916 * The first step is to walk the "reapable" list and free up those 2917 * containers. This is necessary because the containers on the 2918 * reapable list are not otherwise connected to the work queue headers 2919 * anymore. 2920 */ 2921 tavor_wrid_cq_reap(cq); 2922 2923 /* Now lock the list of work queues associated with this CQ */ 2924 mutex_enter(&cq->cq_wrid_wqhdr_lock); 2925 2926 /* 2927 * Walk the list of work queue headers and free up all the WRID list 2928 * containers chained to it. Note: We don't need to grab the locks 2929 * for each of the individual WRID lists here because the only way 2930 * things could be added to or removed from the list at this point would be 2931 * through posting a work request to a QP. But if we've come this far, 2932 * then we can be assured that there are no longer any QPs associated 2933 * with the CQ that we are trying to free. 2934 */ 2935 #ifdef __lock_lint 2936 tavor_wrid_wqhdr_compare(NULL, NULL); 2937 #endif 2938 treep = &cq->cq_wrid_wqhdr_avl_tree; 2939 while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) { 2940 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr)) 2941 container = curr->wq_wrid_poll; 2942 while (container != NULL) { 2943 to_free = container; 2944 container = container->wl_next; 2945 /* 2946 * If reaping the WRID list containers pulls the last 2947 * container from the given work queue header, then 2948 * we free the work queue header as well.
Note: we 2949 * ignore the return value because we know that the 2950 * work queue header should always be freed once the 2951 * list of containers has come to an end. 2952 */ 2953 (void) tavor_wrid_list_reap(to_free); 2954 if (container == NULL) { 2955 tavor_cq_wqhdr_remove(cq, curr); 2956 } 2957 } 2958 } 2959 avl_destroy(treep); 2960 2961 mutex_exit(&cq->cq_wrid_wqhdr_lock); 2962 } 2963 2964 2965 /* 2966 * tavor_wrid_get_list() 2967 * Context: Can be called from interrupt or base context. 2968 */ 2969 tavor_wrid_list_hdr_t * 2970 tavor_wrid_get_list(uint32_t qsize) 2971 { 2972 tavor_wrid_list_hdr_t *wridlist; 2973 uint32_t size; 2974 2975 /* 2976 * The WRID list "container" consists of the tavor_wrid_list_hdr_t, 2977 * which holds the pointers necessary for maintaining the "reapable" 2978 * list, chaining together multiple "containers" old and new, and 2979 * tracking the head, tail, size, etc. for each container. 2980 * 2981 * The "container" also holds all the tavor_wrid_entry_t's, which are 2982 * allocated separately, one for each entry on the corresponding work 2983 * queue. 2984 */ 2985 size = sizeof (tavor_wrid_list_hdr_t); 2986 2987 /* 2988 * Note that this allocation has to be a NOSLEEP operation here 2989 * because we are holding the "cq_wrid_wqhdr_lock" and, therefore, 2990 * could get raised to the interrupt level. 2991 */ 2992 wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP); 2993 if (wridlist == NULL) { 2994 return (NULL); 2995 } 2996 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist)) 2997 2998 /* Complete the "container" initialization */ 2999 wridlist->wl_size = qsize; 3000 wridlist->wl_full = 0; 3001 wridlist->wl_head = 0; 3002 wridlist->wl_tail = 0; 3003 wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize * 3004 sizeof (tavor_wrid_entry_t), KM_NOSLEEP); 3005 if (wridlist->wl_wre == NULL) { 3006 kmem_free(wridlist, size); 3007 return (NULL); 3008 } 3009 wridlist->wl_wre_old_tail = NULL; 3010 wridlist->wl_reap_next = NULL; 3011 wridlist->wl_next = NULL; 3012 wridlist->wl_prev = NULL; 3013 wridlist->wl_srq_en = 0; 3014 3015 return (wridlist); 3016 } 3017 3018 /* 3019 * tavor_wrid_list_srq_init() 3020 * Context: Can be called from interrupt or base context. 3021 */ 3022 void 3023 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq, 3024 uint_t wq_start) 3025 { 3026 uint64_t *wl_wqe; 3027 int wqe_index; 3028 3029 ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock)); 3030 3031 /* Setup pointers for use later when we are polling the CQ */ 3032 wridlist->wl_srq_wq_buf = srq->srq_wq_buf; 3033 wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz; 3034 wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz; 3035 wridlist->wl_srq_desc_off = srq->srq_desc_off; 3036 wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl; 3037 3038 /* Verify that 'wq_start', the index at which we begin initializing the buffer, is sane */ 3039 ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz); 3040 3041 /* 3042 * Initialize wridlist free list 3043 * 3044 * For each WQE up to the size of our queue, we store an index in the WQ 3045 * memory itself, representing the next available free entry. The 3046 * 'wl_free_list_indx' always holds the index of the next available 3047 * free entry in the WQ. If 'wl_free_list_indx' is -1, then we are 3048 * completely full. This gives us the advantage of being able to have 3049 * entries complete or be polled off the WQ out-of-order. 3050 * 3051 * For now, we write the free_list entries inside the WQ itself.
It 3052 * may be useful in the future to store this information in a separate 3053 * structure for debugging purposes. 3054 */ 3055 for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) { 3056 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index); 3057 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe, 3058 wridlist->wl_free_list_indx); 3059 wridlist->wl_free_list_indx = wqe_index; 3060 } 3061 } 3062 3063 3064 /* 3065 * tavor_wrid_reaplist_add() 3066 * Context: Can be called from interrupt or base context. 3067 */ 3068 static void 3069 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq) 3070 { 3071 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3072 3073 mutex_enter(&wq->wq_wrid_wql->wql_lock); 3074 3075 /* 3076 * Add the "post" container (the last one on the current chain) to 3077 * the CQ's "reapable" list 3078 */ 3079 if ((cq->cq_wrid_reap_head == NULL) && 3080 (cq->cq_wrid_reap_tail == NULL)) { 3081 cq->cq_wrid_reap_head = wq->wq_wrid_post; 3082 cq->cq_wrid_reap_tail = wq->wq_wrid_post; 3083 } else { 3084 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post; 3085 cq->cq_wrid_reap_tail = wq->wq_wrid_post; 3086 } 3087 3088 mutex_exit(&wq->wq_wrid_wql->wql_lock); 3089 } 3090 3091 3092 int 3093 tavor_wrid_wqhdr_compare(const void *p1, const void *p2) 3094 { 3095 tavor_workq_compare_t *cmpp; 3096 tavor_workq_hdr_t *curr; 3097 3098 cmpp = (tavor_workq_compare_t *)p1; 3099 curr = (tavor_workq_hdr_t *)p2; 3100 3101 if (cmpp->cmp_qpn < curr->wq_qpn) 3102 return (-1); 3103 else if (cmpp->cmp_qpn > curr->wq_qpn) 3104 return (+1); 3105 else if (cmpp->cmp_type < curr->wq_type) 3106 return (-1); 3107 else if (cmpp->cmp_type > curr->wq_type) 3108 return (+1); 3109 else 3110 return (0); 3111 } 3112 3113 3114 /* 3115 * tavor_wrid_wqhdr_find() 3116 * Context: Can be called from interrupt or base context. 3117 */ 3118 static tavor_workq_hdr_t * 3119 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type) 3120 { 3121 tavor_workq_hdr_t *curr; 3122 tavor_workq_compare_t cmp; 3123 3124 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3125 3126 /* 3127 * Search the CQ's work queue headers (kept on an AVL tree keyed by 3128 * QP number and work queue type) for a send or recv queue header 3129 * matching the given QP number. A NULL return tells the caller that 3130 * no matching header exists yet. 3131 */ 3132 cmp.cmp_qpn = qpn; 3133 cmp.cmp_type = wq_type; 3134 #ifdef __lock_lint 3135 tavor_wrid_wqhdr_compare(NULL, NULL); 3136 #endif 3137 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL); 3138 3139 return (curr); 3140 } 3141 3142 3143 /* 3144 * tavor_wrid_wqhdr_create() 3145 * Context: Can be called from interrupt or base context. 3146 */ 3147 static tavor_workq_hdr_t * 3148 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn, 3149 uint_t wq_type, uint_t create_wql) 3150 { 3151 tavor_workq_hdr_t *wqhdr_tmp; 3152 3153 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3154 3155 /* 3156 * Allocate space for a work queue header structure and initialize it. 3157 * Each work queue header structure includes a "wq_wrid_wql" 3158 * which needs to be initialized. Note that this allocation has to be 3159 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock" 3160 * and, therefore, could get raised to the interrupt level.
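 * A failed KM_NOSLEEP allocation is simply returned to the caller as
 * NULL; callers such as tavor_wrid_from_reset_handling() unwind their
 * earlier allocations and return ibc_get_ci_failure(0) in that case.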
3161 */ 3162 wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc( 3163 sizeof (tavor_workq_hdr_t), KM_NOSLEEP); 3164 if (wqhdr_tmp == NULL) { 3165 return (NULL); 3166 } 3167 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp)) 3168 wqhdr_tmp->wq_qpn = qpn; 3169 wqhdr_tmp->wq_type = wq_type; 3170 3171 if (create_wql) { 3172 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state); 3173 if (wqhdr_tmp->wq_wrid_wql == NULL) { 3174 kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t)); 3175 return (NULL); 3176 } 3177 } 3178 3179 wqhdr_tmp->wq_wrid_poll = NULL; 3180 wqhdr_tmp->wq_wrid_post = NULL; 3181 3182 /* Chain the newly allocated work queue header to the CQ's list */ 3183 tavor_cq_wqhdr_add(cq, wqhdr_tmp); 3184 3185 return (wqhdr_tmp); 3186 } 3187 3188 3189 /* 3190 * tavor_wrid_wql_create() 3191 * Context: Can be called from interrupt or base context. 3192 */ 3193 tavor_wq_lock_t * 3194 tavor_wrid_wql_create(tavor_state_t *state) 3195 { 3196 tavor_wq_lock_t *wql; 3197 3198 /* 3199 * Allocate the WQL and initialize it. 3200 */ 3201 wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP); 3202 if (wql == NULL) { 3203 return (NULL); 3204 } 3205 3206 mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER, 3207 DDI_INTR_PRI(state->ts_intrmsi_pri)); 3208 3209 /* Add refcount to WQL */ 3210 tavor_wql_refcnt_inc(wql); 3211 3212 return (wql); 3213 } 3214 3215 3216 /* 3217 * tavor_wrid_get_wqeaddrsz() 3218 * Context: Can be called from interrupt or base context. 3219 */ 3220 static uint32_t 3221 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq) 3222 { 3223 tavor_wrid_entry_t *wre; 3224 uint32_t wqeaddrsz; 3225 uint32_t head; 3226 3227 /* 3228 * If the container is empty, then there is no next entry. So just 3229 * return zero. Note: the "head == tail" condition here can only 3230 * mean that the container is empty because we have previously pulled 3231 * something from the container. 3232 * 3233 * If the container is not empty, then find the next entry and return 3234 * the contents of its "wqeaddrsz" field. 3235 */ 3236 if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) { 3237 wqeaddrsz = 0; 3238 } else { 3239 /* 3240 * We don't need to calculate the "next" head pointer here 3241 * because "head" should already point to the next entry on 3242 * the list (since we just pulled something off - in 3243 * tavor_wrid_find_match() - and moved the head index forward.) 3244 */ 3245 head = wq->wq_wrid_poll->wl_head; 3246 wre = &wq->wq_wrid_poll->wl_wre[head]; 3247 wqeaddrsz = wre->wr_wqeaddrsz; 3248 } 3249 return (wqeaddrsz); 3250 } 3251 3252 3253 /* 3254 * tavor_wrid_wqhdr_add() 3255 * Context: Can be called from interrupt or base context. 3256 */ 3257 static void 3258 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr, 3259 tavor_wrid_list_hdr_t *wridlist) 3260 { 3261 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock)); 3262 3263 /* Chain the new WRID list "container" to the work queue list */ 3264 if ((wqhdr->wq_wrid_post == NULL) && 3265 (wqhdr->wq_wrid_poll == NULL)) { 3266 wqhdr->wq_wrid_poll = wridlist; 3267 wqhdr->wq_wrid_post = wridlist; 3268 } else { 3269 wqhdr->wq_wrid_post->wl_next = wridlist; 3270 wridlist->wl_prev = wqhdr->wq_wrid_post; 3271 wqhdr->wq_wrid_post = wridlist; 3272 } 3273 } 3274 3275 3276 /* 3277 * tavor_wrid_wqhdr_remove() 3278 * Context: Can be called from interrupt or base context. 3279 * 3280 * Note: this is only called to remove the most recently added WRID list 3281 * container (i.e. 
in tavor_wrid_from_reset_handling() above) 3282 */ 3283 static void 3284 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr, 3285 tavor_wrid_list_hdr_t *wridlist) 3286 { 3287 tavor_wrid_list_hdr_t *prev, *next; 3288 3289 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock)); 3290 3291 /* Unlink the WRID list "container" from the work queue list */ 3292 prev = wridlist->wl_prev; 3293 next = wridlist->wl_next; 3294 if (prev != NULL) { 3295 prev->wl_next = next; 3296 } 3297 if (next != NULL) { 3298 next->wl_prev = prev; 3299 } 3300 3301 /* 3302 * Update any pointers in the work queue hdr that may point to this 3303 * WRID list container 3304 */ 3305 if (wqhdr->wq_wrid_post == wridlist) { 3306 wqhdr->wq_wrid_post = prev; 3307 } 3308 if (wqhdr->wq_wrid_poll == wridlist) { 3309 wqhdr->wq_wrid_poll = NULL; 3310 } 3311 } 3312 3313 3314 /* 3315 * tavor_wrid_list_reap() 3316 * Context: Can be called from interrupt or base context. 3317 * Note: The "cq_wrid_wqhdr_lock" must be held. 3318 */ 3319 static tavor_workq_hdr_t * 3320 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist) 3321 { 3322 tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL; 3323 tavor_wrid_list_hdr_t *prev, *next; 3324 uint32_t size; 3325 3326 /* Get the back pointer to the work queue header (see below) */ 3327 wqhdr = wridlist->wl_wqhdr; 3328 mutex_enter(&wqhdr->wq_wrid_wql->wql_lock); 3329 3330 /* Unlink the WRID list "container" from the work queue list */ 3331 prev = wridlist->wl_prev; 3332 next = wridlist->wl_next; 3333 if (prev != NULL) { 3334 prev->wl_next = next; 3335 } 3336 if (next != NULL) { 3337 next->wl_prev = prev; 3338 } 3339 3340 /* 3341 * If the back pointer to the work queue header shows that it 3342 * was pointing to the entry we are about to remove, then the work 3343 * queue header is reapable as well. 3344 */ 3345 if ((wqhdr->wq_wrid_poll == wridlist) && 3346 (wqhdr->wq_wrid_post == wridlist)) { 3347 consume_wqhdr = wqhdr; 3348 } 3349 3350 /* Be sure to update the "poll" and "post" container pointers */ 3351 if (wqhdr->wq_wrid_poll == wridlist) { 3352 wqhdr->wq_wrid_poll = next; 3353 } 3354 if (wqhdr->wq_wrid_post == wridlist) { 3355 wqhdr->wq_wrid_post = NULL; 3356 } 3357 3358 /* Calculate the size and free the container */ 3359 size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t)); 3360 kmem_free(wridlist->wl_wre, size); 3361 kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t)); 3362 3363 mutex_exit(&wqhdr->wq_wrid_wql->wql_lock); 3364 3365 return (consume_wqhdr); 3366 } 3367 3368 3369 /* 3370 * tavor_wrid_wqhdr_lock_both() 3371 * Context: Can be called from interrupt or base context. 3372 */ 3373 static void 3374 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp) 3375 { 3376 tavor_cqhdl_t sq_cq, rq_cq; 3377 3378 sq_cq = qp->qp_sq_cqhdl; 3379 rq_cq = qp->qp_rq_cqhdl; 3380 3381 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock)) 3382 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock)) 3383 3384 /* 3385 * If both work queues (send and recv) share a completion queue, then 3386 * grab the common lock. If they use different CQs (hence different 3387 * "cq_wrid_wqhdr_lock" locks), then grab the send one first, then the 3388 * receive. We do this consistently and correctly in 3389 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind 3390 * of deadlock condition. Note: We add the "__lock_lint" code here 3391 * to fake out warlock into thinking we've grabbed both locks (when, 3392 * in fact, we only needed the one).
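 * The send-then-receive acquisition order used here must be paired
 * with the receive-then-send release order in
 * tavor_wrid_wqhdr_unlock_both() below; keeping the acquisition order
 * consistent across all threads is what actually prevents deadlock.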
3393 */ 3394 if (sq_cq == rq_cq) { 3395 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock); 3396 #ifdef __lock_lint 3397 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock); 3398 #endif 3399 } else { 3400 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock); 3401 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock); 3402 } 3403 } 3404 3405 /* 3406 * tavor_wrid_wqhdr_unlock_both() 3407 * Context: Can be called from interrupt or base context. 3408 */ 3409 static void 3410 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp) 3411 { 3412 tavor_cqhdl_t sq_cq, rq_cq; 3413 3414 sq_cq = qp->qp_sq_cqhdl; 3415 rq_cq = qp->qp_rq_cqhdl; 3416 3417 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock)) 3418 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock)) 3419 3420 /* 3421 * See tavor_wrid_wqhdr_lock_both() above for more detail 3422 */ 3423 if (sq_cq == rq_cq) { 3424 #ifdef __lock_lint 3425 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock); 3426 #endif 3427 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock); 3428 } else { 3429 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock); 3430 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock); 3431 } 3432 } 3433 3434 3435 /* 3436 * tavor_cq_wqhdr_add() 3437 * Context: Can be called from interrupt or base context. 3438 */ 3439 static void 3440 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr) 3441 { 3442 tavor_workq_compare_t cmp; 3443 avl_index_t where; 3444 3445 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3446 3447 cmp.cmp_qpn = wqhdr->wq_qpn; 3448 cmp.cmp_type = wqhdr->wq_type; 3449 #ifdef __lock_lint 3450 tavor_wrid_wqhdr_compare(NULL, NULL); 3451 #endif 3452 (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where); 3453 /* 3454 * Insert the new work queue header into the CQ's AVL tree at the 3455 * position ("where") located by the avl_find() just above. 3456 */ 3457 avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where); 3458 } 3459 3460 3461 /* 3462 * tavor_cq_wqhdr_remove() 3463 * Context: Can be called from interrupt or base context. 3464 */ 3465 static void 3466 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr) 3467 { 3468 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock)); 3469 3470 #ifdef __lock_lint 3471 tavor_wrid_wqhdr_compare(NULL, NULL); 3472 #endif 3473 /* Remove "wqhdr" from the work queue header list on "cq" */ 3474 avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr); 3475 3476 /* 3477 * Release reference to WQL; If this is the last reference, this call 3478 * also has the side effect of freeing up the 'wq_wrid_wql' memory. 3479 */ 3480 tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql); 3481 3482 /* Free the memory associated with "wqhdr" */ 3483 kmem_free(wqhdr, sizeof (tavor_workq_hdr_t)); 3484 } 3485 3486 3487 /* 3488 * tavor_wql_refcnt_inc() 3489 * Context: Can be called from interrupt or base context 3490 */ 3491 void 3492 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql) 3493 { 3494 ASSERT(wql != NULL); 3495 3496 mutex_enter(&wql->wql_lock); 3497 wql->wql_refcnt++; 3498 mutex_exit(&wql->wql_lock); 3499 } 3500 3501 /* 3502 * tavor_wql_refcnt_dec() 3503 * Context: Can be called from interrupt or base context 3504 */ 3505 void 3506 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql) 3507 { 3508 int refcnt; 3509 3510 ASSERT(wql != NULL); 3511 3512 mutex_enter(&wql->wql_lock); 3513 wql->wql_refcnt--; 3514 refcnt = wql->wql_refcnt; 3515 mutex_exit(&wql->wql_lock); 3516 3517 /* 3518 * Free up WQL memory if we're the last one associated with this 3519 * structure. 3520 */ 3521 if (refcnt == 0) { 3522 mutex_destroy(&wql->wql_lock); 3523 kmem_free(wql, sizeof (tavor_wq_lock_t)); 3524 } 3525 } 3526
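/*
 * Usage sketch (illustrative only, not compiled into the driver): the
 * WQL reference counting above pairs up as follows for a hypothetical
 * SRQ whose WQL is shared with a single QP:
 *
 *	wql = tavor_wrid_wql_create(state);	refcnt == 1 (creator)
 *	tavor_wql_refcnt_inc(wql);		refcnt == 2 (SRQ share)
 *	tavor_wql_refcnt_dec(wql);		refcnt == 1
 *	tavor_wql_refcnt_dec(wql);		refcnt == 0; the mutex is
 *						destroyed and the WQL freed
 */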