/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file may contain confidential information of
 * Mellanox Technologies, Ltd. and should not be distributed in source
 * form without approval from Sun Legal.
 */

#include "dapl.h"
#include "dapl_tavor_hw.h"
#include "dapl_tavor_wr.h"
#include "dapl_tavor_ibtf_impl.h"

/*
 * Function signatures
 */
extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
    uint_t, uint_t, dapls_tavor_wrid_entry_t *);
extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
extern DAPL_OS_LOCK g_tavor_uar_lock;

#ifndef _LP64
extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
#endif

static int dapli_tavor_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_send_linknext(ibt_send_wr_t *, uint64_t *,
    boolean_t, uint32_t, uint_t, uint64_t *, tavor_sw_wqe_dbinfo_t *);
static DAT_RETURN dapli_tavor_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_recv_linknext(uint64_t *, boolean_t, uint32_t,
    uint_t, uint64_t *);
static int dapli_tavor_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);
static int dapli_tavor_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);

/* exported to other HCAs */
extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
    uint32_t, uint_t);
extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);

/*
 * Note: The 64 bit doorbells need to be written atomically.
 * In 32 bit libraries we need to use the special assembly routine
 * because compiler-generated code splits the write into two word writes.
 */

#if defined(_LP64) || defined(__lint)
/* use a macro to ensure inlining on S10 amd64 compiler */
#define dapli_tavor_cq_doorbell(ia_uar, cq_cmd, cqn, cq_param)    \
    ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(                    \
        ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |              \
        ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param)
#else

/*
 * dapli_tavor_cq_doorbell()
 * Takes the specified cq cmd and cq number and rings the cq doorbell
 */
static void
dapli_tavor_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
        ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
    /* 32 bit version */
#elif defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32 bit intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->cq[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->cq[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->cq);
#endif
}
#pragma inline(dapli_tavor_cq_doorbell)

#endif /* _LP64 */

#if defined(_LP64) || defined(__lint)
#define dapli_tavor_qp_send_doorbell(ia_uar, nda, nds, qpn, fence, nopcode) \
    ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(                            \
        (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<                        \
        TAVOR_QPSNDDB_NDA_SHIFT) |                                          \
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |                        \
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |                \
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds)
#else

/*
 * dapli_tavor_qp_send_doorbell()
 * Takes the specified next descriptor information, qp number, opcode and
 * rings the send doorbell
 */
static void
dapli_tavor_qp_send_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
        TAVOR_QPSNDDB_NDA_SHIFT) |
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
#else
#if defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32 bit intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->send[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->send[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->send);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_send_doorbell)
#endif /* _LP64 */
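
/*
 * Illustrative note (not part of the driver): why the 64-bit UAR write
 * must not tear.  If a 32-bit compiler lowered the assignment into two
 * independent word stores, e.g.
 *
 *     ((uint32_t *)&uar->send)[0] = hi;
 *     ((uint32_t *)&uar->send)[1] = lo;    <- HCA could sample here
 *
 * the HCA could observe a half-written doorbell.  Hence the LP64 path
 * uses a single 64-bit store, the i386 path serializes both word stores
 * under g_tavor_uar_lock (high word first, matching the Tavor PRM's
 * lower-to-upper address order), and other 32-bit platforms use the
 * dapls_atomic_assign_64() assembly routine.
 */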

#if defined(_LP64) || defined(__lint)

#define dapli_tavor_qp_recv_doorbell(ia_uar, nda, nds, qpn, credits) \
    ((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(                     \
        (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<                 \
        TAVOR_QPRCVDB_NDA_SHIFT) |                                   \
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |                 \
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits)
#else

/*
 * dapli_tavor_qp_recv_doorbell()
 * Takes the specified next descriptor information, qp number and
 * rings the recv doorbell
 */
static void
dapli_tavor_qp_recv_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
        TAVOR_QPRCVDB_NDA_SHIFT) |
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(doorbell);
#else
#if defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32 bit intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->recv[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->recv[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->recv);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_recv_doorbell)
#endif /* _LP64 */


/*
 * dapls_tavor_max_inline()
 * Return the max inline value that should be used.
 * Env variable DAPL_MAX_INLINE can override the default.
 * If it's not set (or set to -1), default behavior is used.
 * If it's zero or negative (except -1) inline is not done.
 */
int
dapls_tavor_max_inline(void)
{
    static int max_inline_env = -2;

    /* Check the env exactly once, otherwise return previous value. */
    if (max_inline_env != -2)
        return (max_inline_env);

    max_inline_env = dapl_os_get_env_val("DAPL_MAX_INLINE", -1);
    if (max_inline_env != -1)
        if (max_inline_env <= 0)
            max_inline_env = 0;    /* no inlining */
    return (max_inline_env);
}
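
/*
 * Example (illustrative): DAPL_MAX_INLINE unset, or set to -1, leaves
 * the default heuristic below in effect; DAPL_MAX_INLINE=0 (or any
 * other zero/negative value) disables inlining entirely; and, say,
 * DAPL_MAX_INLINE=64 caps inline data at 64 bytes.
 */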

/*
 * dapls_ib_max_request_iov(), aka, max send sgl size.
 * The send queue's scatter/gather list is used for "inline" data.
 *
 * By default, compute reasonable send queue size based on #iovs, #wqes,
 * max_iovs, and max inline byte count.  If the #wqes is large, then we
 * limit how much the SGL (space for inline data) can take.  The heuristic
 * is to increase the memory for the send queue to a maximum of 32KB:
 *
 *    < 128 wqes    increase to at most 256 minus header
 *    < 256 wqes    increase to at most 128 minus header
 *    >= 256 wqes   use SGL unaltered
 *
 * If the env is supplied (max_inline >= 0), use it without checking.
 */
int
dapls_ib_max_request_iov(int iovs, int wqes, int max_iovs,
    int max_inline_bytes)
{
    int ret_iovs;

    if (max_inline_bytes > 0) {
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else if (wqes < 128) {
        max_inline_bytes = 256 - TAVOR_INLINE_HEADER_SIZE_MAX;
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else if (wqes < 256) {
        max_inline_bytes = 128 - TAVOR_INLINE_HEADER_SIZE_MAX;
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else {
        ret_iovs = iovs;
    }

    if (ret_iovs > max_iovs)    /* do not exceed max */
        ret_iovs = max_iovs;
    if (iovs > ret_iovs)        /* never decrease iovs */
        ret_iovs = iovs;
    return (ret_iovs);
}
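
/*
 * Worked example of the heuristic above.  The numbers are illustrative
 * only: they assume a 16-byte tavor_hw_wqe_sgl_t and a hypothetical
 * TAVOR_INLINE_HEADER_SIZE_MAX of 32:
 *
 *     dapls_ib_max_request_iov(4, 64, 27, 0)
 *         wqes < 128, so max_inline_bytes = 256 - 32 = 224
 *         ret_iovs = 224 / 16 = 14    (grown from 4, still below 27)
 *
 * The final two checks then clamp the result to max_iovs and make sure
 * the caller never gets fewer iovs than it asked for.
 */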

/*
 * dapli_tavor_wqe_send_build()
 * Constructs a WQE for a given ibt_send_wr_t
 */
static int
dapli_tavor_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
    tavor_hw_snd_wqe_remaddr_t *rc;
    tavor_hw_snd_wqe_bind_t *bn;
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t *sgl;
    uint32_t nds;
    uint32_t len, total_len;
    uint32_t tavor_num_mpt_mask;
    uint32_t new_rkey;
    uint32_t old_rkey;
    int i, num_ds;
    int max_inline_bytes = -1;

    nds = wr->wr_nds;
    sgl = wr->wr_sgl;
    num_ds = 0;

    /*
     * RC is the only supported transport in UDAPL.
     * For RC requests we allow "Send", "RDMA Read" and "RDMA Write".
     */
    switch (wr->wr_opcode) {
    case IBT_WRC_SEND:
        /*
         * If this is a Send request, then all we need is
         * the Data Segment processing below.
         * Initialize the information for the Data Segments.
         */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));
        if (qp->qp_sq_inline != 0)
            max_inline_bytes =
                qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
        break;
    case IBT_WRC_RDMAW:
        if (qp->qp_sq_inline != 0)
            max_inline_bytes =
                qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
        /* FALLTHROUGH */
    case IBT_WRC_RDMAR:
        if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
            qp->qp_sq_inline = 0;
        /*
         * If this is an RDMA Read or RDMA Write request, then fill
         * in the "Remote Address" header fields.
         */
        rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));

        /*
         * Build the Remote Address Segment for the WQE, using
         * the information from the RC work request.
         */
        TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);

        /* Update "ds" for filling in Data Segments (below) */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
            sizeof (tavor_hw_snd_wqe_remaddr_t));
        break;
    case IBT_WRC_BIND:
        /*
         * Generate a new R_key.  Increment the upper "unconstrained"
         * bits and keep the lower "constrained" bits the same; they
         * represent the MPT index.
         */
        old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
        tavor_num_mpt_mask = (uint32_t)(1 << qp->qp_num_mpt_shift) - 1;
        new_rkey = (old_rkey >> qp->qp_num_mpt_shift);
        new_rkey++;
        new_rkey = ((new_rkey << qp->qp_num_mpt_shift) |
            (old_rkey & tavor_num_mpt_mask));

        wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
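
        /*
         * Worked example (hypothetical qp_num_mpt_shift of 8, for
         * illustration only): old_rkey 0x01020304 has MPT index 0x04
         * in its low 8 bits.  The upper 24 bits 0x010203 increment to
         * 0x010204, giving new_rkey 0x01020404: a fresh key value that
         * still resolves to the same MPT entry.
         */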
"event") bit if notification is needed */ 482 if (!ns) { 483 ctrl = ctrl | TAVOR_WQE_RCV_EVENT_MASK; 484 } 485 486 /* 487 * The "i" bit is unused since uDAPL doesn't support 488 * the immediate data 489 */ 490 491 /* initialize the ctrl and next fields of the current descriptor */ 492 TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next); 493 494 /* 495 * Calculate the "next" field of the prev descriptor. This amounts 496 * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds" 497 * fields (see tavor_hw.h for more). 498 */ 499 500 /* 501 * Determine the value for the Tavor WQE "nopcode" field 502 * by using the IBTF opcode from the work request 503 */ 504 switch (curr_wr->wr_opcode) { 505 case IBT_WRC_RDMAW: 506 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW; 507 break; 508 509 case IBT_WRC_SEND: 510 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND; 511 break; 512 513 case IBT_WRC_RDMAR: 514 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR; 515 break; 516 517 case IBT_WRC_BIND: 518 nopcode = TAVOR_WQE_SEND_NOPCODE_BIND; 519 break; 520 default: 521 /* Unsupported opcodes in UDAPL */ 522 dapl_dbg_log(DAPL_DBG_TYPE_ERR, 523 "dapli_tavor_wqe_send_linknext: invalid nopcode=%d\n", 524 nopcode); 525 return; 526 } 527 528 next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32; 529 next = next | ((uint64_t)nopcode << 32); 530 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0; 531 if (fence) { 532 next = next | TAVOR_WQE_SEND_FENCE_MASK; 533 } 534 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK); 535 536 /* 537 * A send queue doorbell will be rung for the next 538 * WQE on the chain, set the current WQE's "dbd" bit. 539 * Note: We also update the "dbinfo" structure here to pass 540 * back information about what should (later) be included 541 * in the send queue doorbell. 542 */ 543 next = next | TAVOR_WQE_DBD_MASK; 544 dbinfo->db_nopcode = nopcode; 545 dbinfo->db_fence = fence; 546 547 /* 548 * Send queue doorbell will be rung for the next WQE on 549 * the chain, update the prev WQE's "next" field and return. 550 */ 551 if (prev_addr != NULL) { 552 TAVOR_WQE_LINKFIRST(prev_addr, next); 553 } 554 } 555 556 557 /* 558 * dapli_tavor_wqe_recv_build() 559 * Builds the recv WQE for a given ibt_recv_wr_t 560 */ 561 static DAT_RETURN 562 dapli_tavor_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr, 563 uint64_t *addr, uint_t *size) 564 { 565 tavor_hw_wqe_sgl_t *ds; 566 int i; 567 int num_ds; 568 569 /* Fill in the Data Segments (SGL) for the Recv WQE */ 570 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr + 571 sizeof (tavor_hw_rcv_wqe_nextctrl_t)); 572 num_ds = 0; 573 574 /* Check for valid number of SGL entries */ 575 if (wr->wr_nds > qp->qp_rq_sgl) { 576 return (DAT_INVALID_PARAMETER); 577 } 578 579 /* 580 * For each SGL in the Recv Work Request, fill in the Recv WQE's data 581 * segments. Note: We skip any SGL with zero size because Tavor 582 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually 583 * the encoding for zero means a 2GB transfer. Because of this special 584 * encoding in the hardware, we mask the requested length with 585 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as 586 * zero.) 587 */ 588 for (i = 0; i < wr->wr_nds; i++) { 589 if (wr->wr_sgl[i].ds_len == 0) { 590 continue; 591 } 592 593 /* 594 * Fill in the Data Segment(s) for the receive WQE, using the 595 * information contained in the scatter-gather list of the 596 * work request. 

/*
 * dapli_tavor_wqe_send_linknext()
 * Takes a WQE and links it to the prev WQE chain
 */
static void
dapli_tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, uint64_t *curr_addr,
    boolean_t ns, uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
    tavor_sw_wqe_dbinfo_t *dbinfo)
{
    uint64_t next, ctrl;
    uint32_t nopcode, fence;

    next = 0;
    ctrl = 0;

    /* Set the "c" (i.e. "signaled") bit appropriately */
    if (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
        ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
    }

    /* Set the "s" (i.e. "solicited") bit appropriately */
    if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
        ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
    }
    /* Set the "e" (i.e. "event") bit if notification is needed */
    if (!ns) {
        ctrl = ctrl | TAVOR_WQE_RCV_EVENT_MASK;
    }

    /*
     * The "i" bit is unused since uDAPL doesn't support
     * immediate data.
     */

    /* initialize the ctrl and next fields of the current descriptor */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    /*
     * Calculate the "next" field of the prev descriptor.  This amounts
     * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
     * fields (see tavor_hw.h for more).
     */

    /*
     * Determine the value for the Tavor WQE "nopcode" field
     * by using the IBTF opcode from the work request
     */
    switch (curr_wr->wr_opcode) {
    case IBT_WRC_RDMAW:
        nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
        break;

    case IBT_WRC_SEND:
        nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
        break;

    case IBT_WRC_RDMAR:
        nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
        break;

    case IBT_WRC_BIND:
        nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
        break;
    default:
        /* Unsupported opcodes in UDAPL */
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "dapli_tavor_wqe_send_linknext: invalid wr_opcode=%d\n",
            curr_wr->wr_opcode);
        return;
    }

    next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
    next = next | ((uint64_t)nopcode << 32);
    fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
    if (fence) {
        next = next | TAVOR_WQE_SEND_FENCE_MASK;
    }
    next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);

    /*
     * Since a send queue doorbell will be rung for the next
     * WQE on the chain, set the current WQE's "dbd" bit.
     * Note: We also update the "dbinfo" structure here to pass
     * back information about what should (later) be included
     * in the send queue doorbell.
     */
    next = next | TAVOR_WQE_DBD_MASK;
    dbinfo->db_nopcode = nopcode;
    dbinfo->db_fence = fence;

    /*
     * A send queue doorbell will be rung for the next WQE on
     * the chain, so update the prev WQE's "next" field and return.
     */
    if (prev_addr != NULL) {
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}
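
/*
 * Sketch of the 64-bit "next" word assembled above (exact positions per
 * the masks and shifts in tavor_hw.h): the high 32 bits carry the next
 * WQE's address (curr_desc & TAVOR_WQE_NDA_MASK) OR'd with its nopcode;
 * the fence bit (TAVOR_WQE_SEND_FENCE_MASK) is set if IBT_WR_SEND_FENCE
 * was requested; and the low bits carry the descriptor size in 16-byte
 * chunks (curr_descsz & TAVOR_WQE_NDS_MASK) plus the "dbd" bit.
 * TAVOR_WQE_LINKFIRST() stores this word into the previous WQE, which
 * is how the hardware chases from one posted WQE to the next.
 */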

/*
 * dapli_tavor_wqe_recv_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
    tavor_hw_wqe_sgl_t *ds;
    int i;
    int num_ds;

    /* Fill in the Data Segments (SGL) for the Recv WQE */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
        sizeof (tavor_hw_rcv_wqe_nextctrl_t));
    num_ds = 0;

    /* Check for valid number of SGL entries */
    if (wr->wr_nds > qp->qp_rq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Recv Work Request, fill in the Recv WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < wr->wr_nds; i++) {
        if (wr->wr_sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the receive WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
        num_ds++;
    }

    /* Return the size of descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;

    return (DAT_SUCCESS);
}


/*
 * dapli_tavor_wqe_recv_linknext()
 * Links a recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_recv_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr)
{
    uint64_t next;
    uint64_t ctrl = 0;

    /*
     * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
     * at a time).  If there is no next descriptor (i.e. if the current
     * descriptor is the last WQE on the chain), then set the "next"
     * field to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
     * requires the "dbd" bit to be set to one for all Recv WQEs.
     * In either case, we must add a single bit in the "reserved" field
     * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
     * workaround for a known Tavor errata that can cause Recv WQEs with
     * zero in the NDA field to behave improperly.
     *
     * If notification suppression is not desired then we set
     * the "E" bit in the ctrl field.
     */

    next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
    if (!ns) {    /* notification needed - so set the "E" bit */
        ctrl = TAVOR_WQE_RCV_EVENT_MASK;
    }

    /* update the WQE */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    if (prev_addr != NULL) {
        /*
         * Calculate the "next" field of the descriptor.  This amounts
         * to setting up the "next_wqe_addr", "dbd", and "nds" fields
         * (see tavor_hw.h for more).
         */
        next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
        next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
            TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

        /*
         * Since this WQE is linked to the previous descriptor, update
         * the previous WQE's "next" fields; this WQE's "ctrl" fields
         * must not be touched.
         */
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}

/*
 * dapli_tavor_wqe_srq_build()
 * Builds the recv WQE for a given ibt_recv_wr_t on an SRQ
 */
static DAT_RETURN
dapli_tavor_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
{
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t end_sgl;
    int i;
    int num_ds;

    /* Fill in the Data Segments (SGL) for the Recv WQE */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
        sizeof (tavor_hw_rcv_wqe_nextctrl_t));
    num_ds = 0;

    /* Check for valid number of SGL entries */
    if (wr->wr_nds > srq->srq_wq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Recv Work Request, fill in the Recv WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < wr->wr_nds; i++) {
        if (wr->wr_sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the receive WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
        num_ds++;
    }

    /*
     * For SRQ, if the number of data segments is less than the maximum
     * specified at alloc, then we have to fill in a special "key" entry in
     * the sgl entry after the last valid one in this post request.  We do
     * that here.
     */
    if (num_ds < srq->srq_wq_sgl) {
        end_sgl.ds_va = (ib_vaddr_t)0;
        end_sgl.ds_len = (ib_msglen_t)0;
        end_sgl.ds_key = (ibt_lkey_t)1;
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
    }

    return (DAT_SUCCESS);
}
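
/*
 * Example of the sentinel above (illustrative): if the SRQ was created
 * with srq_wq_sgl == 4 and a request posts only two SGEs, ds[2] is
 * written as { va = 0, len = 0, key = 1 } so the hardware stops
 * scanning the scatter list there instead of parsing stale entries.
 */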

/*
 * dapli_tavor_wqe_srq_linknext()
 * Links a srq recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_srq_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint64_t *prev_addr)
{
    uint64_t next;
    uint64_t ctrl = 0;

    /*
     * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
     * at a time).  Even when the current descriptor is the last WQE on
     * the chain, we must set a single bit in the "reserved" field
     * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
     * workaround for a known Tavor errata that can cause Recv WQEs with
     * zero in the NDA field to behave improperly.
     *
     * If notification suppression is not desired then we set
     * the "E" bit in the ctrl field.
     */

    next = TAVOR_RCV_WQE_NDA0_WA_MASK;
    if (!ns) {    /* notification needed - so set the "E" bit */
        ctrl = TAVOR_WQE_RCV_EVENT_MASK;
    }

    /* update the WQE */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    if (prev_addr != NULL) {
        /*
         * Calculate the "next" field of the descriptor.  This amounts
         * to setting up the "next_wqe_addr", "dbd", and "nds" fields
         * (see tavor_hw.h for more).
         */
        next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
        next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

        /*
         * Since this WQE is linked to the previous descriptor, update
         * the previous WQE's "next" fields; this WQE's "ctrl" fields
         * must not be touched.
         */
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}

/*
 * dapli_tavor_cq_peek()
 * Peeks into a given CQ to check if there are any events that can be
 * polled.  It returns the number of CQEs that can be polled.
 */
static void
dapli_tavor_cq_peek(ib_cq_handle_t cq, int *num_cqe)
{
    tavor_hw_cqe_t *cqe;
    uint32_t imm_eth_pkey_cred;
    uint32_t cons_indx;
    uint32_t wrap_around_mask;
    uint32_t polled_cnt;
    uint_t doorbell_cnt;
    uint_t opcode;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask.  Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Count entries in the CQ until we find an entry owned by
     * the hardware.
     */
    polled_cnt = 0;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        opcode = TAVOR_CQE_OPCODE_GET(cqe);
        /* Error CQEs map to multiple work completions */
        if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
            (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
            imm_eth_pkey_cred =
                TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
            doorbell_cnt =
                imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
            polled_cnt += (doorbell_cnt + 1);
        } else {
            polled_cnt++;
        }
        /* Increment the consumer index */
        cons_indx = (cons_indx + 1) & wrap_around_mask;

        /* Update the pointer to the next CQ entry */
        cqe = &cq->cq_addr[cons_indx];
    }

    *num_cqe = polled_cnt;
}
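
/*
 * Example of the error-CQE accounting above (illustrative): a CQE whose
 * doorbell count field holds 3 represents doorbell_cnt + 1 = 4 work
 * completions, so it adds 4 to polled_cnt, while an ordinary successful
 * completion adds exactly 1.
 */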

/*
 * dapli_tavor_cq_poll()
 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
 * array that is passed in.
 */
static DAT_RETURN
dapli_tavor_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
{
    tavor_hw_cqe_t *cqe;
    uint32_t cons_indx;
    uint32_t wrap_around_mask;
    uint32_t polled_cnt;
    uint32_t num_to_increment;
    DAT_RETURN dat_status;
    int status;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask.  Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Keep pulling entries from the CQ until we find an entry owned by
     * the hardware.  As long as the CQEs are owned by SW, process each
     * entry by calling dapli_tavor_cq_cqe_consume() and updating the
     * CQ consumer index.  Note: We only update the consumer index if
     * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
     * Otherwise, it indicates that we are going to "recycle" the CQE
     * (probably because it is an error CQE and corresponds to more
     * than one completion).
     */
    polled_cnt = 0;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        status = dapli_tavor_cq_cqe_consume(cq, cqe,
            &wc_p[polled_cnt++]);
        if (status == TAVOR_CQ_SYNC_AND_DB) {
            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);

            /* Increment the consumer index */
            cons_indx = (cons_indx + 1) & wrap_around_mask;

            /* Update the pointer to the next CQ entry */
            cqe = &cq->cq_addr[cons_indx];
        }

        /*
         * If we have run out of space to store work completions,
         * then stop and return the ones we have pulled off the CQ.
         */
        if (polled_cnt >= num_wc) {
            break;
        }
    }

    dat_status = DAT_SUCCESS;
    /*
     * Now we only ring the doorbell (to update the consumer index) if
     * we've actually consumed a CQ entry.  If we have, for example,
     * pulled from a CQE that we are still in the process of "recycling"
     * for error purposes, then we would not update the consumer index.
     */
    if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
        /*
         * Post doorbell to update the consumer index.  Doorbell
         * value indicates number of entries consumed (minus 1)
         */
        if (cons_indx > cq->cq_consindx) {
            num_to_increment = (cons_indx - cq->cq_consindx) - 1;
        } else {
            num_to_increment = ((cons_indx + cq->cq_size) -
                cq->cq_consindx) - 1;
        }
        cq->cq_consindx = cons_indx;
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
            cq->cq_num, num_to_increment);
    } else if (polled_cnt == 0) {
        /*
         * If the CQ is empty, we can try to free up some of the WRID
         * list containers.
         */
        if (cq->cq_wrid_reap_head)    /* look before leaping */
            dapls_tavor_wrid_cq_reap(cq);
        dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
    }

    if (num_polled != NULL) {
        *num_polled = polled_cnt;
    }

    return (dat_status);
}

/*
 * dapli_tavor_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
 * that is passed in.  See above for more comments/details.
 */
static DAT_RETURN
dapli_tavor_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
{
    tavor_hw_cqe_t *cqe;
    uint32_t cons_indx;
    DAT_RETURN dat_status;
    int status;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * If the CQE is owned by SW, process it by calling
     * dapli_tavor_cq_cqe_consume() and update the CQ consumer index.
     * Note: We only update the consumer index if
     * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
     * Otherwise, it indicates that we are going to "recycle" the CQE
     * (probably because it is an error CQE and corresponds to more
     * than one completion).
     */
    if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        status = dapli_tavor_cq_cqe_consume(cq, cqe, wc_p);
        if (status == TAVOR_CQ_SYNC_AND_DB) {
            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);

            /* Increment the consumer index */
            cq->cq_consindx =
                (cons_indx + 1) & (cq->cq_size - 1);
            dapli_tavor_cq_doorbell(cq->cq_iauar,
                TAVOR_CQDB_INCR_CONSINDX,
                cq->cq_num, 0);
        }
        dat_status = DAT_SUCCESS;
    } else {
        if (cq->cq_wrid_reap_head)    /* look before leaping */
            dapls_tavor_wrid_cq_reap(cq);
        dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
    }
    return (dat_status);
}
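
/*
 * Worked example of the consumer-index doorbell arithmetic above
 * (illustrative): with cq_size 256, old cq_consindx 250 and new
 * cons_indx 4, the poll consumed entries 250..255 and 0..3, i.e. 10
 * CQEs.  Since cons_indx wrapped, num_to_increment is computed as
 * ((4 + 256) - 250) - 1 = 9, the entry count minus one, matching the
 * doorbell's "entries consumed (minus 1)" encoding.
 */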

/*
 * dapli_tavor_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
 */
static int
dapli_tavor_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
    uint_t flags;
    uint_t type;
    uint_t opcode;
    int status;

    /*
     * Determine if this is an "error" CQE by examining "opcode".  If it
     * is an error CQE, then call dapli_tavor_cq_errcqe_consume() and
     * return whatever status it returns.  Otherwise, this is a
     * successful completion.
     */
    opcode = TAVOR_CQE_OPCODE_GET(cqe);
    if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
        (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
        status = dapli_tavor_cq_errcqe_consume(cqhdl, cqe, wc);
        return (status);
    }

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
        TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
    wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

    /*
     * Parse the CQE opcode to determine completion type.  This will set
     * not only the type of the completion, but also any flags that might
     * be associated with it (e.g. whether immediate data is present).
     */
    flags = IBT_WC_NO_FLAGS;
    if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {

        /*
         * Send CQE
         *
         * The following opcodes will not be generated in uDAPL
         * case TAVOR_CQE_SND_RDMAWR_IMM:
         * case TAVOR_CQE_SND_SEND_IMM:
         * case TAVOR_CQE_SND_ATOMIC_CS:
         * case TAVOR_CQE_SND_ATOMIC_FA:
         */
        switch (opcode) {
        case TAVOR_CQE_SND_RDMAWR:
            type = IBT_WRC_RDMAW;
            break;

        case TAVOR_CQE_SND_SEND:
            type = IBT_WRC_SEND;
            break;

        case TAVOR_CQE_SND_RDMARD:
            type = IBT_WRC_RDMAR;
            wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
            break;

        case TAVOR_CQE_SND_BIND_MW:
            type = IBT_WRC_BIND;
            break;

        default:
            wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    } else {

        /*
         * Receive CQE
         *
         * The following opcodes will not be generated in uDAPL
         *
         * case TAVOR_CQE_RCV_RECV_IMM:
         * case TAVOR_CQE_RCV_RECV_IMM2:
         * case TAVOR_CQE_RCV_RDMAWR_IMM:
         * case TAVOR_CQE_RCV_RDMAWR_IMM2:
         */
        switch (opcode & 0x1F) {
        case TAVOR_CQE_RCV_RECV:
            /* FALLTHROUGH */
        case TAVOR_CQE_RCV_RECV2:
            type = IBT_WRC_RECV;
            wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
            break;
        default:
            wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    }
    wc->wc_type = type;
    wc->wc_flags = flags;
    /* If we got here, completion status must be success */
    wc->wc_status = IBT_WC_SUCCESS;

    return (TAVOR_CQ_SYNC_AND_DB);
}

/*
 * dapli_tavor_cq_errcqe_consume()
 */
static int
dapli_tavor_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
    dapls_tavor_wrid_entry_t wre;
    uint32_t next_wqeaddr;
    uint32_t imm_eth_pkey_cred;
    uint_t nextwqesize, dbd;
    uint_t doorbell_cnt, status;
    uint_t opcode = TAVOR_CQE_OPCODE_GET(cqe);

    dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
        TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
        TAVOR_CQE_WQEADDRSZ_GET(cqe));

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
        (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
        TAVOR_COMPLETION_RECV, 1, &wre);
    wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

    /*
     * Parse the CQE opcode to determine completion type.  We know that
     * the CQE is an error completion, so we extract only the completion
     * status here.
     */
    imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
    status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
    switch (status) {
    case TAVOR_CQE_LOC_LEN_ERR:
        status = IBT_WC_LOCAL_LEN_ERR;
        break;

    case TAVOR_CQE_LOC_OP_ERR:
        status = IBT_WC_LOCAL_CHAN_OP_ERR;
        break;

    case TAVOR_CQE_LOC_PROT_ERR:
        status = IBT_WC_LOCAL_PROTECT_ERR;
        break;

    case TAVOR_CQE_WR_FLUSHED_ERR:
        status = IBT_WC_WR_FLUSHED_ERR;
        break;

    case TAVOR_CQE_MW_BIND_ERR:
        status = IBT_WC_MEM_WIN_BIND_ERR;
        break;

    case TAVOR_CQE_BAD_RESPONSE_ERR:
        status = IBT_WC_BAD_RESPONSE_ERR;
        break;

    case TAVOR_CQE_LOCAL_ACCESS_ERR:
        status = IBT_WC_LOCAL_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_INV_REQ_ERR:
        status = IBT_WC_REMOTE_INVALID_REQ_ERR;
        break;

    case TAVOR_CQE_REM_ACC_ERR:
        status = IBT_WC_REMOTE_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_OP_ERR:
        status = IBT_WC_REMOTE_OP_ERR;
        break;

    case TAVOR_CQE_TRANS_TO_ERR:
        status = IBT_WC_TRANS_TIMEOUT_ERR;
        break;

    case TAVOR_CQE_RNRNAK_TO_ERR:
        status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
        break;

    /*
     * The following error codes are not supported in the Tavor driver
     * as they relate only to Reliable Datagram completion statuses:
     *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
     *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
     *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
     *    case TAVOR_CQE_INV_EEC_NUM_ERR:
     *    case TAVOR_CQE_INV_EEC_STATE_ERR:
     *    case TAVOR_CQE_LOC_EEC_ERR:
     */

    default:
        status = IBT_WC_LOCAL_CHAN_OP_ERR;
        break;
    }
    wc->wc_status = status;
    wc->wc_type = 0;
    /*
     * Now we do all the checking that's necessary to handle completion
     * queue entry "recycling"
     *
     * It is not necessary here to try to sync the WQE as we are only
     * attempting to read from the Work Queue (and hardware does not
     * write to it).
     */

    /*
     * We can get the doorbell info, WQE address, and size for the next
     * WQE from the "wre" (which was filled in above in the call to the
     * dapls_tavor_wrid_get_entry() routine).
     */
    dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
    next_wqeaddr = wre.wr_wqeaddrsz;
    nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

    /*
     * Get the doorbell count from the CQE.  This indicates how many
     * completions this one CQE represents.
     */
    doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

    /*
     * Determine if we're ready to consume this CQE yet or not.  If the
     * next WQE has size zero (i.e. there is no next WQE) or if the
     * doorbell count is down to zero, then this is the last/only
     * completion represented by the current CQE (return
     * TAVOR_CQ_SYNC_AND_DB).  Otherwise, the current CQE needs to be
     * recycled (see below).
     */
    if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
        /*
         * Consume the CQE
         * Return status to indicate that doorbell and sync may be
         * necessary.
         */
        return (TAVOR_CQ_SYNC_AND_DB);

    } else {
        /*
         * Recycle the CQE for use in the next PollCQ() call.
         * Decrement the doorbell count, modify the error status,
         * and update the WQE address and size (to point to the
         * next WQE on the chain).  Put these updated entries back
         * into the CQE.
         * Despite the fact that we have updated the CQE, it is not
         * necessary for us to attempt to sync this entry just yet
         * as we have not changed the "hardware's view" of the
         * entry (i.e. we have not modified the "owner" bit - which
         * is all that the Tavor hardware really cares about).
         */
        doorbell_cnt = doorbell_cnt - dbd;
        TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cqe,
            ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
            (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
        TAVOR_CQE_WQEADDRSZ_SET(cqe,
            TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
        dapl_dbg_log(DAPL_DBG_TYPE_EVD,
            "errcqe_consume: recycling cqe.eth=%x, wqe=%x\n",
            TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
            TAVOR_CQE_WQEADDRSZ_GET(cqe));
        return (TAVOR_CQ_RECYCLE_ENTRY);
    }
}
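
/*
 * Worked example of the recycling above (illustrative, assuming each
 * WQE was posted doorbelled): suppose one error CQE arrives with
 * doorbell count 2, covering three flushed WQEs.  The first PollCQ()
 * consumption rewrites the CQE with count 1 and the next WQE's
 * address/size and returns TAVOR_CQ_RECYCLE_ENTRY; the second rewrites
 * it with count 0; the third sees doorbell_cnt == 0 with dbd == 1 and
 * finally returns TAVOR_CQ_SYNC_AND_DB, releasing the entry back to
 * hardware.
 */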

/*
 * dapli_tavor_cq_notify()
 * This function is used for arming the CQ by ringing the CQ doorbell.
 */
static DAT_RETURN
dapli_tavor_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
{
    uint32_t cqnum;

    /*
     * Determine if we are trying to get the next completion or the next
     * "solicited" completion.  Then hit the appropriate doorbell.
     */
    cqnum = cq->cq_num;
    if (flags == IB_NOTIFY_ON_NEXT_COMP) {
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_CQ,
            cqnum, TAVOR_CQDB_DEFAULT_PARAM);

    } else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
        dapli_tavor_cq_doorbell(cq->cq_iauar,
            TAVOR_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
            TAVOR_CQDB_DEFAULT_PARAM);

    } else if (flags == IB_NOTIFY_ON_NEXT_NCOMP) {
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_NCQ,
            cqnum, param);
    } else {
        return (DAT_INVALID_PARAMETER);
    }

    return (DAT_SUCCESS);
}
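
/*
 * Usage sketch (illustrative): to arm for the very next completion a
 * caller would do
 *
 *     (void) dapli_tavor_cq_notify(cq, IB_NOTIFY_ON_NEXT_COMP, 0);
 *
 * IB_NOTIFY_ON_NEXT_SOLICITED arms for solicited events only, and
 * IB_NOTIFY_ON_NEXT_NCOMP passes "param" through to the
 * TAVOR_CQDB_NOTIFY_NCQ doorbell (presumably as a completion-count
 * threshold).
 */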

/*
 * dapli_tavor_post_send()
 */
static DAT_RETURN
dapli_tavor_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
{
    tavor_sw_wqe_dbinfo_t dbinfo;
    dapls_tavor_wrid_list_hdr_t *wridlist;
    dapls_tavor_wrid_entry_t *wre_last;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint32_t desc_sz;
    uint32_t wqeaddrsz, signaled_dbd = 0;    /* unsignaled by default */
    uint32_t head, tail, next_tail, qsize_msk;
    int status;
    ib_qp_handle_t qp;

    if ((ep->qp_state == IBT_STATE_RESET) ||
        (ep->qp_state == IBT_STATE_INIT) ||
        (ep->qp_state == IBT_STATE_RTR)) {
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "post_send: invalid qp_state %d\n", ep->qp_state);
        return (DAT_INVALID_STATE);
    }

    qp = ep->qp_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
    wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
    tail = qp->qp_sq_wqhdr->wq_tail;
    head = qp->qp_sq_wqhdr->wq_head;

    /*
     * Check for "queue full" condition.  If the queue is already full,
     * then no more WQEs can be posted; return an error.
     */
    if (qp->qp_sq_wqhdr->wq_full != 0) {
        dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /*
     * Increment the "tail index" and check for "queue full" condition.
     * If we detect that the current work request is going to fill the
     * work queue, then we mark this condition and continue.
     */
    next_tail = (tail + 1) & qsize_msk;
    if (next_tail == head) {
        qp->qp_sq_wqhdr->wq_full = 1;
    }

    /*
     * Get the user virtual address of the location where the next
     * Send WQE should be built
     */
    wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);

    /*
     * Call dapli_tavor_wqe_send_build() to build the WQE at the given
     * address.  This routine uses the information in the ibt_send_wr_t
     * and returns the size of the WQE when it returns.
     */
    status = dapli_tavor_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
        return (status);
    }

    /*
     * Get the descriptor (io address) corresponding to the location
     * where the Send WQE was built.
     */
    desc = TAVOR_QP_SQ_DESC(qp, tail);

    dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
        desc <= (qp->qp_sq_desc_addr +
        qp->qp_sq_numwqe * qp->qp_sq_wqesz));

    /*
     * Add a WRID entry to the WRID list.  Need to calculate the
     * "wqeaddrsz" and "signaled_dbd" values to pass to
     * dapli_tavor_wrid_add_entry()
     */
    wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);

    if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
        signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
    }

    dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
        signaled_dbd);

    /*
     * Now link the wqe to the old chain (if there was one)
     */
    dapli_tavor_wqe_send_linknext(wr, wqe_addr, ns, desc, desc_sz,
        qp->qp_sq_lastwqeaddr, &dbinfo);

    /*
     * Now if the WRID tail entry is non-NULL, then this
     * represents the entry to which we are chaining the
     * new entries.  Since we are going to ring the
     * doorbell for this WQE, we want to set its "dbd" bit.
     *
     * On the other hand, if the tail is NULL, even though
     * we will have rung the doorbell for the previous WQE
     * (for the hardware's sake) it is irrelevant to our
     * purposes (for tracking WRIDs) because we know the
     * request must have already completed.
     */
    wre_last = wridlist->wl_wre_old_tail;
    if (wre_last != NULL) {
        wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
    }

    /* Update some of the state in the QP */
    qp->qp_sq_lastwqeaddr = wqe_addr;
    qp->qp_sq_wqhdr->wq_tail = next_tail;

    /* Ring the doorbell */
    dapli_tavor_qp_send_doorbell(qp->qp_iauar, desc, desc_sz,
        qp->qp_num, dbinfo.db_fence, dbinfo.db_nopcode);

    dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);

    return (DAT_SUCCESS);
}
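
/*
 * Example of the queue-full bookkeeping above (illustrative): with
 * wq_size 64 (qsize_msk 0x3f), tail 63 and head 0, posting one more
 * WQE gives next_tail = (63 + 1) & 0x3f = 0 == head, so wq_full is
 * set and the next post attempt fails with DAT_INSUFFICIENT_RESOURCES
 * until completions move the head forward.
 */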

/*
 * dapli_tavor_post_recv()
 */
static DAT_RETURN
dapli_tavor_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
{
    dapls_tavor_wrid_list_hdr_t *wridlist;
    dapls_tavor_wrid_entry_t *wre_last;
    ib_qp_handle_t qp;
    DAT_RETURN status;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint32_t desc_sz;
    uint32_t wqeaddrsz;
    uint32_t head, tail, next_tail, qsize_msk;

    if (ep->qp_state == IBT_STATE_RESET) {
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "post_recv: invalid qp_state %d\n", ep->qp_state);
        return (DAT_INVALID_STATE);
    }
    qp = ep->qp_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
    wridlist = qp->qp_rq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
    tail = qp->qp_rq_wqhdr->wq_tail;
    head = qp->qp_rq_wqhdr->wq_head;

    /*
     * For the ibt_recv_wr_t passed in, parse the request and build a
     * Recv WQE.  Link the WQE with the previous WQE and ring the
     * doorbell.
     */

    /*
     * Check for "queue full" condition.  If the queue is already full,
     * then no more WQEs can be posted.  So return an error.
     */
    if (qp->qp_rq_wqhdr->wq_full != 0) {
        dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /*
     * Increment the "tail index" and check for "queue
     * full" condition.  If we detect that the current
     * work request is going to fill the work queue, then
     * we mark this condition and continue.
     */
    next_tail = (tail + 1) & qsize_msk;
    if (next_tail == head) {
        qp->qp_rq_wqhdr->wq_full = 1;
    }

    /* Get the descriptor (IO Address) of the WQE to be built */
    desc = TAVOR_QP_RQ_DESC(qp, tail);
    /* The user virtual address of the WQE to be built */
    wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);

    /*
     * Call dapli_tavor_wqe_recv_build() to build the WQE at the given
     * address.  This routine uses the information in the
     * ibt_recv_wr_t and returns the size of the WQE.
     */
    status = dapli_tavor_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INTERNAL_ERROR);
    }

    /*
     * Add a WRID entry to the WRID list.  Need to calculate the
     * "wqeaddrsz" and "signaled_dbd" values to pass to
     * dapli_tavor_wrid_add_entry().
     * Note: all Recv WQEs are essentially "signaled"
     */
    wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);
    dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
        (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);

    /*
     * Now link the chain to the old chain (if there was one)
     * and ring the doorbell for the recv work queue.
     */
    dapli_tavor_wqe_recv_linknext(wqe_addr, ns, desc, desc_sz,
        qp->qp_rq_lastwqeaddr);

    /*
     * Now if the WRID tail entry is non-NULL, then this
     * represents the entry to which we are chaining the
     * new entries.  Since we are going to ring the
     * doorbell for this WQE, we want to set its "dbd" bit.
     *
     * On the other hand, if the tail is NULL, even though
     * we will have rung the doorbell for the previous WQE
     * (for the hardware's sake) it is irrelevant to our
     * purposes (for tracking WRIDs) because we know the
     * request must have already completed.
     */
    wre_last = wridlist->wl_wre_old_tail;
    if (wre_last != NULL) {
        wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
    }

    /* Update some of the state in the QP */
    qp->qp_rq_lastwqeaddr = wqe_addr;
    qp->qp_rq_wqhdr->wq_tail = next_tail;

    /* Ring the doorbell */
    dapli_tavor_qp_recv_doorbell(qp->qp_iauar, desc, desc_sz,
        qp->qp_num, 1);

    dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_srq()
 */
static DAT_RETURN
dapli_tavor_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
{
    ib_srq_handle_t srq;
    DAT_RETURN status;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint64_t *last_wqe_addr;
    uint32_t head, next_head, qsize_msk;
    uint32_t wqe_index;


    srq = srqp->srq_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);

    /*
     * For the ibt_recv_wr_t passed in, parse the request and build a
     * Recv WQE.  Link the WQE with the previous WQE and ring the
     * doorbell.
     */

    /*
     * Check for "queue full" condition.  If the queue is already full,
     * i.e. there are no free entries, then no more WQEs can be posted.
     * So return an error.
     */
    if (srq->srq_wridlist->wl_freel_entries == 0) {
        dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /* Save away some initial SRQ state */
    qsize_msk = srq->srq_wridlist->wl_size - 1;
    head = srq->srq_wridlist->wl_freel_head;

    next_head = (head + 1) & qsize_msk;

    /* Get the descriptor (IO Address) of the WQE to be built */
    desc = srq->srq_wridlist->wl_free_list[head];

    wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
        srq->srq_wq_wqesz);

    /* The user virtual address of the WQE to be built */
    wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);

    /*
     * Call dapli_tavor_wqe_srq_build() to build the WQE at the given
     * address.  This routine uses the information in the ibt_recv_wr_t
     * to fill in the WQE.
     */
    status = dapli_tavor_wqe_srq_build(srq, wr, wqe_addr);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
        return (status);
    }

    /*
     * Add a WRID entry to the WRID list.
     */
    dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);

    if (srq->srq_wq_lastwqeindex == -1) {
        last_wqe_addr = NULL;
    } else {
        last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
            srq->srq_wq_lastwqeindex);
    }
    /*
     * Now link the chain to the old chain (if there was one)
     * and ring the doorbell for the SRQ.
     */
    dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);

    /* Update some of the state in the SRQ */
    srq->srq_wq_lastwqeindex = wqe_index;
    srq->srq_wridlist->wl_freel_head = next_head;
    srq->srq_wridlist->wl_freel_entries--;
    dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
        srq->srq_wridlist->wl_size);

    /* Ring the doorbell - for SRQ nds = 0 */
    dapli_tavor_qp_recv_doorbell(srq->srq_iauar, desc, 0,
        srq->srq_num, 1);

    dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);

    return (DAT_SUCCESS);
}
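
/*
 * Free-list walk-through (illustrative): with wl_size 16 and
 * wl_freel_head 5, the descriptor comes from wl_free_list[5], the head
 * advances to (5 + 1) & 0xf = 6, and wl_freel_entries drops by one.
 * Completion processing later returns indices to this free list (see,
 * for example, the "Add back to SRQ free list" step in
 * dapli_tavor_cq_srq_entries_flush() below).
 */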

/*
 * dapli_tavor_wrid_add_entry()
 */
extern void
dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *wq, uint64_t wrid,
    uint32_t wqeaddrsz, uint_t signaled_dbd)
{
    dapls_tavor_wrid_entry_t *wre_tmp;
    uint32_t head, tail, size;

    /*
     * Find the entry in the container pointed to by the "tail" index.
     * Add all of the relevant information to that entry, including WRID,
     * the "wqeaddrsz" parameter, and whether it was signaled/unsignaled
     * and/or doorbelled.
     */
    head = wq->wq_wrid_post->wl_head;
    tail = wq->wq_wrid_post->wl_tail;
    size = wq->wq_wrid_post->wl_size;
    wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
    wre_tmp->wr_wrid = wrid;
    wre_tmp->wr_wqeaddrsz = wqeaddrsz;
    wre_tmp->wr_signaled_dbd = signaled_dbd;

    /*
     * Update the "wrid_old_tail" pointer to point to the entry we just
     * inserted into the queue.  By tracking this pointer (the pointer
     * to the most recently inserted entry) it will be possible later in
     * the PostSend() and PostRecv() code paths to find the entry that
     * needs its "doorbelled" flag set (see the comments in
     * dapli_tavor_post_recv() and/or dapli_tavor_post_send()).
     */
    wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;

    /* Update the tail index */
    tail = ((tail + 1) & (size - 1));
    wq->wq_wrid_post->wl_tail = tail;

    /*
     * If the "tail" index has just wrapped over into the "head" index,
     * then we have filled the container.  We use the "full" flag to
     * indicate this condition and to distinguish it from the "empty"
     * condition (where head and tail are also equal).
     */
    if (head == tail) {
        wq->wq_wrid_post->wl_full = 1;
    }
}

/*
 * dapli_tavor_wrid_add_entry_srq()
 */
extern void
dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t srq, uint64_t wrid,
    uint32_t wqe_index)
{
    dapls_tavor_wrid_entry_t *wre;

    /* ASSERT on impossible wqe_index values */
    dapl_os_assert(wqe_index < srq->srq_wq_numwqe);

    /*
     * Set up the WRE.
     *
     * Given the 'wqe_index' value, we store the WRID at this WRE offset.
     * And we set the WRE to be signaled_dbd so that on poll CQ we can
     * find this information and associate the WRID to the WQE found on
     * the CQE.
     * Note: all Recv WQEs are essentially "signaled"
     */
    wre = &srq->srq_wridlist->wl_wre[wqe_index];
    wre->wr_wrid = wrid;
    wre->wr_signaled_dbd = (uint32_t)TAVOR_WRID_ENTRY_SIGNALED;
}

/*
 * dapli_tavor_cq_srq_entries_flush()
 */
static void
dapli_tavor_cq_srq_entries_flush(ib_qp_handle_t qp)
{
    ib_cq_handle_t cq;
    dapls_tavor_workq_hdr_t *wqhdr;
    tavor_hw_cqe_t *cqe;
    tavor_hw_cqe_t *next_cqe;
    uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
    uint32_t new_indx, check_indx, indx;
    uint32_t num_to_increment;
    int cqe_qpnum, cqe_type;
    int outstanding_cqes, removed_cqes;
    int i;

    /* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */

    cq = qp->qp_rq_cqhdl;
    wqhdr = qp->qp_rq_wqhdr;

    dapl_os_assert(wqhdr->wq_wrid_post != NULL);
    dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask.  Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Loop through the CQ looking for entries owned by software.  If an
     * entry is owned by software then we increment an 'outstanding_cqes'
     * count to know how many entries total we have on our CQ.  We use
     * this value further down to know how many entries to loop through
     * looking for our same QP number.
     */
    outstanding_cqes = 0;
    tail_cons_indx = cons_indx;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        /* increment total cqes count */
        outstanding_cqes++;

        /* increment the consumer index */
        tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

        /* update the pointer to the next cq entry */
        cqe = &cq->cq_addr[tail_cons_indx];
    }

    /*
     * Using the 'tail_cons_indx' that was just set, we now know how many
     * total CQEs possible there are.  Set the 'check_indx' and the
     * 'new_indx' to the last entry identified by 'tail_cons_indx'
     */
    check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

    for (i = 0; i < outstanding_cqes; i++) {
        cqe = &cq->cq_addr[check_indx];

        /* Grab QP number from CQE */
        cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
        cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);

        /*
         * If the QP number is the same in the CQE as the QP that we
         * have on this SRQ, then we must free up the entry off the
         * SRQ.  We also make sure that the completion type is of the
         * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
         * this CQ will be left as-is.  The handling of returning
         * entries back to HW ownership happens further down.
         */
        if (cqe_qpnum == qp->qp_num &&
            cqe_type == TAVOR_COMPLETION_RECV) {
            /* Add back to SRQ free list */
            (void) dapli_tavor_wrid_find_match_srq(
                wqhdr->wq_wrid_post, cqe);
        } else {
            /* Do Copy */
            if (check_indx != new_indx) {
                next_cqe = &cq->cq_addr[new_indx];
                /*
                 * Copy the CQE into the "next_cqe"
                 * pointer.
                 */
                (void) dapl_os_memcpy(next_cqe, cqe,
                    sizeof (tavor_hw_cqe_t));
            }
            new_indx = (new_indx - 1) & wrap_around_mask;
        }
        /* Move index to next CQE to check */
        check_indx = (check_indx - 1) & wrap_around_mask;
    }

    /* Initialize removed cqes count */
    removed_cqes = 0;

    /* If an entry was removed */
    if (check_indx != new_indx) {

        /*
         * Set current pointer back to the beginning consumer index.
         * At this point, all unclaimed entries have been copied to the
         * index specified by 'new_indx'.  This 'new_indx' will be used
         * as the new consumer index after we mark all freed entries as
         * having HW ownership.  We do that here.
         */

        /* Loop through all entries until we reach our new pointer */
        for (indx = cons_indx; indx <= new_indx;
            indx = (indx + 1) & wrap_around_mask) {
            removed_cqes++;
            cqe = &cq->cq_addr[indx];

            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);
        }
    }

    /*
     * Update the consumer index to be the 'new_indx'.  This moves it
     * past all removed entries.  Because 'new_indx' is pointing to the
     * last previously valid SW owned entry, we add 1 to point the
     * cons_indx to the first HW owned entry.
     */
    cons_indx = (new_indx + 1) & wrap_around_mask;

    /*
     * Now we only ring the doorbell (to update the consumer index) if
     * we've actually consumed a CQ entry.  If we found no QP number
     * matches above, then we would not have removed anything.  So only
     * if something was removed do we ring the doorbell.
     */
    if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
        /*
         * Post doorbell to update the consumer index.  Doorbell
         * value indicates number of entries consumed (minus 1)
         */
        if (cons_indx > cq->cq_consindx) {
            num_to_increment = (cons_indx - cq->cq_consindx) - 1;
        } else {
            num_to_increment = ((cons_indx + cq->cq_size) -
                cq->cq_consindx) - 1;
        }
        cq->cq_consindx = cons_indx;

        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
            cq->cq_num, num_to_increment);
    }
}
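
/*
 * Compaction walk-through (illustrative): suppose CQ entries 0..3 are
 * SW-owned and only entry 1 is a receive completion for this SRQ's QP.
 * Scanning from entry 3 down: entries 3 and 2 stay put (check_indx is
 * still equal to new_indx), entry 1 has its WQE index returned to the
 * SRQ free list, and entry 0 is copied up into slot 1.  Slot 0 is then
 * handed back to hardware and the consumer index advances past it to 1,
 * with the doorbell encoding the one consumed entry as 0.
 */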

/* ARGSUSED */
static void
dapli_tavor_qp_init(ib_qp_handle_t qp)
{
}

/* ARGSUSED */
static void
dapli_tavor_cq_init(ib_cq_handle_t cq)
{
}

/* ARGSUSED */
static void
dapli_tavor_srq_init(ib_srq_handle_t srq)
{
}

void
dapls_init_funcs_tavor(DAPL_HCA *hca_ptr)
{
    hca_ptr->post_send = dapli_tavor_post_send;
    hca_ptr->post_recv = dapli_tavor_post_recv;
    hca_ptr->post_srq = dapli_tavor_post_srq;
    hca_ptr->cq_peek = dapli_tavor_cq_peek;
    hca_ptr->cq_poll = dapli_tavor_cq_poll;
    hca_ptr->cq_poll_one = dapli_tavor_cq_poll_one;
    hca_ptr->cq_notify = dapli_tavor_cq_notify;
    hca_ptr->srq_flush = dapli_tavor_cq_srq_entries_flush;
    hca_ptr->qp_init = dapli_tavor_qp_init;
    hca_ptr->cq_init = dapli_tavor_cq_init;
    hca_ptr->srq_init = dapli_tavor_srq_init;
    hca_ptr->hermon_resize_cq = 0;
}