/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include "dapl.h"
#include "dapl_tavor_hw.h"
#include "dapl_tavor_wr.h"
#include "dapl_tavor_ibtf_impl.h"

#define	bt_debug	0

enum arbel_db_type_e {
	ARBEL_DBR_CQ_SET_CI	= 0x1 << 5,
	ARBEL_DBR_CQ_ARM	= 0x2 << 5,
	ARBEL_DBR_SQ		= 0x3 << 5,
	ARBEL_DBR_RQ		= 0x4 << 5,
	ARBEL_DBR_SRQ		= 0x5 << 5
};

#define	ARBEL_WQE_SGL_INVALID_LKEY	0x00000100
#define	ARBEL_WQE_SEND_SIGNALED_MASK	0x0000000800000000ull
#define	ARBEL_WQE_SEND_SOLICIT_MASK	0x0000000200000000ull
#define	ARBEL_WQE_CTRL_REQBIT_MASK	0x0000000100000000ull
#define	ARBEL_WQE_NEXT_REQBIT_MASK	0x80
#define	ARBEL_WQE_SETCTRL(qp, desc, ctrl) \
	((uint64_t *)(desc))[1] = HTOBE_64(ctrl)
#define	ARBEL_WQE_SETNEXT(qp, desc, nda_op, ee_nds) \
	{ \
		((uint32_t *)(desc))[0] = HTOBE_32(nda_op); \
		((uint32_t *)(desc))[1] = HTOBE_32(ee_nds); \
	}
#define	ARBEL_WQE_SEND_FENCE_MASK	0x40
#define	ARBEL_WQE_SEND_NOPCODE_RDMAW	0x8
#define	ARBEL_WQE_SEND_NOPCODE_SEND	0xA
#define	ARBEL_WQE_SEND_NOPCODE_RDMAR	0x10
#define	ARBEL_WQE_SEND_NOPCODE_BIND	0x18
#define	ARBEL_WQE_NDA_MASK	0x00000000FFFFFFC0ull
#define	ARBEL_WQE_NDS_MASK	0x3F
#define	ARBEL_QPSNDDB_WQE_CNT_SHIFT	0x38
#define	ARBEL_QPSNDDB_WQE_COUNTER_SHIFT	0x28
#define	ARBEL_QPSNDDB_F_SHIFT	0x25
#define	ARBEL_QPSNDDB_NOPCODE_SHIFT	0x20
#define	ARBEL_QPSNDDB_QPN_SHIFT	0x8
#define	ARBEL_DBR_QP_WQE_COUNTER_SHIFT	0x20
#define	ARBEL_DBR_QN_SHIFT	0x8

#define	ARBEL_CQDB_NOTIFY_CQ_SOLICIT	0x1
#define	ARBEL_CQDB_NOTIFY_CQ		0x2

/*
 * Function signatures
 */
extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
    uint_t, uint_t, dapls_tavor_wrid_entry_t *);
extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
extern DAPL_OS_LOCK g_tavor_uar_lock;

#ifndef _LP64
extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
#endif

static int dapli_arbel_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
    uint64_t *, uint_t *);
static DAT_RETURN dapli_arbel_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
    uint64_t *, uint_t *);
static int dapli_arbel_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);
static int dapli_arbel_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);
extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
    uint32_t, uint_t);
extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);

/*
 * Note: The 64 bit doorbells need to be written atomically.
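 * (A torn, non-atomic store could let the HCA sample a half-written
 * doorbell value.)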
 * In 32 bit libraries we need to use the special assembly routine
 * because compiler-generated code splits the store into 2 word writes
 */

/*
 * dapli_arbel_cq_doorbell()
 * Takes the specified cq cmd and cq number and rings the cq doorbell
 */
static void
dapli_arbel_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cmd_sn, uint32_t cq_param)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = (cmd_sn << 4) | cq_cmd;
	doorbell = (doorbell << 24) | cqn;
	doorbell = (doorbell << 32) | cq_param;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
	/* 32 bit version */
#elif defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->cq[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->cq[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->cq);
#endif
}

/*
 * dapli_arbel_sq_dbrec() and dapli_arbel_sq_dbreg()
 * Update the send queue doorbell record and ring the send doorbell,
 * given the next descriptor information, qp number and opcode
 */
static void
dapli_arbel_sq_dbrec(ib_qp_handle_t qp, uint16_t wqe_counter)
{
	qp->qp_sq_dbp[0] = HTOBE_32((wqe_counter + 1) & 0xffff);
}

static void
dapli_arbel_sq_dbreg(dapls_hw_uar_t ia_uar, uint32_t qpn, uint32_t fence,
    uint32_t nopcode, uint16_t wqe_counter, uint32_t nds)
{
	uint64_t doorbell;

	doorbell = ((uint64_t)1 << ARBEL_QPSNDDB_WQE_CNT_SHIFT) |
	    ((uint64_t)wqe_counter << ARBEL_QPSNDDB_WQE_COUNTER_SHIFT) |
	    ((uint64_t)fence << ARBEL_QPSNDDB_F_SHIFT) |
	    ((uint64_t)nopcode << ARBEL_QPSNDDB_NOPCODE_SHIFT) |
	    (qpn << ARBEL_QPSNDDB_QPN_SHIFT) | nds;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
#else
#if defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->send[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->send[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->send);
#endif
#endif
}

/*
 * dapli_arbel_wqe_send_build()
 * Constructs a WQE for a given ibt_send_wr_t
 */
static int
dapli_arbel_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_snd_wqe_remaddr_t	*rc;
	tavor_hw_snd_wqe_bind_t		*bn;
	tavor_hw_wqe_sgl_t		*ds;
	ibt_wr_ds_t			*sgl;
	uint32_t			nds;
	uint32_t			len, total_len;
	uint32_t			new_rkey;
	uint32_t			old_rkey;
	int				i, num_ds;
	int				max_inline_bytes = -1;
	uint64_t			ctrl;

	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;
	ctrl = ((wr->wr_flags & IBT_WR_SEND_SIGNAL) ?
	    ARBEL_WQE_SEND_SIGNALED_MASK : 0) |
	    ((wr->wr_flags & IBT_WR_SEND_SOLICIT) ?
	    ARBEL_WQE_SEND_SOLICIT_MASK : 0) |
	    ARBEL_WQE_CTRL_REQBIT_MASK;

	/*
	 * RC is the only supported transport in UDAPL
	 * For RC requests, we allow "Send", "RDMA Read", "RDMA Write"
	 */
	switch (wr->wr_opcode) {
	case IBT_WRC_SEND:
		/*
		 * If this is a Send request, then all we need is
		 * the Data Segment processing below.
		 * Initialize the information for the Data Segments
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
		break;
	case IBT_WRC_RDMAW:
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
		/* FALLTHROUGH */
	case IBT_WRC_RDMAR:
		if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
			qp->qp_sq_inline = 0;
		/*
		 * If this is an RDMA Read or RDMA Write request, then fill
		 * in the "Remote Address" header fields.
		 */
		rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Remote Address Segment for the WQE, using
		 * the information from the RC work request.
		 */
		TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);

		/* Update "ds" for filling in Data Segments (below) */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
		    sizeof (tavor_hw_snd_wqe_remaddr_t));
		break;
	case IBT_WRC_BIND:
		/*
		 * Generate a new R_key.
		 * Increment the upper "unconstrained" bits and keep the
		 * lower "constrained" bits the same; they represent the
		 * MPT index.
		 */
#if 0
	/* XXX - need equiv of "arbel_wr_bind_check(state, wr);" */
	/* XXX - uses arbel_mr_keycalc - what about Sinai vs. Arbel??? */
#endif
		old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
		new_rkey = old_rkey >> 8;	/* index */
		old_rkey = ((old_rkey & 0xff) + 1) & 0xff; /* incremented key */
		new_rkey = (new_rkey << 8) | old_rkey;

		wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;

		bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Bind Memory Window Segments for the WQE,
		 * using the information from the RC Bind memory
		 * window work request.
		 */
		TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);

		/*
		 * Update the "ds" pointer.  Even though the "bind"
		 * operation requires no SGLs, this is necessary to
		 * facilitate the correct descriptor size calculations
		 * (below).
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
		    sizeof (tavor_hw_snd_wqe_bind_t));
		nds = 0;
		break;
	default:
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "dapli_arbel_wqe_send_build: invalid wr_opcode=%d\n",
		    wr->wr_opcode);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based on
	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
	 * Start by checking for a valid number of SGL entries
	 */
	if (nds > qp->qp_sq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Send Work Request, fill in the Send WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.
	 * Because of this special encoding in the hardware, we mask the
	 * requested length with TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB
	 * will end up encoded as zero.)
	 */
	if (max_inline_bytes != -1) {		/* compute total_len */
		total_len = 0;
		for (i = 0; i < nds; i++)
			total_len += sgl[i].ds_len;
		if (total_len > max_inline_bytes)
			max_inline_bytes = -1;	/* too big, do not "inline" */
	}
	if (max_inline_bytes != -1) {		/* do "inline" */
		uint8_t *dst = (uint8_t *)((uint32_t *)ds + 1);
		*(uint32_t *)ds =
		    HTOBE_32(total_len | TAVOR_WQE_SGL_INLINE_MASK);
		for (i = 0; i < nds; i++) {
			if ((len = sgl[i].ds_len) == 0) {
				continue;
			}
			(void) dapl_os_memcpy(dst,
			    (void *)(uintptr_t)sgl[i].ds_va, len);
			dst += len;
		}
		/* Return the size of descriptor (in 16-byte chunks) */
		*size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
	} else {
		for (i = 0; i < nds; i++) {
			if (sgl[i].ds_len == 0) {
				continue;
			}

			/*
			 * Fill in the Data Segment(s) for the current WQE,
			 * using the information contained in the
			 * scatter-gather list of the work request.
			 */
			TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
			num_ds++;
		}

		/* Return the size of descriptor (in 16-byte chunks) */
		*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
	}
	ARBEL_WQE_SETCTRL(qp, addr, ctrl);

	return (DAT_SUCCESS);
}

/*
 * dapli_arbel_wqe_send_linknext()
 * Takes a WQE and links it to the prev WQE chain
 */
static void
dapli_arbel_wqe_send_linknext(ibt_send_wr_t *curr_wr,
    uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
    tavor_sw_wqe_dbinfo_t *dbinfo)
{
	uint32_t	nopcode, fence, nda_op, ee_nds;

	/*
	 * Calculate the "next" field of the prev descriptor.  This amounts
	 * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
	 * fields (see tavor_hw.h for more).
	 */

	/*
	 * Determine the value for the Tavor WQE "nopcode" field
	 * by using the IBTF opcode from the work request
	 */
	switch (curr_wr->wr_opcode) {
	case IBT_WRC_RDMAW:
		nopcode = ARBEL_WQE_SEND_NOPCODE_RDMAW;
		break;

	case IBT_WRC_SEND:
		nopcode = ARBEL_WQE_SEND_NOPCODE_SEND;
		break;

	case IBT_WRC_RDMAR:
		nopcode = ARBEL_WQE_SEND_NOPCODE_RDMAR;
		break;

	case IBT_WRC_BIND:
		nopcode = ARBEL_WQE_SEND_NOPCODE_BIND;
		break;
	default:
		/* Unsupported opcodes in UDAPL */
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "dapli_arbel_wqe_send_linknext: invalid wr_opcode=%d\n",
		    curr_wr->wr_opcode);
		return;
	}

	fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
	nda_op = ((uintptr_t)curr_desc & ARBEL_WQE_NDA_MASK) | nopcode;
	ee_nds = ((fence == 1) ? ARBEL_WQE_SEND_FENCE_MASK : 0) |
	    (curr_descsz & ARBEL_WQE_NDS_MASK) |
	    ARBEL_WQE_NEXT_REQBIT_MASK;

	/*
	 * A send queue doorbell will be rung for the next
	 * WQE on the chain; set the current WQE's "dbd" bit.
	 * Note: We also update the "dbinfo" structure here to pass
	 * back information about what should (later) be included
	 * in the send queue doorbell.
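	 * The nopcode and fence recorded in "dbinfo" are later passed to
	 * dapli_arbel_sq_dbreg() when dapli_arbel_post_send() rings the
	 * send doorbell.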
	 */
	dbinfo->db_nopcode = nopcode;
	dbinfo->db_fence = fence;

	ARBEL_WQE_SETNEXT(qp, prev_addr, nda_op, ee_nds);
}


/*
 * dapli_arbel_wqe_recv_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_arbel_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_wqe_sgl_t	*ds;
	int			i;
	int			num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > qp->qp_rq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}
	if (i < qp->qp_rq_sgl) {
		ibt_wr_ds_t sgl;
		sgl.ds_va = (ib_vaddr_t)0;
		sgl.ds_len = (ib_msglen_t)0;
		sgl.ds_key = (ibt_lkey_t)ARBEL_WQE_SGL_INVALID_LKEY;
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl);
	}

	/* Return the size of descriptor (in 16-byte chunks) */
	*size = qp->qp_rq_wqesz >> 4;

	return (DAT_SUCCESS);
}

/*
 * dapli_arbel_wqe_srq_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_arbel_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
{
	tavor_hw_wqe_sgl_t	*ds;
	ibt_wr_ds_t		end_sgl;
	int			i;
	int			num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > srq->srq_wq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/*
	 * For SRQ, if the number of data segments is less than the maximum
	 * specified at alloc, then we have to fill in a special "key" entry in
	 * the sgl entry after the last valid one in this post request.  We do
	 * that here.
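	 * The extra entry uses ARBEL_WQE_SGL_INVALID_LKEY with a zero length,
	 * which is presumed to mark the end of the scatter/gather list for
	 * the hardware.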
	 */
	if (num_ds < srq->srq_wq_sgl) {
		end_sgl.ds_va = (ib_vaddr_t)0;
		end_sgl.ds_len = (ib_msglen_t)0;
		end_sgl.ds_key = (ibt_lkey_t)ARBEL_WQE_SGL_INVALID_LKEY;
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_arbel_cq_peek()
 * Peeks into a given CQ to check if there are any events that can be
 * polled.  It returns the number of CQEs that can be polled.
 */
static void
dapli_arbel_cq_peek(ib_cq_handle_t cq, int *num_cqe)
{
	tavor_hw_cqe_t		*cqe;
	uint32_t		imm_eth_pkey_cred;
	uint32_t		cons_indx;
	uint32_t		wrap_around_mask;
	uint32_t		polled_cnt;
	uint_t			doorbell_cnt;
	uint_t			opcode;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Count entries in the CQ until we find an entry owned by
	 * the hardware.
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		opcode = TAVOR_CQE_OPCODE_GET(cqe);
		/* Error CQEs map to multiple work completions */
		if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
		    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
			imm_eth_pkey_cred =
			    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
			doorbell_cnt =
			    imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
			polled_cnt += (doorbell_cnt + 1);
		} else {
			polled_cnt++;
		}
		/* Increment the consumer index */
		cons_indx = (cons_indx + 1) & wrap_around_mask;

		/* Update the pointer to the next CQ entry */
		cqe = &cq->cq_addr[cons_indx];
	}

	*num_cqe = polled_cnt;
}

#define	dapli_arbel_cq_update_ci(cq, dbp) \
	(dbp)[0] = HTOBE_32(cq->cq_consindx)

/*
 * dapli_arbel_cq_poll()
 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
 * array that is passed in.
 */
static DAT_RETURN
dapli_arbel_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
{
	tavor_hw_cqe_t		*cqe;
	uint32_t		cons_indx;
	uint32_t		wrap_around_mask;
	uint32_t		polled_cnt;
	DAT_RETURN		dat_status;
	int			status;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling dapli_arbel_cq_cqe_consume() and updating the
	 * CQ consumer index.  Note: We only update the consumer index if
	 * dapli_arbel_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE and corresponds to more than
	 * one completion).
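	 * An error CQE carries a doorbell count (see TAVOR_CQE_ERR_DBDCNT_MASK
	 * in dapli_arbel_cq_peek() above) indicating how many additional
	 * work completions it accounts for.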
	 */
	polled_cnt = 0;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		status = dapli_arbel_cq_cqe_consume(cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = &cq->cq_addr[cons_indx];
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	dat_status = DAT_SUCCESS;
	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Update the consumer index in both the CQ handle and the
		 * doorbell record.
		 */
		cq->cq_consindx = cons_indx;
		dapli_arbel_cq_update_ci(cq, cq->cq_poll_dbp);
	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.
		 */
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}

	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	return (dat_status);
}

/*
 * dapli_arbel_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
 * that is passed in.  See above for more comments/details.
 */
static DAT_RETURN
dapli_arbel_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
{
	tavor_hw_cqe_t		*cqe;
	uint32_t		cons_indx;
	DAT_RETURN		dat_status;
	int			status;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling dapli_arbel_cq_cqe_consume() and updating the
	 * CQ consumer index.  Note: We only update the consumer index if
	 * dapli_arbel_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE and corresponds to more than
	 * one completion).
	 */
	if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		status = dapli_arbel_cq_cqe_consume(cq, cqe, wc_p);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);

			/* Increment the consumer index */
			cq->cq_consindx =
			    (cons_indx + 1) & (cq->cq_size - 1);
			dapli_arbel_cq_update_ci(cq, cq->cq_poll_dbp);
		}
		dat_status = DAT_SUCCESS;
	} else {
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}
	return (dat_status);
}

/*
 * dapli_arbel_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
 */
static int
dapli_arbel_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
	uint_t		flags;
	uint_t		type;
	uint_t		opcode;
	int		status;

	/* strip off the size in wqeaddrsz */
	TAVOR_CQE_WQEADDRSZ_SET(cqe, TAVOR_CQE_WQEADDRSZ_GET(cqe) &
	    ~ARBEL_WQE_NDS_MASK);

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call dapli_arbel_cq_errcqe_consume() and
	 * return whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = TAVOR_CQE_OPCODE_GET(cqe);
	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
		status = dapli_arbel_cq_errcqe_consume(cqhdl, cqe, wc);
		return (status);
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
	    TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
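	 * (In this uDAPL path "flags" stays IBT_WC_NO_FLAGS, since the
	 * immediate-data and atomic opcodes listed below are never
	 * generated.)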
	 */
	flags = IBT_WC_NO_FLAGS;
	if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {

		/*
		 * Send CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 * case TAVOR_CQE_SND_RDMAWR_IMM:
		 * case TAVOR_CQE_SND_SEND_IMM:
		 * case TAVOR_CQE_SND_ATOMIC_CS:
		 * case TAVOR_CQE_SND_ATOMIC_FA:
		 */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/*
		 * Receive CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 *
		 * case TAVOR_CQE_RCV_RECV_IMM:
		 * case TAVOR_CQE_RCV_RECV_IMM2:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
		 */
		switch (opcode & 0x1F) {
		case TAVOR_CQE_RCV_RECV:
			/* FALLTHROUGH */
		case TAVOR_CQE_RCV_RECV2:
			type = IBT_WRC_RECV;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;
		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;
	wc->wc_flags = flags;
	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * dapli_arbel_cq_errcqe_consume()
 */
static int
dapli_arbel_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
	dapls_tavor_wrid_entry_t	wre;
	uint32_t			imm_eth_pkey_cred;
	uint_t				status;
	uint_t				opcode = TAVOR_CQE_OPCODE_GET(cqe);

	dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
	    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
	    TAVOR_CQE_WQEADDRSZ_GET(cqe));

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
	    (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
	    TAVOR_COMPLETION_RECV, 1, &wre);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
	status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 *    case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 *    case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 *    case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 *    case TAVOR_CQE_INV_EEC_NUM_ERR:
	 *    case TAVOR_CQE_INV_EEC_STATE_ERR:
	 *    case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;
	}
	wc->wc_status = status;
	wc->wc_type = 0;

	/*
	 * Consume the CQE
	 *    Return status to indicate that doorbell and sync may be
	 *    necessary.
	 */
	return (TAVOR_CQ_SYNC_AND_DB);
}

/*
 * dapli_arbel_cq_notify()
 * This function is used for arming the CQ by ringing the CQ doorbell.
 *
 * Note: there is something very subtle here.  This code assumes a very
 * specific behavior of the kernel driver.  The cmd_sn field of the
 * arm_dbr is updated by the kernel driver whenever a notification
 * event for the cq is received.  This code extracts the cmd_sn field
 * from the arm_dbr to know the right value to use.  The arm_dbr is
 * always updated atomically so that neither the kernel driver nor this
 * code will get confused about what the other is doing.
 *
 * Note: param is not used here.  It is necessary for arming a CQ for
 * N completions (param is N), but no uDAPL API supports this for now.
 * Thus, we declare ARGSUSED to make lint happy.
 */
/*ARGSUSED*/
static DAT_RETURN
dapli_arbel_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
{
	uint32_t	cqnum;
	uint32_t	*target;
	uint32_t	old_cmd, cmp, new, tmp, cmd_sn;

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
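	 * The low-order bits of the arm doorbell record hold the last
	 * requested command (bits [2:0]) and cmd_sn (bits [4:3]); the new
	 * command is compare-and-swapped into place before the UAR doorbell
	 * is rung.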
	 */
	dapli_arbel_cq_update_ci(cq, cq->cq_arm_dbp);
	cqnum = cq->cq_num;
	target = cq->cq_arm_dbp + 1;
retry:
	cmp = *target;
	tmp = HTOBE_32(cmp);
	old_cmd = tmp & 0x7;
	cmd_sn = (tmp & 0x18) >> 3;

	if (flags == IB_NOTIFY_ON_NEXT_COMP) {
		if (old_cmd != ARBEL_CQDB_NOTIFY_CQ) {
			new = HTOBE_32((tmp & ~0x7) | ARBEL_CQDB_NOTIFY_CQ);
			tmp = atomic_cas_32(target, cmp, new);
			if (tmp != cmp)
				goto retry;
			dapli_arbel_cq_doorbell(cq->cq_iauar,
			    ARBEL_CQDB_NOTIFY_CQ, cqnum,
			    cmd_sn, cq->cq_consindx);
		} /* else it's already armed */
	} else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
		if (old_cmd != ARBEL_CQDB_NOTIFY_CQ &&
		    old_cmd != ARBEL_CQDB_NOTIFY_CQ_SOLICIT) {
			new = HTOBE_32((tmp & ~0x7) |
			    ARBEL_CQDB_NOTIFY_CQ_SOLICIT);
			tmp = atomic_cas_32(target, cmp, new);
			if (tmp != cmp)
				goto retry;
			dapli_arbel_cq_doorbell(cq->cq_iauar,
			    ARBEL_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
			    cmd_sn, cq->cq_consindx);
		} /* else it's already armed */
	} else {
		return (DAT_INVALID_PARAMETER);
	}

	return (DAT_SUCCESS);
}

/*
 * dapli_arbel_post_send()
 */
/* ARGSUSED */
static DAT_RETURN
dapli_arbel_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
{
	tavor_sw_wqe_dbinfo_t		dbinfo;
	dapls_tavor_wrid_list_hdr_t	*wridlist;
	dapls_tavor_wrid_entry_t	*wre_last;
	uint32_t			desc;
	uint64_t			*wqe_addr;
	uint32_t			desc_sz;
	uint32_t			wqeaddrsz, signaled_dbd;
	uint32_t			head, tail, next_tail, qsize_msk;
	int				status;
	ib_qp_handle_t			qp;

	if ((ep->qp_state == IBT_STATE_RESET) ||
	    (ep->qp_state == IBT_STATE_INIT) ||
	    (ep->qp_state == IBT_STATE_RTR)) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_send: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}

	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
	tail = qp->qp_sq_wqhdr->wq_tail;
	head = qp->qp_sq_wqhdr->wq_head;

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * then no more WQEs can be posted; return an error
	 */
	if (qp->qp_sq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue full" condition.
	 * If we detect that the current work request is going to fill the
	 * work queue, then we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_sq_wqhdr->wq_full = 1;
	}

	/*
	 * Get the user virtual address of the location where the next
	 * Send WQE should be built
	 */
	wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);

	/*
	 * Call dapli_arbel_wqe_send_build() to build the WQE at the given
	 * address.  This routine uses the information in the ibt_send_wr_t
	 * and returns the size of the WQE when it returns.
	 */
	status = dapli_arbel_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (status);
	}

	/*
	 * Get the descriptor (io address) corresponding to the location
	 * where the Send WQE was built.
	 */
	desc = TAVOR_QP_SQ_DESC(qp, tail);

	dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
	    desc <= (qp->qp_sq_desc_addr +
	    qp->qp_sq_numwqe * qp->qp_sq_wqesz));

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddr" to pass to dapli_tavor_wrid_add_entry().
	 * signaled_dbd is still calculated, but ignored.
	 */
	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, 0);

	if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
		signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
	} else {
		signaled_dbd = 0;
	}

	dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
	    signaled_dbd);

	/*
	 * Now link the wqe to the old chain (if there was one)
	 */
	dapli_arbel_wqe_send_linknext(wr, desc, desc_sz,
	    qp->qp_sq_lastwqeaddr, &dbinfo);

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_sq_lastwqeaddr = wqe_addr;
	qp->qp_sq_wqhdr->wq_tail = next_tail;

	/* Set the doorbell record */
	dapli_arbel_sq_dbrec(qp, qp->qp_sq_counter);

	/* Ring the doorbell */
	dapli_arbel_sq_dbreg(qp->qp_iauar, qp->qp_num, dbinfo.db_fence,
	    dbinfo.db_nopcode, qp->qp_sq_counter, desc_sz);
	qp->qp_sq_counter++;

	dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_arbel_post_recv()
 */
/* ARGSUSED */
static DAT_RETURN
dapli_arbel_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
{
	dapls_tavor_wrid_list_hdr_t	*wridlist;
	dapls_tavor_wrid_entry_t	*wre_last;
	ib_qp_handle_t			qp;
	DAT_RETURN			status;
	uint32_t			desc;
	uint64_t			*wqe_addr;
	uint32_t			desc_sz;
	uint32_t			wqeaddrsz;
	uint32_t			head, tail, next_tail, qsize_msk;

	if (ep->qp_state == IBT_STATE_RESET) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_recv: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}
	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist = qp->qp_rq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
	tail = qp->qp_rq_wqhdr->wq_tail;
	head = qp->qp_rq_wqhdr->wq_head;

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * doorbell.
	 */

	/*
	 * Check for "queue full" condition.
	 * If the queue is already full, then no more WQEs can be posted.
	 * So return an error.
	 */
	if (qp->qp_rq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue
	 * full" condition.  If we detect that the current
	 * work request is going to fill the work queue, then
	 * we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_rq_wqhdr->wq_full = 1;
	}

	/* Get the descriptor (IO Address) of the WQE to be built */
	desc = TAVOR_QP_RQ_DESC(qp, tail);
	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);

	/*
	 * Call dapli_arbel_wqe_recv_build() to build the WQE at the given
	 * address.  This routine uses the information in the
	 * ibt_recv_wr_t and returns the size of the WQE.
	 */
	status = dapli_arbel_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddr" and "signaled_dbd" values to pass to
	 * dapli_tavor_wrid_add_entry().
	 * Note: all Recv WQEs are essentially "signaled"
	 */
	wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, 0);
	dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
	    (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_rq_lastwqeaddr = wqe_addr;
	qp->qp_rq_wqhdr->wq_tail = next_tail;

	/* Update the doorbell record */
	qp->qp_rq_counter++;
	(qp->qp_rq_dbp)[0] = HTOBE_32(qp->qp_rq_counter);

	dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_arbel_post_srq()
 */
/* ARGSUSED */
static DAT_RETURN
dapli_arbel_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
{
	ib_srq_handle_t	srq;
	DAT_RETURN	status;
	uint32_t	desc;
	uint64_t	*wqe_addr;
	uint32_t	head, next_head, qsize_msk;
	uint32_t	wqe_index;


	srq = srqp->srq_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * doorbell.
	 */

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * i.e. there are no free entries, then no more WQEs can be posted.
	 * So return an error.
	 */
	if (srq->srq_wridlist->wl_freel_entries == 0) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/* Save away some initial SRQ state */
	qsize_msk = srq->srq_wridlist->wl_size - 1;
	head = srq->srq_wridlist->wl_freel_head;

	next_head = (head + 1) & qsize_msk;

	/* Get the descriptor (IO Address) of the WQE to be built */
	desc = srq->srq_wridlist->wl_free_list[head];

	wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
	    srq->srq_wq_wqesz);

	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);

	/*
	 * Call dapli_arbel_wqe_srq_build() to build the WQE at the given
	 * address.  This routine uses the information in the
	 * ibt_recv_wr_t to build the WQE.
	 */
	status = dapli_arbel_wqe_srq_build(srq, wr, wqe_addr);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (status);
	}

	/*
	 * Add a WRID entry to the WRID list.
	 */
	dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);

#if 0
	if (srq->srq_wq_lastwqeindex == -1) {
		last_wqe_addr = NULL;
	} else {
		last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
		    srq->srq_wq_lastwqeindex);
	}
	/*
	 * Now link the chain to the old chain (if there was one)
	 * and update the wqe_counter in the doorbell record.
	 */
	XXX
	dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);
#endif

	/* Update some of the state in the SRQ */
	srq->srq_wq_lastwqeindex = wqe_index;
	srq->srq_wridlist->wl_freel_head = next_head;
	srq->srq_wridlist->wl_freel_entries--;
	dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
	    srq->srq_wridlist->wl_size);

	/* Update the doorbell record */
	srq->srq_counter++;
	(srq->srq_dbp)[0] = HTOBE_32(srq->srq_counter);

	dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_arbel_cq_srq_entries_flush()
 */
static void
dapli_arbel_cq_srq_entries_flush(ib_qp_handle_t qp)
{
	ib_cq_handle_t		cq;
	dapls_tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx, wrap_around_mask;
	uint32_t		new_indx, check_indx, indx;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	/* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	dapl_os_assert(wqhdr->wq_wrid_post != NULL);
	dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/*
	 * Calculate the wrap around mask.  Note: This operation only works
	 * because all Tavor completion queues have power-of-2 sizes
	 */
	wrap_around_mask = (cq->cq_size - 1);

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Loop through the CQ looking for entries owned by software.  If an
	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to know how many entries total we have on our CQ.  We use this
	 * value further down to know how many entries to loop through looking
	 * for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_addr[tail_cons_indx];
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know how many
	 * total CQEs there can be.  Set the 'check_indx' and the
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_addr[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
		cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.  We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_num &&
		    cqe_type == TAVOR_COMPLETION_RECV) {
			/* Add back to SRQ free list */
			(void) dapli_tavor_wrid_find_match_srq(
			    wqhdr->wq_wrid_post, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_addr[new_indx];
				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				(void) dapl_os_memcpy(next_cqe, cqe,
				    sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & wrap_around_mask;
	}

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to the
		 * index specified by 'new_indx'.  This 'new_indx' will be used
		 * as the new consumer index after we mark all freed entries as
		 * having HW ownership.  We do that here.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_addr[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);
		}
	}

	/*
	 * Update consumer index to be the 'new_indx'.  This moves it past all
	 * removed entries.  Because 'new_indx' is pointing to the last
	 * previously valid SW owned entry, we add 1 to point the cons_indx to
	 * the first HW owned entry.
	 */
	cons_indx = (new_indx + 1) & wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Update the consumer index in both the CQ handle and the
		 * doorbell record.
		 */
		cq->cq_consindx = cons_indx;
		dapli_arbel_cq_update_ci(cq, cq->cq_poll_dbp);
	}
}

static void
dapli_arbel_rq_prelink(caddr_t first, uint32_t desc_off, uint32_t wqesz,
    uint32_t numwqe, uint32_t nds)
{
	int i;
	uint32_t *p = (uint32_t *)(uintptr_t)first;
	uint32_t off = desc_off;
	uint32_t pincr = wqesz / sizeof (uint32_t);
	ibt_wr_ds_t sgl;

	sgl.ds_va = (ib_vaddr_t)0;
	sgl.ds_key = ARBEL_WQE_SGL_INVALID_LKEY;
	sgl.ds_len = (ib_msglen_t)0;

	for (i = 0; i < numwqe - 1; i++, p += pincr) {
		off += wqesz;
		p[0] = HTOBE_32(off);	/* link curr to next */
		p[1] = nds;		/* nds is 0 for SRQ */
		TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
	}
	p[0] = HTOBE_32(desc_off);	/* link last to first */
	p[1] = nds;
	TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
}

static void
dapli_arbel_sq_prelink(caddr_t first, uint32_t desc_off, uint32_t wqesz,
    uint32_t numwqe)
{
	int i;
	uint32_t *p = (uint32_t *)(uintptr_t)first;
	uint32_t off = desc_off;
	uint32_t pincr = wqesz / sizeof (uint32_t);

	for (i = 0; i < numwqe - 1; i++, p += pincr) {
		off += wqesz;
		p[0] = HTOBE_32(off);	/* link curr to next */
	}
	p[0] = HTOBE_32(desc_off);	/* link last to first */
}

static void
dapli_arbel_qp_init(ib_qp_handle_t qp)
{
	(qp->qp_sq_dbp)[1] = HTOBE_32((qp->qp_num << 8) | ARBEL_DBR_SQ);
	if (qp->qp_srq_enabled == 0) {
		(qp->qp_rq_dbp)[1] = HTOBE_32((qp->qp_num << 8) | ARBEL_DBR_RQ);

		/* pre-link the whole receive queue */
		dapli_arbel_rq_prelink(qp->qp_rq_buf, qp->qp_rq_desc_addr,
		    qp->qp_rq_wqesz, qp->qp_rq_numwqe,
		    HTOBE_32(qp->qp_rq_wqesz >> 4));
	}
	dapli_arbel_sq_prelink(qp->qp_sq_buf, qp->qp_sq_desc_addr,
	    qp->qp_sq_wqesz, qp->qp_sq_numwqe);
	qp->qp_sq_lastwqeaddr = (uint64_t *)((uintptr_t)qp->qp_sq_buf +
	    ((qp->qp_sq_numwqe - 1) * qp->qp_sq_wqesz));
	qp->qp_rq_counter = 0;
	qp->qp_sq_counter = 0;
}

static void
dapli_arbel_cq_init(ib_cq_handle_t cq)
{
	(cq->cq_poll_dbp)[1] =
	    HTOBE_32((cq->cq_num << 8) | ARBEL_DBR_CQ_SET_CI);
	(cq->cq_arm_dbp)[1] =
	    HTOBE_32((cq->cq_num << 8) | ARBEL_DBR_CQ_ARM | 0x8);
	/* cq_resize -- needs testing */
}

static void
dapli_arbel_srq_init(ib_srq_handle_t srq)
{
	(srq->srq_dbp)[1] =
	    HTOBE_32((srq->srq_num << 8) | ARBEL_DBR_SRQ);

	/* pre-link the whole shared receive queue */
	dapli_arbel_rq_prelink(srq->srq_addr, srq->srq_wq_desc_addr,
	    srq->srq_wq_wqesz, srq->srq_wq_numwqe, 0);
	srq->srq_counter = 0;

	/* needs testing */
}

void
dapls_init_funcs_arbel(DAPL_HCA *hca_ptr)
{
	hca_ptr->post_send = dapli_arbel_post_send;
	hca_ptr->post_recv = dapli_arbel_post_recv;
	hca_ptr->post_srq = dapli_arbel_post_srq;
	hca_ptr->cq_peek = dapli_arbel_cq_peek;
	hca_ptr->cq_poll = dapli_arbel_cq_poll;
	hca_ptr->cq_poll_one = dapli_arbel_cq_poll_one;
	hca_ptr->cq_notify = dapli_arbel_cq_notify;
	hca_ptr->srq_flush = dapli_arbel_cq_srq_entries_flush;
	hca_ptr->qp_init = dapli_arbel_qp_init;
	hca_ptr->cq_init = dapli_arbel_cq_init;
	hca_ptr->srq_init = dapli_arbel_srq_init;
	hca_ptr->hermon_resize_cq = 0;
}