/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include "dapl.h"
#include "dapl_tavor_hw.h"
#include "dapl_tavor_wr.h"
#include "dapl_tavor_ibtf_impl.h"

#define	HERMON_WQE_SGL_INVALID_LKEY	0x00000100
#define	HERMON_WQE_SEND_FENCE_MASK	0x40
#define	HERMON_WQE_NDS_MASK		0x3F

#define	HERMON_CQDB_NOTIFY_CQ_SOLICIT	(0x1 << 24)
#define	HERMON_CQDB_NOTIFY_CQ		(0x2 << 24)

#define	HERMON_CQE_RCV_SEND		0x1
#define	HERMON_CQE_ERR_OPCODE		0x1E
#define	HERMON_CQE_RESIZE_OPCODE	0x16
#define	HERMON_CQE_OPCODE_GET(cqe)	(((uint8_t *)cqe)[31] & 0x1F)
#define	HERMON_CQE_SENDRECV_GET(cqe)	(((uint8_t *)cqe)[31] & 0x40)
#define	HERMON_CQE_OWNER_IS_SW(cq, cqe)	((((uint8_t *)cqe)[31] >> 7) == \
	((cq->cq_consindx & cq->cq_size) >> cq->cq_log_cqsz))

#define	HERMON_QP_WQEADDRSZ(wcnt)	((uint32_t)(wcnt << 6))

#define	HERMON_WQE_SEND_SIGNALED_MASK	0x0000000C00000000ull
#define	HERMON_WQE_SEND_SOLICIT_MASK	0x0000000200000000ull
#define	HERMON_WQE_SETCTRL(desc, ctrl) \
	((uint64_t *)(desc))[1] = HTOBE_64(ctrl)
#define	HERMON_WQE_SETNEXT(desc, nopcode, size, fence) \
	((uint64_t *)(desc))[0] = HTOBE_64((nopcode) | (size) | (fence) | \
	    (((uint64_t)((uint8_t *)desc)[0] & 0x80) << 56))
#define	HERMON_WQE_BUILD_DATA_SEG(ds, sgl)			\
{								\
	uint64_t		*tmp;				\
								\
	tmp = (uint64_t *)(ds);					\
	tmp[1] = HTOBE_64((sgl)->ds_va);			\
	((uint32_t *)tmp)[1] = HTOBE_32((sgl)->ds_key);		\
	membar_producer();					\
	((uint32_t *)tmp)[0] = HTOBE_32((sgl)->ds_len);		\
}


/* handy macro, useful because of cq_resize dynamics */
#define	cq_wrap_around_mask	(cq->cq_size - 1)

pthread_spinlock_t hermon_bf_lock;

/*
 * Function signatures
 */
extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
    uint_t, uint_t, dapls_tavor_wrid_entry_t *);
extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
extern DAPL_OS_LOCK g_tavor_uar_lock;

#ifndef _LP64
extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
#endif

static int dapli_hermon_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
    uint64_t *, uint_t *);
static DAT_RETURN dapli_hermon_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
    uint64_t *, uint_t *);
static int dapli_hermon_cq_cqe_consume(ib_cq_handle_t, uint32_t *, ibt_wc_t *);
static int dapli_hermon_cq_errcqe_consume(ib_cq_handle_t, uint32_t *,
    ibt_wc_t *);
extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
    uint32_t, uint_t);
extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);

/*
 * Note: The 64 bit doorbells need to be written atomically.
 * In 32 bit libraries we need to use the special assembly routine
 * because compiler generated code splits the store into 2 word writes.
 */

/*
 * dapli_hermon_cq_doorbell()
 * Takes the specified cq cmd and cq number and rings the cq doorbell
 */
static void
dapli_hermon_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cmd_sn, uint32_t cq_param)
{
	uint64_t doorbell;

	/* Build the doorbell from the parameters */
	doorbell = (cmd_sn | cq_cmd | cqn);
	doorbell = (doorbell << 32) | cq_param;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
	/* 32 bit version */
#elif defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->cq[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->cq[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->cq);
#endif
}

/*
 * dapli_hermon_sq_dbreg()
 * Takes the specified qp number and rings the send doorbell.
 */
static void
dapli_hermon_sq_dbreg(dapls_hw_uar_t ia_uar, uint32_t qpn)
{
	uint64_t doorbell;

	doorbell = qpn << 8;

	/* Write the doorbell to UAR */
#ifdef _LP64
	((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
#else
#if defined(i386)
	dapl_os_lock(&g_tavor_uar_lock);
	/*
	 * For 32 bit intel we assign the doorbell in the order
	 * prescribed by the Tavor PRM, lower to upper addresses
	 */
	((tavor_hw_uar32_t *)ia_uar)->send[0] =
	    (uint32_t)HTOBE_32(doorbell >> 32);
	((tavor_hw_uar32_t *)ia_uar)->send[1] =
	    (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
	dapl_os_unlock(&g_tavor_uar_lock);
#else
	dapls_atomic_assign_64(HTOBE_64(doorbell),
	    &((tavor_hw_uar_t *)ia_uar)->send);
#endif
#endif
}
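
/*
 * Note on the two helpers above: each doorbell is a single 64-bit
 * store into the mapped UAR page (or two ordered 32-bit stores under a
 * lock on 32-bit x86).  The CQ doorbell carries the command, command
 * sequence number and CQ number in its upper word and the consumer
 * index in its lower word, while the send doorbell only needs the QP
 * number; the WQE contents themselves are presumably fetched by the
 * HCA from the queue buffers in memory.
 */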

/*
 * dapli_hermon_wqe_send_build()
 * Constructs a WQE for a given ibt_send_wr_t
 */
static int
dapli_hermon_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_snd_wqe_remaddr_t	*rc;
	tavor_hw_snd_wqe_bind_t		*bn;
	tavor_hw_wqe_sgl_t		*ds;
	ibt_wr_ds_t			*sgl;
	uint8_t				*src, *dst, *maxdst;
	uint32_t			nds;
	int				len, thislen, maxlen;
	uint32_t			new_rkey;
	uint32_t			old_rkey;
	int				i, num_ds;
	int				max_inline_bytes = -1;
	uint64_t			ctrl;
	uint64_t			nopcode;
	uint_t				my_size;

	nds = wr->wr_nds;
	sgl = wr->wr_sgl;
	num_ds = 0;
	ctrl = ((wr->wr_flags & IBT_WR_SEND_SIGNAL) ?
	    HERMON_WQE_SEND_SIGNALED_MASK : 0) |
	    ((wr->wr_flags & IBT_WR_SEND_SOLICIT) ?
	    HERMON_WQE_SEND_SOLICIT_MASK : 0);

	/*
	 * RC is the only supported transport in UDAPL
	 * For RC requests, we allow "Send", "RDMA Read", "RDMA Write"
	 */
	switch (wr->wr_opcode) {
	case IBT_WRC_SEND:
		/*
		 * If this is a Send request, then all we need is
		 * the Data Segment processing below.
		 * Initialize the information for the Data Segments
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
		break;
	case IBT_WRC_RDMAW:
		if (qp->qp_sq_inline != 0)
			max_inline_bytes =
			    qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
		nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
		/* FALLTHROUGH */
	case IBT_WRC_RDMAR:
		if (wr->wr_opcode == IBT_WRC_RDMAR) {
			if (qp->qp_sq_inline < 0)
				qp->qp_sq_inline = 0;
			nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
		}
		/*
		 * If this is an RDMA Read or RDMA Write request, then fill
		 * in the "Remote Address" header fields.
		 */
		rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Remote Address Segment for the WQE, using
		 * the information from the RC work request.
		 */
		TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);

		/* Update "ds" for filling in Data Segments (below) */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
		    sizeof (tavor_hw_snd_wqe_remaddr_t));
		break;
	case IBT_WRC_BIND:
		/*
		 * Generate a new R_key.  Increment the upper "unconstrained"
		 * bits and keep the lower "constrained" bits the same, as
		 * they represent the MPT index.
		 */
#if 0
	/* XXX - need equiv of "hermon_wr_bind_check(state, wr);" */
	/* XXX - uses hermon_mr_keycalc - what about Sinai vs. Arbel??? */
#endif
		old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
		new_rkey = old_rkey >> 8;		/* index */
		old_rkey = (old_rkey + 1) & 0xff;	/* incremented key */
		new_rkey = (new_rkey << 8) | old_rkey;

		wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;

		bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
		    sizeof (tavor_hw_snd_wqe_nextctrl_t));

		/*
		 * Build the Bind Memory Window Segments for the WQE,
		 * using the information from the RC Bind memory
		 * window work request.
		 */
		TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);

		/*
		 * Update the "ds" pointer.  Even though the "bind"
		 * operation requires no SGLs, this is necessary to
		 * facilitate the correct descriptor size calculations
		 * (below).
		 */
		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
		    sizeof (tavor_hw_snd_wqe_bind_t));
		nds = 0;
		nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
		break;
	default:
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "dapli_hermon_wqe_send_build: invalid wr_opcode=%d\n",
		    wr->wr_opcode);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based on
	 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer).
	 * Start by checking for a valid number of SGL entries
	 */
	if (nds > qp->qp_sq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Send Work Request, fill in the Send WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	if (max_inline_bytes != -1) {		/* compute total_len */
		len = 0;
		for (i = 0; i < nds; i++)
			len += sgl[i].ds_len;
		if (len == 0)
			max_inline_bytes = -1;	/* do not inline */
		else {
			/* need to reduce the length by dword "len" fields */
			max_inline_bytes -= (len / 64) * sizeof (uint32_t);
			if (len > max_inline_bytes)
				max_inline_bytes = -1;	/* too big for inline */
		}
	}
	if (max_inline_bytes != -1) {		/* do "inline" */

		dst = (uint8_t *)((uint32_t *)ds + 1);
		maxdst = (uint8_t *)(((uintptr_t)dst + 64) & ~(64 - 1));
		maxlen = maxdst - dst;
		thislen = 0;
		i = 0;
		src = (uint8_t *)(uintptr_t)sgl[i].ds_va;
		len = sgl[i].ds_len;
		do {
			/* if this sgl overflows the inline segment */
			if (len > maxlen) {
				if (maxlen)	/* might be 0 */
					(void) dapl_os_memcpy(dst,
					    src, maxlen);
				membar_producer();
				*(uint32_t *)ds =
				    HTOBE_32((thislen + maxlen) |
				    TAVOR_WQE_SGL_INLINE_MASK);
				thislen = 0;
				len -= maxlen;
				src += maxlen;
				dst = maxdst + sizeof (uint32_t);
				ds = (tavor_hw_wqe_sgl_t *)(void *)maxdst;
				maxdst += 64;
				maxlen = 64 - sizeof (uint32_t);
			} else {	/* this sgl fully fits */
				(void) dapl_os_memcpy(dst,
				    src, len);
				maxlen -= len;	/* room left */
				thislen += len;
				dst += len;
				while (++i < nds)
					if (sgl[i].ds_len)
						break;
				if (i >= nds)
					break;
				src = (uint8_t *)(uintptr_t)sgl[i].ds_va;
				len = sgl[i].ds_len;
			}
		} while (i < nds);
		membar_producer();
		*(uint32_t *)ds = HTOBE_32(thislen |
		    TAVOR_WQE_SGL_INLINE_MASK);

		/* Return the size of descriptor (in 16-byte chunks) */
		my_size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
		if (my_size <= (256 >> 4))
			*size = my_size;	/* use Hermon Blueflame */
		else
			*size = 0;
	} else {
		for (i = 0; i < nds; i++) {
			if (sgl[i].ds_len == 0) {
				continue;
			}

			/*
			 * Fill in the Data Segment(s) for the current WQE,
			 * using the information contained in the
			 * scatter-gather list of the work request.
			 */
			HERMON_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
			num_ds++;
		}

		/* Return the size of descriptor (in 16-byte chunks) */
		my_size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
		*size = 0;	/* do not use Hermon Blueflame */
	}
	HERMON_WQE_SETCTRL(addr, ctrl);
	membar_producer();
	HERMON_WQE_SETNEXT(addr, nopcode << 32, my_size,
	    (wr->wr_flags & IBT_WR_SEND_FENCE) ?
	    HERMON_WQE_SEND_FENCE_MASK : 0);

	return (DAT_SUCCESS);
}
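
/*
 * Inline-data layout used by the loop above: the payload is copied
 * straight into the WQE in 64-byte sections, and each section starts
 * with a 4-byte big-endian header holding that section's byte count
 * or'd with TAVOR_WQE_SGL_INLINE_MASK.  The membar_producer() calls
 * order the payload stores ahead of the header stores, presumably so
 * that a header never describes bytes that are not yet visible.
 */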

/*
 * dapli_hermon_wqe_recv_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_hermon_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
	tavor_hw_wqe_sgl_t	*ds;
	int			i;
	int			num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)addr;
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > qp->qp_rq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}
	if (i < qp->qp_rq_sgl) {
		ibt_wr_ds_t sgl;
		sgl.ds_va = (ib_vaddr_t)0;
		sgl.ds_len = (ib_msglen_t)0;
		sgl.ds_key = (ibt_lkey_t)HERMON_WQE_SGL_INVALID_LKEY;
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl);
	}

	/* Return the size of descriptor (in 16-byte chunks) */
	*size = qp->qp_rq_wqesz >> 4;

	return (DAT_SUCCESS);
}

/*
 * dapli_hermon_wqe_srq_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_hermon_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
{
	tavor_hw_wqe_sgl_t	*ds;
	ibt_wr_ds_t		end_sgl;
	int			i;
	int			num_ds;

	/* Fill in the Data Segments (SGL) for the Recv WQE */
	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
	num_ds = 0;

	/* Check for valid number of SGL entries */
	if (wr->wr_nds > srq->srq_wq_sgl) {
		return (DAT_INVALID_PARAMETER);
	}

	/*
	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because Tavor
	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
	 * the encoding for zero means a 2GB transfer.  Because of this special
	 * encoding in the hardware, we mask the requested length with
	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
	 * zero.)
	 */
	for (i = 0; i < wr->wr_nds; i++) {
		if (wr->wr_sgl[i].ds_len == 0) {
			continue;
		}

		/*
		 * Fill in the Data Segment(s) for the receive WQE, using the
		 * information contained in the scatter-gather list of the
		 * work request.
		 */
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
		num_ds++;
	}

	/*
	 * For SRQ, if the number of data segments is less than the maximum
	 * specified at alloc, then we have to fill in a special "key" entry in
	 * the sgl entry after the last valid one in this post request.  We do
	 * that here.
	 */
	if (num_ds < srq->srq_wq_sgl) {
		end_sgl.ds_va = (ib_vaddr_t)0;
		end_sgl.ds_len = (ib_msglen_t)0;
		end_sgl.ds_key = (ibt_lkey_t)HERMON_WQE_SGL_INVALID_LKEY;
		TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
	}

	return (DAT_SUCCESS);
}
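
/*
 * Both receive-WQE builders above terminate a partially used scatter
 * list with a zero-length segment whose key is
 * HERMON_WQE_SGL_INVALID_LKEY; this sentinel presumably tells the
 * hardware where the valid data segments end when fewer than the
 * maximum number of SGL entries are posted.
 */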

/*
 * dapli_hermon_cq_peek()
 * Peeks into a given CQ to check if there are any events that can be
 * polled.  It returns the number of CQEs that can be polled.
 */
static void
dapli_hermon_cq_peek(ib_cq_handle_t cq, int *num_cqe)
{
	uint32_t	*cqe;
	uint32_t	imm_eth_pkey_cred;
	uint32_t	cons_indx;
	int		polled_cnt;
	uint_t		doorbell_cnt;
	uint_t		opcode;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx & cq_wrap_around_mask;

	/* Calculate the pointer to the first CQ entry */
	cqe = (uint32_t *)&cq->cq_addr[cons_indx];

	/*
	 * Count entries in the CQ until we find an entry owned by
	 * the hardware.
	 */
	polled_cnt = 0;
	while (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
		opcode = HERMON_CQE_OPCODE_GET(cqe);
		/* Error CQEs map to multiple work completions */
		if (opcode == HERMON_CQE_ERR_OPCODE) {
			imm_eth_pkey_cred =
			    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
			doorbell_cnt =
			    imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
			polled_cnt += (doorbell_cnt + 1);
		} else {
			polled_cnt++;
		}
		/* Increment the consumer index */
		cons_indx = (cons_indx + 1) & cq_wrap_around_mask;

		/* Update the pointer to the next CQ entry */
		cqe = (uint32_t *)&cq->cq_addr[cons_indx];
	}

	*num_cqe = polled_cnt;
}

#define	dapli_hermon_cq_update_ci(cq, dbp) \
	(dbp)[0] = HTOBE_32(cq->cq_consindx & 0xFFFFFF)

/*
 * dapli_hermon_cq_resize_helper()
 * This routine switches from the pre-cq_resize buffer to the new buffer.
 */
static int
dapli_hermon_cq_resize_helper(ib_cq_handle_t cq)
{
	int i;

	if ((cq->cq_resize_addr == 0) ||
	    (munmap((char *)cq->cq_addr, cq->cq_map_len) < 0)) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR, "cq_resize_helper: "
		    "munmap(%p:0x%llx) failed(%d)\n", cq->cq_addr,
		    cq->cq_map_len, errno);
		return (1);	/* FAILED */
	}
	cq->cq_addr = cq->cq_resize_addr;
	cq->cq_map_offset = cq->cq_resize_map_offset;
	cq->cq_map_len = cq->cq_resize_map_len;
	cq->cq_size = cq->cq_resize_size;
	cq->cq_cqesz = cq->cq_resize_cqesz;
	cq->cq_resize_addr = 0;
	cq->cq_resize_map_offset = 0;
	cq->cq_resize_map_len = 0;
	cq->cq_resize_size = 0;
	cq->cq_resize_cqesz = 0;
	for (i = 0; (1 << i) < cq->cq_size; i++)
		;
	cq->cq_log_cqsz = i;

	cq->cq_consindx++;	/* consume the RESIZE cqe */

	return (0);	/* SUCCESS */
}

/*
 * dapli_hermon_cq_poll()
 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
 * array that is passed in.
 */
static DAT_RETURN
dapli_hermon_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
{
	uint32_t	*cqe;
	uint32_t	cons_indx;
	uint32_t	polled_cnt;
	DAT_RETURN	dat_status;
	int		status;

	/* Get the consumer index */
	cons_indx = cq->cq_consindx & cq_wrap_around_mask;

	/* Calculate the pointer to the first CQ entry */
	cqe = (uint32_t *)&cq->cq_addr[cons_indx];

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling dapli_hermon_cq_cqe_consume() and updating the
	 * CQ consumer index.  Note: We only update the consumer index if
	 * dapli_hermon_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE and corresponds to more than
	 * one completion).
	 */
	polled_cnt = 0;
	while (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
		if (HERMON_CQE_OPCODE_GET(cqe) == HERMON_CQE_RESIZE_OPCODE) {
			if (dapli_hermon_cq_resize_helper(cq))
				return (DAT_ERROR(DAT_INTERNAL_ERROR, 0));
			cons_indx = cq->cq_consindx & cq_wrap_around_mask;
			cqe = (uint32_t *)&cq->cq_addr[cons_indx];
			continue;
		}
		status = dapli_hermon_cq_cqe_consume(cq, cqe,
		    &wc_p[polled_cnt++]);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset to hardware ownership is implicit in Hermon */
			cq->cq_consindx++;	/* incr the total counter */

			/* Increment the consumer index */
			cons_indx = (cons_indx + 1) & cq_wrap_around_mask;

			/* Update the pointer to the next CQ entry */
			cqe = (uint32_t *)&cq->cq_addr[cons_indx];
		}

		/*
		 * If we have run out of space to store work completions,
		 * then stop and return the ones we have pulled off the CQ.
		 */
		if (polled_cnt >= num_wc) {
			break;
		}
	}

	dat_status = DAT_SUCCESS;
	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we have, for example,
	 * pulled from a CQE that we are still in the process of "recycling"
	 * for error purposes, then we would not update the consumer index.
	 */
	if (polled_cnt != 0) {
		/*
		 * Update the consumer index in both the CQ handle and the
		 * doorbell record.
		 */
		dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
	} else if (polled_cnt == 0) {
		/*
		 * If the CQ is empty, we can try to free up some of the WRID
		 * list containers.
		 */
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}

	if (num_polled != NULL) {
		*num_polled = polled_cnt;
	}

	return (dat_status);
}

/*
 * dapli_hermon_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
 * that is passed in.  See above for more comments/details.
 */
static DAT_RETURN
dapli_hermon_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
{
	uint32_t	*cqe;
	uint32_t	cons_indx;
	DAT_RETURN	dat_status;
	int		status;

start_over:
	/* Get the consumer index */
	cons_indx = cq->cq_consindx & cq_wrap_around_mask;

	/* Calculate the pointer to the first CQ entry */
	cqe = (uint32_t *)&cq->cq_addr[cons_indx];

	/*
	 * Keep pulling entries from the CQ until we find an entry owned by
	 * the hardware.  As long as the CQEs are owned by SW, process
	 * each entry by calling dapli_hermon_cq_cqe_consume() and updating the
	 * CQ consumer index.  Note: We only update the consumer index if
	 * dapli_hermon_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
	 * Otherwise, it indicates that we are going to "recycle" the CQE
	 * (probably because it is an error CQE and corresponds to more than
	 * one completion).
	 */
	if (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
		if (HERMON_CQE_OPCODE_GET(cqe) == HERMON_CQE_RESIZE_OPCODE) {
			if (dapli_hermon_cq_resize_helper(cq))
				return (DAT_ERROR(DAT_INTERNAL_ERROR, 0));
			goto start_over;
		}
		status = dapli_hermon_cq_cqe_consume(cq, cqe, wc_p);
		if (status == TAVOR_CQ_SYNC_AND_DB) {
			/* Reset to hardware ownership is implicit in Hermon */

			/* Increment the consumer index */
			cq->cq_consindx++;
			dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
		}
		dat_status = DAT_SUCCESS;
	} else {
		if (cq->cq_wrid_reap_head)	/* look before leaping */
			dapls_tavor_wrid_cq_reap(cq);
		dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
	}
	return (dat_status);
}

/*
 * dapli_hermon_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
 */
static int
dapli_hermon_cq_cqe_consume(ib_cq_handle_t cqhdl, uint32_t *cqe,
    ibt_wc_t *wc)
{
	uint_t		flags;
	uint_t		type;
	uint_t		opcode;
	int		status;

	/*
	 * Determine if this is an "error" CQE by examining "opcode".  If it
	 * is an error CQE, then call dapli_hermon_cq_errcqe_consume() and
	 * return whatever status it returns.  Otherwise, this is a successful
	 * completion.
	 */
	opcode = HERMON_CQE_OPCODE_GET(cqe);
	if (opcode == HERMON_CQE_ERR_OPCODE) {
		status = dapli_hermon_cq_errcqe_consume(cqhdl, cqe, wc);
		return (status);
	}
	TAVOR_CQE_WQEADDRSZ_SET(cqe, (HTOBE_32(cqe[6]) >> 10) &
	    ~HERMON_WQE_NDS_MASK);

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, (tavor_hw_cqe_t *)cqe,
	    HERMON_CQE_SENDRECV_GET(cqe) >> 6, 0, NULL);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  This will set
	 * not only the type of the completion, but also any flags that might
	 * be associated with it (e.g. whether immediate data is present).
	 */
	flags = IBT_WC_NO_FLAGS;
	if (HERMON_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {

		/*
		 * Send CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 * case TAVOR_CQE_SND_RDMAWR_IMM:
		 * case TAVOR_CQE_SND_SEND_IMM:
		 * case TAVOR_CQE_SND_ATOMIC_CS:
		 * case TAVOR_CQE_SND_ATOMIC_FA:
		 */
		switch (opcode) {
		case TAVOR_CQE_SND_RDMAWR:
			type = IBT_WRC_RDMAW;
			break;

		case TAVOR_CQE_SND_SEND:
			type = IBT_WRC_SEND;
			break;

		case TAVOR_CQE_SND_RDMARD:
			type = IBT_WRC_RDMAR;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;

		case TAVOR_CQE_SND_BIND_MW:
			type = IBT_WRC_BIND;
			break;

		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	} else {

		/*
		 * Receive CQE
		 *
		 * The following opcodes will not be generated in uDAPL
		 *
		 * case TAVOR_CQE_RCV_RECV_IMM:
		 * case TAVOR_CQE_RCV_RECV_IMM2:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM:
		 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
		 */
		switch (opcode) {
		case HERMON_CQE_RCV_SEND:
			type = IBT_WRC_RECV;
			wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
			break;
		default:
			wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
			return (TAVOR_CQ_SYNC_AND_DB);
		}
	}
	wc->wc_type = type;
	wc->wc_flags = flags;
	/* If we got here, completion status must be success */
	wc->wc_status = IBT_WC_SUCCESS;

	return (TAVOR_CQ_SYNC_AND_DB);
}

/*
 * dapli_hermon_cq_errcqe_consume()
 */
static int
dapli_hermon_cq_errcqe_consume(ib_cq_handle_t cqhdl, uint32_t *cqe,
    ibt_wc_t *wc)
{
	dapls_tavor_wrid_entry_t	wre;
	uint_t				status;
	uint_t				send_or_recv;

	dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
	    TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
	    TAVOR_CQE_WQEADDRSZ_GET(cqe));

	status = ((uint8_t *)cqe)[0x1B];
	TAVOR_CQE_WQEADDRSZ_SET(cqe, (HTOBE_32(cqe[6]) >> 10) &
	    ~HERMON_WQE_NDS_MASK);
	if (HERMON_CQE_SENDRECV_GET(cqe) == 0) {
		send_or_recv = 0;
	} else {
		send_or_recv = 1;
	}

	/*
	 * Fetch the Work Request ID using the information in the CQE.
	 * See tavor_wr.c for more details.
	 */
	wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, (tavor_hw_cqe_t *)cqe,
	    send_or_recv, 1, &wre);
	wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

	/*
	 * Parse the CQE opcode to determine completion type.  We know that
	 * the CQE is an error completion, so we extract only the completion
	 * status here.
	 */
	switch (status) {
	case TAVOR_CQE_LOC_LEN_ERR:
		status = IBT_WC_LOCAL_LEN_ERR;
		break;

	case TAVOR_CQE_LOC_OP_ERR:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;

	case TAVOR_CQE_LOC_PROT_ERR:
		status = IBT_WC_LOCAL_PROTECT_ERR;
		break;

	case TAVOR_CQE_WR_FLUSHED_ERR:
		status = IBT_WC_WR_FLUSHED_ERR;
		break;

	case TAVOR_CQE_MW_BIND_ERR:
		status = IBT_WC_MEM_WIN_BIND_ERR;
		break;

	case TAVOR_CQE_BAD_RESPONSE_ERR:
		status = IBT_WC_BAD_RESPONSE_ERR;
		break;

	case TAVOR_CQE_LOCAL_ACCESS_ERR:
		status = IBT_WC_LOCAL_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_INV_REQ_ERR:
		status = IBT_WC_REMOTE_INVALID_REQ_ERR;
		break;

	case TAVOR_CQE_REM_ACC_ERR:
		status = IBT_WC_REMOTE_ACCESS_ERR;
		break;

	case TAVOR_CQE_REM_OP_ERR:
		status = IBT_WC_REMOTE_OP_ERR;
		break;

	case TAVOR_CQE_TRANS_TO_ERR:
		status = IBT_WC_TRANS_TIMEOUT_ERR;
		break;

	case TAVOR_CQE_RNRNAK_TO_ERR:
		status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
		break;

	/*
	 * The following error codes are not supported in the Tavor driver
	 * as they relate only to Reliable Datagram completion statuses:
	 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
	 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
	 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
	 * case TAVOR_CQE_INV_EEC_NUM_ERR:
	 * case TAVOR_CQE_INV_EEC_STATE_ERR:
	 * case TAVOR_CQE_LOC_EEC_ERR:
	 */

	default:
		status = IBT_WC_LOCAL_CHAN_OP_ERR;
		break;
	}
	wc->wc_status = status;
	wc->wc_type = 0;

	/*
	 * Consume the CQE
	 * Return status to indicate that doorbell and sync may be
	 * necessary.
	 */
	return (TAVOR_CQ_SYNC_AND_DB);
}

/*
 * dapli_hermon_cq_notify()
 * This function is used for arming the CQ by ringing the CQ doorbell.
 *
 * Note: there is something very subtle here.  This code assumes a very
 * specific behavior of the kernel driver.  The cmd_sn field of the
 * arm_dbr is updated by the kernel driver whenever a notification
 * event for the cq is received.  This code extracts the cmd_sn field
 * from the arm_dbr to know the right value to use.  The arm_dbr is
 * always updated atomically so that neither the kernel driver nor this
 * code will get confused about what the other is doing.
 *
 * Note: param is not used here.  It is necessary for arming a CQ for
 * N completions (param is N), but no uDAPL API supports this for now.
 * Thus, we declare ARGSUSED to make lint happy.
 */
/*ARGSUSED*/
static DAT_RETURN
dapli_hermon_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
{
	uint32_t	cqnum;
	uint32_t	*target;
	uint32_t	old_cmd, cmp, new, tmp, cmd_sn;

	/*
	 * Determine if we are trying to get the next completion or the next
	 * "solicited" completion.  Then hit the appropriate doorbell.
	 */
	cqnum = cq->cq_num;
	target = cq->cq_arm_dbp;
retry:
	cmp = *target;
	tmp = HTOBE_32(cmp);
	old_cmd = tmp & (0x7 << 24);
	cmd_sn = tmp & (0x3 << 28);

	if (flags == IB_NOTIFY_ON_NEXT_COMP) {
		if (old_cmd != HERMON_CQDB_NOTIFY_CQ) {
			new = HTOBE_32(cmd_sn | HERMON_CQDB_NOTIFY_CQ |
			    (cq->cq_consindx & 0xFFFFFF));
			tmp = atomic_cas_32(target, cmp, new);
			if (tmp != cmp)
				goto retry;
			dapli_hermon_cq_doorbell(cq->cq_iauar,
			    HERMON_CQDB_NOTIFY_CQ, cqnum,
			    cmd_sn, cq->cq_consindx);
		} /* else it's already armed */
	} else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
		if (old_cmd != HERMON_CQDB_NOTIFY_CQ &&
		    old_cmd != HERMON_CQDB_NOTIFY_CQ_SOLICIT) {
			new = HTOBE_32(cmd_sn | HERMON_CQDB_NOTIFY_CQ_SOLICIT |
			    (cq->cq_consindx & 0xFFFFFF));
			tmp = atomic_cas_32(target, cmp, new);
			if (tmp != cmp)
				goto retry;
			dapli_hermon_cq_doorbell(cq->cq_iauar,
			    HERMON_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
			    cmd_sn, cq->cq_consindx);
		} /* else it's already armed */
	} else {
		return (DAT_INVALID_PARAMETER);
	}

	return (DAT_SUCCESS);
}

/*
 * Since uDAPL posts 1 wqe per request, we
 * only need to do stores for the last one.
 */
static void
dapli_hermon_wqe_headroom(ib_qp_handle_t qp, uint32_t start)
{
	uint32_t	*wqe_start, *wqe_top, *wqe_base, qsize, invalue;
	int		hdrmwqes, wqesizebytes, sectperwqe, i, j;

	qsize = qp->qp_sq_numwqe;
	wqesizebytes = qp->qp_sq_wqesz;
	sectperwqe = wqesizebytes >> 6;
	hdrmwqes = qp->qp_sq_headroom;
	wqe_base = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, 0);
	wqe_top = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, qsize);
	wqe_start = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, start);

	for (i = 0; i < hdrmwqes - 1; i++) {
		wqe_start += sectperwqe * 16;
		if (wqe_start == wqe_top)
			wqe_start = wqe_base;
	}
	invalue = HTOBE_32(*wqe_start);
	invalue |= 0x7FFFFFFF;
	*wqe_start = HTOBE_32(invalue);
	wqe_start += 16;
	for (j = 1; j < sectperwqe; j++) {
		*wqe_start = 0xFFFFFFFF;
		wqe_start += 16;
	}
}
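
/*
 * dapli_hermon_wqe_headroom() stamps only the WQE that sits
 * (qp_sq_headroom - 1) entries beyond the one being posted: the first
 * dword of its first 64-byte section is set to all ones except for
 * the preserved owner bit, and the first dword of every later section
 * is set to 0xFFFFFFFF.  Earlier posts (and dapli_hermon_sq_init())
 * have already stamped the entries in between, so keeping the headroom
 * window invalidated only requires touching one WQE per post.
 */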

/*
 * dapli_hermon_post_send()
 */
/* ARGSUSED */
static DAT_RETURN
dapli_hermon_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
{
	dapls_tavor_wrid_list_hdr_t	*wridlist;
	dapls_tavor_wrid_entry_t	*wre_last;
	uint64_t			*desc;
	uint64_t			*wqe_addr;
	uint32_t			desc_sz;
	uint32_t			wqeaddrsz, signaled_dbd = 0;
	uint32_t			head, tail, next_tail, qsize_msk;
	int				status;
	ib_qp_handle_t			qp;

	if ((ep->qp_state == IBT_STATE_RESET) ||
	    (ep->qp_state == IBT_STATE_INIT) ||
	    (ep->qp_state == IBT_STATE_RTR)) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_send: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}

	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
	tail = qp->qp_sq_wqhdr->wq_tail;
	head = qp->qp_sq_wqhdr->wq_head;

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * then no more WQEs can be posted, so return an error.
	 */
	if (qp->qp_sq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue full" condition.
	 * If we detect that the current work request is going to fill the
	 * work queue, then we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_sq_wqhdr->wq_full = 1;
	}

	/*
	 * Get the user virtual address of the location where the next
	 * Send WQE should be built
	 */
	wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);

	/*
	 * Call dapli_hermon_wqe_send_build() to build the WQE at the given
	 * address.  This routine uses the information in the ibt_send_wr_t
	 * and returns the size of the WQE when it returns.
	 */
	status = dapli_hermon_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
		return (status);
	}

	/*
	 * Get the descriptor (io address) corresponding to the location
	 * where the Send WQE was built.
	 */
	desc = TAVOR_QP_SQ_ENTRY(qp, tail);

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddr" to pass to dapli_tavor_wrid_add_entry().
	 * signaled_dbd is still calculated, but ignored.
	 */
	wqeaddrsz = HERMON_QP_WQEADDRSZ(qp->qp_sq_counter);

	if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
		signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
	}

	dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
	    signaled_dbd);

	dapli_hermon_wqe_headroom(qp, next_tail);
	*(uint8_t *)desc ^= 0x80;	/* set owner bit */

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_sq_lastwqeaddr = wqe_addr;
	qp->qp_sq_wqhdr->wq_tail = next_tail;

	if (desc_sz && qp->qp_ia_bf != NULL) {	/* use Hermon Blueflame */
		uint64_t *bf_dest, *src64;
		uint8_t *src8;
		int i;

		(void) pthread_spin_lock(&hermon_bf_lock);

		src8 = (uint8_t *)desc;
		src8[1] = (uint8_t)(qp->qp_sq_counter >> 8);
		src8[2] = (uint8_t)qp->qp_sq_counter;
		src8[4] = (uint8_t)(qp->qp_num >> 16);
		src8[5] = (uint8_t)(qp->qp_num >> 8);
		src8[6] = (uint8_t)qp->qp_num;

		src64 = (uint64_t *)desc;
		bf_dest = (uint64_t *)((uintptr_t)qp->qp_ia_bf +
		    *qp->qp_ia_bf_toggle);
		*qp->qp_ia_bf_toggle ^= 256;	/* 2 256-byte buffers */
		for (i = 0; i < desc_sz * 2; i += 8) {
			bf_dest[i] = src64[i];
			bf_dest[i + 1] = src64[i + 1];
			bf_dest[i + 2] = src64[i + 2];
			bf_dest[i + 3] = src64[i + 3];
			bf_dest[i + 4] = src64[i + 4];
			bf_dest[i + 5] = src64[i + 5];
			bf_dest[i + 6] = src64[i + 6];
			bf_dest[i + 7] = src64[i + 7];
		}
		(void) pthread_spin_unlock(&hermon_bf_lock);
	} else {
		/* Ring the doorbell */
		dapli_hermon_sq_dbreg(qp->qp_iauar, qp->qp_num);
	}
	qp->qp_sq_counter++;

	dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}
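
/*
 * BlueFlame note for the post path above: when the built descriptor is
 * small enough (dapli_hermon_wqe_send_build() reports a non-zero size
 * only for inline WQEs of 256 bytes or less) and a BlueFlame area is
 * mapped, the entire WQE is copied into one of two alternating
 * 256-byte BlueFlame buffers in the UAR page instead of ringing the
 * regular send doorbell; the send counter and QP number are patched
 * into the control segment first, and hermon_bf_lock serializes use of
 * the shared buffers.
 */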

/*
 * dapli_hermon_post_recv()
 */
/* ARGSUSED */
static DAT_RETURN
dapli_hermon_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
{
	dapls_tavor_wrid_list_hdr_t	*wridlist;
	dapls_tavor_wrid_entry_t	*wre_last;
	ib_qp_handle_t			qp;
	DAT_RETURN			status;
	uint64_t			*wqe_addr;
	uint32_t			desc_sz;
	uint32_t			wqeaddrsz;
	uint32_t			head, tail, next_tail, qsize_msk;

	if (ep->qp_state == IBT_STATE_RESET) {
		dapl_dbg_log(DAPL_DBG_TYPE_ERR,
		    "post_recv: invalid qp_state %d\n", ep->qp_state);
		return (DAT_INVALID_STATE);
	}
	qp = ep->qp_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
	wridlist = qp->qp_rq_wqhdr->wq_wrid_post;

	/* Save away some initial QP state */
	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
	tail = qp->qp_rq_wqhdr->wq_tail;
	head = qp->qp_rq_wqhdr->wq_head;

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * door bell.
	 */

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * then no more WQEs can be posted.  So return an error.
	 */
	if (qp->qp_rq_wqhdr->wq_full != 0) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/*
	 * Increment the "tail index" and check for "queue
	 * full" condition.  If we detect that the current
	 * work request is going to fill the work queue, then
	 * we mark this condition and continue.
	 */
	next_tail = (tail + 1) & qsize_msk;
	if (next_tail == head) {
		qp->qp_rq_wqhdr->wq_full = 1;
	}

	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);

	/*
	 * Call dapli_hermon_wqe_recv_build() to build the WQE at the given
	 * address.  This routine uses the information in the
	 * ibt_recv_wr_t and returns the size of the WQE.
	 */
	status = dapli_hermon_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
		return (DAT_INTERNAL_ERROR);
	}

	/*
	 * Add a WRID entry to the WRID list.  Need to calculate the
	 * "wqeaddr" and "signaled_dbd" values to pass to
	 * dapli_tavor_wrid_add_entry().
	 * Note: all Recv WQEs are essentially "signaled"
	 */
	wqeaddrsz = HERMON_QP_WQEADDRSZ(qp->qp_rq_counter);
	dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
	    (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);

	/*
	 * Now if the WRID tail entry is non-NULL, then this
	 * represents the entry to which we are chaining the
	 * new entries.  Since we are going to ring the
	 * doorbell for this WQE, we want to set its "dbd" bit.
	 *
	 * On the other hand, if the tail is NULL, even though
	 * we will have rung the doorbell for the previous WQE
	 * (for the hardware's sake) it is irrelevant to our
	 * purposes (for tracking WRIDs) because we know the
	 * request must have already completed.
	 */
	wre_last = wridlist->wl_wre_old_tail;
	if (wre_last != NULL) {
		wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
	}

	/* Update some of the state in the QP */
	qp->qp_rq_lastwqeaddr = wqe_addr;
	qp->qp_rq_wqhdr->wq_tail = next_tail;

	/* Update the doorbell record */
	qp->qp_rq_counter++;
	(qp->qp_rq_dbp)[0] = HTOBE_32(qp->qp_rq_counter);

	dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_hermon_post_srq()
 */
/* ARGSUSED */
static DAT_RETURN
dapli_hermon_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
{
	ib_srq_handle_t	srq;
	DAT_RETURN	status;
	uint32_t	desc;
	uint64_t	*wqe_addr;
	uint32_t	head, next_head, qsize_msk;
	uint32_t	wqe_index;


	srq = srqp->srq_handle;

	/* Grab the lock for the WRID list */
	dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);

	/*
	 * For the ibt_recv_wr_t passed in, parse the request and build a
	 * Recv WQE.  Link the WQE with the previous WQE and ring the
	 * door bell.
	 */

	/*
	 * Check for "queue full" condition.  If the queue is already full,
	 * ie. there are no free entries, then no more WQEs can be posted.
	 * So return an error.
	 */
	if (srq->srq_wridlist->wl_freel_entries == 0) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (DAT_INSUFFICIENT_RESOURCES);
	}

	/* Save away some initial SRQ state */
	qsize_msk = srq->srq_wridlist->wl_size - 1;
	head = srq->srq_wridlist->wl_freel_head;

	next_head = (head + 1) & qsize_msk;

	/* Get the descriptor (IO Address) of the WQE to be built */
	desc = srq->srq_wridlist->wl_free_list[head];

	wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
	    srq->srq_wq_wqesz);

	/* The user virtual address of the WQE to be built */
	wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);

	/*
	 * Call dapli_hermon_wqe_srq_build() to build the WQE at the given
	 * address.  This routine uses the information in the
	 * ibt_recv_wr_t and returns the size of the WQE.
	 */
	status = dapli_hermon_wqe_srq_build(srq, wr, wqe_addr);
	if (status != DAT_SUCCESS) {
		dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
		return (status);
	}

	/*
	 * Add a WRID entry to the WRID list.
	 */
	dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);

#if 0
	if (srq->srq_wq_lastwqeindex == -1) {
		last_wqe_addr = NULL;
	} else {
		last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
		    srq->srq_wq_lastwqeindex);
	}
	/*
	 * Now link the chain to the old chain (if there was one)
	 * and update the wqe_counter in the doorbell record.
	 */
	XXX
	dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);
#endif

	/* Update some of the state in the SRQ */
	srq->srq_wq_lastwqeindex = wqe_index;
	srq->srq_wridlist->wl_freel_head = next_head;
	srq->srq_wridlist->wl_freel_entries--;
	dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
	    srq->srq_wridlist->wl_size);

	/* Update the doorbell record */
	srq->srq_counter++;
	(srq->srq_dbp)[0] = HTOBE_32(srq->srq_counter);

	dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);

	return (DAT_SUCCESS);
}

/*
 * dapli_hermon_cq_srq_entries_flush()
 */
static void
dapli_hermon_cq_srq_entries_flush(ib_qp_handle_t qp)
{
	ib_cq_handle_t		cq;
	dapls_tavor_workq_hdr_t	*wqhdr;
	tavor_hw_cqe_t		*cqe;
	tavor_hw_cqe_t		*next_cqe;
	uint32_t		cons_indx, tail_cons_indx;
	uint32_t		new_indx, check_indx, indx;
	int			cqe_qpnum, cqe_type;
	int			outstanding_cqes, removed_cqes;
	int			i;

	/* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */

	cq = qp->qp_rq_cqhdl;
	wqhdr = qp->qp_rq_wqhdr;

	dapl_os_assert(wqhdr->wq_wrid_post != NULL);
	dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);

	/* Get the consumer index */
	cons_indx = cq->cq_consindx;

	/* Calculate the pointer to the first CQ entry */
	cqe = &cq->cq_addr[cons_indx];

	/*
	 * Loop through the CQ looking for entries owned by software.  If an
	 * entry is owned by software then we increment an 'outstanding_cqes'
	 * count to know how many entries total we have on our CQ.  We use this
	 * value further down to know how many entries to loop through looking
	 * for our same QP number.
	 */
	outstanding_cqes = 0;
	tail_cons_indx = cons_indx;
	while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
		/* increment total cqes count */
		outstanding_cqes++;

		/* increment the consumer index */
		tail_cons_indx = (tail_cons_indx + 1) & cq_wrap_around_mask;

		/* update the pointer to the next cq entry */
		cqe = &cq->cq_addr[tail_cons_indx];
	}

	/*
	 * Using the 'tail_cons_indx' that was just set, we now know how many
	 * total CQEs possible there are.  Set the 'check_indx' and the
	 * 'new_indx' to the last entry identified by 'tail_cons_indx'
	 */
	check_indx = new_indx = (tail_cons_indx - 1) & cq_wrap_around_mask;

	for (i = 0; i < outstanding_cqes; i++) {
		cqe = &cq->cq_addr[check_indx];

		/* Grab QP number from CQE */
		cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
		cqe_type = HERMON_CQE_SENDRECV_GET(cqe);

		/*
		 * If the QP number is the same in the CQE as the QP that we
		 * have on this SRQ, then we must free up the entry off the
		 * SRQ.
		 * We also make sure that the completion type is of the
		 * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
		 * this CQ will be left as-is.  The handling of returning
		 * entries back to HW ownership happens further down.
		 */
		if (cqe_qpnum == qp->qp_num &&
		    cqe_type == TAVOR_COMPLETION_RECV) {
			/* Add back to SRQ free list */
			(void) dapli_tavor_wrid_find_match_srq(
			    wqhdr->wq_wrid_post, cqe);
		} else {
			/* Do Copy */
			if (check_indx != new_indx) {
				next_cqe = &cq->cq_addr[new_indx];
				/*
				 * Copy the CQE into the "next_cqe"
				 * pointer.
				 */
				(void) dapl_os_memcpy(next_cqe, cqe,
				    sizeof (tavor_hw_cqe_t));
			}
			new_indx = (new_indx - 1) & cq_wrap_around_mask;
		}
		/* Move index to next CQE to check */
		check_indx = (check_indx - 1) & cq_wrap_around_mask;
	}

	/* Initialize removed cqes count */
	removed_cqes = 0;

	/* If an entry was removed */
	if (check_indx != new_indx) {

		/*
		 * Set current pointer back to the beginning consumer index.
		 * At this point, all unclaimed entries have been copied to the
		 * index specified by 'new_indx'.  This 'new_indx' will be used
		 * as the new consumer index after we mark all freed entries as
		 * having HW ownership.  We do that here.
		 */

		/* Loop through all entries until we reach our new pointer */
		for (indx = cons_indx; indx <= new_indx;
		    indx = (indx + 1) & cq_wrap_around_mask) {
			removed_cqes++;
			cqe = &cq->cq_addr[indx];

			/* Reset entry to hardware ownership */
			TAVOR_CQE_OWNER_SET_HW(cqe);
		}
	}

	/*
	 * Update consumer index to be the 'new_indx'.  This moves it past all
	 * removed entries.  Because 'new_indx' is pointing to the last
	 * previously valid SW owned entry, we add 1 to point the cons_indx to
	 * the first HW owned entry.
	 */
	cons_indx = (new_indx + 1) & cq_wrap_around_mask;

	/*
	 * Now we only ring the doorbell (to update the consumer index) if
	 * we've actually consumed a CQ entry.  If we found no QP number
	 * matches above, then we would not have removed anything.  So only if
	 * something was removed do we ring the doorbell.
	 */
	if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
		/*
		 * Update the consumer index in both the CQ handle and the
		 * doorbell record.
		 */
		cq->cq_consindx = cons_indx;
		dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
	}
}

static void
dapli_hermon_rq_prelink(caddr_t first, uint32_t desc_off, uint32_t wqesz,
    uint32_t numwqe, uint32_t nds)
{
	int		i;
	uint32_t	*p = (uint32_t *)(uintptr_t)first;
	uint32_t	off = desc_off;
	uint32_t	pincr = wqesz / sizeof (uint32_t);
	ibt_wr_ds_t	sgl;

	sgl.ds_va = (ib_vaddr_t)0;
	sgl.ds_key = HERMON_WQE_SGL_INVALID_LKEY;
	sgl.ds_len = (ib_msglen_t)0;

	for (i = 0; i < numwqe - 1; i++, p += pincr) {
		off += wqesz;
		p[0] = HTOBE_32(off);	/* link curr to next */
		p[1] = nds;		/* nds is 0 for SRQ */
		TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
	}
	p[0] = HTOBE_32(desc_off);	/* link last to first */
	p[1] = nds;
	TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
}

static void
dapli_hermon_sq_init(caddr_t first, uint32_t wqesz, uint32_t numwqe)
{
	int		i, j;
	uint64_t	*wqe = (uint64_t *)(uintptr_t)first;

	for (i = 0; i < numwqe; i++) {
		for (j = 0; j < wqesz; j += 64, wqe += 8)
			*(uint32_t *)wqe = 0xFFFFFFFF;
	}
}

static void
dapli_hermon_qp_init(ib_qp_handle_t qp)
{
	dapli_hermon_sq_init(qp->qp_sq_buf, qp->qp_sq_wqesz, qp->qp_sq_numwqe);
	qp->qp_rq_counter = 0;
	qp->qp_sq_counter = 0;
}

static void
dapli_hermon_cq_init(ib_cq_handle_t cq)
{
	uint32_t i;

	(cq->cq_arm_dbp)[0] = HTOBE_32(1 << 28);
	for (i = 0; (1 << i) < cq->cq_size; i++)
		;
	cq->cq_log_cqsz = i;
	cq->cq_consindx = 0;

	/* cq_resize -- needs testing */
}

static void
dapli_hermon_srq_init(ib_srq_handle_t srq)
{
	/* pre-link the whole shared receive queue */
	dapli_hermon_rq_prelink(srq->srq_addr, srq->srq_wq_desc_addr,
	    srq->srq_wq_wqesz, srq->srq_wq_numwqe, 0);
	srq->srq_counter = 0;

	/* needs testing */
}

void
dapls_init_funcs_hermon(DAPL_HCA *hca_ptr)
{
	hca_ptr->post_send = dapli_hermon_post_send;
	hca_ptr->post_recv = dapli_hermon_post_recv;
	hca_ptr->post_srq = dapli_hermon_post_srq;
	hca_ptr->cq_peek = dapli_hermon_cq_peek;
	hca_ptr->cq_poll = dapli_hermon_cq_poll;
	hca_ptr->cq_poll_one = dapli_hermon_cq_poll_one;
	hca_ptr->cq_notify = dapli_hermon_cq_notify;
	hca_ptr->srq_flush = dapli_hermon_cq_srq_entries_flush;
	hca_ptr->qp_init = dapli_hermon_qp_init;
	hca_ptr->cq_init = dapli_hermon_cq_init;
	hca_ptr->srq_init = dapli_hermon_srq_init;
	hca_ptr->hermon_resize_cq = 1;

	(void) pthread_spin_init(&hermon_bf_lock, 0);
}