1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include "dapl.h"
28 #include "dapl_tavor_hw.h"
29 #include "dapl_tavor_wr.h"
30 #include "dapl_tavor_ibtf_impl.h"
31
32 #define HERMON_WQE_SGL_INVALID_LKEY 0x00000100
33 #define HERMON_WQE_SEND_FENCE_MASK 0x40
34 #define HERMON_WQE_NDS_MASK 0x3F
35
36 #define HERMON_CQDB_NOTIFY_CQ_SOLICIT (0x1 << 24)
37 #define HERMON_CQDB_NOTIFY_CQ (0x2 << 24)
38
39 #define HERMON_CQE_RCV_SEND 0x1
40 #define HERMON_CQE_ERR_OPCODE 0x1E
41 #define HERMON_CQE_RESIZE_OPCODE 0x16
42 #define HERMON_CQE_OPCODE_GET(cqe) (((uint8_t *)cqe)[31] & 0x1F)
43 #define HERMON_CQE_SENDRECV_GET(cqe) (((uint8_t *)cqe)[31] & 0x40)
44 #define HERMON_CQE_OWNER_IS_SW(cq, cqe) ((((uint8_t *)cqe)[31] >> 7) == \
45 ((cq->cq_consindx & cq->cq_size) >> cq->cq_log_cqsz))
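/*
 * Worked example of the ownership test above (illustrative numbers): with
 * cq_size = 256 (so cq_log_cqsz = 8) and cq_consindx = 260, the consumer
 * index has wrapped the CQ an odd number of times, so
 * (260 & 256) >> 8 == 1 and a CQE belongs to software only while its
 * owner bit (bit 7 of byte 31) is also 1.
 */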
46
47 #define HERMON_QP_WQEADDRSZ(wcnt) ((uint32_t)(wcnt << 6))
48
49 #define HERMON_WQE_SEND_SIGNALED_MASK 0x0000000C00000000ull
50 #define HERMON_WQE_SEND_SOLICIT_MASK 0x0000000200000000ull
51 #define HERMON_WQE_SETCTRL(desc, ctrl) \
52 ((uint64_t *)(desc))[1] = HTOBE_64(ctrl)
53 #define HERMON_WQE_SETNEXT(desc, nopcode, size, fence) \
54 ((uint64_t *)(desc))[0] = HTOBE_64((nopcode) | (size) | (fence) | \
55 	(((uint64_t)((uint8_t *)desc)[0] & 0x80) << 56))
56 #define HERMON_WQE_BUILD_DATA_SEG(ds, sgl) \
57 { \
58 uint64_t *tmp; \
59 \
60 tmp = (uint64_t *)(ds); \
61 tmp[1] = HTOBE_64((sgl)->ds_va); \
62 ((uint32_t *)tmp)[1] = HTOBE_32((sgl)->ds_key); \
63 membar_producer(); \
64 ((uint32_t *)tmp)[0] = HTOBE_32((sgl)->ds_len); \
65 }
66
67
68 /* handy macro, useful because of cq_resize dynamics */
69 #define cq_wrap_around_mask (cq->cq_size - 1)
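/*
 * Typical use (see the CQ peek/poll routines below):
 *	cons_indx = cq->cq_consindx & cq_wrap_around_mask;
 * Recomputing the mask from cq->cq_size on every use keeps the index
 * valid even after cq_resize swaps in a buffer of a different size.
 */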
70
71 pthread_spinlock_t hermon_bf_lock;
72
73 /*
74 * Function signatures
75 */
76 extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
77 uint_t, uint_t, dapls_tavor_wrid_entry_t *);
78 extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
79 extern DAPL_OS_LOCK g_tavor_uar_lock;
80
81 #ifndef _LP64
82 extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
83 #endif
84
85 static int dapli_hermon_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
86 uint64_t *, uint_t *);
87 static DAT_RETURN dapli_hermon_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
88 uint64_t *, uint_t *);
89 static int dapli_hermon_cq_cqe_consume(ib_cq_handle_t, uint32_t *, ibt_wc_t *);
90 static int dapli_hermon_cq_errcqe_consume(ib_cq_handle_t, uint32_t *,
91 ibt_wc_t *);
92 extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
93 uint32_t, uint_t);
94 extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);
95
96 /*
97  * Note: The 64 bit doorbells need to be written atomically.
98  * In 32 bit libraries we need to use a special assembly routine
99  * because compiler generated code splits the store into 2 word writes.
100 */
101
102 /*
103 * dapli_hermon_cq_doorbell()
104 * Takes the specified cq cmd and cq number and rings the cq doorbell
105 */
106 static void
107 dapli_hermon_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
108 uint32_t cmd_sn, uint32_t cq_param)
109 {
110 uint64_t doorbell;
111
112 /* Build the doorbell from the parameters */
113 doorbell = (cmd_sn | cq_cmd | cqn);
114 doorbell = (doorbell << 32) | cq_param;
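	/*
	 * Illustrative example: with cmd_sn = 0x10000000, cq_cmd =
	 * HERMON_CQDB_NOTIFY_CQ (0x2 << 24), cqn = 0x12 and cq_param =
	 * 0x345, the word built above is 0x12000012 and the 64-bit
	 * doorbell written to the UAR is 0x1200001200000345.
	 */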
115
116 /* Write the doorbell to UAR */
117 #ifdef _LP64
118 ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
119 /* 32 bit version */
120 #elif defined(i386)
121 dapl_os_lock(&g_tavor_uar_lock);
122 /*
123 * For 32 bit intel we assign the doorbell in the order
124 * prescribed by the Tavor PRM, lower to upper addresses
125 */
126 ((tavor_hw_uar32_t *)ia_uar)->cq[0] =
127 (uint32_t)HTOBE_32(doorbell >> 32);
128 ((tavor_hw_uar32_t *)ia_uar)->cq[1] =
129 (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
130 dapl_os_unlock(&g_tavor_uar_lock);
131 #else
132 dapls_atomic_assign_64(HTOBE_64(doorbell),
133 &((tavor_hw_uar_t *)ia_uar)->cq);
134 #endif
135 }
136
137 /*
138  * dapli_hermon_sq_dbreg()
139 * Takes the specified qp number and rings the send doorbell.
140 */
141 static void
142 dapli_hermon_sq_dbreg(dapls_hw_uar_t ia_uar, uint32_t qpn)
143 {
144 uint64_t doorbell;
145
146 doorbell = qpn << 8;
147
148 /* Write the doorbell to UAR */
149 #ifdef _LP64
150 ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
151 #else
152 #if defined(i386)
153 dapl_os_lock(&g_tavor_uar_lock);
154 /*
155 * For 32 bit intel we assign the doorbell in the order
156 * prescribed by the Tavor PRM, lower to upper addresses
157 */
158 ((tavor_hw_uar32_t *)ia_uar)->send[0] =
159 (uint32_t)HTOBE_32(doorbell >> 32);
160 ((tavor_hw_uar32_t *)ia_uar)->send[1] =
161 (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
162 dapl_os_unlock(&g_tavor_uar_lock);
163 #else
164 dapls_atomic_assign_64(HTOBE_64(doorbell),
165 &((tavor_hw_uar_t *)ia_uar)->send);
166 #endif
167 #endif
168 }
169
170 /*
171 * dapli_hermon_wqe_send_build()
172 * Constructs a WQE for a given ibt_send_wr_t
173 */
174 static int
175 dapli_hermon_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
176 uint64_t *addr, uint_t *size)
177 {
178 tavor_hw_snd_wqe_remaddr_t *rc;
179 tavor_hw_snd_wqe_bind_t *bn;
180 tavor_hw_wqe_sgl_t *ds;
181 ibt_wr_ds_t *sgl;
182 uint8_t *src, *dst, *maxdst;
183 uint32_t nds;
184 int len, thislen, maxlen;
185 uint32_t new_rkey;
186 uint32_t old_rkey;
187 int i, num_ds;
188 int max_inline_bytes = -1;
189 uint64_t ctrl;
190 uint64_t nopcode;
191 uint_t my_size;
192
193 nds = wr->wr_nds;
194 sgl = wr->wr_sgl;
195 num_ds = 0;
196 ctrl = ((wr->wr_flags & IBT_WR_SEND_SIGNAL) ?
197 HERMON_WQE_SEND_SIGNALED_MASK : 0) |
198 ((wr->wr_flags & IBT_WR_SEND_SOLICIT) ?
199 HERMON_WQE_SEND_SOLICIT_MASK : 0);
200
201 /*
202 * RC is the only supported transport in UDAPL
203  * For RC requests, we allow "Send", "RDMA Read", "RDMA Write" and "Bind"
204 */
205 switch (wr->wr_opcode) {
206 case IBT_WRC_SEND:
207 /*
208 * If this is a Send request, then all we need is
209 * the Data Segment processing below.
210 * Initialize the information for the Data Segments
211 */
212 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
213 sizeof (tavor_hw_snd_wqe_nextctrl_t));
214 if (qp->qp_sq_inline != 0)
215 max_inline_bytes =
216 qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
217 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
218 break;
219 case IBT_WRC_RDMAW:
220 if (qp->qp_sq_inline != 0)
221 max_inline_bytes =
222 qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
223 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
224 /* FALLTHROUGH */
225 case IBT_WRC_RDMAR:
226 if (wr->wr_opcode == IBT_WRC_RDMAR) {
227 if (qp->qp_sq_inline < 0)
228 qp->qp_sq_inline = 0;
229 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
230 }
231 /*
232 * If this is an RDMA Read or RDMA Write request, then fill
233 * in the "Remote Address" header fields.
234 */
235 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
236 sizeof (tavor_hw_snd_wqe_nextctrl_t));
237
238 /*
239 * Build the Remote Address Segment for the WQE, using
240 * the information from the RC work request.
241 */
242 TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);
243
244 /* Update "ds" for filling in Data Segments (below) */
245 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
246 sizeof (tavor_hw_snd_wqe_remaddr_t));
247 break;
248 case IBT_WRC_BIND:
249 /*
250  * Generate a new R_key
251  * Increment the lower "unconstrained" key byte and keep the
252  * upper "constrained" bits the same, since they represent
253  * the MPT index.
254 */
255 #if 0
256 /* XXX - need equiv of "hermon_wr_bind_check(state, wr);" */
257 /* XXX - uses hermon_mr_keycalc - what about Sinai vs. Arbel??? */
258 #endif
259 old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
260 new_rkey = old_rkey >> 8; /* index */
261 old_rkey = (old_rkey + 1) & 0xff; /* incremented key */
262 new_rkey = (new_rkey << 8) | old_rkey;
263
264 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
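		/*
		 * Illustrative example: an incoming bind_rkey of 0x00012345
		 * keeps MPT index 0x000123 and bumps the key byte 0x45 to
		 * 0x46, so bind_rkey_out becomes 0x00012346.
		 */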
265
266 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
267 sizeof (tavor_hw_snd_wqe_nextctrl_t));
268
269 /*
270 * Build the Bind Memory Window Segments for the WQE,
271 * using the information from the RC Bind memory
272 * window work request.
273 */
274 TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);
275
276 /*
277 * Update the "ds" pointer. Even though the "bind"
278 * operation requires no SGLs, this is necessary to
279 * facilitate the correct descriptor size calculations
280 * (below).
281 */
282 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
283 sizeof (tavor_hw_snd_wqe_bind_t));
284 nds = 0;
285 nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
286 break;
287 default:
288 dapl_dbg_log(DAPL_DBG_TYPE_ERR,
289 "dapli_hermon_wqe_send_build: invalid wr_opcode=%d\n",
290 wr->wr_opcode);
291 return (DAT_INTERNAL_ERROR);
292 }
293
294 /*
295 * Now fill in the Data Segments (SGL) for the Send WQE based on
296  * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
297  * Start by checking for a valid number of SGL entries.
298 */
299 if (nds > qp->qp_sq_sgl) {
300 return (DAT_INVALID_PARAMETER);
301 }
302
303 /*
304 * For each SGL in the Send Work Request, fill in the Send WQE's data
305 * segments. Note: We skip any SGL with zero size because Tavor
306 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
307 * the encoding for zero means a 2GB transfer. Because of this special
308 * encoding in the hardware, we mask the requested length with
309 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
310 * zero.)
311 */
312 if (max_inline_bytes != -1) { /* compute total_len */
313 len = 0;
314 for (i = 0; i < nds; i++)
315 len += sgl[i].ds_len;
316 if (len == 0)
317 max_inline_bytes = -1; /* do not inline */
318 else {
319 /* need to reduce the length by dword "len" fields */
320 max_inline_bytes -= (len / 64) * sizeof (uint32_t);
321 if (len > max_inline_bytes)
322 max_inline_bytes = -1; /* too big for inline */
323 }
324 }
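	/*
	 * Illustrative example of the arithmetic above: with len == 200,
	 * (len / 64) == 3, so 12 bytes are subtracted from the inline
	 * budget to account for the per-64-byte header dwords.
	 */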
325 if (max_inline_bytes != -1) { /* do "inline" */
326
327 dst = (uint8_t *)((uint32_t *)ds + 1);
328 maxdst = (uint8_t *)(((uintptr_t)dst + 64) & ~(64 - 1));
329 maxlen = maxdst - dst;
330 thislen = 0;
331 i = 0;
332 src = (uint8_t *)(uintptr_t)sgl[i].ds_va;
333 len = sgl[i].ds_len;
334 do {
335 /* if this sgl overflows the inline segment */
336 if (len > maxlen) {
337 if (maxlen) /* might be 0 */
338 (void) dapl_os_memcpy(dst,
339 src, maxlen);
340 membar_producer();
341 *(uint32_t *)ds =
342 HTOBE_32((thislen + maxlen) |
343 TAVOR_WQE_SGL_INLINE_MASK);
344 thislen = 0;
345 len -= maxlen;
346 src += maxlen;
347 dst = maxdst + sizeof (uint32_t);
348 ds = (tavor_hw_wqe_sgl_t *)(void *)maxdst;
349 maxdst += 64;
350 maxlen = 64 - sizeof (uint32_t);
351 } else { /* this sgl fully fits */
352 (void) dapl_os_memcpy(dst,
353 src, len);
354 maxlen -= len; /* room left */
355 thislen += len;
356 dst += len;
357 while (++i < nds)
358 if (sgl[i].ds_len)
359 break;
360 if (i >= nds)
361 break;
362 src = (uint8_t *)(uintptr_t)sgl[i].ds_va;
363 len = sgl[i].ds_len;
364 }
365 } while (i < nds);
366 membar_producer();
367 *(uint32_t *)ds = HTOBE_32(thislen |
368 TAVOR_WQE_SGL_INLINE_MASK);
369
370 /* Return the size of descriptor (in 16-byte chunks) */
371 my_size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
372 if (my_size <= (256 >> 4))
373 *size = my_size; /* use Hermon Blueflame */
374 else
375 *size = 0;
376 } else {
377 for (i = 0; i < nds; i++) {
378 if (sgl[i].ds_len == 0) {
379 continue;
380 }
381
382 /*
383 * Fill in the Data Segment(s) for the current WQE,
384 * using the information contained in the
385 * scatter-gather list of the work request.
386 */
387 HERMON_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
388 num_ds++;
389 }
390
391 /* Return the size of descriptor (in 16-byte chunks) */
392 my_size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
393 *size = 0; /* do not use Hermon Blueflame */
394 }
395 HERMON_WQE_SETCTRL(addr, ctrl);
396 membar_producer();
397 HERMON_WQE_SETNEXT(addr, nopcode << 32, my_size,
398 (wr->wr_flags & IBT_WR_SEND_FENCE) ?
399 HERMON_WQE_SEND_FENCE_MASK : 0);
400
401 return (DAT_SUCCESS);
402 }
403
404 /*
405 * dapli_hermon_wqe_recv_build()
406 * Builds the recv WQE for a given ibt_recv_wr_t
407 */
408 static DAT_RETURN
409 dapli_hermon_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
410 uint64_t *addr, uint_t *size)
411 {
412 tavor_hw_wqe_sgl_t *ds;
413 int i;
414 int num_ds;
415
416 /* Fill in the Data Segments (SGL) for the Recv WQE */
417 ds = (tavor_hw_wqe_sgl_t *)addr;
418 num_ds = 0;
419
420 /* Check for valid number of SGL entries */
421 if (wr->wr_nds > qp->qp_rq_sgl) {
422 return (DAT_INVALID_PARAMETER);
423 }
424
425 /*
426 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
427 * segments. Note: We skip any SGL with zero size because Tavor
428 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
429 * the encoding for zero means a 2GB transfer. Because of this special
430 * encoding in the hardware, we mask the requested length with
431 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
432 * zero.)
433 */
434 for (i = 0; i < wr->wr_nds; i++) {
435 if (wr->wr_sgl[i].ds_len == 0) {
436 continue;
437 }
438
439 /*
440 * Fill in the Data Segment(s) for the receive WQE, using the
441 * information contained in the scatter-gather list of the
442 * work request.
443 */
444 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
445 num_ds++;
446 }
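	/*
	 * If fewer data segments were used than the receive queue supports,
	 * terminate the descriptor with a special invalid-lkey entry so the
	 * hardware stops at the last valid segment (same scheme as the SRQ
	 * case below).
	 */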
447 if (i < qp->qp_rq_sgl) {
448 ibt_wr_ds_t sgl;
449 sgl.ds_va = (ib_vaddr_t)0;
450 sgl.ds_len = (ib_msglen_t)0;
451 sgl.ds_key = (ibt_lkey_t)HERMON_WQE_SGL_INVALID_LKEY;
452 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl);
453 }
454
455 /* Return the size of descriptor (in 16-byte chunks) */
456 *size = qp->qp_rq_wqesz >> 4;
457
458 return (DAT_SUCCESS);
459 }
460
461 /*
462 * dapli_hermon_wqe_srq_build()
463 * Builds the recv WQE for a given ibt_recv_wr_t
464 */
465 static DAT_RETURN
466 dapli_hermon_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
467 uint64_t *addr)
468 {
469 tavor_hw_wqe_sgl_t *ds;
470 ibt_wr_ds_t end_sgl;
471 int i;
472 int num_ds;
473
474 /* Fill in the Data Segments (SGL) for the Recv WQE */
475 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
476 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
477 num_ds = 0;
478
479 /* Check for valid number of SGL entries */
480 if (wr->wr_nds > srq->srq_wq_sgl) {
481 return (DAT_INVALID_PARAMETER);
482 }
483
484 /*
485 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
486 * segments. Note: We skip any SGL with zero size because Tavor
487 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
488 * the encoding for zero means a 2GB transfer. Because of this special
489 * encoding in the hardware, we mask the requested length with
490 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
491 * zero.)
492 */
493 for (i = 0; i < wr->wr_nds; i++) {
494 if (wr->wr_sgl[i].ds_len == 0) {
495 continue;
496 }
497
498 /*
499 * Fill in the Data Segment(s) for the receive WQE, using the
500 * information contained in the scatter-gather list of the
501 * work request.
502 */
503 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
504 num_ds++;
505 }
506
507 /*
508 * For SRQ, if the number of data segments is less than the maximum
509 * specified at alloc, then we have to fill in a special "key" entry in
510 * the sgl entry after the last valid one in this post request. We do
511 * that here.
512 */
513 if (num_ds < srq->srq_wq_sgl) {
514 end_sgl.ds_va = (ib_vaddr_t)0;
515 end_sgl.ds_len = (ib_msglen_t)0;
516 end_sgl.ds_key = (ibt_lkey_t)HERMON_WQE_SGL_INVALID_LKEY;
517 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
518 }
519
520 return (DAT_SUCCESS);
521 }
522
523 /*
524 * dapli_hermon_cq_peek()
525 * Peeks into a given CQ to check if there are any events that can be
526 * polled. It returns the number of CQEs that can be polled.
527 */
528 static void
529 dapli_hermon_cq_peek(ib_cq_handle_t cq, int *num_cqe)
530 {
531 uint32_t *cqe;
532 uint32_t imm_eth_pkey_cred;
533 uint32_t cons_indx;
534 int polled_cnt;
535 uint_t doorbell_cnt;
536 uint_t opcode;
537
538 /* Get the consumer index */
539 cons_indx = cq->cq_consindx & cq_wrap_around_mask;
540
541 /* Calculate the pointer to the first CQ entry */
542 cqe = (uint32_t *)&cq->cq_addr[cons_indx];
543
544 /*
545 * Count entries in the CQ until we find an entry owned by
546 * the hardware.
547 */
548 polled_cnt = 0;
549 while (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
550 opcode = HERMON_CQE_OPCODE_GET(cqe);
551 		/* Error CQEs map to multiple work completions */
552 if (opcode == HERMON_CQE_ERR_OPCODE) {
553 imm_eth_pkey_cred =
554 TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
555 doorbell_cnt =
556 imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
557 polled_cnt += (doorbell_cnt + 1);
558 } else {
559 polled_cnt++;
560 }
561 /* Increment the consumer index */
562 cons_indx = (cons_indx + 1) & cq_wrap_around_mask;
563
564 /* Update the pointer to the next CQ entry */
565 cqe = (uint32_t *)&cq->cq_addr[cons_indx];
566 }
567
568 *num_cqe = polled_cnt;
569 }
570
571 #define dapli_hermon_cq_update_ci(cq, dbp) \
572 (dbp)[0] = HTOBE_32(cq->cq_consindx & 0xFFFFFF)
573
574 /*
575 * dapli_hermon_cq_resize_helper()
576 * This routine switches from the pre-cq_resize buffer to the new buffer.
577 */
578 static int
579 dapli_hermon_cq_resize_helper(ib_cq_handle_t cq)
580 {
581 int i;
582
583 if ((cq->cq_resize_addr == 0) ||
584 (munmap((char *)cq->cq_addr, cq->cq_map_len) < 0)) {
585 dapl_dbg_log(DAPL_DBG_TYPE_ERR, "cq_resize_helper: "
586 "munmap(%p:0x%llx) failed(%d)\n", cq->cq_addr,
587 cq->cq_map_len, errno);
588 return (1); /* FAILED */
589 }
590 cq->cq_addr = cq->cq_resize_addr;
591 cq->cq_map_offset = cq->cq_resize_map_offset;
592 cq->cq_map_len = cq->cq_resize_map_len;
593 cq->cq_size = cq->cq_resize_size;
594 cq->cq_cqesz = cq->cq_resize_cqesz;
595 cq->cq_resize_addr = 0;
596 cq->cq_resize_map_offset = 0;
597 cq->cq_resize_map_len = 0;
598 cq->cq_resize_size = 0;
599 cq->cq_resize_cqesz = 0;
600 for (i = 0; (1 << i) < cq->cq_size; i++)
601 ;
602 cq->cq_log_cqsz = i;
603
604 cq->cq_consindx++; /* consume the RESIZE cqe */
605
606 return (0); /* SUCCESS */
607 }
608
609 /*
610 * dapli_hermon_cq_poll()
611 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
612 * array that is passed in.
613 */
614 static DAT_RETURN
615 dapli_hermon_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
616 uint_t *num_polled)
617 {
618 uint32_t *cqe;
619 uint32_t cons_indx;
620 uint32_t polled_cnt;
621 DAT_RETURN dat_status;
622 int status;
623
624 /* Get the consumer index */
625 cons_indx = cq->cq_consindx & cq_wrap_around_mask;
626
627 /* Calculate the pointer to the first CQ entry */
628 cqe = (uint32_t *)&cq->cq_addr[cons_indx];
629
630 /*
631 * Keep pulling entries from the CQ until we find an entry owned by
632  * the hardware. As long as there are CQEs owned by SW, process
633 * each entry by calling dapli_hermon_cq_cqe_consume() and updating the
634 * CQ consumer index. Note: We only update the consumer index if
635 * dapli_hermon_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
636 * Otherwise, it indicates that we are going to "recycle" the CQE
637  * (probably because it is an error CQE and corresponds to more than one
638 * completion).
639 */
640 polled_cnt = 0;
641 while (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
642 if (HERMON_CQE_OPCODE_GET(cqe) == HERMON_CQE_RESIZE_OPCODE) {
643 if (dapli_hermon_cq_resize_helper(cq))
644 return (DAT_ERROR(DAT_INTERNAL_ERROR, 0));
645 cons_indx = cq->cq_consindx & cq_wrap_around_mask;
646 cqe = (uint32_t *)&cq->cq_addr[cons_indx];
647 continue;
648 }
649 status = dapli_hermon_cq_cqe_consume(cq, cqe,
650 &wc_p[polled_cnt++]);
651 if (status == TAVOR_CQ_SYNC_AND_DB) {
652 /* Reset to hardware ownership is implicit in Hermon */
653 cq->cq_consindx++; /* incr the total counter */
654
655 /* Increment the consumer index */
656 cons_indx = (cons_indx + 1) & cq_wrap_around_mask;
657
658 /* Update the pointer to the next CQ entry */
659 cqe = (uint32_t *)&cq->cq_addr[cons_indx];
660 }
661
662 /*
663 * If we have run out of space to store work completions,
664 		 * then stop and return the ones we have pulled off the CQ.
665 */
666 if (polled_cnt >= num_wc) {
667 break;
668 }
669 }
670
671 dat_status = DAT_SUCCESS;
672 /*
673 * Now we only ring the doorbell (to update the consumer index) if
674 * we've actually consumed a CQ entry. If we have, for example,
675 * pulled from a CQE that we are still in the process of "recycling"
676 * for error purposes, then we would not update the consumer index.
677 */
678 if (polled_cnt != 0) {
679 /*
680 * Update the consumer index in both the CQ handle and the
681 * doorbell record.
682 */
683 dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
684 } else if (polled_cnt == 0) {
685 /*
686 * If the CQ is empty, we can try to free up some of the WRID
687 * list containers.
688 */
689 if (cq->cq_wrid_reap_head) /* look before leaping */
690 dapls_tavor_wrid_cq_reap(cq);
691 dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
692 }
693
694 if (num_polled != NULL) {
695 *num_polled = polled_cnt;
696 }
697
698 return (dat_status);
699 }
700
701 /*
702 * dapli_hermon_cq_poll_one()
703  * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
704 * that is passed in. See above for more comments/details.
705 */
706 static DAT_RETURN
707 dapli_hermon_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
708 {
709 uint32_t *cqe;
710 uint32_t cons_indx;
711 DAT_RETURN dat_status;
712 int status;
713
714 start_over:
715 /* Get the consumer index */
716 cons_indx = cq->cq_consindx & cq_wrap_around_mask;
717
718 /* Calculate the pointer to the first CQ entry */
719 cqe = (uint32_t *)&cq->cq_addr[cons_indx];
720
721 /*
722 * Keep pulling entries from the CQ until we find an entry owned by
723  * the hardware. As long as there are CQEs owned by SW, process
724 * each entry by calling dapli_hermon_cq_cqe_consume() and updating the
725 * CQ consumer index. Note: We only update the consumer index if
726 * dapli_hermon_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
727 * Otherwise, it indicates that we are going to "recycle" the CQE
728  * (probably because it is an error CQE and corresponds to more than one
729 * completion).
730 */
731 if (HERMON_CQE_OWNER_IS_SW(cq, cqe)) {
732 if (HERMON_CQE_OPCODE_GET(cqe) == HERMON_CQE_RESIZE_OPCODE) {
733 if (dapli_hermon_cq_resize_helper(cq))
734 return (DAT_ERROR(DAT_INTERNAL_ERROR, 0));
735 goto start_over;
736 }
737 status = dapli_hermon_cq_cqe_consume(cq, cqe, wc_p);
738 if (status == TAVOR_CQ_SYNC_AND_DB) {
739 /* Reset to hardware ownership is implicit in Hermon */
740
741 /* Increment the consumer index */
742 cq->cq_consindx++;
743 dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
744 }
745 dat_status = DAT_SUCCESS;
746 } else {
747 if (cq->cq_wrid_reap_head) /* look before leaping */
748 dapls_tavor_wrid_cq_reap(cq);
749 dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
750 }
751 return (dat_status);
752 }
753
754 /*
755 * dapli_hermon_cq_cqe_consume()
756  * Converts a given CQE into an ibt_wc_t object
757 */
758 static int
759 dapli_hermon_cq_cqe_consume(ib_cq_handle_t cqhdl, uint32_t *cqe,
760 ibt_wc_t *wc)
761 {
762 uint_t flags;
763 uint_t type;
764 uint_t opcode;
765 int status;
766
767 /*
768 * Determine if this is an "error" CQE by examining "opcode". If it
769 * is an error CQE, then call dapli_hermon_cq_errcqe_consume() and
770 * return whatever status it returns. Otherwise, this is a successful
771 * completion.
772 */
773 opcode = HERMON_CQE_OPCODE_GET(cqe);
774 if (opcode == HERMON_CQE_ERR_OPCODE) {
775 status = dapli_hermon_cq_errcqe_consume(cqhdl, cqe, wc);
776 return (status);
777 }
778 TAVOR_CQE_WQEADDRSZ_SET(cqe, (HTOBE_32(cqe[6]) >> 10) &
779 ~HERMON_WQE_NDS_MASK);
780
781 /*
782 * Fetch the Work Request ID using the information in the CQE.
783 * See tavor_wr.c for more details.
784 */
785 wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, (tavor_hw_cqe_t *)cqe,
786 HERMON_CQE_SENDRECV_GET(cqe) >> 6, 0, NULL);
787 wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
788
789 /*
790 * Parse the CQE opcode to determine completion type. This will set
791 * not only the type of the completion, but also any flags that might
792 * be associated with it (e.g. whether immediate data is present).
793 */
794 flags = IBT_WC_NO_FLAGS;
795 if (HERMON_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {
796
797 /*
798 * Send CQE
799 *
800 * The following opcodes will not be generated in uDAPL
801 * case TAVOR_CQE_SND_RDMAWR_IMM:
802 * case TAVOR_CQE_SND_SEND_IMM:
803 * case TAVOR_CQE_SND_ATOMIC_CS:
804 * case TAVOR_CQE_SND_ATOMIC_FA:
805 */
806 switch (opcode) {
807 case TAVOR_CQE_SND_RDMAWR:
808 type = IBT_WRC_RDMAW;
809 break;
810
811 case TAVOR_CQE_SND_SEND:
812 type = IBT_WRC_SEND;
813 break;
814
815 case TAVOR_CQE_SND_RDMARD:
816 type = IBT_WRC_RDMAR;
817 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
818 break;
819
820 case TAVOR_CQE_SND_BIND_MW:
821 type = IBT_WRC_BIND;
822 break;
823
824 default:
825 wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
826 return (TAVOR_CQ_SYNC_AND_DB);
827 }
828 } else {
829
830 /*
831 * Receive CQE
832 *
833 * The following opcodes will not be generated in uDAPL
834 *
835 * case TAVOR_CQE_RCV_RECV_IMM:
836 * case TAVOR_CQE_RCV_RECV_IMM2:
837 * case TAVOR_CQE_RCV_RDMAWR_IMM:
838 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
839 */
840 switch (opcode) {
841 case HERMON_CQE_RCV_SEND:
842 type = IBT_WRC_RECV;
843 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
844 break;
845 default:
846 wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
847 return (TAVOR_CQ_SYNC_AND_DB);
848 }
849 }
850 wc->wc_type = type;
851 wc->wc_flags = flags;
852 /* If we got here, completion status must be success */
853 wc->wc_status = IBT_WC_SUCCESS;
854
855 return (TAVOR_CQ_SYNC_AND_DB);
856 }
857
858 /*
859 * dapli_hermon_cq_errcqe_consume()
860 */
861 static int
862 dapli_hermon_cq_errcqe_consume(ib_cq_handle_t cqhdl, uint32_t *cqe,
863 ibt_wc_t *wc)
864 {
865 dapls_tavor_wrid_entry_t wre;
866 uint_t status;
867 uint_t send_or_recv;
868
869 dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
870 TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
871 TAVOR_CQE_WQEADDRSZ_GET(cqe));
872
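	/* The completion error status (syndrome) is read from byte 0x1B of the CQE */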
873 status = ((uint8_t *)cqe)[0x1B];
874 TAVOR_CQE_WQEADDRSZ_SET(cqe, (HTOBE_32(cqe[6]) >> 10) &
875 ~HERMON_WQE_NDS_MASK);
876 if (HERMON_CQE_SENDRECV_GET(cqe) == 0) {
877 send_or_recv = 0;
878 } else {
879 send_or_recv = 1;
880 }
881
882 /*
883 * Fetch the Work Request ID using the information in the CQE.
884 * See tavor_wr.c for more details.
885 */
886 wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, (tavor_hw_cqe_t *)cqe,
887 send_or_recv, 1, &wre);
888 wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
889
890 /*
891 * Parse the CQE opcode to determine completion type. We know that
892 * the CQE is an error completion, so we extract only the completion
893 * status here.
894 */
895 switch (status) {
896 case TAVOR_CQE_LOC_LEN_ERR:
897 status = IBT_WC_LOCAL_LEN_ERR;
898 break;
899
900 case TAVOR_CQE_LOC_OP_ERR:
901 status = IBT_WC_LOCAL_CHAN_OP_ERR;
902 break;
903
904 case TAVOR_CQE_LOC_PROT_ERR:
905 status = IBT_WC_LOCAL_PROTECT_ERR;
906 break;
907
908 case TAVOR_CQE_WR_FLUSHED_ERR:
909 status = IBT_WC_WR_FLUSHED_ERR;
910 break;
911
912 case TAVOR_CQE_MW_BIND_ERR:
913 status = IBT_WC_MEM_WIN_BIND_ERR;
914 break;
915
916 case TAVOR_CQE_BAD_RESPONSE_ERR:
917 status = IBT_WC_BAD_RESPONSE_ERR;
918 break;
919
920 case TAVOR_CQE_LOCAL_ACCESS_ERR:
921 status = IBT_WC_LOCAL_ACCESS_ERR;
922 break;
923
924 case TAVOR_CQE_REM_INV_REQ_ERR:
925 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
926 break;
927
928 case TAVOR_CQE_REM_ACC_ERR:
929 status = IBT_WC_REMOTE_ACCESS_ERR;
930 break;
931
932 case TAVOR_CQE_REM_OP_ERR:
933 status = IBT_WC_REMOTE_OP_ERR;
934 break;
935
936 case TAVOR_CQE_TRANS_TO_ERR:
937 status = IBT_WC_TRANS_TIMEOUT_ERR;
938 break;
939
940 case TAVOR_CQE_RNRNAK_TO_ERR:
941 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
942 break;
943
944 /*
945 * The following error codes are not supported in the Tavor driver
946 * as they relate only to Reliable Datagram completion statuses:
947 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
948 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
949 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
950 * case TAVOR_CQE_INV_EEC_NUM_ERR:
951 * case TAVOR_CQE_INV_EEC_STATE_ERR:
952 * case TAVOR_CQE_LOC_EEC_ERR:
953 */
954
955 default:
956 status = IBT_WC_LOCAL_CHAN_OP_ERR;
957 break;
958 }
959 wc->wc_status = status;
960 wc->wc_type = 0;
961
962 /*
963 * Consume the CQE
964 * Return status to indicate that doorbell and sync may be
965 * necessary.
966 */
967 return (TAVOR_CQ_SYNC_AND_DB);
968 }
969
970 /*
971 * dapli_hermon_cq_notify()
972 * This function is used for arming the CQ by ringing the CQ doorbell.
973 *
974 * Note: there is something very subtle here. This code assumes a very
975 * specific behavior of the kernel driver. The cmd_sn field of the
976 * arm_dbr is updated by the kernel driver whenever a notification
977 * event for the cq is received. This code extracts the cmd_sn field
978 * from the arm_dbr to know the right value to use. The arm_dbr is
979 * always updated atomically so that neither the kernel driver nor this
980 * will get confused about what the other is doing.
981 *
982 * Note: param is not used here. It is necessary for arming a CQ for
983 * N completions (param is N), but no uDAPL API supports this for now.
984 * Thus, we declare ARGSUSED to make lint happy.
985 */
986 /*ARGSUSED*/
987 static DAT_RETURN
988 dapli_hermon_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
989 {
990 uint32_t cqnum;
991 uint32_t *target;
992 uint32_t old_cmd, cmp, new, tmp, cmd_sn;
993
994 /*
995 * Determine if we are trying to get the next completion or the next
996 * "solicited" completion. Then hit the appropriate doorbell.
997 */
998 cqnum = cq->cq_num;
999 target = cq->cq_arm_dbp;
1000 retry:
1001 cmp = *target;
1002 tmp = HTOBE_32(cmp);
1003 old_cmd = tmp & (0x7 << 24);
1004 cmd_sn = tmp & (0x3 << 28);
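	/*
	 * Illustrative example: an arm_dbp word of 0x12000345 decodes as
	 * cmd_sn = 0x10000000 (bits 28-29), old_cmd = HERMON_CQDB_NOTIFY_CQ
	 * (bits 24-26) and a last-armed consumer index of 0x345.
	 */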
1005
1006 if (flags == IB_NOTIFY_ON_NEXT_COMP) {
1007 if (old_cmd != HERMON_CQDB_NOTIFY_CQ) {
1008 new = HTOBE_32(cmd_sn | HERMON_CQDB_NOTIFY_CQ |
1009 (cq->cq_consindx & 0xFFFFFF));
1010 tmp = atomic_cas_32(target, cmp, new);
1011 if (tmp != cmp)
1012 goto retry;
1013 dapli_hermon_cq_doorbell(cq->cq_iauar,
1014 HERMON_CQDB_NOTIFY_CQ, cqnum,
1015 cmd_sn, cq->cq_consindx);
1016 } /* else it's already armed */
1017 } else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
1018 if (old_cmd != HERMON_CQDB_NOTIFY_CQ &&
1019 old_cmd != HERMON_CQDB_NOTIFY_CQ_SOLICIT) {
1020 new = HTOBE_32(cmd_sn | HERMON_CQDB_NOTIFY_CQ_SOLICIT |
1021 (cq->cq_consindx & 0xFFFFFF));
1022 tmp = atomic_cas_32(target, cmp, new);
1023 if (tmp != cmp)
1024 goto retry;
1025 dapli_hermon_cq_doorbell(cq->cq_iauar,
1026 HERMON_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
1027 cmd_sn, cq->cq_consindx);
1028 } /* else it's already armed */
1029 } else {
1030 return (DAT_INVALID_PARAMETER);
1031 }
1032
1033 return (DAT_SUCCESS);
1034 }
1035
1036 /*
1037  * Since uDAPL posts 1 wqe per request, we only need to do the
1038  * headroom invalidation stores for the last one.
1039 */
1040 static void
1041 dapli_hermon_wqe_headroom(ib_qp_handle_t qp, uint32_t start)
1042 {
1043 uint32_t *wqe_start, *wqe_top, *wqe_base, qsize, invalue;
1044 int hdrmwqes, wqesizebytes, sectperwqe, i, j;
1045
1046 qsize = qp->qp_sq_numwqe;
1047 wqesizebytes = qp->qp_sq_wqesz;
1048 sectperwqe = wqesizebytes >> 6;
1049 hdrmwqes = qp->qp_sq_headroom;
1050 wqe_base = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, 0);
1051 wqe_top = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, qsize);
1052 wqe_start = (uint32_t *)TAVOR_QP_SQ_ENTRY(qp, start);
1053
1054 for (i = 0; i < hdrmwqes - 1; i++) {
1055 wqe_start += sectperwqe * 16;
1056 if (wqe_start == wqe_top)
1057 wqe_start = wqe_base;
1058 }
1059 invalue = HTOBE_32(*wqe_start);
1060 invalue |= 0x7FFFFFFF;
1061 *wqe_start = HTOBE_32(invalue);
1062 wqe_start += 16;
1063 for (j = 1; j < sectperwqe; j++) {
1064 *wqe_start = 0xFFFFFFFF;
1065 wqe_start += 16;
1066 }
1067 }
1068
1069 /*
1070 * dapli_hermon_post_send()
1071 */
1072 /* ARGSUSED */
1073 static DAT_RETURN
1074 dapli_hermon_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
1075 {
1076 dapls_tavor_wrid_list_hdr_t *wridlist;
1077 dapls_tavor_wrid_entry_t *wre_last;
1078 uint64_t *desc;
1079 uint64_t *wqe_addr;
1080 uint32_t desc_sz;
1081 	uint32_t wqeaddrsz, signaled_dbd = 0;
1082 uint32_t head, tail, next_tail, qsize_msk;
1083 int status;
1084 ib_qp_handle_t qp;
1085
1086 if ((ep->qp_state == IBT_STATE_RESET) ||
1087 (ep->qp_state == IBT_STATE_INIT) ||
1088 (ep->qp_state == IBT_STATE_RTR)) {
1089 dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1090 "post_send: invalid qp_state %d\n", ep->qp_state);
1091 return (DAT_INVALID_STATE);
1092 }
1093
1094 qp = ep->qp_handle;
1095
1096 /* Grab the lock for the WRID list */
1097 dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1098 wridlist = qp->qp_sq_wqhdr->wq_wrid_post;
1099
1100 /* Save away some initial QP state */
1101 qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
1102 tail = qp->qp_sq_wqhdr->wq_tail;
1103 head = qp->qp_sq_wqhdr->wq_head;
1104
1105 /*
1106 * Check for "queue full" condition. If the queue is already full,
1107 * then no more WQEs can be posted, return an error
1108 */
1109 if (qp->qp_sq_wqhdr->wq_full != 0) {
1110 dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1111 return (DAT_INSUFFICIENT_RESOURCES);
1112 }
1113
1114 /*
1115 * Increment the "tail index" and check for "queue full" condition.
1116 * If we detect that the current work request is going to fill the
1117 * work queue, then we mark this condition and continue.
1118 */
1119 next_tail = (tail + 1) & qsize_msk;
1120 if (next_tail == head) {
1121 qp->qp_sq_wqhdr->wq_full = 1;
1122 }
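	/*
	 * Example of the wrap arithmetic above: with wq_size == 8
	 * (qsize_msk == 7), tail == 7 and head == 0, next_tail becomes
	 * (7 + 1) & 7 == 0 == head, so the queue is marked full.
	 */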
1123
1124 /*
1125 * Get the user virtual address of the location where the next
1126 * Send WQE should be built
1127 */
1128 wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);
1129
1130 /*
1131 	 * Call dapli_hermon_wqe_send_build() to build the WQE at the given
1132 	 * address. This routine uses the information in the ibt_send_wr_t
1133 	 * and returns the size of the WQE when it returns.
1134 */
1135 status = dapli_hermon_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
1136 if (status != DAT_SUCCESS) {
1137 dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1138 return (status);
1139 }
1140
1141 /*
1142 * Get the descriptor (io address) corresponding to the location
1143 	 * where the Send WQE was built.
1144 */
1145 desc = TAVOR_QP_SQ_ENTRY(qp, tail);
1146
1147 /*
1148 * Add a WRID entry to the WRID list. Need to calculate the
1149 * "wqeaddr" to pass to dapli_tavor_wrid_add_entry().
1150 * signaled_dbd is still calculated, but ignored.
1151 */
1152 wqeaddrsz = HERMON_QP_WQEADDRSZ(qp->qp_sq_counter);
1153
1154 if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1155 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
1156 }
1157
1158 dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
1159 signaled_dbd);
1160
1161 dapli_hermon_wqe_headroom(qp, next_tail);
1162 *(uint8_t *)desc ^= 0x80; /* set owner bit */
1163
1164 /*
1165 * Now if the WRID tail entry is non-NULL, then this
1166 * represents the entry to which we are chaining the
1167 * new entries. Since we are going to ring the
1168 	 * doorbell for this WQE, we want to set its "dbd" bit.
1169 *
1170 * On the other hand, if the tail is NULL, even though
1171 * we will have rung the doorbell for the previous WQE
1172 * (for the hardware's sake) it is irrelevant to our
1173 * purposes (for tracking WRIDs) because we know the
1174 * request must have already completed.
1175 */
1176 wre_last = wridlist->wl_wre_old_tail;
1177 if (wre_last != NULL) {
1178 wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1179 }
1180
1181 /* Update some of the state in the QP */
1182 qp->qp_sq_lastwqeaddr = wqe_addr;
1183 qp->qp_sq_wqhdr->wq_tail = next_tail;
1184
1185 if (desc_sz && qp->qp_ia_bf != NULL) { /* use Hermon Blueflame */
1186 uint64_t *bf_dest, *src64;
1187 uint8_t *src8;
1188 int i;
1189
1190 (void) pthread_spin_lock(&hermon_bf_lock);
1191
1192 src8 = (uint8_t *)desc;
1193 src8[1] = (uint8_t)(qp->qp_sq_counter >> 8);
1194 src8[2] = (uint8_t)qp->qp_sq_counter;
1195 src8[4] = (uint8_t)(qp->qp_num >> 16);
1196 src8[5] = (uint8_t)(qp->qp_num >> 8);
1197 src8[6] = (uint8_t)qp->qp_num;
1198
1199 src64 = (uint64_t *)desc;
1200 bf_dest = (uint64_t *)((uintptr_t)qp->qp_ia_bf +
1201 *qp->qp_ia_bf_toggle);
1202 *qp->qp_ia_bf_toggle ^= 256; /* 2 256-byte buffers */
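		/*
		 * desc_sz is in 16-byte units, so desc_sz * 2 is the WQE
		 * length in 8-byte words; each pass of the loop below copies
		 * one 64-byte chunk of the descriptor into the BlueFlame
		 * buffer.
		 */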
1203 for (i = 0; i < desc_sz * 2; i += 8) {
1204 bf_dest[i] = src64[i];
1205 bf_dest[i + 1] = src64[i + 1];
1206 bf_dest[i + 2] = src64[i + 2];
1207 bf_dest[i + 3] = src64[i + 3];
1208 bf_dest[i + 4] = src64[i + 4];
1209 bf_dest[i + 5] = src64[i + 5];
1210 bf_dest[i + 6] = src64[i + 6];
1211 bf_dest[i + 7] = src64[i + 7];
1212 }
1213 (void) pthread_spin_unlock(&hermon_bf_lock);
1214 } else {
1215 /* Ring the doorbell */
1216 dapli_hermon_sq_dbreg(qp->qp_iauar, qp->qp_num);
1217 }
1218 qp->qp_sq_counter++;
1219
1220 dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1221
1222 return (DAT_SUCCESS);
1223 }
1224
1225 /*
1226 * dapli_hermon_post_recv()
1227 */
1228 /* ARGSUSED */
1229 static DAT_RETURN
1230 dapli_hermon_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
1231 {
1232 dapls_tavor_wrid_list_hdr_t *wridlist;
1233 dapls_tavor_wrid_entry_t *wre_last;
1234 ib_qp_handle_t qp;
1235 DAT_RETURN status;
1236 uint64_t *wqe_addr;
1237 uint32_t desc_sz;
1238 uint32_t wqeaddrsz;
1239 uint32_t head, tail, next_tail, qsize_msk;
1240
1241 if (ep->qp_state == IBT_STATE_RESET) {
1242 dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1243 "post_recv: invalid qp_state %d\n", ep->qp_state);
1244 return (DAT_INVALID_STATE);
1245 }
1246 qp = ep->qp_handle;
1247
1248 /* Grab the lock for the WRID list */
1249 dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1250 wridlist = qp->qp_rq_wqhdr->wq_wrid_post;
1251
1252 /* Save away some initial QP state */
1253 qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
1254 tail = qp->qp_rq_wqhdr->wq_tail;
1255 head = qp->qp_rq_wqhdr->wq_head;
1256
1257 /*
1258 * For the ibt_recv_wr_t passed in, parse the request and build a
1259 * Recv WQE. Link the WQE with the previous WQE and ring the
1260 * door bell.
1261 */
1262
1263 /*
1264 * Check for "queue full" condition. If the queue is already full,
1265 * then no more WQEs can be posted. So return an error.
1266 */
1267 if (qp->qp_rq_wqhdr->wq_full != 0) {
1268 dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1269 return (DAT_INSUFFICIENT_RESOURCES);
1270 }
1271
1272 /*
1273 * Increment the "tail index" and check for "queue
1274 * full" condition. If we detect that the current
1275 * work request is going to fill the work queue, then
1276 * we mark this condition and continue.
1277 */
1278 next_tail = (tail + 1) & qsize_msk;
1279 if (next_tail == head) {
1280 qp->qp_rq_wqhdr->wq_full = 1;
1281 }
1282
1283 /* The user virtual address of the WQE to be built */
1284 wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);
1285
1286 /*
1287 	 * Call dapli_hermon_wqe_recv_build() to build the WQE at the given
1288 * address. This routine uses the information in the
1289 * ibt_recv_wr_t and returns the size of the WQE.
1290 */
1291 status = dapli_hermon_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
1292 if (status != DAT_SUCCESS) {
1293 dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1294 return (DAT_INTERNAL_ERROR);
1295 }
1296
1297 /*
1298 * Add a WRID entry to the WRID list. Need to calculate the
1299 * "wqeaddr" and "signaled_dbd" values to pass to
1300 * dapli_tavor_wrid_add_entry().
1301 * Note: all Recv WQEs are essentially "signaled"
1302 */
1303 wqeaddrsz = HERMON_QP_WQEADDRSZ(qp->qp_rq_counter);
1304 dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
1305 (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);
1306
1307 /*
1308 * Now if the WRID tail entry is non-NULL, then this
1309 * represents the entry to which we are chaining the
1310 * new entries. Since we are going to ring the
1311 	 * doorbell for this WQE, we want to set its "dbd" bit.
1312 *
1313 * On the other hand, if the tail is NULL, even though
1314 * we will have rung the doorbell for the previous WQE
1315 * (for the hardware's sake) it is irrelevant to our
1316 * purposes (for tracking WRIDs) because we know the
1317 * request must have already completed.
1318 */
1319 wre_last = wridlist->wl_wre_old_tail;
1320 if (wre_last != NULL) {
1321 wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1322 }
1323
1324 /* Update some of the state in the QP */
1325 qp->qp_rq_lastwqeaddr = wqe_addr;
1326 qp->qp_rq_wqhdr->wq_tail = next_tail;
1327
1328 /* Update the doorbell record */
1329 qp->qp_rq_counter++;
1330 (qp->qp_rq_dbp)[0] = HTOBE_32(qp->qp_rq_counter);
1331
1332 dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1333
1334 return (DAT_SUCCESS);
1335 }
1336
1337 /*
1338 * dapli_hermon_post_srq()
1339 */
1340 /* ARGSUSED */
1341 static DAT_RETURN
1342 dapli_hermon_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
1343 {
1344 ib_srq_handle_t srq;
1345 DAT_RETURN status;
1346 uint32_t desc;
1347 uint64_t *wqe_addr;
1348 uint32_t head, next_head, qsize_msk;
1349 uint32_t wqe_index;
1350
1351
1352 srq = srqp->srq_handle;
1353
1354 /* Grab the lock for the WRID list */
1355 dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);
1356
1357 /*
1358 * For the ibt_recv_wr_t passed in, parse the request and build a
1359 * Recv WQE. Link the WQE with the previous WQE and ring the
1360 * door bell.
1361 */
1362
1363 /*
1364 * Check for "queue full" condition. If the queue is already full,
1365 * ie. there are no free entries, then no more WQEs can be posted.
1366 * So return an error.
1367 */
1368 if (srq->srq_wridlist->wl_freel_entries == 0) {
1369 dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1370 return (DAT_INSUFFICIENT_RESOURCES);
1371 }
1372
1373 /* Save away some initial SRQ state */
1374 qsize_msk = srq->srq_wridlist->wl_size - 1;
1375 head = srq->srq_wridlist->wl_freel_head;
1376
1377 next_head = (head + 1) & qsize_msk;
1378
1379 /* Get the descriptor (IO Address) of the WQE to be built */
1380 desc = srq->srq_wridlist->wl_free_list[head];
1381
1382 wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
1383 srq->srq_wq_wqesz);
1384
1385 /* The user virtual address of the WQE to be built */
1386 wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);
1387
1388 /*
1389 * Call dapli_hermon_wqe_srq_build() to build the WQE at the given
1390 * address. This routine uses the information in the
1391 * ibt_recv_wr_t and returns the size of the WQE.
1392 */
1393 status = dapli_hermon_wqe_srq_build(srq, wr, wqe_addr);
1394 if (status != DAT_SUCCESS) {
1395 dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1396 return (status);
1397 }
1398
1399 /*
1400 * Add a WRID entry to the WRID list.
1401 */
1402 dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);
1403
1404 #if 0
1405 if (srq->srq_wq_lastwqeindex == -1) {
1406 last_wqe_addr = NULL;
1407 } else {
1408 last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
1409 srq->srq_wq_lastwqeindex);
1410 }
1411 /*
1412 * Now link the chain to the old chain (if there was one)
1413 * and update the wqe_counter in the doorbell record.
1414 */
1415 XXX
1416 dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);
1417 #endif
1418
1419 /* Update some of the state in the SRQ */
1420 srq->srq_wq_lastwqeindex = wqe_index;
1421 srq->srq_wridlist->wl_freel_head = next_head;
1422 srq->srq_wridlist->wl_freel_entries--;
1423 dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
1424 srq->srq_wridlist->wl_size);
1425
1426 /* Update the doorbell record */
1427 srq->srq_counter++;
1428 (srq->srq_dbp)[0] = HTOBE_32(srq->srq_counter);
1429
1430 dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1431
1432 return (DAT_SUCCESS);
1433 }
1434
1435 /*
1436 * dapli_hermon_cq_srq_entries_flush()
1437 */
1438 static void
1439 dapli_hermon_cq_srq_entries_flush(ib_qp_handle_t qp)
1440 {
1441 ib_cq_handle_t cq;
1442 dapls_tavor_workq_hdr_t *wqhdr;
1443 tavor_hw_cqe_t *cqe;
1444 tavor_hw_cqe_t *next_cqe;
1445 uint32_t cons_indx, tail_cons_indx;
1446 uint32_t new_indx, check_indx, indx;
1447 int cqe_qpnum, cqe_type;
1448 int outstanding_cqes, removed_cqes;
1449 int i;
1450
1451 /* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */
1452
1453 cq = qp->qp_rq_cqhdl;
1454 wqhdr = qp->qp_rq_wqhdr;
1455
1456 dapl_os_assert(wqhdr->wq_wrid_post != NULL);
1457 dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);
1458
1459 /* Get the consumer index */
1460 cons_indx = cq->cq_consindx;
1461
1462 /* Calculate the pointer to the first CQ entry */
1463 cqe = &cq->cq_addr[cons_indx];
1464
1465 /*
1466 * Loop through the CQ looking for entries owned by software. If an
1467 * entry is owned by software then we increment an 'outstanding_cqes'
1468 * count to know how many entries total we have on our CQ. We use this
1469 * value further down to know how many entries to loop through looking
1470 * for our same QP number.
1471 */
1472 outstanding_cqes = 0;
1473 tail_cons_indx = cons_indx;
1474 while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
1475 /* increment total cqes count */
1476 outstanding_cqes++;
1477
1478 /* increment the consumer index */
1479 tail_cons_indx = (tail_cons_indx + 1) & cq_wrap_around_mask;
1480
1481 /* update the pointer to the next cq entry */
1482 cqe = &cq->cq_addr[tail_cons_indx];
1483 }
1484
1485 /*
1486 * Using the 'tail_cons_indx' that was just set, we now know how many
1487 * total CQEs possible there are. Set the 'check_indx' and the
1488 * 'new_indx' to the last entry identified by 'tail_cons_indx'
1489 */
1490 check_indx = new_indx = (tail_cons_indx - 1) & cq_wrap_around_mask;
1491
1492 for (i = 0; i < outstanding_cqes; i++) {
1493 cqe = &cq->cq_addr[check_indx];
1494
1495 /* Grab QP number from CQE */
1496 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
1497 cqe_type = HERMON_CQE_SENDRECV_GET(cqe);
1498
1499 /*
1500 * If the QP number is the same in the CQE as the QP that we
1501 * have on this SRQ, then we must free up the entry off the
1502 * SRQ. We also make sure that the completion type is of the
1503 * 'TAVOR_COMPLETION_RECV' type. So any send completions on
1504 * this CQ will be left as-is. The handling of returning
1505 * entries back to HW ownership happens further down.
1506 */
1507 if (cqe_qpnum == qp->qp_num &&
1508 cqe_type == TAVOR_COMPLETION_RECV) {
1509 /* Add back to SRQ free list */
1510 (void) dapli_tavor_wrid_find_match_srq(
1511 wqhdr->wq_wrid_post, cqe);
1512 } else {
1513 /* Do Copy */
1514 if (check_indx != new_indx) {
1515 next_cqe = &cq->cq_addr[new_indx];
1516 /*
1517 * Copy the CQE into the "next_cqe"
1518 * pointer.
1519 */
1520 (void) dapl_os_memcpy(next_cqe, cqe,
1521 sizeof (tavor_hw_cqe_t));
1522 }
1523 new_indx = (new_indx - 1) & cq_wrap_around_mask;
1524 }
1525 /* Move index to next CQE to check */
1526 check_indx = (check_indx - 1) & cq_wrap_around_mask;
1527 }
1528
1529 /* Initialize removed cqes count */
1530 removed_cqes = 0;
1531
1532 /* If an entry was removed */
1533 if (check_indx != new_indx) {
1534
1535 /*
1536 * Set current pointer back to the beginning consumer index.
1537 * At this point, all unclaimed entries have been copied to the
1538 * index specified by 'new_indx'. This 'new_indx' will be used
1539 * as the new consumer index after we mark all freed entries as
1540 * having HW ownership. We do that here.
1541 */
1542
1543 /* Loop through all entries until we reach our new pointer */
1544 for (indx = cons_indx; indx <= new_indx;
1545 indx = (indx + 1) & cq_wrap_around_mask) {
1546 removed_cqes++;
1547 cqe = &cq->cq_addr[indx];
1548
1549 /* Reset entry to hardware ownership */
1550 TAVOR_CQE_OWNER_SET_HW(cqe);
1551 }
1552 }
1553
1554 /*
1555 * Update consumer index to be the 'new_indx'. This moves it past all
1556 * removed entries. Because 'new_indx' is pointing to the last
1557 * previously valid SW owned entry, we add 1 to point the cons_indx to
1558 * the first HW owned entry.
1559 */
1560 cons_indx = (new_indx + 1) & cq_wrap_around_mask;
1561
1562 /*
1563 * Now we only ring the doorbell (to update the consumer index) if
1564 * we've actually consumed a CQ entry. If we found no QP number
1565 * matches above, then we would not have removed anything. So only if
1566 * something was removed do we ring the doorbell.
1567 */
1568 if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1569 /*
1570 * Update the consumer index in both the CQ handle and the
1571 * doorbell record.
1572 */
1573 cq->cq_consindx = cons_indx;
1574 dapli_hermon_cq_update_ci(cq, cq->cq_poll_dbp);
1575 }
1576 }
1577
1578 static void
1579 dapli_hermon_rq_prelink(caddr_t first, uint32_t desc_off, uint32_t wqesz,
1580 uint32_t numwqe, uint32_t nds)
1581 {
1582 int i;
1583 uint32_t *p = (uint32_t *)(uintptr_t)first;
1584 uint32_t off = desc_off;
1585 uint32_t pincr = wqesz / sizeof (uint32_t);
1586 ibt_wr_ds_t sgl;
1587
1588 sgl.ds_va = (ib_vaddr_t)0;
1589 sgl.ds_key = HERMON_WQE_SGL_INVALID_LKEY;
1590 sgl.ds_len = (ib_msglen_t)0;
1591
1592 for (i = 0; i < numwqe - 1; i++, p += pincr) {
1593 off += wqesz;
1594 p[0] = HTOBE_32(off); /* link curr to next */
1595 p[1] = nds; /* nds is 0 for SRQ */
1596 TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
1597 }
1598 p[0] = HTOBE_32(desc_off); /* link last to first */
1599 p[1] = nds;
1600 TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
1601 }
1602
1603 static void
1604 dapli_hermon_sq_init(caddr_t first, uint32_t wqesz, uint32_t numwqe)
1605 {
1606 int i, j;
1607 uint64_t *wqe = (uint64_t *)(uintptr_t)first;
1608
1609 for (i = 0; i < numwqe; i++) {
1610 for (j = 0; j < wqesz; j += 64, wqe += 8)
1611 *(uint32_t *)wqe = 0xFFFFFFFF;
1612 }
1613 }
1614
1615 static void
1616 dapli_hermon_qp_init(ib_qp_handle_t qp)
1617 {
1618 dapli_hermon_sq_init(qp->qp_sq_buf, qp->qp_sq_wqesz, qp->qp_sq_numwqe);
1619 qp->qp_rq_counter = 0;
1620 qp->qp_sq_counter = 0;
1621 }
1622
1623 static void
1624 dapli_hermon_cq_init(ib_cq_handle_t cq)
1625 {
1626 uint32_t i;
1627
1628 (cq->cq_arm_dbp)[0] = HTOBE_32(1 << 28);
1629 for (i = 0; (1 << i) < cq->cq_size; i++)
1630 ;
1631 cq->cq_log_cqsz = i;
1632 cq->cq_consindx = 0;
1633
1634 /* cq_resize -- needs testing */
1635 }
1636
1637 static void
1638 dapli_hermon_srq_init(ib_srq_handle_t srq)
1639 {
1640 /* pre-link the whole shared receive queue */
1641 dapli_hermon_rq_prelink(srq->srq_addr, srq->srq_wq_desc_addr,
1642 srq->srq_wq_wqesz, srq->srq_wq_numwqe, 0);
1643 srq->srq_counter = 0;
1644
1645 /* needs testing */
1646 }
1647
1648 void
1649 dapls_init_funcs_hermon(DAPL_HCA *hca_ptr)
1650 {
1651 hca_ptr->post_send = dapli_hermon_post_send;
1652 hca_ptr->post_recv = dapli_hermon_post_recv;
1653 hca_ptr->post_srq = dapli_hermon_post_srq;
1654 hca_ptr->cq_peek = dapli_hermon_cq_peek;
1655 hca_ptr->cq_poll = dapli_hermon_cq_poll;
1656 hca_ptr->cq_poll_one = dapli_hermon_cq_poll_one;
1657 hca_ptr->cq_notify = dapli_hermon_cq_notify;
1658 hca_ptr->srq_flush = dapli_hermon_cq_srq_entries_flush;
1659 hca_ptr->qp_init = dapli_hermon_qp_init;
1660 hca_ptr->cq_init = dapli_hermon_cq_init;
1661 hca_ptr->srq_init = dapli_hermon_srq_init;
1662 hca_ptr->hermon_resize_cq = 1;
1663
1664 (void) pthread_spin_init(&hermon_bf_lock, 0);
1665 }
1666
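/*
 * Usage sketch (illustrative only, not compiled in): once a DAPL_HCA for
 * a Hermon device has been set up, a caller installs these entry points
 * and then drives posting and completion handling through the function
 * table.  The variable names below (hca_ptr, ep_ptr, send_wr, cq_handle)
 * are placeholders, not identifiers defined in this file.
 */
#if 0
	ibt_wc_t	wc;
	DAT_RETURN	dat_status;

	dapls_init_funcs_hermon(hca_ptr);
	dat_status = (*hca_ptr->post_send)(ep_ptr, &send_wr, B_TRUE);
	(void) (*hca_ptr->cq_notify)(cq_handle, IB_NOTIFY_ON_NEXT_COMP, 0);
	dat_status = (*hca_ptr->cq_poll_one)(cq_handle, &wc);
#endif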