/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file may contain confidential information of
 * Mellanox Technologies, Ltd. and should not be distributed in source
 * form without approval from Sun Legal.
 */

#include "dapl.h"
#include "dapl_tavor_hw.h"
#include "dapl_tavor_wr.h"
#include "dapl_tavor_ibtf_impl.h"

/*
 * Function signatures
 */
extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
    uint_t, uint_t, dapls_tavor_wrid_entry_t *);
extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
extern DAPL_OS_LOCK g_tavor_uar_lock;

#ifndef _LP64
extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
#endif

static int dapli_tavor_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_send_linknext(ibt_send_wr_t *, uint64_t *,
    boolean_t, uint32_t, uint_t, uint64_t *, tavor_sw_wqe_dbinfo_t *);
static DAT_RETURN dapli_tavor_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_recv_linknext(uint64_t *, boolean_t, uint32_t,
    uint_t, uint64_t *);
static int dapli_tavor_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);
static int dapli_tavor_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);

/* exported to other HCAs */
extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
    uint32_t, uint_t);
extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);

/*
 * Note: The 64-bit doorbells need to be written atomically.
 * In 32-bit libraries we need to use a special assembly routine,
 * because compiler-generated code would split the write into two
 * 32-bit word writes.
 */
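/*
 * Illustrative sketch (not part of the driver): if a 64-bit doorbell were
 * written as two ordinary 32-bit stores, the HCA could sample the register
 * between the stores and observe a torn value.  With hypothetical halves:
 *
 *     doorbell = 0x1122334455667788;
 *     ((uint32_t *)uar)[0] = 0x11223344;   // HCA may read here ...
 *     ((uint32_t *)uar)[1] = 0x55667788;   // ... and see only half updated
 *
 * Hence the single 64-bit store on LP64, and the lock-protected ordered
 * stores (or the atomic assembly routine) on 32-bit platforms below.
 */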

#if defined(_LP64) || defined(__lint)
/* use a macro to ensure inlining on S10 amd64 compiler */
#define dapli_tavor_cq_doorbell(ia_uar, cq_cmd, cqn, cq_param) \
    ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64( \
        ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) | \
        ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param)
#else

/*
 * dapli_tavor_cq_doorbell()
 * Takes the specified cq cmd and cq number and rings the cq doorbell
 */
static void
dapli_tavor_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
        ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
    /* 32 bit version */
#elif defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32-bit Intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->cq[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->cq[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->cq);
#endif
}
#pragma inline(dapli_tavor_cq_doorbell)

#endif /* _LP64 */

#if defined(_LP64) || defined(__lint)
#define dapli_tavor_qp_send_doorbell(ia_uar, nda, nds, qpn, fence, nopcode) \
    ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64( \
        (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) << \
        TAVOR_QPSNDDB_NDA_SHIFT) | \
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) | \
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) | \
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds)
#else

/*
 * dapli_tavor_qp_send_doorbell()
 * Takes the specified next descriptor information, qp number, opcode and
 * rings the send doorbell
 */
static void
dapli_tavor_qp_send_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
        TAVOR_QPSNDDB_NDA_SHIFT) |
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
#else
#if defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32-bit Intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->send[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->send[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->send);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_send_doorbell)
#endif /* _LP64 */

#if defined(_LP64) || defined(__lint)

#define dapli_tavor_qp_recv_doorbell(ia_uar, nda, nds, qpn, credits) \
    ((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64( \
        (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) << \
        TAVOR_QPRCVDB_NDA_SHIFT) | \
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) | \
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits)
#else

/*
 * dapli_tavor_qp_recv_doorbell()
 * Takes the specified next descriptor information, qp number and
 * rings the recv doorbell
 */
static void
dapli_tavor_qp_recv_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
        TAVOR_QPRCVDB_NDA_SHIFT) |
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(doorbell);
#else
#if defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32-bit Intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->recv[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->recv[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->recv);
#endif
#endif
}
#pragma inline(dapli_tavor_qp_recv_doorbell)
#endif /* _LP64 */


/*
 * dapls_tavor_max_inline()
 * Return the max inline value that should be used.
 * The env variable DAPL_MAX_INLINE can override the default.
 * If it's not set (or set to -1), default behavior is used.
 * If it's zero or negative (except -1) inline is not done.
 */
int
dapls_tavor_max_inline(void)
{
    static int max_inline_env = -2;

    /* Check the env exactly once, otherwise return the previous value. */
    if (max_inline_env != -2)
        return (max_inline_env);

    max_inline_env = dapl_os_get_env_val("DAPL_MAX_INLINE", -1);
    if (max_inline_env != -1)
        if (max_inline_env <= 0)
            max_inline_env = 0; /* no inlining */
    return (max_inline_env);
}
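/*
 * Behavior sketch for the logic above (illustrative, assumed semantics of
 * dapl_os_get_env_val() returning -1 when the variable is unset):
 *
 *     DAPL_MAX_INLINE unset  -> returns -1 (use default heuristics)
 *     DAPL_MAX_INLINE=-1     -> returns -1 (explicit default)
 *     DAPL_MAX_INLINE=0      -> returns 0  (inlining disabled)
 *     DAPL_MAX_INLINE=-5     -> returns 0  (inlining disabled)
 *     DAPL_MAX_INLINE=128    -> returns 128 (cap inline data at 128 bytes)
 *
 * The env is read once; later calls return the cached value.
 */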

/*
 * dapls_ib_max_request_iov(), aka, max send sgl size.
 * The send queue's scatter/gather list is used for "inline" data.
 *
 * By default, compute a reasonable send queue size based on #iovs, #wqes,
 * max_iovs, and max inline byte count.  If the #wqes is large, then we
 * limit how much the SGL (space for inline data) can take.  The heuristic
 * is to increase the memory for the send queue to a maximum of 32KB:
 *
 *     < 128 wqes    increase to at most 256 minus header
 *     < 256 wqes    increase to at most 128 minus header
 *     >= 256 wqes   use SGL unaltered
 *
 * If the env is supplied (max_inline >= 0), use it without checking.
 */
int
dapls_ib_max_request_iov(int iovs, int wqes, int max_iovs,
    int max_inline_bytes)
{
    int ret_iovs;

    if (max_inline_bytes > 0) {
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else if (wqes < 128) {
        max_inline_bytes = 256 - TAVOR_INLINE_HEADER_SIZE_MAX;
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else if (wqes < 256) {
        max_inline_bytes = 128 - TAVOR_INLINE_HEADER_SIZE_MAX;
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else {
        ret_iovs = iovs;
    }

    if (ret_iovs > max_iovs)    /* do not exceed max */
        ret_iovs = max_iovs;
    if (iovs > ret_iovs)        /* never decrease iovs */
        ret_iovs = iovs;
    return (ret_iovs);
}
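/*
 * Worked example (sketch; assumes a 16-byte tavor_hw_wqe_sgl_t and a
 * hypothetical TAVOR_INLINE_HEADER_SIZE_MAX of 16 bytes):
 *
 *     iovs = 4, wqes = 100, max_iovs = 28, max_inline_bytes = -1
 *     -> wqes < 128, so max_inline_bytes = 256 - 16 = 240
 *     -> ret_iovs = 240 / 16 = 15 SGL slots
 *     15 <= max_iovs and 15 >= iovs, so 15 is returned.
 *
 * The result only ever grows the caller's request (never below "iovs")
 * and is clamped at "max_iovs".
 */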

/*
 * dapli_tavor_wqe_send_build()
 * Constructs a WQE for a given ibt_send_wr_t
 */
static int
dapli_tavor_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
    tavor_hw_snd_wqe_remaddr_t *rc;
    tavor_hw_snd_wqe_bind_t *bn;
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t *sgl;
    uint32_t nds;
    uint32_t len, total_len;
    uint32_t tavor_num_mpt_mask;
    uint32_t new_rkey;
    uint32_t old_rkey;
    int i, num_ds;
    int max_inline_bytes = -1;

    nds = wr->wr_nds;
    sgl = wr->wr_sgl;
    num_ds = 0;

    /*
     * RC is the only transport supported by uDAPL.
     * For RC requests, we allow "Send", "RDMA Read", "RDMA Write"
     */
    switch (wr->wr_opcode) {
    case IBT_WRC_SEND:
        /*
         * If this is a Send request, then all we need is
         * the Data Segment processing below.
         * Initialize the information for the Data Segments
         */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));
        if (qp->qp_sq_inline != 0)
            max_inline_bytes =
                qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
        break;
    case IBT_WRC_RDMAW:
        if (qp->qp_sq_inline != 0)
            max_inline_bytes =
                qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
        /* FALLTHROUGH */
    case IBT_WRC_RDMAR:
        if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
            qp->qp_sq_inline = 0;
        /*
         * If this is an RDMA Read or RDMA Write request, then fill
         * in the "Remote Address" header fields.
         */
        rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));

        /*
         * Build the Remote Address Segment for the WQE, using
         * the information from the RC work request.
         */
        TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);

        /* Update "ds" for filling in Data Segments (below) */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
            sizeof (tavor_hw_snd_wqe_remaddr_t));
        break;
    case IBT_WRC_BIND:
        /*
         * Generate a new R_key.
         * Increment the upper "unconstrained" bits while keeping
         * the lower "constrained" bits the same, since they
         * represent the MPT index.
         */
        old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
        tavor_num_mpt_mask = (uint32_t)(1 << qp->qp_num_mpt_shift) - 1;
        new_rkey = (old_rkey >> qp->qp_num_mpt_shift);
        new_rkey++;
        new_rkey = ((new_rkey << qp->qp_num_mpt_shift) |
            (old_rkey & tavor_num_mpt_mask));
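        /*
         * Illustrative example (hypothetical values): with
         * qp_num_mpt_shift = 16 and old_rkey = 0x00031234, the MPT
         * index (low 16 bits) is 0x1234 and the "unconstrained"
         * counter (high bits) is 0x0003.  The counter is incremented
         * and the pieces recombined:
         *
         *     new_rkey = ((0x0003 + 1) << 16) | 0x1234 = 0x00041234
         */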

        wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;

        bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));

        /*
         * Build the Bind Memory Window Segments for the WQE,
         * using the information from the RC Bind memory
         * window work request.
         */
        TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);

        /*
         * Update the "ds" pointer.  Even though the "bind"
         * operation requires no SGLs, this is necessary to
         * facilitate the correct descriptor size calculations
         * (below).
         */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
            sizeof (tavor_hw_snd_wqe_bind_t));
        break;
    default:
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "dapli_tavor_wqe_send_build: invalid wr_opcode=%d\n",
            wr->wr_opcode);
        return (DAT_INTERNAL_ERROR);
    }

    /*
     * Now fill in the Data Segments (SGL) for the Send WQE based on
     * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
     * Start by checking for a valid number of SGL entries
     */
    if (nds > qp->qp_sq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Send Work Request, fill in the Send WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */

    if (max_inline_bytes != -1) {    /* compute total_len */
        total_len = 0;
        for (i = 0; i < nds; i++)
            total_len += sgl[i].ds_len;
        if (total_len > max_inline_bytes)
            max_inline_bytes = -1;    /* too big, do not "inline" */
    }
    if (max_inline_bytes != -1) {    /* do "inline" */
        uint8_t *dst = (uint8_t *)((uint32_t *)ds + 1);
        *(uint32_t *)ds =
            HTOBE_32(total_len | TAVOR_WQE_SGL_INLINE_MASK);
        for (i = 0; i < nds; i++) {
            if ((len = sgl[i].ds_len) == 0) {
                continue;
            }
            (void) dapl_os_memcpy(dst,
                (void *)(uintptr_t)sgl[i].ds_va, len);
            dst += len;
        }
        /* Return the size of descriptor (in 16-byte chunks) */
        *size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
    } else {
        for (i = 0; i < nds; i++) {
            if (sgl[i].ds_len == 0) {
                continue;
            }

            /*
             * Fill in the Data Segment(s) for the current WQE,
             * using the information contained in the
             * scatter-gather list of the work request.
             */
            TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
            num_ds++;
        }

        /* Return the size of descriptor (in 16-byte chunks) */
        *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
    }
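    /*
     * Size arithmetic sketch: descriptor sizes are reported to the
     * hardware in 16-byte chunks, hence the ">> 4".  For example, a WQE
     * whose segments end 112 bytes past "addr" yields 112 >> 4 = 7
     * chunks; the inline path rounds up first ((len + 15) >> 4) since
     * copied data need not end on a 16-byte boundary.
     */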

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_send_linknext()
 * Takes a WQE and links it to the prev WQE chain
 */
static void
dapli_tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, uint64_t *curr_addr,
    boolean_t ns, uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
    tavor_sw_wqe_dbinfo_t *dbinfo)
{
    uint64_t next, ctrl;
    uint32_t nopcode, fence;

    next = 0;
    ctrl = 0;

    /* Set the "c" (i.e. "signaled") bit appropriately */
    if (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
        ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
    }

    /* Set the "s" (i.e. "solicited") bit appropriately */
    if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
        ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
    }
    /* Set the "e" (i.e. "event") bit if notification is needed */
    if (!ns) {
        ctrl = ctrl | TAVOR_WQE_RCV_EVENT_MASK;
    }

    /*
     * The "i" bit is unused since uDAPL doesn't support
     * immediate data
     */

    /* initialize the ctrl and next fields of the current descriptor */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    /*
     * Calculate the "next" field of the prev descriptor.  This amounts
     * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
     * fields (see tavor_hw.h for more).
     */

    /*
     * Determine the value for the Tavor WQE "nopcode" field
     * by using the IBTF opcode from the work request
     */
    switch (curr_wr->wr_opcode) {
    case IBT_WRC_RDMAW:
        nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
        break;

    case IBT_WRC_SEND:
        nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
        break;

    case IBT_WRC_RDMAR:
        nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
        break;

    case IBT_WRC_BIND:
        nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
        break;
    default:
        /* Unsupported opcodes in uDAPL */
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "dapli_tavor_wqe_send_linknext: invalid wr_opcode=%d\n",
            curr_wr->wr_opcode);
        return;
    }

    next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
    next = next | ((uint64_t)nopcode << 32);
    fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
    if (fence) {
        next = next | TAVOR_WQE_SEND_FENCE_MASK;
    }
    next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);

    /*
     * A send queue doorbell will be rung for the next
     * WQE on the chain, so set the current WQE's "dbd" bit.
     * Note: We also update the "dbinfo" structure here to pass
     * back information about what should (later) be included
     * in the send queue doorbell.
     */
    next = next | TAVOR_WQE_DBD_MASK;
    dbinfo->db_nopcode = nopcode;
    dbinfo->db_fence = fence;

    /*
     * A send queue doorbell will be rung for the next WQE on
     * the chain, so update the prev WQE's "next" field and return.
     */
    if (prev_addr != NULL) {
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}
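/*
 * Composition sketch for the 64-bit "next" control word built above
 * (field positions per tavor_hw.h; values hypothetical):
 *
 *     next = (desc & NDA_MASK) << 32   // next WQE address
 *          | nopcode << 32             // opcode for the *next* WQE
 *          | F bit                     // fence, if requested
 *          | DBD bit                   // a doorbell will be rung
 *          | (descsz & NDS_MASK)       // next WQE size, 16-byte chunks
 *
 * Writing this word into the previous WQE is what actually links the new
 * WQE into the chain that the hardware walks.
 */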

/*
 * dapli_tavor_wqe_recv_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
    tavor_hw_wqe_sgl_t *ds;
    int i;
    int num_ds;

    /* Fill in the Data Segments (SGL) for the Recv WQE */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
        sizeof (tavor_hw_rcv_wqe_nextctrl_t));
    num_ds = 0;

    /* Check for a valid number of SGL entries */
    if (wr->wr_nds > qp->qp_rq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Recv Work Request, fill in the Recv WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < wr->wr_nds; i++) {
        if (wr->wr_sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the receive WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
        num_ds++;
    }

    /* Return the size of descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;

    return (DAT_SUCCESS);
}


/*
 * dapli_tavor_wqe_recv_linknext()
 * Links a recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_recv_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr)
{
    uint64_t next;
    uint64_t ctrl = 0;

    /*
     * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
     * at a time).  If there is no next descriptor (i.e. if the current
     * descriptor is the last WQE on the chain), then set the "next"
     * field to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
     * requires the "dbd" bit to be set to one for all Recv WQEs.
     * In either case, we must add a single bit in the "reserved" field
     * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
     * workaround for a known Tavor errata that can cause Recv WQEs with
     * zero in the NDA field to behave improperly.
     *
     * If notification suppression is not desired then we set
     * the "E" bit in the ctrl field.
     */

    next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
    if (!ns) {    /* notification needed - so set the "E" bit */
        ctrl = TAVOR_WQE_RCV_EVENT_MASK;
    }

    /* update the WQE */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    if (prev_addr != NULL) {
        /*
         * Calculate the "next" field of the descriptor.  This amounts
         * to setting up the "next_wqe_addr", "dbd", and "nds" fields
         * (see tavor_hw.h for more).
         */
        next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
        next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
            TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

        /*
         * If this WQE is supposed to be linked to the previous
         * descriptor, then we need to update not only the previous
         * WQE's "next" fields but we must not touch this WQE's
         * "ctrl" fields.
         */
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}

/*
 * dapli_tavor_wqe_srq_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
{
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t end_sgl;
    int i;
    int num_ds;

    /* Fill in the Data Segments (SGL) for the Recv WQE */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
        sizeof (tavor_hw_rcv_wqe_nextctrl_t));
    num_ds = 0;

    /* Check for a valid number of SGL entries */
    if (wr->wr_nds > srq->srq_wq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Recv Work Request, fill in the Recv WQE's data
     * segments.  Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
     * the encoding for zero means a 2GB transfer.  Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < wr->wr_nds; i++) {
        if (wr->wr_sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the receive WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
        num_ds++;
    }

    /*
     * For an SRQ, if the number of data segments is less than the maximum
     * specified at alloc, then we have to fill in a special "key" entry in
     * the sgl entry after the last valid one in this post request.  We do
     * that here.
     */
    if (num_ds < srq->srq_wq_sgl) {
        end_sgl.ds_va = (ib_vaddr_t)0;
        end_sgl.ds_len = (ib_msglen_t)0;
        end_sgl.ds_key = (ibt_lkey_t)1;
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
    }

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_srq_linknext()
 * Links a srq recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_srq_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint64_t *prev_addr)
{
    uint64_t next;
    uint64_t ctrl = 0;

    /*
     * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
     * at a time).  If there is no next descriptor (i.e. if the current
     * descriptor is the last WQE on the chain), then set the "next"
     * field to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
     * requires the "dbd" bit to be set to one for all Recv WQEs.
     * In either case, we must add a single bit in the "reserved" field
     * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
     * workaround for a known Tavor errata that can cause Recv WQEs with
     * zero in the NDA field to behave improperly.
     *
     * If notification suppression is not desired then we set
     * the "E" bit in the ctrl field.
     */

    next = TAVOR_RCV_WQE_NDA0_WA_MASK;
    if (!ns) {    /* notification needed - so set the "E" bit */
        ctrl = TAVOR_WQE_RCV_EVENT_MASK;
    }

    /* update the WQE */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    if (prev_addr != NULL) {
        /*
         * Calculate the "next" field of the descriptor.  This amounts
         * to setting up the "next_wqe_addr", "dbd", and "nds" fields
         * (see tavor_hw.h for more).
         */
        next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
        next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

        /*
         * If this WQE is supposed to be linked to the previous
         * descriptor, then we need to update not only the previous
         * WQE's "next" fields but we must not touch this WQE's
         * "ctrl" fields.
         */
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}

/*
 * dapli_tavor_cq_peek()
 * Peeks into a given CQ to check if there are any events that can be
 * polled.  It returns the number of CQEs that can be polled.
 */
static void
dapli_tavor_cq_peek(ib_cq_handle_t cq, int *num_cqe)
{
    tavor_hw_cqe_t *cqe;
    uint32_t imm_eth_pkey_cred;
    uint32_t cons_indx;
    uint32_t wrap_around_mask;
    uint32_t polled_cnt;
    uint_t doorbell_cnt;
    uint_t opcode;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask.  Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);
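    /*
     * Example: for a CQ of size 256 (all Tavor CQ sizes are powers of
     * two), wrap_around_mask = 0xFF, so the consumer index wraps
     * naturally: (255 + 1) & 0xFF == 0.
     */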

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Count entries in the CQ until we find an entry owned by
     * the hardware.
     */
    polled_cnt = 0;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        opcode = TAVOR_CQE_OPCODE_GET(cqe);
        /* Error CQEs map to multiple work completions */
        if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
            (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
            imm_eth_pkey_cred =
                TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
            doorbell_cnt =
                imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
            polled_cnt += (doorbell_cnt + 1);
        } else {
            polled_cnt++;
        }
        /* Increment the consumer index */
        cons_indx = (cons_indx + 1) & wrap_around_mask;

        /* Update the pointer to the next CQ entry */
        cqe = &cq->cq_addr[cons_indx];
    }

    *num_cqe = polled_cnt;
}

/*
 * dapli_tavor_cq_poll()
 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
 * array that is passed in.
 */
static DAT_RETURN
dapli_tavor_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
{
    tavor_hw_cqe_t *cqe;
    uint32_t cons_indx;
    uint32_t wrap_around_mask;
    uint32_t polled_cnt;
    uint32_t num_to_increment;
    DAT_RETURN dat_status;
    int status;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask.  Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Keep pulling entries from the CQ until we find an entry owned by
     * the hardware.  As long as the CQEs are owned by software, process
     * each entry by calling dapli_tavor_cq_cqe_consume() and updating the
     * CQ consumer index.  Note: We only update the consumer index if
     * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
     * Otherwise, it indicates that we are going to "recycle" the CQE
     * (probably because it is an error CQE that corresponds to more than
     * one completion).
     */
    polled_cnt = 0;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        status = dapli_tavor_cq_cqe_consume(cq, cqe,
            &wc_p[polled_cnt++]);
        if (status == TAVOR_CQ_SYNC_AND_DB) {
            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);

            /* Increment the consumer index */
            cons_indx = (cons_indx + 1) & wrap_around_mask;

            /* Update the pointer to the next CQ entry */
            cqe = &cq->cq_addr[cons_indx];
        }

        /*
         * If we have run out of space to store work completions,
         * then stop and return the ones we have pulled off the CQ.
         */
        if (polled_cnt >= num_wc) {
            break;
        }
    }

    dat_status = DAT_SUCCESS;
    /*
     * Now we only ring the doorbell (to update the consumer index) if
     * we've actually consumed a CQ entry.  If we have, for example,
     * pulled from a CQE that we are still in the process of "recycling"
     * for error purposes, then we would not update the consumer index.
     */
    if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
        /*
         * Post doorbell to update the consumer index.  Doorbell
         * value indicates number of entries consumed (minus 1)
         */
        if (cons_indx > cq->cq_consindx) {
            num_to_increment = (cons_indx - cq->cq_consindx) - 1;
        } else {
            num_to_increment = ((cons_indx + cq->cq_size) -
                cq->cq_consindx) - 1;
        }
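        /*
         * Example (sketch): if cq_consindx was 10 and entries 10, 11
         * and 12 were consumed, then cons_indx is 13 and the doorbell
         * parameter is (13 - 10) - 1 = 2, i.e. "number consumed minus
         * one".  The second branch handles the wrapped case by adding
         * cq_size first.
         */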
        cq->cq_consindx = cons_indx;
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
            cq->cq_num, num_to_increment);
    } else if (polled_cnt == 0) {
        /*
         * If the CQ is empty, we can try to free up some of the WRID
         * list containers.
         */
        if (cq->cq_wrid_reap_head)    /* look before leaping */
            dapls_tavor_wrid_cq_reap(cq);
        dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
    }

    if (num_polled != NULL) {
        *num_polled = polled_cnt;
    }

    return (dat_status);
}

/*
 * dapli_tavor_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
 * that is passed in.  See above for more comments/details.
 */
static DAT_RETURN
dapli_tavor_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
{
    tavor_hw_cqe_t *cqe;
    uint32_t cons_indx;
    DAT_RETURN dat_status;
    int status;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Pull an entry from the CQ if it is owned by software.  Process it
     * by calling dapli_tavor_cq_cqe_consume() and updating the CQ
     * consumer index.  Note: We only update the consumer index if
     * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
     * Otherwise, it indicates that we are going to "recycle" the CQE
     * (probably because it is an error CQE that corresponds to more than
     * one completion).
     */
    if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        status = dapli_tavor_cq_cqe_consume(cq, cqe, wc_p);
        if (status == TAVOR_CQ_SYNC_AND_DB) {
            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);

            /* Increment the consumer index */
            cq->cq_consindx =
                (cons_indx + 1) & (cq->cq_size - 1);
            dapli_tavor_cq_doorbell(cq->cq_iauar,
                TAVOR_CQDB_INCR_CONSINDX,
                cq->cq_num, 0);
        }
        dat_status = DAT_SUCCESS;
    } else {
        if (cq->cq_wrid_reap_head)    /* look before leaping */
            dapls_tavor_wrid_cq_reap(cq);
        dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
    }
    return (dat_status);
}

/*
 * dapli_tavor_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
 */
static int
dapli_tavor_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
    uint_t flags;
    uint_t type;
    uint_t opcode;
    int status;

    /*
     * Determine if this is an "error" CQE by examining "opcode".  If it
     * is an error CQE, then call dapli_tavor_cq_errcqe_consume() and
     * return whatever status it returns.  Otherwise, this is a successful
     * completion.
     */
    opcode = TAVOR_CQE_OPCODE_GET(cqe);
    if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
        (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
        status = dapli_tavor_cq_errcqe_consume(cqhdl, cqe, wc);
        return (status);
    }

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
        TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
    wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

    /*
     * Parse the CQE opcode to determine the completion type.  This will
     * set not only the type of the completion, but also any flags that
     * might be associated with it (e.g. whether immediate data is
     * present).
     */
    flags = IBT_WC_NO_FLAGS;
    if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {

        /*
         * Send CQE
         *
         * The following opcodes will not be generated in uDAPL
         * case TAVOR_CQE_SND_RDMAWR_IMM:
         * case TAVOR_CQE_SND_SEND_IMM:
         * case TAVOR_CQE_SND_ATOMIC_CS:
         * case TAVOR_CQE_SND_ATOMIC_FA:
         */
        switch (opcode) {
        case TAVOR_CQE_SND_RDMAWR:
            type = IBT_WRC_RDMAW;
            break;

        case TAVOR_CQE_SND_SEND:
            type = IBT_WRC_SEND;
            break;

        case TAVOR_CQE_SND_RDMARD:
            type = IBT_WRC_RDMAR;
            wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
            break;

        case TAVOR_CQE_SND_BIND_MW:
            type = IBT_WRC_BIND;
            break;

        default:
            wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    } else {

        /*
         * Receive CQE
         *
         * The following opcodes will not be generated in uDAPL
         *
         * case TAVOR_CQE_RCV_RECV_IMM:
         * case TAVOR_CQE_RCV_RECV_IMM2:
         * case TAVOR_CQE_RCV_RDMAWR_IMM:
         * case TAVOR_CQE_RCV_RDMAWR_IMM2:
         */
        switch (opcode & 0x1F) {
        case TAVOR_CQE_RCV_RECV:
            /* FALLTHROUGH */
        case TAVOR_CQE_RCV_RECV2:
            type = IBT_WRC_RECV;
            wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
            break;
        default:
            wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    }
    wc->wc_type = type;
    wc->wc_flags = flags;
    /* If we got here, the completion status must be success */
    wc->wc_status = IBT_WC_SUCCESS;

    return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * dapli_tavor_cq_errcqe_consume()
 */
static int
dapli_tavor_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
    dapls_tavor_wrid_entry_t wre;
    uint32_t next_wqeaddr;
    uint32_t imm_eth_pkey_cred;
    uint_t nextwqesize, dbd;
    uint_t doorbell_cnt, status;
    uint_t opcode = TAVOR_CQE_OPCODE_GET(cqe);

    dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
        TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
        TAVOR_CQE_WQEADDRSZ_GET(cqe));

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
        (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
        TAVOR_COMPLETION_RECV, 1, &wre);
    wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

    /*
     * Parse the CQE opcode to determine the completion type.  We know
     * that the CQE is an error completion, so we extract only the
     * completion status here.
     */
    imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
    status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
    switch (status) {
    case TAVOR_CQE_LOC_LEN_ERR:
        status = IBT_WC_LOCAL_LEN_ERR;
        break;

    case TAVOR_CQE_LOC_OP_ERR:
        status = IBT_WC_LOCAL_CHAN_OP_ERR;
        break;

    case TAVOR_CQE_LOC_PROT_ERR:
        status = IBT_WC_LOCAL_PROTECT_ERR;
        break;

    case TAVOR_CQE_WR_FLUSHED_ERR:
        status = IBT_WC_WR_FLUSHED_ERR;
        break;

    case TAVOR_CQE_MW_BIND_ERR:
        status = IBT_WC_MEM_WIN_BIND_ERR;
        break;

    case TAVOR_CQE_BAD_RESPONSE_ERR:
        status = IBT_WC_BAD_RESPONSE_ERR;
        break;

    case TAVOR_CQE_LOCAL_ACCESS_ERR:
        status = IBT_WC_LOCAL_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_INV_REQ_ERR:
        status = IBT_WC_REMOTE_INVALID_REQ_ERR;
        break;

    case TAVOR_CQE_REM_ACC_ERR:
        status = IBT_WC_REMOTE_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_OP_ERR:
        status = IBT_WC_REMOTE_OP_ERR;
        break;

    case TAVOR_CQE_TRANS_TO_ERR:
        status = IBT_WC_TRANS_TIMEOUT_ERR;
        break;

    case TAVOR_CQE_RNRNAK_TO_ERR:
        status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
        break;

    /*
     * The following error codes are not supported in the Tavor driver
     * as they relate only to Reliable Datagram completion statuses:
     * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
     * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
     * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
     * case TAVOR_CQE_INV_EEC_NUM_ERR:
     * case TAVOR_CQE_INV_EEC_STATE_ERR:
     * case TAVOR_CQE_LOC_EEC_ERR:
     */

    default:
        status = IBT_WC_LOCAL_CHAN_OP_ERR;
        break;
    }
    wc->wc_status = status;
    wc->wc_type = 0;
    /*
     * Now we do all the checking that's necessary to handle completion
     * queue entry "recycling".
     *
     * It is not necessary here to try to sync the WQE as we are only
     * attempting to read from the Work Queue (and hardware does not
     * write to it).
     */

    /*
     * We can get the doorbell info, WQE address, and size for the next
     * WQE from the "wre" (which was filled in above in the call to the
     * dapls_tavor_wrid_get_entry() routine)
     */
    dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
    next_wqeaddr = wre.wr_wqeaddrsz;
    nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

    /*
     * Get the doorbell count from the CQE.  This indicates how many
     * completions this one CQE represents.
     */
    doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;

    /*
     * Determine if we're ready to consume this CQE yet or not.  If the
     * next WQE has size zero (i.e. no next WQE) or if the doorbell count
     * is down to zero, then this is the last/only completion represented
     * by the current CQE (return TAVOR_CQ_SYNC_AND_DB).  Otherwise, the
     * current CQE needs to be recycled (see below).
     */
    if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
        /*
         * Consume the CQE.
         * Return a status to indicate that a doorbell and sync may be
         * necessary.
         */
        return (TAVOR_CQ_SYNC_AND_DB);

    } else {
        /*
         * Recycle the CQE for use in the next PollCQ() call.
         * Decrement the doorbell count, modify the error status,
         * and update the WQE address and size (to point to the
         * next WQE on the chain).  Put these updated entries back
         * into the CQE.
         * Despite the fact that we have updated the CQE, it is not
         * necessary for us to attempt to sync this entry just yet
         * as we have not changed the "hardware's view" of the
         * entry (i.e. we have not modified the "owner" bit - which
         * is all that the Tavor hardware really cares about).
         */
        doorbell_cnt = doorbell_cnt - dbd;
        TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cqe,
            ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
            (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
        TAVOR_CQE_WQEADDRSZ_SET(cqe,
            TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
        dapl_dbg_log(DAPL_DBG_TYPE_EVD,
            "errcqe_consume: recycling cqe.eth=%x, wqe=%x\n",
            TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
            TAVOR_CQE_WQEADDRSZ_GET(cqe));
        return (TAVOR_CQ_RECYCLE_ENTRY);
    }
}
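/*
 * Recycling sketch: suppose one error CQE covers three unsignaled WQEs
 * plus the doorbelled one (doorbell_cnt = 3).  The first PollCQ() call
 * returns one flushed completion and rewrites the CQE with
 * doorbell_cnt = 2 and the next WQE's address/size; ownership stays with
 * software, so the next poll consumes the same CQE again.  Only when
 * doorbell_cnt reaches zero (with "dbd" set) or the next WQE size is
 * zero is TAVOR_CQ_SYNC_AND_DB returned and the CQE given back to
 * hardware.
 */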

/*
 * dapli_tavor_cq_notify()
 * This function is used for arming the CQ by ringing the CQ doorbell.
 */
static DAT_RETURN
dapli_tavor_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
{
    uint32_t cqnum;

    /*
     * Determine if we are trying to get the next completion or the next
     * "solicited" completion.  Then hit the appropriate doorbell.
     */
    cqnum = cq->cq_num;
    if (flags == IB_NOTIFY_ON_NEXT_COMP) {
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_CQ,
            cqnum, TAVOR_CQDB_DEFAULT_PARAM);

    } else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
        dapli_tavor_cq_doorbell(cq->cq_iauar,
            TAVOR_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
            TAVOR_CQDB_DEFAULT_PARAM);

    } else if (flags == IB_NOTIFY_ON_NEXT_NCOMP) {
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_NCQ,
            cqnum, param);
    } else {
        return (DAT_INVALID_PARAMETER);
    }

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_send()
 */
static DAT_RETURN
dapli_tavor_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
{
    tavor_sw_wqe_dbinfo_t dbinfo;
    dapls_tavor_wrid_list_hdr_t *wridlist;
    dapls_tavor_wrid_entry_t *wre_last;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint32_t desc_sz;
    uint32_t wqeaddrsz, signaled_dbd = 0;
    uint32_t head, tail, next_tail, qsize_msk;
    int status;
    ib_qp_handle_t qp;

    if ((ep->qp_state == IBT_STATE_RESET) ||
        (ep->qp_state == IBT_STATE_INIT) ||
        (ep->qp_state == IBT_STATE_RTR)) {
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "post_send: invalid qp_state %d\n", ep->qp_state);
        return (DAT_INVALID_STATE);
    }

    qp = ep->qp_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
    wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
    tail = qp->qp_sq_wqhdr->wq_tail;
    head = qp->qp_sq_wqhdr->wq_head;

    /*
     * Check for the "queue full" condition.  If the queue is already
     * full, then no more WQEs can be posted; return an error
     */
    if (qp->qp_sq_wqhdr->wq_full != 0) {
        dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /*
     * Increment the "tail index" and check for the "queue full"
     * condition.  If we detect that the current work request is going to
     * fill the work queue, then we mark this condition and continue.
     */
    next_tail = (tail + 1) & qsize_msk;
    if (next_tail == head) {
        qp->qp_sq_wqhdr->wq_full = 1;
    }

    /*
     * Get the user virtual address of the location where the next
     * Send WQE should be built
     */
    wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);

    /*
     * Call dapli_tavor_wqe_send_build() to build the WQE at the given
     * address.  This routine uses the information in the ibt_send_wr_t
     * and returns the size of the WQE when it returns.
     */
    status = dapli_tavor_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
        return (status);
    }

    /*
     * Get the descriptor (IO address) corresponding to the location
     * where the Send WQE was built.
     */
    desc = TAVOR_QP_SQ_DESC(qp, tail);

    dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
        desc <= (qp->qp_sq_desc_addr +
        qp->qp_sq_numwqe * qp->qp_sq_wqesz));

    /*
     * Add a WRID entry to the WRID list.  Need to calculate the
     * "wqeaddrsz" and "signaled_dbd" values to pass to
     * dapli_tavor_wrid_add_entry()
     */
    wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);

    if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
        signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
    }

    dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
        signaled_dbd);

    /*
     * Now link the wqe to the old chain (if there was one)
     */
    dapli_tavor_wqe_send_linknext(wr, wqe_addr, ns, desc, desc_sz,
        qp->qp_sq_lastwqeaddr, &dbinfo);

    /*
     * Now if the WRID tail entry is non-NULL, then this
     * represents the entry to which we are chaining the
     * new entries.  Since we are going to ring the
     * doorbell for this WQE, we want to set its "dbd" bit.
     *
     * On the other hand, if the tail is NULL, even though
     * we will have rung the doorbell for the previous WQE
     * (for the hardware's sake) it is irrelevant to our
     * purposes (for tracking WRIDs) because we know the
     * request must have already completed.
     */
    wre_last = wridlist->wl_wre_old_tail;
    if (wre_last != NULL) {
        wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
    }

    /* Update some of the state in the QP */
    qp->qp_sq_lastwqeaddr = wqe_addr;
    qp->qp_sq_wqhdr->wq_tail = next_tail;

    /* Ring the doorbell */
    dapli_tavor_qp_send_doorbell(qp->qp_iauar, desc, desc_sz,
        qp->qp_num, dbinfo.db_fence, dbinfo.db_nopcode);

    dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);

    return (DAT_SUCCESS);
}
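/*
 * Caller-side sketch (illustrative only, not part of this file): a
 * single-segment signaled send would typically fill an ibt_send_wr_t
 * and go through the function pointer installed by
 * dapls_init_funcs_tavor().  "buf_va", "lkey", "len", "my_cookie", "ep"
 * and "ns" are hypothetical caller state:
 *
 *     ibt_wr_ds_t sgl;
 *     ibt_send_wr_t swr;
 *
 *     sgl.ds_va = buf_va;          // registered buffer address
 *     sgl.ds_key = lkey;           // its local key
 *     sgl.ds_len = len;
 *     swr.wr_id = my_cookie;       // returned later in ibt_wc_t.wc_id
 *     swr.wr_flags = IBT_WR_SEND_SIGNAL;
 *     swr.wr_opcode = IBT_WRC_SEND;
 *     swr.wr_nds = 1;
 *     swr.wr_sgl = &sgl;
 *     (void) hca_ptr->post_send(ep, &swr, ns);
 */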

/*
 * dapli_tavor_post_recv()
 */
static DAT_RETURN
dapli_tavor_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
{
    dapls_tavor_wrid_list_hdr_t *wridlist;
    dapls_tavor_wrid_entry_t *wre_last;
    ib_qp_handle_t qp;
    DAT_RETURN status;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint32_t desc_sz;
    uint32_t wqeaddrsz;
    uint32_t head, tail, next_tail, qsize_msk;

    if (ep->qp_state == IBT_STATE_RESET) {
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "post_recv: invalid qp_state %d\n", ep->qp_state);
        return (DAT_INVALID_STATE);
    }
    qp = ep->qp_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
    wridlist = qp->qp_rq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
    tail = qp->qp_rq_wqhdr->wq_tail;
    head = qp->qp_rq_wqhdr->wq_head;

    /*
     * For the ibt_recv_wr_t passed in, parse the request and build a
     * Recv WQE.  Link the WQE with the previous WQE and ring the
     * doorbell.
     */

    /*
     * Check for the "queue full" condition.  If the queue is already
     * full, then no more WQEs can be posted.  So return an error.
     */
    if (qp->qp_rq_wqhdr->wq_full != 0) {
        dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /*
     * Increment the "tail index" and check for the "queue
     * full" condition.  If we detect that the current
     * work request is going to fill the work queue, then
     * we mark this condition and continue.
     */
    next_tail = (tail + 1) & qsize_msk;
    if (next_tail == head) {
        qp->qp_rq_wqhdr->wq_full = 1;
    }

    /* Get the descriptor (IO Address) of the WQE to be built */
    desc = TAVOR_QP_RQ_DESC(qp, tail);
    /* The user virtual address of the WQE to be built */
    wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);

    /*
     * Call dapli_tavor_wqe_recv_build() to build the WQE at the given
     * address.  This routine uses the information in the
     * ibt_recv_wr_t and returns the size of the WQE.
     */
    status = dapli_tavor_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INTERNAL_ERROR);
    }

    /*
     * Add a WRID entry to the WRID list.  Need to calculate the
     * "wqeaddrsz" and "signaled_dbd" values to pass to
     * dapli_tavor_wrid_add_entry().
     * Note: all Recv WQEs are essentially "signaled"
     */
    wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);
    dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
        (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);

    /*
     * Now link the chain to the old chain (if there was one)
     * and ring the doorbell for the recv work queue.
     */
    dapli_tavor_wqe_recv_linknext(wqe_addr, ns, desc, desc_sz,
        qp->qp_rq_lastwqeaddr);

    /*
     * Now if the WRID tail entry is non-NULL, then this
     * represents the entry to which we are chaining the
     * new entries.  Since we are going to ring the
     * doorbell for this WQE, we want to set its "dbd" bit.
     *
     * On the other hand, if the tail is NULL, even though
     * we will have rung the doorbell for the previous WQE
     * (for the hardware's sake) it is irrelevant to our
     * purposes (for tracking WRIDs) because we know the
     * request must have already completed.
     */
    wre_last = wridlist->wl_wre_old_tail;
    if (wre_last != NULL) {
        wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
    }

    /* Update some of the state in the QP */
    qp->qp_rq_lastwqeaddr = wqe_addr;
    qp->qp_rq_wqhdr->wq_tail = next_tail;

    /* Ring the doorbell */
    dapli_tavor_qp_recv_doorbell(qp->qp_iauar, desc, desc_sz,
        qp->qp_num, 1);

    dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_srq()
 */
static DAT_RETURN
dapli_tavor_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
{
    ib_srq_handle_t srq;
    DAT_RETURN status;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint64_t *last_wqe_addr;
    uint32_t head, next_head, qsize_msk;
    uint32_t wqe_index;

    srq = srqp->srq_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);

    /*
     * For the ibt_recv_wr_t passed in, parse the request and build a
     * Recv WQE.  Link the WQE with the previous WQE and ring the
     * doorbell.
     */

    /*
     * Check for the "queue full" condition.  If the queue is already
     * full, i.e. there are no free entries, then no more WQEs can be
     * posted.  So return an error.
     */
    if (srq->srq_wridlist->wl_freel_entries == 0) {
        dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /* Save away some initial SRQ state */
    qsize_msk = srq->srq_wridlist->wl_size - 1;
    head = srq->srq_wridlist->wl_freel_head;

    next_head = (head + 1) & qsize_msk;

    /* Get the descriptor (IO Address) of the WQE to be built */
    desc = srq->srq_wridlist->wl_free_list[head];

    wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
        srq->srq_wq_wqesz);

    /* The user virtual address of the WQE to be built */
    wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);

    /*
     * Call dapli_tavor_wqe_srq_build() to build the WQE at the given
     * address.  This routine uses the information in the
     * ibt_recv_wr_t and returns the size of the WQE.
     */
    status = dapli_tavor_wqe_srq_build(srq, wr, wqe_addr);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
        return (status);
    }

    /*
     * Add a WRID entry to the WRID list.
     */
    dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);

    if (srq->srq_wq_lastwqeindex == -1) {
        last_wqe_addr = NULL;
    } else {
        last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
            srq->srq_wq_lastwqeindex);
    }
    /*
     * Now link the chain to the old chain (if there was one)
     * and ring the doorbell for the SRQ.
     */
    dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);

    /* Update some of the state in the SRQ */
    srq->srq_wq_lastwqeindex = wqe_index;
    srq->srq_wridlist->wl_freel_head = next_head;
    srq->srq_wridlist->wl_freel_entries--;
    dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
        srq->srq_wridlist->wl_size);

    /* Ring the doorbell - for SRQ nds = 0 */
    dapli_tavor_qp_recv_doorbell(srq->srq_iauar, desc, 0,
        srq->srq_num, 1);

    dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wrid_add_entry()
 */
extern void
dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *wq, uint64_t wrid,
    uint32_t wqeaddrsz, uint_t signaled_dbd)
{
    dapls_tavor_wrid_entry_t *wre_tmp;
    uint32_t head, tail, size;

    /*
     * Find the entry in the container pointed to by the "tail" index.
     * Add all of the relevant information to that entry, including WRID,
     * the "wqeaddrsz" parameter, and whether it was signaled/unsignaled
     * and/or doorbelled.
     */
    head = wq->wq_wrid_post->wl_head;
    tail = wq->wq_wrid_post->wl_tail;
    size = wq->wq_wrid_post->wl_size;
    wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
    wre_tmp->wr_wrid = wrid;
    wre_tmp->wr_wqeaddrsz = wqeaddrsz;
    wre_tmp->wr_signaled_dbd = signaled_dbd;

    /*
     * Update the "wrid_old_tail" pointer to point to the entry we just
     * inserted into the queue.  By tracking this pointer (the pointer to
     * the most recently inserted entry) it will be possible later in the
     * PostSend() and PostRecv() code paths to find the entry that needs
     * its "doorbelled" flag set (see the comment in
     * dapli_tavor_post_recv() and/or dapli_tavor_post_send()).
     */
    wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;

    /* Update the tail index */
    tail = ((tail + 1) & (size - 1));
    wq->wq_wrid_post->wl_tail = tail;

    /*
     * If the "tail" index has just wrapped over into the "head" index,
     * then we have filled the container.  We use the "full" flag to
     * indicate this condition and to distinguish it from the "empty"
     * condition (where head and tail are also equal).
     */
    if (head == tail) {
        wq->wq_wrid_post->wl_full = 1;
    }
}
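/*
 * Full-detection sketch: with wl_size = 8, head = 3 and tail = 2,
 * inserting at tail = 2 advances tail to (2 + 1) & 7 == 3 == head, so
 * wl_full is set.  head == tail with wl_full clear means "empty"; with
 * wl_full set it means "full" -- the flag disambiguates the two cases.
 */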

/*
 * dapli_tavor_wrid_add_entry_srq()
 */
extern void
dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t srq, uint64_t wrid,
    uint32_t wqe_index)
{
    dapls_tavor_wrid_entry_t *wre;

    /* ASSERT on impossible wqe_index values */
    dapl_os_assert(wqe_index < srq->srq_wq_numwqe);

    /*
     * Set up the WRE.
     *
     * Given the 'wqe_index' value, we store the WRID at this WRE offset.
     * And we set the WRE to be signaled_dbd so that on poll CQ we can
     * find this information and associate the WRID with the WQE found on
     * the CQE.  Note: all Recv WQEs are essentially "signaled"
     */
    wre = &srq->srq_wridlist->wl_wre[wqe_index];
    wre->wr_wrid = wrid;
    wre->wr_signaled_dbd = (uint32_t)TAVOR_WRID_ENTRY_SIGNALED;
}

/*
 * dapli_tavor_cq_srq_entries_flush()
 */
static void
dapli_tavor_cq_srq_entries_flush(ib_qp_handle_t qp)
{
    ib_cq_handle_t cq;
    dapls_tavor_workq_hdr_t *wqhdr;
    tavor_hw_cqe_t *cqe;
    tavor_hw_cqe_t *next_cqe;
    uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
    uint32_t new_indx, check_indx, indx;
    uint32_t num_to_increment;
    int cqe_qpnum, cqe_type;
    int outstanding_cqes, removed_cqes;
    int i;

    /* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */

    cq = qp->qp_rq_cqhdl;
    wqhdr = qp->qp_rq_wqhdr;

    dapl_os_assert(wqhdr->wq_wrid_post != NULL);
    dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask.  Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Loop through the CQ looking for entries owned by software.  If an
     * entry is owned by software then we increment an 'outstanding_cqes'
     * count to know how many entries total we have on our CQ.  We use
     * this value further down to know how many entries to loop through
     * looking for our same QP number.
     */
    outstanding_cqes = 0;
    tail_cons_indx = cons_indx;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        /* increment total cqes count */
        outstanding_cqes++;

        /* increment the consumer index */
        tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

        /* update the pointer to the next cq entry */
        cqe = &cq->cq_addr[tail_cons_indx];
    }

    /*
     * Using the 'tail_cons_indx' that was just set, we now know how many
     * total CQEs there are.  Set the 'check_indx' and the 'new_indx' to
     * the last entry identified by 'tail_cons_indx'
     */
    check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;

    for (i = 0; i < outstanding_cqes; i++) {
        cqe = &cq->cq_addr[check_indx];

        /* Grab QP number from CQE */
        cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
        cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);

        /*
         * If the QP number is the same in the CQE as the QP that we
         * have on this SRQ, then we must free up the entry off the
         * SRQ.  We also make sure that the completion type is of the
         * 'TAVOR_COMPLETION_RECV' type.  So any send completions on
         * this CQ will be left as-is.  The handling of returning
         * entries back to HW ownership happens further down.
         */
        if (cqe_qpnum == qp->qp_num &&
            cqe_type == TAVOR_COMPLETION_RECV) {
            /* Add back to SRQ free list */
            (void) dapli_tavor_wrid_find_match_srq(
                wqhdr->wq_wrid_post, cqe);
        } else {
            /* Do Copy */
            if (check_indx != new_indx) {
                next_cqe = &cq->cq_addr[new_indx];
                /*
                 * Copy the CQE into the "next_cqe"
                 * pointer.
                 */
                (void) dapl_os_memcpy(next_cqe, cqe,
                    sizeof (tavor_hw_cqe_t));
            }
            new_indx = (new_indx - 1) & wrap_around_mask;
        }
        /* Move index to next CQE to check */
        check_indx = (check_indx - 1) & wrap_around_mask;
    }
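    /*
     * Compaction sketch: suppose the software-owned entries (oldest to
     * newest) are [A, B, C] and only B matches this QP's recv
     * completions.  Scanning backwards from C: C is kept, B is released
     * back to the SRQ free list, and A is copied down into B's old slot.
     * The surviving entries now occupy the newest slots, and everything
     * at or before 'new_indx' is handed back to hardware below.
     */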

    /* Initialize removed cqes count */
    removed_cqes = 0;

    /* If an entry was removed */
    if (check_indx != new_indx) {

        /*
         * Set the current pointer back to the beginning consumer index.
         * At this point, all unclaimed entries have been copied to the
         * index specified by 'new_indx'.  This 'new_indx' will be used
         * as the new consumer index after we mark all freed entries as
         * having HW ownership.  We do that here.
         */

        /* Loop through all entries until we reach our new pointer */
        for (indx = cons_indx; indx <= new_indx;
            indx = (indx + 1) & wrap_around_mask) {
            removed_cqes++;
            cqe = &cq->cq_addr[indx];

            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);
        }
    }

    /*
     * Update the consumer index to be the 'new_indx'.  This moves it past
     * all removed entries.  Because 'new_indx' is pointing to the last
     * previously valid SW owned entry, we add 1 to point the cons_indx to
     * the first HW owned entry.
     */
    cons_indx = (new_indx + 1) & wrap_around_mask;

    /*
     * Now we only ring the doorbell (to update the consumer index) if
     * we've actually consumed a CQ entry.  If we found no QP number
     * matches above, then we would not have removed anything.  So only if
     * something was removed do we ring the doorbell.
     */
    if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
        /*
         * Post doorbell to update the consumer index.  Doorbell
         * value indicates number of entries consumed (minus 1)
         */
        if (cons_indx > cq->cq_consindx) {
            num_to_increment = (cons_indx - cq->cq_consindx) - 1;
        } else {
            num_to_increment = ((cons_indx + cq->cq_size) -
                cq->cq_consindx) - 1;
        }
        cq->cq_consindx = cons_indx;

        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
            cq->cq_num, num_to_increment);
    }
}

/* ARGSUSED */
static void
dapli_tavor_qp_init(ib_qp_handle_t qp)
{
}

/* ARGSUSED */
static void
dapli_tavor_cq_init(ib_cq_handle_t cq)
{
}

/* ARGSUSED */
static void
dapli_tavor_srq_init(ib_srq_handle_t srq)
{
}

void
dapls_init_funcs_tavor(DAPL_HCA *hca_ptr)
{
    hca_ptr->post_send = dapli_tavor_post_send;
    hca_ptr->post_recv = dapli_tavor_post_recv;
    hca_ptr->post_srq = dapli_tavor_post_srq;
    hca_ptr->cq_peek = dapli_tavor_cq_peek;
    hca_ptr->cq_poll = dapli_tavor_cq_poll;
    hca_ptr->cq_poll_one = dapli_tavor_cq_poll_one;
    hca_ptr->cq_notify = dapli_tavor_cq_notify;
    hca_ptr->srq_flush = dapli_tavor_cq_srq_entries_flush;
    hca_ptr->qp_init = dapli_tavor_qp_init;
    hca_ptr->cq_init = dapli_tavor_cq_init;
    hca_ptr->srq_init = dapli_tavor_srq_init;
    hca_ptr->hermon_resize_cq = 0;
}