/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file may contain confidential information of
 * Mellanox Technologies, Ltd. and should not be distributed in source
 * form without approval from Sun Legal.
 */

#include "dapl.h"
#include "dapl_tavor_hw.h"
#include "dapl_tavor_wr.h"
#include "dapl_tavor_ibtf_impl.h"

/*
 * Function signatures
 */
extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
    uint_t, uint_t, dapls_tavor_wrid_entry_t *);
extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
extern DAPL_OS_LOCK g_tavor_uar_lock;

#ifndef _LP64
extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
#endif

static int dapli_tavor_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_send_linknext(ibt_send_wr_t *, uint64_t *,
    boolean_t, uint32_t, uint_t, uint64_t *, tavor_sw_wqe_dbinfo_t *);
static DAT_RETURN dapli_tavor_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
    uint64_t *, uint_t *);
static void dapli_tavor_wqe_recv_linknext(uint64_t *, boolean_t, uint32_t,
    uint_t, uint64_t *);
static int dapli_tavor_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);
static int dapli_tavor_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
    ibt_wc_t *);

/* exported to other HCAs */
extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
    uint32_t, uint_t);
extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);
/*
 * Note: The 64-bit doorbells need to be written atomically.
 * In 32-bit libraries we must use a special assembly routine,
 * because compiler-generated code would split the store into
 * two 32-bit word writes.
 */

#if defined(_LP64) || defined(__lint)
/* use a macro to ensure inlining on S10 amd64 compiler */
#define dapli_tavor_cq_doorbell(ia_uar, cq_cmd, cqn, cq_param) \
    ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64( \
        ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) | \
        ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param)
#else

/*
 * dapli_tavor_cq_doorbell()
 * Takes the specified cq cmd and cq number and rings the cq doorbell
 */
static void
dapli_tavor_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cq_param)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = ((uint64_t)cq_cmd << TAVOR_CQDB_CMD_SHIFT) |
        ((uint64_t)cqn << TAVOR_CQDB_CQN_SHIFT) | cq_param;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
    /* 32 bit version */
#elif defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32 bit intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->cq[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->cq[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->cq);
#endif
}

#endif	/* _LP64 */
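
/*
 * For illustration (not normative): the 64-bit CQ doorbell packs
 * (cq_cmd << TAVOR_CQDB_CMD_SHIFT) | (cqn << TAVOR_CQDB_CQN_SHIFT) |
 * cq_param, byte-swapped to big-endian before the store to the UAR.
 * For example, acknowledging eight consumed CQEs would look like
 *     dapli_tavor_cq_doorbell(uar, TAVOR_CQDB_INCR_CONSINDX, cqn, 7);
 * since the pollers below pass the number of entries consumed minus one.
 */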

#if defined(_LP64) || defined(__lint)
#define dapli_tavor_qp_send_doorbell(ia_uar, nda, nds, qpn, fence, nopcode) \
    ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64( \
        (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) << \
        TAVOR_QPSNDDB_NDA_SHIFT) | \
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) | \
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) | \
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds)
#else

/*
 * dapli_tavor_qp_send_doorbell()
 * Takes the specified next descriptor information, qp number, opcode and
 * rings the send doorbell
 */
static void
dapli_tavor_qp_send_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
        TAVOR_QPSNDDB_NDA_SHIFT) |
        ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
        ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
#else
#if defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32 bit intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->send[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->send[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->send);
#endif
#endif
}
#endif	/* _LP64 */

#if defined(_LP64) || defined(__lint)

#define dapli_tavor_qp_recv_doorbell(ia_uar, nda, nds, qpn, credits) \
    ((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64( \
        (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) << \
        TAVOR_QPRCVDB_NDA_SHIFT) | \
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) | \
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits)
#else

/*
 * dapli_tavor_qp_recv_doorbell()
 * Takes the specified next descriptor information, qp number and
 * rings the recv doorbell
 */
static void
dapli_tavor_qp_recv_doorbell(dapls_hw_uar_t ia_uar, uint32_t nda,
    uint32_t nds, uint32_t qpn, uint32_t credits)
{
    uint64_t doorbell;

    /* Build the doorbell from the parameters */
    doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
        TAVOR_QPRCVDB_NDA_SHIFT) |
        ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
        ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;

    /* Write the doorbell to UAR */
#ifdef _LP64
    ((tavor_hw_uar_t *)ia_uar)->recv = HTOBE_64(doorbell);
#else
#if defined(i386)
    dapl_os_lock(&g_tavor_uar_lock);
    /*
     * For 32 bit intel we assign the doorbell in the order
     * prescribed by the Tavor PRM, lower to upper addresses
     */
    ((tavor_hw_uar32_t *)ia_uar)->recv[0] =
        (uint32_t)HTOBE_32(doorbell >> 32);
    ((tavor_hw_uar32_t *)ia_uar)->recv[1] =
        (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
    dapl_os_unlock(&g_tavor_uar_lock);
#else
    dapls_atomic_assign_64(HTOBE_64(doorbell),
        &((tavor_hw_uar_t *)ia_uar)->recv);
#endif
#endif
}
#endif	/* _LP64 */

/*
 * dapls_tavor_max_inline()
 * Return the max inline value that should be used.
 * The env variable DAPL_MAX_INLINE can override the default.
 * If it is not set (or set to -1), the default behavior is used.
 * If it is zero or negative (other than -1), inlining is disabled.
 */
int
dapls_tavor_max_inline(void)
{
    static int max_inline_env = -2;

    /* Check the env exactly once, otherwise return previous value. */
    if (max_inline_env != -2)
        return (max_inline_env);

    max_inline_env = dapl_os_get_env_val("DAPL_MAX_INLINE", -1);
    if (max_inline_env != -1)
        if (max_inline_env <= 0)
            max_inline_env = 0;	/* no inlining */
    return (max_inline_env);
}
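
/*
 * Example (assumed shell usage, illustrative only): DAPL_MAX_INLINE=0
 * disables inlining entirely, DAPL_MAX_INLINE=128 caps inline data at
 * 128 bytes, and leaving the variable unset (or setting it to -1) keeps
 * the default heuristics in dapls_ib_max_request_iov() below.
 */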

/*
 * dapls_ib_max_request_iov(), aka, max send sgl size.
 * The send queue's scatter/gather list is used for "inline" data.
 *
 * By default, compute reasonable send queue size based on #iovs, #wqes,
 * max_iovs, and max inline byte count.  If the #wqes is large, then we
 * limit how much the SGL (space for inline data) can take.  The heuristic
 * is to increase the memory for the send queue to a maximum of 32KB:
 *
 *	< 128 wqes	increase to at most 256 minus header
 *	< 256 wqes	increase to at most 128 minus header
 *	>= 256 wqes	use SGL unaltered
 *
 * If the env is supplied (max_inline >= 0), use it without checking.
 */
int
dapls_ib_max_request_iov(int iovs, int wqes, int max_iovs,
    int max_inline_bytes)
{
    int ret_iovs;

    if (max_inline_bytes > 0) {
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else if (wqes < 128) {
        max_inline_bytes = 256 - TAVOR_INLINE_HEADER_SIZE_MAX;
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else if (wqes < 256) {
        max_inline_bytes = 128 - TAVOR_INLINE_HEADER_SIZE_MAX;
        ret_iovs = max_inline_bytes / sizeof (tavor_hw_wqe_sgl_t);
    } else {
        ret_iovs = iovs;
    }

    if (ret_iovs > max_iovs)	/* do not exceed max */
        ret_iovs = max_iovs;
    if (iovs > ret_iovs)	/* never decrease iovs */
        ret_iovs = iovs;
    return (ret_iovs);
}
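
/*
 * Worked example (values illustrative only): with 64 WQEs and a 16-byte
 * tavor_hw_wqe_sgl_t, the heuristic above allows
 * (256 - TAVOR_INLINE_HEADER_SIZE_MAX) / 16 SGL slots' worth of inline
 * space, then clamps the result to max_iovs and never returns fewer
 * than the requested iovs.
 */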

/*
 * dapli_tavor_wqe_send_build()
 * Constructs a WQE for a given ibt_send_wr_t
 */
static int
dapli_tavor_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
    tavor_hw_snd_wqe_remaddr_t *rc;
    tavor_hw_snd_wqe_bind_t *bn;
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t *sgl;
    uint32_t nds;
    uint32_t len, total_len;
    uint32_t tavor_num_mpt_mask;
    uint32_t new_rkey;
    uint32_t old_rkey;
    int i, num_ds;
    int max_inline_bytes = -1;

    nds = wr->wr_nds;
    sgl = wr->wr_sgl;
    num_ds = 0;

    /*
     * RC is the only supported transport in UDAPL.
     * For RC requests, we allow "Send", "RDMA Read", "RDMA Write"
     */
    switch (wr->wr_opcode) {
    case IBT_WRC_SEND:
        /*
         * If this is a Send request, then all we need is
         * the Data Segment processing below.
         * Initialize the information for the Data Segments
         */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));
        if (qp->qp_sq_inline != 0)
            max_inline_bytes =
                qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
        break;
    case IBT_WRC_RDMAW:
        if (qp->qp_sq_inline != 0)
            max_inline_bytes =
                qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
        /* FALLTHROUGH */
    case IBT_WRC_RDMAR:
        if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
            qp->qp_sq_inline = 0;
        /*
         * If this is an RDMA Read or RDMA Write request, then fill
         * in the "Remote Address" header fields.
         */
        rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));

        /*
         * Build the Remote Address Segment for the WQE, using
         * the information from the RC work request.
         */
        TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);

        /* Update "ds" for filling in Data Segments (below) */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
            sizeof (tavor_hw_snd_wqe_remaddr_t));
        break;
    case IBT_WRC_BIND:
        /*
         * Generate a new R_key.
         * Increment the upper "unconstrained" bits, keeping the lower
         * "constrained" bits (which represent the MPT index) the same.
         */
        old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
        tavor_num_mpt_mask = (uint32_t)(1 << qp->qp_num_mpt_shift) - 1;
        new_rkey = (old_rkey >> qp->qp_num_mpt_shift);
        new_rkey++;
        new_rkey = ((new_rkey << qp->qp_num_mpt_shift) |
            (old_rkey & tavor_num_mpt_mask));
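
        /*
         * Worked example (assuming qp_num_mpt_shift == 8 purely for
         * illustration): old_rkey 0x01020304 splits into key 0x010203
         * and MPT index 0x04; bumping the key yields new_rkey 0x01020404.
         */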

        wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;

        bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
            sizeof (tavor_hw_snd_wqe_nextctrl_t));

        /*
         * Build the Bind Memory Window Segments for the WQE,
         * using the information from the RC Bind memory
         * window work request.
         */
        TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);

        /*
         * Update the "ds" pointer. Even though the "bind"
         * operation requires no SGLs, this is necessary to
         * facilitate the correct descriptor size calculations
         * (below).
         */
        ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
            sizeof (tavor_hw_snd_wqe_bind_t));
        break;
    default:
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "dapli_tavor_wqe_send_build: invalid wr_opcode=%d\n",
            wr->wr_opcode);
        return (DAT_INTERNAL_ERROR);
    }

    /*
     * Now fill in the Data Segments (SGL) for the Send WQE based on
     * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
     * Start by checking for a valid number of SGL entries.
     */
    if (nds > qp->qp_sq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Send Work Request, fill in the Send WQE's data
     * segments. Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
     * the encoding for zero means a 2GB transfer. Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */

    if (max_inline_bytes != -1) {	/* compute total_len */
        total_len = 0;
        for (i = 0; i < nds; i++)
            total_len += sgl[i].ds_len;
        if (total_len > max_inline_bytes)
            max_inline_bytes = -1;	/* too big, do not "inline" */
    }
    if (max_inline_bytes != -1) {	/* do "inline" */
        uint8_t *dst = (uint8_t *)((uint32_t *)ds + 1);
        *(uint32_t *)ds =
            HTOBE_32(total_len | TAVOR_WQE_SGL_INLINE_MASK);
        for (i = 0; i < nds; i++) {
            if ((len = sgl[i].ds_len) == 0) {
                continue;
            }
            (void) dapl_os_memcpy(dst,
                (void *)(uintptr_t)sgl[i].ds_va, len);
            dst += len;
        }
        /* Return the size of descriptor (in 16-byte chunks) */
        *size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
    } else {
        for (i = 0; i < nds; i++) {
            if (sgl[i].ds_len == 0) {
                continue;
            }

            /*
             * Fill in the Data Segment(s) for the current WQE,
             * using the information contained in the
             * scatter-gather list of the work request.
             */
            TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
            num_ds++;
        }

        /* Return the size of descriptor (in 16-byte chunks) */
        *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
    }
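
    /*
     * Note on the arithmetic above: descriptor sizes are reported to the
     * hardware in 16-byte units, hence the ">> 4"; the inline path adds
     * 15 first so that a partially filled final chunk is still counted.
     */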

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_send_linknext()
 * Takes a WQE and links it to the prev WQE chain
 */
static void
dapli_tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, uint64_t *curr_addr,
    boolean_t ns, uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
    tavor_sw_wqe_dbinfo_t *dbinfo)
{
    uint64_t next, ctrl;
    uint32_t nopcode, fence;

    next = 0;
    ctrl = 0;

    /* Set the "c" (i.e. "signaled") bit appropriately */
    if (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
        ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
    }

    /* Set the "s" (i.e. "solicited") bit appropriately */
    if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
        ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
    }
    /* Set the "e" (i.e. "event") bit if notification is needed */
    if (!ns) {
        ctrl = ctrl | TAVOR_WQE_RCV_EVENT_MASK;
    }

    /*
     * The "i" bit is unused since uDAPL doesn't support
     * immediate data.
     */

    /* initialize the ctrl and next fields of the current descriptor */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    /*
     * Calculate the "next" field of the prev descriptor. This amounts
     * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
     * fields (see tavor_hw.h for more).
     */

    /*
     * Determine the value for the Tavor WQE "nopcode" field
     * by using the IBTF opcode from the work request
     */
    switch (curr_wr->wr_opcode) {
    case IBT_WRC_RDMAW:
        nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
        break;

    case IBT_WRC_SEND:
        nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
        break;

    case IBT_WRC_RDMAR:
        nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
        break;

    case IBT_WRC_BIND:
        nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
        break;
    default:
        /* Unsupported opcodes in UDAPL */
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "dapli_tavor_wqe_send_linknext: invalid wr_opcode=%d\n",
            curr_wr->wr_opcode);
        return;
    }

    next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
    next = next | ((uint64_t)nopcode << 32);
    fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
    if (fence) {
        next = next | TAVOR_WQE_SEND_FENCE_MASK;
    }
    next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
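
    /*
     * Rough layout of the "next" word built above (see tavor_hw.h for the
     * authoritative masks): the upper 32 bits carry the next descriptor's
     * address (NDA) and its nopcode; the lower bits carry the fence/dbd
     * flags and the next descriptor's size in 16-byte chunks (NDS).
     */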

    /*
     * A send queue doorbell will be rung for the next
     * WQE on the chain, set the current WQE's "dbd" bit.
     * Note: We also update the "dbinfo" structure here to pass
     * back information about what should (later) be included
     * in the send queue doorbell.
     */
    next = next | TAVOR_WQE_DBD_MASK;
    dbinfo->db_nopcode = nopcode;
    dbinfo->db_fence = fence;

    /*
     * Send queue doorbell will be rung for the next WQE on
     * the chain, update the prev WQE's "next" field and return.
     */
    if (prev_addr != NULL) {
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}


/*
 * dapli_tavor_wqe_recv_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
{
    tavor_hw_wqe_sgl_t *ds;
    int i;
    int num_ds;

    /* Fill in the Data Segments (SGL) for the Recv WQE */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
        sizeof (tavor_hw_rcv_wqe_nextctrl_t));
    num_ds = 0;

    /* Check for valid number of SGL entries */
    if (wr->wr_nds > qp->qp_rq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Recv Work Request, fill in the Recv WQE's data
     * segments. Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
     * the encoding for zero means a 2GB transfer. Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < wr->wr_nds; i++) {
        if (wr->wr_sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the receive WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
        num_ds++;
    }

    /* Return the size of descriptor (in 16-byte chunks) */
    *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;

    return (DAT_SUCCESS);
}


/*
 * dapli_tavor_wqe_recv_linknext()
 * Links a recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_recv_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr)
{
    uint64_t next;
    uint64_t ctrl = 0;

    /*
     * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
     * at a time).  If there is no next descriptor (i.e. if the current
     * descriptor is the last WQE on the chain), then set the "next"
     * field to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
     * requires the "dbd" bit to be set to one for all Recv WQEs.
     * In either case, we must add a single bit in the "reserved" field
     * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
     * workaround for a known Tavor errata that can cause Recv WQEs with
     * zero in the NDA field to behave improperly.
     *
     * If notification suppression is not desired then we set
     * the "E" bit in the ctrl field.
     */

    next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
    if (!ns) {	/* notification needed - so set the "E" bit */
        ctrl = TAVOR_WQE_RCV_EVENT_MASK;
    }

    /* update the WQE */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    if (prev_addr != NULL) {
        /*
         * Calculate the "next" field of the descriptor. This amounts
         * to setting up the "next_wqe_addr", "dbd", and "nds" fields
         * (see tavor_hw.h for more).
         */
        next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
        next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
            TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

        /*
         * Since this WQE is being linked to the previous descriptor,
         * we only update the previous WQE's "next" fields; this WQE's
         * own "ctrl" fields must be left untouched.
         */
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}

/*
 * dapli_tavor_wqe_srq_build()
 * Builds the recv WQE for a given ibt_recv_wr_t
 */
static DAT_RETURN
dapli_tavor_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
{
    tavor_hw_wqe_sgl_t *ds;
    ibt_wr_ds_t end_sgl;
    int i;
    int num_ds;

    /* Fill in the Data Segments (SGL) for the Recv WQE */
    ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
        sizeof (tavor_hw_rcv_wqe_nextctrl_t));
    num_ds = 0;

    /* Check for valid number of SGL entries */
    if (wr->wr_nds > srq->srq_wq_sgl) {
        return (DAT_INVALID_PARAMETER);
    }

    /*
     * For each SGL in the Recv Work Request, fill in the Recv WQE's data
     * segments. Note: We skip any SGL with zero size because Tavor
     * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
     * the encoding for zero means a 2GB transfer. Because of this special
     * encoding in the hardware, we mask the requested length with
     * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
     * zero.)
     */
    for (i = 0; i < wr->wr_nds; i++) {
        if (wr->wr_sgl[i].ds_len == 0) {
            continue;
        }

        /*
         * Fill in the Data Segment(s) for the receive WQE, using the
         * information contained in the scatter-gather list of the
         * work request.
         */
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
        num_ds++;
    }

    /*
     * For SRQ, if the number of data segments is less than the maximum
     * specified at alloc, then we have to fill in a special "key" entry in
     * the sgl entry after the last valid one in this post request. We do
     * that here.
     */
    if (num_ds < srq->srq_wq_sgl) {
        end_sgl.ds_va = (ib_vaddr_t)0;
        end_sgl.ds_len = (ib_msglen_t)0;
        end_sgl.ds_key = (ibt_lkey_t)1;
        TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
    }
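
    /*
     * (The zero VA/length with an L_Key of 1 above is presumably a key
     * the HCA treats as invalid, so SGL processing stops at this
     * sentinel.  A zero byte count alone could not mark the end, since
     * the hardware reads zero as a 2GB transfer -- see the note above.)
     */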

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wqe_srq_linknext()
 * Links a srq recv WQE to the prev chain
 */
static void
dapli_tavor_wqe_srq_linknext(uint64_t *curr_addr, boolean_t ns,
    uint32_t curr_desc, uint64_t *prev_addr)
{
    uint64_t next;
    uint64_t ctrl = 0;

    /*
     * Note: curr_addr is the last WQE (in uDAPL we manipulate one WQE
     * at a time).  If there is no next descriptor (i.e. if the current
     * descriptor is the last WQE on the chain), then set the "next"
     * field to TAVOR_WQE_DBD_MASK.  This is because the Tavor hardware
     * requires the "dbd" bit to be set to one for all Recv WQEs.
     * In either case, we must add a single bit in the "reserved" field
     * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
     * workaround for a known Tavor errata that can cause Recv WQEs with
     * zero in the NDA field to behave improperly.
     *
     * If notification suppression is not desired then we set
     * the "E" bit in the ctrl field.
     */

    next = TAVOR_RCV_WQE_NDA0_WA_MASK;
    if (!ns) {	/* notification needed - so set the "E" bit */
        ctrl = TAVOR_WQE_RCV_EVENT_MASK;
    }

    /* update the WQE */
    TAVOR_WQE_LINKNEXT(curr_addr, ctrl, next);

    if (prev_addr != NULL) {
        /*
         * Calculate the "next" field of the descriptor. This amounts
         * to setting up the "next_wqe_addr", "dbd", and "nds" fields
         * (see tavor_hw.h for more).
         */
        next = ((uint64_t)curr_desc & TAVOR_WQE_NDA_MASK) << 32;
        next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;

        /*
         * Since this WQE is being linked to the previous descriptor,
         * we only update the previous WQE's "next" fields; this WQE's
         * own "ctrl" fields must be left untouched.
         */
        TAVOR_WQE_LINKFIRST(prev_addr, next);
    }
}

/*
 * dapli_tavor_cq_peek()
 * Peeks into a given CQ to check if there are any events that can be
 * polled.  It stores the number of pollable CQEs through "num_cqe".
 */
static void
dapli_tavor_cq_peek(ib_cq_handle_t cq, int *num_cqe)
{
    tavor_hw_cqe_t *cqe;
    uint32_t imm_eth_pkey_cred;
    uint32_t cons_indx;
    uint32_t wrap_around_mask;
    uint32_t polled_cnt;
    uint_t doorbell_cnt;
    uint_t opcode;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask. Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Count entries in the CQ until we find an entry owned by
     * the hardware.
     */
    polled_cnt = 0;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        opcode = TAVOR_CQE_OPCODE_GET(cqe);
        /* Error CQEs map to multiple work completions */
        if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
            (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
            imm_eth_pkey_cred =
                TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
            doorbell_cnt =
                imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
            polled_cnt += (doorbell_cnt + 1);
        } else {
            polled_cnt++;
        }
        /* Increment the consumer index */
        cons_indx = (cons_indx + 1) & wrap_around_mask;

        /* Update the pointer to the next CQ entry */
        cqe = &cq->cq_addr[cons_indx];
    }

    *num_cqe = polled_cnt;
}

/*
 * dapli_tavor_cq_poll()
 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
 * array that is passed in.
 */
static DAT_RETURN
dapli_tavor_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
{
    tavor_hw_cqe_t *cqe;
    uint32_t cons_indx;
    uint32_t wrap_around_mask;
    uint32_t polled_cnt;
    uint32_t num_to_increment;
    DAT_RETURN dat_status;
    int status;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask. Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Keep pulling entries from the CQ until we find an entry owned by
     * the hardware.  As long as the CQEs are owned by software, process
     * each entry by calling dapli_tavor_cq_cqe_consume() and updating the
     * CQ consumer index.  Note: We only update the consumer index if
     * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
     * Otherwise, it indicates that we are going to "recycle" the CQE
     * (probably because it is an error CQE and corresponds to more than
     * one completion).
     */
    polled_cnt = 0;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        status = dapli_tavor_cq_cqe_consume(cq, cqe,
            &wc_p[polled_cnt++]);
        if (status == TAVOR_CQ_SYNC_AND_DB) {
            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);

            /* Increment the consumer index */
            cons_indx = (cons_indx + 1) & wrap_around_mask;

            /* Update the pointer to the next CQ entry */
            cqe = &cq->cq_addr[cons_indx];
        }

        /*
         * If we have run out of space to store work completions,
         * then stop and return the ones we have pulled off the CQ.
         */
        if (polled_cnt >= num_wc) {
            break;
        }
    }

    dat_status = DAT_SUCCESS;
    /*
     * Now we only ring the doorbell (to update the consumer index) if
     * we've actually consumed a CQ entry. If we have, for example,
     * pulled from a CQE that we are still in the process of "recycling"
     * for error purposes, then we would not update the consumer index.
     */
    if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
        /*
         * Post doorbell to update the consumer index. Doorbell
         * value indicates number of entries consumed (minus 1)
         */
        if (cons_indx > cq->cq_consindx) {
            num_to_increment = (cons_indx - cq->cq_consindx) - 1;
        } else {
            num_to_increment = ((cons_indx + cq->cq_size) -
                cq->cq_consindx) - 1;
        }
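
        /*
         * Worked example of the wraparound arm: with cq_size 256,
         * cq_consindx 250 and cons_indx 4, ten entries were consumed,
         * so the doorbell value is ((4 + 256) - 250) - 1 = 9.
         */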
        cq->cq_consindx = cons_indx;
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
            cq->cq_num, num_to_increment);
    } else if (polled_cnt == 0) {
        /*
         * If the CQ is empty, we can try to free up some of the WRID
         * list containers.
         */
        if (cq->cq_wrid_reap_head)	/* look before leaping */
            dapls_tavor_wrid_cq_reap(cq);
        dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
    }

    if (num_polled != NULL) {
        *num_polled = polled_cnt;
    }

    return (dat_status);
}

/*
 * dapli_tavor_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
 * that is passed in.  See above for more comments/details.
 */
static DAT_RETURN
dapli_tavor_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
{
    tavor_hw_cqe_t *cqe;
    uint32_t cons_indx;
    DAT_RETURN dat_status;
    int status;

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Keep pulling entries from the CQ until we find an entry owned by
     * the hardware.  As long as the CQEs are owned by software, process
     * each entry by calling dapli_tavor_cq_cqe_consume() and updating the
     * CQ consumer index.  Note: We only update the consumer index if
     * dapli_tavor_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
     * Otherwise, it indicates that we are going to "recycle" the CQE
     * (probably because it is an error CQE and corresponds to more than
     * one completion).
     */
    if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        status = dapli_tavor_cq_cqe_consume(cq, cqe, wc_p);
        if (status == TAVOR_CQ_SYNC_AND_DB) {
            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);

            /* Increment the consumer index */
            cq->cq_consindx =
                (cons_indx + 1) & (cq->cq_size - 1);
            dapli_tavor_cq_doorbell(cq->cq_iauar,
                TAVOR_CQDB_INCR_CONSINDX,
                cq->cq_num, 0);
        }
        dat_status = DAT_SUCCESS;
    } else {
        if (cq->cq_wrid_reap_head)	/* look before leaping */
            dapls_tavor_wrid_cq_reap(cq);
        dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
    }
    return (dat_status);
}

/*
 * dapli_tavor_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
 */
static int
dapli_tavor_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
    uint_t flags;
    uint_t type;
    uint_t opcode;
    int status;

    /*
     * Determine if this is an "error" CQE by examining "opcode". If it
     * is an error CQE, then call dapli_tavor_cq_errcqe_consume() and
     * return whatever status it returns. Otherwise, this is a successful
     * completion.
     */
    opcode = TAVOR_CQE_OPCODE_GET(cqe);
    if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
        (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
        status = dapli_tavor_cq_errcqe_consume(cqhdl, cqe, wc);
        return (status);
    }

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
        TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
    wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

    /*
     * Parse the CQE opcode to determine completion type. This will set
     * not only the type of the completion, but also any flags that might
     * be associated with it (e.g. whether immediate data is present).
     */
    flags = IBT_WC_NO_FLAGS;
    if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {

        /*
         * Send CQE
         *
         * The following opcodes will not be generated in uDAPL
         * case TAVOR_CQE_SND_RDMAWR_IMM:
         * case TAVOR_CQE_SND_SEND_IMM:
         * case TAVOR_CQE_SND_ATOMIC_CS:
         * case TAVOR_CQE_SND_ATOMIC_FA:
         */
        switch (opcode) {
        case TAVOR_CQE_SND_RDMAWR:
            type = IBT_WRC_RDMAW;
            break;

        case TAVOR_CQE_SND_SEND:
            type = IBT_WRC_SEND;
            break;

        case TAVOR_CQE_SND_RDMARD:
            type = IBT_WRC_RDMAR;
            wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
            break;

        case TAVOR_CQE_SND_BIND_MW:
            type = IBT_WRC_BIND;
            break;

        default:
            wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    } else {

        /*
         * Receive CQE
         *
         * The following opcodes will not be generated in uDAPL
         *
         * case TAVOR_CQE_RCV_RECV_IMM:
         * case TAVOR_CQE_RCV_RECV_IMM2:
         * case TAVOR_CQE_RCV_RDMAWR_IMM:
         * case TAVOR_CQE_RCV_RDMAWR_IMM2:
         */
        switch (opcode & 0x1F) {
        case TAVOR_CQE_RCV_RECV:
        /* FALLTHROUGH */
        case TAVOR_CQE_RCV_RECV2:
            type = IBT_WRC_RECV;
            wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
            break;
        default:
            wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
            return (TAVOR_CQ_SYNC_AND_DB);
        }
    }
    wc->wc_type = type;
    wc->wc_flags = flags;
    /* If we got here, completion status must be success */
    wc->wc_status = IBT_WC_SUCCESS;

    return (TAVOR_CQ_SYNC_AND_DB);
}


/*
 * dapli_tavor_cq_errcqe_consume()
 */
static int
dapli_tavor_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
{
    dapls_tavor_wrid_entry_t wre;
    uint32_t next_wqeaddr;
    uint32_t imm_eth_pkey_cred;
    uint_t nextwqesize, dbd;
    uint_t doorbell_cnt, status;
    uint_t opcode = TAVOR_CQE_OPCODE_GET(cqe);

    dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
        TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
        TAVOR_CQE_WQEADDRSZ_GET(cqe));

    /*
     * Fetch the Work Request ID using the information in the CQE.
     * See tavor_wr.c for more details.
     */
    wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
        (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
        TAVOR_COMPLETION_RECV, 1, &wre);
    wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);

    /*
     * Parse the CQE opcode to determine completion type. We know that
     * the CQE is an error completion, so we extract only the completion
     * status here.
     */
    imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
    status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
    switch (status) {
    case TAVOR_CQE_LOC_LEN_ERR:
        status = IBT_WC_LOCAL_LEN_ERR;
        break;

    case TAVOR_CQE_LOC_OP_ERR:
        status = IBT_WC_LOCAL_CHAN_OP_ERR;
        break;

    case TAVOR_CQE_LOC_PROT_ERR:
        status = IBT_WC_LOCAL_PROTECT_ERR;
        break;

    case TAVOR_CQE_WR_FLUSHED_ERR:
        status = IBT_WC_WR_FLUSHED_ERR;
        break;

    case TAVOR_CQE_MW_BIND_ERR:
        status = IBT_WC_MEM_WIN_BIND_ERR;
        break;

    case TAVOR_CQE_BAD_RESPONSE_ERR:
        status = IBT_WC_BAD_RESPONSE_ERR;
        break;

    case TAVOR_CQE_LOCAL_ACCESS_ERR:
        status = IBT_WC_LOCAL_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_INV_REQ_ERR:
        status = IBT_WC_REMOTE_INVALID_REQ_ERR;
        break;

    case TAVOR_CQE_REM_ACC_ERR:
        status = IBT_WC_REMOTE_ACCESS_ERR;
        break;

    case TAVOR_CQE_REM_OP_ERR:
        status = IBT_WC_REMOTE_OP_ERR;
        break;

    case TAVOR_CQE_TRANS_TO_ERR:
        status = IBT_WC_TRANS_TIMEOUT_ERR;
        break;

    case TAVOR_CQE_RNRNAK_TO_ERR:
        status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
        break;

    /*
     * The following error codes are not supported in the Tavor driver
     * as they relate only to Reliable Datagram completion statuses:
     * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
     * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
     * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
     * case TAVOR_CQE_INV_EEC_NUM_ERR:
     * case TAVOR_CQE_INV_EEC_STATE_ERR:
     * case TAVOR_CQE_LOC_EEC_ERR:
     */

    default:
        status = IBT_WC_LOCAL_CHAN_OP_ERR;
        break;
    }
    wc->wc_status = status;
    wc->wc_type = 0;
    /*
     * Now we do all the checking that's necessary to handle completion
     * queue entry "recycling".
     *
     * It is not necessary here to try to sync the WQE as we are only
     * attempting to read from the Work Queue (and hardware does not
     * write to it).
     */

    /*
     * We can get the doorbell info, and the WQE address and size for
     * the next WQE, from the "wre" (which was filled in above by the
     * call to dapls_tavor_wrid_get_entry())
     */
    dbd = (wre.wr_signaled_dbd & TAVOR_WRID_ENTRY_DOORBELLED) ? 1 : 0;
    next_wqeaddr = wre.wr_wqeaddrsz;
    nextwqesize = wre.wr_wqeaddrsz & TAVOR_WQE_NDS_MASK;

    /*
     * Get the doorbell count from the CQE. This indicates how many
     * completions this one CQE represents.
     */
    doorbell_cnt = imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
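
    /*
     * Illustration: a doorbelled error CQE with doorbell_cnt == 2 stands
     * in for three completions; it is recycled (with the count
     * decremented) on each PollCQ() pass and consumed on the final one.
     */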

    /*
     * Determine if we're ready to consume this CQE yet or not. If the
     * next WQE has size zero (i.e. no next WQE) or if the doorbell count
     * is down to zero, then this is the last/only completion represented
     * by the current CQE (return TAVOR_CQ_SYNC_AND_DB). Otherwise, the
     * current CQE needs to be recycled (see below).
     */
    if ((nextwqesize == 0) || ((doorbell_cnt == 0) && (dbd == 1))) {
        /*
         * Consume the CQE
         * Return status to indicate that doorbell and sync may be
         * necessary.
         */
        return (TAVOR_CQ_SYNC_AND_DB);

    } else {
        /*
         * Recycle the CQE for use in the next PollCQ() call.
         * Decrement the doorbell count, modify the error status,
         * and update the WQE address and size (to point to the
         * next WQE on the chain).  Put these updated entries back
         * into the CQE.
         * Despite the fact that we have updated the CQE, it is not
         * necessary for us to attempt to sync this entry just yet
         * as we have not changed the "hardware's view" of the
         * entry (i.e. we have not modified the "owner" bit - which
         * is all that the Tavor hardware really cares about).
         */
        doorbell_cnt = doorbell_cnt - dbd;
        TAVOR_CQE_IMM_ETH_PKEY_CRED_SET(cqe,
            ((TAVOR_CQE_WR_FLUSHED_ERR << TAVOR_CQE_ERR_STATUS_SHIFT) |
            (doorbell_cnt & TAVOR_CQE_ERR_DBDCNT_MASK)));
        TAVOR_CQE_WQEADDRSZ_SET(cqe,
            TAVOR_QP_WQEADDRSZ(next_wqeaddr, nextwqesize));
        dapl_dbg_log(DAPL_DBG_TYPE_EVD,
            "errcqe_consume: recycling cqe.eth=%x, wqe=%x\n",
            TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
            TAVOR_CQE_WQEADDRSZ_GET(cqe));
        return (TAVOR_CQ_RECYCLE_ENTRY);
    }
}

/*
 * dapli_tavor_cq_notify()
 * This function is used for arming the CQ by ringing the CQ doorbell.
 */
static DAT_RETURN
dapli_tavor_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
{
    uint32_t cqnum;

    /*
     * Determine if we are trying to get the next completion or the next
     * "solicited" completion. Then hit the appropriate doorbell.
     */
    cqnum = cq->cq_num;
    if (flags == IB_NOTIFY_ON_NEXT_COMP) {
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_CQ,
            cqnum, TAVOR_CQDB_DEFAULT_PARAM);

    } else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
        dapli_tavor_cq_doorbell(cq->cq_iauar,
            TAVOR_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
            TAVOR_CQDB_DEFAULT_PARAM);

    } else if (flags == IB_NOTIFY_ON_NEXT_NCOMP) {
        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_NOTIFY_NCQ,
            cqnum, param);
    } else {
        return (DAT_INVALID_PARAMETER);
    }

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_send()
 */
static DAT_RETURN
dapli_tavor_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
{
    tavor_sw_wqe_dbinfo_t dbinfo;
    dapls_tavor_wrid_list_hdr_t *wridlist;
    dapls_tavor_wrid_entry_t *wre_last;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint32_t desc_sz;
    uint32_t wqeaddrsz, signaled_dbd = 0;	/* 0 => unsignaled */
    uint32_t head, tail, next_tail, qsize_msk;
    int status;
    ib_qp_handle_t qp;

    if ((ep->qp_state == IBT_STATE_RESET) ||
        (ep->qp_state == IBT_STATE_INIT) ||
        (ep->qp_state == IBT_STATE_RTR)) {
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "post_send: invalid qp_state %d\n", ep->qp_state);
        return (DAT_INVALID_STATE);
    }

    qp = ep->qp_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
    wridlist = qp->qp_sq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
    tail = qp->qp_sq_wqhdr->wq_tail;
    head = qp->qp_sq_wqhdr->wq_head;

    /*
     * Check for "queue full" condition. If the queue is already full,
     * then no more WQEs can be posted, return an error
     */
    if (qp->qp_sq_wqhdr->wq_full != 0) {
        dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /*
     * Increment the "tail index" and check for "queue full" condition.
     * If we detect that the current work request is going to fill the
     * work queue, then we mark this condition and continue.
     */
    next_tail = (tail + 1) & qsize_msk;
    if (next_tail == head) {
        qp->qp_sq_wqhdr->wq_full = 1;
    }
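
    /*
     * (head == tail would otherwise be ambiguous between "empty" and
     * "full", so the explicit wq_full flag records the full case; see
     * the matching logic in dapli_tavor_wrid_add_entry() below.)
     */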

    /*
     * Get the user virtual address of the location where the next
     * Send WQE should be built
     */
    wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);

    /*
     * Call dapli_tavor_wqe_send_build() to build the WQE at the given
     * address. This routine uses the information in the ibt_send_wr_t
     * and returns the size of the WQE when it returns.
     */
    status = dapli_tavor_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
        return (status);
    }

    /*
     * Get the descriptor (io address) corresponding to the location
     * the Send WQE was built.
     */
    desc = TAVOR_QP_SQ_DESC(qp, tail);

    dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
        desc <= (qp->qp_sq_desc_addr +
        qp->qp_sq_numwqe * qp->qp_sq_wqesz));

    /*
     * Add a WRID entry to the WRID list. Need to calculate the
     * "wqeaddrsz" and "signaled_dbd" values to pass to
     * dapli_tavor_wrid_add_entry()
     */
    wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);

    if (wr->wr_flags & IBT_WR_SEND_SIGNAL) {
        signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
    }

    dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
        signaled_dbd);

    /*
     * Now link the wqe to the old chain (if there was one)
     */
    dapli_tavor_wqe_send_linknext(wr, wqe_addr, ns, desc, desc_sz,
        qp->qp_sq_lastwqeaddr, &dbinfo);

    /*
     * Now if the WRID tail entry is non-NULL, then this
     * represents the entry to which we are chaining the
     * new entries. Since we are going to ring the
     * doorbell for this WQE, we want to set its "dbd" bit.
     *
     * On the other hand, if the tail is NULL, even though
     * we will have rung the doorbell for the previous WQE
     * (for the hardware's sake) it is irrelevant to our
     * purposes (for tracking WRIDs) because we know the
     * request must have already completed.
     */
    wre_last = wridlist->wl_wre_old_tail;
    if (wre_last != NULL) {
        wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
    }

    /* Update some of the state in the QP */
    qp->qp_sq_lastwqeaddr = wqe_addr;
    qp->qp_sq_wqhdr->wq_tail = next_tail;

    /* Ring the doorbell */
    dapli_tavor_qp_send_doorbell(qp->qp_iauar, desc, desc_sz,
        qp->qp_num, dbinfo.db_fence, dbinfo.db_nopcode);

    dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_recv()
 */
static DAT_RETURN
dapli_tavor_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
{
    dapls_tavor_wrid_list_hdr_t *wridlist;
    dapls_tavor_wrid_entry_t *wre_last;
    ib_qp_handle_t qp;
    DAT_RETURN status;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint32_t desc_sz;
    uint32_t wqeaddrsz;
    uint32_t head, tail, next_tail, qsize_msk;

    if (ep->qp_state == IBT_STATE_RESET) {
        dapl_dbg_log(DAPL_DBG_TYPE_ERR,
            "post_recv: invalid qp_state %d\n", ep->qp_state);
        return (DAT_INVALID_STATE);
    }
    qp = ep->qp_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
    wridlist = qp->qp_rq_wqhdr->wq_wrid_post;

    /* Save away some initial QP state */
    qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
    tail = qp->qp_rq_wqhdr->wq_tail;
    head = qp->qp_rq_wqhdr->wq_head;

    /*
     * For the ibt_recv_wr_t passed in, parse the request and build a
     * Recv WQE.  Link the WQE with the previous WQE and ring the
     * doorbell.
     */

    /*
     * Check for "queue full" condition. If the queue is already full,
     * then no more WQEs can be posted. So return an error.
     */
    if (qp->qp_rq_wqhdr->wq_full != 0) {
        dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /*
     * Increment the "tail index" and check for "queue
     * full" condition. If we detect that the current
     * work request is going to fill the work queue, then
     * we mark this condition and continue.
     */
    next_tail = (tail + 1) & qsize_msk;
    if (next_tail == head) {
        qp->qp_rq_wqhdr->wq_full = 1;
    }

    /* Get the descriptor (IO Address) of the WQE to be built */
    desc = TAVOR_QP_RQ_DESC(qp, tail);
    /* The user virtual address of the WQE to be built */
    wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);

    /*
     * Call dapli_tavor_wqe_recv_build() to build the WQE at the given
     * address. This routine uses the information in the
     * ibt_recv_wr_t and returns the size of the WQE.
     */
    status = dapli_tavor_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
        return (DAT_INTERNAL_ERROR);
    }

    /*
     * Add a WRID entry to the WRID list. Need to calculate the
     * "wqeaddrsz" and "signaled_dbd" values to pass to
     * dapli_tavor_wrid_add_entry().
     * Note: all Recv WQEs are essentially "signaled"
     */
    wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, desc_sz);
    dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
        (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);

    /*
     * Now link the chain to the old chain (if there was one)
     * and ring the doorbell for the recv work queue.
     */
    dapli_tavor_wqe_recv_linknext(wqe_addr, ns, desc, desc_sz,
        qp->qp_rq_lastwqeaddr);

    /*
     * Now if the WRID tail entry is non-NULL, then this
     * represents the entry to which we are chaining the
     * new entries. Since we are going to ring the
     * doorbell for this WQE, we want to set its "dbd" bit.
     *
     * On the other hand, if the tail is NULL, even though
     * we will have rung the doorbell for the previous WQE
     * (for the hardware's sake) it is irrelevant to our
     * purposes (for tracking WRIDs) because we know the
     * request must have already completed.
     */
    wre_last = wridlist->wl_wre_old_tail;
    if (wre_last != NULL) {
        wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
    }

    /* Update some of the state in the QP */
    qp->qp_rq_lastwqeaddr = wqe_addr;
    qp->qp_rq_wqhdr->wq_tail = next_tail;

    /* Ring the doorbell */
    dapli_tavor_qp_recv_doorbell(qp->qp_iauar, desc, desc_sz,
        qp->qp_num, 1);

    dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_post_srq()
 */
static DAT_RETURN
dapli_tavor_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
{
    ib_srq_handle_t srq;
    DAT_RETURN status;
    uint32_t desc;
    uint64_t *wqe_addr;
    uint64_t *last_wqe_addr;
    uint32_t head, next_head, qsize_msk;
    uint32_t wqe_index;


    srq = srqp->srq_handle;

    /* Grab the lock for the WRID list */
    dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);

    /*
     * For the ibt_recv_wr_t passed in, parse the request and build a
     * Recv WQE.  Link the WQE with the previous WQE and ring the
     * doorbell.
     */

    /*
     * Check for "queue full" condition. If the queue is already full,
     * ie. there are no free entries, then no more WQEs can be posted.
     * So return an error.
     */
    if (srq->srq_wridlist->wl_freel_entries == 0) {
        dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
        return (DAT_INSUFFICIENT_RESOURCES);
    }

    /* Save away some initial SRQ state */
    qsize_msk = srq->srq_wridlist->wl_size - 1;
    head = srq->srq_wridlist->wl_freel_head;

    next_head = (head + 1) & qsize_msk;

    /* Get the descriptor (IO Address) of the WQE to be built */
    desc = srq->srq_wridlist->wl_free_list[head];

    wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
        srq->srq_wq_wqesz);

    /* The user virtual address of the WQE to be built */
    wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);

    /*
     * Call dapli_tavor_wqe_srq_build() to build the WQE at the given
     * address. This routine uses the information in the
     * ibt_recv_wr_t and returns the size of the WQE.
     */
    status = dapli_tavor_wqe_srq_build(srq, wr, wqe_addr);
    if (status != DAT_SUCCESS) {
        dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
        return (status);
    }

    /*
     * Add a WRID entry to the WRID list.
     */
    dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);

    if (srq->srq_wq_lastwqeindex == -1) {
        last_wqe_addr = NULL;
    } else {
        last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
            srq->srq_wq_lastwqeindex);
    }
    /*
     * Now link the chain to the old chain (if there was one)
     * and ring the doorbell for the SRQ.
     */
    dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);

    /* Update some of the state in the SRQ */
    srq->srq_wq_lastwqeindex = wqe_index;
    srq->srq_wridlist->wl_freel_head = next_head;
    srq->srq_wridlist->wl_freel_entries--;
    dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
        srq->srq_wridlist->wl_size);

    /* Ring the doorbell - for SRQ nds = 0 */
    dapli_tavor_qp_recv_doorbell(srq->srq_iauar, desc, 0,
        srq->srq_num, 1);

    dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);

    return (DAT_SUCCESS);
}

/*
 * dapli_tavor_wrid_add_entry()
 */
extern void
dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *wq, uint64_t wrid,
    uint32_t wqeaddrsz, uint_t signaled_dbd)
{
    dapls_tavor_wrid_entry_t *wre_tmp;
    uint32_t head, tail, size;

    /*
     * Find the entry in the container pointed to by the "tail" index.
     * Add all of the relevant information to that entry, including WRID,
     * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
     * and/or doorbelled.
     */
    head = wq->wq_wrid_post->wl_head;
    tail = wq->wq_wrid_post->wl_tail;
    size = wq->wq_wrid_post->wl_size;
    wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
    wre_tmp->wr_wrid = wrid;
    wre_tmp->wr_wqeaddrsz = wqeaddrsz;
    wre_tmp->wr_signaled_dbd = signaled_dbd;

    /*
     * Update the "wrid_old_tail" pointer to point to the entry we just
     * inserted into the queue. By tracking this pointer (the pointer to
     * the most recently inserted entry) it will be possible later in the
     * PostSend() and PostRecv() code paths to find the entry that needs
     * its "doorbelled" flag set (see the comments in
     * dapli_tavor_post_recv() and/or dapli_tavor_post_send()).
     */
    wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;

    /* Update the tail index */
    tail = ((tail + 1) & (size - 1));
    wq->wq_wrid_post->wl_tail = tail;

    /*
     * If the "tail" index has just wrapped over into the "head" index,
     * then we have filled the container. We use the "full" flag to
     * indicate this condition and to distinguish it from the "empty"
     * condition (where head and tail are also equal).
     */
    if (head == tail) {
        wq->wq_wrid_post->wl_full = 1;
    }
}

/*
 * dapli_tavor_wrid_add_entry_srq()
 */
extern void
dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t srq, uint64_t wrid,
    uint32_t wqe_index)
{
    dapls_tavor_wrid_entry_t *wre;

    /* ASSERT on impossible wqe_index values */
    dapl_os_assert(wqe_index < srq->srq_wq_numwqe);

    /*
     * Setup the WRE.
     *
     * Given the 'wqe_index' value, we store the WRID at this WRE offset.
     * And we set the WRE to be signaled_dbd so that on poll CQ we can find
     * this information and associate the WRID to the WQE found on the CQE.
     * Note: all Recv WQEs are essentially "signaled"
     */
    wre = &srq->srq_wridlist->wl_wre[wqe_index];
    wre->wr_wrid = wrid;
    wre->wr_signaled_dbd = (uint32_t)TAVOR_WRID_ENTRY_SIGNALED;
}

/*
 * dapli_tavor_cq_srq_entries_flush()
 */
static void
dapli_tavor_cq_srq_entries_flush(ib_qp_handle_t qp)
{
    ib_cq_handle_t cq;
    dapls_tavor_workq_hdr_t *wqhdr;
    tavor_hw_cqe_t *cqe;
    tavor_hw_cqe_t *next_cqe;
    uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
    uint32_t new_indx, check_indx, indx;
    uint32_t num_to_increment;
    int cqe_qpnum, cqe_type;
    int outstanding_cqes, removed_cqes;
    int i;

    /* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */

    cq = qp->qp_rq_cqhdl;
    wqhdr = qp->qp_rq_wqhdr;

    dapl_os_assert(wqhdr->wq_wrid_post != NULL);
    dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);

    /* Get the consumer index */
    cons_indx = cq->cq_consindx;

    /*
     * Calculate the wrap around mask. Note: This operation only works
     * because all Tavor completion queues have power-of-2 sizes
     */
    wrap_around_mask = (cq->cq_size - 1);

    /* Calculate the pointer to the first CQ entry */
    cqe = &cq->cq_addr[cons_indx];

    /*
     * Loop through the CQ looking for entries owned by software. If an
     * entry is owned by software then we increment an 'outstanding_cqes'
     * count to know how many entries total we have on our CQ. We use this
     * value further down to know how many entries to loop through looking
     * for our same QP number.
     */
    outstanding_cqes = 0;
    tail_cons_indx = cons_indx;
    while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
        /* increment total cqes count */
        outstanding_cqes++;

        /* increment the consumer index */
        tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;

        /* update the pointer to the next cq entry */
        cqe = &cq->cq_addr[tail_cons_indx];
    }

    /*
     * Using the 'tail_cons_indx' that was just set, we now know how many
     * total CQEs possible there are. Set the 'check_indx' and the
     * 'new_indx' to the last entry identified by 'tail_cons_indx'
     */
    check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
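
    /*
     * Sketch of the compaction pass below: walk backwards from the newest
     * software-owned CQE at 'check_indx'.  CQEs that belong to this QP's
     * SRQ receives are released back to the SRQ free list; every other
     * CQE slides down to 'new_indx' so that the survivors stay contiguous
     * ahead of the new consumer index.
     */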

    for (i = 0; i < outstanding_cqes; i++) {
        cqe = &cq->cq_addr[check_indx];

        /* Grab QP number from CQE */
        cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
        cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);

        /*
         * If the QP number is the same in the CQE as the QP that we
         * have on this SRQ, then we must free up the entry off the
         * SRQ. We also make sure that the completion type is of the
         * 'TAVOR_COMPLETION_RECV' type. So any send completions on
         * this CQ will be left as-is. The handling of returning
         * entries back to HW ownership happens further down.
         */
        if (cqe_qpnum == qp->qp_num &&
            cqe_type == TAVOR_COMPLETION_RECV) {
            /* Add back to SRQ free list */
            (void) dapli_tavor_wrid_find_match_srq(
                wqhdr->wq_wrid_post, cqe);
        } else {
            /* Do Copy */
            if (check_indx != new_indx) {
                next_cqe = &cq->cq_addr[new_indx];
                /*
                 * Copy the CQE into the "next_cqe"
                 * pointer.
                 */
                (void) dapl_os_memcpy(next_cqe, cqe,
                    sizeof (tavor_hw_cqe_t));
            }
            new_indx = (new_indx - 1) & wrap_around_mask;
        }
        /* Move index to next CQE to check */
        check_indx = (check_indx - 1) & wrap_around_mask;
    }

    /* Initialize removed cqes count */
    removed_cqes = 0;

    /* If an entry was removed */
    if (check_indx != new_indx) {

        /*
         * Set current pointer back to the beginning consumer index.
         * At this point, all unclaimed entries have been copied to the
         * index specified by 'new_indx'. This 'new_indx' will be used
         * as the new consumer index after we mark all freed entries as
         * having HW ownership. We do that here.
         */

        /* Loop through all entries until we reach our new pointer */
        for (indx = cons_indx; indx <= new_indx;
            indx = (indx + 1) & wrap_around_mask) {
            removed_cqes++;
            cqe = &cq->cq_addr[indx];

            /* Reset entry to hardware ownership */
            TAVOR_CQE_OWNER_SET_HW(cqe);
        }
    }

    /*
     * Update consumer index to be the 'new_indx'. This moves it past all
     * removed entries. Because 'new_indx' is pointing to the last
     * previously valid SW owned entry, we add 1 to point the cons_indx to
     * the first HW owned entry.
     */
    cons_indx = (new_indx + 1) & wrap_around_mask;

    /*
     * Now we only ring the doorbell (to update the consumer index) if
     * we've actually consumed a CQ entry. If we found no QP number
     * matches above, then we would not have removed anything. So only if
     * something was removed do we ring the doorbell.
     */
    if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
        /*
         * Post doorbell to update the consumer index. Doorbell
         * value indicates number of entries consumed (minus 1)
         */
        if (cons_indx > cq->cq_consindx) {
            num_to_increment = (cons_indx - cq->cq_consindx) - 1;
        } else {
            num_to_increment = ((cons_indx + cq->cq_size) -
                cq->cq_consindx) - 1;
        }
        cq->cq_consindx = cons_indx;

        dapli_tavor_cq_doorbell(cq->cq_iauar, TAVOR_CQDB_INCR_CONSINDX,
            cq->cq_num, num_to_increment);
    }
}

/* ARGSUSED */
static void
dapli_tavor_qp_init(ib_qp_handle_t qp)
{
}

/* ARGSUSED */
static void
dapli_tavor_cq_init(ib_cq_handle_t cq)
{
}

/* ARGSUSED */
static void
dapli_tavor_srq_init(ib_srq_handle_t srq)
{
}

void
dapls_init_funcs_tavor(DAPL_HCA *hca_ptr)
{
    hca_ptr->post_send = dapli_tavor_post_send;
    hca_ptr->post_recv = dapli_tavor_post_recv;
    hca_ptr->post_srq = dapli_tavor_post_srq;
    hca_ptr->cq_peek = dapli_tavor_cq_peek;
    hca_ptr->cq_poll = dapli_tavor_cq_poll;
    hca_ptr->cq_poll_one = dapli_tavor_cq_poll_one;
    hca_ptr->cq_notify = dapli_tavor_cq_notify;
    hca_ptr->srq_flush = dapli_tavor_cq_srq_entries_flush;
    hca_ptr->qp_init = dapli_tavor_qp_init;
    hca_ptr->cq_init = dapli_tavor_cq_init;
    hca_ptr->srq_init = dapli_tavor_srq_init;
    hca_ptr->hermon_resize_cq = 0;
}