1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 #include "dapl.h"
28 #include "dapl_tavor_hw.h"
29 #include "dapl_tavor_wr.h"
30 #include "dapl_tavor_ibtf_impl.h"
31
32 #define bt_debug 0
33
34 enum arbel_db_type_e {
35 ARBEL_DBR_CQ_SET_CI = 0x1 << 5,
36 ARBEL_DBR_CQ_ARM = 0x2 << 5,
37 ARBEL_DBR_SQ = 0x3 << 5,
38 ARBEL_DBR_RQ = 0x4 << 5,
39 ARBEL_DBR_SRQ = 0x5 << 5
40 };
41
42 #define ARBEL_WQE_SGL_INVALID_LKEY 0x00000100
43 #define ARBEL_WQE_SEND_SIGNALED_MASK 0x0000000800000000ull
44 #define ARBEL_WQE_SEND_SOLICIT_MASK 0x0000000200000000ull
45 #define ARBEL_WQE_CTRL_REQBIT_MASK 0x0000000100000000ull
46 #define ARBEL_WQE_NEXT_REQBIT_MASK 0x80
47 #define ARBEL_WQE_SETCTRL(qp, desc, ctrl) \
48 ((uint64_t *)(desc))[1] = HTOBE_64(ctrl)
49 #define ARBEL_WQE_SETNEXT(qp, desc, nda_op, ee_nds) \
50 { \
51 ((uint32_t *)(desc))[0] = HTOBE_32(nda_op); \
52 ((uint32_t *)(desc))[1] = HTOBE_32(ee_nds); \
53 }
54 #define ARBEL_WQE_SEND_FENCE_MASK 0x40
55 #define ARBEL_WQE_SEND_NOPCODE_RDMAW 0x8
56 #define ARBEL_WQE_SEND_NOPCODE_SEND 0xA
57 #define ARBEL_WQE_SEND_NOPCODE_RDMAR 0x10
58 #define ARBEL_WQE_SEND_NOPCODE_BIND 0x18
59 #define ARBEL_WQE_NDA_MASK 0x00000000FFFFFFC0ull
60 #define ARBEL_WQE_NDS_MASK 0x3F
61 #define ARBEL_QPSNDDB_WQE_CNT_SHIFT 0x38
62 #define ARBEL_QPSNDDB_WQE_COUNTER_SHIFT 0x28
63 #define ARBEL_QPSNDDB_F_SHIFT 0x25
64 #define ARBEL_QPSNDDB_NOPCODE_SHIFT 0x20
65 #define ARBEL_QPSNDDB_QPN_SHIFT 0x8
66 #define ARBEL_DBR_QP_WQE_COUNTER_SHIFT 0x20
67 #define ARBEL_DBR_QN_SHIFT 0x8
68
69 #define ARBEL_CQDB_NOTIFY_CQ_SOLICIT 0x1
70 #define ARBEL_CQDB_NOTIFY_CQ 0x2
71
72 /*
73 * Function signatures
74 */
75 extern uint64_t dapls_tavor_wrid_get_entry(ib_cq_handle_t, tavor_hw_cqe_t *,
76 uint_t, uint_t, dapls_tavor_wrid_entry_t *);
77 extern void dapls_tavor_wrid_cq_reap(ib_cq_handle_t);
78 extern DAPL_OS_LOCK g_tavor_uar_lock;
79
80 #ifndef _LP64
81 extern void dapls_atomic_assign_64(uint64_t, uint64_t *);
82 #endif
83
84 static int dapli_arbel_wqe_send_build(ib_qp_handle_t, ibt_send_wr_t *,
85 uint64_t *, uint_t *);
86 static DAT_RETURN dapli_arbel_wqe_recv_build(ib_qp_handle_t, ibt_recv_wr_t *,
87 uint64_t *, uint_t *);
88 static int dapli_arbel_cq_cqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
89 ibt_wc_t *);
90 static int dapli_arbel_cq_errcqe_consume(ib_cq_handle_t, tavor_hw_cqe_t *,
91 ibt_wc_t *);
92 extern void dapli_tavor_wrid_add_entry(dapls_tavor_workq_hdr_t *, uint64_t,
93 uint32_t, uint_t);
94 extern void dapli_tavor_wrid_add_entry_srq(ib_srq_handle_t, uint64_t, uint32_t);
95
96 /*
 * Note: The 64-bit doorbells need to be written atomically.
 * In 32-bit libraries we need to use the special assembly routine
 * because compiler-generated code would split the write into two
 * 32-bit word writes.
100 */
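/*
 * Illustrative note: if the two 32-bit halves were written without the
 * lock (or in the wrong order), the HCA could sample the UAR between
 * the stores and act on a doorbell that mixes a new high word with a
 * stale low word; this is why the i386 path below takes
 * g_tavor_uar_lock and writes lower-to-upper addresses.
 */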
101
102 /*
103 * dapli_arbel_cq_doorbell()
104 * Takes the specified cq cmd and cq number and rings the cq doorbell
105 */
106 static void
dapli_arbel_cq_doorbell(dapls_hw_uar_t ia_uar, uint32_t cq_cmd, uint32_t cqn,
    uint32_t cmd_sn, uint32_t cq_param)
109 {
110 uint64_t doorbell;
111
112 /* Build the doorbell from the parameters */
113 doorbell = (cmd_sn << 4) | cq_cmd;
114 doorbell = (doorbell << 24) | cqn;
115 doorbell = (doorbell << 32) | cq_param;
116
117 /* Write the doorbell to UAR */
118 #ifdef _LP64
119 ((tavor_hw_uar_t *)ia_uar)->cq = HTOBE_64(doorbell);
120 /* 32 bit version */
121 #elif defined(i386)
122 dapl_os_lock(&g_tavor_uar_lock);
123 /*
124 * For 32 bit intel we assign the doorbell in the order
125 * prescribed by the Tavor PRM, lower to upper addresses
126 */
127 ((tavor_hw_uar32_t *)ia_uar)->cq[0] =
128 (uint32_t)HTOBE_32(doorbell >> 32);
129 ((tavor_hw_uar32_t *)ia_uar)->cq[1] =
130 (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
131 dapl_os_unlock(&g_tavor_uar_lock);
132 #else
133 dapls_atomic_assign_64(HTOBE_64(doorbell),
134 &((tavor_hw_uar_t *)ia_uar)->cq);
135 #endif
136 }
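/*
 * Worked example of the layout produced above (derived from the shifts
 * in dapli_arbel_cq_doorbell() itself): cq_param occupies bits [31:0],
 * cqn bits [55:32], cq_cmd bits [59:56] and cmd_sn bits [61:60] of the
 * 64-bit doorbell value, which is then byte-swapped to big-endian for
 * the UAR write.
 */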
137
138 /*
 * dapli_arbel_sq_dbrec() / dapli_arbel_sq_dbreg()
 * dapli_arbel_sq_dbrec() updates the send queue doorbell record with the
 * next WQE counter value; dapli_arbel_sq_dbreg() builds the send doorbell
 * from the qp number, opcode, fence bit and descriptor size and rings the
 * send doorbell register.
142 */
143 static void
dapli_arbel_sq_dbrec(ib_qp_handle_t qp, uint16_t wqe_counter)
145 {
146 qp->qp_sq_dbp[0] = HTOBE_32((wqe_counter + 1) & 0xffff);
147 }
148
149 static void
dapli_arbel_sq_dbreg(dapls_hw_uar_t ia_uar, uint32_t qpn, uint32_t fence,
    uint32_t nopcode, uint16_t wqe_counter, uint32_t nds)
152 {
153 uint64_t doorbell;
154
155 doorbell = ((uint64_t)1 << ARBEL_QPSNDDB_WQE_CNT_SHIFT) |
156 ((uint64_t)wqe_counter << ARBEL_QPSNDDB_WQE_COUNTER_SHIFT) |
157 ((uint64_t)fence << ARBEL_QPSNDDB_F_SHIFT) |
158 ((uint64_t)nopcode << ARBEL_QPSNDDB_NOPCODE_SHIFT) |
159 (qpn << ARBEL_QPSNDDB_QPN_SHIFT) | nds;
160
161 /* Write the doorbell to UAR */
162 #ifdef _LP64
163 ((tavor_hw_uar_t *)ia_uar)->send = HTOBE_64(doorbell);
164 #else
165 #if defined(i386)
166 dapl_os_lock(&g_tavor_uar_lock);
167 /*
168 * For 32 bit intel we assign the doorbell in the order
169 * prescribed by the Tavor PRM, lower to upper addresses
170 */
171 ((tavor_hw_uar32_t *)ia_uar)->send[0] =
172 (uint32_t)HTOBE_32(doorbell >> 32);
173 ((tavor_hw_uar32_t *)ia_uar)->send[1] =
174 (uint32_t)HTOBE_32(doorbell & 0x00000000ffffffff);
175 dapl_os_unlock(&g_tavor_uar_lock);
176 #else
177 dapls_atomic_assign_64(HTOBE_64(doorbell),
178 &((tavor_hw_uar_t *)ia_uar)->send);
179 #endif
180 #endif
181 }
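/*
 * For reference, the shift constants used above are hexadecimal: the
 * single-WQE count is placed at bit 0x38 (56), wqe_counter at bit 0x28
 * (40), the fence bit at 0x25 (37), nopcode at bit 0x20 (32), qpn at
 * bit 8, and nds occupies the low-order bits.
 */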
182
183 /*
184 * dapli_arbel_wqe_send_build()
185 * Constructs a WQE for a given ibt_send_wr_t
186 */
187 static int
dapli_arbel_wqe_send_build(ib_qp_handle_t qp, ibt_send_wr_t *wr,
    uint64_t *addr, uint_t *size)
190 {
191 tavor_hw_snd_wqe_remaddr_t *rc;
192 tavor_hw_snd_wqe_bind_t *bn;
193 tavor_hw_wqe_sgl_t *ds;
194 ibt_wr_ds_t *sgl;
195 uint32_t nds;
196 uint32_t len, total_len;
197 uint32_t new_rkey;
198 uint32_t old_rkey;
199 int i, num_ds;
200 int max_inline_bytes = -1;
201 uint64_t ctrl;
202
203 nds = wr->wr_nds;
204 sgl = wr->wr_sgl;
205 num_ds = 0;
206 ctrl = ((wr->wr_flags & IBT_WR_SEND_SIGNAL) ?
207 ARBEL_WQE_SEND_SIGNALED_MASK : 0) |
208 ((wr->wr_flags & IBT_WR_SEND_SOLICIT) ?
209 ARBEL_WQE_SEND_SOLICIT_MASK : 0) |
210 ARBEL_WQE_CTRL_REQBIT_MASK;
211
212 /*
 * RC is the only transport supported by uDAPL.
 * For RC requests, we allow "Send", "RDMA Read", "RDMA Write" and "Bind".
215 */
216 switch (wr->wr_opcode) {
217 case IBT_WRC_SEND:
218 /*
219 * If this is a Send request, then all we need is
220 * the Data Segment processing below.
221 * Initialize the information for the Data Segments
222 */
223 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
224 sizeof (tavor_hw_snd_wqe_nextctrl_t));
225 if (qp->qp_sq_inline != 0)
226 max_inline_bytes =
227 qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_SEND;
228 break;
229 case IBT_WRC_RDMAW:
230 if (qp->qp_sq_inline != 0)
231 max_inline_bytes =
232 qp->qp_sq_wqesz - TAVOR_INLINE_HEADER_SIZE_RDMAW;
233 /* FALLTHROUGH */
234 case IBT_WRC_RDMAR:
235 if (qp->qp_sq_inline < 0 && wr->wr_opcode == IBT_WRC_RDMAR)
236 qp->qp_sq_inline = 0;
237 /*
238 * If this is an RDMA Read or RDMA Write request, then fill
239 * in the "Remote Address" header fields.
240 */
241 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)addr +
242 sizeof (tavor_hw_snd_wqe_nextctrl_t));
243
244 /*
245 * Build the Remote Address Segment for the WQE, using
246 * the information from the RC work request.
247 */
248 TAVOR_WQE_BUILD_REMADDR(rc, &wr->wr.rc.rcwr.rdma);
249
250 /* Update "ds" for filling in Data Segments (below) */
251 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
252 sizeof (tavor_hw_snd_wqe_remaddr_t));
253 break;
254 case IBT_WRC_BIND:
255 /*
 * Generate a new R_key: increment the "unconstrained" key bits
 * while keeping the "constrained" bits, which represent the MPT
 * index, the same.
260 */
261 #if 0
262 /* XXX - need equiv of "arbel_wr_bind_check(state, wr);" */
263 /* XXX - uses arbel_mr_keycalc - what about Sinai vs. Arbel??? */
264 #endif
265 old_rkey = wr->wr.rc.rcwr.bind->bind_rkey;
266 new_rkey = old_rkey >> 8; /* index */
267 old_rkey = ((old_rkey & 0xff) + 1) & 0xff; /* incremented key */
268 new_rkey = (new_rkey << 8) | old_rkey;
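/*
 * Worked example with an illustrative rkey value: a bind_rkey of
 * 0x00012345 splits into MPT index 0x000123 and key byte 0x45; the
 * key byte increments to 0x46, so the new rkey becomes 0x00012346.
 */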
269
270 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
271
272 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)addr +
273 sizeof (tavor_hw_snd_wqe_nextctrl_t));
274
275 /*
276 * Build the Bind Memory Window Segments for the WQE,
277 * using the information from the RC Bind memory
278 * window work request.
279 */
280 TAVOR_WQE_BUILD_BIND(bn, wr->wr.rc.rcwr.bind);
281
282 /*
283 * Update the "ds" pointer. Even though the "bind"
284 * operation requires no SGLs, this is necessary to
285 * facilitate the correct descriptor size calculations
286 * (below).
287 */
288 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
289 sizeof (tavor_hw_snd_wqe_bind_t));
290 nds = 0;
291 break;
292 default:
293 dapl_dbg_log(DAPL_DBG_TYPE_ERR,
294 "dapli_arbel_wqe_send_build: invalid wr_opcode=%d\n",
295 wr->wr_opcode);
296 return (DAT_INTERNAL_ERROR);
297 }
298
299 /*
 * Now fill in the Data Segments (SGL) for the Send WQE based on
 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
 * Start by checking for a valid number of SGL entries.
303 */
304 if (nds > qp->qp_sq_sgl) {
305 return (DAT_INVALID_PARAMETER);
306 }
307
308 /*
309 * For each SGL in the Send Work Request, fill in the Send WQE's data
310 * segments. Note: We skip any SGL with zero size because Tavor
311 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
312 * the encoding for zero means a 2GB transfer. Because of this special
313 * encoding in the hardware, we mask the requested length with
314 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
315 * zero.)
316 */
317 if (max_inline_bytes != -1) { /* compute total_len */
318 total_len = 0;
319 for (i = 0; i < nds; i++)
320 total_len += sgl[i].ds_len;
321 if (total_len > max_inline_bytes)
322 max_inline_bytes = -1; /* too big, do not "inline" */
323 }
324 if (max_inline_bytes != -1) { /* do "inline" */
325 uint8_t *dst = (uint8_t *)((uint32_t *)ds + 1);
326 *(uint32_t *)ds =
327 HTOBE_32(total_len | TAVOR_WQE_SGL_INLINE_MASK);
328 for (i = 0; i < nds; i++) {
329 if ((len = sgl[i].ds_len) == 0) {
330 continue;
331 }
332 (void) dapl_os_memcpy(dst,
333 (void *)(uintptr_t)sgl[i].ds_va, len);
334 dst += len;
335 }
336 /* Return the size of descriptor (in 16-byte chunks) */
337 *size = ((uintptr_t)dst - (uintptr_t)addr + 15) >> 4;
338 } else {
339 for (i = 0; i < nds; i++) {
340 if (sgl[i].ds_len == 0) {
341 continue;
342 }
343
344 /*
345 * Fill in the Data Segment(s) for the current WQE,
346 * using the information contained in the
347 * scatter-gather list of the work request.
348 */
349 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl[i]);
350 num_ds++;
351 }
352
353 /* Return the size of descriptor (in 16-byte chunks) */
354 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)addr) >> 4;
355 }
356 ARBEL_WQE_SETCTRL(qp, addr, ctrl);
357
358 return (DAT_SUCCESS);
359 }
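#if 0
/*
 * Illustrative usage sketch (not compiled): a single-SGE signaled Send
 * handed to the builder above.  "buf", "buf_lkey", "len", "qp" and
 * "wqe_addr" are assumptions for the example only; the structure and
 * field names are the ones already used in this file.
 */
{
	ibt_wr_ds_t	sge;
	ibt_send_wr_t	swr;
	uint_t		wqe_sz;

	sge.ds_va = (ib_vaddr_t)(uintptr_t)buf;
	sge.ds_key = buf_lkey;
	sge.ds_len = (ib_msglen_t)len;

	swr.wr_id = 1;
	swr.wr_flags = IBT_WR_SEND_SIGNAL;
	swr.wr_opcode = IBT_WRC_SEND;
	swr.wr_nds = 1;
	swr.wr_sgl = &sge;

	(void) dapli_arbel_wqe_send_build(qp, &swr, wqe_addr, &wqe_sz);
}
#endif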
360
361 /*
362 * dapli_arbel_wqe_send_linknext()
 * Links the current WQE to the previous WQE in the chain
364 */
365 static void
dapli_arbel_wqe_send_linknext(ibt_send_wr_t *curr_wr,
    uint32_t curr_desc, uint_t curr_descsz, uint64_t *prev_addr,
    tavor_sw_wqe_dbinfo_t *dbinfo)
369 {
370 uint32_t nopcode, fence, nda_op, ee_nds;
371
372 /*
373 * Calculate the "next" field of the prev descriptor. This amounts
374 * to setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
375 * fields (see tavor_hw.h for more).
376 */
377
378 /*
379 * Determine the value for the Tavor WQE "nopcode" field
380 * by using the IBTF opcode from the work request
381 */
382 switch (curr_wr->wr_opcode) {
383 case IBT_WRC_RDMAW:
384 nopcode = ARBEL_WQE_SEND_NOPCODE_RDMAW;
385 break;
386
387 case IBT_WRC_SEND:
388 nopcode = ARBEL_WQE_SEND_NOPCODE_SEND;
389 break;
390
391 case IBT_WRC_RDMAR:
392 nopcode = ARBEL_WQE_SEND_NOPCODE_RDMAR;
393 break;
394
395 case IBT_WRC_BIND:
396 nopcode = ARBEL_WQE_SEND_NOPCODE_BIND;
397 break;
398 default:
/* Unsupported opcodes in uDAPL */
dapl_dbg_log(DAPL_DBG_TYPE_ERR,
    "dapli_arbel_wqe_send_linknext: invalid wr_opcode=%d\n",
    curr_wr->wr_opcode);
403 return;
404 }
405
406 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
407 nda_op = ((uintptr_t)curr_desc & ARBEL_WQE_NDA_MASK) | nopcode;
408 ee_nds = ((fence == 1) ? ARBEL_WQE_SEND_FENCE_MASK : 0) |
409 (curr_descsz & ARBEL_WQE_NDS_MASK) |
410 ARBEL_WQE_NEXT_REQBIT_MASK;
411
412 /*
 * A send queue doorbell will be rung for the next WQE on the
 * chain, so we set the current WQE's "dbd" bit.
415 * Note: We also update the "dbinfo" structure here to pass
416 * back information about what should (later) be included
417 * in the send queue doorbell.
418 */
419 dbinfo->db_nopcode = nopcode;
420 dbinfo->db_fence = fence;
421
422 ARBEL_WQE_SETNEXT(qp, prev_addr, nda_op, ee_nds);
423 }
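/*
 * Worked example of the encoding above (illustrative numbers): a Send
 * (nopcode 0xA) whose next descriptor sits at offset 0x1000 with a
 * size of 4 sixteen-byte chunks yields nda_op = 0x100A and
 * ee_nds = 0x84 (the 0x80 request bit | nds of 4).
 */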
424
425
426 /*
427 * dapli_arbel_wqe_recv_build()
428 * Builds the recv WQE for a given ibt_recv_wr_t
429 */
430 static DAT_RETURN
dapli_arbel_wqe_recv_build(ib_qp_handle_t qp, ibt_recv_wr_t *wr,
    uint64_t *addr, uint_t *size)
433 {
434 tavor_hw_wqe_sgl_t *ds;
435 int i;
436 int num_ds;
437
438 /* Fill in the Data Segments (SGL) for the Recv WQE */
439 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
440 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
441 num_ds = 0;
442
443 /* Check for valid number of SGL entries */
444 if (wr->wr_nds > qp->qp_rq_sgl) {
445 return (DAT_INVALID_PARAMETER);
446 }
447
448 /*
449 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
450 * segments. Note: We skip any SGL with zero size because Tavor
451 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
452 * the encoding for zero means a 2GB transfer. Because of this special
453 * encoding in the hardware, we mask the requested length with
454 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
455 * zero.)
456 */
457 for (i = 0; i < wr->wr_nds; i++) {
458 if (wr->wr_sgl[i].ds_len == 0) {
459 continue;
460 }
461
462 /*
463 * Fill in the Data Segment(s) for the receive WQE, using the
464 * information contained in the scatter-gather list of the
465 * work request.
466 */
467 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
468 num_ds++;
469 }
if (num_ds < qp->qp_rq_sgl) {
471 ibt_wr_ds_t sgl;
472 sgl.ds_va = (ib_vaddr_t)0;
473 sgl.ds_len = (ib_msglen_t)0;
474 sgl.ds_key = (ibt_lkey_t)ARBEL_WQE_SGL_INVALID_LKEY;
475 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &sgl);
476 }
477
478 /* Return the size of descriptor (in 16-byte chunks) */
479 *size = qp->qp_rq_wqesz >> 4;
480
481 return (DAT_SUCCESS);
482 }
483
484 /*
485 * dapli_arbel_wqe_srq_build()
 * Builds an SRQ recv WQE for a given ibt_recv_wr_t
487 */
488 static DAT_RETURN
dapli_arbel_wqe_srq_build(ib_srq_handle_t srq, ibt_recv_wr_t *wr,
    uint64_t *addr)
491 {
492 tavor_hw_wqe_sgl_t *ds;
493 ibt_wr_ds_t end_sgl;
494 int i;
495 int num_ds;
496
497 /* Fill in the Data Segments (SGL) for the Recv WQE */
498 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)addr +
499 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
500 num_ds = 0;
501
502 /* Check for valid number of SGL entries */
503 if (wr->wr_nds > srq->srq_wq_sgl) {
504 return (DAT_INVALID_PARAMETER);
505 }
506
507 /*
508 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
509 * segments. Note: We skip any SGL with zero size because Tavor
510 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
511 * the encoding for zero means a 2GB transfer. Because of this special
512 * encoding in the hardware, we mask the requested length with
513 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
514 * zero.)
515 */
516 for (i = 0; i < wr->wr_nds; i++) {
517 if (wr->wr_sgl[i].ds_len == 0) {
518 continue;
519 }
520
521 /*
522 * Fill in the Data Segment(s) for the receive WQE, using the
523 * information contained in the scatter-gather list of the
524 * work request.
525 */
526 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &wr->wr_sgl[i]);
527 num_ds++;
528 }
529
530 /*
531 * For SRQ, if the number of data segments is less than the maximum
532 * specified at alloc, then we have to fill in a special "key" entry in
533 * the sgl entry after the last valid one in this post request. We do
534 * that here.
535 */
536 if (num_ds < srq->srq_wq_sgl) {
537 end_sgl.ds_va = (ib_vaddr_t)0;
538 end_sgl.ds_len = (ib_msglen_t)0;
539 end_sgl.ds_key = (ibt_lkey_t)ARBEL_WQE_SGL_INVALID_LKEY;
540 TAVOR_WQE_BUILD_DATA_SEG(&ds[num_ds], &end_sgl);
541 }
542
543 return (DAT_SUCCESS);
544 }
545
546 /*
547 * dapli_arbel_cq_peek()
548 * Peeks into a given CQ to check if there are any events that can be
549 * polled. It returns the number of CQEs that can be polled.
550 */
551 static void
dapli_arbel_cq_peek(ib_cq_handle_t cq, int *num_cqe)
553 {
554 tavor_hw_cqe_t *cqe;
555 uint32_t imm_eth_pkey_cred;
556 uint32_t cons_indx;
557 uint32_t wrap_around_mask;
558 uint32_t polled_cnt;
559 uint_t doorbell_cnt;
560 uint_t opcode;
561
562 /* Get the consumer index */
563 cons_indx = cq->cq_consindx;
564
565 /*
566 * Calculate the wrap around mask. Note: This operation only works
567 * because all Tavor completion queues have power-of-2 sizes
568 */
569 wrap_around_mask = (cq->cq_size - 1);
570
571 /* Calculate the pointer to the first CQ entry */
572 cqe = &cq->cq_addr[cons_indx];
573
574 /*
575 * Count entries in the CQ until we find an entry owned by
576 * the hardware.
577 */
578 polled_cnt = 0;
579 while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
580 opcode = TAVOR_CQE_OPCODE_GET(cqe);
/* Error CQEs map to multiple work completions */
582 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
583 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
584 imm_eth_pkey_cred =
585 TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
586 doorbell_cnt =
587 imm_eth_pkey_cred & TAVOR_CQE_ERR_DBDCNT_MASK;
588 polled_cnt += (doorbell_cnt + 1);
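/* e.g. a dbd count of 3 (illustrative) accounts for 4 completions */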
589 } else {
590 polled_cnt++;
591 }
592 /* Increment the consumer index */
593 cons_indx = (cons_indx + 1) & wrap_around_mask;
594
595 /* Update the pointer to the next CQ entry */
596 cqe = &cq->cq_addr[cons_indx];
597 }
598
599 *num_cqe = polled_cnt;
600 }
601
602 #define dapli_arbel_cq_update_ci(cq, dbp) \
603 (dbp)[0] = HTOBE_32(cq->cq_consindx)
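/*
 * Note: the first 32-bit word of a CQ doorbell record holds the
 * (big-endian) consumer index; the second word, initialized in
 * dapli_arbel_cq_init() below, identifies the CQ number and doorbell
 * type.
 */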
604
605 /*
606 * dapli_arbel_cq_poll()
607 * This routine polls CQEs out of a CQ and puts them into the ibt_wc_t
608 * array that is passed in.
609 */
610 static DAT_RETURN
dapli_arbel_cq_poll(ib_cq_handle_t cq, ibt_wc_t *wc_p, uint_t num_wc,
    uint_t *num_polled)
613 {
614 tavor_hw_cqe_t *cqe;
615 uint32_t cons_indx;
616 uint32_t wrap_around_mask;
617 uint32_t polled_cnt;
618 DAT_RETURN dat_status;
619 int status;
620
621 /* Get the consumer index */
622 cons_indx = cq->cq_consindx;
623
624 /*
625 * Calculate the wrap around mask. Note: This operation only works
626 * because all Tavor completion queues have power-of-2 sizes
627 */
628 wrap_around_mask = (cq->cq_size - 1);
629
630 /* Calculate the pointer to the first CQ entry */
631 cqe = &cq->cq_addr[cons_indx];
632
633 /*
634 * Keep pulling entries from the CQ until we find an entry owned by
 * the hardware. As long as the CQEs are owned by SW, process
636 * each entry by calling dapli_arbel_cq_cqe_consume() and updating the
637 * CQ consumer index. Note: We only update the consumer index if
638 * dapli_arbel_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
639 * Otherwise, it indicates that we are going to "recycle" the CQE
 * (probably because it is an error CQE and corresponds to more than one
641 * completion).
642 */
643 polled_cnt = 0;
644 while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
645 status = dapli_arbel_cq_cqe_consume(cq, cqe,
646 &wc_p[polled_cnt++]);
647 if (status == TAVOR_CQ_SYNC_AND_DB) {
648 /* Reset entry to hardware ownership */
649 TAVOR_CQE_OWNER_SET_HW(cqe);
650
651 /* Increment the consumer index */
652 cons_indx = (cons_indx + 1) & wrap_around_mask;
653
654 /* Update the pointer to the next CQ entry */
655 cqe = &cq->cq_addr[cons_indx];
656 }
657
658 /*
659 * If we have run out of space to store work completions,
 * then stop and return the ones we have pulled off the CQ.
661 */
662 if (polled_cnt >= num_wc) {
663 break;
664 }
665 }
666
667 dat_status = DAT_SUCCESS;
668 /*
669 * Now we only ring the doorbell (to update the consumer index) if
670 * we've actually consumed a CQ entry. If we have, for example,
671 * pulled from a CQE that we are still in the process of "recycling"
672 * for error purposes, then we would not update the consumer index.
673 */
674 if ((polled_cnt != 0) && (cq->cq_consindx != cons_indx)) {
675 /*
676 * Update the consumer index in both the CQ handle and the
677 * doorbell record.
678 */
679 cq->cq_consindx = cons_indx;
680 dapli_arbel_cq_update_ci(cq, cq->cq_poll_dbp);
681 } else if (polled_cnt == 0) {
682 /*
683 * If the CQ is empty, we can try to free up some of the WRID
684 * list containers.
685 */
686 if (cq->cq_wrid_reap_head) /* look before leaping */
687 dapls_tavor_wrid_cq_reap(cq);
688 dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
689 }
690
691 if (num_polled != NULL) {
692 *num_polled = polled_cnt;
693 }
694
695 return (dat_status);
696 }
697
698 /*
699 * dapli_arbel_cq_poll_one()
 * This routine polls one CQE out of a CQ and puts it into the ibt_wc_t
701 * that is passed in. See above for more comments/details.
702 */
703 static DAT_RETURN
dapli_arbel_cq_poll_one(ib_cq_handle_t cq, ibt_wc_t *wc_p)
705 {
706 tavor_hw_cqe_t *cqe;
707 uint32_t cons_indx;
708 DAT_RETURN dat_status;
709 int status;
710
711 /* Get the consumer index */
712 cons_indx = cq->cq_consindx;
713
714 /* Calculate the pointer to the first CQ entry */
715 cqe = &cq->cq_addr[cons_indx];
716
717 /*
718 * Keep pulling entries from the CQ until we find an entry owned by
 * the hardware. As long as the CQEs are owned by SW, process
720 * each entry by calling dapli_arbel_cq_cqe_consume() and updating the
721 * CQ consumer index. Note: We only update the consumer index if
722 * dapli_arbel_cq_cqe_consume() returns TAVOR_CQ_SYNC_AND_DB.
723 * Otherwise, it indicates that we are going to "recycle" the CQE
 * (probably because it is an error CQE and corresponds to more than one
725 * completion).
726 */
727 if (TAVOR_CQE_OWNER_IS_SW(cqe)) {
728 status = dapli_arbel_cq_cqe_consume(cq, cqe, wc_p);
729 if (status == TAVOR_CQ_SYNC_AND_DB) {
730 /* Reset entry to hardware ownership */
731 TAVOR_CQE_OWNER_SET_HW(cqe);
732
733 /* Increment the consumer index */
734 cq->cq_consindx =
735 (cons_indx + 1) & (cq->cq_size - 1);
736 dapli_arbel_cq_update_ci(cq, cq->cq_poll_dbp);
737 }
738 dat_status = DAT_SUCCESS;
739 } else {
740 if (cq->cq_wrid_reap_head) /* look before leaping */
741 dapls_tavor_wrid_cq_reap(cq);
742 dat_status = DAT_ERROR(DAT_QUEUE_EMPTY, 0);
743 }
744 return (dat_status);
745 }
746
747 /*
748 * dapli_arbel_cq_cqe_consume()
 * Converts a given CQE into an ibt_wc_t object
750 */
751 static int
dapli_arbel_cq_cqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
754 {
755 uint_t flags;
756 uint_t type;
757 uint_t opcode;
758 int status;
759
760 /* strip off the size in wqeaddrsz */
761 TAVOR_CQE_WQEADDRSZ_SET(cqe, TAVOR_CQE_WQEADDRSZ_GET(cqe) &
762 ~ARBEL_WQE_NDS_MASK);
763
764 /*
765 * Determine if this is an "error" CQE by examining "opcode". If it
766 * is an error CQE, then call dapli_arbel_cq_errcqe_consume() and return
767 * whatever status it returns. Otherwise, this is a successful
768 * completion.
769 */
770 opcode = TAVOR_CQE_OPCODE_GET(cqe);
771 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
772 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
773 status = dapli_arbel_cq_errcqe_consume(cqhdl, cqe, wc);
774 return (status);
775 }
776
777 /*
778 * Fetch the Work Request ID using the information in the CQE.
779 * See tavor_wr.c for more details.
780 */
781 wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
782 TAVOR_CQE_SENDRECV_GET(cqe), 0, NULL);
783 wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
784
785 /*
786 * Parse the CQE opcode to determine completion type. This will set
787 * not only the type of the completion, but also any flags that might
788 * be associated with it (e.g. whether immediate data is present).
789 */
790 flags = IBT_WC_NO_FLAGS;
791 if (TAVOR_CQE_SENDRECV_GET(cqe) != TAVOR_COMPLETION_RECV) {
792
793 /*
794 * Send CQE
795 *
796 * The following opcodes will not be generated in uDAPL
797 * case TAVOR_CQE_SND_RDMAWR_IMM:
798 * case TAVOR_CQE_SND_SEND_IMM:
799 * case TAVOR_CQE_SND_ATOMIC_CS:
800 * case TAVOR_CQE_SND_ATOMIC_FA:
801 */
802 switch (opcode) {
803 case TAVOR_CQE_SND_RDMAWR:
804 type = IBT_WRC_RDMAW;
805 break;
806
807 case TAVOR_CQE_SND_SEND:
808 type = IBT_WRC_SEND;
809 break;
810
811 case TAVOR_CQE_SND_RDMARD:
812 type = IBT_WRC_RDMAR;
813 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
814 break;
815
816 case TAVOR_CQE_SND_BIND_MW:
817 type = IBT_WRC_BIND;
818 break;
819
820 default:
821 wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
822 return (TAVOR_CQ_SYNC_AND_DB);
823 }
824 } else {
825
826 /*
827 * Receive CQE
828 *
829 * The following opcodes will not be generated in uDAPL
830 *
831 * case TAVOR_CQE_RCV_RECV_IMM:
832 * case TAVOR_CQE_RCV_RECV_IMM2:
833 * case TAVOR_CQE_RCV_RDMAWR_IMM:
834 * case TAVOR_CQE_RCV_RDMAWR_IMM2:
835 */
836 switch (opcode & 0x1F) {
837 case TAVOR_CQE_RCV_RECV:
838 /* FALLTHROUGH */
839 case TAVOR_CQE_RCV_RECV2:
840 type = IBT_WRC_RECV;
841 wc->wc_bytes_xfer = TAVOR_CQE_BYTECNT_GET(cqe);
842 break;
843 default:
844 wc->wc_status = IBT_WC_LOCAL_CHAN_OP_ERR;
845 return (TAVOR_CQ_SYNC_AND_DB);
846 }
847 }
848 wc->wc_type = type;
849 wc->wc_flags = flags;
850 /* If we got here, completion status must be success */
851 wc->wc_status = IBT_WC_SUCCESS;
852
853 return (TAVOR_CQ_SYNC_AND_DB);
854 }
855
856
857 /*
858 * dapli_arbel_cq_errcqe_consume()
859 */
860 static int
dapli_arbel_cq_errcqe_consume(ib_cq_handle_t cqhdl, tavor_hw_cqe_t *cqe,
    ibt_wc_t *wc)
863 {
864 dapls_tavor_wrid_entry_t wre;
865 uint32_t imm_eth_pkey_cred;
866 uint_t status;
867 uint_t opcode = TAVOR_CQE_OPCODE_GET(cqe);
868
869 dapl_dbg_log(DAPL_DBG_TYPE_EVD, "errcqe_consume:cqe.eth=%x, wqe=%x\n",
870 TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe),
871 TAVOR_CQE_WQEADDRSZ_GET(cqe));
872
873 /*
874 * Fetch the Work Request ID using the information in the CQE.
875 * See tavor_wr.c for more details.
876 */
877 wc->wc_id = dapls_tavor_wrid_get_entry(cqhdl, cqe,
878 (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ? TAVOR_COMPLETION_SEND :
879 TAVOR_COMPLETION_RECV, 1, &wre);
880 wc->wc_qpn = TAVOR_CQE_QPNUM_GET(cqe);
881
882 /*
883 * Parse the CQE opcode to determine completion type. We know that
884 * the CQE is an error completion, so we extract only the completion
885 * status here.
886 */
887 imm_eth_pkey_cred = TAVOR_CQE_IMM_ETH_PKEY_CRED_GET(cqe);
888 status = imm_eth_pkey_cred >> TAVOR_CQE_ERR_STATUS_SHIFT;
889 switch (status) {
890 case TAVOR_CQE_LOC_LEN_ERR:
891 status = IBT_WC_LOCAL_LEN_ERR;
892 break;
893
894 case TAVOR_CQE_LOC_OP_ERR:
895 status = IBT_WC_LOCAL_CHAN_OP_ERR;
896 break;
897
898 case TAVOR_CQE_LOC_PROT_ERR:
899 status = IBT_WC_LOCAL_PROTECT_ERR;
900 break;
901
902 case TAVOR_CQE_WR_FLUSHED_ERR:
903 status = IBT_WC_WR_FLUSHED_ERR;
904 break;
905
906 case TAVOR_CQE_MW_BIND_ERR:
907 status = IBT_WC_MEM_WIN_BIND_ERR;
908 break;
909
910 case TAVOR_CQE_BAD_RESPONSE_ERR:
911 status = IBT_WC_BAD_RESPONSE_ERR;
912 break;
913
914 case TAVOR_CQE_LOCAL_ACCESS_ERR:
915 status = IBT_WC_LOCAL_ACCESS_ERR;
916 break;
917
918 case TAVOR_CQE_REM_INV_REQ_ERR:
919 status = IBT_WC_REMOTE_INVALID_REQ_ERR;
920 break;
921
922 case TAVOR_CQE_REM_ACC_ERR:
923 status = IBT_WC_REMOTE_ACCESS_ERR;
924 break;
925
926 case TAVOR_CQE_REM_OP_ERR:
927 status = IBT_WC_REMOTE_OP_ERR;
928 break;
929
930 case TAVOR_CQE_TRANS_TO_ERR:
931 status = IBT_WC_TRANS_TIMEOUT_ERR;
932 break;
933
934 case TAVOR_CQE_RNRNAK_TO_ERR:
935 status = IBT_WC_RNR_NAK_TIMEOUT_ERR;
936 break;
937
938 /*
939 * The following error codes are not supported in the Tavor driver
940 * as they relate only to Reliable Datagram completion statuses:
941 * case TAVOR_CQE_LOCAL_RDD_VIO_ERR:
942 * case TAVOR_CQE_REM_INV_RD_REQ_ERR:
943 * case TAVOR_CQE_EEC_REM_ABORTED_ERR:
944 * case TAVOR_CQE_INV_EEC_NUM_ERR:
945 * case TAVOR_CQE_INV_EEC_STATE_ERR:
946 * case TAVOR_CQE_LOC_EEC_ERR:
947 */
948
949 default:
950 status = IBT_WC_LOCAL_CHAN_OP_ERR;
951 break;
952 }
953 wc->wc_status = status;
954 wc->wc_type = 0;
955
956 /*
957 * Consume the CQE
958 * Return status to indicate that doorbell and sync may be
959 * necessary.
960 */
961 return (TAVOR_CQ_SYNC_AND_DB);
962 }
963
964 /*
965 * dapli_arbel_cq_notify()
966 * This function is used for arming the CQ by ringing the CQ doorbell.
967 *
968 * Note: there is something very subtle here. This code assumes a very
969 * specific behavior of the kernel driver. The cmd_sn field of the
970 * arm_dbr is updated by the kernel driver whenever a notification
971 * event for the cq is received. This code extracts the cmd_sn field
972 * from the arm_dbr to know the right value to use. The arm_dbr is
 * always updated atomically so that neither the kernel driver nor
 * this library gets confused about what the other is doing.
975 *
976 * Note: param is not used here. It is necessary for arming a CQ for
977 * N completions (param is N), but no uDAPL API supports this for now.
978 * Thus, we declare ARGSUSED to make lint happy.
979 */
980 /*ARGSUSED*/
981 static DAT_RETURN
dapli_arbel_cq_notify(ib_cq_handle_t cq, int flags, uint32_t param)
983 {
984 uint32_t cqnum;
985 uint32_t *target;
986 uint32_t old_cmd, cmp, new, tmp, cmd_sn;
987
988 /*
989 * Determine if we are trying to get the next completion or the next
990 * "solicited" completion. Then hit the appropriate doorbell.
991 */
992 dapli_arbel_cq_update_ci(cq, cq->cq_arm_dbp);
993 cqnum = cq->cq_num;
994 target = cq->cq_arm_dbp + 1;
995 retry:
996 cmp = *target;
997 tmp = HTOBE_32(cmp);
998 old_cmd = tmp & 0x7;
999 cmd_sn = (tmp & 0x18) >> 3;
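/*
 * In the arm doorbell record word read above, bits [2:0] hold the
 * last arm command and bits [4:3] hold the cmd_sn maintained by the
 * kernel driver.
 */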
1000
1001 if (flags == IB_NOTIFY_ON_NEXT_COMP) {
1002 if (old_cmd != ARBEL_CQDB_NOTIFY_CQ) {
1003 new = HTOBE_32((tmp & ~0x7) | ARBEL_CQDB_NOTIFY_CQ);
1004 tmp = atomic_cas_32(target, cmp, new);
1005 if (tmp != cmp)
1006 goto retry;
1007 dapli_arbel_cq_doorbell(cq->cq_iauar,
1008 ARBEL_CQDB_NOTIFY_CQ, cqnum,
1009 cmd_sn, cq->cq_consindx);
1010 } /* else it's already armed */
1011 } else if (flags == IB_NOTIFY_ON_NEXT_SOLICITED) {
1012 if (old_cmd != ARBEL_CQDB_NOTIFY_CQ &&
1013 old_cmd != ARBEL_CQDB_NOTIFY_CQ_SOLICIT) {
1014 new = HTOBE_32((tmp & ~0x7) |
1015 ARBEL_CQDB_NOTIFY_CQ_SOLICIT);
1016 tmp = atomic_cas_32(target, cmp, new);
1017 if (tmp != cmp)
1018 goto retry;
1019 dapli_arbel_cq_doorbell(cq->cq_iauar,
1020 ARBEL_CQDB_NOTIFY_CQ_SOLICIT, cqnum,
1021 cmd_sn, cq->cq_consindx);
1022 } /* else it's already armed */
1023 } else {
1024 return (DAT_INVALID_PARAMETER);
1025 }
1026
1027 return (DAT_SUCCESS);
1028 }
1029
1030 /*
1031 * dapli_arbel_post_send()
1032 */
1033 /* ARGSUSED */
1034 static DAT_RETURN
dapli_arbel_post_send(DAPL_EP *ep, ibt_send_wr_t *wr, boolean_t ns)
1036 {
1037 tavor_sw_wqe_dbinfo_t dbinfo;
1038 dapls_tavor_wrid_list_hdr_t *wridlist;
1039 dapls_tavor_wrid_entry_t *wre_last;
1040 uint32_t desc;
1041 uint64_t *wqe_addr;
1042 uint32_t desc_sz;
1043 uint32_t wqeaddrsz, signaled_dbd;
1044 uint32_t head, tail, next_tail, qsize_msk;
1045 int status;
1046 ib_qp_handle_t qp;
1047
1048 if ((ep->qp_state == IBT_STATE_RESET) ||
1049 (ep->qp_state == IBT_STATE_INIT) ||
1050 (ep->qp_state == IBT_STATE_RTR)) {
1051 dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1052 "post_send: invalid qp_state %d\n", ep->qp_state);
1053 return (DAT_INVALID_STATE);
1054 }
1055
1056 qp = ep->qp_handle;
1057
1058 /* Grab the lock for the WRID list */
1059 dapl_os_lock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1060 wridlist = qp->qp_sq_wqhdr->wq_wrid_post;
1061
1062 /* Save away some initial QP state */
1063 qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
1064 tail = qp->qp_sq_wqhdr->wq_tail;
1065 head = qp->qp_sq_wqhdr->wq_head;
1066
1067 /*
1068 * Check for "queue full" condition. If the queue is already full,
1069 * then no more WQEs can be posted, return an error
1070 */
1071 if (qp->qp_sq_wqhdr->wq_full != 0) {
1072 dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1073 return (DAT_INSUFFICIENT_RESOURCES);
1074 }
1075
1076 /*
1077 * Increment the "tail index" and check for "queue full" condition.
1078 * If we detect that the current work request is going to fill the
1079 * work queue, then we mark this condition and continue.
1080 */
1081 next_tail = (tail + 1) & qsize_msk;
1082 if (next_tail == head) {
1083 qp->qp_sq_wqhdr->wq_full = 1;
1084 }
1085
1086 /*
1087 * Get the user virtual address of the location where the next
1088 * Send WQE should be built
1089 */
1090 wqe_addr = TAVOR_QP_SQ_ENTRY(qp, tail);
1091
1092 /*
 * Call dapli_arbel_wqe_send_build() to build the WQE at the given address.
1094 * This routine uses the information in the ibt_send_wr_t and
1095 * returns the size of the WQE when it returns.
1096 */
1097 status = dapli_arbel_wqe_send_build(qp, wr, wqe_addr, &desc_sz);
1098 if (status != DAT_SUCCESS) {
1099 dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1100 return (status);
1101 }
1102
1103 /*
 * Get the descriptor (I/O address) corresponding to the location
 * where the Send WQE was built.
1106 */
1107 desc = TAVOR_QP_SQ_DESC(qp, tail);
1108
1109 dapl_os_assert(desc >= qp->qp_sq_desc_addr &&
1110 desc <= (qp->qp_sq_desc_addr +
1111 qp->qp_sq_numwqe*qp->qp_sq_wqesz));
1112
1113 /*
1114 * Add a WRID entry to the WRID list. Need to calculate the
1115 * "wqeaddr" to pass to dapli_tavor_wrid_add_entry().
1116 * signaled_dbd is still calculated, but ignored.
1117 */
1118 wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, 0);
1119
signaled_dbd = (wr->wr_flags & IBT_WR_SEND_SIGNAL) ?
    TAVOR_WRID_ENTRY_SIGNALED : 0;
1123
1124 dapli_tavor_wrid_add_entry(qp->qp_sq_wqhdr, wr->wr_id, wqeaddrsz,
1125 signaled_dbd);
1126
1127 /*
1128 * Now link the wqe to the old chain (if there was one)
1129 */
1130 dapli_arbel_wqe_send_linknext(wr, desc, desc_sz,
1131 qp->qp_sq_lastwqeaddr, &dbinfo);
1132
1133 /*
1134 * Now if the WRID tail entry is non-NULL, then this
1135 * represents the entry to which we are chaining the
1136 * new entries. Since we are going to ring the
 * doorbell for this WQE, we want to set its "dbd" bit.
1138 *
1139 * On the other hand, if the tail is NULL, even though
1140 * we will have rung the doorbell for the previous WQE
1141 * (for the hardware's sake) it is irrelevant to our
1142 * purposes (for tracking WRIDs) because we know the
1143 * request must have already completed.
1144 */
1145 wre_last = wridlist->wl_wre_old_tail;
1146 if (wre_last != NULL) {
1147 wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1148 }
1149
1150 /* Update some of the state in the QP */
1151 qp->qp_sq_lastwqeaddr = wqe_addr;
1152 qp->qp_sq_wqhdr->wq_tail = next_tail;
1153
/* Set the doorbell record */
1155 dapli_arbel_sq_dbrec(qp, qp->qp_sq_counter);
1156
1157 /* Ring the doorbell */
1158 dapli_arbel_sq_dbreg(qp->qp_iauar, qp->qp_num, dbinfo.db_fence,
1159 dbinfo.db_nopcode, qp->qp_sq_counter, desc_sz);
1160 qp->qp_sq_counter++;
1161
1162 dapl_os_unlock(&qp->qp_sq_wqhdr->wq_wrid_lock->wrl_lock);
1163
1164 return (DAT_SUCCESS);
1165 }
1166
1167 /*
1168 * dapli_arbel_post_recv()
1169 */
1170 /* ARGSUSED */
1171 static DAT_RETURN
dapli_arbel_post_recv(DAPL_EP *ep, ibt_recv_wr_t *wr, boolean_t ns)
1173 {
1174 dapls_tavor_wrid_list_hdr_t *wridlist;
1175 dapls_tavor_wrid_entry_t *wre_last;
1176 ib_qp_handle_t qp;
1177 DAT_RETURN status;
1178 uint32_t desc;
1179 uint64_t *wqe_addr;
1180 uint32_t desc_sz;
1181 uint32_t wqeaddrsz;
1182 uint32_t head, tail, next_tail, qsize_msk;
1183
1184 if (ep->qp_state == IBT_STATE_RESET) {
1185 dapl_dbg_log(DAPL_DBG_TYPE_ERR,
1186 "post_recv: invalid qp_state %d\n", ep->qp_state);
1187 return (DAT_INVALID_STATE);
1188 }
1189 qp = ep->qp_handle;
1190
1191 /* Grab the lock for the WRID list */
1192 dapl_os_lock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1193 wridlist = qp->qp_rq_wqhdr->wq_wrid_post;
1194
1195 /* Save away some initial QP state */
1196 qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
1197 tail = qp->qp_rq_wqhdr->wq_tail;
1198 head = qp->qp_rq_wqhdr->wq_head;
1199
1200 /*
1201 * For the ibt_recv_wr_t passed in, parse the request and build a
1202 * Recv WQE. Link the WQE with the previous WQE and ring the
1203 * door bell.
1204 */
1205
1206 /*
1207 * Check for "queue full" condition. If the queue is already full,
1208 * then no more WQEs can be posted. So return an error.
1209 */
1210 if (qp->qp_rq_wqhdr->wq_full != 0) {
1211 dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1212 return (DAT_INSUFFICIENT_RESOURCES);
1213 }
1214
1215 /*
1216 * Increment the "tail index" and check for "queue
1217 * full" condition. If we detect that the current
1218 * work request is going to fill the work queue, then
1219 * we mark this condition and continue.
1220 */
1221 next_tail = (tail + 1) & qsize_msk;
1222 if (next_tail == head) {
1223 qp->qp_rq_wqhdr->wq_full = 1;
1224 }
1225
1226 /* Get the descriptor (IO Address) of the WQE to be built */
1227 desc = TAVOR_QP_RQ_DESC(qp, tail);
1228 /* The user virtual address of the WQE to be built */
1229 wqe_addr = TAVOR_QP_RQ_ENTRY(qp, tail);
1230
1231 /*
 * Call dapli_arbel_wqe_recv_build() to build the WQE at the given
1233 * address. This routine uses the information in the
1234 * ibt_recv_wr_t and returns the size of the WQE.
1235 */
1236 status = dapli_arbel_wqe_recv_build(qp, wr, wqe_addr, &desc_sz);
1237 if (status != DAT_SUCCESS) {
1238 dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
return (status);
1240 }
1241
1242 /*
1243 * Add a WRID entry to the WRID list. Need to calculate the
1244 * "wqeaddr" and "signaled_dbd" values to pass to
1245 * dapli_tavor_wrid_add_entry().
1246 * Note: all Recv WQEs are essentially "signaled"
1247 */
1248 wqeaddrsz = TAVOR_QP_WQEADDRSZ(desc, 0);
1249 dapli_tavor_wrid_add_entry(qp->qp_rq_wqhdr, wr->wr_id, wqeaddrsz,
1250 (uint32_t)TAVOR_WRID_ENTRY_SIGNALED);
1251
1252 /*
1253 * Now if the WRID tail entry is non-NULL, then this
1254 * represents the entry to which we are chaining the
1255 * new entries. Since we are going to ring the
 * doorbell for this WQE, we want to set its "dbd" bit.
1257 *
1258 * On the other hand, if the tail is NULL, even though
1259 * we will have rung the doorbell for the previous WQE
1260 * (for the hardware's sake) it is irrelevant to our
1261 * purposes (for tracking WRIDs) because we know the
1262 * request must have already completed.
1263 */
1264 wre_last = wridlist->wl_wre_old_tail;
1265 if (wre_last != NULL) {
1266 wre_last->wr_signaled_dbd |= TAVOR_WRID_ENTRY_DOORBELLED;
1267 }
1268
1269 /* Update some of the state in the QP */
1270 qp->qp_rq_lastwqeaddr = wqe_addr;
1271 qp->qp_rq_wqhdr->wq_tail = next_tail;
1272
1273 /* Update the doorbell record */
1274 qp->qp_rq_counter++;
1275 (qp->qp_rq_dbp)[0] = HTOBE_32(qp->qp_rq_counter);
1276
1277 dapl_os_unlock(&qp->qp_rq_wqhdr->wq_wrid_lock->wrl_lock);
1278
1279 return (DAT_SUCCESS);
1280 }
1281
1282 /*
1283 * dapli_arbel_post_srq()
1284 */
1285 /* ARGSUSED */
1286 static DAT_RETURN
dapli_arbel_post_srq(DAPL_SRQ *srqp, ibt_recv_wr_t *wr, boolean_t ns)
1288 {
1289 ib_srq_handle_t srq;
1290 DAT_RETURN status;
1291 uint32_t desc;
1292 uint64_t *wqe_addr;
1293 uint32_t head, next_head, qsize_msk;
1294 uint32_t wqe_index;
1295
1296
1297 srq = srqp->srq_handle;
1298
1299 /* Grab the lock for the WRID list */
1300 dapl_os_lock(&srq->srq_wridlist->wl_lock->wrl_lock);
1301
1302 /*
1303 * For the ibt_recv_wr_t passed in, parse the request and build a
1304 * Recv WQE. Link the WQE with the previous WQE and ring the
1305 * door bell.
1306 */
1307
1308 /*
1309 * Check for "queue full" condition. If the queue is already full,
1310 * ie. there are no free entries, then no more WQEs can be posted.
1311 * So return an error.
1312 */
1313 if (srq->srq_wridlist->wl_freel_entries == 0) {
1314 dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1315 return (DAT_INSUFFICIENT_RESOURCES);
1316 }
1317
1318 /* Save away some initial SRQ state */
1319 qsize_msk = srq->srq_wridlist->wl_size - 1;
1320 head = srq->srq_wridlist->wl_freel_head;
1321
1322 next_head = (head + 1) & qsize_msk;
1323
1324 /* Get the descriptor (IO Address) of the WQE to be built */
1325 desc = srq->srq_wridlist->wl_free_list[head];
1326
1327 wqe_index = TAVOR_SRQ_WQ_INDEX(srq->srq_wq_desc_addr, desc,
1328 srq->srq_wq_wqesz);
1329
1330 /* The user virtual address of the WQE to be built */
1331 wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq, wqe_index);
1332
1333 /*
1334 * Call dapli_arbel_wqe_srq_build() to build the WQE at the given
1335 * address. This routine uses the information in the
1336 * ibt_recv_wr_t and returns the size of the WQE.
1337 */
1338 status = dapli_arbel_wqe_srq_build(srq, wr, wqe_addr);
1339 if (status != DAT_SUCCESS) {
1340 dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1341 return (status);
1342 }
1343
1344 /*
1345 * Add a WRID entry to the WRID list.
1346 */
1347 dapli_tavor_wrid_add_entry_srq(srq, wr->wr_id, wqe_index);
1348
1349 #if 0
1350 if (srq->srq_wq_lastwqeindex == -1) {
1351 last_wqe_addr = NULL;
1352 } else {
1353 last_wqe_addr = TAVOR_SRQ_WQ_ENTRY(srq,
1354 srq->srq_wq_lastwqeindex);
1355 }
1356 /*
1357 * Now link the chain to the old chain (if there was one)
1358 * and update the wqe_counter in the doorbell record.
1359 */
1360 XXX
1361 dapli_tavor_wqe_srq_linknext(wqe_addr, ns, desc, last_wqe_addr);
1362 #endif
1363
1364 /* Update some of the state in the SRQ */
1365 srq->srq_wq_lastwqeindex = wqe_index;
1366 srq->srq_wridlist->wl_freel_head = next_head;
1367 srq->srq_wridlist->wl_freel_entries--;
1368 dapl_os_assert(srq->srq_wridlist->wl_freel_entries <=
1369 srq->srq_wridlist->wl_size);
1370
1371 /* Update the doorbell record */
1372 srq->srq_counter++;
1373 (srq->srq_dbp)[0] = HTOBE_32(srq->srq_counter);
1374
1375 dapl_os_unlock(&srq->srq_wridlist->wl_lock->wrl_lock);
1376
1377 return (DAT_SUCCESS);
1378 }
1379
1380 /*
1381 * dapli_arbel_cq_srq_entries_flush()
1382 */
1383 static void
dapli_arbel_cq_srq_entries_flush(ib_qp_handle_t qp)
1385 {
1386 ib_cq_handle_t cq;
1387 dapls_tavor_workq_hdr_t *wqhdr;
1388 tavor_hw_cqe_t *cqe;
1389 tavor_hw_cqe_t *next_cqe;
1390 uint32_t cons_indx, tail_cons_indx, wrap_around_mask;
1391 uint32_t new_indx, check_indx, indx;
1392 int cqe_qpnum, cqe_type;
1393 int outstanding_cqes, removed_cqes;
1394 int i;
1395
1396 /* ASSERT(MUTEX_HELD(&qp->qp_rq_cqhdl->cq_lock)); */
1397
1398 cq = qp->qp_rq_cqhdl;
1399 wqhdr = qp->qp_rq_wqhdr;
1400
1401 dapl_os_assert(wqhdr->wq_wrid_post != NULL);
1402 dapl_os_assert(wqhdr->wq_wrid_post->wl_srq_en != 0);
1403
1404 /* Get the consumer index */
1405 cons_indx = cq->cq_consindx;
1406
1407 /*
1408 * Calculate the wrap around mask. Note: This operation only works
1409 * because all Tavor completion queues have power-of-2 sizes
1410 */
1411 wrap_around_mask = (cq->cq_size - 1);
1412
1413 /* Calculate the pointer to the first CQ entry */
1414 cqe = &cq->cq_addr[cons_indx];
1415
1416 /*
1417 * Loop through the CQ looking for entries owned by software. If an
1418 * entry is owned by software then we increment an 'outstanding_cqes'
1419 * count to know how many entries total we have on our CQ. We use this
1420 * value further down to know how many entries to loop through looking
1421 * for our same QP number.
1422 */
1423 outstanding_cqes = 0;
1424 tail_cons_indx = cons_indx;
1425 while (TAVOR_CQE_OWNER_IS_SW(cqe)) {
1426 /* increment total cqes count */
1427 outstanding_cqes++;
1428
1429 /* increment the consumer index */
1430 tail_cons_indx = (tail_cons_indx + 1) & wrap_around_mask;
1431
1432 /* update the pointer to the next cq entry */
1433 cqe = &cq->cq_addr[tail_cons_indx];
1434 }
1435
1436 /*
1437 * Using the 'tail_cons_indx' that was just set, we now know how many
1438 * total CQEs possible there are. Set the 'check_indx' and the
1439 * 'new_indx' to the last entry identified by 'tail_cons_indx'
1440 */
1441 check_indx = new_indx = (tail_cons_indx - 1) & wrap_around_mask;
1442
1443 for (i = 0; i < outstanding_cqes; i++) {
1444 cqe = &cq->cq_addr[check_indx];
1445
1446 /* Grab QP number from CQE */
1447 cqe_qpnum = TAVOR_CQE_QPNUM_GET(cqe);
1448 cqe_type = TAVOR_CQE_SENDRECV_GET(cqe);
1449
1450 /*
1451 * If the QP number is the same in the CQE as the QP that we
1452 * have on this SRQ, then we must free up the entry off the
1453 * SRQ. We also make sure that the completion type is of the
1454 * 'TAVOR_COMPLETION_RECV' type. So any send completions on
1455 * this CQ will be left as-is. The handling of returning
1456 * entries back to HW ownership happens further down.
1457 */
1458 if (cqe_qpnum == qp->qp_num &&
1459 cqe_type == TAVOR_COMPLETION_RECV) {
1460 /* Add back to SRQ free list */
1461 (void) dapli_tavor_wrid_find_match_srq(
1462 wqhdr->wq_wrid_post, cqe);
1463 } else {
1464 /* Do Copy */
1465 if (check_indx != new_indx) {
1466 next_cqe = &cq->cq_addr[new_indx];
1467 /*
1468 * Copy the CQE into the "next_cqe"
1469 * pointer.
1470 */
1471 (void) dapl_os_memcpy(next_cqe, cqe,
1472 sizeof (tavor_hw_cqe_t));
1473 }
1474 new_indx = (new_indx - 1) & wrap_around_mask;
1475 }
1476 /* Move index to next CQE to check */
1477 check_indx = (check_indx - 1) & wrap_around_mask;
1478 }
1479
1480 /* Initialize removed cqes count */
1481 removed_cqes = 0;
1482
1483 /* If an entry was removed */
1484 if (check_indx != new_indx) {
1485
1486 /*
1487 * Set current pointer back to the beginning consumer index.
1488 * At this point, all unclaimed entries have been copied to the
1489 * index specified by 'new_indx'. This 'new_indx' will be used
1490 * as the new consumer index after we mark all freed entries as
1491 * having HW ownership. We do that here.
1492 */
1493
1494 /* Loop through all entries until we reach our new pointer */
1495 for (indx = cons_indx; indx <= new_indx;
1496 indx = (indx + 1) & wrap_around_mask) {
1497 removed_cqes++;
1498 cqe = &cq->cq_addr[indx];
1499
1500 /* Reset entry to hardware ownership */
1501 TAVOR_CQE_OWNER_SET_HW(cqe);
1502 }
1503 }
1504
1505 /*
1506 * Update consumer index to be the 'new_indx'. This moves it past all
1507 * removed entries. Because 'new_indx' is pointing to the last
1508 * previously valid SW owned entry, we add 1 to point the cons_indx to
1509 * the first HW owned entry.
1510 */
1511 cons_indx = (new_indx + 1) & wrap_around_mask;
1512
1513 /*
1514 * Now we only ring the doorbell (to update the consumer index) if
1515 * we've actually consumed a CQ entry. If we found no QP number
1516 * matches above, then we would not have removed anything. So only if
1517 * something was removed do we ring the doorbell.
1518 */
1519 if ((removed_cqes != 0) && (cq->cq_consindx != cons_indx)) {
1520 /*
1521 * Update the consumer index in both the CQ handle and the
1522 * doorbell record.
1523 */
1524 cq->cq_consindx = cons_indx;
1525 dapli_arbel_cq_update_ci(cq, cq->cq_poll_dbp);
1526 }
1527 }
1528
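/*
 * dapli_arbel_rq_prelink()
 * Pre-links a receive (or shared receive) work queue into a ring:
 * each WQE's first word is pointed at the next descriptor offset, the
 * second word carries the nds value, and a single data segment is
 * stamped with the same "invalid lkey" terminator used elsewhere in
 * this file.
 */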
1529 static void
dapli_arbel_rq_prelink(caddr_t first, uint32_t desc_off, uint32_t wqesz,
    uint32_t numwqe, uint32_t nds)
1532 {
1533 int i;
1534 uint32_t *p = (uint32_t *)(uintptr_t)first;
1535 uint32_t off = desc_off;
1536 uint32_t pincr = wqesz / sizeof (uint32_t);
1537 ibt_wr_ds_t sgl;
1538
1539 sgl.ds_va = (ib_vaddr_t)0;
1540 sgl.ds_key = ARBEL_WQE_SGL_INVALID_LKEY;
1541 sgl.ds_len = (ib_msglen_t)0;
1542
1543 for (i = 0; i < numwqe - 1; i++, p += pincr) {
1544 off += wqesz;
1545 p[0] = HTOBE_32(off); /* link curr to next */
1546 p[1] = nds; /* nds is 0 for SRQ */
1547 TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
1548 }
1549 p[0] = HTOBE_32(desc_off); /* link last to first */
1550 p[1] = nds;
1551 TAVOR_WQE_BUILD_DATA_SEG((void *)&p[2], &sgl);
1552 }
1553
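/*
 * dapli_arbel_sq_prelink()
 * Pre-links every send WQE's "next" pointer into a ring so that only
 * the control and data segments need to be filled in at post time.
 */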
1554 static void
dapli_arbel_sq_prelink(caddr_t first, uint32_t desc_off, uint32_t wqesz,
    uint32_t numwqe)
1557 {
1558 int i;
1559 uint32_t *p = (uint32_t *)(uintptr_t)first;
1560 uint32_t off = desc_off;
1561 uint32_t pincr = wqesz / sizeof (uint32_t);
1562
1563 for (i = 0; i < numwqe - 1; i++, p += pincr) {
1564 off += wqesz;
1565 p[0] = HTOBE_32(off); /* link curr to next */
1566 }
1567 p[0] = HTOBE_32(desc_off); /* link last to first */
1568 }
1569
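/*
 * dapli_arbel_qp_init()
 * Initializes the QP's send and receive doorbell records and pre-links
 * the send queue (and, when no SRQ is attached, the receive queue).
 */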
1570 static void
dapli_arbel_qp_init(ib_qp_handle_t qp)
1572 {
1573 (qp->qp_sq_dbp)[1] = HTOBE_32((qp->qp_num << 8) | ARBEL_DBR_SQ);
1574 if (qp->qp_srq_enabled == 0) {
1575 (qp->qp_rq_dbp)[1] = HTOBE_32((qp->qp_num << 8) | ARBEL_DBR_RQ);
1576
1577 /* pre-link the whole receive queue */
1578 dapli_arbel_rq_prelink(qp->qp_rq_buf, qp->qp_rq_desc_addr,
1579 qp->qp_rq_wqesz, qp->qp_rq_numwqe,
1580 HTOBE_32(qp->qp_rq_wqesz >> 4));
1581 }
1582 dapli_arbel_sq_prelink(qp->qp_sq_buf, qp->qp_sq_desc_addr,
1583 qp->qp_sq_wqesz, qp->qp_sq_numwqe);
1584 qp->qp_sq_lastwqeaddr = (uint64_t *)((uintptr_t)qp->qp_sq_buf +
1585 ((qp->qp_sq_numwqe - 1) * qp->qp_sq_wqesz));
1586 qp->qp_rq_counter = 0;
1587 qp->qp_sq_counter = 0;
1588 }
1589
1590 static void
dapli_arbel_cq_init(ib_cq_handle_t cq)
1592 {
1593 (cq->cq_poll_dbp)[1] =
1594 HTOBE_32((cq->cq_num << 8) | ARBEL_DBR_CQ_SET_CI);
1595 (cq->cq_arm_dbp)[1] =
1596 HTOBE_32((cq->cq_num << 8) | ARBEL_DBR_CQ_ARM | 0x8);
1597 /* cq_resize -- needs testing */
1598 }
1599
1600 static void
dapli_arbel_srq_init(ib_srq_handle_t srq)
1602 {
1603 (srq->srq_dbp)[1] =
1604 HTOBE_32((srq->srq_num << 8) | ARBEL_DBR_SRQ);
1605
1606 /* pre-link the whole shared receive queue */
1607 dapli_arbel_rq_prelink(srq->srq_addr, srq->srq_wq_desc_addr,
1608 srq->srq_wq_wqesz, srq->srq_wq_numwqe, 0);
1609 srq->srq_counter = 0;
1610
1611 /* needs testing */
1612 }
1613
1614 void
dapls_init_funcs_arbel(DAPL_HCA *hca_ptr)
1616 {
1617 hca_ptr->post_send = dapli_arbel_post_send;
1618 hca_ptr->post_recv = dapli_arbel_post_recv;
1619 hca_ptr->post_srq = dapli_arbel_post_srq;
1620 hca_ptr->cq_peek = dapli_arbel_cq_peek;
1621 hca_ptr->cq_poll = dapli_arbel_cq_poll;
1622 hca_ptr->cq_poll_one = dapli_arbel_cq_poll_one;
1623 hca_ptr->cq_notify = dapli_arbel_cq_notify;
1624 hca_ptr->srq_flush = dapli_arbel_cq_srq_entries_flush;
1625 hca_ptr->qp_init = dapli_arbel_qp_init;
1626 hca_ptr->cq_init = dapli_arbel_cq_init;
1627 hca_ptr->srq_init = dapli_arbel_srq_init;
1628 hca_ptr->hermon_resize_cq = 0;
1629 }
1630