1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * tavor_wr.c
29 * Tavor Work Request Processing Routines
30 *
31 * Implements all the routines necessary to provide the PostSend(),
32 * PostRecv() and PostSRQ() verbs. Also contains all the code
33 * necessary to implement the Tavor WRID tracking mechanism.
34 */
35
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42
43 #include <sys/ib/adapters/tavor/tavor.h>
44
45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46 uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 #pragma inline(tavor_qp_send_doorbell)
48 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
49 uint32_t nds, uint32_t qpn, uint32_t credits);
50 #pragma inline(tavor_qp_recv_doorbell)
51 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
52 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
53 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
54 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
55 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
56 ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
57 uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
58 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
59 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
60 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
61 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
62 tavor_qphdl_t qp);
63 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
64 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
65 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
66 uint64_t *prev, tavor_qphdl_t qp);
67 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
68 ibt_recv_wr_t *wr, uint64_t *desc);
69 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
70 tavor_srqhdl_t srq);
71 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
72 uint_t sync_to, uint_t sync_type, uint_t flag);
73 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
74 tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
75 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
77 uint_t send_or_recv);
78 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
79 tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
80 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
81 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
82 tavor_wrid_list_hdr_t *wrid_list);
83 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
84 tavor_wrid_list_hdr_t *wrid_list);
85 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
86 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
87 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
88 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
89 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
90
91 /*
92 * tavor_post_send()
93 * Context: Can be called from interrupt or base context.
94 */
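/*
 * Usage sketch (illustrative only, not part of the driver): a kernel IBTF
 * client would normally reach this routine indirectly via ibt_post_send(),
 * which dispatches through the HCA driver's CI ops table.  A minimal
 * signaled RC send might look roughly like this (the buffer, lkey, cookie,
 * and channel handle names are hypothetical):
 *
 *	ibt_send_wr_t	wr;
 *	ibt_wr_ds_t	sge;
 *	uint_t		posted;
 *
 *	sge.ds_va     = buf_va;		registered buffer (hypothetical)
 *	sge.ds_key    = buf_lkey;
 *	sge.ds_len    = buf_len;
 *	wr.wr_id      = (ibt_wrid_t)(uintptr_t)my_cookie;
 *	wr.wr_flags   = IBT_WR_SEND_SIGNAL;
 *	wr.wr_trans   = IBT_RC_SRV;
 *	wr.wr_opcode  = IBT_WRC_SEND;
 *	wr.wr_nds     = 1;
 *	wr.wr_sgl     = &sge;
 *	(void) ibt_post_send(chan_hdl, &wr, 1, &posted);
 */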
95 int
 96 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
97 ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
98 {
99 tavor_sw_wqe_dbinfo_t dbinfo;
100 tavor_wrid_list_hdr_t *wridlist;
101 tavor_wrid_entry_t *wre_last;
102 uint64_t *desc, *prev, *first;
103 uint32_t desc_sz, first_sz;
104 uint32_t wqeaddrsz, signaled_dbd;
105 uint32_t head, tail, next_tail, qsize_msk;
106 uint32_t sync_from, sync_to;
107 uint_t currindx, wrindx, numremain;
108 uint_t chainlen, chainbegin, posted_cnt;
109 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
110 int status;
111
112 /*
113 * Check for user-mappable QP memory. Note: We do not allow kernel
114 * clients to post to QP memory that is accessible directly by the
115 * user. If the QP memory is user accessible, then return an error.
116 */
117 if (qp->qp_is_umap) {
118 return (IBT_QP_HDL_INVALID);
119 }
120
121 /* Initialize posted_cnt */
122 posted_cnt = 0;
123
124 mutex_enter(&qp->qp_lock);
125
126 /*
127 * Check QP state. Can not post Send requests from the "Reset",
128 * "Init", or "RTR" states
129 */
130 if ((qp->qp_state == TAVOR_QP_RESET) ||
131 (qp->qp_state == TAVOR_QP_INIT) ||
132 (qp->qp_state == TAVOR_QP_RTR)) {
133 mutex_exit(&qp->qp_lock);
134 return (IBT_QP_STATE_INVALID);
135 }
136
137 /* Grab the lock for the WRID list */
138 mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
139 wridlist = qp->qp_sq_wqhdr->wq_wrid_post;
140
141 /* Save away some initial QP state */
142 qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
143 tail = qp->qp_sq_wqhdr->wq_tail;
144 head = qp->qp_sq_wqhdr->wq_head;
145
146 /*
147 * For each ibt_send_wr_t in the wr[] list passed in, parse the
148 * request and build a Send WQE. Note: Because we are potentially
149 * building a chain of WQEs, we want to link them all together.
150 * However, we do not want to link the first one to the previous
151 * WQE until the entire chain has been linked. Then in the last
152 * step we ring the appropriate doorbell. Note: It is possible for
153 * more Work Requests to be posted than the HW will support at one
154 * shot. If this happens, we need to be able to post and ring
 155 * several chains here until the entire request is complete.
156 */
157 wrindx = 0;
158 numremain = num_wr;
159 status = DDI_SUCCESS;
160 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
161 /*
162 * For the first WQE on a new chain we need "prev" to point
163 * to the current descriptor. As we begin to process
164 * further, "prev" will be updated to point to the previous
165 * WQE on the current chain (see below).
166 */
167 prev = TAVOR_QP_SQ_ENTRY(qp, tail);
168
169 /*
170 * Before we begin, save the current "tail index" for later
171 * DMA sync
172 */
173 sync_from = tail;
174
175 /*
176 * Break the request up into chains that are less than or
177 * equal to the maximum number of WQEs that can be posted
178 * per doorbell ring
179 */
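		/*
		 * For example (numbers are illustrative only, not taken from
		 * tavor.h): if num_wr were 600 and maxdb were 256, this loop
		 * would build and doorbell three separate chains of 256, 256,
		 * and 88 WQEs respectively.
		 */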
180 chainlen = (numremain > maxdb) ? maxdb : numremain;
181 numremain -= chainlen;
182 chainbegin = wrindx;
183 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
184 /*
185 * Check for "queue full" condition. If the queue
186 * is already full, then no more WQEs can be posted.
187 * So break out, ring a doorbell (if necessary) and
188 * return an error
189 */
190 if (qp->qp_sq_wqhdr->wq_full != 0) {
191 status = IBT_QP_FULL;
192 break;
193 }
194
195 /*
196 * Increment the "tail index" and check for "queue
197 * full" condition. If we detect that the current
198 * work request is going to fill the work queue, then
199 * we mark this condition and continue.
200 */
201 next_tail = (tail + 1) & qsize_msk;
202 if (next_tail == head) {
203 qp->qp_sq_wqhdr->wq_full = 1;
204 }
205
206 /*
207 * Get the address of the location where the next
208 * Send WQE should be built
209 */
210 desc = TAVOR_QP_SQ_ENTRY(qp, tail);
211
212 /*
213 * Call tavor_wqe_send_build() to build the WQE
214 * at the given address. This routine uses the
215 * information in the ibt_send_wr_t list (wr[]) and
216 * returns the size of the WQE when it returns.
217 */
218 status = tavor_wqe_send_build(state, qp,
219 &wr[wrindx], desc, &desc_sz);
220 if (status != DDI_SUCCESS) {
221 break;
222 }
223
224 /*
225 * Add a WRID entry to the WRID list. Need to
226 * calculate the "wqeaddrsz" and "signaled_dbd"
227 * values to pass to tavor_wrid_add_entry()
228 */
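			/*
			 * Descriptive note: "desc" is a kernel virtual
			 * address, so the QP's descriptor offset is first
			 * subtracted to yield the hardware-visible WQE
			 * address.  TAVOR_QP_WQEADDRSZ() packs that address
			 * together with the WQE size (in 16-byte chunks)
			 * into the single value that the WRID tracking code
			 * later uses to match completions back to this entry.
			 */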
229 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
230 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
231 desc_sz);
232 if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
233 (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
234 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
235 } else {
236 signaled_dbd = 0;
237 }
238 tavor_wrid_add_entry(qp->qp_sq_wqhdr,
239 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
240
241 /*
242 * If this is not the first descriptor on the current
243 * chain, then link it to the previous WQE. Otherwise,
244 * save the address and size of this descriptor (in
245 * "first" and "first_sz" respectively) and continue.
 246 * Note: Linking a WQE to the previous one will
247 * depend on whether the two WQEs are from "special
248 * QPs" (i.e. MLX transport WQEs) or whether they are
249 * normal Send WQEs.
250 */
251 if (currindx != 0) {
252 if (qp->qp_is_special) {
253 tavor_wqe_mlx_linknext(&wr[wrindx - 1],
254 desc, desc_sz, prev, NULL, qp);
255 } else {
256 tavor_wqe_send_linknext(&wr[wrindx],
257 &wr[wrindx - 1], desc, desc_sz,
258 prev, NULL, qp);
259 }
260 prev = desc;
261 } else {
262 first = desc;
263 first_sz = desc_sz;
264 }
265
266 /*
267 * Update the current "tail index" and increment
268 * "posted_cnt"
269 */
270 tail = next_tail;
271 posted_cnt++;
272 }
273
274 /*
275 * If we reach here and there are one or more WQEs which have
276 * been successfully chained together, then we need to link
277 * the current chain to the previously executing chain of
 278 * descriptors (if there is one) and ring the doorbell for the
279 * send work queue.
280 */
281 if (currindx != 0) {
282 /*
283 * Before we link the chain, we need to ensure that the
284 * "next" field on the last WQE is set to NULL (to
285 * indicate the end of the chain). Note: Just as it
286 * did above, the format for the "next" fields in a
 287 * given WQE depends on whether the WQE is MLX
288 * transport or not.
289 */
290 if (qp->qp_is_special) {
291 tavor_wqe_mlx_linknext(&wr[chainbegin +
292 currindx - 1], NULL, 0, prev, NULL, qp);
293 } else {
294 tavor_wqe_send_linknext(NULL,
295 &wr[chainbegin + currindx - 1], NULL, 0,
296 prev, NULL, qp);
297 }
298
299 /* Save away updated "tail index" for the DMA sync */
300 sync_to = tail;
301
302 /* Do a DMA sync for current send WQE(s) */
303 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
304 DDI_DMA_SYNC_FORDEV);
305
306 /*
307 * Now link the chain to the old chain (if there was
 308 * one).  Note: we still need to pay attention to whether
 309 * the QP uses MLX transport WQEs or not.
310 */
311 if (qp->qp_is_special) {
312 tavor_wqe_mlx_linknext(NULL, first, first_sz,
313 qp->qp_sq_lastwqeaddr, &dbinfo, qp);
314 } else {
315 tavor_wqe_send_linknext(&wr[chainbegin], NULL,
316 first, first_sz, qp->qp_sq_lastwqeaddr,
317 &dbinfo, qp);
318 }
319
320 /*
321 * If there was a valid previous WQE (i.e. non-NULL),
322 * then sync it too. This is because we have updated
323 * its "next" fields and we want to ensure that the
324 * hardware can see the changes.
325 */
326 if (qp->qp_sq_lastwqeaddr != NULL) {
327 sync_to = sync_from;
328 sync_from = (sync_from - 1) & qsize_msk;
329 tavor_wqe_sync(qp, sync_from, sync_to,
330 TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
331 }
332
333 /*
334 * Now if the WRID tail entry is non-NULL, then this
335 * represents the entry to which we are chaining the
336 * new entries. Since we are going to ring the
 337 * doorbell for this WQE, we want to set its "dbd" bit.
338 *
339 * On the other hand, if the tail is NULL, even though
340 * we will have rung the doorbell for the previous WQE
341 * (for the hardware's sake) it is irrelevant to our
342 * purposes (for tracking WRIDs) because we know the
343 * request must have already completed.
344 */
345 wre_last = wridlist->wl_wre_old_tail;
346 if (wre_last != NULL) {
347 wre_last->wr_signaled_dbd |=
348 TAVOR_WRID_ENTRY_DOORBELLED;
349 }
350
351 /* Update some of the state in the QP */
352 qp->qp_sq_lastwqeaddr = desc;
353 qp->qp_sq_wqhdr->wq_tail = tail;
354
355 /* Ring the doorbell */
356 tavor_qp_send_doorbell(state,
357 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
358 first_sz, qp->qp_qpnum, dbinfo.db_fence,
359 dbinfo.db_nopcode);
360 }
361 }
362
363 /*
364 * Update the "num_posted" return value (if necessary). Then drop
365 * the locks and return success.
366 */
367 if (num_posted != NULL) {
368 *num_posted = posted_cnt;
369 }
370
371 mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
372 mutex_exit(&qp->qp_lock);
373
374 return (status);
375 }
376
377
378 /*
379 * tavor_post_recv()
380 * Context: Can be called from interrupt or base context.
381 */
382 int
 383 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
384 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
385 {
386 uint64_t *desc, *prev, *first;
387 uint32_t desc_sz, first_sz;
388 uint32_t wqeaddrsz, signaled_dbd;
389 uint32_t head, tail, next_tail, qsize_msk;
390 uint32_t sync_from, sync_to;
391 uint_t currindx, wrindx, numremain;
392 uint_t chainlen, posted_cnt;
393 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
394 int status;
395
396 /*
397 * Check for user-mappable QP memory. Note: We do not allow kernel
398 * clients to post to QP memory that is accessible directly by the
399 * user. If the QP memory is user accessible, then return an error.
400 */
401 if (qp->qp_is_umap) {
402 return (IBT_QP_HDL_INVALID);
403 }
404
405 /* Initialize posted_cnt */
406 posted_cnt = 0;
407
408 mutex_enter(&qp->qp_lock);
409
410 /*
411 * Check if QP is associated with an SRQ
412 */
413 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
414 mutex_exit(&qp->qp_lock);
415 return (IBT_SRQ_IN_USE);
416 }
417
418 /*
419 * Check QP state. Can not post Recv requests from the "Reset" state
420 */
421 if (qp->qp_state == TAVOR_QP_RESET) {
422 mutex_exit(&qp->qp_lock);
423 return (IBT_QP_STATE_INVALID);
424 }
425
426 /* Grab the lock for the WRID list */
427 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
428
429 /* Save away some initial QP state */
430 qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
431 tail = qp->qp_rq_wqhdr->wq_tail;
432 head = qp->qp_rq_wqhdr->wq_head;
433
434 /*
435 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
436 * request and build a Recv WQE. Note: Because we are potentially
437 * building a chain of WQEs, we want to link them all together.
438 * However, we do not want to link the first one to the previous
439 * WQE until the entire chain has been linked. Then in the last
440 * step we ring the appropriate doorbell. Note: It is possible for
441 * more Work Requests to be posted than the HW will support at one
442 * shot. If this happens, we need to be able to post and ring
 443 * several chains here until the entire request is complete.
444 */
445 wrindx = 0;
446 numremain = num_wr;
447 status = DDI_SUCCESS;
448 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
449 /*
450 * For the first WQE on a new chain we need "prev" to point
451 * to the current descriptor. As we begin to process
452 * further, "prev" will be updated to point to the previous
453 * WQE on the current chain (see below).
454 */
455 prev = TAVOR_QP_RQ_ENTRY(qp, tail);
456
457 /*
458 * Before we begin, save the current "tail index" for later
459 * DMA sync
460 */
461 sync_from = tail;
462
463 /*
464 * Break the request up into chains that are less than or
465 * equal to the maximum number of WQEs that can be posted
466 * per doorbell ring
467 */
468 chainlen = (numremain > maxdb) ? maxdb : numremain;
469 numremain -= chainlen;
470 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
471 /*
472 * Check for "queue full" condition. If the queue
473 * is already full, then no more WQEs can be posted.
474 * So break out, ring a doorbell (if necessary) and
475 * return an error
476 */
477 if (qp->qp_rq_wqhdr->wq_full != 0) {
478 status = IBT_QP_FULL;
479 break;
480 }
481
482 /*
483 * Increment the "tail index" and check for "queue
484 * full" condition. If we detect that the current
485 * work request is going to fill the work queue, then
486 * we mark this condition and continue.
487 */
488 next_tail = (tail + 1) & qsize_msk;
489 if (next_tail == head) {
490 qp->qp_rq_wqhdr->wq_full = 1;
491 }
492
493 /*
494 * Get the address of the location where the next
495 * Recv WQE should be built
496 */
497 desc = TAVOR_QP_RQ_ENTRY(qp, tail);
498
499 /*
500 * Call tavor_wqe_recv_build() to build the WQE
501 * at the given address. This routine uses the
502 * information in the ibt_recv_wr_t list (wr[]) and
503 * returns the size of the WQE when it returns.
504 */
505 status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
506 desc, &desc_sz);
507 if (status != DDI_SUCCESS) {
508 break;
509 }
510
511 /*
512 * Add a WRID entry to the WRID list. Need to
513 * calculate the "wqeaddrsz" and "signaled_dbd"
514 * values to pass to tavor_wrid_add_entry(). Note:
515 * all Recv WQEs are essentially "signaled" and
516 * "doorbelled" (since Tavor HW requires all
517 * RecvWQE's to have their "DBD" bits set).
518 */
519 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
520 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
521 desc_sz);
522 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
523 TAVOR_WRID_ENTRY_DOORBELLED;
524 tavor_wrid_add_entry(qp->qp_rq_wqhdr,
525 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
526
527 /*
528 * If this is not the first descriptor on the current
529 * chain, then link it to the previous WQE. Otherwise,
530 * save the address and size of this descriptor (in
531 * "first" and "first_sz" respectively) and continue.
532 */
533 if (currindx != 0) {
534 tavor_wqe_recv_linknext(desc, desc_sz, prev,
535 qp);
536 prev = desc;
537 } else {
538 first = desc;
539 first_sz = desc_sz;
540 }
541
542 /*
543 * Update the current "tail index" and increment
544 * "posted_cnt"
545 */
546 tail = next_tail;
547 posted_cnt++;
548 }
549
550 /*
551 * If we reach here and there are one or more WQEs which have
552 * been successfully chained together, then we need to link
553 * the current chain to the previously executing chain of
 554 * descriptors (if there is one) and ring the doorbell for the
555 * recv work queue.
556 */
557 if (currindx != 0) {
558 /*
559 * Before we link the chain, we need to ensure that the
560 * "next" field on the last WQE is set to NULL (to
561 * indicate the end of the chain).
562 */
563 tavor_wqe_recv_linknext(NULL, 0, prev, qp);
564
565 /* Save away updated "tail index" for the DMA sync */
566 sync_to = tail;
567
568 /* Do a DMA sync for current recv WQE(s) */
569 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
570 DDI_DMA_SYNC_FORDEV);
571
572 /*
573 * Now link the chain to the old chain (if there was
 574 * one).
575 */
576 tavor_wqe_recv_linknext(first, first_sz,
577 qp->qp_rq_lastwqeaddr, qp);
578
579 /*
580 * If there was a valid previous WQE (i.e. non-NULL),
581 * then sync it too. This is because we have updated
582 * its "next" fields and we want to ensure that the
583 * hardware can see the changes.
584 */
585 if (qp->qp_rq_lastwqeaddr != NULL) {
586 sync_to = sync_from;
587 sync_from = (sync_from - 1) & qsize_msk;
588 tavor_wqe_sync(qp, sync_from, sync_to,
589 TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
590 }
591
592 /* Update some of the state in the QP */
593 qp->qp_rq_lastwqeaddr = desc;
594 qp->qp_rq_wqhdr->wq_tail = tail;
595
596 /* Ring the doorbell */
597 tavor_qp_recv_doorbell(state,
598 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
599 first_sz, qp->qp_qpnum, (chainlen % maxdb));
600 }
601 }
602
603 /*
604 * Update the "num_posted" return value (if necessary). Then drop
605 * the locks and return success.
606 */
607 if (num_posted != NULL) {
608 *num_posted = posted_cnt;
609 }
610
611 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
612 mutex_exit(&qp->qp_lock);
613
614 return (status);
615 }
616
617 /*
618 * tavor_post_srq()
619 * Context: Can be called from interrupt or base context.
620 */
621 int
 622 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
623 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
624 {
625 uint64_t *desc, *prev, *first, *last_wqe_addr;
626 uint32_t signaled_dbd;
627 uint32_t sync_indx;
628 uint_t currindx, wrindx, numremain;
629 uint_t chainlen, posted_cnt;
630 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
631 int status;
632
633 /*
634 * Check for user-mappable QP memory. Note: We do not allow kernel
635 * clients to post to QP memory that is accessible directly by the
636 * user. If the QP memory is user accessible, then return an error.
637 */
638 if (srq->srq_is_umap) {
639 return (IBT_SRQ_HDL_INVALID);
640 }
641
642 /* Initialize posted_cnt */
643 posted_cnt = 0;
644
645 mutex_enter(&srq->srq_lock);
646
647 /*
648 * Check SRQ state. Can not post Recv requests when SRQ is in error
649 */
650 if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
651 mutex_exit(&srq->srq_lock);
652 return (IBT_QP_STATE_INVALID);
653 }
654
655 /* Grab the lock for the WRID list */
656 mutex_enter(&srq->srq_wrid_wql->wql_lock);
657
658 /*
659 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
660 * request and build a Recv WQE. Note: Because we are potentially
661 * building a chain of WQEs, we want to link them all together.
662 * However, we do not want to link the first one to the previous
663 * WQE until the entire chain has been linked. Then in the last
664 * step we ring the appropriate doorbell. Note: It is possible for
665 * more Work Requests to be posted than the HW will support at one
666 * shot. If this happens, we need to be able to post and ring
 667 * several chains here until the entire request is complete.
668 */
669 wrindx = 0;
670 numremain = num_wr;
671 status = DDI_SUCCESS;
672 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
673 /*
674 * For the first WQE on a new chain we need "prev" to point
675 * to the current descriptor. As we begin to process
676 * further, "prev" will be updated to point to the previous
677 * WQE on the current chain (see below).
678 */
679 if (srq->srq_wq_lastwqeindx == -1) {
680 prev = NULL;
681 } else {
682 prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
683 }
684
685 /*
686 * Break the request up into chains that are less than or
687 * equal to the maximum number of WQEs that can be posted
688 * per doorbell ring
689 */
690 chainlen = (numremain > maxdb) ? maxdb : numremain;
691 numremain -= chainlen;
692 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
693
694 /*
695 * Check for "queue full" condition. If the queue
696 * is already full, then no more WQEs can be posted.
697 * So break out, ring a doorbell (if necessary) and
698 * return an error
699 */
700 if (srq->srq_wridlist->wl_free_list_indx == -1) {
701 status = IBT_QP_FULL;
702 break;
703 }
704
705 /*
706 * Get the address of the location where the next
707 * Recv WQE should be built
708 */
709 desc = TAVOR_SRQ_WQE_ADDR(srq,
710 srq->srq_wridlist->wl_free_list_indx);
711
712 /*
713 * Add a WRID entry to the WRID list. Need to
714 * set the "signaled_dbd" values to pass to
715 * tavor_wrid_add_entry(). Note: all Recv WQEs are
716 * essentially "signaled"
717 *
718 * The 'size' is stored at srq_alloc time, in the
719 * srq_wq_stride. This is a constant value required
720 * for SRQ.
721 */
722 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
723 tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
724 signaled_dbd);
725
726 /*
727 * Call tavor_wqe_srq_build() to build the WQE
728 * at the given address. This routine uses the
729 * information in the ibt_recv_wr_t list (wr[]) and
730 * returns the size of the WQE when it returns.
731 */
732 status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
733 desc);
734 if (status != DDI_SUCCESS) {
735 break;
736 }
737
738 /*
739 * If this is not the first descriptor on the current
740 * chain, then link it to the previous WQE. Otherwise,
741 * save the address of this descriptor (in "first") and
742 * continue.
743 */
744 if (currindx != 0) {
745 tavor_wqe_srq_linknext(desc, prev, srq);
746 sync_indx = TAVOR_SRQ_WQE_INDEX(
747 srq->srq_wq_buf, prev,
748 srq->srq_wq_log_wqesz);
749
750 /* Do a DMA sync for previous recv WQE */
751 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
752 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
753
754 prev = desc;
755 } else {
756
757 /*
758 * In this case, the last WQE on the chain is
759 * also considered 'first'. So set prev to
760 * first, here.
761 */
762 first = prev = desc;
763 }
764
765 /*
766 * Increment "posted_cnt"
767 */
768 posted_cnt++;
769 }
770
771 /*
772 * If we reach here and there are one or more WQEs which have
773 * been successfully chained together, then we need to link
774 * the current chain to the previously executing chain of
 775 * descriptors (if there is one) and ring the doorbell for the
776 * recv work queue.
777 */
778 if (currindx != 0) {
779 /*
780 * Before we link the chain, we need to ensure that the
781 * "next" field on the last WQE is set to NULL (to
782 * indicate the end of the chain).
783 */
784 tavor_wqe_srq_linknext(NULL, prev, srq);
785
786 sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
787 srq->srq_wq_log_wqesz);
788
789 /* Do a DMA sync for current recv WQE */
790 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
791 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
792
793 /*
794 * Now link the chain to the old chain (if there was
795 * one).
796 */
797 if (srq->srq_wq_lastwqeindx == -1) {
798 last_wqe_addr = NULL;
799 } else {
800 last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
801 srq->srq_wq_lastwqeindx);
802 }
803 tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
804
805 /*
806 * If there was a valid previous WQE (i.e. valid index),
807 * then sync it too. This is because we have updated
808 * its "next" fields and we want to ensure that the
809 * hardware can see the changes.
810 */
811 if (srq->srq_wq_lastwqeindx != -1) {
812 sync_indx = srq->srq_wq_lastwqeindx;
813 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
814 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
815 }
816
 817 /* Update some of the state in the SRQ */
818 srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
819 srq->srq_wq_buf, desc,
820 srq->srq_wq_log_wqesz);
821
822 /* Ring the doorbell */
823 /* SRQ needs NDS of 0 */
824 tavor_qp_recv_doorbell(state,
825 (uint32_t)((uintptr_t)first - srq->srq_desc_off),
826 0, srq->srq_srqnum, (chainlen % maxdb));
827 }
828 }
829
830 /*
831 * Update the "num_posted" return value (if necessary). Then drop
832 * the locks and return success.
833 */
834 if (num_posted != NULL) {
835 *num_posted = posted_cnt;
836 }
837
838 mutex_exit(&srq->srq_wrid_wql->wql_lock);
839 mutex_exit(&srq->srq_lock);
840
841 return (status);
842 }
843
844
845 /*
846 * tavor_qp_send_doorbell()
847 * Context: Can be called from interrupt or base context.
848 */
849 static void
 850 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
851 uint32_t qpn, uint32_t fence, uint32_t nopcode)
852 {
853 uint64_t doorbell = 0;
854
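	/*
	 * Descriptive note: a Tavor send doorbell is a single 64-bit value
	 * that tells the HCA where the first WQE of the newly linked chain
	 * lives (nda), its size in 16-byte chunks (nds), the opcode and
	 * fence setting to use, and which QP (qpn) it belongs to.
	 */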
855 /* Build the doorbell from the parameters */
856 doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
857 TAVOR_QPSNDDB_NDA_SHIFT) |
858 ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
859 ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
860 ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
861
862 /* Write the doorbell to UAR */
863 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
864 doorbell);
865 }
866
867
868 /*
869 * tavor_qp_recv_doorbell()
870 * Context: Can be called from interrupt or base context.
871 */
872 static void
 873 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
874 uint32_t qpn, uint32_t credits)
875 {
876 uint64_t doorbell = 0;
877
878 /* Build the doorbell from the parameters */
879 doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
880 TAVOR_QPRCVDB_NDA_SHIFT) |
881 ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
882 ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
883
884 /* Write the doorbell to UAR */
885 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
886 doorbell);
887 }
888
889
890 /*
891 * tavor_wqe_send_build()
892 * Context: Can be called from interrupt or base context.
893 */
894 static int
 895 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
896 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
897 {
898 tavor_hw_snd_wqe_ud_t *ud;
899 tavor_hw_snd_wqe_remaddr_t *rc;
900 tavor_hw_snd_wqe_atomic_t *at;
901 tavor_hw_snd_wqe_remaddr_t *uc;
902 tavor_hw_snd_wqe_bind_t *bn;
903 tavor_hw_wqe_sgl_t *ds;
904 ibt_wr_ds_t *sgl;
905 tavor_ahhdl_t ah;
906 uint32_t nds;
907 int i, num_ds, status;
908
909 ASSERT(MUTEX_HELD(&qp->qp_lock));
910
911 /* Initialize the information for the Data Segments */
912 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
913 sizeof (tavor_hw_snd_wqe_nextctrl_t));
914 nds = wr->wr_nds;
915 sgl = wr->wr_sgl;
916 num_ds = 0;
917
918 /*
 919 * Building a Send WQE depends first and foremost on the transport
920 * type of Work Request (i.e. UD, RC, or UC)
921 */
922 switch (wr->wr_trans) {
923 case IBT_UD_SRV:
924 /* Ensure that work request transport type matches QP type */
925 if (qp->qp_serv_type != TAVOR_QP_UD) {
926 return (IBT_QP_SRV_TYPE_INVALID);
927 }
928
929 /*
930 * Validate the operation type. For UD requests, only the
931 * "Send" operation is valid
932 */
933 if (wr->wr_opcode != IBT_WRC_SEND) {
934 return (IBT_QP_OP_TYPE_INVALID);
935 }
936
937 /*
938 * If this is a Special QP (QP0 or QP1), then we need to
939 * build MLX WQEs instead. So jump to tavor_wqe_mlx_build()
940 * and return whatever status it returns
941 */
942 if (qp->qp_is_special) {
943 status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
944 return (status);
945 }
946
947 /*
948 * Otherwise, if this is a normal UD Send request, then fill
949 * all the fields in the Tavor UD header for the WQE. Note:
950 * to do this we'll need to extract some information from the
951 * Address Handle passed with the work request.
952 */
953 ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
954 sizeof (tavor_hw_snd_wqe_nextctrl_t));
955 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
956 if (ah == NULL) {
957 return (IBT_AH_HDL_INVALID);
958 }
959
960 /*
961 * Build the Unreliable Datagram Segment for the WQE, using
962 * the information from the address handle and the work
963 * request.
964 */
965 mutex_enter(&ah->ah_lock);
966 TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
967 mutex_exit(&ah->ah_lock);
968
969 /* Update "ds" for filling in Data Segments (below) */
970 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
971 sizeof (tavor_hw_snd_wqe_ud_t));
972 break;
973
974 case IBT_RC_SRV:
975 /* Ensure that work request transport type matches QP type */
976 if (qp->qp_serv_type != TAVOR_QP_RC) {
977 return (IBT_QP_SRV_TYPE_INVALID);
978 }
979
980 /*
981 * Validate the operation type. For RC requests, we allow
982 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
983 * operations, and memory window "Bind"
984 */
985 if ((wr->wr_opcode != IBT_WRC_SEND) &&
986 (wr->wr_opcode != IBT_WRC_RDMAR) &&
987 (wr->wr_opcode != IBT_WRC_RDMAW) &&
988 (wr->wr_opcode != IBT_WRC_CSWAP) &&
989 (wr->wr_opcode != IBT_WRC_FADD) &&
990 (wr->wr_opcode != IBT_WRC_BIND)) {
991 return (IBT_QP_OP_TYPE_INVALID);
992 }
993
994 /*
995 * If this is a Send request, then all we need to do is break
 996 * out here and begin the Data Segment processing below
997 */
998 if (wr->wr_opcode == IBT_WRC_SEND) {
999 break;
1000 }
1001
1002 /*
1003 * If this is an RDMA Read or RDMA Write request, then fill
1004 * in the "Remote Address" header fields.
1005 */
1006 if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1007 (wr->wr_opcode == IBT_WRC_RDMAW)) {
1008 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1009 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1010
1011 /*
1012 * Build the Remote Address Segment for the WQE, using
1013 * the information from the RC work request.
1014 */
1015 TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1016
1017 /* Update "ds" for filling in Data Segments (below) */
1018 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1019 sizeof (tavor_hw_snd_wqe_remaddr_t));
1020 break;
1021 }
1022
1023 /*
1024 * If this is one of the Atomic type operations (i.e
1025 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1026 * Address" header fields and the "Atomic" header fields.
1027 */
1028 if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1029 (wr->wr_opcode == IBT_WRC_FADD)) {
1030 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1031 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1032 at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1033 sizeof (tavor_hw_snd_wqe_remaddr_t));
1034
1035 /*
1036 * Build the Remote Address and Atomic Segments for
1037 * the WQE, using the information from the RC Atomic
1038 * work request.
1039 */
1040 TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1041 TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1042
1043 /* Update "ds" for filling in Data Segments (below) */
1044 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1045 sizeof (tavor_hw_snd_wqe_atomic_t));
1046
1047 /*
1048 * Update "nds" and "sgl" because Atomic requests have
1049 * only a single Data Segment (and they are encoded
 1050 * somewhat differently in the work request).
1051 */
1052 nds = 1;
1053 sgl = wr->wr_sgl;
1054 break;
1055 }
1056
1057 /*
1058 * If this is memory window Bind operation, then we call the
1059 * tavor_wr_bind_check() routine to validate the request and
1060 * to generate the updated RKey. If this is successful, then
1061 * we fill in the WQE's "Bind" header fields.
1062 */
1063 if (wr->wr_opcode == IBT_WRC_BIND) {
1064 status = tavor_wr_bind_check(state, wr);
1065 if (status != DDI_SUCCESS) {
1066 return (status);
1067 }
1068
1069 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1070 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1071
1072 /*
1073 * Build the Bind Memory Window Segments for the WQE,
1074 * using the information from the RC Bind memory
1075 * window work request.
1076 */
1077 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1078
1079 /*
1080 * Update the "ds" pointer. Even though the "bind"
1081 * operation requires no SGLs, this is necessary to
1082 * facilitate the correct descriptor size calculations
1083 * (below).
1084 */
1085 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1086 sizeof (tavor_hw_snd_wqe_bind_t));
1087 nds = 0;
1088 }
1089 break;
1090
1091 case IBT_UC_SRV:
1092 /* Ensure that work request transport type matches QP type */
1093 if (qp->qp_serv_type != TAVOR_QP_UC) {
1094 return (IBT_QP_SRV_TYPE_INVALID);
1095 }
1096
1097 /*
1098 * Validate the operation type. For UC requests, we only
1099 * allow "Send", "RDMA Write", and memory window "Bind".
1100 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1101 * operations
1102 */
1103 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1104 (wr->wr_opcode != IBT_WRC_RDMAW) &&
1105 (wr->wr_opcode != IBT_WRC_BIND)) {
1106 return (IBT_QP_OP_TYPE_INVALID);
1107 }
1108
1109 /*
1110 * If this is a Send request, then all we need to do is break
 1111 * out here and begin the Data Segment processing below
1112 */
1113 if (wr->wr_opcode == IBT_WRC_SEND) {
1114 break;
1115 }
1116
1117 /*
1118 * If this is an RDMA Write request, then fill in the "Remote
1119 * Address" header fields.
1120 */
1121 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1122 uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1123 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1124
1125 /*
1126 * Build the Remote Address Segment for the WQE, using
1127 * the information from the UC work request.
1128 */
1129 TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1130
1131 /* Update "ds" for filling in Data Segments (below) */
1132 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1133 sizeof (tavor_hw_snd_wqe_remaddr_t));
1134 break;
1135 }
1136
1137 /*
1138 * If this is memory window Bind operation, then we call the
1139 * tavor_wr_bind_check() routine to validate the request and
1140 * to generate the updated RKey. If this is successful, then
1141 * we fill in the WQE's "Bind" header fields.
1142 */
1143 if (wr->wr_opcode == IBT_WRC_BIND) {
1144 status = tavor_wr_bind_check(state, wr);
1145 if (status != DDI_SUCCESS) {
1146 return (status);
1147 }
1148
1149 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1150 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1151
1152 /*
1153 * Build the Bind Memory Window Segments for the WQE,
1154 * using the information from the UC Bind memory
1155 * window work request.
1156 */
1157 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1158
1159 /*
1160 * Update the "ds" pointer. Even though the "bind"
1161 * operation requires no SGLs, this is necessary to
1162 * facilitate the correct descriptor size calculations
1163 * (below).
1164 */
1165 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1166 sizeof (tavor_hw_snd_wqe_bind_t));
1167 nds = 0;
1168 }
1169 break;
1170
1171 default:
1172 return (IBT_QP_SRV_TYPE_INVALID);
1173 }
1174
1175 /*
1176 * Now fill in the Data Segments (SGL) for the Send WQE based on
 1177 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1178 * Start by checking for a valid number of SGL entries
1179 */
1180 if (nds > qp->qp_sq_sgl) {
1181 return (IBT_QP_SGL_LEN_INVALID);
1182 }
1183
1184 /*
1185 * For each SGL in the Send Work Request, fill in the Send WQE's data
1186 * segments. Note: We skip any SGL with zero size because Tavor
1187 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1188 * the encoding for zero means a 2GB transfer. Because of this special
1189 * encoding in the hardware, we mask the requested length with
1190 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1191 * zero.)
1192 */
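	/*
	 * Worked example (illustrative): assuming TAVOR_WQE_SGL_BYTE_CNT_MASK
	 * is a 31-bit mask, a 2GB request (0x80000000 bytes) masks down to 0,
	 * which is precisely the hardware's special encoding for 2GB.  A
	 * caller-supplied zero-length SGL, on the other hand, cannot be
	 * expressed at all and is therefore skipped below.
	 */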
1193 for (i = 0; i < nds; i++) {
1194 if (sgl[i].ds_len == 0) {
1195 continue;
1196 }
1197
1198 /*
1199 * Fill in the Data Segment(s) for the current WQE, using the
1200 * information contained in the scatter-gather list of the
1201 * work request.
1202 */
1203 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1204 num_ds++;
1205 }
1206
1207 /* Return the size of descriptor (in 16-byte chunks) */
1208 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
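	/* E.g. if the data segments end 96 bytes past "desc", *size is 6. */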
1209
1210 return (DDI_SUCCESS);
1211 }
1212
1213
1214 /*
1215 * tavor_wqe_send_linknext()
1216 * Context: Can be called from interrupt or base context.
1217 */
1218 static void
1219 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1220 uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1221 tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1222 {
1223 uint64_t next, ctrl;
1224 uint32_t nopcode, fence;
1225
1226 /*
1227 * Calculate the "next" field of the descriptor. This amounts to
1228 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1229 * fields (see tavor_hw.h for more). Note: If there is no next
1230 * descriptor (i.e. if the current descriptor is the last WQE on
1231 * the chain), then set "next" to zero.
1232 */
1233 if (curr_desc != NULL) {
1234 /*
1235 * Determine the value for the Tavor WQE "nopcode" field
1236 * by using the IBTF opcode from the work request
1237 */
1238 switch (curr_wr->wr_opcode) {
1239 case IBT_WRC_RDMAW:
1240 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1241 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1242 } else {
1243 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1244 }
1245 break;
1246
1247 case IBT_WRC_SEND:
1248 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1249 nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1250 } else {
1251 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1252 }
1253 break;
1254
1255 case IBT_WRC_RDMAR:
1256 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1257 break;
1258
1259 case IBT_WRC_CSWAP:
1260 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1261 break;
1262
1263 case IBT_WRC_FADD:
1264 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1265 break;
1266
1267 case IBT_WRC_BIND:
1268 nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1269 break;
1270 }
1271
1272 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1273 - qp->qp_desc_off);
1274 next = ((uint64_t)(uintptr_t)curr_desc &
1275 TAVOR_WQE_NDA_MASK) << 32;
1276 next = next | ((uint64_t)nopcode << 32);
1277 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1278 if (fence) {
1279 next = next | TAVOR_WQE_SEND_FENCE_MASK;
1280 }
1281 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1282
1283 /*
1284 * If a send queue doorbell will be rung for the next
1285 * WQE on the chain, then set the current WQE's "dbd" bit.
1286 * Note: We also update the "dbinfo" structure here to pass
1287 * back information about what should (later) be included
1288 * in the send queue doorbell.
1289 */
1290 if (dbinfo) {
1291 next = next | TAVOR_WQE_DBD_MASK;
1292 dbinfo->db_nopcode = nopcode;
1293 dbinfo->db_fence = fence;
1294 }
1295 } else {
1296 next = 0;
1297 }
1298
1299 /*
1300 * If this WQE is supposed to be linked to the previous descriptor,
1301 * then we need to update not only the previous WQE's "next" fields
1302 * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
1303 * "s", "i" and "immediate" fields - see tavor_hw.h for more). Note:
1304 * the "e" bit is always hardcoded to zero.
1305 */
1306 if (prev_desc != NULL) {
1307 /*
1308 * If a send queue doorbell will be rung for the next WQE on
1309 * the chain, then update the current WQE's "next" field and
1310 * return.
1311 * Note: We don't want to modify the "ctrl" field here because
1312 * that portion of the previous WQE has already been set
1313 * correctly at some previous point in time.
1314 */
1315 if (dbinfo) {
1316 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1317 return;
1318 }
1319
1320 ctrl = 0;
1321
1322 /* Set the "c" (i.e. "signaled") bit appropriately */
1323 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1324 ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1325 }
1326
1327 /* Set the "s" (i.e. "solicited") bit appropriately */
1328 if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1329 ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1330 }
1331
1332 /* Set the "i" bit and the immediate data appropriately */
1333 if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1334 ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1335 ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1336 }
1337
1338 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1339 }
1340 }
1341
1342
1343 /*
1344 * tavor_wqe_mlx_build()
1345 * Context: Can be called from interrupt or base context.
1346 */
1347 static int
1348 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1349 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1350 {
1351 tavor_hw_udav_t udav;
1352 tavor_ahhdl_t ah;
1353 ib_lrh_hdr_t *lrh;
1354 ib_grh_t *grh;
1355 ib_bth_hdr_t *bth;
1356 ib_deth_hdr_t *deth;
1357 tavor_hw_wqe_sgl_t *ds;
1358 ibt_wr_ds_t *sgl;
1359 uint8_t *mgmtclass, *hpoint, *hcount;
1360 uint64_t data;
1361 uint32_t nds, offset, pktlen;
1362 uint32_t desc_sz, udav_sz;
1363 int i, num_ds;
1364
1365 ASSERT(MUTEX_HELD(&qp->qp_lock));
1366
1367 /* Initialize the information for the Data Segments */
1368 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1369 sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1370
1371 /*
1372 * Pull the address handle from the work request and read in
1373 * the contents of the UDAV. This will be used to answer some
1374 * questions about the request.
1375 */
1376 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1377 if (ah == NULL) {
1378 return (IBT_AH_HDL_INVALID);
1379 }
1380 mutex_enter(&ah->ah_lock);
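	/*
	 * Descriptive note: the UDAV is copied out 64 bits at a time through
	 * the DDI access handle, so any byte-order conversion called for by
	 * the handle's access attributes is applied to each word.
	 */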
1381 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1382 for (i = 0; i < udav_sz; i++) {
1383 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1384 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1385 ((uint64_t *)&udav)[i] = data;
1386 }
1387 mutex_exit(&ah->ah_lock);
1388
1389 /*
1390 * If the request is for QP1 and the destination LID is equal to
1391 * the Permissive LID, then return an error. This combination is
1392 * not allowed
1393 */
1394 if ((udav.rlid == IB_LID_PERMISSIVE) &&
1395 (qp->qp_is_special == TAVOR_QP_GSI)) {
1396 return (IBT_AH_HDL_INVALID);
1397 }
1398
1399 /*
1400 * Calculate the size of the packet headers, including the GRH
1401 * (if necessary)
1402 */
1403 desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1404 sizeof (ib_deth_hdr_t);
1405 if (udav.grh) {
1406 desc_sz += sizeof (ib_grh_t);
1407 }
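	/*
	 * For reference, these header sizes are fixed by the IBA spec: LRH is
	 * 8 bytes, BTH is 12, DETH is 8, and the optional GRH is 40, so at
	 * this point desc_sz is either 28 or 68 bytes.
	 */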
1408
1409 /*
1410 * Begin to build the first "inline" data segment for the packet
1411 * headers. Note: By specifying "inline" we can build the contents
 1412 * of the MAD packet headers directly into the work queue (as part of
 1413 * the descriptor). This has the advantage of both speeding things up
1414 * and of not requiring the driver to allocate/register any additional
1415 * memory for the packet headers.
1416 */
1417 TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1418 desc_sz += 4;
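	/*
	 * The 4 bytes added above account for the inline segment's own
	 * control word; the packet headers themselves are built starting at
	 * ((uintptr_t)&ds[0] + 4), as the LRH pointer below shows.
	 */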
1419
1420 /*
1421 * Build Local Route Header (LRH)
1422 * We start here by building the LRH into a temporary location.
1423 * When we have finished we copy the LRH data into the descriptor.
1424 *
1425 * Notice that the VL values are hardcoded. This is not a problem
1426 * because VL15 is decided later based on the value in the MLX
1427 * transport "next/ctrl" header (see the "vl15" bit below), and it
1428 * is otherwise (meaning for QP1) chosen from the SL-to-VL table
1429 * values. This rule does not hold for loopback packets however
1430 * (all of which bypass the SL-to-VL tables) and it is the reason
 1431 * that non-QP0 MADs are set up with VL hardcoded to zero below.
1432 *
1433 * Notice also that Source LID is hardcoded to the Permissive LID
1434 * (0xFFFF). This is also not a problem because if the Destination
1435 * LID is not the Permissive LID, then the "slr" value in the MLX
1436 * transport "next/ctrl" header will be set to zero and the hardware
 1437 * will pull the LID from the value in the port.
1438 */
1439 lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1440 pktlen = (desc_sz + 0x100) >> 2;
1441 TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1442
1443 /*
1444 * Build Global Route Header (GRH)
1445 * This is only built if necessary as defined by the "grh" bit in
1446 * the address vector. Note: We also calculate the offset to the
1447 * next header (BTH) based on whether or not the "grh" bit is set.
1448 */
1449 if (udav.grh) {
1450 /*
1451 * If the request is for QP0, then return an error. The
 1452 * combination of global routing (GRH) and QP0 is not allowed.
1453 */
1454 if (qp->qp_is_special == TAVOR_QP_SMI) {
1455 return (IBT_AH_HDL_INVALID);
1456 }
1457 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1458 TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1459
1460 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1461 } else {
1462 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1463 }
1464
1465
1466 /*
1467 * Build Base Transport Header (BTH)
1468 * Notice that the M, PadCnt, and TVer fields are all set
1469 * to zero implicitly. This is true for all Management Datagrams
 1470 * (MADs), whether GSI or SMI.
1471 */
1472 TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1473
1474 /*
1475 * Build Datagram Extended Transport Header (DETH)
1476 */
1477 deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1478 TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1479
1480 /* Ensure that the Data Segment is aligned on a 16-byte boundary */
1481 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1482 ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1483 nds = wr->wr_nds;
1484 sgl = wr->wr_sgl;
1485 num_ds = 0;
1486
1487 /*
1488 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
 1489 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1490 * Start by checking for a valid number of SGL entries
1491 */
1492 if (nds > qp->qp_sq_sgl) {
1493 return (IBT_QP_SGL_LEN_INVALID);
1494 }
1495
1496 /*
1497 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1498 * segments. Note: We skip any SGL with zero size because Tavor
1499 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1500 * the encoding for zero means a 2GB transfer. Because of this special
1501 * encoding in the hardware, we mask the requested length with
1502 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1503 * zero.)
1504 */
1505 mgmtclass = hpoint = hcount = NULL;
1506 offset = 0;
1507 for (i = 0; i < nds; i++) {
1508 if (sgl[i].ds_len == 0) {
1509 continue;
1510 }
1511
1512 /*
1513 * Fill in the Data Segment(s) for the MLX send WQE, using
1514 * the information contained in the scatter-gather list of
1515 * the work request.
1516 */
1517 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1518
1519 /*
1520 * Search through the contents of all MADs posted to QP0 to
1521 * initialize pointers to the places where Directed Route "hop
1522 * pointer", "hop count", and "mgmtclass" would be. Tavor
1523 * needs these updated (i.e. incremented or decremented, as
1524 * necessary) by software.
1525 */
1526 if (qp->qp_is_special == TAVOR_QP_SMI) {
1527
1528 TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1529 offset, sgl[i].ds_va, sgl[i].ds_len);
1530
1531 TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1532 offset, sgl[i].ds_va, sgl[i].ds_len);
1533
1534 TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1535 offset, sgl[i].ds_va, sgl[i].ds_len);
1536
1537 offset += sgl[i].ds_len;
1538 }
1539 num_ds++;
1540 }
1541
1542 /*
1543 * Tavor's Directed Route MADs need to have the "hop pointer"
1544 * incremented/decremented (as necessary) depending on whether it is
1545 * currently less than or greater than the "hop count" (i.e. whether
1546 * the MAD is a request or a response.)
1547 */
1548 if (qp->qp_is_special == TAVOR_QP_SMI) {
1549 TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1550 *hpoint, *hcount);
1551 }
1552
1553 /*
1554 * Now fill in the ICRC Data Segment. This data segment is inlined
 1555 * just like the packet headers above, but it is only four bytes and
 1556 * set to zero (to indicate that we wish the hardware to generate ICRC).
1557 */
1558 TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1559 num_ds++;
1560
1561 /* Return the size of descriptor (in 16-byte chunks) */
1562 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1563
1564 return (DDI_SUCCESS);
1565 }
1566
1567
1568 /*
1569 * tavor_wqe_mlx_linknext()
1570 * Context: Can be called from interrupt or base context.
1571 */
1572 static void
1573 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1574 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1575 tavor_qphdl_t qp)
1576 {
1577 tavor_hw_udav_t udav;
1578 tavor_ahhdl_t ah;
1579 uint64_t next, ctrl, data;
1580 uint_t nopcode;
1581 uint_t udav_sz;
1582 int i;
1583
1584 /*
1585 * Calculate the "next" field of the descriptor. This amounts to
1586 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1587 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1588 * if the current descriptor is the last WQE on the chain), then set
1589 * "next" to zero.
1590 */
1591 if (curr_desc != NULL) {
1592 /*
1593 * The only valid Tavor WQE "nopcode" for MLX transport
1594 * requests is the "Send" code.
1595 */
1596 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1597 curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1598 (uintptr_t)curr_desc - qp->qp_desc_off);
1599 next = (uint64_t)((uintptr_t)curr_desc &
1600 TAVOR_WQE_NDA_MASK) << 32;
1601 next = next | ((uint64_t)nopcode << 32);
1602 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1603
1604 /*
1605 * If a send queue doorbell will be rung for the next
1606 * WQE on the chain, then set the current WQE's "dbd" bit.
1607 * Note: We also update the "dbinfo" structure here to pass
1608 * back information about what should (later) be included
1609 * in the send queue doorbell.
1610 */
1611 if (dbinfo) {
1612 next = next | TAVOR_WQE_DBD_MASK;
1613 dbinfo->db_nopcode = nopcode;
1614 dbinfo->db_fence = 0;
1615 }
1616 } else {
1617 next = 0;
1618 }
1619
1620 /*
1621 * If this WQE is supposed to be linked to the previous descriptor,
1622 * then we need to update not only the previous WQE's "next" fields
1623 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1624 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1625 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1626 * always hardcoded to zero.
1627 */
1628 if (prev_desc != NULL) {
1629 /*
1630 * If a send queue doorbell will be rung for the next WQE on
1631 * the chain, then update the current WQE's "next" field and
1632 * return.
1633 * Note: We don't want to modify the "ctrl" field here because
1634 * that portion of the previous WQE has already been set
1635 * correctly at some previous point in time.
1636 */
1637 if (dbinfo) {
1638 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1639 return;
1640 }
1641
1642 /*
1643 * Pull the address handle from the work request and read in
1644 * the contents of the UDAV. This will be used to answer some
1645 * questions about the request.
1646 */
1647 ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1648 mutex_enter(&ah->ah_lock);
1649 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1650 for (i = 0; i < udav_sz; i++) {
1651 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1652 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1653 ((uint64_t *)&udav)[i] = data;
1654 }
1655 mutex_exit(&ah->ah_lock);
1656
1657 ctrl = 0;
1658
1659 /* Only QP0 uses VL15, otherwise use VL in the packet */
1660 if (qp->qp_is_special == TAVOR_QP_SMI) {
1661 ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1662 }
1663
1664 /*
1665 * The SLR (Source LID Replace) bit determines whether the
1666 * source LID for an outgoing MLX packet should come from the
1667 * PortInfo (SLR = 0) or should be left as it is in the
1668 * descriptor (SLR = 1). The latter is necessary for packets
1669 * to be sent with the Permissive LID.
1670 */
1671 if (udav.rlid == IB_LID_PERMISSIVE) {
1672 ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1673 }
1674
1675 /* Fill in the max static rate from the address handle */
1676 ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1677 TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1678
1679 /* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1680 if (qp->qp_is_special != TAVOR_QP_SMI) {
1681 ctrl = ctrl | ((uint64_t)udav.sl <<
1682 TAVOR_WQE_MLXHDR_SL_SHIFT);
1683 }
1684
1685 /* Set the "c" (i.e. "signaled") bit appropriately */
1686 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1687 ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1688 }
1689
1690 /* Fill in the destination LID from the address handle */
1691 ctrl = ctrl | ((uint64_t)udav.rlid <<
1692 TAVOR_WQE_MLXHDR_RLID_SHIFT);
1693
1694 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1695 }
1696 }
1697
1698
1699 /*
1700 * tavor_wqe_recv_build()
1701 * Context: Can be called from interrupt or base context.
1702 */
1703 /* ARGSUSED */
1704 static int
1705 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1706 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1707 {
1708 tavor_hw_wqe_sgl_t *ds;
1709 int i, num_ds;
1710
1711 ASSERT(MUTEX_HELD(&qp->qp_lock));
1712
1713 /* Check that work request transport type is valid */
1714 if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1715 (qp->qp_serv_type != TAVOR_QP_RC) &&
1716 (qp->qp_serv_type != TAVOR_QP_UC)) {
1717 return (IBT_QP_SRV_TYPE_INVALID);
1718 }
1719
1720 /* Fill in the Data Segments (SGL) for the Recv WQE */
1721 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1722 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1723 num_ds = 0;
1724
1725 /* Check for valid number of SGL entries */
1726 if (wr->wr_nds > qp->qp_rq_sgl) {
1727 return (IBT_QP_SGL_LEN_INVALID);
1728 }
1729
1730 /*
1731 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1732 * segments. Note: We skip any SGL with zero size because Tavor
1733 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1734 * the encoding for zero means a 2GB transfer. Because of this special
1735 * encoding in the hardware, we mask the requested length with
1736 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1737 * zero.)
1738 */
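/*
 * Illustrative example (assuming TAVOR_WQE_SGL_BYTE_CNT_MASK covers only
 * the low-order length bits): a requested ds_len of exactly 2GB
 * (0x80000000) is masked down to a "byte_cnt" of zero, which the hardware
 * then interprets as a 2GB transfer, while a ds_len of zero is simply
 * skipped by the loop below.
 */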
1739 for (i = 0; i < wr->wr_nds; i++) {
1740 if (wr->wr_sgl[i].ds_len == 0) {
1741 continue;
1742 }
1743
1744 /*
1745 * Fill in the Data Segment(s) for the receive WQE, using the
1746 * information contained in the scatter-gather list of the
1747 * work request.
1748 */
1749 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1750 num_ds++;
1751 }
1752
1753 /* Return the size of descriptor (in 16-byte chunks) */
1754 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1755
1756 return (DDI_SUCCESS);
1757 }
1758
1759
1760 /*
1761 * tavor_wqe_recv_linknext()
1762 * Context: Can be called from interrupt or base context.
1763 */
1764 static void
1765 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1766 uint64_t *prev_desc, tavor_qphdl_t qp)
1767 {
1768 uint64_t next;
1769
1770 /*
1771 * Calculate the "next" field of the descriptor. This amounts to
1772 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1773 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1774 * if the current descriptor is the last WQE on the chain), then set
1775 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
1776 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1777 * In either case, we must add a single bit in the "reserved" field
1778 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
1779 * workaround for a known Tavor errata that can cause Recv WQEs with
1780 * zero in the NDA field to behave improperly.
1781 */
1782 if (curr_desc != NULL) {
1783 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1784 qp->qp_desc_off);
1785 next = (uint64_t)((uintptr_t)curr_desc &
1786 TAVOR_WQE_NDA_MASK) << 32;
1787 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1788 TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1789 } else {
1790 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1791 }
1792
1793 /*
1794 * If this WQE is supposed to be linked to the previous descriptor,
1795 * then we need to update not only the previous WQE's "next" fields
1796 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1797 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
1798 * bits are always hardcoded to zero.
1799 */
1800 if (prev_desc != NULL) {
1801 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1802 }
1803 }
1804
1805
1806 /*
1807 * tavor_wqe_srq_build()
1808 * Context: Can be called from interrupt or base context.
1809 */
1810 /* ARGSUSED */
1811 static int
1812 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1813 ibt_recv_wr_t *wr, uint64_t *desc)
1814 {
1815 tavor_hw_wqe_sgl_t *ds;
1816 ibt_wr_ds_t end_sgl;
1817 int i, num_ds;
1818
1819 ASSERT(MUTEX_HELD(&srq->srq_lock));
1820
1821 /* Fill in the Data Segments (SGL) for the Recv WQE */
1822 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1823 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1824 num_ds = 0;
1825
1826 /* Check for valid number of SGL entries */
1827 if (wr->wr_nds > srq->srq_wq_sgl) {
1828 return (IBT_QP_SGL_LEN_INVALID);
1829 }
1830
1831 /*
1832 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1833 * segments. Note: We skip any SGL with zero size because Tavor
1834 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1835 * the encoding for zero means a 2GB transfer. Because of this special
1836 * encoding in the hardware, we mask the requested length with
1837 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1838 * zero.)
1839 */
1840 for (i = 0; i < wr->wr_nds; i++) {
1841 if (wr->wr_sgl[i].ds_len == 0) {
1842 continue;
1843 }
1844
1845 /*
1846 * Fill in the Data Segment(s) for the receive WQE, using the
1847 * information contained in the scatter-gather list of the
1848 * work request.
1849 */
1850 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1851 num_ds++;
1852 }
1853
1854 /*
1855 * For SRQ, if the number of data segments is less than the maximum
1856 * specified at alloc, then we have to fill in a special "key" entry in
1857 * the sgl entry after the last valid one in this post request. We do
1858 * that here.
1859 */
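/*
 * Illustrative example: with srq_wq_sgl == 4 and a post request carrying
 * two non-zero SGL entries, ds[0] and ds[1] are filled in above and ds[2]
 * is written here as the {ds_va = 0, ds_len = 0, ds_key = 0x1} terminator
 * entry.
 */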
1860 if (num_ds < srq->srq_wq_sgl) {
1861 end_sgl.ds_va = 0;
1862 end_sgl.ds_len = 0;
1863 end_sgl.ds_key = 0x1;
1864 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1865 }
1866
1867 return (DDI_SUCCESS);
1868 }
1869
1870
1871 /*
1872 * tavor_wqe_srq_linknext()
1873 * Context: Can be called from interrupt or base context.
1874 */
1875 static void
1876 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1877 tavor_srqhdl_t srq)
1878 {
1879 uint64_t next;
1880
1881 /*
1882 * Calculate the "next" field of the descriptor. This amounts to
1883 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1884 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1885 * if the current descriptor is the last WQE on the chain), then set
1886 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
1887 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1888 * In either case, we must add a single bit in the "reserved" field
1889 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
1890 * workaround for a known Tavor errata that can cause Recv WQEs with
1891 * zero in the NDA field to behave improperly.
1892 */
1893 if (curr_desc != NULL) {
1894 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1895 srq->srq_desc_off);
1896 next = (uint64_t)((uintptr_t)curr_desc &
1897 TAVOR_WQE_NDA_MASK) << 32;
1898 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1899 } else {
1900 next = TAVOR_RCV_WQE_NDA0_WA_MASK;
1901 }
1902
1903 /*
1904 * If this WQE is supposed to be linked to the previous descriptor,
1905 * then we need to update not only the previous WQE's "next" fields
1906 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1907 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
1908 * bits are always hardcoded to zero.
1909 */
1910 if (prev_desc != NULL) {
1911 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
1912 }
1913 }
1914
1915
1916 /*
1917 * tavor_wr_get_immediate()
1918 * Context: Can be called from interrupt or base context.
1919 */
1920 static uint32_t
1921 tavor_wr_get_immediate(ibt_send_wr_t *wr)
1922 {
1923 /*
1924 * This routine extracts the "immediate data" from the appropriate
1925 * location in the IBTF work request. Because of the way the
1926 * work request structure is defined, the location for this data
1927 * depends on the actual work request operation type.
1928 */
1929
1930 /* For RDMA Write, test if RC or UC */
1931 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1932 if (wr->wr_trans == IBT_RC_SRV) {
1933 return (wr->wr.rc.rcwr.rdma.rdma_immed);
1934 } else { /* IBT_UC_SRV */
1935 return (wr->wr.uc.ucwr.rdma.rdma_immed);
1936 }
1937 }
1938
1939 /* For Send, test if RC, UD, or UC */
1940 if (wr->wr_opcode == IBT_WRC_SEND) {
1941 if (wr->wr_trans == IBT_RC_SRV) {
1942 return (wr->wr.rc.rcwr.send_immed);
1943 } else if (wr->wr_trans == IBT_UD_SRV) {
1944 return (wr->wr.ud.udwr_immed);
1945 } else { /* IBT_UC_SRV */
1946 return (wr->wr.uc.ucwr.send_immed);
1947 }
1948 }
1949
1950 /*
1951 * If any other type of request, then immediate is undefined
1952 */
1953 return (0);
1954 }
1955
1956
1957 /*
1958 * tavor_wqe_sync()
1959 * Context: Can be called from interrupt or base context.
1960 */
1961 static void
1962 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1963 uint_t sync_type, uint_t flag)
1964 {
1965 tavor_qphdl_t qp;
1966 tavor_srqhdl_t srq;
1967 uint_t is_sync_req;
1968 uint64_t *wqe_from, *wqe_to, *wqe_base, *wqe_top;
1969 ddi_dma_handle_t dmahdl;
1970 off_t offset;
1971 size_t length;
1972 uint32_t qsize;
1973 int status;
1974
1975 if (sync_type == TAVOR_WR_SRQ) {
1976 srq = (tavor_srqhdl_t)hdl;
1977 is_sync_req = srq->srq_sync;
1978 /* Get the DMA handle from SRQ context */
1979 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
1980 } else {
1981 qp = (tavor_qphdl_t)hdl;
1982 is_sync_req = qp->qp_sync;
1983 /* Get the DMA handle from QP context */
1984 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
1985 }
1986
1987 /* Determine if the work queues need to be synced or not */
1988 if (is_sync_req == 0) {
1989 return;
1990 }
1991
1992 /*
1993 * Depending on the type of the work queue, we grab information
1994 * about the address ranges we need to DMA sync.
1995 */
1996 if (sync_type == TAVOR_WR_SEND) {
1997 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
1998 wqe_to = TAVOR_QP_SQ_ENTRY(qp, sync_to);
1999 qsize = qp->qp_sq_bufsz;
2000
2001 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2002 wqe_top = TAVOR_QP_SQ_ENTRY(qp, qsize);
2003 } else if (sync_type == TAVOR_WR_RECV) {
2004 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2005 wqe_to = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2006 qsize = qp->qp_rq_bufsz;
2007
2008 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2009 wqe_top = TAVOR_QP_RQ_ENTRY(qp, qsize);
2010 } else {
2011 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2012 wqe_to = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2013 qsize = srq->srq_wq_bufsz;
2014
2015 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2016 wqe_top = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2017 }
2018
2019 /*
2020 * There are two possible cases for the beginning and end of the WQE
2021 * chain we are trying to sync. Either this is the simple case, where
2022 * the end of the chain is below the beginning of the chain, or it is
2023 * the "wrap-around" case, where the end of the chain has wrapped over
2024 * the end of the queue. In the former case, we simply need to
2025 * calculate the span from beginning to end and sync it. In the latter
2026 * case, however, we need to calculate the span from the top of the
2027 * work queue to the end of the chain and sync that, and then we need
2028 * to find the other portion (from beginning of chain to end of queue)
2029 * and sync that as well. Note: if the "top to end" span is actually
2030 * zero length, then we don't do a DMA sync because a zero length DMA
2031 * sync unnecessarily syncs the entire work queue.
2032 */
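/*
 * Worked example (illustrative): for a queue of 8 WQEs with
 * sync_from == 6 and sync_to == 2, the chain has wrapped, so we first
 * sync the span from the top of the queue up to (but not including)
 * entry 2, and then the span from entry 6 through the bottom of the
 * queue.  With sync_from == 2 and sync_to == 6 there is no wrap, and
 * only the single span from entry 2 up to entry 6 is synced.
 */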
2033 if (wqe_to > wqe_from) {
2034 /* "From Beginning to End" */
2035 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2036 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2037
2038 status = ddi_dma_sync(dmahdl, offset, length, flag);
2039 if (status != DDI_SUCCESS) {
2040 return;
2041 }
2042 } else {
2043 /* "From Top to End" */
2044 offset = (off_t)0;
2045 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2046 if (length) {
2047 status = ddi_dma_sync(dmahdl, offset, length, flag);
2048 if (status != DDI_SUCCESS) {
2049 return;
2050 }
2051 }
2052
2053 /* "From Beginning to Bottom" */
2054 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2055 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2056 status = ddi_dma_sync(dmahdl, offset, length, flag);
2057 if (status != DDI_SUCCESS) {
2058 return;
2059 }
2060 }
2061 }
2062
2063
2064 /*
2065 * tavor_wr_bind_check()
2066 * Context: Can be called from interrupt or base context.
2067 */
2068 static int
2069 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2070 {
2071 ibt_bind_flags_t bind_flags;
2072 uint64_t vaddr, len;
2073 uint64_t reg_start_addr, reg_end_addr;
2074 tavor_mwhdl_t mw;
2075 tavor_mrhdl_t mr;
2076 tavor_rsrc_t *mpt;
2077 uint32_t new_rkey;
2078
2079 /* Check for a valid Memory Window handle in the WR */
2080 mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2081 if (mw == NULL) {
2082 return (IBT_MW_HDL_INVALID);
2083 }
2084
2085 /* Check for a valid Memory Region handle in the WR */
2086 mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2087 if (mr == NULL) {
2088 return (IBT_MR_HDL_INVALID);
2089 }
2090
2091 mutex_enter(&mr->mr_lock);
2092 mutex_enter(&mw->mr_lock);
2093
2094 /*
2095 * Check here to see if the memory region has already been partially
2096 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2097 * If so, this is an error, return failure.
2098 */
2099 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2100 mutex_exit(&mr->mr_lock);
2101 mutex_exit(&mw->mr_lock);
2102 return (IBT_MR_HDL_INVALID);
2103 }
2104
2105 /* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2106 if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2107 mutex_exit(&mr->mr_lock);
2108 mutex_exit(&mw->mr_lock);
2109 return (IBT_MR_RKEY_INVALID);
2110 }
2111
2112 /* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2113 if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2114 mutex_exit(&mr->mr_lock);
2115 mutex_exit(&mw->mr_lock);
2116 return (IBT_MR_LKEY_INVALID);
2117 }
2118
2119 /*
2120 * Now check for valid "vaddr" and "len". Note: We don't check the
2121 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2122 */
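/*
 * Illustrative example: a Memory Region registered at bi_addr == 0x10000
 * with bi_len == 0x1000 covers [0x10000, 0x10FFF].  A bind with
 * bind_va == 0x10800 and bind_len == 0x800 falls entirely within that
 * range and is accepted, while bind_len == 0x1000 at the same bind_va
 * would extend past 0x10FFF and fail with IBT_MR_LEN_INVALID.
 */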
2123 len = wr->wr.rc.rcwr.bind->bind_len;
2124 if (len != 0) {
2125 vaddr = wr->wr.rc.rcwr.bind->bind_va;
2126 reg_start_addr = mr->mr_bindinfo.bi_addr;
2127 reg_end_addr = mr->mr_bindinfo.bi_addr +
2128 (mr->mr_bindinfo.bi_len - 1);
2129 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2130 mutex_exit(&mr->mr_lock);
2131 mutex_exit(&mw->mr_lock);
2132 return (IBT_MR_VA_INVALID);
2133 }
2134 vaddr = (vaddr + len) - 1;
2135 if (vaddr > reg_end_addr) {
2136 mutex_exit(&mr->mr_lock);
2137 mutex_exit(&mw->mr_lock);
2138 return (IBT_MR_LEN_INVALID);
2139 }
2140 }
2141
2142 /*
2143 * Validate the bind access flags. Remote Write and Atomic access for
2144 * the Memory Window require that Local Write access be set in the
2145 * corresponding Memory Region.
2146 */
2147 bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2148 if (((bind_flags & IBT_WR_BIND_WRITE) ||
2149 (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2150 !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2151 mutex_exit(&mr->mr_lock);
2152 mutex_exit(&mw->mr_lock);
2153 return (IBT_MR_ACCESS_REQ_INVALID);
2154 }
2155
2156 /* Calculate the new RKey for the Memory Window */
2157 mpt = mw->mr_mptrsrcp;
2158 tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2159
2160 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2161 mw->mr_rkey = new_rkey;
2162
2163 mutex_exit(&mr->mr_lock);
2164 mutex_exit(&mw->mr_lock);
2165 return (DDI_SUCCESS);
2166 }
2167
2168
2169 /*
2170 * tavor_wrid_from_reset_handling()
2171 * Context: Can be called from interrupt or base context.
2172 */
2173 int
2174 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2175 {
2176 tavor_workq_hdr_t *swq, *rwq;
2177 tavor_wrid_list_hdr_t *s_wridlist, *r_wridlist;
2178 uint_t create_new_swq = 0, create_new_rwq = 0;
2179 uint_t create_wql = 0;
2180 uint_t qp_srq_en;
2181
2182 /*
2183 * For each of this QP's Work Queues, make sure we have a (properly
2184 * initialized) Work Request ID list attached to the relevant
2185 * completion queue. Grab the CQ lock(s) before manipulating the
2186 * lists.
2187 */
2188 tavor_wrid_wqhdr_lock_both(qp);
2189 swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2190 TAVOR_WR_SEND);
2191 if (swq == NULL) {
2192 /* Couldn't find matching work queue header, create it */
2193 create_new_swq = create_wql = 1;
2194 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2195 qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2196 if (swq == NULL) {
2197 /*
2198 * If we couldn't find/allocate space for the workq
2199 * header, then drop the lock(s) and return failure.
2200 */
2201 tavor_wrid_wqhdr_unlock_both(qp);
2202 return (ibc_get_ci_failure(0));
2203 }
2204 }
2205 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2206 qp->qp_sq_wqhdr = swq;
2207 swq->wq_size = qp->qp_sq_bufsz;
2208 swq->wq_head = 0;
2209 swq->wq_tail = 0;
2210 swq->wq_full = 0;
2211
2212 /*
2213 * Allocate space for the tavor_wrid_entry_t container
2214 */
2215 s_wridlist = tavor_wrid_get_list(swq->wq_size);
2216 if (s_wridlist == NULL) {
2217 /*
2218 * If we couldn't allocate space for tracking the WRID
2219 * entries, then cleanup the workq header from above (if
2220 * necessary, i.e. if we created the workq header). Then
2221 * drop the lock(s) and return failure.
2222 */
2223 if (create_new_swq) {
2224 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2225 }
2226
2227 tavor_wrid_wqhdr_unlock_both(qp);
2228 return (ibc_get_ci_failure(0));
2229 }
2230 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2231 s_wridlist->wl_wqhdr = swq;
2232
2233 /* Chain the new WRID list container to the workq hdr list */
2234 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2235 tavor_wrid_wqhdr_add(swq, s_wridlist);
2236 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2237
2238 qp_srq_en = qp->qp_srq_en;
2239
2240 #ifdef __lock_lint
2241 mutex_enter(&qp->qp_srqhdl->srq_lock);
2242 #else
2243 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2244 mutex_enter(&qp->qp_srqhdl->srq_lock);
2245 }
2246 #endif
2247 /*
2248 * Now we repeat all the above operations for the receive work queue,
2249 * or shared receive work queue.
2250 *
2251 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2252 */
2253 rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2254 TAVOR_WR_RECV);
2255 if (rwq == NULL) {
2256 create_new_rwq = create_wql = 1;
2257
2258 /*
2259 * If this QP is associated with an SRQ, and this isn't the
2260 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2261 * created. Since the WQL is created at 'wqhdr_create' time we
2262 * pass the 'create_wql' flag in here as 0 if we have
2263 * already created it. Later on below we then set up the WQL
2264 * and rwq information based on the existing SRQ info.
2265 */
2266 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2267 qp->qp_srqhdl->srq_wrid_wql != NULL) {
2268 create_wql = 0;
2269 }
2270
2271 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2272 qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2273 if (rwq == NULL) {
2274 /*
2275 * If we couldn't find/allocate space for the workq
2276 * header, then free all the send queue resources we
2277 * just allocated and setup (above), drop the lock(s)
2278 * and return failure.
2279 */
2280 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2281 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2282 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2283 if (create_new_swq) {
2284 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2285 swq);
2286 }
2287
2288 #ifdef __lock_lint
2289 mutex_exit(&qp->qp_srqhdl->srq_lock);
2290 #else
2291 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2292 mutex_exit(&qp->qp_srqhdl->srq_lock);
2293 }
2294 #endif
2295
2296 tavor_wrid_wqhdr_unlock_both(qp);
2297 return (ibc_get_ci_failure(0));
2298 }
2299 }
2300 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2301
2302 /*
2303 * Setup receive workq hdr
2304 *
2305 * If the QP is on an SRQ, we set up the SRQ-specific fields:
2306 * keeping a copy of the rwq pointer, setting the rwq bufsize
2307 * appropriately, and initializing our part of the WQLock.
2308 *
2309 * In the normal QP case, the QP recv queue bufsize is used.
2310 */
2311 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2312 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2313 if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2314 qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2315 } else {
2316 rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2317 }
2318 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2319
2320 } else {
2321 rwq->wq_size = qp->qp_rq_bufsz;
2322 }
2323
2324 qp->qp_rq_wqhdr = rwq;
2325 rwq->wq_head = 0;
2326 rwq->wq_tail = 0;
2327 rwq->wq_full = 0;
2328
2329 /*
2330 * Allocate space for the tavor_wrid_entry_t container.
2331 *
2332 * If the QP is on an SRQ and the srq_wridlist is NULL, then we must
2333 * allocate the wridlist normally. However, if the srq_wridlist is !=
2334 * NULL, then we know this SRQ has already been initialized, thus the
2335 * wridlist has already been initialized. So we re-use the
2336 * srq_wridlist as the r_wridlist for this QP in this case.
2337 */
2338 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2339 qp->qp_srqhdl->srq_wridlist != NULL) {
2340 /* Use existing srq_wridlist pointer */
2341 r_wridlist = qp->qp_srqhdl->srq_wridlist;
2342 ASSERT(r_wridlist != NULL);
2343 } else {
2344 /* Allocate memory for the r_wridlist */
2345 r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2346 }
2347
2348 /*
2349 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2350 * is mistakenly NULL), we cleanup our previous swq allocation from
2351 * above
2352 */
2353 if (r_wridlist == NULL) {
2354 /*
2355 * If we couldn't allocate space for tracking the WRID
2356 * entries, then cleanup all the stuff from above. Then
2357 * drop the lock(s) and return failure.
2358 */
2359 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2360 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2361 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2362 if (create_new_swq) {
2363 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2364 }
2365 if (create_new_rwq) {
2366 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2367 }
2368
2369 #ifdef __lock_lint
2370 mutex_exit(&qp->qp_srqhdl->srq_lock);
2371 #else
2372 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2373 mutex_exit(&qp->qp_srqhdl->srq_lock);
2374 }
2375 #endif
2376
2377 tavor_wrid_wqhdr_unlock_both(qp);
2378 return (ibc_get_ci_failure(0));
2379 }
2380 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2381
2382 /*
2383 * Initialize the wridlist
2384 *
2385 * In the normal QP case, there is no special initialization needed.
2386 * We simply setup the wridlist backpointer to be the receive wqhdr
2387 * (rwq).
2388 *
2389 * But in the SRQ case, there is no backpointer to the wqhdr possible.
2390 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2391 * and thus potentially shared across multiple QPs with the SRQ. We
2392 * also setup the srq_wridlist pointer to be the r_wridlist, and
2393 * intialize the freelist to an invalid index. This srq_wridlist
2394 * pointer is used above on future moves from_reset to let us know that
2395 * the srq_wridlist has been initialized already.
2396 *
2397 * And finally, if we are in a non-UMAP case, we setup the srq wrid
2398 * free list.
2399 */
2400 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2401 qp->qp_srqhdl->srq_wridlist == NULL) {
2402 r_wridlist->wl_srq_en = 1;
2403 r_wridlist->wl_free_list_indx = -1;
2404 qp->qp_srqhdl->srq_wridlist = r_wridlist;
2405
2406 /* Initialize srq wrid free list */
2407 if (qp->qp_srqhdl->srq_is_umap == 0) {
2408 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2409 tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2410 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2411 }
2412 } else {
2413 r_wridlist->wl_wqhdr = rwq;
2414 }
2415
2416 /* Chain the WRID list "container" to the workq hdr list */
2417 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2418 tavor_wrid_wqhdr_add(rwq, r_wridlist);
2419 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2420
2421 #ifdef __lock_lint
2422 mutex_exit(&qp->qp_srqhdl->srq_lock);
2423 #else
2424 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2425 mutex_exit(&qp->qp_srqhdl->srq_lock);
2426 }
2427 #endif
2428
2429 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2430 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2431 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2432 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2433
2434 tavor_wrid_wqhdr_unlock_both(qp);
2435 return (DDI_SUCCESS);
2436 }
2437
2438
2439 /*
2440 * tavor_wrid_to_reset_handling()
2441 * Context: Can be called from interrupt or base context.
2442 */
2443 void
2444 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2445 {
2446 uint_t free_wqhdr = 0;
2447
2448 /*
2449 * For each of this QP's Work Queues, move the WRID "container" to
2450 * the "reapable" list. Although there may still be unpolled
2451 * entries in these containers, it is not a big deal. We will not
2452 * reap the list until either the Poll CQ command detects an empty
2453 * condition or the CQ itself is freed. Grab the CQ lock(s) before
2454 * manipulating the lists.
2455 */
2456 mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2457 tavor_wrid_wqhdr_lock_both(qp);
2458 tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2459
2460 /*
2461 * Add the receive work queue header on to the reaplist. But if we are
2462 * on SRQ, then don't add anything to the reaplist. Instead we flush
2463 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2464 * WQHDR (if needed). We must hold the WQL for these operations, yet
2465 * the call to tavor_cq_wqhdr_remove grabs the WQL internally. So we
2466 * drop WQL before that call. Then release the CQ WQHDR locks and the
2467 * CQ lock and return.
2468 */
2469 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2470
2471 /*
2472 * Pull off all (if any) entries for this QP from CQ. This
2473 * only includes entries that have not yet been polled
2474 */
2475 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2476 tavor_cq_srq_entries_flush(state, qp);
2477
2478 /* Remove wridlist from WQHDR */
2479 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2480 qp->qp_rq_wqhdr->wq_wrid_post);
2481
2482 /* If wridlist chain is now empty, remove the wqhdr as well */
2483 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2484 free_wqhdr = 1;
2485 } else {
2486 free_wqhdr = 0;
2487 }
2488
2489 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2490
2491 /* Free the WQHDR */
2492 if (free_wqhdr) {
2493 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2494 }
2495 } else {
2496 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2497 }
2498 tavor_wrid_wqhdr_unlock_both(qp);
2499 mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2500 }
2501
2502
2503 /*
2504 * tavor_wrid_add_entry()
2505 * Context: Can be called from interrupt or base context.
2506 */
2507 void
2508 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2509 uint_t signaled_dbd)
2510 {
2511 tavor_wrid_entry_t *wre_tmp;
2512 uint32_t head, tail, size;
2513
2514 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2515
2516 /*
2517 * Find the entry in the container pointed to by the "tail" index.
2518 * Add all of the relevant information to that entry, including WRID,
2519 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2520 * and/or doorbelled.
2521 */
2522 head = wq->wq_wrid_post->wl_head;
2523 tail = wq->wq_wrid_post->wl_tail;
2524 size = wq->wq_wrid_post->wl_size;
2525 wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2526 wre_tmp->wr_wrid = wrid;
2527 wre_tmp->wr_wqeaddrsz = wqeaddrsz;
2528 wre_tmp->wr_signaled_dbd = signaled_dbd;
2529
2530 /*
2531 * Update the "wrid_old_tail" pointer to point to the entry we just
2532 * inserted into the queue. By tracking this pointer (the pointer to
2533 * the most recently inserted entry) it will be possible later in the
2534 * PostSend() and PostRecv() code paths to find the entry that needs
2535 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2536 * tavor_post_send()).
2537 */
2538 wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2539
2540 /* Update the tail index */
2541 tail = ((tail + 1) & (size - 1));
2542 wq->wq_wrid_post->wl_tail = tail;
2543
2544 /*
2545 * If the "tail" index has just wrapped over into the "head" index,
2546 * then we have filled the container. We use the "full" flag to
2547 * indicate this condition and to distinguish it from the "empty"
2548 * condition (where head and tail are also equal).
2549 */
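/*
 * Illustrative example: with wl_size == 4, posting four entries in a row
 * advances "tail" back around to "head"; at that point head == tail with
 * wl_full set marks the full container, as opposed to head == tail with
 * wl_full clear, which marks the empty one.
 */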
2550 if (head == tail) {
2551 wq->wq_wrid_post->wl_full = 1;
2552 }
2553 }
2554
2555 /*
2556 * tavor_wrid_add_entry_srq()
2557 * Context: Can be called from interrupt or base context
2558 */
2559 void
2560 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2561 {
2562 tavor_wrid_entry_t *wre;
2563 uint64_t *wl_wqe;
2564 uint32_t wqe_index;
2565
2566 /*
2567 * Find the next available WQE from the SRQ free_list. Then update the
2568 * free_list to point to the next entry
2569 */
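/*
 * Illustrative example: if wl_free_list_indx is currently 5, the caller's
 * WRID is recorded in wl_wre[5] below, and the index previously stashed
 * in WQE 5 (by tavor_wrid_list_srq_init() or by a completion handled in
 * tavor_wrid_find_match_srq()) becomes the new head of the free list.
 */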
2570 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2571
2572 wqe_index = srq->srq_wridlist->wl_free_list_indx;
2573
2574 /* ASSERT on impossible wqe_index values */
2575 ASSERT(wqe_index < srq->srq_wq_bufsz);
2576
2577 /*
2578 * Setup the WRE.
2579 *
2580 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2581 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2582 * this information and associate the WRID to the WQE found on the CQE.
2583 */
2584 wre = &srq->srq_wridlist->wl_wre[wqe_index];
2585 wre->wr_wrid = wrid;
2586 wre->wr_signaled_dbd = signaled_dbd;
2587
2588 /* Update the free list index */
2589 srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2590 srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2591 }
2592
2593
2594 /*
2595 * tavor_wrid_get_entry()
2596 * Context: Can be called from interrupt or base context.
2597 */
2598 uint64_t
2599 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2600 tavor_wrid_entry_t *wre)
2601 {
2602 tavor_workq_hdr_t *wq;
2603 tavor_wrid_entry_t *wre_tmp;
2604 uint64_t wrid;
2605 uint_t send_or_recv, qpnum, error, opcode;
2606
2607 /* Lock the list of work queues associated with this CQ */
2608 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2609
2610 /*
2611 * Determine whether this CQE is a send or receive completion (and
2612 * whether it was a "successful" completion or not)
2613 */
2614 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2615 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2616 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2617 error = 1;
2618 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2619 TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2620 } else {
2621 error = 0;
2622 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2623 }
2624
2625 /* Find the work queue for this QP number (send or receive side) */
2626 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2627 wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2628 ASSERT(wq != NULL);
2629
2630 /*
2631 * Regardless of whether the completion is the result of a "success"
2632 * or a "failure", we lock the list of "containers" and attempt to
2633 * search for the first matching completion (i.e. the first WR
2634 * with a matching WQE addr and size). Once we find it, we pull out
2635 * the "wrid" field and return it (see below). Note: One possible
2636 * future enhancement would be to enable this routine to skip over
2637 * any "unsignaled" completions to go directly to the next "signaled"
2638 * entry on success. XXX
2639 */
2640 mutex_enter(&wq->wq_wrid_wql->wql_lock);
2641 wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2642
2643 /*
2644 * If this is a "successful" completion, then we assert that this
2645 * completion must be a "signaled" completion.
2646 */
2647 ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2648
2649 /*
2650 * If the completion is a "failed" completion, then we save away the
2651 * contents of the entry (into the "wre" field passed in) for use
2652 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2653 * function to grab "wqeaddrsz" from the next entry in the container.
2654 * This is required for error processing (where updating these fields
2655 * properly is necessary to correct handling of the "error" CQE)
2656 */
2657 if (error && (wre != NULL)) {
2658 *wre = *wre_tmp;
2659 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2660 }
2661
2662 /* Pull out the WRID and return it */
2663 wrid = wre_tmp->wr_wrid;
2664
2665 mutex_exit(&wq->wq_wrid_wql->wql_lock);
2666 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2667
2668 return (wrid);
2669 }
2670
2671
2672 /*
2673 * tavor_wrid_find_match()
2674 * Context: Can be called from interrupt or base context.
2675 */
2676 static tavor_wrid_entry_t *
2677 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2678 tavor_hw_cqe_t *cqe)
2679 {
2680 tavor_wrid_entry_t *curr = NULL;
2681 tavor_wrid_list_hdr_t *container;
2682 uint32_t wqeaddr_size;
2683 uint32_t head, tail, size;
2684 int found = 0, last_container;
2685
2686 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2687
2688 /* Pull the "wqeaddrsz" information from the CQE */
2689 wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2690
2691 /*
2692 * Walk the "containers" list(s), find first WR with a matching WQE
2693 * addr. If the current "container" is not the last one on the list,
2694 * i.e. not the current one to which we are posting new WRID entries,
2695 * then we do not attempt to update the "q_head", "q_tail", and
2696 * "q_full" indicators on the main work queue header. We do, however,
2697 * update the "head" and "full" indicators on the individual containers
2698 * as we go. This is imperative because we need to be able to
2699 * determine when the current container has been emptied (so that we
2700 * can move on to the next container).
2701 */
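/*
 * Illustrative note: older containers can still be chained ahead of the
 * current "post" container here (e.g. after a QP has been moved to the
 * reset state and back, leaving its previous container with unpolled
 * entries on the "poll" side of the chain), which is why the walk below
 * may visit more than one container before finding the match.
 */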
2702 container = wq->wq_wrid_poll;
2703 while (container != NULL) {
2704 /* Is this the last/only "container" on the list */
2705 last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2706
2707 /*
2708 * First check if we are on an SRQ. If so, we grab the entry
2709 * and break out. Since SRQ wridlist's are never added to
2710 * reaplist, they can only be the last container.
2711 */
2712 if (container->wl_srq_en) {
2713 ASSERT(last_container == 1);
2714 curr = tavor_wrid_find_match_srq(container, cq, cqe);
2715 break;
2716 }
2717
2718 /*
2719 * Grab the current "head", "tail" and "size" fields before
2720 * walking the list in the current container. Note: the "size"
2721 * field here must always be a power-of-2. The "full"
2722 * parameter is checked (and updated) here to distinguish the
2723 * "queue full" condition from "queue empty".
2724 */
2725 head = container->wl_head;
2726 tail = container->wl_tail;
2727 size = container->wl_size;
2728 while ((head != tail) || (container->wl_full)) {
2729 container->wl_full = 0;
2730 curr = &container->wl_wre[head];
2731 head = ((head + 1) & (size - 1));
2732
2733 /*
2734 * If the current entry's "wqeaddrsz" matches the one
2735 * we're searching for, then this must correspond to
2736 * the work request that caused the completion. Set
2737 * the "found" flag and bail out.
2738 */
2739 if (curr->wr_wqeaddrsz == wqeaddr_size) {
2740 found = 1;
2741 break;
2742 }
2743 }
2744
2745 /*
2746 * If the current container is empty (having reached here the
2747 * "head == tail" condition can only mean that the container
2748 * is empty), then NULL out the "wrid_old_tail" field (see
2749 * tavor_post_send() and tavor_post_recv() for more details)
2750 * and (potentially) remove the current container from future
2751 * searches.
2752 */
2753 if (head == tail) {
2754
2755 container->wl_wre_old_tail = NULL;
2756 /*
2757 * If this wasn't the last "container" on the chain,
2758 * i.e. the one to which new WRID entries will be
2759 * added, then remove it from the list.
2760 * Note: we don't "lose" the memory pointed to by this
2761 * because we should have already put this container
2762 * on the "reapable" list (from where it will later be
2763 * pulled).
2764 */
2765 if (!last_container) {
2766 wq->wq_wrid_poll = container->wl_next;
2767 }
2768 }
2769
2770 /* Update the head index for the container */
2771 container->wl_head = head;
2772
2773 /*
2774 * If the entry was found in this container, then continue to
2775 * bail out. Else reset the "curr" pointer and move on to the
2776 * next container (if there is one). Note: the only real
2777 * reason for setting "curr = NULL" here is so that the ASSERT
2778 * below can catch the case where no matching entry was found
2779 * on any of the lists.
2780 */
2781 if (found) {
2782 break;
2783 } else {
2784 curr = NULL;
2785 container = container->wl_next;
2786 }
2787 }
2788
2789 /*
2790 * Update work queue header's "head" and "full" conditions to match
2791 * the last entry on the container list. (Note: Only if we're pulling
2792 * entries from the last work queue portion of the list, i.e. not from
2793 * the previous portions that may be the "reapable" list.)
2794 */
2795 if (last_container) {
2796 wq->wq_head = wq->wq_wrid_post->wl_head;
2797 wq->wq_full = wq->wq_wrid_post->wl_full;
2798 }
2799
2800 /* Ensure that we've actually found what we were searching for */
2801 ASSERT(curr != NULL);
2802
2803 return (curr);
2804 }
2805
2806
2807 /*
2808 * tavor_wrid_find_match_srq()
2809 * Context: Can be called from interrupt or base context.
2810 */
2811 tavor_wrid_entry_t *
2812 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2813 tavor_hw_cqe_t *cqe)
2814 {
2815 tavor_wrid_entry_t *wre;
2816 uint64_t *wl_wqe;
2817 uint32_t wqe_index;
2818 uint64_t wqe_addr;
2819 uint32_t cqe_wqe_addr;
2820
2821 /* Grab the WQE addr out of the CQE */
2822 cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
2823
2824 /*
2825 * Use the WQE addr as the lower 32 bits; we add back the
2826 * 'wl_srq_desc_off' because we have a zero-based queue. Then OR'ing
2827 * on the upper 32 bits of 'wl_srq_wq_buf' gives us the WQE addr in
2828 * the SRQ Work Queue itself. We use this address as the index to find
2829 * out which Work Queue Entry this CQE corresponds with.
2830 *
2831 * We also use this address below to add the WQE back on to the free
2832 * list.
2833 */
2834 wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
2835 (cqe_wqe_addr + wl->wl_srq_desc_off);
2836
2837 /*
2838 * Given the 'wqe_addr' just calculated and the srq buf address, we
2839 * find the 'wqe_index'. The 'wre' returned below contains the WRID
2840 * that we are looking for. This indexes into the wre_list for this
2841 * specific WQE.
2842 */
2843 wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
2844 wl->wl_srq_log_wqesz);
2845
2846 /* ASSERT on impossible wqe_index values */
2847 ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
2848
2849 /* Get the pointer to this WQE */
2850 wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
2851
2852 /* Put this WQE index back on the free list */
2853 ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
2854 wl->wl_free_list_indx = wqe_index;
2855
2856 /* Using the index, return the Work Request ID Entry (wre) */
2857 wre = &wl->wl_wre[wqe_index];
2858
2859 return (wre);
2860 }
2861
2862
2863 /*
2864 * tavor_wrid_cq_reap()
2865 * Context: Can be called from interrupt or base context.
2866 */
2867 void
2868 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
2869 {
2870 tavor_workq_hdr_t *consume_wqhdr;
2871 tavor_wrid_list_hdr_t *container, *to_free;
2872
2873 ASSERT(MUTEX_HELD(&cq->cq_lock));
2874
2875 /* Lock the list of work queues associated with this CQ */
2876 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2877
2878 /* Walk the "reapable" list and free up containers */
2879 container = cq->cq_wrid_reap_head;
2880 while (container != NULL) {
2881 to_free = container;
2882 container = container->wl_reap_next;
2883 /*
2884 * If reaping the WRID list containers pulls the last
2885 * container from the given work queue header, then we free
2886 * the work queue header as well.
2887 */
2888 consume_wqhdr = tavor_wrid_list_reap(to_free);
2889 if (consume_wqhdr != NULL) {
2890 tavor_cq_wqhdr_remove(cq, consume_wqhdr);
2891 }
2892 }
2893
2894 /* Once finished reaping, we reset the CQ's reap list */
2895 cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
2896
2897 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2898 }
2899
2900
2901 /*
2902 * tavor_wrid_cq_force_reap()
2903 * Context: Can be called from interrupt or base context.
2904 */
2905 void
2906 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
2907 {
2908 tavor_workq_hdr_t *curr;
2909 tavor_wrid_list_hdr_t *container, *to_free;
2910 avl_tree_t *treep;
2911 void *cookie = NULL;
2912
2913 ASSERT(MUTEX_HELD(&cq->cq_lock));
2914
2915 /*
2916 * The first step is to walk the "reapable" list and free up those
2917 * containers. This is necessary because the containers on the
2918 * reapable list are not otherwise connected to the work queue headers
2919 * anymore.
2920 */
2921 tavor_wrid_cq_reap(cq);
2922
2923 /* Now lock the list of work queues associated with this CQ */
2924 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2925
2926 /*
2927 * Walk the list of work queue headers and free up all the WRID list
2928 * containers chained to it. Note: We don't need to grab the locks
2929 * for each of the individual WRID lists here because the only way
2930 * things can be added or removed from the list at this point would be
2931 * by posting a work request to a QP. But if we've come this far,
2932 * then we can be assured that there are no longer any QPs associated
2933 * with the CQ that we are trying to free.
2934 */
2935 #ifdef __lock_lint
2936 tavor_wrid_wqhdr_compare(NULL, NULL);
2937 #endif
2938 treep = &cq->cq_wrid_wqhdr_avl_tree;
2939 while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
2940 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
2941 container = curr->wq_wrid_poll;
2942 while (container != NULL) {
2943 to_free = container;
2944 container = container->wl_next;
2945 /*
2946 * If reaping the WRID list containers pulls the last
2947 * container from the given work queue header, then
2948 * we free the work queue header as well. Note: we
2949 * ignore the return value because we know that the
2950 * work queue header should always be freed once the
2951 * list of containers has come to an end.
2952 */
2953 (void) tavor_wrid_list_reap(to_free);
2954 if (container == NULL) {
2955 tavor_cq_wqhdr_remove(cq, curr);
2956 }
2957 }
2958 }
2959 avl_destroy(treep);
2960
2961 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2962 }
2963
2964
2965 /*
2966 * tavor_wrid_get_list()
2967 * Context: Can be called from interrupt or base context.
2968 */
2969 tavor_wrid_list_hdr_t *
2970 tavor_wrid_get_list(uint32_t qsize)
2971 {
2972 tavor_wrid_list_hdr_t *wridlist;
2973 uint32_t size;
2974
2975 /*
2976 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
2977 * which holds the pointers necessary for maintaining the "reapable"
2978 * list, chaining together multiple "containers" old and new, and
2979 * tracking the head, tail, size, etc. for each container.
2980 *
2981 * The "container" also holds all the tavor_wrid_entry_t's, which is
2982 * allocated separately, one for each entry on the corresponding work
2983 * queue.
2984 */
2985 size = sizeof (tavor_wrid_list_hdr_t);
2986
2987 /*
2988 * Note that this allocation has to be a NOSLEEP operation here
2989 * because we are holding the "wqhdr_list_lock" and, therefore,
2990 * could get raised to the interrupt level.
2991 */
2992 wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
2993 if (wridlist == NULL) {
2994 return (NULL);
2995 }
2996 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
2997
2998 /* Complete the "container" initialization */
2999 wridlist->wl_size = qsize;
3000 wridlist->wl_full = 0;
3001 wridlist->wl_head = 0;
3002 wridlist->wl_tail = 0;
3003 wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3004 sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3005 if (wridlist->wl_wre == NULL) {
3006 kmem_free(wridlist, size);
3007 return (NULL);
3008 }
3009 wridlist->wl_wre_old_tail = NULL;
3010 wridlist->wl_reap_next = NULL;
3011 wridlist->wl_next = NULL;
3012 wridlist->wl_prev = NULL;
3013 wridlist->wl_srq_en = 0;
3014
3015 return (wridlist);
3016 }
3017
3018 /*
3019 * tavor_wrid_list_srq_init()
3020 * Context: Can be called from interrupt or base context
3021 */
3022 void
3023 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3024 uint_t wq_start)
3025 {
3026 uint64_t *wl_wqe;
3027 int wqe_index;
3028
3029 ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3030
3031 /* Setup pointers for use later when we are polling the CQ */
3032 wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3033 wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3034 wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3035 wridlist->wl_srq_desc_off = srq->srq_desc_off;
3036 wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3037
3038 /* Verify that wq_start (the index at which we begin initializing) is sane */
3039 ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz);
3040
3041 /*
3042 * Initialize wridlist free list
3043 *
3044 * For each WQ up to the size of our queue, we store an index in the WQ
3045 * memory itself, representing the next available free entry. The
3046 * 'wl_free_list_indx' always holds the index of the next available
3047 * free entry in the WQ. If 'wl_free_list_indx' is -1, then we are
3048 * completely full. This gives us the advantage of being able to have
3049 * entries complete or be polled off the WQ out-of-order.
3050 *
3051 * For now, we write the free_list entries inside the WQ itself. It
3052 * may be useful in the future to store this information in a separate
3053 * structure for debugging purposes.
3054 */
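/*
 * Worked example (illustrative, assuming wq_start == 0 and that the
 * caller has preset wl_free_list_indx to -1, as in
 * tavor_wrid_from_reset_handling()): with srq_wq_bufsz == 4, the loop
 * below leaves WQE 0 holding -1, WQE 1 holding 0, WQE 2 holding 1,
 * WQE 3 holding 2, and wl_free_list_indx == 3.  Popping entries in
 * tavor_wrid_add_entry_srq() therefore hands out indices 3, 2, 1, 0
 * until the -1 "list full" marker is reached.
 */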
3055 for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3056 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3057 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3058 wridlist->wl_free_list_indx);
3059 wridlist->wl_free_list_indx = wqe_index;
3060 }
3061 }
3062
3063
3064 /*
3065 * tavor_wrid_reaplist_add()
3066 * Context: Can be called from interrupt or base context.
3067 */
3068 static void
3069 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3070 {
3071 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3072
3073 mutex_enter(&wq->wq_wrid_wql->wql_lock);
3074
3075 /*
3076 * Add the "post" container (the last one on the current chain) to
3077 * the CQ's "reapable" list
3078 */
3079 if ((cq->cq_wrid_reap_head == NULL) &&
3080 (cq->cq_wrid_reap_tail == NULL)) {
3081 cq->cq_wrid_reap_head = wq->wq_wrid_post;
3082 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3083 } else {
3084 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3085 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3086 }
3087
3088 mutex_exit(&wq->wq_wrid_wql->wql_lock);
3089 }
3090
3091
3092 int
3093 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3094 {
3095 tavor_workq_compare_t *cmpp;
3096 tavor_workq_hdr_t *curr;
3097
3098 cmpp = (tavor_workq_compare_t *)p1;
3099 curr = (tavor_workq_hdr_t *)p2;
3100
3101 if (cmpp->cmp_qpn < curr->wq_qpn)
3102 return (-1);
3103 else if (cmpp->cmp_qpn > curr->wq_qpn)
3104 return (+1);
3105 else if (cmpp->cmp_type < curr->wq_type)
3106 return (-1);
3107 else if (cmpp->cmp_type > curr->wq_type)
3108 return (+1);
3109 else
3110 return (0);
3111 }
3112
3113
3114 /*
3115 * tavor_wrid_wqhdr_find()
3116 * Context: Can be called from interrupt or base context.
3117 */
3118 static tavor_workq_hdr_t *
3119 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3120 {
3121 tavor_workq_hdr_t *curr;
3122 tavor_workq_compare_t cmp;
3123
3124 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3125
3126 /*
3127 * Walk the CQ's work queue list, trying to find a send or recv queue
3128 * with the same QP number. We do this even if we are going to later
3129 * create a new entry because it helps us easily find the end of the
3130 * list.
3131 */
3132 cmp.cmp_qpn = qpn;
3133 cmp.cmp_type = wq_type;
3134 #ifdef __lock_lint
3135 tavor_wrid_wqhdr_compare(NULL, NULL);
3136 #endif
3137 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3138
3139 return (curr);
3140 }
3141
3142
3143 /*
3144 * tavor_wrid_wqhdr_create()
3145 * Context: Can be called from interrupt or base context.
3146 */
3147 static tavor_workq_hdr_t *
3148 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3149 uint_t wq_type, uint_t create_wql)
3150 {
3151 tavor_workq_hdr_t *wqhdr_tmp;
3152
3153 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3154
3155 /*
3156 * Allocate space for a work queue header structure and initialize it.
3157 * Each work queue header structure includes a "wq_wrid_wql"
3158 * which needs to be initialized. Note that this allocation has to be
3159 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3160 * and, therefore, could get raised to the interrupt level.
3161 */
3162 wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3163 sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3164 if (wqhdr_tmp == NULL) {
3165 return (NULL);
3166 }
3167 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3168 wqhdr_tmp->wq_qpn = qpn;
3169 wqhdr_tmp->wq_type = wq_type;
3170
3171 if (create_wql) {
3172 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3173 if (wqhdr_tmp->wq_wrid_wql == NULL) {
3174 kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3175 return (NULL);
3176 }
3177 }
3178
3179 wqhdr_tmp->wq_wrid_poll = NULL;
3180 wqhdr_tmp->wq_wrid_post = NULL;
3181
3182 /* Chain the newly allocated work queue header to the CQ's list */
3183 tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3184
3185 return (wqhdr_tmp);
3186 }
3187
3188
3189 /*
3190 * tavor_wrid_wql_create()
3191 * Context: Can be called from interrupt or base context.
3192 */
3193 tavor_wq_lock_t *
3194 tavor_wrid_wql_create(tavor_state_t *state)
3195 {
3196 tavor_wq_lock_t *wql;
3197
3198 /*
3199 * Allocate the WQL and initialize it.
3200 */
3201 wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3202 if (wql == NULL) {
3203 return (NULL);
3204 }
3205
3206 mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3207 DDI_INTR_PRI(state->ts_intrmsi_pri));
3208
3209 /* Add refcount to WQL */
3210 tavor_wql_refcnt_inc(wql);
3211
3212 return (wql);
3213 }
3214
3215
3216 /*
3217 * tavor_wrid_get_wqeaddrsz()
3218 * Context: Can be called from interrupt or base context.
3219 */
3220 static uint32_t
3221 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3222 {
3223 tavor_wrid_entry_t *wre;
3224 uint32_t wqeaddrsz;
3225 uint32_t head;
3226
3227 /*
3228 * If the container is empty, then there is no next entry. So just
3229 * return zero. Note: the "head == tail" condition here can only
3230 * mean that the container is empty because we have previously pulled
3231 * something from the container.
3232 *
3233 * If the container is not empty, then find the next entry and return
3234 * the contents of its "wqeaddrsz" field.
3235 */
3236 if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3237 wqeaddrsz = 0;
3238 } else {
3239 /*
3240 * We don't need to calculate the "next" head pointer here
3241 * because "head" should already point to the next entry on
3242 * the list (since we just pulled something off - in
3243 * tavor_wrid_find_match() - and moved the head index forward.)
3244 */
3245 head = wq->wq_wrid_poll->wl_head;
3246 wre = &wq->wq_wrid_poll->wl_wre[head];
3247 wqeaddrsz = wre->wr_wqeaddrsz;
3248 }
3249 return (wqeaddrsz);
3250 }
3251
3252
3253 /*
3254 * tavor_wrid_wqhdr_add()
3255 * Context: Can be called from interrupt or base context.
3256 */
3257 static void
3258 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3259 tavor_wrid_list_hdr_t *wridlist)
3260 {
3261 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3262
3263 /* Chain the new WRID list "container" to the work queue list */
3264 if ((wqhdr->wq_wrid_post == NULL) &&
3265 (wqhdr->wq_wrid_poll == NULL)) {
3266 wqhdr->wq_wrid_poll = wridlist;
3267 wqhdr->wq_wrid_post = wridlist;
3268 } else {
3269 wqhdr->wq_wrid_post->wl_next = wridlist;
3270 wridlist->wl_prev = wqhdr->wq_wrid_post;
3271 wqhdr->wq_wrid_post = wridlist;
3272 }
3273 }
3274
3275
3276 /*
3277 * tavor_wrid_wqhdr_remove()
3278 * Context: Can be called from interrupt or base context.
3279 *
3280 * Note: this is only called to remove the most recently added WRID list
3281 * container (i.e. in tavor_from_reset() above)
3282 */
3283 static void
3284 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3285 tavor_wrid_list_hdr_t *wridlist)
3286 {
3287 tavor_wrid_list_hdr_t *prev, *next;
3288
3289 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3290
3291 /* Unlink the WRID list "container" from the work queue list */
3292 prev = wridlist->wl_prev;
3293 next = wridlist->wl_next;
3294 if (prev != NULL) {
3295 prev->wl_next = next;
3296 }
3297 if (next != NULL) {
3298 next->wl_prev = prev;
3299 }
3300
3301 /*
3302 * Update any pointers in the work queue hdr that may point to this
3303 * WRID list container
3304 */
3305 if (wqhdr->wq_wrid_post == wridlist) {
3306 wqhdr->wq_wrid_post = prev;
3307 }
3308 if (wqhdr->wq_wrid_poll == wridlist) {
3309 wqhdr->wq_wrid_poll = NULL;
3310 }
3311 }
3312
3313
3314 /*
3315 * tavor_wrid_list_reap()
3316 * Context: Can be called from interrupt or base context.
3317 * Note: The "wqhdr_list_lock" must be held.
3318 */
3319 static tavor_workq_hdr_t *
3320 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3321 {
3322 tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL;
3323 tavor_wrid_list_hdr_t *prev, *next;
3324 uint32_t size;
3325
3326 /* Get the back pointer to the work queue header (see below) */
3327 wqhdr = wridlist->wl_wqhdr;
3328 mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3329
3330 /* Unlink the WRID list "container" from the work queue list */
3331 prev = wridlist->wl_prev;
3332 next = wridlist->wl_next;
3333 if (prev != NULL) {
3334 prev->wl_next = next;
3335 }
3336 if (next != NULL) {
3337 next->wl_prev = prev;
3338 }
3339
3340 /*
3341 * If the back pointer to the work queue header shows that it
3342 * was pointing to the entry we are about to remove, then the work
3343 * queue header is reapable as well.
3344 */
3345 if ((wqhdr->wq_wrid_poll == wridlist) &&
3346 (wqhdr->wq_wrid_post == wridlist)) {
3347 consume_wqhdr = wqhdr;
3348 }
3349
3350 /* Be sure to update the "poll" and "post" container pointers */
3351 if (wqhdr->wq_wrid_poll == wridlist) {
3352 wqhdr->wq_wrid_poll = next;
3353 }
3354 if (wqhdr->wq_wrid_post == wridlist) {
3355 wqhdr->wq_wrid_post = NULL;
3356 }
3357
3358 /* Calculate the size and free the container */
3359 size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3360 kmem_free(wridlist->wl_wre, size);
3361 kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3362
3363 mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3364
3365 return (consume_wqhdr);
3366 }
3367
3368
3369 /*
3370 * tavor_wrid_wqhdr_lock_both()
3371 * Context: Can be called from interrupt or base context.
3372 */
3373 static void
3374 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3375 {
3376 tavor_cqhdl_t sq_cq, rq_cq;
3377
3378 sq_cq = qp->qp_sq_cqhdl;
3379 rq_cq = qp->qp_rq_cqhdl;
3380
3381 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3382 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3383
3384 /*
3385 * If both work queues (send and recv) share a completion queue, then
3386 * grab the common lock. If they use different CQs (hence different
3387 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
3388	 * receive.  We do the same, in reverse, in
3389	 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind
3390	 * of deadlock condition.  Note: We add the "__lock_lint" code here
3391 * to fake out warlock into thinking we've grabbed both locks (when,
3392 * in fact, we only needed the one).
3393 */
3394 if (sq_cq == rq_cq) {
3395 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3396 #ifdef __lock_lint
3397 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3398 #endif
3399 } else {
3400 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3401 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3402 }
3403 }
3404
3405 /*
3406 * tavor_wrid_wqhdr_unlock_both()
3407 * Context: Can be called from interrupt or base context.
3408 */
3409 static void
3410 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3411 {
3412 tavor_cqhdl_t sq_cq, rq_cq;
3413
3414 sq_cq = qp->qp_sq_cqhdl;
3415 rq_cq = qp->qp_rq_cqhdl;
3416
3417 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3418 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3419
3420 /*
3421 * See tavor_wrid_wqhdr_lock_both() above for more detail
3422 */
3423 if (sq_cq == rq_cq) {
3424 #ifdef __lock_lint
3425 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3426 #endif
3427 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3428 } else {
3429 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3430 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3431 }
3432 }
3433
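/*
 * Illustrative sketch (not compiled into the driver): callers that must
 * view or update WRID state on both of a QP's work queues at once bracket
 * that work with the lock/unlock pair above, so the send/receive lock
 * ordering decision is made in exactly one place.
 */
#if 0	/* illustrative only */
static void
example_update_both_queues(tavor_qphdl_t qp)
{
	tavor_wrid_wqhdr_lock_both(qp);
	/* ... atomically examine/modify send and receive WRID state ... */
	tavor_wrid_wqhdr_unlock_both(qp);
}
#endif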
3434
3435 /*
3436 * tavor_cq_wqhdr_add()
3437 * Context: Can be called from interrupt or base context.
3438 */
3439 static void
3440 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3441 {
3442 tavor_workq_compare_t cmp;
3443 avl_index_t where;
3444
3445 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3446
3447 cmp.cmp_qpn = wqhdr->wq_qpn;
3448 cmp.cmp_type = wqhdr->wq_type;
3449 #ifdef __lock_lint
3450 tavor_wrid_wqhdr_compare(NULL, NULL);
3451 #endif
3452 (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3453 /*
3454	 * Insert the new work queue header into the CQ's AVL tree of work
3455	 * queue headers, at the position computed by the avl_find() call above.
3456 */
3457 avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3458 }
3459
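/*
 * Illustrative sketch (not compiled into the driver): the generic
 * avl_find()/avl_insert() idiom used above.  avl_find() returns NULL when
 * no matching node exists and fills in "where" with the insertion point
 * that avl_insert() then consumes.  The "example_node_t" type, the "tree"
 * argument, and the assumption that the tree's comparator only inspects
 * the QP number key are all made up for this sketch.
 */
#if 0	/* illustrative only */
typedef struct example_node_s {
	uint_t		en_qpn;		/* lookup key */
	avl_node_t	en_avl_link;	/* AVL tree linkage */
} example_node_t;

static void
example_avl_lookup_or_insert(avl_tree_t *tree, example_node_t *new_node)
{
	example_node_t	key, *found;
	avl_index_t	where;

	key.en_qpn = new_node->en_qpn;
	found = avl_find(tree, &key, &where);
	if (found == NULL) {
		/* Not present: insert at the position avl_find() computed */
		avl_insert(tree, new_node, where);
	}
}
#endif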
3460
3461 /*
3462 * tavor_cq_wqhdr_remove()
3463 * Context: Can be called from interrupt or base context.
3464 */
3465 static void
3466 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3467 {
3468 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3469
3470 #ifdef __lock_lint
3471 tavor_wrid_wqhdr_compare(NULL, NULL);
3472 #endif
3473 /* Remove "wqhdr" from the work queue header list on "cq" */
3474 avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3475
3476 /*
3477	 * Release the reference to the WQL; if this is the last reference, this
3478	 * call also has the side effect of freeing up the 'wq_wrid_wql' memory.
3479 */
3480 tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3481
3482 /* Free the memory associated with "wqhdr" */
3483 kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3484 }
3485
3486
3487 /*
3488 * tavor_wql_refcnt_inc()
3489 * Context: Can be called from interrupt or base context
3490 */
3491 void
3492 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3493 {
3494 ASSERT(wql != NULL);
3495
3496 mutex_enter(&wql->wql_lock);
3497 wql->wql_refcnt++;
3498 mutex_exit(&wql->wql_lock);
3499 }
3500
3501 /*
3502 * tavor_wql_refcnt_dec()
3503 * Context: Can be called from interrupt or base context
3504 */
3505 void
3506 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3507 {
3508 int refcnt;
3509
3510 ASSERT(wql != NULL);
3511
3512 mutex_enter(&wql->wql_lock);
3513 wql->wql_refcnt--;
3514 refcnt = wql->wql_refcnt;
3515 mutex_exit(&wql->wql_lock);
3516
3517	/*
3518	 * Free up the WQL memory if we were the last one associated with this
3519	 * structure.  Note that the refcount was sampled while the lock was
3520	 * still held, since the lock itself lives inside the structure we free.
3521	 */
3522 if (refcnt == 0) {
3523 mutex_destroy(&wql->wql_lock);
3524 kmem_free(wql, sizeof (tavor_wq_lock_t));
3525 }
3526 }
3527
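/*
 * Illustrative sketch (not compiled into the driver): the expected lifecycle
 * of a tavor_wq_lock_t under the refcount helpers above.  A creator sets up
 * the mutex and takes the initial reference, each additional user calls
 * tavor_wql_refcnt_inc(), and the final tavor_wql_refcnt_dec() destroys the
 * mutex and frees the structure.  The allocation/initialization shown here
 * is an assumption for the sketch; the driver's real WQL setup lives
 * elsewhere in this file.
 */
#if 0	/* illustrative only */
static void
example_wql_lifecycle(void)
{
	tavor_wq_lock_t	*wql;

	wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_SLEEP);
	mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER, NULL);
	wql->wql_refcnt = 1;		/* creator's reference (no users yet) */

	tavor_wql_refcnt_inc(wql);	/* a second user takes a reference */
	tavor_wql_refcnt_dec(wql);	/* ...and later drops it */
	tavor_wql_refcnt_dec(wql);	/* last ref: lock destroyed, wql freed */
}
#endif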