1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * tavor_wr.c
29 * Tavor Work Request Processing Routines
30 *
31 * Implements all the routines necessary to provide the PostSend(),
32 * PostRecv() and PostSRQ() verbs. Also contains all the code
33 * necessary to implement the Tavor WRID tracking mechanism.
34 */
35
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42
43 #include <sys/ib/adapters/tavor/tavor.h>
44
45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46 uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
48 uint32_t nds, uint32_t qpn, uint32_t credits);
49 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
50 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
51 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
52 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
53 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
54 ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
55 uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
56 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
57 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
58 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
59 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
60 tavor_qphdl_t qp);
61 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
62 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
63 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
64 uint64_t *prev, tavor_qphdl_t qp);
65 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
66 ibt_recv_wr_t *wr, uint64_t *desc);
67 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
68 tavor_srqhdl_t srq);
69 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
70 uint_t sync_to, uint_t sync_type, uint_t flag);
71 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
72 tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
73 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
74 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
75 uint_t send_or_recv);
76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
77 tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
78 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
79 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
80 tavor_wrid_list_hdr_t *wrid_list);
81 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
82 tavor_wrid_list_hdr_t *wrid_list);
83 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
84 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
85 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
86 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
87 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
88
89 /*
90 * tavor_post_send()
91 * Context: Can be called from interrupt or base context.
92 */
93 int
94 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
95 ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
96 {
97 tavor_sw_wqe_dbinfo_t dbinfo;
98 tavor_wrid_list_hdr_t *wridlist;
99 tavor_wrid_entry_t *wre_last;
100 uint64_t *desc, *prev, *first;
101 uint32_t desc_sz, first_sz;
102 uint32_t wqeaddrsz, signaled_dbd;
103 uint32_t head, tail, next_tail, qsize_msk;
104 uint32_t sync_from, sync_to;
105 uint_t currindx, wrindx, numremain;
106 uint_t chainlen, chainbegin, posted_cnt;
107 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
108 int status;
109
110 /*
111 * Check for user-mappable QP memory. Note: We do not allow kernel
112 * clients to post to QP memory that is accessible directly by the
113 * user. If the QP memory is user accessible, then return an error.
114 */
115 if (qp->qp_is_umap) {
116 return (IBT_QP_HDL_INVALID);
117 }
118
119 /* Initialize posted_cnt */
120 posted_cnt = 0;
121
122 mutex_enter(&qp->qp_lock);
123
124 /*
125 * Check QP state. Can not post Send requests from the "Reset",
126 * "Init", or "RTR" states
127 */
128 if ((qp->qp_state == TAVOR_QP_RESET) ||
129 (qp->qp_state == TAVOR_QP_INIT) ||
130 (qp->qp_state == TAVOR_QP_RTR)) {
131 mutex_exit(&qp->qp_lock);
132 return (IBT_QP_STATE_INVALID);
133 }
134
135 /* Grab the lock for the WRID list */
136 mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
137 wridlist = qp->qp_sq_wqhdr->wq_wrid_post;
138
139 /* Save away some initial QP state */
140 qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
141 tail = qp->qp_sq_wqhdr->wq_tail;
142 head = qp->qp_sq_wqhdr->wq_head;
143
144 /*
145 * For each ibt_send_wr_t in the wr[] list passed in, parse the
146 * request and build a Send WQE. Note: Because we are potentially
147 * building a chain of WQEs, we want to link them all together.
148 * However, we do not want to link the first one to the previous
149 * WQE until the entire chain has been linked. Then in the last
150 * step we ring the appropriate doorbell. Note: It is possible for
151 * more Work Requests to be posted than the HW will support at one
152 * shot. If this happens, we need to be able to post and ring
153 	 * several chains here until the entire request is complete.
154 */
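	/*
	 * Illustrative example (numbers are hypothetical, not from the
	 * hardware docs): if num_wr were 600 and TAVOR_QP_MAXDESC_PER_DB
	 * were 256, the loop below would build and doorbell three separate
	 * chains of 256, 256, and 88 WQEs.
	 */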
155 wrindx = 0;
156 numremain = num_wr;
157 status = DDI_SUCCESS;
158 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
159 /*
160 * For the first WQE on a new chain we need "prev" to point
161 * to the current descriptor. As we begin to process
162 * further, "prev" will be updated to point to the previous
163 * WQE on the current chain (see below).
164 */
165 prev = TAVOR_QP_SQ_ENTRY(qp, tail);
166
167 /*
168 * Before we begin, save the current "tail index" for later
169 * DMA sync
170 */
171 sync_from = tail;
172
173 /*
174 * Break the request up into chains that are less than or
175 * equal to the maximum number of WQEs that can be posted
176 * per doorbell ring
177 */
178 chainlen = (numremain > maxdb) ? maxdb : numremain;
179 numremain -= chainlen;
180 chainbegin = wrindx;
181 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
182 /*
183 * Check for "queue full" condition. If the queue
184 * is already full, then no more WQEs can be posted.
185 * So break out, ring a doorbell (if necessary) and
186 * return an error
187 */
188 if (qp->qp_sq_wqhdr->wq_full != 0) {
189 status = IBT_QP_FULL;
190 break;
191 }
192
193 /*
194 * Increment the "tail index" and check for "queue
195 * full" condition. If we detect that the current
196 * work request is going to fill the work queue, then
197 * we mark this condition and continue.
198 */
199 next_tail = (tail + 1) & qsize_msk;
200 if (next_tail == head) {
201 qp->qp_sq_wqhdr->wq_full = 1;
202 }
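			/*
			 * For illustration: with a hypothetical 64-entry
			 * send queue, qsize_msk is 0x3F, so a tail of 63
			 * wraps back to 0 here, and the queue is flagged
			 * full once the wrapped index catches up to "head".
			 */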
203
204 /*
205 * Get the address of the location where the next
206 * Send WQE should be built
207 */
208 desc = TAVOR_QP_SQ_ENTRY(qp, tail);
209
210 /*
211 * Call tavor_wqe_send_build() to build the WQE
212 * at the given address. This routine uses the
213 * information in the ibt_send_wr_t list (wr[]) and
214 * returns the size of the WQE when it returns.
215 */
216 status = tavor_wqe_send_build(state, qp,
217 &wr[wrindx], desc, &desc_sz);
218 if (status != DDI_SUCCESS) {
219 break;
220 }
221
222 /*
223 * Add a WRID entry to the WRID list. Need to
224 * calculate the "wqeaddrsz" and "signaled_dbd"
225 * values to pass to tavor_wrid_add_entry()
226 */
227 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
228 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
229 desc_sz);
230 if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
231 (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
232 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
233 } else {
234 signaled_dbd = 0;
235 }
236 tavor_wrid_add_entry(qp->qp_sq_wqhdr,
237 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
238
239 /*
240 * If this is not the first descriptor on the current
241 * chain, then link it to the previous WQE. Otherwise,
242 * save the address and size of this descriptor (in
243 * "first" and "first_sz" respectively) and continue.
244 			 * Note: Linking a WQE to the previous one will
245 * depend on whether the two WQEs are from "special
246 * QPs" (i.e. MLX transport WQEs) or whether they are
247 * normal Send WQEs.
248 */
249 if (currindx != 0) {
250 if (qp->qp_is_special) {
251 tavor_wqe_mlx_linknext(&wr[wrindx - 1],
252 desc, desc_sz, prev, NULL, qp);
253 } else {
254 tavor_wqe_send_linknext(&wr[wrindx],
255 &wr[wrindx - 1], desc, desc_sz,
256 prev, NULL, qp);
257 }
258 prev = desc;
259 } else {
260 first = desc;
261 first_sz = desc_sz;
262 }
263
264 /*
265 * Update the current "tail index" and increment
266 * "posted_cnt"
267 */
268 tail = next_tail;
269 posted_cnt++;
270 }
271
272 /*
273 * If we reach here and there are one or more WQEs which have
274 * been successfully chained together, then we need to link
275 * the current chain to the previously executing chain of
276 * descriptor (if there is one) and ring the doorbell for the
277 * send work queue.
278 */
279 if (currindx != 0) {
280 /*
281 * Before we link the chain, we need to ensure that the
282 * "next" field on the last WQE is set to NULL (to
283 * indicate the end of the chain). Note: Just as it
284 * did above, the format for the "next" fields in a
285 			 * given WQE depends on whether the WQE is MLX
286 * transport or not.
287 */
288 if (qp->qp_is_special) {
289 tavor_wqe_mlx_linknext(&wr[chainbegin +
290 currindx - 1], NULL, 0, prev, NULL, qp);
291 } else {
292 tavor_wqe_send_linknext(NULL,
293 &wr[chainbegin + currindx - 1], NULL, 0,
294 prev, NULL, qp);
295 }
296
297 /* Save away updated "tail index" for the DMA sync */
298 sync_to = tail;
299
300 /* Do a DMA sync for current send WQE(s) */
301 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
302 DDI_DMA_SYNC_FORDEV);
303
304 /*
305 * Now link the chain to the old chain (if there was
306 			 * one).  Note: we still need to pay attention to whether
307 * the QP used MLX transport WQEs or not.
308 */
309 if (qp->qp_is_special) {
310 tavor_wqe_mlx_linknext(NULL, first, first_sz,
311 qp->qp_sq_lastwqeaddr, &dbinfo, qp);
312 } else {
313 tavor_wqe_send_linknext(&wr[chainbegin], NULL,
314 first, first_sz, qp->qp_sq_lastwqeaddr,
315 &dbinfo, qp);
316 }
317
318 /*
319 * If there was a valid previous WQE (i.e. non-NULL),
320 * then sync it too. This is because we have updated
321 * its "next" fields and we want to ensure that the
322 * hardware can see the changes.
323 */
324 if (qp->qp_sq_lastwqeaddr != NULL) {
325 sync_to = sync_from;
326 sync_from = (sync_from - 1) & qsize_msk;
327 tavor_wqe_sync(qp, sync_from, sync_to,
328 TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
329 }
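			/*
			 * Illustrative note: the "- 1" above relies on the
			 * same power-of-two mask, e.g. a sync_from of 0
			 * wraps back to qsize_msk (63 for a hypothetical
			 * 64-entry queue) to reach the previous WQE.
			 */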
330
331 /*
332 * Now if the WRID tail entry is non-NULL, then this
333 * represents the entry to which we are chaining the
334 * new entries. Since we are going to ring the
335 			 * doorbell for this WQE, we want to set its "dbd" bit.
336 *
337 * On the other hand, if the tail is NULL, even though
338 * we will have rung the doorbell for the previous WQE
339 * (for the hardware's sake) it is irrelevant to our
340 * purposes (for tracking WRIDs) because we know the
341 * request must have already completed.
342 */
343 wre_last = wridlist->wl_wre_old_tail;
344 if (wre_last != NULL) {
345 wre_last->wr_signaled_dbd |=
346 TAVOR_WRID_ENTRY_DOORBELLED;
347 }
348
349 /* Update some of the state in the QP */
350 qp->qp_sq_lastwqeaddr = desc;
351 qp->qp_sq_wqhdr->wq_tail = tail;
352
353 /* Ring the doorbell */
354 tavor_qp_send_doorbell(state,
355 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
356 first_sz, qp->qp_qpnum, dbinfo.db_fence,
357 dbinfo.db_nopcode);
358 }
359 }
360
361 /*
362 * Update the "num_posted" return value (if necessary). Then drop
363 * the locks and return success.
364 */
365 if (num_posted != NULL) {
366 *num_posted = posted_cnt;
367 }
368
369 mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
370 mutex_exit(&qp->qp_lock);
371
372 return (status);
373 }
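/*
 * Usage sketch (illustrative only, not part of the driver): a kernel
 * client arriving here through the IBTF CI entry points would have set
 * up its work request roughly as follows before posting a single
 * signaled RC Send.  "buf_va", "lkey", "buf_len", and "my_wrid" are
 * hypothetical caller-supplied values.
 *
 *	ibt_wr_ds_t	sgl;
 *	ibt_send_wr_t	wr;
 *	uint_t		nposted;
 *
 *	sgl.ds_va = buf_va;
 *	sgl.ds_key = lkey;
 *	sgl.ds_len = buf_len;
 *	wr.wr_id = my_wrid;
 *	wr.wr_flags = IBT_WR_SEND_SIGNAL;
 *	wr.wr_trans = IBT_RC_SRV;
 *	wr.wr_opcode = IBT_WRC_SEND;
 *	wr.wr_nds = 1;
 *	wr.wr_sgl = &sgl;
 *	status = tavor_post_send(state, qp, &wr, 1, &nposted);
 */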
374
375
376 /*
377 * tavor_post_recv()
378 * Context: Can be called from interrupt or base context.
379 */
380 int
381 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
382 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
383 {
384 uint64_t *desc, *prev, *first;
385 uint32_t desc_sz, first_sz;
386 uint32_t wqeaddrsz, signaled_dbd;
387 uint32_t head, tail, next_tail, qsize_msk;
388 uint32_t sync_from, sync_to;
389 uint_t currindx, wrindx, numremain;
390 uint_t chainlen, posted_cnt;
391 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
392 int status;
393
394 /*
395 * Check for user-mappable QP memory. Note: We do not allow kernel
396 * clients to post to QP memory that is accessible directly by the
397 * user. If the QP memory is user accessible, then return an error.
398 */
399 if (qp->qp_is_umap) {
400 return (IBT_QP_HDL_INVALID);
401 }
402
403 /* Initialize posted_cnt */
404 posted_cnt = 0;
405
406 mutex_enter(&qp->qp_lock);
407
408 /*
409 * Check if QP is associated with an SRQ
410 */
411 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
412 mutex_exit(&qp->qp_lock);
413 return (IBT_SRQ_IN_USE);
414 }
415
416 /*
417 * Check QP state. Can not post Recv requests from the "Reset" state
418 */
419 if (qp->qp_state == TAVOR_QP_RESET) {
420 mutex_exit(&qp->qp_lock);
421 return (IBT_QP_STATE_INVALID);
422 }
423
424 /* Grab the lock for the WRID list */
425 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
426
427 /* Save away some initial QP state */
428 qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
429 tail = qp->qp_rq_wqhdr->wq_tail;
430 head = qp->qp_rq_wqhdr->wq_head;
431
432 /*
433 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
434 * request and build a Recv WQE. Note: Because we are potentially
435 * building a chain of WQEs, we want to link them all together.
436 * However, we do not want to link the first one to the previous
437 * WQE until the entire chain has been linked. Then in the last
438 * step we ring the appropriate doorbell. Note: It is possible for
439 * more Work Requests to be posted than the HW will support at one
440 * shot. If this happens, we need to be able to post and ring
441 	 * several chains here until the entire request is complete.
442 */
443 wrindx = 0;
444 numremain = num_wr;
445 status = DDI_SUCCESS;
446 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
447 /*
448 * For the first WQE on a new chain we need "prev" to point
449 * to the current descriptor. As we begin to process
450 * further, "prev" will be updated to point to the previous
451 * WQE on the current chain (see below).
452 */
453 prev = TAVOR_QP_RQ_ENTRY(qp, tail);
454
455 /*
456 * Before we begin, save the current "tail index" for later
457 * DMA sync
458 */
459 sync_from = tail;
460
461 /*
462 * Break the request up into chains that are less than or
463 * equal to the maximum number of WQEs that can be posted
464 * per doorbell ring
465 */
466 chainlen = (numremain > maxdb) ? maxdb : numremain;
467 numremain -= chainlen;
468 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
469 /*
470 * Check for "queue full" condition. If the queue
471 * is already full, then no more WQEs can be posted.
472 * So break out, ring a doorbell (if necessary) and
473 * return an error
474 */
475 if (qp->qp_rq_wqhdr->wq_full != 0) {
476 status = IBT_QP_FULL;
477 break;
478 }
479
480 /*
481 * Increment the "tail index" and check for "queue
482 * full" condition. If we detect that the current
483 * work request is going to fill the work queue, then
484 * we mark this condition and continue.
485 */
486 next_tail = (tail + 1) & qsize_msk;
487 if (next_tail == head) {
488 qp->qp_rq_wqhdr->wq_full = 1;
489 }
490
491 /*
492 * Get the address of the location where the next
493 * Recv WQE should be built
494 */
495 desc = TAVOR_QP_RQ_ENTRY(qp, tail);
496
497 /*
498 * Call tavor_wqe_recv_build() to build the WQE
499 * at the given address. This routine uses the
500 * information in the ibt_recv_wr_t list (wr[]) and
501 * returns the size of the WQE when it returns.
502 */
503 status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
504 desc, &desc_sz);
505 if (status != DDI_SUCCESS) {
506 break;
507 }
508
509 /*
510 * Add a WRID entry to the WRID list. Need to
511 * calculate the "wqeaddrsz" and "signaled_dbd"
512 * values to pass to tavor_wrid_add_entry(). Note:
513 * all Recv WQEs are essentially "signaled" and
514 * "doorbelled" (since Tavor HW requires all
515 * RecvWQE's to have their "DBD" bits set).
516 */
517 wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
518 ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
519 desc_sz);
520 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
521 TAVOR_WRID_ENTRY_DOORBELLED;
522 tavor_wrid_add_entry(qp->qp_rq_wqhdr,
523 wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
524
525 /*
526 * If this is not the first descriptor on the current
527 * chain, then link it to the previous WQE. Otherwise,
528 * save the address and size of this descriptor (in
529 * "first" and "first_sz" respectively) and continue.
530 */
531 if (currindx != 0) {
532 tavor_wqe_recv_linknext(desc, desc_sz, prev,
533 qp);
534 prev = desc;
535 } else {
536 first = desc;
537 first_sz = desc_sz;
538 }
539
540 /*
541 * Update the current "tail index" and increment
542 * "posted_cnt"
543 */
544 tail = next_tail;
545 posted_cnt++;
546 }
547
548 /*
549 * If we reach here and there are one or more WQEs which have
550 * been successfully chained together, then we need to link
551 * the current chain to the previously executing chain of
552 * descriptor (if there is one) and ring the doorbell for the
553 * recv work queue.
554 */
555 if (currindx != 0) {
556 /*
557 * Before we link the chain, we need to ensure that the
558 * "next" field on the last WQE is set to NULL (to
559 * indicate the end of the chain).
560 */
561 tavor_wqe_recv_linknext(NULL, 0, prev, qp);
562
563 /* Save away updated "tail index" for the DMA sync */
564 sync_to = tail;
565
566 /* Do a DMA sync for current recv WQE(s) */
567 tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
568 DDI_DMA_SYNC_FORDEV);
569
570 /*
571 * Now link the chain to the old chain (if there was
572 			 * one).
573 */
574 tavor_wqe_recv_linknext(first, first_sz,
575 qp->qp_rq_lastwqeaddr, qp);
576
577 /*
578 * If there was a valid previous WQE (i.e. non-NULL),
579 * then sync it too. This is because we have updated
580 * its "next" fields and we want to ensure that the
581 * hardware can see the changes.
582 */
583 if (qp->qp_rq_lastwqeaddr != NULL) {
584 sync_to = sync_from;
585 sync_from = (sync_from - 1) & qsize_msk;
586 tavor_wqe_sync(qp, sync_from, sync_to,
587 TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
588 }
589
590 /* Update some of the state in the QP */
591 qp->qp_rq_lastwqeaddr = desc;
592 qp->qp_rq_wqhdr->wq_tail = tail;
593
594 /* Ring the doorbell */
595 tavor_qp_recv_doorbell(state,
596 (uint32_t)((uintptr_t)first - qp->qp_desc_off),
597 first_sz, qp->qp_qpnum, (chainlen % maxdb));
598 }
599 }
600
601 /*
602 * Update the "num_posted" return value (if necessary). Then drop
603 * the locks and return success.
604 */
605 if (num_posted != NULL) {
606 *num_posted = posted_cnt;
607 }
608
609 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
610 mutex_exit(&qp->qp_lock);
611
612 return (status);
613 }
614
615 /*
616 * tavor_post_srq()
617 * Context: Can be called from interrupt or base context.
618 */
619 int
620 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
621 ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
622 {
623 uint64_t *desc, *prev, *first, *last_wqe_addr;
624 uint32_t signaled_dbd;
625 uint32_t sync_indx;
626 uint_t currindx, wrindx, numremain;
627 uint_t chainlen, posted_cnt;
628 uint_t maxdb = TAVOR_QP_MAXDESC_PER_DB;
629 int status;
630
631 /*
632 * Check for user-mappable QP memory. Note: We do not allow kernel
633 * clients to post to QP memory that is accessible directly by the
634 * user. If the QP memory is user accessible, then return an error.
635 */
636 if (srq->srq_is_umap) {
637 return (IBT_SRQ_HDL_INVALID);
638 }
639
640 /* Initialize posted_cnt */
641 posted_cnt = 0;
642
643 mutex_enter(&srq->srq_lock);
644
645 /*
646 * Check SRQ state. Can not post Recv requests when SRQ is in error
647 */
648 if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
649 mutex_exit(&srq->srq_lock);
650 return (IBT_QP_STATE_INVALID);
651 }
652
653 /* Grab the lock for the WRID list */
654 mutex_enter(&srq->srq_wrid_wql->wql_lock);
655
656 /*
657 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
658 * request and build a Recv WQE. Note: Because we are potentially
659 * building a chain of WQEs, we want to link them all together.
660 * However, we do not want to link the first one to the previous
661 * WQE until the entire chain has been linked. Then in the last
662 * step we ring the appropriate doorbell. Note: It is possible for
663 * more Work Requests to be posted than the HW will support at one
664 * shot. If this happens, we need to be able to post and ring
665 	 * several chains here until the entire request is complete.
666 */
667 wrindx = 0;
668 numremain = num_wr;
669 status = DDI_SUCCESS;
670 while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
671 /*
672 * For the first WQE on a new chain we need "prev" to point
673 * to the current descriptor. As we begin to process
674 * further, "prev" will be updated to point to the previous
675 * WQE on the current chain (see below).
676 */
677 if (srq->srq_wq_lastwqeindx == -1) {
678 prev = NULL;
679 } else {
680 prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
681 }
682
683 /*
684 * Break the request up into chains that are less than or
685 * equal to the maximum number of WQEs that can be posted
686 * per doorbell ring
687 */
688 chainlen = (numremain > maxdb) ? maxdb : numremain;
689 numremain -= chainlen;
690 for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
691
692 /*
693 * Check for "queue full" condition. If the queue
694 * is already full, then no more WQEs can be posted.
695 * So break out, ring a doorbell (if necessary) and
696 * return an error
697 */
698 if (srq->srq_wridlist->wl_free_list_indx == -1) {
699 status = IBT_QP_FULL;
700 break;
701 }
702
703 /*
704 * Get the address of the location where the next
705 * Recv WQE should be built
706 */
707 desc = TAVOR_SRQ_WQE_ADDR(srq,
708 srq->srq_wridlist->wl_free_list_indx);
709
710 /*
711 * Add a WRID entry to the WRID list. Need to
712 * set the "signaled_dbd" values to pass to
713 * tavor_wrid_add_entry(). Note: all Recv WQEs are
714 * essentially "signaled"
715 *
716 * The 'size' is stored at srq_alloc time, in the
717 * srq_wq_stride. This is a constant value required
718 * for SRQ.
719 */
720 signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
721 tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
722 signaled_dbd);
723
724 /*
725 * Call tavor_wqe_srq_build() to build the WQE
726 * at the given address. This routine uses the
727 * information in the ibt_recv_wr_t list (wr[]) and
728 * returns the size of the WQE when it returns.
729 */
730 status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
731 desc);
732 if (status != DDI_SUCCESS) {
733 break;
734 }
735
736 /*
737 * If this is not the first descriptor on the current
738 * chain, then link it to the previous WQE. Otherwise,
739 * save the address of this descriptor (in "first") and
740 * continue.
741 */
742 if (currindx != 0) {
743 tavor_wqe_srq_linknext(desc, prev, srq);
744 sync_indx = TAVOR_SRQ_WQE_INDEX(
745 srq->srq_wq_buf, prev,
746 srq->srq_wq_log_wqesz);
747
748 /* Do a DMA sync for previous recv WQE */
749 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
750 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
751
752 prev = desc;
753 } else {
754
755 /*
756 * In this case, the last WQE on the chain is
757 * also considered 'first'. So set prev to
758 * first, here.
759 */
760 first = prev = desc;
761 }
762
763 /*
764 * Increment "posted_cnt"
765 */
766 posted_cnt++;
767 }
768
769 /*
770 * If we reach here and there are one or more WQEs which have
771 * been successfully chained together, then we need to link
772 * the current chain to the previously executing chain of
773 * descriptor (if there is one) and ring the doorbell for the
774 * recv work queue.
775 */
776 if (currindx != 0) {
777 /*
778 * Before we link the chain, we need to ensure that the
779 * "next" field on the last WQE is set to NULL (to
780 * indicate the end of the chain).
781 */
782 tavor_wqe_srq_linknext(NULL, prev, srq);
783
784 sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
785 srq->srq_wq_log_wqesz);
786
787 /* Do a DMA sync for current recv WQE */
788 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
789 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
790
791 /*
792 * Now link the chain to the old chain (if there was
793 * one).
794 */
795 if (srq->srq_wq_lastwqeindx == -1) {
796 last_wqe_addr = NULL;
797 } else {
798 last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
799 srq->srq_wq_lastwqeindx);
800 }
801 tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
802
803 /*
804 * If there was a valid previous WQE (i.e. valid index),
805 * then sync it too. This is because we have updated
806 * its "next" fields and we want to ensure that the
807 * hardware can see the changes.
808 */
809 if (srq->srq_wq_lastwqeindx != -1) {
810 sync_indx = srq->srq_wq_lastwqeindx;
811 tavor_wqe_sync(srq, sync_indx, sync_indx+1,
812 TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
813 }
814
815 			/* Update some of the state in the SRQ */
816 srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
817 srq->srq_wq_buf, desc,
818 srq->srq_wq_log_wqesz);
819
820 /* Ring the doorbell */
821 /* SRQ needs NDS of 0 */
822 tavor_qp_recv_doorbell(state,
823 (uint32_t)((uintptr_t)first - srq->srq_desc_off),
824 0, srq->srq_srqnum, (chainlen % maxdb));
825 }
826 }
827
828 /*
829 * Update the "num_posted" return value (if necessary). Then drop
830 * the locks and return success.
831 */
832 if (num_posted != NULL) {
833 *num_posted = posted_cnt;
834 }
835
836 mutex_exit(&srq->srq_wrid_wql->wql_lock);
837 mutex_exit(&srq->srq_lock);
838
839 return (status);
840 }
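/*
 * Note (summary, not new behavior): unlike the QP receive path above,
 * the SRQ path manages WQE slots through the wl_free_list_indx free
 * list rather than head/tail indices, and its doorbell is always rung
 * with an NDS of 0 (see the tavor_qp_recv_doorbell() call above).
 */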
841
842
843 /*
844 * tavor_qp_send_doorbell()
845 * Context: Can be called from interrupt or base context.
846 */
847 static void
848 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
849 uint32_t qpn, uint32_t fence, uint32_t nopcode)
850 {
851 uint64_t doorbell = 0;
852
853 /* Build the doorbell from the parameters */
854 doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
855 TAVOR_QPSNDDB_NDA_SHIFT) |
856 ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
857 ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
858 ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
859
860 /* Write the doorbell to UAR */
861 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
862 doorbell);
863 }
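/*
 * Illustrative note: the caller hands this routine the queue-relative
 * address of the first WQE in the new chain (nda), its size in 16-byte
 * chunks (nds), the QP number, and the fence/opcode values gathered in
 * the dbinfo structure; e.g. tavor_post_send() above passes
 * ((uintptr_t)first - qp->qp_desc_off) and first_sz.
 */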
864
865
866 /*
867 * tavor_qp_recv_doorbell()
868 * Context: Can be called from interrupt or base context.
869 */
870 static void
871 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
872 uint32_t qpn, uint32_t credits)
873 {
874 uint64_t doorbell = 0;
875
876 /* Build the doorbell from the parameters */
877 doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
878 TAVOR_QPRCVDB_NDA_SHIFT) |
879 ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
880 ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
881
882 /* Write the doorbell to UAR */
883 TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
884 doorbell);
885 }
886
887
888 /*
889 * tavor_wqe_send_build()
890 * Context: Can be called from interrupt or base context.
891 */
892 static int
893 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
894 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
895 {
896 tavor_hw_snd_wqe_ud_t *ud;
897 tavor_hw_snd_wqe_remaddr_t *rc;
898 tavor_hw_snd_wqe_atomic_t *at;
899 tavor_hw_snd_wqe_remaddr_t *uc;
900 tavor_hw_snd_wqe_bind_t *bn;
901 tavor_hw_wqe_sgl_t *ds;
902 ibt_wr_ds_t *sgl;
903 tavor_ahhdl_t ah;
904 uint32_t nds;
905 int i, num_ds, status;
906
907 ASSERT(MUTEX_HELD(&qp->qp_lock));
908
909 /* Initialize the information for the Data Segments */
910 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
911 sizeof (tavor_hw_snd_wqe_nextctrl_t));
912 nds = wr->wr_nds;
913 sgl = wr->wr_sgl;
914 num_ds = 0;
915
916 /*
917 	 * Building a Send WQE depends first and foremost on the transport
918 	 * type of the Work Request (i.e. UD, RC, or UC)
919 */
920 switch (wr->wr_trans) {
921 case IBT_UD_SRV:
922 /* Ensure that work request transport type matches QP type */
923 if (qp->qp_serv_type != TAVOR_QP_UD) {
924 return (IBT_QP_SRV_TYPE_INVALID);
925 }
926
927 /*
928 * Validate the operation type. For UD requests, only the
929 * "Send" operation is valid
930 */
931 if (wr->wr_opcode != IBT_WRC_SEND) {
932 return (IBT_QP_OP_TYPE_INVALID);
933 }
934
935 /*
936 * If this is a Special QP (QP0 or QP1), then we need to
937 * build MLX WQEs instead. So jump to tavor_wqe_mlx_build()
938 * and return whatever status it returns
939 */
940 if (qp->qp_is_special) {
941 status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
942 return (status);
943 }
944
945 /*
946 * Otherwise, if this is a normal UD Send request, then fill
947 * all the fields in the Tavor UD header for the WQE. Note:
948 * to do this we'll need to extract some information from the
949 * Address Handle passed with the work request.
950 */
951 ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
952 sizeof (tavor_hw_snd_wqe_nextctrl_t));
953 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
954 if (ah == NULL) {
955 return (IBT_AH_HDL_INVALID);
956 }
957
958 /*
959 * Build the Unreliable Datagram Segment for the WQE, using
960 * the information from the address handle and the work
961 * request.
962 */
963 mutex_enter(&ah->ah_lock);
964 TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
965 mutex_exit(&ah->ah_lock);
966
967 /* Update "ds" for filling in Data Segments (below) */
968 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
969 sizeof (tavor_hw_snd_wqe_ud_t));
970 break;
971
972 case IBT_RC_SRV:
973 /* Ensure that work request transport type matches QP type */
974 if (qp->qp_serv_type != TAVOR_QP_RC) {
975 return (IBT_QP_SRV_TYPE_INVALID);
976 }
977
978 /*
979 * Validate the operation type. For RC requests, we allow
980 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
981 * operations, and memory window "Bind"
982 */
983 if ((wr->wr_opcode != IBT_WRC_SEND) &&
984 (wr->wr_opcode != IBT_WRC_RDMAR) &&
985 (wr->wr_opcode != IBT_WRC_RDMAW) &&
986 (wr->wr_opcode != IBT_WRC_CSWAP) &&
987 (wr->wr_opcode != IBT_WRC_FADD) &&
988 (wr->wr_opcode != IBT_WRC_BIND)) {
989 return (IBT_QP_OP_TYPE_INVALID);
990 }
991
992 /*
993 * If this is a Send request, then all we need to do is break
994 		 * out here and begin the Data Segment processing below
995 */
996 if (wr->wr_opcode == IBT_WRC_SEND) {
997 break;
998 }
999
1000 /*
1001 * If this is an RDMA Read or RDMA Write request, then fill
1002 * in the "Remote Address" header fields.
1003 */
1004 if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1005 (wr->wr_opcode == IBT_WRC_RDMAW)) {
1006 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1007 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1008
1009 /*
1010 * Build the Remote Address Segment for the WQE, using
1011 * the information from the RC work request.
1012 */
1013 TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1014
1015 /* Update "ds" for filling in Data Segments (below) */
1016 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1017 sizeof (tavor_hw_snd_wqe_remaddr_t));
1018 break;
1019 }
1020
1021 /*
1022 		 * If this is one of the Atomic type operations (i.e.
1023 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1024 * Address" header fields and the "Atomic" header fields.
1025 */
1026 if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1027 (wr->wr_opcode == IBT_WRC_FADD)) {
1028 rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1029 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1030 at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1031 sizeof (tavor_hw_snd_wqe_remaddr_t));
1032
1033 /*
1034 * Build the Remote Address and Atomic Segments for
1035 * the WQE, using the information from the RC Atomic
1036 * work request.
1037 */
1038 TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1039 TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1040
1041 /* Update "ds" for filling in Data Segments (below) */
1042 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1043 sizeof (tavor_hw_snd_wqe_atomic_t));
1044
1045 /*
1046 * Update "nds" and "sgl" because Atomic requests have
1047 			 * somewhat differently in the work request).
1048 * somewhat differently in the work request.
1049 */
1050 nds = 1;
1051 sgl = wr->wr_sgl;
1052 break;
1053 }
1054
1055 /*
1056 * If this is memory window Bind operation, then we call the
1057 * tavor_wr_bind_check() routine to validate the request and
1058 * to generate the updated RKey. If this is successful, then
1059 * we fill in the WQE's "Bind" header fields.
1060 */
1061 if (wr->wr_opcode == IBT_WRC_BIND) {
1062 status = tavor_wr_bind_check(state, wr);
1063 if (status != DDI_SUCCESS) {
1064 return (status);
1065 }
1066
1067 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1068 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1069
1070 /*
1071 * Build the Bind Memory Window Segments for the WQE,
1072 * using the information from the RC Bind memory
1073 * window work request.
1074 */
1075 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1076
1077 /*
1078 * Update the "ds" pointer. Even though the "bind"
1079 * operation requires no SGLs, this is necessary to
1080 * facilitate the correct descriptor size calculations
1081 * (below).
1082 */
1083 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1084 sizeof (tavor_hw_snd_wqe_bind_t));
1085 nds = 0;
1086 }
1087 break;
1088
1089 case IBT_UC_SRV:
1090 /* Ensure that work request transport type matches QP type */
1091 if (qp->qp_serv_type != TAVOR_QP_UC) {
1092 return (IBT_QP_SRV_TYPE_INVALID);
1093 }
1094
1095 /*
1096 * Validate the operation type. For UC requests, we only
1097 * allow "Send", "RDMA Write", and memory window "Bind".
1098 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1099 * operations
1100 */
1101 if ((wr->wr_opcode != IBT_WRC_SEND) &&
1102 (wr->wr_opcode != IBT_WRC_RDMAW) &&
1103 (wr->wr_opcode != IBT_WRC_BIND)) {
1104 return (IBT_QP_OP_TYPE_INVALID);
1105 }
1106
1107 /*
1108 * If this is a Send request, then all we need to do is break
1109 		 * out here and begin the Data Segment processing below
1110 */
1111 if (wr->wr_opcode == IBT_WRC_SEND) {
1112 break;
1113 }
1114
1115 /*
1116 * If this is an RDMA Write request, then fill in the "Remote
1117 * Address" header fields.
1118 */
1119 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1120 uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1121 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1122
1123 /*
1124 * Build the Remote Address Segment for the WQE, using
1125 * the information from the UC work request.
1126 */
1127 TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1128
1129 /* Update "ds" for filling in Data Segments (below) */
1130 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1131 sizeof (tavor_hw_snd_wqe_remaddr_t));
1132 break;
1133 }
1134
1135 /*
1136 * If this is memory window Bind operation, then we call the
1137 * tavor_wr_bind_check() routine to validate the request and
1138 * to generate the updated RKey. If this is successful, then
1139 * we fill in the WQE's "Bind" header fields.
1140 */
1141 if (wr->wr_opcode == IBT_WRC_BIND) {
1142 status = tavor_wr_bind_check(state, wr);
1143 if (status != DDI_SUCCESS) {
1144 return (status);
1145 }
1146
1147 bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1148 sizeof (tavor_hw_snd_wqe_nextctrl_t));
1149
1150 /*
1151 * Build the Bind Memory Window Segments for the WQE,
1152 * using the information from the UC Bind memory
1153 * window work request.
1154 */
1155 TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1156
1157 /*
1158 * Update the "ds" pointer. Even though the "bind"
1159 * operation requires no SGLs, this is necessary to
1160 * facilitate the correct descriptor size calculations
1161 * (below).
1162 */
1163 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1164 sizeof (tavor_hw_snd_wqe_bind_t));
1165 nds = 0;
1166 }
1167 break;
1168
1169 default:
1170 return (IBT_QP_SRV_TYPE_INVALID);
1171 }
1172
1173 /*
1174 * Now fill in the Data Segments (SGL) for the Send WQE based on
1175 	 * the values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1176 	 * Start by checking for a valid number of SGL entries
1177 */
1178 if (nds > qp->qp_sq_sgl) {
1179 return (IBT_QP_SGL_LEN_INVALID);
1180 }
1181
1182 /*
1183 * For each SGL in the Send Work Request, fill in the Send WQE's data
1184 * segments. Note: We skip any SGL with zero size because Tavor
1185 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1186 * the encoding for zero means a 2GB transfer. Because of this special
1187 * encoding in the hardware, we mask the requested length with
1188 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1189 * zero.)
1190 */
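	/*
	 * Worked example (assuming a 31-bit byte count field, consistent
	 * with the 2GB note above): a request of exactly 0x80000000 bytes
	 * masked with TAVOR_WQE_SGL_BYTE_CNT_MASK becomes 0, which the
	 * hardware reads back as 2GB; a literal zero-length SGL, by
	 * contrast, is simply skipped in the loop below.
	 */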
1191 for (i = 0; i < nds; i++) {
1192 if (sgl[i].ds_len == 0) {
1193 continue;
1194 }
1195
1196 /*
1197 * Fill in the Data Segment(s) for the current WQE, using the
1198 * information contained in the scatter-gather list of the
1199 * work request.
1200 */
1201 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1202 num_ds++;
1203 }
1204
1205 /* Return the size of descriptor (in 16-byte chunks) */
1206 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
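	/* E.g. a WQE spanning 192 bytes from desc yields a size of 12. */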
1207
1208 return (DDI_SUCCESS);
1209 }
1210
1211
1212 /*
1213 * tavor_wqe_send_linknext()
1214 * Context: Can be called from interrupt or base context.
1215 */
1216 static void
1217 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1218 uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1219 tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1220 {
1221 uint64_t next, ctrl;
1222 uint32_t nopcode, fence;
1223
1224 /*
1225 * Calculate the "next" field of the descriptor. This amounts to
1226 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1227 * fields (see tavor_hw.h for more). Note: If there is no next
1228 * descriptor (i.e. if the current descriptor is the last WQE on
1229 * the chain), then set "next" to zero.
1230 */
1231 if (curr_desc != NULL) {
1232 /*
1233 * Determine the value for the Tavor WQE "nopcode" field
1234 * by using the IBTF opcode from the work request
1235 */
1236 switch (curr_wr->wr_opcode) {
1237 case IBT_WRC_RDMAW:
1238 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1239 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1240 } else {
1241 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1242 }
1243 break;
1244
1245 case IBT_WRC_SEND:
1246 if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1247 nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1248 } else {
1249 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1250 }
1251 break;
1252
1253 case IBT_WRC_RDMAR:
1254 nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1255 break;
1256
1257 case IBT_WRC_CSWAP:
1258 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1259 break;
1260
1261 case IBT_WRC_FADD:
1262 nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1263 break;
1264
1265 case IBT_WRC_BIND:
1266 nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1267 break;
1268 }
1269
1270 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1271 - qp->qp_desc_off);
1272 next = ((uint64_t)(uintptr_t)curr_desc &
1273 TAVOR_WQE_NDA_MASK) << 32;
1274 next = next | ((uint64_t)nopcode << 32);
1275 fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1276 if (fence) {
1277 next = next | TAVOR_WQE_SEND_FENCE_MASK;
1278 }
1279 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1280
1281 /*
1282 * If a send queue doorbell will be rung for the next
1283 * WQE on the chain, then set the current WQE's "dbd" bit.
1284 * Note: We also update the "dbinfo" structure here to pass
1285 * back information about what should (later) be included
1286 * in the send queue doorbell.
1287 */
1288 if (dbinfo) {
1289 next = next | TAVOR_WQE_DBD_MASK;
1290 dbinfo->db_nopcode = nopcode;
1291 dbinfo->db_fence = fence;
1292 }
1293 } else {
1294 next = 0;
1295 }
1296
1297 /*
1298 * If this WQE is supposed to be linked to the previous descriptor,
1299 * then we need to update not only the previous WQE's "next" fields
1300 * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
1301 * "s", "i" and "immediate" fields - see tavor_hw.h for more). Note:
1302 * the "e" bit is always hardcoded to zero.
1303 */
1304 if (prev_desc != NULL) {
1305 /*
1306 * If a send queue doorbell will be rung for the next WQE on
1307 * the chain, then update the current WQE's "next" field and
1308 * return.
1309 * Note: We don't want to modify the "ctrl" field here because
1310 * that portion of the previous WQE has already been set
1311 * correctly at some previous point in time.
1312 */
1313 if (dbinfo) {
1314 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1315 return;
1316 }
1317
1318 ctrl = 0;
1319
1320 /* Set the "c" (i.e. "signaled") bit appropriately */
1321 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1322 ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1323 }
1324
1325 /* Set the "s" (i.e. "solicited") bit appropriately */
1326 if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1327 ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1328 }
1329
1330 /* Set the "i" bit and the immediate data appropriately */
1331 if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1332 ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1333 ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1334 }
1335
1336 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1337 }
1338 }
1339
1340
1341 /*
1342 * tavor_wqe_mlx_build()
1343 * Context: Can be called from interrupt or base context.
1344 */
1345 static int
1346 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1347 ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1348 {
1349 tavor_hw_udav_t udav;
1350 tavor_ahhdl_t ah;
1351 ib_lrh_hdr_t *lrh;
1352 ib_grh_t *grh;
1353 ib_bth_hdr_t *bth;
1354 ib_deth_hdr_t *deth;
1355 tavor_hw_wqe_sgl_t *ds;
1356 ibt_wr_ds_t *sgl;
1357 uint8_t *mgmtclass, *hpoint, *hcount;
1358 uint64_t data;
1359 uint32_t nds, offset, pktlen;
1360 uint32_t desc_sz, udav_sz;
1361 int i, num_ds;
1362
1363 ASSERT(MUTEX_HELD(&qp->qp_lock));
1364
1365 /* Initialize the information for the Data Segments */
1366 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1367 sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1368
1369 /*
1370 * Pull the address handle from the work request and read in
1371 * the contents of the UDAV. This will be used to answer some
1372 * questions about the request.
1373 */
1374 ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1375 if (ah == NULL) {
1376 return (IBT_AH_HDL_INVALID);
1377 }
1378 mutex_enter(&ah->ah_lock);
1379 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1380 for (i = 0; i < udav_sz; i++) {
1381 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1382 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1383 ((uint64_t *)&udav)[i] = data;
1384 }
1385 mutex_exit(&ah->ah_lock);
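	/*
	 * Illustrative note: the UDAV is copied 8 bytes at a time above,
	 * so a (hypothetical) 32-byte tavor_hw_udav_t would take four
	 * ddi_get64() reads.
	 */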
1386
1387 /*
1388 * If the request is for QP1 and the destination LID is equal to
1389 * the Permissive LID, then return an error. This combination is
1390 * not allowed
1391 */
1392 if ((udav.rlid == IB_LID_PERMISSIVE) &&
1393 (qp->qp_is_special == TAVOR_QP_GSI)) {
1394 return (IBT_AH_HDL_INVALID);
1395 }
1396
1397 /*
1398 * Calculate the size of the packet headers, including the GRH
1399 * (if necessary)
1400 */
1401 desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1402 sizeof (ib_deth_hdr_t);
1403 if (udav.grh) {
1404 desc_sz += sizeof (ib_grh_t);
1405 }
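	/*
	 * For reference (assuming the ib_*_hdr_t structures match the IB
	 * wire sizes of 8, 12, and 8 bytes for LRH, BTH, and DETH): desc_sz
	 * is typically 28 bytes here, or 68 when the 40-byte GRH is added.
	 */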
1406
1407 /*
1408 * Begin to build the first "inline" data segment for the packet
1409 * headers. Note: By specifying "inline" we can build the contents
1410 	 * of the MAD packet headers directly into the work queue (as part of
1411 	 * the descriptor).  This has the advantage of both speeding things up
1412 * and of not requiring the driver to allocate/register any additional
1413 * memory for the packet headers.
1414 */
1415 TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1416 desc_sz += 4;
1417
1418 /*
1419 * Build Local Route Header (LRH)
1420 * We start here by building the LRH into a temporary location.
1421 * When we have finished we copy the LRH data into the descriptor.
1422 *
1423 * Notice that the VL values are hardcoded. This is not a problem
1424 * because VL15 is decided later based on the value in the MLX
1425 * transport "next/ctrl" header (see the "vl15" bit below), and it
1426 * is otherwise (meaning for QP1) chosen from the SL-to-VL table
1427 * values. This rule does not hold for loopback packets however
1428 * (all of which bypass the SL-to-VL tables) and it is the reason
1429 	 * that non-QP0 MADs are set up with VL hardcoded to zero below.
1430 *
1431 * Notice also that Source LID is hardcoded to the Permissive LID
1432 * (0xFFFF). This is also not a problem because if the Destination
1433 * LID is not the Permissive LID, then the "slr" value in the MLX
1434 * transport "next/ctrl" header will be set to zero and the hardware
1435 * will pull the LID from value in the port.
1436 */
1437 lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1438 pktlen = (desc_sz + 0x100) >> 2;
1439 TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1440
1441 /*
1442 * Build Global Route Header (GRH)
1443 * This is only built if necessary as defined by the "grh" bit in
1444 * the address vector. Note: We also calculate the offset to the
1445 * next header (BTH) based on whether or not the "grh" bit is set.
1446 */
1447 if (udav.grh) {
1448 /*
1449 * If the request is for QP0, then return an error. The
1450 		 * combination of global routing (GRH) and QP0 is not allowed.
1451 */
1452 if (qp->qp_is_special == TAVOR_QP_SMI) {
1453 return (IBT_AH_HDL_INVALID);
1454 }
1455 grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1456 TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1457
1458 bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1459 } else {
1460 bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1461 }
1462
1463
1464 /*
1465 * Build Base Transport Header (BTH)
1466 * Notice that the M, PadCnt, and TVer fields are all set
1467 * to zero implicitly. This is true for all Management Datagrams
1468 	 * (MADs), whether GSI or SMI.
1469 */
1470 TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1471
1472 /*
1473 * Build Datagram Extended Transport Header (DETH)
1474 */
1475 deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1476 TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1477
1478 /* Ensure that the Data Segment is aligned on a 16-byte boundary */
1479 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1480 ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1481 nds = wr->wr_nds;
1482 sgl = wr->wr_sgl;
1483 num_ds = 0;
1484
1485 /*
1486 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1487 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
1488 * Start by checking for a valid number of SGL entries
1489 */
1490 if (nds > qp->qp_sq_sgl) {
1491 return (IBT_QP_SGL_LEN_INVALID);
1492 }
1493
1494 /*
1495 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1496 * segments. Note: We skip any SGL with zero size because Tavor
1497 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1498 * the encoding for zero means a 2GB transfer. Because of this special
1499 * encoding in the hardware, we mask the requested length with
1500 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1501 * zero.)
1502 */
1503 mgmtclass = hpoint = hcount = NULL;
1504 offset = 0;
1505 for (i = 0; i < nds; i++) {
1506 if (sgl[i].ds_len == 0) {
1507 continue;
1508 }
1509
1510 /*
1511 * Fill in the Data Segment(s) for the MLX send WQE, using
1512 * the information contained in the scatter-gather list of
1513 * the work request.
1514 */
1515 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1516
1517 /*
1518 * Search through the contents of all MADs posted to QP0 to
1519 * initialize pointers to the places where Directed Route "hop
1520 * pointer", "hop count", and "mgmtclass" would be. Tavor
1521 * needs these updated (i.e. incremented or decremented, as
1522 * necessary) by software.
1523 */
1524 if (qp->qp_is_special == TAVOR_QP_SMI) {
1525
1526 TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1527 offset, sgl[i].ds_va, sgl[i].ds_len);
1528
1529 TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1530 offset, sgl[i].ds_va, sgl[i].ds_len);
1531
1532 TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1533 offset, sgl[i].ds_va, sgl[i].ds_len);
1534
1535 offset += sgl[i].ds_len;
1536 }
1537 num_ds++;
1538 }
1539
1540 /*
1541 * Tavor's Directed Route MADs need to have the "hop pointer"
1542 * incremented/decremented (as necessary) depending on whether it is
1543 * currently less than or greater than the "hop count" (i.e. whether
1544 * the MAD is a request or a response.)
1545 */
1546 if (qp->qp_is_special == TAVOR_QP_SMI) {
1547 TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1548 *hpoint, *hcount);
1549 }
1550
1551 /*
1552 * Now fill in the ICRC Data Segment. This data segment is inlined
1553 	 * just like the packet headers above, but it is only four bytes and
1554 	 * set to zero (to indicate that we wish the hardware to generate ICRC).
1555 */
1556 TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1557 num_ds++;
1558
1559 /* Return the size of descriptor (in 16-byte chunks) */
1560 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1561
1562 return (DDI_SUCCESS);
1563 }
1564
1565
1566 /*
1567 * tavor_wqe_mlx_linknext()
1568 * Context: Can be called from interrupt or base context.
1569 */
1570 static void
1571 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1572 uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1573 tavor_qphdl_t qp)
1574 {
1575 tavor_hw_udav_t udav;
1576 tavor_ahhdl_t ah;
1577 uint64_t next, ctrl, data;
1578 uint_t nopcode;
1579 uint_t udav_sz;
1580 int i;
1581
1582 /*
1583 * Calculate the "next" field of the descriptor. This amounts to
1584 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1585 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1586 * if the current descriptor is the last WQE on the chain), then set
1587 * "next" to zero.
1588 */
1589 if (curr_desc != NULL) {
1590 /*
1591 * The only valid Tavor WQE "nopcode" for MLX transport
1592 * requests is the "Send" code.
1593 */
1594 nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1595 curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1596 (uintptr_t)curr_desc - qp->qp_desc_off);
1597 next = (uint64_t)((uintptr_t)curr_desc &
1598 TAVOR_WQE_NDA_MASK) << 32;
1599 next = next | ((uint64_t)nopcode << 32);
1600 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1601
1602 /*
1603 * If a send queue doorbell will be rung for the next
1604 * WQE on the chain, then set the current WQE's "dbd" bit.
1605 * Note: We also update the "dbinfo" structure here to pass
1606 * back information about what should (later) be included
1607 * in the send queue doorbell.
1608 */
1609 if (dbinfo) {
1610 next = next | TAVOR_WQE_DBD_MASK;
1611 dbinfo->db_nopcode = nopcode;
1612 dbinfo->db_fence = 0;
1613 }
1614 } else {
1615 next = 0;
1616 }
1617
1618 /*
1619 * If this WQE is supposed to be linked to the previous descriptor,
1620 * then we need to update not only the previous WQE's "next" fields
1621 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1622 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1623 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1624 * always hardcoded to zero.
1625 */
1626 if (prev_desc != NULL) {
1627 /*
1628 * If a send queue doorbell will be rung for the next WQE on
1629 * the chain, then update the current WQE's "next" field and
1630 * return.
1631 * Note: We don't want to modify the "ctrl" field here because
1632 * that portion of the previous WQE has already been set
1633 * correctly at some previous point in time.
1634 */
1635 if (dbinfo) {
1636 TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1637 return;
1638 }
1639
1640 /*
1641 * Pull the address handle from the work request and read in
1642 * the contents of the UDAV. This will be used to answer some
1643 * questions about the request.
1644 */
1645 ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1646 mutex_enter(&ah->ah_lock);
1647 udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1648 for (i = 0; i < udav_sz; i++) {
1649 data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1650 ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1651 ((uint64_t *)&udav)[i] = data;
1652 }
1653 mutex_exit(&ah->ah_lock);
1654
1655 ctrl = 0;
1656
1657 /* Only QP0 uses VL15, otherwise use VL in the packet */
1658 if (qp->qp_is_special == TAVOR_QP_SMI) {
1659 ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1660 }
1661
1662 /*
1663 * The SLR (Source LID Replace) bit determines whether the
1664 * source LID for an outgoing MLX packet should come from the
1665 * PortInfo (SLR = 0) or should be left as it is in the
1666 * descriptor (SLR = 1). The latter is necessary for packets
1667 * to be sent with the Permissive LID.
1668 */
1669 if (udav.rlid == IB_LID_PERMISSIVE) {
1670 ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1671 }
1672
1673 /* Fill in the max static rate from the address handle */
1674 ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1675 TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1676
1677 /* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1678 if (qp->qp_is_special != TAVOR_QP_SMI) {
1679 ctrl = ctrl | ((uint64_t)udav.sl <<
1680 TAVOR_WQE_MLXHDR_SL_SHIFT);
1681 }
1682
1683 /* Set the "c" (i.e. "signaled") bit appropriately */
1684 if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1685 ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1686 }
1687
1688 /* Fill in the destination LID from the address handle */
1689 ctrl = ctrl | ((uint64_t)udav.rlid <<
1690 TAVOR_WQE_MLXHDR_RLID_SHIFT);
1691
1692 TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1693 }
1694 }
1695
1696
1697 /*
1698 * tavor_wqe_recv_build()
1699 * Context: Can be called from interrupt or base context.
1700 */
1701 /* ARGSUSED */
1702 static int
1703 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1704 ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1705 {
1706 tavor_hw_wqe_sgl_t *ds;
1707 int i, num_ds;
1708
1709 ASSERT(MUTEX_HELD(&qp->qp_lock));
1710
1711 /* Check that work request transport type is valid */
1712 if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1713 (qp->qp_serv_type != TAVOR_QP_RC) &&
1714 (qp->qp_serv_type != TAVOR_QP_UC)) {
1715 return (IBT_QP_SRV_TYPE_INVALID);
1716 }
1717
1718 /* Fill in the Data Segments (SGL) for the Recv WQE */
1719 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1720 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1721 num_ds = 0;
1722
1723 /* Check for valid number of SGL entries */
1724 if (wr->wr_nds > qp->qp_rq_sgl) {
1725 return (IBT_QP_SGL_LEN_INVALID);
1726 }
1727
1728 /*
1729 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1730 * segments. Note: We skip any SGL with zero size because Tavor
1731 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1732 * the encoding for zero means a 2GB transfer. Because of this special
1733 * encoding in the hardware, we mask the requested length with
1734 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1735 * zero.)
1736 */
1737 for (i = 0; i < wr->wr_nds; i++) {
1738 if (wr->wr_sgl[i].ds_len == 0) {
1739 continue;
1740 }
1741
1742 /*
1743 * Fill in the Data Segment(s) for the receive WQE, using the
1744 * information contained in the scatter-gather list of the
1745 * work request.
1746 */
1747 TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1748 num_ds++;
1749 }
1750
1751 /* Return the size of descriptor (in 16-byte chunks) */
1752 *size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1753
1754 return (DDI_SUCCESS);
1755 }
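/*
 * Illustrative sketch (not compiled into the driver): the two arithmetic
 * details used above -- the descriptor size reported in 16-byte chunks, and
 * the byte-count masking that turns a 2GB length into the hardware's zero
 * encoding.  The mask value below is an assumption for the sketch; the real
 * TAVOR_WQE_SGL_BYTE_CNT_MASK definition lives in tavor_hw.h.
 */
#if 0
static uint_t
sketch_desc_size_in_chunks(uintptr_t desc, uintptr_t past_last_ds)
{
	/* Each "chunk" reported to the hardware is 16 bytes, hence >> 0x4 */
	return ((uint_t)(past_last_ds - desc) >> 0x4);
}

static uint32_t
sketch_mask_byte_cnt(uint64_t ds_len)
{
	uint32_t mask = 0x7FFFFFFF;	/* assumed stand-in for the real mask */

	/* A 2GB (0x80000000) request masks to 0, the hardware encoding for 2GB */
	return ((uint32_t)ds_len & mask);
}
#endif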
1756
1757
1758 /*
1759 * tavor_wqe_recv_linknext()
1760 * Context: Can be called from interrupt or base context.
1761 */
1762 static void
1763 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1764 uint64_t *prev_desc, tavor_qphdl_t qp)
1765 {
1766 uint64_t next;
1767
1768 /*
1769 * Calculate the "next" field of the descriptor. This amounts to
1770 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1771 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1772 * if the current descriptor is the last WQE on the chain), then set
1773 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
1774 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1775 * In either case, we must add a single bit in the "reserved" field
1776 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
1777 * workaround for a known Tavor errata that can cause Recv WQEs with
1778 * zero in the NDA field to behave improperly.
1779 */
1780 if (curr_desc != NULL) {
1781 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1782 qp->qp_desc_off);
1783 next = (uint64_t)((uintptr_t)curr_desc &
1784 TAVOR_WQE_NDA_MASK) << 32;
1785 next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1786 TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1787 } else {
1788 next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1789 }
1790
1791 /*
1792 * If this WQE is supposed to be linked to the previous descriptor,
1793 * then we need to update not only the previous WQE's "next" fields
1794 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1795 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
1796 * bits are always hardcoded to zero.
1797 */
1798 if (prev_desc != NULL) {
1799 TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1800 }
1801 }
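/*
 * Illustrative sketch (not compiled into the driver): how the 64-bit "next"
 * word assembled above packs the next-WQE address (NDA) into the upper 32
 * bits and the descriptor size (NDS), the "dbd" bit, and the NDA0-errata
 * workaround bit into the lower bits.  All mask and bit positions shown here
 * are stand-ins for the sketch; the real TAVOR_WQE_* values are in
 * tavor_hw.h.
 */
#if 0
static uint64_t
sketch_recv_next_word(uintptr_t next_wqe_addr, uint_t next_descsz)
{
	uint64_t nda_mask = 0xFFFFFFC0;		/* assumed: aligned address bits */
	uint64_t nds_mask = 0x3F;		/* assumed: size in 16-byte chunks */
	uint64_t dbd_bit  = (1ull << 8);	/* assumed: "dbd" bit position */
	uint64_t nda0_wa  = (1ull << 7);	/* assumed: errata workaround bit */

	if (next_wqe_addr == 0) {
		/* Last WQE on the chain: still set "dbd" plus the workaround bit */
		return (dbd_bit | nda0_wa);
	}
	return (((uint64_t)(next_wqe_addr & nda_mask) << 32) |
	    ((uint64_t)next_descsz & nds_mask) | dbd_bit | nda0_wa);
}
#endif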
1802
1803
1804 /*
1805 * tavor_wqe_srq_build()
1806 * Context: Can be called from interrupt or base context.
1807 */
1808 /* ARGSUSED */
1809 static int
1810 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1811 ibt_recv_wr_t *wr, uint64_t *desc)
1812 {
1813 tavor_hw_wqe_sgl_t *ds;
1814 ibt_wr_ds_t end_sgl;
1815 int i, num_ds;
1816
1817 ASSERT(MUTEX_HELD(&srq->srq_lock));
1818
1819 /* Fill in the Data Segments (SGL) for the Recv WQE */
1820 ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1821 sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1822 num_ds = 0;
1823
1824 /* Check for valid number of SGL entries */
1825 if (wr->wr_nds > srq->srq_wq_sgl) {
1826 return (IBT_QP_SGL_LEN_INVALID);
1827 }
1828
1829 /*
1830 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1831 * segments. Note: We skip any SGL with zero size because Tavor
1832 * hardware cannot handle a zero for "byte_cnt" in the WQE. Actually
1833 * the encoding for zero means a 2GB transfer. Because of this special
1834 * encoding in the hardware, we mask the requested length with
1835 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1836 * zero.)
1837 */
1838 for (i = 0; i < wr->wr_nds; i++) {
1839 if (wr->wr_sgl[i].ds_len == 0) {
1840 continue;
1841 }
1842
1843 /*
1844 * Fill in the Data Segment(s) for the receive WQE, using the
1845 * information contained in the scatter-gather list of the
1846 * work request.
1847 */
1848 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1849 num_ds++;
1850 }
1851
1852 /*
1853 * For SRQ, if the number of data segments is less than the maximum
1854 * specified at alloc, then we have to fill in a special "key" entry in
1855 * the sgl entry after the last valid one in this post request. We do
1856 * that here.
1857 */
1858 if (num_ds < srq->srq_wq_sgl) {
1859 end_sgl.ds_va = 0;
1860 end_sgl.ds_len = 0;
1861 end_sgl.ds_key = 0x1;
1862 TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1863 }
1864
1865 return (DDI_SUCCESS);
1866 }
1867
1868
1869 /*
1870 * tavor_wqe_srq_linknext()
1871 * Context: Can be called from interrupt or base context.
1872 */
1873 static void
1874 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1875 tavor_srqhdl_t srq)
1876 {
1877 uint64_t next;
1878
1879 /*
1880 * Calculate the "next" field of the descriptor. This amounts to
1881 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1882 * tavor_hw.h for more). Note: If there is no next descriptor (i.e.
1883 * if the current descriptor is the last WQE on the chain), then set
1884 * "next" field to TAVOR_WQE_DBD_MASK. This is because the Tavor
1885 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1886 * In either case, we must add a single bit in the "reserved" field
1887 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA. This is the
1888 * workaround for a known Tavor errata that can cause Recv WQEs with
1889 * zero in the NDA field to behave improperly.
1890 */
1891 if (curr_desc != NULL) {
1892 curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1893 srq->srq_desc_off);
1894 next = (uint64_t)((uintptr_t)curr_desc &
1895 TAVOR_WQE_NDA_MASK) << 32;
1896 next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1897 } else {
1898 next = TAVOR_RCV_WQE_NDA0_WA_MASK;
1899 }
1900
1901 /*
1902 * If this WQE is supposed to be linked to the previous descriptor,
1903 * then we need to update not only the previous WQE's "next" fields
1904 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1905 * "e" bits - see tavor_hw.h for more). Note: both the "c" and "e"
1906 * bits are always hardcoded to zero.
1907 */
1908 if (prev_desc != NULL) {
1909 TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
1910 }
1911 }
1912
1913
1914 /*
1915 * tavor_wr_get_immediate()
1916 * Context: Can be called from interrupt or base context.
1917 */
1918 static uint32_t
1919 tavor_wr_get_immediate(ibt_send_wr_t *wr)
1920 {
1921 /*
1922 * This routine extracts the "immediate data" from the appropriate
1923 * location in the IBTF work request. Because of the way the
1924 * work request structure is defined, the location for this data
1925 * depends on the actual work request operation type.
1926 */
1927
1928 /* For RDMA Write, test if RC or UC */
1929 if (wr->wr_opcode == IBT_WRC_RDMAW) {
1930 if (wr->wr_trans == IBT_RC_SRV) {
1931 return (wr->wr.rc.rcwr.rdma.rdma_immed);
1932 } else { /* IBT_UC_SRV */
1933 return (wr->wr.uc.ucwr.rdma.rdma_immed);
1934 }
1935 }
1936
1937 /* For Send, test if RC, UD, or UC */
1938 if (wr->wr_opcode == IBT_WRC_SEND) {
1939 if (wr->wr_trans == IBT_RC_SRV) {
1940 return (wr->wr.rc.rcwr.send_immed);
1941 } else if (wr->wr_trans == IBT_UD_SRV) {
1942 return (wr->wr.ud.udwr_immed);
1943 } else { /* IBT_UC_SRV */
1944 return (wr->wr.uc.ucwr.send_immed);
1945 }
1946 }
1947
1948 /*
1949 * If any other type of request, then immediate is undefined
1950 */
1951 return (0);
1952 }
1953
1954
1955 /*
1956 * tavor_wqe_sync()
1957 * Context: Can be called from interrupt or base context.
1958 */
1959 static void
1960 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1961 uint_t sync_type, uint_t flag)
1962 {
1963 tavor_qphdl_t qp;
1964 tavor_srqhdl_t srq;
1965 uint_t is_sync_req;
1966 uint64_t *wqe_from, *wqe_to, *wqe_base, *wqe_top;
1967 ddi_dma_handle_t dmahdl;
1968 off_t offset;
1969 size_t length;
1970 uint32_t qsize;
1971 int status;
1972
1973 if (sync_type == TAVOR_WR_SRQ) {
1974 srq = (tavor_srqhdl_t)hdl;
1975 is_sync_req = srq->srq_sync;
1976 /* Get the DMA handle from SRQ context */
1977 dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
1978 } else {
1979 qp = (tavor_qphdl_t)hdl;
1980 is_sync_req = qp->qp_sync;
1981 /* Get the DMA handle from QP context */
1982 dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
1983 }
1984
1985 /* Determine if the work queues need to be synced or not */
1986 if (is_sync_req == 0) {
1987 return;
1988 }
1989
1990 /*
1991 * Depending on the type of the work queue, we grab information
1992 * about the address ranges we need to DMA sync.
1993 */
1994 if (sync_type == TAVOR_WR_SEND) {
1995 wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
1996 wqe_to = TAVOR_QP_SQ_ENTRY(qp, sync_to);
1997 qsize = qp->qp_sq_bufsz;
1998
1999 wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2000 wqe_top = TAVOR_QP_SQ_ENTRY(qp, qsize);
2001 } else if (sync_type == TAVOR_WR_RECV) {
2002 wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2003 wqe_to = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2004 qsize = qp->qp_rq_bufsz;
2005
2006 wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2007 wqe_top = TAVOR_QP_RQ_ENTRY(qp, qsize);
2008 } else {
2009 wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2010 wqe_to = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2011 qsize = srq->srq_wq_bufsz;
2012
2013 wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2014 wqe_top = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2015 }
2016
2017 /*
2018 * There are two possible cases for the beginning and end of the WQE
2019 * chain we are trying to sync. Either this is the simple case, where
2020 * the end of the chain lies after its beginning in the buffer, or it is
2021 * the "wrap-around" case, where the end of the chain has wrapped past
2022 * the end of the queue. In the former case, we simply need to
2023 * calculate the span from beginning to end and sync it. In the latter
2024 * case, however, we need to calculate the span from the top of the
2025 * work queue to the end of the chain and sync that, and then we need
2026 * to find the other portion (from beginning of chain to end of queue)
2027 * and sync that as well. Note: if the "top to end" span is actually
2028 * zero length, then we don't do a DMA sync because a zero length DMA
2029 * sync unnecessarily syncs the entire work queue.
2030 */
2031 if (wqe_to > wqe_from) {
2032 /* "From Beginning to End" */
2033 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2034 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2035
2036 status = ddi_dma_sync(dmahdl, offset, length, flag);
2037 if (status != DDI_SUCCESS) {
2038 return;
2039 }
2040 } else {
2041 /* "From Top to End" */
2042 offset = (off_t)0;
2043 length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2044 if (length) {
2045 status = ddi_dma_sync(dmahdl, offset, length, flag);
2046 if (status != DDI_SUCCESS) {
2047 return;
2048 }
2049 }
2050
2051 /* "From Beginning to Bottom" */
2052 offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2053 length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2054 status = ddi_dma_sync(dmahdl, offset, length, flag);
2055 if (status != DDI_SUCCESS) {
2056 return;
2057 }
2058 }
2059 }
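/*
 * Illustrative sketch (not compiled into the driver): the offset/length
 * arithmetic used above for the simple and wrap-around sync cases, written
 * out with plain queue indices instead of WQE pointers.  The 'wqesz'
 * per-entry size parameter is hypothetical and exists only for this sketch.
 */
#if 0
static void
sketch_sync_spans(ddi_dma_handle_t dmahdl, uint_t from, uint_t to,
    uint_t qsize, size_t wqesz, uint_t flag)
{
	if (to > from) {
		/* Simple case: one contiguous span from "from" up to "to" */
		(void) ddi_dma_sync(dmahdl, (off_t)(from * wqesz),
		    (size_t)((to - from) * wqesz), flag);
	} else {
		/* Wrap-around: top of the queue up to "to" (skip zero-length sync) */
		if (to != 0) {
			(void) ddi_dma_sync(dmahdl, 0,
			    (size_t)(to * wqesz), flag);
		}
		/* ...then from "from" down to the bottom of the queue */
		(void) ddi_dma_sync(dmahdl, (off_t)(from * wqesz),
		    (size_t)((qsize - from) * wqesz), flag);
	}
}
#endif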
2060
2061
2062 /*
2063 * tavor_wr_bind_check()
2064 * Context: Can be called from interrupt or base context.
2065 */
2066 static int
2067 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2068 {
2069 ibt_bind_flags_t bind_flags;
2070 uint64_t vaddr, len;
2071 uint64_t reg_start_addr, reg_end_addr;
2072 tavor_mwhdl_t mw;
2073 tavor_mrhdl_t mr;
2074 tavor_rsrc_t *mpt;
2075 uint32_t new_rkey;
2076
2077 /* Check for a valid Memory Window handle in the WR */
2078 mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2079 if (mw == NULL) {
2080 return (IBT_MW_HDL_INVALID);
2081 }
2082
2083 /* Check for a valid Memory Region handle in the WR */
2084 mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2085 if (mr == NULL) {
2086 return (IBT_MR_HDL_INVALID);
2087 }
2088
2089 mutex_enter(&mr->mr_lock);
2090 mutex_enter(&mw->mr_lock);
2091
2092 /*
2093 * Check here to see if the memory region has already been partially
2094 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2095 * If so, this is an error, return failure.
2096 */
2097 if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2098 mutex_exit(&mr->mr_lock);
2099 mutex_exit(&mw->mr_lock);
2100 return (IBT_MR_HDL_INVALID);
2101 }
2102
2103 /* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2104 if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2105 mutex_exit(&mr->mr_lock);
2106 mutex_exit(&mw->mr_lock);
2107 return (IBT_MR_RKEY_INVALID);
2108 }
2109
2110 /* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2111 if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2112 mutex_exit(&mr->mr_lock);
2113 mutex_exit(&mw->mr_lock);
2114 return (IBT_MR_LKEY_INVALID);
2115 }
2116
2117 /*
2118 * Now check for valid "vaddr" and "len". Note: We don't check the
2119 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2120 */
2121 len = wr->wr.rc.rcwr.bind->bind_len;
2122 if (len != 0) {
2123 vaddr = wr->wr.rc.rcwr.bind->bind_va;
2124 reg_start_addr = mr->mr_bindinfo.bi_addr;
2125 reg_end_addr = mr->mr_bindinfo.bi_addr +
2126 (mr->mr_bindinfo.bi_len - 1);
2127 if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2128 mutex_exit(&mr->mr_lock);
2129 mutex_exit(&mw->mr_lock);
2130 return (IBT_MR_VA_INVALID);
2131 }
2132 vaddr = (vaddr + len) - 1;
2133 if (vaddr > reg_end_addr) {
2134 mutex_exit(&mr->mr_lock);
2135 mutex_exit(&mw->mr_lock);
2136 return (IBT_MR_LEN_INVALID);
2137 }
2138 }
2139
2140 /*
2141 * Validate the bind access flags. Remote Write and Atomic access for
2142 * the Memory Window require that Local Write access be set in the
2143 * corresponding Memory Region.
2144 */
2145 bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2146 if (((bind_flags & IBT_WR_BIND_WRITE) ||
2147 (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2148 !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2149 mutex_exit(&mr->mr_lock);
2150 mutex_exit(&mw->mr_lock);
2151 return (IBT_MR_ACCESS_REQ_INVALID);
2152 }
2153
2154 /* Calculate the new RKey for the Memory Window */
2155 mpt = mw->mr_mptrsrcp;
2156 tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2157
2158 wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2159 mw->mr_rkey = new_rkey;
2160
2161 mutex_exit(&mr->mr_lock);
2162 mutex_exit(&mw->mr_lock);
2163 return (DDI_SUCCESS);
2164 }
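/*
 * Illustrative sketch (not compiled into the driver): the range check
 * performed above, i.e. that [vaddr, vaddr + len - 1] falls entirely within
 * the registered region [reg_start, reg_start + reg_len - 1], with the
 * check skipped for unbind (len == 0).  The negative return values are
 * stand-ins for the IBT error codes used above.
 */
#if 0
static int
sketch_bind_range_check(uint64_t vaddr, uint64_t len,
    uint64_t reg_start, uint64_t reg_len)
{
	uint64_t reg_end = reg_start + (reg_len - 1);

	if (len == 0)
		return (0);		/* unbind operation: no range check */
	if ((vaddr < reg_start) || (vaddr > reg_end))
		return (-1);		/* stand-in for IBT_MR_VA_INVALID */
	if (((vaddr + len) - 1) > reg_end)
		return (-2);		/* stand-in for IBT_MR_LEN_INVALID */
	return (0);
}
#endif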
2165
2166
2167 /*
2168 * tavor_wrid_from_reset_handling()
2169 * Context: Can be called from interrupt or base context.
2170 */
2171 int
2172 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2173 {
2174 tavor_workq_hdr_t *swq, *rwq;
2175 tavor_wrid_list_hdr_t *s_wridlist, *r_wridlist;
2176 uint_t create_new_swq = 0, create_new_rwq = 0;
2177 uint_t create_wql = 0;
2178 uint_t qp_srq_en;
2179
2180 /*
2181 * For each of this QP's Work Queues, make sure we have a (properly
2182 * initialized) Work Request ID list attached to the relevant
2183 * completion queue. Grab the CQ lock(s) before manipulating the
2184 * lists.
2185 */
2186 tavor_wrid_wqhdr_lock_both(qp);
2187 swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2188 TAVOR_WR_SEND);
2189 if (swq == NULL) {
2190 /* Couldn't find matching work queue header, create it */
2191 create_new_swq = create_wql = 1;
2192 swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2193 qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2194 if (swq == NULL) {
2195 /*
2196 * If we couldn't find/allocate space for the workq
2197 * header, then drop the lock(s) and return failure.
2198 */
2199 tavor_wrid_wqhdr_unlock_both(qp);
2200 return (ibc_get_ci_failure(0));
2201 }
2202 }
2203 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2204 qp->qp_sq_wqhdr = swq;
2205 swq->wq_size = qp->qp_sq_bufsz;
2206 swq->wq_head = 0;
2207 swq->wq_tail = 0;
2208 swq->wq_full = 0;
2209
2210 /*
2211 * Allocate space for the tavor_wrid_entry_t container
2212 */
2213 s_wridlist = tavor_wrid_get_list(swq->wq_size);
2214 if (s_wridlist == NULL) {
2215 /*
2216 * If we couldn't allocate space for tracking the WRID
2217 * entries, then cleanup the workq header from above (if
2218 * necessary, i.e. if we created the workq header). Then
2219 * drop the lock(s) and return failure.
2220 */
2221 if (create_new_swq) {
2222 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2223 }
2224
2225 tavor_wrid_wqhdr_unlock_both(qp);
2226 return (ibc_get_ci_failure(0));
2227 }
2228 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2229 s_wridlist->wl_wqhdr = swq;
2230
2231 /* Chain the new WRID list container to the workq hdr list */
2232 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2233 tavor_wrid_wqhdr_add(swq, s_wridlist);
2234 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2235
2236 qp_srq_en = qp->qp_srq_en;
2237
2238 #ifdef __lock_lint
2239 mutex_enter(&qp->qp_srqhdl->srq_lock);
2240 #else
2241 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2242 mutex_enter(&qp->qp_srqhdl->srq_lock);
2243 }
2244 #endif
2245 /*
2246 * Now we repeat all the above operations for the receive work queue,
2247 * or shared receive work queue.
2248 *
2249 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2250 */
2251 rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2252 TAVOR_WR_RECV);
2253 if (rwq == NULL) {
2254 create_new_rwq = create_wql = 1;
2255
2256 /*
2257 * If this QP is associated with an SRQ, and this isn't the
2258 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2259 * created. Since the WQL is created at 'wqhdr_create' time, we
2260 * pass in the 'create_wql' flag as 0 here if we have already
2261 * created it. Later on below we then set up the WQL and rwq
2262 * information based on the existing SRQ info.
2263 */
2264 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2265 qp->qp_srqhdl->srq_wrid_wql != NULL) {
2266 create_wql = 0;
2267 }
2268
2269 rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2270 qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2271 if (rwq == NULL) {
2272 /*
2273 * If we couldn't find/allocate space for the workq
2274 * header, then free all the send queue resources we
2275 * just allocated and setup (above), drop the lock(s)
2276 * and return failure.
2277 */
2278 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2279 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2280 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2281 if (create_new_swq) {
2282 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2283 swq);
2284 }
2285
2286 #ifdef __lock_lint
2287 mutex_exit(&qp->qp_srqhdl->srq_lock);
2288 #else
2289 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2290 mutex_exit(&qp->qp_srqhdl->srq_lock);
2291 }
2292 #endif
2293
2294 tavor_wrid_wqhdr_unlock_both(qp);
2295 return (ibc_get_ci_failure(0));
2296 }
2297 }
2298 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2299
2300 /*
2301 * Setup receive workq hdr
2302 *
2303 * If the QP is on an SRQ, we set up the SRQ-specific fields:
2304 * keeping a copy of the rwq pointer, setting the rwq bufsize
2305 * appropriately, and initializing our part of the WQLock.
2306 *
2307 * In the normal QP case, the QP recv queue bufsize is used.
2308 */
2309 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2310 rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2311 if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2312 qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2313 } else {
2314 rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2315 }
2316 tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2317
2318 } else {
2319 rwq->wq_size = qp->qp_rq_bufsz;
2320 }
2321
2322 qp->qp_rq_wqhdr = rwq;
2323 rwq->wq_head = 0;
2324 rwq->wq_tail = 0;
2325 rwq->wq_full = 0;
2326
2327 /*
2328 * Allocate space for the tavor_wrid_entry_t container.
2329 *
2330 * If the QP is on an SRQ and the srq_wridlist is NULL, then we must
2331 * allocate the wridlist normally. However, if the srq_wridlist is
2332 * not NULL, then we know this SRQ has already been initialized, and
2333 * thus its wridlist has already been initialized. So we re-use the
2334 * srq_wridlist as the r_wridlist for this QP in this case.
2335 */
2336 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2337 qp->qp_srqhdl->srq_wridlist != NULL) {
2338 /* Use existing srq_wridlist pointer */
2339 r_wridlist = qp->qp_srqhdl->srq_wridlist;
2340 ASSERT(r_wridlist != NULL);
2341 } else {
2342 /* Allocate memory for the r_wridlist */
2343 r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2344 }
2345
2346 /*
2347 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2348 * is mistakenly NULL), we clean up our previous swq allocation from
2349 * above.
2350 */
2351 if (r_wridlist == NULL) {
2352 /*
2353 * If we couldn't allocate space for tracking the WRID
2354 * entries, then cleanup all the stuff from above. Then
2355 * drop the lock(s) and return failure.
2356 */
2357 mutex_enter(&swq->wq_wrid_wql->wql_lock);
2358 tavor_wrid_wqhdr_remove(swq, s_wridlist);
2359 mutex_exit(&swq->wq_wrid_wql->wql_lock);
2360 if (create_new_swq) {
2361 tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2362 }
2363 if (create_new_rwq) {
2364 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2365 }
2366
2367 #ifdef __lock_lint
2368 mutex_exit(&qp->qp_srqhdl->srq_lock);
2369 #else
2370 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2371 mutex_exit(&qp->qp_srqhdl->srq_lock);
2372 }
2373 #endif
2374
2375 tavor_wrid_wqhdr_unlock_both(qp);
2376 return (ibc_get_ci_failure(0));
2377 }
2378 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2379
2380 /*
2381 * Initialize the wridlist
2382 *
2383 * In the normal QP case, there is no special initialization needed.
2384 * We simply setup the wridlist backpointer to be the receive wqhdr
2385 * (rwq).
2386 *
2387 * But in the SRQ case, no backpointer to the wqhdr is possible.
2388 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2389 * and thus potentially shared across multiple QPs using the SRQ. We
2390 * also set the srq_wridlist pointer to be the r_wridlist, and
2391 * initialize the freelist to an invalid index. This srq_wridlist
2392 * pointer is checked above on future transitions from the Reset state
2393 * to let us know that the srq_wridlist has already been initialized.
2394 *
2395 * And finally, if we are in a non-UMAP case, we setup the srq wrid
2396 * free list.
2397 */
2398 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2399 qp->qp_srqhdl->srq_wridlist == NULL) {
2400 r_wridlist->wl_srq_en = 1;
2401 r_wridlist->wl_free_list_indx = -1;
2402 qp->qp_srqhdl->srq_wridlist = r_wridlist;
2403
2404 /* Initialize srq wrid free list */
2405 if (qp->qp_srqhdl->srq_is_umap == 0) {
2406 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2407 tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2408 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2409 }
2410 } else {
2411 r_wridlist->wl_wqhdr = rwq;
2412 }
2413
2414 /* Chain the WRID list "container" to the workq hdr list */
2415 mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2416 tavor_wrid_wqhdr_add(rwq, r_wridlist);
2417 mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2418
2419 #ifdef __lock_lint
2420 mutex_exit(&qp->qp_srqhdl->srq_lock);
2421 #else
2422 if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2423 mutex_exit(&qp->qp_srqhdl->srq_lock);
2424 }
2425 #endif
2426
2427 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2428 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2429 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2430 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2431
2432 tavor_wrid_wqhdr_unlock_both(qp);
2433 return (DDI_SUCCESS);
2434 }
2435
2436
2437 /*
2438 * tavor_wrid_to_reset_handling()
2439 * Context: Can be called from interrupt or base context.
2440 */
2441 void
2442 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2443 {
2444 uint_t free_wqhdr = 0;
2445
2446 /*
2447 * For each of this QP's Work Queues, move the WRID "container" to
2448 * the "reapable" list. Although there may still be unpolled
2449 * entries in these containers, it is not a big deal. We will not
2450 * reap the list until either the Poll CQ command detects an empty
2451 * condition or the CQ itself is freed. Grab the CQ lock(s) before
2452 * manipulating the lists.
2453 */
2454 mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2455 tavor_wrid_wqhdr_lock_both(qp);
2456 tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2457
2458 /*
2459 * Add the receive work queue header on to the reaplist. But if we are
2460 * on an SRQ, then don't add anything to the reaplist. Instead we flush
2461 * the SRQ entries on the CQ, remove the wridlist from the WQHDR, and free the
2462 * WQHDR (if needed). We must hold the WQL for these operations, yet
2463 * the call to tavor_cq_wqhdr_remove grabs the WQL internally. So we
2464 * drop WQL before that call. Then release the CQ WQHDR locks and the
2465 * CQ lock and return.
2466 */
2467 if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2468
2469 /*
2470 * Pull off all (if any) entries for this QP from CQ. This
2471 * only includes entries that have not yet been polled
2472 */
2473 mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2474 tavor_cq_srq_entries_flush(state, qp);
2475
2476 /* Remove wridlist from WQHDR */
2477 tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2478 qp->qp_rq_wqhdr->wq_wrid_post);
2479
2480 /* If wridlist chain is now empty, remove the wqhdr as well */
2481 if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2482 free_wqhdr = 1;
2483 } else {
2484 free_wqhdr = 0;
2485 }
2486
2487 mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2488
2489 /* Free the WQHDR */
2490 if (free_wqhdr) {
2491 tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2492 }
2493 } else {
2494 tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2495 }
2496 tavor_wrid_wqhdr_unlock_both(qp);
2497 mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2498 }
2499
2500
2501 /*
2502 * tavor_wrid_add_entry()
2503 * Context: Can be called from interrupt or base context.
2504 */
2505 void
2506 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2507 uint_t signaled_dbd)
2508 {
2509 tavor_wrid_entry_t *wre_tmp;
2510 uint32_t head, tail, size;
2511
2512 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2513
2514 /*
2515 * Find the entry in the container pointed to by the "tail" index.
2516 * Add all of the relevant information to that entry, including WRID,
2517 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2518 * and/or doorbelled.
2519 */
2520 head = wq->wq_wrid_post->wl_head;
2521 tail = wq->wq_wrid_post->wl_tail;
2522 size = wq->wq_wrid_post->wl_size;
2523 wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2524 wre_tmp->wr_wrid = wrid;
2525 wre_tmp->wr_wqeaddrsz = wqeaddrsz;
2526 wre_tmp->wr_signaled_dbd = signaled_dbd;
2527
2528 /*
2529 * Update the "wrid_old_tail" pointer to point to the entry we just
2530 * inserted into the queue. By tracking this pointer (the pointer to
2531 * the most recently inserted entry) it will be possible later in the
2532 * PostSend() and PostRecv() code paths to find the entry that needs
2533 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2534 * tavor_post_send()).
2535 */
2536 wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2537
2538 /* Update the tail index */
2539 tail = ((tail + 1) & (size - 1));
2540 wq->wq_wrid_post->wl_tail = tail;
2541
2542 /*
2543 * If the "tail" index has just wrapped over into the "head" index,
2544 * then we have filled the container. We use the "full" flag to
2545 * indicate this condition and to distinguish it from the "empty"
2546 * condition (where head and tail are also equal).
2547 */
2548 if (head == tail) {
2549 wq->wq_wrid_post->wl_full = 1;
2550 }
2551 }
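/*
 * Illustrative sketch (not compiled into the driver): the power-of-two ring
 * arithmetic used above.  Because the container size is always a power of
 * two, the tail wraps with a mask instead of a modulo, and "head == tail"
 * after an insert is disambiguated from "empty" by the separate "full" flag.
 */
#if 0
typedef struct sketch_ring_s {
	uint32_t	head;
	uint32_t	tail;
	uint32_t	size;	/* must be a power of two */
	uint32_t	full;
} sketch_ring_t;

static void
sketch_ring_insert(sketch_ring_t *r)
{
	/* ...the caller has just filled in the entry at index r->tail... */
	r->tail = (r->tail + 1) & (r->size - 1);
	if (r->head == r->tail)
		r->full = 1;	/* tail just caught up with head: container full */
}
#endif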
2552
2553 /*
2554 * tavor_wrid_add_entry_srq()
2555 * Context: Can be called from interrupt or base context
2556 */
2557 void
2558 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2559 {
2560 tavor_wrid_entry_t *wre;
2561 uint64_t *wl_wqe;
2562 uint32_t wqe_index;
2563
2564 /*
2565 * Find the next available WQE from the SRQ free_list. Then update the
2566 * free_list to point to the next entry
2567 */
2568 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2569
2570 wqe_index = srq->srq_wridlist->wl_free_list_indx;
2571
2572 /* ASSERT on impossible wqe_index values */
2573 ASSERT(wqe_index < srq->srq_wq_bufsz);
2574
2575 /*
2576 * Setup the WRE.
2577 *
2578 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2579 * And we mark the WRE as signaled_dbd so that, when polling the CQ, we can
2580 * find this information and associate the WRID with the WQE found on the CQE.
2581 */
2582 wre = &srq->srq_wridlist->wl_wre[wqe_index];
2583 wre->wr_wrid = wrid;
2584 wre->wr_signaled_dbd = signaled_dbd;
2585
2586 /* Update the free list index */
2587 srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2588 srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2589 }
2590
2591
2592 /*
2593 * tavor_wrid_get_entry()
2594 * Context: Can be called from interrupt or base context.
2595 */
2596 uint64_t
2597 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2598 tavor_wrid_entry_t *wre)
2599 {
2600 tavor_workq_hdr_t *wq;
2601 tavor_wrid_entry_t *wre_tmp;
2602 uint64_t wrid;
2603 uint_t send_or_recv, qpnum, error, opcode;
2604
2605 /* Lock the list of work queues associated with this CQ */
2606 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2607
2608 /*
2609 * Determine whether this CQE is a send or receive completion (and
2610 * whether it was a "successful" completion or not)
2611 */
2612 opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2613 if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2614 (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2615 error = 1;
2616 send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2617 TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2618 } else {
2619 error = 0;
2620 send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2621 }
2622
2623 /* Find the work queue for this QP number (send or receive side) */
2624 qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2625 wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2626 ASSERT(wq != NULL);
2627
2628 /*
2629 * Regardless of whether the completion is the result of a "success"
2630 * or a "failure", we lock the list of "containers" and attempt to
2631 * search for the first matching completion (i.e. the first WR
2632 * with a matching WQE addr and size). Once we find it, we pull out
2633 * the "wrid" field and return it (see below). Note: One possible
2634 * future enhancement would be to enable this routine to skip over
2635 * any "unsignaled" completions to go directly to the next "signaled"
2636 * entry on success. XXX
2637 */
2638 mutex_enter(&wq->wq_wrid_wql->wql_lock);
2639 wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2640
2641 /*
2642 * If this is a "successful" completion, then we assert that this
2643 * completion must be a "signaled" completion.
2644 */
2645 ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2646
2647 /*
2648 * If the completion is a "failed" completion, then we save away the
2649 * contents of the entry (into the "wre" field passed in) for use
2650 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2651 * function to grab "wqeaddrsz" from the next entry in the container.
2652 * This is required for error processing (where updating these fields
2653 * properly is necessary to correct handling of the "error" CQE)
2654 */
2655 if (error && (wre != NULL)) {
2656 *wre = *wre_tmp;
2657 wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2658 }
2659
2660 /* Pull out the WRID and return it */
2661 wrid = wre_tmp->wr_wrid;
2662
2663 mutex_exit(&wq->wq_wrid_wql->wql_lock);
2664 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2665
2666 return (wrid);
2667 }
2668
2669
2670 /*
2671 * tavor_wrid_find_match()
2672 * Context: Can be called from interrupt or base context.
2673 */
2674 static tavor_wrid_entry_t *
2675 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2676 tavor_hw_cqe_t *cqe)
2677 {
2678 tavor_wrid_entry_t *curr = NULL;
2679 tavor_wrid_list_hdr_t *container;
2680 uint32_t wqeaddr_size;
2681 uint32_t head, tail, size;
2682 int found = 0, last_container;
2683
2684 ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2685
2686 /* Pull the "wqeaddrsz" information from the CQE */
2687 wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2688
2689 /*
2690 * Walk the "containers" list(s), find first WR with a matching WQE
2691 * addr. If the current "container" is not the last one on the list,
2692 * i.e. not the current one to which we are posting new WRID entries,
2693 * then we do not attempt to update the "q_head", "q_tail", and
2694 * "q_full" indicators on the main work queue header. We do, however,
2695 * update the "head" and "full" indicators on the individual containers
2696 * as we go. This is imperative because we need to be able to
2697 * determine when the current container has been emptied (so that we
2698 * can move on to the next container).
2699 */
2700 container = wq->wq_wrid_poll;
2701 while (container != NULL) {
2702 /* Is this the last/only "container" on the list */
2703 last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2704
2705 /*
2706 * First check if we are on an SRQ. If so, we grab the entry
2707 * and break out. Since SRQ wridlist's are never added to
2708 * reaplist, they can only be the last container.
2709 */
2710 if (container->wl_srq_en) {
2711 ASSERT(last_container == 1);
2712 curr = tavor_wrid_find_match_srq(container, cq, cqe);
2713 break;
2714 }
2715
2716 /*
2717 * Grab the current "head", "tail" and "size" fields before
2718 * walking the list in the current container. Note: the "size"
2719 * field here must always be a power-of-2. The "full"
2720 * parameter is checked (and updated) here to distinguish the
2721 * "queue full" condition from "queue empty".
2722 */
2723 head = container->wl_head;
2724 tail = container->wl_tail;
2725 size = container->wl_size;
2726 while ((head != tail) || (container->wl_full)) {
2727 container->wl_full = 0;
2728 curr = &container->wl_wre[head];
2729 head = ((head + 1) & (size - 1));
2730
2731 /*
2732 * If the current entry's "wqeaddrsz" matches the one
2733 * we're searching for, then this must correspond to
2734 * the work request that caused the completion. Set
2735 * the "found" flag and bail out.
2736 */
2737 if (curr->wr_wqeaddrsz == wqeaddr_size) {
2738 found = 1;
2739 break;
2740 }
2741 }
2742
2743 /*
2744 * If the current container is empty (having reached here the
2745 * "head == tail" condition can only mean that the container
2746 * is empty), then NULL out the "wrid_old_tail" field (see
2747 * tavor_post_send() and tavor_post_recv() for more details)
2748 * and (potentially) remove the current container from future
2749 * searches.
2750 */
2751 if (head == tail) {
2752
2753 container->wl_wre_old_tail = NULL;
2754 /*
2755 * If this wasn't the last "container" on the chain,
2756 * i.e. the one to which new WRID entries will be
2757 * added, then remove it from the list.
2758 * Note: we don't "lose" the memory pointed to by this
2759 * because we should have already put this container
2760 * on the "reapable" list (from where it will later be
2761 * pulled).
2762 */
2763 if (!last_container) {
2764 wq->wq_wrid_poll = container->wl_next;
2765 }
2766 }
2767
2768 /* Update the head index for the container */
2769 container->wl_head = head;
2770
2771 /*
2772 * If the entry was found in this container, then continue to
2773 * bail out. Else reset the "curr" pointer and move on to the
2774 * next container (if there is one). Note: the only real
2775 * reason for setting "curr = NULL" here is so that the ASSERT
2776 * below can catch the case where no matching entry was found
2777 * on any of the lists.
2778 */
2779 if (found) {
2780 break;
2781 } else {
2782 curr = NULL;
2783 container = container->wl_next;
2784 }
2785 }
2786
2787 /*
2788 * Update work queue header's "head" and "full" conditions to match
2789 * the last entry on the container list. (Note: Only if we're pulling
2790 * entries from the last work queue portion of the list, i.e. not from
2791 * the previous portions that may be the "reapable" list.)
2792 */
2793 if (last_container) {
2794 wq->wq_head = wq->wq_wrid_post->wl_head;
2795 wq->wq_full = wq->wq_wrid_post->wl_full;
2796 }
2797
2798 /* Ensure that we've actually found what we were searching for */
2799 ASSERT(curr != NULL);
2800
2801 return (curr);
2802 }
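/*
 * Illustrative sketch (not compiled into the driver): the inner scan used
 * above -- walk a single container from "head" toward "tail", clearing the
 * "full" flag as soon as anything is consumed, and stop at the first entry
 * whose "wqeaddrsz" matches the value taken from the CQE.  The flat
 * wqeaddrsz array and the return convention (-1 for "no match in this
 * container") are simplifications for this sketch.
 */
#if 0
static int
sketch_scan_container(uint32_t *wqeaddrsz_arr, uint32_t *headp,
    uint32_t tail, uint32_t size, uint32_t *fullp, uint32_t target)
{
	uint32_t head = *headp;
	int match = -1;

	while ((head != tail) || (*fullp)) {
		*fullp = 0;			/* anything consumed means not full */
		if (wqeaddrsz_arr[head] == target) {
			match = (int)head;
			head = (head + 1) & (size - 1);
			break;
		}
		head = (head + 1) & (size - 1);
	}
	*headp = head;				/* publish the advanced head index */
	return (match);
}
#endif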
2803
2804
2805 /*
2806 * tavor_wrid_find_match_srq()
2807 * Context: Can be called from interrupt or base context.
2808 */
2809 tavor_wrid_entry_t *
2810 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2811 tavor_hw_cqe_t *cqe)
2812 {
2813 tavor_wrid_entry_t *wre;
2814 uint64_t *wl_wqe;
2815 uint32_t wqe_index;
2816 uint64_t wqe_addr;
2817 uint32_t cqe_wqe_addr;
2818
2819 /* Grab the WQE addr out of the CQE */
2820 cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
2821
2822 /*
2823 * Use the WQE addr as the lower 32 bits, adding back on the
2824 * 'wl_srq_desc_off' because we have a zero-based queue. Then OR'ing
2825 * on the upper 32 bits of 'wl_srq_wq_buf' gives us the WQE addr in
2826 * the SRQ Work Queue itself. We use this address to find
2827 * out which Work Queue Entry this CQE corresponds with.
2828 *
2829 * We also use this address below to add the WQE back on to the free
2830 * list.
2831 */
2832 wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
2833 (cqe_wqe_addr + wl->wl_srq_desc_off);
2834
2835 /*
2836 * Given the 'wqe_addr' just calculated and the srq buf address, we
2837 * find the 'wqe_index'. The 'wre' returned below contains the WRID
2838 * that we are looking for. This indexes into the wre_list for this
2839 * specific WQE.
2840 */
2841 wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
2842 wl->wl_srq_log_wqesz);
2843
2844 /* ASSERT on impossible wqe_index values */
2845 ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
2846
2847 /* Get the pointer to this WQE */
2848 wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
2849
2850 /* Put this WQE index back on the free list */
2851 ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
2852 wl->wl_free_list_indx = wqe_index;
2853
2854 /* Using the index, return the Work Request ID Entry (wre) */
2855 wre = &wl->wl_wre[wqe_index];
2856
2857 return (wre);
2858 }
2859
2860
2861 /*
2862 * tavor_wrid_cq_reap()
2863 * Context: Can be called from interrupt or base context.
2864 */
2865 void
2866 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
2867 {
2868 tavor_workq_hdr_t *consume_wqhdr;
2869 tavor_wrid_list_hdr_t *container, *to_free;
2870
2871 ASSERT(MUTEX_HELD(&cq->cq_lock));
2872
2873 /* Lock the list of work queues associated with this CQ */
2874 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2875
2876 /* Walk the "reapable" list and free up containers */
2877 container = cq->cq_wrid_reap_head;
2878 while (container != NULL) {
2879 to_free = container;
2880 container = container->wl_reap_next;
2881 /*
2882 * If reaping the WRID list containers pulls the last
2883 * container from the given work queue header, then we free
2884 * the work queue header as well.
2885 */
2886 consume_wqhdr = tavor_wrid_list_reap(to_free);
2887 if (consume_wqhdr != NULL) {
2888 tavor_cq_wqhdr_remove(cq, consume_wqhdr);
2889 }
2890 }
2891
2892 /* Once finished reaping, we reset the CQ's reap list */
2893 cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
2894
2895 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2896 }
2897
2898
2899 /*
2900 * tavor_wrid_cq_force_reap()
2901 * Context: Can be called from interrupt or base context.
2902 */
2903 void
2904 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
2905 {
2906 tavor_workq_hdr_t *curr;
2907 tavor_wrid_list_hdr_t *container, *to_free;
2908 avl_tree_t *treep;
2909 void *cookie = NULL;
2910
2911 ASSERT(MUTEX_HELD(&cq->cq_lock));
2912
2913 /*
2914 * The first step is to walk the "reapable" list and free up those
2915 * containers. This is necessary because the containers on the
2916 * reapable list are not otherwise connected to the work queue headers
2917 * anymore.
2918 */
2919 tavor_wrid_cq_reap(cq);
2920
2921 /* Now lock the list of work queues associated with this CQ */
2922 mutex_enter(&cq->cq_wrid_wqhdr_lock);
2923
2924 /*
2925 * Walk the list of work queue headers and free up all the WRID list
2926 * containers chained to it. Note: We don't need to grab the locks
2927 * for each of the individual WRID lists here because the only way
2928 * things could be added or removed from the list at this point would
2929 * be by posting a work request to a QP. But if we've come this far,
2930 * then we can be assured that there are no longer any QPs associated
2931 * with the CQ that we are trying to free.
2932 */
2933 #ifdef __lock_lint
2934 tavor_wrid_wqhdr_compare(NULL, NULL);
2935 #endif
2936 treep = &cq->cq_wrid_wqhdr_avl_tree;
2937 while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
2938 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
2939 container = curr->wq_wrid_poll;
2940 while (container != NULL) {
2941 to_free = container;
2942 container = container->wl_next;
2943 /*
2944 * If reaping the WRID list containers pulls the last
2945 * container from the given work queue header, then
2946 * we free the work queue header as well. Note: we
2947 * ignore the return value because we know that the
2948 * work queue header should always be freed once the
2949 * list of containers has come to an end.
2950 */
2951 (void) tavor_wrid_list_reap(to_free);
2952 if (container == NULL) {
2953 tavor_cq_wqhdr_remove(cq, curr);
2954 }
2955 }
2956 }
2957 avl_destroy(treep);
2958
2959 mutex_exit(&cq->cq_wrid_wqhdr_lock);
2960 }
2961
2962
2963 /*
2964 * tavor_wrid_get_list()
2965 * Context: Can be called from interrupt or base context.
2966 */
2967 tavor_wrid_list_hdr_t *
2968 tavor_wrid_get_list(uint32_t qsize)
2969 {
2970 tavor_wrid_list_hdr_t *wridlist;
2971 uint32_t size;
2972
2973 /*
2974 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
2975 * which holds the pointers necessary for maintaining the "reapable"
2976 * list, chaining together multiple "containers" old and new, and
2977 * tracking the head, tail, size, etc. for each container.
2978 *
2979 * The "container" also holds all the tavor_wrid_entry_t's, which is
2980 * allocated separately, one for each entry on the corresponding work
2981 * queue.
2982 */
2983 size = sizeof (tavor_wrid_list_hdr_t);
2984
2985 /*
2986 * Note that this allocation has to be a NOSLEEP operation here
2987 * because we are holding the "wqhdr_list_lock" and, therefore,
2988 * could get raised to the interrupt level.
2989 */
2990 wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
2991 if (wridlist == NULL) {
2992 return (NULL);
2993 }
2994 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
2995
2996 /* Complete the "container" initialization */
2997 wridlist->wl_size = qsize;
2998 wridlist->wl_full = 0;
2999 wridlist->wl_head = 0;
3000 wridlist->wl_tail = 0;
3001 wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3002 sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3003 if (wridlist->wl_wre == NULL) {
3004 kmem_free(wridlist, size);
3005 return (NULL);
3006 }
3007 wridlist->wl_wre_old_tail = NULL;
3008 wridlist->wl_reap_next = NULL;
3009 wridlist->wl_next = NULL;
3010 wridlist->wl_prev = NULL;
3011 wridlist->wl_srq_en = 0;
3012
3013 return (wridlist);
3014 }
3015
3016 /*
3017 * tavor_wrid_list_srq_init()
3018 * Context: Can be called from interrupt or base context
3019 */
3020 void
3021 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3022 uint_t wq_start)
3023 {
3024 uint64_t *wl_wqe;
3025 int wqe_index;
3026
3027 ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3028
3029 /* Setup pointers for use later when we are polling the CQ */
3030 wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3031 wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3032 wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3033 wridlist->wl_srq_desc_off = srq->srq_desc_off;
3034 wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3035
3036 /* Sanity check the wq_start index at which buffer initialization begins */
3037 ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz);
3038
3039 /*
3040 * Initialize wridlist free list
3041 *
3042 * For each WQE up to the size of our queue, we store an index in the
3043 * WQE memory itself, representing the next available free entry. The
3044 * 'wl_free_list_indx' always holds the index of the next available
3045 * free entry in the WQ. If 'wl_free_list_indx' is -1, then we are
3046 * completely full. This gives us the advantage of being able to have
3047 * entries complete or be polled off the WQ out-of-order.
3048 *
3049 * For now, we write the free_list entries inside the WQ itself. It
3050 * may be useful in the future to store this information in a separate
3051 * structure for debugging purposes.
3052 */
3053 for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3054 wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3055 ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3056 wridlist->wl_free_list_indx);
3057 wridlist->wl_free_list_indx = wqe_index;
3058 }
3059 }
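/*
 * Illustrative sketch (not compiled into the driver): the intrusive SRQ
 * free list built above, with a plain uint32_t array standing in for the
 * first word of each WQE.  Initialization pushes every index onto the list,
 * posting a WQE pops the head (as in tavor_wrid_add_entry_srq), and a
 * completion pushes the index back (as in tavor_wrid_find_match_srq).  An
 * index of (uint32_t)-1 means the list is empty.
 */
#if 0
static void
sketch_srq_freelist_init(uint32_t *wq, uint32_t bufsz, uint32_t *free_indx)
{
	uint32_t i;

	*free_indx = (uint32_t)-1;		/* start with an empty list */
	for (i = 0; i < bufsz; i++) {
		wq[i] = *free_indx;		/* store the previous head in the WQE */
		*free_indx = i;			/* this entry is the new head */
	}
}

static uint32_t
sketch_srq_freelist_pop(uint32_t *wq, uint32_t *free_indx)
{
	uint32_t indx = *free_indx;

	*free_indx = wq[indx];			/* next free entry follows the head */
	return (indx);
}

static void
sketch_srq_freelist_push(uint32_t *wq, uint32_t *free_indx, uint32_t indx)
{
	wq[indx] = *free_indx;			/* old head chained behind this entry */
	*free_indx = indx;
}
#endif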
3060
3061
3062 /*
3063 * tavor_wrid_reaplist_add()
3064 * Context: Can be called from interrupt or base context.
3065 */
3066 static void
3067 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3068 {
3069 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3070
3071 mutex_enter(&wq->wq_wrid_wql->wql_lock);
3072
3073 /*
3074 * Add the "post" container (the last one on the current chain) to
3075 * the CQ's "reapable" list
3076 */
3077 if ((cq->cq_wrid_reap_head == NULL) &&
3078 (cq->cq_wrid_reap_tail == NULL)) {
3079 cq->cq_wrid_reap_head = wq->wq_wrid_post;
3080 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3081 } else {
3082 cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3083 cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3084 }
3085
3086 mutex_exit(&wq->wq_wrid_wql->wql_lock);
3087 }
3088
3089
3090 int
3091 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3092 {
3093 tavor_workq_compare_t *cmpp;
3094 tavor_workq_hdr_t *curr;
3095
3096 cmpp = (tavor_workq_compare_t *)p1;
3097 curr = (tavor_workq_hdr_t *)p2;
3098
3099 if (cmpp->cmp_qpn < curr->wq_qpn)
3100 return (-1);
3101 else if (cmpp->cmp_qpn > curr->wq_qpn)
3102 return (+1);
3103 else if (cmpp->cmp_type < curr->wq_type)
3104 return (-1);
3105 else if (cmpp->cmp_type > curr->wq_type)
3106 return (+1);
3107 else
3108 return (0);
3109 }
3110
3111
3112 /*
3113 * tavor_wrid_wqhdr_find()
3114 * Context: Can be called from interrupt or base context.
3115 */
3116 static tavor_workq_hdr_t *
3117 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3118 {
3119 tavor_workq_hdr_t *curr;
3120 tavor_workq_compare_t cmp;
3121
3122 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3123
3124 /*
3125 * Search the CQ's work queue header AVL tree for a send or recv
3126 * queue with the same QP number and type. If no matching header is
3127 * found, the caller may create a new one.
3129 */
3130 cmp.cmp_qpn = qpn;
3131 cmp.cmp_type = wq_type;
3132 #ifdef __lock_lint
3133 tavor_wrid_wqhdr_compare(NULL, NULL);
3134 #endif
3135 curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3136
3137 return (curr);
3138 }
3139
3140
3141 /*
3142 * tavor_wrid_wqhdr_create()
3143 * Context: Can be called from interrupt or base context.
3144 */
3145 static tavor_workq_hdr_t *
3146 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3147 uint_t wq_type, uint_t create_wql)
3148 {
3149 tavor_workq_hdr_t *wqhdr_tmp;
3150
3151 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3152
3153 /*
3154 * Allocate space for a work queue header structure and initialize it.
3155 * Each work queue header structure includes a "wq_wrid_wql"
3156 * which needs to be initialized. Note that this allocation has to be
3157 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3158 * and, therefore, could get raised to the interrupt level.
3159 */
3160 wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3161 sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3162 if (wqhdr_tmp == NULL) {
3163 return (NULL);
3164 }
3165 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3166 wqhdr_tmp->wq_qpn = qpn;
3167 wqhdr_tmp->wq_type = wq_type;
3168
3169 if (create_wql) {
3170 wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3171 if (wqhdr_tmp->wq_wrid_wql == NULL) {
3172 kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3173 return (NULL);
3174 }
3175 }
3176
3177 wqhdr_tmp->wq_wrid_poll = NULL;
3178 wqhdr_tmp->wq_wrid_post = NULL;
3179
3180 /* Chain the newly allocated work queue header to the CQ's list */
3181 tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3182
3183 return (wqhdr_tmp);
3184 }
3185
3186
3187 /*
3188 * tavor_wrid_wql_create()
3189 * Context: Can be called from interrupt or base context.
3190 */
3191 tavor_wq_lock_t *
3192 tavor_wrid_wql_create(tavor_state_t *state)
3193 {
3194 tavor_wq_lock_t *wql;
3195
3196 /*
3197 * Allocate the WQL and initialize it.
3198 */
3199 wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3200 if (wql == NULL) {
3201 return (NULL);
3202 }
3203
3204 mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3205 DDI_INTR_PRI(state->ts_intrmsi_pri));
3206
3207 /* Add refcount to WQL */
3208 tavor_wql_refcnt_inc(wql);
3209
3210 return (wql);
3211 }
3212
3213
3214 /*
3215 * tavor_wrid_get_wqeaddrsz()
3216 * Context: Can be called from interrupt or base context.
3217 */
3218 static uint32_t
3219 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3220 {
3221 tavor_wrid_entry_t *wre;
3222 uint32_t wqeaddrsz;
3223 uint32_t head;
3224
3225 /*
3226 * If the container is empty, then there is no next entry. So just
3227 * return zero. Note: the "head == tail" condition here can only
3228 * mean that the container is empty because we have previously pulled
3229 * something from the container.
3230 *
3231 * If the container is not empty, then find the next entry and return
3232 * the contents of its "wqeaddrsz" field.
3233 */
3234 if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3235 wqeaddrsz = 0;
3236 } else {
3237 /*
3238 * We don't need to calculate the "next" head pointer here
3239 * because "head" should already point to the next entry on
3240 * the list (since we just pulled something off - in
3241 * tavor_wrid_find_match() - and moved the head index forward.)
3242 */
3243 head = wq->wq_wrid_poll->wl_head;
3244 wre = &wq->wq_wrid_poll->wl_wre[head];
3245 wqeaddrsz = wre->wr_wqeaddrsz;
3246 }
3247 return (wqeaddrsz);
3248 }
3249
3250
3251 /*
3252 * tavor_wrid_wqhdr_add()
3253 * Context: Can be called from interrupt or base context.
3254 */
3255 static void
3256 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3257 tavor_wrid_list_hdr_t *wridlist)
3258 {
3259 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3260
3261 /* Chain the new WRID list "container" to the work queue list */
3262 if ((wqhdr->wq_wrid_post == NULL) &&
3263 (wqhdr->wq_wrid_poll == NULL)) {
3264 wqhdr->wq_wrid_poll = wridlist;
3265 wqhdr->wq_wrid_post = wridlist;
3266 } else {
3267 wqhdr->wq_wrid_post->wl_next = wridlist;
3268 wridlist->wl_prev = wqhdr->wq_wrid_post;
3269 wqhdr->wq_wrid_post = wridlist;
3270 }
3271 }
3272
3273
3274 /*
3275 * tavor_wrid_wqhdr_remove()
3276 * Context: Can be called from interrupt or base context.
3277 *
3278 * Note: this is only called to remove the most recently added WRID list
3279 * container (i.e. in tavor_from_reset() above)
3280 */
3281 static void
3282 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3283 tavor_wrid_list_hdr_t *wridlist)
3284 {
3285 tavor_wrid_list_hdr_t *prev, *next;
3286
3287 ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3288
3289 /* Unlink the WRID list "container" from the work queue list */
3290 prev = wridlist->wl_prev;
3291 next = wridlist->wl_next;
3292 if (prev != NULL) {
3293 prev->wl_next = next;
3294 }
3295 if (next != NULL) {
3296 next->wl_prev = prev;
3297 }
3298
3299 /*
3300 * Update any pointers in the work queue hdr that may point to this
3301 * WRID list container
3302 */
3303 if (wqhdr->wq_wrid_post == wridlist) {
3304 wqhdr->wq_wrid_post = prev;
3305 }
3306 if (wqhdr->wq_wrid_poll == wridlist) {
3307 wqhdr->wq_wrid_poll = NULL;
3308 }
3309 }
3310
3311
3312 /*
3313 * tavor_wrid_list_reap()
3314 * Context: Can be called from interrupt or base context.
3315 * Note: The "wqhdr_list_lock" must be held.
3316 */
3317 static tavor_workq_hdr_t *
3318 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3319 {
3320 tavor_workq_hdr_t *wqhdr, *consume_wqhdr = NULL;
3321 tavor_wrid_list_hdr_t *prev, *next;
3322 uint32_t size;
3323
3324 /* Get the back pointer to the work queue header (see below) */
3325 wqhdr = wridlist->wl_wqhdr;
3326 mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3327
3328 /* Unlink the WRID list "container" from the work queue list */
3329 prev = wridlist->wl_prev;
3330 next = wridlist->wl_next;
3331 if (prev != NULL) {
3332 prev->wl_next = next;
3333 }
3334 if (next != NULL) {
3335 next->wl_prev = prev;
3336 }
3337
3338 /*
3339 * If the back pointer to the work queue header shows that it
3340 * was pointing to the entry we are about to remove, then the work
3341 * queue header is reapable as well.
3342 */
3343 if ((wqhdr->wq_wrid_poll == wridlist) &&
3344 (wqhdr->wq_wrid_post == wridlist)) {
3345 consume_wqhdr = wqhdr;
3346 }
3347
3348 /* Be sure to update the "poll" and "post" container pointers */
3349 if (wqhdr->wq_wrid_poll == wridlist) {
3350 wqhdr->wq_wrid_poll = next;
3351 }
3352 if (wqhdr->wq_wrid_post == wridlist) {
3353 wqhdr->wq_wrid_post = NULL;
3354 }
3355
3356 /* Calculate the size and free the container */
3357 size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3358 kmem_free(wridlist->wl_wre, size);
3359 kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3360
3361 mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3362
3363 return (consume_wqhdr);
3364 }
3365
3366
3367 /*
3368 * tavor_wrid_wqhdr_lock_both()
3369 * Context: Can be called from interrupt or base context.
3370 */
3371 static void
3372 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3373 {
3374 tavor_cqhdl_t sq_cq, rq_cq;
3375
3376 sq_cq = qp->qp_sq_cqhdl;
3377 rq_cq = qp->qp_rq_cqhdl;
3378
3379 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3380 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3381
3382 /*
3383 * If both work queues (send and recv) share a completion queue, then
3384 * grab the common lock. If they use different CQs (hence different
3385  * "cq_wrid_wqhdr_lock" mutexes), then grab the send one first, then the
3386  * receive.  This ordering is followed consistently here and in
3387  * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any
3388  * deadlock.  Note: We add the "__lock_lint" code here
3389 * to fake out warlock into thinking we've grabbed both locks (when,
3390 * in fact, we only needed the one).
3391 */
3392 if (sq_cq == rq_cq) {
3393 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3394 #ifdef __lock_lint
3395 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3396 #endif
3397 } else {
3398 mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3399 mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3400 }
3401 }
3402
3403 /*
3404 * tavor_wrid_wqhdr_unlock_both()
3405 * Context: Can be called from interrupt or base context.
3406 */
3407 static void
3408 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3409 {
3410 tavor_cqhdl_t sq_cq, rq_cq;
3411
3412 sq_cq = qp->qp_sq_cqhdl;
3413 rq_cq = qp->qp_rq_cqhdl;
3414
3415 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3416 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3417
3418 /*
3419 * See tavor_wrid_wqhdr_lock_both() above for more detail
3420 */
3421 if (sq_cq == rq_cq) {
3422 #ifdef __lock_lint
3423 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3424 #endif
3425 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3426 } else {
3427 mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3428 mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3429 }
3430 }
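
/*
 * The two routines above are meant to bracket any operation that must
 * examine or modify both the send and receive work queue header lists
 * of a QP atomically (for example, when WRID tracking state is set up
 * or torn down as a QP moves into or out of the Reset state).  A
 * minimal usage sketch, with the protected work left as a placeholder:
 *
 *	tavor_wrid_wqhdr_lock_both(qp);
 *
 *	(... find/create/remove the SQ and RQ work queue headers on
 *	 qp->qp_sq_cqhdl and qp->qp_rq_cqhdl ...)
 *
 *	tavor_wrid_wqhdr_unlock_both(qp);
 */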
3431
3432
3433 /*
3434 * tavor_cq_wqhdr_add()
3435 * Context: Can be called from interrupt or base context.
3436 */
3437 static void
3438 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3439 {
3440 tavor_workq_compare_t cmp;
3441 avl_index_t where;
3442
3443 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3444
3445 cmp.cmp_qpn = wqhdr->wq_qpn;
3446 cmp.cmp_type = wqhdr->wq_type;
3447 #ifdef __lock_lint
3448 tavor_wrid_wqhdr_compare(NULL, NULL);
3449 #endif
3450 (void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3451 /*
3452 	 * Insert the work queue header into the CQ's AVL tree of work
3453 	 * queue headers at the location determined by the lookup above.
3454 */
3455 avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3456 }
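
/*
 * The avl_find()/avl_insert() pair above relies on the AVL tree's
 * comparison routine (referenced as tavor_wrid_wqhdr_compare() in the
 * lock_lint stubs) ordering work queue headers first by QP number and
 * then by work queue type, using tavor_workq_compare_t as the lookup
 * key.  The actual comparator lives elsewhere in this file; a sketch
 * of the ordering it is expected to implement might look like:
 *
 *	static int
 *	tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
 *	{
 *		const tavor_workq_compare_t	*cmpp = p1;
 *		const tavor_workq_hdr_t		*curr = p2;
 *
 *		if (cmpp->cmp_qpn < curr->wq_qpn)
 *			return (-1);
 *		else if (cmpp->cmp_qpn > curr->wq_qpn)
 *			return (+1);
 *		else if (cmpp->cmp_type < curr->wq_type)
 *			return (-1);
 *		else if (cmpp->cmp_type > curr->wq_type)
 *			return (+1);
 *		else
 *			return (0);
 *	}
 */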
3457
3458
3459 /*
3460 * tavor_cq_wqhdr_remove()
3461 * Context: Can be called from interrupt or base context.
3462 */
3463 static void
3464 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3465 {
3466 ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3467
3468 #ifdef __lock_lint
3469 tavor_wrid_wqhdr_compare(NULL, NULL);
3470 #endif
3471 /* Remove "wqhdr" from the work queue header list on "cq" */
3472 avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3473
3474 /*
3475 * Release reference to WQL; If this is the last reference, this call
3476 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3477 */
3478 tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3479
3480 /* Free the memory associated with "wqhdr" */
3481 kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3482 }
3483
3484
3485 /*
3486 * tavor_wql_refcnt_inc()
3487 * Context: Can be called from interrupt or base context
3488 */
3489 void
3490 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3491 {
3492 ASSERT(wql != NULL);
3493
3494 mutex_enter(&wql->wql_lock);
3495 wql->wql_refcnt++;
3496 mutex_exit(&wql->wql_lock);
3497 }
3498
3499 /*
3500 * tavor_wql_refcnt_dec()
3501 * Context: Can be called from interrupt or base context
3502 */
3503 void
3504 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3505 {
3506 int refcnt;
3507
3508 ASSERT(wql != NULL);
3509
3510 mutex_enter(&wql->wql_lock);
3511 wql->wql_refcnt--;
3512 refcnt = wql->wql_refcnt;
3513 mutex_exit(&wql->wql_lock);
3514
3515 /*
3516 	 * Free up the WQL memory if we are the last one associated with
3517 	 * this structure, i.e. if the reference count has just dropped
3518 	 * to zero.
3519 */
3520 if (refcnt == 0) {
3521 mutex_destroy(&wql->wql_lock);
3522 kmem_free(wql, sizeof (tavor_wq_lock_t));
3523 }
3524 }
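
/*
 * The two routines above implement simple reference counting on the
 * tavor_wq_lock_t that one or more work queue headers may share (see
 * the "create_wql" argument to tavor_wrid_wqhdr_create()).  A hedged
 * sketch of the intended pairing, with the surrounding setup and
 * teardown elided ("existing_wqhdr" and "new_wqhdr" are illustrative
 * names only):
 *
 *	(setup: a second work queue header reuses an existing WQL)
 *	new_wqhdr->wq_wrid_wql = existing_wqhdr->wq_wrid_wql;
 *	tavor_wql_refcnt_inc(new_wqhdr->wq_wrid_wql);
 *
 *	(teardown: each header drops its reference; the final decrement
 *	 destroys the mutex and frees the structure)
 *	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
 *
 * Note that tavor_cq_wqhdr_remove() above already performs this
 * decrement on behalf of the work queue header it frees.
 */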
3525