xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/tavor/tavor_wr.c (revision fcdb3229a31dd4ff700c69238814e326aad49098)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_wr.c
29  *    Tavor Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Tavor WRID tracking mechanism.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/tavor/tavor.h>
44 
45 static void tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda,
46     uint32_t nds, uint32_t qpn, uint32_t fence, uint32_t nopcode);
47 static void tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda,
48     uint32_t nds, uint32_t qpn, uint32_t credits);
49 static uint32_t tavor_wr_get_immediate(ibt_send_wr_t *wr);
50 static int tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr);
51 static int tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
52     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
53 static void tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr,
54     ibt_send_wr_t *prev_wr, uint64_t *curr_desc, uint_t curr_descsz,
55     uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp);
56 static int tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
57     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
58 static void tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
59     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
60     tavor_qphdl_t qp);
61 static int tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
62     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size);
63 static void tavor_wqe_recv_linknext(uint64_t *desc, uint_t desc_sz,
64     uint64_t *prev, tavor_qphdl_t qp);
65 static int tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
66     ibt_recv_wr_t *wr, uint64_t *desc);
67 static void tavor_wqe_srq_linknext(uint64_t *desc, uint64_t *prev,
68     tavor_srqhdl_t srq);
69 static void tavor_wqe_sync(void *hdl, uint_t sync_from,
70     uint_t sync_to, uint_t sync_type, uint_t flag);
71 static tavor_wrid_entry_t *tavor_wrid_find_match(tavor_workq_hdr_t *wq,
72     tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe);
73 static void tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq);
74 static tavor_workq_hdr_t *tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn,
75     uint_t send_or_recv);
76 static tavor_workq_hdr_t *tavor_wrid_wqhdr_create(tavor_state_t *state,
77     tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type, uint_t create_wql);
78 static uint32_t tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq);
79 static void tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
80     tavor_wrid_list_hdr_t *wrid_list);
81 static void tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
82     tavor_wrid_list_hdr_t *wrid_list);
83 static tavor_workq_hdr_t *tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wq);
84 static void tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp);
85 static void tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp);
86 static void tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
87 static void tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr);
88 
89 /*
90  * tavor_post_send()
91  *    Context: Can be called from interrupt or base context.
92  */
93 int
tavor_post_send(tavor_state_t * state,tavor_qphdl_t qp,ibt_send_wr_t * wr,uint_t num_wr,uint_t * num_posted)94 tavor_post_send(tavor_state_t *state, tavor_qphdl_t qp,
95     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
96 {
97 	tavor_sw_wqe_dbinfo_t		dbinfo;
98 	tavor_wrid_list_hdr_t		*wridlist;
99 	tavor_wrid_entry_t		*wre_last;
100 	uint64_t			*desc, *prev, *first;
101 	uint32_t			desc_sz, first_sz;
102 	uint32_t			wqeaddrsz, signaled_dbd;
103 	uint32_t			head, tail, next_tail, qsize_msk;
104 	uint32_t			sync_from, sync_to;
105 	uint_t				currindx, wrindx, numremain;
106 	uint_t				chainlen, chainbegin, posted_cnt;
107 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
108 	int				status;
109 
110 	/*
111 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
112 	 * clients to post to QP memory that is accessible directly by the
113 	 * user.  If the QP memory is user accessible, then return an error.
114 	 */
115 	if (qp->qp_is_umap) {
116 		return (IBT_QP_HDL_INVALID);
117 	}
118 
119 	/* Initialize posted_cnt */
120 	posted_cnt = 0;
121 
122 	mutex_enter(&qp->qp_lock);
123 
124 	/*
125 	 * Check QP state.  Can not post Send requests from the "Reset",
126 	 * "Init", or "RTR" states
127 	 */
128 	if ((qp->qp_state == TAVOR_QP_RESET) ||
129 	    (qp->qp_state == TAVOR_QP_INIT) ||
130 	    (qp->qp_state == TAVOR_QP_RTR)) {
131 		mutex_exit(&qp->qp_lock);
132 		return (IBT_QP_STATE_INVALID);
133 	}
134 
135 	/* Grab the lock for the WRID list */
136 	mutex_enter(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
137 	wridlist  = qp->qp_sq_wqhdr->wq_wrid_post;
138 
139 	/* Save away some initial QP state */
140 	qsize_msk = qp->qp_sq_wqhdr->wq_size - 1;
141 	tail	  = qp->qp_sq_wqhdr->wq_tail;
142 	head	  = qp->qp_sq_wqhdr->wq_head;
143 
144 	/*
145 	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
146 	 * request and build a Send WQE.  Note:  Because we are potentially
147 	 * building a chain of WQEs, we want to link them all together.
148 	 * However, we do not want to link the first one to the previous
149 	 * WQE until the entire chain has been linked.  Then in the last
150 	 * step we ring the appropriate doorbell.  Note:  It is possible for
151 	 * more Work Requests to be posted than the HW will support at one
152 	 * shot.  If this happens, we need to be able to post and ring
153 	 * several chains here until the the entire request is complete.
154 	 */
155 	wrindx = 0;
156 	numremain = num_wr;
157 	status	  = DDI_SUCCESS;
158 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
159 		/*
160 		 * For the first WQE on a new chain we need "prev" to point
161 		 * to the current descriptor.  As we begin to process
162 		 * further, "prev" will be updated to point to the previous
163 		 * WQE on the current chain (see below).
164 		 */
165 		prev = TAVOR_QP_SQ_ENTRY(qp, tail);
166 
167 		/*
168 		 * Before we begin, save the current "tail index" for later
169 		 * DMA sync
170 		 */
171 		sync_from = tail;
172 
173 		/*
174 		 * Break the request up into chains that are less than or
175 		 * equal to the maximum number of WQEs that can be posted
176 		 * per doorbell ring
177 		 */
178 		chainlen   = (numremain > maxdb) ? maxdb : numremain;
179 		numremain -= chainlen;
180 		chainbegin = wrindx;
181 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
182 			/*
183 			 * Check for "queue full" condition.  If the queue
184 			 * is already full, then no more WQEs can be posted.
185 			 * So break out, ring a doorbell (if necessary) and
186 			 * return an error
187 			 */
188 			if (qp->qp_sq_wqhdr->wq_full != 0) {
189 				status = IBT_QP_FULL;
190 				break;
191 			}
192 
193 			/*
194 			 * Increment the "tail index" and check for "queue
195 			 * full" condition.  If we detect that the current
196 			 * work request is going to fill the work queue, then
197 			 * we mark this condition and continue.
198 			 */
199 			next_tail = (tail + 1) & qsize_msk;
200 			if (next_tail == head) {
201 				qp->qp_sq_wqhdr->wq_full = 1;
202 			}
203 
204 			/*
205 			 * Get the address of the location where the next
206 			 * Send WQE should be built
207 			 */
208 			desc = TAVOR_QP_SQ_ENTRY(qp, tail);
209 
210 			/*
211 			 * Call tavor_wqe_send_build() to build the WQE
212 			 * at the given address.  This routine uses the
213 			 * information in the ibt_send_wr_t list (wr[]) and
214 			 * returns the size of the WQE when it returns.
215 			 */
216 			status = tavor_wqe_send_build(state, qp,
217 			    &wr[wrindx], desc, &desc_sz);
218 			if (status != DDI_SUCCESS) {
219 				break;
220 			}
221 
222 			/*
223 			 * Add a WRID entry to the WRID list.  Need to
224 			 * calculate the "wqeaddrsz" and "signaled_dbd"
225 			 * values to pass to tavor_wrid_add_entry()
226 			 */
227 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
228 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
229 			    desc_sz);
230 			if ((qp->qp_sq_sigtype == TAVOR_QP_SQ_ALL_SIGNALED) ||
231 			    (wr[wrindx].wr_flags & IBT_WR_SEND_SIGNAL)) {
232 				signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
233 			} else {
234 				signaled_dbd = 0;
235 			}
236 			tavor_wrid_add_entry(qp->qp_sq_wqhdr,
237 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
238 
239 			/*
240 			 * If this is not the first descriptor on the current
241 			 * chain, then link it to the previous WQE.  Otherwise,
242 			 * save the address and size of this descriptor (in
243 			 * "first" and "first_sz" respectively) and continue.
244 			 * Note: Linking a WQE to the the previous one will
245 			 * depend on whether the two WQEs are from "special
246 			 * QPs" (i.e. MLX transport WQEs) or whether they are
247 			 * normal Send WQEs.
248 			 */
249 			if (currindx != 0) {
250 				if (qp->qp_is_special) {
251 					tavor_wqe_mlx_linknext(&wr[wrindx - 1],
252 					    desc, desc_sz, prev, NULL, qp);
253 				} else {
254 					tavor_wqe_send_linknext(&wr[wrindx],
255 					    &wr[wrindx - 1], desc, desc_sz,
256 					    prev, NULL, qp);
257 				}
258 				prev = desc;
259 			} else {
260 				first	 = desc;
261 				first_sz = desc_sz;
262 			}
263 
264 			/*
265 			 * Update the current "tail index" and increment
266 			 * "posted_cnt"
267 			 */
268 			tail = next_tail;
269 			posted_cnt++;
270 		}
271 
272 		/*
273 		 * If we reach here and there are one or more WQEs which have
274 		 * been successfully chained together, then we need to link
275 		 * the current chain to the previously executing chain of
276 		 * descriptor (if there is one) and ring the doorbell for the
277 		 * send work queue.
278 		 */
279 		if (currindx != 0) {
280 			/*
281 			 * Before we link the chain, we need to ensure that the
282 			 * "next" field on the last WQE is set to NULL (to
283 			 * indicate the end of the chain).  Note: Just as it
284 			 * did above, the format for the "next" fields in a
285 			 * given WQE depend on whether the WQE is MLX
286 			 * transport or not.
287 			 */
288 			if (qp->qp_is_special) {
289 				tavor_wqe_mlx_linknext(&wr[chainbegin +
290 				    currindx - 1], NULL, 0, prev, NULL, qp);
291 			} else {
292 				tavor_wqe_send_linknext(NULL,
293 				    &wr[chainbegin + currindx - 1], NULL, 0,
294 				    prev, NULL, qp);
295 			}
296 
297 			/* Save away updated "tail index" for the DMA sync */
298 			sync_to = tail;
299 
300 			/* Do a DMA sync for current send WQE(s) */
301 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_SEND,
302 			    DDI_DMA_SYNC_FORDEV);
303 
304 			/*
305 			 * Now link the chain to the old chain (if there was
306 			 * one.  Note: still need to pay attention to whether
307 			 * the QP used MLX transport WQEs or not.
308 			 */
309 			if (qp->qp_is_special) {
310 				tavor_wqe_mlx_linknext(NULL, first, first_sz,
311 				    qp->qp_sq_lastwqeaddr, &dbinfo, qp);
312 			} else {
313 				tavor_wqe_send_linknext(&wr[chainbegin], NULL,
314 				    first, first_sz, qp->qp_sq_lastwqeaddr,
315 				    &dbinfo, qp);
316 			}
317 
318 			/*
319 			 * If there was a valid previous WQE (i.e. non-NULL),
320 			 * then sync it too.  This is because we have updated
321 			 * its "next" fields and we want to ensure that the
322 			 * hardware can see the changes.
323 			 */
324 			if (qp->qp_sq_lastwqeaddr != NULL) {
325 				sync_to   = sync_from;
326 				sync_from = (sync_from - 1) & qsize_msk;
327 				tavor_wqe_sync(qp, sync_from, sync_to,
328 				    TAVOR_WR_SEND, DDI_DMA_SYNC_FORDEV);
329 			}
330 
331 			/*
332 			 * Now if the WRID tail entry is non-NULL, then this
333 			 * represents the entry to which we are chaining the
334 			 * new entries.  Since we are going to ring the
335 			 * doorbell for this WQE, we want set its "dbd" bit.
336 			 *
337 			 * On the other hand, if the tail is NULL, even though
338 			 * we will have rung the doorbell for the previous WQE
339 			 * (for the hardware's sake) it is irrelevant to our
340 			 * purposes (for tracking WRIDs) because we know the
341 			 * request must have already completed.
342 			 */
343 			wre_last = wridlist->wl_wre_old_tail;
344 			if (wre_last != NULL) {
345 				wre_last->wr_signaled_dbd |=
346 				    TAVOR_WRID_ENTRY_DOORBELLED;
347 			}
348 
349 			/* Update some of the state in the QP */
350 			qp->qp_sq_lastwqeaddr	 = desc;
351 			qp->qp_sq_wqhdr->wq_tail = tail;
352 
353 			/* Ring the doorbell */
354 			tavor_qp_send_doorbell(state,
355 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
356 			    first_sz, qp->qp_qpnum, dbinfo.db_fence,
357 			    dbinfo.db_nopcode);
358 		}
359 	}
360 
361 	/*
362 	 * Update the "num_posted" return value (if necessary).  Then drop
363 	 * the locks and return success.
364 	 */
365 	if (num_posted != NULL) {
366 		*num_posted = posted_cnt;
367 	}
368 
369 	mutex_exit(&qp->qp_sq_wqhdr->wq_wrid_wql->wql_lock);
370 	mutex_exit(&qp->qp_lock);
371 
372 	return (status);
373 }
374 
375 
376 /*
377  * tavor_post_recv()
378  *    Context: Can be called from interrupt or base context.
379  */
380 int
tavor_post_recv(tavor_state_t * state,tavor_qphdl_t qp,ibt_recv_wr_t * wr,uint_t num_wr,uint_t * num_posted)381 tavor_post_recv(tavor_state_t *state, tavor_qphdl_t qp,
382     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
383 {
384 	uint64_t			*desc, *prev, *first;
385 	uint32_t			desc_sz, first_sz;
386 	uint32_t			wqeaddrsz, signaled_dbd;
387 	uint32_t			head, tail, next_tail, qsize_msk;
388 	uint32_t			sync_from, sync_to;
389 	uint_t				currindx, wrindx, numremain;
390 	uint_t				chainlen, posted_cnt;
391 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
392 	int				status;
393 
394 	/*
395 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
396 	 * clients to post to QP memory that is accessible directly by the
397 	 * user.  If the QP memory is user accessible, then return an error.
398 	 */
399 	if (qp->qp_is_umap) {
400 		return (IBT_QP_HDL_INVALID);
401 	}
402 
403 	/* Initialize posted_cnt */
404 	posted_cnt = 0;
405 
406 	mutex_enter(&qp->qp_lock);
407 
408 	/*
409 	 * Check if QP is associated with an SRQ
410 	 */
411 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
412 		mutex_exit(&qp->qp_lock);
413 		return (IBT_SRQ_IN_USE);
414 	}
415 
416 	/*
417 	 * Check QP state.  Can not post Recv requests from the "Reset" state
418 	 */
419 	if (qp->qp_state == TAVOR_QP_RESET) {
420 		mutex_exit(&qp->qp_lock);
421 		return (IBT_QP_STATE_INVALID);
422 	}
423 
424 	/* Grab the lock for the WRID list */
425 	mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
426 
427 	/* Save away some initial QP state */
428 	qsize_msk = qp->qp_rq_wqhdr->wq_size - 1;
429 	tail	  = qp->qp_rq_wqhdr->wq_tail;
430 	head	  = qp->qp_rq_wqhdr->wq_head;
431 
432 	/*
433 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
434 	 * request and build a Recv WQE.  Note:  Because we are potentially
435 	 * building a chain of WQEs, we want to link them all together.
436 	 * However, we do not want to link the first one to the previous
437 	 * WQE until the entire chain has been linked.  Then in the last
438 	 * step we ring the appropriate doorbell.  Note:  It is possible for
439 	 * more Work Requests to be posted than the HW will support at one
440 	 * shot.  If this happens, we need to be able to post and ring
441 	 * several chains here until the the entire request is complete.
442 	 */
443 	wrindx = 0;
444 	numremain = num_wr;
445 	status	  = DDI_SUCCESS;
446 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
447 		/*
448 		 * For the first WQE on a new chain we need "prev" to point
449 		 * to the current descriptor.  As we begin to process
450 		 * further, "prev" will be updated to point to the previous
451 		 * WQE on the current chain (see below).
452 		 */
453 		prev = TAVOR_QP_RQ_ENTRY(qp, tail);
454 
455 		/*
456 		 * Before we begin, save the current "tail index" for later
457 		 * DMA sync
458 		 */
459 		sync_from = tail;
460 
461 		/*
462 		 * Break the request up into chains that are less than or
463 		 * equal to the maximum number of WQEs that can be posted
464 		 * per doorbell ring
465 		 */
466 		chainlen = (numremain > maxdb) ? maxdb : numremain;
467 		numremain -= chainlen;
468 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
469 			/*
470 			 * Check for "queue full" condition.  If the queue
471 			 * is already full, then no more WQEs can be posted.
472 			 * So break out, ring a doorbell (if necessary) and
473 			 * return an error
474 			 */
475 			if (qp->qp_rq_wqhdr->wq_full != 0) {
476 				status = IBT_QP_FULL;
477 				break;
478 			}
479 
480 			/*
481 			 * Increment the "tail index" and check for "queue
482 			 * full" condition.  If we detect that the current
483 			 * work request is going to fill the work queue, then
484 			 * we mark this condition and continue.
485 			 */
486 			next_tail = (tail + 1) & qsize_msk;
487 			if (next_tail == head) {
488 				qp->qp_rq_wqhdr->wq_full = 1;
489 			}
490 
491 			/*
492 			 * Get the address of the location where the next
493 			 * Recv WQE should be built
494 			 */
495 			desc = TAVOR_QP_RQ_ENTRY(qp, tail);
496 
497 			/*
498 			 * Call tavor_wqe_recv_build() to build the WQE
499 			 * at the given address.  This routine uses the
500 			 * information in the ibt_recv_wr_t list (wr[]) and
501 			 * returns the size of the WQE when it returns.
502 			 */
503 			status = tavor_wqe_recv_build(state, qp, &wr[wrindx],
504 			    desc, &desc_sz);
505 			if (status != DDI_SUCCESS) {
506 				break;
507 			}
508 
509 			/*
510 			 * Add a WRID entry to the WRID list.  Need to
511 			 * calculate the "wqeaddrsz" and "signaled_dbd"
512 			 * values to pass to tavor_wrid_add_entry().  Note:
513 			 * all Recv WQEs are essentially "signaled" and
514 			 * "doorbelled" (since Tavor HW requires all
515 			 * RecvWQE's to have their "DBD" bits set).
516 			 */
517 			wqeaddrsz = TAVOR_QP_WQEADDRSZ((uint64_t *)(uintptr_t)
518 			    ((uint64_t)(uintptr_t)desc - qp->qp_desc_off),
519 			    desc_sz);
520 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED |
521 			    TAVOR_WRID_ENTRY_DOORBELLED;
522 			tavor_wrid_add_entry(qp->qp_rq_wqhdr,
523 			    wr[wrindx].wr_id, wqeaddrsz, signaled_dbd);
524 
525 			/*
526 			 * If this is not the first descriptor on the current
527 			 * chain, then link it to the previous WQE.  Otherwise,
528 			 * save the address and size of this descriptor (in
529 			 * "first" and "first_sz" respectively) and continue.
530 			 */
531 			if (currindx != 0) {
532 				tavor_wqe_recv_linknext(desc, desc_sz, prev,
533 				    qp);
534 				prev = desc;
535 			} else {
536 				first	 = desc;
537 				first_sz = desc_sz;
538 			}
539 
540 			/*
541 			 * Update the current "tail index" and increment
542 			 * "posted_cnt"
543 			 */
544 			tail = next_tail;
545 			posted_cnt++;
546 		}
547 
548 		/*
549 		 * If we reach here and there are one or more WQEs which have
550 		 * been successfully chained together, then we need to link
551 		 * the current chain to the previously executing chain of
552 		 * descriptor (if there is one) and ring the doorbell for the
553 		 * recv work queue.
554 		 */
555 		if (currindx != 0) {
556 			/*
557 			 * Before we link the chain, we need to ensure that the
558 			 * "next" field on the last WQE is set to NULL (to
559 			 * indicate the end of the chain).
560 			 */
561 			tavor_wqe_recv_linknext(NULL, 0, prev, qp);
562 
563 			/* Save away updated "tail index" for the DMA sync */
564 			sync_to = tail;
565 
566 			/* Do a DMA sync for current recv WQE(s) */
567 			tavor_wqe_sync(qp, sync_from, sync_to, TAVOR_WR_RECV,
568 			    DDI_DMA_SYNC_FORDEV);
569 
570 			/*
571 			 * Now link the chain to the old chain (if there was
572 			 * one.
573 			 */
574 			tavor_wqe_recv_linknext(first, first_sz,
575 			    qp->qp_rq_lastwqeaddr, qp);
576 
577 			/*
578 			 * If there was a valid previous WQE (i.e. non-NULL),
579 			 * then sync it too.  This is because we have updated
580 			 * its "next" fields and we want to ensure that the
581 			 * hardware can see the changes.
582 			 */
583 			if (qp->qp_rq_lastwqeaddr != NULL) {
584 				sync_to	  = sync_from;
585 				sync_from = (sync_from - 1) & qsize_msk;
586 				tavor_wqe_sync(qp, sync_from, sync_to,
587 				    TAVOR_WR_RECV, DDI_DMA_SYNC_FORDEV);
588 			}
589 
590 			/* Update some of the state in the QP */
591 			qp->qp_rq_lastwqeaddr	 = desc;
592 			qp->qp_rq_wqhdr->wq_tail = tail;
593 
594 			/* Ring the doorbell */
595 			tavor_qp_recv_doorbell(state,
596 			    (uint32_t)((uintptr_t)first - qp->qp_desc_off),
597 			    first_sz, qp->qp_qpnum, (chainlen % maxdb));
598 		}
599 	}
600 
601 	/*
602 	 * Update the "num_posted" return value (if necessary).  Then drop
603 	 * the locks and return success.
604 	 */
605 	if (num_posted != NULL) {
606 		*num_posted = posted_cnt;
607 	}
608 
609 	mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
610 	mutex_exit(&qp->qp_lock);
611 
612 	return (status);
613 }
614 
615 /*
616  * tavor_post_srq()
617  *    Context: Can be called from interrupt or base context.
618  */
619 int
tavor_post_srq(tavor_state_t * state,tavor_srqhdl_t srq,ibt_recv_wr_t * wr,uint_t num_wr,uint_t * num_posted)620 tavor_post_srq(tavor_state_t *state, tavor_srqhdl_t srq,
621     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
622 {
623 	uint64_t			*desc, *prev, *first, *last_wqe_addr;
624 	uint32_t			signaled_dbd;
625 	uint32_t			sync_indx;
626 	uint_t				currindx, wrindx, numremain;
627 	uint_t				chainlen, posted_cnt;
628 	uint_t				maxdb = TAVOR_QP_MAXDESC_PER_DB;
629 	int				status;
630 
631 	/*
632 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
633 	 * clients to post to QP memory that is accessible directly by the
634 	 * user.  If the QP memory is user accessible, then return an error.
635 	 */
636 	if (srq->srq_is_umap) {
637 		return (IBT_SRQ_HDL_INVALID);
638 	}
639 
640 	/* Initialize posted_cnt */
641 	posted_cnt = 0;
642 
643 	mutex_enter(&srq->srq_lock);
644 
645 	/*
646 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
647 	 */
648 	if (srq->srq_state == TAVOR_SRQ_STATE_ERROR) {
649 		mutex_exit(&srq->srq_lock);
650 		return (IBT_QP_STATE_INVALID);
651 	}
652 
653 	/* Grab the lock for the WRID list */
654 	mutex_enter(&srq->srq_wrid_wql->wql_lock);
655 
656 	/*
657 	 * For each ibt_recv_wr_t in the wr[] list passed in, parse the
658 	 * request and build a Recv WQE.  Note:  Because we are potentially
659 	 * building a chain of WQEs, we want to link them all together.
660 	 * However, we do not want to link the first one to the previous
661 	 * WQE until the entire chain has been linked.  Then in the last
662 	 * step we ring the appropriate doorbell.  Note:  It is possible for
663 	 * more Work Requests to be posted than the HW will support at one
664 	 * shot.  If this happens, we need to be able to post and ring
665 	 * several chains here until the the entire request is complete.
666 	 */
667 	wrindx = 0;
668 	numremain = num_wr;
669 	status	  = DDI_SUCCESS;
670 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
671 		/*
672 		 * For the first WQE on a new chain we need "prev" to point
673 		 * to the current descriptor.  As we begin to process
674 		 * further, "prev" will be updated to point to the previous
675 		 * WQE on the current chain (see below).
676 		 */
677 		if (srq->srq_wq_lastwqeindx == -1) {
678 			prev = NULL;
679 		} else {
680 			prev = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wq_lastwqeindx);
681 		}
682 
683 		/*
684 		 * Break the request up into chains that are less than or
685 		 * equal to the maximum number of WQEs that can be posted
686 		 * per doorbell ring
687 		 */
688 		chainlen = (numremain > maxdb) ? maxdb : numremain;
689 		numremain -= chainlen;
690 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
691 
692 			/*
693 			 * Check for "queue full" condition.  If the queue
694 			 * is already full, then no more WQEs can be posted.
695 			 * So break out, ring a doorbell (if necessary) and
696 			 * return an error
697 			 */
698 			if (srq->srq_wridlist->wl_free_list_indx == -1) {
699 				status = IBT_QP_FULL;
700 				break;
701 			}
702 
703 			/*
704 			 * Get the address of the location where the next
705 			 * Recv WQE should be built
706 			 */
707 			desc = TAVOR_SRQ_WQE_ADDR(srq,
708 			    srq->srq_wridlist->wl_free_list_indx);
709 
710 			/*
711 			 * Add a WRID entry to the WRID list.  Need to
712 			 * set the "signaled_dbd" values to pass to
713 			 * tavor_wrid_add_entry().  Note: all Recv WQEs are
714 			 * essentially "signaled"
715 			 *
716 			 * The 'size' is stored at srq_alloc time, in the
717 			 * srq_wq_stride.  This is a constant value required
718 			 * for SRQ.
719 			 */
720 			signaled_dbd = TAVOR_WRID_ENTRY_SIGNALED;
721 			tavor_wrid_add_entry_srq(srq, wr[wrindx].wr_id,
722 			    signaled_dbd);
723 
724 			/*
725 			 * Call tavor_wqe_srq_build() to build the WQE
726 			 * at the given address.  This routine uses the
727 			 * information in the ibt_recv_wr_t list (wr[]) and
728 			 * returns the size of the WQE when it returns.
729 			 */
730 			status = tavor_wqe_srq_build(state, srq, &wr[wrindx],
731 			    desc);
732 			if (status != DDI_SUCCESS) {
733 				break;
734 			}
735 
736 			/*
737 			 * If this is not the first descriptor on the current
738 			 * chain, then link it to the previous WQE.  Otherwise,
739 			 * save the address of this descriptor (in "first") and
740 			 * continue.
741 			 */
742 			if (currindx != 0) {
743 				tavor_wqe_srq_linknext(desc, prev, srq);
744 				sync_indx = TAVOR_SRQ_WQE_INDEX(
745 				    srq->srq_wq_buf, prev,
746 				    srq->srq_wq_log_wqesz);
747 
748 				/* Do a DMA sync for previous recv WQE */
749 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
750 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
751 
752 				prev = desc;
753 			} else {
754 
755 				/*
756 				 * In this case, the last WQE on the chain is
757 				 * also considered 'first'.  So set prev to
758 				 * first, here.
759 				 */
760 				first = prev = desc;
761 			}
762 
763 			/*
764 			 * Increment "posted_cnt"
765 			 */
766 			posted_cnt++;
767 		}
768 
769 		/*
770 		 * If we reach here and there are one or more WQEs which have
771 		 * been successfully chained together, then we need to link
772 		 * the current chain to the previously executing chain of
773 		 * descriptor (if there is one) and ring the doorbell for the
774 		 * recv work queue.
775 		 */
776 		if (currindx != 0) {
777 			/*
778 			 * Before we link the chain, we need to ensure that the
779 			 * "next" field on the last WQE is set to NULL (to
780 			 * indicate the end of the chain).
781 			 */
782 			tavor_wqe_srq_linknext(NULL, prev, srq);
783 
784 			sync_indx = TAVOR_SRQ_WQE_INDEX(srq->srq_wq_buf, prev,
785 			    srq->srq_wq_log_wqesz);
786 
787 			/* Do a DMA sync for current recv WQE */
788 			tavor_wqe_sync(srq, sync_indx, sync_indx+1,
789 			    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
790 
791 			/*
792 			 * Now link the chain to the old chain (if there was
793 			 * one).
794 			 */
795 			if (srq->srq_wq_lastwqeindx == -1) {
796 				last_wqe_addr = NULL;
797 			} else {
798 				last_wqe_addr = TAVOR_SRQ_WQE_ADDR(srq,
799 				    srq->srq_wq_lastwqeindx);
800 			}
801 			tavor_wqe_srq_linknext(first, last_wqe_addr, srq);
802 
803 			/*
804 			 * If there was a valid previous WQE (i.e. valid index),
805 			 * then sync it too.  This is because we have updated
806 			 * its "next" fields and we want to ensure that the
807 			 * hardware can see the changes.
808 			 */
809 			if (srq->srq_wq_lastwqeindx != -1) {
810 				sync_indx = srq->srq_wq_lastwqeindx;
811 				tavor_wqe_sync(srq, sync_indx, sync_indx+1,
812 				    TAVOR_WR_SRQ, DDI_DMA_SYNC_FORDEV);
813 			}
814 
815 			/* Update some of the state in the QP */
816 			srq->srq_wq_lastwqeindx = TAVOR_SRQ_WQE_INDEX(
817 			    srq->srq_wq_buf, desc,
818 			    srq->srq_wq_log_wqesz);
819 
820 			/* Ring the doorbell */
821 			/* SRQ needs NDS of 0 */
822 			tavor_qp_recv_doorbell(state,
823 			    (uint32_t)((uintptr_t)first - srq->srq_desc_off),
824 			    0, srq->srq_srqnum, (chainlen % maxdb));
825 		}
826 	}
827 
828 	/*
829 	 * Update the "num_posted" return value (if necessary).  Then drop
830 	 * the locks and return success.
831 	 */
832 	if (num_posted != NULL) {
833 		*num_posted = posted_cnt;
834 	}
835 
836 	mutex_exit(&srq->srq_wrid_wql->wql_lock);
837 	mutex_exit(&srq->srq_lock);
838 
839 	return (status);
840 }
841 
842 
843 /*
844  * tavor_qp_send_doorbell()
845  *    Context: Can be called from interrupt or base context.
846  */
847 static void
tavor_qp_send_doorbell(tavor_state_t * state,uint32_t nda,uint32_t nds,uint32_t qpn,uint32_t fence,uint32_t nopcode)848 tavor_qp_send_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
849     uint32_t qpn, uint32_t fence, uint32_t nopcode)
850 {
851 	uint64_t	doorbell = 0;
852 
853 	/* Build the doorbell from the parameters */
854 	doorbell = (((uint64_t)nda & TAVOR_QPSNDDB_NDA_MASK) <<
855 	    TAVOR_QPSNDDB_NDA_SHIFT) |
856 	    ((uint64_t)fence << TAVOR_QPSNDDB_F_SHIFT) |
857 	    ((uint64_t)nopcode << TAVOR_QPSNDDB_NOPCODE_SHIFT) |
858 	    ((uint64_t)qpn << TAVOR_QPSNDDB_QPN_SHIFT) | nds;
859 
860 	/* Write the doorbell to UAR */
861 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->send,
862 	    doorbell);
863 }
864 
865 
866 /*
867  * tavor_qp_recv_doorbell()
868  *    Context: Can be called from interrupt or base context.
869  */
870 static void
tavor_qp_recv_doorbell(tavor_state_t * state,uint32_t nda,uint32_t nds,uint32_t qpn,uint32_t credits)871 tavor_qp_recv_doorbell(tavor_state_t *state, uint32_t nda, uint32_t nds,
872     uint32_t qpn, uint32_t credits)
873 {
874 	uint64_t	doorbell = 0;
875 
876 	/* Build the doorbell from the parameters */
877 	doorbell = (((uint64_t)nda & TAVOR_QPRCVDB_NDA_MASK) <<
878 	    TAVOR_QPRCVDB_NDA_SHIFT) |
879 	    ((uint64_t)nds << TAVOR_QPRCVDB_NDS_SHIFT) |
880 	    ((uint64_t)qpn << TAVOR_QPRCVDB_QPN_SHIFT) | credits;
881 
882 	/* Write the doorbell to UAR */
883 	TAVOR_UAR_DOORBELL(state, (uint64_t *)&state->ts_uar->recv,
884 	    doorbell);
885 }
886 
887 
888 /*
889  * tavor_wqe_send_build()
890  *    Context: Can be called from interrupt or base context.
891  */
892 static int
tavor_wqe_send_build(tavor_state_t * state,tavor_qphdl_t qp,ibt_send_wr_t * wr,uint64_t * desc,uint_t * size)893 tavor_wqe_send_build(tavor_state_t *state, tavor_qphdl_t qp,
894     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
895 {
896 	tavor_hw_snd_wqe_ud_t		*ud;
897 	tavor_hw_snd_wqe_remaddr_t	*rc;
898 	tavor_hw_snd_wqe_atomic_t	*at;
899 	tavor_hw_snd_wqe_remaddr_t	*uc;
900 	tavor_hw_snd_wqe_bind_t		*bn;
901 	tavor_hw_wqe_sgl_t		*ds;
902 	ibt_wr_ds_t			*sgl;
903 	tavor_ahhdl_t			ah;
904 	uint32_t			nds;
905 	int				i, num_ds, status;
906 
907 	ASSERT(MUTEX_HELD(&qp->qp_lock));
908 
909 	/* Initialize the information for the Data Segments */
910 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
911 	    sizeof (tavor_hw_snd_wqe_nextctrl_t));
912 	nds = wr->wr_nds;
913 	sgl = wr->wr_sgl;
914 	num_ds = 0;
915 
916 	/*
917 	 * Build a Send WQE depends first and foremost on the transport
918 	 * type of Work Request (i.e. UD, RC, or UC)
919 	 */
920 	switch (wr->wr_trans) {
921 	case IBT_UD_SRV:
922 		/* Ensure that work request transport type matches QP type */
923 		if (qp->qp_serv_type != TAVOR_QP_UD) {
924 			return (IBT_QP_SRV_TYPE_INVALID);
925 		}
926 
927 		/*
928 		 * Validate the operation type.  For UD requests, only the
929 		 * "Send" operation is valid
930 		 */
931 		if (wr->wr_opcode != IBT_WRC_SEND) {
932 			return (IBT_QP_OP_TYPE_INVALID);
933 		}
934 
935 		/*
936 		 * If this is a Special QP (QP0 or QP1), then we need to
937 		 * build MLX WQEs instead.  So jump to tavor_wqe_mlx_build()
938 		 * and return whatever status it returns
939 		 */
940 		if (qp->qp_is_special) {
941 			status = tavor_wqe_mlx_build(state, qp, wr, desc, size);
942 			return (status);
943 		}
944 
945 		/*
946 		 * Otherwise, if this is a normal UD Send request, then fill
947 		 * all the fields in the Tavor UD header for the WQE.  Note:
948 		 * to do this we'll need to extract some information from the
949 		 * Address Handle passed with the work request.
950 		 */
951 		ud = (tavor_hw_snd_wqe_ud_t *)((uintptr_t)desc +
952 		    sizeof (tavor_hw_snd_wqe_nextctrl_t));
953 		ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
954 		if (ah == NULL) {
955 			return (IBT_AH_HDL_INVALID);
956 		}
957 
958 		/*
959 		 * Build the Unreliable Datagram Segment for the WQE, using
960 		 * the information from the address handle and the work
961 		 * request.
962 		 */
963 		mutex_enter(&ah->ah_lock);
964 		TAVOR_WQE_BUILD_UD(qp, ud, ah, wr);
965 		mutex_exit(&ah->ah_lock);
966 
967 		/* Update "ds" for filling in Data Segments (below) */
968 		ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)ud +
969 		    sizeof (tavor_hw_snd_wqe_ud_t));
970 		break;
971 
972 	case IBT_RC_SRV:
973 		/* Ensure that work request transport type matches QP type */
974 		if (qp->qp_serv_type != TAVOR_QP_RC) {
975 			return (IBT_QP_SRV_TYPE_INVALID);
976 		}
977 
978 		/*
979 		 * Validate the operation type.  For RC requests, we allow
980 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
981 		 * operations, and memory window "Bind"
982 		 */
983 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
984 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
985 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
986 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
987 		    (wr->wr_opcode != IBT_WRC_FADD) &&
988 		    (wr->wr_opcode != IBT_WRC_BIND)) {
989 			return (IBT_QP_OP_TYPE_INVALID);
990 		}
991 
992 		/*
993 		 * If this is a Send request, then all we need to do is break
994 		 * out and here and begin the Data Segment processing below
995 		 */
996 		if (wr->wr_opcode == IBT_WRC_SEND) {
997 			break;
998 		}
999 
1000 		/*
1001 		 * If this is an RDMA Read or RDMA Write request, then fill
1002 		 * in the "Remote Address" header fields.
1003 		 */
1004 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1005 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1006 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1007 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1008 
1009 			/*
1010 			 * Build the Remote Address Segment for the WQE, using
1011 			 * the information from the RC work request.
1012 			 */
1013 			TAVOR_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1014 
1015 			/* Update "ds" for filling in Data Segments (below) */
1016 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)rc +
1017 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1018 			break;
1019 		}
1020 
1021 		/*
1022 		 * If this is one of the Atomic type operations (i.e
1023 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1024 		 * Address" header fields and the "Atomic" header fields.
1025 		 */
1026 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1027 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1028 			rc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1029 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1030 			at = (tavor_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1031 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1032 
1033 			/*
1034 			 * Build the Remote Address and Atomic Segments for
1035 			 * the WQE, using the information from the RC Atomic
1036 			 * work request.
1037 			 */
1038 			TAVOR_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1039 			TAVOR_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1040 
1041 			/* Update "ds" for filling in Data Segments (below) */
1042 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)at +
1043 			    sizeof (tavor_hw_snd_wqe_atomic_t));
1044 
1045 			/*
1046 			 * Update "nds" and "sgl" because Atomic requests have
1047 			 * only a single Data Segment (and they are encoded
1048 			 * somewhat differently in the work request.
1049 			 */
1050 			nds = 1;
1051 			sgl = wr->wr_sgl;
1052 			break;
1053 		}
1054 
1055 		/*
1056 		 * If this is memory window Bind operation, then we call the
1057 		 * tavor_wr_bind_check() routine to validate the request and
1058 		 * to generate the updated RKey.  If this is successful, then
1059 		 * we fill in the WQE's "Bind" header fields.
1060 		 */
1061 		if (wr->wr_opcode == IBT_WRC_BIND) {
1062 			status = tavor_wr_bind_check(state, wr);
1063 			if (status != DDI_SUCCESS) {
1064 				return (status);
1065 			}
1066 
1067 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1068 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1069 
1070 			/*
1071 			 * Build the Bind Memory Window Segments for the WQE,
1072 			 * using the information from the RC Bind memory
1073 			 * window work request.
1074 			 */
1075 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1076 
1077 			/*
1078 			 * Update the "ds" pointer.  Even though the "bind"
1079 			 * operation requires no SGLs, this is necessary to
1080 			 * facilitate the correct descriptor size calculations
1081 			 * (below).
1082 			 */
1083 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1084 			    sizeof (tavor_hw_snd_wqe_bind_t));
1085 			nds = 0;
1086 		}
1087 		break;
1088 
1089 	case IBT_UC_SRV:
1090 		/* Ensure that work request transport type matches QP type */
1091 		if (qp->qp_serv_type != TAVOR_QP_UC) {
1092 			return (IBT_QP_SRV_TYPE_INVALID);
1093 		}
1094 
1095 		/*
1096 		 * Validate the operation type.  For UC requests, we only
1097 		 * allow "Send", "RDMA Write", and memory window "Bind".
1098 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1099 		 * operations
1100 		 */
1101 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1102 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1103 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1104 			return (IBT_QP_OP_TYPE_INVALID);
1105 		}
1106 
1107 		/*
1108 		 * If this is a Send request, then all we need to do is break
1109 		 * out and here and begin the Data Segment processing below
1110 		 */
1111 		if (wr->wr_opcode == IBT_WRC_SEND) {
1112 			break;
1113 		}
1114 
1115 		/*
1116 		 * If this is an RDMA Write request, then fill in the "Remote
1117 		 * Address" header fields.
1118 		 */
1119 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1120 			uc = (tavor_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1121 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1122 
1123 			/*
1124 			 * Build the Remote Address Segment for the WQE, using
1125 			 * the information from the UC work request.
1126 			 */
1127 			TAVOR_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1128 
1129 			/* Update "ds" for filling in Data Segments (below) */
1130 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)uc +
1131 			    sizeof (tavor_hw_snd_wqe_remaddr_t));
1132 			break;
1133 		}
1134 
1135 		/*
1136 		 * If this is memory window Bind operation, then we call the
1137 		 * tavor_wr_bind_check() routine to validate the request and
1138 		 * to generate the updated RKey.  If this is successful, then
1139 		 * we fill in the WQE's "Bind" header fields.
1140 		 */
1141 		if (wr->wr_opcode == IBT_WRC_BIND) {
1142 			status = tavor_wr_bind_check(state, wr);
1143 			if (status != DDI_SUCCESS) {
1144 				return (status);
1145 			}
1146 
1147 			bn = (tavor_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1148 			    sizeof (tavor_hw_snd_wqe_nextctrl_t));
1149 
1150 			/*
1151 			 * Build the Bind Memory Window Segments for the WQE,
1152 			 * using the information from the UC Bind memory
1153 			 * window work request.
1154 			 */
1155 			TAVOR_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1156 
1157 			/*
1158 			 * Update the "ds" pointer.  Even though the "bind"
1159 			 * operation requires no SGLs, this is necessary to
1160 			 * facilitate the correct descriptor size calculations
1161 			 * (below).
1162 			 */
1163 			ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)bn +
1164 			    sizeof (tavor_hw_snd_wqe_bind_t));
1165 			nds = 0;
1166 		}
1167 		break;
1168 
1169 	default:
1170 		return (IBT_QP_SRV_TYPE_INVALID);
1171 	}
1172 
1173 	/*
1174 	 * Now fill in the Data Segments (SGL) for the Send WQE based on
1175 	 * the values setup above (i.e. "sgl", "nds", and the "ds" pointer
1176 	 * Start by checking for a valid number of SGL entries
1177 	 */
1178 	if (nds > qp->qp_sq_sgl) {
1179 		return (IBT_QP_SGL_LEN_INVALID);
1180 	}
1181 
1182 	/*
1183 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1184 	 * segments.  Note: We skip any SGL with zero size because Tavor
1185 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1186 	 * the encoding for zero means a 2GB transfer.  Because of this special
1187 	 * encoding in the hardware, we mask the requested length with
1188 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1189 	 * zero.)
1190 	 */
1191 	for (i = 0; i < nds; i++) {
1192 		if (sgl[i].ds_len == 0) {
1193 			continue;
1194 		}
1195 
1196 		/*
1197 		 * Fill in the Data Segment(s) for the current WQE, using the
1198 		 * information contained in the scatter-gather list of the
1199 		 * work request.
1200 		 */
1201 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1202 		num_ds++;
1203 	}
1204 
1205 	/* Return the size of descriptor (in 16-byte chunks) */
1206 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 4;
1207 
1208 	return (DDI_SUCCESS);
1209 }
1210 
1211 
1212 /*
1213  * tavor_wqe_send_linknext()
1214  *    Context: Can be called from interrupt or base context.
1215  */
1216 static void
tavor_wqe_send_linknext(ibt_send_wr_t * curr_wr,ibt_send_wr_t * prev_wr,uint64_t * curr_desc,uint_t curr_descsz,uint64_t * prev_desc,tavor_sw_wqe_dbinfo_t * dbinfo,tavor_qphdl_t qp)1217 tavor_wqe_send_linknext(ibt_send_wr_t *curr_wr, ibt_send_wr_t *prev_wr,
1218     uint64_t *curr_desc, uint_t curr_descsz, uint64_t *prev_desc,
1219     tavor_sw_wqe_dbinfo_t *dbinfo, tavor_qphdl_t qp)
1220 {
1221 	uint64_t	next, ctrl;
1222 	uint32_t	nopcode, fence;
1223 
1224 	/*
1225 	 * Calculate the "next" field of the descriptor.  This amounts to
1226 	 * setting up the "next_wqe_addr", "nopcode", "fence", and "nds"
1227 	 * fields (see tavor_hw.h for more).  Note:  If there is no next
1228 	 * descriptor (i.e. if the current descriptor is the last WQE on
1229 	 * the chain), then set "next" to zero.
1230 	 */
1231 	if (curr_desc != NULL) {
1232 		/*
1233 		 * Determine the value for the Tavor WQE "nopcode" field
1234 		 * by using the IBTF opcode from the work request
1235 		 */
1236 		switch (curr_wr->wr_opcode) {
1237 		case IBT_WRC_RDMAW:
1238 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1239 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAWI;
1240 			} else {
1241 				nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAW;
1242 			}
1243 			break;
1244 
1245 		case IBT_WRC_SEND:
1246 			if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
1247 				nopcode = TAVOR_WQE_SEND_NOPCODE_SENDI;
1248 			} else {
1249 				nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1250 			}
1251 			break;
1252 
1253 		case IBT_WRC_RDMAR:
1254 			nopcode = TAVOR_WQE_SEND_NOPCODE_RDMAR;
1255 			break;
1256 
1257 		case IBT_WRC_CSWAP:
1258 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMCS;
1259 			break;
1260 
1261 		case IBT_WRC_FADD:
1262 			nopcode = TAVOR_WQE_SEND_NOPCODE_ATMFA;
1263 			break;
1264 
1265 		case IBT_WRC_BIND:
1266 			nopcode = TAVOR_WQE_SEND_NOPCODE_BIND;
1267 			break;
1268 		}
1269 
1270 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc
1271 		    - qp->qp_desc_off);
1272 		next  = ((uint64_t)(uintptr_t)curr_desc &
1273 		    TAVOR_WQE_NDA_MASK) << 32;
1274 		next  = next | ((uint64_t)nopcode << 32);
1275 		fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
1276 		if (fence) {
1277 			next = next | TAVOR_WQE_SEND_FENCE_MASK;
1278 		}
1279 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1280 
1281 		/*
1282 		 * If a send queue doorbell will be rung for the next
1283 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1284 		 * Note: We also update the "dbinfo" structure here to pass
1285 		 * back information about what should (later) be included
1286 		 * in the send queue doorbell.
1287 		 */
1288 		if (dbinfo) {
1289 			next = next | TAVOR_WQE_DBD_MASK;
1290 			dbinfo->db_nopcode = nopcode;
1291 			dbinfo->db_fence   = fence;
1292 		}
1293 	} else {
1294 		next = 0;
1295 	}
1296 
1297 	/*
1298 	 * If this WQE is supposed to be linked to the previous descriptor,
1299 	 * then we need to update not only the previous WQE's "next" fields
1300 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c", "e",
1301 	 * "s", "i" and "immediate" fields - see tavor_hw.h for more).  Note:
1302 	 * the "e" bit is always hardcoded to zero.
1303 	 */
1304 	if (prev_desc != NULL) {
1305 		/*
1306 		 * If a send queue doorbell will be rung for the next WQE on
1307 		 * the chain, then update the current WQE's "next" field and
1308 		 * return.
1309 		 * Note: We don't want to modify the "ctrl" field here because
1310 		 * that portion of the previous WQE has already been set
1311 		 * correctly at some previous point in time.
1312 		 */
1313 		if (dbinfo) {
1314 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1315 			return;
1316 		}
1317 
1318 		ctrl = 0;
1319 
1320 		/* Set the "c" (i.e. "signaled") bit appropriately */
1321 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1322 			ctrl = ctrl | TAVOR_WQE_SEND_SIGNALED_MASK;
1323 		}
1324 
1325 		/* Set the "s" (i.e. "solicited") bit appropriately */
1326 		if (prev_wr->wr_flags & IBT_WR_SEND_SOLICIT) {
1327 			ctrl = ctrl | TAVOR_WQE_SEND_SOLICIT_MASK;
1328 		}
1329 
1330 		/* Set the "i" bit and the immediate data appropriately */
1331 		if (prev_wr->wr_flags & IBT_WR_SEND_IMMED) {
1332 			ctrl = ctrl | TAVOR_WQE_SEND_IMMEDIATE_MASK;
1333 			ctrl = ctrl | tavor_wr_get_immediate(prev_wr);
1334 		}
1335 
1336 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1337 	}
1338 }
1339 
1340 
1341 /*
1342  * tavor_wqe_mlx_build()
1343  *    Context: Can be called from interrupt or base context.
1344  */
1345 static int
tavor_wqe_mlx_build(tavor_state_t * state,tavor_qphdl_t qp,ibt_send_wr_t * wr,uint64_t * desc,uint_t * size)1346 tavor_wqe_mlx_build(tavor_state_t *state, tavor_qphdl_t qp,
1347     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1348 {
1349 	tavor_hw_udav_t		udav;
1350 	tavor_ahhdl_t		ah;
1351 	ib_lrh_hdr_t		*lrh;
1352 	ib_grh_t		*grh;
1353 	ib_bth_hdr_t		*bth;
1354 	ib_deth_hdr_t		*deth;
1355 	tavor_hw_wqe_sgl_t	*ds;
1356 	ibt_wr_ds_t		*sgl;
1357 	uint8_t			*mgmtclass, *hpoint, *hcount;
1358 	uint64_t		data;
1359 	uint32_t		nds, offset, pktlen;
1360 	uint32_t		desc_sz, udav_sz;
1361 	int			i, num_ds;
1362 
1363 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1364 
1365 	/* Initialize the information for the Data Segments */
1366 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1367 	    sizeof (tavor_hw_mlx_wqe_nextctrl_t));
1368 
1369 	/*
1370 	 * Pull the address handle from the work request and read in
1371 	 * the contents of the UDAV.  This will be used to answer some
1372 	 * questions about the request.
1373 	 */
1374 	ah = (tavor_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1375 	if (ah == NULL) {
1376 		return (IBT_AH_HDL_INVALID);
1377 	}
1378 	mutex_enter(&ah->ah_lock);
1379 	udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1380 	for (i = 0; i < udav_sz; i++) {
1381 		data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1382 		    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1383 		((uint64_t *)&udav)[i] = data;
1384 	}
1385 	mutex_exit(&ah->ah_lock);
1386 
1387 	/*
1388 	 * If the request is for QP1 and the destination LID is equal to
1389 	 * the Permissive LID, then return an error.  This combination is
1390 	 * not allowed
1391 	 */
1392 	if ((udav.rlid == IB_LID_PERMISSIVE) &&
1393 	    (qp->qp_is_special == TAVOR_QP_GSI)) {
1394 		return (IBT_AH_HDL_INVALID);
1395 	}
1396 
1397 	/*
1398 	 * Calculate the size of the packet headers, including the GRH
1399 	 * (if necessary)
1400 	 */
1401 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1402 	    sizeof (ib_deth_hdr_t);
1403 	if (udav.grh) {
1404 		desc_sz += sizeof (ib_grh_t);
1405 	}
1406 
1407 	/*
1408 	 * Begin to build the first "inline" data segment for the packet
1409 	 * headers.  Note:  By specifying "inline" we can build the contents
1410 	 * of the MAD packet headers directly into the work queue (as part
1411 	 * descriptor).  This has the advantage of both speeding things up
1412 	 * and of not requiring the driver to allocate/register any additional
1413 	 * memory for the packet headers.
1414 	 */
1415 	TAVOR_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1416 	desc_sz += 4;
1417 
1418 	/*
1419 	 * Build Local Route Header (LRH)
1420 	 *    We start here by building the LRH into a temporary location.
1421 	 *    When we have finished we copy the LRH data into the descriptor.
1422 	 *
1423 	 *    Notice that the VL values are hardcoded.  This is not a problem
1424 	 *    because VL15 is decided later based on the value in the MLX
1425 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1426 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1427 	 *    values.  This rule does not hold for loopback packets however
1428 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1429 	 *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1430 	 *
1431 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1432 	 *    (0xFFFF).  This is also not a problem because if the Destination
1433 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1434 	 *    transport "next/ctrl" header will be set to zero and the hardware
1435 	 *    will pull the LID from value in the port.
1436 	 */
1437 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
1438 	pktlen = (desc_sz + 0x100) >> 2;
1439 	TAVOR_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1440 
1441 	/*
1442 	 * Build Global Route Header (GRH)
1443 	 *    This is only built if necessary as defined by the "grh" bit in
1444 	 *    the address vector.  Note:  We also calculate the offset to the
1445 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1446 	 */
1447 	if (udav.grh) {
1448 		/*
1449 		 * If the request is for QP0, then return an error.  The
1450 		 * combination of global routine (GRH) and QP0 is not allowed.
1451 		 */
1452 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1453 			return (IBT_AH_HDL_INVALID);
1454 		}
1455 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1456 		TAVOR_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1457 
1458 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1459 	} else {
1460 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1461 	}
1462 
1463 
1464 	/*
1465 	 * Build Base Transport Header (BTH)
1466 	 *    Notice that the M, PadCnt, and TVer fields are all set
1467 	 *    to zero implicitly.  This is true for all Management Datagrams
1468 	 *    MADs whether GSI are SMI.
1469 	 */
1470 	TAVOR_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1471 
1472 	/*
1473 	 * Build Datagram Extended Transport Header (DETH)
1474 	 */
1475 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1476 	TAVOR_WQE_BUILD_MLX_DETH(deth, qp);
1477 
1478 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1479 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1480 	ds = (tavor_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1481 	nds = wr->wr_nds;
1482 	sgl = wr->wr_sgl;
1483 	num_ds = 0;
1484 
1485 	/*
1486 	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
1487 	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer
1488 	 * Start by checking for a valid number of SGL entries
1489 	 */
1490 	if (nds > qp->qp_sq_sgl) {
1491 		return (IBT_QP_SGL_LEN_INVALID);
1492 	}
1493 
1494 	/*
1495 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1496 	 * segments.  Note: We skip any SGL with zero size because Tavor
1497 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1498 	 * the encoding for zero means a 2GB transfer.  Because of this special
1499 	 * encoding in the hardware, we mask the requested length with
1500 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1501 	 * zero.)
1502 	 */
1503 	mgmtclass = hpoint = hcount = NULL;
1504 	offset = 0;
1505 	for (i = 0; i < nds; i++) {
1506 		if (sgl[i].ds_len == 0) {
1507 			continue;
1508 		}
1509 
1510 		/*
1511 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1512 		 * the information contained in the scatter-gather list of
1513 		 * the work request.
1514 		 */
1515 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &sgl[i]);
1516 
1517 		/*
1518 		 * Search through the contents of all MADs posted to QP0 to
1519 		 * initialize pointers to the places where Directed Route "hop
1520 		 * pointer", "hop count", and "mgmtclass" would be.  Tavor
1521 		 * needs these updated (i.e. incremented or decremented, as
1522 		 * necessary) by software.
1523 		 */
1524 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1525 
1526 			TAVOR_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1527 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1528 
1529 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1530 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1531 
1532 			TAVOR_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1533 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1534 
1535 			offset += sgl[i].ds_len;
1536 		}
1537 		num_ds++;
1538 	}
1539 
1540 	/*
1541 	 * Tavor's Directed Route MADs need to have the "hop pointer"
1542 	 * incremented/decremented (as necessary) depending on whether it is
1543 	 * currently less than or greater than the "hop count" (i.e. whether
1544 	 * the MAD is a request or a response.)
1545 	 */
1546 	if (qp->qp_is_special == TAVOR_QP_SMI) {
1547 		TAVOR_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1548 		    *hpoint, *hcount);
1549 	}
1550 
1551 	/*
1552 	 * Now fill in the ICRC Data Segment.  This data segment is inlined
1553 	 * just like the packets headers above, but it is only four bytes and
1554 	 * set to zero (to indicate that we wish the hardware to generate ICRC.
1555 	 */
1556 	TAVOR_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1557 	num_ds++;
1558 
1559 	/* Return the size of descriptor (in 16-byte chunks) */
1560 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1561 
1562 	return (DDI_SUCCESS);
1563 }
1564 
1565 
1566 /*
1567  * tavor_wqe_mlx_linknext()
1568  *    Context: Can be called from interrupt or base context.
1569  */
1570 static void
tavor_wqe_mlx_linknext(ibt_send_wr_t * prev_wr,uint64_t * curr_desc,uint_t curr_descsz,uint64_t * prev_desc,tavor_sw_wqe_dbinfo_t * dbinfo,tavor_qphdl_t qp)1571 tavor_wqe_mlx_linknext(ibt_send_wr_t *prev_wr, uint64_t *curr_desc,
1572     uint_t curr_descsz, uint64_t *prev_desc, tavor_sw_wqe_dbinfo_t *dbinfo,
1573     tavor_qphdl_t qp)
1574 {
1575 	tavor_hw_udav_t		udav;
1576 	tavor_ahhdl_t		ah;
1577 	uint64_t		next, ctrl, data;
1578 	uint_t			nopcode;
1579 	uint_t			udav_sz;
1580 	int			i;
1581 
1582 	/*
1583 	 * Calculate the "next" field of the descriptor.  This amounts to
1584 	 * setting up the "next_wqe_addr", "nopcode", and "nds" fields (see
1585 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1586 	 * if the current descriptor is the last WQE on the chain), then set
1587 	 * "next" to zero.
1588 	 */
1589 	if (curr_desc != NULL) {
1590 		/*
1591 		 * The only valid Tavor WQE "nopcode" for MLX transport
1592 		 * requests is the "Send" code.
1593 		 */
1594 		nopcode = TAVOR_WQE_SEND_NOPCODE_SEND;
1595 		curr_desc = (uint64_t *)(uintptr_t)((uint64_t)
1596 		    (uintptr_t)curr_desc - qp->qp_desc_off);
1597 		next = (uint64_t)((uintptr_t)curr_desc &
1598 		    TAVOR_WQE_NDA_MASK) << 32;
1599 		next = next | ((uint64_t)nopcode << 32);
1600 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK);
1601 
1602 		/*
1603 		 * If a send queue doorbell will be rung for the next
1604 		 * WQE on the chain, then set the current WQE's "dbd" bit.
1605 		 * Note: We also update the "dbinfo" structure here to pass
1606 		 * back information about what should (later) be included
1607 		 * in the send queue doorbell.
1608 		 */
1609 		if (dbinfo) {
1610 			next = next | TAVOR_WQE_DBD_MASK;
1611 			dbinfo->db_nopcode = nopcode;
1612 			dbinfo->db_fence   = 0;
1613 		}
1614 	} else {
1615 		next = 0;
1616 	}
1617 
1618 	/*
1619 	 * If this WQE is supposed to be linked to the previous descriptor,
1620 	 * then we need to update not only the previous WQE's "next" fields
1621 	 * but we must also update this WQE's "ctrl" fields (i.e. the "vl15",
1622 	 * "slr", "max_srate", "sl", "c", "e", "rlid", and "vcrc" fields -
1623 	 * see tavor_hw.h for more) Note: the "e" bit and "vcrc" fields are
1624 	 * always hardcoded to zero.
1625 	 */
1626 	if (prev_desc != NULL) {
1627 		/*
1628 		 * If a send queue doorbell will be rung for the next WQE on
1629 		 * the chain, then update the current WQE's "next" field and
1630 		 * return.
1631 		 * Note: We don't want to modify the "ctrl" field here because
1632 		 * that portion of the previous WQE has already been set
1633 		 * correctly at some previous point in time.
1634 		 */
1635 		if (dbinfo) {
1636 			TAVOR_WQE_LINKFIRST(qp, prev_desc, next);
1637 			return;
1638 		}
1639 
1640 		/*
1641 		 * Pull the address handle from the work request and read in
1642 		 * the contents of the UDAV.  This will be used to answer some
1643 		 * questions about the request.
1644 		 */
1645 		ah = (tavor_ahhdl_t)prev_wr->wr.ud.udwr_dest->ud_ah;
1646 		mutex_enter(&ah->ah_lock);
1647 		udav_sz = sizeof (tavor_hw_udav_t) >> 3;
1648 		for (i = 0; i < udav_sz; i++) {
1649 			data = ddi_get64(ah->ah_udavrsrcp->tr_acchdl,
1650 			    ((uint64_t *)ah->ah_udavrsrcp->tr_addr + i));
1651 			((uint64_t *)&udav)[i] = data;
1652 		}
1653 		mutex_exit(&ah->ah_lock);
1654 
1655 		ctrl = 0;
1656 
1657 		/* Only QP0 uses VL15, otherwise use VL in the packet */
1658 		if (qp->qp_is_special == TAVOR_QP_SMI) {
1659 			ctrl = ctrl | TAVOR_WQE_MLXHDR_VL15_MASK;
1660 		}
1661 
1662 		/*
1663 		 * The SLR (Source LID Replace) bit determines whether the
1664 		 * source LID for an outgoing MLX packet should come from the
1665 		 * PortInfo (SLR = 0) or should be left as it is in the
1666 		 * descriptor (SLR = 1).  The latter is necessary for packets
1667 		 * to be sent with the Permissive LID.
1668 		 */
1669 		if (udav.rlid == IB_LID_PERMISSIVE) {
1670 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SLR_MASK;
1671 		}
1672 
1673 		/* Fill in the max static rate from the address handle */
1674 		ctrl = ctrl | ((uint64_t)udav.max_stat_rate <<
1675 		    TAVOR_WQE_MLXHDR_SRATE_SHIFT);
1676 
1677 		/* All VL15 (i.e. SMI) traffic is required to use SL 0 */
1678 		if (qp->qp_is_special != TAVOR_QP_SMI) {
1679 			ctrl = ctrl | ((uint64_t)udav.sl <<
1680 			    TAVOR_WQE_MLXHDR_SL_SHIFT);
1681 		}
1682 
1683 		/* Set the "c" (i.e. "signaled") bit appropriately */
1684 		if (prev_wr->wr_flags & IBT_WR_SEND_SIGNAL) {
1685 			ctrl = ctrl | TAVOR_WQE_MLXHDR_SIGNALED_MASK;
1686 		}
1687 
1688 		/* Fill in the destination LID from the address handle */
1689 		ctrl = ctrl | ((uint64_t)udav.rlid <<
1690 		    TAVOR_WQE_MLXHDR_RLID_SHIFT);
1691 
1692 		TAVOR_WQE_LINKNEXT(qp, prev_desc, ctrl, next);
1693 	}
1694 }
1695 
1696 
1697 /*
1698  * tavor_wqe_recv_build()
1699  *    Context: Can be called from interrupt or base context.
1700  */
1701 /* ARGSUSED */
1702 static int
tavor_wqe_recv_build(tavor_state_t * state,tavor_qphdl_t qp,ibt_recv_wr_t * wr,uint64_t * desc,uint_t * size)1703 tavor_wqe_recv_build(tavor_state_t *state, tavor_qphdl_t qp,
1704     ibt_recv_wr_t *wr, uint64_t *desc, uint_t *size)
1705 {
1706 	tavor_hw_wqe_sgl_t	*ds;
1707 	int			i, num_ds;
1708 
1709 	ASSERT(MUTEX_HELD(&qp->qp_lock));
1710 
1711 	/* Check that work request transport type is valid */
1712 	if ((qp->qp_serv_type != TAVOR_QP_UD) &&
1713 	    (qp->qp_serv_type != TAVOR_QP_RC) &&
1714 	    (qp->qp_serv_type != TAVOR_QP_UC)) {
1715 		return (IBT_QP_SRV_TYPE_INVALID);
1716 	}
1717 
1718 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1719 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1720 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1721 	num_ds = 0;
1722 
1723 	/* Check for valid number of SGL entries */
1724 	if (wr->wr_nds > qp->qp_rq_sgl) {
1725 		return (IBT_QP_SGL_LEN_INVALID);
1726 	}
1727 
1728 	/*
1729 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1730 	 * segments.  Note: We skip any SGL with zero size because Tavor
1731 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1732 	 * the encoding for zero means a 2GB transfer.  Because of this special
1733 	 * encoding in the hardware, we mask the requested length with
1734 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1735 	 * zero.)
1736 	 */
1737 	for (i = 0; i < wr->wr_nds; i++) {
1738 		if (wr->wr_sgl[i].ds_len == 0) {
1739 			continue;
1740 		}
1741 
1742 		/*
1743 		 * Fill in the Data Segment(s) for the receive WQE, using the
1744 		 * information contained in the scatter-gather list of the
1745 		 * work request.
1746 		 */
1747 		TAVOR_WQE_BUILD_DATA_SEG(qp, &ds[num_ds], &wr->wr_sgl[i]);
1748 		num_ds++;
1749 	}
1750 
1751 	/* Return the size of descriptor (in 16-byte chunks) */
1752 	*size = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc) >> 0x4;
1753 
1754 	return (DDI_SUCCESS);
1755 }
1756 
1757 
1758 /*
1759  * tavor_wqe_recv_linknext()
1760  *    Context: Can be called from interrupt or base context.
1761  */
1762 static void
tavor_wqe_recv_linknext(uint64_t * curr_desc,uint_t curr_descsz,uint64_t * prev_desc,tavor_qphdl_t qp)1763 tavor_wqe_recv_linknext(uint64_t *curr_desc, uint_t curr_descsz,
1764     uint64_t *prev_desc, tavor_qphdl_t qp)
1765 {
1766 	uint64_t	next;
1767 
1768 	/*
1769 	 * Calculate the "next" field of the descriptor.  This amounts to
1770 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1771 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1772 	 * if the current descriptor is the last WQE on the chain), then set
1773 	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1774 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1775 	 * In either case, we must add a single bit in the "reserved" field
1776 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1777 	 * workaround for a known Tavor errata that can cause Recv WQEs with
1778 	 * zero in the NDA field to behave improperly.
1779 	 */
1780 	if (curr_desc != NULL) {
1781 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1782 		    qp->qp_desc_off);
1783 		next = (uint64_t)((uintptr_t)curr_desc &
1784 		    TAVOR_WQE_NDA_MASK) << 32;
1785 		next = next | (curr_descsz & TAVOR_WQE_NDS_MASK) |
1786 		    TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1787 	} else {
1788 		next = TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1789 	}
1790 
1791 	/*
1792 	 * If this WQE is supposed to be linked to the previous descriptor,
1793 	 * then we need to update not only the previous WQE's "next" fields
1794 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1795 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1796 	 * bits are always hardcoded to zero.
1797 	 */
1798 	if (prev_desc != NULL) {
1799 		TAVOR_WQE_LINKNEXT(qp, prev_desc, 0, next);
1800 	}
1801 }
1802 
1803 
1804 /*
1805  * tavor_wqe_srq_build()
1806  *    Context: Can be called from interrupt or base context.
1807  */
1808 /* ARGSUSED */
1809 static int
tavor_wqe_srq_build(tavor_state_t * state,tavor_srqhdl_t srq,ibt_recv_wr_t * wr,uint64_t * desc)1810 tavor_wqe_srq_build(tavor_state_t *state, tavor_srqhdl_t srq,
1811     ibt_recv_wr_t *wr, uint64_t *desc)
1812 {
1813 	tavor_hw_wqe_sgl_t	*ds;
1814 	ibt_wr_ds_t		end_sgl;
1815 	int			i, num_ds;
1816 
1817 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1818 
1819 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1820 	ds = (tavor_hw_wqe_sgl_t *)((uintptr_t)desc +
1821 	    sizeof (tavor_hw_rcv_wqe_nextctrl_t));
1822 	num_ds = 0;
1823 
1824 	/* Check for valid number of SGL entries */
1825 	if (wr->wr_nds > srq->srq_wq_sgl) {
1826 		return (IBT_QP_SGL_LEN_INVALID);
1827 	}
1828 
1829 	/*
1830 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
1831 	 * segments.  Note: We skip any SGL with zero size because Tavor
1832 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1833 	 * the encoding for zero means a 2GB transfer.  Because of this special
1834 	 * encoding in the hardware, we mask the requested length with
1835 	 * TAVOR_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1836 	 * zero.)
1837 	 */
1838 	for (i = 0; i < wr->wr_nds; i++) {
1839 		if (wr->wr_sgl[i].ds_len == 0) {
1840 			continue;
1841 		}
1842 
1843 		/*
1844 		 * Fill in the Data Segment(s) for the receive WQE, using the
1845 		 * information contained in the scatter-gather list of the
1846 		 * work request.
1847 		 */
1848 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &wr->wr_sgl[i]);
1849 		num_ds++;
1850 	}
1851 
1852 	/*
1853 	 * For SRQ, if the number of data segments is less than the maximum
1854 	 * specified at alloc, then we have to fill in a special "key" entry in
1855 	 * the sgl entry after the last valid one in this post request.  We do
1856 	 * that here.
1857 	 */
1858 	if (num_ds < srq->srq_wq_sgl) {
1859 		end_sgl.ds_va  = 0;
1860 		end_sgl.ds_len = 0;
1861 		end_sgl.ds_key = 0x1;
1862 		TAVOR_WQE_BUILD_DATA_SEG_SRQ(srq, &ds[num_ds], &end_sgl);
1863 	}
1864 
1865 	return (DDI_SUCCESS);
1866 }
1867 
1868 
1869 /*
1870  * tavor_wqe_srq_linknext()
1871  *    Context: Can be called from interrupt or base context.
1872  */
1873 static void
tavor_wqe_srq_linknext(uint64_t * curr_desc,uint64_t * prev_desc,tavor_srqhdl_t srq)1874 tavor_wqe_srq_linknext(uint64_t *curr_desc, uint64_t *prev_desc,
1875     tavor_srqhdl_t srq)
1876 {
1877 	uint64_t	next;
1878 
1879 	/*
1880 	 * Calculate the "next" field of the descriptor.  This amounts to
1881 	 * setting up the "next_wqe_addr", "dbd", and "nds" fields (see
1882 	 * tavor_hw.h for more).  Note:  If there is no next descriptor (i.e.
1883 	 * if the current descriptor is the last WQE on the chain), then set
1884 	 * "next" field to TAVOR_WQE_DBD_MASK.  This is because the Tavor
1885 	 * hardware requires the "dbd" bit to be set to one for all Recv WQEs.
1886 	 * In either case, we must add a single bit in the "reserved" field
1887 	 * (TAVOR_RCV_WQE_NDA0_WA_MASK) following the NDA.  This is the
1888 	 * workaround for a known Tavor errata that can cause Recv WQEs with
1889 	 * zero in the NDA field to behave improperly.
1890 	 */
1891 	if (curr_desc != NULL) {
1892 		curr_desc = (uint64_t *)(uintptr_t)((uintptr_t)curr_desc -
1893 		    srq->srq_desc_off);
1894 		next = (uint64_t)((uintptr_t)curr_desc &
1895 		    TAVOR_WQE_NDA_MASK) << 32;
1896 		next = next | TAVOR_WQE_DBD_MASK | TAVOR_RCV_WQE_NDA0_WA_MASK;
1897 	} else {
1898 		next = TAVOR_RCV_WQE_NDA0_WA_MASK;
1899 	}
1900 
1901 	/*
1902 	 * If this WQE is supposed to be linked to the previous descriptor,
1903 	 * then we need to update not only the previous WQE's "next" fields
1904 	 * but we must also update this WQE's "ctrl" fields (i.e. the "c" and
1905 	 * "e" bits - see tavor_hw.h for more).  Note: both the "c" and "e"
1906 	 * bits are always hardcoded to zero.
1907 	 */
1908 	if (prev_desc != NULL) {
1909 		TAVOR_WQE_LINKNEXT_SRQ(srq, prev_desc, 0, next);
1910 	}
1911 }
1912 
1913 
1914 /*
1915  * tavor_wr_get_immediate()
1916  *    Context: Can be called from interrupt or base context.
1917  */
1918 static uint32_t
tavor_wr_get_immediate(ibt_send_wr_t * wr)1919 tavor_wr_get_immediate(ibt_send_wr_t *wr)
1920 {
1921 	/*
1922 	 * This routine extracts the "immediate data" from the appropriate
1923 	 * location in the IBTF work request.  Because of the way the
1924 	 * work request structure is defined, the location for this data
1925 	 * depends on the actual work request operation type.
1926 	 */
1927 
1928 	/* For RDMA Write, test if RC or UC */
1929 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
1930 		if (wr->wr_trans == IBT_RC_SRV) {
1931 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
1932 		} else {  /* IBT_UC_SRV */
1933 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
1934 		}
1935 	}
1936 
1937 	/* For Send, test if RC, UD, or UC */
1938 	if (wr->wr_opcode == IBT_WRC_SEND) {
1939 		if (wr->wr_trans == IBT_RC_SRV) {
1940 			return (wr->wr.rc.rcwr.send_immed);
1941 		} else if (wr->wr_trans == IBT_UD_SRV) {
1942 			return (wr->wr.ud.udwr_immed);
1943 		} else {  /* IBT_UC_SRV */
1944 			return (wr->wr.uc.ucwr.send_immed);
1945 		}
1946 	}
1947 
1948 	/*
1949 	 * If any other type of request, then immediate is undefined
1950 	 */
1951 	return (0);
1952 }
1953 
1954 
1955 /*
1956  * tavor_wqe_sync()
1957  *    Context: Can be called from interrupt or base context.
1958  */
1959 static void
tavor_wqe_sync(void * hdl,uint_t sync_from,uint_t sync_to,uint_t sync_type,uint_t flag)1960 tavor_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1961     uint_t sync_type, uint_t flag)
1962 {
1963 	tavor_qphdl_t		qp;
1964 	tavor_srqhdl_t		srq;
1965 	uint_t			is_sync_req;
1966 	uint64_t		*wqe_from, *wqe_to, *wqe_base, *wqe_top;
1967 	ddi_dma_handle_t	dmahdl;
1968 	off_t			offset;
1969 	size_t			length;
1970 	uint32_t		qsize;
1971 	int			status;
1972 
1973 	if (sync_type == TAVOR_WR_SRQ) {
1974 		srq = (tavor_srqhdl_t)hdl;
1975 		is_sync_req = srq->srq_sync;
1976 		/* Get the DMA handle from SRQ context */
1977 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
1978 	} else {
1979 		qp = (tavor_qphdl_t)hdl;
1980 		is_sync_req = qp->qp_sync;
1981 		/* Get the DMA handle from QP context */
1982 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
1983 	}
1984 
1985 	/* Determine if the work queues need to be synced or not */
1986 	if (is_sync_req == 0) {
1987 		return;
1988 	}
1989 
1990 	/*
1991 	 * Depending on the type of the work queue, we grab information
1992 	 * about the address ranges we need to DMA sync.
1993 	 */
1994 	if (sync_type == TAVOR_WR_SEND) {
1995 		wqe_from = TAVOR_QP_SQ_ENTRY(qp, sync_from);
1996 		wqe_to   = TAVOR_QP_SQ_ENTRY(qp, sync_to);
1997 		qsize	 = qp->qp_sq_bufsz;
1998 
1999 		wqe_base = TAVOR_QP_SQ_ENTRY(qp, 0);
2000 		wqe_top	 = TAVOR_QP_SQ_ENTRY(qp, qsize);
2001 	} else if (sync_type == TAVOR_WR_RECV) {
2002 		wqe_from = TAVOR_QP_RQ_ENTRY(qp, sync_from);
2003 		wqe_to   = TAVOR_QP_RQ_ENTRY(qp, sync_to);
2004 		qsize	 = qp->qp_rq_bufsz;
2005 
2006 		wqe_base = TAVOR_QP_RQ_ENTRY(qp, 0);
2007 		wqe_top	 = TAVOR_QP_RQ_ENTRY(qp, qsize);
2008 	} else {
2009 		wqe_from = TAVOR_SRQ_WQ_ENTRY(srq, sync_from);
2010 		wqe_to   = TAVOR_SRQ_WQ_ENTRY(srq, sync_to);
2011 		qsize	 = srq->srq_wq_bufsz;
2012 
2013 		wqe_base = TAVOR_SRQ_WQ_ENTRY(srq, 0);
2014 		wqe_top	 = TAVOR_SRQ_WQ_ENTRY(srq, qsize);
2015 	}
2016 
2017 	/*
2018 	 * There are two possible cases for the beginning and end of the WQE
2019 	 * chain we are trying to sync.  Either this is the simple case, where
2020 	 * the end of the chain is below the beginning of the chain, or it is
2021 	 * the "wrap-around" case, where the end of the chain has wrapped over
2022 	 * the end of the queue.  In the former case, we simply need to
2023 	 * calculate the span from beginning to end and sync it.  In the latter
2024 	 * case, however, we need to calculate the span from the top of the
2025 	 * work queue to the end of the chain and sync that, and then we need
2026 	 * to find the other portion (from beginning of chain to end of queue)
2027 	 * and sync that as well.  Note: if the "top to end" span is actually
2028 	 * zero length, then we don't do a DMA sync because a zero length DMA
2029 	 * sync unnecessarily syncs the entire work queue.
2030 	 */
2031 	if (wqe_to > wqe_from) {
2032 		/* "From Beginning to End" */
2033 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2034 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2035 
2036 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2037 		if (status != DDI_SUCCESS) {
2038 			return;
2039 		}
2040 	} else {
2041 		/* "From Top to End" */
2042 		offset = (off_t)0;
2043 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_base);
2044 		if (length) {
2045 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2046 			if (status != DDI_SUCCESS) {
2047 				return;
2048 			}
2049 		}
2050 
2051 		/* "From Beginning to Bottom" */
2052 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)wqe_base);
2053 		length = (size_t)((uintptr_t)wqe_top - (uintptr_t)wqe_from);
2054 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2055 		if (status != DDI_SUCCESS) {
2056 			return;
2057 		}
2058 	}
2059 }
2060 
2061 
2062 /*
2063  * tavor_wr_bind_check()
2064  *    Context: Can be called from interrupt or base context.
2065  */
2066 static int
tavor_wr_bind_check(tavor_state_t * state,ibt_send_wr_t * wr)2067 tavor_wr_bind_check(tavor_state_t *state, ibt_send_wr_t *wr)
2068 {
2069 	ibt_bind_flags_t	bind_flags;
2070 	uint64_t		vaddr, len;
2071 	uint64_t		reg_start_addr, reg_end_addr;
2072 	tavor_mwhdl_t		mw;
2073 	tavor_mrhdl_t		mr;
2074 	tavor_rsrc_t		*mpt;
2075 	uint32_t		new_rkey;
2076 
2077 	/* Check for a valid Memory Window handle in the WR */
2078 	mw = (tavor_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2079 	if (mw == NULL) {
2080 		return (IBT_MW_HDL_INVALID);
2081 	}
2082 
2083 	/* Check for a valid Memory Region handle in the WR */
2084 	mr = (tavor_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2085 	if (mr == NULL) {
2086 		return (IBT_MR_HDL_INVALID);
2087 	}
2088 
2089 	mutex_enter(&mr->mr_lock);
2090 	mutex_enter(&mw->mr_lock);
2091 
2092 	/*
2093 	 * Check here to see if the memory region has already been partially
2094 	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
2095 	 * If so, this is an error, return failure.
2096 	 */
2097 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2098 		mutex_exit(&mr->mr_lock);
2099 		mutex_exit(&mw->mr_lock);
2100 		return (IBT_MR_HDL_INVALID);
2101 	}
2102 
2103 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2104 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2105 		mutex_exit(&mr->mr_lock);
2106 		mutex_exit(&mw->mr_lock);
2107 		return (IBT_MR_RKEY_INVALID);
2108 	}
2109 
2110 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2111 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2112 		mutex_exit(&mr->mr_lock);
2113 		mutex_exit(&mw->mr_lock);
2114 		return (IBT_MR_LKEY_INVALID);
2115 	}
2116 
2117 	/*
2118 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2119 	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2120 	 */
2121 	len = wr->wr.rc.rcwr.bind->bind_len;
2122 	if (len != 0) {
2123 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2124 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2125 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2126 		    (mr->mr_bindinfo.bi_len - 1);
2127 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2128 			mutex_exit(&mr->mr_lock);
2129 			mutex_exit(&mw->mr_lock);
2130 			return (IBT_MR_VA_INVALID);
2131 		}
2132 		vaddr = (vaddr + len) - 1;
2133 		if (vaddr > reg_end_addr) {
2134 			mutex_exit(&mr->mr_lock);
2135 			mutex_exit(&mw->mr_lock);
2136 			return (IBT_MR_LEN_INVALID);
2137 		}
2138 	}
2139 
2140 	/*
2141 	 * Validate the bind access flags.  Remote Write and Atomic access for
2142 	 * the Memory Window require that Local Write access be set in the
2143 	 * corresponding Memory Region.
2144 	 */
2145 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2146 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2147 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2148 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2149 		mutex_exit(&mr->mr_lock);
2150 		mutex_exit(&mw->mr_lock);
2151 		return (IBT_MR_ACCESS_REQ_INVALID);
2152 	}
2153 
2154 	/* Calculate the new RKey for the Memory Window */
2155 	mpt = mw->mr_mptrsrcp;
2156 	tavor_mr_keycalc(state, mpt->tr_indx, &new_rkey);
2157 
2158 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2159 	mw->mr_rkey = new_rkey;
2160 
2161 	mutex_exit(&mr->mr_lock);
2162 	mutex_exit(&mw->mr_lock);
2163 	return (DDI_SUCCESS);
2164 }
2165 
2166 
2167 /*
2168  * tavor_wrid_from_reset_handling()
2169  *    Context: Can be called from interrupt or base context.
2170  */
2171 int
tavor_wrid_from_reset_handling(tavor_state_t * state,tavor_qphdl_t qp)2172 tavor_wrid_from_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2173 {
2174 	tavor_workq_hdr_t	*swq, *rwq;
2175 	tavor_wrid_list_hdr_t	*s_wridlist, *r_wridlist;
2176 	uint_t			create_new_swq = 0, create_new_rwq = 0;
2177 	uint_t			create_wql = 0;
2178 	uint_t			qp_srq_en;
2179 
2180 	/*
2181 	 * For each of this QP's Work Queues, make sure we have a (properly
2182 	 * initialized) Work Request ID list attached to the relevant
2183 	 * completion queue.  Grab the CQ lock(s) before manipulating the
2184 	 * lists.
2185 	 */
2186 	tavor_wrid_wqhdr_lock_both(qp);
2187 	swq = tavor_wrid_wqhdr_find(qp->qp_sq_cqhdl, qp->qp_qpnum,
2188 	    TAVOR_WR_SEND);
2189 	if (swq == NULL) {
2190 		/* Couldn't find matching work queue header, create it */
2191 		create_new_swq = create_wql = 1;
2192 		swq = tavor_wrid_wqhdr_create(state, qp->qp_sq_cqhdl,
2193 		    qp->qp_qpnum, TAVOR_WR_SEND, create_wql);
2194 		if (swq == NULL) {
2195 			/*
2196 			 * If we couldn't find/allocate space for the workq
2197 			 * header, then drop the lock(s) and return failure.
2198 			 */
2199 			tavor_wrid_wqhdr_unlock_both(qp);
2200 			return (ibc_get_ci_failure(0));
2201 		}
2202 	}
2203 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swq))
2204 	qp->qp_sq_wqhdr = swq;
2205 	swq->wq_size = qp->qp_sq_bufsz;
2206 	swq->wq_head = 0;
2207 	swq->wq_tail = 0;
2208 	swq->wq_full = 0;
2209 
2210 	/*
2211 	 * Allocate space for the tavor_wrid_entry_t container
2212 	 */
2213 	s_wridlist = tavor_wrid_get_list(swq->wq_size);
2214 	if (s_wridlist == NULL) {
2215 		/*
2216 		 * If we couldn't allocate space for tracking the WRID
2217 		 * entries, then cleanup the workq header from above (if
2218 		 * necessary, i.e. if we created the workq header).  Then
2219 		 * drop the lock(s) and return failure.
2220 		 */
2221 		if (create_new_swq) {
2222 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2223 		}
2224 
2225 		tavor_wrid_wqhdr_unlock_both(qp);
2226 		return (ibc_get_ci_failure(0));
2227 	}
2228 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*s_wridlist))
2229 	s_wridlist->wl_wqhdr = swq;
2230 
2231 	/* Chain the new WRID list container to the workq hdr list */
2232 	mutex_enter(&swq->wq_wrid_wql->wql_lock);
2233 	tavor_wrid_wqhdr_add(swq, s_wridlist);
2234 	mutex_exit(&swq->wq_wrid_wql->wql_lock);
2235 
2236 	qp_srq_en = qp->qp_srq_en;
2237 
2238 #ifdef __lock_lint
2239 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2240 #else
2241 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2242 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2243 	}
2244 #endif
2245 	/*
2246 	 * Now we repeat all the above operations for the receive work queue,
2247 	 * or shared receive work queue.
2248 	 *
2249 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2250 	 */
2251 	rwq = tavor_wrid_wqhdr_find(qp->qp_rq_cqhdl, qp->qp_qpnum,
2252 	    TAVOR_WR_RECV);
2253 	if (rwq == NULL) {
2254 		create_new_rwq = create_wql = 1;
2255 
2256 		/*
2257 		 * If this QP is associated with an SRQ, and this isn't the
2258 		 * first QP on the SRQ, then the 'srq_wrid_wql' will already be
2259 		 * created.  Since the WQL is created at 'wqhdr_create' time we
2260 		 * pass in the flag 'create_wql' here to be 0 if we have
2261 		 * already created it.  And later on below we then next setup
2262 		 * the WQL and rwq information based off the existing SRQ info.
2263 		 */
2264 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2265 		    qp->qp_srqhdl->srq_wrid_wql != NULL) {
2266 			create_wql = 0;
2267 		}
2268 
2269 		rwq = tavor_wrid_wqhdr_create(state, qp->qp_rq_cqhdl,
2270 		    qp->qp_qpnum, TAVOR_WR_RECV, create_wql);
2271 		if (rwq == NULL) {
2272 			/*
2273 			 * If we couldn't find/allocate space for the workq
2274 			 * header, then free all the send queue resources we
2275 			 * just allocated and setup (above), drop the lock(s)
2276 			 * and return failure.
2277 			 */
2278 			mutex_enter(&swq->wq_wrid_wql->wql_lock);
2279 			tavor_wrid_wqhdr_remove(swq, s_wridlist);
2280 			mutex_exit(&swq->wq_wrid_wql->wql_lock);
2281 			if (create_new_swq) {
2282 				tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl,
2283 				    swq);
2284 			}
2285 
2286 #ifdef __lock_lint
2287 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2288 #else
2289 			if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2290 				mutex_exit(&qp->qp_srqhdl->srq_lock);
2291 			}
2292 #endif
2293 
2294 			tavor_wrid_wqhdr_unlock_both(qp);
2295 			return (ibc_get_ci_failure(0));
2296 		}
2297 	}
2298 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rwq))
2299 
2300 	/*
2301 	 * Setup receive workq hdr
2302 	 *
2303 	 * If the QP is on an SRQ, we setup the SRQ specific fields, setting
2304 	 * keeping a copy of the rwq pointer, setting the rwq bufsize
2305 	 * appropriately, and initializing our part of the WQLock.
2306 	 *
2307 	 * In the normal QP case, the QP recv queue bufsize is used.
2308 	 */
2309 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2310 		rwq->wq_size = qp->qp_srqhdl->srq_wq_bufsz;
2311 		if (qp->qp_srqhdl->srq_wrid_wql == NULL) {
2312 			qp->qp_srqhdl->srq_wrid_wql = rwq->wq_wrid_wql;
2313 		} else {
2314 			rwq->wq_wrid_wql = qp->qp_srqhdl->srq_wrid_wql;
2315 		}
2316 		tavor_wql_refcnt_inc(qp->qp_srqhdl->srq_wrid_wql);
2317 
2318 	} else {
2319 		rwq->wq_size = qp->qp_rq_bufsz;
2320 	}
2321 
2322 	qp->qp_rq_wqhdr = rwq;
2323 	rwq->wq_head = 0;
2324 	rwq->wq_tail = 0;
2325 	rwq->wq_full = 0;
2326 
2327 	/*
2328 	 * Allocate space for the tavor_wrid_entry_t container.
2329 	 *
2330 	 * If QP is on an SRQ, and the wrq_wridlist is NULL then we must
2331 	 * allocate the wridlist normally.  However, if the srq_wridlist is !=
2332 	 * NULL, then we know this SRQ has already been initialized, thus the
2333 	 * wridlist has already been initialized.  So we re-use the
2334 	 * srq_wridlist as the r_wridlist for this QP in this case.
2335 	 */
2336 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2337 	    qp->qp_srqhdl->srq_wridlist != NULL) {
2338 		/* Use existing srq_wridlist pointer */
2339 		r_wridlist = qp->qp_srqhdl->srq_wridlist;
2340 		ASSERT(r_wridlist != NULL);
2341 	} else {
2342 		/* Allocate memory for the r_wridlist */
2343 		r_wridlist = tavor_wrid_get_list(rwq->wq_size);
2344 	}
2345 
2346 	/*
2347 	 * If the memory allocation failed for r_wridlist (or the SRQ pointer
2348 	 * is mistakenly NULL), we cleanup our previous swq allocation from
2349 	 * above
2350 	 */
2351 	if (r_wridlist == NULL) {
2352 		/*
2353 		 * If we couldn't allocate space for tracking the WRID
2354 		 * entries, then cleanup all the stuff from above.  Then
2355 		 * drop the lock(s) and return failure.
2356 		 */
2357 		mutex_enter(&swq->wq_wrid_wql->wql_lock);
2358 		tavor_wrid_wqhdr_remove(swq, s_wridlist);
2359 		mutex_exit(&swq->wq_wrid_wql->wql_lock);
2360 		if (create_new_swq) {
2361 			tavor_cq_wqhdr_remove(qp->qp_sq_cqhdl, swq);
2362 		}
2363 		if (create_new_rwq) {
2364 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, rwq);
2365 		}
2366 
2367 #ifdef __lock_lint
2368 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2369 #else
2370 		if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2371 			mutex_exit(&qp->qp_srqhdl->srq_lock);
2372 		}
2373 #endif
2374 
2375 		tavor_wrid_wqhdr_unlock_both(qp);
2376 		return (ibc_get_ci_failure(0));
2377 	}
2378 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*r_wridlist))
2379 
2380 	/*
2381 	 * Initialize the wridlist
2382 	 *
2383 	 * In the normal QP case, there is no special initialization needed.
2384 	 * We simply setup the wridlist backpointer to be the receive wqhdr
2385 	 * (rwq).
2386 	 *
2387 	 * But in the SRQ case, there is no backpointer to the wqhdr possible.
2388 	 * Instead we set 'wl_srq_en', specifying this wridlist is on an SRQ
2389 	 * and thus potentially shared across multiple QPs with the SRQ.  We
2390 	 * also setup the srq_wridlist pointer to be the r_wridlist, and
2391 	 * intialize the freelist to an invalid index.  This srq_wridlist
2392 	 * pointer is used above on future moves from_reset to let us know that
2393 	 * the srq_wridlist has been initialized already.
2394 	 *
2395 	 * And finally, if we are in a non-UMAP case, we setup the srq wrid
2396 	 * free list.
2397 	 */
2398 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED &&
2399 	    qp->qp_srqhdl->srq_wridlist == NULL) {
2400 		r_wridlist->wl_srq_en = 1;
2401 		r_wridlist->wl_free_list_indx = -1;
2402 		qp->qp_srqhdl->srq_wridlist = r_wridlist;
2403 
2404 		/* Initialize srq wrid free list */
2405 		if (qp->qp_srqhdl->srq_is_umap == 0) {
2406 			mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2407 			tavor_wrid_list_srq_init(r_wridlist, qp->qp_srqhdl, 0);
2408 			mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2409 		}
2410 	} else {
2411 		r_wridlist->wl_wqhdr = rwq;
2412 	}
2413 
2414 	/* Chain the WRID list "container" to the workq hdr list */
2415 	mutex_enter(&rwq->wq_wrid_wql->wql_lock);
2416 	tavor_wrid_wqhdr_add(rwq, r_wridlist);
2417 	mutex_exit(&rwq->wq_wrid_wql->wql_lock);
2418 
2419 #ifdef __lock_lint
2420 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2421 #else
2422 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2423 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2424 	}
2425 #endif
2426 
2427 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*r_wridlist))
2428 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*rwq))
2429 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*s_wridlist))
2430 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*swq))
2431 
2432 	tavor_wrid_wqhdr_unlock_both(qp);
2433 	return (DDI_SUCCESS);
2434 }
2435 
2436 
2437 /*
2438  * tavor_wrid_to_reset_handling()
2439  *    Context: Can be called from interrupt or base context.
2440  */
2441 void
tavor_wrid_to_reset_handling(tavor_state_t * state,tavor_qphdl_t qp)2442 tavor_wrid_to_reset_handling(tavor_state_t *state, tavor_qphdl_t qp)
2443 {
2444 	uint_t		free_wqhdr = 0;
2445 
2446 	/*
2447 	 * For each of this QP's Work Queues, move the WRID "container" to
2448 	 * the "reapable" list.  Although there may still be unpolled
2449 	 * entries in these containers, it is not a big deal.  We will not
2450 	 * reap the list until either the Poll CQ command detects an empty
2451 	 * condition or the CQ itself is freed.  Grab the CQ lock(s) before
2452 	 * manipulating the lists.
2453 	 */
2454 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2455 	tavor_wrid_wqhdr_lock_both(qp);
2456 	tavor_wrid_reaplist_add(qp->qp_sq_cqhdl, qp->qp_sq_wqhdr);
2457 
2458 	/*
2459 	 * Add the receive work queue header on to the reaplist.  But if we are
2460 	 * on SRQ, then don't add anything to the reaplist.  Instead we flush
2461 	 * the SRQ entries on the CQ, remove wridlist from WQHDR, and free the
2462 	 * WQHDR (if needed).  We must hold the WQL for these operations, yet
2463 	 * the call to tavor_cq_wqhdr_remove grabs the WQL internally.  So we
2464 	 * drop WQL before that call.  Then release the CQ WQHDR locks and the
2465 	 * CQ lock and return.
2466 	 */
2467 	if (qp->qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
2468 
2469 		/*
2470 		 * Pull off all (if any) entries for this QP from CQ.  This
2471 		 * only includes entries that have not yet been polled
2472 		 */
2473 		mutex_enter(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2474 		tavor_cq_srq_entries_flush(state, qp);
2475 
2476 		/* Remove wridlist from WQHDR */
2477 		tavor_wrid_wqhdr_remove(qp->qp_rq_wqhdr,
2478 		    qp->qp_rq_wqhdr->wq_wrid_post);
2479 
2480 		/* If wridlist chain is now empty, remove the wqhdr as well */
2481 		if (qp->qp_rq_wqhdr->wq_wrid_post == NULL) {
2482 			free_wqhdr = 1;
2483 		} else {
2484 			free_wqhdr = 0;
2485 		}
2486 
2487 		mutex_exit(&qp->qp_rq_wqhdr->wq_wrid_wql->wql_lock);
2488 
2489 		/* Free the WQHDR */
2490 		if (free_wqhdr) {
2491 			tavor_cq_wqhdr_remove(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2492 		}
2493 	} else {
2494 		tavor_wrid_reaplist_add(qp->qp_rq_cqhdl, qp->qp_rq_wqhdr);
2495 	}
2496 	tavor_wrid_wqhdr_unlock_both(qp);
2497 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2498 }
2499 
2500 
2501 /*
2502  * tavor_wrid_add_entry()
2503  *    Context: Can be called from interrupt or base context.
2504  */
2505 void
tavor_wrid_add_entry(tavor_workq_hdr_t * wq,uint64_t wrid,uint32_t wqeaddrsz,uint_t signaled_dbd)2506 tavor_wrid_add_entry(tavor_workq_hdr_t *wq, uint64_t wrid, uint32_t wqeaddrsz,
2507     uint_t signaled_dbd)
2508 {
2509 	tavor_wrid_entry_t	*wre_tmp;
2510 	uint32_t		head, tail, size;
2511 
2512 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2513 
2514 	/*
2515 	 * Find the entry in the container pointed to by the "tail" index.
2516 	 * Add all of the relevant information to that entry, including WRID,
2517 	 * "wqeaddrsz" parameter, and whether it was signaled/unsignaled
2518 	 * and/or doorbelled.
2519 	 */
2520 	head = wq->wq_wrid_post->wl_head;
2521 	tail = wq->wq_wrid_post->wl_tail;
2522 	size = wq->wq_wrid_post->wl_size;
2523 	wre_tmp = &wq->wq_wrid_post->wl_wre[tail];
2524 	wre_tmp->wr_wrid	  = wrid;
2525 	wre_tmp->wr_wqeaddrsz	  = wqeaddrsz;
2526 	wre_tmp->wr_signaled_dbd  = signaled_dbd;
2527 
2528 	/*
2529 	 * Update the "wrid_old_tail" pointer to point to the entry we just
2530 	 * inserted into the queue.  By tracking this pointer (the pointer to
2531 	 * the most recently inserted entry) it will possible later in the
2532 	 * PostSend() and PostRecv() code paths to find the entry that needs
2533 	 * its "doorbelled" flag set (see comment in tavor_post_recv() and/or
2534 	 * tavor_post_send()).
2535 	 */
2536 	wq->wq_wrid_post->wl_wre_old_tail = wre_tmp;
2537 
2538 	/* Update the tail index */
2539 	tail = ((tail + 1) & (size - 1));
2540 	wq->wq_wrid_post->wl_tail = tail;
2541 
2542 	/*
2543 	 * If the "tail" index has just wrapped over into the "head" index,
2544 	 * then we have filled the container.  We use the "full" flag to
2545 	 * indicate this condition and to distinguish it from the "empty"
2546 	 * condition (where head and tail are also equal).
2547 	 */
2548 	if (head == tail) {
2549 		wq->wq_wrid_post->wl_full = 1;
2550 	}
2551 }
2552 
2553 /*
2554  * tavor_wrid_add_entry_srq()
2555  * Context: Can be called from interrupt or base context
2556  */
2557 void
tavor_wrid_add_entry_srq(tavor_srqhdl_t srq,uint64_t wrid,uint_t signaled_dbd)2558 tavor_wrid_add_entry_srq(tavor_srqhdl_t srq, uint64_t wrid, uint_t signaled_dbd)
2559 {
2560 	tavor_wrid_entry_t	*wre;
2561 	uint64_t		*wl_wqe;
2562 	uint32_t		wqe_index;
2563 
2564 	/*
2565 	 * Find the next available WQE from the SRQ free_list.  Then update the
2566 	 * free_list to point to the next entry
2567 	 */
2568 	wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, srq->srq_wridlist->wl_free_list_indx);
2569 
2570 	wqe_index = srq->srq_wridlist->wl_free_list_indx;
2571 
2572 	/* ASSERT on impossible wqe_index values */
2573 	ASSERT(wqe_index < srq->srq_wq_bufsz);
2574 
2575 	/*
2576 	 * Setup the WRE.
2577 	 *
2578 	 * Given the 'wqe_index' value, we store the WRID at this WRE offset.
2579 	 * And we set the WRE to be signaled_dbd so that on poll CQ we can find
2580 	 * this information and associate the WRID to the WQE found on the CQE.
2581 	 */
2582 	wre = &srq->srq_wridlist->wl_wre[wqe_index];
2583 	wre->wr_wrid = wrid;
2584 	wre->wr_signaled_dbd  = signaled_dbd;
2585 
2586 	/* Update the free list index */
2587 	srq->srq_wridlist->wl_free_list_indx = ddi_get32(
2588 	    srq->srq_wridlist->wl_acchdl, (uint32_t *)wl_wqe);
2589 }
2590 
2591 
2592 /*
2593  * tavor_wrid_get_entry()
2594  *    Context: Can be called from interrupt or base context.
2595  */
2596 uint64_t
tavor_wrid_get_entry(tavor_cqhdl_t cq,tavor_hw_cqe_t * cqe,tavor_wrid_entry_t * wre)2597 tavor_wrid_get_entry(tavor_cqhdl_t cq, tavor_hw_cqe_t *cqe,
2598     tavor_wrid_entry_t *wre)
2599 {
2600 	tavor_workq_hdr_t	*wq;
2601 	tavor_wrid_entry_t	*wre_tmp;
2602 	uint64_t		wrid;
2603 	uint_t			send_or_recv, qpnum, error, opcode;
2604 
2605 	/* Lock the list of work queues associated with this CQ */
2606 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2607 
2608 	/*
2609 	 * Determine whether this CQE is a send or receive completion (and
2610 	 * whether it was a "successful" completion or not)
2611 	 */
2612 	opcode = TAVOR_CQE_OPCODE_GET(cq, cqe);
2613 	if ((opcode == TAVOR_CQE_SEND_ERR_OPCODE) ||
2614 	    (opcode == TAVOR_CQE_RECV_ERR_OPCODE)) {
2615 		error = 1;
2616 		send_or_recv = (opcode == TAVOR_CQE_SEND_ERR_OPCODE) ?
2617 		    TAVOR_COMPLETION_SEND : TAVOR_COMPLETION_RECV;
2618 	} else {
2619 		error = 0;
2620 		send_or_recv = TAVOR_CQE_SENDRECV_GET(cq, cqe);
2621 	}
2622 
2623 	/* Find the work queue for this QP number (send or receive side) */
2624 	qpnum = TAVOR_CQE_QPNUM_GET(cq, cqe);
2625 	wq = tavor_wrid_wqhdr_find(cq, qpnum, send_or_recv);
2626 	ASSERT(wq != NULL);
2627 
2628 	/*
2629 	 * Regardless of whether the completion is the result of a "success"
2630 	 * or a "failure", we lock the list of "containers" and attempt to
2631 	 * search for the the first matching completion (i.e. the first WR
2632 	 * with a matching WQE addr and size).  Once we find it, we pull out
2633 	 * the "wrid" field and return it (see below).  Note: One possible
2634 	 * future enhancement would be to enable this routine to skip over
2635 	 * any "unsignaled" completions to go directly to the next "signaled"
2636 	 * entry on success. XXX
2637 	 */
2638 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
2639 	wre_tmp = tavor_wrid_find_match(wq, cq, cqe);
2640 
2641 	/*
2642 	 * If this is a "successful" completion, then we assert that this
2643 	 * completion must be a "signaled" completion.
2644 	 */
2645 	ASSERT(error || (wre_tmp->wr_signaled_dbd & TAVOR_WRID_ENTRY_SIGNALED));
2646 
2647 	/*
2648 	 * If the completion is a "failed" completion, then we save away the
2649 	 * contents of the entry (into the "wre" field passed in) for use
2650 	 * in later CQE processing. Note: We use the tavor_wrid_get_wqeaddrsz()
2651 	 * function to grab "wqeaddrsz" from the next entry in the container.
2652 	 * This is required for error processing (where updating these fields
2653 	 * properly is necessary to correct handling of the "error" CQE)
2654 	 */
2655 	if (error && (wre != NULL)) {
2656 		*wre = *wre_tmp;
2657 		wre->wr_wqeaddrsz = tavor_wrid_get_wqeaddrsz(wq);
2658 	}
2659 
2660 	/* Pull out the WRID and return it */
2661 	wrid = wre_tmp->wr_wrid;
2662 
2663 	mutex_exit(&wq->wq_wrid_wql->wql_lock);
2664 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2665 
2666 	return (wrid);
2667 }
2668 
2669 
2670 /*
2671  * tavor_wrid_find_match()
2672  *    Context: Can be called from interrupt or base context.
2673  */
2674 static tavor_wrid_entry_t *
tavor_wrid_find_match(tavor_workq_hdr_t * wq,tavor_cqhdl_t cq,tavor_hw_cqe_t * cqe)2675 tavor_wrid_find_match(tavor_workq_hdr_t *wq, tavor_cqhdl_t cq,
2676     tavor_hw_cqe_t *cqe)
2677 {
2678 	tavor_wrid_entry_t	*curr = NULL;
2679 	tavor_wrid_list_hdr_t	*container;
2680 	uint32_t		wqeaddr_size;
2681 	uint32_t		head, tail, size;
2682 	int			found = 0, last_container;
2683 
2684 	ASSERT(MUTEX_HELD(&wq->wq_wrid_wql->wql_lock));
2685 
2686 	/* Pull the "wqeaddrsz" information from the CQE */
2687 	wqeaddr_size = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe);
2688 
2689 	/*
2690 	 * Walk the "containers" list(s), find first WR with a matching WQE
2691 	 * addr.  If the current "container" is not the last one on the list,
2692 	 * i.e. not the current one to which we are posting new WRID entries,
2693 	 * then we do not attempt to update the "q_head", "q_tail", and
2694 	 * "q_full" indicators on the main work queue header.  We do, however,
2695 	 * update the "head" and "full" indicators on the individual containers
2696 	 * as we go.  This is imperative because we need to be able to
2697 	 * determine when the current container has been emptied (so that we
2698 	 * can move on to the next container).
2699 	 */
2700 	container = wq->wq_wrid_poll;
2701 	while (container != NULL) {
2702 		/* Is this the last/only "container" on the list */
2703 		last_container = (container != wq->wq_wrid_post) ? 0 : 1;
2704 
2705 		/*
2706 		 * First check if we are on an SRQ.  If so, we grab the entry
2707 		 * and break out.  Since SRQ wridlist's are never added to
2708 		 * reaplist, they can only be the last container.
2709 		 */
2710 		if (container->wl_srq_en) {
2711 			ASSERT(last_container == 1);
2712 			curr = tavor_wrid_find_match_srq(container, cq, cqe);
2713 			break;
2714 		}
2715 
2716 		/*
2717 		 * Grab the current "head", "tail" and "size" fields before
2718 		 * walking the list in the current container. Note: the "size"
2719 		 * field here must always be a power-of-2.  The "full"
2720 		 * parameter is checked (and updated) here to distinguish the
2721 		 * "queue full" condition from "queue empty".
2722 		 */
2723 		head = container->wl_head;
2724 		tail = container->wl_tail;
2725 		size = container->wl_size;
2726 		while ((head != tail) || (container->wl_full)) {
2727 			container->wl_full = 0;
2728 			curr = &container->wl_wre[head];
2729 			head = ((head + 1) & (size - 1));
2730 
2731 			/*
2732 			 * If the current entry's "wqeaddrsz" matches the one
2733 			 * we're searching for, then this must correspond to
2734 			 * the work request that caused the completion.  Set
2735 			 * the "found" flag and bail out.
2736 			 */
2737 			if (curr->wr_wqeaddrsz == wqeaddr_size) {
2738 				found = 1;
2739 				break;
2740 			}
2741 		}
2742 
2743 		/*
2744 		 * If the current container is empty (having reached here the
2745 		 * "head == tail" condition can only mean that the container
2746 		 * is empty), then NULL out the "wrid_old_tail" field (see
2747 		 * tavor_post_send() and tavor_post_recv() for more details)
2748 		 * and (potentially) remove the current container from future
2749 		 * searches.
2750 		 */
2751 		if (head == tail) {
2752 
2753 			container->wl_wre_old_tail = NULL;
2754 			/*
2755 			 * If this wasn't the last "container" on the chain,
2756 			 * i.e. the one to which new WRID entries will be
2757 			 * added, then remove it from the list.
2758 			 * Note: we don't "lose" the memory pointed to by this
2759 			 * because we should have already put this container
2760 			 * on the "reapable" list (from where it will later be
2761 			 * pulled).
2762 			 */
2763 			if (!last_container) {
2764 				wq->wq_wrid_poll = container->wl_next;
2765 			}
2766 		}
2767 
2768 		/* Update the head index for the container */
2769 		container->wl_head = head;
2770 
2771 		/*
2772 		 * If the entry was found in this container, then continue to
2773 		 * bail out.  Else reset the "curr" pointer and move on to the
2774 		 * next container (if there is one).  Note: the only real
2775 		 * reason for setting "curr = NULL" here is so that the ASSERT
2776 		 * below can catch the case where no matching entry was found
2777 		 * on any of the lists.
2778 		 */
2779 		if (found) {
2780 			break;
2781 		} else {
2782 			curr = NULL;
2783 			container = container->wl_next;
2784 		}
2785 	}
2786 
2787 	/*
2788 	 * Update work queue header's "head" and "full" conditions to match
2789 	 * the last entry on the container list.  (Note: Only if we're pulling
2790 	 * entries from the last work queue portion of the list, i.e. not from
2791 	 * the previous portions that may be the "reapable" list.)
2792 	 */
2793 	if (last_container) {
2794 		wq->wq_head = wq->wq_wrid_post->wl_head;
2795 		wq->wq_full = wq->wq_wrid_post->wl_full;
2796 	}
2797 
2798 	/* Ensure that we've actually found what we were searching for */
2799 	ASSERT(curr != NULL);
2800 
2801 	return (curr);
2802 }
2803 
2804 
2805 /*
2806  * tavor_wrid_find_match_srq()
2807  *    Context: Can be called from interrupt or base context.
2808  */
2809 tavor_wrid_entry_t *
tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t * wl,tavor_cqhdl_t cq,tavor_hw_cqe_t * cqe)2810 tavor_wrid_find_match_srq(tavor_wrid_list_hdr_t *wl, tavor_cqhdl_t cq,
2811     tavor_hw_cqe_t *cqe)
2812 {
2813 	tavor_wrid_entry_t	*wre;
2814 	uint64_t		*wl_wqe;
2815 	uint32_t		wqe_index;
2816 	uint64_t		wqe_addr;
2817 	uint32_t		cqe_wqe_addr;
2818 
2819 	/* Grab the WQE addr out of the CQE */
2820 	cqe_wqe_addr = TAVOR_CQE_WQEADDRSZ_GET(cq, cqe) & 0xFFFFFFC0;
2821 
2822 	/*
2823 	 * Use the WQE addr as the lower 32-bit, we add back on the
2824 	 * 'wl_srq_desc_off' because we have a zero-based queue.  Then the
2825 	 * upper 32-bit of the 'wl_srq_wq_buf' OR'd on gives us the WQE addr in
2826 	 * the SRQ Work Queue itself.  We use this address as the index to find
2827 	 * out which Work Queue Entry this CQE corresponds with.
2828 	 *
2829 	 * We also use this address below to add the WQE back on to the free
2830 	 * list.
2831 	 */
2832 	wqe_addr = ((uintptr_t)wl->wl_srq_wq_buf & 0xFFFFFFFF00000000ull) |
2833 	    (cqe_wqe_addr + wl->wl_srq_desc_off);
2834 
2835 	/*
2836 	 * Given the 'wqe_addr' just calculated and the srq buf address, we
2837 	 * find the 'wqe_index'.  The 'wre' returned below contains the WRID
2838 	 * that we are looking for.  This indexes into the wre_list for this
2839 	 * specific WQE.
2840 	 */
2841 	wqe_index = TAVOR_SRQ_WQE_INDEX(wl->wl_srq_wq_buf, wqe_addr,
2842 	    wl->wl_srq_log_wqesz);
2843 
2844 	/* ASSERT on impossible wqe_index values */
2845 	ASSERT(wqe_index < wl->wl_srq_wq_bufsz);
2846 
2847 	/* Get the pointer to this WQE */
2848 	wl_wqe = (uint64_t *)(uintptr_t)wqe_addr;
2849 
2850 	/* Put this WQE index back on the free list */
2851 	ddi_put32(wl->wl_acchdl, (uint32_t *)wl_wqe, wl->wl_free_list_indx);
2852 	wl->wl_free_list_indx = wqe_index;
2853 
2854 	/* Using the index, return the Work Request ID Entry (wre) */
2855 	wre = &wl->wl_wre[wqe_index];
2856 
2857 	return (wre);
2858 }
2859 
2860 
2861 /*
2862  * tavor_wrid_cq_reap()
2863  *    Context: Can be called from interrupt or base context.
2864  */
2865 void
tavor_wrid_cq_reap(tavor_cqhdl_t cq)2866 tavor_wrid_cq_reap(tavor_cqhdl_t cq)
2867 {
2868 	tavor_workq_hdr_t	*consume_wqhdr;
2869 	tavor_wrid_list_hdr_t	*container, *to_free;
2870 
2871 	ASSERT(MUTEX_HELD(&cq->cq_lock));
2872 
2873 	/* Lock the list of work queues associated with this CQ */
2874 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2875 
2876 	/* Walk the "reapable" list and free up containers */
2877 	container = cq->cq_wrid_reap_head;
2878 	while (container != NULL) {
2879 		to_free	  = container;
2880 		container = container->wl_reap_next;
2881 		/*
2882 		 * If reaping the WRID list containers pulls the last
2883 		 * container from the given work queue header, then we free
2884 		 * the work queue header as well.
2885 		 */
2886 		consume_wqhdr = tavor_wrid_list_reap(to_free);
2887 		if (consume_wqhdr != NULL) {
2888 			tavor_cq_wqhdr_remove(cq, consume_wqhdr);
2889 		}
2890 	}
2891 
2892 	/* Once finished reaping, we reset the CQ's reap list */
2893 	cq->cq_wrid_reap_head = cq->cq_wrid_reap_tail = NULL;
2894 
2895 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2896 }
2897 
2898 
2899 /*
2900  * tavor_wrid_cq_force_reap()
2901  *    Context: Can be called from interrupt or base context.
2902  */
2903 void
tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)2904 tavor_wrid_cq_force_reap(tavor_cqhdl_t cq)
2905 {
2906 	tavor_workq_hdr_t	*curr;
2907 	tavor_wrid_list_hdr_t	*container, *to_free;
2908 	avl_tree_t		*treep;
2909 	void			*cookie = NULL;
2910 
2911 	ASSERT(MUTEX_HELD(&cq->cq_lock));
2912 
2913 	/*
2914 	 * The first step is to walk the "reapable" list and free up those
2915 	 * containers.  This is necessary because the containers on the
2916 	 * reapable list are not otherwise connected to the work queue headers
2917 	 * anymore.
2918 	 */
2919 	tavor_wrid_cq_reap(cq);
2920 
2921 	/* Now lock the list of work queues associated with this CQ */
2922 	mutex_enter(&cq->cq_wrid_wqhdr_lock);
2923 
2924 	/*
2925 	 * Walk the list of work queue headers and free up all the WRID list
2926 	 * containers chained to it.  Note: We don't need to grab the locks
2927 	 * for each of the individual WRID lists here because the only way
2928 	 * things can be added or removed from the list at this point would be
2929 	 * through post a work request to a QP.  But if we've come this far,
2930 	 * then we can be assured that there are no longer any QP associated
2931 	 * with the CQ that we are trying to free.
2932 	 */
2933 #ifdef __lock_lint
2934 	tavor_wrid_wqhdr_compare(NULL, NULL);
2935 #endif
2936 	treep = &cq->cq_wrid_wqhdr_avl_tree;
2937 	while ((curr = avl_destroy_nodes(treep, &cookie)) != NULL) {
2938 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*curr))
2939 		container = curr->wq_wrid_poll;
2940 		while (container != NULL) {
2941 			to_free	  = container;
2942 			container = container->wl_next;
2943 			/*
2944 			 * If reaping the WRID list containers pulls the last
2945 			 * container from the given work queue header, then
2946 			 * we free the work queue header as well.  Note: we
2947 			 * ignore the return value because we know that the
2948 			 * work queue header should always be freed once the
2949 			 * list of containers has come to an end.
2950 			 */
2951 			(void) tavor_wrid_list_reap(to_free);
2952 			if (container == NULL) {
2953 				tavor_cq_wqhdr_remove(cq, curr);
2954 			}
2955 		}
2956 	}
2957 	avl_destroy(treep);
2958 
2959 	mutex_exit(&cq->cq_wrid_wqhdr_lock);
2960 }
2961 
2962 
2963 /*
2964  * tavor_wrid_get_list()
2965  *    Context: Can be called from interrupt or base context.
2966  */
2967 tavor_wrid_list_hdr_t *
tavor_wrid_get_list(uint32_t qsize)2968 tavor_wrid_get_list(uint32_t qsize)
2969 {
2970 	tavor_wrid_list_hdr_t	*wridlist;
2971 	uint32_t		size;
2972 
2973 	/*
2974 	 * The WRID list "container" consists of the tavor_wrid_list_hdr_t,
2975 	 * which holds the pointers necessary for maintaining the "reapable"
2976 	 * list, chaining together multiple "containers" old and new, and
2977 	 * tracking the head, tail, size, etc. for each container.
2978 	 *
2979 	 * The "container" also holds all the tavor_wrid_entry_t's, which is
2980 	 * allocated separately, one for each entry on the corresponding work
2981 	 * queue.
2982 	 */
2983 	size = sizeof (tavor_wrid_list_hdr_t);
2984 
2985 	/*
2986 	 * Note that this allocation has to be a NOSLEEP operation here
2987 	 * because we are holding the "wqhdr_list_lock" and, therefore,
2988 	 * could get raised to the interrupt level.
2989 	 */
2990 	wridlist = (tavor_wrid_list_hdr_t *)kmem_zalloc(size, KM_NOSLEEP);
2991 	if (wridlist == NULL) {
2992 		return (NULL);
2993 	}
2994 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wridlist))
2995 
2996 	/* Complete the "container" initialization */
2997 	wridlist->wl_size = qsize;
2998 	wridlist->wl_full = 0;
2999 	wridlist->wl_head = 0;
3000 	wridlist->wl_tail = 0;
3001 	wridlist->wl_wre = (tavor_wrid_entry_t *)kmem_zalloc(qsize *
3002 	    sizeof (tavor_wrid_entry_t), KM_NOSLEEP);
3003 	if (wridlist->wl_wre == NULL) {
3004 		kmem_free(wridlist, size);
3005 		return (NULL);
3006 	}
3007 	wridlist->wl_wre_old_tail  = NULL;
3008 	wridlist->wl_reap_next = NULL;
3009 	wridlist->wl_next  = NULL;
3010 	wridlist->wl_prev  = NULL;
3011 	wridlist->wl_srq_en = 0;
3012 
3013 	return (wridlist);
3014 }
3015 
3016 /*
3017  * tavor_wrid_list_srq_init()
3018  * Context: Can be called from interrupt or base context
3019  */
3020 void
tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t * wridlist,tavor_srqhdl_t srq,uint_t wq_start)3021 tavor_wrid_list_srq_init(tavor_wrid_list_hdr_t *wridlist, tavor_srqhdl_t srq,
3022     uint_t wq_start)
3023 {
3024 	uint64_t *wl_wqe;
3025 	int wqe_index;
3026 
3027 	ASSERT(MUTEX_HELD(&srq->srq_wrid_wql->wql_lock));
3028 
3029 	/* Setup pointers for use later when we are polling the CQ */
3030 	wridlist->wl_srq_wq_buf = srq->srq_wq_buf;
3031 	wridlist->wl_srq_wq_bufsz = srq->srq_wq_bufsz;
3032 	wridlist->wl_srq_log_wqesz = srq->srq_wq_log_wqesz;
3033 	wridlist->wl_srq_desc_off = srq->srq_desc_off;
3034 	wridlist->wl_acchdl = srq->srq_wqinfo.qa_acchdl;
3035 
3036 	/* Given wq_start to start initializing buf at, verify sanity */
3037 	ASSERT(wq_start >= 0 && wq_start < srq->srq_wq_bufsz);
3038 
3039 	/*
3040 	 * Initialize wridlist free list
3041 	 *
3042 	 * For each WQ up to the size of our queue, we store an index in the WQ
3043 	 * memory itself, representing the next available free entry.  The
3044 	 * 'wl_free_list_indx' always holds the index of the next available
3045 	 * free entry in the WQ.  If 'wl_free_list_indx' is -1, then we are
3046 	 * completely full.  This gives us the advantage of being able to have
3047 	 * entries complete or be polled off the WQ out-of-order.
3048 	 *
3049 	 * For now, we write the free_list entries inside the WQ itself.  It
3050 	 * may be useful in the future to store this information in a separate
3051 	 * structure for debugging purposes.
3052 	 */
3053 	for (wqe_index = wq_start; wqe_index < srq->srq_wq_bufsz; wqe_index++) {
3054 		wl_wqe = TAVOR_SRQ_WQE_ADDR(srq, wqe_index);
3055 		ddi_put32(wridlist->wl_acchdl, (uint32_t *)wl_wqe,
3056 		    wridlist->wl_free_list_indx);
3057 		wridlist->wl_free_list_indx = wqe_index;
3058 	}
3059 }
3060 
3061 
3062 /*
3063  * tavor_wrid_reaplist_add()
3064  *    Context: Can be called from interrupt or base context.
3065  */
3066 static void
tavor_wrid_reaplist_add(tavor_cqhdl_t cq,tavor_workq_hdr_t * wq)3067 tavor_wrid_reaplist_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wq)
3068 {
3069 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3070 
3071 	mutex_enter(&wq->wq_wrid_wql->wql_lock);
3072 
3073 	/*
3074 	 * Add the "post" container (the last one on the current chain) to
3075 	 * the CQ's "reapable" list
3076 	 */
3077 	if ((cq->cq_wrid_reap_head == NULL) &&
3078 	    (cq->cq_wrid_reap_tail == NULL)) {
3079 		cq->cq_wrid_reap_head = wq->wq_wrid_post;
3080 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3081 	} else {
3082 		cq->cq_wrid_reap_tail->wl_reap_next = wq->wq_wrid_post;
3083 		cq->cq_wrid_reap_tail = wq->wq_wrid_post;
3084 	}
3085 
3086 	mutex_exit(&wq->wq_wrid_wql->wql_lock);
3087 }
3088 
3089 
3090 int
tavor_wrid_wqhdr_compare(const void * p1,const void * p2)3091 tavor_wrid_wqhdr_compare(const void *p1, const void *p2)
3092 {
3093 	tavor_workq_compare_t	*cmpp;
3094 	tavor_workq_hdr_t	*curr;
3095 
3096 	cmpp = (tavor_workq_compare_t *)p1;
3097 	curr = (tavor_workq_hdr_t *)p2;
3098 
3099 	if (cmpp->cmp_qpn < curr->wq_qpn)
3100 		return (-1);
3101 	else if (cmpp->cmp_qpn > curr->wq_qpn)
3102 		return (+1);
3103 	else if (cmpp->cmp_type < curr->wq_type)
3104 		return (-1);
3105 	else if (cmpp->cmp_type > curr->wq_type)
3106 		return (+1);
3107 	else
3108 		return (0);
3109 }
3110 
3111 
3112 /*
3113  * tavor_wrid_wqhdr_find()
3114  *    Context: Can be called from interrupt or base context.
3115  */
3116 static tavor_workq_hdr_t *
tavor_wrid_wqhdr_find(tavor_cqhdl_t cq,uint_t qpn,uint_t wq_type)3117 tavor_wrid_wqhdr_find(tavor_cqhdl_t cq, uint_t qpn, uint_t wq_type)
3118 {
3119 	tavor_workq_hdr_t	*curr;
3120 	tavor_workq_compare_t	cmp;
3121 
3122 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3123 
3124 	/*
3125 	 * Walk the CQ's work queue list, trying to find a send or recv queue
3126 	 * with the same QP number.  We do this even if we are going to later
3127 	 * create a new entry because it helps us easily find the end of the
3128 	 * list.
3129 	 */
3130 	cmp.cmp_qpn = qpn;
3131 	cmp.cmp_type = wq_type;
3132 #ifdef __lock_lint
3133 	tavor_wrid_wqhdr_compare(NULL, NULL);
3134 #endif
3135 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
3136 
3137 	return (curr);
3138 }
3139 
3140 
3141 /*
3142  * tavor_wrid_wqhdr_create()
3143  *    Context: Can be called from interrupt or base context.
3144  */
3145 static tavor_workq_hdr_t *
tavor_wrid_wqhdr_create(tavor_state_t * state,tavor_cqhdl_t cq,uint_t qpn,uint_t wq_type,uint_t create_wql)3146 tavor_wrid_wqhdr_create(tavor_state_t *state, tavor_cqhdl_t cq, uint_t qpn,
3147     uint_t wq_type, uint_t create_wql)
3148 {
3149 	tavor_workq_hdr_t	*wqhdr_tmp;
3150 
3151 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3152 
3153 	/*
3154 	 * Allocate space a work queue header structure and initialize it.
3155 	 * Each work queue header structure includes a "wq_wrid_wql"
3156 	 * which needs to be initialized.  Note that this allocation has to be
3157 	 * a NOSLEEP operation because we are holding the "cq_wrid_wqhdr_lock"
3158 	 * and, therefore, could get raised to the interrupt level.
3159 	 */
3160 	wqhdr_tmp = (tavor_workq_hdr_t *)kmem_zalloc(
3161 	    sizeof (tavor_workq_hdr_t), KM_NOSLEEP);
3162 	if (wqhdr_tmp == NULL) {
3163 		return (NULL);
3164 	}
3165 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr_tmp))
3166 	wqhdr_tmp->wq_qpn	= qpn;
3167 	wqhdr_tmp->wq_type	= wq_type;
3168 
3169 	if (create_wql) {
3170 		wqhdr_tmp->wq_wrid_wql = tavor_wrid_wql_create(state);
3171 		if (wqhdr_tmp->wq_wrid_wql == NULL) {
3172 			kmem_free(wqhdr_tmp, sizeof (tavor_workq_hdr_t));
3173 			return (NULL);
3174 		}
3175 	}
3176 
3177 	wqhdr_tmp->wq_wrid_poll = NULL;
3178 	wqhdr_tmp->wq_wrid_post = NULL;
3179 
3180 	/* Chain the newly allocated work queue header to the CQ's list */
3181 	tavor_cq_wqhdr_add(cq, wqhdr_tmp);
3182 
3183 	return (wqhdr_tmp);
3184 }
3185 
3186 
3187 /*
3188  * tavor_wrid_wql_create()
3189  *    Context: Can be called from interrupt or base context.
3190  */
3191 tavor_wq_lock_t *
tavor_wrid_wql_create(tavor_state_t * state)3192 tavor_wrid_wql_create(tavor_state_t *state)
3193 {
3194 	tavor_wq_lock_t *wql;
3195 
3196 	/*
3197 	 * Allocate the WQL and initialize it.
3198 	 */
3199 	wql = kmem_zalloc(sizeof (tavor_wq_lock_t), KM_NOSLEEP);
3200 	if (wql == NULL) {
3201 		return (NULL);
3202 	}
3203 
3204 	mutex_init(&wql->wql_lock, NULL, MUTEX_DRIVER,
3205 	    DDI_INTR_PRI(state->ts_intrmsi_pri));
3206 
3207 	/* Add refcount to WQL */
3208 	tavor_wql_refcnt_inc(wql);
3209 
3210 	return (wql);
3211 }
3212 
3213 
3214 /*
3215  * tavor_wrid_get_wqeaddrsz()
3216  *    Context: Can be called from interrupt or base context.
3217  */
3218 static uint32_t
tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t * wq)3219 tavor_wrid_get_wqeaddrsz(tavor_workq_hdr_t *wq)
3220 {
3221 	tavor_wrid_entry_t	*wre;
3222 	uint32_t		wqeaddrsz;
3223 	uint32_t		head;
3224 
3225 	/*
3226 	 * If the container is empty, then there is no next entry. So just
3227 	 * return zero.  Note: the "head == tail" condition here can only
3228 	 * mean that the container is empty because we have previously pulled
3229 	 * something from the container.
3230 	 *
3231 	 * If the container is not empty, then find the next entry and return
3232 	 * the contents of its "wqeaddrsz" field.
3233 	 */
3234 	if (wq->wq_wrid_poll->wl_head == wq->wq_wrid_poll->wl_tail) {
3235 		wqeaddrsz = 0;
3236 	} else {
3237 		/*
3238 		 * We don't need to calculate the "next" head pointer here
3239 		 * because "head" should already point to the next entry on
3240 		 * the list (since we just pulled something off - in
3241 		 * tavor_wrid_find_match() - and moved the head index forward.)
3242 		 */
3243 		head = wq->wq_wrid_poll->wl_head;
3244 		wre = &wq->wq_wrid_poll->wl_wre[head];
3245 		wqeaddrsz = wre->wr_wqeaddrsz;
3246 	}
3247 	return (wqeaddrsz);
3248 }
3249 
3250 
3251 /*
3252  * tavor_wrid_wqhdr_add()
3253  *    Context: Can be called from interrupt or base context.
3254  */
3255 static void
tavor_wrid_wqhdr_add(tavor_workq_hdr_t * wqhdr,tavor_wrid_list_hdr_t * wridlist)3256 tavor_wrid_wqhdr_add(tavor_workq_hdr_t *wqhdr,
3257     tavor_wrid_list_hdr_t *wridlist)
3258 {
3259 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3260 
3261 	/* Chain the new WRID list "container" to the work queue list */
3262 	if ((wqhdr->wq_wrid_post == NULL) &&
3263 	    (wqhdr->wq_wrid_poll == NULL)) {
3264 		wqhdr->wq_wrid_poll = wridlist;
3265 		wqhdr->wq_wrid_post = wridlist;
3266 	} else {
3267 		wqhdr->wq_wrid_post->wl_next = wridlist;
3268 		wridlist->wl_prev = wqhdr->wq_wrid_post;
3269 		wqhdr->wq_wrid_post = wridlist;
3270 	}
3271 }
3272 
3273 
3274 /*
3275  * tavor_wrid_wqhdr_remove()
3276  *    Context: Can be called from interrupt or base context.
3277  *
3278  *    Note: this is only called to remove the most recently added WRID list
3279  *    container (i.e. in tavor_from_reset() above)
3280  */
3281 static void
tavor_wrid_wqhdr_remove(tavor_workq_hdr_t * wqhdr,tavor_wrid_list_hdr_t * wridlist)3282 tavor_wrid_wqhdr_remove(tavor_workq_hdr_t *wqhdr,
3283     tavor_wrid_list_hdr_t *wridlist)
3284 {
3285 	tavor_wrid_list_hdr_t	*prev, *next;
3286 
3287 	ASSERT(MUTEX_HELD(&wqhdr->wq_wrid_wql->wql_lock));
3288 
3289 	/* Unlink the WRID list "container" from the work queue list */
3290 	prev = wridlist->wl_prev;
3291 	next = wridlist->wl_next;
3292 	if (prev != NULL) {
3293 		prev->wl_next = next;
3294 	}
3295 	if (next != NULL) {
3296 		next->wl_prev = prev;
3297 	}
3298 
3299 	/*
3300 	 * Update any pointers in the work queue hdr that may point to this
3301 	 * WRID list container
3302 	 */
3303 	if (wqhdr->wq_wrid_post == wridlist) {
3304 		wqhdr->wq_wrid_post = prev;
3305 	}
3306 	if (wqhdr->wq_wrid_poll == wridlist) {
3307 		wqhdr->wq_wrid_poll = NULL;
3308 	}
3309 }
3310 
3311 
3312 /*
3313  * tavor_wrid_list_reap()
3314  *    Context: Can be called from interrupt or base context.
3315  *    Note: The "wqhdr_list_lock" must be held.
3316  */
3317 static tavor_workq_hdr_t *
tavor_wrid_list_reap(tavor_wrid_list_hdr_t * wridlist)3318 tavor_wrid_list_reap(tavor_wrid_list_hdr_t *wridlist)
3319 {
3320 	tavor_workq_hdr_t	*wqhdr, *consume_wqhdr = NULL;
3321 	tavor_wrid_list_hdr_t	*prev, *next;
3322 	uint32_t		size;
3323 
3324 	/* Get the back pointer to the work queue header (see below) */
3325 	wqhdr = wridlist->wl_wqhdr;
3326 	mutex_enter(&wqhdr->wq_wrid_wql->wql_lock);
3327 
3328 	/* Unlink the WRID list "container" from the work queue list */
3329 	prev = wridlist->wl_prev;
3330 	next = wridlist->wl_next;
3331 	if (prev != NULL) {
3332 		prev->wl_next = next;
3333 	}
3334 	if (next != NULL) {
3335 		next->wl_prev = prev;
3336 	}
3337 
3338 	/*
3339 	 * If the back pointer to the work queue header shows that it
3340 	 * was pointing to the entry we are about to remove, then the work
3341 	 * queue header is reapable as well.
3342 	 */
3343 	if ((wqhdr->wq_wrid_poll == wridlist) &&
3344 	    (wqhdr->wq_wrid_post == wridlist)) {
3345 		consume_wqhdr = wqhdr;
3346 	}
3347 
3348 	/* Be sure to update the "poll" and "post" container pointers */
3349 	if (wqhdr->wq_wrid_poll == wridlist) {
3350 		wqhdr->wq_wrid_poll = next;
3351 	}
3352 	if (wqhdr->wq_wrid_post == wridlist) {
3353 		wqhdr->wq_wrid_post = NULL;
3354 	}
3355 
3356 	/* Calculate the size and free the container */
3357 	size = (wridlist->wl_size * sizeof (tavor_wrid_entry_t));
3358 	kmem_free(wridlist->wl_wre, size);
3359 	kmem_free(wridlist, sizeof (tavor_wrid_list_hdr_t));
3360 
3361 	mutex_exit(&wqhdr->wq_wrid_wql->wql_lock);
3362 
3363 	return (consume_wqhdr);
3364 }
3365 
3366 
3367 /*
3368  * tavor_wrid_wqhdr_lock_both()
3369  *    Context: Can be called from interrupt or base context.
3370  */
3371 static void
tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)3372 tavor_wrid_wqhdr_lock_both(tavor_qphdl_t qp)
3373 {
3374 	tavor_cqhdl_t	sq_cq, rq_cq;
3375 
3376 	sq_cq = qp->qp_sq_cqhdl;
3377 	rq_cq = qp->qp_rq_cqhdl;
3378 
3379 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3380 _NOTE(MUTEX_ACQUIRED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3381 
3382 	/*
3383 	 * If both work queues (send and recv) share a completion queue, then
3384 	 * grab the common lock.  If they use different CQs (hence different
3385 	 * "cq_wrid_wqhdr_list" locks), then grab the send one first, then the
3386 	 * receive.  We do this consistently and correctly in
3387 	 * tavor_wrid_wqhdr_unlock_both() below to avoid introducing any kind
3388 	 * of dead lock condition.  Note:  We add the "__lock_lint" code here
3389 	 * to fake out warlock into thinking we've grabbed both locks (when,
3390 	 * in fact, we only needed the one).
3391 	 */
3392 	if (sq_cq == rq_cq) {
3393 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3394 #ifdef	__lock_lint
3395 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3396 #endif
3397 	} else {
3398 		mutex_enter(&sq_cq->cq_wrid_wqhdr_lock);
3399 		mutex_enter(&rq_cq->cq_wrid_wqhdr_lock);
3400 	}
3401 }
3402 
3403 /*
3404  * tavor_wrid_wqhdr_unlock_both()
3405  *    Context: Can be called from interrupt or base context.
3406  */
3407 static void
tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)3408 tavor_wrid_wqhdr_unlock_both(tavor_qphdl_t qp)
3409 {
3410 	tavor_cqhdl_t	sq_cq, rq_cq;
3411 
3412 	sq_cq = qp->qp_sq_cqhdl;
3413 	rq_cq = qp->qp_rq_cqhdl;
3414 
3415 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&rq_cq->cq_wrid_wqhdr_lock))
3416 _NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&sq_cq->cq_wrid_wqhdr_lock))
3417 
3418 	/*
3419 	 * See tavor_wrid_wqhdr_lock_both() above for more detail
3420 	 */
3421 	if (sq_cq == rq_cq) {
3422 #ifdef	__lock_lint
3423 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3424 #endif
3425 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3426 	} else {
3427 		mutex_exit(&rq_cq->cq_wrid_wqhdr_lock);
3428 		mutex_exit(&sq_cq->cq_wrid_wqhdr_lock);
3429 	}
3430 }
3431 
3432 
3433 /*
3434  * tavor_cq_wqhdr_add()
3435  *    Context: Can be called from interrupt or base context.
3436  */
3437 static void
tavor_cq_wqhdr_add(tavor_cqhdl_t cq,tavor_workq_hdr_t * wqhdr)3438 tavor_cq_wqhdr_add(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3439 {
3440 	tavor_workq_compare_t	cmp;
3441 	avl_index_t		where;
3442 
3443 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3444 
3445 	cmp.cmp_qpn = wqhdr->wq_qpn;
3446 	cmp.cmp_type = wqhdr->wq_type;
3447 #ifdef __lock_lint
3448 	tavor_wrid_wqhdr_compare(NULL, NULL);
3449 #endif
3450 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
3451 	/*
3452 	 * If the CQ's work queue list is empty, then just add it.
3453 	 * Otherwise, chain it to the beginning of the list.
3454 	 */
3455 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqhdr, where);
3456 }
3457 
3458 
3459 /*
3460  * tavor_cq_wqhdr_remove()
3461  *    Context: Can be called from interrupt or base context.
3462  */
3463 static void
tavor_cq_wqhdr_remove(tavor_cqhdl_t cq,tavor_workq_hdr_t * wqhdr)3464 tavor_cq_wqhdr_remove(tavor_cqhdl_t cq, tavor_workq_hdr_t *wqhdr)
3465 {
3466 	ASSERT(MUTEX_HELD(&cq->cq_wrid_wqhdr_lock));
3467 
3468 #ifdef __lock_lint
3469 	tavor_wrid_wqhdr_compare(NULL, NULL);
3470 #endif
3471 	/* Remove "wqhdr" from the work queue header list on "cq" */
3472 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqhdr);
3473 
3474 	/*
3475 	 * Release reference to WQL; If this is the last reference, this call
3476 	 * also has the side effect of freeing up the 'wq_wrid_wql' memory.
3477 	 */
3478 	tavor_wql_refcnt_dec(wqhdr->wq_wrid_wql);
3479 
3480 	/* Free the memory associated with "wqhdr" */
3481 	kmem_free(wqhdr, sizeof (tavor_workq_hdr_t));
3482 }
3483 
3484 
3485 /*
3486  * tavor_wql_refcnt_inc()
3487  * Context: Can be called from interrupt or base context
3488  */
3489 void
tavor_wql_refcnt_inc(tavor_wq_lock_t * wql)3490 tavor_wql_refcnt_inc(tavor_wq_lock_t *wql)
3491 {
3492 	ASSERT(wql != NULL);
3493 
3494 	mutex_enter(&wql->wql_lock);
3495 	wql->wql_refcnt++;
3496 	mutex_exit(&wql->wql_lock);
3497 }
3498 
3499 /*
3500  * tavor_wql_refcnt_dec()
3501  * Context: Can be called from interrupt or base context
3502  */
3503 void
tavor_wql_refcnt_dec(tavor_wq_lock_t * wql)3504 tavor_wql_refcnt_dec(tavor_wq_lock_t *wql)
3505 {
3506 	int	refcnt;
3507 
3508 	ASSERT(wql != NULL);
3509 
3510 	mutex_enter(&wql->wql_lock);
3511 	wql->wql_refcnt--;
3512 	refcnt = wql->wql_refcnt;
3513 	mutex_exit(&wql->wql_lock);
3514 
3515 	/*
3516 	 *
3517 	 * Free up WQL memory if we're the last one associated with this
3518 	 * structure.
3519 	 */
3520 	if (refcnt == 0) {
3521 		mutex_destroy(&wql->wql_lock);
3522 		kmem_free(wql, sizeof (tavor_wq_lock_t));
3523 	}
3524 }
3525