xref: /titanic_52/usr/src/uts/common/io/ib/adapters/tavor/tavor_qp.c (revision de710d24d2fae4468e64da999e1d952a247f142c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * tavor_qp.c
29  *    Tavor Queue Pair Processing Routines
30  *
31  *    Implements all the routines necessary for allocating, freeing, and
32  *    querying the Tavor queue pairs.
33  */
34 
35 #include <sys/types.h>
36 #include <sys/conf.h>
37 #include <sys/ddi.h>
38 #include <sys/sunddi.h>
39 #include <sys/modctl.h>
40 #include <sys/bitmap.h>
41 #include <sys/sysmacros.h>
42 
43 #include <sys/ib/adapters/tavor/tavor.h>
44 #include <sys/ib/ib_pkt_hdrs.h>
45 
/*
 * Forward declarations for the static (file-local) helper routines used by
 * the QP allocation code in this file.  Of these, tavor_qp_alloc() itself
 * calls tavor_qp_create_qpn() and tavor_qp_sgl_to_logwqesz().
 */
46 static int tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp,
47     tavor_rsrc_t *qpc);
48 static int tavor_qpn_avl_compare(const void *q, const void *e);
49 static int tavor_special_qp_rsrc_alloc(tavor_state_t *state,
50     ibt_sqp_type_t type, uint_t port, tavor_rsrc_t **qp_rsrc);
51 static int tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
52     uint_t port);
53 static void tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
54     tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
55 
56 /*
57  * tavor_qp_alloc()
58  *    Context: Can be called only from user or kernel context.
    *
    *    Allocates and initializes a normal (non-special) queue pair of
    *    service type UD, RC or UC.  The routine takes references on the PD
    *    and on both CQs, allocates the QP context, the software QP handle,
    *    the QP number, (for RC only) the RDB entries and the work queue
    *    memory, registers that memory, and finally stores the new handle in
    *    state->ts_qphdl[].  The QP is left in the TAVOR_QP_RESET state.
    *
    *    Arguments:
    *      state     - Tavor driver state (tavor_state_t).
    *      qpinfo    - In:  qpi_attrp (allocation attributes), qpi_type and
    *                  qpi_ibt_qphdl.
    *                  Out: qpi_qphdl (written on success), plus *qpi_queueszp
    *                  and *qpi_qpn when those pointers are non-NULL.
    *      sleepflag - Passed through to the resource and queue allocators;
    *                  also selects IBT_MR_SLEEP vs. IBT_MR_NOSLEEP for the
    *                  work queue memory registration.
    *      op        - Optional.  When NULL the work queue location defaults
    *                  to TAVOR_QUEUE_LOCATION_NORMAL.
    *
    *    Returns DDI_SUCCESS on success.  Otherwise returns the "status"
    *    value set at the point of failure (see the TAVOR_TNF_FAIL() call
    *    sites below) after undoing everything acquired up to that point.
    *
    *    Note: attr_p->qp_sizes.cs_sq and cs_rq are raised in place to at
    *    least TAVOR_QP_MIN_SIZE, i.e. the caller's attribute structure is
    *    modified.
59  */
60 int
61 tavor_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
62     uint_t sleepflag, tavor_qp_options_t *op)
63 {
64 	tavor_rsrc_pool_info_t	*rsrc_pool;
65 	tavor_rsrc_t		*qpc, *rsrc, *rdb;
66 	tavor_umap_db_entry_t	*umapdb;
67 	tavor_qphdl_t		qp;
68 	ibt_qp_alloc_attr_t	*attr_p;
69 	ibt_qp_type_t		type;
70 	ibtl_qp_hdl_t		ibt_qphdl;
71 	ibt_chan_sizes_t	*queuesz_p;
72 	ib_qpn_t		*qpn;
73 	tavor_qphdl_t		*qphdl;
74 	ibt_mr_attr_t		mr_attr;
75 	tavor_mr_options_t	mr_op;
76 	tavor_srqhdl_t		srq;
77 	tavor_pdhdl_t		pd;
78 	tavor_cqhdl_t		sq_cq, rq_cq;
79 	tavor_mrhdl_t		mr;
80 	uint64_t		value, qp_desc_off;
81 	uint32_t		*sq_buf, *rq_buf;
82 	uint32_t		log_qp_sq_size, log_qp_rq_size;
83 	uint32_t		sq_size, rq_size;
84 	uint32_t		sq_wqe_size, rq_wqe_size;
85 	uint32_t		max_rdb, max_sgl, uarpg;
86 	uint_t			wq_location, dma_xfer_mode, qp_is_umap;
87 	uint_t			qp_srq_en;
88 	int			status, flag;
89 	char			*errormsg;
90 
91 	TAVOR_TNF_ENTER(tavor_qp_alloc);
92 
93 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p, *queuesz_p))
94 
95 	/*
96 	 * Check the "options" flag.  Currently this flag tells the driver
97 	 * whether or not the QP's work queues should come from normal
98 	 * system memory or whether they should be allocated from DDR memory.
99 	 */
100 	if (op == NULL) {
101 		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
102 	} else {
103 		wq_location = op->qpo_wq_loc;
104 	}
105 
106 	/*
107 	 * Extract the necessary info from the tavor_qp_info_t structure
108 	 */
109 	attr_p	  = qpinfo->qpi_attrp;
110 	type	  = qpinfo->qpi_type;
111 	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
112 	queuesz_p = qpinfo->qpi_queueszp;
113 	qpn	  = qpinfo->qpi_qpn;
114 	qphdl	  = &qpinfo->qpi_qphdl;
115 
116 	/*
117 	 * Determine whether QP is being allocated for userland access or
118 	 * whether it is being allocated for kernel access.  If the QP is
119 	 * being allocated for userland access, then lookup the UAR doorbell
120 	 * page number for the current process.  Note:  If this is not found
121 	 * (e.g. if the process has not previously open()'d the Tavor driver),
122 	 * then an error is returned.
123 	 */
124 	qp_is_umap = (attr_p->qp_alloc_flags & IBT_QP_USER_MAP) ? 1 : 0;
125 	if (qp_is_umap) {
126 		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
127 		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
128 		if (status != DDI_SUCCESS) {
129 			/* Set "status" and "errormsg" and goto failure */
130 			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
131 			goto qpalloc_fail;
132 		}
		/*
		 * The database "value" is treated as a tavor_rsrc_t pointer;
		 * its index is used as the UAR page number.
		 */
133 		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
134 	}
135 
136 	/*
137 	 * Determine whether QP is being associated with an SRQ
138 	 */
139 	qp_srq_en = (attr_p->qp_alloc_flags & IBT_QP_USES_SRQ) ? 1 : 0;
140 	if (qp_srq_en) {
141 		/*
142 		 * Check for valid SRQ handle pointers
143 		 */
144 		if (attr_p->qp_ibc_srq_hdl == NULL) {
145 			/* Set "status" and "errormsg" and goto failure */
146 			TAVOR_TNF_FAIL(IBT_SRQ_HDL_INVALID,
147 			    "invalid SRQ handle");
148 			goto qpalloc_fail;
149 		}
150 		srq = (tavor_srqhdl_t)attr_p->qp_ibc_srq_hdl;
151 	}
152 
153 	/*
154 	 * Check for valid QP service type (only UD/RC/UC supported)
155 	 */
156 	if (((type != IBT_UD_RQP) && (type != IBT_RC_RQP) &&
157 	    (type != IBT_UC_RQP))) {
158 		/* Set "status" and "errormsg" and goto failure */
159 		TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid serv type");
160 		goto qpalloc_fail;
161 	}
162 
163 	/*
164 	 * Only RC is supported on an SRQ -- This is a Tavor hardware
165 	 * limitation.  Arbel native mode will not have this shortcoming.
166 	 */
167 	if (qp_srq_en && type != IBT_RC_RQP) {
168 		/* Set "status" and "errormsg" and goto failure */
169 		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid serv type with SRQ");
170 		goto qpalloc_fail;
171 	}
172 
173 	/*
174 	 * Check for valid PD handle pointer
175 	 */
176 	if (attr_p->qp_pd_hdl == NULL) {
177 		/* Set "status" and "errormsg" and goto failure */
178 		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
179 		goto qpalloc_fail;
180 	}
181 	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;
182 
183 	/*
184 	 * If on an SRQ, check to make sure the PD is the same
185 	 */
186 	if (qp_srq_en && (pd->pd_pdnum != srq->srq_pdhdl->pd_pdnum)) {
187 		/* Set "status" and "errormsg" and goto failure */
188 		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
189 		goto qpalloc_fail;
190 	}
191 
192 	/* Increment the reference count on the protection domain (PD) */
193 	tavor_pd_refcnt_inc(pd);
194 
195 	/*
196 	 * Check for valid CQ handle pointers
197 	 */
198 	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
199 	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
200 		/* Set "status" and "errormsg" and goto failure */
201 		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
202 		goto qpalloc_fail1;
203 	}
204 	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
205 	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;
206 
207 	/*
208 	 * Increment the reference count on the CQs.  One or both of these
209 	 * could return error if we determine that the given CQ is already
210 	 * being used with a special (SMI/GSI) QP.
211 	 */
212 	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_NORMAL);
213 	if (status != DDI_SUCCESS) {
214 		/* Set "status" and "errormsg" and goto failure */
215 		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
216 		goto qpalloc_fail1;
217 	}
218 	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_NORMAL);
219 	if (status != DDI_SUCCESS) {
220 		/* Set "status" and "errormsg" and goto failure */
221 		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
222 		goto qpalloc_fail2;
223 	}
224 
225 	/*
226 	 * Allocate a QP context entry.  This will be filled in with all
227 	 * the necessary parameters to define the Queue Pair.  Unlike
228 	 * other Tavor hardware resources, ownership is not immediately
229 	 * given to hardware in the final step here.  Instead, we must
230 	 * wait until the QP is later transitioned to the "Init" state before
231 	 * passing the QP to hardware.  If we fail here, we must undo all
232 	 * the reference counts (CQ and PD).
233 	 */
234 	status = tavor_rsrc_alloc(state, TAVOR_QPC, 1, sleepflag, &qpc);
235 	if (status != DDI_SUCCESS) {
236 		/* Set "status" and "errormsg" and goto failure */
237 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP context");
238 		goto qpalloc_fail3;
239 	}
240 
241 	/*
242 	 * Allocate the software structure for tracking the queue pair
243 	 * (i.e. the Tavor Queue Pair handle).  If we fail here, we must
244 	 * undo the reference counts and the previous resource allocation.
245 	 */
246 	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
247 	if (status != DDI_SUCCESS) {
248 		/* Set "status" and "errormsg" and goto failure */
249 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle");
250 		goto qpalloc_fail4;
251 	}
252 	qp = (tavor_qphdl_t)rsrc->tr_addr;
253 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
254 
255 	/*
256 	 * Calculate the QP number from QPC index.  This routine handles
257 	 * all of the operations necessary to keep track of used, unused,
258 	 * and released QP numbers.
259 	 */
260 	status = tavor_qp_create_qpn(state, qp, qpc);
261 	if (status != DDI_SUCCESS) {
262 		/* Set "status" and "errormsg" and goto failure */
263 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QPN create");
264 		goto qpalloc_fail5;
265 	}
266 
267 	/*
268 	 * If this will be a user-mappable QP, then allocate an entry for
269 	 * the "userland resources database".  This will later be added to
270 	 * the database (after all further QP operations are successful).
271 	 * If we fail here, we must undo the reference counts and the
272 	 * previous resource allocation.
273 	 */
274 	if (qp_is_umap) {
275 		umapdb = tavor_umap_db_alloc(state->ts_instance, qp->qp_qpnum,
276 		    MLNX_UMAP_QPMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
277 		if (umapdb == NULL) {
278 			/* Set "status" and "errormsg" and goto failure */
279 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
280 			goto qpalloc_fail6;
281 		}
282 	}
283 
284 	/*
285 	 * If this is an RC QP, then pre-allocate the maximum number of RDB
286 	 * entries.  This allows us to ensure that we can later cover all
287 	 * the resources needed by hardware for handling multiple incoming
288 	 * RDMA Reads.  Note: These resources are obviously not always
289 	 * necessary.  They are allocated here anyway.  Someday maybe this
290 	 * can be modified to allocate these on-the-fly (i.e. only if RDMA
291 	 * Read or Atomic operations are enabled) XXX
292 	 * If we fail here, we have a bunch of resource and reference count
293 	 * cleanup to do.
294 	 */
295 	if (type == IBT_RC_RQP) {
296 		max_rdb = state->ts_cfg_profile->cp_hca_max_rdma_in_qp;
297 		status = tavor_rsrc_alloc(state, TAVOR_RDB, max_rdb,
298 		    sleepflag, &rdb);
299 		if (status != DDI_SUCCESS) {
300 			/* Set "status" and "errormsg" and goto failure */
301 			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed RDB");
302 			goto qpalloc_fail7;
303 		}
304 		qp->qp_rdbrsrcp = rdb;
305 		/* Calculate offset (into DDR memory) of RDB entries */
306 		rsrc_pool = &state->ts_rsrc_hdl[TAVOR_RDB];
307 		qp->qp_rdb_ddraddr = (uintptr_t)rsrc_pool->rsrc_ddr_offset +
308 		    (rdb->tr_indx << TAVOR_RDB_SIZE_SHIFT);
309 	}
310 
311 	/*
312 	 * Calculate the appropriate size for the work queues.
313 	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
314 	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
315 	 * to round the requested size up to the next highest power-of-2
	 * (computed below as highbit() of the size, less one when the size
	 * is already an exact power-of-2).  Note that the max() calls write
	 * the clamped sizes back into the caller's attribute structure.
316 	 */
317 	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
318 	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
319 	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
320 	if (ISP2(attr_p->qp_sizes.cs_sq)) {
321 		log_qp_sq_size = log_qp_sq_size - 1;
322 	}
323 	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
324 	if (ISP2(attr_p->qp_sizes.cs_rq)) {
325 		log_qp_rq_size = log_qp_rq_size - 1;
326 	}
327 
328 	/*
329 	 * Next we verify that the rounded-up size is valid (i.e. consistent
330 	 * with the device limits and/or software-configured limits).  If not,
331 	 * then obviously we have a lot of cleanup to do before returning.
	 * The receive queue size is not checked for a QP on an SRQ.
332 	 */
333 	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
334 	    (!qp_srq_en && (log_qp_rq_size >
335 	    state->ts_cfg_profile->cp_log_max_qp_sz))) {
336 		/* Set "status" and "errormsg" and goto failure */
337 		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size");
338 		goto qpalloc_fail8;
339 	}
340 
341 	/*
342 	 * Next we verify that the requested number of SGL is valid (i.e.
343 	 * consistent with the device limits and/or software-configured
344 	 * limits).  If not, then obviously the same cleanup needs to be done.
345 	 */
346 	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
347 	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
348 	    (!qp_srq_en && (attr_p->qp_sizes.cs_rq_sgl > max_sgl))) {
349 		/* Set "status" and "errormsg" and goto failure */
350 		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL");
351 		goto qpalloc_fail8;
352 	}
353 
354 	/*
355 	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
356 	 * This will depend on the requested number of SGLs.  Note: this
357 	 * has the side-effect of also calculating the real number of SGLs
358 	 * (for the calculated WQE size).
359 	 *
360 	 * For QP's on an SRQ, we set these to 0.
361 	 */
362 	if (qp_srq_en) {
363 		qp->qp_rq_log_wqesz = 0;
364 		qp->qp_rq_sgl = 0;
365 	} else {
366 		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
367 		    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz,
368 		    &qp->qp_rq_sgl);
369 	}
370 	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
371 	    TAVOR_QP_WQ_TYPE_SENDQ, &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);
372 
373 	/*
374 	 * Allocate the memory for QP work queues.  Note:  The location from
375 	 * which we will allocate these work queues has been passed in
376 	 * through the tavor_qp_options_t structure.  Since Tavor work queues
377 	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
378 	 * the work queue memory is very important.  We used to allocate
379 	 * work queues (the combined receive and send queues) so that they
380 	 * would be aligned on their combined size.  That alignment guaranteed
381 	 * that they would never cross the 4GB boundary (Tavor work queues
382 	 * are on the order of MBs at maximum).  Now we are able to relax
383 	 * this alignment constraint by ensuring that the IB address assigned
384 	 * to the queue memory (as a result of the tavor_mr_register() call)
385 	 * is offset from zero.
386 	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
387 	 * guarantee the alignment, but when attempting to use IOMMU bypass
388 	 * mode we found that we were not allowed to specify any alignment
389 	 * that was more restrictive than the system page size.
390 	 * So we avoided this constraint by passing two alignment values,
391 	 * one for the memory allocation itself and the other for the DMA
392 	 * handle (for later bind).  This used to cause more memory than
393 	 * necessary to be allocated (in order to guarantee the more
394 	 * restrictive alignment constraint).  But by guaranteeing the
395 	 * zero-based IB virtual address for the queue, we are able to
396 	 * conserve this memory.
397 	 * Note: If QP is not user-mappable, then it may come from either
398 	 * kernel system memory or from HCA-attached local DDR memory.
399 	 */
400 	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
401 	sq_size	    = (1 << log_qp_sq_size) * sq_wqe_size;
402 
403 	/* QP on SRQ sets these to 0 */
404 	if (qp_srq_en) {
405 		rq_wqe_size = 0;
406 		rq_size	    = 0;
407 	} else {
408 		rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
409 		rq_size	    = (1 << log_qp_rq_size) * rq_wqe_size;
410 	}
411 
	/*
	 * A single buffer holds both queues; both alignments are set to the
	 * larger of the two WQE sizes.  A user-mappable QP always uses the
	 * USERLAND location, regardless of the requested "wq_location".
	 */
412 	qp->qp_wqinfo.qa_size = sq_size + rq_size;
413 	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
414 	qp->qp_wqinfo.qa_bind_align  = max(sq_wqe_size, rq_wqe_size);
415 	if (qp_is_umap) {
416 		qp->qp_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
417 	} else {
418 		qp->qp_wqinfo.qa_location = wq_location;
419 	}
420 	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
421 	if (status != DDI_SUCCESS) {
422 		/* Set "status" and "errormsg" and goto failure */
423 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue");
424 		goto qpalloc_fail8;
425 	}
	/*
	 * Lay out the two queues within the buffer: the queue with the
	 * strictly larger WQE size goes first and the other follows
	 * immediately after it (on a tie the receive queue goes first).
	 * NOTE(review): presumably this keeps each queue aligned to its own
	 * WQE size -- confirm.  With an SRQ, rq_wqe_size is 0, so the first
	 * branch is always the one taken.
	 */
426 	if (sq_wqe_size > rq_wqe_size) {
427 		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
428 
429 		/*
430 		 * If QP's on an SRQ, we set the rq_buf to NULL
431 		 */
432 		if (qp_srq_en)
433 			rq_buf = NULL;
434 		else
435 			rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
436 	} else {
437 		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
438 		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
439 	}
440 
441 	/*
442 	 * Register the memory for the QP work queues.  The memory for the
443 	 * QP must be registered in the Tavor TPT tables.  This gives us the
444 	 * LKey to specify in the QP context later.  Note: The memory for
445 	 * Tavor work queues (both Send and Recv) must be contiguous and
446 	 * registered as a single memory region.  Note also: If the work
447 	 * queue is to be allocated from DDR memory, then only a "bypass"
448 	 * mapping is appropriate.  And if the QP memory is user-mappable,
449 	 * then we force DDI_DMA_CONSISTENT mapping.
450 	 * Also, in order to meet the alignment restriction, we pass the
451 	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
452 	 * This guarantees that the resulting IB vaddr will be zero-based
453 	 * (modulo the offset into the first page).
454 	 * If we fail here, we still have the bunch of resource and reference
455 	 * count cleanup to do.
456 	 */
457 	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
458 	    IBT_MR_NOSLEEP;
459 	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
460 	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
461 	mr_attr.mr_as	    = NULL;
462 	mr_attr.mr_flags    = flag;
463 	if (qp_is_umap) {
464 		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
465 	} else {
466 		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
467 			mr_op.mro_bind_type =
468 			    state->ts_cfg_profile->cp_iommu_bypass;
469 			dma_xfer_mode =
470 			    state->ts_cfg_profile->cp_streaming_consistent;
471 			if (dma_xfer_mode == DDI_DMA_STREAMING) {
472 				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
473 			}
474 		} else {
475 			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
476 		}
477 	}
478 	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
479 	mr_op.mro_bind_override_addr = 1;
480 	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
481 	if (status != DDI_SUCCESS) {
482 		/* Set "status" and "errormsg" and goto failure */
483 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
484 		goto qpalloc_fail9;
485 	}
486 
487 	/*
488 	 * Calculate the offset between the kernel virtual address space
489 	 * and the IB virtual address space.  This will be used when
490 	 * posting work requests to properly initialize each WQE.
491 	 */
492 	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
493 	    (uint64_t)mr->mr_bindinfo.bi_addr;
494 
495 	/*
496 	 * Fill in all the return arguments (if necessary).  This includes
497 	 * real work queue sizes, real SGLs, and QP number
498 	 */
499 	if (queuesz_p != NULL) {
500 		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
501 		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
502 
503 		/* QP on an SRQ set these to 0 */
504 		if (qp_srq_en) {
505 			queuesz_p->cs_rq	= 0;
506 			queuesz_p->cs_rq_sgl	= 0;
507 		} else {
508 			queuesz_p->cs_rq	= (1 << log_qp_rq_size);
509 			queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
510 		}
511 	}
512 	if (qpn != NULL) {
513 		*qpn = (ib_qpn_t)qp->qp_qpnum;
514 	}
515 
516 	/*
517 	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
518 	 * the following fields for use in further operations on the QP.
	 * Note: "uarpg" was only assigned above for a user-mappable QP,
	 * hence the conditional when setting qp_uarpg.
519 	 */
520 	qp->qp_qpcrsrcp		= qpc;
521 	qp->qp_rsrcp		= rsrc;
522 	qp->qp_state		= TAVOR_QP_RESET;
523 	qp->qp_pdhdl		= pd;
524 	qp->qp_mrhdl		= mr;
525 	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
526 	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
527 	qp->qp_is_special	= 0;
528 	qp->qp_is_umap		= qp_is_umap;
529 	qp->qp_uarpg		= (qp->qp_is_umap) ? uarpg : 0;
530 	qp->qp_umap_dhp		= (devmap_cookie_t)NULL;
531 	qp->qp_sq_cqhdl		= sq_cq;
532 	qp->qp_sq_lastwqeaddr	= NULL;
533 	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
534 	qp->qp_sq_buf		= sq_buf;
535 	qp->qp_desc_off		= qp_desc_off;
536 	qp->qp_rq_cqhdl		= rq_cq;
537 	qp->qp_rq_lastwqeaddr	= NULL;
538 	qp->qp_rq_buf		= rq_buf;
539 
540 	/* QP on an SRQ sets this to 0 */
541 	if (qp_srq_en) {
542 		qp->qp_rq_bufsz		= 0;
543 	} else {
544 		qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
545 	}
546 
547 	qp->qp_forward_sqd_event  = 0;
548 	qp->qp_sqd_still_draining = 0;
549 	qp->qp_hdlrarg		= (void *)ibt_qphdl;
550 	qp->qp_mcg_refcnt	= 0;
551 
552 	/*
553 	 * If this QP is to be associated with an SRQ, then set the SRQ handle
554 	 * appropriately.
	 * The SRQ reference is taken only here, after the last point at which
	 * this routine can fail, so the cleanup code below never has to drop
	 * it.
555 	 */
556 	if (qp_srq_en) {
557 		qp->qp_srqhdl = srq;
558 		qp->qp_srq_en = TAVOR_QP_SRQ_ENABLED;
559 		tavor_srq_refcnt_inc(qp->qp_srqhdl);
560 	} else {
561 		qp->qp_srqhdl = NULL;
562 		qp->qp_srq_en = TAVOR_QP_SRQ_DISABLED;
563 	}
564 
565 	/* Determine if later ddi_dma_sync will be necessary */
566 	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);
567 
568 	/* Determine the QP service type */
569 	if (type == IBT_RC_RQP) {
570 		qp->qp_serv_type = TAVOR_QP_RC;
571 	} else if (type == IBT_UD_RQP) {
572 		qp->qp_serv_type = TAVOR_QP_UD;
573 	} else {
574 		qp->qp_serv_type = TAVOR_QP_UC;
575 	}
576 
577 	/* Zero out the QP context */
578 	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));
579 
580 	/*
581 	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
582 	 * "qphdl" and return success
583 	 */
584 	ASSERT(state->ts_qphdl[qpc->tr_indx] == NULL);
585 	state->ts_qphdl[qpc->tr_indx] = qp;
586 
587 	/*
588 	 * If this is a user-mappable QP, then we need to insert the previously
589 	 * allocated entry into the "userland resources database".  This will
590 	 * allow for later lookup during devmap() (i.e. mmap()) calls.
591 	 */
592 	if (qp_is_umap) {
593 		tavor_umap_db_add(umapdb);
594 	}
595 
596 	*qphdl = qp;
597 
598 	TAVOR_TNF_EXIT(tavor_qp_alloc);
599 	return (DDI_SUCCESS);
600 
601 /*
602  * The following is cleanup for all possible failure cases in this routine
    *
    * The labels appear in the reverse order of the acquisitions above and
    * each one falls through into the next, so a jump to a given label undoes
    * that step and every step that was completed before it.
603  */
604 qpalloc_fail9:
605 	tavor_queue_free(state, &qp->qp_wqinfo);
606 qpalloc_fail8:
	/* RDB entries were only allocated for RC QPs */
607 	if (type == IBT_RC_RQP) {
608 		tavor_rsrc_free(state, &rdb);
609 	}
610 qpalloc_fail7:
	/* The umap database entry was only allocated for user-mappable QPs */
611 	if (qp_is_umap) {
612 		tavor_umap_db_free(umapdb);
613 	}
614 qpalloc_fail6:
615 	/*
616 	 * Releasing the QPN will also free up the QPC context.  Update
617 	 * the QPC context pointer to indicate this.
618 	 */
619 	tavor_qp_release_qpn(state, qp->qp_qpn_hdl, TAVOR_QPN_RELEASE);
620 	qpc = NULL;
621 qpalloc_fail5:
622 	tavor_rsrc_free(state, &rsrc);
623 qpalloc_fail4:
	/*
	 * "qpc" is NULL here when we fell through from qpalloc_fail6 (the
	 * QPN release already freed the QPC); otherwise it must be freed now.
	 */
624 	if (qpc) {
625 		tavor_rsrc_free(state, &qpc);
626 	}
627 qpalloc_fail3:
628 	tavor_cq_refcnt_dec(rq_cq);
629 qpalloc_fail2:
630 	tavor_cq_refcnt_dec(sq_cq);
631 qpalloc_fail1:
632 	tavor_pd_refcnt_dec(pd);
633 qpalloc_fail:
634 	TNF_PROBE_1(tavor_qp_alloc_fail, TAVOR_TNF_ERROR, "",
635 	    tnf_string, msg, errormsg);
636 	TAVOR_TNF_EXIT(tavor_qp_alloc);
637 	return (status);
638 }
639 
640 
641 
642 /*
643  * tavor_special_qp_alloc()
644  *    Context: Can be called only from user or kernel context.
645  */
646 int
647 tavor_special_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
648     uint_t sleepflag, tavor_qp_options_t *op)
649 {
650 	tavor_rsrc_t		*qpc, *rsrc;
651 	tavor_qphdl_t		qp;
652 	ibt_qp_alloc_attr_t	*attr_p;
653 	ibt_sqp_type_t		type;
654 	uint8_t			port;
655 	ibtl_qp_hdl_t		ibt_qphdl;
656 	ibt_chan_sizes_t	*queuesz_p;
657 	tavor_qphdl_t		*qphdl;
658 	ibt_mr_attr_t		mr_attr;
659 	tavor_mr_options_t	mr_op;
660 	tavor_pdhdl_t		pd;
661 	tavor_cqhdl_t		sq_cq, rq_cq;
662 	tavor_mrhdl_t		mr;
663 	uint64_t		qp_desc_off;
664 	uint32_t		*sq_buf, *rq_buf;
665 	uint32_t		log_qp_sq_size, log_qp_rq_size;
666 	uint32_t		sq_size, rq_size, max_sgl;
667 	uint32_t		sq_wqe_size, rq_wqe_size;
668 	uint_t			wq_location, dma_xfer_mode;
669 	int			status, flag;
670 	char			*errormsg;
671 
672 	TAVOR_TNF_ENTER(tavor_special_qp_alloc);
673 
674 	/*
675 	 * Check the "options" flag.  Currently this flag tells the driver
676 	 * whether or not the QP's work queues should be come from normal
677 	 * system memory or whether they should be allocated from DDR memory.
678 	 */
679 	if (op == NULL) {
680 		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
681 	} else {
682 		wq_location = op->qpo_wq_loc;
683 	}
684 
685 	/*
686 	 * Extract the necessary info from the tavor_qp_info_t structure
687 	 */
688 	attr_p	  = qpinfo->qpi_attrp;
689 	type	  = qpinfo->qpi_type;
690 	port	  = qpinfo->qpi_port;
691 	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
692 	queuesz_p = qpinfo->qpi_queueszp;
693 	qphdl	  = &qpinfo->qpi_qphdl;
694 
695 	/*
696 	 * Check for valid special QP type (only SMI & GSI supported)
697 	 */
698 	if ((type != IBT_SMI_SQP) && (type != IBT_GSI_SQP)) {
699 		/* Set "status" and "errormsg" and goto failure */
700 		TAVOR_TNF_FAIL(IBT_QP_SPECIAL_TYPE_INVALID, "invalid QP type");
701 		goto spec_qpalloc_fail;
702 	}
703 
704 	/*
705 	 * Check for valid port number
706 	 */
707 	if (!tavor_portnum_is_valid(state, port)) {
708 		/* Set "status" and "errormsg" and goto failure */
709 		TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num");
710 		goto spec_qpalloc_fail;
711 	}
712 	port = port - 1;
713 
714 	/*
715 	 * Check for valid PD handle pointer
716 	 */
717 	if (attr_p->qp_pd_hdl == NULL) {
718 		/* Set "status" and "errormsg" and goto failure */
719 		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
720 		goto spec_qpalloc_fail;
721 	}
722 	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;
723 
724 	/* Increment the reference count on the PD */
725 	tavor_pd_refcnt_inc(pd);
726 
727 	/*
728 	 * Check for valid CQ handle pointers
729 	 */
730 	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
731 	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
732 		/* Set "status" and "errormsg" and goto failure */
733 		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
734 		goto spec_qpalloc_fail1;
735 	}
736 	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
737 	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;
738 
739 	/*
740 	 * Increment the reference count on the CQs.  One or both of these
741 	 * could return error if we determine that the given CQ is already
742 	 * being used with a non-special QP (i.e. a normal QP).
743 	 */
744 	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_SPECIAL);
745 	if (status != DDI_SUCCESS) {
746 		/* Set "status" and "errormsg" and goto failure */
747 		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
748 		goto spec_qpalloc_fail1;
749 	}
750 	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_SPECIAL);
751 	if (status != DDI_SUCCESS) {
752 		/* Set "status" and "errormsg" and goto failure */
753 		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
754 		goto spec_qpalloc_fail2;
755 	}
756 
757 	/*
758 	 * Allocate the special QP resources.  Essentially, this allocation
759 	 * amounts to checking if the request special QP has already been
760 	 * allocated.  If successful, the QP context return is an actual
761 	 * QP context that has been "aliased" to act as a special QP of the
762 	 * appropriate type (and for the appropriate port).  Just as in
763 	 * tavor_qp_alloc() above, ownership for this QP context is not
764 	 * immediately given to hardware in the final step here.  Instead, we
765 	 * wait until the QP is later transitioned to the "Init" state before
766 	 * passing the QP to hardware.  If we fail here, we must undo all
767 	 * the reference count (CQ and PD).
768 	 */
769 	status = tavor_special_qp_rsrc_alloc(state, type, port, &qpc);
770 	if (status != DDI_SUCCESS) {
771 		/* Set "status" and "errormsg" and goto failure */
772 		TAVOR_TNF_FAIL(status, "failed special QP rsrc");
773 		goto spec_qpalloc_fail3;
774 	}
775 
776 	/*
777 	 * Allocate the software structure for tracking the special queue
778 	 * pair (i.e. the Tavor Queue Pair handle).  If we fail here, we
779 	 * must undo the reference counts and the previous resource allocation.
780 	 */
781 	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
782 	if (status != DDI_SUCCESS) {
783 		/* Set "status" and "errormsg" and goto failure */
784 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle");
785 		goto spec_qpalloc_fail4;
786 	}
787 	qp = (tavor_qphdl_t)rsrc->tr_addr;
788 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
789 
790 	/*
791 	 * Actual QP number is a combination of the index of the QPC and
792 	 * the port number.  This is because the special QP contexts must
793 	 * be allocated two-at-a-time.
794 	 */
795 	qp->qp_qpnum = qpc->tr_indx + port;
796 
797 	/*
798 	 * Calculate the appropriate size for the work queues.
799 	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
800 	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
801 	 * to round the requested size up to the next highest power-of-2
802 	 */
803 	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
804 	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
805 	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
806 	if (ISP2(attr_p->qp_sizes.cs_sq)) {
807 		log_qp_sq_size = log_qp_sq_size - 1;
808 	}
809 	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
810 	if (ISP2(attr_p->qp_sizes.cs_rq)) {
811 		log_qp_rq_size = log_qp_rq_size - 1;
812 	}
813 
814 	/*
815 	 * Next we verify that the rounded-up size is valid (i.e. consistent
816 	 * with the device limits and/or software-configured limits).  If not,
817 	 * then obviously we have a bit of cleanup to do before returning.
818 	 */
819 	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
820 	    (log_qp_rq_size > state->ts_cfg_profile->cp_log_max_qp_sz)) {
821 		/* Set "status" and "errormsg" and goto failure */
822 		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size");
823 		goto spec_qpalloc_fail5;
824 	}
825 
826 	/*
827 	 * Next we verify that the requested number of SGL is valid (i.e.
828 	 * consistent with the device limits and/or software-configured
829 	 * limits).  If not, then obviously the same cleanup needs to be done.
830 	 */
831 	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
832 	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
833 	    (attr_p->qp_sizes.cs_rq_sgl > max_sgl)) {
834 		/* Set "status" and "errormsg" and goto failure */
835 		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL");
836 		goto spec_qpalloc_fail5;
837 	}
838 
839 	/*
840 	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
841 	 * This will depend on the requested number of SGLs.  Note: this
842 	 * has the side-effect of also calculating the real number of SGLs
843 	 * (for the calculated WQE size).
844 	 */
845 	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
846 	    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl);
847 	if (type == IBT_SMI_SQP) {
848 		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
849 		    TAVOR_QP_WQ_TYPE_SENDMLX_QP0, &qp->qp_sq_log_wqesz,
850 		    &qp->qp_sq_sgl);
851 	} else {
852 		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
853 		    TAVOR_QP_WQ_TYPE_SENDMLX_QP1, &qp->qp_sq_log_wqesz,
854 		    &qp->qp_sq_sgl);
855 	}
856 
857 	/*
858 	 * Allocate the memory for QP work queues.  Note:  The location from
859 	 * which we will allocate these work queues has been passed in
860 	 * through the tavor_qp_options_t structure.  Since Tavor work queues
861 	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
862 	 * the work queue memory is very important.  We used to allocate
863 	 * work queues (the combined receive and send queues) so that they
864 	 * would be aligned on their combined size.  That alignment guaranteed
865 	 * that they would never cross the 4GB boundary (Tavor work queues
866 	 * are on the order of MBs at maximum).  Now we are able to relax
867 	 * this alignment constraint by ensuring that the IB address assigned
868 	 * to the queue memory (as a result of the tavor_mr_register() call)
869 	 * is offset from zero.
870 	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
871 	 * guarantee the alignment, but when attempting to use IOMMU bypass
872 	 * mode we found that we were not allowed to specify any alignment
873 	 * that was more restrictive than the system page size.
874 	 * So we avoided this constraint by passing two alignment values,
875 	 * one for the memory allocation itself and the other for the DMA
876 	 * handle (for later bind).  This used to cause more memory than
877 	 * necessary to be allocated (in order to guarantee the more
878 	 * restrictive alignment constraint).  But by guaranteeing the
879 	 * zero-based IB virtual address for the queue, we are able to
880 	 * conserve this memory.
881 	 */
882 	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
883 	rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
884 	sq_size	    = (1 << log_qp_sq_size) * sq_wqe_size;
885 	rq_size	    = (1 << log_qp_rq_size) * rq_wqe_size;
886 	qp->qp_wqinfo.qa_size	  = sq_size + rq_size;
887 	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
888 	qp->qp_wqinfo.qa_bind_align  = max(sq_wqe_size, rq_wqe_size);
889 	qp->qp_wqinfo.qa_location = wq_location;
890 	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
891 	if (status != NULL) {
892 		/* Set "status" and "errormsg" and goto failure */
893 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue");
894 		goto spec_qpalloc_fail5;
895 	}
896 	if (sq_wqe_size > rq_wqe_size) {
897 		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
898 		rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
899 	} else {
900 		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
901 		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
902 	}
903 
904 	/*
905 	 * Register the memory for the special QP work queues.  The memory for
906 	 * the special QP must be registered in the Tavor TPT tables.  This
907 	 * gives us the LKey to specify in the QP context later.  Note: The
908 	 * memory for Tavor work queues (both Send and Recv) must be contiguous
909 	 * and registered as a single memory region.  Note also: If the work
910 	 * queue is to be allocated from DDR memory, then only a "bypass"
911 	 * mapping is appropriate.
912 	 * Also, in order to meet the alignment restriction, we pass the
913 	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
914 	 * This guarantees that the resulting IB vaddr will be zero-based
915 	 * (modulo the offset into the first page).
916 	 * If we fail here, we have a bunch of resource and reference count
917 	 * cleanup to do.
918 	 */
919 	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
920 	    IBT_MR_NOSLEEP;
921 	mr_attr.mr_vaddr    = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
922 	mr_attr.mr_len	    = qp->qp_wqinfo.qa_size;
923 	mr_attr.mr_as	    = NULL;
924 	mr_attr.mr_flags    = flag;
925 	if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
926 		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
927 
928 		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
929 		if (dma_xfer_mode == DDI_DMA_STREAMING) {
930 			mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
931 		}
932 	} else {
933 		mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
934 	}
935 	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
936 	mr_op.mro_bind_override_addr = 1;
937 	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
938 	if (status != DDI_SUCCESS) {
939 		/* Set "status" and "errormsg" and goto failure */
940 		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
941 		goto spec_qpalloc_fail6;
942 	}
943 
944 	/*
945 	 * Calculate the offset between the kernel virtual address space
946 	 * and the IB virtual address space.  This will be used when
947 	 * posting work requests to properly initialize each WQE.
948 	 */
949 	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
950 	    (uint64_t)mr->mr_bindinfo.bi_addr;
951 
952 	/*
953 	 * Fill in all the return arguments (if necessary).  This includes
954 	 * real work queue sizes, real SGLs, and QP number (which will be
955 	 * either zero or one, depending on the special QP type)
956 	 */
957 	if (queuesz_p != NULL) {
958 		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
959 		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
960 		queuesz_p->cs_rq	= (1 << log_qp_rq_size);
961 		queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
962 	}
963 
964 	/*
965 	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
966 	 * the following fields for use in further operations on the QP.
967 	 */
968 	qp->qp_qpcrsrcp		= qpc;
969 	qp->qp_rsrcp		= rsrc;
970 	qp->qp_state		= TAVOR_QP_RESET;
971 	qp->qp_pdhdl		= pd;
972 	qp->qp_mrhdl		= mr;
973 	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
974 	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
975 	qp->qp_is_special	= (type == IBT_SMI_SQP) ?
976 	    TAVOR_QP_SMI : TAVOR_QP_GSI;
977 	qp->qp_is_umap		= 0;
978 	qp->qp_uarpg		= 0;
979 	qp->qp_sq_cqhdl		= sq_cq;
980 	qp->qp_sq_lastwqeaddr	= NULL;
981 	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
982 	qp->qp_sq_buf		= sq_buf;
983 	qp->qp_desc_off		= qp_desc_off;
984 	qp->qp_rq_cqhdl		= rq_cq;
985 	qp->qp_rq_lastwqeaddr	= NULL;
986 	qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
987 	qp->qp_rq_buf		= rq_buf;
988 	qp->qp_portnum		= port;
989 	qp->qp_pkeyindx		= 0;
990 	qp->qp_hdlrarg		= (void *)ibt_qphdl;
991 	qp->qp_mcg_refcnt	= 0;
992 	qp->qp_srq_en		= 0;
993 	qp->qp_srqhdl		= NULL;
994 
995 	/* Determine if later ddi_dma_sync will be necessary */
996 	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);
997 
998 	/* All special QPs are UD QP service type */
999 	qp->qp_serv_type = TAVOR_QP_UD;
1000 
1001 	/* Zero out the QP context */
1002 	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));
1003 
1004 	/*
1005 	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
1006 	 * "qphdl" and return success
1007 	 */
1008 	ASSERT(state->ts_qphdl[qpc->tr_indx + port] == NULL);
1009 	state->ts_qphdl[qpc->tr_indx + port] = qp;
1010 
1011 	*qphdl = qp;
1012 
1013 	TAVOR_TNF_EXIT(tavor_special_qp_alloc);
1014 	return (DDI_SUCCESS);
1015 
1016 /*
1017  * The following is cleanup for all possible failure cases in this routine
1018  */
1019 spec_qpalloc_fail6:
1020 	tavor_queue_free(state, &qp->qp_wqinfo);
1021 spec_qpalloc_fail5:
1022 	tavor_rsrc_free(state, &rsrc);
1023 spec_qpalloc_fail4:
1024 	if (tavor_special_qp_rsrc_free(state, type, port) != DDI_SUCCESS) {
1025 		TAVOR_WARNING(state, "failed to free special QP rsrc");
1026 	}
1027 spec_qpalloc_fail3:
1028 	tavor_cq_refcnt_dec(rq_cq);
1029 spec_qpalloc_fail2:
1030 	tavor_cq_refcnt_dec(sq_cq);
1031 spec_qpalloc_fail1:
1032 	tavor_pd_refcnt_dec(pd);
1033 spec_qpalloc_fail:
1034 	TNF_PROBE_1(tavor_special_qp_alloc_fail, TAVOR_TNF_ERROR, "",
1035 	    tnf_string, msg, errormsg);
1036 	TAVOR_TNF_EXIT(tavor_special_qp_alloc);
1037 	return (status);
1038 }
1039 
1040 
1041 /*
1042  * tavor_qp_free()
1043  *    This function frees up the QP resources.  Depending on the value
1044  *    of the "free_qp_flags", the QP number may not be released until
1045  *    a subsequent call to tavor_qp_release_qpn().
1046  *
1047  *    Context: Can be called only from user or kernel context.
1048  */
1049 /* ARGSUSED */
1050 int
1051 tavor_qp_free(tavor_state_t *state, tavor_qphdl_t *qphdl,
1052     ibc_free_qp_flags_t free_qp_flags, ibc_qpn_hdl_t *qpnh,
1053     uint_t sleepflag)
1054 {
1055 	tavor_rsrc_t		*qpc, *rdb, *rsrc;
1056 	tavor_umap_db_entry_t	*umapdb;
1057 	tavor_qpn_entry_t	*entry;
1058 	tavor_pdhdl_t		pd;
1059 	tavor_mrhdl_t		mr;
1060 	tavor_cqhdl_t		sq_cq, rq_cq;
1061 	tavor_srqhdl_t		srq;
1062 	tavor_qphdl_t		qp;
1063 	uint64_t		value;
1064 	uint_t			type, port;
1065 	uint_t			maxprot;
1066 	uint_t			qp_srq_en;
1067 	int			status;
1068 	char			*errormsg;
1069 
1070 	TAVOR_TNF_ENTER(tavor_qp_free);
1071 
1072 	/*
1073 	 * Pull all the necessary information from the Tavor Queue Pair
1074 	 * handle.  This is necessary here because the resource for the
1075 	 * QP handle is going to be freed up as part of this operation.
1076 	 */
1077 	qp	= *qphdl;
1078 	mutex_enter(&qp->qp_lock);
1079 	qpc	= qp->qp_qpcrsrcp;
1080 	rsrc	= qp->qp_rsrcp;
1081 	pd	= qp->qp_pdhdl;
1082 	srq	= qp->qp_srqhdl;
1083 	mr	= qp->qp_mrhdl;
1084 	rq_cq	= qp->qp_rq_cqhdl;
1085 	sq_cq	= qp->qp_sq_cqhdl;
1086 	rdb	= qp->qp_rdbrsrcp;
1087 	port	= qp->qp_portnum;
1088 	qp_srq_en = qp->qp_srq_en;
1089 
1090 	/*
1091 	 * If the QP is part of an MCG, then we fail the qp_free
1092 	 */
1093 	if (qp->qp_mcg_refcnt != 0) {
1094 		mutex_exit(&qp->qp_lock);
1095 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "QP part of MCG on free");
1096 		goto qpfree_fail;
1097 	}
1098 
1099 	/*
1100 	 * If the QP is not already in "Reset" state, then transition to
1101 	 * "Reset".  This is necessary because software does not reclaim
1102 	 * ownership of the QP context until the QP is in the "Reset" state.
1103 	 * If the ownership transfer fails for any reason, then it is an
1104 	 * indication that something (either in HW or SW) has gone seriously
1105 	 * wrong.  So we print a warning message and return.
1106 	 */
1107 	if (qp->qp_state != TAVOR_QP_RESET) {
1108 		if (tavor_qp_to_reset(state, qp) != DDI_SUCCESS) {
1109 			mutex_exit(&qp->qp_lock);
1110 			TAVOR_WARNING(state, "failed to reset QP context");
1111 			/* Set "status" and "errormsg" and goto failure */
1112 			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1113 			    "reset QP context");
1114 			goto qpfree_fail;
1115 		}
1116 		qp->qp_state = TAVOR_QP_RESET;
1117 
1118 		/*
1119 		 * Do any additional handling necessary for the transition
1120 		 * to the "Reset" state (e.g. update the WRID lists)
1121 		 */
1122 		tavor_wrid_to_reset_handling(state, qp);
1123 	}
1124 
1125 	/*
1126 	 * If this was a user-mappable QP, then we need to remove its entry
1127 	 * from the "userland resources database".  If it is also currently
1128 	 * mmap()'d out to a user process, then we need to call
1129 	 * devmap_devmem_remap() to remap the QP memory to an invalid mapping.
1130 	 * We also need to invalidate the QP tracking information for the
1131 	 * user mapping.
1132 	 */
1133 	if (qp->qp_is_umap) {
1134 		status = tavor_umap_db_find(state->ts_instance, qp->qp_qpnum,
1135 		    MLNX_UMAP_QPMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
1136 		    &umapdb);
1137 		if (status != DDI_SUCCESS) {
1138 			mutex_exit(&qp->qp_lock);
1139 			TAVOR_WARNING(state, "failed to find in database");
1140 			TAVOR_TNF_EXIT(tavor_qp_free);
1141 			return (ibc_get_ci_failure(0));
1142 		}
1143 		tavor_umap_db_free(umapdb);
1144 		if (qp->qp_umap_dhp != NULL) {
1145 			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
1146 			status = devmap_devmem_remap(qp->qp_umap_dhp,
1147 			    state->ts_dip, 0, 0, qp->qp_wqinfo.qa_size,
1148 			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
1149 			if (status != DDI_SUCCESS) {
1150 				mutex_exit(&qp->qp_lock);
1151 				TAVOR_WARNING(state, "failed in QP memory "
1152 				    "devmap_devmem_remap()");
1153 				TAVOR_TNF_EXIT(tavor_qp_free);
1154 				return (ibc_get_ci_failure(0));
1155 			}
1156 			qp->qp_umap_dhp = (devmap_cookie_t)NULL;
1157 		}
1158 	}
1159 
1160 	/*
1161 	 * Put NULL into the Tavor QPNum-to-QPHdl list.  This will allow any
1162 	 * in-progress events to detect that the QP corresponding to this
1163 	 * number has been freed.  Note: it does depend in whether we are
1164 	 * freeing a special QP or not.
1165 	 */
1166 	if (qp->qp_is_special) {
1167 		state->ts_qphdl[qpc->tr_indx + port] = NULL;
1168 	} else {
1169 		state->ts_qphdl[qpc->tr_indx] = NULL;
1170 	}
1171 
1172 	/*
1173 	 * Drop the QP lock
1174 	 *    At this point the lock is no longer necessary.  We cannot
1175 	 *    protect from multiple simultaneous calls to free the same QP.
1176 	 *    In addition, since the QP lock is contained in the QP "software
1177 	 *    handle" resource, which we will free (see below), it is
1178 	 *    important that we have no further references to that memory.
1179 	 */
1180 	mutex_exit(&qp->qp_lock);
1181 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
1182 
1183 	/*
1184 	 * Free the QP resources
1185 	 *    Start by deregistering and freeing the memory for work queues.
1186 	 *    Next free any previously allocated context information
1187 	 *    (depending on QP type)
1188 	 *    Finally, decrement the necessary reference counts.
1189 	 * If this fails for any reason, then it is an indication that
1190 	 * something (either in HW or SW) has gone seriously wrong.  So we
1191 	 * print a warning message and return.
1192 	 */
1193 	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
1194 	    sleepflag);
1195 	if (status != DDI_SUCCESS) {
1196 		TAVOR_WARNING(state, "failed to deregister QP memory");
1197 		/* Set "status" and "errormsg" and goto failure */
1198 		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed deregister mr");
1199 		goto qpfree_fail;
1200 	}
1201 
1202 	/* Free the memory for the QP */
1203 	tavor_queue_free(state, &qp->qp_wqinfo);
1204 
1205 	/*
1206 	 * Free up the remainder of the QP resources.  Note: we have a few
1207 	 * different resources to free up depending on whether the QP is a
1208 	 * special QP or not.  As described above, if any of these fail for
1209 	 * any reason it is an indication that something (either in HW or SW)
1210 	 * has gone seriously wrong.  So we print a warning message and
1211 	 * return.
1212 	 */
1213 	if (qp->qp_is_special) {
1214 		type = (qp->qp_is_special == TAVOR_QP_SMI) ?
1215 		    IBT_SMI_SQP : IBT_GSI_SQP;
1216 
1217 		/* Free up resources for the special QP */
1218 		status = tavor_special_qp_rsrc_free(state, type, port);
1219 		if (status != DDI_SUCCESS) {
1220 			TAVOR_WARNING(state, "failed to free special QP rsrc");
1221 			/* Set "status" and "errormsg" and goto failure */
1222 			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1223 			    "failed special QP rsrc");
1224 			goto qpfree_fail;
1225 		}
1226 
1227 	} else {
1228 		type = qp->qp_serv_type;
1229 
1230 		/* Free up the RDB entries resource */
1231 		if (type == TAVOR_QP_RC) {
1232 			tavor_rsrc_free(state, &rdb);
1233 		}
1234 
1235 		/*
1236 		 * Check the flags and determine whether to release the
1237 		 * QPN or not, based on their value.
1238 		 */
1239 		if (free_qp_flags == IBC_FREE_QP_ONLY) {
1240 			entry = qp->qp_qpn_hdl;
1241 			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
1242 			    TAVOR_QPN_FREE_ONLY);
1243 			*qpnh = (ibc_qpn_hdl_t)entry;
1244 		} else {
1245 			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
1246 			    TAVOR_QPN_RELEASE);
1247 		}
1248 	}
1249 
1250 	/* Free the Tavor Queue Pair handle */
1251 	tavor_rsrc_free(state, &rsrc);
1252 
1253 	/* Decrement the reference counts on CQs, PD and SRQ (if needed) */
1254 	tavor_cq_refcnt_dec(rq_cq);
1255 	tavor_cq_refcnt_dec(sq_cq);
1256 	tavor_pd_refcnt_dec(pd);
1257 	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
1258 		tavor_srq_refcnt_dec(srq);
1259 	}
1260 
1261 	/* Set the qphdl pointer to NULL and return success */
1262 	*qphdl = NULL;
1263 
1264 	TAVOR_TNF_EXIT(tavor_qp_free);
1265 	return (DDI_SUCCESS);
1266 
1267 qpfree_fail:
1268 	TNF_PROBE_1(tavor_qp_free_fail, TAVOR_TNF_ERROR, "",
1269 	    tnf_string, msg, errormsg);
1270 	TAVOR_TNF_EXIT(tavor_qp_free);
1271 	return (status);
1272 }
1273 
1274 
1275 /*
1276  * tavor_qp_query()
1277  *    Context: Can be called from interrupt or base context.
1278  */
1279 int
1280 tavor_qp_query(tavor_state_t *state, tavor_qphdl_t qp,
1281     ibt_qp_query_attr_t *attr_p)
1282 {
1283 	ibt_cep_state_t		qp_state;
1284 	ibt_qp_ud_attr_t	*ud;
1285 	ibt_qp_rc_attr_t	*rc;
1286 	ibt_qp_uc_attr_t	*uc;
1287 	ibt_cep_flags_t		enable_flags;
1288 	tavor_hw_addr_path_t	*qpc_path, *qpc_alt_path;
1289 	ibt_cep_path_t		*path_ptr, *alt_path_ptr;
1290 	tavor_hw_qpc_t		*qpc;
1291 	int			status;
1292 
1293 	TAVOR_TNF_ENTER(tavor_qp_query);
1294 
1295 	mutex_enter(&qp->qp_lock);
1296 
1297 	/*
1298 	 * Grab the temporary QPC entry from QP software state
1299 	 */
1300 	qpc = &qp->qpc;
1301 
1302 	/* Convert the current Tavor QP state to IBTF QP state */
1303 	switch (qp->qp_state) {
1304 	case TAVOR_QP_RESET:
1305 		qp_state = IBT_STATE_RESET;		/* "Reset" */
1306 		break;
1307 	case TAVOR_QP_INIT:
1308 		qp_state = IBT_STATE_INIT;		/* Initialized */
1309 		break;
1310 	case TAVOR_QP_RTR:
1311 		qp_state = IBT_STATE_RTR;		/* Ready to Receive */
1312 		break;
1313 	case TAVOR_QP_RTS:
1314 		qp_state = IBT_STATE_RTS;		/* Ready to Send */
1315 		break;
1316 	case TAVOR_QP_SQERR:
1317 		qp_state = IBT_STATE_SQE;		/* Send Queue Error */
1318 		break;
1319 	case TAVOR_QP_SQD:
1320 		if (qp->qp_sqd_still_draining) {
1321 			qp_state = IBT_STATE_SQDRAIN;	/* SQ Draining */
1322 		} else {
1323 			qp_state = IBT_STATE_SQD;	/* SQ Drained */
1324 		}
1325 		break;
1326 	case TAVOR_QP_ERR:
1327 		qp_state = IBT_STATE_ERROR;		/* Error */
1328 		break;
1329 	default:
1330 		mutex_exit(&qp->qp_lock);
1331 		TNF_PROBE_1(tavor_qp_query_inv_qpstate_fail,
1332 		    TAVOR_TNF_ERROR, "", tnf_uint, qpstate, qp->qp_state);
1333 		TAVOR_TNF_EXIT(tavor_qp_query);
1334 		return (ibc_get_ci_failure(0));
1335 	}
1336 	attr_p->qp_info.qp_state = qp_state;
1337 
1338 	/* SRQ Hook. */
1339 	attr_p->qp_srq = NULL;
1340 
1341 	/*
1342 	 * The following QP information is always returned, regardless of
1343 	 * the current QP state.  Note: Some special handling is necessary
1344 	 * for calculating the QP number on special QP (QP0 and QP1).
1345 	 */
1346 	attr_p->qp_sq_cq    = qp->qp_sq_cqhdl->cq_hdlrarg;
1347 	attr_p->qp_rq_cq    = qp->qp_rq_cqhdl->cq_hdlrarg;
1348 	if (qp->qp_is_special) {
1349 		attr_p->qp_qpn = (qp->qp_is_special == TAVOR_QP_SMI) ? 0 : 1;
1350 	} else {
1351 		attr_p->qp_qpn = (ib_qpn_t)qp->qp_qpnum;
1352 	}
1353 	attr_p->qp_sq_sgl   = qp->qp_sq_sgl;
1354 	attr_p->qp_rq_sgl   = qp->qp_rq_sgl;
1355 	attr_p->qp_info.qp_sq_sz = qp->qp_sq_bufsz;
1356 	attr_p->qp_info.qp_rq_sz = qp->qp_rq_bufsz;
1357 
1358 	/*
1359 	 * If QP is currently in the "Reset" state, then only the above are
1360 	 * returned
1361 	 */
1362 	if (qp_state == IBT_STATE_RESET) {
1363 		mutex_exit(&qp->qp_lock);
1364 		TAVOR_TNF_EXIT(tavor_qp_query);
1365 		return (DDI_SUCCESS);
1366 	}
1367 
1368 	/*
1369 	 * Post QUERY_QP command to firmware
1370 	 *
1371 	 * We do a TAVOR_NOSLEEP here because we are holding the "qp_lock".
1372 	 * Since we may be in the interrupt context (or subsequently raised
1373 	 * to interrupt level by priority inversion), we do not want to block
1374 	 * in this routine waiting for success.
1375 	 */
1376 	status = tavor_cmn_query_cmd_post(state, QUERY_QP, qp->qp_qpnum,
1377 	    qpc, sizeof (tavor_hw_qpc_t), TAVOR_CMD_NOSLEEP_SPIN);
1378 	if (status != TAVOR_CMD_SUCCESS) {
1379 		mutex_exit(&qp->qp_lock);
1380 		cmn_err(CE_CONT, "Tavor: QUERY_QP command failed: %08x\n",
1381 		    status);
1382 		TNF_PROBE_1(tavor_qp_query_cmd_fail, TAVOR_TNF_ERROR, "",
1383 		    tnf_uint, status, status);
1384 		TAVOR_TNF_EXIT(tavor_qp_query);
1385 		return (ibc_get_ci_failure(0));
1386 	}
1387 
1388 	/*
1389 	 * Fill in the additional QP info based on the QP's transport type.
1390 	 */
1391 	if (qp->qp_serv_type == TAVOR_QP_UD) {
1392 
1393 		/* Fill in the UD-specific info */
1394 		ud = &attr_p->qp_info.qp_transport.ud;
1395 		ud->ud_qkey	= (ib_qkey_t)qpc->qkey;
1396 		ud->ud_sq_psn	= qpc->next_snd_psn;
1397 		ud->ud_pkey_ix	= qpc->pri_addr_path.pkey_indx;
1398 		ud->ud_port	= qpc->pri_addr_path.portnum;
1399 
1400 		attr_p->qp_info.qp_trans = IBT_UD_SRV;
1401 
1402 	} else if (qp->qp_serv_type == TAVOR_QP_RC) {
1403 
1404 		/* Fill in the RC-specific info */
1405 		rc = &attr_p->qp_info.qp_transport.rc;
1406 		rc->rc_sq_psn	= qpc->next_snd_psn;
1407 		rc->rc_rq_psn	= qpc->next_rcv_psn;
1408 		rc->rc_dst_qpn	= qpc->rem_qpn;
1409 
1410 		/* Grab the path migration state information */
1411 		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
1412 			rc->rc_mig_state = IBT_STATE_MIGRATED;
1413 		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
1414 			rc->rc_mig_state = IBT_STATE_REARMED;
1415 		} else {
1416 			rc->rc_mig_state = IBT_STATE_ARMED;
1417 		}
1418 		rc->rc_rdma_ra_out = (1 << qpc->sra_max);
1419 		rc->rc_rdma_ra_in  = (1 << qpc->rra_max);
1420 		rc->rc_min_rnr_nak = qpc->min_rnr_nak;
1421 		rc->rc_path_mtu	   = qpc->mtu;
1422 		rc->rc_retry_cnt   = qpc->retry_cnt;
1423 
1424 		/* Get the common primary address path fields */
1425 		qpc_path = &qpc->pri_addr_path;
1426 		path_ptr = &rc->rc_path;
1427 		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
1428 		    TAVOR_ADDRPATH_QP, qp);
1429 
1430 		/* Fill in the additional primary address path fields */
1431 		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
1432 		path_ptr->cep_hca_port_num = qpc_path->portnum;
1433 		path_ptr->cep_timeout	   = qpc_path->ack_timeout;
1434 
1435 		/* Get the common alternate address path fields */
1436 		qpc_alt_path = &qpc->alt_addr_path;
1437 		alt_path_ptr = &rc->rc_alt_path;
1438 		tavor_get_addr_path(state, qpc_alt_path,
1439 		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);
1440 
1441 		/* Fill in the additional alternate address path fields */
1442 		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
1443 		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;
1444 		alt_path_ptr->cep_timeout	= qpc_alt_path->ack_timeout;
1445 
1446 		/* Get the RNR retry time from primary path */
1447 		rc->rc_rnr_retry_cnt = qpc_path->rnr_retry;
1448 
1449 		/* Set the enable flags based on RDMA/Atomic enable bits */
1450 		enable_flags = IBT_CEP_NO_FLAGS;
1451 		enable_flags |= ((qpc->rre == 0) ? 0 : IBT_CEP_RDMA_RD);
1452 		enable_flags |= ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
1453 		enable_flags |= ((qpc->rae == 0) ? 0 : IBT_CEP_ATOMIC);
1454 		attr_p->qp_info.qp_flags = enable_flags;
1455 
1456 		attr_p->qp_info.qp_trans = IBT_RC_SRV;
1457 
1458 	} else if (qp->qp_serv_type == TAVOR_QP_UC) {
1459 
1460 		/* Fill in the UC-specific info */
1461 		uc = &attr_p->qp_info.qp_transport.uc;
1462 		uc->uc_sq_psn	= qpc->next_snd_psn;
1463 		uc->uc_rq_psn	= qpc->next_rcv_psn;
1464 		uc->uc_dst_qpn	= qpc->rem_qpn;
1465 
1466 		/* Grab the path migration state information */
1467 		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
1468 			uc->uc_mig_state = IBT_STATE_MIGRATED;
1469 		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
1470 			uc->uc_mig_state = IBT_STATE_REARMED;
1471 		} else {
1472 			uc->uc_mig_state = IBT_STATE_ARMED;
1473 		}
1474 		uc->uc_path_mtu = qpc->mtu;
1475 
1476 		/* Get the common primary address path fields */
1477 		qpc_path = &qpc->pri_addr_path;
1478 		path_ptr = &uc->uc_path;
1479 		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
1480 		    TAVOR_ADDRPATH_QP, qp);
1481 
1482 		/* Fill in the additional primary address path fields */
1483 		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
1484 		path_ptr->cep_hca_port_num = qpc_path->portnum;
1485 
1486 		/* Get the common alternate address path fields */
1487 		qpc_alt_path = &qpc->alt_addr_path;
1488 		alt_path_ptr = &uc->uc_alt_path;
1489 		tavor_get_addr_path(state, qpc_alt_path,
1490 		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);
1491 
1492 		/* Fill in the additional alternate address path fields */
1493 		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
1494 		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;
1495 
1496 		/*
1497 		 * Set the enable flags based on RDMA enable bits (by
1498 		 * definition UC doesn't support Atomic or RDMA Read)
1499 		 */
1500 		enable_flags = ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
1501 		attr_p->qp_info.qp_flags = enable_flags;
1502 
1503 		attr_p->qp_info.qp_trans = IBT_UC_SRV;
1504 
1505 	} else {
1506 		TAVOR_WARNING(state, "unexpected QP transport type");
1507 		mutex_exit(&qp->qp_lock);
1508 		return (ibc_get_ci_failure(0));
1509 	}
1510 
1511 	/*
1512 	 * Under certain circumstances it is possible for the Tavor hardware
1513 	 * to transition to one of the error states without software directly
1514 	 * knowing about it.  The QueryQP() call is the one place where we
1515 	 * have an opportunity to sample and update our view of the QP state.
1516 	 */
1517 	if (qpc->state == TAVOR_QP_SQERR) {
1518 		attr_p->qp_info.qp_state = IBT_STATE_SQE;
1519 		qp->qp_state = TAVOR_QP_SQERR;
1520 	}
1521 	if (qpc->state == TAVOR_QP_ERR) {
1522 		attr_p->qp_info.qp_state = IBT_STATE_ERROR;
1523 		qp->qp_state = TAVOR_QP_ERR;
1524 	}
1525 	mutex_exit(&qp->qp_lock);
1526 
1527 	TAVOR_TNF_EXIT(tavor_qp_query);
1528 	return (DDI_SUCCESS);
1529 }
1530 
1531 
1532 /*
1533  * tavor_qp_create_qpn()
1534  *    Context: Can be called from interrupt or base context.
1535  */
1536 static int
1537 tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp, tavor_rsrc_t *qpc)
1538 {
1539 	tavor_qpn_entry_t	query;
1540 	tavor_qpn_entry_t	*entry;
1541 	avl_index_t		where;
1542 
1543 	TAVOR_TNF_ENTER(tavor_qp_create_qpn);
1544 
1545 	/*
1546 	 * Build a query (for the AVL tree lookup) and attempt to find
1547 	 * a previously added entry that has a matching QPC index.  If
1548 	 * no matching entry is found, then allocate, initialize, and
1549 	 * add an entry to the AVL tree.
1550 	 * If a matching entry is found, then increment its QPN counter
1551 	 * and reference counter.
1552 	 */
1553 	query.qpn_indx = qpc->tr_indx;
1554 	mutex_enter(&state->ts_qpn_avl_lock);
1555 	entry = (tavor_qpn_entry_t *)avl_find(&state->ts_qpn_avl,
1556 	    &query, &where);
1557 	if (entry == NULL) {
1558 		/*
1559 		 * Allocate and initialize a QPN entry, then insert
1560 		 * it into the AVL tree.
1561 		 */
1562 		entry = (tavor_qpn_entry_t *)kmem_zalloc(
1563 		    sizeof (tavor_qpn_entry_t), KM_NOSLEEP);
1564 		if (entry == NULL) {
1565 			mutex_exit(&state->ts_qpn_avl_lock);
1566 			TAVOR_TNF_EXIT(tavor_qp_create_qpn);
1567 			return (DDI_FAILURE);
1568 		}
1569 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*entry))
1570 
1571 		entry->qpn_indx	   = qpc->tr_indx;
1572 		entry->qpn_refcnt  = 0;
1573 		entry->qpn_counter = 0;
1574 
1575 		avl_insert(&state->ts_qpn_avl, entry, where);
1576 	}
1577 
1578 	/*
1579 	 * Make the AVL tree entry point to the QP context resource that
1580 	 * it will be responsible for tracking
1581 	 */
1582 	entry->qpn_qpc = qpc;
1583 
1584 	/*
1585 	 * Setup the QP handle to point to the AVL tree entry.  Then
1586 	 * generate the new QP number from the entry's QPN counter value
1587 	 * and the hardware's QP context table index.
1588 	 */
1589 	qp->qp_qpn_hdl	= entry;
1590 	qp->qp_qpnum	= ((entry->qpn_counter <<
1591 	    state->ts_cfg_profile->cp_log_num_qp) | qpc->tr_indx) &
1592 	    TAVOR_QP_MAXNUMBER_MSK;
1593 
1594 	/*
1595 	 * Increment the reference counter and QPN counter.  The QPN
1596 	 * counter always indicates the next available number for use.
1597 	 */
1598 	entry->qpn_counter++;
1599 	entry->qpn_refcnt++;
1600 
1601 	mutex_exit(&state->ts_qpn_avl_lock);
1602 	TAVOR_TNF_EXIT(tavor_qp_create_qpn);
1603 	return (DDI_SUCCESS);
1604 }
1605 
1606 
1607 /*
1608  * tavor_qp_release_qpn()
1609  *    Context: Can be called only from user or kernel context.
1610  */
1611 void
1612 tavor_qp_release_qpn(tavor_state_t *state, tavor_qpn_entry_t *entry, int flags)
1613 {
1614 	TAVOR_TNF_ENTER(tavor_qp_release_qpn);
1615 
1616 	ASSERT(entry != NULL);
1617 
1618 	mutex_enter(&state->ts_qpn_avl_lock);
1619 
1620 	/*
1621 	 * If we are releasing the QP number here, then we decrement the
1622 	 * reference count and check for zero references.  If there are
1623 	 * zero references, then we free the QPC context (if it hadn't
1624 	 * already been freed during a TAVOR_QPN_FREE_ONLY free, i.e. for
1625 	 * reuse with another similar QP number) and remove the tracking
1626 	 * structure from the QP number AVL tree and free the structure.
1627 	 * If we are not releasing the QP number here, then, as long as we
1628 	 * have not exhausted the usefulness of the QPC context (that is,
1629 	 * re-used it too many times without the reference count having
1630 	 * gone to zero), we free up the QPC context for use by another
1631 	 * thread (which will use it to construct a different QP number
1632 	 * from the same QPC table index).
1633 	 */
1634 	if (flags == TAVOR_QPN_RELEASE) {
1635 		entry->qpn_refcnt--;
1636 
1637 		/*
1638 		 * If the reference count is zero, then we free the QPC
1639 		 * context (if it hadn't already been freed in an early
1640 		 * step, e.g. TAVOR_QPN_FREE_ONLY) and remove/free the
1641 		 * tracking structure from the QP number AVL tree.
1642 		 */
1643 		if (entry->qpn_refcnt == 0) {
1644 			if (entry->qpn_qpc != NULL) {
1645 				tavor_rsrc_free(state, &entry->qpn_qpc);
1646 			}
1647 
1648 			/*
1649 			 * If the current entry has served it's useful
1650 			 * purpose (i.e. been reused the maximum allowable
1651 			 * number of times), then remove it from QP number
1652 			 * AVL tree and free it up.
1653 			 */
1654 			if (entry->qpn_counter >= (1 <<
1655 			    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
1656 				avl_remove(&state->ts_qpn_avl, entry);
1657 				kmem_free(entry, sizeof (tavor_qpn_entry_t));
1658 			}
1659 		}
1660 
1661 	} else if (flags == TAVOR_QPN_FREE_ONLY) {
1662 		/*
1663 		 * Even if we are not freeing the QP number, that will not
1664 		 * always prevent us from releasing the QPC context.  In fact,
1665 		 * since the QPC context only forms part of the whole QPN,
1666 		 * we want to free it up for use by other consumers.  But
1667 		 * if the reference count is non-zero (which it will always
1668 		 * be when we are doing TAVOR_QPN_FREE_ONLY) and the counter
1669 		 * has reached its maximum value, then we cannot reuse the
1670 		 * QPC context until the reference count eventually reaches
1671 		 * zero (in TAVOR_QPN_RELEASE, above).
1672 		 */
1673 		if (entry->qpn_counter < (1 <<
1674 		    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
1675 			tavor_rsrc_free(state, &entry->qpn_qpc);
1676 		}
1677 	}
1678 	mutex_exit(&state->ts_qpn_avl_lock);
1679 
1680 	TAVOR_TNF_EXIT(tavor_qp_release_qpn);
1681 }
1682 
1683 
1684 /*
1685  * tavor_qpn_db_compare()
1686  *    Context: Can be called from user or kernel context.
1687  */
1688 static int
1689 tavor_qpn_avl_compare(const void *q, const void *e)
1690 {
1691 	tavor_qpn_entry_t	*entry, *query;
1692 
1693 	TAVOR_TNF_ENTER(tavor_qpn_avl_compare);
1694 
1695 	entry = (tavor_qpn_entry_t *)e;
1696 	query = (tavor_qpn_entry_t *)q;
1697 
1698 	if (query->qpn_indx < entry->qpn_indx) {
1699 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
1700 		return (-1);
1701 	} else if (query->qpn_indx > entry->qpn_indx) {
1702 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
1703 		return (+1);
1704 	} else {
1705 		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
1706 		return (0);
1707 	}
1708 }
1709 
1710 
1711 /*
1712  * tavor_qpn_avl_init()
1713  *    Context: Only called from attach() path context
1714  */
1715 void
1716 tavor_qpn_avl_init(tavor_state_t *state)
1717 {
1718 	TAVOR_TNF_ENTER(tavor_qpn_avl_init);
1719 
1720 	/* Initialize the lock used for QP number (QPN) AVL tree access */
1721 	mutex_init(&state->ts_qpn_avl_lock, NULL, MUTEX_DRIVER,
1722 	    DDI_INTR_PRI(state->ts_intrmsi_pri));
1723 
1724 	/* Initialize the AVL tree for the QP number (QPN) storage */
1725 	avl_create(&state->ts_qpn_avl, tavor_qpn_avl_compare,
1726 	    sizeof (tavor_qpn_entry_t),
1727 	    offsetof(tavor_qpn_entry_t, qpn_avlnode));
1728 
1729 	TAVOR_TNF_EXIT(tavor_qpn_avl_init);
1730 }
1731 
1732 
1733 /*
1734  * tavor_qpn_avl_fini()
1735  *    Context: Only called from attach() and/or detach() path contexts
1736  */
1737 void
1738 tavor_qpn_avl_fini(tavor_state_t *state)
1739 {
1740 	tavor_qpn_entry_t	*entry;
1741 	void			*cookie;
1742 
1743 	TAVOR_TNF_ENTER(tavor_qpn_avl_fini);
1744 
1745 	/*
1746 	 * Empty all entries (if necessary) and destroy the AVL tree
1747 	 * that was used for QP number (QPN) tracking.
1748 	 */
1749 	cookie = NULL;
1750 	while ((entry = (tavor_qpn_entry_t *)avl_destroy_nodes(
1751 	    &state->ts_qpn_avl, &cookie)) != NULL) {
1752 		kmem_free(entry, sizeof (tavor_qpn_entry_t));
1753 	}
1754 	avl_destroy(&state->ts_qpn_avl);
1755 
1756 	/* Destroy the lock used for QP number (QPN) AVL tree access */
1757 	mutex_destroy(&state->ts_qpn_avl_lock);
1758 
1759 	TAVOR_TNF_EXIT(tavor_qpn_avl_fini);
1760 }
1761 
1762 
1763 /*
1764  * tavor_qphdl_from_qpnum()
1765  *    Context: Can be called from interrupt or base context.
1766  *
1767  *    This routine is important because changing the unconstrained
1768  *    portion of the QP number is critical to the detection of a
1769  *    potential race condition in the QP event handler code (i.e. the case
1770  *    where a QP is freed and alloc'd again before an event for the
1771  *    "old" QP can be handled).
1772  *
1773  *    While this is not a perfect solution (not sure that one exists)
1774  *    it does help to mitigate the chance that this race condition will
1775  *    cause us to deliver a "stale" event to the new QP owner.  Note:
1776  *    this solution does not scale well because the number of constrained
1777  *    bits increases (and, hence, the number of unconstrained bits
1778  *    decreases) as the number of supported QPs grows.  For small and
1779  *    intermediate values, it should hopefully provide sufficient
1780  *    protection.
1781  */
1782 tavor_qphdl_t
1783 tavor_qphdl_from_qpnum(tavor_state_t *state, uint_t qpnum)
1784 {
1785 	uint_t	qpindx, qpmask;
1786 
1787 	/* Calculate the QP table index from the qpnum */
1788 	qpmask = (1 << state->ts_cfg_profile->cp_log_num_qp) - 1;
1789 	qpindx = qpnum & qpmask;
1790 	return (state->ts_qphdl[qpindx]);
1791 }
1792 
1793 
1794 /*
1795  * tavor_special_qp_rsrc_alloc
1796  *    Context: Can be called from interrupt or base context.
1797  */
1798 static int
1799 tavor_special_qp_rsrc_alloc(tavor_state_t *state, ibt_sqp_type_t type,
1800     uint_t port, tavor_rsrc_t **qp_rsrc)
1801 {
1802 	uint_t		mask, flags;
1803 	int		status;
1804 
1805 	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_alloc);
1806 
1807 	mutex_enter(&state->ts_spec_qplock);
1808 	flags = state->ts_spec_qpflags;
1809 	if (type == IBT_SMI_SQP) {
1810 		/*
1811 		 * Check here to see if the driver has been configured
1812 		 * to instruct the Tavor firmware to handle all incoming
1813 		 * SMP messages (i.e. messages sent to SMA).  If so,
1814 		 * then we will treat QP0 as if it has already been
1815 		 * allocated (for internal use).  Otherwise, if we allow
1816 		 * the allocation to happen, it will cause unexpected
1817 		 * behaviors (e.g. Tavor SMA becomes unresponsive).
1818 		 */
1819 		if (state->ts_cfg_profile->cp_qp0_agents_in_fw != 0) {
1820 			mutex_exit(&state->ts_spec_qplock);
1821 			TNF_PROBE_0(tavor_special_qp0_alloc_already_in_fw,
1822 			    TAVOR_TNF_ERROR, "");
1823 			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
1824 			return (IBT_QP_IN_USE);
1825 		}
1826 
1827 		/*
1828 		 * If this is the first QP0 allocation, then post
1829 		 * a CONF_SPECIAL_QP firmware command
1830 		 */
1831 		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
1832 			status = tavor_conf_special_qp_cmd_post(state,
1833 			    state->ts_spec_qp0->tr_indx, TAVOR_CMD_QP_SMI,
1834 			    TAVOR_CMD_NOSLEEP_SPIN);
1835 			if (status != TAVOR_CMD_SUCCESS) {
1836 				mutex_exit(&state->ts_spec_qplock);
1837 				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
1838 				    "command failed: %08x\n", status);
1839 				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
1840 				    TAVOR_TNF_ERROR, "", tnf_uint, status,
1841 				    status);
1842 				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
1843 				return (IBT_INSUFF_RESOURCE);
1844 			}
1845 		}
1846 
1847 		/*
1848 		 * Now check (and, if necessary, modify) the flags to indicate
1849 		 * whether the allocation was successful
1850 		 */
1851 		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
1852 		if (flags & mask) {
1853 			mutex_exit(&state->ts_spec_qplock);
1854 			TNF_PROBE_1(tavor_ts_spec_qp0_alloc_already,
1855 			    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
1856 			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
1857 			return (IBT_QP_IN_USE);
1858 		}
1859 		state->ts_spec_qpflags |= mask;
1860 		*qp_rsrc = state->ts_spec_qp0;
1861 
1862 	} else {
1863 		/*
1864 		 * If this is the first QP1 allocation, then post
1865 		 * a CONF_SPECIAL_QP firmware command
1866 		 */
1867 		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
1868 			status = tavor_conf_special_qp_cmd_post(state,
1869 			    state->ts_spec_qp1->tr_indx, TAVOR_CMD_QP_GSI,
1870 			    TAVOR_CMD_NOSLEEP_SPIN);
1871 			if (status != TAVOR_CMD_SUCCESS) {
1872 				mutex_exit(&state->ts_spec_qplock);
1873 				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
1874 				    "command failed: %08x\n", status);
1875 				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
1876 				    TAVOR_TNF_ERROR, "", tnf_uint, status,
1877 				    status);
1878 				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
1879 				return (IBT_INSUFF_RESOURCE);
1880 			}
1881 		}
1882 
1883 		/*
1884 		 * Now check (and, if necessary, modify) the flags to indicate
1885 		 * whether the allocation was successful
1886 		 */
1887 		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
1888 		if (flags & mask) {
1889 			mutex_exit(&state->ts_spec_qplock);
1890 			TNF_PROBE_0(tavor_ts_spec_qp1_alloc_already,
1891 			    TAVOR_TNF_ERROR, "");
1892 			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
1893 			return (IBT_QP_IN_USE);
1894 		}
1895 		state->ts_spec_qpflags |= mask;
1896 		*qp_rsrc = state->ts_spec_qp1;
1897 	}
1898 
1899 	mutex_exit(&state->ts_spec_qplock);
1900 	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
1901 	return (DDI_SUCCESS);
1902 }
1903 
1904 
1905 /*
1906  * tavor_special_qp_rsrc_free
1907  *    Context: Can be called from interrupt or base context.
1908  */
1909 static int
1910 tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
1911     uint_t port)
1912 {
1913 	uint_t		mask, flags;
1914 	int		status;
1915 
1916 	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_free);
1917 
1918 	mutex_enter(&state->ts_spec_qplock);
1919 	if (type == IBT_SMI_SQP) {
1920 		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
1921 		state->ts_spec_qpflags &= ~mask;
1922 		flags = state->ts_spec_qpflags;
1923 
1924 		/*
1925 		 * If this is the last QP0 free, then post a CONF_SPECIAL_QP
1926 		 * firmware command
1927 		 */
1928 		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
1929 			status = tavor_conf_special_qp_cmd_post(state, 0,
1930 			    TAVOR_CMD_QP_SMI, TAVOR_CMD_NOSLEEP_SPIN);
1931 			if (status != TAVOR_CMD_SUCCESS) {
1932 				mutex_exit(&state->ts_spec_qplock);
1933 				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
1934 				    "command failed: %08x\n", status);
1935 				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
1936 				    TAVOR_TNF_ERROR, "", tnf_uint, status,
1937 				    status);
1938 				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
1939 				return (ibc_get_ci_failure(0));
1940 			}
1941 		}
1942 	} else {
1943 		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
1944 		state->ts_spec_qpflags &= ~mask;
1945 		flags = state->ts_spec_qpflags;
1946 
1947 		/*
1948 		 * If this is the last QP1 free, then post a CONF_SPECIAL_QP
1949 		 * firmware command
1950 		 */
1951 		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
1952 			status = tavor_conf_special_qp_cmd_post(state, 0,
1953 			    TAVOR_CMD_QP_GSI, TAVOR_CMD_NOSLEEP_SPIN);
1954 			if (status != TAVOR_CMD_SUCCESS) {
1955 				mutex_exit(&state->ts_spec_qplock);
1956 				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
1957 				    "command failed: %08x\n", status);
1958 				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
1959 				    TAVOR_TNF_ERROR, "", tnf_uint, status,
1960 				    status);
1961 				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
1962 				return (ibc_get_ci_failure(0));
1963 			}
1964 		}
1965 	}
1966 
1967 	mutex_exit(&state->ts_spec_qplock);
1968 	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
1969 	return (DDI_SUCCESS);
1970 }
1971 
1972 
1973 /*
1974  * tavor_qp_sgl_to_logwqesz()
1975  *    Context: Can be called from interrupt or base context.
1976  */
1977 static void
1978 tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
1979     tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
1980 {
1981 	uint_t	max_size, log2, actual_sgl;
1982 
1983 	TAVOR_TNF_ENTER(tavor_qp_sgl_to_logwqesz);
1984 
1985 	switch (wq_type) {
1986 	case TAVOR_QP_WQ_TYPE_SENDQ:
1987 		/*
1988 		 * Use requested maximum SGL to calculate max descriptor size
1989 		 * (while guaranteeing that the descriptor size is a
1990 		 * power-of-2 cachelines).
1991 		 */
1992 		max_size = (TAVOR_QP_WQE_MLX_SND_HDRS + (num_sgl << 4));
1993 		log2 = highbit(max_size);
1994 		if (ISP2(max_size)) {
1995 			log2 = log2 - 1;
1996 		}
1997 
1998 		/* Make sure descriptor is at least the minimum size */
1999 		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
2000 
2001 		/* Calculate actual number of SGL (given WQE size) */
2002 		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_SND_HDRS) >> 4;
2003 		break;
2004 
2005 	case TAVOR_QP_WQ_TYPE_RECVQ:
2006 		/*
2007 		 * Same as above (except for Recv WQEs)
2008 		 */
2009 		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
2010 		log2 = highbit(max_size);
2011 		if (ISP2(max_size)) {
2012 			log2 = log2 - 1;
2013 		}
2014 
2015 		/* Make sure descriptor is at least the minimum size */
2016 		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
2017 
2018 		/* Calculate actual number of SGL (given WQE size) */
2019 		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
2020 		break;
2021 
2022 	case TAVOR_QP_WQ_TYPE_SENDMLX_QP0:
2023 		/*
2024 		 * Same as above (except for MLX transport WQEs).  For these
2025 		 * WQEs we have to account for the space consumed by the
2026 		 * "inline" packet headers.  (This is smaller than for QP1
2027 		 * below because QP0 is not allowed to send packets with a GRH.
2028 		 */
2029 		max_size = (TAVOR_QP_WQE_MLX_QP0_HDRS + (num_sgl << 4));
2030 		log2 = highbit(max_size);
2031 		if (ISP2(max_size)) {
2032 			log2 = log2 - 1;
2033 		}
2034 
2035 		/* Make sure descriptor is at least the minimum size */
2036 		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
2037 
2038 		/* Calculate actual number of SGL (given WQE size) */
2039 		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP0_HDRS) >> 4;
2040 		break;
2041 
2042 	case TAVOR_QP_WQ_TYPE_SENDMLX_QP1:
2043 		/*
2044 		 * Same as above.  For these WQEs we again have to account for
2045 		 * the space consumed by the "inline" packet headers.  (This
2046 		 * is larger than for QP0 above because we have to account for
2047 		 * the possibility of a GRH in each packet - and this
2048 		 * introduces an alignment issue that causes us to consume
2049 		 * an additional 8 bytes).
2050 		 */
2051 		max_size = (TAVOR_QP_WQE_MLX_QP1_HDRS + (num_sgl << 4));
2052 		log2 = highbit(max_size);
2053 		if (ISP2(max_size)) {
2054 			log2 = log2 - 1;
2055 		}
2056 
2057 		/* Make sure descriptor is at least the minimum size */
2058 		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);
2059 
2060 		/* Calculate actual number of SGL (given WQE size) */
2061 		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP1_HDRS) >> 4;
2062 		break;
2063 
2064 	default:
2065 		TAVOR_WARNING(state, "unexpected work queue type");
2066 		TNF_PROBE_0(tavor_qp_sgl_to_logwqesz_inv_wqtype_fail,
2067 		    TAVOR_TNF_ERROR, "");
2068 		break;
2069 	}
2070 
2071 	/* Fill in the return values */
2072 	*logwqesz = log2;
2073 	*max_sgl  = min(state->ts_cfg_profile->cp_wqe_real_max_sgl, actual_sgl);
2074 
2075 	TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz);
2076 }
2077