/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_srq.c
 *    Tavor Shared Receive Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, querying,
 *    modifying and posting shared receive queues.
 */

#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);

/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
	ibt_srq_hdl_t		ibt_srqhdl;
	tavor_pdhdl_t		pd;
	ibt_srq_sizes_t		*sizes;
	ibt_srq_sizes_t		*real_sizes;
	tavor_srqhdl_t		*srqhdl;
	ibt_srq_flags_t		flags;
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		*buf;
	tavor_srqhdl_t		srq;
	tavor_umap_db_entry_t	*umapdb;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_mrhdl_t		mr;
	uint64_t		addr;
	uint64_t		value, srq_desc_off;
	uint32_t		lkey;
	uint32_t		log_srq_size;
	uint32_t		uarpg;
	uint_t			wq_location, dma_xfer_mode, srq_is_umap;
	int			flag, status;
	char			*errormsg;
	uint_t			max_sgl;
	uint_t			wqesz;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))

	TAVOR_TNF_ENTER(tavor_srq_alloc);

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether the SRQ's work queues should come from normal system
	 * memory or be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->srqo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_srq_info_t structure
	 */
	real_sizes = srqinfo->srqi_real_sizes;
	sizes	   = srqinfo->srqi_sizes;
	pd	   = srqinfo->srqi_pd;
	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
	flags	   = srqinfo->srqi_flags;
	srqhdl	   = srqinfo->srqi_srqhdl;

	/*
	 * Determine whether SRQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the SRQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
	if (srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/*
			 * Set "status" and "errormsg" and goto failure.
			 * Nothing has been allocated at this point, so jump
			 * to the final label rather than "srqalloc_fail3",
			 * which would free resources this routine has not
			 * yet allocated.
			 */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto srqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Increase PD refcnt */
	tavor_pd_refcnt_inc(pd);

	/* Allocate an SRQ context entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
		goto srqalloc_fail1;
	}

	/* Allocate the SRQ Handle entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
		goto srqalloc_fail2;
	}

	srq = (tavor_srqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))

	srq->srq_srqnum = srqc->tr_indx;	/* just use index */

	/*
	 * If this will be a user-mappable SRQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further SRQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (srq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto srqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(sizes->srq_wr_sz);
	if (ISP2(sizes->srq_wr_sz)) {
		log_srq_size = log_srq_size - 1;
	}
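	/*
	 * Editor's sketch (not from the original source): highbit() returns
	 * the 1-based position of the most significant set bit, so for a
	 * non-power-of-2 request it already yields the rounded-up log2:
	 *
	 *     srq_wr_sz = 1000  ->  highbit() == 10  ->  1024 entries
	 *     srq_wr_sz = 1024  ->  highbit() == 11, ISP2() true  ->  10
	 *
	 * The request sizes here are hypothetical, but the arithmetic
	 * follows directly from the code above.
	 */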

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqalloc_fail4;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
	if (sizes->srq_sgl_sz > max_sgl) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
		goto srqalloc_fail4;
	}

	/*
	 * Determine the SRQ's WQE sizes.  This depends on the requested
	 * number of SGLs.  Note: This also has the side-effect of
	 * calculating the real number of SGLs (for the calculated WQE size)
	 */
	tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
	    TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
	    &srq->srq_wq_sgl);

	/*
	 * Allocate the memory for SRQ work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in through
	 * the tavor_srq_options_t structure.  Since Tavor work queues are not
	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
	 * queue memory is very important.  We used to allocate work queues
	 * (the combined receive and send queues) so that they would be aligned
	 * on their combined size.  That alignment guaranteed that they would
	 * never cross the 4GB boundary (Tavor work queues are on the order of
	 * MBs at maximum).  Now we are able to relax this alignment constraint
	 * by ensuring that the IB address assigned to the queue memory (as a
	 * result of the tavor_mr_register() call) is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment that
	 * was more restrictive than the system page size.  So we avoided this
	 * constraint by passing two alignment values, one for the memory
	 * allocation itself and the other for the DMA handle (for later bind).
	 * This used to cause more memory than necessary to be allocated (in
	 * order to guarantee the more restrictive alignment constraint).  But
	 * by guaranteeing the zero-based IB virtual address for the queue, we
	 * are able to conserve this memory.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0, for
	 * a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as when we
	 * perform tavor_srq_modify() operations later.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
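	/*
	 * Editor's sketch with hypothetical numbers (not from the original
	 * source): for log_srq_size == 10 and srq_wq_log_wqesz == 6 (i.e.
	 * 64-byte WQEs), the queue allocation works out to
	 *
	 *     qa_size = (1 << 10) * (1 << 6) = 1024 * 64 = 64KB
	 *
	 * carved out of page-aligned memory per the PAGESIZE settings above.
	 */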
	if (srq_is_umap) {
		srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		srq->srq_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqalloc_fail4;
	}
	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Register the memory for the SRQ work queues.  The memory for the SRQ
	 * must be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the SRQ context later.  Note: If the work queue is to
	 * be allocated from DDR memory, then only a "bypass" mapping is
	 * appropriate.  And if the SRQ memory is user-mappable, then we force
	 * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
	 * restriction, we pass the "mro_bind_override_addr" flag in the call
	 * to tavor_mr_register().  This guarantees that the resulting IB vaddr
	 * will be zero-based (modulo the offset into the first page).  If we
	 * fail here, we still have the bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
	mr_attr.mr_as    = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq_is_umap) {
		mr_op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto srqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
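	/*
	 * Editor's sketch (hypothetical addresses, not from the original
	 * source): because "mro_bind_override_addr" forces a zero-based IB
	 * vaddr, bi_addr is essentially just the offset into the first
	 * page.  With qa_buf_aligned == 0xffffff0012340000 and bi_addr == 0,
	 * srq_desc_off is the kernel address itself, and a WQE at kernel
	 * address "kva" is later presented to hardware as
	 * (kva - srq_desc_off).
	 */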

	/*
	 * Create WQL and Wridlist for use by this SRQ
	 */
	srq->srq_wrid_wql = tavor_wrid_wql_create(state);
	if (srq->srq_wrid_wql == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
		goto srqalloc_fail6;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))

	srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
	if (srq->srq_wridlist == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
		goto srqalloc_fail7;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))

	srq->srq_wridlist->wl_srq_en = 1;
	srq->srq_wridlist->wl_free_list_indx = -1;

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real queue size and real SGLs.
	 */
	if (real_sizes != NULL) {
		real_sizes->srq_wr_sz = (1 << log_srq_size);
		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
	}

	/*
	 * Fill in the SRQC entry.  This is the final step before passing
	 * ownership of the SRQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the SRQC.  Note: If this SRQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
	srqc_entry.wqe_addr_h	   = (addr >> 32);
	srqc_entry.next_wqe_addr_l = 0;
	srqc_entry.ds		   = (wqesz >> 4);
	srqc_entry.state	   = TAVOR_SRQ_STATE_HW_OWNER;
	srqc_entry.pd		   = pd->pd_pdnum;
	srqc_entry.lkey		   = lkey;
	srqc_entry.wqe_cnt	   = 0;
	if (srq_is_umap) {
		srqc_entry.uar	   = uarpg;
	} else {
		srqc_entry.uar	   = 0;
	}
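	/*
	 * Editor's note (a reading of the code above, not from the original
	 * source): the "ds" field appears to carry the WQE size in 16-byte
	 * units, hence the ">> 4"; a 64-byte WQE, for example, would be
	 * programmed as ds == 4.
	 */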

	/*
	 * Write the SRQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
	    sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
		goto srqalloc_fail8;
	}

	/*
	 * Fill in the rest of the Tavor SRQ handle.  We can update
	 * the following fields for use in further operations on the SRQ.
	 */
	srq->srq_srqcrsrcp = srqc;
	srq->srq_rsrcp	   = rsrc;
	srq->srq_mrhdl	   = mr;
	srq->srq_refcnt	   = 0;
	srq->srq_is_umap   = srq_is_umap;
	srq->srq_uarpg	   = (srq->srq_is_umap) ? uarpg : 0;
	srq->srq_umap_dhp  = (devmap_cookie_t)NULL;
	srq->srq_pdhdl	   = pd;
	srq->srq_wq_lastwqeindx = -1;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	srq->srq_wq_buf	   = buf;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_hdlrarg   = (void *)ibt_srqhdl;
	srq->srq_state	   = 0;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

	/* Determine if later ddi_dma_sync will be necessary */
	srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/*
	 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list.  Then fill in the
	 * "srqhdl" and return success
	 */
	ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
	state->ts_srqhdl[srqc->tr_indx] = srq;

	/*
	 * If this is a user-mappable SRQ, then we need to insert the
	 * previously allocated entry into the "userland resources database".
	 * This will allow for later lookup during devmap() (i.e. mmap())
	 * calls.
	 */
	if (srq->srq_is_umap) {
		tavor_umap_db_add(umapdb);
	} else {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
		tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}

	*srqhdl = srq;

	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
srqalloc_fail8:
	kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
	    sizeof (tavor_wrid_entry_t));
	kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
	tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
	}
srqalloc_fail5:
	tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
	if (srq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
srqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
	tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
srqalloc_fail:
	TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);
}


/*
 * tavor_srq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	uint64_t		value;
	tavor_srqhdl_t		srq;
	tavor_mrhdl_t		mr;
	tavor_pdhdl_t		pd;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		srqnum;
	uint32_t		size;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_srq_free);

	/*
	 * Pull all the necessary information from the Tavor Shared Receive
	 * Queue handle.  This is necessary here because the resource for the
	 * SRQ handle is going to be freed up as part of this operation.
	 */
	srq	= *srqhdl;
	mutex_enter(&srq->srq_lock);
	srqc	= srq->srq_srqcrsrcp;
	rsrc	= srq->srq_rsrcp;
	pd	= srq->srq_pdhdl;
	mr	= srq->srq_mrhdl;
	srqnum	= srq->srq_srqnum;

	/*
	 * If there are QPs still associated with the SRQ (i.e. its
	 * reference count is non-zero), then return an error.  Otherwise
	 * we proceed, still holding the SRQ lock.
	 */
	if (srq->srq_refcnt != 0) {
		mutex_exit(&srq->srq_lock);
		TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, srq->srq_refcnt);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_SRQ_IN_USE);
	}

	/*
	 * If this was a user-mappable SRQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
	 * We also need to invalidate the SRQ tracking information for the
	 * user mapping.
	 */
	if (srq->srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
		    MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_srq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (srq->srq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(srq->srq_umap_dhp,
			    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&srq->srq_lock);
				TAVOR_WARNING(state, "failed in SRQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_srq_free);
				return (ibc_get_ci_failure(0));
			}
			srq->srq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
	 * in-progress events to detect that the SRQ corresponding to this
	 * number has been freed.
	 */
	state->ts_srqhdl[srqc->tr_indx] = NULL;

	mutex_exit(&srq->srq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));

	/*
	 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/*
	 * Deregister the memory for the Shared Receive Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
		TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/* Calculate the size and free the wridlist container */
	if (srq->srq_wridlist != NULL) {
		size = (srq->srq_wridlist->wl_size *
		    sizeof (tavor_wrid_entry_t));
		kmem_free(srq->srq_wridlist->wl_wre, size);
		kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));

		/*
		 * Release reference to WQL; If this is the last reference,
		 * this call also has the side effect of freeing up the
		 * 'srq_wrid_wql' memory.
		 */
		tavor_wql_refcnt_dec(srq->srq_wrid_wql);
	}

	/* Free the memory for the SRQ */
	tavor_queue_free(state, &srq->srq_wqinfo);

	/* Free the Tavor SRQ Handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free the SRQC entry resource */
	tavor_rsrc_free(state, &srqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the srqhdl pointer to NULL and return success */
	*srqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_srq_free);
	return (DDI_SUCCESS);
}


/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
	tavor_qalloc_info_t	new_srqinfo, old_srqinfo;
	tavor_rsrc_t		*mtt, *mpt, *old_mtt;
	tavor_bind_info_t	bind;
	tavor_bind_info_t	old_bind;
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_mrhdl_t		mr;
	tavor_hw_mpt_t		mpt_entry;
	tavor_wrid_entry_t	*wre_new, *wre_old;
	uint64_t		mtt_ddrbaseaddr, mtt_addr;
	uint64_t		srq_desc_off;
	uint32_t		*buf, srq_old_bufsz;
	uint32_t		wqesz;
	uint_t			max_srq_size;
	uint_t			dma_xfer_mode, mtt_pgsize_bits;
	uint_t			srq_sync, log_srq_size, maxprot;
	uint_t			wq_location;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_srq_modify);

	/*
	 * Check the "inddr" flag.  This flag tells the driver whether the
	 * SRQ's work queues should come from normal system memory or be
	 * allocated from DDR memory.
	 */
	wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;

	/*
	 * If size requested is larger than device capability, return
	 * Insufficient Resources
	 */
	max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
	if (size > max_srq_size) {
		TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_modify);
		return (IBT_HCA_WR_EXCEEDED);
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	size = max(size, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(size);
	if (ISP2(size)) {
		log_srq_size = log_srq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqmodify_fail;
	}

	/*
	 * Allocate the memory for newly resized Shared Receive Queue.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0,
	 * for a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as it was
	 * when we allocated it at tavor_srq_alloc() time.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
	new_srqinfo.qa_alloc_align = PAGESIZE;
	new_srqinfo.qa_bind_align  = PAGESIZE;
	if (srq->srq_is_umap) {
		new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_srqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqmodify_fail;
	}
	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Allocate the memory for the new WRE list.  This will be used later
	 * when we resize the wridlist based on the new SRQ size.
	 */
	wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
	    sizeof (tavor_wrid_entry_t), sleepflag);
	if (wre_new == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
		    "failed wre_new alloc");
		goto srqmodify_fail;
	}

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
	bzero(&bind, sizeof (tavor_bind_info_t));
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = (uint64_t)(uintptr_t)buf;
	bind.bi_len   = new_srqinfo.qa_size;
	bind.bi_as    = NULL;
	bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq->srq_is_umap) {
		bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			bind.bi_bypass =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				bind.bi_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
		}
	}
	status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed mtt bind");
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		goto srqmodify_fail;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 *
	 * Note: bind addr is zero-based (from alloc) so we calculate the
	 * correct new offset here.
	 */
	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
	    (uint64_t)bind.bi_addr;
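	/*
	 * Editor's sketch (illustrative numbers, not from the original
	 * source): the mask keeps only the intra-page offset, so with
	 * mtt_pgsize_bits == 12 the operation amounts to bi_addr &= 0xFFF.
	 * The rebound queue is therefore zero-based modulo the page offset,
	 * exactly as it was at tavor_srq_alloc() time.
	 */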

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * below when we are modifying the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win_len	= bind.bi_len;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
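	/*
	 * Editor's note (an inference from the shifts above, not from the
	 * original source): the MTT segment address appears to be split
	 * with its high 32 bits in mttseg_addr_h and its low bits stored
	 * in 64-byte units (">> 6") in mttseg_addr_l, which assumes the
	 * segment address is 64-byte aligned.
	 */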

	/*
	 * Now we grab the SRQ lock.  Since we will be updating the actual
	 * SRQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "srq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&srq->srq_lock);

	/*
	 * Copy old entries to new buffer
	 */
	srq_old_bufsz = srq->srq_wq_bufsz;
	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);

	/* Determine if later ddi_dma_sync will be necessary */
	srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/* Sync entire "new" SRQ for use by hardware (if necessary) */
	if (srq_sync) {
		(void) ddi_dma_sync(bind.bi_dmahdl, 0,
		    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Setup MPT information for use in the MODIFY_MPT command
	 */
	mr = srq->srq_mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt = srq->srq_mrhdl->mr_mptrsrcp;

	/*
	 * MODIFY_MPT
	 *
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong.  So we
	 * print a warning message and return.
	 */
	status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
	    TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
		(void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
		    srq->srq_mrhdl->mr_mttrsrcp);
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		mutex_exit(&mr->mr_lock);
		mutex_exit(&srq->srq_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Update the Tavor Shared Receive Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	old_srqinfo	   = srq->srq_wqinfo;
	old_mtt		   = srq->srq_mrhdl->mr_mttrsrcp;
	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
	    sizeof (tavor_bind_info_t));

	/* Now set the new info */
	srq->srq_wqinfo	   = new_srqinfo;
	srq->srq_wq_buf	   = buf;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
	srq->srq_mrhdl->mr_mttrsrcp = mtt;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

	/* Update MR mtt pagesize */
	mr->mr_logmttpgsz = mtt_pgsize_bits;
	mutex_exit(&mr->mr_lock);

#ifdef __lock_lint
	mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * Initialize new wridlist, if needed.
	 *
	 * If a wridlist is already set up on an SRQ (i.e. the QP associated
	 * with an SRQ has moved "from_reset"), then we must update this
	 * wridlist based on the new SRQ size.  We allocate the new size of
	 * Work Request ID Entries, copy over the old entries to the new
	 * list, and re-initialize the srq wridlist in the non-umap case.
	 */
	wre_old = NULL;
	if (srq->srq_wridlist != NULL) {
		wre_old = srq->srq_wridlist->wl_wre;

		bcopy(wre_old, wre_new, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));

		/* Setup new sizes in wre */
		srq->srq_wridlist->wl_wre = wre_new;
		srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;

		if (!srq->srq_is_umap) {
			tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
			    srq_old_bufsz);
		}
	}

#ifdef __lock_lint
	mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the SRQ memory.  We also need to
	 * invalidate the SRQ tracking information for the user mapping.
	 *
	 * Note: On failure, the remap really shouldn't ever happen.  So, if it
	 * does, it is an indication that something has gone seriously wrong.
	 * So we print a warning message and return error (knowing, of course,
	 * that the "old" SRQ memory will be leaked)
	 */
	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(srq->srq_umap_dhp,
		    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed in SRQ memory "
			    "devmap_devmem_remap()");
			/* We can, however, free the memory for old wre */
			if (wre_old != NULL) {
				kmem_free(wre_old, srq_old_bufsz *
				    sizeof (tavor_wrid_entry_t));
			}
			TAVOR_TNF_EXIT(tavor_srq_modify);
			return (ibc_get_ci_failure(0));
		}
		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the SRQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&srq->srq_lock);

	/*
	 * Unbind the MTT entries.
	 */
	status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to unbind old SRQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed to unbind (old)");
		goto srqmodify_fail;
	}

	/* Free the memory for old wre */
	if (wre_old != NULL) {
		kmem_free(wre_old, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));
	}

	/* Free the memory for the old SRQ */
	tavor_queue_free(state, &old_srqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new SRQ size.
	 */
	if (real_size != NULL) {
		*real_size = (1 << log_srq_size);
	}

	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (DDI_SUCCESS);

srqmodify_fail:
	TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (status);
}


/*
 * tavor_srq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, srq->srq_refcnt);
	srq->srq_refcnt++;
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	srq->srq_refcnt--;
	TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
	    tnf_uint, refcnt, srq->srq_refcnt);
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srqhdl_from_srqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the SRQ number is critical to the detection of a
 *    potential race condition in the SRQ handler code (i.e. the case
 *    where a SRQ is freed and alloc'd again before an event for the
 *    "old" SRQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new SRQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported SRQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_srqhdl_t
tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
{
	uint_t	srqindx, srqmask;

	/* Calculate the SRQ table index from the srqnum */
	srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
	srqindx = srqnum & srqmask;
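	/*
	 * Editor's sketch with hypothetical values (not from the original
	 * source): with cp_log_num_srq == 16 the mask is 0xFFFF, so an
	 * event reporting srqnum 0x12345 indexes table slot 0x2345; the
	 * upper (unconstrained) bits serve only to distinguish successive
	 * reuses of the same slot.
	 */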
	return (state->ts_srqhdl[srqindx]);
}


/*
 * tavor_srq_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
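		/*
		 * Editor's sketch with a hypothetical header size (not from
		 * the original source): each SGE occupies 16 bytes, hence
		 * the "<< 4".  If TAVOR_QP_WQE_MLX_RCV_HDRS were 32, then
		 * num_sgl == 8 gives max_size = 32 + 128 = 160, which rounds
		 * up to a 256-byte WQE (log2 == 8), leaving room for
		 * actual_sgl = (256 - 32) >> 4 = 14 SGEs.
		 */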
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
		    TAVOR_TNF_ERROR, "");
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl  = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);

	TAVOR_TNF_EXIT(tavor_srq_sgl_to_logwqesz);
}