/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * tavor_srq.c
 *    Tavor Shared Receive Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, querying,
 *    modifying and posting shared receive queues.
 */

#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);

/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
	ibt_srq_hdl_t ibt_srqhdl;
	tavor_pdhdl_t pd;
	ibt_srq_sizes_t *sizes;
	ibt_srq_sizes_t *real_sizes;
	tavor_srqhdl_t *srqhdl;
	ibt_srq_flags_t flags;
	tavor_rsrc_t *srqc, *rsrc;
	tavor_hw_srqc_t srqc_entry;
	uint32_t *buf;
	tavor_srqhdl_t srq;
	tavor_umap_db_entry_t *umapdb;
	ibt_mr_attr_t mr_attr;
	tavor_mr_options_t mr_op;
	tavor_mrhdl_t mr;
	uint64_t addr;
	uint64_t value, srq_desc_off;
	uint32_t lkey;
	uint32_t log_srq_size;
	uint32_t uarpg;
	uint_t wq_location, dma_xfer_mode, srq_is_umap;
	int flag, status;
	uint_t max_sgl;
	uint_t wqesz;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))

	/*
	 * Check the "options" flag. Currently this flag tells the driver
	 * whether the SRQ's work queues should come from normal system
	 * memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->srqo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_srq_info_t structure
	 */
	real_sizes = srqinfo->srqi_real_sizes;
	sizes = srqinfo->srqi_sizes;
	pd = srqinfo->srqi_pd;
	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
	flags = srqinfo->srqi_flags;
	srqhdl = srqinfo->srqi_srqhdl;

	/*
	 * Determine whether SRQ is being allocated for userland access or
	 * whether it is being allocated for kernel access. If the SRQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process. Note: If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
	if (srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/*
			 * Nothing has been allocated yet at this point, so
			 * simply return the error (jumping to the deeper
			 * cleanup labels here would free resources that
			 * were never acquired).
			 */
			goto srqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Increase PD refcnt */
	tavor_pd_refcnt_inc(pd);

	/* Allocate an SRQ context entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
	if (status != DDI_SUCCESS) {
		goto srqalloc_fail1;
	}

	/* Allocate the SRQ Handle entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		goto srqalloc_fail2;
	}

	srq = (tavor_srqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))

	srq->srq_srqnum = srqc->tr_indx;	/* just use index */

	/*
	 * If this will be a user-mappable SRQ, then allocate an entry for
	 * the "userland resources database". This will later be added to
	 * the database (after all further SRQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (srq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			status = IBT_INSUFF_RESOURCE;
			goto srqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note: All Tavor SRQs must be a power-of-2 in size. Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(sizes->srq_wr_sz);
	if (ISP2(sizes->srq_wr_sz)) {
		log_srq_size = log_srq_size - 1;
	}
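
	/*
	 * For example: a request for srq_wr_sz = 1000 gives
	 * highbit(1000) = 10, and since 1000 is not a power-of-2 the queue
	 * is rounded up to (1 << 10) = 1024 WQEs. A request for exactly
	 * 1024 gives highbit(1024) = 11, which the ISP2() test drops back
	 * to 10, leaving the size at 1024 as expected.
	 */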

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits). If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		status = IBT_HCA_WR_EXCEEDED;
		goto srqalloc_fail4;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits). If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
	if (sizes->srq_sgl_sz > max_sgl) {
		status = IBT_HCA_SGL_EXCEEDED;
		goto srqalloc_fail4;
	}

	/*
	 * Determine the SRQ's WQE sizes. This depends on the requested
	 * number of SGLs. Note: This also has the side-effect of
	 * calculating the real number of SGLs (for the calculated WQE size)
	 */
	tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
	    TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
	    &srq->srq_wq_sgl);

	/*
	 * Allocate the memory for SRQ work queues. Note: The location from
	 * which we will allocate these work queues has been passed in through
	 * the tavor_srq_options_t structure. Since Tavor work queues are not
	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
	 * queue memory is very important. We used to allocate work queues
	 * (the combined receive and send queues) so that they would be aligned
	 * on their combined size. That alignment guaranteed that they would
	 * never cross the 4GB boundary (Tavor work queues are on the order of
	 * MBs at maximum). Now we are able to relax this alignment constraint
	 * by ensuring that the IB address assigned to the queue memory (as a
	 * result of the tavor_mr_register() call) is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment that
	 * was more restrictive than the system page size. So we avoided this
	 * constraint by passing two alignment values, one for the memory
	 * allocation itself and the other for the DMA handle (for later bind).
	 * This used to cause more memory than necessary to be allocated (in
	 * order to guarantee the more restrictive alignment constraint). But
	 * by guaranteeing the zero-based IB virtual address for the queue, we
	 * are able to conserve this memory.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary. This is required
	 * to make sure that all the resulting IB addresses will start at 0, for
	 * a zero-based queue. By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as when we
	 * perform tavor_srq_modify() operations later.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
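	/*
	 * For example: with srq_wq_log_wqesz = 6 (64-byte WQEs) and
	 * log_srq_size = 10 (1024 WQEs), qa_size works out to
	 * 1024 * 64 = 64KB of work queue memory.
	 */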
	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
	if (srq_is_umap) {
		srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		srq->srq_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		goto srqalloc_fail4;
	}
	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Register the memory for the SRQ work queues. The memory for the SRQ
	 * must be registered in the Tavor TPT tables. This gives us the LKey
	 * to specify in the SRQ context later. Note: If the work queue is to
	 * be allocated from DDR memory, then only a "bypass" mapping is
	 * appropriate. And if the SRQ memory is user-mappable, then we force
	 * DDI_DMA_CONSISTENT mapping. Also, in order to meet the alignment
	 * restriction, we pass the "mro_bind_override_addr" flag in the call
	 * to tavor_mr_register(). This guarantees that the resulting IB vaddr
	 * will be zero-based (modulo the offset into the first page). If we
	 * fail here, we still have the bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len = srq->srq_wqinfo.qa_size;
	mr_attr.mr_as = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq_is_umap) {
		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		goto srqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space. This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
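
	/*
	 * Because "mro_bind_override_addr" forced a zero-based IB virtual
	 * address above, and because the queue buffer is page-aligned,
	 * bi_addr here is just the offset into the first page (zero in this
	 * case). So srq_desc_off is effectively the kernel virtual address
	 * of the queue itself.
	 */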

	/*
	 * Create WQL and Wridlist for use by this SRQ
	 */
	srq->srq_wrid_wql = tavor_wrid_wql_create(state);
	if (srq->srq_wrid_wql == NULL) {
		status = IBT_INSUFF_RESOURCE;
		goto srqalloc_fail6;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))

	srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
	if (srq->srq_wridlist == NULL) {
		status = IBT_INSUFF_RESOURCE;
		goto srqalloc_fail7;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))

	srq->srq_wridlist->wl_srq_en = 1;
	srq->srq_wridlist->wl_free_list_indx = -1;
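
	/*
	 * Setting "wl_free_list_indx" to -1 marks the WRID free list as
	 * empty for now; it is set up for real by the call to
	 * tavor_wrid_list_srq_init() below (in the non-umap case).
	 */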

	/*
	 * Fill in all the return arguments (if necessary). This includes
	 * real queue size and real SGLs.
	 */
	if (real_sizes != NULL) {
		real_sizes->srq_wr_sz = (1 << log_srq_size);
		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
	}

	/*
	 * Fill in the SRQC entry. This is the final step before passing
	 * ownership of the SRQC entry to the Tavor hardware. We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the SRQC. Note: If this SRQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
	srqc_entry.wqe_addr_h = (addr >> 32);
	srqc_entry.next_wqe_addr_l = 0;
	srqc_entry.ds = (wqesz >> 4);
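	/*
	 * Note that the "ds" (descriptor size) field is expressed in
	 * 16-byte chunks, hence the shift by 4: a 64-byte WQE, for example,
	 * is programmed as ds = 4.
	 */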
	srqc_entry.state = TAVOR_SRQ_STATE_HW_OWNER;
	srqc_entry.pd = pd->pd_pdnum;
	srqc_entry.lkey = lkey;
	srqc_entry.wqe_cnt = 0;
	if (srq_is_umap) {
		srqc_entry.uar = uarpg;
	} else {
		srqc_entry.uar = 0;
	}

	/*
	 * Write the SRQC entry to hardware. Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
	 * command). Note: In general, this operation shouldn't fail. But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
	    sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
		    status);
		status = ibc_get_ci_failure(0);
		goto srqalloc_fail8;
	}

	/*
	 * Fill in the rest of the Tavor SRQ handle. We can update
	 * the following fields for use in further operations on the SRQ.
	 */
	srq->srq_srqcrsrcp = srqc;
	srq->srq_rsrcp = rsrc;
	srq->srq_mrhdl = mr;
	srq->srq_refcnt = 0;
	srq->srq_is_umap = srq_is_umap;
	srq->srq_uarpg = (srq->srq_is_umap) ? uarpg : 0;
	srq->srq_umap_dhp = (devmap_cookie_t)NULL;
	srq->srq_pdhdl = pd;
	srq->srq_wq_lastwqeindx = -1;
	srq->srq_wq_bufsz = (1 << log_srq_size);
	srq->srq_wq_buf = buf;
	srq->srq_desc_off = srq_desc_off;
	srq->srq_hdlrarg = (void *)ibt_srqhdl;
	srq->srq_state = 0;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

	/* Determine if later ddi_dma_sync will be necessary */
	srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/*
	 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list. Then fill in the
	 * "srqhdl" and return success
	 */
	ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
	state->ts_srqhdl[srqc->tr_indx] = srq;

	/*
	 * If this is a user-mappable SRQ, then we need to insert the
	 * previously allocated entry into the "userland resources database".
	 * This will allow for later lookup during devmap() (i.e. mmap())
	 * calls.
	 */
	if (srq->srq_is_umap) {
		tavor_umap_db_add(umapdb);
	} else {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
		tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}

	*srqhdl = srq;

	return (status);

	/*
	 * The following is cleanup for all possible failure cases in this
	 * routine
	 */
srqalloc_fail8:
	kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
	    sizeof (tavor_wrid_entry_t));
	kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
	tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
	}
srqalloc_fail5:
	tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
	if (srq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
srqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
	tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
srqalloc_fail:
	return (status);
}


/*
 * tavor_srq_free()
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
	tavor_rsrc_t *srqc, *rsrc;
	tavor_umap_db_entry_t *umapdb;
	uint64_t value;
	tavor_srqhdl_t srq;
	tavor_mrhdl_t mr;
	tavor_pdhdl_t pd;
	tavor_hw_srqc_t srqc_entry;
	uint32_t srqnum;
	uint32_t size;
	uint_t maxprot;
	int status;

	/*
	 * Pull all the necessary information from the Tavor Shared Receive
	 * Queue handle. This is necessary here because the resource for the
	 * SRQ handle is going to be freed up as part of this operation.
	 */
	srq = *srqhdl;
	mutex_enter(&srq->srq_lock);
	srqc = srq->srq_srqcrsrcp;
	rsrc = srq->srq_rsrcp;
	pd = srq->srq_pdhdl;
	mr = srq->srq_mrhdl;
	srqnum = srq->srq_srqnum;

	/*
	 * If there are work queues still associated with the SRQ, then return
	 * an error. Otherwise, we will be holding the SRQ lock.
	 */
	if (srq->srq_refcnt != 0) {
		mutex_exit(&srq->srq_lock);
		return (IBT_SRQ_IN_USE);
	}

	/*
	 * If this was a user-mappable SRQ, then we need to remove its entry
	 * from the "userland resources database". If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
	 * We also need to invalidate the SRQ tracking information for the
	 * user mapping.
	 */
	if (srq->srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC, &value,
		    TAVOR_UMAP_DB_REMOVE, &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (srq->srq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(srq->srq_umap_dhp,
			    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&srq->srq_lock);
				TAVOR_WARNING(state, "failed in SRQ memory "
				    "devmap_devmem_remap()");
				return (ibc_get_ci_failure(0));
			}
			srq->srq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor SRQNum-to-SRQHdl list. This will allow any
	 * in-progress events to detect that the SRQ corresponding to this
	 * number has been freed.
	 */
	state->ts_srqhdl[srqc->tr_indx] = NULL;

	mutex_exit(&srq->srq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));

	/*
	 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
	 * firmware command). If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
		    status);
		return (IBT_FAILURE);
	}

	/*
	 * Deregister the memory for the Shared Receive Queue. If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong. So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
		return (IBT_FAILURE);
	}

	/* Calculate the size and free the wridlist container */
	if (srq->srq_wridlist != NULL) {
		size = (srq->srq_wridlist->wl_size *
		    sizeof (tavor_wrid_entry_t));
		kmem_free(srq->srq_wridlist->wl_wre, size);
		kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));

		/*
		 * Release reference to WQL; If this is the last reference,
		 * this call also has the side effect of freeing up the
		 * 'srq_wrid_wql' memory.
		 */
		tavor_wql_refcnt_dec(srq->srq_wrid_wql);
	}

	/* Free the memory for the SRQ */
	tavor_queue_free(state, &srq->srq_wqinfo);

	/* Free the Tavor SRQ Handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free the SRQC entry resource */
	tavor_rsrc_free(state, &srqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the srqhdl pointer to NULL and return success */
	*srqhdl = NULL;

	return (DDI_SUCCESS);
}


/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
	tavor_qalloc_info_t new_srqinfo, old_srqinfo;
	tavor_rsrc_t *mtt, *mpt, *old_mtt;
	tavor_bind_info_t bind;
	tavor_bind_info_t old_bind;
	tavor_rsrc_pool_info_t *rsrc_pool;
	tavor_mrhdl_t mr;
	tavor_hw_mpt_t mpt_entry;
	tavor_wrid_entry_t *wre_new, *wre_old;
	uint64_t mtt_ddrbaseaddr, mtt_addr;
	uint64_t srq_desc_off;
	uint32_t *buf, srq_old_bufsz;
	uint32_t wqesz;
	uint_t max_srq_size;
	uint_t dma_xfer_mode, mtt_pgsize_bits;
	uint_t srq_sync, log_srq_size, maxprot;
	uint_t wq_location;
	int status;

	/*
	 * Check the "inddr" flag. This flag tells the driver whether the
	 * SRQ's work queues should come from normal system memory or
	 * whether they should be allocated from DDR memory.
	 */
	wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;

	/*
	 * If size requested is larger than device capability, return
	 * Insufficient Resources
	 */
	max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
	if (size > max_srq_size) {
		return (IBT_HCA_WR_EXCEEDED);
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note: All Tavor SRQs must be a power-of-2 in size. Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE. This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	size = max(size, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(size);
	if (ISP2(size)) {
		log_srq_size = log_srq_size - 1;
	}
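
	/*
	 * This mirrors the rounding in tavor_srq_alloc() above: a requested
	 * size of 600, for example, gives highbit(600) = 10 and rounds up
	 * to (1 << 10) = 1024 WQEs.
	 */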

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		status = IBT_HCA_WR_EXCEEDED;
		goto srqmodify_fail;
	}

	/*
	 * Allocate the memory for newly resized Shared Receive Queue.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary. This is required
	 * to make sure that all the resulting IB addresses will start at 0,
	 * for a zero-based queue. By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as it was
	 * when we allocated it at tavor_srq_alloc() time.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
	new_srqinfo.qa_alloc_align = PAGESIZE;
	new_srqinfo.qa_bind_align = PAGESIZE;
	if (srq->srq_is_umap) {
		new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_srqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		goto srqmodify_fail;
	}
	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Allocate the memory for the new WRE list. This will be used later
	 * when we resize the wridlist based on the new SRQ size.
	 */
	wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
	    sizeof (tavor_wrid_entry_t), sleepflag);
	if (wre_new == NULL) {
		status = IBT_INSUFF_RESOURCE;
		goto srqmodify_fail;
	}

	/*
	 * Fill in the "bind" struct. This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below). The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
	bzero(&bind, sizeof (tavor_bind_info_t));
	bind.bi_type = TAVOR_BINDHDL_VADDR;
	bind.bi_addr = (uint64_t)(uintptr_t)buf;
	bind.bi_len = new_srqinfo.qa_size;
	bind.bi_as = NULL;
	bind.bi_flags = ((sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP) | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq->srq_is_umap) {
		bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			bind.bi_bypass =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				bind.bi_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
		}
	}
	status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		goto srqmodify_fail;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space. This will be used when
	 * posting work requests to properly initialize each WQE.
	 *
	 * Note: bind addr is zero-based (from alloc) so we calculate the
	 * correct new offset here.
	 */
	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
	    (uint64_t)bind.bi_addr;
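
	/*
	 * For example: with a 4KB MTT page size (mtt_pgsize_bits = 12), the
	 * mask ((1 << 12) - 1) keeps only the low twelve bits of bi_addr,
	 * i.e. the offset into the first page. Since the queue buffer is
	 * page-aligned, this offset is zero and srq_desc_off again works
	 * out to the kernel virtual address of the queue.
	 */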

	/*
	 * Get the base address for the MTT table. This will be necessary
	 * below when we are modifying the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry. This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware. We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win_len = bind.bi_len;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
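
	/*
	 * The MTT segment address is split across two fields: the upper 32
	 * bits go into "mttseg_addr_h", while "mttseg_addr_l" holds the
	 * address shifted down by 6, consistent with MTT segments being
	 * 64-byte aligned (the segment offset above is computed with
	 * TAVOR_MTT_SIZE_SHIFT).
	 */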

	/*
	 * Now we grab the SRQ lock. Since we will be updating the actual
	 * SRQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "srq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&srq->srq_lock);

	/*
	 * Copy old entries to new buffer
	 */
	srq_old_bufsz = srq->srq_wq_bufsz;
	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);

	/* Determine if later ddi_dma_sync will be necessary */
	srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/* Sync entire "new" SRQ for use by hardware (if necessary) */
	if (srq_sync) {
		(void) ddi_dma_sync(bind.bi_dmahdl, 0,
		    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Setup MPT information for use in the MODIFY_MPT command
	 */
	mr = srq->srq_mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt = srq->srq_mrhdl->mr_mptrsrcp;

	/*
	 * MODIFY_MPT
	 *
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong. So we
	 * print a warning message and return.
	 */
	status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
	    TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
		    status);
		(void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
		    srq->srq_mrhdl->mr_mttrsrcp);
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		mutex_exit(&mr->mr_lock);
		mutex_exit(&srq->srq_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Update the Tavor Shared Receive Queue handle with all the new
	 * information. At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	old_srqinfo = srq->srq_wqinfo;
	old_mtt = srq->srq_mrhdl->mr_mttrsrcp;
	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
	    sizeof (tavor_bind_info_t));

	/* Now set the new info */
	srq->srq_wqinfo = new_srqinfo;
	srq->srq_wq_buf = buf;
	srq->srq_wq_bufsz = (1 << log_srq_size);
	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
	srq->srq_mrhdl->mr_mttrsrcp = mtt;
	srq->srq_desc_off = srq_desc_off;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

	/* Update MR mtt pagesize */
	mr->mr_logmttpgsz = mtt_pgsize_bits;
	mutex_exit(&mr->mr_lock);

#ifdef __lock_lint
	mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * Initialize new wridlist, if needed.
	 *
	 * If a wridlist is already set up on an SRQ (the QP associated with
	 * an SRQ has moved "from_reset") then we must update this wridlist
	 * based on the new SRQ size. We allocate the new size of Work
	 * Request ID Entries, copy over the old entries to the new list, and
	 * re-initialize the srq wridlist in non-umap case
	 */
	wre_old = NULL;
	if (srq->srq_wridlist != NULL) {
		wre_old = srq->srq_wridlist->wl_wre;

		bcopy(wre_old, wre_new, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));

		/* Setup new sizes in wre */
		srq->srq_wridlist->wl_wre = wre_new;
		srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;

		if (!srq->srq_is_umap) {
			tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
			    srq_old_bufsz);
		}
	}

#ifdef __lock_lint
	mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the SRQ memory. We also need to
	 * invalidate the SRQ tracking information for the user mapping.
	 *
	 * Note: The remap really shouldn't ever fail. So, if it does, it is
	 * an indication that something has gone seriously wrong. So we
	 * print a warning message and return error (knowing, of course,
	 * that the "old" SRQ memory will be leaked)
	 */
	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(srq->srq_umap_dhp,
		    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed in SRQ memory "
			    "devmap_devmem_remap()");
			/* We can, however, free the memory for old wre */
			if (wre_old != NULL) {
				kmem_free(wre_old, srq_old_bufsz *
				    sizeof (tavor_wrid_entry_t));
			}
			return (ibc_get_ci_failure(0));
		}
		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the SRQ lock now. The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&srq->srq_lock);

	/*
	 * Unbind the MTT entries.
	 */
	status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to unbind old SRQ memory");
		status = ibc_get_ci_failure(0);
		goto srqmodify_fail;
	}

	/* Free the memory for old wre */
	if (wre_old != NULL) {
		kmem_free(wre_old, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));
	}

	/* Free the memory for the old SRQ */
	tavor_queue_free(state, &old_srqinfo);

	/*
	 * Fill in the return arguments (if necessary). This includes the
	 * real new SRQ size.
	 */
	if (real_size != NULL) {
		*real_size = (1 << log_srq_size);
	}

	return (DDI_SUCCESS);

srqmodify_fail:
	return (status);
}


/*
 * tavor_srq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	srq->srq_refcnt++;
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
{
	mutex_enter(&srq->srq_lock);
	srq->srq_refcnt--;
	mutex_exit(&srq->srq_lock);
}


/*
 * tavor_srqhdl_from_srqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the SRQ number is critical to the detection of a
 *    potential race condition in the SRQ handler code (i.e. the case
 *    where a SRQ is freed and alloc'd again before an event for the
 *    "old" SRQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new SRQ owner. Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported SRQs grows. For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_srqhdl_t
tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
{
	uint_t srqindx, srqmask;

	/* Calculate the SRQ table index from the srqnum */
	srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
	srqindx = srqnum & srqmask;
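	/*
	 * For example: with cp_log_num_srq = 10, srqmask is 0x3ff and an
	 * srqnum of 0x1403 maps to table index 0x003. Only the low
	 * cp_log_num_srq bits index the table; the remaining bits are the
	 * "unconstrained" portion described in the comment above.
	 */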
	return (state->ts_srqhdl[srqindx]);
}


/*
 * tavor_srq_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t max_size, log2, actual_sgl;

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
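		/*
		 * For example: num_sgl = 8 gives max_size of the receive
		 * header plus 128 bytes of SGL space (each SGL entry is 16
		 * bytes, hence the shift by 4). Assuming (for illustration
		 * only) a 16-byte TAVOR_QP_WQE_MLX_RCV_HDRS, max_size = 144
		 * rounds up to a 256-byte WQE, which in turn has room for
		 * (256 - 16) >> 4 = 15 SGL entries.
		 */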
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);
}
