/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * tavor_mr.c
 *    Tavor Memory Region/Window Routines
 *
 *    Implements all the routines necessary to provide the requisite memory
 *    registration verbs.  These include operations like RegisterMemRegion(),
 *    DeregisterMemRegion(), ReregisterMemRegion(), RegisterSharedMemRegion(),
 *    etc., that affect Memory Regions.  It also includes the verbs that
 *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
 *    and QueryMemWindow().
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/esunddi.h>

#include <sys/ib/adapters/tavor/tavor.h>


/*
 * Used by tavor_mr_keycalc() below to fill in the "unconstrained" portion
 * of Tavor memory keys (LKeys and RKeys)
 */
static uint_t tavor_debug_memkey_cnt = 0x00000000;

static int tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
    tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op);
static int tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op);
static int tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
    uint_t sleep, uint_t *dereg_level);
static uint64_t tavor_mr_nummtt_needed(tavor_state_t *state,
    tavor_bind_info_t *bind, uint_t *mtt_pgsize);
static int tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
    ddi_dma_handle_t dmahdl, uint_t sleep);
static void tavor_mr_mem_unbind(tavor_state_t *state,
    tavor_bind_info_t *bind);
static int tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
    uint32_t mtt_pgsize_bits);
static int tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc);
static int tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc);

/*
 * The Tavor umem_lockmemory() callback ops.  When userland memory is
 * registered, these callback ops are specified.  The tavor_umap_umemlock_cb()
 * callback will be called whenever the memory for the corresponding
 * ddi_umem_cookie_t is being freed.
 */
static struct umem_callback_ops tavor_umem_cbops = {
	UMEM_CALLBACK_VERSION,
	tavor_umap_umemlock_cb,
};


/*
 * tavor_mr_register()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = mr_attr->mr_vaddr;
	bind.bi_len   = mr_attr->mr_len;
	bind.bi_as    = mr_attr->mr_as;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);

	return (status);
}
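
/*
 * For illustration only, a minimal sketch of how a caller might invoke the
 * routine above (the variable names and flag combination are hypothetical,
 * not taken from any real consumer):
 *
 *	ibt_mr_attr_t	attr;
 *	tavor_mrhdl_t	mrhdl;
 *
 *	attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)kbuf;
 *	attr.mr_len   = len;
 *	attr.mr_as    = NULL;		(kernel memory; see the userland
 *					check in tavor_mr_common_reg())
 *	attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE | IBT_MR_NOSLEEP;
 *	status = tavor_mr_register(state, pd, &attr, &mrhdl, NULL);
 */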


/*
 * tavor_mr_register_buf()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register_buf(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_smr_attr_t *mr_attr, struct buf *buf, tavor_mrhdl_t *mrhdl,
    tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (see above) and a "buf" binding (as is the case
	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.  Note: We have chosen to provide
	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
	 * not set).  It is not critical what value we choose here as it need
	 * only be unique for the given RKey (which will happen by default),
	 * so the choice here is somewhat arbitrary.
	 */
	bind.bi_type = TAVOR_BINDHDL_BUF;
	bind.bi_buf  = buf;
	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
		bind.bi_addr = mr_attr->mr_vaddr;
	} else {
		bind.bi_addr = (uint64_t)(uintptr_t)buf->b_un.b_addr;
	}
	bind.bi_as    = NULL;
	bind.bi_len   = (uint64_t)buf->b_bcount;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_reg(state, pd, &bind, mrhdl, op);

	return (status);
}


/*
 * tavor_mr_register_shared()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_register_shared(tavor_state_t *state, tavor_mrhdl_t mrhdl,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, mtt_ddrbaseaddr, pgsize_msk;
	uint_t			sleep, mr_is_umem;
	int			status, umem_flags;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
	    TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		status = IBT_INVALID_PARAM;
		goto mrshared_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the shared memory region.
	 * Specifically, it will be made to reference the currently existing
	 * MTT entries and ownership of the MPT will be passed to the hardware
	 * in the last step below.  If we fail here, we must undo the
	 * protection domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		goto mrshared_fail1;
	}

	/*
	 * Allocate the software structure for tracking the shared memory
	 * region (i.e. the Tavor Memory Region handle).  If we fail here, we
	 * must undo the protection domain reference count and the previous
	 * resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		goto mrshared_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/* Grab the MR lock for the current memory region */
	mutex_enter(&mrhdl->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
		mutex_exit(&mrhdl->mr_lock);
		status = IBT_MR_HDL_INVALID;
		goto mrshared_fail3;
	}

	/*
	 * Determine if the original memory was from userland and, if so, pin
	 * the pages (again) with umem_lockmemory().  This will guarantee a
	 * separate callback for each of this shared region's MR handles.
	 * If this is userland memory, then allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo all the above setup.
	 */
	mr_is_umem = mrhdl->mr_is_umem;
	if (mr_is_umem) {
		umem_len  = ptob(btopr(mrhdl->mr_bindinfo.bi_len +
		    ((uintptr_t)mrhdl->mr_bindinfo.bi_addr & PAGEOFFSET)));
		umem_addr = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
		    ~PAGEOFFSET);
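		/*
		 * Example of the rounding above (hypothetical values): for
		 * bi_addr = 0x12345678 and bi_len = 0x100 with 4KB pages,
		 * the page offset is 0x678, so umem_len becomes
		 * ptob(btopr(0x778)) = 0x1000 and umem_addr = 0x12345000;
		 * i.e. the whole page range covering the region is locked.
		 */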
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &tavor_umem_cbops, NULL);
		if (status != 0) {
			mutex_exit(&mrhdl->mr_lock);
			goto mrshared_fail3;
		}

		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			mutex_exit(&mrhdl->mr_lock);
			status = IBT_INSUFF_RESOURCE;
			goto mrshared_fail4;
		}
	}

	/*
	 * Copy the MTT resource pointer (and additional parameters) from
	 * the original Tavor Memory Region handle.  Note: this is normally
	 * where the tavor_mr_mem_bind() routine would be called, but because
	 * we already have bound and filled-in MTT entries it is simply a
	 * matter here of managing the MTT reference count and grabbing the
	 * address of the MTT table entries (for filling in the shared region's
	 * MPT entry).
	 */
	mr->mr_mttrsrcp   = mrhdl->mr_mttrsrcp;
	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
	mr->mr_bindinfo   = mrhdl->mr_bindinfo;
	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
	mutex_exit(&mrhdl->mr_lock);
	bind = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
	mtt = mr->mr_mttrsrcp;

	/*
	 * Increment the MTT reference count (to reflect the fact that
	 * the MTT is now shared)
	 */
	(void) tavor_mtt_refcnt_inc(mr->mr_mttrefcntp);

	/*
	 * Update the new "bind" virtual address.  Do some extra work here
	 * to ensure proper alignment.  That is, make sure that the page
	 * offset for the beginning of the old range is the same as the
	 * offset for this new mapping
	 */
	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
	    (mr->mr_bindinfo.bi_addr & pgsize_msk));
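	/*
	 * Worked example of the update above (hypothetical values): with a
	 * 4KB MTT page size (mr_logmttpgsz = 12), pgsize_msk = 0xFFF.  If
	 * the original binding started at 0x10234 and the new mr_vaddr is
	 * 0x20000, the new bi_addr is (0x20000 & ~0xFFF) | (0x10234 & 0xFFF)
	 * = 0x20234, preserving the old intra-page offset.
	 */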

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * in the next step when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
	mpt_entry.page_sz = mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key = mr->mr_lkey;
	mpt_entry.pd	  = pd->pd_pdnum;
	mpt_entry.start_addr = bind->bi_addr;
	mpt_entry.reg_win_len = bind->bi_len;
	mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;
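
	/*
	 * Two notes on the encoding above.  First, page_sz expresses the
	 * MTT page size relative to 4KB: mr_logmttpgsz - 0xC is 0 for 4KB
	 * pages, 4 for 64KB, and so on.  Second, (tr_indx <<
	 * TAVOR_MTT_SIZE_SHIFT) converts the MTT index into a byte offset
	 * from the MTT base, and the resulting 64-bit address is stored in
	 * two pieces: bits 63:32 in mttseg_addr_h and bits 31:6 in
	 * mttseg_addr_l.  The ">> 6" implies the MTT segment address is
	 * 64-byte aligned, with the always-zero low six bits dropped.
	 */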

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		goto mrshared_fail5;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = mr_is_umem;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the tavor_umap_umemlock_cb()
	 * callback and tavor_mr_deregister().
	 */
	if (mr_is_umem) {
		tavor_umap_db_add(umapdb);
	}

	*mrhdl_new = mr;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrshared_fail5:
	(void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
	if (mr_is_umem) {
		tavor_umap_db_free(umapdb);
	}
mrshared_fail4:
	if (mr_is_umem) {
		ddi_umem_unlock(umem_cookie);
	}
mrshared_fail3:
	tavor_rsrc_free(state, &rsrc);
mrshared_fail2:
	tavor_rsrc_free(state, &mpt);
mrshared_fail1:
	tavor_pd_refcnt_dec(pd);
mrshared_fail:
	return (status);
}


/*
 * tavor_mr_deregister()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_deregister(tavor_state_t *state, tavor_mrhdl_t *mrhdl, uint_t level,
    uint_t sleep)
{
	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
	tavor_umap_db_entry_t	*umapdb;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_bind_info_t	*bind;
	uint64_t		value;
	int			status, shared_mtt;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		return (IBT_INVALID_PARAM);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Region
	 * handle.  This is necessary here because the resource for the
	 * MR handle is going to be freed up as part of this
	 * deregistration.
	 */
	mr = *mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt = mr->mr_mptrsrcp;
	mtt = mr->mr_mttrsrcp;
	mtt_refcnt = mr->mr_mttrefcntp;
	rsrc = mr->mr_rsrcp;
	pd = mr->mr_pdhdl;
	bind = &mr->mr_bindinfo;

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of the tavor_umap_umemlock_cb() callback.
	 * If so, then jump to the end and free the remaining resources.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		goto mrdereg_finish_cleanup;
	}

	/*
	 * We must drop the "mr_lock" here to ensure that both SLEEP and
	 * NOSLEEP calls into the firmware work as expected.  Also, if two
	 * threads are attempting to access this MR (via deregister,
	 * reregister, or otherwise), we rely on the firmware to enforce
	 * that only one deregister succeeds.
	 */
	mutex_exit(&mr->mr_lock);

	/*
	 * Reclaim MPT entry from hardware (if necessary).  Since the
	 * tavor_mr_deregister() routine is used in the memory region
	 * reregistration process as well, it is possible that we will
	 * not always wish to reclaim ownership of the MPT.  Check the
	 * "level" arg and, if necessary, attempt to reclaim it.  If
	 * the ownership transfer fails for any reason, we check to see
	 * what command status was returned from the hardware.  The only
	 * "expected" error status is the one that indicates an attempt to
	 * deregister a memory region that has memory windows bound to it
	 */
	if (level >= TAVOR_MR_DEREG_ALL) {
		status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT,
		    NULL, 0, mpt->tr_indx, sleep);
		if (status != TAVOR_CMD_SUCCESS) {
			if (status == TAVOR_CMD_REG_BOUND) {
				return (IBT_MR_IN_USE);
			} else {
				cmn_err(CE_CONT, "Tavor: HW2SW_MPT command "
				    "failed: %08x\n", status);
				return (IBT_INVALID_PARAM);
			}
		}
	}
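
	/*
	 * To make the "level" semantics concrete: a full deregistration
	 * passes TAVOR_MR_DEREG_ALL (reclaim the MPT from hardware above,
	 * then free MTTs, unbind the memory, and free software resources
	 * below), while the reregistration path can pass
	 * TAVOR_MR_DEREG_NO_HW2SW_MPT to skip the ownership reclaim (it
	 * still owns the MPT) but perform the rest.  The level values are
	 * assumed to be ordered (TAVOR_MR_DEREG_ALL highest) so that the
	 * ">=" comparisons in this routine do strictly more cleanup at
	 * higher levels.
	 */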

	/*
	 * Re-grab the mr_lock here.  Since further access to the protected
	 * 'mr' structure is needed, and we would have returned previously for
	 * the multiple deregistration case, we can safely grab the lock here.
	 */
	mutex_enter(&mr->mr_lock);

	/*
	 * If the memory had come from userland, then we do a lookup in the
	 * "userland resources database".  On success, we free the entry, call
	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
	 * an indication that the umem_lockmemory() callback has called
	 * tavor_mr_deregister()), we call ddi_umem_unlock() and invalidate
	 * the "mr_umemcookie" field in the MR handle (this will be used
	 * later to detect that only partial cleanup still remains to be done
	 * on the MR handle).
	 */
	if (mr->mr_is_umem) {
		status = tavor_umap_db_find(state->ts_instance,
		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
		    MLNX_UMAP_MRMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status == DDI_SUCCESS) {
			tavor_umap_db_free(umapdb);
			ddi_umem_unlock(mr->mr_umemcookie);
		} else {
			ddi_umem_unlock(mr->mr_umemcookie);
			mr->mr_umemcookie = NULL;
		}
	}

	/* mtt_refcnt is NULL in the case of tavor_dma_mr_register() */
	if (mtt_refcnt != NULL) {
		/*
		 * Decrement the MTT reference count.  Since the MTT resource
		 * may be shared between multiple memory regions (as a result
		 * of a "RegisterSharedMR" verb) it is important that we not
		 * free up or unbind resources prematurely.  If it's not shared
		 * (as indicated by the return status), then free the resource.
		 */
		shared_mtt = tavor_mtt_refcnt_dec(mtt_refcnt);
		if (!shared_mtt) {
			tavor_rsrc_free(state, &mtt_refcnt);
		}

		/*
		 * Free up the MTT entries and unbind the memory.  Here,
		 * as above, we attempt to free these resources only if
		 * it is appropriate to do so.
		 */
		if (!shared_mtt) {
			if (level >= TAVOR_MR_DEREG_NO_HW2SW_MPT) {
				tavor_mr_mem_unbind(state, bind);
			}
			tavor_rsrc_free(state, &mtt);
		}
	}

	/*
	 * If the MR handle has been invalidated, then drop the
	 * lock and return success.  Note: This only happens because
	 * the umem_lockmemory() callback has been triggered.  The
	 * cleanup here is partial, and further cleanup (in a
	 * subsequent tavor_mr_deregister() call) will be necessary.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		return (DDI_SUCCESS);
	}

mrdereg_finish_cleanup:
	mutex_exit(&mr->mr_lock);

	/* Free the Tavor Memory Region handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mrhdl pointer to NULL and return success */
	*mrhdl = NULL;

	return (DDI_SUCCESS);
}


/*
 * tavor_mr_query()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_query(tavor_state_t *state, tavor_mrhdl_t mr,
    ibt_mr_query_attr_t *attr)
{
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))

	mutex_enter(&mr->mr_lock);

	/*
	 * Check here to see if the memory region has already been partially
	 * deregistered as a result of a tavor_umap_umemlock_cb() callback.
	 * If so, this is an error, return failure.
	 */
	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
		mutex_exit(&mr->mr_lock);
		return (IBT_MR_HDL_INVALID);
	}

	/* Fill in the queried attributes */
	attr->mr_attr_flags = mr->mr_accflag;
	attr->mr_pd = (ibt_pd_hdl_t)mr->mr_pdhdl;

	/* Fill in the "local" attributes */
	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;

	/*
	 * Fill in the "remote" attributes (if necessary).  Note: the
	 * remote attributes are only valid if the memory region has one
	 * or more of the remote access flags set.
	 */
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
	}

	/*
	 * If the region is mapped for streaming (i.e. noncoherent), then
	 * report that a sync is required.
	 */
	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;

	mutex_exit(&mr->mr_lock);
	return (DDI_SUCCESS);
}


/*
 * tavor_mr_reregister()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_reregister(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, ibt_mr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl_new,
    tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration (and reregistration) routines.
	 */
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = mr_attr->mr_vaddr;
	bind.bi_len   = mr_attr->mr_len;
	bind.bi_as    = mr_attr->mr_as;
	bind.bi_flags = mr_attr->mr_flags;
	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);

	return (status);
}


/*
 * tavor_mr_reregister_buf()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_reregister_buf(tavor_state_t *state, tavor_mrhdl_t mr,
    tavor_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
    tavor_mrhdl_t *mrhdl_new, tavor_mr_options_t *op)
{
	tavor_bind_info_t	bind;
	int			status;

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (see above) and a "buf" binding (as is the case
	 * here).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.  Note: We have chosen to provide
	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
	 * not set).  It is not critical what value we choose here as it need
	 * only be unique for the given RKey (which will happen by default),
	 * so the choice here is somewhat arbitrary.
	 */
	bind.bi_type = TAVOR_BINDHDL_BUF;
	bind.bi_buf  = buf;
	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
		bind.bi_addr = mr_attr->mr_vaddr;
	} else {
		bind.bi_addr = (uint64_t)(uintptr_t)buf->b_un.b_addr;
	}
	bind.bi_len   = (uint64_t)buf->b_bcount;
	bind.bi_flags = mr_attr->mr_flags;
	bind.bi_as    = NULL;
	status = tavor_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);

	return (status);
}


/*
 * tavor_mr_sync()
 *    Context: Can be called from interrupt or base context.
 */
/* ARGSUSED */
int
tavor_mr_sync(tavor_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
{
	tavor_mrhdl_t		mrhdl;
	uint64_t		seg_vaddr, seg_len, seg_end;
	uint64_t		mr_start, mr_end;
	uint_t			type;
	int			status, i;

	/* Process each of the ibt_mr_sync_t's */
	for (i = 0; i < num_segs; i++) {
		mrhdl = (tavor_mrhdl_t)mr_segs[i].ms_handle;

		/* Check for valid memory region handle */
		if (mrhdl == NULL) {
			status = IBT_MR_HDL_INVALID;
			goto mrsync_fail;
		}

		mutex_enter(&mrhdl->mr_lock);

		/*
		 * Check here to see if the memory region has already been
		 * partially deregistered as a result of a
		 * tavor_umap_umemlock_cb() callback.  If so, this is an
		 * error, return failure.
		 */
		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
			mutex_exit(&mrhdl->mr_lock);
			status = IBT_MR_HDL_INVALID;
			goto mrsync_fail;
		}

		/* Check for valid bounds on sync request */
		seg_vaddr = mr_segs[i].ms_vaddr;
		seg_len	  = mr_segs[i].ms_len;
		seg_end	  = seg_vaddr + seg_len - 1;
		mr_start  = mrhdl->mr_bindinfo.bi_addr;
		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			status = IBT_INVALID_PARAM;
			goto mrsync_fail;
		}
		if ((seg_end < mr_start) || (seg_end > mr_end)) {
			mutex_exit(&mrhdl->mr_lock);
			status = IBT_INVALID_PARAM;
			goto mrsync_fail;
		}
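
		/*
		 * Bounds example (hypothetical values): for a region bound
		 * at mr_start = 0x1000 with bi_len = 0x1000, mr_end is
		 * 0x1FFF (inclusive).  A request with ms_vaddr = 0x1800 and
		 * ms_len = 0x800 gives seg_end = 0x1FFF and passes both
		 * checks; ms_len = 0x801 would give seg_end = 0x2000, which
		 * exceeds mr_end and is rejected.
		 */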

		/*
		 * Determine the direction for the sync.  IBT_SYNC_READ
		 * means the device will read the memory, so CPU stores must
		 * be flushed out to the device; IBT_SYNC_WRITE means the
		 * device has written the memory, so caches must be
		 * invalidated before the CPU reads it.
		 */
		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
			type = DDI_DMA_SYNC_FORDEV;
		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
			type = DDI_DMA_SYNC_FORCPU;
		} else {
			mutex_exit(&mrhdl->mr_lock);
			status = IBT_INVALID_PARAM;
			goto mrsync_fail;
		}

		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
		mutex_exit(&mrhdl->mr_lock);
	}

	return (DDI_SUCCESS);

mrsync_fail:
	return (status);
}


/*
 * tavor_mw_alloc()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mw_alloc(tavor_state_t *state, tavor_pdhdl_t pd, ibt_mw_flags_t flags,
    tavor_mwhdl_t *mwhdl)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mwhdl_t		mw;
	uint_t			sleep;
	int			status;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MW_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		status = IBT_INVALID_PARAM;
		goto mwalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry (for use as a memory window).  Since the
	 * Tavor hardware uses the MPT entry for memory regions and for
	 * memory windows, we will fill in this MPT with all the necessary
	 * parameters for the memory window.  And then (just as we do for
	 * memory regions) ownership will be passed to the hardware in the
	 * final step below.  If we fail here, we must undo the protection
	 * domain reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		goto mwalloc_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory window (i.e.
	 * the Tavor Memory Window handle).  Note: This is actually the same
	 * software structure used for tracking memory regions, but since many
	 * of the same properties are needed, only a single structure is
	 * necessary.  If we fail here, we must undo the protection domain
	 * reference count and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		goto mwalloc_fail2;
	}
	mw = (tavor_mwhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Calculate an "unbound" RKey from MPT index.  In much the same way
	 * as we do for memory regions (above), this key is constructed from
	 * a "constrained" (which depends on the MPT index) and an
	 * "unconstrained" portion (which may be arbitrarily chosen).
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mw->mr_rkey);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.  Note: fewer entries in the MPT
	 * entry are necessary to allocate a memory window.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win = TAVOR_MPT_IS_WINDOW;
	mpt_entry.mem_key = mw->mr_rkey;
	mpt_entry.pd	  = pd->pd_pdnum;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		goto mwalloc_fail3;
	}

	/*
	 * Fill in the rest of the Tavor Memory Window handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MW.
	 */
	mw->mr_mptrsrcp = mpt;
	mw->mr_pdhdl	= pd;
	mw->mr_rsrcp	= rsrc;
	*mwhdl = mw;

	return (DDI_SUCCESS);

mwalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
mwalloc_fail2:
	tavor_rsrc_free(state, &mpt);
mwalloc_fail1:
	tavor_pd_refcnt_dec(pd);
mwalloc_fail:
	return (status);
}


/*
 * tavor_mw_free()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mw_free(tavor_state_t *state, tavor_mwhdl_t *mwhdl, uint_t sleep)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_mwhdl_t		mw;
	int			status;
	tavor_pdhdl_t		pd;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		return (IBT_INVALID_PARAM);
	}

	/*
	 * Pull all the necessary information from the Tavor Memory Window
	 * handle.  This is necessary here because the resource for the
	 * MW handle is going to be freed up as part of this operation.
	 */
	mw = *mwhdl;
	mutex_enter(&mw->mr_lock);
	mpt = mw->mr_mptrsrcp;
	rsrc = mw->mr_rsrcp;
	pd = mw->mr_pdhdl;
	mutex_exit(&mw->mr_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))

	/*
	 * Reclaim the MPT entry from hardware.  Note: in general, it is
	 * unexpected for this operation to return an error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
	    0, mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: %08x\n",
		    status);
		return (IBT_INVALID_PARAM);
	}

	/* Free the Tavor Memory Window handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free up the MPT entry resource */
	tavor_rsrc_free(state, &mpt);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the mwhdl pointer to NULL and return success */
	*mwhdl = NULL;

	return (DDI_SUCCESS);
}


/*
 * tavor_mr_keycalc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_mr_keycalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
{
	uint32_t	tmp, log_num_mpt;

	/*
	 * Generate a simple key from counter.  Note:  We increment this
	 * static variable _intentionally_ without any kind of mutex around
	 * it.  First, single-threading all operations through a single lock
	 * would be a bad idea (from a performance point-of-view).  Second,
	 * the upper "unconstrained" bits don't really have to be unique
	 * because the lower bits are guaranteed to be (although we do make a
	 * best effort to ensure that they are).  Third, the window for the
	 * race (where both threads read and update the counter at the same
	 * time) is incredibly small.
	 * And, lastly, we'd like to make this into a "random" key XXX
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_memkey_cnt))
	log_num_mpt = state->ts_cfg_profile->cp_log_num_mpt;
	tmp = (tavor_debug_memkey_cnt++) << log_num_mpt;
	*key = tmp | indx;
}
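
/*
 * Key layout illustration (hypothetical values): with cp_log_num_mpt = 16,
 * MPT index 0x1234, and a counter value of 0x5A, the key computed above is
 * (0x5A << 16) | 0x1234 = 0x005A1234.  The low 16 "constrained" bits always
 * equal the MPT index; only the upper bits vary between registrations.
 */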


/*
 * tavor_mr_common_reg()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_mr_common_reg(tavor_state_t *state, tavor_pdhdl_t pd,
    tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl, tavor_mr_options_t *op)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
	tavor_umap_db_entry_t	*umapdb;
	tavor_sw_refcnt_t	*swrc_tmp;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	ibt_mr_flags_t		flags;
	tavor_bind_info_t	*bh;
	ddi_dma_handle_t	bind_dmahdl;
	ddi_umem_cookie_t	umem_cookie;
	size_t			umem_len;
	caddr_t			umem_addr;
	uint64_t		mtt_addr, mtt_ddrbaseaddr, max_sz;
	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
	int			status, umem_flags, bind_override_addr;

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the region should be bound normally (i.e. with
	 * entries written into the PCI IOMMU), whether it should be
	 * registered to bypass the IOMMU, and whether or not the resulting
	 * address should be "zero-based" (to aid the alignment restrictions
	 * for QPs).
	 */
	if (op == NULL) {
		bind_type	   = TAVOR_BINDMEM_NORMAL;
		bind_dmahdl	   = NULL;
		bind_override_addr = 0;
	} else {
		bind_type	   = op->mro_bind_type;
		bind_dmahdl	   = op->mro_bind_dmahdl;
		bind_override_addr = op->mro_bind_override_addr;
	}
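
	/*
	 * For example, a caller wanting an IOMMU-bypass binding with its own
	 * DMA handle might pass an options struct like the following (the
	 * values are hypothetical, and the bypass constant is assumed by
	 * analogy with TAVOR_BINDMEM_NORMAL above):
	 *
	 *	opt.mro_bind_type	   = TAVOR_BINDMEM_BYPASS;
	 *	opt.mro_bind_dmahdl	   = my_dmahdl;
	 *	opt.mro_bind_override_addr = 0;
	 */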

	/* Extract the flags field from the tavor_bind_info_t */
	flags = bind->bi_flags;

	/*
	 * Check for invalid length.  Check whether the length is zero or
	 * larger than the maximum configured value and return error if
	 * it is.
	 */
	max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
		status = IBT_MR_LEN_INVALID;
		goto mrcommon_fail;
	}
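
	/*
	 * Concretely: if cp_log_max_mrw_sz were 34, max_sz would be
	 * 1ULL << 34 = 16GB, so a zero-length request or one longer than
	 * 16GB fails this check.  (The 34 is only an illustrative value.)
	 */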

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		status = IBT_INVALID_PARAM;
		goto mrcommon_fail;
	}

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * below when we are setting up the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the memory region.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		goto mrcommon_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory region (i.e.
	 * the Tavor Memory Region handle).  If we fail here, we must undo
	 * the protection domain reference count and the previous resource
	 * allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		goto mrcommon_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/*
	 * Determine if the memory is from userland and pin the pages
	 * with umem_lockmemory() if necessary.
	 * Then, if this is userland memory, allocate an entry in the
	 * "userland resources database".  This will later be added to
	 * the database (after all further memory registration operations are
	 * successful).  If we fail here, we must undo the reference counts
	 * and the previous resource allocations.
	 */
	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
	if (mr_is_umem) {
		umem_len   = ptob(btopr(bind->bi_len +
		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
		    DDI_UMEMLOCK_LONGTERM);
		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
		    &umem_cookie, &tavor_umem_cbops, NULL);
		if (status != 0) {
			goto mrcommon_fail3;
		}

		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))

		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
		if (bind->bi_buf == NULL) {
			status = IBT_INSUFF_RESOURCE;
			goto mrcommon_fail4;
		}
		bind->bi_type = TAVOR_BINDHDL_UBUF;
		bind->bi_buf->b_flags |= B_READ;

		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))

		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			status = IBT_INSUFF_RESOURCE;
			goto mrcommon_fail4;
		}
	}

	/*
	 * Setup the bindinfo for the mtt bind call
	 */
	bh = &mr->mr_bindinfo;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
	bcopy(bind, bh, sizeof (tavor_bind_info_t));
	bh->bi_bypass = bind_type;
	status = tavor_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		/*
		 * When mtt_bind fails, freerbuf has already been done,
		 * so make sure not to call it again.
		 */
		bind->bi_type = bh->bi_type;
		goto mrcommon_fail5;
	}
	mr->mr_logmttpgsz = mtt_pgsize_bits;

	/*
	 * Allocate MTT reference count (to track shared memory regions).
	 * This reference count resource may never be used on the given
	 * memory region, but if it is ever later registered as "shared"
	 * memory region then this resource will be necessary.  If we fail
	 * here, we do pretty much the same as above to clean up.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1, sleep,
	    &mtt_refcnt);
	if (status != DDI_SUCCESS) {
		goto mrcommon_fail6;
	}
	mr->mr_mttrefcntp = mtt_refcnt;
	swrc_tmp = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
	TAVOR_MTT_REFCNT_INIT(swrc_tmp);

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;
	mpt_entry.page_sz = mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key = mr->mr_lkey;
	mpt_entry.pd	  = pd->pd_pdnum;
	if (bind_override_addr == 0) {
		mpt_entry.start_addr = bh->bi_addr;
	} else {
		bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
		mpt_entry.start_addr = bh->bi_addr;
	}
	mpt_entry.reg_win_len	= bh->bi_len;
	mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware.  Note: in general, this operation
	 * shouldn't fail.  But if it does, we have to undo everything we've
	 * done above before returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		goto mrcommon_fail7;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = mtt;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = mr_is_umem;
	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	/*
	 * If this is userland memory, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later coordination between the tavor_umap_umemlock_cb()
	 * callback and tavor_mr_deregister().
	 */
	if (mr_is_umem) {
		tavor_umap_db_add(umapdb);
	}

	*mrhdl = mr;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrcommon_fail7:
	tavor_rsrc_free(state, &mtt_refcnt);
mrcommon_fail6:
	tavor_rsrc_free(state, &mtt);
	tavor_mr_mem_unbind(state, bh);
	bind->bi_type = bh->bi_type;
mrcommon_fail5:
	if (mr_is_umem) {
		tavor_umap_db_free(umapdb);
	}
mrcommon_fail4:
	if (mr_is_umem) {
		/*
		 * Free up the memory ddi_umem_iosetup() allocates
		 * internally.
		 */
		if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
			freerbuf(bind->bi_buf);
			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
			bind->bi_type = TAVOR_BINDHDL_NONE;
			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
		}
		ddi_umem_unlock(umem_cookie);
	}
mrcommon_fail3:
	tavor_rsrc_free(state, &rsrc);
mrcommon_fail2:
	tavor_rsrc_free(state, &mpt);
mrcommon_fail1:
	tavor_pd_refcnt_dec(pd);
mrcommon_fail:
	return (status);
}

int
tavor_dma_mr_register(tavor_state_t *state, tavor_pdhdl_t pd,
    ibt_dmr_attr_t *mr_attr, tavor_mrhdl_t *mrhdl)
{
	tavor_rsrc_t		*mpt, *rsrc;
	tavor_hw_mpt_t		mpt_entry;
	tavor_mrhdl_t		mr;
	ibt_mr_flags_t		flags;
	uint_t			sleep;
	int			status;

	/* Extract the flags field */
	flags = mr_attr->dmr_flags;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP : TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		status = IBT_INVALID_PARAM;
		goto mrcommon_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);

	/*
	 * Allocate an MPT entry.  This will be filled in with all the
	 * necessary parameters to define the memory region.  And then
	 * ownership will be passed to the hardware in the final step
	 * below.  If we fail here, we must undo the protection domain
	 * reference count.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MPT, 1, sleep, &mpt);
	if (status != DDI_SUCCESS) {
		status = IBT_INSUFF_RESOURCE;
		goto mrcommon_fail1;
	}

	/*
	 * Allocate the software structure for tracking the memory region (i.e.
	 * the Tavor Memory Region handle).  If we fail here, we must undo
	 * the protection domain reference count and the previous resource
	 * allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_MRHDL, 1, sleep, &rsrc);
	if (status != DDI_SUCCESS) {
		status = IBT_INSUFF_RESOURCE;
		goto mrcommon_fail2;
	}
	mr = (tavor_mrhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	bzero(mr, sizeof (*mr));

	/*
	 * Setup and validate the memory region access flags.  This means
	 * translating the IBTF's enable flags into the access flags that
	 * will be used in later operations.
	 */
	mr->mr_accflag = 0;
	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
	if (flags & IBT_MR_ENABLE_REMOTE_READ)
		mr->mr_accflag |= IBT_MR_REMOTE_READ;
	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;

	/*
	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
	 * from a certain number of "constrained" bits (the least significant
	 * bits) and some number of "unconstrained" bits.  The constrained
	 * bits must be set to the index of the entry in the MPT table, but
	 * the unconstrained bits can be set to any value we wish.  Note:
	 * if no remote access is required, then the RKey value is not filled
	 * in.  Otherwise both Rkey and LKey are given the same value.
	 */
	tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
		mr->mr_rkey = mr->mr_lkey;
	}

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));

	mpt_entry.m_io	  = TAVOR_MEM_CYCLE_GENERATE;
	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
	mpt_entry.lr	  = 1;
	mpt_entry.phys_addr = 1;	/* critical bit: physical addressing */
	mpt_entry.reg_win = TAVOR_MPT_IS_REGION;

	mpt_entry.page_sz = mr->mr_logmttpgsz - 0xC;
	mpt_entry.mem_key = mr->mr_lkey;
	mpt_entry.pd	  = pd->pd_pdnum;
	mpt_entry.win_cnt_limit = TAVOR_UNLIMITED_WIN_BIND;

	mpt_entry.start_addr  = mr_attr->dmr_paddr;
	mpt_entry.reg_win_len = mr_attr->dmr_len;

	mpt_entry.mttseg_addr_h = 0;
	mpt_entry.mttseg_addr_l = 0;

	/*
	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware if needed.  Note: in general, this
	 * operation shouldn't fail.  But if it does, we have to undo
	 * everything we've done above before returning error.
	 *
	 * For Tavor, this routine (which is common to the contexts) will only
	 * set the ownership if needed - the process of passing the context
	 * itself to HW will take care of setting up the MPT (based on type
	 * and index).
	 */

	status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
	    sizeof (tavor_hw_mpt_t), mpt->tr_indx, sleep);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
		    status);
		status = ibc_get_ci_failure(0);
		goto mrcommon_fail7;
	}

	/*
	 * Fill in the rest of the Tavor Memory Region handle.  Having
	 * successfully transferred ownership of the MPT, we can update the
	 * following fields for use in further operations on the MR.
	 */
	mr->mr_mptrsrcp	  = mpt;
	mr->mr_mttrsrcp	  = NULL;
	mr->mr_pdhdl	  = pd;
	mr->mr_rsrcp	  = rsrc;
	mr->mr_is_umem	  = 0;
	mr->mr_umemcookie = NULL;
	mr->mr_umem_cbfunc = NULL;
	mr->mr_umem_cbarg1 = NULL;
	mr->mr_umem_cbarg2 = NULL;

	*mrhdl = mr;

	return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
mrcommon_fail7:
	tavor_rsrc_free(state, &rsrc);
mrcommon_fail2:
	tavor_rsrc_free(state, &mpt);
mrcommon_fail1:
	tavor_pd_refcnt_dec(pd);
mrcommon_fail:
	return (status);
}

/*
 * tavor_mr_mtt_bind()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_mr_mtt_bind(tavor_state_t *state, tavor_bind_info_t *bind,
    ddi_dma_handle_t bind_dmahdl, tavor_rsrc_t **mtt, uint_t *mtt_pgsize_bits)
{
	uint64_t		nummtt;
	uint_t			sleep;
	int			status;

	/*
	 * Check the sleep flag.  Ensure that it is consistent with the
	 * current thread context (i.e. if we are currently in the interrupt
	 * context, then we shouldn't be attempting to sleep).
	 */
	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP :
	    TAVOR_SLEEP;
	if ((sleep == TAVOR_SLEEP) &&
	    (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
		status = IBT_INVALID_PARAM;
		goto mrmttbind_fail;
	}
1526
1527 /*
1528 * Bind the memory and determine the mapped addresses. This is
1529 * the first of two routines that do all the "heavy lifting" for
1530 * the Tavor memory registration routines. The tavor_mr_mem_bind()
1531 * routine takes the "bind" struct with all its fields filled
1532 * in and returns a list of DMA cookies (for the PCI mapped addresses
1533 * corresponding to the specified address region) which are used by
1534 * the tavor_mr_fast_mtt_write() routine below. If we fail here, we
1535 * must undo all the previous resource allocation (and PD reference
1536 * count).
1537 */
1538 status = tavor_mr_mem_bind(state, bind, bind_dmahdl, sleep);
1539 if (status != DDI_SUCCESS) {
1540 goto mrmttbind_fail;
1541 }
1542
1543 /*
1544 * Determine number of pages spanned. This routine uses the
1545 * information in the "bind" struct to determine the required
1546 * number of MTT entries needed (and returns the suggested page size -
1547 * as a "power-of-2" - for each MTT entry).
1548 */
1549 nummtt = tavor_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1550
1551 /*
1552 * Allocate the MTT entries. Use the calculations performed above to
1553 * allocate the required number of MTT entries. Note: MTT entries are
1554 * allocated in "MTT segments" which consist of complete cachelines
1555 * (i.e. 8 entries, 16 entries, etc.) So the TAVOR_NUMMTT_TO_MTTSEG()
1556 * macro is used to do the proper conversion. If we fail here, we
1557 * must not only undo all the previous resource allocation (and PD
1558 * reference count), but we must also unbind the memory.
1559 */
1560 status = tavor_rsrc_alloc(state, TAVOR_MTT,
1561 TAVOR_NUMMTT_TO_MTTSEG(nummtt), sleep, mtt);
1562 if (status != DDI_SUCCESS) {
1563 goto mrmttbind_fail2;
1564 }
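/*
 * Illustrative sketch (an assumption, not the authoritative
 * definition): with 8 MTT entries per cacheline-sized segment, the
 * conversion macro would round the entry count up to a whole number
 * of segments:
 *
 *	#define	TAVOR_MTT_PER_SEG	8	(hypothetical value)
 *	#define	TAVOR_NUMMTT_TO_MTTSEG(num)				\
 *		(((num) + TAVOR_MTT_PER_SEG - 1) / TAVOR_MTT_PER_SEG)
 *
 * e.g. 9 MTT entries would consume 2 segments (16 entries).
 */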
1565
1566 /*
1567 * Write the mapped addresses into the MTT entries. This is part two
1568 * of the "heavy lifting" routines that we talked about above. Note:
1569 * we pass the suggested page size from the earlier operation here.
1570 * If we fail here, we must again perform the same cleanup as above.
1571 */
1572 status = tavor_mr_fast_mtt_write(*mtt, bind, *mtt_pgsize_bits);
1573 if (status != DDI_SUCCESS) {
1574 goto mrmttbind_fail3;
1575 }
1576 return (DDI_SUCCESS);
1577
1578 /*
1579 * The following is cleanup for all possible failure cases in this routine
1580 */
1581 mrmttbind_fail3:
1582 tavor_rsrc_free(state, mtt);
1583 mrmttbind_fail2:
1584 tavor_mr_mem_unbind(state, bind);
1585 mrmttbind_fail:
1586 return (status);
1587 }
1588
1589
1590 /*
1591 * tavor_mr_mtt_unbind()
1592 * Context: Can be called from interrupt or base context.
1593 */
1594 int
1595 tavor_mr_mtt_unbind(tavor_state_t *state, tavor_bind_info_t *bind,
1596 tavor_rsrc_t *mtt)
1597 {
1598 /*
1599 * Free up the MTT entries and unbind the memory. Here, as above, we
1600 * attempt to free these resources only if it is appropriate to do so.
1601 */
1602 tavor_mr_mem_unbind(state, bind);
1603 tavor_rsrc_free(state, &mtt);
1604
1605 return (DDI_SUCCESS);
1606 }
1607
1608
1609 /*
1610 * tavor_mr_common_rereg()
1611 * Context: Can be called from interrupt or base context.
1612 */
1613 static int
1614 tavor_mr_common_rereg(tavor_state_t *state, tavor_mrhdl_t mr,
1615 tavor_pdhdl_t pd, tavor_bind_info_t *bind, tavor_mrhdl_t *mrhdl_new,
1616 tavor_mr_options_t *op)
1617 {
1618 tavor_rsrc_t *mpt;
1619 ibt_mr_attr_flags_t acc_flags_to_use;
1620 ibt_mr_flags_t flags;
1621 tavor_pdhdl_t pd_to_use;
1622 tavor_hw_mpt_t mpt_entry;
1623 uint64_t mtt_addr_to_use, vaddr_to_use, len_to_use;
1624 uint_t sleep, dereg_level;
1625 int status;
1626
1627 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1628
1629 /*
1630 * Check here to see if the memory region corresponds to a userland
1631 * mapping. Reregistration of userland memory regions is not
1632 * currently supported. Return failure. XXX
1633 */
1634 if (mr->mr_is_umem) {
1635 goto mrrereg_fail;
1636 }
1637
1638 mutex_enter(&mr->mr_lock);
1639
1640 /* Pull MPT resource pointer from the Tavor Memory Region handle */
1641 mpt = mr->mr_mptrsrcp;
1642
1643 /* Extract the flags field from the tavor_bind_info_t */
1644 flags = bind->bi_flags;
1645
1646 /*
1647 * Check the sleep flag. Ensure that it is consistent with the
1648 * current thread context (i.e. if we are currently in the interrupt
1649 * context, then we shouldn't be attempting to sleep).
1650 */
1651 sleep = (flags & IBT_MR_NOSLEEP) ? TAVOR_NOSLEEP: TAVOR_SLEEP;
1652 if ((sleep == TAVOR_SLEEP) &&
1653 (sleep != TAVOR_SLEEPFLAG_FOR_CONTEXT())) {
1654 mutex_exit(&mr->mr_lock);
1655 goto mrrereg_fail;
1656 }
1657
1658 /*
1659 * First step is to temporarily invalidate the MPT entry. This
1660 * regains ownership from the hardware, and gives us the opportunity
1661 * to modify the entry. Note: The HW2SW_MPT command returns the
1662 * current MPT entry contents. These are saved away here because
1663 * they will be reused in a later step below. If the region has
1664 * bound memory windows, then we fail, returning an "in use" error
1665 * code.  Otherwise, this is an unexpected error and we deregister
1666 * the memory region and return an error.
1667 *
1668 * We always use TAVOR_CMD_NOSLEEP_SPIN here because the "mr_lock" is
1669 * held across this rereg call and we must not block in any context.
1670 */
1671 status = tavor_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
1672 sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
1673 if (status != TAVOR_CMD_SUCCESS) {
1674 mutex_exit(&mr->mr_lock);
1675 if (status == TAVOR_CMD_REG_BOUND) {
1676 return (IBT_MR_IN_USE);
1677 } else {
1678 cmn_err(CE_CONT, "Tavor: HW2SW_MPT command failed: "
1679 "%08x\n", status);
1680
1681 /*
1682 * Call deregister and ensure that all current
1683 * resources get freed up
1684 */
1685 if (tavor_mr_deregister(state, &mr,
1686 TAVOR_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
1687 TAVOR_WARNING(state, "failed to deregister "
1688 "memory region");
1689 }
1690 return (ibc_get_ci_failure(0));
1691 }
1692 }
1693
1694 /*
1695 * If we're changing the protection domain, then validate the new one
1696 */
1697 if (flags & IBT_MR_CHANGE_PD) {
1698
1699 /* Check for valid PD handle pointer */
1700 if (pd == NULL) {
1701 mutex_exit(&mr->mr_lock);
1702 /*
1703 * Call deregister and ensure that all current
1704 * resources get properly freed up. Unnecessary
1705 * here to attempt to regain software ownership
1706 * of the MPT entry as that has already been
1707 * done above.
1708 */
1709 if (tavor_mr_deregister(state, &mr,
1710 TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
1711 DDI_SUCCESS) {
1712 TAVOR_WARNING(state, "failed to deregister "
1713 "memory region");
1714 }
1715 goto mrrereg_fail;
1716 }
1717
1718 /* Use the new PD handle in all operations below */
1719 pd_to_use = pd;
1720
1721 } else {
1722 /* Use the current PD handle in all operations below */
1723 pd_to_use = mr->mr_pdhdl;
1724 }
1725
1726 /*
1727 * If we're changing access permissions, then validate the new ones
1728 */
1729 if (flags & IBT_MR_CHANGE_ACCESS) {
1730 /*
1731 * Validate the access flags. Both remote write and remote
1732 * atomic require the local write flag to be set
1733 */
1734 if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
1735 (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
1736 !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
1737 mutex_exit(&mr->mr_lock);
1738 /*
1739 * Call deregister and ensure that all current
1740 * resources get properly freed up. Unnecessary
1741 * here to attempt to regain software ownership
1742 * of the MPT entry as that has already been
1743 * done above.
1744 */
1745 if (tavor_mr_deregister(state, &mr,
1746 TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) !=
1747 DDI_SUCCESS) {
1748 TAVOR_WARNING(state, "failed to deregister "
1749 "memory region");
1750 }
1751 goto mrrereg_fail;
1752 }
1753
1754 /*
1755 * Setup and validate the memory region access flags. This
1756 * means translating the IBTF's enable flags into the access
1757 * flags that will be used in later operations.
1758 */
1759 acc_flags_to_use = 0;
1760 if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1761 acc_flags_to_use |= IBT_MR_WINDOW_BIND;
1762 if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1763 acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
1764 if (flags & IBT_MR_ENABLE_REMOTE_READ)
1765 acc_flags_to_use |= IBT_MR_REMOTE_READ;
1766 if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1767 acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
1768 if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1769 acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;
1770
1771 } else {
1772 acc_flags_to_use = mr->mr_accflag;
1773 }
1774
1775 /*
1776 * If we're modifying the translation, then figure out whether
1777 * we can reuse the current MTT resources. This means calling
1778 * tavor_mr_rereg_xlat_helper() which does most of the heavy lifting
1779 * for the reregistration. If the current memory region contains
1780 * sufficient MTT entries for the new regions, then it will be
1781 * reused and filled in. Otherwise, new entries will be allocated,
1782 * the old ones will be freed, and the new entries will be filled
1783 * in. Note: If we're not modifying the translation, then we
1784 * should already have all the information we need to update the MPT.
1785 * Also note: If tavor_mr_rereg_xlat_helper() fails, it will return
1786 * a "dereg_level" which is the level of cleanup that needs to be
1787 * passed to tavor_mr_deregister() to finish the cleanup.
1788 */
1789 if (flags & IBT_MR_CHANGE_TRANSLATION) {
1790 status = tavor_mr_rereg_xlat_helper(state, mr, bind, op,
1791 &mtt_addr_to_use, sleep, &dereg_level);
1792 if (status != DDI_SUCCESS) {
1793 mutex_exit(&mr->mr_lock);
1794 /*
1795 * Call deregister and ensure that all resources get
1796 * properly freed up.
1797 */
1798 if (tavor_mr_deregister(state, &mr, dereg_level,
1799 sleep) != DDI_SUCCESS) {
1800 TAVOR_WARNING(state, "failed to deregister "
1801 "memory region");
1802 }
1803
1804 goto mrrereg_fail;
1805 }
1806 vaddr_to_use = mr->mr_bindinfo.bi_addr;
1807 len_to_use = mr->mr_bindinfo.bi_len;
1808 } else {
1809 mtt_addr_to_use = (((uint64_t)mpt_entry.mttseg_addr_h << 32) |
1810 ((uint64_t)mpt_entry.mttseg_addr_l << 6));
1811 vaddr_to_use = mr->mr_bindinfo.bi_addr;
1812 len_to_use = mr->mr_bindinfo.bi_len;
1813 }
1814
1815 /*
1816 * Calculate the new keys (LKey, RKey) from the MPT index.  Just like
1817 * they were when the region was first registered, each key is formed
1818 * from "constrained" bits and "unconstrained" bits.  Note:  If no
1819 * remote access is required, then the RKey value is not filled in.
1820 * Otherwise both the RKey and LKey are given the same value.
1821 */
1822 tavor_mr_keycalc(state, mpt->tr_indx, &mr->mr_lkey);
1823 if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
1824 (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
1825 (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
1826 mr->mr_rkey = mr->mr_lkey;
1827 }
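/*
 * Illustrative sketch only (an assumption, not the authoritative code;
 * see tavor_mr_keycalc() for that): the "constrained" bits of a key
 * are taken from the MPT index and the "unconstrained" bits from a
 * rolling counter, e.g.
 *
 *	*key = (counter++ << log_num_mpt) | mpt_indx;
 *
 * so that the same MPT index can produce different keys over time.
 */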
1828
1829 /*
1830 * Update the MPT entry with the new information. Some of this
1831 * information is retained from the previous operation, some of
1832 * it is new based on request.
1833 */
1834 mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND) ? 1 : 0;
1835 mpt_entry.atomic = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1836 mpt_entry.rw = (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ? 1 : 0;
1837 mpt_entry.rr = (acc_flags_to_use & IBT_MR_REMOTE_READ) ? 1 : 0;
1838 mpt_entry.lw = (acc_flags_to_use & IBT_MR_LOCAL_WRITE) ? 1 : 0;
1839 mpt_entry.page_sz = mr->mr_logmttpgsz - 0xC;
1840 mpt_entry.mem_key = mr->mr_lkey;
1841 mpt_entry.pd = pd_to_use->pd_pdnum;
1842 mpt_entry.start_addr = vaddr_to_use;
1843 mpt_entry.reg_win_len = len_to_use;
1844 mpt_entry.mttseg_addr_h = mtt_addr_to_use >> 32;
1845 mpt_entry.mttseg_addr_l = mtt_addr_to_use >> 6;
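/*
 * Note on the encodings above (inferred from how the fields are used
 * in this file, not from the hardware documentation): the MTT segment
 * address is split into high and low pieces, with the low piece
 * shifted by 6 bits because MTT segments are 64-byte aligned, so the
 * original address is recovered (as in the CHANGE_TRANSLATION case
 * above) via:
 *
 *	mtt_addr = ((uint64_t)mttseg_addr_h << 32) |
 *	    ((uint64_t)mttseg_addr_l << 6);
 *
 * Similarly, page_sz stores log2(page size) - 0xC, i.e. the page
 * size relative to a 4KB base page.
 */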
1846
1847 /*
1848 * Write the updated MPT entry to hardware
1849 *
1850 * We always use TAVOR_CMD_NOSLEEP_SPIN here because the "mr_lock" is
1851 * held across this rereg call and we must not block in any context.
1852 */
1853 status = tavor_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1854 sizeof (tavor_hw_mpt_t), mpt->tr_indx, TAVOR_CMD_NOSLEEP_SPIN);
1855 if (status != TAVOR_CMD_SUCCESS) {
1856 mutex_exit(&mr->mr_lock);
1857 cmn_err(CE_CONT, "Tavor: SW2HW_MPT command failed: %08x\n",
1858 status);
1859 /*
1860 * Call deregister and ensure that all current resources get
1861 * properly freed up. Unnecessary here to attempt to regain
1862 * software ownership of the MPT entry as that has already
1863 * been done above.
1864 */
1865 if (tavor_mr_deregister(state, &mr,
1866 TAVOR_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
1867 TAVOR_WARNING(state, "failed to deregister memory "
1868 "region");
1869 }
1870 return (ibc_get_ci_failure(0));
1871 }
1872
1873 /*
1874 * If we're changing the PD, then update the PD reference counts now.
1875 * This means decrementing the reference count on the old PD and
1876 * incrementing the reference count on the new PD.
1877 */
1878 if (flags & IBT_MR_CHANGE_PD) {
1879 tavor_pd_refcnt_dec(mr->mr_pdhdl);
1880 tavor_pd_refcnt_inc(pd);
1881 }
1882
1883 /*
1884 * Update the contents of the Tavor Memory Region handle to reflect
1885 * what has been changed.
1886 */
1887 mr->mr_pdhdl = pd_to_use;
1888 mr->mr_accflag = acc_flags_to_use;
1889 mr->mr_is_umem = 0;
1890 mr->mr_umemcookie = NULL;
1891
1892 /* New MR handle is same as the old */
1893 *mrhdl_new = mr;
1894 mutex_exit(&mr->mr_lock);
1895
1896 return (DDI_SUCCESS);
1897
1898 mrrereg_fail:
1899 return (status);
1900 }
1901
1902
1903 /*
1904 * tavor_mr_rereg_xlat_helper
1905 * Context: Can be called from interrupt or base context.
1906 * Note: This routine expects the "mr_lock" to be held when it
1907 * is called. Upon returning failure, this routine passes information
1908 * about what "dereg_level" should be passed to tavor_mr_deregister().
1909 */
1910 static int
1911 tavor_mr_rereg_xlat_helper(tavor_state_t *state, tavor_mrhdl_t mr,
1912 tavor_bind_info_t *bind, tavor_mr_options_t *op, uint64_t *mtt_addr,
1913 uint_t sleep, uint_t *dereg_level)
1914 {
1915 tavor_rsrc_pool_info_t *rsrc_pool;
1916 tavor_rsrc_t *mtt, *mtt_refcnt;
1917 tavor_sw_refcnt_t *swrc_old, *swrc_new;
1918 ddi_dma_handle_t dmahdl;
1919 uint64_t nummtt_needed, nummtt_in_currrsrc, max_sz;
1920 uint64_t mtt_ddrbaseaddr;
1921 uint_t mtt_pgsize_bits, bind_type, reuse_dmahdl;
1922 int status;
1923
1924 ASSERT(MUTEX_HELD(&mr->mr_lock));
1925
1926 /*
1927 * Check the "options" flag. Currently this flag tells the driver
1928 * whether or not the region should be bound normally (i.e. with
1929 * entries written into the PCI IOMMU) or whether it should be
1930 * registered to bypass the IOMMU.
1931 */
1932 if (op == NULL) {
1933 bind_type = TAVOR_BINDMEM_NORMAL;
1934 } else {
1935 bind_type = op->mro_bind_type;
1936 }
1937
1938 /*
1939 * Check for invalid length.  Check if the length is zero or if the
1940 * length is larger than the maximum configured value. Return error
1941 * if it is.
1942 */
1943 max_sz = ((uint64_t)1 << state->ts_cfg_profile->cp_log_max_mrw_sz);
1944 if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1945 /*
1946 * Deregister will be called upon returning failure from this
1947 * routine. This will ensure that all current resources get
1948 * properly freed up. Unnecessary to attempt to regain
1949 * software ownership of the MPT entry as that has already
1950 * been done above (in tavor_mr_reregister())
1951 */
1952 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT;
1953
1954 goto mrrereghelp_fail;
1955 }
1956
1957 /*
1958 * Determine the number of pages necessary for new region and the
1959 * number of pages supported by the current MTT resources
1960 */
1961 nummtt_needed = tavor_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
1962 nummtt_in_currrsrc = mr->mr_mttrsrcp->tr_len >> TAVOR_MTT_SIZE_SHIFT;
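/*
 * Worked example (assuming 8-byte MTT entries, i.e.
 * TAVOR_MTT_SIZE_SHIFT == 3): a current resource with tr_len == 0x80
 * holds 16 MTT entries, so a new mapping needing 12 entries could
 * reuse it, while one needing 20 entries could not.
 */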
1963
1964 /*
1965 * Depending on whether we have enough pages or not, the next step is
1966 * to fill in a set of MTT entries that reflect the new mapping. In
1967 * the first case below, we already have enough entries. This means
1968 * we need to unbind the memory from the previous mapping, bind the
1969 * memory for the new mapping, write the new MTT entries, and update
1970 * the mr to reflect the changes.
1971 * In the second case below, we do not have enough entries in the
1972 * current mapping. So, in this case, we need not only to unbind the
1973 * current mapping, but we need to free up the MTT resources associated
1974 * with that mapping. After we've successfully done that, we continue
1975 * by binding the new memory, allocating new MTT entries, writing the
1976 * new MTT entries, and updating the mr to reflect the changes.
1977 */
1978
1979 /*
1980 * If this region is being shared (i.e. MTT refcount != 1), then we
1981 * can't reuse the current MTT resources regardless of their size.
1982 * Instead we'll need to alloc new ones (below) just as if there
1983 * hadn't been enough room in the current entries.
1984 */
1985 swrc_old = (tavor_sw_refcnt_t *)mr->mr_mttrefcntp->tr_addr;
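/*
 * Sketch of the sharing test (an assumption about the macros, based
 * on how the reference count is used in this file): the MTT is
 * considered shared when more than one region references it, e.g.
 *
 *	#define	TAVOR_MTT_IS_SHARED(swrc)	((swrc)->swrc_refcnt != 1)
 *	#define	TAVOR_MTT_IS_NOT_SHARED(swrc)	((swrc)->swrc_refcnt == 1)
 */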
1986 if (TAVOR_MTT_IS_NOT_SHARED(swrc_old) &&
1987 (nummtt_needed <= nummtt_in_currrsrc)) {
1988
1989 /*
1990 * Unbind the old mapping for this memory region, but retain
1991 * the ddi_dma_handle_t (if possible) for reuse in the bind
1992 * operation below. Note: If original memory region was
1993 * bound for IOMMU bypass and the new region can not use
1994 * bypass, then a new DMA handle will be necessary.
1995 */
1996 if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
1997 mr->mr_bindinfo.bi_free_dmahdl = 0;
1998 tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
1999 dmahdl = mr->mr_bindinfo.bi_dmahdl;
2000 reuse_dmahdl = 1;
2001 } else {
2002 tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2003 dmahdl = NULL;
2004 reuse_dmahdl = 0;
2005 }
2006
2007 /*
2008 * Bind the new memory and determine the mapped addresses.
2009 * As described, this routine and tavor_mr_fast_mtt_write()
2010 * do the majority of the work for the memory registration
2011 * operations. Note: When we successfully finish the binding,
2012 * we will set the "bi_free_dmahdl" flag to indicate that
2013 * even though we may have reused the ddi_dma_handle_t we do
2014 * wish it to be freed up at some later time. Note also that
2015 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2016 */
2017 bind->bi_bypass = bind_type;
2018 status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2019 if (status != DDI_SUCCESS) {
2020 if (reuse_dmahdl) {
2021 ddi_dma_free_handle(&dmahdl);
2022 }
2023
2024 /*
2025 * Deregister will be called upon returning failure
2026 * from this routine. This will ensure that all
2027 * current resources get properly freed up.
2028 * Unnecessary to attempt to regain software ownership
2029 * of the MPT entry as that has already been done
2030 * above (in tavor_mr_reregister()). Also unnecessary
2031 * to attempt to unbind the memory.
2032 */
2033 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2034
2035 goto mrrereghelp_fail;
2036 }
2037 if (reuse_dmahdl) {
2038 bind->bi_free_dmahdl = 1;
2039 }
2040
2041 /*
2042 * Using the new mapping, but reusing the current MTT
2043 * resources, write the updated entries to MTT
2044 */
2045 mtt = mr->mr_mttrsrcp;
2046 status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2047 if (status != DDI_SUCCESS) {
2048 /*
2049 * Deregister will be called upon returning failure
2050 * from this routine. This will ensure that all
2051 * current resources get properly freed up.
2052 * Unnecessary to attempt to regain software ownership
2053 * of the MPT entry as that has already been done
2054 * above (in tavor_mr_reregister()). Also unnecessary
2055 * to attempt to unbind the memory.
2056 *
2057 * But we do need to unbind the newly bound memory
2058 * before returning.
2059 */
2060 tavor_mr_mem_unbind(state, bind);
2061 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2062
2063 goto mrrereghelp_fail;
2064 }
2065
2066 /* Put the updated information into the Mem Region handle */
2067 mr->mr_bindinfo = *bind;
2068 mr->mr_logmttpgsz = mtt_pgsize_bits;
2069
2070 } else {
2071 /*
2072 * Check if the memory region MTT is shared by any other MRs.
2073 * Since the resource may be shared between multiple memory
2074 * regions (as a result of a "RegisterSharedMR()" verb) it is
2075 * important that we not unbind any resources prematurely.
2076 */
2077 if (!TAVOR_MTT_IS_SHARED(swrc_old)) {
2078 /*
2079 * Unbind the old mapping for this memory region, but
2080 * retain the ddi_dma_handle_t for reuse in the bind
2081 * operation below. Note: This can only be done here
2082 * because the region being reregistered is not
2083 * currently shared. Also if original memory region
2084 * was bound for IOMMU bypass and the new region can
2085 * not use bypass, then a new DMA handle will be
2086 * necessary.
2087 */
2088 if (TAVOR_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2089 mr->mr_bindinfo.bi_free_dmahdl = 0;
2090 tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2091 dmahdl = mr->mr_bindinfo.bi_dmahdl;
2092 reuse_dmahdl = 1;
2093 } else {
2094 tavor_mr_mem_unbind(state, &mr->mr_bindinfo);
2095 dmahdl = NULL;
2096 reuse_dmahdl = 0;
2097 }
2098 } else {
2099 dmahdl = NULL;
2100 reuse_dmahdl = 0;
2101 }
2102
2103 /*
2104 * Bind the new memory and determine the mapped addresses.
2105 * As described, this routine and tavor_mr_fast_mtt_write()
2106 * do the majority of the work for the memory registration
2107 * operations. Note: When we successfully finish the binding,
2108 * we will set the "bi_free_dmahdl" flag to indicate that
2109 * even though we may have reused the ddi_dma_handle_t we do
2110 * wish it to be freed up at some later time. Note also that
2111 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2112 */
2113 bind->bi_bypass = bind_type;
2114 status = tavor_mr_mem_bind(state, bind, dmahdl, sleep);
2115 if (status != DDI_SUCCESS) {
2116 if (reuse_dmahdl) {
2117 ddi_dma_free_handle(&dmahdl);
2118 }
2119
2120 /*
2121 * Deregister will be called upon returning failure
2122 * from this routine. This will ensure that all
2123 * current resources get properly freed up.
2124 * Unnecessary to attempt to regain software ownership
2125 * of the MPT entry as that has already been done
2126 * above (in tavor_mr_reregister()). Also unnecessary
2127 * to attempt to unbind the memory.
2128 */
2129 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2130
2131 goto mrrereghelp_fail;
2132 }
2133 if (reuse_dmahdl) {
2134 bind->bi_free_dmahdl = 1;
2135 }
2136
2137 /*
2138 * Allocate the new MTT entries resource
2139 */
2140 status = tavor_rsrc_alloc(state, TAVOR_MTT,
2141 TAVOR_NUMMTT_TO_MTTSEG(nummtt_needed), sleep, &mtt);
2142 if (status != DDI_SUCCESS) {
2143 /*
2144 * Deregister will be called upon returning failure
2145 * from this routine. This will ensure that all
2146 * current resources get properly freed up.
2147 * Unnecessary to attempt to regain software ownership
2148 * of the MPT entry as that has already been done
2149 * above (in tavor_mr_reregister()). Also unnecessary
2150 * to attempt to unbind the memory.
2151 *
2152 * But we do need to unbind the newly bound memory
2153 * before returning.
2154 */
2155 tavor_mr_mem_unbind(state, bind);
2156 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2157
2158 goto mrrereghelp_fail;
2159 }
2160
2161 /*
2162 * Allocate MTT reference count (to track shared memory
2163 * regions). As mentioned elsewhere above, this reference
2164 * count resource may never be used on the given memory region,
2165 * but if it is ever later registered as a "shared" memory
2166 * region then this resource will be necessary. Note: This
2167 * is only necessary here if the existing memory region is
2168 * already being shared (because otherwise we already have
2169 * a useable reference count resource).
2170 */
2171 if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2172 status = tavor_rsrc_alloc(state, TAVOR_REFCNT, 1,
2173 sleep, &mtt_refcnt);
2174 if (status != DDI_SUCCESS) {
2175 /*
2176 * Deregister will be called upon returning
2177 * failure from this routine. This will ensure
2178 * that all current resources get properly
2179 * freed up. Unnecessary to attempt to regain
2180 * software ownership of the MPT entry as that
2181 * has already been done above (in
2182 * tavor_mr_reregister()). Also unnecessary
2183 * to attempt to unbind the memory.
2184 *
2185 * But we need to unbind the newly bound
2186 * memory and free up the newly allocated MTT
2187 * entries before returning.
2188 */
2189 tavor_mr_mem_unbind(state, bind);
2190 tavor_rsrc_free(state, &mtt);
2191 *dereg_level =
2192 TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2193
2194 goto mrrereghelp_fail;
2195 }
2196 swrc_new = (tavor_sw_refcnt_t *)mtt_refcnt->tr_addr;
2197 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
2198 TAVOR_MTT_REFCNT_INIT(swrc_new);
2199 } else {
2200 mtt_refcnt = mr->mr_mttrefcntp;
2201 }
2202
2203 /*
2204 * Using the new mapping and the new MTT resources, write the
2205 * updated entries to MTT
2206 */
2207 status = tavor_mr_fast_mtt_write(mtt, bind, mtt_pgsize_bits);
2208 if (status != DDI_SUCCESS) {
2209 /*
2210 * Deregister will be called upon returning failure
2211 * from this routine. This will ensure that all
2212 * current resources get properly freed up.
2213 * Unnecessary to attempt to regain software ownership
2214 * of the MPT entry as that has already been done
2215 * above (in tavor_mr_reregister()). Also unnecessary
2216 * to attempt to unbind the memory.
2217 *
2218 * But we need to unbind the newly bound memory,
2219 * free up the newly allocated MTT entries, and
2220 * (possibly) free the new MTT reference count
2221 * resource before returning.
2222 */
2223 if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2224 tavor_rsrc_free(state, &mtt_refcnt);
2225 }
2226 tavor_mr_mem_unbind(state, bind);
2227 tavor_rsrc_free(state, &mtt);
2228 *dereg_level = TAVOR_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2229
2230 goto mrrereghelp_fail;
2231 }
2232
2233 /*
2234 * Check if the memory region MTT is shared by any other MRs.
2235 * Since the resource may be shared between multiple memory
2236 * regions (as a result of a "RegisterSharedMR()" verb) it is
2237 * important that we not free up any resources prematurely.
2238 */
2239 if (TAVOR_MTT_IS_SHARED(swrc_old)) {
2240 /* Decrement MTT reference count for "old" region */
2241 (void) tavor_mtt_refcnt_dec(mr->mr_mttrefcntp);
2242 } else {
2243 /* Free up the old MTT entries resource */
2244 tavor_rsrc_free(state, &mr->mr_mttrsrcp);
2245 }
2246
2247 /* Put the updated information into the mrhdl */
2248 mr->mr_bindinfo = *bind;
2249 mr->mr_logmttpgsz = mtt_pgsize_bits;
2250 mr->mr_mttrsrcp = mtt;
2251 mr->mr_mttrefcntp = mtt_refcnt;
2252 }
2253
2254 /*
2255 * Calculate and return the updated MTT address (in the DDR address
2256 * space). This will be used by the caller (tavor_mr_reregister) in
2257 * the updated MPT entry
2258 */
2259 rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
2260 mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
2261 *mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx <<
2262 TAVOR_MTT_SIZE_SHIFT);
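/*
 * Worked example (again assuming 8-byte MTT entries, so that
 * TAVOR_MTT_SIZE_SHIFT == 3): for a DDR base address of 0x100000 and
 * tr_indx == 0x40, the returned MTT address would be
 * 0x100000 + (0x40 << 3) == 0x100200.
 */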
2263
2264 return (DDI_SUCCESS);
2265
2266 mrrereghelp_fail:
2267 return (status);
2268 }
2269
2270
2271 /*
2272 * tavor_mr_nummtt_needed()
2273 * Context: Can be called from interrupt or base context.
2274 */
2275 /* ARGSUSED */
2276 static uint64_t
2277 tavor_mr_nummtt_needed(tavor_state_t *state, tavor_bind_info_t *bind,
2278 uint_t *mtt_pgsize_bits)
2279 {
2280 uint64_t pg_offset_mask;
2281 uint64_t pg_offset, tmp_length;
2282
2283 /*
2284 * For now we specify the page size as 8KB (the default page size for
2285 * the sun4u architecture), or 4KB for x86.  XXX In the future, figure
2286 * out the optimal page size by examining the dmacookies.
2287 */
2288 *mtt_pgsize_bits = PAGESHIFT;
2289
2290 pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2291 pg_offset = bind->bi_addr & pg_offset_mask;
2292 tmp_length = pg_offset + (bind->bi_len - 1);
2293 return ((tmp_length >> *mtt_pgsize_bits) + 1);
2294 }
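/*
 * Worked example for the calculation above (4KB pages, i.e.
 * PAGESHIFT == 12): for bi_addr == 0x12345678 and bi_len == 0x3000,
 * pg_offset == 0x678 and tmp_length == 0x3677, so the region spans
 * (0x3677 >> 12) + 1 == 4 pages and needs 4 MTT entries, even though
 * the length alone covers only 3 pages.
 */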
2295
2296
2297 /*
2298 * tavor_mr_mem_bind()
2299 * Context: Can be called from interrupt or base context.
2300 */
2301 static int
2302 tavor_mr_mem_bind(tavor_state_t *state, tavor_bind_info_t *bind,
2303 ddi_dma_handle_t dmahdl, uint_t sleep)
2304 {
2305 ddi_dma_attr_t dma_attr;
2306 int (*callback)(caddr_t);
2307 uint_t dma_xfer_mode;
2308 int status;
2309
2310 /* bi_type must be set to a meaningful value to get a bind handle */
2311 ASSERT(bind->bi_type == TAVOR_BINDHDL_VADDR ||
2312 bind->bi_type == TAVOR_BINDHDL_BUF ||
2313 bind->bi_type == TAVOR_BINDHDL_UBUF);
2314
2315 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2316
2317 /* Set the callback flag appropriately */
2318 callback = (sleep == TAVOR_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2319
2320 /* Determine whether to map STREAMING or CONSISTENT */
2321 dma_xfer_mode = (bind->bi_flags & IBT_MR_NONCOHERENT) ?
2322 DDI_DMA_STREAMING : DDI_DMA_CONSISTENT;
2323
2324 /*
2325 * Initialize many of the default DMA attributes. Then, if we're
2326 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2327 */
2328 if (dmahdl == NULL) {
2329 tavor_dma_attr_init(&dma_attr);
2330 #ifdef __sparc
2331 /*
2332 * First, disable streaming and switch to consistent if
2333 * configured to do so and IOMMU BYPASS is enabled.
2334 */
2335 if (state->ts_cfg_profile->cp_disable_streaming_on_bypass &&
2336 dma_xfer_mode == DDI_DMA_STREAMING &&
2337 bind->bi_bypass == TAVOR_BINDMEM_BYPASS) {
2338 dma_xfer_mode = DDI_DMA_CONSISTENT;
2339 }
2340
2341 /*
2342 * Then request bypass (DDI_DMA_FORCE_PHYSICAL) only for consistent
2343 * mappings; if streaming is still specified, "bypass" is not allowed.
2344 */
2345 if ((dma_xfer_mode == DDI_DMA_CONSISTENT) &&
2346 (bind->bi_bypass == TAVOR_BINDMEM_BYPASS)) {
2347 dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2348 }
2349 #endif
2350 /* Allocate a DMA handle for the binding */
2351 status = ddi_dma_alloc_handle(state->ts_dip, &dma_attr,
2352 callback, NULL, &bind->bi_dmahdl);
2353 if (status != DDI_SUCCESS) {
2354 return (status);
2355 }
2356 bind->bi_free_dmahdl = 1;
2357
2358 } else {
2359 bind->bi_dmahdl = dmahdl;
2360 bind->bi_free_dmahdl = 0;
2361 }
2362
2363 /*
2364 * Bind the memory to get the PCI mapped addresses. The decision
2365 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2366 * is determined by the "bi_type" flag. Note: if the bind operation
2367 * fails then we have to free up the DMA handle and return error.
2368 */
2369 if (bind->bi_type == TAVOR_BINDHDL_VADDR) {
2370 status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2371 (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2372 (DDI_DMA_RDWR | dma_xfer_mode), callback, NULL,
2373 &bind->bi_dmacookie, &bind->bi_cookiecnt);
2374 } else { /* TAVOR_BINDHDL_BUF || TAVOR_BINDHDL_UBUF */
2375 status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2376 bind->bi_buf, (DDI_DMA_RDWR | dma_xfer_mode), callback,
2377 NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2378 }
2379
2380 if (status != DDI_DMA_MAPPED) {
2381 if (bind->bi_free_dmahdl != 0) {
2382 ddi_dma_free_handle(&bind->bi_dmahdl);
2383 }
2384 return (status);
2385 }
2386
2387 return (DDI_SUCCESS);
2388 }
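#ifdef TAVOR_MR_EXAMPLES	/* hypothetical guard; never built */
/*
 * A minimal caller-side sketch (assumptions: the guard above and the
 * function below are illustrative only; "my_state" is a driver soft
 * state pointer).  It shows the typical kernel-virtual-address binding
 * path through tavor_mr_mem_bind() above.
 */
static int
example_bind_vaddr(tavor_state_t *my_state, caddr_t kva, size_t len,
    tavor_bind_info_t *bind)
{
	bind->bi_type   = TAVOR_BINDHDL_VADDR;
	bind->bi_addr   = (uint64_t)(uintptr_t)kva;
	bind->bi_len    = (uint64_t)len;
	bind->bi_flags  = 0;			/* coherent, may sleep */
	bind->bi_bypass = TAVOR_BINDMEM_NORMAL;

	/* NULL dmahdl => tavor_mr_mem_bind() allocates its own handle */
	return (tavor_mr_mem_bind(my_state, bind, NULL, TAVOR_SLEEP));
}
#endif	/* TAVOR_MR_EXAMPLES */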
2389
2390
2391 /*
2392 * tavor_mr_mem_unbind()
2393 * Context: Can be called from interrupt or base context.
2394 */
2395 static void
2396 tavor_mr_mem_unbind(tavor_state_t *state, tavor_bind_info_t *bind)
2397 {
2398 int status;
2399
2400 /*
2401 * In the TAVOR_BINDHDL_UBUF case, the memory that bi_buf points to
2402 * was allocated internally by ddi_umem_iosetup(), so it must be
2403 * freed here.  Reset bi_type to TAVOR_BINDHDL_NONE so that it is not
2404 * freed again later.
2405 */
2406 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2407 if (bind->bi_type == TAVOR_BINDHDL_UBUF) {
2408 freerbuf(bind->bi_buf);
2409 bind->bi_type = TAVOR_BINDHDL_NONE;
2410 }
2411 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2412
2413 /*
2414 * Unbind the DMA memory for the region
2415 *
2416 * Note: The only way ddi_dma_unbind_handle() currently
2417 * can return an error is if the handle passed in is invalid.
2418 * Since this should never happen, we choose to return void
2419 * from this function.  If the unbind does fail, however, we
2420 * print a warning message to the console.
2421 */
2422 status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2423 if (status != DDI_SUCCESS) {
2424 TAVOR_WARNING(state, "failed to unbind DMA mapping");
2425 return;
2426 }
2427
2428 /* Free up the DMA handle */
2429 if (bind->bi_free_dmahdl != 0) {
2430 ddi_dma_free_handle(&bind->bi_dmahdl);
2431 }
2432 }
2433
2434
2435 /*
2436 * tavor_mr_fast_mtt_write()
2437 * Context: Can be called from interrupt or base context.
2438 */
2439 static int
2440 tavor_mr_fast_mtt_write(tavor_rsrc_t *mtt, tavor_bind_info_t *bind,
2441 uint32_t mtt_pgsize_bits)
2442 {
2443 ddi_dma_cookie_t dmacookie;
2444 uint_t cookie_cnt;
2445 uint64_t *mtt_table;
2446 uint64_t mtt_entry;
2447 uint64_t addr, endaddr;
2448 uint64_t pagesize;
2449 int i;
2450
2451 /* Calculate page size from the suggested value passed in */
2452 pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2453
2454 /*
2455 * Walk the "cookie list" and fill in the MTT table entries
2456 */
2457 i = 0;
2458 mtt_table = (uint64_t *)mtt->tr_addr;
2459 dmacookie = bind->bi_dmacookie;
2460 cookie_cnt = bind->bi_cookiecnt;
2461 while (cookie_cnt-- > 0) {
2462 addr = dmacookie.dmac_laddress;
2463 endaddr = addr + (dmacookie.dmac_size - 1);
2464 addr = addr & ~((uint64_t)pagesize - 1);
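		/*
		 * Worked example of the rounding above (4KB pages): a
		 * cookie at 0x10600 of size 0x2200 ends at 0x127ff;
		 * rounding the start down gives 0x10000, so entries are
		 * written for pages 0x10000, 0x11000, and 0x12000.
		 */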
2465 while (addr <= endaddr) {
2466 /*
2467 * Fill in the mapped addresses (calculated above) and
2468 * set TAVOR_MTT_ENTRY_PRESET flag for each MTT entry.
2469 */
2470 mtt_entry = addr | TAVOR_MTT_ENTRY_PRESET;
2471 ddi_put64(mtt->tr_acchdl, &mtt_table[i], mtt_entry);
2472 addr += pagesize;
2473 i++;
2474
2475 if (addr == 0) {
2476 static int do_once = 1;
2477 _NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2478 do_once))
2479 if (do_once) {
2480 do_once = 0;
2481 cmn_err(CE_NOTE, "probable error in "
2482 "dma_cookie address from caller\n");
2483 }
2484 break;
2485 }
2486 }
2487
2488 /*
2489 * When we've reached the end of the current DMA cookie,
2490 * jump to the next cookie (if there are more)
2491 */
2492 if (cookie_cnt != 0) {
2493 ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2494 }
2495 }
2496
2497 return (DDI_SUCCESS);
2498 }
2499
2500 /*
2501 * tavor_mtt_refcnt_inc()
2502 * Context: Can be called from interrupt or base context.
2503 */
2504 static int
2505 tavor_mtt_refcnt_inc(tavor_rsrc_t *rsrc)
2506 {
2507 tavor_sw_refcnt_t *rc;
2508 uint32_t cnt;
2509
2510 rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2511
2512 /* Increment the MTT's reference count (returns the prior count) */
2513 mutex_enter(&rc->swrc_lock);
2514 cnt = rc->swrc_refcnt++;
2515 mutex_exit(&rc->swrc_lock);
2516
2517 return (cnt);
2518 }
2519
2520
2521 /*
2522 * tavor_mtt_refcnt_dec()
2523 * Context: Can be called from interrupt or base context.
2524 */
2525 static int
2526 tavor_mtt_refcnt_dec(tavor_rsrc_t *rsrc)
2527 {
2528 tavor_sw_refcnt_t *rc;
2529 uint32_t cnt;
2530
2531 rc = (tavor_sw_refcnt_t *)rsrc->tr_addr;
2532
2533 /* Decrement the MTT's reference count (returns the updated count) */
2534 mutex_enter(&rc->swrc_lock);
2535 cnt = --rc->swrc_refcnt;
2536 mutex_exit(&rc->swrc_lock);
2537
2538 return (cnt);
2539 }
2540