xref: /illumos-gate/usr/src/uts/common/io/ib/adapters/hermon/hermon_mr.c (revision 4e567b4443d7a1680a7319275e5288eef2c92319)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_mr.c
29  *    Hermon Memory Region/Window Routines
30  *
31  *    Implements all the routines necessary to provide the requisite memory
32  *    registration verbs.  These include operations like RegisterMemRegion(),
33  *    DeregisterMemRegion(), ReregisterMemRegion, RegisterSharedMemRegion,
34  *    etc., that affect Memory Regions.  It also includes the verbs that
35  *    affect Memory Windows, including AllocMemWindow(), FreeMemWindow(),
36  *    and QueryMemWindow().
37  */
38 
39 #include <sys/types.h>
40 #include <sys/conf.h>
41 #include <sys/ddi.h>
42 #include <sys/sunddi.h>
43 #include <sys/modctl.h>
44 #include <sys/esunddi.h>
45 
46 #include <sys/ib/adapters/hermon/hermon.h>
47 
48 extern uint32_t hermon_kernel_data_ro;
49 extern uint32_t hermon_user_data_ro;
50 extern int hermon_rdma_debug;
51 
52 /*
53  * Used by hermon_mr_keycalc() below to fill in the "unconstrained" portion
54  * of Hermon memory keys (LKeys and RKeys)
55  */
56 static	uint_t hermon_memkey_cnt = 0x00;
57 #define	HERMON_MEMKEY_SHIFT	24
58 
59 /* initial state of an MPT */
60 #define	HERMON_MPT_SW_OWNERSHIP	0xF	/* memory regions */
61 #define	HERMON_MPT_FREE		0x3	/* allocate lkey */
62 
63 static int hermon_mr_common_reg(hermon_state_t *state, hermon_pdhdl_t pd,
64     hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
65     hermon_mpt_rsrc_type_t mpt_type);
66 static int hermon_mr_common_rereg(hermon_state_t *state, hermon_mrhdl_t mr,
67     hermon_pdhdl_t pd, hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl_new,
68     hermon_mr_options_t *op);
69 static int hermon_mr_rereg_xlat_helper(hermon_state_t *state, hermon_mrhdl_t mr,
70     hermon_bind_info_t *bind, hermon_mr_options_t *op, uint64_t *mtt_addr,
71     uint_t sleep, uint_t *dereg_level);
72 static uint64_t hermon_mr_nummtt_needed(hermon_state_t *state,
73     hermon_bind_info_t *bind, uint_t *mtt_pgsize);
74 static int hermon_mr_mem_bind(hermon_state_t *state, hermon_bind_info_t *bind,
75     ddi_dma_handle_t dmahdl, uint_t sleep, uint_t is_buffer);
76 static void hermon_mr_mem_unbind(hermon_state_t *state,
77     hermon_bind_info_t *bind);
78 static int hermon_mr_fast_mtt_write(hermon_state_t *state, hermon_rsrc_t *mtt,
79     hermon_bind_info_t *bind, uint32_t mtt_pgsize_bits);
80 static int hermon_mr_fast_mtt_write_fmr(hermon_state_t *state,
81     hermon_rsrc_t *mtt, ibt_pmr_attr_t *mem_pattr, uint32_t mtt_pgsize_bits);
82 static uint_t hermon_mtt_refcnt_inc(hermon_rsrc_t *rsrc);
83 static uint_t hermon_mtt_refcnt_dec(hermon_rsrc_t *rsrc);
84 
85 
86 /*
87  * The Hermon umem_lockmemory() callback ops.  When userland memory is
88  * registered, these callback ops are specified.  The hermon_umap_umemlock_cb()
89  * callback will be called whenever the memory for the corresponding
90  * ddi_umem_cookie_t is being freed.
91  */
92 static struct umem_callback_ops hermon_umem_cbops = {
93 	UMEM_CALLBACK_VERSION,
94 	hermon_umap_umemlock_cb,
95 };
96 
97 
98 
99 /*
100  * hermon_mr_register()
101  *    Context: Can be called from interrupt or base context.
102  */
103 int
104 hermon_mr_register(hermon_state_t *state, hermon_pdhdl_t pd,
105     ibt_mr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
106     hermon_mpt_rsrc_type_t mpt_type)
107 {
108 	hermon_bind_info_t	bind;
109 	int			status;
110 
111 	/*
112 	 * Fill in the "bind" struct.  This struct provides the majority
113 	 * of the information that will be used to distinguish between an
114 	 * "addr" binding (as is the case here) and a "buf" binding (see
115 	 * below).  The "bind" struct is later passed to hermon_mr_mem_bind()
116 	 * which does most of the "heavy lifting" for the Hermon memory
117 	 * registration routines.
118 	 */
119 	bind.bi_type  = HERMON_BINDHDL_VADDR;
120 	bind.bi_addr  = mr_attr->mr_vaddr;
121 	bind.bi_len   = mr_attr->mr_len;
122 	bind.bi_as    = mr_attr->mr_as;
123 	bind.bi_flags = mr_attr->mr_flags;
124 	status = hermon_mr_common_reg(state, pd, &bind, mrhdl, op,
125 	    mpt_type);
126 	return (status);
127 }
128 
129 
130 /*
131  * hermon_mr_register_buf()
132  *    Context: Can be called from interrupt or base context.
133  */
134 int
135 hermon_mr_register_buf(hermon_state_t *state, hermon_pdhdl_t pd,
136     ibt_smr_attr_t *mr_attr, struct buf *buf, hermon_mrhdl_t *mrhdl,
137     hermon_mr_options_t *op, hermon_mpt_rsrc_type_t mpt_type)
138 {
139 	hermon_bind_info_t	bind;
140 	int			status;
141 
142 	/*
143 	 * Fill in the "bind" struct.  This struct provides the majority
144 	 * of the information that will be used to distinguish between an
145 	 * "addr" binding (see above) and a "buf" binding (as is the case
146 	 * here).  The "bind" struct is later passed to hermon_mr_mem_bind()
147 	 * which does most of the "heavy lifting" for the Hermon memory
148 	 * registration routines.  Note: We have chosen to provide
149 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
150 	 * not set).  It is not critical what value we choose here as it need
151 	 * only be unique for the given RKey (which will happen by default),
152 	 * so the choice here is somewhat arbitrary.
153 	 */
154 	bind.bi_type  = HERMON_BINDHDL_BUF;
155 	bind.bi_buf   = buf;
156 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
157 		bind.bi_addr  = mr_attr->mr_vaddr;
158 	} else {
159 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
160 	}
161 	bind.bi_as    = NULL;
162 	bind.bi_len   = (uint64_t)buf->b_bcount;
163 	bind.bi_flags = mr_attr->mr_flags;
164 	status = hermon_mr_common_reg(state, pd, &bind, mrhdl, op, mpt_type);
165 	return (status);
166 }
167 
168 
169 /*
170  * hermon_mr_register_shared()
171  *    Context: Can be called from interrupt or base context.
172  */
173 int
174 hermon_mr_register_shared(hermon_state_t *state, hermon_mrhdl_t mrhdl,
175     hermon_pdhdl_t pd, ibt_smr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl_new)
176 {
177 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
178 	hermon_umap_db_entry_t	*umapdb;
179 	hermon_hw_dmpt_t	mpt_entry;
180 	hermon_mrhdl_t		mr;
181 	hermon_bind_info_t	*bind;
182 	ddi_umem_cookie_t	umem_cookie;
183 	size_t			umem_len;
184 	caddr_t			umem_addr;
185 	uint64_t		mtt_addr, pgsize_msk;
186 	uint_t			sleep, mr_is_umem;
187 	int			status, umem_flags;
188 
189 	/*
190 	 * Check the sleep flag.  Ensure that it is consistent with the
191 	 * current thread context (i.e. if we are currently in the interrupt
192 	 * context, then we shouldn't be attempting to sleep).
193 	 */
194 	sleep = (mr_attr->mr_flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP :
195 	    HERMON_SLEEP;
196 	if ((sleep == HERMON_SLEEP) &&
197 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
198 		status = IBT_INVALID_PARAM;
199 		goto mrshared_fail;
200 	}
201 
202 	/* Increment the reference count on the protection domain (PD) */
203 	hermon_pd_refcnt_inc(pd);
204 
205 	/*
206 	 * Allocate an MPT entry.  This will be filled in with all the
207 	 * necessary parameters to define the shared memory region.
208 	 * Specifically, it will be made to reference the currently existing
209 	 * MTT entries and ownership of the MPT will be passed to the hardware
210 	 * in the last step below.  If we fail here, we must undo the
211 	 * protection domain reference count.
212 	 */
213 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
214 	if (status != DDI_SUCCESS) {
215 		status = IBT_INSUFF_RESOURCE;
216 		goto mrshared_fail1;
217 	}
218 
219 	/*
220 	 * Allocate the software structure for tracking the shared memory
221 	 * region (i.e. the Hermon Memory Region handle).  If we fail here, we
222 	 * must undo the protection domain reference count and the previous
223 	 * resource allocation.
224 	 */
225 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
226 	if (status != DDI_SUCCESS) {
227 		status = IBT_INSUFF_RESOURCE;
228 		goto mrshared_fail2;
229 	}
230 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
231 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
232 
233 	/*
234 	 * Setup and validate the memory region access flags.  This means
235 	 * translating the IBTF's enable flags into the access flags that
236 	 * will be used in later operations.
237 	 */
238 	mr->mr_accflag = 0;
239 	if (mr_attr->mr_flags & IBT_MR_ENABLE_WINDOW_BIND)
240 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
241 	if (mr_attr->mr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
242 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
243 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_READ)
244 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
245 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
246 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
247 	if (mr_attr->mr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
248 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
249 
250 	/*
251 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
252 	 * from a certain number of "constrained" bits (the least significant
253 	 * bits) and some number of "unconstrained" bits.  The constrained
254 	 * bits must be set to the index of the entry in the MPT table, but
255 	 * the unconstrained bits can be set to any value we wish.  Note:
256 	 * if no remote access is required, then the RKey value is not filled
257 	 * in.  Otherwise both Rkey and LKey are given the same value.
258 	 */
259 	mr->mr_rkey = mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
260 
261 	/* Grab the MR lock for the current memory region */
262 	mutex_enter(&mrhdl->mr_lock);
263 
264 	/*
265 	 * Check here to see if the memory region has already been partially
266 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
267 	 * If so, this is an error, return failure.
268 	 */
269 	if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
270 		mutex_exit(&mrhdl->mr_lock);
271 		status = IBT_MR_HDL_INVALID;
272 		goto mrshared_fail3;
273 	}
274 
275 	/*
276 	 * Determine if the original memory was from userland and, if so, pin
277 	 * the pages (again) with umem_lockmemory().  This will guarantee a
278 	 * separate callback for each of this shared region's MR handles.
279 	 * If this is userland memory, then allocate an entry in the
280 	 * "userland resources database".  This will later be added to
281 	 * the database (after all further memory registration operations are
282 	 * successful).  If we fail here, we must undo all the above setup.
283 	 */
284 	mr_is_umem = mrhdl->mr_is_umem;
285 	if (mr_is_umem) {
286 		umem_len   = ptob(btopr(mrhdl->mr_bindinfo.bi_len));
287 		umem_addr  = (caddr_t)((uintptr_t)mrhdl->mr_bindinfo.bi_addr &
288 		    ~PAGEOFFSET);
289 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
290 		    DDI_UMEMLOCK_LONGTERM);
291 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
292 		    &umem_cookie, &hermon_umem_cbops, NULL);
293 		if (status != 0) {
294 			mutex_exit(&mrhdl->mr_lock);
295 			status = IBT_INSUFF_RESOURCE;
296 			goto mrshared_fail3;
297 		}
298 
299 		umapdb = hermon_umap_db_alloc(state->hs_instance,
300 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
301 		    (uint64_t)(uintptr_t)rsrc);
302 		if (umapdb == NULL) {
303 			mutex_exit(&mrhdl->mr_lock);
304 			status = IBT_INSUFF_RESOURCE;
305 			goto mrshared_fail4;
306 		}
307 	}
308 
309 	/*
310 	 * Copy the MTT resource pointer (and additional parameters) from
311 	 * the original Hermon Memory Region handle.  Note: this is normally
312 	 * where the hermon_mr_mem_bind() routine would be called, but because
313 	 * we already have bound and filled-in MTT entries it is simply a
314 	 * matter here of managing the MTT reference count and grabbing the
315 	 * address of the MTT table entries (for filling in the shared region's
316 	 * MPT entry).
317 	 */
318 	mr->mr_mttrsrcp	  = mrhdl->mr_mttrsrcp;
319 	mr->mr_logmttpgsz = mrhdl->mr_logmttpgsz;
320 	mr->mr_bindinfo	  = mrhdl->mr_bindinfo;
321 	mr->mr_mttrefcntp = mrhdl->mr_mttrefcntp;
322 	mutex_exit(&mrhdl->mr_lock);
323 	bind = &mr->mr_bindinfo;
324 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
325 	mtt = mr->mr_mttrsrcp;
326 
327 	/*
328 	 * Increment the MTT reference count (to reflect the fact that
329 	 * the MTT is now shared)
330 	 */
331 	(void) hermon_mtt_refcnt_inc(mr->mr_mttrefcntp);
332 
333 	/*
334 	 * Update the new "bind" virtual address.  Do some extra work here
335 	 * to ensure proper alignment.  That is, make sure that the page
336 	 * offset for the beginning of the old range is the same as the
337 	 * offset for this new mapping
338 	 */
339 	pgsize_msk = (((uint64_t)1 << mr->mr_logmttpgsz) - 1);
340 	bind->bi_addr = ((mr_attr->mr_vaddr & ~pgsize_msk) |
341 	    (mr->mr_bindinfo.bi_addr & pgsize_msk));
342 
343 	/*
344 	 * Fill in the MPT entry.  This is the final step before passing
345 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
346 	 * the information collected/calculated above to fill in the
347 	 * requisite portions of the MPT.
348 	 */
349 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
350 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
351 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
352 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
353 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
354 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
355 	mpt_entry.lr	  = 1;
356 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
357 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
358 	mpt_entry.mem_key	= mr->mr_lkey;
359 	mpt_entry.pd		= pd->pd_pdnum;
360 	mpt_entry.start_addr	= bind->bi_addr;
361 	mpt_entry.reg_win_len	= bind->bi_len;
362 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
363 	mpt_entry.mtt_addr_h = mtt_addr >> 32;
364 	mpt_entry.mtt_addr_l = mtt_addr >> 3;
365 
366 	/*
367 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
368 	 * the entry to the hardware.  Note: in general, this operation
369 	 * shouldn't fail.  But if it does, we have to undo everything we've
370 	 * done above before returning error.
371 	 */
372 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
373 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
374 	if (status != HERMON_CMD_SUCCESS) {
375 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
376 		    status);
377 		if (status == HERMON_CMD_INVALID_STATUS) {
378 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
379 		}
380 		status = ibc_get_ci_failure(0);
381 		goto mrshared_fail5;
382 	}
383 
384 	/*
385 	 * Fill in the rest of the Hermon Memory Region handle.  Having
386 	 * successfully transferred ownership of the MPT, we can update the
387 	 * following fields for use in further operations on the MR.
388 	 */
389 	mr->mr_mptrsrcp	  = mpt;
390 	mr->mr_mttrsrcp	  = mtt;
391 	mr->mr_mpt_type	  = HERMON_MPT_DMPT;
392 	mr->mr_pdhdl	  = pd;
393 	mr->mr_rsrcp	  = rsrc;
394 	mr->mr_is_umem	  = mr_is_umem;
395 	mr->mr_is_fmr	  = 0;
396 	mr->mr_umemcookie = (mr_is_umem != 0) ? umem_cookie : NULL;
397 	mr->mr_umem_cbfunc = NULL;
398 	mr->mr_umem_cbarg1 = NULL;
399 	mr->mr_umem_cbarg2 = NULL;
400 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
401 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
402 
403 	/*
404 	 * If this is userland memory, then we need to insert the previously
405 	 * allocated entry into the "userland resources database".  This will
406 	 * allow for later coordination between the hermon_umap_umemlock_cb()
407 	 * callback and hermon_mr_deregister().
408 	 */
409 	if (mr_is_umem) {
410 		hermon_umap_db_add(umapdb);
411 	}
412 
413 	*mrhdl_new = mr;
414 
415 	return (DDI_SUCCESS);
416 
417 /*
418  * The following is cleanup for all possible failure cases in this routine
419  */
420 mrshared_fail5:
421 	(void) hermon_mtt_refcnt_dec(mr->mr_mttrefcntp);
422 	if (mr_is_umem) {
423 		hermon_umap_db_free(umapdb);
424 	}
425 mrshared_fail4:
426 	if (mr_is_umem) {
427 		ddi_umem_unlock(umem_cookie);
428 	}
429 mrshared_fail3:
430 	hermon_rsrc_free(state, &rsrc);
431 mrshared_fail2:
432 	hermon_rsrc_free(state, &mpt);
433 mrshared_fail1:
434 	hermon_pd_refcnt_dec(pd);
435 mrshared_fail:
436 	return (status);
437 }
438 
439 /*
440  * hermon_mr_alloc_fmr()
441  *    Context: Can be called from interrupt or base context.
442  */
443 int
444 hermon_mr_alloc_fmr(hermon_state_t *state, hermon_pdhdl_t pd,
445     hermon_fmrhdl_t fmr_pool, hermon_mrhdl_t *mrhdl)
446 {
447 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
448 	hermon_hw_dmpt_t	mpt_entry;
449 	hermon_mrhdl_t		mr;
450 	hermon_bind_info_t	bind;
451 	uint64_t		mtt_addr;
452 	uint64_t		nummtt;
453 	uint_t			sleep, mtt_pgsize_bits;
454 	int			status;
455 	offset_t		i;
456 	hermon_icm_table_t	*icm_table;
457 	hermon_dma_info_t	*dma_info;
458 	uint32_t		index1, index2, rindx;
459 
460 	/*
461 	 * Check the sleep flag.  Ensure that it is consistent with the
462 	 * current thread context (i.e. if we are currently in the interrupt
463 	 * context, then we shouldn't be attempting to sleep).
464 	 */
465 	sleep = (fmr_pool->fmr_flags & IBT_MR_SLEEP) ? HERMON_SLEEP :
466 	    HERMON_NOSLEEP;
467 	if ((sleep == HERMON_SLEEP) &&
468 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
469 		return (IBT_INVALID_PARAM);
470 	}
471 
472 	/* Increment the reference count on the protection domain (PD) */
473 	hermon_pd_refcnt_inc(pd);
474 
475 	/*
476 	 * Allocate an MPT entry.  This will be filled in with all the
477 	 * necessary parameters to define the FMR.  Specifically, it will be
478 	 * made to reference the currently existing MTT entries and ownership
479 	 * of the MPT will be passed to the hardware in the last step below.
480 	 * If we fail here, we must undo the protection domain reference count.
481 	 */
482 
483 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
484 	if (status != DDI_SUCCESS) {
485 		status = IBT_INSUFF_RESOURCE;
486 		goto fmralloc_fail1;
487 	}
488 
489 	/*
490 	 * Allocate the software structure for tracking the fmr memory
491 	 * region (i.e. the Hermon Memory Region handle).  If we fail here, we
492 	 * must undo the protection domain reference count and the previous
493 	 * resource allocation.
494 	 */
495 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
496 	if (status != DDI_SUCCESS) {
497 		status = IBT_INSUFF_RESOURCE;
498 		goto fmralloc_fail2;
499 	}
500 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
501 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
502 
503 	/*
504 	 * Setup and validate the memory region access flags.  This means
505 	 * translating the IBTF's enable flags into the access flags that
506 	 * will be used in later operations.
507 	 */
508 	mr->mr_accflag = 0;
509 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_LOCAL_WRITE)
510 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
511 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_READ)
512 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
513 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_WRITE)
514 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
515 	if (fmr_pool->fmr_flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
516 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
517 
518 	/*
519 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
520 	 * from a certain number of "constrained" bits (the least significant
521 	 * bits) and some number of "unconstrained" bits.  The constrained
522 	 * bits must be set to the index of the entry in the MPT table, but
523 	 * the unconstrained bits can be set to any value we wish.  Note:
524 	 * if no remote access is required, then the RKey value is not filled
525 	 * in.  Otherwise both Rkey and LKey are given the same value.
526 	 */
527 	mr->mr_fmr_key = 1;	/* ready for the next reload */
528 	mr->mr_rkey = mr->mr_lkey = mpt->hr_indx;
529 
530 	/*
531 	 * Determine number of pages spanned.  This routine uses the
532 	 * information in the "bind" struct to determine the required
533 	 * number of MTT entries needed (and returns the suggested page size -
534 	 * as a "power-of-2" - for each MTT entry).
535 	 */
536 	/* Assume address will be page aligned later */
537 	bind.bi_addr = 0;
538 	/* Calculate size based on given max pages */
539 	bind.bi_len = fmr_pool->fmr_max_pages << PAGESHIFT;
540 	nummtt = hermon_mr_nummtt_needed(state, &bind, &mtt_pgsize_bits);
541 
542 	/*
543 	 * Allocate the MTT entries.  Use the calculations performed above to
544 	 * allocate the required number of MTT entries.  If we fail here, we
545 	 * must not only undo all the previous resource allocation (and PD
546 	 * reference count), but we must also unbind the memory.
547 	 */
548 	status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt, sleep, &mtt);
549 	if (status != DDI_SUCCESS) {
550 		status = IBT_INSUFF_RESOURCE;
551 		goto fmralloc_fail3;
552 	}
553 	mr->mr_logmttpgsz = mtt_pgsize_bits;
554 
555 	/*
556 	 * Fill in the MPT entry.  This is the final step before passing
557 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
558 	 * the information collected/calculated above to fill in the
559 	 * requisite portions of the MPT.
560 	 */
561 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
562 	mpt_entry.en_bind = 0;
563 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
564 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
565 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
566 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
567 	mpt_entry.lr	  = 1;
568 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
569 	mpt_entry.pd		= pd->pd_pdnum;
570 
571 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
572 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
573 	mpt_entry.fast_reg_en = 1;
574 	mpt_entry.mtt_size = (uint_t)nummtt;
575 	mpt_entry.mtt_addr_h = mtt_addr >> 32;
576 	mpt_entry.mtt_addr_l = mtt_addr >> 3;
577 	mpt_entry.mem_key = mr->mr_lkey;
578 
579 	/*
580 	 * FMR sets these to 0 for now.  Later during actual fmr registration
581 	 * these values are filled in.
582 	 */
583 	mpt_entry.start_addr	= 0;
584 	mpt_entry.reg_win_len	= 0;
585 
586 	/*
587 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
588 	 * the entry to the hardware.  Note: in general, this operation
589 	 * shouldn't fail.  But if it does, we have to undo everything we've
590 	 * done above before returning error.
591 	 */
592 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
593 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
594 	if (status != HERMON_CMD_SUCCESS) {
595 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
596 		    status);
597 		if (status == HERMON_CMD_INVALID_STATUS) {
598 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
599 		}
600 		status = ibc_get_ci_failure(0);
601 		goto fmralloc_fail4;
602 	}
603 
604 	/*
605 	 * Fill in the rest of the Hermon Memory Region handle.  Having
606 	 * successfully transferred ownership of the MPT, we can update the
607 	 * following fields for use in further operations on the MR.  Also, set
608 	 * that this is an FMR region.
609 	 */
610 	mr->mr_mptrsrcp	  = mpt;
611 	mr->mr_mttrsrcp	  = mtt;
612 
613 	mr->mr_mpt_type   = HERMON_MPT_DMPT;
614 	mr->mr_pdhdl	  = pd;
615 	mr->mr_rsrcp	  = rsrc;
616 	mr->mr_is_fmr	  = 1;
617 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
618 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
619 	mr->mr_mttaddr	   = mtt_addr;
620 	(void) memcpy(&mr->mr_bindinfo, &bind, sizeof (hermon_bind_info_t));
621 
622 	/* initialize hr_addr for use during register/deregister/invalidate */
623 	icm_table = &state->hs_icm[HERMON_DMPT];
624 	rindx = mpt->hr_indx;
625 	hermon_index(index1, index2, rindx, icm_table, i);
626 	dma_info = icm_table->icm_dma[index1] + index2;
627 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mpt))
628 	mpt->hr_addr = (void *)((uintptr_t)(dma_info->vaddr + i * mpt->hr_len));
629 
630 	*mrhdl = mr;
631 
632 	return (DDI_SUCCESS);
633 
634 /*
635  * The following is cleanup for all possible failure cases in this routine
636  */
637 fmralloc_fail4:
638 	kmem_free(mtt, sizeof (hermon_rsrc_t) * nummtt);
639 fmralloc_fail3:
640 	hermon_rsrc_free(state, &rsrc);
641 fmralloc_fail2:
642 	hermon_rsrc_free(state, &mpt);
643 fmralloc_fail1:
644 	hermon_pd_refcnt_dec(pd);
645 fmralloc_fail:
646 	return (status);
647 }
648 
649 
650 /*
651  * hermon_mr_register_physical_fmr()
652  *    Context: Can be called from interrupt or base context.
653  */
654 /*ARGSUSED*/
655 int
656 hermon_mr_register_physical_fmr(hermon_state_t *state,
657     ibt_pmr_attr_t *mem_pattr_p, hermon_mrhdl_t mr, ibt_pmr_desc_t *mem_desc_p)
658 {
659 	hermon_rsrc_t		*mpt;
660 	uint64_t		*mpt_table;
661 	int			status;
662 	uint32_t		key;
663 
664 	mutex_enter(&mr->mr_lock);
665 	mpt = mr->mr_mptrsrcp;
666 	mpt_table = (uint64_t *)mpt->hr_addr;
667 
668 	/* Write MPT status to SW bit */
669 	*(uint8_t *)mpt_table = 0xF0;
670 
671 	membar_producer();
672 
673 	/*
674 	 * Write the mapped addresses into the MTT entries.  FMR needs to do
675 	 * this a little differently, so we call the fmr specific fast mtt
676 	 * write here.
677 	 */
678 	status = hermon_mr_fast_mtt_write_fmr(state, mr->mr_mttrsrcp,
679 	    mem_pattr_p, mr->mr_logmttpgsz);
680 	if (status != DDI_SUCCESS) {
681 		mutex_exit(&mr->mr_lock);
682 		status = ibc_get_ci_failure(0);
683 		goto fmr_reg_fail1;
684 	}
685 
686 	/*
687 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
688 	 * from a certain number of "constrained" bits (the least significant
689 	 * bits) and some number of "unconstrained" bits.  The constrained
690 	 * bits must be set to the index of the entry in the MPT table, but
691 	 * the unconstrained bits can be set to any value we wish.  Note:
692 	 * if no remote access is required, then the RKey value is not filled
693 	 * in.  Otherwise both Rkey and LKey are given the same value.
694 	 */
695 	key = mpt->hr_indx | (mr->mr_fmr_key++ << HERMON_MEMKEY_SHIFT);
696 	mr->mr_lkey = mr->mr_rkey = hermon_mr_key_swap(key);
697 
698 	/* write mem key value */
699 	*(uint32_t *)&mpt_table[1] = htonl(key);
700 
701 	/* write length value */
702 	mpt_table[3] = htonll(mem_pattr_p->pmr_len);
703 
704 	/* write start addr value */
705 	mpt_table[2] = htonll(mem_pattr_p->pmr_iova);
706 
707 	/* write lkey value */
708 	*(uint32_t *)&mpt_table[4] = htonl(key);
709 
710 	membar_producer();
711 
712 	/* Write MPT status to HW bit */
713 	*(uint8_t *)mpt_table = 0x00;
714 
715 	/* Fill in return parameters */
716 	mem_desc_p->pmd_lkey = mr->mr_lkey;
717 	mem_desc_p->pmd_rkey = mr->mr_rkey;
718 	mem_desc_p->pmd_iova = mem_pattr_p->pmr_iova;
719 	mem_desc_p->pmd_phys_buf_list_sz = mem_pattr_p->pmr_len;
720 
721 	/* Fill in MR bindinfo struct for later sync or query operations */
722 	mr->mr_bindinfo.bi_addr = mem_pattr_p->pmr_iova;
723 	mr->mr_bindinfo.bi_flags = mem_pattr_p->pmr_flags & IBT_MR_NONCOHERENT;
724 
725 	mutex_exit(&mr->mr_lock);
726 
727 	return (DDI_SUCCESS);
728 
729 fmr_reg_fail1:
730 	/*
731 	 * Note, we fail here, and purposely leave the memory ownership in
732 	 * software.  The memory tables may be corrupt, so we leave the region
733 	 * unregistered.
734 	 */
735 	return (status);
736 }
737 
738 
739 /*
740  * hermon_mr_deregister()
741  *    Context: Can be called from interrupt or base context.
742  */
743 /* ARGSUSED */
744 int
745 hermon_mr_deregister(hermon_state_t *state, hermon_mrhdl_t *mrhdl, uint_t level,
746     uint_t sleep)
747 {
748 	hermon_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
749 	hermon_umap_db_entry_t	*umapdb;
750 	hermon_pdhdl_t		pd;
751 	hermon_mrhdl_t		mr;
752 	hermon_bind_info_t	*bind;
753 	uint64_t		value;
754 	int			status;
755 	uint_t			shared_mtt;
756 
757 	/*
758 	 * Check the sleep flag.  Ensure that it is consistent with the
759 	 * current thread context (i.e. if we are currently in the interrupt
760 	 * context, then we shouldn't be attempting to sleep).
761 	 */
762 	if ((sleep == HERMON_SLEEP) &&
763 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
764 		status = IBT_INVALID_PARAM;
765 		return (status);
766 	}
767 
768 	/*
769 	 * Pull all the necessary information from the Hermon Memory Region
770 	 * handle.  This is necessary here because the resource for the
771 	 * MR handle is going to be freed up as part of the this
772 	 * deregistration
773 	 */
774 	mr	= *mrhdl;
775 	mutex_enter(&mr->mr_lock);
776 	mpt	= mr->mr_mptrsrcp;
777 	mtt	= mr->mr_mttrsrcp;
778 	mtt_refcnt = mr->mr_mttrefcntp;
779 	rsrc	= mr->mr_rsrcp;
780 	pd	= mr->mr_pdhdl;
781 	bind	= &mr->mr_bindinfo;
782 
783 	/*
784 	 * Check here if the memory region is really an FMR.  If so, this is a
785 	 * bad thing and we shouldn't be here.  Return failure.
786 	 */
787 	if (mr->mr_is_fmr) {
788 		mutex_exit(&mr->mr_lock);
789 		return (IBT_INVALID_PARAM);
790 	}
791 
792 	/*
793 	 * Check here to see if the memory region has already been partially
794 	 * deregistered as a result of the hermon_umap_umemlock_cb() callback.
795 	 * If so, then jump to the end and free the remaining resources.
796 	 */
797 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
798 		goto mrdereg_finish_cleanup;
799 	}
800 	if (hermon_rdma_debug & 0x4)
801 		IBTF_DPRINTF_L2("mr", "dereg: mr %p  key %x",
802 		    mr, mr->mr_rkey);
803 
804 	/*
805 	 * We must drop the "mr_lock" here to ensure that both SLEEP and
806 	 * NOSLEEP calls into the firmware work as expected.  Also, if two
807 	 * threads are attemping to access this MR (via de-register,
808 	 * re-register, or otherwise), then we allow the firmware to enforce
809 	 * the checking, that only one deregister is valid.
810 	 */
811 	mutex_exit(&mr->mr_lock);
812 
813 	/*
814 	 * Reclaim MPT entry from hardware (if necessary).  Since the
815 	 * hermon_mr_deregister() routine is used in the memory region
816 	 * reregistration process as well, it is possible that we will
817 	 * not always wish to reclaim ownership of the MPT.  Check the
818 	 * "level" arg and, if necessary, attempt to reclaim it.  If
819 	 * the ownership transfer fails for any reason, we check to see
820 	 * what command status was returned from the hardware.  The only
821 	 * "expected" error status is the one that indicates an attempt to
822 	 * deregister a memory region that has memory windows bound to it
823 	 */
824 	if (level >= HERMON_MR_DEREG_ALL) {
825 		if (mr->mr_mpt_type >= HERMON_MPT_DMPT) {
826 			status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT,
827 			    NULL, 0, mpt->hr_indx, sleep);
828 			if (status != HERMON_CMD_SUCCESS) {
829 				if (status == HERMON_CMD_REG_BOUND) {
830 					return (IBT_MR_IN_USE);
831 				} else {
832 					cmn_err(CE_CONT, "Hermon: HW2SW_MPT "
833 					    "command failed: %08x\n", status);
834 					if (status ==
835 					    HERMON_CMD_INVALID_STATUS) {
836 						hermon_fm_ereport(state,
837 						    HCA_SYS_ERR,
838 						    DDI_SERVICE_LOST);
839 					}
840 					return (IBT_INVALID_PARAM);
841 				}
842 			}
843 		}
844 	}
845 
846 	/*
847 	 * Re-grab the mr_lock here.  Since further access to the protected
848 	 * 'mr' structure is needed, and we would have returned previously for
849 	 * the multiple deregistration case, we can safely grab the lock here.
850 	 */
851 	mutex_enter(&mr->mr_lock);
852 
853 	/*
854 	 * If the memory had come from userland, then we do a lookup in the
855 	 * "userland resources database".  On success, we free the entry, call
856 	 * ddi_umem_unlock(), and continue the cleanup.  On failure (which is
857 	 * an indication that the umem_lockmemory() callback has called
858 	 * hermon_mr_deregister()), we call ddi_umem_unlock() and invalidate
859 	 * the "mr_umemcookie" field in the MR handle (this will be used
860 	 * later to detect that only partial cleaup still remains to be done
861 	 * on the MR handle).
862 	 */
863 	if (mr->mr_is_umem) {
864 		status = hermon_umap_db_find(state->hs_instance,
865 		    (uint64_t)(uintptr_t)mr->mr_umemcookie,
866 		    MLNX_UMAP_MRMEM_RSRC, &value, HERMON_UMAP_DB_REMOVE,
867 		    &umapdb);
868 		if (status == DDI_SUCCESS) {
869 			hermon_umap_db_free(umapdb);
870 			ddi_umem_unlock(mr->mr_umemcookie);
871 		} else {
872 			ddi_umem_unlock(mr->mr_umemcookie);
873 			mr->mr_umemcookie = NULL;
874 		}
875 	}
876 
877 	/*
878 	 * Decrement the MTT reference count.  Since the MTT resource
879 	 * may be shared between multiple memory regions (as a result
880 	 * of a "RegisterSharedMR" verb) it is important that we not
881 	 * free up or unbind resources prematurely.  If it's not shared (as
882 	 * indicated by the return status), then free the resource.
883 	 */
884 	shared_mtt = hermon_mtt_refcnt_dec(mtt_refcnt);
885 	if (!shared_mtt) {
886 		hermon_rsrc_free(state, &mtt_refcnt);
887 	}
888 
889 	/*
890 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
891 	 * attempt to free these resources only if it is appropriate to do so.
892 	 */
893 	if (!shared_mtt) {
894 		if (level >= HERMON_MR_DEREG_NO_HW2SW_MPT) {
895 			hermon_mr_mem_unbind(state, bind);
896 		}
897 		hermon_rsrc_free(state, &mtt);
898 	}
899 
900 	/*
901 	 * If the MR handle has been invalidated, then drop the
902 	 * lock and return success.  Note: This only happens because
903 	 * the umem_lockmemory() callback has been triggered.  The
904 	 * cleanup here is partial, and further cleanup (in a
905 	 * subsequent hermon_mr_deregister() call) will be necessary.
906 	 */
907 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
908 		mutex_exit(&mr->mr_lock);
909 		return (DDI_SUCCESS);
910 	}
911 
912 mrdereg_finish_cleanup:
913 	mutex_exit(&mr->mr_lock);
914 
915 	/* Free the Hermon Memory Region handle */
916 	hermon_rsrc_free(state, &rsrc);
917 
918 	/* Free up the MPT entry resource */
919 	if (mpt != NULL)
920 		hermon_rsrc_free(state, &mpt);
921 
922 	/* Decrement the reference count on the protection domain (PD) */
923 	hermon_pd_refcnt_dec(pd);
924 
925 	/* Set the mrhdl pointer to NULL and return success */
926 	*mrhdl = NULL;
927 
928 	return (DDI_SUCCESS);
929 }
930 
931 /*
932  * hermon_mr_dealloc_fmr()
933  *    Context: Can be called from interrupt or base context.
934  */
935 /* ARGSUSED */
936 int
937 hermon_mr_dealloc_fmr(hermon_state_t *state, hermon_mrhdl_t *mrhdl)
938 {
939 	hermon_rsrc_t		*mpt, *mtt, *rsrc;
940 	hermon_pdhdl_t		pd;
941 	hermon_mrhdl_t		mr;
942 
943 	/*
944 	 * Pull all the necessary information from the Hermon Memory Region
945 	 * handle.  This is necessary here because the resource for the
946 	 * MR handle is going to be freed up as part of the this
947 	 * deregistration
948 	 */
949 	mr	= *mrhdl;
950 	mutex_enter(&mr->mr_lock);
951 	mpt	= mr->mr_mptrsrcp;
952 	mtt	= mr->mr_mttrsrcp;
953 	rsrc	= mr->mr_rsrcp;
954 	pd	= mr->mr_pdhdl;
955 	mutex_exit(&mr->mr_lock);
956 
957 	/* Free the MTT entries */
958 	hermon_rsrc_free(state, &mtt);
959 
960 	/* Free the Hermon Memory Region handle */
961 	hermon_rsrc_free(state, &rsrc);
962 
963 	/* Free up the MPT entry resource */
964 	hermon_rsrc_free(state, &mpt);
965 
966 	/* Decrement the reference count on the protection domain (PD) */
967 	hermon_pd_refcnt_dec(pd);
968 
969 	/* Set the mrhdl pointer to NULL and return success */
970 	*mrhdl = NULL;
971 
972 	return (DDI_SUCCESS);
973 }
974 
975 /*
976  * hermon_mr_invalidate_fmr()
977  *    Context: Can be called from interrupt or base context.
978  */
979 /* ARGSUSED */
980 int
981 hermon_mr_invalidate_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
982 {
983 	hermon_rsrc_t		*mpt;
984 	uint64_t		*mpt_table;
985 
986 	mutex_enter(&mr->mr_lock);
987 	mpt = mr->mr_mptrsrcp;
988 	mpt_table = (uint64_t *)mpt->hr_addr;
989 
990 	/* Write MPT status to SW bit */
991 	*(uint8_t *)&mpt_table[0] = 0xF0;
992 
993 	membar_producer();
994 
995 	/* invalidate mem key value */
996 	*(uint32_t *)&mpt_table[1] = 0;
997 
998 	/* invalidate lkey value */
999 	*(uint32_t *)&mpt_table[4] = 0;
1000 
1001 	membar_producer();
1002 
1003 	/* Write MPT status to HW bit */
1004 	*(uint8_t *)&mpt_table[0] = 0x00;
1005 
1006 	mutex_exit(&mr->mr_lock);
1007 
1008 	return (DDI_SUCCESS);
1009 }
1010 
1011 /*
1012  * hermon_mr_deregister_fmr()
1013  *    Context: Can be called from interrupt or base context.
1014  */
1015 /* ARGSUSED */
1016 int
1017 hermon_mr_deregister_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
1018 {
1019 	hermon_rsrc_t		*mpt;
1020 	uint64_t		*mpt_table;
1021 
1022 	mutex_enter(&mr->mr_lock);
1023 	mpt = mr->mr_mptrsrcp;
1024 	mpt_table = (uint64_t *)mpt->hr_addr;
1025 
1026 	/* Write MPT status to SW bit */
1027 	*(uint8_t *)&mpt_table[0] = 0xF0;
1028 
1029 	mutex_exit(&mr->mr_lock);
1030 
1031 	return (DDI_SUCCESS);
1032 }
1033 
1034 
1035 /*
1036  * hermon_mr_query()
1037  *    Context: Can be called from interrupt or base context.
1038  */
1039 /* ARGSUSED */
1040 int
1041 hermon_mr_query(hermon_state_t *state, hermon_mrhdl_t mr,
1042     ibt_mr_query_attr_t *attr)
1043 {
1044 	int			status;
1045 	hermon_hw_dmpt_t	mpt_entry;
1046 	uint32_t		lkey;
1047 
1048 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr))
1049 
1050 	mutex_enter(&mr->mr_lock);
1051 
1052 	/*
1053 	 * Check here to see if the memory region has already been partially
1054 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
1055 	 * If so, this is an error, return failure.
1056 	 */
1057 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
1058 		mutex_exit(&mr->mr_lock);
1059 		return (IBT_MR_HDL_INVALID);
1060 	}
1061 
1062 	status = hermon_cmn_query_cmd_post(state, QUERY_MPT, 0,
1063 	    mr->mr_lkey >> 8, &mpt_entry, sizeof (hermon_hw_dmpt_t),
1064 	    HERMON_NOSLEEP);
1065 	if (status != HERMON_CMD_SUCCESS) {
1066 		cmn_err(CE_CONT, "Hermon: QUERY_MPT failed: status %x", status);
1067 		mutex_exit(&mr->mr_lock);
1068 		return (ibc_get_ci_failure(0));
1069 	}
1070 
1071 	/* Update the mr sw struct from the hw struct. */
1072 	lkey = mpt_entry.mem_key;
1073 	mr->mr_lkey = mr->mr_rkey = (lkey >> 8) | (lkey << 24);
1074 	mr->mr_bindinfo.bi_addr = mpt_entry.start_addr;
1075 	mr->mr_bindinfo.bi_len = mpt_entry.reg_win_len;
1076 	mr->mr_accflag = (mr->mr_accflag & IBT_MR_RO_DISABLED) |
1077 	    (mpt_entry.lw ? IBT_MR_LOCAL_WRITE : 0) |
1078 	    (mpt_entry.rr ? IBT_MR_REMOTE_READ : 0) |
1079 	    (mpt_entry.rw ? IBT_MR_REMOTE_WRITE : 0) |
1080 	    (mpt_entry.atomic ? IBT_MR_REMOTE_ATOMIC : 0) |
1081 	    (mpt_entry.en_bind ? IBT_MR_WINDOW_BIND : 0);
1082 	mr->mr_mttaddr = ((uint64_t)mpt_entry.mtt_addr_h << 32) |
1083 	    (mpt_entry.mtt_addr_l << 3);
1084 	mr->mr_logmttpgsz = mpt_entry.entity_sz;
1085 
1086 	/* Fill in the queried attributes */
1087 	attr->mr_lkey_state =
1088 	    (mpt_entry.status == HERMON_MPT_FREE) ? IBT_KEY_FREE :
1089 	    (mpt_entry.status == HERMON_MPT_SW_OWNERSHIP) ? IBT_KEY_INVALID :
1090 	    IBT_KEY_VALID;
1091 	attr->mr_phys_buf_list_sz = mpt_entry.mtt_size;
1092 	attr->mr_attr_flags = mr->mr_accflag;
1093 	attr->mr_pd = (ibt_pd_hdl_t)mr->mr_pdhdl;
1094 
1095 	/* Fill in the "local" attributes */
1096 	attr->mr_lkey = (ibt_lkey_t)mr->mr_lkey;
1097 	attr->mr_lbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1098 	attr->mr_lbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1099 
1100 	/*
1101 	 * Fill in the "remote" attributes (if necessary).  Note: the
1102 	 * remote attributes are only valid if the memory region has one
1103 	 * or more of the remote access flags set.
1104 	 */
1105 	if ((mr->mr_accflag & IBT_MR_REMOTE_READ) ||
1106 	    (mr->mr_accflag & IBT_MR_REMOTE_WRITE) ||
1107 	    (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC)) {
1108 		attr->mr_rkey = (ibt_rkey_t)mr->mr_rkey;
1109 		attr->mr_rbounds.pb_addr = (ib_vaddr_t)mr->mr_bindinfo.bi_addr;
1110 		attr->mr_rbounds.pb_len  = (size_t)mr->mr_bindinfo.bi_len;
1111 	}
1112 
1113 	/*
1114 	 * If region is mapped for streaming (i.e. noncoherent), then set sync
1115 	 * is required
1116 	 */
1117 	attr->mr_sync_required = (mr->mr_bindinfo.bi_flags &
1118 	    IBT_MR_NONCOHERENT) ? B_TRUE : B_FALSE;
1119 
1120 	mutex_exit(&mr->mr_lock);
1121 	return (DDI_SUCCESS);
1122 }
1123 
1124 
1125 /*
1126  * hermon_mr_reregister()
1127  *    Context: Can be called from interrupt or base context.
1128  */
1129 int
1130 hermon_mr_reregister(hermon_state_t *state, hermon_mrhdl_t mr,
1131     hermon_pdhdl_t pd, ibt_mr_attr_t *mr_attr, hermon_mrhdl_t *mrhdl_new,
1132     hermon_mr_options_t *op)
1133 {
1134 	hermon_bind_info_t	bind;
1135 	int			status;
1136 
1137 	/*
1138 	 * Fill in the "bind" struct.  This struct provides the majority
1139 	 * of the information that will be used to distinguish between an
1140 	 * "addr" binding (as is the case here) and a "buf" binding (see
1141 	 * below).  The "bind" struct is later passed to hermon_mr_mem_bind()
1142 	 * which does most of the "heavy lifting" for the Hermon memory
1143 	 * registration (and reregistration) routines.
1144 	 */
1145 	bind.bi_type  = HERMON_BINDHDL_VADDR;
1146 	bind.bi_addr  = mr_attr->mr_vaddr;
1147 	bind.bi_len   = mr_attr->mr_len;
1148 	bind.bi_as    = mr_attr->mr_as;
1149 	bind.bi_flags = mr_attr->mr_flags;
1150 	status = hermon_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1151 	return (status);
1152 }
1153 
1154 
1155 /*
1156  * hermon_mr_reregister_buf()
1157  *    Context: Can be called from interrupt or base context.
1158  */
1159 int
1160 hermon_mr_reregister_buf(hermon_state_t *state, hermon_mrhdl_t mr,
1161     hermon_pdhdl_t pd, ibt_smr_attr_t *mr_attr, struct buf *buf,
1162     hermon_mrhdl_t *mrhdl_new, hermon_mr_options_t *op)
1163 {
1164 	hermon_bind_info_t	bind;
1165 	int			status;
1166 
1167 	/*
1168 	 * Fill in the "bind" struct.  This struct provides the majority
1169 	 * of the information that will be used to distinguish between an
1170 	 * "addr" binding (see above) and a "buf" binding (as is the case
1171 	 * here).  The "bind" struct is later passed to hermon_mr_mem_bind()
1172 	 * which does most of the "heavy lifting" for the Hermon memory
1173 	 * registration routines.  Note: We have chosen to provide
1174 	 * "b_un.b_addr" as the IB address (when the IBT_MR_PHYS_IOVA flag is
1175 	 * not set).  It is not critical what value we choose here as it need
1176 	 * only be unique for the given RKey (which will happen by default),
1177 	 * so the choice here is somewhat arbitrary.
1178 	 */
1179 	bind.bi_type  = HERMON_BINDHDL_BUF;
1180 	bind.bi_buf   = buf;
1181 	if (mr_attr->mr_flags & IBT_MR_PHYS_IOVA) {
1182 		bind.bi_addr  = mr_attr->mr_vaddr;
1183 	} else {
1184 		bind.bi_addr  = (uint64_t)(uintptr_t)buf->b_un.b_addr;
1185 	}
1186 	bind.bi_len   = (uint64_t)buf->b_bcount;
1187 	bind.bi_flags = mr_attr->mr_flags;
1188 	bind.bi_as    = NULL;
1189 	status = hermon_mr_common_rereg(state, mr, pd, &bind, mrhdl_new, op);
1190 	return (status);
1191 }
1192 
1193 
1194 /*
1195  * hermon_mr_sync()
1196  *    Context: Can be called from interrupt or base context.
1197  */
1198 /* ARGSUSED */
1199 int
1200 hermon_mr_sync(hermon_state_t *state, ibt_mr_sync_t *mr_segs, size_t num_segs)
1201 {
1202 	hermon_mrhdl_t		mrhdl;
1203 	uint64_t		seg_vaddr, seg_len, seg_end;
1204 	uint64_t		mr_start, mr_end;
1205 	uint_t			type;
1206 	int			status, i;
1207 
1208 	/* Process each of the ibt_mr_sync_t's */
1209 	for (i = 0; i < num_segs; i++) {
1210 		mrhdl = (hermon_mrhdl_t)mr_segs[i].ms_handle;
1211 
1212 		/* Check for valid memory region handle */
1213 		if (mrhdl == NULL) {
1214 			status = IBT_MR_HDL_INVALID;
1215 			goto mrsync_fail;
1216 		}
1217 
1218 		mutex_enter(&mrhdl->mr_lock);
1219 
1220 		/*
1221 		 * Check here to see if the memory region has already been
1222 		 * partially deregistered as a result of a
1223 		 * hermon_umap_umemlock_cb() callback.  If so, this is an
1224 		 * error, return failure.
1225 		 */
1226 		if ((mrhdl->mr_is_umem) && (mrhdl->mr_umemcookie == NULL)) {
1227 			mutex_exit(&mrhdl->mr_lock);
1228 			status = IBT_MR_HDL_INVALID;
1229 			goto mrsync_fail;
1230 		}
1231 
1232 		/* Check for valid bounds on sync request */
1233 		seg_vaddr = mr_segs[i].ms_vaddr;
1234 		seg_len	  = mr_segs[i].ms_len;
1235 		seg_end	  = seg_vaddr + seg_len - 1;
1236 		mr_start  = mrhdl->mr_bindinfo.bi_addr;
1237 		mr_end	  = mr_start + mrhdl->mr_bindinfo.bi_len - 1;
1238 		if ((seg_vaddr < mr_start) || (seg_vaddr > mr_end)) {
1239 			mutex_exit(&mrhdl->mr_lock);
1240 			status = IBT_MR_VA_INVALID;
1241 			goto mrsync_fail;
1242 		}
1243 		if ((seg_end < mr_start) || (seg_end > mr_end)) {
1244 			mutex_exit(&mrhdl->mr_lock);
1245 			status = IBT_MR_LEN_INVALID;
1246 			goto mrsync_fail;
1247 		}
1248 
1249 		/* Determine what type (i.e. direction) for sync */
1250 		if (mr_segs[i].ms_flags & IBT_SYNC_READ) {
1251 			type = DDI_DMA_SYNC_FORDEV;
1252 		} else if (mr_segs[i].ms_flags & IBT_SYNC_WRITE) {
1253 			type = DDI_DMA_SYNC_FORCPU;
1254 		} else {
1255 			mutex_exit(&mrhdl->mr_lock);
1256 			status = IBT_INVALID_PARAM;
1257 			goto mrsync_fail;
1258 		}
1259 
1260 		(void) ddi_dma_sync(mrhdl->mr_bindinfo.bi_dmahdl,
1261 		    (off_t)(seg_vaddr - mr_start), (size_t)seg_len, type);
1262 
1263 		mutex_exit(&mrhdl->mr_lock);
1264 	}
1265 
1266 	return (DDI_SUCCESS);
1267 
1268 mrsync_fail:
1269 	return (status);
1270 }
1271 
1272 
1273 /*
1274  * hermon_mw_alloc()
1275  *    Context: Can be called from interrupt or base context.
1276  */
1277 int
1278 hermon_mw_alloc(hermon_state_t *state, hermon_pdhdl_t pd, ibt_mw_flags_t flags,
1279     hermon_mwhdl_t *mwhdl)
1280 {
1281 	hermon_rsrc_t		*mpt, *rsrc;
1282 	hermon_hw_dmpt_t		mpt_entry;
1283 	hermon_mwhdl_t		mw;
1284 	uint_t			sleep;
1285 	int			status;
1286 
1287 	if (state != NULL)	/* XXX - bogus test that is always TRUE */
1288 		return (IBT_INSUFF_RESOURCE);
1289 
1290 	/*
1291 	 * Check the sleep flag.  Ensure that it is consistent with the
1292 	 * current thread context (i.e. if we are currently in the interrupt
1293 	 * context, then we shouldn't be attempting to sleep).
1294 	 */
1295 	sleep = (flags & IBT_MW_NOSLEEP) ? HERMON_NOSLEEP : HERMON_SLEEP;
1296 	if ((sleep == HERMON_SLEEP) &&
1297 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1298 		status = IBT_INVALID_PARAM;
1299 		goto mwalloc_fail;
1300 	}
1301 
1302 	/* Increment the reference count on the protection domain (PD) */
1303 	hermon_pd_refcnt_inc(pd);
1304 
1305 	/*
1306 	 * Allocate an MPT entry (for use as a memory window).  Since the
1307 	 * Hermon hardware uses the MPT entry for memory regions and for
1308 	 * memory windows, we will fill in this MPT with all the necessary
1309 	 * parameters for the memory window.  And then (just as we do for
1310 	 * memory regions) ownership will be passed to the hardware in the
1311 	 * final step below.  If we fail here, we must undo the protection
1312 	 * domain reference count.
1313 	 */
1314 	status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
1315 	if (status != DDI_SUCCESS) {
1316 		status = IBT_INSUFF_RESOURCE;
1317 		goto mwalloc_fail1;
1318 	}
1319 
1320 	/*
1321 	 * Allocate the software structure for tracking the memory window (i.e.
1322 	 * the Hermon Memory Window handle).  Note: This is actually the same
1323 	 * software structure used for tracking memory regions, but since many
1324 	 * of the same properties are needed, only a single structure is
1325 	 * necessary.  If we fail here, we must undo the protection domain
1326 	 * reference count and the previous resource allocation.
1327 	 */
1328 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
1329 	if (status != DDI_SUCCESS) {
1330 		status = IBT_INSUFF_RESOURCE;
1331 		goto mwalloc_fail2;
1332 	}
1333 	mw = (hermon_mwhdl_t)rsrc->hr_addr;
1334 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1335 
1336 	/*
1337 	 * Calculate an "unbound" RKey from MPT index.  In much the same way
1338 	 * as we do for memory regions (above), this key is constructed from
1339 	 * a "constrained" (which depends on the MPT index) and an
1340 	 * "unconstrained" portion (which may be arbitrarily chosen).
1341 	 */
1342 	mw->mr_rkey = hermon_mr_keycalc(mpt->hr_indx);
1343 
1344 	/*
1345 	 * Fill in the MPT entry.  This is the final step before passing
1346 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
1347 	 * the information collected/calculated above to fill in the
1348 	 * requisite portions of the MPT.  Note: fewer entries in the MPT
1349 	 * entry are necessary to allocate a memory window.
1350 	 */
1351 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
1352 	mpt_entry.reg_win	= HERMON_MPT_IS_WINDOW;
1353 	mpt_entry.mem_key	= mw->mr_rkey;
1354 	mpt_entry.pd		= pd->pd_pdnum;
1355 	mpt_entry.lr		= 1;
1356 
1357 	/*
1358 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1359 	 * the entry to the hardware.  Note: in general, this operation
1360 	 * shouldn't fail.  But if it does, we have to undo everything we've
1361 	 * done above before returning error.
1362 	 */
1363 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1364 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
1365 	if (status != HERMON_CMD_SUCCESS) {
1366 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
1367 		    status);
1368 		if (status == HERMON_CMD_INVALID_STATUS) {
1369 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1370 		}
1371 		status = ibc_get_ci_failure(0);
1372 		goto mwalloc_fail3;
1373 	}
1374 
1375 	/*
1376 	 * Fill in the rest of the Hermon Memory Window handle.  Having
1377 	 * successfully transferred ownership of the MPT, we can update the
1378 	 * following fields for use in further operations on the MW.
1379 	 */
1380 	mw->mr_mptrsrcp	= mpt;
1381 	mw->mr_pdhdl	= pd;
1382 	mw->mr_rsrcp	= rsrc;
1383 	mw->mr_rkey	= hermon_mr_key_swap(mw->mr_rkey);
1384 	*mwhdl = mw;
1385 
1386 	return (DDI_SUCCESS);
1387 
1388 mwalloc_fail3:
1389 	hermon_rsrc_free(state, &rsrc);
1390 mwalloc_fail2:
1391 	hermon_rsrc_free(state, &mpt);
1392 mwalloc_fail1:
1393 	hermon_pd_refcnt_dec(pd);
1394 mwalloc_fail:
1395 	return (status);
1396 }
1397 
1398 
1399 /*
1400  * hermon_mw_free()
1401  *    Context: Can be called from interrupt or base context.
1402  */
1403 int
1404 hermon_mw_free(hermon_state_t *state, hermon_mwhdl_t *mwhdl, uint_t sleep)
1405 {
1406 	hermon_rsrc_t		*mpt, *rsrc;
1407 	hermon_mwhdl_t		mw;
1408 	int			status;
1409 	hermon_pdhdl_t		pd;
1410 
1411 	/*
1412 	 * Check the sleep flag.  Ensure that it is consistent with the
1413 	 * current thread context (i.e. if we are currently in the interrupt
1414 	 * context, then we shouldn't be attempting to sleep).
1415 	 */
1416 	if ((sleep == HERMON_SLEEP) &&
1417 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1418 		status = IBT_INVALID_PARAM;
1419 		return (status);
1420 	}
1421 
1422 	/*
1423 	 * Pull all the necessary information from the Hermon Memory Window
1424 	 * handle.  This is necessary here because the resource for the
1425 	 * MW handle is going to be freed up as part of the this operation.
1426 	 */
1427 	mw	= *mwhdl;
1428 	mutex_enter(&mw->mr_lock);
1429 	mpt	= mw->mr_mptrsrcp;
1430 	rsrc	= mw->mr_rsrcp;
1431 	pd	= mw->mr_pdhdl;
1432 	mutex_exit(&mw->mr_lock);
1433 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mw))
1434 
1435 	/*
1436 	 * Reclaim the MPT entry from hardware.  Note: in general, it is
1437 	 * unexpected for this operation to return an error.
1438 	 */
1439 	status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT, NULL,
1440 	    0, mpt->hr_indx, sleep);
1441 	if (status != HERMON_CMD_SUCCESS) {
1442 		cmn_err(CE_CONT, "Hermon: HW2SW_MPT command failed: %08x\n",
1443 		    status);
1444 		if (status == HERMON_CMD_INVALID_STATUS) {
1445 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1446 		}
1447 		return (ibc_get_ci_failure(0));
1448 	}
1449 
1450 	/* Free the Hermon Memory Window handle */
1451 	hermon_rsrc_free(state, &rsrc);
1452 
1453 	/* Free up the MPT entry resource */
1454 	hermon_rsrc_free(state, &mpt);
1455 
1456 	/* Decrement the reference count on the protection domain (PD) */
1457 	hermon_pd_refcnt_dec(pd);
1458 
1459 	/* Set the mwhdl pointer to NULL and return success */
1460 	*mwhdl = NULL;
1461 
1462 	return (DDI_SUCCESS);
1463 }
1464 
1465 
1466 /*
1467  * hermon_mr_keycalc()
1468  *    Context: Can be called from interrupt or base context.
1469  *    NOTE:  Produces a key in the form of
1470  *		KKKKKKKK IIIIIIII IIIIIIII IIIIIIIII
1471  *    where K == the arbitrary bits and I == the index
1472  */
1473 uint32_t
1474 hermon_mr_keycalc(uint32_t indx)
1475 {
1476 	uint32_t tmp_key, tmp_indx;
1477 
1478 	/*
1479 	 * Generate a simple key from counter.  Note:  We increment this
1480 	 * static variable _intentionally_ without any kind of mutex around
1481 	 * it.  First, single-threading all operations through a single lock
1482 	 * would be a bad idea (from a performance point-of-view).  Second,
1483 	 * the upper "unconstrained" bits don't really have to be unique
1484 	 * because the lower bits are guaranteed to be (although we do make a
1485 	 * best effort to ensure that they are).  Third, the window for the
1486 	 * race (where both threads read and update the counter at the same
1487 	 * time) is incredibly small.
1488 	 * And, lastly, we'd like to make this into a "random" key
1489 	 */
1490 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(hermon_memkey_cnt))
1491 	tmp_key = (hermon_memkey_cnt++) << HERMON_MEMKEY_SHIFT;
1492 	tmp_indx = indx & 0xffffff;
1493 	return (tmp_key | tmp_indx);
1494 }
1495 
1496 
1497 /*
1498  * hermon_mr_key_swap()
1499  *    Context: Can be called from interrupt or base context.
1500  *    NOTE:  Produces a key in the form of
1501  *		IIIIIIII IIIIIIII IIIIIIIII KKKKKKKK
1502  *    where K == the arbitrary bits and I == the index
1503  */
1504 uint32_t
1505 hermon_mr_key_swap(uint32_t indx)
1506 {
1507 	/*
1508 	 * The memory key format to pass down to the hardware is
1509 	 * (key[7:0],index[23:0]), which defines the index to the
1510 	 * hardware resource. When the driver passes this as a memory
1511 	 * key, (i.e. to retrieve a resource) the format is
1512 	 * (index[23:0],key[7:0]).
1513 	 */
1514 	return (((indx >> 24) & 0x000000ff) | ((indx << 8) & 0xffffff00));
1515 }
1516 
1517 /*
1518  * hermon_mr_common_reg()
1519  *    Context: Can be called from interrupt or base context.
1520  */
1521 static int
1522 hermon_mr_common_reg(hermon_state_t *state, hermon_pdhdl_t pd,
1523     hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl, hermon_mr_options_t *op,
1524     hermon_mpt_rsrc_type_t mpt_type)
1525 {
1526 	hermon_rsrc_t		*mpt, *mtt, *rsrc, *mtt_refcnt;
1527 	hermon_umap_db_entry_t	*umapdb;
1528 	hermon_sw_refcnt_t	*swrc_tmp;
1529 	hermon_hw_dmpt_t	mpt_entry;
1530 	hermon_mrhdl_t		mr;
1531 	ibt_mr_flags_t		flags;
1532 	hermon_bind_info_t	*bh;
1533 	ddi_dma_handle_t	bind_dmahdl;
1534 	ddi_umem_cookie_t	umem_cookie;
1535 	size_t			umem_len;
1536 	caddr_t			umem_addr;
1537 	uint64_t		mtt_addr, max_sz;
1538 	uint_t			sleep, mtt_pgsize_bits, bind_type, mr_is_umem;
1539 	int			status, umem_flags, bind_override_addr;
1540 
1541 	/*
1542 	 * Check the "options" flag.  Currently this flag tells the driver
1543 	 * whether or not the region should be bound normally (i.e. with
1544 	 * entries written into the PCI IOMMU), whether it should be
1545 	 * registered to bypass the IOMMU, and whether or not the resulting
1546 	 * address should be "zero-based" (to aid the alignment restrictions
1547 	 * for QPs).
1548 	 */
1549 	if (op == NULL) {
1550 		bind_type   = HERMON_BINDMEM_NORMAL;
1551 		bind_dmahdl = NULL;
1552 		bind_override_addr = 0;
1553 	} else {
1554 		bind_type	   = op->mro_bind_type;
1555 		bind_dmahdl	   = op->mro_bind_dmahdl;
1556 		bind_override_addr = op->mro_bind_override_addr;
1557 	}
1558 
1559 	/* check what kind of mpt to use */
1560 
1561 	/* Extract the flags field from the hermon_bind_info_t */
1562 	flags = bind->bi_flags;
1563 
1564 	/*
1565 	 * Check for invalid length.  Check is the length is zero or if the
1566 	 * length is larger than the maximum configured value.  Return error
1567 	 * if it is.
1568 	 */
1569 	max_sz = ((uint64_t)1 << state->hs_cfg_profile->cp_log_max_mrw_sz);
1570 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
1571 		status = IBT_MR_LEN_INVALID;
1572 		goto mrcommon_fail;
1573 	}
1574 
1575 	/*
1576 	 * Check the sleep flag.  Ensure that it is consistent with the
1577 	 * current thread context (i.e. if we are currently in the interrupt
1578 	 * context, then we shouldn't be attempting to sleep).
1579 	 */
1580 	sleep = (flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP: HERMON_SLEEP;
1581 	if ((sleep == HERMON_SLEEP) &&
1582 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1583 		status = IBT_INVALID_PARAM;
1584 		goto mrcommon_fail;
1585 	}
1586 
1587 	/* Increment the reference count on the protection domain (PD) */
1588 	hermon_pd_refcnt_inc(pd);
1589 
1590 	/*
1591 	 * Allocate an MPT entry.  This will be filled in with all the
1592 	 * necessary parameters to define the memory region.  And then
1593 	 * ownership will be passed to the hardware in the final step
1594 	 * below.  If we fail here, we must undo the protection domain
1595 	 * reference count.
1596 	 */
1597 	if (mpt_type == HERMON_MPT_DMPT) {
1598 		status = hermon_rsrc_alloc(state, HERMON_DMPT, 1, sleep, &mpt);
1599 		if (status != DDI_SUCCESS) {
1600 			status = IBT_INSUFF_RESOURCE;
1601 			goto mrcommon_fail1;
1602 		}
1603 	} else {
1604 		mpt = NULL;
1605 	}
1606 
1607 	/*
1608 	 * Allocate the software structure for tracking the memory region (i.e.
1609 	 * the Hermon Memory Region handle).  If we fail here, we must undo
1610 	 * the protection domain reference count and the previous resource
1611 	 * allocation.
1612 	 */
1613 	status = hermon_rsrc_alloc(state, HERMON_MRHDL, 1, sleep, &rsrc);
1614 	if (status != DDI_SUCCESS) {
1615 		status = IBT_INSUFF_RESOURCE;
1616 		goto mrcommon_fail2;
1617 	}
1618 	mr = (hermon_mrhdl_t)rsrc->hr_addr;
1619 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
1620 
1621 	/*
1622 	 * Setup and validate the memory region access flags.  This means
1623 	 * translating the IBTF's enable flags into the access flags that
1624 	 * will be used in later operations.
1625 	 */
1626 	mr->mr_accflag = 0;
1627 	if (flags & IBT_MR_ENABLE_WINDOW_BIND)
1628 		mr->mr_accflag |= IBT_MR_WINDOW_BIND;
1629 	if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
1630 		mr->mr_accflag |= IBT_MR_LOCAL_WRITE;
1631 	if (flags & IBT_MR_ENABLE_REMOTE_READ)
1632 		mr->mr_accflag |= IBT_MR_REMOTE_READ;
1633 	if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
1634 		mr->mr_accflag |= IBT_MR_REMOTE_WRITE;
1635 	if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
1636 		mr->mr_accflag |= IBT_MR_REMOTE_ATOMIC;
1637 
1638 	/*
1639 	 * Calculate keys (Lkey, Rkey) from MPT index.  Each key is formed
1640 	 * from a certain number of "constrained" bits (the least significant
1641 	 * bits) and some number of "unconstrained" bits.  The constrained
1642 	 * bits must be set to the index of the entry in the MPT table, but
1643 	 * the unconstrained bits can be set to any value we wish.  Note:
1644 	 * if no remote access is required, then the RKey value is not filled
1645 	 * in.  Otherwise both Rkey and LKey are given the same value.
1646 	 */
1647 	if (mpt)
1648 		mr->mr_rkey = mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
1649 
1650 	/*
1651 	 * Determine if the memory is from userland and pin the pages
1652 	 * with umem_lockmemory() if necessary.
1653 	 * Then, if this is userland memory, allocate an entry in the
1654 	 * "userland resources database".  This will later be added to
1655 	 * the database (after all further memory registration operations are
1656 	 * successful).  If we fail here, we must undo the reference counts
1657 	 * and the previous resource allocations.
1658 	 */
1659 	mr_is_umem = (((bind->bi_as != NULL) && (bind->bi_as != &kas)) ? 1 : 0);
1660 	if (mr_is_umem) {
1661 		umem_len   = ptob(btopr(bind->bi_len +
1662 		    ((uintptr_t)bind->bi_addr & PAGEOFFSET)));
1663 		umem_addr  = (caddr_t)((uintptr_t)bind->bi_addr & ~PAGEOFFSET);
1664 		umem_flags = (DDI_UMEMLOCK_WRITE | DDI_UMEMLOCK_READ |
1665 		    DDI_UMEMLOCK_LONGTERM);
1666 		status = umem_lockmemory(umem_addr, umem_len, umem_flags,
1667 		    &umem_cookie, &hermon_umem_cbops, NULL);
1668 		if (status != 0) {
1669 			status = IBT_INSUFF_RESOURCE;
1670 			goto mrcommon_fail3;
1671 		}
1672 
1673 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1674 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1675 
1676 		bind->bi_buf = ddi_umem_iosetup(umem_cookie, 0, umem_len,
1677 		    B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
1678 		if (bind->bi_buf == NULL) {
1679 			status = IBT_INSUFF_RESOURCE;
1680 			goto mrcommon_fail3;
1681 		}
1682 		bind->bi_type = HERMON_BINDHDL_UBUF;
1683 		bind->bi_buf->b_flags |= B_READ;
1684 
1685 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind->bi_buf))
1686 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1687 
1688 		umapdb = hermon_umap_db_alloc(state->hs_instance,
1689 		    (uint64_t)(uintptr_t)umem_cookie, MLNX_UMAP_MRMEM_RSRC,
1690 		    (uint64_t)(uintptr_t)rsrc);
1691 		if (umapdb == NULL) {
1692 			status = IBT_INSUFF_RESOURCE;
1693 			goto mrcommon_fail4;
1694 		}
1695 	}
1696 
1697 	/*
1698 	 * Setup the bindinfo for the mtt bind call
1699 	 */
1700 	bh = &mr->mr_bindinfo;
1701 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bh))
1702 	bcopy(bind, bh, sizeof (hermon_bind_info_t));
1703 	bh->bi_bypass = bind_type;
1704 	status = hermon_mr_mtt_bind(state, bh, bind_dmahdl, &mtt,
1705 	    &mtt_pgsize_bits, mpt != NULL);
1706 	if (status != DDI_SUCCESS) {
1707 		/*
1708 		 * When mtt_bind fails, freerbuf has already been done,
1709 		 * so make sure not to call it again.
1710 		 */
1711 		bind->bi_type = bh->bi_type;
1712 		goto mrcommon_fail5;
1713 	}
1714 	mr->mr_logmttpgsz = mtt_pgsize_bits;
1715 
1716 	/*
1717 	 * Allocate MTT reference count (to track shared memory regions).
1718 	 * This reference count resource may never be used on the given
1719 	 * memory region, but if it is ever later registered as "shared"
1720 	 * memory region then this resource will be necessary.  If we fail
1721 	 * here, we do pretty much the same as above to clean up.
1722 	 */
1723 	status = hermon_rsrc_alloc(state, HERMON_REFCNT, 1, sleep,
1724 	    &mtt_refcnt);
1725 	if (status != DDI_SUCCESS) {
1726 		status = IBT_INSUFF_RESOURCE;
1727 		goto mrcommon_fail6;
1728 	}
1729 	mr->mr_mttrefcntp = mtt_refcnt;
1730 	swrc_tmp = (hermon_sw_refcnt_t *)mtt_refcnt->hr_addr;
1731 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_tmp))
1732 	HERMON_MTT_REFCNT_INIT(swrc_tmp);
1733 
1734 	mtt_addr = (mtt->hr_indx << HERMON_MTT_SIZE_SHIFT);
1735 
1736 	/*
1737 	 * Fill in the MPT entry.  This is the final step before passing
1738 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
1739 	 * the information collected/calculated above to fill in the
1740 	 * requisite portions of the MPT.  Do this ONLY for DMPTs.
1741 	 */
1742 	if (mpt == NULL)
1743 		goto no_passown;
1744 
1745 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
1746 
1747 	mpt_entry.status  = HERMON_MPT_SW_OWNERSHIP;
1748 	mpt_entry.en_bind = (mr->mr_accflag & IBT_MR_WINDOW_BIND)   ? 1 : 0;
1749 	mpt_entry.atomic  = (mr->mr_accflag & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
1750 	mpt_entry.rw	  = (mr->mr_accflag & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
1751 	mpt_entry.rr	  = (mr->mr_accflag & IBT_MR_REMOTE_READ)   ? 1 : 0;
1752 	mpt_entry.lw	  = (mr->mr_accflag & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
1753 	mpt_entry.lr	  = 1;
1754 	mpt_entry.phys_addr = 0;
1755 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
1756 
1757 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
1758 	mpt_entry.mem_key	= mr->mr_lkey;
1759 	mpt_entry.pd		= pd->pd_pdnum;
1760 	mpt_entry.rem_acc_en = 0;
1761 	mpt_entry.fast_reg_en = 0;
1762 	mpt_entry.en_inval = 0;
1763 	mpt_entry.lkey = 0;
1764 	mpt_entry.win_cnt = 0;
1765 
1766 	if (bind_override_addr == 0) {
1767 		mpt_entry.start_addr = bh->bi_addr;
1768 	} else {
1769 		bh->bi_addr = bh->bi_addr & ((1 << mr->mr_logmttpgsz) - 1);
1770 		mpt_entry.start_addr = bh->bi_addr;
1771 	}
1772 	mpt_entry.reg_win_len	= bh->bi_len;
1773 
1774 	mpt_entry.mtt_addr_h = mtt_addr >> 32;  /* only 8 more bits */
1775 	mpt_entry.mtt_addr_l = mtt_addr >> 3;	/* only 29 bits */
1776 
1777 	/*
1778 	 * Write the MPT entry to hardware.  Lastly, we pass ownership of
1779 	 * the entry to the hardware if needed.  Note: in general, this
1780 	 * operation shouldn't fail.  But if it does, we have to undo
1781 	 * everything we've done above before returning error.
1782 	 *
1783 	 * For Hermon, this routine (which is common to the contexts) will only
1784 	 * set the ownership if needed - the process of passing the context
1785 	 * itself to HW will take care of setting up the MPT (based on type
1786 	 * and index).
1787 	 */
1788 
1789 	mpt_entry.bnd_qp = 0;	/* dMPT for a qp, check for window */
1790 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
1791 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, sleep);
1792 	if (status != HERMON_CMD_SUCCESS) {
1793 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
1794 		    status);
1795 		if (status == HERMON_CMD_INVALID_STATUS) {
1796 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1797 		}
1798 		status = ibc_get_ci_failure(0);
1799 		goto mrcommon_fail7;
1800 	}
1801 	if (hermon_rdma_debug & 0x4)
1802 		IBTF_DPRINTF_L2("mr", "  reg: mr %p  key %x",
1803 		    mr, hermon_mr_key_swap(mr->mr_rkey));
1804 no_passown:
1805 
1806 	/*
1807 	 * Fill in the rest of the Hermon Memory Region handle.  Having
1808 	 * successfully transferred ownership of the MPT, we can update the
1809 	 * following fields for use in further operations on the MR.
1810 	 */
1811 	mr->mr_mttaddr	   = mtt_addr;
1812 
1813 	mr->mr_log2_pgsz   = (mr->mr_logmttpgsz - HERMON_PAGESHIFT);
1814 	mr->mr_mptrsrcp	   = mpt;
1815 	mr->mr_mttrsrcp	   = mtt;
1816 	mr->mr_pdhdl	   = pd;
1817 	mr->mr_rsrcp	   = rsrc;
1818 	mr->mr_is_umem	   = mr_is_umem;
1819 	mr->mr_is_fmr	   = 0;
1820 	mr->mr_umemcookie  = (mr_is_umem != 0) ? umem_cookie : NULL;
1821 	mr->mr_umem_cbfunc = NULL;
1822 	mr->mr_umem_cbarg1 = NULL;
1823 	mr->mr_umem_cbarg2 = NULL;
1824 	mr->mr_lkey	   = hermon_mr_key_swap(mr->mr_lkey);
1825 	mr->mr_rkey	   = hermon_mr_key_swap(mr->mr_rkey);
1826 	mr->mr_mpt_type	   = mpt_type;
1827 
1828 	/*
1829 	 * If this is userland memory, then we need to insert the previously
1830 	 * allocated entry into the "userland resources database".  This will
1831 	 * allow for later coordination between the hermon_umap_umemlock_cb()
1832 	 * callback and hermon_mr_deregister().
1833 	 */
1834 	if (mr_is_umem) {
1835 		hermon_umap_db_add(umapdb);
1836 	}
1837 
1838 	*mrhdl = mr;
1839 
1840 	return (DDI_SUCCESS);
1841 
1842 /*
1843  * The following is cleanup for all possible failure cases in this routine
1844  */
1845 mrcommon_fail7:
1846 	hermon_rsrc_free(state, &mtt_refcnt);
1847 mrcommon_fail6:
1848 	hermon_mr_mem_unbind(state, bh);
1849 	bind->bi_type = bh->bi_type;
1850 mrcommon_fail5:
1851 	if (mr_is_umem) {
1852 		hermon_umap_db_free(umapdb);
1853 	}
1854 mrcommon_fail4:
1855 	if (mr_is_umem) {
1856 		/*
1857 		 * Free up the memory ddi_umem_iosetup() allocates
1858 		 * internally.
1859 		 */
1860 		if (bind->bi_type == HERMON_BINDHDL_UBUF) {
1861 			freerbuf(bind->bi_buf);
1862 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
1863 			bind->bi_type = HERMON_BINDHDL_NONE;
1864 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
1865 		}
1866 		ddi_umem_unlock(umem_cookie);
1867 	}
1868 mrcommon_fail3:
1869 	hermon_rsrc_free(state, &rsrc);
1870 mrcommon_fail2:
1871 	if (mpt != NULL)
1872 		hermon_rsrc_free(state, &mpt);
1873 mrcommon_fail1:
1874 	hermon_pd_refcnt_dec(pd);
1875 mrcommon_fail:
1876 	return (status);
1877 }
1878 
1879 /*
1880  * hermon_mr_mtt_bind()
1881  *    Context: Can be called from interrupt or base context.
1882  */
1883 int
1884 hermon_mr_mtt_bind(hermon_state_t *state, hermon_bind_info_t *bind,
1885     ddi_dma_handle_t bind_dmahdl, hermon_rsrc_t **mtt, uint_t *mtt_pgsize_bits,
1886     uint_t is_buffer)
1887 {
1888 	uint64_t		nummtt;
1889 	uint_t			sleep;
1890 	int			status;
1891 
1892 	/*
1893 	 * Check the sleep flag.  Ensure that it is consistent with the
1894 	 * current thread context (i.e. if we are currently in the interrupt
1895 	 * context, then we shouldn't be attempting to sleep).
1896 	 */
1897 	sleep = (bind->bi_flags & IBT_MR_NOSLEEP) ?
1898 	    HERMON_NOSLEEP : HERMON_SLEEP;
1899 	if ((sleep == HERMON_SLEEP) &&
1900 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
1901 		status = IBT_INVALID_PARAM;
1902 		goto mrmttbind_fail;
1903 	}
1904 
1905 	/*
1906 	 * Bind the memory and determine the mapped addresses.  This is
1907 	 * the first of two routines that do all the "heavy lifting" for
1908 	 * the Hermon memory registration routines.  The hermon_mr_mem_bind()
1909 	 * routine takes the "bind" struct with all its fields filled
1910 	 * in and returns a list of DMA cookies (for the PCI mapped addresses
1911 	 * corresponding to the specified address region) which are used by
1912 	 * the hermon_mr_fast_mtt_write() routine below.  If we fail here, we
1913 	 * must undo all the previous resource allocation (and PD reference
1914 	 * count).
1915 	 */
1916 	status = hermon_mr_mem_bind(state, bind, bind_dmahdl, sleep, is_buffer);
1917 	if (status != DDI_SUCCESS) {
1918 		status = IBT_INSUFF_RESOURCE;
1919 		goto mrmttbind_fail;
1920 	}
1921 
1922 	/*
1923 	 * Determine number of pages spanned.  This routine uses the
1924 	 * information in the "bind" struct to determine the required
1925 	 * number of MTT entries needed (and returns the suggested page size -
1926 	 * as a "power-of-2" - for each MTT entry).
1927 	 */
1928 	nummtt = hermon_mr_nummtt_needed(state, bind, mtt_pgsize_bits);
1929 
1930 	/*
1931 	 * Allocate the MTT entries.  Use the calculations performed above to
1932 	 * allocate the required number of MTT entries. If we fail here, we
1933 	 * must not only undo all the previous resource allocation (and PD
1934 	 * reference count), but we must also unbind the memory.
1935 	 */
1936 	status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt, sleep, mtt);
1937 	if (status != DDI_SUCCESS) {
1938 		status = IBT_INSUFF_RESOURCE;
1939 		goto mrmttbind_fail2;
1940 	}
1941 
1942 	/*
1943 	 * Write the mapped addresses into the MTT entries.  This is part two
1944 	 * of the "heavy lifting" routines that we talked about above.  Note:
1945 	 * we pass the suggested page size from the earlier operation here.
1946 	 * And if we fail here, we again do pretty much the same huge clean up.
1947 	 */
1948 	status = hermon_mr_fast_mtt_write(state, *mtt, bind, *mtt_pgsize_bits);
1949 	if (status != DDI_SUCCESS) {
1950 		/*
1951 		 * hermon_mr_fast_mtt_write() returns DDI_FAILURE
1952 		 * only if it detects a HW error during DMA.
1953 		 */
1954 		hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1955 		status = ibc_get_ci_failure(0);
1956 		goto mrmttbind_fail3;
1957 	}
1958 	return (DDI_SUCCESS);
1959 
1960 /*
1961  * The following is cleanup for all possible failure cases in this routine
1962  */
1963 mrmttbind_fail3:
1964 	hermon_rsrc_free(state, mtt);
1965 mrmttbind_fail2:
1966 	hermon_mr_mem_unbind(state, bind);
1967 mrmttbind_fail:
1968 	return (status);
1969 }
1970 
1971 
1972 /*
1973  * hermon_mr_mtt_unbind()
1974  *    Context: Can be called from interrupt or base context.
1975  */
1976 int
1977 hermon_mr_mtt_unbind(hermon_state_t *state, hermon_bind_info_t *bind,
1978     hermon_rsrc_t *mtt)
1979 {
1980 	/*
1981 	 * Free up the MTT entries and unbind the memory.  Here, as above, we
1982 	 * attempt to free these resources only if it is appropriate to do so.
1983 	 */
1984 	hermon_mr_mem_unbind(state, bind);
1985 	hermon_rsrc_free(state, &mtt);
1986 
1987 	return (DDI_SUCCESS);
1988 }
1989 
1990 
1991 /*
1992  * hermon_mr_common_rereg()
1993  *    Context: Can be called from interrupt or base context.
1994  */
1995 static int
1996 hermon_mr_common_rereg(hermon_state_t *state, hermon_mrhdl_t mr,
1997     hermon_pdhdl_t pd, hermon_bind_info_t *bind, hermon_mrhdl_t *mrhdl_new,
1998     hermon_mr_options_t *op)
1999 {
2000 	hermon_rsrc_t		*mpt;
2001 	ibt_mr_attr_flags_t	acc_flags_to_use;
2002 	ibt_mr_flags_t		flags;
2003 	hermon_pdhdl_t		pd_to_use;
2004 	hermon_hw_dmpt_t	mpt_entry;
2005 	uint64_t		mtt_addr_to_use, vaddr_to_use, len_to_use;
2006 	uint_t			sleep, dereg_level;
2007 	int			status;
2008 
2009 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2010 
2011 	/*
2012 	 * Check here to see if the memory region corresponds to a userland
2013 	 * mapping.  Reregistration of userland memory regions is not
2014 	 * currently supported.  Return failure.
2015 	 */
2016 	if (mr->mr_is_umem) {
2017 		status = IBT_MR_HDL_INVALID;
2018 		goto mrrereg_fail;
2019 	}
2020 
2021 	mutex_enter(&mr->mr_lock);
2022 
2023 	/* Pull MPT resource pointer from the Hermon Memory Region handle */
2024 	mpt = mr->mr_mptrsrcp;
2025 
2026 	/* Extract the flags field from the hermon_bind_info_t */
2027 	flags = bind->bi_flags;
2028 
2029 	/*
2030 	 * Check the sleep flag.  Ensure that it is consistent with the
2031 	 * current thread context (i.e. if we are currently in the interrupt
2032 	 * context, then we shouldn't be attempting to sleep).
2033 	 */
2034 	sleep = (flags & IBT_MR_NOSLEEP) ? HERMON_NOSLEEP: HERMON_SLEEP;
2035 	if ((sleep == HERMON_SLEEP) &&
2036 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
2037 		mutex_exit(&mr->mr_lock);
2038 		status = IBT_INVALID_PARAM;
2039 		goto mrrereg_fail;
2040 	}
2041 
2042 	/*
2043 	 * First step is to temporarily invalidate the MPT entry.  This
2044 	 * regains ownership from the hardware, and gives us the opportunity
2045 	 * to modify the entry.  Note: The HW2SW_MPT command returns the
2046 	 * current MPT entry contents.  These are saved away here because
2047 	 * they will be reused in a later step below.  If the region has
2048 	 * bound memory windows that we fail returning an "in use" error code.
2049 	 * Otherwise, this is an unexpected error and we deregister the
2050 	 * memory region and return error.
2051 	 *
2052 	 * We use HERMON_CMD_NOSLEEP_SPIN here always because we must protect
2053 	 * against holding the lock around this rereg call in all contexts.
2054 	 */
2055 	status = hermon_cmn_ownership_cmd_post(state, HW2SW_MPT, &mpt_entry,
2056 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, HERMON_CMD_NOSLEEP_SPIN);
2057 	if (status != HERMON_CMD_SUCCESS) {
2058 		mutex_exit(&mr->mr_lock);
2059 		if (status == HERMON_CMD_REG_BOUND) {
2060 			return (IBT_MR_IN_USE);
2061 		} else {
2062 			cmn_err(CE_CONT, "Hermon: HW2SW_MPT command failed: "
2063 			    "%08x\n", status);
2064 			if (status == HERMON_CMD_INVALID_STATUS) {
2065 				hermon_fm_ereport(state, HCA_SYS_ERR,
2066 				    HCA_ERR_SRV_LOST);
2067 			}
2068 			/*
2069 			 * Call deregister and ensure that all current
2070 			 * resources get freed up
2071 			 */
2072 			if (hermon_mr_deregister(state, &mr,
2073 			    HERMON_MR_DEREG_ALL, sleep) != DDI_SUCCESS) {
2074 				HERMON_WARNING(state, "failed to deregister "
2075 				    "memory region");
2076 			}
2077 			return (ibc_get_ci_failure(0));
2078 		}
2079 	}
2080 
2081 	/*
2082 	 * If we're changing the protection domain, then validate the new one
2083 	 */
2084 	if (flags & IBT_MR_CHANGE_PD) {
2085 
2086 		/* Check for valid PD handle pointer */
2087 		if (pd == NULL) {
2088 			mutex_exit(&mr->mr_lock);
2089 			/*
2090 			 * Call deregister and ensure that all current
2091 			 * resources get properly freed up. Unnecessary
2092 			 * here to attempt to regain software ownership
2093 			 * of the MPT entry as that has already been
2094 			 * done above.
2095 			 */
2096 			if (hermon_mr_deregister(state, &mr,
2097 			    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2098 			    DDI_SUCCESS) {
2099 				HERMON_WARNING(state, "failed to deregister "
2100 				    "memory region");
2101 			}
2102 			status = IBT_PD_HDL_INVALID;
2103 			goto mrrereg_fail;
2104 		}
2105 
2106 		/* Use the new PD handle in all operations below */
2107 		pd_to_use = pd;
2108 
2109 	} else {
2110 		/* Use the current PD handle in all operations below */
2111 		pd_to_use = mr->mr_pdhdl;
2112 	}
2113 
2114 	/*
2115 	 * If we're changing access permissions, then validate the new ones
2116 	 */
2117 	if (flags & IBT_MR_CHANGE_ACCESS) {
2118 		/*
2119 		 * Validate the access flags.  Both remote write and remote
2120 		 * atomic require the local write flag to be set
2121 		 */
2122 		if (((flags & IBT_MR_ENABLE_REMOTE_WRITE) ||
2123 		    (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)) &&
2124 		    !(flags & IBT_MR_ENABLE_LOCAL_WRITE)) {
2125 			mutex_exit(&mr->mr_lock);
2126 			/*
2127 			 * Call deregister and ensure that all current
2128 			 * resources get properly freed up. Unnecessary
2129 			 * here to attempt to regain software ownership
2130 			 * of the MPT entry as that has already been
2131 			 * done above.
2132 			 */
2133 			if (hermon_mr_deregister(state, &mr,
2134 			    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) !=
2135 			    DDI_SUCCESS) {
2136 				HERMON_WARNING(state, "failed to deregister "
2137 				    "memory region");
2138 			}
2139 			status = IBT_MR_ACCESS_REQ_INVALID;
2140 			goto mrrereg_fail;
2141 		}
2142 
2143 		/*
2144 		 * Setup and validate the memory region access flags.  This
2145 		 * means translating the IBTF's enable flags into the access
2146 		 * flags that will be used in later operations.
2147 		 */
2148 		acc_flags_to_use = 0;
2149 		if (flags & IBT_MR_ENABLE_WINDOW_BIND)
2150 			acc_flags_to_use |= IBT_MR_WINDOW_BIND;
2151 		if (flags & IBT_MR_ENABLE_LOCAL_WRITE)
2152 			acc_flags_to_use |= IBT_MR_LOCAL_WRITE;
2153 		if (flags & IBT_MR_ENABLE_REMOTE_READ)
2154 			acc_flags_to_use |= IBT_MR_REMOTE_READ;
2155 		if (flags & IBT_MR_ENABLE_REMOTE_WRITE)
2156 			acc_flags_to_use |= IBT_MR_REMOTE_WRITE;
2157 		if (flags & IBT_MR_ENABLE_REMOTE_ATOMIC)
2158 			acc_flags_to_use |= IBT_MR_REMOTE_ATOMIC;
2159 
2160 	} else {
2161 		acc_flags_to_use = mr->mr_accflag;
2162 	}
2163 
2164 	/*
2165 	 * If we're modifying the translation, then figure out whether
2166 	 * we can reuse the current MTT resources.  This means calling
2167 	 * hermon_mr_rereg_xlat_helper() which does most of the heavy lifting
2168 	 * for the reregistration.  If the current memory region contains
2169 	 * sufficient MTT entries for the new regions, then it will be
2170 	 * reused and filled in.  Otherwise, new entries will be allocated,
2171 	 * the old ones will be freed, and the new entries will be filled
2172 	 * in.  Note:  If we're not modifying the translation, then we
2173 	 * should already have all the information we need to update the MPT.
2174 	 * Also note: If hermon_mr_rereg_xlat_helper() fails, it will return
2175 	 * a "dereg_level" which is the level of cleanup that needs to be
2176 	 * passed to hermon_mr_deregister() to finish the cleanup.
2177 	 */
2178 	if (flags & IBT_MR_CHANGE_TRANSLATION) {
2179 		status = hermon_mr_rereg_xlat_helper(state, mr, bind, op,
2180 		    &mtt_addr_to_use, sleep, &dereg_level);
2181 		if (status != DDI_SUCCESS) {
2182 			mutex_exit(&mr->mr_lock);
2183 			/*
2184 			 * Call deregister and ensure that all resources get
2185 			 * properly freed up.
2186 			 */
2187 			if (hermon_mr_deregister(state, &mr, dereg_level,
2188 			    sleep) != DDI_SUCCESS) {
2189 				HERMON_WARNING(state, "failed to deregister "
2190 				    "memory region");
2191 			}
2192 			goto mrrereg_fail;
2193 		}
2194 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2195 		len_to_use   = mr->mr_bindinfo.bi_len;
2196 	} else {
2197 		mtt_addr_to_use = mr->mr_mttaddr;
2198 		vaddr_to_use = mr->mr_bindinfo.bi_addr;
2199 		len_to_use   = mr->mr_bindinfo.bi_len;
2200 	}
2201 
2202 	/*
2203 	 * Calculate new keys (Lkey, Rkey) from MPT index.  Just like they were
2204 	 * when the region was first registered, each key is formed from
2205 	 * "constrained" bits and "unconstrained" bits.  Note:  If no remote
2206 	 * access is required, then the RKey value is not filled in.  Otherwise
2207 	 * both Rkey and LKey are given the same value.
2208 	 */
2209 	mr->mr_lkey = hermon_mr_keycalc(mpt->hr_indx);
2210 	if ((acc_flags_to_use & IBT_MR_REMOTE_READ) ||
2211 	    (acc_flags_to_use & IBT_MR_REMOTE_WRITE) ||
2212 	    (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC)) {
2213 		mr->mr_rkey = mr->mr_lkey;
2214 	} else
2215 		mr->mr_rkey = 0;
2216 
2217 	/*
2218 	 * Fill in the MPT entry.  This is the final step before passing
2219 	 * ownership of the MPT entry to the Hermon hardware.  We use all of
2220 	 * the information collected/calculated above to fill in the
2221 	 * requisite portions of the MPT.
2222 	 */
2223 	bzero(&mpt_entry, sizeof (hermon_hw_dmpt_t));
2224 
2225 	mpt_entry.status  = HERMON_MPT_SW_OWNERSHIP;
2226 	mpt_entry.en_bind = (acc_flags_to_use & IBT_MR_WINDOW_BIND)   ? 1 : 0;
2227 	mpt_entry.atomic  = (acc_flags_to_use & IBT_MR_REMOTE_ATOMIC) ? 1 : 0;
2228 	mpt_entry.rw	  = (acc_flags_to_use & IBT_MR_REMOTE_WRITE)  ? 1 : 0;
2229 	mpt_entry.rr	  = (acc_flags_to_use & IBT_MR_REMOTE_READ)   ? 1 : 0;
2230 	mpt_entry.lw	  = (acc_flags_to_use & IBT_MR_LOCAL_WRITE)   ? 1 : 0;
2231 	mpt_entry.lr	  = 1;
2232 	mpt_entry.phys_addr = 0;
2233 	mpt_entry.reg_win = HERMON_MPT_IS_REGION;
2234 
2235 	mpt_entry.entity_sz	= mr->mr_logmttpgsz;
2236 	mpt_entry.mem_key	= mr->mr_lkey;
2237 	mpt_entry.pd		= pd_to_use->pd_pdnum;
2238 
2239 	mpt_entry.start_addr	= vaddr_to_use;
2240 	mpt_entry.reg_win_len	= len_to_use;
2241 	mpt_entry.mtt_addr_h = mtt_addr_to_use >> 32;
2242 	mpt_entry.mtt_addr_l = mtt_addr_to_use >> 3;
2243 
2244 	/*
2245 	 * Write the updated MPT entry to hardware
2246 	 *
2247 	 * We use HERMON_CMD_NOSLEEP_SPIN here always because we must protect
2248 	 * against holding the lock around this rereg call in all contexts.
2249 	 */
2250 	status = hermon_cmn_ownership_cmd_post(state, SW2HW_MPT, &mpt_entry,
2251 	    sizeof (hermon_hw_dmpt_t), mpt->hr_indx, HERMON_CMD_NOSLEEP_SPIN);
2252 	if (status != HERMON_CMD_SUCCESS) {
2253 		mutex_exit(&mr->mr_lock);
2254 		cmn_err(CE_CONT, "Hermon: SW2HW_MPT command failed: %08x\n",
2255 		    status);
2256 		if (status == HERMON_CMD_INVALID_STATUS) {
2257 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2258 		}
2259 		/*
2260 		 * Call deregister and ensure that all current resources get
2261 		 * properly freed up. Unnecessary here to attempt to regain
2262 		 * software ownership of the MPT entry as that has already
2263 		 * been done above.
2264 		 */
2265 		if (hermon_mr_deregister(state, &mr,
2266 		    HERMON_MR_DEREG_NO_HW2SW_MPT, sleep) != DDI_SUCCESS) {
2267 			HERMON_WARNING(state, "failed to deregister memory "
2268 			    "region");
2269 		}
2270 		return (ibc_get_ci_failure(0));
2271 	}
2272 
2273 	/*
2274 	 * If we're changing PD, then update their reference counts now.
2275 	 * This means decrementing the reference count on the old PD and
2276 	 * incrementing the reference count on the new PD.
2277 	 */
2278 	if (flags & IBT_MR_CHANGE_PD) {
2279 		hermon_pd_refcnt_dec(mr->mr_pdhdl);
2280 		hermon_pd_refcnt_inc(pd);
2281 	}
2282 
2283 	/*
2284 	 * Update the contents of the Hermon Memory Region handle to reflect
2285 	 * what has been changed.
2286 	 */
2287 	mr->mr_pdhdl	  = pd_to_use;
2288 	mr->mr_accflag	  = acc_flags_to_use;
2289 	mr->mr_is_umem	  = 0;
2290 	mr->mr_is_fmr	  = 0;
2291 	mr->mr_umemcookie = NULL;
2292 	mr->mr_lkey	  = hermon_mr_key_swap(mr->mr_lkey);
2293 	mr->mr_rkey	  = hermon_mr_key_swap(mr->mr_rkey);
2294 
2295 	/* New MR handle is same as the old */
2296 	*mrhdl_new = mr;
2297 	mutex_exit(&mr->mr_lock);
2298 
2299 	return (DDI_SUCCESS);
2300 
2301 mrrereg_fail:
2302 	return (status);
2303 }
2304 
2305 
2306 /*
2307  * hermon_mr_rereg_xlat_helper
2308  *    Context: Can be called from interrupt or base context.
2309  *    Note: This routine expects the "mr_lock" to be held when it
2310  *    is called.  Upon returning failure, this routine passes information
2311  *    about what "dereg_level" should be passed to hermon_mr_deregister().
2312  */
2313 static int
2314 hermon_mr_rereg_xlat_helper(hermon_state_t *state, hermon_mrhdl_t mr,
2315     hermon_bind_info_t *bind, hermon_mr_options_t *op, uint64_t *mtt_addr,
2316     uint_t sleep, uint_t *dereg_level)
2317 {
2318 	hermon_rsrc_t		*mtt, *mtt_refcnt;
2319 	hermon_sw_refcnt_t	*swrc_old, *swrc_new;
2320 	ddi_dma_handle_t	dmahdl;
2321 	uint64_t		nummtt_needed, nummtt_in_currrsrc, max_sz;
2322 	uint_t			mtt_pgsize_bits, bind_type, reuse_dmahdl;
2323 	int			status;
2324 
2325 	ASSERT(MUTEX_HELD(&mr->mr_lock));
2326 
2327 	/*
2328 	 * Check the "options" flag.  Currently this flag tells the driver
2329 	 * whether or not the region should be bound normally (i.e. with
2330 	 * entries written into the PCI IOMMU) or whether it should be
2331 	 * registered to bypass the IOMMU.
2332 	 */
2333 	if (op == NULL) {
2334 		bind_type = HERMON_BINDMEM_NORMAL;
2335 	} else {
2336 		bind_type = op->mro_bind_type;
2337 	}
2338 
2339 	/*
2340 	 * Check for invalid length.  Check is the length is zero or if the
2341 	 * length is larger than the maximum configured value.  Return error
2342 	 * if it is.
2343 	 */
2344 	max_sz = ((uint64_t)1 << state->hs_cfg_profile->cp_log_max_mrw_sz);
2345 	if ((bind->bi_len == 0) || (bind->bi_len > max_sz)) {
2346 		/*
2347 		 * Deregister will be called upon returning failure from this
2348 		 * routine. This will ensure that all current resources get
2349 		 * properly freed up. Unnecessary to attempt to regain
2350 		 * software ownership of the MPT entry as that has already
2351 		 * been done above (in hermon_mr_reregister())
2352 		 */
2353 		*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT;
2354 
2355 		status = IBT_MR_LEN_INVALID;
2356 		goto mrrereghelp_fail;
2357 	}
2358 
2359 	/*
2360 	 * Determine the number of pages necessary for new region and the
2361 	 * number of pages supported by the current MTT resources
2362 	 */
2363 	nummtt_needed = hermon_mr_nummtt_needed(state, bind, &mtt_pgsize_bits);
2364 	nummtt_in_currrsrc = mr->mr_mttrsrcp->hr_len >> HERMON_MTT_SIZE_SHIFT;
2365 
2366 	/*
2367 	 * Depending on whether we have enough pages or not, the next step is
2368 	 * to fill in a set of MTT entries that reflect the new mapping.  In
2369 	 * the first case below, we already have enough entries.  This means
2370 	 * we need to unbind the memory from the previous mapping, bind the
2371 	 * memory for the new mapping, write the new MTT entries, and update
2372 	 * the mr to reflect the changes.
2373 	 * In the second case below, we do not have enough entries in the
2374 	 * current mapping.  So, in this case, we need not only to unbind the
2375 	 * current mapping, but we need to free up the MTT resources associated
2376 	 * with that mapping.  After we've successfully done that, we continue
2377 	 * by binding the new memory, allocating new MTT entries, writing the
2378 	 * new MTT entries, and updating the mr to reflect the changes.
2379 	 */
2380 
2381 	/*
2382 	 * If this region is being shared (i.e. MTT refcount != 1), then we
2383 	 * can't reuse the current MTT resources regardless of their size.
2384 	 * Instead we'll need to alloc new ones (below) just as if there
2385 	 * hadn't been enough room in the current entries.
2386 	 */
2387 	swrc_old = (hermon_sw_refcnt_t *)mr->mr_mttrefcntp->hr_addr;
2388 	if (HERMON_MTT_IS_NOT_SHARED(swrc_old) &&
2389 	    (nummtt_needed <= nummtt_in_currrsrc)) {
2390 
2391 		/*
2392 		 * Unbind the old mapping for this memory region, but retain
2393 		 * the ddi_dma_handle_t (if possible) for reuse in the bind
2394 		 * operation below.  Note:  If original memory region was
2395 		 * bound for IOMMU bypass and the new region can not use
2396 		 * bypass, then a new DMA handle will be necessary.
2397 		 */
2398 		if (HERMON_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2399 			mr->mr_bindinfo.bi_free_dmahdl = 0;
2400 			hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2401 			dmahdl = mr->mr_bindinfo.bi_dmahdl;
2402 			reuse_dmahdl = 1;
2403 		} else {
2404 			hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2405 			dmahdl = NULL;
2406 			reuse_dmahdl = 0;
2407 		}
2408 
2409 		/*
2410 		 * Bind the new memory and determine the mapped addresses.
2411 		 * As described, this routine and hermon_mr_fast_mtt_write()
2412 		 * do the majority of the work for the memory registration
2413 		 * operations.  Note:  When we successfully finish the binding,
2414 		 * we will set the "bi_free_dmahdl" flag to indicate that
2415 		 * even though we may have reused the ddi_dma_handle_t we do
2416 		 * wish it to be freed up at some later time.  Note also that
2417 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2418 		 */
2419 		bind->bi_bypass	= bind_type;
2420 		status = hermon_mr_mem_bind(state, bind, dmahdl, sleep, 1);
2421 		if (status != DDI_SUCCESS) {
2422 			if (reuse_dmahdl) {
2423 				ddi_dma_free_handle(&dmahdl);
2424 			}
2425 
2426 			/*
2427 			 * Deregister will be called upon returning failure
2428 			 * from this routine. This will ensure that all
2429 			 * current resources get properly freed up.
2430 			 * Unnecessary to attempt to regain software ownership
2431 			 * of the MPT entry as that has already been done
2432 			 * above (in hermon_mr_reregister()).  Also unnecessary
2433 			 * to attempt to unbind the memory.
2434 			 */
2435 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2436 
2437 			status = IBT_INSUFF_RESOURCE;
2438 			goto mrrereghelp_fail;
2439 		}
2440 		if (reuse_dmahdl) {
2441 			bind->bi_free_dmahdl = 1;
2442 		}
2443 
2444 		/*
2445 		 * Using the new mapping, but reusing the current MTT
2446 		 * resources, write the updated entries to MTT
2447 		 */
2448 		mtt    = mr->mr_mttrsrcp;
2449 		status = hermon_mr_fast_mtt_write(state, mtt, bind,
2450 		    mtt_pgsize_bits);
2451 		if (status != DDI_SUCCESS) {
2452 			/*
2453 			 * Deregister will be called upon returning failure
2454 			 * from this routine. This will ensure that all
2455 			 * current resources get properly freed up.
2456 			 * Unnecessary to attempt to regain software ownership
2457 			 * of the MPT entry as that has already been done
2458 			 * above (in hermon_mr_reregister()).  Also unnecessary
2459 			 * to attempt to unbind the memory.
2460 			 *
2461 			 * But we do need to unbind the newly bound memory
2462 			 * before returning.
2463 			 */
2464 			hermon_mr_mem_unbind(state, bind);
2465 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2466 
2467 			/*
2468 			 * hermon_mr_fast_mtt_write() returns DDI_FAILURE
2469 			 * only if it detects a HW error during DMA.
2470 			 */
2471 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2472 			status = ibc_get_ci_failure(0);
2473 			goto mrrereghelp_fail;
2474 		}
2475 
2476 		/* Put the updated information into the Mem Region handle */
2477 		mr->mr_bindinfo	  = *bind;
2478 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2479 
2480 	} else {
2481 		/*
2482 		 * Check if the memory region MTT is shared by any other MRs.
2483 		 * Since the resource may be shared between multiple memory
2484 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2485 		 * important that we not unbind any resources prematurely.
2486 		 */
2487 		if (!HERMON_MTT_IS_SHARED(swrc_old)) {
2488 			/*
2489 			 * Unbind the old mapping for this memory region, but
2490 			 * retain the ddi_dma_handle_t for reuse in the bind
2491 			 * operation below. Note: This can only be done here
2492 			 * because the region being reregistered is not
2493 			 * currently shared.  Also if original memory region
2494 			 * was bound for IOMMU bypass and the new region can
2495 			 * not use bypass, then a new DMA handle will be
2496 			 * necessary.
2497 			 */
2498 			if (HERMON_MR_REUSE_DMAHDL(mr, bind->bi_flags)) {
2499 				mr->mr_bindinfo.bi_free_dmahdl = 0;
2500 				hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2501 				dmahdl = mr->mr_bindinfo.bi_dmahdl;
2502 				reuse_dmahdl = 1;
2503 			} else {
2504 				hermon_mr_mem_unbind(state, &mr->mr_bindinfo);
2505 				dmahdl = NULL;
2506 				reuse_dmahdl = 0;
2507 			}
2508 		} else {
2509 			dmahdl = NULL;
2510 			reuse_dmahdl = 0;
2511 		}
2512 
2513 		/*
2514 		 * Bind the new memory and determine the mapped addresses.
2515 		 * As described, this routine and hermon_mr_fast_mtt_write()
2516 		 * do the majority of the work for the memory registration
2517 		 * operations.  Note:  When we successfully finish the binding,
2518 		 * we will set the "bi_free_dmahdl" flag to indicate that
2519 		 * even though we may have reused the ddi_dma_handle_t we do
2520 		 * wish it to be freed up at some later time.  Note also that
2521 		 * if we fail, we may need to cleanup the ddi_dma_handle_t.
2522 		 */
2523 		bind->bi_bypass	= bind_type;
2524 		status = hermon_mr_mem_bind(state, bind, dmahdl, sleep, 1);
2525 		if (status != DDI_SUCCESS) {
2526 			if (reuse_dmahdl) {
2527 				ddi_dma_free_handle(&dmahdl);
2528 			}
2529 
2530 			/*
2531 			 * Deregister will be called upon returning failure
2532 			 * from this routine. This will ensure that all
2533 			 * current resources get properly freed up.
2534 			 * Unnecessary to attempt to regain software ownership
2535 			 * of the MPT entry as that has already been done
2536 			 * above (in hermon_mr_reregister()).  Also unnecessary
2537 			 * to attempt to unbind the memory.
2538 			 */
2539 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2540 
2541 			status = IBT_INSUFF_RESOURCE;
2542 			goto mrrereghelp_fail;
2543 		}
2544 		if (reuse_dmahdl) {
2545 			bind->bi_free_dmahdl = 1;
2546 		}
2547 
2548 		/*
2549 		 * Allocate the new MTT entries resource
2550 		 */
2551 		status = hermon_rsrc_alloc(state, HERMON_MTT, nummtt_needed,
2552 		    sleep, &mtt);
2553 		if (status != DDI_SUCCESS) {
2554 			/*
2555 			 * Deregister will be called upon returning failure
2556 			 * from this routine. This will ensure that all
2557 			 * current resources get properly freed up.
2558 			 * Unnecessary to attempt to regain software ownership
2559 			 * of the MPT entry as that has already been done
2560 			 * above (in hermon_mr_reregister()).  Also unnecessary
2561 			 * to attempt to unbind the memory.
2562 			 *
2563 			 * But we do need to unbind the newly bound memory
2564 			 * before returning.
2565 			 */
2566 			hermon_mr_mem_unbind(state, bind);
2567 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2568 
2569 			status = IBT_INSUFF_RESOURCE;
2570 			goto mrrereghelp_fail;
2571 		}
2572 
2573 		/*
2574 		 * Allocate MTT reference count (to track shared memory
2575 		 * regions).  As mentioned elsewhere above, this reference
2576 		 * count resource may never be used on the given memory region,
2577 		 * but if it is ever later registered as a "shared" memory
2578 		 * region then this resource will be necessary.  Note:  This
2579 		 * is only necessary here if the existing memory region is
2580 		 * already being shared (because otherwise we already have
2581 		 * a useable reference count resource).
2582 		 */
2583 		if (HERMON_MTT_IS_SHARED(swrc_old)) {
2584 			status = hermon_rsrc_alloc(state, HERMON_REFCNT, 1,
2585 			    sleep, &mtt_refcnt);
2586 			if (status != DDI_SUCCESS) {
2587 				/*
2588 				 * Deregister will be called upon returning
2589 				 * failure from this routine. This will ensure
2590 				 * that all current resources get properly
2591 				 * freed up.  Unnecessary to attempt to regain
2592 				 * software ownership of the MPT entry as that
2593 				 * has already been done above (in
2594 				 * hermon_mr_reregister()).  Also unnecessary
2595 				 * to attempt to unbind the memory.
2596 				 *
2597 				 * But we need to unbind the newly bound
2598 				 * memory and free up the newly allocated MTT
2599 				 * entries before returning.
2600 				 */
2601 				hermon_mr_mem_unbind(state, bind);
2602 				hermon_rsrc_free(state, &mtt);
2603 				*dereg_level =
2604 				    HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2605 
2606 				status = IBT_INSUFF_RESOURCE;
2607 				goto mrrereghelp_fail;
2608 			}
2609 			swrc_new = (hermon_sw_refcnt_t *)mtt_refcnt->hr_addr;
2610 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*swrc_new))
2611 			HERMON_MTT_REFCNT_INIT(swrc_new);
2612 		} else {
2613 			mtt_refcnt = mr->mr_mttrefcntp;
2614 		}
2615 
2616 		/*
2617 		 * Using the new mapping and the new MTT resources, write the
2618 		 * updated entries to MTT
2619 		 */
2620 		status = hermon_mr_fast_mtt_write(state, mtt, bind,
2621 		    mtt_pgsize_bits);
2622 		if (status != DDI_SUCCESS) {
2623 			/*
2624 			 * Deregister will be called upon returning failure
2625 			 * from this routine. This will ensure that all
2626 			 * current resources get properly freed up.
2627 			 * Unnecessary to attempt to regain software ownership
2628 			 * of the MPT entry as that has already been done
2629 			 * above (in hermon_mr_reregister()).  Also unnecessary
2630 			 * to attempt to unbind the memory.
2631 			 *
2632 			 * But we need to unbind the newly bound memory,
2633 			 * free up the newly allocated MTT entries, and
2634 			 * (possibly) free the new MTT reference count
2635 			 * resource before returning.
2636 			 */
2637 			if (HERMON_MTT_IS_SHARED(swrc_old)) {
2638 				hermon_rsrc_free(state, &mtt_refcnt);
2639 			}
2640 			hermon_mr_mem_unbind(state, bind);
2641 			hermon_rsrc_free(state, &mtt);
2642 			*dereg_level = HERMON_MR_DEREG_NO_HW2SW_MPT_OR_UNBIND;
2643 
2644 			status = IBT_INSUFF_RESOURCE;
2645 			goto mrrereghelp_fail;
2646 		}
2647 
2648 		/*
2649 		 * Check if the memory region MTT is shared by any other MRs.
2650 		 * Since the resource may be shared between multiple memory
2651 		 * regions (as a result of a "RegisterSharedMR()" verb) it is
2652 		 * important that we not free up any resources prematurely.
2653 		 */
2654 		if (HERMON_MTT_IS_SHARED(swrc_old)) {
2655 			/* Decrement MTT reference count for "old" region */
2656 			(void) hermon_mtt_refcnt_dec(mr->mr_mttrefcntp);
2657 		} else {
2658 			/* Free up the old MTT entries resource */
2659 			hermon_rsrc_free(state, &mr->mr_mttrsrcp);
2660 		}
2661 
2662 		/* Put the updated information into the mrhdl */
2663 		mr->mr_bindinfo	  = *bind;
2664 		mr->mr_logmttpgsz = mtt_pgsize_bits;
2665 		mr->mr_mttrsrcp   = mtt;
2666 		mr->mr_mttrefcntp = mtt_refcnt;
2667 	}
2668 
2669 	/*
2670 	 * Calculate and return the updated MTT address (in the DDR address
2671 	 * space).  This will be used by the caller (hermon_mr_reregister) in
2672 	 * the updated MPT entry
2673 	 */
2674 	*mtt_addr = mtt->hr_indx << HERMON_MTT_SIZE_SHIFT;
2675 
2676 	return (DDI_SUCCESS);
2677 
2678 mrrereghelp_fail:
2679 	return (status);
2680 }
2681 
2682 
2683 /*
2684  * hermon_mr_nummtt_needed()
2685  *    Context: Can be called from interrupt or base context.
2686  */
2687 /* ARGSUSED */
2688 static uint64_t
2689 hermon_mr_nummtt_needed(hermon_state_t *state, hermon_bind_info_t *bind,
2690     uint_t *mtt_pgsize_bits)
2691 {
2692 	uint64_t	pg_offset_mask;
2693 	uint64_t	pg_offset, tmp_length;
2694 
2695 	/*
2696 	 * For now we specify the page size as 8Kb (the default page size for
2697 	 * the sun4u architecture), or 4Kb for x86.  Figure out optimal page
2698 	 * size by examining the dmacookies
2699 	 */
2700 	*mtt_pgsize_bits = PAGESHIFT;
2701 
2702 	pg_offset_mask = ((uint64_t)1 << *mtt_pgsize_bits) - 1;
2703 	pg_offset = bind->bi_addr & pg_offset_mask;
2704 	tmp_length = pg_offset + (bind->bi_len - 1);
2705 	return ((tmp_length >> *mtt_pgsize_bits) + 1);
2706 }
2707 
2708 
2709 /*
2710  * hermon_mr_mem_bind()
2711  *    Context: Can be called from interrupt or base context.
2712  */
2713 static int
2714 hermon_mr_mem_bind(hermon_state_t *state, hermon_bind_info_t *bind,
2715     ddi_dma_handle_t dmahdl, uint_t sleep, uint_t is_buffer)
2716 {
2717 	ddi_dma_attr_t	dma_attr;
2718 	int		(*callback)(caddr_t);
2719 	int		status;
2720 
2721 	/* bi_type must be set to a meaningful value to get a bind handle */
2722 	ASSERT(bind->bi_type == HERMON_BINDHDL_VADDR ||
2723 	    bind->bi_type == HERMON_BINDHDL_BUF ||
2724 	    bind->bi_type == HERMON_BINDHDL_UBUF);
2725 
2726 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2727 
2728 	/* Set the callback flag appropriately */
2729 	callback = (sleep == HERMON_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT;
2730 
2731 	/*
2732 	 * Initialize many of the default DMA attributes.  Then, if we're
2733 	 * bypassing the IOMMU, set the DDI_DMA_FORCE_PHYSICAL flag.
2734 	 */
2735 	if (dmahdl == NULL) {
2736 		hermon_dma_attr_init(state, &dma_attr);
2737 #ifdef	__sparc
2738 		if (bind->bi_bypass == HERMON_BINDMEM_BYPASS) {
2739 			dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2740 		}
2741 #endif
2742 
2743 		/* set RO if needed - tunable set and 'is_buffer' is non-0 */
2744 		if (is_buffer) {
2745 			if (! (bind->bi_flags & IBT_MR_DISABLE_RO)) {
2746 				if ((bind->bi_type != HERMON_BINDHDL_UBUF) &&
2747 				    (hermon_kernel_data_ro ==
2748 				    HERMON_RO_ENABLED)) {
2749 					dma_attr.dma_attr_flags |=
2750 					    DDI_DMA_RELAXED_ORDERING;
2751 				}
2752 				if (((bind->bi_type == HERMON_BINDHDL_UBUF) &&
2753 				    (hermon_user_data_ro ==
2754 				    HERMON_RO_ENABLED))) {
2755 					dma_attr.dma_attr_flags |=
2756 					    DDI_DMA_RELAXED_ORDERING;
2757 				}
2758 			}
2759 		}
2760 
2761 		/* Allocate a DMA handle for the binding */
2762 		status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
2763 		    callback, NULL, &bind->bi_dmahdl);
2764 		if (status != DDI_SUCCESS) {
2765 			return (status);
2766 		}
2767 		bind->bi_free_dmahdl = 1;
2768 
2769 	} else  {
2770 		bind->bi_dmahdl = dmahdl;
2771 		bind->bi_free_dmahdl = 0;
2772 	}
2773 
2774 
2775 	/*
2776 	 * Bind the memory to get the PCI mapped addresses.  The decision
2777 	 * to call ddi_dma_addr_bind_handle() or ddi_dma_buf_bind_handle()
2778 	 * is determined by the "bi_type" flag.  Note: if the bind operation
2779 	 * fails then we have to free up the DMA handle and return error.
2780 	 */
2781 	if (bind->bi_type == HERMON_BINDHDL_VADDR) {
2782 		status = ddi_dma_addr_bind_handle(bind->bi_dmahdl, NULL,
2783 		    (caddr_t)(uintptr_t)bind->bi_addr, bind->bi_len,
2784 		    (DDI_DMA_RDWR | DDI_DMA_CONSISTENT), callback, NULL,
2785 		    &bind->bi_dmacookie, &bind->bi_cookiecnt);
2786 
2787 	} else {  /* HERMON_BINDHDL_BUF or HERMON_BINDHDL_UBUF */
2788 
2789 		status = ddi_dma_buf_bind_handle(bind->bi_dmahdl,
2790 		    bind->bi_buf, (DDI_DMA_RDWR | DDI_DMA_CONSISTENT), callback,
2791 		    NULL, &bind->bi_dmacookie, &bind->bi_cookiecnt);
2792 	}
2793 	if (status != DDI_DMA_MAPPED) {
2794 		if (bind->bi_free_dmahdl != 0) {
2795 			ddi_dma_free_handle(&bind->bi_dmahdl);
2796 		}
2797 		return (status);
2798 	}
2799 
2800 	return (DDI_SUCCESS);
2801 }
2802 
2803 
2804 /*
2805  * hermon_mr_mem_unbind()
2806  *    Context: Can be called from interrupt or base context.
2807  */
2808 static void
2809 hermon_mr_mem_unbind(hermon_state_t *state, hermon_bind_info_t *bind)
2810 {
2811 	int	status;
2812 
2813 	/*
2814 	 * In case of HERMON_BINDHDL_UBUF, the memory bi_buf points to
2815 	 * is actually allocated by ddi_umem_iosetup() internally, then
2816 	 * it's required to free it here. Reset bi_type to HERMON_BINDHDL_NONE
2817 	 * not to free it again later.
2818 	 */
2819 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*bind))
2820 	if (bind->bi_type == HERMON_BINDHDL_UBUF) {
2821 		freerbuf(bind->bi_buf);
2822 		bind->bi_type = HERMON_BINDHDL_NONE;
2823 	}
2824 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*bind))
2825 
2826 	/*
2827 	 * Unbind the DMA memory for the region
2828 	 *
2829 	 * Note: The only way ddi_dma_unbind_handle() currently
2830 	 * can return an error is if the handle passed in is invalid.
2831 	 * Since this should never happen, we choose to return void
2832 	 * from this function!  If this does return an error, however,
2833 	 * then we print a warning message to the console.
2834 	 */
2835 	status = ddi_dma_unbind_handle(bind->bi_dmahdl);
2836 	if (status != DDI_SUCCESS) {
2837 		HERMON_WARNING(state, "failed to unbind DMA mapping");
2838 		return;
2839 	}
2840 
2841 	/* Free up the DMA handle */
2842 	if (bind->bi_free_dmahdl != 0) {
2843 		ddi_dma_free_handle(&bind->bi_dmahdl);
2844 	}
2845 }
2846 
2847 
2848 /*
2849  * hermon_mr_fast_mtt_write()
2850  *    Context: Can be called from interrupt or base context.
2851  */
2852 static int
2853 hermon_mr_fast_mtt_write(hermon_state_t *state, hermon_rsrc_t *mtt,
2854     hermon_bind_info_t *bind, uint32_t mtt_pgsize_bits)
2855 {
2856 	hermon_icm_table_t	*icm_table;
2857 	hermon_dma_info_t	*dma_info;
2858 	uint32_t		index1, index2, rindx;
2859 	ddi_dma_cookie_t	dmacookie;
2860 	uint_t			cookie_cnt;
2861 	uint64_t		*mtt_table;
2862 	uint64_t		mtt_entry;
2863 	uint64_t		addr, endaddr;
2864 	uint64_t		pagesize;
2865 	offset_t		i, start;
2866 	uint_t			per_span;
2867 	int			sync_needed;
2868 
2869 	/*
2870 	 * XXX According to the PRM, we are to use the WRITE_MTT
2871 	 * command to write out MTTs. Tavor does not do this,
2872 	 * instead taking advantage of direct access to the MTTs,
2873 	 * and knowledge that Mellanox FMR relies on our ability
2874 	 * to write directly to the MTTs without any further
2875 	 * notification to the firmware. Likewise, we will choose
2876 	 * to not use the WRITE_MTT command, but to simply write
2877 	 * out the MTTs.
2878 	 */
2879 
2880 	/* Calculate page size from the suggested value passed in */
2881 	pagesize = ((uint64_t)1 << mtt_pgsize_bits);
2882 
2883 	/* Walk the "cookie list" and fill in the MTT table entries */
2884 	dmacookie  = bind->bi_dmacookie;
2885 	cookie_cnt = bind->bi_cookiecnt;
2886 
2887 	icm_table = &state->hs_icm[HERMON_MTT];
2888 	rindx = mtt->hr_indx;
2889 	hermon_index(index1, index2, rindx, icm_table, i);
2890 	start = i;
2891 
2892 	per_span   = icm_table->span;
2893 	dma_info   = icm_table->icm_dma[index1] + index2;
2894 	mtt_table  = (uint64_t *)(uintptr_t)dma_info->vaddr;
2895 
2896 	sync_needed = 0;
2897 	while (cookie_cnt-- > 0) {
2898 		addr    = dmacookie.dmac_laddress;
2899 		endaddr = addr + (dmacookie.dmac_size - 1);
2900 		addr    = addr & ~((uint64_t)pagesize - 1);
2901 
2902 		while (addr <= endaddr) {
2903 
2904 			/*
2905 			 * Fill in the mapped addresses (calculated above) and
2906 			 * set HERMON_MTT_ENTRY_PRESENT flag for each MTT entry.
2907 			 */
2908 			mtt_entry = addr | HERMON_MTT_ENTRY_PRESENT;
2909 			mtt_table[i] = htonll(mtt_entry);
2910 			i++;
2911 			rindx++;
2912 
2913 			if (i == per_span) {
2914 
2915 				(void) ddi_dma_sync(dma_info->dma_hdl,
2916 				    start * sizeof (hermon_hw_mtt_t),
2917 				    (i - start) * sizeof (hermon_hw_mtt_t),
2918 				    DDI_DMA_SYNC_FORDEV);
2919 
2920 				if ((addr + pagesize > endaddr) &&
2921 				    (cookie_cnt == 0))
2922 					return (DDI_SUCCESS);
2923 
2924 				hermon_index(index1, index2, rindx, icm_table,
2925 				    i);
2926 				start = i * sizeof (hermon_hw_mtt_t);
2927 				dma_info = icm_table->icm_dma[index1] + index2;
2928 				mtt_table =
2929 				    (uint64_t *)(uintptr_t)dma_info->vaddr;
2930 
2931 				sync_needed = 0;
2932 			} else {
2933 				sync_needed = 1;
2934 			}
2935 
2936 			addr += pagesize;
2937 			if (addr == 0) {
2938 				static int do_once = 1;
2939 				_NOTE(SCHEME_PROTECTS_DATA("safe sharing",
2940 				    do_once))
2941 				if (do_once) {
2942 					do_once = 0;
2943 					cmn_err(CE_NOTE, "probable error in "
2944 					    "dma_cookie address from caller\n");
2945 				}
2946 				break;
2947 			}
2948 		}
2949 
2950 		/*
2951 		 * When we've reached the end of the current DMA cookie,
2952 		 * jump to the next cookie (if there are more)
2953 		 */
2954 		if (cookie_cnt != 0) {
2955 			ddi_dma_nextcookie(bind->bi_dmahdl, &dmacookie);
2956 		}
2957 	}
2958 
2959 	/* done all the cookies, now sync the memory for the device */
2960 	if (sync_needed)
2961 		(void) ddi_dma_sync(dma_info->dma_hdl,
2962 		    start * sizeof (hermon_hw_mtt_t),
2963 		    (i - start) * sizeof (hermon_hw_mtt_t),
2964 		    DDI_DMA_SYNC_FORDEV);
2965 
2966 	return (DDI_SUCCESS);
2967 }
2968 
2969 /*
2970  * hermon_mr_fast_mtt_write_fmr()
2971  *    Context: Can be called from interrupt or base context.
2972  */
2973 /* ARGSUSED */
2974 static int
2975 hermon_mr_fast_mtt_write_fmr(hermon_state_t *state, hermon_rsrc_t *mtt,
2976     ibt_pmr_attr_t *mem_pattr, uint32_t mtt_pgsize_bits)
2977 {
2978 	hermon_icm_table_t	*icm_table;
2979 	hermon_dma_info_t	*dma_info;
2980 	uint32_t		index1, index2, rindx;
2981 	uint64_t		*mtt_table;
2982 	offset_t		i, j;
2983 	uint_t			per_span;
2984 
2985 	icm_table = &state->hs_icm[HERMON_MTT];
2986 	rindx = mtt->hr_indx;
2987 	hermon_index(index1, index2, rindx, icm_table, i);
2988 	per_span   = icm_table->span;
2989 	dma_info   = icm_table->icm_dma[index1] + index2;
2990 	mtt_table  = (uint64_t *)(uintptr_t)dma_info->vaddr;
2991 
2992 	/*
2993 	 * Fill in the MTT table entries
2994 	 */
2995 	for (j = 0; j < mem_pattr->pmr_num_buf; j++) {
2996 		mtt_table[i] = mem_pattr->pmr_addr_list[j].p_laddr;
2997 		i++;
2998 		rindx++;
2999 		if (i == per_span) {
3000 			hermon_index(index1, index2, rindx, icm_table, i);
3001 			dma_info = icm_table->icm_dma[index1] + index2;
3002 			mtt_table = (uint64_t *)(uintptr_t)dma_info->vaddr;
3003 		}
3004 	}
3005 
3006 	return (DDI_SUCCESS);
3007 }
3008 
3009 
3010 /*
3011  * hermon_mtt_refcnt_inc()
3012  *    Context: Can be called from interrupt or base context.
3013  */
3014 static uint_t
3015 hermon_mtt_refcnt_inc(hermon_rsrc_t *rsrc)
3016 {
3017 	hermon_sw_refcnt_t *rc;
3018 
3019 	rc = (hermon_sw_refcnt_t *)rsrc->hr_addr;
3020 	return (atomic_inc_uint_nv(&rc->swrc_refcnt));
3021 }
3022 
3023 
3024 /*
3025  * hermon_mtt_refcnt_dec()
3026  *    Context: Can be called from interrupt or base context.
3027  */
3028 static uint_t
3029 hermon_mtt_refcnt_dec(hermon_rsrc_t *rsrc)
3030 {
3031 	hermon_sw_refcnt_t *rc;
3032 
3033 	rc = (hermon_sw_refcnt_t *)rsrc->hr_addr;
3034 	return (atomic_dec_uint_nv(&rc->swrc_refcnt));
3035 }
3036