xref: /titanic_51/usr/src/uts/common/io/ib/adapters/hermon/hermon_misc.c (revision 17a2b317610f531d565bf4e940433aab2d9e6985)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * hermon_misc.c
28  *    Hermon Miscellaneous routines - Address Handle, Multicast, Protection
29  *    Domain, and port-related operations
30  *
31  *    Implements all the routines necessary for allocating, freeing, querying
32  *    and modifying Address Handles and Protection Domains.  Also implements
33  *    all the routines necessary for adding and removing Queue Pairs to/from
34  *    Multicast Groups.  Lastly, it implements the routines necessary for
35  *    port-related query and modify operations.
36  */
37 
38 #include <sys/types.h>
39 #include <sys/conf.h>
40 #include <sys/ddi.h>
41 #include <sys/sunddi.h>
42 #include <sys/modctl.h>
43 #include <sys/bitmap.h>
44 #include <sys/sysmacros.h>
45 
46 #include <sys/ib/adapters/hermon/hermon.h>
47 
48 extern int hermon_rdma_debug;
49 int hermon_fmr_verbose = 0;
50 
51 static int hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
52     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp, uint_t *qp_found);
53 static int hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
54     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp);
55 static void hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp);
56 static void hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp);
57 static uint_t hermon_mcg_walk_mgid_hash(hermon_state_t *state,
58     uint64_t start_indx, ib_gid_t mgid, uint_t *prev_indx);
59 static void hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg,
60     hermon_hw_mcg_t *mcg_hdr, ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc);
61 static int hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
62     uint_t prev_indx, hermon_hw_mcg_t *mcg_entry);
63 static int hermon_mcg_entry_invalidate(hermon_state_t *state,
64     hermon_hw_mcg_t *mcg_entry, uint_t indx);
65 static int hermon_mgid_is_valid(ib_gid_t gid);
66 static int hermon_mlid_is_valid(ib_lid_t lid);
67 static void hermon_fmr_cleanup(hermon_fmrhdl_t pool);
68 
69 
/*
 * Key used to register a user doorbell page in the umap database.
 * The key is index * HERMON_MAX_DBR_PAGES_PER_USER + page, computed in
 * 64 bits, so keys for different "index" values stay distinct as long as
 * "page" is below HERMON_MAX_DBR_PAGES_PER_USER (not enforced here).
 * Both arguments are fully parenthesized so that expression arguments are
 * evaluated before the widening cast is applied.
 */
#define	HERMON_MAX_DBR_PAGES_PER_USER	64
#define	HERMON_DBR_KEY(index, page) \
	(((uint64_t)(index)) * HERMON_MAX_DBR_PAGES_PER_USER + (page))
73 
74 static hermon_udbr_page_t *
75 hermon_dbr_new_user_page(hermon_state_t *state, uint_t index,
76     uint_t page)
77 {
78 	hermon_udbr_page_t *pagep;
79 	ddi_dma_attr_t dma_attr;
80 	uint_t cookiecnt;
81 	int status;
82 	hermon_umap_db_entry_t *umapdb;
83 	ulong_t pagesize = PAGESIZE;
84 
85 	pagep = kmem_alloc(sizeof (*pagep), KM_SLEEP);
86 	pagep->upg_index = page;
87 	pagep->upg_nfree = pagesize / sizeof (hermon_dbr_t);
88 
89 	/* Allocate 1 bit per dbr for free/alloc management (0 => "free") */
90 	pagep->upg_free = kmem_zalloc(pagesize / sizeof (hermon_dbr_t) / 8,
91 	    KM_SLEEP);
92 	pagep->upg_kvaddr = ddi_umem_alloc(pagesize, DDI_UMEM_SLEEP,
93 	    &pagep->upg_umemcookie); /* not HERMON_PAGESIZE here */
94 
95 	pagep->upg_buf = ddi_umem_iosetup(pagep->upg_umemcookie, 0,
96 	    pagesize, B_WRITE, 0, 0, NULL, DDI_UMEM_SLEEP);
97 
98 	hermon_dma_attr_init(state, &dma_attr);
99 #ifdef	__sparc
100 	if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS)
101 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
102 #endif
103 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
104 	    DDI_DMA_SLEEP, NULL, &pagep->upg_dmahdl);
105 	if (status != DDI_SUCCESS) {
106 		IBTF_DPRINTF_L2("hermon", "hermon_new_user_page: "
107 		    "ddi_dma_buf_bind_handle failed: %d", status);
108 		return (NULL);
109 	}
110 	status = ddi_dma_buf_bind_handle(pagep->upg_dmahdl,
111 	    pagep->upg_buf, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
112 	    DDI_DMA_SLEEP, NULL, &pagep->upg_dmacookie, &cookiecnt);
113 	if (status != DDI_SUCCESS) {
114 		IBTF_DPRINTF_L2("hermon", "hermon_dbr_new_user_page: "
115 		    "ddi_dma_buf_bind_handle failed: %d", status);
116 		ddi_dma_free_handle(&pagep->upg_dmahdl);
117 		return (NULL);
118 	}
119 	ASSERT(cookiecnt == 1);
120 
121 	/* create db entry for mmap */
122 	umapdb = hermon_umap_db_alloc(state->hs_instance,
123 	    HERMON_DBR_KEY(index, page), MLNX_UMAP_DBRMEM_RSRC,
124 	    (uint64_t)(uintptr_t)pagep);
125 	hermon_umap_db_add(umapdb);
126 	return (pagep);
127 }
128 
129 
130 /*ARGSUSED*/
131 static int
132 hermon_user_dbr_alloc(hermon_state_t *state, uint_t index,
133     ddi_acc_handle_t *acchdl, hermon_dbr_t **vdbr, uint64_t *pdbr,
134     uint64_t *mapoffset)
135 {
136 	hermon_user_dbr_t *udbr;
137 	hermon_udbr_page_t *pagep;
138 	uint_t next_page;
139 	int dbr_index;
140 	int i1, i2, i3, last;
141 	uint64_t u64, mask;
142 
143 	mutex_enter(&state->hs_dbr_lock);
144 	for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
145 		if (udbr->udbr_index == index)
146 			break;
147 	if (udbr == NULL) {
148 		udbr = kmem_alloc(sizeof (*udbr), KM_SLEEP);
149 		udbr->udbr_link = state->hs_user_dbr;
150 		state->hs_user_dbr = udbr;
151 		udbr->udbr_index = index;
152 		udbr->udbr_pagep = NULL;
153 	}
154 	pagep = udbr->udbr_pagep;
155 	next_page = (pagep == NULL) ? 0 : (pagep->upg_index + 1);
156 	while (pagep != NULL)
157 		if (pagep->upg_nfree > 0)
158 			break;
159 		else
160 			pagep = pagep->upg_link;
161 	if (pagep == NULL) {
162 		pagep = hermon_dbr_new_user_page(state, index, next_page);
163 		if (pagep == NULL) {
164 			mutex_exit(&state->hs_dbr_lock);
165 			return (DDI_FAILURE);
166 		}
167 		pagep->upg_link = udbr->udbr_pagep;
168 		udbr->udbr_pagep = pagep;
169 	}
170 
171 	/* Since nfree > 0, we're assured the loops below will succeed */
172 
173 	/* First, find a 64-bit (not ~0) that has a free dbr */
174 	last = PAGESIZE / sizeof (uint64_t) / 64;
175 	mask = ~0ull;
176 	for (i1 = 0; i1 < last; i1++)
177 		if ((pagep->upg_free[i1] & mask) != mask)
178 			break;
179 	u64 = pagep->upg_free[i1];
180 
181 	/* Second, find a byte (not 0xff) that has a free dbr */
182 	last = sizeof (uint64_t) / sizeof (uint8_t);
183 	for (i2 = 0, mask = 0xff; i2 < last; i2++, mask <<= 8)
184 		if ((u64 & mask) != mask)
185 			break;
186 
187 	/* Third, find a bit that is free (0) */
188 	for (i3 = 0; i3 < sizeof (uint64_t) / sizeof (uint8_t); i3++)
189 		if ((u64 & (1ul << (i3 + 8 * i2))) == 0)
190 			break;
191 
192 	/* Mark it as allocated */
193 	pagep->upg_free[i1] |= (1ul << (i3 + 8 * i2));
194 
195 	dbr_index = ((i1 * sizeof (uint64_t)) + i2) * sizeof (uint64_t) + i3;
196 	pagep->upg_nfree--;
197 	((uint64_t *)(void *)pagep->upg_kvaddr)[dbr_index] = 0;	/* clear dbr */
198 	*mapoffset = ((HERMON_DBR_KEY(index, pagep->upg_index) <<
199 	    MLNX_UMAP_RSRC_TYPE_SHIFT) | MLNX_UMAP_DBRMEM_RSRC) << PAGESHIFT;
200 	*vdbr = (hermon_dbr_t *)((uint64_t *)(void *)pagep->upg_kvaddr +
201 	    dbr_index);
202 	*pdbr = pagep->upg_dmacookie.dmac_laddress + dbr_index *
203 	    sizeof (uint64_t);
204 
205 	mutex_exit(&state->hs_dbr_lock);
206 	return (DDI_SUCCESS);
207 }
208 
209 static void
210 hermon_user_dbr_free(hermon_state_t *state, uint_t index, hermon_dbr_t *record)
211 {
212 	hermon_user_dbr_t	*udbr;
213 	hermon_udbr_page_t	*pagep;
214 	caddr_t			kvaddr;
215 	uint_t			dbr_index;
216 	uint_t			max_free = PAGESIZE / sizeof (hermon_dbr_t);
217 	int			i1, i2;
218 
219 	dbr_index = (uintptr_t)record & PAGEOFFSET; /* offset (not yet index) */
220 	kvaddr = (caddr_t)record - dbr_index;
221 	dbr_index /= sizeof (hermon_dbr_t); /* now it's the index */
222 
223 	mutex_enter(&state->hs_dbr_lock);
224 	for (udbr = state->hs_user_dbr; udbr != NULL; udbr = udbr->udbr_link)
225 		if (udbr->udbr_index == index)
226 			break;
227 	if (udbr == NULL) {
228 		IBTF_DPRINTF_L2("hermon", "free user dbr: udbr struct not "
229 		    "found for index %x", index);
230 		mutex_exit(&state->hs_dbr_lock);
231 		return;
232 	}
233 	for (pagep = udbr->udbr_pagep; pagep != NULL; pagep = pagep->upg_link)
234 		if (pagep->upg_kvaddr == kvaddr)
235 			break;
236 	if (pagep == NULL) {
237 		IBTF_DPRINTF_L2("hermon", "free user dbr: pagep struct not"
238 		    " found for index %x, kvaddr %p, DBR index %x",
239 		    index, kvaddr, dbr_index);
240 		mutex_exit(&state->hs_dbr_lock);
241 		return;
242 	}
243 	if (pagep->upg_nfree >= max_free) {
244 		IBTF_DPRINTF_L2("hermon", "free user dbr: overflow: "
245 		    "UCE index %x, DBR index %x", index, dbr_index);
246 		mutex_exit(&state->hs_dbr_lock);
247 		return;
248 	}
249 	ASSERT(dbr_index < max_free);
250 	i1 = dbr_index / 64;
251 	i2 = dbr_index % 64;
252 	ASSERT((pagep->upg_free[i1] & (1ul << i2)) == (1ul << i2));
253 	pagep->upg_free[i1] &= ~(1ul << i2);
254 	pagep->upg_nfree++;
255 	mutex_exit(&state->hs_dbr_lock);
256 }
257 
258 /*
259  * hermon_dbr_page_alloc()
260  *	first page allocation - called from attach or open
261  *	in this case, we want exactly one page per call, and aligned on a
262  *	page - and may need to be mapped to the user for access
263  */
264 int
265 hermon_dbr_page_alloc(hermon_state_t *state, hermon_dbr_info_t **dinfo)
266 {
267 	int			status;
268 	ddi_dma_handle_t	dma_hdl;
269 	ddi_acc_handle_t	acc_hdl;
270 	ddi_dma_attr_t		dma_attr;
271 	ddi_dma_cookie_t	cookie;
272 	uint_t			cookie_cnt;
273 	int			i;
274 	hermon_dbr_info_t 	*info;
275 	caddr_t			dmaaddr;
276 	uint64_t		dmalen;
277 	ulong_t			pagesize = PAGESIZE;
278 
279 	info = kmem_zalloc(sizeof (hermon_dbr_info_t), KM_SLEEP);
280 
281 	/*
282 	 * Initialize many of the default DMA attributes.  Then set additional
283 	 * alignment restrictions if necessary for the dbr memory, meaning
284 	 * page aligned.  Also use the configured value for IOMMU bypass
285 	 */
286 	hermon_dma_attr_init(state, &dma_attr);
287 	dma_attr.dma_attr_align = pagesize;
288 	dma_attr.dma_attr_sgllen = 1;	/* make sure only one cookie */
289 #ifdef	__sparc
290 	if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS)
291 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
292 #endif
293 
294 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr,
295 	    DDI_DMA_SLEEP, NULL, &dma_hdl);
296 	if (status != DDI_SUCCESS) {
297 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
298 		cmn_err(CE_NOTE, "dbr DMA handle alloc failed\n");
299 		return (DDI_FAILURE);
300 	}
301 
302 	status = ddi_dma_mem_alloc(dma_hdl, pagesize,
303 	    &state->hs_reg_accattr, DDI_DMA_CONSISTENT, DDI_DMA_SLEEP,
304 	    NULL, &dmaaddr, (size_t *)&dmalen, &acc_hdl);
305 	if (status != DDI_SUCCESS)	{
306 		ddi_dma_free_handle(&dma_hdl);
307 		cmn_err(CE_CONT, "dbr DMA mem alloc failed(status %d)", status);
308 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
309 		return (DDI_FAILURE);
310 	}
311 
312 	/* this memory won't be IB registered, so do the bind here */
313 	status = ddi_dma_addr_bind_handle(dma_hdl, NULL,
314 	    dmaaddr, (size_t)dmalen, DDI_DMA_RDWR |
315 	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &cookie, &cookie_cnt);
316 	if (status != DDI_SUCCESS) {
317 		ddi_dma_mem_free(&acc_hdl);
318 		ddi_dma_free_handle(&dma_hdl);
319 		kmem_free((void *)info, sizeof (hermon_dbr_info_t));
320 		cmn_err(CE_CONT, "dbr DMA bind handle failed (status %d)",
321 		    status);
322 		return (DDI_FAILURE);
323 	}
324 	*dinfo = info;		/* Pass back the pointer */
325 
326 	/* init the info structure with returned info */
327 	info->dbr_dmahdl = dma_hdl;
328 	info->dbr_acchdl = acc_hdl;
329 	info->dbr_page   = (hermon_dbr_t *)(void *)dmaaddr;
330 	info->dbr_link = NULL;
331 	/* extract the phys addr from the cookie */
332 	info->dbr_paddr = cookie.dmac_laddress;
333 	info->dbr_firstfree = 0;
334 	info->dbr_nfree = HERMON_NUM_DBR_PER_PAGE;
335 	/* link all DBrs onto the free list */
336 	for (i = 0; i < HERMON_NUM_DBR_PER_PAGE; i++) {
337 		info->dbr_page[i] = i + 1;
338 	}
339 
340 	return (DDI_SUCCESS);
341 }
342 
343 
344 /*
345  * hermon_dbr_alloc()
346  *	DBr record allocation - called from alloc cq/qp/srq
347  *	will check for available dbrs in current
348  *	page - if needed it will allocate another and link them
349  */
350 
351 int
352 hermon_dbr_alloc(hermon_state_t *state, uint_t index, ddi_acc_handle_t *acchdl,
353     hermon_dbr_t **vdbr, uint64_t *pdbr, uint64_t *mapoffset)
354 {
355 	hermon_dbr_t		*record = NULL;
356 	hermon_dbr_info_t	*info = NULL;
357 	uint32_t		idx;
358 	int			status;
359 
360 	if (index != state->hs_kernel_uar_index)
361 		return (hermon_user_dbr_alloc(state, index, acchdl, vdbr, pdbr,
362 		    mapoffset));
363 
364 	mutex_enter(&state->hs_dbr_lock);
365 	for (info = state->hs_kern_dbr; info != NULL; info = info->dbr_link)
366 		if (info->dbr_nfree != 0)
367 			break;		/* found a page w/ one available */
368 
369 	if (info == NULL) {	/* did NOT find a page with one available */
370 		status = hermon_dbr_page_alloc(state, &info);
371 		if (status != DDI_SUCCESS) {
372 			/* do error handling */
373 			mutex_exit(&state->hs_dbr_lock);
374 			return (DDI_FAILURE);
375 		}
376 		/* got a new page, so link it in. */
377 		info->dbr_link = state->hs_kern_dbr;
378 		state->hs_kern_dbr = info;
379 	}
380 	idx = info->dbr_firstfree;
381 	record = info->dbr_page + idx;
382 	info->dbr_firstfree = *record;
383 	info->dbr_nfree--;
384 	*record = 0;
385 
386 	*acchdl = info->dbr_acchdl;
387 	*vdbr = record;
388 	*pdbr = info->dbr_paddr + idx * sizeof (hermon_dbr_t);
389 	mutex_exit(&state->hs_dbr_lock);
390 	return (DDI_SUCCESS);
391 }
392 
393 /*
394  * hermon_dbr_free()
395  *	DBr record deallocation - called from free cq/qp
396  *	will update the counter in the header, and invalidate
397  *	the dbr, but will NEVER free pages of dbrs - small
398  *	price to pay, but userland access never will anyway
399  */
400 void
401 hermon_dbr_free(hermon_state_t *state, uint_t indx, hermon_dbr_t *record)
402 {
403 	hermon_dbr_t		*page;
404 	hermon_dbr_info_t	*info;
405 
406 	if (indx != state->hs_kernel_uar_index) {
407 		hermon_user_dbr_free(state, indx, record);
408 		return;
409 	}
410 	page = (hermon_dbr_t *)(uintptr_t)((uintptr_t)record & PAGEMASK);
411 	mutex_enter(&state->hs_dbr_lock);
412 	for (info = state->hs_kern_dbr; info != NULL; info = info->dbr_link)
413 		if (info->dbr_page == page)
414 			break;
415 	ASSERT(info != NULL);
416 	*record = info->dbr_firstfree;
417 	info->dbr_firstfree = record - info->dbr_page;
418 	info->dbr_nfree++;
419 	mutex_exit(&state->hs_dbr_lock);
420 }
421 
422 /*
423  * hermon_dbr_kern_free()
424  *    Context: Can be called only from detach context.
425  *
426  *	Free all kernel dbr pages.  This includes the freeing of all the dma
427  *	resources acquired during the allocation of the pages.
428  *
429  *	Also, free all the user dbr pages.
430  */
431 void
432 hermon_dbr_kern_free(hermon_state_t *state)
433 {
434 	hermon_dbr_info_t	*info, *link;
435 	hermon_user_dbr_t	*udbr, *next;
436 	hermon_udbr_page_t	*pagep, *nextp;
437 	hermon_umap_db_entry_t	*umapdb;
438 	int			instance, status;
439 	uint64_t		value;
440 	extern			hermon_umap_db_t hermon_userland_rsrc_db;
441 
442 	mutex_enter(&state->hs_dbr_lock);
443 	for (info = state->hs_kern_dbr; info != NULL; info = link) {
444 		(void) ddi_dma_unbind_handle(info->dbr_dmahdl);
445 		ddi_dma_mem_free(&info->dbr_acchdl);	/* free page */
446 		ddi_dma_free_handle(&info->dbr_dmahdl);
447 		link = info->dbr_link;
448 		kmem_free(info, sizeof (hermon_dbr_info_t));
449 	}
450 
451 	udbr = state->hs_user_dbr;
452 	instance = state->hs_instance;
453 	mutex_enter(&hermon_userland_rsrc_db.hdl_umapdb_lock);
454 	while (udbr != NULL) {
455 		pagep = udbr->udbr_pagep;
456 		while (pagep != NULL) {
457 			/* probably need to remove "db" */
458 			(void) ddi_dma_unbind_handle(pagep->upg_dmahdl);
459 			ddi_dma_free_handle(&pagep->upg_dmahdl);
460 			freerbuf(pagep->upg_buf);
461 			ddi_umem_free(pagep->upg_umemcookie);
462 			status = hermon_umap_db_find_nolock(instance,
463 			    HERMON_DBR_KEY(udbr->udbr_index,
464 			    pagep->upg_index), MLNX_UMAP_DBRMEM_RSRC,
465 			    &value, HERMON_UMAP_DB_REMOVE, &umapdb);
466 			if (status == DDI_SUCCESS)
467 				hermon_umap_db_free(umapdb);
468 			kmem_free(pagep->upg_free,
469 			    PAGESIZE / sizeof (hermon_dbr_t) / 8);
470 			nextp = pagep->upg_link;
471 			kmem_free(pagep, sizeof (*pagep));
472 			pagep = nextp;
473 		}
474 		next = udbr->udbr_link;
475 		kmem_free(udbr, sizeof (*udbr));
476 		udbr = next;
477 	}
478 	mutex_exit(&hermon_userland_rsrc_db.hdl_umapdb_lock);
479 	mutex_exit(&state->hs_dbr_lock);
480 }
481 
482 /*
483  * hermon_ah_alloc()
484  *    Context: Can be called only from user or kernel context.
485  */
486 int
487 hermon_ah_alloc(hermon_state_t *state, hermon_pdhdl_t pd,
488     ibt_adds_vect_t *attr_p, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
489 {
490 	hermon_rsrc_t		*rsrc;
491 	hermon_hw_udav_t	*udav;
492 	hermon_ahhdl_t		ah;
493 	int			status;
494 
495 	/*
496 	 * Someday maybe the "ibt_adds_vect_t *attr_p" will be NULL to
497 	 * indicate that we wish to allocate an "invalid" (i.e. empty)
498 	 * address handle XXX
499 	 */
500 
501 	/* Validate that specified port number is legal */
502 	if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
503 		return (IBT_HCA_PORT_INVALID);
504 	}
505 
506 	/*
507 	 * Allocate the software structure for tracking the address handle
508 	 * (i.e. the Hermon Address Handle struct).
509 	 */
510 	status = hermon_rsrc_alloc(state, HERMON_AHHDL, 1, sleepflag, &rsrc);
511 	if (status != DDI_SUCCESS) {
512 		return (IBT_INSUFF_RESOURCE);
513 	}
514 	ah = (hermon_ahhdl_t)rsrc->hr_addr;
515 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
516 
517 	/* Increment the reference count on the protection domain (PD) */
518 	hermon_pd_refcnt_inc(pd);
519 
520 	udav = (hermon_hw_udav_t *)kmem_zalloc(sizeof (hermon_hw_udav_t),
521 	    KM_SLEEP);
522 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))
523 
524 	/*
525 	 * Fill in the UDAV data. We first zero out the UDAV, then populate
526 	 * it by then calling hermon_set_addr_path() to fill in the common
527 	 * portions that can be pulled from the "ibt_adds_vect_t" passed in
528 	 */
529 	status = hermon_set_addr_path(state, attr_p,
530 	    (hermon_hw_addr_path_t *)udav, HERMON_ADDRPATH_UDAV);
531 	if (status != DDI_SUCCESS) {
532 		hermon_pd_refcnt_dec(pd);
533 		hermon_rsrc_free(state, &rsrc);
534 		return (status);
535 	}
536 	udav->pd	= pd->pd_pdnum;
537 	udav->sl	= attr_p->av_srvl;
538 
539 	/*
540 	 * Fill in the rest of the Hermon Address Handle struct.
541 	 *
542 	 * NOTE: We are saving away a copy of the "av_dgid.gid_guid" field
543 	 * here because we may need to return it later to the IBTF (as a
544 	 * result of a subsequent query operation).  Unlike the other UDAV
545 	 * parameters, the value of "av_dgid.gid_guid" is not always preserved.
546 	 * The reason for this is described in hermon_set_addr_path().
547 	 */
548 	ah->ah_rsrcp	 = rsrc;
549 	ah->ah_pdhdl	 = pd;
550 	ah->ah_udav	 = udav;
551 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
552 	*ahhdl = ah;
553 
554 	return (DDI_SUCCESS);
555 }
556 
557 
558 /*
559  * hermon_ah_free()
560  *    Context: Can be called only from user or kernel context.
561  */
562 /* ARGSUSED */
563 int
564 hermon_ah_free(hermon_state_t *state, hermon_ahhdl_t *ahhdl, uint_t sleepflag)
565 {
566 	hermon_rsrc_t		*rsrc;
567 	hermon_pdhdl_t		pd;
568 	hermon_ahhdl_t		ah;
569 
570 	/*
571 	 * Pull all the necessary information from the Hermon Address Handle
572 	 * struct.  This is necessary here because the resource for the
573 	 * AH is going to be freed up as part of this operation.
574 	 */
575 	ah    = *ahhdl;
576 	mutex_enter(&ah->ah_lock);
577 	rsrc  = ah->ah_rsrcp;
578 	pd    = ah->ah_pdhdl;
579 	mutex_exit(&ah->ah_lock);
580 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*ah))
581 
582 	/* Free the UDAV memory */
583 	kmem_free(ah->ah_udav, sizeof (hermon_hw_udav_t));
584 
585 	/* Decrement the reference count on the protection domain (PD) */
586 	hermon_pd_refcnt_dec(pd);
587 
588 	/* Free the Hermon Address Handle structure */
589 	hermon_rsrc_free(state, &rsrc);
590 
591 	/* Set the ahhdl pointer to NULL and return success */
592 	*ahhdl = NULL;
593 
594 	return (DDI_SUCCESS);
595 }
596 
597 
598 /*
599  * hermon_ah_query()
600  *    Context: Can be called from interrupt or base context.
601  */
602 /* ARGSUSED */
603 int
604 hermon_ah_query(hermon_state_t *state, hermon_ahhdl_t ah, hermon_pdhdl_t *pd,
605     ibt_adds_vect_t *attr_p)
606 {
607 	mutex_enter(&ah->ah_lock);
608 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p))
609 
610 	/*
611 	 * Pull the PD and UDAV from the Hermon Address Handle structure
612 	 */
613 	*pd = ah->ah_pdhdl;
614 
615 	/*
616 	 * Fill in "ibt_adds_vect_t".  We call hermon_get_addr_path() to fill
617 	 * the common portions that can be pulled from the UDAV we pass in.
618 	 *
619 	 * NOTE: We will also fill the "av_dgid.gid_guid" field from the
620 	 * "ah_save_guid" field we have previously saved away.  The reason
621 	 * for this is described in hermon_ah_alloc() and hermon_ah_modify().
622 	 */
623 	hermon_get_addr_path(state, (hermon_hw_addr_path_t *)ah->ah_udav,
624 	    attr_p, HERMON_ADDRPATH_UDAV);
625 
626 	attr_p->av_dgid.gid_guid = ah->ah_save_guid;
627 
628 	mutex_exit(&ah->ah_lock);
629 	return (DDI_SUCCESS);
630 }
631 
632 
633 /*
634  * hermon_ah_modify()
635  *    Context: Can be called from interrupt or base context.
636  */
637 /* ARGSUSED */
638 int
639 hermon_ah_modify(hermon_state_t *state, hermon_ahhdl_t ah,
640     ibt_adds_vect_t *attr_p)
641 {
642 	hermon_hw_udav_t	old_udav;
643 	uint64_t		data_old;
644 	int			status, size, i;
645 
646 	/* Validate that specified port number is legal */
647 	if (!hermon_portnum_is_valid(state, attr_p->av_port_num)) {
648 		return (IBT_HCA_PORT_INVALID);
649 	}
650 
651 	mutex_enter(&ah->ah_lock);
652 
653 	/* Save a copy of the current UDAV data in old_udav. */
654 	bcopy(ah->ah_udav, &old_udav, sizeof (hermon_hw_udav_t));
655 
656 	/*
657 	 * Fill in the new UDAV with the caller's data, passed in via the
658 	 * "ibt_adds_vect_t" structure.
659 	 *
660 	 * NOTE: We also need to save away a copy of the "av_dgid.gid_guid"
661 	 * field here (just as we did during hermon_ah_alloc()) because we
662 	 * may need to return it later to the IBTF (as a result of a
663 	 * subsequent query operation).  As explained in hermon_ah_alloc(),
664 	 * unlike the other UDAV parameters, the value of "av_dgid.gid_guid"
665 	 * is not always preserved. The reason for this is described in
666 	 * hermon_set_addr_path().
667 	 */
668 	status = hermon_set_addr_path(state, attr_p,
669 	    (hermon_hw_addr_path_t *)ah->ah_udav, HERMON_ADDRPATH_UDAV);
670 	if (status != DDI_SUCCESS) {
671 		mutex_exit(&ah->ah_lock);
672 		return (status);
673 	}
674 	ah->ah_save_guid = attr_p->av_dgid.gid_guid;
675 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(ah->ah_udav)))
676 	ah->ah_udav->sl  = attr_p->av_srvl;
677 
678 	/*
679 	 * Copy changes into the new UDAV.
680 	 *    Note:  We copy in 64-bit chunks.  For the first two of these
681 	 *    chunks it is necessary to read the current contents of the
682 	 *    UDAV, mask off the modifiable portions (maintaining any
683 	 *    of the "reserved" portions), and then mask on the new data.
684 	 */
685 	size = sizeof (hermon_hw_udav_t) >> 3;
686 	for (i = 0; i < size; i++) {
687 		data_old = ((uint64_t *)&old_udav)[i];
688 
689 		/*
690 		 * Apply mask to change only the relevant values.
691 		 */
692 		if (i == 0) {
693 			data_old = data_old & HERMON_UDAV_MODIFY_MASK0;
694 		} else if (i == 1) {
695 			data_old = data_old & HERMON_UDAV_MODIFY_MASK1;
696 		} else {
697 			data_old = 0;
698 		}
699 
700 		/* Store the updated values to the UDAV */
701 		((uint64_t *)ah->ah_udav)[i] |= data_old;
702 	}
703 
704 	/*
705 	 * Put the valid PD number back into the UDAV entry, as it
706 	 * might have been clobbered above.
707 	 */
708 	ah->ah_udav->pd = old_udav.pd;
709 
710 
711 	mutex_exit(&ah->ah_lock);
712 	return (DDI_SUCCESS);
713 }
714 
715 /*
716  * hermon_mcg_attach()
717  *    Context: Can be called only from user or kernel context.
718  */
719 int
720 hermon_mcg_attach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
721     ib_lid_t lid)
722 {
723 	hermon_rsrc_t		*rsrc;
724 	hermon_hw_mcg_t		*mcg_entry;
725 	hermon_hw_mcg_qp_list_t	*mcg_entry_qplist;
726 	hermon_mcghdl_t		mcg, newmcg;
727 	uint64_t		mgid_hash;
728 	uint32_t		end_indx;
729 	int			status;
730 	uint_t			qp_found;
731 
732 	/*
733 	 * It is only allowed to attach MCG to UD queue pairs.  Verify
734 	 * that the intended QP is of the appropriate transport type
735 	 */
736 	if (qp->qp_serv_type != HERMON_QP_UD) {
737 		return (IBT_QP_SRV_TYPE_INVALID);
738 	}
739 
740 	/*
741 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
742 	 * LIDs should be within a well defined range.  If the specified LID
743 	 * is outside of that range, then return an error.
744 	 */
745 	if (hermon_mlid_is_valid(lid) == 0) {
746 		return (IBT_MC_MLID_INVALID);
747 	}
748 	/*
749 	 * Check for invalid Multicast GID.  All Multicast GIDs should have
750 	 * a well-defined pattern of bits and flags that are allowable.  If
751 	 * the specified GID does not meet the criteria, then return an error.
752 	 */
753 	if (hermon_mgid_is_valid(gid) == 0) {
754 		return (IBT_MC_MGID_INVALID);
755 	}
756 
757 	/*
758 	 * Compute the MGID hash value.  Since the MCG table is arranged as
759 	 * a number of separate hash chains, this operation converts the
760 	 * specified MGID into the starting index of an entry in the hash
761 	 * table (i.e. the index for the start of the appropriate hash chain).
762 	 * Subsequent operations below will walk the chain searching for the
763 	 * right place to add this new QP.
764 	 */
765 	status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
766 	    &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
767 	if (status != HERMON_CMD_SUCCESS) {
768 		cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
769 		    status);
770 		if (status == HERMON_CMD_INVALID_STATUS) {
771 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
772 		}
773 		return (ibc_get_ci_failure(0));
774 	}
775 
776 	/*
777 	 * Grab the multicast group mutex.  Then grab the pre-allocated
778 	 * temporary buffer used for holding and/or modifying MCG entries.
779 	 * Zero out the temporary MCG entry before we begin.
780 	 */
781 	mutex_enter(&state->hs_mcglock);
782 	mcg_entry = state->hs_mcgtmp;
783 	mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);
784 	bzero(mcg_entry, HERMON_MCGMEM_SZ(state));
785 
786 	/*
787 	 * Walk through the array of MCG entries starting at "mgid_hash".
788 	 * Try to find the appropriate place for this new QP to be added.
789 	 * This could happen when the first entry of the chain has MGID == 0
790 	 * (which means that the hash chain is empty), or because we find
791 	 * an entry with the same MGID (in which case we'll add the QP to
792 	 * that MCG), or because we come to the end of the chain (in which
793 	 * case this is the first QP being added to the multicast group that
794 	 * corresponds to the MGID.  The hermon_mcg_walk_mgid_hash() routine
795 	 * walks the list and returns an index into the MCG table.  The entry
796 	 * at this index is then checked to determine which case we have
797 	 * fallen into (see below).  Note:  We are using the "shadow" MCG
798 	 * list (of hermon_mcg_t structs) for this lookup because the real
799 	 * MCG entries are in hardware (and the lookup process would be much
800 	 * more time consuming).
801 	 */
802 	end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, NULL);
803 	mcg	 = &state->hs_mcghdl[end_indx];
804 
805 	/*
806 	 * If MGID == 0, then the hash chain is empty.  Just fill in the
807 	 * current entry.  Note:  No need to allocate an MCG table entry
808 	 * as all the hash chain "heads" are already preallocated.
809 	 */
810 	if ((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) {
811 
812 		/* Fill in the current entry in the "shadow" MCG list */
813 		hermon_mcg_setup_new_hdr(mcg, mcg_entry, gid, NULL);
814 
815 		/*
816 		 * Try to add the new QP number to the list.  This (and the
817 		 * above) routine fills in a temporary MCG.  The "mcg_entry"
818 		 * and "mcg_entry_qplist" pointers simply point to different
819 		 * offsets within the same temporary copy of the MCG (for
820 		 * convenience).  Note:  If this fails, we need to invalidate
821 		 * the entries we've already put into the "shadow" list entry
822 		 * above.
823 		 */
824 		status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
825 		    &qp_found);
826 		if (status != DDI_SUCCESS) {
827 			bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
828 			mutex_exit(&state->hs_mcglock);
829 			return (status);
830 		}
831 		if (!qp_found)
832 			mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
833 			    /* set the member count */
834 
835 		/*
836 		 * Once the temporary MCG has been filled in, write the entry
837 		 * into the appropriate location in the Hermon MCG entry table.
838 		 * If it's successful, then drop the lock and return success.
839 		 * Note: In general, this operation shouldn't fail.  If it
840 		 * does, then it is an indication that something (probably in
841 		 * HW, but maybe in SW) has gone seriously wrong.  We still
842 		 * want to zero out the entries that we've filled in above
843 		 * (in the hermon_mcg_setup_new_hdr() routine).
844 		 */
845 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
846 		    HERMON_CMD_NOSLEEP_SPIN);
847 		if (status != HERMON_CMD_SUCCESS) {
848 			bzero(mcg, sizeof (struct hermon_sw_mcg_list_s));
849 			mutex_exit(&state->hs_mcglock);
850 			HERMON_WARNING(state, "failed to write MCG entry");
851 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
852 			    "%08x\n", status);
853 			if (status == HERMON_CMD_INVALID_STATUS) {
854 				hermon_fm_ereport(state, HCA_SYS_ERR,
855 				    HCA_ERR_SRV_LOST);
856 			}
857 			return (ibc_get_ci_failure(0));
858 		}
859 
860 		/*
861 		 * Now that we know all the Hermon firmware accesses have been
862 		 * successful, we update the "shadow" MCG entry by incrementing
863 		 * the "number of attached QPs" count.
864 		 *
865 		 * We increment only if the QP is not already part of the
866 		 * MCG by checking the 'qp_found' flag returned from the
867 		 * qplist_add above.
868 		 */
869 		if (!qp_found) {
870 			mcg->mcg_num_qps++;
871 
872 			/*
873 			 * Increment the refcnt for this QP.  Because the QP
874 			 * was added to this MCG, the refcnt must be
875 			 * incremented.
876 			 */
877 			hermon_qp_mcg_refcnt_inc(qp);
878 		}
879 
880 		/*
881 		 * We drop the lock and return success.
882 		 */
883 		mutex_exit(&state->hs_mcglock);
884 		return (DDI_SUCCESS);
885 	}
886 
887 	/*
888 	 * If the specified MGID matches the MGID in the current entry, then
889 	 * we need to try to add the QP to the current MCG entry.  In this
890 	 * case, it means that we need to read the existing MCG entry (into
891 	 * the temporary MCG), add the new QP number to the temporary entry
892 	 * (using the same method we used above), and write the entry back
893 	 * to the hardware (same as above).
894 	 */
895 	if ((mcg->mcg_mgid_h == gid.gid_prefix) &&
896 	    (mcg->mcg_mgid_l == gid.gid_guid)) {
897 
898 		/*
899 		 * Read the current MCG entry into the temporary MCG.  Note:
900 		 * In general, this operation shouldn't fail.  If it does,
901 		 * then it is an indication that something (probably in HW,
902 		 * but maybe in SW) has gone seriously wrong.
903 		 */
904 		status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
905 		    HERMON_CMD_NOSLEEP_SPIN);
906 		if (status != HERMON_CMD_SUCCESS) {
907 			mutex_exit(&state->hs_mcglock);
908 			HERMON_WARNING(state, "failed to read MCG entry");
909 			cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
910 			    "%08x\n", status);
911 			if (status == HERMON_CMD_INVALID_STATUS) {
912 				hermon_fm_ereport(state, HCA_SYS_ERR,
913 				    HCA_ERR_SRV_LOST);
914 			}
915 			return (ibc_get_ci_failure(0));
916 		}
917 
918 		/*
919 		 * Try to add the new QP number to the list.  This routine
920 		 * fills in the necessary pieces of the temporary MCG.  The
921 		 * "mcg_entry_qplist" pointer is used to point to the portion
922 		 * of the temporary MCG that holds the QP numbers.
923 		 *
924 		 * Note: hermon_mcg_qplist_add() returns SUCCESS if it
925 		 * already found the QP in the list.  In this case, the QP is
926 		 * not added on to the list again.  Check the flag 'qp_found'
927 		 * if this value is needed to be known.
928 		 *
929 		 */
930 		status = hermon_mcg_qplist_add(state, mcg, mcg_entry_qplist, qp,
931 		    &qp_found);
932 		if (status != DDI_SUCCESS) {
933 			mutex_exit(&state->hs_mcglock);
934 			return (status);
935 		}
936 		if (!qp_found)
937 			mcg_entry->member_cnt = (mcg->mcg_num_qps + 1);
938 			    /* set the member count */
939 
940 		/*
941 		 * Once the temporary MCG has been updated, write the entry
942 		 * into the appropriate location in the Hermon MCG entry table.
943 		 * If it's successful, then drop the lock and return success.
944 		 * Note: In general, this operation shouldn't fail.  If it
945 		 * does, then it is an indication that something (probably in
946 		 * HW, but maybe in SW) has gone seriously wrong.
947 		 */
948 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
949 		    HERMON_CMD_NOSLEEP_SPIN);
950 		if (status != HERMON_CMD_SUCCESS) {
951 			mutex_exit(&state->hs_mcglock);
952 			HERMON_WARNING(state, "failed to write MCG entry");
953 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
954 			    "%08x\n", status);
955 			if (status == HERMON_CMD_INVALID_STATUS) {
956 				hermon_fm_ereport(state, HCA_SYS_ERR,
957 				    HCA_ERR_SRV_LOST);
958 			}
959 			return (ibc_get_ci_failure(0));
960 		}
961 
962 		/*
963 		 * Now that we know all the Hermon firmware accesses have been
964 		 * successful, we update the current "shadow" MCG entry by
965 		 * incrementing the "number of attached QPs" count.
966 		 *
967 		 * We increment only if the QP is not already part of the
968 		 * MCG by checking the 'qp_found' flag returned from the
969 		 * qplist_add above.
970 		 */
971 		if (!qp_found) {
972 			mcg->mcg_num_qps++;
973 
974 			/*
975 			 * Increment the refcnt for this QP.  Because the QP
976 			 * was added to this MCG, the refcnt must be
977 			 * incremented.
978 			 */
979 			hermon_qp_mcg_refcnt_inc(qp);
980 		}
981 
982 		/*
983 		 * We drop the lock and return success.
984 		 */
985 		mutex_exit(&state->hs_mcglock);
986 		return (DDI_SUCCESS);
987 	}
988 
989 	/*
990 	 * If we've reached here, then we're at the end of the hash chain.
991 	 * We need to allocate a new MCG entry, fill it in, write it to Hermon,
992 	 * and update the previous entry to link the new one to the end of the
993 	 * chain.
994 	 */
995 
996 	/*
997 	 * Allocate an MCG table entry.  This will be filled in with all
998 	 * the necessary parameters to define the multicast group.  Then it
999 	 * will be written to the hardware in the next-to-last step below.
1000 	 */
1001 	status = hermon_rsrc_alloc(state, HERMON_MCG, 1, HERMON_NOSLEEP, &rsrc);
1002 	if (status != DDI_SUCCESS) {
1003 		mutex_exit(&state->hs_mcglock);
1004 		return (IBT_INSUFF_RESOURCE);
1005 	}
1006 
1007 	/*
1008 	 * Fill in the new entry in the "shadow" MCG list.  Note:  Just as
1009 	 * it does above, hermon_mcg_setup_new_hdr() also fills in a portion
1010 	 * of the temporary MCG entry (the rest of which will be filled in by
1011 	 * hermon_mcg_qplist_add() below)
1012 	 */
1013 	newmcg = &state->hs_mcghdl[rsrc->hr_indx];
1014 	hermon_mcg_setup_new_hdr(newmcg, mcg_entry, gid, rsrc);
1015 
1016 	/*
1017 	 * Try to add the new QP number to the list.  This routine fills in
1018 	 * the final necessary pieces of the temporary MCG.  The
1019 	 * "mcg_entry_qplist" pointer is used to point to the portion of the
1020 	 * temporary MCG that holds the QP numbers.  If we fail here, we
1021 	 * must undo the previous resource allocation.
1022 	 *
1023 	 * Note: hermon_mcg_qplist_add() can return SUCCESS if it already
1024 	 * found the QP in the list.  In this case, the QP is not added on to
1025 	 * the list again.  Check the flag 'qp_found' if this value is needed
1026 	 * to be known.
1027 	 */
1028 	status = hermon_mcg_qplist_add(state, newmcg, mcg_entry_qplist, qp,
1029 	    &qp_found);
1030 	if (status != DDI_SUCCESS) {
1031 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1032 		hermon_rsrc_free(state, &rsrc);
1033 		mutex_exit(&state->hs_mcglock);
1034 		return (status);
1035 	}
1036 	mcg_entry->member_cnt = (newmcg->mcg_num_qps + 1);
1037 	    /* set the member count */
1038 
1039 	/*
1040 	 * Once the temporary MCG has been updated, write the entry into the
1041 	 * appropriate location in the Hermon MCG entry table.  If this is
1042 	 * successful, then we need to chain the previous entry to this one.
1043 	 * Note: In general, this operation shouldn't fail.  If it does, then
1044 	 * it is an indication that something (probably in HW, but maybe in
1045 	 * SW) has gone seriously wrong.
1046 	 */
1047 	status = hermon_write_mgm_cmd_post(state, mcg_entry, rsrc->hr_indx,
1048 	    HERMON_CMD_NOSLEEP_SPIN);
1049 	if (status != HERMON_CMD_SUCCESS) {
1050 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1051 		hermon_rsrc_free(state, &rsrc);
1052 		mutex_exit(&state->hs_mcglock);
1053 		HERMON_WARNING(state, "failed to write MCG entry");
1054 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1055 		    status);
1056 		if (status == HERMON_CMD_INVALID_STATUS) {
1057 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1058 		}
1059 		return (ibc_get_ci_failure(0));
1060 	}
1061 
1062 	/*
1063 	 * Now read the current MCG entry (the one previously at the end of
1064 	 * hash chain) into the temporary MCG.  We are going to update its
1065 	 * "next_gid_indx" now and write the entry back to the MCG table.
1066 	 * Note:  In general, this operation shouldn't fail.  If it does, then
1067 	 * it is an indication that something (probably in HW, but maybe in SW)
1068 	 * has gone seriously wrong.  We will free up the MCG entry resource,
1069 	 * but we will not undo the previously written MCG entry in the HW.
1070 	 * This is OK, though, because the MCG entry is not currently attached
1071 	 * to any hash chain.
1072 	 */
1073 	status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
1074 	    HERMON_CMD_NOSLEEP_SPIN);
1075 	if (status != HERMON_CMD_SUCCESS) {
1076 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1077 		hermon_rsrc_free(state, &rsrc);
1078 		mutex_exit(&state->hs_mcglock);
1079 		HERMON_WARNING(state, "failed to read MCG entry");
1080 		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
1081 		    status);
1082 		if (status == HERMON_CMD_INVALID_STATUS) {
1083 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1084 		}
1085 		return (ibc_get_ci_failure(0));
1086 	}
1087 
1088 	/*
1089 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1090 	 * and attempt to write the entry back into the Hermon MCG table.  If
1091 	 * this succeeds, then we update the "shadow" list to reflect the
1092 	 * change, drop the lock, and return success.  Note:  In general, this
1093 	 * operation shouldn't fail.  If it does, then it is an indication
1094 	 * that something (probably in HW, but maybe in SW) has gone seriously
1095 	 * wrong.  Just as we do above, we will free up the MCG entry resource,
1096 	 * but we will not try to undo the previously written MCG entry.  This
1097 	 * is OK, though, because (since we failed here to update the end of
1098 	 * the chain) that other entry is not currently attached to any chain.
1099 	 */
1100 	mcg_entry->next_gid_indx = rsrc->hr_indx;
1101 	status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
1102 	    HERMON_CMD_NOSLEEP_SPIN);
1103 	if (status != HERMON_CMD_SUCCESS) {
1104 		bzero(newmcg, sizeof (struct hermon_sw_mcg_list_s));
1105 		hermon_rsrc_free(state, &rsrc);
1106 		mutex_exit(&state->hs_mcglock);
1107 		HERMON_WARNING(state, "failed to write MCG entry");
1108 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1109 		    status);
1110 		if (status == HERMON_CMD_INVALID_STATUS) {
1111 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1112 		}
1113 		return (ibc_get_ci_failure(0));
1114 	}
1115 	mcg = &state->hs_mcghdl[end_indx];
1116 	mcg->mcg_next_indx = rsrc->hr_indx;
1117 
1118 	/*
1119 	 * Now that we know all the Hermon firmware accesses have been
1120 	 * successful, we update the new "shadow" MCG entry by incrementing
1121 	 * the "number of attached QPs" count.  Then we drop the lock and
1122 	 * return success.
1123 	 */
1124 	newmcg->mcg_num_qps++;
1125 
1126 	/*
1127 	 * Increment the refcnt for this QP.  Because the QP
1128 	 * was added to this MCG, the refcnt must be
1129 	 * incremented.
1130 	 */
1131 	hermon_qp_mcg_refcnt_inc(qp);
1132 
1133 	mutex_exit(&state->hs_mcglock);
1134 	return (DDI_SUCCESS);
1135 }
1136 
1137 
1138 /*
1139  * hermon_mcg_detach()
1140  *    Context: Can be called only from user or kernel context.
1141  */
1142 int
1143 hermon_mcg_detach(hermon_state_t *state, hermon_qphdl_t qp, ib_gid_t gid,
1144     ib_lid_t lid)
1145 {
1146 	hermon_hw_mcg_t		*mcg_entry;
1147 	hermon_hw_mcg_qp_list_t	*mcg_entry_qplist;
1148 	hermon_mcghdl_t		mcg;
1149 	uint64_t		mgid_hash;
1150 	uint32_t		end_indx, prev_indx;
1151 	int			status;
1152 
1153 	/*
1154 	 * Check for invalid Multicast DLID.  Specifically, all Multicast
1155 	 * LIDs should be within a well defined range.  If the specified LID
1156 	 * is outside of that range, then return an error.
1157 	 */
1158 	if (hermon_mlid_is_valid(lid) == 0) {
1159 		return (IBT_MC_MLID_INVALID);
1160 	}
1161 
1162 	/*
1163 	 * Compute the MGID hash value.  As described above, the MCG table is
1164 	 * arranged as a number of separate hash chains.  This operation
1165 	 * converts the specified MGID into the starting index of an entry in
1166 	 * the hash table (i.e. the index for the start of the appropriate
1167 	 * hash chain).  Subsequent operations below will walk the chain
1168 	 * searching for a matching entry from which to attempt to remove
1169 	 * the specified QP.
1170 	 */
1171 	status = hermon_mgid_hash_cmd_post(state, gid.gid_prefix, gid.gid_guid,
1172 	    &mgid_hash, HERMON_SLEEPFLAG_FOR_CONTEXT());
1173 	if (status != HERMON_CMD_SUCCESS) {
1174 		cmn_err(CE_CONT, "Hermon: MGID_HASH command failed: %08x\n",
1175 		    status);
1176 		if (status == HERMON_CMD_INVALID_STATUS) {
1177 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1178 		}
1179 		return (ibc_get_ci_failure(0));
1180 	}
1181 
1182 	/*
1183 	 * Grab the multicast group mutex.  Then grab the pre-allocated
1184 	 * temporary buffer used for holding and/or modifying MCG entries.
1185 	 */
1186 	mutex_enter(&state->hs_mcglock);
1187 	mcg_entry = state->hs_mcgtmp;
1188 	mcg_entry_qplist = HERMON_MCG_GET_QPLIST_PTR(mcg_entry);
1189 
1190 	/*
1191 	 * Walk through the array of MCG entries starting at "mgid_hash".
1192 	 * Try to find an MCG entry with a matching MGID.  The
1193 	 * hermon_mcg_walk_mgid_hash() routine walks the list and returns an
1194 	 * index into the MCG table.  The entry at this index is checked to
1195 	 * determine whether it is a match or not.  If it is a match, then
1196 	 * we continue on to attempt to remove the QP from the MCG.  If it
1197 	 * is not a match (or not a valid MCG entry), then we return an error.
1198 	 */
1199 	end_indx = hermon_mcg_walk_mgid_hash(state, mgid_hash, gid, &prev_indx);
1200 	mcg	 = &state->hs_mcghdl[end_indx];
1201 
1202 	/*
1203 	 * If MGID == 0 (the hash chain is empty) or if the specified MGID
1204 	 * does not match the MGID in the current entry, then return
1205 	 * IBT_MC_MGID_INVALID (to indicate that the specified MGID is not
1206 	 * valid).
1207 	 */
1208 	if (((mcg->mcg_mgid_h == 0) && (mcg->mcg_mgid_l == 0)) ||
1209 	    ((mcg->mcg_mgid_h != gid.gid_prefix) ||
1210 	    (mcg->mcg_mgid_l != gid.gid_guid))) {
1211 		mutex_exit(&state->hs_mcglock);
1212 		return (IBT_MC_MGID_INVALID);
1213 	}
1214 
1215 	/*
1216 	 * Read the current MCG entry into the temporary MCG.  Note: In
1217 	 * general, this operation shouldn't fail.  If it does, then it is
1218 	 * an indication that something (probably in HW, but maybe in SW)
1219 	 * has gone seriously wrong.
1220 	 */
1221 	status = hermon_read_mgm_cmd_post(state, mcg_entry, end_indx,
1222 	    HERMON_CMD_NOSLEEP_SPIN);
1223 	if (status != HERMON_CMD_SUCCESS) {
1224 		mutex_exit(&state->hs_mcglock);
1225 		HERMON_WARNING(state, "failed to read MCG entry");
1226 		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
1227 		    status);
1228 		if (status == HERMON_CMD_INVALID_STATUS) {
1229 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1230 		}
1231 		return (ibc_get_ci_failure(0));
1232 	}
1233 
1234 	/*
1235 	 * Search the QP number list for a match.  If a match is found, then
1236 	 * remove the entry from the QP list.  Otherwise, if no match is found,
1237 	 * return an error.
1238 	 */
1239 	status = hermon_mcg_qplist_remove(mcg, mcg_entry_qplist, qp);
1240 	if (status != DDI_SUCCESS) {
1241 		mutex_exit(&state->hs_mcglock);
1242 		return (status);
1243 	}
1244 
1245 	/*
1246 	 * Decrement the MCG count for this QP.  When the 'qp_mcg'
1247 	 * field becomes 0, then this QP is no longer a member of any
1248 	 * MCG.
1249 	 */
1250 	hermon_qp_mcg_refcnt_dec(qp);
1251 
1252 	/*
1253 	 * If the current MCG's QP number list is about to be made empty
1254 	 * ("mcg_num_qps" == 1), then remove the entry itself from the hash
1255 	 * chain.  Otherwise, just write the updated MCG entry back to the
1256 	 * hardware.  In either case, once we successfully update the hardware
1257 	 * chain, then we decrement the "shadow" list entry's "mcg_num_qps"
1258 	 * count (or zero out the entire "shadow" list entry) before returning
1259 	 * success.  Note:  Zeroing out the "shadow" list entry is done
1260 	 * inside of hermon_mcg_hash_list_remove().
1261 	 */
1262 	if (mcg->mcg_num_qps == 1) {
1263 
1264 		/* Remove an MCG entry from the hash chain */
1265 		status = hermon_mcg_hash_list_remove(state, end_indx, prev_indx,
1266 		    mcg_entry);
1267 		if (status != DDI_SUCCESS) {
1268 			mutex_exit(&state->hs_mcglock);
1269 			return (status);
1270 		}
1271 
1272 	} else {
1273 		/*
1274 		 * Write the updated MCG entry back to the Hermon MCG table.
1275 		 * If this succeeds, then we update the "shadow" list to
1276 		 * reflect the change (i.e. decrement the "mcg_num_qps"),
1277 		 * drop the lock, and return success.  Note:  In general,
1278 		 * this operation shouldn't fail.  If it does, then it is an
1279 		 * indication that something (probably in HW, but maybe in SW)
1280 		 * has gone seriously wrong.
1281 		 */
1282 		mcg_entry->member_cnt = (mcg->mcg_num_qps - 1);
1283 		status = hermon_write_mgm_cmd_post(state, mcg_entry, end_indx,
1284 		    HERMON_CMD_NOSLEEP_SPIN);
1285 		if (status != HERMON_CMD_SUCCESS) {
1286 			mutex_exit(&state->hs_mcglock);
1287 			HERMON_WARNING(state, "failed to write MCG entry");
1288 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
1289 			    "%08x\n", status);
1290 			if (status == HERMON_CMD_INVALID_STATUS) {
1291 				hermon_fm_ereport(state, HCA_SYS_ERR,
1292 				    HCA_ERR_SRV_LOST);
1293 			}
1294 			return (ibc_get_ci_failure(0));
1295 		}
1296 		mcg->mcg_num_qps--;
1297 	}
1298 
1299 	mutex_exit(&state->hs_mcglock);
1300 	return (DDI_SUCCESS);
1301 }
1302 
1303 /*
1304  * hermon_qp_mcg_refcnt_inc()
1305  *    Context: Can be called from interrupt or base context.
1306  */
1307 static void
1308 hermon_qp_mcg_refcnt_inc(hermon_qphdl_t qp)
1309 {
1310 	/* Increment the QP's MCG reference count */
1311 	mutex_enter(&qp->qp_lock);
1312 	qp->qp_mcg_refcnt++;
1313 	mutex_exit(&qp->qp_lock);
1314 }
1315 
1316 
1317 /*
1318  * hermon_qp_mcg_refcnt_dec()
1319  *    Context: Can be called from interrupt or base context.
1320  */
1321 static void
1322 hermon_qp_mcg_refcnt_dec(hermon_qphdl_t qp)
1323 {
1324 	/* Decrement the QP's MCG reference count */
1325 	mutex_enter(&qp->qp_lock);
1326 	qp->qp_mcg_refcnt--;
1327 	mutex_exit(&qp->qp_lock);
1328 }
1329 
1330 
1331 /*
1332  * hermon_mcg_qplist_add()
1333  *    Context: Can be called from interrupt or base context.
1334  */
1335 static int
1336 hermon_mcg_qplist_add(hermon_state_t *state, hermon_mcghdl_t mcg,
1337     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp,
1338     uint_t *qp_found)
1339 {
1340 	uint_t		qplist_indx;
1341 
1342 	ASSERT(MUTEX_HELD(&state->hs_mcglock));
1343 
1344 	qplist_indx = mcg->mcg_num_qps;
1345 
1346 	/*
1347 	 * Determine if we have exceeded the maximum number of QP per
1348 	 * multicast group.  If we have, then return an error
1349 	 */
1350 	if (qplist_indx >= state->hs_cfg_profile->cp_num_qp_per_mcg) {
1351 		return (IBT_HCA_MCG_QP_EXCEEDED);
1352 	}
1353 
1354 	/*
1355 	 * Determine if the QP is already attached to this MCG table.  If it
1356 	 * is, then we break out and treat this operation as a NO-OP
1357 	 */
1358 	for (qplist_indx = 0; qplist_indx < mcg->mcg_num_qps;
1359 	    qplist_indx++) {
1360 		if (mcg_qplist[qplist_indx].qpn == qp->qp_qpnum) {
1361 			break;
1362 		}
1363 	}
1364 
1365 	/*
1366 	 * If the QP was already on the list, set 'qp_found' to TRUE.  We still
1367 	 * return SUCCESS in this case, but the qplist will not have been
1368 	 * updated because the QP was already on the list.
1369 	 */
1370 	if (qplist_indx < mcg->mcg_num_qps) {
1371 		*qp_found = 1;
1372 	} else {
1373 		/*
1374 		 * Otherwise, append the new QP number to the end of the
1375 		 * current QP list.  Note: We will increment the "mcg_num_qps"
1376 		 * field on the "shadow" MCG list entry later (after we know
1377 		 * that all necessary Hermon firmware accesses have been
1378 		 * successful).
1379 		 *
1380 		 * Set 'qp_found' to 0 so we know the QP was added on to the
1381 		 * list for sure.
1382 		 */
1383 		mcg_qplist[qplist_indx].qpn =
1384 		    (qp->qp_qpnum | HERMON_MCG_QPN_BLOCK_LB);
1385 		*qp_found = 0;
1386 	}
1387 
1388 	return (DDI_SUCCESS);
1389 }
1390 
1391 
1392 
1393 /*
1394  * hermon_mcg_qplist_remove()
1395  *    Context: Can be called from interrupt or base context.
1396  */
1397 static int
1398 hermon_mcg_qplist_remove(hermon_mcghdl_t mcg,
1399     hermon_hw_mcg_qp_list_t *mcg_qplist, hermon_qphdl_t qp)
1400 {
1401 	uint_t		i, qplist_indx;
1402 
1403 	/*
1404 	 * Search the MCG QP list for a matching QPN.  When
1405 	 * it's found, we swap the last entry with the current
1406 	 * one, set the last entry to zero, decrement the last
1407 	 * entry, and return.  If it's not found, then it's
1408 	 * and error.
1409 	 */
1410 	qplist_indx = mcg->mcg_num_qps;
1411 	for (i = 0; i < qplist_indx; i++) {
1412 		if (mcg_qplist[i].qpn == qp->qp_qpnum) {
1413 			mcg_qplist[i] = mcg_qplist[qplist_indx - 1];
1414 			mcg_qplist[qplist_indx - 1].qpn = 0;
1415 
1416 			return (DDI_SUCCESS);
1417 		}
1418 	}
1419 
1420 	return (IBT_QP_HDL_INVALID);
1421 }
1422 
1423 
1424 /*
1425  * hermon_mcg_walk_mgid_hash()
1426  *    Context: Can be called from interrupt or base context.
1427  */
1428 static uint_t
1429 hermon_mcg_walk_mgid_hash(hermon_state_t *state, uint64_t start_indx,
1430     ib_gid_t mgid, uint_t *p_indx)
1431 {
1432 	hermon_mcghdl_t	curr_mcghdl;
1433 	uint_t		curr_indx, prev_indx;
1434 
1435 	ASSERT(MUTEX_HELD(&state->hs_mcglock));
1436 
1437 	/* Start at the head of the hash chain */
1438 	curr_indx   = (uint_t)start_indx;
1439 	prev_indx   = curr_indx;
1440 	curr_mcghdl = &state->hs_mcghdl[curr_indx];
1441 
1442 	/* If the first entry in the chain has MGID == 0, then stop */
1443 	if ((curr_mcghdl->mcg_mgid_h == 0) &&
1444 	    (curr_mcghdl->mcg_mgid_l == 0)) {
1445 		goto end_mgid_hash_walk;
1446 	}
1447 
1448 	/* If the first entry in the chain matches the MGID, then stop */
1449 	if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1450 	    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1451 		goto end_mgid_hash_walk;
1452 	}
1453 
1454 	/* Otherwise, walk the hash chain looking for a match */
1455 	while (curr_mcghdl->mcg_next_indx != 0) {
1456 		prev_indx = curr_indx;
1457 		curr_indx = curr_mcghdl->mcg_next_indx;
1458 		curr_mcghdl = &state->hs_mcghdl[curr_indx];
1459 
1460 		if ((curr_mcghdl->mcg_mgid_h == mgid.gid_prefix) &&
1461 		    (curr_mcghdl->mcg_mgid_l == mgid.gid_guid)) {
1462 			break;
1463 		}
1464 	}
1465 
1466 end_mgid_hash_walk:
1467 	/*
1468 	 * If necessary, return the index of the previous entry too.  This
1469 	 * is primarily used for detaching a QP from a multicast group.  It
1470 	 * may be necessary, in that case, to delete an MCG entry from the
1471 	 * hash chain and having the index of the previous entry is helpful.
1472 	 */
1473 	if (p_indx != NULL) {
1474 		*p_indx = prev_indx;
1475 	}
1476 	return (curr_indx);
1477 }
1478 
1479 
1480 /*
1481  * hermon_mcg_setup_new_hdr()
1482  *    Context: Can be called from interrupt or base context.
1483  */
1484 static void
1485 hermon_mcg_setup_new_hdr(hermon_mcghdl_t mcg, hermon_hw_mcg_t *mcg_hdr,
1486     ib_gid_t mgid, hermon_rsrc_t *mcg_rsrc)
1487 {
1488 	/*
1489 	 * Fill in the fields of the "shadow" entry used by software
1490 	 * to track MCG hardware entry
1491 	 */
1492 	mcg->mcg_mgid_h	   = mgid.gid_prefix;
1493 	mcg->mcg_mgid_l	   = mgid.gid_guid;
1494 	mcg->mcg_rsrcp	   = mcg_rsrc;
1495 	mcg->mcg_next_indx = 0;
1496 	mcg->mcg_num_qps   = 0;
1497 
1498 	/*
1499 	 * Fill the header fields of the MCG entry (in the temporary copy)
1500 	 */
1501 	mcg_hdr->mgid_h		= mgid.gid_prefix;
1502 	mcg_hdr->mgid_l		= mgid.gid_guid;
1503 	mcg_hdr->next_gid_indx	= 0;
1504 }
1505 
1506 
1507 /*
1508  * hermon_mcg_hash_list_remove()
1509  *    Context: Can be called only from user or kernel context.
1510  */
1511 static int
1512 hermon_mcg_hash_list_remove(hermon_state_t *state, uint_t curr_indx,
1513     uint_t prev_indx, hermon_hw_mcg_t *mcg_entry)
1514 {
1515 	hermon_mcghdl_t		curr_mcg, prev_mcg, next_mcg;
1516 	uint_t			next_indx;
1517 	int			status;
1518 
1519 	/* Get the pointer to "shadow" list for current entry */
1520 	curr_mcg = &state->hs_mcghdl[curr_indx];
1521 
1522 	/*
1523 	 * If this is the first entry on a hash chain, then attempt to replace
1524 	 * the entry with the next entry on the chain.  If there are no
1525 	 * subsequent entries on the chain, then this is the only entry and
1526 	 * should be invalidated.
1527 	 */
1528 	if (curr_indx == prev_indx) {
1529 
1530 		/*
1531 		 * If this is the only entry on the chain, then invalidate it.
1532 		 * Note:  Invalidating an MCG entry means writing all zeros
1533 		 * to the entry.  This is only necessary for those MCG
1534 		 * entries that are the "head" entries of the individual hash
1535 		 * chains.  Regardless of whether this operation returns
1536 		 * success or failure, return that result to the caller.
1537 		 */
1538 		next_indx = curr_mcg->mcg_next_indx;
1539 		if (next_indx == 0) {
1540 			status = hermon_mcg_entry_invalidate(state, mcg_entry,
1541 			    curr_indx);
1542 			bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
1543 			return (status);
1544 		}
1545 
1546 		/*
1547 		 * Otherwise, this is just the first entry on the chain, so
1548 		 * grab the next one
1549 		 */
1550 		next_mcg = &state->hs_mcghdl[next_indx];
1551 
1552 		/*
1553 		 * Read the next MCG entry into the temporary MCG.  Note:
1554 		 * In general, this operation shouldn't fail.  If it does,
1555 		 * then it is an indication that something (probably in HW,
1556 		 * but maybe in SW) has gone seriously wrong.
1557 		 */
1558 		status = hermon_read_mgm_cmd_post(state, mcg_entry, next_indx,
1559 		    HERMON_CMD_NOSLEEP_SPIN);
1560 		if (status != HERMON_CMD_SUCCESS) {
1561 			HERMON_WARNING(state, "failed to read MCG entry");
1562 			cmn_err(CE_CONT, "Hermon: READ_MGM command failed: "
1563 			    "%08x\n", status);
1564 			if (status == HERMON_CMD_INVALID_STATUS) {
1565 				hermon_fm_ereport(state, HCA_SYS_ERR,
1566 				    HCA_ERR_SRV_LOST);
1567 			}
1568 			return (ibc_get_ci_failure(0));
1569 		}
1570 
1571 		/*
1572 		 * Copy/Write the temporary MCG back to the hardware MCG list
1573 		 * using the current index.  This essentially removes the
1574 		 * current MCG entry from the list by writing over it with
1575 		 * the next one.  If this is successful, then we can do the
1576 		 * same operation for the "shadow" list.  And we can also
1577 		 * free up the Hermon MCG entry resource that was associated
1578 		 * with the (old) next entry.  Note:  In general, this
1579 		 * operation shouldn't fail.  If it does, then it is an
1580 		 * indication that something (probably in HW, but maybe in SW)
1581 		 * has gone seriously wrong.
1582 		 */
1583 		status = hermon_write_mgm_cmd_post(state, mcg_entry, curr_indx,
1584 		    HERMON_CMD_NOSLEEP_SPIN);
1585 		if (status != HERMON_CMD_SUCCESS) {
1586 			HERMON_WARNING(state, "failed to write MCG entry");
1587 			cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: "
1588 			    "%08x\n", status);
1589 			if (status == HERMON_CMD_INVALID_STATUS) {
1590 				hermon_fm_ereport(state, HCA_SYS_ERR,
1591 				    HCA_ERR_SRV_LOST);
1592 			}
1593 			return (ibc_get_ci_failure(0));
1594 		}
1595 
1596 		/*
1597 		 * Copy all the software tracking information from the next
1598 		 * entry on the "shadow" MCG list into the current entry on
1599 		 * the list.  Then invalidate (zero out) the other "shadow"
1600 		 * list entry.
1601 		 */
1602 		bcopy(next_mcg, curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
1603 		bzero(next_mcg, sizeof (struct hermon_sw_mcg_list_s));
1604 
1605 		/*
1606 		 * Free up the Hermon MCG entry resource used by the "next"
1607 		 * MCG entry.  That resource is no longer needed by any
1608 		 * MCG entry which is first on a hash chain (like the "next"
1609 		 * entry has just become).
1610 		 */
1611 		hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1612 
1613 		return (DDI_SUCCESS);
1614 	}
1615 
1616 	/*
1617 	 * Else if this is the last entry on the hash chain (or a middle
1618 	 * entry, then we update the previous entry's "next_gid_index" field
1619 	 * to make it point instead to the next entry on the chain.  By
1620 	 * skipping over the removed entry in this way, we can then free up
1621 	 * any resources associated with the current entry.  Note:  We don't
1622 	 * need to invalidate the "skipped over" hardware entry because it
1623 	 * will no be longer connected to any hash chains, and if/when it is
1624 	 * finally re-used, it will be written with entirely new values.
1625 	 */
1626 
1627 	/*
1628 	 * Read the next MCG entry into the temporary MCG.  Note:  In general,
1629 	 * this operation shouldn't fail.  If it does, then it is an
1630 	 * indication that something (probably in HW, but maybe in SW) has
1631 	 * gone seriously wrong.
1632 	 */
1633 	status = hermon_read_mgm_cmd_post(state, mcg_entry, prev_indx,
1634 	    HERMON_CMD_NOSLEEP_SPIN);
1635 	if (status != HERMON_CMD_SUCCESS) {
1636 		HERMON_WARNING(state, "failed to read MCG entry");
1637 		cmn_err(CE_CONT, "Hermon: READ_MGM command failed: %08x\n",
1638 		    status);
1639 		if (status == HERMON_CMD_INVALID_STATUS) {
1640 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1641 		}
1642 		return (ibc_get_ci_failure(0));
1643 	}
1644 
1645 	/*
1646 	 * Finally, we update the "next_gid_indx" field in the temporary MCG
1647 	 * and attempt to write the entry back into the Hermon MCG table.  If
1648 	 * this succeeds, then we update the "shadow" list to reflect the
1649 	 * change, free up the Hermon MCG entry resource that was associated
1650 	 * with the current entry, and return success.  Note:  In general,
1651 	 * this operation shouldn't fail.  If it does, then it is an indication
1652 	 * that something (probably in HW, but maybe in SW) has gone seriously
1653 	 * wrong.
1654 	 */
1655 	mcg_entry->next_gid_indx = curr_mcg->mcg_next_indx;
1656 	status = hermon_write_mgm_cmd_post(state, mcg_entry, prev_indx,
1657 	    HERMON_CMD_NOSLEEP_SPIN);
1658 	if (status != HERMON_CMD_SUCCESS) {
1659 		HERMON_WARNING(state, "failed to write MCG entry");
1660 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1661 		    status);
1662 		if (status == HERMON_CMD_INVALID_STATUS) {
1663 			hermon_fm_ereport(state, HCA_SYS_ERR,
1664 			    HCA_ERR_SRV_LOST);
1665 		}
1666 		return (ibc_get_ci_failure(0));
1667 	}
1668 
1669 	/*
1670 	 * Get the pointer to the "shadow" MCG list entry for the previous
1671 	 * MCG.  Update its "mcg_next_indx" to point to the next entry
1672 	 * the one after the current entry. Note:  This next index may be
1673 	 * zero, indicating the end of the list.
1674 	 */
1675 	prev_mcg = &state->hs_mcghdl[prev_indx];
1676 	prev_mcg->mcg_next_indx = curr_mcg->mcg_next_indx;
1677 
1678 	/*
1679 	 * Free up the Hermon MCG entry resource used by the current entry.
1680 	 * This resource is no longer needed because the chain now skips over
1681 	 * the current entry.  Then invalidate (zero out) the current "shadow"
1682 	 * list entry.
1683 	 */
1684 	hermon_rsrc_free(state, &curr_mcg->mcg_rsrcp);
1685 	bzero(curr_mcg, sizeof (struct hermon_sw_mcg_list_s));
1686 
1687 	return (DDI_SUCCESS);
1688 }
1689 
1690 
1691 /*
1692  * hermon_mcg_entry_invalidate()
1693  *    Context: Can be called only from user or kernel context.
1694  */
1695 static int
1696 hermon_mcg_entry_invalidate(hermon_state_t *state, hermon_hw_mcg_t *mcg_entry,
1697     uint_t indx)
1698 {
1699 	int		status;
1700 
1701 	/*
1702 	 * Invalidate the hardware MCG entry by zeroing out this temporary
1703 	 * MCG and writing it the the hardware.  Note: In general, this
1704 	 * operation shouldn't fail.  If it does, then it is an indication
1705 	 * that something (probably in HW, but maybe in SW) has gone seriously
1706 	 * wrong.
1707 	 */
1708 	bzero(mcg_entry, HERMON_MCGMEM_SZ(state));
1709 	status = hermon_write_mgm_cmd_post(state, mcg_entry, indx,
1710 	    HERMON_CMD_NOSLEEP_SPIN);
1711 	if (status != HERMON_CMD_SUCCESS) {
1712 		HERMON_WARNING(state, "failed to write MCG entry");
1713 		cmn_err(CE_CONT, "Hermon: WRITE_MGM command failed: %08x\n",
1714 		    status);
1715 		if (status == HERMON_CMD_INVALID_STATUS) {
1716 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1717 		}
1718 		return (ibc_get_ci_failure(0));
1719 	}
1720 
1721 	return (DDI_SUCCESS);
1722 }
1723 
1724 
1725 /*
1726  * hermon_mgid_is_valid()
1727  *    Context: Can be called from interrupt or base context.
1728  */
1729 static int
1730 hermon_mgid_is_valid(ib_gid_t gid)
1731 {
1732 	uint_t		topbits, flags, scope;
1733 
1734 	/*
1735 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1736 	 * "multicast GID" must have its top eight bits set to all ones
1737 	 */
1738 	topbits = (gid.gid_prefix >> HERMON_MCG_TOPBITS_SHIFT) &
1739 	    HERMON_MCG_TOPBITS_MASK;
1740 	if (topbits != HERMON_MCG_TOPBITS) {
1741 		return (0);
1742 	}
1743 
1744 	/*
1745 	 * The next 4 bits are the "flag" bits.  These are valid only
1746 	 * if they are "0" (which correspond to permanently assigned/
1747 	 * "well-known" multicast GIDs) or "1" (for so-called "transient"
1748 	 * multicast GIDs).  All other values are reserved.
1749 	 */
1750 	flags = (gid.gid_prefix >> HERMON_MCG_FLAGS_SHIFT) &
1751 	    HERMON_MCG_FLAGS_MASK;
1752 	if (!((flags == HERMON_MCG_FLAGS_PERM) ||
1753 	    (flags == HERMON_MCG_FLAGS_NONPERM))) {
1754 		return (0);
1755 	}
1756 
1757 	/*
1758 	 * The next 4 bits are the "scope" bits.  These are valid only
1759 	 * if they are "2" (Link-local), "5" (Site-local), "8"
1760 	 * (Organization-local) or "E" (Global).  All other values
1761 	 * are reserved (or currently unassigned).
1762 	 */
1763 	scope = (gid.gid_prefix >> HERMON_MCG_SCOPE_SHIFT) &
1764 	    HERMON_MCG_SCOPE_MASK;
1765 	if (!((scope == HERMON_MCG_SCOPE_LINKLOC) ||
1766 	    (scope == HERMON_MCG_SCOPE_SITELOC)	 ||
1767 	    (scope == HERMON_MCG_SCOPE_ORGLOC)	 ||
1768 	    (scope == HERMON_MCG_SCOPE_GLOBAL))) {
1769 		return (0);
1770 	}
1771 
1772 	/*
1773 	 * If it passes all of the above checks, then we will consider it
1774 	 * a valid multicast GID.
1775 	 */
1776 	return (1);
1777 }
1778 
1779 
1780 /*
1781  * hermon_mlid_is_valid()
1782  *    Context: Can be called from interrupt or base context.
1783  */
1784 static int
1785 hermon_mlid_is_valid(ib_lid_t lid)
1786 {
1787 	/*
1788 	 * According to IBA 1.1 specification (section 4.1.1) a valid
1789 	 * "multicast DLID" must be between 0xC000 and 0xFFFE.
1790 	 */
1791 	if ((lid < IB_LID_MC_FIRST) || (lid > IB_LID_MC_LAST)) {
1792 		return (0);
1793 	}
1794 
1795 	return (1);
1796 }
1797 
1798 
1799 /*
1800  * hermon_pd_alloc()
1801  *    Context: Can be called only from user or kernel context.
1802  */
1803 int
1804 hermon_pd_alloc(hermon_state_t *state, hermon_pdhdl_t *pdhdl, uint_t sleepflag)
1805 {
1806 	hermon_rsrc_t	*rsrc;
1807 	hermon_pdhdl_t	pd;
1808 	int		status;
1809 
1810 	/*
1811 	 * Allocate the software structure for tracking the protection domain
1812 	 * (i.e. the Hermon Protection Domain handle).  By default each PD
1813 	 * structure will have a unique PD number assigned to it.  All that
1814 	 * is necessary is for software to initialize the PD reference count
1815 	 * (to zero) and return success.
1816 	 */
1817 	status = hermon_rsrc_alloc(state, HERMON_PDHDL, 1, sleepflag, &rsrc);
1818 	if (status != DDI_SUCCESS) {
1819 		return (IBT_INSUFF_RESOURCE);
1820 	}
1821 	pd = (hermon_pdhdl_t)rsrc->hr_addr;
1822 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1823 
1824 	pd->pd_refcnt = 0;
1825 	*pdhdl = pd;
1826 
1827 	return (DDI_SUCCESS);
1828 }
1829 
1830 
1831 /*
1832  * hermon_pd_free()
1833  *    Context: Can be called only from user or kernel context.
1834  */
1835 int
1836 hermon_pd_free(hermon_state_t *state, hermon_pdhdl_t *pdhdl)
1837 {
1838 	hermon_rsrc_t	*rsrc;
1839 	hermon_pdhdl_t	pd;
1840 
1841 	/*
1842 	 * Pull all the necessary information from the Hermon Protection Domain
1843 	 * handle.  This is necessary here because the resource for the
1844 	 * PD is going to be freed up as part of this operation.
1845 	 */
1846 	pd   = *pdhdl;
1847 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pd))
1848 	rsrc = pd->pd_rsrcp;
1849 
1850 	/*
1851 	 * Check the PD reference count.  If the reference count is non-zero,
1852 	 * then it means that this protection domain is still referenced by
1853 	 * some memory region, queue pair, address handle, or other IB object
1854 	 * If it is non-zero, then return an error.  Otherwise, free the
1855 	 * Hermon resource and return success.
1856 	 */
1857 	if (pd->pd_refcnt != 0) {
1858 		return (IBT_PD_IN_USE);
1859 	}
1860 
1861 	/* Free the Hermon Protection Domain handle */
1862 	hermon_rsrc_free(state, &rsrc);
1863 
1864 	/* Set the pdhdl pointer to NULL and return success */
1865 	*pdhdl = (hermon_pdhdl_t)NULL;
1866 
1867 	return (DDI_SUCCESS);
1868 }
1869 
1870 
1871 /*
1872  * hermon_pd_refcnt_inc()
1873  *    Context: Can be called from interrupt or base context.
1874  */
1875 void
1876 hermon_pd_refcnt_inc(hermon_pdhdl_t pd)
1877 {
1878 	/* Increment the protection domain's reference count */
1879 	atomic_inc_32(&pd->pd_refcnt);
1880 }
1881 
1882 
1883 /*
1884  * hermon_pd_refcnt_dec()
1885  *    Context: Can be called from interrupt or base context.
1886  */
1887 void
1888 hermon_pd_refcnt_dec(hermon_pdhdl_t pd)
1889 {
1890 	/* Decrement the protection domain's reference count */
1891 	atomic_dec_32(&pd->pd_refcnt);
1892 }
1893 
1894 
1895 /*
1896  * hermon_port_query()
1897  *    Context: Can be called only from user or kernel context.
1898  */
1899 int
1900 hermon_port_query(hermon_state_t *state, uint_t port, ibt_hca_portinfo_t *pi)
1901 {
1902 	sm_portinfo_t		portinfo;
1903 	sm_guidinfo_t		guidinfo;
1904 	sm_pkey_table_t		pkeytable;
1905 	ib_gid_t		*sgid;
1906 	uint_t			sgid_max, pkey_max, tbl_size;
1907 	int			i, j, indx, status;
1908 	ib_pkey_t		*pkeyp;
1909 	ib_guid_t		*guidp;
1910 
1911 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pi))
1912 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1913 
1914 	/* Validate that specified port number is legal */
1915 	if (!hermon_portnum_is_valid(state, port)) {
1916 		return (IBT_HCA_PORT_INVALID);
1917 	}
1918 	pkeyp = state->hs_pkey[port - 1];
1919 	guidp = state->hs_guid[port - 1];
1920 
1921 	/*
1922 	 * We use the Hermon MAD_IFC command to post a GetPortInfo MAD
1923 	 * to the firmware (for the specified port number).  This returns
1924 	 * a full PortInfo MAD (in "portinfo") which we subsequently
1925 	 * parse to fill in the "ibt_hca_portinfo_t" structure returned
1926 	 * to the IBTF.
1927 	 */
1928 	status = hermon_getportinfo_cmd_post(state, port,
1929 	    HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
1930 	if (status != HERMON_CMD_SUCCESS) {
1931 		cmn_err(CE_CONT, "Hermon: GetPortInfo (port %02d) command "
1932 		    "failed: %08x\n", port, status);
1933 		if (status == HERMON_CMD_INVALID_STATUS) {
1934 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
1935 		}
1936 		return (ibc_get_ci_failure(0));
1937 	}
1938 
1939 	/*
1940 	 * Parse the PortInfo MAD and fill in the IBTF structure
1941 	 */
1942 	pi->p_base_lid		= portinfo.LID;
1943 	pi->p_qkey_violations	= portinfo.Q_KeyViolations;
1944 	pi->p_pkey_violations	= portinfo.P_KeyViolations;
1945 	pi->p_sm_sl		= portinfo.MasterSMSL;
1946 	pi->p_sm_lid		= portinfo.MasterSMLID;
1947 	pi->p_linkstate		= portinfo.PortState;
1948 	pi->p_port_num		= portinfo.LocalPortNum;
1949 	pi->p_phys_state	= portinfo.PortPhysicalState;
1950 	pi->p_width_supported	= portinfo.LinkWidthSupported;
1951 	pi->p_width_enabled	= portinfo.LinkWidthEnabled;
1952 	pi->p_width_active	= portinfo.LinkWidthActive;
1953 	pi->p_speed_supported	= portinfo.LinkSpeedSupported;
1954 	pi->p_speed_enabled	= portinfo.LinkSpeedEnabled;
1955 	pi->p_speed_active	= portinfo.LinkSpeedActive;
1956 	pi->p_mtu		= portinfo.MTUCap;
1957 	pi->p_lmc		= portinfo.LMC;
1958 	pi->p_max_vl		= portinfo.VLCap;
1959 	pi->p_subnet_timeout	= portinfo.SubnetTimeOut;
1960 	pi->p_msg_sz		= ((uint32_t)1 << HERMON_QP_LOG_MAX_MSGSZ);
1961 	tbl_size = state->hs_cfg_profile->cp_log_max_gidtbl;
1962 	pi->p_sgid_tbl_sz	= (1 << tbl_size);
1963 	tbl_size = state->hs_cfg_profile->cp_log_max_pkeytbl;
1964 	pi->p_pkey_tbl_sz	= (1 << tbl_size);
1965 	state->hs_sn_prefix[port - 1] = portinfo.GidPrefix;
1966 
1967 	/*
1968 	 * Convert InfiniBand-defined port capability flags to the format
1969 	 * specified by the IBTF
1970 	 */
1971 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM)
1972 		pi->p_capabilities |= IBT_PORT_CAP_SM;
1973 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SM_DISABLED)
1974 		pi->p_capabilities |= IBT_PORT_CAP_SM_DISABLED;
1975 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_SNMP_SUPPD)
1976 		pi->p_capabilities |= IBT_PORT_CAP_SNMP_TUNNEL;
1977 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_DM_SUPPD)
1978 		pi->p_capabilities |= IBT_PORT_CAP_DM;
1979 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_VM_SUPPD)
1980 		pi->p_capabilities |= IBT_PORT_CAP_VENDOR;
1981 	if (portinfo.CapabilityMask & SM_CAP_MASK_IS_CLNT_REREG_SUPPD)
1982 		pi->p_capabilities |= IBT_PORT_CAP_CLNT_REREG;
1983 
1984 	/*
1985 	 * Fill in the SGID table.  Since the only access to the Hermon
1986 	 * GID tables is through the firmware's MAD_IFC interface, we
1987 	 * post as many GetGUIDInfo MADs as necessary to read in the entire
1988 	 * contents of the SGID table (for the specified port).  Note:  The
1989 	 * GetGUIDInfo command only gets eight GUIDs per operation.  These
1990 	 * GUIDs are then appended to the GID prefix for the port (from the
1991 	 * GetPortInfo above) to form the entire SGID table.
1992 	 */
1993 	for (i = 0; i < pi->p_sgid_tbl_sz; i += 8) {
1994 		status = hermon_getguidinfo_cmd_post(state, port, i >> 3,
1995 		    HERMON_SLEEPFLAG_FOR_CONTEXT(), &guidinfo);
1996 		if (status != HERMON_CMD_SUCCESS) {
1997 			cmn_err(CE_CONT, "Hermon: GetGUIDInfo (port %02d) "
1998 			    "command failed: %08x\n", port, status);
1999 			if (status == HERMON_CMD_INVALID_STATUS) {
2000 				hermon_fm_ereport(state, HCA_SYS_ERR,
2001 				    HCA_ERR_SRV_LOST);
2002 			}
2003 			return (ibc_get_ci_failure(0));
2004 		}
2005 
2006 		/* Figure out how many of the entries are valid */
2007 		sgid_max = min((pi->p_sgid_tbl_sz - i), 8);
2008 		for (j = 0; j < sgid_max; j++) {
2009 			indx = (i + j);
2010 			sgid = &pi->p_sgid_tbl[indx];
2011 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sgid))
2012 			sgid->gid_prefix = portinfo.GidPrefix;
2013 			guidp[indx] = sgid->gid_guid =
2014 			    guidinfo.GUIDBlocks[j];
2015 		}
2016 	}
2017 
2018 	/*
2019 	 * Fill in the PKey table.  Just as for the GID tables above, the
2020 	 * only access to the Hermon PKey tables is through the firmware's
2021 	 * MAD_IFC interface.  We post as many GetPKeyTable MADs as necessary
2022 	 * to read in the entire contents of the PKey table (for the specified
2023 	 * port).  Note:  The GetPKeyTable command only gets 32 PKeys per
2024 	 * operation.
2025 	 */
2026 	for (i = 0; i < pi->p_pkey_tbl_sz; i += 32) {
2027 		status = hermon_getpkeytable_cmd_post(state, port, i,
2028 		    HERMON_SLEEPFLAG_FOR_CONTEXT(), &pkeytable);
2029 		if (status != HERMON_CMD_SUCCESS) {
2030 			cmn_err(CE_CONT, "Hermon: GetPKeyTable (port %02d) "
2031 			    "command failed: %08x\n", port, status);
2032 			if (status == HERMON_CMD_INVALID_STATUS) {
2033 				hermon_fm_ereport(state, HCA_SYS_ERR,
2034 				    HCA_ERR_SRV_LOST);
2035 			}
2036 			return (ibc_get_ci_failure(0));
2037 		}
2038 
2039 		/* Figure out how many of the entries are valid */
2040 		pkey_max = min((pi->p_pkey_tbl_sz - i), 32);
2041 		for (j = 0; j < pkey_max; j++) {
2042 			indx = (i + j);
2043 			pkeyp[indx] = pi->p_pkey_tbl[indx] =
2044 			    pkeytable.P_KeyTableBlocks[j];
2045 		}
2046 	}
2047 
2048 	return (DDI_SUCCESS);
2049 }
2050 
2051 
2052 /*
2053  * hermon_port_modify()
2054  *    Context: Can be called only from user or kernel context.
2055  */
2056 /* ARGSUSED */
2057 int
2058 hermon_port_modify(hermon_state_t *state, uint8_t port,
2059     ibt_port_modify_flags_t flags, uint8_t init_type)
2060 {
2061 	sm_portinfo_t		portinfo;
2062 	uint32_t		capmask;
2063 	int			status;
2064 	hermon_hw_set_port_t	set_port;
2065 
2066 	/*
2067 	 * Return an error if either of the unsupported flags are set
2068 	 */
2069 	if ((flags & IBT_PORT_SHUTDOWN) ||
2070 	    (flags & IBT_PORT_SET_INIT_TYPE)) {
2071 		return (IBT_NOT_SUPPORTED);
2072 	}
2073 
2074 	bzero(&set_port, sizeof (set_port));
2075 
2076 	/*
2077 	 * Determine whether we are trying to reset the QKey counter
2078 	 */
2079 	if (flags & IBT_PORT_RESET_QKEY)
2080 		set_port.rqk = 1;
2081 
2082 	/* Validate that specified port number is legal */
2083 	if (!hermon_portnum_is_valid(state, port)) {
2084 		return (IBT_HCA_PORT_INVALID);
2085 	}
2086 
2087 	/*
2088 	 * Use the Hermon MAD_IFC command to post a GetPortInfo MAD to the
2089 	 * firmware (for the specified port number).  This returns a full
2090 	 * PortInfo MAD (in "portinfo") from which we pull the current
2091 	 * capability mask.  We then modify the capability mask as directed
2092 	 * by the "pmod_flags" field, and write the updated capability mask
2093 	 * using the Hermon SET_IB command (below).
2094 	 */
2095 	status = hermon_getportinfo_cmd_post(state, port,
2096 	    HERMON_SLEEPFLAG_FOR_CONTEXT(), &portinfo);
2097 	if (status != HERMON_CMD_SUCCESS) {
2098 		if (status == HERMON_CMD_INVALID_STATUS) {
2099 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2100 		}
2101 		return (ibc_get_ci_failure(0));
2102 	}
2103 
2104 	/*
2105 	 * Convert InfiniBand-defined port capability flags to the format
2106 	 * specified by the IBTF.  Specifically, we modify the capability
2107 	 * mask based on the specified values.
2108 	 */
2109 	capmask = portinfo.CapabilityMask;
2110 
2111 	if (flags & IBT_PORT_RESET_SM)
2112 		capmask &= ~SM_CAP_MASK_IS_SM;
2113 	else if (flags & IBT_PORT_SET_SM)
2114 		capmask |= SM_CAP_MASK_IS_SM;
2115 
2116 	if (flags & IBT_PORT_RESET_SNMP)
2117 		capmask &= ~SM_CAP_MASK_IS_SNMP_SUPPD;
2118 	else if (flags & IBT_PORT_SET_SNMP)
2119 		capmask |= SM_CAP_MASK_IS_SNMP_SUPPD;
2120 
2121 	if (flags & IBT_PORT_RESET_DEVMGT)
2122 		capmask &= ~SM_CAP_MASK_IS_DM_SUPPD;
2123 	else if (flags & IBT_PORT_SET_DEVMGT)
2124 		capmask |= SM_CAP_MASK_IS_DM_SUPPD;
2125 
2126 	if (flags & IBT_PORT_RESET_VENDOR)
2127 		capmask &= ~SM_CAP_MASK_IS_VM_SUPPD;
2128 	else if (flags & IBT_PORT_SET_VENDOR)
2129 		capmask |= SM_CAP_MASK_IS_VM_SUPPD;
2130 
2131 	set_port.cap_mask = capmask;
2132 
2133 	/*
2134 	 * Use the Hermon SET_PORT command to update the capability mask and
2135 	 * (possibly) reset the QKey violation counter for the specified port.
2136 	 * Note: In general, this operation shouldn't fail.  If it does, then
2137 	 * it is an indication that something (probably in HW, but maybe in
2138 	 * SW) has gone seriously wrong.
2139 	 */
2140 	status = hermon_set_port_cmd_post(state, &set_port, port,
2141 	    HERMON_SLEEPFLAG_FOR_CONTEXT());
2142 	if (status != HERMON_CMD_SUCCESS) {
2143 		HERMON_WARNING(state, "failed to modify port capabilities");
2144 		cmn_err(CE_CONT, "Hermon: SET_IB (port %02d) command failed: "
2145 		    "%08x\n", port, status);
2146 		if (status == HERMON_CMD_INVALID_STATUS) {
2147 			hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
2148 		}
2149 		return (ibc_get_ci_failure(0));
2150 	}
2151 
2152 	return (DDI_SUCCESS);
2153 }
2154 
2155 
2156 /*
2157  * hermon_set_addr_path()
2158  *    Context: Can be called from interrupt or base context.
2159  *
2160  * Note: This routine is used for two purposes.  It is used to fill in the
2161  * Hermon UDAV fields, and it is used to fill in the address path information
2162  * for QPs.  Because the two Hermon structures are similar, common fields can
2163  * be filled in here.  Because they are different, however, we pass
2164  * an additional flag to indicate which type is being filled and do each one
2165  * uniquely
2166  */
2167 
2168 int hermon_srate_override = -1;	/* allows ease of testing */
2169 
2170 int
2171 hermon_set_addr_path(hermon_state_t *state, ibt_adds_vect_t *av,
2172     hermon_hw_addr_path_t *path, uint_t type)
2173 {
2174 	uint_t		gidtbl_sz;
2175 	hermon_hw_udav_t *udav;
2176 
2177 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2178 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2179 
2180 	udav = (hermon_hw_udav_t *)(void *)path;
2181 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*udav))
2182 	path->mlid	= av->av_src_path;
2183 	path->rlid	= av->av_dlid;
2184 
2185 	switch (av->av_srate) {
2186 	case IBT_SRATE_2:	/* 1xSDR-2.5Gb/s injection rate */
2187 		path->max_stat_rate = 7; break;
2188 	case IBT_SRATE_10:	/* 4xSDR-10.0Gb/s injection rate */
2189 		path->max_stat_rate = 8; break;
2190 	case IBT_SRATE_30:	/* 12xSDR-30Gb/s injection rate */
2191 		path->max_stat_rate = 9; break;
2192 	case IBT_SRATE_5:	/* 1xDDR-5Gb/s injection rate */
2193 		path->max_stat_rate = 10; break;
2194 	case IBT_SRATE_20:	/* 4xDDR-20Gb/s injection rate */
2195 		path->max_stat_rate = 11; break;
2196 	case IBT_SRATE_40:	/* 4xQDR-40Gb/s injection rate */
2197 		path->max_stat_rate = 12; break;
2198 	case IBT_SRATE_60:	/* 12xDDR-60Gb/s injection rate */
2199 		path->max_stat_rate = 13; break;
2200 	case IBT_SRATE_80:	/* 8xQDR-80Gb/s injection rate */
2201 		path->max_stat_rate = 14; break;
2202 	case IBT_SRATE_120:	/* 12xQDR-120Gb/s injection rate */
2203 		path->max_stat_rate = 15; break;
2204 	case IBT_SRATE_NOT_SPECIFIED:	/* Max */
2205 		path->max_stat_rate = 0; break;
2206 	default:
2207 		return (IBT_STATIC_RATE_INVALID);
2208 	}
2209 	if (hermon_srate_override != -1) /* for evaluating HCA firmware */
2210 		path->max_stat_rate = hermon_srate_override;
2211 
2212 	/* If "grh" flag is set, then check for valid SGID index too */
2213 	gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
2214 	if ((av->av_send_grh) && (av->av_sgid_ix > gidtbl_sz)) {
2215 		return (IBT_SGID_INVALID);
2216 	}
2217 
2218 	/*
2219 	 * Fill in all "global" values regardless of the value in the GRH
2220 	 * flag.  Because "grh" is not set unless "av_send_grh" is set, the
2221 	 * hardware will ignore the other "global" values as necessary.  Note:
2222 	 * SW does this here to enable later query operations to return
2223 	 * exactly the same params that were passed when the addr path was
2224 	 * last written.
2225 	 */
2226 	path->grh = av->av_send_grh;
2227 	if (type == HERMON_ADDRPATH_QP) {
2228 		path->mgid_index = av->av_sgid_ix;
2229 	} else {
2230 		/*
2231 		 * For Hermon UDAV, the "mgid_index" field is the index into
2232 		 * a combined table (not a per-port table), but having sections
2233 		 * for each port. So some extra calculations are necessary.
2234 		 */
2235 
2236 		path->mgid_index = ((av->av_port_num - 1) * gidtbl_sz) +
2237 		    av->av_sgid_ix;
2238 
2239 		udav->portnum = av->av_port_num;
2240 	}
2241 
2242 	/*
2243 	 * According to Hermon PRM, the (31:0) part of rgid_l must be set to
2244 	 * "0x2" if the 'grh' or 'g' bit is cleared.  It also says that we
2245 	 * only need to do it for UDAV's.  So we enforce that here.
2246 	 *
2247 	 * NOTE: The entire 64 bits worth of GUID info is actually being
2248 	 * preserved (for UDAVs) by the callers of this function
2249 	 * (hermon_ah_alloc() and hermon_ah_modify()) and as long as the
2250 	 * 'grh' bit is not set, the upper 32 bits (63:32) of rgid_l are
2251 	 * "don't care".
2252 	 */
2253 	if ((path->grh) || (type == HERMON_ADDRPATH_QP)) {
2254 		path->flow_label = av->av_flow;
2255 		path->tclass	 = av->av_tclass;
2256 		path->hop_limit	 = av->av_hop;
2257 		bcopy(&(av->av_dgid.gid_prefix), &(path->rgid_h),
2258 		    sizeof (uint64_t));
2259 		bcopy(&(av->av_dgid.gid_guid), &(path->rgid_l),
2260 		    sizeof (uint64_t));
2261 	} else {
2262 		path->rgid_l	 = 0x2;
2263 		path->flow_label = 0;
2264 		path->tclass	 = 0;
2265 		path->hop_limit	 = 0;
2266 		path->rgid_h	 = 0;
2267 	}
2268 	/* extract the default service level */
2269 	udav->sl = (HERMON_DEF_SCHED_SELECTION & 0x3C) >> 2;
2270 
2271 	return (DDI_SUCCESS);
2272 }
2273 
2274 
2275 /*
2276  * hermon_get_addr_path()
2277  *    Context: Can be called from interrupt or base context.
2278  *
2279  * Note: Just like hermon_set_addr_path() above, this routine is used for two
2280  * purposes.  It is used to read in the Hermon UDAV fields, and it is used to
2281  * read in the address path information for QPs.  Because the two Hermon
2282  * structures are similar, common fields can be read in here.  But because
2283  * they are slightly different, we pass an additional flag to indicate which
2284  * type is being read.
2285  */
2286 void
2287 hermon_get_addr_path(hermon_state_t *state, hermon_hw_addr_path_t *path,
2288     ibt_adds_vect_t *av, uint_t type)
2289 {
2290 	uint_t		gidtbl_sz;
2291 
2292 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*path))
2293 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*av))
2294 
2295 	av->av_src_path	= path->mlid;
2296 	av->av_dlid	= path->rlid;
2297 
2298 	/* Set "av_ipd" value from max_stat_rate */
2299 	switch (path->max_stat_rate) {
2300 	case 7:				/* 1xSDR-2.5Gb/s injection rate */
2301 		av->av_srate = IBT_SRATE_2; break;
2302 	case 8:				/* 4xSDR-10.0Gb/s injection rate */
2303 		av->av_srate = IBT_SRATE_10; break;
2304 	case 9:				/* 12xSDR-30Gb/s injection rate */
2305 		av->av_srate = IBT_SRATE_30; break;
2306 	case 10:			/* 1xDDR-5Gb/s injection rate */
2307 		av->av_srate = IBT_SRATE_5; break;
2308 	case 11:			/* 4xDDR-20Gb/s injection rate */
2309 		av->av_srate = IBT_SRATE_20; break;
2310 	case 12:			/* xQDR-40Gb/s injection rate */
2311 		av->av_srate = IBT_SRATE_40; break;
2312 	case 13:			/* 12xDDR-60Gb/s injection rate */
2313 		av->av_srate = IBT_SRATE_60; break;
2314 	case 14:			/* 8xQDR-80Gb/s injection rate */
2315 		av->av_srate = IBT_SRATE_80; break;
2316 	case 15:			/* 12xQDR-120Gb/s injection rate */
2317 		av->av_srate = IBT_SRATE_120; break;
2318 	case 0:				/* max */
2319 		av->av_srate = IBT_SRATE_NOT_SPECIFIED; break;
2320 	default:			/* 1x injection rate */
2321 		av->av_srate = IBT_SRATE_1X;
2322 	}
2323 
2324 	/*
2325 	 * Extract all "global" values regardless of the value in the GRH
2326 	 * flag.  Because "av_send_grh" is set only if "grh" is set, software
2327 	 * knows to ignore the other "global" values as necessary.  Note: SW
2328 	 * does it this way to enable these query operations to return exactly
2329 	 * the same params that were passed when the addr path was last written.
2330 	 */
2331 	av->av_send_grh		= path->grh;
2332 	if (type == HERMON_ADDRPATH_QP) {
2333 		av->av_sgid_ix  = path->mgid_index;
2334 	} else {
2335 		/*
2336 		 * For Hermon UDAV, the "mgid_index" field is the index into
2337 		 * a combined table (not a per-port table).
2338 		 */
2339 		gidtbl_sz = (1 << state->hs_queryport.log_max_gid);
2340 		av->av_sgid_ix = path->mgid_index - ((av->av_port_num - 1) *
2341 		    gidtbl_sz);
2342 
2343 		av->av_port_num = ((hermon_hw_udav_t *)(void *)path)->portnum;
2344 	}
2345 	av->av_flow		= path->flow_label;
2346 	av->av_tclass		= path->tclass;
2347 	av->av_hop		= path->hop_limit;
2348 	/* this is for alignment issue w/ the addr path struct in Hermon */
2349 	bcopy(&(path->rgid_h), &(av->av_dgid.gid_prefix), sizeof (uint64_t));
2350 	bcopy(&(path->rgid_l), &(av->av_dgid.gid_guid), sizeof (uint64_t));
2351 }
2352 
2353 
2354 /*
2355  * hermon_portnum_is_valid()
2356  *    Context: Can be called from interrupt or base context.
2357  */
2358 int
2359 hermon_portnum_is_valid(hermon_state_t *state, uint_t portnum)
2360 {
2361 	uint_t	max_port;
2362 
2363 	max_port = state->hs_cfg_profile->cp_num_ports;
2364 	if ((portnum <= max_port) && (portnum != 0)) {
2365 		return (1);
2366 	} else {
2367 		return (0);
2368 	}
2369 }
2370 
2371 
2372 /*
2373  * hermon_pkeyindex_is_valid()
2374  *    Context: Can be called from interrupt or base context.
2375  */
2376 int
2377 hermon_pkeyindex_is_valid(hermon_state_t *state, uint_t pkeyindx)
2378 {
2379 	uint_t	max_pkeyindx;
2380 
2381 	max_pkeyindx = 1 << state->hs_cfg_profile->cp_log_max_pkeytbl;
2382 	if (pkeyindx < max_pkeyindx) {
2383 		return (1);
2384 	} else {
2385 		return (0);
2386 	}
2387 }
2388 
2389 
2390 /*
2391  * hermon_queue_alloc()
2392  *    Context: Can be called from interrupt or base context.
2393  */
2394 int
2395 hermon_queue_alloc(hermon_state_t *state, hermon_qalloc_info_t *qa_info,
2396     uint_t sleepflag)
2397 {
2398 	ddi_dma_attr_t		dma_attr;
2399 	int			(*callback)(caddr_t);
2400 	uint64_t		realsize, alloc_mask;
2401 	int			flag, status;
2402 
2403 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2404 
2405 	/* Set the callback flag appropriately */
2406 	callback = (sleepflag == HERMON_SLEEP) ? DDI_DMA_SLEEP :
2407 	    DDI_DMA_DONTWAIT;
2408 
2409 	/*
2410 	 * Initialize many of the default DMA attributes.  Then set additional
2411 	 * alignment restrictions as necessary for the queue memory.  Also
2412 	 * respect the configured value for IOMMU bypass
2413 	 */
2414 	hermon_dma_attr_init(state, &dma_attr);
2415 	dma_attr.dma_attr_align = qa_info->qa_bind_align;
2416 #ifdef	__sparc
2417 	if (state->hs_cfg_profile->cp_iommu_bypass == HERMON_BINDMEM_BYPASS) {
2418 		dma_attr.dma_attr_flags = DDI_DMA_FORCE_PHYSICAL;
2419 	}
2420 #endif
2421 
2422 	/* Allocate a DMA handle */
2423 	status = ddi_dma_alloc_handle(state->hs_dip, &dma_attr, callback, NULL,
2424 	    &qa_info->qa_dmahdl);
2425 	if (status != DDI_SUCCESS) {
2426 		return (DDI_FAILURE);
2427 	}
2428 
2429 	/*
2430 	 * Determine the amount of memory to allocate, depending on the values
2431 	 * in "qa_bind_align" and "qa_alloc_align".  The problem we are trying
2432 	 * to solve here is that allocating a DMA handle with IOMMU bypass
2433 	 * (DDI_DMA_FORCE_PHYSICAL) constrains us to only requesting alignments
2434 	 * that are less restrictive than the page size.  Since we may need
2435 	 * stricter alignments on the memory allocated by ddi_dma_mem_alloc()
2436 	 * (e.g. in Hermon QP work queue memory allocation), we use the
2437 	 * following method to calculate how much additional memory to request,
2438 	 * and we enforce our own alignment on the allocated result.
2439 	 */
2440 	alloc_mask = qa_info->qa_alloc_align - 1;
2441 	if (qa_info->qa_bind_align == qa_info->qa_alloc_align) {
2442 		realsize = qa_info->qa_size;
2443 	} else {
2444 		realsize = qa_info->qa_size + alloc_mask;
2445 	}
2446 
2447 	/*
2448 	 * If we are to allocate the queue from system memory, then use
2449 	 * ddi_dma_mem_alloc() to find the space.  Otherwise, this is a
2450 	 * host memory allocation, use ddi_umem_alloc(). In either case,
2451 	 * return a pointer to the memory range allocated (including any
2452 	 * necessary alignment adjustments), the "real" memory pointer,
2453 	 * the "real" size, and a ddi_acc_handle_t to use when reading
2454 	 * from/writing to the memory.
2455 	 */
2456 	if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {
2457 		/* Allocate system memory for the queue */
2458 		status = ddi_dma_mem_alloc(qa_info->qa_dmahdl, realsize,
2459 		    &state->hs_reg_accattr, DDI_DMA_CONSISTENT, callback, NULL,
2460 		    (caddr_t *)&qa_info->qa_buf_real,
2461 		    (size_t *)&qa_info->qa_buf_realsz, &qa_info->qa_acchdl);
2462 		if (status != DDI_SUCCESS) {
2463 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2464 			return (DDI_FAILURE);
2465 		}
2466 
2467 		/*
2468 		 * Save temporary copy of the real pointer.  (This may be
2469 		 * modified in the last step below).
2470 		 */
2471 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2472 
2473 		bzero(qa_info->qa_buf_real, qa_info->qa_buf_realsz);
2474 
2475 	} else { /* HERMON_QUEUE_LOCATION_USERLAND */
2476 
2477 		/* Allocate userland mappable memory for the queue */
2478 		flag = (sleepflag == HERMON_SLEEP) ? DDI_UMEM_SLEEP :
2479 		    DDI_UMEM_NOSLEEP;
2480 		qa_info->qa_buf_real = ddi_umem_alloc(realsize, flag,
2481 		    &qa_info->qa_umemcookie);
2482 		if (qa_info->qa_buf_real == NULL) {
2483 			ddi_dma_free_handle(&qa_info->qa_dmahdl);
2484 			return (DDI_FAILURE);
2485 		}
2486 
2487 		/*
2488 		 * Save temporary copy of the real pointer.  (This may be
2489 		 * modified in the last step below).
2490 		 */
2491 		qa_info->qa_buf_aligned = qa_info->qa_buf_real;
2492 
2493 	}
2494 
2495 	/*
2496 	 * The next to last step is to ensure that the final address
2497 	 * ("qa_buf_aligned") has the appropriate "alloc" alignment
2498 	 * restriction applied to it (if necessary).
2499 	 */
2500 	if (qa_info->qa_bind_align != qa_info->qa_alloc_align) {
2501 		qa_info->qa_buf_aligned = (uint32_t *)(uintptr_t)(((uintptr_t)
2502 		    qa_info->qa_buf_aligned + alloc_mask) & ~alloc_mask);
2503 	}
2504 	/*
2505 	 * The last step is to figure out the offset of the start relative
2506 	 * to the first page of the region - will be used in the eqc/cqc
2507 	 * passed to the HW
2508 	 */
2509 	qa_info->qa_pgoffs = (uint_t)((uintptr_t)
2510 	    qa_info->qa_buf_aligned & HERMON_PAGEOFFSET);
2511 
2512 	return (DDI_SUCCESS);
2513 }
2514 
2515 
2516 /*
2517  * hermon_queue_free()
2518  *    Context: Can be called from interrupt or base context.
2519  */
2520 void
2521 hermon_queue_free(hermon_qalloc_info_t *qa_info)
2522 {
2523 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qa_info))
2524 
2525 	/*
2526 	 * Depending on how (i.e. from where) we allocated the memory for
2527 	 * this queue, we choose the appropriate method for releasing the
2528 	 * resources.
2529 	 */
2530 	if (qa_info->qa_location == HERMON_QUEUE_LOCATION_NORMAL) {
2531 
2532 		ddi_dma_mem_free(&qa_info->qa_acchdl);
2533 
2534 	} else if (qa_info->qa_location == HERMON_QUEUE_LOCATION_USERLAND) {
2535 
2536 		ddi_umem_free(qa_info->qa_umemcookie);
2537 
2538 	}
2539 
2540 	/* Always free the dma handle */
2541 	ddi_dma_free_handle(&qa_info->qa_dmahdl);
2542 }
2543 
2544 /*
2545  * hermon_create_fmr_pool()
2546  * Create a pool of FMRs.
2547  *     Context: Can be called from kernel context only.
2548  */
2549 int
2550 hermon_create_fmr_pool(hermon_state_t *state, hermon_pdhdl_t pd,
2551     ibt_fmr_pool_attr_t *fmr_attr, hermon_fmrhdl_t *fmrpoolp)
2552 {
2553 	hermon_fmrhdl_t	fmrpool;
2554 	hermon_fmr_list_t *fmr, *fmr_next;
2555 	hermon_mrhdl_t   mr;
2556 	int		status;
2557 	int		sleep;
2558 	int		i;
2559 
2560 	sleep = (fmr_attr->fmr_flags & IBT_MR_SLEEP) ? HERMON_SLEEP :
2561 	    HERMON_NOSLEEP;
2562 	if ((sleep == HERMON_SLEEP) &&
2563 	    (sleep != HERMON_SLEEPFLAG_FOR_CONTEXT())) {
2564 		return (IBT_INVALID_PARAM);
2565 	}
2566 
2567 	fmrpool = (hermon_fmrhdl_t)kmem_zalloc(sizeof (*fmrpool), sleep);
2568 	if (fmrpool == NULL) {
2569 		status = IBT_INSUFF_RESOURCE;
2570 		goto fail;
2571 	}
2572 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmrpool))
2573 
2574 	mutex_init(&fmrpool->fmr_lock, NULL, MUTEX_DRIVER,
2575 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
2576 	mutex_init(&fmrpool->remap_lock, NULL, MUTEX_DRIVER,
2577 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
2578 	mutex_init(&fmrpool->dirty_lock, NULL, MUTEX_DRIVER,
2579 	    DDI_INTR_PRI(state->hs_intrmsi_pri));
2580 
2581 	fmrpool->fmr_state	    = state;
2582 	fmrpool->fmr_flush_function = fmr_attr->fmr_func_hdlr;
2583 	fmrpool->fmr_flush_arg	    = fmr_attr->fmr_func_arg;
2584 	fmrpool->fmr_pool_size	    = 0;
2585 	fmrpool->fmr_max_pages	    = fmr_attr->fmr_max_pages_per_fmr;
2586 	fmrpool->fmr_page_sz	    = fmr_attr->fmr_page_sz;
2587 	fmrpool->fmr_dirty_watermark = fmr_attr->fmr_pool_size / 4;
2588 	fmrpool->fmr_dirty_len	    = 0;
2589 	fmrpool->fmr_remap_watermark = fmr_attr->fmr_pool_size / 32;
2590 	fmrpool->fmr_remap_len	    = 0;
2591 	fmrpool->fmr_flags	    = fmr_attr->fmr_flags;
2592 	fmrpool->fmr_stat_register  = 0;
2593 	fmrpool->fmr_max_remaps	    = state->hs_cfg_profile->cp_fmr_max_remaps;
2594 	fmrpool->fmr_remap_gen	    = 1;
2595 
2596 	fmrpool->fmr_free_list_tail = &fmrpool->fmr_free_list;
2597 	fmrpool->fmr_dirty_list = NULL;
2598 	fmrpool->fmr_dirty_list_tail = &fmrpool->fmr_dirty_list;
2599 	fmrpool->fmr_remap_list = NULL;
2600 	fmrpool->fmr_remap_list_tail = &fmrpool->fmr_remap_list;
2601 	fmrpool->fmr_pool_size = fmrpool->fmr_free_len =
2602 	    fmr_attr->fmr_pool_size;
2603 
2604 	for (i = 0; i < fmr_attr->fmr_pool_size; i++) {
2605 		status = hermon_mr_alloc_fmr(state, pd, fmrpool, &mr);
2606 		if (status != DDI_SUCCESS) {
2607 			goto fail2;
2608 		}
2609 
2610 		fmr = (hermon_fmr_list_t *)kmem_zalloc(
2611 		    sizeof (hermon_fmr_list_t), sleep);
2612 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2613 
2614 		fmr->fmr = mr;
2615 		fmr->fmr_remaps = 0;
2616 		fmr->fmr_remap_gen = fmrpool->fmr_remap_gen;
2617 		fmr->fmr_pool = fmrpool;
2618 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
2619 		mr->mr_fmr = fmr;
2620 
2621 		if (!i)		/* address of last entry's link */
2622 			fmrpool->fmr_free_list_tail = &fmr->fmr_next;
2623 		fmr->fmr_next = fmrpool->fmr_free_list;
2624 		fmrpool->fmr_free_list = fmr;
2625 	}
2626 
2627 	/* Set to return pool */
2628 	*fmrpoolp = fmrpool;
2629 
2630 	IBTF_DPRINTF_L2("fmr", "create_fmr_pool SUCCESS");
2631 	return (IBT_SUCCESS);
2632 fail2:
2633 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2634 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2635 		fmr_next = fmr->fmr_next;
2636 		(void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
2637 		kmem_free(fmr, sizeof (hermon_fmr_list_t));
2638 	}
2639 	kmem_free(fmrpool, sizeof (*fmrpool));
2640 fail:
2641 	*fmrpoolp = NULL;
2642 	IBTF_DPRINTF_L2("fmr", "create_fmr_pool FAILED");
2643 	if (status == DDI_FAILURE) {
2644 		return (ibc_get_ci_failure(0));
2645 	} else {
2646 		return (status);
2647 	}
2648 }
2649 
2650 /*
2651  * hermon_destroy_fmr_pool()
2652  * Destroy an FMR pool and free all associated resources.
2653  *     Context: Can be called from kernel context only.
2654  */
2655 int
2656 hermon_destroy_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2657 {
2658 	hermon_fmr_list_t	*fmr, *fmr_next;
2659 
2660 	mutex_enter(&fmrpool->fmr_lock);
2661 	hermon_fmr_cleanup(fmrpool);
2662 
2663 	for (fmr = fmrpool->fmr_free_list; fmr != NULL; fmr = fmr_next) {
2664 		fmr_next = fmr->fmr_next;
2665 
2666 		(void) hermon_mr_dealloc_fmr(state, &fmr->fmr);
2667 		kmem_free(fmr, sizeof (hermon_fmr_list_t));
2668 
2669 		--fmrpool->fmr_pool_size;
2670 	}
2671 	ASSERT(fmrpool->fmr_pool_size == 0);
2672 	mutex_exit(&fmrpool->fmr_lock);
2673 
2674 	mutex_destroy(&fmrpool->fmr_lock);
2675 	mutex_destroy(&fmrpool->dirty_lock);
2676 	mutex_destroy(&fmrpool->remap_lock);
2677 
2678 	kmem_free(fmrpool, sizeof (*fmrpool));
2679 	IBTF_DPRINTF_L2("fmr", "destroy_fmr_pool SUCCESS");
2680 	return (DDI_SUCCESS);
2681 }
2682 
2683 /*
2684  * hermon_flush_fmr_pool()
2685  * Ensure that all unmapped FMRs are fully invalidated.
2686  *     Context: Can be called from kernel context only.
2687  */
2688 /* ARGSUSED */
2689 int
2690 hermon_flush_fmr_pool(hermon_state_t *state, hermon_fmrhdl_t fmrpool)
2691 {
2692 	/*
2693 	 * Force the unmapping of all entries on the dirty list, regardless of
2694 	 * whether the watermark has been hit yet.
2695 	 */
2696 	/* grab the pool lock */
2697 	mutex_enter(&fmrpool->fmr_lock);
2698 	hermon_fmr_cleanup(fmrpool);
2699 	mutex_exit(&fmrpool->fmr_lock);
2700 	return (DDI_SUCCESS);
2701 }
2702 
2703 /*
2704  * hermon_register_physical_fmr()
2705  * Map memory into FMR
2706  *    Context: Can be called from interrupt or base context.
2707  */
2708 int
2709 hermon_register_physical_fmr(hermon_state_t *state, hermon_fmrhdl_t fmrpool,
2710     ibt_pmr_attr_t *mem_pattr, hermon_mrhdl_t *mr,
2711     ibt_pmr_desc_t *mem_desc_p)
2712 {
2713 	hermon_fmr_list_t	*fmr;
2714 	int			status;
2715 
2716 	/* Check length */
2717 	if (mem_pattr->pmr_len < 1 || (mem_pattr->pmr_num_buf >
2718 	    fmrpool->fmr_max_pages)) {
2719 		return (IBT_MR_LEN_INVALID);
2720 	}
2721 
2722 	mutex_enter(&fmrpool->fmr_lock);
2723 	if (fmrpool->fmr_free_list == NULL) {
2724 		if (hermon_fmr_verbose & 2)
2725 			IBTF_DPRINTF_L2("fmr", "register needs remap");
2726 		mutex_enter(&fmrpool->remap_lock);
2727 		if (fmrpool->fmr_remap_list) {
2728 			/* add to free list */
2729 			*(fmrpool->fmr_free_list_tail) =
2730 			    fmrpool->fmr_remap_list;
2731 			fmrpool->fmr_remap_list = NULL;
2732 			fmrpool->fmr_free_list_tail =
2733 			    fmrpool->fmr_remap_list_tail;
2734 
2735 			/* reset list */
2736 			fmrpool->fmr_remap_list_tail = &fmrpool->fmr_remap_list;
2737 			fmrpool->fmr_free_len += fmrpool->fmr_remap_len;
2738 			fmrpool->fmr_remap_len = 0;
2739 		}
2740 		mutex_exit(&fmrpool->remap_lock);
2741 	}
2742 	if (fmrpool->fmr_free_list == NULL) {
2743 		if (hermon_fmr_verbose & 2)
2744 			IBTF_DPRINTF_L2("fmr", "register needs cleanup");
2745 		hermon_fmr_cleanup(fmrpool);
2746 	}
2747 
2748 	/* grab next free entry */
2749 	fmr = fmrpool->fmr_free_list;
2750 	if (fmr == NULL) {
2751 		IBTF_DPRINTF_L2("fmr", "WARNING: no free fmr resource");
2752 		cmn_err(CE_CONT, "no free fmr resource\n");
2753 		mutex_exit(&fmrpool->fmr_lock);
2754 		return (IBT_INSUFF_RESOURCE);
2755 	}
2756 
2757 	if ((fmrpool->fmr_free_list = fmr->fmr_next) == NULL)
2758 		fmrpool->fmr_free_list_tail = &fmrpool->fmr_free_list;
2759 	fmr->fmr_next = NULL;
2760 	fmrpool->fmr_stat_register++;
2761 	mutex_exit(&fmrpool->fmr_lock);
2762 
2763 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2764 	status = hermon_mr_register_physical_fmr(state, mem_pattr, fmr->fmr,
2765 	    mem_desc_p);
2766 	if (status != DDI_SUCCESS) {
2767 		return (status);
2768 	}
2769 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr->fmr))
2770 	if (hermon_rdma_debug & 0x4)
2771 		IBTF_DPRINTF_L2("fmr", "  reg: mr %p  key %x",
2772 		    fmr->fmr, fmr->fmr->mr_rkey);
2773 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*fmr->fmr))
2774 	if (fmr->fmr_remap_gen != fmrpool->fmr_remap_gen) {
2775 		fmr->fmr_remap_gen = fmrpool->fmr_remap_gen;
2776 		fmr->fmr_remaps = 0;
2777 	}
2778 
2779 	fmr->fmr_remaps++;
2780 
2781 	*mr = (hermon_mrhdl_t)fmr->fmr;
2782 
2783 	return (DDI_SUCCESS);
2784 }
2785 
2786 /*
2787  * hermon_deregister_fmr()
2788  * Unmap FMR
2789  *    Context: Can be called from kernel context only.
2790  */
2791 int
2792 hermon_deregister_fmr(hermon_state_t *state, hermon_mrhdl_t mr)
2793 {
2794 	hermon_fmrhdl_t		fmrpool;
2795 	hermon_fmr_list_t	*fmr, **fmrlast;
2796 	int			len;
2797 
2798 	fmr = mr->mr_fmr;
2799 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*fmr))
2800 	fmrpool = fmr->fmr_pool;
2801 
2802 	/* mark as owned by software */
2803 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2804 	*(uint8_t *)(fmr->fmr->mr_mptrsrcp->hr_addr) = 0xF0;
2805 
2806 	if (fmr->fmr_remaps <
2807 	    state->hs_cfg_profile->cp_fmr_max_remaps) {
2808 		/* add to remap list */
2809 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2810 		if (hermon_rdma_debug & 0x4)
2811 			IBTF_DPRINTF_L2("fmr", "dereg: mr %p  key %x",
2812 			    fmr->fmr, fmr->fmr->mr_rkey);
2813 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2814 		mutex_enter(&fmrpool->remap_lock);
2815 		fmr->fmr_next = NULL;
2816 		*(fmrpool->fmr_remap_list_tail) = fmr;
2817 		fmrpool->fmr_remap_list_tail = &fmr->fmr_next;
2818 		fmrpool->fmr_remap_len++;
2819 
2820 		/* conditionally add remap list back to free list */
2821 		fmrlast = NULL;
2822 		if (fmrpool->fmr_remap_len >=
2823 		    fmrpool->fmr_remap_watermark) {
2824 			fmr = fmrpool->fmr_remap_list;
2825 			fmrlast = fmrpool->fmr_remap_list_tail;
2826 			len = fmrpool->fmr_remap_len;
2827 			fmrpool->fmr_remap_len = 0;
2828 			fmrpool->fmr_remap_list = NULL;
2829 			fmrpool->fmr_remap_list_tail =
2830 			    &fmrpool->fmr_remap_list;
2831 		}
2832 		mutex_exit(&fmrpool->remap_lock);
2833 		if (fmrlast) {
2834 			mutex_enter(&fmrpool->fmr_lock);
2835 			*(fmrpool->fmr_free_list_tail) = fmr;
2836 			fmrpool->fmr_free_list_tail = fmrlast;
2837 			fmrpool->fmr_free_len += len;
2838 			mutex_exit(&fmrpool->fmr_lock);
2839 		}
2840 	} else {
2841 		/* add to dirty list */
2842 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2843 		if (hermon_rdma_debug & 0x4)
2844 			IBTF_DPRINTF_L2("fmr", "dirty: mr %p  key %x",
2845 			    fmr->fmr, fmr->fmr->mr_rkey);
2846 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*(fmr->fmr)))
2847 
2848 		mutex_enter(&fmrpool->dirty_lock);
2849 		fmr->fmr_next = NULL;
2850 		*(fmrpool->fmr_dirty_list_tail) = fmr;
2851 		fmrpool->fmr_dirty_list_tail = &fmr->fmr_next;
2852 		fmrpool->fmr_dirty_len++;
2853 
2854 		if (fmrpool->fmr_dirty_len >=
2855 		    fmrpool->fmr_dirty_watermark) {
2856 			mutex_exit(&fmrpool->dirty_lock);
2857 			mutex_enter(&fmrpool->fmr_lock);
2858 			hermon_fmr_cleanup(fmrpool);
2859 			mutex_exit(&fmrpool->fmr_lock);
2860 		} else
2861 			mutex_exit(&fmrpool->dirty_lock);
2862 	}
2863 	return (DDI_SUCCESS);
2864 }
2865 
2866 /*
2867  * hermon_fmr_cleanup()
2868  *     Context: Called from any context.
2869  */
2870 static void
2871 hermon_fmr_cleanup(hermon_fmrhdl_t fmrpool)
2872 {
2873 	int			status;
2874 
2875 	ASSERT(MUTEX_HELD(&fmrpool->fmr_lock));
2876 
2877 	if (fmrpool->fmr_stat_register == 0)
2878 		return;
2879 
2880 	fmrpool->fmr_stat_register = 0;
2881 	membar_producer();
2882 
2883 	if (hermon_fmr_verbose)
2884 		IBTF_DPRINTF_L2("fmr", "TPT_SYNC");
2885 	status = hermon_sync_tpt_cmd_post(fmrpool->fmr_state,
2886 	    HERMON_CMD_NOSLEEP_SPIN);
2887 	if (status != HERMON_CMD_SUCCESS) {
2888 		cmn_err(CE_WARN, "fmr SYNC_TPT failed(%x)\n", status);
2889 	}
2890 	fmrpool->fmr_remap_gen++;
2891 
2892 	/* add everything back to the free list */
2893 	mutex_enter(&fmrpool->dirty_lock);
2894 	if (fmrpool->fmr_dirty_list) {
2895 		/* add to free list */
2896 		*(fmrpool->fmr_free_list_tail) = fmrpool->fmr_dirty_list;
2897 		fmrpool->fmr_dirty_list = NULL;
2898 		fmrpool->fmr_free_list_tail = fmrpool->fmr_dirty_list_tail;
2899 
2900 		/* reset list */
2901 		fmrpool->fmr_dirty_list_tail = &fmrpool->fmr_dirty_list;
2902 		fmrpool->fmr_free_len += fmrpool->fmr_dirty_len;
2903 		fmrpool->fmr_dirty_len = 0;
2904 	}
2905 	mutex_exit(&fmrpool->dirty_lock);
2906 
2907 	mutex_enter(&fmrpool->remap_lock);
2908 	if (fmrpool->fmr_remap_list) {
2909 		/* add to free list */
2910 		*(fmrpool->fmr_free_list_tail) = fmrpool->fmr_remap_list;
2911 		fmrpool->fmr_remap_list = NULL;
2912 		fmrpool->fmr_free_list_tail = fmrpool->fmr_remap_list_tail;
2913 
2914 		/* reset list */
2915 		fmrpool->fmr_remap_list_tail = &fmrpool->fmr_remap_list;
2916 		fmrpool->fmr_free_len += fmrpool->fmr_remap_len;
2917 		fmrpool->fmr_remap_len = 0;
2918 	}
2919 	mutex_exit(&fmrpool->remap_lock);
2920 
2921 	if (fmrpool->fmr_flush_function != NULL) {
2922 		(void) fmrpool->fmr_flush_function(
2923 		    (ibc_fmr_pool_hdl_t)fmrpool,
2924 		    fmrpool->fmr_flush_arg);
2925 	}
2926 }
2927