xref: /illumos-gate/usr/src/uts/common/io/comstar/port/srpt/srpt_ioc.c (revision 828d47c166ce67972b1f1929669b9af5be769423)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * I/O Controller functions for the Solaris COMSTAR SCSI RDMA Protocol
 * Target (SRPT) port provider.
 */

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <sys/ib/ibtl/ibti.h>

#include "srp.h"
#include "srpt_impl.h"
#include "srpt_ioc.h"
#include "srpt_stp.h"
#include "srpt_ch.h"

/*
 * srpt_ioc_srq_size - Tunable parameter that specifies the number
 * of receive WQ entries that can be posted to the IOC shared
 * receive queue.
 */
uint32_t	srpt_ioc_srq_size = SRPT_DEFAULT_IOC_SRQ_SIZE;
extern uint16_t srpt_send_msg_depth;
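
/*
 * Both tunables may be set in /etc/system before the srpt module
 * loads; a hypothetical example (the values are illustrative only):
 *
 *	set srpt:srpt_ioc_srq_size = 1024
 *	set srpt:srpt_send_msg_depth = 128
 */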

/* IOC profile capabilities mask must be big-endian */
typedef struct srpt_ioc_opcap_bits_s {
#if	defined(_BIT_FIELDS_LTOH)
	uint8_t		af:1,
			at:1,
			wf:1,
			wt:1,
			rf:1,
			rt:1,
			sf:1,
			st:1;
#elif	defined(_BIT_FIELDS_HTOL)
	uint8_t		st:1,
			sf:1,
			rt:1,
			rf:1,
			wt:1,
			wf:1,
			at:1,
			af:1;
#else
#error	One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
#endif
} srpt_ioc_opcap_bits_t;

typedef union {
	srpt_ioc_opcap_bits_t	bits;
	uint8_t			mask;
} srpt_ioc_opcap_mask_t;
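
/*
 * Bit layout sketch for the mask byte above (bit 7 is the MSB on the
 * wire; "t"/"f" mean to/from the IOC, per srpt_ioc_init_profile()):
 *
 *	st 0x80, sf 0x40, rt 0x20, rf 0x10,
 *	wt 0x08, wf 0x04, at 0x02, af 0x01
 *
 * e.g. the capabilities set in srpt_ioc_init_profile() (st, sf, rf,
 * wf) yield a mask byte of 0xd4.
 */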

/*
 * vmem arena variables - values derived from iSER
 */
#define	SRPT_MR_QUANTSIZE	0x400			/* 1K */
#define	SRPT_MIN_CHUNKSIZE	0x100000		/* 1MB */

/* use less memory on 32-bit kernels as it's much more constrained */
#ifdef _LP64
#define	SRPT_BUF_MR_CHUNKSIZE	0x1000000		/* 16MB */
#define	SRPT_BUF_POOL_MAX	0x40000000		/* 1GB */
#else
#define	SRPT_BUF_MR_CHUNKSIZE	0x400000		/* 4MB */
#define	SRPT_BUF_POOL_MAX	0x4000000		/* 64MB */
#endif
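
/*
 * A sketch of how these values interact: srpt_vmem_create() seeds each
 * IOC data-buffer arena with a single SRPT_BUF_MR_CHUNKSIZE chunk, and
 * srpt_vmem_alloc() then grows the arena one chunk at a time (halving
 * the request down to SRPT_MIN_CHUNKSIZE when memory is tight, see
 * srpt_vmem_chunk_alloc()) until SRPT_BUF_POOL_MAX is reached.
 */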

static ibt_mr_flags_t	srpt_dbuf_mr_flags =
    IBT_MR_ENABLE_LOCAL_WRITE | IBT_MR_ENABLE_REMOTE_WRITE |
    IBT_MR_ENABLE_REMOTE_READ;

void srpt_ioc_ib_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
	ibt_async_code_t code, ibt_async_event_t *event);

static struct ibt_clnt_modinfo_s srpt_ibt_modinfo = {
	IBTI_V_CURR,
	IBT_STORAGE_DEV,
	srpt_ioc_ib_async_hdlr,
	NULL,
	"srpt"
};
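
/*
 * The initializers above are, in order: the IBTI interface version,
 * the client class, the async event handler, one handler field that
 * is unused here (NULL), and the client name under which srpt
 * registers with the IBTF.
 */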

static srpt_ioc_t *srpt_ioc_init(ib_guid_t guid);
static void srpt_ioc_fini(srpt_ioc_t *ioc);

static srpt_vmem_pool_t *srpt_vmem_create(const char *name, srpt_ioc_t *ioc,
    ib_memlen_t chunksize, uint64_t maxsize, ibt_mr_flags_t flags);
static void *srpt_vmem_alloc(srpt_vmem_pool_t *vm_pool, size_t size);
static int srpt_vmem_mr_compare(const void *a, const void *b);
static srpt_mr_t *srpt_vmem_chunk_alloc(srpt_vmem_pool_t *vm_pool,
    ib_memlen_t chunksize);
static void srpt_vmem_destroy(srpt_vmem_pool_t *vm_pool);
static void srpt_vmem_free(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size);
static srpt_mr_t *srpt_reg_mem(srpt_vmem_pool_t *vm_pool, ib_vaddr_t vaddr,
    ib_memlen_t len);
static void srpt_vmem_chunk_free(srpt_vmem_pool_t *vm_pool, srpt_mr_t *mr);
static void srpt_dereg_mem(srpt_ioc_t *ioc, srpt_mr_t *mr);
static int srpt_vmem_mr(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size,
    srpt_mr_t *mr);

/*
 * srpt_ioc_attach() - I/O Controller attach
 *
 * Attach to IBTF and initialize I/O controllers. The srpt_ctxt->sc_rwlock
 * should be held outside of this call.
 */
int
srpt_ioc_attach()
{
	int		status;
	int		hca_cnt;
	int		hca_ndx;
	ib_guid_t	*guid;
	srpt_ioc_t	*ioc;

	ASSERT(srpt_ctxt != NULL);

	/*
	 * Attach to IBTF and initialize a list of IB devices.  Each
	 * HCA will be represented by an I/O Controller.
	 */
	status = ibt_attach(&srpt_ibt_modinfo, srpt_ctxt->sc_dip,
	    srpt_ctxt, &srpt_ctxt->sc_ibt_hdl);
	if (status != DDI_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_attach, ibt_attach failed (0x%x)",
		    status);
		return (DDI_FAILURE);
	}

	hca_cnt = ibt_get_hca_list(&guid);
	if (hca_cnt < 1) {
		SRPT_DPRINTF_L2("ioc_attach, no HCA found");
		ibt_detach(srpt_ctxt->sc_ibt_hdl);
		srpt_ctxt->sc_ibt_hdl = NULL;
		return (DDI_FAILURE);
	}

	list_create(&srpt_ctxt->sc_ioc_list, sizeof (srpt_ioc_t),
	    offsetof(srpt_ioc_t, ioc_node));

	for (hca_ndx = 0; hca_ndx < hca_cnt; hca_ndx++) {
		SRPT_DPRINTF_L2("ioc_attach, adding I/O"
		    " Controller (%016llx)", (u_longlong_t)guid[hca_ndx]);

		ioc = srpt_ioc_init(guid[hca_ndx]);
		if (ioc == NULL) {
			SRPT_DPRINTF_L1("ioc_attach, ioc_init GUID(%016llx)"
			    " failed", (u_longlong_t)guid[hca_ndx]);
			continue;
		}
		list_insert_tail(&srpt_ctxt->sc_ioc_list, ioc);
		SRPT_DPRINTF_L2("ioc_attach, I/O Controller ibt HCA hdl (%p)",
		    (void *)ioc->ioc_ibt_hdl);
		srpt_ctxt->sc_num_iocs++;
	}

	ibt_free_hca_list(guid, hca_cnt);
	SRPT_DPRINTF_L3("ioc_attach, added %d I/O Controller(s)",
	    srpt_ctxt->sc_num_iocs);
	return (DDI_SUCCESS);
}

/*
 * srpt_ioc_detach() - I/O Controller detach
 *
 * srpt_ctxt->sc_rwlock should be held outside of this call.
 */
void
srpt_ioc_detach()
{
	srpt_ioc_t	*ioc;

	ASSERT(srpt_ctxt != NULL);

	while ((ioc = list_head(&srpt_ctxt->sc_ioc_list)) != NULL) {
		list_remove(&srpt_ctxt->sc_ioc_list, ioc);
		SRPT_DPRINTF_L2("ioc_detach, removing I/O Controller(%p)"
		    " (%016llx), ibt_hdl(%p)",
		    (void *)ioc,
		    (u_longlong_t)ioc->ioc_guid,
		    (void *)ioc->ioc_ibt_hdl);
		srpt_ioc_fini(ioc);
	}

	list_destroy(&srpt_ctxt->sc_ioc_list);

	ibt_detach(srpt_ctxt->sc_ibt_hdl);
	srpt_ctxt->sc_ibt_hdl = NULL;
}

/*
 * srpt_ioc_init() - I/O Controller initialization
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
static srpt_ioc_t *
srpt_ioc_init(ib_guid_t guid)
{
	ibt_status_t		status;
	srpt_ioc_t		*ioc;
	ibt_hca_attr_t		hca_attr;
	uint_t			iu_ndx;
	uint_t			err_ndx;
	ibt_mr_attr_t		mr_attr;
	ibt_mr_desc_t		mr_desc;
	srpt_iu_t		*iu;
	ibt_srq_sizes_t		srq_attr;
	char			namebuf[32];
	size_t			iu_offset;

	status = ibt_query_hca_byguid(guid, &hca_attr);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, HCA query error (%d)",
		    status);
		return (NULL);
	}

	ioc = srpt_ioc_get_locked(guid);
	if (ioc != NULL) {
		SRPT_DPRINTF_L1("ioc_init, HCA already exists");
		return (NULL);
	}

	ioc = kmem_zalloc(sizeof (srpt_ioc_t), KM_SLEEP);

	rw_init(&ioc->ioc_rwlock, NULL, RW_DRIVER, NULL);
	rw_enter(&ioc->ioc_rwlock, RW_WRITER);

	bcopy(&hca_attr, &ioc->ioc_attr, sizeof (ibt_hca_attr_t));

	SRPT_DPRINTF_L2("ioc_init, HCA max mr=%d, mrlen=%lld",
	    hca_attr.hca_max_memr, (u_longlong_t)hca_attr.hca_max_memr_len);
	ioc->ioc_guid   = guid;

	status = ibt_open_hca(srpt_ctxt->sc_ibt_hdl, guid, &ioc->ioc_ibt_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IBT open failed (%d)", status);
		goto hca_open_err;
	}

	status = ibt_alloc_pd(ioc->ioc_ibt_hdl, IBT_PD_NO_FLAGS,
	    &ioc->ioc_pd_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IBT create PD failed (%d)", status);
		goto pd_alloc_err;
	}

	/*
	 * We require hardware support for SRQs.  We use a common SRQ to
	 * reduce channel memory consumption.
	 */
	if ((ioc->ioc_attr.hca_flags & IBT_HCA_SRQ) == 0) {
		SRPT_DPRINTF_L0("ioc_init, no SRQ capability, not supported");
		goto srq_alloc_err;
	}

	SRPT_DPRINTF_L3("ioc_init, Using shared receive queues, max srq work"
	    " queue size(%d), def size = %d", ioc->ioc_attr.hca_max_srqs_sz,
	    srpt_ioc_srq_size);
	srq_attr.srq_wr_sz = min(srpt_ioc_srq_size,
	    ioc->ioc_attr.hca_max_srqs_sz);
	srq_attr.srq_sgl_sz = 1;

	status = ibt_alloc_srq(ioc->ioc_ibt_hdl, IBT_SRQ_NO_FLAGS,
	    ioc->ioc_pd_hdl, &srq_attr, &ioc->ioc_srq_hdl,
	    &ioc->ioc_srq_attr);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IBT create SRQ failed(%d)", status);
		goto srq_alloc_err;
	}

	SRPT_DPRINTF_L2("ioc_init, SRQ WR size(%d), SG size(%d)",
	    ioc->ioc_srq_attr.srq_wr_sz, ioc->ioc_srq_attr.srq_sgl_sz);

	ibt_set_srq_private(ioc->ioc_srq_hdl, ioc);

	/*
	 * Allocate a pool of SRP IU message buffers and post them to
	 * the I/O Controller SRQ.  We let the SRQ manage the free IU
	 * messages.
	 */
	ioc->ioc_num_iu_entries =
	    min(srq_attr.srq_wr_sz, srpt_ioc_srq_size) - 1;

	ioc->ioc_iu_pool = kmem_zalloc(sizeof (srpt_iu_t) *
	    ioc->ioc_num_iu_entries, KM_SLEEP);

	ioc->ioc_iu_bufs = kmem_alloc(SRPT_DEFAULT_SEND_MSG_SIZE *
	    ioc->ioc_num_iu_entries, KM_SLEEP);

	if ((ioc->ioc_iu_pool == NULL) || (ioc->ioc_iu_bufs == NULL)) {
		SRPT_DPRINTF_L1("ioc_init, failed to allocate SRQ IUs");
		goto srq_iu_alloc_err;
	}

	mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)ioc->ioc_iu_bufs;
	mr_attr.mr_len   = SRPT_DEFAULT_SEND_MSG_SIZE * ioc->ioc_num_iu_entries;
	mr_attr.mr_as    = NULL;
	mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;

	status = ibt_register_mr(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl,
	    &mr_attr, &ioc->ioc_iu_mr_hdl, &mr_desc);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IU buffer pool MR err(%d)",
		    status);
		goto srq_iu_alloc_err;
	}

	for (iu_ndx = 0, iu = ioc->ioc_iu_pool; iu_ndx <
	    ioc->ioc_num_iu_entries; iu_ndx++, iu++) {

		iu_offset = (iu_ndx * SRPT_DEFAULT_SEND_MSG_SIZE);
		iu->iu_buf = (void *)((uintptr_t)ioc->ioc_iu_bufs + iu_offset);

		mutex_init(&iu->iu_lock, NULL, MUTEX_DRIVER, NULL);

		iu->iu_sge.ds_va  = mr_desc.md_vaddr + iu_offset;
		iu->iu_sge.ds_key = mr_desc.md_lkey;
		iu->iu_sge.ds_len = SRPT_DEFAULT_SEND_MSG_SIZE;
		iu->iu_ioc	  = ioc;
		iu->iu_pool_ndx   = iu_ndx;

		status = srpt_ioc_post_recv_iu(ioc, &ioc->ioc_iu_pool[iu_ndx]);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1("ioc_init, SRQ IU post err(%d)",
			    status);
			goto srq_iu_post_err;
		}
	}

	/*
	 * Initialize the dbuf vmem arena
	 */
	(void) snprintf(namebuf, sizeof (namebuf),
	    "srpt_buf_pool_%16llX", (u_longlong_t)guid);
	ioc->ioc_dbuf_pool = srpt_vmem_create(namebuf, ioc,
	    SRPT_BUF_MR_CHUNKSIZE, SRPT_BUF_POOL_MAX, srpt_dbuf_mr_flags);

	if (ioc->ioc_dbuf_pool == NULL) {
		goto stmf_db_alloc_err;
	}

	/*
	 * Allocate the I/O Controller STMF data buffer allocator.  The
	 * data store will span all targets associated with this IOC.
	 */
	ioc->ioc_stmf_ds = stmf_alloc(STMF_STRUCT_DBUF_STORE, 0, 0);
	if (ioc->ioc_stmf_ds == NULL) {
		SRPT_DPRINTF_L1("ioc_init, STMF DBUF alloc failure for IOC");
		goto stmf_db_alloc_err;
	}
	ioc->ioc_stmf_ds->ds_alloc_data_buf = &srpt_ioc_ds_alloc_dbuf;
	ioc->ioc_stmf_ds->ds_free_data_buf  = &srpt_ioc_ds_free_dbuf;
	ioc->ioc_stmf_ds->ds_port_private   = ioc;

	rw_exit(&ioc->ioc_rwlock);
	return (ioc);

stmf_db_alloc_err:
	if (ioc->ioc_dbuf_pool != NULL) {
		srpt_vmem_destroy(ioc->ioc_dbuf_pool);
	}

srq_iu_post_err:
	if (ioc->ioc_iu_mr_hdl != NULL) {
		status = ibt_deregister_mr(ioc->ioc_ibt_hdl,
		    ioc->ioc_iu_mr_hdl);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1("ioc_init, error deregistering"
			    " memory region (%d)", status);
		}
	}
	for (err_ndx = 0, iu = ioc->ioc_iu_pool; err_ndx < iu_ndx;
	    err_ndx++, iu++) {
		mutex_destroy(&iu->iu_lock);
	}

srq_iu_alloc_err:
	if (ioc->ioc_iu_bufs != NULL) {
		kmem_free(ioc->ioc_iu_bufs, SRPT_DEFAULT_SEND_MSG_SIZE *
		    ioc->ioc_num_iu_entries);
	}
	if (ioc->ioc_iu_pool != NULL) {
		kmem_free(ioc->ioc_iu_pool,
		    sizeof (srpt_iu_t) * ioc->ioc_num_iu_entries);
	}
	if (ioc->ioc_srq_hdl != NULL) {
		status = ibt_free_srq(ioc->ioc_srq_hdl);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1("ioc_init, error freeing SRQ (%d)",
			    status);
		}
	}

srq_alloc_err:
	status = ibt_free_pd(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, free PD error (%d)", status);
	}

pd_alloc_err:
	status = ibt_close_hca(ioc->ioc_ibt_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, close ioc error (%d)", status);
	}

hca_open_err:
	rw_exit(&ioc->ioc_rwlock);
	rw_destroy(&ioc->ioc_rwlock);
	kmem_free(ioc, sizeof (*ioc));
	return (NULL);
}

/*
 * srpt_ioc_fini() - I/O Controller Cleanup
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
static void
srpt_ioc_fini(srpt_ioc_t *ioc)
{
	int		status;
	int		ndx;

	/*
	 * Note driver flows will have already taken all SRP
	 * services running on the I/O Controller off-line.
	 */
	rw_enter(&ioc->ioc_rwlock, RW_WRITER);
	if (ioc->ioc_ibt_hdl != NULL) {
		if (ioc->ioc_stmf_ds != NULL) {
			stmf_free(ioc->ioc_stmf_ds);
		}

		if (ioc->ioc_srq_hdl != NULL) {
			SRPT_DPRINTF_L4("ioc_fini, freeing SRQ");
			status = ibt_free_srq(ioc->ioc_srq_hdl);
			if (status != IBT_SUCCESS) {
				SRPT_DPRINTF_L1("ioc_fini, free SRQ"
				    " error (%d)", status);
			}
		}

		if (ioc->ioc_iu_mr_hdl != NULL) {
			status = ibt_deregister_mr(
			    ioc->ioc_ibt_hdl, ioc->ioc_iu_mr_hdl);
			if (status != IBT_SUCCESS) {
				SRPT_DPRINTF_L1("ioc_fini, error deregistering"
				    " memory region (%d)", status);
			}
		}

		if (ioc->ioc_iu_bufs != NULL) {
			kmem_free(ioc->ioc_iu_bufs, SRPT_DEFAULT_SEND_MSG_SIZE *
			    ioc->ioc_num_iu_entries);
		}

		if (ioc->ioc_iu_pool != NULL) {
			SRPT_DPRINTF_L4("ioc_fini, freeing IU entries");
			for (ndx = 0; ndx < ioc->ioc_num_iu_entries; ndx++) {
				mutex_destroy(&ioc->ioc_iu_pool[ndx].iu_lock);
			}

			SRPT_DPRINTF_L4("ioc_fini, free IU pool struct");
			kmem_free(ioc->ioc_iu_pool,
			    sizeof (srpt_iu_t) * (ioc->ioc_num_iu_entries));
			ioc->ioc_iu_pool = NULL;
			ioc->ioc_num_iu_entries = 0;
		}

		if (ioc->ioc_dbuf_pool != NULL) {
			srpt_vmem_destroy(ioc->ioc_dbuf_pool);
		}

		if (ioc->ioc_pd_hdl != NULL) {
			status = ibt_free_pd(ioc->ioc_ibt_hdl,
			    ioc->ioc_pd_hdl);
			if (status != IBT_SUCCESS) {
				SRPT_DPRINTF_L1("ioc_fini, free PD"
				    " error (%d)", status);
			}
		}

		status = ibt_close_hca(ioc->ioc_ibt_hdl);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1(
			    "ioc_fini, close ioc error (%d)", status);
		}
	}
	rw_exit(&ioc->ioc_rwlock);
	rw_destroy(&ioc->ioc_rwlock);
	kmem_free(ioc, sizeof (srpt_ioc_t));
}

/*
 * srpt_ioc_port_active() - I/O Controller port active
 */
static void
srpt_ioc_port_active(ibt_async_event_t *event)
{
	ibt_status_t		status;
	srpt_ioc_t		*ioc;

	ASSERT(event != NULL);

	SRPT_DPRINTF_L3("ioc_port_active event handler, invoked");

	/*
	 * Find the HCA in question.  If the HCA has completed
	 * initialization and the SRP Target service for the
	 * I/O Controller exists, then bind this port.
	 */
	ioc = srpt_ioc_get(event->ev_hca_guid);

	if (ioc == NULL) {
		SRPT_DPRINTF_L2("ioc_port_active, I/O Controller not"
		    " active");
		return;
	}

	if (ioc->ioc_tgt_port == NULL) {
		SRPT_DPRINTF_L2("ioc_port_active, I/O Controller target"
		    " undefined");
		return;
	}

	/*
	 * We take the target lock here to serialize this operation
	 * with any STMF initiated target state transitions.  If
	 * SRP is off-line then the service handle is NULL.
	 */
	mutex_enter(&ioc->ioc_tgt_port->tp_lock);

	if (ioc->ioc_tgt_port->tp_ibt_svc_hdl != NULL) {
		status = srpt_ioc_svc_bind(ioc->ioc_tgt_port, event->ev_port);
		if (status != IBT_SUCCESS &&
		    status != IBT_HCA_PORT_NOT_ACTIVE) {
			SRPT_DPRINTF_L1("ioc_port_active, bind failed (%d)",
			    status);
		}
	}
	mutex_exit(&ioc->ioc_tgt_port->tp_lock);
}

/*
 * srpt_ioc_port_down()
 */
static void
srpt_ioc_port_down(ibt_async_event_t *event)
{
	srpt_ioc_t		*ioc;
	srpt_target_port_t	*tgt;
	srpt_channel_t		*ch;
	srpt_channel_t		*next_ch;

	SRPT_DPRINTF_L3("ioc_port_down event handler, invoked");

	/*
	 * Find the HCA in question.  If the HCA has completed
	 * initialization and the SRP Target service for the
	 * I/O Controller exists, then log out initiators
	 * through this port.
	 */
	ioc = srpt_ioc_get(event->ev_hca_guid);

	if (ioc == NULL) {
		SRPT_DPRINTF_L2("ioc_port_down, I/O Controller not"
		    " active");
		return;
	}

	/*
	 * We only have one target now, but we could go through all
	 * SCSI target ports if more are added.
	 */
	tgt = ioc->ioc_tgt_port;
	if (tgt == NULL) {
		SRPT_DPRINTF_L2("ioc_port_down, I/O Controller target"
		    " undefined");
		return;
	}
	mutex_enter(&tgt->tp_lock);

	/*
	 * For all channels logged in through this port, initiate a
	 * disconnect.
	 */
	mutex_enter(&tgt->tp_ch_list_lock);
	ch = list_head(&tgt->tp_ch_list);
	while (ch != NULL) {
		next_ch = list_next(&tgt->tp_ch_list, ch);
		if (ch->ch_session && (ch->ch_session->ss_hw_port ==
		    event->ev_port)) {
			srpt_ch_disconnect(ch);
		}
		ch = next_ch;
	}
	mutex_exit(&tgt->tp_ch_list_lock);

	mutex_exit(&tgt->tp_lock);
}

/*
 * srpt_ioc_ib_async_hdlr - I/O Controller IB asynchronous events
 */
/* ARGSUSED */
void
srpt_ioc_ib_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
	ibt_async_code_t code, ibt_async_event_t *event)
{
	srpt_ioc_t		*ioc;
	srpt_channel_t		*ch;

	switch (code) {
	case IBT_EVENT_PORT_UP:
		srpt_ioc_port_active(event);
		break;

	case IBT_ERROR_PORT_DOWN:
		srpt_ioc_port_down(event);
		break;

	case IBT_HCA_ATTACH_EVENT:
		rw_enter(&srpt_ctxt->sc_rwlock, RW_WRITER);
		ioc = srpt_ioc_init(event->ev_hca_guid);

		if (ioc == NULL) {
			rw_exit(&srpt_ctxt->sc_rwlock);
			SRPT_DPRINTF_L1("ib_async_hdlr, HCA_ATTACH"
			    " event failed to initialize HCA (0x%016llx)",
			    (u_longlong_t)event->ev_hca_guid);
			return;
		}
		SRPT_DPRINTF_L2("HCA_ATTACH_EVENT: I/O Controller"
		    " ibt hdl (%p)",
		    (void *)ioc->ioc_ibt_hdl);

		rw_enter(&ioc->ioc_rwlock, RW_WRITER);
		ioc->ioc_tgt_port = srpt_stp_alloc_port(ioc, ioc->ioc_guid);
		if (ioc->ioc_tgt_port == NULL) {
			SRPT_DPRINTF_L1("ioc_ib_async_hdlr, alloc SCSI "
			    "target port error for HCA (0x%016llx)",
			    (u_longlong_t)event->ev_hca_guid);
			rw_exit(&ioc->ioc_rwlock);
			srpt_ioc_fini(ioc);
			rw_exit(&srpt_ctxt->sc_rwlock);
			return;
		}

		/*
		 * New HCA added with default SCSI Target Port, SRP service
		 * will be started when SCSI Target Port is brought
		 * on-line by STMF.
		 */
		srpt_ctxt->sc_num_iocs++;
		list_insert_tail(&srpt_ctxt->sc_ioc_list, ioc);

		rw_exit(&ioc->ioc_rwlock);
		rw_exit(&srpt_ctxt->sc_rwlock);
		break;

	case IBT_HCA_DETACH_EVENT:
		SRPT_DPRINTF_L1(
		    "ioc_ib_async_hdlr, HCA_DETACH_EVENT received.");
		break;

	case IBT_EVENT_EMPTY_CHAN:
		/* Channel in ERROR state is now empty */
		ch = (srpt_channel_t *)ibt_get_chan_private(event->ev_chan_hdl);
		SRPT_DPRINTF_L3(
		    "ioc_ib_async_hdlr, received empty channel error on %p",
		    (void *)ch);
		break;

	default:
		SRPT_DPRINTF_L2("ioc_ib_async_hdlr, event not "
		    "handled (%d)", code);
		break;
	}
}

/*
 * srpt_ioc_svc_bind()
 */
ibt_status_t
srpt_ioc_svc_bind(srpt_target_port_t *tgt, uint_t portnum)
{
	ibt_status_t		status;
	srpt_hw_port_t		*port;
	ibt_hca_portinfo_t	*portinfo;
	uint_t			qportinfo_sz;
	uint_t			qportnum;
	ib_gid_t		new_gid;
	srpt_ioc_t		*ioc;

	ASSERT(tgt != NULL);
	ASSERT(tgt->tp_ioc != NULL);
	ioc = tgt->tp_ioc;

	if (tgt->tp_ibt_svc_hdl == NULL) {
		SRPT_DPRINTF_L2("ioc_svc_bind, NULL SCSI target port"
		    " service");
		return (IBT_INVALID_PARAM);
	}

	if (portnum == 0 || portnum > tgt->tp_nports) {
		SRPT_DPRINTF_L2("ioc_svc_bind, bad port (%d)", portnum);
		return (IBT_INVALID_PARAM);
	}
	status = ibt_query_hca_ports(ioc->ioc_ibt_hdl, portnum,
	    &portinfo, &qportnum, &qportinfo_sz);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_svc_bind, query port %d error (%d)",
		    portnum, status);
		return (IBT_INVALID_PARAM);
	}

	ASSERT(portinfo != NULL);

	/*
	 * If port is not active do nothing, caller should attempt to bind
	 * after the port goes active.
	 */
	if (portinfo->p_linkstate != IBT_PORT_ACTIVE) {
		SRPT_DPRINTF_L2("ioc_svc_bind, port %d not in active state",
		    portnum);
		ibt_free_portinfo(portinfo, qportinfo_sz);
		return (IBT_HCA_PORT_NOT_ACTIVE);
	}

	port    = &tgt->tp_hw_port[portnum-1];
	new_gid = portinfo->p_sgid_tbl[0];
	ibt_free_portinfo(portinfo, qportinfo_sz);

	/*
	 * If previously bound and the port GID has changed,
	 * rebind to the new GID.
	 */
	if (port->hwp_bind_hdl != NULL) {
		if (new_gid.gid_guid != port->hwp_gid.gid_guid ||
		    new_gid.gid_prefix != port->hwp_gid.gid_prefix) {
			SRPT_DPRINTF_L2("ioc_svc_bind, unregister current"
			    " bind");
			ibt_unbind_service(tgt->tp_ibt_svc_hdl,
			    port->hwp_bind_hdl);
			port->hwp_bind_hdl = NULL;
		}
	}
	SRPT_DPRINTF_L2("ioc_svc_bind, bind service, %016llx:%016llx",
	    (u_longlong_t)new_gid.gid_prefix,
	    (u_longlong_t)new_gid.gid_guid);

	/*
	 * Pass SCSI Target Port as CM private data, the target will always
	 * exist while this service is bound.
	 */
	status = ibt_bind_service(tgt->tp_ibt_svc_hdl, new_gid, NULL, tgt,
	    &port->hwp_bind_hdl);
	if (status != IBT_SUCCESS && status != IBT_CM_SERVICE_EXISTS) {
		SRPT_DPRINTF_L1("ioc_svc_bind, bind error (%d)", status);
		return (status);
	}

	return (IBT_SUCCESS);
}

/*
 * srpt_ioc_svc_unbind()
 */
void
srpt_ioc_svc_unbind(srpt_target_port_t *tgt, uint_t portnum)
{
	srpt_hw_port_t		*port;

	if (tgt == NULL) {
		SRPT_DPRINTF_L2("ioc_svc_unbind, SCSI target does not exist");
		return;
	}

	if (portnum == 0 || portnum > tgt->tp_nports) {
		SRPT_DPRINTF_L2("ioc_svc_unbind, bad port (%d)", portnum);
		return;
	}
	port = &tgt->tp_hw_port[portnum-1];

	if (tgt->tp_ibt_svc_hdl != NULL && port->hwp_bind_hdl != NULL) {
		SRPT_DPRINTF_L2("ioc_svc_unbind, unregister current bind");
		ibt_unbind_service(tgt->tp_ibt_svc_hdl, port->hwp_bind_hdl);
	}
	port->hwp_bind_hdl = NULL;
}

/*
 * srpt_ioc_svc_unbind_all()
 */
void
srpt_ioc_svc_unbind_all(srpt_target_port_t *tgt)
{
	uint_t		portnum;

	if (tgt == NULL) {
		SRPT_DPRINTF_L2("ioc_svc_unbind_all, NULL SCSI target port"
		    " specified");
		return;
	}
	for (portnum = 1; portnum <= tgt->tp_nports; portnum++) {
		srpt_ioc_svc_unbind(tgt, portnum);
	}
}

/*
 * srpt_ioc_get_locked()
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
srpt_ioc_t *
srpt_ioc_get_locked(ib_guid_t guid)
{
	srpt_ioc_t	*ioc;

	ioc = list_head(&srpt_ctxt->sc_ioc_list);
	while (ioc != NULL) {
		if (ioc->ioc_guid == guid) {
			break;
		}
		ioc = list_next(&srpt_ctxt->sc_ioc_list, ioc);
	}
	return (ioc);
}

/*
 * srpt_ioc_get()
 */
srpt_ioc_t *
srpt_ioc_get(ib_guid_t guid)
{
	srpt_ioc_t	*ioc;

	rw_enter(&srpt_ctxt->sc_rwlock, RW_READER);
	ioc = srpt_ioc_get_locked(guid);
	rw_exit(&srpt_ctxt->sc_rwlock);
	return (ioc);
}

/*
 * srpt_ioc_post_recv_iu()
 */
ibt_status_t
srpt_ioc_post_recv_iu(srpt_ioc_t *ioc, srpt_iu_t *iu)
{
	ibt_status_t		status;
	ibt_recv_wr_t		wr;
	uint_t			posted;

	ASSERT(ioc != NULL);
	ASSERT(iu != NULL);

	wr.wr_id  = (ibt_wrid_t)(uintptr_t)iu;
	wr.wr_nds = 1;
	wr.wr_sgl = &iu->iu_sge;
	posted    = 0;

	status = ibt_post_srq(ioc->ioc_srq_hdl, &wr, 1, &posted);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L2("ioc_post_recv_iu, post error (%d)",
		    status);
	}
	return (status);
}
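
/*
 * A sketch of the completion side (see the CQ handling in srpt_ch.c):
 * because wr_id above carries the IU pointer, a receive completion can
 * recover the IU directly from the work completion, e.g.
 *
 *	iu = (srpt_iu_t *)(uintptr_t)wc.wc_id;
 */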

/*
 * srpt_ioc_repost_recv_iu()
 */
void
srpt_ioc_repost_recv_iu(srpt_ioc_t *ioc, srpt_iu_t *iu)
{
	srpt_channel_t		*ch;
	ibt_status_t		status;

	ASSERT(iu != NULL);
	ASSERT(mutex_owned(&iu->iu_lock));

	/*
	 * Some additional sanity checks while in debug state; all STMF
	 * related task activities should be complete prior to returning
	 * this IU to the available pool.
	 */
	ASSERT(iu->iu_stmf_task == NULL);
	ASSERT(iu->iu_sq_posted_cnt == 0);

	ch = iu->iu_ch;
	iu->iu_ch = NULL;
	iu->iu_num_rdescs = 0;
	iu->iu_rdescs = NULL;
	iu->iu_tot_xfer_len = 0;
	iu->iu_tag = 0;
	iu->iu_flags = 0;
	iu->iu_sq_posted_cnt = 0;

	status = srpt_ioc_post_recv_iu(ioc, iu);

	if (status != IBT_SUCCESS) {
		/*
		 * Very bad, we should initiate a shutdown of the I/O
		 * Controller here, off-lining any targets associated
		 * with this I/O Controller (and therefore disconnecting
		 * any logins that remain).
		 *
		 * In practice this should never happen so we put
		 * the code near the bottom of the implementation list.
		 */
		SRPT_DPRINTF_L0("ioc_repost_recv_iu, error RX IU (%d)",
		    status);
		ASSERT(0);
	} else if (ch != NULL) {
		atomic_inc_32(&ch->ch_req_lim_delta);
	}
}
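
/*
 * Note on ch_req_lim_delta above: SRP flow control is credit based.
 * The accumulated delta is returned to the initiator in the REQUEST
 * LIMIT DELTA field of a subsequent SRP_RSP (see srpt_ch.c), which
 * replenishes the initiator's send credits.
 */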

/*
 * srpt_ioc_init_profile()
 *
 * SRP I/O Controller serialization lock must be held when this
 * routine is invoked.
 */
void
srpt_ioc_init_profile(srpt_ioc_t *ioc)
{
	srpt_ioc_opcap_mask_t		capmask = {0};

	ASSERT(ioc != NULL);

	ioc->ioc_profile.ioc_guid = h2b64(ioc->ioc_guid);
	(void) memcpy(ioc->ioc_profile.ioc_id_string,
	    "Solaris SRP Target 0.9a", 23);

	/*
	 * Note vendor ID and subsystem ID are 24 bit values.  The low
	 * order 8 bits of the vendor ID field hold the slot and are
	 * initialized to zero.  The low order 8 bits of the subsystem
	 * ID are a reserved field and are initialized to zero.
	 */
	ioc->ioc_profile.ioc_vendorid =
	    h2b32((uint32_t)(ioc->ioc_attr.hca_vendor_id << 8));
	ioc->ioc_profile.ioc_deviceid =
	    h2b32((uint32_t)ioc->ioc_attr.hca_device_id);
	ioc->ioc_profile.ioc_device_ver =
	    h2b16((uint16_t)ioc->ioc_attr.hca_version_id);
	ioc->ioc_profile.ioc_subsys_vendorid =
	    h2b32((uint32_t)(ioc->ioc_attr.hca_vendor_id << 8));
	ioc->ioc_profile.ioc_subsys_id = h2b32(0);
	ioc->ioc_profile.ioc_io_class = h2b16(SRP_REV_16A_IO_CLASS);
	ioc->ioc_profile.ioc_io_subclass = h2b16(SRP_IO_SUBCLASS);
	ioc->ioc_profile.ioc_protocol = h2b16(SRP_PROTOCOL);
	ioc->ioc_profile.ioc_protocol_ver = h2b16(SRP_PROTOCOL_VERSION);
	ioc->ioc_profile.ioc_send_msg_qdepth = h2b16(srpt_send_msg_depth);
	ioc->ioc_profile.ioc_rdma_read_qdepth =
	    ioc->ioc_attr.hca_max_rdma_out_chan;
	ioc->ioc_profile.ioc_send_msg_sz = h2b32(SRPT_DEFAULT_SEND_MSG_SIZE);
	ioc->ioc_profile.ioc_rdma_xfer_sz = h2b32(SRPT_DEFAULT_MAX_RDMA_SIZE);

	capmask.bits.st = 1;	/* Messages can be sent to IOC */
	capmask.bits.sf = 1;	/* Messages can be sent from IOC */
	capmask.bits.rf = 1;	/* RDMA Reads can be sent from IOC */
	capmask.bits.wf = 1;	/* RDMA Writes can be sent from IOC */
	ioc->ioc_profile.ioc_ctrl_opcap_mask = capmask.mask;

	/*
	 * We currently only have one target, but if we had a list we would
	 * go through that list and only count those that are ONLINE when
	 * setting the services count and entries.
	 */
	if (ioc->ioc_tgt_port->tp_srp_enabled) {
		ioc->ioc_profile.ioc_service_entries = 1;
		ioc->ioc_svc.srv_id = h2b64(ioc->ioc_guid);
		(void) snprintf((char *)ioc->ioc_svc.srv_name,
		    IB_DM_MAX_SVC_NAME_LEN, "SRP.T10:%016llx",
		    (u_longlong_t)ioc->ioc_guid);
	} else {
		ioc->ioc_profile.ioc_service_entries = 0;
		ioc->ioc_svc.srv_id = 0;
	}
}
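
/*
 * For example, an IOC whose GUID is 0x0002c903000012af (an illustrative
 * value) advertises the DM service name "SRP.T10:0002c903000012af".
 */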

/*
 * srpt_ioc_ds_alloc_dbuf()
 */
/* ARGSUSED */
stmf_data_buf_t *
srpt_ioc_ds_alloc_dbuf(struct scsi_task *task, uint32_t size,
	uint32_t *pminsize, uint32_t flags)
{
	srpt_iu_t		*iu;
	srpt_ioc_t		*ioc;
	srpt_ds_dbuf_t		*dbuf;
	stmf_data_buf_t		*stmf_dbuf;
	void			*buf;
	srpt_mr_t		mr;

	ASSERT(task != NULL);
	iu  = task->task_port_private;
	ioc = iu->iu_ioc;

	SRPT_DPRINTF_L4("ioc_ds_alloc_dbuf, invoked ioc(%p)"
	    " size(%d), flags(%x)",
	    (void *)ioc, size, flags);

	buf = srpt_vmem_alloc(ioc->ioc_dbuf_pool, size);
	if (buf == NULL) {
		return (NULL);
	}

	if (srpt_vmem_mr(ioc->ioc_dbuf_pool, buf, size, &mr) != 0) {
		goto stmf_alloc_err;
	}

	stmf_dbuf = stmf_alloc(STMF_STRUCT_DATA_BUF, sizeof (srpt_ds_dbuf_t),
	    0);
	if (stmf_dbuf == NULL) {
		SRPT_DPRINTF_L2("ioc_ds_alloc_dbuf, stmf_alloc failed");
		goto stmf_alloc_err;
	}

	dbuf = stmf_dbuf->db_port_private;
	dbuf->db_stmf_buf = stmf_dbuf;
	dbuf->db_mr_hdl = mr.mr_hdl;
	dbuf->db_ioc = ioc;
	dbuf->db_sge.ds_va = mr.mr_va;
	dbuf->db_sge.ds_key = mr.mr_lkey;
	dbuf->db_sge.ds_len = size;

	stmf_dbuf->db_buf_size = size;
	stmf_dbuf->db_data_size = size;
	stmf_dbuf->db_relative_offset = 0;
	stmf_dbuf->db_flags = 0;
	stmf_dbuf->db_xfer_status = 0;
	stmf_dbuf->db_sglist_length = 1;
	stmf_dbuf->db_sglist[0].seg_addr = buf;
	stmf_dbuf->db_sglist[0].seg_length = size;

	return (stmf_dbuf);

buf_mr_err:
	stmf_free(stmf_dbuf);

stmf_alloc_err:
	srpt_vmem_free(ioc->ioc_dbuf_pool, buf, size);

	return (NULL);
}
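
/*
 * Note that srpt_ioc_ds_alloc_dbuf() performs no per-buffer
 * ibt_register_mr() call; the SGE reuses the lkey of the arena chunk
 * MR that already covers the allocation (see srpt_vmem_mr()).
 */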

void
srpt_ioc_ds_free_dbuf(struct stmf_dbuf_store *ds,
	stmf_data_buf_t *dbuf)
{
	srpt_ioc_t	*ioc;

	SRPT_DPRINTF_L4("ioc_ds_free_dbuf, invoked buf (%p)",
	    (void *)dbuf);
	ioc = ds->ds_port_private;

	srpt_vmem_free(ioc->ioc_dbuf_pool, dbuf->db_sglist[0].seg_addr,
	    dbuf->db_buf_size);
	stmf_free(dbuf);
}

/* Memory arena routines */

static srpt_vmem_pool_t *
srpt_vmem_create(const char *name, srpt_ioc_t *ioc, ib_memlen_t chunksize,
    uint64_t maxsize, ibt_mr_flags_t flags)
{
	srpt_mr_t		*chunk;
	srpt_vmem_pool_t	*result;

	ASSERT(chunksize <= maxsize);

	result = kmem_zalloc(sizeof (srpt_vmem_pool_t), KM_SLEEP);

	result->svp_ioc = ioc;
	result->svp_chunksize = chunksize;
	result->svp_max_size = maxsize;
	result->svp_flags = flags;

	rw_init(&result->svp_lock, NULL, RW_DRIVER, NULL);
	avl_create(&result->svp_mr_list, srpt_vmem_mr_compare,
	    sizeof (srpt_mr_t), offsetof(srpt_mr_t, mr_avl));

	chunk = srpt_vmem_chunk_alloc(result, chunksize);
	if (chunk == NULL) {
		/* initial chunk allocation or registration failed */
		avl_destroy(&result->svp_mr_list);
		rw_destroy(&result->svp_lock);
		kmem_free(result, sizeof (srpt_vmem_pool_t));
		return (NULL);
	}

	avl_add(&result->svp_mr_list, chunk);

	/* the chunk may have been trimmed; count what we actually got */
	result->svp_total_size = chunk->mr_len;

	result->svp_vmem = vmem_create(name,
	    (void *)(uintptr_t)chunk->mr_va,
	    (size_t)chunk->mr_len, SRPT_MR_QUANTSIZE,
	    NULL, NULL, NULL, 0, VM_SLEEP);

	return (result);
}
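
/*
 * The arena is created with no vmem source or import functions; growth
 * is handled explicitly in srpt_vmem_alloc() via vmem_add() so that
 * each new span can be registered with the HCA before it is handed out.
 */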

static void
srpt_vmem_destroy(srpt_vmem_pool_t *vm_pool)
{
	srpt_mr_t		*chunk;
	srpt_mr_t		*next;

	rw_enter(&vm_pool->svp_lock, RW_WRITER);
	vmem_destroy(vm_pool->svp_vmem);

	chunk = avl_first(&vm_pool->svp_mr_list);

	while (chunk != NULL) {
		next = AVL_NEXT(&vm_pool->svp_mr_list, chunk);
		avl_remove(&vm_pool->svp_mr_list, chunk);
		srpt_vmem_chunk_free(vm_pool, chunk);
		chunk = next;
	}

	avl_destroy(&vm_pool->svp_mr_list);

	rw_exit(&vm_pool->svp_lock);
	rw_destroy(&vm_pool->svp_lock);

	kmem_free(vm_pool, sizeof (srpt_vmem_pool_t));
}

static void *
srpt_vmem_alloc(srpt_vmem_pool_t *vm_pool, size_t size)
{
	void		*result;
	srpt_mr_t	*next;
	ib_memlen_t	chunklen;

	ASSERT(vm_pool != NULL);

	result = vmem_alloc(vm_pool->svp_vmem, size,
	    VM_NOSLEEP | VM_FIRSTFIT);

	if (result != NULL) {
		/* memory successfully allocated */
		return (result);
	}

	/* need more vmem */
	rw_enter(&vm_pool->svp_lock, RW_WRITER);
	chunklen = vm_pool->svp_chunksize;

	if (vm_pool->svp_total_size >= vm_pool->svp_max_size) {
		/* no more room to alloc */
		rw_exit(&vm_pool->svp_lock);
		return (NULL);
	}

	if ((vm_pool->svp_total_size + chunklen) > vm_pool->svp_max_size) {
		chunklen = vm_pool->svp_max_size - vm_pool->svp_total_size;
	}

	next = srpt_vmem_chunk_alloc(vm_pool, chunklen);
	if (next != NULL) {
		/*
		 * Note that the size of the chunk we got
		 * may not be the size we requested.  Use the
		 * length returned in the chunk itself.
		 */
		if (vmem_add(vm_pool->svp_vmem, (void *)(uintptr_t)next->mr_va,
		    next->mr_len, VM_NOSLEEP) == NULL) {
			srpt_vmem_chunk_free(vm_pool, next);
			SRPT_DPRINTF_L2("vmem_add failed");
		} else {
			vm_pool->svp_total_size += next->mr_len;
			avl_add(&vm_pool->svp_mr_list, next);
		}
	}

	rw_exit(&vm_pool->svp_lock);

	result = vmem_alloc(vm_pool->svp_vmem, size, VM_NOSLEEP | VM_FIRSTFIT);

	return (result);
}

static void
srpt_vmem_free(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size)
{
	vmem_free(vm_pool->svp_vmem, vaddr, size);
}

static int
srpt_vmem_mr(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size,
    srpt_mr_t *mr)
{
	avl_index_t		where;
	ib_vaddr_t		mrva = (ib_vaddr_t)(uintptr_t)vaddr;
	srpt_mr_t		chunk;
	srpt_mr_t		*nearest;
	ib_vaddr_t		chunk_end;
	int			status = DDI_FAILURE;

	rw_enter(&vm_pool->svp_lock, RW_READER);

	chunk.mr_va = mrva;
	nearest = avl_find(&vm_pool->svp_mr_list, &chunk, &where);

	if (nearest == NULL) {
		nearest = avl_nearest(&vm_pool->svp_mr_list, where,
		    AVL_BEFORE);
	}
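
	/*
	 * nearest is now the registered chunk with the highest base
	 * address at or below mrva, if any such chunk exists.
	 */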
	if (nearest != NULL) {
		/* Verify this chunk contains the specified address range */
		ASSERT(nearest->mr_va <= mrva);

		chunk_end = nearest->mr_va + nearest->mr_len;
		if (chunk_end >= mrva + size) {
			mr->mr_hdl = nearest->mr_hdl;
			mr->mr_va = mrva;
			mr->mr_len = size;
			mr->mr_lkey = nearest->mr_lkey;
			mr->mr_rkey = nearest->mr_rkey;
			status = DDI_SUCCESS;
		}
	}

	rw_exit(&vm_pool->svp_lock);
	return (status);
}

static srpt_mr_t *
srpt_vmem_chunk_alloc(srpt_vmem_pool_t *vm_pool, ib_memlen_t chunksize)
{
	void			*chunk = NULL;
	srpt_mr_t		*result = NULL;

	while ((chunk == NULL) && (chunksize >= SRPT_MIN_CHUNKSIZE)) {
		chunk = kmem_alloc(chunksize, KM_NOSLEEP);
		if (chunk == NULL) {
			SRPT_DPRINTF_L2("srpt_vmem_chunk_alloc: "
			    "failed to alloc chunk of %d, trying %d",
			    (int)chunksize, (int)chunksize / 2);
			chunksize /= 2;
		}
	}

	if (chunk != NULL) {
		result = srpt_reg_mem(vm_pool, (ib_vaddr_t)(uintptr_t)chunk,
		    chunksize);
		if (result == NULL) {
			SRPT_DPRINTF_L2("srpt_vmem_chunk_alloc: "
			    "chunk registration failed");
			kmem_free(chunk, chunksize);
		}
	}

	return (result);
}

static void
srpt_vmem_chunk_free(srpt_vmem_pool_t *vm_pool, srpt_mr_t *mr)
{
	void			*chunk = (void *)(uintptr_t)mr->mr_va;
	ib_memlen_t		chunksize = mr->mr_len;

	srpt_dereg_mem(vm_pool->svp_ioc, mr);
	kmem_free(chunk, chunksize);
}

static srpt_mr_t *
srpt_reg_mem(srpt_vmem_pool_t *vm_pool, ib_vaddr_t vaddr, ib_memlen_t len)
{
	srpt_mr_t		*result = NULL;
	ibt_mr_attr_t		mr_attr;
	ibt_mr_desc_t		mr_desc;
	ibt_status_t		status;
	srpt_ioc_t		*ioc = vm_pool->svp_ioc;

	result = kmem_zalloc(sizeof (srpt_mr_t), KM_NOSLEEP);
	if (result == NULL) {
		SRPT_DPRINTF_L2("srpt_reg_mem: failed to allocate");
		return (NULL);
	}

	bzero(&mr_attr, sizeof (ibt_mr_attr_t));
	bzero(&mr_desc, sizeof (ibt_mr_desc_t));

	mr_attr.mr_vaddr = vaddr;
	mr_attr.mr_len = len;
	mr_attr.mr_as = NULL;
	mr_attr.mr_flags = vm_pool->svp_flags;

	status = ibt_register_mr(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl,
	    &mr_attr, &result->mr_hdl, &mr_desc);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L2("srpt_reg_mem: ibt_register_mr "
		    "failed %d", status);
		kmem_free(result, sizeof (srpt_mr_t));
		return (NULL);
	}

	result->mr_va = mr_attr.mr_vaddr;
	result->mr_len = mr_attr.mr_len;
	result->mr_lkey = mr_desc.md_lkey;
	result->mr_rkey = mr_desc.md_rkey;

	return (result);
}

static void
srpt_dereg_mem(srpt_ioc_t *ioc, srpt_mr_t *mr)
{
	ibt_status_t		status;

	status = ibt_deregister_mr(ioc->ioc_ibt_hdl, mr->mr_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("srpt_dereg_mem, error deregistering MR (%d)",
		    status);
	}
	kmem_free(mr, sizeof (srpt_mr_t));
}

static int
srpt_vmem_mr_compare(const void *a, const void *b)
{
	srpt_mr_t		*mr1 = (srpt_mr_t *)a;
	srpt_mr_t		*mr2 = (srpt_mr_t *)b;

	/* sort and match by virtual address */
	if (mr1->mr_va < mr2->mr_va) {
		return (-1);
	} else if (mr1->mr_va > mr2->mr_va) {
		return (1);
	}

	return (0);
}
1369