xref: /titanic_52/usr/src/uts/common/io/ib/clients/of/sol_ofs/sol_kverbs.c (revision 17a2b317610f531d565bf4e940433aab2d9e6985)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /* Solaris Open Fabric kernel verbs */
26 
27 #include <sys/types.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/modctl.h>
31 #include <sys/ib/clients/of/rdma/ib_verbs.h>
32 #include <sys/ib/clients/of/rdma/ib_addr.h>
33 #include <sys/ib/clients/of/rdma/rdma_cm.h>
34 #include <sys/ib/clients/of/sol_ofs/sol_kverb_impl.h>
35 
/*
 * Not referenced in the visible part of this file.
 * NOTE(review): the name suggests a soft-state anchor -- confirm.
 */
static void *statep;
/* Tag passed as the first argument of every SOL_OFS_DPRINTF_L*() call. */
char *sol_kverbs_dbg_str = "sol_kverbs";

/*
 * List of registered clients; each ofs_client_t is linked onto it through
 * its client_list member by ib_register_client().
 */
static llist_head_t client_list = LLIST_HEAD_INIT(client_list);
kmutex_t clist_lock; /* mutex for client_list */

/* Forward declaration; alloc_ibt_client() installs it as mi_async_handler. */
static void ofs_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    ibt_async_event_t *);
44 
45 /*
46  * set ibt_client_t members. clnt->ib_client must be set before
47  * this func is called.
48  */
49 static int
50 alloc_ibt_client(ofs_client_t *clnt)
51 {
52 	int namelen;
53 	ASSERT(clnt->ib_client != NULL);
54 
55 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
56 	    "alloc_ibt_client: client: 0x%p", clnt);
57 
58 	/*
59 	 * double-check the name string. if it's longer than MAXNAMELEN
60 	 * including the string terminator, assuming the name is invalid,
61 	 * return EINVAL.
62 	 */
63 	namelen = strlen(clnt->ib_client->name);
64 	if (namelen >= MAXNAMELEN) {
65 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
66 		    "alloc_ibt_client: client: 0x%p => "
67 		    "namelen(%d) is larger than MAXNAMELEN", clnt, namelen);
68 		return (-EINVAL);
69 	}
70 	clnt->ibt_client.mi_clnt_name = kmem_zalloc(namelen + 1, KM_NOSLEEP);
71 	if (clnt->ibt_client.mi_clnt_name == NULL) {
72 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
73 		    "alloc_ibt_client: client: 0x%p => "
74 		    "no sufficient memory", clnt);
75 		return (-ENOMEM);
76 	}
77 	bcopy(clnt->ib_client->name, clnt->ibt_client.mi_clnt_name, namelen);
78 	clnt->ibt_client.mi_ibt_version = IBTI_V_CURR;
79 	if (clnt->ib_client->dip) {
80 		clnt->ibt_client.mi_clnt_class = IBT_GENERIC;
81 	} else {
82 		clnt->ibt_client.mi_clnt_class = IBT_GENERIC_MISC;
83 	}
84 	clnt->ibt_client.mi_async_handler = ofs_async_handler;
85 
86 	return (0);
87 }
88 
89 static void
90 free_ibt_client(ofs_client_t *clnt)
91 {
92 	int namelen = strlen(clnt->ib_client->name);
93 	ASSERT(namelen < MAXNAMELEN);
94 
95 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
96 	    "free_ibt_client: client: 0x%p", clnt);
97 
98 	kmem_free(clnt->ibt_client.mi_clnt_name, namelen + 1);
99 	clnt->ibt_client.mi_clnt_name = NULL;
100 }
101 
102 /*
103  * get_device() returns a pointer to struct ib_devcie with
104  * the same guid as one passed to the function.
105  */
106 static ib_device_t *
107 get_device(ofs_client_t *ofs_client, ib_guid_t guid)
108 {
109 	ib_device_t *device;
110 	llist_head_t *entry;
111 
112 	ASSERT(RW_LOCK_HELD(&ofs_client->lock));
113 
114 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
115 	    "get_device: client: 0x%p, guid:0x%p",
116 	    ofs_client, (void *)(uintptr_t)htonll(guid));
117 
118 	list_for_each(entry, &ofs_client->device_list) {
119 		device = entry->ptr;
120 		if (device->node_guid == htonll(guid)) {
121 			ASSERT(device->reg_state == IB_DEV_CLOSE);
122 			ASSERT(device->node_type == RDMA_NODE_IB_CA);
123 			ASSERT(device->clnt_hdl == (ofs_client_p_t)ofs_client);
124 			return (device);
125 		}
126 	}
127 
128 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
129 	    "get_device: client: 0x%p, guid:0x%p => no match guid",
130 	    ofs_client, (void *)(uintptr_t)htonll(guid));
131 
132 	return (NULL);
133 }
134 
135 /*
136  * ofs_async_handler() is a delegated function to handle asynchrnonous events,
137  * which dispatches each event to corresponding qp/cq handlers registered
138  * with ib_create_qp() and/or ib_create_cq().
139  */
140 static void
141 ofs_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
142     ibt_async_event_t *event)
143 {
144 	ofs_client_t 	*ofs_client = (ofs_client_t *)clntp;
145 	struct ib_event ib_event;
146 	struct ib_qp 	*qpp;
147 	struct ib_cq	*cqp;
148 
149 
150 	ASSERT(ofs_client != NULL);
151 
152 	cqp = event->ev_cq_hdl ? ibt_get_cq_private(event->ev_cq_hdl) : NULL;
153 	qpp = event->ev_chan_hdl ?
154 	    ibt_get_qp_private(event->ev_chan_hdl) : NULL;
155 
156 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
157 	    "ofs_async_handler: client: 0x%p, hca_hdl: 0x%p, code:0x%x, "
158 	    "event->qp: 0x%p, event->cq: 0x%p, event->srq: 0x%p "
159 	    "event->guid: 0x%p, event->port: 0x%x",
160 	    clntp, hdl, code, qpp, cqp, event->ev_srq_hdl,
161 	    (void *)(uintptr_t)event->ev_hca_guid, event->ev_port);
162 
163 	bzero(&ib_event, sizeof (struct ib_event));
164 	switch (code) {
165 	case IBT_EVENT_PATH_MIGRATED:
166 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
167 		    IB_EVENT_PATH_MIG);
168 		return;
169 	case IBT_EVENT_SQD:
170 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
171 		    IB_EVENT_SQ_DRAINED);
172 		return;
173 	case IBT_EVENT_COM_EST:
174 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
175 		    IB_EVENT_COMM_EST);
176 		return;
177 	case IBT_ERROR_CATASTROPHIC_CHAN:
178 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
179 		    IB_EVENT_QP_FATAL);
180 		return;
181 	case IBT_ERROR_INVALID_REQUEST_CHAN:
182 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
183 		    IB_EVENT_QP_REQ_ERR);
184 		return;
185 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
186 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
187 		    IB_EVENT_QP_ACCESS_ERR);
188 		return;
189 	case IBT_ERROR_PATH_MIGRATE_REQ:
190 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
191 		    IB_EVENT_PATH_MIG);
192 		return;
193 	case IBT_EVENT_EMPTY_CHAN:
194 		FIRE_QP_EVENT(ofs_client, hdl, ib_event, qpp,
195 		    IB_EVENT_QP_LAST_WQE_REACHED);
196 		return;
197 	case IBT_ERROR_CQ:
198 		FIRE_CQ_EVENT(ofs_client, hdl, ib_event, cqp,
199 		    IB_EVENT_CQ_ERR);
200 		return;
201 	case IBT_HCA_ATTACH_EVENT:
202 	{
203 		ib_device_t	*device;
204 		int		rtn;
205 
206 		/* re-use the device once it was created */
207 		rw_enter(&ofs_client->lock, RW_WRITER);
208 		device = get_device(ofs_client, event->ev_hca_guid);
209 		if (device == NULL) {
210 			device = kmem_alloc(sizeof (ib_device_t), KM_SLEEP);
211 			device->node_type = RDMA_NODE_IB_CA;
212 			device->reg_state = IB_DEV_CLOSE;
213 			device->clnt_hdl = (ofs_client_p_t)ofs_client;
214 			device->node_guid = htonll(event->ev_hca_guid);
215 			device->data = NULL;
216 			/* add this HCA */
217 			ofs_client->hca_num++;
218 			llist_head_init(&device->list, device);
219 			llist_add_tail(&device->list, &ofs_client->device_list);
220 		}
221 		device->hca_hdl = NULL;
222 		device->local_dma_lkey = 0;
223 		device->phys_port_cnt = 0;
224 
225 		/* open this HCA */
226 		rtn = ibt_open_hca(ofs_client->ibt_hdl, event->ev_hca_guid,
227 		    &device->hca_hdl);
228 		if (rtn == IBT_SUCCESS) {
229 			ibt_hca_attr_t hattr;
230 
231 			ofs_client->hca_open_num++;
232 			device->reg_state = IB_DEV_OPEN;
233 			ibt_set_hca_private(device->hca_hdl, device);
234 
235 			rtn = ibt_query_hca(device->hca_hdl, &hattr);
236 			if (rtn != IBT_SUCCESS) {
237 				device->reg_state = IB_DEV_CLOSE;
238 				rtn = ibt_close_hca(device->hca_hdl);
239 				ASSERT(rtn == IBT_SUCCESS);
240 				ofs_client->hca_open_num--;
241 				return;
242 			}
243 
244 			(void) sprintf(device->name, "%x:%x:%x",
245 			    hattr.hca_vendor_id, hattr.hca_device_id,
246 			    hattr.hca_version_id);
247 			device->local_dma_lkey = hattr.hca_reserved_lkey;
248 			device->phys_port_cnt = hattr.hca_nports;
249 			ibt_set_hca_private(device->hca_hdl, device);
250 
251 			/* invoke client's callback */
252 			if (ofs_client->ib_client->add) {
253 				ofs_client->ib_client->add(device);
254 			}
255 		}
256 		rw_exit(&ofs_client->lock);
257 
258 		return;
259 	}
260 	case IBT_HCA_DETACH_EVENT:
261 	{
262 		struct ib_device *device;
263 
264 		rw_enter(&ofs_client->lock, RW_WRITER);
265 		device = ibt_get_hca_private(hdl);
266 		if (device->reg_state == IB_DEV_OPEN) {
267 			ibt_status_t rtn;
268 			/* invoke client's callback */
269 			if (ofs_client->ib_client->remove) {
270 				ofs_client->ib_client->remove(device);
271 			}
272 			/* change the state only */
273 			device->reg_state = IB_DEV_CLOSE;
274 			/* close this HCA */
275 			rtn = ibt_close_hca(device->hca_hdl);
276 			ASSERT(rtn == IBT_SUCCESS);
277 			ofs_client->hca_open_num--;
278 		}
279 		rw_exit(&ofs_client->lock);
280 
281 		return;
282 	}
283 	case IBT_EVENT_LIMIT_REACHED_SRQ:
284 	case IBT_ERROR_CATASTROPHIC_SRQ:
285 	default:
286 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
287 		    "sol_ofs does not support this event(0x%x).\n"
288 		    "\t clntp=0x%p, hca_hdl=0x%p, code=%d, eventp=0x%p\n",
289 		    code, clntp, hdl, code, event);
290 		return;
291 	}
292 }
293 
294 /*
295  * ib_register_client - Register an IB client
296  * @client:Client to register
297  *
298  * Upper level users of the IB drivers can use ib_register_client() to
299  * register callbacks for IB device addition and removal.  When an IB
300  * device is added, each registered client's add method will be called
301  * (in the order the clients were registered), and when a device is
302  * removed, each client's remove method will be called (in the reverse
303  * order that clients were registered).  In addition, when
304  * ib_register_client() is called, the client will receive an add
305  * callback for all devices already registered.
306  *
307  * Note that struct ib_client should have a dip pointer to the client,
308  * which is different from the Linux implementation.
309  */
310 int
311 ib_register_client(struct ib_client *client)
312 {
313 	uint_t		i, nhcas; /* number of HCAs */
314 	ib_guid_t	*guidp;
315 	ofs_client_t	*ofs_client;
316 	llist_head_t	*entry, *tmp;
317 	ib_device_t	*device;
318 	int		rtn;
319 
320 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
321 	    "ib_register_client: client: 0x%p", client);
322 
323 	/* get the number of HCAs on this system */
324 	if ((nhcas = ibt_get_hca_list(&guidp)) == 0) {
325 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
326 		    "ib_register_client: client: 0x%p => no HCA", client);
327 		return (-ENXIO);
328 	}
329 
330 	/* allocate a new sol_ofs_client structure */
331 	ofs_client = kmem_zalloc(sizeof (ofs_client_t), KM_NOSLEEP);
332 	if (ofs_client == NULL) {
333 		(void) ibt_free_hca_list(guidp, nhcas);
334 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
335 		    "ib_register_client: client: 0x%p => "
336 		    "no sufficient memory for ofs_client", client);
337 		return (-ENOMEM);
338 	}
339 
340 	/* set members */
341 	ofs_client->ib_client = client;
342 	if ((rtn = alloc_ibt_client(ofs_client)) != 0) {
343 		kmem_free(ofs_client, sizeof (ofs_client_t));
344 		(void) ibt_free_hca_list(guidp, nhcas);
345 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
346 		    "ib_register_client: client: 0x%p => "
347 		    "alloc_ibt_client failed w/ 0x%x", client, rtn);
348 		return (rtn);
349 	}
350 	ofs_client->state = IB_OFS_CLNT_INITIALIZED;
351 	llist_head_init(&ofs_client->device_list, NULL);
352 	llist_head_init(&ofs_client->client_list, ofs_client);
353 	rw_init(&ofs_client->lock, NULL, RW_DEFAULT, NULL);
354 
355 	/* initialize IB client */
356 	rw_enter(&ofs_client->lock, RW_WRITER);
357 	if (client->state != IB_CLNT_UNINITIALIZED) {
358 		rw_exit(&ofs_client->lock);
359 		kmem_free(ofs_client, sizeof (ofs_client_t));
360 		(void) ibt_free_hca_list(guidp, nhcas);
361 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
362 		    "ib_register_client: client: 0x%p => "
363 		    "invalid client state(%d)", client, client->state);
364 		return (-EPERM);
365 	}
366 
367 	/* attach this client to IBTF */
368 	rtn = ibt_attach(&ofs_client->ibt_client, client->dip, ofs_client,
369 	    &ofs_client->ibt_hdl);
370 	if (rtn != IBT_SUCCESS) {
371 		rw_exit(&ofs_client->lock);
372 		free_ibt_client(ofs_client);
373 		kmem_free(ofs_client, sizeof (ofs_client_t));
374 		(void) ibt_free_hca_list(guidp, nhcas);
375 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
376 		    "ib_register_client: client: 0x%p => "
377 		    "ibt_attach failed w/ 0x%x", client, rtn);
378 		return (-EINVAL);
379 	}
380 	client->clnt_hdl = (ofs_client_p_t)ofs_client;
381 	client->state = IB_CLNT_INITIALIZED;
382 
383 	/* link this client */
384 	mutex_enter(&clist_lock);
385 	llist_add_tail(&ofs_client->client_list, &client_list);
386 	mutex_exit(&clist_lock);
387 
388 	/* Open HCAs */
389 	ofs_client->hca_num = nhcas;
390 	for (i = 0; i < ofs_client->hca_num; i++) {
391 		/* allocate the ib_device structure */
392 		device = kmem_zalloc(sizeof (ib_device_t), KM_NOSLEEP);
393 		if (device == NULL) {
394 			rtn = -ENOMEM;
395 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
396 			    "ib_register_client: client: 0x%p => "
397 			    "no sufficient memory for ib_device", client);
398 			goto err;
399 		}
400 		device->node_guid = htonll(guidp[i]);
401 		device->node_type = RDMA_NODE_IB_CA;
402 		device->reg_state = IB_DEV_CLOSE;
403 		device->clnt_hdl = (ofs_client_p_t)ofs_client;
404 		llist_head_init(&device->list, device);
405 		llist_add_tail(&device->list, &ofs_client->device_list);
406 
407 		rtn = ibt_open_hca(ofs_client->ibt_hdl, guidp[i],
408 		    &device->hca_hdl);
409 		if (rtn == IBT_SUCCESS) {
410 			ibt_hca_attr_t hattr;
411 
412 			ofs_client->hca_open_num++;
413 			device->reg_state = IB_DEV_OPEN;
414 
415 			rtn = ibt_query_hca(device->hca_hdl, &hattr);
416 			if (rtn != IBT_SUCCESS) {
417 				rtn = -EIO;
418 				SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
419 				    "ib_register_client: client: 0x%p,"
420 				    "hca_hdl: 0x%p ==> "
421 				    "ibt_query_hca() failed w/ %d",
422 				    client, device->hca_hdl, rtn);
423 				goto err;
424 			}
425 
426 			(void) sprintf(device->name, "%x:%x:%x",
427 			    hattr.hca_vendor_id, hattr.hca_device_id,
428 			    hattr.hca_version_id);
429 			device->local_dma_lkey = hattr.hca_reserved_lkey;
430 			device->phys_port_cnt = hattr.hca_nports;
431 			ibt_set_hca_private(device->hca_hdl, device);
432 
433 			/* invoke client's callback */
434 			if (client->add) {
435 				client->add(device);
436 			}
437 		}
438 	}
439 	if (ofs_client->hca_open_num == 0) {
440 		rtn = -ENXIO;
441 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
442 		    "ib_register_client: client: 0x%p => "
443 		    "no available HCA", client);
444 		goto err;
445 	}
446 	rw_exit(&ofs_client->lock);
447 
448 	(void) ibt_free_hca_list(guidp, nhcas);
449 	return (0);
450 
451 err:
452 	/* first close all open HCAs */
453 	list_for_each(entry, &ofs_client->device_list) {
454 		device = entry->ptr;
455 		/*
456 		 * If it's open already, close it after the remove
457 		 * callback.
458 		 */
459 		if (device->reg_state == IB_DEV_OPEN) {
460 			ibt_status_t rtn;
461 			/* invoke client's callback */
462 			if (client->remove) {
463 				client->remove(device);
464 			}
465 			device->reg_state = IB_DEV_CLOSE;
466 			rtn = ibt_close_hca(device->hca_hdl);
467 			ASSERT(rtn == IBT_SUCCESS);
468 			ofs_client->hca_open_num--;
469 		}
470 	}
471 	ASSERT(ofs_client->hca_open_num == 0);
472 
473 	/* then free the devices */
474 	list_for_each_safe(entry, tmp, &ofs_client->device_list) {
475 		device = entry->ptr;
476 		/* de-link and free the device */
477 		llist_del(entry);
478 		kmem_free(device, sizeof (ib_device_t));
479 		ofs_client->hca_num--;
480 	}
481 	ASSERT(ofs_client->hca_num == 0);
482 
483 	/* delink this client */
484 	mutex_enter(&clist_lock);
485 	llist_del(&ofs_client->client_list);
486 	mutex_exit(&clist_lock);
487 
488 	/* detach the client */
489 	client->clnt_hdl = NULL;
490 	client->state = IB_CLNT_UNINITIALIZED;
491 	(void) ibt_detach(ofs_client->ibt_hdl);
492 	rw_exit(&ofs_client->lock);
493 
494 	/* free sol_ofs_client */
495 	free_ibt_client(ofs_client);
496 	kmem_free(ofs_client, sizeof (ofs_client_t));
497 
498 	(void) ibt_free_hca_list(guidp, nhcas);
499 	return (rtn);
500 }
501 
502 /*
503  * ib_unregister_client - Unregister an IB client
504  * @client:Client to unregister
505  *
506  * Upper level users use ib_unregister_client() to remove their client
507  * registration.  When ib_unregister_client() is called, the client
508  * will receive a remove callback for each IB device still registered.
509  */
510 void
511 ib_unregister_client(struct ib_client *client)
512 {
513 	ofs_client_t	*ofs_client;
514 	ib_device_t	*device;
515 	llist_head_t	*entry, *tmp;
516 
517 	ASSERT(client->state == IB_CLNT_INITIALIZED &&
518 	    client->clnt_hdl != NULL);
519 
520 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
521 	    "ib_unregister_client: client: 0x%p", client);
522 
523 	ofs_client = (ofs_client_t *)client->clnt_hdl;
524 	rw_enter(&ofs_client->lock, RW_WRITER);
525 
526 	/* first close all open HCAs */
527 	list_for_each(entry, &ofs_client->device_list) {
528 		device = entry->ptr;
529 		/*
530 		 * If it's open already, close it after the remove
531 		 * callback.
532 		 */
533 		if (device->reg_state == IB_DEV_OPEN) {
534 			ibt_status_t rtn;
535 			/* invoke client's callback */
536 			if (client->remove) {
537 				client->remove(device);
538 			}
539 			device->reg_state = IB_DEV_CLOSE;
540 			rtn = ibt_close_hca(device->hca_hdl);
541 			if (rtn != IBT_SUCCESS)
542 				SOL_OFS_DPRINTF_L3(
543 				    sol_kverbs_dbg_str,
544 				    "ib_unregister_client(%p) - "
545 				    "ibt_close_hca failed %d",
546 				    client, rtn);
547 
548 			ofs_client->hca_open_num--;
549 		}
550 	}
551 	ASSERT(ofs_client->hca_open_num == 0);
552 
553 	/* then free the devices */
554 	list_for_each_safe(entry, tmp, &ofs_client->device_list) {
555 		device = entry->ptr;
556 		/* de-link and free the device */
557 		llist_del(entry);
558 		kmem_free(device, sizeof (ib_device_t));
559 		ofs_client->hca_num--;
560 	}
561 	ASSERT(ofs_client->hca_num == 0);
562 
563 	/* delink this client */
564 	mutex_enter(&clist_lock);
565 	llist_del(&ofs_client->client_list);
566 	mutex_exit(&clist_lock);
567 
568 	/* detach the client */
569 	client->clnt_hdl = NULL;
570 	client->state = IB_CLNT_UNINITIALIZED;
571 	(void) ibt_detach(ofs_client->ibt_hdl);
572 	rw_exit(&ofs_client->lock);
573 
574 	/* free sol_ofs_client */
575 	free_ibt_client(ofs_client);
576 	kmem_free(ofs_client, sizeof (ofs_client_t));
577 }
578 
579 /*
580  * ofs_lock_enter() and ofs_lock_exit() are used to avoid the recursive
581  * rwlock while the client callbacks are invoked.
582  *
583  * Note that the writer lock is used only in the client callback case,
584  * so that the kverb functions wanting to acquire the reader lock can
585  * safely ignore the reader lock if the writer lock is already held.
586  * The writer lock shouldn't be used in no other plances.
587  */
588 static inline void
589 ofs_lock_enter(krwlock_t *lock)
590 {
591 	if (!RW_WRITE_HELD(lock)) {
592 		rw_enter(lock, RW_READER);
593 	}
594 }
595 
596 static inline void
597 ofs_lock_exit(krwlock_t *lock)
598 {
599 	if (!RW_WRITE_HELD(lock)) {
600 		rw_exit(lock);
601 	}
602 }
603 
604 /*
605  * ib_get_client_data - Get IB client context
606  * @device:Device to get context for
607  * @client:Client to get context for
608  *
609  * ib_get_client_data() returns client context set with
610  * ib_set_client_data() and returns NULL if it's not found.
611  */
612 void *ib_get_client_data(struct ib_device *device,
613     struct ib_client *client)
614 {
615 	ofs_client_t		*ofs_client;
616 	struct ib_device	*ib_device;
617 	boolean_t		found = B_FALSE;
618 	llist_head_t		*entry;
619 	void			*data;
620 
621 	ASSERT(device != 0 && client != 0);
622 
623 	ofs_client = (ofs_client_t *)client->clnt_hdl;
624 	if (ofs_client == 0) {
625 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
626 		    "ib_get_client_data: device: 0x%p, client: 0x%p => "
627 		    "no ofs_client", device, client);
628 		return (NULL);
629 	}
630 
631 	ofs_lock_enter(&ofs_client->lock);
632 	list_for_each(entry, &ofs_client->device_list) {
633 		ib_device = entry->ptr;
634 		if (ib_device->node_guid == device->node_guid) {
635 			found = B_TRUE;
636 			break;
637 		}
638 	}
639 	if (!found) {
640 		ofs_lock_exit(&ofs_client->lock);
641 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
642 		    "ib_get_client_data: device: 0x%p, client: 0x%p => "
643 		    "no ib_device found", device, client);
644 		return (NULL);
645 	}
646 	data = ib_device->data;
647 	ofs_lock_exit(&ofs_client->lock);
648 
649 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
650 	    "ib_get_client_data: device: 0x%p, client: 0x%p",
651 	    device, client);
652 
653 	return (data);
654 }
655 
656 /*
657  * ib_set_client_data - Set IB client context
658  * @device:Device to set context for
659  * @client:Client to set context for
660  * @data:Context to set
661  *
662  * ib_set_client_data() sets client context that can be retrieved with
663  * ib_get_client_data(). If the specified device is not found, the function
664  * returns w/o any operations.
665  */
666 void ib_set_client_data(struct ib_device *device, struct ib_client *client,
667     void *data)
668 {
669 	ofs_client_t		*ofs_client;
670 	struct ib_device	*ib_device;
671 	boolean_t		found = B_FALSE;
672 	llist_head_t		*entry;
673 
674 	ASSERT(device != 0 && client != 0);
675 
676 	ofs_client = (ofs_client_t *)client->clnt_hdl;
677 	if (ofs_client == 0) {
678 		cmn_err(CE_WARN, "No client context found for %s/%s\n",
679 		    device->name, client->name);
680 		return;
681 	}
682 
683 	ofs_lock_enter(&ofs_client->lock);
684 	list_for_each(entry, &ofs_client->device_list) {
685 		ib_device = entry->ptr;
686 		if (ib_device->node_guid == device->node_guid) {
687 			found = B_TRUE;
688 			break;
689 		}
690 	}
691 	if (!found) {
692 		cmn_err(CE_WARN, "No client context found for %s/%s\n",
693 		    device->name, client->name);
694 		ofs_lock_exit(&ofs_client->lock);
695 		return;
696 	}
697 	ib_device->data = data;
698 	ofs_lock_exit(&ofs_client->lock);
699 
700 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
701 	    "ib_set_client_data: device: 0x%p, client: 0x%p, "
702 	    "data: 0x%p", device, client, data);
703 }
704 
705 /*
706  * ib_query_device - Query IB device attributes
707  * @device:Device to query
708  * @device_attr:Device attributes
709  *
710  * ib_query_device() returns the attributes of a device through the
711  * @device_attr pointer.
712  */
713 int
714 ib_query_device(struct ib_device *device, struct ib_device_attr *attr)
715 {
716 	ofs_client_t	*ofs_client = (ofs_client_t *)device->clnt_hdl;
717 	ibt_hca_attr_t	hattr;
718 	int		rtn;
719 
720 	ofs_lock_enter(&ofs_client->lock);
721 	if (device->reg_state != IB_DEV_OPEN) {
722 		ofs_lock_exit(&ofs_client->lock);
723 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
724 		    "ib_query_device: device: 0x%p => "
725 		    "invalid device state (%d)", device, device->reg_state);
726 		return (-ENXIO);
727 	}
728 	if ((rtn = ibt_query_hca(device->hca_hdl, &hattr)) != IBT_SUCCESS) {
729 		ofs_lock_exit(&ofs_client->lock);
730 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
731 		    "ib_query_device: device: 0x%p => "
732 		    "ibt_query_hca failed w/ 0x%x", device, rtn);
733 		return (-EIO);
734 	}
735 	ofs_lock_exit(&ofs_client->lock);
736 
737 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
738 	    "ib_query_device: device: 0x%p, attr: 0x%p, rtn: 0x%p",
739 	    device, attr, rtn);
740 
741 	/* OF order is major.micro.minor, so keep it here */
742 	attr->fw_ver = (uint64_t)hattr.hca_fw_major_version << 32	|
743 	    hattr.hca_fw_micro_version << 16 & 0xFFFF0000		|
744 	    hattr.hca_fw_minor_version & 0xFFFF;
745 
746 	attr->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT		|
747 	    IB_DEVICE_PORT_ACTIVE_EVENT					|
748 	    IB_DEVICE_SYS_IMAGE_GUID					|
749 	    IB_DEVICE_RC_RNR_NAK_GEN;
750 	if (hattr.hca_flags & IBT_HCA_PKEY_CNTR) {
751 		attr->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
752 	}
753 	if (hattr.hca_flags & IBT_HCA_QKEY_CNTR) {
754 		attr->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
755 	}
756 	if (hattr.hca_flags & IBT_HCA_AUTO_PATH_MIG) {
757 		attr->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
758 	}
759 	if (hattr.hca_flags & IBT_HCA_AH_PORT_CHECK) {
760 		attr->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
761 	}
762 
763 	attr->vendor_id		= hattr.hca_vendor_id;
764 	attr->vendor_part_id	= hattr.hca_device_id;
765 	attr->hw_ver		= hattr.hca_version_id;
766 	attr->sys_image_guid	= htonll(hattr.hca_si_guid);
767 	attr->max_mr_size	= ~0ull;
768 	attr->page_size_cap	= IBTF2OF_PGSZ(hattr.hca_page_sz);
769 	attr->max_qp		= hattr.hca_max_qp;
770 	attr->max_qp_wr		= hattr.hca_max_qp_sz;
771 	attr->max_sge		= hattr.hca_max_sgl;
772 	attr->max_sge_rd	= hattr.hca_max_rd_sgl;
773 	attr->max_cq		= hattr.hca_max_cq;
774 	attr->max_cqe		= hattr.hca_max_cq_sz;
775 	attr->max_mr		= hattr.hca_max_memr;
776 	attr->max_pd		= hattr.hca_max_pd;
777 	attr->max_qp_rd_atom	= hattr.hca_max_rdma_in_qp;
778 	attr->max_qp_init_rd_atom	= hattr.hca_max_rdma_in_qp;
779 	attr->max_ee_rd_atom	= hattr.hca_max_rdma_in_ee;
780 	attr->max_ee_init_rd_atom	= hattr.hca_max_rdma_in_ee;
781 	attr->max_res_rd_atom	= hattr.hca_max_rsc;
782 	attr->max_srq		= hattr.hca_max_srqs;
783 	attr->max_srq_wr	= hattr.hca_max_srqs_sz -1;
784 	attr->max_srq_sge	= hattr.hca_max_srq_sgl;
785 	attr->local_ca_ack_delay	= hattr.hca_local_ack_delay;
786 	attr->atomic_cap = hattr.hca_flags & IBT_HCA_ATOMICS_GLOBAL ?
787 	    IB_ATOMIC_GLOB : (hattr.hca_flags & IBT_HCA_ATOMICS_HCA ?
788 	    IB_ATOMIC_HCA : IB_ATOMIC_NONE);
789 	attr->max_ee		= hattr.hca_max_eec;
790 	attr->max_rdd		= hattr.hca_max_rdd;
791 	attr->max_mw		= hattr.hca_max_mem_win;
792 	attr->max_pkeys		= hattr.hca_max_port_pkey_tbl_sz;
793 	attr->max_raw_ipv6_qp	= hattr.hca_max_ipv6_qp;
794 	attr->max_raw_ethy_qp	= hattr.hca_max_ether_qp;
795 	attr->max_mcast_grp	= hattr.hca_max_mcg;
796 	attr->max_mcast_qp_attach	= hattr.hca_max_qp_per_mcg;
797 	attr->max_total_mcast_qp_attach = hattr.hca_max_mcg_qps;
798 	attr->max_ah		= hattr.hca_max_ah;
799 	attr->max_fmr		= hattr.hca_max_fmrs;
800 	attr->max_map_per_fmr	= hattr.hca_opaque9; /* hca_max_map_per_fmr */
801 
802 	return (0);
803 }
804 
805 /* Protection domains */
806 struct ib_pd *
807 ib_alloc_pd(struct ib_device *device)
808 {
809 	ofs_client_t	*ofs_client = (ofs_client_t *)device->clnt_hdl;
810 	struct ib_pd	*pd;
811 	int		rtn;
812 
813 	if ((pd = kmem_alloc(sizeof (struct ib_pd), KM_NOSLEEP)) == NULL) {
814 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
815 		    "ib_alloc_pd: device: 0x%p => no sufficient memory",
816 		    device);
817 		return ((struct ib_pd *)-ENOMEM);
818 	}
819 
820 	ofs_lock_enter(&ofs_client->lock);
821 	if (device->reg_state != IB_DEV_OPEN) {
822 		ofs_lock_exit(&ofs_client->lock);
823 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
824 		    "ib_alloc_pd: device: 0x%p => invalid device state (%d)",
825 		    device, device->reg_state);
826 		return ((struct ib_pd *)-ENXIO);
827 	}
828 
829 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
830 	    "ib_alloc_pd: device: 0x%p", device);
831 
832 	rtn = ibt_alloc_pd(device->hca_hdl, IBT_PD_NO_FLAGS, &pd->ibt_pd);
833 	ofs_lock_exit(&ofs_client->lock);
834 
835 	if (rtn == IBT_SUCCESS) {
836 		pd->device = device;
837 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
838 		    "ib_alloc_pd: device: 0x%p, pd: 0x%p, ibt_pd: 0x%p, "
839 		    "rtn: 0x%x", device, pd, pd->ibt_pd, rtn);
840 		return (pd);
841 	}
842 	kmem_free(pd, sizeof (struct ib_pd));
843 
844 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
845 	    "ib_alloc_pd: device: 0x%p, pd: 0x%p, ibt_pd: 0x%p => "
846 	    "ibt_alloc_pd failed w/ 0x%x", device, pd, pd->ibt_pd, rtn);
847 
848 	switch (rtn) {
849 	case IBT_INSUFF_RESOURCE:
850 		return ((struct ib_pd *)-ENOMEM);
851 	case IBT_HCA_HDL_INVALID:
852 		return ((struct ib_pd *)-EFAULT);
853 	default:
854 		return ((struct ib_pd *)-EIO);
855 	}
856 }
857 
858 int
859 ib_dealloc_pd(struct ib_pd *pd)
860 {
861 	ofs_client_t *ofs_client = (ofs_client_t *)pd->device->clnt_hdl;
862 	int rtn;
863 
864 	ofs_lock_enter(&ofs_client->lock);
865 	if (pd->device->reg_state != IB_DEV_OPEN) {
866 		ofs_lock_exit(&ofs_client->lock);
867 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
868 		    "ib_dealloc_pd: pd: 0x%p => invalid device state (%d)",
869 		    pd, pd->device->reg_state);
870 		return (-ENXIO);
871 	}
872 
873 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
874 	    "ib_dealloc_pd: pd: 0x%p", pd);
875 
876 	rtn = ibt_free_pd(pd->device->hca_hdl, pd->ibt_pd);
877 	ofs_lock_exit(&ofs_client->lock);
878 
879 	if (rtn == IBT_SUCCESS) {
880 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
881 		    "ib_dealloc_pd: pd: 0x%p, device: 0x%p, ibt_pd: 0x%p, "
882 		    "rtn: 0x%x", pd, pd->device, pd->ibt_pd, rtn);
883 		kmem_free(pd, sizeof (struct ib_pd));
884 		return (0);
885 	}
886 
887 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
888 	    "ib_dealloc_pd: pd: 0x%p => ibt_free_pd failed w/ 0x%x",
889 	    pd, rtn);
890 
891 	switch (rtn) {
892 	case IBT_PD_IN_USE:
893 		return (-EBUSY);
894 	case IBT_HCA_HDL_INVALID:
895 		return (-EFAULT);
896 	default:
897 		return (-EIO);
898 	}
899 }
900 
901 /*
902  * ofs_cq_handler() is a delegated function to handle CQ events,
903  * which dispatches them to corresponding cq handlers registered
904  * with ib_create_cq().
905  */
906 static void
907 ofs_cq_handler(ibt_cq_hdl_t ibt_cq, void *arg)
908 {
909 	struct ib_cq *cq = (struct ib_cq *)ibt_get_cq_private(ibt_cq);
910 
911 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
912 	    "ofs_cq_handler: ibt_cq: 0x%p, ib_cq: 0x%p, comp_handler: 0x%p, "
913 	    "arg: 0x%p", ibt_cq, cq, cq->comp_handler, arg);
914 
915 	if (cq->comp_handler) {
916 		cq->comp_handler(cq, cq->cq_context);
917 	}
918 }
919 
920 /*
921  * ib_create_cq - Creates a CQ on the specified device.
922  * @device: The device on which to create the CQ.
923  * @comp_handler: A user-specified callback that is invoked when a
924  *   completion event occurs on the CQ.
925  * @event_handler: A user-specified callback that is invoked when an
926  *   asynchronous event not associated with a completion occurs on the CQ.
927  * @cq_context: Context associated with the CQ returned to the user via
928  *   the associated completion and event handlers.
929  * @cqe: The minimum size of the CQ.
930  * @comp_vector - Completion vector used to signal completion events.
931  *     Must be >= 0 and < context->num_comp_vectors.
932  *
933  * Users can examine the cq structure to determine the actual CQ size.
934  *
935  * Note that comp_vector is not supported currently.
936  */
937 struct ib_cq *
938 ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler,
939     void (*event_handler)(struct ib_event *, void *), void *cq_context,
940     int cqe, void *comp_vector)
941 {
942 	ofs_client_t	*ofs_client = (ofs_client_t *)device->clnt_hdl;
943 	ibt_cq_attr_t	cq_attr;
944 	uint32_t	real_size;
945 	struct ib_cq	*cq;
946 	int		rtn;
947 
948 	if ((cq = kmem_alloc(sizeof (struct ib_cq), KM_NOSLEEP)) == NULL) {
949 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
950 		    "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
951 		    "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
952 		    "comp_vector: %p => no sufficient memory", device,
953 		    comp_handler, event_handler, cq_context, cqe, comp_vector);
954 		return ((struct ib_cq *)-ENOMEM);
955 	}
956 
957 	ofs_lock_enter(&ofs_client->lock);
958 	if (device->reg_state != IB_DEV_OPEN) {
959 		ofs_lock_exit(&ofs_client->lock);
960 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
961 		    "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
962 		    "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
963 		    "comp_vector: %p => invalid device state (%d)", device,
964 		    comp_handler, event_handler, cq_context, cqe, comp_vector,
965 		    device->reg_state);
966 		return ((struct ib_cq *)-ENXIO);
967 	}
968 
969 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
970 	    "ib_create_cq: device: 0x%p, comp_handler: 0x%p, "
971 	    "event_handler: 0x%p, cq_context: 0x%p, cqe: 0x%x, "
972 	    "comp_vector: %d", device, comp_handler, event_handler,
973 	    cq_context, cqe, comp_vector);
974 
975 	cq_attr.cq_size = cqe;
976 	cq_attr.cq_sched = comp_vector;
977 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
978 	rtn = ibt_alloc_cq(device->hca_hdl, &cq_attr, &cq->ibt_cq, &real_size);
979 	ofs_lock_exit(&ofs_client->lock);
980 
981 	if (rtn == IBT_SUCCESS) {
982 		cq->device = device;
983 		cq->comp_handler = comp_handler;
984 		cq->event_handler = event_handler;
985 		cq->cq_context = cq_context;
986 		cq->cqe = real_size;
987 		ibt_set_cq_private(cq->ibt_cq, cq);
988 		ibt_set_cq_handler(cq->ibt_cq, ofs_cq_handler, cq_context);
989 		mutex_init(&cq->lock, NULL, MUTEX_DEFAULT, NULL);
990 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
991 		    "ib_create_cq: device: 0x%p, cqe: 0x%x, ibt_cq: 0x%p, "
992 		    "rtn: 0x%x", device, cqe, cq->ibt_cq, rtn);
993 		return (cq);
994 	}
995 	kmem_free(cq, sizeof (struct ib_cq));
996 
997 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
998 	    "ib_create_cq: device: 0x%p, cqe: 0x%x, ibt_cq: 0x%p => "
999 	    "ibt_alloc_cq failed w/ 0x%x", device, cqe, cq->ibt_cq, rtn);
1000 
1001 	switch (rtn) {
1002 	case IBT_HCA_CQ_EXCEEDED:
1003 	case IBT_INVALID_PARAM:
1004 	case IBT_HCA_HDL_INVALID:
1005 		return ((struct ib_cq *)-EINVAL);
1006 	case IBT_INSUFF_RESOURCE:
1007 		return ((struct ib_cq *)-ENOMEM);
1008 	default:
1009 		return ((struct ib_cq *)-EIO);
1010 	}
1011 }
1012 
1013 int
1014 ib_destroy_cq(struct ib_cq *cq)
1015 {
1016 	ofs_client_t	*ofs_client = (ofs_client_t *)cq->device->clnt_hdl;
1017 	int		rtn;
1018 
1019 	ofs_lock_enter(&ofs_client->lock);
1020 	if (cq->device->reg_state != IB_DEV_OPEN) {
1021 		ofs_lock_exit(&ofs_client->lock);
1022 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1023 		    "ib_destroy_cq: cq: 0x%p => invalid device state (%d)",
1024 		    cq, cq->device->reg_state);
1025 		return (-ENXIO);
1026 	}
1027 
1028 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
1029 	    "ib_destroy_cq: cq: 0x%p", cq);
1030 
1031 	/*
1032 	 * if IBTL_ASYNC_PENDING is set, ibt_qp is not freed
1033 	 * at this moment, but yet alive for a while. Then
1034 	 * there is a possibility that this qp is used even after
1035 	 * ib_destroy_cq() is called. To distinguish this case from
1036 	 * others, clear ibt_qp here.
1037 	 */
1038 	ibt_set_cq_private(cq->ibt_cq, NULL);
1039 
1040 	rtn = ibt_free_cq(cq->ibt_cq);
1041 	if (rtn == IBT_SUCCESS) {
1042 		ofs_lock_exit(&ofs_client->lock);
1043 		kmem_free(cq, sizeof (struct ib_cq));
1044 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
1045 		    "ib_destroy_cq: cq: 0x%p, rtn: 0x%x", cq, rtn);
1046 		return (0);
1047 	}
1048 	ibt_set_cq_private(cq->ibt_cq, cq);
1049 	ofs_lock_exit(&ofs_client->lock);
1050 
1051 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1052 	    "ib_destroy_cq: cq: 0x%p => ibt_free_cq failed w/ 0x%x", cq, rtn);
1053 
1054 	switch (rtn) {
1055 	case IBT_CQ_BUSY:
1056 		return (-EBUSY);
1057 	case IBT_HCA_HDL_INVALID:
1058 	case IBT_CQ_HDL_INVALID:
1059 		return (-EINVAL);
1060 	default:
1061 		return (-EIO);
1062 	}
1063 }
1064 
1065 struct ib_qp *
1066 ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr)
1067 {
1068 	ofs_client_t		*ofs_client = pd->device->clnt_hdl;
1069 	ibt_qp_alloc_attr_t	attrs;
1070 	ibt_chan_sizes_t	sizes;
1071 	ib_qpn_t		qpn;
1072 	ibt_qp_hdl_t		ibt_qp;
1073 	struct ib_qp		*qp;
1074 	int			rtn;
1075 
1076 	/* sanity check */
1077 	if (!(qp_init_attr->send_cq && qp_init_attr->recv_cq)) {
1078 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1079 		    "ib_create_qp: pd: 0x%p => invalid cqs "
1080 		    "(send_cq=0x%p, recv_cq=0x%p)", pd,
1081 		    qp_init_attr->send_cq, qp_init_attr->recv_cq);
1082 		return ((struct ib_qp *)-EINVAL);
1083 	}
1084 
1085 	/* UC, Raw IPv6 and Raw Ethernet are not supported */
1086 	if (qp_init_attr->qp_type == IB_QPT_UC ||
1087 	    qp_init_attr->qp_type == IB_QPT_RAW_IPV6 ||
1088 	    qp_init_attr->qp_type == IB_QPT_RAW_ETY) {
1089 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1090 		    "ib_create_qp: pd: 0x%p => invalid qp_type",
1091 		    pd, qp_init_attr->qp_type);
1092 		return ((struct ib_qp *)-EINVAL);
1093 	}
1094 
1095 	if ((qp = kmem_alloc(sizeof (struct ib_qp), KM_NOSLEEP)) == NULL) {
1096 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1097 		    "ib_create_qp: pd: 0x%p, init_attr: 0x%p => "
1098 		    "no sufficient memory", pd, qp_init_attr);
1099 		return ((struct ib_qp *)-ENOMEM);
1100 	}
1101 
1102 	ofs_lock_enter(&ofs_client->lock);
1103 	if (pd->device->reg_state != IB_DEV_OPEN) {
1104 		ofs_lock_exit(&ofs_client->lock);
1105 		kmem_free(qp, sizeof (struct ib_qp));
1106 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1107 		    "ib_create_qp: pd: 0x%p, init_attr: 0x%p => "
1108 		    "invalid device state (%d)", pd, qp_init_attr,
1109 		    pd->device->reg_state);
1110 		return ((struct ib_qp *)-ENXIO);
1111 	}
1112 
1113 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
1114 	    "ib_create_qp: pd: 0x%p, event_handler: 0x%p, qp_context: 0x%p, "
1115 	    "send_cq: 0x%p, recv_cq: 0x%p, srq: 0x%p, max_send_wr: 0x%x, "
1116 	    "max_recv_wr: 0x%x, max_send_sge: 0x%x, max_recv_sge: 0x%x, "
1117 	    "max_inline_data: 0x%x, sq_sig_type: %d, qp_type: %d, "
1118 	    "port_num: %d",
1119 	    pd, qp_init_attr->event_handler, qp_init_attr->qp_context,
1120 	    qp_init_attr->send_cq, qp_init_attr->recv_cq, qp_init_attr->srq,
1121 	    qp_init_attr->cap.max_send_wr, qp_init_attr->cap.max_recv_wr,
1122 	    qp_init_attr->cap.max_send_sge, qp_init_attr->cap.max_recv_sge,
1123 	    qp_init_attr->cap.max_inline_data, qp_init_attr->sq_sig_type,
1124 	    qp_init_attr->qp_type, qp_init_attr->port_num);
1125 
1126 	attrs.qp_alloc_flags = IBT_QP_NO_FLAGS;
1127 	if (qp_init_attr->srq) {
1128 		attrs.qp_alloc_flags |= IBT_QP_USES_SRQ;
1129 	}
1130 
1131 	attrs.qp_flags = IBT_ALL_SIGNALED | IBT_FAST_REG_RES_LKEY;
1132 	if (qp_init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) {
1133 		attrs.qp_flags |= IBT_WR_SIGNALED;
1134 	}
1135 
1136 	attrs.qp_scq_hdl = qp_init_attr->send_cq->ibt_cq;
1137 	attrs.qp_rcq_hdl = qp_init_attr->recv_cq->ibt_cq;
1138 	attrs.qp_pd_hdl = pd->ibt_pd;
1139 
1140 	attrs.qp_sizes.cs_sq = qp_init_attr->cap.max_send_wr;
1141 	attrs.qp_sizes.cs_rq = qp_init_attr->cap.max_recv_wr;
1142 	attrs.qp_sizes.cs_sq_sgl = qp_init_attr->cap.max_send_sge;
1143 	attrs.qp_sizes.cs_rq_sgl = qp_init_attr->cap.max_recv_sge;
1144 	attrs.qp_sizes.cs_inline = qp_init_attr->cap.max_inline_data;
1145 
1146 	switch (qp_init_attr->qp_type) {
1147 	case IB_QPT_RC:
1148 		rtn = ibt_alloc_qp(pd->device->hca_hdl, IBT_RC_RQP, &attrs,
1149 		    &sizes, &qpn, &ibt_qp);
1150 		break;
1151 	case IB_QPT_UD:
1152 		rtn = ibt_alloc_qp(pd->device->hca_hdl, IBT_UD_RQP, &attrs,
1153 		    &sizes, &qpn, &ibt_qp);
1154 		break;
1155 	case IB_QPT_SMI:
1156 		rtn = ibt_alloc_special_qp(pd->device->hca_hdl,
1157 		    qp_init_attr->port_num, IBT_SMI_SQP, &attrs, &sizes,
1158 		    &ibt_qp);
1159 		break;
1160 	case IB_QPT_GSI:
1161 		rtn = ibt_alloc_special_qp(pd->device->hca_hdl,
1162 		    qp_init_attr->port_num, IBT_GSI_SQP, &attrs, &sizes,
1163 		    &ibt_qp);
1164 		break;
1165 	default:
1166 		/* this should never happens */
1167 		ofs_lock_exit(&ofs_client->lock);
1168 		kmem_free(qp, sizeof (struct ib_qp));
1169 		return ((struct ib_qp *)-EINVAL);
1170 	}
1171 	ofs_lock_exit(&ofs_client->lock);
1172 
1173 	if (rtn == IBT_SUCCESS) {
1174 		/* fill in ib_qp_cap w/ the real values */
1175 		qp_init_attr->cap.max_send_wr = sizes.cs_sq;
1176 		qp_init_attr->cap.max_recv_wr = sizes.cs_rq;
1177 		qp_init_attr->cap.max_send_sge = sizes.cs_sq_sgl;
1178 		qp_init_attr->cap.max_recv_sge = sizes.cs_rq_sgl;
1179 		/* max_inline_data is not supported */
1180 		qp_init_attr->cap.max_inline_data = 0;
1181 		/* fill in ib_qp */
1182 		qp->device = pd->device;
1183 		qp->pd = pd;
1184 		qp->send_cq = qp_init_attr->send_cq;
1185 		qp->recv_cq = qp_init_attr->recv_cq;
1186 		qp->srq = qp_init_attr->srq;
1187 		qp->event_handler = qp_init_attr->event_handler;
1188 		qp->qp_context = qp_init_attr->qp_context;
1189 		qp->qp_num = qp_init_attr->qp_type == IB_QPT_SMI ? 0 :
1190 		    qp_init_attr->qp_type == IB_QPT_GSI ? 1 : qpn;
1191 		qp->qp_type = qp_init_attr->qp_type;
1192 		qp->ibt_qp = ibt_qp;
1193 		ibt_set_qp_private(qp->ibt_qp, qp);
1194 		mutex_init(&qp->lock, NULL, MUTEX_DEFAULT, NULL);
1195 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
1196 		    "ib_create_qp: device: 0x%p, pd: 0x%x, init_attr: 0x%p, "
1197 		    "rtn: 0x%x", pd->device, pd, qp_init_attr, rtn);
1198 		return (qp);
1199 	}
1200 	kmem_free(qp, sizeof (struct ib_qp));
1201 
1202 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1203 	    "ib_create_qp: device: 0x%p, pd: 0x%x, init_attr: 0x%p => "
1204 	    "ibt_alloc_(special)_qp failed w/ rtn: 0x%x", pd->device, pd,
1205 	    qp_init_attr, rtn);
1206 
1207 	switch (rtn) {
1208 	case IBT_NOT_SUPPORTED:
1209 	case IBT_QP_SRV_TYPE_INVALID:
1210 	case IBT_CQ_HDL_INVALID:
1211 	case IBT_HCA_HDL_INVALID:
1212 	case IBT_INVALID_PARAM:
1213 	case IBT_SRQ_HDL_INVALID:
1214 	case IBT_PD_HDL_INVALID:
1215 	case IBT_HCA_SGL_EXCEEDED:
1216 	case IBT_HCA_WR_EXCEEDED:
1217 		return ((struct ib_qp *)-EINVAL);
1218 	case IBT_INSUFF_RESOURCE:
1219 		return ((struct ib_qp *)-ENOMEM);
1220 	default:
1221 		return ((struct ib_qp *)-EIO);
1222 	}
1223 }
1224 
1225 int
1226 ib_destroy_qp(struct ib_qp *qp)
1227 {
1228 	ofs_client_t	*ofs_client = (ofs_client_t *)qp->device->clnt_hdl;
1229 	int		rtn;
1230 
1231 	ofs_lock_enter(&ofs_client->lock);
1232 	if (qp->device->reg_state != IB_DEV_OPEN) {
1233 		ofs_lock_exit(&ofs_client->lock);
1234 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1235 		    "ib_destroy_qp: qp: 0x%p => invalid device state (%d)",
1236 		    qp, qp->device->reg_state);
1237 		return (-ENXIO);
1238 	}
1239 
1240 	/*
1241 	 * if IBTL_ASYNC_PENDING is set, ibt_qp is not freed
1242 	 * at this moment, but yet alive for a while. Then
1243 	 * there is a possibility that this qp is used even after
1244 	 * ib_destroy_qp() is called. To distinguish this case from
1245 	 * others, clear ibt_qp here.
1246 	 */
1247 	ibt_set_qp_private(qp->ibt_qp, NULL);
1248 
1249 	rtn = ibt_free_qp(qp->ibt_qp);
1250 	if (rtn == IBT_SUCCESS) {
1251 		ofs_lock_exit(&ofs_client->lock);
1252 		kmem_free(qp, sizeof (struct ib_qp));
1253 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
1254 		    "ib_destroy_qp: qp: 0x%p, rtn: 0x%x", qp, rtn);
1255 		return (0);
1256 	}
1257 	ibt_set_qp_private(qp->ibt_qp, qp);
1258 	ofs_lock_exit(&ofs_client->lock);
1259 
1260 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1261 	    "ib_destroy_qp: qp: 0x%p => ibt_free_qp failed w/ 0x%x", qp, rtn);
1262 
1263 	switch (rtn) {
1264 	case IBT_CHAN_STATE_INVALID:
1265 	case IBT_HCA_HDL_INVALID:
1266 	case IBT_QP_HDL_INVALID:
1267 		return (-EINVAL);
1268 	default:
1269 		return (-EIO);
1270 	}
1271 }
1272 
1273 /*
1274  * ib_req_notify_cq - Request completion notification on a CQ.
1275  * @cq: The CQ to generate an event for.
1276  * @flags:
1277  *   Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
1278  *   to request an event on the next solicited event or next work
1279  *   completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
1280  *   may also be |ed in to request a hint about missed events, as
1281  *   described below.
1282  *
1283  * Return Value:
1284  *    < 0 means an error occurred while requesting notification
1285  *   == 0 means notification was requested successfully, and if
1286  *        IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
1287  *        were missed and it is safe to wait for another event.  In
1288  *        this case is it guaranteed that any work completions added
1289  *        to the CQ since the last CQ poll will trigger a completion
1290  *        notification event.
1291  *    > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
1292  *        in.  It means that the consumer must poll the CQ again to
1293  *        make sure it is empty to avoid missing an event because of a
1294  *        race between requesting notification and an entry being
1295  *        added to the CQ.  This return value means it is possible
1296  *        (but not guaranteed) that a work completion has been added
1297  *        to the CQ since the last poll without triggering a
1298  *        completion notification event.
1299  *
1300  * Note that IB_CQ_REPORT_MISSED_EVENTS is currently not supported.
1301  */
1302 int
1303 ib_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
1304 {
1305 	ibt_cq_notify_flags_t	notify_type;
1306 	int			rtn;
1307 	ofs_client_t		*ofs_client = cq->device->clnt_hdl;
1308 
1309 	ofs_lock_enter(&ofs_client->lock);
1310 	if (cq->device->reg_state != IB_DEV_OPEN) {
1311 		ofs_lock_exit(&ofs_client->lock);
1312 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1313 		    "ib_req_notify_cq: cq: 0x%p, flag: 0x%x", cq, flags);
1314 		return (-ENXIO);
1315 	}
1316 
1317 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
1318 	    "ib_req_notify_cq: cq: 0x%p, flag: 0x%x", cq, flags);
1319 
1320 	switch (flags & IB_CQ_SOLICITED_MASK) {
1321 	case IB_CQ_SOLICITED:
1322 		notify_type = IBT_NEXT_SOLICITED;
1323 		break;
1324 	case IB_CQ_NEXT_COMP:
1325 		notify_type = IBT_NEXT_COMPLETION;
1326 		break;
1327 	default:
1328 		/* Currently only two flags are supported */
1329 		ofs_lock_exit(&ofs_client->lock);
1330 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1331 		    "ib_req_notify_cq: cq: 0x%p, flag: 0x%x => invalid flag",
1332 		    cq, flags);
1333 		return (-EINVAL);
1334 	}
1335 
1336 	rtn = ibt_enable_cq_notify(cq->ibt_cq, notify_type);
1337 	ofs_lock_exit(&ofs_client->lock);
1338 
1339 	if (rtn == IBT_SUCCESS) {
1340 		SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
1341 		    "ib_req_notify_cq: cq: 0x%p, flag: 0x%x rtn: 0x%x",
1342 		    cq, flags, rtn);
1343 		return (0);
1344 	}
1345 
1346 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1347 	    "ib_req_notify_cq: cq: 0x%p, flag: 0x%x => ibt_enable_cq_notify "
1348 	    "failed w/ 0x%x", cq, flags, rtn);
1349 
1350 	switch (rtn) {
1351 	case IBT_HCA_HDL_INVALID:
1352 	case IBT_CQ_HDL_INVALID:
1353 	case IBT_CQ_NOTIFY_TYPE_INVALID:
1354 		return (-EINVAL);
1355 	default:
1356 		return (-EIO);
1357 	}
1358 }
1359 
1360 static const struct {
1361 	int			valid;
1362 	enum ib_qp_attr_mask	req_param[IB_QPT_RAW_ETY + 1];
1363 	enum ib_qp_attr_mask	opt_param[IB_QPT_RAW_ETY + 1];
1364 } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
1365 
1366 	[IB_QPS_RESET] = {
1367 		[IB_QPS_RESET] = { .valid = 1 },
1368 		[IB_QPS_INIT]  = {
1369 			.valid = 1,
1370 			.req_param = {
1371 				[IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
1372 				    IB_QP_QKEY),
1373 				[IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
1374 				    IB_QP_ACCESS_FLAGS),
1375 				[IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
1376 				    IB_QP_ACCESS_FLAGS),
1377 				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1378 				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1379 			}
1380 		},
1381 	},
1382 	[IB_QPS_INIT]  = {
1383 		[IB_QPS_RESET] = { .valid = 1 },
1384 		[IB_QPS_ERR] =   { .valid = 1 },
1385 		[IB_QPS_INIT]  = {
1386 			.valid = 1,
1387 			.opt_param = {
1388 				[IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
1389 				    IB_QP_QKEY),
1390 				[IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
1391 				    IB_QP_ACCESS_FLAGS),
1392 				[IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT |
1393 				    IB_QP_ACCESS_FLAGS),
1394 				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1395 				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1396 			}
1397 		},
1398 		[IB_QPS_RTR]   = {
1399 			.valid = 1,
1400 			.req_param = {
1401 				[IB_QPT_UC] = (IB_QP_AV | IB_QP_PATH_MTU |
1402 				    IB_QP_DEST_QPN | IB_QP_RQ_PSN),
1403 				[IB_QPT_RC] = (IB_QP_AV | IB_QP_PATH_MTU |
1404 				    IB_QP_DEST_QPN | IB_QP_RQ_PSN |
1405 				    IB_QP_MAX_DEST_RD_ATOMIC |
1406 				    IB_QP_MIN_RNR_TIMER),
1407 			},
1408 			.opt_param = {
1409 				[IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1410 				[IB_QPT_UC] = (IB_QP_ALT_PATH |
1411 				    IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX),
1412 				[IB_QPT_RC] = (IB_QP_ALT_PATH |
1413 				    IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX),
1414 				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1415 				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1416 			}
1417 		}
1418 	},
1419 	[IB_QPS_RTR]   = {
1420 		[IB_QPS_RESET] = { .valid = 1 },
1421 		[IB_QPS_ERR] =   { .valid = 1 },
1422 		[IB_QPS_RTS]   = {
1423 			.valid = 1,
1424 			.req_param = {
1425 				[IB_QPT_UD] = IB_QP_SQ_PSN,
1426 				[IB_QPT_UC] = IB_QP_SQ_PSN,
1427 				[IB_QPT_RC] = (IB_QP_TIMEOUT |
1428 				    IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
1429 				    IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC),
1430 				[IB_QPT_SMI] = IB_QP_SQ_PSN,
1431 				[IB_QPT_GSI] = IB_QP_SQ_PSN,
1432 			},
1433 			.opt_param = {
1434 				[IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1435 				[IB_QPT_UC] = (IB_QP_CUR_STATE |
1436 				    IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS |
1437 				    IB_QP_PATH_MIG_STATE),
1438 				[IB_QPT_RC] = (IB_QP_CUR_STATE |
1439 				    IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS	|
1440 				    IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE),
1441 				[IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1442 				[IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1443 			}
1444 		}
1445 	},
1446 	[IB_QPS_RTS] = {
1447 		[IB_QPS_RESET] = { .valid = 1 },
1448 		[IB_QPS_ERR] =  { .valid = 1 },
1449 		[IB_QPS_RTS] = {
1450 			.valid = 1,
1451 			.opt_param = {
1452 				[IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1453 				[IB_QPT_UC] = (IB_QP_CUR_STATE	|
1454 				    IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH |
1455 				    IB_QP_PATH_MIG_STATE),
1456 				[IB_QPT_RC] = (IB_QP_CUR_STATE	|
1457 				    IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH |
1458 				    IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER),
1459 				[IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1460 				[IB_QPT_GSI] = (IB_QP_CUR_STATE	| IB_QP_QKEY),
1461 			}
1462 		},
1463 		[IB_QPS_SQD] = {
1464 			.valid = 1,
1465 			.opt_param = {
1466 				[IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
1467 				[IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
1468 				[IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
1469 				[IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
1470 				[IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
1471 			}
1472 		},
1473 	},
1474 	[IB_QPS_SQD] = {
1475 		[IB_QPS_RESET] = { .valid = 1 },
1476 		[IB_QPS_ERR] = { .valid = 1 },
1477 		[IB_QPS_RTS] = {
1478 			.valid = 1,
1479 			.opt_param = {
1480 				[IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1481 				[IB_QPT_UC] = (IB_QP_CUR_STATE |
1482 				    IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS |
1483 				    IB_QP_PATH_MIG_STATE),
1484 				[IB_QPT_RC] = (IB_QP_CUR_STATE |
1485 				    IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS |
1486 				    IB_QP_MIN_RNR_TIMER	| IB_QP_PATH_MIG_STATE),
1487 				[IB_QPT_SMI] = (IB_QP_CUR_STATE	| IB_QP_QKEY),
1488 				[IB_QPT_GSI] = (IB_QP_CUR_STATE	| IB_QP_QKEY),
1489 			}
1490 		},
1491 		[IB_QPS_SQD] = {
1492 			.valid = 1,
1493 			.opt_param = {
1494 				[IB_QPT_UD] = (IB_QP_PKEY_INDEX	| IB_QP_QKEY),
1495 				[IB_QPT_UC] = (IB_QP_AV | IB_QP_ALT_PATH |
1496 				    IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX |
1497 				    IB_QP_PATH_MIG_STATE),
1498 				[IB_QPT_RC] = (IB_QP_PORT | IB_QP_AV |
1499 				    IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
1500 				    IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC |
1501 				    IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH |
1502 				    IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX |
1503 				    IB_QP_MIN_RNR_TIMER	| IB_QP_PATH_MIG_STATE),
1504 				[IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1505 				[IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY),
1506 			}
1507 		}
1508 	},
1509 	[IB_QPS_SQE]  = {
1510 		[IB_QPS_RESET] = { .valid = 1 },
1511 		[IB_QPS_ERR] = { .valid = 1 },
1512 		[IB_QPS_RTS] = {
1513 			.valid = 1,
1514 			.opt_param = {
1515 				[IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1516 				[IB_QPT_UC] = (IB_QP_CUR_STATE |
1517 				    IB_QP_ACCESS_FLAGS),
1518 				[IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY),
1519 				[IB_QPT_GSI] = (IB_QP_CUR_STATE	| IB_QP_QKEY),
1520 			}
1521 		}
1522 	},
1523 	[IB_QPS_ERR] = {
1524 		[IB_QPS_RESET] = { .valid = 1 },
1525 		[IB_QPS_ERR] =  { .valid = 1 }
1526 	}
1527 };
1528 
1529 static inline int
1530 ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
1531     enum ib_qp_type type, enum ib_qp_attr_mask mask)
1532 {
1533 	enum ib_qp_attr_mask req_param, opt_param;
1534 
1535 	if (cur_state  < 0 || cur_state  > IB_QPS_ERR ||
1536 	    next_state < 0 || next_state > IB_QPS_ERR) {
1537 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1538 		    "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
1539 		    "qp_type: %d, attr_mask: 0x%x => invalid state(1)",
1540 		    cur_state, next_state, type, mask);
1541 		return (0);
1542 	}
1543 
1544 	if (mask & IB_QP_CUR_STATE &&
1545 	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
1546 	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) {
1547 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1548 		    "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
1549 		    "qp_type: %d, attr_mask: 0x%x => invalid state(2)",
1550 		    cur_state, next_state, type, mask);
1551 		return (0);
1552 	}
1553 
1554 	if (!qp_state_table[cur_state][next_state].valid) {
1555 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1556 		    "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
1557 		    "qp_type: %d, attr_mask: 0x%x => state is not valid",
1558 		    cur_state, next_state, type, mask);
1559 		return (0);
1560 	}
1561 
1562 	req_param = qp_state_table[cur_state][next_state].req_param[type];
1563 	opt_param = qp_state_table[cur_state][next_state].opt_param[type];
1564 
1565 	if ((mask & req_param) != req_param) {
1566 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1567 		    "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
1568 		    "qp_type: %d, attr_mask: 0x%x => "
1569 		    "required param doesn't match. req_param = 0x%x",
1570 		    cur_state, next_state, type, mask, req_param);
1571 		return (0);
1572 	}
1573 
1574 	if (mask & ~(req_param | opt_param | IB_QP_STATE)) {
1575 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1576 		    "ib_modify_qp_is_ok: cur_state: %d, next_state: %d, "
1577 		    "qp_type: %d, attr_mask: 0x%x => "
1578 		    "unsupported options. req_param = 0x%x, opt_param = 0x%x",
1579 		    cur_state, next_state, type, mask, req_param, opt_param);
1580 		return (0);
1581 	}
1582 
1583 	return (1);
1584 }
1585 
1586 static inline enum ib_qp_state
1587 qp_current_state(ibt_qp_query_attr_t *qp_attr)
1588 {
1589 	ASSERT(qp_attr->qp_info.qp_state != IBT_STATE_SQDRAIN);
1590 	return (enum ib_qp_state)(qp_attr->qp_info.qp_state);
1591 }
1592 
1593 static inline ibt_tran_srv_t
1594 of2ibtf_qp_type(enum ib_qp_type type)
1595 {
1596 	switch (type) {
1597 	case IB_QPT_SMI:
1598 	case IB_QPT_GSI:
1599 	case IB_QPT_UD:
1600 		return (IBT_UD_SRV);
1601 	case IB_QPT_RC:
1602 		return (IBT_RC_SRV);
1603 	case IB_QPT_UC:
1604 		return (IBT_UC_SRV);
1605 	case IB_QPT_RAW_IPV6:
1606 		return (IBT_RAWIP_SRV);
1607 	case IB_QPT_RAW_ETY:
1608 	default:
1609 		ASSERT(type == IB_QPT_RAW_ETY);
1610 		return (IBT_RAWETHER_SRV);
1611 	}
1612 }
1613 
1614 static inline void
1615 set_av(struct ib_ah_attr *attr, ibt_cep_path_t *pathp)
1616 {
1617 	ibt_adds_vect_t		*av = &pathp->cep_adds_vect;
1618 
1619 	pathp->cep_hca_port_num = attr->port_num;
1620 	av->av_srate = OF2IBTF_SRATE(attr->static_rate);
1621 	av->av_srvl = attr->sl & 0xF;
1622 	av->av_send_grh = attr->ah_flags & IB_AH_GRH ? 1 : 0;
1623 
1624 	if (av->av_send_grh) {
1625 		av->av_dgid.gid_prefix =
1626 		    attr->grh.dgid.global.subnet_prefix;
1627 		av->av_dgid.gid_guid =
1628 		    attr->grh.dgid.global.interface_id;
1629 		av->av_flow = attr->grh.flow_label & 0xFFFFF;
1630 		av->av_tclass = attr->grh.traffic_class;
1631 		av->av_hop = attr->grh.hop_limit;
1632 		av->av_sgid_ix = attr->grh.sgid_index;
1633 	}
1634 	av->av_dlid = attr->dlid;
1635 	av->av_src_path = attr->src_path_bits;
1636 }
1637 
1638 int
1639 ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask)
1640 {
1641 	enum ib_qp_state	cur_state, new_state;
1642 	ibt_hca_attr_t		hattr;
1643 	ibt_qp_query_attr_t	qp_attr;
1644 	ibt_qp_info_t		modify_attr;
1645 	ibt_cep_modify_flags_t	flags;
1646 	int			rtn;
1647 	ofs_client_t		*ofs_client = qp->device->clnt_hdl;
1648 
1649 	ofs_lock_enter(&ofs_client->lock);
1650 	if (qp->device->reg_state != IB_DEV_OPEN) {
1651 		ofs_lock_exit(&ofs_client->lock);
1652 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1653 		    "ib_modify_qp: qp: 0x%p => invalid device state (%d)",
1654 		    qp, qp->device->reg_state);
1655 		return (-ENXIO);
1656 	}
1657 
1658 	rtn = ibt_query_hca(qp->device->hca_hdl, &hattr);
1659 	if (rtn != IBT_SUCCESS) {
1660 		ofs_lock_exit(&ofs_client->lock);
1661 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1662 		    "ib_modify_qp: qp: 0x%p, hca_hdl: 0x%p => "
1663 		    "ibt_query_hca() failed w/ %d",
1664 		    qp, qp->device->hca_hdl, rtn);
1665 		return (-EIO);
1666 	}
1667 
1668 	/* only one thread per qp is allowed during the qp modification */
1669 	mutex_enter(&qp->lock);
1670 
1671 	/* Get the current QP attributes first */
1672 	bzero(&qp_attr, sizeof (ibt_qp_query_attr_t));
1673 	if ((rtn = ibt_query_qp(qp->ibt_qp, &qp_attr)) != IBT_SUCCESS) {
1674 		mutex_exit(&qp->lock);
1675 		ofs_lock_exit(&ofs_client->lock);
1676 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1677 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
1678 		    "ibt_query_qp failed w/ 0x%x", qp, attr, attr_mask, rtn);
1679 		return (-EIO);
1680 	}
1681 
1682 	/* Get the current and new state for this QP */
1683 	cur_state = attr_mask & IB_QP_CUR_STATE ?  attr->cur_qp_state :
1684 	    qp_current_state(&qp_attr);
1685 	new_state = attr_mask & IB_QP_STATE ? attr->qp_state :
1686 	    cur_state;
1687 
1688 	/* Sanity check of the current/new states */
1689 	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
1690 		/* Linux OF returns 0 in this case */
1691 		mutex_exit(&qp->lock);
1692 		ofs_lock_exit(&ofs_client->lock);
1693 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1694 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
1695 		    "invalid state (both of current/new states are RESET)",
1696 		    qp, attr, attr_mask);
1697 		return (0);
1698 	}
1699 
1700 	/*
1701 	 * Check if this modification request is supported with the new
1702 	 * and/or current state.
1703 	 */
1704 	if (!ib_modify_qp_is_ok(cur_state, new_state, qp->qp_type, attr_mask)) {
1705 		mutex_exit(&qp->lock);
1706 		ofs_lock_exit(&ofs_client->lock);
1707 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1708 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
1709 		    "invalid arguments",
1710 		    qp, attr, attr_mask);
1711 		return (-EINVAL);
1712 	}
1713 
1714 	/* Sanity checks */
1715 	if (attr_mask & IB_QP_PORT && (attr->port_num == 0 ||
1716 	    attr->port_num > hattr.hca_nports)) {
1717 		mutex_exit(&qp->lock);
1718 		ofs_lock_exit(&ofs_client->lock);
1719 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1720 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
1721 		    "invalid attr->port_num(%d), max_nports(%d)",
1722 		    qp, attr, attr_mask, attr->port_num, hattr.hca_nports);
1723 		return (-EINVAL);
1724 	}
1725 
1726 	if (attr_mask & IB_QP_PKEY_INDEX &&
1727 	    attr->pkey_index >= hattr.hca_max_port_pkey_tbl_sz) {
1728 		mutex_exit(&qp->lock);
1729 		ofs_lock_exit(&ofs_client->lock);
1730 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1731 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
1732 		    "invalid attr->pkey_index(%d), max_pkey_index(%d)",
1733 		    qp, attr, attr_mask, attr->pkey_index,
1734 		    hattr.hca_max_port_pkey_tbl_sz);
1735 		return (-EINVAL);
1736 	}
1737 
1738 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
1739 	    attr->max_rd_atomic > hattr.hca_max_rdma_out_qp) {
1740 		mutex_exit(&qp->lock);
1741 		ofs_lock_exit(&ofs_client->lock);
1742 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1743 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
1744 		    "invalid attr->max_rd_atomic(0x%x), max_rdma_out_qp(0x%x)",
1745 		    qp, attr, attr_mask, attr->max_rd_atomic,
1746 		    hattr.hca_max_rdma_out_qp);
1747 		return (-EINVAL);
1748 	}
1749 
1750 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
1751 	    attr->max_dest_rd_atomic > hattr.hca_max_rdma_in_qp) {
1752 		mutex_exit(&qp->lock);
1753 		ofs_lock_exit(&ofs_client->lock);
1754 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1755 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
1756 		    "invalid attr->max_dest_rd_atomic(0x%x), "
1757 		    "max_rdma_in_qp(0x%x)", qp, attr, attr_mask,
1758 		    attr->max_dest_rd_atomic, hattr.hca_max_rdma_in_qp);
1759 		return (-EINVAL);
1760 	}
1761 
1762 	/* copy the current setting */
1763 	modify_attr = qp_attr.qp_info;
1764 
1765 	/*
1766 	 * Since it's already checked if the modification request matches
1767 	 * the new and/or current states, just assign both of states to
1768 	 * modify_attr here. The current state is required if qp_state
1769 	 * is RTR, but it's harmelss otherwise, so it's set always.
1770 	 */
1771 	modify_attr.qp_current_state = OF2IBTF_STATE(cur_state);
1772 	modify_attr.qp_state = OF2IBTF_STATE(new_state);
1773 	modify_attr.qp_trans = of2ibtf_qp_type(qp->qp_type);
1774 
1775 	/* Convert OF modification requests into IBTF ones */
1776 	flags = IBT_CEP_SET_STATE;	/* IBTF needs IBT_CEP_SET_STATE */
1777 	if (cur_state == IB_QPS_RESET &&
1778 	    new_state == IB_QPS_INIT) {
1779 		flags |= IBT_CEP_SET_RESET_INIT;
1780 	} else if (cur_state == IB_QPS_INIT &&
1781 	    new_state == IB_QPS_RTR) {
1782 		flags |= IBT_CEP_SET_INIT_RTR;
1783 	} else if (cur_state == IB_QPS_RTR &&
1784 	    new_state == IB_QPS_RTS) {
1785 		flags |= IBT_CEP_SET_RTR_RTS;
1786 	}
1787 	if (attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
1788 		flags |= IBT_CEP_SET_SQD_EVENT;
1789 	}
1790 	if (attr_mask & IB_QP_ACCESS_FLAGS) {
1791 		modify_attr.qp_flags &= ~(IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1792 		    IBT_CEP_ATOMIC);
1793 		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) {
1794 			flags |= IBT_CEP_SET_RDMA_R;
1795 			modify_attr.qp_flags |= IBT_CEP_RDMA_RD;
1796 		}
1797 		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) {
1798 			flags |= IBT_CEP_SET_RDMA_W;
1799 			modify_attr.qp_flags |= IBT_CEP_RDMA_WR;
1800 		}
1801 		if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) {
1802 			flags |= IBT_CEP_SET_ATOMIC;
1803 			modify_attr.qp_flags |= IBT_CEP_ATOMIC;
1804 		}
1805 	}
1806 	if (attr_mask & IB_QP_PKEY_INDEX) {
1807 		flags |= IBT_CEP_SET_PKEY_IX;
1808 		switch (qp->qp_type)  {
1809 		case IB_QPT_SMI:
1810 		case IB_QPT_GSI:
1811 		case IB_QPT_UD:
1812 			modify_attr.qp_transport.ud.ud_pkey_ix =
1813 			    attr->pkey_index;
1814 			break;
1815 		case IB_QPT_RC:
1816 			modify_attr.qp_transport.rc.rc_path.cep_pkey_ix =
1817 			    attr->pkey_index;
1818 			break;
1819 		case IB_QPT_UC:
1820 			modify_attr.qp_transport.uc.uc_path.cep_pkey_ix =
1821 			    attr->pkey_index;
1822 			break;
1823 		default:
1824 			/* This should never happen */
1825 			mutex_exit(&qp->lock);
1826 			ofs_lock_exit(&ofs_client->lock);
1827 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1828 			    "ib_modify_qp(IB_QP_PKEY_INDEX): qp: 0x%p, "
1829 			    "attr: 0x%p, attr_mask: 0x%x => "
1830 			    "invalid qp->qp_type(%d)",
1831 			    qp, attr, attr_mask, qp->qp_type);
1832 			return (-EINVAL);
1833 		}
1834 	}
1835 	if (attr_mask & IB_QP_PORT) {
1836 		flags |= IBT_CEP_SET_PORT;
1837 		switch (qp->qp_type) {
1838 		case IB_QPT_SMI:
1839 		case IB_QPT_GSI:
1840 		case IB_QPT_UD:
1841 			modify_attr.qp_transport.ud.ud_port = attr->port_num;
1842 			break;
1843 		case IB_QPT_RC:
1844 			modify_attr.qp_transport.rc.rc_path.cep_hca_port_num =
1845 			    attr->port_num;
1846 			break;
1847 		case IB_QPT_UC:
1848 			modify_attr.qp_transport.uc.uc_path.cep_hca_port_num =
1849 			    attr->port_num;
1850 			break;
1851 		default:
1852 			/* This should never happen */
1853 			mutex_exit(&qp->lock);
1854 			ofs_lock_exit(&ofs_client->lock);
1855 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1856 			    "ib_modify_qp(IB_QP_PORT): qp: 0x%p, "
1857 			    "attr: 0x%p, attr_mask: 0x%x => "
1858 			    "invalid qp->qp_type(%d)",
1859 			    qp, attr, attr_mask, qp->qp_type);
1860 			return (-EINVAL);
1861 		}
1862 	}
1863 	if (attr_mask & IB_QP_QKEY) {
1864 		ASSERT(qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_SMI ||
1865 		    qp->qp_type == IB_QPT_GSI);
1866 		flags |= IBT_CEP_SET_QKEY;
1867 		modify_attr.qp_transport.ud.ud_qkey = attr->qkey;
1868 	}
1869 	if (attr_mask & IB_QP_AV) {
1870 		flags |= IBT_CEP_SET_ADDS_VECT;
1871 		switch (qp->qp_type) {
1872 		case IB_QPT_RC:
1873 			set_av(&attr->ah_attr,
1874 			    &modify_attr.qp_transport.rc.rc_path);
1875 			break;
1876 		case IB_QPT_UC:
1877 			set_av(&attr->ah_attr,
1878 			    &modify_attr.qp_transport.uc.uc_path);
1879 			break;
1880 		case IB_QPT_SMI:
1881 		case IB_QPT_GSI:
1882 		case IB_QPT_UD:
1883 		default:
1884 			/* This should never happen */
1885 			mutex_exit(&qp->lock);
1886 			ofs_lock_exit(&ofs_client->lock);
1887 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1888 			    "ib_modify_qp(IB_QP_AV): qp: 0x%p, "
1889 			    "attr: 0x%p, attr_mask: 0x%x => "
1890 			    "invalid qp->qp_type(%d)",
1891 			    qp, attr, attr_mask, qp->qp_type);
1892 			return (-EINVAL);
1893 		}
1894 	}
1895 	if (attr_mask & IB_QP_PATH_MTU) {
1896 		switch (qp->qp_type) {
1897 		case IB_QPT_RC:
1898 			modify_attr.qp_transport.rc.rc_path_mtu =
1899 			    OF2IBTF_PATH_MTU(attr->path_mtu);
1900 			break;
1901 		case IB_QPT_UC:
1902 			modify_attr.qp_transport.uc.uc_path_mtu =
1903 			    OF2IBTF_PATH_MTU(attr->path_mtu);
1904 			break;
1905 		case IB_QPT_SMI:
1906 		case IB_QPT_GSI:
1907 		case IB_QPT_UD:
1908 		default:
1909 			/* nothing to do */
1910 			break;
1911 		}
1912 	}
1913 	if (attr_mask & IB_QP_TIMEOUT && qp->qp_type == IB_QPT_RC) {
1914 		flags |= IBT_CEP_SET_TIMEOUT;
1915 		modify_attr.qp_transport.rc.rc_path.cep_timeout =
1916 		    attr->timeout;
1917 	}
1918 	if (attr_mask & IB_QP_RETRY_CNT && qp->qp_type == IB_QPT_RC) {
1919 		flags |= IBT_CEP_SET_RETRY;
1920 		modify_attr.qp_transport.rc.rc_retry_cnt =
1921 		    attr->retry_cnt & 0x7;
1922 	}
1923 	if (attr_mask & IB_QP_RNR_RETRY && qp->qp_type == IB_QPT_RC) {
1924 		flags |= IBT_CEP_SET_RNR_NAK_RETRY;
1925 		modify_attr.qp_transport.rc.rc_rnr_retry_cnt =
1926 		    attr->rnr_retry & 0x7;
1927 	}
1928 	if (attr_mask & IB_QP_RQ_PSN) {
1929 		switch (qp->qp_type) {
1930 		case IB_QPT_RC:
1931 			modify_attr.qp_transport.rc.rc_rq_psn =
1932 			    attr->rq_psn & 0xFFFFFF;
1933 			break;
1934 		case IB_QPT_UC:
1935 			modify_attr.qp_transport.uc.uc_rq_psn =
1936 			    attr->rq_psn & 0xFFFFFF;
1937 			break;
1938 		case IB_QPT_SMI:
1939 		case IB_QPT_GSI:
1940 		case IB_QPT_UD:
1941 		default:
1942 			/* nothing to do */
1943 			break;
1944 		}
1945 	}
1946 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && qp->qp_type == IB_QPT_RC) {
1947 		if (attr->max_rd_atomic) {
1948 			flags |= IBT_CEP_SET_RDMARA_OUT;
1949 			modify_attr.qp_transport.rc.rc_rdma_ra_out =
1950 			    attr->max_rd_atomic;
1951 		}
1952 	}
1953 	if (attr_mask & IB_QP_ALT_PATH) {
1954 		/* Sanity checks */
1955 		if (attr->alt_port_num == 0 ||
1956 		    attr->alt_port_num > hattr.hca_nports) {
1957 			mutex_exit(&qp->lock);
1958 			ofs_lock_exit(&ofs_client->lock);
1959 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1960 			    "ib_modify_qp: qp: 0x%p, attr: 0x%p, "
1961 			    "attr_mask: 0x%x => invalid attr->alt_port_num"
1962 			    "(%d), max_nports(%d)",
1963 			    qp, attr, attr_mask, attr->alt_port_num,
1964 			    hattr.hca_nports);
1965 			return (-EINVAL);
1966 		}
1967 		if (attr->alt_pkey_index >= hattr.hca_max_port_pkey_tbl_sz) {
1968 			mutex_exit(&qp->lock);
1969 			ofs_lock_exit(&ofs_client->lock);
1970 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
1971 			    "ib_modify_qp: qp: 0x%p, attr: 0x%p, "
1972 			    "attr_mask: 0x%x => invalid attr->alt_pkey_index"
1973 			    "(%d), max_port_key_index(%d)",
1974 			    qp, attr, attr_mask, attr->alt_pkey_index,
1975 			    hattr.hca_max_port_pkey_tbl_sz);
1976 			return (-EINVAL);
1977 		}
1978 		flags |= IBT_CEP_SET_ALT_PATH;
1979 		switch (qp->qp_type) {
1980 		case IB_QPT_RC:
1981 			modify_attr.qp_transport.rc.rc_alt_path.
1982 			    cep_pkey_ix = attr->alt_pkey_index;
1983 			modify_attr.qp_transport.rc.rc_alt_path.
1984 			    cep_hca_port_num = attr->alt_port_num;
1985 			set_av(&attr->alt_ah_attr,
1986 			    &modify_attr.qp_transport.rc.rc_alt_path);
1987 			modify_attr.qp_transport.rc.rc_alt_path.
1988 			    cep_timeout = attr->alt_timeout;
1989 			break;
1990 		case IB_QPT_UC:
1991 			modify_attr.qp_transport.uc.uc_alt_path.
1992 			    cep_pkey_ix = attr->alt_pkey_index;
1993 			modify_attr.qp_transport.uc.uc_alt_path.
1994 			    cep_hca_port_num = attr->alt_port_num;
1995 			set_av(&attr->alt_ah_attr,
1996 			    &modify_attr.qp_transport.uc.uc_alt_path);
1997 			modify_attr.qp_transport.uc.uc_alt_path.
1998 			    cep_timeout = attr->alt_timeout;
1999 			break;
2000 		case IB_QPT_SMI:
2001 		case IB_QPT_GSI:
2002 		case IB_QPT_UD:
2003 		default:
2004 			/* This should never happen */
2005 			mutex_exit(&qp->lock);
2006 			ofs_lock_exit(&ofs_client->lock);
2007 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2008 			    "ib_modify_qp(IB_QP_ALT_PATH): qp: 0x%p, "
2009 			    "attr: 0x%p, attr_mask: 0x%x => "
2010 			    "invalid qp->qp_type(%d)",
2011 			    qp, attr, attr_mask, qp->qp_type);
2012 			return (-EINVAL);
2013 		}
2014 	}
2015 	if (attr_mask & IB_QP_MIN_RNR_TIMER && qp->qp_type == IB_QPT_RC) {
2016 		flags |= IBT_CEP_SET_MIN_RNR_NAK;
2017 		modify_attr.qp_transport.rc.rc_min_rnr_nak =
2018 		    attr->min_rnr_timer & 0x1F;
2019 	}
2020 	if (attr_mask & IB_QP_SQ_PSN) {
2021 		switch (qp->qp_type)  {
2022 		case IB_QPT_SMI:
2023 		case IB_QPT_GSI:
2024 		case IB_QPT_UD:
2025 			modify_attr.qp_transport.ud.ud_sq_psn =
2026 			    attr->sq_psn;
2027 			break;
2028 		case IB_QPT_RC:
2029 			modify_attr.qp_transport.rc.rc_sq_psn =
2030 			    attr->sq_psn;
2031 			break;
2032 		case IB_QPT_UC:
2033 			modify_attr.qp_transport.uc.uc_sq_psn =
2034 			    attr->sq_psn;
2035 			break;
2036 		default:
2037 			/* This should never happen */
2038 			mutex_exit(&qp->lock);
2039 			ofs_lock_exit(&ofs_client->lock);
2040 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2041 			    "ib_modify_qp(IB_QP_SQ_PSN): qp: 0x%p, "
2042 			    "attr: 0x%p, attr_mask: 0x%x => "
2043 			    "invalid qp->qp_type(%d)",
2044 			    qp, attr, attr_mask, qp->qp_type);
2045 			return (-EINVAL);
2046 		}
2047 	}
2048 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && qp->qp_type == IB_QPT_RC) {
2049 		/* Linux OF sets the value if max_dest_rd_atomic is not zero */
2050 		if (attr->max_dest_rd_atomic) {
2051 			flags |= IBT_CEP_SET_RDMARA_IN;
2052 			modify_attr.qp_transport.rc.rc_rdma_ra_in =
2053 			    attr->max_dest_rd_atomic;
2054 		}
2055 	}
2056 	if (attr_mask & IB_QP_PATH_MIG_STATE) {
2057 		flags |= IBT_CEP_SET_MIG;
2058 		switch (qp->qp_type)  {
2059 		case IB_QPT_RC:
2060 			modify_attr.qp_transport.rc.rc_mig_state =
2061 			    OF2IBTF_PATH_MIG_STATE(attr->path_mig_state);
2062 			break;
2063 		case IB_QPT_UC:
2064 			modify_attr.qp_transport.uc.uc_mig_state =
2065 			    OF2IBTF_PATH_MIG_STATE(attr->path_mig_state);
2066 			break;
2067 		case IB_QPT_SMI:
2068 		case IB_QPT_GSI:
2069 		case IB_QPT_UD:
2070 		default:
2071 			/* This should never happen */
2072 			mutex_exit(&qp->lock);
2073 			ofs_lock_exit(&ofs_client->lock);
2074 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2075 			    "ib_modify_qp(IB_QP_PATH_MIG_STATE): qp: 0x%p, "
2076 			    "attr: 0x%p, attr_mask: 0x%x => "
2077 			    "invalid qp->qp_type(%d)",
2078 			    qp, attr, attr_mask, qp->qp_type);
2079 			return (-EINVAL);
2080 		}
2081 	}
2082 	if (attr_mask & IB_QP_CAP) {
2083 		/* IB_QP_CAP is not supported */
2084 		mutex_exit(&qp->lock);
2085 		ofs_lock_exit(&ofs_client->lock);
2086 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2087 		    "ib_modify_qp: qp: 0x%p, attr: 0x%p, "
2088 		    "attr_mask: 0x%x => IB_QP_CAP is not supported",
2089 		    qp, attr, attr_mask);
2090 		return (-EINVAL);
2091 	}
2092 	if (attr_mask & IB_QP_DEST_QPN) {
2093 		switch (qp->qp_type)  {
2094 		case IB_QPT_RC:
2095 			modify_attr.qp_transport.rc.rc_dst_qpn =
2096 			    attr->dest_qp_num;
2097 			break;
2098 		case IB_QPT_UC:
2099 			modify_attr.qp_transport.uc.uc_dst_qpn =
2100 			    attr->dest_qp_num;
2101 			break;
2102 		case IB_QPT_SMI:
2103 		case IB_QPT_GSI:
2104 		case IB_QPT_UD:
2105 		default:
2106 			/* This should never happen */
2107 			mutex_exit(&qp->lock);
2108 			ofs_lock_exit(&ofs_client->lock);
2109 			SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2110 			    "ib_modify_qp(IB_QP_DEST_PSN): qp: 0x%p, "
2111 			    "attr: 0x%p, attr_mask: 0x%x => "
2112 			    "invalid qp->qp_type(%d)",
2113 			    qp, attr, attr_mask, qp->qp_type);
2114 			return (-EINVAL);
2115 		}
2116 	}
2117 
2118 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
2119 	    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x, "
2120 	    "flags: 0x%x, modify_attr: 0x%p",
2121 	    qp, attr, attr_mask, flags, &modify_attr);
2122 
2123 	/* Modify the QP attributes */
2124 	rtn = ibt_modify_qp(qp->ibt_qp, flags, &modify_attr, NULL);
2125 	if (rtn == IBT_SUCCESS) {
2126 		mutex_exit(&qp->lock);
2127 		ofs_lock_exit(&ofs_client->lock);
2128 		return (0);
2129 	}
2130 	mutex_exit(&qp->lock);
2131 	ofs_lock_exit(&ofs_client->lock);
2132 
2133 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2134 	    "ib_modify_qp: qp: 0x%p, attr: 0x%p, attr_mask: 0x%x => "
2135 	    "ibt_modify_qp failed w/ %d, flags: 0x%x",
2136 	    qp, attr, attr_mask, rtn, flags);
2137 
2138 	switch (rtn) {
2139 	case IBT_HCA_HDL_INVALID:
2140 	case IBT_QP_HDL_INVALID:
2141 	case IBT_QP_SRV_TYPE_INVALID:
2142 	case IBT_QP_STATE_INVALID:
2143 	case IBT_HCA_PORT_INVALID:
2144 	case IBT_PKEY_IX_ILLEGAL:
2145 		return (-EINVAL);
2146 	default:
2147 		return (-EIO);
2148 	}
2149 }
2150 
2151 static inline enum ib_wc_status
2152 ibt2of_wc_status(ibt_wc_status_t status)
2153 {
2154 	switch (status) {
2155 	case IBT_WC_LOCAL_LEN_ERR:
2156 		return (IB_WC_LOC_LEN_ERR);
2157 	case IBT_WC_LOCAL_CHAN_OP_ERR:
2158 		return (IB_WC_LOC_QP_OP_ERR);
2159 	case IBT_WC_LOCAL_PROTECT_ERR:
2160 		return (IB_WC_LOC_PROT_ERR);
2161 	case IBT_WC_WR_FLUSHED_ERR:
2162 		return (IB_WC_WR_FLUSH_ERR);
2163 	case IBT_WC_MEM_WIN_BIND_ERR:
2164 		return (IB_WC_MW_BIND_ERR);
2165 	case IBT_WC_BAD_RESPONSE_ERR:
2166 		return (IB_WC_BAD_RESP_ERR);
2167 	case IBT_WC_LOCAL_ACCESS_ERR:
2168 		return (IB_WC_LOC_ACCESS_ERR);
2169 	case IBT_WC_REMOTE_INVALID_REQ_ERR:
2170 		return (IB_WC_REM_INV_REQ_ERR);
2171 	case IBT_WC_REMOTE_ACCESS_ERR:
2172 		return (IB_WC_REM_ACCESS_ERR);
2173 	case IBT_WC_REMOTE_OP_ERR:
2174 		return (IB_WC_REM_OP_ERR);
2175 	case IBT_WC_TRANS_TIMEOUT_ERR:
2176 		return (IB_WC_RETRY_EXC_ERR);
2177 	case IBT_WC_RNR_NAK_TIMEOUT_ERR:
2178 		return (IB_WC_RNR_RETRY_EXC_ERR);
2179 	case IBT_WC_SUCCESS:
2180 	default:
2181 		/* Hermon doesn't support EEC yet */
2182 		ASSERT(status == IBT_WC_SUCCESS);
2183 		return (IB_WC_SUCCESS);
2184 	}
2185 }
2186 
2187 static inline enum ib_wc_opcode
2188 ibt2of_wc_opcode(ibt_wrc_opcode_t wc_type)
2189 {
2190 	switch (wc_type) {
2191 	case IBT_WRC_SEND:
2192 		return (IB_WC_SEND);
2193 	case IBT_WRC_RDMAR:
2194 		return (IB_WC_RDMA_READ);
2195 	case IBT_WRC_RDMAW:
2196 		return (IB_WC_RDMA_WRITE);
2197 	case IBT_WRC_CSWAP:
2198 		return (IB_WC_COMP_SWAP);
2199 	case IBT_WRC_FADD:
2200 		return (IB_WC_FETCH_ADD);
2201 	case IBT_WRC_BIND:
2202 		return (IB_WC_BIND_MW);
2203 	case IBT_WRC_RECV:
2204 		return (IB_WC_RECV);
2205 	case IBT_WRC_RECV_RDMAWI:
2206 	default:
2207 		ASSERT(wc_type == IBT_WRC_RECV_RDMAWI);
2208 		return (IB_WC_RECV_RDMA_WITH_IMM);
2209 	}
2210 }
2211 
2212 static inline int
2213 ibt2of_wc_flags(ibt_wc_flags_t wc_flags)
2214 {
2215 	return (wc_flags & ~IBT_WC_CKSUM_OK);
2216 }
2217 
2218 static inline void
2219 set_wc(ibt_wc_t *ibt_wc, struct ib_wc *wc)
2220 {
2221 	wc->wr_id = ibt_wc->wc_id;
2222 	wc->status = ibt2of_wc_status(ibt_wc->wc_status);
2223 	/* opcode can be undefined if status is not success */
2224 	if (wc->status == IB_WC_SUCCESS) {
2225 		wc->opcode = ibt2of_wc_opcode(ibt_wc->wc_type);
2226 	}
2227 	wc->vendor_err = 0;			/* not supported */
2228 	wc->byte_len = ibt_wc->wc_bytes_xfer;
2229 	wc->qp = NULL;				/* not supported */
2230 	wc->imm_data = htonl(ibt_wc->wc_immed_data);
2231 	wc->src_qp = ibt_wc->wc_qpn;
2232 	wc->wc_flags = ibt2of_wc_flags(ibt_wc->wc_flags);
2233 	wc->pkey_index = ibt_wc->wc_pkey_ix;
2234 	wc->slid = ibt_wc->wc_slid;
2235 	wc->sl = ibt_wc->wc_sl;
2236 	wc->dlid_path_bits = ibt_wc->wc_path_bits;
2237 	wc->port_num = 0;			/* not supported */
2238 }
2239 
2240 /*
2241  * ib_poll_cq - poll a CQ for completion(s)
2242  * @cq:the CQ being polled
2243  * @num_entries:maximum number of completions to return
2244  * @wc:array of at least @num_entries &struct ib_wc where completions
2245  *   will be returned
2246  *
2247  * Poll a CQ for (possibly multiple) completions.  If the return value
2248  * is < 0, an error occurred.  If the return value is >= 0, it is the
2249  * number of completions returned.  If the return value is
2250  * non-negative and < num_entries, then the CQ was emptied.
2251  *
2252  * Note that three following memebers in struct ib_wc are not supported
2253  * currently, and the values are always either 0 or NULL.
2254  *	u32			vendor_err;
2255  *	struct ib_qp		*qp;
2256  *	u8			port_num;
2257  */
2258 int
2259 ib_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
2260 {
2261 	ibt_wc_t	ibt_wc;
2262 	int		npolled;
2263 	ibt_status_t	rtn;
2264 	ofs_client_t	*ofs_client = (ofs_client_t *)cq->device->clnt_hdl;
2265 
2266 	ofs_lock_enter(&ofs_client->lock);
2267 	if (cq->device->reg_state != IB_DEV_OPEN) {
2268 		ofs_lock_exit(&ofs_client->lock);
2269 		SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2270 		    "ib_poll_cq: cq: 0x%p => invalid device state (%d)",
2271 		    cq, cq->device->reg_state);
2272 		return (-ENXIO);
2273 	}
2274 
2275 	SOL_OFS_DPRINTF_L3(sol_kverbs_dbg_str,
2276 	    "ib_poll_cq: cq: 0x%p, num_entries: %d, wc: 0x%p, "
2277 	    "ibt_cq: 0x%p, ibt_wc: 0x%p",
2278 	    cq, num_entries, wc, cq->ibt_cq, &ibt_wc);
2279 
2280 	/* only one thread per cq is allowed during ibt_poll_cq() */
2281 	mutex_enter(&cq->lock);
2282 	for (npolled = 0; npolled < num_entries; ++npolled) {
2283 		bzero(&ibt_wc, sizeof (ibt_wc_t));
2284 		rtn = ibt_poll_cq(cq->ibt_cq, &ibt_wc, 1, NULL);
2285 		if (rtn != IBT_SUCCESS) {
2286 			break;
2287 		}
2288 		/* save this result to struct ib_wc */
2289 		set_wc(&ibt_wc, wc + npolled);
2290 	}
2291 	mutex_exit(&cq->lock);
2292 	ofs_lock_exit(&ofs_client->lock);
2293 
2294 	if (rtn == IBT_SUCCESS || rtn == IBT_CQ_EMPTY) {
2295 		return (npolled);
2296 	}
2297 
2298 	SOL_OFS_DPRINTF_L2(sol_kverbs_dbg_str,
2299 	    "ib_poll_cq: cq: 0x%p, num_entries: %d, wc: 0x%p => "
2300 	    "ibt_poll_cq failed w/ %d, npolled = %d",
2301 	    cq, num_entries, wc, rtn, npolled);
2302 
2303 	switch (rtn) {
2304 	case IBT_HCA_HDL_INVALID:
2305 	case IBT_CQ_HDL_INVALID:
2306 	case IBT_INVALID_PARAM:
2307 		return (-EINVAL);
2308 	default:
2309 		return (-EIO);
2310 	}
2311 }
2312 
2313 ibt_hca_hdl_t
2314 ib_get_ibt_hca_hdl(struct ib_device *device)
2315 {
2316 	return (device->hca_hdl);
2317 }
2318 
2319 ibt_channel_hdl_t
2320 ib_get_ibt_channel_hdl(struct rdma_cm_id *cm)
2321 {
2322 	return (cm->qp == NULL ? NULL : cm->qp->ibt_qp);
2323 }
2324