xref: /freebsd/contrib/ofed/librdmacm/cma.c (revision 7ef62cebc2f965b0f640263e179276928885e33d)
1 /*
2  * Copyright (c) 2005-2014 Intel Corporation.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <config.h>
34 
35 #include <stdlib.h>
36 #include <string.h>
37 #include <glob.h>
38 #include <stdio.h>
39 #include <fcntl.h>
40 #include <errno.h>
41 #include <stdint.h>
42 #include <poll.h>
43 #include <unistd.h>
44 #include <pthread.h>
45 #include <infiniband/endian.h>
46 #include <stddef.h>
47 #include <netdb.h>
48 #include <syslog.h>
49 #include <limits.h>
50 
51 #include "cma.h"
52 #include "indexer.h"
53 #include <infiniband/driver.h>
54 #include <infiniband/marshall.h>
55 #include <rdma/rdma_cma.h>
56 #include <rdma/rdma_cma_abi.h>
57 #include <rdma/rdma_verbs.h>
58 #include <infiniband/ib.h>
59 
/*
 * Prepare a ucma command for submission: zero the whole request, then fill
 * in the header with opcode UCMA_CMD_<op> and an input length equal to the
 * request size minus the size of struct ucma_abi_cmd_hdr.
 *
 * req is a pointer to the command structure and req_size its size in bytes.
 * Every argument use is parenthesised so that an expression argument cannot
 * re-associate with the subtraction below.
 */
#define CMA_INIT_CMD(req, req_size, op)		\
do {						\
	memset((req), 0, (req_size));		\
	(req)->cmd = UCMA_CMD_##op;		\
	(req)->in  = (req_size) - sizeof(struct ucma_abi_cmd_hdr); \
} while (0)
66 
/*
 * Same as CMA_INIT_CMD, for commands that return data: additionally record
 * the address of the response buffer (as an integer) and its size in the
 * request header.
 */
#define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size) \
do {						\
	CMA_INIT_CMD(req, req_size, op);	\
	(req)->response = (uintptr_t) (resp);	\
	(req)->out = (resp_size);		\
} while (0)
73 
/* Per-port information cached when a device is initialized. */
struct cma_port {
	/*
	 * Link layer reported by ibv_query_port(), or
	 * IBV_LINK_LAYER_UNSPECIFIED when that query failed.
	 */
	uint8_t link_layer;
};
77 
78 struct cma_device {
79 	struct ibv_context *verbs;
80 	struct ibv_pd	   *pd;
81 	struct ibv_xrcd    *xrcd;
82 	struct cma_port    *port;
83 	__be64		    guid;
84 	int		    port_cnt;
85 	int		    refcnt;
86 	int		    max_qpsize;
87 	uint8_t		    max_initiator_depth;
88 	uint8_t		    max_responder_resources;
89 };
90 
91 struct cma_id_private {
92 	struct rdma_cm_id	id;
93 	struct cma_device	*cma_dev;
94 	void			*connect;
95 	size_t			connect_len;
96 	int			events_completed;
97 	int			connect_error;
98 	int			sync;
99 	pthread_cond_t		cond;
100 	pthread_mutex_t		mut;
101 	uint32_t		handle;
102 	struct cma_multicast	*mc_list;
103 	struct ibv_qp_init_attr	*qp_init_attr;
104 	uint8_t			initiator_depth;
105 	uint8_t			responder_resources;
106 };
107 
108 struct cma_multicast {
109 	struct cma_multicast  *next;
110 	struct cma_id_private *id_priv;
111 	void		*context;
112 	int		events_completed;
113 	pthread_cond_t	cond;
114 	uint32_t	handle;
115 	union ibv_gid	mgid;
116 	uint16_t	mlid;
117 	struct sockaddr_storage addr;
118 };
119 
120 struct cma_event {
121 	struct rdma_cm_event	event;
122 	uint8_t			private_data[RDMA_MAX_PRIVATE_DATA];
123 	struct cma_id_private	*id_priv;
124 	struct cma_multicast	*mc;
125 };
126 
127 static struct cma_device *cma_dev_array;
128 static int cma_dev_cnt;
129 static int cma_init_cnt;
130 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
131 static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
132 int af_ib_support;
133 static struct index_map ucma_idm;
134 static fastlock_t idm_lock;
135 
136 static int check_abi_version(void)
137 {
138 	char value[8];
139 
140 	if ((ibv_read_sysfs_file(ibv_get_sysfs_path(),
141 				 "class/misc/rdma_cm/abi_version",
142 				 value, sizeof value) < 0) &&
143 	    (ibv_read_sysfs_file(ibv_get_sysfs_path(),
144 				 "class/infiniband_ucma/abi_version",
145 				 value, sizeof value) < 0)) {
146 		/*
147 		 * Older version of Linux do not have class/misc.  To support
148 		 * backports, assume the most recent version of the ABI.  If
149 		 * we're wrong, we'll simply fail later when calling the ABI.
150 		 */
151 		return 0;
152 	}
153 
154 	abi_ver = strtol(value, NULL, 10);
155 	if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION ||
156 	    abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) {
157 		return -1;
158 	}
159 	return 0;
160 }
161 
162 /*
163  * This function is called holding the mutex lock
164  * cma_dev_cnt must be set before calling this function to
165  * ensure that the lock is not acquired recursively.
166  */
167 static void ucma_set_af_ib_support(void)
168 {
169 	struct rdma_cm_id *id;
170 	struct sockaddr_ib sib;
171 	int ret;
172 
173 	ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB);
174 	if (ret)
175 		return;
176 
177 	memset(&sib, 0, sizeof sib);
178 	sib.sib_family = AF_IB;
179 	sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP);
180 	sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK);
181 	af_ib_support = 1;
182 	ret = rdma_bind_addr(id, (struct sockaddr *) &sib);
183 	af_ib_support = !ret;
184 
185 	rdma_destroy_id(id);
186 }
187 
188 int ucma_init(void)
189 {
190 	struct ibv_device **dev_list = NULL;
191 	int i, ret, dev_cnt;
192 
193 	/* Quick check without lock to see if we're already initialized */
194 	if (cma_dev_cnt)
195 		return 0;
196 
197 	pthread_mutex_lock(&mut);
198 	if (cma_dev_cnt) {
199 		pthread_mutex_unlock(&mut);
200 		return 0;
201 	}
202 
203 	fastlock_init(&idm_lock);
204 	ret = check_abi_version();
205 	if (ret)
206 		goto err1;
207 
208 	dev_list = ibv_get_device_list(&dev_cnt);
209 	if (!dev_list) {
210 		ret = ERR(ENODEV);
211 		goto err1;
212 	}
213 
214 	if (!dev_cnt) {
215 		ret = ERR(ENODEV);
216 		goto err2;
217 	}
218 
219 	cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array));
220 	if (!cma_dev_array) {
221 		ret = ERR(ENOMEM);
222 		goto err2;
223 	}
224 
225 	for (i = 0; dev_list[i]; i++)
226 		cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]);
227 
228 	cma_dev_cnt = dev_cnt;
229 	ucma_set_af_ib_support();
230 	pthread_mutex_unlock(&mut);
231 	ibv_free_device_list(dev_list);
232 	return 0;
233 
234 err2:
235 	ibv_free_device_list(dev_list);
236 err1:
237 	fastlock_destroy(&idm_lock);
238 	pthread_mutex_unlock(&mut);
239 	return ret;
240 }
241 
242 static struct ibv_context *ucma_open_device(__be64 guid)
243 {
244 	struct ibv_device **dev_list;
245 	struct ibv_context *verbs = NULL;
246 	int i;
247 
248 	dev_list = ibv_get_device_list(NULL);
249 	if (!dev_list) {
250 		return NULL;
251 	}
252 
253 	for (i = 0; dev_list[i]; i++) {
254 		if (ibv_get_device_guid(dev_list[i]) == guid) {
255 			verbs = ibv_open_device(dev_list[i]);
256 			break;
257 		}
258 	}
259 
260 	ibv_free_device_list(dev_list);
261 	return verbs;
262 }
263 
264 static int ucma_init_device(struct cma_device *cma_dev)
265 {
266 	struct ibv_port_attr port_attr;
267 	struct ibv_device_attr attr;
268 	int i, ret;
269 
270 	if (cma_dev->verbs)
271 		return 0;
272 
273 	cma_dev->verbs = ucma_open_device(cma_dev->guid);
274 	if (!cma_dev->verbs)
275 		return ERR(ENODEV);
276 
277 	ret = ibv_query_device(cma_dev->verbs, &attr);
278 	if (ret) {
279 		ret = ERR(ret);
280 		goto err;
281 	}
282 
283 	cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt);
284 	if (!cma_dev->port) {
285 		ret = ERR(ENOMEM);
286 		goto err;
287 	}
288 
289 	for (i = 1; i <= attr.phys_port_cnt; i++) {
290 		if (ibv_query_port(cma_dev->verbs, i, &port_attr))
291 			cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED;
292 		else
293 			cma_dev->port[i - 1].link_layer = port_attr.link_layer;
294 	}
295 
296 	cma_dev->port_cnt = attr.phys_port_cnt;
297 	cma_dev->max_qpsize = attr.max_qp_wr;
298 	cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
299 	cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
300 	cma_init_cnt++;
301 	return 0;
302 
303 err:
304 	ibv_close_device(cma_dev->verbs);
305 	cma_dev->verbs = NULL;
306 	return ret;
307 }
308 
309 static int ucma_init_all(void)
310 {
311 	int i, ret = 0;
312 
313 	if (!cma_dev_cnt) {
314 		ret = ucma_init();
315 		if (ret)
316 			return ret;
317 	}
318 
319 	if (cma_init_cnt == cma_dev_cnt)
320 		return 0;
321 
322 	pthread_mutex_lock(&mut);
323 	for (i = 0; i < cma_dev_cnt; i++) {
324 		ret = ucma_init_device(&cma_dev_array[i]);
325 		if (ret)
326 			break;
327 	}
328 	pthread_mutex_unlock(&mut);
329 	return ret;
330 }
331 
332 struct ibv_context **rdma_get_devices(int *num_devices)
333 {
334 	struct ibv_context **devs = NULL;
335 	int i;
336 
337 	if (ucma_init_all())
338 		goto out;
339 
340 	devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1));
341 	if (!devs)
342 		goto out;
343 
344 	for (i = 0; i < cma_dev_cnt; i++)
345 		devs[i] = cma_dev_array[i].verbs;
346 	devs[i] = NULL;
347 out:
348 	if (num_devices)
349 		*num_devices = devs ? cma_dev_cnt : 0;
350 	return devs;
351 }
352 
/*
 * Release an array obtained from rdma_get_devices().  Only the array
 * itself is freed; the device contexts it points to are left untouched.
 */
void rdma_free_devices(struct ibv_context **list)
{
	free(list);
}
357 
358 struct rdma_event_channel *rdma_create_event_channel(void)
359 {
360 	struct rdma_event_channel *channel;
361 
362 	if (ucma_init())
363 		return NULL;
364 
365 	channel = malloc(sizeof(*channel));
366 	if (!channel)
367 		return NULL;
368 
369 	channel->fd = open("/dev/rdma_cm", O_RDWR | O_CLOEXEC);
370 	if (channel->fd < 0) {
371 		goto err;
372 	}
373 	return channel;
374 err:
375 	free(channel);
376 	return NULL;
377 }
378 
379 void rdma_destroy_event_channel(struct rdma_event_channel *channel)
380 {
381 	close(channel->fd);
382 	free(channel);
383 }
384 
385 static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid)
386 {
387 	struct cma_device *cma_dev;
388 	int i, ret;
389 
390 	for (i = 0; i < cma_dev_cnt; i++) {
391 		cma_dev = &cma_dev_array[i];
392 		if (cma_dev->guid == guid)
393 			goto match;
394 	}
395 
396 	return ERR(ENODEV);
397 match:
398 	pthread_mutex_lock(&mut);
399 	if ((ret = ucma_init_device(cma_dev)))
400 		goto out;
401 
402 	if (!cma_dev->refcnt++) {
403 		cma_dev->pd = ibv_alloc_pd(cma_dev->verbs);
404 		if (!cma_dev->pd) {
405 			cma_dev->refcnt--;
406 			ret = ERR(ENOMEM);
407 			goto out;
408 		}
409 	}
410 	id_priv->cma_dev = cma_dev;
411 	id_priv->id.verbs = cma_dev->verbs;
412 	id_priv->id.pd = cma_dev->pd;
413 out:
414 	pthread_mutex_unlock(&mut);
415 	return ret;
416 }
417 
418 static void ucma_put_device(struct cma_device *cma_dev)
419 {
420 	pthread_mutex_lock(&mut);
421 	if (!--cma_dev->refcnt) {
422 		ibv_dealloc_pd(cma_dev->pd);
423 		if (cma_dev->xrcd)
424 			ibv_close_xrcd(cma_dev->xrcd);
425 	}
426 	pthread_mutex_unlock(&mut);
427 }
428 
429 static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev)
430 {
431 	struct ibv_xrcd_init_attr attr;
432 
433 	pthread_mutex_lock(&mut);
434 	if (!cma_dev->xrcd) {
435 		memset(&attr, 0, sizeof attr);
436 		attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
437 		attr.fd = -1;
438 		attr.oflags = O_CREAT;
439 		cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr);
440 	}
441 	pthread_mutex_unlock(&mut);
442 	return cma_dev->xrcd;
443 }
444 
445 static void ucma_insert_id(struct cma_id_private *id_priv)
446 {
447 	fastlock_acquire(&idm_lock);
448 	idm_set(&ucma_idm, id_priv->handle, id_priv);
449 	fastlock_release(&idm_lock);
450 }
451 
452 static void ucma_remove_id(struct cma_id_private *id_priv)
453 {
454 	if (id_priv->handle <= IDX_MAX_INDEX)
455 		idm_clear(&ucma_idm, id_priv->handle);
456 }
457 
458 static struct cma_id_private *ucma_lookup_id(int handle)
459 {
460 	return idm_lookup(&ucma_idm, handle);
461 }
462 
463 static void ucma_free_id(struct cma_id_private *id_priv)
464 {
465 	ucma_remove_id(id_priv);
466 	if (id_priv->cma_dev)
467 		ucma_put_device(id_priv->cma_dev);
468 	pthread_cond_destroy(&id_priv->cond);
469 	pthread_mutex_destroy(&id_priv->mut);
470 	if (id_priv->id.route.path_rec)
471 		free(id_priv->id.route.path_rec);
472 
473 	if (id_priv->sync)
474 		rdma_destroy_event_channel(id_priv->id.channel);
475 	if (id_priv->connect_len)
476 		free(id_priv->connect);
477 	free(id_priv);
478 }
479 
480 static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel,
481 					    void *context,
482 					    enum rdma_port_space ps,
483 					    enum ibv_qp_type qp_type)
484 {
485 	struct cma_id_private *id_priv;
486 
487 	id_priv = calloc(1, sizeof(*id_priv));
488 	if (!id_priv)
489 		return NULL;
490 
491 	id_priv->id.context = context;
492 	id_priv->id.ps = ps;
493 	id_priv->id.qp_type = qp_type;
494 	id_priv->handle = 0xFFFFFFFF;
495 
496 	if (!channel) {
497 		id_priv->id.channel = rdma_create_event_channel();
498 		if (!id_priv->id.channel)
499 			goto err;
500 		id_priv->sync = 1;
501 	} else {
502 		id_priv->id.channel = channel;
503 	}
504 
505 	pthread_mutex_init(&id_priv->mut, NULL);
506 	if (pthread_cond_init(&id_priv->cond, NULL))
507 		goto err;
508 
509 	return id_priv;
510 
511 err:	ucma_free_id(id_priv);
512 	return NULL;
513 }
514 
515 static int rdma_create_id2(struct rdma_event_channel *channel,
516 			   struct rdma_cm_id **id, void *context,
517 			   enum rdma_port_space ps, enum ibv_qp_type qp_type)
518 {
519 	struct ucma_abi_create_id_resp resp;
520 	struct ucma_abi_create_id cmd;
521 	struct cma_id_private *id_priv;
522 	int ret;
523 
524 	ret = ucma_init();
525 	if (ret)
526 		return ret;
527 
528 	id_priv = ucma_alloc_id(channel, context, ps, qp_type);
529 	if (!id_priv)
530 		return ERR(ENOMEM);
531 
532 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp);
533 	cmd.uid = (uintptr_t) id_priv;
534 	cmd.ps = ps;
535 	cmd.qp_type = qp_type;
536 
537 	ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
538 	if (ret != sizeof cmd)
539 		goto err;
540 
541 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
542 
543 	id_priv->handle = resp.id;
544 	ucma_insert_id(id_priv);
545 	*id = &id_priv->id;
546 	return 0;
547 
548 err:	ucma_free_id(id_priv);
549 	return ret;
550 }
551 
552 int rdma_create_id(struct rdma_event_channel *channel,
553 		   struct rdma_cm_id **id, void *context,
554 		   enum rdma_port_space ps)
555 {
556 	enum ibv_qp_type qp_type;
557 
558 	qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ?
559 		  IBV_QPT_UD : IBV_QPT_RC;
560 	return rdma_create_id2(channel, id, context, ps, qp_type);
561 }
562 
563 static int ucma_destroy_kern_id(int fd, uint32_t handle)
564 {
565 	struct ucma_abi_destroy_id_resp resp;
566 	struct ucma_abi_destroy_id cmd;
567 	int ret;
568 
569 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp);
570 	cmd.id = handle;
571 
572 	ret = write(fd, &cmd, sizeof cmd);
573 	if (ret != sizeof cmd)
574 		return (ret >= 0) ? ERR(ENODATA) : -1;
575 
576 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
577 
578 	return resp.events_reported;
579 }
580 
581 int rdma_destroy_id(struct rdma_cm_id *id)
582 {
583 	struct cma_id_private *id_priv;
584 	int ret;
585 
586 	id_priv = container_of(id, struct cma_id_private, id);
587 	ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle);
588 	if (ret < 0)
589 		return ret;
590 
591 	if (id_priv->id.event)
592 		rdma_ack_cm_event(id_priv->id.event);
593 
594 	pthread_mutex_lock(&id_priv->mut);
595 	while (id_priv->events_completed < ret)
596 		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
597 	pthread_mutex_unlock(&id_priv->mut);
598 
599 	ucma_free_id(id_priv);
600 	return 0;
601 }
602 
603 int ucma_addrlen(struct sockaddr *addr)
604 {
605 	if (!addr)
606 		return 0;
607 
608 	switch (addr->sa_family) {
609 	case PF_INET:
610 		return sizeof(struct sockaddr_in);
611 	case PF_INET6:
612 		return sizeof(struct sockaddr_in6);
613 	case PF_IB:
614 		return af_ib_support ? sizeof(struct sockaddr_ib) : 0;
615 	default:
616 		return 0;
617 	}
618 }
619 
620 static int ucma_query_addr(struct rdma_cm_id *id)
621 {
622 	struct ucma_abi_query_addr_resp resp;
623 	struct ucma_abi_query cmd;
624 	struct cma_id_private *id_priv;
625 	int ret;
626 
627 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
628 	id_priv = container_of(id, struct cma_id_private, id);
629 	cmd.id = id_priv->handle;
630 	cmd.option = UCMA_QUERY_ADDR;
631 
632 	ret = write(id->channel->fd, &cmd, sizeof cmd);
633 	if (ret != sizeof cmd)
634 		return (ret >= 0) ? ERR(ENODATA) : -1;
635 
636 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
637 
638 	memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size);
639 	memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size);
640 
641 	if (!id_priv->cma_dev && resp.node_guid) {
642 		ret = ucma_get_device(id_priv, resp.node_guid);
643 		if (ret)
644 			return ret;
645 		id->port_num = resp.port_num;
646 		id->route.addr.addr.ibaddr.pkey = resp.pkey;
647 	}
648 
649 	return 0;
650 }
651 
652 static int ucma_query_gid(struct rdma_cm_id *id)
653 {
654 	struct ucma_abi_query_addr_resp resp;
655 	struct ucma_abi_query cmd;
656 	struct cma_id_private *id_priv;
657 	struct sockaddr_ib *sib;
658 	int ret;
659 
660 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
661 	id_priv = container_of(id, struct cma_id_private, id);
662 	cmd.id = id_priv->handle;
663 	cmd.option = UCMA_QUERY_GID;
664 
665 	ret = write(id->channel->fd, &cmd, sizeof cmd);
666 	if (ret != sizeof cmd)
667 		return (ret >= 0) ? ERR(ENODATA) : -1;
668 
669 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
670 
671 	sib = (struct sockaddr_ib *) &resp.src_addr;
672 	memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw,
673 	       sizeof id->route.addr.addr.ibaddr.sgid);
674 
675 	sib = (struct sockaddr_ib *) &resp.dst_addr;
676 	memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw,
677 	       sizeof id->route.addr.addr.ibaddr.dgid);
678 
679 	return 0;
680 }
681 
682 static void ucma_convert_path(struct ibv_path_data *path_data,
683 			      struct ibv_sa_path_rec *sa_path)
684 {
685 	uint32_t fl_hop;
686 
687 	sa_path->dgid = path_data->path.dgid;
688 	sa_path->sgid = path_data->path.sgid;
689 	sa_path->dlid = path_data->path.dlid;
690 	sa_path->slid = path_data->path.slid;
691 	sa_path->raw_traffic = 0;
692 
693 	fl_hop = be32toh(path_data->path.flowlabel_hoplimit);
694 	sa_path->flow_label = htobe32(fl_hop >> 8);
695 	sa_path->hop_limit = (uint8_t) fl_hop;
696 
697 	sa_path->traffic_class = path_data->path.tclass;
698 	sa_path->reversible = path_data->path.reversible_numpath >> 7;
699 	sa_path->numb_path = 1;
700 	sa_path->pkey = path_data->path.pkey;
701 	sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF;
702 	sa_path->mtu_selector = 2;	/* exactly */
703 	sa_path->mtu = path_data->path.mtu & 0x1F;
704 	sa_path->rate_selector = 2;
705 	sa_path->rate = path_data->path.rate & 0x1F;
706 	sa_path->packet_life_time_selector = 2;
707 	sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F;
708 
709 	sa_path->preference = (uint8_t) path_data->flags;
710 }
711 
712 static int ucma_query_path(struct rdma_cm_id *id)
713 {
714 	struct ucma_abi_query_path_resp *resp;
715 	struct ucma_abi_query cmd;
716 	struct cma_id_private *id_priv;
717 	int ret, i, size;
718 
719 	size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6;
720 	resp = alloca(size);
721 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size);
722 	id_priv = container_of(id, struct cma_id_private, id);
723 	cmd.id = id_priv->handle;
724 	cmd.option = UCMA_QUERY_PATH;
725 
726 	ret = write(id->channel->fd, &cmd, sizeof cmd);
727 	if (ret != sizeof cmd)
728 		return (ret >= 0) ? ERR(ENODATA) : -1;
729 
730 	VALGRIND_MAKE_MEM_DEFINED(resp, size);
731 
732 	if (resp->num_paths) {
733 		id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
734 					    resp->num_paths);
735 		if (!id->route.path_rec)
736 			return ERR(ENOMEM);
737 
738 		id->route.num_paths = resp->num_paths;
739 		for (i = 0; i < resp->num_paths; i++)
740 			ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]);
741 	}
742 
743 	return 0;
744 }
745 
746 static int ucma_query_route(struct rdma_cm_id *id)
747 {
748 	struct ucma_abi_query_route_resp resp;
749 	struct ucma_abi_query cmd;
750 	struct cma_id_private *id_priv;
751 	int ret, i;
752 
753 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp);
754 	id_priv = container_of(id, struct cma_id_private, id);
755 	cmd.id = id_priv->handle;
756 
757 	ret = write(id->channel->fd, &cmd, sizeof cmd);
758 	if (ret != sizeof cmd)
759 		return (ret >= 0) ? ERR(ENODATA) : -1;
760 
761 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
762 
763 	if (resp.num_paths) {
764 		id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
765 					    resp.num_paths);
766 		if (!id->route.path_rec)
767 			return ERR(ENOMEM);
768 
769 		id->route.num_paths = resp.num_paths;
770 		for (i = 0; i < resp.num_paths; i++)
771 			ibv_copy_path_rec_from_kern(&id->route.path_rec[i],
772 						    &resp.ib_route[i]);
773 	}
774 
775 	memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid,
776 	       sizeof id->route.addr.addr.ibaddr.sgid);
777 	memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid,
778 	       sizeof id->route.addr.addr.ibaddr.dgid);
779 	id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey;
780 	memcpy(&id->route.addr.src_addr, &resp.src_addr,
781 	       sizeof resp.src_addr);
782 	memcpy(&id->route.addr.dst_addr, &resp.dst_addr,
783 	       sizeof resp.dst_addr);
784 
785 	if (!id_priv->cma_dev && resp.node_guid) {
786 		ret = ucma_get_device(id_priv, resp.node_guid);
787 		if (ret)
788 			return ret;
789 		id_priv->id.port_num = resp.port_num;
790 	}
791 
792 	return 0;
793 }
794 
795 static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr,
796 			   socklen_t addrlen)
797 {
798 	struct ucma_abi_bind cmd;
799 	struct cma_id_private *id_priv;
800 	int ret;
801 
802 	CMA_INIT_CMD(&cmd, sizeof cmd, BIND);
803 	id_priv = container_of(id, struct cma_id_private, id);
804 	cmd.id = id_priv->handle;
805 	cmd.addr_size = addrlen;
806 	memcpy(&cmd.addr, addr, addrlen);
807 
808 	ret = write(id->channel->fd, &cmd, sizeof cmd);
809 	if (ret != sizeof cmd)
810 		return (ret >= 0) ? ERR(ENODATA) : -1;
811 
812 	ret = ucma_query_addr(id);
813 	if (!ret)
814 		ret = ucma_query_gid(id);
815 	return ret;
816 }
817 
818 int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
819 {
820 	struct ucma_abi_bind_ip cmd;
821 	struct cma_id_private *id_priv;
822 	int ret, addrlen;
823 
824 	addrlen = ucma_addrlen(addr);
825 	if (!addrlen)
826 		return ERR(EINVAL);
827 
828 	if (af_ib_support)
829 		return rdma_bind_addr2(id, addr, addrlen);
830 
831 	CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP);
832 	id_priv = container_of(id, struct cma_id_private, id);
833 	cmd.id = id_priv->handle;
834 	memcpy(&cmd.addr, addr, addrlen);
835 
836 	ret = write(id->channel->fd, &cmd, sizeof cmd);
837 	if (ret != sizeof cmd)
838 		return (ret >= 0) ? ERR(ENODATA) : -1;
839 
840 	return ucma_query_route(id);
841 }
842 
/*
 * For an id that owns its event channel (sync mode), wait for the next
 * event and turn its status into a return value.  Ids on a caller-supplied
 * channel return 0 immediately.
 *
 * Any event still attached to the id is acknowledged first; the newly
 * received event stays attached as id->event.
 *
 * Returns 0 when the event carries no error status, the
 * rdma_get_cm_event() error when no event could be read, or an ERR()
 * value otherwise: ECONNREFUSED for RDMA_CM_EVENT_REJECTED, else the
 * magnitude of the status.  Previously a positive status was negated
 * before being passed to ERR(), producing a negative error number.
 */
int ucma_complete(struct rdma_cm_id *id)
{
	struct cma_id_private *id_priv;
	int ret, status;

	id_priv = container_of(id, struct cma_id_private, id);
	if (!id_priv->sync)
		return 0;

	if (id_priv->id.event) {
		rdma_ack_cm_event(id_priv->id.event);
		id_priv->id.event = NULL;
	}

	ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event);
	if (ret)
		return ret;

	status = id_priv->id.event->status;
	if (status) {
		if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED)
			ret = ERR(ECONNREFUSED);
		else if (status < 0)
			ret = ERR(-status);
		else
			ret = ERR(status);
	}
	return ret;
}
871 
872 static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr,
873 			      socklen_t src_len, struct sockaddr *dst_addr,
874 			      socklen_t dst_len, int timeout_ms)
875 {
876 	struct ucma_abi_resolve_addr cmd;
877 	struct cma_id_private *id_priv;
878 	int ret;
879 
880 	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR);
881 	id_priv = container_of(id, struct cma_id_private, id);
882 	cmd.id = id_priv->handle;
883 	if ((cmd.src_size = src_len))
884 		memcpy(&cmd.src_addr, src_addr, src_len);
885 	memcpy(&cmd.dst_addr, dst_addr, dst_len);
886 	cmd.dst_size = dst_len;
887 	cmd.timeout_ms = timeout_ms;
888 
889 	ret = write(id->channel->fd, &cmd, sizeof cmd);
890 	if (ret != sizeof cmd)
891 		return (ret >= 0) ? ERR(ENODATA) : -1;
892 
893 	memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
894 	return ucma_complete(id);
895 }
896 
897 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
898 		      struct sockaddr *dst_addr, int timeout_ms)
899 {
900 	struct ucma_abi_resolve_ip cmd;
901 	struct cma_id_private *id_priv;
902 	int ret, dst_len, src_len;
903 
904 	dst_len = ucma_addrlen(dst_addr);
905 	if (!dst_len)
906 		return ERR(EINVAL);
907 
908 	src_len = ucma_addrlen(src_addr);
909 	if (src_addr && !src_len)
910 		return ERR(EINVAL);
911 
912 	if (af_ib_support)
913 		return rdma_resolve_addr2(id, src_addr, src_len, dst_addr,
914 					  dst_len, timeout_ms);
915 
916 	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP);
917 	id_priv = container_of(id, struct cma_id_private, id);
918 	cmd.id = id_priv->handle;
919 	if (src_addr)
920 		memcpy(&cmd.src_addr, src_addr, src_len);
921 	memcpy(&cmd.dst_addr, dst_addr, dst_len);
922 	cmd.timeout_ms = timeout_ms;
923 
924 	ret = write(id->channel->fd, &cmd, sizeof cmd);
925 	if (ret != sizeof cmd)
926 		return (ret >= 0) ? ERR(ENODATA) : -1;
927 
928 	memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
929 	return ucma_complete(id);
930 }
931 
/*
 * Look up route data for the id's current source and destination
 * addresses with rdma_getaddrinfo() (RAI_ROUTEONLY) and, if any is
 * returned, install it on the id as the RDMA_OPTION_IB_PATH option.
 *
 * Returns 0 on success, the rdma_getaddrinfo() or rdma_set_option()
 * error, or -1 when the lookup produced no route data.
 */
static int ucma_set_ib_route(struct rdma_cm_id *id)
{
	struct rdma_addrinfo hint, *info;
	int rc;

	memset(&hint, 0, sizeof hint);
	hint.ai_flags = RAI_ROUTEONLY;
	hint.ai_family = id->route.addr.src_addr.sa_family;
	hint.ai_src_addr = &id->route.addr.src_addr;
	hint.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr);
	hint.ai_dst_addr = &id->route.addr.dst_addr;
	hint.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr);

	rc = rdma_getaddrinfo(NULL, NULL, &hint, &info);
	if (rc)
		return rc;

	rc = info->ai_route_len ?
	     rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
			     info->ai_route, info->ai_route_len) : -1;

	rdma_freeaddrinfo(info);
	return rc;
}
958 
959 int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
960 {
961 	struct ucma_abi_resolve_route cmd;
962 	struct cma_id_private *id_priv;
963 	int ret;
964 
965 	id_priv = container_of(id, struct cma_id_private, id);
966 	if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) {
967 		ret = ucma_set_ib_route(id);
968 		if (!ret)
969 			goto out;
970 	}
971 
972 	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE);
973 	cmd.id = id_priv->handle;
974 	cmd.timeout_ms = timeout_ms;
975 
976 	ret = write(id->channel->fd, &cmd, sizeof cmd);
977 	if (ret != sizeof cmd)
978 		return (ret >= 0) ? ERR(ENODATA) : -1;
979 
980 out:
981 	return ucma_complete(id);
982 }
983 
984 static int ucma_is_ud_qp(enum ibv_qp_type qp_type)
985 {
986 	return (qp_type == IBV_QPT_UD);
987 }
988 
989 static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr,
990 			     int *qp_attr_mask)
991 {
992 	struct ucma_abi_init_qp_attr cmd;
993 	struct ibv_kern_qp_attr resp;
994 	struct cma_id_private *id_priv;
995 	int ret;
996 
997 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp);
998 	id_priv = container_of(id, struct cma_id_private, id);
999 	cmd.id = id_priv->handle;
1000 	cmd.qp_state = qp_attr->qp_state;
1001 
1002 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1003 	if (ret != sizeof cmd)
1004 		return (ret >= 0) ? ERR(ENODATA) : -1;
1005 
1006 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1007 
1008 	ibv_copy_qp_attr_from_kern(qp_attr, &resp);
1009 	*qp_attr_mask = resp.qp_attr_mask;
1010 	return 0;
1011 }
1012 
1013 static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res)
1014 {
1015 	struct cma_id_private *id_priv;
1016 	struct ibv_qp_attr qp_attr;
1017 	int qp_attr_mask, ret;
1018 	uint8_t link_layer;
1019 
1020 	if (!id->qp)
1021 		return ERR(EINVAL);
1022 
1023 	/* Need to update QP attributes from default values. */
1024 	qp_attr.qp_state = IBV_QPS_INIT;
1025 	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1026 	if (ret)
1027 		return ret;
1028 
1029 	ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
1030 	if (ret)
1031 		return ERR(ret);
1032 
1033 	qp_attr.qp_state = IBV_QPS_RTR;
1034 	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1035 	if (ret)
1036 		return ret;
1037 
1038 	/*
1039 	 * Workaround for rdma_ucm kernel bug:
1040 	 * mask off qp_attr_mask bits 21-24 which are used for RoCE
1041 	 */
1042 	id_priv = container_of(id, struct cma_id_private, id);
1043 	link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer;
1044 
1045 	if (link_layer == IBV_LINK_LAYER_INFINIBAND)
1046 		qp_attr_mask &= UINT_MAX ^ 0xe00000;
1047 
1048 	if (resp_res != RDMA_MAX_RESP_RES)
1049 		qp_attr.max_dest_rd_atomic = resp_res;
1050 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1051 }
1052 
1053 static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth)
1054 {
1055 	struct ibv_qp_attr qp_attr;
1056 	int qp_attr_mask, ret;
1057 
1058 	qp_attr.qp_state = IBV_QPS_RTS;
1059 	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1060 	if (ret)
1061 		return ret;
1062 
1063 	if (init_depth != RDMA_MAX_INIT_DEPTH)
1064 		qp_attr.max_rd_atomic = init_depth;
1065 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1066 }
1067 
1068 static int ucma_modify_qp_sqd(struct rdma_cm_id *id)
1069 {
1070 	struct ibv_qp_attr qp_attr;
1071 
1072 	if (!id->qp)
1073 		return 0;
1074 
1075 	qp_attr.qp_state = IBV_QPS_SQD;
1076 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1077 }
1078 
1079 static int ucma_modify_qp_err(struct rdma_cm_id *id)
1080 {
1081 	struct ibv_qp_attr qp_attr;
1082 
1083 	if (!id->qp)
1084 		return 0;
1085 
1086 	qp_attr.qp_state = IBV_QPS_ERR;
1087 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1088 }
1089 
1090 static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num,
1091 			  __be16 pkey, uint16_t *pkey_index)
1092 {
1093 	int ret, i;
1094 	__be16 chk_pkey;
1095 
1096 	for (i = 0, ret = 0; !ret; i++) {
1097 		ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey);
1098 		if (!ret && pkey == chk_pkey) {
1099 			*pkey_index = (uint16_t) i;
1100 			return 0;
1101 		}
1102 	}
1103 	return ERR(EINVAL);
1104 }
1105 
1106 static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1107 {
1108 	struct ibv_qp_attr qp_attr;
1109 	int ret;
1110 
1111 	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1112 			     id_priv->id.route.addr.addr.ibaddr.pkey,
1113 			     &qp_attr.pkey_index);
1114 	if (ret)
1115 		return ret;
1116 
1117 	qp_attr.port_num = id_priv->id.port_num;
1118 	qp_attr.qp_state = IBV_QPS_INIT;
1119 	qp_attr.qp_access_flags = 0;
1120 
1121 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
1122 					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1123 	return rdma_seterrno(ret);
1124 }
1125 
1126 static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1127 {
1128 	struct ibv_qp_attr qp_attr;
1129 	int qp_attr_mask, ret;
1130 
1131 	if (abi_ver == 3)
1132 		return ucma_init_conn_qp3(id_priv, qp);
1133 
1134 	qp_attr.qp_state = IBV_QPS_INIT;
1135 	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1136 	if (ret)
1137 		return ret;
1138 
1139 	return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask));
1140 }
1141 
1142 static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1143 {
1144 	struct ibv_qp_attr qp_attr;
1145 	int ret;
1146 
1147 	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1148 			     id_priv->id.route.addr.addr.ibaddr.pkey,
1149 			     &qp_attr.pkey_index);
1150 	if (ret)
1151 		return ret;
1152 
1153 	qp_attr.port_num = id_priv->id.port_num;
1154 	qp_attr.qp_state = IBV_QPS_INIT;
1155 	qp_attr.qkey = RDMA_UDP_QKEY;
1156 
1157 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY |
1158 					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1159 	if (ret)
1160 		return ERR(ret);
1161 
1162 	qp_attr.qp_state = IBV_QPS_RTR;
1163 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1164 	if (ret)
1165 		return ERR(ret);
1166 
1167 	qp_attr.qp_state = IBV_QPS_RTS;
1168 	qp_attr.sq_psn = 0;
1169 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1170 	return rdma_seterrno(ret);
1171 }
1172 
1173 static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1174 {
1175 	struct ibv_qp_attr qp_attr;
1176 	int qp_attr_mask, ret;
1177 
1178 	if (abi_ver == 3)
1179 		return ucma_init_ud_qp3(id_priv, qp);
1180 
1181 	qp_attr.qp_state = IBV_QPS_INIT;
1182 	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1183 	if (ret)
1184 		return ret;
1185 
1186 	ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask);
1187 	if (ret)
1188 		return ERR(ret);
1189 
1190 	qp_attr.qp_state = IBV_QPS_RTR;
1191 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1192 	if (ret)
1193 		return ERR(ret);
1194 
1195 	qp_attr.qp_state = IBV_QPS_RTS;
1196 	qp_attr.sq_psn = 0;
1197 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1198 	return rdma_seterrno(ret);
1199 }
1200 
1201 static void ucma_destroy_cqs(struct rdma_cm_id *id)
1202 {
1203 	if (id->qp_type == IBV_QPT_XRC_RECV && id->srq)
1204 		return;
1205 
1206 	if (id->recv_cq) {
1207 		ibv_destroy_cq(id->recv_cq);
1208 		if (id->send_cq && (id->send_cq != id->recv_cq)) {
1209 			ibv_destroy_cq(id->send_cq);
1210 			id->send_cq = NULL;
1211 		}
1212 		id->recv_cq = NULL;
1213 	}
1214 
1215 	if (id->recv_cq_channel) {
1216 		ibv_destroy_comp_channel(id->recv_cq_channel);
1217 		if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) {
1218 			ibv_destroy_comp_channel(id->send_cq_channel);
1219 			id->send_cq_channel = NULL;
1220 		}
1221 		id->recv_cq_channel = NULL;
1222 	}
1223 }
1224 
1225 static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size)
1226 {
1227 	if (recv_size) {
1228 		id->recv_cq_channel = ibv_create_comp_channel(id->verbs);
1229 		if (!id->recv_cq_channel)
1230 			goto err;
1231 
1232 		id->recv_cq = ibv_create_cq(id->verbs, recv_size,
1233 					    id, id->recv_cq_channel, 0);
1234 		if (!id->recv_cq)
1235 			goto err;
1236 	}
1237 
1238 	if (send_size) {
1239 		id->send_cq_channel = ibv_create_comp_channel(id->verbs);
1240 		if (!id->send_cq_channel)
1241 			goto err;
1242 
1243 		id->send_cq = ibv_create_cq(id->verbs, send_size,
1244 					    id, id->send_cq_channel, 0);
1245 		if (!id->send_cq)
1246 			goto err;
1247 	}
1248 
1249 	return 0;
1250 err:
1251 	ucma_destroy_cqs(id);
1252 	return ERR(ENOMEM);
1253 }
1254 
1255 int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr)
1256 {
1257 	struct cma_id_private *id_priv;
1258 	struct ibv_srq *srq;
1259 	int ret;
1260 
1261 	id_priv = container_of(id, struct cma_id_private, id);
1262 	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE))
1263 		return ERR(EINVAL);
1264 
1265 	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) {
1266 		attr->pd = id->pd;
1267 		attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD;
1268 	}
1269 
1270 	if (attr->srq_type == IBV_SRQT_XRC) {
1271 		if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) {
1272 			attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1273 			if (!attr->xrcd)
1274 				return -1;
1275 		}
1276 		if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) {
1277 			ret = ucma_create_cqs(id, 0, attr->attr.max_wr);
1278 			if (ret)
1279 				return ret;
1280 			attr->cq = id->recv_cq;
1281 		}
1282 		attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ;
1283 	}
1284 
1285 	srq = ibv_create_srq_ex(id->verbs, attr);
1286 	if (!srq) {
1287 		ret = -1;
1288 		goto err;
1289 	}
1290 
1291 	if (!id->pd)
1292 		id->pd = attr->pd;
1293 	id->srq = srq;
1294 	return 0;
1295 err:
1296 	ucma_destroy_cqs(id);
1297 	return ret;
1298 }
1299 
1300 int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd,
1301 		    struct ibv_srq_init_attr *attr)
1302 {
1303 	struct ibv_srq_init_attr_ex attr_ex;
1304 	int ret;
1305 
1306 	memcpy(&attr_ex, attr, sizeof(*attr));
1307 	attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD;
1308 	if (id->qp_type == IBV_QPT_XRC_RECV) {
1309 		attr_ex.srq_type = IBV_SRQT_XRC;
1310 	} else {
1311 		attr_ex.srq_type = IBV_SRQT_BASIC;
1312 	}
1313 	attr_ex.pd = pd;
1314 	ret = rdma_create_srq_ex(id, &attr_ex);
1315 	memcpy(attr, &attr_ex, sizeof(*attr));
1316 	return ret;
1317 }
1318 
1319 void rdma_destroy_srq(struct rdma_cm_id *id)
1320 {
1321 	ibv_destroy_srq(id->srq);
1322 	id->srq = NULL;
1323 	ucma_destroy_cqs(id);
1324 }
1325 
1326 int rdma_create_qp_ex(struct rdma_cm_id *id,
1327 		      struct ibv_qp_init_attr_ex *attr)
1328 {
1329 	struct cma_id_private *id_priv;
1330 	struct ibv_qp *qp;
1331 	int ret;
1332 
1333 	if (id->qp)
1334 		return ERR(EINVAL);
1335 
1336 	id_priv = container_of(id, struct cma_id_private, id);
1337 	if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) {
1338 		attr->comp_mask |= IBV_QP_INIT_ATTR_PD;
1339 		attr->pd = id->pd;
1340 	} else if (id->verbs != attr->pd->context)
1341 		return ERR(EINVAL);
1342 
1343 	if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) ||
1344 	    (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq))
1345 		return ERR(EINVAL);
1346 
1347 	if (id->qp_type == IBV_QPT_XRC_RECV) {
1348 		if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) {
1349 			attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1350 			if (!attr->xrcd)
1351 				return -1;
1352 			attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD;
1353 		}
1354 	}
1355 
1356 	ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr,
1357 				  attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr);
1358 	if (ret)
1359 		return ret;
1360 
1361 	if (!attr->send_cq)
1362 		attr->send_cq = id->send_cq;
1363 	if (!attr->recv_cq)
1364 		attr->recv_cq = id->recv_cq;
1365 	if (id->srq && !attr->srq)
1366 		attr->srq = id->srq;
1367 	qp = ibv_create_qp_ex(id->verbs, attr);
1368 	if (!qp) {
1369 		ret = ERR(ENOMEM);
1370 		goto err1;
1371 	}
1372 
1373 	if (ucma_is_ud_qp(id->qp_type))
1374 		ret = ucma_init_ud_qp(id_priv, qp);
1375 	else
1376 		ret = ucma_init_conn_qp(id_priv, qp);
1377 	if (ret)
1378 		goto err2;
1379 
1380 	id->pd = qp->pd;
1381 	id->qp = qp;
1382 	return 0;
1383 err2:
1384 	ibv_destroy_qp(qp);
1385 err1:
1386 	ucma_destroy_cqs(id);
1387 	return ret;
1388 }
1389 
1390 int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd,
1391 		   struct ibv_qp_init_attr *qp_init_attr)
1392 {
1393 	struct ibv_qp_init_attr_ex attr_ex;
1394 	int ret;
1395 
1396 	memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr));
1397 	attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
1398 	attr_ex.pd = pd ? pd : id->pd;
1399 	ret = rdma_create_qp_ex(id, &attr_ex);
1400 	memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr));
1401 	return ret;
1402 }
1403 
1404 void rdma_destroy_qp(struct rdma_cm_id *id)
1405 {
1406 	ibv_destroy_qp(id->qp);
1407 	id->qp = NULL;
1408 	ucma_destroy_cqs(id);
1409 }
1410 
1411 static int ucma_valid_param(struct cma_id_private *id_priv,
1412 			    struct rdma_conn_param *param)
1413 {
1414 	if (id_priv->id.ps != RDMA_PS_TCP)
1415 		return 0;
1416 
1417 	if (!id_priv->id.qp && !param)
1418 		goto err;
1419 
1420 	if (!param)
1421 		return 0;
1422 
1423 	if ((param->responder_resources != RDMA_MAX_RESP_RES) &&
1424 	    (param->responder_resources > id_priv->cma_dev->max_responder_resources))
1425 		goto err;
1426 
1427 	if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) &&
1428 	    (param->initiator_depth > id_priv->cma_dev->max_initiator_depth))
1429 		goto err;
1430 
1431 	return 0;
1432 err:
1433 	return ERR(EINVAL);
1434 }
1435 
1436 static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv,
1437 					 struct ucma_abi_conn_param *dst,
1438 					 struct rdma_conn_param *src,
1439 					 uint32_t qp_num, uint8_t srq)
1440 {
1441 	dst->qp_num = qp_num;
1442 	dst->srq = srq;
1443 	dst->responder_resources = id_priv->responder_resources;
1444 	dst->initiator_depth = id_priv->initiator_depth;
1445 	dst->valid = 1;
1446 
1447 	if (id_priv->connect_len) {
1448 		memcpy(dst->private_data, id_priv->connect, id_priv->connect_len);
1449 		dst->private_data_len = id_priv->connect_len;
1450 	}
1451 
1452 	if (src) {
1453 		dst->flow_control = src->flow_control;
1454 		dst->retry_count = src->retry_count;
1455 		dst->rnr_retry_count = src->rnr_retry_count;
1456 
1457 		if (src->private_data && src->private_data_len) {
1458 			memcpy(dst->private_data + dst->private_data_len,
1459 			       src->private_data, src->private_data_len);
1460 			dst->private_data_len += src->private_data_len;
1461 		}
1462 	} else {
1463 		dst->retry_count = 7;
1464 		dst->rnr_retry_count = 7;
1465 	}
1466 }
1467 
1468 int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1469 {
1470 	struct ucma_abi_connect cmd;
1471 	struct cma_id_private *id_priv;
1472 	int ret;
1473 
1474 	id_priv = container_of(id, struct cma_id_private, id);
1475 	ret = ucma_valid_param(id_priv, conn_param);
1476 	if (ret)
1477 		return ret;
1478 
1479 	if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH)
1480 		id_priv->initiator_depth = conn_param->initiator_depth;
1481 	else
1482 		id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth;
1483 	if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES)
1484 		id_priv->responder_resources = conn_param->responder_resources;
1485 	else
1486 		id_priv->responder_resources = id_priv->cma_dev->max_responder_resources;
1487 
1488 	CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT);
1489 	cmd.id = id_priv->handle;
1490 	if (id->qp) {
1491 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1492 					     conn_param, id->qp->qp_num,
1493 					     (id->qp->srq != NULL));
1494 	} else if (conn_param) {
1495 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1496 					     conn_param, conn_param->qp_num,
1497 					     conn_param->srq);
1498 	} else {
1499 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1500 					     conn_param, 0, 0);
1501 	}
1502 
1503 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1504 	if (ret != sizeof cmd)
1505 		return (ret >= 0) ? ERR(ENODATA) : -1;
1506 
1507 	if (id_priv->connect_len) {
1508 		free(id_priv->connect);
1509 		id_priv->connect_len = 0;
1510 	}
1511 
1512 	return ucma_complete(id);
1513 }
1514 
1515 int rdma_listen(struct rdma_cm_id *id, int backlog)
1516 {
1517 	struct ucma_abi_listen cmd;
1518 	struct cma_id_private *id_priv;
1519 	int ret;
1520 
1521 	CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN);
1522 	id_priv = container_of(id, struct cma_id_private, id);
1523 	cmd.id = id_priv->handle;
1524 	cmd.backlog = backlog;
1525 
1526 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1527 	if (ret != sizeof cmd)
1528 		return (ret >= 0) ? ERR(ENODATA) : -1;
1529 
1530 	if (af_ib_support)
1531 		return ucma_query_addr(id);
1532 	else
1533 		return ucma_query_route(id);
1534 }
1535 
1536 int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id)
1537 {
1538 	struct cma_id_private *id_priv;
1539 	struct rdma_cm_event *event;
1540 	int ret;
1541 
1542 	id_priv = container_of(listen, struct cma_id_private, id);
1543 	if (!id_priv->sync)
1544 		return ERR(EINVAL);
1545 
1546 	if (listen->event) {
1547 		rdma_ack_cm_event(listen->event);
1548 		listen->event = NULL;
1549 	}
1550 
1551 	ret = rdma_get_cm_event(listen->channel, &event);
1552 	if (ret)
1553 		return ret;
1554 
1555 	if (event->status) {
1556 		ret = ERR(event->status);
1557 		goto err;
1558 	}
1559 
1560 	if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
1561 		ret = ERR(EINVAL);
1562 		goto err;
1563 	}
1564 
1565 	if (id_priv->qp_init_attr) {
1566 		struct ibv_qp_init_attr attr;
1567 
1568 		attr = *id_priv->qp_init_attr;
1569 		ret = rdma_create_qp(event->id, listen->pd, &attr);
1570 		if (ret)
1571 			goto err;
1572 	}
1573 
1574 	*id = event->id;
1575 	(*id)->event = event;
1576 	return 0;
1577 
1578 err:
1579 	listen->event = event;
1580 	return ret;
1581 }
1582 
1583 int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1584 {
1585 	struct ucma_abi_accept cmd;
1586 	struct cma_id_private *id_priv;
1587 	int ret;
1588 
1589 	id_priv = container_of(id, struct cma_id_private, id);
1590 	ret = ucma_valid_param(id_priv, conn_param);
1591 	if (ret)
1592 		return ret;
1593 
1594 	if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) {
1595 		id_priv->initiator_depth = min(id_priv->initiator_depth,
1596 					       id_priv->cma_dev->max_initiator_depth);
1597 	} else {
1598 		id_priv->initiator_depth = conn_param->initiator_depth;
1599 	}
1600 	if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) {
1601 		id_priv->responder_resources = min(id_priv->responder_resources,
1602 						   id_priv->cma_dev->max_responder_resources);
1603 	} else {
1604 		id_priv->responder_resources = conn_param->responder_resources;
1605 	}
1606 
1607 	if (!ucma_is_ud_qp(id->qp_type)) {
1608 		ret = ucma_modify_qp_rtr(id, id_priv->responder_resources);
1609 		if (ret)
1610 			return ret;
1611 
1612 		ret = ucma_modify_qp_rts(id, id_priv->initiator_depth);
1613 		if (ret)
1614 			return ret;
1615 	}
1616 
1617 	CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1618 	cmd.id = id_priv->handle;
1619 	cmd.uid = (uintptr_t) id_priv;
1620 	if (id->qp)
1621 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1622 					     conn_param, id->qp->qp_num,
1623 					     (id->qp->srq != NULL));
1624 	else
1625 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1626 					     conn_param, conn_param->qp_num,
1627 					     conn_param->srq);
1628 
1629 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1630 	if (ret != sizeof cmd) {
1631 		ucma_modify_qp_err(id);
1632 		return (ret >= 0) ? ERR(ENODATA) : -1;
1633 	}
1634 
1635 	if (ucma_is_ud_qp(id->qp_type))
1636 		return 0;
1637 
1638 	return ucma_complete(id);
1639 }
1640 
1641 int rdma_reject(struct rdma_cm_id *id, const void *private_data,
1642 		uint8_t private_data_len)
1643 {
1644 	struct ucma_abi_reject cmd;
1645 	struct cma_id_private *id_priv;
1646 	int ret;
1647 
1648 	CMA_INIT_CMD(&cmd, sizeof cmd, REJECT);
1649 
1650 	id_priv = container_of(id, struct cma_id_private, id);
1651 	cmd.id = id_priv->handle;
1652 	if (private_data && private_data_len) {
1653 		memcpy(cmd.private_data, private_data, private_data_len);
1654 		cmd.private_data_len = private_data_len;
1655 	}
1656 
1657 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1658 	if (ret != sizeof cmd)
1659 		return (ret >= 0) ? ERR(ENODATA) : -1;
1660 
1661 	return 0;
1662 }
1663 
1664 int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event)
1665 {
1666 	struct ucma_abi_notify cmd;
1667 	struct cma_id_private *id_priv;
1668 	int ret;
1669 
1670 	CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY);
1671 
1672 	id_priv = container_of(id, struct cma_id_private, id);
1673 	cmd.id = id_priv->handle;
1674 	cmd.event = event;
1675 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1676 	if (ret != sizeof cmd)
1677 		return (ret >= 0) ? ERR(ENODATA) : -1;
1678 
1679 	return 0;
1680 }
1681 
1682 int ucma_shutdown(struct rdma_cm_id *id)
1683 {
1684 	switch (id->verbs->device->transport_type) {
1685 	case IBV_TRANSPORT_IB:
1686 		return ucma_modify_qp_err(id);
1687 	case IBV_TRANSPORT_IWARP:
1688 		return ucma_modify_qp_sqd(id);
1689 	default:
1690 		return ERR(EINVAL);
1691 	}
1692 }
1693 
1694 int rdma_disconnect(struct rdma_cm_id *id)
1695 {
1696 	struct ucma_abi_disconnect cmd;
1697 	struct cma_id_private *id_priv;
1698 	int ret;
1699 
1700 	ret = ucma_shutdown(id);
1701 	if (ret)
1702 		return ret;
1703 
1704 	CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT);
1705 	id_priv = container_of(id, struct cma_id_private, id);
1706 	cmd.id = id_priv->handle;
1707 
1708 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1709 	if (ret != sizeof cmd)
1710 		return (ret >= 0) ? ERR(ENODATA) : -1;
1711 
1712 	return ucma_complete(id);
1713 }
1714 
1715 static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr,
1716 				socklen_t addrlen, void *context)
1717 {
1718 	struct ucma_abi_create_id_resp resp;
1719 	struct cma_id_private *id_priv;
1720 	struct cma_multicast *mc, **pos;
1721 	int ret;
1722 
1723 	id_priv = container_of(id, struct cma_id_private, id);
1724 	mc = calloc(1, sizeof(*mc));
1725 	if (!mc)
1726 		return ERR(ENOMEM);
1727 
1728 	mc->context = context;
1729 	mc->id_priv = id_priv;
1730 	memcpy(&mc->addr, addr, addrlen);
1731 	if (pthread_cond_init(&mc->cond, NULL)) {
1732 		ret = -1;
1733 		goto err1;
1734 	}
1735 
1736 	pthread_mutex_lock(&id_priv->mut);
1737 	mc->next = id_priv->mc_list;
1738 	id_priv->mc_list = mc;
1739 	pthread_mutex_unlock(&id_priv->mut);
1740 
1741 	if (af_ib_support) {
1742 		struct ucma_abi_join_mcast cmd;
1743 
1744 		CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp);
1745 		cmd.id = id_priv->handle;
1746 		memcpy(&cmd.addr, addr, addrlen);
1747 		cmd.addr_size = addrlen;
1748 		cmd.uid = (uintptr_t) mc;
1749 		cmd.reserved = 0;
1750 
1751 		ret = write(id->channel->fd, &cmd, sizeof cmd);
1752 		if (ret != sizeof cmd) {
1753 			ret = (ret >= 0) ? ERR(ENODATA) : -1;
1754 			goto err2;
1755 		}
1756 	} else {
1757 		struct ucma_abi_join_ip_mcast cmd;
1758 
1759 		CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp);
1760 		cmd.id = id_priv->handle;
1761 		memcpy(&cmd.addr, addr, addrlen);
1762 		cmd.uid = (uintptr_t) mc;
1763 
1764 		ret = write(id->channel->fd, &cmd, sizeof cmd);
1765 		if (ret != sizeof cmd) {
1766 			ret = (ret >= 0) ? ERR(ENODATA) : -1;
1767 			goto err2;
1768 		}
1769 	}
1770 
1771 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1772 
1773 	mc->handle = resp.id;
1774 	return ucma_complete(id);
1775 
1776 err2:
1777 	pthread_mutex_lock(&id_priv->mut);
1778 	for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next)
1779 		;
1780 	*pos = mc->next;
1781 	pthread_mutex_unlock(&id_priv->mut);
1782 err1:
1783 	free(mc);
1784 	return ret;
1785 }
1786 
/*
 * Join the multicast group @addr on @id.  The address length is obtained
 * from ucma_addrlen(); a length of 0 is rejected with ERR(EINVAL),
 * otherwise the request goes to rdma_join_multicast2().
 */
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
			void *context)
{
	int len = ucma_addrlen(addr);

	if (len == 0)
		return ERR(EINVAL);

	return rdma_join_multicast2(id, addr, len, context);
}
1798 
1799 int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
1800 {
1801 	struct ucma_abi_destroy_id cmd;
1802 	struct ucma_abi_destroy_id_resp resp;
1803 	struct cma_id_private *id_priv;
1804 	struct cma_multicast *mc, **pos;
1805 	int ret, addrlen;
1806 
1807 	addrlen = ucma_addrlen(addr);
1808 	if (!addrlen)
1809 		return ERR(EINVAL);
1810 
1811 	id_priv = container_of(id, struct cma_id_private, id);
1812 	pthread_mutex_lock(&id_priv->mut);
1813 	for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next)
1814 		if (!memcmp(&(*pos)->addr, addr, addrlen))
1815 			break;
1816 
1817 	mc = *pos;
1818 	if (*pos)
1819 		*pos = mc->next;
1820 	pthread_mutex_unlock(&id_priv->mut);
1821 	if (!mc)
1822 		return ERR(EADDRNOTAVAIL);
1823 
1824 	if (id->qp)
1825 		ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid);
1826 
1827 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp);
1828 	cmd.id = mc->handle;
1829 
1830 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1831 	if (ret != sizeof cmd) {
1832 		ret = (ret >= 0) ? ERR(ENODATA) : -1;
1833 		goto free;
1834 	}
1835 
1836 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1837 
1838 	pthread_mutex_lock(&id_priv->mut);
1839 	while (mc->events_completed < resp.events_reported)
1840 		pthread_cond_wait(&mc->cond, &id_priv->mut);
1841 	pthread_mutex_unlock(&id_priv->mut);
1842 
1843 	ret = 0;
1844 free:
1845 	free(mc);
1846 	return ret;
1847 }
1848 
1849 static void ucma_complete_event(struct cma_id_private *id_priv)
1850 {
1851 	pthread_mutex_lock(&id_priv->mut);
1852 	id_priv->events_completed++;
1853 	pthread_cond_signal(&id_priv->cond);
1854 	pthread_mutex_unlock(&id_priv->mut);
1855 }
1856 
1857 static void ucma_complete_mc_event(struct cma_multicast *mc)
1858 {
1859 	pthread_mutex_lock(&mc->id_priv->mut);
1860 	mc->events_completed++;
1861 	pthread_cond_signal(&mc->cond);
1862 	mc->id_priv->events_completed++;
1863 	pthread_cond_signal(&mc->id_priv->cond);
1864 	pthread_mutex_unlock(&mc->id_priv->mut);
1865 }
1866 
1867 int rdma_ack_cm_event(struct rdma_cm_event *event)
1868 {
1869 	struct cma_event *evt;
1870 
1871 	if (!event)
1872 		return ERR(EINVAL);
1873 
1874 	evt = container_of(event, struct cma_event, event);
1875 
1876 	if (evt->mc)
1877 		ucma_complete_mc_event(evt->mc);
1878 	else
1879 		ucma_complete_event(evt->id_priv);
1880 	free(evt);
1881 	return 0;
1882 }
1883 
1884 static void ucma_process_addr_resolved(struct cma_event *evt)
1885 {
1886 	if (af_ib_support) {
1887 		evt->event.status = ucma_query_addr(&evt->id_priv->id);
1888 		if (!evt->event.status &&
1889 		    evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB)
1890 			evt->event.status = ucma_query_gid(&evt->id_priv->id);
1891 	} else {
1892 		evt->event.status = ucma_query_route(&evt->id_priv->id);
1893 	}
1894 
1895 	if (evt->event.status)
1896 		evt->event.event = RDMA_CM_EVENT_ADDR_ERROR;
1897 }
1898 
1899 static void ucma_process_route_resolved(struct cma_event *evt)
1900 {
1901 	if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB)
1902 		return;
1903 
1904 	if (af_ib_support)
1905 		evt->event.status = ucma_query_path(&evt->id_priv->id);
1906 	else
1907 		evt->event.status = ucma_query_route(&evt->id_priv->id);
1908 
1909 	if (evt->event.status)
1910 		evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
1911 }
1912 
1913 static int ucma_query_req_info(struct rdma_cm_id *id)
1914 {
1915 	int ret;
1916 
1917 	if (!af_ib_support)
1918 		return ucma_query_route(id);
1919 
1920 	ret = ucma_query_addr(id);
1921 	if (ret)
1922 		return ret;
1923 
1924 	ret = ucma_query_gid(id);
1925 	if (ret)
1926 		return ret;
1927 
1928 	ret = ucma_query_path(id);
1929 	if (ret)
1930 		return ret;
1931 
1932 	return 0;
1933 }
1934 
1935 static int ucma_process_conn_req(struct cma_event *evt,
1936 				 uint32_t handle)
1937 {
1938 	struct cma_id_private *id_priv;
1939 	int ret;
1940 
1941 	id_priv = ucma_alloc_id(evt->id_priv->id.channel,
1942 				evt->id_priv->id.context, evt->id_priv->id.ps,
1943 				evt->id_priv->id.qp_type);
1944 	if (!id_priv) {
1945 		ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle);
1946 		ret = ERR(ENOMEM);
1947 		goto err1;
1948 	}
1949 
1950 	evt->event.listen_id = &evt->id_priv->id;
1951 	evt->event.id = &id_priv->id;
1952 	id_priv->handle = handle;
1953 	ucma_insert_id(id_priv);
1954 	id_priv->initiator_depth = evt->event.param.conn.initiator_depth;
1955 	id_priv->responder_resources = evt->event.param.conn.responder_resources;
1956 
1957 	if (evt->id_priv->sync) {
1958 		ret = rdma_migrate_id(&id_priv->id, NULL);
1959 		if (ret)
1960 			goto err2;
1961 	}
1962 
1963 	ret = ucma_query_req_info(&id_priv->id);
1964 	if (ret)
1965 		goto err2;
1966 
1967 	return 0;
1968 
1969 err2:
1970 	rdma_destroy_id(&id_priv->id);
1971 err1:
1972 	ucma_complete_event(evt->id_priv);
1973 	return ret;
1974 }
1975 
1976 static int ucma_process_conn_resp(struct cma_id_private *id_priv)
1977 {
1978 	struct ucma_abi_accept cmd;
1979 	int ret;
1980 
1981 	ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES);
1982 	if (ret)
1983 		goto err;
1984 
1985 	ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH);
1986 	if (ret)
1987 		goto err;
1988 
1989 	CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1990 	cmd.id = id_priv->handle;
1991 
1992 	ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
1993 	if (ret != sizeof cmd) {
1994 		ret = (ret >= 0) ? ERR(ENODATA) : -1;
1995 		goto err;
1996 	}
1997 
1998 	return 0;
1999 err:
2000 	ucma_modify_qp_err(&id_priv->id);
2001 	return ret;
2002 }
2003 
2004 static int ucma_process_join(struct cma_event *evt)
2005 {
2006 	evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid;
2007 	evt->mc->mlid = evt->event.param.ud.ah_attr.dlid;
2008 
2009 	if (!evt->id_priv->id.qp)
2010 		return 0;
2011 
2012 	return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp,
2013 					      &evt->mc->mgid, evt->mc->mlid));
2014 }
2015 
2016 static void ucma_copy_conn_event(struct cma_event *event,
2017 				 struct ucma_abi_conn_param *src)
2018 {
2019 	struct rdma_conn_param *dst = &event->event.param.conn;
2020 
2021 	dst->private_data_len = src->private_data_len;
2022 	if (src->private_data_len) {
2023 		dst->private_data = &event->private_data;
2024 		memcpy(&event->private_data, src->private_data,
2025 		       src->private_data_len);
2026 	}
2027 
2028 	dst->responder_resources = src->responder_resources;
2029 	dst->initiator_depth = src->initiator_depth;
2030 	dst->flow_control = src->flow_control;
2031 	dst->retry_count = src->retry_count;
2032 	dst->rnr_retry_count = src->rnr_retry_count;
2033 	dst->srq = src->srq;
2034 	dst->qp_num = src->qp_num;
2035 }
2036 
2037 static void ucma_copy_ud_event(struct cma_event *event,
2038 			       struct ucma_abi_ud_param *src)
2039 {
2040 	struct rdma_ud_param *dst = &event->event.param.ud;
2041 
2042 	dst->private_data_len = src->private_data_len;
2043 	if (src->private_data_len) {
2044 		dst->private_data = &event->private_data;
2045 		memcpy(&event->private_data, src->private_data,
2046 		       src->private_data_len);
2047 	}
2048 
2049 	ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr);
2050 	dst->qp_num = src->qp_num;
2051 	dst->qkey = src->qkey;
2052 }
2053 
2054 int rdma_get_cm_event(struct rdma_event_channel *channel,
2055 		      struct rdma_cm_event **event)
2056 {
2057 	struct ucma_abi_event_resp resp;
2058 	struct ucma_abi_get_event cmd;
2059 	struct cma_event *evt;
2060 	int ret;
2061 
2062 	ret = ucma_init();
2063 	if (ret)
2064 		return ret;
2065 
2066 	if (!event)
2067 		return ERR(EINVAL);
2068 
2069 	evt = malloc(sizeof(*evt));
2070 	if (!evt)
2071 		return ERR(ENOMEM);
2072 
2073 retry:
2074 	memset(evt, 0, sizeof(*evt));
2075 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp);
2076 	ret = write(channel->fd, &cmd, sizeof cmd);
2077 	if (ret != sizeof cmd) {
2078 		free(evt);
2079 		return (ret >= 0) ? ERR(ENODATA) : -1;
2080 	}
2081 
2082 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
2083 
2084 	evt->event.event = resp.event;
2085 	/*
2086 	 * We should have a non-zero uid, except for connection requests.
2087 	 * But a bug in older kernels can report a uid 0.  Work-around this
2088 	 * issue by looking up the cma_id based on the kernel's id when the
2089 	 * uid is 0 and we're processing a connection established event.
2090 	 * In all other cases, if the uid is 0, we discard the event, like
2091 	 * the kernel should have done.
2092 	 */
2093 	if (resp.uid) {
2094 		evt->id_priv = (void *) (uintptr_t) resp.uid;
2095 	} else {
2096 		evt->id_priv = ucma_lookup_id(resp.id);
2097 		if (!evt->id_priv) {
2098 			syslog(LOG_WARNING, PFX "Warning: discarding unmatched "
2099 				"event - rdma_destroy_id may hang.\n");
2100 			goto retry;
2101 		}
2102 		if (resp.event != RDMA_CM_EVENT_ESTABLISHED) {
2103 			ucma_complete_event(evt->id_priv);
2104 			goto retry;
2105 		}
2106 	}
2107 	evt->event.id = &evt->id_priv->id;
2108 	evt->event.status = resp.status;
2109 
2110 	switch (resp.event) {
2111 	case RDMA_CM_EVENT_ADDR_RESOLVED:
2112 		ucma_process_addr_resolved(evt);
2113 		break;
2114 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
2115 		ucma_process_route_resolved(evt);
2116 		break;
2117 	case RDMA_CM_EVENT_CONNECT_REQUEST:
2118 		evt->id_priv = (void *) (uintptr_t) resp.uid;
2119 		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
2120 			ucma_copy_ud_event(evt, &resp.param.ud);
2121 		else
2122 			ucma_copy_conn_event(evt, &resp.param.conn);
2123 
2124 		ret = ucma_process_conn_req(evt, resp.id);
2125 		if (ret)
2126 			goto retry;
2127 		break;
2128 	case RDMA_CM_EVENT_CONNECT_RESPONSE:
2129 		ucma_copy_conn_event(evt, &resp.param.conn);
2130 		evt->event.status = ucma_process_conn_resp(evt->id_priv);
2131 		if (!evt->event.status)
2132 			evt->event.event = RDMA_CM_EVENT_ESTABLISHED;
2133 		else {
2134 			evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
2135 			evt->id_priv->connect_error = 1;
2136 		}
2137 		break;
2138 	case RDMA_CM_EVENT_ESTABLISHED:
2139 		if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) {
2140 			ucma_copy_ud_event(evt, &resp.param.ud);
2141 			break;
2142 		}
2143 
2144 		ucma_copy_conn_event(evt, &resp.param.conn);
2145 		break;
2146 	case RDMA_CM_EVENT_REJECTED:
2147 		if (evt->id_priv->connect_error) {
2148 			ucma_complete_event(evt->id_priv);
2149 			goto retry;
2150 		}
2151 		ucma_copy_conn_event(evt, &resp.param.conn);
2152 		ucma_modify_qp_err(evt->event.id);
2153 		break;
2154 	case RDMA_CM_EVENT_DISCONNECTED:
2155 		if (evt->id_priv->connect_error) {
2156 			ucma_complete_event(evt->id_priv);
2157 			goto retry;
2158 		}
2159 		ucma_copy_conn_event(evt, &resp.param.conn);
2160 		break;
2161 	case RDMA_CM_EVENT_MULTICAST_JOIN:
2162 		evt->mc = (void *) (uintptr_t) resp.uid;
2163 		evt->id_priv = evt->mc->id_priv;
2164 		evt->event.id = &evt->id_priv->id;
2165 		ucma_copy_ud_event(evt, &resp.param.ud);
2166 		evt->event.param.ud.private_data = evt->mc->context;
2167 		evt->event.status = ucma_process_join(evt);
2168 		if (evt->event.status)
2169 			evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
2170 		break;
2171 	case RDMA_CM_EVENT_MULTICAST_ERROR:
2172 		evt->mc = (void *) (uintptr_t) resp.uid;
2173 		evt->id_priv = evt->mc->id_priv;
2174 		evt->event.id = &evt->id_priv->id;
2175 		evt->event.param.ud.private_data = evt->mc->context;
2176 		break;
2177 	default:
2178 		evt->id_priv = (void *) (uintptr_t) resp.uid;
2179 		evt->event.id = &evt->id_priv->id;
2180 		evt->event.status = resp.status;
2181 		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
2182 			ucma_copy_ud_event(evt, &resp.param.ud);
2183 		else
2184 			ucma_copy_conn_event(evt, &resp.param.conn);
2185 		break;
2186 	}
2187 
2188 	*event = &evt->event;
2189 	return 0;
2190 }
2191 
2192 const char *rdma_event_str(enum rdma_cm_event_type event)
2193 {
2194 	switch (event) {
2195 	case RDMA_CM_EVENT_ADDR_RESOLVED:
2196 		return "RDMA_CM_EVENT_ADDR_RESOLVED";
2197 	case RDMA_CM_EVENT_ADDR_ERROR:
2198 		return "RDMA_CM_EVENT_ADDR_ERROR";
2199 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
2200 		return "RDMA_CM_EVENT_ROUTE_RESOLVED";
2201 	case RDMA_CM_EVENT_ROUTE_ERROR:
2202 		return "RDMA_CM_EVENT_ROUTE_ERROR";
2203 	case RDMA_CM_EVENT_CONNECT_REQUEST:
2204 		return "RDMA_CM_EVENT_CONNECT_REQUEST";
2205 	case RDMA_CM_EVENT_CONNECT_RESPONSE:
2206 		return "RDMA_CM_EVENT_CONNECT_RESPONSE";
2207 	case RDMA_CM_EVENT_CONNECT_ERROR:
2208 		return "RDMA_CM_EVENT_CONNECT_ERROR";
2209 	case RDMA_CM_EVENT_UNREACHABLE:
2210 		return "RDMA_CM_EVENT_UNREACHABLE";
2211 	case RDMA_CM_EVENT_REJECTED:
2212 		return "RDMA_CM_EVENT_REJECTED";
2213 	case RDMA_CM_EVENT_ESTABLISHED:
2214 		return "RDMA_CM_EVENT_ESTABLISHED";
2215 	case RDMA_CM_EVENT_DISCONNECTED:
2216 		return "RDMA_CM_EVENT_DISCONNECTED";
2217 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
2218 		return "RDMA_CM_EVENT_DEVICE_REMOVAL";
2219 	case RDMA_CM_EVENT_MULTICAST_JOIN:
2220 		return "RDMA_CM_EVENT_MULTICAST_JOIN";
2221 	case RDMA_CM_EVENT_MULTICAST_ERROR:
2222 		return "RDMA_CM_EVENT_MULTICAST_ERROR";
2223 	case RDMA_CM_EVENT_ADDR_CHANGE:
2224 		return "RDMA_CM_EVENT_ADDR_CHANGE";
2225 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
2226 		return "RDMA_CM_EVENT_TIMEWAIT_EXIT";
2227 	default:
2228 		return "UNKNOWN EVENT";
2229 	}
2230 }
2231 
2232 int rdma_set_option(struct rdma_cm_id *id, int level, int optname,
2233 		    void *optval, size_t optlen)
2234 {
2235 	struct ucma_abi_set_option cmd;
2236 	struct cma_id_private *id_priv;
2237 	int ret;
2238 
2239 	CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION);
2240 	id_priv = container_of(id, struct cma_id_private, id);
2241 	cmd.id = id_priv->handle;
2242 	cmd.optval = (uintptr_t) optval;
2243 	cmd.level = level;
2244 	cmd.optname = optname;
2245 	cmd.optlen = optlen;
2246 
2247 	ret = write(id->channel->fd, &cmd, sizeof cmd);
2248 	if (ret != sizeof cmd)
2249 		return (ret >= 0) ? ERR(ENODATA) : -1;
2250 
2251 	return 0;
2252 }
2253 
2254 int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
2255 {
2256 	struct ucma_abi_migrate_resp resp;
2257 	struct ucma_abi_migrate_id cmd;
2258 	struct cma_id_private *id_priv;
2259 	int ret, sync;
2260 
2261 	id_priv = container_of(id, struct cma_id_private, id);
2262 	if (id_priv->sync && !channel)
2263 		return ERR(EINVAL);
2264 
2265 	if ((sync = (channel == NULL))) {
2266 		channel = rdma_create_event_channel();
2267 		if (!channel)
2268 			return -1;
2269 	}
2270 
2271 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp);
2272 	cmd.id = id_priv->handle;
2273 	cmd.fd = id->channel->fd;
2274 
2275 	ret = write(channel->fd, &cmd, sizeof cmd);
2276 	if (ret != sizeof cmd) {
2277 		if (sync)
2278 			rdma_destroy_event_channel(channel);
2279 		return (ret >= 0) ? ERR(ENODATA) : -1;
2280 	}
2281 
2282 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
2283 
2284 	if (id_priv->sync) {
2285 		if (id->event) {
2286 			rdma_ack_cm_event(id->event);
2287 			id->event = NULL;
2288 		}
2289 		rdma_destroy_event_channel(id->channel);
2290 	}
2291 
2292 	/*
2293 	 * Eventually if we want to support migrating channels while events are
2294 	 * being processed on the current channel, we need to block here while
2295 	 * there are any outstanding events on the current channel for this id
2296 	 * to prevent the user from processing events for this id on the old
2297 	 * channel after this call returns.
2298 	 */
2299 	pthread_mutex_lock(&id_priv->mut);
2300 	id_priv->sync = sync;
2301 	id->channel = channel;
2302 	while (id_priv->events_completed < resp.events_reported)
2303 		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
2304 	pthread_mutex_unlock(&id_priv->mut);
2305 
2306 	return 0;
2307 }
2308 
2309 static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res,
2310 			   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2311 {
2312 	struct cma_id_private *id_priv;
2313 	int ret;
2314 
2315 	if (af_ib_support)
2316 		ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len);
2317 	else
2318 		ret = rdma_bind_addr(id, res->ai_src_addr);
2319 	if (ret)
2320 		return ret;
2321 
2322 	id_priv = container_of(id, struct cma_id_private, id);
2323 	if (pd)
2324 		id->pd = pd;
2325 
2326 	if (qp_init_attr) {
2327 		id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr));
2328 		if (!id_priv->qp_init_attr)
2329 			return ERR(ENOMEM);
2330 
2331 		*id_priv->qp_init_attr = *qp_init_attr;
2332 		id_priv->qp_init_attr->qp_type = res->ai_qp_type;
2333 	}
2334 
2335 	return 0;
2336 }
2337 
2338 int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res,
2339 		   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2340 {
2341 	struct rdma_cm_id *cm_id;
2342 	struct cma_id_private *id_priv;
2343 	int ret;
2344 
2345 	ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type);
2346 	if (ret)
2347 		return ret;
2348 
2349 	if (res->ai_flags & RAI_PASSIVE) {
2350 		ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr);
2351 		if (ret)
2352 			goto err;
2353 		goto out;
2354 	}
2355 
2356 	if (af_ib_support)
2357 		ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len,
2358 					 res->ai_dst_addr, res->ai_dst_len, 2000);
2359 	else
2360 		ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000);
2361 	if (ret)
2362 		goto err;
2363 
2364 	if (res->ai_route_len) {
2365 		ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
2366 				      res->ai_route, res->ai_route_len);
2367 		if (!ret)
2368 			ret = ucma_complete(cm_id);
2369 	} else {
2370 		ret = rdma_resolve_route(cm_id, 2000);
2371 	}
2372 	if (ret)
2373 		goto err;
2374 
2375 	if (qp_init_attr) {
2376 		qp_init_attr->qp_type = res->ai_qp_type;
2377 		ret = rdma_create_qp(cm_id, pd, qp_init_attr);
2378 		if (ret)
2379 			goto err;
2380 	}
2381 
2382 	if (res->ai_connect_len) {
2383 		id_priv = container_of(cm_id, struct cma_id_private, id);
2384 		id_priv->connect = malloc(res->ai_connect_len);
2385 		if (!id_priv->connect) {
2386 			ret = ERR(ENOMEM);
2387 			goto err;
2388 		}
2389 		memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len);
2390 		id_priv->connect_len = res->ai_connect_len;
2391 	}
2392 
2393 out:
2394 	*id = cm_id;
2395 	return 0;
2396 
2397 err:
2398 	rdma_destroy_ep(cm_id);
2399 	return ret;
2400 }
2401 
2402 void rdma_destroy_ep(struct rdma_cm_id *id)
2403 {
2404 	struct cma_id_private *id_priv;
2405 
2406 	if (id->qp)
2407 		rdma_destroy_qp(id);
2408 
2409 	if (id->srq)
2410 		rdma_destroy_srq(id);
2411 
2412 	id_priv = container_of(id, struct cma_id_private, id);
2413 	if (id_priv->qp_init_attr)
2414 		free(id_priv->qp_init_attr);
2415 
2416 	rdma_destroy_id(id);
2417 }
2418 
2419 int ucma_max_qpsize(struct rdma_cm_id *id)
2420 {
2421 	struct cma_id_private *id_priv;
2422 	int i, max_size = 0;
2423 
2424 	id_priv = container_of(id, struct cma_id_private, id);
2425 	if (id && id_priv->cma_dev) {
2426 		max_size = id_priv->cma_dev->max_qpsize;
2427 	} else {
2428 		ucma_init_all();
2429 		for (i = 0; i < cma_dev_cnt; i++) {
2430 			if (!max_size || max_size > cma_dev_array[i].max_qpsize)
2431 				max_size = cma_dev_array[i].max_qpsize;
2432 		}
2433 	}
2434 	return max_size;
2435 }
2436 
2437 __be16 ucma_get_port(struct sockaddr *addr)
2438 {
2439 	switch (addr->sa_family) {
2440 	case AF_INET:
2441 		return ((struct sockaddr_in *) addr)->sin_port;
2442 	case AF_INET6:
2443 		return ((struct sockaddr_in6 *) addr)->sin6_port;
2444 	case AF_IB:
2445 		return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid));
2446 	default:
2447 		return 0;
2448 	}
2449 }
2450 
2451 __be16 rdma_get_src_port(struct rdma_cm_id *id)
2452 {
2453 	return ucma_get_port(&id->route.addr.src_addr);
2454 }
2455 
2456 __be16 rdma_get_dst_port(struct rdma_cm_id *id)
2457 {
2458 	return ucma_get_port(&id->route.addr.dst_addr);
2459 }
2460 
2461