xref: /freebsd/contrib/ofed/librdmacm/cma.c (revision ddfc6f84f24215b418af19260e9156219f6df03e)
1 /*
2  * Copyright (c) 2005-2014 Intel Corporation.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <config.h>
34 
35 #include <stdlib.h>
36 #include <string.h>
37 #include <glob.h>
38 #include <stdio.h>
39 #include <fcntl.h>
40 #include <errno.h>
41 #include <stdint.h>
42 #include <poll.h>
43 #include <unistd.h>
44 #include <pthread.h>
45 #include <infiniband/endian.h>
46 #include <stddef.h>
47 #include <netdb.h>
48 #include <syslog.h>
49 #include <limits.h>
50 
51 #include "cma.h"
52 #include "indexer.h"
53 #include <infiniband/driver.h>
54 #include <infiniband/marshall.h>
55 #include <rdma/rdma_cma.h>
56 #include <rdma/rdma_cma_abi.h>
57 #include <rdma/rdma_verbs.h>
58 #include <infiniband/ib.h>
59 
/*
 * Zero a ucma command structure and fill in its header fields:
 *   cmd - the opcode UCMA_CMD_<op>
 *   in  - the payload length, i.e. req_size minus the size of
 *         struct ucma_abi_cmd_hdr
 *
 * req must be a pointer to the command structure.  Every use of a macro
 * argument is parenthesized so that an expression passed as req_size
 * (for example a conditional) keeps its intended precedence.
 */
#define CMA_INIT_CMD(req, req_size, op)		\
do {						\
	memset((req), 0, (req_size));		\
	(req)->cmd = UCMA_CMD_##op;		\
	(req)->in  = (req_size) - sizeof(struct ucma_abi_cmd_hdr); \
} while (0)
66 
/*
 * Same as CMA_INIT_CMD, and additionally describes the response buffer
 * in the command header: its size goes into "out" and its address,
 * converted to an integer, into "response".
 */
#define CMA_INIT_CMD_RESP(req, req_size, op, resp, resp_size)	\
do {								\
	CMA_INIT_CMD(req, req_size, op);			\
	(req)->response = (uintptr_t) (resp);			\
	(req)->out = resp_size;					\
} while (0)
73 
/* Per-port state cached for a device by ucma_init_device(). */
struct cma_port {
	/* link layer reported by ibv_query_port(), or
	 * IBV_LINK_LAYER_UNSPECIFIED when that query failed */
	uint8_t link_layer;
};
77 
78 struct cma_device {
79 	struct ibv_context *verbs;
80 	struct ibv_pd	   *pd;
81 	struct ibv_xrcd    *xrcd;
82 	struct cma_port    *port;
83 	__be64		    guid;
84 	int		    port_cnt;
85 	int		    refcnt;
86 	int		    max_qpsize;
87 	uint8_t		    max_initiator_depth;
88 	uint8_t		    max_responder_resources;
89 };
90 
91 struct cma_id_private {
92 	struct rdma_cm_id	id;
93 	struct cma_device	*cma_dev;
94 	void			*connect;
95 	size_t			connect_len;
96 	int			events_completed;
97 	int			connect_error;
98 	int			sync;
99 	pthread_cond_t		cond;
100 	pthread_mutex_t		mut;
101 	uint32_t		handle;
102 	struct cma_multicast	*mc_list;
103 	struct ibv_qp_init_attr	*qp_init_attr;
104 	uint8_t			initiator_depth;
105 	uint8_t			responder_resources;
106 };
107 
108 struct cma_multicast {
109 	struct cma_multicast  *next;
110 	struct cma_id_private *id_priv;
111 	void		*context;
112 	int		events_completed;
113 	pthread_cond_t	cond;
114 	uint32_t	handle;
115 	union ibv_gid	mgid;
116 	uint16_t	mlid;
117 	struct sockaddr_storage addr;
118 };
119 
120 struct cma_event {
121 	struct rdma_cm_event	event;
122 	uint8_t			private_data[RDMA_MAX_PRIVATE_DATA];
123 	struct cma_id_private	*id_priv;
124 	struct cma_multicast	*mc;
125 };
126 
127 static struct cma_device *cma_dev_array;
128 static int cma_dev_cnt;
129 static int cma_init_cnt;
130 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
131 static int abi_ver = RDMA_USER_CM_MAX_ABI_VERSION;
132 int af_ib_support;
133 static struct index_map ucma_idm;
134 static fastlock_t idm_lock;
135 
136 static int check_abi_version(void)
137 {
138 	char value[8];
139 
140 	if ((ibv_read_sysfs_file(ibv_get_sysfs_path(),
141 				 "class/misc/rdma_cm/abi_version",
142 				 value, sizeof value) < 0) &&
143 	    (ibv_read_sysfs_file(ibv_get_sysfs_path(),
144 				 "class/infiniband_ucma/abi_version",
145 				 value, sizeof value) < 0)) {
146 		/*
147 		 * Older version of Linux do not have class/misc.  To support
148 		 * backports, assume the most recent version of the ABI.  If
149 		 * we're wrong, we'll simply fail later when calling the ABI.
150 		 */
151 		return 0;
152 	}
153 
154 	abi_ver = strtol(value, NULL, 10);
155 	if (abi_ver < RDMA_USER_CM_MIN_ABI_VERSION ||
156 	    abi_ver > RDMA_USER_CM_MAX_ABI_VERSION) {
157 		return -1;
158 	}
159 	return 0;
160 }
161 
162 /*
163  * This function is called holding the mutex lock
164  * cma_dev_cnt must be set before calling this function to
165  * ensure that the lock is not acquired recursively.
166  */
167 static void ucma_set_af_ib_support(void)
168 {
169 	struct rdma_cm_id *id;
170 	struct sockaddr_ib sib;
171 	int ret;
172 
173 	ret = rdma_create_id(NULL, &id, NULL, RDMA_PS_IB);
174 	if (ret)
175 		return;
176 
177 	memset(&sib, 0, sizeof sib);
178 	sib.sib_family = AF_IB;
179 	sib.sib_sid = htobe64(RDMA_IB_IP_PS_TCP);
180 	sib.sib_sid_mask = htobe64(RDMA_IB_IP_PS_MASK);
181 	af_ib_support = 1;
182 	ret = rdma_bind_addr(id, (struct sockaddr *) &sib);
183 	af_ib_support = !ret;
184 
185 	rdma_destroy_id(id);
186 }
187 
188 int ucma_init(void)
189 {
190 	struct ibv_device **dev_list = NULL;
191 	int i, ret, dev_cnt;
192 
193 	/* Quick check without lock to see if we're already initialized */
194 	if (cma_dev_cnt)
195 		return 0;
196 
197 	pthread_mutex_lock(&mut);
198 	if (cma_dev_cnt) {
199 		pthread_mutex_unlock(&mut);
200 		return 0;
201 	}
202 
203 	fastlock_init(&idm_lock);
204 	ret = check_abi_version();
205 	if (ret)
206 		goto err1;
207 
208 	dev_list = ibv_get_device_list(&dev_cnt);
209 	if (!dev_list) {
210 		ret = ERR(ENODEV);
211 		goto err1;
212 	}
213 
214 	if (!dev_cnt) {
215 		ret = ERR(ENODEV);
216 		goto err2;
217 	}
218 
219 	cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array));
220 	if (!cma_dev_array) {
221 		ret = ERR(ENOMEM);
222 		goto err2;
223 	}
224 
225 	for (i = 0; dev_list[i]; i++)
226 		cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]);
227 
228 	cma_dev_cnt = dev_cnt;
229 	ucma_set_af_ib_support();
230 	pthread_mutex_unlock(&mut);
231 	ibv_free_device_list(dev_list);
232 	return 0;
233 
234 err2:
235 	ibv_free_device_list(dev_list);
236 err1:
237 	fastlock_destroy(&idm_lock);
238 	pthread_mutex_unlock(&mut);
239 	return ret;
240 }
241 
242 static struct ibv_context *ucma_open_device(__be64 guid)
243 {
244 	struct ibv_device **dev_list;
245 	struct ibv_context *verbs = NULL;
246 	int i;
247 
248 	dev_list = ibv_get_device_list(NULL);
249 	if (!dev_list) {
250 		return NULL;
251 	}
252 
253 	for (i = 0; dev_list[i]; i++) {
254 		if (ibv_get_device_guid(dev_list[i]) == guid) {
255 			verbs = ibv_open_device(dev_list[i]);
256 			break;
257 		}
258 	}
259 
260 	ibv_free_device_list(dev_list);
261 	return verbs;
262 }
263 
264 static int ucma_init_device(struct cma_device *cma_dev)
265 {
266 	struct ibv_port_attr port_attr;
267 	struct ibv_device_attr attr;
268 	int i, ret;
269 
270 	if (cma_dev->verbs)
271 		return 0;
272 
273 	cma_dev->verbs = ucma_open_device(cma_dev->guid);
274 	if (!cma_dev->verbs)
275 		return ERR(ENODEV);
276 
277 	ret = ibv_query_device(cma_dev->verbs, &attr);
278 	if (ret) {
279 		ret = ERR(ret);
280 		goto err;
281 	}
282 
283 	cma_dev->port = malloc(sizeof(*cma_dev->port) * attr.phys_port_cnt);
284 	if (!cma_dev->port) {
285 		ret = ERR(ENOMEM);
286 		goto err;
287 	}
288 
289 	for (i = 1; i <= attr.phys_port_cnt; i++) {
290 		if (ibv_query_port(cma_dev->verbs, i, &port_attr))
291 			cma_dev->port[i - 1].link_layer = IBV_LINK_LAYER_UNSPECIFIED;
292 		else
293 			cma_dev->port[i - 1].link_layer = port_attr.link_layer;
294 	}
295 
296 	cma_dev->port_cnt = attr.phys_port_cnt;
297 	cma_dev->max_qpsize = attr.max_qp_wr;
298 	cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
299 	cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
300 	cma_init_cnt++;
301 	return 0;
302 
303 err:
304 	ibv_close_device(cma_dev->verbs);
305 	cma_dev->verbs = NULL;
306 	return ret;
307 }
308 
309 static int ucma_init_all(void)
310 {
311 	int i, ret = 0;
312 
313 	if (!cma_dev_cnt) {
314 		ret = ucma_init();
315 		if (ret)
316 			return ret;
317 	}
318 
319 	if (cma_init_cnt == cma_dev_cnt)
320 		return 0;
321 
322 	pthread_mutex_lock(&mut);
323 	for (i = 0; i < cma_dev_cnt; i++) {
324 		ret = ucma_init_device(&cma_dev_array[i]);
325 		if (ret)
326 			break;
327 	}
328 	pthread_mutex_unlock(&mut);
329 	return ret;
330 }
331 
332 struct ibv_context **rdma_get_devices(int *num_devices)
333 {
334 	struct ibv_context **devs = NULL;
335 	int i;
336 
337 	if (ucma_init_all())
338 		goto out;
339 
340 	devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1));
341 	if (!devs)
342 		goto out;
343 
344 	for (i = 0; i < cma_dev_cnt; i++)
345 		devs[i] = cma_dev_array[i].verbs;
346 	devs[i] = NULL;
347 out:
348 	if (num_devices)
349 		*num_devices = devs ? cma_dev_cnt : 0;
350 	return devs;
351 }
352 
/*
 * Release an array obtained from rdma_get_devices().  Only the array
 * itself is freed; the contexts it points to are not touched.
 */
void rdma_free_devices(struct ibv_context **list)
{
	void *array = list;

	free(array);
}
357 
358 struct rdma_event_channel *rdma_create_event_channel(void)
359 {
360 	struct rdma_event_channel *channel;
361 
362 	if (ucma_init())
363 		return NULL;
364 
365 	channel = malloc(sizeof(*channel));
366 	if (!channel)
367 		return NULL;
368 
369 	channel->fd = open("/dev/rdma_cm", O_RDWR | O_CLOEXEC);
370 	if (channel->fd < 0) {
371 		goto err;
372 	}
373 	return channel;
374 err:
375 	free(channel);
376 	return NULL;
377 }
378 
379 void rdma_destroy_event_channel(struct rdma_event_channel *channel)
380 {
381 	close(channel->fd);
382 	free(channel);
383 }
384 
385 static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid)
386 {
387 	struct cma_device *cma_dev;
388 	int i, ret;
389 
390 	for (i = 0; i < cma_dev_cnt; i++) {
391 		cma_dev = &cma_dev_array[i];
392 		if (cma_dev->guid == guid)
393 			goto match;
394 	}
395 
396 	return ERR(ENODEV);
397 match:
398 	pthread_mutex_lock(&mut);
399 	if ((ret = ucma_init_device(cma_dev)))
400 		goto out;
401 
402 	if (!cma_dev->refcnt++) {
403 		cma_dev->pd = ibv_alloc_pd(cma_dev->verbs);
404 		if (!cma_dev->pd) {
405 			cma_dev->refcnt--;
406 			ret = ERR(ENOMEM);
407 			goto out;
408 		}
409 	}
410 	id_priv->cma_dev = cma_dev;
411 	id_priv->id.verbs = cma_dev->verbs;
412 	id_priv->id.pd = cma_dev->pd;
413 out:
414 	pthread_mutex_unlock(&mut);
415 	return ret;
416 }
417 
418 static void ucma_put_device(struct cma_device *cma_dev)
419 {
420 	pthread_mutex_lock(&mut);
421 	if (!--cma_dev->refcnt) {
422 		ibv_dealloc_pd(cma_dev->pd);
423 		if (cma_dev->xrcd)
424 			ibv_close_xrcd(cma_dev->xrcd);
425 	}
426 	pthread_mutex_unlock(&mut);
427 }
428 
429 static struct ibv_xrcd *ucma_get_xrcd(struct cma_device *cma_dev)
430 {
431 	struct ibv_xrcd_init_attr attr;
432 
433 	pthread_mutex_lock(&mut);
434 	if (!cma_dev->xrcd) {
435 		memset(&attr, 0, sizeof attr);
436 		attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
437 		attr.fd = -1;
438 		attr.oflags = O_CREAT;
439 		cma_dev->xrcd = ibv_open_xrcd(cma_dev->verbs, &attr);
440 	}
441 	pthread_mutex_unlock(&mut);
442 	return cma_dev->xrcd;
443 }
444 
445 static void ucma_insert_id(struct cma_id_private *id_priv)
446 {
447 	fastlock_acquire(&idm_lock);
448 	idm_set(&ucma_idm, id_priv->handle, id_priv);
449 	fastlock_release(&idm_lock);
450 }
451 
452 static void ucma_remove_id(struct cma_id_private *id_priv)
453 {
454 	if (id_priv->handle <= IDX_MAX_INDEX)
455 		idm_clear(&ucma_idm, id_priv->handle);
456 }
457 
458 static struct cma_id_private *ucma_lookup_id(int handle)
459 {
460 	return idm_lookup(&ucma_idm, handle);
461 }
462 
463 static void ucma_free_id(struct cma_id_private *id_priv)
464 {
465 	ucma_remove_id(id_priv);
466 	if (id_priv->cma_dev)
467 		ucma_put_device(id_priv->cma_dev);
468 	pthread_cond_destroy(&id_priv->cond);
469 	pthread_mutex_destroy(&id_priv->mut);
470 	if (id_priv->id.route.path_rec)
471 		free(id_priv->id.route.path_rec);
472 
473 	if (id_priv->sync)
474 		rdma_destroy_event_channel(id_priv->id.channel);
475 	if (id_priv->connect_len)
476 		free(id_priv->connect);
477 	free(id_priv);
478 }
479 
480 static struct cma_id_private *ucma_alloc_id(struct rdma_event_channel *channel,
481 					    void *context,
482 					    enum rdma_port_space ps,
483 					    enum ibv_qp_type qp_type)
484 {
485 	struct cma_id_private *id_priv;
486 
487 	id_priv = calloc(1, sizeof(*id_priv));
488 	if (!id_priv)
489 		return NULL;
490 
491 	id_priv->id.context = context;
492 	id_priv->id.ps = ps;
493 	id_priv->id.qp_type = qp_type;
494 	id_priv->handle = 0xFFFFFFFF;
495 
496 	if (!channel) {
497 		id_priv->id.channel = rdma_create_event_channel();
498 		if (!id_priv->id.channel)
499 			goto err;
500 		id_priv->sync = 1;
501 	} else {
502 		id_priv->id.channel = channel;
503 	}
504 
505 	if (pthread_mutex_init(&id_priv->mut, NULL))
506 		goto err;
507 	if (pthread_cond_init(&id_priv->cond, NULL))
508 		goto err;
509 
510 	return id_priv;
511 
512 err:	ucma_free_id(id_priv);
513 	return NULL;
514 }
515 
516 static int rdma_create_id2(struct rdma_event_channel *channel,
517 			   struct rdma_cm_id **id, void *context,
518 			   enum rdma_port_space ps, enum ibv_qp_type qp_type)
519 {
520 	struct ucma_abi_create_id_resp resp;
521 	struct ucma_abi_create_id cmd;
522 	struct cma_id_private *id_priv;
523 	int ret;
524 
525 	ret = ucma_init();
526 	if (ret)
527 		return ret;
528 
529 	id_priv = ucma_alloc_id(channel, context, ps, qp_type);
530 	if (!id_priv)
531 		return ERR(ENOMEM);
532 
533 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, CREATE_ID, &resp, sizeof resp);
534 	cmd.uid = (uintptr_t) id_priv;
535 	cmd.ps = ps;
536 	cmd.qp_type = qp_type;
537 
538 	ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
539 	if (ret != sizeof cmd)
540 		goto err;
541 
542 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
543 
544 	id_priv->handle = resp.id;
545 	ucma_insert_id(id_priv);
546 	*id = &id_priv->id;
547 	return 0;
548 
549 err:	ucma_free_id(id_priv);
550 	return ret;
551 }
552 
553 int rdma_create_id(struct rdma_event_channel *channel,
554 		   struct rdma_cm_id **id, void *context,
555 		   enum rdma_port_space ps)
556 {
557 	enum ibv_qp_type qp_type;
558 
559 	qp_type = (ps == RDMA_PS_IPOIB || ps == RDMA_PS_UDP) ?
560 		  IBV_QPT_UD : IBV_QPT_RC;
561 	return rdma_create_id2(channel, id, context, ps, qp_type);
562 }
563 
564 static int ucma_destroy_kern_id(int fd, uint32_t handle)
565 {
566 	struct ucma_abi_destroy_id_resp resp;
567 	struct ucma_abi_destroy_id cmd;
568 	int ret;
569 
570 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, DESTROY_ID, &resp, sizeof resp);
571 	cmd.id = handle;
572 
573 	ret = write(fd, &cmd, sizeof cmd);
574 	if (ret != sizeof cmd)
575 		return (ret >= 0) ? ERR(ENODATA) : -1;
576 
577 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
578 
579 	return resp.events_reported;
580 }
581 
582 int rdma_destroy_id(struct rdma_cm_id *id)
583 {
584 	struct cma_id_private *id_priv;
585 	int ret;
586 
587 	id_priv = container_of(id, struct cma_id_private, id);
588 	ret = ucma_destroy_kern_id(id->channel->fd, id_priv->handle);
589 	if (ret < 0)
590 		return ret;
591 
592 	if (id_priv->id.event)
593 		rdma_ack_cm_event(id_priv->id.event);
594 
595 	pthread_mutex_lock(&id_priv->mut);
596 	while (id_priv->events_completed < ret)
597 		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
598 	pthread_mutex_unlock(&id_priv->mut);
599 
600 	ucma_free_id(id_priv);
601 	return 0;
602 }
603 
604 int ucma_addrlen(struct sockaddr *addr)
605 {
606 	if (!addr)
607 		return 0;
608 
609 	switch (addr->sa_family) {
610 	case PF_INET:
611 		return sizeof(struct sockaddr_in);
612 	case PF_INET6:
613 		return sizeof(struct sockaddr_in6);
614 	case PF_IB:
615 		return af_ib_support ? sizeof(struct sockaddr_ib) : 0;
616 	default:
617 		return 0;
618 	}
619 }
620 
621 static int ucma_query_addr(struct rdma_cm_id *id)
622 {
623 	struct ucma_abi_query_addr_resp resp;
624 	struct ucma_abi_query cmd;
625 	struct cma_id_private *id_priv;
626 	int ret;
627 
628 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
629 	id_priv = container_of(id, struct cma_id_private, id);
630 	cmd.id = id_priv->handle;
631 	cmd.option = UCMA_QUERY_ADDR;
632 
633 	ret = write(id->channel->fd, &cmd, sizeof cmd);
634 	if (ret != sizeof cmd)
635 		return (ret >= 0) ? ERR(ENODATA) : -1;
636 
637 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
638 
639 	memcpy(&id->route.addr.src_addr, &resp.src_addr, resp.src_size);
640 	memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size);
641 
642 	if (!id_priv->cma_dev && resp.node_guid) {
643 		ret = ucma_get_device(id_priv, resp.node_guid);
644 		if (ret)
645 			return ret;
646 		id->port_num = resp.port_num;
647 		id->route.addr.addr.ibaddr.pkey = resp.pkey;
648 	}
649 
650 	return 0;
651 }
652 
653 static int ucma_query_gid(struct rdma_cm_id *id)
654 {
655 	struct ucma_abi_query_addr_resp resp;
656 	struct ucma_abi_query cmd;
657 	struct cma_id_private *id_priv;
658 	struct sockaddr_ib *sib;
659 	int ret;
660 
661 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, &resp, sizeof resp);
662 	id_priv = container_of(id, struct cma_id_private, id);
663 	cmd.id = id_priv->handle;
664 	cmd.option = UCMA_QUERY_GID;
665 
666 	ret = write(id->channel->fd, &cmd, sizeof cmd);
667 	if (ret != sizeof cmd)
668 		return (ret >= 0) ? ERR(ENODATA) : -1;
669 
670 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
671 
672 	sib = (struct sockaddr_ib *) &resp.src_addr;
673 	memcpy(id->route.addr.addr.ibaddr.sgid.raw, sib->sib_addr.sib_raw,
674 	       sizeof id->route.addr.addr.ibaddr.sgid);
675 
676 	sib = (struct sockaddr_ib *) &resp.dst_addr;
677 	memcpy(id->route.addr.addr.ibaddr.dgid.raw, sib->sib_addr.sib_raw,
678 	       sizeof id->route.addr.addr.ibaddr.dgid);
679 
680 	return 0;
681 }
682 
683 static void ucma_convert_path(struct ibv_path_data *path_data,
684 			      struct ibv_sa_path_rec *sa_path)
685 {
686 	uint32_t fl_hop;
687 
688 	sa_path->dgid = path_data->path.dgid;
689 	sa_path->sgid = path_data->path.sgid;
690 	sa_path->dlid = path_data->path.dlid;
691 	sa_path->slid = path_data->path.slid;
692 	sa_path->raw_traffic = 0;
693 
694 	fl_hop = be32toh(path_data->path.flowlabel_hoplimit);
695 	sa_path->flow_label = htobe32(fl_hop >> 8);
696 	sa_path->hop_limit = (uint8_t) fl_hop;
697 
698 	sa_path->traffic_class = path_data->path.tclass;
699 	sa_path->reversible = path_data->path.reversible_numpath >> 7;
700 	sa_path->numb_path = 1;
701 	sa_path->pkey = path_data->path.pkey;
702 	sa_path->sl = be16toh(path_data->path.qosclass_sl) & 0xF;
703 	sa_path->mtu_selector = 2;	/* exactly */
704 	sa_path->mtu = path_data->path.mtu & 0x1F;
705 	sa_path->rate_selector = 2;
706 	sa_path->rate = path_data->path.rate & 0x1F;
707 	sa_path->packet_life_time_selector = 2;
708 	sa_path->packet_life_time = path_data->path.packetlifetime & 0x1F;
709 
710 	sa_path->preference = (uint8_t) path_data->flags;
711 }
712 
713 static int ucma_query_path(struct rdma_cm_id *id)
714 {
715 	struct ucma_abi_query_path_resp *resp;
716 	struct ucma_abi_query cmd;
717 	struct cma_id_private *id_priv;
718 	int ret, i, size;
719 
720 	size = sizeof(*resp) + sizeof(struct ibv_path_data) * 6;
721 	resp = alloca(size);
722 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY, resp, size);
723 	id_priv = container_of(id, struct cma_id_private, id);
724 	cmd.id = id_priv->handle;
725 	cmd.option = UCMA_QUERY_PATH;
726 
727 	ret = write(id->channel->fd, &cmd, sizeof cmd);
728 	if (ret != sizeof cmd)
729 		return (ret >= 0) ? ERR(ENODATA) : -1;
730 
731 	VALGRIND_MAKE_MEM_DEFINED(resp, size);
732 
733 	if (resp->num_paths) {
734 		id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
735 					    resp->num_paths);
736 		if (!id->route.path_rec)
737 			return ERR(ENOMEM);
738 
739 		id->route.num_paths = resp->num_paths;
740 		for (i = 0; i < resp->num_paths; i++)
741 			ucma_convert_path(&resp->path_data[i], &id->route.path_rec[i]);
742 	}
743 
744 	return 0;
745 }
746 
747 static int ucma_query_route(struct rdma_cm_id *id)
748 {
749 	struct ucma_abi_query_route_resp resp;
750 	struct ucma_abi_query cmd;
751 	struct cma_id_private *id_priv;
752 	int ret, i;
753 
754 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, QUERY_ROUTE, &resp, sizeof resp);
755 	id_priv = container_of(id, struct cma_id_private, id);
756 	cmd.id = id_priv->handle;
757 
758 	ret = write(id->channel->fd, &cmd, sizeof cmd);
759 	if (ret != sizeof cmd)
760 		return (ret >= 0) ? ERR(ENODATA) : -1;
761 
762 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
763 
764 	if (resp.num_paths) {
765 		id->route.path_rec = malloc(sizeof(*id->route.path_rec) *
766 					    resp.num_paths);
767 		if (!id->route.path_rec)
768 			return ERR(ENOMEM);
769 
770 		id->route.num_paths = resp.num_paths;
771 		for (i = 0; i < resp.num_paths; i++)
772 			ibv_copy_path_rec_from_kern(&id->route.path_rec[i],
773 						    &resp.ib_route[i]);
774 	}
775 
776 	memcpy(id->route.addr.addr.ibaddr.sgid.raw, resp.ib_route[0].sgid,
777 	       sizeof id->route.addr.addr.ibaddr.sgid);
778 	memcpy(id->route.addr.addr.ibaddr.dgid.raw, resp.ib_route[0].dgid,
779 	       sizeof id->route.addr.addr.ibaddr.dgid);
780 	id->route.addr.addr.ibaddr.pkey = resp.ib_route[0].pkey;
781 	memcpy(&id->route.addr.src_addr, &resp.src_addr,
782 	       sizeof resp.src_addr);
783 	memcpy(&id->route.addr.dst_addr, &resp.dst_addr,
784 	       sizeof resp.dst_addr);
785 
786 	if (!id_priv->cma_dev && resp.node_guid) {
787 		ret = ucma_get_device(id_priv, resp.node_guid);
788 		if (ret)
789 			return ret;
790 		id_priv->id.port_num = resp.port_num;
791 	}
792 
793 	return 0;
794 }
795 
796 static int rdma_bind_addr2(struct rdma_cm_id *id, struct sockaddr *addr,
797 			   socklen_t addrlen)
798 {
799 	struct ucma_abi_bind cmd;
800 	struct cma_id_private *id_priv;
801 	int ret;
802 
803 	CMA_INIT_CMD(&cmd, sizeof cmd, BIND);
804 	id_priv = container_of(id, struct cma_id_private, id);
805 	cmd.id = id_priv->handle;
806 	cmd.addr_size = addrlen;
807 	memcpy(&cmd.addr, addr, addrlen);
808 
809 	ret = write(id->channel->fd, &cmd, sizeof cmd);
810 	if (ret != sizeof cmd)
811 		return (ret >= 0) ? ERR(ENODATA) : -1;
812 
813 	ret = ucma_query_addr(id);
814 	if (!ret)
815 		ret = ucma_query_gid(id);
816 	return ret;
817 }
818 
819 int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
820 {
821 	struct ucma_abi_bind_ip cmd;
822 	struct cma_id_private *id_priv;
823 	int ret, addrlen;
824 
825 	addrlen = ucma_addrlen(addr);
826 	if (!addrlen)
827 		return ERR(EINVAL);
828 
829 	if (af_ib_support)
830 		return rdma_bind_addr2(id, addr, addrlen);
831 
832 	CMA_INIT_CMD(&cmd, sizeof cmd, BIND_IP);
833 	id_priv = container_of(id, struct cma_id_private, id);
834 	cmd.id = id_priv->handle;
835 	memcpy(&cmd.addr, addr, addrlen);
836 
837 	ret = write(id->channel->fd, &cmd, sizeof cmd);
838 	if (ret != sizeof cmd)
839 		return (ret >= 0) ? ERR(ENODATA) : -1;
840 
841 	return ucma_query_route(id);
842 }
843 
/*
 * For a synchronous id (one that owns its event channel), wait for the
 * next CM event and turn its status into a return value.  Ids that are
 * not synchronous return 0 immediately.
 *
 * A previously stored event is acknowledged first; the new event is
 * kept in id->event.
 *
 * Returns 0 when the event carries no error status, the error from
 * rdma_get_cm_event(), or -1 with errno set:
 *   - ECONNREFUSED for RDMA_CM_EVENT_REJECTED,
 *   - otherwise the magnitude of the event status.  A negative status
 *     is negated; a positive status is used as is, so errno always ends
 *     up positive (negating a positive status would store a negative,
 *     meaningless errno).
 */
int ucma_complete(struct rdma_cm_id *id)
{
	struct cma_id_private *id_priv;
	int ret, status;

	id_priv = container_of(id, struct cma_id_private, id);
	if (!id_priv->sync)
		return 0;

	if (id_priv->id.event) {
		rdma_ack_cm_event(id_priv->id.event);
		id_priv->id.event = NULL;
	}

	ret = rdma_get_cm_event(id_priv->id.channel, &id_priv->id.event);
	if (ret)
		return ret;

	status = id_priv->id.event->status;
	if (status) {
		if (id_priv->id.event->event == RDMA_CM_EVENT_REJECTED)
			ret = ERR(ECONNREFUSED);
		else if (status < 0)
			ret = ERR(-status);
		else
			ret = ERR(status);
	}
	return ret;
}
872 
873 static int rdma_resolve_addr2(struct rdma_cm_id *id, struct sockaddr *src_addr,
874 			      socklen_t src_len, struct sockaddr *dst_addr,
875 			      socklen_t dst_len, int timeout_ms)
876 {
877 	struct ucma_abi_resolve_addr cmd;
878 	struct cma_id_private *id_priv;
879 	int ret;
880 
881 	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ADDR);
882 	id_priv = container_of(id, struct cma_id_private, id);
883 	cmd.id = id_priv->handle;
884 	if ((cmd.src_size = src_len))
885 		memcpy(&cmd.src_addr, src_addr, src_len);
886 	memcpy(&cmd.dst_addr, dst_addr, dst_len);
887 	cmd.dst_size = dst_len;
888 	cmd.timeout_ms = timeout_ms;
889 
890 	ret = write(id->channel->fd, &cmd, sizeof cmd);
891 	if (ret != sizeof cmd)
892 		return (ret >= 0) ? ERR(ENODATA) : -1;
893 
894 	memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
895 	return ucma_complete(id);
896 }
897 
898 int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
899 		      struct sockaddr *dst_addr, int timeout_ms)
900 {
901 	struct ucma_abi_resolve_ip cmd;
902 	struct cma_id_private *id_priv;
903 	int ret, dst_len, src_len;
904 
905 	dst_len = ucma_addrlen(dst_addr);
906 	if (!dst_len)
907 		return ERR(EINVAL);
908 
909 	src_len = ucma_addrlen(src_addr);
910 	if (src_addr && !src_len)
911 		return ERR(EINVAL);
912 
913 	if (af_ib_support)
914 		return rdma_resolve_addr2(id, src_addr, src_len, dst_addr,
915 					  dst_len, timeout_ms);
916 
917 	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_IP);
918 	id_priv = container_of(id, struct cma_id_private, id);
919 	cmd.id = id_priv->handle;
920 	if (src_addr)
921 		memcpy(&cmd.src_addr, src_addr, src_len);
922 	memcpy(&cmd.dst_addr, dst_addr, dst_len);
923 	cmd.timeout_ms = timeout_ms;
924 
925 	ret = write(id->channel->fd, &cmd, sizeof cmd);
926 	if (ret != sizeof cmd)
927 		return (ret >= 0) ? ERR(ENODATA) : -1;
928 
929 	memcpy(&id->route.addr.dst_addr, dst_addr, dst_len);
930 	return ucma_complete(id);
931 }
932 
/*
 * Obtain route data for the id's source/destination address pair via
 * rdma_getaddrinfo() (RAI_ROUTEONLY) and, when route data is returned,
 * install it on the id with the RDMA_OPTION_IB_PATH option.
 *
 * Returns the rdma_getaddrinfo() error, the rdma_set_option() result,
 * or -1 when the lookup succeeds but yields no route data.
 */
static int ucma_set_ib_route(struct rdma_cm_id *id)
{
	struct rdma_addrinfo hints, *info;
	int rc;

	memset(&hints, 0, sizeof hints);
	hints.ai_flags = RAI_ROUTEONLY;
	hints.ai_family = id->route.addr.src_addr.sa_family;
	hints.ai_src_addr = &id->route.addr.src_addr;
	hints.ai_src_len = ucma_addrlen((struct sockaddr *) &id->route.addr.src_addr);
	hints.ai_dst_addr = &id->route.addr.dst_addr;
	hints.ai_dst_len = ucma_addrlen((struct sockaddr *) &id->route.addr.dst_addr);

	rc = rdma_getaddrinfo(NULL, NULL, &hints, &info);
	if (rc)
		return rc;

	if (!info->ai_route_len)
		rc = -1;
	else
		rc = rdma_set_option(id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
				     info->ai_route, info->ai_route_len);

	rdma_freeaddrinfo(info);
	return rc;
}
959 
960 int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
961 {
962 	struct ucma_abi_resolve_route cmd;
963 	struct cma_id_private *id_priv;
964 	int ret;
965 
966 	id_priv = container_of(id, struct cma_id_private, id);
967 	if (id->verbs->device->transport_type == IBV_TRANSPORT_IB) {
968 		ret = ucma_set_ib_route(id);
969 		if (!ret)
970 			goto out;
971 	}
972 
973 	CMA_INIT_CMD(&cmd, sizeof cmd, RESOLVE_ROUTE);
974 	cmd.id = id_priv->handle;
975 	cmd.timeout_ms = timeout_ms;
976 
977 	ret = write(id->channel->fd, &cmd, sizeof cmd);
978 	if (ret != sizeof cmd)
979 		return (ret >= 0) ? ERR(ENODATA) : -1;
980 
981 out:
982 	return ucma_complete(id);
983 }
984 
985 static int ucma_is_ud_qp(enum ibv_qp_type qp_type)
986 {
987 	return (qp_type == IBV_QPT_UD);
988 }
989 
990 static int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr,
991 			     int *qp_attr_mask)
992 {
993 	struct ucma_abi_init_qp_attr cmd;
994 	struct ibv_kern_qp_attr resp;
995 	struct cma_id_private *id_priv;
996 	int ret;
997 
998 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, INIT_QP_ATTR, &resp, sizeof resp);
999 	id_priv = container_of(id, struct cma_id_private, id);
1000 	cmd.id = id_priv->handle;
1001 	cmd.qp_state = qp_attr->qp_state;
1002 
1003 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1004 	if (ret != sizeof cmd)
1005 		return (ret >= 0) ? ERR(ENODATA) : -1;
1006 
1007 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1008 
1009 	ibv_copy_qp_attr_from_kern(qp_attr, &resp);
1010 	*qp_attr_mask = resp.qp_attr_mask;
1011 	return 0;
1012 }
1013 
1014 static int ucma_modify_qp_rtr(struct rdma_cm_id *id, uint8_t resp_res)
1015 {
1016 	struct cma_id_private *id_priv;
1017 	struct ibv_qp_attr qp_attr;
1018 	int qp_attr_mask, ret;
1019 	uint8_t link_layer;
1020 
1021 	if (!id->qp)
1022 		return ERR(EINVAL);
1023 
1024 	/* Need to update QP attributes from default values. */
1025 	qp_attr.qp_state = IBV_QPS_INIT;
1026 	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1027 	if (ret)
1028 		return ret;
1029 
1030 	ret = ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask);
1031 	if (ret)
1032 		return ERR(ret);
1033 
1034 	qp_attr.qp_state = IBV_QPS_RTR;
1035 	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1036 	if (ret)
1037 		return ret;
1038 
1039 	/*
1040 	 * Workaround for rdma_ucm kernel bug:
1041 	 * mask off qp_attr_mask bits 21-24 which are used for RoCE
1042 	 */
1043 	id_priv = container_of(id, struct cma_id_private, id);
1044 	link_layer = id_priv->cma_dev->port[id->port_num - 1].link_layer;
1045 
1046 	if (link_layer == IBV_LINK_LAYER_INFINIBAND)
1047 		qp_attr_mask &= UINT_MAX ^ 0xe00000;
1048 
1049 	if (resp_res != RDMA_MAX_RESP_RES)
1050 		qp_attr.max_dest_rd_atomic = resp_res;
1051 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1052 }
1053 
1054 static int ucma_modify_qp_rts(struct rdma_cm_id *id, uint8_t init_depth)
1055 {
1056 	struct ibv_qp_attr qp_attr;
1057 	int qp_attr_mask, ret;
1058 
1059 	qp_attr.qp_state = IBV_QPS_RTS;
1060 	ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask);
1061 	if (ret)
1062 		return ret;
1063 
1064 	if (init_depth != RDMA_MAX_INIT_DEPTH)
1065 		qp_attr.max_rd_atomic = init_depth;
1066 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, qp_attr_mask));
1067 }
1068 
1069 static int ucma_modify_qp_sqd(struct rdma_cm_id *id)
1070 {
1071 	struct ibv_qp_attr qp_attr;
1072 
1073 	if (!id->qp)
1074 		return 0;
1075 
1076 	qp_attr.qp_state = IBV_QPS_SQD;
1077 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1078 }
1079 
1080 static int ucma_modify_qp_err(struct rdma_cm_id *id)
1081 {
1082 	struct ibv_qp_attr qp_attr;
1083 
1084 	if (!id->qp)
1085 		return 0;
1086 
1087 	qp_attr.qp_state = IBV_QPS_ERR;
1088 	return rdma_seterrno(ibv_modify_qp(id->qp, &qp_attr, IBV_QP_STATE));
1089 }
1090 
1091 static int ucma_find_pkey(struct cma_device *cma_dev, uint8_t port_num,
1092 			  __be16 pkey, uint16_t *pkey_index)
1093 {
1094 	int ret, i;
1095 	__be16 chk_pkey;
1096 
1097 	for (i = 0, ret = 0; !ret; i++) {
1098 		ret = ibv_query_pkey(cma_dev->verbs, port_num, i, &chk_pkey);
1099 		if (!ret && pkey == chk_pkey) {
1100 			*pkey_index = (uint16_t) i;
1101 			return 0;
1102 		}
1103 	}
1104 	return ERR(EINVAL);
1105 }
1106 
1107 static int ucma_init_conn_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1108 {
1109 	struct ibv_qp_attr qp_attr;
1110 	int ret;
1111 
1112 	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1113 			     id_priv->id.route.addr.addr.ibaddr.pkey,
1114 			     &qp_attr.pkey_index);
1115 	if (ret)
1116 		return ret;
1117 
1118 	qp_attr.port_num = id_priv->id.port_num;
1119 	qp_attr.qp_state = IBV_QPS_INIT;
1120 	qp_attr.qp_access_flags = 0;
1121 
1122 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_ACCESS_FLAGS |
1123 					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1124 	return rdma_seterrno(ret);
1125 }
1126 
1127 static int ucma_init_conn_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1128 {
1129 	struct ibv_qp_attr qp_attr;
1130 	int qp_attr_mask, ret;
1131 
1132 	if (abi_ver == 3)
1133 		return ucma_init_conn_qp3(id_priv, qp);
1134 
1135 	qp_attr.qp_state = IBV_QPS_INIT;
1136 	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1137 	if (ret)
1138 		return ret;
1139 
1140 	return rdma_seterrno(ibv_modify_qp(qp, &qp_attr, qp_attr_mask));
1141 }
1142 
1143 static int ucma_init_ud_qp3(struct cma_id_private *id_priv, struct ibv_qp *qp)
1144 {
1145 	struct ibv_qp_attr qp_attr;
1146 	int ret;
1147 
1148 	ret = ucma_find_pkey(id_priv->cma_dev, id_priv->id.port_num,
1149 			     id_priv->id.route.addr.addr.ibaddr.pkey,
1150 			     &qp_attr.pkey_index);
1151 	if (ret)
1152 		return ret;
1153 
1154 	qp_attr.port_num = id_priv->id.port_num;
1155 	qp_attr.qp_state = IBV_QPS_INIT;
1156 	qp_attr.qkey = RDMA_UDP_QKEY;
1157 
1158 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_QKEY |
1159 					  IBV_QP_PKEY_INDEX | IBV_QP_PORT);
1160 	if (ret)
1161 		return ERR(ret);
1162 
1163 	qp_attr.qp_state = IBV_QPS_RTR;
1164 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1165 	if (ret)
1166 		return ERR(ret);
1167 
1168 	qp_attr.qp_state = IBV_QPS_RTS;
1169 	qp_attr.sq_psn = 0;
1170 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1171 	return rdma_seterrno(ret);
1172 }
1173 
1174 static int ucma_init_ud_qp(struct cma_id_private *id_priv, struct ibv_qp *qp)
1175 {
1176 	struct ibv_qp_attr qp_attr;
1177 	int qp_attr_mask, ret;
1178 
1179 	if (abi_ver == 3)
1180 		return ucma_init_ud_qp3(id_priv, qp);
1181 
1182 	qp_attr.qp_state = IBV_QPS_INIT;
1183 	ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
1184 	if (ret)
1185 		return ret;
1186 
1187 	ret = ibv_modify_qp(qp, &qp_attr, qp_attr_mask);
1188 	if (ret)
1189 		return ERR(ret);
1190 
1191 	qp_attr.qp_state = IBV_QPS_RTR;
1192 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE);
1193 	if (ret)
1194 		return ERR(ret);
1195 
1196 	qp_attr.qp_state = IBV_QPS_RTS;
1197 	qp_attr.sq_psn = 0;
1198 	ret = ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE | IBV_QP_SQ_PSN);
1199 	return rdma_seterrno(ret);
1200 }
1201 
1202 static void ucma_destroy_cqs(struct rdma_cm_id *id)
1203 {
1204 	if (id->qp_type == IBV_QPT_XRC_RECV && id->srq)
1205 		return;
1206 
1207 	if (id->recv_cq) {
1208 		ibv_destroy_cq(id->recv_cq);
1209 		if (id->send_cq && (id->send_cq != id->recv_cq)) {
1210 			ibv_destroy_cq(id->send_cq);
1211 			id->send_cq = NULL;
1212 		}
1213 		id->recv_cq = NULL;
1214 	}
1215 
1216 	if (id->recv_cq_channel) {
1217 		ibv_destroy_comp_channel(id->recv_cq_channel);
1218 		if (id->send_cq_channel && (id->send_cq_channel != id->recv_cq_channel)) {
1219 			ibv_destroy_comp_channel(id->send_cq_channel);
1220 			id->send_cq_channel = NULL;
1221 		}
1222 		id->recv_cq_channel = NULL;
1223 	}
1224 }
1225 
1226 static int ucma_create_cqs(struct rdma_cm_id *id, uint32_t send_size, uint32_t recv_size)
1227 {
1228 	if (recv_size) {
1229 		id->recv_cq_channel = ibv_create_comp_channel(id->verbs);
1230 		if (!id->recv_cq_channel)
1231 			goto err;
1232 
1233 		id->recv_cq = ibv_create_cq(id->verbs, recv_size,
1234 					    id, id->recv_cq_channel, 0);
1235 		if (!id->recv_cq)
1236 			goto err;
1237 	}
1238 
1239 	if (send_size) {
1240 		id->send_cq_channel = ibv_create_comp_channel(id->verbs);
1241 		if (!id->send_cq_channel)
1242 			goto err;
1243 
1244 		id->send_cq = ibv_create_cq(id->verbs, send_size,
1245 					    id, id->send_cq_channel, 0);
1246 		if (!id->send_cq)
1247 			goto err;
1248 	}
1249 
1250 	return 0;
1251 err:
1252 	ucma_destroy_cqs(id);
1253 	return ERR(ENOMEM);
1254 }
1255 
1256 int rdma_create_srq_ex(struct rdma_cm_id *id, struct ibv_srq_init_attr_ex *attr)
1257 {
1258 	struct cma_id_private *id_priv;
1259 	struct ibv_srq *srq;
1260 	int ret;
1261 
1262 	id_priv = container_of(id, struct cma_id_private, id);
1263 	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE))
1264 		return ERR(EINVAL);
1265 
1266 	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_PD) || !attr->pd) {
1267 		attr->pd = id->pd;
1268 		attr->comp_mask |= IBV_SRQ_INIT_ATTR_PD;
1269 	}
1270 
1271 	if (attr->srq_type == IBV_SRQT_XRC) {
1272 		if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || !attr->xrcd) {
1273 			attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1274 			if (!attr->xrcd)
1275 				return -1;
1276 		}
1277 		if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || !attr->cq) {
1278 			ret = ucma_create_cqs(id, 0, attr->attr.max_wr);
1279 			if (ret)
1280 				return ret;
1281 			attr->cq = id->recv_cq;
1282 		}
1283 		attr->comp_mask |= IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ;
1284 	}
1285 
1286 	srq = ibv_create_srq_ex(id->verbs, attr);
1287 	if (!srq) {
1288 		ret = -1;
1289 		goto err;
1290 	}
1291 
1292 	if (!id->pd)
1293 		id->pd = attr->pd;
1294 	id->srq = srq;
1295 	return 0;
1296 err:
1297 	ucma_destroy_cqs(id);
1298 	return ret;
1299 }
1300 
1301 int rdma_create_srq(struct rdma_cm_id *id, struct ibv_pd *pd,
1302 		    struct ibv_srq_init_attr *attr)
1303 {
1304 	struct ibv_srq_init_attr_ex attr_ex;
1305 	int ret;
1306 
1307 	memcpy(&attr_ex, attr, sizeof(*attr));
1308 	attr_ex.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD;
1309 	if (id->qp_type == IBV_QPT_XRC_RECV) {
1310 		attr_ex.srq_type = IBV_SRQT_XRC;
1311 	} else {
1312 		attr_ex.srq_type = IBV_SRQT_BASIC;
1313 	}
1314 	attr_ex.pd = pd;
1315 	ret = rdma_create_srq_ex(id, &attr_ex);
1316 	memcpy(attr, &attr_ex, sizeof(*attr));
1317 	return ret;
1318 }
1319 
1320 void rdma_destroy_srq(struct rdma_cm_id *id)
1321 {
1322 	ibv_destroy_srq(id->srq);
1323 	id->srq = NULL;
1324 	ucma_destroy_cqs(id);
1325 }
1326 
1327 int rdma_create_qp_ex(struct rdma_cm_id *id,
1328 		      struct ibv_qp_init_attr_ex *attr)
1329 {
1330 	struct cma_id_private *id_priv;
1331 	struct ibv_qp *qp;
1332 	int ret;
1333 
1334 	if (id->qp)
1335 		return ERR(EINVAL);
1336 
1337 	id_priv = container_of(id, struct cma_id_private, id);
1338 	if (!(attr->comp_mask & IBV_QP_INIT_ATTR_PD) || !attr->pd) {
1339 		attr->comp_mask |= IBV_QP_INIT_ATTR_PD;
1340 		attr->pd = id->pd;
1341 	} else if (id->verbs != attr->pd->context)
1342 		return ERR(EINVAL);
1343 
1344 	if ((id->recv_cq && attr->recv_cq && id->recv_cq != attr->recv_cq) ||
1345 	    (id->send_cq && attr->send_cq && id->send_cq != attr->send_cq))
1346 		return ERR(EINVAL);
1347 
1348 	if (id->qp_type == IBV_QPT_XRC_RECV) {
1349 		if (!(attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) || !attr->xrcd) {
1350 			attr->xrcd = ucma_get_xrcd(id_priv->cma_dev);
1351 			if (!attr->xrcd)
1352 				return -1;
1353 			attr->comp_mask |= IBV_QP_INIT_ATTR_XRCD;
1354 		}
1355 	}
1356 
1357 	ret = ucma_create_cqs(id, attr->send_cq || id->send_cq ? 0 : attr->cap.max_send_wr,
1358 				  attr->recv_cq || id->recv_cq ? 0 : attr->cap.max_recv_wr);
1359 	if (ret)
1360 		return ret;
1361 
1362 	if (!attr->send_cq)
1363 		attr->send_cq = id->send_cq;
1364 	if (!attr->recv_cq)
1365 		attr->recv_cq = id->recv_cq;
1366 	if (id->srq && !attr->srq)
1367 		attr->srq = id->srq;
1368 	qp = ibv_create_qp_ex(id->verbs, attr);
1369 	if (!qp) {
1370 		ret = ERR(ENOMEM);
1371 		goto err1;
1372 	}
1373 
1374 	if (ucma_is_ud_qp(id->qp_type))
1375 		ret = ucma_init_ud_qp(id_priv, qp);
1376 	else
1377 		ret = ucma_init_conn_qp(id_priv, qp);
1378 	if (ret)
1379 		goto err2;
1380 
1381 	id->pd = qp->pd;
1382 	id->qp = qp;
1383 	return 0;
1384 err2:
1385 	ibv_destroy_qp(qp);
1386 err1:
1387 	ucma_destroy_cqs(id);
1388 	return ret;
1389 }
1390 
1391 int rdma_create_qp(struct rdma_cm_id *id, struct ibv_pd *pd,
1392 		   struct ibv_qp_init_attr *qp_init_attr)
1393 {
1394 	struct ibv_qp_init_attr_ex attr_ex;
1395 	int ret;
1396 
1397 	memcpy(&attr_ex, qp_init_attr, sizeof(*qp_init_attr));
1398 	attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
1399 	attr_ex.pd = pd ? pd : id->pd;
1400 	ret = rdma_create_qp_ex(id, &attr_ex);
1401 	memcpy(qp_init_attr, &attr_ex, sizeof(*qp_init_attr));
1402 	return ret;
1403 }
1404 
1405 void rdma_destroy_qp(struct rdma_cm_id *id)
1406 {
1407 	ibv_destroy_qp(id->qp);
1408 	id->qp = NULL;
1409 	ucma_destroy_cqs(id);
1410 }
1411 
1412 static int ucma_valid_param(struct cma_id_private *id_priv,
1413 			    struct rdma_conn_param *param)
1414 {
1415 	if (id_priv->id.ps != RDMA_PS_TCP)
1416 		return 0;
1417 
1418 	if (!id_priv->id.qp && !param)
1419 		goto err;
1420 
1421 	if (!param)
1422 		return 0;
1423 
1424 	if ((param->responder_resources != RDMA_MAX_RESP_RES) &&
1425 	    (param->responder_resources > id_priv->cma_dev->max_responder_resources))
1426 		goto err;
1427 
1428 	if ((param->initiator_depth != RDMA_MAX_INIT_DEPTH) &&
1429 	    (param->initiator_depth > id_priv->cma_dev->max_initiator_depth))
1430 		goto err;
1431 
1432 	return 0;
1433 err:
1434 	return ERR(EINVAL);
1435 }
1436 
1437 static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv,
1438 					 struct ucma_abi_conn_param *dst,
1439 					 struct rdma_conn_param *src,
1440 					 uint32_t qp_num, uint8_t srq)
1441 {
1442 	dst->qp_num = qp_num;
1443 	dst->srq = srq;
1444 	dst->responder_resources = id_priv->responder_resources;
1445 	dst->initiator_depth = id_priv->initiator_depth;
1446 	dst->valid = 1;
1447 
1448 	if (id_priv->connect_len) {
1449 		memcpy(dst->private_data, id_priv->connect, id_priv->connect_len);
1450 		dst->private_data_len = id_priv->connect_len;
1451 	}
1452 
1453 	if (src) {
1454 		dst->flow_control = src->flow_control;
1455 		dst->retry_count = src->retry_count;
1456 		dst->rnr_retry_count = src->rnr_retry_count;
1457 
1458 		if (src->private_data && src->private_data_len) {
1459 			memcpy(dst->private_data + dst->private_data_len,
1460 			       src->private_data, src->private_data_len);
1461 			dst->private_data_len += src->private_data_len;
1462 		}
1463 	} else {
1464 		dst->retry_count = 7;
1465 		dst->rnr_retry_count = 7;
1466 	}
1467 }
1468 
1469 int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1470 {
1471 	struct ucma_abi_connect cmd;
1472 	struct cma_id_private *id_priv;
1473 	int ret;
1474 
1475 	id_priv = container_of(id, struct cma_id_private, id);
1476 	ret = ucma_valid_param(id_priv, conn_param);
1477 	if (ret)
1478 		return ret;
1479 
1480 	if (conn_param && conn_param->initiator_depth != RDMA_MAX_INIT_DEPTH)
1481 		id_priv->initiator_depth = conn_param->initiator_depth;
1482 	else
1483 		id_priv->initiator_depth = id_priv->cma_dev->max_initiator_depth;
1484 	if (conn_param && conn_param->responder_resources != RDMA_MAX_RESP_RES)
1485 		id_priv->responder_resources = conn_param->responder_resources;
1486 	else
1487 		id_priv->responder_resources = id_priv->cma_dev->max_responder_resources;
1488 
1489 	CMA_INIT_CMD(&cmd, sizeof cmd, CONNECT);
1490 	cmd.id = id_priv->handle;
1491 	if (id->qp) {
1492 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1493 					     conn_param, id->qp->qp_num,
1494 					     (id->qp->srq != NULL));
1495 	} else if (conn_param) {
1496 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1497 					     conn_param, conn_param->qp_num,
1498 					     conn_param->srq);
1499 	} else {
1500 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1501 					     conn_param, 0, 0);
1502 	}
1503 
1504 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1505 	if (ret != sizeof cmd)
1506 		return (ret >= 0) ? ERR(ENODATA) : -1;
1507 
1508 	if (id_priv->connect_len) {
1509 		free(id_priv->connect);
1510 		id_priv->connect_len = 0;
1511 	}
1512 
1513 	return ucma_complete(id);
1514 }
1515 
1516 int rdma_listen(struct rdma_cm_id *id, int backlog)
1517 {
1518 	struct ucma_abi_listen cmd;
1519 	struct cma_id_private *id_priv;
1520 	int ret;
1521 
1522 	CMA_INIT_CMD(&cmd, sizeof cmd, LISTEN);
1523 	id_priv = container_of(id, struct cma_id_private, id);
1524 	cmd.id = id_priv->handle;
1525 	cmd.backlog = backlog;
1526 
1527 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1528 	if (ret != sizeof cmd)
1529 		return (ret >= 0) ? ERR(ENODATA) : -1;
1530 
1531 	if (af_ib_support)
1532 		return ucma_query_addr(id);
1533 	else
1534 		return ucma_query_route(id);
1535 }
1536 
1537 int rdma_get_request(struct rdma_cm_id *listen, struct rdma_cm_id **id)
1538 {
1539 	struct cma_id_private *id_priv;
1540 	struct rdma_cm_event *event;
1541 	int ret;
1542 
1543 	id_priv = container_of(listen, struct cma_id_private, id);
1544 	if (!id_priv->sync)
1545 		return ERR(EINVAL);
1546 
1547 	if (listen->event) {
1548 		rdma_ack_cm_event(listen->event);
1549 		listen->event = NULL;
1550 	}
1551 
1552 	ret = rdma_get_cm_event(listen->channel, &event);
1553 	if (ret)
1554 		return ret;
1555 
1556 	if (event->status) {
1557 		ret = ERR(event->status);
1558 		goto err;
1559 	}
1560 
1561 	if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
1562 		ret = ERR(EINVAL);
1563 		goto err;
1564 	}
1565 
1566 	if (id_priv->qp_init_attr) {
1567 		struct ibv_qp_init_attr attr;
1568 
1569 		attr = *id_priv->qp_init_attr;
1570 		ret = rdma_create_qp(event->id, listen->pd, &attr);
1571 		if (ret)
1572 			goto err;
1573 	}
1574 
1575 	*id = event->id;
1576 	(*id)->event = event;
1577 	return 0;
1578 
1579 err:
1580 	listen->event = event;
1581 	return ret;
1582 }
1583 
1584 int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
1585 {
1586 	struct ucma_abi_accept cmd;
1587 	struct cma_id_private *id_priv;
1588 	int ret;
1589 
1590 	id_priv = container_of(id, struct cma_id_private, id);
1591 	ret = ucma_valid_param(id_priv, conn_param);
1592 	if (ret)
1593 		return ret;
1594 
1595 	if (!conn_param || conn_param->initiator_depth == RDMA_MAX_INIT_DEPTH) {
1596 		id_priv->initiator_depth = min(id_priv->initiator_depth,
1597 					       id_priv->cma_dev->max_initiator_depth);
1598 	} else {
1599 		id_priv->initiator_depth = conn_param->initiator_depth;
1600 	}
1601 	if (!conn_param || conn_param->responder_resources == RDMA_MAX_RESP_RES) {
1602 		id_priv->responder_resources = min(id_priv->responder_resources,
1603 						   id_priv->cma_dev->max_responder_resources);
1604 	} else {
1605 		id_priv->responder_resources = conn_param->responder_resources;
1606 	}
1607 
1608 	if (!ucma_is_ud_qp(id->qp_type)) {
1609 		ret = ucma_modify_qp_rtr(id, id_priv->responder_resources);
1610 		if (ret)
1611 			return ret;
1612 
1613 		ret = ucma_modify_qp_rts(id, id_priv->initiator_depth);
1614 		if (ret)
1615 			return ret;
1616 	}
1617 
1618 	CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1619 	cmd.id = id_priv->handle;
1620 	cmd.uid = (uintptr_t) id_priv;
1621 	if (id->qp)
1622 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1623 					     conn_param, id->qp->qp_num,
1624 					     (id->qp->srq != NULL));
1625 	else
1626 		ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param,
1627 					     conn_param, conn_param->qp_num,
1628 					     conn_param->srq);
1629 
1630 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1631 	if (ret != sizeof cmd) {
1632 		ucma_modify_qp_err(id);
1633 		return (ret >= 0) ? ERR(ENODATA) : -1;
1634 	}
1635 
1636 	if (ucma_is_ud_qp(id->qp_type))
1637 		return 0;
1638 
1639 	return ucma_complete(id);
1640 }
1641 
1642 int rdma_reject(struct rdma_cm_id *id, const void *private_data,
1643 		uint8_t private_data_len)
1644 {
1645 	struct ucma_abi_reject cmd;
1646 	struct cma_id_private *id_priv;
1647 	int ret;
1648 
1649 	CMA_INIT_CMD(&cmd, sizeof cmd, REJECT);
1650 
1651 	id_priv = container_of(id, struct cma_id_private, id);
1652 	cmd.id = id_priv->handle;
1653 	if (private_data && private_data_len) {
1654 		memcpy(cmd.private_data, private_data, private_data_len);
1655 		cmd.private_data_len = private_data_len;
1656 	}
1657 
1658 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1659 	if (ret != sizeof cmd)
1660 		return (ret >= 0) ? ERR(ENODATA) : -1;
1661 
1662 	return 0;
1663 }
1664 
1665 int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event)
1666 {
1667 	struct ucma_abi_notify cmd;
1668 	struct cma_id_private *id_priv;
1669 	int ret;
1670 
1671 	CMA_INIT_CMD(&cmd, sizeof cmd, NOTIFY);
1672 
1673 	id_priv = container_of(id, struct cma_id_private, id);
1674 	cmd.id = id_priv->handle;
1675 	cmd.event = event;
1676 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1677 	if (ret != sizeof cmd)
1678 		return (ret >= 0) ? ERR(ENODATA) : -1;
1679 
1680 	return 0;
1681 }
1682 
1683 int ucma_shutdown(struct rdma_cm_id *id)
1684 {
1685 	switch (id->verbs->device->transport_type) {
1686 	case IBV_TRANSPORT_IB:
1687 		return ucma_modify_qp_err(id);
1688 	case IBV_TRANSPORT_IWARP:
1689 		return ucma_modify_qp_sqd(id);
1690 	default:
1691 		return ERR(EINVAL);
1692 	}
1693 }
1694 
1695 int rdma_disconnect(struct rdma_cm_id *id)
1696 {
1697 	struct ucma_abi_disconnect cmd;
1698 	struct cma_id_private *id_priv;
1699 	int ret;
1700 
1701 	ret = ucma_shutdown(id);
1702 	if (ret)
1703 		return ret;
1704 
1705 	CMA_INIT_CMD(&cmd, sizeof cmd, DISCONNECT);
1706 	id_priv = container_of(id, struct cma_id_private, id);
1707 	cmd.id = id_priv->handle;
1708 
1709 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1710 	if (ret != sizeof cmd)
1711 		return (ret >= 0) ? ERR(ENODATA) : -1;
1712 
1713 	return ucma_complete(id);
1714 }
1715 
1716 static int rdma_join_multicast2(struct rdma_cm_id *id, struct sockaddr *addr,
1717 				socklen_t addrlen, void *context)
1718 {
1719 	struct ucma_abi_create_id_resp resp;
1720 	struct cma_id_private *id_priv;
1721 	struct cma_multicast *mc, **pos;
1722 	int ret;
1723 
1724 	id_priv = container_of(id, struct cma_id_private, id);
1725 	mc = calloc(1, sizeof(*mc));
1726 	if (!mc)
1727 		return ERR(ENOMEM);
1728 
1729 	mc->context = context;
1730 	mc->id_priv = id_priv;
1731 	memcpy(&mc->addr, addr, addrlen);
1732 	if (pthread_cond_init(&mc->cond, NULL)) {
1733 		ret = -1;
1734 		goto err1;
1735 	}
1736 
1737 	pthread_mutex_lock(&id_priv->mut);
1738 	mc->next = id_priv->mc_list;
1739 	id_priv->mc_list = mc;
1740 	pthread_mutex_unlock(&id_priv->mut);
1741 
1742 	if (af_ib_support) {
1743 		struct ucma_abi_join_mcast cmd;
1744 
1745 		CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_MCAST, &resp, sizeof resp);
1746 		cmd.id = id_priv->handle;
1747 		memcpy(&cmd.addr, addr, addrlen);
1748 		cmd.addr_size = addrlen;
1749 		cmd.uid = (uintptr_t) mc;
1750 		cmd.reserved = 0;
1751 
1752 		ret = write(id->channel->fd, &cmd, sizeof cmd);
1753 		if (ret != sizeof cmd) {
1754 			ret = (ret >= 0) ? ERR(ENODATA) : -1;
1755 			goto err2;
1756 		}
1757 	} else {
1758 		struct ucma_abi_join_ip_mcast cmd;
1759 
1760 		CMA_INIT_CMD_RESP(&cmd, sizeof cmd, JOIN_IP_MCAST, &resp, sizeof resp);
1761 		cmd.id = id_priv->handle;
1762 		memcpy(&cmd.addr, addr, addrlen);
1763 		cmd.uid = (uintptr_t) mc;
1764 
1765 		ret = write(id->channel->fd, &cmd, sizeof cmd);
1766 		if (ret != sizeof cmd) {
1767 			ret = (ret >= 0) ? ERR(ENODATA) : -1;
1768 			goto err2;
1769 		}
1770 	}
1771 
1772 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1773 
1774 	mc->handle = resp.id;
1775 	return ucma_complete(id);
1776 
1777 err2:
1778 	pthread_mutex_lock(&id_priv->mut);
1779 	for (pos = &id_priv->mc_list; *pos != mc; pos = &(*pos)->next)
1780 		;
1781 	*pos = mc->next;
1782 	pthread_mutex_unlock(&id_priv->mut);
1783 err1:
1784 	free(mc);
1785 	return ret;
1786 }
1787 
/*
 * Join the multicast group @addr on @id, attaching @context to the
 * join.
 *
 * The address length comes from ucma_addrlen(); a length of 0 yields
 * ERR(EINVAL).  Otherwise the result of rdma_join_multicast2() is
 * returned.
 */
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
			void *context)
{
	int len = ucma_addrlen(addr);

	if (len == 0)
		return ERR(EINVAL);

	return rdma_join_multicast2(id, addr, len, context);
}
1799 
1800 int rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
1801 {
1802 	struct ucma_abi_destroy_id cmd;
1803 	struct ucma_abi_destroy_id_resp resp;
1804 	struct cma_id_private *id_priv;
1805 	struct cma_multicast *mc, **pos;
1806 	int ret, addrlen;
1807 
1808 	addrlen = ucma_addrlen(addr);
1809 	if (!addrlen)
1810 		return ERR(EINVAL);
1811 
1812 	id_priv = container_of(id, struct cma_id_private, id);
1813 	pthread_mutex_lock(&id_priv->mut);
1814 	for (pos = &id_priv->mc_list; *pos; pos = &(*pos)->next)
1815 		if (!memcmp(&(*pos)->addr, addr, addrlen))
1816 			break;
1817 
1818 	mc = *pos;
1819 	if (*pos)
1820 		*pos = mc->next;
1821 	pthread_mutex_unlock(&id_priv->mut);
1822 	if (!mc)
1823 		return ERR(EADDRNOTAVAIL);
1824 
1825 	if (id->qp)
1826 		ibv_detach_mcast(id->qp, &mc->mgid, mc->mlid);
1827 
1828 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, LEAVE_MCAST, &resp, sizeof resp);
1829 	cmd.id = mc->handle;
1830 
1831 	ret = write(id->channel->fd, &cmd, sizeof cmd);
1832 	if (ret != sizeof cmd) {
1833 		ret = (ret >= 0) ? ERR(ENODATA) : -1;
1834 		goto free;
1835 	}
1836 
1837 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
1838 
1839 	pthread_mutex_lock(&id_priv->mut);
1840 	while (mc->events_completed < resp.events_reported)
1841 		pthread_cond_wait(&mc->cond, &id_priv->mut);
1842 	pthread_mutex_unlock(&id_priv->mut);
1843 
1844 	ret = 0;
1845 free:
1846 	free(mc);
1847 	return ret;
1848 }
1849 
1850 static void ucma_complete_event(struct cma_id_private *id_priv)
1851 {
1852 	pthread_mutex_lock(&id_priv->mut);
1853 	id_priv->events_completed++;
1854 	pthread_cond_signal(&id_priv->cond);
1855 	pthread_mutex_unlock(&id_priv->mut);
1856 }
1857 
1858 static void ucma_complete_mc_event(struct cma_multicast *mc)
1859 {
1860 	pthread_mutex_lock(&mc->id_priv->mut);
1861 	mc->events_completed++;
1862 	pthread_cond_signal(&mc->cond);
1863 	mc->id_priv->events_completed++;
1864 	pthread_cond_signal(&mc->id_priv->cond);
1865 	pthread_mutex_unlock(&mc->id_priv->mut);
1866 }
1867 
1868 int rdma_ack_cm_event(struct rdma_cm_event *event)
1869 {
1870 	struct cma_event *evt;
1871 
1872 	if (!event)
1873 		return ERR(EINVAL);
1874 
1875 	evt = container_of(event, struct cma_event, event);
1876 
1877 	if (evt->mc)
1878 		ucma_complete_mc_event(evt->mc);
1879 	else
1880 		ucma_complete_event(evt->id_priv);
1881 	free(evt);
1882 	return 0;
1883 }
1884 
1885 static void ucma_process_addr_resolved(struct cma_event *evt)
1886 {
1887 	if (af_ib_support) {
1888 		evt->event.status = ucma_query_addr(&evt->id_priv->id);
1889 		if (!evt->event.status &&
1890 		    evt->id_priv->id.verbs->device->transport_type == IBV_TRANSPORT_IB)
1891 			evt->event.status = ucma_query_gid(&evt->id_priv->id);
1892 	} else {
1893 		evt->event.status = ucma_query_route(&evt->id_priv->id);
1894 	}
1895 
1896 	if (evt->event.status)
1897 		evt->event.event = RDMA_CM_EVENT_ADDR_ERROR;
1898 }
1899 
1900 static void ucma_process_route_resolved(struct cma_event *evt)
1901 {
1902 	if (evt->id_priv->id.verbs->device->transport_type != IBV_TRANSPORT_IB)
1903 		return;
1904 
1905 	if (af_ib_support)
1906 		evt->event.status = ucma_query_path(&evt->id_priv->id);
1907 	else
1908 		evt->event.status = ucma_query_route(&evt->id_priv->id);
1909 
1910 	if (evt->event.status)
1911 		evt->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
1912 }
1913 
1914 static int ucma_query_req_info(struct rdma_cm_id *id)
1915 {
1916 	int ret;
1917 
1918 	if (!af_ib_support)
1919 		return ucma_query_route(id);
1920 
1921 	ret = ucma_query_addr(id);
1922 	if (ret)
1923 		return ret;
1924 
1925 	ret = ucma_query_gid(id);
1926 	if (ret)
1927 		return ret;
1928 
1929 	ret = ucma_query_path(id);
1930 	if (ret)
1931 		return ret;
1932 
1933 	return 0;
1934 }
1935 
1936 static int ucma_process_conn_req(struct cma_event *evt,
1937 				 uint32_t handle)
1938 {
1939 	struct cma_id_private *id_priv;
1940 	int ret;
1941 
1942 	id_priv = ucma_alloc_id(evt->id_priv->id.channel,
1943 				evt->id_priv->id.context, evt->id_priv->id.ps,
1944 				evt->id_priv->id.qp_type);
1945 	if (!id_priv) {
1946 		ucma_destroy_kern_id(evt->id_priv->id.channel->fd, handle);
1947 		ret = ERR(ENOMEM);
1948 		goto err1;
1949 	}
1950 
1951 	evt->event.listen_id = &evt->id_priv->id;
1952 	evt->event.id = &id_priv->id;
1953 	id_priv->handle = handle;
1954 	ucma_insert_id(id_priv);
1955 	id_priv->initiator_depth = evt->event.param.conn.initiator_depth;
1956 	id_priv->responder_resources = evt->event.param.conn.responder_resources;
1957 
1958 	if (evt->id_priv->sync) {
1959 		ret = rdma_migrate_id(&id_priv->id, NULL);
1960 		if (ret)
1961 			goto err2;
1962 	}
1963 
1964 	ret = ucma_query_req_info(&id_priv->id);
1965 	if (ret)
1966 		goto err2;
1967 
1968 	return 0;
1969 
1970 err2:
1971 	rdma_destroy_id(&id_priv->id);
1972 err1:
1973 	ucma_complete_event(evt->id_priv);
1974 	return ret;
1975 }
1976 
1977 static int ucma_process_conn_resp(struct cma_id_private *id_priv)
1978 {
1979 	struct ucma_abi_accept cmd;
1980 	int ret;
1981 
1982 	ret = ucma_modify_qp_rtr(&id_priv->id, RDMA_MAX_RESP_RES);
1983 	if (ret)
1984 		goto err;
1985 
1986 	ret = ucma_modify_qp_rts(&id_priv->id, RDMA_MAX_INIT_DEPTH);
1987 	if (ret)
1988 		goto err;
1989 
1990 	CMA_INIT_CMD(&cmd, sizeof cmd, ACCEPT);
1991 	cmd.id = id_priv->handle;
1992 
1993 	ret = write(id_priv->id.channel->fd, &cmd, sizeof cmd);
1994 	if (ret != sizeof cmd) {
1995 		ret = (ret >= 0) ? ERR(ENODATA) : -1;
1996 		goto err;
1997 	}
1998 
1999 	return 0;
2000 err:
2001 	ucma_modify_qp_err(&id_priv->id);
2002 	return ret;
2003 }
2004 
2005 static int ucma_process_join(struct cma_event *evt)
2006 {
2007 	evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid;
2008 	evt->mc->mlid = evt->event.param.ud.ah_attr.dlid;
2009 
2010 	if (!evt->id_priv->id.qp)
2011 		return 0;
2012 
2013 	return rdma_seterrno(ibv_attach_mcast(evt->id_priv->id.qp,
2014 					      &evt->mc->mgid, evt->mc->mlid));
2015 }
2016 
2017 static void ucma_copy_conn_event(struct cma_event *event,
2018 				 struct ucma_abi_conn_param *src)
2019 {
2020 	struct rdma_conn_param *dst = &event->event.param.conn;
2021 
2022 	dst->private_data_len = src->private_data_len;
2023 	if (src->private_data_len) {
2024 		dst->private_data = &event->private_data;
2025 		memcpy(&event->private_data, src->private_data,
2026 		       src->private_data_len);
2027 	}
2028 
2029 	dst->responder_resources = src->responder_resources;
2030 	dst->initiator_depth = src->initiator_depth;
2031 	dst->flow_control = src->flow_control;
2032 	dst->retry_count = src->retry_count;
2033 	dst->rnr_retry_count = src->rnr_retry_count;
2034 	dst->srq = src->srq;
2035 	dst->qp_num = src->qp_num;
2036 }
2037 
2038 static void ucma_copy_ud_event(struct cma_event *event,
2039 			       struct ucma_abi_ud_param *src)
2040 {
2041 	struct rdma_ud_param *dst = &event->event.param.ud;
2042 
2043 	dst->private_data_len = src->private_data_len;
2044 	if (src->private_data_len) {
2045 		dst->private_data = &event->private_data;
2046 		memcpy(&event->private_data, src->private_data,
2047 		       src->private_data_len);
2048 	}
2049 
2050 	ibv_copy_ah_attr_from_kern(&dst->ah_attr, &src->ah_attr);
2051 	dst->qp_num = src->qp_num;
2052 	dst->qkey = src->qkey;
2053 }
2054 
/*
 * Fetch the next communication event from @channel.
 *
 * A cma_event wrapper is allocated, a GET_EVENT command is written to
 * the channel and the kernel's response is translated into the
 * user-visible event, with per-type post-processing in the switch
 * below.  Events that are not to be reported (unmatched ids, failed
 * connection-request setup, events arriving after a local connect
 * error) are dropped and the loop restarts at "retry" reusing the
 * same wrapper.
 *
 * On success 0 is returned and *@event points at the event embedded in
 * the wrapper; rdma_ack_cm_event() frees it.  Errors: the ucma_init()
 * result, ERR(EINVAL) when @event is NULL, ERR(ENOMEM) when allocation
 * fails, ERR(ENODATA) on a short write, -1 when write() fails.
 */
int rdma_get_cm_event(struct rdma_event_channel *channel,
		      struct rdma_cm_event **event)
{
	struct ucma_abi_event_resp resp;
	struct ucma_abi_get_event cmd;
	struct cma_event *evt;
	int ret;

	ret = ucma_init();
	if (ret)
		return ret;

	if (!event)
		return ERR(EINVAL);

	evt = malloc(sizeof(*evt));
	if (!evt)
		return ERR(ENOMEM);

retry:
	/* Start every attempt from a clean wrapper. */
	memset(evt, 0, sizeof(*evt));
	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, GET_EVENT, &resp, sizeof resp);
	ret = write(channel->fd, &cmd, sizeof cmd);
	if (ret != sizeof cmd) {
		free(evt);
		return (ret >= 0) ? ERR(ENODATA) : -1;
	}

	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);

	evt->event.event = resp.event;
	/*
	 * We should have a non-zero uid, except for connection requests.
	 * But a bug in older kernels can report a uid 0.  Work-around this
	 * issue by looking up the cma_id based on the kernel's id when the
	 * uid is 0 and we're processing a connection established event.
	 * In all other cases, if the uid is 0, we discard the event, like
	 * the kernel should have done.
	 */
	if (resp.uid) {
		evt->id_priv = (void *) (uintptr_t) resp.uid;
	} else {
		evt->id_priv = ucma_lookup_id(resp.id);
		if (!evt->id_priv) {
			syslog(LOG_WARNING, PFX "Warning: discarding unmatched "
				"event - rdma_destroy_id may hang.\n");
			goto retry;
		}
		if (resp.event != RDMA_CM_EVENT_ESTABLISHED) {
			/* Count the discarded event as completed for its id. */
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
	}
	evt->event.id = &evt->id_priv->id;
	evt->event.status = resp.status;

	switch (resp.event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		ucma_process_addr_resolved(evt);
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ucma_process_route_resolved(evt);
		break;
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		/* Here the uid identifies the listening id. */
		evt->id_priv = (void *) (uintptr_t) resp.uid;
		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
			ucma_copy_ud_event(evt, &resp.param.ud);
		else
			ucma_copy_conn_event(evt, &resp.param.conn);

		/* On failure the request is dropped and the next event fetched. */
		ret = ucma_process_conn_req(evt, resp.id);
		if (ret)
			goto retry;
		break;
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
		/*
		 * Reported to the caller as ESTABLISHED when the QP
		 * transitions and the accept succeed, as CONNECT_ERROR
		 * otherwise.
		 */
		ucma_copy_conn_event(evt, &resp.param.conn);
		evt->event.status = ucma_process_conn_resp(evt->id_priv);
		if (!evt->event.status)
			evt->event.event = RDMA_CM_EVENT_ESTABLISHED;
		else {
			evt->event.event = RDMA_CM_EVENT_CONNECT_ERROR;
			/* Later REJECTED/DISCONNECTED events for this id are dropped. */
			evt->id_priv->connect_error = 1;
		}
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		if (ucma_is_ud_qp(evt->id_priv->id.qp_type)) {
			ucma_copy_ud_event(evt, &resp.param.ud);
			break;
		}

		ucma_copy_conn_event(evt, &resp.param.conn);
		break;
	case RDMA_CM_EVENT_REJECTED:
		/* Already reported as CONNECT_ERROR; swallow this one. */
		if (evt->id_priv->connect_error) {
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
		ucma_copy_conn_event(evt, &resp.param.conn);
		ucma_modify_qp_err(evt->event.id);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		/* Already reported as CONNECT_ERROR; swallow this one. */
		if (evt->id_priv->connect_error) {
			ucma_complete_event(evt->id_priv);
			goto retry;
		}
		ucma_copy_conn_event(evt, &resp.param.conn);
		break;
	case RDMA_CM_EVENT_MULTICAST_JOIN:
		/* For multicast events the uid is the cma_multicast record. */
		evt->mc = (void *) (uintptr_t) resp.uid;
		evt->id_priv = evt->mc->id_priv;
		evt->event.id = &evt->id_priv->id;
		ucma_copy_ud_event(evt, &resp.param.ud);
		/* Hand back the context supplied when the group was joined. */
		evt->event.param.ud.private_data = evt->mc->context;
		evt->event.status = ucma_process_join(evt);
		if (evt->event.status)
			evt->event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
		break;
	case RDMA_CM_EVENT_MULTICAST_ERROR:
		/* For multicast events the uid is the cma_multicast record. */
		evt->mc = (void *) (uintptr_t) resp.uid;
		evt->id_priv = evt->mc->id_priv;
		evt->event.id = &evt->id_priv->id;
		evt->event.param.ud.private_data = evt->mc->context;
		break;
	default:
		evt->id_priv = (void *) (uintptr_t) resp.uid;
		evt->event.id = &evt->id_priv->id;
		evt->event.status = resp.status;
		if (ucma_is_ud_qp(evt->id_priv->id.qp_type))
			ucma_copy_ud_event(evt, &resp.param.ud);
		else
			ucma_copy_conn_event(evt, &resp.param.conn);
		break;
	}

	*event = &evt->event;
	return 0;
}
2192 
2193 const char *rdma_event_str(enum rdma_cm_event_type event)
2194 {
2195 	switch (event) {
2196 	case RDMA_CM_EVENT_ADDR_RESOLVED:
2197 		return "RDMA_CM_EVENT_ADDR_RESOLVED";
2198 	case RDMA_CM_EVENT_ADDR_ERROR:
2199 		return "RDMA_CM_EVENT_ADDR_ERROR";
2200 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
2201 		return "RDMA_CM_EVENT_ROUTE_RESOLVED";
2202 	case RDMA_CM_EVENT_ROUTE_ERROR:
2203 		return "RDMA_CM_EVENT_ROUTE_ERROR";
2204 	case RDMA_CM_EVENT_CONNECT_REQUEST:
2205 		return "RDMA_CM_EVENT_CONNECT_REQUEST";
2206 	case RDMA_CM_EVENT_CONNECT_RESPONSE:
2207 		return "RDMA_CM_EVENT_CONNECT_RESPONSE";
2208 	case RDMA_CM_EVENT_CONNECT_ERROR:
2209 		return "RDMA_CM_EVENT_CONNECT_ERROR";
2210 	case RDMA_CM_EVENT_UNREACHABLE:
2211 		return "RDMA_CM_EVENT_UNREACHABLE";
2212 	case RDMA_CM_EVENT_REJECTED:
2213 		return "RDMA_CM_EVENT_REJECTED";
2214 	case RDMA_CM_EVENT_ESTABLISHED:
2215 		return "RDMA_CM_EVENT_ESTABLISHED";
2216 	case RDMA_CM_EVENT_DISCONNECTED:
2217 		return "RDMA_CM_EVENT_DISCONNECTED";
2218 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
2219 		return "RDMA_CM_EVENT_DEVICE_REMOVAL";
2220 	case RDMA_CM_EVENT_MULTICAST_JOIN:
2221 		return "RDMA_CM_EVENT_MULTICAST_JOIN";
2222 	case RDMA_CM_EVENT_MULTICAST_ERROR:
2223 		return "RDMA_CM_EVENT_MULTICAST_ERROR";
2224 	case RDMA_CM_EVENT_ADDR_CHANGE:
2225 		return "RDMA_CM_EVENT_ADDR_CHANGE";
2226 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
2227 		return "RDMA_CM_EVENT_TIMEWAIT_EXIT";
2228 	default:
2229 		return "UNKNOWN EVENT";
2230 	}
2231 }
2232 
2233 int rdma_set_option(struct rdma_cm_id *id, int level, int optname,
2234 		    void *optval, size_t optlen)
2235 {
2236 	struct ucma_abi_set_option cmd;
2237 	struct cma_id_private *id_priv;
2238 	int ret;
2239 
2240 	CMA_INIT_CMD(&cmd, sizeof cmd, SET_OPTION);
2241 	id_priv = container_of(id, struct cma_id_private, id);
2242 	cmd.id = id_priv->handle;
2243 	cmd.optval = (uintptr_t) optval;
2244 	cmd.level = level;
2245 	cmd.optname = optname;
2246 	cmd.optlen = optlen;
2247 
2248 	ret = write(id->channel->fd, &cmd, sizeof cmd);
2249 	if (ret != sizeof cmd)
2250 		return (ret >= 0) ? ERR(ENODATA) : -1;
2251 
2252 	return 0;
2253 }
2254 
2255 int rdma_migrate_id(struct rdma_cm_id *id, struct rdma_event_channel *channel)
2256 {
2257 	struct ucma_abi_migrate_resp resp;
2258 	struct ucma_abi_migrate_id cmd;
2259 	struct cma_id_private *id_priv;
2260 	int ret, sync;
2261 
2262 	id_priv = container_of(id, struct cma_id_private, id);
2263 	if (id_priv->sync && !channel)
2264 		return ERR(EINVAL);
2265 
2266 	if ((sync = (channel == NULL))) {
2267 		channel = rdma_create_event_channel();
2268 		if (!channel)
2269 			return -1;
2270 	}
2271 
2272 	CMA_INIT_CMD_RESP(&cmd, sizeof cmd, MIGRATE_ID, &resp, sizeof resp);
2273 	cmd.id = id_priv->handle;
2274 	cmd.fd = id->channel->fd;
2275 
2276 	ret = write(channel->fd, &cmd, sizeof cmd);
2277 	if (ret != sizeof cmd) {
2278 		if (sync)
2279 			rdma_destroy_event_channel(channel);
2280 		return (ret >= 0) ? ERR(ENODATA) : -1;
2281 	}
2282 
2283 	VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof resp);
2284 
2285 	if (id_priv->sync) {
2286 		if (id->event) {
2287 			rdma_ack_cm_event(id->event);
2288 			id->event = NULL;
2289 		}
2290 		rdma_destroy_event_channel(id->channel);
2291 	}
2292 
2293 	/*
2294 	 * Eventually if we want to support migrating channels while events are
2295 	 * being processed on the current channel, we need to block here while
2296 	 * there are any outstanding events on the current channel for this id
2297 	 * to prevent the user from processing events for this id on the old
2298 	 * channel after this call returns.
2299 	 */
2300 	pthread_mutex_lock(&id_priv->mut);
2301 	id_priv->sync = sync;
2302 	id->channel = channel;
2303 	while (id_priv->events_completed < resp.events_reported)
2304 		pthread_cond_wait(&id_priv->cond, &id_priv->mut);
2305 	pthread_mutex_unlock(&id_priv->mut);
2306 
2307 	return 0;
2308 }
2309 
2310 static int ucma_passive_ep(struct rdma_cm_id *id, struct rdma_addrinfo *res,
2311 			   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2312 {
2313 	struct cma_id_private *id_priv;
2314 	int ret;
2315 
2316 	if (af_ib_support)
2317 		ret = rdma_bind_addr2(id, res->ai_src_addr, res->ai_src_len);
2318 	else
2319 		ret = rdma_bind_addr(id, res->ai_src_addr);
2320 	if (ret)
2321 		return ret;
2322 
2323 	id_priv = container_of(id, struct cma_id_private, id);
2324 	if (pd)
2325 		id->pd = pd;
2326 
2327 	if (qp_init_attr) {
2328 		id_priv->qp_init_attr = malloc(sizeof(*qp_init_attr));
2329 		if (!id_priv->qp_init_attr)
2330 			return ERR(ENOMEM);
2331 
2332 		*id_priv->qp_init_attr = *qp_init_attr;
2333 		id_priv->qp_init_attr->qp_type = res->ai_qp_type;
2334 	}
2335 
2336 	return 0;
2337 }
2338 
2339 int rdma_create_ep(struct rdma_cm_id **id, struct rdma_addrinfo *res,
2340 		   struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
2341 {
2342 	struct rdma_cm_id *cm_id;
2343 	struct cma_id_private *id_priv;
2344 	int ret;
2345 
2346 	ret = rdma_create_id2(NULL, &cm_id, NULL, res->ai_port_space, res->ai_qp_type);
2347 	if (ret)
2348 		return ret;
2349 
2350 	if (res->ai_flags & RAI_PASSIVE) {
2351 		ret = ucma_passive_ep(cm_id, res, pd, qp_init_attr);
2352 		if (ret)
2353 			goto err;
2354 		goto out;
2355 	}
2356 
2357 	if (af_ib_support)
2358 		ret = rdma_resolve_addr2(cm_id, res->ai_src_addr, res->ai_src_len,
2359 					 res->ai_dst_addr, res->ai_dst_len, 2000);
2360 	else
2361 		ret = rdma_resolve_addr(cm_id, res->ai_src_addr, res->ai_dst_addr, 2000);
2362 	if (ret)
2363 		goto err;
2364 
2365 	if (res->ai_route_len) {
2366 		ret = rdma_set_option(cm_id, RDMA_OPTION_IB, RDMA_OPTION_IB_PATH,
2367 				      res->ai_route, res->ai_route_len);
2368 		if (!ret)
2369 			ret = ucma_complete(cm_id);
2370 	} else {
2371 		ret = rdma_resolve_route(cm_id, 2000);
2372 	}
2373 	if (ret)
2374 		goto err;
2375 
2376 	if (qp_init_attr) {
2377 		qp_init_attr->qp_type = res->ai_qp_type;
2378 		ret = rdma_create_qp(cm_id, pd, qp_init_attr);
2379 		if (ret)
2380 			goto err;
2381 	}
2382 
2383 	if (res->ai_connect_len) {
2384 		id_priv = container_of(cm_id, struct cma_id_private, id);
2385 		id_priv->connect = malloc(res->ai_connect_len);
2386 		if (!id_priv->connect) {
2387 			ret = ERR(ENOMEM);
2388 			goto err;
2389 		}
2390 		memcpy(id_priv->connect, res->ai_connect, res->ai_connect_len);
2391 		id_priv->connect_len = res->ai_connect_len;
2392 	}
2393 
2394 out:
2395 	*id = cm_id;
2396 	return 0;
2397 
2398 err:
2399 	rdma_destroy_ep(cm_id);
2400 	return ret;
2401 }
2402 
2403 void rdma_destroy_ep(struct rdma_cm_id *id)
2404 {
2405 	struct cma_id_private *id_priv;
2406 
2407 	if (id->qp)
2408 		rdma_destroy_qp(id);
2409 
2410 	if (id->srq)
2411 		rdma_destroy_srq(id);
2412 
2413 	id_priv = container_of(id, struct cma_id_private, id);
2414 	if (id_priv->qp_init_attr)
2415 		free(id_priv->qp_init_attr);
2416 
2417 	rdma_destroy_id(id);
2418 }
2419 
2420 int ucma_max_qpsize(struct rdma_cm_id *id)
2421 {
2422 	struct cma_id_private *id_priv;
2423 	int i, max_size = 0;
2424 
2425 	id_priv = container_of(id, struct cma_id_private, id);
2426 	if (id && id_priv->cma_dev) {
2427 		max_size = id_priv->cma_dev->max_qpsize;
2428 	} else {
2429 		ucma_init_all();
2430 		for (i = 0; i < cma_dev_cnt; i++) {
2431 			if (!max_size || max_size > cma_dev_array[i].max_qpsize)
2432 				max_size = cma_dev_array[i].max_qpsize;
2433 		}
2434 	}
2435 	return max_size;
2436 }
2437 
2438 __be16 ucma_get_port(struct sockaddr *addr)
2439 {
2440 	switch (addr->sa_family) {
2441 	case AF_INET:
2442 		return ((struct sockaddr_in *) addr)->sin_port;
2443 	case AF_INET6:
2444 		return ((struct sockaddr_in6 *) addr)->sin6_port;
2445 	case AF_IB:
2446 		return htobe16((uint16_t) be64toh(((struct sockaddr_ib *) addr)->sib_sid));
2447 	default:
2448 		return 0;
2449 	}
2450 }
2451 
2452 __be16 rdma_get_src_port(struct rdma_cm_id *id)
2453 {
2454 	return ucma_get_port(&id->route.addr.src_addr);
2455 }
2456 
2457 __be16 rdma_get_dst_port(struct rdma_cm_id *id)
2458 {
2459 	return ucma_get_port(&id->route.addr.dst_addr);
2460 }
2461 
2462