xref: /linux/drivers/infiniband/core/ucma.c (revision 15a1fbdcfb519c2bd291ed01c6c94e0b89537a77)
1 /*
2  * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *	copyright notice, this list of conditions and the following
16  *	disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *	copyright notice, this list of conditions and the following
20  *	disclaimer in the documentation and/or other materials
21  *	provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <linux/completion.h>
34 #include <linux/file.h>
35 #include <linux/mutex.h>
36 #include <linux/poll.h>
37 #include <linux/sched.h>
38 #include <linux/idr.h>
39 #include <linux/in.h>
40 #include <linux/in6.h>
41 #include <linux/miscdevice.h>
42 #include <linux/slab.h>
43 #include <linux/sysctl.h>
44 #include <linux/module.h>
45 #include <linux/nsproxy.h>
46 
47 #include <linux/nospec.h>
48 
49 #include <rdma/rdma_user_cm.h>
50 #include <rdma/ib_marshall.h>
51 #include <rdma/rdma_cm.h>
52 #include <rdma/rdma_cm_ib.h>
53 #include <rdma/ib_addr.h>
54 #include <rdma/ib.h>
55 #include <rdma/rdma_netlink.h>
56 #include "core_priv.h"
57 
MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
MODULE_LICENSE("Dual BSD/GPL");

/*
 * Upper bound applied by ucma_listen() to the backlog requested by user
 * space; also the value used when the request is zero or too large.
 * Exposed through the "max_backlog" sysctl entry defined below.
 */
static unsigned int max_backlog = 1024;

/*
 * NOTE(review): presumably holds the handle returned when ucma_ctl_table is
 * registered elsewhere in this file -- confirm against the init/exit code.
 */
static struct ctl_table_header *ucma_ctl_table_hdr;
/*
 * Sysctl table with a single read/write (0644) integer entry backed by
 * max_backlog.
 * NOTE(review): max_backlog is unsigned while the handler is proc_dointvec;
 * confirm that writing a negative value is not a concern.
 */
static struct ctl_table ucma_ctl_table[] = {
	{
		.procname	= "max_backlog",
		.data		= &max_backlog,
		.maxlen		= sizeof max_backlog,
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
75 
/*
 * State shared by all contexts that belong to one ucma file
 * (presumably one instance per open of the device -- confirm).
 */
struct ucma_file {
	/* held around updates of ctx_list and event_list in this file */
	struct mutex		mut;
	/* backing file; its f_flags are checked for O_NONBLOCK in ucma_get_event() */
	struct file		*filp;
	/* ucma_context objects, linked through ucma_context.list */
	struct list_head	ctx_list;
	/* pending ucma_event objects not yet read by user space */
	struct list_head	event_list;
	/* woken by ucma_event_handler() when an event is queued */
	wait_queue_head_t	poll_wait;
	/* workqueue running ucma_close_id() / ucma_close_event_id() */
	struct workqueue_struct	*close_wq;
};
84 
/* User-space visible wrapper around one rdma_cm_id. */
struct ucma_context {
	/* index of this context in ctx_table, assigned by xa_alloc() */
	u32			id;
	/* completed by ucma_put_ctx() when ref drops to zero */
	struct completion	comp;
	/* starts at 1 in ucma_alloc_ctx(); raised by ucma_get_ctx() */
	atomic_t		ref;
	/* events delivered by ucma_get_event(); returned by ucma_free_ctx() */
	int			events_reported;
	/*
	 * remaining connect requests that may be queued: decremented in
	 * ucma_event_handler(), incremented again in ucma_get_event()
	 */
	int			backlog;

	struct ucma_file	*file;
	/* NULL until the id is attached; lookups reject a NULL cm_id */
	struct rdma_cm_id	*cm_id;
	/* user-space cookie copied into resp.uid of reported events */
	u64			uid;

	/* link in ucma_file.ctx_list */
	struct list_head	list;
	/* ucma_multicast objects, linked through ucma_multicast.list */
	struct list_head	mc_list;
	/* mark that device is in process of destroying the internal HW
	 * resources, protected by the ctx_table lock
	 */
	int			closing;
	/* sync between removal event and id destroy, protected by file mut */
	int			destroying;
	/* runs ucma_close_id() on file->close_wq */
	struct work_struct	close_work;
};
106 
/* One multicast membership that belongs to a ucma_context. */
struct ucma_multicast {
	/* owning context */
	struct ucma_context	*ctx;
	/* index reserved in multicast_table by ucma_alloc_multicast() */
	u32			id;
	/* multicast events delivered by ucma_get_event() */
	int			events_reported;

	/* user-space cookie copied into resp.uid of multicast events */
	u64			uid;
	/* NOTE(review): not used in the visible part of the file -- presumably the requested join state */
	u8			join_state;
	/* link in ucma_context.mc_list */
	struct list_head	list;
	/* NOTE(review): not used in the visible part of the file -- presumably the multicast address */
	struct sockaddr_storage	addr;
};
117 
/* One CM event queued on ucma_file.event_list until user space reads it. */
struct ucma_event {
	/* context the event was reported against */
	struct ucma_context	*ctx;
	/* set only for multicast join/error events, otherwise NULL */
	struct ucma_multicast	*mc;
	/* link in ucma_file.event_list */
	struct list_head	list;
	/* cm_id that generated the event */
	struct rdma_cm_id	*cm_id;
	/* payload copied to user space by ucma_get_event() */
	struct rdma_ucm_event_resp resp;
	/* used by ucma_removal_event_handler() to run ucma_close_event_id() */
	struct work_struct	close_work;
};
126 
/* Maps ucma_context.id to its ucma_context (filled by ucma_alloc_ctx()). */
static DEFINE_XARRAY_ALLOC(ctx_table);
/*
 * Allocates ucma_multicast.id values; ucma_alloc_multicast() reserves the
 * index with a NULL entry.
 */
static DEFINE_XARRAY_ALLOC(multicast_table);

/* Forward declaration; the definition is not in the visible part of the file. */
static const struct file_operations ucma_fops;
131 
132 static inline struct ucma_context *_ucma_find_context(int id,
133 						      struct ucma_file *file)
134 {
135 	struct ucma_context *ctx;
136 
137 	ctx = xa_load(&ctx_table, id);
138 	if (!ctx)
139 		ctx = ERR_PTR(-ENOENT);
140 	else if (ctx->file != file || !ctx->cm_id)
141 		ctx = ERR_PTR(-EINVAL);
142 	return ctx;
143 }
144 
145 static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
146 {
147 	struct ucma_context *ctx;
148 
149 	xa_lock(&ctx_table);
150 	ctx = _ucma_find_context(id, file);
151 	if (!IS_ERR(ctx)) {
152 		if (ctx->closing)
153 			ctx = ERR_PTR(-EIO);
154 		else
155 			atomic_inc(&ctx->ref);
156 	}
157 	xa_unlock(&ctx_table);
158 	return ctx;
159 }
160 
161 static void ucma_put_ctx(struct ucma_context *ctx)
162 {
163 	if (atomic_dec_and_test(&ctx->ref))
164 		complete(&ctx->comp);
165 }
166 
167 /*
168  * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the
169  * CM_ID is bound.
170  */
171 static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id)
172 {
173 	struct ucma_context *ctx = ucma_get_ctx(file, id);
174 
175 	if (IS_ERR(ctx))
176 		return ctx;
177 	if (!ctx->cm_id->device) {
178 		ucma_put_ctx(ctx);
179 		return ERR_PTR(-EINVAL);
180 	}
181 	return ctx;
182 }
183 
184 static void ucma_close_event_id(struct work_struct *work)
185 {
186 	struct ucma_event *uevent_close =  container_of(work, struct ucma_event, close_work);
187 
188 	rdma_destroy_id(uevent_close->cm_id);
189 	kfree(uevent_close);
190 }
191 
192 static void ucma_close_id(struct work_struct *work)
193 {
194 	struct ucma_context *ctx =  container_of(work, struct ucma_context, close_work);
195 
196 	/* once all inflight tasks are finished, we close all underlying
197 	 * resources. The context is still alive till its explicit destryoing
198 	 * by its creator.
199 	 */
200 	ucma_put_ctx(ctx);
201 	wait_for_completion(&ctx->comp);
202 	/* No new events will be generated after destroying the id. */
203 	rdma_destroy_id(ctx->cm_id);
204 }
205 
206 static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
207 {
208 	struct ucma_context *ctx;
209 
210 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
211 	if (!ctx)
212 		return NULL;
213 
214 	INIT_WORK(&ctx->close_work, ucma_close_id);
215 	atomic_set(&ctx->ref, 1);
216 	init_completion(&ctx->comp);
217 	INIT_LIST_HEAD(&ctx->mc_list);
218 	ctx->file = file;
219 
220 	if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
221 		goto error;
222 
223 	list_add_tail(&ctx->list, &file->ctx_list);
224 	return ctx;
225 
226 error:
227 	kfree(ctx);
228 	return NULL;
229 }
230 
231 static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
232 {
233 	struct ucma_multicast *mc;
234 
235 	mc = kzalloc(sizeof(*mc), GFP_KERNEL);
236 	if (!mc)
237 		return NULL;
238 
239 	mc->ctx = ctx;
240 	if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL))
241 		goto error;
242 
243 	list_add_tail(&mc->list, &ctx->mc_list);
244 	return mc;
245 
246 error:
247 	kfree(mc);
248 	return NULL;
249 }
250 
251 static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
252 				 struct rdma_conn_param *src)
253 {
254 	if (src->private_data_len)
255 		memcpy(dst->private_data, src->private_data,
256 		       src->private_data_len);
257 	dst->private_data_len = src->private_data_len;
258 	dst->responder_resources =src->responder_resources;
259 	dst->initiator_depth = src->initiator_depth;
260 	dst->flow_control = src->flow_control;
261 	dst->retry_count = src->retry_count;
262 	dst->rnr_retry_count = src->rnr_retry_count;
263 	dst->srq = src->srq;
264 	dst->qp_num = src->qp_num;
265 }
266 
267 static void ucma_copy_ud_event(struct ib_device *device,
268 			       struct rdma_ucm_ud_param *dst,
269 			       struct rdma_ud_param *src)
270 {
271 	if (src->private_data_len)
272 		memcpy(dst->private_data, src->private_data,
273 		       src->private_data_len);
274 	dst->private_data_len = src->private_data_len;
275 	ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr);
276 	dst->qp_num = src->qp_num;
277 	dst->qkey = src->qkey;
278 }
279 
280 static void ucma_set_event_context(struct ucma_context *ctx,
281 				   struct rdma_cm_event *event,
282 				   struct ucma_event *uevent)
283 {
284 	uevent->ctx = ctx;
285 	switch (event->event) {
286 	case RDMA_CM_EVENT_MULTICAST_JOIN:
287 	case RDMA_CM_EVENT_MULTICAST_ERROR:
288 		uevent->mc = (struct ucma_multicast *)
289 			     event->param.ud.private_data;
290 		uevent->resp.uid = uevent->mc->uid;
291 		uevent->resp.id = uevent->mc->id;
292 		break;
293 	default:
294 		uevent->resp.uid = ctx->uid;
295 		uevent->resp.id = ctx->id;
296 		break;
297 	}
298 }
299 
300 /* Called with file->mut locked for the relevant context. */
301 static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
302 {
303 	struct ucma_context *ctx = cm_id->context;
304 	struct ucma_event *con_req_eve;
305 	int event_found = 0;
306 
307 	if (ctx->destroying)
308 		return;
309 
310 	/* only if context is pointing to cm_id that it owns it and can be
311 	 * queued to be closed, otherwise that cm_id is an inflight one that
312 	 * is part of that context event list pending to be detached and
313 	 * reattached to its new context as part of ucma_get_event,
314 	 * handled separately below.
315 	 */
316 	if (ctx->cm_id == cm_id) {
317 		xa_lock(&ctx_table);
318 		ctx->closing = 1;
319 		xa_unlock(&ctx_table);
320 		queue_work(ctx->file->close_wq, &ctx->close_work);
321 		return;
322 	}
323 
324 	list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
325 		if (con_req_eve->cm_id == cm_id &&
326 		    con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
327 			list_del(&con_req_eve->list);
328 			INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
329 			queue_work(ctx->file->close_wq, &con_req_eve->close_work);
330 			event_found = 1;
331 			break;
332 		}
333 	}
334 	if (!event_found)
335 		pr_err("ucma_removal_event_handler: warning: connect request event wasn't found\n");
336 }
337 
/*
 * Event callback registered with __rdma_create_id() in ucma_create_id().
 *
 * Builds a ucma_event from @event and queues it on the event list of the
 * file owning cm_id->context, waking up any waiter on file->poll_wait.
 *
 * Returns 0 in the normal case, -ENOMEM when a connect request arrives while
 * the listener's backlog is exhausted, and 1 when the event allocation fails
 * for a connect request (0 for any other event in that case).
 * NOTE(review): a non-zero return presumably tells the RDMA CM to destroy
 * the cm_id -- confirm against the rdma_cm event handler contract.
 */
static int ucma_event_handler(struct rdma_cm_id *cm_id,
			      struct rdma_cm_event *event)
{
	struct ucma_event *uevent;
	struct ucma_context *ctx = cm_id->context;
	int ret = 0;

	uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
	if (!uevent)
		return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;

	mutex_lock(&ctx->file->mut);
	uevent->cm_id = cm_id;
	ucma_set_event_context(ctx, event, uevent);
	uevent->resp.event = event->event;
	uevent->resp.status = event->status;
	/* The parameter layout in the response depends on the QP type. */
	if (cm_id->qp_type == IB_QPT_UD)
		ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud,
				   &event->param.ud);
	else
		ucma_copy_conn_event(&uevent->resp.param.conn,
				     &event->param.conn);

	if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
		/* Each queued connect request consumes one backlog slot. */
		if (!ctx->backlog) {
			ret = -ENOMEM;
			kfree(uevent);
			goto out;
		}
		ctx->backlog--;
	} else if (!ctx->uid || ctx->cm_id != cm_id) {
		/*
		 * We ignore events for new connections until userspace has set
		 * their context.  This can only happen if an error occurs on a
		 * new connection before the user accepts it.  This is okay,
		 * since the accept will just fail later. However, we do need
		 * to release the underlying HW resources in case of a device
		 * removal event.
		 */
		if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
			ucma_removal_event_handler(cm_id);

		kfree(uevent);
		goto out;
	}

	/* Queue the event for ucma_get_event() and wake up any waiter. */
	list_add_tail(&uevent->list, &ctx->file->event_list);
	wake_up_interruptible(&ctx->file->poll_wait);
	if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
		ucma_removal_event_handler(cm_id);
out:
	mutex_unlock(&ctx->file->mut);
	return ret;
}
392 
/*
 * Deliver the oldest pending event of @file to user space.
 *
 * Blocks (interruptibly) until an event is available unless the file is in
 * O_NONBLOCK mode.  For a connect request a fresh context is allocated and
 * bound to the new cm_id, and its id is reported in the response.
 *
 * Returns 0 on success, -ENOSPC when the output buffer is too small,
 * -EFAULT on a failed user copy, -EAGAIN when non-blocking and no event is
 * queued, -ERESTARTSYS when the wait is interrupted and -ENOMEM when the
 * context for a connect request cannot be allocated.
 */
static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
			      int in_len, int out_len)
{
	struct ucma_context *ctx;
	struct rdma_ucm_get_event cmd;
	struct ucma_event *uevent;
	int ret = 0;

	/*
	 * Old 32 bit user space does not send the 4 byte padding in the
	 * reserved field. We don't care, allow it to keep working.
	 */
	if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved))
		return -ENOSPC;

	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
		return -EFAULT;

	mutex_lock(&file->mut);
	/* Wait without the mutex held; re-check the list once it is re-taken. */
	while (list_empty(&file->event_list)) {
		mutex_unlock(&file->mut);

		if (file->filp->f_flags & O_NONBLOCK)
			return -EAGAIN;

		if (wait_event_interruptible(file->poll_wait,
					     !list_empty(&file->event_list)))
			return -ERESTARTSYS;

		mutex_lock(&file->mut);
	}

	uevent = list_entry(file->event_list.next, struct ucma_event, list);

	if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
		/*
		 * Hand the new cm_id over to its own context and give the
		 * backlog slot back to the listening context.
		 */
		ctx = ucma_alloc_ctx(file);
		if (!ctx) {
			ret = -ENOMEM;
			goto done;
		}
		uevent->ctx->backlog++;
		ctx->cm_id = uevent->cm_id;
		ctx->cm_id->context = ctx;
		uevent->resp.id = ctx->id;
	}

	/*
	 * NOTE(review): when this copy fails after a connect-request context
	 * was set up above, the event stays queued while the new context
	 * remains allocated and the backlog stays incremented; a later retry
	 * would allocate another context -- confirm this is acceptable.
	 */
	if (copy_to_user(u64_to_user_ptr(cmd.response),
			 &uevent->resp,
			 min_t(size_t, out_len, sizeof(uevent->resp)))) {
		ret = -EFAULT;
		goto done;
	}

	/* The event is consumed only after it reached user space. */
	list_del(&uevent->list);
	uevent->ctx->events_reported++;
	if (uevent->mc)
		uevent->mc->events_reported++;
	kfree(uevent);
done:
	mutex_unlock(&file->mut);
	return ret;
}
455 
456 static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
457 {
458 	switch (cmd->ps) {
459 	case RDMA_PS_TCP:
460 		*qp_type = IB_QPT_RC;
461 		return 0;
462 	case RDMA_PS_UDP:
463 	case RDMA_PS_IPOIB:
464 		*qp_type = IB_QPT_UD;
465 		return 0;
466 	case RDMA_PS_IB:
467 		*qp_type = cmd->qp_type;
468 		return 0;
469 	default:
470 		return -EINVAL;
471 	}
472 }
473 
474 static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
475 			      int in_len, int out_len)
476 {
477 	struct rdma_ucm_create_id cmd;
478 	struct rdma_ucm_create_id_resp resp;
479 	struct ucma_context *ctx;
480 	struct rdma_cm_id *cm_id;
481 	enum ib_qp_type qp_type;
482 	int ret;
483 
484 	if (out_len < sizeof(resp))
485 		return -ENOSPC;
486 
487 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
488 		return -EFAULT;
489 
490 	ret = ucma_get_qp_type(&cmd, &qp_type);
491 	if (ret)
492 		return ret;
493 
494 	mutex_lock(&file->mut);
495 	ctx = ucma_alloc_ctx(file);
496 	mutex_unlock(&file->mut);
497 	if (!ctx)
498 		return -ENOMEM;
499 
500 	ctx->uid = cmd.uid;
501 	cm_id = __rdma_create_id(current->nsproxy->net_ns,
502 				 ucma_event_handler, ctx, cmd.ps, qp_type, NULL);
503 	if (IS_ERR(cm_id)) {
504 		ret = PTR_ERR(cm_id);
505 		goto err1;
506 	}
507 
508 	resp.id = ctx->id;
509 	if (copy_to_user(u64_to_user_ptr(cmd.response),
510 			 &resp, sizeof(resp))) {
511 		ret = -EFAULT;
512 		goto err2;
513 	}
514 
515 	ctx->cm_id = cm_id;
516 	return 0;
517 
518 err2:
519 	rdma_destroy_id(cm_id);
520 err1:
521 	xa_erase(&ctx_table, ctx->id);
522 	mutex_lock(&file->mut);
523 	list_del(&ctx->list);
524 	mutex_unlock(&file->mut);
525 	kfree(ctx);
526 	return ret;
527 }
528 
529 static void ucma_cleanup_multicast(struct ucma_context *ctx)
530 {
531 	struct ucma_multicast *mc, *tmp;
532 
533 	mutex_lock(&ctx->file->mut);
534 	list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
535 		list_del(&mc->list);
536 		xa_erase(&multicast_table, mc->id);
537 		kfree(mc);
538 	}
539 	mutex_unlock(&ctx->file->mut);
540 }
541 
542 static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
543 {
544 	struct ucma_event *uevent, *tmp;
545 
546 	list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) {
547 		if (uevent->mc != mc)
548 			continue;
549 
550 		list_del(&uevent->list);
551 		kfree(uevent);
552 	}
553 }
554 
555 /*
556  * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
557  * this point, no new events will be reported from the hardware. However, we
558  * still need to cleanup the UCMA context for this ID. Specifically, there
559  * might be events that have not yet been consumed by the user space software.
560  * These might include pending connect requests which we have not completed
561  * processing.  We cannot call rdma_destroy_id while holding the lock of the
562  * context (file->mut), as it might cause a deadlock. We therefore extract all
563  * relevant events from the context pending events list while holding the
564  * mutex. After that we release them as needed.
565  */
566 static int ucma_free_ctx(struct ucma_context *ctx)
567 {
568 	int events_reported;
569 	struct ucma_event *uevent, *tmp;
570 	LIST_HEAD(list);
571 
572 
573 	ucma_cleanup_multicast(ctx);
574 
575 	/* Cleanup events not yet reported to the user. */
576 	mutex_lock(&ctx->file->mut);
577 	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
578 		if (uevent->ctx == ctx)
579 			list_move_tail(&uevent->list, &list);
580 	}
581 	list_del(&ctx->list);
582 	mutex_unlock(&ctx->file->mut);
583 
584 	list_for_each_entry_safe(uevent, tmp, &list, list) {
585 		list_del(&uevent->list);
586 		if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
587 			rdma_destroy_id(uevent->cm_id);
588 		kfree(uevent);
589 	}
590 
591 	events_reported = ctx->events_reported;
592 	kfree(ctx);
593 	return events_reported;
594 }
595 
/*
 * Destroy the context named by the user command together with its cm_id and
 * report how many events had been delivered for it.
 *
 * Returns 0 on success, -ENOSPC when the output buffer is too small,
 * -EFAULT on a failed user copy, or the error from _ucma_find_context()
 * (-ENOENT / -EINVAL) when the id does not name a usable context of @file.
 * The context is already freed when the final copy_to_user() fails.
 */
static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
			       int in_len, int out_len)
{
	struct rdma_ucm_destroy_id cmd;
	struct rdma_ucm_destroy_id_resp resp;
	struct ucma_context *ctx;
	int ret = 0;

	if (out_len < sizeof(resp))
		return -ENOSPC;

	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
		return -EFAULT;

	/* Remove the id from the table so no new lookup can find the context. */
	xa_lock(&ctx_table);
	ctx = _ucma_find_context(cmd.id, file);
	if (!IS_ERR(ctx))
		__xa_erase(&ctx_table, ctx->id);
	xa_unlock(&ctx_table);

	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/* Makes ucma_removal_event_handler() a no-op for this context. */
	mutex_lock(&ctx->file->mut);
	ctx->destroying = 1;
	mutex_unlock(&ctx->file->mut);

	flush_workqueue(ctx->file->close_wq);
	/* At this point it's guaranteed that there is no inflight
	 * closing task */
	xa_lock(&ctx_table);
	if (!ctx->closing) {
		/*
		 * No close work ran for this context: drop the initial
		 * reference, wait for the remaining users and destroy the
		 * cm_id here.
		 */
		xa_unlock(&ctx_table);
		ucma_put_ctx(ctx);
		wait_for_completion(&ctx->comp);
		rdma_destroy_id(ctx->cm_id);
	} else {
		/* ucma_close_id() already destroyed the cm_id. */
		xa_unlock(&ctx_table);
	}

	resp.events_reported = ucma_free_ctx(ctx);
	if (copy_to_user(u64_to_user_ptr(cmd.response),
			 &resp, sizeof(resp)))
		ret = -EFAULT;

	return ret;
}
643 
644 static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
645 			      int in_len, int out_len)
646 {
647 	struct rdma_ucm_bind_ip cmd;
648 	struct ucma_context *ctx;
649 	int ret;
650 
651 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
652 		return -EFAULT;
653 
654 	if (!rdma_addr_size_in6(&cmd.addr))
655 		return -EINVAL;
656 
657 	ctx = ucma_get_ctx(file, cmd.id);
658 	if (IS_ERR(ctx))
659 		return PTR_ERR(ctx);
660 
661 	ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
662 	ucma_put_ctx(ctx);
663 	return ret;
664 }
665 
666 static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
667 			 int in_len, int out_len)
668 {
669 	struct rdma_ucm_bind cmd;
670 	struct ucma_context *ctx;
671 	int ret;
672 
673 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
674 		return -EFAULT;
675 
676 	if (cmd.reserved || !cmd.addr_size ||
677 	    cmd.addr_size != rdma_addr_size_kss(&cmd.addr))
678 		return -EINVAL;
679 
680 	ctx = ucma_get_ctx(file, cmd.id);
681 	if (IS_ERR(ctx))
682 		return PTR_ERR(ctx);
683 
684 	ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
685 	ucma_put_ctx(ctx);
686 	return ret;
687 }
688 
689 static ssize_t ucma_resolve_ip(struct ucma_file *file,
690 			       const char __user *inbuf,
691 			       int in_len, int out_len)
692 {
693 	struct rdma_ucm_resolve_ip cmd;
694 	struct ucma_context *ctx;
695 	int ret;
696 
697 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
698 		return -EFAULT;
699 
700 	if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) ||
701 	    !rdma_addr_size_in6(&cmd.dst_addr))
702 		return -EINVAL;
703 
704 	ctx = ucma_get_ctx(file, cmd.id);
705 	if (IS_ERR(ctx))
706 		return PTR_ERR(ctx);
707 
708 	ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
709 				(struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
710 	ucma_put_ctx(ctx);
711 	return ret;
712 }
713 
714 static ssize_t ucma_resolve_addr(struct ucma_file *file,
715 				 const char __user *inbuf,
716 				 int in_len, int out_len)
717 {
718 	struct rdma_ucm_resolve_addr cmd;
719 	struct ucma_context *ctx;
720 	int ret;
721 
722 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
723 		return -EFAULT;
724 
725 	if (cmd.reserved ||
726 	    (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) ||
727 	    !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr)))
728 		return -EINVAL;
729 
730 	ctx = ucma_get_ctx(file, cmd.id);
731 	if (IS_ERR(ctx))
732 		return PTR_ERR(ctx);
733 
734 	ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
735 				(struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms);
736 	ucma_put_ctx(ctx);
737 	return ret;
738 }
739 
740 static ssize_t ucma_resolve_route(struct ucma_file *file,
741 				  const char __user *inbuf,
742 				  int in_len, int out_len)
743 {
744 	struct rdma_ucm_resolve_route cmd;
745 	struct ucma_context *ctx;
746 	int ret;
747 
748 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
749 		return -EFAULT;
750 
751 	ctx = ucma_get_ctx_dev(file, cmd.id);
752 	if (IS_ERR(ctx))
753 		return PTR_ERR(ctx);
754 
755 	ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
756 	ucma_put_ctx(ctx);
757 	return ret;
758 }
759 
760 static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
761 			       struct rdma_route *route)
762 {
763 	struct rdma_dev_addr *dev_addr;
764 
765 	resp->num_paths = route->num_paths;
766 	switch (route->num_paths) {
767 	case 0:
768 		dev_addr = &route->addr.dev_addr;
769 		rdma_addr_get_dgid(dev_addr,
770 				   (union ib_gid *) &resp->ib_route[0].dgid);
771 		rdma_addr_get_sgid(dev_addr,
772 				   (union ib_gid *) &resp->ib_route[0].sgid);
773 		resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
774 		break;
775 	case 2:
776 		ib_copy_path_rec_to_user(&resp->ib_route[1],
777 					 &route->path_rec[1]);
778 		/* fall through */
779 	case 1:
780 		ib_copy_path_rec_to_user(&resp->ib_route[0],
781 					 &route->path_rec[0]);
782 		break;
783 	default:
784 		break;
785 	}
786 }
787 
788 static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
789 				 struct rdma_route *route)
790 {
791 
792 	resp->num_paths = route->num_paths;
793 	switch (route->num_paths) {
794 	case 0:
795 		rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
796 			    (union ib_gid *)&resp->ib_route[0].dgid);
797 		rdma_ip2gid((struct sockaddr *)&route->addr.src_addr,
798 			    (union ib_gid *)&resp->ib_route[0].sgid);
799 		resp->ib_route[0].pkey = cpu_to_be16(0xffff);
800 		break;
801 	case 2:
802 		ib_copy_path_rec_to_user(&resp->ib_route[1],
803 					 &route->path_rec[1]);
804 		/* fall through */
805 	case 1:
806 		ib_copy_path_rec_to_user(&resp->ib_route[0],
807 					 &route->path_rec[0]);
808 		break;
809 	default:
810 		break;
811 	}
812 }
813 
814 static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp,
815 			       struct rdma_route *route)
816 {
817 	struct rdma_dev_addr *dev_addr;
818 
819 	dev_addr = &route->addr.dev_addr;
820 	rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid);
821 	rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid);
822 }
823 
824 static ssize_t ucma_query_route(struct ucma_file *file,
825 				const char __user *inbuf,
826 				int in_len, int out_len)
827 {
828 	struct rdma_ucm_query cmd;
829 	struct rdma_ucm_query_route_resp resp;
830 	struct ucma_context *ctx;
831 	struct sockaddr *addr;
832 	int ret = 0;
833 
834 	if (out_len < sizeof(resp))
835 		return -ENOSPC;
836 
837 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
838 		return -EFAULT;
839 
840 	ctx = ucma_get_ctx(file, cmd.id);
841 	if (IS_ERR(ctx))
842 		return PTR_ERR(ctx);
843 
844 	memset(&resp, 0, sizeof resp);
845 	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
846 	memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
847 				     sizeof(struct sockaddr_in) :
848 				     sizeof(struct sockaddr_in6));
849 	addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
850 	memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
851 				     sizeof(struct sockaddr_in) :
852 				     sizeof(struct sockaddr_in6));
853 	if (!ctx->cm_id->device)
854 		goto out;
855 
856 	resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
857 	resp.port_num = ctx->cm_id->port_num;
858 
859 	if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num))
860 		ucma_copy_ib_route(&resp, &ctx->cm_id->route);
861 	else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num))
862 		ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
863 	else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num))
864 		ucma_copy_iw_route(&resp, &ctx->cm_id->route);
865 
866 out:
867 	if (copy_to_user(u64_to_user_ptr(cmd.response),
868 			 &resp, sizeof(resp)))
869 		ret = -EFAULT;
870 
871 	ucma_put_ctx(ctx);
872 	return ret;
873 }
874 
875 static void ucma_query_device_addr(struct rdma_cm_id *cm_id,
876 				   struct rdma_ucm_query_addr_resp *resp)
877 {
878 	if (!cm_id->device)
879 		return;
880 
881 	resp->node_guid = (__force __u64) cm_id->device->node_guid;
882 	resp->port_num = cm_id->port_num;
883 	resp->pkey = (__force __u16) cpu_to_be16(
884 		     ib_addr_get_pkey(&cm_id->route.addr.dev_addr));
885 }
886 
887 static ssize_t ucma_query_addr(struct ucma_context *ctx,
888 			       void __user *response, int out_len)
889 {
890 	struct rdma_ucm_query_addr_resp resp;
891 	struct sockaddr *addr;
892 	int ret = 0;
893 
894 	if (out_len < sizeof(resp))
895 		return -ENOSPC;
896 
897 	memset(&resp, 0, sizeof resp);
898 
899 	addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
900 	resp.src_size = rdma_addr_size(addr);
901 	memcpy(&resp.src_addr, addr, resp.src_size);
902 
903 	addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
904 	resp.dst_size = rdma_addr_size(addr);
905 	memcpy(&resp.dst_addr, addr, resp.dst_size);
906 
907 	ucma_query_device_addr(ctx->cm_id, &resp);
908 
909 	if (copy_to_user(response, &resp, sizeof(resp)))
910 		ret = -EFAULT;
911 
912 	return ret;
913 }
914 
915 static ssize_t ucma_query_path(struct ucma_context *ctx,
916 			       void __user *response, int out_len)
917 {
918 	struct rdma_ucm_query_path_resp *resp;
919 	int i, ret = 0;
920 
921 	if (out_len < sizeof(*resp))
922 		return -ENOSPC;
923 
924 	resp = kzalloc(out_len, GFP_KERNEL);
925 	if (!resp)
926 		return -ENOMEM;
927 
928 	resp->num_paths = ctx->cm_id->route.num_paths;
929 	for (i = 0, out_len -= sizeof(*resp);
930 	     i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data);
931 	     i++, out_len -= sizeof(struct ib_path_rec_data)) {
932 		struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i];
933 
934 		resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
935 					   IB_PATH_BIDIRECTIONAL;
936 		if (rec->rec_type == SA_PATH_REC_TYPE_OPA) {
937 			struct sa_path_rec ib;
938 
939 			sa_convert_path_opa_to_ib(&ib, rec);
940 			ib_sa_pack_path(&ib, &resp->path_data[i].path_rec);
941 
942 		} else {
943 			ib_sa_pack_path(rec, &resp->path_data[i].path_rec);
944 		}
945 	}
946 
947 	if (copy_to_user(response, resp, struct_size(resp, path_data, i)))
948 		ret = -EFAULT;
949 
950 	kfree(resp);
951 	return ret;
952 }
953 
954 static ssize_t ucma_query_gid(struct ucma_context *ctx,
955 			      void __user *response, int out_len)
956 {
957 	struct rdma_ucm_query_addr_resp resp;
958 	struct sockaddr_ib *addr;
959 	int ret = 0;
960 
961 	if (out_len < sizeof(resp))
962 		return -ENOSPC;
963 
964 	memset(&resp, 0, sizeof resp);
965 
966 	ucma_query_device_addr(ctx->cm_id, &resp);
967 
968 	addr = (struct sockaddr_ib *) &resp.src_addr;
969 	resp.src_size = sizeof(*addr);
970 	if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) {
971 		memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size);
972 	} else {
973 		addr->sib_family = AF_IB;
974 		addr->sib_pkey = (__force __be16) resp.pkey;
975 		rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr,
976 			       NULL);
977 		addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
978 						    &ctx->cm_id->route.addr.src_addr);
979 	}
980 
981 	addr = (struct sockaddr_ib *) &resp.dst_addr;
982 	resp.dst_size = sizeof(*addr);
983 	if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) {
984 		memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size);
985 	} else {
986 		addr->sib_family = AF_IB;
987 		addr->sib_pkey = (__force __be16) resp.pkey;
988 		rdma_read_gids(ctx->cm_id, NULL,
989 			       (union ib_gid *)&addr->sib_addr);
990 		addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
991 						    &ctx->cm_id->route.addr.dst_addr);
992 	}
993 
994 	if (copy_to_user(response, &resp, sizeof(resp)))
995 		ret = -EFAULT;
996 
997 	return ret;
998 }
999 
1000 static ssize_t ucma_query(struct ucma_file *file,
1001 			  const char __user *inbuf,
1002 			  int in_len, int out_len)
1003 {
1004 	struct rdma_ucm_query cmd;
1005 	struct ucma_context *ctx;
1006 	void __user *response;
1007 	int ret;
1008 
1009 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1010 		return -EFAULT;
1011 
1012 	response = u64_to_user_ptr(cmd.response);
1013 	ctx = ucma_get_ctx(file, cmd.id);
1014 	if (IS_ERR(ctx))
1015 		return PTR_ERR(ctx);
1016 
1017 	switch (cmd.option) {
1018 	case RDMA_USER_CM_QUERY_ADDR:
1019 		ret = ucma_query_addr(ctx, response, out_len);
1020 		break;
1021 	case RDMA_USER_CM_QUERY_PATH:
1022 		ret = ucma_query_path(ctx, response, out_len);
1023 		break;
1024 	case RDMA_USER_CM_QUERY_GID:
1025 		ret = ucma_query_gid(ctx, response, out_len);
1026 		break;
1027 	default:
1028 		ret = -ENOSYS;
1029 		break;
1030 	}
1031 
1032 	ucma_put_ctx(ctx);
1033 	return ret;
1034 }
1035 
1036 static void ucma_copy_conn_param(struct rdma_cm_id *id,
1037 				 struct rdma_conn_param *dst,
1038 				 struct rdma_ucm_conn_param *src)
1039 {
1040 	dst->private_data = src->private_data;
1041 	dst->private_data_len = src->private_data_len;
1042 	dst->responder_resources =src->responder_resources;
1043 	dst->initiator_depth = src->initiator_depth;
1044 	dst->flow_control = src->flow_control;
1045 	dst->retry_count = src->retry_count;
1046 	dst->rnr_retry_count = src->rnr_retry_count;
1047 	dst->srq = src->srq;
1048 	dst->qp_num = src->qp_num;
1049 	dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0;
1050 }
1051 
1052 static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
1053 			    int in_len, int out_len)
1054 {
1055 	struct rdma_ucm_connect cmd;
1056 	struct rdma_conn_param conn_param;
1057 	struct ucma_context *ctx;
1058 	int ret;
1059 
1060 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1061 		return -EFAULT;
1062 
1063 	if (!cmd.conn_param.valid)
1064 		return -EINVAL;
1065 
1066 	ctx = ucma_get_ctx_dev(file, cmd.id);
1067 	if (IS_ERR(ctx))
1068 		return PTR_ERR(ctx);
1069 
1070 	ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
1071 	ret = rdma_connect(ctx->cm_id, &conn_param);
1072 	ucma_put_ctx(ctx);
1073 	return ret;
1074 }
1075 
1076 static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
1077 			   int in_len, int out_len)
1078 {
1079 	struct rdma_ucm_listen cmd;
1080 	struct ucma_context *ctx;
1081 	int ret;
1082 
1083 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1084 		return -EFAULT;
1085 
1086 	ctx = ucma_get_ctx(file, cmd.id);
1087 	if (IS_ERR(ctx))
1088 		return PTR_ERR(ctx);
1089 
1090 	ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ?
1091 		       cmd.backlog : max_backlog;
1092 	ret = rdma_listen(ctx->cm_id, ctx->backlog);
1093 	ucma_put_ctx(ctx);
1094 	return ret;
1095 }
1096 
1097 static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
1098 			   int in_len, int out_len)
1099 {
1100 	struct rdma_ucm_accept cmd;
1101 	struct rdma_conn_param conn_param;
1102 	struct ucma_context *ctx;
1103 	int ret;
1104 
1105 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1106 		return -EFAULT;
1107 
1108 	ctx = ucma_get_ctx_dev(file, cmd.id);
1109 	if (IS_ERR(ctx))
1110 		return PTR_ERR(ctx);
1111 
1112 	if (cmd.conn_param.valid) {
1113 		ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
1114 		mutex_lock(&file->mut);
1115 		ret = __rdma_accept(ctx->cm_id, &conn_param, NULL);
1116 		if (!ret)
1117 			ctx->uid = cmd.uid;
1118 		mutex_unlock(&file->mut);
1119 	} else
1120 		ret = __rdma_accept(ctx->cm_id, NULL, NULL);
1121 
1122 	ucma_put_ctx(ctx);
1123 	return ret;
1124 }
1125 
1126 static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
1127 			   int in_len, int out_len)
1128 {
1129 	struct rdma_ucm_reject cmd;
1130 	struct ucma_context *ctx;
1131 	int ret;
1132 
1133 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1134 		return -EFAULT;
1135 
1136 	ctx = ucma_get_ctx_dev(file, cmd.id);
1137 	if (IS_ERR(ctx))
1138 		return PTR_ERR(ctx);
1139 
1140 	ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
1141 	ucma_put_ctx(ctx);
1142 	return ret;
1143 }
1144 
1145 static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
1146 			       int in_len, int out_len)
1147 {
1148 	struct rdma_ucm_disconnect cmd;
1149 	struct ucma_context *ctx;
1150 	int ret;
1151 
1152 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1153 		return -EFAULT;
1154 
1155 	ctx = ucma_get_ctx_dev(file, cmd.id);
1156 	if (IS_ERR(ctx))
1157 		return PTR_ERR(ctx);
1158 
1159 	ret = rdma_disconnect(ctx->cm_id);
1160 	ucma_put_ctx(ctx);
1161 	return ret;
1162 }
1163 
1164 static ssize_t ucma_init_qp_attr(struct ucma_file *file,
1165 				 const char __user *inbuf,
1166 				 int in_len, int out_len)
1167 {
1168 	struct rdma_ucm_init_qp_attr cmd;
1169 	struct ib_uverbs_qp_attr resp;
1170 	struct ucma_context *ctx;
1171 	struct ib_qp_attr qp_attr;
1172 	int ret;
1173 
1174 	if (out_len < sizeof(resp))
1175 		return -ENOSPC;
1176 
1177 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1178 		return -EFAULT;
1179 
1180 	if (cmd.qp_state > IB_QPS_ERR)
1181 		return -EINVAL;
1182 
1183 	ctx = ucma_get_ctx_dev(file, cmd.id);
1184 	if (IS_ERR(ctx))
1185 		return PTR_ERR(ctx);
1186 
1187 	resp.qp_attr_mask = 0;
1188 	memset(&qp_attr, 0, sizeof qp_attr);
1189 	qp_attr.qp_state = cmd.qp_state;
1190 	ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
1191 	if (ret)
1192 		goto out;
1193 
1194 	ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr);
1195 	if (copy_to_user(u64_to_user_ptr(cmd.response),
1196 			 &resp, sizeof(resp)))
1197 		ret = -EFAULT;
1198 
1199 out:
1200 	ucma_put_ctx(ctx);
1201 	return ret;
1202 }
1203 
1204 static int ucma_set_option_id(struct ucma_context *ctx, int optname,
1205 			      void *optval, size_t optlen)
1206 {
1207 	int ret = 0;
1208 
1209 	switch (optname) {
1210 	case RDMA_OPTION_ID_TOS:
1211 		if (optlen != sizeof(u8)) {
1212 			ret = -EINVAL;
1213 			break;
1214 		}
1215 		rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
1216 		break;
1217 	case RDMA_OPTION_ID_REUSEADDR:
1218 		if (optlen != sizeof(int)) {
1219 			ret = -EINVAL;
1220 			break;
1221 		}
1222 		ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
1223 		break;
1224 	case RDMA_OPTION_ID_AFONLY:
1225 		if (optlen != sizeof(int)) {
1226 			ret = -EINVAL;
1227 			break;
1228 		}
1229 		ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
1230 		break;
1231 	case RDMA_OPTION_ID_ACK_TIMEOUT:
1232 		if (optlen != sizeof(u8)) {
1233 			ret = -EINVAL;
1234 			break;
1235 		}
1236 		ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval));
1237 		break;
1238 	default:
1239 		ret = -ENOSYS;
1240 	}
1241 
1242 	return ret;
1243 }
1244 
1245 static int ucma_set_ib_path(struct ucma_context *ctx,
1246 			    struct ib_path_rec_data *path_data, size_t optlen)
1247 {
1248 	struct sa_path_rec sa_path;
1249 	struct rdma_cm_event event;
1250 	int ret;
1251 
1252 	if (optlen % sizeof(*path_data))
1253 		return -EINVAL;
1254 
1255 	for (; optlen; optlen -= sizeof(*path_data), path_data++) {
1256 		if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
1257 					 IB_PATH_BIDIRECTIONAL))
1258 			break;
1259 	}
1260 
1261 	if (!optlen)
1262 		return -EINVAL;
1263 
1264 	if (!ctx->cm_id->device)
1265 		return -EINVAL;
1266 
1267 	memset(&sa_path, 0, sizeof(sa_path));
1268 
1269 	sa_path.rec_type = SA_PATH_REC_TYPE_IB;
1270 	ib_sa_unpack_path(path_data->path_rec, &sa_path);
1271 
1272 	if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) {
1273 		struct sa_path_rec opa;
1274 
1275 		sa_convert_path_ib_to_opa(&opa, &sa_path);
1276 		ret = rdma_set_ib_path(ctx->cm_id, &opa);
1277 	} else {
1278 		ret = rdma_set_ib_path(ctx->cm_id, &sa_path);
1279 	}
1280 	if (ret)
1281 		return ret;
1282 
1283 	memset(&event, 0, sizeof event);
1284 	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
1285 	return ucma_event_handler(ctx->cm_id, &event);
1286 }
1287 
1288 static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
1289 			      void *optval, size_t optlen)
1290 {
1291 	int ret;
1292 
1293 	switch (optname) {
1294 	case RDMA_OPTION_IB_PATH:
1295 		ret = ucma_set_ib_path(ctx, optval, optlen);
1296 		break;
1297 	default:
1298 		ret = -ENOSYS;
1299 	}
1300 
1301 	return ret;
1302 }
1303 
1304 static int ucma_set_option_level(struct ucma_context *ctx, int level,
1305 				 int optname, void *optval, size_t optlen)
1306 {
1307 	int ret;
1308 
1309 	switch (level) {
1310 	case RDMA_OPTION_ID:
1311 		ret = ucma_set_option_id(ctx, optname, optval, optlen);
1312 		break;
1313 	case RDMA_OPTION_IB:
1314 		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
1315 		break;
1316 	default:
1317 		ret = -ENOSYS;
1318 	}
1319 
1320 	return ret;
1321 }
1322 
1323 static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
1324 			       int in_len, int out_len)
1325 {
1326 	struct rdma_ucm_set_option cmd;
1327 	struct ucma_context *ctx;
1328 	void *optval;
1329 	int ret;
1330 
1331 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1332 		return -EFAULT;
1333 
1334 	if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE))
1335 		return -EINVAL;
1336 
1337 	ctx = ucma_get_ctx(file, cmd.id);
1338 	if (IS_ERR(ctx))
1339 		return PTR_ERR(ctx);
1340 
1341 	optval = memdup_user(u64_to_user_ptr(cmd.optval),
1342 			     cmd.optlen);
1343 	if (IS_ERR(optval)) {
1344 		ret = PTR_ERR(optval);
1345 		goto out;
1346 	}
1347 
1348 	ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
1349 				    cmd.optlen);
1350 	kfree(optval);
1351 
1352 out:
1353 	ucma_put_ctx(ctx);
1354 	return ret;
1355 }
1356 
1357 static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
1358 			   int in_len, int out_len)
1359 {
1360 	struct rdma_ucm_notify cmd;
1361 	struct ucma_context *ctx;
1362 	int ret = -EINVAL;
1363 
1364 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1365 		return -EFAULT;
1366 
1367 	ctx = ucma_get_ctx(file, cmd.id);
1368 	if (IS_ERR(ctx))
1369 		return PTR_ERR(ctx);
1370 
1371 	if (ctx->cm_id->device)
1372 		ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event);
1373 
1374 	ucma_put_ctx(ctx);
1375 	return ret;
1376 }
1377 
1378 static ssize_t ucma_process_join(struct ucma_file *file,
1379 				 struct rdma_ucm_join_mcast *cmd,  int out_len)
1380 {
1381 	struct rdma_ucm_create_id_resp resp;
1382 	struct ucma_context *ctx;
1383 	struct ucma_multicast *mc;
1384 	struct sockaddr *addr;
1385 	int ret;
1386 	u8 join_state;
1387 
1388 	if (out_len < sizeof(resp))
1389 		return -ENOSPC;
1390 
1391 	addr = (struct sockaddr *) &cmd->addr;
1392 	if (cmd->addr_size != rdma_addr_size(addr))
1393 		return -EINVAL;
1394 
1395 	if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER)
1396 		join_state = BIT(FULLMEMBER_JOIN);
1397 	else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER)
1398 		join_state = BIT(SENDONLY_FULLMEMBER_JOIN);
1399 	else
1400 		return -EINVAL;
1401 
1402 	ctx = ucma_get_ctx_dev(file, cmd->id);
1403 	if (IS_ERR(ctx))
1404 		return PTR_ERR(ctx);
1405 
1406 	mutex_lock(&file->mut);
1407 	mc = ucma_alloc_multicast(ctx);
1408 	if (!mc) {
1409 		ret = -ENOMEM;
1410 		goto err1;
1411 	}
1412 	mc->join_state = join_state;
1413 	mc->uid = cmd->uid;
1414 	memcpy(&mc->addr, addr, cmd->addr_size);
1415 	ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr,
1416 				  join_state, mc);
1417 	if (ret)
1418 		goto err2;
1419 
1420 	resp.id = mc->id;
1421 	if (copy_to_user(u64_to_user_ptr(cmd->response),
1422 			 &resp, sizeof(resp))) {
1423 		ret = -EFAULT;
1424 		goto err3;
1425 	}
1426 
1427 	xa_store(&multicast_table, mc->id, mc, 0);
1428 
1429 	mutex_unlock(&file->mut);
1430 	ucma_put_ctx(ctx);
1431 	return 0;
1432 
1433 err3:
1434 	rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
1435 	ucma_cleanup_mc_events(mc);
1436 err2:
1437 	xa_erase(&multicast_table, mc->id);
1438 	list_del(&mc->list);
1439 	kfree(mc);
1440 err1:
1441 	mutex_unlock(&file->mut);
1442 	ucma_put_ctx(ctx);
1443 	return ret;
1444 }
1445 
1446 static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
1447 				      const char __user *inbuf,
1448 				      int in_len, int out_len)
1449 {
1450 	struct rdma_ucm_join_ip_mcast cmd;
1451 	struct rdma_ucm_join_mcast join_cmd;
1452 
1453 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1454 		return -EFAULT;
1455 
1456 	join_cmd.response = cmd.response;
1457 	join_cmd.uid = cmd.uid;
1458 	join_cmd.id = cmd.id;
1459 	join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr);
1460 	if (!join_cmd.addr_size)
1461 		return -EINVAL;
1462 
1463 	join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER;
1464 	memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
1465 
1466 	return ucma_process_join(file, &join_cmd, out_len);
1467 }
1468 
1469 static ssize_t ucma_join_multicast(struct ucma_file *file,
1470 				   const char __user *inbuf,
1471 				   int in_len, int out_len)
1472 {
1473 	struct rdma_ucm_join_mcast cmd;
1474 
1475 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1476 		return -EFAULT;
1477 
1478 	if (!rdma_addr_size_kss(&cmd.addr))
1479 		return -EINVAL;
1480 
1481 	return ucma_process_join(file, &cmd, out_len);
1482 }
1483 
1484 static ssize_t ucma_leave_multicast(struct ucma_file *file,
1485 				    const char __user *inbuf,
1486 				    int in_len, int out_len)
1487 {
1488 	struct rdma_ucm_destroy_id cmd;
1489 	struct rdma_ucm_destroy_id_resp resp;
1490 	struct ucma_multicast *mc;
1491 	int ret = 0;
1492 
1493 	if (out_len < sizeof(resp))
1494 		return -ENOSPC;
1495 
1496 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1497 		return -EFAULT;
1498 
1499 	xa_lock(&multicast_table);
1500 	mc = xa_load(&multicast_table, cmd.id);
1501 	if (!mc)
1502 		mc = ERR_PTR(-ENOENT);
1503 	else if (mc->ctx->file != file)
1504 		mc = ERR_PTR(-EINVAL);
1505 	else if (!atomic_inc_not_zero(&mc->ctx->ref))
1506 		mc = ERR_PTR(-ENXIO);
1507 	else
1508 		__xa_erase(&multicast_table, mc->id);
1509 	xa_unlock(&multicast_table);
1510 
1511 	if (IS_ERR(mc)) {
1512 		ret = PTR_ERR(mc);
1513 		goto out;
1514 	}
1515 
1516 	rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
1517 	mutex_lock(&mc->ctx->file->mut);
1518 	ucma_cleanup_mc_events(mc);
1519 	list_del(&mc->list);
1520 	mutex_unlock(&mc->ctx->file->mut);
1521 
1522 	ucma_put_ctx(mc->ctx);
1523 	resp.events_reported = mc->events_reported;
1524 	kfree(mc);
1525 
1526 	if (copy_to_user(u64_to_user_ptr(cmd.response),
1527 			 &resp, sizeof(resp)))
1528 		ret = -EFAULT;
1529 out:
1530 	return ret;
1531 }
1532 
1533 static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
1534 {
1535 	/* Acquire mutex's based on pointer comparison to prevent deadlock. */
1536 	if (file1 < file2) {
1537 		mutex_lock(&file1->mut);
1538 		mutex_lock_nested(&file2->mut, SINGLE_DEPTH_NESTING);
1539 	} else {
1540 		mutex_lock(&file2->mut);
1541 		mutex_lock_nested(&file1->mut, SINGLE_DEPTH_NESTING);
1542 	}
1543 }
1544 
1545 static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
1546 {
1547 	if (file1 < file2) {
1548 		mutex_unlock(&file2->mut);
1549 		mutex_unlock(&file1->mut);
1550 	} else {
1551 		mutex_unlock(&file1->mut);
1552 		mutex_unlock(&file2->mut);
1553 	}
1554 }
1555 
1556 static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
1557 {
1558 	struct ucma_event *uevent, *tmp;
1559 
1560 	list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
1561 		if (uevent->ctx == ctx)
1562 			list_move_tail(&uevent->list, &file->event_list);
1563 }
1564 
1565 static ssize_t ucma_migrate_id(struct ucma_file *new_file,
1566 			       const char __user *inbuf,
1567 			       int in_len, int out_len)
1568 {
1569 	struct rdma_ucm_migrate_id cmd;
1570 	struct rdma_ucm_migrate_resp resp;
1571 	struct ucma_context *ctx;
1572 	struct fd f;
1573 	struct ucma_file *cur_file;
1574 	int ret = 0;
1575 
1576 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
1577 		return -EFAULT;
1578 
1579 	/* Get current fd to protect against it being closed */
1580 	f = fdget(cmd.fd);
1581 	if (!f.file)
1582 		return -ENOENT;
1583 	if (f.file->f_op != &ucma_fops) {
1584 		ret = -EINVAL;
1585 		goto file_put;
1586 	}
1587 
1588 	/* Validate current fd and prevent destruction of id. */
1589 	ctx = ucma_get_ctx(f.file->private_data, cmd.id);
1590 	if (IS_ERR(ctx)) {
1591 		ret = PTR_ERR(ctx);
1592 		goto file_put;
1593 	}
1594 
1595 	cur_file = ctx->file;
1596 	if (cur_file == new_file) {
1597 		resp.events_reported = ctx->events_reported;
1598 		goto response;
1599 	}
1600 
1601 	/*
1602 	 * Migrate events between fd's, maintaining order, and avoiding new
1603 	 * events being added before existing events.
1604 	 */
1605 	ucma_lock_files(cur_file, new_file);
1606 	xa_lock(&ctx_table);
1607 
1608 	list_move_tail(&ctx->list, &new_file->ctx_list);
1609 	ucma_move_events(ctx, new_file);
1610 	ctx->file = new_file;
1611 	resp.events_reported = ctx->events_reported;
1612 
1613 	xa_unlock(&ctx_table);
1614 	ucma_unlock_files(cur_file, new_file);
1615 
1616 response:
1617 	if (copy_to_user(u64_to_user_ptr(cmd.response),
1618 			 &resp, sizeof(resp)))
1619 		ret = -EFAULT;
1620 
1621 	ucma_put_ctx(ctx);
1622 file_put:
1623 	fdput(f);
1624 	return ret;
1625 }
1626 
1627 static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
1628 				   const char __user *inbuf,
1629 				   int in_len, int out_len) = {
1630 	[RDMA_USER_CM_CMD_CREATE_ID] 	 = ucma_create_id,
1631 	[RDMA_USER_CM_CMD_DESTROY_ID]	 = ucma_destroy_id,
1632 	[RDMA_USER_CM_CMD_BIND_IP]	 = ucma_bind_ip,
1633 	[RDMA_USER_CM_CMD_RESOLVE_IP]	 = ucma_resolve_ip,
1634 	[RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route,
1635 	[RDMA_USER_CM_CMD_QUERY_ROUTE]	 = ucma_query_route,
1636 	[RDMA_USER_CM_CMD_CONNECT]	 = ucma_connect,
1637 	[RDMA_USER_CM_CMD_LISTEN]	 = ucma_listen,
1638 	[RDMA_USER_CM_CMD_ACCEPT]	 = ucma_accept,
1639 	[RDMA_USER_CM_CMD_REJECT]	 = ucma_reject,
1640 	[RDMA_USER_CM_CMD_DISCONNECT]	 = ucma_disconnect,
1641 	[RDMA_USER_CM_CMD_INIT_QP_ATTR]	 = ucma_init_qp_attr,
1642 	[RDMA_USER_CM_CMD_GET_EVENT]	 = ucma_get_event,
1643 	[RDMA_USER_CM_CMD_GET_OPTION]	 = NULL,
1644 	[RDMA_USER_CM_CMD_SET_OPTION]	 = ucma_set_option,
1645 	[RDMA_USER_CM_CMD_NOTIFY]	 = ucma_notify,
1646 	[RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast,
1647 	[RDMA_USER_CM_CMD_LEAVE_MCAST]	 = ucma_leave_multicast,
1648 	[RDMA_USER_CM_CMD_MIGRATE_ID]	 = ucma_migrate_id,
1649 	[RDMA_USER_CM_CMD_QUERY]	 = ucma_query,
1650 	[RDMA_USER_CM_CMD_BIND]		 = ucma_bind,
1651 	[RDMA_USER_CM_CMD_RESOLVE_ADDR]	 = ucma_resolve_addr,
1652 	[RDMA_USER_CM_CMD_JOIN_MCAST]	 = ucma_join_multicast
1653 };
1654 
1655 static ssize_t ucma_write(struct file *filp, const char __user *buf,
1656 			  size_t len, loff_t *pos)
1657 {
1658 	struct ucma_file *file = filp->private_data;
1659 	struct rdma_ucm_cmd_hdr hdr;
1660 	ssize_t ret;
1661 
1662 	if (!ib_safe_file_access(filp)) {
1663 		pr_err_once("ucma_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n",
1664 			    task_tgid_vnr(current), current->comm);
1665 		return -EACCES;
1666 	}
1667 
1668 	if (len < sizeof(hdr))
1669 		return -EINVAL;
1670 
1671 	if (copy_from_user(&hdr, buf, sizeof(hdr)))
1672 		return -EFAULT;
1673 
1674 	if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
1675 		return -EINVAL;
1676 	hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table));
1677 
1678 	if (hdr.in + sizeof(hdr) > len)
1679 		return -EINVAL;
1680 
1681 	if (!ucma_cmd_table[hdr.cmd])
1682 		return -ENOSYS;
1683 
1684 	ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
1685 	if (!ret)
1686 		ret = len;
1687 
1688 	return ret;
1689 }
1690 
1691 static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait)
1692 {
1693 	struct ucma_file *file = filp->private_data;
1694 	__poll_t mask = 0;
1695 
1696 	poll_wait(filp, &file->poll_wait, wait);
1697 
1698 	if (!list_empty(&file->event_list))
1699 		mask = EPOLLIN | EPOLLRDNORM;
1700 
1701 	return mask;
1702 }
1703 
1704 /*
1705  * ucma_open() does not need the BKL:
1706  *
1707  *  - no global state is referred to;
1708  *  - there is no ioctl method to race against;
1709  *  - no further module initialization is required for open to work
1710  *    after the device is registered.
1711  */
1712 static int ucma_open(struct inode *inode, struct file *filp)
1713 {
1714 	struct ucma_file *file;
1715 
1716 	file = kmalloc(sizeof *file, GFP_KERNEL);
1717 	if (!file)
1718 		return -ENOMEM;
1719 
1720 	file->close_wq = alloc_ordered_workqueue("ucma_close_id",
1721 						 WQ_MEM_RECLAIM);
1722 	if (!file->close_wq) {
1723 		kfree(file);
1724 		return -ENOMEM;
1725 	}
1726 
1727 	INIT_LIST_HEAD(&file->event_list);
1728 	INIT_LIST_HEAD(&file->ctx_list);
1729 	init_waitqueue_head(&file->poll_wait);
1730 	mutex_init(&file->mut);
1731 
1732 	filp->private_data = file;
1733 	file->filp = filp;
1734 
1735 	return stream_open(inode, filp);
1736 }
1737 
1738 static int ucma_close(struct inode *inode, struct file *filp)
1739 {
1740 	struct ucma_file *file = filp->private_data;
1741 	struct ucma_context *ctx, *tmp;
1742 
1743 	mutex_lock(&file->mut);
1744 	list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
1745 		ctx->destroying = 1;
1746 		mutex_unlock(&file->mut);
1747 
1748 		xa_erase(&ctx_table, ctx->id);
1749 		flush_workqueue(file->close_wq);
1750 		/* At that step once ctx was marked as destroying and workqueue
1751 		 * was flushed we are safe from any inflights handlers that
1752 		 * might put other closing task.
1753 		 */
1754 		xa_lock(&ctx_table);
1755 		if (!ctx->closing) {
1756 			xa_unlock(&ctx_table);
1757 			ucma_put_ctx(ctx);
1758 			wait_for_completion(&ctx->comp);
1759 			/* rdma_destroy_id ensures that no event handlers are
1760 			 * inflight for that id before releasing it.
1761 			 */
1762 			rdma_destroy_id(ctx->cm_id);
1763 		} else {
1764 			xa_unlock(&ctx_table);
1765 		}
1766 
1767 		ucma_free_ctx(ctx);
1768 		mutex_lock(&file->mut);
1769 	}
1770 	mutex_unlock(&file->mut);
1771 	destroy_workqueue(file->close_wq);
1772 	kfree(file);
1773 	return 0;
1774 }
1775 
1776 static const struct file_operations ucma_fops = {
1777 	.owner 	 = THIS_MODULE,
1778 	.open 	 = ucma_open,
1779 	.release = ucma_close,
1780 	.write	 = ucma_write,
1781 	.poll    = ucma_poll,
1782 	.llseek	 = no_llseek,
1783 };
1784 
1785 static struct miscdevice ucma_misc = {
1786 	.minor		= MISC_DYNAMIC_MINOR,
1787 	.name		= "rdma_cm",
1788 	.nodename	= "infiniband/rdma_cm",
1789 	.mode		= 0666,
1790 	.fops		= &ucma_fops,
1791 };
1792 
1793 static int ucma_get_global_nl_info(struct ib_client_nl_info *res)
1794 {
1795 	res->abi = RDMA_USER_CM_ABI_VERSION;
1796 	res->cdev = ucma_misc.this_device;
1797 	return 0;
1798 }
1799 
1800 static struct ib_client rdma_cma_client = {
1801 	.name = "rdma_cm",
1802 	.get_global_nl_info = ucma_get_global_nl_info,
1803 };
1804 MODULE_ALIAS_RDMA_CLIENT("rdma_cm");
1805 
1806 static ssize_t show_abi_version(struct device *dev,
1807 				struct device_attribute *attr,
1808 				char *buf)
1809 {
1810 	return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
1811 }
1812 static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
1813 
1814 static int __init ucma_init(void)
1815 {
1816 	int ret;
1817 
1818 	ret = misc_register(&ucma_misc);
1819 	if (ret)
1820 		return ret;
1821 
1822 	ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
1823 	if (ret) {
1824 		pr_err("rdma_ucm: couldn't create abi_version attr\n");
1825 		goto err1;
1826 	}
1827 
1828 	ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table);
1829 	if (!ucma_ctl_table_hdr) {
1830 		pr_err("rdma_ucm: couldn't register sysctl paths\n");
1831 		ret = -ENOMEM;
1832 		goto err2;
1833 	}
1834 
1835 	ret = ib_register_client(&rdma_cma_client);
1836 	if (ret)
1837 		goto err3;
1838 
1839 	return 0;
1840 err3:
1841 	unregister_net_sysctl_table(ucma_ctl_table_hdr);
1842 err2:
1843 	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
1844 err1:
1845 	misc_deregister(&ucma_misc);
1846 	return ret;
1847 }
1848 
1849 static void __exit ucma_cleanup(void)
1850 {
1851 	ib_unregister_client(&rdma_cma_client);
1852 	unregister_net_sysctl_table(ucma_ctl_table_hdr);
1853 	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
1854 	misc_deregister(&ucma_misc);
1855 }
1856 
/* Module entry and exit points. */
module_init(ucma_init);
module_exit(ucma_cleanup);
1859