xref: /linux/drivers/infiniband/core/device.c (revision d2e9ace47aac92a465c4ad8e0cd1f5f8422a117e)
1 /*
2  * Copyright (c) 2004 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <linux/module.h>
35 #include <linux/string.h>
36 #include <linux/errno.h>
37 #include <linux/kernel.h>
38 #include <linux/slab.h>
39 #include <linux/init.h>
40 #include <linux/mutex.h>
41 #include <linux/netdevice.h>
42 #include <linux/security.h>
43 #include <linux/notifier.h>
44 #include <rdma/rdma_netlink.h>
45 #include <rdma/ib_addr.h>
46 #include <rdma/ib_cache.h>
47 
48 #include "core_priv.h"
49 
50 MODULE_AUTHOR("Roland Dreier");
51 MODULE_DESCRIPTION("core kernel InfiniBand API");
52 MODULE_LICENSE("Dual BSD/GPL");
53 
54 struct ib_client_data {
55 	struct list_head  list;
56 	struct ib_client *client;
57 	void *            data;
58 	/* The device or client is going down. Do not call client or device
59 	 * callbacks other than remove(). */
60 	bool		  going_down;
61 };
62 
63 struct workqueue_struct *ib_comp_wq;
64 struct workqueue_struct *ib_comp_unbound_wq;
65 struct workqueue_struct *ib_wq;
66 EXPORT_SYMBOL_GPL(ib_wq);
67 
68 /* The device_list and client_list contain devices and clients after their
69  * registration has completed, and the devices and clients are removed
70  * during unregistration. */
71 static LIST_HEAD(device_list);
72 static LIST_HEAD(client_list);
73 
74 /*
75  * device_mutex and lists_rwsem protect access to both device_list and
76  * client_list.  device_mutex protects writer access by device and client
77  * registration / de-registration.  lists_rwsem protects reader access to
78  * these lists.  Iterators of these lists must take it for reading, while
79  * updates to the lists must hold it for writing. A special case is when the
80  * device_mutex is locked. In this case locking the lists for read access is
81  * not necessary as the device_mutex implies it.
82  *
83  * lists_rwsem also protects access to the client data list.
84  */
85 static DEFINE_MUTEX(device_mutex);
86 static DECLARE_RWSEM(lists_rwsem);
87 
88 static int ib_security_change(struct notifier_block *nb, unsigned long event,
89 			      void *lsm_data);
90 static void ib_policy_change_task(struct work_struct *work);
91 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
92 
93 static struct notifier_block ibdev_lsm_nb = {
94 	.notifier_call = ib_security_change,
95 };
96 
97 static int ib_device_check_mandatory(struct ib_device *device)
98 {
99 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
100 	static const struct {
101 		size_t offset;
102 		char  *name;
103 	} mandatory_table[] = {
104 		IB_MANDATORY_FUNC(query_device),
105 		IB_MANDATORY_FUNC(query_port),
106 		IB_MANDATORY_FUNC(query_pkey),
107 		IB_MANDATORY_FUNC(alloc_pd),
108 		IB_MANDATORY_FUNC(dealloc_pd),
109 		IB_MANDATORY_FUNC(create_qp),
110 		IB_MANDATORY_FUNC(modify_qp),
111 		IB_MANDATORY_FUNC(destroy_qp),
112 		IB_MANDATORY_FUNC(post_send),
113 		IB_MANDATORY_FUNC(post_recv),
114 		IB_MANDATORY_FUNC(create_cq),
115 		IB_MANDATORY_FUNC(destroy_cq),
116 		IB_MANDATORY_FUNC(poll_cq),
117 		IB_MANDATORY_FUNC(req_notify_cq),
118 		IB_MANDATORY_FUNC(get_dma_mr),
119 		IB_MANDATORY_FUNC(dereg_mr),
120 		IB_MANDATORY_FUNC(get_port_immutable)
121 	};
122 	int i;
123 
124 	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
125 		if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
126 			dev_warn(&device->dev,
127 				 "Device is missing mandatory function %s\n",
128 				 mandatory_table[i].name);
129 			return -EINVAL;
130 		}
131 	}
132 
133 	return 0;
134 }
135 
136 static struct ib_device *__ib_device_get_by_index(u32 index)
137 {
138 	struct ib_device *device;
139 
140 	list_for_each_entry(device, &device_list, core_list)
141 		if (device->index == index)
142 			return device;
143 
144 	return NULL;
145 }
146 
147 /*
148  * The caller must call ib_device_put() to release the device reference
149  * when ib_device_get_by_index() returns a valid device pointer.
150  */
151 struct ib_device *ib_device_get_by_index(u32 index)
152 {
153 	struct ib_device *device;
154 
155 	down_read(&lists_rwsem);
156 	device = __ib_device_get_by_index(index);
157 	if (device) {
158 		/* Do not return a device if unregistration has started. */
159 		if (!refcount_inc_not_zero(&device->refcount))
160 			device = NULL;
161 	}
162 	up_read(&lists_rwsem);
163 	return device;
164 }
165 
166 void ib_device_put(struct ib_device *device)
167 {
168 	if (refcount_dec_and_test(&device->refcount))
169 		complete(&device->unreg_completion);
170 }
171 
172 static struct ib_device *__ib_device_get_by_name(const char *name)
173 {
174 	struct ib_device *device;
175 
176 	list_for_each_entry(device, &device_list, core_list)
177 		if (!strcmp(name, dev_name(&device->dev)))
178 			return device;
179 
180 	return NULL;
181 }
182 
183 int ib_device_rename(struct ib_device *ibdev, const char *name)
184 {
185 	struct ib_device *device;
186 	int ret = 0;
187 
188 	if (!strcmp(name, dev_name(&ibdev->dev)))
189 		return ret;
190 
191 	mutex_lock(&device_mutex);
192 	list_for_each_entry(device, &device_list, core_list) {
193 		if (!strcmp(name, dev_name(&device->dev))) {
194 			ret = -EEXIST;
195 			goto out;
196 		}
197 	}
198 
199 	ret = device_rename(&ibdev->dev, name);
200 	if (ret)
201 		goto out;
202 	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
203 out:
204 	mutex_unlock(&device_mutex);
205 	return ret;
206 }
207 
208 static int alloc_name(struct ib_device *ibdev, const char *name)
209 {
210 	unsigned long *inuse;
211 	struct ib_device *device;
212 	int i;
213 
214 	inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
215 	if (!inuse)
216 		return -ENOMEM;
217 
218 	list_for_each_entry(device, &device_list, core_list) {
219 		char buf[IB_DEVICE_NAME_MAX];
220 
221 		if (sscanf(dev_name(&device->dev), name, &i) != 1)
222 			continue;
223 		if (i < 0 || i >= PAGE_SIZE * 8)
224 			continue;
225 		snprintf(buf, sizeof buf, name, i);
226 		if (!strcmp(buf, dev_name(&device->dev)))
227 			set_bit(i, inuse);
228 	}
229 
230 	i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
231 	free_page((unsigned long) inuse);
232 
233 	return dev_set_name(&ibdev->dev, name, i);
234 }
235 
236 static void ib_device_release(struct device *device)
237 {
238 	struct ib_device *dev = container_of(device, struct ib_device, dev);
239 
240 	WARN_ON(dev->reg_state == IB_DEV_REGISTERED);
241 	if (dev->reg_state == IB_DEV_UNREGISTERED) {
242 		/*
243 		 * In the IB_DEV_UNINITIALIZED state the cache and port table
244 		 * have not been created yet, so free them only once the
245 		 * device has reached the UNREGISTERED state.
246 		 */
247 		ib_cache_release_one(dev);
248 		kfree(dev->port_immutable);
249 	}
250 	kfree(dev);
251 }
252 
253 static int ib_device_uevent(struct device *device,
254 			    struct kobj_uevent_env *env)
255 {
256 	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
257 		return -ENOMEM;
258 
259 	/*
260 	 * It would be nice to pass the node GUID with the event...
261 	 */
262 
263 	return 0;
264 }
265 
266 static struct class ib_class = {
267 	.name    = "infiniband",
268 	.dev_release = ib_device_release,
269 	.dev_uevent = ib_device_uevent,
270 };
271 
272 /**
273  * ib_alloc_device - allocate an IB device struct
274  * @size:size of structure to allocate
275  *
276  * Low-level drivers should use ib_alloc_device() to allocate &struct
277  * ib_device.  @size is the size of the structure to be allocated,
278  * including any private data used by the low-level driver.
279  * ib_dealloc_device() must be used to free structures allocated with
280  * ib_alloc_device().
281  */
282 struct ib_device *ib_alloc_device(size_t size)
283 {
284 	struct ib_device *device;
285 
286 	if (WARN_ON(size < sizeof(struct ib_device)))
287 		return NULL;
288 
289 	device = kzalloc(size, GFP_KERNEL);
290 	if (!device)
291 		return NULL;
292 
293 	rdma_restrack_init(&device->res);
294 
295 	device->dev.class = &ib_class;
296 	device_initialize(&device->dev);
297 
298 	dev_set_drvdata(&device->dev, device);
299 
300 	INIT_LIST_HEAD(&device->event_handler_list);
301 	spin_lock_init(&device->event_handler_lock);
302 	rwlock_init(&device->client_data_lock);
303 	INIT_LIST_HEAD(&device->client_data_list);
304 	INIT_LIST_HEAD(&device->port_list);
305 	refcount_set(&device->refcount, 1);
306 	init_completion(&device->unreg_completion);
307 
308 	return device;
309 }
310 EXPORT_SYMBOL(ib_alloc_device);
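
/*
 * Illustrative sketch (not from the original file; "struct my_dev" is a
 * hypothetical driver structure): providers typically embed struct ib_device
 * as the first member of their own device structure and allocate both at
 * once through ib_alloc_device():
 *
 *	struct my_dev {
 *		struct ib_device ibdev;
 *		void __iomem	*regs;
 *	};
 *
 *	struct my_dev *mdev =
 *		(struct my_dev *)ib_alloc_device(sizeof(struct my_dev));
 *	if (!mdev)
 *		return -ENOMEM;
 */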
311 
312 /**
313  * ib_dealloc_device - free an IB device struct
314  * @device:structure to free
315  *
316  * Free a structure allocated with ib_alloc_device().
317  */
318 void ib_dealloc_device(struct ib_device *device)
319 {
320 	WARN_ON(!list_empty(&device->client_data_list));
321 	WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
322 		device->reg_state != IB_DEV_UNINITIALIZED);
323 	rdma_restrack_clean(&device->res);
324 	put_device(&device->dev);
325 }
326 EXPORT_SYMBOL(ib_dealloc_device);
327 
328 static int add_client_context(struct ib_device *device, struct ib_client *client)
329 {
330 	struct ib_client_data *context;
331 
332 	context = kmalloc(sizeof(*context), GFP_KERNEL);
333 	if (!context)
334 		return -ENOMEM;
335 
336 	context->client = client;
337 	context->data   = NULL;
338 	context->going_down = false;
339 
340 	down_write(&lists_rwsem);
341 	write_lock_irq(&device->client_data_lock);
342 	list_add(&context->list, &device->client_data_list);
343 	write_unlock_irq(&device->client_data_lock);
344 	up_write(&lists_rwsem);
345 
346 	return 0;
347 }
348 
349 static int verify_immutable(const struct ib_device *dev, u8 port)
350 {
351 	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
352 			    rdma_max_mad_size(dev, port) != 0);
353 }
354 
355 static int read_port_immutable(struct ib_device *device)
356 {
357 	int ret;
358 	u8 start_port = rdma_start_port(device);
359 	u8 end_port = rdma_end_port(device);
360 	u8 port;
361 
362 	/*
363 	 * device->port_immutable is indexed directly by the port number to make
364 	 * access to this data as efficient as possible.
365 	 *
366 	 * Therefore port_immutable is declared as a 1-based array with
367 	 * potential empty slots at the beginning.
368 	 */
369 	device->port_immutable = kcalloc(end_port + 1,
370 					 sizeof(*device->port_immutable),
371 					 GFP_KERNEL);
372 	if (!device->port_immutable)
373 		return -ENOMEM;
374 
375 	for (port = start_port; port <= end_port; ++port) {
376 		ret = device->get_port_immutable(device, port,
377 						 &device->port_immutable[port]);
378 		if (ret)
379 			return ret;
380 
381 		if (verify_immutable(device, port))
382 			return -EINVAL;
383 	}
384 	return 0;
385 }
386 
387 void ib_get_device_fw_str(struct ib_device *dev, char *str)
388 {
389 	if (dev->get_dev_fw_str)
390 		dev->get_dev_fw_str(dev, str);
391 	else
392 		str[0] = '\0';
393 }
394 EXPORT_SYMBOL(ib_get_device_fw_str);
395 
396 static int setup_port_pkey_list(struct ib_device *device)
397 {
398 	int i;
399 
400 	/*
401 	 * device->port_pkey_list is indexed directly by the port number.
402 	 * Therefore it is declared as a 1-based array with potential empty
403 	 * slots at the beginning.
404 	 */
405 	device->port_pkey_list = kcalloc(rdma_end_port(device) + 1,
406 					 sizeof(*device->port_pkey_list),
407 					 GFP_KERNEL);
408 
409 	if (!device->port_pkey_list)
410 		return -ENOMEM;
411 
412 	for (i = 0; i < (rdma_end_port(device) + 1); i++) {
413 		spin_lock_init(&device->port_pkey_list[i].list_lock);
414 		INIT_LIST_HEAD(&device->port_pkey_list[i].pkey_list);
415 	}
416 
417 	return 0;
418 }
419 
420 static void ib_policy_change_task(struct work_struct *work)
421 {
422 	struct ib_device *dev;
423 
424 	down_read(&lists_rwsem);
425 	list_for_each_entry(dev, &device_list, core_list) {
426 		int i;
427 
428 		for (i = rdma_start_port(dev); i <= rdma_end_port(dev); i++) {
429 			u64 sp;
430 			int ret = ib_get_cached_subnet_prefix(dev,
431 							      i,
432 							      &sp);
433 
434 			WARN_ONCE(ret,
435 				  "ib_get_cached_subnet_prefix err: %d, this should never happen here\n",
436 				  ret);
437 			if (!ret)
438 				ib_security_cache_change(dev, i, sp);
439 		}
440 	}
441 	up_read(&lists_rwsem);
442 }
443 
444 static int ib_security_change(struct notifier_block *nb, unsigned long event,
445 			      void *lsm_data)
446 {
447 	if (event != LSM_POLICY_CHANGE)
448 		return NOTIFY_DONE;
449 
450 	schedule_work(&ib_policy_change_work);
451 
452 	return NOTIFY_OK;
453 }
454 
455 /**
456  *	__dev_new_index	-	allocate a device index
457  *
458  *	Returns a suitable unique value for a new device interface
459  *	number.  It assumes that fewer than 2^32-1 IB devices will be
460  *	present in the system.
461  */
462 static u32 __dev_new_index(void)
463 {
464 	/*
465 	 * The device index to allow stable naming.
466 	 * Similar to struct net -> ifindex.
467 	 */
468 	static u32 index;
469 
470 	for (;;) {
471 		if (!(++index))
472 			index = 1;
473 
474 		if (!__ib_device_get_by_index(index))
475 			return index;
476 	}
477 }
478 
479 static void setup_dma_device(struct ib_device *device)
480 {
481 	struct device *parent = device->dev.parent;
482 
483 	WARN_ON_ONCE(device->dma_device);
484 	if (device->dev.dma_ops) {
485 		/*
486 		 * The caller provided custom DMA operations. Copy the
487 		 * DMA-related fields that are used by e.g. dma_alloc_coherent()
488 		 * into device->dev.
489 		 */
490 		device->dma_device = &device->dev;
491 		if (!device->dev.dma_mask) {
492 			if (parent)
493 				device->dev.dma_mask = parent->dma_mask;
494 			else
495 				WARN_ON_ONCE(true);
496 		}
497 		if (!device->dev.coherent_dma_mask) {
498 			if (parent)
499 				device->dev.coherent_dma_mask =
500 					parent->coherent_dma_mask;
501 			else
502 				WARN_ON_ONCE(true);
503 		}
504 	} else {
505 		/*
506 		 * The caller did not provide custom DMA operations. Use the
507 		 * DMA mapping operations of the parent device.
508 		 */
509 		WARN_ON_ONCE(!parent);
510 		device->dma_device = parent;
511 	}
512 }
513 
514 static void cleanup_device(struct ib_device *device)
515 {
516 	ib_cache_cleanup_one(device);
517 	ib_cache_release_one(device);
518 	kfree(device->port_pkey_list);
519 	kfree(device->port_immutable);
520 }
521 
522 static int setup_device(struct ib_device *device)
523 {
524 	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
525 	int ret;
526 
527 	ret = ib_device_check_mandatory(device);
528 	if (ret)
529 		return ret;
530 
531 	ret = read_port_immutable(device);
532 	if (ret) {
533 		dev_warn(&device->dev,
534 			 "Couldn't create per-port immutable data\n");
535 		return ret;
536 	}
537 
538 	memset(&device->attrs, 0, sizeof(device->attrs));
539 	ret = device->query_device(device, &device->attrs, &uhw);
540 	if (ret) {
541 		dev_warn(&device->dev,
542 			 "Couldn't query the device attributes\n");
543 		goto port_cleanup;
544 	}
545 
546 	ret = setup_port_pkey_list(device);
547 	if (ret) {
548 		dev_warn(&device->dev, "Couldn't create per-port pkey_list\n");
549 		goto port_cleanup;
550 	}
551 
552 	ret = ib_cache_setup_one(device);
553 	if (ret) {
554 		dev_warn(&device->dev,
555 			 "Couldn't set up InfiniBand P_Key/GID cache\n");
556 		goto pkey_cleanup;
557 	}
558 	return 0;
559 
560 pkey_cleanup:
561 	kfree(device->port_pkey_list);
562 port_cleanup:
563 	kfree(device->port_immutable);
564 	return ret;
565 }
566 
567 /**
568  * ib_register_device - Register an IB device with IB core
569  * @device:Device to register
570  *
571  * Low-level drivers use ib_register_device() to register their
572  * devices with the IB core.  All registered clients will receive a
573  * callback for each device that is added. @device must be allocated
574  * with ib_alloc_device(); a "%d" in @name is replaced with a unique index.
575  */
576 int ib_register_device(struct ib_device *device, const char *name,
577 		       int (*port_callback)(struct ib_device *, u8,
578 					    struct kobject *))
579 {
580 	int ret;
581 	struct ib_client *client;
582 
583 	setup_dma_device(device);
584 
585 	mutex_lock(&device_mutex);
586 
587 	if (strchr(name, '%')) {
588 		ret = alloc_name(device, name);
589 		if (ret)
590 			goto out;
591 	} else {
592 		ret = dev_set_name(&device->dev, name);
593 		if (ret)
594 			goto out;
595 	}
596 	if (__ib_device_get_by_name(dev_name(&device->dev))) {
597 		ret = -ENFILE;
598 		goto out;
599 	}
600 	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
601 
602 	ret = setup_device(device);
603 	if (ret)
604 		goto out;
605 
606 	device->index = __dev_new_index();
607 
608 	ret = ib_device_register_rdmacg(device);
609 	if (ret) {
610 		dev_warn(&device->dev,
611 			 "Couldn't register device with rdma cgroup\n");
612 		goto dev_cleanup;
613 	}
614 
615 	ret = ib_device_register_sysfs(device, port_callback);
616 	if (ret) {
617 		dev_warn(&device->dev,
618 			 "Couldn't register device with driver model\n");
619 		goto cg_cleanup;
620 	}
621 
622 	device->reg_state = IB_DEV_REGISTERED;
623 
624 	list_for_each_entry(client, &client_list, list)
625 		if (!add_client_context(device, client) && client->add)
626 			client->add(device);
627 
628 	down_write(&lists_rwsem);
629 	list_add_tail(&device->core_list, &device_list);
630 	up_write(&lists_rwsem);
631 	mutex_unlock(&device_mutex);
632 	return 0;
633 
634 cg_cleanup:
635 	ib_device_unregister_rdmacg(device);
636 dev_cleanup:
637 	cleanup_device(device);
638 out:
639 	mutex_unlock(&device_mutex);
640 	return ret;
641 }
642 EXPORT_SYMBOL(ib_register_device);
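
/*
 * Illustrative sketch (hypothetical names, error handling trimmed): a
 * provider fills in its verbs on the embedded struct ib_device and then
 * registers it; a "%d" in the name lets alloc_name() pick a free index:
 *
 *	mdev->ibdev.query_device = my_query_device;
 *	mdev->ibdev.query_port	 = my_query_port;
 *	...
 *	ret = ib_register_device(&mdev->ibdev, "my%d", NULL);
 *	if (ret)
 *		goto err_dealloc;
 */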
643 
644 /**
645  * ib_unregister_device - Unregister an IB device
646  * @device:Device to unregister
647  *
648  * Unregister an IB device.  All clients will receive a remove callback.
649  */
650 void ib_unregister_device(struct ib_device *device)
651 {
652 	struct ib_client_data *context, *tmp;
653 	unsigned long flags;
654 
655 	/*
656 	 * Wait for all netlink command callers to finish working on the
657 	 * device.
658 	 */
659 	ib_device_put(device);
660 	wait_for_completion(&device->unreg_completion);
661 
662 	mutex_lock(&device_mutex);
663 
664 	down_write(&lists_rwsem);
665 	list_del(&device->core_list);
666 	write_lock_irq(&device->client_data_lock);
667 	list_for_each_entry(context, &device->client_data_list, list)
668 		context->going_down = true;
669 	write_unlock_irq(&device->client_data_lock);
670 	downgrade_write(&lists_rwsem);
671 
672 	list_for_each_entry(context, &device->client_data_list, list) {
673 		if (context->client->remove)
674 			context->client->remove(device, context->data);
675 	}
676 	up_read(&lists_rwsem);
677 
678 	ib_device_unregister_sysfs(device);
679 	ib_device_unregister_rdmacg(device);
680 
681 	mutex_unlock(&device_mutex);
682 
683 	ib_cache_cleanup_one(device);
684 
685 	ib_security_destroy_port_pkey_list(device);
686 	kfree(device->port_pkey_list);
687 
688 	down_write(&lists_rwsem);
689 	write_lock_irqsave(&device->client_data_lock, flags);
690 	list_for_each_entry_safe(context, tmp, &device->client_data_list,
691 				 list) {
692 		list_del(&context->list);
693 		kfree(context);
694 	}
695 	write_unlock_irqrestore(&device->client_data_lock, flags);
696 	up_write(&lists_rwsem);
697 
698 	device->reg_state = IB_DEV_UNREGISTERED;
699 }
700 EXPORT_SYMBOL(ib_unregister_device);
701 
702 /**
703  * ib_register_client - Register an IB client
704  * @client:Client to register
705  *
706  * Upper level users of the IB drivers can use ib_register_client() to
707  * register callbacks for IB device addition and removal.  When an IB
708  * device is added, each registered client's add method will be called
709  * (in the order the clients were registered), and when a device is
710  * removed, each client's remove method will be called (in the reverse
711  * order that clients were registered).  In addition, when
712  * ib_register_client() is called, the client will receive an add
713  * callback for all devices already registered.
714  */
715 int ib_register_client(struct ib_client *client)
716 {
717 	struct ib_device *device;
718 
719 	mutex_lock(&device_mutex);
720 
721 	list_for_each_entry(device, &device_list, core_list)
722 		if (!add_client_context(device, client) && client->add)
723 			client->add(device);
724 
725 	down_write(&lists_rwsem);
726 	list_add_tail(&client->list, &client_list);
727 	up_write(&lists_rwsem);
728 
729 	mutex_unlock(&device_mutex);
730 
731 	return 0;
732 }
733 EXPORT_SYMBOL(ib_register_client);
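
/*
 * Illustrative sketch (hypothetical names): an upper-layer protocol defines
 * a struct ib_client and registers it once; add() is called for every
 * ib_device already present and for each one registered later:
 *
 *	static void my_add_one(struct ib_device *device)
 *	{ ... }
 *
 *	static void my_remove_one(struct ib_device *device, void *client_data)
 *	{ ... }
 *
 *	static struct ib_client my_client = {
 *		.name	= "my_client",
 *		.add	= my_add_one,
 *		.remove	= my_remove_one,
 *	};
 *
 *	ret = ib_register_client(&my_client);
 */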
734 
735 /**
736  * ib_unregister_client - Unregister an IB client
737  * @client:Client to unregister
738  *
739  * Upper level users use ib_unregister_client() to remove their client
740  * registration.  When ib_unregister_client() is called, the client
741  * will receive a remove callback for each IB device still registered.
742  */
743 void ib_unregister_client(struct ib_client *client)
744 {
745 	struct ib_client_data *context;
746 	struct ib_device *device;
747 
748 	mutex_lock(&device_mutex);
749 
750 	down_write(&lists_rwsem);
751 	list_del(&client->list);
752 	up_write(&lists_rwsem);
753 
754 	list_for_each_entry(device, &device_list, core_list) {
755 		struct ib_client_data *found_context = NULL;
756 
757 		down_write(&lists_rwsem);
758 		write_lock_irq(&device->client_data_lock);
759 		list_for_each_entry(context, &device->client_data_list, list)
760 			if (context->client == client) {
761 				context->going_down = true;
762 				found_context = context;
763 				break;
764 			}
765 		write_unlock_irq(&device->client_data_lock);
766 		up_write(&lists_rwsem);
767 
768 		if (client->remove)
769 			client->remove(device, found_context ?
770 					       found_context->data : NULL);
771 
772 		if (!found_context) {
773 			dev_warn(&device->dev,
774 				 "No client context found for %s\n",
775 				 client->name);
776 			continue;
777 		}
778 
779 		down_write(&lists_rwsem);
780 		write_lock_irq(&device->client_data_lock);
781 		list_del(&found_context->list);
782 		write_unlock_irq(&device->client_data_lock);
783 		up_write(&lists_rwsem);
784 		kfree(found_context);
785 	}
786 
787 	mutex_unlock(&device_mutex);
788 }
789 EXPORT_SYMBOL(ib_unregister_client);
790 
791 /**
792  * ib_get_client_data - Get IB client context
793  * @device:Device to get context for
794  * @client:Client to get context for
795  *
796  * ib_get_client_data() returns client context set with
797  * ib_set_client_data().
798  */
799 void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
800 {
801 	struct ib_client_data *context;
802 	void *ret = NULL;
803 	unsigned long flags;
804 
805 	read_lock_irqsave(&device->client_data_lock, flags);
806 	list_for_each_entry(context, &device->client_data_list, list)
807 		if (context->client == client) {
808 			ret = context->data;
809 			break;
810 		}
811 	read_unlock_irqrestore(&device->client_data_lock, flags);
812 
813 	return ret;
814 }
815 EXPORT_SYMBOL(ib_get_client_data);
816 
817 /**
818  * ib_set_client_data - Set IB client context
819  * @device:Device to set context for
820  * @client:Client to set context for
821  * @data:Context to set
822  *
823  * ib_set_client_data() sets client context that can be retrieved with
824  * ib_get_client_data().
825  */
826 void ib_set_client_data(struct ib_device *device, struct ib_client *client,
827 			void *data)
828 {
829 	struct ib_client_data *context;
830 	unsigned long flags;
831 
832 	write_lock_irqsave(&device->client_data_lock, flags);
833 	list_for_each_entry(context, &device->client_data_list, list)
834 		if (context->client == client) {
835 			context->data = data;
836 			goto out;
837 		}
838 
839 	dev_warn(&device->dev, "No client context found for %s\n",
840 		 client->name);
841 
842 out:
843 	write_unlock_irqrestore(&device->client_data_lock, flags);
844 }
845 EXPORT_SYMBOL(ib_set_client_data);
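
/*
 * Illustrative sketch (hypothetical names): a client normally allocates its
 * per-device state in its add() callback, stores it with
 * ib_set_client_data(), and retrieves it later with ib_get_client_data()
 * wherever only the ib_device pointer is available:
 *
 *	static void my_add_one(struct ib_device *device)
 *	{
 *		struct my_state *st = kzalloc(sizeof(*st), GFP_KERNEL);
 *
 *		if (st)
 *			ib_set_client_data(device, &my_client, st);
 *	}
 *
 *	struct my_state *st = ib_get_client_data(device, &my_client);
 */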
846 
847 /**
848  * ib_register_event_handler - Register an IB event handler
849  * @event_handler:Handler to register
850  *
851  * ib_register_event_handler() registers an event handler that will be
852  * called back when asynchronous IB events occur (as defined in
853  * chapter 11 of the InfiniBand Architecture Specification).  This
854  * callback may occur in interrupt context.
855  */
856 void ib_register_event_handler(struct ib_event_handler *event_handler)
857 {
858 	unsigned long flags;
859 
860 	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
861 	list_add_tail(&event_handler->list,
862 		      &event_handler->device->event_handler_list);
863 	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
864 }
865 EXPORT_SYMBOL(ib_register_event_handler);
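
/*
 * Illustrative sketch (hypothetical names): consumers initialize the handler
 * with INIT_IB_EVENT_HANDLER() from <rdma/ib_verbs.h> before registering it;
 * note that the callback may run in interrupt context:
 *
 *	static void my_event_handler(struct ib_event_handler *handler,
 *				     struct ib_event *event)
 *	{ ... }
 *
 *	INIT_IB_EVENT_HANDLER(&priv->event_handler, device, my_event_handler);
 *	ib_register_event_handler(&priv->event_handler);
 */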
866 
867 /**
868  * ib_unregister_event_handler - Unregister an event handler
869  * @event_handler:Handler to unregister
870  *
871  * Unregister an event handler registered with
872  * ib_register_event_handler().
873  */
874 void ib_unregister_event_handler(struct ib_event_handler *event_handler)
875 {
876 	unsigned long flags;
877 
878 	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
879 	list_del(&event_handler->list);
880 	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
881 }
882 EXPORT_SYMBOL(ib_unregister_event_handler);
883 
884 /**
885  * ib_dispatch_event - Dispatch an asynchronous event
886  * @event:Event to dispatch
887  *
888  * Low-level drivers must call ib_dispatch_event() to dispatch the
889  * event to all registered event handlers when an asynchronous event
890  * occurs.
891  */
892 void ib_dispatch_event(struct ib_event *event)
893 {
894 	unsigned long flags;
895 	struct ib_event_handler *handler;
896 
897 	spin_lock_irqsave(&event->device->event_handler_lock, flags);
898 
899 	list_for_each_entry(handler, &event->device->event_handler_list, list)
900 		handler->handler(handler, event);
901 
902 	spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
903 }
904 EXPORT_SYMBOL(ib_dispatch_event);
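
/*
 * Illustrative sketch: a low-level driver reports, for instance, a port
 * coming up by filling a struct ib_event on the stack and dispatching it
 * ("mdev" is a hypothetical driver context):
 *
 *	struct ib_event event;
 *
 *	event.device		= &mdev->ibdev;
 *	event.element.port_num	= port;
 *	event.event		= IB_EVENT_PORT_ACTIVE;
 *	ib_dispatch_event(&event);
 */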
905 
906 /**
907  * ib_query_port - Query IB port attributes
908  * @device:Device to query
909  * @port_num:Port number to query
910  * @port_attr:Port attributes
911  *
912  * ib_query_port() returns the attributes of a port through the
913  * @port_attr pointer.
914  */
915 int ib_query_port(struct ib_device *device,
916 		  u8 port_num,
917 		  struct ib_port_attr *port_attr)
918 {
919 	union ib_gid gid;
920 	int err;
921 
922 	if (!rdma_is_port_valid(device, port_num))
923 		return -EINVAL;
924 
925 	memset(port_attr, 0, sizeof(*port_attr));
926 	err = device->query_port(device, port_num, port_attr);
927 	if (err || port_attr->subnet_prefix)
928 		return err;
929 
930 	if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
931 		return 0;
932 
933 	err = device->query_gid(device, port_num, 0, &gid);
934 	if (err)
935 		return err;
936 
937 	port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
938 	return 0;
939 }
940 EXPORT_SYMBOL(ib_query_port);
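
/*
 * Illustrative sketch: a typical caller checks a port's link state (port
 * numbers are 1-based):
 *
 *	struct ib_port_attr attr;
 *
 *	ret = ib_query_port(device, port_num, &attr);
 *	if (!ret && attr.state == IB_PORT_ACTIVE)
 *		pr_debug("port %u is active, lid 0x%x\n", port_num, attr.lid);
 */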
941 
942 /**
943  * ib_enum_roce_netdev - enumerate all RoCE ports
944  * @ib_dev: IB device we want to query
945  * @filter: Should we call the callback?
946  * @filter_cookie: Cookie passed to filter
947  * @cb: Callback to call for each found RoCE port
948  * @cookie: Cookie passed back to the callback
949  *
950  * Enumerates all of the physical RoCE ports of ib_dev
951  * which are related to a netdevice and calls the callback on each
952  * port for which the filter function returns a non-zero value.
953  */
954 void ib_enum_roce_netdev(struct ib_device *ib_dev,
955 			 roce_netdev_filter filter,
956 			 void *filter_cookie,
957 			 roce_netdev_callback cb,
958 			 void *cookie)
959 {
960 	u8 port;
961 
962 	for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
963 	     port++)
964 		if (rdma_protocol_roce(ib_dev, port)) {
965 			struct net_device *idev = NULL;
966 
967 			if (ib_dev->get_netdev)
968 				idev = ib_dev->get_netdev(ib_dev, port);
969 
970 			if (idev &&
971 			    idev->reg_state >= NETREG_UNREGISTERED) {
972 				dev_put(idev);
973 				idev = NULL;
974 			}
975 
976 			if (filter(ib_dev, port, idev, filter_cookie))
977 				cb(ib_dev, port, idev, cookie);
978 
979 			if (idev)
980 				dev_put(idev);
981 		}
982 }
983 
984 /**
985  * ib_enum_all_roce_netdevs - enumerate all RoCE devices
986  * @filter: Should we call the callback?
987  * @filter_cookie: Cookie passed to filter
988  * @cb: Callback to call for each found RoCE port
989  * @cookie: Cookie passed back to the callback
990  *
991  * Enumerates the physical ports of all RoCE devices which are related
992  * to a netdevice and calls the callback on each port for which the
993  * filter function returns a non-zero value.
994  */
995 void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
996 			      void *filter_cookie,
997 			      roce_netdev_callback cb,
998 			      void *cookie)
999 {
1000 	struct ib_device *dev;
1001 
1002 	down_read(&lists_rwsem);
1003 	list_for_each_entry(dev, &device_list, core_list)
1004 		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
1005 	up_read(&lists_rwsem);
1006 }
1007 
1008 /**
1009  * ib_enum_all_devs - enumerate all ib_devices
1010  * @nldev_cb: Callback to call for each found ib_device
1011  *
1012  * Enumerates all ib_devices and calls callback() on each device.
1013  */
1014 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
1015 		     struct netlink_callback *cb)
1016 {
1017 	struct ib_device *dev;
1018 	unsigned int idx = 0;
1019 	int ret = 0;
1020 
1021 	down_read(&lists_rwsem);
1022 	list_for_each_entry(dev, &device_list, core_list) {
1023 		ret = nldev_cb(dev, skb, cb, idx);
1024 		if (ret)
1025 			break;
1026 		idx++;
1027 	}
1028 
1029 	up_read(&lists_rwsem);
1030 	return ret;
1031 }
1032 
1033 /**
1034  * ib_query_pkey - Get P_Key table entry
1035  * @device:Device to query
1036  * @port_num:Port number to query
1037  * @index:P_Key table index to query
1038  * @pkey:Returned P_Key
1039  *
1040  * ib_query_pkey() fetches the specified P_Key table entry.
1041  */
1042 int ib_query_pkey(struct ib_device *device,
1043 		  u8 port_num, u16 index, u16 *pkey)
1044 {
1045 	return device->query_pkey(device, port_num, index, pkey);
1046 }
1047 EXPORT_SYMBOL(ib_query_pkey);
1048 
1049 /**
1050  * ib_modify_device - Change IB device attributes
1051  * @device:Device to modify
1052  * @device_modify_mask:Mask of attributes to change
1053  * @device_modify:New attribute values
1054  *
1055  * ib_modify_device() changes a device's attributes as specified by
1056  * the @device_modify_mask and @device_modify structure.
1057  */
1058 int ib_modify_device(struct ib_device *device,
1059 		     int device_modify_mask,
1060 		     struct ib_device_modify *device_modify)
1061 {
1062 	if (!device->modify_device)
1063 		return -ENOSYS;
1064 
1065 	return device->modify_device(device, device_modify_mask,
1066 				     device_modify);
1067 }
1068 EXPORT_SYMBOL(ib_modify_device);
1069 
1070 /**
1071  * ib_modify_port - Modifies the attributes for the specified port.
1072  * @device: The device to modify.
1073  * @port_num: The number of the port to modify.
1074  * @port_modify_mask: Mask used to specify which attributes of the port
1075  *   to change.
1076  * @port_modify: New attribute values for the port.
1077  *
1078  * ib_modify_port() changes a port's attributes as specified by the
1079  * @port_modify_mask and @port_modify structure.
1080  */
1081 int ib_modify_port(struct ib_device *device,
1082 		   u8 port_num, int port_modify_mask,
1083 		   struct ib_port_modify *port_modify)
1084 {
1085 	int rc;
1086 
1087 	if (!rdma_is_port_valid(device, port_num))
1088 		return -EINVAL;
1089 
1090 	if (device->modify_port)
1091 		rc = device->modify_port(device, port_num, port_modify_mask,
1092 					   port_modify);
1093 	else
1094 		rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
1095 	return rc;
1096 }
1097 EXPORT_SYMBOL(ib_modify_port);
1098 
1099 /**
1100  * ib_find_gid - Returns the port number and GID table index where
1101  *   a specified GID value occurs. It searches only the IB link layer.
1102  * @device: The device to query.
1103  * @gid: The GID value to search for.
1104  * @port_num: The port number of the device where the GID value was found.
1105  * @index: The index into the GID table where the GID was found.  This
1106  *   parameter may be NULL.
1107  */
1108 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
1109 		u8 *port_num, u16 *index)
1110 {
1111 	union ib_gid tmp_gid;
1112 	int ret, port, i;
1113 
1114 	for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
1115 		if (!rdma_protocol_ib(device, port))
1116 			continue;
1117 
1118 		for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
1119 			ret = rdma_query_gid(device, port, i, &tmp_gid);
1120 			if (ret)
1121 				return ret;
1122 			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
1123 				*port_num = port;
1124 				if (index)
1125 					*index = i;
1126 				return 0;
1127 			}
1128 		}
1129 	}
1130 
1131 	return -ENOENT;
1132 }
1133 EXPORT_SYMBOL(ib_find_gid);
1134 
1135 /**
1136  * ib_find_pkey - Returns the PKey table index where a specified
1137  *   PKey value occurs.
1138  * @device: The device to query.
1139  * @port_num: The port number of the device to search for the PKey.
1140  * @pkey: The PKey value to search for.
1141  * @index: The index into the PKey table where the PKey was found.
1142  */
1143 int ib_find_pkey(struct ib_device *device,
1144 		 u8 port_num, u16 pkey, u16 *index)
1145 {
1146 	int ret, i;
1147 	u16 tmp_pkey;
1148 	int partial_ix = -1;
1149 
1150 	for (i = 0; i < device->port_immutable[port_num].pkey_tbl_len; ++i) {
1151 		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
1152 		if (ret)
1153 			return ret;
1154 		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
1155 			/* If there is a full-member pkey, take it. */
1156 			if (tmp_pkey & 0x8000) {
1157 				*index = i;
1158 				return 0;
1159 			}
1160 			if (partial_ix < 0)
1161 				partial_ix = i;
1162 		}
1163 	}
1164 
1165 	/* No full-member pkey found; if a limited member exists, take it. */
1166 	if (partial_ix >= 0) {
1167 		*index = partial_ix;
1168 		return 0;
1169 	}
1170 	return -ENOENT;
1171 }
1172 EXPORT_SYMBOL(ib_find_pkey);
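
/*
 * Illustrative sketch: ULPs commonly look up the table index of the default
 * partition key (0xffff) on a port:
 *
 *	u16 pkey_index;
 *
 *	ret = ib_find_pkey(device, port_num, 0xffff, &pkey_index);
 */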
1173 
1174 /**
1175  * ib_get_net_dev_by_params() - Return the appropriate net_dev
1176  * for a received CM request
1177  * @dev:	An RDMA device on which the request has been received.
1178  * @port:	Port number on the RDMA device.
1179  * @pkey:	The Pkey the request came on.
1180  * @gid:	A GID that the net_dev uses to communicate.
1181  * @addr:	Contains the IP address that the request specified as its
1182  *		destination.
1183  */
1184 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
1185 					    u8 port,
1186 					    u16 pkey,
1187 					    const union ib_gid *gid,
1188 					    const struct sockaddr *addr)
1189 {
1190 	struct net_device *net_dev = NULL;
1191 	struct ib_client_data *context;
1192 
1193 	if (!rdma_protocol_ib(dev, port))
1194 		return NULL;
1195 
1196 	down_read(&lists_rwsem);
1197 
1198 	list_for_each_entry(context, &dev->client_data_list, list) {
1199 		struct ib_client *client = context->client;
1200 
1201 		if (context->going_down)
1202 			continue;
1203 
1204 		if (client->get_net_dev_by_params) {
1205 			net_dev = client->get_net_dev_by_params(dev, port, pkey,
1206 								gid, addr,
1207 								context->data);
1208 			if (net_dev)
1209 				break;
1210 		}
1211 	}
1212 
1213 	up_read(&lists_rwsem);
1214 
1215 	return net_dev;
1216 }
1217 EXPORT_SYMBOL(ib_get_net_dev_by_params);
1218 
1219 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
1220 	[RDMA_NL_LS_OP_RESOLVE] = {
1221 		.doit = ib_nl_handle_resolve_resp,
1222 		.flags = RDMA_NL_ADMIN_PERM,
1223 	},
1224 	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
1225 		.doit = ib_nl_handle_set_timeout,
1226 		.flags = RDMA_NL_ADMIN_PERM,
1227 	},
1228 	[RDMA_NL_LS_OP_IP_RESOLVE] = {
1229 		.doit = ib_nl_handle_ip_res_resp,
1230 		.flags = RDMA_NL_ADMIN_PERM,
1231 	},
1232 };
1233 
1234 static int __init ib_core_init(void)
1235 {
1236 	int ret;
1237 
1238 	ib_wq = alloc_workqueue("infiniband", 0, 0);
1239 	if (!ib_wq)
1240 		return -ENOMEM;
1241 
1242 	ib_comp_wq = alloc_workqueue("ib-comp-wq",
1243 			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
1244 	if (!ib_comp_wq) {
1245 		ret = -ENOMEM;
1246 		goto err;
1247 	}
1248 
1249 	ib_comp_unbound_wq =
1250 		alloc_workqueue("ib-comp-unb-wq",
1251 				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
1252 				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
1253 	if (!ib_comp_unbound_wq) {
1254 		ret = -ENOMEM;
1255 		goto err_comp;
1256 	}
1257 
1258 	ret = class_register(&ib_class);
1259 	if (ret) {
1260 		pr_warn("Couldn't create InfiniBand device class\n");
1261 		goto err_comp_unbound;
1262 	}
1263 
1264 	ret = rdma_nl_init();
1265 	if (ret) {
1266 		pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
1267 		goto err_sysfs;
1268 	}
1269 
1270 	ret = addr_init();
1271 	if (ret) {
1272 		pr_warn("Couldn't init IB address resolution\n");
1273 		goto err_ibnl;
1274 	}
1275 
1276 	ret = ib_mad_init();
1277 	if (ret) {
1278 		pr_warn("Couldn't init IB MAD\n");
1279 		goto err_addr;
1280 	}
1281 
1282 	ret = ib_sa_init();
1283 	if (ret) {
1284 		pr_warn("Couldn't init SA\n");
1285 		goto err_mad;
1286 	}
1287 
1288 	ret = register_lsm_notifier(&ibdev_lsm_nb);
1289 	if (ret) {
1290 		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
1291 		goto err_sa;
1292 	}
1293 
1294 	nldev_init();
1295 	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
1296 	roce_gid_mgmt_init();
1297 
1298 	return 0;
1299 
1300 err_sa:
1301 	ib_sa_cleanup();
1302 err_mad:
1303 	ib_mad_cleanup();
1304 err_addr:
1305 	addr_cleanup();
1306 err_ibnl:
1307 	rdma_nl_exit();
1308 err_sysfs:
1309 	class_unregister(&ib_class);
1310 err_comp_unbound:
1311 	destroy_workqueue(ib_comp_unbound_wq);
1312 err_comp:
1313 	destroy_workqueue(ib_comp_wq);
1314 err:
1315 	destroy_workqueue(ib_wq);
1316 	return ret;
1317 }
1318 
1319 static void __exit ib_core_cleanup(void)
1320 {
1321 	roce_gid_mgmt_cleanup();
1322 	nldev_exit();
1323 	rdma_nl_unregister(RDMA_NL_LS);
1324 	unregister_lsm_notifier(&ibdev_lsm_nb);
1325 	ib_sa_cleanup();
1326 	ib_mad_cleanup();
1327 	addr_cleanup();
1328 	rdma_nl_exit();
1329 	class_unregister(&ib_class);
1330 	destroy_workqueue(ib_comp_unbound_wq);
1331 	destroy_workqueue(ib_comp_wq);
1332 	/* Make sure that any pending umem accounting work is done. */
1333 	destroy_workqueue(ib_wq);
1334 }
1335 
1336 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
1337 
1338 subsys_initcall(ib_core_init);
1339 module_exit(ib_core_cleanup);
1340