/*
 * Copyright (c) 2004 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>

#include "core_priv.h"
#include "restrack.h"

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");

struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);

/*
 * Each of the three rwsem locks (devices, clients, client_data) protects the
 * xarray of the same name. Specifically it allows the caller to assert that
 * the MARK will/will not be changing under the lock, and for devices and
 * clients, that the value in the xarray is still a valid pointer. Change of
 * the MARK is linked to the object state, so holding the lock and testing the
 * MARK also asserts that the contained object is in a certain state.
 *
 * This is used to build a two stage register/unregister flow where objects
 * can continue to be in the xarray even though they are still in progress to
 * register/unregister.
 *
 * The xarray itself provides additional locking, and restartable iteration,
 * which is also relied on.
 *
 * Locks should not be nested, with the exception of client_data, which is
 * allowed to nest under the read side of the other two locks.
 *
 * The devices_rwsem also protects the device name list, any change or
 * assignment of device name must also hold the write side to guarantee unique
 * names.
 */

/*
 * devices contains devices that have had their names assigned. The
 * devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static LIST_HEAD(client_list);
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

/**
 * struct rdma_dev_net - rdma net namespace metadata for a net
 * @net:	Pointer to owner net namespace
 * @id:		xarray id to identify the net namespace.
 */
struct rdma_dev_net {
	possible_net_t net;
	u32 id;
};

static unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require a init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether a rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev:	Pointer to rdma device which needs to be checked
 * @net:	Pointer to net namespace for which access to be checked
 *
 * When rdma devices are in shared mode (ib_devices_shared_netns is set),
 * the net namespace is ignored. When a rdma device is exclusive to a net
 * namespace, the rdma device net namespace is checked against the
 * specified one.
 *
 * Return: true if shared mode is enabled or if @net is the net namespace
 * recorded in @dev's core device, false otherwise.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
160 */ 161 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 162 xa_mark_t filter) 163 { 164 XA_STATE(xas, xa, *indexp); 165 void *entry; 166 167 rcu_read_lock(); 168 do { 169 entry = xas_find_marked(&xas, ULONG_MAX, filter); 170 if (xa_is_zero(entry)) 171 break; 172 } while (xas_retry(&xas, entry)); 173 rcu_read_unlock(); 174 175 if (entry) { 176 *indexp = xas.xa_index; 177 if (xa_is_zero(entry)) 178 return NULL; 179 return entry; 180 } 181 return XA_ERROR(-ENOENT); 182 } 183 #define xan_for_each_marked(xa, index, entry, filter) \ 184 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 185 !xa_is_err(entry); \ 186 (index)++, entry = xan_find_marked(xa, &(index), filter)) 187 188 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 189 static DEFINE_SPINLOCK(ndev_hash_lock); 190 static DECLARE_HASHTABLE(ndev_hash, 5); 191 192 static void free_netdevs(struct ib_device *ib_dev); 193 static void ib_unregister_work(struct work_struct *work); 194 static void __ib_unregister_device(struct ib_device *device); 195 static int ib_security_change(struct notifier_block *nb, unsigned long event, 196 void *lsm_data); 197 static void ib_policy_change_task(struct work_struct *work); 198 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 199 200 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 201 struct va_format *vaf) 202 { 203 if (ibdev && ibdev->dev.parent) 204 dev_printk_emit(level[1] - '0', 205 ibdev->dev.parent, 206 "%s %s %s: %pV", 207 dev_driver_string(ibdev->dev.parent), 208 dev_name(ibdev->dev.parent), 209 dev_name(&ibdev->dev), 210 vaf); 211 else if (ibdev) 212 printk("%s%s: %pV", 213 level, dev_name(&ibdev->dev), vaf); 214 else 215 printk("%s(NULL ib_device): %pV", level, vaf); 216 } 217 218 void ibdev_printk(const char *level, const struct ib_device *ibdev, 219 const char *format, ...) 
220 { 221 struct va_format vaf; 222 va_list args; 223 224 va_start(args, format); 225 226 vaf.fmt = format; 227 vaf.va = &args; 228 229 __ibdev_printk(level, ibdev, &vaf); 230 231 va_end(args); 232 } 233 EXPORT_SYMBOL(ibdev_printk); 234 235 #define define_ibdev_printk_level(func, level) \ 236 void func(const struct ib_device *ibdev, const char *fmt, ...) \ 237 { \ 238 struct va_format vaf; \ 239 va_list args; \ 240 \ 241 va_start(args, fmt); \ 242 \ 243 vaf.fmt = fmt; \ 244 vaf.va = &args; \ 245 \ 246 __ibdev_printk(level, ibdev, &vaf); \ 247 \ 248 va_end(args); \ 249 } \ 250 EXPORT_SYMBOL(func); 251 252 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 253 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 254 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 255 define_ibdev_printk_level(ibdev_err, KERN_ERR); 256 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 257 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 258 define_ibdev_printk_level(ibdev_info, KERN_INFO); 259 260 static struct notifier_block ibdev_lsm_nb = { 261 .notifier_call = ib_security_change, 262 }; 263 264 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 265 struct net *net); 266 267 /* Pointer to the RCU head at the start of the ib_port_data array */ 268 struct ib_port_data_rcu { 269 struct rcu_head rcu_head; 270 struct ib_port_data pdata[]; 271 }; 272 273 static void ib_device_check_mandatory(struct ib_device *device) 274 { 275 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 276 static const struct { 277 size_t offset; 278 char *name; 279 } mandatory_table[] = { 280 IB_MANDATORY_FUNC(query_device), 281 IB_MANDATORY_FUNC(query_port), 282 IB_MANDATORY_FUNC(query_pkey), 283 IB_MANDATORY_FUNC(alloc_pd), 284 IB_MANDATORY_FUNC(dealloc_pd), 285 IB_MANDATORY_FUNC(create_qp), 286 IB_MANDATORY_FUNC(modify_qp), 287 IB_MANDATORY_FUNC(destroy_qp), 288 IB_MANDATORY_FUNC(post_send), 289 IB_MANDATORY_FUNC(post_recv), 290 
IB_MANDATORY_FUNC(create_cq), 291 IB_MANDATORY_FUNC(destroy_cq), 292 IB_MANDATORY_FUNC(poll_cq), 293 IB_MANDATORY_FUNC(req_notify_cq), 294 IB_MANDATORY_FUNC(get_dma_mr), 295 IB_MANDATORY_FUNC(dereg_mr), 296 IB_MANDATORY_FUNC(get_port_immutable) 297 }; 298 int i; 299 300 device->kverbs_provider = true; 301 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 302 if (!*(void **) ((void *) &device->ops + 303 mandatory_table[i].offset)) { 304 device->kverbs_provider = false; 305 break; 306 } 307 } 308 } 309 310 /* 311 * Caller must perform ib_device_put() to return the device reference count 312 * when ib_device_get_by_index() returns valid device pointer. 313 */ 314 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 315 { 316 struct ib_device *device; 317 318 down_read(&devices_rwsem); 319 device = xa_load(&devices, index); 320 if (device) { 321 if (!rdma_dev_access_netns(device, net)) { 322 device = NULL; 323 goto out; 324 } 325 326 if (!ib_device_try_get(device)) 327 device = NULL; 328 } 329 out: 330 up_read(&devices_rwsem); 331 return device; 332 } 333 334 /** 335 * ib_device_put - Release IB device reference 336 * @device: device whose reference to be released 337 * 338 * ib_device_put() releases reference to the IB device to allow it to be 339 * unregistered and eventually free. 
340 */ 341 void ib_device_put(struct ib_device *device) 342 { 343 if (refcount_dec_and_test(&device->refcount)) 344 complete(&device->unreg_completion); 345 } 346 EXPORT_SYMBOL(ib_device_put); 347 348 static struct ib_device *__ib_device_get_by_name(const char *name) 349 { 350 struct ib_device *device; 351 unsigned long index; 352 353 xa_for_each (&devices, index, device) 354 if (!strcmp(name, dev_name(&device->dev))) 355 return device; 356 357 return NULL; 358 } 359 360 /** 361 * ib_device_get_by_name - Find an IB device by name 362 * @name: The name to look for 363 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 364 * 365 * Find and hold an ib_device by its name. The caller must call 366 * ib_device_put() on the returned pointer. 367 */ 368 struct ib_device *ib_device_get_by_name(const char *name, 369 enum rdma_driver_id driver_id) 370 { 371 struct ib_device *device; 372 373 down_read(&devices_rwsem); 374 device = __ib_device_get_by_name(name); 375 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 376 device->ops.driver_id != driver_id) 377 device = NULL; 378 379 if (device) { 380 if (!ib_device_try_get(device)) 381 device = NULL; 382 } 383 up_read(&devices_rwsem); 384 return device; 385 } 386 EXPORT_SYMBOL(ib_device_get_by_name); 387 388 static int rename_compat_devs(struct ib_device *device) 389 { 390 struct ib_core_device *cdev; 391 unsigned long index; 392 int ret = 0; 393 394 mutex_lock(&device->compat_devs_mutex); 395 xa_for_each (&device->compat_devs, index, cdev) { 396 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 397 if (ret) { 398 dev_warn(&cdev->dev, 399 "Fail to rename compatdev to new name %s\n", 400 dev_name(&device->dev)); 401 break; 402 } 403 } 404 mutex_unlock(&device->compat_devs_mutex); 405 return ret; 406 } 407 408 int ib_device_rename(struct ib_device *ibdev, const char *name) 409 { 410 int ret; 411 412 down_write(&devices_rwsem); 413 if (!strcmp(name, dev_name(&ibdev->dev))) { 414 ret = 0; 415 goto 
out; 416 } 417 418 if (__ib_device_get_by_name(name)) { 419 ret = -EEXIST; 420 goto out; 421 } 422 423 ret = device_rename(&ibdev->dev, name); 424 if (ret) 425 goto out; 426 strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 427 ret = rename_compat_devs(ibdev); 428 out: 429 up_write(&devices_rwsem); 430 return ret; 431 } 432 433 static int alloc_name(struct ib_device *ibdev, const char *name) 434 { 435 struct ib_device *device; 436 unsigned long index; 437 struct ida inuse; 438 int rc; 439 int i; 440 441 lockdep_assert_held_exclusive(&devices_rwsem); 442 ida_init(&inuse); 443 xa_for_each (&devices, index, device) { 444 char buf[IB_DEVICE_NAME_MAX]; 445 446 if (sscanf(dev_name(&device->dev), name, &i) != 1) 447 continue; 448 if (i < 0 || i >= INT_MAX) 449 continue; 450 snprintf(buf, sizeof buf, name, i); 451 if (strcmp(buf, dev_name(&device->dev)) != 0) 452 continue; 453 454 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); 455 if (rc < 0) 456 goto out; 457 } 458 459 rc = ida_alloc(&inuse, GFP_KERNEL); 460 if (rc < 0) 461 goto out; 462 463 rc = dev_set_name(&ibdev->dev, name, rc); 464 out: 465 ida_destroy(&inuse); 466 return rc; 467 } 468 469 static void ib_device_release(struct device *device) 470 { 471 struct ib_device *dev = container_of(device, struct ib_device, dev); 472 473 free_netdevs(dev); 474 WARN_ON(refcount_read(&dev->refcount)); 475 ib_cache_release_one(dev); 476 ib_security_release_port_pkey_list(dev); 477 xa_destroy(&dev->compat_devs); 478 xa_destroy(&dev->client_data); 479 if (dev->port_data) 480 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 481 pdata[0]), 482 rcu_head); 483 kfree_rcu(dev, rcu_head); 484 } 485 486 static int ib_device_uevent(struct device *device, 487 struct kobj_uevent_env *env) 488 { 489 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 490 return -ENOMEM; 491 492 /* 493 * It would be nice to pass the node GUID with the event... 
494 */ 495 496 return 0; 497 } 498 499 static const void *net_namespace(struct device *d) 500 { 501 struct ib_core_device *coredev = 502 container_of(d, struct ib_core_device, dev); 503 504 return read_pnet(&coredev->rdma_net); 505 } 506 507 static struct class ib_class = { 508 .name = "infiniband", 509 .dev_release = ib_device_release, 510 .dev_uevent = ib_device_uevent, 511 .ns_type = &net_ns_type_operations, 512 .namespace = net_namespace, 513 }; 514 515 static void rdma_init_coredev(struct ib_core_device *coredev, 516 struct ib_device *dev, struct net *net) 517 { 518 /* This BUILD_BUG_ON is intended to catch layout change 519 * of union of ib_core_device and device. 520 * dev must be the first element as ib_core and providers 521 * driver uses it. Adding anything in ib_core_device before 522 * device will break this assumption. 523 */ 524 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 525 offsetof(struct ib_device, dev)); 526 527 coredev->dev.class = &ib_class; 528 coredev->dev.groups = dev->groups; 529 device_initialize(&coredev->dev); 530 coredev->owner = dev; 531 INIT_LIST_HEAD(&coredev->port_list); 532 write_pnet(&coredev->rdma_net, net); 533 } 534 535 /** 536 * _ib_alloc_device - allocate an IB device struct 537 * @size:size of structure to allocate 538 * 539 * Low-level drivers should use ib_alloc_device() to allocate &struct 540 * ib_device. @size is the size of the structure to be allocated, 541 * including any private data used by the low-level driver. 542 * ib_dealloc_device() must be used to free structures allocated with 543 * ib_alloc_device(). 
544 */ 545 struct ib_device *_ib_alloc_device(size_t size) 546 { 547 struct ib_device *device; 548 549 if (WARN_ON(size < sizeof(struct ib_device))) 550 return NULL; 551 552 device = kzalloc(size, GFP_KERNEL); 553 if (!device) 554 return NULL; 555 556 if (rdma_restrack_init(device)) { 557 kfree(device); 558 return NULL; 559 } 560 561 device->groups[0] = &ib_dev_attr_group; 562 rdma_init_coredev(&device->coredev, device, &init_net); 563 564 INIT_LIST_HEAD(&device->event_handler_list); 565 spin_lock_init(&device->event_handler_lock); 566 mutex_init(&device->unregistration_lock); 567 /* 568 * client_data needs to be alloc because we don't want our mark to be 569 * destroyed if the user stores NULL in the client data. 570 */ 571 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 572 init_rwsem(&device->client_data_rwsem); 573 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 574 mutex_init(&device->compat_devs_mutex); 575 init_completion(&device->unreg_completion); 576 INIT_WORK(&device->unregistration_work, ib_unregister_work); 577 578 return device; 579 } 580 EXPORT_SYMBOL(_ib_alloc_device); 581 582 /** 583 * ib_dealloc_device - free an IB device struct 584 * @device:structure to free 585 * 586 * Free a structure allocated with ib_alloc_device(). 587 */ 588 void ib_dealloc_device(struct ib_device *device) 589 { 590 if (device->ops.dealloc_driver) 591 device->ops.dealloc_driver(device); 592 593 /* 594 * ib_unregister_driver() requires all devices to remain in the xarray 595 * while their ops are callable. The last op we call is dealloc_driver 596 * above. This is needed to create a fence on op callbacks prior to 597 * allowing the driver module to unload. 
598 */ 599 down_write(&devices_rwsem); 600 if (xa_load(&devices, device->index) == device) 601 xa_erase(&devices, device->index); 602 up_write(&devices_rwsem); 603 604 /* Expedite releasing netdev references */ 605 free_netdevs(device); 606 607 WARN_ON(!xa_empty(&device->compat_devs)); 608 WARN_ON(!xa_empty(&device->client_data)); 609 WARN_ON(refcount_read(&device->refcount)); 610 rdma_restrack_clean(device); 611 /* Balances with device_initialize */ 612 put_device(&device->dev); 613 } 614 EXPORT_SYMBOL(ib_dealloc_device); 615 616 /* 617 * add_client_context() and remove_client_context() must be safe against 618 * parallel calls on the same device - registration/unregistration of both the 619 * device and client can be occurring in parallel. 620 * 621 * The routines need to be a fence, any caller must not return until the add 622 * or remove is fully completed. 623 */ 624 static int add_client_context(struct ib_device *device, 625 struct ib_client *client) 626 { 627 int ret = 0; 628 629 if (!device->kverbs_provider && !client->no_kverbs_req) 630 return 0; 631 632 down_write(&device->client_data_rwsem); 633 /* 634 * Another caller to add_client_context got here first and has already 635 * completely initialized context. 
636 */ 637 if (xa_get_mark(&device->client_data, client->client_id, 638 CLIENT_DATA_REGISTERED)) 639 goto out; 640 641 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 642 GFP_KERNEL)); 643 if (ret) 644 goto out; 645 downgrade_write(&device->client_data_rwsem); 646 if (client->add) 647 client->add(device); 648 649 /* Readers shall not see a client until add has been completed */ 650 xa_set_mark(&device->client_data, client->client_id, 651 CLIENT_DATA_REGISTERED); 652 up_read(&device->client_data_rwsem); 653 return 0; 654 655 out: 656 up_write(&device->client_data_rwsem); 657 return ret; 658 } 659 660 static void remove_client_context(struct ib_device *device, 661 unsigned int client_id) 662 { 663 struct ib_client *client; 664 void *client_data; 665 666 down_write(&device->client_data_rwsem); 667 if (!xa_get_mark(&device->client_data, client_id, 668 CLIENT_DATA_REGISTERED)) { 669 up_write(&device->client_data_rwsem); 670 return; 671 } 672 client_data = xa_load(&device->client_data, client_id); 673 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); 674 client = xa_load(&clients, client_id); 675 downgrade_write(&device->client_data_rwsem); 676 677 /* 678 * Notice we cannot be holding any exclusive locks when calling the 679 * remove callback as the remove callback can recurse back into any 680 * public functions in this module and thus try for any locks those 681 * functions take. 682 * 683 * For this reason clients and drivers should not call the 684 * unregistration functions will holdling any locks. 685 * 686 * It tempting to drop the client_data_rwsem too, but this is required 687 * to ensure that unregister_client does not return until all clients 688 * are completely unregistered, which is required to avoid module 689 * unloading races. 
690 */ 691 if (client->remove) 692 client->remove(device, client_data); 693 694 xa_erase(&device->client_data, client_id); 695 up_read(&device->client_data_rwsem); 696 } 697 698 static int alloc_port_data(struct ib_device *device) 699 { 700 struct ib_port_data_rcu *pdata_rcu; 701 unsigned int port; 702 703 if (device->port_data) 704 return 0; 705 706 /* This can only be called once the physical port range is defined */ 707 if (WARN_ON(!device->phys_port_cnt)) 708 return -EINVAL; 709 710 /* 711 * device->port_data is indexed directly by the port number to make 712 * access to this data as efficient as possible. 713 * 714 * Therefore port_data is declared as a 1 based array with potential 715 * empty slots at the beginning. 716 */ 717 pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, 718 rdma_end_port(device) + 1), 719 GFP_KERNEL); 720 if (!pdata_rcu) 721 return -ENOMEM; 722 /* 723 * The rcu_head is put in front of the port data array and the stored 724 * pointer is adjusted since we never need to see that member until 725 * kfree_rcu. 
726 */ 727 device->port_data = pdata_rcu->pdata; 728 729 rdma_for_each_port (device, port) { 730 struct ib_port_data *pdata = &device->port_data[port]; 731 732 pdata->ib_dev = device; 733 spin_lock_init(&pdata->pkey_list_lock); 734 INIT_LIST_HEAD(&pdata->pkey_list); 735 spin_lock_init(&pdata->netdev_lock); 736 INIT_HLIST_NODE(&pdata->ndev_hash_link); 737 } 738 return 0; 739 } 740 741 static int verify_immutable(const struct ib_device *dev, u8 port) 742 { 743 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 744 rdma_max_mad_size(dev, port) != 0); 745 } 746 747 static int setup_port_data(struct ib_device *device) 748 { 749 unsigned int port; 750 int ret; 751 752 ret = alloc_port_data(device); 753 if (ret) 754 return ret; 755 756 rdma_for_each_port (device, port) { 757 struct ib_port_data *pdata = &device->port_data[port]; 758 759 ret = device->ops.get_port_immutable(device, port, 760 &pdata->immutable); 761 if (ret) 762 return ret; 763 764 if (verify_immutable(device, port)) 765 return -EINVAL; 766 } 767 return 0; 768 } 769 770 void ib_get_device_fw_str(struct ib_device *dev, char *str) 771 { 772 if (dev->ops.get_dev_fw_str) 773 dev->ops.get_dev_fw_str(dev, str); 774 else 775 str[0] = '\0'; 776 } 777 EXPORT_SYMBOL(ib_get_device_fw_str); 778 779 static void ib_policy_change_task(struct work_struct *work) 780 { 781 struct ib_device *dev; 782 unsigned long index; 783 784 down_read(&devices_rwsem); 785 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 786 unsigned int i; 787 788 rdma_for_each_port (dev, i) { 789 u64 sp; 790 int ret = ib_get_cached_subnet_prefix(dev, 791 i, 792 &sp); 793 794 WARN_ONCE(ret, 795 "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", 796 ret); 797 if (!ret) 798 ib_security_cache_change(dev, i, sp); 799 } 800 } 801 up_read(&devices_rwsem); 802 } 803 804 static int ib_security_change(struct notifier_block *nb, unsigned long event, 805 void *lsm_data) 806 { 807 if (event != LSM_POLICY_CHANGE) 808 return NOTIFY_DONE; 
809 810 schedule_work(&ib_policy_change_work); 811 ib_mad_agent_security_change(); 812 813 return NOTIFY_OK; 814 } 815 816 static void compatdev_release(struct device *dev) 817 { 818 struct ib_core_device *cdev = 819 container_of(dev, struct ib_core_device, dev); 820 821 kfree(cdev); 822 } 823 824 static int add_one_compat_dev(struct ib_device *device, 825 struct rdma_dev_net *rnet) 826 { 827 struct ib_core_device *cdev; 828 int ret; 829 830 lockdep_assert_held(&rdma_nets_rwsem); 831 if (!ib_devices_shared_netns) 832 return 0; 833 834 /* 835 * Create and add compat device in all namespaces other than where it 836 * is currently bound to. 837 */ 838 if (net_eq(read_pnet(&rnet->net), 839 read_pnet(&device->coredev.rdma_net))) 840 return 0; 841 842 /* 843 * The first of init_net() or ib_register_device() to take the 844 * compat_devs_mutex wins and gets to add the device. Others will wait 845 * for completion here. 846 */ 847 mutex_lock(&device->compat_devs_mutex); 848 cdev = xa_load(&device->compat_devs, rnet->id); 849 if (cdev) { 850 ret = 0; 851 goto done; 852 } 853 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 854 if (ret) 855 goto done; 856 857 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 858 if (!cdev) { 859 ret = -ENOMEM; 860 goto cdev_err; 861 } 862 863 cdev->dev.parent = device->dev.parent; 864 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 865 cdev->dev.release = compatdev_release; 866 dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 867 868 ret = device_add(&cdev->dev); 869 if (ret) 870 goto add_err; 871 ret = ib_setup_port_attrs(cdev); 872 if (ret) 873 goto port_err; 874 875 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 876 cdev, GFP_KERNEL)); 877 if (ret) 878 goto insert_err; 879 880 mutex_unlock(&device->compat_devs_mutex); 881 return 0; 882 883 insert_err: 884 ib_free_port_attrs(cdev); 885 port_err: 886 device_del(&cdev->dev); 887 add_err: 888 put_device(&cdev->dev); 889 cdev_err: 890 
xa_release(&device->compat_devs, rnet->id); 891 done: 892 mutex_unlock(&device->compat_devs_mutex); 893 return ret; 894 } 895 896 static void remove_one_compat_dev(struct ib_device *device, u32 id) 897 { 898 struct ib_core_device *cdev; 899 900 mutex_lock(&device->compat_devs_mutex); 901 cdev = xa_erase(&device->compat_devs, id); 902 mutex_unlock(&device->compat_devs_mutex); 903 if (cdev) { 904 ib_free_port_attrs(cdev); 905 device_del(&cdev->dev); 906 put_device(&cdev->dev); 907 } 908 } 909 910 static void remove_compat_devs(struct ib_device *device) 911 { 912 struct ib_core_device *cdev; 913 unsigned long index; 914 915 xa_for_each (&device->compat_devs, index, cdev) 916 remove_one_compat_dev(device, index); 917 } 918 919 static int add_compat_devs(struct ib_device *device) 920 { 921 struct rdma_dev_net *rnet; 922 unsigned long index; 923 int ret = 0; 924 925 lockdep_assert_held(&devices_rwsem); 926 927 down_read(&rdma_nets_rwsem); 928 xa_for_each (&rdma_nets, index, rnet) { 929 ret = add_one_compat_dev(device, rnet); 930 if (ret) 931 break; 932 } 933 up_read(&rdma_nets_rwsem); 934 return ret; 935 } 936 937 static void remove_all_compat_devs(void) 938 { 939 struct ib_compat_device *cdev; 940 struct ib_device *dev; 941 unsigned long index; 942 943 down_read(&devices_rwsem); 944 xa_for_each (&devices, index, dev) { 945 unsigned long c_index = 0; 946 947 /* Hold nets_rwsem so that any other thread modifying this 948 * system param can sync with this thread. 
949 */ 950 down_read(&rdma_nets_rwsem); 951 xa_for_each (&dev->compat_devs, c_index, cdev) 952 remove_one_compat_dev(dev, c_index); 953 up_read(&rdma_nets_rwsem); 954 } 955 up_read(&devices_rwsem); 956 } 957 958 static int add_all_compat_devs(void) 959 { 960 struct rdma_dev_net *rnet; 961 struct ib_device *dev; 962 unsigned long index; 963 int ret = 0; 964 965 down_read(&devices_rwsem); 966 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 967 unsigned long net_index = 0; 968 969 /* Hold nets_rwsem so that any other thread modifying this 970 * system param can sync with this thread. 971 */ 972 down_read(&rdma_nets_rwsem); 973 xa_for_each (&rdma_nets, net_index, rnet) { 974 ret = add_one_compat_dev(dev, rnet); 975 if (ret) 976 break; 977 } 978 up_read(&rdma_nets_rwsem); 979 } 980 up_read(&devices_rwsem); 981 if (ret) 982 remove_all_compat_devs(); 983 return ret; 984 } 985 986 int rdma_compatdev_set(u8 enable) 987 { 988 struct rdma_dev_net *rnet; 989 unsigned long index; 990 int ret = 0; 991 992 down_write(&rdma_nets_rwsem); 993 if (ib_devices_shared_netns == enable) { 994 up_write(&rdma_nets_rwsem); 995 return 0; 996 } 997 998 /* enable/disable of compat devices is not supported 999 * when more than default init_net exists. 1000 */ 1001 xa_for_each (&rdma_nets, index, rnet) { 1002 ret++; 1003 break; 1004 } 1005 if (!ret) 1006 ib_devices_shared_netns = enable; 1007 up_write(&rdma_nets_rwsem); 1008 if (ret) 1009 return -EBUSY; 1010 1011 if (enable) 1012 ret = add_all_compat_devs(); 1013 else 1014 remove_all_compat_devs(); 1015 return ret; 1016 } 1017 1018 static void rdma_dev_exit_net(struct net *net) 1019 { 1020 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); 1021 struct ib_device *dev; 1022 unsigned long index; 1023 int ret; 1024 1025 down_write(&rdma_nets_rwsem); 1026 /* 1027 * Prevent the ID from being re-used and hide the id from xa_for_each. 
1028 */ 1029 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1030 WARN_ON(ret); 1031 up_write(&rdma_nets_rwsem); 1032 1033 down_read(&devices_rwsem); 1034 xa_for_each (&devices, index, dev) { 1035 get_device(&dev->dev); 1036 /* 1037 * Release the devices_rwsem so that pontentially blocking 1038 * device_del, doesn't hold the devices_rwsem for too long. 1039 */ 1040 up_read(&devices_rwsem); 1041 1042 remove_one_compat_dev(dev, rnet->id); 1043 1044 /* 1045 * If the real device is in the NS then move it back to init. 1046 */ 1047 rdma_dev_change_netns(dev, net, &init_net); 1048 1049 put_device(&dev->dev); 1050 down_read(&devices_rwsem); 1051 } 1052 up_read(&devices_rwsem); 1053 1054 xa_erase(&rdma_nets, rnet->id); 1055 } 1056 1057 static __net_init int rdma_dev_init_net(struct net *net) 1058 { 1059 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); 1060 unsigned long index; 1061 struct ib_device *dev; 1062 int ret; 1063 1064 /* No need to create any compat devices in default init_net. */ 1065 if (net_eq(net, &init_net)) 1066 return 0; 1067 1068 write_pnet(&rnet->net, net); 1069 1070 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1071 if (ret) 1072 return ret; 1073 1074 down_read(&devices_rwsem); 1075 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1076 /* Hold nets_rwsem so that netlink command cannot change 1077 * system configuration for device sharing mode. 1078 */ 1079 down_read(&rdma_nets_rwsem); 1080 ret = add_one_compat_dev(dev, rnet); 1081 up_read(&rdma_nets_rwsem); 1082 if (ret) 1083 break; 1084 } 1085 up_read(&devices_rwsem); 1086 1087 if (ret) 1088 rdma_dev_exit_net(net); 1089 1090 return ret; 1091 } 1092 1093 /* 1094 * Assign the unique string device name and the unique device index. This is 1095 * undone by ib_dealloc_device. 
 */
static int assign_name(struct ib_device *device, const char *name)
{
	static u32 last_id;
	int ret;

	down_write(&devices_rwsem);
	/* Assign a unique name to the device */
	if (strchr(name, '%'))
		ret = alloc_name(device, name);
	else
		ret = dev_set_name(&device->dev, name);
	if (ret)
		goto out;

	/* The device is not in the xarray yet, so any hit is a duplicate */
	if (__ib_device_get_by_name(dev_name(&device->dev))) {
		ret = -ENFILE;
		goto out;
	}
	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);

	ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
			&last_id, GFP_KERNEL);
	/* A positive return is still a successful allocation */
	if (ret > 0)
		ret = 0;

out:
	up_write(&devices_rwsem);
	return ret;
}

/*
 * Select device->dma_device: the ib_device's own struct device when the
 * caller supplied dma_ops (inheriting missing DMA masks from the parent),
 * otherwise the parent device. Then set the default max segment size.
 */
static void setup_dma_device(struct ib_device *device)
{
	struct device *parent = device->dev.parent;

	WARN_ON_ONCE(device->dma_device);
	if (device->dev.dma_ops) {
		/*
		 * The caller provided custom DMA operations. Copy the
		 * DMA-related fields that are used by e.g. dma_alloc_coherent()
		 * into device->dev.
		 */
		device->dma_device = &device->dev;
		if (!device->dev.dma_mask) {
			if (parent)
				device->dev.dma_mask = parent->dma_mask;
			else
				WARN_ON_ONCE(true);
		}
		if (!device->dev.coherent_dma_mask) {
			if (parent)
				device->dev.coherent_dma_mask =
					parent->coherent_dma_mask;
			else
				WARN_ON_ONCE(true);
		}
	} else {
		/*
		 * The caller did not provide custom DMA operations. Use the
		 * DMA mapping operations of the parent device.
		 */
		WARN_ON_ONCE(!parent);
		device->dma_device = parent;
	}
	/* Setup default max segment size for all IB devices */
	dma_set_max_seg_size(device->dma_device, SZ_2G);

}

/*
 * setup_device() allocates memory and sets up data that requires calling the
 * device ops, this is the only reason these actions are not done during
 * ib_alloc_device. It is undone by ib_dealloc_device().
 *
 * Returns 0 on success, or the error from setup_port_data() or from the
 * driver's query_device op.
 */
static int setup_device(struct ib_device *device)
{
	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
	int ret;

	setup_dma_device(device);
	ib_device_check_mandatory(device);

	ret = setup_port_data(device);
	if (ret) {
		dev_warn(&device->dev, "Couldn't create per-port data\n");
		return ret;
	}

	memset(&device->attrs, 0, sizeof(device->attrs));
	ret = device->ops.query_device(device, &device->attrs, &uhw);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't query the device attributes\n");
		return ret;
	}

	return 0;
}

/*
 * Reverse of enable_device_and_get(): clear DEVICE_REGISTERED, remove every
 * client context, drop the registration reference, wait for the remaining
 * references to go away and then remove the compat devices.
 */
static void disable_device(struct ib_device *device)
{
	struct ib_client *client;

	WARN_ON(!refcount_read(&device->refcount));

	down_write(&devices_rwsem);
	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
	up_write(&devices_rwsem);

	/* Clients are removed by walking client_list in reverse */
	down_read(&clients_rwsem);
	list_for_each_entry_reverse(client, &client_list, list)
		remove_client_context(device, client->client_id);
	up_read(&clients_rwsem);

	/* Pairs with refcount_set in enable_device */
	ib_device_put(device);
	wait_for_completion(&device->unreg_completion);

	/*
	 * compat devices must be removed after device refcount drops to zero.
	 * Otherwise init_net() may add more compatdevs after removing compat
	 * devices and before device is disabled.
	 */
	remove_compat_devs(device);
}

/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 *
 * Returns 0 on success, or the first error from the driver's enable_driver
 * op, add_client_context() or add_compat_devs().
 */
static int enable_device_and_get(struct ib_device *device)
{
	struct ib_client *client;
	unsigned long index;
	int ret = 0;

	/*
	 * One ref belongs to the xa and the other belongs to this
	 * thread. This is needed to guard against parallel unregistration.
	 */
	refcount_set(&device->refcount, 2);
	down_write(&devices_rwsem);
	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);

	/*
	 * By using downgrade_write() we ensure that no other thread can clear
	 * DEVICE_REGISTERED while we are completing the client setup.
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);
	/* Compat devices are only created once every client was added */
	if (!ret)
		ret = add_compat_devs(device);
out:
	/* Read side, because of the downgrade_write() above */
	up_read(&devices_rwsem);
	return ret;
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device:Device to register
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core. All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
1279 */ 1280 int ib_register_device(struct ib_device *device, const char *name) 1281 { 1282 int ret; 1283 1284 ret = assign_name(device, name); 1285 if (ret) 1286 return ret; 1287 1288 ret = setup_device(device); 1289 if (ret) 1290 return ret; 1291 1292 ret = ib_cache_setup_one(device); 1293 if (ret) { 1294 dev_warn(&device->dev, 1295 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1296 return ret; 1297 } 1298 1299 ib_device_register_rdmacg(device); 1300 1301 /* 1302 * Ensure that ADD uevent is not fired because it 1303 * is too early amd device is not initialized yet. 1304 */ 1305 dev_set_uevent_suppress(&device->dev, true); 1306 ret = device_add(&device->dev); 1307 if (ret) 1308 goto cg_cleanup; 1309 1310 ret = ib_device_register_sysfs(device); 1311 if (ret) { 1312 dev_warn(&device->dev, 1313 "Couldn't register device with driver model\n"); 1314 goto dev_cleanup; 1315 } 1316 1317 ret = enable_device_and_get(device); 1318 dev_set_uevent_suppress(&device->dev, false); 1319 /* Mark for userspace that device is ready */ 1320 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1321 if (ret) { 1322 void (*dealloc_fn)(struct ib_device *); 1323 1324 /* 1325 * If we hit this error flow then we don't want to 1326 * automatically dealloc the device since the caller is 1327 * expected to call ib_dealloc_device() after 1328 * ib_register_device() fails. This is tricky due to the 1329 * possibility for a parallel unregistration along with this 1330 * error flow. Since we have a refcount here we know any 1331 * parallel flow is stopped in disable_device and will see the 1332 * NULL pointers, causing the responsibility to 1333 * ib_dealloc_device() to revert back to this thread. 
1334 */ 1335 dealloc_fn = device->ops.dealloc_driver; 1336 device->ops.dealloc_driver = NULL; 1337 ib_device_put(device); 1338 __ib_unregister_device(device); 1339 device->ops.dealloc_driver = dealloc_fn; 1340 return ret; 1341 } 1342 ib_device_put(device); 1343 1344 return 0; 1345 1346 dev_cleanup: 1347 device_del(&device->dev); 1348 cg_cleanup: 1349 dev_set_uevent_suppress(&device->dev, false); 1350 ib_device_unregister_rdmacg(device); 1351 ib_cache_cleanup_one(device); 1352 return ret; 1353 } 1354 EXPORT_SYMBOL(ib_register_device); 1355 1356 /* Callers must hold a get on the device. */ 1357 static void __ib_unregister_device(struct ib_device *ib_dev) 1358 { 1359 /* 1360 * We have a registration lock so that all the calls to unregister are 1361 * fully fenced, once any unregister returns the device is truely 1362 * unregistered even if multiple callers are unregistering it at the 1363 * same time. This also interacts with the registration flow and 1364 * provides sane semantics if register and unregister are racing. 1365 */ 1366 mutex_lock(&ib_dev->unregistration_lock); 1367 if (!refcount_read(&ib_dev->refcount)) 1368 goto out; 1369 1370 disable_device(ib_dev); 1371 1372 /* Expedite removing unregistered pointers from the hash table */ 1373 free_netdevs(ib_dev); 1374 1375 ib_device_unregister_sysfs(ib_dev); 1376 device_del(&ib_dev->dev); 1377 ib_device_unregister_rdmacg(ib_dev); 1378 ib_cache_cleanup_one(ib_dev); 1379 1380 /* 1381 * Drivers using the new flow may not call ib_dealloc_device except 1382 * in error unwind prior to registration success. 1383 */ 1384 if (ib_dev->ops.dealloc_driver) { 1385 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); 1386 ib_dealloc_device(ib_dev); 1387 } 1388 out: 1389 mutex_unlock(&ib_dev->unregistration_lock); 1390 } 1391 1392 /** 1393 * ib_unregister_device - Unregister an IB device 1394 * @device: The device to unregister 1395 * 1396 * Unregister an IB device. All clients will receive a remove callback. 
1397 * 1398 * Callers should call this routine only once, and protect against races with 1399 * registration. Typically it should only be called as part of a remove 1400 * callback in an implementation of driver core's struct device_driver and 1401 * related. 1402 * 1403 * If ops.dealloc_driver is used then ib_dev will be freed upon return from 1404 * this function. 1405 */ 1406 void ib_unregister_device(struct ib_device *ib_dev) 1407 { 1408 get_device(&ib_dev->dev); 1409 __ib_unregister_device(ib_dev); 1410 put_device(&ib_dev->dev); 1411 } 1412 EXPORT_SYMBOL(ib_unregister_device); 1413 1414 /** 1415 * ib_unregister_device_and_put - Unregister a device while holding a 'get' 1416 * device: The device to unregister 1417 * 1418 * This is the same as ib_unregister_device(), except it includes an internal 1419 * ib_device_put() that should match a 'get' obtained by the caller. 1420 * 1421 * It is safe to call this routine concurrently from multiple threads while 1422 * holding the 'get'. When the function returns the device is fully 1423 * unregistered. 1424 * 1425 * Drivers using this flow MUST use the driver_unregister callback to clean up 1426 * their resources associated with the device and dealloc it. 1427 */ 1428 void ib_unregister_device_and_put(struct ib_device *ib_dev) 1429 { 1430 WARN_ON(!ib_dev->ops.dealloc_driver); 1431 get_device(&ib_dev->dev); 1432 ib_device_put(ib_dev); 1433 __ib_unregister_device(ib_dev); 1434 put_device(&ib_dev->dev); 1435 } 1436 EXPORT_SYMBOL(ib_unregister_device_and_put); 1437 1438 /** 1439 * ib_unregister_driver - Unregister all IB devices for a driver 1440 * @driver_id: The driver to unregister 1441 * 1442 * This implements a fence for device unregistration. It only returns once all 1443 * devices associated with the driver_id have fully completed their 1444 * unregistration and returned from ib_unregister_device*(). 1445 * 1446 * If device's are not yet unregistered it goes ahead and starts unregistering 1447 * them. 
1448 * 1449 * This does not block creation of new devices with the given driver_id, that 1450 * is the responsibility of the caller. 1451 */ 1452 void ib_unregister_driver(enum rdma_driver_id driver_id) 1453 { 1454 struct ib_device *ib_dev; 1455 unsigned long index; 1456 1457 down_read(&devices_rwsem); 1458 xa_for_each (&devices, index, ib_dev) { 1459 if (ib_dev->ops.driver_id != driver_id) 1460 continue; 1461 1462 get_device(&ib_dev->dev); 1463 up_read(&devices_rwsem); 1464 1465 WARN_ON(!ib_dev->ops.dealloc_driver); 1466 __ib_unregister_device(ib_dev); 1467 1468 put_device(&ib_dev->dev); 1469 down_read(&devices_rwsem); 1470 } 1471 up_read(&devices_rwsem); 1472 } 1473 EXPORT_SYMBOL(ib_unregister_driver); 1474 1475 static void ib_unregister_work(struct work_struct *work) 1476 { 1477 struct ib_device *ib_dev = 1478 container_of(work, struct ib_device, unregistration_work); 1479 1480 __ib_unregister_device(ib_dev); 1481 put_device(&ib_dev->dev); 1482 } 1483 1484 /** 1485 * ib_unregister_device_queued - Unregister a device using a work queue 1486 * device: The device to unregister 1487 * 1488 * This schedules an asynchronous unregistration using a WQ for the device. A 1489 * driver should use this to avoid holding locks while doing unregistration, 1490 * such as holding the RTNL lock. 1491 * 1492 * Drivers using this API must use ib_unregister_driver before module unload 1493 * to ensure that all scheduled unregistrations have completed. 1494 */ 1495 void ib_unregister_device_queued(struct ib_device *ib_dev) 1496 { 1497 WARN_ON(!refcount_read(&ib_dev->refcount)); 1498 WARN_ON(!ib_dev->ops.dealloc_driver); 1499 get_device(&ib_dev->dev); 1500 if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) 1501 put_device(&ib_dev->dev); 1502 } 1503 EXPORT_SYMBOL(ib_unregister_device_queued); 1504 1505 /* 1506 * The caller must pass in a device that has the kref held and the refcount 1507 * released. 
If the device is in cur_net and still registered then it is moved 1508 * into net. 1509 */ 1510 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1511 struct net *net) 1512 { 1513 int ret2 = -EINVAL; 1514 int ret; 1515 1516 mutex_lock(&device->unregistration_lock); 1517 1518 /* 1519 * If a device not under ib_device_get() or if the unregistration_lock 1520 * is not held, the namespace can be changed, or it can be unregistered. 1521 * Check again under the lock. 1522 */ 1523 if (refcount_read(&device->refcount) == 0 || 1524 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1525 ret = -ENODEV; 1526 goto out; 1527 } 1528 1529 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1530 disable_device(device); 1531 1532 /* 1533 * At this point no one can be using the device, so it is safe to 1534 * change the namespace. 1535 */ 1536 write_pnet(&device->coredev.rdma_net, net); 1537 1538 down_read(&devices_rwsem); 1539 /* 1540 * Currently rdma devices are system wide unique. So the device name 1541 * is guaranteed free in the new namespace. Publish the new namespace 1542 * at the sysfs level. 1543 */ 1544 ret = device_rename(&device->dev, dev_name(&device->dev)); 1545 up_read(&devices_rwsem); 1546 if (ret) { 1547 dev_warn(&device->dev, 1548 "%s: Couldn't rename device after namespace change\n", 1549 __func__); 1550 /* Try and put things back and re-enable the device */ 1551 write_pnet(&device->coredev.rdma_net, cur_net); 1552 } 1553 1554 ret2 = enable_device_and_get(device); 1555 if (ret2) { 1556 /* 1557 * This shouldn't really happen, but if it does, let the user 1558 * retry at later point. So don't disable the device. 
1559 */ 1560 dev_warn(&device->dev, 1561 "%s: Couldn't re-enable device after namespace change\n", 1562 __func__); 1563 } 1564 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1565 1566 ib_device_put(device); 1567 out: 1568 mutex_unlock(&device->unregistration_lock); 1569 if (ret) 1570 return ret; 1571 return ret2; 1572 } 1573 1574 int ib_device_set_netns_put(struct sk_buff *skb, 1575 struct ib_device *dev, u32 ns_fd) 1576 { 1577 struct net *net; 1578 int ret; 1579 1580 net = get_net_ns_by_fd(ns_fd); 1581 if (IS_ERR(net)) { 1582 ret = PTR_ERR(net); 1583 goto net_err; 1584 } 1585 1586 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1587 ret = -EPERM; 1588 goto ns_err; 1589 } 1590 1591 /* 1592 * Currently supported only for those providers which support 1593 * disassociation and don't do port specific sysfs init. Once a 1594 * port_cleanup infrastructure is implemented, this limitation will be 1595 * removed. 1596 */ 1597 if (!dev->ops.disassociate_ucontext || dev->ops.init_port || 1598 ib_devices_shared_netns) { 1599 ret = -EOPNOTSUPP; 1600 goto ns_err; 1601 } 1602 1603 get_device(&dev->dev); 1604 ib_device_put(dev); 1605 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1606 put_device(&dev->dev); 1607 1608 put_net(net); 1609 return ret; 1610 1611 ns_err: 1612 put_net(net); 1613 net_err: 1614 ib_device_put(dev); 1615 return ret; 1616 } 1617 1618 static struct pernet_operations rdma_dev_net_ops = { 1619 .init = rdma_dev_init_net, 1620 .exit = rdma_dev_exit_net, 1621 .id = &rdma_dev_net_id, 1622 .size = sizeof(struct rdma_dev_net), 1623 }; 1624 1625 static int assign_client_id(struct ib_client *client) 1626 { 1627 int ret; 1628 1629 down_write(&clients_rwsem); 1630 /* 1631 * The add/remove callbacks must be called in FIFO/LIFO order. To 1632 * achieve this we assign client_ids so they are sorted in 1633 * registration order, and retain a linked list we can reverse iterate 1634 * to get the LIFO order. 
The extra linked list can go away if xarray 1635 * learns to reverse iterate. 1636 */ 1637 if (list_empty(&client_list)) { 1638 client->client_id = 0; 1639 } else { 1640 struct ib_client *last; 1641 1642 last = list_last_entry(&client_list, struct ib_client, list); 1643 client->client_id = last->client_id + 1; 1644 } 1645 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1646 if (ret) 1647 goto out; 1648 1649 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1650 list_add_tail(&client->list, &client_list); 1651 1652 out: 1653 up_write(&clients_rwsem); 1654 return ret; 1655 } 1656 1657 /** 1658 * ib_register_client - Register an IB client 1659 * @client:Client to register 1660 * 1661 * Upper level users of the IB drivers can use ib_register_client() to 1662 * register callbacks for IB device addition and removal. When an IB 1663 * device is added, each registered client's add method will be called 1664 * (in the order the clients were registered), and when a device is 1665 * removed, each client's remove method will be called (in the reverse 1666 * order that clients were registered). In addition, when 1667 * ib_register_client() is called, the client will receive an add 1668 * callback for all devices already registered. 
1669 */ 1670 int ib_register_client(struct ib_client *client) 1671 { 1672 struct ib_device *device; 1673 unsigned long index; 1674 int ret; 1675 1676 ret = assign_client_id(client); 1677 if (ret) 1678 return ret; 1679 1680 down_read(&devices_rwsem); 1681 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1682 ret = add_client_context(device, client); 1683 if (ret) { 1684 up_read(&devices_rwsem); 1685 ib_unregister_client(client); 1686 return ret; 1687 } 1688 } 1689 up_read(&devices_rwsem); 1690 return 0; 1691 } 1692 EXPORT_SYMBOL(ib_register_client); 1693 1694 /** 1695 * ib_unregister_client - Unregister an IB client 1696 * @client:Client to unregister 1697 * 1698 * Upper level users use ib_unregister_client() to remove their client 1699 * registration. When ib_unregister_client() is called, the client 1700 * will receive a remove callback for each IB device still registered. 1701 * 1702 * This is a full fence, once it returns no client callbacks will be called, 1703 * or are running in another thread. 1704 */ 1705 void ib_unregister_client(struct ib_client *client) 1706 { 1707 struct ib_device *device; 1708 unsigned long index; 1709 1710 down_write(&clients_rwsem); 1711 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1712 up_write(&clients_rwsem); 1713 /* 1714 * Every device still known must be serialized to make sure we are 1715 * done with the client callbacks before we return. 
1716 */ 1717 down_read(&devices_rwsem); 1718 xa_for_each (&devices, index, device) 1719 remove_client_context(device, client->client_id); 1720 up_read(&devices_rwsem); 1721 1722 down_write(&clients_rwsem); 1723 list_del(&client->list); 1724 xa_erase(&clients, client->client_id); 1725 up_write(&clients_rwsem); 1726 } 1727 EXPORT_SYMBOL(ib_unregister_client); 1728 1729 static int __ib_get_global_client_nl_info(const char *client_name, 1730 struct ib_client_nl_info *res) 1731 { 1732 struct ib_client *client; 1733 unsigned long index; 1734 int ret = -ENOENT; 1735 1736 down_read(&clients_rwsem); 1737 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1738 if (strcmp(client->name, client_name) != 0) 1739 continue; 1740 if (!client->get_global_nl_info) { 1741 ret = -EOPNOTSUPP; 1742 break; 1743 } 1744 ret = client->get_global_nl_info(res); 1745 if (WARN_ON(ret == -ENOENT)) 1746 ret = -EINVAL; 1747 if (!ret && res->cdev) 1748 get_device(res->cdev); 1749 break; 1750 } 1751 up_read(&clients_rwsem); 1752 return ret; 1753 } 1754 1755 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1756 const char *client_name, 1757 struct ib_client_nl_info *res) 1758 { 1759 unsigned long index; 1760 void *client_data; 1761 int ret = -ENOENT; 1762 1763 down_read(&ibdev->client_data_rwsem); 1764 xan_for_each_marked (&ibdev->client_data, index, client_data, 1765 CLIENT_DATA_REGISTERED) { 1766 struct ib_client *client = xa_load(&clients, index); 1767 1768 if (!client || strcmp(client->name, client_name) != 0) 1769 continue; 1770 if (!client->get_nl_info) { 1771 ret = -EOPNOTSUPP; 1772 break; 1773 } 1774 ret = client->get_nl_info(ibdev, client_data, res); 1775 if (WARN_ON(ret == -ENOENT)) 1776 ret = -EINVAL; 1777 1778 /* 1779 * The cdev is guaranteed valid as long as we are inside the 1780 * client_data_rwsem as remove_one can't be called. Keep it 1781 * valid for the caller. 
1782 */ 1783 if (!ret && res->cdev) 1784 get_device(res->cdev); 1785 break; 1786 } 1787 up_read(&ibdev->client_data_rwsem); 1788 1789 return ret; 1790 } 1791 1792 /** 1793 * ib_get_client_nl_info - Fetch the nl_info from a client 1794 * @device - IB device 1795 * @client_name - Name of the client 1796 * @res - Result of the query 1797 */ 1798 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1799 struct ib_client_nl_info *res) 1800 { 1801 int ret; 1802 1803 if (ibdev) 1804 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1805 else 1806 ret = __ib_get_global_client_nl_info(client_name, res); 1807 #ifdef CONFIG_MODULES 1808 if (ret == -ENOENT) { 1809 request_module("rdma-client-%s", client_name); 1810 if (ibdev) 1811 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1812 else 1813 ret = __ib_get_global_client_nl_info(client_name, res); 1814 } 1815 #endif 1816 if (ret) { 1817 if (ret == -ENOENT) 1818 return -EOPNOTSUPP; 1819 return ret; 1820 } 1821 1822 if (WARN_ON(!res->cdev)) 1823 return -EINVAL; 1824 return 0; 1825 } 1826 1827 /** 1828 * ib_set_client_data - Set IB client context 1829 * @device:Device to set context for 1830 * @client:Client to set context for 1831 * @data:Context to set 1832 * 1833 * ib_set_client_data() sets client context data that can be retrieved with 1834 * ib_get_client_data(). This can only be called while the client is 1835 * registered to the device, once the ib_client remove() callback returns this 1836 * cannot be called. 
1837 */ 1838 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1839 void *data) 1840 { 1841 void *rc; 1842 1843 if (WARN_ON(IS_ERR(data))) 1844 data = NULL; 1845 1846 rc = xa_store(&device->client_data, client->client_id, data, 1847 GFP_KERNEL); 1848 WARN_ON(xa_is_err(rc)); 1849 } 1850 EXPORT_SYMBOL(ib_set_client_data); 1851 1852 /** 1853 * ib_register_event_handler - Register an IB event handler 1854 * @event_handler:Handler to register 1855 * 1856 * ib_register_event_handler() registers an event handler that will be 1857 * called back when asynchronous IB events occur (as defined in 1858 * chapter 11 of the InfiniBand Architecture Specification). This 1859 * callback may occur in interrupt context. 1860 */ 1861 void ib_register_event_handler(struct ib_event_handler *event_handler) 1862 { 1863 unsigned long flags; 1864 1865 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1866 list_add_tail(&event_handler->list, 1867 &event_handler->device->event_handler_list); 1868 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1869 } 1870 EXPORT_SYMBOL(ib_register_event_handler); 1871 1872 /** 1873 * ib_unregister_event_handler - Unregister an event handler 1874 * @event_handler:Handler to unregister 1875 * 1876 * Unregister an event handler registered with 1877 * ib_register_event_handler(). 
1878 */ 1879 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 1880 { 1881 unsigned long flags; 1882 1883 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1884 list_del(&event_handler->list); 1885 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1886 } 1887 EXPORT_SYMBOL(ib_unregister_event_handler); 1888 1889 /** 1890 * ib_dispatch_event - Dispatch an asynchronous event 1891 * @event:Event to dispatch 1892 * 1893 * Low-level drivers must call ib_dispatch_event() to dispatch the 1894 * event to all registered event handlers when an asynchronous event 1895 * occurs. 1896 */ 1897 void ib_dispatch_event(struct ib_event *event) 1898 { 1899 unsigned long flags; 1900 struct ib_event_handler *handler; 1901 1902 spin_lock_irqsave(&event->device->event_handler_lock, flags); 1903 1904 list_for_each_entry(handler, &event->device->event_handler_list, list) 1905 handler->handler(handler, event); 1906 1907 spin_unlock_irqrestore(&event->device->event_handler_lock, flags); 1908 } 1909 EXPORT_SYMBOL(ib_dispatch_event); 1910 1911 /** 1912 * ib_query_port - Query IB port attributes 1913 * @device:Device to query 1914 * @port_num:Port number to query 1915 * @port_attr:Port attributes 1916 * 1917 * ib_query_port() returns the attributes of a port through the 1918 * @port_attr pointer. 
1919 */ 1920 int ib_query_port(struct ib_device *device, 1921 u8 port_num, 1922 struct ib_port_attr *port_attr) 1923 { 1924 union ib_gid gid; 1925 int err; 1926 1927 if (!rdma_is_port_valid(device, port_num)) 1928 return -EINVAL; 1929 1930 memset(port_attr, 0, sizeof(*port_attr)); 1931 err = device->ops.query_port(device, port_num, port_attr); 1932 if (err || port_attr->subnet_prefix) 1933 return err; 1934 1935 if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) 1936 return 0; 1937 1938 err = device->ops.query_gid(device, port_num, 0, &gid); 1939 if (err) 1940 return err; 1941 1942 port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); 1943 return 0; 1944 } 1945 EXPORT_SYMBOL(ib_query_port); 1946 1947 static void add_ndev_hash(struct ib_port_data *pdata) 1948 { 1949 unsigned long flags; 1950 1951 might_sleep(); 1952 1953 spin_lock_irqsave(&ndev_hash_lock, flags); 1954 if (hash_hashed(&pdata->ndev_hash_link)) { 1955 hash_del_rcu(&pdata->ndev_hash_link); 1956 spin_unlock_irqrestore(&ndev_hash_lock, flags); 1957 /* 1958 * We cannot do hash_add_rcu after a hash_del_rcu until the 1959 * grace period 1960 */ 1961 synchronize_rcu(); 1962 spin_lock_irqsave(&ndev_hash_lock, flags); 1963 } 1964 if (pdata->netdev) 1965 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 1966 (uintptr_t)pdata->netdev); 1967 spin_unlock_irqrestore(&ndev_hash_lock, flags); 1968 } 1969 1970 /** 1971 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 1972 * @ib_dev: Device to modify 1973 * @ndev: net_device to affiliate, may be NULL 1974 * @port: IB port the net_device is connected to 1975 * 1976 * Drivers should use this to link the ib_device to a netdev so the netdev 1977 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 1978 * affiliated with any port. 
1979 * 1980 * The caller must ensure that the given ndev is not unregistered or 1981 * unregistering, and that either the ib_device is unregistered or 1982 * ib_device_set_netdev() is called with NULL when the ndev sends a 1983 * NETDEV_UNREGISTER event. 1984 */ 1985 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 1986 unsigned int port) 1987 { 1988 struct net_device *old_ndev; 1989 struct ib_port_data *pdata; 1990 unsigned long flags; 1991 int ret; 1992 1993 /* 1994 * Drivers wish to call this before ib_register_driver, so we have to 1995 * setup the port data early. 1996 */ 1997 ret = alloc_port_data(ib_dev); 1998 if (ret) 1999 return ret; 2000 2001 if (!rdma_is_port_valid(ib_dev, port)) 2002 return -EINVAL; 2003 2004 pdata = &ib_dev->port_data[port]; 2005 spin_lock_irqsave(&pdata->netdev_lock, flags); 2006 old_ndev = rcu_dereference_protected( 2007 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2008 if (old_ndev == ndev) { 2009 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2010 return 0; 2011 } 2012 2013 if (ndev) 2014 dev_hold(ndev); 2015 rcu_assign_pointer(pdata->netdev, ndev); 2016 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2017 2018 add_ndev_hash(pdata); 2019 if (old_ndev) 2020 dev_put(old_ndev); 2021 2022 return 0; 2023 } 2024 EXPORT_SYMBOL(ib_device_set_netdev); 2025 2026 static void free_netdevs(struct ib_device *ib_dev) 2027 { 2028 unsigned long flags; 2029 unsigned int port; 2030 2031 rdma_for_each_port (ib_dev, port) { 2032 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2033 struct net_device *ndev; 2034 2035 spin_lock_irqsave(&pdata->netdev_lock, flags); 2036 ndev = rcu_dereference_protected( 2037 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2038 if (ndev) { 2039 spin_lock(&ndev_hash_lock); 2040 hash_del_rcu(&pdata->ndev_hash_link); 2041 spin_unlock(&ndev_hash_lock); 2042 2043 /* 2044 * If this is the last dev_put there is still a 2045 * synchronize_rcu before the netdev is kfreed, so 
we 2046 * can continue to rely on unlocked pointer 2047 * comparisons after the put 2048 */ 2049 rcu_assign_pointer(pdata->netdev, NULL); 2050 dev_put(ndev); 2051 } 2052 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2053 } 2054 } 2055 2056 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2057 unsigned int port) 2058 { 2059 struct ib_port_data *pdata; 2060 struct net_device *res; 2061 2062 if (!rdma_is_port_valid(ib_dev, port)) 2063 return NULL; 2064 2065 pdata = &ib_dev->port_data[port]; 2066 2067 /* 2068 * New drivers should use ib_device_set_netdev() not the legacy 2069 * get_netdev(). 2070 */ 2071 if (ib_dev->ops.get_netdev) 2072 res = ib_dev->ops.get_netdev(ib_dev, port); 2073 else { 2074 spin_lock(&pdata->netdev_lock); 2075 res = rcu_dereference_protected( 2076 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2077 if (res) 2078 dev_hold(res); 2079 spin_unlock(&pdata->netdev_lock); 2080 } 2081 2082 /* 2083 * If we are starting to unregister expedite things by preventing 2084 * propagation of an unregistering netdev. 2085 */ 2086 if (res && res->reg_state != NETREG_REGISTERED) { 2087 dev_put(res); 2088 return NULL; 2089 } 2090 2091 return res; 2092 } 2093 2094 /** 2095 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2096 * @ndev: netdev to locate 2097 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2098 * 2099 * Find and hold an ib_device that is associated with a netdev via 2100 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2101 * returned pointer. 
2102 */ 2103 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2104 enum rdma_driver_id driver_id) 2105 { 2106 struct ib_device *res = NULL; 2107 struct ib_port_data *cur; 2108 2109 rcu_read_lock(); 2110 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2111 (uintptr_t)ndev) { 2112 if (rcu_access_pointer(cur->netdev) == ndev && 2113 (driver_id == RDMA_DRIVER_UNKNOWN || 2114 cur->ib_dev->ops.driver_id == driver_id) && 2115 ib_device_try_get(cur->ib_dev)) { 2116 res = cur->ib_dev; 2117 break; 2118 } 2119 } 2120 rcu_read_unlock(); 2121 2122 return res; 2123 } 2124 EXPORT_SYMBOL(ib_device_get_by_netdev); 2125 2126 /** 2127 * ib_enum_roce_netdev - enumerate all RoCE ports 2128 * @ib_dev : IB device we want to query 2129 * @filter: Should we call the callback? 2130 * @filter_cookie: Cookie passed to filter 2131 * @cb: Callback to call for each found RoCE ports 2132 * @cookie: Cookie passed back to the callback 2133 * 2134 * Enumerates all of the physical RoCE ports of ib_dev 2135 * which are related to netdevice and calls callback() on each 2136 * device for which filter() function returns non zero. 2137 */ 2138 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2139 roce_netdev_filter filter, 2140 void *filter_cookie, 2141 roce_netdev_callback cb, 2142 void *cookie) 2143 { 2144 unsigned int port; 2145 2146 rdma_for_each_port (ib_dev, port) 2147 if (rdma_protocol_roce(ib_dev, port)) { 2148 struct net_device *idev = 2149 ib_device_get_netdev(ib_dev, port); 2150 2151 if (filter(ib_dev, port, idev, filter_cookie)) 2152 cb(ib_dev, port, idev, cookie); 2153 2154 if (idev) 2155 dev_put(idev); 2156 } 2157 } 2158 2159 /** 2160 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2161 * @filter: Should we call the callback? 
2162 * @filter_cookie: Cookie passed to filter 2163 * @cb: Callback to call for each found RoCE ports 2164 * @cookie: Cookie passed back to the callback 2165 * 2166 * Enumerates all RoCE devices' physical ports which are related 2167 * to netdevices and calls callback() on each device for which 2168 * filter() function returns non zero. 2169 */ 2170 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2171 void *filter_cookie, 2172 roce_netdev_callback cb, 2173 void *cookie) 2174 { 2175 struct ib_device *dev; 2176 unsigned long index; 2177 2178 down_read(&devices_rwsem); 2179 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2180 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2181 up_read(&devices_rwsem); 2182 } 2183 2184 /** 2185 * ib_enum_all_devs - enumerate all ib_devices 2186 * @cb: Callback to call for each found ib_device 2187 * 2188 * Enumerates all ib_devices and calls callback() on each device. 2189 */ 2190 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2191 struct netlink_callback *cb) 2192 { 2193 unsigned long index; 2194 struct ib_device *dev; 2195 unsigned int idx = 0; 2196 int ret = 0; 2197 2198 down_read(&devices_rwsem); 2199 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2200 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2201 continue; 2202 2203 ret = nldev_cb(dev, skb, cb, idx); 2204 if (ret) 2205 break; 2206 idx++; 2207 } 2208 up_read(&devices_rwsem); 2209 return ret; 2210 } 2211 2212 /** 2213 * ib_query_pkey - Get P_Key table entry 2214 * @device:Device to query 2215 * @port_num:Port number to query 2216 * @index:P_Key table index to query 2217 * @pkey:Returned P_Key 2218 * 2219 * ib_query_pkey() fetches the specified P_Key table entry. 
2220 */ 2221 int ib_query_pkey(struct ib_device *device, 2222 u8 port_num, u16 index, u16 *pkey) 2223 { 2224 if (!rdma_is_port_valid(device, port_num)) 2225 return -EINVAL; 2226 2227 return device->ops.query_pkey(device, port_num, index, pkey); 2228 } 2229 EXPORT_SYMBOL(ib_query_pkey); 2230 2231 /** 2232 * ib_modify_device - Change IB device attributes 2233 * @device:Device to modify 2234 * @device_modify_mask:Mask of attributes to change 2235 * @device_modify:New attribute values 2236 * 2237 * ib_modify_device() changes a device's attributes as specified by 2238 * the @device_modify_mask and @device_modify structure. 2239 */ 2240 int ib_modify_device(struct ib_device *device, 2241 int device_modify_mask, 2242 struct ib_device_modify *device_modify) 2243 { 2244 if (!device->ops.modify_device) 2245 return -ENOSYS; 2246 2247 return device->ops.modify_device(device, device_modify_mask, 2248 device_modify); 2249 } 2250 EXPORT_SYMBOL(ib_modify_device); 2251 2252 /** 2253 * ib_modify_port - Modifies the attributes for the specified port. 2254 * @device: The device to modify. 2255 * @port_num: The number of the port to modify. 2256 * @port_modify_mask: Mask used to specify which attributes of the port 2257 * to change. 2258 * @port_modify: New attribute values for the port. 2259 * 2260 * ib_modify_port() changes a port's attributes as specified by the 2261 * @port_modify_mask and @port_modify structure. 2262 */ 2263 int ib_modify_port(struct ib_device *device, 2264 u8 port_num, int port_modify_mask, 2265 struct ib_port_modify *port_modify) 2266 { 2267 int rc; 2268 2269 if (!rdma_is_port_valid(device, port_num)) 2270 return -EINVAL; 2271 2272 if (device->ops.modify_port) 2273 rc = device->ops.modify_port(device, port_num, 2274 port_modify_mask, 2275 port_modify); 2276 else 2277 rc = rdma_protocol_roce(device, port_num) ? 
0 : -ENOSYS; 2278 return rc; 2279 } 2280 EXPORT_SYMBOL(ib_modify_port); 2281 2282 /** 2283 * ib_find_gid - Returns the port number and GID table index where 2284 * a specified GID value occurs. Its searches only for IB link layer. 2285 * @device: The device to query. 2286 * @gid: The GID value to search for. 2287 * @port_num: The port number of the device where the GID value was found. 2288 * @index: The index into the GID table where the GID was found. This 2289 * parameter may be NULL. 2290 */ 2291 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2292 u8 *port_num, u16 *index) 2293 { 2294 union ib_gid tmp_gid; 2295 unsigned int port; 2296 int ret, i; 2297 2298 rdma_for_each_port (device, port) { 2299 if (!rdma_protocol_ib(device, port)) 2300 continue; 2301 2302 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2303 ++i) { 2304 ret = rdma_query_gid(device, port, i, &tmp_gid); 2305 if (ret) 2306 return ret; 2307 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2308 *port_num = port; 2309 if (index) 2310 *index = i; 2311 return 0; 2312 } 2313 } 2314 } 2315 2316 return -ENOENT; 2317 } 2318 EXPORT_SYMBOL(ib_find_gid); 2319 2320 /** 2321 * ib_find_pkey - Returns the PKey table index where a specified 2322 * PKey value occurs. 2323 * @device: The device to query. 2324 * @port_num: The port number of the device to search for the PKey. 2325 * @pkey: The PKey value to search for. 2326 * @index: The index into the PKey table where the PKey was found. 
2327 */ 2328 int ib_find_pkey(struct ib_device *device, 2329 u8 port_num, u16 pkey, u16 *index) 2330 { 2331 int ret, i; 2332 u16 tmp_pkey; 2333 int partial_ix = -1; 2334 2335 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2336 ++i) { 2337 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2338 if (ret) 2339 return ret; 2340 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2341 /* if there is full-member pkey take it.*/ 2342 if (tmp_pkey & 0x8000) { 2343 *index = i; 2344 return 0; 2345 } 2346 if (partial_ix < 0) 2347 partial_ix = i; 2348 } 2349 } 2350 2351 /*no full-member, if exists take the limited*/ 2352 if (partial_ix >= 0) { 2353 *index = partial_ix; 2354 return 0; 2355 } 2356 return -ENOENT; 2357 } 2358 EXPORT_SYMBOL(ib_find_pkey); 2359 2360 /** 2361 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2362 * for a received CM request 2363 * @dev: An RDMA device on which the request has been received. 2364 * @port: Port number on the RDMA device. 2365 * @pkey: The Pkey the request came on. 2366 * @gid: A GID that the net_dev uses to communicate. 2367 * @addr: Contains the IP address that the request specified as its 2368 * destination. 
2369 * 2370 */ 2371 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2372 u8 port, 2373 u16 pkey, 2374 const union ib_gid *gid, 2375 const struct sockaddr *addr) 2376 { 2377 struct net_device *net_dev = NULL; 2378 unsigned long index; 2379 void *client_data; 2380 2381 if (!rdma_protocol_ib(dev, port)) 2382 return NULL; 2383 2384 /* 2385 * Holding the read side guarantees that the client will not become 2386 * unregistered while we are calling get_net_dev_by_params() 2387 */ 2388 down_read(&dev->client_data_rwsem); 2389 xan_for_each_marked (&dev->client_data, index, client_data, 2390 CLIENT_DATA_REGISTERED) { 2391 struct ib_client *client = xa_load(&clients, index); 2392 2393 if (!client || !client->get_net_dev_by_params) 2394 continue; 2395 2396 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2397 addr, client_data); 2398 if (net_dev) 2399 break; 2400 } 2401 up_read(&dev->client_data_rwsem); 2402 2403 return net_dev; 2404 } 2405 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2406 2407 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2408 { 2409 struct ib_device_ops *dev_ops = &dev->ops; 2410 #define SET_DEVICE_OP(ptr, name) \ 2411 do { \ 2412 if (ops->name) \ 2413 if (!((ptr)->name)) \ 2414 (ptr)->name = ops->name; \ 2415 } while (0) 2416 2417 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2418 2419 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2420 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2421 dev_ops->driver_id != ops->driver_id); 2422 dev_ops->driver_id = ops->driver_id; 2423 } 2424 if (ops->owner) { 2425 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2426 dev_ops->owner = ops->owner; 2427 } 2428 if (ops->uverbs_abi_ver) 2429 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2430 2431 dev_ops->uverbs_no_driver_id_binding |= 2432 ops->uverbs_no_driver_id_binding; 2433 2434 SET_DEVICE_OP(dev_ops, add_gid); 2435 SET_DEVICE_OP(dev_ops, advise_mr); 2436 SET_DEVICE_OP(dev_ops, 
alloc_dm); 2437 SET_DEVICE_OP(dev_ops, alloc_fmr); 2438 SET_DEVICE_OP(dev_ops, alloc_hw_stats); 2439 SET_DEVICE_OP(dev_ops, alloc_mr); 2440 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2441 SET_DEVICE_OP(dev_ops, alloc_mw); 2442 SET_DEVICE_OP(dev_ops, alloc_pd); 2443 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2444 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2445 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2446 SET_DEVICE_OP(dev_ops, attach_mcast); 2447 SET_DEVICE_OP(dev_ops, check_mr_status); 2448 SET_DEVICE_OP(dev_ops, create_ah); 2449 SET_DEVICE_OP(dev_ops, create_counters); 2450 SET_DEVICE_OP(dev_ops, create_cq); 2451 SET_DEVICE_OP(dev_ops, create_flow); 2452 SET_DEVICE_OP(dev_ops, create_flow_action_esp); 2453 SET_DEVICE_OP(dev_ops, create_qp); 2454 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2455 SET_DEVICE_OP(dev_ops, create_srq); 2456 SET_DEVICE_OP(dev_ops, create_wq); 2457 SET_DEVICE_OP(dev_ops, dealloc_dm); 2458 SET_DEVICE_OP(dev_ops, dealloc_driver); 2459 SET_DEVICE_OP(dev_ops, dealloc_fmr); 2460 SET_DEVICE_OP(dev_ops, dealloc_mw); 2461 SET_DEVICE_OP(dev_ops, dealloc_pd); 2462 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2463 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2464 SET_DEVICE_OP(dev_ops, del_gid); 2465 SET_DEVICE_OP(dev_ops, dereg_mr); 2466 SET_DEVICE_OP(dev_ops, destroy_ah); 2467 SET_DEVICE_OP(dev_ops, destroy_counters); 2468 SET_DEVICE_OP(dev_ops, destroy_cq); 2469 SET_DEVICE_OP(dev_ops, destroy_flow); 2470 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2471 SET_DEVICE_OP(dev_ops, destroy_qp); 2472 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2473 SET_DEVICE_OP(dev_ops, destroy_srq); 2474 SET_DEVICE_OP(dev_ops, destroy_wq); 2475 SET_DEVICE_OP(dev_ops, detach_mcast); 2476 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2477 SET_DEVICE_OP(dev_ops, drain_rq); 2478 SET_DEVICE_OP(dev_ops, drain_sq); 2479 SET_DEVICE_OP(dev_ops, enable_driver); 2480 SET_DEVICE_OP(dev_ops, fill_res_entry); 2481 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2482 SET_DEVICE_OP(dev_ops, 
get_dma_mr); 2483 SET_DEVICE_OP(dev_ops, get_hw_stats); 2484 SET_DEVICE_OP(dev_ops, get_link_layer); 2485 SET_DEVICE_OP(dev_ops, get_netdev); 2486 SET_DEVICE_OP(dev_ops, get_port_immutable); 2487 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2488 SET_DEVICE_OP(dev_ops, get_vf_config); 2489 SET_DEVICE_OP(dev_ops, get_vf_stats); 2490 SET_DEVICE_OP(dev_ops, init_port); 2491 SET_DEVICE_OP(dev_ops, iw_accept); 2492 SET_DEVICE_OP(dev_ops, iw_add_ref); 2493 SET_DEVICE_OP(dev_ops, iw_connect); 2494 SET_DEVICE_OP(dev_ops, iw_create_listen); 2495 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2496 SET_DEVICE_OP(dev_ops, iw_get_qp); 2497 SET_DEVICE_OP(dev_ops, iw_reject); 2498 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2499 SET_DEVICE_OP(dev_ops, map_mr_sg); 2500 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2501 SET_DEVICE_OP(dev_ops, map_phys_fmr); 2502 SET_DEVICE_OP(dev_ops, mmap); 2503 SET_DEVICE_OP(dev_ops, modify_ah); 2504 SET_DEVICE_OP(dev_ops, modify_cq); 2505 SET_DEVICE_OP(dev_ops, modify_device); 2506 SET_DEVICE_OP(dev_ops, modify_flow_action_esp); 2507 SET_DEVICE_OP(dev_ops, modify_port); 2508 SET_DEVICE_OP(dev_ops, modify_qp); 2509 SET_DEVICE_OP(dev_ops, modify_srq); 2510 SET_DEVICE_OP(dev_ops, modify_wq); 2511 SET_DEVICE_OP(dev_ops, peek_cq); 2512 SET_DEVICE_OP(dev_ops, poll_cq); 2513 SET_DEVICE_OP(dev_ops, post_recv); 2514 SET_DEVICE_OP(dev_ops, post_send); 2515 SET_DEVICE_OP(dev_ops, post_srq_recv); 2516 SET_DEVICE_OP(dev_ops, process_mad); 2517 SET_DEVICE_OP(dev_ops, query_ah); 2518 SET_DEVICE_OP(dev_ops, query_device); 2519 SET_DEVICE_OP(dev_ops, query_gid); 2520 SET_DEVICE_OP(dev_ops, query_pkey); 2521 SET_DEVICE_OP(dev_ops, query_port); 2522 SET_DEVICE_OP(dev_ops, query_qp); 2523 SET_DEVICE_OP(dev_ops, query_srq); 2524 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2525 SET_DEVICE_OP(dev_ops, read_counters); 2526 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2527 SET_DEVICE_OP(dev_ops, reg_user_mr); 2528 SET_DEVICE_OP(dev_ops, req_ncomp_notif); 2529 SET_DEVICE_OP(dev_ops, 
req_notify_cq); 2530 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2531 SET_DEVICE_OP(dev_ops, resize_cq); 2532 SET_DEVICE_OP(dev_ops, set_vf_guid); 2533 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2534 SET_DEVICE_OP(dev_ops, unmap_fmr); 2535 2536 SET_OBJ_SIZE(dev_ops, ib_ah); 2537 SET_OBJ_SIZE(dev_ops, ib_cq); 2538 SET_OBJ_SIZE(dev_ops, ib_pd); 2539 SET_OBJ_SIZE(dev_ops, ib_srq); 2540 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2541 } 2542 EXPORT_SYMBOL(ib_set_device_ops); 2543 2544 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2545 [RDMA_NL_LS_OP_RESOLVE] = { 2546 .doit = ib_nl_handle_resolve_resp, 2547 .flags = RDMA_NL_ADMIN_PERM, 2548 }, 2549 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2550 .doit = ib_nl_handle_set_timeout, 2551 .flags = RDMA_NL_ADMIN_PERM, 2552 }, 2553 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2554 .doit = ib_nl_handle_ip_res_resp, 2555 .flags = RDMA_NL_ADMIN_PERM, 2556 }, 2557 }; 2558 2559 static int __init ib_core_init(void) 2560 { 2561 int ret; 2562 2563 ib_wq = alloc_workqueue("infiniband", 0, 0); 2564 if (!ib_wq) 2565 return -ENOMEM; 2566 2567 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2568 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2569 if (!ib_comp_wq) { 2570 ret = -ENOMEM; 2571 goto err; 2572 } 2573 2574 ib_comp_unbound_wq = 2575 alloc_workqueue("ib-comp-unb-wq", 2576 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2577 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2578 if (!ib_comp_unbound_wq) { 2579 ret = -ENOMEM; 2580 goto err_comp; 2581 } 2582 2583 ret = class_register(&ib_class); 2584 if (ret) { 2585 pr_warn("Couldn't create InfiniBand device class\n"); 2586 goto err_comp_unbound; 2587 } 2588 2589 ret = rdma_nl_init(); 2590 if (ret) { 2591 pr_warn("Couldn't init IB netlink interface: err %d\n", ret); 2592 goto err_sysfs; 2593 } 2594 2595 ret = addr_init(); 2596 if (ret) { 2597 pr_warn("Could't init IB address resolution\n"); 2598 goto err_ibnl; 2599 } 2600 2601 ret = ib_mad_init(); 2602 if (ret) { 2603 pr_warn("Couldn't init IB MAD\n"); 2604 goto err_addr; 
2605 } 2606 2607 ret = ib_sa_init(); 2608 if (ret) { 2609 pr_warn("Couldn't init SA\n"); 2610 goto err_mad; 2611 } 2612 2613 ret = register_lsm_notifier(&ibdev_lsm_nb); 2614 if (ret) { 2615 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 2616 goto err_sa; 2617 } 2618 2619 ret = register_pernet_device(&rdma_dev_net_ops); 2620 if (ret) { 2621 pr_warn("Couldn't init compat dev. ret %d\n", ret); 2622 goto err_compat; 2623 } 2624 2625 nldev_init(); 2626 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 2627 roce_gid_mgmt_init(); 2628 2629 return 0; 2630 2631 err_compat: 2632 unregister_lsm_notifier(&ibdev_lsm_nb); 2633 err_sa: 2634 ib_sa_cleanup(); 2635 err_mad: 2636 ib_mad_cleanup(); 2637 err_addr: 2638 addr_cleanup(); 2639 err_ibnl: 2640 rdma_nl_exit(); 2641 err_sysfs: 2642 class_unregister(&ib_class); 2643 err_comp_unbound: 2644 destroy_workqueue(ib_comp_unbound_wq); 2645 err_comp: 2646 destroy_workqueue(ib_comp_wq); 2647 err: 2648 destroy_workqueue(ib_wq); 2649 return ret; 2650 } 2651 2652 static void __exit ib_core_cleanup(void) 2653 { 2654 roce_gid_mgmt_cleanup(); 2655 nldev_exit(); 2656 rdma_nl_unregister(RDMA_NL_LS); 2657 unregister_pernet_device(&rdma_dev_net_ops); 2658 unregister_lsm_notifier(&ibdev_lsm_nb); 2659 ib_sa_cleanup(); 2660 ib_mad_cleanup(); 2661 addr_cleanup(); 2662 rdma_nl_exit(); 2663 class_unregister(&ib_class); 2664 destroy_workqueue(ib_comp_unbound_wq); 2665 destroy_workqueue(ib_comp_wq); 2666 /* Make sure that any pending umem accounting work is done. */ 2667 destroy_workqueue(ib_wq); 2668 flush_workqueue(system_unbound_wq); 2669 WARN_ON(!xa_empty(&clients)); 2670 WARN_ON(!xa_empty(&devices)); 2671 } 2672 2673 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 2674 2675 /* ib core relies on netdev stack to first register net_ns_type_operations 2676 * ns kobject type before ib_core initialization. 2677 */ 2678 fs_initcall(ib_core_init); 2679 module_exit(ib_core_cleanup); 2680