/*
 * Copyright (c) 2004 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <rdma/rdma_counter.h>

#include "core_priv.h"
#include "restrack.h"

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");

struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
static struct workqueue_struct *ib_unreg_wq;

/*
 * Each of the three rwsem locks (devices, clients, client_data) protects the
 * xarray of the same name. Specifically it allows the caller to assert that
 * the MARK will/will not be changing under the lock, and for devices and
 * clients, that the value in the xarray is still a valid pointer. Change of
 * the MARK is linked to the object state, so holding the lock and testing the
 * MARK also asserts that the contained object is in a certain state.
 *
 * This is used to build a two stage register/unregister flow where objects
 * can continue to be in the xarray even though they are still in progress to
 * register/unregister.
 *
 * The xarray itself provides additional locking, and restartable iteration,
 * which is also relied on.
 *
 * Locks should not be nested, with the exception of client_data, which is
 * allowed to nest under the read side of the other two locks.
 *
 * The devices_rwsem also protects the device name list, any change or
 * assignment of device name must also hold the write side to guarantee unique
 * names.
 */
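/*
 * Illustrative usage sketch (not additional API, just the pattern the comment
 * above describes): consumers take the read side of the relevant rwsem and
 * then walk only the marked entries, e.g. for registered devices:
 *
 *	down_read(&devices_rwsem);
 *	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
 *		... DEVICE_REGISTERED cannot be cleared while the rwsem
 *		    is held, so "device" stays valid in here ...
 *	}
 *	up_read(&devices_rwsem);
 *
 * See ib_policy_change_task() and add_all_compat_devs() below for real users
 * of this pattern.
 */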
/*
 * devices contains devices that have had their names assigned. The
 * devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
	if (refcount_dec_and_test(&client->uses))
		complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, rdma device net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/**
 * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has
 *			    CAP_NET_RAW capability or not.
 *
 * @dev: Pointer to rdma device whose capability to be checked
 *
 * Returns true if an rdma device's owning user namespace has CAP_NET_RAW
 * capability, otherwise false. When the rdma subsystem is in legacy shared
 * network namespace mode, the default net namespace is considered.
 */
bool rdma_dev_has_raw_cap(const struct ib_device *dev)
{
	const struct net *net;

	/* Network namespace is the resource whose user namespace
	 * to be considered. When in shared mode, there is no reliable
	 * network namespace resource, so consider the default net namespace.
	 */
	if (ib_devices_shared_netns)
		net = &init_net;
	else
		net = read_pnet(&dev->coredev.rdma_net);

	return ns_capable(net->user_ns, CAP_NET_RAW);
}
EXPORT_SYMBOL(rdma_dev_has_raw_cap);
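/*
 * Illustrative note (an assumption about usage, not part of this file's API):
 * netns_mode above is a read-only module parameter (0444), so the mode is
 * normally chosen when ib_core is loaded, e.g.:
 *
 *	modprobe ib_core netns_mode=0
 *
 * Runtime changes instead go through rdma_compatdev_set() further below,
 * driven by the RDMA netlink interface. With netns_mode=0,
 * rdma_dev_access_netns() really compares namespaces instead of always
 * returning true.
 */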
/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
 */
static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
			     xa_mark_t filter)
{
	XA_STATE(xas, xa, *indexp);
	void *entry;

	rcu_read_lock();
	do {
		entry = xas_find_marked(&xas, ULONG_MAX, filter);
		if (xa_is_zero(entry))
			break;
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();

	if (entry) {
		*indexp = xas.xa_index;
		if (xa_is_zero(entry))
			return NULL;
		return entry;
	}
	return XA_ERROR(-ENOENT);
}
#define xan_for_each_marked(xa, index, entry, filter)			\
	for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
	     !xa_is_err(entry);						\
	     (index)++, entry = xan_find_marked(xa, &(index), filter))

/* RCU hash table mapping netdevice pointers to struct ib_port_data */
static DEFINE_SPINLOCK(ndev_hash_lock);
static DECLARE_HASHTABLE(ndev_hash, 5);

static void free_netdevs(struct ib_device *ib_dev);
static void ib_unregister_work(struct work_struct *work);
static void __ib_unregister_device(struct ib_device *device);
static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data);
static void ib_policy_change_task(struct work_struct *work);
static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);

static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
			   struct va_format *vaf)
{
	if (ibdev && ibdev->dev.parent)
		dev_printk_emit(level[1] - '0',
				ibdev->dev.parent,
				"%s %s %s: %pV",
				dev_driver_string(ibdev->dev.parent),
				dev_name(ibdev->dev.parent),
				dev_name(&ibdev->dev),
				vaf);
	else if (ibdev)
		printk("%s%s: %pV",
		       level, dev_name(&ibdev->dev), vaf);
	else
		printk("%s(NULL ib_device): %pV", level, vaf);
}
#define define_ibdev_printk_level(func, level)			\
void func(const struct ib_device *ibdev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__ibdev_printk(level, ibdev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
define_ibdev_printk_level(ibdev_err, KERN_ERR);
define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
define_ibdev_printk_level(ibdev_info, KERN_INFO);

static struct notifier_block ibdev_lsm_nb = {
	.notifier_call = ib_security_change,
};

static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net);

/* Pointer to the RCU head at the start of the ib_port_data array */
struct ib_port_data_rcu {
	struct rcu_head rcu_head;
	struct ib_port_data pdata[];
};

static void ib_device_check_mandatory(struct ib_device *device)
{
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
	static const struct {
		size_t offset;
		char  *name;
	} mandatory_table[] = {
		IB_MANDATORY_FUNC(query_device),
		IB_MANDATORY_FUNC(query_port),
		IB_MANDATORY_FUNC(alloc_pd),
		IB_MANDATORY_FUNC(dealloc_pd),
		IB_MANDATORY_FUNC(create_qp),
		IB_MANDATORY_FUNC(modify_qp),
		IB_MANDATORY_FUNC(destroy_qp),
		IB_MANDATORY_FUNC(post_send),
		IB_MANDATORY_FUNC(post_recv),
		IB_MANDATORY_FUNC(create_cq),
		IB_MANDATORY_FUNC(destroy_cq),
		IB_MANDATORY_FUNC(poll_cq),
		IB_MANDATORY_FUNC(req_notify_cq),
		IB_MANDATORY_FUNC(get_dma_mr),
		IB_MANDATORY_FUNC(reg_user_mr),
		IB_MANDATORY_FUNC(dereg_mr),
		IB_MANDATORY_FUNC(get_port_immutable)
	};
	int i;

	device->kverbs_provider = true;
	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
		if (!*(void **) ((void *) &device->ops +
				 mandatory_table[i].offset)) {
			device->kverbs_provider = false;
			break;
		}
	}
}

/*
 * Caller must perform ib_device_put() to return the device reference count
 * when ib_device_get_by_index() returns valid device pointer.
 */
struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = xa_load(&devices, index);
	if (device) {
		if (!rdma_dev_access_netns(device, net)) {
			device = NULL;
			goto out;
		}

		if (!ib_device_try_get(device))
			device = NULL;
	}
out:
	up_read(&devices_rwsem);
	return device;
}

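/*
 * Illustrative usage sketch (hypothetical caller, not code from this file):
 * every successful lookup must be balanced by ib_device_put(), otherwise the
 * device can never finish unregistering.
 *
 *	struct ib_device *dev = ib_device_get_by_index(net, index);
 *
 *	if (dev) {
 *		... use dev; it cannot be unregistered or freed while
 *		    the reference obtained above is held ...
 *		ib_device_put(dev);
 *	}
 */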
/**
 * ib_device_put - Release IB device reference
 * @device: device whose reference to be released
 *
 * ib_device_put() releases reference to the IB device to allow it to be
 * unregistered and eventually free.
 */
void ib_device_put(struct ib_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->unreg_completion);
}
EXPORT_SYMBOL(ib_device_put);

static struct ib_device *__ib_device_get_by_name(const char *name)
{
	struct ib_device *device;
	unsigned long index;

	xa_for_each (&devices, index, device)
		if (!strcmp(name, dev_name(&device->dev)))
			return device;

	return NULL;
}

/**
 * ib_device_get_by_name - Find an IB device by name
 * @name: The name to look for
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device by its name. The caller must call
 * ib_device_put() on the returned pointer.
 */
struct ib_device *ib_device_get_by_name(const char *name,
					enum rdma_driver_id driver_id)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = __ib_device_get_by_name(name);
	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
	    device->ops.driver_id != driver_id)
		device = NULL;

	if (device) {
		if (!ib_device_try_get(device))
			device = NULL;
	}
	up_read(&devices_rwsem);
	return device;
}
EXPORT_SYMBOL(ib_device_get_by_name);

static int rename_compat_devs(struct ib_device *device)
{
	struct ib_core_device *cdev;
	unsigned long index;
	int ret = 0;

	mutex_lock(&device->compat_devs_mutex);
	xa_for_each (&device->compat_devs, index, cdev) {
		ret = device_rename(&cdev->dev, dev_name(&device->dev));
		if (ret) {
			dev_warn(&cdev->dev,
				 "Fail to rename compatdev to new name %s\n",
				 dev_name(&device->dev));
			break;
		}
	}
	mutex_unlock(&device->compat_devs_mutex);
	return ret;
}

int ib_device_rename(struct ib_device *ibdev, const char *name)
{
	unsigned long index;
	void *client_data;
	int ret;

	down_write(&devices_rwsem);
	if (!strcmp(name, dev_name(&ibdev->dev))) {
		up_write(&devices_rwsem);
		return 0;
	}

	if (__ib_device_get_by_name(name)) {
		up_write(&devices_rwsem);
		return -EEXIST;
	}

	ret = device_rename(&ibdev->dev, name);
	if (ret) {
		up_write(&devices_rwsem);
		return ret;
	}

	strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
	ret = rename_compat_devs(ibdev);

	downgrade_write(&devices_rwsem);
	down_read(&ibdev->client_data_rwsem);
	xan_for_each_marked(&ibdev->client_data, index, client_data,
			    CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || !client->rename)
			continue;

		client->rename(ibdev, client_data);
	}
	up_read(&ibdev->client_data_rwsem);
	rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT);
	up_read(&devices_rwsem);
	return 0;
}

int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
{
	if (use_dim > 1)
		return -EINVAL;
	ibdev->use_cq_dim = use_dim;

	return 0;
}

static int alloc_name(struct ib_device *ibdev, const char *name)
{
	struct ib_device *device;
	unsigned long index;
	struct ida inuse;
	int rc;
	int i;

	lockdep_assert_held_write(&devices_rwsem);
	ida_init(&inuse);
	xa_for_each (&devices, index, device) {
		char buf[IB_DEVICE_NAME_MAX];

		if (sscanf(dev_name(&device->dev), name, &i) != 1)
			continue;
		if (i < 0 || i >= INT_MAX)
			continue;
		snprintf(buf, sizeof buf, name, i);
		if (strcmp(buf, dev_name(&device->dev)) != 0)
			continue;

		rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
		if (rc < 0)
			goto out;
	}

	rc = ida_alloc(&inuse, GFP_KERNEL);
	if (rc < 0)
		goto out;

	rc = dev_set_name(&ibdev->dev, name, rc);
out:
	ida_destroy(&inuse);
	return rc;
}

static void ib_device_release(struct device *device)
{
	struct ib_device *dev = container_of(device, struct ib_device, dev);

	free_netdevs(dev);
	WARN_ON(refcount_read(&dev->refcount));
	if (dev->hw_stats_data)
		ib_device_release_hw_stats(dev->hw_stats_data);
	if (dev->port_data) {
		ib_cache_release_one(dev);
		ib_security_release_port_pkey_list(dev);
		rdma_counter_release(dev);
		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
				       pdata[0]),
			  rcu_head);
	}

	mutex_destroy(&dev->subdev_lock);
	mutex_destroy(&dev->unregistration_lock);
	mutex_destroy(&dev->compat_devs_mutex);

	xa_destroy(&dev->compat_devs);
	xa_destroy(&dev->client_data);
	kfree_rcu(dev, rcu_head);
}

static int ib_device_uevent(const struct device *device,
			    struct kobj_uevent_env *env)
{
	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
		return -ENOMEM;

	/*
	 * It would be nice to pass the node GUID with the event...
	 */

	return 0;
}

static const void *net_namespace(const struct device *d)
{
	const struct ib_core_device *coredev =
			container_of(d, struct ib_core_device, dev);

	return read_pnet(&coredev->rdma_net);
}

static struct class ib_class = {
	.name = "infiniband",
	.dev_release = ib_device_release,
	.dev_uevent = ib_device_uevent,
	.ns_type = &net_ns_type_operations,
	.namespace = net_namespace,
};

static void rdma_init_coredev(struct ib_core_device *coredev,
			      struct ib_device *dev, struct net *net)
{
	bool is_full_dev = &dev->coredev == coredev;

	/* This BUILD_BUG_ON is intended to catch layout change
	 * of union of ib_core_device and device.
	 * dev must be the first element as ib_core and providers
	 * driver uses it. Adding anything in ib_core_device before
	 * device will break this assumption.
	 */
	BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
		     offsetof(struct ib_device, dev));

	coredev->dev.class = &ib_class;
	coredev->dev.groups = dev->groups;

	/*
	 * Don't expose hw counters outside of the init namespace.
	 */
	if (!is_full_dev && dev->hw_stats_attr_index)
		coredev->dev.groups[dev->hw_stats_attr_index] = NULL;

	device_initialize(&coredev->dev);
	coredev->owner = dev;
	INIT_LIST_HEAD(&coredev->port_list);
	write_pnet(&coredev->rdma_net, net);
}
/**
 * _ib_alloc_device - allocate an IB device struct
 * @size:size of structure to allocate
 * @net: network namespace device should be located in, namespace
 *       must stay valid until ib_register_device() is completed.
 *
 * Low-level drivers should use ib_alloc_device() to allocate &struct
 * ib_device. @size is the size of the structure to be allocated,
 * including any private data used by the low-level driver.
 * ib_dealloc_device() must be used to free structures allocated with
 * ib_alloc_device().
 */
struct ib_device *_ib_alloc_device(size_t size, struct net *net)
{
	struct ib_device *device;
	unsigned int i;

	if (WARN_ON(size < sizeof(struct ib_device)))
		return NULL;

	device = kzalloc(size, GFP_KERNEL);
	if (!device)
		return NULL;

	if (rdma_restrack_init(device)) {
		kfree(device);
		return NULL;
	}

	/* ib_devices_shared_netns can't change while we have active namespaces
	 * in the system which means either init_net is passed or the user has
	 * no idea what they are doing.
	 *
	 * To avoid breaking backward compatibility, when in shared mode,
	 * force to init the device in the init_net.
	 */
	net = ib_devices_shared_netns ? &init_net : net;
	rdma_init_coredev(&device->coredev, device, net);

	INIT_LIST_HEAD(&device->event_handler_list);
	spin_lock_init(&device->qp_open_list_lock);
	init_rwsem(&device->event_handler_rwsem);
	mutex_init(&device->unregistration_lock);
	/*
	 * client_data needs to be alloc because we don't want our mark to be
	 * destroyed if the user stores NULL in the client data.
	 */
	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
	init_rwsem(&device->client_data_rwsem);
	xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
	mutex_init(&device->compat_devs_mutex);
	init_completion(&device->unreg_completion);
	INIT_WORK(&device->unregistration_work, ib_unregister_work);

	spin_lock_init(&device->cq_pools_lock);
	for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++)
		INIT_LIST_HEAD(&device->cq_pools[i]);

	rwlock_init(&device->cache_lock);

	device->uverbs_cmd_mask =
		BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) |
		BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) |
		BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) |
		BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) |
		BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) |
		BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
		BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) |
		BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) |
		BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) |
		BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) |
		BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) |
		BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) |
		BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) |
		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) |
		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) |
		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) |
		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) |
		BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) |
		BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) |
		BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) |
		BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) |
		BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) |
		BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) |
		BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) |
		BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) |
		BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) |
		BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) |
		BIT_ULL(IB_USER_VERBS_CMD_REG_MR) |
		BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) |
		BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ);

	mutex_init(&device->subdev_lock);
	INIT_LIST_HEAD(&device->subdev_list_head);
	INIT_LIST_HEAD(&device->subdev_list);

	return device;
}
EXPORT_SYMBOL(_ib_alloc_device);
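/*
 * Illustrative allocation sketch (hypothetical driver, not code from this
 * file): providers embed struct ib_device inside their own structure and let
 * the ib_alloc_device() wrapper in <rdma/ib_verbs.h> call _ib_alloc_device()
 * with the right size, roughly:
 *
 *	struct my_dev {
 *		struct ib_device ibdev;   // container_of() gets back to my_dev
 *		...driver private state...
 *	};
 *
 *	my = ib_alloc_device(my_dev, ibdev);  // wrapper args are an assumption;
 *	if (!my)                              // see ib_verbs.h for the exact form
 *		return -ENOMEM;
 *	...
 *	ib_dealloc_device(&my->ibdev);        // on error / final teardown
 */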
/**
 * ib_dealloc_device - free an IB device struct
 * @device:structure to free
 *
 * Free a structure allocated with ib_alloc_device().
 */
void ib_dealloc_device(struct ib_device *device)
{
	if (device->ops.dealloc_driver)
		device->ops.dealloc_driver(device);

	/*
	 * ib_unregister_driver() requires all devices to remain in the xarray
	 * while their ops are callable. The last op we call is dealloc_driver
	 * above. This is needed to create a fence on op callbacks prior to
	 * allowing the driver module to unload.
	 */
	down_write(&devices_rwsem);
	if (xa_load(&devices, device->index) == device)
		xa_erase(&devices, device->index);
	up_write(&devices_rwsem);

	/* Expedite releasing netdev references */
	free_netdevs(device);

	WARN_ON(!xa_empty(&device->compat_devs));
	WARN_ON(!xa_empty(&device->client_data));
	WARN_ON(refcount_read(&device->refcount));
	rdma_restrack_clean(device);
	/* Balances with device_initialize */
	put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence, any caller must not return until the add
 * or remove is fully completed.
 */
static int add_client_context(struct ib_device *device,
			      struct ib_client *client)
{
	int ret = 0;

	if (!device->kverbs_provider && !client->no_kverbs_req)
		return 0;

	down_write(&device->client_data_rwsem);
	/*
	 * So long as the client is registered hold both the client and device
	 * unregistration locks.
	 */
	if (!refcount_inc_not_zero(&client->uses))
		goto out_unlock;
	refcount_inc(&device->refcount);

	/*
	 * Another caller to add_client_context got here first and has already
	 * completely initialized context.
	 */
	if (xa_get_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add) {
		if (client->add(device)) {
			/*
			 * If a client fails to add then the error code is
			 * ignored, but we won't call any more ops on this
			 * client.
			 */
			xa_erase(&device->client_data, client->client_id);
			up_read(&device->client_data_rwsem);
			ib_device_put(device);
			ib_client_put(client);
			return 0;
		}
	}

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	ib_device_put(device);
	ib_client_put(client);
out_unlock:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	up_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	ib_device_put(device);
	ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	u32 port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/* Reserve U32_MAX so the logic to go over all the ports is sane */
	if (WARN_ON(device->phys_port_cnt == U32_MAX))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1 based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					size_add(rdma_end_port(device), 1)),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
	 */
	device->port_data = pdata_rcu->pdata;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		pdata->ib_dev = device;
		spin_lock_init(&pdata->pkey_list_lock);
		INIT_LIST_HEAD(&pdata->pkey_list);
		spin_lock_init(&pdata->netdev_lock);
		INIT_HLIST_NODE(&pdata->ndev_hash_link);
	}
	return 0;
}

static int verify_immutable(const struct ib_device *dev, u32 port)
{
	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
		       rdma_max_mad_size(dev, port) != 0);
}

static int setup_port_data(struct ib_device *device)
{
	u32 port;
	int ret;

	ret = alloc_port_data(device);
	if (ret)
		return ret;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		ret = device->ops.get_port_immutable(device, port,
						     &pdata->immutable);
		if (ret)
			return ret;

		if (verify_immutable(device, port))
			return -EINVAL;
	}
	return 0;
}

/**
 * ib_port_immutable_read() - Read rdma port's immutable data
 * @dev: IB device
 * @port: port number whose immutable data to read. It starts with index 1 and
 *        is valid up to and including rdma_end_port().
 */
const struct ib_port_immutable*
ib_port_immutable_read(struct ib_device *dev, unsigned int port)
{
	WARN_ON(!rdma_is_port_valid(dev, port));
	return &dev->port_data[port].immutable;
}
EXPORT_SYMBOL(ib_port_immutable_read);
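/*
 * Illustrative usage sketch (hypothetical caller, not code from this file):
 * ports are 1-based, so walk them with rdma_for_each_port() and read the
 * per-port immutable data through the accessor above, e.g.:
 *
 *	u32 port;
 *
 *	rdma_for_each_port (dev, port) {
 *		const struct ib_port_immutable *imm =
 *			ib_port_immutable_read(dev, port);
 *
 *		... imm->pkey_tbl_len, imm->gid_tbl_len,
 *		    imm->core_cap_flags ...
 *	}
 */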
void ib_get_device_fw_str(struct ib_device *dev, char *str)
{
	if (dev->ops.get_dev_fw_str)
		dev->ops.get_dev_fw_str(dev, str);
	else
		str[0] = '\0';
}
EXPORT_SYMBOL(ib_get_device_fw_str);

static void ib_policy_change_task(struct work_struct *work)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		unsigned int i;

		rdma_for_each_port (dev, i) {
			u64 sp;
			ib_get_cached_subnet_prefix(dev, i, &sp);
			ib_security_cache_change(dev, i, sp);
		}
	}
	up_read(&devices_rwsem);
}

static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data)
{
	if (event != LSM_POLICY_CHANGE)
		return NOTIFY_DONE;

	schedule_work(&ib_policy_change_work);
	ib_mad_agent_security_change();

	return NOTIFY_OK;
}

static void compatdev_release(struct device *dev)
{
	struct ib_core_device *cdev =
		container_of(dev, struct ib_core_device, dev);

	kfree(cdev);
}

static int add_one_compat_dev(struct ib_device *device,
			      struct rdma_dev_net *rnet)
{
	struct ib_core_device *cdev;
	int ret;

	lockdep_assert_held(&rdma_nets_rwsem);
	if (!ib_devices_shared_netns)
		return 0;

	/*
	 * Create and add compat device in all namespaces other than where it
	 * is currently bound to.
	 */
	if (net_eq(read_pnet(&rnet->net),
		   read_pnet(&device->coredev.rdma_net)))
		return 0;

	/*
	 * The first of init_net() or ib_register_device() to take the
	 * compat_devs_mutex wins and gets to add the device. Others will wait
	 * for completion here.
	 */
	mutex_lock(&device->compat_devs_mutex);
	cdev = xa_load(&device->compat_devs, rnet->id);
	if (cdev) {
		ret = 0;
		goto done;
	}
	ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
	if (ret)
		goto done;

	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
	if (!cdev) {
		ret = -ENOMEM;
		goto cdev_err;
	}

	cdev->dev.parent = device->dev.parent;
	rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
	cdev->dev.release = compatdev_release;
	ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
	if (ret)
		goto add_err;

	ret = device_add(&cdev->dev);
	if (ret)
		goto add_err;
	ret = ib_setup_port_attrs(cdev);
	if (ret)
		goto port_err;

	ret = xa_err(xa_store(&device->compat_devs, rnet->id,
			      cdev, GFP_KERNEL));
	if (ret)
		goto insert_err;

	mutex_unlock(&device->compat_devs_mutex);
	return 0;

insert_err:
	ib_free_port_attrs(cdev);
port_err:
	device_del(&cdev->dev);
add_err:
	put_device(&cdev->dev);
cdev_err:
	xa_release(&device->compat_devs, rnet->id);
done:
	mutex_unlock(&device->compat_devs_mutex);
	return ret;
}

static void remove_one_compat_dev(struct ib_device *device, u32 id)
{
	struct ib_core_device *cdev;

	mutex_lock(&device->compat_devs_mutex);
	cdev = xa_erase(&device->compat_devs, id);
	mutex_unlock(&device->compat_devs_mutex);
	if (cdev) {
		ib_free_port_attrs(cdev);
		device_del(&cdev->dev);
		put_device(&cdev->dev);
	}
}

static void remove_compat_devs(struct ib_device *device)
{
	struct ib_core_device *cdev;
	unsigned long index;

	xa_for_each (&device->compat_devs, index, cdev)
		remove_one_compat_dev(device, index);
}

static int add_compat_devs(struct ib_device *device)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	int ret = 0;

	lockdep_assert_held(&devices_rwsem);

	down_read(&rdma_nets_rwsem);
	xa_for_each (&rdma_nets, index, rnet) {
		ret = add_one_compat_dev(device, rnet);
		if (ret)
			break;
	}
	up_read(&rdma_nets_rwsem);
	return ret;
}

static void remove_all_compat_devs(void)
{
	struct ib_compat_device *cdev;
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		unsigned long c_index = 0;

		/* Hold nets_rwsem so that any other thread modifying this
		 * system param can sync with this thread.
		 */
		down_read(&rdma_nets_rwsem);
		xa_for_each (&dev->compat_devs, c_index, cdev)
			remove_one_compat_dev(dev, c_index);
		up_read(&rdma_nets_rwsem);
	}
	up_read(&devices_rwsem);
}
static int add_all_compat_devs(void)
{
	struct rdma_dev_net *rnet;
	struct ib_device *dev;
	unsigned long index;
	int ret = 0;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		unsigned long net_index = 0;

		/* Hold nets_rwsem so that any other thread modifying this
		 * system param can sync with this thread.
		 */
		down_read(&rdma_nets_rwsem);
		xa_for_each (&rdma_nets, net_index, rnet) {
			ret = add_one_compat_dev(dev, rnet);
			if (ret)
				break;
		}
		up_read(&rdma_nets_rwsem);
	}
	up_read(&devices_rwsem);
	if (ret)
		remove_all_compat_devs();
	return ret;
}

int rdma_compatdev_set(u8 enable)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	int ret = 0;

	down_write(&rdma_nets_rwsem);
	if (ib_devices_shared_netns == enable) {
		up_write(&rdma_nets_rwsem);
		return 0;
	}

	/* enable/disable of compat devices is not supported
	 * when more than the default init_net exists.
	 */
	xa_for_each (&rdma_nets, index, rnet) {
		ret++;
		break;
	}
	if (!ret)
		ib_devices_shared_netns = enable;
	up_write(&rdma_nets_rwsem);
	if (ret)
		return -EBUSY;

	if (enable)
		ret = add_all_compat_devs();
	else
		remove_all_compat_devs();
	return ret;
}

static void rdma_dev_exit_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	struct ib_device *dev;
	unsigned long index;
	int ret;

	down_write(&rdma_nets_rwsem);
	/*
	 * Prevent the ID from being re-used and hide the id from xa_for_each.
	 */
	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
	WARN_ON(ret);
	up_write(&rdma_nets_rwsem);

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		get_device(&dev->dev);
		/*
		 * Release the devices_rwsem so that the potentially blocking
		 * device_del() doesn't hold the devices_rwsem for too long.
		 */
		up_read(&devices_rwsem);

		remove_one_compat_dev(dev, rnet->id);

		/*
		 * If the real device is in the NS then move it back to init.
		 */
		rdma_dev_change_netns(dev, net, &init_net);

		put_device(&dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);

	rdma_nl_net_exit(rnet);
	xa_erase(&rdma_nets, rnet->id);
}

static __net_init int rdma_dev_init_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	unsigned long index;
	struct ib_device *dev;
	int ret;

	write_pnet(&rnet->net, net);

	ret = rdma_nl_net_init(rnet);
	if (ret)
		return ret;

	/* No need to create any compat devices in default init_net. */
	if (net_eq(net, &init_net))
		return 0;

	ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
	if (ret) {
		rdma_nl_net_exit(rnet);
		return ret;
	}

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		/* Hold nets_rwsem so that netlink command cannot change
		 * system configuration for device sharing mode.
		 */
		down_read(&rdma_nets_rwsem);
		ret = add_one_compat_dev(dev, rnet);
		up_read(&rdma_nets_rwsem);
		if (ret)
			break;
	}
	up_read(&devices_rwsem);

	if (ret)
		rdma_dev_exit_net(net);

	return ret;
}
/*
 * Assign the unique string device name and the unique device index. This is
 * undone by ib_dealloc_device.
 */
static int assign_name(struct ib_device *device, const char *name)
{
	static u32 last_id;
	int ret;

	down_write(&devices_rwsem);
	/* Assign a unique name to the device */
	if (strchr(name, '%'))
		ret = alloc_name(device, name);
	else
		ret = dev_set_name(&device->dev, name);
	if (ret)
		goto out;

	if (__ib_device_get_by_name(dev_name(&device->dev))) {
		ret = -ENFILE;
		goto out;
	}
	strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);

	ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
			      &last_id, GFP_KERNEL);
	if (ret > 0)
		ret = 0;

out:
	up_write(&devices_rwsem);
	return ret;
}

/*
 * setup_device() allocates memory and sets up data that requires calling the
 * device ops; this is the only reason these actions are not done during
 * ib_alloc_device. It is undone by ib_dealloc_device().
 */
static int setup_device(struct ib_device *device)
{
	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
	int ret;

	ib_device_check_mandatory(device);

	ret = setup_port_data(device);
	if (ret) {
		dev_warn(&device->dev, "Couldn't create per-port data\n");
		return ret;
	}

	memset(&device->attrs, 0, sizeof(device->attrs));
	ret = device->ops.query_device(device, &device->attrs, &uhw);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't query the device attributes\n");
		return ret;
	}

	return 0;
}

static void disable_device(struct ib_device *device)
{
	u32 cid;

	WARN_ON(!refcount_read(&device->refcount));

	down_write(&devices_rwsem);
	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
	up_write(&devices_rwsem);

	/*
	 * Remove clients in LIFO order, see assign_client_id. This could be
	 * more efficient if xarray learns to reverse iterate. Since no new
	 * clients can be added to this ib_device past this point we only need
	 * the maximum possible client_id value here.
	 */
	down_read(&clients_rwsem);
	cid = highest_client_id;
	up_read(&clients_rwsem);
	while (cid) {
		cid--;
		remove_client_context(device, cid);
	}

	ib_cq_pool_cleanup(device);

	/* Pairs with refcount_set in enable_device */
	ib_device_put(device);
	wait_for_completion(&device->unreg_completion);

	/*
	 * compat devices must be removed after device refcount drops to zero.
	 * Otherwise init_net() may add more compatdevs after removing compat
	 * devices and before device is disabled.
	 */
	remove_compat_devs(device);
}

/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 */
static int enable_device_and_get(struct ib_device *device)
{
	struct ib_client *client;
	unsigned long index;
	int ret = 0;

	/*
	 * One ref belongs to the xa and the other belongs to this
	 * thread. This is needed to guard against parallel unregistration.
	 */
	refcount_set(&device->refcount, 2);
	down_write(&devices_rwsem);
	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);

	/*
	 * By using downgrade_write() we ensure that no other thread can clear
	 * DEVICE_REGISTERED while we are completing the client setup.
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);
	if (!ret)
		ret = add_compat_devs(device);
out:
	up_read(&devices_rwsem);
	return ret;
}

static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}

static void ib_device_notify_register(struct ib_device *device)
{
	struct net_device *netdev;
	u32 port;
	int ret;

	down_read(&devices_rwsem);

	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
	if (ret)
		goto out;

	rdma_for_each_port(device, port) {
		netdev = ib_device_get_netdev(device, port);
		if (!netdev)
			continue;

		ret = rdma_nl_notify_event(device, port,
					   RDMA_NETDEV_ATTACH_EVENT);
		dev_put(netdev);
		if (ret)
			goto out;
	}

out:
	up_read(&devices_rwsem);
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
 * @name: unique string device name. This may include a '%' which will
 *	  cause a unique index to be added to the passed device name.
 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
 *	        device will be used. In this case the caller should fully
 *		setup the ibdev for DMA. This usually means using dma_virt_ops.
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core. All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 */
int ib_register_device(struct ib_device *device, const char *name,
		       struct device *dma_device)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	/*
	 * If the caller does not provide a DMA capable device then the IB core
	 * will set up ib_sge and scatterlist structures that stash the kernel
	 * virtual address into the address field.
	 */
	WARN_ON(dma_device && !dma_device->dma_parms);
	device->dma_device = dma_device;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	device->groups[0] = &ib_dev_attr_group;
	device->groups[1] = device->ops.device_group;
	ret = ib_setup_device_attrs(device);
	if (ret)
		goto cache_cleanup;

	ib_device_register_rdmacg(device);

	rdma_counter_init(device);

	/*
	 * Ensure that ADD uevent is not fired because it
	 * is too early and the device is not initialized yet.
	 */
	dev_set_uevent_suppress(&device->dev, true);
	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_setup_port_attrs(&device->coredev);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility for a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * special dealloc_driver pointer, causing the responsibility to
		 * ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = prevent_dealloc_device;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		dev_set_uevent_suppress(&device->dev, false);
		return ret;
	}
	dev_set_uevent_suppress(&device->dev, false);

	ib_device_notify_register(device);

	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
cache_cleanup:
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);

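/*
 * Illustrative registration sketch (hypothetical driver, not code from this
 * file): a provider's probe/remove flow around the API above looks roughly
 * like this; the "my_*" names and pdev are placeholders.
 *
 *	my = ib_alloc_device(...);		// see _ib_alloc_device() above
 *	ib_set_device_ops(&my->ibdev, &my_dev_ops);
 *	my->ibdev.phys_port_cnt = 1;
 *	ret = ib_register_device(&my->ibdev, "my%d", &pdev->dev);
 *	if (ret)
 *		ib_dealloc_device(&my->ibdev);	// caller still owns it on error
 *	...
 *	ib_unregister_device(&my->ibdev);	// on remove; see below
 */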
/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	struct ib_device *sub, *tmp;

	mutex_lock(&ib_dev->subdev_lock);
	list_for_each_entry_safe_reverse(sub, tmp,
					 &ib_dev->subdev_list_head,
					 subdev_list) {
		list_del(&sub->subdev_list);
		ib_dev->ops.del_sub_dev(sub);
		ib_device_put(ib_dev);
	}
	mutex_unlock(&ib_dev->subdev_lock);

	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced, once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
	 */
	mutex_lock(&ib_dev->unregistration_lock);
	if (!refcount_read(&ib_dev->refcount))
		goto out;

	disable_device(ib_dev);
	rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(ib_dev);

	ib_free_port_attrs(&ib_dev->coredev);
	device_del(&ib_dev->dev);
	ib_device_unregister_rdmacg(ib_dev);
	ib_cache_cleanup_one(ib_dev);

	/*
	 * Drivers using the new flow may not call ib_dealloc_device except
	 * in error unwind prior to registration success.
	 */
	if (ib_dev->ops.dealloc_driver &&
	    ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
		ib_dealloc_device(ib_dev);
	}
out:
	mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	get_device(&ib_dev->dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id, that
 * is the responsibility of the caller.
 */
void ib_unregister_driver(enum rdma_driver_id driver_id)
{
	struct ib_device *ib_dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, ib_dev) {
		if (ib_dev->ops.driver_id != driver_id)
			continue;

		get_device(&ib_dev->dev);
		up_read(&devices_rwsem);

		WARN_ON(!ib_dev->ops.dealloc_driver);
		__ib_unregister_device(ib_dev);

		put_device(&ib_dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);
}
EXPORT_SYMBOL(ib_unregister_driver);

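/*
 * Illustrative module-exit sketch (hypothetical driver, not code from this
 * file): a driver that queues unregistrations must fence them with
 * ib_unregister_driver() before unloading; RDMA_DRIVER_EXAMPLE stands in for
 * the driver's real rdma_driver_id value.
 *
 *	static void __exit my_driver_exit(void)
 *	{
 *		ib_unregister_driver(RDMA_DRIVER_EXAMPLE);
 *		... no device ops of this driver can still be running here ...
 *	}
 */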
static void ib_unregister_work(struct work_struct *work)
{
	struct ib_device *ib_dev =
		container_of(work, struct ib_device, unregistration_work);

	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}

/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * This schedules an asynchronous unregistration using a WQ for the device. A
 * driver should use this to avoid holding locks while doing unregistration,
 * such as holding the RTNL lock.
 *
 * Drivers using this API must use ib_unregister_driver before module unload
 * to ensure that all scheduled unregistrations have completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
	WARN_ON(!refcount_read(&ib_dev->refcount));
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work))
		put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);

/*
 * The caller must pass in a device that has the kref held and the refcount
 * released. If the device is in cur_net and still registered then it is moved
 * into net.
 */
static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net)
{
	int ret2 = -EINVAL;
	int ret;

	mutex_lock(&device->unregistration_lock);

	/*
	 * If a device is not under ib_device_get() or if the
	 * unregistration_lock is not held, the namespace can be changed, or it
	 * can be unregistered. Check again under the lock.
	 */
	if (refcount_read(&device->refcount) == 0 ||
	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
		ret = -ENODEV;
		goto out;
	}

	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
	disable_device(device);

	/*
	 * At this point no one can be using the device, so it is safe to
	 * change the namespace.
	 */
	write_pnet(&device->coredev.rdma_net, net);

	down_read(&devices_rwsem);
	/*
	 * Currently rdma devices are system wide unique. So the device name
	 * is guaranteed free in the new namespace. Publish the new namespace
	 * at the sysfs level.
	 */
	ret = device_rename(&device->dev, dev_name(&device->dev));
	up_read(&devices_rwsem);
	if (ret) {
		dev_warn(&device->dev,
			 "%s: Couldn't rename device after namespace change\n",
			 __func__);
		/* Try and put things back and re-enable the device */
		write_pnet(&device->coredev.rdma_net, cur_net);
	}

	ret2 = enable_device_and_get(device);
	if (ret2) {
		/*
		 * This shouldn't really happen, but if it does, let the user
		 * retry at later point. So don't disable the device.
		 */
		dev_warn(&device->dev,
			 "%s: Couldn't re-enable device after namespace change\n",
			 __func__);
	}
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ib_device_put(device);
out:
	mutex_unlock(&device->unregistration_lock);
	if (ret)
		return ret;
	return ret2;
}

int ib_device_set_netns_put(struct sk_buff *skb,
			    struct ib_device *dev, u32 ns_fd)
{
	struct net *net;
	int ret;

	net = get_net_ns_by_fd(ns_fd);
	if (IS_ERR(net)) {
		ret = PTR_ERR(net);
		goto net_err;
	}

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
		ret = -EPERM;
		goto ns_err;
	}

	/*
	 * All the ib_clients, including uverbs, are reset when the namespace is
	 * changed and this cannot be blocked waiting for userspace to do
	 * something, so disassociation is mandatory.
	 */
	if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) {
		ret = -EOPNOTSUPP;
		goto ns_err;
	}

	get_device(&dev->dev);
	ib_device_put(dev);
	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
	put_device(&dev->dev);

	put_net(net);
	return ret;

ns_err:
	put_net(net);
net_err:
	ib_device_put(dev);
	return ret;
}

static struct pernet_operations rdma_dev_net_ops = {
	.init = rdma_dev_init_net,
	.exit = rdma_dev_exit_net,
	.id = &rdma_dev_net_id,
	.size = sizeof(struct rdma_dev_net),
};

static int assign_client_id(struct ib_client *client)
{
	int ret;

	lockdep_assert_held(&clients_rwsem);
	/*
	 * The add/remove callbacks must be called in FIFO/LIFO order. To
	 * achieve this we assign client_ids so they are sorted in
	 * registration order.
	 */
	client->client_id = highest_client_id;
	ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
	if (ret)
		return ret;

	highest_client_id++;
	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
	return 0;
}

static void remove_client_id(struct ib_client *client)
{
	down_write(&clients_rwsem);
	xa_erase(&clients, client->client_id);
	for (; highest_client_id; highest_client_id--)
		if (xa_load(&clients, highest_client_id - 1))
			break;
	up_write(&clients_rwsem);
}

/**
 * ib_register_client - Register an IB client
 * @client:Client to register
 *
 * Upper level users of the IB drivers can use ib_register_client() to
 * register callbacks for IB device addition and removal. When an IB
 * device is added, each registered client's add method will be called
 * (in the order the clients were registered), and when a device is
 * removed, each client's remove method will be called (in the reverse
 * order that clients were registered). In addition, when
 * ib_register_client() is called, the client will receive an add
 * callback for all devices already registered.
 */
int ib_register_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;
	bool need_unreg = false;
	int ret;

	refcount_set(&client->uses, 1);
	init_completion(&client->uses_zero);

	/*
	 * The devices_rwsem is held in write mode to ensure that a racing
	 * ib_register_device() sees a consistent view of clients and devices.
	 */
	down_write(&devices_rwsem);
	down_write(&clients_rwsem);
	ret = assign_client_id(client);
	if (ret)
		goto out;

	need_unreg = true;
	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			goto out;
	}
	ret = 0;
out:
	up_write(&clients_rwsem);
	up_write(&devices_rwsem);
	if (need_unreg && ret)
		ib_unregister_client(client);
	return ret;
}
EXPORT_SYMBOL(ib_register_client);

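/*
 * Illustrative client sketch (hypothetical module, not code from this file):
 * a minimal ib_client stores per-device state with ib_set_client_data() in
 * its add() callback and receives it back in remove(); "my_*" names are
 * placeholders.
 *
 *	static int my_add_one(struct ib_device *device)
 *	{
 *		struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 *
 *		if (!ctx)
 *			return -ENOMEM;
 *		ib_set_client_data(device, &my_client, ctx);
 *		return 0;
 *	}
 *
 *	static void my_remove_one(struct ib_device *device, void *client_data)
 *	{
 *		kfree(client_data);
 *	}
 *
 *	static struct ib_client my_client = {
 *		.name   = "my_client",
 *		.add    = my_add_one,
 *		.remove = my_remove_one,
 *	};
 *
 *	ib_register_client(&my_client);		// at module init
 *	ib_unregister_client(&my_client);	// at module exit
 */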
1896 * 1897 * This is a full fence, once it returns no client callbacks will be called, 1898 * or are running in another thread. 1899 */ 1900 void ib_unregister_client(struct ib_client *client) 1901 { 1902 struct ib_device *device; 1903 unsigned long index; 1904 1905 down_write(&clients_rwsem); 1906 ib_client_put(client); 1907 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1908 up_write(&clients_rwsem); 1909 1910 /* We do not want to have locks while calling client->remove() */ 1911 rcu_read_lock(); 1912 xa_for_each (&devices, index, device) { 1913 if (!ib_device_try_get(device)) 1914 continue; 1915 rcu_read_unlock(); 1916 1917 remove_client_context(device, client->client_id); 1918 1919 ib_device_put(device); 1920 rcu_read_lock(); 1921 } 1922 rcu_read_unlock(); 1923 1924 /* 1925 * remove_client_context() is not a fence, it can return even though a 1926 * removal is ongoing. Wait until all removals are completed. 1927 */ 1928 wait_for_completion(&client->uses_zero); 1929 remove_client_id(client); 1930 } 1931 EXPORT_SYMBOL(ib_unregister_client); 1932 1933 static int __ib_get_global_client_nl_info(const char *client_name, 1934 struct ib_client_nl_info *res) 1935 { 1936 struct ib_client *client; 1937 unsigned long index; 1938 int ret = -ENOENT; 1939 1940 down_read(&clients_rwsem); 1941 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1942 if (strcmp(client->name, client_name) != 0) 1943 continue; 1944 if (!client->get_global_nl_info) { 1945 ret = -EOPNOTSUPP; 1946 break; 1947 } 1948 ret = client->get_global_nl_info(res); 1949 if (WARN_ON(ret == -ENOENT)) 1950 ret = -EINVAL; 1951 if (!ret && res->cdev) 1952 get_device(res->cdev); 1953 break; 1954 } 1955 up_read(&clients_rwsem); 1956 return ret; 1957 } 1958 1959 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1960 const char *client_name, 1961 struct ib_client_nl_info *res) 1962 { 1963 unsigned long index; 1964 void *client_data; 1965 int ret = -ENOENT; 1966 1967 down_read(&ibdev->client_data_rwsem); 1968 xan_for_each_marked (&ibdev->client_data, index, client_data, 1969 CLIENT_DATA_REGISTERED) { 1970 struct ib_client *client = xa_load(&clients, index); 1971 1972 if (!client || strcmp(client->name, client_name) != 0) 1973 continue; 1974 if (!client->get_nl_info) { 1975 ret = -EOPNOTSUPP; 1976 break; 1977 } 1978 ret = client->get_nl_info(ibdev, client_data, res); 1979 if (WARN_ON(ret == -ENOENT)) 1980 ret = -EINVAL; 1981 1982 /* 1983 * The cdev is guaranteed valid as long as we are inside the 1984 * client_data_rwsem as remove_one can't be called. Keep it 1985 * valid for the caller. 
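 *
 * The reference taken here is expected to be dropped by the caller with
 * put_device() once it is done with res->cdev.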
1986 */ 1987 if (!ret && res->cdev) 1988 get_device(res->cdev); 1989 break; 1990 } 1991 up_read(&ibdev->client_data_rwsem); 1992 1993 return ret; 1994 } 1995 1996 /** 1997 * ib_get_client_nl_info - Fetch the nl_info from a client 1998 * @ibdev: IB device 1999 * @client_name: Name of the client 2000 * @res: Result of the query 2001 */ 2002 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 2003 struct ib_client_nl_info *res) 2004 { 2005 int ret; 2006 2007 if (ibdev) 2008 ret = __ib_get_client_nl_info(ibdev, client_name, res); 2009 else 2010 ret = __ib_get_global_client_nl_info(client_name, res); 2011 #ifdef CONFIG_MODULES 2012 if (ret == -ENOENT) { 2013 request_module("rdma-client-%s", client_name); 2014 if (ibdev) 2015 ret = __ib_get_client_nl_info(ibdev, client_name, res); 2016 else 2017 ret = __ib_get_global_client_nl_info(client_name, res); 2018 } 2019 #endif 2020 if (ret) { 2021 if (ret == -ENOENT) 2022 return -EOPNOTSUPP; 2023 return ret; 2024 } 2025 2026 if (WARN_ON(!res->cdev)) 2027 return -EINVAL; 2028 return 0; 2029 } 2030 2031 /** 2032 * ib_set_client_data - Set IB client context 2033 * @device:Device to set context for 2034 * @client:Client to set context for 2035 * @data:Context to set 2036 * 2037 * ib_set_client_data() sets client context data that can be retrieved with 2038 * ib_get_client_data(). This can only be called while the client is 2039 * registered to the device, once the ib_client remove() callback returns this 2040 * cannot be called. 2041 */ 2042 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 2043 void *data) 2044 { 2045 void *rc; 2046 2047 if (WARN_ON(IS_ERR(data))) 2048 data = NULL; 2049 2050 rc = xa_store(&device->client_data, client->client_id, data, 2051 GFP_KERNEL); 2052 WARN_ON(xa_is_err(rc)); 2053 } 2054 EXPORT_SYMBOL(ib_set_client_data); 2055 2056 /** 2057 * ib_register_event_handler - Register an IB event handler 2058 * @event_handler:Handler to register 2059 * 2060 * ib_register_event_handler() registers an event handler that will be 2061 * called back when asynchronous IB events occur (as defined in 2062 * chapter 11 of the InfiniBand Architecture Specification). This 2063 * callback occurs in workqueue context. 2064 */ 2065 void ib_register_event_handler(struct ib_event_handler *event_handler) 2066 { 2067 down_write(&event_handler->device->event_handler_rwsem); 2068 list_add_tail(&event_handler->list, 2069 &event_handler->device->event_handler_list); 2070 up_write(&event_handler->device->event_handler_rwsem); 2071 } 2072 EXPORT_SYMBOL(ib_register_event_handler); 2073 2074 /** 2075 * ib_unregister_event_handler - Unregister an event handler 2076 * @event_handler:Handler to unregister 2077 * 2078 * Unregister an event handler registered with 2079 * ib_register_event_handler(). 
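 *
 * For illustration only (the handler function and the device variable are
 * hypothetical), the usual pattern with ib_register_event_handler() is:
 *
 *	static void my_event_handler(struct ib_event_handler *handler,
 *				     struct ib_event *event)
 *	{
 *		if (event->event == IB_EVENT_PORT_ACTIVE)
 *			pr_info("port %u became active\n",
 *				event->element.port_num);
 *	}
 *
 *	struct ib_event_handler handler;
 *
 *	INIT_IB_EVENT_HANDLER(&handler, device, my_event_handler);
 *	ib_register_event_handler(&handler);
 *	...
 *	ib_unregister_event_handler(&handler);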
2080 */ 2081 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 2082 { 2083 down_write(&event_handler->device->event_handler_rwsem); 2084 list_del(&event_handler->list); 2085 up_write(&event_handler->device->event_handler_rwsem); 2086 } 2087 EXPORT_SYMBOL(ib_unregister_event_handler); 2088 2089 void ib_dispatch_event_clients(struct ib_event *event) 2090 { 2091 struct ib_event_handler *handler; 2092 2093 down_read(&event->device->event_handler_rwsem); 2094 2095 list_for_each_entry(handler, &event->device->event_handler_list, list) 2096 handler->handler(handler, event); 2097 2098 up_read(&event->device->event_handler_rwsem); 2099 } 2100 2101 static int iw_query_port(struct ib_device *device, 2102 u32 port_num, 2103 struct ib_port_attr *port_attr) 2104 { 2105 struct in_device *inetdev; 2106 struct net_device *netdev; 2107 2108 memset(port_attr, 0, sizeof(*port_attr)); 2109 2110 netdev = ib_device_get_netdev(device, port_num); 2111 if (!netdev) 2112 return -ENODEV; 2113 2114 port_attr->max_mtu = IB_MTU_4096; 2115 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2116 2117 if (!netif_carrier_ok(netdev)) { 2118 port_attr->state = IB_PORT_DOWN; 2119 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2120 } else { 2121 rcu_read_lock(); 2122 inetdev = __in_dev_get_rcu(netdev); 2123 2124 if (inetdev && inetdev->ifa_list) { 2125 port_attr->state = IB_PORT_ACTIVE; 2126 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2127 } else { 2128 port_attr->state = IB_PORT_INIT; 2129 port_attr->phys_state = 2130 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2131 } 2132 2133 rcu_read_unlock(); 2134 } 2135 2136 dev_put(netdev); 2137 return device->ops.query_port(device, port_num, port_attr); 2138 } 2139 2140 static int __ib_query_port(struct ib_device *device, 2141 u32 port_num, 2142 struct ib_port_attr *port_attr) 2143 { 2144 int err; 2145 2146 memset(port_attr, 0, sizeof(*port_attr)); 2147 2148 err = device->ops.query_port(device, port_num, port_attr); 2149 if (err || port_attr->subnet_prefix) 2150 return err; 2151 2152 if (rdma_port_get_link_layer(device, port_num) != 2153 IB_LINK_LAYER_INFINIBAND) 2154 return 0; 2155 2156 ib_get_cached_subnet_prefix(device, port_num, 2157 &port_attr->subnet_prefix); 2158 return 0; 2159 } 2160 2161 /** 2162 * ib_query_port - Query IB port attributes 2163 * @device:Device to query 2164 * @port_num:Port number to query 2165 * @port_attr:Port attributes 2166 * 2167 * ib_query_port() returns the attributes of a port through the 2168 * @port_attr pointer. 
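 *
 * A short usage sketch (error handling trimmed, the device variable is
 * assumed from the surrounding context):
 *
 *	struct ib_port_attr attr;
 *	u32 port;
 *
 *	rdma_for_each_port(device, port) {
 *		if (ib_query_port(device, port, &attr))
 *			continue;
 *		pr_info("port %u state %d active_mtu %d\n",
 *			port, attr.state, attr.active_mtu);
 *	}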
2169 */ 2170 int ib_query_port(struct ib_device *device, 2171 u32 port_num, 2172 struct ib_port_attr *port_attr) 2173 { 2174 if (!rdma_is_port_valid(device, port_num)) 2175 return -EINVAL; 2176 2177 if (rdma_protocol_iwarp(device, port_num)) 2178 return iw_query_port(device, port_num, port_attr); 2179 else 2180 return __ib_query_port(device, port_num, port_attr); 2181 } 2182 EXPORT_SYMBOL(ib_query_port); 2183 2184 static void add_ndev_hash(struct ib_port_data *pdata) 2185 { 2186 unsigned long flags; 2187 2188 might_sleep(); 2189 2190 spin_lock_irqsave(&ndev_hash_lock, flags); 2191 if (hash_hashed(&pdata->ndev_hash_link)) { 2192 hash_del_rcu(&pdata->ndev_hash_link); 2193 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2194 /* 2195 * We cannot do hash_add_rcu after a hash_del_rcu until the 2196 * grace period 2197 */ 2198 synchronize_rcu(); 2199 spin_lock_irqsave(&ndev_hash_lock, flags); 2200 } 2201 if (pdata->netdev) 2202 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2203 (uintptr_t)pdata->netdev); 2204 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2205 } 2206 2207 /** 2208 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2209 * @ib_dev: Device to modify 2210 * @ndev: net_device to affiliate, may be NULL 2211 * @port: IB port the net_device is connected to 2212 * 2213 * Drivers should use this to link the ib_device to a netdev so the netdev 2214 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2215 * affiliated with any port. 2216 * 2217 * The caller must ensure that the given ndev is not unregistered or 2218 * unregistering, and that either the ib_device is unregistered or 2219 * ib_device_set_netdev() is called with NULL when the ndev sends a 2220 * NETDEV_UNREGISTER event. 2221 */ 2222 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2223 u32 port) 2224 { 2225 enum rdma_nl_notify_event_type etype; 2226 struct net_device *old_ndev; 2227 struct ib_port_data *pdata; 2228 unsigned long flags; 2229 int ret; 2230 2231 if (!rdma_is_port_valid(ib_dev, port)) 2232 return -EINVAL; 2233 2234 /* 2235 * Drivers wish to call this before ib_register_driver, so we have to 2236 * setup the port data early. 2237 */ 2238 ret = alloc_port_data(ib_dev); 2239 if (ret) 2240 return ret; 2241 2242 pdata = &ib_dev->port_data[port]; 2243 spin_lock_irqsave(&pdata->netdev_lock, flags); 2244 old_ndev = rcu_dereference_protected( 2245 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2246 if (old_ndev == ndev) { 2247 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2248 return 0; 2249 } 2250 2251 rcu_assign_pointer(pdata->netdev, ndev); 2252 netdev_put(old_ndev, &pdata->netdev_tracker); 2253 netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); 2254 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2255 2256 add_ndev_hash(pdata); 2257 2258 /* Make sure that the device is registered before we send events */ 2259 if (xa_load(&devices, ib_dev->index) != ib_dev) 2260 return 0; 2261 2262 etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; 2263 rdma_nl_notify_event(ib_dev, port, etype); 2264 2265 return 0; 2266 } 2267 EXPORT_SYMBOL(ib_device_set_netdev); 2268 2269 static void free_netdevs(struct ib_device *ib_dev) 2270 { 2271 unsigned long flags; 2272 u32 port; 2273 2274 if (!ib_dev->port_data) 2275 return; 2276 2277 rdma_for_each_port (ib_dev, port) { 2278 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2279 struct net_device *ndev; 2280 2281 spin_lock_irqsave(&pdata->netdev_lock, flags); 2282 ndev = rcu_dereference_protected( 2283 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2284 if (ndev) { 2285 spin_lock(&ndev_hash_lock); 2286 hash_del_rcu(&pdata->ndev_hash_link); 2287 spin_unlock(&ndev_hash_lock); 2288 2289 /* 2290 * If this is the last dev_put there is still a 2291 * synchronize_rcu before the netdev is kfreed, so we 2292 * can continue to rely on unlocked pointer 2293 * comparisons after the put 2294 */ 2295 rcu_assign_pointer(pdata->netdev, NULL); 2296 netdev_put(ndev, &pdata->netdev_tracker); 2297 } 2298 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2299 } 2300 } 2301 2302 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2303 u32 port) 2304 { 2305 struct ib_port_data *pdata; 2306 struct net_device *res; 2307 2308 if (!rdma_is_port_valid(ib_dev, port)) 2309 return NULL; 2310 2311 if (!ib_dev->port_data) 2312 return NULL; 2313 2314 pdata = &ib_dev->port_data[port]; 2315 2316 /* 2317 * New drivers should use ib_device_set_netdev() not the legacy 2318 * get_netdev(). 2319 */ 2320 if (ib_dev->ops.get_netdev) 2321 res = ib_dev->ops.get_netdev(ib_dev, port); 2322 else { 2323 spin_lock(&pdata->netdev_lock); 2324 res = rcu_dereference_protected( 2325 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2326 dev_hold(res); 2327 spin_unlock(&pdata->netdev_lock); 2328 } 2329 2330 return res; 2331 } 2332 EXPORT_SYMBOL(ib_device_get_netdev); 2333 2334 /** 2335 * ib_query_netdev_port - Query the port number of a net_device 2336 * associated with an ibdev 2337 * @ibdev: IB device 2338 * @ndev: Network device 2339 * @port: IB port the net_device is connected to 2340 */ 2341 int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, 2342 u32 *port) 2343 { 2344 struct net_device *ib_ndev; 2345 u32 port_num; 2346 2347 rdma_for_each_port(ibdev, port_num) { 2348 ib_ndev = ib_device_get_netdev(ibdev, port_num); 2349 if (ndev == ib_ndev) { 2350 *port = port_num; 2351 dev_put(ib_ndev); 2352 return 0; 2353 } 2354 dev_put(ib_ndev); 2355 } 2356 2357 return -ENOENT; 2358 } 2359 EXPORT_SYMBOL(ib_query_netdev_port); 2360 2361 /** 2362 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2363 * @ndev: netdev to locate 2364 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2365 * 2366 * Find and hold an ib_device that is associated with a netdev via 2367 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2368 * returned pointer. 
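 *
 * Illustrative sketch only (the ndev variable is assumed to be held by the
 * caller):
 *
 *	struct ib_device *ibdev;
 *	u32 port;
 *
 *	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *	if (!ibdev)
 *		return -ENODEV;
 *	if (!ib_query_netdev_port(ibdev, ndev, &port))
 *		pr_info("%s is port %u of %s\n", ndev->name, port,
 *			dev_name(&ibdev->dev));
 *	ib_device_put(ibdev);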
2369 */ 2370 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2371 enum rdma_driver_id driver_id) 2372 { 2373 struct ib_device *res = NULL; 2374 struct ib_port_data *cur; 2375 2376 rcu_read_lock(); 2377 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2378 (uintptr_t)ndev) { 2379 if (rcu_access_pointer(cur->netdev) == ndev && 2380 (driver_id == RDMA_DRIVER_UNKNOWN || 2381 cur->ib_dev->ops.driver_id == driver_id) && 2382 ib_device_try_get(cur->ib_dev)) { 2383 res = cur->ib_dev; 2384 break; 2385 } 2386 } 2387 rcu_read_unlock(); 2388 2389 return res; 2390 } 2391 EXPORT_SYMBOL(ib_device_get_by_netdev); 2392 2393 /** 2394 * ib_enum_roce_netdev - enumerate all RoCE ports 2395 * @ib_dev : IB device we want to query 2396 * @filter: Should we call the callback? 2397 * @filter_cookie: Cookie passed to filter 2398 * @cb: Callback to call for each found RoCE ports 2399 * @cookie: Cookie passed back to the callback 2400 * 2401 * Enumerates all of the physical RoCE ports of ib_dev 2402 * which are related to netdevice and calls callback() on each 2403 * device for which filter() function returns non zero. 2404 */ 2405 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2406 roce_netdev_filter filter, 2407 void *filter_cookie, 2408 roce_netdev_callback cb, 2409 void *cookie) 2410 { 2411 u32 port; 2412 2413 rdma_for_each_port (ib_dev, port) 2414 if (rdma_protocol_roce(ib_dev, port)) { 2415 struct net_device *idev = 2416 ib_device_get_netdev(ib_dev, port); 2417 2418 if (filter(ib_dev, port, idev, filter_cookie)) 2419 cb(ib_dev, port, idev, cookie); 2420 dev_put(idev); 2421 } 2422 } 2423 2424 /** 2425 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2426 * @filter: Should we call the callback? 2427 * @filter_cookie: Cookie passed to filter 2428 * @cb: Callback to call for each found RoCE ports 2429 * @cookie: Cookie passed back to the callback 2430 * 2431 * Enumerates all RoCE devices' physical ports which are related 2432 * to netdevices and calls callback() on each device for which 2433 * filter() function returns non zero. 2434 */ 2435 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2436 void *filter_cookie, 2437 roce_netdev_callback cb, 2438 void *cookie) 2439 { 2440 struct ib_device *dev; 2441 unsigned long index; 2442 2443 down_read(&devices_rwsem); 2444 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2445 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2446 up_read(&devices_rwsem); 2447 } 2448 2449 /* 2450 * ib_enum_all_devs - enumerate all ib_devices 2451 * @cb: Callback to call for each found ib_device 2452 * 2453 * Enumerates all ib_devices and calls callback() on each device. 2454 */ 2455 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2456 struct netlink_callback *cb) 2457 { 2458 unsigned long index; 2459 struct ib_device *dev; 2460 unsigned int idx = 0; 2461 int ret = 0; 2462 2463 down_read(&devices_rwsem); 2464 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2465 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2466 continue; 2467 2468 ret = nldev_cb(dev, skb, cb, idx); 2469 if (ret) 2470 break; 2471 idx++; 2472 } 2473 up_read(&devices_rwsem); 2474 return ret; 2475 } 2476 2477 /** 2478 * ib_query_pkey - Get P_Key table entry 2479 * @device:Device to query 2480 * @port_num:Port number to query 2481 * @index:P_Key table index to query 2482 * @pkey:Returned P_Key 2483 * 2484 * ib_query_pkey() fetches the specified P_Key table entry. 
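 *
 * For example (illustrative only, device and port_num assumed from the
 * surrounding context), fetching the first P_Key table entry of a port:
 *
 *	u16 pkey;
 *	int ret;
 *
 *	ret = ib_query_pkey(device, port_num, 0, &pkey);
 *	if (!ret)
 *		pr_info("pkey[0] = 0x%04x\n", pkey);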
2485 */ 2486 int ib_query_pkey(struct ib_device *device, 2487 u32 port_num, u16 index, u16 *pkey) 2488 { 2489 if (!rdma_is_port_valid(device, port_num)) 2490 return -EINVAL; 2491 2492 if (!device->ops.query_pkey) 2493 return -EOPNOTSUPP; 2494 2495 return device->ops.query_pkey(device, port_num, index, pkey); 2496 } 2497 EXPORT_SYMBOL(ib_query_pkey); 2498 2499 /** 2500 * ib_modify_device - Change IB device attributes 2501 * @device:Device to modify 2502 * @device_modify_mask:Mask of attributes to change 2503 * @device_modify:New attribute values 2504 * 2505 * ib_modify_device() changes a device's attributes as specified by 2506 * the @device_modify_mask and @device_modify structure. 2507 */ 2508 int ib_modify_device(struct ib_device *device, 2509 int device_modify_mask, 2510 struct ib_device_modify *device_modify) 2511 { 2512 if (!device->ops.modify_device) 2513 return -EOPNOTSUPP; 2514 2515 return device->ops.modify_device(device, device_modify_mask, 2516 device_modify); 2517 } 2518 EXPORT_SYMBOL(ib_modify_device); 2519 2520 /** 2521 * ib_modify_port - Modifies the attributes for the specified port. 2522 * @device: The device to modify. 2523 * @port_num: The number of the port to modify. 2524 * @port_modify_mask: Mask used to specify which attributes of the port 2525 * to change. 2526 * @port_modify: New attribute values for the port. 2527 * 2528 * ib_modify_port() changes a port's attributes as specified by the 2529 * @port_modify_mask and @port_modify structure. 2530 */ 2531 int ib_modify_port(struct ib_device *device, 2532 u32 port_num, int port_modify_mask, 2533 struct ib_port_modify *port_modify) 2534 { 2535 int rc; 2536 2537 if (!rdma_is_port_valid(device, port_num)) 2538 return -EINVAL; 2539 2540 if (device->ops.modify_port) 2541 rc = device->ops.modify_port(device, port_num, 2542 port_modify_mask, 2543 port_modify); 2544 else if (rdma_protocol_roce(device, port_num) && 2545 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2546 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2547 rc = 0; 2548 else 2549 rc = -EOPNOTSUPP; 2550 return rc; 2551 } 2552 EXPORT_SYMBOL(ib_modify_port); 2553 2554 /** 2555 * ib_find_gid - Returns the port number and GID table index where 2556 * a specified GID value occurs. Its searches only for IB link layer. 2557 * @device: The device to query. 2558 * @gid: The GID value to search for. 2559 * @port_num: The port number of the device where the GID value was found. 2560 * @index: The index into the GID table where the GID was found. This 2561 * parameter may be NULL. 2562 */ 2563 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2564 u32 *port_num, u16 *index) 2565 { 2566 union ib_gid tmp_gid; 2567 u32 port; 2568 int ret, i; 2569 2570 rdma_for_each_port (device, port) { 2571 if (!rdma_protocol_ib(device, port)) 2572 continue; 2573 2574 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2575 ++i) { 2576 ret = rdma_query_gid(device, port, i, &tmp_gid); 2577 if (ret) 2578 continue; 2579 2580 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2581 *port_num = port; 2582 if (index) 2583 *index = i; 2584 return 0; 2585 } 2586 } 2587 } 2588 2589 return -ENOENT; 2590 } 2591 EXPORT_SYMBOL(ib_find_gid); 2592 2593 /** 2594 * ib_find_pkey - Returns the PKey table index where a specified 2595 * PKey value occurs. 2596 * @device: The device to query. 2597 * @port_num: The port number of the device to search for the PKey. 2598 * @pkey: The PKey value to search for. 2599 * @index: The index into the PKey table where the PKey was found. 
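 *
 * A short illustrative sketch, looking up the full-membership default
 * partition key (0xffff):
 *
 *	u16 index;
 *
 *	if (!ib_find_pkey(device, port_num, 0xffff, &index))
 *		pr_info("default pkey found at index %u\n", index);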
2600 */ 2601 int ib_find_pkey(struct ib_device *device, 2602 u32 port_num, u16 pkey, u16 *index) 2603 { 2604 int ret, i; 2605 u16 tmp_pkey; 2606 int partial_ix = -1; 2607 2608 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2609 ++i) { 2610 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2611 if (ret) 2612 return ret; 2613 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2614 /* if there is full-member pkey take it.*/ 2615 if (tmp_pkey & 0x8000) { 2616 *index = i; 2617 return 0; 2618 } 2619 if (partial_ix < 0) 2620 partial_ix = i; 2621 } 2622 } 2623 2624 /*no full-member, if exists take the limited*/ 2625 if (partial_ix >= 0) { 2626 *index = partial_ix; 2627 return 0; 2628 } 2629 return -ENOENT; 2630 } 2631 EXPORT_SYMBOL(ib_find_pkey); 2632 2633 /** 2634 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2635 * for a received CM request 2636 * @dev: An RDMA device on which the request has been received. 2637 * @port: Port number on the RDMA device. 2638 * @pkey: The Pkey the request came on. 2639 * @gid: A GID that the net_dev uses to communicate. 2640 * @addr: Contains the IP address that the request specified as its 2641 * destination. 2642 * 2643 */ 2644 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2645 u32 port, 2646 u16 pkey, 2647 const union ib_gid *gid, 2648 const struct sockaddr *addr) 2649 { 2650 struct net_device *net_dev = NULL; 2651 unsigned long index; 2652 void *client_data; 2653 2654 if (!rdma_protocol_ib(dev, port)) 2655 return NULL; 2656 2657 /* 2658 * Holding the read side guarantees that the client will not become 2659 * unregistered while we are calling get_net_dev_by_params() 2660 */ 2661 down_read(&dev->client_data_rwsem); 2662 xan_for_each_marked (&dev->client_data, index, client_data, 2663 CLIENT_DATA_REGISTERED) { 2664 struct ib_client *client = xa_load(&clients, index); 2665 2666 if (!client || !client->get_net_dev_by_params) 2667 continue; 2668 2669 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2670 addr, client_data); 2671 if (net_dev) 2672 break; 2673 } 2674 up_read(&dev->client_data_rwsem); 2675 2676 return net_dev; 2677 } 2678 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2679 2680 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2681 { 2682 struct ib_device_ops *dev_ops = &dev->ops; 2683 #define SET_DEVICE_OP(ptr, name) \ 2684 do { \ 2685 if (ops->name) \ 2686 if (!((ptr)->name)) \ 2687 (ptr)->name = ops->name; \ 2688 } while (0) 2689 2690 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2691 2692 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2693 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2694 dev_ops->driver_id != ops->driver_id); 2695 dev_ops->driver_id = ops->driver_id; 2696 } 2697 if (ops->owner) { 2698 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2699 dev_ops->owner = ops->owner; 2700 } 2701 if (ops->uverbs_abi_ver) 2702 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2703 2704 dev_ops->uverbs_no_driver_id_binding |= 2705 ops->uverbs_no_driver_id_binding; 2706 2707 SET_DEVICE_OP(dev_ops, add_gid); 2708 SET_DEVICE_OP(dev_ops, add_sub_dev); 2709 SET_DEVICE_OP(dev_ops, advise_mr); 2710 SET_DEVICE_OP(dev_ops, alloc_dm); 2711 SET_DEVICE_OP(dev_ops, alloc_dmah); 2712 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2713 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2714 SET_DEVICE_OP(dev_ops, alloc_mr); 2715 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2716 SET_DEVICE_OP(dev_ops, alloc_mw); 2717 SET_DEVICE_OP(dev_ops, alloc_pd); 2718 
SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2719 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2720 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2721 SET_DEVICE_OP(dev_ops, attach_mcast); 2722 SET_DEVICE_OP(dev_ops, check_mr_status); 2723 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2724 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2725 SET_DEVICE_OP(dev_ops, counter_dealloc); 2726 SET_DEVICE_OP(dev_ops, counter_init); 2727 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2728 SET_DEVICE_OP(dev_ops, counter_update_stats); 2729 SET_DEVICE_OP(dev_ops, create_ah); 2730 SET_DEVICE_OP(dev_ops, create_counters); 2731 SET_DEVICE_OP(dev_ops, create_cq); 2732 SET_DEVICE_OP(dev_ops, create_cq_umem); 2733 SET_DEVICE_OP(dev_ops, create_flow); 2734 SET_DEVICE_OP(dev_ops, create_qp); 2735 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2736 SET_DEVICE_OP(dev_ops, create_srq); 2737 SET_DEVICE_OP(dev_ops, create_user_ah); 2738 SET_DEVICE_OP(dev_ops, create_wq); 2739 SET_DEVICE_OP(dev_ops, dealloc_dm); 2740 SET_DEVICE_OP(dev_ops, dealloc_dmah); 2741 SET_DEVICE_OP(dev_ops, dealloc_driver); 2742 SET_DEVICE_OP(dev_ops, dealloc_mw); 2743 SET_DEVICE_OP(dev_ops, dealloc_pd); 2744 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2745 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2746 SET_DEVICE_OP(dev_ops, del_gid); 2747 SET_DEVICE_OP(dev_ops, del_sub_dev); 2748 SET_DEVICE_OP(dev_ops, dereg_mr); 2749 SET_DEVICE_OP(dev_ops, destroy_ah); 2750 SET_DEVICE_OP(dev_ops, destroy_counters); 2751 SET_DEVICE_OP(dev_ops, destroy_cq); 2752 SET_DEVICE_OP(dev_ops, destroy_flow); 2753 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2754 SET_DEVICE_OP(dev_ops, destroy_qp); 2755 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2756 SET_DEVICE_OP(dev_ops, destroy_srq); 2757 SET_DEVICE_OP(dev_ops, destroy_wq); 2758 SET_DEVICE_OP(dev_ops, device_group); 2759 SET_DEVICE_OP(dev_ops, detach_mcast); 2760 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2761 SET_DEVICE_OP(dev_ops, drain_rq); 2762 SET_DEVICE_OP(dev_ops, drain_sq); 2763 SET_DEVICE_OP(dev_ops, enable_driver); 2764 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2765 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2766 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2767 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2768 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2769 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2770 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2771 SET_DEVICE_OP(dev_ops, fill_res_srq_entry); 2772 SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); 2773 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2774 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2775 SET_DEVICE_OP(dev_ops, get_dma_mr); 2776 SET_DEVICE_OP(dev_ops, get_hw_stats); 2777 SET_DEVICE_OP(dev_ops, get_link_layer); 2778 SET_DEVICE_OP(dev_ops, get_netdev); 2779 SET_DEVICE_OP(dev_ops, get_numa_node); 2780 SET_DEVICE_OP(dev_ops, get_port_immutable); 2781 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2782 SET_DEVICE_OP(dev_ops, get_vf_config); 2783 SET_DEVICE_OP(dev_ops, get_vf_guid); 2784 SET_DEVICE_OP(dev_ops, get_vf_stats); 2785 SET_DEVICE_OP(dev_ops, iw_accept); 2786 SET_DEVICE_OP(dev_ops, iw_add_ref); 2787 SET_DEVICE_OP(dev_ops, iw_connect); 2788 SET_DEVICE_OP(dev_ops, iw_create_listen); 2789 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2790 SET_DEVICE_OP(dev_ops, iw_get_qp); 2791 SET_DEVICE_OP(dev_ops, iw_reject); 2792 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2793 SET_DEVICE_OP(dev_ops, map_mr_sg); 2794 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2795 SET_DEVICE_OP(dev_ops, mmap); 2796 SET_DEVICE_OP(dev_ops, mmap_free); 2797 SET_DEVICE_OP(dev_ops, modify_ah); 2798 
SET_DEVICE_OP(dev_ops, modify_cq); 2799 SET_DEVICE_OP(dev_ops, modify_device); 2800 SET_DEVICE_OP(dev_ops, modify_hw_stat); 2801 SET_DEVICE_OP(dev_ops, modify_port); 2802 SET_DEVICE_OP(dev_ops, modify_qp); 2803 SET_DEVICE_OP(dev_ops, modify_srq); 2804 SET_DEVICE_OP(dev_ops, modify_wq); 2805 SET_DEVICE_OP(dev_ops, peek_cq); 2806 SET_DEVICE_OP(dev_ops, pre_destroy_cq); 2807 SET_DEVICE_OP(dev_ops, poll_cq); 2808 SET_DEVICE_OP(dev_ops, port_groups); 2809 SET_DEVICE_OP(dev_ops, post_destroy_cq); 2810 SET_DEVICE_OP(dev_ops, post_recv); 2811 SET_DEVICE_OP(dev_ops, post_send); 2812 SET_DEVICE_OP(dev_ops, post_srq_recv); 2813 SET_DEVICE_OP(dev_ops, process_mad); 2814 SET_DEVICE_OP(dev_ops, query_ah); 2815 SET_DEVICE_OP(dev_ops, query_device); 2816 SET_DEVICE_OP(dev_ops, query_gid); 2817 SET_DEVICE_OP(dev_ops, query_pkey); 2818 SET_DEVICE_OP(dev_ops, query_port); 2819 SET_DEVICE_OP(dev_ops, query_qp); 2820 SET_DEVICE_OP(dev_ops, query_srq); 2821 SET_DEVICE_OP(dev_ops, query_ucontext); 2822 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2823 SET_DEVICE_OP(dev_ops, read_counters); 2824 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2825 SET_DEVICE_OP(dev_ops, reg_user_mr); 2826 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2827 SET_DEVICE_OP(dev_ops, req_notify_cq); 2828 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2829 SET_DEVICE_OP(dev_ops, resize_cq); 2830 SET_DEVICE_OP(dev_ops, set_vf_guid); 2831 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2832 SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); 2833 SET_DEVICE_OP(dev_ops, report_port_event); 2834 2835 SET_OBJ_SIZE(dev_ops, ib_ah); 2836 SET_OBJ_SIZE(dev_ops, ib_counters); 2837 SET_OBJ_SIZE(dev_ops, ib_cq); 2838 SET_OBJ_SIZE(dev_ops, ib_dmah); 2839 SET_OBJ_SIZE(dev_ops, ib_mw); 2840 SET_OBJ_SIZE(dev_ops, ib_pd); 2841 SET_OBJ_SIZE(dev_ops, ib_qp); 2842 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2843 SET_OBJ_SIZE(dev_ops, ib_srq); 2844 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2845 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2846 SET_OBJ_SIZE(dev_ops, rdma_counter); 2847 } 2848 EXPORT_SYMBOL(ib_set_device_ops); 2849 2850 int ib_add_sub_device(struct ib_device *parent, 2851 enum rdma_nl_dev_type type, 2852 const char *name) 2853 { 2854 struct ib_device *sub; 2855 int ret = 0; 2856 2857 if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) 2858 return -EOPNOTSUPP; 2859 2860 if (!ib_device_try_get(parent)) 2861 return -EINVAL; 2862 2863 sub = parent->ops.add_sub_dev(parent, type, name); 2864 if (IS_ERR(sub)) { 2865 ib_device_put(parent); 2866 return PTR_ERR(sub); 2867 } 2868 2869 sub->type = type; 2870 sub->parent = parent; 2871 2872 mutex_lock(&parent->subdev_lock); 2873 list_add_tail(&parent->subdev_list_head, &sub->subdev_list); 2874 mutex_unlock(&parent->subdev_lock); 2875 2876 return ret; 2877 } 2878 EXPORT_SYMBOL(ib_add_sub_device); 2879 2880 int ib_del_sub_device_and_put(struct ib_device *sub) 2881 { 2882 struct ib_device *parent = sub->parent; 2883 2884 if (!parent) 2885 return -EOPNOTSUPP; 2886 2887 mutex_lock(&parent->subdev_lock); 2888 list_del(&sub->subdev_list); 2889 mutex_unlock(&parent->subdev_lock); 2890 2891 ib_device_put(sub); 2892 parent->ops.del_sub_dev(sub); 2893 ib_device_put(parent); 2894 2895 return 0; 2896 } 2897 EXPORT_SYMBOL(ib_del_sub_device_and_put); 2898 2899 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2900 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2901 { 2902 struct scatterlist *s; 2903 int i; 2904 2905 for_each_sg(sg, s, nents, i) { 2906 sg_dma_address(s) = (uintptr_t)sg_virt(s); 2907 sg_dma_len(s) = s->length; 2908 } 2909 return nents; 
2910 } 2911 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2912 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2913 2914 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2915 [RDMA_NL_LS_OP_RESOLVE] = { 2916 .doit = ib_nl_handle_resolve_resp, 2917 .flags = RDMA_NL_ADMIN_PERM, 2918 }, 2919 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2920 .doit = ib_nl_handle_set_timeout, 2921 .flags = RDMA_NL_ADMIN_PERM, 2922 }, 2923 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2924 .doit = ib_nl_handle_ip_res_resp, 2925 .flags = RDMA_NL_ADMIN_PERM, 2926 }, 2927 }; 2928 2929 void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) 2930 { 2931 enum ib_port_state curr_state; 2932 struct ib_event ibevent = {}; 2933 u32 port; 2934 2935 if (ib_query_netdev_port(ibdev, ndev, &port)) 2936 return; 2937 2938 curr_state = ib_get_curr_port_state(ndev); 2939 2940 write_lock_irq(&ibdev->cache_lock); 2941 if (ibdev->port_data[port].cache.last_port_state == curr_state) { 2942 write_unlock_irq(&ibdev->cache_lock); 2943 return; 2944 } 2945 ibdev->port_data[port].cache.last_port_state = curr_state; 2946 write_unlock_irq(&ibdev->cache_lock); 2947 2948 ibevent.event = (curr_state == IB_PORT_DOWN) ? 2949 IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; 2950 ibevent.device = ibdev; 2951 ibevent.element.port_num = port; 2952 ib_dispatch_event(&ibevent); 2953 } 2954 EXPORT_SYMBOL(ib_dispatch_port_state_event); 2955 2956 static void handle_port_event(struct net_device *ndev, unsigned long event) 2957 { 2958 struct ib_device *ibdev; 2959 2960 /* Currently, link events in bonding scenarios are still 2961 * reported by drivers that support bonding. 2962 */ 2963 if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) 2964 return; 2965 2966 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2967 if (!ibdev) 2968 return; 2969 2970 if (ibdev->ops.report_port_event) { 2971 ibdev->ops.report_port_event(ibdev, ndev, event); 2972 goto put_ibdev; 2973 } 2974 2975 ib_dispatch_port_state_event(ibdev, ndev); 2976 2977 put_ibdev: 2978 ib_device_put(ibdev); 2979 }; 2980 2981 static int ib_netdevice_event(struct notifier_block *this, 2982 unsigned long event, void *ptr) 2983 { 2984 struct net_device *ndev = netdev_notifier_info_to_dev(ptr); 2985 struct ib_device *ibdev; 2986 u32 port; 2987 2988 switch (event) { 2989 case NETDEV_CHANGENAME: 2990 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2991 if (!ibdev) 2992 return NOTIFY_DONE; 2993 2994 if (ib_query_netdev_port(ibdev, ndev, &port)) { 2995 ib_device_put(ibdev); 2996 break; 2997 } 2998 2999 rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); 3000 ib_device_put(ibdev); 3001 break; 3002 3003 case NETDEV_UP: 3004 case NETDEV_CHANGE: 3005 case NETDEV_DOWN: 3006 handle_port_event(ndev, event); 3007 break; 3008 3009 default: 3010 break; 3011 } 3012 3013 return NOTIFY_DONE; 3014 } 3015 3016 static struct notifier_block nb_netdevice = { 3017 .notifier_call = ib_netdevice_event, 3018 }; 3019 3020 static int __init ib_core_init(void) 3021 { 3022 int ret = -ENOMEM; 3023 3024 ib_wq = alloc_workqueue("infiniband", 0, 0); 3025 if (!ib_wq) 3026 return -ENOMEM; 3027 3028 ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, 3029 WQ_UNBOUND_MAX_ACTIVE); 3030 if (!ib_unreg_wq) 3031 goto err; 3032 3033 ib_comp_wq = alloc_workqueue("ib-comp-wq", 3034 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 3035 if (!ib_comp_wq) 3036 goto err_unbound; 3037 3038 ib_comp_unbound_wq = 3039 alloc_workqueue("ib-comp-unb-wq", 3040 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 3041 WQ_SYSFS, 
WQ_UNBOUND_MAX_ACTIVE); 3042 if (!ib_comp_unbound_wq) 3043 goto err_comp; 3044 3045 ret = class_register(&ib_class); 3046 if (ret) { 3047 pr_warn("Couldn't create InfiniBand device class\n"); 3048 goto err_comp_unbound; 3049 } 3050 3051 rdma_nl_init(); 3052 3053 ret = addr_init(); 3054 if (ret) { 3055 pr_warn("Couldn't init IB address resolution\n"); 3056 goto err_ibnl; 3057 } 3058 3059 ret = ib_mad_init(); 3060 if (ret) { 3061 pr_warn("Couldn't init IB MAD\n"); 3062 goto err_addr; 3063 } 3064 3065 ret = ib_sa_init(); 3066 if (ret) { 3067 pr_warn("Couldn't init SA\n"); 3068 goto err_mad; 3069 } 3070 3071 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 3072 if (ret) { 3073 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 3074 goto err_sa; 3075 } 3076 3077 ret = register_pernet_device(&rdma_dev_net_ops); 3078 if (ret) { 3079 pr_warn("Couldn't init compat dev. ret %d\n", ret); 3080 goto err_compat; 3081 } 3082 3083 nldev_init(); 3084 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 3085 ret = roce_gid_mgmt_init(); 3086 if (ret) { 3087 pr_warn("Couldn't init RoCE GID management\n"); 3088 goto err_parent; 3089 } 3090 3091 register_netdevice_notifier(&nb_netdevice); 3092 3093 return 0; 3094 3095 err_parent: 3096 rdma_nl_unregister(RDMA_NL_LS); 3097 nldev_exit(); 3098 unregister_pernet_device(&rdma_dev_net_ops); 3099 err_compat: 3100 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3101 err_sa: 3102 ib_sa_cleanup(); 3103 err_mad: 3104 ib_mad_cleanup(); 3105 err_addr: 3106 addr_cleanup(); 3107 err_ibnl: 3108 class_unregister(&ib_class); 3109 err_comp_unbound: 3110 destroy_workqueue(ib_comp_unbound_wq); 3111 err_comp: 3112 destroy_workqueue(ib_comp_wq); 3113 err_unbound: 3114 destroy_workqueue(ib_unreg_wq); 3115 err: 3116 destroy_workqueue(ib_wq); 3117 return ret; 3118 } 3119 3120 static void __exit ib_core_cleanup(void) 3121 { 3122 unregister_netdevice_notifier(&nb_netdevice); 3123 roce_gid_mgmt_cleanup(); 3124 rdma_nl_unregister(RDMA_NL_LS); 3125 nldev_exit(); 3126 unregister_pernet_device(&rdma_dev_net_ops); 3127 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3128 ib_sa_cleanup(); 3129 ib_mad_cleanup(); 3130 addr_cleanup(); 3131 rdma_nl_exit(); 3132 class_unregister(&ib_class); 3133 destroy_workqueue(ib_comp_unbound_wq); 3134 destroy_workqueue(ib_comp_wq); 3135 /* Make sure that any pending umem accounting work is done. */ 3136 destroy_workqueue(ib_wq); 3137 destroy_workqueue(ib_unreg_wq); 3138 WARN_ON(!xa_empty(&clients)); 3139 WARN_ON(!xa_empty(&devices)); 3140 } 3141 3142 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 3143 3144 /* ib core relies on netdev stack to first register net_ns_type_operations 3145 * ns kobject type before ib_core initialization. 3146 */ 3147 fs_initcall(ib_core_init); 3148 module_exit(ib_core_cleanup); 3149
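
/*
 * Illustrative sketch only (the driver name and callbacks below are
 * hypothetical, not part of ib_core): a provider typically declares a static
 * const ib_device_ops table and hands it to ib_set_device_ops() before
 * registering the device, e.g.:
 *
 *	static const struct ib_device_ops my_dev_ops = {
 *		.owner		= THIS_MODULE,
 *		.query_device	= my_query_device,
 *		.query_port	= my_query_port,
 *		.alloc_pd	= my_alloc_pd,
 *		.dealloc_pd	= my_dealloc_pd,
 *
 *		INIT_RDMA_OBJ_SIZE(ib_pd, my_pd, ibpd),
 *	};
 *
 *	ib_set_device_ops(ibdev, &my_dev_ops);
 *	ret = ib_register_device(ibdev, "my%d", dma_device);
 *
 * Only the callbacks a driver actually implements need to be filled in;
 * ib_set_device_ops() copies each non-NULL member that has not already been
 * set on the device.
 */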