1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <linux/cc_platform.h> 46 #include <rdma/rdma_netlink.h> 47 #include <rdma/ib_addr.h> 48 #include <rdma/ib_cache.h> 49 #include <rdma/rdma_counter.h> 50 51 #include "core_priv.h" 52 #include "restrack.h" 53 54 MODULE_AUTHOR("Roland Dreier"); 55 MODULE_DESCRIPTION("core kernel InfiniBand API"); 56 MODULE_LICENSE("Dual BSD/GPL"); 57 58 struct workqueue_struct *ib_comp_wq; 59 struct workqueue_struct *ib_comp_unbound_wq; 60 struct workqueue_struct *ib_wq; 61 EXPORT_SYMBOL_GPL(ib_wq); 62 static struct workqueue_struct *ib_unreg_wq; 63 64 /* 65 * Each of the three rwsem locks (devices, clients, client_data) protects the 66 * xarray of the same name. Specifically it allows the caller to assert that 67 * the MARK will/will not be changing under the lock, and for devices and 68 * clients, that the value in the xarray is still a valid pointer. Change of 69 * the MARK is linked to the object state, so holding the lock and testing the 70 * MARK also asserts that the contained object is in a certain state. 71 * 72 * This is used to build a two stage register/unregister flow where objects 73 * can continue to be in the xarray even though they are still in progress to 74 * register/unregister. 75 * 76 * The xarray itself provides additional locking, and restartable iteration, 77 * which is also relied on. 78 * 79 * Locks should not be nested, with the exception of client_data, which is 80 * allowed to nest under the read side of the other two locks. 81 * 82 * The devices_rwsem also protects the device name list, any change or 83 * assignment of device name must also hold the write side to guarantee unique 84 * names. 85 */ 86 87 /* 88 * devices contains devices that have had their names assigned. The 89 * devices may not be registered. Users that care about the registration 90 * status need to call ib_device_try_get() on the device to ensure it is 91 * registered, and keep it registered, for the required duration. 92 * 93 */ 94 static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); 95 static DECLARE_RWSEM(devices_rwsem); 96 #define DEVICE_REGISTERED XA_MARK_1 97 #define DEVICE_GID_UPDATES XA_MARK_2 98 99 static u32 highest_client_id; 100 #define CLIENT_REGISTERED XA_MARK_1 101 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); 102 static DECLARE_RWSEM(clients_rwsem); 103 104 static void ib_client_put(struct ib_client *client) 105 { 106 if (refcount_dec_and_test(&client->uses)) 107 complete(&client->uses_zero); 108 } 109 110 /* 111 * If client_data is registered then the corresponding client must also still 112 * be registered. 113 */ 114 #define CLIENT_DATA_REGISTERED XA_MARK_1 115 116 unsigned int rdma_dev_net_id; 117 118 /* 119 * A list of net namespaces is maintained in an xarray. This is necessary 120 * because we can't get the locking right using the existing net ns list. We 121 * would require a init_net callback after the list is updated. 122 */ 123 static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); 124 /* 125 * rwsem to protect accessing the rdma_nets xarray entries. 126 */ 127 static DECLARE_RWSEM(rdma_nets_rwsem); 128 129 bool ib_devices_shared_netns = true; 130 module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); 131 MODULE_PARM_DESC(netns_mode, 132 "Share device among net namespaces; default=1 (shared)"); 133 /** 134 * rdma_dev_access_netns() - Return whether an rdma device can be accessed 135 * from a specified net namespace or not. 136 * @dev: Pointer to rdma device which needs to be checked 137 * @net: Pointer to net namesapce for which access to be checked 138 * 139 * When the rdma device is in shared mode, it ignores the net namespace. 140 * When the rdma device is exclusive to a net namespace, rdma device net 141 * namespace is checked against the specified one. 142 */ 143 bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) 144 { 145 return (ib_devices_shared_netns || 146 net_eq(read_pnet(&dev->coredev.rdma_net), net)); 147 } 148 EXPORT_SYMBOL(rdma_dev_access_netns); 149 150 /** 151 * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has 152 * CAP_NET_RAW capability or not. 153 * 154 * @dev: Pointer to rdma device whose capability to be checked 155 * 156 * Returns true if a rdma device's owning user namespace has CAP_NET_RAW 157 * capability, otherwise false. When rdma subsystem is in legacy shared network, 158 * namespace mode, the default net namespace is considered. 159 */ 160 bool rdma_dev_has_raw_cap(const struct ib_device *dev) 161 { 162 const struct net *net; 163 164 /* Network namespace is the resource whose user namespace 165 * to be considered. When in shared mode, there is no reliable 166 * network namespace resource, so consider the default net namespace. 167 */ 168 if (ib_devices_shared_netns) 169 net = &init_net; 170 else 171 net = read_pnet(&dev->coredev.rdma_net); 172 173 return ns_capable(net->user_ns, CAP_NET_RAW); 174 } 175 EXPORT_SYMBOL(rdma_dev_has_raw_cap); 176 177 /* 178 * xarray has this behavior where it won't iterate over NULL values stored in 179 * allocated arrays. So we need our own iterator to see all values stored in 180 * the array. This does the same thing as xa_for_each except that it also 181 * returns NULL valued entries if the array is allocating. Simplified to only 182 * work on simple xarrays. 183 */ 184 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 185 xa_mark_t filter) 186 { 187 XA_STATE(xas, xa, *indexp); 188 void *entry; 189 190 rcu_read_lock(); 191 do { 192 entry = xas_find_marked(&xas, ULONG_MAX, filter); 193 if (xa_is_zero(entry)) 194 break; 195 } while (xas_retry(&xas, entry)); 196 rcu_read_unlock(); 197 198 if (entry) { 199 *indexp = xas.xa_index; 200 if (xa_is_zero(entry)) 201 return NULL; 202 return entry; 203 } 204 return XA_ERROR(-ENOENT); 205 } 206 #define xan_for_each_marked(xa, index, entry, filter) \ 207 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 208 !xa_is_err(entry); \ 209 (index)++, entry = xan_find_marked(xa, &(index), filter)) 210 211 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 212 static DEFINE_SPINLOCK(ndev_hash_lock); 213 static DECLARE_HASHTABLE(ndev_hash, 5); 214 215 static void free_netdevs(struct ib_device *ib_dev); 216 static void ib_unregister_work(struct work_struct *work); 217 static void __ib_unregister_device(struct ib_device *device); 218 static int ib_security_change(struct notifier_block *nb, unsigned long event, 219 void *lsm_data); 220 static void ib_policy_change_task(struct work_struct *work); 221 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 222 223 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 224 struct va_format *vaf) 225 { 226 if (ibdev && ibdev->dev.parent) 227 dev_printk_emit(level[1] - '0', 228 ibdev->dev.parent, 229 "%s %s %s: %pV", 230 dev_driver_string(ibdev->dev.parent), 231 dev_name(ibdev->dev.parent), 232 dev_name(&ibdev->dev), 233 vaf); 234 else if (ibdev) 235 printk("%s%s: %pV", 236 level, dev_name(&ibdev->dev), vaf); 237 else 238 printk("%s(NULL ib_device): %pV", level, vaf); 239 } 240 241 #define define_ibdev_printk_level(func, level) \ 242 void func(const struct ib_device *ibdev, const char *fmt, ...) \ 243 { \ 244 struct va_format vaf; \ 245 va_list args; \ 246 \ 247 va_start(args, fmt); \ 248 \ 249 vaf.fmt = fmt; \ 250 vaf.va = &args; \ 251 \ 252 __ibdev_printk(level, ibdev, &vaf); \ 253 \ 254 va_end(args); \ 255 } \ 256 EXPORT_SYMBOL(func); 257 258 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 259 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 260 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 261 define_ibdev_printk_level(ibdev_err, KERN_ERR); 262 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 263 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 264 define_ibdev_printk_level(ibdev_info, KERN_INFO); 265 266 static struct notifier_block ibdev_lsm_nb = { 267 .notifier_call = ib_security_change, 268 }; 269 270 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 271 struct net *net); 272 273 /* Pointer to the RCU head at the start of the ib_port_data array */ 274 struct ib_port_data_rcu { 275 struct rcu_head rcu_head; 276 struct ib_port_data pdata[]; 277 }; 278 279 static void ib_device_check_mandatory(struct ib_device *device) 280 { 281 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 282 static const struct { 283 size_t offset; 284 char *name; 285 } mandatory_table[] = { 286 IB_MANDATORY_FUNC(query_device), 287 IB_MANDATORY_FUNC(query_port), 288 IB_MANDATORY_FUNC(alloc_pd), 289 IB_MANDATORY_FUNC(dealloc_pd), 290 IB_MANDATORY_FUNC(create_qp), 291 IB_MANDATORY_FUNC(modify_qp), 292 IB_MANDATORY_FUNC(destroy_qp), 293 IB_MANDATORY_FUNC(post_send), 294 IB_MANDATORY_FUNC(post_recv), 295 IB_MANDATORY_FUNC(create_cq), 296 IB_MANDATORY_FUNC(destroy_cq), 297 IB_MANDATORY_FUNC(poll_cq), 298 IB_MANDATORY_FUNC(req_notify_cq), 299 IB_MANDATORY_FUNC(get_dma_mr), 300 IB_MANDATORY_FUNC(reg_user_mr), 301 IB_MANDATORY_FUNC(dereg_mr), 302 IB_MANDATORY_FUNC(get_port_immutable) 303 }; 304 int i; 305 306 device->kverbs_provider = true; 307 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 308 if (!*(void **) ((void *) &device->ops + 309 mandatory_table[i].offset)) { 310 device->kverbs_provider = false; 311 break; 312 } 313 } 314 } 315 316 /* 317 * Caller must perform ib_device_put() to return the device reference count 318 * when ib_device_get_by_index() returns valid device pointer. 319 */ 320 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 321 { 322 struct ib_device *device; 323 324 down_read(&devices_rwsem); 325 device = xa_load(&devices, index); 326 if (device) { 327 if (!rdma_dev_access_netns(device, net)) { 328 device = NULL; 329 goto out; 330 } 331 332 if (!ib_device_try_get(device)) 333 device = NULL; 334 } 335 out: 336 up_read(&devices_rwsem); 337 return device; 338 } 339 340 /** 341 * ib_device_put - Release IB device reference 342 * @device: device whose reference to be released 343 * 344 * ib_device_put() releases reference to the IB device to allow it to be 345 * unregistered and eventually free. 346 */ 347 void ib_device_put(struct ib_device *device) 348 { 349 if (refcount_dec_and_test(&device->refcount)) 350 complete(&device->unreg_completion); 351 } 352 EXPORT_SYMBOL(ib_device_put); 353 354 static struct ib_device *__ib_device_get_by_name(const char *name) 355 { 356 struct ib_device *device; 357 unsigned long index; 358 359 xa_for_each (&devices, index, device) 360 if (!strcmp(name, dev_name(&device->dev))) 361 return device; 362 363 return NULL; 364 } 365 366 static int rename_compat_devs(struct ib_device *device) 367 { 368 struct ib_core_device *cdev; 369 unsigned long index; 370 int ret = 0; 371 372 mutex_lock(&device->compat_devs_mutex); 373 xa_for_each (&device->compat_devs, index, cdev) { 374 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 375 if (ret) { 376 dev_warn(&cdev->dev, 377 "Fail to rename compatdev to new name %s\n", 378 dev_name(&device->dev)); 379 break; 380 } 381 } 382 mutex_unlock(&device->compat_devs_mutex); 383 return ret; 384 } 385 386 int ib_device_rename(struct ib_device *ibdev, const char *name) 387 { 388 unsigned long index; 389 void *client_data; 390 int ret; 391 392 down_write(&devices_rwsem); 393 if (!strcmp(name, dev_name(&ibdev->dev))) { 394 up_write(&devices_rwsem); 395 return 0; 396 } 397 398 if (__ib_device_get_by_name(name)) { 399 up_write(&devices_rwsem); 400 return -EEXIST; 401 } 402 403 ret = device_rename(&ibdev->dev, name); 404 if (ret) { 405 up_write(&devices_rwsem); 406 return ret; 407 } 408 409 strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 410 ret = rename_compat_devs(ibdev); 411 412 downgrade_write(&devices_rwsem); 413 down_read(&ibdev->client_data_rwsem); 414 xan_for_each_marked(&ibdev->client_data, index, client_data, 415 CLIENT_DATA_REGISTERED) { 416 struct ib_client *client = xa_load(&clients, index); 417 418 if (!client || !client->rename) 419 continue; 420 421 client->rename(ibdev, client_data); 422 } 423 up_read(&ibdev->client_data_rwsem); 424 rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); 425 up_read(&devices_rwsem); 426 return 0; 427 } 428 429 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 430 { 431 if (use_dim > 1) 432 return -EINVAL; 433 ibdev->use_cq_dim = use_dim; 434 435 return 0; 436 } 437 438 static int alloc_name(struct ib_device *ibdev, const char *name) 439 { 440 struct ib_device *device; 441 unsigned long index; 442 struct ida inuse; 443 int rc; 444 int i; 445 446 lockdep_assert_held_write(&devices_rwsem); 447 ida_init(&inuse); 448 xa_for_each (&devices, index, device) { 449 char buf[IB_DEVICE_NAME_MAX]; 450 451 if (sscanf(dev_name(&device->dev), name, &i) != 1) 452 continue; 453 if (i < 0 || i >= INT_MAX) 454 continue; 455 snprintf(buf, sizeof buf, name, i); 456 if (strcmp(buf, dev_name(&device->dev)) != 0) 457 continue; 458 459 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); 460 if (rc < 0) 461 goto out; 462 } 463 464 rc = ida_alloc(&inuse, GFP_KERNEL); 465 if (rc < 0) 466 goto out; 467 468 rc = dev_set_name(&ibdev->dev, name, rc); 469 out: 470 ida_destroy(&inuse); 471 return rc; 472 } 473 474 static void ib_device_release(struct device *device) 475 { 476 struct ib_device *dev = container_of(device, struct ib_device, dev); 477 478 free_netdevs(dev); 479 WARN_ON(refcount_read(&dev->refcount)); 480 if (dev->hw_stats_data) 481 ib_device_release_hw_stats(dev->hw_stats_data); 482 if (dev->port_data) { 483 ib_cache_release_one(dev); 484 ib_security_release_port_pkey_list(dev); 485 rdma_counter_release(dev); 486 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 487 pdata[0]), 488 rcu_head); 489 } 490 491 mutex_destroy(&dev->subdev_lock); 492 mutex_destroy(&dev->unregistration_lock); 493 mutex_destroy(&dev->compat_devs_mutex); 494 495 xa_destroy(&dev->compat_devs); 496 xa_destroy(&dev->client_data); 497 kfree_rcu(dev, rcu_head); 498 } 499 500 static int ib_device_uevent(const struct device *device, 501 struct kobj_uevent_env *env) 502 { 503 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 504 return -ENOMEM; 505 506 /* 507 * It would be nice to pass the node GUID with the event... 508 */ 509 510 return 0; 511 } 512 513 static const struct ns_common *net_namespace(const struct device *d) 514 { 515 const struct ib_core_device *coredev = 516 container_of(d, struct ib_core_device, dev); 517 struct net *net = read_pnet(&coredev->rdma_net); 518 519 return net ? to_ns_common(net) : NULL; 520 } 521 522 static struct class ib_class = { 523 .name = "infiniband", 524 .dev_release = ib_device_release, 525 .dev_uevent = ib_device_uevent, 526 .ns_type = &net_ns_type_operations, 527 .namespace = net_namespace, 528 }; 529 530 static void rdma_init_coredev(struct ib_core_device *coredev, 531 struct ib_device *dev, struct net *net) 532 { 533 bool is_full_dev = &dev->coredev == coredev; 534 535 /* This BUILD_BUG_ON is intended to catch layout change 536 * of union of ib_core_device and device. 537 * dev must be the first element as ib_core and providers 538 * driver uses it. Adding anything in ib_core_device before 539 * device will break this assumption. 540 */ 541 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 542 offsetof(struct ib_device, dev)); 543 544 coredev->dev.class = &ib_class; 545 coredev->dev.groups = dev->groups; 546 547 /* 548 * Don't expose hw counters outside of the init namespace. 549 */ 550 if (!is_full_dev && dev->hw_stats_attr_index) 551 coredev->dev.groups[dev->hw_stats_attr_index] = NULL; 552 553 device_initialize(&coredev->dev); 554 coredev->owner = dev; 555 INIT_LIST_HEAD(&coredev->port_list); 556 write_pnet(&coredev->rdma_net, net); 557 } 558 559 /** 560 * _ib_alloc_device - allocate an IB device struct 561 * @size:size of structure to allocate 562 * @net: network namespace device should be located in, namespace 563 * must stay valid until ib_register_device() is completed. 564 * 565 * Low-level drivers should use ib_alloc_device() to allocate &struct 566 * ib_device. @size is the size of the structure to be allocated, 567 * including any private data used by the low-level driver. 568 * ib_dealloc_device() must be used to free structures allocated with 569 * ib_alloc_device(). 570 */ 571 struct ib_device *_ib_alloc_device(size_t size, struct net *net) 572 { 573 struct ib_device *device; 574 unsigned int i; 575 576 if (WARN_ON(size < sizeof(struct ib_device))) 577 return NULL; 578 579 device = kzalloc(size, GFP_KERNEL); 580 if (!device) 581 return NULL; 582 583 if (rdma_restrack_init(device)) { 584 kfree(device); 585 return NULL; 586 } 587 588 /* ib_devices_shared_netns can't change while we have active namespaces 589 * in the system which means either init_net is passed or the user has 590 * no idea what they are doing. 591 * 592 * To avoid breaking backward compatibility, when in shared mode, 593 * force to init the device in the init_net. 594 */ 595 net = ib_devices_shared_netns ? &init_net : net; 596 rdma_init_coredev(&device->coredev, device, net); 597 598 INIT_LIST_HEAD(&device->event_handler_list); 599 spin_lock_init(&device->qp_open_list_lock); 600 init_rwsem(&device->event_handler_rwsem); 601 mutex_init(&device->unregistration_lock); 602 /* 603 * client_data needs to be alloc because we don't want our mark to be 604 * destroyed if the user stores NULL in the client data. 605 */ 606 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 607 init_rwsem(&device->client_data_rwsem); 608 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 609 mutex_init(&device->compat_devs_mutex); 610 init_completion(&device->unreg_completion); 611 INIT_WORK(&device->unregistration_work, ib_unregister_work); 612 613 spin_lock_init(&device->cq_pools_lock); 614 for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) 615 INIT_LIST_HEAD(&device->cq_pools[i]); 616 617 rwlock_init(&device->cache_lock); 618 619 device->uverbs_cmd_mask = 620 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | 621 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | 622 BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | 623 BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | 624 BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | 625 BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | 626 BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | 627 BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | 628 BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | 629 BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | 630 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | 631 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | 632 BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | 633 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | 634 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | 635 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | 636 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | 637 BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | 638 BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | 639 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | 640 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | 641 BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | 642 BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | 643 BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | 644 BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | 645 BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | 646 BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | 647 BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | 648 BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | 649 BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); 650 651 mutex_init(&device->subdev_lock); 652 INIT_LIST_HEAD(&device->subdev_list_head); 653 INIT_LIST_HEAD(&device->subdev_list); 654 655 return device; 656 } 657 EXPORT_SYMBOL(_ib_alloc_device); 658 659 /** 660 * ib_dealloc_device - free an IB device struct 661 * @device:structure to free 662 * 663 * Free a structure allocated with ib_alloc_device(). 664 */ 665 void ib_dealloc_device(struct ib_device *device) 666 { 667 if (device->ops.dealloc_driver) 668 device->ops.dealloc_driver(device); 669 670 /* 671 * ib_unregister_driver() requires all devices to remain in the xarray 672 * while their ops are callable. The last op we call is dealloc_driver 673 * above. This is needed to create a fence on op callbacks prior to 674 * allowing the driver module to unload. 675 */ 676 down_write(&devices_rwsem); 677 if (xa_load(&devices, device->index) == device) 678 xa_erase(&devices, device->index); 679 up_write(&devices_rwsem); 680 681 /* Expedite releasing netdev references */ 682 free_netdevs(device); 683 684 WARN_ON(!xa_empty(&device->compat_devs)); 685 WARN_ON(!xa_empty(&device->client_data)); 686 WARN_ON(refcount_read(&device->refcount)); 687 rdma_restrack_clean(device); 688 /* Balances with device_initialize */ 689 put_device(&device->dev); 690 } 691 EXPORT_SYMBOL(ib_dealloc_device); 692 693 /* 694 * add_client_context() and remove_client_context() must be safe against 695 * parallel calls on the same device - registration/unregistration of both the 696 * device and client can be occurring in parallel. 697 * 698 * The routines need to be a fence, any caller must not return until the add 699 * or remove is fully completed. 700 */ 701 static int add_client_context(struct ib_device *device, 702 struct ib_client *client) 703 { 704 int ret = 0; 705 706 if (!device->kverbs_provider && !client->no_kverbs_req) 707 return 0; 708 709 down_write(&device->client_data_rwsem); 710 /* 711 * So long as the client is registered hold both the client and device 712 * unregistration locks. 713 */ 714 if (!refcount_inc_not_zero(&client->uses)) 715 goto out_unlock; 716 refcount_inc(&device->refcount); 717 718 /* 719 * Another caller to add_client_context got here first and has already 720 * completely initialized context. 721 */ 722 if (xa_get_mark(&device->client_data, client->client_id, 723 CLIENT_DATA_REGISTERED)) 724 goto out; 725 726 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 727 GFP_KERNEL)); 728 if (ret) 729 goto out; 730 downgrade_write(&device->client_data_rwsem); 731 if (client->add) { 732 if (client->add(device)) { 733 /* 734 * If a client fails to add then the error code is 735 * ignored, but we won't call any more ops on this 736 * client. 737 */ 738 xa_erase(&device->client_data, client->client_id); 739 up_read(&device->client_data_rwsem); 740 ib_device_put(device); 741 ib_client_put(client); 742 return 0; 743 } 744 } 745 746 /* Readers shall not see a client until add has been completed */ 747 xa_set_mark(&device->client_data, client->client_id, 748 CLIENT_DATA_REGISTERED); 749 up_read(&device->client_data_rwsem); 750 return 0; 751 752 out: 753 ib_device_put(device); 754 ib_client_put(client); 755 out_unlock: 756 up_write(&device->client_data_rwsem); 757 return ret; 758 } 759 760 static void remove_client_context(struct ib_device *device, 761 unsigned int client_id) 762 { 763 struct ib_client *client; 764 void *client_data; 765 766 down_write(&device->client_data_rwsem); 767 if (!xa_get_mark(&device->client_data, client_id, 768 CLIENT_DATA_REGISTERED)) { 769 up_write(&device->client_data_rwsem); 770 return; 771 } 772 client_data = xa_load(&device->client_data, client_id); 773 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); 774 client = xa_load(&clients, client_id); 775 up_write(&device->client_data_rwsem); 776 777 /* 778 * Notice we cannot be holding any exclusive locks when calling the 779 * remove callback as the remove callback can recurse back into any 780 * public functions in this module and thus try for any locks those 781 * functions take. 782 * 783 * For this reason clients and drivers should not call the 784 * unregistration functions will holdling any locks. 785 */ 786 if (client->remove) 787 client->remove(device, client_data); 788 789 xa_erase(&device->client_data, client_id); 790 ib_device_put(device); 791 ib_client_put(client); 792 } 793 794 static int alloc_port_data(struct ib_device *device) 795 { 796 struct ib_port_data_rcu *pdata_rcu; 797 u32 port; 798 799 if (device->port_data) 800 return 0; 801 802 /* This can only be called once the physical port range is defined */ 803 if (WARN_ON(!device->phys_port_cnt)) 804 return -EINVAL; 805 806 /* Reserve U32_MAX so the logic to go over all the ports is sane */ 807 if (WARN_ON(device->phys_port_cnt == U32_MAX)) 808 return -EINVAL; 809 810 /* 811 * device->port_data is indexed directly by the port number to make 812 * access to this data as efficient as possible. 813 * 814 * Therefore port_data is declared as a 1 based array with potential 815 * empty slots at the beginning. 816 */ 817 pdata_rcu = kzalloc_flex(*pdata_rcu, pdata, 818 size_add(rdma_end_port(device), 1)); 819 if (!pdata_rcu) 820 return -ENOMEM; 821 /* 822 * The rcu_head is put in front of the port data array and the stored 823 * pointer is adjusted since we never need to see that member until 824 * kfree_rcu. 825 */ 826 device->port_data = pdata_rcu->pdata; 827 828 rdma_for_each_port (device, port) { 829 struct ib_port_data *pdata = &device->port_data[port]; 830 831 pdata->ib_dev = device; 832 spin_lock_init(&pdata->pkey_list_lock); 833 INIT_LIST_HEAD(&pdata->pkey_list); 834 spin_lock_init(&pdata->netdev_lock); 835 INIT_HLIST_NODE(&pdata->ndev_hash_link); 836 } 837 return 0; 838 } 839 840 static int verify_immutable(const struct ib_device *dev, u32 port) 841 { 842 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 843 rdma_max_mad_size(dev, port) != 0); 844 } 845 846 static int setup_port_data(struct ib_device *device) 847 { 848 u32 port; 849 int ret; 850 851 ret = alloc_port_data(device); 852 if (ret) 853 return ret; 854 855 rdma_for_each_port (device, port) { 856 struct ib_port_data *pdata = &device->port_data[port]; 857 858 ret = device->ops.get_port_immutable(device, port, 859 &pdata->immutable); 860 if (ret) 861 return ret; 862 863 if (verify_immutable(device, port)) 864 return -EINVAL; 865 } 866 return 0; 867 } 868 869 /** 870 * ib_port_immutable_read() - Read rdma port's immutable data 871 * @dev: IB device 872 * @port: port number whose immutable data to read. It starts with index 1 and 873 * valid upto including rdma_end_port(). 874 */ 875 const struct ib_port_immutable* 876 ib_port_immutable_read(struct ib_device *dev, unsigned int port) 877 { 878 WARN_ON(!rdma_is_port_valid(dev, port)); 879 return &dev->port_data[port].immutable; 880 } 881 EXPORT_SYMBOL(ib_port_immutable_read); 882 883 void ib_get_device_fw_str(struct ib_device *dev, char *str) 884 { 885 if (dev->ops.get_dev_fw_str) 886 dev->ops.get_dev_fw_str(dev, str); 887 else 888 str[0] = '\0'; 889 } 890 EXPORT_SYMBOL(ib_get_device_fw_str); 891 892 static void ib_policy_change_task(struct work_struct *work) 893 { 894 struct ib_device *dev; 895 unsigned long index; 896 897 down_read(&devices_rwsem); 898 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 899 unsigned int i; 900 901 rdma_for_each_port (dev, i) { 902 u64 sp; 903 ib_get_cached_subnet_prefix(dev, i, &sp); 904 ib_security_cache_change(dev, i, sp); 905 } 906 } 907 up_read(&devices_rwsem); 908 } 909 910 static int ib_security_change(struct notifier_block *nb, unsigned long event, 911 void *lsm_data) 912 { 913 if (event != LSM_POLICY_CHANGE) 914 return NOTIFY_DONE; 915 916 schedule_work(&ib_policy_change_work); 917 ib_mad_agent_security_change(); 918 919 return NOTIFY_OK; 920 } 921 922 static void compatdev_release(struct device *dev) 923 { 924 struct ib_core_device *cdev = 925 container_of(dev, struct ib_core_device, dev); 926 927 kfree(cdev); 928 } 929 930 static int add_one_compat_dev(struct ib_device *device, 931 struct rdma_dev_net *rnet) 932 { 933 struct ib_core_device *cdev; 934 int ret; 935 936 lockdep_assert_held(&rdma_nets_rwsem); 937 if (!ib_devices_shared_netns) 938 return 0; 939 940 /* 941 * Create and add compat device in all namespaces other than where it 942 * is currently bound to. 943 */ 944 if (net_eq(read_pnet(&rnet->net), 945 read_pnet(&device->coredev.rdma_net))) 946 return 0; 947 948 /* 949 * The first of init_net() or ib_register_device() to take the 950 * compat_devs_mutex wins and gets to add the device. Others will wait 951 * for completion here. 952 */ 953 mutex_lock(&device->compat_devs_mutex); 954 cdev = xa_load(&device->compat_devs, rnet->id); 955 if (cdev) { 956 ret = 0; 957 goto done; 958 } 959 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 960 if (ret) 961 goto done; 962 963 cdev = kzalloc_obj(*cdev); 964 if (!cdev) { 965 ret = -ENOMEM; 966 goto cdev_err; 967 } 968 969 cdev->dev.parent = device->dev.parent; 970 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 971 cdev->dev.release = compatdev_release; 972 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 973 if (ret) 974 goto add_err; 975 976 ret = device_add(&cdev->dev); 977 if (ret) 978 goto add_err; 979 ret = ib_setup_port_attrs(cdev); 980 if (ret) 981 goto port_err; 982 983 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 984 cdev, GFP_KERNEL)); 985 if (ret) 986 goto insert_err; 987 988 mutex_unlock(&device->compat_devs_mutex); 989 return 0; 990 991 insert_err: 992 ib_free_port_attrs(cdev); 993 port_err: 994 device_del(&cdev->dev); 995 add_err: 996 put_device(&cdev->dev); 997 cdev_err: 998 xa_release(&device->compat_devs, rnet->id); 999 done: 1000 mutex_unlock(&device->compat_devs_mutex); 1001 return ret; 1002 } 1003 1004 static void remove_one_compat_dev(struct ib_device *device, u32 id) 1005 { 1006 struct ib_core_device *cdev; 1007 1008 mutex_lock(&device->compat_devs_mutex); 1009 cdev = xa_erase(&device->compat_devs, id); 1010 mutex_unlock(&device->compat_devs_mutex); 1011 if (cdev) { 1012 ib_free_port_attrs(cdev); 1013 device_del(&cdev->dev); 1014 put_device(&cdev->dev); 1015 } 1016 } 1017 1018 static void remove_compat_devs(struct ib_device *device) 1019 { 1020 struct ib_core_device *cdev; 1021 unsigned long index; 1022 1023 xa_for_each (&device->compat_devs, index, cdev) 1024 remove_one_compat_dev(device, index); 1025 } 1026 1027 static int add_compat_devs(struct ib_device *device) 1028 { 1029 struct rdma_dev_net *rnet; 1030 unsigned long index; 1031 int ret = 0; 1032 1033 lockdep_assert_held(&devices_rwsem); 1034 1035 down_read(&rdma_nets_rwsem); 1036 xa_for_each (&rdma_nets, index, rnet) { 1037 ret = add_one_compat_dev(device, rnet); 1038 if (ret) 1039 break; 1040 } 1041 up_read(&rdma_nets_rwsem); 1042 return ret; 1043 } 1044 1045 static void remove_all_compat_devs(void) 1046 { 1047 struct ib_compat_device *cdev; 1048 struct ib_device *dev; 1049 unsigned long index; 1050 1051 down_read(&devices_rwsem); 1052 xa_for_each (&devices, index, dev) { 1053 unsigned long c_index = 0; 1054 1055 /* Hold nets_rwsem so that any other thread modifying this 1056 * system param can sync with this thread. 1057 */ 1058 down_read(&rdma_nets_rwsem); 1059 xa_for_each (&dev->compat_devs, c_index, cdev) 1060 remove_one_compat_dev(dev, c_index); 1061 up_read(&rdma_nets_rwsem); 1062 } 1063 up_read(&devices_rwsem); 1064 } 1065 1066 static int add_all_compat_devs(void) 1067 { 1068 struct rdma_dev_net *rnet; 1069 struct ib_device *dev; 1070 unsigned long index; 1071 int ret = 0; 1072 1073 down_read(&devices_rwsem); 1074 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1075 unsigned long net_index = 0; 1076 1077 /* Hold nets_rwsem so that any other thread modifying this 1078 * system param can sync with this thread. 1079 */ 1080 down_read(&rdma_nets_rwsem); 1081 xa_for_each (&rdma_nets, net_index, rnet) { 1082 ret = add_one_compat_dev(dev, rnet); 1083 if (ret) 1084 break; 1085 } 1086 up_read(&rdma_nets_rwsem); 1087 } 1088 up_read(&devices_rwsem); 1089 if (ret) 1090 remove_all_compat_devs(); 1091 return ret; 1092 } 1093 1094 int rdma_compatdev_set(u8 enable) 1095 { 1096 struct rdma_dev_net *rnet; 1097 unsigned long index; 1098 int ret = 0; 1099 1100 down_write(&rdma_nets_rwsem); 1101 if (ib_devices_shared_netns == enable) { 1102 up_write(&rdma_nets_rwsem); 1103 return 0; 1104 } 1105 1106 /* enable/disable of compat devices is not supported 1107 * when more than default init_net exists. 1108 */ 1109 xa_for_each (&rdma_nets, index, rnet) { 1110 ret++; 1111 break; 1112 } 1113 if (!ret) 1114 ib_devices_shared_netns = enable; 1115 up_write(&rdma_nets_rwsem); 1116 if (ret) 1117 return -EBUSY; 1118 1119 if (enable) 1120 ret = add_all_compat_devs(); 1121 else 1122 remove_all_compat_devs(); 1123 return ret; 1124 } 1125 1126 static void rdma_dev_exit_net(struct net *net) 1127 { 1128 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1129 struct ib_device *dev; 1130 unsigned long index; 1131 int ret; 1132 1133 down_write(&rdma_nets_rwsem); 1134 /* 1135 * Prevent the ID from being re-used and hide the id from xa_for_each. 1136 */ 1137 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1138 WARN_ON(ret); 1139 up_write(&rdma_nets_rwsem); 1140 1141 down_read(&devices_rwsem); 1142 xa_for_each (&devices, index, dev) { 1143 get_device(&dev->dev); 1144 /* 1145 * Release the devices_rwsem so that pontentially blocking 1146 * device_del, doesn't hold the devices_rwsem for too long. 1147 */ 1148 up_read(&devices_rwsem); 1149 1150 remove_one_compat_dev(dev, rnet->id); 1151 1152 /* 1153 * If the real device is in the NS then move it back to init. 1154 */ 1155 rdma_dev_change_netns(dev, net, &init_net); 1156 1157 put_device(&dev->dev); 1158 down_read(&devices_rwsem); 1159 } 1160 up_read(&devices_rwsem); 1161 1162 rdma_nl_net_exit(rnet); 1163 xa_erase(&rdma_nets, rnet->id); 1164 } 1165 1166 static __net_init int rdma_dev_init_net(struct net *net) 1167 { 1168 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1169 unsigned long index; 1170 struct ib_device *dev; 1171 int ret; 1172 1173 write_pnet(&rnet->net, net); 1174 1175 ret = rdma_nl_net_init(rnet); 1176 if (ret) 1177 return ret; 1178 1179 /* No need to create any compat devices in default init_net. */ 1180 if (net_eq(net, &init_net)) 1181 return 0; 1182 1183 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1184 if (ret) { 1185 rdma_nl_net_exit(rnet); 1186 return ret; 1187 } 1188 1189 down_read(&devices_rwsem); 1190 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1191 /* Hold nets_rwsem so that netlink command cannot change 1192 * system configuration for device sharing mode. 1193 */ 1194 down_read(&rdma_nets_rwsem); 1195 ret = add_one_compat_dev(dev, rnet); 1196 up_read(&rdma_nets_rwsem); 1197 if (ret) 1198 break; 1199 } 1200 up_read(&devices_rwsem); 1201 1202 if (ret) 1203 rdma_dev_exit_net(net); 1204 1205 return ret; 1206 } 1207 1208 /* 1209 * Assign the unique string device name and the unique device index. This is 1210 * undone by ib_dealloc_device. 1211 */ 1212 static int assign_name(struct ib_device *device, const char *name) 1213 { 1214 static u32 last_id; 1215 int ret; 1216 1217 down_write(&devices_rwsem); 1218 /* Assign a unique name to the device */ 1219 if (strchr(name, '%')) 1220 ret = alloc_name(device, name); 1221 else 1222 ret = dev_set_name(&device->dev, name); 1223 if (ret) 1224 goto out; 1225 1226 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1227 ret = -ENFILE; 1228 goto out; 1229 } 1230 strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1231 1232 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1233 &last_id, GFP_KERNEL); 1234 if (ret > 0) 1235 ret = 0; 1236 1237 out: 1238 up_write(&devices_rwsem); 1239 return ret; 1240 } 1241 1242 /* 1243 * setup_device() allocates memory and sets up data that requires calling the 1244 * device ops, this is the only reason these actions are not done during 1245 * ib_alloc_device. It is undone by ib_dealloc_device(). 1246 */ 1247 static int setup_device(struct ib_device *device) 1248 { 1249 int ret; 1250 1251 ib_device_check_mandatory(device); 1252 1253 ret = setup_port_data(device); 1254 if (ret) { 1255 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1256 return ret; 1257 } 1258 1259 memset(&device->attrs, 0, sizeof(device->attrs)); 1260 ret = device->ops.query_device(device, &device->attrs, NULL); 1261 if (ret) { 1262 dev_warn(&device->dev, 1263 "Couldn't query the device attributes\n"); 1264 return ret; 1265 } 1266 1267 return 0; 1268 } 1269 1270 static void disable_device(struct ib_device *device) 1271 { 1272 u32 cid; 1273 1274 WARN_ON(!refcount_read(&device->refcount)); 1275 1276 down_write(&devices_rwsem); 1277 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1278 up_write(&devices_rwsem); 1279 1280 /* 1281 * Remove clients in LIFO order, see assign_client_id. This could be 1282 * more efficient if xarray learns to reverse iterate. Since no new 1283 * clients can be added to this ib_device past this point we only need 1284 * the maximum possible client_id value here. 1285 */ 1286 down_read(&clients_rwsem); 1287 cid = highest_client_id; 1288 up_read(&clients_rwsem); 1289 while (cid) { 1290 cid--; 1291 remove_client_context(device, cid); 1292 } 1293 1294 ib_cq_pool_cleanup(device); 1295 1296 /* Pairs with refcount_set in enable_device */ 1297 ib_device_put(device); 1298 wait_for_completion(&device->unreg_completion); 1299 1300 /* 1301 * compat devices must be removed after device refcount drops to zero. 1302 * Otherwise init_net() may add more compatdevs after removing compat 1303 * devices and before device is disabled. 1304 */ 1305 remove_compat_devs(device); 1306 } 1307 1308 /* 1309 * An enabled device is visible to all clients and to all the public facing 1310 * APIs that return a device pointer. This always returns with a new get, even 1311 * if it fails. 1312 */ 1313 static int enable_device_and_get(struct ib_device *device) 1314 { 1315 struct ib_client *client; 1316 unsigned long index; 1317 int ret = 0; 1318 1319 /* 1320 * One ref belongs to the xa and the other belongs to this 1321 * thread. This is needed to guard against parallel unregistration. 1322 */ 1323 refcount_set(&device->refcount, 2); 1324 down_write(&devices_rwsem); 1325 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1326 1327 /* 1328 * By using downgrade_write() we ensure that no other thread can clear 1329 * DEVICE_REGISTERED while we are completing the client setup. 1330 */ 1331 downgrade_write(&devices_rwsem); 1332 1333 if (device->ops.enable_driver) { 1334 ret = device->ops.enable_driver(device); 1335 if (ret) 1336 goto out; 1337 } 1338 1339 down_read(&clients_rwsem); 1340 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1341 ret = add_client_context(device, client); 1342 if (ret) 1343 break; 1344 } 1345 up_read(&clients_rwsem); 1346 if (!ret) 1347 ret = add_compat_devs(device); 1348 out: 1349 up_read(&devices_rwsem); 1350 return ret; 1351 } 1352 1353 static void prevent_dealloc_device(struct ib_device *ib_dev) 1354 { 1355 } 1356 1357 static void ib_device_notify_register(struct ib_device *device) 1358 { 1359 struct net_device *netdev; 1360 u32 port; 1361 int ret; 1362 1363 down_read(&devices_rwsem); 1364 1365 /* Mark for userspace that device is ready */ 1366 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1367 1368 ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); 1369 if (ret) 1370 goto out; 1371 1372 rdma_for_each_port(device, port) { 1373 netdev = ib_device_get_netdev(device, port); 1374 if (!netdev) 1375 continue; 1376 1377 ret = rdma_nl_notify_event(device, port, 1378 RDMA_NETDEV_ATTACH_EVENT); 1379 dev_put(netdev); 1380 if (ret) 1381 goto out; 1382 } 1383 1384 out: 1385 up_read(&devices_rwsem); 1386 } 1387 1388 /** 1389 * ib_register_device - Register an IB device with IB core 1390 * @device: Device to register 1391 * @name: unique string device name. This may include a '%' which will 1392 * cause a unique index to be added to the passed device name. 1393 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB 1394 * device will be used. In this case the caller should fully 1395 * setup the ibdev for DMA. This usually means using dma_virt_ops. 1396 * 1397 * Low-level drivers use ib_register_device() to register their 1398 * devices with the IB core. All registered clients will receive a 1399 * callback for each device that is added. @device must be allocated 1400 * with ib_alloc_device(). 1401 * 1402 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1403 * asynchronously then the device pointer may become freed as soon as this 1404 * function returns. 1405 */ 1406 int ib_register_device(struct ib_device *device, const char *name, 1407 struct device *dma_device) 1408 { 1409 int ret; 1410 1411 ret = assign_name(device, name); 1412 if (ret) 1413 return ret; 1414 1415 /* 1416 * If the caller does not provide a DMA capable device then the IB core 1417 * will set up ib_sge and scatterlist structures that stash the kernel 1418 * virtual address into the address field. 1419 */ 1420 WARN_ON(dma_device && !dma_device->dma_parms); 1421 device->dma_device = dma_device; 1422 /* 1423 * In a CoCo guest every device is currently assumed to be untrusted 1424 * (T=0) and therefore subject to DMA bouncing. Once trusted (T=1) 1425 * device detection is wired up, narrow this check to exclude such 1426 * devices. 1427 */ 1428 if (dma_device && cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) 1429 device->cc_dma_bounce = 1; 1430 1431 ret = setup_device(device); 1432 if (ret) 1433 return ret; 1434 1435 ret = ib_cache_setup_one(device); 1436 if (ret) { 1437 dev_warn(&device->dev, 1438 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1439 return ret; 1440 } 1441 1442 device->groups[0] = &ib_dev_attr_group; 1443 device->groups[1] = device->ops.device_group; 1444 ret = ib_setup_device_attrs(device); 1445 if (ret) 1446 goto cache_cleanup; 1447 1448 ib_device_register_rdmacg(device); 1449 1450 rdma_counter_init(device); 1451 1452 /* 1453 * Ensure that ADD uevent is not fired because it 1454 * is too early amd device is not initialized yet. 1455 */ 1456 dev_set_uevent_suppress(&device->dev, true); 1457 ret = device_add(&device->dev); 1458 if (ret) 1459 goto cg_cleanup; 1460 1461 ret = ib_setup_port_attrs(&device->coredev); 1462 if (ret) { 1463 dev_warn(&device->dev, 1464 "Couldn't register device with driver model\n"); 1465 goto dev_cleanup; 1466 } 1467 1468 ret = enable_device_and_get(device); 1469 if (ret) { 1470 void (*dealloc_fn)(struct ib_device *); 1471 1472 /* 1473 * If we hit this error flow then we don't want to 1474 * automatically dealloc the device since the caller is 1475 * expected to call ib_dealloc_device() after 1476 * ib_register_device() fails. This is tricky due to the 1477 * possibility for a parallel unregistration along with this 1478 * error flow. Since we have a refcount here we know any 1479 * parallel flow is stopped in disable_device and will see the 1480 * special dealloc_driver pointer, causing the responsibility to 1481 * ib_dealloc_device() to revert back to this thread. 1482 */ 1483 dealloc_fn = device->ops.dealloc_driver; 1484 device->ops.dealloc_driver = prevent_dealloc_device; 1485 ib_device_put(device); 1486 __ib_unregister_device(device); 1487 device->ops.dealloc_driver = dealloc_fn; 1488 dev_set_uevent_suppress(&device->dev, false); 1489 return ret; 1490 } 1491 dev_set_uevent_suppress(&device->dev, false); 1492 1493 ib_device_notify_register(device); 1494 1495 ib_device_put(device); 1496 1497 return 0; 1498 1499 dev_cleanup: 1500 device_del(&device->dev); 1501 cg_cleanup: 1502 dev_set_uevent_suppress(&device->dev, false); 1503 ib_device_unregister_rdmacg(device); 1504 cache_cleanup: 1505 ib_cache_cleanup_one(device); 1506 return ret; 1507 } 1508 EXPORT_SYMBOL(ib_register_device); 1509 1510 /* Callers must hold a get on the device. */ 1511 static void __ib_unregister_device(struct ib_device *ib_dev) 1512 { 1513 struct ib_device *sub, *tmp; 1514 1515 mutex_lock(&ib_dev->subdev_lock); 1516 list_for_each_entry_safe_reverse(sub, tmp, 1517 &ib_dev->subdev_list_head, 1518 subdev_list) { 1519 list_del(&sub->subdev_list); 1520 ib_dev->ops.del_sub_dev(sub); 1521 ib_device_put(ib_dev); 1522 } 1523 mutex_unlock(&ib_dev->subdev_lock); 1524 1525 /* 1526 * We have a registration lock so that all the calls to unregister are 1527 * fully fenced, once any unregister returns the device is truly 1528 * unregistered even if multiple callers are unregistering it at the 1529 * same time. This also interacts with the registration flow and 1530 * provides sane semantics if register and unregister are racing. 1531 */ 1532 mutex_lock(&ib_dev->unregistration_lock); 1533 if (!refcount_read(&ib_dev->refcount)) 1534 goto out; 1535 1536 disable_device(ib_dev); 1537 rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); 1538 1539 /* Expedite removing unregistered pointers from the hash table */ 1540 free_netdevs(ib_dev); 1541 1542 ib_free_port_attrs(&ib_dev->coredev); 1543 device_del(&ib_dev->dev); 1544 ib_device_unregister_rdmacg(ib_dev); 1545 ib_cache_cleanup_one(ib_dev); 1546 1547 /* 1548 * Drivers using the new flow may not call ib_dealloc_device except 1549 * in error unwind prior to registration success. 1550 */ 1551 if (ib_dev->ops.dealloc_driver && 1552 ib_dev->ops.dealloc_driver != prevent_dealloc_device) { 1553 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); 1554 ib_dealloc_device(ib_dev); 1555 } 1556 out: 1557 mutex_unlock(&ib_dev->unregistration_lock); 1558 } 1559 1560 /** 1561 * ib_unregister_device - Unregister an IB device 1562 * @ib_dev: The device to unregister 1563 * 1564 * Unregister an IB device. All clients will receive a remove callback. 1565 * 1566 * Callers should call this routine only once, and protect against races with 1567 * registration. Typically it should only be called as part of a remove 1568 * callback in an implementation of driver core's struct device_driver and 1569 * related. 1570 * 1571 * If ops.dealloc_driver is used then ib_dev will be freed upon return from 1572 * this function. 1573 */ 1574 void ib_unregister_device(struct ib_device *ib_dev) 1575 { 1576 get_device(&ib_dev->dev); 1577 __ib_unregister_device(ib_dev); 1578 put_device(&ib_dev->dev); 1579 } 1580 EXPORT_SYMBOL(ib_unregister_device); 1581 1582 /** 1583 * ib_unregister_device_and_put - Unregister a device while holding a 'get' 1584 * @ib_dev: The device to unregister 1585 * 1586 * This is the same as ib_unregister_device(), except it includes an internal 1587 * ib_device_put() that should match a 'get' obtained by the caller. 1588 * 1589 * It is safe to call this routine concurrently from multiple threads while 1590 * holding the 'get'. When the function returns the device is fully 1591 * unregistered. 1592 * 1593 * Drivers using this flow MUST use the driver_unregister callback to clean up 1594 * their resources associated with the device and dealloc it. 1595 */ 1596 void ib_unregister_device_and_put(struct ib_device *ib_dev) 1597 { 1598 WARN_ON(!ib_dev->ops.dealloc_driver); 1599 get_device(&ib_dev->dev); 1600 ib_device_put(ib_dev); 1601 __ib_unregister_device(ib_dev); 1602 put_device(&ib_dev->dev); 1603 } 1604 EXPORT_SYMBOL(ib_unregister_device_and_put); 1605 1606 /** 1607 * ib_unregister_driver - Unregister all IB devices for a driver 1608 * @driver_id: The driver to unregister 1609 * 1610 * This implements a fence for device unregistration. It only returns once all 1611 * devices associated with the driver_id have fully completed their 1612 * unregistration and returned from ib_unregister_device*(). 1613 * 1614 * If device's are not yet unregistered it goes ahead and starts unregistering 1615 * them. 1616 * 1617 * This does not block creation of new devices with the given driver_id, that 1618 * is the responsibility of the caller. 1619 */ 1620 void ib_unregister_driver(enum rdma_driver_id driver_id) 1621 { 1622 struct ib_device *ib_dev; 1623 unsigned long index; 1624 1625 down_read(&devices_rwsem); 1626 xa_for_each (&devices, index, ib_dev) { 1627 if (ib_dev->ops.driver_id != driver_id) 1628 continue; 1629 1630 get_device(&ib_dev->dev); 1631 up_read(&devices_rwsem); 1632 1633 WARN_ON(!ib_dev->ops.dealloc_driver); 1634 __ib_unregister_device(ib_dev); 1635 1636 put_device(&ib_dev->dev); 1637 down_read(&devices_rwsem); 1638 } 1639 up_read(&devices_rwsem); 1640 } 1641 EXPORT_SYMBOL(ib_unregister_driver); 1642 1643 static void ib_unregister_work(struct work_struct *work) 1644 { 1645 struct ib_device *ib_dev = 1646 container_of(work, struct ib_device, unregistration_work); 1647 1648 __ib_unregister_device(ib_dev); 1649 put_device(&ib_dev->dev); 1650 } 1651 1652 /** 1653 * ib_unregister_device_queued - Unregister a device using a work queue 1654 * @ib_dev: The device to unregister 1655 * 1656 * This schedules an asynchronous unregistration using a WQ for the device. A 1657 * driver should use this to avoid holding locks while doing unregistration, 1658 * such as holding the RTNL lock. 1659 * 1660 * Drivers using this API must use ib_unregister_driver before module unload 1661 * to ensure that all scheduled unregistrations have completed. 1662 */ 1663 void ib_unregister_device_queued(struct ib_device *ib_dev) 1664 { 1665 WARN_ON(!refcount_read(&ib_dev->refcount)); 1666 WARN_ON(!ib_dev->ops.dealloc_driver); 1667 get_device(&ib_dev->dev); 1668 if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) 1669 put_device(&ib_dev->dev); 1670 } 1671 EXPORT_SYMBOL(ib_unregister_device_queued); 1672 1673 /* 1674 * The caller must pass in a device that has the kref held and the refcount 1675 * released. If the device is in cur_net and still registered then it is moved 1676 * into net. 1677 */ 1678 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1679 struct net *net) 1680 { 1681 int ret2 = -EINVAL; 1682 int ret; 1683 1684 mutex_lock(&device->unregistration_lock); 1685 1686 /* 1687 * If a device not under ib_device_get() or if the unregistration_lock 1688 * is not held, the namespace can be changed, or it can be unregistered. 1689 * Check again under the lock. 1690 */ 1691 if (refcount_read(&device->refcount) == 0 || 1692 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1693 ret = -ENODEV; 1694 goto out; 1695 } 1696 1697 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1698 disable_device(device); 1699 1700 /* 1701 * At this point no one can be using the device, so it is safe to 1702 * change the namespace. 1703 */ 1704 write_pnet(&device->coredev.rdma_net, net); 1705 1706 down_read(&devices_rwsem); 1707 /* 1708 * Currently rdma devices are system wide unique. So the device name 1709 * is guaranteed free in the new namespace. Publish the new namespace 1710 * at the sysfs level. 1711 */ 1712 ret = device_rename(&device->dev, dev_name(&device->dev)); 1713 up_read(&devices_rwsem); 1714 if (ret) { 1715 dev_warn(&device->dev, 1716 "%s: Couldn't rename device after namespace change\n", 1717 __func__); 1718 /* Try and put things back and re-enable the device */ 1719 write_pnet(&device->coredev.rdma_net, cur_net); 1720 } 1721 1722 ret2 = enable_device_and_get(device); 1723 if (ret2) { 1724 /* 1725 * This shouldn't really happen, but if it does, let the user 1726 * retry at later point. So don't disable the device. 1727 */ 1728 dev_warn(&device->dev, 1729 "%s: Couldn't re-enable device after namespace change\n", 1730 __func__); 1731 } 1732 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1733 1734 ib_device_put(device); 1735 out: 1736 mutex_unlock(&device->unregistration_lock); 1737 if (ret) 1738 return ret; 1739 return ret2; 1740 } 1741 1742 int ib_device_set_netns_put(struct sk_buff *skb, 1743 struct ib_device *dev, u32 ns_fd) 1744 { 1745 struct net *net; 1746 int ret; 1747 1748 net = get_net_ns_by_fd(ns_fd); 1749 if (IS_ERR(net)) { 1750 ret = PTR_ERR(net); 1751 goto net_err; 1752 } 1753 1754 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1755 ret = -EPERM; 1756 goto ns_err; 1757 } 1758 1759 /* 1760 * All the ib_clients, including uverbs, are reset when the namespace is 1761 * changed and this cannot be blocked waiting for userspace to do 1762 * something, so disassociation is mandatory. 1763 */ 1764 if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { 1765 ret = -EOPNOTSUPP; 1766 goto ns_err; 1767 } 1768 1769 get_device(&dev->dev); 1770 ib_device_put(dev); 1771 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1772 put_device(&dev->dev); 1773 1774 put_net(net); 1775 return ret; 1776 1777 ns_err: 1778 put_net(net); 1779 net_err: 1780 ib_device_put(dev); 1781 return ret; 1782 } 1783 1784 static struct pernet_operations rdma_dev_net_ops = { 1785 .init = rdma_dev_init_net, 1786 .exit = rdma_dev_exit_net, 1787 .id = &rdma_dev_net_id, 1788 .size = sizeof(struct rdma_dev_net), 1789 }; 1790 1791 static int assign_client_id(struct ib_client *client) 1792 { 1793 int ret; 1794 1795 lockdep_assert_held(&clients_rwsem); 1796 /* 1797 * The add/remove callbacks must be called in FIFO/LIFO order. To 1798 * achieve this we assign client_ids so they are sorted in 1799 * registration order. 1800 */ 1801 client->client_id = highest_client_id; 1802 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1803 if (ret) 1804 return ret; 1805 1806 highest_client_id++; 1807 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1808 return 0; 1809 } 1810 1811 static void remove_client_id(struct ib_client *client) 1812 { 1813 down_write(&clients_rwsem); 1814 xa_erase(&clients, client->client_id); 1815 for (; highest_client_id; highest_client_id--) 1816 if (xa_load(&clients, highest_client_id - 1)) 1817 break; 1818 up_write(&clients_rwsem); 1819 } 1820 1821 /** 1822 * ib_register_client - Register an IB client 1823 * @client:Client to register 1824 * 1825 * Upper level users of the IB drivers can use ib_register_client() to 1826 * register callbacks for IB device addition and removal. When an IB 1827 * device is added, each registered client's add method will be called 1828 * (in the order the clients were registered), and when a device is 1829 * removed, each client's remove method will be called (in the reverse 1830 * order that clients were registered). In addition, when 1831 * ib_register_client() is called, the client will receive an add 1832 * callback for all devices already registered. 1833 */ 1834 int ib_register_client(struct ib_client *client) 1835 { 1836 struct ib_device *device; 1837 unsigned long index; 1838 bool need_unreg = false; 1839 int ret; 1840 1841 refcount_set(&client->uses, 1); 1842 init_completion(&client->uses_zero); 1843 1844 /* 1845 * The devices_rwsem is held in write mode to ensure that a racing 1846 * ib_register_device() sees a consisent view of clients and devices. 1847 */ 1848 down_write(&devices_rwsem); 1849 down_write(&clients_rwsem); 1850 ret = assign_client_id(client); 1851 if (ret) 1852 goto out; 1853 1854 need_unreg = true; 1855 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1856 ret = add_client_context(device, client); 1857 if (ret) 1858 goto out; 1859 } 1860 ret = 0; 1861 out: 1862 up_write(&clients_rwsem); 1863 up_write(&devices_rwsem); 1864 if (need_unreg && ret) 1865 ib_unregister_client(client); 1866 return ret; 1867 } 1868 EXPORT_SYMBOL(ib_register_client); 1869 1870 /** 1871 * ib_unregister_client - Unregister an IB client 1872 * @client:Client to unregister 1873 * 1874 * Upper level users use ib_unregister_client() to remove their client 1875 * registration. When ib_unregister_client() is called, the client 1876 * will receive a remove callback for each IB device still registered. 1877 * 1878 * This is a full fence, once it returns no client callbacks will be called, 1879 * or are running in another thread. 1880 */ 1881 void ib_unregister_client(struct ib_client *client) 1882 { 1883 struct ib_device *device; 1884 unsigned long index; 1885 1886 down_write(&clients_rwsem); 1887 ib_client_put(client); 1888 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1889 up_write(&clients_rwsem); 1890 1891 /* We do not want to have locks while calling client->remove() */ 1892 rcu_read_lock(); 1893 xa_for_each (&devices, index, device) { 1894 if (!ib_device_try_get(device)) 1895 continue; 1896 rcu_read_unlock(); 1897 1898 remove_client_context(device, client->client_id); 1899 1900 ib_device_put(device); 1901 rcu_read_lock(); 1902 } 1903 rcu_read_unlock(); 1904 1905 /* 1906 * remove_client_context() is not a fence, it can return even though a 1907 * removal is ongoing. Wait until all removals are completed. 1908 */ 1909 wait_for_completion(&client->uses_zero); 1910 remove_client_id(client); 1911 } 1912 EXPORT_SYMBOL(ib_unregister_client); 1913 1914 static int __ib_get_global_client_nl_info(const char *client_name, 1915 struct ib_client_nl_info *res) 1916 { 1917 struct ib_client *client; 1918 unsigned long index; 1919 int ret = -ENOENT; 1920 1921 down_read(&clients_rwsem); 1922 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1923 if (strcmp(client->name, client_name) != 0) 1924 continue; 1925 if (!client->get_global_nl_info) { 1926 ret = -EOPNOTSUPP; 1927 break; 1928 } 1929 ret = client->get_global_nl_info(res); 1930 if (WARN_ON(ret == -ENOENT)) 1931 ret = -EINVAL; 1932 if (!ret && res->cdev) 1933 get_device(res->cdev); 1934 break; 1935 } 1936 up_read(&clients_rwsem); 1937 return ret; 1938 } 1939 1940 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1941 const char *client_name, 1942 struct ib_client_nl_info *res) 1943 { 1944 unsigned long index; 1945 void *client_data; 1946 int ret = -ENOENT; 1947 1948 down_read(&ibdev->client_data_rwsem); 1949 xan_for_each_marked (&ibdev->client_data, index, client_data, 1950 CLIENT_DATA_REGISTERED) { 1951 struct ib_client *client = xa_load(&clients, index); 1952 1953 if (!client || strcmp(client->name, client_name) != 0) 1954 continue; 1955 if (!client->get_nl_info) { 1956 ret = -EOPNOTSUPP; 1957 break; 1958 } 1959 ret = client->get_nl_info(ibdev, client_data, res); 1960 if (WARN_ON(ret == -ENOENT)) 1961 ret = -EINVAL; 1962 1963 /* 1964 * The cdev is guaranteed valid as long as we are inside the 1965 * client_data_rwsem as remove_one can't be called. Keep it 1966 * valid for the caller. 1967 */ 1968 if (!ret && res->cdev) 1969 get_device(res->cdev); 1970 break; 1971 } 1972 up_read(&ibdev->client_data_rwsem); 1973 1974 return ret; 1975 } 1976 1977 /** 1978 * ib_get_client_nl_info - Fetch the nl_info from a client 1979 * @ibdev: IB device 1980 * @client_name: Name of the client 1981 * @res: Result of the query 1982 */ 1983 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1984 struct ib_client_nl_info *res) 1985 { 1986 int ret; 1987 1988 if (ibdev) 1989 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1990 else 1991 ret = __ib_get_global_client_nl_info(client_name, res); 1992 #ifdef CONFIG_MODULES 1993 if (ret == -ENOENT) { 1994 request_module("rdma-client-%s", client_name); 1995 if (ibdev) 1996 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1997 else 1998 ret = __ib_get_global_client_nl_info(client_name, res); 1999 } 2000 #endif 2001 if (ret) { 2002 if (ret == -ENOENT) 2003 return -EOPNOTSUPP; 2004 return ret; 2005 } 2006 2007 if (WARN_ON(!res->cdev)) 2008 return -EINVAL; 2009 return 0; 2010 } 2011 2012 /** 2013 * ib_set_client_data - Set IB client context 2014 * @device:Device to set context for 2015 * @client:Client to set context for 2016 * @data:Context to set 2017 * 2018 * ib_set_client_data() sets client context data that can be retrieved with 2019 * ib_get_client_data(). This can only be called while the client is 2020 * registered to the device, once the ib_client remove() callback returns this 2021 * cannot be called. 2022 */ 2023 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 2024 void *data) 2025 { 2026 void *rc; 2027 2028 if (WARN_ON(IS_ERR(data))) 2029 data = NULL; 2030 2031 rc = xa_store(&device->client_data, client->client_id, data, 2032 GFP_KERNEL); 2033 WARN_ON(xa_is_err(rc)); 2034 } 2035 EXPORT_SYMBOL(ib_set_client_data); 2036 2037 /** 2038 * ib_register_event_handler - Register an IB event handler 2039 * @event_handler:Handler to register 2040 * 2041 * ib_register_event_handler() registers an event handler that will be 2042 * called back when asynchronous IB events occur (as defined in 2043 * chapter 11 of the InfiniBand Architecture Specification). This 2044 * callback occurs in workqueue context. 2045 */ 2046 void ib_register_event_handler(struct ib_event_handler *event_handler) 2047 { 2048 down_write(&event_handler->device->event_handler_rwsem); 2049 list_add_tail(&event_handler->list, 2050 &event_handler->device->event_handler_list); 2051 up_write(&event_handler->device->event_handler_rwsem); 2052 } 2053 EXPORT_SYMBOL(ib_register_event_handler); 2054 2055 /** 2056 * ib_unregister_event_handler - Unregister an event handler 2057 * @event_handler:Handler to unregister 2058 * 2059 * Unregister an event handler registered with 2060 * ib_register_event_handler(). 2061 */ 2062 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 2063 { 2064 down_write(&event_handler->device->event_handler_rwsem); 2065 list_del(&event_handler->list); 2066 up_write(&event_handler->device->event_handler_rwsem); 2067 } 2068 EXPORT_SYMBOL(ib_unregister_event_handler); 2069 2070 void ib_dispatch_event_clients(struct ib_event *event) 2071 { 2072 struct ib_event_handler *handler; 2073 2074 down_read(&event->device->event_handler_rwsem); 2075 2076 list_for_each_entry(handler, &event->device->event_handler_list, list) 2077 handler->handler(handler, event); 2078 2079 up_read(&event->device->event_handler_rwsem); 2080 } 2081 2082 static int iw_query_port(struct ib_device *device, 2083 u32 port_num, 2084 struct ib_port_attr *port_attr) 2085 { 2086 struct in_device *inetdev; 2087 struct net_device *netdev; 2088 2089 memset(port_attr, 0, sizeof(*port_attr)); 2090 2091 netdev = ib_device_get_netdev(device, port_num); 2092 if (!netdev) 2093 return -ENODEV; 2094 2095 port_attr->max_mtu = IB_MTU_4096; 2096 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2097 2098 if (!netif_carrier_ok(netdev)) { 2099 port_attr->state = IB_PORT_DOWN; 2100 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2101 } else { 2102 rcu_read_lock(); 2103 inetdev = __in_dev_get_rcu(netdev); 2104 2105 if (inetdev && inetdev->ifa_list) { 2106 port_attr->state = IB_PORT_ACTIVE; 2107 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2108 } else { 2109 port_attr->state = IB_PORT_INIT; 2110 port_attr->phys_state = 2111 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2112 } 2113 2114 rcu_read_unlock(); 2115 } 2116 2117 dev_put(netdev); 2118 return device->ops.query_port(device, port_num, port_attr); 2119 } 2120 2121 static int __ib_query_port(struct ib_device *device, 2122 u32 port_num, 2123 struct ib_port_attr *port_attr) 2124 { 2125 int err; 2126 2127 memset(port_attr, 0, sizeof(*port_attr)); 2128 2129 err = device->ops.query_port(device, port_num, port_attr); 2130 if (err || port_attr->subnet_prefix) 2131 return err; 2132 2133 if (rdma_port_get_link_layer(device, port_num) != 2134 IB_LINK_LAYER_INFINIBAND) 2135 return 0; 2136 2137 ib_get_cached_subnet_prefix(device, port_num, 2138 &port_attr->subnet_prefix); 2139 return 0; 2140 } 2141 2142 /** 2143 * ib_query_port - Query IB port attributes 2144 * @device:Device to query 2145 * @port_num:Port number to query 2146 * @port_attr:Port attributes 2147 * 2148 * ib_query_port() returns the attributes of a port through the 2149 * @port_attr pointer. 2150 */ 2151 int ib_query_port(struct ib_device *device, 2152 u32 port_num, 2153 struct ib_port_attr *port_attr) 2154 { 2155 if (!rdma_is_port_valid(device, port_num)) 2156 return -EINVAL; 2157 2158 if (rdma_protocol_iwarp(device, port_num)) 2159 return iw_query_port(device, port_num, port_attr); 2160 else 2161 return __ib_query_port(device, port_num, port_attr); 2162 } 2163 EXPORT_SYMBOL(ib_query_port); 2164 2165 static void add_ndev_hash(struct ib_port_data *pdata) 2166 { 2167 unsigned long flags; 2168 2169 might_sleep(); 2170 2171 spin_lock_irqsave(&ndev_hash_lock, flags); 2172 if (hash_hashed(&pdata->ndev_hash_link)) { 2173 hash_del_rcu(&pdata->ndev_hash_link); 2174 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2175 /* 2176 * We cannot do hash_add_rcu after a hash_del_rcu until the 2177 * grace period 2178 */ 2179 synchronize_rcu(); 2180 spin_lock_irqsave(&ndev_hash_lock, flags); 2181 } 2182 if (pdata->netdev) 2183 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2184 (uintptr_t)pdata->netdev); 2185 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2186 } 2187 2188 /** 2189 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2190 * @ib_dev: Device to modify 2191 * @ndev: net_device to affiliate, may be NULL 2192 * @port: IB port the net_device is connected to 2193 * 2194 * Drivers should use this to link the ib_device to a netdev so the netdev 2195 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2196 * affiliated with any port. 2197 * 2198 * The caller must ensure that the given ndev is not unregistered or 2199 * unregistering, and that either the ib_device is unregistered or 2200 * ib_device_set_netdev() is called with NULL when the ndev sends a 2201 * NETDEV_UNREGISTER event. 2202 */ 2203 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2204 u32 port) 2205 { 2206 enum rdma_nl_notify_event_type etype; 2207 struct net_device *old_ndev; 2208 struct ib_port_data *pdata; 2209 unsigned long flags; 2210 int ret; 2211 2212 if (!rdma_is_port_valid(ib_dev, port)) 2213 return -EINVAL; 2214 2215 /* 2216 * Drivers wish to call this before ib_register_driver, so we have to 2217 * setup the port data early. 2218 */ 2219 ret = alloc_port_data(ib_dev); 2220 if (ret) 2221 return ret; 2222 2223 pdata = &ib_dev->port_data[port]; 2224 spin_lock_irqsave(&pdata->netdev_lock, flags); 2225 old_ndev = rcu_dereference_protected( 2226 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2227 if (old_ndev == ndev) { 2228 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2229 return 0; 2230 } 2231 2232 rcu_assign_pointer(pdata->netdev, ndev); 2233 netdev_put(old_ndev, &pdata->netdev_tracker); 2234 netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); 2235 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2236 2237 add_ndev_hash(pdata); 2238 2239 /* Make sure that the device is registered before we send events */ 2240 if (xa_load(&devices, ib_dev->index) != ib_dev) 2241 return 0; 2242 2243 etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; 2244 rdma_nl_notify_event(ib_dev, port, etype); 2245 2246 return 0; 2247 } 2248 EXPORT_SYMBOL(ib_device_set_netdev); 2249 2250 static void free_netdevs(struct ib_device *ib_dev) 2251 { 2252 unsigned long flags; 2253 u32 port; 2254 2255 if (!ib_dev->port_data) 2256 return; 2257 2258 rdma_for_each_port (ib_dev, port) { 2259 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2260 struct net_device *ndev; 2261 2262 spin_lock_irqsave(&pdata->netdev_lock, flags); 2263 ndev = rcu_dereference_protected( 2264 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2265 if (ndev) { 2266 spin_lock(&ndev_hash_lock); 2267 hash_del_rcu(&pdata->ndev_hash_link); 2268 spin_unlock(&ndev_hash_lock); 2269 2270 /* 2271 * If this is the last dev_put there is still a 2272 * synchronize_rcu before the netdev is kfreed, so we 2273 * can continue to rely on unlocked pointer 2274 * comparisons after the put 2275 */ 2276 rcu_assign_pointer(pdata->netdev, NULL); 2277 netdev_put(ndev, &pdata->netdev_tracker); 2278 } 2279 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2280 } 2281 } 2282 2283 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2284 u32 port) 2285 { 2286 struct ib_port_data *pdata; 2287 struct net_device *res; 2288 2289 if (!rdma_is_port_valid(ib_dev, port)) 2290 return NULL; 2291 2292 if (!ib_dev->port_data) 2293 return NULL; 2294 2295 pdata = &ib_dev->port_data[port]; 2296 2297 /* 2298 * New drivers should use ib_device_set_netdev() not the legacy 2299 * get_netdev(). 2300 */ 2301 if (ib_dev->ops.get_netdev) 2302 res = ib_dev->ops.get_netdev(ib_dev, port); 2303 else { 2304 spin_lock(&pdata->netdev_lock); 2305 res = rcu_dereference_protected( 2306 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2307 dev_hold(res); 2308 spin_unlock(&pdata->netdev_lock); 2309 } 2310 2311 return res; 2312 } 2313 EXPORT_SYMBOL(ib_device_get_netdev); 2314 2315 /** 2316 * ib_query_netdev_port - Query the port number of a net_device 2317 * associated with an ibdev 2318 * @ibdev: IB device 2319 * @ndev: Network device 2320 * @port: IB port the net_device is connected to 2321 */ 2322 int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, 2323 u32 *port) 2324 { 2325 struct net_device *ib_ndev; 2326 u32 port_num; 2327 2328 rdma_for_each_port(ibdev, port_num) { 2329 ib_ndev = ib_device_get_netdev(ibdev, port_num); 2330 if (ndev == ib_ndev) { 2331 *port = port_num; 2332 dev_put(ib_ndev); 2333 return 0; 2334 } 2335 dev_put(ib_ndev); 2336 } 2337 2338 return -ENOENT; 2339 } 2340 EXPORT_SYMBOL(ib_query_netdev_port); 2341 2342 /** 2343 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2344 * @ndev: netdev to locate 2345 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2346 * 2347 * Find and hold an ib_device that is associated with a netdev via 2348 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2349 * returned pointer. 2350 */ 2351 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2352 enum rdma_driver_id driver_id) 2353 { 2354 struct ib_device *res = NULL; 2355 struct ib_port_data *cur; 2356 2357 rcu_read_lock(); 2358 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2359 (uintptr_t)ndev) { 2360 if (rcu_access_pointer(cur->netdev) == ndev && 2361 (driver_id == RDMA_DRIVER_UNKNOWN || 2362 cur->ib_dev->ops.driver_id == driver_id) && 2363 ib_device_try_get(cur->ib_dev)) { 2364 res = cur->ib_dev; 2365 break; 2366 } 2367 } 2368 rcu_read_unlock(); 2369 2370 return res; 2371 } 2372 EXPORT_SYMBOL(ib_device_get_by_netdev); 2373 2374 /** 2375 * ib_enum_roce_netdev - enumerate all RoCE ports 2376 * @ib_dev : IB device we want to query 2377 * @filter: Should we call the callback? 2378 * @filter_cookie: Cookie passed to filter 2379 * @cb: Callback to call for each found RoCE ports 2380 * @cookie: Cookie passed back to the callback 2381 * 2382 * Enumerates all of the physical RoCE ports of ib_dev 2383 * which are related to netdevice and calls callback() on each 2384 * device for which filter() function returns non zero. 2385 */ 2386 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2387 roce_netdev_filter filter, 2388 void *filter_cookie, 2389 roce_netdev_callback cb, 2390 void *cookie) 2391 { 2392 u32 port; 2393 2394 rdma_for_each_port (ib_dev, port) 2395 if (rdma_protocol_roce(ib_dev, port)) { 2396 struct net_device *idev = 2397 ib_device_get_netdev(ib_dev, port); 2398 2399 if (filter(ib_dev, port, idev, filter_cookie)) 2400 cb(ib_dev, port, idev, cookie); 2401 dev_put(idev); 2402 } 2403 } 2404 2405 /** 2406 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2407 * @filter: Should we call the callback? 2408 * @filter_cookie: Cookie passed to filter 2409 * @cb: Callback to call for each found RoCE ports 2410 * @cookie: Cookie passed back to the callback 2411 * 2412 * Enumerates all RoCE devices' physical ports which are related 2413 * to netdevices and calls callback() on each device for which 2414 * filter() function returns non zero. 2415 */ 2416 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2417 void *filter_cookie, 2418 roce_netdev_callback cb, 2419 void *cookie) 2420 { 2421 struct ib_device *dev; 2422 unsigned long index; 2423 2424 down_read(&devices_rwsem); 2425 xa_for_each_marked(&devices, index, dev, DEVICE_GID_UPDATES) 2426 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2427 up_read(&devices_rwsem); 2428 } 2429 2430 /** 2431 * ib_device_enable_gid_updates - Mark device as ready for GID cache updates 2432 * @device: Device to mark 2433 * 2434 * Called after GID table is allocated and initialized. After this mark is set, 2435 * netdevice event handlers can update the device's GID cache. This allows 2436 * events that arrive during device registration to be processed, avoiding 2437 * stale GID entries when netdev properties change during the device 2438 * registration process. 2439 */ 2440 void ib_device_enable_gid_updates(struct ib_device *device) 2441 { 2442 down_write(&devices_rwsem); 2443 xa_set_mark(&devices, device->index, DEVICE_GID_UPDATES); 2444 up_write(&devices_rwsem); 2445 } 2446 2447 /** 2448 * ib_device_disable_gid_updates - Clear the GID updates mark 2449 * @device: Device to unmark 2450 * 2451 * Called before GID table cleanup to prevent event handlers from accessing 2452 * the device while it's being torn down. 2453 */ 2454 void ib_device_disable_gid_updates(struct ib_device *device) 2455 { 2456 down_write(&devices_rwsem); 2457 xa_clear_mark(&devices, device->index, DEVICE_GID_UPDATES); 2458 up_write(&devices_rwsem); 2459 } 2460 2461 /* 2462 * ib_enum_all_devs - enumerate all ib_devices 2463 * @cb: Callback to call for each found ib_device 2464 * 2465 * Enumerates all ib_devices and calls callback() on each device. 2466 */ 2467 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2468 struct netlink_callback *cb) 2469 { 2470 unsigned long index; 2471 struct ib_device *dev; 2472 unsigned int idx = 0; 2473 int ret = 0; 2474 2475 down_read(&devices_rwsem); 2476 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2477 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2478 continue; 2479 2480 ret = nldev_cb(dev, skb, cb, idx); 2481 if (ret) 2482 break; 2483 idx++; 2484 } 2485 up_read(&devices_rwsem); 2486 return ret; 2487 } 2488 2489 /** 2490 * ib_query_pkey - Get P_Key table entry 2491 * @device:Device to query 2492 * @port_num:Port number to query 2493 * @index:P_Key table index to query 2494 * @pkey:Returned P_Key 2495 * 2496 * ib_query_pkey() fetches the specified P_Key table entry. 2497 */ 2498 int ib_query_pkey(struct ib_device *device, 2499 u32 port_num, u16 index, u16 *pkey) 2500 { 2501 if (!rdma_is_port_valid(device, port_num)) 2502 return -EINVAL; 2503 2504 if (!device->ops.query_pkey) 2505 return -EOPNOTSUPP; 2506 2507 return device->ops.query_pkey(device, port_num, index, pkey); 2508 } 2509 EXPORT_SYMBOL(ib_query_pkey); 2510 2511 /** 2512 * ib_modify_device - Change IB device attributes 2513 * @device:Device to modify 2514 * @device_modify_mask:Mask of attributes to change 2515 * @device_modify:New attribute values 2516 * 2517 * ib_modify_device() changes a device's attributes as specified by 2518 * the @device_modify_mask and @device_modify structure. 2519 */ 2520 int ib_modify_device(struct ib_device *device, 2521 int device_modify_mask, 2522 struct ib_device_modify *device_modify) 2523 { 2524 if (!device->ops.modify_device) 2525 return -EOPNOTSUPP; 2526 2527 return device->ops.modify_device(device, device_modify_mask, 2528 device_modify); 2529 } 2530 EXPORT_SYMBOL(ib_modify_device); 2531 2532 /** 2533 * ib_modify_port - Modifies the attributes for the specified port. 2534 * @device: The device to modify. 2535 * @port_num: The number of the port to modify. 2536 * @port_modify_mask: Mask used to specify which attributes of the port 2537 * to change. 2538 * @port_modify: New attribute values for the port. 2539 * 2540 * ib_modify_port() changes a port's attributes as specified by the 2541 * @port_modify_mask and @port_modify structure. 2542 */ 2543 int ib_modify_port(struct ib_device *device, 2544 u32 port_num, int port_modify_mask, 2545 struct ib_port_modify *port_modify) 2546 { 2547 int rc; 2548 2549 if (!rdma_is_port_valid(device, port_num)) 2550 return -EINVAL; 2551 2552 if (device->ops.modify_port) 2553 rc = device->ops.modify_port(device, port_num, 2554 port_modify_mask, 2555 port_modify); 2556 else if (rdma_protocol_roce(device, port_num) && 2557 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2558 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2559 rc = 0; 2560 else 2561 rc = -EOPNOTSUPP; 2562 return rc; 2563 } 2564 EXPORT_SYMBOL(ib_modify_port); 2565 2566 /** 2567 * ib_find_gid - Returns the port number and GID table index where 2568 * a specified GID value occurs. Its searches only for IB link layer. 2569 * @device: The device to query. 2570 * @gid: The GID value to search for. 2571 * @port_num: The port number of the device where the GID value was found. 2572 * @index: The index into the GID table where the GID was found. This 2573 * parameter may be NULL. 2574 */ 2575 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2576 u32 *port_num, u16 *index) 2577 { 2578 union ib_gid tmp_gid; 2579 u32 port; 2580 int ret, i; 2581 2582 rdma_for_each_port (device, port) { 2583 if (!rdma_protocol_ib(device, port)) 2584 continue; 2585 2586 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2587 ++i) { 2588 ret = rdma_query_gid(device, port, i, &tmp_gid); 2589 if (ret) 2590 continue; 2591 2592 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2593 *port_num = port; 2594 if (index) 2595 *index = i; 2596 return 0; 2597 } 2598 } 2599 } 2600 2601 return -ENOENT; 2602 } 2603 EXPORT_SYMBOL(ib_find_gid); 2604 2605 /** 2606 * ib_find_pkey - Returns the PKey table index where a specified 2607 * PKey value occurs. 2608 * @device: The device to query. 2609 * @port_num: The port number of the device to search for the PKey. 2610 * @pkey: The PKey value to search for. 2611 * @index: The index into the PKey table where the PKey was found. 2612 */ 2613 int ib_find_pkey(struct ib_device *device, 2614 u32 port_num, u16 pkey, u16 *index) 2615 { 2616 int ret, i; 2617 u16 tmp_pkey; 2618 int partial_ix = -1; 2619 2620 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2621 ++i) { 2622 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2623 if (ret) 2624 return ret; 2625 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2626 /* if there is full-member pkey take it.*/ 2627 if (tmp_pkey & 0x8000) { 2628 *index = i; 2629 return 0; 2630 } 2631 if (partial_ix < 0) 2632 partial_ix = i; 2633 } 2634 } 2635 2636 /*no full-member, if exists take the limited*/ 2637 if (partial_ix >= 0) { 2638 *index = partial_ix; 2639 return 0; 2640 } 2641 return -ENOENT; 2642 } 2643 EXPORT_SYMBOL(ib_find_pkey); 2644 2645 /** 2646 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2647 * for a received CM request 2648 * @dev: An RDMA device on which the request has been received. 2649 * @port: Port number on the RDMA device. 2650 * @pkey: The Pkey the request came on. 2651 * @gid: A GID that the net_dev uses to communicate. 2652 * @addr: Contains the IP address that the request specified as its 2653 * destination. 2654 * 2655 */ 2656 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2657 u32 port, 2658 u16 pkey, 2659 const union ib_gid *gid, 2660 const struct sockaddr *addr) 2661 { 2662 struct net_device *net_dev = NULL; 2663 unsigned long index; 2664 void *client_data; 2665 2666 if (!rdma_protocol_ib(dev, port)) 2667 return NULL; 2668 2669 /* 2670 * Holding the read side guarantees that the client will not become 2671 * unregistered while we are calling get_net_dev_by_params() 2672 */ 2673 down_read(&dev->client_data_rwsem); 2674 xan_for_each_marked (&dev->client_data, index, client_data, 2675 CLIENT_DATA_REGISTERED) { 2676 struct ib_client *client = xa_load(&clients, index); 2677 2678 if (!client || !client->get_net_dev_by_params) 2679 continue; 2680 2681 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2682 addr, client_data); 2683 if (net_dev) 2684 break; 2685 } 2686 up_read(&dev->client_data_rwsem); 2687 2688 return net_dev; 2689 } 2690 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2691 2692 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2693 { 2694 struct ib_device_ops *dev_ops = &dev->ops; 2695 #define SET_DEVICE_OP(ptr, name) \ 2696 do { \ 2697 if (ops->name) \ 2698 if (!((ptr)->name)) \ 2699 (ptr)->name = ops->name; \ 2700 } while (0) 2701 2702 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2703 2704 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2705 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2706 dev_ops->driver_id != ops->driver_id); 2707 dev_ops->driver_id = ops->driver_id; 2708 } 2709 if (ops->owner) { 2710 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2711 dev_ops->owner = ops->owner; 2712 } 2713 if (ops->uverbs_abi_ver) 2714 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2715 2716 dev_ops->uverbs_no_driver_id_binding |= 2717 ops->uverbs_no_driver_id_binding; 2718 dev_ops->uverbs_robust_udata |= ops->uverbs_robust_udata; 2719 2720 SET_DEVICE_OP(dev_ops, add_gid); 2721 SET_DEVICE_OP(dev_ops, add_sub_dev); 2722 SET_DEVICE_OP(dev_ops, advise_mr); 2723 SET_DEVICE_OP(dev_ops, alloc_dm); 2724 SET_DEVICE_OP(dev_ops, alloc_dmah); 2725 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2726 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2727 SET_DEVICE_OP(dev_ops, alloc_mr); 2728 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2729 SET_DEVICE_OP(dev_ops, alloc_mw); 2730 SET_DEVICE_OP(dev_ops, alloc_pd); 2731 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2732 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2733 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2734 SET_DEVICE_OP(dev_ops, attach_mcast); 2735 SET_DEVICE_OP(dev_ops, check_mr_status); 2736 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2737 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2738 SET_DEVICE_OP(dev_ops, counter_dealloc); 2739 SET_DEVICE_OP(dev_ops, counter_init); 2740 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2741 SET_DEVICE_OP(dev_ops, counter_update_stats); 2742 SET_DEVICE_OP(dev_ops, create_ah); 2743 SET_DEVICE_OP(dev_ops, create_counters); 2744 SET_DEVICE_OP(dev_ops, create_cq); 2745 SET_DEVICE_OP(dev_ops, create_user_cq); 2746 SET_DEVICE_OP(dev_ops, create_flow); 2747 SET_DEVICE_OP(dev_ops, create_qp); 2748 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2749 SET_DEVICE_OP(dev_ops, create_srq); 2750 SET_DEVICE_OP(dev_ops, create_user_ah); 2751 SET_DEVICE_OP(dev_ops, create_wq); 2752 SET_DEVICE_OP(dev_ops, dealloc_dm); 2753 SET_DEVICE_OP(dev_ops, dealloc_dmah); 2754 SET_DEVICE_OP(dev_ops, dealloc_driver); 2755 SET_DEVICE_OP(dev_ops, dealloc_mw); 2756 SET_DEVICE_OP(dev_ops, dealloc_pd); 2757 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2758 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2759 SET_DEVICE_OP(dev_ops, del_gid); 2760 SET_DEVICE_OP(dev_ops, del_sub_dev); 2761 SET_DEVICE_OP(dev_ops, dereg_mr); 2762 SET_DEVICE_OP(dev_ops, destroy_ah); 2763 SET_DEVICE_OP(dev_ops, destroy_counters); 2764 SET_DEVICE_OP(dev_ops, destroy_cq); 2765 SET_DEVICE_OP(dev_ops, destroy_flow); 2766 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2767 SET_DEVICE_OP(dev_ops, destroy_qp); 2768 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2769 SET_DEVICE_OP(dev_ops, destroy_srq); 2770 SET_DEVICE_OP(dev_ops, destroy_wq); 2771 SET_DEVICE_OP(dev_ops, device_group); 2772 SET_DEVICE_OP(dev_ops, detach_mcast); 2773 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2774 SET_DEVICE_OP(dev_ops, drain_rq); 2775 SET_DEVICE_OP(dev_ops, drain_sq); 2776 SET_DEVICE_OP(dev_ops, enable_driver); 2777 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2778 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2779 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2780 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2781 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2782 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2783 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2784 SET_DEVICE_OP(dev_ops, fill_res_srq_entry); 2785 SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); 2786 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2787 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2788 SET_DEVICE_OP(dev_ops, get_dma_mr); 2789 SET_DEVICE_OP(dev_ops, get_hw_stats); 2790 SET_DEVICE_OP(dev_ops, get_link_layer); 2791 SET_DEVICE_OP(dev_ops, get_netdev); 2792 SET_DEVICE_OP(dev_ops, get_numa_node); 2793 SET_DEVICE_OP(dev_ops, get_port_immutable); 2794 SET_DEVICE_OP(dev_ops, get_vf_config); 2795 SET_DEVICE_OP(dev_ops, get_vf_guid); 2796 SET_DEVICE_OP(dev_ops, get_vf_stats); 2797 SET_DEVICE_OP(dev_ops, iw_accept); 2798 SET_DEVICE_OP(dev_ops, iw_add_ref); 2799 SET_DEVICE_OP(dev_ops, iw_connect); 2800 SET_DEVICE_OP(dev_ops, iw_create_listen); 2801 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2802 SET_DEVICE_OP(dev_ops, iw_get_qp); 2803 SET_DEVICE_OP(dev_ops, iw_reject); 2804 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2805 SET_DEVICE_OP(dev_ops, map_mr_sg); 2806 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2807 SET_DEVICE_OP(dev_ops, mmap); 2808 SET_DEVICE_OP(dev_ops, mmap_get_pfns); 2809 SET_DEVICE_OP(dev_ops, mmap_free); 2810 SET_DEVICE_OP(dev_ops, modify_ah); 2811 SET_DEVICE_OP(dev_ops, modify_cq); 2812 SET_DEVICE_OP(dev_ops, modify_device); 2813 SET_DEVICE_OP(dev_ops, modify_hw_stat); 2814 SET_DEVICE_OP(dev_ops, modify_port); 2815 SET_DEVICE_OP(dev_ops, modify_qp); 2816 SET_DEVICE_OP(dev_ops, modify_srq); 2817 SET_DEVICE_OP(dev_ops, modify_wq); 2818 SET_DEVICE_OP(dev_ops, peek_cq); 2819 SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry); 2820 SET_DEVICE_OP(dev_ops, pre_destroy_cq); 2821 SET_DEVICE_OP(dev_ops, poll_cq); 2822 SET_DEVICE_OP(dev_ops, port_groups); 2823 SET_DEVICE_OP(dev_ops, post_destroy_cq); 2824 SET_DEVICE_OP(dev_ops, post_recv); 2825 SET_DEVICE_OP(dev_ops, post_send); 2826 SET_DEVICE_OP(dev_ops, post_srq_recv); 2827 SET_DEVICE_OP(dev_ops, process_mad); 2828 SET_DEVICE_OP(dev_ops, query_ah); 2829 SET_DEVICE_OP(dev_ops, query_device); 2830 SET_DEVICE_OP(dev_ops, query_gid); 2831 SET_DEVICE_OP(dev_ops, query_pkey); 2832 SET_DEVICE_OP(dev_ops, query_port); 2833 SET_DEVICE_OP(dev_ops, query_port_speed); 2834 SET_DEVICE_OP(dev_ops, query_qp); 2835 SET_DEVICE_OP(dev_ops, query_srq); 2836 SET_DEVICE_OP(dev_ops, query_ucontext); 2837 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2838 SET_DEVICE_OP(dev_ops, read_counters); 2839 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2840 SET_DEVICE_OP(dev_ops, reg_user_mr); 2841 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2842 SET_DEVICE_OP(dev_ops, req_notify_cq); 2843 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2844 SET_DEVICE_OP(dev_ops, resize_user_cq); 2845 SET_DEVICE_OP(dev_ops, set_vf_guid); 2846 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2847 SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); 2848 SET_DEVICE_OP(dev_ops, report_port_event); 2849 2850 SET_OBJ_SIZE(dev_ops, ib_ah); 2851 SET_OBJ_SIZE(dev_ops, ib_counters); 2852 SET_OBJ_SIZE(dev_ops, ib_cq); 2853 SET_OBJ_SIZE(dev_ops, ib_dmah); 2854 SET_OBJ_SIZE(dev_ops, ib_mw); 2855 SET_OBJ_SIZE(dev_ops, ib_pd); 2856 SET_OBJ_SIZE(dev_ops, ib_qp); 2857 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2858 SET_OBJ_SIZE(dev_ops, ib_srq); 2859 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2860 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2861 SET_OBJ_SIZE(dev_ops, rdma_counter); 2862 } 2863 EXPORT_SYMBOL(ib_set_device_ops); 2864 2865 int ib_add_sub_device(struct ib_device *parent, 2866 enum rdma_nl_dev_type type, 2867 const char *name) 2868 { 2869 struct ib_device *sub; 2870 int ret = 0; 2871 2872 if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) 2873 return -EOPNOTSUPP; 2874 2875 if (!ib_device_try_get(parent)) 2876 return -EINVAL; 2877 2878 sub = parent->ops.add_sub_dev(parent, type, name); 2879 if (IS_ERR(sub)) { 2880 ib_device_put(parent); 2881 return PTR_ERR(sub); 2882 } 2883 2884 sub->type = type; 2885 sub->parent = parent; 2886 2887 mutex_lock(&parent->subdev_lock); 2888 list_add_tail(&parent->subdev_list_head, &sub->subdev_list); 2889 mutex_unlock(&parent->subdev_lock); 2890 2891 return ret; 2892 } 2893 2894 int ib_del_sub_device_and_put(struct ib_device *sub) 2895 { 2896 struct ib_device *parent = sub->parent; 2897 2898 if (!parent) { 2899 ib_device_put(sub); 2900 return -EOPNOTSUPP; 2901 } 2902 2903 mutex_lock(&parent->subdev_lock); 2904 list_del(&sub->subdev_list); 2905 mutex_unlock(&parent->subdev_lock); 2906 2907 ib_device_put(sub); 2908 parent->ops.del_sub_dev(sub); 2909 ib_device_put(parent); 2910 2911 return 0; 2912 } 2913 2914 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2915 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2916 { 2917 struct scatterlist *s; 2918 int i; 2919 2920 for_each_sg(sg, s, nents, i) { 2921 sg_dma_address(s) = (uintptr_t)sg_virt(s); 2922 sg_dma_len(s) = s->length; 2923 } 2924 return nents; 2925 } 2926 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2927 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2928 2929 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2930 [RDMA_NL_LS_OP_RESOLVE] = { 2931 .doit = ib_nl_handle_resolve_resp, 2932 .flags = RDMA_NL_ADMIN_PERM, 2933 }, 2934 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2935 .doit = ib_nl_handle_set_timeout, 2936 .flags = RDMA_NL_ADMIN_PERM, 2937 }, 2938 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2939 .doit = ib_nl_handle_ip_res_resp, 2940 .flags = RDMA_NL_ADMIN_PERM, 2941 }, 2942 }; 2943 2944 void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) 2945 { 2946 enum ib_port_state curr_state; 2947 struct ib_event ibevent = {}; 2948 u32 port; 2949 2950 if (ib_query_netdev_port(ibdev, ndev, &port)) 2951 return; 2952 2953 curr_state = ib_get_curr_port_state(ndev); 2954 2955 write_lock_irq(&ibdev->cache_lock); 2956 if (ibdev->port_data[port].cache.last_port_state == curr_state) { 2957 write_unlock_irq(&ibdev->cache_lock); 2958 return; 2959 } 2960 ibdev->port_data[port].cache.last_port_state = curr_state; 2961 write_unlock_irq(&ibdev->cache_lock); 2962 2963 ibevent.event = (curr_state == IB_PORT_DOWN) ? 2964 IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; 2965 ibevent.device = ibdev; 2966 ibevent.element.port_num = port; 2967 ib_dispatch_event(&ibevent); 2968 } 2969 EXPORT_SYMBOL(ib_dispatch_port_state_event); 2970 2971 static void handle_port_event(struct net_device *ndev, unsigned long event) 2972 { 2973 struct ib_device *ibdev; 2974 2975 /* Currently, link events in bonding scenarios are still 2976 * reported by drivers that support bonding. 2977 */ 2978 if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) 2979 return; 2980 2981 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2982 if (!ibdev) 2983 return; 2984 2985 if (ibdev->ops.report_port_event) { 2986 ibdev->ops.report_port_event(ibdev, ndev, event); 2987 goto put_ibdev; 2988 } 2989 2990 ib_dispatch_port_state_event(ibdev, ndev); 2991 2992 put_ibdev: 2993 ib_device_put(ibdev); 2994 }; 2995 2996 static int ib_netdevice_event(struct notifier_block *this, 2997 unsigned long event, void *ptr) 2998 { 2999 struct net_device *ndev = netdev_notifier_info_to_dev(ptr); 3000 struct ib_device *ibdev; 3001 u32 port; 3002 3003 switch (event) { 3004 case NETDEV_CHANGENAME: 3005 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 3006 if (!ibdev) 3007 return NOTIFY_DONE; 3008 3009 if (ib_query_netdev_port(ibdev, ndev, &port)) { 3010 ib_device_put(ibdev); 3011 break; 3012 } 3013 3014 rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); 3015 ib_device_put(ibdev); 3016 break; 3017 3018 case NETDEV_UP: 3019 case NETDEV_CHANGE: 3020 case NETDEV_DOWN: 3021 handle_port_event(ndev, event); 3022 break; 3023 3024 default: 3025 break; 3026 } 3027 3028 return NOTIFY_DONE; 3029 } 3030 3031 static struct notifier_block nb_netdevice = { 3032 .notifier_call = ib_netdevice_event, 3033 }; 3034 3035 static int __init ib_core_init(void) 3036 { 3037 int ret = -ENOMEM; 3038 3039 ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0); 3040 if (!ib_wq) 3041 return -ENOMEM; 3042 3043 ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, 3044 WQ_UNBOUND_MAX_ACTIVE); 3045 if (!ib_unreg_wq) 3046 goto err; 3047 3048 ib_comp_wq = alloc_workqueue("ib-comp-wq", 3049 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0); 3050 if (!ib_comp_wq) 3051 goto err_unbound; 3052 3053 ib_comp_unbound_wq = 3054 alloc_workqueue("ib-comp-unb-wq", 3055 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 3056 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 3057 if (!ib_comp_unbound_wq) 3058 goto err_comp; 3059 3060 ret = class_register(&ib_class); 3061 if (ret) { 3062 pr_warn("Couldn't create InfiniBand device class\n"); 3063 goto err_comp_unbound; 3064 } 3065 3066 rdma_nl_init(); 3067 3068 ret = addr_init(); 3069 if (ret) { 3070 pr_warn("Couldn't init IB address resolution\n"); 3071 goto err_ibnl; 3072 } 3073 3074 ret = ib_mad_init(); 3075 if (ret) { 3076 pr_warn("Couldn't init IB MAD\n"); 3077 goto err_addr; 3078 } 3079 3080 ret = ib_sa_init(); 3081 if (ret) { 3082 pr_warn("Couldn't init SA\n"); 3083 goto err_mad; 3084 } 3085 3086 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 3087 if (ret) { 3088 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 3089 goto err_sa; 3090 } 3091 3092 ret = register_pernet_device(&rdma_dev_net_ops); 3093 if (ret) { 3094 pr_warn("Couldn't init compat dev. ret %d\n", ret); 3095 goto err_compat; 3096 } 3097 3098 nldev_init(); 3099 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 3100 ret = roce_gid_mgmt_init(); 3101 if (ret) { 3102 pr_warn("Couldn't init RoCE GID management\n"); 3103 goto err_parent; 3104 } 3105 3106 register_netdevice_notifier(&nb_netdevice); 3107 3108 return 0; 3109 3110 err_parent: 3111 rdma_nl_unregister(RDMA_NL_LS); 3112 nldev_exit(); 3113 unregister_pernet_device(&rdma_dev_net_ops); 3114 err_compat: 3115 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3116 err_sa: 3117 ib_sa_cleanup(); 3118 err_mad: 3119 ib_mad_cleanup(); 3120 err_addr: 3121 addr_cleanup(); 3122 err_ibnl: 3123 class_unregister(&ib_class); 3124 err_comp_unbound: 3125 destroy_workqueue(ib_comp_unbound_wq); 3126 err_comp: 3127 destroy_workqueue(ib_comp_wq); 3128 err_unbound: 3129 destroy_workqueue(ib_unreg_wq); 3130 err: 3131 destroy_workqueue(ib_wq); 3132 return ret; 3133 } 3134 3135 static void __exit ib_core_cleanup(void) 3136 { 3137 unregister_netdevice_notifier(&nb_netdevice); 3138 roce_gid_mgmt_cleanup(); 3139 rdma_nl_unregister(RDMA_NL_LS); 3140 nldev_exit(); 3141 unregister_pernet_device(&rdma_dev_net_ops); 3142 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3143 ib_sa_cleanup(); 3144 ib_mad_cleanup(); 3145 addr_cleanup(); 3146 rdma_nl_exit(); 3147 class_unregister(&ib_class); 3148 destroy_workqueue(ib_comp_unbound_wq); 3149 destroy_workqueue(ib_comp_wq); 3150 /* Make sure that any pending umem accounting work is done. */ 3151 destroy_workqueue(ib_wq); 3152 destroy_workqueue(ib_unreg_wq); 3153 WARN_ON(!xa_empty(&clients)); 3154 WARN_ON(!xa_empty(&devices)); 3155 } 3156 3157 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 3158 3159 /* ib core relies on netdev stack to first register net_ns_type_operations 3160 * ns kobject type before ib_core initialization. 3161 */ 3162 fs_initcall(ib_core_init); 3163 module_exit(ib_core_cleanup); 3164