1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <rdma/rdma_netlink.h> 46 #include <rdma/ib_addr.h> 47 #include <rdma/ib_cache.h> 48 #include <rdma/rdma_counter.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 static struct workqueue_struct *ib_unreg_wq; 62 63 /* 64 * Each of the three rwsem locks (devices, clients, client_data) protects the 65 * xarray of the same name. Specifically it allows the caller to assert that 66 * the MARK will/will not be changing under the lock, and for devices and 67 * clients, that the value in the xarray is still a valid pointer. Change of 68 * the MARK is linked to the object state, so holding the lock and testing the 69 * MARK also asserts that the contained object is in a certain state. 70 * 71 * This is used to build a two stage register/unregister flow where objects 72 * can continue to be in the xarray even though they are still in progress to 73 * register/unregister. 74 * 75 * The xarray itself provides additional locking, and restartable iteration, 76 * which is also relied on. 77 * 78 * Locks should not be nested, with the exception of client_data, which is 79 * allowed to nest under the read side of the other two locks. 80 * 81 * The devices_rwsem also protects the device name list, any change or 82 * assignment of device name must also hold the write side to guarantee unique 83 * names. 84 */ 85 86 /* 87 * devices contains devices that have had their names assigned. 
 * The devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
	if (refcount_dec_and_test(&client->uses))
		complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 * from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access is to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, the rdma device's
 * net namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/**
 * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has
 * CAP_NET_RAW capability or not.
 *
 * @dev: Pointer to rdma device whose capability is to be checked
 *
 * Returns true if an rdma device's owning user namespace has CAP_NET_RAW
 * capability, otherwise false. When the rdma subsystem is in legacy shared
 * network namespace mode, the default net namespace is considered.
 */
bool rdma_dev_has_raw_cap(const struct ib_device *dev)
{
	const struct net *net;

	/* Network namespace is the resource whose user namespace is to be
	 * considered. When in shared mode, there is no reliable network
	 * namespace resource, so consider the default net namespace.
	 */
	if (ib_devices_shared_netns)
		net = &init_net;
	else
		net = read_pnet(&dev->coredev.rdma_net);

	return ns_capable(net->user_ns, CAP_NET_RAW);
}
EXPORT_SYMBOL(rdma_dev_has_raw_cap);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
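 *
 * A minimal usage sketch (hypothetical caller; do_something() is only a
 * placeholder). Note that entry may legitimately be NULL here:
 *
 *	unsigned long index;
 *	void *entry;
 *
 *	xan_for_each_marked(&device->client_data, index, entry,
 *			    CLIENT_DATA_REGISTERED)
 *		do_something(index, entry);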
181 */ 182 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 183 xa_mark_t filter) 184 { 185 XA_STATE(xas, xa, *indexp); 186 void *entry; 187 188 rcu_read_lock(); 189 do { 190 entry = xas_find_marked(&xas, ULONG_MAX, filter); 191 if (xa_is_zero(entry)) 192 break; 193 } while (xas_retry(&xas, entry)); 194 rcu_read_unlock(); 195 196 if (entry) { 197 *indexp = xas.xa_index; 198 if (xa_is_zero(entry)) 199 return NULL; 200 return entry; 201 } 202 return XA_ERROR(-ENOENT); 203 } 204 #define xan_for_each_marked(xa, index, entry, filter) \ 205 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 206 !xa_is_err(entry); \ 207 (index)++, entry = xan_find_marked(xa, &(index), filter)) 208 209 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 210 static DEFINE_SPINLOCK(ndev_hash_lock); 211 static DECLARE_HASHTABLE(ndev_hash, 5); 212 213 static void free_netdevs(struct ib_device *ib_dev); 214 static void ib_unregister_work(struct work_struct *work); 215 static void __ib_unregister_device(struct ib_device *device); 216 static int ib_security_change(struct notifier_block *nb, unsigned long event, 217 void *lsm_data); 218 static void ib_policy_change_task(struct work_struct *work); 219 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 220 221 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 222 struct va_format *vaf) 223 { 224 if (ibdev && ibdev->dev.parent) 225 dev_printk_emit(level[1] - '0', 226 ibdev->dev.parent, 227 "%s %s %s: %pV", 228 dev_driver_string(ibdev->dev.parent), 229 dev_name(ibdev->dev.parent), 230 dev_name(&ibdev->dev), 231 vaf); 232 else if (ibdev) 233 printk("%s%s: %pV", 234 level, dev_name(&ibdev->dev), vaf); 235 else 236 printk("%s(NULL ib_device): %pV", level, vaf); 237 } 238 239 #define define_ibdev_printk_level(func, level) \ 240 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 241 { \ 242 struct va_format vaf; \ 243 va_list args; \ 244 \ 245 va_start(args, fmt); \ 246 \ 247 vaf.fmt = fmt; \ 248 vaf.va = &args; \ 249 \ 250 __ibdev_printk(level, ibdev, &vaf); \ 251 \ 252 va_end(args); \ 253 } \ 254 EXPORT_SYMBOL(func); 255 256 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 257 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 258 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 259 define_ibdev_printk_level(ibdev_err, KERN_ERR); 260 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 261 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 262 define_ibdev_printk_level(ibdev_info, KERN_INFO); 263 264 static struct notifier_block ibdev_lsm_nb = { 265 .notifier_call = ib_security_change, 266 }; 267 268 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 269 struct net *net); 270 271 /* Pointer to the RCU head at the start of the ib_port_data array */ 272 struct ib_port_data_rcu { 273 struct rcu_head rcu_head; 274 struct ib_port_data pdata[]; 275 }; 276 277 static void ib_device_check_mandatory(struct ib_device *device) 278 { 279 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 280 static const struct { 281 size_t offset; 282 char *name; 283 } mandatory_table[] = { 284 IB_MANDATORY_FUNC(query_device), 285 IB_MANDATORY_FUNC(query_port), 286 IB_MANDATORY_FUNC(alloc_pd), 287 IB_MANDATORY_FUNC(dealloc_pd), 288 IB_MANDATORY_FUNC(create_qp), 289 IB_MANDATORY_FUNC(modify_qp), 290 IB_MANDATORY_FUNC(destroy_qp), 291 IB_MANDATORY_FUNC(post_send), 292 IB_MANDATORY_FUNC(post_recv), 293 IB_MANDATORY_FUNC(create_cq), 294 IB_MANDATORY_FUNC(destroy_cq), 295 IB_MANDATORY_FUNC(poll_cq), 296 IB_MANDATORY_FUNC(req_notify_cq), 297 IB_MANDATORY_FUNC(get_dma_mr), 298 IB_MANDATORY_FUNC(reg_user_mr), 299 IB_MANDATORY_FUNC(dereg_mr), 300 IB_MANDATORY_FUNC(get_port_immutable) 301 }; 302 int i; 303 304 device->kverbs_provider = true; 305 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 306 if (!*(void **) ((void *) &device->ops + 307 mandatory_table[i].offset)) { 308 device->kverbs_provider = false; 309 break; 310 } 311 } 312 } 313 314 /* 315 * Caller must perform ib_device_put() to return the device reference count 316 * when ib_device_get_by_index() returns valid device pointer. 317 */ 318 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 319 { 320 struct ib_device *device; 321 322 down_read(&devices_rwsem); 323 device = xa_load(&devices, index); 324 if (device) { 325 if (!rdma_dev_access_netns(device, net)) { 326 device = NULL; 327 goto out; 328 } 329 330 if (!ib_device_try_get(device)) 331 device = NULL; 332 } 333 out: 334 up_read(&devices_rwsem); 335 return device; 336 } 337 338 /** 339 * ib_device_put - Release IB device reference 340 * @device: device whose reference to be released 341 * 342 * ib_device_put() releases reference to the IB device to allow it to be 343 * unregistered and eventually free. 
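 *
 * A minimal sketch of the expected get/put pairing, using the public
 * ib_device_try_get() helper (error handling trimmed):
 *
 *	if (!ib_device_try_get(device))
 *		return -ENODEV;
 *	... device is guaranteed to stay registered here ...
 *	ib_device_put(device);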
344 */ 345 void ib_device_put(struct ib_device *device) 346 { 347 if (refcount_dec_and_test(&device->refcount)) 348 complete(&device->unreg_completion); 349 } 350 EXPORT_SYMBOL(ib_device_put); 351 352 static struct ib_device *__ib_device_get_by_name(const char *name) 353 { 354 struct ib_device *device; 355 unsigned long index; 356 357 xa_for_each (&devices, index, device) 358 if (!strcmp(name, dev_name(&device->dev))) 359 return device; 360 361 return NULL; 362 } 363 364 static int rename_compat_devs(struct ib_device *device) 365 { 366 struct ib_core_device *cdev; 367 unsigned long index; 368 int ret = 0; 369 370 mutex_lock(&device->compat_devs_mutex); 371 xa_for_each (&device->compat_devs, index, cdev) { 372 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 373 if (ret) { 374 dev_warn(&cdev->dev, 375 "Fail to rename compatdev to new name %s\n", 376 dev_name(&device->dev)); 377 break; 378 } 379 } 380 mutex_unlock(&device->compat_devs_mutex); 381 return ret; 382 } 383 384 int ib_device_rename(struct ib_device *ibdev, const char *name) 385 { 386 unsigned long index; 387 void *client_data; 388 int ret; 389 390 down_write(&devices_rwsem); 391 if (!strcmp(name, dev_name(&ibdev->dev))) { 392 up_write(&devices_rwsem); 393 return 0; 394 } 395 396 if (__ib_device_get_by_name(name)) { 397 up_write(&devices_rwsem); 398 return -EEXIST; 399 } 400 401 ret = device_rename(&ibdev->dev, name); 402 if (ret) { 403 up_write(&devices_rwsem); 404 return ret; 405 } 406 407 strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 408 ret = rename_compat_devs(ibdev); 409 410 downgrade_write(&devices_rwsem); 411 down_read(&ibdev->client_data_rwsem); 412 xan_for_each_marked(&ibdev->client_data, index, client_data, 413 CLIENT_DATA_REGISTERED) { 414 struct ib_client *client = xa_load(&clients, index); 415 416 if (!client || !client->rename) 417 continue; 418 419 client->rename(ibdev, client_data); 420 } 421 up_read(&ibdev->client_data_rwsem); 422 rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); 423 up_read(&devices_rwsem); 424 return 0; 425 } 426 427 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 428 { 429 if (use_dim > 1) 430 return -EINVAL; 431 ibdev->use_cq_dim = use_dim; 432 433 return 0; 434 } 435 436 static int alloc_name(struct ib_device *ibdev, const char *name) 437 { 438 struct ib_device *device; 439 unsigned long index; 440 struct ida inuse; 441 int rc; 442 int i; 443 444 lockdep_assert_held_write(&devices_rwsem); 445 ida_init(&inuse); 446 xa_for_each (&devices, index, device) { 447 char buf[IB_DEVICE_NAME_MAX]; 448 449 if (sscanf(dev_name(&device->dev), name, &i) != 1) 450 continue; 451 if (i < 0 || i >= INT_MAX) 452 continue; 453 snprintf(buf, sizeof buf, name, i); 454 if (strcmp(buf, dev_name(&device->dev)) != 0) 455 continue; 456 457 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); 458 if (rc < 0) 459 goto out; 460 } 461 462 rc = ida_alloc(&inuse, GFP_KERNEL); 463 if (rc < 0) 464 goto out; 465 466 rc = dev_set_name(&ibdev->dev, name, rc); 467 out: 468 ida_destroy(&inuse); 469 return rc; 470 } 471 472 static void ib_device_release(struct device *device) 473 { 474 struct ib_device *dev = container_of(device, struct ib_device, dev); 475 476 free_netdevs(dev); 477 WARN_ON(refcount_read(&dev->refcount)); 478 if (dev->hw_stats_data) 479 ib_device_release_hw_stats(dev->hw_stats_data); 480 if (dev->port_data) { 481 ib_cache_release_one(dev); 482 ib_security_release_port_pkey_list(dev); 483 rdma_counter_release(dev); 484 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 485 pdata[0]), 486 
rcu_head); 487 } 488 489 mutex_destroy(&dev->subdev_lock); 490 mutex_destroy(&dev->unregistration_lock); 491 mutex_destroy(&dev->compat_devs_mutex); 492 493 xa_destroy(&dev->compat_devs); 494 xa_destroy(&dev->client_data); 495 kfree_rcu(dev, rcu_head); 496 } 497 498 static int ib_device_uevent(const struct device *device, 499 struct kobj_uevent_env *env) 500 { 501 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 502 return -ENOMEM; 503 504 /* 505 * It would be nice to pass the node GUID with the event... 506 */ 507 508 return 0; 509 } 510 511 static const void *net_namespace(const struct device *d) 512 { 513 const struct ib_core_device *coredev = 514 container_of(d, struct ib_core_device, dev); 515 516 return read_pnet(&coredev->rdma_net); 517 } 518 519 static struct class ib_class = { 520 .name = "infiniband", 521 .dev_release = ib_device_release, 522 .dev_uevent = ib_device_uevent, 523 .ns_type = &net_ns_type_operations, 524 .namespace = net_namespace, 525 }; 526 527 static void rdma_init_coredev(struct ib_core_device *coredev, 528 struct ib_device *dev, struct net *net) 529 { 530 bool is_full_dev = &dev->coredev == coredev; 531 532 /* This BUILD_BUG_ON is intended to catch layout change 533 * of union of ib_core_device and device. 534 * dev must be the first element as ib_core and providers 535 * driver uses it. Adding anything in ib_core_device before 536 * device will break this assumption. 537 */ 538 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 539 offsetof(struct ib_device, dev)); 540 541 coredev->dev.class = &ib_class; 542 coredev->dev.groups = dev->groups; 543 544 /* 545 * Don't expose hw counters outside of the init namespace. 546 */ 547 if (!is_full_dev && dev->hw_stats_attr_index) 548 coredev->dev.groups[dev->hw_stats_attr_index] = NULL; 549 550 device_initialize(&coredev->dev); 551 coredev->owner = dev; 552 INIT_LIST_HEAD(&coredev->port_list); 553 write_pnet(&coredev->rdma_net, net); 554 } 555 556 /** 557 * _ib_alloc_device - allocate an IB device struct 558 * @size:size of structure to allocate 559 * @net: network namespace device should be located in, namespace 560 * must stay valid until ib_register_device() is completed. 561 * 562 * Low-level drivers should use ib_alloc_device() to allocate &struct 563 * ib_device. @size is the size of the structure to be allocated, 564 * including any private data used by the low-level driver. 565 * ib_dealloc_device() must be used to free structures allocated with 566 * ib_alloc_device(). 567 */ 568 struct ib_device *_ib_alloc_device(size_t size, struct net *net) 569 { 570 struct ib_device *device; 571 unsigned int i; 572 573 if (WARN_ON(size < sizeof(struct ib_device))) 574 return NULL; 575 576 device = kzalloc(size, GFP_KERNEL); 577 if (!device) 578 return NULL; 579 580 if (rdma_restrack_init(device)) { 581 kfree(device); 582 return NULL; 583 } 584 585 /* ib_devices_shared_netns can't change while we have active namespaces 586 * in the system which means either init_net is passed or the user has 587 * no idea what they are doing. 588 * 589 * To avoid breaking backward compatibility, when in shared mode, 590 * force to init the device in the init_net. 591 */ 592 net = ib_devices_shared_netns ? 
&init_net : net; 593 rdma_init_coredev(&device->coredev, device, net); 594 595 INIT_LIST_HEAD(&device->event_handler_list); 596 spin_lock_init(&device->qp_open_list_lock); 597 init_rwsem(&device->event_handler_rwsem); 598 mutex_init(&device->unregistration_lock); 599 /* 600 * client_data needs to be alloc because we don't want our mark to be 601 * destroyed if the user stores NULL in the client data. 602 */ 603 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 604 init_rwsem(&device->client_data_rwsem); 605 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 606 mutex_init(&device->compat_devs_mutex); 607 init_completion(&device->unreg_completion); 608 INIT_WORK(&device->unregistration_work, ib_unregister_work); 609 610 spin_lock_init(&device->cq_pools_lock); 611 for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) 612 INIT_LIST_HEAD(&device->cq_pools[i]); 613 614 rwlock_init(&device->cache_lock); 615 616 device->uverbs_cmd_mask = 617 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | 618 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | 619 BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | 620 BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | 621 BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | 622 BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | 623 BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | 624 BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | 625 BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | 626 BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | 627 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | 628 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | 629 BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | 630 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | 631 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | 632 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | 633 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | 634 BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | 635 BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | 636 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | 637 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | 638 BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | 639 BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | 640 BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | 641 BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | 642 BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | 643 BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | 644 BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | 645 BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | 646 BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); 647 648 mutex_init(&device->subdev_lock); 649 INIT_LIST_HEAD(&device->subdev_list_head); 650 INIT_LIST_HEAD(&device->subdev_list); 651 652 return device; 653 } 654 EXPORT_SYMBOL(_ib_alloc_device); 655 656 /** 657 * ib_dealloc_device - free an IB device struct 658 * @device:structure to free 659 * 660 * Free a structure allocated with ib_alloc_device(). 661 */ 662 void ib_dealloc_device(struct ib_device *device) 663 { 664 if (device->ops.dealloc_driver) 665 device->ops.dealloc_driver(device); 666 667 /* 668 * ib_unregister_driver() requires all devices to remain in the xarray 669 * while their ops are callable. The last op we call is dealloc_driver 670 * above. This is needed to create a fence on op callbacks prior to 671 * allowing the driver module to unload. 
672 */ 673 down_write(&devices_rwsem); 674 if (xa_load(&devices, device->index) == device) 675 xa_erase(&devices, device->index); 676 up_write(&devices_rwsem); 677 678 /* Expedite releasing netdev references */ 679 free_netdevs(device); 680 681 WARN_ON(!xa_empty(&device->compat_devs)); 682 WARN_ON(!xa_empty(&device->client_data)); 683 WARN_ON(refcount_read(&device->refcount)); 684 rdma_restrack_clean(device); 685 /* Balances with device_initialize */ 686 put_device(&device->dev); 687 } 688 EXPORT_SYMBOL(ib_dealloc_device); 689 690 /* 691 * add_client_context() and remove_client_context() must be safe against 692 * parallel calls on the same device - registration/unregistration of both the 693 * device and client can be occurring in parallel. 694 * 695 * The routines need to be a fence, any caller must not return until the add 696 * or remove is fully completed. 697 */ 698 static int add_client_context(struct ib_device *device, 699 struct ib_client *client) 700 { 701 int ret = 0; 702 703 if (!device->kverbs_provider && !client->no_kverbs_req) 704 return 0; 705 706 down_write(&device->client_data_rwsem); 707 /* 708 * So long as the client is registered hold both the client and device 709 * unregistration locks. 710 */ 711 if (!refcount_inc_not_zero(&client->uses)) 712 goto out_unlock; 713 refcount_inc(&device->refcount); 714 715 /* 716 * Another caller to add_client_context got here first and has already 717 * completely initialized context. 718 */ 719 if (xa_get_mark(&device->client_data, client->client_id, 720 CLIENT_DATA_REGISTERED)) 721 goto out; 722 723 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 724 GFP_KERNEL)); 725 if (ret) 726 goto out; 727 downgrade_write(&device->client_data_rwsem); 728 if (client->add) { 729 if (client->add(device)) { 730 /* 731 * If a client fails to add then the error code is 732 * ignored, but we won't call any more ops on this 733 * client. 734 */ 735 xa_erase(&device->client_data, client->client_id); 736 up_read(&device->client_data_rwsem); 737 ib_device_put(device); 738 ib_client_put(client); 739 return 0; 740 } 741 } 742 743 /* Readers shall not see a client until add has been completed */ 744 xa_set_mark(&device->client_data, client->client_id, 745 CLIENT_DATA_REGISTERED); 746 up_read(&device->client_data_rwsem); 747 return 0; 748 749 out: 750 ib_device_put(device); 751 ib_client_put(client); 752 out_unlock: 753 up_write(&device->client_data_rwsem); 754 return ret; 755 } 756 757 static void remove_client_context(struct ib_device *device, 758 unsigned int client_id) 759 { 760 struct ib_client *client; 761 void *client_data; 762 763 down_write(&device->client_data_rwsem); 764 if (!xa_get_mark(&device->client_data, client_id, 765 CLIENT_DATA_REGISTERED)) { 766 up_write(&device->client_data_rwsem); 767 return; 768 } 769 client_data = xa_load(&device->client_data, client_id); 770 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); 771 client = xa_load(&clients, client_id); 772 up_write(&device->client_data_rwsem); 773 774 /* 775 * Notice we cannot be holding any exclusive locks when calling the 776 * remove callback as the remove callback can recurse back into any 777 * public functions in this module and thus try for any locks those 778 * functions take. 779 * 780 * For this reason clients and drivers should not call the 781 * unregistration functions will holdling any locks. 
782 */ 783 if (client->remove) 784 client->remove(device, client_data); 785 786 xa_erase(&device->client_data, client_id); 787 ib_device_put(device); 788 ib_client_put(client); 789 } 790 791 static int alloc_port_data(struct ib_device *device) 792 { 793 struct ib_port_data_rcu *pdata_rcu; 794 u32 port; 795 796 if (device->port_data) 797 return 0; 798 799 /* This can only be called once the physical port range is defined */ 800 if (WARN_ON(!device->phys_port_cnt)) 801 return -EINVAL; 802 803 /* Reserve U32_MAX so the logic to go over all the ports is sane */ 804 if (WARN_ON(device->phys_port_cnt == U32_MAX)) 805 return -EINVAL; 806 807 /* 808 * device->port_data is indexed directly by the port number to make 809 * access to this data as efficient as possible. 810 * 811 * Therefore port_data is declared as a 1 based array with potential 812 * empty slots at the beginning. 813 */ 814 pdata_rcu = kzalloc_flex(*pdata_rcu, pdata, 815 size_add(rdma_end_port(device), 1), GFP_KERNEL); 816 if (!pdata_rcu) 817 return -ENOMEM; 818 /* 819 * The rcu_head is put in front of the port data array and the stored 820 * pointer is adjusted since we never need to see that member until 821 * kfree_rcu. 822 */ 823 device->port_data = pdata_rcu->pdata; 824 825 rdma_for_each_port (device, port) { 826 struct ib_port_data *pdata = &device->port_data[port]; 827 828 pdata->ib_dev = device; 829 spin_lock_init(&pdata->pkey_list_lock); 830 INIT_LIST_HEAD(&pdata->pkey_list); 831 spin_lock_init(&pdata->netdev_lock); 832 INIT_HLIST_NODE(&pdata->ndev_hash_link); 833 } 834 return 0; 835 } 836 837 static int verify_immutable(const struct ib_device *dev, u32 port) 838 { 839 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 840 rdma_max_mad_size(dev, port) != 0); 841 } 842 843 static int setup_port_data(struct ib_device *device) 844 { 845 u32 port; 846 int ret; 847 848 ret = alloc_port_data(device); 849 if (ret) 850 return ret; 851 852 rdma_for_each_port (device, port) { 853 struct ib_port_data *pdata = &device->port_data[port]; 854 855 ret = device->ops.get_port_immutable(device, port, 856 &pdata->immutable); 857 if (ret) 858 return ret; 859 860 if (verify_immutable(device, port)) 861 return -EINVAL; 862 } 863 return 0; 864 } 865 866 /** 867 * ib_port_immutable_read() - Read rdma port's immutable data 868 * @dev: IB device 869 * @port: port number whose immutable data to read. It starts with index 1 and 870 * valid upto including rdma_end_port(). 
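 *
 * A minimal sketch of a caller (port must already be valid for @dev):
 *
 *	const struct ib_port_immutable *immutable;
 *
 *	immutable = ib_port_immutable_read(ibdev, port);
 *	if (immutable->core_cap_flags & RDMA_CORE_CAP_IB_MAD)
 *		... the port supports IB MAD services ...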
871 */ 872 const struct ib_port_immutable* 873 ib_port_immutable_read(struct ib_device *dev, unsigned int port) 874 { 875 WARN_ON(!rdma_is_port_valid(dev, port)); 876 return &dev->port_data[port].immutable; 877 } 878 EXPORT_SYMBOL(ib_port_immutable_read); 879 880 void ib_get_device_fw_str(struct ib_device *dev, char *str) 881 { 882 if (dev->ops.get_dev_fw_str) 883 dev->ops.get_dev_fw_str(dev, str); 884 else 885 str[0] = '\0'; 886 } 887 EXPORT_SYMBOL(ib_get_device_fw_str); 888 889 static void ib_policy_change_task(struct work_struct *work) 890 { 891 struct ib_device *dev; 892 unsigned long index; 893 894 down_read(&devices_rwsem); 895 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 896 unsigned int i; 897 898 rdma_for_each_port (dev, i) { 899 u64 sp; 900 ib_get_cached_subnet_prefix(dev, i, &sp); 901 ib_security_cache_change(dev, i, sp); 902 } 903 } 904 up_read(&devices_rwsem); 905 } 906 907 static int ib_security_change(struct notifier_block *nb, unsigned long event, 908 void *lsm_data) 909 { 910 if (event != LSM_POLICY_CHANGE) 911 return NOTIFY_DONE; 912 913 schedule_work(&ib_policy_change_work); 914 ib_mad_agent_security_change(); 915 916 return NOTIFY_OK; 917 } 918 919 static void compatdev_release(struct device *dev) 920 { 921 struct ib_core_device *cdev = 922 container_of(dev, struct ib_core_device, dev); 923 924 kfree(cdev); 925 } 926 927 static int add_one_compat_dev(struct ib_device *device, 928 struct rdma_dev_net *rnet) 929 { 930 struct ib_core_device *cdev; 931 int ret; 932 933 lockdep_assert_held(&rdma_nets_rwsem); 934 if (!ib_devices_shared_netns) 935 return 0; 936 937 /* 938 * Create and add compat device in all namespaces other than where it 939 * is currently bound to. 940 */ 941 if (net_eq(read_pnet(&rnet->net), 942 read_pnet(&device->coredev.rdma_net))) 943 return 0; 944 945 /* 946 * The first of init_net() or ib_register_device() to take the 947 * compat_devs_mutex wins and gets to add the device. Others will wait 948 * for completion here. 
949 */ 950 mutex_lock(&device->compat_devs_mutex); 951 cdev = xa_load(&device->compat_devs, rnet->id); 952 if (cdev) { 953 ret = 0; 954 goto done; 955 } 956 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 957 if (ret) 958 goto done; 959 960 cdev = kzalloc_obj(*cdev, GFP_KERNEL); 961 if (!cdev) { 962 ret = -ENOMEM; 963 goto cdev_err; 964 } 965 966 cdev->dev.parent = device->dev.parent; 967 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 968 cdev->dev.release = compatdev_release; 969 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 970 if (ret) 971 goto add_err; 972 973 ret = device_add(&cdev->dev); 974 if (ret) 975 goto add_err; 976 ret = ib_setup_port_attrs(cdev); 977 if (ret) 978 goto port_err; 979 980 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 981 cdev, GFP_KERNEL)); 982 if (ret) 983 goto insert_err; 984 985 mutex_unlock(&device->compat_devs_mutex); 986 return 0; 987 988 insert_err: 989 ib_free_port_attrs(cdev); 990 port_err: 991 device_del(&cdev->dev); 992 add_err: 993 put_device(&cdev->dev); 994 cdev_err: 995 xa_release(&device->compat_devs, rnet->id); 996 done: 997 mutex_unlock(&device->compat_devs_mutex); 998 return ret; 999 } 1000 1001 static void remove_one_compat_dev(struct ib_device *device, u32 id) 1002 { 1003 struct ib_core_device *cdev; 1004 1005 mutex_lock(&device->compat_devs_mutex); 1006 cdev = xa_erase(&device->compat_devs, id); 1007 mutex_unlock(&device->compat_devs_mutex); 1008 if (cdev) { 1009 ib_free_port_attrs(cdev); 1010 device_del(&cdev->dev); 1011 put_device(&cdev->dev); 1012 } 1013 } 1014 1015 static void remove_compat_devs(struct ib_device *device) 1016 { 1017 struct ib_core_device *cdev; 1018 unsigned long index; 1019 1020 xa_for_each (&device->compat_devs, index, cdev) 1021 remove_one_compat_dev(device, index); 1022 } 1023 1024 static int add_compat_devs(struct ib_device *device) 1025 { 1026 struct rdma_dev_net *rnet; 1027 unsigned long index; 1028 int ret = 0; 1029 1030 lockdep_assert_held(&devices_rwsem); 1031 1032 down_read(&rdma_nets_rwsem); 1033 xa_for_each (&rdma_nets, index, rnet) { 1034 ret = add_one_compat_dev(device, rnet); 1035 if (ret) 1036 break; 1037 } 1038 up_read(&rdma_nets_rwsem); 1039 return ret; 1040 } 1041 1042 static void remove_all_compat_devs(void) 1043 { 1044 struct ib_compat_device *cdev; 1045 struct ib_device *dev; 1046 unsigned long index; 1047 1048 down_read(&devices_rwsem); 1049 xa_for_each (&devices, index, dev) { 1050 unsigned long c_index = 0; 1051 1052 /* Hold nets_rwsem so that any other thread modifying this 1053 * system param can sync with this thread. 1054 */ 1055 down_read(&rdma_nets_rwsem); 1056 xa_for_each (&dev->compat_devs, c_index, cdev) 1057 remove_one_compat_dev(dev, c_index); 1058 up_read(&rdma_nets_rwsem); 1059 } 1060 up_read(&devices_rwsem); 1061 } 1062 1063 static int add_all_compat_devs(void) 1064 { 1065 struct rdma_dev_net *rnet; 1066 struct ib_device *dev; 1067 unsigned long index; 1068 int ret = 0; 1069 1070 down_read(&devices_rwsem); 1071 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1072 unsigned long net_index = 0; 1073 1074 /* Hold nets_rwsem so that any other thread modifying this 1075 * system param can sync with this thread. 
1076 */ 1077 down_read(&rdma_nets_rwsem); 1078 xa_for_each (&rdma_nets, net_index, rnet) { 1079 ret = add_one_compat_dev(dev, rnet); 1080 if (ret) 1081 break; 1082 } 1083 up_read(&rdma_nets_rwsem); 1084 } 1085 up_read(&devices_rwsem); 1086 if (ret) 1087 remove_all_compat_devs(); 1088 return ret; 1089 } 1090 1091 int rdma_compatdev_set(u8 enable) 1092 { 1093 struct rdma_dev_net *rnet; 1094 unsigned long index; 1095 int ret = 0; 1096 1097 down_write(&rdma_nets_rwsem); 1098 if (ib_devices_shared_netns == enable) { 1099 up_write(&rdma_nets_rwsem); 1100 return 0; 1101 } 1102 1103 /* enable/disable of compat devices is not supported 1104 * when more than default init_net exists. 1105 */ 1106 xa_for_each (&rdma_nets, index, rnet) { 1107 ret++; 1108 break; 1109 } 1110 if (!ret) 1111 ib_devices_shared_netns = enable; 1112 up_write(&rdma_nets_rwsem); 1113 if (ret) 1114 return -EBUSY; 1115 1116 if (enable) 1117 ret = add_all_compat_devs(); 1118 else 1119 remove_all_compat_devs(); 1120 return ret; 1121 } 1122 1123 static void rdma_dev_exit_net(struct net *net) 1124 { 1125 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1126 struct ib_device *dev; 1127 unsigned long index; 1128 int ret; 1129 1130 down_write(&rdma_nets_rwsem); 1131 /* 1132 * Prevent the ID from being re-used and hide the id from xa_for_each. 1133 */ 1134 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1135 WARN_ON(ret); 1136 up_write(&rdma_nets_rwsem); 1137 1138 down_read(&devices_rwsem); 1139 xa_for_each (&devices, index, dev) { 1140 get_device(&dev->dev); 1141 /* 1142 * Release the devices_rwsem so that pontentially blocking 1143 * device_del, doesn't hold the devices_rwsem for too long. 1144 */ 1145 up_read(&devices_rwsem); 1146 1147 remove_one_compat_dev(dev, rnet->id); 1148 1149 /* 1150 * If the real device is in the NS then move it back to init. 1151 */ 1152 rdma_dev_change_netns(dev, net, &init_net); 1153 1154 put_device(&dev->dev); 1155 down_read(&devices_rwsem); 1156 } 1157 up_read(&devices_rwsem); 1158 1159 rdma_nl_net_exit(rnet); 1160 xa_erase(&rdma_nets, rnet->id); 1161 } 1162 1163 static __net_init int rdma_dev_init_net(struct net *net) 1164 { 1165 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1166 unsigned long index; 1167 struct ib_device *dev; 1168 int ret; 1169 1170 write_pnet(&rnet->net, net); 1171 1172 ret = rdma_nl_net_init(rnet); 1173 if (ret) 1174 return ret; 1175 1176 /* No need to create any compat devices in default init_net. */ 1177 if (net_eq(net, &init_net)) 1178 return 0; 1179 1180 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1181 if (ret) { 1182 rdma_nl_net_exit(rnet); 1183 return ret; 1184 } 1185 1186 down_read(&devices_rwsem); 1187 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1188 /* Hold nets_rwsem so that netlink command cannot change 1189 * system configuration for device sharing mode. 1190 */ 1191 down_read(&rdma_nets_rwsem); 1192 ret = add_one_compat_dev(dev, rnet); 1193 up_read(&rdma_nets_rwsem); 1194 if (ret) 1195 break; 1196 } 1197 up_read(&devices_rwsem); 1198 1199 if (ret) 1200 rdma_dev_exit_net(net); 1201 1202 return ret; 1203 } 1204 1205 /* 1206 * Assign the unique string device name and the unique device index. This is 1207 * undone by ib_dealloc_device. 
1208 */ 1209 static int assign_name(struct ib_device *device, const char *name) 1210 { 1211 static u32 last_id; 1212 int ret; 1213 1214 down_write(&devices_rwsem); 1215 /* Assign a unique name to the device */ 1216 if (strchr(name, '%')) 1217 ret = alloc_name(device, name); 1218 else 1219 ret = dev_set_name(&device->dev, name); 1220 if (ret) 1221 goto out; 1222 1223 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1224 ret = -ENFILE; 1225 goto out; 1226 } 1227 strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1228 1229 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1230 &last_id, GFP_KERNEL); 1231 if (ret > 0) 1232 ret = 0; 1233 1234 out: 1235 up_write(&devices_rwsem); 1236 return ret; 1237 } 1238 1239 /* 1240 * setup_device() allocates memory and sets up data that requires calling the 1241 * device ops, this is the only reason these actions are not done during 1242 * ib_alloc_device. It is undone by ib_dealloc_device(). 1243 */ 1244 static int setup_device(struct ib_device *device) 1245 { 1246 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1247 int ret; 1248 1249 ib_device_check_mandatory(device); 1250 1251 ret = setup_port_data(device); 1252 if (ret) { 1253 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1254 return ret; 1255 } 1256 1257 memset(&device->attrs, 0, sizeof(device->attrs)); 1258 ret = device->ops.query_device(device, &device->attrs, &uhw); 1259 if (ret) { 1260 dev_warn(&device->dev, 1261 "Couldn't query the device attributes\n"); 1262 return ret; 1263 } 1264 1265 return 0; 1266 } 1267 1268 static void disable_device(struct ib_device *device) 1269 { 1270 u32 cid; 1271 1272 WARN_ON(!refcount_read(&device->refcount)); 1273 1274 down_write(&devices_rwsem); 1275 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1276 up_write(&devices_rwsem); 1277 1278 /* 1279 * Remove clients in LIFO order, see assign_client_id. This could be 1280 * more efficient if xarray learns to reverse iterate. Since no new 1281 * clients can be added to this ib_device past this point we only need 1282 * the maximum possible client_id value here. 1283 */ 1284 down_read(&clients_rwsem); 1285 cid = highest_client_id; 1286 up_read(&clients_rwsem); 1287 while (cid) { 1288 cid--; 1289 remove_client_context(device, cid); 1290 } 1291 1292 ib_cq_pool_cleanup(device); 1293 1294 /* Pairs with refcount_set in enable_device */ 1295 ib_device_put(device); 1296 wait_for_completion(&device->unreg_completion); 1297 1298 /* 1299 * compat devices must be removed after device refcount drops to zero. 1300 * Otherwise init_net() may add more compatdevs after removing compat 1301 * devices and before device is disabled. 1302 */ 1303 remove_compat_devs(device); 1304 } 1305 1306 /* 1307 * An enabled device is visible to all clients and to all the public facing 1308 * APIs that return a device pointer. This always returns with a new get, even 1309 * if it fails. 1310 */ 1311 static int enable_device_and_get(struct ib_device *device) 1312 { 1313 struct ib_client *client; 1314 unsigned long index; 1315 int ret = 0; 1316 1317 /* 1318 * One ref belongs to the xa and the other belongs to this 1319 * thread. This is needed to guard against parallel unregistration. 1320 */ 1321 refcount_set(&device->refcount, 2); 1322 down_write(&devices_rwsem); 1323 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1324 1325 /* 1326 * By using downgrade_write() we ensure that no other thread can clear 1327 * DEVICE_REGISTERED while we are completing the client setup. 
1328 */ 1329 downgrade_write(&devices_rwsem); 1330 1331 if (device->ops.enable_driver) { 1332 ret = device->ops.enable_driver(device); 1333 if (ret) 1334 goto out; 1335 } 1336 1337 down_read(&clients_rwsem); 1338 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1339 ret = add_client_context(device, client); 1340 if (ret) 1341 break; 1342 } 1343 up_read(&clients_rwsem); 1344 if (!ret) 1345 ret = add_compat_devs(device); 1346 out: 1347 up_read(&devices_rwsem); 1348 return ret; 1349 } 1350 1351 static void prevent_dealloc_device(struct ib_device *ib_dev) 1352 { 1353 } 1354 1355 static void ib_device_notify_register(struct ib_device *device) 1356 { 1357 struct net_device *netdev; 1358 u32 port; 1359 int ret; 1360 1361 down_read(&devices_rwsem); 1362 1363 /* Mark for userspace that device is ready */ 1364 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1365 1366 ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); 1367 if (ret) 1368 goto out; 1369 1370 rdma_for_each_port(device, port) { 1371 netdev = ib_device_get_netdev(device, port); 1372 if (!netdev) 1373 continue; 1374 1375 ret = rdma_nl_notify_event(device, port, 1376 RDMA_NETDEV_ATTACH_EVENT); 1377 dev_put(netdev); 1378 if (ret) 1379 goto out; 1380 } 1381 1382 out: 1383 up_read(&devices_rwsem); 1384 } 1385 1386 /** 1387 * ib_register_device - Register an IB device with IB core 1388 * @device: Device to register 1389 * @name: unique string device name. This may include a '%' which will 1390 * cause a unique index to be added to the passed device name. 1391 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB 1392 * device will be used. In this case the caller should fully 1393 * setup the ibdev for DMA. This usually means using dma_virt_ops. 1394 * 1395 * Low-level drivers use ib_register_device() to register their 1396 * devices with the IB core. All registered clients will receive a 1397 * callback for each device that is added. @device must be allocated 1398 * with ib_alloc_device(). 1399 * 1400 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1401 * asynchronously then the device pointer may become freed as soon as this 1402 * function returns. 1403 */ 1404 int ib_register_device(struct ib_device *device, const char *name, 1405 struct device *dma_device) 1406 { 1407 int ret; 1408 1409 ret = assign_name(device, name); 1410 if (ret) 1411 return ret; 1412 1413 /* 1414 * If the caller does not provide a DMA capable device then the IB core 1415 * will set up ib_sge and scatterlist structures that stash the kernel 1416 * virtual address into the address field. 1417 */ 1418 WARN_ON(dma_device && !dma_device->dma_parms); 1419 device->dma_device = dma_device; 1420 1421 ret = setup_device(device); 1422 if (ret) 1423 return ret; 1424 1425 ret = ib_cache_setup_one(device); 1426 if (ret) { 1427 dev_warn(&device->dev, 1428 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1429 return ret; 1430 } 1431 1432 device->groups[0] = &ib_dev_attr_group; 1433 device->groups[1] = device->ops.device_group; 1434 ret = ib_setup_device_attrs(device); 1435 if (ret) 1436 goto cache_cleanup; 1437 1438 ib_device_register_rdmacg(device); 1439 1440 rdma_counter_init(device); 1441 1442 /* 1443 * Ensure that ADD uevent is not fired because it 1444 * is too early amd device is not initialized yet. 
1445 */ 1446 dev_set_uevent_suppress(&device->dev, true); 1447 ret = device_add(&device->dev); 1448 if (ret) 1449 goto cg_cleanup; 1450 1451 ret = ib_setup_port_attrs(&device->coredev); 1452 if (ret) { 1453 dev_warn(&device->dev, 1454 "Couldn't register device with driver model\n"); 1455 goto dev_cleanup; 1456 } 1457 1458 ret = enable_device_and_get(device); 1459 if (ret) { 1460 void (*dealloc_fn)(struct ib_device *); 1461 1462 /* 1463 * If we hit this error flow then we don't want to 1464 * automatically dealloc the device since the caller is 1465 * expected to call ib_dealloc_device() after 1466 * ib_register_device() fails. This is tricky due to the 1467 * possibility for a parallel unregistration along with this 1468 * error flow. Since we have a refcount here we know any 1469 * parallel flow is stopped in disable_device and will see the 1470 * special dealloc_driver pointer, causing the responsibility to 1471 * ib_dealloc_device() to revert back to this thread. 1472 */ 1473 dealloc_fn = device->ops.dealloc_driver; 1474 device->ops.dealloc_driver = prevent_dealloc_device; 1475 ib_device_put(device); 1476 __ib_unregister_device(device); 1477 device->ops.dealloc_driver = dealloc_fn; 1478 dev_set_uevent_suppress(&device->dev, false); 1479 return ret; 1480 } 1481 dev_set_uevent_suppress(&device->dev, false); 1482 1483 ib_device_notify_register(device); 1484 1485 ib_device_put(device); 1486 1487 return 0; 1488 1489 dev_cleanup: 1490 device_del(&device->dev); 1491 cg_cleanup: 1492 dev_set_uevent_suppress(&device->dev, false); 1493 ib_device_unregister_rdmacg(device); 1494 cache_cleanup: 1495 ib_cache_cleanup_one(device); 1496 return ret; 1497 } 1498 EXPORT_SYMBOL(ib_register_device); 1499 1500 /* Callers must hold a get on the device. */ 1501 static void __ib_unregister_device(struct ib_device *ib_dev) 1502 { 1503 struct ib_device *sub, *tmp; 1504 1505 mutex_lock(&ib_dev->subdev_lock); 1506 list_for_each_entry_safe_reverse(sub, tmp, 1507 &ib_dev->subdev_list_head, 1508 subdev_list) { 1509 list_del(&sub->subdev_list); 1510 ib_dev->ops.del_sub_dev(sub); 1511 ib_device_put(ib_dev); 1512 } 1513 mutex_unlock(&ib_dev->subdev_lock); 1514 1515 /* 1516 * We have a registration lock so that all the calls to unregister are 1517 * fully fenced, once any unregister returns the device is truly 1518 * unregistered even if multiple callers are unregistering it at the 1519 * same time. This also interacts with the registration flow and 1520 * provides sane semantics if register and unregister are racing. 1521 */ 1522 mutex_lock(&ib_dev->unregistration_lock); 1523 if (!refcount_read(&ib_dev->refcount)) 1524 goto out; 1525 1526 disable_device(ib_dev); 1527 rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); 1528 1529 /* Expedite removing unregistered pointers from the hash table */ 1530 free_netdevs(ib_dev); 1531 1532 ib_free_port_attrs(&ib_dev->coredev); 1533 device_del(&ib_dev->dev); 1534 ib_device_unregister_rdmacg(ib_dev); 1535 ib_cache_cleanup_one(ib_dev); 1536 1537 /* 1538 * Drivers using the new flow may not call ib_dealloc_device except 1539 * in error unwind prior to registration success. 1540 */ 1541 if (ib_dev->ops.dealloc_driver && 1542 ib_dev->ops.dealloc_driver != prevent_dealloc_device) { 1543 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); 1544 ib_dealloc_device(ib_dev); 1545 } 1546 out: 1547 mutex_unlock(&ib_dev->unregistration_lock); 1548 } 1549 1550 /** 1551 * ib_unregister_device - Unregister an IB device 1552 * @ib_dev: The device to unregister 1553 * 1554 * Unregister an IB device. 
All clients will receive a remove callback. 1555 * 1556 * Callers should call this routine only once, and protect against races with 1557 * registration. Typically it should only be called as part of a remove 1558 * callback in an implementation of driver core's struct device_driver and 1559 * related. 1560 * 1561 * If ops.dealloc_driver is used then ib_dev will be freed upon return from 1562 * this function. 1563 */ 1564 void ib_unregister_device(struct ib_device *ib_dev) 1565 { 1566 get_device(&ib_dev->dev); 1567 __ib_unregister_device(ib_dev); 1568 put_device(&ib_dev->dev); 1569 } 1570 EXPORT_SYMBOL(ib_unregister_device); 1571 1572 /** 1573 * ib_unregister_device_and_put - Unregister a device while holding a 'get' 1574 * @ib_dev: The device to unregister 1575 * 1576 * This is the same as ib_unregister_device(), except it includes an internal 1577 * ib_device_put() that should match a 'get' obtained by the caller. 1578 * 1579 * It is safe to call this routine concurrently from multiple threads while 1580 * holding the 'get'. When the function returns the device is fully 1581 * unregistered. 1582 * 1583 * Drivers using this flow MUST use the driver_unregister callback to clean up 1584 * their resources associated with the device and dealloc it. 1585 */ 1586 void ib_unregister_device_and_put(struct ib_device *ib_dev) 1587 { 1588 WARN_ON(!ib_dev->ops.dealloc_driver); 1589 get_device(&ib_dev->dev); 1590 ib_device_put(ib_dev); 1591 __ib_unregister_device(ib_dev); 1592 put_device(&ib_dev->dev); 1593 } 1594 EXPORT_SYMBOL(ib_unregister_device_and_put); 1595 1596 /** 1597 * ib_unregister_driver - Unregister all IB devices for a driver 1598 * @driver_id: The driver to unregister 1599 * 1600 * This implements a fence for device unregistration. It only returns once all 1601 * devices associated with the driver_id have fully completed their 1602 * unregistration and returned from ib_unregister_device*(). 1603 * 1604 * If device's are not yet unregistered it goes ahead and starts unregistering 1605 * them. 1606 * 1607 * This does not block creation of new devices with the given driver_id, that 1608 * is the responsibility of the caller. 1609 */ 1610 void ib_unregister_driver(enum rdma_driver_id driver_id) 1611 { 1612 struct ib_device *ib_dev; 1613 unsigned long index; 1614 1615 down_read(&devices_rwsem); 1616 xa_for_each (&devices, index, ib_dev) { 1617 if (ib_dev->ops.driver_id != driver_id) 1618 continue; 1619 1620 get_device(&ib_dev->dev); 1621 up_read(&devices_rwsem); 1622 1623 WARN_ON(!ib_dev->ops.dealloc_driver); 1624 __ib_unregister_device(ib_dev); 1625 1626 put_device(&ib_dev->dev); 1627 down_read(&devices_rwsem); 1628 } 1629 up_read(&devices_rwsem); 1630 } 1631 EXPORT_SYMBOL(ib_unregister_driver); 1632 1633 static void ib_unregister_work(struct work_struct *work) 1634 { 1635 struct ib_device *ib_dev = 1636 container_of(work, struct ib_device, unregistration_work); 1637 1638 __ib_unregister_device(ib_dev); 1639 put_device(&ib_dev->dev); 1640 } 1641 1642 /** 1643 * ib_unregister_device_queued - Unregister a device using a work queue 1644 * @ib_dev: The device to unregister 1645 * 1646 * This schedules an asynchronous unregistration using a WQ for the device. A 1647 * driver should use this to avoid holding locks while doing unregistration, 1648 * such as holding the RTNL lock. 1649 * 1650 * Drivers using this API must use ib_unregister_driver before module unload 1651 * to ensure that all scheduled unregistrations have completed. 
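 *
 * A minimal sketch of the intended pairing ("my_driver_exit" is hypothetical
 * and RDMA_DRIVER_FOO stands in for the driver's real enum rdma_driver_id
 * value). The queued call may be made while locks such as RTNL are held:
 *
 *	ib_unregister_device_queued(&priv->ibdev);
 *	...
 *	static void __exit my_driver_exit(void)
 *	{
 *		ib_unregister_driver(RDMA_DRIVER_FOO);
 *	}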
1652 */ 1653 void ib_unregister_device_queued(struct ib_device *ib_dev) 1654 { 1655 WARN_ON(!refcount_read(&ib_dev->refcount)); 1656 WARN_ON(!ib_dev->ops.dealloc_driver); 1657 get_device(&ib_dev->dev); 1658 if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) 1659 put_device(&ib_dev->dev); 1660 } 1661 EXPORT_SYMBOL(ib_unregister_device_queued); 1662 1663 /* 1664 * The caller must pass in a device that has the kref held and the refcount 1665 * released. If the device is in cur_net and still registered then it is moved 1666 * into net. 1667 */ 1668 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1669 struct net *net) 1670 { 1671 int ret2 = -EINVAL; 1672 int ret; 1673 1674 mutex_lock(&device->unregistration_lock); 1675 1676 /* 1677 * If a device not under ib_device_get() or if the unregistration_lock 1678 * is not held, the namespace can be changed, or it can be unregistered. 1679 * Check again under the lock. 1680 */ 1681 if (refcount_read(&device->refcount) == 0 || 1682 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1683 ret = -ENODEV; 1684 goto out; 1685 } 1686 1687 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1688 disable_device(device); 1689 1690 /* 1691 * At this point no one can be using the device, so it is safe to 1692 * change the namespace. 1693 */ 1694 write_pnet(&device->coredev.rdma_net, net); 1695 1696 down_read(&devices_rwsem); 1697 /* 1698 * Currently rdma devices are system wide unique. So the device name 1699 * is guaranteed free in the new namespace. Publish the new namespace 1700 * at the sysfs level. 1701 */ 1702 ret = device_rename(&device->dev, dev_name(&device->dev)); 1703 up_read(&devices_rwsem); 1704 if (ret) { 1705 dev_warn(&device->dev, 1706 "%s: Couldn't rename device after namespace change\n", 1707 __func__); 1708 /* Try and put things back and re-enable the device */ 1709 write_pnet(&device->coredev.rdma_net, cur_net); 1710 } 1711 1712 ret2 = enable_device_and_get(device); 1713 if (ret2) { 1714 /* 1715 * This shouldn't really happen, but if it does, let the user 1716 * retry at later point. So don't disable the device. 1717 */ 1718 dev_warn(&device->dev, 1719 "%s: Couldn't re-enable device after namespace change\n", 1720 __func__); 1721 } 1722 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1723 1724 ib_device_put(device); 1725 out: 1726 mutex_unlock(&device->unregistration_lock); 1727 if (ret) 1728 return ret; 1729 return ret2; 1730 } 1731 1732 int ib_device_set_netns_put(struct sk_buff *skb, 1733 struct ib_device *dev, u32 ns_fd) 1734 { 1735 struct net *net; 1736 int ret; 1737 1738 net = get_net_ns_by_fd(ns_fd); 1739 if (IS_ERR(net)) { 1740 ret = PTR_ERR(net); 1741 goto net_err; 1742 } 1743 1744 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1745 ret = -EPERM; 1746 goto ns_err; 1747 } 1748 1749 /* 1750 * All the ib_clients, including uverbs, are reset when the namespace is 1751 * changed and this cannot be blocked waiting for userspace to do 1752 * something, so disassociation is mandatory. 
1753 */ 1754 if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) { 1755 ret = -EOPNOTSUPP; 1756 goto ns_err; 1757 } 1758 1759 get_device(&dev->dev); 1760 ib_device_put(dev); 1761 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1762 put_device(&dev->dev); 1763 1764 put_net(net); 1765 return ret; 1766 1767 ns_err: 1768 put_net(net); 1769 net_err: 1770 ib_device_put(dev); 1771 return ret; 1772 } 1773 1774 static struct pernet_operations rdma_dev_net_ops = { 1775 .init = rdma_dev_init_net, 1776 .exit = rdma_dev_exit_net, 1777 .id = &rdma_dev_net_id, 1778 .size = sizeof(struct rdma_dev_net), 1779 }; 1780 1781 static int assign_client_id(struct ib_client *client) 1782 { 1783 int ret; 1784 1785 lockdep_assert_held(&clients_rwsem); 1786 /* 1787 * The add/remove callbacks must be called in FIFO/LIFO order. To 1788 * achieve this we assign client_ids so they are sorted in 1789 * registration order. 1790 */ 1791 client->client_id = highest_client_id; 1792 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1793 if (ret) 1794 return ret; 1795 1796 highest_client_id++; 1797 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1798 return 0; 1799 } 1800 1801 static void remove_client_id(struct ib_client *client) 1802 { 1803 down_write(&clients_rwsem); 1804 xa_erase(&clients, client->client_id); 1805 for (; highest_client_id; highest_client_id--) 1806 if (xa_load(&clients, highest_client_id - 1)) 1807 break; 1808 up_write(&clients_rwsem); 1809 } 1810 1811 /** 1812 * ib_register_client - Register an IB client 1813 * @client:Client to register 1814 * 1815 * Upper level users of the IB drivers can use ib_register_client() to 1816 * register callbacks for IB device addition and removal. When an IB 1817 * device is added, each registered client's add method will be called 1818 * (in the order the clients were registered), and when a device is 1819 * removed, each client's remove method will be called (in the reverse 1820 * order that clients were registered). In addition, when 1821 * ib_register_client() is called, the client will receive an add 1822 * callback for all devices already registered. 1823 */ 1824 int ib_register_client(struct ib_client *client) 1825 { 1826 struct ib_device *device; 1827 unsigned long index; 1828 bool need_unreg = false; 1829 int ret; 1830 1831 refcount_set(&client->uses, 1); 1832 init_completion(&client->uses_zero); 1833 1834 /* 1835 * The devices_rwsem is held in write mode to ensure that a racing 1836 * ib_register_device() sees a consisent view of clients and devices. 1837 */ 1838 down_write(&devices_rwsem); 1839 down_write(&clients_rwsem); 1840 ret = assign_client_id(client); 1841 if (ret) 1842 goto out; 1843 1844 need_unreg = true; 1845 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1846 ret = add_client_context(device, client); 1847 if (ret) 1848 goto out; 1849 } 1850 ret = 0; 1851 out: 1852 up_write(&clients_rwsem); 1853 up_write(&devices_rwsem); 1854 if (need_unreg && ret) 1855 ib_unregister_client(client); 1856 return ret; 1857 } 1858 EXPORT_SYMBOL(ib_register_client); 1859 1860 /** 1861 * ib_unregister_client - Unregister an IB client 1862 * @client:Client to unregister 1863 * 1864 * Upper level users use ib_unregister_client() to remove their client 1865 * registration. When ib_unregister_client() is called, the client 1866 * will receive a remove callback for each IB device still registered. 
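 *
 * A minimal client sketch (hypothetical; "my_client" and its callbacks are
 * placeholders, paired with ib_register_client() at module init):
 *
 *	static int my_add_one(struct ib_device *device)
 *	{
 *		... allocate per-device state and publish it ...
 *		ib_set_client_data(device, &my_client, state);
 *		return 0;
 *	}
 *
 *	static void my_remove_one(struct ib_device *device, void *client_data)
 *	{
 *		... free the state passed back as client_data ...
 *	}
 *
 *	static struct ib_client my_client = {
 *		.name	= "my_client",
 *		.add	= my_add_one,
 *		.remove	= my_remove_one,
 *	};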
1867 * 1868 * This is a full fence, once it returns no client callbacks will be called, 1869 * or are running in another thread. 1870 */ 1871 void ib_unregister_client(struct ib_client *client) 1872 { 1873 struct ib_device *device; 1874 unsigned long index; 1875 1876 down_write(&clients_rwsem); 1877 ib_client_put(client); 1878 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1879 up_write(&clients_rwsem); 1880 1881 /* We do not want to have locks while calling client->remove() */ 1882 rcu_read_lock(); 1883 xa_for_each (&devices, index, device) { 1884 if (!ib_device_try_get(device)) 1885 continue; 1886 rcu_read_unlock(); 1887 1888 remove_client_context(device, client->client_id); 1889 1890 ib_device_put(device); 1891 rcu_read_lock(); 1892 } 1893 rcu_read_unlock(); 1894 1895 /* 1896 * remove_client_context() is not a fence, it can return even though a 1897 * removal is ongoing. Wait until all removals are completed. 1898 */ 1899 wait_for_completion(&client->uses_zero); 1900 remove_client_id(client); 1901 } 1902 EXPORT_SYMBOL(ib_unregister_client); 1903 1904 static int __ib_get_global_client_nl_info(const char *client_name, 1905 struct ib_client_nl_info *res) 1906 { 1907 struct ib_client *client; 1908 unsigned long index; 1909 int ret = -ENOENT; 1910 1911 down_read(&clients_rwsem); 1912 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1913 if (strcmp(client->name, client_name) != 0) 1914 continue; 1915 if (!client->get_global_nl_info) { 1916 ret = -EOPNOTSUPP; 1917 break; 1918 } 1919 ret = client->get_global_nl_info(res); 1920 if (WARN_ON(ret == -ENOENT)) 1921 ret = -EINVAL; 1922 if (!ret && res->cdev) 1923 get_device(res->cdev); 1924 break; 1925 } 1926 up_read(&clients_rwsem); 1927 return ret; 1928 } 1929 1930 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1931 const char *client_name, 1932 struct ib_client_nl_info *res) 1933 { 1934 unsigned long index; 1935 void *client_data; 1936 int ret = -ENOENT; 1937 1938 down_read(&ibdev->client_data_rwsem); 1939 xan_for_each_marked (&ibdev->client_data, index, client_data, 1940 CLIENT_DATA_REGISTERED) { 1941 struct ib_client *client = xa_load(&clients, index); 1942 1943 if (!client || strcmp(client->name, client_name) != 0) 1944 continue; 1945 if (!client->get_nl_info) { 1946 ret = -EOPNOTSUPP; 1947 break; 1948 } 1949 ret = client->get_nl_info(ibdev, client_data, res); 1950 if (WARN_ON(ret == -ENOENT)) 1951 ret = -EINVAL; 1952 1953 /* 1954 * The cdev is guaranteed valid as long as we are inside the 1955 * client_data_rwsem as remove_one can't be called. Keep it 1956 * valid for the caller. 
1957 */ 1958 if (!ret && res->cdev) 1959 get_device(res->cdev); 1960 break; 1961 } 1962 up_read(&ibdev->client_data_rwsem); 1963 1964 return ret; 1965 } 1966 1967 /** 1968 * ib_get_client_nl_info - Fetch the nl_info from a client 1969 * @ibdev: IB device 1970 * @client_name: Name of the client 1971 * @res: Result of the query 1972 */ 1973 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1974 struct ib_client_nl_info *res) 1975 { 1976 int ret; 1977 1978 if (ibdev) 1979 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1980 else 1981 ret = __ib_get_global_client_nl_info(client_name, res); 1982 #ifdef CONFIG_MODULES 1983 if (ret == -ENOENT) { 1984 request_module("rdma-client-%s", client_name); 1985 if (ibdev) 1986 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1987 else 1988 ret = __ib_get_global_client_nl_info(client_name, res); 1989 } 1990 #endif 1991 if (ret) { 1992 if (ret == -ENOENT) 1993 return -EOPNOTSUPP; 1994 return ret; 1995 } 1996 1997 if (WARN_ON(!res->cdev)) 1998 return -EINVAL; 1999 return 0; 2000 } 2001 2002 /** 2003 * ib_set_client_data - Set IB client context 2004 * @device:Device to set context for 2005 * @client:Client to set context for 2006 * @data:Context to set 2007 * 2008 * ib_set_client_data() sets client context data that can be retrieved with 2009 * ib_get_client_data(). This can only be called while the client is 2010 * registered to the device; once the ib_client remove() callback returns, it 2011 * cannot be called. 2012 */ 2013 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 2014 void *data) 2015 { 2016 void *rc; 2017 2018 if (WARN_ON(IS_ERR(data))) 2019 data = NULL; 2020 2021 rc = xa_store(&device->client_data, client->client_id, data, 2022 GFP_KERNEL); 2023 WARN_ON(xa_is_err(rc)); 2024 } 2025 EXPORT_SYMBOL(ib_set_client_data); 2026 2027 /** 2028 * ib_register_event_handler - Register an IB event handler 2029 * @event_handler:Handler to register 2030 * 2031 * ib_register_event_handler() registers an event handler that will be 2032 * called back when asynchronous IB events occur (as defined in 2033 * chapter 11 of the InfiniBand Architecture Specification). This 2034 * callback occurs in workqueue context. 2035 */ 2036 void ib_register_event_handler(struct ib_event_handler *event_handler) 2037 { 2038 down_write(&event_handler->device->event_handler_rwsem); 2039 list_add_tail(&event_handler->list, 2040 &event_handler->device->event_handler_list); 2041 up_write(&event_handler->device->event_handler_rwsem); 2042 } 2043 EXPORT_SYMBOL(ib_register_event_handler); 2044 2045 /** 2046 * ib_unregister_event_handler - Unregister an event handler 2047 * @event_handler:Handler to unregister 2048 * 2049 * Unregister an event handler registered with 2050 * ib_register_event_handler().
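 *
 * A minimal sketch of the register/unregister pairing (illustrative only;
 * my_handler() and my_event_handler are hypothetical names):
 *
 *	static void my_handler(struct ib_event_handler *handler,
 *			       struct ib_event *event)
 *	{
 *		// invoked from workqueue context for each async event
 *	}
 *
 *	struct ib_event_handler my_event_handler;
 *
 *	INIT_IB_EVENT_HANDLER(&my_event_handler, ibdev, my_handler);
 *	ib_register_event_handler(&my_event_handler);
 *	...
 *	ib_unregister_event_handler(&my_event_handler);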
2051 */ 2052 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 2053 { 2054 down_write(&event_handler->device->event_handler_rwsem); 2055 list_del(&event_handler->list); 2056 up_write(&event_handler->device->event_handler_rwsem); 2057 } 2058 EXPORT_SYMBOL(ib_unregister_event_handler); 2059 2060 void ib_dispatch_event_clients(struct ib_event *event) 2061 { 2062 struct ib_event_handler *handler; 2063 2064 down_read(&event->device->event_handler_rwsem); 2065 2066 list_for_each_entry(handler, &event->device->event_handler_list, list) 2067 handler->handler(handler, event); 2068 2069 up_read(&event->device->event_handler_rwsem); 2070 } 2071 2072 static int iw_query_port(struct ib_device *device, 2073 u32 port_num, 2074 struct ib_port_attr *port_attr) 2075 { 2076 struct in_device *inetdev; 2077 struct net_device *netdev; 2078 2079 memset(port_attr, 0, sizeof(*port_attr)); 2080 2081 netdev = ib_device_get_netdev(device, port_num); 2082 if (!netdev) 2083 return -ENODEV; 2084 2085 port_attr->max_mtu = IB_MTU_4096; 2086 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2087 2088 if (!netif_carrier_ok(netdev)) { 2089 port_attr->state = IB_PORT_DOWN; 2090 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2091 } else { 2092 rcu_read_lock(); 2093 inetdev = __in_dev_get_rcu(netdev); 2094 2095 if (inetdev && inetdev->ifa_list) { 2096 port_attr->state = IB_PORT_ACTIVE; 2097 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2098 } else { 2099 port_attr->state = IB_PORT_INIT; 2100 port_attr->phys_state = 2101 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2102 } 2103 2104 rcu_read_unlock(); 2105 } 2106 2107 dev_put(netdev); 2108 return device->ops.query_port(device, port_num, port_attr); 2109 } 2110 2111 static int __ib_query_port(struct ib_device *device, 2112 u32 port_num, 2113 struct ib_port_attr *port_attr) 2114 { 2115 int err; 2116 2117 memset(port_attr, 0, sizeof(*port_attr)); 2118 2119 err = device->ops.query_port(device, port_num, port_attr); 2120 if (err || port_attr->subnet_prefix) 2121 return err; 2122 2123 if (rdma_port_get_link_layer(device, port_num) != 2124 IB_LINK_LAYER_INFINIBAND) 2125 return 0; 2126 2127 ib_get_cached_subnet_prefix(device, port_num, 2128 &port_attr->subnet_prefix); 2129 return 0; 2130 } 2131 2132 /** 2133 * ib_query_port - Query IB port attributes 2134 * @device:Device to query 2135 * @port_num:Port number to query 2136 * @port_attr:Port attributes 2137 * 2138 * ib_query_port() returns the attributes of a port through the 2139 * @port_attr pointer. 
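 *
 * A minimal sketch of a caller (illustrative only; ibdev and port_num are
 * assumed to be valid):
 *
 *	struct ib_port_attr attr;
 *
 *	if (!ib_query_port(ibdev, port_num, &attr) &&
 *	    attr.state == IB_PORT_ACTIVE) {
 *		// link is up; attr.active_mtu, attr.gid_tbl_len, etc. are valid
 *	}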
2140 */ 2141 int ib_query_port(struct ib_device *device, 2142 u32 port_num, 2143 struct ib_port_attr *port_attr) 2144 { 2145 if (!rdma_is_port_valid(device, port_num)) 2146 return -EINVAL; 2147 2148 if (rdma_protocol_iwarp(device, port_num)) 2149 return iw_query_port(device, port_num, port_attr); 2150 else 2151 return __ib_query_port(device, port_num, port_attr); 2152 } 2153 EXPORT_SYMBOL(ib_query_port); 2154 2155 static void add_ndev_hash(struct ib_port_data *pdata) 2156 { 2157 unsigned long flags; 2158 2159 might_sleep(); 2160 2161 spin_lock_irqsave(&ndev_hash_lock, flags); 2162 if (hash_hashed(&pdata->ndev_hash_link)) { 2163 hash_del_rcu(&pdata->ndev_hash_link); 2164 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2165 /* 2166 * We cannot do hash_add_rcu after a hash_del_rcu until a 2167 * grace period has elapsed. 2168 */ 2169 synchronize_rcu(); 2170 spin_lock_irqsave(&ndev_hash_lock, flags); 2171 } 2172 if (pdata->netdev) 2173 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2174 (uintptr_t)pdata->netdev); 2175 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2176 } 2177 2178 /** 2179 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2180 * @ib_dev: Device to modify 2181 * @ndev: net_device to affiliate, may be NULL 2182 * @port: IB port the net_device is connected to 2183 * 2184 * Drivers should use this to link the ib_device to a netdev so the netdev 2185 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2186 * affiliated with any port. 2187 * 2188 * The caller must ensure that the given ndev is not unregistered or 2189 * unregistering, and that either the ib_device is unregistered or 2190 * ib_device_set_netdev() is called with NULL when the ndev sends a 2191 * NETDEV_UNREGISTER event. 2192 */ 2193 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2194 u32 port) 2195 { 2196 enum rdma_nl_notify_event_type etype; 2197 struct net_device *old_ndev; 2198 struct ib_port_data *pdata; 2199 unsigned long flags; 2200 int ret; 2201 2202 if (!rdma_is_port_valid(ib_dev, port)) 2203 return -EINVAL; 2204 2205 /* 2206 * Drivers wish to call this before ib_register_device(), so we have to 2207 * set up the port data early. 2208 */ 2209 ret = alloc_port_data(ib_dev); 2210 if (ret) 2211 return ret; 2212 2213 pdata = &ib_dev->port_data[port]; 2214 spin_lock_irqsave(&pdata->netdev_lock, flags); 2215 old_ndev = rcu_dereference_protected( 2216 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2217 if (old_ndev == ndev) { 2218 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2219 return 0; 2220 } 2221 2222 rcu_assign_pointer(pdata->netdev, ndev); 2223 netdev_put(old_ndev, &pdata->netdev_tracker); 2224 netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); 2225 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2226 2227 add_ndev_hash(pdata); 2228 2229 /* Make sure that the device is registered before we send events */ 2230 if (xa_load(&devices, ib_dev->index) != ib_dev) 2231 return 0; 2232 2233 etype = ndev ?
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; 2234 rdma_nl_notify_event(ib_dev, port, etype); 2235 2236 return 0; 2237 } 2238 EXPORT_SYMBOL(ib_device_set_netdev); 2239 2240 static void free_netdevs(struct ib_device *ib_dev) 2241 { 2242 unsigned long flags; 2243 u32 port; 2244 2245 if (!ib_dev->port_data) 2246 return; 2247 2248 rdma_for_each_port (ib_dev, port) { 2249 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2250 struct net_device *ndev; 2251 2252 spin_lock_irqsave(&pdata->netdev_lock, flags); 2253 ndev = rcu_dereference_protected( 2254 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2255 if (ndev) { 2256 spin_lock(&ndev_hash_lock); 2257 hash_del_rcu(&pdata->ndev_hash_link); 2258 spin_unlock(&ndev_hash_lock); 2259 2260 /* 2261 * If this is the last dev_put there is still a 2262 * synchronize_rcu before the netdev is kfreed, so we 2263 * can continue to rely on unlocked pointer 2264 * comparisons after the put 2265 */ 2266 rcu_assign_pointer(pdata->netdev, NULL); 2267 netdev_put(ndev, &pdata->netdev_tracker); 2268 } 2269 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2270 } 2271 } 2272 2273 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2274 u32 port) 2275 { 2276 struct ib_port_data *pdata; 2277 struct net_device *res; 2278 2279 if (!rdma_is_port_valid(ib_dev, port)) 2280 return NULL; 2281 2282 if (!ib_dev->port_data) 2283 return NULL; 2284 2285 pdata = &ib_dev->port_data[port]; 2286 2287 /* 2288 * New drivers should use ib_device_set_netdev() not the legacy 2289 * get_netdev(). 2290 */ 2291 if (ib_dev->ops.get_netdev) 2292 res = ib_dev->ops.get_netdev(ib_dev, port); 2293 else { 2294 spin_lock(&pdata->netdev_lock); 2295 res = rcu_dereference_protected( 2296 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2297 dev_hold(res); 2298 spin_unlock(&pdata->netdev_lock); 2299 } 2300 2301 return res; 2302 } 2303 EXPORT_SYMBOL(ib_device_get_netdev); 2304 2305 /** 2306 * ib_query_netdev_port - Query the port number of a net_device 2307 * associated with an ibdev 2308 * @ibdev: IB device 2309 * @ndev: Network device 2310 * @port: IB port the net_device is connected to 2311 */ 2312 int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, 2313 u32 *port) 2314 { 2315 struct net_device *ib_ndev; 2316 u32 port_num; 2317 2318 rdma_for_each_port(ibdev, port_num) { 2319 ib_ndev = ib_device_get_netdev(ibdev, port_num); 2320 if (ndev == ib_ndev) { 2321 *port = port_num; 2322 dev_put(ib_ndev); 2323 return 0; 2324 } 2325 dev_put(ib_ndev); 2326 } 2327 2328 return -ENOENT; 2329 } 2330 EXPORT_SYMBOL(ib_query_netdev_port); 2331 2332 /** 2333 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2334 * @ndev: netdev to locate 2335 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2336 * 2337 * Find and hold an ib_device that is associated with a netdev via 2338 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2339 * returned pointer. 
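 *
 * A minimal sketch of the lookup/put pairing (illustrative only):
 *
 *	struct ib_device *ibdev;
 *
 *	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *	if (ibdev) {
 *		// the reference pins ibdev until it is released
 *		ib_device_put(ibdev);
 *	}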
2340 */ 2341 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2342 enum rdma_driver_id driver_id) 2343 { 2344 struct ib_device *res = NULL; 2345 struct ib_port_data *cur; 2346 2347 rcu_read_lock(); 2348 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2349 (uintptr_t)ndev) { 2350 if (rcu_access_pointer(cur->netdev) == ndev && 2351 (driver_id == RDMA_DRIVER_UNKNOWN || 2352 cur->ib_dev->ops.driver_id == driver_id) && 2353 ib_device_try_get(cur->ib_dev)) { 2354 res = cur->ib_dev; 2355 break; 2356 } 2357 } 2358 rcu_read_unlock(); 2359 2360 return res; 2361 } 2362 EXPORT_SYMBOL(ib_device_get_by_netdev); 2363 2364 /** 2365 * ib_enum_roce_netdev - enumerate all RoCE ports 2366 * @ib_dev: IB device we want to query 2367 * @filter: Should we call the callback? 2368 * @filter_cookie: Cookie passed to filter 2369 * @cb: Callback to call for each found RoCE port 2370 * @cookie: Cookie passed back to the callback 2371 * 2372 * Enumerates all of the physical RoCE ports of ib_dev 2373 * that are related to a netdevice and calls the callback on each 2374 * port for which the filter() function returns non-zero. 2375 */ 2376 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2377 roce_netdev_filter filter, 2378 void *filter_cookie, 2379 roce_netdev_callback cb, 2380 void *cookie) 2381 { 2382 u32 port; 2383 2384 rdma_for_each_port (ib_dev, port) 2385 if (rdma_protocol_roce(ib_dev, port)) { 2386 struct net_device *idev = 2387 ib_device_get_netdev(ib_dev, port); 2388 2389 if (filter(ib_dev, port, idev, filter_cookie)) 2390 cb(ib_dev, port, idev, cookie); 2391 dev_put(idev); 2392 } 2393 } 2394 2395 /** 2396 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2397 * @filter: Should we call the callback? 2398 * @filter_cookie: Cookie passed to filter 2399 * @cb: Callback to call for each found RoCE port 2400 * @cookie: Cookie passed back to the callback 2401 * 2402 * Enumerates all RoCE devices' physical ports that are related 2403 * to netdevices and calls the callback on each port for which 2404 * the filter() function returns non-zero. 2405 */ 2406 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2407 void *filter_cookie, 2408 roce_netdev_callback cb, 2409 void *cookie) 2410 { 2411 struct ib_device *dev; 2412 unsigned long index; 2413 2414 down_read(&devices_rwsem); 2415 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2416 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2417 up_read(&devices_rwsem); 2418 } 2419 2420 /* 2421 * ib_enum_all_devs - enumerate all ib_devices 2422 * @cb: Callback to call for each found ib_device 2423 * 2424 * Enumerates all ib_devices and calls callback() on each device. 2425 */ 2426 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2427 struct netlink_callback *cb) 2428 { 2429 unsigned long index; 2430 struct ib_device *dev; 2431 unsigned int idx = 0; 2432 int ret = 0; 2433 2434 down_read(&devices_rwsem); 2435 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2436 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2437 continue; 2438 2439 ret = nldev_cb(dev, skb, cb, idx); 2440 if (ret) 2441 break; 2442 idx++; 2443 } 2444 up_read(&devices_rwsem); 2445 return ret; 2446 } 2447 2448 /** 2449 * ib_query_pkey - Get P_Key table entry 2450 * @device:Device to query 2451 * @port_num:Port number to query 2452 * @index:P_Key table index to query 2453 * @pkey:Returned P_Key 2454 * 2455 * ib_query_pkey() fetches the specified P_Key table entry.
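 *
 * A minimal sketch of a caller (illustrative only):
 *
 *	u16 pkey;
 *
 *	if (!ib_query_pkey(ibdev, port_num, 0, &pkey)) {
 *		// on IB ports, index 0 typically holds the default P_Key 0xffff
 *	}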
2456 */ 2457 int ib_query_pkey(struct ib_device *device, 2458 u32 port_num, u16 index, u16 *pkey) 2459 { 2460 if (!rdma_is_port_valid(device, port_num)) 2461 return -EINVAL; 2462 2463 if (!device->ops.query_pkey) 2464 return -EOPNOTSUPP; 2465 2466 return device->ops.query_pkey(device, port_num, index, pkey); 2467 } 2468 EXPORT_SYMBOL(ib_query_pkey); 2469 2470 /** 2471 * ib_modify_device - Change IB device attributes 2472 * @device:Device to modify 2473 * @device_modify_mask:Mask of attributes to change 2474 * @device_modify:New attribute values 2475 * 2476 * ib_modify_device() changes a device's attributes as specified by 2477 * the @device_modify_mask and @device_modify structure. 2478 */ 2479 int ib_modify_device(struct ib_device *device, 2480 int device_modify_mask, 2481 struct ib_device_modify *device_modify) 2482 { 2483 if (!device->ops.modify_device) 2484 return -EOPNOTSUPP; 2485 2486 return device->ops.modify_device(device, device_modify_mask, 2487 device_modify); 2488 } 2489 EXPORT_SYMBOL(ib_modify_device); 2490 2491 /** 2492 * ib_modify_port - Modifies the attributes for the specified port. 2493 * @device: The device to modify. 2494 * @port_num: The number of the port to modify. 2495 * @port_modify_mask: Mask used to specify which attributes of the port 2496 * to change. 2497 * @port_modify: New attribute values for the port. 2498 * 2499 * ib_modify_port() changes a port's attributes as specified by the 2500 * @port_modify_mask and @port_modify structure. 2501 */ 2502 int ib_modify_port(struct ib_device *device, 2503 u32 port_num, int port_modify_mask, 2504 struct ib_port_modify *port_modify) 2505 { 2506 int rc; 2507 2508 if (!rdma_is_port_valid(device, port_num)) 2509 return -EINVAL; 2510 2511 if (device->ops.modify_port) 2512 rc = device->ops.modify_port(device, port_num, 2513 port_modify_mask, 2514 port_modify); 2515 else if (rdma_protocol_roce(device, port_num) && 2516 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2517 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2518 rc = 0; 2519 else 2520 rc = -EOPNOTSUPP; 2521 return rc; 2522 } 2523 EXPORT_SYMBOL(ib_modify_port); 2524 2525 /** 2526 * ib_find_gid - Returns the port number and GID table index where 2527 * a specified GID value occurs. It searches only ports with the IB link layer. 2528 * @device: The device to query. 2529 * @gid: The GID value to search for. 2530 * @port_num: The port number of the device where the GID value was found. 2531 * @index: The index into the GID table where the GID was found. This 2532 * parameter may be NULL. 2533 */ 2534 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2535 u32 *port_num, u16 *index) 2536 { 2537 union ib_gid tmp_gid; 2538 u32 port; 2539 int ret, i; 2540 2541 rdma_for_each_port (device, port) { 2542 if (!rdma_protocol_ib(device, port)) 2543 continue; 2544 2545 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2546 ++i) { 2547 ret = rdma_query_gid(device, port, i, &tmp_gid); 2548 if (ret) 2549 continue; 2550 2551 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2552 *port_num = port; 2553 if (index) 2554 *index = i; 2555 return 0; 2556 } 2557 } 2558 } 2559 2560 return -ENOENT; 2561 } 2562 EXPORT_SYMBOL(ib_find_gid); 2563 2564 /** 2565 * ib_find_pkey - Returns the PKey table index where a specified 2566 * PKey value occurs. 2567 * @device: The device to query. 2568 * @port_num: The port number of the device to search for the PKey. 2569 * @pkey: The PKey value to search for. 2570 * @index: The index into the PKey table where the PKey was found.
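 *
 * A minimal sketch of a caller (illustrative only): look up the index of the
 * default P_Key, preferring a full-member entry as implemented below.
 *
 *	u16 index;
 *
 *	if (!ib_find_pkey(ibdev, port_num, 0xffff, &index)) {
 *		// index can now be used, e.g. when building address handles
 *	}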
2571 */ 2572 int ib_find_pkey(struct ib_device *device, 2573 u32 port_num, u16 pkey, u16 *index) 2574 { 2575 int ret, i; 2576 u16 tmp_pkey; 2577 int partial_ix = -1; 2578 2579 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2580 ++i) { 2581 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2582 if (ret) 2583 return ret; 2584 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2585 /* if there is a full-member pkey, take it */ 2586 if (tmp_pkey & 0x8000) { 2587 *index = i; 2588 return 0; 2589 } 2590 if (partial_ix < 0) 2591 partial_ix = i; 2592 } 2593 } 2594 2595 /* no full member found; take the limited member if one exists */ 2596 if (partial_ix >= 0) { 2597 *index = partial_ix; 2598 return 0; 2599 } 2600 return -ENOENT; 2601 } 2602 EXPORT_SYMBOL(ib_find_pkey); 2603 2604 /** 2605 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2606 * for a received CM request 2607 * @dev: An RDMA device on which the request has been received. 2608 * @port: Port number on the RDMA device. 2609 * @pkey: The Pkey the request came on. 2610 * @gid: A GID that the net_dev uses to communicate. 2611 * @addr: Contains the IP address that the request specified as its 2612 * destination. 2613 * 2614 */ 2615 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2616 u32 port, 2617 u16 pkey, 2618 const union ib_gid *gid, 2619 const struct sockaddr *addr) 2620 { 2621 struct net_device *net_dev = NULL; 2622 unsigned long index; 2623 void *client_data; 2624 2625 if (!rdma_protocol_ib(dev, port)) 2626 return NULL; 2627 2628 /* 2629 * Holding the read side guarantees that the client will not become 2630 * unregistered while we are calling get_net_dev_by_params() 2631 */ 2632 down_read(&dev->client_data_rwsem); 2633 xan_for_each_marked (&dev->client_data, index, client_data, 2634 CLIENT_DATA_REGISTERED) { 2635 struct ib_client *client = xa_load(&clients, index); 2636 2637 if (!client || !client->get_net_dev_by_params) 2638 continue; 2639 2640 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2641 addr, client_data); 2642 if (net_dev) 2643 break; 2644 } 2645 up_read(&dev->client_data_rwsem); 2646 2647 return net_dev; 2648 } 2649 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2650 2651 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2652 { 2653 struct ib_device_ops *dev_ops = &dev->ops; 2654 #define SET_DEVICE_OP(ptr, name) \ 2655 do { \ 2656 if (ops->name) \ 2657 if (!((ptr)->name)) \ 2658 (ptr)->name = ops->name; \ 2659 } while (0) 2660 2661 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2662 2663 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2664 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2665 dev_ops->driver_id != ops->driver_id); 2666 dev_ops->driver_id = ops->driver_id; 2667 } 2668 if (ops->owner) { 2669 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2670 dev_ops->owner = ops->owner; 2671 } 2672 if (ops->uverbs_abi_ver) 2673 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2674 2675 dev_ops->uverbs_no_driver_id_binding |= 2676 ops->uverbs_no_driver_id_binding; 2677 2678 SET_DEVICE_OP(dev_ops, add_gid); 2679 SET_DEVICE_OP(dev_ops, add_sub_dev); 2680 SET_DEVICE_OP(dev_ops, advise_mr); 2681 SET_DEVICE_OP(dev_ops, alloc_dm); 2682 SET_DEVICE_OP(dev_ops, alloc_dmah); 2683 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2684 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2685 SET_DEVICE_OP(dev_ops, alloc_mr); 2686 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2687 SET_DEVICE_OP(dev_ops, alloc_mw); 2688 SET_DEVICE_OP(dev_ops, alloc_pd); 2689
SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2690 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2691 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2692 SET_DEVICE_OP(dev_ops, attach_mcast); 2693 SET_DEVICE_OP(dev_ops, check_mr_status); 2694 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2695 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2696 SET_DEVICE_OP(dev_ops, counter_dealloc); 2697 SET_DEVICE_OP(dev_ops, counter_init); 2698 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2699 SET_DEVICE_OP(dev_ops, counter_update_stats); 2700 SET_DEVICE_OP(dev_ops, create_ah); 2701 SET_DEVICE_OP(dev_ops, create_counters); 2702 SET_DEVICE_OP(dev_ops, create_cq); 2703 SET_DEVICE_OP(dev_ops, create_cq_umem); 2704 SET_DEVICE_OP(dev_ops, create_flow); 2705 SET_DEVICE_OP(dev_ops, create_qp); 2706 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2707 SET_DEVICE_OP(dev_ops, create_srq); 2708 SET_DEVICE_OP(dev_ops, create_user_ah); 2709 SET_DEVICE_OP(dev_ops, create_wq); 2710 SET_DEVICE_OP(dev_ops, dealloc_dm); 2711 SET_DEVICE_OP(dev_ops, dealloc_dmah); 2712 SET_DEVICE_OP(dev_ops, dealloc_driver); 2713 SET_DEVICE_OP(dev_ops, dealloc_mw); 2714 SET_DEVICE_OP(dev_ops, dealloc_pd); 2715 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2716 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2717 SET_DEVICE_OP(dev_ops, del_gid); 2718 SET_DEVICE_OP(dev_ops, del_sub_dev); 2719 SET_DEVICE_OP(dev_ops, dereg_mr); 2720 SET_DEVICE_OP(dev_ops, destroy_ah); 2721 SET_DEVICE_OP(dev_ops, destroy_counters); 2722 SET_DEVICE_OP(dev_ops, destroy_cq); 2723 SET_DEVICE_OP(dev_ops, destroy_flow); 2724 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2725 SET_DEVICE_OP(dev_ops, destroy_qp); 2726 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2727 SET_DEVICE_OP(dev_ops, destroy_srq); 2728 SET_DEVICE_OP(dev_ops, destroy_wq); 2729 SET_DEVICE_OP(dev_ops, device_group); 2730 SET_DEVICE_OP(dev_ops, detach_mcast); 2731 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2732 SET_DEVICE_OP(dev_ops, drain_rq); 2733 SET_DEVICE_OP(dev_ops, drain_sq); 2734 SET_DEVICE_OP(dev_ops, enable_driver); 2735 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2736 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2737 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2738 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2739 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2740 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2741 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2742 SET_DEVICE_OP(dev_ops, fill_res_srq_entry); 2743 SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); 2744 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2745 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2746 SET_DEVICE_OP(dev_ops, get_dma_mr); 2747 SET_DEVICE_OP(dev_ops, get_hw_stats); 2748 SET_DEVICE_OP(dev_ops, get_link_layer); 2749 SET_DEVICE_OP(dev_ops, get_netdev); 2750 SET_DEVICE_OP(dev_ops, get_numa_node); 2751 SET_DEVICE_OP(dev_ops, get_port_immutable); 2752 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2753 SET_DEVICE_OP(dev_ops, get_vf_config); 2754 SET_DEVICE_OP(dev_ops, get_vf_guid); 2755 SET_DEVICE_OP(dev_ops, get_vf_stats); 2756 SET_DEVICE_OP(dev_ops, iw_accept); 2757 SET_DEVICE_OP(dev_ops, iw_add_ref); 2758 SET_DEVICE_OP(dev_ops, iw_connect); 2759 SET_DEVICE_OP(dev_ops, iw_create_listen); 2760 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2761 SET_DEVICE_OP(dev_ops, iw_get_qp); 2762 SET_DEVICE_OP(dev_ops, iw_reject); 2763 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2764 SET_DEVICE_OP(dev_ops, map_mr_sg); 2765 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2766 SET_DEVICE_OP(dev_ops, mmap); 2767 SET_DEVICE_OP(dev_ops, mmap_get_pfns); 2768 SET_DEVICE_OP(dev_ops, mmap_free); 2769 
SET_DEVICE_OP(dev_ops, modify_ah); 2770 SET_DEVICE_OP(dev_ops, modify_cq); 2771 SET_DEVICE_OP(dev_ops, modify_device); 2772 SET_DEVICE_OP(dev_ops, modify_hw_stat); 2773 SET_DEVICE_OP(dev_ops, modify_port); 2774 SET_DEVICE_OP(dev_ops, modify_qp); 2775 SET_DEVICE_OP(dev_ops, modify_srq); 2776 SET_DEVICE_OP(dev_ops, modify_wq); 2777 SET_DEVICE_OP(dev_ops, peek_cq); 2778 SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry); 2779 SET_DEVICE_OP(dev_ops, pre_destroy_cq); 2780 SET_DEVICE_OP(dev_ops, poll_cq); 2781 SET_DEVICE_OP(dev_ops, port_groups); 2782 SET_DEVICE_OP(dev_ops, post_destroy_cq); 2783 SET_DEVICE_OP(dev_ops, post_recv); 2784 SET_DEVICE_OP(dev_ops, post_send); 2785 SET_DEVICE_OP(dev_ops, post_srq_recv); 2786 SET_DEVICE_OP(dev_ops, process_mad); 2787 SET_DEVICE_OP(dev_ops, query_ah); 2788 SET_DEVICE_OP(dev_ops, query_device); 2789 SET_DEVICE_OP(dev_ops, query_gid); 2790 SET_DEVICE_OP(dev_ops, query_pkey); 2791 SET_DEVICE_OP(dev_ops, query_port); 2792 SET_DEVICE_OP(dev_ops, query_port_speed); 2793 SET_DEVICE_OP(dev_ops, query_qp); 2794 SET_DEVICE_OP(dev_ops, query_srq); 2795 SET_DEVICE_OP(dev_ops, query_ucontext); 2796 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2797 SET_DEVICE_OP(dev_ops, read_counters); 2798 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2799 SET_DEVICE_OP(dev_ops, reg_user_mr); 2800 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2801 SET_DEVICE_OP(dev_ops, req_notify_cq); 2802 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2803 SET_DEVICE_OP(dev_ops, resize_cq); 2804 SET_DEVICE_OP(dev_ops, set_vf_guid); 2805 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2806 SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); 2807 SET_DEVICE_OP(dev_ops, report_port_event); 2808 2809 SET_OBJ_SIZE(dev_ops, ib_ah); 2810 SET_OBJ_SIZE(dev_ops, ib_counters); 2811 SET_OBJ_SIZE(dev_ops, ib_cq); 2812 SET_OBJ_SIZE(dev_ops, ib_dmah); 2813 SET_OBJ_SIZE(dev_ops, ib_mw); 2814 SET_OBJ_SIZE(dev_ops, ib_pd); 2815 SET_OBJ_SIZE(dev_ops, ib_qp); 2816 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2817 SET_OBJ_SIZE(dev_ops, ib_srq); 2818 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2819 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2820 SET_OBJ_SIZE(dev_ops, rdma_counter); 2821 } 2822 EXPORT_SYMBOL(ib_set_device_ops); 2823 2824 int ib_add_sub_device(struct ib_device *parent, 2825 enum rdma_nl_dev_type type, 2826 const char *name) 2827 { 2828 struct ib_device *sub; 2829 int ret = 0; 2830 2831 if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) 2832 return -EOPNOTSUPP; 2833 2834 if (!ib_device_try_get(parent)) 2835 return -EINVAL; 2836 2837 sub = parent->ops.add_sub_dev(parent, type, name); 2838 if (IS_ERR(sub)) { 2839 ib_device_put(parent); 2840 return PTR_ERR(sub); 2841 } 2842 2843 sub->type = type; 2844 sub->parent = parent; 2845 2846 mutex_lock(&parent->subdev_lock); 2847 list_add_tail(&parent->subdev_list_head, &sub->subdev_list); 2848 mutex_unlock(&parent->subdev_lock); 2849 2850 return ret; 2851 } 2852 2853 int ib_del_sub_device_and_put(struct ib_device *sub) 2854 { 2855 struct ib_device *parent = sub->parent; 2856 2857 if (!parent) { 2858 ib_device_put(sub); 2859 return -EOPNOTSUPP; 2860 } 2861 2862 mutex_lock(&parent->subdev_lock); 2863 list_del(&sub->subdev_list); 2864 mutex_unlock(&parent->subdev_lock); 2865 2866 ib_device_put(sub); 2867 parent->ops.del_sub_dev(sub); 2868 ib_device_put(parent); 2869 2870 return 0; 2871 } 2872 2873 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2874 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2875 { 2876 struct scatterlist *s; 2877 int i; 2878 2879 for_each_sg(sg, s, nents, i) { 2880 sg_dma_address(s) 
= (uintptr_t)sg_virt(s); 2881 sg_dma_len(s) = s->length; 2882 } 2883 return nents; 2884 } 2885 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2886 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2887 2888 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2889 [RDMA_NL_LS_OP_RESOLVE] = { 2890 .doit = ib_nl_handle_resolve_resp, 2891 .flags = RDMA_NL_ADMIN_PERM, 2892 }, 2893 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2894 .doit = ib_nl_handle_set_timeout, 2895 .flags = RDMA_NL_ADMIN_PERM, 2896 }, 2897 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2898 .doit = ib_nl_handle_ip_res_resp, 2899 .flags = RDMA_NL_ADMIN_PERM, 2900 }, 2901 }; 2902 2903 void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) 2904 { 2905 enum ib_port_state curr_state; 2906 struct ib_event ibevent = {}; 2907 u32 port; 2908 2909 if (ib_query_netdev_port(ibdev, ndev, &port)) 2910 return; 2911 2912 curr_state = ib_get_curr_port_state(ndev); 2913 2914 write_lock_irq(&ibdev->cache_lock); 2915 if (ibdev->port_data[port].cache.last_port_state == curr_state) { 2916 write_unlock_irq(&ibdev->cache_lock); 2917 return; 2918 } 2919 ibdev->port_data[port].cache.last_port_state = curr_state; 2920 write_unlock_irq(&ibdev->cache_lock); 2921 2922 ibevent.event = (curr_state == IB_PORT_DOWN) ? 2923 IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; 2924 ibevent.device = ibdev; 2925 ibevent.element.port_num = port; 2926 ib_dispatch_event(&ibevent); 2927 } 2928 EXPORT_SYMBOL(ib_dispatch_port_state_event); 2929 2930 static void handle_port_event(struct net_device *ndev, unsigned long event) 2931 { 2932 struct ib_device *ibdev; 2933 2934 /* Currently, link events in bonding scenarios are still 2935 * reported by drivers that support bonding. 2936 */ 2937 if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) 2938 return; 2939 2940 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2941 if (!ibdev) 2942 return; 2943 2944 if (ibdev->ops.report_port_event) { 2945 ibdev->ops.report_port_event(ibdev, ndev, event); 2946 goto put_ibdev; 2947 } 2948 2949 ib_dispatch_port_state_event(ibdev, ndev); 2950 2951 put_ibdev: 2952 ib_device_put(ibdev); 2953 }; 2954 2955 static int ib_netdevice_event(struct notifier_block *this, 2956 unsigned long event, void *ptr) 2957 { 2958 struct net_device *ndev = netdev_notifier_info_to_dev(ptr); 2959 struct ib_device *ibdev; 2960 u32 port; 2961 2962 switch (event) { 2963 case NETDEV_CHANGENAME: 2964 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2965 if (!ibdev) 2966 return NOTIFY_DONE; 2967 2968 if (ib_query_netdev_port(ibdev, ndev, &port)) { 2969 ib_device_put(ibdev); 2970 break; 2971 } 2972 2973 rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); 2974 ib_device_put(ibdev); 2975 break; 2976 2977 case NETDEV_UP: 2978 case NETDEV_CHANGE: 2979 case NETDEV_DOWN: 2980 handle_port_event(ndev, event); 2981 break; 2982 2983 default: 2984 break; 2985 } 2986 2987 return NOTIFY_DONE; 2988 } 2989 2990 static struct notifier_block nb_netdevice = { 2991 .notifier_call = ib_netdevice_event, 2992 }; 2993 2994 static int __init ib_core_init(void) 2995 { 2996 int ret = -ENOMEM; 2997 2998 ib_wq = alloc_workqueue("infiniband", WQ_PERCPU, 0); 2999 if (!ib_wq) 3000 return -ENOMEM; 3001 3002 ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, 3003 WQ_UNBOUND_MAX_ACTIVE); 3004 if (!ib_unreg_wq) 3005 goto err; 3006 3007 ib_comp_wq = alloc_workqueue("ib-comp-wq", 3008 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS | WQ_PERCPU, 0); 3009 if (!ib_comp_wq) 3010 goto err_unbound; 3011 3012 ib_comp_unbound_wq = 3013 
alloc_workqueue("ib-comp-unb-wq", 3014 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 3015 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 3016 if (!ib_comp_unbound_wq) 3017 goto err_comp; 3018 3019 ret = class_register(&ib_class); 3020 if (ret) { 3021 pr_warn("Couldn't create InfiniBand device class\n"); 3022 goto err_comp_unbound; 3023 } 3024 3025 rdma_nl_init(); 3026 3027 ret = addr_init(); 3028 if (ret) { 3029 pr_warn("Couldn't init IB address resolution\n"); 3030 goto err_ibnl; 3031 } 3032 3033 ret = ib_mad_init(); 3034 if (ret) { 3035 pr_warn("Couldn't init IB MAD\n"); 3036 goto err_addr; 3037 } 3038 3039 ret = ib_sa_init(); 3040 if (ret) { 3041 pr_warn("Couldn't init SA\n"); 3042 goto err_mad; 3043 } 3044 3045 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 3046 if (ret) { 3047 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 3048 goto err_sa; 3049 } 3050 3051 ret = register_pernet_device(&rdma_dev_net_ops); 3052 if (ret) { 3053 pr_warn("Couldn't init compat dev. ret %d\n", ret); 3054 goto err_compat; 3055 } 3056 3057 nldev_init(); 3058 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 3059 ret = roce_gid_mgmt_init(); 3060 if (ret) { 3061 pr_warn("Couldn't init RoCE GID management\n"); 3062 goto err_parent; 3063 } 3064 3065 register_netdevice_notifier(&nb_netdevice); 3066 3067 return 0; 3068 3069 err_parent: 3070 rdma_nl_unregister(RDMA_NL_LS); 3071 nldev_exit(); 3072 unregister_pernet_device(&rdma_dev_net_ops); 3073 err_compat: 3074 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3075 err_sa: 3076 ib_sa_cleanup(); 3077 err_mad: 3078 ib_mad_cleanup(); 3079 err_addr: 3080 addr_cleanup(); 3081 err_ibnl: 3082 class_unregister(&ib_class); 3083 err_comp_unbound: 3084 destroy_workqueue(ib_comp_unbound_wq); 3085 err_comp: 3086 destroy_workqueue(ib_comp_wq); 3087 err_unbound: 3088 destroy_workqueue(ib_unreg_wq); 3089 err: 3090 destroy_workqueue(ib_wq); 3091 return ret; 3092 } 3093 3094 static void __exit ib_core_cleanup(void) 3095 { 3096 unregister_netdevice_notifier(&nb_netdevice); 3097 roce_gid_mgmt_cleanup(); 3098 rdma_nl_unregister(RDMA_NL_LS); 3099 nldev_exit(); 3100 unregister_pernet_device(&rdma_dev_net_ops); 3101 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3102 ib_sa_cleanup(); 3103 ib_mad_cleanup(); 3104 addr_cleanup(); 3105 rdma_nl_exit(); 3106 class_unregister(&ib_class); 3107 destroy_workqueue(ib_comp_unbound_wq); 3108 destroy_workqueue(ib_comp_wq); 3109 /* Make sure that any pending umem accounting work is done. */ 3110 destroy_workqueue(ib_wq); 3111 destroy_workqueue(ib_unreg_wq); 3112 WARN_ON(!xa_empty(&clients)); 3113 WARN_ON(!xa_empty(&devices)); 3114 } 3115 3116 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 3117 3118 /* ib core relies on netdev stack to first register net_ns_type_operations 3119 * ns kobject type before ib_core initialization. 3120 */ 3121 fs_initcall(ib_core_init); 3122 module_exit(ib_core_cleanup); 3123