1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <rdma/rdma_netlink.h> 46 #include <rdma/ib_addr.h> 47 #include <rdma/ib_cache.h> 48 #include <rdma/rdma_counter.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 static struct workqueue_struct *ib_unreg_wq; 62 63 /* 64 * Each of the three rwsem locks (devices, clients, client_data) protects the 65 * xarray of the same name. Specifically it allows the caller to assert that 66 * the MARK will/will not be changing under the lock, and for devices and 67 * clients, that the value in the xarray is still a valid pointer. Change of 68 * the MARK is linked to the object state, so holding the lock and testing the 69 * MARK also asserts that the contained object is in a certain state. 70 * 71 * This is used to build a two stage register/unregister flow where objects 72 * can continue to be in the xarray even though they are still in progress to 73 * register/unregister. 74 * 75 * The xarray itself provides additional locking, and restartable iteration, 76 * which is also relied on. 77 * 78 * Locks should not be nested, with the exception of client_data, which is 79 * allowed to nest under the read side of the other two locks. 80 * 81 * The devices_rwsem also protects the device name list, any change or 82 * assignment of device name must also hold the write side to guarantee unique 83 * names. 84 */ 85 86 /* 87 * devices contains devices that have had their names assigned. 
 * The devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
	if (refcount_dec_and_test(&client->uses))
		complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access is to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, the rdma device's
 * net namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
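 *
 * As a sketch of the intended use, the loop below mirrors how
 * ib_device_rename() later in this file walks the registered client data,
 * including entries whose stored value is still NULL:
 *
 *	unsigned long index;
 *	void *client_data;
 *
 *	xan_for_each_marked(&device->client_data, index, client_data,
 *			    CLIENT_DATA_REGISTERED) {
 *		struct ib_client *client = xa_load(&clients, index);
 *
 *		if (client && client->rename)
 *			client->rename(device, client_data);
 *	}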
154 */ 155 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 156 xa_mark_t filter) 157 { 158 XA_STATE(xas, xa, *indexp); 159 void *entry; 160 161 rcu_read_lock(); 162 do { 163 entry = xas_find_marked(&xas, ULONG_MAX, filter); 164 if (xa_is_zero(entry)) 165 break; 166 } while (xas_retry(&xas, entry)); 167 rcu_read_unlock(); 168 169 if (entry) { 170 *indexp = xas.xa_index; 171 if (xa_is_zero(entry)) 172 return NULL; 173 return entry; 174 } 175 return XA_ERROR(-ENOENT); 176 } 177 #define xan_for_each_marked(xa, index, entry, filter) \ 178 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 179 !xa_is_err(entry); \ 180 (index)++, entry = xan_find_marked(xa, &(index), filter)) 181 182 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 183 static DEFINE_SPINLOCK(ndev_hash_lock); 184 static DECLARE_HASHTABLE(ndev_hash, 5); 185 186 static void free_netdevs(struct ib_device *ib_dev); 187 static void ib_unregister_work(struct work_struct *work); 188 static void __ib_unregister_device(struct ib_device *device); 189 static int ib_security_change(struct notifier_block *nb, unsigned long event, 190 void *lsm_data); 191 static void ib_policy_change_task(struct work_struct *work); 192 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 193 194 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 195 struct va_format *vaf) 196 { 197 if (ibdev && ibdev->dev.parent) 198 dev_printk_emit(level[1] - '0', 199 ibdev->dev.parent, 200 "%s %s %s: %pV", 201 dev_driver_string(ibdev->dev.parent), 202 dev_name(ibdev->dev.parent), 203 dev_name(&ibdev->dev), 204 vaf); 205 else if (ibdev) 206 printk("%s%s: %pV", 207 level, dev_name(&ibdev->dev), vaf); 208 else 209 printk("%s(NULL ib_device): %pV", level, vaf); 210 } 211 212 #define define_ibdev_printk_level(func, level) \ 213 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 214 { \ 215 struct va_format vaf; \ 216 va_list args; \ 217 \ 218 va_start(args, fmt); \ 219 \ 220 vaf.fmt = fmt; \ 221 vaf.va = &args; \ 222 \ 223 __ibdev_printk(level, ibdev, &vaf); \ 224 \ 225 va_end(args); \ 226 } \ 227 EXPORT_SYMBOL(func); 228 229 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 230 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 231 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 232 define_ibdev_printk_level(ibdev_err, KERN_ERR); 233 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 234 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 235 define_ibdev_printk_level(ibdev_info, KERN_INFO); 236 237 static struct notifier_block ibdev_lsm_nb = { 238 .notifier_call = ib_security_change, 239 }; 240 241 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 242 struct net *net); 243 244 /* Pointer to the RCU head at the start of the ib_port_data array */ 245 struct ib_port_data_rcu { 246 struct rcu_head rcu_head; 247 struct ib_port_data pdata[]; 248 }; 249 250 static void ib_device_check_mandatory(struct ib_device *device) 251 { 252 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 253 static const struct { 254 size_t offset; 255 char *name; 256 } mandatory_table[] = { 257 IB_MANDATORY_FUNC(query_device), 258 IB_MANDATORY_FUNC(query_port), 259 IB_MANDATORY_FUNC(alloc_pd), 260 IB_MANDATORY_FUNC(dealloc_pd), 261 IB_MANDATORY_FUNC(create_qp), 262 IB_MANDATORY_FUNC(modify_qp), 263 IB_MANDATORY_FUNC(destroy_qp), 264 IB_MANDATORY_FUNC(post_send), 265 IB_MANDATORY_FUNC(post_recv), 266 IB_MANDATORY_FUNC(create_cq), 267 IB_MANDATORY_FUNC(destroy_cq), 268 IB_MANDATORY_FUNC(poll_cq), 269 IB_MANDATORY_FUNC(req_notify_cq), 270 IB_MANDATORY_FUNC(get_dma_mr), 271 IB_MANDATORY_FUNC(reg_user_mr), 272 IB_MANDATORY_FUNC(dereg_mr), 273 IB_MANDATORY_FUNC(get_port_immutable) 274 }; 275 int i; 276 277 device->kverbs_provider = true; 278 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 279 if (!*(void **) ((void *) &device->ops + 280 mandatory_table[i].offset)) { 281 device->kverbs_provider = false; 282 break; 283 } 284 } 285 } 286 287 /* 288 * Caller must perform ib_device_put() to return the device reference count 289 * when ib_device_get_by_index() returns valid device pointer. 290 */ 291 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 292 { 293 struct ib_device *device; 294 295 down_read(&devices_rwsem); 296 device = xa_load(&devices, index); 297 if (device) { 298 if (!rdma_dev_access_netns(device, net)) { 299 device = NULL; 300 goto out; 301 } 302 303 if (!ib_device_try_get(device)) 304 device = NULL; 305 } 306 out: 307 up_read(&devices_rwsem); 308 return device; 309 } 310 311 /** 312 * ib_device_put - Release IB device reference 313 * @device: device whose reference to be released 314 * 315 * ib_device_put() releases reference to the IB device to allow it to be 316 * unregistered and eventually free. 
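 *
 * A minimal usage sketch (error handling trimmed): the reference taken
 * internally by ib_device_get_by_index() is paired with ib_device_put()
 * once the caller is done; do_something() is just a placeholder:
 *
 *	dev = ib_device_get_by_index(net, index);
 *	if (!dev)
 *		return -ENODEV;
 *	do_something(dev);
 *	ib_device_put(dev);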
317 */ 318 void ib_device_put(struct ib_device *device) 319 { 320 if (refcount_dec_and_test(&device->refcount)) 321 complete(&device->unreg_completion); 322 } 323 EXPORT_SYMBOL(ib_device_put); 324 325 static struct ib_device *__ib_device_get_by_name(const char *name) 326 { 327 struct ib_device *device; 328 unsigned long index; 329 330 xa_for_each (&devices, index, device) 331 if (!strcmp(name, dev_name(&device->dev))) 332 return device; 333 334 return NULL; 335 } 336 337 /** 338 * ib_device_get_by_name - Find an IB device by name 339 * @name: The name to look for 340 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 341 * 342 * Find and hold an ib_device by its name. The caller must call 343 * ib_device_put() on the returned pointer. 344 */ 345 struct ib_device *ib_device_get_by_name(const char *name, 346 enum rdma_driver_id driver_id) 347 { 348 struct ib_device *device; 349 350 down_read(&devices_rwsem); 351 device = __ib_device_get_by_name(name); 352 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 353 device->ops.driver_id != driver_id) 354 device = NULL; 355 356 if (device) { 357 if (!ib_device_try_get(device)) 358 device = NULL; 359 } 360 up_read(&devices_rwsem); 361 return device; 362 } 363 EXPORT_SYMBOL(ib_device_get_by_name); 364 365 static int rename_compat_devs(struct ib_device *device) 366 { 367 struct ib_core_device *cdev; 368 unsigned long index; 369 int ret = 0; 370 371 mutex_lock(&device->compat_devs_mutex); 372 xa_for_each (&device->compat_devs, index, cdev) { 373 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 374 if (ret) { 375 dev_warn(&cdev->dev, 376 "Fail to rename compatdev to new name %s\n", 377 dev_name(&device->dev)); 378 break; 379 } 380 } 381 mutex_unlock(&device->compat_devs_mutex); 382 return ret; 383 } 384 385 int ib_device_rename(struct ib_device *ibdev, const char *name) 386 { 387 unsigned long index; 388 void *client_data; 389 int ret; 390 391 down_write(&devices_rwsem); 392 if (!strcmp(name, dev_name(&ibdev->dev))) { 393 up_write(&devices_rwsem); 394 return 0; 395 } 396 397 if (__ib_device_get_by_name(name)) { 398 up_write(&devices_rwsem); 399 return -EEXIST; 400 } 401 402 ret = device_rename(&ibdev->dev, name); 403 if (ret) { 404 up_write(&devices_rwsem); 405 return ret; 406 } 407 408 strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 409 ret = rename_compat_devs(ibdev); 410 411 downgrade_write(&devices_rwsem); 412 down_read(&ibdev->client_data_rwsem); 413 xan_for_each_marked(&ibdev->client_data, index, client_data, 414 CLIENT_DATA_REGISTERED) { 415 struct ib_client *client = xa_load(&clients, index); 416 417 if (!client || !client->rename) 418 continue; 419 420 client->rename(ibdev, client_data); 421 } 422 up_read(&ibdev->client_data_rwsem); 423 rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT); 424 up_read(&devices_rwsem); 425 return 0; 426 } 427 428 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 429 { 430 if (use_dim > 1) 431 return -EINVAL; 432 ibdev->use_cq_dim = use_dim; 433 434 return 0; 435 } 436 437 static int alloc_name(struct ib_device *ibdev, const char *name) 438 { 439 struct ib_device *device; 440 unsigned long index; 441 struct ida inuse; 442 int rc; 443 int i; 444 445 lockdep_assert_held_write(&devices_rwsem); 446 ida_init(&inuse); 447 xa_for_each (&devices, index, device) { 448 char buf[IB_DEVICE_NAME_MAX]; 449 450 if (sscanf(dev_name(&device->dev), name, &i) != 1) 451 continue; 452 if (i < 0 || i >= INT_MAX) 453 continue; 454 snprintf(buf, sizeof buf, name, i); 455 if (strcmp(buf, 
dev_name(&device->dev)) != 0) 456 continue; 457 458 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); 459 if (rc < 0) 460 goto out; 461 } 462 463 rc = ida_alloc(&inuse, GFP_KERNEL); 464 if (rc < 0) 465 goto out; 466 467 rc = dev_set_name(&ibdev->dev, name, rc); 468 out: 469 ida_destroy(&inuse); 470 return rc; 471 } 472 473 static void ib_device_release(struct device *device) 474 { 475 struct ib_device *dev = container_of(device, struct ib_device, dev); 476 477 free_netdevs(dev); 478 WARN_ON(refcount_read(&dev->refcount)); 479 if (dev->hw_stats_data) 480 ib_device_release_hw_stats(dev->hw_stats_data); 481 if (dev->port_data) { 482 ib_cache_release_one(dev); 483 ib_security_release_port_pkey_list(dev); 484 rdma_counter_release(dev); 485 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 486 pdata[0]), 487 rcu_head); 488 } 489 490 mutex_destroy(&dev->subdev_lock); 491 mutex_destroy(&dev->unregistration_lock); 492 mutex_destroy(&dev->compat_devs_mutex); 493 494 xa_destroy(&dev->compat_devs); 495 xa_destroy(&dev->client_data); 496 kfree_rcu(dev, rcu_head); 497 } 498 499 static int ib_device_uevent(const struct device *device, 500 struct kobj_uevent_env *env) 501 { 502 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 503 return -ENOMEM; 504 505 /* 506 * It would be nice to pass the node GUID with the event... 507 */ 508 509 return 0; 510 } 511 512 static const void *net_namespace(const struct device *d) 513 { 514 const struct ib_core_device *coredev = 515 container_of(d, struct ib_core_device, dev); 516 517 return read_pnet(&coredev->rdma_net); 518 } 519 520 static struct class ib_class = { 521 .name = "infiniband", 522 .dev_release = ib_device_release, 523 .dev_uevent = ib_device_uevent, 524 .ns_type = &net_ns_type_operations, 525 .namespace = net_namespace, 526 }; 527 528 static void rdma_init_coredev(struct ib_core_device *coredev, 529 struct ib_device *dev, struct net *net) 530 { 531 /* This BUILD_BUG_ON is intended to catch layout change 532 * of union of ib_core_device and device. 533 * dev must be the first element as ib_core and providers 534 * driver uses it. Adding anything in ib_core_device before 535 * device will break this assumption. 536 */ 537 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 538 offsetof(struct ib_device, dev)); 539 540 coredev->dev.class = &ib_class; 541 coredev->dev.groups = dev->groups; 542 device_initialize(&coredev->dev); 543 coredev->owner = dev; 544 INIT_LIST_HEAD(&coredev->port_list); 545 write_pnet(&coredev->rdma_net, net); 546 } 547 548 /** 549 * _ib_alloc_device - allocate an IB device struct 550 * @size:size of structure to allocate 551 * 552 * Low-level drivers should use ib_alloc_device() to allocate &struct 553 * ib_device. @size is the size of the structure to be allocated, 554 * including any private data used by the low-level driver. 555 * ib_dealloc_device() must be used to free structures allocated with 556 * ib_alloc_device(). 
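 *
 * As an illustration, a hypothetical driver would normally embed the
 * struct ib_device as the first member of its own private structure and
 * allocate both through the ib_alloc_device() wrapper:
 *
 *	struct my_drv_dev {
 *		struct ib_device ibdev;
 *		u32 my_private_state;
 *	};
 *
 *	struct my_drv_dev *mydev = ib_alloc_device(my_drv_dev, ibdev);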
557 */ 558 struct ib_device *_ib_alloc_device(size_t size) 559 { 560 struct ib_device *device; 561 unsigned int i; 562 563 if (WARN_ON(size < sizeof(struct ib_device))) 564 return NULL; 565 566 device = kzalloc(size, GFP_KERNEL); 567 if (!device) 568 return NULL; 569 570 if (rdma_restrack_init(device)) { 571 kfree(device); 572 return NULL; 573 } 574 575 rdma_init_coredev(&device->coredev, device, &init_net); 576 577 INIT_LIST_HEAD(&device->event_handler_list); 578 spin_lock_init(&device->qp_open_list_lock); 579 init_rwsem(&device->event_handler_rwsem); 580 mutex_init(&device->unregistration_lock); 581 /* 582 * client_data needs to be alloc because we don't want our mark to be 583 * destroyed if the user stores NULL in the client data. 584 */ 585 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 586 init_rwsem(&device->client_data_rwsem); 587 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 588 mutex_init(&device->compat_devs_mutex); 589 init_completion(&device->unreg_completion); 590 INIT_WORK(&device->unregistration_work, ib_unregister_work); 591 592 spin_lock_init(&device->cq_pools_lock); 593 for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) 594 INIT_LIST_HEAD(&device->cq_pools[i]); 595 596 rwlock_init(&device->cache_lock); 597 598 device->uverbs_cmd_mask = 599 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | 600 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | 601 BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | 602 BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | 603 BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | 604 BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | 605 BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | 606 BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | 607 BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | 608 BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | 609 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | 610 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | 611 BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | 612 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | 613 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | 614 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | 615 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | 616 BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | 617 BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | 618 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | 619 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | 620 BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | 621 BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | 622 BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | 623 BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | 624 BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | 625 BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | 626 BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | 627 BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | 628 BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); 629 630 mutex_init(&device->subdev_lock); 631 INIT_LIST_HEAD(&device->subdev_list_head); 632 INIT_LIST_HEAD(&device->subdev_list); 633 634 return device; 635 } 636 EXPORT_SYMBOL(_ib_alloc_device); 637 638 /** 639 * ib_dealloc_device - free an IB device struct 640 * @device:structure to free 641 * 642 * Free a structure allocated with ib_alloc_device(). 643 */ 644 void ib_dealloc_device(struct ib_device *device) 645 { 646 if (device->ops.dealloc_driver) 647 device->ops.dealloc_driver(device); 648 649 /* 650 * ib_unregister_driver() requires all devices to remain in the xarray 651 * while their ops are callable. The last op we call is dealloc_driver 652 * above. This is needed to create a fence on op callbacks prior to 653 * allowing the driver module to unload. 
	 */
	down_write(&devices_rwsem);
	if (xa_load(&devices, device->index) == device)
		xa_erase(&devices, device->index);
	up_write(&devices_rwsem);

	/* Expedite releasing netdev references */
	free_netdevs(device);

	WARN_ON(!xa_empty(&device->compat_devs));
	WARN_ON(!xa_empty(&device->client_data));
	WARN_ON(refcount_read(&device->refcount));
	rdma_restrack_clean(device);
	/* Balances with device_initialize */
	put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence; any caller must not return until the add
 * or remove is fully completed.
 */
static int add_client_context(struct ib_device *device,
			      struct ib_client *client)
{
	int ret = 0;

	if (!device->kverbs_provider && !client->no_kverbs_req)
		return 0;

	down_write(&device->client_data_rwsem);
	/*
	 * So long as the client is registered hold both the client and device
	 * unregistration locks.
	 */
	if (!refcount_inc_not_zero(&client->uses))
		goto out_unlock;
	refcount_inc(&device->refcount);

	/*
	 * Another caller to add_client_context got here first and has already
	 * completely initialized context.
	 */
	if (xa_get_mark(&device->client_data, client->client_id,
			CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add) {
		if (client->add(device)) {
			/*
			 * If a client fails to add then the error code is
			 * ignored, but we won't call any more ops on this
			 * client.
			 */
			xa_erase(&device->client_data, client->client_id);
			up_read(&device->client_data_rwsem);
			ib_device_put(device);
			ib_client_put(client);
			return 0;
		}
	}

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	ib_device_put(device);
	ib_client_put(client);
out_unlock:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	up_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	ib_device_put(device);
	ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	u32 port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/* Reserve U32_MAX so the logic to go over all the ports is sane */
	if (WARN_ON(device->phys_port_cnt == U32_MAX))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1 based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					size_add(rdma_end_port(device), 1)),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
	 */
	device->port_data = pdata_rcu->pdata;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		pdata->ib_dev = device;
		spin_lock_init(&pdata->pkey_list_lock);
		INIT_LIST_HEAD(&pdata->pkey_list);
		spin_lock_init(&pdata->netdev_lock);
		INIT_HLIST_NODE(&pdata->ndev_hash_link);
	}
	return 0;
}

static int verify_immutable(const struct ib_device *dev, u32 port)
{
	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
		       rdma_max_mad_size(dev, port) != 0);
}

static int setup_port_data(struct ib_device *device)
{
	u32 port;
	int ret;

	ret = alloc_port_data(device);
	if (ret)
		return ret;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		ret = device->ops.get_port_immutable(device, port,
						     &pdata->immutable);
		if (ret)
			return ret;

		if (verify_immutable(device, port))
			return -EINVAL;
	}
	return 0;
}

/**
 * ib_port_immutable_read() - Read rdma port's immutable data
 * @dev: IB device
 * @port: port number whose immutable data to read. It starts at index 1 and
 *        is valid up to and including rdma_end_port().
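 *
 * A short sketch of the intended use together with rdma_for_each_port():
 *
 *	u32 port;
 *
 *	rdma_for_each_port(dev, port) {
 *		const struct ib_port_immutable *immutable =
 *			ib_port_immutable_read(dev, port);
 *
 *		pr_debug("port %u pkey table length %d\n",
 *			 port, immutable->pkey_tbl_len);
 *	}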
854 */ 855 const struct ib_port_immutable* 856 ib_port_immutable_read(struct ib_device *dev, unsigned int port) 857 { 858 WARN_ON(!rdma_is_port_valid(dev, port)); 859 return &dev->port_data[port].immutable; 860 } 861 EXPORT_SYMBOL(ib_port_immutable_read); 862 863 void ib_get_device_fw_str(struct ib_device *dev, char *str) 864 { 865 if (dev->ops.get_dev_fw_str) 866 dev->ops.get_dev_fw_str(dev, str); 867 else 868 str[0] = '\0'; 869 } 870 EXPORT_SYMBOL(ib_get_device_fw_str); 871 872 static void ib_policy_change_task(struct work_struct *work) 873 { 874 struct ib_device *dev; 875 unsigned long index; 876 877 down_read(&devices_rwsem); 878 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 879 unsigned int i; 880 881 rdma_for_each_port (dev, i) { 882 u64 sp; 883 ib_get_cached_subnet_prefix(dev, i, &sp); 884 ib_security_cache_change(dev, i, sp); 885 } 886 } 887 up_read(&devices_rwsem); 888 } 889 890 static int ib_security_change(struct notifier_block *nb, unsigned long event, 891 void *lsm_data) 892 { 893 if (event != LSM_POLICY_CHANGE) 894 return NOTIFY_DONE; 895 896 schedule_work(&ib_policy_change_work); 897 ib_mad_agent_security_change(); 898 899 return NOTIFY_OK; 900 } 901 902 static void compatdev_release(struct device *dev) 903 { 904 struct ib_core_device *cdev = 905 container_of(dev, struct ib_core_device, dev); 906 907 kfree(cdev); 908 } 909 910 static int add_one_compat_dev(struct ib_device *device, 911 struct rdma_dev_net *rnet) 912 { 913 struct ib_core_device *cdev; 914 int ret; 915 916 lockdep_assert_held(&rdma_nets_rwsem); 917 if (!ib_devices_shared_netns) 918 return 0; 919 920 /* 921 * Create and add compat device in all namespaces other than where it 922 * is currently bound to. 923 */ 924 if (net_eq(read_pnet(&rnet->net), 925 read_pnet(&device->coredev.rdma_net))) 926 return 0; 927 928 /* 929 * The first of init_net() or ib_register_device() to take the 930 * compat_devs_mutex wins and gets to add the device. Others will wait 931 * for completion here. 
932 */ 933 mutex_lock(&device->compat_devs_mutex); 934 cdev = xa_load(&device->compat_devs, rnet->id); 935 if (cdev) { 936 ret = 0; 937 goto done; 938 } 939 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 940 if (ret) 941 goto done; 942 943 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 944 if (!cdev) { 945 ret = -ENOMEM; 946 goto cdev_err; 947 } 948 949 cdev->dev.parent = device->dev.parent; 950 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 951 cdev->dev.release = compatdev_release; 952 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 953 if (ret) 954 goto add_err; 955 956 ret = device_add(&cdev->dev); 957 if (ret) 958 goto add_err; 959 ret = ib_setup_port_attrs(cdev); 960 if (ret) 961 goto port_err; 962 963 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 964 cdev, GFP_KERNEL)); 965 if (ret) 966 goto insert_err; 967 968 mutex_unlock(&device->compat_devs_mutex); 969 return 0; 970 971 insert_err: 972 ib_free_port_attrs(cdev); 973 port_err: 974 device_del(&cdev->dev); 975 add_err: 976 put_device(&cdev->dev); 977 cdev_err: 978 xa_release(&device->compat_devs, rnet->id); 979 done: 980 mutex_unlock(&device->compat_devs_mutex); 981 return ret; 982 } 983 984 static void remove_one_compat_dev(struct ib_device *device, u32 id) 985 { 986 struct ib_core_device *cdev; 987 988 mutex_lock(&device->compat_devs_mutex); 989 cdev = xa_erase(&device->compat_devs, id); 990 mutex_unlock(&device->compat_devs_mutex); 991 if (cdev) { 992 ib_free_port_attrs(cdev); 993 device_del(&cdev->dev); 994 put_device(&cdev->dev); 995 } 996 } 997 998 static void remove_compat_devs(struct ib_device *device) 999 { 1000 struct ib_core_device *cdev; 1001 unsigned long index; 1002 1003 xa_for_each (&device->compat_devs, index, cdev) 1004 remove_one_compat_dev(device, index); 1005 } 1006 1007 static int add_compat_devs(struct ib_device *device) 1008 { 1009 struct rdma_dev_net *rnet; 1010 unsigned long index; 1011 int ret = 0; 1012 1013 lockdep_assert_held(&devices_rwsem); 1014 1015 down_read(&rdma_nets_rwsem); 1016 xa_for_each (&rdma_nets, index, rnet) { 1017 ret = add_one_compat_dev(device, rnet); 1018 if (ret) 1019 break; 1020 } 1021 up_read(&rdma_nets_rwsem); 1022 return ret; 1023 } 1024 1025 static void remove_all_compat_devs(void) 1026 { 1027 struct ib_compat_device *cdev; 1028 struct ib_device *dev; 1029 unsigned long index; 1030 1031 down_read(&devices_rwsem); 1032 xa_for_each (&devices, index, dev) { 1033 unsigned long c_index = 0; 1034 1035 /* Hold nets_rwsem so that any other thread modifying this 1036 * system param can sync with this thread. 1037 */ 1038 down_read(&rdma_nets_rwsem); 1039 xa_for_each (&dev->compat_devs, c_index, cdev) 1040 remove_one_compat_dev(dev, c_index); 1041 up_read(&rdma_nets_rwsem); 1042 } 1043 up_read(&devices_rwsem); 1044 } 1045 1046 static int add_all_compat_devs(void) 1047 { 1048 struct rdma_dev_net *rnet; 1049 struct ib_device *dev; 1050 unsigned long index; 1051 int ret = 0; 1052 1053 down_read(&devices_rwsem); 1054 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1055 unsigned long net_index = 0; 1056 1057 /* Hold nets_rwsem so that any other thread modifying this 1058 * system param can sync with this thread. 
		 */
		down_read(&rdma_nets_rwsem);
		xa_for_each (&rdma_nets, net_index, rnet) {
			ret = add_one_compat_dev(dev, rnet);
			if (ret)
				break;
		}
		up_read(&rdma_nets_rwsem);
	}
	up_read(&devices_rwsem);
	if (ret)
		remove_all_compat_devs();
	return ret;
}

int rdma_compatdev_set(u8 enable)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	int ret = 0;

	down_write(&rdma_nets_rwsem);
	if (ib_devices_shared_netns == enable) {
		up_write(&rdma_nets_rwsem);
		return 0;
	}

	/* enable/disable of compat devices is not supported
	 * when more than the default init_net exists.
	 */
	xa_for_each (&rdma_nets, index, rnet) {
		ret++;
		break;
	}
	if (!ret)
		ib_devices_shared_netns = enable;
	up_write(&rdma_nets_rwsem);
	if (ret)
		return -EBUSY;

	if (enable)
		ret = add_all_compat_devs();
	else
		remove_all_compat_devs();
	return ret;
}

static void rdma_dev_exit_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	struct ib_device *dev;
	unsigned long index;
	int ret;

	down_write(&rdma_nets_rwsem);
	/*
	 * Prevent the ID from being re-used and hide the id from xa_for_each.
	 */
	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
	WARN_ON(ret);
	up_write(&rdma_nets_rwsem);

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		get_device(&dev->dev);
		/*
		 * Release the devices_rwsem so that the potentially blocking
		 * device_del() doesn't hold the devices_rwsem for too long.
		 */
		up_read(&devices_rwsem);

		remove_one_compat_dev(dev, rnet->id);

		/*
		 * If the real device is in the NS then move it back to init.
		 */
		rdma_dev_change_netns(dev, net, &init_net);

		put_device(&dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);

	rdma_nl_net_exit(rnet);
	xa_erase(&rdma_nets, rnet->id);
}

static __net_init int rdma_dev_init_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	unsigned long index;
	struct ib_device *dev;
	int ret;

	write_pnet(&rnet->net, net);

	ret = rdma_nl_net_init(rnet);
	if (ret)
		return ret;

	/* No need to create any compat devices in default init_net. */
	if (net_eq(net, &init_net))
		return 0;

	ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
	if (ret) {
		rdma_nl_net_exit(rnet);
		return ret;
	}

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		/* Hold nets_rwsem so that netlink command cannot change
		 * system configuration for device sharing mode.
		 */
		down_read(&rdma_nets_rwsem);
		ret = add_one_compat_dev(dev, rnet);
		up_read(&rdma_nets_rwsem);
		if (ret)
			break;
	}
	up_read(&devices_rwsem);

	if (ret)
		rdma_dev_exit_net(net);

	return ret;
}

/*
 * Assign the unique string device name and the unique device index. This is
 * undone by ib_dealloc_device.
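 *
 * If the requested name contains a printf-style "%d" (see the
 * ib_register_device() kernel-doc), alloc_name() picks the lowest unused
 * index for that pattern; e.g. a hypothetical "foo_%d" resolves to "foo_0"
 * on the first registration and "foo_1" on the next.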
1191 */ 1192 static int assign_name(struct ib_device *device, const char *name) 1193 { 1194 static u32 last_id; 1195 int ret; 1196 1197 down_write(&devices_rwsem); 1198 /* Assign a unique name to the device */ 1199 if (strchr(name, '%')) 1200 ret = alloc_name(device, name); 1201 else 1202 ret = dev_set_name(&device->dev, name); 1203 if (ret) 1204 goto out; 1205 1206 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1207 ret = -ENFILE; 1208 goto out; 1209 } 1210 strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1211 1212 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1213 &last_id, GFP_KERNEL); 1214 if (ret > 0) 1215 ret = 0; 1216 1217 out: 1218 up_write(&devices_rwsem); 1219 return ret; 1220 } 1221 1222 /* 1223 * setup_device() allocates memory and sets up data that requires calling the 1224 * device ops, this is the only reason these actions are not done during 1225 * ib_alloc_device. It is undone by ib_dealloc_device(). 1226 */ 1227 static int setup_device(struct ib_device *device) 1228 { 1229 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1230 int ret; 1231 1232 ib_device_check_mandatory(device); 1233 1234 ret = setup_port_data(device); 1235 if (ret) { 1236 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1237 return ret; 1238 } 1239 1240 memset(&device->attrs, 0, sizeof(device->attrs)); 1241 ret = device->ops.query_device(device, &device->attrs, &uhw); 1242 if (ret) { 1243 dev_warn(&device->dev, 1244 "Couldn't query the device attributes\n"); 1245 return ret; 1246 } 1247 1248 return 0; 1249 } 1250 1251 static void disable_device(struct ib_device *device) 1252 { 1253 u32 cid; 1254 1255 WARN_ON(!refcount_read(&device->refcount)); 1256 1257 down_write(&devices_rwsem); 1258 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1259 up_write(&devices_rwsem); 1260 1261 /* 1262 * Remove clients in LIFO order, see assign_client_id. This could be 1263 * more efficient if xarray learns to reverse iterate. Since no new 1264 * clients can be added to this ib_device past this point we only need 1265 * the maximum possible client_id value here. 1266 */ 1267 down_read(&clients_rwsem); 1268 cid = highest_client_id; 1269 up_read(&clients_rwsem); 1270 while (cid) { 1271 cid--; 1272 remove_client_context(device, cid); 1273 } 1274 1275 ib_cq_pool_cleanup(device); 1276 1277 /* Pairs with refcount_set in enable_device */ 1278 ib_device_put(device); 1279 wait_for_completion(&device->unreg_completion); 1280 1281 /* 1282 * compat devices must be removed after device refcount drops to zero. 1283 * Otherwise init_net() may add more compatdevs after removing compat 1284 * devices and before device is disabled. 1285 */ 1286 remove_compat_devs(device); 1287 } 1288 1289 /* 1290 * An enabled device is visible to all clients and to all the public facing 1291 * APIs that return a device pointer. This always returns with a new get, even 1292 * if it fails. 1293 */ 1294 static int enable_device_and_get(struct ib_device *device) 1295 { 1296 struct ib_client *client; 1297 unsigned long index; 1298 int ret = 0; 1299 1300 /* 1301 * One ref belongs to the xa and the other belongs to this 1302 * thread. This is needed to guard against parallel unregistration. 1303 */ 1304 refcount_set(&device->refcount, 2); 1305 down_write(&devices_rwsem); 1306 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1307 1308 /* 1309 * By using downgrade_write() we ensure that no other thread can clear 1310 * DEVICE_REGISTERED while we are completing the client setup. 
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);
	if (!ret)
		ret = add_compat_devs(device);
out:
	up_read(&devices_rwsem);
	return ret;
}

static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}

static void ib_device_notify_register(struct ib_device *device)
{
	struct net_device *netdev;
	u32 port;
	int ret;

	ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
	if (ret)
		return;

	rdma_for_each_port(device, port) {
		netdev = ib_device_get_netdev(device, port);
		if (!netdev)
			continue;

		ret = rdma_nl_notify_event(device, port,
					   RDMA_NETDEV_ATTACH_EVENT);
		dev_put(netdev);
		if (ret)
			return;
	}
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
 * @name: unique string device name. This may include a '%' which will
 *	  cause a unique index to be added to the passed device name.
 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
 *	        device will be used. In this case the caller should fully
 *		set up the ibdev for DMA. This usually means using dma_virt_ops.
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core. All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 */
int ib_register_device(struct ib_device *device, const char *name,
		       struct device *dma_device)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	/*
	 * If the caller does not provide a DMA capable device then the IB core
	 * will set up ib_sge and scatterlist structures that stash the kernel
	 * virtual address into the address field.
	 */
	WARN_ON(dma_device && !dma_device->dma_parms);
	device->dma_device = dma_device;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	device->groups[0] = &ib_dev_attr_group;
	device->groups[1] = device->ops.device_group;
	ret = ib_setup_device_attrs(device);
	if (ret)
		goto cache_cleanup;

	ib_device_register_rdmacg(device);

	rdma_counter_init(device);

	/*
	 * Ensure that ADD uevent is not fired because it
	 * is too early and the device is not initialized yet.
	 */
	dev_set_uevent_suppress(&device->dev, true);
	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_setup_port_attrs(&device->coredev);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility of a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * special dealloc_driver pointer, causing the responsibility
		 * to call ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = prevent_dealloc_device;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		dev_set_uevent_suppress(&device->dev, false);
		return ret;
	}
	dev_set_uevent_suppress(&device->dev, false);
	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ib_device_notify_register(device);
	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
cache_cleanup:
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);

/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	struct ib_device *sub, *tmp;

	mutex_lock(&ib_dev->subdev_lock);
	list_for_each_entry_safe_reverse(sub, tmp,
					 &ib_dev->subdev_list_head,
					 subdev_list) {
		list_del(&sub->subdev_list);
		ib_dev->ops.del_sub_dev(sub);
		ib_device_put(ib_dev);
	}
	mutex_unlock(&ib_dev->subdev_lock);

	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced; once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
	 */
	mutex_lock(&ib_dev->unregistration_lock);
	if (!refcount_read(&ib_dev->refcount))
		goto out;

	disable_device(ib_dev);
	rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(ib_dev);

	ib_free_port_attrs(&ib_dev->coredev);
	device_del(&ib_dev->dev);
	ib_device_unregister_rdmacg(ib_dev);
	ib_cache_cleanup_one(ib_dev);

	/*
	 * Drivers using the new flow may not call ib_dealloc_device except
	 * in error unwind prior to registration success.
	 */
	if (ib_dev->ops.dealloc_driver &&
	    ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
		ib_dealloc_device(ib_dev);
	}
out:
	mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	get_device(&ib_dev->dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id; that
 * is the responsibility of the caller.
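 *
 * A sketch of the typical call site in a driver's module exit path, where
 * foo_exit(), foo_unregister_bus_driver() and RDMA_DRIVER_FOO are
 * placeholders for the driver's own teardown and enum rdma_driver_id value:
 *
 *	static void __exit foo_exit(void)
 *	{
 *		foo_unregister_bus_driver();
 *		ib_unregister_driver(RDMA_DRIVER_FOO);
 *	}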
1585 */ 1586 void ib_unregister_driver(enum rdma_driver_id driver_id) 1587 { 1588 struct ib_device *ib_dev; 1589 unsigned long index; 1590 1591 down_read(&devices_rwsem); 1592 xa_for_each (&devices, index, ib_dev) { 1593 if (ib_dev->ops.driver_id != driver_id) 1594 continue; 1595 1596 get_device(&ib_dev->dev); 1597 up_read(&devices_rwsem); 1598 1599 WARN_ON(!ib_dev->ops.dealloc_driver); 1600 __ib_unregister_device(ib_dev); 1601 1602 put_device(&ib_dev->dev); 1603 down_read(&devices_rwsem); 1604 } 1605 up_read(&devices_rwsem); 1606 } 1607 EXPORT_SYMBOL(ib_unregister_driver); 1608 1609 static void ib_unregister_work(struct work_struct *work) 1610 { 1611 struct ib_device *ib_dev = 1612 container_of(work, struct ib_device, unregistration_work); 1613 1614 __ib_unregister_device(ib_dev); 1615 put_device(&ib_dev->dev); 1616 } 1617 1618 /** 1619 * ib_unregister_device_queued - Unregister a device using a work queue 1620 * @ib_dev: The device to unregister 1621 * 1622 * This schedules an asynchronous unregistration using a WQ for the device. A 1623 * driver should use this to avoid holding locks while doing unregistration, 1624 * such as holding the RTNL lock. 1625 * 1626 * Drivers using this API must use ib_unregister_driver before module unload 1627 * to ensure that all scheduled unregistrations have completed. 1628 */ 1629 void ib_unregister_device_queued(struct ib_device *ib_dev) 1630 { 1631 WARN_ON(!refcount_read(&ib_dev->refcount)); 1632 WARN_ON(!ib_dev->ops.dealloc_driver); 1633 get_device(&ib_dev->dev); 1634 if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) 1635 put_device(&ib_dev->dev); 1636 } 1637 EXPORT_SYMBOL(ib_unregister_device_queued); 1638 1639 /* 1640 * The caller must pass in a device that has the kref held and the refcount 1641 * released. If the device is in cur_net and still registered then it is moved 1642 * into net. 1643 */ 1644 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1645 struct net *net) 1646 { 1647 int ret2 = -EINVAL; 1648 int ret; 1649 1650 mutex_lock(&device->unregistration_lock); 1651 1652 /* 1653 * If a device not under ib_device_get() or if the unregistration_lock 1654 * is not held, the namespace can be changed, or it can be unregistered. 1655 * Check again under the lock. 1656 */ 1657 if (refcount_read(&device->refcount) == 0 || 1658 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1659 ret = -ENODEV; 1660 goto out; 1661 } 1662 1663 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1664 disable_device(device); 1665 1666 /* 1667 * At this point no one can be using the device, so it is safe to 1668 * change the namespace. 1669 */ 1670 write_pnet(&device->coredev.rdma_net, net); 1671 1672 down_read(&devices_rwsem); 1673 /* 1674 * Currently rdma devices are system wide unique. So the device name 1675 * is guaranteed free in the new namespace. Publish the new namespace 1676 * at the sysfs level. 1677 */ 1678 ret = device_rename(&device->dev, dev_name(&device->dev)); 1679 up_read(&devices_rwsem); 1680 if (ret) { 1681 dev_warn(&device->dev, 1682 "%s: Couldn't rename device after namespace change\n", 1683 __func__); 1684 /* Try and put things back and re-enable the device */ 1685 write_pnet(&device->coredev.rdma_net, cur_net); 1686 } 1687 1688 ret2 = enable_device_and_get(device); 1689 if (ret2) { 1690 /* 1691 * This shouldn't really happen, but if it does, let the user 1692 * retry at later point. So don't disable the device. 
		 */
		dev_warn(&device->dev,
			 "%s: Couldn't re-enable device after namespace change\n",
			 __func__);
	}
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ib_device_put(device);
out:
	mutex_unlock(&device->unregistration_lock);
	if (ret)
		return ret;
	return ret2;
}

int ib_device_set_netns_put(struct sk_buff *skb,
			    struct ib_device *dev, u32 ns_fd)
{
	struct net *net;
	int ret;

	net = get_net_ns_by_fd(ns_fd);
	if (IS_ERR(net)) {
		ret = PTR_ERR(net);
		goto net_err;
	}

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
		ret = -EPERM;
		goto ns_err;
	}

	/*
	 * All the ib_clients, including uverbs, are reset when the namespace is
	 * changed and this cannot be blocked waiting for userspace to do
	 * something, so disassociation is mandatory.
	 */
	if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) {
		ret = -EOPNOTSUPP;
		goto ns_err;
	}

	get_device(&dev->dev);
	ib_device_put(dev);
	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
	put_device(&dev->dev);

	put_net(net);
	return ret;

ns_err:
	put_net(net);
net_err:
	ib_device_put(dev);
	return ret;
}

static struct pernet_operations rdma_dev_net_ops = {
	.init = rdma_dev_init_net,
	.exit = rdma_dev_exit_net,
	.id = &rdma_dev_net_id,
	.size = sizeof(struct rdma_dev_net),
};

static int assign_client_id(struct ib_client *client)
{
	int ret;

	lockdep_assert_held(&clients_rwsem);
	/*
	 * The add/remove callbacks must be called in FIFO/LIFO order. To
	 * achieve this we assign client_ids so they are sorted in
	 * registration order.
	 */
	client->client_id = highest_client_id;
	ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
	if (ret)
		return ret;

	highest_client_id++;
	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
	return 0;
}

static void remove_client_id(struct ib_client *client)
{
	down_write(&clients_rwsem);
	xa_erase(&clients, client->client_id);
	for (; highest_client_id; highest_client_id--)
		if (xa_load(&clients, highest_client_id - 1))
			break;
	up_write(&clients_rwsem);
}

/**
 * ib_register_client - Register an IB client
 * @client:Client to register
 *
 * Upper level users of the IB drivers can use ib_register_client() to
 * register callbacks for IB device addition and removal. When an IB
 * device is added, each registered client's add method will be called
 * (in the order the clients were registered), and when a device is
 * removed, each client's remove method will be called (in the reverse
 * order that clients were registered). In addition, when
 * ib_register_client() is called, the client will receive an add
 * callback for all devices already registered.
 */
int ib_register_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;
	bool need_unreg = false;
	int ret;

	refcount_set(&client->uses, 1);
	init_completion(&client->uses_zero);

	/*
	 * The devices_rwsem is held in write mode to ensure that a racing
	 * ib_register_device() sees a consistent view of clients and devices.
1813 */ 1814 down_write(&devices_rwsem); 1815 down_write(&clients_rwsem); 1816 ret = assign_client_id(client); 1817 if (ret) 1818 goto out; 1819 1820 need_unreg = true; 1821 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1822 ret = add_client_context(device, client); 1823 if (ret) 1824 goto out; 1825 } 1826 ret = 0; 1827 out: 1828 up_write(&clients_rwsem); 1829 up_write(&devices_rwsem); 1830 if (need_unreg && ret) 1831 ib_unregister_client(client); 1832 return ret; 1833 } 1834 EXPORT_SYMBOL(ib_register_client); 1835 1836 /** 1837 * ib_unregister_client - Unregister an IB client 1838 * @client:Client to unregister 1839 * 1840 * Upper level users use ib_unregister_client() to remove their client 1841 * registration. When ib_unregister_client() is called, the client 1842 * will receive a remove callback for each IB device still registered. 1843 * 1844 * This is a full fence, once it returns no client callbacks will be called, 1845 * or are running in another thread. 1846 */ 1847 void ib_unregister_client(struct ib_client *client) 1848 { 1849 struct ib_device *device; 1850 unsigned long index; 1851 1852 down_write(&clients_rwsem); 1853 ib_client_put(client); 1854 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1855 up_write(&clients_rwsem); 1856 1857 /* We do not want to have locks while calling client->remove() */ 1858 rcu_read_lock(); 1859 xa_for_each (&devices, index, device) { 1860 if (!ib_device_try_get(device)) 1861 continue; 1862 rcu_read_unlock(); 1863 1864 remove_client_context(device, client->client_id); 1865 1866 ib_device_put(device); 1867 rcu_read_lock(); 1868 } 1869 rcu_read_unlock(); 1870 1871 /* 1872 * remove_client_context() is not a fence, it can return even though a 1873 * removal is ongoing. Wait until all removals are completed. 1874 */ 1875 wait_for_completion(&client->uses_zero); 1876 remove_client_id(client); 1877 } 1878 EXPORT_SYMBOL(ib_unregister_client); 1879 1880 static int __ib_get_global_client_nl_info(const char *client_name, 1881 struct ib_client_nl_info *res) 1882 { 1883 struct ib_client *client; 1884 unsigned long index; 1885 int ret = -ENOENT; 1886 1887 down_read(&clients_rwsem); 1888 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1889 if (strcmp(client->name, client_name) != 0) 1890 continue; 1891 if (!client->get_global_nl_info) { 1892 ret = -EOPNOTSUPP; 1893 break; 1894 } 1895 ret = client->get_global_nl_info(res); 1896 if (WARN_ON(ret == -ENOENT)) 1897 ret = -EINVAL; 1898 if (!ret && res->cdev) 1899 get_device(res->cdev); 1900 break; 1901 } 1902 up_read(&clients_rwsem); 1903 return ret; 1904 } 1905 1906 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1907 const char *client_name, 1908 struct ib_client_nl_info *res) 1909 { 1910 unsigned long index; 1911 void *client_data; 1912 int ret = -ENOENT; 1913 1914 down_read(&ibdev->client_data_rwsem); 1915 xan_for_each_marked (&ibdev->client_data, index, client_data, 1916 CLIENT_DATA_REGISTERED) { 1917 struct ib_client *client = xa_load(&clients, index); 1918 1919 if (!client || strcmp(client->name, client_name) != 0) 1920 continue; 1921 if (!client->get_nl_info) { 1922 ret = -EOPNOTSUPP; 1923 break; 1924 } 1925 ret = client->get_nl_info(ibdev, client_data, res); 1926 if (WARN_ON(ret == -ENOENT)) 1927 ret = -EINVAL; 1928 1929 /* 1930 * The cdev is guaranteed valid as long as we are inside the 1931 * client_data_rwsem as remove_one can't be called. Keep it 1932 * valid for the caller. 
1933 */ 1934 if (!ret && res->cdev) 1935 get_device(res->cdev); 1936 break; 1937 } 1938 up_read(&ibdev->client_data_rwsem); 1939 1940 return ret; 1941 } 1942 1943 /** 1944 * ib_get_client_nl_info - Fetch the nl_info from a client 1945 * @ibdev: IB device 1946 * @client_name: Name of the client 1947 * @res: Result of the query 1948 */ 1949 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1950 struct ib_client_nl_info *res) 1951 { 1952 int ret; 1953 1954 if (ibdev) 1955 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1956 else 1957 ret = __ib_get_global_client_nl_info(client_name, res); 1958 #ifdef CONFIG_MODULES 1959 if (ret == -ENOENT) { 1960 request_module("rdma-client-%s", client_name); 1961 if (ibdev) 1962 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1963 else 1964 ret = __ib_get_global_client_nl_info(client_name, res); 1965 } 1966 #endif 1967 if (ret) { 1968 if (ret == -ENOENT) 1969 return -EOPNOTSUPP; 1970 return ret; 1971 } 1972 1973 if (WARN_ON(!res->cdev)) 1974 return -EINVAL; 1975 return 0; 1976 } 1977 1978 /** 1979 * ib_set_client_data - Set IB client context 1980 * @device:Device to set context for 1981 * @client:Client to set context for 1982 * @data:Context to set 1983 * 1984 * ib_set_client_data() sets client context data that can be retrieved with 1985 * ib_get_client_data(). This can only be called while the client is 1986 * registered to the device, once the ib_client remove() callback returns this 1987 * cannot be called. 1988 */ 1989 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1990 void *data) 1991 { 1992 void *rc; 1993 1994 if (WARN_ON(IS_ERR(data))) 1995 data = NULL; 1996 1997 rc = xa_store(&device->client_data, client->client_id, data, 1998 GFP_KERNEL); 1999 WARN_ON(xa_is_err(rc)); 2000 } 2001 EXPORT_SYMBOL(ib_set_client_data); 2002 2003 /** 2004 * ib_register_event_handler - Register an IB event handler 2005 * @event_handler:Handler to register 2006 * 2007 * ib_register_event_handler() registers an event handler that will be 2008 * called back when asynchronous IB events occur (as defined in 2009 * chapter 11 of the InfiniBand Architecture Specification). This 2010 * callback occurs in workqueue context. 2011 */ 2012 void ib_register_event_handler(struct ib_event_handler *event_handler) 2013 { 2014 down_write(&event_handler->device->event_handler_rwsem); 2015 list_add_tail(&event_handler->list, 2016 &event_handler->device->event_handler_list); 2017 up_write(&event_handler->device->event_handler_rwsem); 2018 } 2019 EXPORT_SYMBOL(ib_register_event_handler); 2020 2021 /** 2022 * ib_unregister_event_handler - Unregister an event handler 2023 * @event_handler:Handler to unregister 2024 * 2025 * Unregister an event handler registered with 2026 * ib_register_event_handler(). 
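 *
 * As a rough usage sketch (the names below are illustrative, not taken from
 * an in-tree consumer, and it assumes the INIT_IB_EVENT_HANDLER() helper
 * from <rdma/ib_verbs.h>), a client holding a struct ib_device *device
 * might pair the two calls like this:
 *
 *	static void my_event_cb(struct ib_event_handler *handler,
 *				struct ib_event *event)
 *	{
 *		pr_info("async event %d on %s\n", event->event,
 *			dev_name(&event->device->dev));
 *	}
 *
 *	static struct ib_event_handler my_handler;
 *
 *	INIT_IB_EVENT_HANDLER(&my_handler, device, my_event_cb);
 *	ib_register_event_handler(&my_handler);
 *
 * and later, on teardown:
 *
 *	ib_unregister_event_handler(&my_handler);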
2027 */ 2028 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 2029 { 2030 down_write(&event_handler->device->event_handler_rwsem); 2031 list_del(&event_handler->list); 2032 up_write(&event_handler->device->event_handler_rwsem); 2033 } 2034 EXPORT_SYMBOL(ib_unregister_event_handler); 2035 2036 void ib_dispatch_event_clients(struct ib_event *event) 2037 { 2038 struct ib_event_handler *handler; 2039 2040 down_read(&event->device->event_handler_rwsem); 2041 2042 list_for_each_entry(handler, &event->device->event_handler_list, list) 2043 handler->handler(handler, event); 2044 2045 up_read(&event->device->event_handler_rwsem); 2046 } 2047 2048 static int iw_query_port(struct ib_device *device, 2049 u32 port_num, 2050 struct ib_port_attr *port_attr) 2051 { 2052 struct in_device *inetdev; 2053 struct net_device *netdev; 2054 2055 memset(port_attr, 0, sizeof(*port_attr)); 2056 2057 netdev = ib_device_get_netdev(device, port_num); 2058 if (!netdev) 2059 return -ENODEV; 2060 2061 port_attr->max_mtu = IB_MTU_4096; 2062 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2063 2064 if (!netif_carrier_ok(netdev)) { 2065 port_attr->state = IB_PORT_DOWN; 2066 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2067 } else { 2068 rcu_read_lock(); 2069 inetdev = __in_dev_get_rcu(netdev); 2070 2071 if (inetdev && inetdev->ifa_list) { 2072 port_attr->state = IB_PORT_ACTIVE; 2073 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2074 } else { 2075 port_attr->state = IB_PORT_INIT; 2076 port_attr->phys_state = 2077 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2078 } 2079 2080 rcu_read_unlock(); 2081 } 2082 2083 dev_put(netdev); 2084 return device->ops.query_port(device, port_num, port_attr); 2085 } 2086 2087 static int __ib_query_port(struct ib_device *device, 2088 u32 port_num, 2089 struct ib_port_attr *port_attr) 2090 { 2091 int err; 2092 2093 memset(port_attr, 0, sizeof(*port_attr)); 2094 2095 err = device->ops.query_port(device, port_num, port_attr); 2096 if (err || port_attr->subnet_prefix) 2097 return err; 2098 2099 if (rdma_port_get_link_layer(device, port_num) != 2100 IB_LINK_LAYER_INFINIBAND) 2101 return 0; 2102 2103 ib_get_cached_subnet_prefix(device, port_num, 2104 &port_attr->subnet_prefix); 2105 return 0; 2106 } 2107 2108 /** 2109 * ib_query_port - Query IB port attributes 2110 * @device:Device to query 2111 * @port_num:Port number to query 2112 * @port_attr:Port attributes 2113 * 2114 * ib_query_port() returns the attributes of a port through the 2115 * @port_attr pointer. 
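 *
 * A minimal (illustrative) call site, assuming @device and a valid
 * @port_num are already in hand:
 *
 *	struct ib_port_attr attr;
 *	int err;
 *
 *	err = ib_query_port(device, port_num, &attr);
 *	if (!err && attr.state == IB_PORT_ACTIVE)
 *		pr_debug("port %u is active, MTU enum %d\n", port_num,
 *			 attr.active_mtu);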
2116 */ 2117 int ib_query_port(struct ib_device *device, 2118 u32 port_num, 2119 struct ib_port_attr *port_attr) 2120 { 2121 if (!rdma_is_port_valid(device, port_num)) 2122 return -EINVAL; 2123 2124 if (rdma_protocol_iwarp(device, port_num)) 2125 return iw_query_port(device, port_num, port_attr); 2126 else 2127 return __ib_query_port(device, port_num, port_attr); 2128 } 2129 EXPORT_SYMBOL(ib_query_port); 2130 2131 static void add_ndev_hash(struct ib_port_data *pdata) 2132 { 2133 unsigned long flags; 2134 2135 might_sleep(); 2136 2137 spin_lock_irqsave(&ndev_hash_lock, flags); 2138 if (hash_hashed(&pdata->ndev_hash_link)) { 2139 hash_del_rcu(&pdata->ndev_hash_link); 2140 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2141 /* 2142 * We cannot do hash_add_rcu after a hash_del_rcu until the 2143 * grace period 2144 */ 2145 synchronize_rcu(); 2146 spin_lock_irqsave(&ndev_hash_lock, flags); 2147 } 2148 if (pdata->netdev) 2149 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2150 (uintptr_t)pdata->netdev); 2151 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2152 } 2153 2154 /** 2155 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2156 * @ib_dev: Device to modify 2157 * @ndev: net_device to affiliate, may be NULL 2158 * @port: IB port the net_device is connected to 2159 * 2160 * Drivers should use this to link the ib_device to a netdev so the netdev 2161 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2162 * affiliated with any port. 2163 * 2164 * The caller must ensure that the given ndev is not unregistered or 2165 * unregistering, and that either the ib_device is unregistered or 2166 * ib_device_set_netdev() is called with NULL when the ndev sends a 2167 * NETDEV_UNREGISTER event. 2168 */ 2169 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2170 u32 port) 2171 { 2172 enum rdma_nl_notify_event_type etype; 2173 struct net_device *old_ndev; 2174 struct ib_port_data *pdata; 2175 unsigned long flags; 2176 int ret; 2177 2178 if (!rdma_is_port_valid(ib_dev, port)) 2179 return -EINVAL; 2180 2181 /* 2182 * Drivers wish to call this before ib_register_driver, so we have to 2183 * setup the port data early. 2184 */ 2185 ret = alloc_port_data(ib_dev); 2186 if (ret) 2187 return ret; 2188 2189 pdata = &ib_dev->port_data[port]; 2190 spin_lock_irqsave(&pdata->netdev_lock, flags); 2191 old_ndev = rcu_dereference_protected( 2192 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2193 if (old_ndev == ndev) { 2194 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2195 return 0; 2196 } 2197 2198 rcu_assign_pointer(pdata->netdev, ndev); 2199 netdev_put(old_ndev, &pdata->netdev_tracker); 2200 netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); 2201 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2202 2203 add_ndev_hash(pdata); 2204 2205 /* Make sure that the device is registered before we send events */ 2206 if (xa_load(&devices, ib_dev->index) != ib_dev) 2207 return 0; 2208 2209 etype = ndev ? 
RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; 2210 rdma_nl_notify_event(ib_dev, port, etype); 2211 2212 return 0; 2213 } 2214 EXPORT_SYMBOL(ib_device_set_netdev); 2215 2216 static void free_netdevs(struct ib_device *ib_dev) 2217 { 2218 unsigned long flags; 2219 u32 port; 2220 2221 if (!ib_dev->port_data) 2222 return; 2223 2224 rdma_for_each_port (ib_dev, port) { 2225 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2226 struct net_device *ndev; 2227 2228 spin_lock_irqsave(&pdata->netdev_lock, flags); 2229 ndev = rcu_dereference_protected( 2230 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2231 if (ndev) { 2232 spin_lock(&ndev_hash_lock); 2233 hash_del_rcu(&pdata->ndev_hash_link); 2234 spin_unlock(&ndev_hash_lock); 2235 2236 /* 2237 * If this is the last dev_put there is still a 2238 * synchronize_rcu before the netdev is kfreed, so we 2239 * can continue to rely on unlocked pointer 2240 * comparisons after the put 2241 */ 2242 rcu_assign_pointer(pdata->netdev, NULL); 2243 netdev_put(ndev, &pdata->netdev_tracker); 2244 } 2245 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2246 } 2247 } 2248 2249 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2250 u32 port) 2251 { 2252 struct ib_port_data *pdata; 2253 struct net_device *res; 2254 2255 if (!rdma_is_port_valid(ib_dev, port)) 2256 return NULL; 2257 2258 if (!ib_dev->port_data) 2259 return NULL; 2260 2261 pdata = &ib_dev->port_data[port]; 2262 2263 /* 2264 * New drivers should use ib_device_set_netdev() not the legacy 2265 * get_netdev(). 2266 */ 2267 if (ib_dev->ops.get_netdev) 2268 res = ib_dev->ops.get_netdev(ib_dev, port); 2269 else { 2270 spin_lock(&pdata->netdev_lock); 2271 res = rcu_dereference_protected( 2272 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2273 dev_hold(res); 2274 spin_unlock(&pdata->netdev_lock); 2275 } 2276 2277 return res; 2278 } 2279 EXPORT_SYMBOL(ib_device_get_netdev); 2280 2281 /** 2282 * ib_query_netdev_port - Query the port number of a net_device 2283 * associated with an ibdev 2284 * @ibdev: IB device 2285 * @ndev: Network device 2286 * @port: IB port the net_device is connected to 2287 */ 2288 int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, 2289 u32 *port) 2290 { 2291 struct net_device *ib_ndev; 2292 u32 port_num; 2293 2294 rdma_for_each_port(ibdev, port_num) { 2295 ib_ndev = ib_device_get_netdev(ibdev, port_num); 2296 if (ndev == ib_ndev) { 2297 *port = port_num; 2298 dev_put(ib_ndev); 2299 return 0; 2300 } 2301 dev_put(ib_ndev); 2302 } 2303 2304 return -ENOENT; 2305 } 2306 EXPORT_SYMBOL(ib_query_netdev_port); 2307 2308 /** 2309 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2310 * @ndev: netdev to locate 2311 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2312 * 2313 * Find and hold an ib_device that is associated with a netdev via 2314 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2315 * returned pointer. 
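 *
 * For example (an illustrative sketch; use_the_device() is a stand-in for
 * whatever the caller actually does), a netdev notifier might look the
 * device up and drop the reference when done:
 *
 *	struct ib_device *ibdev;
 *
 *	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *	if (ibdev) {
 *		use_the_device(ibdev);
 *		ib_device_put(ibdev);
 *	}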
2316 */ 2317 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2318 enum rdma_driver_id driver_id) 2319 { 2320 struct ib_device *res = NULL; 2321 struct ib_port_data *cur; 2322 2323 rcu_read_lock(); 2324 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2325 (uintptr_t)ndev) { 2326 if (rcu_access_pointer(cur->netdev) == ndev && 2327 (driver_id == RDMA_DRIVER_UNKNOWN || 2328 cur->ib_dev->ops.driver_id == driver_id) && 2329 ib_device_try_get(cur->ib_dev)) { 2330 res = cur->ib_dev; 2331 break; 2332 } 2333 } 2334 rcu_read_unlock(); 2335 2336 return res; 2337 } 2338 EXPORT_SYMBOL(ib_device_get_by_netdev); 2339
2340 /** 2341 * ib_enum_roce_netdev - enumerate all RoCE ports 2342 * @ib_dev: IB device we want to query 2343 * @filter: Should we call the callback? 2344 * @filter_cookie: Cookie passed to filter 2345 * @cb: Callback to call for each found RoCE port 2346 * @cookie: Cookie passed back to the callback 2347 * 2348 * Enumerates all of the physical RoCE ports of ib_dev that have an 2349 * associated netdevice and calls cb() on each port for which filter() 2350 * returns non-zero. 2351 */ 2352 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2353 roce_netdev_filter filter, 2354 void *filter_cookie, 2355 roce_netdev_callback cb, 2356 void *cookie) 2357 { 2358 u32 port; 2359 2360 rdma_for_each_port (ib_dev, port) 2361 if (rdma_protocol_roce(ib_dev, port)) { 2362 struct net_device *idev = 2363 ib_device_get_netdev(ib_dev, port); 2364 2365 if (filter(ib_dev, port, idev, filter_cookie)) 2366 cb(ib_dev, port, idev, cookie); 2367 dev_put(idev); 2368 } 2369 } 2370
2371 /** 2372 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2373 * @filter: Should we call the callback? 2374 * @filter_cookie: Cookie passed to filter 2375 * @cb: Callback to call for each found RoCE port 2376 * @cookie: Cookie passed back to the callback 2377 * 2378 * Enumerates the physical ports of all RoCE devices that have an 2379 * associated netdevice and calls cb() on each port for which filter() 2380 * returns non-zero. 2381 */ 2382 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2383 void *filter_cookie, 2384 roce_netdev_callback cb, 2385 void *cookie) 2386 { 2387 struct ib_device *dev; 2388 unsigned long index; 2389 2390 down_read(&devices_rwsem); 2391 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2392 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2393 up_read(&devices_rwsem); 2394 } 2395
2396 /* 2397 * ib_enum_all_devs - enumerate all ib_devices 2398 * @cb: Callback to call for each found ib_device 2399 * 2400 * Enumerates all ib_devices and calls callback() on each device. 2401 */ 2402 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2403 struct netlink_callback *cb) 2404 { 2405 unsigned long index; 2406 struct ib_device *dev; 2407 unsigned int idx = 0; 2408 int ret = 0; 2409 2410 down_read(&devices_rwsem); 2411 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2412 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2413 continue; 2414 2415 ret = nldev_cb(dev, skb, cb, idx); 2416 if (ret) 2417 break; 2418 idx++; 2419 } 2420 up_read(&devices_rwsem); 2421 return ret; 2422 } 2423
2424 /** 2425 * ib_query_pkey - Get P_Key table entry 2426 * @device:Device to query 2427 * @port_num:Port number to query 2428 * @index:P_Key table index to query 2429 * @pkey:Returned P_Key 2430 * 2431 * ib_query_pkey() fetches the specified P_Key table entry.
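 *
 * For instance (illustrative only), reading the P_Key in table slot 0 of a
 * port:
 *
 *	u16 pkey;
 *	int ret;
 *
 *	ret = ib_query_pkey(device, port_num, 0, &pkey);
 *	if (!ret)
 *		pr_debug("pkey[0] = 0x%04x\n", pkey);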
2432 */ 2433 int ib_query_pkey(struct ib_device *device, 2434 u32 port_num, u16 index, u16 *pkey) 2435 { 2436 if (!rdma_is_port_valid(device, port_num)) 2437 return -EINVAL; 2438 2439 if (!device->ops.query_pkey) 2440 return -EOPNOTSUPP; 2441 2442 return device->ops.query_pkey(device, port_num, index, pkey); 2443 } 2444 EXPORT_SYMBOL(ib_query_pkey); 2445
2446 /** 2447 * ib_modify_device - Change IB device attributes 2448 * @device:Device to modify 2449 * @device_modify_mask:Mask of attributes to change 2450 * @device_modify:New attribute values 2451 * 2452 * ib_modify_device() changes a device's attributes as specified by 2453 * the @device_modify_mask and @device_modify structure. 2454 */ 2455 int ib_modify_device(struct ib_device *device, 2456 int device_modify_mask, 2457 struct ib_device_modify *device_modify) 2458 { 2459 if (!device->ops.modify_device) 2460 return -EOPNOTSUPP; 2461 2462 return device->ops.modify_device(device, device_modify_mask, 2463 device_modify); 2464 } 2465 EXPORT_SYMBOL(ib_modify_device); 2466
2467 /** 2468 * ib_modify_port - Modifies the attributes for the specified port. 2469 * @device: The device to modify. 2470 * @port_num: The number of the port to modify. 2471 * @port_modify_mask: Mask used to specify which attributes of the port 2472 * to change. 2473 * @port_modify: New attribute values for the port. 2474 * 2475 * ib_modify_port() changes a port's attributes as specified by the 2476 * @port_modify_mask and @port_modify structure. 2477 */ 2478 int ib_modify_port(struct ib_device *device, 2479 u32 port_num, int port_modify_mask, 2480 struct ib_port_modify *port_modify) 2481 { 2482 int rc; 2483 2484 if (!rdma_is_port_valid(device, port_num)) 2485 return -EINVAL; 2486 2487 if (device->ops.modify_port) 2488 rc = device->ops.modify_port(device, port_num, 2489 port_modify_mask, 2490 port_modify); 2491 else if (rdma_protocol_roce(device, port_num) && 2492 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2493 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2494 rc = 0; 2495 else 2496 rc = -EOPNOTSUPP; 2497 return rc; 2498 } 2499 EXPORT_SYMBOL(ib_modify_port); 2500
2501 /** 2502 * ib_find_gid - Returns the port number and GID table index where 2503 * a specified GID value occurs. It searches only ports using the IB link layer. 2504 * @device: The device to query. 2505 * @gid: The GID value to search for. 2506 * @port_num: The port number of the device where the GID value was found. 2507 * @index: The index into the GID table where the GID was found. This 2508 * parameter may be NULL. 2509 */ 2510 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2511 u32 *port_num, u16 *index) 2512 { 2513 union ib_gid tmp_gid; 2514 u32 port; 2515 int ret, i; 2516 2517 rdma_for_each_port (device, port) { 2518 if (!rdma_protocol_ib(device, port)) 2519 continue; 2520 2521 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2522 ++i) { 2523 ret = rdma_query_gid(device, port, i, &tmp_gid); 2524 if (ret) 2525 continue; 2526 2527 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2528 *port_num = port; 2529 if (index) 2530 *index = i; 2531 return 0; 2532 } 2533 } 2534 } 2535 2536 return -ENOENT; 2537 } 2538 EXPORT_SYMBOL(ib_find_gid); 2539
2540 /** 2541 * ib_find_pkey - Returns the PKey table index where a specified 2542 * PKey value occurs. 2543 * @device: The device to query. 2544 * @port_num: The port number of the device to search for the PKey. 2545 * @pkey: The PKey value to search for. 2546 * @index: The index into the PKey table where the PKey was found.
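 *
 * Illustrative use (looking up the 0xffff default P_Key here is an
 * assumption about the caller, not a requirement of this helper):
 *
 *	u16 index;
 *
 *	if (!ib_find_pkey(device, port_num, 0xffff, &index))
 *		pr_debug("default pkey at index %u\n", index);
 *
 * Full-membership entries are preferred; a limited-member match is only
 * returned when no full-member entry with the same low 15 bits exists.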
2547 */ 2548 int ib_find_pkey(struct ib_device *device, 2549 u32 port_num, u16 pkey, u16 *index) 2550 { 2551 int ret, i; 2552 u16 tmp_pkey; 2553 int partial_ix = -1; 2554 2555 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2556 ++i) { 2557 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2558 if (ret) 2559 return ret; 2560 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2561 /* if there is full-member pkey take it.*/ 2562 if (tmp_pkey & 0x8000) { 2563 *index = i; 2564 return 0; 2565 } 2566 if (partial_ix < 0) 2567 partial_ix = i; 2568 } 2569 } 2570 2571 /*no full-member, if exists take the limited*/ 2572 if (partial_ix >= 0) { 2573 *index = partial_ix; 2574 return 0; 2575 } 2576 return -ENOENT; 2577 } 2578 EXPORT_SYMBOL(ib_find_pkey); 2579 2580 /** 2581 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2582 * for a received CM request 2583 * @dev: An RDMA device on which the request has been received. 2584 * @port: Port number on the RDMA device. 2585 * @pkey: The Pkey the request came on. 2586 * @gid: A GID that the net_dev uses to communicate. 2587 * @addr: Contains the IP address that the request specified as its 2588 * destination. 2589 * 2590 */ 2591 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2592 u32 port, 2593 u16 pkey, 2594 const union ib_gid *gid, 2595 const struct sockaddr *addr) 2596 { 2597 struct net_device *net_dev = NULL; 2598 unsigned long index; 2599 void *client_data; 2600 2601 if (!rdma_protocol_ib(dev, port)) 2602 return NULL; 2603 2604 /* 2605 * Holding the read side guarantees that the client will not become 2606 * unregistered while we are calling get_net_dev_by_params() 2607 */ 2608 down_read(&dev->client_data_rwsem); 2609 xan_for_each_marked (&dev->client_data, index, client_data, 2610 CLIENT_DATA_REGISTERED) { 2611 struct ib_client *client = xa_load(&clients, index); 2612 2613 if (!client || !client->get_net_dev_by_params) 2614 continue; 2615 2616 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2617 addr, client_data); 2618 if (net_dev) 2619 break; 2620 } 2621 up_read(&dev->client_data_rwsem); 2622 2623 return net_dev; 2624 } 2625 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2626 2627 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2628 { 2629 struct ib_device_ops *dev_ops = &dev->ops; 2630 #define SET_DEVICE_OP(ptr, name) \ 2631 do { \ 2632 if (ops->name) \ 2633 if (!((ptr)->name)) \ 2634 (ptr)->name = ops->name; \ 2635 } while (0) 2636 2637 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2638 2639 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2640 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2641 dev_ops->driver_id != ops->driver_id); 2642 dev_ops->driver_id = ops->driver_id; 2643 } 2644 if (ops->owner) { 2645 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2646 dev_ops->owner = ops->owner; 2647 } 2648 if (ops->uverbs_abi_ver) 2649 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2650 2651 dev_ops->uverbs_no_driver_id_binding |= 2652 ops->uverbs_no_driver_id_binding; 2653 2654 SET_DEVICE_OP(dev_ops, add_gid); 2655 SET_DEVICE_OP(dev_ops, add_sub_dev); 2656 SET_DEVICE_OP(dev_ops, advise_mr); 2657 SET_DEVICE_OP(dev_ops, alloc_dm); 2658 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2659 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2660 SET_DEVICE_OP(dev_ops, alloc_mr); 2661 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2662 SET_DEVICE_OP(dev_ops, alloc_mw); 2663 SET_DEVICE_OP(dev_ops, alloc_pd); 2664 SET_DEVICE_OP(dev_ops, 
alloc_rdma_netdev); 2665 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2666 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2667 SET_DEVICE_OP(dev_ops, attach_mcast); 2668 SET_DEVICE_OP(dev_ops, check_mr_status); 2669 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2670 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2671 SET_DEVICE_OP(dev_ops, counter_dealloc); 2672 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2673 SET_DEVICE_OP(dev_ops, counter_update_stats); 2674 SET_DEVICE_OP(dev_ops, create_ah); 2675 SET_DEVICE_OP(dev_ops, create_counters); 2676 SET_DEVICE_OP(dev_ops, create_cq); 2677 SET_DEVICE_OP(dev_ops, create_flow); 2678 SET_DEVICE_OP(dev_ops, create_qp); 2679 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2680 SET_DEVICE_OP(dev_ops, create_srq); 2681 SET_DEVICE_OP(dev_ops, create_user_ah); 2682 SET_DEVICE_OP(dev_ops, create_wq); 2683 SET_DEVICE_OP(dev_ops, dealloc_dm); 2684 SET_DEVICE_OP(dev_ops, dealloc_driver); 2685 SET_DEVICE_OP(dev_ops, dealloc_mw); 2686 SET_DEVICE_OP(dev_ops, dealloc_pd); 2687 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2688 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2689 SET_DEVICE_OP(dev_ops, del_gid); 2690 SET_DEVICE_OP(dev_ops, del_sub_dev); 2691 SET_DEVICE_OP(dev_ops, dereg_mr); 2692 SET_DEVICE_OP(dev_ops, destroy_ah); 2693 SET_DEVICE_OP(dev_ops, destroy_counters); 2694 SET_DEVICE_OP(dev_ops, destroy_cq); 2695 SET_DEVICE_OP(dev_ops, destroy_flow); 2696 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2697 SET_DEVICE_OP(dev_ops, destroy_qp); 2698 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2699 SET_DEVICE_OP(dev_ops, destroy_srq); 2700 SET_DEVICE_OP(dev_ops, destroy_wq); 2701 SET_DEVICE_OP(dev_ops, device_group); 2702 SET_DEVICE_OP(dev_ops, detach_mcast); 2703 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2704 SET_DEVICE_OP(dev_ops, drain_rq); 2705 SET_DEVICE_OP(dev_ops, drain_sq); 2706 SET_DEVICE_OP(dev_ops, enable_driver); 2707 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2708 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2709 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2710 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2711 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2712 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2713 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2714 SET_DEVICE_OP(dev_ops, fill_res_srq_entry); 2715 SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); 2716 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2717 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2718 SET_DEVICE_OP(dev_ops, get_dma_mr); 2719 SET_DEVICE_OP(dev_ops, get_hw_stats); 2720 SET_DEVICE_OP(dev_ops, get_link_layer); 2721 SET_DEVICE_OP(dev_ops, get_netdev); 2722 SET_DEVICE_OP(dev_ops, get_numa_node); 2723 SET_DEVICE_OP(dev_ops, get_port_immutable); 2724 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2725 SET_DEVICE_OP(dev_ops, get_vf_config); 2726 SET_DEVICE_OP(dev_ops, get_vf_guid); 2727 SET_DEVICE_OP(dev_ops, get_vf_stats); 2728 SET_DEVICE_OP(dev_ops, iw_accept); 2729 SET_DEVICE_OP(dev_ops, iw_add_ref); 2730 SET_DEVICE_OP(dev_ops, iw_connect); 2731 SET_DEVICE_OP(dev_ops, iw_create_listen); 2732 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2733 SET_DEVICE_OP(dev_ops, iw_get_qp); 2734 SET_DEVICE_OP(dev_ops, iw_reject); 2735 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2736 SET_DEVICE_OP(dev_ops, map_mr_sg); 2737 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2738 SET_DEVICE_OP(dev_ops, mmap); 2739 SET_DEVICE_OP(dev_ops, mmap_free); 2740 SET_DEVICE_OP(dev_ops, modify_ah); 2741 SET_DEVICE_OP(dev_ops, modify_cq); 2742 SET_DEVICE_OP(dev_ops, modify_device); 2743 SET_DEVICE_OP(dev_ops, modify_hw_stat); 2744 SET_DEVICE_OP(dev_ops, modify_port); 
2745 SET_DEVICE_OP(dev_ops, modify_qp); 2746 SET_DEVICE_OP(dev_ops, modify_srq); 2747 SET_DEVICE_OP(dev_ops, modify_wq); 2748 SET_DEVICE_OP(dev_ops, peek_cq); 2749 SET_DEVICE_OP(dev_ops, poll_cq); 2750 SET_DEVICE_OP(dev_ops, port_groups); 2751 SET_DEVICE_OP(dev_ops, post_recv); 2752 SET_DEVICE_OP(dev_ops, post_send); 2753 SET_DEVICE_OP(dev_ops, post_srq_recv); 2754 SET_DEVICE_OP(dev_ops, process_mad); 2755 SET_DEVICE_OP(dev_ops, query_ah); 2756 SET_DEVICE_OP(dev_ops, query_device); 2757 SET_DEVICE_OP(dev_ops, query_gid); 2758 SET_DEVICE_OP(dev_ops, query_pkey); 2759 SET_DEVICE_OP(dev_ops, query_port); 2760 SET_DEVICE_OP(dev_ops, query_qp); 2761 SET_DEVICE_OP(dev_ops, query_srq); 2762 SET_DEVICE_OP(dev_ops, query_ucontext); 2763 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2764 SET_DEVICE_OP(dev_ops, read_counters); 2765 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2766 SET_DEVICE_OP(dev_ops, reg_user_mr); 2767 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2768 SET_DEVICE_OP(dev_ops, req_notify_cq); 2769 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2770 SET_DEVICE_OP(dev_ops, resize_cq); 2771 SET_DEVICE_OP(dev_ops, set_vf_guid); 2772 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2773 SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); 2774 SET_DEVICE_OP(dev_ops, report_port_event); 2775 2776 SET_OBJ_SIZE(dev_ops, ib_ah); 2777 SET_OBJ_SIZE(dev_ops, ib_counters); 2778 SET_OBJ_SIZE(dev_ops, ib_cq); 2779 SET_OBJ_SIZE(dev_ops, ib_mw); 2780 SET_OBJ_SIZE(dev_ops, ib_pd); 2781 SET_OBJ_SIZE(dev_ops, ib_qp); 2782 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2783 SET_OBJ_SIZE(dev_ops, ib_srq); 2784 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2785 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2786 } 2787 EXPORT_SYMBOL(ib_set_device_ops); 2788 2789 int ib_add_sub_device(struct ib_device *parent, 2790 enum rdma_nl_dev_type type, 2791 const char *name) 2792 { 2793 struct ib_device *sub; 2794 int ret = 0; 2795 2796 if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev) 2797 return -EOPNOTSUPP; 2798 2799 if (!ib_device_try_get(parent)) 2800 return -EINVAL; 2801 2802 sub = parent->ops.add_sub_dev(parent, type, name); 2803 if (IS_ERR(sub)) { 2804 ib_device_put(parent); 2805 return PTR_ERR(sub); 2806 } 2807 2808 sub->type = type; 2809 sub->parent = parent; 2810 2811 mutex_lock(&parent->subdev_lock); 2812 list_add_tail(&parent->subdev_list_head, &sub->subdev_list); 2813 mutex_unlock(&parent->subdev_lock); 2814 2815 return ret; 2816 } 2817 EXPORT_SYMBOL(ib_add_sub_device); 2818 2819 int ib_del_sub_device_and_put(struct ib_device *sub) 2820 { 2821 struct ib_device *parent = sub->parent; 2822 2823 if (!parent) 2824 return -EOPNOTSUPP; 2825 2826 mutex_lock(&parent->subdev_lock); 2827 list_del(&sub->subdev_list); 2828 mutex_unlock(&parent->subdev_lock); 2829 2830 ib_device_put(sub); 2831 parent->ops.del_sub_dev(sub); 2832 ib_device_put(parent); 2833 2834 return 0; 2835 } 2836 EXPORT_SYMBOL(ib_del_sub_device_and_put); 2837 2838 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2839 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2840 { 2841 struct scatterlist *s; 2842 int i; 2843 2844 for_each_sg(sg, s, nents, i) { 2845 sg_dma_address(s) = (uintptr_t)sg_virt(s); 2846 sg_dma_len(s) = s->length; 2847 } 2848 return nents; 2849 } 2850 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2851 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2852 2853 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2854 [RDMA_NL_LS_OP_RESOLVE] = { 2855 .doit = ib_nl_handle_resolve_resp, 2856 .flags = RDMA_NL_ADMIN_PERM, 2857 }, 2858 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2859 .doit 
= ib_nl_handle_set_timeout, 2860 .flags = RDMA_NL_ADMIN_PERM, 2861 }, 2862 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2863 .doit = ib_nl_handle_ip_res_resp, 2864 .flags = RDMA_NL_ADMIN_PERM, 2865 }, 2866 }; 2867 2868 void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) 2869 { 2870 enum ib_port_state curr_state; 2871 struct ib_event ibevent = {}; 2872 u32 port; 2873 2874 if (ib_query_netdev_port(ibdev, ndev, &port)) 2875 return; 2876 2877 curr_state = ib_get_curr_port_state(ndev); 2878 2879 write_lock_irq(&ibdev->cache_lock); 2880 if (ibdev->port_data[port].cache.last_port_state == curr_state) { 2881 write_unlock_irq(&ibdev->cache_lock); 2882 return; 2883 } 2884 ibdev->port_data[port].cache.last_port_state = curr_state; 2885 write_unlock_irq(&ibdev->cache_lock); 2886 2887 ibevent.event = (curr_state == IB_PORT_DOWN) ? 2888 IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; 2889 ibevent.device = ibdev; 2890 ibevent.element.port_num = port; 2891 ib_dispatch_event(&ibevent); 2892 } 2893 EXPORT_SYMBOL(ib_dispatch_port_state_event); 2894 2895 static void handle_port_event(struct net_device *ndev, unsigned long event) 2896 { 2897 struct ib_device *ibdev; 2898 2899 /* Currently, link events in bonding scenarios are still 2900 * reported by drivers that support bonding. 2901 */ 2902 if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) 2903 return; 2904 2905 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2906 if (!ibdev) 2907 return; 2908 2909 if (ibdev->ops.report_port_event) { 2910 ibdev->ops.report_port_event(ibdev, ndev, event); 2911 goto put_ibdev; 2912 } 2913 2914 ib_dispatch_port_state_event(ibdev, ndev); 2915 2916 put_ibdev: 2917 ib_device_put(ibdev); 2918 }; 2919 2920 static int ib_netdevice_event(struct notifier_block *this, 2921 unsigned long event, void *ptr) 2922 { 2923 struct net_device *ndev = netdev_notifier_info_to_dev(ptr); 2924 struct ib_device *ibdev; 2925 u32 port; 2926 2927 switch (event) { 2928 case NETDEV_CHANGENAME: 2929 ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); 2930 if (!ibdev) 2931 return NOTIFY_DONE; 2932 2933 if (ib_query_netdev_port(ibdev, ndev, &port)) { 2934 ib_device_put(ibdev); 2935 break; 2936 } 2937 2938 rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); 2939 ib_device_put(ibdev); 2940 break; 2941 2942 case NETDEV_UP: 2943 case NETDEV_CHANGE: 2944 case NETDEV_DOWN: 2945 handle_port_event(ndev, event); 2946 break; 2947 2948 default: 2949 break; 2950 } 2951 2952 return NOTIFY_DONE; 2953 } 2954 2955 static struct notifier_block nb_netdevice = { 2956 .notifier_call = ib_netdevice_event, 2957 }; 2958 2959 static int __init ib_core_init(void) 2960 { 2961 int ret = -ENOMEM; 2962 2963 ib_wq = alloc_workqueue("infiniband", 0, 0); 2964 if (!ib_wq) 2965 return -ENOMEM; 2966 2967 ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, 2968 WQ_UNBOUND_MAX_ACTIVE); 2969 if (!ib_unreg_wq) 2970 goto err; 2971 2972 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2973 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2974 if (!ib_comp_wq) 2975 goto err_unbound; 2976 2977 ib_comp_unbound_wq = 2978 alloc_workqueue("ib-comp-unb-wq", 2979 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2980 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2981 if (!ib_comp_unbound_wq) 2982 goto err_comp; 2983 2984 ret = class_register(&ib_class); 2985 if (ret) { 2986 pr_warn("Couldn't create InfiniBand device class\n"); 2987 goto err_comp_unbound; 2988 } 2989 2990 rdma_nl_init(); 2991 2992 ret = addr_init(); 2993 if (ret) { 2994 pr_warn("Couldn't init IB address 
resolution\n"); 2995 goto err_ibnl; 2996 } 2997 2998 ret = ib_mad_init(); 2999 if (ret) { 3000 pr_warn("Couldn't init IB MAD\n"); 3001 goto err_addr; 3002 } 3003 3004 ret = ib_sa_init(); 3005 if (ret) { 3006 pr_warn("Couldn't init SA\n"); 3007 goto err_mad; 3008 } 3009 3010 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 3011 if (ret) { 3012 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 3013 goto err_sa; 3014 } 3015 3016 ret = register_pernet_device(&rdma_dev_net_ops); 3017 if (ret) { 3018 pr_warn("Couldn't init compat dev. ret %d\n", ret); 3019 goto err_compat; 3020 } 3021 3022 nldev_init(); 3023 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 3024 ret = roce_gid_mgmt_init(); 3025 if (ret) { 3026 pr_warn("Couldn't init RoCE GID management\n"); 3027 goto err_parent; 3028 } 3029 3030 register_netdevice_notifier(&nb_netdevice); 3031 3032 return 0; 3033 3034 err_parent: 3035 rdma_nl_unregister(RDMA_NL_LS); 3036 nldev_exit(); 3037 unregister_pernet_device(&rdma_dev_net_ops); 3038 err_compat: 3039 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3040 err_sa: 3041 ib_sa_cleanup(); 3042 err_mad: 3043 ib_mad_cleanup(); 3044 err_addr: 3045 addr_cleanup(); 3046 err_ibnl: 3047 class_unregister(&ib_class); 3048 err_comp_unbound: 3049 destroy_workqueue(ib_comp_unbound_wq); 3050 err_comp: 3051 destroy_workqueue(ib_comp_wq); 3052 err_unbound: 3053 destroy_workqueue(ib_unreg_wq); 3054 err: 3055 destroy_workqueue(ib_wq); 3056 return ret; 3057 } 3058 3059 static void __exit ib_core_cleanup(void) 3060 { 3061 unregister_netdevice_notifier(&nb_netdevice); 3062 roce_gid_mgmt_cleanup(); 3063 rdma_nl_unregister(RDMA_NL_LS); 3064 nldev_exit(); 3065 unregister_pernet_device(&rdma_dev_net_ops); 3066 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 3067 ib_sa_cleanup(); 3068 ib_mad_cleanup(); 3069 addr_cleanup(); 3070 rdma_nl_exit(); 3071 class_unregister(&ib_class); 3072 destroy_workqueue(ib_comp_unbound_wq); 3073 destroy_workqueue(ib_comp_wq); 3074 /* Make sure that any pending umem accounting work is done. */ 3075 destroy_workqueue(ib_wq); 3076 destroy_workqueue(ib_unreg_wq); 3077 WARN_ON(!xa_empty(&clients)); 3078 WARN_ON(!xa_empty(&devices)); 3079 } 3080 3081 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 3082 3083 /* ib core relies on netdev stack to first register net_ns_type_operations 3084 * ns kobject type before ib_core initialization. 3085 */ 3086 fs_initcall(ib_core_init); 3087 module_exit(ib_core_cleanup); 3088