1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 
32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <rdma/rdma_netlink.h> 46 #include <rdma/ib_addr.h> 47 #include <rdma/ib_cache.h> 48 #include <rdma/rdma_counter.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 static struct workqueue_struct *ib_unreg_wq; 62 63 /* 64 * Each of the three rwsem locks (devices, clients, client_data) protects the 65 * xarray of the same name. Specifically it allows the caller to assert that 66 * the MARK will/will not be changing under the lock, and for devices and 67 * clients, that the value in the xarray is still a valid pointer. Change of 68 * the MARK is linked to the object state, so holding the lock and testing the 69 * MARK also asserts that the contained object is in a certain state. 70 * 71 * This is used to build a two stage register/unregister flow where objects 72 * can continue to be in the xarray even though they are still in progress to 73 * register/unregister. 74 * 75 * The xarray itself provides additional locking, and restartable iteration, 76 * which is also relied on. 77 * 78 * Locks should not be nested, with the exception of client_data, which is 79 * allowed to nest under the read side of the other two locks. 80 * 81 * The devices_rwsem also protects the device name list, any change or 82 * assignment of device name must also hold the write side to guarantee unique 83 * names. 
84 */ 85 86 /* 87 * devices contains devices that have had their names assigned. The 88 * devices may not be registered. Users that care about the registration 89 * status need to call ib_device_try_get() on the device to ensure it is 90 * registered, and keep it registered, for the required duration. 91 * 92 */ 93 static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC); 94 static DECLARE_RWSEM(devices_rwsem); 95 #define DEVICE_REGISTERED XA_MARK_1 96 97 static u32 highest_client_id; 98 #define CLIENT_REGISTERED XA_MARK_1 99 static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC); 100 static DECLARE_RWSEM(clients_rwsem); 101 102 static void ib_client_put(struct ib_client *client) 103 { 104 if (refcount_dec_and_test(&client->uses)) 105 complete(&client->uses_zero); 106 } 107 108 /* 109 * If client_data is registered then the corresponding client must also still 110 * be registered. 111 */ 112 #define CLIENT_DATA_REGISTERED XA_MARK_1 113 114 unsigned int rdma_dev_net_id; 115 116 /* 117 * A list of net namespaces is maintained in an xarray. This is necessary 118 * because we can't get the locking right using the existing net ns list. We 119 * would require a init_net callback after the list is updated. 120 */ 121 static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); 122 /* 123 * rwsem to protect accessing the rdma_nets xarray entries. 124 */ 125 static DECLARE_RWSEM(rdma_nets_rwsem); 126 127 bool ib_devices_shared_netns = true; 128 module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); 129 MODULE_PARM_DESC(netns_mode, 130 "Share device among net namespaces; default=1 (shared)"); 131 /** 132 * rdma_dev_access_netns() - Return whether an rdma device can be accessed 133 * from a specified net namespace or not. 134 * @dev: Pointer to rdma device which needs to be checked 135 * @net: Pointer to net namesapce for which access to be checked 136 * 137 * When the rdma device is in shared mode, it ignores the net namespace. 
138 * When the rdma device is exclusive to a net namespace, rdma device net 139 * namespace is checked against the specified one. 140 */ 141 bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) 142 { 143 return (ib_devices_shared_netns || 144 net_eq(read_pnet(&dev->coredev.rdma_net), net)); 145 } 146 EXPORT_SYMBOL(rdma_dev_access_netns); 147 148 /* 149 * xarray has this behavior where it won't iterate over NULL values stored in 150 * allocated arrays. So we need our own iterator to see all values stored in 151 * the array. This does the same thing as xa_for_each except that it also 152 * returns NULL valued entries if the array is allocating. Simplified to only 153 * work on simple xarrays. 154 */ 155 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 156 xa_mark_t filter) 157 { 158 XA_STATE(xas, xa, *indexp); 159 void *entry; 160 161 rcu_read_lock(); 162 do { 163 entry = xas_find_marked(&xas, ULONG_MAX, filter); 164 if (xa_is_zero(entry)) 165 break; 166 } while (xas_retry(&xas, entry)); 167 rcu_read_unlock(); 168 169 if (entry) { 170 *indexp = xas.xa_index; 171 if (xa_is_zero(entry)) 172 return NULL; 173 return entry; 174 } 175 return XA_ERROR(-ENOENT); 176 } 177 #define xan_for_each_marked(xa, index, entry, filter) \ 178 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 179 !xa_is_err(entry); \ 180 (index)++, entry = xan_find_marked(xa, &(index), filter)) 181 182 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 183 static DEFINE_SPINLOCK(ndev_hash_lock); 184 static DECLARE_HASHTABLE(ndev_hash, 5); 185 186 static void free_netdevs(struct ib_device *ib_dev); 187 static void ib_unregister_work(struct work_struct *work); 188 static void __ib_unregister_device(struct ib_device *device); 189 static int ib_security_change(struct notifier_block *nb, unsigned long event, 190 void *lsm_data); 191 static void ib_policy_change_task(struct work_struct *work); 192 static 
DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 193 194 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 195 struct va_format *vaf) 196 { 197 if (ibdev && ibdev->dev.parent) 198 dev_printk_emit(level[1] - '0', 199 ibdev->dev.parent, 200 "%s %s %s: %pV", 201 dev_driver_string(ibdev->dev.parent), 202 dev_name(ibdev->dev.parent), 203 dev_name(&ibdev->dev), 204 vaf); 205 else if (ibdev) 206 printk("%s%s: %pV", 207 level, dev_name(&ibdev->dev), vaf); 208 else 209 printk("%s(NULL ib_device): %pV", level, vaf); 210 } 211 212 void ibdev_printk(const char *level, const struct ib_device *ibdev, 213 const char *format, ...) 214 { 215 struct va_format vaf; 216 va_list args; 217 218 va_start(args, format); 219 220 vaf.fmt = format; 221 vaf.va = &args; 222 223 __ibdev_printk(level, ibdev, &vaf); 224 225 va_end(args); 226 } 227 EXPORT_SYMBOL(ibdev_printk); 228 229 #define define_ibdev_printk_level(func, level) \ 230 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 231 { \ 232 struct va_format vaf; \ 233 va_list args; \ 234 \ 235 va_start(args, fmt); \ 236 \ 237 vaf.fmt = fmt; \ 238 vaf.va = &args; \ 239 \ 240 __ibdev_printk(level, ibdev, &vaf); \ 241 \ 242 va_end(args); \ 243 } \ 244 EXPORT_SYMBOL(func); 245 246 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 247 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 248 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 249 define_ibdev_printk_level(ibdev_err, KERN_ERR); 250 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 251 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 252 define_ibdev_printk_level(ibdev_info, KERN_INFO); 253 254 static struct notifier_block ibdev_lsm_nb = { 255 .notifier_call = ib_security_change, 256 }; 257 258 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 259 struct net *net); 260 261 /* Pointer to the RCU head at the start of the ib_port_data array */ 262 struct ib_port_data_rcu { 263 struct rcu_head rcu_head; 264 struct ib_port_data pdata[]; 265 }; 266 267 static void ib_device_check_mandatory(struct ib_device *device) 268 { 269 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 270 static const struct { 271 size_t offset; 272 char *name; 273 } mandatory_table[] = { 274 IB_MANDATORY_FUNC(query_device), 275 IB_MANDATORY_FUNC(query_port), 276 IB_MANDATORY_FUNC(alloc_pd), 277 IB_MANDATORY_FUNC(dealloc_pd), 278 IB_MANDATORY_FUNC(create_qp), 279 IB_MANDATORY_FUNC(modify_qp), 280 IB_MANDATORY_FUNC(destroy_qp), 281 IB_MANDATORY_FUNC(post_send), 282 IB_MANDATORY_FUNC(post_recv), 283 IB_MANDATORY_FUNC(create_cq), 284 IB_MANDATORY_FUNC(destroy_cq), 285 IB_MANDATORY_FUNC(poll_cq), 286 IB_MANDATORY_FUNC(req_notify_cq), 287 IB_MANDATORY_FUNC(get_dma_mr), 288 IB_MANDATORY_FUNC(reg_user_mr), 289 IB_MANDATORY_FUNC(dereg_mr), 290 IB_MANDATORY_FUNC(get_port_immutable) 291 }; 292 int i; 293 294 device->kverbs_provider = true; 295 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 296 if (!*(void **) 
((void *) &device->ops + 297 mandatory_table[i].offset)) { 298 device->kverbs_provider = false; 299 break; 300 } 301 } 302 } 303 304 /* 305 * Caller must perform ib_device_put() to return the device reference count 306 * when ib_device_get_by_index() returns valid device pointer. 307 */ 308 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 309 { 310 struct ib_device *device; 311 312 down_read(&devices_rwsem); 313 device = xa_load(&devices, index); 314 if (device) { 315 if (!rdma_dev_access_netns(device, net)) { 316 device = NULL; 317 goto out; 318 } 319 320 if (!ib_device_try_get(device)) 321 device = NULL; 322 } 323 out: 324 up_read(&devices_rwsem); 325 return device; 326 } 327 328 /** 329 * ib_device_put - Release IB device reference 330 * @device: device whose reference to be released 331 * 332 * ib_device_put() releases reference to the IB device to allow it to be 333 * unregistered and eventually free. 334 */ 335 void ib_device_put(struct ib_device *device) 336 { 337 if (refcount_dec_and_test(&device->refcount)) 338 complete(&device->unreg_completion); 339 } 340 EXPORT_SYMBOL(ib_device_put); 341 342 static struct ib_device *__ib_device_get_by_name(const char *name) 343 { 344 struct ib_device *device; 345 unsigned long index; 346 347 xa_for_each (&devices, index, device) 348 if (!strcmp(name, dev_name(&device->dev))) 349 return device; 350 351 return NULL; 352 } 353 354 /** 355 * ib_device_get_by_name - Find an IB device by name 356 * @name: The name to look for 357 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 358 * 359 * Find and hold an ib_device by its name. The caller must call 360 * ib_device_put() on the returned pointer. 
361 */ 362 struct ib_device *ib_device_get_by_name(const char *name, 363 enum rdma_driver_id driver_id) 364 { 365 struct ib_device *device; 366 367 down_read(&devices_rwsem); 368 device = __ib_device_get_by_name(name); 369 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 370 device->ops.driver_id != driver_id) 371 device = NULL; 372 373 if (device) { 374 if (!ib_device_try_get(device)) 375 device = NULL; 376 } 377 up_read(&devices_rwsem); 378 return device; 379 } 380 EXPORT_SYMBOL(ib_device_get_by_name); 381 382 static int rename_compat_devs(struct ib_device *device) 383 { 384 struct ib_core_device *cdev; 385 unsigned long index; 386 int ret = 0; 387 388 mutex_lock(&device->compat_devs_mutex); 389 xa_for_each (&device->compat_devs, index, cdev) { 390 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 391 if (ret) { 392 dev_warn(&cdev->dev, 393 "Fail to rename compatdev to new name %s\n", 394 dev_name(&device->dev)); 395 break; 396 } 397 } 398 mutex_unlock(&device->compat_devs_mutex); 399 return ret; 400 } 401 402 int ib_device_rename(struct ib_device *ibdev, const char *name) 403 { 404 unsigned long index; 405 void *client_data; 406 int ret; 407 408 down_write(&devices_rwsem); 409 if (!strcmp(name, dev_name(&ibdev->dev))) { 410 up_write(&devices_rwsem); 411 return 0; 412 } 413 414 if (__ib_device_get_by_name(name)) { 415 up_write(&devices_rwsem); 416 return -EEXIST; 417 } 418 419 ret = device_rename(&ibdev->dev, name); 420 if (ret) { 421 up_write(&devices_rwsem); 422 return ret; 423 } 424 425 strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 426 ret = rename_compat_devs(ibdev); 427 428 downgrade_write(&devices_rwsem); 429 down_read(&ibdev->client_data_rwsem); 430 xan_for_each_marked(&ibdev->client_data, index, client_data, 431 CLIENT_DATA_REGISTERED) { 432 struct ib_client *client = xa_load(&clients, index); 433 434 if (!client || !client->rename) 435 continue; 436 437 client->rename(ibdev, client_data); 438 } 439 up_read(&ibdev->client_data_rwsem); 440 
up_read(&devices_rwsem); 441 return 0; 442 } 443 444 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 445 { 446 if (use_dim > 1) 447 return -EINVAL; 448 ibdev->use_cq_dim = use_dim; 449 450 return 0; 451 } 452 453 static int alloc_name(struct ib_device *ibdev, const char *name) 454 { 455 struct ib_device *device; 456 unsigned long index; 457 struct ida inuse; 458 int rc; 459 int i; 460 461 lockdep_assert_held_write(&devices_rwsem); 462 ida_init(&inuse); 463 xa_for_each (&devices, index, device) { 464 char buf[IB_DEVICE_NAME_MAX]; 465 466 if (sscanf(dev_name(&device->dev), name, &i) != 1) 467 continue; 468 if (i < 0 || i >= INT_MAX) 469 continue; 470 snprintf(buf, sizeof buf, name, i); 471 if (strcmp(buf, dev_name(&device->dev)) != 0) 472 continue; 473 474 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); 475 if (rc < 0) 476 goto out; 477 } 478 479 rc = ida_alloc(&inuse, GFP_KERNEL); 480 if (rc < 0) 481 goto out; 482 483 rc = dev_set_name(&ibdev->dev, name, rc); 484 out: 485 ida_destroy(&inuse); 486 return rc; 487 } 488 489 static void ib_device_release(struct device *device) 490 { 491 struct ib_device *dev = container_of(device, struct ib_device, dev); 492 493 free_netdevs(dev); 494 WARN_ON(refcount_read(&dev->refcount)); 495 if (dev->hw_stats_data) 496 ib_device_release_hw_stats(dev->hw_stats_data); 497 if (dev->port_data) { 498 ib_cache_release_one(dev); 499 ib_security_release_port_pkey_list(dev); 500 rdma_counter_release(dev); 501 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 502 pdata[0]), 503 rcu_head); 504 } 505 506 mutex_destroy(&dev->unregistration_lock); 507 mutex_destroy(&dev->compat_devs_mutex); 508 509 xa_destroy(&dev->compat_devs); 510 xa_destroy(&dev->client_data); 511 kfree_rcu(dev, rcu_head); 512 } 513 514 static int ib_device_uevent(const struct device *device, 515 struct kobj_uevent_env *env) 516 { 517 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 518 return -ENOMEM; 519 520 /* 521 * It would be nice to pass the 
node GUID with the event... 522 */ 523 524 return 0; 525 } 526 527 static const void *net_namespace(const struct device *d) 528 { 529 const struct ib_core_device *coredev = 530 container_of(d, struct ib_core_device, dev); 531 532 return read_pnet(&coredev->rdma_net); 533 } 534 535 static struct class ib_class = { 536 .name = "infiniband", 537 .dev_release = ib_device_release, 538 .dev_uevent = ib_device_uevent, 539 .ns_type = &net_ns_type_operations, 540 .namespace = net_namespace, 541 }; 542 543 static void rdma_init_coredev(struct ib_core_device *coredev, 544 struct ib_device *dev, struct net *net) 545 { 546 /* This BUILD_BUG_ON is intended to catch layout change 547 * of union of ib_core_device and device. 548 * dev must be the first element as ib_core and providers 549 * driver uses it. Adding anything in ib_core_device before 550 * device will break this assumption. 551 */ 552 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 553 offsetof(struct ib_device, dev)); 554 555 coredev->dev.class = &ib_class; 556 coredev->dev.groups = dev->groups; 557 device_initialize(&coredev->dev); 558 coredev->owner = dev; 559 INIT_LIST_HEAD(&coredev->port_list); 560 write_pnet(&coredev->rdma_net, net); 561 } 562 563 /** 564 * _ib_alloc_device - allocate an IB device struct 565 * @size:size of structure to allocate 566 * 567 * Low-level drivers should use ib_alloc_device() to allocate &struct 568 * ib_device. @size is the size of the structure to be allocated, 569 * including any private data used by the low-level driver. 570 * ib_dealloc_device() must be used to free structures allocated with 571 * ib_alloc_device(). 
572 */ 573 struct ib_device *_ib_alloc_device(size_t size) 574 { 575 struct ib_device *device; 576 unsigned int i; 577 578 if (WARN_ON(size < sizeof(struct ib_device))) 579 return NULL; 580 581 device = kzalloc(size, GFP_KERNEL); 582 if (!device) 583 return NULL; 584 585 if (rdma_restrack_init(device)) { 586 kfree(device); 587 return NULL; 588 } 589 590 rdma_init_coredev(&device->coredev, device, &init_net); 591 592 INIT_LIST_HEAD(&device->event_handler_list); 593 spin_lock_init(&device->qp_open_list_lock); 594 init_rwsem(&device->event_handler_rwsem); 595 mutex_init(&device->unregistration_lock); 596 /* 597 * client_data needs to be alloc because we don't want our mark to be 598 * destroyed if the user stores NULL in the client data. 599 */ 600 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 601 init_rwsem(&device->client_data_rwsem); 602 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 603 mutex_init(&device->compat_devs_mutex); 604 init_completion(&device->unreg_completion); 605 INIT_WORK(&device->unregistration_work, ib_unregister_work); 606 607 spin_lock_init(&device->cq_pools_lock); 608 for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++) 609 INIT_LIST_HEAD(&device->cq_pools[i]); 610 611 rwlock_init(&device->cache_lock); 612 613 device->uverbs_cmd_mask = 614 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) | 615 BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) | 616 BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) | 617 BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) | 618 BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) | 619 BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | 620 BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) | 621 BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) | 622 BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) | 623 BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) | 624 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) | 625 BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) | 626 BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) | 627 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) | 628 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) | 629 BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) | 630 
BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) | 631 BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) | 632 BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | 633 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) | 634 BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) | 635 BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) | 636 BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) | 637 BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) | 638 BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) | 639 BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) | 640 BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) | 641 BIT_ULL(IB_USER_VERBS_CMD_REG_MR) | 642 BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) | 643 BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ); 644 return device; 645 } 646 EXPORT_SYMBOL(_ib_alloc_device); 647 648 /** 649 * ib_dealloc_device - free an IB device struct 650 * @device:structure to free 651 * 652 * Free a structure allocated with ib_alloc_device(). 653 */ 654 void ib_dealloc_device(struct ib_device *device) 655 { 656 if (device->ops.dealloc_driver) 657 device->ops.dealloc_driver(device); 658 659 /* 660 * ib_unregister_driver() requires all devices to remain in the xarray 661 * while their ops are callable. The last op we call is dealloc_driver 662 * above. This is needed to create a fence on op callbacks prior to 663 * allowing the driver module to unload. 664 */ 665 down_write(&devices_rwsem); 666 if (xa_load(&devices, device->index) == device) 667 xa_erase(&devices, device->index); 668 up_write(&devices_rwsem); 669 670 /* Expedite releasing netdev references */ 671 free_netdevs(device); 672 673 WARN_ON(!xa_empty(&device->compat_devs)); 674 WARN_ON(!xa_empty(&device->client_data)); 675 WARN_ON(refcount_read(&device->refcount)); 676 rdma_restrack_clean(device); 677 /* Balances with device_initialize */ 678 put_device(&device->dev); 679 } 680 EXPORT_SYMBOL(ib_dealloc_device); 681 682 /* 683 * add_client_context() and remove_client_context() must be safe against 684 * parallel calls on the same device - registration/unregistration of both the 685 * device and client can be occurring in parallel. 
686 * 687 * The routines need to be a fence, any caller must not return until the add 688 * or remove is fully completed. 689 */ 690 static int add_client_context(struct ib_device *device, 691 struct ib_client *client) 692 { 693 int ret = 0; 694 695 if (!device->kverbs_provider && !client->no_kverbs_req) 696 return 0; 697 698 down_write(&device->client_data_rwsem); 699 /* 700 * So long as the client is registered hold both the client and device 701 * unregistration locks. 702 */ 703 if (!refcount_inc_not_zero(&client->uses)) 704 goto out_unlock; 705 refcount_inc(&device->refcount); 706 707 /* 708 * Another caller to add_client_context got here first and has already 709 * completely initialized context. 710 */ 711 if (xa_get_mark(&device->client_data, client->client_id, 712 CLIENT_DATA_REGISTERED)) 713 goto out; 714 715 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 716 GFP_KERNEL)); 717 if (ret) 718 goto out; 719 downgrade_write(&device->client_data_rwsem); 720 if (client->add) { 721 if (client->add(device)) { 722 /* 723 * If a client fails to add then the error code is 724 * ignored, but we won't call any more ops on this 725 * client. 
726 */ 727 xa_erase(&device->client_data, client->client_id); 728 up_read(&device->client_data_rwsem); 729 ib_device_put(device); 730 ib_client_put(client); 731 return 0; 732 } 733 } 734 735 /* Readers shall not see a client until add has been completed */ 736 xa_set_mark(&device->client_data, client->client_id, 737 CLIENT_DATA_REGISTERED); 738 up_read(&device->client_data_rwsem); 739 return 0; 740 741 out: 742 ib_device_put(device); 743 ib_client_put(client); 744 out_unlock: 745 up_write(&device->client_data_rwsem); 746 return ret; 747 } 748 749 static void remove_client_context(struct ib_device *device, 750 unsigned int client_id) 751 { 752 struct ib_client *client; 753 void *client_data; 754 755 down_write(&device->client_data_rwsem); 756 if (!xa_get_mark(&device->client_data, client_id, 757 CLIENT_DATA_REGISTERED)) { 758 up_write(&device->client_data_rwsem); 759 return; 760 } 761 client_data = xa_load(&device->client_data, client_id); 762 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); 763 client = xa_load(&clients, client_id); 764 up_write(&device->client_data_rwsem); 765 766 /* 767 * Notice we cannot be holding any exclusive locks when calling the 768 * remove callback as the remove callback can recurse back into any 769 * public functions in this module and thus try for any locks those 770 * functions take. 771 * 772 * For this reason clients and drivers should not call the 773 * unregistration functions will holdling any locks. 
774 */ 775 if (client->remove) 776 client->remove(device, client_data); 777 778 xa_erase(&device->client_data, client_id); 779 ib_device_put(device); 780 ib_client_put(client); 781 } 782 783 static int alloc_port_data(struct ib_device *device) 784 { 785 struct ib_port_data_rcu *pdata_rcu; 786 u32 port; 787 788 if (device->port_data) 789 return 0; 790 791 /* This can only be called once the physical port range is defined */ 792 if (WARN_ON(!device->phys_port_cnt)) 793 return -EINVAL; 794 795 /* Reserve U32_MAX so the logic to go over all the ports is sane */ 796 if (WARN_ON(device->phys_port_cnt == U32_MAX)) 797 return -EINVAL; 798 799 /* 800 * device->port_data is indexed directly by the port number to make 801 * access to this data as efficient as possible. 802 * 803 * Therefore port_data is declared as a 1 based array with potential 804 * empty slots at the beginning. 805 */ 806 pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, 807 size_add(rdma_end_port(device), 1)), 808 GFP_KERNEL); 809 if (!pdata_rcu) 810 return -ENOMEM; 811 /* 812 * The rcu_head is put in front of the port data array and the stored 813 * pointer is adjusted since we never need to see that member until 814 * kfree_rcu. 
815 */ 816 device->port_data = pdata_rcu->pdata; 817 818 rdma_for_each_port (device, port) { 819 struct ib_port_data *pdata = &device->port_data[port]; 820 821 pdata->ib_dev = device; 822 spin_lock_init(&pdata->pkey_list_lock); 823 INIT_LIST_HEAD(&pdata->pkey_list); 824 spin_lock_init(&pdata->netdev_lock); 825 INIT_HLIST_NODE(&pdata->ndev_hash_link); 826 } 827 return 0; 828 } 829 830 static int verify_immutable(const struct ib_device *dev, u32 port) 831 { 832 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 833 rdma_max_mad_size(dev, port) != 0); 834 } 835 836 static int setup_port_data(struct ib_device *device) 837 { 838 u32 port; 839 int ret; 840 841 ret = alloc_port_data(device); 842 if (ret) 843 return ret; 844 845 rdma_for_each_port (device, port) { 846 struct ib_port_data *pdata = &device->port_data[port]; 847 848 ret = device->ops.get_port_immutable(device, port, 849 &pdata->immutable); 850 if (ret) 851 return ret; 852 853 if (verify_immutable(device, port)) 854 return -EINVAL; 855 } 856 return 0; 857 } 858 859 /** 860 * ib_port_immutable_read() - Read rdma port's immutable data 861 * @dev: IB device 862 * @port: port number whose immutable data to read. It starts with index 1 and 863 * valid upto including rdma_end_port(). 
864 */ 865 const struct ib_port_immutable* 866 ib_port_immutable_read(struct ib_device *dev, unsigned int port) 867 { 868 WARN_ON(!rdma_is_port_valid(dev, port)); 869 return &dev->port_data[port].immutable; 870 } 871 EXPORT_SYMBOL(ib_port_immutable_read); 872 873 void ib_get_device_fw_str(struct ib_device *dev, char *str) 874 { 875 if (dev->ops.get_dev_fw_str) 876 dev->ops.get_dev_fw_str(dev, str); 877 else 878 str[0] = '\0'; 879 } 880 EXPORT_SYMBOL(ib_get_device_fw_str); 881 882 static void ib_policy_change_task(struct work_struct *work) 883 { 884 struct ib_device *dev; 885 unsigned long index; 886 887 down_read(&devices_rwsem); 888 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 889 unsigned int i; 890 891 rdma_for_each_port (dev, i) { 892 u64 sp; 893 ib_get_cached_subnet_prefix(dev, i, &sp); 894 ib_security_cache_change(dev, i, sp); 895 } 896 } 897 up_read(&devices_rwsem); 898 } 899 900 static int ib_security_change(struct notifier_block *nb, unsigned long event, 901 void *lsm_data) 902 { 903 if (event != LSM_POLICY_CHANGE) 904 return NOTIFY_DONE; 905 906 schedule_work(&ib_policy_change_work); 907 ib_mad_agent_security_change(); 908 909 return NOTIFY_OK; 910 } 911 912 static void compatdev_release(struct device *dev) 913 { 914 struct ib_core_device *cdev = 915 container_of(dev, struct ib_core_device, dev); 916 917 kfree(cdev); 918 } 919 920 static int add_one_compat_dev(struct ib_device *device, 921 struct rdma_dev_net *rnet) 922 { 923 struct ib_core_device *cdev; 924 int ret; 925 926 lockdep_assert_held(&rdma_nets_rwsem); 927 if (!ib_devices_shared_netns) 928 return 0; 929 930 /* 931 * Create and add compat device in all namespaces other than where it 932 * is currently bound to. 933 */ 934 if (net_eq(read_pnet(&rnet->net), 935 read_pnet(&device->coredev.rdma_net))) 936 return 0; 937 938 /* 939 * The first of init_net() or ib_register_device() to take the 940 * compat_devs_mutex wins and gets to add the device. 
Others will wait 941 * for completion here. 942 */ 943 mutex_lock(&device->compat_devs_mutex); 944 cdev = xa_load(&device->compat_devs, rnet->id); 945 if (cdev) { 946 ret = 0; 947 goto done; 948 } 949 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 950 if (ret) 951 goto done; 952 953 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 954 if (!cdev) { 955 ret = -ENOMEM; 956 goto cdev_err; 957 } 958 959 cdev->dev.parent = device->dev.parent; 960 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 961 cdev->dev.release = compatdev_release; 962 ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 963 if (ret) 964 goto add_err; 965 966 ret = device_add(&cdev->dev); 967 if (ret) 968 goto add_err; 969 ret = ib_setup_port_attrs(cdev); 970 if (ret) 971 goto port_err; 972 973 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 974 cdev, GFP_KERNEL)); 975 if (ret) 976 goto insert_err; 977 978 mutex_unlock(&device->compat_devs_mutex); 979 return 0; 980 981 insert_err: 982 ib_free_port_attrs(cdev); 983 port_err: 984 device_del(&cdev->dev); 985 add_err: 986 put_device(&cdev->dev); 987 cdev_err: 988 xa_release(&device->compat_devs, rnet->id); 989 done: 990 mutex_unlock(&device->compat_devs_mutex); 991 return ret; 992 } 993 994 static void remove_one_compat_dev(struct ib_device *device, u32 id) 995 { 996 struct ib_core_device *cdev; 997 998 mutex_lock(&device->compat_devs_mutex); 999 cdev = xa_erase(&device->compat_devs, id); 1000 mutex_unlock(&device->compat_devs_mutex); 1001 if (cdev) { 1002 ib_free_port_attrs(cdev); 1003 device_del(&cdev->dev); 1004 put_device(&cdev->dev); 1005 } 1006 } 1007 1008 static void remove_compat_devs(struct ib_device *device) 1009 { 1010 struct ib_core_device *cdev; 1011 unsigned long index; 1012 1013 xa_for_each (&device->compat_devs, index, cdev) 1014 remove_one_compat_dev(device, index); 1015 } 1016 1017 static int add_compat_devs(struct ib_device *device) 1018 { 1019 struct rdma_dev_net *rnet; 1020 unsigned long index; 1021 
int ret = 0; 1022 1023 lockdep_assert_held(&devices_rwsem); 1024 1025 down_read(&rdma_nets_rwsem); 1026 xa_for_each (&rdma_nets, index, rnet) { 1027 ret = add_one_compat_dev(device, rnet); 1028 if (ret) 1029 break; 1030 } 1031 up_read(&rdma_nets_rwsem); 1032 return ret; 1033 } 1034 1035 static void remove_all_compat_devs(void) 1036 { 1037 struct ib_compat_device *cdev; 1038 struct ib_device *dev; 1039 unsigned long index; 1040 1041 down_read(&devices_rwsem); 1042 xa_for_each (&devices, index, dev) { 1043 unsigned long c_index = 0; 1044 1045 /* Hold nets_rwsem so that any other thread modifying this 1046 * system param can sync with this thread. 1047 */ 1048 down_read(&rdma_nets_rwsem); 1049 xa_for_each (&dev->compat_devs, c_index, cdev) 1050 remove_one_compat_dev(dev, c_index); 1051 up_read(&rdma_nets_rwsem); 1052 } 1053 up_read(&devices_rwsem); 1054 } 1055 1056 static int add_all_compat_devs(void) 1057 { 1058 struct rdma_dev_net *rnet; 1059 struct ib_device *dev; 1060 unsigned long index; 1061 int ret = 0; 1062 1063 down_read(&devices_rwsem); 1064 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1065 unsigned long net_index = 0; 1066 1067 /* Hold nets_rwsem so that any other thread modifying this 1068 * system param can sync with this thread. 1069 */ 1070 down_read(&rdma_nets_rwsem); 1071 xa_for_each (&rdma_nets, net_index, rnet) { 1072 ret = add_one_compat_dev(dev, rnet); 1073 if (ret) 1074 break; 1075 } 1076 up_read(&rdma_nets_rwsem); 1077 } 1078 up_read(&devices_rwsem); 1079 if (ret) 1080 remove_all_compat_devs(); 1081 return ret; 1082 } 1083 1084 int rdma_compatdev_set(u8 enable) 1085 { 1086 struct rdma_dev_net *rnet; 1087 unsigned long index; 1088 int ret = 0; 1089 1090 down_write(&rdma_nets_rwsem); 1091 if (ib_devices_shared_netns == enable) { 1092 up_write(&rdma_nets_rwsem); 1093 return 0; 1094 } 1095 1096 /* enable/disable of compat devices is not supported 1097 * when more than default init_net exists. 
/*
 * Tear down the RDMA state of a network namespace that is going away.
 *
 * The namespace's slot in rdma_nets is first overwritten with NULL, then
 * every device has its compat device for this namespace removed and is asked
 * to move back to init_net, and finally the netlink state and the rdma_nets
 * slot are released.
 */
static void rdma_dev_exit_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	struct ib_device *dev;
	unsigned long index;
	int ret;

	down_write(&rdma_nets_rwsem);
	/*
	 * Prevent the ID from being re-used and hide the id from xa_for_each.
	 */
	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
	WARN_ON(ret);
	up_write(&rdma_nets_rwsem);

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		/* Pin the struct device while devices_rwsem is dropped */
		get_device(&dev->dev);
		/*
		 * Release the devices_rwsem so that potentially blocking
		 * device_del, doesn't hold the devices_rwsem for too long.
		 */
		up_read(&devices_rwsem);

		remove_one_compat_dev(dev, rnet->id);

		/*
		 * If the real device is in the NS then move it back to init.
		 * The return value is deliberately ignored here.
		 */
		rdma_dev_change_netns(dev, net, &init_net);

		put_device(&dev->dev);
		/* Re-take the lock before the iterator advances */
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);

	rdma_nl_net_exit(rnet);
	/* Finally give the reserved ID back */
	xa_erase(&rdma_nets, rnet->id);
}
*/ 1170 if (net_eq(net, &init_net)) 1171 return 0; 1172 1173 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1174 if (ret) { 1175 rdma_nl_net_exit(rnet); 1176 return ret; 1177 } 1178 1179 down_read(&devices_rwsem); 1180 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1181 /* Hold nets_rwsem so that netlink command cannot change 1182 * system configuration for device sharing mode. 1183 */ 1184 down_read(&rdma_nets_rwsem); 1185 ret = add_one_compat_dev(dev, rnet); 1186 up_read(&rdma_nets_rwsem); 1187 if (ret) 1188 break; 1189 } 1190 up_read(&devices_rwsem); 1191 1192 if (ret) 1193 rdma_dev_exit_net(net); 1194 1195 return ret; 1196 } 1197 1198 /* 1199 * Assign the unique string device name and the unique device index. This is 1200 * undone by ib_dealloc_device. 1201 */ 1202 static int assign_name(struct ib_device *device, const char *name) 1203 { 1204 static u32 last_id; 1205 int ret; 1206 1207 down_write(&devices_rwsem); 1208 /* Assign a unique name to the device */ 1209 if (strchr(name, '%')) 1210 ret = alloc_name(device, name); 1211 else 1212 ret = dev_set_name(&device->dev, name); 1213 if (ret) 1214 goto out; 1215 1216 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1217 ret = -ENFILE; 1218 goto out; 1219 } 1220 strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1221 1222 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1223 &last_id, GFP_KERNEL); 1224 if (ret > 0) 1225 ret = 0; 1226 1227 out: 1228 up_write(&devices_rwsem); 1229 return ret; 1230 } 1231 1232 /* 1233 * setup_device() allocates memory and sets up data that requires calling the 1234 * device ops, this is the only reason these actions are not done during 1235 * ib_alloc_device. It is undone by ib_dealloc_device(). 
/*
 * Take @device out of service: clear its DEVICE_REGISTERED mark, remove all
 * client contexts in reverse client-id order, clean up the CQ pool, drop the
 * reference taken in enable_device_and_get() and wait on unreg_completion,
 * then remove the compat devices.
 *
 * The caller is expected to still hold a non-zero device->refcount on entry
 * (this is only WARNed about, not enforced).
 */
static void disable_device(struct ib_device *device)
{
	u32 cid;

	WARN_ON(!refcount_read(&device->refcount));

	/* Hide the device from every xa_for_each_marked(DEVICE_REGISTERED) */
	down_write(&devices_rwsem);
	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
	up_write(&devices_rwsem);

	/*
	 * Remove clients in LIFO order, see assign_client_id. This could be
	 * more efficient if xarray learns to reverse iterate. Since no new
	 * clients can be added to this ib_device past this point we only need
	 * the maximum possible client_id value here.
	 */
	down_read(&clients_rwsem);
	cid = highest_client_id;
	up_read(&clients_rwsem);
	/* Visits ids highest_client_id - 1 down to 0 */
	while (cid) {
		cid--;
		remove_client_context(device, cid);
	}

	ib_cq_pool_cleanup(device);

	/* Pairs with refcount_set in enable_device */
	ib_device_put(device);
	/* Block until unreg_completion is signalled */
	wait_for_completion(&device->unreg_completion);

	/*
	 * compat devices must be removed after device refcount drops to zero.
	 * Otherwise init_net() may add more compatdevs after removing compat
	 * devices and before device is disabled.
	 */
	remove_compat_devs(device);
}
/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 *
 * Sequence: set the refcount to 2, mark the device DEVICE_REGISTERED under
 * the write side of devices_rwsem, downgrade to the read side, then call the
 * optional ops.enable_driver hook, add a context for every registered client
 * and finally create the compat devices.
 *
 * Returns 0 on success or the first error from enable_driver,
 * add_client_context() or add_compat_devs(). Nothing is unwound here on
 * failure.
 */
static int enable_device_and_get(struct ib_device *device)
{
	struct ib_client *client;
	unsigned long index;
	int ret = 0;

	/*
	 * One ref belongs to the xa and the other belongs to this
	 * thread. This is needed to guard against parallel unregistration.
	 */
	refcount_set(&device->refcount, 2);
	down_write(&devices_rwsem);
	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);

	/*
	 * By using downgrade_write() we ensure that no other thread can clear
	 * DEVICE_REGISTERED while we are completing the client setup.
	 */
	downgrade_write(&devices_rwsem);

	/* Optional driver hook; a failure skips client and compat setup */
	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);
	/* Compat devices are only created once every client was added */
	if (!ret)
		ret = add_compat_devs(device);
out:
	/* Releases the read side left over from downgrade_write() */
	up_read(&devices_rwsem);
	return ret;
}
/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
 * @name: unique string device name. This may include a '%' which will
 * 	  cause a unique index to be added to the passed device name.
 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
 *	        device will be used. In this case the caller should fully
 *		setup the ibdev for DMA. This usually means using dma_virt_ops.
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core.  All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 *
 * Return: 0 on success, otherwise the error code of the step that failed.
 * Failures of assign_name(), setup_device() and ib_cache_setup_one() return
 * directly without any unwinding in this function.
 */
int ib_register_device(struct ib_device *device, const char *name,
		       struct device *dma_device)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	/*
	 * If the caller does not provide a DMA capable device then the IB core
	 * will set up ib_sge and scatterlist structures that stash the kernel
	 * virtual address into the address field.
	 */
	WARN_ON(dma_device && !dma_device->dma_parms);
	device->dma_device = dma_device;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	/* Attribute groups: the core group first, then the driver's own */
	device->groups[0] = &ib_dev_attr_group;
	device->groups[1] = device->ops.device_group;
	ret = ib_setup_device_attrs(device);
	if (ret)
		goto cache_cleanup;

	ib_device_register_rdmacg(device);

	rdma_counter_init(device);

	/*
	 * Ensure that ADD uevent is not fired because it
	 * is too early and device is not initialized yet.
	 */
	dev_set_uevent_suppress(&device->dev, true);
	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_setup_port_attrs(&device->coredev);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility for a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * special dealloc_driver pointer, causing the responsibility to
		 * ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = prevent_dealloc_device;
		/* Drop the 'get' that enable_device_and_get() always takes */
		ib_device_put(device);
		__ib_unregister_device(device);
		/* Restore the driver's own callback for the caller */
		device->ops.dealloc_driver = dealloc_fn;
		dev_set_uevent_suppress(&device->dev, false);
		return ret;
	}
	dev_set_uevent_suppress(&device->dev, false);
	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
	/* Drop the 'get' that enable_device_and_get() always takes */
	ib_device_put(device);

	return 0;

	/* Unwind in reverse order of setup; labels fall through */
dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
cache_cleanup:
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);
1470 */ 1471 mutex_lock(&ib_dev->unregistration_lock); 1472 if (!refcount_read(&ib_dev->refcount)) 1473 goto out; 1474 1475 disable_device(ib_dev); 1476 1477 /* Expedite removing unregistered pointers from the hash table */ 1478 free_netdevs(ib_dev); 1479 1480 ib_free_port_attrs(&ib_dev->coredev); 1481 device_del(&ib_dev->dev); 1482 ib_device_unregister_rdmacg(ib_dev); 1483 ib_cache_cleanup_one(ib_dev); 1484 1485 /* 1486 * Drivers using the new flow may not call ib_dealloc_device except 1487 * in error unwind prior to registration success. 1488 */ 1489 if (ib_dev->ops.dealloc_driver && 1490 ib_dev->ops.dealloc_driver != prevent_dealloc_device) { 1491 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); 1492 ib_dealloc_device(ib_dev); 1493 } 1494 out: 1495 mutex_unlock(&ib_dev->unregistration_lock); 1496 } 1497 1498 /** 1499 * ib_unregister_device - Unregister an IB device 1500 * @ib_dev: The device to unregister 1501 * 1502 * Unregister an IB device. All clients will receive a remove callback. 1503 * 1504 * Callers should call this routine only once, and protect against races with 1505 * registration. Typically it should only be called as part of a remove 1506 * callback in an implementation of driver core's struct device_driver and 1507 * related. 1508 * 1509 * If ops.dealloc_driver is used then ib_dev will be freed upon return from 1510 * this function. 1511 */ 1512 void ib_unregister_device(struct ib_device *ib_dev) 1513 { 1514 get_device(&ib_dev->dev); 1515 __ib_unregister_device(ib_dev); 1516 put_device(&ib_dev->dev); 1517 } 1518 EXPORT_SYMBOL(ib_unregister_device); 1519 1520 /** 1521 * ib_unregister_device_and_put - Unregister a device while holding a 'get' 1522 * @ib_dev: The device to unregister 1523 * 1524 * This is the same as ib_unregister_device(), except it includes an internal 1525 * ib_device_put() that should match a 'get' obtained by the caller. 
1526 * 1527 * It is safe to call this routine concurrently from multiple threads while 1528 * holding the 'get'. When the function returns the device is fully 1529 * unregistered. 1530 * 1531 * Drivers using this flow MUST use the driver_unregister callback to clean up 1532 * their resources associated with the device and dealloc it. 1533 */ 1534 void ib_unregister_device_and_put(struct ib_device *ib_dev) 1535 { 1536 WARN_ON(!ib_dev->ops.dealloc_driver); 1537 get_device(&ib_dev->dev); 1538 ib_device_put(ib_dev); 1539 __ib_unregister_device(ib_dev); 1540 put_device(&ib_dev->dev); 1541 } 1542 EXPORT_SYMBOL(ib_unregister_device_and_put); 1543 1544 /** 1545 * ib_unregister_driver - Unregister all IB devices for a driver 1546 * @driver_id: The driver to unregister 1547 * 1548 * This implements a fence for device unregistration. It only returns once all 1549 * devices associated with the driver_id have fully completed their 1550 * unregistration and returned from ib_unregister_device*(). 1551 * 1552 * If device's are not yet unregistered it goes ahead and starts unregistering 1553 * them. 1554 * 1555 * This does not block creation of new devices with the given driver_id, that 1556 * is the responsibility of the caller. 
1557 */ 1558 void ib_unregister_driver(enum rdma_driver_id driver_id) 1559 { 1560 struct ib_device *ib_dev; 1561 unsigned long index; 1562 1563 down_read(&devices_rwsem); 1564 xa_for_each (&devices, index, ib_dev) { 1565 if (ib_dev->ops.driver_id != driver_id) 1566 continue; 1567 1568 get_device(&ib_dev->dev); 1569 up_read(&devices_rwsem); 1570 1571 WARN_ON(!ib_dev->ops.dealloc_driver); 1572 __ib_unregister_device(ib_dev); 1573 1574 put_device(&ib_dev->dev); 1575 down_read(&devices_rwsem); 1576 } 1577 up_read(&devices_rwsem); 1578 } 1579 EXPORT_SYMBOL(ib_unregister_driver); 1580 1581 static void ib_unregister_work(struct work_struct *work) 1582 { 1583 struct ib_device *ib_dev = 1584 container_of(work, struct ib_device, unregistration_work); 1585 1586 __ib_unregister_device(ib_dev); 1587 put_device(&ib_dev->dev); 1588 } 1589 1590 /** 1591 * ib_unregister_device_queued - Unregister a device using a work queue 1592 * @ib_dev: The device to unregister 1593 * 1594 * This schedules an asynchronous unregistration using a WQ for the device. A 1595 * driver should use this to avoid holding locks while doing unregistration, 1596 * such as holding the RTNL lock. 1597 * 1598 * Drivers using this API must use ib_unregister_driver before module unload 1599 * to ensure that all scheduled unregistrations have completed. 1600 */ 1601 void ib_unregister_device_queued(struct ib_device *ib_dev) 1602 { 1603 WARN_ON(!refcount_read(&ib_dev->refcount)); 1604 WARN_ON(!ib_dev->ops.dealloc_driver); 1605 get_device(&ib_dev->dev); 1606 if (!queue_work(ib_unreg_wq, &ib_dev->unregistration_work)) 1607 put_device(&ib_dev->dev); 1608 } 1609 EXPORT_SYMBOL(ib_unregister_device_queued); 1610 1611 /* 1612 * The caller must pass in a device that has the kref held and the refcount 1613 * released. If the device is in cur_net and still registered then it is moved 1614 * into net. 
/*
 * The caller must pass in a device that has the kref held and the refcount
 * released. If the device is in cur_net and still registered then it is moved
 * into net.
 *
 * Returns -ENODEV when the device is no longer registered or is not in
 * cur_net. Otherwise returns the device_rename() error if there was one, else
 * the result of enable_device_and_get().
 */
static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net)
{
	int ret2 = -EINVAL;
	int ret;

	mutex_lock(&device->unregistration_lock);

	/*
	 * If a device not under ib_device_get() or if the unregistration_lock
	 * is not held, the namespace can be changed, or it can be unregistered.
	 * Check again under the lock.
	 */
	if (refcount_read(&device->refcount) == 0 ||
	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
		ret = -ENODEV;
		goto out;
	}

	/* Announce the removal before the device is actually disabled */
	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
	disable_device(device);

	/*
	 * At this point no one can be using the device, so it is safe to
	 * change the namespace.
	 */
	write_pnet(&device->coredev.rdma_net, net);

	down_read(&devices_rwsem);
	/*
	 * Currently rdma devices are system wide unique. So the device name
	 * is guaranteed free in the new namespace. Publish the new namespace
	 * at the sysfs level.
	 */
	ret = device_rename(&device->dev, dev_name(&device->dev));
	up_read(&devices_rwsem);
	if (ret) {
		dev_warn(&device->dev,
			 "%s: Couldn't rename device after namespace change\n",
			 __func__);
		/* Try and put things back and re-enable the device */
		write_pnet(&device->coredev.rdma_net, cur_net);
	}

	/* Runs on both the success and the rollback path */
	ret2 = enable_device_and_get(device);
	if (ret2) {
		/*
		 * This shouldn't really happen, but if it does, let the user
		 * retry at later point. So don't disable the device.
		 */
		dev_warn(&device->dev,
			 "%s: Couldn't re-enable device after namespace change\n",
			 __func__);
	}
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	/* Drop the 'get' that enable_device_and_get() always takes */
	ib_device_put(device);
out:
	mutex_unlock(&device->unregistration_lock);
	/* A rename failure takes precedence over a re-enable failure */
	if (ret)
		return ret;
	return ret2;
}
1738 */ 1739 client->client_id = highest_client_id; 1740 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1741 if (ret) 1742 return ret; 1743 1744 highest_client_id++; 1745 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1746 return 0; 1747 } 1748 1749 static void remove_client_id(struct ib_client *client) 1750 { 1751 down_write(&clients_rwsem); 1752 xa_erase(&clients, client->client_id); 1753 for (; highest_client_id; highest_client_id--) 1754 if (xa_load(&clients, highest_client_id - 1)) 1755 break; 1756 up_write(&clients_rwsem); 1757 } 1758 1759 /** 1760 * ib_register_client - Register an IB client 1761 * @client:Client to register 1762 * 1763 * Upper level users of the IB drivers can use ib_register_client() to 1764 * register callbacks for IB device addition and removal. When an IB 1765 * device is added, each registered client's add method will be called 1766 * (in the order the clients were registered), and when a device is 1767 * removed, each client's remove method will be called (in the reverse 1768 * order that clients were registered). In addition, when 1769 * ib_register_client() is called, the client will receive an add 1770 * callback for all devices already registered. 1771 */ 1772 int ib_register_client(struct ib_client *client) 1773 { 1774 struct ib_device *device; 1775 unsigned long index; 1776 bool need_unreg = false; 1777 int ret; 1778 1779 refcount_set(&client->uses, 1); 1780 init_completion(&client->uses_zero); 1781 1782 /* 1783 * The devices_rwsem is held in write mode to ensure that a racing 1784 * ib_register_device() sees a consisent view of clients and devices. 
/**
 * ib_unregister_client - Unregister an IB client
 * @client:Client to unregister
 *
 * Upper level users use ib_unregister_client() to remove their client
 * registration.  When ib_unregister_client() is called, the client
 * will receive a remove callback for each IB device still registered.
 *
 * This is a full fence, once it returns no client callbacks will be called,
 * or are running in another thread.
 */
void ib_unregister_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;

	/*
	 * Drop a reference on the client and clear CLIENT_REGISTERED so that
	 * walks over registered clients no longer see it.
	 */
	down_write(&clients_rwsem);
	ib_client_put(client);
	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
	up_write(&clients_rwsem);

	/* We do not want to have locks while calling client->remove() */
	rcu_read_lock();
	xa_for_each (&devices, index, device) {
		/* Skip devices on which a reference can no longer be taken */
		if (!ib_device_try_get(device))
			continue;
		/* Leave the RCU section for the duration of the removal */
		rcu_read_unlock();

		remove_client_context(device, client->client_id);

		ib_device_put(device);
		rcu_read_lock();
	}
	rcu_read_unlock();

	/*
	 * remove_client_context() is not a fence, it can return even though a
	 * removal is ongoing. Wait until all removals are completed.
	 */
	wait_for_completion(&client->uses_zero);
	/* Only now is it safe to give the client id back */
	remove_client_id(client);
}
EXPORT_SYMBOL(ib_unregister_client);
1905 */ 1906 if (!ret && res->cdev) 1907 get_device(res->cdev); 1908 break; 1909 } 1910 up_read(&ibdev->client_data_rwsem); 1911 1912 return ret; 1913 } 1914 1915 /** 1916 * ib_get_client_nl_info - Fetch the nl_info from a client 1917 * @ibdev: IB device 1918 * @client_name: Name of the client 1919 * @res: Result of the query 1920 */ 1921 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1922 struct ib_client_nl_info *res) 1923 { 1924 int ret; 1925 1926 if (ibdev) 1927 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1928 else 1929 ret = __ib_get_global_client_nl_info(client_name, res); 1930 #ifdef CONFIG_MODULES 1931 if (ret == -ENOENT) { 1932 request_module("rdma-client-%s", client_name); 1933 if (ibdev) 1934 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1935 else 1936 ret = __ib_get_global_client_nl_info(client_name, res); 1937 } 1938 #endif 1939 if (ret) { 1940 if (ret == -ENOENT) 1941 return -EOPNOTSUPP; 1942 return ret; 1943 } 1944 1945 if (WARN_ON(!res->cdev)) 1946 return -EINVAL; 1947 return 0; 1948 } 1949 1950 /** 1951 * ib_set_client_data - Set IB client context 1952 * @device:Device to set context for 1953 * @client:Client to set context for 1954 * @data:Context to set 1955 * 1956 * ib_set_client_data() sets client context data that can be retrieved with 1957 * ib_get_client_data(). This can only be called while the client is 1958 * registered to the device, once the ib_client remove() callback returns this 1959 * cannot be called. 
1960 */ 1961 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1962 void *data) 1963 { 1964 void *rc; 1965 1966 if (WARN_ON(IS_ERR(data))) 1967 data = NULL; 1968 1969 rc = xa_store(&device->client_data, client->client_id, data, 1970 GFP_KERNEL); 1971 WARN_ON(xa_is_err(rc)); 1972 } 1973 EXPORT_SYMBOL(ib_set_client_data); 1974 1975 /** 1976 * ib_register_event_handler - Register an IB event handler 1977 * @event_handler:Handler to register 1978 * 1979 * ib_register_event_handler() registers an event handler that will be 1980 * called back when asynchronous IB events occur (as defined in 1981 * chapter 11 of the InfiniBand Architecture Specification). This 1982 * callback occurs in workqueue context. 1983 */ 1984 void ib_register_event_handler(struct ib_event_handler *event_handler) 1985 { 1986 down_write(&event_handler->device->event_handler_rwsem); 1987 list_add_tail(&event_handler->list, 1988 &event_handler->device->event_handler_list); 1989 up_write(&event_handler->device->event_handler_rwsem); 1990 } 1991 EXPORT_SYMBOL(ib_register_event_handler); 1992 1993 /** 1994 * ib_unregister_event_handler - Unregister an event handler 1995 * @event_handler:Handler to unregister 1996 * 1997 * Unregister an event handler registered with 1998 * ib_register_event_handler(). 
1999 */ 2000 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 2001 { 2002 down_write(&event_handler->device->event_handler_rwsem); 2003 list_del(&event_handler->list); 2004 up_write(&event_handler->device->event_handler_rwsem); 2005 } 2006 EXPORT_SYMBOL(ib_unregister_event_handler); 2007 2008 void ib_dispatch_event_clients(struct ib_event *event) 2009 { 2010 struct ib_event_handler *handler; 2011 2012 down_read(&event->device->event_handler_rwsem); 2013 2014 list_for_each_entry(handler, &event->device->event_handler_list, list) 2015 handler->handler(handler, event); 2016 2017 up_read(&event->device->event_handler_rwsem); 2018 } 2019 2020 static int iw_query_port(struct ib_device *device, 2021 u32 port_num, 2022 struct ib_port_attr *port_attr) 2023 { 2024 struct in_device *inetdev; 2025 struct net_device *netdev; 2026 2027 memset(port_attr, 0, sizeof(*port_attr)); 2028 2029 netdev = ib_device_get_netdev(device, port_num); 2030 if (!netdev) 2031 return -ENODEV; 2032 2033 port_attr->max_mtu = IB_MTU_4096; 2034 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2035 2036 if (!netif_carrier_ok(netdev)) { 2037 port_attr->state = IB_PORT_DOWN; 2038 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2039 } else { 2040 rcu_read_lock(); 2041 inetdev = __in_dev_get_rcu(netdev); 2042 2043 if (inetdev && inetdev->ifa_list) { 2044 port_attr->state = IB_PORT_ACTIVE; 2045 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2046 } else { 2047 port_attr->state = IB_PORT_INIT; 2048 port_attr->phys_state = 2049 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2050 } 2051 2052 rcu_read_unlock(); 2053 } 2054 2055 dev_put(netdev); 2056 return device->ops.query_port(device, port_num, port_attr); 2057 } 2058 2059 static int __ib_query_port(struct ib_device *device, 2060 u32 port_num, 2061 struct ib_port_attr *port_attr) 2062 { 2063 int err; 2064 2065 memset(port_attr, 0, sizeof(*port_attr)); 2066 2067 err = device->ops.query_port(device, port_num, port_attr); 
2068 if (err || port_attr->subnet_prefix) 2069 return err; 2070 2071 if (rdma_port_get_link_layer(device, port_num) != 2072 IB_LINK_LAYER_INFINIBAND) 2073 return 0; 2074 2075 ib_get_cached_subnet_prefix(device, port_num, 2076 &port_attr->subnet_prefix); 2077 return 0; 2078 } 2079 2080 /** 2081 * ib_query_port - Query IB port attributes 2082 * @device:Device to query 2083 * @port_num:Port number to query 2084 * @port_attr:Port attributes 2085 * 2086 * ib_query_port() returns the attributes of a port through the 2087 * @port_attr pointer. 2088 */ 2089 int ib_query_port(struct ib_device *device, 2090 u32 port_num, 2091 struct ib_port_attr *port_attr) 2092 { 2093 if (!rdma_is_port_valid(device, port_num)) 2094 return -EINVAL; 2095 2096 if (rdma_protocol_iwarp(device, port_num)) 2097 return iw_query_port(device, port_num, port_attr); 2098 else 2099 return __ib_query_port(device, port_num, port_attr); 2100 } 2101 EXPORT_SYMBOL(ib_query_port); 2102 2103 static void add_ndev_hash(struct ib_port_data *pdata) 2104 { 2105 unsigned long flags; 2106 2107 might_sleep(); 2108 2109 spin_lock_irqsave(&ndev_hash_lock, flags); 2110 if (hash_hashed(&pdata->ndev_hash_link)) { 2111 hash_del_rcu(&pdata->ndev_hash_link); 2112 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2113 /* 2114 * We cannot do hash_add_rcu after a hash_del_rcu until the 2115 * grace period 2116 */ 2117 synchronize_rcu(); 2118 spin_lock_irqsave(&ndev_hash_lock, flags); 2119 } 2120 if (pdata->netdev) 2121 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2122 (uintptr_t)pdata->netdev); 2123 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2124 } 2125 2126 /** 2127 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2128 * @ib_dev: Device to modify 2129 * @ndev: net_device to affiliate, may be NULL 2130 * @port: IB port the net_device is connected to 2131 * 2132 * Drivers should use this to link the ib_device to a netdev so the netdev 2133 * shows up in interfaces like ib_enum_roce_netdev. 
Only one netdev may be 2134 * affiliated with any port. 2135 * 2136 * The caller must ensure that the given ndev is not unregistered or 2137 * unregistering, and that either the ib_device is unregistered or 2138 * ib_device_set_netdev() is called with NULL when the ndev sends a 2139 * NETDEV_UNREGISTER event. 2140 */ 2141 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2142 u32 port) 2143 { 2144 struct net_device *old_ndev; 2145 struct ib_port_data *pdata; 2146 unsigned long flags; 2147 int ret; 2148 2149 /* 2150 * Drivers wish to call this before ib_register_driver, so we have to 2151 * setup the port data early. 2152 */ 2153 ret = alloc_port_data(ib_dev); 2154 if (ret) 2155 return ret; 2156 2157 if (!rdma_is_port_valid(ib_dev, port)) 2158 return -EINVAL; 2159 2160 pdata = &ib_dev->port_data[port]; 2161 spin_lock_irqsave(&pdata->netdev_lock, flags); 2162 old_ndev = rcu_dereference_protected( 2163 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2164 if (old_ndev == ndev) { 2165 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2166 return 0; 2167 } 2168 2169 if (old_ndev) 2170 netdev_tracker_free(ndev, &pdata->netdev_tracker); 2171 if (ndev) 2172 netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC); 2173 rcu_assign_pointer(pdata->netdev, ndev); 2174 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2175 2176 add_ndev_hash(pdata); 2177 __dev_put(old_ndev); 2178 2179 return 0; 2180 } 2181 EXPORT_SYMBOL(ib_device_set_netdev); 2182 2183 static void free_netdevs(struct ib_device *ib_dev) 2184 { 2185 unsigned long flags; 2186 u32 port; 2187 2188 if (!ib_dev->port_data) 2189 return; 2190 2191 rdma_for_each_port (ib_dev, port) { 2192 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2193 struct net_device *ndev; 2194 2195 spin_lock_irqsave(&pdata->netdev_lock, flags); 2196 ndev = rcu_dereference_protected( 2197 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2198 if (ndev) { 2199 spin_lock(&ndev_hash_lock); 2200 
hash_del_rcu(&pdata->ndev_hash_link); 2201 spin_unlock(&ndev_hash_lock); 2202 2203 /* 2204 * If this is the last dev_put there is still a 2205 * synchronize_rcu before the netdev is kfreed, so we 2206 * can continue to rely on unlocked pointer 2207 * comparisons after the put 2208 */ 2209 rcu_assign_pointer(pdata->netdev, NULL); 2210 netdev_put(ndev, &pdata->netdev_tracker); 2211 } 2212 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2213 } 2214 } 2215 2216 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2217 u32 port) 2218 { 2219 struct ib_port_data *pdata; 2220 struct net_device *res; 2221 2222 if (!rdma_is_port_valid(ib_dev, port)) 2223 return NULL; 2224 2225 pdata = &ib_dev->port_data[port]; 2226 2227 /* 2228 * New drivers should use ib_device_set_netdev() not the legacy 2229 * get_netdev(). 2230 */ 2231 if (ib_dev->ops.get_netdev) 2232 res = ib_dev->ops.get_netdev(ib_dev, port); 2233 else { 2234 spin_lock(&pdata->netdev_lock); 2235 res = rcu_dereference_protected( 2236 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2237 dev_hold(res); 2238 spin_unlock(&pdata->netdev_lock); 2239 } 2240 2241 /* 2242 * If we are starting to unregister expedite things by preventing 2243 * propagation of an unregistering netdev. 2244 */ 2245 if (res && res->reg_state != NETREG_REGISTERED) { 2246 dev_put(res); 2247 return NULL; 2248 } 2249 2250 return res; 2251 } 2252 2253 /** 2254 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2255 * @ndev: netdev to locate 2256 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2257 * 2258 * Find and hold an ib_device that is associated with a netdev via 2259 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2260 * returned pointer. 
2261 */ 2262 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2263 enum rdma_driver_id driver_id) 2264 { 2265 struct ib_device *res = NULL; 2266 struct ib_port_data *cur; 2267 2268 rcu_read_lock(); 2269 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2270 (uintptr_t)ndev) { 2271 if (rcu_access_pointer(cur->netdev) == ndev && 2272 (driver_id == RDMA_DRIVER_UNKNOWN || 2273 cur->ib_dev->ops.driver_id == driver_id) && 2274 ib_device_try_get(cur->ib_dev)) { 2275 res = cur->ib_dev; 2276 break; 2277 } 2278 } 2279 rcu_read_unlock(); 2280 2281 return res; 2282 } 2283 EXPORT_SYMBOL(ib_device_get_by_netdev); 2284 2285 /** 2286 * ib_enum_roce_netdev - enumerate all RoCE ports 2287 * @ib_dev : IB device we want to query 2288 * @filter: Should we call the callback? 2289 * @filter_cookie: Cookie passed to filter 2290 * @cb: Callback to call for each found RoCE ports 2291 * @cookie: Cookie passed back to the callback 2292 * 2293 * Enumerates all of the physical RoCE ports of ib_dev 2294 * which are related to netdevice and calls callback() on each 2295 * device for which filter() function returns non zero. 2296 */ 2297 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2298 roce_netdev_filter filter, 2299 void *filter_cookie, 2300 roce_netdev_callback cb, 2301 void *cookie) 2302 { 2303 u32 port; 2304 2305 rdma_for_each_port (ib_dev, port) 2306 if (rdma_protocol_roce(ib_dev, port)) { 2307 struct net_device *idev = 2308 ib_device_get_netdev(ib_dev, port); 2309 2310 if (filter(ib_dev, port, idev, filter_cookie)) 2311 cb(ib_dev, port, idev, cookie); 2312 dev_put(idev); 2313 } 2314 } 2315 2316 /** 2317 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2318 * @filter: Should we call the callback? 
2319 * @filter_cookie: Cookie passed to filter 2320 * @cb: Callback to call for each found RoCE ports 2321 * @cookie: Cookie passed back to the callback 2322 * 2323 * Enumerates all RoCE devices' physical ports which are related 2324 * to netdevices and calls callback() on each device for which 2325 * filter() function returns non zero. 2326 */ 2327 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2328 void *filter_cookie, 2329 roce_netdev_callback cb, 2330 void *cookie) 2331 { 2332 struct ib_device *dev; 2333 unsigned long index; 2334 2335 down_read(&devices_rwsem); 2336 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2337 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2338 up_read(&devices_rwsem); 2339 } 2340 2341 /* 2342 * ib_enum_all_devs - enumerate all ib_devices 2343 * @cb: Callback to call for each found ib_device 2344 * 2345 * Enumerates all ib_devices and calls callback() on each device. 2346 */ 2347 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2348 struct netlink_callback *cb) 2349 { 2350 unsigned long index; 2351 struct ib_device *dev; 2352 unsigned int idx = 0; 2353 int ret = 0; 2354 2355 down_read(&devices_rwsem); 2356 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2357 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2358 continue; 2359 2360 ret = nldev_cb(dev, skb, cb, idx); 2361 if (ret) 2362 break; 2363 idx++; 2364 } 2365 up_read(&devices_rwsem); 2366 return ret; 2367 } 2368 2369 /** 2370 * ib_query_pkey - Get P_Key table entry 2371 * @device:Device to query 2372 * @port_num:Port number to query 2373 * @index:P_Key table index to query 2374 * @pkey:Returned P_Key 2375 * 2376 * ib_query_pkey() fetches the specified P_Key table entry. 
2377 */ 2378 int ib_query_pkey(struct ib_device *device, 2379 u32 port_num, u16 index, u16 *pkey) 2380 { 2381 if (!rdma_is_port_valid(device, port_num)) 2382 return -EINVAL; 2383 2384 if (!device->ops.query_pkey) 2385 return -EOPNOTSUPP; 2386 2387 return device->ops.query_pkey(device, port_num, index, pkey); 2388 } 2389 EXPORT_SYMBOL(ib_query_pkey); 2390 2391 /** 2392 * ib_modify_device - Change IB device attributes 2393 * @device:Device to modify 2394 * @device_modify_mask:Mask of attributes to change 2395 * @device_modify:New attribute values 2396 * 2397 * ib_modify_device() changes a device's attributes as specified by 2398 * the @device_modify_mask and @device_modify structure. 2399 */ 2400 int ib_modify_device(struct ib_device *device, 2401 int device_modify_mask, 2402 struct ib_device_modify *device_modify) 2403 { 2404 if (!device->ops.modify_device) 2405 return -EOPNOTSUPP; 2406 2407 return device->ops.modify_device(device, device_modify_mask, 2408 device_modify); 2409 } 2410 EXPORT_SYMBOL(ib_modify_device); 2411 2412 /** 2413 * ib_modify_port - Modifies the attributes for the specified port. 2414 * @device: The device to modify. 2415 * @port_num: The number of the port to modify. 2416 * @port_modify_mask: Mask used to specify which attributes of the port 2417 * to change. 2418 * @port_modify: New attribute values for the port. 2419 * 2420 * ib_modify_port() changes a port's attributes as specified by the 2421 * @port_modify_mask and @port_modify structure. 
2422 */ 2423 int ib_modify_port(struct ib_device *device, 2424 u32 port_num, int port_modify_mask, 2425 struct ib_port_modify *port_modify) 2426 { 2427 int rc; 2428 2429 if (!rdma_is_port_valid(device, port_num)) 2430 return -EINVAL; 2431 2432 if (device->ops.modify_port) 2433 rc = device->ops.modify_port(device, port_num, 2434 port_modify_mask, 2435 port_modify); 2436 else if (rdma_protocol_roce(device, port_num) && 2437 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2438 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2439 rc = 0; 2440 else 2441 rc = -EOPNOTSUPP; 2442 return rc; 2443 } 2444 EXPORT_SYMBOL(ib_modify_port); 2445 2446 /** 2447 * ib_find_gid - Returns the port number and GID table index where 2448 * a specified GID value occurs. Its searches only for IB link layer. 2449 * @device: The device to query. 2450 * @gid: The GID value to search for. 2451 * @port_num: The port number of the device where the GID value was found. 2452 * @index: The index into the GID table where the GID was found. This 2453 * parameter may be NULL. 2454 */ 2455 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2456 u32 *port_num, u16 *index) 2457 { 2458 union ib_gid tmp_gid; 2459 u32 port; 2460 int ret, i; 2461 2462 rdma_for_each_port (device, port) { 2463 if (!rdma_protocol_ib(device, port)) 2464 continue; 2465 2466 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2467 ++i) { 2468 ret = rdma_query_gid(device, port, i, &tmp_gid); 2469 if (ret) 2470 continue; 2471 2472 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2473 *port_num = port; 2474 if (index) 2475 *index = i; 2476 return 0; 2477 } 2478 } 2479 } 2480 2481 return -ENOENT; 2482 } 2483 EXPORT_SYMBOL(ib_find_gid); 2484 2485 /** 2486 * ib_find_pkey - Returns the PKey table index where a specified 2487 * PKey value occurs. 2488 * @device: The device to query. 2489 * @port_num: The port number of the device to search for the PKey. 2490 * @pkey: The PKey value to search for. 
2491 * @index: The index into the PKey table where the PKey was found. 2492 */ 2493 int ib_find_pkey(struct ib_device *device, 2494 u32 port_num, u16 pkey, u16 *index) 2495 { 2496 int ret, i; 2497 u16 tmp_pkey; 2498 int partial_ix = -1; 2499 2500 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2501 ++i) { 2502 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2503 if (ret) 2504 return ret; 2505 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2506 /* if there is full-member pkey take it.*/ 2507 if (tmp_pkey & 0x8000) { 2508 *index = i; 2509 return 0; 2510 } 2511 if (partial_ix < 0) 2512 partial_ix = i; 2513 } 2514 } 2515 2516 /*no full-member, if exists take the limited*/ 2517 if (partial_ix >= 0) { 2518 *index = partial_ix; 2519 return 0; 2520 } 2521 return -ENOENT; 2522 } 2523 EXPORT_SYMBOL(ib_find_pkey); 2524 2525 /** 2526 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2527 * for a received CM request 2528 * @dev: An RDMA device on which the request has been received. 2529 * @port: Port number on the RDMA device. 2530 * @pkey: The Pkey the request came on. 2531 * @gid: A GID that the net_dev uses to communicate. 2532 * @addr: Contains the IP address that the request specified as its 2533 * destination. 
2534 * 2535 */ 2536 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2537 u32 port, 2538 u16 pkey, 2539 const union ib_gid *gid, 2540 const struct sockaddr *addr) 2541 { 2542 struct net_device *net_dev = NULL; 2543 unsigned long index; 2544 void *client_data; 2545 2546 if (!rdma_protocol_ib(dev, port)) 2547 return NULL; 2548 2549 /* 2550 * Holding the read side guarantees that the client will not become 2551 * unregistered while we are calling get_net_dev_by_params() 2552 */ 2553 down_read(&dev->client_data_rwsem); 2554 xan_for_each_marked (&dev->client_data, index, client_data, 2555 CLIENT_DATA_REGISTERED) { 2556 struct ib_client *client = xa_load(&clients, index); 2557 2558 if (!client || !client->get_net_dev_by_params) 2559 continue; 2560 2561 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2562 addr, client_data); 2563 if (net_dev) 2564 break; 2565 } 2566 up_read(&dev->client_data_rwsem); 2567 2568 return net_dev; 2569 } 2570 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2571 2572 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2573 { 2574 struct ib_device_ops *dev_ops = &dev->ops; 2575 #define SET_DEVICE_OP(ptr, name) \ 2576 do { \ 2577 if (ops->name) \ 2578 if (!((ptr)->name)) \ 2579 (ptr)->name = ops->name; \ 2580 } while (0) 2581 2582 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2583 2584 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2585 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2586 dev_ops->driver_id != ops->driver_id); 2587 dev_ops->driver_id = ops->driver_id; 2588 } 2589 if (ops->owner) { 2590 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2591 dev_ops->owner = ops->owner; 2592 } 2593 if (ops->uverbs_abi_ver) 2594 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2595 2596 dev_ops->uverbs_no_driver_id_binding |= 2597 ops->uverbs_no_driver_id_binding; 2598 2599 SET_DEVICE_OP(dev_ops, add_gid); 2600 SET_DEVICE_OP(dev_ops, advise_mr); 2601 SET_DEVICE_OP(dev_ops, 
alloc_dm); 2602 SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); 2603 SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); 2604 SET_DEVICE_OP(dev_ops, alloc_mr); 2605 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2606 SET_DEVICE_OP(dev_ops, alloc_mw); 2607 SET_DEVICE_OP(dev_ops, alloc_pd); 2608 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2609 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2610 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2611 SET_DEVICE_OP(dev_ops, attach_mcast); 2612 SET_DEVICE_OP(dev_ops, check_mr_status); 2613 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2614 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2615 SET_DEVICE_OP(dev_ops, counter_dealloc); 2616 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2617 SET_DEVICE_OP(dev_ops, counter_update_stats); 2618 SET_DEVICE_OP(dev_ops, create_ah); 2619 SET_DEVICE_OP(dev_ops, create_counters); 2620 SET_DEVICE_OP(dev_ops, create_cq); 2621 SET_DEVICE_OP(dev_ops, create_flow); 2622 SET_DEVICE_OP(dev_ops, create_qp); 2623 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2624 SET_DEVICE_OP(dev_ops, create_srq); 2625 SET_DEVICE_OP(dev_ops, create_user_ah); 2626 SET_DEVICE_OP(dev_ops, create_wq); 2627 SET_DEVICE_OP(dev_ops, dealloc_dm); 2628 SET_DEVICE_OP(dev_ops, dealloc_driver); 2629 SET_DEVICE_OP(dev_ops, dealloc_mw); 2630 SET_DEVICE_OP(dev_ops, dealloc_pd); 2631 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2632 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2633 SET_DEVICE_OP(dev_ops, del_gid); 2634 SET_DEVICE_OP(dev_ops, dereg_mr); 2635 SET_DEVICE_OP(dev_ops, destroy_ah); 2636 SET_DEVICE_OP(dev_ops, destroy_counters); 2637 SET_DEVICE_OP(dev_ops, destroy_cq); 2638 SET_DEVICE_OP(dev_ops, destroy_flow); 2639 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2640 SET_DEVICE_OP(dev_ops, destroy_qp); 2641 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2642 SET_DEVICE_OP(dev_ops, destroy_srq); 2643 SET_DEVICE_OP(dev_ops, destroy_wq); 2644 SET_DEVICE_OP(dev_ops, device_group); 2645 SET_DEVICE_OP(dev_ops, detach_mcast); 2646 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 
2647 SET_DEVICE_OP(dev_ops, drain_rq); 2648 SET_DEVICE_OP(dev_ops, drain_sq); 2649 SET_DEVICE_OP(dev_ops, enable_driver); 2650 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2651 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2652 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2653 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2654 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2655 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2656 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2657 SET_DEVICE_OP(dev_ops, fill_res_srq_entry); 2658 SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw); 2659 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2660 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2661 SET_DEVICE_OP(dev_ops, get_dma_mr); 2662 SET_DEVICE_OP(dev_ops, get_hw_stats); 2663 SET_DEVICE_OP(dev_ops, get_link_layer); 2664 SET_DEVICE_OP(dev_ops, get_netdev); 2665 SET_DEVICE_OP(dev_ops, get_numa_node); 2666 SET_DEVICE_OP(dev_ops, get_port_immutable); 2667 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2668 SET_DEVICE_OP(dev_ops, get_vf_config); 2669 SET_DEVICE_OP(dev_ops, get_vf_guid); 2670 SET_DEVICE_OP(dev_ops, get_vf_stats); 2671 SET_DEVICE_OP(dev_ops, iw_accept); 2672 SET_DEVICE_OP(dev_ops, iw_add_ref); 2673 SET_DEVICE_OP(dev_ops, iw_connect); 2674 SET_DEVICE_OP(dev_ops, iw_create_listen); 2675 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2676 SET_DEVICE_OP(dev_ops, iw_get_qp); 2677 SET_DEVICE_OP(dev_ops, iw_reject); 2678 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2679 SET_DEVICE_OP(dev_ops, map_mr_sg); 2680 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2681 SET_DEVICE_OP(dev_ops, mmap); 2682 SET_DEVICE_OP(dev_ops, mmap_free); 2683 SET_DEVICE_OP(dev_ops, modify_ah); 2684 SET_DEVICE_OP(dev_ops, modify_cq); 2685 SET_DEVICE_OP(dev_ops, modify_device); 2686 SET_DEVICE_OP(dev_ops, modify_hw_stat); 2687 SET_DEVICE_OP(dev_ops, modify_port); 2688 SET_DEVICE_OP(dev_ops, modify_qp); 2689 SET_DEVICE_OP(dev_ops, modify_srq); 2690 SET_DEVICE_OP(dev_ops, modify_wq); 2691 SET_DEVICE_OP(dev_ops, peek_cq); 2692 
SET_DEVICE_OP(dev_ops, poll_cq); 2693 SET_DEVICE_OP(dev_ops, port_groups); 2694 SET_DEVICE_OP(dev_ops, post_recv); 2695 SET_DEVICE_OP(dev_ops, post_send); 2696 SET_DEVICE_OP(dev_ops, post_srq_recv); 2697 SET_DEVICE_OP(dev_ops, process_mad); 2698 SET_DEVICE_OP(dev_ops, query_ah); 2699 SET_DEVICE_OP(dev_ops, query_device); 2700 SET_DEVICE_OP(dev_ops, query_gid); 2701 SET_DEVICE_OP(dev_ops, query_pkey); 2702 SET_DEVICE_OP(dev_ops, query_port); 2703 SET_DEVICE_OP(dev_ops, query_qp); 2704 SET_DEVICE_OP(dev_ops, query_srq); 2705 SET_DEVICE_OP(dev_ops, query_ucontext); 2706 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2707 SET_DEVICE_OP(dev_ops, read_counters); 2708 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2709 SET_DEVICE_OP(dev_ops, reg_user_mr); 2710 SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf); 2711 SET_DEVICE_OP(dev_ops, req_notify_cq); 2712 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2713 SET_DEVICE_OP(dev_ops, resize_cq); 2714 SET_DEVICE_OP(dev_ops, set_vf_guid); 2715 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2716 2717 SET_OBJ_SIZE(dev_ops, ib_ah); 2718 SET_OBJ_SIZE(dev_ops, ib_counters); 2719 SET_OBJ_SIZE(dev_ops, ib_cq); 2720 SET_OBJ_SIZE(dev_ops, ib_mw); 2721 SET_OBJ_SIZE(dev_ops, ib_pd); 2722 SET_OBJ_SIZE(dev_ops, ib_qp); 2723 SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table); 2724 SET_OBJ_SIZE(dev_ops, ib_srq); 2725 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2726 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2727 } 2728 EXPORT_SYMBOL(ib_set_device_ops); 2729 2730 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2731 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents) 2732 { 2733 struct scatterlist *s; 2734 int i; 2735 2736 for_each_sg(sg, s, nents, i) { 2737 sg_dma_address(s) = (uintptr_t)sg_virt(s); 2738 sg_dma_len(s) = s->length; 2739 } 2740 return nents; 2741 } 2742 EXPORT_SYMBOL(ib_dma_virt_map_sg); 2743 #endif /* CONFIG_INFINIBAND_VIRT_DMA */ 2744 2745 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2746 [RDMA_NL_LS_OP_RESOLVE] = { 2747 .doit = 
ib_nl_handle_resolve_resp, 2748 .flags = RDMA_NL_ADMIN_PERM, 2749 }, 2750 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2751 .doit = ib_nl_handle_set_timeout, 2752 .flags = RDMA_NL_ADMIN_PERM, 2753 }, 2754 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2755 .doit = ib_nl_handle_ip_res_resp, 2756 .flags = RDMA_NL_ADMIN_PERM, 2757 }, 2758 }; 2759 2760 static int __init ib_core_init(void) 2761 { 2762 int ret = -ENOMEM; 2763 2764 ib_wq = alloc_workqueue("infiniband", 0, 0); 2765 if (!ib_wq) 2766 return -ENOMEM; 2767 2768 ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND, 2769 WQ_UNBOUND_MAX_ACTIVE); 2770 if (!ib_unreg_wq) 2771 goto err; 2772 2773 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2774 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2775 if (!ib_comp_wq) 2776 goto err_unbound; 2777 2778 ib_comp_unbound_wq = 2779 alloc_workqueue("ib-comp-unb-wq", 2780 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2781 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2782 if (!ib_comp_unbound_wq) 2783 goto err_comp; 2784 2785 ret = class_register(&ib_class); 2786 if (ret) { 2787 pr_warn("Couldn't create InfiniBand device class\n"); 2788 goto err_comp_unbound; 2789 } 2790 2791 rdma_nl_init(); 2792 2793 ret = addr_init(); 2794 if (ret) { 2795 pr_warn("Couldn't init IB address resolution\n"); 2796 goto err_ibnl; 2797 } 2798 2799 ret = ib_mad_init(); 2800 if (ret) { 2801 pr_warn("Couldn't init IB MAD\n"); 2802 goto err_addr; 2803 } 2804 2805 ret = ib_sa_init(); 2806 if (ret) { 2807 pr_warn("Couldn't init SA\n"); 2808 goto err_mad; 2809 } 2810 2811 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 2812 if (ret) { 2813 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 2814 goto err_sa; 2815 } 2816 2817 ret = register_pernet_device(&rdma_dev_net_ops); 2818 if (ret) { 2819 pr_warn("Couldn't init compat dev. 
ret %d\n", ret); 2820 goto err_compat; 2821 } 2822 2823 nldev_init(); 2824 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 2825 ret = roce_gid_mgmt_init(); 2826 if (ret) { 2827 pr_warn("Couldn't init RoCE GID management\n"); 2828 goto err_parent; 2829 } 2830 2831 return 0; 2832 2833 err_parent: 2834 rdma_nl_unregister(RDMA_NL_LS); 2835 nldev_exit(); 2836 unregister_pernet_device(&rdma_dev_net_ops); 2837 err_compat: 2838 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2839 err_sa: 2840 ib_sa_cleanup(); 2841 err_mad: 2842 ib_mad_cleanup(); 2843 err_addr: 2844 addr_cleanup(); 2845 err_ibnl: 2846 class_unregister(&ib_class); 2847 err_comp_unbound: 2848 destroy_workqueue(ib_comp_unbound_wq); 2849 err_comp: 2850 destroy_workqueue(ib_comp_wq); 2851 err_unbound: 2852 destroy_workqueue(ib_unreg_wq); 2853 err: 2854 destroy_workqueue(ib_wq); 2855 return ret; 2856 } 2857 2858 static void __exit ib_core_cleanup(void) 2859 { 2860 roce_gid_mgmt_cleanup(); 2861 rdma_nl_unregister(RDMA_NL_LS); 2862 nldev_exit(); 2863 unregister_pernet_device(&rdma_dev_net_ops); 2864 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2865 ib_sa_cleanup(); 2866 ib_mad_cleanup(); 2867 addr_cleanup(); 2868 rdma_nl_exit(); 2869 class_unregister(&ib_class); 2870 destroy_workqueue(ib_comp_unbound_wq); 2871 destroy_workqueue(ib_comp_wq); 2872 /* Make sure that any pending umem accounting work is done. */ 2873 destroy_workqueue(ib_wq); 2874 destroy_workqueue(ib_unreg_wq); 2875 WARN_ON(!xa_empty(&clients)); 2876 WARN_ON(!xa_empty(&devices)); 2877 } 2878 2879 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 2880 2881 /* ib core relies on netdev stack to first register net_ns_type_operations 2882 * ns kobject type before ib_core initialization. 2883 */ 2884 fs_initcall(ib_core_init); 2885 module_exit(ib_core_cleanup); 2886