1 /*- 2 * Copyright (c) 2010 Isilon Systems, Inc. 3 * Copyright (c) 2010 iX Systems, Inc. 4 * Copyright (c) 2010 Panasas, Inc. 5 * Copyright (c) 2013-2021 Mellanox Technologies, Ltd. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice unmodified, this list of conditions, and the following 13 * disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 */ 29 30 #include <sys/cdefs.h> 31 #include "opt_global.h" 32 #include "opt_stack.h" 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/malloc.h> 37 #include <sys/kernel.h> 38 #include <sys/sysctl.h> 39 #include <sys/proc.h> 40 #include <sys/sglist.h> 41 #include <sys/sleepqueue.h> 42 #include <sys/refcount.h> 43 #include <sys/lock.h> 44 #include <sys/mutex.h> 45 #include <sys/bus.h> 46 #include <sys/eventhandler.h> 47 #include <sys/fcntl.h> 48 #include <sys/file.h> 49 #include <sys/filio.h> 50 #include <sys/rwlock.h> 51 #include <sys/mman.h> 52 #include <sys/stack.h> 53 #include <sys/stdarg.h> 54 #include <sys/sysent.h> 55 #include <sys/time.h> 56 #include <sys/user.h> 57 58 #include <vm/vm.h> 59 #include <vm/pmap.h> 60 #include <vm/vm_object.h> 61 #include <vm/vm_page.h> 62 #include <vm/vm_pager.h> 63 #include <vm/vm_radix.h> 64 65 #if defined(__i386__) || defined(__amd64__) 66 #include <machine/cputypes.h> 67 #include <machine/md_var.h> 68 #endif 69 70 #include <linux/kobject.h> 71 #include <linux/cpu.h> 72 #include <linux/device.h> 73 #include <linux/slab.h> 74 #include <linux/module.h> 75 #include <linux/moduleparam.h> 76 #include <linux/cdev.h> 77 #include <linux/file.h> 78 #include <linux/fs.h> 79 #include <linux/sysfs.h> 80 #include <linux/mm.h> 81 #include <linux/io.h> 82 #include <linux/vmalloc.h> 83 #include <linux/netdevice.h> 84 #include <linux/timer.h> 85 #include <linux/interrupt.h> 86 #include <linux/uaccess.h> 87 #include <linux/utsname.h> 88 #include <linux/list.h> 89 #include <linux/kthread.h> 90 #include <linux/kernel.h> 91 #include <linux/compat.h> 92 #include <linux/io-mapping.h> 93 #include <linux/poll.h> 94 #include <linux/smp.h> 95 #include <linux/wait_bit.h> 96 #include <linux/rcupdate.h> 97 #include <linux/interval_tree.h> 98 #include <linux/interval_tree_generic.h> 99 #include <linux/printk.h> 100 #include <linux/seq_file.h> 101 #include <linux/uuid.h> 102 103 #if defined(__i386__) || defined(__amd64__) 104 
#include <asm/smp.h> 105 #include <asm/processor.h> 106 #endif 107 108 #include <xen/xen.h> 109 #ifdef XENHVM 110 #undef xen_pv_domain 111 #undef xen_initial_domain 112 /* xen/xen-os.h redefines __must_check */ 113 #undef __must_check 114 #include <xen/xen-os.h> 115 #endif 116 117 SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 118 "LinuxKPI parameters"); 119 120 int linuxkpi_debug; 121 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN, 122 &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable."); 123 124 int linuxkpi_rcu_debug; 125 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, rcu_debug, CTLFLAG_RWTUN, 126 &linuxkpi_rcu_debug, 0, "Set to enable RCU warning. Clear to disable."); 127 128 int linuxkpi_warn_dump_stack = 0; 129 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN, 130 &linuxkpi_warn_dump_stack, 0, 131 "Set to enable stack traces from WARN_ON(). Clear to disable."); 132 133 static struct timeval lkpi_net_lastlog; 134 static int lkpi_net_curpps; 135 static int lkpi_net_maxpps = 99; 136 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN, 137 &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second."); 138 139 MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat"); 140 141 #include <linux/rbtree.h> 142 /* Undo Linux compat changes. 
*/ 143 #undef RB_ROOT 144 #undef file 145 #undef cdev 146 #define RB_ROOT(head) (head)->rbh_root 147 148 static void linux_destroy_dev(struct linux_cdev *); 149 static void linux_cdev_deref(struct linux_cdev *ldev); 150 static struct vm_area_struct *linux_cdev_handle_find(void *handle); 151 152 cpumask_t cpu_online_mask; 153 static cpumask_t **static_single_cpu_mask; 154 static cpumask_t *static_single_cpu_mask_lcs; 155 struct kobject linux_class_root; 156 struct device linux_root_device; 157 struct class linux_class_misc; 158 struct list_head pci_drivers; 159 struct list_head pci_devices; 160 spinlock_t pci_lock; 161 struct uts_namespace init_uts_ns; 162 163 unsigned long linux_timer_hz_mask; 164 165 wait_queue_head_t linux_bit_waitq; 166 wait_queue_head_t linux_var_waitq; 167 168 const guid_t guid_null; 169 170 enum system_states system_state = SYSTEM_RUNNING; 171 172 int 173 panic_cmp(struct rb_node *one, struct rb_node *two) 174 { 175 panic("no cmp"); 176 } 177 178 RB_GENERATE(linux_root, rb_node, __entry, panic_cmp); 179 180 #define START(node) ((node)->start) 181 #define LAST(node) ((node)->last) 182 183 INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START, 184 LAST,, lkpi_interval_tree) 185 186 static void 187 linux_device_release(struct device *dev) 188 { 189 pr_debug("linux_device_release: %s\n", dev_name(dev)); 190 kfree(dev); 191 } 192 193 static ssize_t 194 linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf) 195 { 196 struct class_attribute *dattr; 197 ssize_t error; 198 199 dattr = container_of(attr, struct class_attribute, attr); 200 error = -EIO; 201 if (dattr->show) 202 error = dattr->show(container_of(kobj, struct class, kobj), 203 dattr, buf); 204 return (error); 205 } 206 207 static ssize_t 208 linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf, 209 size_t count) 210 { 211 struct class_attribute *dattr; 212 ssize_t error; 213 214 dattr = container_of(attr, struct 
class_attribute, attr); 215 error = -EIO; 216 if (dattr->store) 217 error = dattr->store(container_of(kobj, struct class, kobj), 218 dattr, buf, count); 219 return (error); 220 } 221 222 static void 223 linux_class_release(struct kobject *kobj) 224 { 225 struct class *class; 226 227 class = container_of(kobj, struct class, kobj); 228 if (class->class_release) 229 class->class_release(class); 230 } 231 232 static const struct sysfs_ops linux_class_sysfs = { 233 .show = linux_class_show, 234 .store = linux_class_store, 235 }; 236 237 const struct kobj_type linux_class_ktype = { 238 .release = linux_class_release, 239 .sysfs_ops = &linux_class_sysfs 240 }; 241 242 static void 243 linux_dev_release(struct kobject *kobj) 244 { 245 struct device *dev; 246 247 dev = container_of(kobj, struct device, kobj); 248 /* This is the precedence defined by linux. */ 249 if (dev->release) 250 dev->release(dev); 251 else if (dev->class && dev->class->dev_release) 252 dev->class->dev_release(dev); 253 } 254 255 static ssize_t 256 linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf) 257 { 258 struct device_attribute *dattr; 259 ssize_t error; 260 261 dattr = container_of(attr, struct device_attribute, attr); 262 error = -EIO; 263 if (dattr->show) 264 error = dattr->show(container_of(kobj, struct device, kobj), 265 dattr, buf); 266 return (error); 267 } 268 269 static ssize_t 270 linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf, 271 size_t count) 272 { 273 struct device_attribute *dattr; 274 ssize_t error; 275 276 dattr = container_of(attr, struct device_attribute, attr); 277 error = -EIO; 278 if (dattr->store) 279 error = dattr->store(container_of(kobj, struct device, kobj), 280 dattr, buf, count); 281 return (error); 282 } 283 284 static const struct sysfs_ops linux_dev_sysfs = { 285 .show = linux_dev_show, 286 .store = linux_dev_store, 287 }; 288 289 const struct kobj_type linux_dev_ktype = { 290 .release = linux_dev_release, 291 
.sysfs_ops = &linux_dev_sysfs 292 }; 293 294 struct device * 295 device_create(struct class *class, struct device *parent, dev_t devt, 296 void *drvdata, const char *fmt, ...) 297 { 298 struct device *dev; 299 va_list args; 300 301 dev = kzalloc(sizeof(*dev), M_WAITOK); 302 dev->parent = parent; 303 dev->class = class; 304 dev->devt = devt; 305 dev->driver_data = drvdata; 306 dev->release = linux_device_release; 307 va_start(args, fmt); 308 kobject_set_name_vargs(&dev->kobj, fmt, args); 309 va_end(args); 310 device_register(dev); 311 312 return (dev); 313 } 314 315 struct device * 316 device_create_groups_vargs(struct class *class, struct device *parent, 317 dev_t devt, void *drvdata, const struct attribute_group **groups, 318 const char *fmt, va_list args) 319 { 320 struct device *dev = NULL; 321 int retval = -ENODEV; 322 323 if (class == NULL || IS_ERR(class)) 324 goto error; 325 326 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 327 if (!dev) { 328 retval = -ENOMEM; 329 goto error; 330 } 331 332 dev->devt = devt; 333 dev->class = class; 334 dev->parent = parent; 335 dev->groups = groups; 336 dev->release = device_create_release; 337 /* device_initialize() needs the class and parent to be set */ 338 device_initialize(dev); 339 dev_set_drvdata(dev, drvdata); 340 341 retval = kobject_set_name_vargs(&dev->kobj, fmt, args); 342 if (retval) 343 goto error; 344 345 retval = device_add(dev); 346 if (retval) 347 goto error; 348 349 return dev; 350 351 error: 352 put_device(dev); 353 return ERR_PTR(retval); 354 } 355 356 struct class * 357 lkpi_class_create(const char *name) 358 { 359 struct class *class; 360 int error; 361 362 class = kzalloc(sizeof(*class), M_WAITOK); 363 class->name = name; 364 class->class_release = linux_class_kfree; 365 error = class_register(class); 366 if (error) { 367 kfree(class); 368 return (NULL); 369 } 370 371 return (class); 372 } 373 374 static void 375 linux_kq_lock(void *arg) 376 { 377 spinlock_t *s = arg; 378 379 spin_lock(s); 380 } 381 static 
void 382 linux_kq_unlock(void *arg) 383 { 384 spinlock_t *s = arg; 385 386 spin_unlock(s); 387 } 388 389 static void 390 linux_kq_assert_lock(void *arg, int what) 391 { 392 #ifdef INVARIANTS 393 spinlock_t *s = arg; 394 395 if (what == LA_LOCKED) 396 mtx_assert(s, MA_OWNED); 397 else 398 mtx_assert(s, MA_NOTOWNED); 399 #endif 400 } 401 402 static void 403 linux_file_kqfilter_poll(struct linux_file *, int); 404 405 struct linux_file * 406 linux_file_alloc(void) 407 { 408 struct linux_file *filp; 409 410 filp = kzalloc(sizeof(*filp), GFP_KERNEL); 411 412 /* set initial refcount */ 413 filp->f_count = 1; 414 415 /* setup fields needed by kqueue support */ 416 spin_lock_init(&filp->f_kqlock); 417 knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock, 418 linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock); 419 420 return (filp); 421 } 422 423 void 424 linux_file_free(struct linux_file *filp) 425 { 426 if (filp->_file == NULL) { 427 if (filp->f_op != NULL && filp->f_op->release != NULL) 428 filp->f_op->release(filp->f_vnode, filp); 429 if (filp->f_shmem != NULL) 430 vm_object_deallocate(filp->f_shmem); 431 kfree_rcu(filp, rcu); 432 } else { 433 /* 434 * The close method of the character device or file 435 * will free the linux_file structure: 436 */ 437 _fdrop(filp->_file, curthread); 438 } 439 } 440 441 struct linux_cdev * 442 cdev_alloc(void) 443 { 444 struct linux_cdev *cdev; 445 446 cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK); 447 kobject_init(&cdev->kobj, &linux_cdev_ktype); 448 cdev->refs = 1; 449 return (cdev); 450 } 451 452 static int 453 linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, 454 vm_page_t *mres) 455 { 456 struct vm_area_struct *vmap; 457 458 vmap = linux_cdev_handle_find(vm_obj->handle); 459 460 MPASS(vmap != NULL); 461 MPASS(vmap->vm_private_data == vm_obj->handle); 462 463 if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) { 464 vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset; 465 vm_page_t page; 466 
467 if (((*mres)->flags & PG_FICTITIOUS) != 0) { 468 /* 469 * If the passed in result page is a fake 470 * page, update it with the new physical 471 * address. 472 */ 473 page = *mres; 474 vm_page_updatefake(page, paddr, vm_obj->memattr); 475 } else { 476 /* 477 * Replace the passed in "mres" page with our 478 * own fake page and free up the all of the 479 * original pages. 480 */ 481 VM_OBJECT_WUNLOCK(vm_obj); 482 page = vm_page_getfake(paddr, vm_obj->memattr); 483 VM_OBJECT_WLOCK(vm_obj); 484 485 vm_page_replace(page, vm_obj, (*mres)->pindex, *mres); 486 *mres = page; 487 } 488 vm_page_valid(page); 489 return (VM_PAGER_OK); 490 } 491 return (VM_PAGER_FAIL); 492 } 493 494 static int 495 linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, 496 vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) 497 { 498 struct vm_area_struct *vmap; 499 int err; 500 501 /* get VM area structure */ 502 vmap = linux_cdev_handle_find(vm_obj->handle); 503 MPASS(vmap != NULL); 504 MPASS(vmap->vm_private_data == vm_obj->handle); 505 506 VM_OBJECT_WUNLOCK(vm_obj); 507 508 linux_set_current(curthread); 509 510 down_write(&vmap->vm_mm->mmap_sem); 511 if (unlikely(vmap->vm_ops == NULL)) { 512 err = VM_FAULT_SIGBUS; 513 } else { 514 struct vm_fault vmf; 515 516 /* fill out VM fault structure */ 517 vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx); 518 vmf.flags = (fault_type & VM_PROT_WRITE) ? 
FAULT_FLAG_WRITE : 0; 519 vmf.pgoff = 0; 520 vmf.page = NULL; 521 vmf.vma = vmap; 522 523 vmap->vm_pfn_count = 0; 524 vmap->vm_pfn_pcount = &vmap->vm_pfn_count; 525 vmap->vm_obj = vm_obj; 526 527 err = vmap->vm_ops->fault(&vmf); 528 529 while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) { 530 kern_yield(PRI_USER); 531 err = vmap->vm_ops->fault(&vmf); 532 } 533 } 534 535 /* translate return code */ 536 switch (err) { 537 case VM_FAULT_OOM: 538 err = VM_PAGER_AGAIN; 539 break; 540 case VM_FAULT_SIGBUS: 541 err = VM_PAGER_BAD; 542 break; 543 case VM_FAULT_NOPAGE: 544 /* 545 * By contract the fault handler will return having 546 * busied all the pages itself. If pidx is already 547 * found in the object, it will simply xbusy the first 548 * page and return with vm_pfn_count set to 1. 549 */ 550 *first = vmap->vm_pfn_first; 551 *last = *first + vmap->vm_pfn_count - 1; 552 err = VM_PAGER_OK; 553 break; 554 default: 555 err = VM_PAGER_ERROR; 556 break; 557 } 558 up_write(&vmap->vm_mm->mmap_sem); 559 VM_OBJECT_WLOCK(vm_obj); 560 return (err); 561 } 562 563 static struct rwlock linux_vma_lock; 564 static TAILQ_HEAD(, vm_area_struct) linux_vma_head = 565 TAILQ_HEAD_INITIALIZER(linux_vma_head); 566 567 static void 568 linux_cdev_handle_free(struct vm_area_struct *vmap) 569 { 570 /* Drop reference on vm_file */ 571 if (vmap->vm_file != NULL) 572 fput(vmap->vm_file); 573 574 /* Drop reference on mm_struct */ 575 mmput(vmap->vm_mm); 576 577 kfree(vmap); 578 } 579 580 static void 581 linux_cdev_handle_remove(struct vm_area_struct *vmap) 582 { 583 rw_wlock(&linux_vma_lock); 584 TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry); 585 rw_wunlock(&linux_vma_lock); 586 } 587 588 static struct vm_area_struct * 589 linux_cdev_handle_find(void *handle) 590 { 591 struct vm_area_struct *vmap; 592 593 rw_rlock(&linux_vma_lock); 594 TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) { 595 if (vmap->vm_private_data == handle) 596 break; 597 } 598 rw_runlock(&linux_vma_lock); 599 return (vmap); 
600 } 601 602 static int 603 linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, 604 vm_ooffset_t foff, struct ucred *cred, u_short *color) 605 { 606 607 MPASS(linux_cdev_handle_find(handle) != NULL); 608 *color = 0; 609 return (0); 610 } 611 612 static void 613 linux_cdev_pager_dtor(void *handle) 614 { 615 const struct vm_operations_struct *vm_ops; 616 struct vm_area_struct *vmap; 617 618 vmap = linux_cdev_handle_find(handle); 619 MPASS(vmap != NULL); 620 621 /* 622 * Remove handle before calling close operation to prevent 623 * other threads from reusing the handle pointer. 624 */ 625 linux_cdev_handle_remove(vmap); 626 627 down_write(&vmap->vm_mm->mmap_sem); 628 vm_ops = vmap->vm_ops; 629 if (likely(vm_ops != NULL)) 630 vm_ops->close(vmap); 631 up_write(&vmap->vm_mm->mmap_sem); 632 633 linux_cdev_handle_free(vmap); 634 } 635 636 static struct cdev_pager_ops linux_cdev_pager_ops[2] = { 637 { 638 /* OBJT_MGTDEVICE */ 639 .cdev_pg_populate = linux_cdev_pager_populate, 640 .cdev_pg_ctor = linux_cdev_pager_ctor, 641 .cdev_pg_dtor = linux_cdev_pager_dtor 642 }, 643 { 644 /* OBJT_DEVICE */ 645 .cdev_pg_fault = linux_cdev_pager_fault, 646 .cdev_pg_ctor = linux_cdev_pager_ctor, 647 .cdev_pg_dtor = linux_cdev_pager_dtor 648 }, 649 }; 650 651 int 652 zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 653 unsigned long size) 654 { 655 struct pctrie_iter pages; 656 vm_object_t obj; 657 vm_page_t m; 658 659 obj = vma->vm_obj; 660 if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0) 661 return (-ENOTSUP); 662 VM_OBJECT_RLOCK(obj); 663 vm_page_iter_limit_init(&pages, obj, OFF_TO_IDX(address + size)); 664 VM_RADIX_FOREACH_FROM(m, &pages, OFF_TO_IDX(address)) 665 pmap_remove_all(m); 666 VM_OBJECT_RUNLOCK(obj); 667 return (0); 668 } 669 670 void 671 vma_set_file(struct vm_area_struct *vma, struct linux_file *file) 672 { 673 struct linux_file *tmp; 674 675 /* Changing an anonymous vma with this is illegal */ 676 get_file(file); 677 tmp = 
vma->vm_file; 678 vma->vm_file = file; 679 fput(tmp); 680 } 681 682 static struct file_operations dummy_ldev_ops = { 683 /* XXXKIB */ 684 }; 685 686 static struct linux_cdev dummy_ldev = { 687 .ops = &dummy_ldev_ops, 688 }; 689 690 #define LDEV_SI_DTR 0x0001 691 #define LDEV_SI_REF 0x0002 692 693 static void 694 linux_get_fop(struct linux_file *filp, const struct file_operations **fop, 695 struct linux_cdev **dev) 696 { 697 struct linux_cdev *ldev; 698 u_int siref; 699 700 ldev = filp->f_cdev; 701 *fop = filp->f_op; 702 if (ldev != NULL) { 703 if (ldev->kobj.ktype == &linux_cdev_static_ktype) { 704 refcount_acquire(&ldev->refs); 705 } else { 706 for (siref = ldev->siref;;) { 707 if ((siref & LDEV_SI_DTR) != 0) { 708 ldev = &dummy_ldev; 709 *fop = ldev->ops; 710 siref = ldev->siref; 711 MPASS((ldev->siref & LDEV_SI_DTR) == 0); 712 } else if (atomic_fcmpset_int(&ldev->siref, 713 &siref, siref + LDEV_SI_REF)) { 714 break; 715 } 716 } 717 } 718 } 719 *dev = ldev; 720 } 721 722 static void 723 linux_drop_fop(struct linux_cdev *ldev) 724 { 725 726 if (ldev == NULL) 727 return; 728 if (ldev->kobj.ktype == &linux_cdev_static_ktype) { 729 linux_cdev_deref(ldev); 730 } else { 731 MPASS(ldev->kobj.ktype == &linux_cdev_ktype); 732 MPASS((ldev->siref & ~LDEV_SI_DTR) != 0); 733 atomic_subtract_int(&ldev->siref, LDEV_SI_REF); 734 } 735 } 736 737 #define OPW(fp,td,code) ({ \ 738 struct file *__fpop; \ 739 __typeof(code) __retval; \ 740 \ 741 __fpop = (td)->td_fpop; \ 742 (td)->td_fpop = (fp); \ 743 __retval = (code); \ 744 (td)->td_fpop = __fpop; \ 745 __retval; \ 746 }) 747 748 static int 749 linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td, 750 struct file *file) 751 { 752 struct linux_cdev *ldev; 753 struct linux_file *filp; 754 const struct file_operations *fop; 755 int error; 756 757 ldev = dev->si_drv1; 758 759 filp = linux_file_alloc(); 760 filp->f_dentry = &filp->f_dentry_store; 761 filp->f_op = ldev->ops; 762 filp->f_mode = file->f_flag; 763 filp->f_flags 
= file->f_flag; 764 filp->f_vnode = file->f_vnode; 765 filp->_file = file; 766 refcount_acquire(&ldev->refs); 767 filp->f_cdev = ldev; 768 769 linux_set_current(td); 770 linux_get_fop(filp, &fop, &ldev); 771 772 if (fop->open != NULL) { 773 error = -fop->open(file->f_vnode, filp); 774 if (error != 0) { 775 linux_drop_fop(ldev); 776 linux_cdev_deref(filp->f_cdev); 777 kfree(filp); 778 return (error); 779 } 780 } 781 782 /* hold on to the vnode - used for fstat() */ 783 vref(filp->f_vnode); 784 785 /* release the file from devfs */ 786 finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops); 787 linux_drop_fop(ldev); 788 return (ENXIO); 789 } 790 791 #define LINUX_IOCTL_MIN_PTR 0x10000UL 792 #define LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX) 793 794 static inline int 795 linux_remap_address(void **uaddr, size_t len) 796 { 797 uintptr_t uaddr_val = (uintptr_t)(*uaddr); 798 799 if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR && 800 uaddr_val < LINUX_IOCTL_MAX_PTR)) { 801 struct task_struct *pts = current; 802 if (pts == NULL) { 803 *uaddr = NULL; 804 return (1); 805 } 806 807 /* compute data offset */ 808 uaddr_val -= LINUX_IOCTL_MIN_PTR; 809 810 /* check that length is within bounds */ 811 if ((len > IOCPARM_MAX) || 812 (uaddr_val + len) > pts->bsd_ioctl_len) { 813 *uaddr = NULL; 814 return (1); 815 } 816 817 /* re-add kernel buffer address */ 818 uaddr_val += (uintptr_t)pts->bsd_ioctl_data; 819 820 /* update address location */ 821 *uaddr = (void *)uaddr_val; 822 return (1); 823 } 824 return (0); 825 } 826 827 int 828 linux_copyin(const void *uaddr, void *kaddr, size_t len) 829 { 830 if (linux_remap_address(__DECONST(void **, &uaddr), len)) { 831 if (uaddr == NULL) 832 return (-EFAULT); 833 memcpy(kaddr, uaddr, len); 834 return (0); 835 } 836 return (-copyin(uaddr, kaddr, len)); 837 } 838 839 int 840 linux_copyout(const void *kaddr, void *uaddr, size_t len) 841 { 842 if (linux_remap_address(&uaddr, len)) { 843 if (uaddr == NULL) 844 return (-EFAULT); 845 
memcpy(uaddr, kaddr, len); 846 return (0); 847 } 848 return (-copyout(kaddr, uaddr, len)); 849 } 850 851 size_t 852 linux_clear_user(void *_uaddr, size_t _len) 853 { 854 uint8_t *uaddr = _uaddr; 855 size_t len = _len; 856 857 /* make sure uaddr is aligned before going into the fast loop */ 858 while (((uintptr_t)uaddr & 7) != 0 && len > 7) { 859 if (subyte(uaddr, 0)) 860 return (_len); 861 uaddr++; 862 len--; 863 } 864 865 /* zero 8 bytes at a time */ 866 while (len > 7) { 867 #ifdef __LP64__ 868 if (suword64(uaddr, 0)) 869 return (_len); 870 #else 871 if (suword32(uaddr, 0)) 872 return (_len); 873 if (suword32(uaddr + 4, 0)) 874 return (_len); 875 #endif 876 uaddr += 8; 877 len -= 8; 878 } 879 880 /* zero fill end, if any */ 881 while (len > 0) { 882 if (subyte(uaddr, 0)) 883 return (_len); 884 uaddr++; 885 len--; 886 } 887 return (0); 888 } 889 890 int 891 linux_access_ok(const void *uaddr, size_t len) 892 { 893 uintptr_t saddr; 894 uintptr_t eaddr; 895 896 /* get start and end address */ 897 saddr = (uintptr_t)uaddr; 898 eaddr = (uintptr_t)uaddr + len; 899 900 /* verify addresses are valid for userspace */ 901 return ((saddr == eaddr) || 902 (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS)); 903 } 904 905 /* 906 * This function should return either EINTR or ERESTART depending on 907 * the signal type sent to this thread: 908 */ 909 static int 910 linux_get_error(struct task_struct *task, int error) 911 { 912 /* check for signal type interrupt code */ 913 if (error == EINTR || error == ERESTARTSYS || error == ERESTART) { 914 error = -linux_schedule_get_interrupt_value(task); 915 if (error == 0) 916 error = EINTR; 917 } 918 return (error); 919 } 920 921 static int 922 linux_file_ioctl_sub(struct file *fp, struct linux_file *filp, 923 const struct file_operations *fop, u_long cmd, caddr_t data, 924 struct thread *td) 925 { 926 struct task_struct *task = current; 927 unsigned size; 928 int error; 929 930 size = IOCPARM_LEN(cmd); 931 /* refer to logic in sys_ioctl() */ 
932 if (size > 0) { 933 /* 934 * Setup hint for linux_copyin() and linux_copyout(). 935 * 936 * Background: Linux code expects a user-space address 937 * while FreeBSD supplies a kernel-space address. 938 */ 939 task->bsd_ioctl_data = data; 940 task->bsd_ioctl_len = size; 941 data = (void *)LINUX_IOCTL_MIN_PTR; 942 } else { 943 /* fetch user-space pointer */ 944 data = *(void **)data; 945 } 946 #ifdef COMPAT_FREEBSD32 947 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 948 /* try the compat IOCTL handler first */ 949 if (fop->compat_ioctl != NULL) { 950 error = -OPW(fp, td, fop->compat_ioctl(filp, 951 cmd, (u_long)data)); 952 } else { 953 error = ENOTTY; 954 } 955 956 /* fallback to the regular IOCTL handler, if any */ 957 if (error == ENOTTY && fop->unlocked_ioctl != NULL) { 958 error = -OPW(fp, td, fop->unlocked_ioctl(filp, 959 cmd, (u_long)data)); 960 } 961 } else 962 #endif 963 { 964 if (fop->unlocked_ioctl != NULL) { 965 error = -OPW(fp, td, fop->unlocked_ioctl(filp, 966 cmd, (u_long)data)); 967 } else { 968 error = ENOTTY; 969 } 970 } 971 if (size > 0) { 972 task->bsd_ioctl_data = NULL; 973 task->bsd_ioctl_len = 0; 974 } 975 976 if (error == EWOULDBLOCK) { 977 /* update kqfilter status, if any */ 978 linux_file_kqfilter_poll(filp, 979 LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE); 980 } else { 981 error = linux_get_error(task, error); 982 } 983 return (error); 984 } 985 986 #define LINUX_POLL_TABLE_NORMAL ((poll_table *)1) 987 988 /* 989 * This function atomically updates the poll wakeup state and returns 990 * the previous state at the time of update. 
991 */ 992 static uint8_t 993 linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate) 994 { 995 int c, old; 996 997 c = v->counter; 998 999 while ((old = atomic_cmpxchg(v, c, pstate[c])) != c) 1000 c = old; 1001 1002 return (c); 1003 } 1004 1005 static int 1006 linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key) 1007 { 1008 static const uint8_t state[LINUX_FWQ_STATE_MAX] = { 1009 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */ 1010 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ 1011 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY, 1012 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */ 1013 }; 1014 struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq); 1015 1016 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { 1017 case LINUX_FWQ_STATE_QUEUED: 1018 linux_poll_wakeup(filp); 1019 return (1); 1020 default: 1021 return (0); 1022 } 1023 } 1024 1025 void 1026 linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p) 1027 { 1028 static const uint8_t state[LINUX_FWQ_STATE_MAX] = { 1029 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY, 1030 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */ 1031 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */ 1032 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED, 1033 }; 1034 1035 /* check if we are called inside the select system call */ 1036 if (p == LINUX_POLL_TABLE_NORMAL) 1037 selrecord(curthread, &filp->f_selinfo); 1038 1039 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) { 1040 case LINUX_FWQ_STATE_INIT: 1041 /* NOTE: file handles can only belong to one wait-queue */ 1042 filp->f_wait_queue.wqh = wqh; 1043 filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback; 1044 add_wait_queue(wqh, &filp->f_wait_queue.wq); 1045 atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED); 1046 break; 1047 default: 1048 break; 1049 } 1050 } 1051 1052 
/*
 * Undo linux_poll_wait() for a file: drain the selinfo, force the wait
 * queue state back to LINUX_FWQ_STATE_INIT and, if the previous state
 * was anything other than INIT, remove the entry from its wait queue.
 */
static void
linux_poll_wait_dequeue(struct linux_file *filp)
{
	/* every state collapses to INIT */
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
	};

	seldrain(&filp->f_selinfo);

	/* the switch acts on the state that was current before the reset */
	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_NOT_READY:
	case LINUX_FWQ_STATE_QUEUED:
	case LINUX_FWQ_STATE_READY:
		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
		break;
	default:
		break;
	}
}

/*
 * Wake up select/poll waiters of the given file and flag both the read
 * and the write kqueue filters as needing attention.  A NULL file is
 * silently ignored.
 */
void
linux_poll_wakeup(struct linux_file *filp)
{
	/* this function should be NULL-safe */
	if (filp == NULL)
		return;

	selwakeup(&filp->f_selinfo);

	/* f_kqflags and the knote list are updated under f_kqlock */
	spin_lock(&filp->f_kqlock);
	filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE;

	/* make sure the "knote" gets woken up */
	KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
	spin_unlock(&filp->f_kqlock);
}

/*
 * Single attempt at taking a reference on the file stored in "*f".
 *
 * Returns NULL when "*f" is NULL, the referenced file when the pointer
 * did not change while the reference was being taken, and
 * ERR_PTR(-EAGAIN) when the reference count was already zero or the
 * pointer changed in the meantime.
 */
static struct linux_file *
__get_file_rcu(struct linux_file **f)
{
	struct linux_file *file1, *file2;

	file1 = READ_ONCE(*f);
	if (file1 == NULL)
		return (NULL);

	/* the count lives in the backing "struct file", when there is one */
	if (!refcount_acquire_if_not_zero(
	    file1->_file == NULL ? &file1->f_count : &file1->_file->f_count))
		return (ERR_PTR(-EAGAIN));

	/* re-read the slot to detect a concurrent replacement */
	file2 = READ_ONCE(*f);
	if (file2 == file1)
		return (file2);

	/* the slot changed; drop the reference just taken */
	fput(file1);
	return (ERR_PTR(-EAGAIN));
}

/*
 * Take a reference on the file stored in "*f", retrying for as long as
 * __get_file_rcu() reports an error pointer.  Returns NULL when the
 * slot is empty.
 */
struct linux_file *
linux_get_file_rcu(struct linux_file **f)
{
	struct linux_file *file1;

	for (;;) {
		file1 = __get_file_rcu(f);
		if (file1 == NULL)
			return (NULL);

		if (IS_ERR(file1))
			continue;

		return (file1);
	}
}

/*
 * One-shot variant of linux_get_file_rcu(): makes a single attempt
 * inside an RCU read section and returns NULL instead of retrying when
 * that attempt fails.
 */
struct linux_file *
get_file_active(struct linux_file **f)
{
	struct linux_file *file1;

	rcu_read_lock();
	file1 = __get_file_rcu(f);
	rcu_read_unlock();
	if (IS_ERR(file1))
		file1 = NULL;

	return (file1);
}

/*
 * kqueue detach handler: remove the knote from the file's knote list
 * while holding f_kqlock.
 */
static void
linux_file_kqfilter_detach(struct knote *kn)
{
	struct linux_file *filp = kn->kn_hook;

	spin_lock(&filp->f_kqlock);
	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
	spin_unlock(&filp->f_kqlock);
}

/*
 * kqueue event handler for EVFILT_READ.  Must be called with f_kqlock
 * held; returns 1 when LINUX_KQ_FLAG_NEED_READ is set and 0 otherwise.
 */
static int
linux_file_kqfilter_read_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
}

/*
 * kqueue event handler for EVFILT_WRITE.  Must be called with f_kqlock
 * held; returns 1 when LINUX_KQ_FLAG_NEED_WRITE is set and 0 otherwise.
 */
static int
linux_file_kqfilter_write_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
}

/* filter operations attached to EVFILT_READ knotes */
static const struct filterops linux_dev_kqfiltops_read = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_read_event,
	.f_copy = knote_triv_copy,
};

/* filter operations attached to EVFILT_WRITE knotes */
static const struct filterops linux_dev_kqfiltops_write = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_write_event,
	.f_copy = knote_triv_copy,
};

/*
 * Refresh the kqueue "need read/write" flags from the driver's poll
 * method.  Does nothing unless at least one of the bits in "kqflags"
 * is set in f_kqflags.
 */
static void
linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
{
	struct thread *td;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int temp;

	if ((filp->f_kqflags & kqflags) == 0)
		return;

	td = curthread;

	linux_get_fop(filp, &fop, &ldev);
	/* get the latest polling state */
	temp = OPW(filp->_file, td, fop->poll(filp, NULL));
	linux_drop_fop(ldev);

	spin_lock(&filp->f_kqlock);
	/* clear kqflags */
	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE);
	/* update kqflags */
	if ((temp & (POLLIN | POLLOUT)) != 0) {
		if ((temp & POLLIN) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
		if ((temp & POLLOUT) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;

		/* make sure the "knote" gets woken up */
		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
	}
	spin_unlock(&filp->f_kqlock);
}

/*
 * fo_kqfilter handler.  Attaches the knote for EVFILT_READ or
 * EVFILT_WRITE and then polls the driver once to establish the initial
 * event state.
 *
 * Returns EINVAL when the file has no poll method or the filter type is
 * not supported, and zero on success.
 */
static int
linux_file_kqfilter(struct file *file, struct knote *kn)
{
	struct linux_file *filp;
	struct thread *td;
	int error;

	td = curthread;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	if (filp->f_op->poll == NULL)
		return (EINVAL);

	spin_lock(&filp->f_kqlock);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
		kn->kn_fop = &linux_dev_kqfiltops_read;
		kn->kn_hook = filp;
		knlist_add(&filp->f_selinfo.si_note, kn, 1);
		error = 0;
		break;
	case EVFILT_WRITE:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
		kn->kn_fop = &linux_dev_kqfiltops_write;
		kn->kn_hook = filp;
		knlist_add(&filp->f_selinfo.si_note, kn, 1);
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}
	spin_unlock(&filp->f_kqlock);

	if (error == 0) {
		linux_set_current(td);

		/* update kqfilter status, if any */
		linux_file_kqfilter_poll(filp,
		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
	}
	return (error);
}

/*
 * Ask the driver's mmap method to describe a mapping of "size" bytes at
 * "*offset" and turn the result into a VM object returned in "*object".
 *
 * When the driver installs vm_ops, a device pager object keyed on
 * vm_private_data is allocated; otherwise an OBJT_SG object covering
 * the physical range given by vm_pfn and vm_len is created.  On success
 * "*offset" is reset to zero.
 *
 * Returns zero on success, EOPNOTSUPP when there is no mmap method,
 * EINVAL or ESTALE for the failure cases below, or the error reported
 * through linux_get_error().
 */
static int
linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
    vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
    int nprot, bool is_shared, struct thread *td)
{
	struct task_struct *task;
	struct vm_area_struct *vmap;
	struct mm_struct *mm;
	struct linux_file *filp;
	vm_memattr_t attr;
	int error;

	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;

	if (fop->mmap == NULL)
		return (EOPNOTSUPP);

	linux_set_current(td);

	/*
	 * The same VM object might be shared by multiple processes
	 * and the mm_struct is usually freed when a process exits.
	 *
	 * The atomic reference below makes sure the mm_struct is
	 * available as long as the vmap is in the linux_vma_head.
	 */
	task = current;
	mm = task->mm;
	if (atomic_inc_not_zero(&mm->mm_users) == 0)
		return (EINVAL);

	/* NOTE(review): the allocation result is used without a NULL check */
	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
	vmap->vm_start = 0;
	vmap->vm_end = size;
	vmap->vm_pgoff = *offset / PAGE_SIZE;
	vmap->vm_pfn = 0;
	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
	if (is_shared)
		vmap->vm_flags |= VM_SHARED;
	vmap->vm_ops = NULL;
	vmap->vm_file = get_file(filp);
	vmap->vm_mm = mm;

	/* the driver's mmap method runs with mmap_sem write-locked */
	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
		error = linux_get_error(task, EINTR);
	} else {
		error = -OPW(fp, td, fop->mmap(filp, vmap));
		error = linux_get_error(task, error);
		up_write(&vmap->vm_mm->mmap_sem);
	}

	if (error != 0) {
		linux_cdev_handle_free(vmap);
		return (error);
	}

	/* sample the cache mode now; vmap may be freed further down */
	attr = pgprot2cachemode(vmap->vm_page_prot);

	if (vmap->vm_ops != NULL) {
		struct vm_area_struct *ptr;
		void *vm_private_data;
		bool vm_no_fault;

		if (vmap->vm_ops->open == NULL ||
		    vmap->vm_ops->close == NULL ||
		    vmap->vm_private_data == NULL) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			return (EINVAL);
		}

		vm_private_data = vmap->vm_private_data;

		/* look for an already registered area with the same handle */
		rw_wlock(&linux_vma_lock);
		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
			if (ptr->vm_private_data == vm_private_data)
				break;
		}
		/* check if there is an existing VM area struct */
		if (ptr != NULL) {
			/* check if the VM area structure is invalid */
			if (ptr->vm_ops == NULL ||
			    ptr->vm_ops->open == NULL ||
			    ptr->vm_ops->close == NULL) {
				error = ESTALE;
				vm_no_fault = 1;
			} else {
				error = EEXIST;
				vm_no_fault = (ptr->vm_ops->fault == NULL);
			}
		} else {
			/* insert VM area structure into list */
			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
			error = 0;
			vm_no_fault = (vmap->vm_ops->fault == NULL);
		}
		rw_wunlock(&linux_vma_lock);

		/*
		 * EEXIST is not fatal: the new vmap is freed and the
		 * pager lookup below proceeds with the shared handle.
		 */
		if (error != 0) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			/* check for stale VM area struct */
			if (error != EEXIST)
				return (error);
		}

		/* check if there is no fault handler */
		if (vm_no_fault) {
			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
			    &linux_cdev_pager_ops[1], size, nprot, *offset,
			    td->td_ucred);
		} else {
			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
			    &linux_cdev_pager_ops[0], size, nprot, *offset,
			    td->td_ucred);
		}

		/* check if allocating the VM object failed */
		if (*object == NULL) {
			/* vmap is only still ours when it was inserted above */
			if (error == 0) {
				/* remove VM area struct from list */
				linux_cdev_handle_remove(vmap);
				/* free allocated VM area struct */
				linux_cdev_handle_free(vmap);
			}
			return (EINVAL);
		}
	} else {
		struct sglist *sg;

		/* no vm_ops: map the single physical range the driver set up */
		sg = sglist_alloc(1, M_WAITOK);
		sglist_append_phys(sg,
		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);

		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
		    nprot, 0, td->td_ucred);

		linux_cdev_handle_free(vmap);

		if (*object == NULL) {
			sglist_free(sg);
			return (EINVAL);
		}
	}

	if (attr != VM_MEMATTR_DEFAULT) {
		VM_OBJECT_WLOCK(*object);
		vm_object_set_memattr(*object, attr);
		VM_OBJECT_WUNLOCK(*object);
	}
	*offset = 0;
	return (0);
}

/* character device switch; only the fdopen entry point is provided */
struct cdevsw linuxcdevsw = {
	.d_version = D_VERSION,
	.d_fdopen = linux_dev_fdopen,
	.d_name = "lkpidev",
};

/*
 * fo_read handler: forward a read request to the driver's read method.
 *
 * Only a single I/O vector is supported (EOPNOTSUPP otherwise) and the
 * request may not exceed DEVFS_IOSIZE_MAX bytes (EINVAL).  Returns
 * ENXIO when the driver has no read method.
 */
static int
linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	error = 0;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->read != NULL) {
		bytes = OPW(file, td, fop->read(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			/* advance the I/O vector by the amount transferred */
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
	linux_drop_fop(ldev);

	return (error);
}

/*
 * fo_write handler: forward a write request to the driver's write
 * method.  The restrictions and error values mirror linux_file_read().
 */
static int
linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->write != NULL) {
		bytes = OPW(file, td, fop->write(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			/* advance the I/O vector by the amount transferred */
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
			error = 0;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);

	linux_drop_fop(ldev);

	return (error);
}

/*
 * fo_poll handler: return the subset of "events" reported by the
 * driver's poll method, or zero when there is no poll method.
 */
static int
linux_file_poll(struct file *file, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int revents;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->poll != NULL) {
		revents = OPW(file, td, fop->poll(filp,
		    LINUX_POLL_TABLE_NORMAL)) & events;
	} else {
		revents = 0;
	}
	linux_drop_fop(ldev);
	return (revents);
}

/*
 * fo_close handler: tear down a LinuxKPI file.  Dequeues the file from
 * its wait queue, calls the driver's release method, releases the
 * signal owner, the vnode and the character device reference, waits
 * for an RCU grace period and finally frees the file structure.
 *
 * Returns the negated result of the release method, or zero when there
 * is none.
 */
static int
linux_file_close(struct file *file, struct thread *td)
{
	struct linux_file *filp;
	int (*release)(struct inode *, struct linux_file *);
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int error;

	filp = (struct linux_file *)file->f_data;

	KASSERT(file_count(filp) == 0,
	    ("File refcount(%d) is not zero", file_count(filp)));

	if (td == NULL)
		td = curthread;

	error = 0;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_poll_wait_dequeue(filp);
	linux_get_fop(filp, &fop, &ldev);
	/*
	 * Always use the real release function, if any, to avoid
	 * leaking device resources:
	 */
	release = filp->f_op->release;
	if (release != NULL)
		error = -OPW(file, td, release(filp->f_vnode, filp));
	funsetown(&filp->f_sigio);
	if (filp->f_vnode != NULL)
		vrele(filp->f_vnode);
	linux_drop_fop(ldev);
	/* drop the reference on the character device, if any */
	ldev = filp->f_cdev;
	if (ldev != NULL)
		linux_cdev_deref(ldev);
	/* wait for a grace period before the structure is freed */
	linux_synchronize_rcu(RCU_TYPE_REGULAR);
	kfree(filp);

	return (error);
}

/*
 * fo_ioctl handler.  A handful of generic file ioctls are handled here;
 * everything else is passed on to linux_file_ioctl_sub().
 */
static int
linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct fiodgname_arg *fgn;
	const char *p;
	int error, i;

	error = 0;
	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;
	linux_get_fop(filp, &fop, &ldev);

	linux_set_current(td);
	switch (cmd) {
	case FIONBIO:
		/* nothing to do; f_flags was refreshed from the file above */
		break;
	case FIOASYNC:
		if (fop->fasync == NULL)
			break;
		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &filp->f_sigio);
		if (error == 0) {
			if (fop->fasync == NULL)
				break;
			error = -OPW(fp, td, fop->fasync(0, filp,
			    fp->f_flag & FASYNC));
		}
		break;
	case FIOGETOWN:
		*(int *)data = fgetown(&filp->f_sigio);
		break;
	case FIODGNAME:
#ifdef	COMPAT_FREEBSD32
	case FIODGNAME_32:
#endif
		if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
			error = ENXIO;
			break;
		}
		fgn = data;
		p = devtoname(filp->f_cdev->cdev);
		/* the terminating NUL is copied out as well */
		i = strlen(p) + 1;
		if (i > fgn->len) {
			error = EINVAL;
			break;
		}
		error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i);
		break;
	default:
		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
		break;
	}
	linux_drop_fop(ldev);
	return (error);
}

/*
 * Validate the protection and mapping flags of an mmap request and, if
 * acceptable, let linux_file_mmap_single() create the VM object.
 *
 * Returns EACCES when write access is requested but not permitted by
 * "maxprot" and EINVAL for MAP_PRIVATE or MAP_COPY requests.
 */
static int
linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t maxprot, int flags, struct file *fp,
    vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
{
	/*
	 * Character devices do not provide private mappings
	 * of any kind:
	 */
	if ((maxprot & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)
		return (EINVAL);

	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
	    (int)prot, (flags & MAP_SHARED) ? true : false, td));
}

/*
 * fo_mmap handler.  Computes the maximum protection permitted by the
 * mount flags, the open mode of the file and "cap_maxprot", obtains a
 * VM object from the driver and maps it into "map".
 *
 * Returns EOPNOTSUPP when the file has no vnode and EACCES when the
 * requested protection is not permitted.
 */
static int
linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct mount *mp;
	struct vnode *vp;
	vm_object_t object;
	vm_prot_t maxprot;
	int error;

	filp = (struct linux_file *)fp->f_data;

	vp = filp->f_vnode;
	if (vp == NULL)
		return (EOPNOTSUPP);

	/*
	 * Ensure that file and memory protections are
	 * compatible.
	 */
	mp = vp->v_mount;
	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
		maxprot = VM_PROT_NONE;
		if ((prot & VM_PROT_EXECUTE) != 0)
			return (EACCES);
	} else
		maxprot = VM_PROT_EXECUTE;
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_READ;
	else if ((prot & VM_PROT_READ) != 0)
		return (EACCES);

	/*
	 * If we are sharing potential changes via MAP_SHARED and we
	 * are trying to get write permission although we opened it
	 * without asking for it, bail out.
	 *
	 * Note that most character devices always share mappings.
	 *
	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
	 * requests rather than doing it here.
	 */
	if ((flags & MAP_SHARED) != 0) {
		if ((fp->f_flag & FWRITE) != 0)
			maxprot |= VM_PROT_WRITE;
		else if ((prot & VM_PROT_WRITE) != 0)
			return (EACCES);
	}
	maxprot &= cap_maxprot;

	linux_get_fop(filp, &fop, &ldev);
	error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp,
	    &foff, fop, &object);
	if (error != 0)
		goto out;

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, FALSE, td);
	/* the object reference is dropped here when mapping it failed */
	if (error != 0)
		vm_object_deallocate(object);
out:
	linux_drop_fop(ldev);
	return (error);
}

/*
 * fo_stat handler: stat the vnode backing the file.  Returns EOPNOTSUPP
 * when the file has no vnode.
 */
static int
linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = (struct linux_file *)fp->f_data;
	if (filp->f_vnode == NULL)
		return (EOPNOTSUPP);

	vp = filp->f_vnode;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED);
	VOP_UNLOCK(vp);

	return (error);
}

/*
 * fo_fill_kinfo handler.  Files without a vnode are reported as
 * KF_TYPE_DEV; otherwise the vnode information is filled in and the
 * file is reported as KF_TYPE_VNODE.
 */
static int
linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = fp->f_data;
	vp = filp->f_vnode;
	if (vp == NULL) {
		error = 0;
		kif->kf_type = KF_TYPE_DEV;
	} else {
		/*
		 * The file descriptor table lock is dropped around
		 * vn_fill_kinfo_vnode(); the vnode reference taken here
		 * is released again once that call returns.
		 */
		vref(vp);
		FILEDESC_SUNLOCK(fdp);
		error = vn_fill_kinfo_vnode(vp, kif);
		vrele(vp);
		kif->kf_type = KF_TYPE_VNODE;
		FILEDESC_SLOCK(fdp);
	}
	return (error);
}

/*
 * Return the minor number of the LinuxKPI character device behind the
 * given inode, or -1U when the inode is NULL, has no device, is not
 * backed by linuxcdevsw or has no LinuxKPI device attached.
 */
unsigned int
linux_iminor(struct inode *inode)
{
	struct linux_cdev *ldev;

	if (inode == NULL || inode->v_rdev == NULL ||
	    inode->v_rdev->si_devsw != &linuxcdevsw)
		return (-1U);
	ldev = inode->v_rdev->si_drv1;
	if (ldev == NULL)
		return (-1U);

	return (minor(ldev->dev));
}

/*
 * fo_cmp handler: compare two files by the address of their LinuxKPI
 * character device.  Returns 3 when the second file is not of type
 * DTYPE_DEV.
 */
static int
linux_file_kcmp(struct file *fp1, struct file *fp2, struct thread *td)
{
	struct linux_file *filp1, *filp2;

	if (fp2->f_type != DTYPE_DEV)
		return (3);

	filp1 = fp1->f_data;
	filp2 = fp2->f_data;
	return (kcmp_cmp((uintptr_t)filp1->f_cdev, (uintptr_t)filp2->f_cdev));
}

/* file operations vector for LinuxKPI files */
const struct fileops linuxfileops = {
	.fo_read = linux_file_read,
	.fo_write = linux_file_write,
	.fo_truncate = invfo_truncate,
	.fo_kqfilter = linux_file_kqfilter,
	.fo_stat = linux_file_stat,
	.fo_fill_kinfo = linux_file_fill_kinfo,
	.fo_poll = linux_file_poll,
	.fo_close = linux_file_close,
	.fo_ioctl = linux_file_ioctl,
	.fo_mmap = linux_file_mmap,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_cmp = linux_file_kcmp,
	.fo_flags = DFLAG_PASSABLE,
};

/*
 * Format a string into a newly allocated buffer.  The buffer comes from
 * devm_kmalloc() when "dev" is not NULL and from kmalloc() otherwise.
 * Returns NULL when the allocation fails.
 */
static char *
devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap)
{
	unsigned int len;
	char *p;
	va_list aq;

	/* first pass on a copy of the arguments computes the length */
	va_copy(aq, ap);
	len = vsnprintf(NULL, 0, fmt, aq);
	va_end(aq);

	if (dev != NULL)
		p = devm_kmalloc(dev, len + 1, gfp);
	else
		p = kmalloc(len + 1, gfp);
	if (p != NULL)
		vsnprintf(p, len + 1, fmt, ap);

	return (p);
}

/* va_list variant of kasprintf(); the buffer is not tied to a device */
char *
kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{

	return (devm_kvasprintf(NULL, gfp, fmt, ap));
}

/* variadic front end for devm_kvasprintf() */
char *
lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = devm_kvasprintf(dev, gfp, fmt, ap);
	va_end(ap);

	return (p);
}

/* variadic front end for kvasprintf() */
char *
kasprintf(gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = kvasprintf(gfp, fmt, ap);
	va_end(ap);

	return (p);
}

/* lkpi_hex_dump() output callback which prints using vprintf() */
int
__lkpi_hexdump_printf(void *arg1 __unused, const char *fmt, ...)
{
	va_list ap;
	int result;

	va_start(ap, fmt);
	result = vprintf(fmt, ap);
	va_end(ap);
	return (result);
}

/*
 * lkpi_hex_dump() output callback which appends to the sbuf passed as
 * the first argument.
 */
int
__lkpi_hexdump_sbuf_printf(void *arg1, const char *fmt, ...)
{
	va_list ap;
	int result;

	va_start(ap, fmt);
	result = sbuf_vprintf(arg1, fmt, ap);
	va_end(ap);
	return (result);
}

/*
 * Hex dump "len" bytes starting at "buf" through the "_fpf" callback,
 * which receives "arg1" as its first argument.
 *
 * Each output row starts with the optional "level" and "prefix_str"
 * strings followed by the address or offset prefix selected by
 * "prefix_type".  Up to "rowsize" groups follow; a group is 8, 4 or 2
 * bytes wide when "groupsize" has that value and enough bytes remain,
 * and a single byte otherwise.  Fields are separated by one space.
 *
 * A newline is written after a row only when "trailing_newline" is set
 * and more bytes remain to be dumped.  The dump stops as soon as the
 * callback returns a negative value.
 *
 * NOTE(review): the "ascii" argument is not used by this function.
 */
void
lkpi_hex_dump(int(*_fpf)(void *, const char *, ...), void *arg1,
    const char *level, const char *prefix_str,
    const int prefix_type, const int rowsize, const int groupsize,
    const void *buf, size_t len, const bool ascii, const bool trailing_newline)
{
	/* packed wrappers used to read a group straight from the buffer */
	typedef const struct { long long value; } __packed *print_64p_t;
	typedef const struct { uint32_t value; } __packed *print_32p_t;
	typedef const struct { uint16_t value; } __packed *print_16p_t;
	const void *buf_old = buf;
	int row, linelen, ret;

	while (len > 0) {
		/* linelen counts what was written so far on this row */
		linelen = 0;
		if (level != NULL) {
			ret = _fpf(arg1, "%s", level);
			if (ret < 0)
				break;
			linelen += ret;
		}
		if (prefix_str != NULL) {
			ret = _fpf(
			    arg1, "%s%s", linelen ? " " : "", prefix_str);
			if (ret < 0)
				break;
			linelen += ret;
		}

		switch (prefix_type) {
		case DUMP_PREFIX_ADDRESS:
			ret = _fpf(
			    arg1, "%s[%p]", linelen ? " " : "", buf);
			if (ret < 0)
				return;
			linelen += ret;
			break;
		case DUMP_PREFIX_OFFSET:
			/* offset relative to the start of the dumped buffer */
			ret = _fpf(
			    arg1, "%s[%#tx]", linelen ? " " : "",
			    ((const char *)buf - (const char *)buf_old));
			if (ret < 0)
				return;
			linelen += ret;
			break;
		default:
			break;
		}
		/* "row" counts groups, not bytes, within the current row */
		for (row = 0; row != rowsize; row++) {
			if (groupsize == 8 && len > 7) {
				ret = _fpf(
				    arg1, "%s%016llx", linelen ? " " : "",
				    ((print_64p_t)buf)->value);
				if (ret < 0)
					return;
				linelen += ret;
				buf = (const uint8_t *)buf + 8;
				len -= 8;
			} else if (groupsize == 4 && len > 3) {
				ret = _fpf(
				    arg1, "%s%08x", linelen ? " " : "",
				    ((print_32p_t)buf)->value);
				if (ret < 0)
					return;
				linelen += ret;
				buf = (const uint8_t *)buf + 4;
				len -= 4;
			} else if (groupsize == 2 && len > 1) {
				ret = _fpf(
				    arg1, "%s%04x", linelen ? " " : "",
				    ((print_16p_t)buf)->value);
				if (ret < 0)
					return;
				linelen += ret;
				buf = (const uint8_t *)buf + 2;
				len -= 2;
			} else if (len > 0) {
				/* any remaining bytes are dumped one at a time */
				ret = _fpf(
				    arg1, "%s%02x", linelen ? " " : "",
				    *(const uint8_t *)buf);
				if (ret < 0)
					return;
				linelen += ret;
				buf = (const uint8_t *)buf + 1;
				len--;
			} else {
				break;
			}
		}
		if (len > 0 && trailing_newline) {
			ret = _fpf(arg1, "\n");
			if (ret < 0)
				break;
		}
	}
}

/* state shared between lkpi_hex_dump_to_buffer() and hdtb_cb() */
struct hdtb_context {
	char	*linebuf;	/* where the next output is written */
	size_t	linebuflen;	/* space left at linebuf */
	int	written;	/* running total of the vsnprintf() results */
};

/*
 * lkpi_hex_dump() output callback which formats into the buffer
 * described by the hdtb_context passed as the first argument.
 */
static int
hdtb_cb(void *arg, const char *format, ...)
{
	struct hdtb_context *context;
	int written;
	va_list args;

	context = arg;

	va_start(args, format);
	written = vsnprintf(
	    context->linebuf, context->linebuflen, format, args);
	va_end(args);

	if (written < 0)
		return (written);

	/*
	 * Linux' hex_dump_to_buffer() function has the same behaviour as
	 * snprintf() basically. Therefore, it returns the number of bytes it
	 * would have written if the destination buffer was large enough.
	 *
	 * If the destination buffer was exhausted, lkpi_hex_dump() will
	 * continue to call this callback but it will only compute the bytes it
	 * would have written but write nothing to that buffer.
	 */
	context->written += written;

	if (written < context->linebuflen) {
		context->linebuf += written;
		context->linebuflen -= written;
	} else {
		/* output was truncated: no space is left from here on */
		context->linebuf += context->linebuflen;
		context->linebuflen = 0;
	}

	return (written);
}

/*
 * Hex dump at most one row of "buf" into "linebuf".  Any "rowsize"
 * other than 16 or 32 is replaced by 16, and "len" is limited to
 * "rowsize".  Returns the accumulated length reported by hdtb_cb().
 */
int
lkpi_hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
    int groupsize, char *linebuf, size_t linebuflen, bool ascii)
{
	int written;
	struct hdtb_context context;

	context.linebuf = linebuf;
	context.linebuflen = linebuflen;
	context.written = 0;

	if (rowsize != 16 && rowsize != 32)
		rowsize = 16;

	len = min(len, rowsize);

	lkpi_hex_dump(
	    hdtb_cb, &context, NULL, NULL, DUMP_PREFIX_NONE,
	    rowsize, groupsize, buf, len, ascii, false);

	written = context.written;

	return (written);
}

/*
 * Callout handler shared by all LinuxKPI timers; "context" is the
 * timer_list.  Invokes the timer function with the timer's data value.
 */
static void
linux_timer_callback_wrapper(void *context)
{
	struct timer_list *timer;

	timer = context;

	/* the timer is about to be shutdown permanently */
	if (timer->function == NULL)
		return;

	if (linux_set_current_flags(curthread, M_NOWAIT)) {
		/* try again later */
		callout_reset(&timer->callout, 1,
		    &linux_timer_callback_wrapper, timer);
		return;
	}

	timer->function(timer->data);
}

/*
 * Convert an absolute expiry time in jiffies into a relative tick
 * count in the range 1 .. INT_MAX.
 */
static int
linux_timer_jiffies_until(unsigned long expires)
{
	unsigned long delta = expires - jiffies;

	/*
	 * Guard against already expired values and make sure that the value can
	 * be used as a tick count, rather than a jiffies count.
	 */
	if ((long)delta < 1)
		delta = 1;
	else if (delta > INT_MAX)
		delta = INT_MAX;
	return ((int)delta);
}

/*
 * Set the expiry time of a timer and (re)arm it.  Returns non-zero
 * when callout_reset() returned 1 and zero when it returned 0.
 */
int
mod_timer(struct timer_list *timer, unsigned long expires)
{
	int ret;

	timer->expires = expires;
	ret = callout_reset(&timer->callout,
	    linux_timer_jiffies_until(expires),
	    &linux_timer_callback_wrapper, timer);

	MPASS(ret == 0 || ret == 1);

	return (ret == 1);
}

/* arm a timer using the expiry time already stored in it */
void
add_timer(struct timer_list *timer)
{

	callout_reset(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer);
}

/* like add_timer(), but the callout is scheduled on the given CPU */
void
add_timer_on(struct timer_list *timer, int cpu)
{

	callout_reset_on(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer, cpu);
}

/*
 * Stop a timer.  Returns 0 when callout_stop() returned -1 and 1
 * otherwise.
 */
int
timer_delete(struct timer_list *timer)
{

	if (callout_stop(&(timer)->callout) == -1)
		return (0);
	return (1);
}

/*
 * Stop a timer using callout_drain().  Returns 0 when callout_drain()
 * returned -1 and 1 otherwise.
 */
int
timer_delete_sync(struct timer_list *timer)
{

	if (callout_drain(&(timer)->callout) == -1)
		return (0);
	return (1);
}

/*
 * Permanently shut down a timer: the function pointer is cleared, so
 * that linux_timer_callback_wrapper() returns early, before the timer
 * is deleted synchronously.
 */
int
timer_shutdown_sync(struct timer_list *timer)
{

	timer->function = NULL;
	return (del_timer_sync(timer));
}

/* greatest common divisor, Euclidean algorithm */
static uint64_t
lkpi_gcd_64(uint64_t a, uint64_t b)
{
	uint64_t an;
	uint64_t bn;

	while (b != 0) {
		an = b;
		bn = a % b;
		a = an;
		b = bn;
	}
	return (a);
}

/*
 * Conversion constants for nanoseconds, microseconds and milliseconds
 * to ticks.  The "rem" and "max" values are zero until
 * linux_timer_init() computes them, and the "div" values are reduced
 * there as well.
 */
uint64_t lkpi_nsec2hz_rem;
uint64_t lkpi_nsec2hz_div = 1000000000ULL;
uint64_t lkpi_nsec2hz_max;

uint64_t lkpi_usec2hz_rem;
uint64_t lkpi_usec2hz_div = 1000000ULL;
uint64_t lkpi_usec2hz_max;

uint64_t lkpi_msec2hz_rem;
uint64_t lkpi_msec2hz_div = 1000ULL;
uint64_t lkpi_msec2hz_max;
/*
 * SYSINIT hook: derive linux_timer_hz_mask from "hz" and reduce the
 * time unit to tick conversion fractions to their lowest terms.
 */
static void
linux_timer_init(void *arg)
{
	uint64_t gcd;

	/*
	 * Compute an internal HZ value which can divide 2**32 to
	 * avoid timer rounding problems when the tick value wraps
	 * around 2**32:
	 */
	linux_timer_hz_mask = 1;
	while (linux_timer_hz_mask < (unsigned long)hz)
		linux_timer_hz_mask *= 2;
	linux_timer_hz_mask--;

	/* compute some internal constants */

	lkpi_nsec2hz_rem = hz;
	lkpi_usec2hz_rem = hz;
	lkpi_msec2hz_rem = hz;

	/*
	 * For each unit, divide both "rem" and "div" by their greatest
	 * common divisor; "max" is the 64-bit maximum divided by the
	 * reduced "rem".
	 */
	gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div);
	lkpi_nsec2hz_rem /= gcd;
	lkpi_nsec2hz_div /= gcd;
	lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem;

	gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div);
	lkpi_usec2hz_rem /= gcd;
	lkpi_usec2hz_div /= gcd;
	lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem;

	gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div);
	lkpi_msec2hz_rem /= gcd;
	lkpi_msec2hz_div /= gcd;
	lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem;
}
SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);

/*
 * Signal a completion.  When "all" is non-zero the done counter is set
 * to UINT_MAX and every sleeper is woken up; otherwise the counter is
 * incremented, unless it already is UINT_MAX, and one sleeper is
 * signalled.
 */
void
linux_complete_common(struct completion *c, int all)
{
	sleepq_lock(c);
	if (all) {
		c->done = UINT_MAX;
		sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
	} else {
		if (c->done != UINT_MAX)
			c->done++;
		sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
	}
	sleepq_release(c);
}

/*
 * Indefinite wait for done != 0 with or without signals.
 *
 * A non-zero "flags" argument selects an interruptible sleep.  Returns
 * zero once the completion was consumed, or immediately when the
 * scheduler is stopped, and -ERESTARTSYS when the sleep was
 * interrupted.
 */
int
linux_wait_for_common(struct completion *c, int flags)
{
	struct task_struct *task;
	int error;

	if (SCHEDULER_STOPPED())
		return (0);

	task = current;

	if (flags != 0)
		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
	else
		flags = SLEEPQ_SLEEP;
	error = 0;
	for (;;) {
		sleepq_lock(c);
		/* leave the loop with the sleep queue still locked */
		if (c->done)
			break;
		sleepq_add(c, NULL, "completion", flags, 0);
		if (flags & SLEEPQ_INTERRUPTIBLE) {
			DROP_GIANT();
			error = -sleepq_wait_sig(c, 0);
			PICKUP_GIANT();
			if (error != 0) {
				/*
				 * NOTE(review): this path skips
				 * sleepq_release(); presumably the wait
				 * call has already dropped the sleep queue
				 * lock - confirm against sleepqueue(9).
				 */
				linux_schedule_save_interrupt_value(task, error);
				error = -ERESTARTSYS;
				goto intr;
			}
		} else {
			DROP_GIANT();
			sleepq_wait(c, 0);
			PICKUP_GIANT();
		}
	}
	/* consume one completion, unless it was completed for all */
	if (c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);

intr:
	return (error);
}

/*
 * Time limited wait for done != 0 with or without signals.
 *
 * A non-zero "flags" argument selects an interruptible sleep.  Returns
 * zero on timeout, or immediately when the scheduler is stopped,
 * -ERESTARTSYS converted to unsigned long when the sleep was
 * interrupted, and otherwise the number of jiffies left until the
 * timeout as computed by linux_timer_jiffies_until().
 */
unsigned long
linux_wait_for_timeout_common(struct completion *c, unsigned long timeout,
    int flags)
{
	struct task_struct *task;
	/* "error" is unsigned: negative values are stored after conversion */
	unsigned long end = jiffies + timeout, error;

	if (SCHEDULER_STOPPED())
		return (0);

	task = current;

	if (flags != 0)
		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
	else
		flags = SLEEPQ_SLEEP;

	for (;;) {
		sleepq_lock(c);
		/* leave the loop with the sleep queue still locked */
		if (c->done)
			break;
		sleepq_add(c, NULL, "completion", flags, 0);
		/* the timeout is recomputed from "end" on every iteration */
		sleepq_set_timeout(c, linux_timer_jiffies_until(end));

		DROP_GIANT();
		if (flags & SLEEPQ_INTERRUPTIBLE)
			error = -sleepq_timedwait_sig(c, 0);
		else
			error = -sleepq_timedwait(c, 0);
		PICKUP_GIANT();

		if (error != 0) {
			/* check for timeout */
			if (error == -EWOULDBLOCK) {
				error = 0;	/* timeout */
			} else {
				/* signal happened */
				linux_schedule_save_interrupt_value(task, error);
				error = -ERESTARTSYS;
			}
			goto done;
		}
	}
	/* consume one completion, unless it was completed for all */
	if (c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);

	/* return how many jiffies are left */
	error = linux_timer_jiffies_until(end);
done:
	return (error);
}

/*
 * Non-blocking attempt to consume a completion.  Returns non-zero when
 * the done counter was non-zero; the counter is decremented in that
 * case unless it is UINT_MAX.
 */
int
linux_try_wait_for_completion(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	if (c->done != 0 && c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);
	return (isdone);
}

/*
 * Return non-zero when the done counter of the completion is non-zero.
 * The counter is not modified.
 */
int
linux_completion_done(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	sleepq_release(c);
	return (isdone);
}

/*
 * Drop a reference on a LinuxKPI character device.  The structure is
 * freed on release of the last reference, but only when its kobject
 * type is linux_cdev_ktype.
 */
static void
linux_cdev_deref(struct linux_cdev *ldev)
{
	if (refcount_release(&ldev->refs) &&
	    ldev->kobj.ktype == &linux_cdev_ktype)
		kfree(ldev);
}

/*
 * kobject release method for linux_cdev_ktype: destroy the character
 * device, drop a reference on the structure and release the parent
 * kobject.
 */
static void
linux_cdev_release(struct kobject *kobj)
{
	struct linux_cdev *cdev;
	struct kobject *parent;

	cdev = container_of(kobj, struct linux_cdev, kobj);
	/* read the parent first; the deref below may free the structure */
	parent = kobj->parent;
	linux_destroy_dev(cdev);
	linux_cdev_deref(cdev);
	kobject_put(parent);
}

/*
 * kobject release method for linux_cdev_static_ktype: destroy the
 * character device, if any, and release the parent kobject.  The
 * structure itself is not freed here.
 */
static void
linux_cdev_static_release(struct kobject *kobj)
{
	struct cdev *cdev;
	struct linux_cdev *ldev;

	ldev = container_of(kobj, struct linux_cdev, kobj);
	cdev = ldev->cdev;
	if (cdev != NULL) {
		destroy_dev(cdev);
		ldev->cdev = NULL;
	}
	kobject_put(kobj->parent);
}

/*
 * Add a character device together with its device structure.  The
 * character device is only added when the device has a non-zero device
 * number, and it is deleted again when device_add() fails.
 *
 * Returns zero on success, -EINVAL when the character device kobject
 * has no name, or the error from cdev_add() or device_add().
 */
int
linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev)
{
	int ret;

	if (dev->devt != 0) {
		/* Set parent kernel object. */
		ldev->kobj.parent = &dev->kobj;

		/*
		 * Unlike Linux we require the kobject of the
		 * character device structure to have a valid name
		 * before calling this function:
		 */
		if (ldev->kobj.name == NULL)
			return (-EINVAL);

		ret = cdev_add(ldev, dev->devt, 1);
		if (ret)
			return (ret);
	}
	ret = device_add(dev);
	if (ret != 0 && dev->devt != 0)
		cdev_del(ldev);
	return (ret);
}

/*
 * Counterpart of linux_cdev_device_add(): delete the device and, when
 * it has a non-zero device number, the character device as well.
 */
void
linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev)
{
	device_del(dev);

	if (dev->devt != 0)
		cdev_del(ldev);
}

/*
 * Destroy the character device of a LinuxKPI device, if any.  The
 * LDEV_SI_DTR bit is set in "siref" and the function then sleeps until
 * all other bits in "siref" are clear before calling destroy_dev().
 */
static void
linux_destroy_dev(struct linux_cdev *ldev)
{

	if (ldev->cdev == NULL)
		return;

	MPASS((ldev->siref & LDEV_SI_DTR) == 0);
	MPASS(ldev->kobj.ktype == &linux_cdev_ktype);

	atomic_set_int(&ldev->siref, LDEV_SI_DTR);
	/* poll every quarter of a second until the other bits clear */
	while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
		pause("ldevdtr", hz / 4);

	destroy_dev(ldev->cdev);
	ldev->cdev = NULL;
}

/* kobject type whose release method also drops the structure reference */
const struct kobj_type linux_cdev_ktype = {
	.release = linux_cdev_release,
};

/* kobject type whose release method leaves the structure allocated */
const struct kobj_type linux_cdev_static_ktype = {
	.release = linux_cdev_static_release,
};

/*
 * ifnet link event handler: forward the event to the notifier block
 * passed as "arg", as NETDEV_UP when the link state is LINK_STATE_UP
 * and as NETDEV_DOWN for any other link state.
 */
static void
linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	if (linkstate == LINK_STATE_UP)
		nb->notifier_call(nb, NETDEV_UP, &ni);
	else
		nb->notifier_call(nb, NETDEV_DOWN, &ni);
}

/*
 * ifnet arrival event handler: forward the event to the notifier block
 * passed as "arg" as NETDEV_REGISTER.
 */
static void
linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_REGISTER, &ni);
}
static void 2494 linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp) 2495 { 2496 struct notifier_block *nb; 2497 struct netdev_notifier_info ni; 2498 2499 nb = arg; 2500 ni.ifp = ifp; 2501 ni.dev = (struct net_device *)ifp; 2502 nb->notifier_call(nb, NETDEV_UNREGISTER, &ni); 2503 } 2504 2505 static void 2506 linux_handle_iflladdr_event(void *arg, struct ifnet *ifp) 2507 { 2508 struct notifier_block *nb; 2509 struct netdev_notifier_info ni; 2510 2511 nb = arg; 2512 ni.ifp = ifp; 2513 ni.dev = (struct net_device *)ifp; 2514 nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni); 2515 } 2516 2517 static void 2518 linux_handle_ifaddr_event(void *arg, struct ifnet *ifp) 2519 { 2520 struct notifier_block *nb; 2521 struct netdev_notifier_info ni; 2522 2523 nb = arg; 2524 ni.ifp = ifp; 2525 ni.dev = (struct net_device *)ifp; 2526 nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni); 2527 } 2528 2529 int 2530 register_netdevice_notifier(struct notifier_block *nb) 2531 { 2532 2533 nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER( 2534 ifnet_link_event, linux_handle_ifnet_link_event, nb, 0); 2535 nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER( 2536 ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0); 2537 nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER( 2538 ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0); 2539 nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER( 2540 iflladdr_event, linux_handle_iflladdr_event, nb, 0); 2541 2542 return (0); 2543 } 2544 2545 int 2546 register_inetaddr_notifier(struct notifier_block *nb) 2547 { 2548 2549 nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER( 2550 ifaddr_event, linux_handle_ifaddr_event, nb, 0); 2551 return (0); 2552 } 2553 2554 int 2555 unregister_netdevice_notifier(struct notifier_block *nb) 2556 { 2557 2558 EVENTHANDLER_DEREGISTER(ifnet_link_event, 2559 nb->tags[NETDEV_UP]); 2560 EVENTHANDLER_DEREGISTER(ifnet_arrival_event, 2561 nb->tags[NETDEV_REGISTER]); 2562 
EVENTHANDLER_DEREGISTER(ifnet_departure_event, 2563 nb->tags[NETDEV_UNREGISTER]); 2564 EVENTHANDLER_DEREGISTER(iflladdr_event, 2565 nb->tags[NETDEV_CHANGEADDR]); 2566 2567 return (0); 2568 } 2569 2570 int 2571 unregister_inetaddr_notifier(struct notifier_block *nb) 2572 { 2573 2574 EVENTHANDLER_DEREGISTER(ifaddr_event, 2575 nb->tags[NETDEV_CHANGEIFADDR]); 2576 2577 return (0); 2578 } 2579 2580 struct list_sort_thunk { 2581 int (*cmp)(void *, struct list_head *, struct list_head *); 2582 void *priv; 2583 }; 2584 2585 static inline int 2586 linux_le_cmp(const void *d1, const void *d2, void *priv) 2587 { 2588 struct list_head *le1, *le2; 2589 struct list_sort_thunk *thunk; 2590 2591 thunk = priv; 2592 le1 = *(__DECONST(struct list_head **, d1)); 2593 le2 = *(__DECONST(struct list_head **, d2)); 2594 return ((thunk->cmp)(thunk->priv, le1, le2)); 2595 } 2596 2597 void 2598 list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv, 2599 struct list_head *a, struct list_head *b)) 2600 { 2601 struct list_sort_thunk thunk; 2602 struct list_head **ar, *le; 2603 size_t count, i; 2604 2605 count = 0; 2606 list_for_each(le, head) 2607 count++; 2608 ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK); 2609 i = 0; 2610 list_for_each(le, head) 2611 ar[i++] = le; 2612 thunk.cmp = cmp; 2613 thunk.priv = priv; 2614 qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk); 2615 INIT_LIST_HEAD(head); 2616 for (i = 0; i < count; i++) 2617 list_add_tail(ar[i], head); 2618 free(ar, M_KMALLOC); 2619 } 2620 2621 #if defined(__i386__) || defined(__amd64__) 2622 int 2623 linux_wbinvd_on_all_cpus(void) 2624 { 2625 2626 pmap_invalidate_cache(); 2627 return (0); 2628 } 2629 #endif 2630 2631 int 2632 linux_on_each_cpu(void callback(void *), void *data) 2633 { 2634 2635 smp_rendezvous(smp_no_rendezvous_barrier, callback, 2636 smp_no_rendezvous_barrier, data); 2637 return (0); 2638 } 2639 2640 int 2641 linux_in_atomic(void) 2642 { 2643 2644 return 
((curthread->td_pflags & TDP_NOFAULTING) != 0); 2645 } 2646 2647 struct linux_cdev * 2648 linux_find_cdev(const char *name, unsigned major, unsigned minor) 2649 { 2650 dev_t dev = MKDEV(major, minor); 2651 struct cdev *cdev; 2652 2653 dev_lock(); 2654 LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) { 2655 struct linux_cdev *ldev = cdev->si_drv1; 2656 if (ldev->dev == dev && 2657 strcmp(kobject_name(&ldev->kobj), name) == 0) { 2658 break; 2659 } 2660 } 2661 dev_unlock(); 2662 2663 return (cdev != NULL ? cdev->si_drv1 : NULL); 2664 } 2665 2666 int 2667 __register_chrdev(unsigned int major, unsigned int baseminor, 2668 unsigned int count, const char *name, 2669 const struct file_operations *fops) 2670 { 2671 struct linux_cdev *cdev; 2672 int ret = 0; 2673 int i; 2674 2675 for (i = baseminor; i < baseminor + count; i++) { 2676 cdev = cdev_alloc(); 2677 cdev->ops = fops; 2678 kobject_set_name(&cdev->kobj, name); 2679 2680 ret = cdev_add(cdev, makedev(major, i), 1); 2681 if (ret != 0) 2682 break; 2683 } 2684 return (ret); 2685 } 2686 2687 int 2688 __register_chrdev_p(unsigned int major, unsigned int baseminor, 2689 unsigned int count, const char *name, 2690 const struct file_operations *fops, uid_t uid, 2691 gid_t gid, int mode) 2692 { 2693 struct linux_cdev *cdev; 2694 int ret = 0; 2695 int i; 2696 2697 for (i = baseminor; i < baseminor + count; i++) { 2698 cdev = cdev_alloc(); 2699 cdev->ops = fops; 2700 kobject_set_name(&cdev->kobj, name); 2701 2702 ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode); 2703 if (ret != 0) 2704 break; 2705 } 2706 return (ret); 2707 } 2708 2709 void 2710 __unregister_chrdev(unsigned int major, unsigned int baseminor, 2711 unsigned int count, const char *name) 2712 { 2713 struct linux_cdev *cdevp; 2714 int i; 2715 2716 for (i = baseminor; i < baseminor + count; i++) { 2717 cdevp = linux_find_cdev(name, major, i); 2718 if (cdevp != NULL) 2719 cdev_del(cdevp); 2720 } 2721 } 2722 2723 void 2724 linux_dump_stack(void) 2725 { 2726 
#ifdef STACK 2727 struct stack st; 2728 2729 stack_save(&st); 2730 stack_print(&st); 2731 #endif 2732 } 2733 2734 int 2735 linuxkpi_net_ratelimit(void) 2736 { 2737 2738 return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps, 2739 lkpi_net_maxpps)); 2740 } 2741 2742 struct io_mapping * 2743 io_mapping_create_wc(resource_size_t base, unsigned long size) 2744 { 2745 struct io_mapping *mapping; 2746 2747 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); 2748 if (mapping == NULL) 2749 return (NULL); 2750 return (io_mapping_init_wc(mapping, base, size)); 2751 } 2752 2753 /* We likely want a linuxkpi_device.c at some point. */ 2754 bool 2755 device_can_wakeup(struct device *dev) 2756 { 2757 2758 if (dev == NULL) 2759 return (false); 2760 /* 2761 * XXX-BZ iwlwifi queries it as part of enabling WoWLAN. 2762 * Normally this would be based on a bool in dev->power.XXX. 2763 * Check such as PCI PCIM_PCAP_*PME. We have no way to enable this yet. 2764 * We may get away by directly calling into bsddev for as long as 2765 * we can assume PCI only avoiding changing struct device breaking KBI. 
2766 */ 2767 pr_debug("%s:%d: not enabled; see comment.\n", __func__, __LINE__); 2768 return (false); 2769 } 2770 2771 static void 2772 devm_device_group_remove(struct device *dev, void *p) 2773 { 2774 const struct attribute_group **dr = p; 2775 const struct attribute_group *group = *dr; 2776 2777 sysfs_remove_group(&dev->kobj, group); 2778 } 2779 2780 int 2781 lkpi_devm_device_add_group(struct device *dev, 2782 const struct attribute_group *group) 2783 { 2784 const struct attribute_group **dr; 2785 int ret; 2786 2787 dr = devres_alloc(devm_device_group_remove, sizeof(*dr), GFP_KERNEL); 2788 if (dr == NULL) 2789 return (-ENOMEM); 2790 2791 ret = sysfs_create_group(&dev->kobj, group); 2792 if (ret == 0) { 2793 *dr = group; 2794 devres_add(dev, dr); 2795 } else 2796 devres_free(dr); 2797 2798 return (ret); 2799 } 2800 2801 #if defined(__i386__) || defined(__amd64__) 2802 bool linux_cpu_has_clflush; 2803 struct cpuinfo_x86 boot_cpu_data; 2804 struct cpuinfo_x86 *__cpu_data; 2805 #endif 2806 2807 cpumask_t * 2808 lkpi_get_static_single_cpu_mask(int cpuid) 2809 { 2810 2811 KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n", 2812 __func__, cpuid)); 2813 KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n", 2814 __func__, cpuid)); 2815 2816 return (static_single_cpu_mask[cpuid]); 2817 } 2818 2819 bool 2820 lkpi_xen_initial_domain(void) 2821 { 2822 #ifdef XENHVM 2823 return (xen_initial_domain()); 2824 #else 2825 return (false); 2826 #endif 2827 } 2828 2829 bool 2830 lkpi_xen_pv_domain(void) 2831 { 2832 #ifdef XENHVM 2833 return (xen_pv_domain()); 2834 #else 2835 return (false); 2836 #endif 2837 } 2838 2839 static void 2840 linux_compat_init(void *arg) 2841 { 2842 struct sysctl_oid *rootoid; 2843 int i; 2844 2845 #if defined(__i386__) || defined(__amd64__) 2846 static const uint32_t x86_vendors[X86_VENDOR_NUM] = { 2847 [X86_VENDOR_INTEL] = CPU_VENDOR_INTEL, 2848 [X86_VENDOR_CYRIX] = CPU_VENDOR_CYRIX, 2849 [X86_VENDOR_AMD] = CPU_VENDOR_AMD, 
2850 [X86_VENDOR_UMC] = CPU_VENDOR_UMC, 2851 [X86_VENDOR_CENTAUR] = CPU_VENDOR_CENTAUR, 2852 [X86_VENDOR_TRANSMETA] = CPU_VENDOR_TRANSMETA, 2853 [X86_VENDOR_NSC] = CPU_VENDOR_NSC, 2854 [X86_VENDOR_HYGON] = CPU_VENDOR_HYGON, 2855 }; 2856 uint8_t x86_vendor = X86_VENDOR_UNKNOWN; 2857 2858 for (i = 0; i < X86_VENDOR_NUM; i++) { 2859 if (cpu_vendor_id != 0 && cpu_vendor_id == x86_vendors[i]) { 2860 x86_vendor = i; 2861 break; 2862 } 2863 } 2864 linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH); 2865 boot_cpu_data.x86_clflush_size = cpu_clflush_line_size; 2866 boot_cpu_data.x86_max_cores = mp_ncpus; 2867 boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id); 2868 boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id); 2869 boot_cpu_data.x86_vendor = x86_vendor; 2870 2871 __cpu_data = kmalloc_array(mp_maxid + 1, 2872 sizeof(*__cpu_data), M_WAITOK | M_ZERO); 2873 CPU_FOREACH(i) { 2874 __cpu_data[i].x86_clflush_size = cpu_clflush_line_size; 2875 __cpu_data[i].x86_max_cores = mp_ncpus; 2876 __cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id); 2877 __cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id); 2878 __cpu_data[i].x86_vendor = x86_vendor; 2879 } 2880 #endif 2881 rw_init(&linux_vma_lock, "lkpi-vma-lock"); 2882 2883 rootoid = SYSCTL_ADD_ROOT_NODE(NULL, 2884 OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys"); 2885 kobject_init(&linux_class_root, &linux_class_ktype); 2886 kobject_set_name(&linux_class_root, "class"); 2887 linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid), 2888 OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class"); 2889 kobject_init(&linux_root_device.kobj, &linux_dev_ktype); 2890 kobject_set_name(&linux_root_device.kobj, "device"); 2891 linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL, 2892 SYSCTL_CHILDREN(rootoid), OID_AUTO, "device", 2893 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device"); 2894 linux_root_device.bsddev = root_bus; 2895 linux_class_misc.name = "misc"; 2896 class_register(&linux_class_misc); 2897 INIT_LIST_HEAD(&pci_drivers); 2898 
INIT_LIST_HEAD(&pci_devices); 2899 spin_lock_init(&pci_lock); 2900 init_waitqueue_head(&linux_bit_waitq); 2901 init_waitqueue_head(&linux_var_waitq); 2902 2903 CPU_COPY(&all_cpus, &cpu_online_mask); 2904 /* 2905 * Generate a single-CPU cpumask_t for each CPU (possibly) in the system. 2906 * CPUs are indexed from 0..(mp_maxid). The entry for cpuid 0 will only 2907 * have itself in the cpumask, cupid 1 only itself on entry 1, and so on. 2908 * This is used by cpumask_of() (and possibly others in the future) for, 2909 * e.g., drivers to pass hints to irq_set_affinity_hint(). 2910 */ 2911 static_single_cpu_mask = kmalloc_array(mp_maxid + 1, 2912 sizeof(static_single_cpu_mask), M_WAITOK | M_ZERO); 2913 2914 /* 2915 * When the number of CPUs reach a threshold, we start to save memory 2916 * given the sets are static by overlapping those having their single 2917 * bit set at same position in a bitset word. Asymptotically, this 2918 * regular scheme is in O(n²) whereas the overlapping one is in O(n) 2919 * only with n being the maximum number of CPUs, so the gain will become 2920 * huge quite quickly. The threshold for 64-bit architectures is 128 2921 * CPUs. 2922 */ 2923 if (mp_ncpus < (2 * _BITSET_BITS)) { 2924 cpumask_t *sscm_ptr; 2925 2926 /* 2927 * This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) * 2928 * (_BITSET_BITS / 8)' bytes (for comparison with the 2929 * overlapping scheme). 2930 */ 2931 static_single_cpu_mask_lcs = kmalloc_array(mp_ncpus, 2932 sizeof(*static_single_cpu_mask_lcs), 2933 M_WAITOK | M_ZERO); 2934 2935 sscm_ptr = static_single_cpu_mask_lcs; 2936 CPU_FOREACH(i) { 2937 static_single_cpu_mask[i] = sscm_ptr++; 2938 CPU_SET(i, static_single_cpu_mask[i]); 2939 } 2940 } else { 2941 /* Pointer to a bitset word. 
*/ 2942 __typeof(((cpuset_t *)NULL)->__bits[0]) *bwp; 2943 2944 /* 2945 * Allocate memory for (static) spans of 'cpumask_t' ('cpuset_t' 2946 * really) with a single bit set that can be reused for all 2947 * single CPU masks by making them start at different offsets. 2948 * We need '__bitset_words(CPU_SETSIZE) - 1' bitset words before 2949 * the word having its single bit set, and the same amount 2950 * after. 2951 */ 2952 static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS, 2953 (2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8), 2954 M_KMALLOC, M_WAITOK | M_ZERO); 2955 2956 /* 2957 * We rely below on cpuset_t and the bitset generic 2958 * implementation assigning words in the '__bits' array in the 2959 * same order of bits (i.e., little-endian ordering, not to be 2960 * confused with machine endianness, which concerns bits in 2961 * words and other integers). This is an imperfect test, but it 2962 * will detect a change to big-endian ordering. 2963 */ 2964 _Static_assert( 2965 __bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1, 2966 "Assumes a bitset implementation that is little-endian " 2967 "on its words"); 2968 2969 /* Initialize the single bit of each static span. */ 2970 bwp = (__typeof(bwp))static_single_cpu_mask_lcs + 2971 (__bitset_words(CPU_SETSIZE) - 1); 2972 for (i = 0; i < _BITSET_BITS; i++) { 2973 CPU_SET(i, (cpuset_t *)bwp); 2974 bwp += (2 * __bitset_words(CPU_SETSIZE) - 1); 2975 } 2976 2977 /* 2978 * Finally set all CPU masks to the proper word in their 2979 * relevant span. 2980 */ 2981 CPU_FOREACH(i) { 2982 bwp = (__typeof(bwp))static_single_cpu_mask_lcs; 2983 /* Find the non-zero word of the relevant span. */ 2984 bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) * 2985 (i % _BITSET_BITS) + 2986 __bitset_words(CPU_SETSIZE) - 1; 2987 /* Shift to find the CPU mask start. 
*/ 2988 bwp -= (i / _BITSET_BITS); 2989 static_single_cpu_mask[i] = (cpuset_t *)bwp; 2990 } 2991 } 2992 2993 strlcpy(init_uts_ns.name.release, osrelease, sizeof(init_uts_ns.name.release)); 2994 } 2995 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL); 2996 2997 static void 2998 linux_compat_uninit(void *arg) 2999 { 3000 linux_kobject_kfree_name(&linux_class_root); 3001 linux_kobject_kfree_name(&linux_root_device.kobj); 3002 linux_kobject_kfree_name(&linux_class_misc.kobj); 3003 3004 free(static_single_cpu_mask_lcs, M_KMALLOC); 3005 free(static_single_cpu_mask, M_KMALLOC); 3006 #if defined(__i386__) || defined(__amd64__) 3007 free(__cpu_data, M_KMALLOC); 3008 #endif 3009 3010 spin_lock_destroy(&pci_lock); 3011 rw_destroy(&linux_vma_lock); 3012 } 3013 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL); 3014 3015 /* 3016 * NOTE: Linux frequently uses "unsigned long" for pointer to integer 3017 * conversion and vice versa, where in FreeBSD "uintptr_t" would be 3018 * used. Assert these types have the same size, else some parts of the 3019 * LinuxKPI may not work like expected: 3020 */ 3021 CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t)); 3022