/*-
 * Copyright (c) 2010 Isilon Systems, Inc.
 * Copyright (c) 2010 iX Systems, Inc.
 * Copyright (c) 2010 Panasas, Inc.
 * Copyright (c) 2013-2021 Mellanox Technologies, Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_stack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/sglist.h>
#include <sys/sleepqueue.h>
#include <sys/refcount.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bus.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/rwlock.h>
#include <sys/mman.h>
#include <sys/stack.h>
#include <sys/sysent.h>
#include <sys/time.h>
#include <sys/user.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>

#include <machine/stdarg.h>

#if defined(__i386__) || defined(__amd64__)
#include <machine/md_var.h>
#endif

#include <linux/kobject.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/cdev.h>
#include <linux/file.h>
#include <linux/sysfs.h>
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
#include <linux/netdevice.h>
#include <linux/timer.h>
#include <linux/interrupt.h>
#include <linux/uaccess.h>
#include <linux/utsname.h>
#include <linux/list.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/compat.h>
#include <linux/io-mapping.h>
#include <linux/poll.h>
#include <linux/smp.h>
#include <linux/wait_bit.h>
#include <linux/rcupdate.h>
#include <linux/interval_tree.h>
#include <linux/interval_tree_generic.h>

#if defined(__i386__) || defined(__amd64__)
#include <asm/smp.h>
#include <asm/processor.h>
#endif

SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "LinuxKPI parameters");
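
/*
 * The knobs below hang off the "compat.linuxkpi" sysctl node declared
 * above.  As an illustrative example (not part of the upstream sources),
 * since they are CTLFLAG_RWTUN they can be set at run time, e.g.
 * "sysctl compat.linuxkpi.debug=1", or preset from loader.conf(5) with
 * "compat.linuxkpi.debug=1".
 */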

int linuxkpi_debug;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
    &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");

int linuxkpi_warn_dump_stack = 0;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN,
    &linuxkpi_warn_dump_stack, 0,
    "Set to enable stack traces from WARN_ON(). Clear to disable.");

static struct timeval lkpi_net_lastlog;
static int lkpi_net_curpps;
static int lkpi_net_maxpps = 99;
SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN,
    &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second.");

MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat");

#include <linux/rbtree.h>
/* Undo Linux compat changes. */
#undef RB_ROOT
#undef file
#undef cdev
#define	RB_ROOT(head)	(head)->rbh_root

static void linux_destroy_dev(struct linux_cdev *);
static void linux_cdev_deref(struct linux_cdev *ldev);
static struct vm_area_struct *linux_cdev_handle_find(void *handle);

cpumask_t cpu_online_mask;
static cpumask_t **static_single_cpu_mask;
static cpumask_t *static_single_cpu_mask_lcs;
struct kobject linux_class_root;
struct device linux_root_device;
struct class linux_class_misc;
struct list_head pci_drivers;
struct list_head pci_devices;
spinlock_t pci_lock;
struct uts_namespace init_uts_ns;

unsigned long linux_timer_hz_mask;

wait_queue_head_t linux_bit_waitq;
wait_queue_head_t linux_var_waitq;

int
panic_cmp(struct rb_node *one, struct rb_node *two)
{
	panic("no cmp");
}

RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);

#define	START(node)	((node)->start)
#define	LAST(node)	((node)->last)

INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START,
    LAST,, lkpi_interval_tree)

static void
linux_device_release(struct device *dev)
{
	pr_debug("linux_device_release: %s\n", dev_name(dev));
	kfree(dev);
}

static ssize_t
linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct class_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct class_attribute, attr);
	error = -EIO;
	if (dattr->show)
		error = dattr->show(container_of(kobj, struct class, kobj),
		    dattr, buf);
	return (error);
}

static ssize_t
linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
    size_t count)
{
	struct class_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct class_attribute, attr);
	error = -EIO;
	if (dattr->store)
		error = dattr->store(container_of(kobj, struct class, kobj),
		    dattr, buf, count);
	return (error);
}

static void
linux_class_release(struct kobject *kobj)
{
	struct class *class;

	class = container_of(kobj, struct class, kobj);
	if (class->class_release)
		class->class_release(class);
}

static const struct sysfs_ops linux_class_sysfs = {
	.show = linux_class_show,
	.store = linux_class_store,
};

const struct kobj_type linux_class_ktype = {
	.release = linux_class_release,
	.sysfs_ops = &linux_class_sysfs
};

static void
linux_dev_release(struct kobject *kobj)
{
	struct device *dev;

	dev = container_of(kobj, struct device, kobj);
	/* This is the precedence defined by linux. */
	if (dev->release)
		dev->release(dev);
	else if (dev->class && dev->class->dev_release)
		dev->class->dev_release(dev);
}

static ssize_t
linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
{
	struct device_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct device_attribute, attr);
	error = -EIO;
	if (dattr->show)
		error = dattr->show(container_of(kobj, struct device, kobj),
		    dattr, buf);
	return (error);
}

static ssize_t
linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
    size_t count)
{
	struct device_attribute *dattr;
	ssize_t error;

	dattr = container_of(attr, struct device_attribute, attr);
	error = -EIO;
	if (dattr->store)
		error = dattr->store(container_of(kobj, struct device, kobj),
		    dattr, buf, count);
	return (error);
}

static const struct sysfs_ops linux_dev_sysfs = {
	.show = linux_dev_show,
	.store = linux_dev_store,
};

const struct kobj_type linux_dev_ktype = {
	.release = linux_dev_release,
	.sysfs_ops = &linux_dev_sysfs
};

struct device *
device_create(struct class *class, struct device *parent, dev_t devt,
    void *drvdata, const char *fmt, ...)
{
	struct device *dev;
	va_list args;

	dev = kzalloc(sizeof(*dev), M_WAITOK);
	dev->parent = parent;
	dev->class = class;
	dev->devt = devt;
	dev->driver_data = drvdata;
	dev->release = linux_device_release;
	va_start(args, fmt);
	kobject_set_name_vargs(&dev->kobj, fmt, args);
	va_end(args);
	device_register(dev);

	return (dev);
}

struct device *
device_create_groups_vargs(struct class *class, struct device *parent,
    dev_t devt, void *drvdata, const struct attribute_group **groups,
    const char *fmt, va_list args)
{
	struct device *dev = NULL;
	int retval = -ENODEV;

	if (class == NULL || IS_ERR(class))
		goto error;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev) {
		retval = -ENOMEM;
		goto error;
	}

	dev->devt = devt;
	dev->class = class;
	dev->parent = parent;
	dev->groups = groups;
	dev->release = device_create_release;
	/* device_initialize() needs the class and parent to be set */
	device_initialize(dev);
	dev_set_drvdata(dev, drvdata);

	retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
	if (retval)
		goto error;

	retval = device_add(dev);
	if (retval)
		goto error;

	return dev;

error:
	put_device(dev);
	return ERR_PTR(retval);
}

struct class *
class_create(struct module *owner, const char *name)
{
	struct class *class;
	int error;

	class = kzalloc(sizeof(*class), M_WAITOK);
	class->owner = owner;
	class->name = name;
	class->class_release = linux_class_kfree;
	error = class_register(class);
	if (error) {
		kfree(class);
		return (NULL);
	}

	return (class);
}

static void
linux_kq_lock(void *arg)
{
	spinlock_t *s = arg;

	spin_lock(s);
}
static void
linux_kq_unlock(void *arg)
{
	spinlock_t *s = arg;

	spin_unlock(s);
}

static void
linux_kq_assert_lock(void *arg, int what)
{
#ifdef INVARIANTS
	spinlock_t *s = arg;

	if (what == LA_LOCKED)
		mtx_assert(&s->m, MA_OWNED);
	else
		mtx_assert(&s->m, MA_NOTOWNED);
#endif
}

static void
linux_file_kqfilter_poll(struct linux_file *, int);

struct linux_file *
linux_file_alloc(void)
{
	struct linux_file *filp;

	filp = kzalloc(sizeof(*filp), GFP_KERNEL);

	/* set initial refcount */
	filp->f_count = 1;

	/* setup fields needed by kqueue support */
	spin_lock_init(&filp->f_kqlock);
	knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
	    linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock);

	return (filp);
}

void
linux_file_free(struct linux_file *filp)
{
	if (filp->_file == NULL) {
		if (filp->f_op != NULL && filp->f_op->release != NULL)
			filp->f_op->release(filp->f_vnode, filp);
		if (filp->f_shmem != NULL)
			vm_object_deallocate(filp->f_shmem);
		kfree_rcu(filp, rcu);
	} else {
		/*
		 * The close method of the character device or file
		 * will free the linux_file structure:
		 */
		_fdrop(filp->_file, curthread);
	}
}

struct linux_cdev *
cdev_alloc(void)
{
	struct linux_cdev *cdev;

	cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);
	kobject_init(&cdev->kobj, &linux_cdev_ktype);
	cdev->refs = 1;
	return (cdev);
}

static int
linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
    vm_page_t *mres)
{
	struct vm_area_struct *vmap;

	vmap = linux_cdev_handle_find(vm_obj->handle);

	MPASS(vmap != NULL);
	MPASS(vmap->vm_private_data == vm_obj->handle);

	if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
		vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
		vm_page_t page;

		if (((*mres)->flags & PG_FICTITIOUS) != 0) {
			/*
			 * If the passed in result page is a fake
			 * page, update it with the new physical
			 * address.
			 */
			page = *mres;
			vm_page_updatefake(page, paddr, vm_obj->memattr);
		} else {
			/*
			 * Replace the passed in "mres" page with our
			 * own fake page and free up all of the
			 * original pages.
			 */
			VM_OBJECT_WUNLOCK(vm_obj);
			page = vm_page_getfake(paddr, vm_obj->memattr);
			VM_OBJECT_WLOCK(vm_obj);

			vm_page_replace(page, vm_obj, (*mres)->pindex, *mres);
			*mres = page;
		}
		vm_page_valid(page);
		return (VM_PAGER_OK);
	}
	return (VM_PAGER_FAIL);
}

static int
linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
    vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
{
	struct vm_area_struct *vmap;
	int err;

	/* get VM area structure */
	vmap = linux_cdev_handle_find(vm_obj->handle);
	MPASS(vmap != NULL);
	MPASS(vmap->vm_private_data == vm_obj->handle);

	VM_OBJECT_WUNLOCK(vm_obj);

	linux_set_current(curthread);

	down_write(&vmap->vm_mm->mmap_sem);
	if (unlikely(vmap->vm_ops == NULL)) {
		err = VM_FAULT_SIGBUS;
	} else {
		struct vm_fault vmf;

		/* fill out VM fault structure */
		vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
		vmf.flags = (fault_type & VM_PROT_WRITE) ?
		    FAULT_FLAG_WRITE : 0;
		vmf.pgoff = 0;
		vmf.page = NULL;
		vmf.vma = vmap;

		vmap->vm_pfn_count = 0;
		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
		vmap->vm_obj = vm_obj;

		err = vmap->vm_ops->fault(&vmf);

		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
			kern_yield(PRI_USER);
			err = vmap->vm_ops->fault(&vmf);
		}
	}

	/* translate return code */
	switch (err) {
	case VM_FAULT_OOM:
		err = VM_PAGER_AGAIN;
		break;
	case VM_FAULT_SIGBUS:
		err = VM_PAGER_BAD;
		break;
	case VM_FAULT_NOPAGE:
		/*
		 * By contract the fault handler will return having
		 * busied all the pages itself. If pidx is already
		 * found in the object, it will simply xbusy the first
		 * page and return with vm_pfn_count set to 1.
		 */
		*first = vmap->vm_pfn_first;
		*last = *first + vmap->vm_pfn_count - 1;
		err = VM_PAGER_OK;
		break;
	default:
		err = VM_PAGER_ERROR;
		break;
	}
	up_write(&vmap->vm_mm->mmap_sem);
	VM_OBJECT_WLOCK(vm_obj);
	return (err);
}

static struct rwlock linux_vma_lock;
static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
    TAILQ_HEAD_INITIALIZER(linux_vma_head);

static void
linux_cdev_handle_free(struct vm_area_struct *vmap)
{
	/* Drop reference on vm_file */
	if (vmap->vm_file != NULL)
		fput(vmap->vm_file);

	/* Drop reference on mm_struct */
	mmput(vmap->vm_mm);

	kfree(vmap);
}

static void
linux_cdev_handle_remove(struct vm_area_struct *vmap)
{
	rw_wlock(&linux_vma_lock);
	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
	rw_wunlock(&linux_vma_lock);
}

static struct vm_area_struct *
linux_cdev_handle_find(void *handle)
{
	struct vm_area_struct *vmap;

	rw_rlock(&linux_vma_lock);
	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
		if (vmap->vm_private_data == handle)
			break;
	}
	rw_runlock(&linux_vma_lock);
	return (vmap);
}

static int
linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
    vm_ooffset_t foff, struct ucred *cred, u_short *color)
{

	MPASS(linux_cdev_handle_find(handle) != NULL);
	*color = 0;
	return (0);
}

static void
linux_cdev_pager_dtor(void *handle)
{
	const struct vm_operations_struct *vm_ops;
	struct vm_area_struct *vmap;

	vmap = linux_cdev_handle_find(handle);
	MPASS(vmap != NULL);

	/*
	 * Remove handle before calling close operation to prevent
	 * other threads from reusing the handle pointer.
	 */
	linux_cdev_handle_remove(vmap);

	down_write(&vmap->vm_mm->mmap_sem);
	vm_ops = vmap->vm_ops;
	if (likely(vm_ops != NULL))
		vm_ops->close(vmap);
	up_write(&vmap->vm_mm->mmap_sem);

	linux_cdev_handle_free(vmap);
}

static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
	{
		/* OBJT_MGTDEVICE */
		.cdev_pg_populate = linux_cdev_pager_populate,
		.cdev_pg_ctor = linux_cdev_pager_ctor,
		.cdev_pg_dtor = linux_cdev_pager_dtor
	},
	{
		/* OBJT_DEVICE */
		.cdev_pg_fault = linux_cdev_pager_fault,
		.cdev_pg_ctor = linux_cdev_pager_ctor,
		.cdev_pg_dtor = linux_cdev_pager_dtor
	},
};

int
zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
    unsigned long size)
{
	vm_object_t obj;
	vm_page_t m;

	obj = vma->vm_obj;
	if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
		return (-ENOTSUP);
	VM_OBJECT_RLOCK(obj);
	for (m = vm_page_find_least(obj, OFF_TO_IDX(address));
	    m != NULL && m->pindex < OFF_TO_IDX(address + size);
	    m = TAILQ_NEXT(m, listq))
		pmap_remove_all(m);
	VM_OBJECT_RUNLOCK(obj);
	return (0);
}

void
vma_set_file(struct vm_area_struct *vma, struct linux_file *file)
{
	struct linux_file *tmp;

	/* Changing an anonymous vma with this is illegal */
	get_file(file);
	tmp = vma->vm_file;
	vma->vm_file = file;
	fput(tmp);
}

static struct file_operations dummy_ldev_ops = {
	/* XXXKIB */
};

static struct linux_cdev dummy_ldev = {
	.ops = &dummy_ldev_ops,
};

#define	LDEV_SI_DTR	0x0001
#define	LDEV_SI_REF	0x0002

static void
linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
    struct linux_cdev **dev)
{
	struct linux_cdev *ldev;
	u_int siref;

	ldev = filp->f_cdev;
	*fop = filp->f_op;
	if (ldev != NULL) {
		if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
			refcount_acquire(&ldev->refs);
		} else {
			for (siref = ldev->siref;;) {
				if ((siref & LDEV_SI_DTR) != 0) {
					ldev = &dummy_ldev;
					*fop = ldev->ops;
					siref = ldev->siref;
					MPASS((ldev->siref & LDEV_SI_DTR) == 0);
				} else if (atomic_fcmpset_int(&ldev->siref,
				    &siref, siref + LDEV_SI_REF)) {
					break;
				}
			}
		}
	}
	*dev = ldev;
}

static void
linux_drop_fop(struct linux_cdev *ldev)
{

	if (ldev == NULL)
		return;
	if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
		linux_cdev_deref(ldev);
	} else {
		MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
		MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
		atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
	}
}

/*
 * Evaluate "code" with the given file temporarily installed as the
 * current thread's devfs file pointer (td_fpop), restoring the previous
 * value afterwards.
 */
#define	OPW(fp,td,code) ({			\
	struct file *__fpop;			\
	__typeof(code) __retval;		\
						\
	__fpop = (td)->td_fpop;			\
	(td)->td_fpop = (fp);			\
	__retval = (code);			\
	(td)->td_fpop = __fpop;			\
	__retval;				\
})

static int
linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
    struct file *file)
{
	struct linux_cdev *ldev;
	struct linux_file *filp;
	const struct file_operations *fop;
	int error;

	ldev = dev->si_drv1;

	filp = linux_file_alloc();
	filp->f_dentry = &filp->f_dentry_store;
	filp->f_op = ldev->ops;
	filp->f_mode = file->f_flag;
	filp->f_flags = file->f_flag;
	filp->f_vnode = file->f_vnode;
	filp->_file = file;
	refcount_acquire(&ldev->refs);
	filp->f_cdev = ldev;

	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);

	if (fop->open != NULL) {
		error = -fop->open(file->f_vnode, filp);
		if (error != 0) {
			linux_drop_fop(ldev);
			linux_cdev_deref(filp->f_cdev);
			kfree(filp);
			return (error);
		}
	}

	/* hold on to the vnode - used for fstat() */
	vhold(filp->f_vnode);

	/* release the file from devfs */
	finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
	linux_drop_fop(ldev);
	return (ENXIO);
}

#define	LINUX_IOCTL_MIN_PTR 0x10000UL
#define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)

static inline int
linux_remap_address(void **uaddr, size_t len)
{
	uintptr_t uaddr_val = (uintptr_t)(*uaddr);

	if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
	    uaddr_val < LINUX_IOCTL_MAX_PTR)) {
		struct task_struct *pts = current;
		if (pts == NULL) {
			*uaddr = NULL;
			return (1);
		}

		/* compute data offset */
		uaddr_val -= LINUX_IOCTL_MIN_PTR;

		/* check that length is within bounds */
		if ((len > IOCPARM_MAX) ||
		    (uaddr_val + len) > pts->bsd_ioctl_len) {
			*uaddr = NULL;
			return (1);
		}

		/* re-add kernel buffer address */
		uaddr_val += (uintptr_t)pts->bsd_ioctl_data;

		/* update address location */
		*uaddr = (void *)uaddr_val;
		return (1);
	}
	return (0);
}

int
linux_copyin(const void *uaddr, void *kaddr, size_t len)
{
	if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
		if (uaddr == NULL)
			return (-EFAULT);
		memcpy(kaddr, uaddr, len);
		return (0);
	}
	return (-copyin(uaddr, kaddr, len));
}

int
linux_copyout(const void *kaddr, void *uaddr, size_t len)
{
	if (linux_remap_address(&uaddr, len)) {
		if (uaddr == NULL)
			return (-EFAULT);
		memcpy(uaddr, kaddr, len);
		return (0);
	}
	return (-copyout(kaddr, uaddr, len));
}

size_t
linux_clear_user(void *_uaddr, size_t _len)
{
	uint8_t *uaddr = _uaddr;
	size_t len = _len;

	/* make sure uaddr is aligned before going into the fast loop */
	while (((uintptr_t)uaddr & 7) != 0 && len > 7) {
		if (subyte(uaddr, 0))
			return (_len);
		uaddr++;
		len--;
	}

	/* zero 8 bytes at a time */
	while (len > 7) {
#ifdef __LP64__
		if (suword64(uaddr, 0))
			return (_len);
#else
		if (suword32(uaddr, 0))
			return (_len);
		if (suword32(uaddr + 4, 0))
			return (_len);
#endif
		uaddr += 8;
		len -= 8;
	}

	/* zero fill end, if any */
	while (len > 0) {
		if (subyte(uaddr, 0))
			return (_len);
		uaddr++;
		len--;
	}
	return (0);
}

int
linux_access_ok(const void *uaddr, size_t len)
{
	uintptr_t saddr;
	uintptr_t eaddr;

	/* get start and end address */
	saddr = (uintptr_t)uaddr;
	eaddr = (uintptr_t)uaddr + len;

	/* verify addresses are valid for userspace */
	return ((saddr == eaddr) ||
	    (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));
}

/*
 * This function should return either EINTR or ERESTART depending on
 * the signal type sent to this thread:
 */
static int
linux_get_error(struct task_struct *task, int error)
{
	/* check for signal type interrupt code */
	if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
		error = -linux_schedule_get_interrupt_value(task);
		if (error == 0)
			error = EINTR;
	}
	return (error);
}

static int
linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
    const struct file_operations *fop, u_long cmd, caddr_t data,
    struct thread *td)
{
	struct task_struct *task = current;
	unsigned size;
	int error;

	size = IOCPARM_LEN(cmd);
	/* refer to logic in sys_ioctl() */
	if (size > 0) {
		/*
		 * Setup hint for linux_copyin() and linux_copyout().
		 *
		 * Background: Linux code expects a user-space address
		 * while FreeBSD supplies a kernel-space address.
		 */
		task->bsd_ioctl_data = data;
		task->bsd_ioctl_len = size;
		data = (void *)LINUX_IOCTL_MIN_PTR;
	} else {
		/* fetch user-space pointer */
		data = *(void **)data;
	}
#ifdef COMPAT_FREEBSD32
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		/* try the compat IOCTL handler first */
		if (fop->compat_ioctl != NULL) {
			error = -OPW(fp, td, fop->compat_ioctl(filp,
			    cmd, (u_long)data));
		} else {
			error = ENOTTY;
		}

		/* fallback to the regular IOCTL handler, if any */
		if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
			    cmd, (u_long)data));
		}
	} else
#endif
	{
		if (fop->unlocked_ioctl != NULL) {
			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
			    cmd, (u_long)data));
		} else {
			error = ENOTTY;
		}
	}
	if (size > 0) {
		task->bsd_ioctl_data = NULL;
		task->bsd_ioctl_len = 0;
	}

	if (error == EWOULDBLOCK) {
		/* update kqfilter status, if any */
		linux_file_kqfilter_poll(filp,
		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
	} else {
		error = linux_get_error(task, error);
	}
	return (error);
}

#define	LINUX_POLL_TABLE_NORMAL ((poll_table *)1)

/*
 * This function atomically updates the poll wakeup state and returns
 * the previous state at the time of update.
 */
static uint8_t
linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
{
	int c, old;

	c = v->counter;

	while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
		c = old;

	return (c);
}

static int
linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY,	/* NOP */
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY,	/* NOP */
	};
	struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_QUEUED:
		linux_poll_wakeup(filp);
		return (1);
	default:
		return (0);
	}
}

void
linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY,	/* NOP */
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED,	/* NOP */
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
	};

	/* check if we are called inside the select system call */
	if (p == LINUX_POLL_TABLE_NORMAL)
		selrecord(curthread, &filp->f_selinfo);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_INIT:
		/* NOTE: file handles can only belong to one wait-queue */
		filp->f_wait_queue.wqh = wqh;
		filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
		add_wait_queue(wqh, &filp->f_wait_queue.wq);
		atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
		break;
	default:
		break;
	}
}

static void
linux_poll_wait_dequeue(struct linux_file *filp)
{
	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
	};

	seldrain(&filp->f_selinfo);

	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
	case LINUX_FWQ_STATE_NOT_READY:
	case LINUX_FWQ_STATE_QUEUED:
	case LINUX_FWQ_STATE_READY:
		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
		break;
	default:
		break;
	}
}

void
linux_poll_wakeup(struct linux_file *filp)
{
	/* this function should be NULL-safe */
	if (filp == NULL)
		return;

	selwakeup(&filp->f_selinfo);

	spin_lock(&filp->f_kqlock);
	filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE;

	/* make sure the "knote" gets woken up */
	KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
	spin_unlock(&filp->f_kqlock);
}

static void
linux_file_kqfilter_detach(struct knote *kn)
{
	struct linux_file *filp = kn->kn_hook;

	spin_lock(&filp->f_kqlock);
	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
	spin_unlock(&filp->f_kqlock);
}

static int
linux_file_kqfilter_read_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock.m, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
}

static int
linux_file_kqfilter_write_event(struct knote *kn, long hint)
{
	struct linux_file *filp = kn->kn_hook;

	mtx_assert(&filp->f_kqlock.m, MA_OWNED);

	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
}

static struct filterops linux_dev_kqfiltops_read = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_read_event,
};

static struct filterops linux_dev_kqfiltops_write = {
	.f_isfd = 1,
	.f_detach = linux_file_kqfilter_detach,
	.f_event = linux_file_kqfilter_write_event,
};

static void
linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
{
	struct thread *td;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int temp;

	if ((filp->f_kqflags & kqflags) == 0)
		return;

	td = curthread;

	linux_get_fop(filp, &fop, &ldev);
	/* get the latest polling state */
	temp = OPW(filp->_file, td, fop->poll(filp, NULL));
	linux_drop_fop(ldev);

	spin_lock(&filp->f_kqlock);
	/* clear kqflags */
	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
	    LINUX_KQ_FLAG_NEED_WRITE);
	/* update kqflags */
	if ((temp & (POLLIN | POLLOUT)) != 0) {
		if ((temp & POLLIN) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
		if ((temp & POLLOUT) != 0)
			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;

		/* make sure the "knote" gets woken up */
		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
	}
	spin_unlock(&filp->f_kqlock);
}

static int
linux_file_kqfilter(struct file *file, struct knote *kn)
{
	struct linux_file *filp;
	struct thread *td;
	int error;

	td = curthread;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	if (filp->f_op->poll == NULL)
		return (EINVAL);

	spin_lock(&filp->f_kqlock);
	switch (kn->kn_filter) {
	case EVFILT_READ:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
		kn->kn_fop = &linux_dev_kqfiltops_read;
		kn->kn_hook = filp;
		knlist_add(&filp->f_selinfo.si_note, kn, 1);
		error = 0;
		break;
	case EVFILT_WRITE:
		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
		kn->kn_fop = &linux_dev_kqfiltops_write;
		kn->kn_hook = filp;
		knlist_add(&filp->f_selinfo.si_note, kn, 1);
		error = 0;
		break;
	default:
		error = EINVAL;
		break;
	}
	spin_unlock(&filp->f_kqlock);

	if (error == 0) {
		linux_set_current(td);

		/* update kqfilter status, if any */
		linux_file_kqfilter_poll(filp,
		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
	}
	return (error);
}

static int
linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
    vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
    int nprot, bool is_shared, struct thread *td)
{
	struct task_struct *task;
	struct vm_area_struct *vmap;
	struct mm_struct *mm;
	struct linux_file *filp;
	vm_memattr_t attr;
	int error;

	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;

	if (fop->mmap == NULL)
		return (EOPNOTSUPP);

	linux_set_current(td);

	/*
	 * The same VM object might be shared by multiple processes and
	 * the mm_struct is usually freed when a process exits.
	 *
	 * The atomic reference below makes sure the mm_struct is
	 * available as long as the vmap is in the linux_vma_head.
	 */
	task = current;
	mm = task->mm;
	if (atomic_inc_not_zero(&mm->mm_users) == 0)
		return (EINVAL);

	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
	vmap->vm_start = 0;
	vmap->vm_end = size;
	vmap->vm_pgoff = *offset / PAGE_SIZE;
	vmap->vm_pfn = 0;
	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
	if (is_shared)
		vmap->vm_flags |= VM_SHARED;
	vmap->vm_ops = NULL;
	vmap->vm_file = get_file(filp);
	vmap->vm_mm = mm;

	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
		error = linux_get_error(task, EINTR);
	} else {
		error = -OPW(fp, td, fop->mmap(filp, vmap));
		error = linux_get_error(task, error);
		up_write(&vmap->vm_mm->mmap_sem);
	}

	if (error != 0) {
		linux_cdev_handle_free(vmap);
		return (error);
	}

	attr = pgprot2cachemode(vmap->vm_page_prot);

	if (vmap->vm_ops != NULL) {
		struct vm_area_struct *ptr;
		void *vm_private_data;
		bool vm_no_fault;

		if (vmap->vm_ops->open == NULL ||
		    vmap->vm_ops->close == NULL ||
		    vmap->vm_private_data == NULL) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			return (EINVAL);
		}

		vm_private_data = vmap->vm_private_data;

		rw_wlock(&linux_vma_lock);
		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
			if (ptr->vm_private_data == vm_private_data)
				break;
		}
		/* check if there is an existing VM area struct */
		if (ptr != NULL) {
			/* check if the VM area structure is invalid */
			if (ptr->vm_ops == NULL ||
			    ptr->vm_ops->open == NULL ||
			    ptr->vm_ops->close == NULL) {
				error = ESTALE;
				vm_no_fault = 1;
			} else {
				error = EEXIST;
				vm_no_fault = (ptr->vm_ops->fault == NULL);
			}
		} else {
			/* insert VM area structure into list */
			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
			error = 0;
			vm_no_fault = (vmap->vm_ops->fault == NULL);
		}
		rw_wunlock(&linux_vma_lock);

		if (error != 0) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			/* check for stale VM area struct */
			if (error != EEXIST)
				return (error);
		}

		/* check if there is no fault handler */
		if (vm_no_fault) {
			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
			    &linux_cdev_pager_ops[1], size, nprot, *offset,
			    td->td_ucred);
		} else {
			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
			    &linux_cdev_pager_ops[0], size, nprot, *offset,
			    td->td_ucred);
		}

		/* check if allocating the VM object failed */
		if (*object == NULL) {
			if (error == 0) {
				/* remove VM area struct from list */
				linux_cdev_handle_remove(vmap);
				/* free allocated VM area struct */
				linux_cdev_handle_free(vmap);
			}
			return (EINVAL);
		}
	} else {
		struct sglist *sg;

		sg = sglist_alloc(1, M_WAITOK);
		sglist_append_phys(sg,
		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);

		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
		    nprot, 0, td->td_ucred);

		linux_cdev_handle_free(vmap);

		if (*object == NULL) {
			sglist_free(sg);
			return (EINVAL);
		}
	}

	if (attr != VM_MEMATTR_DEFAULT) {
		VM_OBJECT_WLOCK(*object);
		vm_object_set_memattr(*object, attr);
		VM_OBJECT_WUNLOCK(*object);
	}
	*offset = 0;
	return (0);
}

struct cdevsw linuxcdevsw = {
	.d_version = D_VERSION,
	.d_fdopen = linux_dev_fdopen,
	.d_name = "lkpidev",
};

static int
linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	error = 0;
	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->read != NULL) {
		bytes = OPW(file, td, fop->read(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
	linux_drop_fop(ldev);

	return (error);
}

static int
linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	ssize_t bytes;
	int error;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	/* XXX no support for I/O vectors currently */
	if (uio->uio_iovcnt != 1)
		return (EOPNOTSUPP);
	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
		return (EINVAL);
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->write != NULL) {
		bytes = OPW(file, td, fop->write(filp,
		    uio->uio_iov->iov_base,
		    uio->uio_iov->iov_len, &uio->uio_offset));
		if (bytes >= 0) {
			uio->uio_iov->iov_base =
			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
			uio->uio_iov->iov_len -= bytes;
			uio->uio_resid -= bytes;
			error = 0;
		} else {
			error = linux_get_error(current, -bytes);
		}
	} else
		error = ENXIO;

	/* update kqfilter status, if any */
	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);

	linux_drop_fop(ldev);

	return (error);
}

static int
linux_file_poll(struct file *file, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int revents;

	filp = (struct linux_file *)file->f_data;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_get_fop(filp, &fop, &ldev);
	if (fop->poll != NULL) {
		revents = OPW(file, td, fop->poll(filp,
		    LINUX_POLL_TABLE_NORMAL)) & events;
	} else {
		revents = 0;
	}
	linux_drop_fop(ldev);
	return (revents);
}

static int
linux_file_close(struct file *file, struct thread *td)
{
	struct linux_file *filp;
	int (*release)(struct inode *, struct linux_file *);
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	int error;

	filp = (struct linux_file *)file->f_data;

	KASSERT(file_count(filp) == 0,
	    ("File refcount(%d) is not zero", file_count(filp)));

	if (td == NULL)
		td = curthread;

	error = 0;
	filp->f_flags = file->f_flag;
	linux_set_current(td);
	linux_poll_wait_dequeue(filp);
	linux_get_fop(filp, &fop, &ldev);
	/*
	 * Always use the real release function, if any, to avoid
	 * leaking device resources:
	 */
	release = filp->f_op->release;
	if (release != NULL)
		error = -OPW(file, td, release(filp->f_vnode, filp));
	funsetown(&filp->f_sigio);
	if (filp->f_vnode != NULL)
		vdrop(filp->f_vnode);
	linux_drop_fop(ldev);
	ldev = filp->f_cdev;
	if (ldev != NULL)
		linux_cdev_deref(ldev);
	linux_synchronize_rcu(RCU_TYPE_REGULAR);
	kfree(filp);

	return (error);
}

static int
linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct fiodgname_arg *fgn;
	const char *p;
	int error, i;

	error = 0;
	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;
	linux_get_fop(filp, &fop, &ldev);

	linux_set_current(td);
	switch (cmd) {
	case FIONBIO:
		break;
	case FIOASYNC:
		if (fop->fasync == NULL)
			break;
		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
		break;
	case FIOSETOWN:
		error = fsetown(*(int *)data, &filp->f_sigio);
		if (error == 0) {
			if (fop->fasync == NULL)
				break;
			error = -OPW(fp, td, fop->fasync(0, filp,
			    fp->f_flag & FASYNC));
		}
		break;
	case FIOGETOWN:
		*(int *)data = fgetown(&filp->f_sigio);
		break;
	case FIODGNAME:
#ifdef COMPAT_FREEBSD32
	case FIODGNAME_32:
#endif
		if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
			error = ENXIO;
			break;
		}
		fgn = data;
		p = devtoname(filp->f_cdev->cdev);
		i = strlen(p) + 1;
		if (i > fgn->len) {
			error = EINVAL;
			break;
		}
		error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i);
		break;
	default:
		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
		break;
	}
	linux_drop_fop(ldev);
	return (error);
}

static int
linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
    vm_prot_t maxprot, int flags, struct file *fp,
    vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
{
	/*
	 * Character devices do not provide private mappings
	 * of any kind:
	 */
	if ((maxprot & VM_PROT_WRITE) == 0 &&
	    (prot & VM_PROT_WRITE) != 0)
		return (EACCES);
	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)
		return (EINVAL);

	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
	    (int)prot, (flags & MAP_SHARED) ?
	    true : false, td));
}

static int
linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
    vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
    struct thread *td)
{
	struct linux_file *filp;
	const struct file_operations *fop;
	struct linux_cdev *ldev;
	struct mount *mp;
	struct vnode *vp;
	vm_object_t object;
	vm_prot_t maxprot;
	int error;

	filp = (struct linux_file *)fp->f_data;

	vp = filp->f_vnode;
	if (vp == NULL)
		return (EOPNOTSUPP);

	/*
	 * Ensure that file and memory protections are
	 * compatible.
	 */
	mp = vp->v_mount;
	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
		maxprot = VM_PROT_NONE;
		if ((prot & VM_PROT_EXECUTE) != 0)
			return (EACCES);
	} else
		maxprot = VM_PROT_EXECUTE;
	if ((fp->f_flag & FREAD) != 0)
		maxprot |= VM_PROT_READ;
	else if ((prot & VM_PROT_READ) != 0)
		return (EACCES);

	/*
	 * If we are sharing potential changes via MAP_SHARED and we
	 * are trying to get write permission although we opened it
	 * without asking for it, bail out.
	 *
	 * Note that most character devices always share mappings.
	 *
	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
	 * requests rather than doing it here.
	 */
	if ((flags & MAP_SHARED) != 0) {
		if ((fp->f_flag & FWRITE) != 0)
			maxprot |= VM_PROT_WRITE;
		else if ((prot & VM_PROT_WRITE) != 0)
			return (EACCES);
	}
	maxprot &= cap_maxprot;

	linux_get_fop(filp, &fop, &ldev);
	error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp,
	    &foff, fop, &object);
	if (error != 0)
		goto out;

	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
	    foff, FALSE, td);
	if (error != 0)
		vm_object_deallocate(object);
out:
	linux_drop_fop(ldev);
	return (error);
}

static int
linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = (struct linux_file *)fp->f_data;
	if (filp->f_vnode == NULL)
		return (EOPNOTSUPP);

	vp = filp->f_vnode;

	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED);
	VOP_UNLOCK(vp);

	return (error);
}

static int
linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{
	struct linux_file *filp;
	struct vnode *vp;
	int error;

	filp = fp->f_data;
	vp = filp->f_vnode;
	if (vp == NULL) {
		error = 0;
		kif->kf_type = KF_TYPE_DEV;
	} else {
		vref(vp);
		FILEDESC_SUNLOCK(fdp);
		error = vn_fill_kinfo_vnode(vp, kif);
		vrele(vp);
		kif->kf_type = KF_TYPE_VNODE;
		FILEDESC_SLOCK(fdp);
	}
	return (error);
}

unsigned int
linux_iminor(struct inode *inode)
{
	struct linux_cdev *ldev;

	if (inode == NULL || inode->v_rdev == NULL ||
	    inode->v_rdev->si_devsw != &linuxcdevsw)
		return (-1U);
	ldev = inode->v_rdev->si_drv1;
	if (ldev == NULL)
		return (-1U);

	return (minor(ldev->dev));
}

struct fileops linuxfileops = {
	.fo_read = linux_file_read,
	.fo_write = linux_file_write,
	.fo_truncate = invfo_truncate,
	.fo_kqfilter = linux_file_kqfilter,
	.fo_stat = linux_file_stat,
	.fo_fill_kinfo = linux_file_fill_kinfo,
	.fo_poll = linux_file_poll,
	.fo_close = linux_file_close,
	.fo_ioctl = linux_file_ioctl,
	.fo_mmap = linux_file_mmap,
	.fo_chmod = invfo_chmod,
	.fo_chown = invfo_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_flags = DFLAG_PASSABLE,
};

/*
 * Hash of vmmap addresses.  This is infrequently accessed and does not
 * need to be particularly large.  This is done because we must store the
 * caller's idea of the map size to properly unmap.
 */
struct vmmap {
	LIST_ENTRY(vmmap)	vm_next;
	void			*vm_addr;
	unsigned long		vm_size;
};

struct vmmaphd {
	struct vmmap *lh_first;
};
#define	VMMAP_HASH_SIZE	64
#define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
#define	VM_HASH(addr)	((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK
static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
static struct mtx vmmaplock;

static void
vmmap_add(void *addr, unsigned long size)
{
	struct vmmap *vmmap;

	vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
	mtx_lock(&vmmaplock);
	vmmap->vm_size = size;
	vmmap->vm_addr = addr;
	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
	mtx_unlock(&vmmaplock);
}

static struct vmmap *
vmmap_remove(void *addr)
{
	struct vmmap *vmmap;

	mtx_lock(&vmmaplock);
	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
		if (vmmap->vm_addr == addr)
			break;
	if (vmmap)
		LIST_REMOVE(vmmap, vm_next);
	mtx_unlock(&vmmaplock);

	return (vmmap);
}

#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv)
void *
_ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
{
	void *addr;

	addr = pmap_mapdev_attr(phys_addr, size, attr);
	if (addr == NULL)
		return (NULL);
	vmmap_add(addr, size);

	return (addr);
}
#endif

void
iounmap(void *addr)
{
	struct vmmap *vmmap;

	vmmap = vmmap_remove(addr);
	if (vmmap == NULL)
		return;
#if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__) || defined(__riscv)
	pmap_unmapdev(addr, vmmap->vm_size);
#endif
	kfree(vmmap);
}

void *
vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
{
	vm_offset_t off;
	size_t size;

	size = count * PAGE_SIZE;
	off = kva_alloc(size);
	if (off == 0)
		return (NULL);
	vmmap_add((void *)off, size);
	pmap_qenter(off, pages, count);

	return ((void *)off);
}

void
vunmap(void *addr)
{
	struct vmmap *vmmap;

	vmmap = vmmap_remove(addr);
	if (vmmap == NULL)
		return;
	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
	kva_free((vm_offset_t)addr, vmmap->vm_size);
	kfree(vmmap);
}

static char *
devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap)
{
	unsigned int len;
	char *p;
	va_list aq;

	va_copy(aq, ap);
	len = vsnprintf(NULL, 0, fmt, aq);
	va_end(aq);

	if (dev != NULL)
		p = devm_kmalloc(dev, len + 1, gfp);
	else
		p = kmalloc(len + 1, gfp);
	if (p != NULL)
		vsnprintf(p, len + 1, fmt, ap);

	return (p);
}

char *
kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{

	return (devm_kvasprintf(NULL, gfp, fmt, ap));
}

char *
lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = devm_kvasprintf(dev, gfp, fmt, ap);
	va_end(ap);

	return (p);
}

char *
kasprintf(gfp_t gfp, const char *fmt, ...)
{
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = kvasprintf(gfp, fmt, ap);
	va_end(ap);

	return (p);
}

static void
linux_timer_callback_wrapper(void *context)
{
	struct timer_list *timer;

	timer = context;

	if (linux_set_current_flags(curthread, M_NOWAIT)) {
		/* try again later */
		callout_reset(&timer->callout, 1,
		    &linux_timer_callback_wrapper, timer);
		return;
	}

	timer->function(timer->data);
}

int
mod_timer(struct timer_list *timer, int expires)
{
	int ret;

	timer->expires = expires;
	ret = callout_reset(&timer->callout,
	    linux_timer_jiffies_until(expires),
	    &linux_timer_callback_wrapper, timer);

	MPASS(ret == 0 || ret == 1);

	return (ret == 1);
}

void
add_timer(struct timer_list *timer)
{

	callout_reset(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer);
}

void
add_timer_on(struct timer_list *timer, int cpu)
{

	callout_reset_on(&timer->callout,
	    linux_timer_jiffies_until(timer->expires),
	    &linux_timer_callback_wrapper, timer, cpu);
}

int
del_timer(struct timer_list *timer)
{

	if (callout_stop(&(timer)->callout) == -1)
		return (0);
	return (1);
}

int
del_timer_sync(struct timer_list *timer)
{

	if (callout_drain(&(timer)->callout) == -1)
		return (0);
	return (1);
}

int
timer_delete_sync(struct timer_list *timer)
{

	return (del_timer_sync(timer));
}

int
timer_shutdown_sync(struct timer_list *timer)
{

	return (del_timer_sync(timer));
}

/* greatest common divisor, Euclid's algorithm */
static uint64_t
lkpi_gcd_64(uint64_t a, uint64_t b)
{
	uint64_t an;
	uint64_t bn;

	while (b != 0) {
		an = b;
		bn = a % b;
		a = an;
		b = bn;
	}
	return (a);
}

uint64_t lkpi_nsec2hz_rem;
uint64_t lkpi_nsec2hz_div = 1000000000ULL;
uint64_t lkpi_nsec2hz_max;

uint64_t lkpi_usec2hz_rem;
uint64_t lkpi_usec2hz_div = 1000000ULL;
uint64_t lkpi_usec2hz_max;

uint64_t lkpi_msec2hz_rem;
uint64_t lkpi_msec2hz_div = 1000ULL;
uint64_t lkpi_msec2hz_max;

static void
linux_timer_init(void *arg)
{
	uint64_t gcd;

	/*
	 * Compute an internal HZ value which can divide 2**32 to
	 * avoid timer rounding problems when the tick value wraps
	 * around 2**32:
	 */
	linux_timer_hz_mask = 1;
	while (linux_timer_hz_mask < (unsigned long)hz)
		linux_timer_hz_mask *= 2;
	linux_timer_hz_mask--;

	/* compute some internal constants */

	lkpi_nsec2hz_rem = hz;
	lkpi_usec2hz_rem = hz;
	lkpi_msec2hz_rem = hz;

	gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div);
	lkpi_nsec2hz_rem /= gcd;
	lkpi_nsec2hz_div /= gcd;
	lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem;

	gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div);
	lkpi_usec2hz_rem /= gcd;
	lkpi_usec2hz_div /= gcd;
	lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem;

	gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div);
	lkpi_msec2hz_rem /= gcd;
	lkpi_msec2hz_div /= gcd;
	lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem;
}
SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);

void
linux_complete_common(struct completion *c, int all)
{
	int wakeup_swapper;

	sleepq_lock(c);
	if (all) {
		c->done = UINT_MAX;
		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
	} else {
		if (c->done != UINT_MAX)
			c->done++;
		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
	}
	sleepq_release(c);
	if (wakeup_swapper)
		kick_proc0();
}

/*
 * Indefinite wait for done != 0 with or without signals.
 */
int
linux_wait_for_common(struct completion *c, int flags)
{
	struct task_struct *task;
	int error;

	if (SCHEDULER_STOPPED())
		return (0);

	task = current;

	if (flags != 0)
		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
	else
		flags = SLEEPQ_SLEEP;
	error = 0;
	for (;;) {
		sleepq_lock(c);
		if (c->done)
			break;
		sleepq_add(c, NULL, "completion", flags, 0);
		if (flags & SLEEPQ_INTERRUPTIBLE) {
			DROP_GIANT();
			error = -sleepq_wait_sig(c, 0);
			PICKUP_GIANT();
			if (error != 0) {
				linux_schedule_save_interrupt_value(task, error);
				error = -ERESTARTSYS;
				goto intr;
			}
		} else {
			DROP_GIANT();
			sleepq_wait(c, 0);
			PICKUP_GIANT();
		}
	}
	if (c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);

intr:
	return (error);
}

/*
 * Time limited wait for done != 0 with or without signals.
 */
int
linux_wait_for_timeout_common(struct completion *c, int timeout, int flags)
{
	struct task_struct *task;
	int end = jiffies + timeout;
	int error;

	if (SCHEDULER_STOPPED())
		return (0);

	task = current;

	if (flags != 0)
		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
	else
		flags = SLEEPQ_SLEEP;

	for (;;) {
		sleepq_lock(c);
		if (c->done)
			break;
		sleepq_add(c, NULL, "completion", flags, 0);
		sleepq_set_timeout(c, linux_timer_jiffies_until(end));

		DROP_GIANT();
		if (flags & SLEEPQ_INTERRUPTIBLE)
			error = -sleepq_timedwait_sig(c, 0);
		else
			error = -sleepq_timedwait(c, 0);
		PICKUP_GIANT();

		if (error != 0) {
			/* check for timeout */
			if (error == -EWOULDBLOCK) {
				error = 0;	/* timeout */
			} else {
				/* signal happened */
				linux_schedule_save_interrupt_value(task, error);
				error = -ERESTARTSYS;
			}
			goto done;
		}
	}
	if (c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);

	/* return how many jiffies are left */
	error = linux_timer_jiffies_until(end);
done:
	return (error);
}

int
linux_try_wait_for_completion(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	if (c->done != 0 && c->done != UINT_MAX)
		c->done--;
	sleepq_release(c);
	return (isdone);
}

int
linux_completion_done(struct completion *c)
{
	int isdone;

	sleepq_lock(c);
	isdone = (c->done != 0);
	sleepq_release(c);
	return (isdone);
}

static void
linux_cdev_deref(struct linux_cdev *ldev)
{
	if (refcount_release(&ldev->refs) &&
	    ldev->kobj.ktype == &linux_cdev_ktype)
		kfree(ldev);
}

static void
linux_cdev_release(struct kobject *kobj)
{
	struct linux_cdev *cdev;
	struct kobject *parent;

	cdev = container_of(kobj, struct linux_cdev, kobj);
	parent = kobj->parent;
	linux_destroy_dev(cdev);
	linux_cdev_deref(cdev);
	kobject_put(parent);
}

static void
linux_cdev_static_release(struct kobject *kobj)
{
	struct cdev *cdev;
	struct linux_cdev *ldev;

	ldev = container_of(kobj, struct linux_cdev, kobj);
	cdev = ldev->cdev;
	if (cdev != NULL) {
		destroy_dev(cdev);
		ldev->cdev = NULL;
	}
	kobject_put(kobj->parent);
}

int
linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev)
{
	int ret;

	if (dev->devt != 0) {
		/* Set parent kernel object. */
		ldev->kobj.parent = &dev->kobj;

		/*
		 * Unlike Linux we require the kobject of the
		 * character device structure to have a valid name
		 * before calling this function:
		 */
		if (ldev->kobj.name == NULL)
			return (-EINVAL);

		ret = cdev_add(ldev, dev->devt, 1);
		if (ret)
			return (ret);
	}
	ret = device_add(dev);
	if (ret != 0 && dev->devt != 0)
		cdev_del(ldev);
	return (ret);
}

void
linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev)
{
	device_del(dev);

	if (dev->devt != 0)
		cdev_del(ldev);
}

static void
linux_destroy_dev(struct linux_cdev *ldev)
{

	if (ldev->cdev == NULL)
		return;

	MPASS((ldev->siref & LDEV_SI_DTR) == 0);
	MPASS(ldev->kobj.ktype == &linux_cdev_ktype);

	atomic_set_int(&ldev->siref, LDEV_SI_DTR);
	while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
		pause("ldevdtr", hz / 4);

	destroy_dev(ldev->cdev);
	ldev->cdev = NULL;
}

const struct kobj_type linux_cdev_ktype = {
	.release = linux_cdev_release,
};

const struct kobj_type linux_cdev_static_ktype = {
	.release = linux_cdev_static_release,
};

static void
linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	if (linkstate == LINK_STATE_UP)
		nb->notifier_call(nb, NETDEV_UP, &ni);
	else
		nb->notifier_call(nb, NETDEV_DOWN, &ni);
}

static void
linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_REGISTER, &ni);
}

static void
linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_UNREGISTER, &ni);
}

static void
linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni);
}

static void
linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
{
	struct notifier_block *nb;
	struct netdev_notifier_info ni;

	nb = arg;
	ni.ifp = ifp;
	ni.dev = (struct net_device *)ifp;
	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni);
}

int
register_netdevice_notifier(struct notifier_block *nb)
{

	nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
	    ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
	nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
	    ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
	    ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
	nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
	    iflladdr_event, linux_handle_iflladdr_event, nb, 0);

	return (0);
}
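
/*
 * Illustrative sketch (not compiled): registering a netdevice notifier from a
 * LinuxKPI consumer.  register_netdevice_notifier() above wires the
 * notifier_call into the corresponding FreeBSD ifnet eventhandlers, which
 * then invoke it with a pointer to the on-stack netdev_notifier_info.  The
 * 'lkpi_example_*' names are hypothetical and the exact callback prototype
 * is whatever <linux/notifier.h> declares.
 */
#if 0
static int
lkpi_example_netdev_event(struct notifier_block *nb, unsigned long event,
    void *data)
{
	struct netdev_notifier_info *ni = data;

	switch (event) {
	case NETDEV_UP:
		/* Interface 'ni->dev' reported link up. */
		break;
	case NETDEV_DOWN:
		/* Interface 'ni->dev' reported link down. */
		break;
	default:
		break;
	}
	return (NOTIFY_DONE);
}

static struct notifier_block lkpi_example_nb = {
	.notifier_call = lkpi_example_netdev_event,
};

/*
 * register_netdevice_notifier(&lkpi_example_nb);
 * ...
 * unregister_netdevice_notifier(&lkpi_example_nb);
 */
#endif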
int
register_inetaddr_notifier(struct notifier_block *nb)
{

	nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
	    ifaddr_event, linux_handle_ifaddr_event, nb, 0);
	return (0);
}

int
unregister_netdevice_notifier(struct notifier_block *nb)
{

	EVENTHANDLER_DEREGISTER(ifnet_link_event,
	    nb->tags[NETDEV_UP]);
	EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
	    nb->tags[NETDEV_REGISTER]);
	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
	    nb->tags[NETDEV_UNREGISTER]);
	EVENTHANDLER_DEREGISTER(iflladdr_event,
	    nb->tags[NETDEV_CHANGEADDR]);

	return (0);
}

int
unregister_inetaddr_notifier(struct notifier_block *nb)
{

	EVENTHANDLER_DEREGISTER(ifaddr_event,
	    nb->tags[NETDEV_CHANGEIFADDR]);

	return (0);
}

struct list_sort_thunk {
	int (*cmp)(void *, struct list_head *, struct list_head *);
	void *priv;
};

static inline int
linux_le_cmp(const void *d1, const void *d2, void *priv)
{
	struct list_head *le1, *le2;
	struct list_sort_thunk *thunk;

	thunk = priv;
	le1 = *(__DECONST(struct list_head **, d1));
	le2 = *(__DECONST(struct list_head **, d2));
	return ((thunk->cmp)(thunk->priv, le1, le2));
}

void
list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
    struct list_head *a, struct list_head *b))
{
	struct list_sort_thunk thunk;
	struct list_head **ar, *le;
	size_t count, i;

	count = 0;
	list_for_each(le, head)
		count++;
	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
	i = 0;
	list_for_each(le, head)
		ar[i++] = le;
	thunk.cmp = cmp;
	thunk.priv = priv;
	qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk);
	INIT_LIST_HEAD(head);
	for (i = 0; i < count; i++)
		list_add_tail(ar[i], head);
	free(ar, M_KMALLOC);
}
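
/*
 * Illustrative sketch (not compiled): sorting a Linux-style list with the
 * list_sort() shim above.  'struct lkpi_example_item' and the comparison
 * routine are hypothetical; the cmp signature matches the one list_sort()
 * takes.
 */
#if 0
struct lkpi_example_item {
	int key;
	struct list_head entry;
};

static int
lkpi_example_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct lkpi_example_item *ia, *ib;

	ia = list_entry(a, struct lkpi_example_item, entry);
	ib = list_entry(b, struct lkpi_example_item, entry);
	/* Ascending order by key. */
	return ((ia->key > ib->key) - (ia->key < ib->key));
}

/* list_sort(NULL, &some_list_head, lkpi_example_cmp); */
#endif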
#if defined(__i386__) || defined(__amd64__)
int
linux_wbinvd_on_all_cpus(void)
{

	pmap_invalidate_cache();
	return (0);
}
#endif

int
linux_on_each_cpu(void callback(void *), void *data)
{

	smp_rendezvous(smp_no_rendezvous_barrier, callback,
	    smp_no_rendezvous_barrier, data);
	return (0);
}

int
linux_in_atomic(void)
{

	return ((curthread->td_pflags & TDP_NOFAULTING) != 0);
}

struct linux_cdev *
linux_find_cdev(const char *name, unsigned major, unsigned minor)
{
	dev_t dev = MKDEV(major, minor);
	struct cdev *cdev;

	dev_lock();
	LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
		struct linux_cdev *ldev = cdev->si_drv1;
		if (ldev->dev == dev &&
		    strcmp(kobject_name(&ldev->kobj), name) == 0) {
			break;
		}
	}
	dev_unlock();

	return (cdev != NULL ? cdev->si_drv1 : NULL);
}

int
__register_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name,
    const struct file_operations *fops)
{
	struct linux_cdev *cdev;
	int ret = 0;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdev = cdev_alloc();
		cdev->ops = fops;
		kobject_set_name(&cdev->kobj, name);

		ret = cdev_add(cdev, makedev(major, i), 1);
		if (ret != 0)
			break;
	}
	return (ret);
}

int
__register_chrdev_p(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name,
    const struct file_operations *fops, uid_t uid,
    gid_t gid, int mode)
{
	struct linux_cdev *cdev;
	int ret = 0;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdev = cdev_alloc();
		cdev->ops = fops;
		kobject_set_name(&cdev->kobj, name);

		ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
		if (ret != 0)
			break;
	}
	return (ret);
}

void
__unregister_chrdev(unsigned int major, unsigned int baseminor,
    unsigned int count, const char *name)
{
	struct linux_cdev *cdevp;
	int i;

	for (i = baseminor; i < baseminor + count; i++) {
		cdevp = linux_find_cdev(name, major, i);
		if (cdevp != NULL)
			cdev_del(cdevp);
	}
}

void
linux_dump_stack(void)
{
#ifdef STACK
	struct stack st;

	stack_save(&st);
	stack_print(&st);
#endif
}

int
linuxkpi_net_ratelimit(void)
{

	return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps,
	    lkpi_net_maxpps));
}

struct io_mapping *
io_mapping_create_wc(resource_size_t base, unsigned long size)
{
	struct io_mapping *mapping;

	mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
	if (mapping == NULL)
		return (NULL);
	return (io_mapping_init_wc(mapping, base, size));
}

#if defined(__i386__) || defined(__amd64__)
bool linux_cpu_has_clflush;
struct cpuinfo_x86 boot_cpu_data;
struct cpuinfo_x86 *__cpu_data;
#endif

cpumask_t *
lkpi_get_static_single_cpu_mask(int cpuid)
{

	KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n",
	    __func__, cpuid));
	KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n",
	    __func__, cpuid));

	return (static_single_cpu_mask[cpuid]);
}

static void
linux_compat_init(void *arg)
{
	struct sysctl_oid *rootoid;
	int i;

#if defined(__i386__) || defined(__amd64__)
	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
	boot_cpu_data.x86_clflush_size = cpu_clflush_line_size;
	boot_cpu_data.x86_max_cores = mp_ncpus;
	boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id);
	boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id);

	__cpu_data = mallocarray(mp_maxid + 1,
	    sizeof(*__cpu_data), M_KMALLOC, M_WAITOK | M_ZERO);
	CPU_FOREACH(i) {
		__cpu_data[i].x86_clflush_size = cpu_clflush_line_size;
		__cpu_data[i].x86_max_cores = mp_ncpus;
		__cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id);
		__cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id);
	}
#endif
	rw_init(&linux_vma_lock, "lkpi-vma-lock");

	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
	kobject_init(&linux_class_root, &linux_class_ktype);
	kobject_set_name(&linux_class_root, "class");
	linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
	kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
	kobject_set_name(&linux_root_device.kobj, "device");
	linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device");
	linux_root_device.bsddev = root_bus;
	linux_class_misc.name = "misc";
	class_register(&linux_class_misc);
	INIT_LIST_HEAD(&pci_drivers);
	INIT_LIST_HEAD(&pci_devices);
	spin_lock_init(&pci_lock);
	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
	for (i = 0; i < VMMAP_HASH_SIZE; i++)
		LIST_INIT(&vmmaphead[i]);
	init_waitqueue_head(&linux_bit_waitq);
	init_waitqueue_head(&linux_var_waitq);

	CPU_COPY(&all_cpus, &cpu_online_mask);
	/*
	 * Generate a single-CPU cpumask_t for each CPU (possibly) in the
	 * system.  CPUs are indexed from 0..(mp_maxid).  The entry for
	 * cpuid 0 will only have itself in the cpumask, cpuid 1 only itself
	 * on entry 1, and so on.  This is used by cpumask_of() (and possibly
	 * others in the future) for, e.g., drivers to pass hints to
	 * irq_set_affinity_hint().
	 */
	static_single_cpu_mask = mallocarray(mp_maxid + 1,
	    sizeof(static_single_cpu_mask), M_KMALLOC, M_WAITOK | M_ZERO);

	/*
	 * When the number of CPUs reaches a threshold, we start to save
	 * memory, given the sets are static, by overlapping those having
	 * their single bit set at the same position in a bitset word.
	 * Asymptotically, this regular scheme is in O(n²) whereas the
	 * overlapping one is in O(n) only, with n being the maximum number
	 * of CPUs, so the gain becomes huge quite quickly.  The threshold
	 * for 64-bit architectures is 128 CPUs.
	 */
	if (mp_ncpus < (2 * _BITSET_BITS)) {
		cpumask_t *sscm_ptr;

		/*
		 * This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) *
		 * (_BITSET_BITS / 8)' bytes (for comparison with the
		 * overlapping scheme).
		 */
		static_single_cpu_mask_lcs = mallocarray(mp_ncpus,
		    sizeof(*static_single_cpu_mask_lcs),
		    M_KMALLOC, M_WAITOK | M_ZERO);

		sscm_ptr = static_single_cpu_mask_lcs;
		CPU_FOREACH(i) {
			static_single_cpu_mask[i] = sscm_ptr++;
			CPU_SET(i, static_single_cpu_mask[i]);
		}
	} else {
		/* Pointer to a bitset word. */
		__typeof(((cpuset_t *)NULL)->__bits[0]) *bwp;

		/*
		 * Allocate memory for (static) spans of 'cpumask_t'
		 * ('cpuset_t' really) with a single bit set that can be
		 * reused for all single CPU masks by making them start at
		 * different offsets.  We need
		 * '__bitset_words(CPU_SETSIZE) - 1' bitset words before the
		 * word having its single bit set, and the same amount after.
		 */
		static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS,
		    (2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8),
		    M_KMALLOC, M_WAITOK | M_ZERO);

		/*
		 * We rely below on cpuset_t and the bitset generic
		 * implementation assigning words in the '__bits' array in the
		 * same order of bits (i.e., little-endian ordering, not to be
		 * confused with machine endianness, which concerns bits in
		 * words and other integers).  This is an imperfect test, but
		 * it will detect a change to big-endian ordering.
		 */
		_Static_assert(
		    __bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1,
		    "Assumes a bitset implementation that is little-endian "
		    "on its words");

		/* Initialize the single bit of each static span. */
		bwp = (__typeof(bwp))static_single_cpu_mask_lcs +
		    (__bitset_words(CPU_SETSIZE) - 1);
		for (i = 0; i < _BITSET_BITS; i++) {
			CPU_SET(i, (cpuset_t *)bwp);
			bwp += (2 * __bitset_words(CPU_SETSIZE) - 1);
		}

		/*
		 * Finally set all CPU masks to the proper word in their
		 * relevant span.
		 */
		CPU_FOREACH(i) {
			bwp = (__typeof(bwp))static_single_cpu_mask_lcs;
			/* Find the non-zero word of the relevant span. */
			bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) *
			    (i % _BITSET_BITS) +
			    __bitset_words(CPU_SETSIZE) - 1;
			/* Shift to find the CPU mask start. */
			bwp -= (i / _BITSET_BITS);
			static_single_cpu_mask[i] = (cpuset_t *)bwp;
		}
	}

	strlcpy(init_uts_ns.name.release, osrelease,
	    sizeof(init_uts_ns.name.release));
}
SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init,
    NULL);

static void
linux_compat_uninit(void *arg)
{
	linux_kobject_kfree_name(&linux_class_root);
	linux_kobject_kfree_name(&linux_root_device.kobj);
	linux_kobject_kfree_name(&linux_class_misc.kobj);

	free(static_single_cpu_mask_lcs, M_KMALLOC);
	free(static_single_cpu_mask, M_KMALLOC);
#if defined(__i386__) || defined(__amd64__)
	free(__cpu_data, M_KMALLOC);
#endif

	mtx_destroy(&vmmaplock);
	spin_lock_destroy(&pci_lock);
	rw_destroy(&linux_vma_lock);
}
SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit,
    NULL);

/*
 * NOTE: Linux frequently uses "unsigned long" for pointer to integer
 * conversion and vice versa, where in FreeBSD "uintptr_t" would be
 * used.  Assert these types have the same size, else some parts of the
 * LinuxKPI may not work like expected:
 */
CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
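
/*
 * Illustrative sketch (not compiled) of the pattern the assertion above
 * guards: Linux code commonly round-trips pointers through 'unsigned long',
 * which is only lossless while that type is as wide as 'uintptr_t'.  The
 * helper names are hypothetical.
 */
#if 0
static unsigned long
lkpi_example_ptr_to_ulong(void *p)
{
	/* Lossless only because of the CTASSERT above. */
	return ((unsigned long)p);
}

static void *
lkpi_example_ulong_to_ptr(unsigned long v)
{
	/* Recovers the original pointer value unchanged. */
	return ((void *)v);
}
#endif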