1 /*-
2 * Copyright (c) 2010 Isilon Systems, Inc.
3 * Copyright (c) 2010 iX Systems, Inc.
4 * Copyright (c) 2010 Panasas, Inc.
5 * Copyright (c) 2013-2021 Mellanox Technologies, Ltd.
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice unmodified, this list of conditions, and the following
13 * disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
23 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 #include "opt_global.h"
32 #include "opt_stack.h"
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/malloc.h>
37 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
39 #include <sys/proc.h>
40 #include <sys/sglist.h>
41 #include <sys/sleepqueue.h>
42 #include <sys/refcount.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/bus.h>
46 #include <sys/eventhandler.h>
47 #include <sys/fcntl.h>
48 #include <sys/file.h>
49 #include <sys/filio.h>
50 #include <sys/rwlock.h>
51 #include <sys/mman.h>
52 #include <sys/stack.h>
53 #include <sys/stdarg.h>
54 #include <sys/sysent.h>
55 #include <sys/time.h>
56 #include <sys/user.h>
57
58 #include <vm/vm.h>
59 #include <vm/pmap.h>
60 #include <vm/vm_object.h>
61 #include <vm/vm_page.h>
62 #include <vm/vm_pager.h>
63 #include <vm/vm_radix.h>
64
65 #if defined(__i386__) || defined(__amd64__)
66 #include <machine/cputypes.h>
67 #include <machine/md_var.h>
68 #endif
69
70 #include <linux/kobject.h>
71 #include <linux/cpu.h>
72 #include <linux/device.h>
73 #include <linux/slab.h>
74 #include <linux/module.h>
75 #include <linux/moduleparam.h>
76 #include <linux/cdev.h>
77 #include <linux/file.h>
78 #include <linux/fs.h>
79 #include <linux/sysfs.h>
80 #include <linux/mm.h>
81 #include <linux/io.h>
82 #include <linux/vmalloc.h>
83 #include <linux/netdevice.h>
84 #include <linux/timer.h>
85 #include <linux/interrupt.h>
86 #include <linux/uaccess.h>
87 #include <linux/utsname.h>
88 #include <linux/list.h>
89 #include <linux/kthread.h>
90 #include <linux/kernel.h>
91 #include <linux/compat.h>
92 #include <linux/io-mapping.h>
93 #include <linux/poll.h>
94 #include <linux/smp.h>
95 #include <linux/wait_bit.h>
96 #include <linux/rcupdate.h>
97 #include <linux/interval_tree.h>
98 #include <linux/interval_tree_generic.h>
99 #include <linux/printk.h>
100 #include <linux/seq_file.h>
101 #include <linux/uuid.h>
102
103 #if defined(__i386__) || defined(__amd64__)
104 #include <asm/smp.h>
105 #include <asm/processor.h>
106 #endif
107
108 #include <xen/xen.h>
109 #ifdef XENHVM
110 #undef xen_pv_domain
111 #undef xen_initial_domain
112 /* xen/xen-os.h redefines __must_check */
113 #undef __must_check
114 #include <xen/xen-os.h>
115 #endif
116
117 SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
118 "LinuxKPI parameters");
119
120 int linuxkpi_debug;
121 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
122 &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");
123
124 int linuxkpi_rcu_debug;
125 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, rcu_debug, CTLFLAG_RWTUN,
126 &linuxkpi_rcu_debug, 0, "Set to enable RCU warning. Clear to disable.");
127
128 int linuxkpi_warn_dump_stack = 0;
129 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, warn_dump_stack, CTLFLAG_RWTUN,
130 &linuxkpi_warn_dump_stack, 0,
131 "Set to enable stack traces from WARN_ON(). Clear to disable.");
132
133 static struct timeval lkpi_net_lastlog;
134 static int lkpi_net_curpps;
135 static int lkpi_net_maxpps = 99;
136 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN,
137 &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second.");
138
139 MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat");
140
141 #include <linux/rbtree.h>
142 /* Undo Linux compat changes. */
143 #undef RB_ROOT
144 #undef file
145 #undef cdev
146 #define RB_ROOT(head) (head)->rbh_root
147
148 static void linux_destroy_dev(struct linux_cdev *);
149 static void linux_cdev_deref(struct linux_cdev *ldev);
150 static struct vm_area_struct *linux_cdev_handle_find(void *handle);
151
152 cpumask_t cpu_online_mask;
153 static cpumask_t **static_single_cpu_mask;
154 static cpumask_t *static_single_cpu_mask_lcs;
155 struct kobject linux_class_root;
156 struct device linux_root_device;
157 struct class linux_class_misc;
158 struct list_head pci_drivers;
159 struct list_head pci_devices;
160 spinlock_t pci_lock;
161 struct uts_namespace init_uts_ns;
162
163 unsigned long linux_timer_hz_mask;
164
165 wait_queue_head_t linux_bit_waitq;
166 wait_queue_head_t linux_var_waitq;
167
168 const guid_t guid_null;
169
170 enum system_states system_state = SYSTEM_RUNNING;
171
/*
 * Comparison callback supplied to RB_GENERATE() for the linux_root tree.
 * It never compares anything: it unconditionally panics with "no cmp".
 */
int
panic_cmp(struct rb_node *one, struct rb_node *two)
{

	panic("no cmp");
}
177
178 RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
179
180 #define START(node) ((node)->start)
181 #define LAST(node) ((node)->last)
182
183 INTERVAL_TREE_DEFINE(struct interval_tree_node, rb, unsigned long,, START,
184 LAST,, lkpi_interval_tree)
185
/*
 * Release callback installed by device_create(): logs the device name
 * through pr_debug() and frees the device structure.
 */
static void
linux_device_release(struct device *dev)
{

	pr_debug("linux_device_release: %s\n", dev_name(dev));
	kfree(dev);
}
192
193 static ssize_t
linux_class_show(struct kobject * kobj,struct attribute * attr,char * buf)194 linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
195 {
196 struct class_attribute *dattr;
197 ssize_t error;
198
199 dattr = container_of(attr, struct class_attribute, attr);
200 error = -EIO;
201 if (dattr->show)
202 error = dattr->show(container_of(kobj, struct class, kobj),
203 dattr, buf);
204 return (error);
205 }
206
207 static ssize_t
linux_class_store(struct kobject * kobj,struct attribute * attr,const char * buf,size_t count)208 linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
209 size_t count)
210 {
211 struct class_attribute *dattr;
212 ssize_t error;
213
214 dattr = container_of(attr, struct class_attribute, attr);
215 error = -EIO;
216 if (dattr->store)
217 error = dattr->store(container_of(kobj, struct class, kobj),
218 dattr, buf, count);
219 return (error);
220 }
221
222 static void
linux_class_release(struct kobject * kobj)223 linux_class_release(struct kobject *kobj)
224 {
225 struct class *class;
226
227 class = container_of(kobj, struct class, kobj);
228 if (class->class_release)
229 class->class_release(class);
230 }
231
232 static const struct sysfs_ops linux_class_sysfs = {
233 .show = linux_class_show,
234 .store = linux_class_store,
235 };
236
237 const struct kobj_type linux_class_ktype = {
238 .release = linux_class_release,
239 .sysfs_ops = &linux_class_sysfs
240 };
241
242 static void
linux_dev_release(struct kobject * kobj)243 linux_dev_release(struct kobject *kobj)
244 {
245 struct device *dev;
246
247 dev = container_of(kobj, struct device, kobj);
248 /* This is the precedence defined by linux. */
249 if (dev->release)
250 dev->release(dev);
251 else if (dev->class && dev->class->dev_release)
252 dev->class->dev_release(dev);
253 }
254
255 static ssize_t
linux_dev_show(struct kobject * kobj,struct attribute * attr,char * buf)256 linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
257 {
258 struct device_attribute *dattr;
259 ssize_t error;
260
261 dattr = container_of(attr, struct device_attribute, attr);
262 error = -EIO;
263 if (dattr->show)
264 error = dattr->show(container_of(kobj, struct device, kobj),
265 dattr, buf);
266 return (error);
267 }
268
269 static ssize_t
linux_dev_store(struct kobject * kobj,struct attribute * attr,const char * buf,size_t count)270 linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
271 size_t count)
272 {
273 struct device_attribute *dattr;
274 ssize_t error;
275
276 dattr = container_of(attr, struct device_attribute, attr);
277 error = -EIO;
278 if (dattr->store)
279 error = dattr->store(container_of(kobj, struct device, kobj),
280 dattr, buf, count);
281 return (error);
282 }
283
284 static const struct sysfs_ops linux_dev_sysfs = {
285 .show = linux_dev_show,
286 .store = linux_dev_store,
287 };
288
289 const struct kobj_type linux_dev_ktype = {
290 .release = linux_dev_release,
291 .sysfs_ops = &linux_dev_sysfs
292 };
293
294 struct device *
device_create(struct class * class,struct device * parent,dev_t devt,void * drvdata,const char * fmt,...)295 device_create(struct class *class, struct device *parent, dev_t devt,
296 void *drvdata, const char *fmt, ...)
297 {
298 struct device *dev;
299 va_list args;
300
301 dev = kzalloc(sizeof(*dev), M_WAITOK);
302 dev->parent = parent;
303 dev->class = class;
304 dev->devt = devt;
305 dev->driver_data = drvdata;
306 dev->release = linux_device_release;
307 va_start(args, fmt);
308 kobject_set_name_vargs(&dev->kobj, fmt, args);
309 va_end(args);
310 device_register(dev);
311
312 return (dev);
313 }
314
315 struct device *
device_create_groups_vargs(struct class * class,struct device * parent,dev_t devt,void * drvdata,const struct attribute_group ** groups,const char * fmt,va_list args)316 device_create_groups_vargs(struct class *class, struct device *parent,
317 dev_t devt, void *drvdata, const struct attribute_group **groups,
318 const char *fmt, va_list args)
319 {
320 struct device *dev = NULL;
321 int retval = -ENODEV;
322
323 if (class == NULL || IS_ERR(class))
324 goto error;
325
326 dev = kzalloc(sizeof(*dev), GFP_KERNEL);
327 if (!dev) {
328 retval = -ENOMEM;
329 goto error;
330 }
331
332 dev->devt = devt;
333 dev->class = class;
334 dev->parent = parent;
335 dev->groups = groups;
336 dev->release = device_create_release;
337 /* device_initialize() needs the class and parent to be set */
338 device_initialize(dev);
339 dev_set_drvdata(dev, drvdata);
340
341 retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
342 if (retval)
343 goto error;
344
345 retval = device_add(dev);
346 if (retval)
347 goto error;
348
349 return dev;
350
351 error:
352 put_device(dev);
353 return ERR_PTR(retval);
354 }
355
356 struct class *
lkpi_class_create(const char * name)357 lkpi_class_create(const char *name)
358 {
359 struct class *class;
360 int error;
361
362 class = kzalloc(sizeof(*class), M_WAITOK);
363 class->name = name;
364 class->class_release = linux_class_kfree;
365 error = class_register(class);
366 if (error) {
367 kfree(class);
368 return (NULL);
369 }
370
371 return (class);
372 }
373
374 static void
linux_kq_lock(void * arg)375 linux_kq_lock(void *arg)
376 {
377 spinlock_t *s = arg;
378
379 spin_lock(s);
380 }
381 static void
linux_kq_unlock(void * arg)382 linux_kq_unlock(void *arg)
383 {
384 spinlock_t *s = arg;
385
386 spin_unlock(s);
387 }
388
/*
 * knlist lock-assertion callback.  With INVARIANTS it asserts that the
 * spinlock is owned when "what" is LA_LOCKED and not owned otherwise;
 * without INVARIANTS it does nothing.
 */
static void
linux_kq_assert_lock(void *arg, int what)
{
#ifdef INVARIANTS
	spinlock_t *lock = arg;

	if (what != LA_LOCKED)
		mtx_assert(lock, MA_NOTOWNED);
	else
		mtx_assert(lock, MA_OWNED);
#endif
}
401
/* Forward declaration: linux_file_ioctl_sub() calls this before it is defined. */
static void	linux_file_kqfilter_poll(struct linux_file *, int);
404
405 struct linux_file *
linux_file_alloc(void)406 linux_file_alloc(void)
407 {
408 struct linux_file *filp;
409
410 filp = kzalloc(sizeof(*filp), GFP_KERNEL);
411
412 /* set initial refcount */
413 filp->f_count = 1;
414
415 /* setup fields needed by kqueue support */
416 spin_lock_init(&filp->f_kqlock);
417 knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
418 linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock);
419
420 return (filp);
421 }
422
423 void
linux_file_free(struct linux_file * filp)424 linux_file_free(struct linux_file *filp)
425 {
426 if (filp->_file == NULL) {
427 if (filp->f_op != NULL && filp->f_op->release != NULL)
428 filp->f_op->release(filp->f_vnode, filp);
429 if (filp->f_shmem != NULL)
430 vm_object_deallocate(filp->f_shmem);
431 kfree_rcu(filp, rcu);
432 } else {
433 /*
434 * The close method of the character device or file
435 * will free the linux_file structure:
436 */
437 _fdrop(filp->_file, curthread);
438 }
439 }
440
441 struct linux_cdev *
cdev_alloc(void)442 cdev_alloc(void)
443 {
444 struct linux_cdev *cdev;
445
446 cdev = kzalloc(sizeof(struct linux_cdev), M_WAITOK);
447 kobject_init(&cdev->kobj, &linux_cdev_ktype);
448 cdev->refs = 1;
449 return (cdev);
450 }
451
452 static int
linux_cdev_pager_fault(vm_object_t vm_obj,vm_ooffset_t offset,int prot,vm_page_t * mres)453 linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
454 vm_page_t *mres)
455 {
456 struct vm_area_struct *vmap;
457
458 vmap = linux_cdev_handle_find(vm_obj->handle);
459
460 MPASS(vmap != NULL);
461 MPASS(vmap->vm_private_data == vm_obj->handle);
462
463 if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
464 vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
465 vm_page_t page;
466
467 if (((*mres)->flags & PG_FICTITIOUS) != 0) {
468 /*
469 * If the passed in result page is a fake
470 * page, update it with the new physical
471 * address.
472 */
473 page = *mres;
474 vm_page_updatefake(page, paddr, vm_obj->memattr);
475 } else {
476 /*
477 * Replace the passed in "mres" page with our
478 * own fake page and free up the all of the
479 * original pages.
480 */
481 VM_OBJECT_WUNLOCK(vm_obj);
482 page = vm_page_getfake(paddr, vm_obj->memattr);
483 VM_OBJECT_WLOCK(vm_obj);
484
485 vm_page_replace(page, vm_obj, (*mres)->pindex, *mres);
486 *mres = page;
487 }
488 vm_page_valid(page);
489 return (VM_PAGER_OK);
490 }
491 return (VM_PAGER_FAIL);
492 }
493
494 static int
linux_cdev_pager_populate(vm_object_t vm_obj,vm_pindex_t pidx,int fault_type,vm_prot_t max_prot,vm_pindex_t * first,vm_pindex_t * last)495 linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
496 vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
497 {
498 struct vm_area_struct *vmap;
499 int err;
500
501 /* get VM area structure */
502 vmap = linux_cdev_handle_find(vm_obj->handle);
503 MPASS(vmap != NULL);
504 MPASS(vmap->vm_private_data == vm_obj->handle);
505
506 VM_OBJECT_WUNLOCK(vm_obj);
507
508 linux_set_current(curthread);
509
510 down_write(&vmap->vm_mm->mmap_sem);
511 if (unlikely(vmap->vm_ops == NULL)) {
512 err = VM_FAULT_SIGBUS;
513 } else {
514 struct vm_fault vmf;
515
516 /* fill out VM fault structure */
517 vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
518 vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
519 vmf.pgoff = 0;
520 vmf.page = NULL;
521 vmf.vma = vmap;
522
523 vmap->vm_pfn_count = 0;
524 vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
525 vmap->vm_obj = vm_obj;
526
527 err = vmap->vm_ops->fault(&vmf);
528
529 while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
530 kern_yield(PRI_USER);
531 err = vmap->vm_ops->fault(&vmf);
532 }
533 }
534
535 /* translate return code */
536 switch (err) {
537 case VM_FAULT_OOM:
538 err = VM_PAGER_AGAIN;
539 break;
540 case VM_FAULT_SIGBUS:
541 err = VM_PAGER_BAD;
542 break;
543 case VM_FAULT_NOPAGE:
544 /*
545 * By contract the fault handler will return having
546 * busied all the pages itself. If pidx is already
547 * found in the object, it will simply xbusy the first
548 * page and return with vm_pfn_count set to 1.
549 */
550 *first = vmap->vm_pfn_first;
551 *last = *first + vmap->vm_pfn_count - 1;
552 err = VM_PAGER_OK;
553 break;
554 default:
555 err = VM_PAGER_ERROR;
556 break;
557 }
558 up_write(&vmap->vm_mm->mmap_sem);
559 VM_OBJECT_WLOCK(vm_obj);
560 return (err);
561 }
562
563 static struct rwlock linux_vma_lock;
564 static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
565 TAILQ_HEAD_INITIALIZER(linux_vma_head);
566
567 static void
linux_cdev_handle_free(struct vm_area_struct * vmap)568 linux_cdev_handle_free(struct vm_area_struct *vmap)
569 {
570 /* Drop reference on vm_file */
571 if (vmap->vm_file != NULL)
572 fput(vmap->vm_file);
573
574 /* Drop reference on mm_struct */
575 mmput(vmap->vm_mm);
576
577 kfree(vmap);
578 }
579
580 static void
linux_cdev_handle_remove(struct vm_area_struct * vmap)581 linux_cdev_handle_remove(struct vm_area_struct *vmap)
582 {
583 rw_wlock(&linux_vma_lock);
584 TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
585 rw_wunlock(&linux_vma_lock);
586 }
587
588 static struct vm_area_struct *
linux_cdev_handle_find(void * handle)589 linux_cdev_handle_find(void *handle)
590 {
591 struct vm_area_struct *vmap;
592
593 rw_rlock(&linux_vma_lock);
594 TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
595 if (vmap->vm_private_data == handle)
596 break;
597 }
598 rw_runlock(&linux_vma_lock);
599 return (vmap);
600 }
601
602 static int
linux_cdev_pager_ctor(void * handle,vm_ooffset_t size,vm_prot_t prot,vm_ooffset_t foff,struct ucred * cred,u_short * color)603 linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
604 vm_ooffset_t foff, struct ucred *cred, u_short *color)
605 {
606
607 MPASS(linux_cdev_handle_find(handle) != NULL);
608 *color = 0;
609 return (0);
610 }
611
612 static void
linux_cdev_pager_dtor(void * handle)613 linux_cdev_pager_dtor(void *handle)
614 {
615 const struct vm_operations_struct *vm_ops;
616 struct vm_area_struct *vmap;
617
618 vmap = linux_cdev_handle_find(handle);
619 MPASS(vmap != NULL);
620
621 /*
622 * Remove handle before calling close operation to prevent
623 * other threads from reusing the handle pointer.
624 */
625 linux_cdev_handle_remove(vmap);
626
627 down_write(&vmap->vm_mm->mmap_sem);
628 vm_ops = vmap->vm_ops;
629 if (likely(vm_ops != NULL))
630 vm_ops->close(vmap);
631 up_write(&vmap->vm_mm->mmap_sem);
632
633 linux_cdev_handle_free(vmap);
634 }
635
636 static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
637 {
638 /* OBJT_MGTDEVICE */
639 .cdev_pg_populate = linux_cdev_pager_populate,
640 .cdev_pg_ctor = linux_cdev_pager_ctor,
641 .cdev_pg_dtor = linux_cdev_pager_dtor
642 },
643 {
644 /* OBJT_DEVICE */
645 .cdev_pg_fault = linux_cdev_pager_fault,
646 .cdev_pg_ctor = linux_cdev_pager_ctor,
647 .cdev_pg_dtor = linux_cdev_pager_dtor
648 },
649 };
650
651 int
zap_vma_ptes(struct vm_area_struct * vma,unsigned long address,unsigned long size)652 zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
653 unsigned long size)
654 {
655 struct pctrie_iter pages;
656 vm_object_t obj;
657 vm_page_t m;
658
659 obj = vma->vm_obj;
660 if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
661 return (-ENOTSUP);
662 VM_OBJECT_RLOCK(obj);
663 vm_page_iter_limit_init(&pages, obj, OFF_TO_IDX(address + size));
664 VM_RADIX_FOREACH_FROM(m, &pages, OFF_TO_IDX(address))
665 pmap_remove_all(m);
666 VM_OBJECT_RUNLOCK(obj);
667 return (0);
668 }
669
670 void
vma_set_file(struct vm_area_struct * vma,struct linux_file * file)671 vma_set_file(struct vm_area_struct *vma, struct linux_file *file)
672 {
673 struct linux_file *tmp;
674
675 /* Changing an anonymous vma with this is illegal */
676 get_file(file);
677 tmp = vma->vm_file;
678 vma->vm_file = file;
679 fput(tmp);
680 }
681
682 static struct file_operations dummy_ldev_ops = {
683 /* XXXKIB */
684 };
685
686 static struct linux_cdev dummy_ldev = {
687 .ops = &dummy_ldev_ops,
688 };
689
690 #define LDEV_SI_DTR 0x0001
691 #define LDEV_SI_REF 0x0002
692
693 static void
linux_get_fop(struct linux_file * filp,const struct file_operations ** fop,struct linux_cdev ** dev)694 linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
695 struct linux_cdev **dev)
696 {
697 struct linux_cdev *ldev;
698 u_int siref;
699
700 ldev = filp->f_cdev;
701 *fop = filp->f_op;
702 if (ldev != NULL) {
703 if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
704 refcount_acquire(&ldev->refs);
705 } else {
706 for (siref = ldev->siref;;) {
707 if ((siref & LDEV_SI_DTR) != 0) {
708 ldev = &dummy_ldev;
709 *fop = ldev->ops;
710 siref = ldev->siref;
711 MPASS((ldev->siref & LDEV_SI_DTR) == 0);
712 } else if (atomic_fcmpset_int(&ldev->siref,
713 &siref, siref + LDEV_SI_REF)) {
714 break;
715 }
716 }
717 }
718 }
719 *dev = ldev;
720 }
721
722 static void
linux_drop_fop(struct linux_cdev * ldev)723 linux_drop_fop(struct linux_cdev *ldev)
724 {
725
726 if (ldev == NULL)
727 return;
728 if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
729 linux_cdev_deref(ldev);
730 } else {
731 MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
732 MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
733 atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
734 }
735 }
736
/*
 * Evaluate "code" with (td)->td_fpop temporarily set to "fp"; the
 * previous td_fpop is restored afterwards and the value of "code" is
 * the value of the whole expression.
 */
#define	OPW(fp,td,code) ({			\
	struct file *__saved_fpop;		\
	__typeof(code) __rv;			\
						\
	__saved_fpop = (td)->td_fpop;		\
	(td)->td_fpop = (fp);			\
	__rv = (code);				\
	(td)->td_fpop = __saved_fpop;		\
	__rv;					\
})
747
748 static int
linux_dev_fdopen(struct cdev * dev,int fflags,struct thread * td,struct file * file)749 linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
750 struct file *file)
751 {
752 struct linux_cdev *ldev;
753 struct linux_file *filp;
754 const struct file_operations *fop;
755 int error;
756
757 ldev = dev->si_drv1;
758
759 filp = linux_file_alloc();
760 filp->f_dentry = &filp->f_dentry_store;
761 filp->f_op = ldev->ops;
762 filp->f_mode = file->f_flag;
763 filp->f_flags = file->f_flag;
764 filp->f_vnode = file->f_vnode;
765 filp->_file = file;
766 refcount_acquire(&ldev->refs);
767 filp->f_cdev = ldev;
768
769 linux_set_current(td);
770 linux_get_fop(filp, &fop, &ldev);
771
772 if (fop->open != NULL) {
773 error = -fop->open(file->f_vnode, filp);
774 if (error != 0) {
775 linux_drop_fop(ldev);
776 linux_cdev_deref(filp->f_cdev);
777 kfree(filp);
778 return (error);
779 }
780 }
781
782 /* hold on to the vnode - used for fstat() */
783 vref(filp->f_vnode);
784
785 /* release the file from devfs */
786 finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
787 linux_drop_fop(ldev);
788 return (ENXIO);
789 }
790
/*
 * Address window [LINUX_IOCTL_MIN_PTR, LINUX_IOCTL_MAX_PTR) that stands
 * in for the in-kernel ioctl data buffer; see linux_remap_address().
 */
#define	LINUX_IOCTL_MIN_PTR	0x10000UL
#define	LINUX_IOCTL_MAX_PTR	(LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)
793
794 static inline int
linux_remap_address(void ** uaddr,size_t len)795 linux_remap_address(void **uaddr, size_t len)
796 {
797 uintptr_t uaddr_val = (uintptr_t)(*uaddr);
798
799 if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
800 uaddr_val < LINUX_IOCTL_MAX_PTR)) {
801 struct task_struct *pts = current;
802 if (pts == NULL) {
803 *uaddr = NULL;
804 return (1);
805 }
806
807 /* compute data offset */
808 uaddr_val -= LINUX_IOCTL_MIN_PTR;
809
810 /* check that length is within bounds */
811 if ((len > IOCPARM_MAX) ||
812 (uaddr_val + len) > pts->bsd_ioctl_len) {
813 *uaddr = NULL;
814 return (1);
815 }
816
817 /* re-add kernel buffer address */
818 uaddr_val += (uintptr_t)pts->bsd_ioctl_data;
819
820 /* update address location */
821 *uaddr = (void *)uaddr_val;
822 return (1);
823 }
824 return (0);
825 }
826
827 int
linux_copyin(const void * uaddr,void * kaddr,size_t len)828 linux_copyin(const void *uaddr, void *kaddr, size_t len)
829 {
830 if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
831 if (uaddr == NULL)
832 return (-EFAULT);
833 memcpy(kaddr, uaddr, len);
834 return (0);
835 }
836 return (-copyin(uaddr, kaddr, len));
837 }
838
839 int
linux_copyout(const void * kaddr,void * uaddr,size_t len)840 linux_copyout(const void *kaddr, void *uaddr, size_t len)
841 {
842 if (linux_remap_address(&uaddr, len)) {
843 if (uaddr == NULL)
844 return (-EFAULT);
845 memcpy(uaddr, kaddr, len);
846 return (0);
847 }
848 return (-copyout(kaddr, uaddr, len));
849 }
850
/*
 * Zero "_len" bytes at "_uaddr" using the subyte()/suword*() stores.
 *
 * Returns 0 when every store succeeded.  If any store fails the full
 * "_len" is returned, regardless of how much had been cleared.
 */
size_t
linux_clear_user(void *_uaddr, size_t _len)
{
	uint8_t *p = _uaddr;
	size_t left = _len;

	/* byte stores until 8-byte aligned, while 8 or more bytes remain */
	for (; ((uintptr_t)p & 7) != 0 && left >= 8; p++, left--) {
		if (subyte(p, 0))
			return (_len);
	}

	/* zero 8 bytes at a time */
	for (; left >= 8; p += 8, left -= 8) {
#ifdef __LP64__
		if (suword64(p, 0))
			return (_len);
#else
		if (suword32(p, 0))
			return (_len);
		if (suword32(p + 4, 0))
			return (_len);
#endif
	}

	/* zero fill end, if any */
	for (; left != 0; p++, left--) {
		if (subyte(p, 0))
			return (_len);
	}

	return (0);
}
889
890 int
linux_access_ok(const void * uaddr,size_t len)891 linux_access_ok(const void *uaddr, size_t len)
892 {
893 uintptr_t saddr;
894 uintptr_t eaddr;
895
896 /* get start and end address */
897 saddr = (uintptr_t)uaddr;
898 eaddr = (uintptr_t)uaddr + len;
899
900 /* verify addresses are valid for userspace */
901 return ((saddr == eaddr) ||
902 (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));
903 }
904
905 /*
906 * This function should return either EINTR or ERESTART depending on
907 * the signal type sent to this thread:
908 */
909 static int
linux_get_error(struct task_struct * task,int error)910 linux_get_error(struct task_struct *task, int error)
911 {
912 /* check for signal type interrupt code */
913 if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
914 error = -linux_schedule_get_interrupt_value(task);
915 if (error == 0)
916 error = EINTR;
917 }
918 return (error);
919 }
920
921 static int
linux_file_ioctl_sub(struct file * fp,struct linux_file * filp,const struct file_operations * fop,u_long cmd,caddr_t data,struct thread * td)922 linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
923 const struct file_operations *fop, u_long cmd, caddr_t data,
924 struct thread *td)
925 {
926 struct task_struct *task = current;
927 unsigned size;
928 int error;
929
930 size = IOCPARM_LEN(cmd);
931 /* refer to logic in sys_ioctl() */
932 if (size > 0) {
933 /*
934 * Setup hint for linux_copyin() and linux_copyout().
935 *
936 * Background: Linux code expects a user-space address
937 * while FreeBSD supplies a kernel-space address.
938 */
939 task->bsd_ioctl_data = data;
940 task->bsd_ioctl_len = size;
941 data = (void *)LINUX_IOCTL_MIN_PTR;
942 } else {
943 /* fetch user-space pointer */
944 data = *(void **)data;
945 }
946 #ifdef COMPAT_FREEBSD32
947 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
948 /* try the compat IOCTL handler first */
949 if (fop->compat_ioctl != NULL) {
950 error = -OPW(fp, td, fop->compat_ioctl(filp,
951 cmd, (u_long)data));
952 } else {
953 error = ENOTTY;
954 }
955
956 /* fallback to the regular IOCTL handler, if any */
957 if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
958 error = -OPW(fp, td, fop->unlocked_ioctl(filp,
959 cmd, (u_long)data));
960 }
961 } else
962 #endif
963 {
964 if (fop->unlocked_ioctl != NULL) {
965 error = -OPW(fp, td, fop->unlocked_ioctl(filp,
966 cmd, (u_long)data));
967 } else {
968 error = ENOTTY;
969 }
970 }
971 if (size > 0) {
972 task->bsd_ioctl_data = NULL;
973 task->bsd_ioctl_len = 0;
974 }
975
976 if (error == EWOULDBLOCK) {
977 /* update kqfilter status, if any */
978 linux_file_kqfilter_poll(filp,
979 LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
980 } else {
981 error = linux_get_error(task, error);
982 }
983 return (error);
984 }
985
/* Sentinel poll_table value; linux_poll_wait() calls selrecord() only for it. */
#define	LINUX_POLL_TABLE_NORMAL	((poll_table *)1)
987
988 /*
989 * This function atomically updates the poll wakeup state and returns
990 * the previous state at the time of update.
991 */
992 static uint8_t
linux_poll_wakeup_state(atomic_t * v,const uint8_t * pstate)993 linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
994 {
995 int c, old;
996
997 c = v->counter;
998
999 while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
1000 c = old;
1001
1002 return (c);
1003 }
1004
1005 static int
linux_poll_wakeup_callback(wait_queue_t * wq,unsigned int wq_state,int flags,void * key)1006 linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
1007 {
1008 static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
1009 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
1010 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
1011 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
1012 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */
1013 };
1014 struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);
1015
1016 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
1017 case LINUX_FWQ_STATE_QUEUED:
1018 linux_poll_wakeup(filp);
1019 return (1);
1020 default:
1021 return (0);
1022 }
1023 }
1024
1025 void
linux_poll_wait(struct linux_file * filp,wait_queue_head_t * wqh,poll_table * p)1026 linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
1027 {
1028 static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
1029 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
1030 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
1031 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */
1032 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
1033 };
1034
1035 /* check if we are called inside the select system call */
1036 if (p == LINUX_POLL_TABLE_NORMAL)
1037 selrecord(curthread, &filp->f_selinfo);
1038
1039 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
1040 case LINUX_FWQ_STATE_INIT:
1041 /* NOTE: file handles can only belong to one wait-queue */
1042 filp->f_wait_queue.wqh = wqh;
1043 filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
1044 add_wait_queue(wqh, &filp->f_wait_queue.wq);
1045 atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
1046 break;
1047 default:
1048 break;
1049 }
1050 }
1051
1052 static void
linux_poll_wait_dequeue(struct linux_file * filp)1053 linux_poll_wait_dequeue(struct linux_file *filp)
1054 {
1055 static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
1056 [LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
1057 [LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
1058 [LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
1059 [LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
1060 };
1061
1062 seldrain(&filp->f_selinfo);
1063
1064 switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
1065 case LINUX_FWQ_STATE_NOT_READY:
1066 case LINUX_FWQ_STATE_QUEUED:
1067 case LINUX_FWQ_STATE_READY:
1068 remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
1069 break;
1070 default:
1071 break;
1072 }
1073 }
1074
1075 void
linux_poll_wakeup(struct linux_file * filp)1076 linux_poll_wakeup(struct linux_file *filp)
1077 {
1078 /* this function should be NULL-safe */
1079 if (filp == NULL)
1080 return;
1081
1082 selwakeup(&filp->f_selinfo);
1083
1084 spin_lock(&filp->f_kqlock);
1085 filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
1086 LINUX_KQ_FLAG_NEED_WRITE;
1087
1088 /* make sure the "knote" gets woken up */
1089 KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
1090 spin_unlock(&filp->f_kqlock);
1091 }
1092
1093 static struct linux_file *
__get_file_rcu(struct linux_file ** f)1094 __get_file_rcu(struct linux_file **f)
1095 {
1096 struct linux_file *file1, *file2;
1097
1098 file1 = READ_ONCE(*f);
1099 if (file1 == NULL)
1100 return (NULL);
1101
1102 if (!refcount_acquire_if_not_zero(
1103 file1->_file == NULL ? &file1->f_count : &file1->_file->f_count))
1104 return (ERR_PTR(-EAGAIN));
1105
1106 file2 = READ_ONCE(*f);
1107 if (file2 == file1)
1108 return (file2);
1109
1110 fput(file1);
1111 return (ERR_PTR(-EAGAIN));
1112 }
1113
/*
 * Take a reference on the file stored in "*f", retrying for as long as
 * __get_file_rcu() reports an error pointer.  Returns NULL when the slot
 * is empty, otherwise the referenced file.
 */
struct linux_file *
linux_get_file_rcu(struct linux_file **f)
{
	struct linux_file *filp;

	do {
		filp = __get_file_rcu(f);
	} while (filp != NULL && IS_ERR(filp));

	return (filp);
}
1130
/*
 * Single-shot variant of linux_get_file_rcu(): one attempt is made inside
 * an RCU read section and any error pointer is reported as NULL.
 */
struct linux_file *
get_file_active(struct linux_file **f)
{
	struct linux_file *filp;

	rcu_read_lock();
	filp = __get_file_rcu(f);
	rcu_read_unlock();

	/* no retry here: a failed attempt simply yields NULL */
	return (IS_ERR(filp) ? NULL : filp);
}
1144
1145 static void
linux_file_kqfilter_detach(struct knote * kn)1146 linux_file_kqfilter_detach(struct knote *kn)
1147 {
1148 struct linux_file *filp = kn->kn_hook;
1149
1150 spin_lock(&filp->f_kqlock);
1151 knlist_remove(&filp->f_selinfo.si_note, kn, 1);
1152 spin_unlock(&filp->f_kqlock);
1153 }
1154
1155 static int
linux_file_kqfilter_read_event(struct knote * kn,long hint)1156 linux_file_kqfilter_read_event(struct knote *kn, long hint)
1157 {
1158 struct linux_file *filp = kn->kn_hook;
1159
1160 mtx_assert(&filp->f_kqlock, MA_OWNED);
1161
1162 return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
1163 }
1164
1165 static int
linux_file_kqfilter_write_event(struct knote * kn,long hint)1166 linux_file_kqfilter_write_event(struct knote *kn, long hint)
1167 {
1168 struct linux_file *filp = kn->kn_hook;
1169
1170 mtx_assert(&filp->f_kqlock, MA_OWNED);
1171
1172 return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
1173 }
1174
1175 static const struct filterops linux_dev_kqfiltops_read = {
1176 .f_isfd = 1,
1177 .f_detach = linux_file_kqfilter_detach,
1178 .f_event = linux_file_kqfilter_read_event,
1179 .f_copy = knote_triv_copy,
1180 };
1181
1182 static const struct filterops linux_dev_kqfiltops_write = {
1183 .f_isfd = 1,
1184 .f_detach = linux_file_kqfilter_detach,
1185 .f_event = linux_file_kqfilter_write_event,
1186 .f_copy = knote_triv_copy,
1187 };
1188
1189 static void
linux_file_kqfilter_poll(struct linux_file * filp,int kqflags)1190 linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
1191 {
1192 struct thread *td;
1193 const struct file_operations *fop;
1194 struct linux_cdev *ldev;
1195 int temp;
1196
1197 if ((filp->f_kqflags & kqflags) == 0)
1198 return;
1199
1200 td = curthread;
1201
1202 linux_get_fop(filp, &fop, &ldev);
1203 /* get the latest polling state */
1204 temp = OPW(filp->_file, td, fop->poll(filp, NULL));
1205 linux_drop_fop(ldev);
1206
1207 spin_lock(&filp->f_kqlock);
1208 /* clear kqflags */
1209 filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
1210 LINUX_KQ_FLAG_NEED_WRITE);
1211 /* update kqflags */
1212 if ((temp & (POLLIN | POLLOUT)) != 0) {
1213 if ((temp & POLLIN) != 0)
1214 filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
1215 if ((temp & POLLOUT) != 0)
1216 filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
1217
1218 /* make sure the "knote" gets woken up */
1219 KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
1220 }
1221 spin_unlock(&filp->f_kqlock);
1222 }
1223
1224 static int
linux_file_kqfilter(struct file * file,struct knote * kn)1225 linux_file_kqfilter(struct file *file, struct knote *kn)
1226 {
1227 struct linux_file *filp;
1228 struct thread *td;
1229 int error;
1230
1231 td = curthread;
1232 filp = (struct linux_file *)file->f_data;
1233 filp->f_flags = file->f_flag;
1234 if (filp->f_op->poll == NULL)
1235 return (EINVAL);
1236
1237 spin_lock(&filp->f_kqlock);
1238 switch (kn->kn_filter) {
1239 case EVFILT_READ:
1240 filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
1241 kn->kn_fop = &linux_dev_kqfiltops_read;
1242 kn->kn_hook = filp;
1243 knlist_add(&filp->f_selinfo.si_note, kn, 1);
1244 error = 0;
1245 break;
1246 case EVFILT_WRITE:
1247 filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
1248 kn->kn_fop = &linux_dev_kqfiltops_write;
1249 kn->kn_hook = filp;
1250 knlist_add(&filp->f_selinfo.si_note, kn, 1);
1251 error = 0;
1252 break;
1253 default:
1254 error = EINVAL;
1255 break;
1256 }
1257 spin_unlock(&filp->f_kqlock);
1258
1259 if (error == 0) {
1260 linux_set_current(td);
1261
1262 /* update kqfilter status, if any */
1263 linux_file_kqfilter_poll(filp,
1264 LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
1265 }
1266 return (error);
1267 }
1268
/*
 * Create the VM object backing an mmap() of a LinuxKPI file.
 *
 * A temporary vm_area_struct is filled in and handed to the driver's
 * mmap method.  Depending on whether the driver installed vm_ops, the
 * result is either a device pager object keyed on vm_private_data or a
 * scatter/gather object covering the physical range the driver set up.
 *
 * On success 0 is returned, "*object" holds the new object and "*offset"
 * is reset to 0.  Error returns visible here:
 *   EOPNOTSUPP  the file operations have no mmap method
 *   EINVAL      mm_users already zero, incomplete vm_ops/private data,
 *               or the VM object could not be allocated
 *   ESTALE      an existing list entry for the same private data has
 *               invalid vm_ops
 *   other       whatever linux_get_error() makes of the mmap result
 */
static int
linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
    vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
    int nprot, bool is_shared, struct thread *td)
{
	struct task_struct *task;
	struct vm_area_struct *vmap;
	struct mm_struct *mm;
	struct linux_file *filp;
	vm_memattr_t attr;
	int error;

	filp = (struct linux_file *)fp->f_data;
	filp->f_flags = fp->f_flag;

	if (fop->mmap == NULL)
		return (EOPNOTSUPP);

	linux_set_current(td);

	/*
	 * The same VM object might be shared by multiple processes
	 * and the mm_struct is usually freed when a process exits.
	 *
	 * The atomic reference below makes sure the mm_struct is
	 * available as long as the vmap is in the linux_vma_head.
	 */
	task = current;
	mm = task->mm;
	if (atomic_inc_not_zero(&mm->mm_users) == 0)
		return (EINVAL);

	/*
	 * NOTE(review): the allocation result is used unchecked; presumably
	 * GFP_KERNEL allocations cannot fail here -- confirm.
	 */
	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
	vmap->vm_start = 0;
	vmap->vm_end = size;
	vmap->vm_pgoff = *offset / PAGE_SIZE;
	vmap->vm_pfn = 0;
	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
	if (is_shared)
		vmap->vm_flags |= VM_SHARED;
	vmap->vm_ops = NULL;
	/* the vmap holds its own reference on the file */
	vmap->vm_file = get_file(filp);
	vmap->vm_mm = mm;

	/* call the driver's mmap method with the mmap semaphore write-held */
	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
		error = linux_get_error(task, EINTR);
	} else {
		error = -OPW(fp, td, fop->mmap(filp, vmap));
		error = linux_get_error(task, error);
		up_write(&vmap->vm_mm->mmap_sem);
	}

	if (error != 0) {
		/* presumably also drops the file and mm references -- confirm */
		linux_cdev_handle_free(vmap);
		return (error);
	}

	/* cache mode requested by the driver through vm_page_prot */
	attr = pgprot2cachemode(vmap->vm_page_prot);

	if (vmap->vm_ops != NULL) {
		struct vm_area_struct *ptr;
		void *vm_private_data;
		bool vm_no_fault;

		/* open, close and private data are all mandatory here */
		if (vmap->vm_ops->open == NULL ||
		    vmap->vm_ops->close == NULL ||
		    vmap->vm_private_data == NULL) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			return (EINVAL);
		}

		/* saved now: vmap may be freed before the pager is allocated */
		vm_private_data = vmap->vm_private_data;

		/* look for an entry already registered for this private data */
		rw_wlock(&linux_vma_lock);
		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
			if (ptr->vm_private_data == vm_private_data)
				break;
		}
		/* check if there is an existing VM area struct */
		if (ptr != NULL) {
			/* check if the VM area structure is invalid */
			if (ptr->vm_ops == NULL ||
			    ptr->vm_ops->open == NULL ||
			    ptr->vm_ops->close == NULL) {
				error = ESTALE;
				vm_no_fault = 1;
			} else {
				/* reuse the existing entry; ours is redundant */
				error = EEXIST;
				vm_no_fault = (ptr->vm_ops->fault == NULL);
			}
		} else {
			/* insert VM area structure into list */
			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
			error = 0;
			vm_no_fault = (vmap->vm_ops->fault == NULL);
		}
		rw_wunlock(&linux_vma_lock);

		if (error != 0) {
			/* free allocated VM area struct */
			linux_cdev_handle_free(vmap);
			/* check for stale VM area struct */
			if (error != EEXIST)
				return (error);
			/* EEXIST is not fatal: continue with the existing entry */
		}

		/* check if there is no fault handler */
		if (vm_no_fault) {
			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
			    &linux_cdev_pager_ops[1], size, nprot, *offset,
			    td->td_ucred);
		} else {
			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
			    &linux_cdev_pager_ops[0], size, nprot, *offset,
			    td->td_ucred);
		}

		/* check if allocating the VM object failed */
		if (*object == NULL) {
			/* only undo the list insertion if we performed it */
			if (error == 0) {
				/* remove VM area struct from list */
				linux_cdev_handle_remove(vmap);
				/* free allocated VM area struct */
				linux_cdev_handle_free(vmap);
			}
			return (EINVAL);
		}
	} else {
		struct sglist *sg;

		/*
		 * No vm_ops: map the physical range described by vm_pfn and
		 * vm_len through a single-segment scatter/gather list.
		 */
		sg = sglist_alloc(1, M_WAITOK);
		sglist_append_phys(sg,
		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);

		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
		    nprot, 0, td->td_ucred);

		/* the vmap is not needed past this point in this branch */
		linux_cdev_handle_free(vmap);

		if (*object == NULL) {
			sglist_free(sg);
			return (EINVAL);
		}
	}

	/* apply a non-default cache mode to the new object */
	if (attr != VM_MEMATTR_DEFAULT) {
		VM_OBJECT_WLOCK(*object);
		vm_object_set_memattr(*object, attr);
		VM_OBJECT_WUNLOCK(*object);
	}
	*offset = 0;
	return (0);
}
1423
1424 struct cdevsw linuxcdevsw = {
1425 .d_version = D_VERSION,
1426 .d_fdopen = linux_dev_fdopen,
1427 .d_name = "lkpidev",
1428 };
1429
1430 static int
linux_file_read(struct file * file,struct uio * uio,struct ucred * active_cred,int flags,struct thread * td)1431 linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
1432 int flags, struct thread *td)
1433 {
1434 struct linux_file *filp;
1435 const struct file_operations *fop;
1436 struct linux_cdev *ldev;
1437 ssize_t bytes;
1438 int error;
1439
1440 error = 0;
1441 filp = (struct linux_file *)file->f_data;
1442 filp->f_flags = file->f_flag;
1443 /* XXX no support for I/O vectors currently */
1444 if (uio->uio_iovcnt != 1)
1445 return (EOPNOTSUPP);
1446 if (uio->uio_resid > DEVFS_IOSIZE_MAX)
1447 return (EINVAL);
1448 linux_set_current(td);
1449 linux_get_fop(filp, &fop, &ldev);
1450 if (fop->read != NULL) {
1451 bytes = OPW(file, td, fop->read(filp,
1452 uio->uio_iov->iov_base,
1453 uio->uio_iov->iov_len, &uio->uio_offset));
1454 if (bytes >= 0) {
1455 uio->uio_iov->iov_base =
1456 ((uint8_t *)uio->uio_iov->iov_base) + bytes;
1457 uio->uio_iov->iov_len -= bytes;
1458 uio->uio_resid -= bytes;
1459 } else {
1460 error = linux_get_error(current, -bytes);
1461 }
1462 } else
1463 error = ENXIO;
1464
1465 /* update kqfilter status, if any */
1466 linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
1467 linux_drop_fop(ldev);
1468
1469 return (error);
1470 }
1471
1472 static int
linux_file_write(struct file * file,struct uio * uio,struct ucred * active_cred,int flags,struct thread * td)1473 linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
1474 int flags, struct thread *td)
1475 {
1476 struct linux_file *filp;
1477 const struct file_operations *fop;
1478 struct linux_cdev *ldev;
1479 ssize_t bytes;
1480 int error;
1481
1482 filp = (struct linux_file *)file->f_data;
1483 filp->f_flags = file->f_flag;
1484 /* XXX no support for I/O vectors currently */
1485 if (uio->uio_iovcnt != 1)
1486 return (EOPNOTSUPP);
1487 if (uio->uio_resid > DEVFS_IOSIZE_MAX)
1488 return (EINVAL);
1489 linux_set_current(td);
1490 linux_get_fop(filp, &fop, &ldev);
1491 if (fop->write != NULL) {
1492 bytes = OPW(file, td, fop->write(filp,
1493 uio->uio_iov->iov_base,
1494 uio->uio_iov->iov_len, &uio->uio_offset));
1495 if (bytes >= 0) {
1496 uio->uio_iov->iov_base =
1497 ((uint8_t *)uio->uio_iov->iov_base) + bytes;
1498 uio->uio_iov->iov_len -= bytes;
1499 uio->uio_resid -= bytes;
1500 error = 0;
1501 } else {
1502 error = linux_get_error(current, -bytes);
1503 }
1504 } else
1505 error = ENXIO;
1506
1507 /* update kqfilter status, if any */
1508 linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);
1509
1510 linux_drop_fop(ldev);
1511
1512 return (error);
1513 }
1514
1515 static int
linux_file_poll(struct file * file,int events,struct ucred * active_cred,struct thread * td)1516 linux_file_poll(struct file *file, int events, struct ucred *active_cred,
1517 struct thread *td)
1518 {
1519 struct linux_file *filp;
1520 const struct file_operations *fop;
1521 struct linux_cdev *ldev;
1522 int revents;
1523
1524 filp = (struct linux_file *)file->f_data;
1525 filp->f_flags = file->f_flag;
1526 linux_set_current(td);
1527 linux_get_fop(filp, &fop, &ldev);
1528 if (fop->poll != NULL) {
1529 revents = OPW(file, td, fop->poll(filp,
1530 LINUX_POLL_TABLE_NORMAL)) & events;
1531 } else {
1532 revents = 0;
1533 }
1534 linux_drop_fop(ldev);
1535 return (revents);
1536 }
1537
1538 static int
linux_file_close(struct file * file,struct thread * td)1539 linux_file_close(struct file *file, struct thread *td)
1540 {
1541 struct linux_file *filp;
1542 int (*release)(struct inode *, struct linux_file *);
1543 const struct file_operations *fop;
1544 struct linux_cdev *ldev;
1545 int error;
1546
1547 filp = (struct linux_file *)file->f_data;
1548
1549 KASSERT(file_count(filp) == 0,
1550 ("File refcount(%d) is not zero", file_count(filp)));
1551
1552 if (td == NULL)
1553 td = curthread;
1554
1555 error = 0;
1556 filp->f_flags = file->f_flag;
1557 linux_set_current(td);
1558 linux_poll_wait_dequeue(filp);
1559 linux_get_fop(filp, &fop, &ldev);
1560 /*
1561 * Always use the real release function, if any, to avoid
1562 * leaking device resources:
1563 */
1564 release = filp->f_op->release;
1565 if (release != NULL)
1566 error = -OPW(file, td, release(filp->f_vnode, filp));
1567 funsetown(&filp->f_sigio);
1568 if (filp->f_vnode != NULL)
1569 vrele(filp->f_vnode);
1570 linux_drop_fop(ldev);
1571 ldev = filp->f_cdev;
1572 if (ldev != NULL)
1573 linux_cdev_deref(ldev);
1574 linux_synchronize_rcu(RCU_TYPE_REGULAR);
1575 kfree(filp);
1576
1577 return (error);
1578 }
1579
1580 static int
linux_file_ioctl(struct file * fp,u_long cmd,void * data,struct ucred * cred,struct thread * td)1581 linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
1582 struct thread *td)
1583 {
1584 struct linux_file *filp;
1585 const struct file_operations *fop;
1586 struct linux_cdev *ldev;
1587 struct fiodgname_arg *fgn;
1588 const char *p;
1589 int error, i;
1590
1591 error = 0;
1592 filp = (struct linux_file *)fp->f_data;
1593 filp->f_flags = fp->f_flag;
1594 linux_get_fop(filp, &fop, &ldev);
1595
1596 linux_set_current(td);
1597 switch (cmd) {
1598 case FIONBIO:
1599 break;
1600 case FIOASYNC:
1601 if (fop->fasync == NULL)
1602 break;
1603 error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
1604 break;
1605 case FIOSETOWN:
1606 error = fsetown(*(int *)data, &filp->f_sigio);
1607 if (error == 0) {
1608 if (fop->fasync == NULL)
1609 break;
1610 error = -OPW(fp, td, fop->fasync(0, filp,
1611 fp->f_flag & FASYNC));
1612 }
1613 break;
1614 case FIOGETOWN:
1615 *(int *)data = fgetown(&filp->f_sigio);
1616 break;
1617 case FIODGNAME:
1618 #ifdef COMPAT_FREEBSD32
1619 case FIODGNAME_32:
1620 #endif
1621 if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
1622 error = ENXIO;
1623 break;
1624 }
1625 fgn = data;
1626 p = devtoname(filp->f_cdev->cdev);
1627 i = strlen(p) + 1;
1628 if (i > fgn->len) {
1629 error = EINVAL;
1630 break;
1631 }
1632 error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i);
1633 break;
1634 default:
1635 error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
1636 break;
1637 }
1638 linux_drop_fop(ldev);
1639 return (error);
1640 }
1641
1642 static int
linux_file_mmap_sub(struct thread * td,vm_size_t objsize,vm_prot_t prot,vm_prot_t maxprot,int flags,struct file * fp,vm_ooffset_t * foff,const struct file_operations * fop,vm_object_t * objp)1643 linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
1644 vm_prot_t maxprot, int flags, struct file *fp,
1645 vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
1646 {
1647 /*
1648 * Character devices do not provide private mappings
1649 * of any kind:
1650 */
1651 if ((maxprot & VM_PROT_WRITE) == 0 &&
1652 (prot & VM_PROT_WRITE) != 0)
1653 return (EACCES);
1654 if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)
1655 return (EINVAL);
1656
1657 return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
1658 (int)prot, (flags & MAP_SHARED) ? true : false, td));
1659 }
1660
1661 static int
linux_file_mmap(struct file * fp,vm_map_t map,vm_offset_t * addr,vm_size_t size,vm_prot_t prot,vm_prot_t cap_maxprot,int flags,vm_ooffset_t foff,struct thread * td)1662 linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
1663 vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
1664 struct thread *td)
1665 {
1666 struct linux_file *filp;
1667 const struct file_operations *fop;
1668 struct linux_cdev *ldev;
1669 struct mount *mp;
1670 struct vnode *vp;
1671 vm_object_t object;
1672 vm_prot_t maxprot;
1673 int error;
1674
1675 filp = (struct linux_file *)fp->f_data;
1676
1677 vp = filp->f_vnode;
1678 if (vp == NULL)
1679 return (EOPNOTSUPP);
1680
1681 /*
1682 * Ensure that file and memory protections are
1683 * compatible.
1684 */
1685 mp = vp->v_mount;
1686 if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
1687 maxprot = VM_PROT_NONE;
1688 if ((prot & VM_PROT_EXECUTE) != 0)
1689 return (EACCES);
1690 } else
1691 maxprot = VM_PROT_EXECUTE;
1692 if ((fp->f_flag & FREAD) != 0)
1693 maxprot |= VM_PROT_READ;
1694 else if ((prot & VM_PROT_READ) != 0)
1695 return (EACCES);
1696
1697 /*
1698 * If we are sharing potential changes via MAP_SHARED and we
1699 * are trying to get write permission although we opened it
1700 * without asking for it, bail out.
1701 *
1702 * Note that most character devices always share mappings.
1703 *
1704 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
1705 * requests rather than doing it here.
1706 */
1707 if ((flags & MAP_SHARED) != 0) {
1708 if ((fp->f_flag & FWRITE) != 0)
1709 maxprot |= VM_PROT_WRITE;
1710 else if ((prot & VM_PROT_WRITE) != 0)
1711 return (EACCES);
1712 }
1713 maxprot &= cap_maxprot;
1714
1715 linux_get_fop(filp, &fop, &ldev);
1716 error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp,
1717 &foff, fop, &object);
1718 if (error != 0)
1719 goto out;
1720
1721 error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
1722 foff, FALSE, td);
1723 if (error != 0)
1724 vm_object_deallocate(object);
1725 out:
1726 linux_drop_fop(ldev);
1727 return (error);
1728 }
1729
1730 static int
linux_file_stat(struct file * fp,struct stat * sb,struct ucred * active_cred)1731 linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
1732 {
1733 struct linux_file *filp;
1734 struct vnode *vp;
1735 int error;
1736
1737 filp = (struct linux_file *)fp->f_data;
1738 if (filp->f_vnode == NULL)
1739 return (EOPNOTSUPP);
1740
1741 vp = filp->f_vnode;
1742
1743 vn_lock(vp, LK_SHARED | LK_RETRY);
1744 error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED);
1745 VOP_UNLOCK(vp);
1746
1747 return (error);
1748 }
1749
1750 static int
linux_file_fill_kinfo(struct file * fp,struct kinfo_file * kif,struct filedesc * fdp)1751 linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
1752 struct filedesc *fdp)
1753 {
1754 struct linux_file *filp;
1755 struct vnode *vp;
1756 int error;
1757
1758 filp = fp->f_data;
1759 vp = filp->f_vnode;
1760 if (vp == NULL) {
1761 error = 0;
1762 kif->kf_type = KF_TYPE_DEV;
1763 } else {
1764 vref(vp);
1765 FILEDESC_SUNLOCK(fdp);
1766 error = vn_fill_kinfo_vnode(vp, kif);
1767 vrele(vp);
1768 kif->kf_type = KF_TYPE_VNODE;
1769 FILEDESC_SLOCK(fdp);
1770 }
1771 return (error);
1772 }
1773
1774 unsigned int
linux_iminor(struct inode * inode)1775 linux_iminor(struct inode *inode)
1776 {
1777 struct linux_cdev *ldev;
1778
1779 if (inode == NULL || inode->v_rdev == NULL ||
1780 inode->v_rdev->si_devsw != &linuxcdevsw)
1781 return (-1U);
1782 ldev = inode->v_rdev->si_drv1;
1783 if (ldev == NULL)
1784 return (-1U);
1785
1786 return (minor(ldev->dev));
1787 }
1788
1789 static int
linux_file_kcmp(struct file * fp1,struct file * fp2,struct thread * td)1790 linux_file_kcmp(struct file *fp1, struct file *fp2, struct thread *td)
1791 {
1792 struct linux_file *filp1, *filp2;
1793
1794 if (fp2->f_type != DTYPE_DEV)
1795 return (3);
1796
1797 filp1 = fp1->f_data;
1798 filp2 = fp2->f_data;
1799 return (kcmp_cmp((uintptr_t)filp1->f_cdev, (uintptr_t)filp2->f_cdev));
1800 }
1801
1802 const struct fileops linuxfileops = {
1803 .fo_read = linux_file_read,
1804 .fo_write = linux_file_write,
1805 .fo_truncate = invfo_truncate,
1806 .fo_kqfilter = linux_file_kqfilter,
1807 .fo_stat = linux_file_stat,
1808 .fo_fill_kinfo = linux_file_fill_kinfo,
1809 .fo_poll = linux_file_poll,
1810 .fo_close = linux_file_close,
1811 .fo_ioctl = linux_file_ioctl,
1812 .fo_mmap = linux_file_mmap,
1813 .fo_chmod = invfo_chmod,
1814 .fo_chown = invfo_chown,
1815 .fo_sendfile = invfo_sendfile,
1816 .fo_cmp = linux_file_kcmp,
1817 .fo_flags = DFLAG_PASSABLE,
1818 };
1819
1820 static char *
devm_kvasprintf(struct device * dev,gfp_t gfp,const char * fmt,va_list ap)1821 devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap)
1822 {
1823 unsigned int len;
1824 char *p;
1825 va_list aq;
1826
1827 va_copy(aq, ap);
1828 len = vsnprintf(NULL, 0, fmt, aq);
1829 va_end(aq);
1830
1831 if (dev != NULL)
1832 p = devm_kmalloc(dev, len + 1, gfp);
1833 else
1834 p = kmalloc(len + 1, gfp);
1835 if (p != NULL)
1836 vsnprintf(p, len + 1, fmt, ap);
1837
1838 return (p);
1839 }
1840
1841 char *
kvasprintf(gfp_t gfp,const char * fmt,va_list ap)1842 kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
1843 {
1844
1845 return (devm_kvasprintf(NULL, gfp, fmt, ap));
1846 }
1847
1848 char *
lkpi_devm_kasprintf(struct device * dev,gfp_t gfp,const char * fmt,...)1849 lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)
1850 {
1851 va_list ap;
1852 char *p;
1853
1854 va_start(ap, fmt);
1855 p = devm_kvasprintf(dev, gfp, fmt, ap);
1856 va_end(ap);
1857
1858 return (p);
1859 }
1860
1861 char *
kasprintf(gfp_t gfp,const char * fmt,...)1862 kasprintf(gfp_t gfp, const char *fmt, ...)
1863 {
1864 va_list ap;
1865 char *p;
1866
1867 va_start(ap, fmt);
1868 p = kvasprintf(gfp, fmt, ap);
1869 va_end(ap);
1870
1871 return (p);
1872 }
1873
1874 int
__lkpi_hexdump_printf(void * arg1 __unused,const char * fmt,...)1875 __lkpi_hexdump_printf(void *arg1 __unused, const char *fmt, ...)
1876 {
1877 va_list ap;
1878 int result;
1879
1880 va_start(ap, fmt);
1881 result = vprintf(fmt, ap);
1882 va_end(ap);
1883 return (result);
1884 }
1885
/*
 * Output callback for lkpi_hex_dump() that appends to an sbuf; "arg1" is
 * passed to sbuf_vprintf() as the destination.  Returns what
 * sbuf_vprintf() returns.
 */
int
__lkpi_hexdump_sbuf_printf(void *arg1, const char *fmt, ...)
{
	va_list args;
	int status;

	va_start(args, fmt);
	status = sbuf_vprintf(arg1, fmt, args);
	va_end(args);

	return (status);
}
1897
1898 void
lkpi_hex_dump(int (* _fpf)(void *,const char *,...),void * arg1,const char * level,const char * prefix_str,const int prefix_type,const int rowsize,const int groupsize,const void * buf,size_t len,const bool ascii,const bool trailing_newline)1899 lkpi_hex_dump(int(*_fpf)(void *, const char *, ...), void *arg1,
1900 const char *level, const char *prefix_str,
1901 const int prefix_type, const int rowsize, const int groupsize,
1902 const void *buf, size_t len, const bool ascii, const bool trailing_newline)
1903 {
1904 typedef const struct { long long value; } __packed *print_64p_t;
1905 typedef const struct { uint32_t value; } __packed *print_32p_t;
1906 typedef const struct { uint16_t value; } __packed *print_16p_t;
1907 const void *buf_old = buf;
1908 int row, linelen, ret;
1909
1910 while (len > 0) {
1911 linelen = 0;
1912 if (level != NULL) {
1913 ret = _fpf(arg1, "%s", level);
1914 if (ret < 0)
1915 break;
1916 linelen += ret;
1917 }
1918 if (prefix_str != NULL) {
1919 ret = _fpf(
1920 arg1, "%s%s", linelen ? " " : "", prefix_str);
1921 if (ret < 0)
1922 break;
1923 linelen += ret;
1924 }
1925
1926 switch (prefix_type) {
1927 case DUMP_PREFIX_ADDRESS:
1928 ret = _fpf(
1929 arg1, "%s[%p]", linelen ? " " : "", buf);
1930 if (ret < 0)
1931 return;
1932 linelen += ret;
1933 break;
1934 case DUMP_PREFIX_OFFSET:
1935 ret = _fpf(
1936 arg1, "%s[%#tx]", linelen ? " " : "",
1937 ((const char *)buf - (const char *)buf_old));
1938 if (ret < 0)
1939 return;
1940 linelen += ret;
1941 break;
1942 default:
1943 break;
1944 }
1945 for (row = 0; row != rowsize; row++) {
1946 if (groupsize == 8 && len > 7) {
1947 ret = _fpf(
1948 arg1, "%s%016llx", linelen ? " " : "",
1949 ((print_64p_t)buf)->value);
1950 if (ret < 0)
1951 return;
1952 linelen += ret;
1953 buf = (const uint8_t *)buf + 8;
1954 len -= 8;
1955 } else if (groupsize == 4 && len > 3) {
1956 ret = _fpf(
1957 arg1, "%s%08x", linelen ? " " : "",
1958 ((print_32p_t)buf)->value);
1959 if (ret < 0)
1960 return;
1961 linelen += ret;
1962 buf = (const uint8_t *)buf + 4;
1963 len -= 4;
1964 } else if (groupsize == 2 && len > 1) {
1965 ret = _fpf(
1966 arg1, "%s%04x", linelen ? " " : "",
1967 ((print_16p_t)buf)->value);
1968 if (ret < 0)
1969 return;
1970 linelen += ret;
1971 buf = (const uint8_t *)buf + 2;
1972 len -= 2;
1973 } else if (len > 0) {
1974 ret = _fpf(
1975 arg1, "%s%02x", linelen ? " " : "",
1976 *(const uint8_t *)buf);
1977 if (ret < 0)
1978 return;
1979 linelen += ret;
1980 buf = (const uint8_t *)buf + 1;
1981 len--;
1982 } else {
1983 break;
1984 }
1985 }
1986 if (len > 0 && trailing_newline) {
1987 ret = _fpf(arg1, "\n");
1988 if (ret < 0)
1989 break;
1990 }
1991 }
1992 }
1993
/*
 * State shared between lkpi_hex_dump_to_buffer() and its output callback
 * hdtb_cb().
 */
struct hdtb_context {
	char *linebuf;		/* next write position in the caller's buffer */
	size_t linebuflen;	/* space left at linebuf, in bytes */
	int written;		/* sum of the lengths reported by vsnprintf() */
};
1999
2000 static int
hdtb_cb(void * arg,const char * format,...)2001 hdtb_cb(void *arg, const char *format, ...)
2002 {
2003 struct hdtb_context *context;
2004 int written;
2005 va_list args;
2006
2007 context = arg;
2008
2009 va_start(args, format);
2010 written = vsnprintf(
2011 context->linebuf, context->linebuflen, format, args);
2012 va_end(args);
2013
2014 if (written < 0)
2015 return (written);
2016
2017 /*
2018 * Linux' hex_dump_to_buffer() function has the same behaviour as
2019 * snprintf() basically. Therefore, it returns the number of bytes it
2020 * would have written if the destination buffer was large enough.
2021 *
2022 * If the destination buffer was exhausted, lkpi_hex_dump() will
2023 * continue to call this callback but it will only compute the bytes it
2024 * would have written but write nothing to that buffer.
2025 */
2026 context->written += written;
2027
2028 if (written < context->linebuflen) {
2029 context->linebuf += written;
2030 context->linebuflen -= written;
2031 } else {
2032 context->linebuf += context->linebuflen;
2033 context->linebuflen = 0;
2034 }
2035
2036 return (written);
2037 }
2038
2039 int
lkpi_hex_dump_to_buffer(const void * buf,size_t len,int rowsize,int groupsize,char * linebuf,size_t linebuflen,bool ascii)2040 lkpi_hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
2041 int groupsize, char *linebuf, size_t linebuflen, bool ascii)
2042 {
2043 int written;
2044 struct hdtb_context context;
2045
2046 context.linebuf = linebuf;
2047 context.linebuflen = linebuflen;
2048 context.written = 0;
2049
2050 if (rowsize != 16 && rowsize != 32)
2051 rowsize = 16;
2052
2053 len = min(len, rowsize);
2054
2055 lkpi_hex_dump(
2056 hdtb_cb, &context, NULL, NULL, DUMP_PREFIX_NONE,
2057 rowsize, groupsize, buf, len, ascii, false);
2058
2059 written = context.written;
2060
2061 return (written);
2062 }
2063
2064 static void
linux_timer_callback_wrapper(void * context)2065 linux_timer_callback_wrapper(void *context)
2066 {
2067 struct timer_list *timer;
2068
2069 timer = context;
2070
2071 /* the timer is about to be shutdown permanently */
2072 if (timer->function == NULL)
2073 return;
2074
2075 if (linux_set_current_flags(curthread, M_NOWAIT)) {
2076 /* try again later */
2077 callout_reset(&timer->callout, 1,
2078 &linux_timer_callback_wrapper, timer);
2079 return;
2080 }
2081
2082 timer->function(timer->data);
2083 }
2084
2085 static int
linux_timer_jiffies_until(unsigned long expires)2086 linux_timer_jiffies_until(unsigned long expires)
2087 {
2088 unsigned long delta = expires - jiffies;
2089
2090 /*
2091 * Guard against already expired values and make sure that the value can
2092 * be used as a tick count, rather than a jiffies count.
2093 */
2094 if ((long)delta < 1)
2095 delta = 1;
2096 else if (delta > INT_MAX)
2097 delta = INT_MAX;
2098 return ((int)delta);
2099 }
2100
2101 int
mod_timer(struct timer_list * timer,unsigned long expires)2102 mod_timer(struct timer_list *timer, unsigned long expires)
2103 {
2104 int ret;
2105
2106 timer->expires = expires;
2107 ret = callout_reset(&timer->callout,
2108 linux_timer_jiffies_until(expires),
2109 &linux_timer_callback_wrapper, timer);
2110
2111 MPASS(ret == 0 || ret == 1);
2112
2113 return (ret == 1);
2114 }
2115
2116 void
add_timer(struct timer_list * timer)2117 add_timer(struct timer_list *timer)
2118 {
2119
2120 callout_reset(&timer->callout,
2121 linux_timer_jiffies_until(timer->expires),
2122 &linux_timer_callback_wrapper, timer);
2123 }
2124
2125 void
add_timer_on(struct timer_list * timer,int cpu)2126 add_timer_on(struct timer_list *timer, int cpu)
2127 {
2128
2129 callout_reset_on(&timer->callout,
2130 linux_timer_jiffies_until(timer->expires),
2131 &linux_timer_callback_wrapper, timer, cpu);
2132 }
2133
2134 int
timer_delete(struct timer_list * timer)2135 timer_delete(struct timer_list *timer)
2136 {
2137
2138 if (callout_stop(&(timer)->callout) == -1)
2139 return (0);
2140 return (1);
2141 }
2142
2143 int
timer_delete_sync(struct timer_list * timer)2144 timer_delete_sync(struct timer_list *timer)
2145 {
2146
2147 if (callout_drain(&(timer)->callout) == -1)
2148 return (0);
2149 return (1);
2150 }
2151
2152 int
timer_shutdown_sync(struct timer_list * timer)2153 timer_shutdown_sync(struct timer_list *timer)
2154 {
2155
2156 timer->function = NULL;
2157 return (del_timer_sync(timer));
2158 }
2159
/*
 * Greatest common divisor of "a" and "b", computed with the Euclidean
 * algorithm.  When "b" is 0 the result is "a", so gcd(0, 0) is 0.
 */
static uint64_t
lkpi_gcd_64(uint64_t a, uint64_t b)
{
	uint64_t rem;

	/* replace (a, b) by (b, a mod b) until the remainder vanishes */
	for (; b != 0; b = rem) {
		rem = a % b;
		a = b;
	}
	return (a);
}
2175
/*
 * Constants for the nsec/usec/msec to hz conversions.  The "div" values
 * start out as the number of units per second; linux_timer_init() fills
 * in "rem" and "max" and reduces "div" by the common divisor with hz.
 */

/* units per second, reduced at init time */
uint64_t lkpi_nsec2hz_div = 1000000000ULL;
uint64_t lkpi_usec2hz_div = 1000000ULL;
uint64_t lkpi_msec2hz_div = 1000ULL;

/* hz divided by the common divisor, set at init time */
uint64_t lkpi_nsec2hz_rem;
uint64_t lkpi_usec2hz_rem;
uint64_t lkpi_msec2hz_rem;

/* -1ULL divided by the matching "rem" value, set at init time */
uint64_t lkpi_nsec2hz_max;
uint64_t lkpi_usec2hz_max;
uint64_t lkpi_msec2hz_max;
2187
2188 static void
linux_timer_init(void * arg)2189 linux_timer_init(void *arg)
2190 {
2191 uint64_t gcd;
2192
2193 /*
2194 * Compute an internal HZ value which can divide 2**32 to
2195 * avoid timer rounding problems when the tick value wraps
2196 * around 2**32:
2197 */
2198 linux_timer_hz_mask = 1;
2199 while (linux_timer_hz_mask < (unsigned long)hz)
2200 linux_timer_hz_mask *= 2;
2201 linux_timer_hz_mask--;
2202
2203 /* compute some internal constants */
2204
2205 lkpi_nsec2hz_rem = hz;
2206 lkpi_usec2hz_rem = hz;
2207 lkpi_msec2hz_rem = hz;
2208
2209 gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div);
2210 lkpi_nsec2hz_rem /= gcd;
2211 lkpi_nsec2hz_div /= gcd;
2212 lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem;
2213
2214 gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div);
2215 lkpi_usec2hz_rem /= gcd;
2216 lkpi_usec2hz_div /= gcd;
2217 lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem;
2218
2219 gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div);
2220 lkpi_msec2hz_rem /= gcd;
2221 lkpi_msec2hz_div /= gcd;
2222 lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem;
2223 }
2224 SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
2225
2226 void
linux_complete_common(struct completion * c,int all)2227 linux_complete_common(struct completion *c, int all)
2228 {
2229 sleepq_lock(c);
2230 if (all) {
2231 c->done = UINT_MAX;
2232 sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
2233 } else {
2234 if (c->done != UINT_MAX)
2235 c->done++;
2236 sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
2237 }
2238 sleepq_release(c);
2239 }
2240
2241 /*
2242 * Indefinite wait for done != 0 with or without signals.
2243 */
2244 int
linux_wait_for_common(struct completion * c,int flags)2245 linux_wait_for_common(struct completion *c, int flags)
2246 {
2247 struct task_struct *task;
2248 int error;
2249
2250 if (SCHEDULER_STOPPED())
2251 return (0);
2252
2253 task = current;
2254
2255 if (flags != 0)
2256 flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
2257 else
2258 flags = SLEEPQ_SLEEP;
2259 error = 0;
2260 for (;;) {
2261 sleepq_lock(c);
2262 if (c->done)
2263 break;
2264 sleepq_add(c, NULL, "completion", flags, 0);
2265 if (flags & SLEEPQ_INTERRUPTIBLE) {
2266 DROP_GIANT();
2267 error = -sleepq_wait_sig(c, 0);
2268 PICKUP_GIANT();
2269 if (error != 0) {
2270 linux_schedule_save_interrupt_value(task, error);
2271 error = -ERESTARTSYS;
2272 goto intr;
2273 }
2274 } else {
2275 DROP_GIANT();
2276 sleepq_wait(c, 0);
2277 PICKUP_GIANT();
2278 }
2279 }
2280 if (c->done != UINT_MAX)
2281 c->done--;
2282 sleepq_release(c);
2283
2284 intr:
2285 return (error);
2286 }
2287
2288 /*
2289 * Time limited wait for done != 0 with or without signals.
2290 */
2291 unsigned long
linux_wait_for_timeout_common(struct completion * c,unsigned long timeout,int flags)2292 linux_wait_for_timeout_common(struct completion *c, unsigned long timeout,
2293 int flags)
2294 {
2295 struct task_struct *task;
2296 unsigned long end = jiffies + timeout, error;
2297
2298 if (SCHEDULER_STOPPED())
2299 return (0);
2300
2301 task = current;
2302
2303 if (flags != 0)
2304 flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
2305 else
2306 flags = SLEEPQ_SLEEP;
2307
2308 for (;;) {
2309 sleepq_lock(c);
2310 if (c->done)
2311 break;
2312 sleepq_add(c, NULL, "completion", flags, 0);
2313 sleepq_set_timeout(c, linux_timer_jiffies_until(end));
2314
2315 DROP_GIANT();
2316 if (flags & SLEEPQ_INTERRUPTIBLE)
2317 error = -sleepq_timedwait_sig(c, 0);
2318 else
2319 error = -sleepq_timedwait(c, 0);
2320 PICKUP_GIANT();
2321
2322 if (error != 0) {
2323 /* check for timeout */
2324 if (error == -EWOULDBLOCK) {
2325 error = 0; /* timeout */
2326 } else {
2327 /* signal happened */
2328 linux_schedule_save_interrupt_value(task, error);
2329 error = -ERESTARTSYS;
2330 }
2331 goto done;
2332 }
2333 }
2334 if (c->done != UINT_MAX)
2335 c->done--;
2336 sleepq_release(c);
2337
2338 /* return how many jiffies are left */
2339 error = linux_timer_jiffies_until(end);
2340 done:
2341 return (error);
2342 }
2343
2344 int
linux_try_wait_for_completion(struct completion * c)2345 linux_try_wait_for_completion(struct completion *c)
2346 {
2347 int isdone;
2348
2349 sleepq_lock(c);
2350 isdone = (c->done != 0);
2351 if (c->done != 0 && c->done != UINT_MAX)
2352 c->done--;
2353 sleepq_release(c);
2354 return (isdone);
2355 }
2356
2357 int
linux_completion_done(struct completion * c)2358 linux_completion_done(struct completion *c)
2359 {
2360 int isdone;
2361
2362 sleepq_lock(c);
2363 isdone = (c->done != 0);
2364 sleepq_release(c);
2365 return (isdone);
2366 }
2367
2368 static void
linux_cdev_deref(struct linux_cdev * ldev)2369 linux_cdev_deref(struct linux_cdev *ldev)
2370 {
2371 if (refcount_release(&ldev->refs) &&
2372 ldev->kobj.ktype == &linux_cdev_ktype)
2373 kfree(ldev);
2374 }
2375
2376 static void
linux_cdev_release(struct kobject * kobj)2377 linux_cdev_release(struct kobject *kobj)
2378 {
2379 struct linux_cdev *cdev;
2380 struct kobject *parent;
2381
2382 cdev = container_of(kobj, struct linux_cdev, kobj);
2383 parent = kobj->parent;
2384 linux_destroy_dev(cdev);
2385 linux_cdev_deref(cdev);
2386 kobject_put(parent);
2387 }
2388
2389 static void
linux_cdev_static_release(struct kobject * kobj)2390 linux_cdev_static_release(struct kobject *kobj)
2391 {
2392 struct cdev *cdev;
2393 struct linux_cdev *ldev;
2394
2395 ldev = container_of(kobj, struct linux_cdev, kobj);
2396 cdev = ldev->cdev;
2397 if (cdev != NULL) {
2398 destroy_dev(cdev);
2399 ldev->cdev = NULL;
2400 }
2401 kobject_put(kobj->parent);
2402 }
2403
2404 int
linux_cdev_device_add(struct linux_cdev * ldev,struct device * dev)2405 linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev)
2406 {
2407 int ret;
2408
2409 if (dev->devt != 0) {
2410 /* Set parent kernel object. */
2411 ldev->kobj.parent = &dev->kobj;
2412
2413 /*
2414 * Unlike Linux we require the kobject of the
2415 * character device structure to have a valid name
2416 * before calling this function:
2417 */
2418 if (ldev->kobj.name == NULL)
2419 return (-EINVAL);
2420
2421 ret = cdev_add(ldev, dev->devt, 1);
2422 if (ret)
2423 return (ret);
2424 }
2425 ret = device_add(dev);
2426 if (ret != 0 && dev->devt != 0)
2427 cdev_del(ldev);
2428 return (ret);
2429 }
2430
2431 void
linux_cdev_device_del(struct linux_cdev * ldev,struct device * dev)2432 linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev)
2433 {
2434 device_del(dev);
2435
2436 if (dev->devt != 0)
2437 cdev_del(ldev);
2438 }
2439
2440 static void
linux_destroy_dev(struct linux_cdev * ldev)2441 linux_destroy_dev(struct linux_cdev *ldev)
2442 {
2443
2444 if (ldev->cdev == NULL)
2445 return;
2446
2447 MPASS((ldev->siref & LDEV_SI_DTR) == 0);
2448 MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
2449
2450 atomic_set_int(&ldev->siref, LDEV_SI_DTR);
2451 while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
2452 pause("ldevdtr", hz / 4);
2453
2454 destroy_dev(ldev->cdev);
2455 ldev->cdev = NULL;
2456 }
2457
/*
 * kobject type whose release callback is linux_cdev_release(), which
 * also drops the reference that can free the linux_cdev structure.
 */
const struct kobj_type linux_cdev_ktype = {
	.release = linux_cdev_release,
};

/*
 * kobject type whose release callback is linux_cdev_static_release(),
 * which destroys the backing device but never frees the structure.
 */
const struct kobj_type linux_cdev_static_ktype = {
	.release = linux_cdev_static_release,
};
2465
2466 static void
linux_handle_ifnet_link_event(void * arg,struct ifnet * ifp,int linkstate)2467 linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
2468 {
2469 struct notifier_block *nb;
2470 struct netdev_notifier_info ni;
2471
2472 nb = arg;
2473 ni.ifp = ifp;
2474 ni.dev = (struct net_device *)ifp;
2475 if (linkstate == LINK_STATE_UP)
2476 nb->notifier_call(nb, NETDEV_UP, &ni);
2477 else
2478 nb->notifier_call(nb, NETDEV_DOWN, &ni);
2479 }
2480
2481 static void
linux_handle_ifnet_arrival_event(void * arg,struct ifnet * ifp)2482 linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
2483 {
2484 struct notifier_block *nb;
2485 struct netdev_notifier_info ni;
2486
2487 nb = arg;
2488 ni.ifp = ifp;
2489 ni.dev = (struct net_device *)ifp;
2490 nb->notifier_call(nb, NETDEV_REGISTER, &ni);
2491 }
2492
2493 static void
linux_handle_ifnet_departure_event(void * arg,struct ifnet * ifp)2494 linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
2495 {
2496 struct notifier_block *nb;
2497 struct netdev_notifier_info ni;
2498
2499 nb = arg;
2500 ni.ifp = ifp;
2501 ni.dev = (struct net_device *)ifp;
2502 nb->notifier_call(nb, NETDEV_UNREGISTER, &ni);
2503 }
2504
2505 static void
linux_handle_iflladdr_event(void * arg,struct ifnet * ifp)2506 linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
2507 {
2508 struct notifier_block *nb;
2509 struct netdev_notifier_info ni;
2510
2511 nb = arg;
2512 ni.ifp = ifp;
2513 ni.dev = (struct net_device *)ifp;
2514 nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni);
2515 }
2516
2517 static void
linux_handle_ifaddr_event(void * arg,struct ifnet * ifp)2518 linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
2519 {
2520 struct notifier_block *nb;
2521 struct netdev_notifier_info ni;
2522
2523 nb = arg;
2524 ni.ifp = ifp;
2525 ni.dev = (struct net_device *)ifp;
2526 nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni);
2527 }
2528
2529 int
register_netdevice_notifier(struct notifier_block * nb)2530 register_netdevice_notifier(struct notifier_block *nb)
2531 {
2532
2533 nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
2534 ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
2535 nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
2536 ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
2537 nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
2538 ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
2539 nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
2540 iflladdr_event, linux_handle_iflladdr_event, nb, 0);
2541
2542 return (0);
2543 }
2544
2545 int
register_inetaddr_notifier(struct notifier_block * nb)2546 register_inetaddr_notifier(struct notifier_block *nb)
2547 {
2548
2549 nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
2550 ifaddr_event, linux_handle_ifaddr_event, nb, 0);
2551 return (0);
2552 }
2553
2554 int
unregister_netdevice_notifier(struct notifier_block * nb)2555 unregister_netdevice_notifier(struct notifier_block *nb)
2556 {
2557
2558 EVENTHANDLER_DEREGISTER(ifnet_link_event,
2559 nb->tags[NETDEV_UP]);
2560 EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
2561 nb->tags[NETDEV_REGISTER]);
2562 EVENTHANDLER_DEREGISTER(ifnet_departure_event,
2563 nb->tags[NETDEV_UNREGISTER]);
2564 EVENTHANDLER_DEREGISTER(iflladdr_event,
2565 nb->tags[NETDEV_CHANGEADDR]);
2566
2567 return (0);
2568 }
2569
2570 int
unregister_inetaddr_notifier(struct notifier_block * nb)2571 unregister_inetaddr_notifier(struct notifier_block *nb)
2572 {
2573
2574 EVENTHANDLER_DEREGISTER(ifaddr_event,
2575 nb->tags[NETDEV_CHANGEIFADDR]);
2576
2577 return (0);
2578 }
2579
/*
 * Context handed to qsort_r() by list_sort(): the caller's comparison
 * function together with the private pointer to pass to it.
 */
struct list_sort_thunk {
	int (*cmp)(void *, struct list_head *, struct list_head *);
	void *priv;	/* first argument of every cmp() call */
};
2584
2585 static inline int
linux_le_cmp(const void * d1,const void * d2,void * priv)2586 linux_le_cmp(const void *d1, const void *d2, void *priv)
2587 {
2588 struct list_head *le1, *le2;
2589 struct list_sort_thunk *thunk;
2590
2591 thunk = priv;
2592 le1 = *(__DECONST(struct list_head **, d1));
2593 le2 = *(__DECONST(struct list_head **, d2));
2594 return ((thunk->cmp)(thunk->priv, le1, le2));
2595 }
2596
2597 void
list_sort(void * priv,struct list_head * head,int (* cmp)(void * priv,struct list_head * a,struct list_head * b))2598 list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
2599 struct list_head *a, struct list_head *b))
2600 {
2601 struct list_sort_thunk thunk;
2602 struct list_head **ar, *le;
2603 size_t count, i;
2604
2605 count = 0;
2606 list_for_each(le, head)
2607 count++;
2608 ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
2609 i = 0;
2610 list_for_each(le, head)
2611 ar[i++] = le;
2612 thunk.cmp = cmp;
2613 thunk.priv = priv;
2614 qsort_r(ar, count, sizeof(struct list_head *), linux_le_cmp, &thunk);
2615 INIT_LIST_HEAD(head);
2616 for (i = 0; i < count; i++)
2617 list_add_tail(ar[i], head);
2618 free(ar, M_KMALLOC);
2619 }
2620
#if defined(__i386__) || defined(__amd64__)
/*
 * x86 only: implemented by calling pmap_invalidate_cache().  Always
 * returns 0.
 */
int
linux_wbinvd_on_all_cpus(void)
{
	pmap_invalidate_cache();

	return (0);
}
#endif
2630
2631 int
linux_on_each_cpu(void callback (void *),void * data)2632 linux_on_each_cpu(void callback(void *), void *data)
2633 {
2634
2635 smp_rendezvous(smp_no_rendezvous_barrier, callback,
2636 smp_no_rendezvous_barrier, data);
2637 return (0);
2638 }
2639
2640 int
linux_in_atomic(void)2641 linux_in_atomic(void)
2642 {
2643
2644 return ((curthread->td_pflags & TDP_NOFAULTING) != 0);
2645 }
2646
2647 struct linux_cdev *
linux_find_cdev(const char * name,unsigned major,unsigned minor)2648 linux_find_cdev(const char *name, unsigned major, unsigned minor)
2649 {
2650 dev_t dev = MKDEV(major, minor);
2651 struct cdev *cdev;
2652
2653 dev_lock();
2654 LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
2655 struct linux_cdev *ldev = cdev->si_drv1;
2656 if (ldev->dev == dev &&
2657 strcmp(kobject_name(&ldev->kobj), name) == 0) {
2658 break;
2659 }
2660 }
2661 dev_unlock();
2662
2663 return (cdev != NULL ? cdev->si_drv1 : NULL);
2664 }
2665
2666 int
__register_chrdev(unsigned int major,unsigned int baseminor,unsigned int count,const char * name,const struct file_operations * fops)2667 __register_chrdev(unsigned int major, unsigned int baseminor,
2668 unsigned int count, const char *name,
2669 const struct file_operations *fops)
2670 {
2671 struct linux_cdev *cdev;
2672 int ret = 0;
2673 int i;
2674
2675 for (i = baseminor; i < baseminor + count; i++) {
2676 cdev = cdev_alloc();
2677 cdev->ops = fops;
2678 kobject_set_name(&cdev->kobj, name);
2679
2680 ret = cdev_add(cdev, makedev(major, i), 1);
2681 if (ret != 0)
2682 break;
2683 }
2684 return (ret);
2685 }
2686
2687 int
__register_chrdev_p(unsigned int major,unsigned int baseminor,unsigned int count,const char * name,const struct file_operations * fops,uid_t uid,gid_t gid,int mode)2688 __register_chrdev_p(unsigned int major, unsigned int baseminor,
2689 unsigned int count, const char *name,
2690 const struct file_operations *fops, uid_t uid,
2691 gid_t gid, int mode)
2692 {
2693 struct linux_cdev *cdev;
2694 int ret = 0;
2695 int i;
2696
2697 for (i = baseminor; i < baseminor + count; i++) {
2698 cdev = cdev_alloc();
2699 cdev->ops = fops;
2700 kobject_set_name(&cdev->kobj, name);
2701
2702 ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
2703 if (ret != 0)
2704 break;
2705 }
2706 return (ret);
2707 }
2708
2709 void
__unregister_chrdev(unsigned int major,unsigned int baseminor,unsigned int count,const char * name)2710 __unregister_chrdev(unsigned int major, unsigned int baseminor,
2711 unsigned int count, const char *name)
2712 {
2713 struct linux_cdev *cdevp;
2714 int i;
2715
2716 for (i = baseminor; i < baseminor + count; i++) {
2717 cdevp = linux_find_cdev(name, major, i);
2718 if (cdevp != NULL)
2719 cdev_del(cdevp);
2720 }
2721 }
2722
/*
 * Capture the current stack with stack_save() and print it with
 * stack_print().  Compiles to an empty function unless STACK is
 * defined.
 */
void
linux_dump_stack(void)
{
#ifdef STACK
	struct stack trace;

	stack_save(&trace);
	stack_print(&trace);
#endif
}
2733
2734 int
linuxkpi_net_ratelimit(void)2735 linuxkpi_net_ratelimit(void)
2736 {
2737
2738 return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps,
2739 lkpi_net_maxpps));
2740 }
2741
2742 struct io_mapping *
io_mapping_create_wc(resource_size_t base,unsigned long size)2743 io_mapping_create_wc(resource_size_t base, unsigned long size)
2744 {
2745 struct io_mapping *mapping;
2746
2747 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
2748 if (mapping == NULL)
2749 return (NULL);
2750 return (io_mapping_init_wc(mapping, base, size));
2751 }
2752
2753 /* We likely want a linuxkpi_device.c at some point. */
2754 bool
device_can_wakeup(struct device * dev)2755 device_can_wakeup(struct device *dev)
2756 {
2757
2758 if (dev == NULL)
2759 return (false);
2760 /*
2761 * XXX-BZ iwlwifi queries it as part of enabling WoWLAN.
2762 * Normally this would be based on a bool in dev->power.XXX.
2763 * Check such as PCI PCIM_PCAP_*PME. We have no way to enable this yet.
2764 * We may get away by directly calling into bsddev for as long as
2765 * we can assume PCI only avoiding changing struct device breaking KBI.
2766 */
2767 pr_debug("%s:%d: not enabled; see comment.\n", __func__, __LINE__);
2768 return (false);
2769 }
2770
2771 static void
devm_device_group_remove(struct device * dev,void * p)2772 devm_device_group_remove(struct device *dev, void *p)
2773 {
2774 const struct attribute_group **dr = p;
2775 const struct attribute_group *group = *dr;
2776
2777 sysfs_remove_group(&dev->kobj, group);
2778 }
2779
2780 int
lkpi_devm_device_add_group(struct device * dev,const struct attribute_group * group)2781 lkpi_devm_device_add_group(struct device *dev,
2782 const struct attribute_group *group)
2783 {
2784 const struct attribute_group **dr;
2785 int ret;
2786
2787 dr = devres_alloc(devm_device_group_remove, sizeof(*dr), GFP_KERNEL);
2788 if (dr == NULL)
2789 return (-ENOMEM);
2790
2791 ret = sysfs_create_group(&dev->kobj, group);
2792 if (ret == 0) {
2793 *dr = group;
2794 devres_add(dev, dr);
2795 } else
2796 devres_free(dr);
2797
2798 return (ret);
2799 }
2800
#if defined(__i386__) || defined(__amd64__)
/* x86 CPU information; all three are filled in by linux_compat_init(). */
bool linux_cpu_has_clflush;		/* cpu_feature & CPUID_CLFSH */
struct cpuinfo_x86 boot_cpu_data;
struct cpuinfo_x86 *__cpu_data;		/* array of mp_maxid + 1 entries */
#endif
2806
2807 cpumask_t *
lkpi_get_static_single_cpu_mask(int cpuid)2808 lkpi_get_static_single_cpu_mask(int cpuid)
2809 {
2810
2811 KASSERT((cpuid >= 0 && cpuid <= mp_maxid), ("%s: invalid cpuid %d\n",
2812 __func__, cpuid));
2813 KASSERT(!CPU_ABSENT(cpuid), ("%s: cpu with cpuid %d is absent\n",
2814 __func__, cpuid));
2815
2816 return (static_single_cpu_mask[cpuid]);
2817 }
2818
2819 bool
lkpi_xen_initial_domain(void)2820 lkpi_xen_initial_domain(void)
2821 {
2822 #ifdef XENHVM
2823 return (xen_initial_domain());
2824 #else
2825 return (false);
2826 #endif
2827 }
2828
2829 bool
lkpi_xen_pv_domain(void)2830 lkpi_xen_pv_domain(void)
2831 {
2832 #ifdef XENHVM
2833 return (xen_pv_domain());
2834 #else
2835 return (false);
2836 #endif
2837 }
2838
2839 static void
linux_compat_init(void * arg)2840 linux_compat_init(void *arg)
2841 {
2842 struct sysctl_oid *rootoid;
2843 int i;
2844
2845 #if defined(__i386__) || defined(__amd64__)
2846 static const uint32_t x86_vendors[X86_VENDOR_NUM] = {
2847 [X86_VENDOR_INTEL] = CPU_VENDOR_INTEL,
2848 [X86_VENDOR_CYRIX] = CPU_VENDOR_CYRIX,
2849 [X86_VENDOR_AMD] = CPU_VENDOR_AMD,
2850 [X86_VENDOR_UMC] = CPU_VENDOR_UMC,
2851 [X86_VENDOR_CENTAUR] = CPU_VENDOR_CENTAUR,
2852 [X86_VENDOR_TRANSMETA] = CPU_VENDOR_TRANSMETA,
2853 [X86_VENDOR_NSC] = CPU_VENDOR_NSC,
2854 [X86_VENDOR_HYGON] = CPU_VENDOR_HYGON,
2855 };
2856 uint8_t x86_vendor = X86_VENDOR_UNKNOWN;
2857
2858 for (i = 0; i < X86_VENDOR_NUM; i++) {
2859 if (cpu_vendor_id != 0 && cpu_vendor_id == x86_vendors[i]) {
2860 x86_vendor = i;
2861 break;
2862 }
2863 }
2864 linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
2865 boot_cpu_data.x86_clflush_size = cpu_clflush_line_size;
2866 boot_cpu_data.x86_max_cores = mp_ncpus;
2867 boot_cpu_data.x86 = CPUID_TO_FAMILY(cpu_id);
2868 boot_cpu_data.x86_model = CPUID_TO_MODEL(cpu_id);
2869 boot_cpu_data.x86_vendor = x86_vendor;
2870
2871 __cpu_data = kmalloc_array(mp_maxid + 1,
2872 sizeof(*__cpu_data), M_WAITOK | M_ZERO);
2873 CPU_FOREACH(i) {
2874 __cpu_data[i].x86_clflush_size = cpu_clflush_line_size;
2875 __cpu_data[i].x86_max_cores = mp_ncpus;
2876 __cpu_data[i].x86 = CPUID_TO_FAMILY(cpu_id);
2877 __cpu_data[i].x86_model = CPUID_TO_MODEL(cpu_id);
2878 __cpu_data[i].x86_vendor = x86_vendor;
2879 }
2880 #endif
2881 rw_init(&linux_vma_lock, "lkpi-vma-lock");
2882
2883 rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
2884 OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
2885 kobject_init(&linux_class_root, &linux_class_ktype);
2886 kobject_set_name(&linux_class_root, "class");
2887 linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
2888 OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
2889 kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
2890 kobject_set_name(&linux_root_device.kobj, "device");
2891 linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
2892 SYSCTL_CHILDREN(rootoid), OID_AUTO, "device",
2893 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device");
2894 linux_root_device.bsddev = root_bus;
2895 linux_class_misc.name = "misc";
2896 class_register(&linux_class_misc);
2897 INIT_LIST_HEAD(&pci_drivers);
2898 INIT_LIST_HEAD(&pci_devices);
2899 spin_lock_init(&pci_lock);
2900 init_waitqueue_head(&linux_bit_waitq);
2901 init_waitqueue_head(&linux_var_waitq);
2902
2903 CPU_COPY(&all_cpus, &cpu_online_mask);
2904 /*
2905 * Generate a single-CPU cpumask_t for each CPU (possibly) in the system.
2906 * CPUs are indexed from 0..(mp_maxid). The entry for cpuid 0 will only
2907 * have itself in the cpumask, cupid 1 only itself on entry 1, and so on.
2908 * This is used by cpumask_of() (and possibly others in the future) for,
2909 * e.g., drivers to pass hints to irq_set_affinity_hint().
2910 */
2911 static_single_cpu_mask = kmalloc_array(mp_maxid + 1,
2912 sizeof(static_single_cpu_mask), M_WAITOK | M_ZERO);
2913
2914 /*
2915 * When the number of CPUs reach a threshold, we start to save memory
2916 * given the sets are static by overlapping those having their single
2917 * bit set at same position in a bitset word. Asymptotically, this
2918 * regular scheme is in O(n²) whereas the overlapping one is in O(n)
2919 * only with n being the maximum number of CPUs, so the gain will become
2920 * huge quite quickly. The threshold for 64-bit architectures is 128
2921 * CPUs.
2922 */
2923 if (mp_ncpus < (2 * _BITSET_BITS)) {
2924 cpumask_t *sscm_ptr;
2925
2926 /*
2927 * This represents 'mp_ncpus * __bitset_words(CPU_SETSIZE) *
2928 * (_BITSET_BITS / 8)' bytes (for comparison with the
2929 * overlapping scheme).
2930 */
2931 static_single_cpu_mask_lcs = kmalloc_array(mp_ncpus,
2932 sizeof(*static_single_cpu_mask_lcs),
2933 M_WAITOK | M_ZERO);
2934
2935 sscm_ptr = static_single_cpu_mask_lcs;
2936 CPU_FOREACH(i) {
2937 static_single_cpu_mask[i] = sscm_ptr++;
2938 CPU_SET(i, static_single_cpu_mask[i]);
2939 }
2940 } else {
2941 /* Pointer to a bitset word. */
2942 __typeof(((cpuset_t *)NULL)->__bits[0]) *bwp;
2943
2944 /*
2945 * Allocate memory for (static) spans of 'cpumask_t' ('cpuset_t'
2946 * really) with a single bit set that can be reused for all
2947 * single CPU masks by making them start at different offsets.
2948 * We need '__bitset_words(CPU_SETSIZE) - 1' bitset words before
2949 * the word having its single bit set, and the same amount
2950 * after.
2951 */
2952 static_single_cpu_mask_lcs = mallocarray(_BITSET_BITS,
2953 (2 * __bitset_words(CPU_SETSIZE) - 1) * (_BITSET_BITS / 8),
2954 M_KMALLOC, M_WAITOK | M_ZERO);
2955
2956 /*
2957 * We rely below on cpuset_t and the bitset generic
2958 * implementation assigning words in the '__bits' array in the
2959 * same order of bits (i.e., little-endian ordering, not to be
2960 * confused with machine endianness, which concerns bits in
2961 * words and other integers). This is an imperfect test, but it
2962 * will detect a change to big-endian ordering.
2963 */
2964 _Static_assert(
2965 __bitset_word(_BITSET_BITS + 1, _BITSET_BITS) == 1,
2966 "Assumes a bitset implementation that is little-endian "
2967 "on its words");
2968
2969 /* Initialize the single bit of each static span. */
2970 bwp = (__typeof(bwp))static_single_cpu_mask_lcs +
2971 (__bitset_words(CPU_SETSIZE) - 1);
2972 for (i = 0; i < _BITSET_BITS; i++) {
2973 CPU_SET(i, (cpuset_t *)bwp);
2974 bwp += (2 * __bitset_words(CPU_SETSIZE) - 1);
2975 }
2976
2977 /*
2978 * Finally set all CPU masks to the proper word in their
2979 * relevant span.
2980 */
2981 CPU_FOREACH(i) {
2982 bwp = (__typeof(bwp))static_single_cpu_mask_lcs;
2983 /* Find the non-zero word of the relevant span. */
2984 bwp += (2 * __bitset_words(CPU_SETSIZE) - 1) *
2985 (i % _BITSET_BITS) +
2986 __bitset_words(CPU_SETSIZE) - 1;
2987 /* Shift to find the CPU mask start. */
2988 bwp -= (i / _BITSET_BITS);
2989 static_single_cpu_mask[i] = (cpuset_t *)bwp;
2990 }
2991 }
2992
2993 strlcpy(init_uts_ns.name.release, osrelease, sizeof(init_uts_ns.name.release));
2994 }
2995 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
2996
2997 static void
linux_compat_uninit(void * arg)2998 linux_compat_uninit(void *arg)
2999 {
3000 linux_kobject_kfree_name(&linux_class_root);
3001 linux_kobject_kfree_name(&linux_root_device.kobj);
3002 linux_kobject_kfree_name(&linux_class_misc.kobj);
3003
3004 free(static_single_cpu_mask_lcs, M_KMALLOC);
3005 free(static_single_cpu_mask, M_KMALLOC);
3006 #if defined(__i386__) || defined(__amd64__)
3007 free(__cpu_data, M_KMALLOC);
3008 #endif
3009
3010 spin_lock_destroy(&pci_lock);
3011 rw_destroy(&linux_vma_lock);
3012 }
3013 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
3014
/*
 * NOTE: Linux frequently uses "unsigned long" for pointer to integer
 * conversion and vice versa, where FreeBSD would use "uintptr_t".
 * Assert at compile time that these types have the same size, because
 * otherwise some parts of the LinuxKPI may not work as expected:
 */
CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
3022