xref: /freebsd/sys/compat/linux/linux_misc.c (revision 7ef62cebc2f965b0f640263e179276928885e33d)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1994-1995 Søren Schmidt
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer
13  *    in this position and unchanged.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. The name of the author may not be used to endorse or promote products
18  *    derived from this software without specific prior written permission
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <sys/param.h>
36 #include <sys/fcntl.h>
37 #include <sys/jail.h>
38 #include <sys/imgact.h>
39 #include <sys/limits.h>
40 #include <sys/lock.h>
41 #include <sys/msgbuf.h>
42 #include <sys/mutex.h>
43 #include <sys/poll.h>
44 #include <sys/priv.h>
45 #include <sys/proc.h>
46 #include <sys/procctl.h>
47 #include <sys/reboot.h>
48 #include <sys/random.h>
49 #include <sys/resourcevar.h>
50 #include <sys/rtprio.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/stat.h>
54 #include <sys/syscallsubr.h>
55 #include <sys/sysctl.h>
56 #include <sys/sysent.h>
57 #include <sys/sysproto.h>
58 #include <sys/time.h>
59 #include <sys/vmmeter.h>
60 #include <sys/vnode.h>
61 
62 #include <security/audit/audit.h>
63 #include <security/mac/mac_framework.h>
64 
65 #include <vm/pmap.h>
66 #include <vm/vm_map.h>
67 #include <vm/swap_pager.h>
68 
69 #ifdef COMPAT_LINUX32
70 #include <machine/../linux32/linux.h>
71 #include <machine/../linux32/linux32_proto.h>
72 #else
73 #include <machine/../linux/linux.h>
74 #include <machine/../linux/linux_proto.h>
75 #endif
76 
77 #include <compat/linux/linux_common.h>
78 #include <compat/linux/linux_dtrace.h>
79 #include <compat/linux/linux_file.h>
80 #include <compat/linux/linux_mib.h>
81 #include <compat/linux/linux_signal.h>
82 #include <compat/linux/linux_time.h>
83 #include <compat/linux/linux_util.h>
84 #include <compat/linux/linux_sysproto.h>
85 #include <compat/linux/linux_emul.h>
86 #include <compat/linux/linux_misc.h>
87 
88 int stclohz;				/* Statistics clock frequency */
89 
/*
 * Map Linux RLIMIT_* indices (array position) to FreeBSD RLIMIT_* values;
 * consumers treat an entry of -1 as "no native equivalent".
 */
static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
	RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
	RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
	RLIMIT_MEMLOCK, RLIMIT_AS
};
95 
96 struct l_sysinfo {
97 	l_long		uptime;		/* Seconds since boot */
98 	l_ulong		loads[3];	/* 1, 5, and 15 minute load averages */
99 #define LINUX_SYSINFO_LOADS_SCALE 65536
100 	l_ulong		totalram;	/* Total usable main memory size */
101 	l_ulong		freeram;	/* Available memory size */
102 	l_ulong		sharedram;	/* Amount of shared memory */
103 	l_ulong		bufferram;	/* Memory used by buffers */
104 	l_ulong		totalswap;	/* Total swap space size */
105 	l_ulong		freeswap;	/* swap space still available */
106 	l_ushort	procs;		/* Number of current processes */
107 	l_ushort	pads;
108 	l_ulong		totalhigh;
109 	l_ulong		freehigh;
110 	l_uint		mem_unit;
111 	char		_f[20-2*sizeof(l_long)-sizeof(l_int)];	/* padding */
112 };
113 
/* Sixth argument of Linux pselect6(2): presumably a sigset pointer/size pair. */
struct l_pselect6arg {
	l_uintptr_t	ss;	/* user pointer to the signal set */
	l_size_t	ss_len;	/* size of that set in bytes */
};
118 
119 static int	linux_utimensat_lts_to_ts(struct l_timespec *,
120 			struct timespec *);
121 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
122 static int	linux_utimensat_lts64_to_ts(struct l_timespec64 *,
123 			struct timespec *);
124 #endif
125 static int	linux_common_utimensat(struct thread *, int,
126 			const char *, struct timespec *, int);
127 static int	linux_common_pselect6(struct thread *, l_int,
128 			l_fd_set *, l_fd_set *, l_fd_set *,
129 			struct timespec *, l_uintptr_t *);
130 static int	linux_common_ppoll(struct thread *, struct pollfd *,
131 			uint32_t, struct timespec *, l_sigset_t *,
132 			l_size_t);
133 static int	linux_pollin(struct thread *, struct pollfd *,
134 			struct pollfd *, u_int);
135 static int	linux_pollout(struct thread *, struct pollfd *,
136 			struct pollfd *, u_int);
137 
138 int
139 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
140 {
141 	struct l_sysinfo sysinfo;
142 	int i, j;
143 	struct timespec ts;
144 
145 	bzero(&sysinfo, sizeof(sysinfo));
146 	getnanouptime(&ts);
147 	if (ts.tv_nsec != 0)
148 		ts.tv_sec++;
149 	sysinfo.uptime = ts.tv_sec;
150 
151 	/* Use the information from the mib to get our load averages */
152 	for (i = 0; i < 3; i++)
153 		sysinfo.loads[i] = averunnable.ldavg[i] *
154 		    LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
155 
156 	sysinfo.totalram = physmem * PAGE_SIZE;
157 	sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE;
158 
159 	/*
160 	 * sharedram counts pages allocated to named, swap-backed objects such
161 	 * as shared memory segments and tmpfs files.  There is no cheap way to
162 	 * compute this, so just leave the field unpopulated.  Linux itself only
163 	 * started setting this field in the 3.x timeframe.
164 	 */
165 	sysinfo.sharedram = 0;
166 	sysinfo.bufferram = 0;
167 
168 	swap_pager_status(&i, &j);
169 	sysinfo.totalswap = i * PAGE_SIZE;
170 	sysinfo.freeswap = (i - j) * PAGE_SIZE;
171 
172 	sysinfo.procs = nprocs;
173 
174 	/*
175 	 * Platforms supported by the emulation layer do not have a notion of
176 	 * high memory.
177 	 */
178 	sysinfo.totalhigh = 0;
179 	sysinfo.freehigh = 0;
180 
181 	sysinfo.mem_unit = 1;
182 
183 	return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
184 }
185 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * alarm(2): arm the real-time interval timer and return the number of
 * seconds (rounded to the nearest second, sub-second residue rounding up
 * to 1) remaining on any previously-armed timer.
 */
int
linux_alarm(struct thread *td, struct linux_alarm_args *args)
{
	struct itimerval it, old_it;
	u_int secs;
	int error __diagused;

	secs = args->secs;
	/*
	 * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
	 * to match kern_setitimer()'s limit to avoid error from it.
	 *
	 * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
	 * platforms.
	 */
	if (secs > INT32_MAX / 2)
		secs = INT32_MAX / 2;

	it.it_value.tv_sec = secs;
	it.it_value.tv_usec = 0;
	/* One-shot timer: no reload interval. */
	timevalclear(&it.it_interval);
	error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
	KASSERT(error == 0, ("kern_setitimer returns %d", error));

	/*
	 * Round the remaining time for the return value: a pending
	 * sub-second remainder reports as 1, and >= half a second rounds up.
	 */
	if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
	    old_it.it_value.tv_usec >= 500000)
		old_it.it_value.tv_sec++;
	td->td_retval[0] = old_it.it_value.tv_sec;
	return (0);
}
#endif
218 
/*
 * brk(2): Linux returns the resulting break address, never an error.
 * On success the new break is returned; on failure (or when the request
 * lies at or below the start of the data segment) the current break is
 * returned unchanged, which also serves as the "query" case.
 */
int
linux_brk(struct thread *td, struct linux_brk_args *args)
{
	struct vmspace *vm = td->td_proc->p_vmspace;
	uintptr_t new, old;

	/* Current break: start of the data segment plus its size. */
	old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
	new = (uintptr_t)args->dsend;
	if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
		td->td_retval[0] = (register_t)new;
	else
		td->td_retval[0] = (register_t)old;

	return (0);
}
234 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * select(2): unlike the native syscall, Linux writes the time remaining
 * back into the timeout argument on return, so sample the clock around
 * the call and copy the residue out.
 */
int
linux_select(struct thread *td, struct linux_select_args *args)
{
	l_timeval ltv;
	struct timeval tv0, tv1, utv, *tvp;
	int error;

	/*
	 * Store current time for computation of the amount of
	 * time left.
	 */
	if (args->timeout) {
		if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
			goto select_out;
		utv.tv_sec = ltv.tv_sec;
		utv.tv_usec = ltv.tv_usec;

		if (itimerfix(&utv)) {
			/*
			 * The timeval was invalid.  Convert it to something
			 * valid that will act as it does under Linux.
			 */
			utv.tv_sec += utv.tv_usec / 1000000;
			utv.tv_usec %= 1000000;
			if (utv.tv_usec < 0) {
				utv.tv_sec -= 1;
				utv.tv_usec += 1000000;
			}
			if (utv.tv_sec < 0)
				timevalclear(&utv);
		}
		microtime(&tv0);
		tvp = &utv;
	} else
		tvp = NULL;

	error = kern_select(td, args->nfds, args->readfds, args->writefds,
	    args->exceptfds, tvp, LINUX_NFDBITS);
	if (error)
		goto select_out;

	if (args->timeout) {
		if (td->td_retval[0]) {
			/*
			 * Compute how much time was left of the timeout,
			 * by subtracting the current time and the time
			 * before we started the call, and subtracting
			 * that result from the user-supplied value.
			 */
			microtime(&tv1);
			timevalsub(&tv1, &tv0);
			timevalsub(&utv, &tv1);
			if (utv.tv_sec < 0)
				timevalclear(&utv);
		} else
			timevalclear(&utv);
		/* Copy the remaining time back out in the Linux layout. */
		ltv.tv_sec = utv.tv_sec;
		ltv.tv_usec = utv.tv_usec;
		if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
			goto select_out;
	}

select_out:
	return (error);
}
#endif
302 
303 int
304 linux_mremap(struct thread *td, struct linux_mremap_args *args)
305 {
306 	uintptr_t addr;
307 	size_t len;
308 	int error = 0;
309 
310 	if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
311 		td->td_retval[0] = 0;
312 		return (EINVAL);
313 	}
314 
315 	/*
316 	 * Check for the page alignment.
317 	 * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
318 	 */
319 	if (args->addr & PAGE_MASK) {
320 		td->td_retval[0] = 0;
321 		return (EINVAL);
322 	}
323 
324 	args->new_len = round_page(args->new_len);
325 	args->old_len = round_page(args->old_len);
326 
327 	if (args->new_len > args->old_len) {
328 		td->td_retval[0] = 0;
329 		return (ENOMEM);
330 	}
331 
332 	if (args->new_len < args->old_len) {
333 		addr = args->addr + args->new_len;
334 		len = args->old_len - args->new_len;
335 		error = kern_munmap(td, addr, len);
336 	}
337 
338 	td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
339 	return (error);
340 }
341 
342 #define LINUX_MS_ASYNC       0x0001
343 #define LINUX_MS_INVALIDATE  0x0002
344 #define LINUX_MS_SYNC        0x0004
345 
346 int
347 linux_msync(struct thread *td, struct linux_msync_args *args)
348 {
349 
350 	return (kern_msync(td, args->addr, args->len,
351 	    args->fl & ~LINUX_MS_SYNC));
352 }
353 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * time(2): return seconds since the Epoch, optionally storing the same
 * value through the user pointer as well.
 */
int
linux_time(struct thread *td, struct linux_time_args *args)
{
	struct timeval now;
	l_time_t secs;
	int error;

	microtime(&now);
	secs = now.tv_sec;
	if (args->tm != NULL) {
		error = copyout(&secs, args->tm, sizeof(secs));
		if (error != 0)
			return (error);
	}
	td->td_retval[0] = secs;
	return (0);
}
#endif
370 
/* Layout of the Linux times(2) buffer; all fields are in clock ticks. */
struct l_times_argv {
	l_clock_t	tms_utime;	/* caller's user CPU time */
	l_clock_t	tms_stime;	/* caller's system CPU time */
	l_clock_t	tms_cutime;	/* children's user CPU time */
	l_clock_t	tms_cstime;	/* children's system CPU time */
};
377 
378 /*
379  * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
380  * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
381  * auxiliary vector entry.
382  */
383 #define	CLK_TCK		100
384 
385 #define	CONVOTCK(r)	(r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
386 #define	CONVNTCK(r)	(r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
387 
388 #define	CONVTCK(r)	(linux_kernver(td) >= LINUX_KERNVER(2,4,0) ?	\
389 			    CONVNTCK(r) : CONVOTCK(r))
390 
/*
 * times(2): report the caller's and its waited-for children's CPU times
 * in clock ticks, and return the system uptime in ticks.  A NULL buffer
 * skips the copyout.
 */
int
linux_times(struct thread *td, struct linux_times_args *args)
{
	struct timeval tv, utime, stime, cutime, cstime;
	struct l_times_argv tms;
	struct proc *p;
	int error;

	if (args->buf != NULL) {
		p = td->td_proc;
		PROC_LOCK(p);
		/* calcru() is called with the process stat lock held. */
		PROC_STATLOCK(p);
		calcru(p, &utime, &stime);
		PROC_STATUNLOCK(p);
		calccru(p, &cutime, &cstime);
		PROC_UNLOCK(p);

		/* CONVTCK() picks the tick rate based on the emulated kernel version. */
		tms.tms_utime = CONVTCK(utime);
		tms.tms_stime = CONVTCK(stime);

		tms.tms_cutime = CONVTCK(cutime);
		tms.tms_cstime = CONVTCK(cstime);

		if ((error = copyout(&tms, args->buf, sizeof(tms))))
			return (error);
	}

	/* The return value is the uptime expressed in clock ticks. */
	microuptime(&tv);
	td->td_retval[0] = (int)CONVTCK(tv);
	return (0);
}
422 
/*
 * uname(2): fill in a Linux struct new_utsname from the emulated OS
 * name/release, the thread's hostname and domainname, and the kernel
 * version string.  The machine field is fixed per architecture.
 */
int
linux_newuname(struct thread *td, struct linux_newuname_args *args)
{
	struct l_new_utsname utsname;
	char osname[LINUX_MAX_UTSNAME];
	char osrelease[LINUX_MAX_UTSNAME];
	char *p;

	linux_get_osname(td, osname);
	linux_get_osrelease(td, osrelease);

	bzero(&utsname, sizeof(utsname));
	strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
	getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
	getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
	strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
	strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
	/* The version string may span lines; keep only the first line. */
	for (p = utsname.version; *p != '\0'; ++p)
		if (*p == '\n') {
			*p = '\0';
			break;
		}
#if defined(__amd64__)
	/*
	 * On amd64, Linux uname(2) needs to return "x86_64"
	 * for both 64-bit and 32-bit applications.  On 32-bit,
	 * the string returned by getauxval(AT_PLATFORM) needs
	 * to remain "i686", though.
	 */
#if defined(COMPAT_LINUX32)
	if (linux32_emulate_i386)
		strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
	else
#endif
	strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME);
#elif defined(__aarch64__)
	strlcpy(utsname.machine, "aarch64", LINUX_MAX_UTSNAME);
#elif defined(__i386__)
	strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
#endif

	return (copyout(&utsname, args->buf, sizeof(utsname)));
}
466 
/* Linux struct utimbuf: file times expressed in whole seconds. */
struct l_utimbuf {
	l_time_t l_actime;	/* access time */
	l_time_t l_modtime;	/* modification time */
};
471 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * utime(2): set access/modification times from a Linux utimbuf, or to
 * the current time when the argument is NULL.
 */
int
linux_utime(struct thread *td, struct linux_utime_args *args)
{
	struct l_utimbuf lut;
	struct timeval times[2], *timesp = NULL;
	int error;

	if (args->times != NULL) {
		error = copyin(args->times, &lut, sizeof(lut));
		if (error != 0)
			return (error);
		/* The Linux utimbuf carries whole seconds only. */
		times[0].tv_sec = lut.l_actime;
		times[0].tv_usec = 0;
		times[1].tv_sec = lut.l_modtime;
		times[1].tv_usec = 0;
		timesp = times;
	}

	return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
	    timesp, UIO_SYSSPACE));
}
#endif
495 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * utimes(2): set file times from a pair of Linux timevals, or to the
 * current time when the pointer is NULL.
 */
int
linux_utimes(struct thread *td, struct linux_utimes_args *args)
{
	l_timeval usertimes[2];
	struct timeval times[2], *timesp = NULL;
	int error, i;

	if (args->tptr != NULL) {
		error = copyin(args->tptr, usertimes, sizeof(usertimes));
		if (error != 0)
			return (error);
		/* Widen each Linux timeval to the native representation. */
		for (i = 0; i < 2; i++) {
			times[i].tv_sec = usertimes[i].tv_sec;
			times[i].tv_usec = usertimes[i].tv_usec;
		}
		timesp = times;
	}

	return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
	    timesp, UIO_SYSSPACE));
}
#endif
518 
519 static int
520 linux_utimensat_lts_to_ts(struct l_timespec *l_times, struct timespec *times)
521 {
522 
523 	if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
524 	    l_times->tv_nsec != LINUX_UTIME_NOW &&
525 	    (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
526 		return (EINVAL);
527 
528 	times->tv_sec = l_times->tv_sec;
529 	switch (l_times->tv_nsec)
530 	{
531 	case LINUX_UTIME_OMIT:
532 		times->tv_nsec = UTIME_OMIT;
533 		break;
534 	case LINUX_UTIME_NOW:
535 		times->tv_nsec = UTIME_NOW;
536 		break;
537 	default:
538 		times->tv_nsec = l_times->tv_nsec;
539 	}
540 
541 	return (0);
542 }
543 
/*
 * Common backend for utimensat(2) and utimensat_time64(2): translate the
 * Linux dfd and AT_* flags, then dispatch to kern_utimensat() when a
 * path is given or kern_futimens() when operating on the fd itself.
 */
static int
linux_common_utimensat(struct thread *td, int ldfd, const char *pathname,
    struct timespec *timesp, int lflags)
{
	int dfd, flags = 0;

	dfd = (ldfd == LINUX_AT_FDCWD) ? AT_FDCWD : ldfd;

	if (lflags & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH))
		return (EINVAL);

	if (timesp != NULL) {
		/* This breaks POSIX, but is what the Linux kernel does
		 * _on purpose_ (documented in the man page for utimensat(2)),
		 * so we must follow that behaviour. */
		if (timesp[0].tv_nsec == UTIME_OMIT &&
		    timesp[1].tv_nsec == UTIME_OMIT)
			return (0);
	}

	/* Translate the Linux AT_* flags to their native counterparts. */
	if (lflags & LINUX_AT_SYMLINK_NOFOLLOW)
		flags |= AT_SYMLINK_NOFOLLOW;
	if (lflags & LINUX_AT_EMPTY_PATH)
		flags |= AT_EMPTY_PATH;

	if (pathname != NULL)
		return (kern_utimensat(td, dfd, pathname,
		    UIO_USERSPACE, timesp, UIO_SYSSPACE, flags));

	/* With a NULL path no flags are accepted; act on the fd itself. */
	if (lflags != 0)
		return (EINVAL);

	return (kern_futimens(td, dfd, timesp, UIO_SYSSPACE));
}
578 
579 int
580 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
581 {
582 	struct l_timespec l_times[2];
583 	struct timespec times[2], *timesp;
584 	int error;
585 
586 	if (args->times != NULL) {
587 		error = copyin(args->times, l_times, sizeof(l_times));
588 		if (error != 0)
589 			return (error);
590 
591 		error = linux_utimensat_lts_to_ts(&l_times[0], &times[0]);
592 		if (error != 0)
593 			return (error);
594 		error = linux_utimensat_lts_to_ts(&l_times[1], &times[1]);
595 		if (error != 0)
596 			return (error);
597 		timesp = times;
598 	} else
599 		timesp = NULL;
600 
601 	return (linux_common_utimensat(td, args->dfd, args->pathname,
602 	    timesp, args->flags));
603 }
604 
605 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
606 static int
607 linux_utimensat_lts64_to_ts(struct l_timespec64 *l_times, struct timespec *times)
608 {
609 
610 	/* Zero out the padding in compat mode. */
611 	l_times->tv_nsec &= 0xFFFFFFFFUL;
612 
613 	if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
614 	    l_times->tv_nsec != LINUX_UTIME_NOW &&
615 	    (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
616 		return (EINVAL);
617 
618 	times->tv_sec = l_times->tv_sec;
619 	switch (l_times->tv_nsec)
620 	{
621 	case LINUX_UTIME_OMIT:
622 		times->tv_nsec = UTIME_OMIT;
623 		break;
624 	case LINUX_UTIME_NOW:
625 		times->tv_nsec = UTIME_NOW;
626 		break;
627 	default:
628 		times->tv_nsec = l_times->tv_nsec;
629 	}
630 
631 	return (0);
632 }
633 
634 int
635 linux_utimensat_time64(struct thread *td, struct linux_utimensat_time64_args *args)
636 {
637 	struct l_timespec64 l_times[2];
638 	struct timespec times[2], *timesp;
639 	int error;
640 
641 	if (args->times64 != NULL) {
642 		error = copyin(args->times64, l_times, sizeof(l_times));
643 		if (error != 0)
644 			return (error);
645 
646 		error = linux_utimensat_lts64_to_ts(&l_times[0], &times[0]);
647 		if (error != 0)
648 			return (error);
649 		error = linux_utimensat_lts64_to_ts(&l_times[1], &times[1]);
650 		if (error != 0)
651 			return (error);
652 		timesp = times;
653 	} else
654 		timesp = NULL;
655 
656 	return (linux_common_utimensat(td, args->dfd, args->pathname,
657 	    timesp, args->flags));
658 }
659 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
660 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * futimesat(2): utimes(2) relative to a directory descriptor; a NULL
 * times pointer sets the current time.
 */
int
linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
{
	l_timeval usertimes[2];
	struct timeval times[2], *timesp = NULL;
	int dfd, error, i;

	/* Map Linux's AT_FDCWD onto the native value. */
	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;

	if (args->utimes != NULL) {
		error = copyin(args->utimes, usertimes, sizeof(usertimes));
		if (error != 0)
			return (error);
		/* Widen each Linux timeval to the native representation. */
		for (i = 0; i < 2; i++) {
			times[i].tv_sec = usertimes[i].tv_sec;
			times[i].tv_usec = usertimes[i].tv_usec;
		}
		timesp = times;
	}

	return (kern_utimesat(td, dfd, args->filename, UIO_USERSPACE,
	    timesp, UIO_SYSSPACE));
}
#endif
685 
/*
 * Common backend for the wait family: run kern_wait6() and translate the
 * exit status, rusage, and siginfo into their Linux representations
 * before copying them out.  Any of statusp/rup/infop may be NULL.
 */
static int
linux_common_wait(struct thread *td, idtype_t idtype, int id, int *statusp,
    int options, void *rup, l_siginfo_t *infop)
{
	l_siginfo_t lsi;
	siginfo_t siginfo;
	struct __wrusage wru;
	int error, status, tmpstat, sig;

	error = kern_wait6(td, idtype, id, &status, options,
	    rup != NULL ? &wru : NULL, &siginfo);

	if (error == 0 && statusp) {
		tmpstat = status & 0xffff;
		if (WIFSIGNALED(tmpstat)) {
			/* Replace the termination signal (low 7 bits). */
			tmpstat = (tmpstat & 0xffffff80) |
			    bsd_to_linux_signal(WTERMSIG(tmpstat));
		} else if (WIFSTOPPED(tmpstat)) {
			/* Replace the stop signal (bits 8-15). */
			tmpstat = (tmpstat & 0xffff00ff) |
			    (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
#if defined(__aarch64__) || (defined(__amd64__) && !defined(COMPAT_LINUX32))
			if (WSTOPSIG(status) == SIGTRAP) {
				/* Translate ptrace stop events for debuggers. */
				tmpstat = linux_ptrace_status(td,
				    siginfo.si_pid, tmpstat);
			}
#endif
		} else if (WIFCONTINUED(tmpstat)) {
			/* Linux encodes a continued child as 0xffff. */
			tmpstat = 0xffff;
		}
		error = copyout(&tmpstat, statusp, sizeof(int));
	}
	if (error == 0 && rup != NULL)
		error = linux_copyout_rusage(&wru.wru_self, rup);
	/* Only fill in siginfo when a child was actually reaped. */
	if (error == 0 && infop != NULL && td->td_retval[0] != 0) {
		sig = bsd_to_linux_signal(siginfo.si_signo);
		siginfo_to_lsiginfo(&siginfo, &lsi, sig);
		error = copyout(&lsi, infop, sizeof(lsi));
	}

	return (error);
}
727 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/* waitpid(2) is simply wait4(2) without a rusage pointer. */
int
linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
{
	struct linux_wait4_args wa = {
		.pid = args->pid,
		.status = args->status,
		.options = args->options,
		.rusage = NULL,
	};

	return (linux_wait4(td, &wa));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
742 
/*
 * wait4(2): validate the Linux wait options, then map the pid argument
 * the way Linux does: -1 waits for any child, pid < -1 for process group
 * -pid, 0 for the caller's own process group, and pid > 0 for that
 * specific process.
 */
int
linux_wait4(struct thread *td, struct linux_wait4_args *args)
{
	struct proc *p;
	int options, id, idtype;

	if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
	    LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
		return (EINVAL);

	/* -INT_MIN is not defined. */
	if (args->pid == INT_MIN)
		return (ESRCH);

	options = 0;
	linux_to_bsd_waitopts(args->options, &options);

	/*
	 * For backward compatibility we implicitly add flags WEXITED
	 * and WTRAPPED here.
	 */
	options |= WEXITED | WTRAPPED;

	if (args->pid == WAIT_ANY) {
		idtype = P_ALL;
		id = 0;
	} else if (args->pid < 0) {
		idtype = P_PGID;
		id = (id_t)-args->pid;
	} else if (args->pid == 0) {
		/* Wait for children in the caller's own process group. */
		idtype = P_PGID;
		p = td->td_proc;
		PROC_LOCK(p);
		id = p->p_pgid;
		PROC_UNLOCK(p);
	} else {
		idtype = P_PID;
		id = (id_t)args->pid;
	}

	return (linux_common_wait(td, idtype, id, args->status, options,
	    args->rusage, NULL));
}
786 
/*
 * waitid(2): translate the Linux idtype and options, then delegate the
 * wait and result conversion to linux_common_wait().  P_PIDFD is not
 * implemented.
 */
int
linux_waitid(struct thread *td, struct linux_waitid_args *args)
{
	idtype_t idtype;
	int error, options;
	struct proc *p;
	pid_t id;

	if (args->options & ~(LINUX_WNOHANG | LINUX_WNOWAIT | LINUX_WEXITED |
	    LINUX_WSTOPPED | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
		return (EINVAL);

	options = 0;
	linux_to_bsd_waitopts(args->options, &options);

	id = args->id;
	switch (args->idtype) {
	case LINUX_P_ALL:
		idtype = P_ALL;
		break;
	case LINUX_P_PID:
		if (args->id <= 0)
			return (EINVAL);
		idtype = P_PID;
		break;
	case LINUX_P_PGID:
		/* Since Linux 5.4, an id of 0 means the caller's own group. */
		if (linux_kernver(td) >= LINUX_KERNVER(5,4,0) && args->id == 0) {
			p = td->td_proc;
			PROC_LOCK(p);
			id = p->p_pgid;
			PROC_UNLOCK(p);
		} else if (args->id <= 0)
			return (EINVAL);
		idtype = P_PGID;
		break;
	case LINUX_P_PIDFD:
		LINUX_RATELIMIT_MSG("unsupported waitid P_PIDFD idtype");
		return (ENOSYS);
	default:
		return (EINVAL);
	}

	error = linux_common_wait(td, idtype, id, NULL, options,
	    args->rusage, args->info);
	/* waitid() returns 0 on success, not the child's pid. */
	td->td_retval[0] = 0;

	return (error);
}
835 
#ifdef LINUX_LEGACY_SYSCALLS
/*
 * mknod(2): dispatch on the file type embedded in the mode.  FIFOs and
 * sockets go through kern_mkfifoat(), devices through kern_mknodat(),
 * regular files (a zero type implies S_IFREG) are created via open+close,
 * and directories are rejected with EPERM.
 */
int
linux_mknod(struct thread *td, struct linux_mknod_args *args)
{
	int error;

	switch (args->mode & S_IFMT) {
	case S_IFIFO:
	case S_IFSOCK:
		error = kern_mkfifoat(td, AT_FDCWD, args->path, UIO_USERSPACE,
		    args->mode);
		break;

	case S_IFCHR:
	case S_IFBLK:
		error = kern_mknodat(td, AT_FDCWD, args->path, UIO_USERSPACE,
		    args->mode, linux_decode_dev(args->dev));
		break;

	case S_IFDIR:
		error = EPERM;
		break;

	case 0:
		args->mode |= S_IFREG;
		/* FALLTHROUGH */
	case S_IFREG:
		error = kern_openat(td, AT_FDCWD, args->path, UIO_USERSPACE,
		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
		/* The descriptor is only a side effect; close it again. */
		if (error == 0)
			kern_close(td, td->td_retval[0]);
		break;

	default:
		error = EINVAL;
		break;
	}
	return (error);
}
#endif
876 
/*
 * mknodat(2): like mknod(2), but relative to the directory descriptor
 * dfd.  FIFOs and sockets go through kern_mkfifoat(), devices through
 * kern_mknodat(), regular files (a zero type implies S_IFREG) are
 * created via open+close, and directories are rejected with EPERM.
 */
int
linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
{
	int error, dfd;

	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;

	switch (args->mode & S_IFMT) {
	case S_IFIFO:
	case S_IFSOCK:
		error = kern_mkfifoat(td, dfd, args->filename, UIO_USERSPACE,
		    args->mode);
		break;

	case S_IFCHR:
	case S_IFBLK:
		error = kern_mknodat(td, dfd, args->filename, UIO_USERSPACE,
		    args->mode, linux_decode_dev(args->dev));
		break;

	case S_IFDIR:
		error = EPERM;
		break;

	case 0:
		args->mode |= S_IFREG;
		/* FALLTHROUGH */
	case S_IFREG:
		error = kern_openat(td, dfd, args->filename, UIO_USERSPACE,
		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
		/* The descriptor is only a side effect; close it again. */
		if (error == 0)
			kern_close(td, td->td_retval[0]);
		break;

	default:
		error = EINVAL;
		break;
	}
	return (error);
}
917 
918 /*
919  * UGH! This is just about the dumbest idea I've ever heard!!
920  */
921 int
922 linux_personality(struct thread *td, struct linux_personality_args *args)
923 {
924 	struct linux_pemuldata *pem;
925 	struct proc *p = td->td_proc;
926 	uint32_t old;
927 
928 	PROC_LOCK(p);
929 	pem = pem_find(p);
930 	old = pem->persona;
931 	if (args->per != 0xffffffff)
932 		pem->persona = args->per;
933 	PROC_UNLOCK(p);
934 
935 	td->td_retval[0] = old;
936 	return (0);
937 }
938 
/* Linux layout of struct itimerval, built from l_timeval members. */
struct l_itimerval {
	l_timeval it_interval;	/* timer reload interval */
	l_timeval it_value;	/* time until next expiration */
};
943 
944 #define	B2L_ITIMERVAL(bip, lip)						\
945 	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
946 	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
947 	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
948 	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;
949 
/*
 * setitimer(2): install a new interval timer, optionally returning the
 * previous one through oitv.
 */
int
linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
{
	int error;
	struct l_itimerval ls;
	struct itimerval aitv, oitv;

	if (uap->itv == NULL) {
		/*
		 * A NULL new value only queries the current timer; reuse
		 * the getitimer path, whose argument layout is compatible.
		 */
		uap->itv = uap->oitv;
		return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
	}

	error = copyin(uap->itv, &ls, sizeof(ls));
	if (error != 0)
		return (error);
	B2L_ITIMERVAL(&aitv, &ls);
	error = kern_setitimer(td, uap->which, &aitv, &oitv);
	if (error != 0 || uap->oitv == NULL)
		return (error);
	/* Hand the previous timer value back in the Linux layout. */
	B2L_ITIMERVAL(&ls, &oitv);

	return (copyout(&ls, uap->oitv, sizeof(ls)));
}
973 
974 int
975 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
976 {
977 	int error;
978 	struct l_itimerval ls;
979 	struct itimerval aitv;
980 
981 	error = kern_getitimer(td, uap->which, &aitv);
982 	if (error != 0)
983 		return (error);
984 	B2L_ITIMERVAL(&ls, &aitv);
985 	return (copyout(&ls, uap->itv, sizeof(ls)));
986 }
987 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/* nice(2): adjust the calling process's priority by the given increment. */
int
linux_nice(struct thread *td, struct linux_nice_args *args)
{

	return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
996 
/*
 * setgroups(2): install a new supplementary group set.  FreeBSD keeps
 * the egid in cr_groups[0] while Linux sets do not include it, so the
 * supplied groups are stored starting at cr_groups[1].
 */
int
linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
{
	struct ucred *newcred, *oldcred;
	l_gid_t *linux_gidset;
	gid_t *bsd_gidset;
	int ngrp, error;
	struct proc *p;

	ngrp = args->gidsetsize;
	if (ngrp < 0 || ngrp >= ngroups_max + 1)
		return (EINVAL);
	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
	error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
	if (error)
		goto out;
	newcred = crget();
	/* Reserve room for the egid slot plus the whole new set. */
	crextend(newcred, ngrp + 1);
	p = td->td_proc;
	PROC_LOCK(p);
	oldcred = p->p_ucred;
	crcopy(newcred, oldcred);

	/*
	 * cr_groups[0] holds egid. Setting the whole set from
	 * the supplied set will cause egid to be changed too.
	 * Keep cr_groups[0] unchanged to prevent that.
	 */

	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) {
		PROC_UNLOCK(p);
		crfree(newcred);
		goto out;
	}

	if (ngrp > 0) {
		newcred->cr_ngroups = ngrp + 1;

		/* Copy the Linux set into cr_groups[1..ngrp]. */
		bsd_gidset = newcred->cr_groups;
		ngrp--;
		while (ngrp >= 0) {
			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
			ngrp--;
		}
	} else
		newcred->cr_ngroups = 1;

	setsugid(p);
	proc_set_cred(p, newcred);
	PROC_UNLOCK(p);
	crfree(oldcred);
	error = 0;
out:
	free(linux_gidset, M_LINUX);
	return (error);
}
1053 
/*
 * getgroups(2): copy out the supplementary group set, excluding the
 * egid stored in cr_groups[0].  A gidsetsize of 0 only reports how
 * many groups there are.
 */
int
linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
{
	struct ucred *cred;
	l_gid_t *linux_gidset;
	gid_t *bsd_gidset;
	int bsd_gidsetsz, ngrp, error;

	cred = td->td_ucred;
	bsd_gidset = cred->cr_groups;
	bsd_gidsetsz = cred->cr_ngroups - 1;

	/*
	 * cr_groups[0] holds egid. Returning the whole set
	 * here will cause a duplicate. Exclude cr_groups[0]
	 * to prevent that.
	 */

	if ((ngrp = args->gidsetsize) == 0) {
		td->td_retval[0] = bsd_gidsetsz;
		return (0);
	}

	/* The user buffer must be able to hold the whole set. */
	if (ngrp < bsd_gidsetsz)
		return (EINVAL);

	ngrp = 0;
	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
	    M_LINUX, M_WAITOK);
	while (ngrp < bsd_gidsetsz) {
		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
		ngrp++;
	}

	error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
	free(linux_gidset, M_LINUX);
	if (error)
		return (error);

	td->td_retval[0] = ngrp;
	return (0);
}
1096 
1097 static bool
1098 linux_get_dummy_limit(l_uint resource, struct rlimit *rlim)
1099 {
1100 
1101 	if (linux_dummy_rlimits == 0)
1102 		return (false);
1103 
1104 	switch (resource) {
1105 	case LINUX_RLIMIT_LOCKS:
1106 	case LINUX_RLIMIT_SIGPENDING:
1107 	case LINUX_RLIMIT_MSGQUEUE:
1108 	case LINUX_RLIMIT_RTTIME:
1109 		rlim->rlim_cur = LINUX_RLIM_INFINITY;
1110 		rlim->rlim_max = LINUX_RLIM_INFINITY;
1111 		return (true);
1112 	case LINUX_RLIMIT_NICE:
1113 	case LINUX_RLIMIT_RTPRIO:
1114 		rlim->rlim_cur = 0;
1115 		rlim->rlim_max = 0;
1116 		return (true);
1117 	default:
1118 		return (false);
1119 	}
1120 }
1121 
1122 int
1123 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1124 {
1125 	struct rlimit bsd_rlim;
1126 	struct l_rlimit rlim;
1127 	u_int which;
1128 	int error;
1129 
1130 	if (args->resource >= LINUX_RLIM_NLIMITS)
1131 		return (EINVAL);
1132 
1133 	which = linux_to_bsd_resource[args->resource];
1134 	if (which == -1)
1135 		return (EINVAL);
1136 
1137 	error = copyin(args->rlim, &rlim, sizeof(rlim));
1138 	if (error)
1139 		return (error);
1140 
1141 	bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1142 	bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1143 	return (kern_setrlimit(td, which, &bsd_rlim));
1144 }
1145 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * Legacy getrlimit(2) with a 32-bit wide l_rlimit structure.
 */
int
linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
{
	struct l_rlimit rlim;
	struct rlimit bsd_rlim;
	u_int which;

	/* Resources that are only emulated get canned dummy values. */
	if (linux_get_dummy_limit(args->resource, &bsd_rlim)) {
		rlim.rlim_cur = bsd_rlim.rlim_cur;
		rlim.rlim_max = bsd_rlim.rlim_max;
		return (copyout(&rlim, args->rlim, sizeof(rlim)));
	}

	if (args->resource >= LINUX_RLIM_NLIMITS)
		return (EINVAL);

	which = linux_to_bsd_resource[args->resource];
	if (which == -1)
		return (EINVAL);

	lim_rlimit(td, which, &bsd_rlim);

	/*
	 * Truncate to the narrower Linux type; an all-ones pattern
	 * (infinity after truncation) is clamped to the maximum
	 * positive value so old binaries do not see it as negative.
	 */
#ifdef COMPAT_LINUX32
	rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
	if (rlim.rlim_cur == UINT_MAX)
		rlim.rlim_cur = INT_MAX;
	rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
	if (rlim.rlim_max == UINT_MAX)
		rlim.rlim_max = INT_MAX;
#else
	rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
	if (rlim.rlim_cur == ULONG_MAX)
		rlim.rlim_cur = LONG_MAX;
	rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
	if (rlim.rlim_max == ULONG_MAX)
		rlim.rlim_max = LONG_MAX;
#endif
	return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1187 
1188 int
1189 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1190 {
1191 	struct l_rlimit rlim;
1192 	struct rlimit bsd_rlim;
1193 	u_int which;
1194 
1195 	if (linux_get_dummy_limit(args->resource, &bsd_rlim)) {
1196 		rlim.rlim_cur = bsd_rlim.rlim_cur;
1197 		rlim.rlim_max = bsd_rlim.rlim_max;
1198 		return (copyout(&rlim, args->rlim, sizeof(rlim)));
1199 	}
1200 
1201 	if (args->resource >= LINUX_RLIM_NLIMITS)
1202 		return (EINVAL);
1203 
1204 	which = linux_to_bsd_resource[args->resource];
1205 	if (which == -1)
1206 		return (EINVAL);
1207 
1208 	lim_rlimit(td, which, &bsd_rlim);
1209 
1210 	rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1211 	rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1212 	return (copyout(&rlim, args->rlim, sizeof(rlim)));
1213 }
1214 
/*
 * sched_setscheduler(2): translate the Linux policy and, when the
 * linux_map_sched_prio knob is set, remap the Linux priority range
 * onto the FreeBSD one before applying it to the target thread.
 */
int
linux_sched_setscheduler(struct thread *td,
    struct linux_sched_setscheduler_args *args)
{
	struct sched_param sched_param;
	struct thread *tdt;
	int error, policy;

	switch (args->policy) {
	case LINUX_SCHED_OTHER:
		policy = SCHED_OTHER;
		break;
	case LINUX_SCHED_FIFO:
		policy = SCHED_FIFO;
		break;
	case LINUX_SCHED_RR:
		policy = SCHED_RR;
		break;
	default:
		return (EINVAL);
	}

	error = copyin(args->param, &sched_param, sizeof(sched_param));
	if (error)
		return (error);

	if (linux_map_sched_prio) {
		switch (policy) {
		case SCHED_OTHER:
			/* Linux requires priority 0 for SCHED_OTHER. */
			if (sched_param.sched_priority != 0)
				return (EINVAL);

			sched_param.sched_priority =
			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
			break;
		case SCHED_FIFO:
		case SCHED_RR:
			if (sched_param.sched_priority < 1 ||
			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
				return (EINVAL);

			/*
			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
			 */
			sched_param.sched_priority =
			    (sched_param.sched_priority - 1) *
			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
			    (LINUX_MAX_RT_PRIO - 1);
			break;
		}
	}

	/* linux_tdfind() returns the thread with its process locked. */
	tdt = linux_tdfind(td, args->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
	PROC_UNLOCK(tdt->td_proc);
	return (error);
}
1276 
1277 int
1278 linux_sched_getscheduler(struct thread *td,
1279     struct linux_sched_getscheduler_args *args)
1280 {
1281 	struct thread *tdt;
1282 	int error, policy;
1283 
1284 	tdt = linux_tdfind(td, args->pid, -1);
1285 	if (tdt == NULL)
1286 		return (ESRCH);
1287 
1288 	error = kern_sched_getscheduler(td, tdt, &policy);
1289 	PROC_UNLOCK(tdt->td_proc);
1290 
1291 	switch (policy) {
1292 	case SCHED_OTHER:
1293 		td->td_retval[0] = LINUX_SCHED_OTHER;
1294 		break;
1295 	case SCHED_FIFO:
1296 		td->td_retval[0] = LINUX_SCHED_FIFO;
1297 		break;
1298 	case SCHED_RR:
1299 		td->td_retval[0] = LINUX_SCHED_RR;
1300 		break;
1301 	}
1302 	return (error);
1303 }
1304 
1305 int
1306 linux_sched_get_priority_max(struct thread *td,
1307     struct linux_sched_get_priority_max_args *args)
1308 {
1309 	struct sched_get_priority_max_args bsd;
1310 
1311 	if (linux_map_sched_prio) {
1312 		switch (args->policy) {
1313 		case LINUX_SCHED_OTHER:
1314 			td->td_retval[0] = 0;
1315 			return (0);
1316 		case LINUX_SCHED_FIFO:
1317 		case LINUX_SCHED_RR:
1318 			td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1319 			return (0);
1320 		default:
1321 			return (EINVAL);
1322 		}
1323 	}
1324 
1325 	switch (args->policy) {
1326 	case LINUX_SCHED_OTHER:
1327 		bsd.policy = SCHED_OTHER;
1328 		break;
1329 	case LINUX_SCHED_FIFO:
1330 		bsd.policy = SCHED_FIFO;
1331 		break;
1332 	case LINUX_SCHED_RR:
1333 		bsd.policy = SCHED_RR;
1334 		break;
1335 	default:
1336 		return (EINVAL);
1337 	}
1338 	return (sys_sched_get_priority_max(td, &bsd));
1339 }
1340 
1341 int
1342 linux_sched_get_priority_min(struct thread *td,
1343     struct linux_sched_get_priority_min_args *args)
1344 {
1345 	struct sched_get_priority_min_args bsd;
1346 
1347 	if (linux_map_sched_prio) {
1348 		switch (args->policy) {
1349 		case LINUX_SCHED_OTHER:
1350 			td->td_retval[0] = 0;
1351 			return (0);
1352 		case LINUX_SCHED_FIFO:
1353 		case LINUX_SCHED_RR:
1354 			td->td_retval[0] = 1;
1355 			return (0);
1356 		default:
1357 			return (EINVAL);
1358 		}
1359 	}
1360 
1361 	switch (args->policy) {
1362 	case LINUX_SCHED_OTHER:
1363 		bsd.policy = SCHED_OTHER;
1364 		break;
1365 	case LINUX_SCHED_FIFO:
1366 		bsd.policy = SCHED_FIFO;
1367 		break;
1368 	case LINUX_SCHED_RR:
1369 		bsd.policy = SCHED_RR;
1370 		break;
1371 	default:
1372 		return (EINVAL);
1373 	}
1374 	return (sys_sched_get_priority_min(td, &bsd));
1375 }
1376 
/* Linux reboot(2) command and magic-number constants. */
#define REBOOT_CAD_ON	0x89abcdef
#define REBOOT_CAD_OFF	0
#define REBOOT_HALT	0xcdef0123
#define REBOOT_RESTART	0x01234567
#define REBOOT_RESTART2	0xA1B2C3D4
#define REBOOT_POWEROFF	0x4321FEDC
#define REBOOT_MAGIC1	0xfee1dead
#define REBOOT_MAGIC2	0x28121969
#define REBOOT_MAGIC2A	0x05121996
#define REBOOT_MAGIC2B	0x16041998

/*
 * reboot(2): validate the Linux magic numbers, then map the command
 * onto the native reboot(2) flags.
 */
int
linux_reboot(struct thread *td, struct linux_reboot_args *args)
{
	struct reboot_args bsd_args;

	if (args->magic1 != REBOOT_MAGIC1)
		return (EINVAL);

	switch (args->magic2) {
	case REBOOT_MAGIC2:
	case REBOOT_MAGIC2A:
	case REBOOT_MAGIC2B:
		break;
	default:
		return (EINVAL);
	}

	switch (args->cmd) {
	case REBOOT_CAD_ON:
	case REBOOT_CAD_OFF:
		/*
		 * Ctrl-Alt-Del toggling has no FreeBSD equivalent; only
		 * check the privilege, as Linux would.
		 */
		return (priv_check(td, PRIV_REBOOT));
	case REBOOT_HALT:
		bsd_args.opt = RB_HALT;
		break;
	case REBOOT_RESTART:
	case REBOOT_RESTART2:
		bsd_args.opt = 0;
		break;
	case REBOOT_POWEROFF:
		bsd_args.opt = RB_POWEROFF;
		break;
	default:
		return (EINVAL);
	}
	return (sys_reboot(td, &bsd_args));
}
1424 
1425 int
1426 linux_getpid(struct thread *td, struct linux_getpid_args *args)
1427 {
1428 
1429 	td->td_retval[0] = td->td_proc->p_pid;
1430 
1431 	return (0);
1432 }
1433 
1434 int
1435 linux_gettid(struct thread *td, struct linux_gettid_args *args)
1436 {
1437 	struct linux_emuldata *em;
1438 
1439 	em = em_find(td);
1440 	KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1441 
1442 	td->td_retval[0] = em->em_tid;
1443 
1444 	return (0);
1445 }
1446 
/* getppid(2): maps directly onto the native implementation. */
int
linux_getppid(struct thread *td, struct linux_getppid_args *args)
{

	td->td_retval[0] = kern_getppid(td);
	return (0);
}
1454 
1455 int
1456 linux_getgid(struct thread *td, struct linux_getgid_args *args)
1457 {
1458 
1459 	td->td_retval[0] = td->td_ucred->cr_rgid;
1460 	return (0);
1461 }
1462 
1463 int
1464 linux_getuid(struct thread *td, struct linux_getuid_args *args)
1465 {
1466 
1467 	td->td_retval[0] = td->td_ucred->cr_ruid;
1468 	return (0);
1469 }
1470 
/* getsid(2): maps directly onto the native implementation. */
int
linux_getsid(struct thread *td, struct linux_getsid_args *args)
{

	return (kern_getsid(td, args->pid));
}
1477 
/* Placeholder handler for unimplemented Linux system calls. */
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{

	return (ENOSYS);
}
1484 
/*
 * getpriority(2): Linux returns the nice value biased as 20 - nice,
 * so the result is always positive; glibc undoes the bias.
 */
int
linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
{
	int error;

	error = kern_getpriority(td, args->which, args->who);
	td->td_retval[0] = 20 - td->td_retval[0];
	return (error);
}
1494 
1495 int
1496 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1497 {
1498 	int name[2];
1499 
1500 	name[0] = CTL_KERN;
1501 	name[1] = KERN_HOSTNAME;
1502 	return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1503 	    args->len, 0, 0));
1504 }
1505 
1506 int
1507 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1508 {
1509 	int name[2];
1510 
1511 	name[0] = CTL_KERN;
1512 	name[1] = KERN_NISDOMAINNAME;
1513 	return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1514 	    args->len, 0, 0));
1515 }
1516 
/*
 * exit_group(2): terminate the whole process with the given exit
 * code.  exit1() does not return.
 */
int
linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
{

	LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
	    args->error_code);

	/*
	 * XXX: we should send a signal to the parent if
	 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
	 * as it doesnt occur often.
	 */
	exit1(td, args->error_code, 0);
		/* NOTREACHED */
}
1532 
/* Linux capability user-space API version magic numbers. */
#define _LINUX_CAPABILITY_VERSION_1  0x19980330
#define _LINUX_CAPABILITY_VERSION_2  0x20071026
#define _LINUX_CAPABILITY_VERSION_3  0x20080522

/* Header passed by user space to capget(2)/capset(2). */
struct l_user_cap_header {
	l_int	version;	/* one of _LINUX_CAPABILITY_VERSION_* */
	l_int	pid;		/* target pid; only 0 (self) is accepted here */
};

/* One 32-bit slice of the three capability sets. */
struct l_user_cap_data {
	l_int	effective;
	l_int	permitted;
	l_int	inheritable;
};
1547 
/*
 * capget(2): stub implementation that reports an empty capability
 * set for the current process; other pids are refused with EPERM.
 */
int
linux_capget(struct thread *td, struct linux_capget_args *uap)
{
	struct l_user_cap_header luch;
	struct l_user_cap_data lucd[2];
	int error, u32s;

	if (uap->hdrp == NULL)
		return (EFAULT);

	error = copyin(uap->hdrp, &luch, sizeof(luch));
	if (error != 0)
		return (error);

	/* Version selects how many 32-bit slices each set occupies. */
	switch (luch.version) {
	case _LINUX_CAPABILITY_VERSION_1:
		u32s = 1;
		break;
	case _LINUX_CAPABILITY_VERSION_2:
	case _LINUX_CAPABILITY_VERSION_3:
		u32s = 2;
		break;
	default:
		/*
		 * Unknown version: write back the version we support,
		 * as Linux does, then fail with EINVAL.
		 */
		luch.version = _LINUX_CAPABILITY_VERSION_1;
		error = copyout(&luch, uap->hdrp, sizeof(luch));
		if (error)
			return (error);
		return (EINVAL);
	}

	if (luch.pid)
		return (EPERM);

	if (uap->datap) {
		/*
		 * The current implementation doesn't support setting
		 * a capability (it's essentially a stub) so indicate
		 * that no capabilities are currently set or available
		 * to request.
		 */
		memset(&lucd, 0, u32s * sizeof(lucd[0]));
		error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
	}

	return (error);
}
1594 
/*
 * capset(2): stub implementation.  Accepts only requests that set no
 * capabilities at all; anything else is logged and fails with EPERM.
 */
int
linux_capset(struct thread *td, struct linux_capset_args *uap)
{
	struct l_user_cap_header luch;
	struct l_user_cap_data lucd[2];
	int error, i, u32s;

	if (uap->hdrp == NULL || uap->datap == NULL)
		return (EFAULT);

	error = copyin(uap->hdrp, &luch, sizeof(luch));
	if (error != 0)
		return (error);

	/* Version selects how many 32-bit slices each set occupies. */
	switch (luch.version) {
	case _LINUX_CAPABILITY_VERSION_1:
		u32s = 1;
		break;
	case _LINUX_CAPABILITY_VERSION_2:
	case _LINUX_CAPABILITY_VERSION_3:
		u32s = 2;
		break;
	default:
		/*
		 * Unknown version: write back the version we support,
		 * as Linux does, then fail with EINVAL.
		 */
		luch.version = _LINUX_CAPABILITY_VERSION_1;
		error = copyout(&luch, uap->hdrp, sizeof(luch));
		if (error)
			return (error);
		return (EINVAL);
	}

	if (luch.pid)
		return (EPERM);

	error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
	if (error != 0)
		return (error);

	/* We currently don't support setting any capabilities. */
	for (i = 0; i < u32s; i++) {
		if (lucd[i].effective || lucd[i].permitted ||
		    lucd[i].inheritable) {
			linux_msg(td,
			    "capset[%d] effective=0x%x, permitted=0x%x, "
			    "inheritable=0x%x is not implemented", i,
			    (int)lucd[i].effective, (int)lucd[i].permitted,
			    (int)lucd[i].inheritable);
			return (EPERM);
		}
	}

	return (0);
}
1647 
/*
 * prctl(2): per-process control operations.  Only a subset of the
 * Linux options is implemented; unsupported options log a message
 * and fail with EINVAL.
 */
int
linux_prctl(struct thread *td, struct linux_prctl_args *args)
{
	int error = 0, max_size, arg;
	struct proc *p = td->td_proc;
	char comm[LINUX_MAX_COMM_LEN];
	int pdeath_signal, trace_state;

	switch (args->option) {
	case LINUX_PR_SET_PDEATHSIG:
		if (!LINUX_SIG_VALID(args->arg2))
			return (EINVAL);
		/* Store the signal in its native (FreeBSD) numbering. */
		pdeath_signal = linux_to_bsd_signal(args->arg2);
		return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
		    &pdeath_signal));
	case LINUX_PR_GET_PDEATHSIG:
		error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
		    &pdeath_signal);
		if (error != 0)
			return (error);
		/* Convert back to the Linux numbering before copyout. */
		pdeath_signal = bsd_to_linux_signal(pdeath_signal);
		return (copyout(&pdeath_signal,
		    (void *)(register_t)args->arg2,
		    sizeof(pdeath_signal)));
	/*
	 * In Linux, this flag controls if set[gu]id processes can coredump.
	 * There are additional semantics imposed on processes that cannot
	 * coredump:
	 * - Such processes can not be ptraced.
	 * - There are some semantics around ownership of process-related files
	 *   in the /proc namespace.
	 *
	 * In FreeBSD, we can (and by default, do) disable setuid coredump
	 * system-wide with 'sugid_coredump.'  We control tracability on a
	 * per-process basis with the procctl PROC_TRACE (=> P2_NOTRACE flag).
	 * By happy coincidence, P2_NOTRACE also prevents coredumping.  So the
	 * procctl is roughly analogous to Linux's DUMPABLE.
	 *
	 * So, proxy these knobs to the corresponding PROC_TRACE setting.
	 */
	case LINUX_PR_GET_DUMPABLE:
		error = kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_STATUS,
		    &trace_state);
		if (error != 0)
			return (error);
		td->td_retval[0] = (trace_state != -1);
		return (0);
	case LINUX_PR_SET_DUMPABLE:
		/*
		 * It is only valid for userspace to set one of these two
		 * flags, and only one at a time.
		 */
		switch (args->arg2) {
		case LINUX_SUID_DUMP_DISABLE:
			trace_state = PROC_TRACE_CTL_DISABLE_EXEC;
			break;
		case LINUX_SUID_DUMP_USER:
			trace_state = PROC_TRACE_CTL_ENABLE;
			break;
		default:
			return (EINVAL);
		}
		return (kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_CTL,
		    &trace_state));
	case LINUX_PR_GET_KEEPCAPS:
		/*
		 * Indicate that we always clear the effective and
		 * permitted capability sets when the user id becomes
		 * non-zero (actually the capability sets are simply
		 * always zero in the current implementation).
		 */
		td->td_retval[0] = 0;
		break;
	case LINUX_PR_SET_KEEPCAPS:
		/*
		 * Ignore requests to keep the effective and permitted
		 * capability sets when the user id becomes non-zero.
		 */
		break;
	case LINUX_PR_SET_NAME:
		/*
		 * To be on the safe side we need to make sure to not
		 * overflow the size a Linux program expects. We already
		 * do this here in the copyin, so that we don't need to
		 * check on copyout.
		 */
		max_size = MIN(sizeof(comm), sizeof(p->p_comm));
		error = copyinstr((void *)(register_t)args->arg2, comm,
		    max_size, NULL);

		/* Linux silently truncates the name if it is too long. */
		if (error == ENAMETOOLONG) {
			/*
			 * XXX: copyinstr() isn't documented to populate the
			 * array completely, so do a copyin() to be on the
			 * safe side. This should be changed in case
			 * copyinstr() is changed to guarantee this.
			 */
			error = copyin((void *)(register_t)args->arg2, comm,
			    max_size - 1);
			comm[max_size - 1] = '\0';
		}
		if (error)
			return (error);

		PROC_LOCK(p);
		strlcpy(p->p_comm, comm, sizeof(p->p_comm));
		PROC_UNLOCK(p);
		break;
	case LINUX_PR_GET_NAME:
		PROC_LOCK(p);
		strlcpy(comm, p->p_comm, sizeof(comm));
		PROC_UNLOCK(p);
		error = copyout(comm, (void *)(register_t)args->arg2,
		    strlen(comm) + 1);
		break;
	case LINUX_PR_GET_SECCOMP:
	case LINUX_PR_SET_SECCOMP:
		/*
		 * Same as returned by Linux without CONFIG_SECCOMP enabled.
		 */
		error = EINVAL;
		break;
	case LINUX_PR_CAPBSET_READ:
#if 0
		/*
		 * This makes too much noise with Ubuntu Focal.
		 */
		linux_msg(td, "unsupported prctl PR_CAPBSET_READ %d",
		    (int)args->arg2);
#endif
		error = EINVAL;
		break;
	case LINUX_PR_SET_NO_NEW_PRIVS:
		arg = args->arg2 == 1 ?
		    PROC_NO_NEW_PRIVS_ENABLE : PROC_NO_NEW_PRIVS_DISABLE;
		error = kern_procctl(td, P_PID, p->p_pid,
		    PROC_NO_NEW_PRIVS_CTL, &arg);
		break;
	case LINUX_PR_SET_PTRACER:
		linux_msg(td, "unsupported prctl PR_SET_PTRACER");
		error = EINVAL;
		break;
	default:
		linux_msg(td, "unsupported prctl option %d", args->option);
		error = EINVAL;
		break;
	}

	return (error);
}
1799 
/*
 * sched_setparam(2): apply new scheduling parameters to the target
 * thread, remapping the Linux priority range when the
 * linux_map_sched_prio knob is set.
 */
int
linux_sched_setparam(struct thread *td,
    struct linux_sched_setparam_args *uap)
{
	struct sched_param sched_param;
	struct thread *tdt;
	int error, policy;

	error = copyin(uap->param, &sched_param, sizeof(sched_param));
	if (error)
		return (error);

	/* linux_tdfind() returns the thread with its process locked. */
	tdt = linux_tdfind(td, uap->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	if (linux_map_sched_prio) {
		/* The valid priority range depends on the current policy. */
		error = kern_sched_getscheduler(td, tdt, &policy);
		if (error)
			goto out;

		switch (policy) {
		case SCHED_OTHER:
			if (sched_param.sched_priority != 0) {
				error = EINVAL;
				goto out;
			}
			sched_param.sched_priority =
			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
			break;
		case SCHED_FIFO:
		case SCHED_RR:
			if (sched_param.sched_priority < 1 ||
			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
			 */
			sched_param.sched_priority =
			    (sched_param.sched_priority - 1) *
			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
			    (LINUX_MAX_RT_PRIO - 1);
			break;
		}
	}

	error = kern_sched_setparam(td, tdt, &sched_param);
out:	PROC_UNLOCK(tdt->td_proc);
	return (error);
}
1853 
/*
 * sched_getparam(2): fetch the scheduling parameters of the target
 * thread, converting the priority back to the Linux range when the
 * linux_map_sched_prio knob is set.
 */
int
linux_sched_getparam(struct thread *td,
    struct linux_sched_getparam_args *uap)
{
	struct sched_param sched_param;
	struct thread *tdt;
	int error, policy;

	/* linux_tdfind() returns the thread with its process locked. */
	tdt = linux_tdfind(td, uap->pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	error = kern_sched_getparam(td, tdt, &sched_param);
	if (error) {
		PROC_UNLOCK(tdt->td_proc);
		return (error);
	}

	if (linux_map_sched_prio) {
		/* The reverse mapping depends on the current policy. */
		error = kern_sched_getscheduler(td, tdt, &policy);
		PROC_UNLOCK(tdt->td_proc);
		if (error)
			return (error);

		switch (policy) {
		case SCHED_OTHER:
			sched_param.sched_priority = 0;
			break;
		case SCHED_FIFO:
		case SCHED_RR:
			/*
			 * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
			 * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
			 */
			sched_param.sched_priority =
			    (sched_param.sched_priority *
			    (LINUX_MAX_RT_PRIO - 1) +
			    (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
			    (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
			break;
		}
	} else
		PROC_UNLOCK(tdt->td_proc);

	error = copyout(&sched_param, uap->param, sizeof(sched_param));
	return (error);
}
1901 
1902 /*
1903  * Get affinity of a process.
1904  */
1905 int
1906 linux_sched_getaffinity(struct thread *td,
1907     struct linux_sched_getaffinity_args *args)
1908 {
1909 	struct thread *tdt;
1910 	cpuset_t *mask;
1911 	size_t size;
1912 	int error;
1913 	id_t tid;
1914 
1915 	tdt = linux_tdfind(td, args->pid, -1);
1916 	if (tdt == NULL)
1917 		return (ESRCH);
1918 	tid = tdt->td_tid;
1919 	PROC_UNLOCK(tdt->td_proc);
1920 
1921 	mask = malloc(sizeof(cpuset_t), M_LINUX, M_WAITOK | M_ZERO);
1922 	size = min(args->len, sizeof(cpuset_t));
1923 	error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
1924 	    tid, size, mask);
1925 	if (error == ERANGE)
1926 		error = EINVAL;
1927  	if (error == 0)
1928 		error = copyout(mask, args->user_mask_ptr, size);
1929 	if (error == 0)
1930 		td->td_retval[0] = size;
1931 	free(mask, M_LINUX);
1932 	return (error);
1933 }
1934 
1935 /*
1936  *  Set affinity of a process.
1937  */
1938 int
1939 linux_sched_setaffinity(struct thread *td,
1940     struct linux_sched_setaffinity_args *args)
1941 {
1942 	struct thread *tdt;
1943 	cpuset_t *mask;
1944 	int cpu, error;
1945 	size_t len;
1946 	id_t tid;
1947 
1948 	tdt = linux_tdfind(td, args->pid, -1);
1949 	if (tdt == NULL)
1950 		return (ESRCH);
1951 	tid = tdt->td_tid;
1952 	PROC_UNLOCK(tdt->td_proc);
1953 
1954 	len = min(args->len, sizeof(cpuset_t));
1955 	mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO);;
1956 	error = copyin(args->user_mask_ptr, mask, len);
1957 	if (error != 0)
1958 		goto out;
1959 	/* Linux ignore high bits */
1960 	CPU_FOREACH_ISSET(cpu, mask)
1961 		if (cpu > mp_maxid)
1962 			CPU_CLR(cpu, mask);
1963 
1964 	error = kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
1965 	    tid, mask);
1966 	if (error == EDEADLK)
1967 		error = EINVAL;
1968 out:
1969 	free(mask, M_TEMP);
1970 	return (error);
1971 }
1972 
/* 64-bit resource limit as exchanged by prlimit64(2). */
struct linux_rlimit64 {
	uint64_t	rlim_cur;	/* soft limit */
	uint64_t	rlim_max;	/* hard limit */
};
1977 
1978 int
1979 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
1980 {
1981 	struct rlimit rlim, nrlim;
1982 	struct linux_rlimit64 lrlim;
1983 	struct proc *p;
1984 	u_int which;
1985 	int flags;
1986 	int error;
1987 
1988 	if (args->new == NULL && args->old != NULL) {
1989 		if (linux_get_dummy_limit(args->resource, &rlim)) {
1990 			lrlim.rlim_cur = rlim.rlim_cur;
1991 			lrlim.rlim_max = rlim.rlim_max;
1992 			return (copyout(&lrlim, args->old, sizeof(lrlim)));
1993 		}
1994 	}
1995 
1996 	if (args->resource >= LINUX_RLIM_NLIMITS)
1997 		return (EINVAL);
1998 
1999 	which = linux_to_bsd_resource[args->resource];
2000 	if (which == -1)
2001 		return (EINVAL);
2002 
2003 	if (args->new != NULL) {
2004 		/*
2005 		 * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2006 		 * rlim is unsigned 64-bit. FreeBSD treats negative limits
2007 		 * as INFINITY so we do not need a conversion even.
2008 		 */
2009 		error = copyin(args->new, &nrlim, sizeof(nrlim));
2010 		if (error != 0)
2011 			return (error);
2012 	}
2013 
2014 	flags = PGET_HOLD | PGET_NOTWEXIT;
2015 	if (args->new != NULL)
2016 		flags |= PGET_CANDEBUG;
2017 	else
2018 		flags |= PGET_CANSEE;
2019 	if (args->pid == 0) {
2020 		p = td->td_proc;
2021 		PHOLD(p);
2022 	} else {
2023 		error = pget(args->pid, flags, &p);
2024 		if (error != 0)
2025 			return (error);
2026 	}
2027 	if (args->old != NULL) {
2028 		PROC_LOCK(p);
2029 		lim_rlimit_proc(p, which, &rlim);
2030 		PROC_UNLOCK(p);
2031 		if (rlim.rlim_cur == RLIM_INFINITY)
2032 			lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2033 		else
2034 			lrlim.rlim_cur = rlim.rlim_cur;
2035 		if (rlim.rlim_max == RLIM_INFINITY)
2036 			lrlim.rlim_max = LINUX_RLIM_INFINITY;
2037 		else
2038 			lrlim.rlim_max = rlim.rlim_max;
2039 		error = copyout(&lrlim, args->old, sizeof(lrlim));
2040 		if (error != 0)
2041 			goto out;
2042 	}
2043 
2044 	if (args->new != NULL)
2045 		error = kern_proc_setrlimit(td, p, which, &nrlim);
2046 
2047  out:
2048 	PRELE(p);
2049 	return (error);
2050 }
2051 
2052 int
2053 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
2054 {
2055 	struct timespec ts, *tsp;
2056 	int error;
2057 
2058 	if (args->tsp != NULL) {
2059 		error = linux_get_timespec(&ts, args->tsp);
2060 		if (error != 0)
2061 			return (error);
2062 		tsp = &ts;
2063 	} else
2064 		tsp = NULL;
2065 
2066 	error = linux_common_pselect6(td, args->nfds, args->readfds,
2067 	    args->writefds, args->exceptfds, tsp, args->sig);
2068 
2069 	if (args->tsp != NULL)
2070 		linux_put_timespec(&ts, args->tsp);
2071 	return (error);
2072 }
2073 
/*
 * Common code for pselect6(2) and its time64 variant: copy in the
 * optional signal-mask argument, run the native pselect, and update
 * *tsp with the time remaining.
 */
static int
linux_common_pselect6(struct thread *td, l_int nfds, l_fd_set *readfds,
    l_fd_set *writefds, l_fd_set *exceptfds, struct timespec *tsp,
    l_uintptr_t *sig)
{
	struct timeval utv, tv0, tv1, *tvp;
	struct l_pselect6arg lpse6;
	sigset_t *ssp;
	sigset_t ss;
	int error;

	ssp = NULL;
	if (sig != NULL) {
		/* 'sig' points at a (sigset pointer, size) pair. */
		error = copyin(sig, &lpse6, sizeof(lpse6));
		if (error != 0)
			return (error);
		error = linux_copyin_sigset(td, PTRIN(lpse6.ss),
		    lpse6.ss_len, &ss, &ssp);
		if (error != 0)
		    return (error);
	} else
		ssp = NULL;	/* already NULL; kept for symmetry */

	/*
	 * Currently glibc changes nanosecond number to microsecond.
	 * This mean losing precision but for now it is hardly seen.
	 */
	if (tsp != NULL) {
		TIMESPEC_TO_TIMEVAL(&utv, tsp);
		if (itimerfix(&utv))
			return (EINVAL);

		microtime(&tv0);
		tvp = &utv;
	} else
		tvp = NULL;

	error = kern_pselect(td, nfds, readfds, writefds,
	    exceptfds, tvp, ssp, LINUX_NFDBITS);

	if (tsp != NULL) {
		/*
		 * Compute how much time was left of the timeout,
		 * by subtracting the current time and the time
		 * before we started the call, and subtracting
		 * that result from the user-supplied value.
		 */
		microtime(&tv1);
		timevalsub(&tv1, &tv0);
		timevalsub(&utv, &tv1);
		if (utv.tv_sec < 0)
			timevalclear(&utv);
		TIMEVAL_TO_TIMESPEC(&utv, tsp);
	}
	return (error);
}
2130 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * pselect6_time64(2): like pselect6(2) but with a 64-bit timespec;
 * only needed on ABIs whose native time types are 32-bit.
 */
int
linux_pselect6_time64(struct thread *td,
    struct linux_pselect6_time64_args *args)
{
	struct timespec ts, *tsp;
	int error;

	if (args->tsp != NULL) {
		error = linux_get_timespec64(&ts, args->tsp);
		if (error != 0)
			return (error);
		tsp = &ts;
	} else
		tsp = NULL;

	error = linux_common_pselect6(td, args->nfds, args->readfds,
	    args->writefds, args->exceptfds, tsp, args->sig);

	/* Remaining time is written back regardless of the result. */
	if (args->tsp != NULL)
		linux_put_timespec64(&ts, args->tsp);
	return (error);
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2155 
2156 int
2157 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
2158 {
2159 	struct timespec uts, *tsp;
2160 	int error;
2161 
2162 	if (args->tsp != NULL) {
2163 		error = linux_get_timespec(&uts, args->tsp);
2164 		if (error != 0)
2165 			return (error);
2166 		tsp = &uts;
2167 	} else
2168 		tsp = NULL;
2169 
2170 	error = linux_common_ppoll(td, args->fds, args->nfds, tsp,
2171 	    args->sset, args->ssize);
2172 	if (error == 0 && args->tsp != NULL)
2173 		error = linux_put_timespec(&uts, args->tsp);
2174 	return (error);
2175 }
2176 
2177 static int
2178 linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds,
2179     struct timespec *tsp, l_sigset_t *sset, l_size_t ssize)
2180 {
2181 	struct timespec ts0, ts1;
2182 	struct pollfd stackfds[32];
2183 	struct pollfd *kfds;
2184  	sigset_t *ssp;
2185  	sigset_t ss;
2186  	int error;
2187 
2188 	if (kern_poll_maxfds(nfds))
2189 		return (EINVAL);
2190 	if (sset != NULL) {
2191 		error = linux_copyin_sigset(td, sset, ssize, &ss, &ssp);
2192 		if (error != 0)
2193 		    return (error);
2194 	} else
2195 		ssp = NULL;
2196 	if (tsp != NULL)
2197 		nanotime(&ts0);
2198 
2199 	if (nfds > nitems(stackfds))
2200 		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
2201 	else
2202 		kfds = stackfds;
2203 	error = linux_pollin(td, kfds, fds, nfds);
2204 	if (error != 0)
2205 		goto out;
2206 
2207 	error = kern_poll_kfds(td, kfds, nfds, tsp, ssp);
2208 	if (error == 0)
2209 		error = linux_pollout(td, kfds, fds, nfds);
2210 
2211 	if (error == 0 && tsp != NULL) {
2212 		if (td->td_retval[0]) {
2213 			nanotime(&ts1);
2214 			timespecsub(&ts1, &ts0, &ts1);
2215 			timespecsub(tsp, &ts1, tsp);
2216 			if (tsp->tv_sec < 0)
2217 				timespecclear(tsp);
2218 		} else
2219 			timespecclear(tsp);
2220 	}
2221 
2222 out:
2223 	if (nfds > nitems(stackfds))
2224 		free(kfds, M_TEMP);
2225 	return (error);
2226 }
2227 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * ppoll_time64(2): like ppoll(2) but with a 64-bit timespec; only
 * needed on ABIs whose native time types are 32-bit.
 */
int
linux_ppoll_time64(struct thread *td, struct linux_ppoll_time64_args *args)
{
	struct timespec uts, *tsp;
	int error;

	tsp = NULL;
	if (args->tsp != NULL) {
		error = linux_get_timespec64(&uts, args->tsp);
		if (error != 0)
			return (error);
		tsp = &uts;
	}

	error = linux_common_ppoll(td, args->fds, args->nfds, tsp,
	    args->sset, args->ssize);
	if (error == 0 && args->tsp != NULL)
		error = linux_put_timespec64(&uts, args->tsp);
	return (error);
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2249 
2250 static int
2251 linux_pollin(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2252 {
2253 	int error;
2254 	u_int i;
2255 
2256 	error = copyin(ufds, fds, nfd * sizeof(*fds));
2257 	if (error != 0)
2258 		return (error);
2259 
2260 	for (i = 0; i < nfd; i++) {
2261 		if (fds->events != 0)
2262 			linux_to_bsd_poll_events(td, fds->fd,
2263 			    fds->events, &fds->events);
2264 		fds++;
2265 	}
2266 	return (0);
2267 }
2268 
2269 static int
2270 linux_pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2271 {
2272 	int error = 0;
2273 	u_int i, n = 0;
2274 
2275 	for (i = 0; i < nfd; i++) {
2276 		if (fds->revents != 0) {
2277 			bsd_to_linux_poll_events(fds->revents,
2278 			    &fds->revents);
2279 			n++;
2280 		}
2281 		error = copyout(&fds->revents, &ufds->revents,
2282 		    sizeof(ufds->revents));
2283 		if (error)
2284 			return (error);
2285 		fds++;
2286 		ufds++;
2287 	}
2288 	td->td_retval[0] = n;
2289 	return (0);
2290 }
2291 
/*
 * Common code for sched_rr_get_interval() and its time64 variant:
 * resolve the Linux pid/tid into a thread and fetch that thread's
 * round-robin quantum into *ts.
 */
static int
linux_sched_rr_get_interval_common(struct thread *td, pid_t pid,
    struct timespec *ts)
{
	struct thread *tdt;
	int error;

	/*
	 * According to man in case the invalid pid specified
	 * EINVAL should be returned.
	 */
	if (pid < 0)
		return (EINVAL);

	/* linux_tdfind() returns with the thread's process locked. */
	tdt = linux_tdfind(td, pid, -1);
	if (tdt == NULL)
		return (ESRCH);

	error = kern_sched_rr_get_interval_td(td, tdt, ts);
	PROC_UNLOCK(tdt->td_proc);
	return (error);
}
2314 
2315 int
2316 linux_sched_rr_get_interval(struct thread *td,
2317     struct linux_sched_rr_get_interval_args *uap)
2318 {
2319 	struct timespec ts;
2320 	int error;
2321 
2322 	error = linux_sched_rr_get_interval_common(td, uap->pid, &ts);
2323 	if (error != 0)
2324 		return (error);
2325 	return (linux_put_timespec(&ts, uap->interval));
2326 }
2327 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
int
linux_sched_rr_get_interval_time64(struct thread *td,
    struct linux_sched_rr_get_interval_time64_args *uap)
{
	struct timespec ts;
	int error;

	/* Fetch the quantum, then copy it out in the 64-bit time layout. */
	error = linux_sched_rr_get_interval_common(td, uap->pid, &ts);
	if (error == 0)
		error = linux_put_timespec64(&ts, uap->interval);
	return (error);
}
#endif
2342 
/*
 * In case when the Linux thread is the initial thread in
 * the thread group thread id is equal to the process id.
 * Glibc depends on this magic (assert in pthread_getattr_np.c).
 *
 * On success the thread is returned with its process locked;
 * the caller is responsible for PROC_UNLOCK().
 */
struct thread *
linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
{
	struct linux_emuldata *em;
	struct thread *tdt;
	struct proc *p;

	tdt = NULL;
	if (tid == 0 || tid == td->td_tid) {
		/* The caller itself; verify it matches pid when one is given. */
		if (pid != -1 && td->td_proc->p_pid != pid)
			return (NULL);
		PROC_LOCK(td->td_proc);
		return (td);
	} else if (tid > PID_MAX)
		/* Non-initial threads carry tids above PID_MAX. */
		return (tdfind(tid, pid));

	/*
	 * Initial thread where the tid equal to the pid.
	 */
	p = pfind(tid);
	if (p != NULL) {
		if (SV_PROC_ABI(p) != SV_ABI_LINUX ||
		    (pid != -1 && tid != pid)) {
			/*
			 * p is not a Linuxulator process.
			 */
			PROC_UNLOCK(p);
			return (NULL);
		}
		/* Process lock stays held when a matching thread is returned. */
		FOREACH_THREAD_IN_PROC(p, tdt) {
			em = em_find(tdt);
			/*
			 * NOTE(review): em is dereferenced unchecked —
			 * presumably em_find() never returns NULL for a
			 * SV_ABI_LINUX process; confirm.
			 */
			if (tid == em->em_tid)
				return (tdt);
		}
		PROC_UNLOCK(p);
	}
	return (NULL);
}
2386 
2387 void
2388 linux_to_bsd_waitopts(int options, int *bsdopts)
2389 {
2390 
2391 	if (options & LINUX_WNOHANG)
2392 		*bsdopts |= WNOHANG;
2393 	if (options & LINUX_WUNTRACED)
2394 		*bsdopts |= WUNTRACED;
2395 	if (options & LINUX_WEXITED)
2396 		*bsdopts |= WEXITED;
2397 	if (options & LINUX_WCONTINUED)
2398 		*bsdopts |= WCONTINUED;
2399 	if (options & LINUX_WNOWAIT)
2400 		*bsdopts |= WNOWAIT;
2401 
2402 	if (options & __WCLONE)
2403 		*bsdopts |= WLINUXCLONE;
2404 }
2405 
2406 int
2407 linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2408 {
2409 	struct uio uio;
2410 	struct iovec iov;
2411 	int error;
2412 
2413 	if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2414 		return (EINVAL);
2415 	if (args->count > INT_MAX)
2416 		args->count = INT_MAX;
2417 
2418 	iov.iov_base = args->buf;
2419 	iov.iov_len = args->count;
2420 
2421 	uio.uio_iov = &iov;
2422 	uio.uio_iovcnt = 1;
2423 	uio.uio_resid = iov.iov_len;
2424 	uio.uio_segflg = UIO_USERSPACE;
2425 	uio.uio_rw = UIO_READ;
2426 	uio.uio_td = td;
2427 
2428 	error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2429 	if (error == 0)
2430 		td->td_retval[0] = args->count - uio.uio_resid;
2431 	return (error);
2432 }
2433 
2434 int
2435 linux_mincore(struct thread *td, struct linux_mincore_args *args)
2436 {
2437 
2438 	/* Needs to be page-aligned */
2439 	if (args->start & PAGE_MASK)
2440 		return (EINVAL);
2441 	return (kern_mincore(td, args->start, args->len, args->vec));
2442 }
2443 
#define	SYSLOG_TAG	"<6>"

/*
 * Linux syslog(2) kernel-log interface.  Only the READ_ALL action is
 * implemented: copy the kernel message buffer to userspace, prefixing
 * each line with a "<6>" priority tag as Linux callers expect.
 * Returns the number of bytes written via td_retval[0].
 */
int
linux_syslog(struct thread *td, struct linux_syslog_args *args)
{
	char buf[128], *src, *dst;
	u_int seq;
	int buflen, error;

	if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
		linux_msg(td, "syslog unsupported type 0x%x", args->type);
		return (EINVAL);
	}

	/* Buffer too small even for one tag plus data; report nothing read. */
	if (args->len < 6) {
		td->td_retval[0] = 0;
		return (0);
	}

	error = priv_check(td, PRIV_MSGBUF);
	if (error)
		return (error);

	/* Prime the sequence number used by the peeks below. */
	mtx_lock(&msgbuf_lock);
	msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
	mtx_unlock(&msgbuf_lock);

	dst = args->buf;
	error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
	/* The -1 is to skip the trailing '\0'. */
	dst += sizeof(SYSLOG_TAG) - 1;

	/* Drain the message buffer in 128-byte chunks until empty or error. */
	while (error == 0) {
		mtx_lock(&msgbuf_lock);
		buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
		mtx_unlock(&msgbuf_lock);

		if (buflen == 0)
			break;

		for (src = buf; src < buf + buflen && error == 0; src++) {
			/* Skip NULs embedded in the message buffer. */
			if (*src == '\0')
				continue;

			/* Stop once the user buffer is full. */
			if (dst >= args->buf + args->len)
				goto out;

			error = copyout(src, dst, 1);
			dst++;

			/*
			 * After a newline, emit a fresh tag unless the next
			 * byte already starts one.
			 * NOTE(review): when src is the last valid byte,
			 * *(src + 1) reads one past the peeked data —
			 * confirm this is intentional/safe.
			 */
			if (*src == '\n' && *(src + 1) != '<' &&
			    dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
				error = copyout(&SYSLOG_TAG,
				    dst, sizeof(SYSLOG_TAG));
				dst += sizeof(SYSLOG_TAG) - 1;
			}
		}
	}
out:
	td->td_retval[0] = dst - args->buf;
	return (error);
}
2506 
2507 int
2508 linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2509 {
2510 	int cpu, error, node;
2511 
2512 	cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2513 	error = 0;
2514 	node = cpuid_to_pcpu[cpu]->pc_domain;
2515 
2516 	if (args->cpu != NULL)
2517 		error = copyout(&cpu, args->cpu, sizeof(l_int));
2518 	if (args->node != NULL)
2519 		error = copyout(&node, args->node, sizeof(l_int));
2520 	return (error);
2521 }
2522 
2523 #if defined(__i386__) || defined(__amd64__)
2524 int
2525 linux_poll(struct thread *td, struct linux_poll_args *args)
2526 {
2527 	struct timespec ts, *tsp;
2528 
2529 	if (args->timeout != INFTIM) {
2530 		if (args->timeout < 0)
2531 			return (EINVAL);
2532 		ts.tv_sec = args->timeout / 1000;
2533 		ts.tv_nsec = (args->timeout % 1000) * 1000000;
2534 		tsp = &ts;
2535 	} else
2536 		tsp = NULL;
2537 
2538 	return (linux_common_ppoll(td, args->fds, args->nfds,
2539 	    tsp, NULL, 0));
2540 }
2541 #endif /* __i386__ || __amd64__ */
2542 
2543 int
2544 linux_seccomp(struct thread *td, struct linux_seccomp_args *args)
2545 {
2546 
2547 	switch (args->op) {
2548 	case LINUX_SECCOMP_GET_ACTION_AVAIL:
2549 		return (EOPNOTSUPP);
2550 	default:
2551 		/*
2552 		 * Ignore unknown operations, just like Linux kernel built
2553 		 * without CONFIG_SECCOMP.
2554 		 */
2555 		return (EINVAL);
2556 	}
2557 }
2558 
2559 /*
2560  * Custom version of exec_copyin_args(), to copy out argument and environment
2561  * strings from the old process address space into the temporary string buffer.
2562  * Based on freebsd32_exec_copyin_args.
2563  */
2564 static int
2565 linux_exec_copyin_args(struct image_args *args, const char *fname,
2566     enum uio_seg segflg, l_uintptr_t *argv, l_uintptr_t *envv)
2567 {
2568 	char *argp, *envp;
2569 	l_uintptr_t *ptr, arg;
2570 	int error;
2571 
2572 	bzero(args, sizeof(*args));
2573 	if (argv == NULL)
2574 		return (EFAULT);
2575 
2576 	/*
2577 	 * Allocate demand-paged memory for the file name, argument, and
2578 	 * environment strings.
2579 	 */
2580 	error = exec_alloc_args(args);
2581 	if (error != 0)
2582 		return (error);
2583 
2584 	/*
2585 	 * Copy the file name.
2586 	 */
2587 	error = exec_args_add_fname(args, fname, segflg);
2588 	if (error != 0)
2589 		goto err_exit;
2590 
2591 	/*
2592 	 * extract arguments first
2593 	 */
2594 	ptr = argv;
2595 	for (;;) {
2596 		error = copyin(ptr++, &arg, sizeof(arg));
2597 		if (error)
2598 			goto err_exit;
2599 		if (arg == 0)
2600 			break;
2601 		argp = PTRIN(arg);
2602 		error = exec_args_add_arg(args, argp, UIO_USERSPACE);
2603 		if (error != 0)
2604 			goto err_exit;
2605 	}
2606 
2607 	/*
2608 	 * This comment is from Linux do_execveat_common:
2609 	 * When argv is empty, add an empty string ("") as argv[0] to
2610 	 * ensure confused userspace programs that start processing
2611 	 * from argv[1] won't end up walking envp.
2612 	 */
2613 	if (args->argc == 0 &&
2614 	    (error = exec_args_add_arg(args, "", UIO_SYSSPACE) != 0))
2615 		goto err_exit;
2616 
2617 	/*
2618 	 * extract environment strings
2619 	 */
2620 	if (envv) {
2621 		ptr = envv;
2622 		for (;;) {
2623 			error = copyin(ptr++, &arg, sizeof(arg));
2624 			if (error)
2625 				goto err_exit;
2626 			if (arg == 0)
2627 				break;
2628 			envp = PTRIN(arg);
2629 			error = exec_args_add_env(args, envp, UIO_USERSPACE);
2630 			if (error != 0)
2631 				goto err_exit;
2632 		}
2633 	}
2634 
2635 	return (0);
2636 
2637 err_exit:
2638 	exec_free_args(args);
2639 	return (error);
2640 }
2641 
/*
 * Linux execve(2): copy in path/argv/envp and hand off to the common
 * Linuxulator exec path.
 */
int
linux_execve(struct thread *td, struct linux_execve_args *args)
{
	struct image_args eargs;
	int error;

	LINUX_CTR(execve);

	error = linux_exec_copyin_args(&eargs, args->path, UIO_USERSPACE,
	    args->argp, args->envp);
	if (error == 0)
		error = linux_common_execve(td, &eargs);
	/* A successful exec reports EJUSTRETURN; audit it as 0 (success). */
	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
	return (error);
}
2657 
2658 static void
2659 linux_up_rtprio_if(struct thread *td1, struct rtprio *rtp)
2660 {
2661 	struct rtprio rtp2;
2662 
2663 	pri_to_rtp(td1, &rtp2);
2664 	if (rtp2.type <  rtp->type ||
2665 	    (rtp2.type == rtp->type &&
2666 	    rtp2.prio < rtp->prio)) {
2667 		rtp->type = rtp2.type;
2668 		rtp->prio = rtp2.prio;
2669 	}
2670 }
2671 
2672 #define	LINUX_PRIO_DIVIDER	RTP_PRIO_MAX / LINUX_IOPRIO_MAX
2673 
2674 static int
2675 linux_rtprio2ioprio(struct rtprio *rtp)
2676 {
2677 	int ioprio, prio;
2678 
2679 	switch (rtp->type) {
2680 	case RTP_PRIO_IDLE:
2681 		prio = RTP_PRIO_MIN;
2682 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_IDLE, prio);
2683 		break;
2684 	case RTP_PRIO_NORMAL:
2685 		prio = rtp->prio / LINUX_PRIO_DIVIDER;
2686 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_BE, prio);
2687 		break;
2688 	case RTP_PRIO_REALTIME:
2689 		prio = rtp->prio / LINUX_PRIO_DIVIDER;
2690 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_RT, prio);
2691 		break;
2692 	default:
2693 		prio = RTP_PRIO_MIN;
2694 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_NONE, prio);
2695 		break;
2696 	}
2697 	return (ioprio);
2698 }
2699 
2700 static int
2701 linux_ioprio2rtprio(int ioprio, struct rtprio *rtp)
2702 {
2703 
2704 	switch (LINUX_IOPRIO_PRIO_CLASS(ioprio)) {
2705 	case LINUX_IOPRIO_CLASS_IDLE:
2706 		rtp->prio = RTP_PRIO_MIN;
2707 		rtp->type = RTP_PRIO_IDLE;
2708 		break;
2709 	case LINUX_IOPRIO_CLASS_BE:
2710 		rtp->prio = LINUX_IOPRIO_PRIO_DATA(ioprio) * LINUX_PRIO_DIVIDER;
2711 		rtp->type = RTP_PRIO_NORMAL;
2712 		break;
2713 	case LINUX_IOPRIO_CLASS_RT:
2714 		rtp->prio = LINUX_IOPRIO_PRIO_DATA(ioprio) * LINUX_PRIO_DIVIDER;
2715 		rtp->type = RTP_PRIO_REALTIME;
2716 		break;
2717 	default:
2718 		return (EINVAL);
2719 	}
2720 	return (0);
2721 }
2722 #undef LINUX_PRIO_DIVIDER
2723 
/*
 * Linux ioprio_get(2): report the strongest I/O priority (derived from
 * the FreeBSD rtprio of the threads involved) for a process, process
 * group, or user, selected by args->which/args->who.
 */
int
linux_ioprio_get(struct thread *td, struct linux_ioprio_get_args *args)
{
	struct thread *td1;
	struct rtprio rtp;
	struct pgrp *pg;
	struct proc *p;
	int error, found;

	p = NULL;
	td1 = NULL;
	error = 0;
	found = 0;
	/* Start from the weakest priority; linux_up_rtprio_if() raises it. */
	rtp.type = RTP_PRIO_IDLE;
	rtp.prio = RTP_PRIO_MAX;
	switch (args->which) {
	case LINUX_IOPRIO_WHO_PROCESS:
		if (args->who == 0) {
			/* who == 0 means the calling thread/process. */
			td1 = td;
			p = td1->td_proc;
			PROC_LOCK(p);
		} else if (args->who > PID_MAX) {
			/* Values above PID_MAX denote a Linux tid. */
			td1 = linux_tdfind(td, args->who, -1);
			if (td1 != NULL)
				p = td1->td_proc;
		} else
			p = pfind(args->who);
		/* Both linux_tdfind() and pfind() return p locked. */
		if (p == NULL)
			return (ESRCH);
		if ((error = p_cansee(td, p))) {
			PROC_UNLOCK(p);
			break;
		}
		if (td1 != NULL) {
			/* A specific thread was named; use its priority. */
			pri_to_rtp(td1, &rtp);
		} else {
			FOREACH_THREAD_IN_PROC(p, td1) {
				linux_up_rtprio_if(td1, &rtp);
			}
		}
		found++;
		PROC_UNLOCK(p);
		break;
	case LINUX_IOPRIO_WHO_PGRP:
		sx_slock(&proctree_lock);
		if (args->who == 0) {
			pg = td->td_proc->p_pgrp;
			PGRP_LOCK(pg);
		} else {
			/* pgfind() returns the group locked. */
			pg = pgfind(args->who);
			if (pg == NULL) {
				sx_sunlock(&proctree_lock);
				error = ESRCH;
				break;
			}
		}
		sx_sunlock(&proctree_lock);
		/* Aggregate over every visible member of the group. */
		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
			PROC_LOCK(p);
			if (p->p_state == PRS_NORMAL &&
			    p_cansee(td, p) == 0) {
				FOREACH_THREAD_IN_PROC(p, td1) {
					linux_up_rtprio_if(td1, &rtp);
					found++;
				}
			}
			PROC_UNLOCK(p);
		}
		PGRP_UNLOCK(pg);
		break;
	case LINUX_IOPRIO_WHO_USER:
		/* who == 0 means the caller's own credential uid. */
		if (args->who == 0)
			args->who = td->td_ucred->cr_uid;
		sx_slock(&allproc_lock);
		/* Aggregate over every visible process owned by that uid. */
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state == PRS_NORMAL &&
			    p->p_ucred->cr_uid == args->who &&
			    p_cansee(td, p) == 0) {
				FOREACH_THREAD_IN_PROC(p, td1) {
					linux_up_rtprio_if(td1, &rtp);
					found++;
				}
			}
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		break;
	default:
		error = EINVAL;
		break;
	}
	if (error == 0) {
		/* No matching threads at all means ESRCH, as on Linux. */
		if (found != 0)
			td->td_retval[0] = linux_rtprio2ioprio(&rtp);
		else
			error = ESRCH;
	}
	return (error);
}
2824 
/*
 * Linux ioprio_set(2): map the requested Linux I/O priority onto a
 * FreeBSD rtprio and apply it to every thread selected by
 * args->which/args->who.
 */
int
linux_ioprio_set(struct thread *td, struct linux_ioprio_set_args *args)
{
	struct thread *td1;
	struct rtprio rtp;
	struct pgrp *pg;
	struct proc *p;
	int error;

	if ((error = linux_ioprio2rtprio(args->ioprio, &rtp)) != 0)
		return (error);
	/* Attempts to set high priorities (REALTIME) require su privileges. */
	if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME &&
	    (error = priv_check(td, PRIV_SCHED_RTPRIO)) != 0)
		return (error);

	p = NULL;
	td1 = NULL;
	switch (args->which) {
	case LINUX_IOPRIO_WHO_PROCESS:
		if (args->who == 0) {
			/* who == 0 means the calling thread/process. */
			td1 = td;
			p = td1->td_proc;
			PROC_LOCK(p);
		} else if (args->who > PID_MAX) {
			/* Values above PID_MAX denote a Linux tid. */
			td1 = linux_tdfind(td, args->who, -1);
			if (td1 != NULL)
				p = td1->td_proc;
		} else
			p = pfind(args->who);
		/* Both linux_tdfind() and pfind() return p locked. */
		if (p == NULL)
			return (ESRCH);
		if ((error = p_cansched(td, p))) {
			PROC_UNLOCK(p);
			break;
		}
		if (td1 != NULL) {
			/* A specific thread was named; set just that one. */
			error = rtp_to_pri(&rtp, td1);
		} else {
			FOREACH_THREAD_IN_PROC(p, td1) {
				if ((error = rtp_to_pri(&rtp, td1)) != 0)
					break;
			}
		}
		PROC_UNLOCK(p);
		break;
	case LINUX_IOPRIO_WHO_PGRP:
		sx_slock(&proctree_lock);
		if (args->who == 0) {
			pg = td->td_proc->p_pgrp;
			PGRP_LOCK(pg);
		} else {
			/* pgfind() returns the group locked. */
			pg = pgfind(args->who);
			if (pg == NULL) {
				sx_sunlock(&proctree_lock);
				error = ESRCH;
				break;
			}
		}
		sx_sunlock(&proctree_lock);
		/* Apply to every schedulable member of the group. */
		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
			PROC_LOCK(p);
			if (p->p_state == PRS_NORMAL &&
			    p_cansched(td, p) == 0) {
				FOREACH_THREAD_IN_PROC(p, td1) {
					if ((error = rtp_to_pri(&rtp, td1)) != 0)
						break;
				}
			}
			PROC_UNLOCK(p);
			if (error != 0)
				break;
		}
		PGRP_UNLOCK(pg);
		break;
	case LINUX_IOPRIO_WHO_USER:
		/* who == 0 means the caller's own credential uid. */
		if (args->who == 0)
			args->who = td->td_ucred->cr_uid;
		sx_slock(&allproc_lock);
		/* Apply to every schedulable process owned by that uid. */
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state == PRS_NORMAL &&
			    p->p_ucred->cr_uid == args->who &&
			    p_cansched(td, p) == 0) {
				FOREACH_THREAD_IN_PROC(p, td1) {
					if ((error = rtp_to_pri(&rtp, td1)) != 0)
						break;
				}
			}
			PROC_UNLOCK(p);
			if (error != 0)
				break;
		}
		sx_sunlock(&allproc_lock);
		break;
	default:
		error = EINVAL;
		break;
	}
	return (error);
}
2926