xref: /freebsd/sys/compat/linux/linux_misc.c (revision da5432eda807c4b7232d030d5157d5b417ea4f52)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 2002 Doug Rabson
5  * Copyright (c) 1994-1995 Søren Schmidt
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer
13  *    in this position and unchanged.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. The name of the author may not be used to endorse or promote products
18  *    derived from this software without specific prior written permission
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 #include <sys/param.h>
34 #include <sys/fcntl.h>
35 #include <sys/jail.h>
36 #include <sys/imgact.h>
37 #include <sys/limits.h>
38 #include <sys/lock.h>
39 #include <sys/msgbuf.h>
40 #include <sys/mutex.h>
41 #include <sys/poll.h>
42 #include <sys/priv.h>
43 #include <sys/proc.h>
44 #include <sys/procctl.h>
45 #include <sys/reboot.h>
46 #include <sys/random.h>
47 #include <sys/resourcevar.h>
48 #include <sys/rtprio.h>
49 #include <sys/sched.h>
50 #include <sys/smp.h>
51 #include <sys/stat.h>
52 #include <sys/syscallsubr.h>
53 #include <sys/sysctl.h>
54 #include <sys/sysent.h>
55 #include <sys/sysproto.h>
56 #include <sys/time.h>
57 #include <sys/vmmeter.h>
58 #include <sys/vnode.h>
59 
60 #include <security/audit/audit.h>
61 #include <security/mac/mac_framework.h>
62 
63 #include <vm/pmap.h>
64 #include <vm/vm_map.h>
65 #include <vm/swap_pager.h>
66 
67 #ifdef COMPAT_LINUX32
68 #include <machine/../linux32/linux.h>
69 #include <machine/../linux32/linux32_proto.h>
70 #else
71 #include <machine/../linux/linux.h>
72 #include <machine/../linux/linux_proto.h>
73 #endif
74 
75 #include <compat/linux/linux_common.h>
76 #include <compat/linux/linux_dtrace.h>
77 #include <compat/linux/linux_file.h>
78 #include <compat/linux/linux_mib.h>
79 #include <compat/linux/linux_signal.h>
80 #include <compat/linux/linux_time.h>
81 #include <compat/linux/linux_util.h>
82 #include <compat/linux/linux_sysproto.h>
83 #include <compat/linux/linux_emul.h>
84 #include <compat/linux/linux_misc.h>
85 
/*
 * Used by CONVNTCK() below as the ticks-per-second rate when converting
 * a timeval to clock ticks.  Not initialised in this part of the file.
 */
int stclohz;				/* Statistics clock frequency */
87 
/*
 * Translation table indexed by the Linux resource number passed to the
 * rlimit system calls; each slot holds the native RLIMIT_* identifier that
 * is handed to kern_setrlimit()/lim_rlimit().  Callers bounds-check the
 * index against LINUX_RLIM_NLIMITS before using the table.
 */
static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
	RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
	RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
	RLIMIT_MEMLOCK, RLIMIT_AS
};
93 
/*
 * Structure copied out to user space by linux_sysinfo().  All memory and
 * swap sizes are reported in units of mem_unit bytes; linux_sysinfo()
 * always sets mem_unit to 1.
 */
struct l_sysinfo {
	l_long		uptime;		/* Seconds since boot */
	l_ulong		loads[3];	/* 1, 5, and 15 minute load averages */
/* Fixed-point scale applied to the values stored in loads[]. */
#define LINUX_SYSINFO_LOADS_SCALE 65536
	l_ulong		totalram;	/* Total usable main memory size */
	l_ulong		freeram;	/* Available memory size */
	l_ulong		sharedram;	/* Amount of shared memory */
	l_ulong		bufferram;	/* Memory used by buffers */
	l_ulong		totalswap;	/* Total swap space size */
	l_ulong		freeswap;	/* swap space still available */
	l_ushort	procs;		/* Number of current processes */
	l_ushort	pads;		/* Explicit padding, left zeroed */
	l_ulong		totalhigh;	/* Total high memory (always 0 here) */
	l_ulong		freehigh;	/* Free high memory (always 0 here) */
	l_uint		mem_unit;	/* Size in bytes of one memory unit */
	char		_f[20-2*sizeof(l_long)-sizeof(l_int)];	/* padding */
};
111 
/*
 * Argument block for pselect6.
 * NOTE(review): presumably this is the user-space structure the sixth
 * pselect6 argument points at (signal mask pointer plus its size); its
 * consumer is not visible in this part of the file -- confirm.
 */
struct l_pselect6arg {
	l_uintptr_t	ss;		/* presumably user address of the sigset */
	l_size_t	ss_len;		/* presumably size of that sigset in bytes */
};
116 
/*
 * Forward declarations of file-local helpers.  The 64-bit timespec
 * converter is only built for 32-bit Linux ABIs (native i386 or
 * COMPAT_LINUX32 on amd64).
 */
static int	linux_utimensat_lts_to_ts(struct l_timespec *,
			struct timespec *);
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
static int	linux_utimensat_lts64_to_ts(struct l_timespec64 *,
			struct timespec *);
#endif
static int	linux_common_utimensat(struct thread *, int,
			const char *, struct timespec *, int);
static int	linux_common_pselect6(struct thread *, l_int,
			l_fd_set *, l_fd_set *, l_fd_set *,
			struct timespec *, l_uintptr_t *);
static int	linux_common_ppoll(struct thread *, struct pollfd *,
			uint32_t, struct timespec *, l_sigset_t *,
			l_size_t);
static int	linux_pollin(struct thread *, struct pollfd *,
			struct pollfd *, u_int);
static int	linux_pollout(struct thread *, struct pollfd *,
			struct pollfd *, u_int);
135 
136 int
137 linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
138 {
139 	struct l_sysinfo sysinfo;
140 	int i, j;
141 	struct timespec ts;
142 
143 	bzero(&sysinfo, sizeof(sysinfo));
144 	getnanouptime(&ts);
145 	if (ts.tv_nsec != 0)
146 		ts.tv_sec++;
147 	sysinfo.uptime = ts.tv_sec;
148 
149 	/* Use the information from the mib to get our load averages */
150 	for (i = 0; i < 3; i++)
151 		sysinfo.loads[i] = averunnable.ldavg[i] *
152 		    LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
153 
154 	sysinfo.totalram = physmem * PAGE_SIZE;
155 	sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE;
156 
157 	/*
158 	 * sharedram counts pages allocated to named, swap-backed objects such
159 	 * as shared memory segments and tmpfs files.  There is no cheap way to
160 	 * compute this, so just leave the field unpopulated.  Linux itself only
161 	 * started setting this field in the 3.x timeframe.
162 	 */
163 	sysinfo.sharedram = 0;
164 	sysinfo.bufferram = 0;
165 
166 	swap_pager_status(&i, &j);
167 	sysinfo.totalswap = i * PAGE_SIZE;
168 	sysinfo.freeswap = (i - j) * PAGE_SIZE;
169 
170 	sysinfo.procs = nprocs;
171 
172 	/*
173 	 * Platforms supported by the emulation layer do not have a notion of
174 	 * high memory.
175 	 */
176 	sysinfo.totalhigh = 0;
177 	sysinfo.freehigh = 0;
178 
179 	sysinfo.mem_unit = 1;
180 
181 	return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
182 }
183 
184 #ifdef LINUX_LEGACY_SYSCALLS
185 int
186 linux_alarm(struct thread *td, struct linux_alarm_args *args)
187 {
188 	struct itimerval it, old_it;
189 	u_int secs;
190 	int error __diagused;
191 
192 	secs = args->secs;
193 	/*
194 	 * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
195 	 * to match kern_setitimer()'s limit to avoid error from it.
196 	 *
197 	 * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
198 	 * platforms.
199 	 */
200 	if (secs > INT32_MAX / 2)
201 		secs = INT32_MAX / 2;
202 
203 	it.it_value.tv_sec = secs;
204 	it.it_value.tv_usec = 0;
205 	timevalclear(&it.it_interval);
206 	error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
207 	KASSERT(error == 0, ("kern_setitimer returns %d", error));
208 
209 	if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
210 	    old_it.it_value.tv_usec >= 500000)
211 		old_it.it_value.tv_sec++;
212 	td->td_retval[0] = old_it.it_value.tv_sec;
213 	return (0);
214 }
215 #endif
216 
217 int
218 linux_brk(struct thread *td, struct linux_brk_args *args)
219 {
220 	struct vmspace *vm = td->td_proc->p_vmspace;
221 	uintptr_t new, old;
222 
223 	old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
224 	new = (uintptr_t)args->dsend;
225 	if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
226 		td->td_retval[0] = (register_t)new;
227 	else
228 		td->td_retval[0] = (register_t)old;
229 
230 	return (0);
231 }
232 
233 #ifdef LINUX_LEGACY_SYSCALLS
234 int
235 linux_select(struct thread *td, struct linux_select_args *args)
236 {
237 	l_timeval ltv;
238 	struct timeval tv0, tv1, utv, *tvp;
239 	int error;
240 
241 	/*
242 	 * Store current time for computation of the amount of
243 	 * time left.
244 	 */
245 	if (args->timeout) {
246 		if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
247 			goto select_out;
248 		utv.tv_sec = ltv.tv_sec;
249 		utv.tv_usec = ltv.tv_usec;
250 
251 		if (itimerfix(&utv)) {
252 			/*
253 			 * The timeval was invalid.  Convert it to something
254 			 * valid that will act as it does under Linux.
255 			 */
256 			utv.tv_sec += utv.tv_usec / 1000000;
257 			utv.tv_usec %= 1000000;
258 			if (utv.tv_usec < 0) {
259 				utv.tv_sec -= 1;
260 				utv.tv_usec += 1000000;
261 			}
262 			if (utv.tv_sec < 0)
263 				timevalclear(&utv);
264 		}
265 		microtime(&tv0);
266 		tvp = &utv;
267 	} else
268 		tvp = NULL;
269 
270 	error = kern_select(td, args->nfds, args->readfds, args->writefds,
271 	    args->exceptfds, tvp, LINUX_NFDBITS);
272 	if (error)
273 		goto select_out;
274 
275 	if (args->timeout) {
276 		if (td->td_retval[0]) {
277 			/*
278 			 * Compute how much time was left of the timeout,
279 			 * by subtracting the current time and the time
280 			 * before we started the call, and subtracting
281 			 * that result from the user-supplied value.
282 			 */
283 			microtime(&tv1);
284 			timevalsub(&tv1, &tv0);
285 			timevalsub(&utv, &tv1);
286 			if (utv.tv_sec < 0)
287 				timevalclear(&utv);
288 		} else
289 			timevalclear(&utv);
290 		ltv.tv_sec = utv.tv_sec;
291 		ltv.tv_usec = utv.tv_usec;
292 		if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
293 			goto select_out;
294 	}
295 
296 select_out:
297 	return (error);
298 }
299 #endif
300 
301 int
302 linux_mremap(struct thread *td, struct linux_mremap_args *args)
303 {
304 	uintptr_t addr;
305 	size_t len;
306 	int error = 0;
307 
308 	if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
309 		td->td_retval[0] = 0;
310 		return (EINVAL);
311 	}
312 
313 	/*
314 	 * Check for the page alignment.
315 	 * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
316 	 */
317 	if (args->addr & PAGE_MASK) {
318 		td->td_retval[0] = 0;
319 		return (EINVAL);
320 	}
321 
322 	args->new_len = round_page(args->new_len);
323 	args->old_len = round_page(args->old_len);
324 
325 	if (args->new_len > args->old_len) {
326 		td->td_retval[0] = 0;
327 		return (ENOMEM);
328 	}
329 
330 	if (args->new_len < args->old_len) {
331 		addr = args->addr + args->new_len;
332 		len = args->old_len - args->new_len;
333 		error = kern_munmap(td, addr, len);
334 	}
335 
336 	td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
337 	return (error);
338 }
339 
340 #define LINUX_MS_ASYNC       0x0001
341 #define LINUX_MS_INVALIDATE  0x0002
342 #define LINUX_MS_SYNC        0x0004
343 
344 int
345 linux_msync(struct thread *td, struct linux_msync_args *args)
346 {
347 
348 	return (kern_msync(td, args->addr, args->len,
349 	    args->fl & ~LINUX_MS_SYNC));
350 }
351 
352 #ifdef LINUX_LEGACY_SYSCALLS
353 int
354 linux_time(struct thread *td, struct linux_time_args *args)
355 {
356 	struct timeval tv;
357 	l_time_t tm;
358 	int error;
359 
360 	microtime(&tv);
361 	tm = tv.tv_sec;
362 	if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
363 		return (error);
364 	td->td_retval[0] = tm;
365 	return (0);
366 }
367 #endif
368 
/*
 * Buffer copied out by linux_times().  Every field is a clock-tick count
 * produced by CONVTCK() from the corresponding timeval.
 */
struct l_times_argv {
	l_clock_t	tms_utime;	/* User time of the process */
	l_clock_t	tms_stime;	/* System time of the process */
	l_clock_t	tms_cutime;	/* User time of its children */
	l_clock_t	tms_cstime;	/* System time of its children */
};
375 
/*
 * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
 * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
 * auxiliary vector entry.
 */
#define	CLK_TCK		100

/*
 * Convert a struct timeval `r' to clock ticks.
 *
 * CONVOTCK uses the fixed legacy rate CLK_TCK, CONVNTCK uses stclohz.
 * CONVTCK picks between them based on the emulated kernel version and
 * requires a `struct thread *td' to be in scope at the point of use.
 * The argument is parenthesized so that any lvalue expression, not just
 * a plain identifier, can be passed.
 */
#define	CONVOTCK(r)	((r).tv_sec * CLK_TCK +				\
			    (r).tv_usec / (1000000 / CLK_TCK))
#define	CONVNTCK(r)	((r).tv_sec * stclohz +				\
			    (r).tv_usec / (1000000 / stclohz))

#define	CONVTCK(r)	(linux_kernver(td) >= LINUX_KERNVER(2,4,0) ?	\
			    CONVNTCK(r) : CONVOTCK(r))
388 
389 int
390 linux_times(struct thread *td, struct linux_times_args *args)
391 {
392 	struct timeval tv, utime, stime, cutime, cstime;
393 	struct l_times_argv tms;
394 	struct proc *p;
395 	int error;
396 
397 	if (args->buf != NULL) {
398 		p = td->td_proc;
399 		PROC_LOCK(p);
400 		PROC_STATLOCK(p);
401 		calcru(p, &utime, &stime);
402 		PROC_STATUNLOCK(p);
403 		calccru(p, &cutime, &cstime);
404 		PROC_UNLOCK(p);
405 
406 		tms.tms_utime = CONVTCK(utime);
407 		tms.tms_stime = CONVTCK(stime);
408 
409 		tms.tms_cutime = CONVTCK(cutime);
410 		tms.tms_cstime = CONVTCK(cstime);
411 
412 		if ((error = copyout(&tms, args->buf, sizeof(tms))))
413 			return (error);
414 	}
415 
416 	microuptime(&tv);
417 	td->td_retval[0] = (int)CONVTCK(tv);
418 	return (0);
419 }
420 
421 int
422 linux_newuname(struct thread *td, struct linux_newuname_args *args)
423 {
424 	struct l_new_utsname utsname;
425 	char osname[LINUX_MAX_UTSNAME];
426 	char osrelease[LINUX_MAX_UTSNAME];
427 	char *p;
428 
429 	linux_get_osname(td, osname);
430 	linux_get_osrelease(td, osrelease);
431 
432 	bzero(&utsname, sizeof(utsname));
433 	strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
434 	getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
435 	getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
436 	strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
437 	strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
438 	for (p = utsname.version; *p != '\0'; ++p)
439 		if (*p == '\n') {
440 			*p = '\0';
441 			break;
442 		}
443 #if defined(__amd64__)
444 	/*
445 	 * On amd64, Linux uname(2) needs to return "x86_64"
446 	 * for both 64-bit and 32-bit applications.  On 32-bit,
447 	 * the string returned by getauxval(AT_PLATFORM) needs
448 	 * to remain "i686", though.
449 	 */
450 #if defined(COMPAT_LINUX32)
451 	if (linux32_emulate_i386)
452 		strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
453 	else
454 #endif
455 	strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME);
456 #elif defined(__aarch64__)
457 	strlcpy(utsname.machine, "aarch64", LINUX_MAX_UTSNAME);
458 #elif defined(__i386__)
459 	strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
460 #endif
461 
462 	return (copyout(&utsname, args->buf, sizeof(utsname)));
463 }
464 
/*
 * Buffer copied in by linux_utime(); both times are in whole seconds.
 */
struct l_utimbuf {
	l_time_t l_actime;	/* Access time */
	l_time_t l_modtime;	/* Modification time */
};
469 
470 #ifdef LINUX_LEGACY_SYSCALLS
471 int
472 linux_utime(struct thread *td, struct linux_utime_args *args)
473 {
474 	struct timeval tv[2], *tvp;
475 	struct l_utimbuf lut;
476 	int error;
477 
478 	if (args->times) {
479 		if ((error = copyin(args->times, &lut, sizeof lut)) != 0)
480 			return (error);
481 		tv[0].tv_sec = lut.l_actime;
482 		tv[0].tv_usec = 0;
483 		tv[1].tv_sec = lut.l_modtime;
484 		tv[1].tv_usec = 0;
485 		tvp = tv;
486 	} else
487 		tvp = NULL;
488 
489 	return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
490 	    tvp, UIO_SYSSPACE));
491 }
492 #endif
493 
494 #ifdef LINUX_LEGACY_SYSCALLS
495 int
496 linux_utimes(struct thread *td, struct linux_utimes_args *args)
497 {
498 	l_timeval ltv[2];
499 	struct timeval tv[2], *tvp = NULL;
500 	int error;
501 
502 	if (args->tptr != NULL) {
503 		if ((error = copyin(args->tptr, ltv, sizeof ltv)) != 0)
504 			return (error);
505 		tv[0].tv_sec = ltv[0].tv_sec;
506 		tv[0].tv_usec = ltv[0].tv_usec;
507 		tv[1].tv_sec = ltv[1].tv_sec;
508 		tv[1].tv_usec = ltv[1].tv_usec;
509 		tvp = tv;
510 	}
511 
512 	return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
513 	    tvp, UIO_SYSSPACE));
514 }
515 #endif
516 
517 static int
518 linux_utimensat_lts_to_ts(struct l_timespec *l_times, struct timespec *times)
519 {
520 
521 	if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
522 	    l_times->tv_nsec != LINUX_UTIME_NOW &&
523 	    (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
524 		return (EINVAL);
525 
526 	times->tv_sec = l_times->tv_sec;
527 	switch (l_times->tv_nsec)
528 	{
529 	case LINUX_UTIME_OMIT:
530 		times->tv_nsec = UTIME_OMIT;
531 		break;
532 	case LINUX_UTIME_NOW:
533 		times->tv_nsec = UTIME_NOW;
534 		break;
535 	default:
536 		times->tv_nsec = l_times->tv_nsec;
537 	}
538 
539 	return (0);
540 }
541 
542 static int
543 linux_common_utimensat(struct thread *td, int ldfd, const char *pathname,
544     struct timespec *timesp, int lflags)
545 {
546 	int dfd, flags = 0;
547 
548 	dfd = (ldfd == LINUX_AT_FDCWD) ? AT_FDCWD : ldfd;
549 
550 	if (lflags & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH))
551 		return (EINVAL);
552 
553 	if (timesp != NULL) {
554 		/* This breaks POSIX, but is what the Linux kernel does
555 		 * _on purpose_ (documented in the man page for utimensat(2)),
556 		 * so we must follow that behaviour. */
557 		if (timesp[0].tv_nsec == UTIME_OMIT &&
558 		    timesp[1].tv_nsec == UTIME_OMIT)
559 			return (0);
560 	}
561 
562 	if (lflags & LINUX_AT_SYMLINK_NOFOLLOW)
563 		flags |= AT_SYMLINK_NOFOLLOW;
564 	if (lflags & LINUX_AT_EMPTY_PATH)
565 		flags |= AT_EMPTY_PATH;
566 
567 	if (pathname != NULL)
568 		return (kern_utimensat(td, dfd, pathname,
569 		    UIO_USERSPACE, timesp, UIO_SYSSPACE, flags));
570 
571 	if (lflags != 0)
572 		return (EINVAL);
573 
574 	return (kern_futimens(td, dfd, timesp, UIO_SYSSPACE));
575 }
576 
577 int
578 linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
579 {
580 	struct l_timespec l_times[2];
581 	struct timespec times[2], *timesp;
582 	int error;
583 
584 	if (args->times != NULL) {
585 		error = copyin(args->times, l_times, sizeof(l_times));
586 		if (error != 0)
587 			return (error);
588 
589 		error = linux_utimensat_lts_to_ts(&l_times[0], &times[0]);
590 		if (error != 0)
591 			return (error);
592 		error = linux_utimensat_lts_to_ts(&l_times[1], &times[1]);
593 		if (error != 0)
594 			return (error);
595 		timesp = times;
596 	} else
597 		timesp = NULL;
598 
599 	return (linux_common_utimensat(td, args->dfd, args->pathname,
600 	    timesp, args->flags));
601 }
602 
603 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
604 static int
605 linux_utimensat_lts64_to_ts(struct l_timespec64 *l_times, struct timespec *times)
606 {
607 
608 	/* Zero out the padding in compat mode. */
609 	l_times->tv_nsec &= 0xFFFFFFFFUL;
610 
611 	if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
612 	    l_times->tv_nsec != LINUX_UTIME_NOW &&
613 	    (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
614 		return (EINVAL);
615 
616 	times->tv_sec = l_times->tv_sec;
617 	switch (l_times->tv_nsec)
618 	{
619 	case LINUX_UTIME_OMIT:
620 		times->tv_nsec = UTIME_OMIT;
621 		break;
622 	case LINUX_UTIME_NOW:
623 		times->tv_nsec = UTIME_NOW;
624 		break;
625 	default:
626 		times->tv_nsec = l_times->tv_nsec;
627 	}
628 
629 	return (0);
630 }
631 
632 int
633 linux_utimensat_time64(struct thread *td, struct linux_utimensat_time64_args *args)
634 {
635 	struct l_timespec64 l_times[2];
636 	struct timespec times[2], *timesp;
637 	int error;
638 
639 	if (args->times64 != NULL) {
640 		error = copyin(args->times64, l_times, sizeof(l_times));
641 		if (error != 0)
642 			return (error);
643 
644 		error = linux_utimensat_lts64_to_ts(&l_times[0], &times[0]);
645 		if (error != 0)
646 			return (error);
647 		error = linux_utimensat_lts64_to_ts(&l_times[1], &times[1]);
648 		if (error != 0)
649 			return (error);
650 		timesp = times;
651 	} else
652 		timesp = NULL;
653 
654 	return (linux_common_utimensat(td, args->dfd, args->pathname,
655 	    timesp, args->flags));
656 }
657 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
658 
659 #ifdef LINUX_LEGACY_SYSCALLS
660 int
661 linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
662 {
663 	l_timeval ltv[2];
664 	struct timeval tv[2], *tvp = NULL;
665 	int error, dfd;
666 
667 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
668 
669 	if (args->utimes != NULL) {
670 		if ((error = copyin(args->utimes, ltv, sizeof ltv)) != 0)
671 			return (error);
672 		tv[0].tv_sec = ltv[0].tv_sec;
673 		tv[0].tv_usec = ltv[0].tv_usec;
674 		tv[1].tv_sec = ltv[1].tv_sec;
675 		tv[1].tv_usec = ltv[1].tv_usec;
676 		tvp = tv;
677 	}
678 
679 	return (kern_utimesat(td, dfd, args->filename, UIO_USERSPACE,
680 	    tvp, UIO_SYSSPACE));
681 }
682 #endif
683 
684 static int
685 linux_common_wait(struct thread *td, idtype_t idtype, int id, int *statusp,
686     int options, void *rup, l_siginfo_t *infop)
687 {
688 	l_siginfo_t lsi;
689 	siginfo_t siginfo;
690 	struct __wrusage wru;
691 	int error, status, tmpstat, sig;
692 
693 	error = kern_wait6(td, idtype, id, &status, options,
694 	    rup != NULL ? &wru : NULL, &siginfo);
695 
696 	if (error == 0 && statusp) {
697 		tmpstat = status & 0xffff;
698 		if (WIFSIGNALED(tmpstat)) {
699 			tmpstat = (tmpstat & 0xffffff80) |
700 			    bsd_to_linux_signal(WTERMSIG(tmpstat));
701 		} else if (WIFSTOPPED(tmpstat)) {
702 			tmpstat = (tmpstat & 0xffff00ff) |
703 			    (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
704 #if defined(__aarch64__) || (defined(__amd64__) && !defined(COMPAT_LINUX32))
705 			if (WSTOPSIG(status) == SIGTRAP) {
706 				tmpstat = linux_ptrace_status(td,
707 				    siginfo.si_pid, tmpstat);
708 			}
709 #endif
710 		} else if (WIFCONTINUED(tmpstat)) {
711 			tmpstat = 0xffff;
712 		}
713 		error = copyout(&tmpstat, statusp, sizeof(int));
714 	}
715 	if (error == 0 && rup != NULL)
716 		error = linux_copyout_rusage(&wru.wru_self, rup);
717 	if (error == 0 && infop != NULL && td->td_retval[0] != 0) {
718 		sig = bsd_to_linux_signal(siginfo.si_signo);
719 		siginfo_to_lsiginfo(&siginfo, &lsi, sig);
720 		error = copyout(&lsi, infop, sizeof(lsi));
721 	}
722 
723 	return (error);
724 }
725 
726 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
727 int
728 linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
729 {
730 	struct linux_wait4_args wait4_args = {
731 		.pid = args->pid,
732 		.status = args->status,
733 		.options = args->options,
734 		.rusage = NULL,
735 	};
736 
737 	return (linux_wait4(td, &wait4_args));
738 }
739 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
740 
741 int
742 linux_wait4(struct thread *td, struct linux_wait4_args *args)
743 {
744 	struct proc *p;
745 	int options, id, idtype;
746 
747 	if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
748 	    LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
749 		return (EINVAL);
750 
751 	/* -INT_MIN is not defined. */
752 	if (args->pid == INT_MIN)
753 		return (ESRCH);
754 
755 	options = 0;
756 	linux_to_bsd_waitopts(args->options, &options);
757 
758 	/*
759 	 * For backward compatibility we implicitly add flags WEXITED
760 	 * and WTRAPPED here.
761 	 */
762 	options |= WEXITED | WTRAPPED;
763 
764 	if (args->pid == WAIT_ANY) {
765 		idtype = P_ALL;
766 		id = 0;
767 	} else if (args->pid < 0) {
768 		idtype = P_PGID;
769 		id = (id_t)-args->pid;
770 	} else if (args->pid == 0) {
771 		idtype = P_PGID;
772 		p = td->td_proc;
773 		PROC_LOCK(p);
774 		id = p->p_pgid;
775 		PROC_UNLOCK(p);
776 	} else {
777 		idtype = P_PID;
778 		id = (id_t)args->pid;
779 	}
780 
781 	return (linux_common_wait(td, idtype, id, args->status, options,
782 	    args->rusage, NULL));
783 }
784 
785 int
786 linux_waitid(struct thread *td, struct linux_waitid_args *args)
787 {
788 	idtype_t idtype;
789 	int error, options;
790 	struct proc *p;
791 	pid_t id;
792 
793 	if (args->options & ~(LINUX_WNOHANG | LINUX_WNOWAIT | LINUX_WEXITED |
794 	    LINUX_WSTOPPED | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
795 		return (EINVAL);
796 
797 	options = 0;
798 	linux_to_bsd_waitopts(args->options, &options);
799 
800 	id = args->id;
801 	switch (args->idtype) {
802 	case LINUX_P_ALL:
803 		idtype = P_ALL;
804 		break;
805 	case LINUX_P_PID:
806 		if (args->id <= 0)
807 			return (EINVAL);
808 		idtype = P_PID;
809 		break;
810 	case LINUX_P_PGID:
811 		if (linux_kernver(td) >= LINUX_KERNVER(5,4,0) && args->id == 0) {
812 			p = td->td_proc;
813 			PROC_LOCK(p);
814 			id = p->p_pgid;
815 			PROC_UNLOCK(p);
816 		} else if (args->id <= 0)
817 			return (EINVAL);
818 		idtype = P_PGID;
819 		break;
820 	case LINUX_P_PIDFD:
821 		LINUX_RATELIMIT_MSG("unsupported waitid P_PIDFD idtype");
822 		return (ENOSYS);
823 	default:
824 		return (EINVAL);
825 	}
826 
827 	error = linux_common_wait(td, idtype, id, NULL, options,
828 	    args->rusage, args->info);
829 	td->td_retval[0] = 0;
830 
831 	return (error);
832 }
833 
834 #ifdef LINUX_LEGACY_SYSCALLS
835 int
836 linux_mknod(struct thread *td, struct linux_mknod_args *args)
837 {
838 	int error;
839 
840 	switch (args->mode & S_IFMT) {
841 	case S_IFIFO:
842 	case S_IFSOCK:
843 		error = kern_mkfifoat(td, AT_FDCWD, args->path, UIO_USERSPACE,
844 		    args->mode);
845 		break;
846 
847 	case S_IFCHR:
848 	case S_IFBLK:
849 		error = kern_mknodat(td, AT_FDCWD, args->path, UIO_USERSPACE,
850 		    args->mode, linux_decode_dev(args->dev));
851 		break;
852 
853 	case S_IFDIR:
854 		error = EPERM;
855 		break;
856 
857 	case 0:
858 		args->mode |= S_IFREG;
859 		/* FALLTHROUGH */
860 	case S_IFREG:
861 		error = kern_openat(td, AT_FDCWD, args->path, UIO_USERSPACE,
862 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
863 		if (error == 0)
864 			kern_close(td, td->td_retval[0]);
865 		break;
866 
867 	default:
868 		error = EINVAL;
869 		break;
870 	}
871 	return (error);
872 }
873 #endif
874 
875 int
876 linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
877 {
878 	int error, dfd;
879 
880 	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
881 
882 	switch (args->mode & S_IFMT) {
883 	case S_IFIFO:
884 	case S_IFSOCK:
885 		error = kern_mkfifoat(td, dfd, args->filename, UIO_USERSPACE,
886 		    args->mode);
887 		break;
888 
889 	case S_IFCHR:
890 	case S_IFBLK:
891 		error = kern_mknodat(td, dfd, args->filename, UIO_USERSPACE,
892 		    args->mode, linux_decode_dev(args->dev));
893 		break;
894 
895 	case S_IFDIR:
896 		error = EPERM;
897 		break;
898 
899 	case 0:
900 		args->mode |= S_IFREG;
901 		/* FALLTHROUGH */
902 	case S_IFREG:
903 		error = kern_openat(td, dfd, args->filename, UIO_USERSPACE,
904 		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
905 		if (error == 0)
906 			kern_close(td, td->td_retval[0]);
907 		break;
908 
909 	default:
910 		error = EINVAL;
911 		break;
912 	}
913 	return (error);
914 }
915 
916 /*
917  * UGH! This is just about the dumbest idea I've ever heard!!
918  */
919 int
920 linux_personality(struct thread *td, struct linux_personality_args *args)
921 {
922 	struct linux_pemuldata *pem;
923 	struct proc *p = td->td_proc;
924 	uint32_t old;
925 
926 	PROC_LOCK(p);
927 	pem = pem_find(p);
928 	old = pem->persona;
929 	if (args->per != 0xffffffff)
930 		pem->persona = args->per;
931 	PROC_UNLOCK(p);
932 
933 	td->td_retval[0] = old;
934 	return (0);
935 }
936 
/*
 * Linux-layout interval timer value, copied in and out by
 * linux_setitimer() and linux_getitimer().
 */
struct l_itimerval {
	l_timeval it_interval;	/* Reload interval */
	l_timeval it_value;	/* Time until next expiry */
};
941 
/*
 * Copy an interval timer value field by field from *lip to *bip.  The
 * two pointers may refer to different structure types (native or Linux
 * layout) as long as both have it_interval and it_value members with
 * tv_sec and tv_usec fields.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and is safe under an unbraced if/else.
 */
#define	B2L_ITIMERVAL(bip, lip)	do {					\
	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;		\
} while (0)
947 
948 int
949 linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
950 {
951 	int error;
952 	struct l_itimerval ls;
953 	struct itimerval aitv, oitv;
954 
955 	if (uap->itv == NULL) {
956 		uap->itv = uap->oitv;
957 		return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
958 	}
959 
960 	error = copyin(uap->itv, &ls, sizeof(ls));
961 	if (error != 0)
962 		return (error);
963 	B2L_ITIMERVAL(&aitv, &ls);
964 	error = kern_setitimer(td, uap->which, &aitv, &oitv);
965 	if (error != 0 || uap->oitv == NULL)
966 		return (error);
967 	B2L_ITIMERVAL(&ls, &oitv);
968 
969 	return (copyout(&ls, uap->oitv, sizeof(ls)));
970 }
971 
972 int
973 linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
974 {
975 	int error;
976 	struct l_itimerval ls;
977 	struct itimerval aitv;
978 
979 	error = kern_getitimer(td, uap->which, &aitv);
980 	if (error != 0)
981 		return (error);
982 	B2L_ITIMERVAL(&ls, &aitv);
983 	return (copyout(&ls, uap->itv, sizeof(ls)));
984 }
985 
986 #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
987 int
988 linux_nice(struct thread *td, struct linux_nice_args *args)
989 {
990 
991 	return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc));
992 }
993 #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
994 
995 int
996 linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
997 {
998 	struct ucred *newcred, *oldcred;
999 	l_gid_t *linux_gidset;
1000 	gid_t *bsd_gidset;
1001 	int ngrp, error;
1002 	struct proc *p;
1003 
1004 	ngrp = args->gidsetsize;
1005 	if (ngrp < 0 || ngrp >= ngroups_max + 1)
1006 		return (EINVAL);
1007 	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
1008 	error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
1009 	if (error)
1010 		goto out;
1011 	newcred = crget();
1012 	crextend(newcred, ngrp + 1);
1013 	p = td->td_proc;
1014 	PROC_LOCK(p);
1015 	oldcred = p->p_ucred;
1016 	crcopy(newcred, oldcred);
1017 
1018 	/*
1019 	 * cr_groups[0] holds egid. Setting the whole set from
1020 	 * the supplied set will cause egid to be changed too.
1021 	 * Keep cr_groups[0] unchanged to prevent that.
1022 	 */
1023 
1024 	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) {
1025 		PROC_UNLOCK(p);
1026 		crfree(newcred);
1027 		goto out;
1028 	}
1029 
1030 	if (ngrp > 0) {
1031 		newcred->cr_ngroups = ngrp + 1;
1032 
1033 		bsd_gidset = newcred->cr_groups;
1034 		ngrp--;
1035 		while (ngrp >= 0) {
1036 			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
1037 			ngrp--;
1038 		}
1039 	} else
1040 		newcred->cr_ngroups = 1;
1041 
1042 	setsugid(p);
1043 	proc_set_cred(p, newcred);
1044 	PROC_UNLOCK(p);
1045 	crfree(oldcred);
1046 	error = 0;
1047 out:
1048 	free(linux_gidset, M_LINUX);
1049 	return (error);
1050 }
1051 
1052 int
1053 linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
1054 {
1055 	struct ucred *cred;
1056 	l_gid_t *linux_gidset;
1057 	gid_t *bsd_gidset;
1058 	int bsd_gidsetsz, ngrp, error;
1059 
1060 	cred = td->td_ucred;
1061 	bsd_gidset = cred->cr_groups;
1062 	bsd_gidsetsz = cred->cr_ngroups - 1;
1063 
1064 	/*
1065 	 * cr_groups[0] holds egid. Returning the whole set
1066 	 * here will cause a duplicate. Exclude cr_groups[0]
1067 	 * to prevent that.
1068 	 */
1069 
1070 	if ((ngrp = args->gidsetsize) == 0) {
1071 		td->td_retval[0] = bsd_gidsetsz;
1072 		return (0);
1073 	}
1074 
1075 	if (ngrp < bsd_gidsetsz)
1076 		return (EINVAL);
1077 
1078 	ngrp = 0;
1079 	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
1080 	    M_LINUX, M_WAITOK);
1081 	while (ngrp < bsd_gidsetsz) {
1082 		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
1083 		ngrp++;
1084 	}
1085 
1086 	error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
1087 	free(linux_gidset, M_LINUX);
1088 	if (error)
1089 		return (error);
1090 
1091 	td->td_retval[0] = ngrp;
1092 	return (0);
1093 }
1094 
1095 static bool
1096 linux_get_dummy_limit(l_uint resource, struct rlimit *rlim)
1097 {
1098 
1099 	if (linux_dummy_rlimits == 0)
1100 		return (false);
1101 
1102 	switch (resource) {
1103 	case LINUX_RLIMIT_LOCKS:
1104 	case LINUX_RLIMIT_SIGPENDING:
1105 	case LINUX_RLIMIT_MSGQUEUE:
1106 	case LINUX_RLIMIT_RTTIME:
1107 		rlim->rlim_cur = LINUX_RLIM_INFINITY;
1108 		rlim->rlim_max = LINUX_RLIM_INFINITY;
1109 		return (true);
1110 	case LINUX_RLIMIT_NICE:
1111 	case LINUX_RLIMIT_RTPRIO:
1112 		rlim->rlim_cur = 0;
1113 		rlim->rlim_max = 0;
1114 		return (true);
1115 	default:
1116 		return (false);
1117 	}
1118 }
1119 
1120 int
1121 linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1122 {
1123 	struct rlimit bsd_rlim;
1124 	struct l_rlimit rlim;
1125 	u_int which;
1126 	int error;
1127 
1128 	if (args->resource >= LINUX_RLIM_NLIMITS)
1129 		return (EINVAL);
1130 
1131 	which = linux_to_bsd_resource[args->resource];
1132 	if (which == -1)
1133 		return (EINVAL);
1134 
1135 	error = copyin(args->rlim, &rlim, sizeof(rlim));
1136 	if (error)
1137 		return (error);
1138 
1139 	bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1140 	bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1141 	return (kern_setrlimit(td, which, &bsd_rlim));
1142 }
1143 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * Old-style Linux getrlimit(2).
 *
 * Resources handled by linux_get_dummy_limit() are answered directly.
 * Otherwise the resource is translated through linux_to_bsd_resource[],
 * the current limit is fetched with lim_rlimit() and narrowed to the
 * Linux structure.  A narrowed value equal to the unsigned maximum is
 * replaced by the corresponding signed maximum.
 *
 * Returns EINVAL for an out-of-range or unmapped (-1) resource, otherwise
 * the result of copyout() to args->rlim.
 */
int
linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
{
	struct rlimit brl;
	struct l_rlimit lrl;
	u_int which;

	if (linux_get_dummy_limit(args->resource, &brl)) {
		lrl.rlim_cur = brl.rlim_cur;
		lrl.rlim_max = brl.rlim_max;
		return (copyout(&lrl, args->rlim, sizeof(lrl)));
	}

	if (args->resource >= LINUX_RLIM_NLIMITS)
		return (EINVAL);
	which = linux_to_bsd_resource[args->resource];
	if (which == -1)
		return (EINVAL);

	lim_rlimit(td, which, &brl);

#ifdef COMPAT_LINUX32
	lrl.rlim_cur = (unsigned int)brl.rlim_cur;
	lrl.rlim_max = (unsigned int)brl.rlim_max;
	if (lrl.rlim_cur == UINT_MAX)
		lrl.rlim_cur = INT_MAX;
	if (lrl.rlim_max == UINT_MAX)
		lrl.rlim_max = INT_MAX;
#else
	lrl.rlim_cur = (unsigned long)brl.rlim_cur;
	lrl.rlim_max = (unsigned long)brl.rlim_max;
	if (lrl.rlim_cur == ULONG_MAX)
		lrl.rlim_cur = LONG_MAX;
	if (lrl.rlim_max == ULONG_MAX)
		lrl.rlim_max = LONG_MAX;
#endif
	return (copyout(&lrl, args->rlim, sizeof(lrl)));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1185 
1186 int
1187 linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1188 {
1189 	struct l_rlimit rlim;
1190 	struct rlimit bsd_rlim;
1191 	u_int which;
1192 
1193 	if (linux_get_dummy_limit(args->resource, &bsd_rlim)) {
1194 		rlim.rlim_cur = bsd_rlim.rlim_cur;
1195 		rlim.rlim_max = bsd_rlim.rlim_max;
1196 		return (copyout(&rlim, args->rlim, sizeof(rlim)));
1197 	}
1198 
1199 	if (args->resource >= LINUX_RLIM_NLIMITS)
1200 		return (EINVAL);
1201 
1202 	which = linux_to_bsd_resource[args->resource];
1203 	if (which == -1)
1204 		return (EINVAL);
1205 
1206 	lim_rlimit(td, which, &bsd_rlim);
1207 
1208 	rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1209 	rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1210 	return (copyout(&rlim, args->rlim, sizeof(rlim)));
1211 }
1212 
1213 int
1214 linux_sched_setscheduler(struct thread *td,
1215     struct linux_sched_setscheduler_args *args)
1216 {
1217 	struct sched_param sched_param;
1218 	struct thread *tdt;
1219 	int error, policy;
1220 
1221 	switch (args->policy) {
1222 	case LINUX_SCHED_OTHER:
1223 		policy = SCHED_OTHER;
1224 		break;
1225 	case LINUX_SCHED_FIFO:
1226 		policy = SCHED_FIFO;
1227 		break;
1228 	case LINUX_SCHED_RR:
1229 		policy = SCHED_RR;
1230 		break;
1231 	default:
1232 		return (EINVAL);
1233 	}
1234 
1235 	error = copyin(args->param, &sched_param, sizeof(sched_param));
1236 	if (error)
1237 		return (error);
1238 
1239 	if (linux_map_sched_prio) {
1240 		switch (policy) {
1241 		case SCHED_OTHER:
1242 			if (sched_param.sched_priority != 0)
1243 				return (EINVAL);
1244 
1245 			sched_param.sched_priority =
1246 			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1247 			break;
1248 		case SCHED_FIFO:
1249 		case SCHED_RR:
1250 			if (sched_param.sched_priority < 1 ||
1251 			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
1252 				return (EINVAL);
1253 
1254 			/*
1255 			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
1256 			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1257 			 */
1258 			sched_param.sched_priority =
1259 			    (sched_param.sched_priority - 1) *
1260 			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1261 			    (LINUX_MAX_RT_PRIO - 1);
1262 			break;
1263 		}
1264 	}
1265 
1266 	tdt = linux_tdfind(td, args->pid, -1);
1267 	if (tdt == NULL)
1268 		return (ESRCH);
1269 
1270 	error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
1271 	PROC_UNLOCK(tdt->td_proc);
1272 	return (error);
1273 }
1274 
1275 int
1276 linux_sched_getscheduler(struct thread *td,
1277     struct linux_sched_getscheduler_args *args)
1278 {
1279 	struct thread *tdt;
1280 	int error, policy;
1281 
1282 	tdt = linux_tdfind(td, args->pid, -1);
1283 	if (tdt == NULL)
1284 		return (ESRCH);
1285 
1286 	error = kern_sched_getscheduler(td, tdt, &policy);
1287 	PROC_UNLOCK(tdt->td_proc);
1288 
1289 	switch (policy) {
1290 	case SCHED_OTHER:
1291 		td->td_retval[0] = LINUX_SCHED_OTHER;
1292 		break;
1293 	case SCHED_FIFO:
1294 		td->td_retval[0] = LINUX_SCHED_FIFO;
1295 		break;
1296 	case SCHED_RR:
1297 		td->td_retval[0] = LINUX_SCHED_RR;
1298 		break;
1299 	}
1300 	return (error);
1301 }
1302 
1303 int
1304 linux_sched_get_priority_max(struct thread *td,
1305     struct linux_sched_get_priority_max_args *args)
1306 {
1307 	struct sched_get_priority_max_args bsd;
1308 
1309 	if (linux_map_sched_prio) {
1310 		switch (args->policy) {
1311 		case LINUX_SCHED_OTHER:
1312 			td->td_retval[0] = 0;
1313 			return (0);
1314 		case LINUX_SCHED_FIFO:
1315 		case LINUX_SCHED_RR:
1316 			td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1317 			return (0);
1318 		default:
1319 			return (EINVAL);
1320 		}
1321 	}
1322 
1323 	switch (args->policy) {
1324 	case LINUX_SCHED_OTHER:
1325 		bsd.policy = SCHED_OTHER;
1326 		break;
1327 	case LINUX_SCHED_FIFO:
1328 		bsd.policy = SCHED_FIFO;
1329 		break;
1330 	case LINUX_SCHED_RR:
1331 		bsd.policy = SCHED_RR;
1332 		break;
1333 	default:
1334 		return (EINVAL);
1335 	}
1336 	return (sys_sched_get_priority_max(td, &bsd));
1337 }
1338 
1339 int
1340 linux_sched_get_priority_min(struct thread *td,
1341     struct linux_sched_get_priority_min_args *args)
1342 {
1343 	struct sched_get_priority_min_args bsd;
1344 
1345 	if (linux_map_sched_prio) {
1346 		switch (args->policy) {
1347 		case LINUX_SCHED_OTHER:
1348 			td->td_retval[0] = 0;
1349 			return (0);
1350 		case LINUX_SCHED_FIFO:
1351 		case LINUX_SCHED_RR:
1352 			td->td_retval[0] = 1;
1353 			return (0);
1354 		default:
1355 			return (EINVAL);
1356 		}
1357 	}
1358 
1359 	switch (args->policy) {
1360 	case LINUX_SCHED_OTHER:
1361 		bsd.policy = SCHED_OTHER;
1362 		break;
1363 	case LINUX_SCHED_FIFO:
1364 		bsd.policy = SCHED_FIFO;
1365 		break;
1366 	case LINUX_SCHED_RR:
1367 		bsd.policy = SCHED_RR;
1368 		break;
1369 	default:
1370 		return (EINVAL);
1371 	}
1372 	return (sys_sched_get_priority_min(td, &bsd));
1373 }
1374 
1375 #define REBOOT_CAD_ON	0x89abcdef
1376 #define REBOOT_CAD_OFF	0
1377 #define REBOOT_HALT	0xcdef0123
1378 #define REBOOT_RESTART	0x01234567
1379 #define REBOOT_RESTART2	0xA1B2C3D4
1380 #define REBOOT_POWEROFF	0x4321FEDC
1381 #define REBOOT_MAGIC1	0xfee1dead
1382 #define REBOOT_MAGIC2	0x28121969
1383 #define REBOOT_MAGIC2A	0x05121996
1384 #define REBOOT_MAGIC2B	0x16041998
1385 
1386 int
1387 linux_reboot(struct thread *td, struct linux_reboot_args *args)
1388 {
1389 	struct reboot_args bsd_args;
1390 
1391 	if (args->magic1 != REBOOT_MAGIC1)
1392 		return (EINVAL);
1393 
1394 	switch (args->magic2) {
1395 	case REBOOT_MAGIC2:
1396 	case REBOOT_MAGIC2A:
1397 	case REBOOT_MAGIC2B:
1398 		break;
1399 	default:
1400 		return (EINVAL);
1401 	}
1402 
1403 	switch (args->cmd) {
1404 	case REBOOT_CAD_ON:
1405 	case REBOOT_CAD_OFF:
1406 		return (priv_check(td, PRIV_REBOOT));
1407 	case REBOOT_HALT:
1408 		bsd_args.opt = RB_HALT;
1409 		break;
1410 	case REBOOT_RESTART:
1411 	case REBOOT_RESTART2:
1412 		bsd_args.opt = 0;
1413 		break;
1414 	case REBOOT_POWEROFF:
1415 		bsd_args.opt = RB_POWEROFF;
1416 		break;
1417 	default:
1418 		return (EINVAL);
1419 	}
1420 	return (sys_reboot(td, &bsd_args));
1421 }
1422 
1423 int
1424 linux_getpid(struct thread *td, struct linux_getpid_args *args)
1425 {
1426 
1427 	td->td_retval[0] = td->td_proc->p_pid;
1428 
1429 	return (0);
1430 }
1431 
1432 int
1433 linux_gettid(struct thread *td, struct linux_gettid_args *args)
1434 {
1435 	struct linux_emuldata *em;
1436 
1437 	em = em_find(td);
1438 	KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1439 
1440 	td->td_retval[0] = em->em_tid;
1441 
1442 	return (0);
1443 }
1444 
1445 int
1446 linux_getppid(struct thread *td, struct linux_getppid_args *args)
1447 {
1448 
1449 	td->td_retval[0] = kern_getppid(td);
1450 	return (0);
1451 }
1452 
1453 int
1454 linux_getgid(struct thread *td, struct linux_getgid_args *args)
1455 {
1456 
1457 	td->td_retval[0] = td->td_ucred->cr_rgid;
1458 	return (0);
1459 }
1460 
1461 int
1462 linux_getuid(struct thread *td, struct linux_getuid_args *args)
1463 {
1464 
1465 	td->td_retval[0] = td->td_ucred->cr_ruid;
1466 	return (0);
1467 }
1468 
1469 int
1470 linux_getsid(struct thread *td, struct linux_getsid_args *args)
1471 {
1472 
1473 	return (kern_getsid(td, args->pid));
1474 }
1475 
1476 int
1477 linux_nosys(struct thread *td, struct nosys_args *ignore)
1478 {
1479 
1480 	return (ENOSYS);
1481 }
1482 
1483 int
1484 linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
1485 {
1486 	int error;
1487 
1488 	error = kern_getpriority(td, args->which, args->who);
1489 	td->td_retval[0] = 20 - td->td_retval[0];
1490 	return (error);
1491 }
1492 
1493 int
1494 linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1495 {
1496 	int name[2];
1497 
1498 	name[0] = CTL_KERN;
1499 	name[1] = KERN_HOSTNAME;
1500 	return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1501 	    args->len, 0, 0));
1502 }
1503 
1504 int
1505 linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1506 {
1507 	int name[2];
1508 
1509 	name[0] = CTL_KERN;
1510 	name[1] = KERN_NISDOMAINNAME;
1511 	return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1512 	    args->len, 0, 0));
1513 }
1514 
1515 int
1516 linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
1517 {
1518 
1519 	LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
1520 	    args->error_code);
1521 
1522 	/*
1523 	 * XXX: we should send a signal to the parent if
1524 	 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
1525 	 * as it doesnt occur often.
1526 	 */
1527 	exit1(td, args->error_code, 0);
1528 		/* NOTREACHED */
1529 }
1530 
1531 #define _LINUX_CAPABILITY_VERSION_1  0x19980330
1532 #define _LINUX_CAPABILITY_VERSION_2  0x20071026
1533 #define _LINUX_CAPABILITY_VERSION_3  0x20080522
1534 
1535 struct l_user_cap_header {
1536 	l_int	version;
1537 	l_int	pid;
1538 };
1539 
1540 struct l_user_cap_data {
1541 	l_int	effective;
1542 	l_int	permitted;
1543 	l_int	inheritable;
1544 };
1545 
1546 int
1547 linux_capget(struct thread *td, struct linux_capget_args *uap)
1548 {
1549 	struct l_user_cap_header luch;
1550 	struct l_user_cap_data lucd[2];
1551 	int error, u32s;
1552 
1553 	if (uap->hdrp == NULL)
1554 		return (EFAULT);
1555 
1556 	error = copyin(uap->hdrp, &luch, sizeof(luch));
1557 	if (error != 0)
1558 		return (error);
1559 
1560 	switch (luch.version) {
1561 	case _LINUX_CAPABILITY_VERSION_1:
1562 		u32s = 1;
1563 		break;
1564 	case _LINUX_CAPABILITY_VERSION_2:
1565 	case _LINUX_CAPABILITY_VERSION_3:
1566 		u32s = 2;
1567 		break;
1568 	default:
1569 		luch.version = _LINUX_CAPABILITY_VERSION_1;
1570 		error = copyout(&luch, uap->hdrp, sizeof(luch));
1571 		if (error)
1572 			return (error);
1573 		return (EINVAL);
1574 	}
1575 
1576 	if (luch.pid)
1577 		return (EPERM);
1578 
1579 	if (uap->datap) {
1580 		/*
1581 		 * The current implementation doesn't support setting
1582 		 * a capability (it's essentially a stub) so indicate
1583 		 * that no capabilities are currently set or available
1584 		 * to request.
1585 		 */
1586 		memset(&lucd, 0, u32s * sizeof(lucd[0]));
1587 		error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
1588 	}
1589 
1590 	return (error);
1591 }
1592 
1593 int
1594 linux_capset(struct thread *td, struct linux_capset_args *uap)
1595 {
1596 	struct l_user_cap_header luch;
1597 	struct l_user_cap_data lucd[2];
1598 	int error, i, u32s;
1599 
1600 	if (uap->hdrp == NULL || uap->datap == NULL)
1601 		return (EFAULT);
1602 
1603 	error = copyin(uap->hdrp, &luch, sizeof(luch));
1604 	if (error != 0)
1605 		return (error);
1606 
1607 	switch (luch.version) {
1608 	case _LINUX_CAPABILITY_VERSION_1:
1609 		u32s = 1;
1610 		break;
1611 	case _LINUX_CAPABILITY_VERSION_2:
1612 	case _LINUX_CAPABILITY_VERSION_3:
1613 		u32s = 2;
1614 		break;
1615 	default:
1616 		luch.version = _LINUX_CAPABILITY_VERSION_1;
1617 		error = copyout(&luch, uap->hdrp, sizeof(luch));
1618 		if (error)
1619 			return (error);
1620 		return (EINVAL);
1621 	}
1622 
1623 	if (luch.pid)
1624 		return (EPERM);
1625 
1626 	error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
1627 	if (error != 0)
1628 		return (error);
1629 
1630 	/* We currently don't support setting any capabilities. */
1631 	for (i = 0; i < u32s; i++) {
1632 		if (lucd[i].effective || lucd[i].permitted ||
1633 		    lucd[i].inheritable) {
1634 			linux_msg(td,
1635 			    "capset[%d] effective=0x%x, permitted=0x%x, "
1636 			    "inheritable=0x%x is not implemented", i,
1637 			    (int)lucd[i].effective, (int)lucd[i].permitted,
1638 			    (int)lucd[i].inheritable);
1639 			return (EPERM);
1640 		}
1641 	}
1642 
1643 	return (0);
1644 }
1645 
1646 int
1647 linux_prctl(struct thread *td, struct linux_prctl_args *args)
1648 {
1649 	int error = 0, max_size, arg;
1650 	struct proc *p = td->td_proc;
1651 	char comm[LINUX_MAX_COMM_LEN];
1652 	int pdeath_signal, trace_state;
1653 
1654 	switch (args->option) {
1655 	case LINUX_PR_SET_PDEATHSIG:
1656 		if (!LINUX_SIG_VALID(args->arg2))
1657 			return (EINVAL);
1658 		pdeath_signal = linux_to_bsd_signal(args->arg2);
1659 		return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
1660 		    &pdeath_signal));
1661 	case LINUX_PR_GET_PDEATHSIG:
1662 		error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
1663 		    &pdeath_signal);
1664 		if (error != 0)
1665 			return (error);
1666 		pdeath_signal = bsd_to_linux_signal(pdeath_signal);
1667 		return (copyout(&pdeath_signal,
1668 		    (void *)(register_t)args->arg2,
1669 		    sizeof(pdeath_signal)));
1670 	/*
1671 	 * In Linux, this flag controls if set[gu]id processes can coredump.
1672 	 * There are additional semantics imposed on processes that cannot
1673 	 * coredump:
1674 	 * - Such processes can not be ptraced.
1675 	 * - There are some semantics around ownership of process-related files
1676 	 *   in the /proc namespace.
1677 	 *
1678 	 * In FreeBSD, we can (and by default, do) disable setuid coredump
1679 	 * system-wide with 'sugid_coredump.'  We control tracability on a
1680 	 * per-process basis with the procctl PROC_TRACE (=> P2_NOTRACE flag).
1681 	 * By happy coincidence, P2_NOTRACE also prevents coredumping.  So the
1682 	 * procctl is roughly analogous to Linux's DUMPABLE.
1683 	 *
1684 	 * So, proxy these knobs to the corresponding PROC_TRACE setting.
1685 	 */
1686 	case LINUX_PR_GET_DUMPABLE:
1687 		error = kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_STATUS,
1688 		    &trace_state);
1689 		if (error != 0)
1690 			return (error);
1691 		td->td_retval[0] = (trace_state != -1);
1692 		return (0);
1693 	case LINUX_PR_SET_DUMPABLE:
1694 		/*
1695 		 * It is only valid for userspace to set one of these two
1696 		 * flags, and only one at a time.
1697 		 */
1698 		switch (args->arg2) {
1699 		case LINUX_SUID_DUMP_DISABLE:
1700 			trace_state = PROC_TRACE_CTL_DISABLE_EXEC;
1701 			break;
1702 		case LINUX_SUID_DUMP_USER:
1703 			trace_state = PROC_TRACE_CTL_ENABLE;
1704 			break;
1705 		default:
1706 			return (EINVAL);
1707 		}
1708 		return (kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_CTL,
1709 		    &trace_state));
1710 	case LINUX_PR_GET_KEEPCAPS:
1711 		/*
1712 		 * Indicate that we always clear the effective and
1713 		 * permitted capability sets when the user id becomes
1714 		 * non-zero (actually the capability sets are simply
1715 		 * always zero in the current implementation).
1716 		 */
1717 		td->td_retval[0] = 0;
1718 		break;
1719 	case LINUX_PR_SET_KEEPCAPS:
1720 		/*
1721 		 * Ignore requests to keep the effective and permitted
1722 		 * capability sets when the user id becomes non-zero.
1723 		 */
1724 		break;
1725 	case LINUX_PR_SET_NAME:
1726 		/*
1727 		 * To be on the safe side we need to make sure to not
1728 		 * overflow the size a Linux program expects. We already
1729 		 * do this here in the copyin, so that we don't need to
1730 		 * check on copyout.
1731 		 */
1732 		max_size = MIN(sizeof(comm), sizeof(p->p_comm));
1733 		error = copyinstr((void *)(register_t)args->arg2, comm,
1734 		    max_size, NULL);
1735 
1736 		/* Linux silently truncates the name if it is too long. */
1737 		if (error == ENAMETOOLONG) {
1738 			/*
1739 			 * XXX: copyinstr() isn't documented to populate the
1740 			 * array completely, so do a copyin() to be on the
1741 			 * safe side. This should be changed in case
1742 			 * copyinstr() is changed to guarantee this.
1743 			 */
1744 			error = copyin((void *)(register_t)args->arg2, comm,
1745 			    max_size - 1);
1746 			comm[max_size - 1] = '\0';
1747 		}
1748 		if (error)
1749 			return (error);
1750 
1751 		PROC_LOCK(p);
1752 		strlcpy(p->p_comm, comm, sizeof(p->p_comm));
1753 		PROC_UNLOCK(p);
1754 		break;
1755 	case LINUX_PR_GET_NAME:
1756 		PROC_LOCK(p);
1757 		strlcpy(comm, p->p_comm, sizeof(comm));
1758 		PROC_UNLOCK(p);
1759 		error = copyout(comm, (void *)(register_t)args->arg2,
1760 		    strlen(comm) + 1);
1761 		break;
1762 	case LINUX_PR_GET_SECCOMP:
1763 	case LINUX_PR_SET_SECCOMP:
1764 		/*
1765 		 * Same as returned by Linux without CONFIG_SECCOMP enabled.
1766 		 */
1767 		error = EINVAL;
1768 		break;
1769 	case LINUX_PR_CAPBSET_READ:
1770 #if 0
1771 		/*
1772 		 * This makes too much noise with Ubuntu Focal.
1773 		 */
1774 		linux_msg(td, "unsupported prctl PR_CAPBSET_READ %d",
1775 		    (int)args->arg2);
1776 #endif
1777 		error = EINVAL;
1778 		break;
1779 	case LINUX_PR_SET_NO_NEW_PRIVS:
1780 		arg = args->arg2 == 1 ?
1781 		    PROC_NO_NEW_PRIVS_ENABLE : PROC_NO_NEW_PRIVS_DISABLE;
1782 		error = kern_procctl(td, P_PID, p->p_pid,
1783 		    PROC_NO_NEW_PRIVS_CTL, &arg);
1784 		break;
1785 	case LINUX_PR_SET_PTRACER:
1786 		linux_msg(td, "unsupported prctl PR_SET_PTRACER");
1787 		error = EINVAL;
1788 		break;
1789 	default:
1790 		linux_msg(td, "unsupported prctl option %d", args->option);
1791 		error = EINVAL;
1792 		break;
1793 	}
1794 
1795 	return (error);
1796 }
1797 
1798 int
1799 linux_sched_setparam(struct thread *td,
1800     struct linux_sched_setparam_args *uap)
1801 {
1802 	struct sched_param sched_param;
1803 	struct thread *tdt;
1804 	int error, policy;
1805 
1806 	error = copyin(uap->param, &sched_param, sizeof(sched_param));
1807 	if (error)
1808 		return (error);
1809 
1810 	tdt = linux_tdfind(td, uap->pid, -1);
1811 	if (tdt == NULL)
1812 		return (ESRCH);
1813 
1814 	if (linux_map_sched_prio) {
1815 		error = kern_sched_getscheduler(td, tdt, &policy);
1816 		if (error)
1817 			goto out;
1818 
1819 		switch (policy) {
1820 		case SCHED_OTHER:
1821 			if (sched_param.sched_priority != 0) {
1822 				error = EINVAL;
1823 				goto out;
1824 			}
1825 			sched_param.sched_priority =
1826 			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1827 			break;
1828 		case SCHED_FIFO:
1829 		case SCHED_RR:
1830 			if (sched_param.sched_priority < 1 ||
1831 			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
1832 				error = EINVAL;
1833 				goto out;
1834 			}
1835 			/*
1836 			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
1837 			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1838 			 */
1839 			sched_param.sched_priority =
1840 			    (sched_param.sched_priority - 1) *
1841 			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1842 			    (LINUX_MAX_RT_PRIO - 1);
1843 			break;
1844 		}
1845 	}
1846 
1847 	error = kern_sched_setparam(td, tdt, &sched_param);
1848 out:	PROC_UNLOCK(tdt->td_proc);
1849 	return (error);
1850 }
1851 
1852 int
1853 linux_sched_getparam(struct thread *td,
1854     struct linux_sched_getparam_args *uap)
1855 {
1856 	struct sched_param sched_param;
1857 	struct thread *tdt;
1858 	int error, policy;
1859 
1860 	tdt = linux_tdfind(td, uap->pid, -1);
1861 	if (tdt == NULL)
1862 		return (ESRCH);
1863 
1864 	error = kern_sched_getparam(td, tdt, &sched_param);
1865 	if (error) {
1866 		PROC_UNLOCK(tdt->td_proc);
1867 		return (error);
1868 	}
1869 
1870 	if (linux_map_sched_prio) {
1871 		error = kern_sched_getscheduler(td, tdt, &policy);
1872 		PROC_UNLOCK(tdt->td_proc);
1873 		if (error)
1874 			return (error);
1875 
1876 		switch (policy) {
1877 		case SCHED_OTHER:
1878 			sched_param.sched_priority = 0;
1879 			break;
1880 		case SCHED_FIFO:
1881 		case SCHED_RR:
1882 			/*
1883 			 * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
1884 			 * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
1885 			 */
1886 			sched_param.sched_priority =
1887 			    (sched_param.sched_priority *
1888 			    (LINUX_MAX_RT_PRIO - 1) +
1889 			    (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
1890 			    (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
1891 			break;
1892 		}
1893 	} else
1894 		PROC_UNLOCK(tdt->td_proc);
1895 
1896 	error = copyout(&sched_param, uap->param, sizeof(sched_param));
1897 	return (error);
1898 }
1899 
1900 /*
1901  * Get affinity of a process.
1902  */
1903 int
1904 linux_sched_getaffinity(struct thread *td,
1905     struct linux_sched_getaffinity_args *args)
1906 {
1907 	struct thread *tdt;
1908 	cpuset_t *mask;
1909 	size_t size;
1910 	int error;
1911 	id_t tid;
1912 
1913 	tdt = linux_tdfind(td, args->pid, -1);
1914 	if (tdt == NULL)
1915 		return (ESRCH);
1916 	tid = tdt->td_tid;
1917 	PROC_UNLOCK(tdt->td_proc);
1918 
1919 	mask = malloc(sizeof(cpuset_t), M_LINUX, M_WAITOK | M_ZERO);
1920 	size = min(args->len, sizeof(cpuset_t));
1921 	error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
1922 	    tid, size, mask);
1923 	if (error == ERANGE)
1924 		error = EINVAL;
1925  	if (error == 0)
1926 		error = copyout(mask, args->user_mask_ptr, size);
1927 	if (error == 0)
1928 		td->td_retval[0] = size;
1929 	free(mask, M_LINUX);
1930 	return (error);
1931 }
1932 
1933 /*
1934  *  Set affinity of a process.
1935  */
1936 int
1937 linux_sched_setaffinity(struct thread *td,
1938     struct linux_sched_setaffinity_args *args)
1939 {
1940 	struct thread *tdt;
1941 	cpuset_t *mask;
1942 	int cpu, error;
1943 	size_t len;
1944 	id_t tid;
1945 
1946 	tdt = linux_tdfind(td, args->pid, -1);
1947 	if (tdt == NULL)
1948 		return (ESRCH);
1949 	tid = tdt->td_tid;
1950 	PROC_UNLOCK(tdt->td_proc);
1951 
1952 	len = min(args->len, sizeof(cpuset_t));
1953 	mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO);;
1954 	error = copyin(args->user_mask_ptr, mask, len);
1955 	if (error != 0)
1956 		goto out;
1957 	/* Linux ignore high bits */
1958 	CPU_FOREACH_ISSET(cpu, mask)
1959 		if (cpu > mp_maxid)
1960 			CPU_CLR(cpu, mask);
1961 
1962 	error = kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
1963 	    tid, mask);
1964 	if (error == EDEADLK)
1965 		error = EINVAL;
1966 out:
1967 	free(mask, M_TEMP);
1968 	return (error);
1969 }
1970 
1971 struct linux_rlimit64 {
1972 	uint64_t	rlim_cur;
1973 	uint64_t	rlim_max;
1974 };
1975 
1976 int
1977 linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
1978 {
1979 	struct rlimit rlim, nrlim;
1980 	struct linux_rlimit64 lrlim;
1981 	struct proc *p;
1982 	u_int which;
1983 	int flags;
1984 	int error;
1985 
1986 	if (args->new == NULL && args->old != NULL) {
1987 		if (linux_get_dummy_limit(args->resource, &rlim)) {
1988 			lrlim.rlim_cur = rlim.rlim_cur;
1989 			lrlim.rlim_max = rlim.rlim_max;
1990 			return (copyout(&lrlim, args->old, sizeof(lrlim)));
1991 		}
1992 	}
1993 
1994 	if (args->resource >= LINUX_RLIM_NLIMITS)
1995 		return (EINVAL);
1996 
1997 	which = linux_to_bsd_resource[args->resource];
1998 	if (which == -1)
1999 		return (EINVAL);
2000 
2001 	if (args->new != NULL) {
2002 		/*
2003 		 * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2004 		 * rlim is unsigned 64-bit. FreeBSD treats negative limits
2005 		 * as INFINITY so we do not need a conversion even.
2006 		 */
2007 		error = copyin(args->new, &nrlim, sizeof(nrlim));
2008 		if (error != 0)
2009 			return (error);
2010 	}
2011 
2012 	flags = PGET_HOLD | PGET_NOTWEXIT;
2013 	if (args->new != NULL)
2014 		flags |= PGET_CANDEBUG;
2015 	else
2016 		flags |= PGET_CANSEE;
2017 	if (args->pid == 0) {
2018 		p = td->td_proc;
2019 		PHOLD(p);
2020 	} else {
2021 		error = pget(args->pid, flags, &p);
2022 		if (error != 0)
2023 			return (error);
2024 	}
2025 	if (args->old != NULL) {
2026 		PROC_LOCK(p);
2027 		lim_rlimit_proc(p, which, &rlim);
2028 		PROC_UNLOCK(p);
2029 		if (rlim.rlim_cur == RLIM_INFINITY)
2030 			lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2031 		else
2032 			lrlim.rlim_cur = rlim.rlim_cur;
2033 		if (rlim.rlim_max == RLIM_INFINITY)
2034 			lrlim.rlim_max = LINUX_RLIM_INFINITY;
2035 		else
2036 			lrlim.rlim_max = rlim.rlim_max;
2037 		error = copyout(&lrlim, args->old, sizeof(lrlim));
2038 		if (error != 0)
2039 			goto out;
2040 	}
2041 
2042 	if (args->new != NULL)
2043 		error = kern_proc_setrlimit(td, p, which, &nrlim);
2044 
2045  out:
2046 	PRELE(p);
2047 	return (error);
2048 }
2049 
2050 int
2051 linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
2052 {
2053 	struct timespec ts, *tsp;
2054 	int error;
2055 
2056 	if (args->tsp != NULL) {
2057 		error = linux_get_timespec(&ts, args->tsp);
2058 		if (error != 0)
2059 			return (error);
2060 		tsp = &ts;
2061 	} else
2062 		tsp = NULL;
2063 
2064 	error = linux_common_pselect6(td, args->nfds, args->readfds,
2065 	    args->writefds, args->exceptfds, tsp, args->sig);
2066 
2067 	if (args->tsp != NULL)
2068 		linux_put_timespec(&ts, args->tsp);
2069 	return (error);
2070 }
2071 
2072 static int
2073 linux_common_pselect6(struct thread *td, l_int nfds, l_fd_set *readfds,
2074     l_fd_set *writefds, l_fd_set *exceptfds, struct timespec *tsp,
2075     l_uintptr_t *sig)
2076 {
2077 	struct timeval utv, tv0, tv1, *tvp;
2078 	struct l_pselect6arg lpse6;
2079 	sigset_t *ssp;
2080 	sigset_t ss;
2081 	int error;
2082 
2083 	ssp = NULL;
2084 	if (sig != NULL) {
2085 		error = copyin(sig, &lpse6, sizeof(lpse6));
2086 		if (error != 0)
2087 			return (error);
2088 		error = linux_copyin_sigset(td, PTRIN(lpse6.ss),
2089 		    lpse6.ss_len, &ss, &ssp);
2090 		if (error != 0)
2091 		    return (error);
2092 	} else
2093 		ssp = NULL;
2094 
2095 	/*
2096 	 * Currently glibc changes nanosecond number to microsecond.
2097 	 * This mean losing precision but for now it is hardly seen.
2098 	 */
2099 	if (tsp != NULL) {
2100 		TIMESPEC_TO_TIMEVAL(&utv, tsp);
2101 		if (itimerfix(&utv))
2102 			return (EINVAL);
2103 
2104 		microtime(&tv0);
2105 		tvp = &utv;
2106 	} else
2107 		tvp = NULL;
2108 
2109 	error = kern_pselect(td, nfds, readfds, writefds,
2110 	    exceptfds, tvp, ssp, LINUX_NFDBITS);
2111 
2112 	if (tsp != NULL) {
2113 		/*
2114 		 * Compute how much time was left of the timeout,
2115 		 * by subtracting the current time and the time
2116 		 * before we started the call, and subtracting
2117 		 * that result from the user-supplied value.
2118 		 */
2119 		microtime(&tv1);
2120 		timevalsub(&tv1, &tv0);
2121 		timevalsub(&utv, &tv1);
2122 		if (utv.tv_sec < 0)
2123 			timevalclear(&utv);
2124 		TIMEVAL_TO_TIMESPEC(&utv, tsp);
2125 	}
2126 	return (error);
2127 }
2128 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * Linux pselect6_time64(2): same as linux_pselect6() except that the
 * timeout is read and written with the 64-bit timespec helpers.  The
 * result of the write-back is not checked.
 */
int
linux_pselect6_time64(struct thread *td,
    struct linux_pselect6_time64_args *args)
{
	struct timespec ts, *tsp;
	int error;

	tsp = NULL;
	if (args->tsp != NULL) {
		error = linux_get_timespec64(&ts, args->tsp);
		if (error != 0)
			return (error);
		tsp = &ts;
	}

	error = linux_common_pselect6(td, args->nfds, args->readfds,
	    args->writefds, args->exceptfds, tsp, args->sig);

	if (args->tsp != NULL)
		linux_put_timespec64(&ts, args->tsp);
	return (error);
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2153 
2154 int
2155 linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
2156 {
2157 	struct timespec uts, *tsp;
2158 	int error;
2159 
2160 	if (args->tsp != NULL) {
2161 		error = linux_get_timespec(&uts, args->tsp);
2162 		if (error != 0)
2163 			return (error);
2164 		tsp = &uts;
2165 	} else
2166 		tsp = NULL;
2167 
2168 	error = linux_common_ppoll(td, args->fds, args->nfds, tsp,
2169 	    args->sset, args->ssize);
2170 	if (error == 0 && args->tsp != NULL)
2171 		error = linux_put_timespec(&uts, args->tsp);
2172 	return (error);
2173 }
2174 
2175 static int
2176 linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds,
2177     struct timespec *tsp, l_sigset_t *sset, l_size_t ssize)
2178 {
2179 	struct timespec ts0, ts1;
2180 	struct pollfd stackfds[32];
2181 	struct pollfd *kfds;
2182  	sigset_t *ssp;
2183  	sigset_t ss;
2184  	int error;
2185 
2186 	if (kern_poll_maxfds(nfds))
2187 		return (EINVAL);
2188 	if (sset != NULL) {
2189 		error = linux_copyin_sigset(td, sset, ssize, &ss, &ssp);
2190 		if (error != 0)
2191 		    return (error);
2192 	} else
2193 		ssp = NULL;
2194 	if (tsp != NULL)
2195 		nanotime(&ts0);
2196 
2197 	if (nfds > nitems(stackfds))
2198 		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
2199 	else
2200 		kfds = stackfds;
2201 	error = linux_pollin(td, kfds, fds, nfds);
2202 	if (error != 0)
2203 		goto out;
2204 
2205 	error = kern_poll_kfds(td, kfds, nfds, tsp, ssp);
2206 	if (error == 0)
2207 		error = linux_pollout(td, kfds, fds, nfds);
2208 
2209 	if (error == 0 && tsp != NULL) {
2210 		if (td->td_retval[0]) {
2211 			nanotime(&ts1);
2212 			timespecsub(&ts1, &ts0, &ts1);
2213 			timespecsub(tsp, &ts1, tsp);
2214 			if (tsp->tv_sec < 0)
2215 				timespecclear(tsp);
2216 		} else
2217 			timespecclear(tsp);
2218 	}
2219 
2220 out:
2221 	if (nfds > nitems(stackfds))
2222 		free(kfds, M_TEMP);
2223 	return (error);
2224 }
2225 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * Linux ppoll_time64(2): same as linux_ppoll() except that the timeout is
 * read and written with the 64-bit timespec helpers.
 */
int
linux_ppoll_time64(struct thread *td, struct linux_ppoll_time64_args *args)
{
	struct timespec timeout, *tsp;
	int error;

	tsp = NULL;
	if (args->tsp != NULL) {
		error = linux_get_timespec64(&timeout, args->tsp);
		if (error != 0)
			return (error);
		tsp = &timeout;
	}

	error = linux_common_ppoll(td, args->fds, args->nfds, tsp,
	    args->sset, args->ssize);
	if (error != 0 || args->tsp == NULL)
		return (error);

	return (linux_put_timespec64(&timeout, args->tsp));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2247 
2248 static int
2249 linux_pollin(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2250 {
2251 	int error;
2252 	u_int i;
2253 
2254 	error = copyin(ufds, fds, nfd * sizeof(*fds));
2255 	if (error != 0)
2256 		return (error);
2257 
2258 	for (i = 0; i < nfd; i++) {
2259 		if (fds->events != 0)
2260 			linux_to_bsd_poll_events(td, fds->fd,
2261 			    fds->events, &fds->events);
2262 		fds++;
2263 	}
2264 	return (0);
2265 }
2266 
2267 static int
2268 linux_pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2269 {
2270 	int error = 0;
2271 	u_int i, n = 0;
2272 
2273 	for (i = 0; i < nfd; i++) {
2274 		if (fds->revents != 0) {
2275 			bsd_to_linux_poll_events(fds->revents,
2276 			    &fds->revents);
2277 			n++;
2278 		}
2279 		error = copyout(&fds->revents, &ufds->revents,
2280 		    sizeof(ufds->revents));
2281 		if (error)
2282 			return (error);
2283 		fds++;
2284 		ufds++;
2285 	}
2286 	td->td_retval[0] = n;
2287 	return (0);
2288 }
2289 
2290 static int
2291 linux_sched_rr_get_interval_common(struct thread *td, pid_t pid,
2292     struct timespec *ts)
2293 {
2294 	struct thread *tdt;
2295 	int error;
2296 
2297 	/*
2298 	 * According to man in case the invalid pid specified
2299 	 * EINVAL should be returned.
2300 	 */
2301 	if (pid < 0)
2302 		return (EINVAL);
2303 
2304 	tdt = linux_tdfind(td, pid, -1);
2305 	if (tdt == NULL)
2306 		return (ESRCH);
2307 
2308 	error = kern_sched_rr_get_interval_td(td, tdt, ts);
2309 	PROC_UNLOCK(tdt->td_proc);
2310 	return (error);
2311 }
2312 
2313 int
2314 linux_sched_rr_get_interval(struct thread *td,
2315     struct linux_sched_rr_get_interval_args *uap)
2316 {
2317 	struct timespec ts;
2318 	int error;
2319 
2320 	error = linux_sched_rr_get_interval_common(td, uap->pid, &ts);
2321 	if (error != 0)
2322 		return (error);
2323 	return (linux_put_timespec(&ts, uap->interval));
2324 }
2325 
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * sched_rr_get_interval_time64(2): same as linux_sched_rr_get_interval()
 * but the result is written out as a 64-bit Linux timespec.
 */
int
linux_sched_rr_get_interval_time64(struct thread *td,
    struct linux_sched_rr_get_interval_time64_args *uap)
{
	struct timespec quantum;
	int rv;

	rv = linux_sched_rr_get_interval_common(td, uap->pid, &quantum);
	if (rv == 0)
		rv = linux_put_timespec64(&quantum, uap->interval);
	return (rv);
}
#endif
2340 
2341 /*
2342  * In case when the Linux thread is the initial thread in
2343  * the thread group thread id is equal to the process id.
2344  * Glibc depends on this magic (assert in pthread_getattr_np.c).
2345  */
2346 struct thread *
2347 linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
2348 {
2349 	struct linux_emuldata *em;
2350 	struct thread *tdt;
2351 	struct proc *p;
2352 
2353 	tdt = NULL;
2354 	if (tid == 0 || tid == td->td_tid) {
2355 		if (pid != -1 && td->td_proc->p_pid != pid)
2356 			return (NULL);
2357 		PROC_LOCK(td->td_proc);
2358 		return (td);
2359 	} else if (tid > PID_MAX)
2360 		return (tdfind(tid, pid));
2361 
2362 	/*
2363 	 * Initial thread where the tid equal to the pid.
2364 	 */
2365 	p = pfind(tid);
2366 	if (p != NULL) {
2367 		if (SV_PROC_ABI(p) != SV_ABI_LINUX ||
2368 		    (pid != -1 && tid != pid)) {
2369 			/*
2370 			 * p is not a Linuxulator process.
2371 			 */
2372 			PROC_UNLOCK(p);
2373 			return (NULL);
2374 		}
2375 		FOREACH_THREAD_IN_PROC(p, tdt) {
2376 			em = em_find(tdt);
2377 			if (tid == em->em_tid)
2378 				return (tdt);
2379 		}
2380 		PROC_UNLOCK(p);
2381 	}
2382 	return (NULL);
2383 }
2384 
2385 void
2386 linux_to_bsd_waitopts(int options, int *bsdopts)
2387 {
2388 
2389 	if (options & LINUX_WNOHANG)
2390 		*bsdopts |= WNOHANG;
2391 	if (options & LINUX_WUNTRACED)
2392 		*bsdopts |= WUNTRACED;
2393 	if (options & LINUX_WEXITED)
2394 		*bsdopts |= WEXITED;
2395 	if (options & LINUX_WCONTINUED)
2396 		*bsdopts |= WCONTINUED;
2397 	if (options & LINUX_WNOWAIT)
2398 		*bsdopts |= WNOWAIT;
2399 
2400 	if (options & __WCLONE)
2401 		*bsdopts |= WLINUXCLONE;
2402 }
2403 
2404 int
2405 linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2406 {
2407 	struct uio uio;
2408 	struct iovec iov;
2409 	int error;
2410 
2411 	if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2412 		return (EINVAL);
2413 	if (args->count > INT_MAX)
2414 		args->count = INT_MAX;
2415 
2416 	iov.iov_base = args->buf;
2417 	iov.iov_len = args->count;
2418 
2419 	uio.uio_iov = &iov;
2420 	uio.uio_iovcnt = 1;
2421 	uio.uio_resid = iov.iov_len;
2422 	uio.uio_segflg = UIO_USERSPACE;
2423 	uio.uio_rw = UIO_READ;
2424 	uio.uio_td = td;
2425 
2426 	error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2427 	if (error == 0)
2428 		td->td_retval[0] = args->count - uio.uio_resid;
2429 	return (error);
2430 }
2431 
2432 int
2433 linux_mincore(struct thread *td, struct linux_mincore_args *args)
2434 {
2435 
2436 	/* Needs to be page-aligned */
2437 	if (args->start & PAGE_MASK)
2438 		return (EINVAL);
2439 	return (kern_mincore(td, args->start, args->len, args->vec));
2440 }
2441 
2442 #define	SYSLOG_TAG	"<6>"
2443 
2444 int
2445 linux_syslog(struct thread *td, struct linux_syslog_args *args)
2446 {
2447 	char buf[128], *src, *dst;
2448 	u_int seq;
2449 	int buflen, error;
2450 
2451 	if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
2452 		linux_msg(td, "syslog unsupported type 0x%x", args->type);
2453 		return (EINVAL);
2454 	}
2455 
2456 	if (args->len < 6) {
2457 		td->td_retval[0] = 0;
2458 		return (0);
2459 	}
2460 
2461 	error = priv_check(td, PRIV_MSGBUF);
2462 	if (error)
2463 		return (error);
2464 
2465 	mtx_lock(&msgbuf_lock);
2466 	msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
2467 	mtx_unlock(&msgbuf_lock);
2468 
2469 	dst = args->buf;
2470 	error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
2471 	/* The -1 is to skip the trailing '\0'. */
2472 	dst += sizeof(SYSLOG_TAG) - 1;
2473 
2474 	while (error == 0) {
2475 		mtx_lock(&msgbuf_lock);
2476 		buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
2477 		mtx_unlock(&msgbuf_lock);
2478 
2479 		if (buflen == 0)
2480 			break;
2481 
2482 		for (src = buf; src < buf + buflen && error == 0; src++) {
2483 			if (*src == '\0')
2484 				continue;
2485 
2486 			if (dst >= args->buf + args->len)
2487 				goto out;
2488 
2489 			error = copyout(src, dst, 1);
2490 			dst++;
2491 
2492 			if (*src == '\n' && *(src + 1) != '<' &&
2493 			    dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
2494 				error = copyout(&SYSLOG_TAG,
2495 				    dst, sizeof(SYSLOG_TAG));
2496 				dst += sizeof(SYSLOG_TAG) - 1;
2497 			}
2498 		}
2499 	}
2500 out:
2501 	td->td_retval[0] = dst - args->buf;
2502 	return (error);
2503 }
2504 
2505 int
2506 linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2507 {
2508 	int cpu, error, node;
2509 
2510 	cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2511 	error = 0;
2512 	node = cpuid_to_pcpu[cpu]->pc_domain;
2513 
2514 	if (args->cpu != NULL)
2515 		error = copyout(&cpu, args->cpu, sizeof(l_int));
2516 	if (args->node != NULL)
2517 		error = copyout(&node, args->node, sizeof(l_int));
2518 	return (error);
2519 }
2520 
2521 #if defined(__i386__) || defined(__amd64__)
2522 int
2523 linux_poll(struct thread *td, struct linux_poll_args *args)
2524 {
2525 	struct timespec ts, *tsp;
2526 
2527 	if (args->timeout != INFTIM) {
2528 		if (args->timeout < 0)
2529 			return (EINVAL);
2530 		ts.tv_sec = args->timeout / 1000;
2531 		ts.tv_nsec = (args->timeout % 1000) * 1000000;
2532 		tsp = &ts;
2533 	} else
2534 		tsp = NULL;
2535 
2536 	return (linux_common_ppoll(td, args->fds, args->nfds,
2537 	    tsp, NULL, 0));
2538 }
2539 #endif /* __i386__ || __amd64__ */
2540 
2541 int
2542 linux_seccomp(struct thread *td, struct linux_seccomp_args *args)
2543 {
2544 
2545 	switch (args->op) {
2546 	case LINUX_SECCOMP_GET_ACTION_AVAIL:
2547 		return (EOPNOTSUPP);
2548 	default:
2549 		/*
2550 		 * Ignore unknown operations, just like Linux kernel built
2551 		 * without CONFIG_SECCOMP.
2552 		 */
2553 		return (EINVAL);
2554 	}
2555 }
2556 
2557 /*
2558  * Custom version of exec_copyin_args(), to copy out argument and environment
2559  * strings from the old process address space into the temporary string buffer.
2560  * Based on freebsd32_exec_copyin_args.
2561  */
2562 static int
2563 linux_exec_copyin_args(struct image_args *args, const char *fname,
2564     enum uio_seg segflg, l_uintptr_t *argv, l_uintptr_t *envv)
2565 {
2566 	char *argp, *envp;
2567 	l_uintptr_t *ptr, arg;
2568 	int error;
2569 
2570 	bzero(args, sizeof(*args));
2571 	if (argv == NULL)
2572 		return (EFAULT);
2573 
2574 	/*
2575 	 * Allocate demand-paged memory for the file name, argument, and
2576 	 * environment strings.
2577 	 */
2578 	error = exec_alloc_args(args);
2579 	if (error != 0)
2580 		return (error);
2581 
2582 	/*
2583 	 * Copy the file name.
2584 	 */
2585 	error = exec_args_add_fname(args, fname, segflg);
2586 	if (error != 0)
2587 		goto err_exit;
2588 
2589 	/*
2590 	 * extract arguments first
2591 	 */
2592 	ptr = argv;
2593 	for (;;) {
2594 		error = copyin(ptr++, &arg, sizeof(arg));
2595 		if (error)
2596 			goto err_exit;
2597 		if (arg == 0)
2598 			break;
2599 		argp = PTRIN(arg);
2600 		error = exec_args_add_arg(args, argp, UIO_USERSPACE);
2601 		if (error != 0)
2602 			goto err_exit;
2603 	}
2604 
2605 	/*
2606 	 * This comment is from Linux do_execveat_common:
2607 	 * When argv is empty, add an empty string ("") as argv[0] to
2608 	 * ensure confused userspace programs that start processing
2609 	 * from argv[1] won't end up walking envp.
2610 	 */
2611 	if (args->argc == 0 &&
2612 	    (error = exec_args_add_arg(args, "", UIO_SYSSPACE) != 0))
2613 		goto err_exit;
2614 
2615 	/*
2616 	 * extract environment strings
2617 	 */
2618 	if (envv) {
2619 		ptr = envv;
2620 		for (;;) {
2621 			error = copyin(ptr++, &arg, sizeof(arg));
2622 			if (error)
2623 				goto err_exit;
2624 			if (arg == 0)
2625 				break;
2626 			envp = PTRIN(arg);
2627 			error = exec_args_add_env(args, envp, UIO_USERSPACE);
2628 			if (error != 0)
2629 				goto err_exit;
2630 		}
2631 	}
2632 
2633 	return (0);
2634 
2635 err_exit:
2636 	exec_free_args(args);
2637 	return (error);
2638 }
2639 
2640 int
2641 linux_execve(struct thread *td, struct linux_execve_args *args)
2642 {
2643 	struct image_args eargs;
2644 	int error;
2645 
2646 	LINUX_CTR(execve);
2647 
2648 	error = linux_exec_copyin_args(&eargs, args->path, UIO_USERSPACE,
2649 	    args->argp, args->envp);
2650 	if (error == 0)
2651 		error = linux_common_execve(td, &eargs);
2652 	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
2653 	return (error);
2654 }
2655 
2656 static void
2657 linux_up_rtprio_if(struct thread *td1, struct rtprio *rtp)
2658 {
2659 	struct rtprio rtp2;
2660 
2661 	pri_to_rtp(td1, &rtp2);
2662 	if (rtp2.type <  rtp->type ||
2663 	    (rtp2.type == rtp->type &&
2664 	    rtp2.prio < rtp->prio)) {
2665 		rtp->type = rtp2.type;
2666 		rtp->prio = rtp2.prio;
2667 	}
2668 }
2669 
2670 #define	LINUX_PRIO_DIVIDER	RTP_PRIO_MAX / LINUX_IOPRIO_MAX
2671 
2672 static int
2673 linux_rtprio2ioprio(struct rtprio *rtp)
2674 {
2675 	int ioprio, prio;
2676 
2677 	switch (rtp->type) {
2678 	case RTP_PRIO_IDLE:
2679 		prio = RTP_PRIO_MIN;
2680 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_IDLE, prio);
2681 		break;
2682 	case RTP_PRIO_NORMAL:
2683 		prio = rtp->prio / LINUX_PRIO_DIVIDER;
2684 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_BE, prio);
2685 		break;
2686 	case RTP_PRIO_REALTIME:
2687 		prio = rtp->prio / LINUX_PRIO_DIVIDER;
2688 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_RT, prio);
2689 		break;
2690 	default:
2691 		prio = RTP_PRIO_MIN;
2692 		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_NONE, prio);
2693 		break;
2694 	}
2695 	return (ioprio);
2696 }
2697 
2698 static int
2699 linux_ioprio2rtprio(int ioprio, struct rtprio *rtp)
2700 {
2701 
2702 	switch (LINUX_IOPRIO_PRIO_CLASS(ioprio)) {
2703 	case LINUX_IOPRIO_CLASS_IDLE:
2704 		rtp->prio = RTP_PRIO_MIN;
2705 		rtp->type = RTP_PRIO_IDLE;
2706 		break;
2707 	case LINUX_IOPRIO_CLASS_BE:
2708 		rtp->prio = LINUX_IOPRIO_PRIO_DATA(ioprio) * LINUX_PRIO_DIVIDER;
2709 		rtp->type = RTP_PRIO_NORMAL;
2710 		break;
2711 	case LINUX_IOPRIO_CLASS_RT:
2712 		rtp->prio = LINUX_IOPRIO_PRIO_DATA(ioprio) * LINUX_PRIO_DIVIDER;
2713 		rtp->type = RTP_PRIO_REALTIME;
2714 		break;
2715 	default:
2716 		return (EINVAL);
2717 	}
2718 	return (0);
2719 }
2720 #undef LINUX_PRIO_DIVIDER
2721 
2722 int
2723 linux_ioprio_get(struct thread *td, struct linux_ioprio_get_args *args)
2724 {
2725 	struct thread *td1;
2726 	struct rtprio rtp;
2727 	struct pgrp *pg;
2728 	struct proc *p;
2729 	int error, found;
2730 
2731 	p = NULL;
2732 	td1 = NULL;
2733 	error = 0;
2734 	found = 0;
2735 	rtp.type = RTP_PRIO_IDLE;
2736 	rtp.prio = RTP_PRIO_MAX;
2737 	switch (args->which) {
2738 	case LINUX_IOPRIO_WHO_PROCESS:
2739 		if (args->who == 0) {
2740 			td1 = td;
2741 			p = td1->td_proc;
2742 			PROC_LOCK(p);
2743 		} else if (args->who > PID_MAX) {
2744 			td1 = linux_tdfind(td, args->who, -1);
2745 			if (td1 != NULL)
2746 				p = td1->td_proc;
2747 		} else
2748 			p = pfind(args->who);
2749 		if (p == NULL)
2750 			return (ESRCH);
2751 		if ((error = p_cansee(td, p))) {
2752 			PROC_UNLOCK(p);
2753 			break;
2754 		}
2755 		if (td1 != NULL) {
2756 			pri_to_rtp(td1, &rtp);
2757 		} else {
2758 			FOREACH_THREAD_IN_PROC(p, td1) {
2759 				linux_up_rtprio_if(td1, &rtp);
2760 			}
2761 		}
2762 		found++;
2763 		PROC_UNLOCK(p);
2764 		break;
2765 	case LINUX_IOPRIO_WHO_PGRP:
2766 		sx_slock(&proctree_lock);
2767 		if (args->who == 0) {
2768 			pg = td->td_proc->p_pgrp;
2769 			PGRP_LOCK(pg);
2770 		} else {
2771 			pg = pgfind(args->who);
2772 			if (pg == NULL) {
2773 				sx_sunlock(&proctree_lock);
2774 				error = ESRCH;
2775 				break;
2776 			}
2777 		}
2778 		sx_sunlock(&proctree_lock);
2779 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
2780 			PROC_LOCK(p);
2781 			if (p->p_state == PRS_NORMAL &&
2782 			    p_cansee(td, p) == 0) {
2783 				FOREACH_THREAD_IN_PROC(p, td1) {
2784 					linux_up_rtprio_if(td1, &rtp);
2785 					found++;
2786 				}
2787 			}
2788 			PROC_UNLOCK(p);
2789 		}
2790 		PGRP_UNLOCK(pg);
2791 		break;
2792 	case LINUX_IOPRIO_WHO_USER:
2793 		if (args->who == 0)
2794 			args->who = td->td_ucred->cr_uid;
2795 		sx_slock(&allproc_lock);
2796 		FOREACH_PROC_IN_SYSTEM(p) {
2797 			PROC_LOCK(p);
2798 			if (p->p_state == PRS_NORMAL &&
2799 			    p->p_ucred->cr_uid == args->who &&
2800 			    p_cansee(td, p) == 0) {
2801 				FOREACH_THREAD_IN_PROC(p, td1) {
2802 					linux_up_rtprio_if(td1, &rtp);
2803 					found++;
2804 				}
2805 			}
2806 			PROC_UNLOCK(p);
2807 		}
2808 		sx_sunlock(&allproc_lock);
2809 		break;
2810 	default:
2811 		error = EINVAL;
2812 		break;
2813 	}
2814 	if (error == 0) {
2815 		if (found != 0)
2816 			td->td_retval[0] = linux_rtprio2ioprio(&rtp);
2817 		else
2818 			error = ESRCH;
2819 	}
2820 	return (error);
2821 }
2822 
2823 int
2824 linux_ioprio_set(struct thread *td, struct linux_ioprio_set_args *args)
2825 {
2826 	struct thread *td1;
2827 	struct rtprio rtp;
2828 	struct pgrp *pg;
2829 	struct proc *p;
2830 	int error;
2831 
2832 	if ((error = linux_ioprio2rtprio(args->ioprio, &rtp)) != 0)
2833 		return (error);
2834 	/* Attempts to set high priorities (REALTIME) require su privileges. */
2835 	if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME &&
2836 	    (error = priv_check(td, PRIV_SCHED_RTPRIO)) != 0)
2837 		return (error);
2838 
2839 	p = NULL;
2840 	td1 = NULL;
2841 	switch (args->which) {
2842 	case LINUX_IOPRIO_WHO_PROCESS:
2843 		if (args->who == 0) {
2844 			td1 = td;
2845 			p = td1->td_proc;
2846 			PROC_LOCK(p);
2847 		} else if (args->who > PID_MAX) {
2848 			td1 = linux_tdfind(td, args->who, -1);
2849 			if (td1 != NULL)
2850 				p = td1->td_proc;
2851 		} else
2852 			p = pfind(args->who);
2853 		if (p == NULL)
2854 			return (ESRCH);
2855 		if ((error = p_cansched(td, p))) {
2856 			PROC_UNLOCK(p);
2857 			break;
2858 		}
2859 		if (td1 != NULL) {
2860 			error = rtp_to_pri(&rtp, td1);
2861 		} else {
2862 			FOREACH_THREAD_IN_PROC(p, td1) {
2863 				if ((error = rtp_to_pri(&rtp, td1)) != 0)
2864 					break;
2865 			}
2866 		}
2867 		PROC_UNLOCK(p);
2868 		break;
2869 	case LINUX_IOPRIO_WHO_PGRP:
2870 		sx_slock(&proctree_lock);
2871 		if (args->who == 0) {
2872 			pg = td->td_proc->p_pgrp;
2873 			PGRP_LOCK(pg);
2874 		} else {
2875 			pg = pgfind(args->who);
2876 			if (pg == NULL) {
2877 				sx_sunlock(&proctree_lock);
2878 				error = ESRCH;
2879 				break;
2880 			}
2881 		}
2882 		sx_sunlock(&proctree_lock);
2883 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
2884 			PROC_LOCK(p);
2885 			if (p->p_state == PRS_NORMAL &&
2886 			    p_cansched(td, p) == 0) {
2887 				FOREACH_THREAD_IN_PROC(p, td1) {
2888 					if ((error = rtp_to_pri(&rtp, td1)) != 0)
2889 						break;
2890 				}
2891 			}
2892 			PROC_UNLOCK(p);
2893 			if (error != 0)
2894 				break;
2895 		}
2896 		PGRP_UNLOCK(pg);
2897 		break;
2898 	case LINUX_IOPRIO_WHO_USER:
2899 		if (args->who == 0)
2900 			args->who = td->td_ucred->cr_uid;
2901 		sx_slock(&allproc_lock);
2902 		FOREACH_PROC_IN_SYSTEM(p) {
2903 			PROC_LOCK(p);
2904 			if (p->p_state == PRS_NORMAL &&
2905 			    p->p_ucred->cr_uid == args->who &&
2906 			    p_cansched(td, p) == 0) {
2907 				FOREACH_THREAD_IN_PROC(p, td1) {
2908 					if ((error = rtp_to_pri(&rtp, td1)) != 0)
2909 						break;
2910 				}
2911 			}
2912 			PROC_UNLOCK(p);
2913 			if (error != 0)
2914 				break;
2915 		}
2916 		sx_sunlock(&allproc_lock);
2917 		break;
2918 	default:
2919 		error = EINVAL;
2920 		break;
2921 	}
2922 	return (error);
2923 }
2924