xref: /linux/fs/fcntl.c (revision 69fb09f6ccdb2f070557fd1f4c56c4d646694c8e)
1 /*
2  *  linux/fs/fcntl.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/syscalls.h>
8 #include <linux/init.h>
9 #include <linux/mm.h>
10 #include <linux/sched/task.h>
11 #include <linux/fs.h>
12 #include <linux/file.h>
13 #include <linux/fdtable.h>
14 #include <linux/capability.h>
15 #include <linux/dnotify.h>
16 #include <linux/slab.h>
17 #include <linux/module.h>
18 #include <linux/pipe_fs_i.h>
19 #include <linux/security.h>
20 #include <linux/ptrace.h>
21 #include <linux/signal.h>
22 #include <linux/rcupdate.h>
23 #include <linux/pid_namespace.h>
24 #include <linux/user_namespace.h>
25 #include <linux/shmem_fs.h>
26 #include <linux/compat.h>
27 
28 #include <asm/poll.h>
29 #include <asm/siginfo.h>
30 #include <linux/uaccess.h>
31 
32 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
33 
34 static int setfl(int fd, struct file * filp, unsigned long arg)
35 {
36 	struct inode * inode = file_inode(filp);
37 	int error = 0;
38 
39 	/*
40 	 * O_APPEND cannot be cleared if the file is marked as append-only
41 	 * and the file is open for write.
42 	 */
43 	if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode))
44 		return -EPERM;
45 
46 	/* O_NOATIME can only be set by the owner or superuser */
47 	if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
48 		if (!inode_owner_or_capable(inode))
49 			return -EPERM;
50 
51 	/* required for strict SunOS emulation */
52 	if (O_NONBLOCK != O_NDELAY)
53 	       if (arg & O_NDELAY)
54 		   arg |= O_NONBLOCK;
55 
56 	/* Pipe packetized mode is controlled by O_DIRECT flag */
57 	if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
58 		if (!filp->f_mapping || !filp->f_mapping->a_ops ||
59 			!filp->f_mapping->a_ops->direct_IO)
60 				return -EINVAL;
61 	}
62 
63 	if (filp->f_op->check_flags)
64 		error = filp->f_op->check_flags(arg);
65 	if (error)
66 		return error;
67 
68 	/*
69 	 * ->fasync() is responsible for setting the FASYNC bit.
70 	 */
71 	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
72 		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
73 		if (error < 0)
74 			goto out;
75 		if (error > 0)
76 			error = 0;
77 	}
78 	spin_lock(&filp->f_lock);
79 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
80 	spin_unlock(&filp->f_lock);
81 
82  out:
83 	return error;
84 }
85 
86 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
87                      int force)
88 {
89 	write_lock_irq(&filp->f_owner.lock);
90 	if (force || !filp->f_owner.pid) {
91 		put_pid(filp->f_owner.pid);
92 		filp->f_owner.pid = get_pid(pid);
93 		filp->f_owner.pid_type = type;
94 
95 		if (pid) {
96 			const struct cred *cred = current_cred();
97 			filp->f_owner.uid = cred->uid;
98 			filp->f_owner.euid = cred->euid;
99 		}
100 	}
101 	write_unlock_irq(&filp->f_owner.lock);
102 }
103 
104 void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
105 		int force)
106 {
107 	security_file_set_fowner(filp);
108 	f_modown(filp, pid, type, force);
109 }
110 EXPORT_SYMBOL(__f_setown);
111 
112 void f_setown(struct file *filp, unsigned long arg, int force)
113 {
114 	enum pid_type type;
115 	struct pid *pid;
116 	int who = arg;
117 	type = PIDTYPE_PID;
118 	if (who < 0) {
119 		type = PIDTYPE_PGID;
120 		who = -who;
121 	}
122 	rcu_read_lock();
123 	pid = find_vpid(who);
124 	__f_setown(filp, pid, type, force);
125 	rcu_read_unlock();
126 }
127 EXPORT_SYMBOL(f_setown);
128 
129 void f_delown(struct file *filp)
130 {
131 	f_modown(filp, NULL, PIDTYPE_PID, 1);
132 }
133 
134 pid_t f_getown(struct file *filp)
135 {
136 	pid_t pid;
137 	read_lock(&filp->f_owner.lock);
138 	pid = pid_vnr(filp->f_owner.pid);
139 	if (filp->f_owner.pid_type == PIDTYPE_PGID)
140 		pid = -pid;
141 	read_unlock(&filp->f_owner.lock);
142 	return pid;
143 }
144 
145 static int f_setown_ex(struct file *filp, unsigned long arg)
146 {
147 	struct f_owner_ex __user *owner_p = (void __user *)arg;
148 	struct f_owner_ex owner;
149 	struct pid *pid;
150 	int type;
151 	int ret;
152 
153 	ret = copy_from_user(&owner, owner_p, sizeof(owner));
154 	if (ret)
155 		return -EFAULT;
156 
157 	switch (owner.type) {
158 	case F_OWNER_TID:
159 		type = PIDTYPE_MAX;
160 		break;
161 
162 	case F_OWNER_PID:
163 		type = PIDTYPE_PID;
164 		break;
165 
166 	case F_OWNER_PGRP:
167 		type = PIDTYPE_PGID;
168 		break;
169 
170 	default:
171 		return -EINVAL;
172 	}
173 
174 	rcu_read_lock();
175 	pid = find_vpid(owner.pid);
176 	if (owner.pid && !pid)
177 		ret = -ESRCH;
178 	else
179 		 __f_setown(filp, pid, type, 1);
180 	rcu_read_unlock();
181 
182 	return ret;
183 }
184 
185 static int f_getown_ex(struct file *filp, unsigned long arg)
186 {
187 	struct f_owner_ex __user *owner_p = (void __user *)arg;
188 	struct f_owner_ex owner;
189 	int ret = 0;
190 
191 	read_lock(&filp->f_owner.lock);
192 	owner.pid = pid_vnr(filp->f_owner.pid);
193 	switch (filp->f_owner.pid_type) {
194 	case PIDTYPE_MAX:
195 		owner.type = F_OWNER_TID;
196 		break;
197 
198 	case PIDTYPE_PID:
199 		owner.type = F_OWNER_PID;
200 		break;
201 
202 	case PIDTYPE_PGID:
203 		owner.type = F_OWNER_PGRP;
204 		break;
205 
206 	default:
207 		WARN_ON(1);
208 		ret = -EINVAL;
209 		break;
210 	}
211 	read_unlock(&filp->f_owner.lock);
212 
213 	if (!ret) {
214 		ret = copy_to_user(owner_p, &owner, sizeof(owner));
215 		if (ret)
216 			ret = -EFAULT;
217 	}
218 	return ret;
219 }
220 
221 #ifdef CONFIG_CHECKPOINT_RESTORE
222 static int f_getowner_uids(struct file *filp, unsigned long arg)
223 {
224 	struct user_namespace *user_ns = current_user_ns();
225 	uid_t __user *dst = (void __user *)arg;
226 	uid_t src[2];
227 	int err;
228 
229 	read_lock(&filp->f_owner.lock);
230 	src[0] = from_kuid(user_ns, filp->f_owner.uid);
231 	src[1] = from_kuid(user_ns, filp->f_owner.euid);
232 	read_unlock(&filp->f_owner.lock);
233 
234 	err  = put_user(src[0], &dst[0]);
235 	err |= put_user(src[1], &dst[1]);
236 
237 	return err;
238 }
239 #else
240 static int f_getowner_uids(struct file *filp, unsigned long arg)
241 {
242 	return -EINVAL;
243 }
244 #endif
245 
246 static bool rw_hint_valid(enum rw_hint hint)
247 {
248 	switch (hint) {
249 	case RWF_WRITE_LIFE_NOT_SET:
250 	case RWH_WRITE_LIFE_NONE:
251 	case RWH_WRITE_LIFE_SHORT:
252 	case RWH_WRITE_LIFE_MEDIUM:
253 	case RWH_WRITE_LIFE_LONG:
254 	case RWH_WRITE_LIFE_EXTREME:
255 		return true;
256 	default:
257 		return false;
258 	}
259 }
260 
261 static long fcntl_rw_hint(struct file *file, unsigned int cmd,
262 			  unsigned long arg)
263 {
264 	struct inode *inode = file_inode(file);
265 	u64 *argp = (u64 __user *)arg;
266 	enum rw_hint hint;
267 	u64 h;
268 
269 	switch (cmd) {
270 	case F_GET_FILE_RW_HINT:
271 		h = file_write_hint(file);
272 		if (copy_to_user(argp, &h, sizeof(*argp)))
273 			return -EFAULT;
274 		return 0;
275 	case F_SET_FILE_RW_HINT:
276 		if (copy_from_user(&h, argp, sizeof(h)))
277 			return -EFAULT;
278 		hint = (enum rw_hint) h;
279 		if (!rw_hint_valid(hint))
280 			return -EINVAL;
281 
282 		spin_lock(&file->f_lock);
283 		file->f_write_hint = hint;
284 		spin_unlock(&file->f_lock);
285 		return 0;
286 	case F_GET_RW_HINT:
287 		h = inode->i_write_hint;
288 		if (copy_to_user(argp, &h, sizeof(*argp)))
289 			return -EFAULT;
290 		return 0;
291 	case F_SET_RW_HINT:
292 		if (copy_from_user(&h, argp, sizeof(h)))
293 			return -EFAULT;
294 		hint = (enum rw_hint) h;
295 		if (!rw_hint_valid(hint))
296 			return -EINVAL;
297 
298 		inode_lock(inode);
299 		inode->i_write_hint = hint;
300 		inode_unlock(inode);
301 		return 0;
302 	default:
303 		return -EINVAL;
304 	}
305 }
306 
307 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
308 		struct file *filp)
309 {
310 	long err = -EINVAL;
311 
312 	switch (cmd) {
313 	case F_DUPFD:
314 		err = f_dupfd(arg, filp, 0);
315 		break;
316 	case F_DUPFD_CLOEXEC:
317 		err = f_dupfd(arg, filp, O_CLOEXEC);
318 		break;
319 	case F_GETFD:
320 		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
321 		break;
322 	case F_SETFD:
323 		err = 0;
324 		set_close_on_exec(fd, arg & FD_CLOEXEC);
325 		break;
326 	case F_GETFL:
327 		err = filp->f_flags;
328 		break;
329 	case F_SETFL:
330 		err = setfl(fd, filp, arg);
331 		break;
332 #if BITS_PER_LONG != 32
333 	/* 32-bit arches must use fcntl64() */
334 	case F_OFD_GETLK:
335 #endif
336 	case F_GETLK:
337 		err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
338 		break;
339 #if BITS_PER_LONG != 32
340 	/* 32-bit arches must use fcntl64() */
341 	case F_OFD_SETLK:
342 	case F_OFD_SETLKW:
343 #endif
344 		/* Fallthrough */
345 	case F_SETLK:
346 	case F_SETLKW:
347 		err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
348 		break;
349 	case F_GETOWN:
350 		/*
351 		 * XXX If f_owner is a process group, the
352 		 * negative return value will get converted
353 		 * into an error.  Oops.  If we keep the
354 		 * current syscall conventions, the only way
355 		 * to fix this will be in libc.
356 		 */
357 		err = f_getown(filp);
358 		force_successful_syscall_return();
359 		break;
360 	case F_SETOWN:
361 		f_setown(filp, arg, 1);
362 		err = 0;
363 		break;
364 	case F_GETOWN_EX:
365 		err = f_getown_ex(filp, arg);
366 		break;
367 	case F_SETOWN_EX:
368 		err = f_setown_ex(filp, arg);
369 		break;
370 	case F_GETOWNER_UIDS:
371 		err = f_getowner_uids(filp, arg);
372 		break;
373 	case F_GETSIG:
374 		err = filp->f_owner.signum;
375 		break;
376 	case F_SETSIG:
377 		/* arg == 0 restores default behaviour. */
378 		if (!valid_signal(arg)) {
379 			break;
380 		}
381 		err = 0;
382 		filp->f_owner.signum = arg;
383 		break;
384 	case F_GETLEASE:
385 		err = fcntl_getlease(filp);
386 		break;
387 	case F_SETLEASE:
388 		err = fcntl_setlease(fd, filp, arg);
389 		break;
390 	case F_NOTIFY:
391 		err = fcntl_dirnotify(fd, filp, arg);
392 		break;
393 	case F_SETPIPE_SZ:
394 	case F_GETPIPE_SZ:
395 		err = pipe_fcntl(filp, cmd, arg);
396 		break;
397 	case F_ADD_SEALS:
398 	case F_GET_SEALS:
399 		err = shmem_fcntl(filp, cmd, arg);
400 		break;
401 	case F_GET_RW_HINT:
402 	case F_SET_RW_HINT:
403 	case F_GET_FILE_RW_HINT:
404 	case F_SET_FILE_RW_HINT:
405 		err = fcntl_rw_hint(filp, cmd, arg);
406 		break;
407 	default:
408 		break;
409 	}
410 	return err;
411 }
412 
/*
 * Return 1 if @cmd is one of the commands the fcntl entry points still
 * allow on an FMODE_PATH file, 0 otherwise.
 */
static int check_fcntl_cmd(unsigned cmd)
{
	return cmd == F_DUPFD ||
	       cmd == F_DUPFD_CLOEXEC ||
	       cmd == F_GETFD ||
	       cmd == F_SETFD ||
	       cmd == F_GETFL;
}
425 
426 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
427 {
428 	struct fd f = fdget_raw(fd);
429 	long err = -EBADF;
430 
431 	if (!f.file)
432 		goto out;
433 
434 	if (unlikely(f.file->f_mode & FMODE_PATH)) {
435 		if (!check_fcntl_cmd(cmd))
436 			goto out1;
437 	}
438 
439 	err = security_file_fcntl(f.file, cmd, arg);
440 	if (!err)
441 		err = do_fcntl(fd, cmd, arg, f.file);
442 
443 out1:
444  	fdput(f);
445 out:
446 	return err;
447 }
448 
#if BITS_PER_LONG == 32
/*
 * fcntl64(2) entry point, built only when BITS_PER_LONG is 32.
 *
 * Performs the same descriptor, FMODE_PATH and security checks as
 * fcntl(), handles the flock64-based and OFD locking commands itself and
 * hands every other command to do_fcntl().
 */
SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
		unsigned long, arg)
{
	struct fd f = fdget_raw(fd);
	long err;

	if (!f.file)
		return -EBADF;

	err = -EBADF;
	if (unlikely(f.file->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd))
		goto put;

	err = security_file_fcntl(f.file, cmd, arg);
	if (err)
		goto put;

	switch (cmd) {
	case F_GETLK64:
	case F_OFD_GETLK:
		err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
		break;
	case F_SETLK64:
	case F_SETLKW64:
	case F_OFD_SETLK:
	case F_OFD_SETLKW:
		err = fcntl_setlk64(fd, f.file, cmd,
				(struct flock64 __user *) arg);
		break;
	default:
		err = do_fcntl(fd, cmd, arg, f.file);
		break;
	}
put:
	fdput(f);
	return err;
}
#endif
490 
491 #ifdef CONFIG_COMPAT
492 static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
493 {
494 	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
495 	    __get_user(kfl->l_type, &ufl->l_type) ||
496 	    __get_user(kfl->l_whence, &ufl->l_whence) ||
497 	    __get_user(kfl->l_start, &ufl->l_start) ||
498 	    __get_user(kfl->l_len, &ufl->l_len) ||
499 	    __get_user(kfl->l_pid, &ufl->l_pid))
500 		return -EFAULT;
501 	return 0;
502 }
503 
504 static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
505 {
506 	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
507 	    __put_user(kfl->l_type, &ufl->l_type) ||
508 	    __put_user(kfl->l_whence, &ufl->l_whence) ||
509 	    __put_user(kfl->l_start, &ufl->l_start) ||
510 	    __put_user(kfl->l_len, &ufl->l_len) ||
511 	    __put_user(kfl->l_pid, &ufl->l_pid))
512 		return -EFAULT;
513 	return 0;
514 }
515 
516 #ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
517 static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
518 {
519 	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
520 	    __get_user(kfl->l_type, &ufl->l_type) ||
521 	    __get_user(kfl->l_whence, &ufl->l_whence) ||
522 	    __get_user(kfl->l_start, &ufl->l_start) ||
523 	    __get_user(kfl->l_len, &ufl->l_len) ||
524 	    __get_user(kfl->l_pid, &ufl->l_pid))
525 		return -EFAULT;
526 	return 0;
527 }
528 #endif
529 
530 #ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
531 static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
532 {
533 	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
534 	    __put_user(kfl->l_type, &ufl->l_type) ||
535 	    __put_user(kfl->l_whence, &ufl->l_whence) ||
536 	    __put_user(kfl->l_start, &ufl->l_start) ||
537 	    __put_user(kfl->l_len, &ufl->l_len) ||
538 	    __put_user(kfl->l_pid, &ufl->l_pid))
539 		return -EFAULT;
540 	return 0;
541 }
542 #endif
543 
/*
 * Map the *64 locking commands onto their plain counterparts; every
 * other command is returned unchanged.
 */
static unsigned int
convert_fcntl_cmd(unsigned int cmd)
{
	if (cmd == F_GETLK64)
		return F_GETLK;
	if (cmd == F_SETLK64)
		return F_SETLK;
	if (cmd == F_SETLKW64)
		return F_SETLKW;

	return cmd;
}
558 
559 COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
560 		       compat_ulong_t, arg)
561 {
562 	mm_segment_t old_fs;
563 	struct flock f;
564 	long ret;
565 	unsigned int conv_cmd;
566 
567 	switch (cmd) {
568 	case F_GETLK:
569 	case F_SETLK:
570 	case F_SETLKW:
571 		ret = get_compat_flock(&f, compat_ptr(arg));
572 		if (ret != 0)
573 			break;
574 		old_fs = get_fs();
575 		set_fs(KERNEL_DS);
576 		ret = sys_fcntl(fd, cmd, (unsigned long)&f);
577 		set_fs(old_fs);
578 		if (cmd == F_GETLK && ret == 0) {
579 			/* GETLK was successful and we need to return the data...
580 			 * but it needs to fit in the compat structure.
581 			 * l_start shouldn't be too big, unless the original
582 			 * start + end is greater than COMPAT_OFF_T_MAX, in which
583 			 * case the app was asking for trouble, so we return
584 			 * -EOVERFLOW in that case.
585 			 * l_len could be too big, in which case we just truncate it,
586 			 * and only allow the app to see that part of the conflicting
587 			 * lock that might make sense to it anyway
588 			 */
589 
590 			if (f.l_start > COMPAT_OFF_T_MAX)
591 				ret = -EOVERFLOW;
592 			if (f.l_len > COMPAT_OFF_T_MAX)
593 				f.l_len = COMPAT_OFF_T_MAX;
594 			if (ret == 0)
595 				ret = put_compat_flock(&f, compat_ptr(arg));
596 		}
597 		break;
598 
599 	case F_GETLK64:
600 	case F_SETLK64:
601 	case F_SETLKW64:
602 	case F_OFD_GETLK:
603 	case F_OFD_SETLK:
604 	case F_OFD_SETLKW:
605 		ret = get_compat_flock64(&f, compat_ptr(arg));
606 		if (ret != 0)
607 			break;
608 		old_fs = get_fs();
609 		set_fs(KERNEL_DS);
610 		conv_cmd = convert_fcntl_cmd(cmd);
611 		ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
612 		set_fs(old_fs);
613 		if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
614 			/* need to return lock information - see above for commentary */
615 			if (f.l_start > COMPAT_LOFF_T_MAX)
616 				ret = -EOVERFLOW;
617 			if (f.l_len > COMPAT_LOFF_T_MAX)
618 				f.l_len = COMPAT_LOFF_T_MAX;
619 			if (ret == 0)
620 				ret = put_compat_flock64(&f, compat_ptr(arg));
621 		}
622 		break;
623 
624 	default:
625 		ret = sys_fcntl(fd, cmd, arg);
626 		break;
627 	}
628 	return ret;
629 }
630 
631 COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
632 		       compat_ulong_t, arg)
633 {
634 	switch (cmd) {
635 	case F_GETLK64:
636 	case F_SETLK64:
637 	case F_SETLKW64:
638 	case F_OFD_GETLK:
639 	case F_OFD_SETLK:
640 	case F_OFD_SETLKW:
641 		return -EINVAL;
642 	}
643 	return compat_sys_fcntl64(fd, cmd, arg);
644 }
645 #endif
646 
647 /* Table to convert sigio signal codes into poll band bitmaps */
648 
649 static const long band_table[NSIGPOLL] = {
650 	POLLIN | POLLRDNORM,			/* POLL_IN */
651 	POLLOUT | POLLWRNORM | POLLWRBAND,	/* POLL_OUT */
652 	POLLIN | POLLRDNORM | POLLMSG,		/* POLL_MSG */
653 	POLLERR,				/* POLL_ERR */
654 	POLLPRI | POLLRDBAND,			/* POLL_PRI */
655 	POLLHUP | POLLERR			/* POLL_HUP */
656 };
657 
658 static inline int sigio_perm(struct task_struct *p,
659                              struct fown_struct *fown, int sig)
660 {
661 	const struct cred *cred;
662 	int ret;
663 
664 	rcu_read_lock();
665 	cred = __task_cred(p);
666 	ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
667 		uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) ||
668 		uid_eq(fown->uid,  cred->suid) || uid_eq(fown->uid,  cred->uid)) &&
669 	       !security_file_send_sigiotask(p, fown, sig));
670 	rcu_read_unlock();
671 	return ret;
672 }
673 
674 static void send_sigio_to_task(struct task_struct *p,
675 			       struct fown_struct *fown,
676 			       int fd, int reason, int group)
677 {
678 	/*
679 	 * F_SETSIG can change ->signum lockless in parallel, make
680 	 * sure we read it once and use the same value throughout.
681 	 */
682 	int signum = ACCESS_ONCE(fown->signum);
683 
684 	if (!sigio_perm(p, fown, signum))
685 		return;
686 
687 	switch (signum) {
688 		siginfo_t si;
689 		default:
690 			/* Queue a rt signal with the appropriate fd as its
691 			   value.  We use SI_SIGIO as the source, not
692 			   SI_KERNEL, since kernel signals always get
693 			   delivered even if we can't queue.  Failure to
694 			   queue in this case _should_ be reported; we fall
695 			   back to SIGIO in that case. --sct */
696 			si.si_signo = signum;
697 			si.si_errno = 0;
698 		        si.si_code  = reason;
699 			/* Make sure we are called with one of the POLL_*
700 			   reasons, otherwise we could leak kernel stack into
701 			   userspace.  */
702 			BUG_ON((reason & __SI_MASK) != __SI_POLL);
703 			if (reason - POLL_IN >= NSIGPOLL)
704 				si.si_band  = ~0L;
705 			else
706 				si.si_band = band_table[reason - POLL_IN];
707 			si.si_fd    = fd;
708 			if (!do_send_sig_info(signum, &si, p, group))
709 				break;
710 		/* fall-through: fall back on the old plain SIGIO signal */
711 		case 0:
712 			do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, group);
713 	}
714 }
715 
716 void send_sigio(struct fown_struct *fown, int fd, int band)
717 {
718 	struct task_struct *p;
719 	enum pid_type type;
720 	struct pid *pid;
721 	int group = 1;
722 
723 	read_lock(&fown->lock);
724 
725 	type = fown->pid_type;
726 	if (type == PIDTYPE_MAX) {
727 		group = 0;
728 		type = PIDTYPE_PID;
729 	}
730 
731 	pid = fown->pid;
732 	if (!pid)
733 		goto out_unlock_fown;
734 
735 	read_lock(&tasklist_lock);
736 	do_each_pid_task(pid, type, p) {
737 		send_sigio_to_task(p, fown, fd, band, group);
738 	} while_each_pid_task(pid, type, p);
739 	read_unlock(&tasklist_lock);
740  out_unlock_fown:
741 	read_unlock(&fown->lock);
742 }
743 
744 static void send_sigurg_to_task(struct task_struct *p,
745 				struct fown_struct *fown, int group)
746 {
747 	if (sigio_perm(p, fown, SIGURG))
748 		do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, group);
749 }
750 
751 int send_sigurg(struct fown_struct *fown)
752 {
753 	struct task_struct *p;
754 	enum pid_type type;
755 	struct pid *pid;
756 	int group = 1;
757 	int ret = 0;
758 
759 	read_lock(&fown->lock);
760 
761 	type = fown->pid_type;
762 	if (type == PIDTYPE_MAX) {
763 		group = 0;
764 		type = PIDTYPE_PID;
765 	}
766 
767 	pid = fown->pid;
768 	if (!pid)
769 		goto out_unlock_fown;
770 
771 	ret = 1;
772 
773 	read_lock(&tasklist_lock);
774 	do_each_pid_task(pid, type, p) {
775 		send_sigurg_to_task(p, fown, group);
776 	} while_each_pid_task(pid, type, p);
777 	read_unlock(&tasklist_lock);
778  out_unlock_fown:
779 	read_unlock(&fown->lock);
780 	return ret;
781 }
782 
783 static DEFINE_SPINLOCK(fasync_lock);
784 static struct kmem_cache *fasync_cache __read_mostly;
785 
786 static void fasync_free_rcu(struct rcu_head *head)
787 {
788 	kmem_cache_free(fasync_cache,
789 			container_of(head, struct fasync_struct, fa_rcu));
790 }
791 
792 /*
793  * Remove a fasync entry. If successfully removed, return
794  * positive and clear the FASYNC flag. If no entry exists,
795  * do nothing and return 0.
796  *
797  * NOTE! It is very important that the FASYNC flag always
798  * match the state "is the filp on a fasync list".
799  *
800  */
801 int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
802 {
803 	struct fasync_struct *fa, **fp;
804 	int result = 0;
805 
806 	spin_lock(&filp->f_lock);
807 	spin_lock(&fasync_lock);
808 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
809 		if (fa->fa_file != filp)
810 			continue;
811 
812 		spin_lock_irq(&fa->fa_lock);
813 		fa->fa_file = NULL;
814 		spin_unlock_irq(&fa->fa_lock);
815 
816 		*fp = fa->fa_next;
817 		call_rcu(&fa->fa_rcu, fasync_free_rcu);
818 		filp->f_flags &= ~FASYNC;
819 		result = 1;
820 		break;
821 	}
822 	spin_unlock(&fasync_lock);
823 	spin_unlock(&filp->f_lock);
824 	return result;
825 }
826 
827 struct fasync_struct *fasync_alloc(void)
828 {
829 	return kmem_cache_alloc(fasync_cache, GFP_KERNEL);
830 }
831 
832 /*
833  * NOTE! This can be used only for unused fasync entries:
834  * entries that actually got inserted on the fasync list
835  * need to be released by rcu - see fasync_remove_entry.
836  */
837 void fasync_free(struct fasync_struct *new)
838 {
839 	kmem_cache_free(fasync_cache, new);
840 }
841 
842 /*
843  * Insert a new entry into the fasync list.  Return the pointer to the
844  * old one if we didn't use the new one.
845  *
846  * NOTE! It is very important that the FASYNC flag always
847  * match the state "is the filp on a fasync list".
848  */
849 struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
850 {
851         struct fasync_struct *fa, **fp;
852 
853 	spin_lock(&filp->f_lock);
854 	spin_lock(&fasync_lock);
855 	for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
856 		if (fa->fa_file != filp)
857 			continue;
858 
859 		spin_lock_irq(&fa->fa_lock);
860 		fa->fa_fd = fd;
861 		spin_unlock_irq(&fa->fa_lock);
862 		goto out;
863 	}
864 
865 	spin_lock_init(&new->fa_lock);
866 	new->magic = FASYNC_MAGIC;
867 	new->fa_file = filp;
868 	new->fa_fd = fd;
869 	new->fa_next = *fapp;
870 	rcu_assign_pointer(*fapp, new);
871 	filp->f_flags |= FASYNC;
872 
873 out:
874 	spin_unlock(&fasync_lock);
875 	spin_unlock(&filp->f_lock);
876 	return fa;
877 }
878 
879 /*
880  * Add a fasync entry. Return negative on error, positive if
881  * added, and zero if did nothing but change an existing one.
882  */
883 static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
884 {
885 	struct fasync_struct *new;
886 
887 	new = fasync_alloc();
888 	if (!new)
889 		return -ENOMEM;
890 
891 	/*
892 	 * fasync_insert_entry() returns the old (update) entry if
893 	 * it existed.
894 	 *
895 	 * So free the (unused) new entry and return 0 to let the
896 	 * caller know that we didn't add any new fasync entries.
897 	 */
898 	if (fasync_insert_entry(fd, filp, fapp, new)) {
899 		fasync_free(new);
900 		return 0;
901 	}
902 
903 	return 1;
904 }
905 
/*
 * fasync_helper() is used by almost all character device drivers
 * to set up the fasync queue, and for regular files by the file
 * lease code.  With @on non-zero the file is added to the list at
 * @fapp, with @on zero it is removed.  Returns negative on error,
 * 0 if nothing changed and positive if an entry was added/deleted.
 */
int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
{
	if (on)
		return fasync_add_entry(fd, filp, fapp);

	return fasync_remove_entry(filp, fapp);
}

EXPORT_SYMBOL(fasync_helper);
920 
921 /*
922  * rcu_read_lock() is held
923  */
924 static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
925 {
926 	while (fa) {
927 		struct fown_struct *fown;
928 		unsigned long flags;
929 
930 		if (fa->magic != FASYNC_MAGIC) {
931 			printk(KERN_ERR "kill_fasync: bad magic number in "
932 			       "fasync_struct!\n");
933 			return;
934 		}
935 		spin_lock_irqsave(&fa->fa_lock, flags);
936 		if (fa->fa_file) {
937 			fown = &fa->fa_file->f_owner;
938 			/* Don't send SIGURG to processes which have not set a
939 			   queued signum: SIGURG has its own default signalling
940 			   mechanism. */
941 			if (!(sig == SIGURG && fown->signum == 0))
942 				send_sigio(fown, fa->fa_fd, band);
943 		}
944 		spin_unlock_irqrestore(&fa->fa_lock, flags);
945 		fa = rcu_dereference(fa->fa_next);
946 	}
947 }
948 
/*
 * Notify every file on the fasync list at *@fp.  @sig and @band are
 * passed on to kill_fasync_rcu(), which walks the list under
 * rcu_read_lock().
 */
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
	/* Quick unlocked test first: usually the list is empty. */
	if (!*fp)
		return;

	rcu_read_lock();
	kill_fasync_rcu(rcu_dereference(*fp), sig, band);
	rcu_read_unlock();
}
EXPORT_SYMBOL(kill_fasync);
961 
962 static int __init fcntl_init(void)
963 {
964 	/*
965 	 * Please add new bits here to ensure allocation uniqueness.
966 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
967 	 * is defined as O_NONBLOCK on some platforms and not on others.
968 	 */
969 	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
970 		HWEIGHT32(
971 			(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
972 			__FMODE_EXEC | __FMODE_NONOTIFY));
973 
974 	fasync_cache = kmem_cache_create("fasync_cache",
975 		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
976 	return 0;
977 }
978 
979 module_init(fcntl_init)
980