xref: /linux/fs/pipe.c (revision 9cfc5c90ad38c8fc11bfd39de42a107da00871ba)
1 /*
2  *  linux/fs/pipe.c
3  *
4  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
5  */
6 
7 #include <linux/mm.h>
8 #include <linux/file.h>
9 #include <linux/poll.h>
10 #include <linux/slab.h>
11 #include <linux/module.h>
12 #include <linux/init.h>
13 #include <linux/fs.h>
14 #include <linux/log2.h>
15 #include <linux/mount.h>
16 #include <linux/magic.h>
17 #include <linux/pipe_fs_i.h>
18 #include <linux/uio.h>
19 #include <linux/highmem.h>
20 #include <linux/pagemap.h>
21 #include <linux/audit.h>
22 #include <linux/syscalls.h>
23 #include <linux/fcntl.h>
24 
25 #include <asm/uaccess.h>
26 #include <asm/ioctls.h>
27 
28 #include "internal.h"
29 
/*
 * Upper bound on the size a non-root user is allowed to grow a pipe to.
 * Root can change it through /proc/sys/fs/pipe-max-size.
 */
unsigned int pipe_max_size = 1024 * 1024;
35 
36 /*
37  * Minimum pipe size, as required by POSIX
38  */
39 unsigned int pipe_min_size = PAGE_SIZE;
40 
41 /*
42  * We use a start+len construction, which provides full use of the
43  * allocated memory.
44  * -- Florian Coosmann (FGC)
45  *
46  * Reads with count = 0 should always return 0.
47  * -- Julian Bradfield 1999-06-07.
48  *
49  * FIFOs and Pipes now generate SIGIO for both readers and writers.
50  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
51  *
52  * pipe_read & write cleanup
53  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
54  */
55 
56 static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
57 {
58 	if (pipe->files)
59 		mutex_lock_nested(&pipe->mutex, subclass);
60 }
61 
62 void pipe_lock(struct pipe_inode_info *pipe)
63 {
64 	/*
65 	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
66 	 */
67 	pipe_lock_nested(pipe, I_MUTEX_PARENT);
68 }
69 EXPORT_SYMBOL(pipe_lock);
70 
71 void pipe_unlock(struct pipe_inode_info *pipe)
72 {
73 	if (pipe->files)
74 		mutex_unlock(&pipe->mutex);
75 }
76 EXPORT_SYMBOL(pipe_unlock);
77 
78 static inline void __pipe_lock(struct pipe_inode_info *pipe)
79 {
80 	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
81 }
82 
83 static inline void __pipe_unlock(struct pipe_inode_info *pipe)
84 {
85 	mutex_unlock(&pipe->mutex);
86 }
87 
88 void pipe_double_lock(struct pipe_inode_info *pipe1,
89 		      struct pipe_inode_info *pipe2)
90 {
91 	BUG_ON(pipe1 == pipe2);
92 
93 	if (pipe1 < pipe2) {
94 		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
95 		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
96 	} else {
97 		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
98 		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
99 	}
100 }
101 
102 /* Drop the inode semaphore and wait for a pipe event, atomically */
103 void pipe_wait(struct pipe_inode_info *pipe)
104 {
105 	DEFINE_WAIT(wait);
106 
107 	/*
108 	 * Pipes are system-local resources, so sleeping on them
109 	 * is considered a noninteractive wait:
110 	 */
111 	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
112 	pipe_unlock(pipe);
113 	schedule();
114 	finish_wait(&pipe->wait, &wait);
115 	pipe_lock(pipe);
116 }
117 
118 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
119 				  struct pipe_buffer *buf)
120 {
121 	struct page *page = buf->page;
122 
123 	/*
124 	 * If nobody else uses this page, and we don't already have a
125 	 * temporary page, let's keep track of it as a one-deep
126 	 * allocation cache. (Otherwise just release our reference to it)
127 	 */
128 	if (page_count(page) == 1 && !pipe->tmp_page)
129 		pipe->tmp_page = page;
130 	else
131 		page_cache_release(page);
132 }
133 
134 /**
135  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
136  * @pipe:	the pipe that the buffer belongs to
137  * @buf:	the buffer to attempt to steal
138  *
139  * Description:
140  *	This function attempts to steal the &struct page attached to
141  *	@buf. If successful, this function returns 0 and returns with
142  *	the page locked. The caller may then reuse the page for whatever
143  *	he wishes; the typical use is insertion into a different file
144  *	page cache.
145  */
146 int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
147 			   struct pipe_buffer *buf)
148 {
149 	struct page *page = buf->page;
150 
151 	/*
152 	 * A reference of one is golden, that means that the owner of this
153 	 * page is the only one holding a reference to it. lock the page
154 	 * and return OK.
155 	 */
156 	if (page_count(page) == 1) {
157 		lock_page(page);
158 		return 0;
159 	}
160 
161 	return 1;
162 }
163 EXPORT_SYMBOL(generic_pipe_buf_steal);
164 
165 /**
166  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
167  * @pipe:	the pipe that the buffer belongs to
168  * @buf:	the buffer to get a reference to
169  *
170  * Description:
171  *	This function grabs an extra reference to @buf. It's used in
172  *	in the tee() system call, when we duplicate the buffers in one
173  *	pipe into another.
174  */
175 void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
176 {
177 	page_cache_get(buf->page);
178 }
179 EXPORT_SYMBOL(generic_pipe_buf_get);
180 
181 /**
182  * generic_pipe_buf_confirm - verify contents of the pipe buffer
183  * @info:	the pipe that the buffer belongs to
184  * @buf:	the buffer to confirm
185  *
186  * Description:
187  *	This function does nothing, because the generic pipe code uses
188  *	pages that are always good when inserted into the pipe.
189  */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	/* Nothing to verify: always report the buffer as good. */
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);
196 
197 /**
198  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
199  * @pipe:	the pipe that the buffer belongs to
200  * @buf:	the buffer to put a reference to
201  *
202  * Description:
203  *	This function releases a reference to @buf.
204  */
205 void generic_pipe_buf_release(struct pipe_inode_info *pipe,
206 			      struct pipe_buffer *buf)
207 {
208 	page_cache_release(buf->page);
209 }
210 EXPORT_SYMBOL(generic_pipe_buf_release);
211 
212 static const struct pipe_buf_operations anon_pipe_buf_ops = {
213 	.can_merge = 1,
214 	.confirm = generic_pipe_buf_confirm,
215 	.release = anon_pipe_buf_release,
216 	.steal = generic_pipe_buf_steal,
217 	.get = generic_pipe_buf_get,
218 };
219 
220 static const struct pipe_buf_operations packet_pipe_buf_ops = {
221 	.can_merge = 0,
222 	.confirm = generic_pipe_buf_confirm,
223 	.release = anon_pipe_buf_release,
224 	.steal = generic_pipe_buf_steal,
225 	.get = generic_pipe_buf_get,
226 };
227 
228 static ssize_t
229 pipe_read(struct kiocb *iocb, struct iov_iter *to)
230 {
231 	size_t total_len = iov_iter_count(to);
232 	struct file *filp = iocb->ki_filp;
233 	struct pipe_inode_info *pipe = filp->private_data;
234 	int do_wakeup;
235 	ssize_t ret;
236 
237 	/* Null read succeeds. */
238 	if (unlikely(total_len == 0))
239 		return 0;
240 
241 	do_wakeup = 0;
242 	ret = 0;
243 	__pipe_lock(pipe);
244 	for (;;) {
245 		int bufs = pipe->nrbufs;
246 		if (bufs) {
247 			int curbuf = pipe->curbuf;
248 			struct pipe_buffer *buf = pipe->bufs + curbuf;
249 			const struct pipe_buf_operations *ops = buf->ops;
250 			size_t chars = buf->len;
251 			size_t written;
252 			int error;
253 
254 			if (chars > total_len)
255 				chars = total_len;
256 
257 			error = ops->confirm(pipe, buf);
258 			if (error) {
259 				if (!ret)
260 					ret = error;
261 				break;
262 			}
263 
264 			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
265 			if (unlikely(written < chars)) {
266 				if (!ret)
267 					ret = -EFAULT;
268 				break;
269 			}
270 			ret += chars;
271 			buf->offset += chars;
272 			buf->len -= chars;
273 
274 			/* Was it a packet buffer? Clean up and exit */
275 			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
276 				total_len = chars;
277 				buf->len = 0;
278 			}
279 
280 			if (!buf->len) {
281 				buf->ops = NULL;
282 				ops->release(pipe, buf);
283 				curbuf = (curbuf + 1) & (pipe->buffers - 1);
284 				pipe->curbuf = curbuf;
285 				pipe->nrbufs = --bufs;
286 				do_wakeup = 1;
287 			}
288 			total_len -= chars;
289 			if (!total_len)
290 				break;	/* common path: read succeeded */
291 		}
292 		if (bufs)	/* More to do? */
293 			continue;
294 		if (!pipe->writers)
295 			break;
296 		if (!pipe->waiting_writers) {
297 			/* syscall merging: Usually we must not sleep
298 			 * if O_NONBLOCK is set, or if we got some data.
299 			 * But if a writer sleeps in kernel space, then
300 			 * we can wait for that data without violating POSIX.
301 			 */
302 			if (ret)
303 				break;
304 			if (filp->f_flags & O_NONBLOCK) {
305 				ret = -EAGAIN;
306 				break;
307 			}
308 		}
309 		if (signal_pending(current)) {
310 			if (!ret)
311 				ret = -ERESTARTSYS;
312 			break;
313 		}
314 		if (do_wakeup) {
315 			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
316  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
317 		}
318 		pipe_wait(pipe);
319 	}
320 	__pipe_unlock(pipe);
321 
322 	/* Signal writers asynchronously that there is more room. */
323 	if (do_wakeup) {
324 		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
325 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
326 	}
327 	if (ret > 0)
328 		file_accessed(filp);
329 	return ret;
330 }
331 
332 static inline int is_packetized(struct file *file)
333 {
334 	return (file->f_flags & O_DIRECT) != 0;
335 }
336 
337 static ssize_t
338 pipe_write(struct kiocb *iocb, struct iov_iter *from)
339 {
340 	struct file *filp = iocb->ki_filp;
341 	struct pipe_inode_info *pipe = filp->private_data;
342 	ssize_t ret = 0;
343 	int do_wakeup = 0;
344 	size_t total_len = iov_iter_count(from);
345 	ssize_t chars;
346 
347 	/* Null write succeeds. */
348 	if (unlikely(total_len == 0))
349 		return 0;
350 
351 	__pipe_lock(pipe);
352 
353 	if (!pipe->readers) {
354 		send_sig(SIGPIPE, current, 0);
355 		ret = -EPIPE;
356 		goto out;
357 	}
358 
359 	/* We try to merge small writes */
360 	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
361 	if (pipe->nrbufs && chars != 0) {
362 		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
363 							(pipe->buffers - 1);
364 		struct pipe_buffer *buf = pipe->bufs + lastbuf;
365 		const struct pipe_buf_operations *ops = buf->ops;
366 		int offset = buf->offset + buf->len;
367 
368 		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
369 			ret = ops->confirm(pipe, buf);
370 			if (ret)
371 				goto out;
372 
373 			ret = copy_page_from_iter(buf->page, offset, chars, from);
374 			if (unlikely(ret < chars)) {
375 				ret = -EFAULT;
376 				goto out;
377 			}
378 			do_wakeup = 1;
379 			buf->len += ret;
380 			if (!iov_iter_count(from))
381 				goto out;
382 		}
383 	}
384 
385 	for (;;) {
386 		int bufs;
387 
388 		if (!pipe->readers) {
389 			send_sig(SIGPIPE, current, 0);
390 			if (!ret)
391 				ret = -EPIPE;
392 			break;
393 		}
394 		bufs = pipe->nrbufs;
395 		if (bufs < pipe->buffers) {
396 			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
397 			struct pipe_buffer *buf = pipe->bufs + newbuf;
398 			struct page *page = pipe->tmp_page;
399 			int copied;
400 
401 			if (!page) {
402 				page = alloc_page(GFP_HIGHUSER);
403 				if (unlikely(!page)) {
404 					ret = ret ? : -ENOMEM;
405 					break;
406 				}
407 				pipe->tmp_page = page;
408 			}
409 			/* Always wake up, even if the copy fails. Otherwise
410 			 * we lock up (O_NONBLOCK-)readers that sleep due to
411 			 * syscall merging.
412 			 * FIXME! Is this really true?
413 			 */
414 			do_wakeup = 1;
415 			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
416 			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
417 				if (!ret)
418 					ret = -EFAULT;
419 				break;
420 			}
421 			ret += copied;
422 
423 			/* Insert it into the buffer array */
424 			buf->page = page;
425 			buf->ops = &anon_pipe_buf_ops;
426 			buf->offset = 0;
427 			buf->len = copied;
428 			buf->flags = 0;
429 			if (is_packetized(filp)) {
430 				buf->ops = &packet_pipe_buf_ops;
431 				buf->flags = PIPE_BUF_FLAG_PACKET;
432 			}
433 			pipe->nrbufs = ++bufs;
434 			pipe->tmp_page = NULL;
435 
436 			if (!iov_iter_count(from))
437 				break;
438 		}
439 		if (bufs < pipe->buffers)
440 			continue;
441 		if (filp->f_flags & O_NONBLOCK) {
442 			if (!ret)
443 				ret = -EAGAIN;
444 			break;
445 		}
446 		if (signal_pending(current)) {
447 			if (!ret)
448 				ret = -ERESTARTSYS;
449 			break;
450 		}
451 		if (do_wakeup) {
452 			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
453 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
454 			do_wakeup = 0;
455 		}
456 		pipe->waiting_writers++;
457 		pipe_wait(pipe);
458 		pipe->waiting_writers--;
459 	}
460 out:
461 	__pipe_unlock(pipe);
462 	if (do_wakeup) {
463 		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
464 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
465 	}
466 	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
467 		int err = file_update_time(filp);
468 		if (err)
469 			ret = err;
470 		sb_end_write(file_inode(filp)->i_sb);
471 	}
472 	return ret;
473 }
474 
475 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
476 {
477 	struct pipe_inode_info *pipe = filp->private_data;
478 	int count, buf, nrbufs;
479 
480 	switch (cmd) {
481 		case FIONREAD:
482 			__pipe_lock(pipe);
483 			count = 0;
484 			buf = pipe->curbuf;
485 			nrbufs = pipe->nrbufs;
486 			while (--nrbufs >= 0) {
487 				count += pipe->bufs[buf].len;
488 				buf = (buf+1) & (pipe->buffers - 1);
489 			}
490 			__pipe_unlock(pipe);
491 
492 			return put_user(count, (int __user *)arg);
493 		default:
494 			return -ENOIOCTLCMD;
495 	}
496 }
497 
498 /* No kernel lock held - fine */
499 static unsigned int
500 pipe_poll(struct file *filp, poll_table *wait)
501 {
502 	unsigned int mask;
503 	struct pipe_inode_info *pipe = filp->private_data;
504 	int nrbufs;
505 
506 	poll_wait(filp, &pipe->wait, wait);
507 
508 	/* Reading only -- no need for acquiring the semaphore.  */
509 	nrbufs = pipe->nrbufs;
510 	mask = 0;
511 	if (filp->f_mode & FMODE_READ) {
512 		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
513 		if (!pipe->writers && filp->f_version != pipe->w_counter)
514 			mask |= POLLHUP;
515 	}
516 
517 	if (filp->f_mode & FMODE_WRITE) {
518 		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
519 		/*
520 		 * Most Unices do not set POLLERR for FIFOs but on Linux they
521 		 * behave exactly like pipes for poll().
522 		 */
523 		if (!pipe->readers)
524 			mask |= POLLERR;
525 	}
526 
527 	return mask;
528 }
529 
530 static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
531 {
532 	int kill = 0;
533 
534 	spin_lock(&inode->i_lock);
535 	if (!--pipe->files) {
536 		inode->i_pipe = NULL;
537 		kill = 1;
538 	}
539 	spin_unlock(&inode->i_lock);
540 
541 	if (kill)
542 		free_pipe_info(pipe);
543 }
544 
545 static int
546 pipe_release(struct inode *inode, struct file *file)
547 {
548 	struct pipe_inode_info *pipe = file->private_data;
549 
550 	__pipe_lock(pipe);
551 	if (file->f_mode & FMODE_READ)
552 		pipe->readers--;
553 	if (file->f_mode & FMODE_WRITE)
554 		pipe->writers--;
555 
556 	if (pipe->readers || pipe->writers) {
557 		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
558 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
559 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
560 	}
561 	__pipe_unlock(pipe);
562 
563 	put_pipe_info(inode, pipe);
564 	return 0;
565 }
566 
567 static int
568 pipe_fasync(int fd, struct file *filp, int on)
569 {
570 	struct pipe_inode_info *pipe = filp->private_data;
571 	int retval = 0;
572 
573 	__pipe_lock(pipe);
574 	if (filp->f_mode & FMODE_READ)
575 		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
576 	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
577 		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
578 		if (retval < 0 && (filp->f_mode & FMODE_READ))
579 			/* this can happen only if on == T */
580 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
581 	}
582 	__pipe_unlock(pipe);
583 	return retval;
584 }
585 
586 struct pipe_inode_info *alloc_pipe_info(void)
587 {
588 	struct pipe_inode_info *pipe;
589 
590 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
591 	if (pipe) {
592 		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
593 		if (pipe->bufs) {
594 			init_waitqueue_head(&pipe->wait);
595 			pipe->r_counter = pipe->w_counter = 1;
596 			pipe->buffers = PIPE_DEF_BUFFERS;
597 			mutex_init(&pipe->mutex);
598 			return pipe;
599 		}
600 		kfree(pipe);
601 	}
602 
603 	return NULL;
604 }
605 
606 void free_pipe_info(struct pipe_inode_info *pipe)
607 {
608 	int i;
609 
610 	for (i = 0; i < pipe->buffers; i++) {
611 		struct pipe_buffer *buf = pipe->bufs + i;
612 		if (buf->ops)
613 			buf->ops->release(pipe, buf);
614 	}
615 	if (pipe->tmp_page)
616 		__free_page(pipe->tmp_page);
617 	kfree(pipe->bufs);
618 	kfree(pipe);
619 }
620 
621 static struct vfsmount *pipe_mnt __read_mostly;
622 
623 /*
624  * pipefs_dname() is called from d_path().
625  */
626 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
627 {
628 	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
629 				d_inode(dentry)->i_ino);
630 }
631 
632 static const struct dentry_operations pipefs_dentry_operations = {
633 	.d_dname	= pipefs_dname,
634 };
635 
636 static struct inode * get_pipe_inode(void)
637 {
638 	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
639 	struct pipe_inode_info *pipe;
640 
641 	if (!inode)
642 		goto fail_inode;
643 
644 	inode->i_ino = get_next_ino();
645 
646 	pipe = alloc_pipe_info();
647 	if (!pipe)
648 		goto fail_iput;
649 
650 	inode->i_pipe = pipe;
651 	pipe->files = 2;
652 	pipe->readers = pipe->writers = 1;
653 	inode->i_fop = &pipefifo_fops;
654 
655 	/*
656 	 * Mark the inode dirty from the very beginning,
657 	 * that way it will never be moved to the dirty
658 	 * list because "mark_inode_dirty()" will think
659 	 * that it already _is_ on the dirty list.
660 	 */
661 	inode->i_state = I_DIRTY;
662 	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
663 	inode->i_uid = current_fsuid();
664 	inode->i_gid = current_fsgid();
665 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
666 
667 	return inode;
668 
669 fail_iput:
670 	iput(inode);
671 
672 fail_inode:
673 	return NULL;
674 }
675 
676 int create_pipe_files(struct file **res, int flags)
677 {
678 	int err;
679 	struct inode *inode = get_pipe_inode();
680 	struct file *f;
681 	struct path path;
682 	static struct qstr name = { .name = "" };
683 
684 	if (!inode)
685 		return -ENFILE;
686 
687 	err = -ENOMEM;
688 	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
689 	if (!path.dentry)
690 		goto err_inode;
691 	path.mnt = mntget(pipe_mnt);
692 
693 	d_instantiate(path.dentry, inode);
694 
695 	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
696 	if (IS_ERR(f)) {
697 		err = PTR_ERR(f);
698 		goto err_dentry;
699 	}
700 
701 	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
702 	f->private_data = inode->i_pipe;
703 
704 	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
705 	if (IS_ERR(res[0])) {
706 		err = PTR_ERR(res[0]);
707 		goto err_file;
708 	}
709 
710 	path_get(&path);
711 	res[0]->private_data = inode->i_pipe;
712 	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
713 	res[1] = f;
714 	return 0;
715 
716 err_file:
717 	put_filp(f);
718 err_dentry:
719 	free_pipe_info(inode->i_pipe);
720 	path_put(&path);
721 	return err;
722 
723 err_inode:
724 	free_pipe_info(inode->i_pipe);
725 	iput(inode);
726 	return err;
727 }
728 
729 static int __do_pipe_flags(int *fd, struct file **files, int flags)
730 {
731 	int error;
732 	int fdw, fdr;
733 
734 	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
735 		return -EINVAL;
736 
737 	error = create_pipe_files(files, flags);
738 	if (error)
739 		return error;
740 
741 	error = get_unused_fd_flags(flags);
742 	if (error < 0)
743 		goto err_read_pipe;
744 	fdr = error;
745 
746 	error = get_unused_fd_flags(flags);
747 	if (error < 0)
748 		goto err_fdr;
749 	fdw = error;
750 
751 	audit_fd_pair(fdr, fdw);
752 	fd[0] = fdr;
753 	fd[1] = fdw;
754 	return 0;
755 
756  err_fdr:
757 	put_unused_fd(fdr);
758  err_read_pipe:
759 	fput(files[0]);
760 	fput(files[1]);
761 	return error;
762 }
763 
/*
 * Create a pipe and install both ends in the descriptor table.
 * fd[0] receives the read descriptor and fd[1] the write descriptor.
 * Returns 0 or the error from __do_pipe_flags().
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);

	if (error)
		return error;

	fd_install(fd[0], files[0]);
	fd_install(fd[1], files[1]);
	return 0;
}
774 
775 /*
776  * sys_pipe() is the normal C calling standard for creating
777  * a pipe. It's not the way Unix traditionally does this, though.
778  */
779 SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
780 {
781 	struct file *files[2];
782 	int fd[2];
783 	int error;
784 
785 	error = __do_pipe_flags(fd, files, flags);
786 	if (!error) {
787 		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
788 			fput(files[0]);
789 			fput(files[1]);
790 			put_unused_fd(fd[0]);
791 			put_unused_fd(fd[1]);
792 			error = -EFAULT;
793 		} else {
794 			fd_install(fd[0], files[0]);
795 			fd_install(fd[1], files[1]);
796 		}
797 	}
798 	return error;
799 }
800 
801 SYSCALL_DEFINE1(pipe, int __user *, fildes)
802 {
803 	return sys_pipe2(fildes, 0);
804 }
805 
806 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
807 {
808 	int cur = *cnt;
809 
810 	while (cur == *cnt) {
811 		pipe_wait(pipe);
812 		if (signal_pending(current))
813 			break;
814 	}
815 	return cur == *cnt ? -ERESTARTSYS : 0;
816 }
817 
818 static void wake_up_partner(struct pipe_inode_info *pipe)
819 {
820 	wake_up_interruptible(&pipe->wait);
821 }
822 
823 static int fifo_open(struct inode *inode, struct file *filp)
824 {
825 	struct pipe_inode_info *pipe;
826 	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
827 	int ret;
828 
829 	filp->f_version = 0;
830 
831 	spin_lock(&inode->i_lock);
832 	if (inode->i_pipe) {
833 		pipe = inode->i_pipe;
834 		pipe->files++;
835 		spin_unlock(&inode->i_lock);
836 	} else {
837 		spin_unlock(&inode->i_lock);
838 		pipe = alloc_pipe_info();
839 		if (!pipe)
840 			return -ENOMEM;
841 		pipe->files = 1;
842 		spin_lock(&inode->i_lock);
843 		if (unlikely(inode->i_pipe)) {
844 			inode->i_pipe->files++;
845 			spin_unlock(&inode->i_lock);
846 			free_pipe_info(pipe);
847 			pipe = inode->i_pipe;
848 		} else {
849 			inode->i_pipe = pipe;
850 			spin_unlock(&inode->i_lock);
851 		}
852 	}
853 	filp->private_data = pipe;
854 	/* OK, we have a pipe and it's pinned down */
855 
856 	__pipe_lock(pipe);
857 
858 	/* We can only do regular read/write on fifos */
859 	filp->f_mode &= (FMODE_READ | FMODE_WRITE);
860 
861 	switch (filp->f_mode) {
862 	case FMODE_READ:
863 	/*
864 	 *  O_RDONLY
865 	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
866 	 *  opened, even when there is no process writing the FIFO.
867 	 */
868 		pipe->r_counter++;
869 		if (pipe->readers++ == 0)
870 			wake_up_partner(pipe);
871 
872 		if (!is_pipe && !pipe->writers) {
873 			if ((filp->f_flags & O_NONBLOCK)) {
874 				/* suppress POLLHUP until we have
875 				 * seen a writer */
876 				filp->f_version = pipe->w_counter;
877 			} else {
878 				if (wait_for_partner(pipe, &pipe->w_counter))
879 					goto err_rd;
880 			}
881 		}
882 		break;
883 
884 	case FMODE_WRITE:
885 	/*
886 	 *  O_WRONLY
887 	 *  POSIX.1 says that O_NONBLOCK means return -1 with
888 	 *  errno=ENXIO when there is no process reading the FIFO.
889 	 */
890 		ret = -ENXIO;
891 		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
892 			goto err;
893 
894 		pipe->w_counter++;
895 		if (!pipe->writers++)
896 			wake_up_partner(pipe);
897 
898 		if (!is_pipe && !pipe->readers) {
899 			if (wait_for_partner(pipe, &pipe->r_counter))
900 				goto err_wr;
901 		}
902 		break;
903 
904 	case FMODE_READ | FMODE_WRITE:
905 	/*
906 	 *  O_RDWR
907 	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
908 	 *  This implementation will NEVER block on a O_RDWR open, since
909 	 *  the process can at least talk to itself.
910 	 */
911 
912 		pipe->readers++;
913 		pipe->writers++;
914 		pipe->r_counter++;
915 		pipe->w_counter++;
916 		if (pipe->readers == 1 || pipe->writers == 1)
917 			wake_up_partner(pipe);
918 		break;
919 
920 	default:
921 		ret = -EINVAL;
922 		goto err;
923 	}
924 
925 	/* Ok! */
926 	__pipe_unlock(pipe);
927 	return 0;
928 
929 err_rd:
930 	if (!--pipe->readers)
931 		wake_up_interruptible(&pipe->wait);
932 	ret = -ERESTARTSYS;
933 	goto err;
934 
935 err_wr:
936 	if (!--pipe->writers)
937 		wake_up_interruptible(&pipe->wait);
938 	ret = -ERESTARTSYS;
939 	goto err;
940 
941 err:
942 	__pipe_unlock(pipe);
943 
944 	put_pipe_info(inode, pipe);
945 	return ret;
946 }
947 
948 const struct file_operations pipefifo_fops = {
949 	.open		= fifo_open,
950 	.llseek		= no_llseek,
951 	.read_iter	= pipe_read,
952 	.write_iter	= pipe_write,
953 	.poll		= pipe_poll,
954 	.unlocked_ioctl	= pipe_ioctl,
955 	.release	= pipe_release,
956 	.fasync		= pipe_fasync,
957 };
958 
959 /*
960  * Allocate a new array of pipe buffers and copy the info over. Returns the
961  * pipe size if successful, or return -ERROR on error.
962  */
963 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
964 {
965 	struct pipe_buffer *bufs;
966 
967 	/*
968 	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
969 	 * expect a lot of shrink+grow operations, just free and allocate
970 	 * again like we would do for growing. If the pipe currently
971 	 * contains more buffers than arg, then return busy.
972 	 */
973 	if (nr_pages < pipe->nrbufs)
974 		return -EBUSY;
975 
976 	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
977 	if (unlikely(!bufs))
978 		return -ENOMEM;
979 
980 	/*
981 	 * The pipe array wraps around, so just start the new one at zero
982 	 * and adjust the indexes.
983 	 */
984 	if (pipe->nrbufs) {
985 		unsigned int tail;
986 		unsigned int head;
987 
988 		tail = pipe->curbuf + pipe->nrbufs;
989 		if (tail < pipe->buffers)
990 			tail = 0;
991 		else
992 			tail &= (pipe->buffers - 1);
993 
994 		head = pipe->nrbufs - tail;
995 		if (head)
996 			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
997 		if (tail)
998 			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
999 	}
1000 
1001 	pipe->curbuf = 0;
1002 	kfree(pipe->bufs);
1003 	pipe->bufs = bufs;
1004 	pipe->buffers = nr_pages;
1005 	return nr_pages * PAGE_SIZE;
1006 }
1007 
1008 /*
1009  * Currently we rely on the pipe array holding a power-of-2 number
1010  * of pages.
1011  */
1012 static inline unsigned int round_pipe_size(unsigned int size)
1013 {
1014 	unsigned long nr_pages;
1015 
1016 	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1017 	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
1018 }
1019 
1020 /*
1021  * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
1022  * will return an error.
1023  */
1024 int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1025 		 size_t *lenp, loff_t *ppos)
1026 {
1027 	int ret;
1028 
1029 	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
1030 	if (ret < 0 || !write)
1031 		return ret;
1032 
1033 	pipe_max_size = round_pipe_size(pipe_max_size);
1034 	return ret;
1035 }
1036 
1037 /*
1038  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1039  * location, so checking ->i_pipe is not enough to verify that this is a
1040  * pipe.
1041  */
1042 struct pipe_inode_info *get_pipe_info(struct file *file)
1043 {
1044 	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
1045 }
1046 
1047 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1048 {
1049 	struct pipe_inode_info *pipe;
1050 	long ret;
1051 
1052 	pipe = get_pipe_info(file);
1053 	if (!pipe)
1054 		return -EBADF;
1055 
1056 	__pipe_lock(pipe);
1057 
1058 	switch (cmd) {
1059 	case F_SETPIPE_SZ: {
1060 		unsigned int size, nr_pages;
1061 
1062 		size = round_pipe_size(arg);
1063 		nr_pages = size >> PAGE_SHIFT;
1064 
1065 		ret = -EINVAL;
1066 		if (!nr_pages)
1067 			goto out;
1068 
1069 		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
1070 			ret = -EPERM;
1071 			goto out;
1072 		}
1073 		ret = pipe_set_size(pipe, nr_pages);
1074 		break;
1075 		}
1076 	case F_GETPIPE_SZ:
1077 		ret = pipe->buffers * PAGE_SIZE;
1078 		break;
1079 	default:
1080 		ret = -EINVAL;
1081 		break;
1082 	}
1083 
1084 out:
1085 	__pipe_unlock(pipe);
1086 	return ret;
1087 }
1088 
1089 static const struct super_operations pipefs_ops = {
1090 	.destroy_inode = free_inode_nonrcu,
1091 	.statfs = simple_statfs,
1092 };
1093 
1094 /*
1095  * pipefs should _never_ be mounted by userland - too much of security hassle,
1096  * no real gain from having the whole whorehouse mounted. So we don't need
1097  * any operations on the root directory. However, we need a non-trivial
1098  * d_name - pipe: will go nicely and kill the special-casing in procfs.
1099  */
1100 static struct dentry *pipefs_mount(struct file_system_type *fs_type,
1101 			 int flags, const char *dev_name, void *data)
1102 {
1103 	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
1104 			&pipefs_dentry_operations, PIPEFS_MAGIC);
1105 }
1106 
1107 static struct file_system_type pipe_fs_type = {
1108 	.name		= "pipefs",
1109 	.mount		= pipefs_mount,
1110 	.kill_sb	= kill_anon_super,
1111 };
1112 
1113 static int __init init_pipe_fs(void)
1114 {
1115 	int err = register_filesystem(&pipe_fs_type);
1116 
1117 	if (!err) {
1118 		pipe_mnt = kern_mount(&pipe_fs_type);
1119 		if (IS_ERR(pipe_mnt)) {
1120 			err = PTR_ERR(pipe_mnt);
1121 			unregister_filesystem(&pipe_fs_type);
1122 		}
1123 	}
1124 	return err;
1125 }
1126 
1127 fs_initcall(init_pipe_fs);
1128