xref: /linux/fs/pipe.c (revision 056a5087d87ead77dedbe9cf5bde53b7cd4b4651)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/fs/pipe.c
4  *
5  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
6  */
7 
8 #include <linux/mm.h>
9 #include <linux/file.h>
10 #include <linux/poll.h>
11 #include <linux/slab.h>
12 #include <linux/module.h>
13 #include <linux/init.h>
14 #include <linux/fs.h>
15 #include <linux/log2.h>
16 #include <linux/mount.h>
17 #include <linux/pseudo_fs.h>
18 #include <linux/magic.h>
19 #include <linux/pipe_fs_i.h>
20 #include <linux/uio.h>
21 #include <linux/highmem.h>
22 #include <linux/pagemap.h>
23 #include <linux/audit.h>
24 #include <linux/syscalls.h>
25 #include <linux/fcntl.h>
26 #include <linux/memcontrol.h>
27 #include <linux/watch_queue.h>
28 #include <linux/sysctl.h>
29 #include <linux/sort.h>
30 
31 #include <linux/uaccess.h>
32 #include <asm/ioctls.h>
33 
34 #include "internal.h"
35 
36 /*
37  * New pipe buffers will be restricted to this size while the user is exceeding
38  * their pipe buffer quota. The general pipe use case needs at least two
39  * buffers: one for data yet to be read, and one for new data. If this is less
40  * than two, then a write to a non-empty pipe may block even if the pipe is not
41  * full. This can occur with GNU make jobserver or similar uses of pipes as
42  * semaphores: multiple processes may be waiting to write tokens back to the
43  * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
44  *
45  * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
46  * own risk, namely: pipe writes to non-full pipes may block until the pipe is
47  * emptied.
48  */
49 #define PIPE_MIN_DEF_BUFFERS 2
50 
51 /*
52  * The max size that a non-root user is allowed to grow the pipe. Can
53  * be set by root in /proc/sys/fs/pipe-max-size
 *
 * alloc_pipe_info() also clamps the default size of a newly created pipe to
 * this value for tasks that lack CAP_SYS_RESOURCE.
54  */
55 static unsigned int pipe_max_size = 1048576;
56 
57 /* Maximum allocatable pages per user. Hard limit is unset by default, soft
58  * matches default values.
 *
 * A value of 0 disables the corresponding limit; see
 * too_many_pipe_buffers_soft() and too_many_pipe_buffers_hard().
59  */
60 static unsigned long pipe_user_pages_hard;
61 static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
62 
63 /*
64  * We use head and tail indices that aren't masked off, except at the point of
65  * dereference, but rather they're allowed to wrap naturally.  This means there
66  * isn't a dead spot in the buffer, but the ring has to be a power of two and
67  * <= 2^31.
68  * -- David Howells 2019-09-23.
69  *
70  * Reads with count = 0 should always return 0.
71  * -- Julian Bradfield 1999-06-07.
72  *
73  * FIFOs and Pipes now generate SIGIO for both readers and writers.
74  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
75  *
76  * pipe_read & write cleanup
77  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
78  */
79 
#ifdef CONFIG_PROVE_LOCKING
/*
 * Comparison callback handed to lock_set_cmp_fn() for every pipe mutex (see
 * alloc_pipe_info()). Orders two lockdep maps by their addresses and returns
 * the cmp_int() result of that comparison.
 */
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
			    const struct lockdep_map *b)
{
	const unsigned long addr_a = (unsigned long)a;
	const unsigned long addr_b = (unsigned long)b;

	return cmp_int(addr_a, addr_b);
}
#endif
87 
88 void pipe_lock(struct pipe_inode_info *pipe)
89 {
90 	if (pipe->files)
91 		mutex_lock(&pipe->mutex);
92 }
93 EXPORT_SYMBOL(pipe_lock);
94 
95 void pipe_unlock(struct pipe_inode_info *pipe)
96 {
97 	if (pipe->files)
98 		mutex_unlock(&pipe->mutex);
99 }
100 EXPORT_SYMBOL(pipe_unlock);
101 
/*
 * Lock two distinct pipes. The pipe at the lower address is always locked
 * first, so the acquisition order does not depend on the argument order.
 * Passing the same pipe twice is a BUG.
 */
void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	struct pipe_inode_info *first, *second;

	BUG_ON(pipe1 == pipe2);

	if (pipe1 > pipe2) {
		first = pipe2;
		second = pipe1;
	} else {
		first = pipe1;
		second = pipe2;
	}

	pipe_lock(first);
	pipe_lock(second);
}
113 
/* Upper bound on the pages one write pre-allocates before taking pipe->mutex. */
114 #define PIPE_PREALLOC_MAX 8
115 
/*
 * Per-write stash of pages allocated by anon_pipe_get_page_prealloc().
 * It is used as a stack: pages[0..count-1] are valid and consumers take
 * pages[count - 1] first.
 */
116 struct anon_pipe_prealloc {
117 	struct page *pages[PIPE_PREALLOC_MAX];
	/* Number of valid entries in pages[]. */
118 	unsigned int count;
119 };
120 
121 /*
122  * Pre-allocate pages outside pipe->mutex for multi-page writes.
123  * alloc_page() with GFP_HIGHUSER can sleep in reclaim and runs memcg
124  * charging; doing it under the mutex stalls a concurrent reader.
125  *
126  * Loop alloc_page() instead of alloc_pages_bulk_*(): the bulk path refuses
127  * __GFP_ACCOUNT under memcg (see commit 8dcb3060d81d "memcg: page_alloc:
128  * skip bulk allocator for __GFP_ACCOUNT") and silently degrades to a single
129  * page. A per-page loop keeps memcg accounting and the task NUMA mempolicy
130  * honoured for every page; the per-call overhead is small compared to the
131  * pipe->mutex hold-time being shrunk. Any shortfall is covered by the
132  * in-lock alloc_page() fallback in anon_pipe_get_page().
133  */
134 static void anon_pipe_get_page_prealloc(struct anon_pipe_prealloc *prealloc,
135 					size_t total_len)
136 {
137 	unsigned int want, i;
138 	struct page *page;
139 
140 	prealloc->count = 0;
141 	if (total_len <= PAGE_SIZE)
142 		return;
143 
144 	want = min_t(unsigned int, DIV_ROUND_UP(total_len, PAGE_SIZE),
145 		     PIPE_PREALLOC_MAX);
146 
147 	for (i = 0; i < want; i++) {
148 		page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
149 		if (!page)
150 			break;
151 		prealloc->pages[prealloc->count++] = page;
152 	}
153 }
154 
155 static struct page *anon_pipe_prealloc_pop(struct anon_pipe_prealloc *prealloc)
156 {
157 	if (!prealloc->count)
158 		return NULL;
159 
160 	prealloc->count--;
161 
162 	return prealloc->pages[prealloc->count];
163 }
164 
165 static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe,
166 				       struct anon_pipe_prealloc *prealloc)
167 {
168 	struct page *page;
169 
170 	/* Drain prealloc first to keep tmp_page[] hot for later small writes. */
171 	page = anon_pipe_prealloc_pop(prealloc);
172 	if (page)
173 		return page;
174 
175 	for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
176 		if (pipe->tmp_page[i]) {
177 			page = pipe->tmp_page[i];
178 			pipe->tmp_page[i] = NULL;
179 			return page;
180 		}
181 	}
182 
183 	/* FWIW: This is called with pipe->mutex held */
184 	return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
185 }
186 
187 static void anon_pipe_put_page(struct pipe_inode_info *pipe,
188 			       struct page *page)
189 {
190 	if (page_count(page) == 1) {
191 		for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
192 			if (!pipe->tmp_page[i]) {
193 				pipe->tmp_page[i] = page;
194 				return;
195 			}
196 		}
197 	}
198 
199 	put_page(page);
200 }
201 
202 /*
203  * Stash leftover prealloc pages in tmp_page[] so the next write to this
204  * pipe gets a hot page without entering the allocator.
205  */
206 static void anon_pipe_refill_tmp_pages(struct pipe_inode_info *pipe,
207 				       struct anon_pipe_prealloc *prealloc)
208 {
209 	int i, idx;
210 
211 	if (!prealloc->count)
212 		return;
213 
214 	for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
215 		if (pipe->tmp_page[i])
216 			continue;
217 		if (!prealloc->count)
218 			return;
219 		idx = --prealloc->count;
220 		pipe->tmp_page[i] = prealloc->pages[idx];
221 		prealloc->pages[idx] = NULL;
222 	}
223 }
224 
225 /* Runs after mutex_unlock() to keep put_page() out of the critical section. */
226 static void anon_pipe_free_pages(struct anon_pipe_prealloc *prealloc)
227 {
228 	while (prealloc->count) {
229 		prealloc->count--;
230 		put_page(prealloc->pages[prealloc->count]);
231 	}
232 }
233 
234 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
235 				  struct pipe_buffer *buf)
236 {
237 	struct page *page = buf->page;
238 
239 	anon_pipe_put_page(pipe, page);
240 }
241 
242 static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
243 		struct pipe_buffer *buf)
244 {
245 	struct page *page = buf->page;
246 
247 	if (page_count(page) != 1)
248 		return false;
249 	memcg_kmem_uncharge_page(page, 0);
250 	__SetPageLocked(page);
251 	return true;
252 }
253 
254 /**
255  * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
256  * @pipe:	the pipe that the buffer belongs to
257  * @buf:	the buffer to attempt to steal
258  *
259  * Description:
260  *	This function attempts to steal the &struct page attached to
261  *	@buf. If successful, this function returns 0 and returns with
262  *	the page locked. The caller may then reuse the page for whatever
263  *	he wishes; the typical use is insertion into a different file
264  *	page cache.
265  */
266 bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
267 		struct pipe_buffer *buf)
268 {
269 	struct page *page = buf->page;
270 
271 	/*
272 	 * A reference of one is golden, that means that the owner of this
273 	 * page is the only one holding a reference to it. lock the page
274 	 * and return OK.
275 	 */
276 	if (page_count(page) == 1) {
277 		lock_page(page);
278 		return true;
279 	}
280 	return false;
281 }
282 EXPORT_SYMBOL(generic_pipe_buf_try_steal);
283 
284 /**
285  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
286  * @pipe:	the pipe that the buffer belongs to
287  * @buf:	the buffer to get a reference to
288  *
289  * Description:
290  *	This function grabs an extra reference to @buf. It's used in
291  *	the tee() system call, when we duplicate the buffers in one
292  *	pipe into another.
293  */
294 bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
295 {
296 	return try_get_page(buf->page);
297 }
298 EXPORT_SYMBOL(generic_pipe_buf_get);
299 
300 /**
301  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
302  * @pipe:	the pipe that the buffer belongs to
303  * @buf:	the buffer to put a reference to
304  *
305  * Description:
306  *	This function releases a reference to @buf.
307  */
308 void generic_pipe_buf_release(struct pipe_inode_info *pipe,
309 			      struct pipe_buffer *buf)
310 {
311 	put_page(buf->page);
312 }
313 EXPORT_SYMBOL(generic_pipe_buf_release);
314 
315 static const struct pipe_buf_operations anon_pipe_buf_ops = {
316 	.release	= anon_pipe_buf_release,
317 	.try_steal	= anon_pipe_buf_try_steal,
318 	.get		= generic_pipe_buf_get,
319 };
320 
321 /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
322 static inline bool pipe_readable(const struct pipe_inode_info *pipe)
323 {
324 	union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
325 	unsigned int writers = READ_ONCE(pipe->writers);
326 
327 	return !pipe_empty(idx.head, idx.tail) || !writers;
328 }
329 
330 static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
331 					    struct pipe_buffer *buf,
332 					    unsigned int tail)
333 {
334 	pipe_buf_release(pipe, buf);
335 
336 	/*
337 	 * If the pipe has a watch_queue, we need additional protection
338 	 * by the spinlock because notifications get posted with only
339 	 * this spinlock, no mutex
340 	 */
341 	if (pipe_has_watch_queue(pipe)) {
342 		spin_lock_irq(&pipe->rd_wait.lock);
343 #ifdef CONFIG_WATCH_QUEUE
344 		if (buf->flags & PIPE_BUF_FLAG_LOSS)
345 			pipe->note_loss = true;
346 #endif
347 		pipe->tail = ++tail;
348 		spin_unlock_irq(&pipe->rd_wait.lock);
349 		return tail;
350 	}
351 
352 	/*
353 	 * Without a watch_queue, we can simply increment the tail
354 	 * without the spinlock - the mutex is enough.
355 	 */
356 	pipe->tail = ++tail;
357 	return tail;
358 }
359 
/*
 * anon_pipe_read - copy buffered pipe data into @to.
 * @iocb:	kiocb of the read; ki_filp->private_data is the pipe. With
 *		O_NONBLOCK on the file or IOCB_NOWAIT in ki_flags an empty
 *		pipe yields -EAGAIN instead of sleeping.
 * @to:		destination iterator; its byte count bounds the read.
 *
 * Returns the number of bytes copied. Returns 0 for a zero-length request
 * or when the pipe is empty and has no writers. If nothing was copied, a
 * negative errno is returned instead: -EAGAIN, -EFAULT, -ENOBUFS,
 * -ERESTARTSYS or whatever pipe_buf_confirm() reported.
 *
 * Wakeups and the SIGIO for async writers are issued after pipe->mutex has
 * been dropped.
 */
360 static ssize_t
361 anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
362 {
363 	size_t total_len = iov_iter_count(to);
364 	struct file *filp = iocb->ki_filp;
365 	struct pipe_inode_info *pipe = filp->private_data;
366 	bool wake_writer = false, wake_next_reader = false;
367 	ssize_t ret;
368 
369 	/* Null read succeeds. */
370 	if (unlikely(total_len == 0))
371 		return 0;
372 
373 	ret = 0;
374 	mutex_lock(&pipe->mutex);
375 
376 	/*
377 	 * We only wake up writers if the pipe was full when we started reading
378 	 * and it is no longer full after reading to avoid unnecessary wakeups.
379 	 *
380 	 * But when we do wake up writers, we do so using a sync wakeup
381 	 * (WF_SYNC), because we want them to get going and generate more
382 	 * data for us.
383 	 */
384 	for (;;) {
385 		/* Read ->head with a barrier vs post_one_notification() */
386 		unsigned int head = smp_load_acquire(&pipe->head);
387 		unsigned int tail = pipe->tail;
388 
389 #ifdef CONFIG_WATCH_QUEUE
		/*
		 * A notification was lost: deliver a synthetic loss record
		 * ahead of the buffered data.
		 */
390 		if (pipe->note_loss) {
391 			struct watch_notification n;
392 
			/* NOTE(review): 8 is presumably sizeof(n) - confirm. */
393 			if (total_len < 8) {
394 				if (ret == 0)
395 					ret = -ENOBUFS;
396 				break;
397 			}
398 
399 			n.type = WATCH_TYPE_META;
400 			n.subtype = WATCH_META_LOSS_NOTIFICATION;
401 			n.info = watch_sizeof(n);
402 			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
403 				if (ret == 0)
404 					ret = -EFAULT;
405 				break;
406 			}
407 			ret += sizeof(n);
408 			total_len -= sizeof(n);
409 			pipe->note_loss = false;
410 		}
411 #endif
412 
413 		if (!pipe_empty(head, tail)) {
414 			struct pipe_buffer *buf = pipe_buf(pipe, tail);
415 			size_t chars = buf->len;
416 			size_t written;
417 			int error;
418 
			/*
			 * Buffer is larger than the space left. A WHOLE
			 * buffer is never split (-ENOBUFS if nothing was
			 * read yet); anything else is read partially.
			 */
419 			if (chars > total_len) {
420 				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
421 					if (ret == 0)
422 						ret = -ENOBUFS;
423 					break;
424 				}
425 				chars = total_len;
426 			}
427 
428 			error = pipe_buf_confirm(pipe, buf);
429 			if (error) {
430 				if (!ret)
431 					ret = error;
432 				break;
433 			}
434 
435 			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
436 			if (unlikely(written < chars)) {
437 				if (!ret)
438 					ret = -EFAULT;
439 				break;
440 			}
441 			ret += chars;
442 			buf->offset += chars;
443 			buf->len -= chars;
444 
445 			/* Was it a packet buffer? Clean up and exit */
			/*
			 * Any unread remainder of the packet is discarded, and
			 * setting total_len to chars makes the subtraction
			 * below reach zero so the loop ends after this packet.
			 */
446 			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
447 				total_len = chars;
448 				buf->len = 0;
449 			}
450 
			/*
			 * Buffer fully consumed: remember whether the ring was
			 * full before freeing the slot, then advance the tail.
			 */
451 			if (!buf->len) {
452 				wake_writer |= pipe_full(head, tail, pipe->max_usage);
453 				tail = pipe_update_tail(pipe, buf, tail);
454 			}
455 			total_len -= chars;
456 			if (!total_len)
457 				break;	/* common path: read succeeded */
458 			if (!pipe_empty(head, tail))	/* More to do? */
459 				continue;
460 		}
461 
		/*
		 * No (more) data in this snapshot of the ring: with no
		 * writers left this is end-of-file.
		 */
462 		if (!pipe->writers)
463 			break;
		/* Return a short read rather than sleep once data was copied. */
464 		if (ret)
465 			break;
466 		if ((filp->f_flags & O_NONBLOCK) ||
467 		    (iocb->ki_flags & IOCB_NOWAIT)) {
468 			ret = -EAGAIN;
469 			break;
470 		}
471 		mutex_unlock(&pipe->mutex);
472 		/*
473 		 * We only get here if we didn't actually read anything.
474 		 *
475 		 * But because we didn't read anything, at this point we can
476 		 * just return directly with -ERESTARTSYS if we're interrupted,
477 		 * since we've done any required wakeups and there's no need
478 		 * to mark anything accessed. And we've dropped the lock.
479 		 */
480 		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
481 			return -ERESTARTSYS;
482 
		/*
		 * We were woken from an exclusive wait; pass the wakeup on
		 * afterwards unless the pipe ends up empty (checked below).
		 */
483 		wake_next_reader = true;
484 		mutex_lock(&pipe->mutex);
485 	}
486 	if (pipe_is_empty(pipe))
487 		wake_next_reader = false;
488 	mutex_unlock(&pipe->mutex);
489 
490 	if (wake_writer)
491 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
492 	if (wake_next_reader)
493 		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	/* SIGIO to async writers is sent unconditionally on this path. */
494 	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
495 	return ret;
496 }
497 
498 static ssize_t
499 fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to)
500 {
501 	int ret = anon_pipe_read(iocb, to);
502 	if (ret > 0)
503 		file_accessed(iocb->ki_filp);
504 	return ret;
505 }
506 
507 static inline int is_packetized(struct file *file)
508 {
509 	return (file->f_flags & O_DIRECT) != 0;
510 }
511 
512 /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
513 static inline bool pipe_writable(const struct pipe_inode_info *pipe)
514 {
515 	union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
516 	unsigned int max_usage = READ_ONCE(pipe->max_usage);
517 
518 	return !pipe_full(idx.head, idx.tail, max_usage) ||
519 		!READ_ONCE(pipe->readers);
520 }
521 
/*
 * anon_pipe_write - append the data described by @from to the pipe.
 * @iocb:	kiocb of the write; ki_filp->private_data is the pipe.
 *		IOCB_NOSIGNAL suppresses SIGPIPE. With O_NONBLOCK on the file
 *		or IOCB_NOWAIT in ki_flags a full pipe is not waited on.
 * @from:	source iterator.
 *
 * Returns the number of bytes written, or 0 for a zero-length request. If
 * nothing was written, a negative errno is returned instead: -EXDEV for
 * watch-queue pipes, -EPIPE when there are no readers, -ENOMEM, -EFAULT,
 * -EAGAIN, -ERESTARTSYS or whatever pipe_buf_confirm() reported.
 *
 * Pages for multi-page writes are allocated before pipe->mutex is taken.
 * At the end, leftovers refill tmp_page[] and the rest are freed after the
 * mutex is dropped.
 */
522 static ssize_t
523 anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
524 {
525 	struct file *filp = iocb->ki_filp;
526 	struct pipe_inode_info *pipe = filp->private_data;
527 	struct anon_pipe_prealloc prealloc;
528 	unsigned int head;
529 	ssize_t ret = 0;
530 	size_t total_len = iov_iter_count(from);
531 	ssize_t chars;
532 	bool was_empty = false;
533 	bool wake_next_writer = false;
534 
535 	/*
536 	 * Reject writing to watch queue pipes before the point where we lock
537 	 * the pipe.
538 	 * Otherwise, lockdep would be unhappy if the caller already has another
539 	 * pipe locked.
540 	 * If we had to support locking a normal pipe and a notification pipe at
541 	 * the same time, we could set up lockdep annotations for that, but
542 	 * since we don't actually need that, it's simpler to just bail here.
543 	 */
544 	if (pipe_has_watch_queue(pipe))
545 		return -EXDEV;
546 
547 	/* Null write succeeds. */
548 	if (unlikely(total_len == 0))
549 		return 0;
550 
	/*
	 * Always initialises prealloc (count is 0 for writes of at most one
	 * page), so every later "goto out" may safely hand it to the cleanup
	 * helpers.
	 */
551 	anon_pipe_get_page_prealloc(&prealloc, total_len);
552 
553 	mutex_lock(&pipe->mutex);
554 
555 	if (!pipe->readers) {
556 		if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
557 			send_sig(SIGPIPE, current, 0);
558 		ret = -EPIPE;
559 		goto out;
560 	}
561 
562 	/*
563 	 * If it wasn't empty we try to merge new data into
564 	 * the last buffer.
565 	 *
566 	 * That naturally merges small writes, but it also
567 	 * page-aligns the rest of the writes for large writes
568 	 * spanning multiple pages.
569 	 */
570 	head = pipe->head;
571 	was_empty = pipe_empty(head, pipe->tail);
	/* Sub-page remainder of the write; only this part is merged. */
572 	chars = total_len & (PAGE_SIZE-1);
573 	if (chars && !was_empty) {
574 		struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
575 		int offset = buf->offset + buf->len;
576 
		/* Merge only if the buffer allows it and the bytes fit. */
577 		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
578 		    offset + chars <= PAGE_SIZE) {
579 			ret = pipe_buf_confirm(pipe, buf);
580 			if (ret)
581 				goto out;
582 
583 			ret = copy_page_from_iter(buf->page, offset, chars, from);
584 			if (unlikely(ret < chars)) {
585 				ret = -EFAULT;
586 				goto out;
587 			}
588 
589 			buf->len += ret;
590 			if (!iov_iter_count(from))
591 				goto out;
592 		}
593 	}
594 
	/* Add the remaining data one page-sized buffer at a time. */
595 	for (;;) {
596 		if (!pipe->readers) {
597 			if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
598 				send_sig(SIGPIPE, current, 0);
599 			if (!ret)
600 				ret = -EPIPE;
601 			break;
602 		}
603 
604 		head = pipe->head;
605 		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
606 			struct pipe_buffer *buf;
607 			struct page *page;
608 			int copied;
609 
610 			page = anon_pipe_get_page(pipe, &prealloc);
611 			if (unlikely(!page)) {
612 				if (!ret)
613 					ret = -ENOMEM;
614 				break;
615 			}
616 
			/*
			 * A short copy is an error only while the iterator
			 * still has data left. The unused page is then handed
			 * back to anon_pipe_put_page().
			 */
617 			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
618 			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
619 				anon_pipe_put_page(pipe, page);
620 				if (!ret)
621 					ret = -EFAULT;
622 				break;
623 			}
624 
625 			pipe->head = head + 1;
626 			/* Insert it into the buffer array */
627 			buf = pipe_buf(pipe, head);
628 			buf->page = page;
629 			buf->ops = &anon_pipe_buf_ops;
630 			buf->offset = 0;
			/* Packet buffers are never marked mergeable. */
631 			if (is_packetized(filp))
632 				buf->flags = PIPE_BUF_FLAG_PACKET;
633 			else
634 				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
635 
636 			buf->len = copied;
637 			ret += copied;
638 
639 			if (!iov_iter_count(from))
640 				break;
641 
642 			continue;
643 		}
644 
645 		/* Wait for buffer space to become available. */
646 		if ((filp->f_flags & O_NONBLOCK) ||
647 		    (iocb->ki_flags & IOCB_NOWAIT)) {
648 			if (!ret)
649 				ret = -EAGAIN;
650 			break;
651 		}
652 		if (signal_pending(current)) {
653 			if (!ret)
654 				ret = -ERESTARTSYS;
655 			break;
656 		}
657 
658 		/*
659 		 * We're going to release the pipe lock and wait for more
660 		 * space. We wake up any readers if necessary, and then
661 		 * after waiting we need to re-check whether the pipe
662 		 * became empty while we dropped the lock.
663 		 */
664 		mutex_unlock(&pipe->mutex);
665 		if (was_empty)
666 			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
667 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		/*
		 * The wait's return value is ignored. If a signal arrived and
		 * the pipe is still full, the signal_pending() check above
		 * catches it on the next pass.
		 */
668 		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
669 		mutex_lock(&pipe->mutex);
670 		was_empty = pipe_is_empty(pipe);
671 		wake_next_writer = true;
672 	}
673 out:
	/*
	 * Leftover pre-allocated pages first refill tmp_page[] while the
	 * mutex is still held; whatever remains is freed after unlocking.
	 */
674 	anon_pipe_refill_tmp_pages(pipe, &prealloc);
	/* No room left for another writer, so do not pass the wakeup on. */
675 	if (pipe_is_full(pipe))
676 		wake_next_writer = false;
677 	mutex_unlock(&pipe->mutex);
678 	anon_pipe_free_pages(&prealloc);
679 
680 	/*
681 	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
682 	 * want the reader to start processing things asap, rather than
683 	 * leave the data pending.
684 	 *
685 	 * This is particularly important for small writes, because of
686 	 * how (for example) the GNU make jobserver uses small writes to
687 	 * wake up pending jobs
688 	 *
689 	 * Epoll nonsensically wants a wakeup whether the pipe
690 	 * was already empty or not.
691 	 */
692 	if (was_empty || pipe->poll_usage)
693 		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
694 	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
695 	if (wake_next_writer)
696 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
697 	return ret;
698 }
699 
700 static ssize_t
701 fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from)
702 {
703 	int ret = anon_pipe_write(iocb, from);
704 	if (ret > 0) {
705 		struct file *filp = iocb->ki_filp;
706 		if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
707 			int err = file_update_time(filp);
708 			if (err)
709 				ret = err;
710 			sb_end_write(file_inode(filp)->i_sb);
711 		}
712 	}
713 	return ret;
714 }
715 
716 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
717 {
718 	struct pipe_inode_info *pipe = filp->private_data;
719 	unsigned int count, head, tail;
720 
721 	switch (cmd) {
722 	case FIONREAD:
723 		mutex_lock(&pipe->mutex);
724 		count = 0;
725 		head = pipe->head;
726 		tail = pipe->tail;
727 
728 		while (!pipe_empty(head, tail)) {
729 			count += pipe_buf(pipe, tail)->len;
730 			tail++;
731 		}
732 		mutex_unlock(&pipe->mutex);
733 
734 		return put_user(count, (int __user *)arg);
735 
736 #ifdef CONFIG_WATCH_QUEUE
737 	case IOC_WATCH_QUEUE_SET_SIZE: {
738 		int ret;
739 		mutex_lock(&pipe->mutex);
740 		ret = watch_queue_set_size(pipe, arg);
741 		mutex_unlock(&pipe->mutex);
742 		return ret;
743 	}
744 
745 	case IOC_WATCH_QUEUE_SET_FILTER:
746 		return watch_queue_set_filter(
747 			pipe, (struct watch_notification_filter __user *)arg);
748 #endif
749 
750 	default:
751 		return -ENOIOCTLCMD;
752 	}
753 }
754 
755 /* No kernel lock held - fine */
756 static __poll_t
757 pipe_poll(struct file *filp, poll_table *wait)
758 {
759 	__poll_t mask;
760 	struct pipe_inode_info *pipe = filp->private_data;
761 	union pipe_index idx;
762 
763 	/* Epoll has some historical nasty semantics, this enables them */
764 	if (unlikely(!READ_ONCE(pipe->poll_usage)))
765 		WRITE_ONCE(pipe->poll_usage, true);
766 
767 	/*
768 	 * Reading pipe state only -- no need for acquiring the semaphore.
769 	 *
770 	 * But because this is racy, the code has to add the
771 	 * entry to the poll table _first_ ..
772 	 */
773 	if (filp->f_mode & FMODE_READ)
774 		poll_wait(filp, &pipe->rd_wait, wait);
775 	if (filp->f_mode & FMODE_WRITE)
776 		poll_wait(filp, &pipe->wr_wait, wait);
777 
778 	/*
779 	 * .. and only then can you do the racy tests. That way,
780 	 * if something changes and you got it wrong, the poll
781 	 * table entry will wake you up and fix it.
782 	 */
783 	idx.head_tail = READ_ONCE(pipe->head_tail);
784 
785 	mask = 0;
786 	if (filp->f_mode & FMODE_READ) {
787 		if (!pipe_empty(idx.head, idx.tail))
788 			mask |= EPOLLIN | EPOLLRDNORM;
789 		if (!pipe->writers && filp->f_pipe != pipe->w_counter)
790 			mask |= EPOLLHUP;
791 	}
792 
793 	if (filp->f_mode & FMODE_WRITE) {
794 		if (!pipe_full(idx.head, idx.tail, pipe->max_usage))
795 			mask |= EPOLLOUT | EPOLLWRNORM;
796 		/*
797 		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
798 		 * behave exactly like pipes for poll().
799 		 */
800 		if (!pipe->readers)
801 			mask |= EPOLLERR;
802 	}
803 
804 	return mask;
805 }
806 
807 static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
808 {
809 	int kill = 0;
810 
811 	spin_lock(&inode->i_lock);
812 	if (!--pipe->files) {
813 		inode->i_pipe = NULL;
814 		kill = 1;
815 	}
816 	spin_unlock(&inode->i_lock);
817 
818 	if (kill)
819 		free_pipe_info(pipe);
820 }
821 
822 static int
823 pipe_release(struct inode *inode, struct file *file)
824 {
825 	struct pipe_inode_info *pipe = file->private_data;
826 
827 	mutex_lock(&pipe->mutex);
828 	if (file->f_mode & FMODE_READ)
829 		pipe->readers--;
830 	if (file->f_mode & FMODE_WRITE)
831 		pipe->writers--;
832 
833 	/* Was that the last reader or writer, but not the other side? */
834 	if (!pipe->readers != !pipe->writers) {
835 		wake_up_interruptible_all(&pipe->rd_wait);
836 		wake_up_interruptible_all(&pipe->wr_wait);
837 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
838 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
839 	}
840 	mutex_unlock(&pipe->mutex);
841 
842 	put_pipe_info(inode, pipe);
843 	return 0;
844 }
845 
846 static int
847 pipe_fasync(int fd, struct file *filp, int on)
848 {
849 	struct pipe_inode_info *pipe = filp->private_data;
850 	int retval = 0;
851 
852 	mutex_lock(&pipe->mutex);
853 	if (filp->f_mode & FMODE_READ)
854 		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
855 	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
856 		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
857 		if (retval < 0 && (filp->f_mode & FMODE_READ))
858 			/* this can happen only if on == T */
859 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
860 	}
861 	mutex_unlock(&pipe->mutex);
862 	return retval;
863 }
864 
865 unsigned long account_pipe_buffers(struct user_struct *user,
866 				   unsigned long old, unsigned long new)
867 {
868 	return atomic_long_add_return(new - old, &user->pipe_bufs);
869 }
870 
871 bool too_many_pipe_buffers_soft(unsigned long user_bufs)
872 {
873 	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
874 
875 	return soft_limit && user_bufs > soft_limit;
876 }
877 
878 bool too_many_pipe_buffers_hard(unsigned long user_bufs)
879 {
880 	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
881 
882 	return hard_limit && user_bufs > hard_limit;
883 }
884 
885 bool pipe_is_unprivileged_user(void)
886 {
887 	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
888 }
889 
890 struct pipe_inode_info *alloc_pipe_info(void)
891 {
892 	struct pipe_inode_info *pipe;
893 	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
894 	struct user_struct *user = get_current_user();
895 	unsigned long user_bufs;
896 	unsigned int max_size = READ_ONCE(pipe_max_size);
897 
898 	pipe = kzalloc_obj(struct pipe_inode_info, GFP_KERNEL_ACCOUNT);
899 	if (pipe == NULL)
900 		goto out_free_uid;
901 
902 	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
903 		pipe_bufs = max_size >> PAGE_SHIFT;
904 
905 	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
906 
907 	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
908 		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
909 		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
910 	}
911 
912 	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
913 		goto out_revert_acct;
914 
915 	pipe->bufs = kzalloc_objs(struct pipe_buffer, pipe_bufs,
916 				  GFP_KERNEL_ACCOUNT);
917 
918 	if (pipe->bufs) {
919 		init_waitqueue_head(&pipe->rd_wait);
920 		init_waitqueue_head(&pipe->wr_wait);
921 		pipe->r_counter = pipe->w_counter = 1;
922 		pipe->max_usage = pipe_bufs;
923 		pipe->ring_size = pipe_bufs;
924 		pipe->nr_accounted = pipe_bufs;
925 		pipe->user = user;
926 		mutex_init(&pipe->mutex);
927 		lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
928 		return pipe;
929 	}
930 
931 out_revert_acct:
932 	(void) account_pipe_buffers(user, pipe_bufs, 0);
933 	kfree(pipe);
934 out_free_uid:
935 	free_uid(user);
936 	return NULL;
937 }
938 
939 void free_pipe_info(struct pipe_inode_info *pipe)
940 {
941 	unsigned int i;
942 
943 #ifdef CONFIG_WATCH_QUEUE
944 	if (pipe->watch_queue)
945 		watch_queue_clear(pipe->watch_queue);
946 #endif
947 
948 	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
949 	free_uid(pipe->user);
950 	for (i = 0; i < pipe->ring_size; i++) {
951 		struct pipe_buffer *buf = pipe->bufs + i;
952 		if (buf->ops)
953 			pipe_buf_release(pipe, buf);
954 	}
955 #ifdef CONFIG_WATCH_QUEUE
956 	if (pipe->watch_queue)
957 		put_watch_queue(pipe->watch_queue);
958 #endif
959 	for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
960 		if (pipe->tmp_page[i])
961 			__free_page(pipe->tmp_page[i]);
962 	}
963 	kfree(pipe->bufs);
964 	kfree(pipe);
965 }
966 
/*
 * Mount used for anonymous pipes: get_pipe_inode() allocates inodes on its
 * superblock and create_pipe_files() passes it to alloc_file_pseudo().
 * Read-only after init.
 */
967 static struct vfsmount *pipe_mnt __ro_after_init;
968 
969 /*
970  * pipefs_dname() is called from d_path().
971  */
972 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
973 {
974 	return dynamic_dname(buffer, buflen, "pipe:[%llu]",
975 				d_inode(dentry)->i_ino);
976 }
977 
978 static const struct dentry_operations pipefs_dentry_operations = {
979 	.d_dname	= pipefs_dname,
980 };
981 
982 static const struct file_operations pipeanon_fops;
983 
984 static struct inode * get_pipe_inode(void)
985 {
986 	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
987 	struct pipe_inode_info *pipe;
988 
989 	if (!inode)
990 		goto fail_inode;
991 
992 	inode->i_ino = get_next_ino();
993 
994 	pipe = alloc_pipe_info();
995 	if (!pipe)
996 		goto fail_iput;
997 
998 	inode->i_pipe = pipe;
999 	pipe->files = 2;
1000 	pipe->readers = pipe->writers = 1;
1001 	inode->i_fop = &pipeanon_fops;
1002 
1003 	/*
1004 	 * Mark the inode dirty from the very beginning,
1005 	 * that way it will never be moved to the dirty
1006 	 * list because "mark_inode_dirty()" will think
1007 	 * that it already _is_ on the dirty list.
1008 	 */
1009 	inode_state_assign_raw(inode, I_DIRTY);
1010 	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
1011 	inode->i_uid = current_fsuid();
1012 	inode->i_gid = current_fsgid();
1013 	simple_inode_init_ts(inode);
1014 
1015 	return inode;
1016 
1017 fail_iput:
1018 	iput(inode);
1019 
1020 fail_inode:
1021 	return NULL;
1022 }
1023 
1024 int create_pipe_files(struct file **res, int flags)
1025 {
1026 	struct inode *inode = get_pipe_inode();
1027 	struct file *f;
1028 	int error;
1029 
1030 	if (!inode)
1031 		return -ENFILE;
1032 
1033 	if (flags & O_NOTIFICATION_PIPE) {
1034 		error = watch_queue_init(inode->i_pipe);
1035 		if (error) {
1036 			free_pipe_info(inode->i_pipe);
1037 			iput(inode);
1038 			return error;
1039 		}
1040 	}
1041 
1042 	f = alloc_file_pseudo(inode, pipe_mnt, "",
1043 				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
1044 				&pipeanon_fops);
1045 	if (IS_ERR(f)) {
1046 		free_pipe_info(inode->i_pipe);
1047 		iput(inode);
1048 		return PTR_ERR(f);
1049 	}
1050 
1051 	f->private_data = inode->i_pipe;
1052 	f->f_pipe = 0;
1053 
1054 	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
1055 				  &pipeanon_fops);
1056 	if (IS_ERR(res[0])) {
1057 		put_pipe_info(inode, inode->i_pipe);
1058 		fput(f);
1059 		return PTR_ERR(res[0]);
1060 	}
1061 	res[0]->private_data = inode->i_pipe;
1062 	res[0]->f_pipe = 0;
1063 	res[1] = f;
1064 	stream_open(inode, res[0]);
1065 	stream_open(inode, res[1]);
1066 
1067 	/* pipe groks IOCB_NOWAIT */
1068 	res[0]->f_mode |= FMODE_NOWAIT;
1069 	res[1]->f_mode |= FMODE_NOWAIT;
1070 
1071 	/*
1072 	 * Disable permission and pre-content events, but enable legacy
1073 	 * inotify events for legacy users.
1074 	 */
1075 	file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM);
1076 	file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM);
1077 	return 0;
1078 }
1079 
1080 static int __do_pipe_flags(int *fd, struct file **files, int flags)
1081 {
1082 	int error;
1083 	int fdw, fdr;
1084 
1085 	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
1086 		return -EINVAL;
1087 
1088 	error = create_pipe_files(files, flags);
1089 	if (error)
1090 		return error;
1091 
1092 	error = get_unused_fd_flags(flags);
1093 	if (error < 0)
1094 		goto err_read_pipe;
1095 	fdr = error;
1096 
1097 	error = get_unused_fd_flags(flags);
1098 	if (error < 0)
1099 		goto err_fdr;
1100 	fdw = error;
1101 
1102 	audit_fd_pair(fdr, fdw);
1103 	fd[0] = fdr;
1104 	fd[1] = fdw;
1105 	return 0;
1106 
1107  err_fdr:
1108 	put_unused_fd(fdr);
1109  err_read_pipe:
1110 	fput(files[0]);
1111 	fput(files[1]);
1112 	return error;
1113 }
1114 
/*
 * Create a pipe and install both ends into the descriptor table.
 *
 * On success fd[0] is the read descriptor and fd[1] the write descriptor.
 * Returns 0 on success or the error from __do_pipe_flags(), in which case
 * nothing has been installed.
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (error)
		return error;

	fd_install(fd[0], files[0]);
	fd_install(fd[1], files[1]);
	return 0;
}
1125 
1126 /*
1127  * sys_pipe() is the normal C calling standard for creating
1128  * a pipe. It's not the way Unix traditionally does this, though.
1129  */
1130 static int do_pipe2(int __user *fildes, int flags)
1131 {
1132 	struct file *files[2];
1133 	int fd[2];
1134 	int error;
1135 
1136 	error = __do_pipe_flags(fd, files, flags);
1137 	if (!error) {
1138 		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
1139 			fput(files[0]);
1140 			fput(files[1]);
1141 			put_unused_fd(fd[0]);
1142 			put_unused_fd(fd[1]);
1143 			error = -EFAULT;
1144 		} else {
1145 			fd_install(fd[0], files[0]);
1146 			fd_install(fd[1], files[1]);
1147 		}
1148 	}
1149 	return error;
1150 }
1151 
1152 SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
1153 {
1154 	return do_pipe2(fildes, flags);
1155 }
1156 
1157 SYSCALL_DEFINE1(pipe, int __user *, fildes)
1158 {
1159 	return do_pipe2(fildes, 0);
1160 }
1161 
1162 /*
1163  * This is the stupid "wait for pipe to be readable or writable"
1164  * model.
1165  *
1166  * See pipe_read/write() for the proper kind of exclusive wait,
1167  * but that requires that we wake up any other readers/writers
1168  * if we then do not end up reading everything (ie the whole
1169  * "wake_next_reader/writer" logic in pipe_read/write()).
1170  */
1171 void pipe_wait_readable(struct pipe_inode_info *pipe)
1172 {
1173 	pipe_unlock(pipe);
1174 	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
1175 	pipe_lock(pipe);
1176 }
1177 
1178 void pipe_wait_writable(struct pipe_inode_info *pipe)
1179 {
1180 	pipe_unlock(pipe);
1181 	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
1182 	pipe_lock(pipe);
1183 }
1184 
1185 /*
1186  * This depends on both the wait (here) and the wakeup (wake_up_partner)
1187  * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
1188  * race with the count check and waitqueue prep.
1189  *
1190  * Normally in order to avoid races, you'd do the prepare_to_wait() first,
1191  * then check the condition you're waiting for, and only then sleep. But
1192  * because of the pipe lock, we can check the condition before being on
1193  * the wait queue.
1194  *
1195  * We use the 'rd_wait' waitqueue for pipe partner waiting.
1196  */
1197 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
1198 {
1199 	DEFINE_WAIT(rdwait);
1200 	int cur = *cnt;
1201 
1202 	while (cur == *cnt) {
1203 		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
1204 		pipe_unlock(pipe);
1205 		schedule();
1206 		finish_wait(&pipe->rd_wait, &rdwait);
1207 		pipe_lock(pipe);
1208 		if (signal_pending(current))
1209 			break;
1210 	}
1211 	return cur == *cnt ? -ERESTARTSYS : 0;
1212 }
1213 
1214 static void wake_up_partner(struct pipe_inode_info *pipe)
1215 {
1216 	wake_up_interruptible_all(&pipe->rd_wait);
1217 }
1218 
1219 static int fifo_open(struct inode *inode, struct file *filp)
1220 {
1221 	bool is_pipe = inode->i_fop == &pipeanon_fops;
1222 	struct pipe_inode_info *pipe;
1223 	int ret;
1224 
1225 	filp->f_pipe = 0;
1226 
1227 	spin_lock(&inode->i_lock);
1228 	if (inode->i_pipe) {
1229 		pipe = inode->i_pipe;
1230 		pipe->files++;
1231 		spin_unlock(&inode->i_lock);
1232 	} else {
1233 		spin_unlock(&inode->i_lock);
1234 		pipe = alloc_pipe_info();
1235 		if (!pipe)
1236 			return -ENOMEM;
1237 		pipe->files = 1;
1238 		spin_lock(&inode->i_lock);
1239 		if (unlikely(inode->i_pipe)) {
1240 			inode->i_pipe->files++;
1241 			spin_unlock(&inode->i_lock);
1242 			free_pipe_info(pipe);
1243 			pipe = inode->i_pipe;
1244 		} else {
1245 			inode->i_pipe = pipe;
1246 			spin_unlock(&inode->i_lock);
1247 		}
1248 	}
1249 	filp->private_data = pipe;
1250 	/* OK, we have a pipe and it's pinned down */
1251 
1252 	mutex_lock(&pipe->mutex);
1253 
1254 	/* We can only do regular read/write on fifos */
1255 	stream_open(inode, filp);
1256 
1257 	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
1258 	case FMODE_READ:
1259 	/*
1260 	 *  O_RDONLY
1261 	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
1262 	 *  opened, even when there is no process writing the FIFO.
1263 	 */
1264 		pipe->r_counter++;
1265 		if (pipe->readers++ == 0)
1266 			wake_up_partner(pipe);
1267 
1268 		if (!is_pipe && !pipe->writers) {
1269 			if ((filp->f_flags & O_NONBLOCK)) {
1270 				/* suppress EPOLLHUP until we have
1271 				 * seen a writer */
1272 				filp->f_pipe = pipe->w_counter;
1273 			} else {
1274 				if (wait_for_partner(pipe, &pipe->w_counter))
1275 					goto err_rd;
1276 			}
1277 		}
1278 		break;
1279 
1280 	case FMODE_WRITE:
1281 	/*
1282 	 *  O_WRONLY
1283 	 *  POSIX.1 says that O_NONBLOCK means return -1 with
1284 	 *  errno=ENXIO when there is no process reading the FIFO.
1285 	 */
1286 		ret = -ENXIO;
1287 		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
1288 			goto err;
1289 
1290 		pipe->w_counter++;
1291 		if (!pipe->writers++)
1292 			wake_up_partner(pipe);
1293 
1294 		if (!is_pipe && !pipe->readers) {
1295 			if (wait_for_partner(pipe, &pipe->r_counter))
1296 				goto err_wr;
1297 		}
1298 		break;
1299 
1300 	case FMODE_READ | FMODE_WRITE:
1301 	/*
1302 	 *  O_RDWR
1303 	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
1304 	 *  This implementation will NEVER block on a O_RDWR open, since
1305 	 *  the process can at least talk to itself.
1306 	 */
1307 
1308 		pipe->readers++;
1309 		pipe->writers++;
1310 		pipe->r_counter++;
1311 		pipe->w_counter++;
1312 		if (pipe->readers == 1 || pipe->writers == 1)
1313 			wake_up_partner(pipe);
1314 		break;
1315 
1316 	default:
1317 		ret = -EINVAL;
1318 		goto err;
1319 	}
1320 
1321 	/* Ok! */
1322 	mutex_unlock(&pipe->mutex);
1323 	return 0;
1324 
1325 err_rd:
1326 	if (!--pipe->readers)
1327 		wake_up_interruptible(&pipe->wr_wait);
1328 	ret = -ERESTARTSYS;
1329 	goto err;
1330 
1331 err_wr:
1332 	if (!--pipe->writers)
1333 		wake_up_interruptible_all(&pipe->rd_wait);
1334 	ret = -ERESTARTSYS;
1335 	goto err;
1336 
1337 err:
1338 	mutex_unlock(&pipe->mutex);
1339 
1340 	put_pipe_info(inode, pipe);
1341 	return ret;
1342 }
1343 
1344 const struct file_operations pipefifo_fops = {
1345 	.open		= fifo_open,
1346 	.read_iter	= fifo_pipe_read,
1347 	.write_iter	= fifo_pipe_write,
1348 	.poll		= pipe_poll,
1349 	.unlocked_ioctl	= pipe_ioctl,
1350 	.release	= pipe_release,
1351 	.fasync		= pipe_fasync,
1352 	.splice_write	= iter_file_splice_write,
1353 };
1354 
1355 static const struct file_operations pipeanon_fops = {
1356 	.open		= fifo_open,
1357 	.read_iter	= anon_pipe_read,
1358 	.write_iter	= anon_pipe_write,
1359 	.poll		= pipe_poll,
1360 	.unlocked_ioctl	= pipe_ioctl,
1361 	.release	= pipe_release,
1362 	.fasync		= pipe_fasync,
1363 	.splice_write	= iter_file_splice_write,
1364 };
1365 
1366 /*
1367  * Currently we rely on the pipe array holding a power-of-2 number
1368  * of pages. Returns 0 on error.
1369  */
1370 unsigned int round_pipe_size(unsigned int size)
1371 {
1372 	if (size > (1U << 31))
1373 		return 0;
1374 
1375 	/* Minimum pipe size, as required by POSIX */
1376 	if (size < PAGE_SIZE)
1377 		return PAGE_SIZE;
1378 
1379 	return roundup_pow_of_two(size);
1380 }
1381 
1382 /*
1383  * Resize the pipe ring to a number of slots.
1384  *
1385  * Note the pipe can be reduced in capacity, but only if the current
1386  * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
1387  * returned instead.
1388  */
1389 int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
1390 {
1391 	struct pipe_buffer *bufs;
1392 	unsigned int head, tail, mask, n;
1393 
1394 	/* nr_slots larger than limits of pipe->{head,tail} */
1395 	if (unlikely(nr_slots > (pipe_index_t)-1u))
1396 		return -EINVAL;
1397 
1398 	bufs = kzalloc_objs(*bufs, nr_slots, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1399 	if (unlikely(!bufs))
1400 		return -ENOMEM;
1401 
1402 	spin_lock_irq(&pipe->rd_wait.lock);
1403 	mask = pipe->ring_size - 1;
1404 	head = pipe->head;
1405 	tail = pipe->tail;
1406 
1407 	n = pipe_occupancy(head, tail);
1408 	if (nr_slots < n) {
1409 		spin_unlock_irq(&pipe->rd_wait.lock);
1410 		kfree(bufs);
1411 		return -EBUSY;
1412 	}
1413 
1414 	/*
1415 	 * The pipe array wraps around, so just start the new one at zero
1416 	 * and adjust the indices.
1417 	 */
1418 	if (n > 0) {
1419 		unsigned int h = head & mask;
1420 		unsigned int t = tail & mask;
1421 		if (h > t) {
1422 			memcpy(bufs, pipe->bufs + t,
1423 			       n * sizeof(struct pipe_buffer));
1424 		} else {
1425 			unsigned int tsize = pipe->ring_size - t;
1426 			if (h > 0)
1427 				memcpy(bufs + tsize, pipe->bufs,
1428 				       h * sizeof(struct pipe_buffer));
1429 			memcpy(bufs, pipe->bufs + t,
1430 			       tsize * sizeof(struct pipe_buffer));
1431 		}
1432 	}
1433 
1434 	head = n;
1435 	tail = 0;
1436 
1437 	kfree(pipe->bufs);
1438 	pipe->bufs = bufs;
1439 	pipe->ring_size = nr_slots;
1440 	if (pipe->max_usage > nr_slots)
1441 		pipe->max_usage = nr_slots;
1442 	pipe->tail = tail;
1443 	pipe->head = head;
1444 
1445 	if (!pipe_has_watch_queue(pipe)) {
1446 		pipe->max_usage = nr_slots;
1447 		pipe->nr_accounted = nr_slots;
1448 	}
1449 
1450 	spin_unlock_irq(&pipe->rd_wait.lock);
1451 
1452 	/* This might have made more room for writers */
1453 	wake_up_interruptible(&pipe->wr_wait);
1454 	return 0;
1455 }
1456 
1457 /*
1458  * Allocate a new array of pipe buffers and copy the info over. Returns the
1459  * pipe size if successful, or return -ERROR on error.
1460  */
1461 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
1462 {
1463 	unsigned long user_bufs;
1464 	unsigned int nr_slots, size;
1465 	long ret = 0;
1466 
1467 	if (pipe_has_watch_queue(pipe))
1468 		return -EBUSY;
1469 
1470 	size = round_pipe_size(arg);
1471 	nr_slots = size >> PAGE_SHIFT;
1472 
1473 	if (!nr_slots)
1474 		return -EINVAL;
1475 
1476 	/*
1477 	 * If trying to increase the pipe capacity, check that an
1478 	 * unprivileged user is not trying to exceed various limits
1479 	 * (soft limit check here, hard limit check just below).
1480 	 * Decreasing the pipe capacity is always permitted, even
1481 	 * if the user is currently over a limit.
1482 	 */
1483 	if (nr_slots > pipe->max_usage &&
1484 			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
1485 		return -EPERM;
1486 
1487 	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
1488 
1489 	if (nr_slots > pipe->max_usage &&
1490 			(too_many_pipe_buffers_hard(user_bufs) ||
1491 			 too_many_pipe_buffers_soft(user_bufs)) &&
1492 			pipe_is_unprivileged_user()) {
1493 		ret = -EPERM;
1494 		goto out_revert_acct;
1495 	}
1496 
1497 	ret = pipe_resize_ring(pipe, nr_slots);
1498 	if (ret < 0)
1499 		goto out_revert_acct;
1500 
1501 	return pipe->max_usage * PAGE_SIZE;
1502 
1503 out_revert_acct:
1504 	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
1505 	return ret;
1506 }
1507 
1508 /*
1509  * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
1510  * not enough to verify that this is a pipe.
1511  */
1512 struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
1513 {
1514 	struct pipe_inode_info *pipe = file->private_data;
1515 
1516 	if (!pipe)
1517 		return NULL;
1518 	if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops)
1519 		return NULL;
1520 	if (for_splice && pipe_has_watch_queue(pipe))
1521 		return NULL;
1522 	return pipe;
1523 }
1524 
1525 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
1526 {
1527 	struct pipe_inode_info *pipe;
1528 	long ret;
1529 
1530 	pipe = get_pipe_info(file, false);
1531 	if (!pipe)
1532 		return -EBADF;
1533 
1534 	mutex_lock(&pipe->mutex);
1535 
1536 	switch (cmd) {
1537 	case F_SETPIPE_SZ:
1538 		ret = pipe_set_size(pipe, arg);
1539 		break;
1540 	case F_GETPIPE_SZ:
1541 		ret = pipe->max_usage * PAGE_SIZE;
1542 		break;
1543 	default:
1544 		ret = -EINVAL;
1545 		break;
1546 	}
1547 
1548 	mutex_unlock(&pipe->mutex);
1549 	return ret;
1550 }
1551 
1552 static const struct super_operations pipefs_ops = {
1553 	.destroy_inode = free_inode_nonrcu,
1554 	.statfs = simple_statfs,
1555 };
1556 
1557 /*
1558  * pipefs should _never_ be mounted by userland - too much of security hassle,
1559  * no real gain from having the whole file system mounted. So we don't need
1560  * any operations on the root directory. However, we need a non-trivial
1561  * d_name - pipe: will go nicely and kill the special-casing in procfs.
1562  */
1563 
1564 static int pipefs_init_fs_context(struct fs_context *fc)
1565 {
1566 	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
1567 	if (!ctx)
1568 		return -ENOMEM;
1569 	ctx->ops = &pipefs_ops;
1570 	ctx->dops = &pipefs_dentry_operations;
1571 	return 0;
1572 }
1573 
1574 static struct file_system_type pipe_fs_type = {
1575 	.name		= "pipefs",
1576 	.init_fs_context = pipefs_init_fs_context,
1577 	.kill_sb	= kill_anon_super,
1578 };
1579 
1580 #ifdef CONFIG_SYSCTL
1581 
1582 static ulong round_pipe_size_ul(ulong size)
1583 {
1584 	return round_pipe_size(size);
1585 }
1586 
1587 static int u2k_pipe_maxsz(const ulong *u_ptr, uint *k_ptr)
1588 {
1589 	return proc_uint_u2k_conv_uop(u_ptr, k_ptr, round_pipe_size_ul);
1590 }
1591 
1592 static int do_proc_uint_conv_pipe_maxsz(ulong *u_ptr, uint *k_ptr,
1593 					int dir, const struct ctl_table *table)
1594 {
1595 	return proc_uint_conv(u_ptr, k_ptr, dir, table, true,
1596 			      u2k_pipe_maxsz,
1597 			      proc_uint_k2u_conv);
1598 }
1599 
1600 static int proc_dopipe_max_size(const struct ctl_table *table, int write,
1601 				void *buffer, size_t *lenp, loff_t *ppos)
1602 {
1603 	return proc_douintvec_conv(table, write, buffer, lenp, ppos,
1604 				   do_proc_uint_conv_pipe_maxsz);
1605 }
1606 
1607 static const struct ctl_table fs_pipe_sysctls[] = {
1608 	{
1609 		.procname	= "pipe-max-size",
1610 		.data		= &pipe_max_size,
1611 		.maxlen		= sizeof(pipe_max_size),
1612 		.mode		= 0644,
1613 		.proc_handler	= proc_dopipe_max_size,
1614 		.extra1		= SYSCTL_ONE,
1615 	},
1616 	{
1617 		.procname	= "pipe-user-pages-hard",
1618 		.data		= &pipe_user_pages_hard,
1619 		.maxlen		= sizeof(pipe_user_pages_hard),
1620 		.mode		= 0644,
1621 		.proc_handler	= proc_doulongvec_minmax,
1622 	},
1623 	{
1624 		.procname	= "pipe-user-pages-soft",
1625 		.data		= &pipe_user_pages_soft,
1626 		.maxlen		= sizeof(pipe_user_pages_soft),
1627 		.mode		= 0644,
1628 		.proc_handler	= proc_doulongvec_minmax,
1629 	},
1630 };
1631 #endif
1632 
1633 static int __init init_pipe_fs(void)
1634 {
1635 	int err = register_filesystem(&pipe_fs_type);
1636 
1637 	if (!err) {
1638 		pipe_mnt = kern_mount(&pipe_fs_type);
1639 		if (IS_ERR(pipe_mnt)) {
1640 			err = PTR_ERR(pipe_mnt);
1641 			unregister_filesystem(&pipe_fs_type);
1642 		}
1643 	}
1644 #ifdef CONFIG_SYSCTL
1645 	register_sysctl_init("fs", fs_pipe_sysctls);
1646 #endif
1647 	return err;
1648 }
1649 
1650 fs_initcall(init_pipe_fs);
1651