xref: /linux/fs/pipe.c (revision 530fe6bf0f9ff91e5156f0423ae8db8d106d0159)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  linux/fs/pipe.c
4   *
5   *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
6   */
7  
8  #include <linux/mm.h>
9  #include <linux/file.h>
10  #include <linux/poll.h>
11  #include <linux/slab.h>
12  #include <linux/module.h>
13  #include <linux/init.h>
14  #include <linux/fs.h>
15  #include <linux/log2.h>
16  #include <linux/mount.h>
17  #include <linux/pseudo_fs.h>
18  #include <linux/magic.h>
19  #include <linux/pipe_fs_i.h>
20  #include <linux/uio.h>
21  #include <linux/highmem.h>
22  #include <linux/pagemap.h>
23  #include <linux/audit.h>
24  #include <linux/syscalls.h>
25  #include <linux/fcntl.h>
26  #include <linux/memcontrol.h>
27  #include <linux/watch_queue.h>
28  
29  #include <linux/uaccess.h>
30  #include <asm/ioctls.h>
31  
32  #include "internal.h"
33  
34  /*
35   * The max size that a non-root user is allowed to grow the pipe. Can
36   * be set by root in /proc/sys/fs/pipe-max-size
37   */
38  unsigned int pipe_max_size = 1048576;
39  
40  /* Maximum allocatable pages per user. Hard limit is unset by default, soft
41   * matches default values.
42   */
43  unsigned long pipe_user_pages_hard;
44  unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
45  
46  /*
47   * We use head and tail indices that aren't masked off, except at the point of
48   * dereference, but rather they're allowed to wrap naturally.  This means there
49   * isn't a dead spot in the buffer, but the ring has to be a power of two and
50   * <= 2^31.
51   * -- David Howells 2019-09-23.
52   *
53   * Reads with count = 0 should always return 0.
54   * -- Julian Bradfield 1999-06-07.
55   *
56   * FIFOs and Pipes now generate SIGIO for both readers and writers.
57   * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
58   *
59   * pipe_read & write cleanup
60   * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
61   */
62  
63  static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
64  {
65  	if (pipe->files)
66  		mutex_lock_nested(&pipe->mutex, subclass);
67  }
68  
69  void pipe_lock(struct pipe_inode_info *pipe)
70  {
71  	/*
72  	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
73  	 */
74  	pipe_lock_nested(pipe, I_MUTEX_PARENT);
75  }
76  EXPORT_SYMBOL(pipe_lock);
77  
78  void pipe_unlock(struct pipe_inode_info *pipe)
79  {
80  	if (pipe->files)
81  		mutex_unlock(&pipe->mutex);
82  }
83  EXPORT_SYMBOL(pipe_unlock);
84  
85  static inline void __pipe_lock(struct pipe_inode_info *pipe)
86  {
87  	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
88  }
89  
90  static inline void __pipe_unlock(struct pipe_inode_info *pipe)
91  {
92  	mutex_unlock(&pipe->mutex);
93  }
94  
95  void pipe_double_lock(struct pipe_inode_info *pipe1,
96  		      struct pipe_inode_info *pipe2)
97  {
98  	BUG_ON(pipe1 == pipe2);
99  
100  	if (pipe1 < pipe2) {
101  		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
102  		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
103  	} else {
104  		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
105  		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
106  	}
107  }
108  
109  static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
110  				  struct pipe_buffer *buf)
111  {
112  	struct page *page = buf->page;
113  
114  	/*
115  	 * If nobody else uses this page, and we don't already have a
116  	 * temporary page, let's keep track of it as a one-deep
117  	 * allocation cache. (Otherwise just release our reference to it)
118  	 */
119  	if (page_count(page) == 1 && !pipe->tmp_page)
120  		pipe->tmp_page = page;
121  	else
122  		put_page(page);
123  }
124  
125  static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
126  		struct pipe_buffer *buf)
127  {
128  	struct page *page = buf->page;
129  
130  	if (page_count(page) != 1)
131  		return false;
132  	memcg_kmem_uncharge_page(page, 0);
133  	__SetPageLocked(page);
134  	return true;
135  }
136  
137  /**
138   * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
139   * @pipe:	the pipe that the buffer belongs to
140   * @buf:	the buffer to attempt to steal
141   *
142   * Description:
143   *	This function attempts to steal the &struct page attached to
144   *	@buf. If successful, this function returns 0 and returns with
145   *	the page locked. The caller may then reuse the page for whatever
146   *	he wishes; the typical use is insertion into a different file
147   *	page cache.
148   */
149  bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
150  		struct pipe_buffer *buf)
151  {
152  	struct page *page = buf->page;
153  
154  	/*
155  	 * A reference of one is golden, that means that the owner of this
156  	 * page is the only one holding a reference to it. lock the page
157  	 * and return OK.
158  	 */
159  	if (page_count(page) == 1) {
160  		lock_page(page);
161  		return true;
162  	}
163  	return false;
164  }
165  EXPORT_SYMBOL(generic_pipe_buf_try_steal);
166  
167  /**
168   * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
169   * @pipe:	the pipe that the buffer belongs to
170   * @buf:	the buffer to get a reference to
171   *
172   * Description:
173   *	This function grabs an extra reference to @buf. It's used in
174   *	in the tee() system call, when we duplicate the buffers in one
175   *	pipe into another.
176   */
177  bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
178  {
179  	return try_get_page(buf->page);
180  }
181  EXPORT_SYMBOL(generic_pipe_buf_get);
182  
183  /**
184   * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
185   * @pipe:	the pipe that the buffer belongs to
186   * @buf:	the buffer to put a reference to
187   *
188   * Description:
189   *	This function releases a reference to @buf.
190   */
191  void generic_pipe_buf_release(struct pipe_inode_info *pipe,
192  			      struct pipe_buffer *buf)
193  {
194  	put_page(buf->page);
195  }
196  EXPORT_SYMBOL(generic_pipe_buf_release);
197  
198  static const struct pipe_buf_operations anon_pipe_buf_ops = {
199  	.release	= anon_pipe_buf_release,
200  	.try_steal	= anon_pipe_buf_try_steal,
201  	.get		= generic_pipe_buf_get,
202  };
203  
204  /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
205  static inline bool pipe_readable(const struct pipe_inode_info *pipe)
206  {
207  	unsigned int head = READ_ONCE(pipe->head);
208  	unsigned int tail = READ_ONCE(pipe->tail);
209  	unsigned int writers = READ_ONCE(pipe->writers);
210  
211  	return !pipe_empty(head, tail) || !writers;
212  }
213  
214  static ssize_t
215  pipe_read(struct kiocb *iocb, struct iov_iter *to)
216  {
217  	size_t total_len = iov_iter_count(to);
218  	struct file *filp = iocb->ki_filp;
219  	struct pipe_inode_info *pipe = filp->private_data;
220  	bool was_full, wake_next_reader = false;
221  	ssize_t ret;
222  
223  	/* Null read succeeds. */
224  	if (unlikely(total_len == 0))
225  		return 0;
226  
227  	ret = 0;
228  	__pipe_lock(pipe);
229  
230  	/*
231  	 * We only wake up writers if the pipe was full when we started
232  	 * reading in order to avoid unnecessary wakeups.
233  	 *
234  	 * But when we do wake up writers, we do so using a sync wakeup
235  	 * (WF_SYNC), because we want them to get going and generate more
236  	 * data for us.
237  	 */
238  	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
239  	for (;;) {
240  		unsigned int head = pipe->head;
241  		unsigned int tail = pipe->tail;
242  		unsigned int mask = pipe->ring_size - 1;
243  
244  #ifdef CONFIG_WATCH_QUEUE
245  		if (pipe->note_loss) {
246  			struct watch_notification n;
247  
248  			if (total_len < 8) {
249  				if (ret == 0)
250  					ret = -ENOBUFS;
251  				break;
252  			}
253  
254  			n.type = WATCH_TYPE_META;
255  			n.subtype = WATCH_META_LOSS_NOTIFICATION;
256  			n.info = watch_sizeof(n);
257  			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
258  				if (ret == 0)
259  					ret = -EFAULT;
260  				break;
261  			}
262  			ret += sizeof(n);
263  			total_len -= sizeof(n);
264  			pipe->note_loss = false;
265  		}
266  #endif
267  
268  		if (!pipe_empty(head, tail)) {
269  			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
270  			size_t chars = buf->len;
271  			size_t written;
272  			int error;
273  
274  			if (chars > total_len) {
275  				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
276  					if (ret == 0)
277  						ret = -ENOBUFS;
278  					break;
279  				}
280  				chars = total_len;
281  			}
282  
283  			error = pipe_buf_confirm(pipe, buf);
284  			if (error) {
285  				if (!ret)
286  					ret = error;
287  				break;
288  			}
289  
290  			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
291  			if (unlikely(written < chars)) {
292  				if (!ret)
293  					ret = -EFAULT;
294  				break;
295  			}
296  			ret += chars;
297  			buf->offset += chars;
298  			buf->len -= chars;
299  
300  			/* Was it a packet buffer? Clean up and exit */
301  			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
302  				total_len = chars;
303  				buf->len = 0;
304  			}
305  
306  			if (!buf->len) {
307  				pipe_buf_release(pipe, buf);
308  				spin_lock_irq(&pipe->rd_wait.lock);
309  #ifdef CONFIG_WATCH_QUEUE
310  				if (buf->flags & PIPE_BUF_FLAG_LOSS)
311  					pipe->note_loss = true;
312  #endif
313  				tail++;
314  				pipe->tail = tail;
315  				spin_unlock_irq(&pipe->rd_wait.lock);
316  			}
317  			total_len -= chars;
318  			if (!total_len)
319  				break;	/* common path: read succeeded */
320  			if (!pipe_empty(head, tail))	/* More to do? */
321  				continue;
322  		}
323  
324  		if (!pipe->writers)
325  			break;
326  		if (ret)
327  			break;
328  		if (filp->f_flags & O_NONBLOCK) {
329  			ret = -EAGAIN;
330  			break;
331  		}
332  		__pipe_unlock(pipe);
333  
334  		/*
335  		 * We only get here if we didn't actually read anything.
336  		 *
337  		 * However, we could have seen (and removed) a zero-sized
338  		 * pipe buffer, and might have made space in the buffers
339  		 * that way.
340  		 *
341  		 * You can't make zero-sized pipe buffers by doing an empty
342  		 * write (not even in packet mode), but they can happen if
343  		 * the writer gets an EFAULT when trying to fill a buffer
344  		 * that already got allocated and inserted in the buffer
345  		 * array.
346  		 *
347  		 * So we still need to wake up any pending writers in the
348  		 * _very_ unlikely case that the pipe was full, but we got
349  		 * no data.
350  		 */
351  		if (unlikely(was_full)) {
352  			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
353  			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
354  		}
355  
356  		/*
357  		 * But because we didn't read anything, at this point we can
358  		 * just return directly with -ERESTARTSYS if we're interrupted,
359  		 * since we've done any required wakeups and there's no need
360  		 * to mark anything accessed. And we've dropped the lock.
361  		 */
362  		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
363  			return -ERESTARTSYS;
364  
365  		__pipe_lock(pipe);
366  		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
367  		wake_next_reader = true;
368  	}
369  	if (pipe_empty(pipe->head, pipe->tail))
370  		wake_next_reader = false;
371  	__pipe_unlock(pipe);
372  
373  	if (was_full) {
374  		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
375  		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
376  	}
377  	if (wake_next_reader)
378  		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
379  	if (ret > 0)
380  		file_accessed(filp);
381  	return ret;
382  }
383  
384  static inline int is_packetized(struct file *file)
385  {
386  	return (file->f_flags & O_DIRECT) != 0;
387  }
388  
389  /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
390  static inline bool pipe_writable(const struct pipe_inode_info *pipe)
391  {
392  	unsigned int head = READ_ONCE(pipe->head);
393  	unsigned int tail = READ_ONCE(pipe->tail);
394  	unsigned int max_usage = READ_ONCE(pipe->max_usage);
395  
396  	return !pipe_full(head, tail, max_usage) ||
397  		!READ_ONCE(pipe->readers);
398  }
399  
400  static ssize_t
401  pipe_write(struct kiocb *iocb, struct iov_iter *from)
402  {
403  	struct file *filp = iocb->ki_filp;
404  	struct pipe_inode_info *pipe = filp->private_data;
405  	unsigned int head;
406  	ssize_t ret = 0;
407  	size_t total_len = iov_iter_count(from);
408  	ssize_t chars;
409  	bool was_empty = false;
410  	bool wake_next_writer = false;
411  
412  	/* Null write succeeds. */
413  	if (unlikely(total_len == 0))
414  		return 0;
415  
416  	__pipe_lock(pipe);
417  
418  	if (!pipe->readers) {
419  		send_sig(SIGPIPE, current, 0);
420  		ret = -EPIPE;
421  		goto out;
422  	}
423  
424  #ifdef CONFIG_WATCH_QUEUE
425  	if (pipe->watch_queue) {
426  		ret = -EXDEV;
427  		goto out;
428  	}
429  #endif
430  
431  	/*
432  	 * Only wake up if the pipe started out empty, since
433  	 * otherwise there should be no readers waiting.
434  	 *
435  	 * If it wasn't empty we try to merge new data into
436  	 * the last buffer.
437  	 *
438  	 * That naturally merges small writes, but it also
439  	 * page-aligs the rest of the writes for large writes
440  	 * spanning multiple pages.
441  	 */
442  	head = pipe->head;
443  	was_empty = pipe_empty(head, pipe->tail);
444  	chars = total_len & (PAGE_SIZE-1);
445  	if (chars && !was_empty) {
446  		unsigned int mask = pipe->ring_size - 1;
447  		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
448  		int offset = buf->offset + buf->len;
449  
450  		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
451  		    offset + chars <= PAGE_SIZE) {
452  			ret = pipe_buf_confirm(pipe, buf);
453  			if (ret)
454  				goto out;
455  
456  			ret = copy_page_from_iter(buf->page, offset, chars, from);
457  			if (unlikely(ret < chars)) {
458  				ret = -EFAULT;
459  				goto out;
460  			}
461  
462  			buf->len += ret;
463  			if (!iov_iter_count(from))
464  				goto out;
465  		}
466  	}
467  
468  	for (;;) {
469  		if (!pipe->readers) {
470  			send_sig(SIGPIPE, current, 0);
471  			if (!ret)
472  				ret = -EPIPE;
473  			break;
474  		}
475  
476  		head = pipe->head;
477  		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
478  			unsigned int mask = pipe->ring_size - 1;
479  			struct pipe_buffer *buf = &pipe->bufs[head & mask];
480  			struct page *page = pipe->tmp_page;
481  			int copied;
482  
483  			if (!page) {
484  				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
485  				if (unlikely(!page)) {
486  					ret = ret ? : -ENOMEM;
487  					break;
488  				}
489  				pipe->tmp_page = page;
490  			}
491  
492  			/* Allocate a slot in the ring in advance and attach an
493  			 * empty buffer.  If we fault or otherwise fail to use
494  			 * it, either the reader will consume it or it'll still
495  			 * be there for the next write.
496  			 */
497  			spin_lock_irq(&pipe->rd_wait.lock);
498  
499  			head = pipe->head;
500  			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
501  				spin_unlock_irq(&pipe->rd_wait.lock);
502  				continue;
503  			}
504  
505  			pipe->head = head + 1;
506  			spin_unlock_irq(&pipe->rd_wait.lock);
507  
508  			/* Insert it into the buffer array */
509  			buf = &pipe->bufs[head & mask];
510  			buf->page = page;
511  			buf->ops = &anon_pipe_buf_ops;
512  			buf->offset = 0;
513  			buf->len = 0;
514  			if (is_packetized(filp))
515  				buf->flags = PIPE_BUF_FLAG_PACKET;
516  			else
517  				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
518  			pipe->tmp_page = NULL;
519  
520  			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
521  			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
522  				if (!ret)
523  					ret = -EFAULT;
524  				break;
525  			}
526  			ret += copied;
527  			buf->offset = 0;
528  			buf->len = copied;
529  
530  			if (!iov_iter_count(from))
531  				break;
532  		}
533  
534  		if (!pipe_full(head, pipe->tail, pipe->max_usage))
535  			continue;
536  
537  		/* Wait for buffer space to become available. */
538  		if (filp->f_flags & O_NONBLOCK) {
539  			if (!ret)
540  				ret = -EAGAIN;
541  			break;
542  		}
543  		if (signal_pending(current)) {
544  			if (!ret)
545  				ret = -ERESTARTSYS;
546  			break;
547  		}
548  
549  		/*
550  		 * We're going to release the pipe lock and wait for more
551  		 * space. We wake up any readers if necessary, and then
552  		 * after waiting we need to re-check whether the pipe
553  		 * become empty while we dropped the lock.
554  		 */
555  		__pipe_unlock(pipe);
556  		if (was_empty) {
557  			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
558  			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
559  		}
560  		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
561  		__pipe_lock(pipe);
562  		was_empty = pipe_empty(pipe->head, pipe->tail);
563  		wake_next_writer = true;
564  	}
565  out:
566  	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
567  		wake_next_writer = false;
568  	__pipe_unlock(pipe);
569  
570  	/*
571  	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
572  	 * want the reader to start processing things asap, rather than
573  	 * leave the data pending.
574  	 *
575  	 * This is particularly important for small writes, because of
576  	 * how (for example) the GNU make jobserver uses small writes to
577  	 * wake up pending jobs
578  	 */
579  	if (was_empty) {
580  		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
581  		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
582  	}
583  	if (wake_next_writer)
584  		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
585  	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
586  		int err = file_update_time(filp);
587  		if (err)
588  			ret = err;
589  		sb_end_write(file_inode(filp)->i_sb);
590  	}
591  	return ret;
592  }
593  
594  static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
595  {
596  	struct pipe_inode_info *pipe = filp->private_data;
597  	int count, head, tail, mask;
598  
599  	switch (cmd) {
600  	case FIONREAD:
601  		__pipe_lock(pipe);
602  		count = 0;
603  		head = pipe->head;
604  		tail = pipe->tail;
605  		mask = pipe->ring_size - 1;
606  
607  		while (tail != head) {
608  			count += pipe->bufs[tail & mask].len;
609  			tail++;
610  		}
611  		__pipe_unlock(pipe);
612  
613  		return put_user(count, (int __user *)arg);
614  
615  #ifdef CONFIG_WATCH_QUEUE
616  	case IOC_WATCH_QUEUE_SET_SIZE: {
617  		int ret;
618  		__pipe_lock(pipe);
619  		ret = watch_queue_set_size(pipe, arg);
620  		__pipe_unlock(pipe);
621  		return ret;
622  	}
623  
624  	case IOC_WATCH_QUEUE_SET_FILTER:
625  		return watch_queue_set_filter(
626  			pipe, (struct watch_notification_filter __user *)arg);
627  #endif
628  
629  	default:
630  		return -ENOIOCTLCMD;
631  	}
632  }
633  
634  /* No kernel lock held - fine */
635  static __poll_t
636  pipe_poll(struct file *filp, poll_table *wait)
637  {
638  	__poll_t mask;
639  	struct pipe_inode_info *pipe = filp->private_data;
640  	unsigned int head, tail;
641  
642  	/*
643  	 * Reading pipe state only -- no need for acquiring the semaphore.
644  	 *
645  	 * But because this is racy, the code has to add the
646  	 * entry to the poll table _first_ ..
647  	 */
648  	if (filp->f_mode & FMODE_READ)
649  		poll_wait(filp, &pipe->rd_wait, wait);
650  	if (filp->f_mode & FMODE_WRITE)
651  		poll_wait(filp, &pipe->wr_wait, wait);
652  
653  	/*
654  	 * .. and only then can you do the racy tests. That way,
655  	 * if something changes and you got it wrong, the poll
656  	 * table entry will wake you up and fix it.
657  	 */
658  	head = READ_ONCE(pipe->head);
659  	tail = READ_ONCE(pipe->tail);
660  
661  	mask = 0;
662  	if (filp->f_mode & FMODE_READ) {
663  		if (!pipe_empty(head, tail))
664  			mask |= EPOLLIN | EPOLLRDNORM;
665  		if (!pipe->writers && filp->f_version != pipe->w_counter)
666  			mask |= EPOLLHUP;
667  	}
668  
669  	if (filp->f_mode & FMODE_WRITE) {
670  		if (!pipe_full(head, tail, pipe->max_usage))
671  			mask |= EPOLLOUT | EPOLLWRNORM;
672  		/*
673  		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
674  		 * behave exactly like pipes for poll().
675  		 */
676  		if (!pipe->readers)
677  			mask |= EPOLLERR;
678  	}
679  
680  	return mask;
681  }
682  
683  static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
684  {
685  	int kill = 0;
686  
687  	spin_lock(&inode->i_lock);
688  	if (!--pipe->files) {
689  		inode->i_pipe = NULL;
690  		kill = 1;
691  	}
692  	spin_unlock(&inode->i_lock);
693  
694  	if (kill)
695  		free_pipe_info(pipe);
696  }
697  
698  static int
699  pipe_release(struct inode *inode, struct file *file)
700  {
701  	struct pipe_inode_info *pipe = file->private_data;
702  
703  	__pipe_lock(pipe);
704  	if (file->f_mode & FMODE_READ)
705  		pipe->readers--;
706  	if (file->f_mode & FMODE_WRITE)
707  		pipe->writers--;
708  
709  	/* Was that the last reader or writer, but not the other side? */
710  	if (!pipe->readers != !pipe->writers) {
711  		wake_up_interruptible_all(&pipe->rd_wait);
712  		wake_up_interruptible_all(&pipe->wr_wait);
713  		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
714  		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
715  	}
716  	__pipe_unlock(pipe);
717  
718  	put_pipe_info(inode, pipe);
719  	return 0;
720  }
721  
722  static int
723  pipe_fasync(int fd, struct file *filp, int on)
724  {
725  	struct pipe_inode_info *pipe = filp->private_data;
726  	int retval = 0;
727  
728  	__pipe_lock(pipe);
729  	if (filp->f_mode & FMODE_READ)
730  		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
731  	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
732  		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
733  		if (retval < 0 && (filp->f_mode & FMODE_READ))
734  			/* this can happen only if on == T */
735  			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
736  	}
737  	__pipe_unlock(pipe);
738  	return retval;
739  }
740  
741  unsigned long account_pipe_buffers(struct user_struct *user,
742  				   unsigned long old, unsigned long new)
743  {
744  	return atomic_long_add_return(new - old, &user->pipe_bufs);
745  }
746  
747  bool too_many_pipe_buffers_soft(unsigned long user_bufs)
748  {
749  	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
750  
751  	return soft_limit && user_bufs > soft_limit;
752  }
753  
754  bool too_many_pipe_buffers_hard(unsigned long user_bufs)
755  {
756  	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
757  
758  	return hard_limit && user_bufs > hard_limit;
759  }
760  
761  bool pipe_is_unprivileged_user(void)
762  {
763  	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
764  }
765  
766  struct pipe_inode_info *alloc_pipe_info(void)
767  {
768  	struct pipe_inode_info *pipe;
769  	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
770  	struct user_struct *user = get_current_user();
771  	unsigned long user_bufs;
772  	unsigned int max_size = READ_ONCE(pipe_max_size);
773  
774  	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
775  	if (pipe == NULL)
776  		goto out_free_uid;
777  
778  	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
779  		pipe_bufs = max_size >> PAGE_SHIFT;
780  
781  	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
782  
783  	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
784  		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
785  		pipe_bufs = 1;
786  	}
787  
788  	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
789  		goto out_revert_acct;
790  
791  	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
792  			     GFP_KERNEL_ACCOUNT);
793  
794  	if (pipe->bufs) {
795  		init_waitqueue_head(&pipe->rd_wait);
796  		init_waitqueue_head(&pipe->wr_wait);
797  		pipe->r_counter = pipe->w_counter = 1;
798  		pipe->max_usage = pipe_bufs;
799  		pipe->ring_size = pipe_bufs;
800  		pipe->nr_accounted = pipe_bufs;
801  		pipe->user = user;
802  		mutex_init(&pipe->mutex);
803  		return pipe;
804  	}
805  
806  out_revert_acct:
807  	(void) account_pipe_buffers(user, pipe_bufs, 0);
808  	kfree(pipe);
809  out_free_uid:
810  	free_uid(user);
811  	return NULL;
812  }
813  
814  void free_pipe_info(struct pipe_inode_info *pipe)
815  {
816  	int i;
817  
818  #ifdef CONFIG_WATCH_QUEUE
819  	if (pipe->watch_queue) {
820  		watch_queue_clear(pipe->watch_queue);
821  		put_watch_queue(pipe->watch_queue);
822  	}
823  #endif
824  
825  	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
826  	free_uid(pipe->user);
827  	for (i = 0; i < pipe->ring_size; i++) {
828  		struct pipe_buffer *buf = pipe->bufs + i;
829  		if (buf->ops)
830  			pipe_buf_release(pipe, buf);
831  	}
832  	if (pipe->tmp_page)
833  		__free_page(pipe->tmp_page);
834  	kfree(pipe->bufs);
835  	kfree(pipe);
836  }
837  
838  static struct vfsmount *pipe_mnt __read_mostly;
839  
840  /*
841   * pipefs_dname() is called from d_path().
842   */
843  static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
844  {
845  	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
846  				d_inode(dentry)->i_ino);
847  }
848  
849  static const struct dentry_operations pipefs_dentry_operations = {
850  	.d_dname	= pipefs_dname,
851  };
852  
853  static struct inode * get_pipe_inode(void)
854  {
855  	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
856  	struct pipe_inode_info *pipe;
857  
858  	if (!inode)
859  		goto fail_inode;
860  
861  	inode->i_ino = get_next_ino();
862  
863  	pipe = alloc_pipe_info();
864  	if (!pipe)
865  		goto fail_iput;
866  
867  	inode->i_pipe = pipe;
868  	pipe->files = 2;
869  	pipe->readers = pipe->writers = 1;
870  	inode->i_fop = &pipefifo_fops;
871  
872  	/*
873  	 * Mark the inode dirty from the very beginning,
874  	 * that way it will never be moved to the dirty
875  	 * list because "mark_inode_dirty()" will think
876  	 * that it already _is_ on the dirty list.
877  	 */
878  	inode->i_state = I_DIRTY;
879  	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
880  	inode->i_uid = current_fsuid();
881  	inode->i_gid = current_fsgid();
882  	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
883  
884  	return inode;
885  
886  fail_iput:
887  	iput(inode);
888  
889  fail_inode:
890  	return NULL;
891  }
892  
893  int create_pipe_files(struct file **res, int flags)
894  {
895  	struct inode *inode = get_pipe_inode();
896  	struct file *f;
897  	int error;
898  
899  	if (!inode)
900  		return -ENFILE;
901  
902  	if (flags & O_NOTIFICATION_PIPE) {
903  		error = watch_queue_init(inode->i_pipe);
904  		if (error) {
905  			free_pipe_info(inode->i_pipe);
906  			iput(inode);
907  			return error;
908  		}
909  	}
910  
911  	f = alloc_file_pseudo(inode, pipe_mnt, "",
912  				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
913  				&pipefifo_fops);
914  	if (IS_ERR(f)) {
915  		free_pipe_info(inode->i_pipe);
916  		iput(inode);
917  		return PTR_ERR(f);
918  	}
919  
920  	f->private_data = inode->i_pipe;
921  
922  	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
923  				  &pipefifo_fops);
924  	if (IS_ERR(res[0])) {
925  		put_pipe_info(inode, inode->i_pipe);
926  		fput(f);
927  		return PTR_ERR(res[0]);
928  	}
929  	res[0]->private_data = inode->i_pipe;
930  	res[1] = f;
931  	stream_open(inode, res[0]);
932  	stream_open(inode, res[1]);
933  	return 0;
934  }
935  
936  static int __do_pipe_flags(int *fd, struct file **files, int flags)
937  {
938  	int error;
939  	int fdw, fdr;
940  
941  	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
942  		return -EINVAL;
943  
944  	error = create_pipe_files(files, flags);
945  	if (error)
946  		return error;
947  
948  	error = get_unused_fd_flags(flags);
949  	if (error < 0)
950  		goto err_read_pipe;
951  	fdr = error;
952  
953  	error = get_unused_fd_flags(flags);
954  	if (error < 0)
955  		goto err_fdr;
956  	fdw = error;
957  
958  	audit_fd_pair(fdr, fdw);
959  	fd[0] = fdr;
960  	fd[1] = fdw;
961  	return 0;
962  
963   err_fdr:
964  	put_unused_fd(fdr);
965   err_read_pipe:
966  	fput(files[0]);
967  	fput(files[1]);
968  	return error;
969  }
970  
971  int do_pipe_flags(int *fd, int flags)
972  {
973  	struct file *files[2];
974  	int error = __do_pipe_flags(fd, files, flags);
975  	if (!error) {
976  		fd_install(fd[0], files[0]);
977  		fd_install(fd[1], files[1]);
978  	}
979  	return error;
980  }
981  
982  /*
983   * sys_pipe() is the normal C calling standard for creating
984   * a pipe. It's not the way Unix traditionally does this, though.
985   */
986  static int do_pipe2(int __user *fildes, int flags)
987  {
988  	struct file *files[2];
989  	int fd[2];
990  	int error;
991  
992  	error = __do_pipe_flags(fd, files, flags);
993  	if (!error) {
994  		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
995  			fput(files[0]);
996  			fput(files[1]);
997  			put_unused_fd(fd[0]);
998  			put_unused_fd(fd[1]);
999  			error = -EFAULT;
1000  		} else {
1001  			fd_install(fd[0], files[0]);
1002  			fd_install(fd[1], files[1]);
1003  		}
1004  	}
1005  	return error;
1006  }
1007  
1008  SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
1009  {
1010  	return do_pipe2(fildes, flags);
1011  }
1012  
1013  SYSCALL_DEFINE1(pipe, int __user *, fildes)
1014  {
1015  	return do_pipe2(fildes, 0);
1016  }
1017  
1018  /*
1019   * This is the stupid "wait for pipe to be readable or writable"
1020   * model.
1021   *
1022   * See pipe_read/write() for the proper kind of exclusive wait,
1023   * but that requires that we wake up any other readers/writers
1024   * if we then do not end up reading everything (ie the whole
1025   * "wake_next_reader/writer" logic in pipe_read/write()).
1026   */
1027  void pipe_wait_readable(struct pipe_inode_info *pipe)
1028  {
1029  	pipe_unlock(pipe);
1030  	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
1031  	pipe_lock(pipe);
1032  }
1033  
1034  void pipe_wait_writable(struct pipe_inode_info *pipe)
1035  {
1036  	pipe_unlock(pipe);
1037  	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
1038  	pipe_lock(pipe);
1039  }
1040  
1041  /*
1042   * This depends on both the wait (here) and the wakeup (wake_up_partner)
1043   * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
1044   * race with the count check and waitqueue prep.
1045   *
1046   * Normally in order to avoid races, you'd do the prepare_to_wait() first,
1047   * then check the condition you're waiting for, and only then sleep. But
1048   * because of the pipe lock, we can check the condition before being on
1049   * the wait queue.
1050   *
1051   * We use the 'rd_wait' waitqueue for pipe partner waiting.
1052   */
1053  static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
1054  {
1055  	DEFINE_WAIT(rdwait);
1056  	int cur = *cnt;
1057  
1058  	while (cur == *cnt) {
1059  		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
1060  		pipe_unlock(pipe);
1061  		schedule();
1062  		finish_wait(&pipe->rd_wait, &rdwait);
1063  		pipe_lock(pipe);
1064  		if (signal_pending(current))
1065  			break;
1066  	}
1067  	return cur == *cnt ? -ERESTARTSYS : 0;
1068  }
1069  
1070  static void wake_up_partner(struct pipe_inode_info *pipe)
1071  {
1072  	wake_up_interruptible_all(&pipe->rd_wait);
1073  }
1074  
1075  static int fifo_open(struct inode *inode, struct file *filp)
1076  {
1077  	struct pipe_inode_info *pipe;
1078  	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
1079  	int ret;
1080  
1081  	filp->f_version = 0;
1082  
1083  	spin_lock(&inode->i_lock);
1084  	if (inode->i_pipe) {
1085  		pipe = inode->i_pipe;
1086  		pipe->files++;
1087  		spin_unlock(&inode->i_lock);
1088  	} else {
1089  		spin_unlock(&inode->i_lock);
1090  		pipe = alloc_pipe_info();
1091  		if (!pipe)
1092  			return -ENOMEM;
1093  		pipe->files = 1;
1094  		spin_lock(&inode->i_lock);
1095  		if (unlikely(inode->i_pipe)) {
1096  			inode->i_pipe->files++;
1097  			spin_unlock(&inode->i_lock);
1098  			free_pipe_info(pipe);
1099  			pipe = inode->i_pipe;
1100  		} else {
1101  			inode->i_pipe = pipe;
1102  			spin_unlock(&inode->i_lock);
1103  		}
1104  	}
1105  	filp->private_data = pipe;
1106  	/* OK, we have a pipe and it's pinned down */
1107  
1108  	__pipe_lock(pipe);
1109  
1110  	/* We can only do regular read/write on fifos */
1111  	stream_open(inode, filp);
1112  
1113  	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
1114  	case FMODE_READ:
1115  	/*
1116  	 *  O_RDONLY
1117  	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
1118  	 *  opened, even when there is no process writing the FIFO.
1119  	 */
1120  		pipe->r_counter++;
1121  		if (pipe->readers++ == 0)
1122  			wake_up_partner(pipe);
1123  
1124  		if (!is_pipe && !pipe->writers) {
1125  			if ((filp->f_flags & O_NONBLOCK)) {
1126  				/* suppress EPOLLHUP until we have
1127  				 * seen a writer */
1128  				filp->f_version = pipe->w_counter;
1129  			} else {
1130  				if (wait_for_partner(pipe, &pipe->w_counter))
1131  					goto err_rd;
1132  			}
1133  		}
1134  		break;
1135  
1136  	case FMODE_WRITE:
1137  	/*
1138  	 *  O_WRONLY
1139  	 *  POSIX.1 says that O_NONBLOCK means return -1 with
1140  	 *  errno=ENXIO when there is no process reading the FIFO.
1141  	 */
1142  		ret = -ENXIO;
1143  		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
1144  			goto err;
1145  
1146  		pipe->w_counter++;
1147  		if (!pipe->writers++)
1148  			wake_up_partner(pipe);
1149  
1150  		if (!is_pipe && !pipe->readers) {
1151  			if (wait_for_partner(pipe, &pipe->r_counter))
1152  				goto err_wr;
1153  		}
1154  		break;
1155  
1156  	case FMODE_READ | FMODE_WRITE:
1157  	/*
1158  	 *  O_RDWR
1159  	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
1160  	 *  This implementation will NEVER block on a O_RDWR open, since
1161  	 *  the process can at least talk to itself.
1162  	 */
1163  
1164  		pipe->readers++;
1165  		pipe->writers++;
1166  		pipe->r_counter++;
1167  		pipe->w_counter++;
1168  		if (pipe->readers == 1 || pipe->writers == 1)
1169  			wake_up_partner(pipe);
1170  		break;
1171  
1172  	default:
1173  		ret = -EINVAL;
1174  		goto err;
1175  	}
1176  
1177  	/* Ok! */
1178  	__pipe_unlock(pipe);
1179  	return 0;
1180  
1181  err_rd:
1182  	if (!--pipe->readers)
1183  		wake_up_interruptible(&pipe->wr_wait);
1184  	ret = -ERESTARTSYS;
1185  	goto err;
1186  
1187  err_wr:
1188  	if (!--pipe->writers)
1189  		wake_up_interruptible_all(&pipe->rd_wait);
1190  	ret = -ERESTARTSYS;
1191  	goto err;
1192  
1193  err:
1194  	__pipe_unlock(pipe);
1195  
1196  	put_pipe_info(inode, pipe);
1197  	return ret;
1198  }
1199  
1200  const struct file_operations pipefifo_fops = {
1201  	.open		= fifo_open,
1202  	.llseek		= no_llseek,
1203  	.read_iter	= pipe_read,
1204  	.write_iter	= pipe_write,
1205  	.poll		= pipe_poll,
1206  	.unlocked_ioctl	= pipe_ioctl,
1207  	.release	= pipe_release,
1208  	.fasync		= pipe_fasync,
1209  };
1210  
1211  /*
1212   * Currently we rely on the pipe array holding a power-of-2 number
1213   * of pages. Returns 0 on error.
1214   */
1215  unsigned int round_pipe_size(unsigned long size)
1216  {
1217  	if (size > (1U << 31))
1218  		return 0;
1219  
1220  	/* Minimum pipe size, as required by POSIX */
1221  	if (size < PAGE_SIZE)
1222  		return PAGE_SIZE;
1223  
1224  	return roundup_pow_of_two(size);
1225  }
1226  
1227  /*
1228   * Resize the pipe ring to a number of slots.
1229   */
1230  int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
1231  {
1232  	struct pipe_buffer *bufs;
1233  	unsigned int head, tail, mask, n;
1234  
1235  	/*
1236  	 * We can shrink the pipe, if arg is greater than the ring occupancy.
1237  	 * Since we don't expect a lot of shrink+grow operations, just free and
1238  	 * allocate again like we would do for growing.  If the pipe currently
1239  	 * contains more buffers than arg, then return busy.
1240  	 */
1241  	mask = pipe->ring_size - 1;
1242  	head = pipe->head;
1243  	tail = pipe->tail;
1244  	n = pipe_occupancy(pipe->head, pipe->tail);
1245  	if (nr_slots < n)
1246  		return -EBUSY;
1247  
1248  	bufs = kcalloc(nr_slots, sizeof(*bufs),
1249  		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1250  	if (unlikely(!bufs))
1251  		return -ENOMEM;
1252  
1253  	/*
1254  	 * The pipe array wraps around, so just start the new one at zero
1255  	 * and adjust the indices.
1256  	 */
1257  	if (n > 0) {
1258  		unsigned int h = head & mask;
1259  		unsigned int t = tail & mask;
1260  		if (h > t) {
1261  			memcpy(bufs, pipe->bufs + t,
1262  			       n * sizeof(struct pipe_buffer));
1263  		} else {
1264  			unsigned int tsize = pipe->ring_size - t;
1265  			if (h > 0)
1266  				memcpy(bufs + tsize, pipe->bufs,
1267  				       h * sizeof(struct pipe_buffer));
1268  			memcpy(bufs, pipe->bufs + t,
1269  			       tsize * sizeof(struct pipe_buffer));
1270  		}
1271  	}
1272  
1273  	head = n;
1274  	tail = 0;
1275  
1276  	kfree(pipe->bufs);
1277  	pipe->bufs = bufs;
1278  	pipe->ring_size = nr_slots;
1279  	if (pipe->max_usage > nr_slots)
1280  		pipe->max_usage = nr_slots;
1281  	pipe->tail = tail;
1282  	pipe->head = head;
1283  
1284  	/* This might have made more room for writers */
1285  	wake_up_interruptible(&pipe->wr_wait);
1286  	return 0;
1287  }
1288  
1289  /*
1290   * Allocate a new array of pipe buffers and copy the info over. Returns the
1291   * pipe size if successful, or return -ERROR on error.
1292   */
1293  static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
1294  {
1295  	unsigned long user_bufs;
1296  	unsigned int nr_slots, size;
1297  	long ret = 0;
1298  
1299  #ifdef CONFIG_WATCH_QUEUE
1300  	if (pipe->watch_queue)
1301  		return -EBUSY;
1302  #endif
1303  
1304  	size = round_pipe_size(arg);
1305  	nr_slots = size >> PAGE_SHIFT;
1306  
1307  	if (!nr_slots)
1308  		return -EINVAL;
1309  
1310  	/*
1311  	 * If trying to increase the pipe capacity, check that an
1312  	 * unprivileged user is not trying to exceed various limits
1313  	 * (soft limit check here, hard limit check just below).
1314  	 * Decreasing the pipe capacity is always permitted, even
1315  	 * if the user is currently over a limit.
1316  	 */
1317  	if (nr_slots > pipe->max_usage &&
1318  			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
1319  		return -EPERM;
1320  
1321  	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
1322  
1323  	if (nr_slots > pipe->max_usage &&
1324  			(too_many_pipe_buffers_hard(user_bufs) ||
1325  			 too_many_pipe_buffers_soft(user_bufs)) &&
1326  			pipe_is_unprivileged_user()) {
1327  		ret = -EPERM;
1328  		goto out_revert_acct;
1329  	}
1330  
1331  	ret = pipe_resize_ring(pipe, nr_slots);
1332  	if (ret < 0)
1333  		goto out_revert_acct;
1334  
1335  	pipe->max_usage = nr_slots;
1336  	pipe->nr_accounted = nr_slots;
1337  	return pipe->max_usage * PAGE_SIZE;
1338  
1339  out_revert_acct:
1340  	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
1341  	return ret;
1342  }
1343  
1344  /*
1345   * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
1346   * not enough to verify that this is a pipe.
1347   */
1348  struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
1349  {
1350  	struct pipe_inode_info *pipe = file->private_data;
1351  
1352  	if (file->f_op != &pipefifo_fops || !pipe)
1353  		return NULL;
1354  #ifdef CONFIG_WATCH_QUEUE
1355  	if (for_splice && pipe->watch_queue)
1356  		return NULL;
1357  #endif
1358  	return pipe;
1359  }
1360  
1361  long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1362  {
1363  	struct pipe_inode_info *pipe;
1364  	long ret;
1365  
1366  	pipe = get_pipe_info(file, false);
1367  	if (!pipe)
1368  		return -EBADF;
1369  
1370  	__pipe_lock(pipe);
1371  
1372  	switch (cmd) {
1373  	case F_SETPIPE_SZ:
1374  		ret = pipe_set_size(pipe, arg);
1375  		break;
1376  	case F_GETPIPE_SZ:
1377  		ret = pipe->max_usage * PAGE_SIZE;
1378  		break;
1379  	default:
1380  		ret = -EINVAL;
1381  		break;
1382  	}
1383  
1384  	__pipe_unlock(pipe);
1385  	return ret;
1386  }
1387  
1388  static const struct super_operations pipefs_ops = {
1389  	.destroy_inode = free_inode_nonrcu,
1390  	.statfs = simple_statfs,
1391  };
1392  
1393  /*
1394   * pipefs should _never_ be mounted by userland - too much of security hassle,
1395   * no real gain from having the whole whorehouse mounted. So we don't need
1396   * any operations on the root directory. However, we need a non-trivial
1397   * d_name - pipe: will go nicely and kill the special-casing in procfs.
1398   */
1399  
1400  static int pipefs_init_fs_context(struct fs_context *fc)
1401  {
1402  	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
1403  	if (!ctx)
1404  		return -ENOMEM;
1405  	ctx->ops = &pipefs_ops;
1406  	ctx->dops = &pipefs_dentry_operations;
1407  	return 0;
1408  }
1409  
1410  static struct file_system_type pipe_fs_type = {
1411  	.name		= "pipefs",
1412  	.init_fs_context = pipefs_init_fs_context,
1413  	.kill_sb	= kill_anon_super,
1414  };
1415  
1416  static int __init init_pipe_fs(void)
1417  {
1418  	int err = register_filesystem(&pipe_fs_type);
1419  
1420  	if (!err) {
1421  		pipe_mnt = kern_mount(&pipe_fs_type);
1422  		if (IS_ERR(pipe_mnt)) {
1423  			err = PTR_ERR(pipe_mnt);
1424  			unregister_filesystem(&pipe_fs_type);
1425  		}
1426  	}
1427  	return err;
1428  }
1429  
1430  fs_initcall(init_pipe_fs);
1431