xref: /freebsd/sys/kern/sys_pipe.c (revision ac77b2621508c6a50ab01d07fe8d43795d908f05)
1 /*-
2  * Copyright (c) 1996 John S. Dyson
3  * Copyright (c) 2012 Giovanni Trematerra
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice immediately at the beginning of the file, without modification,
11  *    this list of conditions, and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Absolutely no warranty of function or purpose is made by the author
16  *    John S. Dyson.
17  * 4. Modifications may be freely made to this file if the above conditions
18  *    are met.
19  */
20 
21 /*
22  * This file contains a high-performance replacement for the socket-based
23  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
24  * all features of sockets, but does do everything that pipes normally
25  * do.
26  */
27 
28 /*
29  * This code has two modes of operation, a small write mode and a large
30  * write mode.  The small write mode acts like conventional pipes with
31  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
32  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
33  * and PIPE_SIZE in size, the sending process pins the underlying pages in
34  * memory, and the receiving process copies directly from these pinned pages
35  * in the sending process.
36  *
37  * If the sending process receives a signal, it is possible that it will
38  * go away, and certainly its address space can change, because control
39  * is returned back to the user-mode side.  In that case, the pipe code
40  * arranges to copy the buffer supplied by the user process, to a pageable
41  * kernel buffer, and the receiving process will grab the data from the
42  * pageable kernel buffer.  Since signals don't happen all that often,
43  * the copy operation is normally eliminated.
44  *
45  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
46  * happen for small transfers so that the system will not spend all of
47  * its time context switching.
48  *
49  * In order to limit the resource use of pipes, two sysctls exist:
50  *
51  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
52  * address space available to us in pipe_map. This value is normally
53  * autotuned, but may also be loader tuned.
54  *
55  * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
56  * memory in use by pipes.
57  *
58  * Based on how large pipekva is relative to maxpipekva, the following
59  * will happen:
60  *
61  * 0% - 50%:
62  *     New pipes are given 16K of memory backing, pipes may dynamically
63  *     grow to as large as 64K where needed.
64  * 50% - 75%:
65  *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
66  *     existing pipes may NOT grow.
67  * 75% - 100%:
68  *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
69  *     existing pipes will be shrunk down to 4K whenever possible.
70  *
71  * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
72  * that is set,  the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
73  * resize which MUST occur for reverse-direction pipes when they are
74  * first used.
75  *
76  * Additional information about the current state of pipes may be obtained
77  * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
78  * and kern.ipc.piperesizefail.
79  *
80  * Locking rules:  There are two locks present here:  A mutex, used via
81  * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
82  * the flag, as mutexes can not persist over uiomove.  The mutex
83  * exists only to guard access to the flag, and is not in itself a
84  * locking mechanism.  Also note that there is only a single mutex for
85  * both directions of a pipe.
86  *
87  * As pipelock() may have to sleep before it can acquire the flag, it
88  * is important to reread all data after a call to pipelock(); everything
89  * in the structure may have changed.
90  */
91 
92 #include <sys/param.h>
93 #include <sys/systm.h>
94 #include <sys/conf.h>
95 #include <sys/fcntl.h>
96 #include <sys/file.h>
97 #include <sys/filedesc.h>
98 #include <sys/filio.h>
99 #include <sys/kernel.h>
100 #include <sys/lock.h>
101 #include <sys/mutex.h>
102 #include <sys/ttycom.h>
103 #include <sys/stat.h>
104 #include <sys/malloc.h>
105 #include <sys/poll.h>
106 #include <sys/priv.h>
107 #include <sys/selinfo.h>
108 #include <sys/signalvar.h>
109 #include <sys/syscallsubr.h>
110 #include <sys/sysctl.h>
111 #include <sys/sysproto.h>
112 #include <sys/pipe.h>
113 #include <sys/proc.h>
114 #include <sys/vnode.h>
115 #include <sys/uio.h>
116 #include <sys/user.h>
117 #include <sys/event.h>
118 
119 #include <security/mac/mac_framework.h>
120 
121 #include <vm/vm.h>
122 #include <vm/vm_param.h>
123 #include <vm/vm_object.h>
124 #include <vm/vm_kern.h>
125 #include <vm/vm_extern.h>
126 #include <vm/pmap.h>
127 #include <vm/vm_map.h>
128 #include <vm/vm_page.h>
129 #include <vm/uma.h>
130 
131 /*
132  * Use this define if you want to disable *fancy* VM things.  Expect an
133  * approx 30% decrease in transfer rate.  This could be useful for
134  * NetBSD or OpenBSD.
135  */
136 /* #define PIPE_NODIRECT */
137 
/*
 * Select the endpoint an operation should act on: for a pipe that has
 * PIPE_TYPE_NAMED set in pipe_type the result is the pipe itself, otherwise
 * it is the pipe's pipe_peer.  The argument is expanded more than once, so
 * it must be free of side effects.
 */
#define PIPE_PEER(pipe)	\
	(((pipe)->pipe_type & PIPE_TYPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))
140 
/*
 * interfaces to the outside world
 *
 * Forward declarations of the file-operation handlers, followed by the
 * method table that kern_pipe() passes to finit() for both descriptors of
 * a new pipe.
 */
static fo_rdwr_t	pipe_read;
static fo_rdwr_t	pipe_write;
static fo_truncate_t	pipe_truncate;
static fo_ioctl_t	pipe_ioctl;
static fo_poll_t	pipe_poll;
static fo_kqfilter_t	pipe_kqfilter;
static fo_stat_t	pipe_stat;
static fo_close_t	pipe_close;
static fo_chmod_t	pipe_chmod;
static fo_chown_t	pipe_chown;
static fo_fill_kinfo_t	pipe_fill_kinfo;

/*
 * File operations vector for pipes.  It has external linkage, so it is
 * presumably also referenced from other files (not visible here).
 */
struct fileops pipeops = {
	.fo_read = pipe_read,
	.fo_write = pipe_write,
	.fo_truncate = pipe_truncate,
	.fo_ioctl = pipe_ioctl,
	.fo_poll = pipe_poll,
	.fo_kqfilter = pipe_kqfilter,
	.fo_stat = pipe_stat,
	.fo_close = pipe_close,
	.fo_chmod = pipe_chmod,
	.fo_chown = pipe_chown,
	.fo_sendfile = invfo_sendfile,		/* handler defined outside this file */
	.fo_fill_kinfo = pipe_fill_kinfo,
	.fo_cmp = file_kcmp_generic,		/* handler defined outside this file */
	.fo_flags = DFLAG_PASSABLE
};
172 
/*
 * kqueue filter callbacks and the three filter-operation tables built from
 * them.  All three tables are marked as file-descriptor based (f_isfd = 1).
 * Which table gets attached to a knote is decided by code outside this
 * part of the file (presumably pipe_kqfilter()).
 */
static void	filt_pipedetach(struct knote *kn);
static void	filt_pipedetach_notsup(struct knote *kn);
static int	filt_pipenotsup(struct knote *kn, long hint);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

/* Table pairing the "not supported" detach and event handlers. */
static struct filterops pipe_nfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach_notsup,
	.f_event = filt_pipenotsup
};
/* Table whose event handler is filt_piperead(). */
static struct filterops pipe_rfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_piperead
};
/* Table whose event handler is filt_pipewrite(). */
static struct filterops pipe_wfiltops = {
	.f_isfd = 1,
	.f_detach = filt_pipedetach,
	.f_event = filt_pipewrite
};
194 
/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 *
 * MINPIPESIZE is the fill level below which pipe_read() wakes a writer
 * that set PIPE_WANTW (write-blocking hysteresis).
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

static long amountpipekva;		/* pipe KVA in use; grown in pipespace_new() */
static int pipefragretry;		/* retries of an allocation at SMALL_PIPE_SIZE */
static int pipeallocfail;		/* failed first-time buffer allocations */
static int piperesizefail;		/* failed resizes of an existing buffer */
static int piperesizeallowed = 1;	/* pipe_read() shrinks buffers only when 1 */
static long pipe_mindirect = PIPE_MINDIRECT;	/* set via sysctl_handle_pipe_mindirect() */
static int pipebuf_reserv = 2;		/* percent of maxpipekva held back for PRIV_PIPEBUF */

/*
 * Export the limit, the usage counter and the statistics under kern.ipc.
 * Only piperesizeallowed and pipebuf_reserv are writable (CTLFLAG_RW) here;
 * maxpipekva is a read-only tunable and the rest are read-only.
 */
SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
	   &maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
	   &amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
	  &pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
	  &piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
	  &piperesizeallowed, 0, "Pipe resizing allowed");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipebuf_reserv, CTLFLAG_RW,
    &pipebuf_reserv, 0,
    "Superuser-reserved percentage of the pipe buffers space");
227 
/* Prototypes for the file-local helpers. */
static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, bool backing);
static int pipe_paircreate(struct thread *td, struct pipepair **p_pp);
static __inline int pipelock(struct pipe *cpipe, bool catch);
static __inline void pipeunlock(struct pipe *cpipe);
static void pipe_timestamp(struct timespec *tsp);
#ifndef PIPE_NODIRECT
/* Direct-write helpers; compiled out when PIPE_NODIRECT is defined. */
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);

/* Callbacks handed to uma_zcreate() in pipeinit(). */
static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int	pipe_zone_init(void *mem, int size, int flags);
static void	pipe_zone_fini(void *mem, int size);

static uma_zone_t pipe_zone;		/* zone of struct pipepair, see pipeinit() */
static struct unrhdr64 pipeino_unr;	/* source of pipe_ino values, see pipe_create() */
static dev_t pipedev_ino;		/* from devfs_alloc_cdp_inode() in pipeinit() */

/* Register pipeinit() to run during system initialization. */
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
254 
255 static void
256 pipeinit(void *dummy __unused)
257 {
258 
259 	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
260 	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
261 	    UMA_ALIGN_PTR, 0);
262 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
263 	new_unrhdr64(&pipeino_unr, 1);
264 	pipedev_ino = devfs_alloc_cdp_inode();
265 	KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
266 }
267 
268 static int
269 sysctl_handle_pipe_mindirect(SYSCTL_HANDLER_ARGS)
270 {
271 	int error = 0;
272 	long tmp_pipe_mindirect = pipe_mindirect;
273 
274 	error = sysctl_handle_long(oidp, &tmp_pipe_mindirect, arg2, req);
275 	if (error != 0 || req->newptr == NULL)
276 		return (error);
277 
278 	/*
279 	 * Don't allow pipe_mindirect to be set so low that we violate
280 	 * atomicity requirements.
281 	 */
282 	if (tmp_pipe_mindirect <= PIPE_BUF)
283 		return (EINVAL);
284 	pipe_mindirect = tmp_pipe_mindirect;
285 	return (0);
286 }
287 SYSCTL_OID(_kern_ipc, OID_AUTO, pipe_mindirect, CTLTYPE_LONG | CTLFLAG_RW,
288     &pipe_mindirect, 0, sysctl_handle_pipe_mindirect, "L",
289     "Minimum write size triggering VM optimization");
290 
291 static int
292 pipe_zone_ctor(void *mem, int size, void *arg, int flags)
293 {
294 	struct pipepair *pp;
295 	struct pipe *rpipe, *wpipe;
296 
297 	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
298 
299 	pp = (struct pipepair *)mem;
300 
301 	/*
302 	 * We zero both pipe endpoints to make sure all the kmem pointers
303 	 * are NULL, flag fields are zero'd, etc.  We timestamp both
304 	 * endpoints with the same time.
305 	 */
306 	rpipe = &pp->pp_rpipe;
307 	bzero(rpipe, sizeof(*rpipe));
308 	pipe_timestamp(&rpipe->pipe_ctime);
309 	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
310 
311 	wpipe = &pp->pp_wpipe;
312 	bzero(wpipe, sizeof(*wpipe));
313 	wpipe->pipe_ctime = rpipe->pipe_ctime;
314 	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
315 
316 	rpipe->pipe_peer = wpipe;
317 	rpipe->pipe_pair = pp;
318 	wpipe->pipe_peer = rpipe;
319 	wpipe->pipe_pair = pp;
320 
321 	/*
322 	 * Mark both endpoints as present; they will later get free'd
323 	 * one at a time.  When both are free'd, then the whole pair
324 	 * is released.
325 	 */
326 	rpipe->pipe_present = PIPE_ACTIVE;
327 	wpipe->pipe_present = PIPE_ACTIVE;
328 
329 	/*
330 	 * Eventually, the MAC Framework may initialize the label
331 	 * in ctor or init, but for now we do it elswhere to avoid
332 	 * blocking in ctor or init.
333 	 */
334 	pp->pp_label = NULL;
335 
336 	return (0);
337 }
338 
339 static int
340 pipe_zone_init(void *mem, int size, int flags)
341 {
342 	struct pipepair *pp;
343 
344 	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
345 
346 	pp = (struct pipepair *)mem;
347 
348 	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW);
349 	return (0);
350 }
351 
352 static void
353 pipe_zone_fini(void *mem, int size)
354 {
355 	struct pipepair *pp;
356 
357 	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
358 
359 	pp = (struct pipepair *)mem;
360 
361 	mtx_destroy(&pp->pp_mtx);
362 }
363 
364 static int
365 pipe_paircreate(struct thread *td, struct pipepair **p_pp)
366 {
367 	struct pipepair *pp;
368 	struct pipe *rpipe, *wpipe;
369 	int error;
370 
371 	*p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
372 #ifdef MAC
373 	/*
374 	 * The MAC label is shared between the connected endpoints.  As a
375 	 * result mac_pipe_init() and mac_pipe_create() are called once
376 	 * for the pair, and not on the endpoints.
377 	 */
378 	mac_pipe_init(pp);
379 	mac_pipe_create(td->td_ucred, pp);
380 #endif
381 	rpipe = &pp->pp_rpipe;
382 	wpipe = &pp->pp_wpipe;
383 	pp->pp_owner = crhold(td->td_ucred);
384 
385 	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
386 	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
387 
388 	/*
389 	 * Only the forward direction pipe is backed by big buffer by
390 	 * default.
391 	 */
392 	error = pipe_create(rpipe, true);
393 	if (error != 0)
394 		goto fail;
395 	error = pipe_create(wpipe, false);
396 	if (error != 0) {
397 		/*
398 		 * This cleanup leaves the pipe inode number for rpipe
399 		 * still allocated, but never used.  We do not free
400 		 * inode numbers for opened pipes, which is required
401 		 * for correctness because numbers must be unique.
402 		 * But also it avoids any memory use by the unr
403 		 * allocator, so stashing away the transient inode
404 		 * number is reasonable.
405 		 */
406 		pipe_free_kmem(rpipe);
407 		goto fail;
408 	}
409 
410 	rpipe->pipe_state |= PIPE_DIRECTOK;
411 	wpipe->pipe_state |= PIPE_DIRECTOK;
412 	return (0);
413 
414 fail:
415 	knlist_destroy(&rpipe->pipe_sel.si_note);
416 	knlist_destroy(&wpipe->pipe_sel.si_note);
417 	crfree(pp->pp_owner);
418 #ifdef MAC
419 	mac_pipe_destroy(pp);
420 #endif
421 	uma_zfree(pipe_zone, pp);
422 	return (error);
423 }
424 
425 int
426 pipe_named_ctor(struct pipe **ppipe, struct thread *td)
427 {
428 	struct pipepair *pp;
429 	int error;
430 
431 	error = pipe_paircreate(td, &pp);
432 	if (error != 0)
433 		return (error);
434 	pp->pp_rpipe.pipe_type |= PIPE_TYPE_NAMED;
435 	*ppipe = &pp->pp_rpipe;
436 	return (0);
437 }
438 
439 void
440 pipe_dtor(struct pipe *dpipe)
441 {
442 	struct pipe *peer;
443 
444 	peer = (dpipe->pipe_type & PIPE_TYPE_NAMED) != 0 ? dpipe->pipe_peer : NULL;
445 	funsetown(&dpipe->pipe_sigio);
446 	pipeclose(dpipe);
447 	if (peer != NULL) {
448 		funsetown(&peer->pipe_sigio);
449 		pipeclose(peer);
450 	}
451 }
452 
/*
 * Store the current time in *tsp.
 *
 * getnanotime() replaced vfs_timestamp here: the extra precision is not
 * needed for pipes and can hurt performance badly in virtualized
 * environments (e.g., vms on amd64 that use the rdtscp instruction).
 */
static void
pipe_timestamp(struct timespec *tsp)
{
	getnanotime(tsp);
}
466 
467 /*
468  * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
469  * the zone pick up the pieces via pipeclose().
470  */
471 int
472 kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1,
473     struct filecaps *fcaps2)
474 {
475 	struct file *rf, *wf;
476 	struct pipe *rpipe, *wpipe;
477 	struct pipepair *pp;
478 	int fd, fflags, error;
479 
480 	error = pipe_paircreate(td, &pp);
481 	if (error != 0)
482 		return (error);
483 	rpipe = &pp->pp_rpipe;
484 	wpipe = &pp->pp_wpipe;
485 	error = falloc_caps(td, &rf, &fd, flags, fcaps1);
486 	if (error) {
487 		pipeclose(rpipe);
488 		pipeclose(wpipe);
489 		return (error);
490 	}
491 	/* An extra reference on `rf' has been held for us by falloc_caps(). */
492 	fildes[0] = fd;
493 
494 	fflags = FREAD | FWRITE;
495 	if ((flags & O_NONBLOCK) != 0)
496 		fflags |= FNONBLOCK;
497 
498 	/*
499 	 * Warning: once we've gotten past allocation of the fd for the
500 	 * read-side, we can only drop the read side via fdrop() in order
501 	 * to avoid races against processes which manage to dup() the read
502 	 * side while we are blocked trying to allocate the write side.
503 	 */
504 	finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
505 	error = falloc_caps(td, &wf, &fd, flags, fcaps2);
506 	if (error) {
507 		fdclose(td, rf, fildes[0]);
508 		fdrop(rf, td);
509 		/* rpipe has been closed by fdrop(). */
510 		pipeclose(wpipe);
511 		return (error);
512 	}
513 	/* An extra reference on `wf' has been held for us by falloc_caps(). */
514 	finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
515 	fdrop(wf, td);
516 	fildes[1] = fd;
517 	fdrop(rf, td);
518 
519 	return (0);
520 }
521 
#ifdef COMPAT_FREEBSD10
/*
 * FreeBSD 10 compatible pipe entry point: the two descriptors are handed
 * back in td_retval[0] and td_retval[1] instead of being copied out.
 */
/* ARGSUSED */
int
freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused)
{
	int fds[2], rc;

	rc = kern_pipe(td, fds, 0, NULL, NULL);
	if (rc == 0) {
		td->td_retval[0] = fds[0];
		td->td_retval[1] = fds[1];
	}
	return (rc);
}
#endif
540 
541 int
542 sys_pipe2(struct thread *td, struct pipe2_args *uap)
543 {
544 	int error, fildes[2];
545 
546 	if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
547 		return (EINVAL);
548 	error = kern_pipe(td, fildes, uap->flags, NULL, NULL);
549 	if (error)
550 		return (error);
551 	error = copyout(fildes, uap->fildes, 2 * sizeof(int));
552 	if (error) {
553 		(void)kern_close(td, fildes[0]);
554 		(void)kern_close(td, fildes[1]);
555 	}
556 	return (error);
557 }
558 
/*
 * Allocate kva for pipe circular buffer, the space is pageable
 * This routine will 'realloc' the size of a pipe safely, if it fails
 * it will retain the old buffer.
 * If it fails it will return ENOMEM.
 *
 * cpipe: pipe whose circular buffer is to be (re)allocated.  The pipe
 *        mutex must not be held and no direct write may be in progress
 *        (both are asserted).
 * size:  requested buffer size in bytes; it is raised to the number of
 *        bytes currently buffered if that is larger, then page-rounded.
 *
 * Returns 0 on success, ENOMEM otherwise.  For a pipe that has no buffer
 * yet, a failed request larger than SMALL_PIPE_SIZE is retried once at
 * SMALL_PIPE_SIZE before giving up.
 */
static int
pipespace_new(struct pipe *cpipe, int size)
{
	caddr_t buffer;
	int error, cnt, firstseg;
	/* State for rate-limiting the console message below. */
	static int curfail = 0;
	static struct timeval lastfail;

	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
		("pipespace: resize of direct writes not allowed"));
retry:
	/* Never pick a size that cannot hold the data already buffered. */
	cnt = cpipe->pipe_buffer.cnt;
	if (cnt > size)
		size = cnt;

	size = round_page(size);
	buffer = (caddr_t) vm_map_min(pipe_map);

	/*
	 * Account the new buffer to the pair owner's cr_ruidinfo, bounded by
	 * the calling thread's RLIMIT_PIPEBUF.  A zero return is treated as
	 * a refusal.
	 */
	if (!chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo,
	    size, lim_cur(curthread, RLIMIT_PIPEBUF))) {
		if (cpipe->pipe_buffer.buffer == NULL &&
		    size > SMALL_PIPE_SIZE) {
			size = SMALL_PIPE_SIZE;
			goto retry;
		}
		return (ENOMEM);
	}

	vm_map_lock(pipe_map);
	/*
	 * Threads for which priv_check(PRIV_PIPEBUF) fails may not push
	 * total usage past (100 - pipebuf_reserv) percent of maxpipekva.
	 * The accounting done above is undone before retrying or failing.
	 */
	if (priv_check(curthread, PRIV_PIPEBUF) != 0 && maxpipekva / 100 *
	    (100 - pipebuf_reserv) < amountpipekva + size) {
		vm_map_unlock(pipe_map);
		chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0);
		if (cpipe->pipe_buffer.buffer == NULL &&
		    size > SMALL_PIPE_SIZE) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		return (ENOMEM);
	}
	/* Find room for the buffer in pipe_map while the map is locked. */
	error = vm_map_find_locked(pipe_map, NULL, 0, (vm_offset_t *)&buffer,
	    size, 0, VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
	vm_map_unlock(pipe_map);
	if (error != KERN_SUCCESS) {
		chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo, -size, 0);
		if (cpipe->pipe_buffer.buffer == NULL &&
		    size > SMALL_PIPE_SIZE) {
			size = SMALL_PIPE_SIZE;
			pipefragretry++;
			goto retry;
		}
		/*
		 * Final failure: count it as an allocation failure (with a
		 * rate-limited console message) when the pipe had no buffer,
		 * and as a resize failure otherwise.
		 */
		if (cpipe->pipe_buffer.buffer == NULL) {
			pipeallocfail++;
			if (ppsratecheck(&lastfail, &curfail, 1))
				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
		} else {
			piperesizefail++;
		}
		return (ENOMEM);
	}

	/* copy data, then free old resources if we're resizing */
	if (cnt > 0) {
		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
			/*
			 * The data wraps around the end of the old buffer:
			 * copy the part from 'out' to the end first, then
			 * the part from the start up to 'in'.
			 */
			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, firstseg);
			if ((cnt - firstseg) > 0)
				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
					cpipe->pipe_buffer.in);
		} else {
			/* The data is one contiguous run starting at 'out'. */
			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
				buffer, cnt);
		}
	}
	/*
	 * Release the old buffer and install the new one; the copied data
	 * now starts at offset 0.
	 */
	pipe_free_kmem(cpipe);
	cpipe->pipe_buffer.buffer = buffer;
	cpipe->pipe_buffer.size = size;
	cpipe->pipe_buffer.in = cnt;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = cnt;
	atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
	return (0);
}
651 
652 /*
653  * Wrapper for pipespace_new() that performs locking assertions.
654  */
655 static int
656 pipespace(struct pipe *cpipe, int size)
657 {
658 
659 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
660 	    ("Unlocked pipe passed to pipespace"));
661 	return (pipespace_new(cpipe, size));
662 }
663 
664 /*
665  * lock a pipe for I/O, blocking other access
666  */
667 static __inline int
668 pipelock(struct pipe *cpipe, bool catch)
669 {
670 	int error, prio;
671 
672 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
673 
674 	prio = PRIBIO;
675 	if (catch)
676 		prio |= PCATCH;
677 	while (cpipe->pipe_state & PIPE_LOCKFL) {
678 		KASSERT(cpipe->pipe_waiters >= 0,
679 		    ("%s: bad waiter count %d", __func__,
680 		    cpipe->pipe_waiters));
681 		cpipe->pipe_waiters++;
682 		error = msleep(&cpipe->pipe_waiters, PIPE_MTX(cpipe), prio,
683 		    "pipelk", 0);
684 		cpipe->pipe_waiters--;
685 		if (error != 0)
686 			return (error);
687 	}
688 	cpipe->pipe_state |= PIPE_LOCKFL;
689 	return (0);
690 }
691 
692 /*
693  * unlock a pipe I/O lock
694  */
695 static __inline void
696 pipeunlock(struct pipe *cpipe)
697 {
698 
699 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
700 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
701 		("Unlocked pipe passed to pipeunlock"));
702 	KASSERT(cpipe->pipe_waiters >= 0,
703 	    ("%s: bad waiter count %d", __func__,
704 	    cpipe->pipe_waiters));
705 	cpipe->pipe_state &= ~PIPE_LOCKFL;
706 	if (cpipe->pipe_waiters > 0)
707 		wakeup_one(&cpipe->pipe_waiters);
708 }
709 
710 void
711 pipeselwakeup(struct pipe *cpipe)
712 {
713 
714 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
715 	if (cpipe->pipe_state & PIPE_SEL) {
716 		selwakeuppri(&cpipe->pipe_sel, PSOCK);
717 		if (!SEL_WAITING(&cpipe->pipe_sel))
718 			cpipe->pipe_state &= ~PIPE_SEL;
719 	}
720 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
721 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
722 	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
723 }
724 
725 /*
726  * Initialize and allocate VM and memory for pipe.  The structure
727  * will start out zero'd from the ctor, so we just manage the kmem.
728  */
729 static int
730 pipe_create(struct pipe *pipe, bool large_backing)
731 {
732 	int error;
733 
734 	error = pipespace_new(pipe, !large_backing || amountpipekva >
735 	    maxpipekva / 2 ? SMALL_PIPE_SIZE : PIPE_SIZE);
736 	if (error == 0)
737 		pipe->pipe_ino = alloc_unr64(&pipeino_unr);
738 	return (error);
739 }
740 
/*
 * fo_read handler for pipes.
 *
 * Moves data from the pipe in fp->f_data into uio: first from the kernel
 * circular buffer and, unless PIPE_NODIRECT is defined, otherwise from the
 * pages held for a pending direct write.  With nothing available and
 * nothing read so far, the call sleeps until woken, or fails with EAGAIN
 * when FNONBLOCK is set on the file.
 *
 * Returns 0 at EOF or when the loop ends without error, otherwise the
 * error from pipelock(), msleep(), the MAC read check or the uiomove
 * routines.
 */
/* ARGSUSED */
static int
pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
	struct pipe *rpipe;
	int error;
	int nread = 0;		/* bytes moved to the caller so far */
	int size;

	rpipe = fp->f_data;

	/*
	 * Try to avoid locking the pipe if we have nothing to do.
	 *
	 * There are programs which share one pipe amongst multiple processes
	 * and perform non-blocking reads in parallel, even if the pipe is
	 * empty.  This in particular is the case with BSD make, which when
	 * spawned with a high -j number can find itself with over half of the
	 * calls failing to find anything.
	 *
	 * The shortcut is skipped when the MAC read check is enabled.
	 */
	if ((fp->f_flag & FNONBLOCK) != 0 && !mac_pipe_check_read_enabled()) {
		if (__predict_false(uio->uio_resid == 0))
			return (0);
		if ((atomic_load_short(&rpipe->pipe_state) & PIPE_EOF) == 0 &&
		    atomic_load_int(&rpipe->pipe_buffer.cnt) == 0 &&
		    atomic_load_int(&rpipe->pipe_pages.cnt) == 0)
			return (EAGAIN);
	}

	PIPE_LOCK(rpipe);
	/* Mark the pipe busy; see the PIPE_WANT processing at the end. */
	++rpipe->pipe_busy;
	error = pipelock(rpipe, true);
	if (error)
		goto unlocked_error;

#ifdef MAC
	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
	if (error)
		goto locked_error;
#endif
	/*
	 * With pipe KVA usage above 3/4 of maxpipekva, try to shrink this
	 * pipe's buffer to SMALL_PIPE_SIZE if its contents fit, no direct
	 * write is pending and resizing is allowed.  The mutex is dropped
	 * around pipespace(), and its result is deliberately ignored.
	 */
	if (amountpipekva > (3 * maxpipekva) / 4) {
		if ((rpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		    rpipe->pipe_buffer.size > SMALL_PIPE_SIZE &&
		    rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE &&
		    piperesizeallowed == 1) {
			PIPE_UNLOCK(rpipe);
			pipespace(rpipe, SMALL_PIPE_SIZE);
			PIPE_LOCK(rpipe);
		}
	}

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			/*
			 * One pass moves at most the contiguous run from
			 * 'out' to the end of the buffer, capped by the
			 * buffered count and by what the caller still wants.
			 */
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > uio->uio_resid)
				size = uio->uio_resid;

			/* The mutex is not held across uiomove(). */
			PIPE_UNLOCK(rpipe);
			error = uiomove(
			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
			    size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;

			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_pages.cnt) != 0) {
			if (size > uio->uio_resid)
				size = (u_int) uio->uio_resid;
			PIPE_UNLOCK(rpipe);
			error = uiomove_fromphys(rpipe->pipe_pages.ms,
			    rpipe->pipe_pages.pos, size, uio);
			PIPE_LOCK(rpipe);
			if (error)
				break;
			nread += size;
			rpipe->pipe_pages.pos += size;
			rpipe->pipe_pages.cnt -= size;
			/* Everything consumed: wake the sleepers on rpipe. */
			if (rpipe->pipe_pages.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 * read returns 0 on EOF, no need to set error
			 */
			if (rpipe->pipe_state & PIPE_EOF)
				break;

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
			} else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
				    PRIBIO | PCATCH,
				    "piperd", 0)) == 0)
					error = pipelock(rpipe, true);
			}
			if (error)
				goto unlocked_error;
		}
	}
#ifdef MAC
locked_error:
#endif
	pipeunlock(rpipe);

	/* XXX: should probably do this before getting any locks. */
	if (error == 0)
		pipe_timestamp(&rpipe->pipe_atime);
unlocked_error:
	/* Reached without the I/O lock flag; PIPE_UNLOCK() follows below. */
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/*
	 * Only wake up writers if there was actually something read.
	 * Otherwise, when calling read(2) at EOF, a spurious wakeup occurs.
	 */
	if (nread > 0 &&
	    rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
		pipeselwakeup(rpipe);

	PIPE_UNLOCK(rpipe);
	/* Count the read in the thread's resource usage. */
	if (nread > 0)
		td->td_ru.ru_msgrcv++;
	return (error);
}
935 
936 #ifndef PIPE_NODIRECT
937 /*
938  * Map the sending processes' buffer into kernel space and wire it.
939  * This is similar to a physical write operation.
940  */
941 static int
942 pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
943 {
944 	u_int size;
945 	int i;
946 
947 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
948 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
949 	    ("%s: PIPE_DIRECTW set on %p", __func__, wpipe));
950 	KASSERT(wpipe->pipe_pages.cnt == 0,
951 	    ("%s: pipe map for %p contains residual data", __func__, wpipe));
952 
953 	if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
954                 size = wpipe->pipe_buffer.size;
955 	else
956                 size = uio->uio_iov->iov_len;
957 
958 	wpipe->pipe_state |= PIPE_DIRECTW;
959 	PIPE_UNLOCK(wpipe);
960 	i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
961 	    (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
962 	    wpipe->pipe_pages.ms, PIPENPAGES);
963 	PIPE_LOCK(wpipe);
964 	if (i < 0) {
965 		wpipe->pipe_state &= ~PIPE_DIRECTW;
966 		return (EFAULT);
967 	}
968 
969 	wpipe->pipe_pages.npages = i;
970 	wpipe->pipe_pages.pos =
971 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
972 	wpipe->pipe_pages.cnt = size;
973 
974 	uio->uio_iov->iov_len -= size;
975 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
976 	if (uio->uio_iov->iov_len == 0) {
977 		uio->uio_iov++;
978 		uio->uio_iovcnt--;
979 	}
980 	uio->uio_resid -= size;
981 	uio->uio_offset += size;
982 	return (0);
983 }
984 
985 /*
986  * Unwire the process buffer.
987  */
988 static void
989 pipe_destroy_write_buffer(struct pipe *wpipe)
990 {
991 
992 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
993 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
994 	    ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));
995 	KASSERT(wpipe->pipe_pages.cnt == 0,
996 	    ("%s: pipe map for %p contains residual data", __func__, wpipe));
997 
998 	wpipe->pipe_state &= ~PIPE_DIRECTW;
999 	vm_page_unhold_pages(wpipe->pipe_pages.ms, wpipe->pipe_pages.npages);
1000 	wpipe->pipe_pages.npages = 0;
1001 }
1002 
1003 /*
1004  * In the case of a signal, the writing process might go away.  This
1005  * code copies the data into the circular buffer so that the source
1006  * pages can be freed without loss of data.
1007  */
1008 static void
1009 pipe_clone_write_buffer(struct pipe *wpipe)
1010 {
1011 	struct uio uio;
1012 	struct iovec iov;
1013 	int size;
1014 	int pos;
1015 
1016 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
1017 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
1018 	    ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));
1019 
1020 	size = wpipe->pipe_pages.cnt;
1021 	pos = wpipe->pipe_pages.pos;
1022 	wpipe->pipe_pages.cnt = 0;
1023 
1024 	wpipe->pipe_buffer.in = size;
1025 	wpipe->pipe_buffer.out = 0;
1026 	wpipe->pipe_buffer.cnt = size;
1027 
1028 	PIPE_UNLOCK(wpipe);
1029 	iov.iov_base = wpipe->pipe_buffer.buffer;
1030 	iov.iov_len = size;
1031 	uio.uio_iov = &iov;
1032 	uio.uio_iovcnt = 1;
1033 	uio.uio_offset = 0;
1034 	uio.uio_resid = size;
1035 	uio.uio_segflg = UIO_SYSSPACE;
1036 	uio.uio_rw = UIO_READ;
1037 	uio.uio_td = curthread;
1038 	uiomove_fromphys(wpipe->pipe_pages.ms, pos, size, &uio);
1039 	PIPE_LOCK(wpipe);
1040 	pipe_destroy_write_buffer(wpipe);
1041 }
1042 
1043 /*
1044  * This implements the pipe buffer write mechanism.  Note that only
1045  * a direct write OR a normal pipe write can be pending at any given time.
1046  * If there are any characters in the pipe buffer, the direct write will
1047  * be deferred until the receiving process grabs all of the bytes from
1048  * the pipe buffer.  Then the direct mapping write is set-up.
1049  */
1050 static int
1051 pipe_direct_write(struct pipe *wpipe, struct uio *uio)
1052 {
1053 	int error;
1054 
1055 retry:
1056 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
1057 	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
1058 		error = EPIPE;
1059 		goto error1;
1060 	}
1061 	if (wpipe->pipe_state & PIPE_DIRECTW) {
1062 		if (wpipe->pipe_state & PIPE_WANTR) {
1063 			wpipe->pipe_state &= ~PIPE_WANTR;
1064 			wakeup(wpipe);
1065 		}
1066 		pipeselwakeup(wpipe);
1067 		wpipe->pipe_state |= PIPE_WANTW;
1068 		pipeunlock(wpipe);
1069 		error = msleep(wpipe, PIPE_MTX(wpipe),
1070 		    PRIBIO | PCATCH, "pipdww", 0);
1071 		pipelock(wpipe, false);
1072 		if (error != 0)
1073 			goto error1;
1074 		goto retry;
1075 	}
1076 	if (wpipe->pipe_buffer.cnt > 0) {
1077 		if (wpipe->pipe_state & PIPE_WANTR) {
1078 			wpipe->pipe_state &= ~PIPE_WANTR;
1079 			wakeup(wpipe);
1080 		}
1081 		pipeselwakeup(wpipe);
1082 		wpipe->pipe_state |= PIPE_WANTW;
1083 		pipeunlock(wpipe);
1084 		error = msleep(wpipe, PIPE_MTX(wpipe),
1085 		    PRIBIO | PCATCH, "pipdwc", 0);
1086 		pipelock(wpipe, false);
1087 		if (error != 0)
1088 			goto error1;
1089 		goto retry;
1090 	}
1091 
1092 	error = pipe_build_write_buffer(wpipe, uio);
1093 	if (error) {
1094 		goto error1;
1095 	}
1096 
1097 	while (wpipe->pipe_pages.cnt != 0 &&
1098 	    (wpipe->pipe_state & PIPE_EOF) == 0) {
1099 		if (wpipe->pipe_state & PIPE_WANTR) {
1100 			wpipe->pipe_state &= ~PIPE_WANTR;
1101 			wakeup(wpipe);
1102 		}
1103 		pipeselwakeup(wpipe);
1104 		wpipe->pipe_state |= PIPE_WANTW;
1105 		pipeunlock(wpipe);
1106 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
1107 		    "pipdwt", 0);
1108 		pipelock(wpipe, false);
1109 		if (error != 0)
1110 			break;
1111 	}
1112 
1113 	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
1114 		wpipe->pipe_pages.cnt = 0;
1115 		pipe_destroy_write_buffer(wpipe);
1116 		pipeselwakeup(wpipe);
1117 		error = EPIPE;
1118 	} else if (error == EINTR || error == ERESTART) {
1119 		pipe_clone_write_buffer(wpipe);
1120 	} else {
1121 		pipe_destroy_write_buffer(wpipe);
1122 	}
1123 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
1124 	    ("pipe %p leaked PIPE_DIRECTW", wpipe));
1125 	return (error);
1126 
1127 error1:
1128 	wakeup(wpipe);
1129 	return (error);
1130 }
1131 #endif
1132 
1133 static int
1134 pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
1135     int flags, struct thread *td)
1136 {
1137 	struct pipe *wpipe, *rpipe;
1138 	ssize_t orig_resid;
1139 	int desiredsize, error;
1140 
1141 	rpipe = fp->f_data;
1142 	wpipe = PIPE_PEER(rpipe);
1143 	PIPE_LOCK(rpipe);
1144 	error = pipelock(wpipe, true);
1145 	if (error) {
1146 		PIPE_UNLOCK(rpipe);
1147 		return (error);
1148 	}
1149 	/*
1150 	 * detect loss of pipe read side, issue SIGPIPE if lost.
1151 	 */
1152 	if (wpipe->pipe_present != PIPE_ACTIVE ||
1153 	    (wpipe->pipe_state & PIPE_EOF)) {
1154 		pipeunlock(wpipe);
1155 		PIPE_UNLOCK(rpipe);
1156 		return (EPIPE);
1157 	}
1158 #ifdef MAC
1159 	error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
1160 	if (error) {
1161 		pipeunlock(wpipe);
1162 		PIPE_UNLOCK(rpipe);
1163 		return (error);
1164 	}
1165 #endif
1166 	++wpipe->pipe_busy;
1167 
1168 	/* Choose a larger size if it's advantageous */
1169 	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
1170 	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
1171 		if (piperesizeallowed != 1)
1172 			break;
1173 		if (amountpipekva > maxpipekva / 2)
1174 			break;
1175 		if (desiredsize == BIG_PIPE_SIZE)
1176 			break;
1177 		desiredsize = desiredsize * 2;
1178 	}
1179 
1180 	/* Choose a smaller size if we're in a OOM situation */
1181 	if (amountpipekva > (3 * maxpipekva) / 4 &&
1182 	    wpipe->pipe_buffer.size > SMALL_PIPE_SIZE &&
1183 	    wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE &&
1184 	    piperesizeallowed == 1)
1185 		desiredsize = SMALL_PIPE_SIZE;
1186 
1187 	/* Resize if the above determined that a new size was necessary */
1188 	if (desiredsize != wpipe->pipe_buffer.size &&
1189 	    (wpipe->pipe_state & PIPE_DIRECTW) == 0) {
1190 		PIPE_UNLOCK(wpipe);
1191 		pipespace(wpipe, desiredsize);
1192 		PIPE_LOCK(wpipe);
1193 	}
1194 	MPASS(wpipe->pipe_buffer.size != 0);
1195 
1196 	orig_resid = uio->uio_resid;
1197 
1198 	while (uio->uio_resid) {
1199 		int space;
1200 
1201 		if (wpipe->pipe_state & PIPE_EOF) {
1202 			error = EPIPE;
1203 			break;
1204 		}
1205 #ifndef PIPE_NODIRECT
1206 		/*
1207 		 * If the transfer is large, we can gain performance if
1208 		 * we do process-to-process copies directly.
1209 		 * If the write is non-blocking, we don't use the
1210 		 * direct write mechanism.
1211 		 *
1212 		 * The direct write mechanism will detect the reader going
1213 		 * away on us.
1214 		 */
1215 		if (uio->uio_segflg == UIO_USERSPACE &&
1216 		    uio->uio_iov->iov_len >= pipe_mindirect &&
1217 		    wpipe->pipe_buffer.size >= pipe_mindirect &&
1218 		    (fp->f_flag & FNONBLOCK) == 0) {
1219 			error = pipe_direct_write(wpipe, uio);
1220 			if (error != 0)
1221 				break;
1222 			continue;
1223 		}
1224 #endif
1225 
1226 		/*
1227 		 * Pipe buffered writes cannot be coincidental with
1228 		 * direct writes.  We wait until the currently executing
1229 		 * direct write is completed before we start filling the
1230 		 * pipe buffer.  We break out if a signal occurs or the
1231 		 * reader goes away.
1232 		 */
1233 		if (wpipe->pipe_pages.cnt != 0) {
1234 			if (wpipe->pipe_state & PIPE_WANTR) {
1235 				wpipe->pipe_state &= ~PIPE_WANTR;
1236 				wakeup(wpipe);
1237 			}
1238 			pipeselwakeup(wpipe);
1239 			wpipe->pipe_state |= PIPE_WANTW;
1240 			pipeunlock(wpipe);
1241 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1242 			    "pipbww", 0);
1243 			pipelock(wpipe, false);
1244 			if (error != 0)
1245 				break;
1246 			continue;
1247 		}
1248 
1249 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1250 
1251 		/* Writes of size <= PIPE_BUF must be atomic. */
1252 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1253 			space = 0;
1254 
1255 		if (space > 0) {
1256 			int size;	/* Transfer size */
1257 			int segsize;	/* first segment to transfer */
1258 
1259 			/*
1260 			 * Transfer size is minimum of uio transfer
1261 			 * and free space in pipe buffer.
1262 			 */
1263 			if (space > uio->uio_resid)
1264 				size = uio->uio_resid;
1265 			else
1266 				size = space;
1267 			/*
1268 			 * First segment to transfer is minimum of
1269 			 * transfer size and contiguous space in
1270 			 * pipe buffer.  If first segment to transfer
1271 			 * is less than the transfer size, we've got
1272 			 * a wraparound in the buffer.
1273 			 */
1274 			segsize = wpipe->pipe_buffer.size -
1275 				wpipe->pipe_buffer.in;
1276 			if (segsize > size)
1277 				segsize = size;
1278 
1279 			/* Transfer first segment */
1280 
1281 			PIPE_UNLOCK(rpipe);
1282 			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1283 					segsize, uio);
1284 			PIPE_LOCK(rpipe);
1285 
1286 			if (error == 0 && segsize < size) {
1287 				KASSERT(wpipe->pipe_buffer.in + segsize ==
1288 					wpipe->pipe_buffer.size,
1289 					("Pipe buffer wraparound disappeared"));
1290 				/*
1291 				 * Transfer remaining part now, to
1292 				 * support atomic writes.  Wraparound
1293 				 * happened.
1294 				 */
1295 
1296 				PIPE_UNLOCK(rpipe);
1297 				error = uiomove(
1298 				    &wpipe->pipe_buffer.buffer[0],
1299 				    size - segsize, uio);
1300 				PIPE_LOCK(rpipe);
1301 			}
1302 			if (error == 0) {
1303 				wpipe->pipe_buffer.in += size;
1304 				if (wpipe->pipe_buffer.in >=
1305 				    wpipe->pipe_buffer.size) {
1306 					KASSERT(wpipe->pipe_buffer.in ==
1307 						size - segsize +
1308 						wpipe->pipe_buffer.size,
1309 						("Expected wraparound bad"));
1310 					wpipe->pipe_buffer.in = size - segsize;
1311 				}
1312 
1313 				wpipe->pipe_buffer.cnt += size;
1314 				KASSERT(wpipe->pipe_buffer.cnt <=
1315 					wpipe->pipe_buffer.size,
1316 					("Pipe buffer overflow"));
1317 			}
1318 			if (error != 0)
1319 				break;
1320 			continue;
1321 		} else {
1322 			/*
1323 			 * If the "read-side" has been blocked, wake it up now.
1324 			 */
1325 			if (wpipe->pipe_state & PIPE_WANTR) {
1326 				wpipe->pipe_state &= ~PIPE_WANTR;
1327 				wakeup(wpipe);
1328 			}
1329 
1330 			/*
1331 			 * don't block on non-blocking I/O
1332 			 */
1333 			if (fp->f_flag & FNONBLOCK) {
1334 				error = EAGAIN;
1335 				break;
1336 			}
1337 
1338 			/*
1339 			 * We have no more space and have something to offer,
1340 			 * wake up select/poll.
1341 			 */
1342 			pipeselwakeup(wpipe);
1343 
1344 			wpipe->pipe_state |= PIPE_WANTW;
1345 			pipeunlock(wpipe);
1346 			error = msleep(wpipe, PIPE_MTX(rpipe),
1347 			    PRIBIO | PCATCH, "pipewr", 0);
1348 			pipelock(wpipe, false);
1349 			if (error != 0)
1350 				break;
1351 			continue;
1352 		}
1353 	}
1354 
1355 	--wpipe->pipe_busy;
1356 
1357 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1358 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1359 		wakeup(wpipe);
1360 	} else if (wpipe->pipe_buffer.cnt > 0) {
1361 		/*
1362 		 * If we have put any characters in the buffer, we wake up
1363 		 * the reader.
1364 		 */
1365 		if (wpipe->pipe_state & PIPE_WANTR) {
1366 			wpipe->pipe_state &= ~PIPE_WANTR;
1367 			wakeup(wpipe);
1368 		}
1369 	}
1370 
1371 	/*
1372 	 * Don't return EPIPE if any byte was written.
1373 	 * EINTR and other interrupts are handled by generic I/O layer.
1374 	 * Do not pretend that I/O succeeded for obvious user error
1375 	 * like EFAULT.
1376 	 */
1377 	if (uio->uio_resid != orig_resid && error == EPIPE)
1378 		error = 0;
1379 
1380 	if (error == 0)
1381 		pipe_timestamp(&wpipe->pipe_mtime);
1382 
1383 	/*
1384 	 * We have something to offer,
1385 	 * wake up select/poll.
1386 	 */
1387 	if (wpipe->pipe_buffer.cnt)
1388 		pipeselwakeup(wpipe);
1389 
1390 	pipeunlock(wpipe);
1391 	PIPE_UNLOCK(rpipe);
1392 	if (uio->uio_resid != orig_resid)
1393 		td->td_ru.ru_msgsnd++;
1394 	return (error);
1395 }
1396 
1397 /* ARGSUSED */
1398 static int
1399 pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred,
1400     struct thread *td)
1401 {
1402 	struct pipe *cpipe;
1403 	int error;
1404 
1405 	cpipe = fp->f_data;
1406 	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
1407 		error = vnops.fo_truncate(fp, length, active_cred, td);
1408 	else
1409 		error = invfo_truncate(fp, length, active_cred, td);
1410 	return (error);
1411 }
1412 
1413 /*
1414  * we implement a very minimal set of ioctls for compatibility with sockets.
1415  */
1416 static int
1417 pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
1418     struct thread *td)
1419 {
1420 	struct pipe *mpipe = fp->f_data;
1421 	int error;
1422 
1423 	PIPE_LOCK(mpipe);
1424 
1425 #ifdef MAC
1426 	error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1427 	if (error) {
1428 		PIPE_UNLOCK(mpipe);
1429 		return (error);
1430 	}
1431 #endif
1432 
1433 	error = 0;
1434 	switch (cmd) {
1435 	case FIONBIO:
1436 		break;
1437 
1438 	case FIOASYNC:
1439 		if (*(int *)data) {
1440 			mpipe->pipe_state |= PIPE_ASYNC;
1441 		} else {
1442 			mpipe->pipe_state &= ~PIPE_ASYNC;
1443 		}
1444 		break;
1445 
1446 	case FIONREAD:
1447 		if (!(fp->f_flag & FREAD)) {
1448 			*(int *)data = 0;
1449 			PIPE_UNLOCK(mpipe);
1450 			return (0);
1451 		}
1452 		if (mpipe->pipe_pages.cnt != 0)
1453 			*(int *)data = mpipe->pipe_pages.cnt;
1454 		else
1455 			*(int *)data = mpipe->pipe_buffer.cnt;
1456 		break;
1457 
1458 	case FIOSETOWN:
1459 		PIPE_UNLOCK(mpipe);
1460 		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
1461 		goto out_unlocked;
1462 
1463 	case FIOGETOWN:
1464 		*(int *)data = fgetown(&mpipe->pipe_sigio);
1465 		break;
1466 
1467 	/* This is deprecated, FIOSETOWN should be used instead. */
1468 	case TIOCSPGRP:
1469 		PIPE_UNLOCK(mpipe);
1470 		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
1471 		goto out_unlocked;
1472 
1473 	/* This is deprecated, FIOGETOWN should be used instead. */
1474 	case TIOCGPGRP:
1475 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1476 		break;
1477 
1478 	default:
1479 		error = ENOTTY;
1480 		break;
1481 	}
1482 	PIPE_UNLOCK(mpipe);
1483 out_unlocked:
1484 	return (error);
1485 }
1486 
1487 static int
1488 pipe_poll(struct file *fp, int events, struct ucred *active_cred,
1489     struct thread *td)
1490 {
1491 	struct pipe *rpipe;
1492 	struct pipe *wpipe;
1493 	int levents, revents;
1494 #ifdef MAC
1495 	int error;
1496 #endif
1497 
1498 	revents = 0;
1499 	rpipe = fp->f_data;
1500 	wpipe = PIPE_PEER(rpipe);
1501 	PIPE_LOCK(rpipe);
1502 #ifdef MAC
1503 	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
1504 	if (error)
1505 		goto locked_error;
1506 #endif
1507 	if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
1508 		if (rpipe->pipe_pages.cnt > 0 || rpipe->pipe_buffer.cnt > 0)
1509 			revents |= events & (POLLIN | POLLRDNORM);
1510 
1511 	if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
1512 		if (wpipe->pipe_present != PIPE_ACTIVE ||
1513 		    (wpipe->pipe_state & PIPE_EOF) ||
1514 		    ((wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
1515 		     ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
1516 			 wpipe->pipe_buffer.size == 0)))
1517 			revents |= events & (POLLOUT | POLLWRNORM);
1518 
1519 	levents = events &
1520 	    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
1521 	if (rpipe->pipe_type & PIPE_TYPE_NAMED && fp->f_flag & FREAD && levents &&
1522 	    fp->f_pipegen == rpipe->pipe_wgen)
1523 		events |= POLLINIGNEOF;
1524 
1525 	if ((events & POLLINIGNEOF) == 0) {
1526 		if (rpipe->pipe_state & PIPE_EOF) {
1527 			if (fp->f_flag & FREAD)
1528 				revents |= (events & (POLLIN | POLLRDNORM));
1529 			if (wpipe->pipe_present != PIPE_ACTIVE ||
1530 			    (wpipe->pipe_state & PIPE_EOF))
1531 				revents |= POLLHUP;
1532 		}
1533 	}
1534 
1535 	if (revents == 0) {
1536 		/*
1537 		 * Add ourselves regardless of eventmask as we have to return
1538 		 * POLLHUP even if it was not asked for.
1539 		 */
1540 		if ((fp->f_flag & FREAD) != 0) {
1541 			selrecord(td, &rpipe->pipe_sel);
1542 			if (SEL_WAITING(&rpipe->pipe_sel))
1543 				rpipe->pipe_state |= PIPE_SEL;
1544 		}
1545 
1546 		if ((fp->f_flag & FWRITE) != 0 &&
1547 		    wpipe->pipe_present == PIPE_ACTIVE) {
1548 			selrecord(td, &wpipe->pipe_sel);
1549 			if (SEL_WAITING(&wpipe->pipe_sel))
1550 				wpipe->pipe_state |= PIPE_SEL;
1551 		}
1552 	}
1553 #ifdef MAC
1554 locked_error:
1555 #endif
1556 	PIPE_UNLOCK(rpipe);
1557 
1558 	return (revents);
1559 }
1560 
1561 /*
1562  * We shouldn't need locks here as we're doing a read and this should
1563  * be a natural race.
1564  */
1565 static int
1566 pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred)
1567 {
1568 	struct pipe *pipe;
1569 #ifdef MAC
1570 	int error;
1571 #endif
1572 
1573 	pipe = fp->f_data;
1574 #ifdef MAC
1575 	if (mac_pipe_check_stat_enabled()) {
1576 		PIPE_LOCK(pipe);
1577 		error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
1578 		PIPE_UNLOCK(pipe);
1579 		if (error) {
1580 			return (error);
1581 		}
1582 	}
1583 #endif
1584 
1585 	/* For named pipes ask the underlying filesystem. */
1586 	if (pipe->pipe_type & PIPE_TYPE_NAMED) {
1587 		return (vnops.fo_stat(fp, ub, active_cred));
1588 	}
1589 
1590 	bzero(ub, sizeof(*ub));
1591 	ub->st_mode = S_IFIFO;
1592 	ub->st_blksize = PAGE_SIZE;
1593 	if (pipe->pipe_pages.cnt != 0)
1594 		ub->st_size = pipe->pipe_pages.cnt;
1595 	else
1596 		ub->st_size = pipe->pipe_buffer.cnt;
1597 	ub->st_blocks = howmany(ub->st_size, ub->st_blksize);
1598 	ub->st_atim = pipe->pipe_atime;
1599 	ub->st_mtim = pipe->pipe_mtime;
1600 	ub->st_ctim = pipe->pipe_ctime;
1601 	ub->st_uid = fp->f_cred->cr_uid;
1602 	ub->st_gid = fp->f_cred->cr_gid;
1603 	ub->st_dev = pipedev_ino;
1604 	ub->st_ino = pipe->pipe_ino;
1605 	/*
1606 	 * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
1607 	 */
1608 	return (0);
1609 }
1610 
1611 /* ARGSUSED */
1612 static int
1613 pipe_close(struct file *fp, struct thread *td)
1614 {
1615 
1616 	if (fp->f_vnode != NULL)
1617 		return vnops.fo_close(fp, td);
1618 	fp->f_ops = &badfileops;
1619 	pipe_dtor(fp->f_data);
1620 	fp->f_data = NULL;
1621 	return (0);
1622 }
1623 
1624 static int
1625 pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td)
1626 {
1627 	struct pipe *cpipe;
1628 	int error;
1629 
1630 	cpipe = fp->f_data;
1631 	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
1632 		error = vn_chmod(fp, mode, active_cred, td);
1633 	else
1634 		error = invfo_chmod(fp, mode, active_cred, td);
1635 	return (error);
1636 }
1637 
1638 static int
1639 pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
1640     struct thread *td)
1641 {
1642 	struct pipe *cpipe;
1643 	int error;
1644 
1645 	cpipe = fp->f_data;
1646 	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
1647 		error = vn_chown(fp, uid, gid, active_cred, td);
1648 	else
1649 		error = invfo_chown(fp, uid, gid, active_cred, td);
1650 	return (error);
1651 }
1652 
1653 static int
1654 pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
1655 {
1656 	struct pipe *pi;
1657 
1658 	if (fp->f_type == DTYPE_FIFO)
1659 		return (vn_fill_kinfo(fp, kif, fdp));
1660 	kif->kf_type = KF_TYPE_PIPE;
1661 	pi = fp->f_data;
1662 	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
1663 	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
1664 	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
1665 	kif->kf_un.kf_pipe.kf_pipe_buffer_in = pi->pipe_buffer.in;
1666 	kif->kf_un.kf_pipe.kf_pipe_buffer_out = pi->pipe_buffer.out;
1667 	kif->kf_un.kf_pipe.kf_pipe_buffer_size = pi->pipe_buffer.size;
1668 	return (0);
1669 }
1670 
1671 static void
1672 pipe_free_kmem(struct pipe *cpipe)
1673 {
1674 
1675 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1676 	    ("pipe_free_kmem: pipe mutex locked"));
1677 
1678 	if (cpipe->pipe_buffer.buffer != NULL) {
1679 		atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
1680 		chgpipecnt(cpipe->pipe_pair->pp_owner->cr_ruidinfo,
1681 		    -cpipe->pipe_buffer.size, 0);
1682 		vm_map_remove(pipe_map,
1683 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1684 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1685 		cpipe->pipe_buffer.buffer = NULL;
1686 	}
1687 #ifndef PIPE_NODIRECT
1688 	{
1689 		cpipe->pipe_pages.cnt = 0;
1690 		cpipe->pipe_pages.pos = 0;
1691 		cpipe->pipe_pages.npages = 0;
1692 	}
1693 #endif
1694 }
1695 
1696 /*
1697  * shutdown the pipe
1698  */
1699 static void
1700 pipeclose(struct pipe *cpipe)
1701 {
1702 #ifdef MAC
1703 	struct pipepair *pp;
1704 #endif
1705 	struct pipe *ppipe;
1706 
1707 	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1708 
1709 	PIPE_LOCK(cpipe);
1710 	pipelock(cpipe, false);
1711 #ifdef MAC
1712 	pp = cpipe->pipe_pair;
1713 #endif
1714 
1715 	/*
1716 	 * If the other side is blocked, wake it up saying that
1717 	 * we want to close it down.
1718 	 */
1719 	cpipe->pipe_state |= PIPE_EOF;
1720 	while (cpipe->pipe_busy) {
1721 		wakeup(cpipe);
1722 		cpipe->pipe_state |= PIPE_WANT;
1723 		pipeunlock(cpipe);
1724 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1725 		pipelock(cpipe, false);
1726 	}
1727 
1728 	pipeselwakeup(cpipe);
1729 
1730 	/*
1731 	 * Disconnect from peer, if any.
1732 	 */
1733 	ppipe = cpipe->pipe_peer;
1734 	if (ppipe->pipe_present == PIPE_ACTIVE) {
1735 		ppipe->pipe_state |= PIPE_EOF;
1736 		wakeup(ppipe);
1737 		pipeselwakeup(ppipe);
1738 	}
1739 
1740 	/*
1741 	 * Mark this endpoint as free.  Release kmem resources.  We
1742 	 * don't mark this endpoint as unused until we've finished
1743 	 * doing that, or the pipe might disappear out from under
1744 	 * us.
1745 	 */
1746 	PIPE_UNLOCK(cpipe);
1747 	pipe_free_kmem(cpipe);
1748 	PIPE_LOCK(cpipe);
1749 	cpipe->pipe_present = PIPE_CLOSING;
1750 	pipeunlock(cpipe);
1751 
1752 	/*
1753 	 * knlist_clear() may sleep dropping the PIPE_MTX. Set the
1754 	 * PIPE_FINALIZED, that allows other end to free the
1755 	 * pipe_pair, only after the knotes are completely dismantled.
1756 	 */
1757 	knlist_clear(&cpipe->pipe_sel.si_note, 1);
1758 	cpipe->pipe_present = PIPE_FINALIZED;
1759 	seldrain(&cpipe->pipe_sel);
1760 	knlist_destroy(&cpipe->pipe_sel.si_note);
1761 
1762 	/*
1763 	 * If both endpoints are now closed, release the memory for the
1764 	 * pipe pair.  If not, unlock.
1765 	 */
1766 	if (ppipe->pipe_present == PIPE_FINALIZED) {
1767 		PIPE_UNLOCK(cpipe);
1768 		crfree(cpipe->pipe_pair->pp_owner);
1769 #ifdef MAC
1770 		mac_pipe_destroy(pp);
1771 #endif
1772 		uma_zfree(pipe_zone, cpipe->pipe_pair);
1773 	} else
1774 		PIPE_UNLOCK(cpipe);
1775 }
1776 
1777 /*ARGSUSED*/
1778 static int
1779 pipe_kqfilter(struct file *fp, struct knote *kn)
1780 {
1781 	struct pipe *cpipe;
1782 
1783 	/*
1784 	 * If a filter is requested that is not supported by this file
1785 	 * descriptor, don't return an error, but also don't ever generate an
1786 	 * event.
1787 	 */
1788 	if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
1789 		kn->kn_fop = &pipe_nfiltops;
1790 		return (0);
1791 	}
1792 	if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
1793 		kn->kn_fop = &pipe_nfiltops;
1794 		return (0);
1795 	}
1796 	cpipe = fp->f_data;
1797 	PIPE_LOCK(cpipe);
1798 	switch (kn->kn_filter) {
1799 	case EVFILT_READ:
1800 		kn->kn_fop = &pipe_rfiltops;
1801 		break;
1802 	case EVFILT_WRITE:
1803 		kn->kn_fop = &pipe_wfiltops;
1804 		if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
1805 			/* other end of pipe has been closed */
1806 			PIPE_UNLOCK(cpipe);
1807 			return (EPIPE);
1808 		}
1809 		cpipe = PIPE_PEER(cpipe);
1810 		break;
1811 	default:
1812 		if ((cpipe->pipe_type & PIPE_TYPE_NAMED) != 0) {
1813 			PIPE_UNLOCK(cpipe);
1814 			return (vnops.fo_kqfilter(fp, kn));
1815 		}
1816 		PIPE_UNLOCK(cpipe);
1817 		return (EINVAL);
1818 	}
1819 
1820 	kn->kn_hook = cpipe;
1821 	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
1822 	PIPE_UNLOCK(cpipe);
1823 	return (0);
1824 }
1825 
1826 static void
1827 filt_pipedetach(struct knote *kn)
1828 {
1829 	struct pipe *cpipe = kn->kn_hook;
1830 
1831 	PIPE_LOCK(cpipe);
1832 	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
1833 	PIPE_UNLOCK(cpipe);
1834 }
1835 
1836 /*ARGSUSED*/
1837 static int
1838 filt_piperead(struct knote *kn, long hint)
1839 {
1840 	struct file *fp = kn->kn_fp;
1841 	struct pipe *rpipe = kn->kn_hook;
1842 
1843 	PIPE_LOCK_ASSERT(rpipe, MA_OWNED);
1844 	kn->kn_data = rpipe->pipe_buffer.cnt;
1845 	if (kn->kn_data == 0)
1846 		kn->kn_data = rpipe->pipe_pages.cnt;
1847 
1848 	if ((rpipe->pipe_state & PIPE_EOF) != 0 &&
1849 	    ((rpipe->pipe_type & PIPE_TYPE_NAMED) == 0 ||
1850 	    fp->f_pipegen != rpipe->pipe_wgen)) {
1851 		kn->kn_flags |= EV_EOF;
1852 		return (1);
1853 	}
1854 	kn->kn_flags &= ~EV_EOF;
1855 	return (kn->kn_data > 0);
1856 }
1857 
1858 /*ARGSUSED*/
1859 static int
1860 filt_pipewrite(struct knote *kn, long hint)
1861 {
1862 	struct pipe *wpipe = kn->kn_hook;
1863 
1864 	/*
1865 	 * If this end of the pipe is closed, the knote was removed from the
1866 	 * knlist and the list lock (i.e., the pipe lock) is therefore not held.
1867 	 */
1868 	if (wpipe->pipe_present == PIPE_ACTIVE ||
1869 	    (wpipe->pipe_type & PIPE_TYPE_NAMED) != 0) {
1870 		PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
1871 
1872 		if (wpipe->pipe_state & PIPE_DIRECTW) {
1873 			kn->kn_data = 0;
1874 		} else if (wpipe->pipe_buffer.size > 0) {
1875 			kn->kn_data = wpipe->pipe_buffer.size -
1876 			    wpipe->pipe_buffer.cnt;
1877 		} else {
1878 			kn->kn_data = PIPE_BUF;
1879 		}
1880 	}
1881 
1882 	if (wpipe->pipe_present != PIPE_ACTIVE ||
1883 	    (wpipe->pipe_state & PIPE_EOF)) {
1884 		kn->kn_flags |= EV_EOF;
1885 		return (1);
1886 	}
1887 	kn->kn_flags &= ~EV_EOF;
1888 	return (kn->kn_data >= PIPE_BUF);
1889 }
1890 
/*
 * Detach routine that intentionally does nothing.
 */
static void
filt_pipedetach_notsup(struct knote *kn)
{

	/* Nothing to undo. */
}
1896 
/*
 * Event routine that always returns 0, so the filter never fires.
 */
static int
filt_pipenotsup(struct knote *kn, long hint)
{

	/* Never ready. */
	return (0);
}
1903