xref: /freebsd/sys/kern/sys_pipe.c (revision ce46e2059e16557a44be599f86de42c0e1a13220)
1 /*
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  *    are met.
18  */
19 
20 /*
21  * This file contains a high-performance replacement for the socket-based
22  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23  * all features of sockets, but does do everything that pipes normally
24  * do.
25  */
26 
27 /*
28  * This code has two modes of operation, a small write mode and a large
29  * write mode.  The small write mode acts like conventional pipes with
30  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33  * the receiving process can copy it directly from the pages in the sending
34  * process.
35  *
36  * If the sending process receives a signal, it is possible that it will
37  * go away, and certainly its address space can change, because control
38  * is returned to the user-mode side.  In that case, the pipe code
39  * arranges to copy the buffer supplied by the user process, to a pageable
40  * kernel buffer, and the receiving process will grab the data from the
41  * pageable kernel buffer.  Since signals don't happen all that often,
42  * the copy operation is normally eliminated.
43  *
44  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45  * happen for small transfers so that the system will not spend all of
46  * its time context switching.
47  *
48  * In order to limit the resource use of pipes, the following sysctl exists:
49  *
50  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51  * address space available to us in pipe_map.  Whenever the amount in use
52  * exceeds half of this value, all new pipes will be created with size
53  * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54  * as well.  This value is loader tunable only.
55  *
56  * This value is autotuned in subr_param.c.
57  *
58  * Memory usage may be monitored through the sysctls
59  * kern.ipc.pipes, kern.ipc.bigpipes and kern.ipc.pipekva.
60  *
61  */
62 
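/*
 * Illustrative sketch (not part of the original source): from userland, the
 * buffered vs. direct path is selected purely by the size of each write and
 * by whether the descriptor is non-blocking.  Assuming PIPE_MINDIRECT has
 * its usual value of 8192 bytes (see sys/pipe.h for the authoritative
 * definition):
 *
 *	char small[512], big[65536];
 *	int fds[2];
 *
 *	pipe(fds);
 *	write(fds[1], small, sizeof(small));	(copied into the kernel buffer)
 *	write(fds[1], big, sizeof(big));	(pages held, the reader copies
 *						 directly from them)
 */
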
63 #include <sys/cdefs.h>
64 __FBSDID("$FreeBSD$");
65 
66 #include "opt_mac.h"
67 
68 #include <sys/param.h>
69 #include <sys/systm.h>
70 #include <sys/fcntl.h>
71 #include <sys/file.h>
72 #include <sys/filedesc.h>
73 #include <sys/filio.h>
74 #include <sys/kernel.h>
75 #include <sys/lock.h>
76 #include <sys/mac.h>
77 #include <sys/mutex.h>
78 #include <sys/ttycom.h>
79 #include <sys/stat.h>
80 #include <sys/malloc.h>
81 #include <sys/poll.h>
82 #include <sys/selinfo.h>
83 #include <sys/signalvar.h>
84 #include <sys/sysctl.h>
85 #include <sys/sysproto.h>
86 #include <sys/pipe.h>
87 #include <sys/proc.h>
88 #include <sys/vnode.h>
89 #include <sys/uio.h>
90 #include <sys/event.h>
91 
92 #include <vm/vm.h>
93 #include <vm/vm_param.h>
94 #include <vm/vm_object.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_extern.h>
97 #include <vm/pmap.h>
98 #include <vm/vm_map.h>
99 #include <vm/vm_page.h>
100 #include <vm/uma.h>
101 
102 /*
103  * Use this define if you want to disable *fancy* VM things.  Expect an
104  * approximately 30% decrease in transfer rate.  This could be useful for
105  * NetBSD or OpenBSD.
106  */
107 /* #define PIPE_NODIRECT */
108 
109 /*
110  * interfaces to the outside world
111  */
112 static fo_rdwr_t	pipe_read;
113 static fo_rdwr_t	pipe_write;
114 static fo_ioctl_t	pipe_ioctl;
115 static fo_poll_t	pipe_poll;
116 static fo_kqfilter_t	pipe_kqfilter;
117 static fo_stat_t	pipe_stat;
118 static fo_close_t	pipe_close;
119 
120 static struct fileops pipeops = {
121 	.fo_read = pipe_read,
122 	.fo_write = pipe_write,
123 	.fo_ioctl = pipe_ioctl,
124 	.fo_poll = pipe_poll,
125 	.fo_kqfilter = pipe_kqfilter,
126 	.fo_stat = pipe_stat,
127 	.fo_close = pipe_close,
128 	.fo_flags = DFLAG_PASSABLE
129 };
130 
131 static void	filt_pipedetach(struct knote *kn);
132 static int	filt_piperead(struct knote *kn, long hint);
133 static int	filt_pipewrite(struct knote *kn, long hint);
134 
135 static struct filterops pipe_rfiltops =
136 	{ 1, NULL, filt_pipedetach, filt_piperead };
137 static struct filterops pipe_wfiltops =
138 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
139 
140 /*
141  * Default pipe buffer size(s); this can be fairly large now because pipe
142  * space is pageable.  The pipe code will try to maintain locality of
143  * reference for performance reasons, so small amounts of outstanding I/O
144  * will not wipe the cache.
145  */
146 #define MINPIPESIZE (PIPE_SIZE/3)
147 #define MAXPIPESIZE (2*PIPE_SIZE/3)
148 
149 /*
150  * Limit the number of "big" pipes
151  */
152 #define LIMITBIGPIPES	32
153 static int nbigpipe;
154 
155 static int amountpipes;
156 static int amountpipekva;
157 
158 SYSCTL_DECL(_kern_ipc);
159 
160 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
161 	   &maxpipekva, 0, "Pipe KVA limit");
162 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
163 	   &amountpipes, 0, "Current # of pipes");
164 SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
165 	   &nbigpipe, 0, "Current # of big pipes");
166 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
167 	   &amountpipekva, 0, "Pipe KVA usage");
168 
169 static void pipeinit(void *dummy __unused);
170 static void pipeclose(struct pipe *cpipe);
171 static void pipe_free_kmem(struct pipe *cpipe);
172 static int pipe_create(struct pipe *pipe);
173 static __inline int pipelock(struct pipe *cpipe, int catch);
174 static __inline void pipeunlock(struct pipe *cpipe);
175 static __inline void pipeselwakeup(struct pipe *cpipe);
176 #ifndef PIPE_NODIRECT
177 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
178 static void pipe_destroy_write_buffer(struct pipe *wpipe);
179 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
180 static void pipe_clone_write_buffer(struct pipe *wpipe);
181 #endif
182 static int pipespace(struct pipe *cpipe, int size);
183 static int pipespace_new(struct pipe *cpipe, int size);
184 
185 static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
186 static void	pipe_zone_dtor(void *mem, int size, void *arg);
187 static int	pipe_zone_init(void *mem, int size, int flags);
188 static void	pipe_zone_fini(void *mem, int size);
189 
190 static uma_zone_t pipe_zone;
191 
192 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
193 
194 static void
195 pipeinit(void *dummy __unused)
196 {
197 
198 	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
199 	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
200 	    UMA_ALIGN_PTR, 0);
201 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
202 }
203 
204 static int
205 pipe_zone_ctor(void *mem, int size, void *arg, int flags)
206 {
207 	struct pipepair *pp;
208 	struct pipe *rpipe, *wpipe;
209 
210 	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
211 
212 	pp = (struct pipepair *)mem;
213 
214 	/*
215 	 * We zero both pipe endpoints to make sure all the kmem pointers
216 	 * are NULL, flag fields are zero'd, etc.  We timestamp both
217 	 * endpoints with the same time.
218 	 */
219 	rpipe = &pp->pp_rpipe;
220 	bzero(rpipe, sizeof(*rpipe));
221 	vfs_timestamp(&rpipe->pipe_ctime);
222 	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
223 
224 	wpipe = &pp->pp_wpipe;
225 	bzero(wpipe, sizeof(*wpipe));
226 	wpipe->pipe_ctime = rpipe->pipe_ctime;
227 	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
228 
229 	rpipe->pipe_peer = wpipe;
230 	rpipe->pipe_pair = pp;
231 	wpipe->pipe_peer = rpipe;
232 	wpipe->pipe_pair = pp;
233 
234 	/*
235 	 * Mark both endpoints as present; they will later get freed
236 	 * one at a time.  When both have been freed, the whole pair
237 	 * is released.
238 	 */
239 	rpipe->pipe_present = 1;
240 	wpipe->pipe_present = 1;
241 
242 	/*
243 	 * Eventually, the MAC Framework may initialize the label
244 	 * in ctor or init, but for now we do it elsewhere to avoid
245 	 * blocking in ctor or init.
246 	 */
247 	pp->pp_label = NULL;
248 
249 	atomic_add_int(&amountpipes, 2);
250 	return (0);
251 }
252 
253 static void
254 pipe_zone_dtor(void *mem, int size, void *arg)
255 {
256 	struct pipepair *pp;
257 
258 	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
259 
260 	pp = (struct pipepair *)mem;
261 
262 	atomic_subtract_int(&amountpipes, 2);
263 }
264 
265 static int
266 pipe_zone_init(void *mem, int size, int flags)
267 {
268 	struct pipepair *pp;
269 
270 	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
271 
272 	pp = (struct pipepair *)mem;
273 
274 	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
275 	return (0);
276 }
277 
278 static void
279 pipe_zone_fini(void *mem, int size)
280 {
281 	struct pipepair *pp;
282 
283 	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
284 
285 	pp = (struct pipepair *)mem;
286 
287 	mtx_destroy(&pp->pp_mtx);
288 }
289 
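/*
 * Note on the UMA hooks above: the init/fini pair runs only when items are
 * imported into or released from the zone's slabs, while the ctor/dtor pair
 * runs on every uma_zalloc()/uma_zfree().  Keeping mtx_init()/mtx_destroy()
 * in init/fini therefore avoids recreating the mutex each time a cached
 * pipepair is reused by pipe(2).
 */
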
290 /*
291  * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
292  * let the zone pick up the pieces via pipeclose().
293  */
294 
295 /* ARGSUSED */
296 int
297 pipe(td, uap)
298 	struct thread *td;
299 	struct pipe_args /* {
300 		int	dummy;
301 	} */ *uap;
302 {
303 	struct filedesc *fdp = td->td_proc->p_fd;
304 	struct file *rf, *wf;
305 	struct pipepair *pp;
306 	struct pipe *rpipe, *wpipe;
307 	int fd, error;
308 
309 	pp = uma_zalloc(pipe_zone, M_WAITOK);
310 #ifdef MAC
311 	/*
312 	 * The MAC label is shared between the connected endpoints.  As a
313 	 * result mac_init_pipe() and mac_create_pipe() are called once
314 	 * for the pair, and not on the endpoints.
315 	 */
316 	mac_init_pipe(pp);
317 	mac_create_pipe(td->td_ucred, pp);
318 #endif
319 	rpipe = &pp->pp_rpipe;
320 	wpipe = &pp->pp_wpipe;
321 
322 	if (pipe_create(rpipe) || pipe_create(wpipe)) {
323 		pipeclose(rpipe);
324 		pipeclose(wpipe);
325 		return (ENFILE);
326 	}
327 
328 	rpipe->pipe_state |= PIPE_DIRECTOK;
329 	wpipe->pipe_state |= PIPE_DIRECTOK;
330 
331 	error = falloc(td, &rf, &fd);
332 	if (error) {
333 		pipeclose(rpipe);
334 		pipeclose(wpipe);
335 		return (error);
336 	}
337 	/* An extra reference on `rf' has been held for us by falloc(). */
338 	td->td_retval[0] = fd;
339 
340 	/*
341 	 * Warning: once we've gotten past allocation of the fd for the
342 	 * read-side, we can only drop the read side via fdrop() in order
343 	 * to avoid races against processes which manage to dup() the read
344 	 * side while we are blocked trying to allocate the write side.
345 	 */
346 	FILE_LOCK(rf);
347 	rf->f_flag = FREAD | FWRITE;
348 	rf->f_type = DTYPE_PIPE;
349 	rf->f_data = rpipe;
350 	rf->f_ops = &pipeops;
351 	FILE_UNLOCK(rf);
352 	error = falloc(td, &wf, &fd);
353 	if (error) {
354 		FILEDESC_LOCK(fdp);
355 		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
356 			fdp->fd_ofiles[td->td_retval[0]] = NULL;
357 			fdunused(fdp, td->td_retval[0]);
358 			FILEDESC_UNLOCK(fdp);
359 			fdrop(rf, td);
360 		} else {
361 			FILEDESC_UNLOCK(fdp);
362 		}
363 		fdrop(rf, td);
364 		/* rpipe has been closed by fdrop(). */
365 		pipeclose(wpipe);
366 		return (error);
367 	}
368 	/* An extra reference on `wf' has been held for us by falloc(). */
369 	FILE_LOCK(wf);
370 	wf->f_flag = FREAD | FWRITE;
371 	wf->f_type = DTYPE_PIPE;
372 	wf->f_data = wpipe;
373 	wf->f_ops = &pipeops;
374 	FILE_UNLOCK(wf);
375 	fdrop(wf, td);
376 	td->td_retval[1] = fd;
377 	fdrop(rf, td);
378 
379 	return (0);
380 }
381 
382 /*
383  * Allocate kva for the pipe circular buffer; the space is pageable.
384  * This routine will 'realloc' the size of a pipe safely: if the
385  * allocation fails, it retains the old buffer and returns ENOMEM;
386  * otherwise it returns 0.
387  */
388 static int
389 pipespace_new(cpipe, size)
390 	struct pipe *cpipe;
391 	int size;
392 {
393 	caddr_t buffer;
394 	int error;
395 	static int curfail = 0;
396 	static struct timeval lastfail;
397 
398 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
399 
400 	size = round_page(size);
401 	/*
402 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
403 	 */
404 	buffer = (caddr_t) vm_map_min(pipe_map);
405 
406 	/*
407 	 * The map entry is, by default, pageable.
408 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
409 	 */
410 	error = vm_map_find(pipe_map, NULL, 0,
411 		(vm_offset_t *) &buffer, size, 1,
412 		VM_PROT_ALL, VM_PROT_ALL, 0);
413 	if (error != KERN_SUCCESS) {
414 		if (ppsratecheck(&lastfail, &curfail, 1))
415 			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
416 		return (ENOMEM);
417 	}
418 
419 	/* free old resources if we're resizing */
420 	pipe_free_kmem(cpipe);
421 	cpipe->pipe_buffer.buffer = buffer;
422 	cpipe->pipe_buffer.size = size;
423 	cpipe->pipe_buffer.in = 0;
424 	cpipe->pipe_buffer.out = 0;
425 	cpipe->pipe_buffer.cnt = 0;
426 	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
427 	return (0);
428 }
429 
430 /*
431  * Wrapper for pipespace_new() that performs locking assertions.
432  */
433 static int
434 pipespace(cpipe, size)
435 	struct pipe *cpipe;
436 	int size;
437 {
438 
439 	/*
440 	 * XXXRW: Seems like we should really assert PIPE_LOCKFL on the
441 	 * pipe_state here.
442 	 */
443 
444 	return (pipespace_new(cpipe, size));
445 }
446 
447 /*
448  * lock a pipe for I/O, blocking other access
449  */
450 static __inline int
451 pipelock(cpipe, catch)
452 	struct pipe *cpipe;
453 	int catch;
454 {
455 	int error;
456 
457 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
458 	while (cpipe->pipe_state & PIPE_LOCKFL) {
459 		cpipe->pipe_state |= PIPE_LWANT;
460 		error = msleep(cpipe, PIPE_MTX(cpipe),
461 		    catch ? (PRIBIO | PCATCH) : PRIBIO,
462 		    "pipelk", 0);
463 		if (error != 0)
464 			return (error);
465 	}
466 	cpipe->pipe_state |= PIPE_LOCKFL;
467 	return (0);
468 }
469 
470 /*
471  * unlock a pipe I/O lock
472  */
473 static __inline void
474 pipeunlock(cpipe)
475 	struct pipe *cpipe;
476 {
477 
478 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
479 	cpipe->pipe_state &= ~PIPE_LOCKFL;
480 	if (cpipe->pipe_state & PIPE_LWANT) {
481 		cpipe->pipe_state &= ~PIPE_LWANT;
482 		wakeup(cpipe);
483 	}
484 }
485 
486 static __inline void
487 pipeselwakeup(cpipe)
488 	struct pipe *cpipe;
489 {
490 
491 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
492 	if (cpipe->pipe_state & PIPE_SEL) {
493 		cpipe->pipe_state &= ~PIPE_SEL;
494 		selwakeuppri(&cpipe->pipe_sel, PSOCK);
495 	}
496 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
497 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
498 	KNOTE(&cpipe->pipe_sel.si_note, 0);
499 }
500 
501 /*
502  * Initialize and allocate VM and memory for pipe.  The structure
503  * will start out zero'd from the ctor, so we just manage the kmem.
504  */
505 static int
506 pipe_create(pipe)
507 	struct pipe *pipe;
508 {
509 	int error;
510 
511 	/*
512 	 * Reduce to SMALL_PIPE_SIZE if pipe KVA usage exceeds half of the global limit.
513 	 */
514 	if (amountpipekva > maxpipekva / 2)
515 		error = pipespace(pipe, SMALL_PIPE_SIZE);
516 	else
517 		error = pipespace(pipe, PIPE_SIZE);
518 	return (error);
519 }
520 
521 /* ARGSUSED */
522 static int
523 pipe_read(fp, uio, active_cred, flags, td)
524 	struct file *fp;
525 	struct uio *uio;
526 	struct ucred *active_cred;
527 	struct thread *td;
528 	int flags;
529 {
530 	struct pipe *rpipe = fp->f_data;
531 	int error;
532 	int nread = 0;
533 	u_int size;
534 
535 	PIPE_LOCK(rpipe);
536 	++rpipe->pipe_busy;
537 	error = pipelock(rpipe, 1);
538 	if (error)
539 		goto unlocked_error;
540 
541 #ifdef MAC
542 	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
543 	if (error)
544 		goto locked_error;
545 #endif
546 
547 	while (uio->uio_resid) {
548 		/*
549 		 * normal pipe buffer receive
550 		 */
551 		if (rpipe->pipe_buffer.cnt > 0) {
552 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
553 			if (size > rpipe->pipe_buffer.cnt)
554 				size = rpipe->pipe_buffer.cnt;
555 			if (size > (u_int) uio->uio_resid)
556 				size = (u_int) uio->uio_resid;
557 
558 			PIPE_UNLOCK(rpipe);
559 			error = uiomove(
560 			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
561 			    size, uio);
562 			PIPE_LOCK(rpipe);
563 			if (error)
564 				break;
565 
566 			rpipe->pipe_buffer.out += size;
567 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
568 				rpipe->pipe_buffer.out = 0;
569 
570 			rpipe->pipe_buffer.cnt -= size;
571 
572 			/*
573 			 * If there is no more to read in the pipe, reset
574 			 * its pointers to the beginning.  This improves
575 			 * cache hit stats.
576 			 */
577 			if (rpipe->pipe_buffer.cnt == 0) {
578 				rpipe->pipe_buffer.in = 0;
579 				rpipe->pipe_buffer.out = 0;
580 			}
581 			nread += size;
582 #ifndef PIPE_NODIRECT
583 		/*
584 		 * Direct copy, bypassing a kernel buffer.
585 		 */
586 		} else if ((size = rpipe->pipe_map.cnt) &&
587 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
588 			if (size > (u_int) uio->uio_resid)
589 				size = (u_int) uio->uio_resid;
590 
591 			PIPE_UNLOCK(rpipe);
592 			error = uiomove_fromphys(rpipe->pipe_map.ms,
593 			    rpipe->pipe_map.pos, size, uio);
594 			PIPE_LOCK(rpipe);
595 			if (error)
596 				break;
597 			nread += size;
598 			rpipe->pipe_map.pos += size;
599 			rpipe->pipe_map.cnt -= size;
600 			if (rpipe->pipe_map.cnt == 0) {
601 				rpipe->pipe_state &= ~PIPE_DIRECTW;
602 				wakeup(rpipe);
603 			}
604 #endif
605 		} else {
606 			/*
607 			 * detect EOF condition
608 			 * read returns 0 on EOF, no need to set error
609 			 */
610 			if (rpipe->pipe_state & PIPE_EOF)
611 				break;
612 
613 			/*
614 			 * If the "write-side" has been blocked, wake it up now.
615 			 */
616 			if (rpipe->pipe_state & PIPE_WANTW) {
617 				rpipe->pipe_state &= ~PIPE_WANTW;
618 				wakeup(rpipe);
619 			}
620 
621 			/*
622 			 * Break if some data was read.
623 			 */
624 			if (nread > 0)
625 				break;
626 
627 			/*
628 			 * Unlock the pipe buffer for our remaining processing.
629 			 * We will either break out with an error or we will
630 			 * sleep and relock to loop.
631 			 */
632 			pipeunlock(rpipe);
633 
634 			/*
635 			 * Handle non-blocking mode operation or
636 			 * wait for more data.
637 			 */
638 			if (fp->f_flag & FNONBLOCK) {
639 				error = EAGAIN;
640 			} else {
641 				rpipe->pipe_state |= PIPE_WANTR;
642 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
643 				    PRIBIO | PCATCH,
644 				    "piperd", 0)) == 0)
645 					error = pipelock(rpipe, 1);
646 			}
647 			if (error)
648 				goto unlocked_error;
649 		}
650 	}
651 #ifdef MAC
652 locked_error:
653 #endif
654 	pipeunlock(rpipe);
655 
656 	/* XXX: should probably do this before getting any locks. */
657 	if (error == 0)
658 		vfs_timestamp(&rpipe->pipe_atime);
659 unlocked_error:
660 	--rpipe->pipe_busy;
661 
662 	/*
663 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
664 	 */
665 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
666 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
667 		wakeup(rpipe);
668 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
669 		/*
670 		 * Handle write blocking hysteresis.
671 		 */
672 		if (rpipe->pipe_state & PIPE_WANTW) {
673 			rpipe->pipe_state &= ~PIPE_WANTW;
674 			wakeup(rpipe);
675 		}
676 	}
677 
678 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
679 		pipeselwakeup(rpipe);
680 
681 	PIPE_UNLOCK(rpipe);
682 	return (error);
683 }
684 
685 #ifndef PIPE_NODIRECT
686 /*
687  * Map the sending process's buffer into kernel space and wire it.
688  * This is similar to a physical write operation.
689  */
690 static int
691 pipe_build_write_buffer(wpipe, uio)
692 	struct pipe *wpipe;
693 	struct uio *uio;
694 {
695 	pmap_t pmap;
696 	u_int size;
697 	int i, j;
698 	vm_offset_t addr, endaddr;
699 
700 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
701 
702 	size = (u_int) uio->uio_iov->iov_len;
703 	if (size > wpipe->pipe_buffer.size)
704 		size = wpipe->pipe_buffer.size;
705 
706 	pmap = vmspace_pmap(curproc->p_vmspace);
707 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
708 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
709 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
710 		/*
711 		 * vm_fault_quick() can sleep.  Consequently,
712 		 * vm_page_lock_queue() and vm_page_unlock_queue()
713 		 * should not be performed outside of this loop.
714 		 */
715 	race:
716 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
717 			vm_page_lock_queues();
718 			for (j = 0; j < i; j++)
719 				vm_page_unhold(wpipe->pipe_map.ms[j]);
720 			vm_page_unlock_queues();
721 			return (EFAULT);
722 		}
723 		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
724 		    VM_PROT_READ);
725 		if (wpipe->pipe_map.ms[i] == NULL)
726 			goto race;
727 	}
728 
729 /*
730  * set up the control block
731  */
732 	wpipe->pipe_map.npages = i;
733 	wpipe->pipe_map.pos =
734 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
735 	wpipe->pipe_map.cnt = size;
736 
737 /*
738  * and update the uio data
739  */
740 
741 	uio->uio_iov->iov_len -= size;
742 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
743 	if (uio->uio_iov->iov_len == 0)
744 		uio->uio_iov++;
745 	uio->uio_resid -= size;
746 	uio->uio_offset += size;
747 	return (0);
748 }
749 
750 /*
751  * unmap and unwire the process buffer
752  */
753 static void
754 pipe_destroy_write_buffer(wpipe)
755 	struct pipe *wpipe;
756 {
757 	int i;
758 
759 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
760 	vm_page_lock_queues();
761 	for (i = 0; i < wpipe->pipe_map.npages; i++) {
762 		vm_page_unhold(wpipe->pipe_map.ms[i]);
763 	}
764 	vm_page_unlock_queues();
765 	wpipe->pipe_map.npages = 0;
766 }
767 
768 /*
769  * In the case of a signal, the writing process might go away.  This
770  * code copies the data into the circular buffer so that the source
771  * pages can be freed without loss of data.
772  */
773 static void
774 pipe_clone_write_buffer(wpipe)
775 	struct pipe *wpipe;
776 {
777 	struct uio uio;
778 	struct iovec iov;
779 	int size;
780 	int pos;
781 
782 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
783 	size = wpipe->pipe_map.cnt;
784 	pos = wpipe->pipe_map.pos;
785 
786 	wpipe->pipe_buffer.in = size;
787 	wpipe->pipe_buffer.out = 0;
788 	wpipe->pipe_buffer.cnt = size;
789 	wpipe->pipe_state &= ~PIPE_DIRECTW;
790 
791 	PIPE_UNLOCK(wpipe);
792 	iov.iov_base = wpipe->pipe_buffer.buffer;
793 	iov.iov_len = size;
794 	uio.uio_iov = &iov;
795 	uio.uio_iovcnt = 1;
796 	uio.uio_offset = 0;
797 	uio.uio_resid = size;
798 	uio.uio_segflg = UIO_SYSSPACE;
799 	uio.uio_rw = UIO_READ;
800 	uio.uio_td = curthread;
801 	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
802 	PIPE_LOCK(wpipe);
803 	pipe_destroy_write_buffer(wpipe);
804 }
805 
806 /*
807  * This implements the pipe buffer write mechanism.  Note that only
808  * a direct write OR a normal pipe write can be pending at any given time.
809  * If there are any characters in the pipe buffer, the direct write will
810  * be deferred until the receiving process grabs all of the bytes from
811  * the pipe buffer.  Then the direct mapping write is set up.
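 *
 * The PIPE_DIRECTW state flag is what enforces this exclusion:
 * pipe_direct_write() sets it once the pipe buffer has drained, and
 * pipe_read() clears it after the last byte of the mapped pages has been
 * consumed, waking the writer.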
812  */
813 static int
814 pipe_direct_write(wpipe, uio)
815 	struct pipe *wpipe;
816 	struct uio *uio;
817 {
818 	int error;
819 
820 retry:
821 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
822 	while (wpipe->pipe_state & PIPE_DIRECTW) {
823 		if (wpipe->pipe_state & PIPE_WANTR) {
824 			wpipe->pipe_state &= ~PIPE_WANTR;
825 			wakeup(wpipe);
826 		}
827 		wpipe->pipe_state |= PIPE_WANTW;
828 		error = msleep(wpipe, PIPE_MTX(wpipe),
829 		    PRIBIO | PCATCH, "pipdww", 0);
830 		if (error)
831 			goto error1;
832 		if (wpipe->pipe_state & PIPE_EOF) {
833 			error = EPIPE;
834 			goto error1;
835 		}
836 	}
837 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
838 	if (wpipe->pipe_buffer.cnt > 0) {
839 		if (wpipe->pipe_state & PIPE_WANTR) {
840 			wpipe->pipe_state &= ~PIPE_WANTR;
841 			wakeup(wpipe);
842 		}
843 
844 		wpipe->pipe_state |= PIPE_WANTW;
845 		error = msleep(wpipe, PIPE_MTX(wpipe),
846 		    PRIBIO | PCATCH, "pipdwc", 0);
847 		if (error)
848 			goto error1;
849 		if (wpipe->pipe_state & PIPE_EOF) {
850 			error = EPIPE;
851 			goto error1;
852 		}
853 		goto retry;
854 	}
855 
856 	wpipe->pipe_state |= PIPE_DIRECTW;
857 
858 	pipelock(wpipe, 0);
859 	if (wpipe->pipe_state & PIPE_EOF) {
860 		error = EPIPE;
861 		goto error2;
862 	}
863 	PIPE_UNLOCK(wpipe);
864 	error = pipe_build_write_buffer(wpipe, uio);
865 	PIPE_LOCK(wpipe);
866 	pipeunlock(wpipe);
867 	if (error) {
868 		wpipe->pipe_state &= ~PIPE_DIRECTW;
869 		goto error1;
870 	}
871 
872 	error = 0;
873 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
874 		if (wpipe->pipe_state & PIPE_EOF) {
875 			pipelock(wpipe, 0);
876 			pipe_destroy_write_buffer(wpipe);
877 			pipeselwakeup(wpipe);
878 			pipeunlock(wpipe);
879 			error = EPIPE;
880 			goto error1;
881 		}
882 		if (wpipe->pipe_state & PIPE_WANTR) {
883 			wpipe->pipe_state &= ~PIPE_WANTR;
884 			wakeup(wpipe);
885 		}
886 		pipeselwakeup(wpipe);
887 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
888 		    "pipdwt", 0);
889 	}
890 
891 	pipelock(wpipe, 0);
892 	if (wpipe->pipe_state & PIPE_EOF)
893 		error = EPIPE;
894 	if (wpipe->pipe_state & PIPE_DIRECTW) {
895 		/*
896 		 * this bit of trickery substitutes a kernel buffer for
897 		 * the process that might be going away.
898 		 */
899 		pipe_clone_write_buffer(wpipe);
900 	} else {
901 		pipe_destroy_write_buffer(wpipe);
902 	}
903 error2:
904 	pipeunlock(wpipe);
905 	return (error);
906 
907 error1:
908 	wakeup(wpipe);
909 	return (error);
910 }
911 #endif
912 
913 static int
914 pipe_write(fp, uio, active_cred, flags, td)
915 	struct file *fp;
916 	struct uio *uio;
917 	struct ucred *active_cred;
918 	struct thread *td;
919 	int flags;
920 {
921 	int error = 0;
922 	int orig_resid;
923 	struct pipe *wpipe, *rpipe;
924 
925 	rpipe = fp->f_data;
926 	wpipe = rpipe->pipe_peer;
927 
928 	PIPE_LOCK(rpipe);
929 	/*
930 	 * detect loss of pipe read side, issue SIGPIPE if lost.
931 	 */
932 	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
933 		PIPE_UNLOCK(rpipe);
934 		return (EPIPE);
935 	}
936 #ifdef MAC
937 	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
938 	if (error) {
939 		PIPE_UNLOCK(rpipe);
940 		return (error);
941 	}
942 #endif
943 	++wpipe->pipe_busy;
944 
945 	/*
946 	 * If it is advantageous to resize the pipe buffer, do
947 	 * so.
948 	 */
949 	if ((uio->uio_resid > PIPE_SIZE) &&
950 		(amountpipekva < maxpipekva / 2) &&
951 		(nbigpipe < LIMITBIGPIPES) &&
952 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
953 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
954 		(wpipe->pipe_buffer.cnt == 0)) {
955 
956 		if ((error = pipelock(wpipe, 1)) == 0) {
957 			if (wpipe->pipe_state & PIPE_EOF)
958 				error = EPIPE;
959 			else {
960 				PIPE_UNLOCK(wpipe);
961 				if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
962 					atomic_add_int(&nbigpipe, 1);
963 				PIPE_LOCK(wpipe);
964 			}
965 			pipeunlock(wpipe);
966 		}
967 	}
968 
969 	/*
970 	 * If an early error occurred, unbusy and return, waking up any
971 	 * pending readers.
972 	 */
973 	if (error) {
974 		--wpipe->pipe_busy;
975 		if ((wpipe->pipe_busy == 0) &&
976 		    (wpipe->pipe_state & PIPE_WANT)) {
977 			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
978 			wakeup(wpipe);
979 		}
980 		PIPE_UNLOCK(rpipe);
981 		return (error);
982 	}
983 
984 	orig_resid = uio->uio_resid;
985 
986 	while (uio->uio_resid) {
987 		int space;
988 
989 #ifndef PIPE_NODIRECT
990 		/*
991 		 * If the transfer is large, we can gain performance if
992 		 * we do process-to-process copies directly.
993 		 * If the write is non-blocking, we don't use the
994 		 * direct write mechanism.
995 		 *
996 		 * The direct write mechanism will detect the reader going
997 		 * away on us.
998 		 */
999 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
1000 		    (fp->f_flag & FNONBLOCK) == 0) {
1001 			error = pipe_direct_write(wpipe, uio);
1002 			if (error)
1003 				break;
1004 			continue;
1005 		}
1006 #endif
1007 
1008 		/*
1009 		 * Pipe buffered writes cannot proceed concurrently with
1010 		 * direct writes.  We wait until the currently executing
1011 		 * direct write is completed before we start filling the
1012 		 * pipe buffer.  We break out if a signal occurs or the
1013 		 * reader goes away.
1014 		 */
1015 	retrywrite:
1016 		while (wpipe->pipe_state & PIPE_DIRECTW) {
1017 			if (wpipe->pipe_state & PIPE_WANTR) {
1018 				wpipe->pipe_state &= ~PIPE_WANTR;
1019 				wakeup(wpipe);
1020 			}
1021 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1022 			    "pipbww", 0);
1023 			if (wpipe->pipe_state & PIPE_EOF) {
1024 				error = EPIPE;
1025 				break;
1026 			}
1027 			if (error)
1028 				break;
1029 		}
1030 
1031 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1032 
1033 		/* Writes of size <= PIPE_BUF must be atomic. */
1034 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1035 			space = 0;
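		/*
		 * Forcing space to zero makes such a write all-or-nothing:
		 * rather than issuing a short write, the writer either
		 * returns EAGAIN (non-blocking) or sleeps in the branch
		 * below until the whole request fits in the buffer.
		 */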
1036 
1037 		if (space > 0) {
1038 			if ((error = pipelock(wpipe, 1)) == 0) {
1039 				int size;	/* Transfer size */
1040 				int segsize;	/* first segment to transfer */
1041 
1042 				/*
1043 				 * It is possible for a direct write/EOF to
1044 				 * slip in on us... handle them here...
1045 				 */
1046 				if (wpipe->pipe_state & PIPE_EOF)
1047 					goto lost_wpipe;
1048 				if (wpipe->pipe_state & PIPE_DIRECTW) {
1049 					pipeunlock(wpipe);
1050 					goto retrywrite;
1051 				}
1052 				/*
1053 				 * If a process blocked in uiomove, our
1054 				 * value for space might be bad.
1055 				 *
1056 				 * XXX will we be ok if the reader has gone
1057 				 * away here?
1058 				 */
1059 				if (space > wpipe->pipe_buffer.size -
1060 				    wpipe->pipe_buffer.cnt) {
1061 					pipeunlock(wpipe);
1062 					goto retrywrite;
1063 				}
1064 
1065 				/*
1066 				 * Transfer size is minimum of uio transfer
1067 				 * and free space in pipe buffer.
1068 				 */
1069 				if (space > uio->uio_resid)
1070 					size = uio->uio_resid;
1071 				else
1072 					size = space;
1073 				/*
1074 				 * First segment to transfer is minimum of
1075 				 * transfer size and contiguous space in
1076 				 * pipe buffer.  If first segment to transfer
1077 				 * is less than the transfer size, we've got
1078 				 * a wraparound in the buffer.
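				 *
				 * For example (illustrative numbers): with a
				 * 16384-byte buffer, in == 15000 and a
				 * 4000-byte transfer, segsize is first
				 * 16384 - 15000 == 1384; the remaining
				 * 2616 bytes are then copied to the start of
				 * the buffer and "in" wraps around to 2616.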
1079 				 */
1080 				segsize = wpipe->pipe_buffer.size -
1081 					wpipe->pipe_buffer.in;
1082 				if (segsize > size)
1083 					segsize = size;
1084 
1085 				/* Transfer first segment */
1086 
1087 				PIPE_UNLOCK(rpipe);
1088 				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1089 						segsize, uio);
1090 				PIPE_LOCK(rpipe);
1091 
1092 				if (error == 0 && segsize < size) {
1093 					/*
1094 					 * Transfer remaining part now, to
1095 					 * support atomic writes.  Wraparound
1096 					 * happened.
1097 					 */
1098 					if (wpipe->pipe_buffer.in + segsize !=
1099 					    wpipe->pipe_buffer.size)
1100 						panic("Expected pipe buffer "
1101 						    "wraparound disappeared");
1102 
1103 					PIPE_UNLOCK(rpipe);
1104 					error = uiomove(
1105 					    &wpipe->pipe_buffer.buffer[0],
1106 					    size - segsize, uio);
1107 					PIPE_LOCK(rpipe);
1108 				}
1109 				if (error == 0) {
1110 					wpipe->pipe_buffer.in += size;
1111 					if (wpipe->pipe_buffer.in >=
1112 					    wpipe->pipe_buffer.size) {
1113 						if (wpipe->pipe_buffer.in !=
1114 						    size - segsize +
1115 						    wpipe->pipe_buffer.size)
1116 							panic("Expected "
1117 							    "wraparound bad");
1118 						wpipe->pipe_buffer.in = size -
1119 						    segsize;
1120 					}
1121 
1122 					wpipe->pipe_buffer.cnt += size;
1123 					if (wpipe->pipe_buffer.cnt >
1124 					    wpipe->pipe_buffer.size)
1125 						panic("Pipe buffer overflow");
1126 
1127 				}
1128 lost_wpipe:
1129 				pipeunlock(wpipe);
1130 			}
1131 			if (error)
1132 				break;
1133 
1134 		} else {
1135 			/*
1136 			 * If the "read-side" has been blocked, wake it up now.
1137 			 */
1138 			if (wpipe->pipe_state & PIPE_WANTR) {
1139 				wpipe->pipe_state &= ~PIPE_WANTR;
1140 				wakeup(wpipe);
1141 			}
1142 
1143 			/*
1144 			 * don't block on non-blocking I/O
1145 			 */
1146 			if (fp->f_flag & FNONBLOCK) {
1147 				error = EAGAIN;
1148 				break;
1149 			}
1150 
1151 			/*
1152 			 * We have no more space and have something to offer,
1153 			 * wake up select/poll.
1154 			 */
1155 			pipeselwakeup(wpipe);
1156 
1157 			wpipe->pipe_state |= PIPE_WANTW;
1158 			error = msleep(wpipe, PIPE_MTX(rpipe),
1159 			    PRIBIO | PCATCH, "pipewr", 0);
1160 			if (error != 0)
1161 				break;
1162 			/*
1163 			 * If read side wants to go away, we just issue a signal
1164 			 * to ourselves.
1165 			 */
1166 			if (wpipe->pipe_state & PIPE_EOF) {
1167 				error = EPIPE;
1168 				break;
1169 			}
1170 		}
1171 	}
1172 
1173 	--wpipe->pipe_busy;
1174 
1175 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1176 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1177 		wakeup(wpipe);
1178 	} else if (wpipe->pipe_buffer.cnt > 0) {
1179 		/*
1180 		 * If we have put any characters in the buffer, we wake up
1181 		 * the reader.
1182 		 */
1183 		if (wpipe->pipe_state & PIPE_WANTR) {
1184 			wpipe->pipe_state &= ~PIPE_WANTR;
1185 			wakeup(wpipe);
1186 		}
1187 	}
1188 
1189 	/*
1190 	 * Don't return EPIPE if I/O was successful
1191 	 */
1192 	if ((wpipe->pipe_buffer.cnt == 0) &&
1193 	    (uio->uio_resid == 0) &&
1194 	    (error == EPIPE)) {
1195 		error = 0;
1196 	}
1197 
1198 	if (error == 0)
1199 		vfs_timestamp(&wpipe->pipe_mtime);
1200 
1201 	/*
1202 	 * We have something to offer,
1203 	 * wake up select/poll.
1204 	 */
1205 	if (wpipe->pipe_buffer.cnt)
1206 		pipeselwakeup(wpipe);
1207 
1208 	PIPE_UNLOCK(rpipe);
1209 	return (error);
1210 }
1211 
1212 /*
1213  * we implement a very minimal set of ioctls for compatibility with sockets.
1214  */
1215 static int
1216 pipe_ioctl(fp, cmd, data, active_cred, td)
1217 	struct file *fp;
1218 	u_long cmd;
1219 	void *data;
1220 	struct ucred *active_cred;
1221 	struct thread *td;
1222 {
1223 	struct pipe *mpipe = fp->f_data;
1224 #ifdef MAC
1225 	int error;
1226 #endif
1227 
1228 	PIPE_LOCK(mpipe);
1229 
1230 #ifdef MAC
1231 	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1232 	if (error) {
1233 		PIPE_UNLOCK(mpipe);
1234 		return (error);
1235 	}
1236 #endif
1237 
1238 	switch (cmd) {
1239 
1240 	case FIONBIO:
1241 		PIPE_UNLOCK(mpipe);
1242 		return (0);
1243 
1244 	case FIOASYNC:
1245 		if (*(int *)data) {
1246 			mpipe->pipe_state |= PIPE_ASYNC;
1247 		} else {
1248 			mpipe->pipe_state &= ~PIPE_ASYNC;
1249 		}
1250 		PIPE_UNLOCK(mpipe);
1251 		return (0);
1252 
1253 	case FIONREAD:
1254 		if (mpipe->pipe_state & PIPE_DIRECTW)
1255 			*(int *)data = mpipe->pipe_map.cnt;
1256 		else
1257 			*(int *)data = mpipe->pipe_buffer.cnt;
1258 		PIPE_UNLOCK(mpipe);
1259 		return (0);
1260 
1261 	case FIOSETOWN:
1262 		PIPE_UNLOCK(mpipe);
1263 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1264 
1265 	case FIOGETOWN:
1266 		PIPE_UNLOCK(mpipe);
1267 		*(int *)data = fgetown(&mpipe->pipe_sigio);
1268 		return (0);
1269 
1270 	/* This is deprecated, FIOSETOWN should be used instead. */
1271 	case TIOCSPGRP:
1272 		PIPE_UNLOCK(mpipe);
1273 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1274 
1275 	/* This is deprecated, FIOGETOWN should be used instead. */
1276 	case TIOCGPGRP:
1277 		PIPE_UNLOCK(mpipe);
1278 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1279 		return (0);
1280 
1281 	}
1282 	PIPE_UNLOCK(mpipe);
1283 	return (ENOTTY);
1284 }
1285 
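/*
 * Illustrative sketch (userland, not part of this file): FIONREAD above is
 * what lets a reader check how much buffered data a read would return, e.g.
 *
 *	int avail;
 *
 *	if (ioctl(fds[0], FIONREAD, &avail) == 0)
 *		printf("%d bytes ready\n", avail);
 */
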
1286 static int
1287 pipe_poll(fp, events, active_cred, td)
1288 	struct file *fp;
1289 	int events;
1290 	struct ucred *active_cred;
1291 	struct thread *td;
1292 {
1293 	struct pipe *rpipe = fp->f_data;
1294 	struct pipe *wpipe;
1295 	int revents = 0;
1296 #ifdef MAC
1297 	int error;
1298 #endif
1299 
1300 	wpipe = rpipe->pipe_peer;
1301 	PIPE_LOCK(rpipe);
1302 #ifdef MAC
1303 	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
1304 	if (error)
1305 		goto locked_error;
1306 #endif
1307 	if (events & (POLLIN | POLLRDNORM))
1308 		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1309 		    (rpipe->pipe_buffer.cnt > 0) ||
1310 		    (rpipe->pipe_state & PIPE_EOF))
1311 			revents |= events & (POLLIN | POLLRDNORM);
1312 
1313 	if (events & (POLLOUT | POLLWRNORM))
1314 		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
1315 		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1316 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1317 			revents |= events & (POLLOUT | POLLWRNORM);
1318 
1319 	if ((rpipe->pipe_state & PIPE_EOF) ||
1320 	    (!wpipe->pipe_present) ||
1321 	    (wpipe->pipe_state & PIPE_EOF))
1322 		revents |= POLLHUP;
1323 
1324 	if (revents == 0) {
1325 		if (events & (POLLIN | POLLRDNORM)) {
1326 			selrecord(td, &rpipe->pipe_sel);
1327 			rpipe->pipe_state |= PIPE_SEL;
1328 		}
1329 
1330 		if (events & (POLLOUT | POLLWRNORM)) {
1331 			selrecord(td, &wpipe->pipe_sel);
1332 			wpipe->pipe_state |= PIPE_SEL;
1333 		}
1334 	}
1335 #ifdef MAC
1336 locked_error:
1337 #endif
1338 	PIPE_UNLOCK(rpipe);
1339 
1340 	return (revents);
1341 }
1342 
1343 /*
1344  * We shouldn't need locks here as we're doing a read and this should
1345  * be a natural race.
1346  */
1347 static int
1348 pipe_stat(fp, ub, active_cred, td)
1349 	struct file *fp;
1350 	struct stat *ub;
1351 	struct ucred *active_cred;
1352 	struct thread *td;
1353 {
1354 	struct pipe *pipe = fp->f_data;
1355 #ifdef MAC
1356 	int error;
1357 
1358 	PIPE_LOCK(pipe);
1359 	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
1360 	PIPE_UNLOCK(pipe);
1361 	if (error)
1362 		return (error);
1363 #endif
1364 	bzero(ub, sizeof(*ub));
1365 	ub->st_mode = S_IFIFO;
1366 	ub->st_blksize = pipe->pipe_buffer.size;
1367 	if (pipe->pipe_state & PIPE_DIRECTW)
1368 		ub->st_size = pipe->pipe_map.cnt;
1369 	else
1370 		ub->st_size = pipe->pipe_buffer.cnt;
1371 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1372 	ub->st_atimespec = pipe->pipe_atime;
1373 	ub->st_mtimespec = pipe->pipe_mtime;
1374 	ub->st_ctimespec = pipe->pipe_ctime;
1375 	ub->st_uid = fp->f_cred->cr_uid;
1376 	ub->st_gid = fp->f_cred->cr_gid;
1377 	/*
1378 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1379 	 * XXX (st_dev, st_ino) should be unique.
1380 	 */
1381 	return (0);
1382 }
1383 
1384 /* ARGSUSED */
1385 static int
1386 pipe_close(fp, td)
1387 	struct file *fp;
1388 	struct thread *td;
1389 {
1390 	struct pipe *cpipe = fp->f_data;
1391 
1392 	fp->f_ops = &badfileops;
1393 	fp->f_data = NULL;
1394 	funsetown(&cpipe->pipe_sigio);
1395 	pipeclose(cpipe);
1396 	return (0);
1397 }
1398 
1399 static void
1400 pipe_free_kmem(cpipe)
1401 	struct pipe *cpipe;
1402 {
1403 
1404 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1405 	    ("pipe_free_kmem: pipe mutex locked"));
1406 
1407 	if (cpipe->pipe_buffer.buffer != NULL) {
1408 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1409 			atomic_subtract_int(&nbigpipe, 1);
1410 		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1411 		vm_map_remove(pipe_map,
1412 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1413 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1414 		cpipe->pipe_buffer.buffer = NULL;
1415 	}
1416 #ifndef PIPE_NODIRECT
1417 	{
1418 		cpipe->pipe_map.cnt = 0;
1419 		cpipe->pipe_map.pos = 0;
1420 		cpipe->pipe_map.npages = 0;
1421 	}
1422 #endif
1423 }
1424 
1425 /*
1426  * shut down the pipe
1427  */
1428 static void
1429 pipeclose(cpipe)
1430 	struct pipe *cpipe;
1431 {
1432 	struct pipepair *pp;
1433 	struct pipe *ppipe;
1434 
1435 	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1436 
1437 	PIPE_LOCK(cpipe);
1438 	pp = cpipe->pipe_pair;
1439 
1440 	pipeselwakeup(cpipe);
1441 
1442 	/*
1443 	 * If the other side is blocked, wake it up saying that
1444 	 * we want to close it down.
1445 	 */
1446 	cpipe->pipe_state |= PIPE_EOF;
1447 	while (cpipe->pipe_busy) {
1448 		wakeup(cpipe);
1449 		cpipe->pipe_state |= PIPE_WANT;
1450 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1451 	}
1452 
1453 
1454 	/*
1455 	 * Disconnect from peer, if any.
1456 	 */
1457 	ppipe = cpipe->pipe_peer;
1458 	if (ppipe->pipe_present != 0) {
1459 		pipeselwakeup(ppipe);
1460 
1461 		ppipe->pipe_state |= PIPE_EOF;
1462 		wakeup(ppipe);
1463 		KNOTE(&ppipe->pipe_sel.si_note, 0);
1464 	}
1465 
1466 	/*
1467 	 * Mark this endpoint as free.  Release kmem resources.  We
1468 	 * don't mark this endpoint as unused until we've finished
1469 	 * doing that, or the pipe might disappear out from under
1470 	 * us.
1471 	 */
1472 	pipelock(cpipe, 0);
1473 	PIPE_UNLOCK(cpipe);
1474 	pipe_free_kmem(cpipe);
1475 	PIPE_LOCK(cpipe);
1476 	cpipe->pipe_present = 0;
1477 	pipeunlock(cpipe);
1478 
1479 	/*
1480 	 * If both endpoints are now closed, release the memory for the
1481 	 * pipe pair.  If not, unlock.
1482 	 */
1483 	if (ppipe->pipe_present == 0) {
1484 		PIPE_UNLOCK(cpipe);
1485 #ifdef MAC
1486 		mac_destroy_pipe(pp);
1487 #endif
1488 		uma_zfree(pipe_zone, cpipe->pipe_pair);
1489 	} else
1490 		PIPE_UNLOCK(cpipe);
1491 }
1492 
1493 /*ARGSUSED*/
1494 static int
1495 pipe_kqfilter(struct file *fp, struct knote *kn)
1496 {
1497 	struct pipe *cpipe;
1498 
1499 	cpipe = kn->kn_fp->f_data;
1500 	PIPE_LOCK(cpipe);
1501 	switch (kn->kn_filter) {
1502 	case EVFILT_READ:
1503 		kn->kn_fop = &pipe_rfiltops;
1504 		break;
1505 	case EVFILT_WRITE:
1506 		kn->kn_fop = &pipe_wfiltops;
1507 		if (!cpipe->pipe_peer->pipe_present) {
1508 			/* other end of pipe has been closed */
1509 			PIPE_UNLOCK(cpipe);
1510 			return (EPIPE);
1511 		}
1512 		cpipe = cpipe->pipe_peer;
1513 		break;
1514 	default:
1515 		PIPE_UNLOCK(cpipe);
1516 		return (1);
1517 	}
1518 
1519 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1520 	PIPE_UNLOCK(cpipe);
1521 	return (0);
1522 }
1523 
1524 static void
1525 filt_pipedetach(struct knote *kn)
1526 {
1527 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1528 
1529 	PIPE_LOCK(cpipe);
1530 	if (kn->kn_filter == EVFILT_WRITE) {
1531 		if (!cpipe->pipe_peer->pipe_present) {
1532 			PIPE_UNLOCK(cpipe);
1533 			return;
1534 		}
1535 		cpipe = cpipe->pipe_peer;
1536 	}
1537 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1538 	PIPE_UNLOCK(cpipe);
1539 }
1540 
1541 /*ARGSUSED*/
1542 static int
1543 filt_piperead(struct knote *kn, long hint)
1544 {
1545 	struct pipe *rpipe = kn->kn_fp->f_data;
1546 	struct pipe *wpipe = rpipe->pipe_peer;
1547 
1548 	PIPE_LOCK(rpipe);
1549 	kn->kn_data = rpipe->pipe_buffer.cnt;
1550 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1551 		kn->kn_data = rpipe->pipe_map.cnt;
1552 
1553 	if ((rpipe->pipe_state & PIPE_EOF) ||
1554 	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1555 		kn->kn_flags |= EV_EOF;
1556 		PIPE_UNLOCK(rpipe);
1557 		return (1);
1558 	}
1559 	PIPE_UNLOCK(rpipe);
1560 	return (kn->kn_data > 0);
1561 }
1562 
1563 /*ARGSUSED*/
1564 static int
1565 filt_pipewrite(struct knote *kn, long hint)
1566 {
1567 	struct pipe *rpipe = kn->kn_fp->f_data;
1568 	struct pipe *wpipe = rpipe->pipe_peer;
1569 
1570 	PIPE_LOCK(rpipe);
1571 	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1572 		kn->kn_data = 0;
1573 		kn->kn_flags |= EV_EOF;
1574 		PIPE_UNLOCK(rpipe);
1575 		return (1);
1576 	}
1577 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1578 	if (wpipe->pipe_state & PIPE_DIRECTW)
1579 		kn->kn_data = 0;
1580 
1581 	PIPE_UNLOCK(rpipe);
1582 	return (kn->kn_data >= PIPE_BUF);
1583 }
1584
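/*
 * Illustrative sketch (userland, not part of this file): the knote filters
 * above back kqueue(2) monitoring of pipe descriptors, with kn_data
 * reporting the bytes readable or writable.  Assuming fds[] came from
 * pipe(2):
 *
 *	struct kevent kev;
 *	int kq;
 *
 *	kq = kqueue();
 *	EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 */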