xref: /freebsd/sys/kern/sys_pipe.c (revision cf02bf2407cb217c99cc82f78b7a2e7f0703c9ee)
1 /*
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  *    are met.
18  */
19 
20 /*
21  * This file contains a high-performance replacement for the socket-based
22  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23  * all features of sockets, but does do everything that pipes normally
24  * do.
25  */
26 
27 /*
28  * This code has two modes of operation, a small write mode and a large
29  * write mode.  The small write mode acts like conventional pipes with
30  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33  * the receiving process can copy it directly from the pages in the sending
34  * process.
35  *
36  * If the sending process receives a signal, it is possible that it will
37  * go away, and certainly its address space can change, because control
38  * is returned back to the user-mode side.  In that case, the pipe code
39  * arranges to copy the buffer supplied by the user process, to a pageable
40  * kernel buffer, and the receiving process will grab the data from the
41  * pageable kernel buffer.  Since signals don't happen all that often,
42  * the copy operation is normally eliminated.
43  *
44  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45  * happen for small transfers so that the system will not spend all of
46  * its time context switching.
47  *
48  * In order to limit the resource use of pipes, one sysctl exists:
49  *
50  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51  * address space available to us in pipe_map.  Whenever the amount in use
52  * exceeds half of this value, all new pipes will be created with size
53  * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54  * as well.  This value is loader tunable only.
55  *
56  * This value is autotuned in subr_param.c.
57  *
58  * Memory usage may be monitored through the sysctls
59  * kern.ipc.pipes, kern.ipc.bigpipes and kern.ipc.pipekva.
60  *
61  *
62  * Locking rules:  There are two locks present here:  A mutex, used via
63  * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
64  * the flag, as mutexes cannot persist across uiomove().  The mutex
65  * exists only to guard access to the flag, and is not in itself a
66  * locking mechanism.
67  *
68  * As pipelock() may have to sleep before it can acquire the flag, it
69  * is important to reread all data after a call to pipelock(); everything
70  * in the structure may have changed.
71  */
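
/*
 * Illustrative sketch (not part of the implementation): the canonical
 * locking pattern implied by the rules above, as used by pipe_read()
 * and pipe_write() below.  The mutex is taken via PIPE_LOCK(), the
 * long-term flag via pipelock(), and the mutex is dropped around
 * uiomove() because it cannot be held while copying to or from user
 * space.  Error handling is elided.
 *
 *	PIPE_LOCK(cpipe);
 *	error = pipelock(cpipe, 1);		-- may sleep; recheck state after
 *	...
 *	PIPE_UNLOCK(cpipe);
 *	error = uiomove(buf, size, uio);	-- no mutex held across the copy
 *	PIPE_LOCK(cpipe);
 *	...
 *	pipeunlock(cpipe);			-- wakes any PIPE_LWANT sleeper
 *	PIPE_UNLOCK(cpipe);
 */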
72 
73 #include <sys/cdefs.h>
74 __FBSDID("$FreeBSD$");
75 
76 #include "opt_mac.h"
77 
78 #include <sys/param.h>
79 #include <sys/systm.h>
80 #include <sys/fcntl.h>
81 #include <sys/file.h>
82 #include <sys/filedesc.h>
83 #include <sys/filio.h>
84 #include <sys/kernel.h>
85 #include <sys/lock.h>
86 #include <sys/mac.h>
87 #include <sys/mutex.h>
88 #include <sys/ttycom.h>
89 #include <sys/stat.h>
90 #include <sys/malloc.h>
91 #include <sys/poll.h>
92 #include <sys/selinfo.h>
93 #include <sys/signalvar.h>
94 #include <sys/sysctl.h>
95 #include <sys/sysproto.h>
96 #include <sys/pipe.h>
97 #include <sys/proc.h>
98 #include <sys/vnode.h>
99 #include <sys/uio.h>
100 #include <sys/event.h>
101 
102 #include <vm/vm.h>
103 #include <vm/vm_param.h>
104 #include <vm/vm_object.h>
105 #include <vm/vm_kern.h>
106 #include <vm/vm_extern.h>
107 #include <vm/pmap.h>
108 #include <vm/vm_map.h>
109 #include <vm/vm_page.h>
110 #include <vm/uma.h>
111 
112 /*
113  * Use this define if you want to disable *fancy* VM things.  Expect an
114  * approx 30% decrease in transfer rate.  This could be useful for
115  * NetBSD or OpenBSD.
116  */
117 /* #define PIPE_NODIRECT */
118 
119 /*
120  * interfaces to the outside world
121  */
122 static fo_rdwr_t	pipe_read;
123 static fo_rdwr_t	pipe_write;
124 static fo_ioctl_t	pipe_ioctl;
125 static fo_poll_t	pipe_poll;
126 static fo_kqfilter_t	pipe_kqfilter;
127 static fo_stat_t	pipe_stat;
128 static fo_close_t	pipe_close;
129 
130 static struct fileops pipeops = {
131 	.fo_read = pipe_read,
132 	.fo_write = pipe_write,
133 	.fo_ioctl = pipe_ioctl,
134 	.fo_poll = pipe_poll,
135 	.fo_kqfilter = pipe_kqfilter,
136 	.fo_stat = pipe_stat,
137 	.fo_close = pipe_close,
138 	.fo_flags = DFLAG_PASSABLE
139 };
140 
141 static void	filt_pipedetach(struct knote *kn);
142 static int	filt_piperead(struct knote *kn, long hint);
143 static int	filt_pipewrite(struct knote *kn, long hint);
144 
145 static struct filterops pipe_rfiltops =
146 	{ 1, NULL, filt_pipedetach, filt_piperead };
147 static struct filterops pipe_wfiltops =
148 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
149 
150 /*
151  * Default pipe buffer size(s); this can be fairly large now because pipe
152  * space is pageable.  The pipe code will try to maintain locality of
153  * reference for performance reasons, so small amounts of outstanding I/O
154  * will not wipe the cache.
155  */
156 #define MINPIPESIZE (PIPE_SIZE/3)
157 #define MAXPIPESIZE (2*PIPE_SIZE/3)
158 
159 /*
160  * Limit the number of "big" pipes
161  */
162 #define LIMITBIGPIPES	32
163 static int nbigpipe;
164 
165 static int amountpipes;
166 static int amountpipekva;
167 
168 SYSCTL_DECL(_kern_ipc);
169 
170 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
171 	   &maxpipekva, 0, "Pipe KVA limit");
172 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
173 	   &amountpipes, 0, "Current # of pipes");
174 SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
175 	   &nbigpipe, 0, "Current # of big pipes");
176 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
177 	   &amountpipekva, 0, "Pipe KVA usage");
178 
179 static void pipeinit(void *dummy __unused);
180 static void pipeclose(struct pipe *cpipe);
181 static void pipe_free_kmem(struct pipe *cpipe);
182 static int pipe_create(struct pipe *pipe);
183 static __inline int pipelock(struct pipe *cpipe, int catch);
184 static __inline void pipeunlock(struct pipe *cpipe);
185 static __inline void pipeselwakeup(struct pipe *cpipe);
186 #ifndef PIPE_NODIRECT
187 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
188 static void pipe_destroy_write_buffer(struct pipe *wpipe);
189 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
190 static void pipe_clone_write_buffer(struct pipe *wpipe);
191 #endif
192 static int pipespace(struct pipe *cpipe, int size);
193 static int pipespace_new(struct pipe *cpipe, int size);
194 
195 static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
196 static void	pipe_zone_dtor(void *mem, int size, void *arg);
197 static int	pipe_zone_init(void *mem, int size, int flags);
198 static void	pipe_zone_fini(void *mem, int size);
199 
200 static uma_zone_t pipe_zone;
201 
202 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
203 
204 static void
205 pipeinit(void *dummy __unused)
206 {
207 
208 	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
209 	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
210 	    UMA_ALIGN_PTR, 0);
211 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
212 }
213 
214 static int
215 pipe_zone_ctor(void *mem, int size, void *arg, int flags)
216 {
217 	struct pipepair *pp;
218 	struct pipe *rpipe, *wpipe;
219 
220 	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
221 
222 	pp = (struct pipepair *)mem;
223 
224 	/*
225 	 * We zero both pipe endpoints to make sure all the kmem pointers
226 	 * are NULL, flag fields are zero'd, etc.  We timestamp both
227 	 * endpoints with the same time.
228 	 */
229 	rpipe = &pp->pp_rpipe;
230 	bzero(rpipe, sizeof(*rpipe));
231 	vfs_timestamp(&rpipe->pipe_ctime);
232 	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
233 
234 	wpipe = &pp->pp_wpipe;
235 	bzero(wpipe, sizeof(*wpipe));
236 	wpipe->pipe_ctime = rpipe->pipe_ctime;
237 	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
238 
239 	rpipe->pipe_peer = wpipe;
240 	rpipe->pipe_pair = pp;
241 	wpipe->pipe_peer = rpipe;
242 	wpipe->pipe_pair = pp;
243 
244 	/*
245 	 * Mark both endpoints as present; they will later get free'd
246 	 * one at a time.  When both are free'd, then the whole pair
247 	 * is released.
248 	 */
249 	rpipe->pipe_present = 1;
250 	wpipe->pipe_present = 1;
251 
252 	/*
253 	 * Eventually, the MAC Framework may initialize the label
254 	 * in ctor or init, but for now we do it elsewhere to avoid
255 	 * blocking in ctor or init.
256 	 */
257 	pp->pp_label = NULL;
258 
259 	atomic_add_int(&amountpipes, 2);
260 	return (0);
261 }
262 
263 static void
264 pipe_zone_dtor(void *mem, int size, void *arg)
265 {
266 	struct pipepair *pp;
267 
268 	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
269 
270 	pp = (struct pipepair *)mem;
271 
272 	atomic_subtract_int(&amountpipes, 2);
273 }
274 
275 static int
276 pipe_zone_init(void *mem, int size, int flags)
277 {
278 	struct pipepair *pp;
279 
280 	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
281 
282 	pp = (struct pipepair *)mem;
283 
284 	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
285 	return (0);
286 }
287 
288 static void
289 pipe_zone_fini(void *mem, int size)
290 {
291 	struct pipepair *pp;
292 
293 	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
294 
295 	pp = (struct pipepair *)mem;
296 
297 	mtx_destroy(&pp->pp_mtx);
298 }
299 
300 /*
301  * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
302  * let the zone pick up the pieces via pipeclose().
303  */
304 
305 /* ARGSUSED */
306 int
307 pipe(td, uap)
308 	struct thread *td;
309 	struct pipe_args /* {
310 		int	dummy;
311 	} */ *uap;
312 {
313 	struct filedesc *fdp = td->td_proc->p_fd;
314 	struct file *rf, *wf;
315 	struct pipepair *pp;
316 	struct pipe *rpipe, *wpipe;
317 	int fd, error;
318 
319 	pp = uma_zalloc(pipe_zone, M_WAITOK);
320 #ifdef MAC
321 	/*
322 	 * The MAC label is shared between the connected endpoints.  As a
323 	 * result mac_init_pipe() and mac_create_pipe() are called once
324 	 * for the pair, and not on the endpoints.
325 	 */
326 	mac_init_pipe(pp);
327 	mac_create_pipe(td->td_ucred, pp);
328 #endif
329 	rpipe = &pp->pp_rpipe;
330 	wpipe = &pp->pp_wpipe;
331 
332 	if (pipe_create(rpipe) || pipe_create(wpipe)) {
333 		pipeclose(rpipe);
334 		pipeclose(wpipe);
335 		return (ENFILE);
336 	}
337 
338 	rpipe->pipe_state |= PIPE_DIRECTOK;
339 	wpipe->pipe_state |= PIPE_DIRECTOK;
340 
341 	error = falloc(td, &rf, &fd);
342 	if (error) {
343 		pipeclose(rpipe);
344 		pipeclose(wpipe);
345 		return (error);
346 	}
347 	/* An extra reference on `rf' has been held for us by falloc(). */
348 	td->td_retval[0] = fd;
349 
350 	/*
351 	 * Warning: once we've gotten past allocation of the fd for the
352 	 * read-side, we can only drop the read side via fdrop() in order
353 	 * to avoid races against processes which manage to dup() the read
354 	 * side while we are blocked trying to allocate the write side.
355 	 */
356 	FILE_LOCK(rf);
357 	rf->f_flag = FREAD | FWRITE;
358 	rf->f_type = DTYPE_PIPE;
359 	rf->f_data = rpipe;
360 	rf->f_ops = &pipeops;
361 	FILE_UNLOCK(rf);
362 	error = falloc(td, &wf, &fd);
363 	if (error) {
364 		FILEDESC_LOCK(fdp);
365 		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
366 			fdp->fd_ofiles[td->td_retval[0]] = NULL;
367 			fdunused(fdp, td->td_retval[0]);
368 			FILEDESC_UNLOCK(fdp);
369 			fdrop(rf, td);
370 		} else {
371 			FILEDESC_UNLOCK(fdp);
372 		}
373 		fdrop(rf, td);
374 		/* rpipe has been closed by fdrop(). */
375 		pipeclose(wpipe);
376 		return (error);
377 	}
378 	/* An extra reference on `wf' has been held for us by falloc(). */
379 	FILE_LOCK(wf);
380 	wf->f_flag = FREAD | FWRITE;
381 	wf->f_type = DTYPE_PIPE;
382 	wf->f_data = wpipe;
383 	wf->f_ops = &pipeops;
384 	FILE_UNLOCK(wf);
385 	fdrop(wf, td);
386 	td->td_retval[1] = fd;
387 	fdrop(rf, td);
388 
389 	return (0);
390 }
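
/*
 * Illustrative userland sketch (not kernel code, needs <unistd.h>):
 * how the descriptor pair created above is typically consumed.
 * td_retval[0] becomes fildes[0], the read side backed by rpipe/rf,
 * and td_retval[1] becomes fildes[1], the write side backed by
 * wpipe/wf.  Error checking is omitted.
 *
 *	int fildes[2];
 *	char buf[6];
 *
 *	pipe(fildes);
 *	if (fork() == 0) {
 *		close(fildes[0]);
 *		write(fildes[1], "hello", 5);
 *		_exit(0);
 *	}
 *	close(fildes[1]);
 *	read(fildes[0], buf, 5);	-- blocks in pipe_read() until data arrives
 */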
391 
392 /*
393  * Allocate kva for the pipe circular buffer; the space is pageable.
394  * This routine will 'realloc' the size of a pipe safely; if it fails,
395  * it will retain the old buffer.
396  * On failure it returns ENOMEM.
397  */
398 static int
399 pipespace_new(cpipe, size)
400 	struct pipe *cpipe;
401 	int size;
402 {
403 	caddr_t buffer;
404 	int error;
405 	static int curfail = 0;
406 	static struct timeval lastfail;
407 
408 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
409 
410 	size = round_page(size);
411 	/*
412 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
413 	 */
414 	buffer = (caddr_t) vm_map_min(pipe_map);
415 
416 	/*
417 	 * The map entry is, by default, pageable.
418 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
419 	 */
420 	error = vm_map_find(pipe_map, NULL, 0,
421 		(vm_offset_t *) &buffer, size, 1,
422 		VM_PROT_ALL, VM_PROT_ALL, 0);
423 	if (error != KERN_SUCCESS) {
424 		if (ppsratecheck(&lastfail, &curfail, 1))
425 			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
426 		return (ENOMEM);
427 	}
428 
429 	/* free old resources if we're resizing */
430 	pipe_free_kmem(cpipe);
431 	cpipe->pipe_buffer.buffer = buffer;
432 	cpipe->pipe_buffer.size = size;
433 	cpipe->pipe_buffer.in = 0;
434 	cpipe->pipe_buffer.out = 0;
435 	cpipe->pipe_buffer.cnt = 0;
436 	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
437 	return (0);
438 }
439 
440 /*
441  * Wrapper for pipespace_new() that performs locking assertions.
442  */
443 static int
444 pipespace(cpipe, size)
445 	struct pipe *cpipe;
446 	int size;
447 {
448 
449 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
450 		("Unlocked pipe passed to pipespace"));
451 	return (pipespace_new(cpipe, size));
452 }
453 
454 /*
455  * lock a pipe for I/O, blocking other access
456  */
457 static __inline int
458 pipelock(cpipe, catch)
459 	struct pipe *cpipe;
460 	int catch;
461 {
462 	int error;
463 
464 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
465 	while (cpipe->pipe_state & PIPE_LOCKFL) {
466 		cpipe->pipe_state |= PIPE_LWANT;
467 		error = msleep(cpipe, PIPE_MTX(cpipe),
468 		    catch ? (PRIBIO | PCATCH) : PRIBIO,
469 		    "pipelk", 0);
470 		if (error != 0)
471 			return (error);
472 	}
473 	cpipe->pipe_state |= PIPE_LOCKFL;
474 	return (0);
475 }
476 
477 /*
478  * unlock a pipe I/O lock
479  */
480 static __inline void
481 pipeunlock(cpipe)
482 	struct pipe *cpipe;
483 {
484 
485 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
486 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
487 		("Unlocked pipe passed to pipeunlock"));
488 	cpipe->pipe_state &= ~PIPE_LOCKFL;
489 	if (cpipe->pipe_state & PIPE_LWANT) {
490 		cpipe->pipe_state &= ~PIPE_LWANT;
491 		wakeup(cpipe);
492 	}
493 }
494 
495 static __inline void
496 pipeselwakeup(cpipe)
497 	struct pipe *cpipe;
498 {
499 
500 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
501 	if (cpipe->pipe_state & PIPE_SEL) {
502 		cpipe->pipe_state &= ~PIPE_SEL;
503 		selwakeuppri(&cpipe->pipe_sel, PSOCK);
504 	}
505 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
506 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
507 	KNOTE(&cpipe->pipe_sel.si_note, 0);
508 }
509 
510 /*
511  * Initialize and allocate VM and memory for pipe.  The structure
512  * will start out zero'd from the ctor, so we just manage the kmem.
513  */
514 static int
515 pipe_create(pipe)
516 	struct pipe *pipe;
517 {
518 	int error;
519 
520 	/*
521 	 * Reduce to 1/4th pipe size if we're over our global max.
522 	 */
523 	if (amountpipekva > maxpipekva / 2)
524 		error = pipespace_new(pipe, SMALL_PIPE_SIZE);
525 	else
526 		error = pipespace_new(pipe, PIPE_SIZE);
527 	return (error);
528 }
529 
530 /* ARGSUSED */
531 static int
532 pipe_read(fp, uio, active_cred, flags, td)
533 	struct file *fp;
534 	struct uio *uio;
535 	struct ucred *active_cred;
536 	struct thread *td;
537 	int flags;
538 {
539 	struct pipe *rpipe = fp->f_data;
540 	int error;
541 	int nread = 0;
542 	u_int size;
543 
544 	PIPE_LOCK(rpipe);
545 	++rpipe->pipe_busy;
546 	error = pipelock(rpipe, 1);
547 	if (error)
548 		goto unlocked_error;
549 
550 #ifdef MAC
551 	error = mac_check_pipe_read(active_cred, rpipe->pipe_pair);
552 	if (error)
553 		goto locked_error;
554 #endif
555 
556 	while (uio->uio_resid) {
557 		/*
558 		 * normal pipe buffer receive
559 		 */
560 		if (rpipe->pipe_buffer.cnt > 0) {
561 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
562 			if (size > rpipe->pipe_buffer.cnt)
563 				size = rpipe->pipe_buffer.cnt;
564 			if (size > (u_int) uio->uio_resid)
565 				size = (u_int) uio->uio_resid;
566 
567 			PIPE_UNLOCK(rpipe);
568 			error = uiomove(
569 			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
570 			    size, uio);
571 			PIPE_LOCK(rpipe);
572 			if (error)
573 				break;
574 
575 			rpipe->pipe_buffer.out += size;
576 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
577 				rpipe->pipe_buffer.out = 0;
578 
579 			rpipe->pipe_buffer.cnt -= size;
580 
581 			/*
582 			 * If there is no more to read in the pipe, reset
583 			 * its pointers to the beginning.  This improves
584 			 * cache hit stats.
585 			 */
586 			if (rpipe->pipe_buffer.cnt == 0) {
587 				rpipe->pipe_buffer.in = 0;
588 				rpipe->pipe_buffer.out = 0;
589 			}
590 			nread += size;
591 #ifndef PIPE_NODIRECT
592 		/*
593 		 * Direct copy, bypassing a kernel buffer.
594 		 */
595 		} else if ((size = rpipe->pipe_map.cnt) &&
596 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
597 			if (size > (u_int) uio->uio_resid)
598 				size = (u_int) uio->uio_resid;
599 
600 			PIPE_UNLOCK(rpipe);
601 			error = uiomove_fromphys(rpipe->pipe_map.ms,
602 			    rpipe->pipe_map.pos, size, uio);
603 			PIPE_LOCK(rpipe);
604 			if (error)
605 				break;
606 			nread += size;
607 			rpipe->pipe_map.pos += size;
608 			rpipe->pipe_map.cnt -= size;
609 			if (rpipe->pipe_map.cnt == 0) {
610 				rpipe->pipe_state &= ~PIPE_DIRECTW;
611 				wakeup(rpipe);
612 			}
613 #endif
614 		} else {
615 			/*
616 			 * detect EOF condition
617 			 * read returns 0 on EOF, no need to set error
618 			 */
619 			if (rpipe->pipe_state & PIPE_EOF)
620 				break;
621 
622 			/*
623 			 * If the "write-side" has been blocked, wake it up now.
624 			 */
625 			if (rpipe->pipe_state & PIPE_WANTW) {
626 				rpipe->pipe_state &= ~PIPE_WANTW;
627 				wakeup(rpipe);
628 			}
629 
630 			/*
631 			 * Break if some data was read.
632 			 */
633 			if (nread > 0)
634 				break;
635 
636 			/*
637 			 * Unlock the pipe buffer for our remaining processing.
638 			 * We will either break out with an error or we will
639 			 * sleep and relock to loop.
640 			 */
641 			pipeunlock(rpipe);
642 
643 			/*
644 			 * Handle non-blocking mode operation or
645 			 * wait for more data.
646 			 */
647 			if (fp->f_flag & FNONBLOCK) {
648 				error = EAGAIN;
649 			} else {
650 				rpipe->pipe_state |= PIPE_WANTR;
651 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
652 				    PRIBIO | PCATCH,
653 				    "piperd", 0)) == 0)
654 					error = pipelock(rpipe, 1);
655 			}
656 			if (error)
657 				goto unlocked_error;
658 		}
659 	}
660 #ifdef MAC
661 locked_error:
662 #endif
663 	pipeunlock(rpipe);
664 
665 	/* XXX: should probably do this before getting any locks. */
666 	if (error == 0)
667 		vfs_timestamp(&rpipe->pipe_atime);
668 unlocked_error:
669 	--rpipe->pipe_busy;
670 
671 	/*
672 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
673 	 */
674 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
675 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
676 		wakeup(rpipe);
677 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
678 		/*
679 		 * Handle write blocking hysteresis.
680 		 */
681 		if (rpipe->pipe_state & PIPE_WANTW) {
682 			rpipe->pipe_state &= ~PIPE_WANTW;
683 			wakeup(rpipe);
684 		}
685 	}
686 
687 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
688 		pipeselwakeup(rpipe);
689 
690 	PIPE_UNLOCK(rpipe);
691 	return (error);
692 }
693 
694 #ifndef PIPE_NODIRECT
695 /*
696  * Map the sending process's buffer into kernel space and wire it.
697  * This is similar to a physical write operation.
698  */
699 static int
700 pipe_build_write_buffer(wpipe, uio)
701 	struct pipe *wpipe;
702 	struct uio *uio;
703 {
704 	pmap_t pmap;
705 	u_int size;
706 	int i, j;
707 	vm_offset_t addr, endaddr;
708 
709 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
710 
711 	size = (u_int) uio->uio_iov->iov_len;
712 	if (size > wpipe->pipe_buffer.size)
713 		size = wpipe->pipe_buffer.size;
714 
715 	pmap = vmspace_pmap(curproc->p_vmspace);
716 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
717 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
718 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
719 		/*
720 		 * vm_fault_quick() can sleep.  Consequently,
721 		 * vm_page_lock_queues() and vm_page_unlock_queues()
722 		 * should not be performed outside of this loop.
723 		 */
724 	race:
725 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
726 			vm_page_lock_queues();
727 			for (j = 0; j < i; j++)
728 				vm_page_unhold(wpipe->pipe_map.ms[j]);
729 			vm_page_unlock_queues();
730 			return (EFAULT);
731 		}
732 		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
733 		    VM_PROT_READ);
734 		if (wpipe->pipe_map.ms[i] == NULL)
735 			goto race;
736 	}
737 
738 /*
739  * set up the control block
740  */
741 	wpipe->pipe_map.npages = i;
742 	wpipe->pipe_map.pos =
743 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
744 	wpipe->pipe_map.cnt = size;
745 
746 /*
747  * and update the uio data
748  */
749 
750 	uio->uio_iov->iov_len -= size;
751 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
752 	if (uio->uio_iov->iov_len == 0)
753 		uio->uio_iov++;
754 	uio->uio_resid -= size;
755 	uio->uio_offset += size;
756 	return (0);
757 }
758 
759 /*
760  * unmap and unwire the process buffer
761  */
762 static void
763 pipe_destroy_write_buffer(wpipe)
764 	struct pipe *wpipe;
765 {
766 	int i;
767 
768 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
769 	vm_page_lock_queues();
770 	for (i = 0; i < wpipe->pipe_map.npages; i++) {
771 		vm_page_unhold(wpipe->pipe_map.ms[i]);
772 	}
773 	vm_page_unlock_queues();
774 	wpipe->pipe_map.npages = 0;
775 }
776 
777 /*
778  * In the case of a signal, the writing process might go away.  This
779  * code copies the data into the circular buffer so that the source
780  * pages can be freed without loss of data.
781  */
782 static void
783 pipe_clone_write_buffer(wpipe)
784 	struct pipe *wpipe;
785 {
786 	struct uio uio;
787 	struct iovec iov;
788 	int size;
789 	int pos;
790 
791 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
792 	size = wpipe->pipe_map.cnt;
793 	pos = wpipe->pipe_map.pos;
794 
795 	wpipe->pipe_buffer.in = size;
796 	wpipe->pipe_buffer.out = 0;
797 	wpipe->pipe_buffer.cnt = size;
798 	wpipe->pipe_state &= ~PIPE_DIRECTW;
799 
800 	PIPE_UNLOCK(wpipe);
801 	iov.iov_base = wpipe->pipe_buffer.buffer;
802 	iov.iov_len = size;
803 	uio.uio_iov = &iov;
804 	uio.uio_iovcnt = 1;
805 	uio.uio_offset = 0;
806 	uio.uio_resid = size;
807 	uio.uio_segflg = UIO_SYSSPACE;
808 	uio.uio_rw = UIO_READ;
809 	uio.uio_td = curthread;
810 	uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
811 	PIPE_LOCK(wpipe);
812 	pipe_destroy_write_buffer(wpipe);
813 }
814 
815 /*
816  * This implements the pipe buffer write mechanism.  Note that only
817  * a direct write OR a normal pipe write can be pending at any given time.
818  * If there are any characters in the pipe buffer, the direct write will
819  * be deferred until the receiving process grabs all of the bytes from
820  * the pipe buffer.  Then the direct mapping write is set up.
821  */
822 static int
823 pipe_direct_write(wpipe, uio)
824 	struct pipe *wpipe;
825 	struct uio *uio;
826 {
827 	int error;
828 
829 retry:
830 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
831 	error = pipelock(wpipe, 1);
832 	if (wpipe->pipe_state & PIPE_EOF)
833 		error = EPIPE;
834 	if (error) {
835 		pipeunlock(wpipe);
836 		goto error1;
837 	}
838 	while (wpipe->pipe_state & PIPE_DIRECTW) {
839 		if (wpipe->pipe_state & PIPE_WANTR) {
840 			wpipe->pipe_state &= ~PIPE_WANTR;
841 			wakeup(wpipe);
842 		}
843 		wpipe->pipe_state |= PIPE_WANTW;
844 		pipeunlock(wpipe);
845 		error = msleep(wpipe, PIPE_MTX(wpipe),
846 		    PRIBIO | PCATCH, "pipdww", 0);
847 		if (error)
848 			goto error1;
849 		else
850 			goto retry;
851 	}
852 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
853 	if (wpipe->pipe_buffer.cnt > 0) {
854 		if (wpipe->pipe_state & PIPE_WANTR) {
855 			wpipe->pipe_state &= ~PIPE_WANTR;
856 			wakeup(wpipe);
857 		}
858 		wpipe->pipe_state |= PIPE_WANTW;
859 		pipeunlock(wpipe);
860 		error = msleep(wpipe, PIPE_MTX(wpipe),
861 		    PRIBIO | PCATCH, "pipdwc", 0);
862 		if (error)
863 			goto error1;
864 		else
865 			goto retry;
866 	}
867 
868 	wpipe->pipe_state |= PIPE_DIRECTW;
869 
870 	PIPE_UNLOCK(wpipe);
871 	error = pipe_build_write_buffer(wpipe, uio);
872 	PIPE_LOCK(wpipe);
873 	if (error) {
874 		wpipe->pipe_state &= ~PIPE_DIRECTW;
875 		pipeunlock(wpipe);
876 		goto error1;
877 	}
878 
879 	error = 0;
880 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
881 		if (wpipe->pipe_state & PIPE_EOF) {
882 			pipe_destroy_write_buffer(wpipe);
883 			pipeselwakeup(wpipe);
884 			pipeunlock(wpipe);
885 			error = EPIPE;
886 			goto error1;
887 		}
888 		if (wpipe->pipe_state & PIPE_WANTR) {
889 			wpipe->pipe_state &= ~PIPE_WANTR;
890 			wakeup(wpipe);
891 		}
892 		pipeselwakeup(wpipe);
893 		pipeunlock(wpipe);
894 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
895 		    "pipdwt", 0);
896 		pipelock(wpipe, 0);
897 	}
898 
899 	if (wpipe->pipe_state & PIPE_EOF)
900 		error = EPIPE;
901 	if (wpipe->pipe_state & PIPE_DIRECTW) {
902 		/*
903 		 * this bit of trickery substitutes a kernel buffer for
904 		 * the process that might be going away.
905 		 */
906 		pipe_clone_write_buffer(wpipe);
907 	} else {
908 		pipe_destroy_write_buffer(wpipe);
909 	}
910 	pipeunlock(wpipe);
911 	return (error);
912 
913 error1:
914 	wakeup(wpipe);
915 	return (error);
916 }
917 #endif
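
/*
 * Illustrative summary (not additional code) of the direct-write
 * handshake implemented by pipe_direct_write() above and by the
 * PIPE_DIRECTW branch of pipe_read():
 *
 *	writer					reader
 *	------					------
 *	wait for any previous direct write
 *	to finish ("pipdww") and for the
 *	pipe buffer to drain ("pipdwc")
 *	wire the user pages, set
 *	PIPE_DIRECTW, sleep ("pipdwt")
 *						copy from pipe_map via
 *						uiomove_fromphys()
 *						clear PIPE_DIRECTW when
 *						drained and wake the writer
 *	unwire the pages, or clone them into
 *	the kernel buffer if the sleep was
 *	interrupted by a signal
 */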
918 
919 static int
920 pipe_write(fp, uio, active_cred, flags, td)
921 	struct file *fp;
922 	struct uio *uio;
923 	struct ucred *active_cred;
924 	struct thread *td;
925 	int flags;
926 {
927 	int error = 0;
928 	int orig_resid;
929 	struct pipe *wpipe, *rpipe;
930 
931 	rpipe = fp->f_data;
932 	wpipe = rpipe->pipe_peer;
933 
934 	PIPE_LOCK(rpipe);
935 	error = pipelock(wpipe, 1);
936 	if (error) {
937 		PIPE_UNLOCK(rpipe);
938 		return (error);
939 	}
940 	/*
941 	 * detect loss of pipe read side, issue SIGPIPE if lost.
942 	 */
943 	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
944 		pipeunlock(wpipe);
945 		PIPE_UNLOCK(rpipe);
946 		return (EPIPE);
947 	}
948 #ifdef MAC
949 	error = mac_check_pipe_write(active_cred, wpipe->pipe_pair);
950 	if (error) {
951 		pipeunlock(wpipe);
952 		PIPE_UNLOCK(rpipe);
953 		return (error);
954 	}
955 #endif
956 	++wpipe->pipe_busy;
957 
958 	/*
959 	 * If it is advantageous to resize the pipe buffer, do
960 	 * so.
961 	 */
962 	if ((uio->uio_resid > PIPE_SIZE) &&
963 		(amountpipekva < maxpipekva / 2) &&
964 		(nbigpipe < LIMITBIGPIPES) &&
965 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
966 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
967 		(wpipe->pipe_buffer.cnt == 0)) {
968 
969 		PIPE_UNLOCK(wpipe);
970 		if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
971 			atomic_add_int(&nbigpipe, 1);
972 		PIPE_LOCK(wpipe);
973 	}
974 
975 	pipeunlock(wpipe);
976 
977 	orig_resid = uio->uio_resid;
978 
979 	while (uio->uio_resid) {
980 		int space;
981 
982 		pipelock(wpipe, 0);
983 		if (wpipe->pipe_state & PIPE_EOF) {
984 			pipeunlock(wpipe);
985 			error = EPIPE;
986 			break;
987 		}
988 #ifndef PIPE_NODIRECT
989 		/*
990 		 * If the transfer is large, we can gain performance if
991 		 * we do process-to-process copies directly.
992 		 * If the write is non-blocking, we don't use the
993 		 * direct write mechanism.
994 		 *
995 		 * The direct write mechanism will detect the reader going
996 		 * away on us.
997 		 */
998 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
999 		    (fp->f_flag & FNONBLOCK) == 0) {
1000 			pipeunlock(wpipe);
1001 			error = pipe_direct_write(wpipe, uio);
1002 			if (error)
1003 				break;
1004 			continue;
1005 		}
1006 #endif
1007 
1008 		/*
1009 		 * Pipe buffered writes cannot be coincidental with
1010 		 * direct writes.  We wait until the currently executing
1011 		 * direct write is completed before we start filling the
1012 		 * pipe buffer.  We break out if a signal occurs or the
1013 		 * reader goes away.
1014 		 */
1015 		if (wpipe->pipe_state & PIPE_DIRECTW) {
1016 			if (wpipe->pipe_state & PIPE_WANTR) {
1017 				wpipe->pipe_state &= ~PIPE_WANTR;
1018 				wakeup(wpipe);
1019 			}
1020 			pipeunlock(wpipe);
1021 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
1022 			    "pipbww", 0);
1023 			if (error)
1024 				break;
1025 			else
1026 				continue;
1027 		}
1028 
1029 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1030 
1031 		/* Writes of size <= PIPE_BUF must be atomic. */
1032 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
1033 			space = 0;
1034 
1035 		if (space > 0) {
1036 			int size;	/* Transfer size */
1037 			int segsize;	/* first segment to transfer */
1038 
1039 			/*
1040 			 * Transfer size is minimum of uio transfer
1041 			 * and free space in pipe buffer.
1042 			 */
1043 			if (space > uio->uio_resid)
1044 				size = uio->uio_resid;
1045 			else
1046 				size = space;
1047 			/*
1048 			 * First segment to transfer is minimum of
1049 			 * transfer size and contiguous space in
1050 			 * pipe buffer.  If first segment to transfer
1051 			 * is less than the transfer size, we've got
1052 			 * a wraparound in the buffer.
1053 			 */
1054 			segsize = wpipe->pipe_buffer.size -
1055 				wpipe->pipe_buffer.in;
1056 			if (segsize > size)
1057 				segsize = size;
1058 
1059 			/* Transfer first segment */
1060 
1061 			PIPE_UNLOCK(rpipe);
1062 			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1063 					segsize, uio);
1064 			PIPE_LOCK(rpipe);
1065 
1066 			if (error == 0 && segsize < size) {
1067 				KASSERT(wpipe->pipe_buffer.in + segsize ==
1068 					wpipe->pipe_buffer.size,
1069 					("Pipe buffer wraparound disappeared"));
1070 				/*
1071 				 * Transfer remaining part now, to
1072 				 * support atomic writes.  Wraparound
1073 				 * happened.
1074 				 */
1075 
1076 				PIPE_UNLOCK(rpipe);
1077 				error = uiomove(
1078 				    &wpipe->pipe_buffer.buffer[0],
1079 				    size - segsize, uio);
1080 				PIPE_LOCK(rpipe);
1081 			}
1082 			if (error == 0) {
1083 				wpipe->pipe_buffer.in += size;
1084 				if (wpipe->pipe_buffer.in >=
1085 				    wpipe->pipe_buffer.size) {
1086 					KASSERT(wpipe->pipe_buffer.in ==
1087 						size - segsize +
1088 						wpipe->pipe_buffer.size,
1089 						("Expected wraparound bad"));
1090 					wpipe->pipe_buffer.in = size - segsize;
1091 				}
1092 
1093 				wpipe->pipe_buffer.cnt += size;
1094 				KASSERT(wpipe->pipe_buffer.cnt <=
1095 					wpipe->pipe_buffer.size,
1096 					("Pipe buffer overflow"));
1097 			}
1098 			pipeunlock(wpipe);
1099 		} else {
1100 			/*
1101 			 * If the "read-side" has been blocked, wake it up now.
1102 			 */
1103 			if (wpipe->pipe_state & PIPE_WANTR) {
1104 				wpipe->pipe_state &= ~PIPE_WANTR;
1105 				wakeup(wpipe);
1106 			}
1107 
1108 			/*
1109 			 * don't block on non-blocking I/O
1110 			 */
1111 			if (fp->f_flag & FNONBLOCK) {
1112 				error = EAGAIN;
1113 				pipeunlock(wpipe);
1114 				break;
1115 			}
1116 
1117 			/*
1118 			 * We have no more space and have something to offer,
1119 			 * wake up select/poll.
1120 			 */
1121 			pipeselwakeup(wpipe);
1122 
1123 			wpipe->pipe_state |= PIPE_WANTW;
1124 			pipeunlock(wpipe);
1125 			error = msleep(wpipe, PIPE_MTX(rpipe),
1126 			    PRIBIO | PCATCH, "pipewr", 0);
1127 			if (error != 0)
1128 				break;
1129 		}
1130 	}
1131 
1132 	pipelock(wpipe, 0);
1133 	--wpipe->pipe_busy;
1134 
1135 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1136 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1137 		wakeup(wpipe);
1138 	} else if (wpipe->pipe_buffer.cnt > 0) {
1139 		/*
1140 		 * If we have put any characters in the buffer, we wake up
1141 		 * the reader.
1142 		 */
1143 		if (wpipe->pipe_state & PIPE_WANTR) {
1144 			wpipe->pipe_state &= ~PIPE_WANTR;
1145 			wakeup(wpipe);
1146 		}
1147 	}
1148 
1149 	/*
1150 	 * Don't return EPIPE if I/O was successful
1151 	 */
1152 	if ((wpipe->pipe_buffer.cnt == 0) &&
1153 	    (uio->uio_resid == 0) &&
1154 	    (error == EPIPE)) {
1155 		error = 0;
1156 	}
1157 
1158 	if (error == 0)
1159 		vfs_timestamp(&wpipe->pipe_mtime);
1160 
1161 	/*
1162 	 * We have something to offer,
1163 	 * wake up select/poll.
1164 	 */
1165 	if (wpipe->pipe_buffer.cnt)
1166 		pipeselwakeup(wpipe);
1167 
1168 	pipeunlock(wpipe);
1169 	PIPE_UNLOCK(rpipe);
1170 	return (error);
1171 }
1172 
1173 /*
1174  * we implement a very minimal set of ioctls for compatibility with sockets.
1175  */
1176 static int
1177 pipe_ioctl(fp, cmd, data, active_cred, td)
1178 	struct file *fp;
1179 	u_long cmd;
1180 	void *data;
1181 	struct ucred *active_cred;
1182 	struct thread *td;
1183 {
1184 	struct pipe *mpipe = fp->f_data;
1185 #ifdef MAC
1186 	int error;
1187 #endif
1188 
1189 	PIPE_LOCK(mpipe);
1190 
1191 #ifdef MAC
1192 	error = mac_check_pipe_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
1193 	if (error) {
1194 		PIPE_UNLOCK(mpipe);
1195 		return (error);
1196 	}
1197 #endif
1198 
1199 	switch (cmd) {
1200 
1201 	case FIONBIO:
1202 		PIPE_UNLOCK(mpipe);
1203 		return (0);
1204 
1205 	case FIOASYNC:
1206 		if (*(int *)data) {
1207 			mpipe->pipe_state |= PIPE_ASYNC;
1208 		} else {
1209 			mpipe->pipe_state &= ~PIPE_ASYNC;
1210 		}
1211 		PIPE_UNLOCK(mpipe);
1212 		return (0);
1213 
1214 	case FIONREAD:
1215 		if (mpipe->pipe_state & PIPE_DIRECTW)
1216 			*(int *)data = mpipe->pipe_map.cnt;
1217 		else
1218 			*(int *)data = mpipe->pipe_buffer.cnt;
1219 		PIPE_UNLOCK(mpipe);
1220 		return (0);
1221 
1222 	case FIOSETOWN:
1223 		PIPE_UNLOCK(mpipe);
1224 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1225 
1226 	case FIOGETOWN:
1227 		PIPE_UNLOCK(mpipe);
1228 		*(int *)data = fgetown(&mpipe->pipe_sigio);
1229 		return (0);
1230 
1231 	/* This is deprecated; FIOSETOWN should be used instead. */
1232 	case TIOCSPGRP:
1233 		PIPE_UNLOCK(mpipe);
1234 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1235 
1236 	/* This is deprecated; FIOGETOWN should be used instead. */
1237 	case TIOCGPGRP:
1238 		PIPE_UNLOCK(mpipe);
1239 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1240 		return (0);
1241 
1242 	}
1243 	PIPE_UNLOCK(mpipe);
1244 	return (ENOTTY);
1245 }
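
/*
 * Illustrative userland sketch (not kernel code, needs <sys/ioctl.h>):
 * the FIONREAD case above is what services an ioctl such as this,
 * reporting the number of bytes currently readable, whether buffered
 * or pending in a direct write.  fildes[0] is the read side of a
 * pipe(2) descriptor pair.
 *
 *	int nread;
 *
 *	if (ioctl(fildes[0], FIONREAD, &nread) == 0)
 *		printf("%d bytes ready\n", nread);
 */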
1246 
1247 static int
1248 pipe_poll(fp, events, active_cred, td)
1249 	struct file *fp;
1250 	int events;
1251 	struct ucred *active_cred;
1252 	struct thread *td;
1253 {
1254 	struct pipe *rpipe = fp->f_data;
1255 	struct pipe *wpipe;
1256 	int revents = 0;
1257 #ifdef MAC
1258 	int error;
1259 #endif
1260 
1261 	wpipe = rpipe->pipe_peer;
1262 	PIPE_LOCK(rpipe);
1263 #ifdef MAC
1264 	error = mac_check_pipe_poll(active_cred, rpipe->pipe_pair);
1265 	if (error)
1266 		goto locked_error;
1267 #endif
1268 	if (events & (POLLIN | POLLRDNORM))
1269 		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1270 		    (rpipe->pipe_buffer.cnt > 0) ||
1271 		    (rpipe->pipe_state & PIPE_EOF))
1272 			revents |= events & (POLLIN | POLLRDNORM);
1273 
1274 	if (events & (POLLOUT | POLLWRNORM))
1275 		if (!wpipe->pipe_present || (wpipe->pipe_state & PIPE_EOF) ||
1276 		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1277 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1278 			revents |= events & (POLLOUT | POLLWRNORM);
1279 
1280 	if ((rpipe->pipe_state & PIPE_EOF) ||
1281 	    (!wpipe->pipe_present) ||
1282 	    (wpipe->pipe_state & PIPE_EOF))
1283 		revents |= POLLHUP;
1284 
1285 	if (revents == 0) {
1286 		if (events & (POLLIN | POLLRDNORM)) {
1287 			selrecord(td, &rpipe->pipe_sel);
1288 			rpipe->pipe_state |= PIPE_SEL;
1289 		}
1290 
1291 		if (events & (POLLOUT | POLLWRNORM)) {
1292 			selrecord(td, &wpipe->pipe_sel);
1293 			wpipe->pipe_state |= PIPE_SEL;
1294 		}
1295 	}
1296 #ifdef MAC
1297 locked_error:
1298 #endif
1299 	PIPE_UNLOCK(rpipe);
1300 
1301 	return (revents);
1302 }
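
/*
 * Illustrative userland sketch (not kernel code, needs <poll.h>):
 * pipe_poll() above is what services a poll(2) on a pipe.  POLLIN is
 * reported once data or EOF is available on the read side; POLLHUP
 * once the other end has gone away.  fildes[0] is the read side of a
 * pipe(2) descriptor pair.
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = fildes[0];
 *	pfd.events = POLLIN;
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		read(fildes[0], buf, sizeof(buf));
 */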
1303 
1304 /*
1305  * We shouldn't need locks here as we're doing a read and this should
1306  * be a natural race.
1307  */
1308 static int
1309 pipe_stat(fp, ub, active_cred, td)
1310 	struct file *fp;
1311 	struct stat *ub;
1312 	struct ucred *active_cred;
1313 	struct thread *td;
1314 {
1315 	struct pipe *pipe = fp->f_data;
1316 #ifdef MAC
1317 	int error;
1318 
1319 	PIPE_LOCK(pipe);
1320 	error = mac_check_pipe_stat(active_cred, pipe->pipe_pair);
1321 	PIPE_UNLOCK(pipe);
1322 	if (error)
1323 		return (error);
1324 #endif
1325 	bzero(ub, sizeof(*ub));
1326 	ub->st_mode = S_IFIFO;
1327 	ub->st_blksize = pipe->pipe_buffer.size;
1328 	if (pipe->pipe_state & PIPE_DIRECTW)
1329 		ub->st_size = pipe->pipe_map.cnt;
1330 	else
1331 		ub->st_size = pipe->pipe_buffer.cnt;
1332 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1333 	ub->st_atimespec = pipe->pipe_atime;
1334 	ub->st_mtimespec = pipe->pipe_mtime;
1335 	ub->st_ctimespec = pipe->pipe_ctime;
1336 	ub->st_uid = fp->f_cred->cr_uid;
1337 	ub->st_gid = fp->f_cred->cr_gid;
1338 	/*
1339 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1340 	 * XXX (st_dev, st_ino) should be unique.
1341 	 */
1342 	return (0);
1343 }
1344 
1345 /* ARGSUSED */
1346 static int
1347 pipe_close(fp, td)
1348 	struct file *fp;
1349 	struct thread *td;
1350 {
1351 	struct pipe *cpipe = fp->f_data;
1352 
1353 	fp->f_ops = &badfileops;
1354 	fp->f_data = NULL;
1355 	funsetown(&cpipe->pipe_sigio);
1356 	pipeclose(cpipe);
1357 	return (0);
1358 }
1359 
1360 static void
1361 pipe_free_kmem(cpipe)
1362 	struct pipe *cpipe;
1363 {
1364 
1365 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
1366 	    ("pipe_free_kmem: pipe mutex locked"));
1367 
1368 	if (cpipe->pipe_buffer.buffer != NULL) {
1369 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1370 			atomic_subtract_int(&nbigpipe, 1);
1371 		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1372 		vm_map_remove(pipe_map,
1373 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1374 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1375 		cpipe->pipe_buffer.buffer = NULL;
1376 	}
1377 #ifndef PIPE_NODIRECT
1378 	{
1379 		cpipe->pipe_map.cnt = 0;
1380 		cpipe->pipe_map.pos = 0;
1381 		cpipe->pipe_map.npages = 0;
1382 	}
1383 #endif
1384 }
1385 
1386 /*
1387  * shut down the pipe
1388  */
1389 static void
1390 pipeclose(cpipe)
1391 	struct pipe *cpipe;
1392 {
1393 	struct pipepair *pp;
1394 	struct pipe *ppipe;
1395 
1396 	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
1397 
1398 	PIPE_LOCK(cpipe);
1399 	pipelock(cpipe, 0);
1400 	pp = cpipe->pipe_pair;
1401 
1402 	pipeselwakeup(cpipe);
1403 
1404 	/*
1405 	 * If the other side is blocked, wake it up saying that
1406 	 * we want to close it down.
1407 	 */
1408 	cpipe->pipe_state |= PIPE_EOF;
1409 	while (cpipe->pipe_busy) {
1410 		wakeup(cpipe);
1411 		cpipe->pipe_state |= PIPE_WANT;
1412 		pipeunlock(cpipe);
1413 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1414 		pipelock(cpipe, 0);
1415 	}
1416 
1417 
1418 	/*
1419 	 * Disconnect from peer, if any.
1420 	 */
1421 	ppipe = cpipe->pipe_peer;
1422 	if (ppipe->pipe_present != 0) {
1423 		pipeselwakeup(ppipe);
1424 
1425 		ppipe->pipe_state |= PIPE_EOF;
1426 		wakeup(ppipe);
1427 		KNOTE(&ppipe->pipe_sel.si_note, 0);
1428 	}
1429 
1430 	/*
1431 	 * Mark this endpoint as free.  Release kmem resources.  We
1432 	 * don't mark this endpoint as unused until we've finished
1433 	 * doing that, or the pipe might disappear out from under
1434 	 * us.
1435 	 */
1436 	PIPE_UNLOCK(cpipe);
1437 	pipe_free_kmem(cpipe);
1438 	PIPE_LOCK(cpipe);
1439 	cpipe->pipe_present = 0;
1440 	pipeunlock(cpipe);
1441 
1442 	/*
1443 	 * If both endpoints are now closed, release the memory for the
1444 	 * pipe pair.  If not, unlock.
1445 	 */
1446 	if (ppipe->pipe_present == 0) {
1447 		PIPE_UNLOCK(cpipe);
1448 #ifdef MAC
1449 		mac_destroy_pipe(pp);
1450 #endif
1451 		uma_zfree(pipe_zone, cpipe->pipe_pair);
1452 	} else
1453 		PIPE_UNLOCK(cpipe);
1454 }
1455 
1456 /*ARGSUSED*/
1457 static int
1458 pipe_kqfilter(struct file *fp, struct knote *kn)
1459 {
1460 	struct pipe *cpipe;
1461 
1462 	cpipe = kn->kn_fp->f_data;
1463 	PIPE_LOCK(cpipe);
1464 	switch (kn->kn_filter) {
1465 	case EVFILT_READ:
1466 		kn->kn_fop = &pipe_rfiltops;
1467 		break;
1468 	case EVFILT_WRITE:
1469 		kn->kn_fop = &pipe_wfiltops;
1470 		if (!cpipe->pipe_peer->pipe_present) {
1471 			/* other end of pipe has been closed */
1472 			PIPE_UNLOCK(cpipe);
1473 			return (EPIPE);
1474 		}
1475 		cpipe = cpipe->pipe_peer;
1476 		break;
1477 	default:
1478 		PIPE_UNLOCK(cpipe);
1479 		return (1);
1480 	}
1481 
1482 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1483 	PIPE_UNLOCK(cpipe);
1484 	return (0);
1485 }
1486 
1487 static void
1488 filt_pipedetach(struct knote *kn)
1489 {
1490 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1491 
1492 	PIPE_LOCK(cpipe);
1493 	if (kn->kn_filter == EVFILT_WRITE) {
1494 		if (!cpipe->pipe_peer->pipe_present) {
1495 			PIPE_UNLOCK(cpipe);
1496 			return;
1497 		}
1498 		cpipe = cpipe->pipe_peer;
1499 	}
1500 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1501 	PIPE_UNLOCK(cpipe);
1502 }
1503 
1504 /*ARGSUSED*/
1505 static int
1506 filt_piperead(struct knote *kn, long hint)
1507 {
1508 	struct pipe *rpipe = kn->kn_fp->f_data;
1509 	struct pipe *wpipe = rpipe->pipe_peer;
1510 
1511 	PIPE_LOCK(rpipe);
1512 	kn->kn_data = rpipe->pipe_buffer.cnt;
1513 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1514 		kn->kn_data = rpipe->pipe_map.cnt;
1515 
1516 	if ((rpipe->pipe_state & PIPE_EOF) ||
1517 	    (!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1518 		kn->kn_flags |= EV_EOF;
1519 		PIPE_UNLOCK(rpipe);
1520 		return (1);
1521 	}
1522 	PIPE_UNLOCK(rpipe);
1523 	return (kn->kn_data > 0);
1524 }
1525 
1526 /*ARGSUSED*/
1527 static int
1528 filt_pipewrite(struct knote *kn, long hint)
1529 {
1530 	struct pipe *rpipe = kn->kn_fp->f_data;
1531 	struct pipe *wpipe = rpipe->pipe_peer;
1532 
1533 	PIPE_LOCK(rpipe);
1534 	if ((!wpipe->pipe_present) || (wpipe->pipe_state & PIPE_EOF)) {
1535 		kn->kn_data = 0;
1536 		kn->kn_flags |= EV_EOF;
1537 		PIPE_UNLOCK(rpipe);
1538 		return (1);
1539 	}
1540 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1541 	if (wpipe->pipe_state & PIPE_DIRECTW)
1542 		kn->kn_data = 0;
1543 
1544 	PIPE_UNLOCK(rpipe);
1545 	return (kn->kn_data >= PIPE_BUF);
1546 }
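
/*
 * Illustrative userland sketch (not kernel code, needs <sys/event.h>):
 * the knote filters above back kqueue(2) registrations such as this
 * one.  For EVFILT_READ, kev.data is filled from kn_data as computed
 * in filt_piperead(), i.e. the number of bytes available for reading.
 * fildes[0] is the read side of a pipe(2) descriptor pair.
 *
 *	struct kevent kev;
 *	int kq;
 *
 *	kq = kqueue();
 *	EV_SET(&kev, fildes[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	-- register the event
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	-- wait for data or EOF
 */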
1547