xref: /freebsd/sys/kern/sys_pipe.c (revision 52ec752989b2e6d4e9a59a8ff25d8ff596d85e62)
1 /*
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  *    are met.
18  */
19 
20 /*
21  * This file contains a high-performance replacement for the socket-based
22  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23  * all features of sockets, but does do everything that pipes normally
24  * do.
25  */
26 
27 /*
28  * This code has two modes of operation, a small write mode and a large
29  * write mode.  The small write mode acts like conventional pipes with
30  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33  * the receiving process can copy it directly from the pages in the sending
34  * process.
35  *
36  * If the sending process receives a signal, it is possible that it will
37  * go away, and certainly its address space can change, because control
38  * is returned to the user-mode side.  In that case, the pipe code
39  * arranges to copy the buffer supplied by the user process, to a pageable
40  * kernel buffer, and the receiving process will grab the data from the
41  * pageable kernel buffer.  Since signals don't happen all that often,
42  * the copy operation is normally eliminated.
43  *
44  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45  * happen for small transfers so that the system will not spend all of
46  * its time context switching.
47  *
48  * In order to limit the resource use of pipes, two sysctls exist:
49  *
50  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51  * address space available to us in pipe_map.  Whenever the amount in use
52  * exceeds half of this value, all new pipes will be created with size
53  * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54  * as well.  This value is loader tunable only.
55  *
56  * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
57  * be wired in order to facilitate direct copies using page flipping.
58  * Whenever this value is exceeded, pipes will fall back to using regular
59  * copies.  This value is sysctl controllable at all times.
60  *
61  * These values are autotuned in subr_param.c.
62  *
63  * Memory usage may be monitored through the sysctls
64  * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
65  *
66  */
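
/*
 * Illustrative userland sketch (editorial example, not part of this file):
 * the monitoring sysctls listed above can be read with sysctlbyname(3).
 * The names and int types follow the SYSCTL_INT() declarations below;
 * error handling is minimal.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	const char *names[] = {
		"kern.ipc.pipes", "kern.ipc.pipekva",
		"kern.ipc.pipekvawired", "kern.ipc.maxpipekva"
	};
	size_t len;
	int i, val;

	for (i = 0; i < 4; i++) {
		len = sizeof(val);
		if (sysctlbyname(names[i], &val, &len, NULL, 0) == 0)
			printf("%s = %d\n", names[i], val);
	}
	return (0);
}
#endif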
67 
68 #include <sys/cdefs.h>
69 __FBSDID("$FreeBSD$");
70 
71 #include "opt_mac.h"
72 
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/fcntl.h>
76 #include <sys/file.h>
77 #include <sys/filedesc.h>
78 #include <sys/filio.h>
79 #include <sys/kernel.h>
80 #include <sys/lock.h>
81 #include <sys/mac.h>
82 #include <sys/mutex.h>
83 #include <sys/ttycom.h>
84 #include <sys/stat.h>
85 #include <sys/malloc.h>
86 #include <sys/poll.h>
87 #include <sys/selinfo.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysproto.h>
91 #include <sys/pipe.h>
92 #include <sys/proc.h>
93 #include <sys/vnode.h>
94 #include <sys/uio.h>
95 #include <sys/event.h>
96 
97 #include <vm/vm.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_object.h>
100 #include <vm/vm_kern.h>
101 #include <vm/vm_extern.h>
102 #include <vm/pmap.h>
103 #include <vm/vm_map.h>
104 #include <vm/vm_page.h>
105 #include <vm/uma.h>
106 
107 /*
108  * Use this define if you want to disable *fancy* VM things.  Expect an
109  * approx 30% decrease in transfer rate.  This could be useful for
110  * NetBSD or OpenBSD.
111  */
112 /* #define PIPE_NODIRECT */
113 
114 /*
115  * interfaces to the outside world
116  */
117 static fo_rdwr_t	pipe_read;
118 static fo_rdwr_t	pipe_write;
119 static fo_ioctl_t	pipe_ioctl;
120 static fo_poll_t	pipe_poll;
121 static fo_kqfilter_t	pipe_kqfilter;
122 static fo_stat_t	pipe_stat;
123 static fo_close_t	pipe_close;
124 
125 static struct fileops pipeops = {
126 	.fo_read = pipe_read,
127 	.fo_write = pipe_write,
128 	.fo_ioctl = pipe_ioctl,
129 	.fo_poll = pipe_poll,
130 	.fo_kqfilter = pipe_kqfilter,
131 	.fo_stat = pipe_stat,
132 	.fo_close = pipe_close,
133 	.fo_flags = DFLAG_PASSABLE
134 };
135 
136 static void	filt_pipedetach(struct knote *kn);
137 static int	filt_piperead(struct knote *kn, long hint);
138 static int	filt_pipewrite(struct knote *kn, long hint);
139 
140 static struct filterops pipe_rfiltops =
141 	{ 1, NULL, filt_pipedetach, filt_piperead };
142 static struct filterops pipe_wfiltops =
143 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
144 
145 /*
146  * Default pipe buffer size(s); this can be fairly large now because pipe
147  * space is pageable.  The pipe code will try to maintain locality of
148  * reference for performance reasons, so small amounts of outstanding I/O
149  * will not wipe the cache.
150  */
151 #define MINPIPESIZE (PIPE_SIZE/3)
152 #define MAXPIPESIZE (2*PIPE_SIZE/3)
153 
154 /*
155  * Limit the number of "big" pipes
156  */
157 #define LIMITBIGPIPES	32
158 static int nbigpipe;
159 
160 static int amountpipes;
161 static int amountpipekva;
162 static int amountpipekvawired;
163 
164 SYSCTL_DECL(_kern_ipc);
165 
166 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
167 	   &maxpipekva, 0, "Pipe KVA limit");
168 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
169 	   &maxpipekvawired, 0, "Pipe KVA wired limit");
170 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
171 	   &amountpipes, 0, "Current # of pipes");
172 SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
173 	   &nbigpipe, 0, "Current # of big pipes");
174 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
175 	   &amountpipekva, 0, "Pipe KVA usage");
176 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
177 	   &amountpipekvawired, 0, "Pipe wired KVA usage");
178 
179 static void pipeinit(void *dummy __unused);
180 static void pipeclose(struct pipe *cpipe);
181 static void pipe_free_kmem(struct pipe *cpipe);
182 static int pipe_create(struct pipe **cpipep);
183 static __inline int pipelock(struct pipe *cpipe, int catch);
184 static __inline void pipeunlock(struct pipe *cpipe);
185 static __inline void pipeselwakeup(struct pipe *cpipe);
186 #ifndef PIPE_NODIRECT
187 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
188 static void pipe_destroy_write_buffer(struct pipe *wpipe);
189 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
190 static void pipe_clone_write_buffer(struct pipe *wpipe);
191 #endif
192 static int pipespace(struct pipe *cpipe, int size);
193 
194 static uma_zone_t pipe_zone;
195 
196 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
197 
198 static void
199 pipeinit(void *dummy __unused)
200 {
201 
202 	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL,
203 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
204 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
205 }
206 
207 /*
208  * The pipe system call for the DTYPE_PIPE type of pipes
209  */
210 
211 /* ARGSUSED */
212 int
213 pipe(td, uap)
214 	struct thread *td;
215 	struct pipe_args /* {
216 		int	dummy;
217 	} */ *uap;
218 {
219 	struct filedesc *fdp = td->td_proc->p_fd;
220 	struct file *rf, *wf;
221 	struct pipe *rpipe, *wpipe;
222 	struct mtx *pmtx;
223 	int fd, error;
224 
225 	rpipe = wpipe = NULL;
226 	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
227 		pipeclose(rpipe);
228 		pipeclose(wpipe);
229 		return (ENFILE);
230 	}
231 
232 	rpipe->pipe_state |= PIPE_DIRECTOK;
233 	wpipe->pipe_state |= PIPE_DIRECTOK;
234 
235 	error = falloc(td, &rf, &fd);
236 	if (error) {
237 		pipeclose(rpipe);
238 		pipeclose(wpipe);
239 		return (error);
240 	}
241 	/* An extra reference on `rf' has been held for us by falloc(). */
242 	td->td_retval[0] = fd;
243 
244 	/*
245 	 * Warning: once we've gotten past allocation of the fd for the
246 	 * read-side, we can only drop the read side via fdrop() in order
247 	 * to avoid races against processes which manage to dup() the read
248 	 * side while we are blocked trying to allocate the write side.
249 	 */
250 	FILE_LOCK(rf);
251 	rf->f_flag = FREAD | FWRITE;
252 	rf->f_type = DTYPE_PIPE;
253 	rf->f_data = rpipe;
254 	rf->f_ops = &pipeops;
255 	FILE_UNLOCK(rf);
256 	error = falloc(td, &wf, &fd);
257 	if (error) {
258 		FILEDESC_LOCK(fdp);
259 		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
260 			fdp->fd_ofiles[td->td_retval[0]] = NULL;
261 			fdunused(fdp, td->td_retval[0]);
262 			FILEDESC_UNLOCK(fdp);
263 			fdrop(rf, td);
264 		} else {
265 			FILEDESC_UNLOCK(fdp);
266 		}
267 		fdrop(rf, td);
268 		/* rpipe has been closed by fdrop(). */
269 		pipeclose(wpipe);
270 		return (error);
271 	}
272 	/* An extra reference on `wf' has been held for us by falloc(). */
273 	FILE_LOCK(wf);
274 	wf->f_flag = FREAD | FWRITE;
275 	wf->f_type = DTYPE_PIPE;
276 	wf->f_data = wpipe;
277 	wf->f_ops = &pipeops;
278 	FILE_UNLOCK(wf);
279 	fdrop(wf, td);
280 	td->td_retval[1] = fd;
281 	rpipe->pipe_peer = wpipe;
282 	wpipe->pipe_peer = rpipe;
283 #ifdef MAC
284 	/*
285 	 * struct pipe represents a pipe endpoint.  The MAC label is shared
286 	 * between the connected endpoints.  As a result mac_init_pipe() and
287 	 * mac_create_pipe() should only be called on one of the endpoints
288 	 * after they have been connected.
289 	 */
290 	mac_init_pipe(rpipe);
291 	mac_create_pipe(td->td_ucred, rpipe);
292 #endif
293 	pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO);
294 	mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
295 	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
296 	fdrop(rf, td);
297 
298 	return (0);
299 }
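
/*
 * Illustrative userland sketch (editorial example, not part of this file):
 * the descriptors set up above surface as fd[0] (read side) and fd[1]
 * (write side) of pipe(2).  A typical parent/child use looks like this.
 */
#if 0
#include <sys/wait.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[64];
	ssize_t n;
	int fd[2];

	if (pipe(fd) == -1)
		err(1, "pipe");
	switch (fork()) {
	case -1:
		err(1, "fork");
	case 0:				/* child: write side only */
		close(fd[0]);
		write(fd[1], "hello", 5);
		_exit(0);
	default:			/* parent: read side only */
		close(fd[1]);
		n = read(fd[0], buf, sizeof(buf));
		if (n > 0)
			printf("read %zd bytes: %.*s\n", n, (int)n, buf);
		wait(NULL);
	}
	return (0);
}
#endif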
300 
301 /*
302  * Allocate kva for the pipe circular buffer; the space is pageable.
303  * This routine will 'realloc' the size of a pipe safely; if it fails,
304  * it will retain the old buffer
305  * and return ENOMEM.
306  */
307 static int
308 pipespace(cpipe, size)
309 	struct pipe *cpipe;
310 	int size;
311 {
312 	caddr_t buffer;
313 	int error;
314 	static int curfail = 0;
315 	static struct timeval lastfail;
316 
317 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
318 	       ("pipespace: pipe mutex locked"));
319 
320 	size = round_page(size);
321 	/*
322 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
323 	 */
324 	buffer = (caddr_t) vm_map_min(pipe_map);
325 
326 	/*
327 	 * The map entry is, by default, pageable.
328 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
329 	 */
330 	error = vm_map_find(pipe_map, NULL, 0,
331 		(vm_offset_t *) &buffer, size, 1,
332 		VM_PROT_ALL, VM_PROT_ALL, 0);
333 	if (error != KERN_SUCCESS) {
334 		if (ppsratecheck(&lastfail, &curfail, 1))
335 			printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
336 		return (ENOMEM);
337 	}
338 
339 	/* free old resources if we're resizing */
340 	pipe_free_kmem(cpipe);
341 	cpipe->pipe_buffer.buffer = buffer;
342 	cpipe->pipe_buffer.size = size;
343 	cpipe->pipe_buffer.in = 0;
344 	cpipe->pipe_buffer.out = 0;
345 	cpipe->pipe_buffer.cnt = 0;
346 	atomic_add_int(&amountpipes, 1);
347 	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
348 	return (0);
349 }
350 
351 /*
352  * initialize and allocate VM and memory for pipe
353  */
354 static int
355 pipe_create(cpipep)
356 	struct pipe **cpipep;
357 {
358 	struct pipe *cpipe;
359 	int error;
360 
361 	*cpipep = uma_zalloc(pipe_zone, M_WAITOK);
362 	if (*cpipep == NULL)
363 		return (ENOMEM);
364 
365 	cpipe = *cpipep;
366 
367 	/*
368 	 * protect so pipeclose() doesn't follow a junk pointer
369 	 * if pipespace() fails.
370 	 */
371 	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
372 	cpipe->pipe_state = 0;
373 	cpipe->pipe_peer = NULL;
374 	cpipe->pipe_busy = 0;
375 
376 #ifndef PIPE_NODIRECT
377 	/*
378 	 * pipe data structure initializations to support direct pipe I/O
379 	 */
380 	cpipe->pipe_map.cnt = 0;
381 	cpipe->pipe_map.kva = 0;
382 	cpipe->pipe_map.pos = 0;
383 	cpipe->pipe_map.npages = 0;
384 	/* cpipe->pipe_map.ms[] = invalid */
385 #endif
386 
387 	cpipe->pipe_mtxp = NULL;	/* avoid pipespace assertion */
388 	/*
389 	 * Reduce to 1/4th pipe size if we're over our global max.
390 	 */
391 	if (amountpipekva > maxpipekva / 2)
392 		error = pipespace(cpipe, SMALL_PIPE_SIZE);
393 	else
394 		error = pipespace(cpipe, PIPE_SIZE);
395 	if (error)
396 		return (error);
397 
398 	vfs_timestamp(&cpipe->pipe_ctime);
399 	cpipe->pipe_atime = cpipe->pipe_ctime;
400 	cpipe->pipe_mtime = cpipe->pipe_ctime;
401 
402 	return (0);
403 }
404 
405 
406 /*
407  * lock a pipe for I/O, blocking other access
408  */
409 static __inline int
410 pipelock(cpipe, catch)
411 	struct pipe *cpipe;
412 	int catch;
413 {
414 	int error;
415 
416 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
417 	while (cpipe->pipe_state & PIPE_LOCKFL) {
418 		cpipe->pipe_state |= PIPE_LWANT;
419 		error = msleep(cpipe, PIPE_MTX(cpipe),
420 		    catch ? (PRIBIO | PCATCH) : PRIBIO,
421 		    "pipelk", 0);
422 		if (error != 0)
423 			return (error);
424 	}
425 	cpipe->pipe_state |= PIPE_LOCKFL;
426 	return (0);
427 }
428 
429 /*
430  * unlock a pipe I/O lock
431  */
432 static __inline void
433 pipeunlock(cpipe)
434 	struct pipe *cpipe;
435 {
436 
437 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
438 	cpipe->pipe_state &= ~PIPE_LOCKFL;
439 	if (cpipe->pipe_state & PIPE_LWANT) {
440 		cpipe->pipe_state &= ~PIPE_LWANT;
441 		wakeup(cpipe);
442 	}
443 }
444 
445 static __inline void
446 pipeselwakeup(cpipe)
447 	struct pipe *cpipe;
448 {
449 
450 	if (cpipe->pipe_state & PIPE_SEL) {
451 		cpipe->pipe_state &= ~PIPE_SEL;
452 		selwakeuppri(&cpipe->pipe_sel, PSOCK);
453 	}
454 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
455 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
456 	KNOTE(&cpipe->pipe_sel.si_note, 0);
457 }
458 
459 /* ARGSUSED */
460 static int
461 pipe_read(fp, uio, active_cred, flags, td)
462 	struct file *fp;
463 	struct uio *uio;
464 	struct ucred *active_cred;
465 	struct thread *td;
466 	int flags;
467 {
468 	struct pipe *rpipe = fp->f_data;
469 	int error;
470 	int nread = 0;
471 	u_int size;
472 
473 	PIPE_LOCK(rpipe);
474 	++rpipe->pipe_busy;
475 	error = pipelock(rpipe, 1);
476 	if (error)
477 		goto unlocked_error;
478 
479 #ifdef MAC
480 	error = mac_check_pipe_read(active_cred, rpipe);
481 	if (error)
482 		goto locked_error;
483 #endif
484 
485 	while (uio->uio_resid) {
486 		/*
487 		 * normal pipe buffer receive
488 		 */
489 		if (rpipe->pipe_buffer.cnt > 0) {
490 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
491 			if (size > rpipe->pipe_buffer.cnt)
492 				size = rpipe->pipe_buffer.cnt;
493 			if (size > (u_int) uio->uio_resid)
494 				size = (u_int) uio->uio_resid;
495 
496 			PIPE_UNLOCK(rpipe);
497 			error = uiomove(
498 			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
499 			    size, uio);
500 			PIPE_LOCK(rpipe);
501 			if (error)
502 				break;
503 
504 			rpipe->pipe_buffer.out += size;
505 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
506 				rpipe->pipe_buffer.out = 0;
507 
508 			rpipe->pipe_buffer.cnt -= size;
509 
510 			/*
511 			 * If there is no more to read in the pipe, reset
512 			 * its pointers to the beginning.  This improves
513 			 * cache hit stats.
514 			 */
515 			if (rpipe->pipe_buffer.cnt == 0) {
516 				rpipe->pipe_buffer.in = 0;
517 				rpipe->pipe_buffer.out = 0;
518 			}
519 			nread += size;
520 #ifndef PIPE_NODIRECT
521 		/*
522 		 * Direct copy, bypassing a kernel buffer.
523 		 */
524 		} else if ((size = rpipe->pipe_map.cnt) &&
525 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
526 			caddr_t	va;
527 			if (size > (u_int) uio->uio_resid)
528 				size = (u_int) uio->uio_resid;
529 
530 			va = (caddr_t) rpipe->pipe_map.kva +
531 			    rpipe->pipe_map.pos;
532 			PIPE_UNLOCK(rpipe);
533 			error = uiomove(va, size, uio);
534 			PIPE_LOCK(rpipe);
535 			if (error)
536 				break;
537 			nread += size;
538 			rpipe->pipe_map.pos += size;
539 			rpipe->pipe_map.cnt -= size;
540 			if (rpipe->pipe_map.cnt == 0) {
541 				rpipe->pipe_state &= ~PIPE_DIRECTW;
542 				wakeup(rpipe);
543 			}
544 #endif
545 		} else {
546 			/*
547 			 * detect EOF condition
548 			 * read returns 0 on EOF, no need to set error
549 			 */
550 			if (rpipe->pipe_state & PIPE_EOF)
551 				break;
552 
553 			/*
554 			 * If the "write-side" has been blocked, wake it up now.
555 			 */
556 			if (rpipe->pipe_state & PIPE_WANTW) {
557 				rpipe->pipe_state &= ~PIPE_WANTW;
558 				wakeup(rpipe);
559 			}
560 
561 			/*
562 			 * Break if some data was read.
563 			 */
564 			if (nread > 0)
565 				break;
566 
567 			/*
568 			 * Unlock the pipe buffer for our remaining processing.
569 			 * We will either break out with an error or we will
570 			 * sleep and relock to loop.
571 			 */
572 			pipeunlock(rpipe);
573 
574 			/*
575 			 * Handle non-blocking mode operation or
576 			 * wait for more data.
577 			 */
578 			if (fp->f_flag & FNONBLOCK) {
579 				error = EAGAIN;
580 			} else {
581 				rpipe->pipe_state |= PIPE_WANTR;
582 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
583 				    PRIBIO | PCATCH,
584 				    "piperd", 0)) == 0)
585 					error = pipelock(rpipe, 1);
586 			}
587 			if (error)
588 				goto unlocked_error;
589 		}
590 	}
591 #ifdef MAC
592 locked_error:
593 #endif
594 	pipeunlock(rpipe);
595 
596 	/* XXX: should probably do this before getting any locks. */
597 	if (error == 0)
598 		vfs_timestamp(&rpipe->pipe_atime);
599 unlocked_error:
600 	--rpipe->pipe_busy;
601 
602 	/*
603 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
604 	 */
605 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
606 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
607 		wakeup(rpipe);
608 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
609 		/*
610 		 * Handle write blocking hysteresis.
611 		 */
612 		if (rpipe->pipe_state & PIPE_WANTW) {
613 			rpipe->pipe_state &= ~PIPE_WANTW;
614 			wakeup(rpipe);
615 		}
616 	}
617 
618 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
619 		pipeselwakeup(rpipe);
620 
621 	PIPE_UNLOCK(rpipe);
622 	return (error);
623 }
624 
625 #ifndef PIPE_NODIRECT
626 /*
627  * Map the sending process's buffer into kernel space and wire it.
628  * This is similar to a physical write operation.
629  */
630 static int
631 pipe_build_write_buffer(wpipe, uio)
632 	struct pipe *wpipe;
633 	struct uio *uio;
634 {
635 	pmap_t pmap;
636 	u_int size;
637 	int i, j;
638 	vm_offset_t addr, endaddr;
639 
640 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
641 
642 	size = (u_int) uio->uio_iov->iov_len;
643 	if (size > wpipe->pipe_buffer.size)
644 		size = wpipe->pipe_buffer.size;
645 
646 	pmap = vmspace_pmap(curproc->p_vmspace);
647 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
648 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
649 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
650 		/*
651 		 * vm_fault_quick() can sleep.  Consequently,
652 		 * vm_page_lock_queue() and vm_page_unlock_queue()
653 		 * should not be performed outside of this loop.
654 		 */
655 	race:
656 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
657 			vm_page_lock_queues();
658 			for (j = 0; j < i; j++)
659 				vm_page_unhold(wpipe->pipe_map.ms[j]);
660 			vm_page_unlock_queues();
661 			return (EFAULT);
662 		}
663 		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
664 		    VM_PROT_READ);
665 		if (wpipe->pipe_map.ms[i] == NULL)
666 			goto race;
667 	}
668 
669 /*
670  * set up the control block
671  */
672 	wpipe->pipe_map.npages = i;
673 	wpipe->pipe_map.pos =
674 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
675 	wpipe->pipe_map.cnt = size;
676 
677 /*
678  * and map the buffer
679  */
680 	if (wpipe->pipe_map.kva == 0) {
681 		/*
682 		 * We need to allocate space for an extra page because the
683 		 * address range spans an extra page when it is not page aligned.
684 		 */
685 		wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
686 			wpipe->pipe_buffer.size + PAGE_SIZE);
687 		atomic_add_int(&amountpipekvawired,
688 		    wpipe->pipe_buffer.size + PAGE_SIZE);
689 	}
690 	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
691 		wpipe->pipe_map.npages);
692 
693 /*
694  * and update the uio data
695  */
696 
697 	uio->uio_iov->iov_len -= size;
698 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
699 	if (uio->uio_iov->iov_len == 0)
700 		uio->uio_iov++;
701 	uio->uio_resid -= size;
702 	uio->uio_offset += size;
703 	return (0);
704 }
705 
706 /*
707  * unmap and unwire the process buffer
708  */
709 static void
710 pipe_destroy_write_buffer(wpipe)
711 	struct pipe *wpipe;
712 {
713 	int i;
714 
715 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
716 	if (wpipe->pipe_map.kva) {
717 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
718 
719 		if (amountpipekvawired > maxpipekvawired / 2) {
720 			/* Conserve address space */
721 			vm_offset_t kva = wpipe->pipe_map.kva;
722 			wpipe->pipe_map.kva = 0;
723 			kmem_free(kernel_map, kva,
724 			    wpipe->pipe_buffer.size + PAGE_SIZE);
725 			atomic_subtract_int(&amountpipekvawired,
726 			    wpipe->pipe_buffer.size + PAGE_SIZE);
727 		}
728 	}
729 	vm_page_lock_queues();
730 	for (i = 0; i < wpipe->pipe_map.npages; i++) {
731 		vm_page_unhold(wpipe->pipe_map.ms[i]);
732 	}
733 	vm_page_unlock_queues();
734 	wpipe->pipe_map.npages = 0;
735 }
736 
737 /*
738  * In the case of a signal, the writing process might go away.  This
739  * code copies the data into the circular buffer so that the source
740  * pages can be freed without loss of data.
741  */
742 static void
743 pipe_clone_write_buffer(wpipe)
744 	struct pipe *wpipe;
745 {
746 	int size;
747 	int pos;
748 
749 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
750 	size = wpipe->pipe_map.cnt;
751 	pos = wpipe->pipe_map.pos;
752 
753 	wpipe->pipe_buffer.in = size;
754 	wpipe->pipe_buffer.out = 0;
755 	wpipe->pipe_buffer.cnt = size;
756 	wpipe->pipe_state &= ~PIPE_DIRECTW;
757 
758 	PIPE_UNLOCK(wpipe);
759 	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
760 	    wpipe->pipe_buffer.buffer, size);
761 	pipe_destroy_write_buffer(wpipe);
762 	PIPE_LOCK(wpipe);
763 }
764 
765 /*
766  * This implements the pipe buffer write mechanism.  Note that only
767  * a direct write OR a normal pipe write can be pending at any given time.
768  * If there are any characters in the pipe buffer, the direct write will
769  * be deferred until the receiving process grabs all of the bytes from
770  * the pipe buffer.  Then the direct mapping write is set-up.
771  */
772 static int
773 pipe_direct_write(wpipe, uio)
774 	struct pipe *wpipe;
775 	struct uio *uio;
776 {
777 	int error;
778 
779 retry:
780 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
781 	while (wpipe->pipe_state & PIPE_DIRECTW) {
782 		if (wpipe->pipe_state & PIPE_WANTR) {
783 			wpipe->pipe_state &= ~PIPE_WANTR;
784 			wakeup(wpipe);
785 		}
786 		wpipe->pipe_state |= PIPE_WANTW;
787 		error = msleep(wpipe, PIPE_MTX(wpipe),
788 		    PRIBIO | PCATCH, "pipdww", 0);
789 		if (error)
790 			goto error1;
791 		if (wpipe->pipe_state & PIPE_EOF) {
792 			error = EPIPE;
793 			goto error1;
794 		}
795 	}
796 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
797 	if (wpipe->pipe_buffer.cnt > 0) {
798 		if (wpipe->pipe_state & PIPE_WANTR) {
799 			wpipe->pipe_state &= ~PIPE_WANTR;
800 			wakeup(wpipe);
801 		}
802 
803 		wpipe->pipe_state |= PIPE_WANTW;
804 		error = msleep(wpipe, PIPE_MTX(wpipe),
805 		    PRIBIO | PCATCH, "pipdwc", 0);
806 		if (error)
807 			goto error1;
808 		if (wpipe->pipe_state & PIPE_EOF) {
809 			error = EPIPE;
810 			goto error1;
811 		}
812 		goto retry;
813 	}
814 
815 	wpipe->pipe_state |= PIPE_DIRECTW;
816 
817 	pipelock(wpipe, 0);
818 	PIPE_UNLOCK(wpipe);
819 	error = pipe_build_write_buffer(wpipe, uio);
820 	PIPE_LOCK(wpipe);
821 	pipeunlock(wpipe);
822 	if (error) {
823 		wpipe->pipe_state &= ~PIPE_DIRECTW;
824 		goto error1;
825 	}
826 
827 	error = 0;
828 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
829 		if (wpipe->pipe_state & PIPE_EOF) {
830 			pipelock(wpipe, 0);
831 			PIPE_UNLOCK(wpipe);
832 			pipe_destroy_write_buffer(wpipe);
833 			PIPE_LOCK(wpipe);
834 			pipeselwakeup(wpipe);
835 			pipeunlock(wpipe);
836 			error = EPIPE;
837 			goto error1;
838 		}
839 		if (wpipe->pipe_state & PIPE_WANTR) {
840 			wpipe->pipe_state &= ~PIPE_WANTR;
841 			wakeup(wpipe);
842 		}
843 		pipeselwakeup(wpipe);
844 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
845 		    "pipdwt", 0);
846 	}
847 
848 	pipelock(wpipe,0);
849 	if (wpipe->pipe_state & PIPE_DIRECTW) {
850 		/*
851 		 * this bit of trickery substitutes a kernel buffer for
852 		 * the process that might be going away.
853 		 */
854 		pipe_clone_write_buffer(wpipe);
855 	} else {
856 		PIPE_UNLOCK(wpipe);
857 		pipe_destroy_write_buffer(wpipe);
858 		PIPE_LOCK(wpipe);
859 	}
860 	pipeunlock(wpipe);
861 	return (error);
862 
863 error1:
864 	wakeup(wpipe);
865 	return (error);
866 }
867 #endif
868 
869 static int
870 pipe_write(fp, uio, active_cred, flags, td)
871 	struct file *fp;
872 	struct uio *uio;
873 	struct ucred *active_cred;
874 	struct thread *td;
875 	int flags;
876 {
877 	int error = 0;
878 	int orig_resid;
879 	struct pipe *wpipe, *rpipe;
880 
881 	rpipe = fp->f_data;
882 	wpipe = rpipe->pipe_peer;
883 
884 	PIPE_LOCK(rpipe);
885 	/*
886 	 * detect loss of pipe read side, issue SIGPIPE if lost.
887 	 */
888 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
889 		PIPE_UNLOCK(rpipe);
890 		return (EPIPE);
891 	}
892 #ifdef MAC
893 	error = mac_check_pipe_write(active_cred, wpipe);
894 	if (error) {
895 		PIPE_UNLOCK(rpipe);
896 		return (error);
897 	}
898 #endif
899 	++wpipe->pipe_busy;
900 
901 	/*
902 	 * If it is advantageous to resize the pipe buffer, do
903 	 * so.
904 	 */
905 	if ((uio->uio_resid > PIPE_SIZE) &&
906 		(amountpipekva < maxpipekva / 2) &&
907 		(nbigpipe < LIMITBIGPIPES) &&
908 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
909 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
910 		(wpipe->pipe_buffer.cnt == 0)) {
911 
912 		if ((error = pipelock(wpipe, 1)) == 0) {
913 			PIPE_UNLOCK(wpipe);
914 			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
915 				atomic_add_int(&nbigpipe, 1);
916 			PIPE_LOCK(wpipe);
917 			pipeunlock(wpipe);
918 		}
919 	}
920 
921 	/*
922 	 * If an early error occurred, unbusy and return, waking up any pending
923 	 * readers.
924 	 */
925 	if (error) {
926 		--wpipe->pipe_busy;
927 		if ((wpipe->pipe_busy == 0) &&
928 		    (wpipe->pipe_state & PIPE_WANT)) {
929 			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
930 			wakeup(wpipe);
931 		}
932 		PIPE_UNLOCK(rpipe);
933 		return(error);
934 	}
935 
936 	orig_resid = uio->uio_resid;
937 
938 	while (uio->uio_resid) {
939 		int space;
940 
941 #ifndef PIPE_NODIRECT
942 		/*
943 		 * If the transfer is large, we can gain performance if
944 		 * we do process-to-process copies directly.
945 		 * If the write is non-blocking, we don't use the
946 		 * direct write mechanism.
947 		 *
948 		 * The direct write mechanism will detect the reader going
949 		 * away on us.
950 		 */
951 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
952 		    (fp->f_flag & FNONBLOCK) == 0 &&
953 		    amountpipekvawired + uio->uio_resid < maxpipekvawired) {
954 			error = pipe_direct_write(wpipe, uio);
955 			if (error)
956 				break;
957 			continue;
958 		}
959 #endif
960 
961 		/*
962 		 * Pipe buffered writes cannot be coincident with
963 		 * direct writes.  We wait until the currently executing
964 		 * direct write is completed before we start filling the
965 		 * pipe buffer.  We break out if a signal occurs or the
966 		 * reader goes away.
967 		 */
968 	retrywrite:
969 		while (wpipe->pipe_state & PIPE_DIRECTW) {
970 			if (wpipe->pipe_state & PIPE_WANTR) {
971 				wpipe->pipe_state &= ~PIPE_WANTR;
972 				wakeup(wpipe);
973 			}
974 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
975 			    "pipbww", 0);
976 			if (wpipe->pipe_state & PIPE_EOF)
977 				break;
978 			if (error)
979 				break;
980 		}
981 		if (wpipe->pipe_state & PIPE_EOF) {
982 			error = EPIPE;
983 			break;
984 		}
985 
986 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
987 
988 		/* Writes of size <= PIPE_BUF must be atomic. */
989 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
990 			space = 0;
991 
992 		if (space > 0) {
993 			if ((error = pipelock(wpipe,1)) == 0) {
994 				int size;	/* Transfer size */
995 				int segsize;	/* first segment to transfer */
996 
997 				/*
998 				 * It is possible for a direct write to
999 				 * slip in on us... handle it here...
1000 				 */
1001 				if (wpipe->pipe_state & PIPE_DIRECTW) {
1002 					pipeunlock(wpipe);
1003 					goto retrywrite;
1004 				}
1005 				/*
1006 				 * If a process blocked in uiomove, our
1007 				 * value for space might be bad.
1008 				 *
1009 				 * XXX will we be ok if the reader has gone
1010 				 * away here?
1011 				 */
1012 				if (space > wpipe->pipe_buffer.size -
1013 				    wpipe->pipe_buffer.cnt) {
1014 					pipeunlock(wpipe);
1015 					goto retrywrite;
1016 				}
1017 
1018 				/*
1019 				 * Transfer size is minimum of uio transfer
1020 				 * and free space in pipe buffer.
1021 				 */
1022 				if (space > uio->uio_resid)
1023 					size = uio->uio_resid;
1024 				else
1025 					size = space;
1026 				/*
1027 				 * First segment to transfer is minimum of
1028 				 * transfer size and contiguous space in
1029 				 * pipe buffer.  If first segment to transfer
1030 				 * is less than the transfer size, we've got
1031 				 * a wraparound in the buffer.
1032 				 */
1033 				segsize = wpipe->pipe_buffer.size -
1034 					wpipe->pipe_buffer.in;
1035 				if (segsize > size)
1036 					segsize = size;
1037 
1038 				/* Transfer first segment */
1039 
1040 				PIPE_UNLOCK(rpipe);
1041 				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1042 						segsize, uio);
1043 				PIPE_LOCK(rpipe);
1044 
1045 				if (error == 0 && segsize < size) {
1046 					/*
1047 					 * Transfer remaining part now, to
1048 					 * support atomic writes.  Wraparound
1049 					 * happened.
1050 					 */
1051 					if (wpipe->pipe_buffer.in + segsize !=
1052 					    wpipe->pipe_buffer.size)
1053 						panic("Expected pipe buffer "
1054 						    "wraparound disappeared");
1055 
1056 					PIPE_UNLOCK(rpipe);
1057 					error = uiomove(
1058 					    &wpipe->pipe_buffer.buffer[0],
1059 					    size - segsize, uio);
1060 					PIPE_LOCK(rpipe);
1061 				}
1062 				if (error == 0) {
1063 					wpipe->pipe_buffer.in += size;
1064 					if (wpipe->pipe_buffer.in >=
1065 					    wpipe->pipe_buffer.size) {
1066 						if (wpipe->pipe_buffer.in !=
1067 						    size - segsize +
1068 						    wpipe->pipe_buffer.size)
1069 							panic("Expected "
1070 							    "wraparound bad");
1071 						wpipe->pipe_buffer.in = size -
1072 						    segsize;
1073 					}
1074 
1075 					wpipe->pipe_buffer.cnt += size;
1076 					if (wpipe->pipe_buffer.cnt >
1077 					    wpipe->pipe_buffer.size)
1078 						panic("Pipe buffer overflow");
1079 
1080 				}
1081 				pipeunlock(wpipe);
1082 			}
1083 			if (error)
1084 				break;
1085 
1086 		} else {
1087 			/*
1088 			 * If the "read-side" has been blocked, wake it up now.
1089 			 */
1090 			if (wpipe->pipe_state & PIPE_WANTR) {
1091 				wpipe->pipe_state &= ~PIPE_WANTR;
1092 				wakeup(wpipe);
1093 			}
1094 
1095 			/*
1096 			 * don't block on non-blocking I/O
1097 			 */
1098 			if (fp->f_flag & FNONBLOCK) {
1099 				error = EAGAIN;
1100 				break;
1101 			}
1102 
1103 			/*
1104 			 * We have no more space and have something to offer,
1105 			 * wake up select/poll.
1106 			 */
1107 			pipeselwakeup(wpipe);
1108 
1109 			wpipe->pipe_state |= PIPE_WANTW;
1110 			error = msleep(wpipe, PIPE_MTX(rpipe),
1111 			    PRIBIO | PCATCH, "pipewr", 0);
1112 			if (error != 0)
1113 				break;
1114 			/*
1115 			 * If read side wants to go away, we just issue a signal
1116 			 * to ourselves.
1117 			 */
1118 			if (wpipe->pipe_state & PIPE_EOF) {
1119 				error = EPIPE;
1120 				break;
1121 			}
1122 		}
1123 	}
1124 
1125 	--wpipe->pipe_busy;
1126 
1127 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1128 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1129 		wakeup(wpipe);
1130 	} else if (wpipe->pipe_buffer.cnt > 0) {
1131 		/*
1132 		 * If we have put any characters in the buffer, we wake up
1133 		 * the reader.
1134 		 */
1135 		if (wpipe->pipe_state & PIPE_WANTR) {
1136 			wpipe->pipe_state &= ~PIPE_WANTR;
1137 			wakeup(wpipe);
1138 		}
1139 	}
1140 
1141 	/*
1142 	 * Don't return EPIPE if I/O was successful
1143 	 */
1144 	if ((wpipe->pipe_buffer.cnt == 0) &&
1145 	    (uio->uio_resid == 0) &&
1146 	    (error == EPIPE)) {
1147 		error = 0;
1148 	}
1149 
1150 	if (error == 0)
1151 		vfs_timestamp(&wpipe->pipe_mtime);
1152 
1153 	/*
1154 	 * We have something to offer,
1155 	 * wake up select/poll.
1156 	 */
1157 	if (wpipe->pipe_buffer.cnt)
1158 		pipeselwakeup(wpipe);
1159 
1160 	PIPE_UNLOCK(rpipe);
1161 	return (error);
1162 }
1163 
1164 /*
1165  * we implement a very minimal set of ioctls for compatibility with sockets.
1166  */
1167 static int
1168 pipe_ioctl(fp, cmd, data, active_cred, td)
1169 	struct file *fp;
1170 	u_long cmd;
1171 	void *data;
1172 	struct ucred *active_cred;
1173 	struct thread *td;
1174 {
1175 	struct pipe *mpipe = fp->f_data;
1176 #ifdef MAC
1177 	int error;
1178 #endif
1179 
1180 	PIPE_LOCK(mpipe);
1181 
1182 #ifdef MAC
1183 	error = mac_check_pipe_ioctl(active_cred, mpipe, cmd, data);
1184 	if (error) {
1185 		PIPE_UNLOCK(mpipe);
1186 		return (error);
1187 	}
1188 #endif
1189 
1190 	switch (cmd) {
1191 
1192 	case FIONBIO:
1193 		PIPE_UNLOCK(mpipe);
1194 		return (0);
1195 
1196 	case FIOASYNC:
1197 		if (*(int *)data) {
1198 			mpipe->pipe_state |= PIPE_ASYNC;
1199 		} else {
1200 			mpipe->pipe_state &= ~PIPE_ASYNC;
1201 		}
1202 		PIPE_UNLOCK(mpipe);
1203 		return (0);
1204 
1205 	case FIONREAD:
1206 		if (mpipe->pipe_state & PIPE_DIRECTW)
1207 			*(int *)data = mpipe->pipe_map.cnt;
1208 		else
1209 			*(int *)data = mpipe->pipe_buffer.cnt;
1210 		PIPE_UNLOCK(mpipe);
1211 		return (0);
1212 
1213 	case FIOSETOWN:
1214 		PIPE_UNLOCK(mpipe);
1215 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1216 
1217 	case FIOGETOWN:
1218 		PIPE_UNLOCK(mpipe);
1219 		*(int *)data = fgetown(&mpipe->pipe_sigio);
1220 		return (0);
1221 
1222 	/* This is deprecated, FIOSETOWN should be used instead. */
1223 	case TIOCSPGRP:
1224 		PIPE_UNLOCK(mpipe);
1225 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1226 
1227 	/* This is deprecated, FIOGETOWN should be used instead. */
1228 	case TIOCGPGRP:
1229 		PIPE_UNLOCK(mpipe);
1230 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1231 		return (0);
1232 
1233 	}
1234 	PIPE_UNLOCK(mpipe);
1235 	return (ENOTTY);
1236 }
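
/*
 * Illustrative userland sketch (editorial example, not part of this file):
 * FIONREAD reports the bytes currently buffered (or pending in a direct
 * write), matching the two cases handled above.
 */
#if 0
#include <sys/ioctl.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd[2], nbytes;

	if (pipe(fd) == -1)
		err(1, "pipe");
	write(fd[1], "abc", 3);
	if (ioctl(fd[0], FIONREAD, &nbytes) == -1)
		err(1, "ioctl");
	printf("FIONREAD reports %d byte(s)\n", nbytes);	/* 3 */
	return (0);
}
#endif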
1237 
1238 static int
1239 pipe_poll(fp, events, active_cred, td)
1240 	struct file *fp;
1241 	int events;
1242 	struct ucred *active_cred;
1243 	struct thread *td;
1244 {
1245 	struct pipe *rpipe = fp->f_data;
1246 	struct pipe *wpipe;
1247 	int revents = 0;
1248 #ifdef MAC
1249 	int error;
1250 #endif
1251 
1252 	wpipe = rpipe->pipe_peer;
1253 	PIPE_LOCK(rpipe);
1254 #ifdef MAC
1255 	error = mac_check_pipe_poll(active_cred, rpipe);
1256 	if (error)
1257 		goto locked_error;
1258 #endif
1259 	if (events & (POLLIN | POLLRDNORM))
1260 		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1261 		    (rpipe->pipe_buffer.cnt > 0) ||
1262 		    (rpipe->pipe_state & PIPE_EOF))
1263 			revents |= events & (POLLIN | POLLRDNORM);
1264 
1265 	if (events & (POLLOUT | POLLWRNORM))
1266 		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
1267 		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1268 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1269 			revents |= events & (POLLOUT | POLLWRNORM);
1270 
1271 	if ((rpipe->pipe_state & PIPE_EOF) ||
1272 	    (wpipe == NULL) ||
1273 	    (wpipe->pipe_state & PIPE_EOF))
1274 		revents |= POLLHUP;
1275 
1276 	if (revents == 0) {
1277 		if (events & (POLLIN | POLLRDNORM)) {
1278 			selrecord(td, &rpipe->pipe_sel);
1279 			rpipe->pipe_state |= PIPE_SEL;
1280 		}
1281 
1282 		if (events & (POLLOUT | POLLWRNORM)) {
1283 			selrecord(td, &wpipe->pipe_sel);
1284 			wpipe->pipe_state |= PIPE_SEL;
1285 		}
1286 	}
1287 #ifdef MAC
1288 locked_error:
1289 #endif
1290 	PIPE_UNLOCK(rpipe);
1291 
1292 	return (revents);
1293 }
1294 
1295 /*
1296  * We shouldn't need locks here as we're doing a read and this should
1297  * be a natural race.
1298  */
1299 static int
1300 pipe_stat(fp, ub, active_cred, td)
1301 	struct file *fp;
1302 	struct stat *ub;
1303 	struct ucred *active_cred;
1304 	struct thread *td;
1305 {
1306 	struct pipe *pipe = fp->f_data;
1307 #ifdef MAC
1308 	int error;
1309 
1310 	PIPE_LOCK(pipe);
1311 	error = mac_check_pipe_stat(active_cred, pipe);
1312 	PIPE_UNLOCK(pipe);
1313 	if (error)
1314 		return (error);
1315 #endif
1316 	bzero(ub, sizeof(*ub));
1317 	ub->st_mode = S_IFIFO;
1318 	ub->st_blksize = pipe->pipe_buffer.size;
1319 	ub->st_size = pipe->pipe_buffer.cnt;
1320 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1321 	ub->st_atimespec = pipe->pipe_atime;
1322 	ub->st_mtimespec = pipe->pipe_mtime;
1323 	ub->st_ctimespec = pipe->pipe_ctime;
1324 	ub->st_uid = fp->f_cred->cr_uid;
1325 	ub->st_gid = fp->f_cred->cr_gid;
1326 	/*
1327 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1328 	 * XXX (st_dev, st_ino) should be unique.
1329 	 */
1330 	return (0);
1331 }
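
/*
 * Illustrative userland sketch (editorial example, not part of this file):
 * fstat(2) on a pipe descriptor reports S_IFIFO, with st_size equal to
 * the bytes currently buffered, per the fields filled in above.
 */
#if 0
#include <sys/stat.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct stat sb;
	int fd[2];

	if (pipe(fd) == -1)
		err(1, "pipe");
	write(fd[1], "abcd", 4);
	if (fstat(fd[0], &sb) == -1)
		err(1, "fstat");
	printf("fifo %d, size %jd, blksize %jd\n", S_ISFIFO(sb.st_mode),
	    (intmax_t)sb.st_size, (intmax_t)sb.st_blksize);
	return (0);
}
#endif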
1332 
1333 /* ARGSUSED */
1334 static int
1335 pipe_close(fp, td)
1336 	struct file *fp;
1337 	struct thread *td;
1338 {
1339 	struct pipe *cpipe = fp->f_data;
1340 
1341 	fp->f_ops = &badfileops;
1342 	fp->f_data = NULL;
1343 	funsetown(&cpipe->pipe_sigio);
1344 	pipeclose(cpipe);
1345 	return (0);
1346 }
1347 
1348 static void
1349 pipe_free_kmem(cpipe)
1350 	struct pipe *cpipe;
1351 {
1352 
1353 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
1354 	       ("pipespace: pipe mutex locked"));
1355 
1356 	if (cpipe->pipe_buffer.buffer != NULL) {
1357 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1358 			atomic_subtract_int(&nbigpipe, 1);
1359 		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1360 		atomic_subtract_int(&amountpipes, 1);
1361 		vm_map_remove(pipe_map,
1362 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1363 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1364 		cpipe->pipe_buffer.buffer = NULL;
1365 	}
1366 #ifndef PIPE_NODIRECT
1367 	if (cpipe->pipe_map.kva != 0) {
1368 		atomic_subtract_int(&amountpipekvawired,
1369 		    cpipe->pipe_buffer.size + PAGE_SIZE);
1370 		kmem_free(kernel_map,
1371 			cpipe->pipe_map.kva,
1372 			cpipe->pipe_buffer.size + PAGE_SIZE);
1373 		cpipe->pipe_map.cnt = 0;
1374 		cpipe->pipe_map.kva = 0;
1375 		cpipe->pipe_map.pos = 0;
1376 		cpipe->pipe_map.npages = 0;
1377 	}
1378 #endif
1379 }
1380 
1381 /*
1382  * shutdown the pipe
1383  */
1384 static void
1385 pipeclose(cpipe)
1386 	struct pipe *cpipe;
1387 {
1388 	struct pipe *ppipe;
1389 	int hadpeer;
1390 
1391 	if (cpipe == NULL)
1392 		return;
1393 
1394 	hadpeer = 0;
1395 
1396 	/* partially created pipes won't have a valid mutex. */
1397 	if (PIPE_MTX(cpipe) != NULL)
1398 		PIPE_LOCK(cpipe);
1399 
1400 	pipeselwakeup(cpipe);
1401 
1402 	/*
1403 	 * If the other side is blocked, wake it up saying that
1404 	 * we want to close it down.
1405 	 */
1406 	while (cpipe->pipe_busy) {
1407 		wakeup(cpipe);
1408 		cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
1409 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1410 	}
1411 
1412 #ifdef MAC
1413 	if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
1414 		mac_destroy_pipe(cpipe);
1415 #endif
1416 
1417 	/*
1418 	 * Disconnect from peer
1419 	 */
1420 	if ((ppipe = cpipe->pipe_peer) != NULL) {
1421 		hadpeer++;
1422 		pipeselwakeup(ppipe);
1423 
1424 		ppipe->pipe_state |= PIPE_EOF;
1425 		wakeup(ppipe);
1426 		KNOTE(&ppipe->pipe_sel.si_note, 0);
1427 		ppipe->pipe_peer = NULL;
1428 	}
1429 	/*
1430 	 * free resources
1431 	 */
1432 	if (PIPE_MTX(cpipe) != NULL) {
1433 		PIPE_UNLOCK(cpipe);
1434 		if (!hadpeer) {
1435 			mtx_destroy(PIPE_MTX(cpipe));
1436 			free(PIPE_MTX(cpipe), M_TEMP);
1437 		}
1438 	}
1439 	pipe_free_kmem(cpipe);
1440 	uma_zfree(pipe_zone, cpipe);
1441 }
1442 
1443 /*ARGSUSED*/
1444 static int
1445 pipe_kqfilter(struct file *fp, struct knote *kn)
1446 {
1447 	struct pipe *cpipe;
1448 
1449 	cpipe = kn->kn_fp->f_data;
1450 	switch (kn->kn_filter) {
1451 	case EVFILT_READ:
1452 		kn->kn_fop = &pipe_rfiltops;
1453 		break;
1454 	case EVFILT_WRITE:
1455 		kn->kn_fop = &pipe_wfiltops;
1456 		cpipe = cpipe->pipe_peer;
1457 		if (cpipe == NULL)
1458 			/* other end of pipe has been closed */
1459 			return (EPIPE);
1460 		break;
1461 	default:
1462 		return (1);
1463 	}
1464 
1465 	PIPE_LOCK(cpipe);
1466 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1467 	PIPE_UNLOCK(cpipe);
1468 	return (0);
1469 }
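
/*
 * Illustrative userland sketch (editorial example, not part of this file):
 * registering EVFILT_READ on the read side attaches pipe_rfiltops above,
 * and kn_data from filt_piperead() is returned in the kevent data field.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent change, event;
	int fd[2], kq;

	if (pipe(fd) == -1 || (kq = kqueue()) == -1)
		err(1, "setup");
	EV_SET(&change, fd[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &change, 1, NULL, 0, NULL) == -1)
		err(1, "kevent register");
	write(fd[1], "x", 1);
	if (kevent(kq, NULL, 0, &event, 1, NULL) == 1)
		printf("readable, %d byte(s) pending\n", (int)event.data);
	return (0);
}
#endif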
1470 
1471 static void
1472 filt_pipedetach(struct knote *kn)
1473 {
1474 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1475 
1476 	if (kn->kn_filter == EVFILT_WRITE) {
1477 		if (cpipe->pipe_peer == NULL)
1478 			return;
1479 		cpipe = cpipe->pipe_peer;
1480 	}
1481 
1482 	PIPE_LOCK(cpipe);
1483 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1484 	PIPE_UNLOCK(cpipe);
1485 }
1486 
1487 /*ARGSUSED*/
1488 static int
1489 filt_piperead(struct knote *kn, long hint)
1490 {
1491 	struct pipe *rpipe = kn->kn_fp->f_data;
1492 	struct pipe *wpipe = rpipe->pipe_peer;
1493 
1494 	PIPE_LOCK(rpipe);
1495 	kn->kn_data = rpipe->pipe_buffer.cnt;
1496 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1497 		kn->kn_data = rpipe->pipe_map.cnt;
1498 
1499 	if ((rpipe->pipe_state & PIPE_EOF) ||
1500 	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1501 		kn->kn_flags |= EV_EOF;
1502 		PIPE_UNLOCK(rpipe);
1503 		return (1);
1504 	}
1505 	PIPE_UNLOCK(rpipe);
1506 	return (kn->kn_data > 0);
1507 }
1508 
1509 /*ARGSUSED*/
1510 static int
1511 filt_pipewrite(struct knote *kn, long hint)
1512 {
1513 	struct pipe *rpipe = kn->kn_fp->f_data;
1514 	struct pipe *wpipe = rpipe->pipe_peer;
1515 
1516 	PIPE_LOCK(rpipe);
1517 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1518 		kn->kn_data = 0;
1519 		kn->kn_flags |= EV_EOF;
1520 		PIPE_UNLOCK(rpipe);
1521 		return (1);
1522 	}
1523 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1524 	if (wpipe->pipe_state & PIPE_DIRECTW)
1525 		kn->kn_data = 0;
1526 
1527 	PIPE_UNLOCK(rpipe);
1528 	return (kn->kn_data >= PIPE_BUF);
1529 }
1530