xref: /freebsd/sys/kern/sys_pipe.c (revision 3193579b66fd7067f898dbc54bdea81a0e6f9bd0)
1 /*
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  *    are met.
18  */
19 
20 /*
21  * This file contains a high-performance replacement for the socket-based
22  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23  * all features of sockets, but does do everything that pipes normally
24  * do.
25  */
26 
27 /*
28  * This code has two modes of operation, a small write mode and a large
29  * write mode.  The small write mode acts like conventional pipes with
30  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33  * the receiving process can copy it directly from the pages in the sending
34  * process.
35  *
36  * If the sending process receives a signal, it is possible that it will
37  * go away, and certainly its address space can change, because control
38  * is returned to the user-mode side.  In that case, the pipe code
39  * arranges to copy the buffer supplied by the user process to a pageable
40  * kernel buffer, and the receiving process will grab the data from the
41  * pageable kernel buffer.  Since signals don't happen all that often,
42  * the copy operation is normally eliminated.
43  *
44  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45  * happen for small transfers so that the system will not spend all of
46  * its time context switching.
47  *
48  * In order to limit the resource use of pipes, two sysctls exist:
49  *
50  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51  * address space available to us in pipe_map.  Whenever the amount in use
52  * exceeds half of this value, all new pipes will be created with size
53  * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54  * as well.  This value is loader tunable only.
55  *
56  * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
57  * be wired in order to facilitate direct copies using page flipping.
58  * Whenever this value is exceeded, pipes will fall back to using regular
59  * copies.  This value is sysctl controllable at all times.
60  *
61  * These values are autotuned in subr_param.c.
62  *
63  * Memory usage may be monitored through the sysctls
64  * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
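 * For example, running "sysctl kern.ipc.pipekva" from userland reports the
 * current pipe KVA usage.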
65  *
66  */
67 
68 #include <sys/cdefs.h>
69 __FBSDID("$FreeBSD$");
70 
71 #include "opt_mac.h"
72 
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/fcntl.h>
76 #include <sys/file.h>
77 #include <sys/filedesc.h>
78 #include <sys/filio.h>
79 #include <sys/kernel.h>
80 #include <sys/lock.h>
81 #include <sys/mac.h>
82 #include <sys/mutex.h>
83 #include <sys/ttycom.h>
84 #include <sys/stat.h>
85 #include <sys/malloc.h>
86 #include <sys/poll.h>
87 #include <sys/selinfo.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysproto.h>
91 #include <sys/pipe.h>
92 #include <sys/proc.h>
93 #include <sys/vnode.h>
94 #include <sys/uio.h>
95 #include <sys/event.h>
96 
97 #include <vm/vm.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_object.h>
100 #include <vm/vm_kern.h>
101 #include <vm/vm_extern.h>
102 #include <vm/pmap.h>
103 #include <vm/vm_map.h>
104 #include <vm/vm_page.h>
105 #include <vm/uma.h>
106 
107 /*
108  * Use this define if you want to disable *fancy* VM things.  Expect an
109  * approx 30% decrease in transfer rate.  This could be useful for
110  * NetBSD or OpenBSD.
111  */
112 /* #define PIPE_NODIRECT */
113 
114 /*
115  * interfaces to the outside world
116  */
117 static fo_rdwr_t	pipe_read;
118 static fo_rdwr_t	pipe_write;
119 static fo_ioctl_t	pipe_ioctl;
120 static fo_poll_t	pipe_poll;
121 static fo_kqfilter_t	pipe_kqfilter;
122 static fo_stat_t	pipe_stat;
123 static fo_close_t	pipe_close;
124 
125 static struct fileops pipeops = {
126 	.fo_read = pipe_read,
127 	.fo_write = pipe_write,
128 	.fo_ioctl = pipe_ioctl,
129 	.fo_poll = pipe_poll,
130 	.fo_kqfilter = pipe_kqfilter,
131 	.fo_stat = pipe_stat,
132 	.fo_close = pipe_close,
133 	.fo_flags = DFLAG_PASSABLE
134 };
135 
136 static void	filt_pipedetach(struct knote *kn);
137 static int	filt_piperead(struct knote *kn, long hint);
138 static int	filt_pipewrite(struct knote *kn, long hint);
139 
140 static struct filterops pipe_rfiltops =
141 	{ 1, NULL, filt_pipedetach, filt_piperead };
142 static struct filterops pipe_wfiltops =
143 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
144 
145 /*
146  * Default pipe buffer size(s); this can be kind-of large now because pipe
147  * space is pageable.  The pipe code will try to maintain locality of
148  * reference for performance reasons, so small amounts of outstanding I/O
149  * will not wipe the cache.
150  */
151 #define MINPIPESIZE (PIPE_SIZE/3)
152 #define MAXPIPESIZE (2*PIPE_SIZE/3)
153 
154 /*
155  * Limit the number of "big" pipes
156  */
157 #define LIMITBIGPIPES	32
158 static int nbigpipe;
159 
160 static int amountpipes;
161 static int amountpipekva;
162 static int amountpipekvawired;
163 
164 SYSCTL_DECL(_kern_ipc);
165 
166 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
167 	   &maxpipekva, 0, "Pipe KVA limit");
168 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
169 	   &maxpipekvawired, 0, "Pipe KVA wired limit");
170 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
171 	   &amountpipes, 0, "Current # of pipes");
172 SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
173 	   &nbigpipe, 0, "Current # of big pipes");
174 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
175 	   &amountpipekva, 0, "Pipe KVA usage");
176 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
177 	   &amountpipekvawired, 0, "Pipe wired KVA usage");
178 
179 static void pipeinit(void *dummy __unused);
180 static void pipeclose(struct pipe *cpipe);
181 static void pipe_free_kmem(struct pipe *cpipe);
182 static int pipe_create(struct pipe **cpipep);
183 static __inline int pipelock(struct pipe *cpipe, int catch);
184 static __inline void pipeunlock(struct pipe *cpipe);
185 static __inline void pipeselwakeup(struct pipe *cpipe);
186 #ifndef PIPE_NODIRECT
187 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
188 static void pipe_destroy_write_buffer(struct pipe *wpipe);
189 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
190 static void pipe_clone_write_buffer(struct pipe *wpipe);
191 #endif
192 static int pipespace(struct pipe *cpipe, int size);
193 
194 static uma_zone_t pipe_zone;
195 
196 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
197 
198 static void
199 pipeinit(void *dummy __unused)
200 {
201 
202 	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL,
203 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
204 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
205 }
206 
207 /*
208  * The pipe system call for the DTYPE_PIPE type of pipes
209  */
210 
211 /* ARGSUSED */
212 int
213 pipe(td, uap)
214 	struct thread *td;
215 	struct pipe_args /* {
216 		int	dummy;
217 	} */ *uap;
218 {
219 	struct filedesc *fdp = td->td_proc->p_fd;
220 	struct file *rf, *wf;
221 	struct pipe *rpipe, *wpipe;
222 	struct mtx *pmtx;
223 	int fd, error;
224 
225 	rpipe = wpipe = NULL;
226 	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
227 		pipeclose(rpipe);
228 		pipeclose(wpipe);
229 		return (ENFILE);
230 	}
231 
232 	rpipe->pipe_state |= PIPE_DIRECTOK;
233 	wpipe->pipe_state |= PIPE_DIRECTOK;
234 
235 	error = falloc(td, &rf, &fd);
236 	if (error) {
237 		pipeclose(rpipe);
238 		pipeclose(wpipe);
239 		return (error);
240 	}
241 	/* An extra reference on `rf' has been held for us by falloc(). */
242 	td->td_retval[0] = fd;
243 
244 	/*
245 	 * Warning: once we've gotten past allocation of the fd for the
246 	 * read-side, we can only drop the read side via fdrop() in order
247 	 * to avoid races against processes which manage to dup() the read
248 	 * side while we are blocked trying to allocate the write side.
249 	 */
250 	FILE_LOCK(rf);
251 	rf->f_flag = FREAD | FWRITE;
252 	rf->f_type = DTYPE_PIPE;
253 	rf->f_data = rpipe;
254 	rf->f_ops = &pipeops;
255 	FILE_UNLOCK(rf);
256 	error = falloc(td, &wf, &fd);
257 	if (error) {
258 		FILEDESC_LOCK(fdp);
259 		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
260 			fdp->fd_ofiles[td->td_retval[0]] = NULL;
261 			FILEDESC_UNLOCK(fdp);
262 			fdrop(rf, td);
263 		} else
264 			FILEDESC_UNLOCK(fdp);
265 		fdrop(rf, td);
266 		/* rpipe has been closed by fdrop(). */
267 		pipeclose(wpipe);
268 		return (error);
269 	}
270 	/* An extra reference on `wf' has been held for us by falloc(). */
271 	FILE_LOCK(wf);
272 	wf->f_flag = FREAD | FWRITE;
273 	wf->f_type = DTYPE_PIPE;
274 	wf->f_data = wpipe;
275 	wf->f_ops = &pipeops;
276 	FILE_UNLOCK(wf);
277 	fdrop(wf, td);
278 	td->td_retval[1] = fd;
279 	rpipe->pipe_peer = wpipe;
280 	wpipe->pipe_peer = rpipe;
281 #ifdef MAC
282 	/*
283 	 * struct pipe represents a pipe endpoint.  The MAC label is shared
284 	 * between the connected endpoints.  As a result mac_init_pipe() and
285 	 * mac_create_pipe() should only be called on one of the endpoints
286 	 * after they have been connected.
287 	 */
288 	mac_init_pipe(rpipe);
289 	mac_create_pipe(td->td_ucred, rpipe);
290 #endif
291 	pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO);
292 	mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
293 	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
294 	fdrop(rf, td);
295 
296 	return (0);
297 }
298 
299 /*
300  * Allocate kva for the pipe circular buffer; the space is pageable.
301  * This routine will 'realloc' the size of a pipe safely; if it fails,
302  * it will retain the old buffer.
303  * On failure, ENOMEM is returned.
304  */
305 static int
306 pipespace(cpipe, size)
307 	struct pipe *cpipe;
308 	int size;
309 {
310 	caddr_t buffer;
311 	int error;
312 	static int curfail = 0;
313 	static struct timeval lastfail;
314 
315 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
316 	       ("pipespace: pipe mutex locked"));
317 
318 	size = round_page(size);
319 	/*
320 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
321 	 */
322 	buffer = (caddr_t) vm_map_min(pipe_map);
323 
324 	/*
325 	 * The map entry is, by default, pageable.
326 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
327 	 */
328 	error = vm_map_find(pipe_map, NULL, 0,
329 		(vm_offset_t *) &buffer, size, 1,
330 		VM_PROT_ALL, VM_PROT_ALL, 0);
331 	if (error != KERN_SUCCESS) {
332 		if (ppsratecheck(&lastfail, &curfail, 1))
333 			printf("kern.maxpipekva exceeded, please see tuning(7).\n");
334 		return (ENOMEM);
335 	}
336 
337 	/* free old resources if we're resizing */
338 	pipe_free_kmem(cpipe);
339 	cpipe->pipe_buffer.buffer = buffer;
340 	cpipe->pipe_buffer.size = size;
341 	cpipe->pipe_buffer.in = 0;
342 	cpipe->pipe_buffer.out = 0;
343 	cpipe->pipe_buffer.cnt = 0;
344 	atomic_add_int(&amountpipes, 1);
345 	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
346 	return (0);
347 }
348 
349 /*
350  * initialize and allocate VM and memory for pipe
351  */
352 static int
353 pipe_create(cpipep)
354 	struct pipe **cpipep;
355 {
356 	struct pipe *cpipe;
357 	int error;
358 
359 	*cpipep = uma_zalloc(pipe_zone, M_WAITOK);
360 	if (*cpipep == NULL)
361 		return (ENOMEM);
362 
363 	cpipe = *cpipep;
364 
365 	/*
366 	 * protect so pipeclose() doesn't follow a junk pointer
367 	 * if pipespace() fails.
368 	 */
369 	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
370 	cpipe->pipe_state = 0;
371 	cpipe->pipe_peer = NULL;
372 	cpipe->pipe_busy = 0;
373 
374 #ifndef PIPE_NODIRECT
375 	/*
376 	 * pipe data structure initializations to support direct pipe I/O
377 	 */
378 	cpipe->pipe_map.cnt = 0;
379 	cpipe->pipe_map.kva = 0;
380 	cpipe->pipe_map.pos = 0;
381 	cpipe->pipe_map.npages = 0;
382 	/* cpipe->pipe_map.ms[] = invalid */
383 #endif
384 
385 	cpipe->pipe_mtxp = NULL;	/* avoid pipespace assertion */
386 	/*
387 	 * Reduce to 1/4th pipe size if we're over our global max.
388 	 */
389 	if (amountpipekva > maxpipekva / 2)
390 		error = pipespace(cpipe, SMALL_PIPE_SIZE);
391 	else
392 		error = pipespace(cpipe, PIPE_SIZE);
393 	if (error)
394 		return (error);
395 
396 	vfs_timestamp(&cpipe->pipe_ctime);
397 	cpipe->pipe_atime = cpipe->pipe_ctime;
398 	cpipe->pipe_mtime = cpipe->pipe_ctime;
399 
400 	return (0);
401 }
402 
403 
404 /*
405  * lock a pipe for I/O, blocking other access
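 * The caller must hold PIPE_MTX(cpipe); msleep() drops and reacquires it
 * while waiting for the current PIPE_LOCKFL holder to release the lock.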
406  */
407 static __inline int
408 pipelock(cpipe, catch)
409 	struct pipe *cpipe;
410 	int catch;
411 {
412 	int error;
413 
414 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
415 	while (cpipe->pipe_state & PIPE_LOCKFL) {
416 		cpipe->pipe_state |= PIPE_LWANT;
417 		error = msleep(cpipe, PIPE_MTX(cpipe),
418 		    catch ? (PRIBIO | PCATCH) : PRIBIO,
419 		    "pipelk", 0);
420 		if (error != 0)
421 			return (error);
422 	}
423 	cpipe->pipe_state |= PIPE_LOCKFL;
424 	return (0);
425 }
426 
427 /*
428  * unlock a pipe I/O lock
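 * and wake up any thread sleeping in pipelock()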
429  */
430 static __inline void
431 pipeunlock(cpipe)
432 	struct pipe *cpipe;
433 {
434 
435 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
436 	cpipe->pipe_state &= ~PIPE_LOCKFL;
437 	if (cpipe->pipe_state & PIPE_LWANT) {
438 		cpipe->pipe_state &= ~PIPE_LWANT;
439 		wakeup(cpipe);
440 	}
441 }
442 
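/*
 * Wake up anything selecting/polling on this pipe, deliver SIGIO when
 * PIPE_ASYNC is set and a signal recipient exists, and post any pending
 * kqueue events.
 */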
443 static __inline void
444 pipeselwakeup(cpipe)
445 	struct pipe *cpipe;
446 {
447 
448 	if (cpipe->pipe_state & PIPE_SEL) {
449 		cpipe->pipe_state &= ~PIPE_SEL;
450 		selwakeuppri(&cpipe->pipe_sel, PSOCK);
451 	}
452 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
453 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
454 	KNOTE(&cpipe->pipe_sel.si_note, 0);
455 }
456 
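/*
 * Read from a pipe: drain the kernel buffer, or copy directly out of the
 * writer's wired pages when a direct write (PIPE_DIRECTW) is pending.
 */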
457 /* ARGSUSED */
458 static int
459 pipe_read(fp, uio, active_cred, flags, td)
460 	struct file *fp;
461 	struct uio *uio;
462 	struct ucred *active_cred;
463 	struct thread *td;
464 	int flags;
465 {
466 	struct pipe *rpipe = fp->f_data;
467 	int error;
468 	int nread = 0;
469 	u_int size;
470 
471 	PIPE_LOCK(rpipe);
472 	++rpipe->pipe_busy;
473 	error = pipelock(rpipe, 1);
474 	if (error)
475 		goto unlocked_error;
476 
477 #ifdef MAC
478 	error = mac_check_pipe_read(active_cred, rpipe);
479 	if (error)
480 		goto locked_error;
481 #endif
482 
483 	while (uio->uio_resid) {
484 		/*
485 		 * normal pipe buffer receive
486 		 */
487 		if (rpipe->pipe_buffer.cnt > 0) {
488 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
489 			if (size > rpipe->pipe_buffer.cnt)
490 				size = rpipe->pipe_buffer.cnt;
491 			if (size > (u_int) uio->uio_resid)
492 				size = (u_int) uio->uio_resid;
493 
494 			PIPE_UNLOCK(rpipe);
495 			error = uiomove(
496 			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
497 			    size, uio);
498 			PIPE_LOCK(rpipe);
499 			if (error)
500 				break;
501 
502 			rpipe->pipe_buffer.out += size;
503 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
504 				rpipe->pipe_buffer.out = 0;
505 
506 			rpipe->pipe_buffer.cnt -= size;
507 
508 			/*
509 			 * If there is no more to read in the pipe, reset
510 			 * its pointers to the beginning.  This improves
511 			 * cache hit stats.
512 			 */
513 			if (rpipe->pipe_buffer.cnt == 0) {
514 				rpipe->pipe_buffer.in = 0;
515 				rpipe->pipe_buffer.out = 0;
516 			}
517 			nread += size;
518 #ifndef PIPE_NODIRECT
519 		/*
520 		 * Direct copy, bypassing a kernel buffer.
521 		 */
522 		} else if ((size = rpipe->pipe_map.cnt) &&
523 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
524 			caddr_t	va;
525 			if (size > (u_int) uio->uio_resid)
526 				size = (u_int) uio->uio_resid;
527 
528 			va = (caddr_t) rpipe->pipe_map.kva +
529 			    rpipe->pipe_map.pos;
530 			PIPE_UNLOCK(rpipe);
531 			error = uiomove(va, size, uio);
532 			PIPE_LOCK(rpipe);
533 			if (error)
534 				break;
535 			nread += size;
536 			rpipe->pipe_map.pos += size;
537 			rpipe->pipe_map.cnt -= size;
538 			if (rpipe->pipe_map.cnt == 0) {
539 				rpipe->pipe_state &= ~PIPE_DIRECTW;
540 				wakeup(rpipe);
541 			}
542 #endif
543 		} else {
544 			/*
545 			 * detect EOF condition
546 			 * read returns 0 on EOF, no need to set error
547 			 */
548 			if (rpipe->pipe_state & PIPE_EOF)
549 				break;
550 
551 			/*
552 			 * If the "write-side" has been blocked, wake it up now.
553 			 */
554 			if (rpipe->pipe_state & PIPE_WANTW) {
555 				rpipe->pipe_state &= ~PIPE_WANTW;
556 				wakeup(rpipe);
557 			}
558 
559 			/*
560 			 * Break if some data was read.
561 			 */
562 			if (nread > 0)
563 				break;
564 
565 			/*
566 			 * Unlock the pipe buffer for our remaining processing.
567 			 * We will either break out with an error or we will
568 			 * sleep and relock to loop.
569 			 */
570 			pipeunlock(rpipe);
571 
572 			/*
573 			 * Handle non-blocking mode operation or
574 			 * wait for more data.
575 			 */
576 			if (fp->f_flag & FNONBLOCK) {
577 				error = EAGAIN;
578 			} else {
579 				rpipe->pipe_state |= PIPE_WANTR;
580 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
581 				    PRIBIO | PCATCH,
582 				    "piperd", 0)) == 0)
583 					error = pipelock(rpipe, 1);
584 			}
585 			if (error)
586 				goto unlocked_error;
587 		}
588 	}
589 #ifdef MAC
590 locked_error:
591 #endif
592 	pipeunlock(rpipe);
593 
594 	/* XXX: should probably do this before getting any locks. */
595 	if (error == 0)
596 		vfs_timestamp(&rpipe->pipe_atime);
597 unlocked_error:
598 	--rpipe->pipe_busy;
599 
600 	/*
601 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
602 	 */
603 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
604 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
605 		wakeup(rpipe);
606 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
607 		/*
608 		 * Handle write blocking hysteresis.
609 		 */
610 		if (rpipe->pipe_state & PIPE_WANTW) {
611 			rpipe->pipe_state &= ~PIPE_WANTW;
612 			wakeup(rpipe);
613 		}
614 	}
615 
616 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
617 		pipeselwakeup(rpipe);
618 
619 	PIPE_UNLOCK(rpipe);
620 	return (error);
621 }
622 
623 #ifndef PIPE_NODIRECT
624 /*
625  * Map the sending process's buffer into kernel space and wire it.
626  * This is similar to a physical write operation.
627  */
628 static int
629 pipe_build_write_buffer(wpipe, uio)
630 	struct pipe *wpipe;
631 	struct uio *uio;
632 {
633 	pmap_t pmap;
634 	u_int size;
635 	int i, j;
636 	vm_offset_t addr, endaddr;
637 
638 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
639 
640 	size = (u_int) uio->uio_iov->iov_len;
641 	if (size > wpipe->pipe_buffer.size)
642 		size = wpipe->pipe_buffer.size;
643 
644 	pmap = vmspace_pmap(curproc->p_vmspace);
645 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
646 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
647 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
648 		/*
649 		 * vm_fault_quick() can sleep.  Consequently,
650 		 * vm_page_lock_queues() and vm_page_unlock_queues()
651 		 * should not be performed outside of this loop.
652 		 */
653 	race:
654 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
655 			vm_page_lock_queues();
656 			for (j = 0; j < i; j++)
657 				vm_page_unhold(wpipe->pipe_map.ms[j]);
658 			vm_page_unlock_queues();
659 			return (EFAULT);
660 		}
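		/*
		 * The mapping could have been torn down after the fault
		 * above; if pmap_extract_and_hold() finds no page, fault
		 * it in again and retry.
		 */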
661 		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
662 		    VM_PROT_READ);
663 		if (wpipe->pipe_map.ms[i] == NULL)
664 			goto race;
665 	}
666 
667 /*
668  * set up the control block
669  */
670 	wpipe->pipe_map.npages = i;
671 	wpipe->pipe_map.pos =
672 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
673 	wpipe->pipe_map.cnt = size;
674 
675 /*
676  * and map the buffer
677  */
678 	if (wpipe->pipe_map.kva == 0) {
679 		/*
680 		 * We need to allocate space for an extra page because the
681 		 * address range might (will) span pages at times.
682 		 */
683 		wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
684 			wpipe->pipe_buffer.size + PAGE_SIZE);
685 		atomic_add_int(&amountpipekvawired,
686 		    wpipe->pipe_buffer.size + PAGE_SIZE);
687 	}
688 	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
689 		wpipe->pipe_map.npages);
690 
691 /*
692  * and update the uio data
693  */
694 
695 	uio->uio_iov->iov_len -= size;
696 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
697 	if (uio->uio_iov->iov_len == 0)
698 		uio->uio_iov++;
699 	uio->uio_resid -= size;
700 	uio->uio_offset += size;
701 	return (0);
702 }
703 
704 /*
705  * unmap and unwire the process buffer
706  */
707 static void
708 pipe_destroy_write_buffer(wpipe)
709 	struct pipe *wpipe;
710 {
711 	int i;
712 
713 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
714 	if (wpipe->pipe_map.kva) {
715 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
716 
717 		if (amountpipekvawired > maxpipekvawired / 2) {
718 			/* Conserve address space */
719 			vm_offset_t kva = wpipe->pipe_map.kva;
720 			wpipe->pipe_map.kva = 0;
721 			kmem_free(kernel_map, kva,
722 			    wpipe->pipe_buffer.size + PAGE_SIZE);
723 			atomic_subtract_int(&amountpipekvawired,
724 			    wpipe->pipe_buffer.size + PAGE_SIZE);
725 		}
726 	}
727 	vm_page_lock_queues();
728 	for (i = 0; i < wpipe->pipe_map.npages; i++) {
729 		vm_page_unhold(wpipe->pipe_map.ms[i]);
730 	}
731 	vm_page_unlock_queues();
732 	wpipe->pipe_map.npages = 0;
733 }
734 
735 /*
736  * In the case of a signal, the writing process might go away.  This
737  * code copies the data into the circular buffer so that the source
738  * pages can be freed without loss of data.
739  */
740 static void
741 pipe_clone_write_buffer(wpipe)
742 	struct pipe *wpipe;
743 {
744 	int size;
745 	int pos;
746 
747 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
748 	size = wpipe->pipe_map.cnt;
749 	pos = wpipe->pipe_map.pos;
750 
751 	wpipe->pipe_buffer.in = size;
752 	wpipe->pipe_buffer.out = 0;
753 	wpipe->pipe_buffer.cnt = size;
754 	wpipe->pipe_state &= ~PIPE_DIRECTW;
755 
756 	PIPE_UNLOCK(wpipe);
757 	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
758 	    wpipe->pipe_buffer.buffer, size);
759 	pipe_destroy_write_buffer(wpipe);
760 	PIPE_LOCK(wpipe);
761 }
762 
763 /*
764  * This implements the pipe buffer write mechanism.  Note that only
765  * a direct write OR a normal pipe write can be pending at any given time.
766  * If there are any characters in the pipe buffer, the direct write will
767  * be deferred until the receiving process grabs all of the bytes from
768  * the pipe buffer.  Then the direct mapping write is set-up.
769  */
770 static int
771 pipe_direct_write(wpipe, uio)
772 	struct pipe *wpipe;
773 	struct uio *uio;
774 {
775 	int error;
776 
777 retry:
778 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
779 	while (wpipe->pipe_state & PIPE_DIRECTW) {
780 		if (wpipe->pipe_state & PIPE_WANTR) {
781 			wpipe->pipe_state &= ~PIPE_WANTR;
782 			wakeup(wpipe);
783 		}
784 		wpipe->pipe_state |= PIPE_WANTW;
785 		error = msleep(wpipe, PIPE_MTX(wpipe),
786 		    PRIBIO | PCATCH, "pipdww", 0);
787 		if (error)
788 			goto error1;
789 		if (wpipe->pipe_state & PIPE_EOF) {
790 			error = EPIPE;
791 			goto error1;
792 		}
793 	}
794 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
795 	if (wpipe->pipe_buffer.cnt > 0) {
796 		if (wpipe->pipe_state & PIPE_WANTR) {
797 			wpipe->pipe_state &= ~PIPE_WANTR;
798 			wakeup(wpipe);
799 		}
800 
801 		wpipe->pipe_state |= PIPE_WANTW;
802 		error = msleep(wpipe, PIPE_MTX(wpipe),
803 		    PRIBIO | PCATCH, "pipdwc", 0);
804 		if (error)
805 			goto error1;
806 		if (wpipe->pipe_state & PIPE_EOF) {
807 			error = EPIPE;
808 			goto error1;
809 		}
810 		goto retry;
811 	}
812 
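	/*
	 * Claim the direct-write slot now; pipe_map.cnt is still zero, so
	 * the reader will not try to consume anything until the buffer has
	 * been built below.
	 */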
813 	wpipe->pipe_state |= PIPE_DIRECTW;
814 
815 	pipelock(wpipe, 0);
816 	PIPE_UNLOCK(wpipe);
817 	error = pipe_build_write_buffer(wpipe, uio);
818 	PIPE_LOCK(wpipe);
819 	pipeunlock(wpipe);
820 	if (error) {
821 		wpipe->pipe_state &= ~PIPE_DIRECTW;
822 		goto error1;
823 	}
824 
825 	error = 0;
826 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
827 		if (wpipe->pipe_state & PIPE_EOF) {
828 			pipelock(wpipe, 0);
829 			PIPE_UNLOCK(wpipe);
830 			pipe_destroy_write_buffer(wpipe);
831 			PIPE_LOCK(wpipe);
832 			pipeselwakeup(wpipe);
833 			pipeunlock(wpipe);
834 			error = EPIPE;
835 			goto error1;
836 		}
837 		if (wpipe->pipe_state & PIPE_WANTR) {
838 			wpipe->pipe_state &= ~PIPE_WANTR;
839 			wakeup(wpipe);
840 		}
841 		pipeselwakeup(wpipe);
842 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
843 		    "pipdwt", 0);
844 	}
845 
846 	pipelock(wpipe, 0);
847 	if (wpipe->pipe_state & PIPE_DIRECTW) {
848 		/*
849 		 * this bit of trickery substitutes a kernel buffer for
850 		 * the process that might be going away.
851 		 */
852 		pipe_clone_write_buffer(wpipe);
853 	} else {
854 		PIPE_UNLOCK(wpipe);
855 		pipe_destroy_write_buffer(wpipe);
856 		PIPE_LOCK(wpipe);
857 	}
858 	pipeunlock(wpipe);
859 	return (error);
860 
861 error1:
862 	wakeup(wpipe);
863 	return (error);
864 }
865 #endif
866 
867 static int
868 pipe_write(fp, uio, active_cred, flags, td)
869 	struct file *fp;
870 	struct uio *uio;
871 	struct ucred *active_cred;
872 	struct thread *td;
873 	int flags;
874 {
875 	int error = 0;
876 	int orig_resid;
877 	struct pipe *wpipe, *rpipe;
878 
879 	rpipe = fp->f_data;
880 	wpipe = rpipe->pipe_peer;
881 
882 	PIPE_LOCK(rpipe);
883 	/*
884 	 * detect loss of pipe read side, issue SIGPIPE if lost.
885 	 */
886 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
887 		PIPE_UNLOCK(rpipe);
888 		return (EPIPE);
889 	}
890 #ifdef MAC
891 	error = mac_check_pipe_write(active_cred, wpipe);
892 	if (error) {
893 		PIPE_UNLOCK(rpipe);
894 		return (error);
895 	}
896 #endif
897 	++wpipe->pipe_busy;
898 
899 	/*
900 	 * If it is advantageous to resize the pipe buffer, do
901 	 * so.
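	 * We only grow to BIG_PIPE_SIZE when the write is larger than
	 * PIPE_SIZE, pipe KVA use is below half its limit, fewer than
	 * LIMITBIGPIPES big pipes exist, no direct write is pending, the
	 * existing buffer is no larger than PIPE_SIZE, and it is empty.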
902 	 */
903 	if ((uio->uio_resid > PIPE_SIZE) &&
904 		(amountpipekva < maxpipekva / 2) &&
905 		(nbigpipe < LIMITBIGPIPES) &&
906 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
907 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
908 		(wpipe->pipe_buffer.cnt == 0)) {
909 
910 		if ((error = pipelock(wpipe, 1)) == 0) {
911 			PIPE_UNLOCK(wpipe);
912 			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
913 				atomic_add_int(&nbigpipe, 1);
914 			PIPE_LOCK(wpipe);
915 			pipeunlock(wpipe);
916 		}
917 	}
918 
919 	/*
920 	 * If an early error occurred, unbusy and return, waking up any pending
921 	 * readers.
922 	 */
923 	if (error) {
924 		--wpipe->pipe_busy;
925 		if ((wpipe->pipe_busy == 0) &&
926 		    (wpipe->pipe_state & PIPE_WANT)) {
927 			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
928 			wakeup(wpipe);
929 		}
930 		PIPE_UNLOCK(rpipe);
931 		return(error);
932 	}
933 
934 	orig_resid = uio->uio_resid;
935 
936 	while (uio->uio_resid) {
937 		int space;
938 
939 #ifndef PIPE_NODIRECT
940 		/*
941 		 * If the transfer is large, we can gain performance if
942 		 * we do process-to-process copies directly.
943 		 * If the write is non-blocking, we don't use the
944 		 * direct write mechanism.
945 		 *
946 		 * The direct write mechanism will detect the reader going
947 		 * away on us.
948 		 */
949 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
950 		    (fp->f_flag & FNONBLOCK) == 0 &&
951 		    amountpipekvawired + uio->uio_resid < maxpipekvawired) {
952 			error = pipe_direct_write(wpipe, uio);
953 			if (error)
954 				break;
955 			continue;
956 		}
957 #endif
958 
959 		/*
960 		 * Pipe buffered writes cannot be coincident with
961 		 * direct writes.  We wait until the currently executing
962 		 * direct write is completed before we start filling the
963 		 * pipe buffer.  We break out if a signal occurs or the
964 		 * reader goes away.
965 		 */
966 	retrywrite:
967 		while (wpipe->pipe_state & PIPE_DIRECTW) {
968 			if (wpipe->pipe_state & PIPE_WANTR) {
969 				wpipe->pipe_state &= ~PIPE_WANTR;
970 				wakeup(wpipe);
971 			}
972 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
973 			    "pipbww", 0);
974 			if (wpipe->pipe_state & PIPE_EOF)
975 				break;
976 			if (error)
977 				break;
978 		}
979 		if (wpipe->pipe_state & PIPE_EOF) {
980 			error = EPIPE;
981 			break;
982 		}
983 
984 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
985 
986 		/* Writes of size <= PIPE_BUF must be atomic. */
987 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
988 			space = 0;
989 
990 		if (space > 0) {
991 			if ((error = pipelock(wpipe, 1)) == 0) {
992 				int size;	/* Transfer size */
993 				int segsize;	/* first segment to transfer */
994 
995 				/*
996 				 * It is possible for a direct write to
997 				 * slip in on us... handle it here...
998 				 */
999 				if (wpipe->pipe_state & PIPE_DIRECTW) {
1000 					pipeunlock(wpipe);
1001 					goto retrywrite;
1002 				}
1003 				/*
1004 				 * If a process blocked in uiomove, our
1005 				 * value for space might be bad.
1006 				 *
1007 				 * XXX will we be ok if the reader has gone
1008 				 * away here?
1009 				 */
1010 				if (space > wpipe->pipe_buffer.size -
1011 				    wpipe->pipe_buffer.cnt) {
1012 					pipeunlock(wpipe);
1013 					goto retrywrite;
1014 				}
1015 
1016 				/*
1017 				 * Transfer size is minimum of uio transfer
1018 				 * and free space in pipe buffer.
1019 				 */
1020 				if (space > uio->uio_resid)
1021 					size = uio->uio_resid;
1022 				else
1023 					size = space;
1024 				/*
1025 				 * First segment to transfer is minimum of
1026 				 * transfer size and contiguous space in
1027 				 * pipe buffer.  If first segment to transfer
1028 				 * is less than the transfer size, we've got
1029 				 * a wraparound in the buffer.
1030 				 */
1031 				segsize = wpipe->pipe_buffer.size -
1032 					wpipe->pipe_buffer.in;
1033 				if (segsize > size)
1034 					segsize = size;
1035 
1036 				/* Transfer first segment */
1037 
1038 				PIPE_UNLOCK(rpipe);
1039 				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1040 						segsize, uio);
1041 				PIPE_LOCK(rpipe);
1042 
1043 				if (error == 0 && segsize < size) {
1044 					/*
1045 					 * Transfer remaining part now, to
1046 					 * support atomic writes.  Wraparound
1047 					 * happened.
1048 					 */
1049 					if (wpipe->pipe_buffer.in + segsize !=
1050 					    wpipe->pipe_buffer.size)
1051 						panic("Expected pipe buffer "
1052 						    "wraparound disappeared");
1053 
1054 					PIPE_UNLOCK(rpipe);
1055 					error = uiomove(
1056 					    &wpipe->pipe_buffer.buffer[0],
1057 				    	    size - segsize, uio);
1058 					PIPE_LOCK(rpipe);
1059 				}
1060 				if (error == 0) {
1061 					wpipe->pipe_buffer.in += size;
1062 					if (wpipe->pipe_buffer.in >=
1063 					    wpipe->pipe_buffer.size) {
1064 						if (wpipe->pipe_buffer.in !=
1065 						    size - segsize +
1066 						    wpipe->pipe_buffer.size)
1067 							panic("Expected "
1068 							    "wraparound bad");
1069 						wpipe->pipe_buffer.in = size -
1070 						    segsize;
1071 					}
1072 
1073 					wpipe->pipe_buffer.cnt += size;
1074 					if (wpipe->pipe_buffer.cnt >
1075 					    wpipe->pipe_buffer.size)
1076 						panic("Pipe buffer overflow");
1077 
1078 				}
1079 				pipeunlock(wpipe);
1080 			}
1081 			if (error)
1082 				break;
1083 
1084 		} else {
1085 			/*
1086 			 * If the "read-side" has been blocked, wake it up now.
1087 			 */
1088 			if (wpipe->pipe_state & PIPE_WANTR) {
1089 				wpipe->pipe_state &= ~PIPE_WANTR;
1090 				wakeup(wpipe);
1091 			}
1092 
1093 			/*
1094 			 * don't block on non-blocking I/O
1095 			 */
1096 			if (fp->f_flag & FNONBLOCK) {
1097 				error = EAGAIN;
1098 				break;
1099 			}
1100 
1101 			/*
1102 			 * We have no more space and have something to offer,
1103 			 * wake up select/poll.
1104 			 */
1105 			pipeselwakeup(wpipe);
1106 
1107 			wpipe->pipe_state |= PIPE_WANTW;
1108 			error = msleep(wpipe, PIPE_MTX(rpipe),
1109 			    PRIBIO | PCATCH, "pipewr", 0);
1110 			if (error != 0)
1111 				break;
1112 			/*
1113 			 * If read side wants to go away, we just issue a signal
1114 			 * to ourselves.
1115 			 */
1116 			if (wpipe->pipe_state & PIPE_EOF) {
1117 				error = EPIPE;
1118 				break;
1119 			}
1120 		}
1121 	}
1122 
1123 	--wpipe->pipe_busy;
1124 
1125 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1126 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1127 		wakeup(wpipe);
1128 	} else if (wpipe->pipe_buffer.cnt > 0) {
1129 		/*
1130 		 * If we have put any characters in the buffer, we wake up
1131 		 * the reader.
1132 		 */
1133 		if (wpipe->pipe_state & PIPE_WANTR) {
1134 			wpipe->pipe_state &= ~PIPE_WANTR;
1135 			wakeup(wpipe);
1136 		}
1137 	}
1138 
1139 	/*
1140 	 * Don't return EPIPE if I/O was successful
1141 	 */
1142 	if ((wpipe->pipe_buffer.cnt == 0) &&
1143 	    (uio->uio_resid == 0) &&
1144 	    (error == EPIPE)) {
1145 		error = 0;
1146 	}
1147 
1148 	if (error == 0)
1149 		vfs_timestamp(&wpipe->pipe_mtime);
1150 
1151 	/*
1152 	 * We have something to offer,
1153 	 * wake up select/poll.
1154 	 */
1155 	if (wpipe->pipe_buffer.cnt)
1156 		pipeselwakeup(wpipe);
1157 
1158 	PIPE_UNLOCK(rpipe);
1159 	return (error);
1160 }
1161 
1162 /*
1163  * we implement a very minimal set of ioctls for compatibility with sockets.
1164  */
1165 static int
1166 pipe_ioctl(fp, cmd, data, active_cred, td)
1167 	struct file *fp;
1168 	u_long cmd;
1169 	void *data;
1170 	struct ucred *active_cred;
1171 	struct thread *td;
1172 {
1173 	struct pipe *mpipe = fp->f_data;
1174 #ifdef MAC
1175 	int error;
1176 #endif
1177 
1178 	PIPE_LOCK(mpipe);
1179 
1180 #ifdef MAC
1181 	error = mac_check_pipe_ioctl(active_cred, mpipe, cmd, data);
1182 	if (error) {
1183 		PIPE_UNLOCK(mpipe);
1184 		return (error);
1185 	}
1186 #endif
1187 
1188 	switch (cmd) {
1189 
1190 	case FIONBIO:
1191 		PIPE_UNLOCK(mpipe);
1192 		return (0);
1193 
1194 	case FIOASYNC:
1195 		if (*(int *)data) {
1196 			mpipe->pipe_state |= PIPE_ASYNC;
1197 		} else {
1198 			mpipe->pipe_state &= ~PIPE_ASYNC;
1199 		}
1200 		PIPE_UNLOCK(mpipe);
1201 		return (0);
1202 
1203 	case FIONREAD:
1204 		if (mpipe->pipe_state & PIPE_DIRECTW)
1205 			*(int *)data = mpipe->pipe_map.cnt;
1206 		else
1207 			*(int *)data = mpipe->pipe_buffer.cnt;
1208 		PIPE_UNLOCK(mpipe);
1209 		return (0);
1210 
1211 	case FIOSETOWN:
1212 		PIPE_UNLOCK(mpipe);
1213 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1214 
1215 	case FIOGETOWN:
1216 		PIPE_UNLOCK(mpipe);
1217 		*(int *)data = fgetown(&mpipe->pipe_sigio);
1218 		return (0);
1219 
1220 	/* This is deprecated, FIOSETOWN should be used instead. */
1221 	case TIOCSPGRP:
1222 		PIPE_UNLOCK(mpipe);
1223 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1224 
1225 	/* This is deprecated, FIOGETOWN should be used instead. */
1226 	case TIOCGPGRP:
1227 		PIPE_UNLOCK(mpipe);
1228 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1229 		return (0);
1230 
1231 	}
1232 	PIPE_UNLOCK(mpipe);
1233 	return (ENOTTY);
1234 }
1235 
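/*
 * poll/select support: report readability when buffered or direct-write
 * data is available or EOF has been seen, writability when the peer is
 * gone or at least PIPE_BUF bytes of buffer space are free, and POLLHUP
 * once either side has hit EOF.
 */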
1236 static int
1237 pipe_poll(fp, events, active_cred, td)
1238 	struct file *fp;
1239 	int events;
1240 	struct ucred *active_cred;
1241 	struct thread *td;
1242 {
1243 	struct pipe *rpipe = fp->f_data;
1244 	struct pipe *wpipe;
1245 	int revents = 0;
1246 #ifdef MAC
1247 	int error;
1248 #endif
1249 
1250 	wpipe = rpipe->pipe_peer;
1251 	PIPE_LOCK(rpipe);
1252 #ifdef MAC
1253 	error = mac_check_pipe_poll(active_cred, rpipe);
1254 	if (error)
1255 		goto locked_error;
1256 #endif
1257 	if (events & (POLLIN | POLLRDNORM))
1258 		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1259 		    (rpipe->pipe_buffer.cnt > 0) ||
1260 		    (rpipe->pipe_state & PIPE_EOF))
1261 			revents |= events & (POLLIN | POLLRDNORM);
1262 
1263 	if (events & (POLLOUT | POLLWRNORM))
1264 		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
1265 		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1266 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1267 			revents |= events & (POLLOUT | POLLWRNORM);
1268 
1269 	if ((rpipe->pipe_state & PIPE_EOF) ||
1270 	    (wpipe == NULL) ||
1271 	    (wpipe->pipe_state & PIPE_EOF))
1272 		revents |= POLLHUP;
1273 
1274 	if (revents == 0) {
1275 		if (events & (POLLIN | POLLRDNORM)) {
1276 			selrecord(td, &rpipe->pipe_sel);
1277 			rpipe->pipe_state |= PIPE_SEL;
1278 		}
1279 
1280 		if (events & (POLLOUT | POLLWRNORM)) {
1281 			selrecord(td, &wpipe->pipe_sel);
1282 			wpipe->pipe_state |= PIPE_SEL;
1283 		}
1284 	}
1285 #ifdef MAC
1286 locked_error:
1287 #endif
1288 	PIPE_UNLOCK(rpipe);
1289 
1290 	return (revents);
1291 }
1292 
1293 /*
1294  * We shouldn't need locks here as we're doing a read and this should
1295  * be a natural race.
1296  */
1297 static int
1298 pipe_stat(fp, ub, active_cred, td)
1299 	struct file *fp;
1300 	struct stat *ub;
1301 	struct ucred *active_cred;
1302 	struct thread *td;
1303 {
1304 	struct pipe *pipe = fp->f_data;
1305 #ifdef MAC
1306 	int error;
1307 
1308 	PIPE_LOCK(pipe);
1309 	error = mac_check_pipe_stat(active_cred, pipe);
1310 	PIPE_UNLOCK(pipe);
1311 	if (error)
1312 		return (error);
1313 #endif
1314 	bzero(ub, sizeof(*ub));
1315 	ub->st_mode = S_IFIFO;
1316 	ub->st_blksize = pipe->pipe_buffer.size;
1317 	ub->st_size = pipe->pipe_buffer.cnt;
1318 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1319 	ub->st_atimespec = pipe->pipe_atime;
1320 	ub->st_mtimespec = pipe->pipe_mtime;
1321 	ub->st_ctimespec = pipe->pipe_ctime;
1322 	ub->st_uid = fp->f_cred->cr_uid;
1323 	ub->st_gid = fp->f_cred->cr_gid;
1324 	/*
1325 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1326 	 * XXX (st_dev, st_ino) should be unique.
1327 	 */
1328 	return (0);
1329 }
1330 
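/*
 * fo_close handler: detach the file from the pipe, clear SIGIO ownership,
 * and release this endpoint via pipeclose().
 */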
1331 /* ARGSUSED */
1332 static int
1333 pipe_close(fp, td)
1334 	struct file *fp;
1335 	struct thread *td;
1336 {
1337 	struct pipe *cpipe = fp->f_data;
1338 
1339 	fp->f_ops = &badfileops;
1340 	fp->f_data = NULL;
1341 	funsetown(&cpipe->pipe_sigio);
1342 	pipeclose(cpipe);
1343 	return (0);
1344 }
1345 
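/*
 * Release the pipe's circular-buffer KVA and any wired direct-write
 * mapping, and update the global accounting counters.
 */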
1346 static void
1347 pipe_free_kmem(cpipe)
1348 	struct pipe *cpipe;
1349 {
1350 
1351 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
1352 	       ("pipespace: pipe mutex locked"));
1353 
1354 	if (cpipe->pipe_buffer.buffer != NULL) {
1355 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1356 			atomic_subtract_int(&nbigpipe, 1);
1357 		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1358 		atomic_subtract_int(&amountpipes, 1);
1359 		vm_map_remove(pipe_map,
1360 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1361 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1362 		cpipe->pipe_buffer.buffer = NULL;
1363 	}
1364 #ifndef PIPE_NODIRECT
1365 	if (cpipe->pipe_map.kva != 0) {
1366 		atomic_subtract_int(&amountpipekvawired,
1367 		    cpipe->pipe_buffer.size + PAGE_SIZE);
1368 		kmem_free(kernel_map,
1369 			cpipe->pipe_map.kva,
1370 			cpipe->pipe_buffer.size + PAGE_SIZE);
1371 		cpipe->pipe_map.cnt = 0;
1372 		cpipe->pipe_map.kva = 0;
1373 		cpipe->pipe_map.pos = 0;
1374 		cpipe->pipe_map.npages = 0;
1375 	}
1376 #endif
1377 }
1378 
1379 /*
1380  * shutdown the pipe
1381  * shut down the pipe
1382 static void
1383 pipeclose(cpipe)
1384 	struct pipe *cpipe;
1385 {
1386 	struct pipe *ppipe;
1387 	int hadpeer;
1388 
1389 	if (cpipe == NULL)
1390 		return;
1391 
1392 	hadpeer = 0;
1393 
1394 	/* partially created pipes won't have a valid mutex. */
1395 	if (PIPE_MTX(cpipe) != NULL)
1396 		PIPE_LOCK(cpipe);
1397 
1398 	pipeselwakeup(cpipe);
1399 
1400 	/*
1401 	 * If the other side is blocked, wake it up saying that
1402 	 * we want to close it down.
1403 	 */
1404 	while (cpipe->pipe_busy) {
1405 		wakeup(cpipe);
1406 		cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
1407 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1408 	}
1409 
1410 #ifdef MAC
1411 	if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
1412 		mac_destroy_pipe(cpipe);
1413 #endif
1414 
1415 	/*
1416 	 * Disconnect from peer
1417 	 */
1418 	if ((ppipe = cpipe->pipe_peer) != NULL) {
1419 		hadpeer++;
1420 		pipeselwakeup(ppipe);
1421 
1422 		ppipe->pipe_state |= PIPE_EOF;
1423 		wakeup(ppipe);
1424 		KNOTE(&ppipe->pipe_sel.si_note, 0);
1425 		ppipe->pipe_peer = NULL;
1426 	}
1427 	/*
1428 	 * free resources
1429 	 */
1430 	if (PIPE_MTX(cpipe) != NULL) {
1431 		PIPE_UNLOCK(cpipe);
1432 		if (!hadpeer) {
1433 			mtx_destroy(PIPE_MTX(cpipe));
1434 			free(PIPE_MTX(cpipe), M_TEMP);
1435 		}
1436 	}
1437 	pipe_free_kmem(cpipe);
1438 	uma_zfree(pipe_zone, cpipe);
1439 }
1440 
1441 /*ARGSUSED*/
1442 static int
1443 pipe_kqfilter(struct file *fp, struct knote *kn)
1444 {
1445 	struct pipe *cpipe;
1446 
1447 	cpipe = kn->kn_fp->f_data;
1448 	switch (kn->kn_filter) {
1449 	case EVFILT_READ:
1450 		kn->kn_fop = &pipe_rfiltops;
1451 		break;
1452 	case EVFILT_WRITE:
1453 		kn->kn_fop = &pipe_wfiltops;
1454 		cpipe = cpipe->pipe_peer;
1455 		if (cpipe == NULL)
1456 			/* other end of pipe has been closed */
1457 			return (EPIPE);
1458 		break;
1459 	default:
1460 		return (1);
1461 	}
1462 
1463 	PIPE_LOCK(cpipe);
1464 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1465 	PIPE_UNLOCK(cpipe);
1466 	return (0);
1467 }
1468 
1469 static void
1470 filt_pipedetach(struct knote *kn)
1471 {
1472 	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
1473 
1474 	if (kn->kn_filter == EVFILT_WRITE) {
1475 		if (cpipe->pipe_peer == NULL)
1476 			return;
1477 		cpipe = cpipe->pipe_peer;
1478 	}
1479 
1480 	PIPE_LOCK(cpipe);
1481 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1482 	PIPE_UNLOCK(cpipe);
1483 }
1484 
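/*
 * kqueue EVFILT_READ filter: kn_data is the number of bytes available,
 * either buffered or from a pending direct write; EV_EOF is set once the
 * writer has gone away.
 */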
1485 /*ARGSUSED*/
1486 static int
1487 filt_piperead(struct knote *kn, long hint)
1488 {
1489 	struct pipe *rpipe = kn->kn_fp->f_data;
1490 	struct pipe *wpipe = rpipe->pipe_peer;
1491 
1492 	PIPE_LOCK(rpipe);
1493 	kn->kn_data = rpipe->pipe_buffer.cnt;
1494 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1495 		kn->kn_data = rpipe->pipe_map.cnt;
1496 
1497 	if ((rpipe->pipe_state & PIPE_EOF) ||
1498 	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1499 		kn->kn_flags |= EV_EOF;
1500 		PIPE_UNLOCK(rpipe);
1501 		return (1);
1502 	}
1503 	PIPE_UNLOCK(rpipe);
1504 	return (kn->kn_data > 0);
1505 }
1506 
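/*
 * kqueue EVFILT_WRITE filter: kn_data is the free space in the peer's
 * buffer (zero while a direct write is pending); the event fires when at
 * least PIPE_BUF bytes can be written, and EV_EOF is set if the reader
 * has gone away.
 */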
1507 /*ARGSUSED*/
1508 static int
1509 filt_pipewrite(struct knote *kn, long hint)
1510 {
1511 	struct pipe *rpipe = kn->kn_fp->f_data;
1512 	struct pipe *wpipe = rpipe->pipe_peer;
1513 
1514 	PIPE_LOCK(rpipe);
1515 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1516 		kn->kn_data = 0;
1517 		kn->kn_flags |= EV_EOF;
1518 		PIPE_UNLOCK(rpipe);
1519 		return (1);
1520 	}
1521 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1522 	if (wpipe->pipe_state & PIPE_DIRECTW)
1523 		kn->kn_data = 0;
1524 
1525 	PIPE_UNLOCK(rpipe);
1526 	return (kn->kn_data >= PIPE_BUF);
1527 }
1528