xref: /freebsd/sys/kern/sys_pipe.c (revision 7660b554bc59a07be0431c17e0e33815818baa69)
1 /*
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  *    are met.
18  */
19 
20 /*
21  * This file contains a high-performance replacement for the socket-based
22  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
23  * all features of sockets, but does do everything that pipes normally
24  * do.
25  */
26 
27 /*
28  * This code has two modes of operation, a small write mode and a large
29  * write mode.  The small write mode acts like conventional pipes with
30  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
31  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
32  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
33  * the receiving process can copy it directly from the pages in the sending
34  * process.
35  *
36  * If the sending process receives a signal, it is possible that it will
37  * go away, and certainly its address space can change, because control
38  * is returned to the user-mode side.  In that case, the pipe code
39  * arranges to copy the buffer supplied by the user process to a pageable
40  * kernel buffer, and the receiving process will grab the data from the
41  * pageable kernel buffer.  Since signals don't happen all that often,
42  * the copy operation is normally eliminated.
43  *
44  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
45  * happen for small transfers so that the system will not spend all of
46  * its time context switching.
47  *
48  * In order to limit the resource use of pipes, two sysctls exist:
49  *
50  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
51  * address space available to us in pipe_map.  Whenever the amount in use
52  * exceeds half of this value, all new pipes will be created with size
53  * SMALL_PIPE_SIZE, rather than PIPE_SIZE.  Big pipe creation will be limited
54  * as well.  This value is loader tunable only.
55  *
56  * kern.ipc.maxpipekvawired - This value limits the amount of memory that may
57  * be wired in order to facilitate direct copies using page flipping.
58  * Whenever this value is exceeded, pipes will fall back to using regular
59  * copies.  This value is sysctl controllable at all times.
60  *
61  * These values are autotuned in subr_param.c.
62  *
63  * Memory usage may be monitored through the sysctls
64  * kern.ipc.pipes, kern.ipc.pipekva and kern.ipc.pipekvawired.
65  *
66  */
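
/*
 * Illustrative userland sketch (not part of this file): reading the pipe
 * memory-usage sysctls described above with sysctlbyname(3).  The sysctl
 * names come from the SYSCTL_INT() declarations in this file; the variable
 * names and error handling are assumptions made for the example.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int pipes, pipekva, pipekvawired;
	size_t len;

	len = sizeof(pipes);
	if (sysctlbyname("kern.ipc.pipes", &pipes, &len, NULL, 0) == -1)
		return (1);
	len = sizeof(pipekva);
	if (sysctlbyname("kern.ipc.pipekva", &pipekva, &len, NULL, 0) == -1)
		return (1);
	len = sizeof(pipekvawired);
	if (sysctlbyname("kern.ipc.pipekvawired", &pipekvawired, &len,
	    NULL, 0) == -1)
		return (1);
	printf("pipes: %d, pipe KVA: %d, wired pipe KVA: %d\n",
	    pipes, pipekva, pipekvawired);
	return (0);
}
#endif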
67 
68 #include <sys/cdefs.h>
69 __FBSDID("$FreeBSD$");
70 
71 #include "opt_mac.h"
72 
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/fcntl.h>
76 #include <sys/file.h>
77 #include <sys/filedesc.h>
78 #include <sys/filio.h>
79 #include <sys/kernel.h>
80 #include <sys/lock.h>
81 #include <sys/mac.h>
82 #include <sys/mutex.h>
83 #include <sys/ttycom.h>
84 #include <sys/stat.h>
85 #include <sys/malloc.h>
86 #include <sys/poll.h>
87 #include <sys/selinfo.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysproto.h>
91 #include <sys/pipe.h>
92 #include <sys/proc.h>
93 #include <sys/vnode.h>
94 #include <sys/uio.h>
95 #include <sys/event.h>
96 
97 #include <vm/vm.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_object.h>
100 #include <vm/vm_kern.h>
101 #include <vm/vm_extern.h>
102 #include <vm/pmap.h>
103 #include <vm/vm_map.h>
104 #include <vm/vm_page.h>
105 #include <vm/uma.h>
106 
107 /*
108  * Use this define if you want to disable *fancy* VM things.  Expect a
109  * roughly 30% decrease in transfer rate.  This could be useful for
110  * NetBSD or OpenBSD.
111  */
112 /* #define PIPE_NODIRECT */
113 
114 /*
115  * interfaces to the outside world
116  */
117 static fo_rdwr_t	pipe_read;
118 static fo_rdwr_t	pipe_write;
119 static fo_ioctl_t	pipe_ioctl;
120 static fo_poll_t	pipe_poll;
121 static fo_kqfilter_t	pipe_kqfilter;
122 static fo_stat_t	pipe_stat;
123 static fo_close_t	pipe_close;
124 
125 static struct fileops pipeops = {
126 	.fo_read = pipe_read,
127 	.fo_write = pipe_write,
128 	.fo_ioctl = pipe_ioctl,
129 	.fo_poll = pipe_poll,
130 	.fo_kqfilter = pipe_kqfilter,
131 	.fo_stat = pipe_stat,
132 	.fo_close = pipe_close,
133 	.fo_flags = DFLAG_PASSABLE
134 };
135 
136 static void	filt_pipedetach(struct knote *kn);
137 static int	filt_piperead(struct knote *kn, long hint);
138 static int	filt_pipewrite(struct knote *kn, long hint);
139 
140 static struct filterops pipe_rfiltops =
141 	{ 1, NULL, filt_pipedetach, filt_piperead };
142 static struct filterops pipe_wfiltops =
143 	{ 1, NULL, filt_pipedetach, filt_pipewrite };
144 
145 /*
146  * Default pipe buffer size(s); these can be fairly large now because pipe
147  * space is pageable.  The pipe code will try to maintain locality of
148  * reference for performance reasons, so small amounts of outstanding I/O
149  * will not wipe the cache.
150  */
151 #define MINPIPESIZE (PIPE_SIZE/3)
152 #define MAXPIPESIZE (2*PIPE_SIZE/3)
153 
154 /*
155  * Limit the number of "big" pipes
156  */
157 #define LIMITBIGPIPES	32
158 static int nbigpipe;
159 
160 static int amountpipes;
161 static int amountpipekva;
162 static int amountpipekvawired;
163 
164 SYSCTL_DECL(_kern_ipc);
165 
166 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RD,
167 	   &maxpipekva, 0, "Pipe KVA limit");
168 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekvawired, CTLFLAG_RW,
169 	   &maxpipekvawired, 0, "Pipe KVA wired limit");
170 SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
171 	   &amountpipes, 0, "Current # of pipes");
172 SYSCTL_INT(_kern_ipc, OID_AUTO, bigpipes, CTLFLAG_RD,
173 	   &nbigpipe, 0, "Current # of big pipes");
174 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
175 	   &amountpipekva, 0, "Pipe KVA usage");
176 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekvawired, CTLFLAG_RD,
177 	   &amountpipekvawired, 0, "Pipe wired KVA usage");
178 
179 static void pipeinit(void *dummy __unused);
180 static void pipeclose(struct pipe *cpipe);
181 static void pipe_free_kmem(struct pipe *cpipe);
182 static int pipe_create(struct pipe **cpipep);
183 static __inline int pipelock(struct pipe *cpipe, int catch);
184 static __inline void pipeunlock(struct pipe *cpipe);
185 static __inline void pipeselwakeup(struct pipe *cpipe);
186 #ifndef PIPE_NODIRECT
187 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
188 static void pipe_destroy_write_buffer(struct pipe *wpipe);
189 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
190 static void pipe_clone_write_buffer(struct pipe *wpipe);
191 #endif
192 static int pipespace(struct pipe *cpipe, int size);
193 
194 static uma_zone_t pipe_zone;
195 
196 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
197 
198 static void
199 pipeinit(void *dummy __unused)
200 {
201 
202 	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipe), NULL,
203 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
204 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
205 }
206 
207 /*
208  * The pipe system call for the DTYPE_PIPE type of pipes
209  */
210 
211 /* ARGSUSED */
212 int
213 pipe(td, uap)
214 	struct thread *td;
215 	struct pipe_args /* {
216 		int	dummy;
217 	} */ *uap;
218 {
219 	struct filedesc *fdp = td->td_proc->p_fd;
220 	struct file *rf, *wf;
221 	struct pipe *rpipe, *wpipe;
222 	struct mtx *pmtx;
223 	int fd, error;
224 
225 	pmtx = malloc(sizeof(*pmtx), M_TEMP, M_WAITOK | M_ZERO);
226 
227 	rpipe = wpipe = NULL;
228 	if (pipe_create(&rpipe) || pipe_create(&wpipe)) {
229 		pipeclose(rpipe);
230 		pipeclose(wpipe);
231 		free(pmtx, M_TEMP);
232 		return (ENFILE);
233 	}
234 
235 	rpipe->pipe_state |= PIPE_DIRECTOK;
236 	wpipe->pipe_state |= PIPE_DIRECTOK;
237 
238 	error = falloc(td, &rf, &fd);
239 	if (error) {
240 		pipeclose(rpipe);
241 		pipeclose(wpipe);
242 		free(pmtx, M_TEMP);
243 		return (error);
244 	}
245 	fhold(rf);
246 	td->td_retval[0] = fd;
247 
248 	/*
249 	 * Warning: once we've gotten past allocation of the fd for the
250 	 * read-side, we can only drop the read side via fdrop() in order
251 	 * to avoid races against processes which manage to dup() the read
252 	 * side while we are blocked trying to allocate the write side.
253 	 */
254 	FILE_LOCK(rf);
255 	rf->f_flag = FREAD | FWRITE;
256 	rf->f_type = DTYPE_PIPE;
257 	rf->f_data = rpipe;
258 	rf->f_ops = &pipeops;
259 	FILE_UNLOCK(rf);
260 	error = falloc(td, &wf, &fd);
261 	if (error) {
262 		FILEDESC_LOCK(fdp);
263 		if (fdp->fd_ofiles[td->td_retval[0]] == rf) {
264 			fdp->fd_ofiles[td->td_retval[0]] = NULL;
265 			FILEDESC_UNLOCK(fdp);
266 			fdrop(rf, td);
267 		} else
268 			FILEDESC_UNLOCK(fdp);
269 		fdrop(rf, td);
270 		/* rpipe has been closed by fdrop(). */
271 		pipeclose(wpipe);
272 		free(pmtx, M_TEMP);
273 		return (error);
274 	}
275 	FILE_LOCK(wf);
276 	wf->f_flag = FREAD | FWRITE;
277 	wf->f_type = DTYPE_PIPE;
278 	wf->f_data = wpipe;
279 	wf->f_ops = &pipeops;
280 	FILE_UNLOCK(wf);
281 	td->td_retval[1] = fd;
282 	rpipe->pipe_peer = wpipe;
283 	wpipe->pipe_peer = rpipe;
284 #ifdef MAC
285 	/*
286 	 * struct pipe represents a pipe endpoint.  The MAC label is shared
287 	 * between the connected endpoints.  As a result mac_init_pipe() and
288 	 * mac_create_pipe() should only be called on one of the endpoints
289 	 * after they have been connected.
290 	 */
291 	mac_init_pipe(rpipe);
292 	mac_create_pipe(td->td_ucred, rpipe);
293 #endif
294 	mtx_init(pmtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
295 	rpipe->pipe_mtxp = wpipe->pipe_mtxp = pmtx;
296 	fdrop(rf, td);
297 
298 	return (0);
299 }
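
/*
 * Illustrative userland sketch (not part of this file): the classic use of
 * the pipe(2) system call implemented above -- create a pipe, fork, and pass
 * a message from child to parent.  The message and buffer size are arbitrary
 * choices for the example.
 */
#if 0
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];		/* fds[0] is the read end, fds[1] the write end */
	char buf[64];
	ssize_t n;

	if (pipe(fds) == -1)
		return (1);
	if (fork() == 0) {
		/* Child: write into the pipe and exit. */
		close(fds[0]);
		write(fds[1], "hello", 5);
		close(fds[1]);
		_exit(0);
	}
	/* Parent: read what the child wrote. */
	close(fds[1]);
	n = read(fds[0], buf, sizeof(buf));
	if (n > 0)
		printf("read %zd bytes: %.*s\n", n, (int)n, buf);
	close(fds[0]);
	wait(NULL);
	return (0);
}
#endif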
300 
301 /*
302  * Allocate kva for the pipe circular buffer; the space is pageable.
303  * This routine will 'realloc' the size of a pipe safely: if it
304  * fails, it will retain the old buffer and return ENOMEM, leaving
305  * the pipe usable at its previous size.
306  */
307 static int
308 pipespace(cpipe, size)
309 	struct pipe *cpipe;
310 	int size;
311 {
312 	struct vm_object *object;
313 	caddr_t buffer;
314 	int npages, error;
315 	static int curfail = 0;
316 	static struct timeval lastfail;
317 
318 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
319 	       ("pipespace: pipe mutex locked"));
320 
321 	size = round_page(size);
322 	npages = size / PAGE_SIZE;
323 	/*
324  * Create an object; I don't like the idea of paging to/from
325 	 * kernel_object.
326 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
327 	 */
328 	object = vm_object_allocate(OBJT_DEFAULT, npages);
329 	buffer = (caddr_t) vm_map_min(pipe_map);
330 
331 	/*
332 	 * Insert the object into the kernel map, and allocate kva for it.
333 	 * The map entry is, by default, pageable.
334 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
335 	 */
336 	error = vm_map_find(pipe_map, object, 0,
337 		(vm_offset_t *) &buffer, size, 1,
338 		VM_PROT_ALL, VM_PROT_ALL, 0);
339 
340 	if (error != KERN_SUCCESS) {
341 		vm_object_deallocate(object);
342 		if (ppsratecheck(&lastfail, &curfail, 1))
343 			printf("kern.maxpipekva exceeded, please see tuning(7).\n");
344 		return (ENOMEM);
345 	}
346 
347 	/* free old resources if we're resizing */
348 	pipe_free_kmem(cpipe);
349 	cpipe->pipe_buffer.buffer = buffer;
350 	cpipe->pipe_buffer.size = size;
351 	cpipe->pipe_buffer.in = 0;
352 	cpipe->pipe_buffer.out = 0;
353 	cpipe->pipe_buffer.cnt = 0;
354 	atomic_add_int(&amountpipes, 1);
355 	atomic_add_int(&amountpipekva, cpipe->pipe_buffer.size);
356 	return (0);
357 }
358 
359 /*
360  * initialize and allocate VM and memory for pipe
361  */
362 static int
363 pipe_create(cpipep)
364 	struct pipe **cpipep;
365 {
366 	struct pipe *cpipe;
367 	int error;
368 
369 	*cpipep = uma_zalloc(pipe_zone, M_WAITOK);
370 	if (*cpipep == NULL)
371 		return (ENOMEM);
372 
373 	cpipe = *cpipep;
374 
375 	/*
376 	 * protect so pipeclose() doesn't follow a junk pointer
377 	 * if pipespace() fails.
378 	 */
379 	bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel));
380 	cpipe->pipe_state = 0;
381 	cpipe->pipe_peer = NULL;
382 	cpipe->pipe_busy = 0;
383 
384 #ifndef PIPE_NODIRECT
385 	/*
386 	 * pipe data structure initializations to support direct pipe I/O
387 	 */
388 	cpipe->pipe_map.cnt = 0;
389 	cpipe->pipe_map.kva = 0;
390 	cpipe->pipe_map.pos = 0;
391 	cpipe->pipe_map.npages = 0;
392 	/* cpipe->pipe_map.ms[] = invalid */
393 #endif
394 
395 	cpipe->pipe_mtxp = NULL;	/* avoid pipespace assertion */
396 	/*
397 	 * Reduce to 1/4th pipe size if we're over our global max.
398 	 */
399 	if (amountpipekva > maxpipekva / 2)
400 		error = pipespace(cpipe, SMALL_PIPE_SIZE);
401 	else
402 		error = pipespace(cpipe, PIPE_SIZE);
403 	if (error)
404 		return (error);
405 
406 	vfs_timestamp(&cpipe->pipe_ctime);
407 	cpipe->pipe_atime = cpipe->pipe_ctime;
408 	cpipe->pipe_mtime = cpipe->pipe_ctime;
409 
410 	return (0);
411 }
412 
413 
414 /*
415  * lock a pipe for I/O, blocking other access
416  */
417 static __inline int
418 pipelock(cpipe, catch)
419 	struct pipe *cpipe;
420 	int catch;
421 {
422 	int error;
423 
424 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
425 	while (cpipe->pipe_state & PIPE_LOCKFL) {
426 		cpipe->pipe_state |= PIPE_LWANT;
427 		error = msleep(cpipe, PIPE_MTX(cpipe),
428 		    catch ? (PRIBIO | PCATCH) : PRIBIO,
429 		    "pipelk", 0);
430 		if (error != 0)
431 			return (error);
432 	}
433 	cpipe->pipe_state |= PIPE_LOCKFL;
434 	return (0);
435 }
436 
437 /*
438  * unlock a pipe I/O lock
439  */
440 static __inline void
441 pipeunlock(cpipe)
442 	struct pipe *cpipe;
443 {
444 
445 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
446 	cpipe->pipe_state &= ~PIPE_LOCKFL;
447 	if (cpipe->pipe_state & PIPE_LWANT) {
448 		cpipe->pipe_state &= ~PIPE_LWANT;
449 		wakeup(cpipe);
450 	}
451 }
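
/*
 * Illustrative sketch (not part of this file): pipelock()/pipeunlock() above
 * implement a "flag lock" that is slept on under the pipe mutex, using the
 * PIPE_LOCKFL and PIPE_LWANT bits.  Below is a rough userland analogue of
 * the same pattern using pthreads; all names here are invented for the
 * example, and unlike the kernel version the mutex is taken internally.
 */
#if 0
#include <pthread.h>

struct flaglock {
	pthread_mutex_t	mtx;
	pthread_cond_t	cv;
	int		locked;		/* plays the role of PIPE_LOCKFL */
	int		wanted;		/* plays the role of PIPE_LWANT */
};

static void
flaglock_acquire(struct flaglock *fl)
{
	pthread_mutex_lock(&fl->mtx);
	while (fl->locked) {
		fl->wanted = 1;
		pthread_cond_wait(&fl->cv, &fl->mtx);	/* like msleep() */
	}
	fl->locked = 1;
	pthread_mutex_unlock(&fl->mtx);
}

static void
flaglock_release(struct flaglock *fl)
{
	pthread_mutex_lock(&fl->mtx);
	fl->locked = 0;
	if (fl->wanted) {
		fl->wanted = 0;
		pthread_cond_broadcast(&fl->cv);	/* like wakeup() */
	}
	pthread_mutex_unlock(&fl->mtx);
}
#endif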
452 
453 static __inline void
454 pipeselwakeup(cpipe)
455 	struct pipe *cpipe;
456 {
457 
458 	if (cpipe->pipe_state & PIPE_SEL) {
459 		cpipe->pipe_state &= ~PIPE_SEL;
460 		selwakeup(&cpipe->pipe_sel);
461 	}
462 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
463 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
464 	KNOTE(&cpipe->pipe_sel.si_note, 0);
465 }
466 
467 /* ARGSUSED */
468 static int
469 pipe_read(fp, uio, active_cred, flags, td)
470 	struct file *fp;
471 	struct uio *uio;
472 	struct ucred *active_cred;
473 	struct thread *td;
474 	int flags;
475 {
476 	struct pipe *rpipe = fp->f_data;
477 	int error;
478 	int nread = 0;
479 	u_int size;
480 
481 	PIPE_LOCK(rpipe);
482 	++rpipe->pipe_busy;
483 	error = pipelock(rpipe, 1);
484 	if (error)
485 		goto unlocked_error;
486 
487 #ifdef MAC
488 	error = mac_check_pipe_read(active_cred, rpipe);
489 	if (error)
490 		goto locked_error;
491 #endif
492 
493 	while (uio->uio_resid) {
494 		/*
495 		 * normal pipe buffer receive
496 		 */
497 		if (rpipe->pipe_buffer.cnt > 0) {
498 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
499 			if (size > rpipe->pipe_buffer.cnt)
500 				size = rpipe->pipe_buffer.cnt;
501 			if (size > (u_int) uio->uio_resid)
502 				size = (u_int) uio->uio_resid;
503 
504 			PIPE_UNLOCK(rpipe);
505 			error = uiomove(
506 			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
507 			    size, uio);
508 			PIPE_LOCK(rpipe);
509 			if (error)
510 				break;
511 
512 			rpipe->pipe_buffer.out += size;
513 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
514 				rpipe->pipe_buffer.out = 0;
515 
516 			rpipe->pipe_buffer.cnt -= size;
517 
518 			/*
519 			 * If there is no more to read in the pipe, reset
520 			 * its pointers to the beginning.  This improves
521 			 * cache hit stats.
522 			 */
523 			if (rpipe->pipe_buffer.cnt == 0) {
524 				rpipe->pipe_buffer.in = 0;
525 				rpipe->pipe_buffer.out = 0;
526 			}
527 			nread += size;
528 #ifndef PIPE_NODIRECT
529 		/*
530 		 * Direct copy, bypassing a kernel buffer.
531 		 */
532 		} else if ((size = rpipe->pipe_map.cnt) &&
533 			   (rpipe->pipe_state & PIPE_DIRECTW)) {
534 			caddr_t	va;
535 			if (size > (u_int) uio->uio_resid)
536 				size = (u_int) uio->uio_resid;
537 
538 			va = (caddr_t) rpipe->pipe_map.kva +
539 			    rpipe->pipe_map.pos;
540 			PIPE_UNLOCK(rpipe);
541 			error = uiomove(va, size, uio);
542 			PIPE_LOCK(rpipe);
543 			if (error)
544 				break;
545 			nread += size;
546 			rpipe->pipe_map.pos += size;
547 			rpipe->pipe_map.cnt -= size;
548 			if (rpipe->pipe_map.cnt == 0) {
549 				rpipe->pipe_state &= ~PIPE_DIRECTW;
550 				wakeup(rpipe);
551 			}
552 #endif
553 		} else {
554 			/*
555 			 * detect EOF condition
556 			 * read returns 0 on EOF, no need to set error
557 			 */
558 			if (rpipe->pipe_state & PIPE_EOF)
559 				break;
560 
561 			/*
562 			 * If the "write-side" has been blocked, wake it up now.
563 			 */
564 			if (rpipe->pipe_state & PIPE_WANTW) {
565 				rpipe->pipe_state &= ~PIPE_WANTW;
566 				wakeup(rpipe);
567 			}
568 
569 			/*
570 			 * Break if some data was read.
571 			 */
572 			if (nread > 0)
573 				break;
574 
575 			/*
576 			 * Unlock the pipe buffer for our remaining processing.
577 			 * We will either break out with an error or we will
578 			 * sleep and relock to loop.
579 			 */
580 			pipeunlock(rpipe);
581 
582 			/*
583 			 * Handle non-blocking mode operation or
584 			 * wait for more data.
585 			 */
586 			if (fp->f_flag & FNONBLOCK) {
587 				error = EAGAIN;
588 			} else {
589 				rpipe->pipe_state |= PIPE_WANTR;
590 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
591 				    PRIBIO | PCATCH,
592 				    "piperd", 0)) == 0)
593 					error = pipelock(rpipe, 1);
594 			}
595 			if (error)
596 				goto unlocked_error;
597 		}
598 	}
599 #ifdef MAC
600 locked_error:
601 #endif
602 	pipeunlock(rpipe);
603 
604 	/* XXX: should probably do this before getting any locks. */
605 	if (error == 0)
606 		vfs_timestamp(&rpipe->pipe_atime);
607 unlocked_error:
608 	--rpipe->pipe_busy;
609 
610 	/*
611 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
612 	 */
613 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
614 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
615 		wakeup(rpipe);
616 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
617 		/*
618 		 * Handle write blocking hysteresis.
619 		 */
620 		if (rpipe->pipe_state & PIPE_WANTW) {
621 			rpipe->pipe_state &= ~PIPE_WANTW;
622 			wakeup(rpipe);
623 		}
624 	}
625 
626 	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
627 		pipeselwakeup(rpipe);
628 
629 	PIPE_UNLOCK(rpipe);
630 	return (error);
631 }
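
/*
 * Illustrative userland sketch (not part of this file): the FNONBLOCK path
 * in pipe_read() above is what a non-blocking reader observes as EAGAIN.
 * The fcntl()/read() calls are standard; the helper name and buffer size are
 * assumptions made for the example.
 */
#if 0
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
drain_nonblocking(int rfd)
{
	char buf[4096];
	ssize_t n;

	/* Mark the read end non-blocking. */
	fcntl(rfd, F_SETFL, fcntl(rfd, F_GETFL) | O_NONBLOCK);
	for (;;) {
		n = read(rfd, buf, sizeof(buf));
		if (n > 0)
			continue;		/* consumed some data */
		if (n == 0)
			break;			/* EOF: writer closed */
		if (errno == EAGAIN)
			break;			/* pipe empty right now */
		perror("read");
		break;
	}
}
#endif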
632 
633 #ifndef PIPE_NODIRECT
634 /*
635  * Map the sending process's buffer into kernel space and wire it.
636  * This is similar to a physical write operation.
637  */
638 static int
639 pipe_build_write_buffer(wpipe, uio)
640 	struct pipe *wpipe;
641 	struct uio *uio;
642 {
643 	pmap_t pmap;
644 	u_int size;
645 	int i, j;
646 	vm_offset_t addr, endaddr;
647 
648 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
649 
650 	size = (u_int) uio->uio_iov->iov_len;
651 	if (size > wpipe->pipe_buffer.size)
652 		size = wpipe->pipe_buffer.size;
653 
654 	pmap = vmspace_pmap(curproc->p_vmspace);
655 	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
656 	addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
657 	for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) {
658 		/*
659 		 * vm_fault_quick() can sleep.  Consequently,
660 		 * vm_page_lock_queue() and vm_page_unlock_queue()
661 		 * should not be performed outside of this loop.
662 		 */
663 	race:
664 		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0) {
665 			vm_page_lock_queues();
666 			for (j = 0; j < i; j++)
667 				vm_page_unhold(wpipe->pipe_map.ms[j]);
668 			vm_page_unlock_queues();
669 			return (EFAULT);
670 		}
671 		wpipe->pipe_map.ms[i] = pmap_extract_and_hold(pmap, addr,
672 		    VM_PROT_READ);
673 		if (wpipe->pipe_map.ms[i] == NULL)
674 			goto race;
675 	}
676 
677 /*
678  * set up the control block
679  */
680 	wpipe->pipe_map.npages = i;
681 	wpipe->pipe_map.pos =
682 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
683 	wpipe->pipe_map.cnt = size;
684 
685 /*
686  * and map the buffer
687  */
688 	if (wpipe->pipe_map.kva == 0) {
689 		/*
690 		 * We need to allocate space for an extra page because the
691 		 * address range might (will) span pages at times.
692 		 */
693 		wpipe->pipe_map.kva = kmem_alloc_nofault(kernel_map,
694 			wpipe->pipe_buffer.size + PAGE_SIZE);
695 		atomic_add_int(&amountpipekvawired,
696 		    wpipe->pipe_buffer.size + PAGE_SIZE);
697 	}
698 	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
699 		wpipe->pipe_map.npages);
700 
701 /*
702  * and update the uio data
703  */
704 
705 	uio->uio_iov->iov_len -= size;
706 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
707 	if (uio->uio_iov->iov_len == 0)
708 		uio->uio_iov++;
709 	uio->uio_resid -= size;
710 	uio->uio_offset += size;
711 	return (0);
712 }
713 
714 /*
715  * unmap and unwire the process buffer
716  */
717 static void
718 pipe_destroy_write_buffer(wpipe)
719 	struct pipe *wpipe;
720 {
721 	int i;
722 
723 	PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
724 	if (wpipe->pipe_map.kva) {
725 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
726 
727 		if (amountpipekvawired > maxpipekvawired / 2) {
728 			/* Conserve address space */
729 			vm_offset_t kva = wpipe->pipe_map.kva;
730 			wpipe->pipe_map.kva = 0;
731 			kmem_free(kernel_map, kva,
732 			    wpipe->pipe_buffer.size + PAGE_SIZE);
733 			atomic_subtract_int(&amountpipekvawired,
734 			    wpipe->pipe_buffer.size + PAGE_SIZE);
735 		}
736 	}
737 	vm_page_lock_queues();
738 	for (i = 0; i < wpipe->pipe_map.npages; i++) {
739 		vm_page_unhold(wpipe->pipe_map.ms[i]);
740 	}
741 	vm_page_unlock_queues();
742 	wpipe->pipe_map.npages = 0;
743 }
744 
745 /*
746  * In the case of a signal, the writing process might go away.  This
747  * code copies the data into the circular buffer so that the source
748  * pages can be freed without loss of data.
749  */
750 static void
751 pipe_clone_write_buffer(wpipe)
752 	struct pipe *wpipe;
753 {
754 	int size;
755 	int pos;
756 
757 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
758 	size = wpipe->pipe_map.cnt;
759 	pos = wpipe->pipe_map.pos;
760 
761 	wpipe->pipe_buffer.in = size;
762 	wpipe->pipe_buffer.out = 0;
763 	wpipe->pipe_buffer.cnt = size;
764 	wpipe->pipe_state &= ~PIPE_DIRECTW;
765 
766 	PIPE_UNLOCK(wpipe);
767 	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
768 	    wpipe->pipe_buffer.buffer, size);
769 	pipe_destroy_write_buffer(wpipe);
770 	PIPE_LOCK(wpipe);
771 }
772 
773 /*
774  * This implements the pipe buffer write mechanism.  Note that only
775  * a direct write OR a normal pipe write can be pending at any given time.
776  * If there are any characters in the pipe buffer, the direct write will
777  * be deferred until the receiving process grabs all of the bytes from
778  * the pipe buffer.  Then the direct mapping write is set up.
779  */
780 static int
781 pipe_direct_write(wpipe, uio)
782 	struct pipe *wpipe;
783 	struct uio *uio;
784 {
785 	int error;
786 
787 retry:
788 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
789 	while (wpipe->pipe_state & PIPE_DIRECTW) {
790 		if (wpipe->pipe_state & PIPE_WANTR) {
791 			wpipe->pipe_state &= ~PIPE_WANTR;
792 			wakeup(wpipe);
793 		}
794 		wpipe->pipe_state |= PIPE_WANTW;
795 		error = msleep(wpipe, PIPE_MTX(wpipe),
796 		    PRIBIO | PCATCH, "pipdww", 0);
797 		if (error)
798 			goto error1;
799 		if (wpipe->pipe_state & PIPE_EOF) {
800 			error = EPIPE;
801 			goto error1;
802 		}
803 	}
804 	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
805 	if (wpipe->pipe_buffer.cnt > 0) {
806 		if (wpipe->pipe_state & PIPE_WANTR) {
807 			wpipe->pipe_state &= ~PIPE_WANTR;
808 			wakeup(wpipe);
809 		}
810 
811 		wpipe->pipe_state |= PIPE_WANTW;
812 		error = msleep(wpipe, PIPE_MTX(wpipe),
813 		    PRIBIO | PCATCH, "pipdwc", 0);
814 		if (error)
815 			goto error1;
816 		if (wpipe->pipe_state & PIPE_EOF) {
817 			error = EPIPE;
818 			goto error1;
819 		}
820 		goto retry;
821 	}
822 
823 	wpipe->pipe_state |= PIPE_DIRECTW;
824 
825 	pipelock(wpipe, 0);
826 	PIPE_UNLOCK(wpipe);
827 	error = pipe_build_write_buffer(wpipe, uio);
828 	PIPE_LOCK(wpipe);
829 	pipeunlock(wpipe);
830 	if (error) {
831 		wpipe->pipe_state &= ~PIPE_DIRECTW;
832 		goto error1;
833 	}
834 
835 	error = 0;
836 	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
837 		if (wpipe->pipe_state & PIPE_EOF) {
838 			pipelock(wpipe, 0);
839 			PIPE_UNLOCK(wpipe);
840 			pipe_destroy_write_buffer(wpipe);
841 			PIPE_LOCK(wpipe);
842 			pipeselwakeup(wpipe);
843 			pipeunlock(wpipe);
844 			error = EPIPE;
845 			goto error1;
846 		}
847 		if (wpipe->pipe_state & PIPE_WANTR) {
848 			wpipe->pipe_state &= ~PIPE_WANTR;
849 			wakeup(wpipe);
850 		}
851 		pipeselwakeup(wpipe);
852 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
853 		    "pipdwt", 0);
854 	}
855 
856 	pipelock(wpipe,0);
857 	if (wpipe->pipe_state & PIPE_DIRECTW) {
858 		/*
859 		 * this bit of trickery substitutes a kernel buffer for the
860 		 * user buffer of a process that might be going away.
861 		 */
862 		pipe_clone_write_buffer(wpipe);
863 	} else {
864 		PIPE_UNLOCK(wpipe);
865 		pipe_destroy_write_buffer(wpipe);
866 		PIPE_LOCK(wpipe);
867 	}
868 	pipeunlock(wpipe);
869 	return (error);
870 
871 error1:
872 	wakeup(wpipe);
873 	return (error);
874 }
875 #endif
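
/*
 * Illustrative userland sketch (not part of this file): a large, blocking
 * write becomes eligible for the direct (page-flipping) path implemented
 * above once the iovec is at least PIPE_MINDIRECT bytes (see <sys/pipe.h>).
 * From userland the only visible difference is throughput; the 64 KB size
 * and helper name below are arbitrary choices for the example.
 */
#if 0
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void
bulk_write(int wfd)
{
	size_t chunk = 64 * 1024;	/* comfortably above PIPE_MINDIRECT */
	char *buf = malloc(chunk);

	if (buf == NULL)
		return;
	memset(buf, 'x', chunk);
	/* A blocking write this large may be serviced by pipe_direct_write(). */
	(void)write(wfd, buf, chunk);
	free(buf);
}
#endif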
876 
877 static int
878 pipe_write(fp, uio, active_cred, flags, td)
879 	struct file *fp;
880 	struct uio *uio;
881 	struct ucred *active_cred;
882 	struct thread *td;
883 	int flags;
884 {
885 	int error = 0;
886 	int orig_resid;
887 	struct pipe *wpipe, *rpipe;
888 
889 	rpipe = fp->f_data;
890 	wpipe = rpipe->pipe_peer;
891 
892 	PIPE_LOCK(rpipe);
893 	/*
894 	 * detect loss of pipe read side, issue SIGPIPE if lost.
895 	 */
896 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
897 		PIPE_UNLOCK(rpipe);
898 		return (EPIPE);
899 	}
900 #ifdef MAC
901 	error = mac_check_pipe_write(active_cred, wpipe);
902 	if (error) {
903 		PIPE_UNLOCK(rpipe);
904 		return (error);
905 	}
906 #endif
907 	++wpipe->pipe_busy;
908 
909 	/*
910 	 * If it is advantageous to resize the pipe buffer, do
911 	 * so.
912 	 */
913 	if ((uio->uio_resid > PIPE_SIZE) &&
914 		(amountpipekva < maxpipekva / 2) &&
915 		(nbigpipe < LIMITBIGPIPES) &&
916 		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
917 		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
918 		(wpipe->pipe_buffer.cnt == 0)) {
919 
920 		if ((error = pipelock(wpipe, 1)) == 0) {
921 			PIPE_UNLOCK(wpipe);
922 			if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
923 				atomic_add_int(&nbigpipe, 1);
924 			PIPE_LOCK(wpipe);
925 			pipeunlock(wpipe);
926 		}
927 	}
928 
929 	/*
930 	 * If an early error occurred, unbusy and return, waking up any pending
931 	 * readers.
932 	 */
933 	if (error) {
934 		--wpipe->pipe_busy;
935 		if ((wpipe->pipe_busy == 0) &&
936 		    (wpipe->pipe_state & PIPE_WANT)) {
937 			wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
938 			wakeup(wpipe);
939 		}
940 		PIPE_UNLOCK(rpipe);
941 		return(error);
942 	}
943 
944 	orig_resid = uio->uio_resid;
945 
946 	while (uio->uio_resid) {
947 		int space;
948 
949 #ifndef PIPE_NODIRECT
950 		/*
951 		 * If the transfer is large, we can gain performance if
952 		 * we do process-to-process copies directly.
953 		 * If the write is non-blocking, we don't use the
954 		 * direct write mechanism.
955 		 *
956 		 * The direct write mechanism will detect the reader going
957 		 * away on us.
958 		 */
959 		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
960 		    (fp->f_flag & FNONBLOCK) == 0 &&
961 		    amountpipekvawired + uio->uio_resid < maxpipekvawired) {
962 			error = pipe_direct_write(wpipe, uio);
963 			if (error)
964 				break;
965 			continue;
966 		}
967 #endif
968 
969 		/*
970 		 * Pipe buffered writes cannot be coincident with
971 		 * direct writes.  We wait until the currently executing
972 		 * direct write is completed before we start filling the
973 		 * pipe buffer.  We break out if a signal occurs or the
974 		 * reader goes away.
975 		 */
976 	retrywrite:
977 		while (wpipe->pipe_state & PIPE_DIRECTW) {
978 			if (wpipe->pipe_state & PIPE_WANTR) {
979 				wpipe->pipe_state &= ~PIPE_WANTR;
980 				wakeup(wpipe);
981 			}
982 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
983 			    "pipbww", 0);
984 			if (wpipe->pipe_state & PIPE_EOF)
985 				break;
986 			if (error)
987 				break;
988 		}
989 		if (wpipe->pipe_state & PIPE_EOF) {
990 			error = EPIPE;
991 			break;
992 		}
993 
994 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
995 
996 		/* Writes of size <= PIPE_BUF must be atomic. */
997 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
998 			space = 0;
999 
1000 		if (space > 0) {
1001 			if ((error = pipelock(wpipe,1)) == 0) {
1002 				int size;	/* Transfer size */
1003 				int segsize;	/* first segment to transfer */
1004 
1005 				/*
1006 				 * It is possible for a direct write to
1007 				 * slip in on us... handle it here...
1008 				 */
1009 				if (wpipe->pipe_state & PIPE_DIRECTW) {
1010 					pipeunlock(wpipe);
1011 					goto retrywrite;
1012 				}
1013 				/*
1014 				 * If a process blocked in uiomove, our
1015 				 * value for space might be bad.
1016 				 *
1017 				 * XXX will we be ok if the reader has gone
1018 				 * away here?
1019 				 */
1020 				if (space > wpipe->pipe_buffer.size -
1021 				    wpipe->pipe_buffer.cnt) {
1022 					pipeunlock(wpipe);
1023 					goto retrywrite;
1024 				}
1025 
1026 				/*
1027 				 * Transfer size is minimum of uio transfer
1028 				 * and free space in pipe buffer.
1029 				 */
1030 				if (space > uio->uio_resid)
1031 					size = uio->uio_resid;
1032 				else
1033 					size = space;
1034 				/*
1035 				 * First segment to transfer is minimum of
1036 				 * transfer size and contiguous space in
1037 				 * pipe buffer.  If first segment to transfer
1038 				 * is less than the transfer size, we've got
1039 				 * a wraparound in the buffer.
1040 				 */
1041 				segsize = wpipe->pipe_buffer.size -
1042 					wpipe->pipe_buffer.in;
1043 				if (segsize > size)
1044 					segsize = size;
1045 
1046 				/* Transfer first segment */
1047 
1048 				PIPE_UNLOCK(rpipe);
1049 				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
1050 						segsize, uio);
1051 				PIPE_LOCK(rpipe);
1052 
1053 				if (error == 0 && segsize < size) {
1054 					/*
1055 					 * Transfer remaining part now, to
1056 					 * support atomic writes.  Wraparound
1057 					 * happened.
1058 					 */
1059 					if (wpipe->pipe_buffer.in + segsize !=
1060 					    wpipe->pipe_buffer.size)
1061 						panic("Expected pipe buffer "
1062 						    "wraparound disappeared");
1063 
1064 					PIPE_UNLOCK(rpipe);
1065 					error = uiomove(
1066 					    &wpipe->pipe_buffer.buffer[0],
1067 				    	    size - segsize, uio);
1068 					PIPE_LOCK(rpipe);
1069 				}
1070 				if (error == 0) {
1071 					wpipe->pipe_buffer.in += size;
1072 					if (wpipe->pipe_buffer.in >=
1073 					    wpipe->pipe_buffer.size) {
1074 						if (wpipe->pipe_buffer.in !=
1075 						    size - segsize +
1076 						    wpipe->pipe_buffer.size)
1077 							panic("Expected "
1078 							    "wraparound bad");
1079 						wpipe->pipe_buffer.in = size -
1080 						    segsize;
1081 					}
1082 
1083 					wpipe->pipe_buffer.cnt += size;
1084 					if (wpipe->pipe_buffer.cnt >
1085 					    wpipe->pipe_buffer.size)
1086 						panic("Pipe buffer overflow");
1087 
1088 				}
1089 				pipeunlock(wpipe);
1090 			}
1091 			if (error)
1092 				break;
1093 
1094 		} else {
1095 			/*
1096 			 * If the "read-side" has been blocked, wake it up now.
1097 			 */
1098 			if (wpipe->pipe_state & PIPE_WANTR) {
1099 				wpipe->pipe_state &= ~PIPE_WANTR;
1100 				wakeup(wpipe);
1101 			}
1102 
1103 			/*
1104 			 * don't block on non-blocking I/O
1105 			 */
1106 			if (fp->f_flag & FNONBLOCK) {
1107 				error = EAGAIN;
1108 				break;
1109 			}
1110 
1111 			/*
1112 			 * We have no more space and have something to offer,
1113 			 * wake up select/poll.
1114 			 */
1115 			pipeselwakeup(wpipe);
1116 
1117 			wpipe->pipe_state |= PIPE_WANTW;
1118 			error = msleep(wpipe, PIPE_MTX(rpipe),
1119 			    PRIBIO | PCATCH, "pipewr", 0);
1120 			if (error != 0)
1121 				break;
1122 			/*
1123 			 * If read side wants to go away, we just issue a signal
1124 			 * to ourselves.
1125 			 */
1126 			if (wpipe->pipe_state & PIPE_EOF) {
1127 				error = EPIPE;
1128 				break;
1129 			}
1130 		}
1131 	}
1132 
1133 	--wpipe->pipe_busy;
1134 
1135 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
1136 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
1137 		wakeup(wpipe);
1138 	} else if (wpipe->pipe_buffer.cnt > 0) {
1139 		/*
1140 		 * If we have put any characters in the buffer, we wake up
1141 		 * the reader.
1142 		 */
1143 		if (wpipe->pipe_state & PIPE_WANTR) {
1144 			wpipe->pipe_state &= ~PIPE_WANTR;
1145 			wakeup(wpipe);
1146 		}
1147 	}
1148 
1149 	/*
1150 	 * Don't return EPIPE if I/O was successful
1151 	 */
1152 	if ((wpipe->pipe_buffer.cnt == 0) &&
1153 	    (uio->uio_resid == 0) &&
1154 	    (error == EPIPE)) {
1155 		error = 0;
1156 	}
1157 
1158 	if (error == 0)
1159 		vfs_timestamp(&wpipe->pipe_mtime);
1160 
1161 	/*
1162 	 * We have something to offer,
1163 	 * wake up select/poll.
1164 	 */
1165 	if (wpipe->pipe_buffer.cnt)
1166 		pipeselwakeup(wpipe);
1167 
1168 	PIPE_UNLOCK(rpipe);
1169 	return (error);
1170 }
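
/*
 * Illustrative userland sketch (not part of this file): pipe_write() above
 * keeps writes of at most PIPE_BUF bytes atomic, which is what lets several
 * writers share one pipe without interleaving records.  The record format
 * and helper name below are invented for the example.
 */
#if 0
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

static void
log_record(int wfd, int id, int seq)
{
	char rec[128];
	int len;

	len = snprintf(rec, sizeof(rec), "writer %d: record %d\n", id, seq);
	/*
	 * As long as len <= PIPE_BUF, this write is all-or-nothing and will
	 * not be interleaved with records from other writers.
	 */
	if (len > 0 && len <= PIPE_BUF)
		(void)write(wfd, rec, (size_t)len);
}
#endif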
1171 
1172 /*
1173  * we implement a very minimal set of ioctls for compatibility with sockets.
1174  */
1175 static int
1176 pipe_ioctl(fp, cmd, data, active_cred, td)
1177 	struct file *fp;
1178 	u_long cmd;
1179 	void *data;
1180 	struct ucred *active_cred;
1181 	struct thread *td;
1182 {
1183 	struct pipe *mpipe = fp->f_data;
1184 #ifdef MAC
1185 	int error;
1186 #endif
1187 
1188 	PIPE_LOCK(mpipe);
1189 
1190 #ifdef MAC
1191 	error = mac_check_pipe_ioctl(active_cred, mpipe, cmd, data);
1192 	if (error) {
1193 		PIPE_UNLOCK(mpipe);
1194 		return (error);
1195 	}
1196 #endif
1195 
1196 	switch (cmd) {
1197 
1198 	case FIONBIO:
1199 		PIPE_UNLOCK(mpipe);
1200 		return (0);
1201 
1202 	case FIOASYNC:
1203 		if (*(int *)data) {
1204 			mpipe->pipe_state |= PIPE_ASYNC;
1205 		} else {
1206 			mpipe->pipe_state &= ~PIPE_ASYNC;
1207 		}
1208 		PIPE_UNLOCK(mpipe);
1209 		return (0);
1210 
1211 	case FIONREAD:
1212 		if (mpipe->pipe_state & PIPE_DIRECTW)
1213 			*(int *)data = mpipe->pipe_map.cnt;
1214 		else
1215 			*(int *)data = mpipe->pipe_buffer.cnt;
1216 		PIPE_UNLOCK(mpipe);
1217 		return (0);
1218 
1219 	case FIOSETOWN:
1220 		PIPE_UNLOCK(mpipe);
1221 		return (fsetown(*(int *)data, &mpipe->pipe_sigio));
1222 
1223 	case FIOGETOWN:
1224 		PIPE_UNLOCK(mpipe);
1225 		*(int *)data = fgetown(&mpipe->pipe_sigio);
1226 		return (0);
1227 
1228 	/* This is deprecated, FIOSETOWN should be used instead. */
1229 	case TIOCSPGRP:
1230 		PIPE_UNLOCK(mpipe);
1231 		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));
1232 
1233 	/* This is deprecated, FIOGETOWN should be used instead. */
1234 	case TIOCGPGRP:
1235 		PIPE_UNLOCK(mpipe);
1236 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
1237 		return (0);
1238 
1239 	}
1240 	PIPE_UNLOCK(mpipe);
1241 	return (ENOTTY);
1242 }
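
/*
 * Illustrative userland sketch (not part of this file): FIONREAD, handled in
 * pipe_ioctl() above, reports how many bytes are currently buffered in the
 * pipe.  Everything besides the ioctl itself is an assumption made for the
 * example.
 */
#if 0
#include <sys/ioctl.h>
#include <stdio.h>

static void
report_pending(int rfd)
{
	int pending;

	if (ioctl(rfd, FIONREAD, &pending) == 0)
		printf("%d bytes waiting in the pipe\n", pending);
}
#endif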
1243 
1244 static int
1245 pipe_poll(fp, events, active_cred, td)
1246 	struct file *fp;
1247 	int events;
1248 	struct ucred *active_cred;
1249 	struct thread *td;
1250 {
1251 	struct pipe *rpipe = fp->f_data;
1252 	struct pipe *wpipe;
1253 	int revents = 0;
1254 #ifdef MAC
1255 	int error;
1256 #endif
1257 
1258 	wpipe = rpipe->pipe_peer;
1259 	PIPE_LOCK(rpipe);
1260 #ifdef MAC
1261 	error = mac_check_pipe_poll(active_cred, rpipe);
1262 	if (error)
1263 		goto locked_error;
1264 #endif
1265 	if (events & (POLLIN | POLLRDNORM))
1266 		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
1267 		    (rpipe->pipe_buffer.cnt > 0) ||
1268 		    (rpipe->pipe_state & PIPE_EOF))
1269 			revents |= events & (POLLIN | POLLRDNORM);
1270 
1271 	if (events & (POLLOUT | POLLWRNORM))
1272 		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
1273 		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
1274 		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
1275 			revents |= events & (POLLOUT | POLLWRNORM);
1276 
1277 	if ((rpipe->pipe_state & PIPE_EOF) ||
1278 	    (wpipe == NULL) ||
1279 	    (wpipe->pipe_state & PIPE_EOF))
1280 		revents |= POLLHUP;
1281 
1282 	if (revents == 0) {
1283 		if (events & (POLLIN | POLLRDNORM)) {
1284 			selrecord(td, &rpipe->pipe_sel);
1285 			rpipe->pipe_state |= PIPE_SEL;
1286 		}
1287 
1288 		if (events & (POLLOUT | POLLWRNORM)) {
1289 			selrecord(td, &wpipe->pipe_sel);
1290 			wpipe->pipe_state |= PIPE_SEL;
1291 		}
1292 	}
1293 #ifdef MAC
1294 locked_error:
1295 #endif
1296 	PIPE_UNLOCK(rpipe);
1297 
1298 	return (revents);
1299 }
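
/*
 * Illustrative userland sketch (not part of this file): waiting for a pipe
 * to become readable with poll(2), which lands in pipe_poll() above.  The
 * timeout value and helper name are arbitrary choices for the example.
 */
#if 0
#include <poll.h>
#include <stdio.h>

static int
wait_readable(int rfd)
{
	struct pollfd pfd;

	pfd.fd = rfd;
	pfd.events = POLLIN;
	pfd.revents = 0;
	switch (poll(&pfd, 1, 5000)) {		/* wait up to five seconds */
	case -1:
		return (-1);			/* error */
	case 0:
		return (0);			/* timed out, nothing to read */
	default:
		if (pfd.revents & POLLHUP)
			printf("writer closed its end\n");
		return ((pfd.revents & POLLIN) ? 1 : 0);
	}
}
#endif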
1300 
1301 /*
1302  * We shouldn't need locks here as we're doing a read and this should
1303  * be a natural race.
1304  */
1305 static int
1306 pipe_stat(fp, ub, active_cred, td)
1307 	struct file *fp;
1308 	struct stat *ub;
1309 	struct ucred *active_cred;
1310 	struct thread *td;
1311 {
1312 	struct pipe *pipe = fp->f_data;
1313 #ifdef MAC
1314 	int error;
1315 
1316 	PIPE_LOCK(pipe);
1317 	error = mac_check_pipe_stat(active_cred, pipe);
1318 	PIPE_UNLOCK(pipe);
1319 	if (error)
1320 		return (error);
1321 #endif
1322 	bzero(ub, sizeof(*ub));
1323 	ub->st_mode = S_IFIFO;
1324 	ub->st_blksize = pipe->pipe_buffer.size;
1325 	ub->st_size = pipe->pipe_buffer.cnt;
1326 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1327 	ub->st_atimespec = pipe->pipe_atime;
1328 	ub->st_mtimespec = pipe->pipe_mtime;
1329 	ub->st_ctimespec = pipe->pipe_ctime;
1330 	ub->st_uid = fp->f_cred->cr_uid;
1331 	ub->st_gid = fp->f_cred->cr_gid;
1332 	/*
1333 	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
1334 	 * XXX (st_dev, st_ino) should be unique.
1335 	 */
1336 	return (0);
1337 }
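
/*
 * Illustrative userland sketch (not part of this file): fstat(2) on a pipe
 * descriptor is answered by pipe_stat() above, so st_size reports the bytes
 * currently buffered and st_blksize the pipe buffer size.  The helper name
 * is an assumption made for the example.
 */
#if 0
#include <sys/stat.h>
#include <stdint.h>
#include <stdio.h>

static void
show_pipe_stat(int fd)
{
	struct stat sb;

	if (fstat(fd, &sb) == 0 && S_ISFIFO(sb.st_mode))
		printf("buffered: %jd bytes, buffer size: %jd\n",
		    (intmax_t)sb.st_size, (intmax_t)sb.st_blksize);
}
#endif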
1338 
1339 /* ARGSUSED */
1340 static int
1341 pipe_close(fp, td)
1342 	struct file *fp;
1343 	struct thread *td;
1344 {
1345 	struct pipe *cpipe = fp->f_data;
1346 
1347 	fp->f_ops = &badfileops;
1348 	fp->f_data = NULL;
1349 	funsetown(&cpipe->pipe_sigio);
1350 	pipeclose(cpipe);
1351 	return (0);
1352 }
1353 
1354 static void
1355 pipe_free_kmem(cpipe)
1356 	struct pipe *cpipe;
1357 {
1358 
1359 	KASSERT(cpipe->pipe_mtxp == NULL || !mtx_owned(PIPE_MTX(cpipe)),
1360 	       ("pipespace: pipe mutex locked"));
1361 
1362 	if (cpipe->pipe_buffer.buffer != NULL) {
1363 		if (cpipe->pipe_buffer.size > PIPE_SIZE)
1364 			atomic_subtract_int(&nbigpipe, 1);
1365 		atomic_subtract_int(&amountpipekva, cpipe->pipe_buffer.size);
1366 		atomic_subtract_int(&amountpipes, 1);
1367 		vm_map_remove(pipe_map,
1368 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
1369 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
1370 		cpipe->pipe_buffer.buffer = NULL;
1371 	}
1372 #ifndef PIPE_NODIRECT
1373 	if (cpipe->pipe_map.kva != 0) {
1374 		atomic_subtract_int(&amountpipekvawired,
1375 		    cpipe->pipe_buffer.size + PAGE_SIZE);
1376 		kmem_free(kernel_map,
1377 			cpipe->pipe_map.kva,
1378 			cpipe->pipe_buffer.size + PAGE_SIZE);
1379 		cpipe->pipe_map.cnt = 0;
1380 		cpipe->pipe_map.kva = 0;
1381 		cpipe->pipe_map.pos = 0;
1382 		cpipe->pipe_map.npages = 0;
1383 	}
1384 #endif
1385 }
1386 
1387 /*
1388  * shutdown the pipe
1389  */
1390 static void
1391 pipeclose(cpipe)
1392 	struct pipe *cpipe;
1393 {
1394 	struct pipe *ppipe;
1395 	int hadpeer;
1396 
1397 	if (cpipe == NULL)
1398 		return;
1399 
1400 	hadpeer = 0;
1401 
1402 	/* partially created pipes won't have a valid mutex. */
1403 	if (PIPE_MTX(cpipe) != NULL)
1404 		PIPE_LOCK(cpipe);
1405 
1406 	pipeselwakeup(cpipe);
1407 
1408 	/*
1409 	 * If the other side is blocked, wake it up saying that
1410 	 * we want to close it down.
1411 	 */
1412 	while (cpipe->pipe_busy) {
1413 		wakeup(cpipe);
1414 		cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
1415 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
1416 	}
1417 
1418 #ifdef MAC
1419 	if (cpipe->pipe_label != NULL && cpipe->pipe_peer == NULL)
1420 		mac_destroy_pipe(cpipe);
1421 #endif
1422 
1423 	/*
1424 	 * Disconnect from peer
1425 	 */
1426 	if ((ppipe = cpipe->pipe_peer) != NULL) {
1427 		hadpeer++;
1428 		pipeselwakeup(ppipe);
1429 
1430 		ppipe->pipe_state |= PIPE_EOF;
1431 		wakeup(ppipe);
1432 		KNOTE(&ppipe->pipe_sel.si_note, 0);
1433 		ppipe->pipe_peer = NULL;
1434 	}
1435 	/*
1436 	 * free resources
1437 	 */
1438 	if (PIPE_MTX(cpipe) != NULL) {
1439 		PIPE_UNLOCK(cpipe);
1440 		if (!hadpeer) {
1441 			mtx_destroy(PIPE_MTX(cpipe));
1442 			free(PIPE_MTX(cpipe), M_TEMP);
1443 		}
1444 	}
1445 	pipe_free_kmem(cpipe);
1446 	uma_zfree(pipe_zone, cpipe);
1447 }
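
/*
 * Illustrative userland sketch (not part of this file): when the read side
 * is closed, pipeclose() above marks the peer with PIPE_EOF, so subsequent
 * writes fail with EPIPE (and raise SIGPIPE unless it is ignored).  The
 * signal handling below is an assumption made for the example.
 */
#if 0
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];

	signal(SIGPIPE, SIG_IGN);	/* take EPIPE instead of the signal */
	if (pipe(fds) == -1)
		return (1);
	close(fds[0]);			/* no readers remain */
	if (write(fds[1], "x", 1) == -1 && errno == EPIPE)
		printf("write failed with EPIPE as expected\n");
	close(fds[1]);
	return (0);
}
#endif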
1448 
1449 /*ARGSUSED*/
1450 static int
1451 pipe_kqfilter(struct file *fp, struct knote *kn)
1452 {
1453 	struct pipe *cpipe;
1454 
1455 	cpipe = kn->kn_fp->f_data;
1456 	switch (kn->kn_filter) {
1457 	case EVFILT_READ:
1458 		kn->kn_fop = &pipe_rfiltops;
1459 		break;
1460 	case EVFILT_WRITE:
1461 		kn->kn_fop = &pipe_wfiltops;
1462 		cpipe = cpipe->pipe_peer;
1463 		if (cpipe == NULL)
1464 			/* other end of pipe has been closed */
1465 			return (EPIPE);
1466 		break;
1467 	default:
1468 		return (1);
1469 	}
1470 	kn->kn_hook = cpipe;
1471 
1472 	PIPE_LOCK(cpipe);
1473 	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
1474 	PIPE_UNLOCK(cpipe);
1475 	return (0);
1476 }
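
/*
 * Illustrative userland sketch (not part of this file): attaching an
 * EVFILT_READ filter to a pipe with kqueue(2)/kevent(2); registration ends
 * up in pipe_kqfilter() above.  Error handling is minimal by assumption and
 * the helper name is invented for the example.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void
wait_with_kqueue(int rfd)
{
	struct kevent change, event;
	int kq;

	kq = kqueue();
	if (kq == -1)
		return;
	EV_SET(&change, rfd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	/* Register the filter and wait for one event. */
	if (kevent(kq, &change, 1, &event, 1, NULL) > 0)
		printf("%jd bytes ready on fd %d\n",
		    (intmax_t)event.data, (int)event.ident);
	close(kq);
}
#endif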
1477 
1478 static void
1479 filt_pipedetach(struct knote *kn)
1480 {
1481 	struct pipe *cpipe = (struct pipe *)kn->kn_hook;
1482 
1483 	PIPE_LOCK(cpipe);
1484 	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
1485 	PIPE_UNLOCK(cpipe);
1486 }
1487 
1488 /*ARGSUSED*/
1489 static int
1490 filt_piperead(struct knote *kn, long hint)
1491 {
1492 	struct pipe *rpipe = kn->kn_fp->f_data;
1493 	struct pipe *wpipe = rpipe->pipe_peer;
1494 
1495 	PIPE_LOCK(rpipe);
1496 	kn->kn_data = rpipe->pipe_buffer.cnt;
1497 	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
1498 		kn->kn_data = rpipe->pipe_map.cnt;
1499 
1500 	if ((rpipe->pipe_state & PIPE_EOF) ||
1501 	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1502 		kn->kn_flags |= EV_EOF;
1503 		PIPE_UNLOCK(rpipe);
1504 		return (1);
1505 	}
1506 	PIPE_UNLOCK(rpipe);
1507 	return (kn->kn_data > 0);
1508 }
1509 
1510 /*ARGSUSED*/
1511 static int
1512 filt_pipewrite(struct knote *kn, long hint)
1513 {
1514 	struct pipe *rpipe = kn->kn_fp->f_data;
1515 	struct pipe *wpipe = rpipe->pipe_peer;
1516 
1517 	PIPE_LOCK(rpipe);
1518 	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
1519 		kn->kn_data = 0;
1520 		kn->kn_flags |= EV_EOF;
1521 		PIPE_UNLOCK(rpipe);
1522 		return (1);
1523 	}
1524 	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
1525 	if (wpipe->pipe_state & PIPE_DIRECTW)
1526 		kn->kn_data = 0;
1527 
1528 	PIPE_UNLOCK(rpipe);
1529 	return (kn->kn_data >= PIPE_BUF);
1530 }
1531