/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD$
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but does do everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the write is smaller than PIPE_MINDIRECT, then the
 * "normal" pipe buffering is done.  If the write is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
 * the receiving process can copy it directly from the pages in the sending
 * process.
 *
 * If the sending process receives a signal, it is possible that it will
 * go away, and certainly its address space can change, because control
 * is returned to the user-mode side.  In that case, the pipe code
 * arranges to copy the buffer supplied by the user process to a pageable
 * kernel buffer, and the receiving process will grab the data from the
 * pageable kernel buffer.  Since signals don't happen all that often,
 * the copy operation is normally eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
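
/*
 * Illustrative userland sketch (not part of this file): the mode is
 * chosen per-write by the size passed to write(2).  This assumes the
 * stock PIPE_MINDIRECT (8K, from sys/pipe.h); note that the direct path
 * is also skipped for non-blocking writes (see pipe_write() below).
 *
 *	#include <unistd.h>
 *
 *	char small[512], big[65536];
 *	int fds[2];
 *
 *	pipe(fds);
 *	write(fds[1], small, sizeof(small));	-- buffered through the kernel
 *	write(fds[1], big, sizeof(big));	-- wired and mapped directly
 */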

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data,
		struct proc *p));

static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_stat, pipe_close };

static int	filt_pipeattach(struct knote *kn);
static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

struct filterops pipe_rwfiltops[] = {
	{ 1, filt_pipeattach, filt_pipedetach, filt_piperead },
	{ 1, filt_pipeattach, filt_pipedetach, filt_pipewrite },
};

/*
 * Default pipe buffer size(s).  This can be fairly large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is a soft limit, but it is
 * there so that we do not exhaust kva on large systems.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit of kva for direct transfers; we cannot, of course, limit the
 * amount of kva for pipes in general.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes.
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipekva;

static void pipeclose __P((struct pipe *cpipe));
static void pipeinit __P((struct pipe *cpipe));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *cpipe));
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif
static void pipespace __P((struct pipe *cpipe));

static vm_zone_t pipe_zone;

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	register struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);

	rpipe = zalloc(pipe_zone);
	pipeinit(rpipe);
	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe = zalloc(pipe_zone);
	pipeinit(wpipe);
	wpipe->pipe_state |= PIPE_DIRECTOK;

	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	fhold(rf);
	p->p_retval[0] = fd;
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, p);

	return (0);
free3:
	if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
		fdp->fd_ofiles[p->p_retval[0]] = NULL;
		fdrop(rf, p);
	}
	fdrop(rf, p);
free2:
	pipeclose(wpipe);
	pipeclose(rpipe);
	return (error);
}
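
/*
 * A minimal userland sketch of the call above (illustrative only):
 * p_retval[0] and p_retval[1] become the two descriptors returned by
 * pipe(2).  Note that both files are opened FREAD | FWRITE, so these
 * pipes are full-duplex; by convention fds[0] is used as the read end
 * and fds[1] as the write end.
 *
 *	#include <err.h>
 *	#include <unistd.h>
 *
 *	int fds[2];
 *	char buf[5];
 *
 *	if (pipe(fds) == -1)
 *		err(1, "pipe");
 *	write(fds[1], "hello", 5);
 *	read(fds[0], buf, 5);		-- buf now holds "hello"
 */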

/*
 * Allocate kva for the pipe circular buffer; the space is pageable.
 */
static void
pipespace(cpipe)
	struct pipe *cpipe;
{
	int npages, error;

	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
	/*
	 * Create an object; I don't like the idea of paging to/from
	 * kernel_object.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
		cpipe->pipe_buffer.size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS)
		panic("pipespace: cannot allocate pipe -- out of kvm -- code = %d",
		    error);
	amountpipekva += cpipe->pipe_buffer.size;
}

/*
 * Initialize pipe data structures; the buffer kva is allocated lazily
 * by pipespace().
 */
static void
pipeinit(cpipe)
	struct pipe *cpipe;
{

	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	cpipe->pipe_buffer.size = PIPE_SIZE;

	/* Buffer kva gets dynamically allocated */
	cpipe->pipe_buffer.buffer = NULL;
	/* cpipe->pipe_buffer.object = invalid */

	cpipe->pipe_state = 0;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;
	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);

#ifndef PIPE_NODIRECT
	/*
	 * pipe data structure initializations to support direct pipe I/O
	 */
	cpipe->pipe_map.cnt = 0;
	cpipe->pipe_map.kva = 0;
	cpipe->pipe_map.pos = 0;
	cpipe->pipe_map.npages = 0;
	/* cpipe->pipe_map.ms[] = invalid */
#endif
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		if ((error = tsleep(cpipe,
		    catch ? (PRIBIO|PCATCH) : PRIBIO, "pipelk", 0)) != 0) {
			return error;
		}
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return 0;
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;

			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 */
			if (rpipe->pipe_state & PIPE_EOF) {
				/* XXX error = ? */
				break;
			}

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK)
				error = EAGAIN;
			else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return error;
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	for (i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	     addr < endaddr;
	     addr += PAGE_SIZE, i++) {
		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return EFAULT;
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos =
	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return 0;
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;

			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
			(caddr_t) wpipe->pipe_buffer.buffer,
			size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return error;

error1:
	wakeup(wpipe);
	return error;
}
#endif

static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return EPIPE;
	}

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(nbigpipe < LIMITBIGPIPES) &&
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		if (wpipe->pipe_buffer.buffer) {
			amountpipekva -= wpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)wpipe->pipe_buffer.buffer,
				wpipe->pipe_buffer.size);
		}

#ifndef PIPE_NODIRECT
		if (wpipe->pipe_map.kva) {
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				wpipe->pipe_map.kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif

		wpipe->pipe_buffer.in = 0;
		wpipe->pipe_buffer.out = 0;
		wpipe->pipe_buffer.cnt = 0;
		wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
		wpipe->pipe_buffer.buffer = NULL;
		++nbigpipe;

#ifndef PIPE_NODIRECT
		wpipe->pipe_map.cnt = 0;
		wpipe->pipe_map.kva = 0;
		wpipe->pipe_map.pos = 0;
		wpipe->pipe_map.npages = 0;
#endif
	}

	if (wpipe->pipe_buffer.buffer == NULL) {
		if ((error = pipelock(wpipe, 1)) == 0) {
			pipespace(wpipe);
			pipeunlock(wpipe);
		} else {
			return error;
		}
	}

	++wpipe->pipe_busy;
	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;

#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
			error = pipe_direct_write(wpipe, uio);
			if (error) {
				break;
			}
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PRIBIO|PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			if ((error = pipelock(wpipe, 1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */
				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
						wpipe->pipe_buffer.in = size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");
				}
				pipeunlock(wpipe);
			}
			if (error)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) != 0) {
				break;
			}
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) &&
		(wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
		(uio->uio_resid == 0) &&
		(error == EPIPE))
		error = 0;

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return error;
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
static int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	register caddr_t data;
	struct proc *p;
{
	register struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}
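
/*
 * Illustrative userland use of the FIONREAD ioctl handled above (a
 * sketch, not part of this file):
 *
 *	#include <sys/ioctl.h>
 *	#include <stdio.h>
 *
 *	int nbytes;
 *
 *	if (ioctl(fds[0], FIONREAD, &nbytes) == 0)
 *		printf("%d bytes ready to read\n", nbytes);
 */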

static int
pipe_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{
	register struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
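
/*
 * Sketch of the userland view of the poll logic above (illustrative
 * only): a live pipe reports itself writable only while at least
 * PIPE_BUF bytes of buffer space remain and no direct write is pending.
 *
 *	#include <poll.h>
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = fds[0];
 *	pfd.events = POLLIN;
 *	if (poll(&pfd, 1, INFTIM) > 0 && (pfd.revents & POLLIN))
 *		;	-- data (or EOF) is available on the read end
 */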

static int
pipe_stat(fp, ub, p)
	struct file *fp;
	struct stat *ub;
	struct proc *p;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return 0;
}

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return 0;
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (cpipe) {
		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}

		/*
		 * free resources
		 */
		if (cpipe->pipe_buffer.buffer) {
			if (cpipe->pipe_buffer.size > PIPE_SIZE)
				--nbigpipe;
			amountpipekva -= cpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)cpipe->pipe_buffer.buffer,
				cpipe->pipe_buffer.size);
		}
#ifndef PIPE_NODIRECT
		if (cpipe->pipe_map.kva) {
			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				cpipe->pipe_map.kva,
				cpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif
		zfree(pipe_zone, cpipe);
	}
}

static int
filt_pipeattach(struct knote *kn)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}