xref: /freebsd/sys/kern/sys_pipe.c (revision ce834215a70ff69e7e222827437116eee2f9ac6f)
1 /*
2  * Copyright (c) 1996 John S. Dyson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice immediately at the beginning of the file, without modification,
10  *    this list of conditions, and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. Absolutely no warranty of function or purpose is made by the author
15  *    John S. Dyson.
16  * 4. Modifications may be freely made to this file if the above conditions
17  *    are met.
18  *
19  * $Id: sys_pipe.c,v 1.27 1997/03/24 11:52:26 bde Exp $
20  */
21 
22 /*
23  * This file contains a high-performance replacement for the socket-based
24  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
25  * all features of sockets, but does do everything that pipes normally
26  * do.
27  */
28 
29 /*
30  * This code has two modes of operation, a small write mode and a large
31  * write mode.  The small write mode acts like conventional pipes with
32  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
33  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
34  * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
35  * the receiving process can copy it directly from the pages in the sending
36  * process.
37  *
38  * If the sending process receives a signal, it is possible that it will
39  * go away, and certainly its address space can change, because control
40  * is returned back to the user-mode side.  In that case, the pipe code
41  * arranges to copy the buffer supplied by the user process, to a pageable
42  * kernel buffer, and the receiving process will grab the data from the
43  * pageable kernel buffer.  Since signals don't happen all that often,
44  * the copy operation is normally eliminated.
45  *
46  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
47  * happen for small transfers so that the system will not spend all of
48  * its time context switching.  PIPE_SIZE is constrained by the
49  * amount of kernel virtual memory.
50  */
51 
52 #include <sys/param.h>
53 #include <sys/systm.h>
54 #include <sys/proc.h>
55 #include <sys/fcntl.h>
56 #include <sys/file.h>
57 #include <sys/protosw.h>
58 #include <sys/stat.h>
59 #include <sys/filedesc.h>
60 #include <sys/malloc.h>
61 #include <sys/filio.h>
62 #include <sys/ttycom.h>
63 #include <sys/stat.h>
64 #include <sys/select.h>
65 #include <sys/signalvar.h>
66 #include <sys/errno.h>
67 #include <sys/queue.h>
68 #include <sys/vmmeter.h>
69 #include <sys/kernel.h>
70 #include <sys/sysproto.h>
71 #include <sys/pipe.h>
72 
73 #include <vm/vm.h>
74 #include <vm/vm_prot.h>
75 #include <vm/vm_param.h>
76 #include <sys/lock.h>
77 #include <vm/vm_object.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_extern.h>
80 #include <vm/pmap.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_page.h>
83 
84 /*
85  * Use this define if you want to disable *fancy* VM things.  Expect an
86  * approx 30% decrease in transfer rate.  This could be useful for
87  * NetBSD or OpenBSD.
88  */
89 /* #define PIPE_NODIRECT */
90 
/*
 * interfaces to the outside world
 * (file-descriptor entry points; wired into pipeops below)
 */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_select __P((struct file *fp, int which, struct proc *p));
static int pipe_ioctl __P((struct file *fp, int cmd, caddr_t data, struct proc *p));

/* file-operations vector shared by both endpoints of every pipe */
static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_select, pipe_close };

/*
 * Default pipe buffer size(s), this can be kind-of large now because pipe
 * space is pageable.  The pipe code will try to maintain locality of
 * reference for performance reasons, so small amounts of outstanding I/O
 * will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
 * is there so that on large systems, we don't exhaust it.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers, we cannot, of course limit
 * the amount of kva for pipes in general though.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
int nbigpipe;			/* current count of BIG_PIPE_SIZE pipes */

static int amountpipekva;	/* total kva charged to pipe buffers/windows */

/* internal helpers */
static void pipeclose __P((struct pipe *cpipe));
static void pipeinit __P((struct pipe *cpipe));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *cpipe));
#ifndef PIPE_NODIRECT
/* direct (page-wiring) write path support */
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif
static void pipespace __P((struct pipe *cpipe));
147 /*
148  * The pipe system call for the DTYPE_PIPE type of pipes
149  */
150 
/*
 * pipe() system call: create a connected pair of pipe endpoints and
 * return their descriptors in retval[0] (read side) and retval[1]
 * (write side).  Returns 0 on success or an errno value; on failure
 * all partially-constructed state is unwound.
 */
/* ARGSUSED */
int
pipe(p, uap, retval)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
	int retval[];
{
	register struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	/* M_WAITOK: these allocations sleep rather than fail */
	rpipe = malloc( sizeof (*rpipe), M_TEMP, M_WAITOK);
	pipeinit(rpipe);
	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe = malloc( sizeof (*wpipe), M_TEMP, M_WAITOK);
	pipeinit(wpipe);
	wpipe->pipe_state |= PIPE_DIRECTOK;

	/* allocate the read-side file and descriptor */
	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	retval[0] = fd;
	/*
	 * NOTE(review): both descriptors get FREAD|FWRITE, so either
	 * end may be read or written -- presumably intentional
	 * (bidirectional pipes); confirm against callers.
	 */
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_ops = &pipeops;
	rf->f_data = (caddr_t)rpipe;
	/* allocate the write-side file and descriptor */
	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_ops = &pipeops;
	wf->f_data = (caddr_t)wpipe;
	retval[1] = fd;

	/* cross-link the two endpoints */
	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;

	return (0);
free3:
	/* undo the read-side falloc: drop the file and clear its fd slot */
	ffree(rf);
	fdp->fd_ofiles[retval[0]] = 0;
free2:
	(void)pipeclose(wpipe);
	(void)pipeclose(rpipe);
	return (error);
}
201 
202 /*
203  * Allocate kva for pipe circular buffer, the space is pageable
204  */
205 static void
206 pipespace(cpipe)
207 	struct pipe *cpipe;
208 {
209 	int npages, error;
210 
211 	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
212 	/*
213 	 * Create an object, I don't like the idea of paging to/from
214 	 * kernel_object.
215 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
216 	 */
217 	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
218 	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);
219 
220 	/*
221 	 * Insert the object into the kernel map, and allocate kva for it.
222 	 * The map entry is, by default, pageable.
223 	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
224 	 */
225 	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
226 		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
227 		cpipe->pipe_buffer.size, 1,
228 		VM_PROT_ALL, VM_PROT_ALL, 0);
229 
230 	if (error != KERN_SUCCESS)
231 		panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
232 	amountpipekva += cpipe->pipe_buffer.size;
233 }
234 
235 /*
236  * initialize and allocate VM and memory for pipe
237  */
238 static void
239 pipeinit(cpipe)
240 	struct pipe *cpipe;
241 {
242 	int s;
243 
244 	cpipe->pipe_buffer.in = 0;
245 	cpipe->pipe_buffer.out = 0;
246 	cpipe->pipe_buffer.cnt = 0;
247 	cpipe->pipe_buffer.size = PIPE_SIZE;
248 
249 	/* Buffer kva gets dynamically allocated */
250 	cpipe->pipe_buffer.buffer = NULL;
251 	/* cpipe->pipe_buffer.object = invalid */
252 
253 	cpipe->pipe_state = 0;
254 	cpipe->pipe_peer = NULL;
255 	cpipe->pipe_busy = 0;
256 	gettime(&cpipe->pipe_ctime);
257 	cpipe->pipe_atime = cpipe->pipe_ctime;
258 	cpipe->pipe_mtime = cpipe->pipe_ctime;
259 	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);
260 	cpipe->pipe_pgid = NO_PID;
261 
262 #ifndef PIPE_NODIRECT
263 	/*
264 	 * pipe data structure initializations to support direct pipe I/O
265 	 */
266 	cpipe->pipe_map.cnt = 0;
267 	cpipe->pipe_map.kva = 0;
268 	cpipe->pipe_map.pos = 0;
269 	cpipe->pipe_map.npages = 0;
270 	/* cpipe->pipe_map.ms[] = invalid */
271 #endif
272 }
273 
274 
275 /*
276  * lock a pipe for I/O, blocking other access
277  */
278 static __inline int
279 pipelock(cpipe, catch)
280 	struct pipe *cpipe;
281 	int catch;
282 {
283 	int error;
284 	while (cpipe->pipe_state & PIPE_LOCK) {
285 		cpipe->pipe_state |= PIPE_LWANT;
286 		if (error = tsleep( cpipe,
287 			catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) {
288 			return error;
289 		}
290 	}
291 	cpipe->pipe_state |= PIPE_LOCK;
292 	return 0;
293 }
294 
295 /*
296  * unlock a pipe I/O lock
297  */
298 static __inline void
299 pipeunlock(cpipe)
300 	struct pipe *cpipe;
301 {
302 	cpipe->pipe_state &= ~PIPE_LOCK;
303 	if (cpipe->pipe_state & PIPE_LWANT) {
304 		cpipe->pipe_state &= ~PIPE_LWANT;
305 		wakeup(cpipe);
306 	}
307 }
308 
309 static __inline void
310 pipeselwakeup(cpipe)
311 	struct pipe *cpipe;
312 {
313 	struct proc *p;
314 
315 	if (cpipe->pipe_state & PIPE_SEL) {
316 		cpipe->pipe_state &= ~PIPE_SEL;
317 		selwakeup(&cpipe->pipe_sel);
318 	}
319 	if (cpipe->pipe_state & PIPE_ASYNC) {
320 		if (cpipe->pipe_pgid < 0)
321 			gsignal(-cpipe->pipe_pgid, SIGIO);
322 		else if ((p = pfind(cpipe->pipe_pgid)) != NULL)
323 			psignal(p, SIGIO);
324 	}
325 }
326 
/*
 * Pipe read entry point.  Drains data into the caller's uio from
 * either the pipe's circular buffer or, when a direct write is
 * pending (PIPE_DIRECTW), straight from the writer's mapped pages.
 * Blocks (signal-interruptibly) when the pipe is empty unless
 * FNONBLOCK is set; returns 0 at EOF with no data moved.
 */
/* ARGSUSED */
static int
pipe_read(fp, uio, cred)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
{

	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error = 0;
	int nread = 0;
	u_int size;

	/* pipe_busy keeps pipeclose() from tearing us down mid-read */
	++rpipe->pipe_busy;
	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			/* clamp to the contiguous run, then to what's buffered,
			 * then to what the caller asked for */
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;
			if ((error = pipelock(rpipe,1)) == 0) {
				error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
				pipeunlock(rpipe);
			}
			if (error) {
				break;
			}
			/* advance the read index, wrapping the circular buffer */
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			(rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t va;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;
			if ((error = pipelock(rpipe,1)) == 0) {
				/* read straight out of the writer's mapped pages */
				va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
				error = uiomove(va, size, uio);
				pipeunlock(rpipe);
			}
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				/* direct transfer fully consumed; release the writer */
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 */
			if (rpipe->pipe_state & PIPE_EOF) {
				/* XXX error = ? */
				break;
			}
			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}
			/* return a short read rather than blocking again */
			if (nread > 0)
				break;

			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */

			if ((error = pipelock(rpipe,1)) == 0) {
				if (rpipe->pipe_buffer.cnt == 0) {
					rpipe->pipe_buffer.in = 0;
					rpipe->pipe_buffer.out = 0;
				}
				pipeunlock(rpipe);
			} else {
				break;
			}

			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/* sleep until the writer produces data (or a signal) */
			rpipe->pipe_state |= PIPE_WANTR;
			if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) {
				break;
			}
		}
	}

	if (error == 0)
		gettime(&rpipe->pipe_atime);

	--rpipe->pipe_busy;
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		/* a closer is waiting in pipeclose(); let it proceed */
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * If there is no more to read in the pipe, reset
		 * its pointers to the beginning.  This improves
		 * cache hit stats.
		 */
		if (rpipe->pipe_buffer.cnt == 0) {
			if ((error == 0) && (error = pipelock(rpipe,1)) == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
				pipeunlock(rpipe);
			}
		}

		/*
		 * If the "write-side" has been blocked, wake it up now.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	/* writable space opened up; notify select()ers on this end */
	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return error;
}
475 
476 #ifndef PIPE_NODIRECT
/*
 * Map the sending processes' buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 *
 * Faults in and wires the user pages covering the current iovec
 * (clamped to the pipe buffer size), records them in pipe_map,
 * maps them into the pipe's kva window, and advances the uio as if
 * the data had been copied.  Returns 0 or EFAULT (after unwiring
 * any pages already wired).
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	/* walk the user range page by page, wiring each frame */
	endaddr = round_page(uio->uio_iov->iov_base + size);
	for(i = 0, addr = trunc_page(uio->uio_iov->iov_base);
		addr < endaddr;
		addr += PAGE_SIZE, i+=1) {

		vm_page_t m;

		/* ensure the page is resident before extracting it */
		vm_fault_quick( (caddr_t) addr, VM_PROT_READ);
		paddr = pmap_kextract(addr);
		if (!paddr) {
			/* page vanished: unwind everything wired so far */
			int j;
			for(j=0;j<i;j++)
				vm_page_unwire(wpipe->pipe_map.ms[j]);
			return EFAULT;
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

/*
 * set up the control block
 */
	wpipe->pipe_map.npages = i;
	/* pos is the offset of the data within the first page */
	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

/*
 * and map the buffer
 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

/*
 * and update the uio data
 * (consume "size" bytes from the iovec as though copied)
 */

	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return 0;
}
549 
550 /*
551  * unmap and unwire the process buffer
552  */
553 static void
554 pipe_destroy_write_buffer(wpipe)
555 struct pipe *wpipe;
556 {
557 	int i;
558 	if (wpipe->pipe_map.kva) {
559 		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
560 
561 		if (amountpipekva > MAXPIPEKVA) {
562 			vm_offset_t kva = wpipe->pipe_map.kva;
563 			wpipe->pipe_map.kva = 0;
564 			kmem_free(kernel_map, kva,
565 				wpipe->pipe_buffer.size + PAGE_SIZE);
566 			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
567 		}
568 	}
569 	for (i=0;i<wpipe->pipe_map.npages;i++)
570 		vm_page_unwire(wpipe->pipe_map.ms[i]);
571 }
572 
573 /*
574  * In the case of a signal, the writing process might go away.  This
575  * code copies the data into the circular buffer so that the source
576  * pages can be freed without loss of data.
577  */
578 static void
579 pipe_clone_write_buffer(wpipe)
580 struct pipe *wpipe;
581 {
582 	int size;
583 	int pos;
584 
585 	size = wpipe->pipe_map.cnt;
586 	pos = wpipe->pipe_map.pos;
587 	bcopy((caddr_t) wpipe->pipe_map.kva+pos,
588 			(caddr_t) wpipe->pipe_buffer.buffer,
589 			size);
590 
591 	wpipe->pipe_buffer.in = size;
592 	wpipe->pipe_buffer.out = 0;
593 	wpipe->pipe_buffer.cnt = size;
594 	wpipe->pipe_state &= ~PIPE_DIRECTW;
595 
596 	pipe_destroy_write_buffer(wpipe);
597 }
598 
/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;
retry:
	/* wait for any earlier direct write to be fully consumed */
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if ( wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe,
				PRIBIO|PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	/* buffered bytes must drain before a direct write may start */
	if (wpipe->pipe_buffer.cnt > 0) {
		if ( wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe,
				PRIBIO|PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		/* another direct write may have slipped in; recheck from the top */
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	/* wire and map the sender's pages into the pipe's kva window */
	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	/* wait until the reader clears PIPE_DIRECTW (all data consumed) */
	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			/* reader went away mid-transfer: tear down and bail */
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe,0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 * (interrupted before the reader consumed everything)
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return error;

error1:
	wakeup(wpipe);
	return error;
}
690 #endif
691 
/*
 * Pipe write entry point.  Fills the peer endpoint's buffer from the
 * caller's uio, upgrading small pipes to BIG_PIPE_SIZE for large
 * writes and using the page-wiring direct path for large blocking
 * transfers.  Writes of <= PIPE_BUF bytes are atomic.  Returns 0 or
 * an errno; EPIPE here causes the syscall layer to post SIGPIPE.
 */
static int
pipe_write(fp, uio, cred)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
{
	int error = 0;
	int orig_resid;

	struct pipe *wpipe, *rpipe;

	/* data written to this end lands in the peer's buffer */
	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return EPIPE;
	}

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.  (Only when the pipe is small, idle, empty, and the
	 * global big-pipe limit permits.)
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(nbigpipe < LIMITBIGPIPES) &&
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		/* release the old small buffer; pipespace() reallocates below */
		if (wpipe->pipe_buffer.buffer) {
			amountpipekva -= wpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)wpipe->pipe_buffer.buffer,
				wpipe->pipe_buffer.size);
		}

#ifndef PIPE_NODIRECT
		/* the direct-write kva window is sized to the buffer; drop it too */
		if (wpipe->pipe_map.kva) {
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				wpipe->pipe_map.kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif

		wpipe->pipe_buffer.in = 0;
		wpipe->pipe_buffer.out = 0;
		wpipe->pipe_buffer.cnt = 0;
		wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
		wpipe->pipe_buffer.buffer = NULL;
		++nbigpipe;

#ifndef PIPE_NODIRECT
		wpipe->pipe_map.cnt = 0;
		wpipe->pipe_map.kva = 0;
		wpipe->pipe_map.pos = 0;
		wpipe->pipe_map.npages = 0;
#endif

	}


	/* lazily allocate the buffer kva on first write (or after resize) */
	if( wpipe->pipe_buffer.buffer == NULL) {
		if ((error = pipelock(wpipe,1)) == 0) {
			pipespace(wpipe);
			pipeunlock(wpipe);
		} else {
			return error;
		}
	}

	/* pipe_busy keeps pipeclose() from tearing us down mid-write */
	++wpipe->pipe_busy;
	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * NOTE(review): the iov_len >= PIPE_MINDIRECT test
		 * appears twice in this condition; the second is
		 * redundant.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
			(wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) &&
			(uio->uio_iov->iov_len >= PIPE_MINDIRECT)) {
			error = pipe_direct_write( wpipe, uio);
			if (error) {
				break;
			}
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincidental with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe,
					PRIBIO|PCATCH, "pipbww", 0);
			if (error)
				break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		/* XXX perhaps they need to be contiguous to be atomic? */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			/*
			 * This set the maximum transfer as a segment of
			 * the buffer.
			 */
			int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in;
			/*
			 * space is the size left in the buffer
			 */
			if (size > space)
				size = space;
			/*
			 * now limit it to the size of the uio transfer
			 */
			if (size > uio->uio_resid)
				size = uio->uio_resid;
			if ((error = pipelock(wpipe,1)) == 0) {
				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
					size, uio);
				pipeunlock(wpipe);
			}
			if (error)
				break;

			/* advance the write index, wrapping the circular buffer */
			wpipe->pipe_buffer.in += size;
			if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size)
				wpipe->pipe_buffer.in = 0;

			wpipe->pipe_buffer.cnt += size;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up selects.
			 */
			pipeselwakeup(wpipe);

			/* sleep until the reader frees space (or a signal) */
			wpipe->pipe_state |= PIPE_WANTW;
			if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) {
				break;
			}
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) &&
		(wpipe->pipe_state & PIPE_WANT)) {
		/* a closer is waiting in pipeclose(); let it proceed */
		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
		(uio->uio_resid == 0) &&
		(error == EPIPE))
		error = 0;

	if (error == 0)
		gettime(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return error;
}
924 
925 /*
926  * we implement a very minimal set of ioctls for compatibility with sockets.
927  */
928 int
929 pipe_ioctl(fp, cmd, data, p)
930 	struct file *fp;
931 	int cmd;
932 	register caddr_t data;
933 	struct proc *p;
934 {
935 	register struct pipe *mpipe = (struct pipe *)fp->f_data;
936 
937 	switch (cmd) {
938 
939 	case FIONBIO:
940 		return (0);
941 
942 	case FIOASYNC:
943 		if (*(int *)data) {
944 			mpipe->pipe_state |= PIPE_ASYNC;
945 		} else {
946 			mpipe->pipe_state &= ~PIPE_ASYNC;
947 		}
948 		return (0);
949 
950 	case FIONREAD:
951 		if (mpipe->pipe_state & PIPE_DIRECTW)
952 			*(int *)data = mpipe->pipe_map.cnt;
953 		else
954 			*(int *)data = mpipe->pipe_buffer.cnt;
955 		return (0);
956 
957 	case TIOCSPGRP:
958 		mpipe->pipe_pgid = *(int *)data;
959 		return (0);
960 
961 	case TIOCGPGRP:
962 		*(int *)data = mpipe->pipe_pgid;
963 		return (0);
964 
965 	}
966 	return (ENOTTY);
967 }
968 
969 int
970 pipe_select(fp, which, p)
971 	struct file *fp;
972 	int which;
973 	struct proc *p;
974 {
975 	register struct pipe *rpipe = (struct pipe *)fp->f_data;
976 	struct pipe *wpipe;
977 
978 	wpipe = rpipe->pipe_peer;
979 	switch (which) {
980 
981 	case FREAD:
982 		if ( (rpipe->pipe_state & PIPE_DIRECTW) ||
983 			(rpipe->pipe_buffer.cnt > 0) ||
984 			(rpipe->pipe_state & PIPE_EOF)) {
985 			return (1);
986 		}
987 		selrecord(p, &rpipe->pipe_sel);
988 		rpipe->pipe_state |= PIPE_SEL;
989 		break;
990 
991 	case FWRITE:
992 		if ((wpipe == NULL) ||
993 			(wpipe->pipe_state & PIPE_EOF) ||
994 			(((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
995 			 (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
996 			return (1);
997 		}
998 		selrecord(p, &wpipe->pipe_sel);
999 		wpipe->pipe_state |= PIPE_SEL;
1000 		break;
1001 
1002 	case 0:
1003 		if ((rpipe->pipe_state & PIPE_EOF) ||
1004 			(wpipe == NULL) ||
1005 			(wpipe->pipe_state & PIPE_EOF)) {
1006 			return (1);
1007 		}
1008 
1009 		selrecord(p, &rpipe->pipe_sel);
1010 		rpipe->pipe_state |= PIPE_SEL;
1011 		break;
1012 	}
1013 	return (0);
1014 }
1015 
1016 int
1017 pipe_stat(pipe, ub)
1018 	register struct pipe *pipe;
1019 	register struct stat *ub;
1020 {
1021 	bzero((caddr_t)ub, sizeof (*ub));
1022 	ub->st_mode = S_IFIFO;
1023 	ub->st_blksize = pipe->pipe_buffer.size;
1024 	ub->st_size = pipe->pipe_buffer.cnt;
1025 	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
1026 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
1027 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
1028 	TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
1029 	/*
1030 	 * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
1031 	 * st_flags, st_gen.
1032 	 * XXX (st_dev, st_ino) should be unique.
1033 	 */
1034 	return 0;
1035 }
1036 
1037 /* ARGSUSED */
1038 static int
1039 pipe_close(fp, p)
1040 	struct file *fp;
1041 	struct proc *p;
1042 {
1043 	struct pipe *cpipe = (struct pipe *)fp->f_data;
1044 
1045 	pipeclose(cpipe);
1046 	fp->f_data = NULL;
1047 	return 0;
1048 }
1049 
/*
 * shutdown the pipe
 *
 * Waits for in-progress readers/writers to drain (pipe_busy),
 * marks the peer EOF and unlinks it, then releases the buffer kva,
 * the direct-write kva window, and the pipe structure itself.
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;
	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 * (PIPE_WANT makes the last reader/writer wake us when
		 * pipe_busy drops to zero.)
		 */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if (ppipe = cpipe->pipe_peer) {
			pipeselwakeup(ppipe);

			/* peer now sees EOF on read and EPIPE on write */
			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}

		/*
		 * free resources
		 */
		if (cpipe->pipe_buffer.buffer) {
			/* buffers larger than PIPE_SIZE count against the big-pipe limit */
			if (cpipe->pipe_buffer.size > PIPE_SIZE)
				--nbigpipe;
			amountpipekva -= cpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)cpipe->pipe_buffer.buffer,
				cpipe->pipe_buffer.size);
		}
#ifndef PIPE_NODIRECT
		/* release the direct-write kva window, if one was ever allocated */
		if (cpipe->pipe_map.kva) {
			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				cpipe->pipe_map.kva,
				cpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif
		free(cpipe, M_TEMP);
	}
}
1105