xref: /freebsd/sys/kern/sys_pipe.c (revision 1d66272a85cde1c8a69c58f4b5dd649babd6eca6)
/*
 * Copyright (c) 1996 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 * 4. Modifications may be freely made to this file if the above conditions
 *    are met.
 *
 * $FreeBSD$
 */

/*
 * This file contains a high-performance replacement for the socket-based
 * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
 * all features of sockets, but it does everything that pipes normally
 * do.
 */

/*
 * This code has two modes of operation, a small write mode and a large
 * write mode.  The small write mode acts like conventional pipes with
 * a kernel buffer.  If the write is smaller than PIPE_MINDIRECT, then
 * "normal" pipe buffering is done.  If the write is between PIPE_MINDIRECT
 * and PIPE_SIZE in size, the writer's buffer is fully mapped and wired
 * into the kernel, and the receiving process copies the data directly
 * from the sending process's pages.
 *
 * If the sending process receives a signal, it may go away, and its
 * address space can certainly change, because control is returned to
 * the user-mode side.  In that case, the pipe code arranges to copy
 * the buffer supplied by the user process into a pageable kernel
 * buffer, and the receiving process grabs the data from there.  Since
 * signals don't happen all that often, the copy operation is normally
 * eliminated.
 *
 * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
 * happen for small transfers so that the system will not spend all of
 * its time context switching.  PIPE_SIZE is constrained by the
 * amount of kernel virtual memory.
 */
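
/*
 * Illustrative sketch (not part of the kernel build): a hypothetical
 * userland program exercising the two write paths described above.
 * A small write is copied into the pipe's kernel buffer; a single
 * large write (at least PIPE_MINDIRECT bytes) is a candidate for the
 * direct page-wiring path.  The sizes below are examples only.
 */
#if 0
#include <sys/types.h>
#include <sys/wait.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	char small[128];		/* well below PIPE_MINDIRECT */
	static char big[64 * 1024];	/* large enough for direct I/O */
	char sink[8192];
	ssize_t n;

	if (pipe(fds) < 0)
		return (1);
	if (fork() == 0) {
		/* Child: drain the pipe so the parent's writes finish. */
		close(fds[1]);
		while ((n = read(fds[0], sink, sizeof(sink))) > 0)
			;
		_exit(0);
	}
	close(fds[0]);
	memset(small, 'a', sizeof(small));
	memset(big, 'b', sizeof(big));
	(void)write(fds[1], small, sizeof(small));	/* buffered path */
	(void)write(fds[1], big, sizeof(big));		/* direct-path candidate */
	close(fds[1]);
	(void)wait(NULL);
	return (0);
}
#endif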

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_zone.h>

/*
 * Use this define if you want to disable *fancy* VM things.  Expect an
 * approx 30% decrease in transfer rate.  This could be useful for
 * NetBSD or OpenBSD.
 */
/* #define PIPE_NODIRECT */

/*
 * interfaces to the outside world
 */
static int pipe_read __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_write __P((struct file *fp, struct uio *uio,
		struct ucred *cred, int flags, struct proc *p));
static int pipe_close __P((struct file *fp, struct proc *p));
static int pipe_poll __P((struct file *fp, int events, struct ucred *cred,
		struct proc *p));
static int pipe_stat __P((struct file *fp, struct stat *sb, struct proc *p));
static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p));

static struct fileops pipeops =
    { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_stat, pipe_close };

static int	filt_pipeattach(struct knote *kn);
static void	filt_pipedetach(struct knote *kn);
static int	filt_piperead(struct knote *kn, long hint);
static int	filt_pipewrite(struct knote *kn, long hint);

struct filterops pipe_rwfiltops[] = {
	{ 1, filt_pipeattach, filt_pipedetach, filt_piperead },
	{ 1, filt_pipeattach, filt_pipedetach, filt_pipewrite },
};

/*
 * Default pipe buffer size(s).  This can be fairly large now because
 * pipe space is pageable.  The pipe code will try to maintain locality
 * of reference for performance reasons, so small amounts of outstanding
 * I/O will not wipe the cache.
 */
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)

/*
 * Maximum amount of kva for pipes -- this is a kind-of soft limit, but
 * it is there so that we don't exhaust kernel virtual memory on large
 * systems.
 */
#define MAXPIPEKVA (8*1024*1024)

/*
 * Limit for direct transfers.  We cannot, of course, limit the amount
 * of kva for pipes in general, though.
 */
#define LIMITPIPEKVA (16*1024*1024)

/*
 * Limit the number of "big" pipes
 */
#define LIMITBIGPIPES	32
static int nbigpipe;

static int amountpipekva;

static void pipeclose __P((struct pipe *cpipe));
static void pipeinit __P((struct pipe *cpipe));
static __inline int pipelock __P((struct pipe *cpipe, int catch));
static __inline void pipeunlock __P((struct pipe *cpipe));
static __inline void pipeselwakeup __P((struct pipe *cpipe));
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
static void pipe_clone_write_buffer __P((struct pipe *wpipe));
#endif
static void pipespace __P((struct pipe *cpipe));

static vm_zone_t pipe_zone;

/*
 * The pipe system call for the DTYPE_PIPE type of pipes
 */

/* ARGSUSED */
int
pipe(p, uap)
	struct proc *p;
	struct pipe_args /* {
		int	dummy;
	} */ *uap;
{
	register struct filedesc *fdp = p->p_fd;
	struct file *rf, *wf;
	struct pipe *rpipe, *wpipe;
	int fd, error;

	if (pipe_zone == NULL)
		pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4);

	rpipe = zalloc(pipe_zone);
	pipeinit(rpipe);
	rpipe->pipe_state |= PIPE_DIRECTOK;
	wpipe = zalloc(pipe_zone);
	pipeinit(wpipe);
	wpipe->pipe_state |= PIPE_DIRECTOK;

	/*
	 * Warning: once we've gotten past allocation of the fd for the
	 * read-side, we can only drop the read side via fdrop() in order
	 * to avoid races against processes which manage to dup() the read
	 * side while we are blocked trying to allocate the write side.
	 */
	error = falloc(p, &rf, &fd);
	if (error)
		goto free2;
	fhold(rf);
	p->p_retval[0] = fd;
	rf->f_flag = FREAD | FWRITE;
	rf->f_type = DTYPE_PIPE;
	rf->f_data = (caddr_t)rpipe;
	rf->f_ops = &pipeops;
	error = falloc(p, &wf, &fd);
	if (error)
		goto free3;
	wf->f_flag = FREAD | FWRITE;
	wf->f_type = DTYPE_PIPE;
	wf->f_data = (caddr_t)wpipe;
	wf->f_ops = &pipeops;
	p->p_retval[1] = fd;

	rpipe->pipe_peer = wpipe;
	wpipe->pipe_peer = rpipe;
	fdrop(rf, p);

	return (0);
free3:
	if (fdp->fd_ofiles[p->p_retval[0]] == rf) {
		fdp->fd_ofiles[p->p_retval[0]] = NULL;
		fdrop(rf, p);
	}
	fdrop(rf, p);
	/* rpipe has been closed by fdrop() */
	rpipe = NULL;
free2:
	pipeclose(wpipe);
	pipeclose(rpipe);
	return (error);
}

/*
 * Allocate kva for pipe circular buffer; the space is pageable
 */
static void
pipespace(cpipe)
	struct pipe *cpipe;
{
	int npages, error;

	npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
	/*
	 * Create an object, I don't like the idea of paging to/from
	 * kernel_object.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
	cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);

	/*
	 * Insert the object into the kernel map, and allocate kva for it.
	 * The map entry is, by default, pageable.
	 * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
	 */
	error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
		(vm_offset_t *) &cpipe->pipe_buffer.buffer,
		cpipe->pipe_buffer.size, 1,
		VM_PROT_ALL, VM_PROT_ALL, 0);

	if (error != KERN_SUCCESS)
		panic("pipespace: cannot allocate pipe -- out of kvm -- code = %d", error);
	amountpipekva += cpipe->pipe_buffer.size;
}

/*
 * initialize and allocate VM and memory for pipe
 */
static void
pipeinit(cpipe)
	struct pipe *cpipe;
{

	cpipe->pipe_buffer.in = 0;
	cpipe->pipe_buffer.out = 0;
	cpipe->pipe_buffer.cnt = 0;
	cpipe->pipe_buffer.size = PIPE_SIZE;

	/* Buffer kva gets dynamically allocated */
	cpipe->pipe_buffer.buffer = NULL;
	/* cpipe->pipe_buffer.object = invalid */

	cpipe->pipe_state = 0;
	cpipe->pipe_peer = NULL;
	cpipe->pipe_busy = 0;
	vfs_timestamp(&cpipe->pipe_ctime);
	cpipe->pipe_atime = cpipe->pipe_ctime;
	cpipe->pipe_mtime = cpipe->pipe_ctime;
	bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);

#ifndef PIPE_NODIRECT
	/*
	 * pipe data structure initializations to support direct pipe I/O
	 */
	cpipe->pipe_map.cnt = 0;
	cpipe->pipe_map.kva = 0;
	cpipe->pipe_map.pos = 0;
	cpipe->pipe_map.npages = 0;
	/* cpipe->pipe_map.ms[] = invalid */
#endif
}

/*
 * lock a pipe for I/O, blocking other access
 */
static __inline int
pipelock(cpipe, catch)
	struct pipe *cpipe;
	int catch;
{
	int error;

	while (cpipe->pipe_state & PIPE_LOCK) {
		cpipe->pipe_state |= PIPE_LWANT;
		if ((error = tsleep(cpipe,
		    catch ? (PRIBIO | PCATCH) : PRIBIO, "pipelk", 0)) != 0) {
			return (error);
		}
	}
	cpipe->pipe_state |= PIPE_LOCK;
	return (0);
}

/*
 * unlock a pipe I/O lock
 */
static __inline void
pipeunlock(cpipe)
	struct pipe *cpipe;
{
	cpipe->pipe_state &= ~PIPE_LOCK;
	if (cpipe->pipe_state & PIPE_LWANT) {
		cpipe->pipe_state &= ~PIPE_LWANT;
		wakeup(cpipe);
	}
}

static __inline void
pipeselwakeup(cpipe)
	struct pipe *cpipe;
{
	if (cpipe->pipe_state & PIPE_SEL) {
		cpipe->pipe_state &= ~PIPE_SEL;
		selwakeup(&cpipe->pipe_sel);
	}
	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
		pgsigio(cpipe->pipe_sigio, SIGIO, 0);
	KNOTE(&cpipe->pipe_sel.si_note, 0);
}

/* ARGSUSED */
static int
pipe_read(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
{
	struct pipe *rpipe = (struct pipe *) fp->f_data;
	int error;
	int nread = 0;
	u_int size;

	++rpipe->pipe_busy;
	error = pipelock(rpipe, 1);
	if (error)
		goto unlocked_error;

	while (uio->uio_resid) {
		/*
		 * normal pipe buffer receive
		 */
		if (rpipe->pipe_buffer.cnt > 0) {
			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
			if (size > rpipe->pipe_buffer.cnt)
				size = rpipe->pipe_buffer.cnt;
			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
					size, uio);
			if (error) {
				break;
			}
			rpipe->pipe_buffer.out += size;
			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
				rpipe->pipe_buffer.out = 0;

			rpipe->pipe_buffer.cnt -= size;

			/*
			 * If there is no more to read in the pipe, reset
			 * its pointers to the beginning.  This improves
			 * cache hit stats.
			 */
			if (rpipe->pipe_buffer.cnt == 0) {
				rpipe->pipe_buffer.in = 0;
				rpipe->pipe_buffer.out = 0;
			}
			nread += size;
#ifndef PIPE_NODIRECT
		/*
		 * Direct copy, bypassing a kernel buffer.
		 */
		} else if ((size = rpipe->pipe_map.cnt) &&
			   (rpipe->pipe_state & PIPE_DIRECTW)) {
			caddr_t	va;

			if (size > (u_int) uio->uio_resid)
				size = (u_int) uio->uio_resid;

			va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
			error = uiomove(va, size, uio);
			if (error)
				break;
			nread += size;
			rpipe->pipe_map.pos += size;
			rpipe->pipe_map.cnt -= size;
			if (rpipe->pipe_map.cnt == 0) {
				rpipe->pipe_state &= ~PIPE_DIRECTW;
				wakeup(rpipe);
			}
#endif
		} else {
			/*
			 * detect EOF condition
			 */
			if (rpipe->pipe_state & PIPE_EOF) {
				/* XXX error = ? */
				break;
			}

			/*
			 * If the "write-side" has been blocked, wake it up now.
			 */
			if (rpipe->pipe_state & PIPE_WANTW) {
				rpipe->pipe_state &= ~PIPE_WANTW;
				wakeup(rpipe);
			}

			/*
			 * Break if some data was read.
			 */
			if (nread > 0)
				break;

			/*
			 * Unlock the pipe buffer for our remaining processing.
			 * We will either break out with an error or we will
			 * sleep and relock to loop.
			 */
			pipeunlock(rpipe);

			/*
			 * Handle non-blocking mode operation or
			 * wait for more data.
			 */
			if (fp->f_flag & FNONBLOCK)
				error = EAGAIN;
			else {
				rpipe->pipe_state |= PIPE_WANTR;
				if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) == 0)
					error = pipelock(rpipe, 1);
			}
			if (error)
				goto unlocked_error;
		}
	}
	pipeunlock(rpipe);

	if (error == 0)
		vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
	--rpipe->pipe_busy;

	/*
	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
	 */
	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
		wakeup(rpipe);
	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
		/*
		 * Handle write blocking hysteresis.
		 */
		if (rpipe->pipe_state & PIPE_WANTW) {
			rpipe->pipe_state &= ~PIPE_WANTW;
			wakeup(rpipe);
		}
	}

	if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
		pipeselwakeup(rpipe);

	return (error);
}

#ifndef PIPE_NODIRECT
/*
 * Map the sending process's buffer into kernel space and wire it.
 * This is similar to a physical write operation.
 */
static int
pipe_build_write_buffer(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	u_int size;
	int i;
	vm_offset_t addr, endaddr, paddr;

	size = (u_int) uio->uio_iov->iov_len;
	if (size > wpipe->pipe_buffer.size)
		size = wpipe->pipe_buffer.size;

	endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size);
	for (i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base);
	    addr < endaddr;
	    addr += PAGE_SIZE, i++) {

		vm_page_t m;

		if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 ||
		    (paddr = pmap_kextract(addr)) == 0) {
			int j;

			for (j = 0; j < i; j++)
				vm_page_unwire(wpipe->pipe_map.ms[j], 1);
			return (EFAULT);
		}

		m = PHYS_TO_VM_PAGE(paddr);
		vm_page_wire(m);
		wpipe->pipe_map.ms[i] = m;
	}

	/*
	 * set up the control block
	 */
	wpipe->pipe_map.npages = i;
	wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
	wpipe->pipe_map.cnt = size;

	/*
	 * and map the buffer
	 */
	if (wpipe->pipe_map.kva == 0) {
		/*
		 * We need to allocate space for an extra page because the
		 * address range might (will) span pages at times.
		 */
		wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
			wpipe->pipe_buffer.size + PAGE_SIZE);
		amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
	}
	pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
		wpipe->pipe_map.npages);

	/*
	 * and update the uio data
	 */
	uio->uio_iov->iov_len -= size;
	uio->uio_iov->iov_base += size;
	if (uio->uio_iov->iov_len == 0)
		uio->uio_iov++;
	uio->uio_resid -= size;
	uio->uio_offset += size;
	return (0);
}

/*
 * unmap and unwire the process buffer
 */
static void
pipe_destroy_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int i;

	if (wpipe->pipe_map.kva) {
		pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);

		if (amountpipekva > MAXPIPEKVA) {
			vm_offset_t kva = wpipe->pipe_map.kva;
			wpipe->pipe_map.kva = 0;
			kmem_free(kernel_map, kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
		}
	}
	for (i = 0; i < wpipe->pipe_map.npages; i++)
		vm_page_unwire(wpipe->pipe_map.ms[i], 1);
}

/*
 * In the case of a signal, the writing process might go away.  This
 * code copies the data into the circular buffer so that the source
 * pages can be freed without loss of data.
 */
static void
pipe_clone_write_buffer(wpipe)
	struct pipe *wpipe;
{
	int size;
	int pos;

	size = wpipe->pipe_map.cnt;
	pos = wpipe->pipe_map.pos;
	bcopy((caddr_t) wpipe->pipe_map.kva + pos,
			(caddr_t) wpipe->pipe_buffer.buffer,
			size);

	wpipe->pipe_buffer.in = size;
	wpipe->pipe_buffer.out = 0;
	wpipe->pipe_buffer.cnt = size;
	wpipe->pipe_state &= ~PIPE_DIRECTW;

	pipe_destroy_write_buffer(wpipe);
}

/*
 * This implements the pipe buffer write mechanism.  Note that only
 * a direct write OR a normal pipe write can be pending at any given time.
 * If there are any characters in the pipe buffer, the direct write will
 * be deferred until the receiving process grabs all of the bytes from
 * the pipe buffer.  Then the direct mapping write is set-up.
 */
static int
pipe_direct_write(wpipe, uio)
	struct pipe *wpipe;
	struct uio *uio;
{
	int error;

retry:
	while (wpipe->pipe_state & PIPE_DIRECTW) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdww", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
	}
	wpipe->pipe_map.cnt = 0;	/* transfer not ready yet */
	if (wpipe->pipe_buffer.cnt > 0) {
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}

		wpipe->pipe_state |= PIPE_WANTW;
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwc", 0);
		if (error)
			goto error1;
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			goto error1;
		}
		goto retry;
	}

	wpipe->pipe_state |= PIPE_DIRECTW;

	error = pipe_build_write_buffer(wpipe, uio);
	if (error) {
		wpipe->pipe_state &= ~PIPE_DIRECTW;
		goto error1;
	}

	error = 0;
	while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
		if (wpipe->pipe_state & PIPE_EOF) {
			pipelock(wpipe, 0);
			pipe_destroy_write_buffer(wpipe);
			pipeunlock(wpipe);
			pipeselwakeup(wpipe);
			error = EPIPE;
			goto error1;
		}
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
		pipeselwakeup(wpipe);
		error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
	}

	pipelock(wpipe, 0);
	if (wpipe->pipe_state & PIPE_DIRECTW) {
		/*
		 * this bit of trickery substitutes a kernel buffer for
		 * the process that might be going away.
		 */
		pipe_clone_write_buffer(wpipe);
	} else {
		pipe_destroy_write_buffer(wpipe);
	}
	pipeunlock(wpipe);
	return (error);

error1:
	wakeup(wpipe);
	return (error);
}
#endif

static int
pipe_write(fp, uio, cred, flags, p)
	struct file *fp;
	struct uio *uio;
	struct ucred *cred;
	int flags;
	struct proc *p;
{
	int error = 0;
	int orig_resid;
	struct pipe *wpipe, *rpipe;

	rpipe = (struct pipe *) fp->f_data;
	wpipe = rpipe->pipe_peer;

	/*
	 * detect loss of pipe read side, issue SIGPIPE if lost.
	 */
	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		return (EPIPE);
	}

	/*
	 * If it is advantageous to resize the pipe buffer, do
	 * so.
	 */
	if ((uio->uio_resid > PIPE_SIZE) &&
		(nbigpipe < LIMITBIGPIPES) &&
		(wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
		(wpipe->pipe_buffer.size <= PIPE_SIZE) &&
		(wpipe->pipe_buffer.cnt == 0)) {

		if (wpipe->pipe_buffer.buffer) {
			amountpipekva -= wpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)wpipe->pipe_buffer.buffer,
				wpipe->pipe_buffer.size);
		}

#ifndef PIPE_NODIRECT
		if (wpipe->pipe_map.kva) {
			amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				wpipe->pipe_map.kva,
				wpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif

		wpipe->pipe_buffer.in = 0;
		wpipe->pipe_buffer.out = 0;
		wpipe->pipe_buffer.cnt = 0;
		wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
		wpipe->pipe_buffer.buffer = NULL;
		++nbigpipe;

#ifndef PIPE_NODIRECT
		wpipe->pipe_map.cnt = 0;
		wpipe->pipe_map.kva = 0;
		wpipe->pipe_map.pos = 0;
		wpipe->pipe_map.npages = 0;
#endif
	}

	if (wpipe->pipe_buffer.buffer == NULL) {
		if ((error = pipelock(wpipe, 1)) == 0) {
			pipespace(wpipe);
			pipeunlock(wpipe);
		} else {
			return (error);
		}
	}

	++wpipe->pipe_busy;
	orig_resid = uio->uio_resid;
	while (uio->uio_resid) {
		int space;
#ifndef PIPE_NODIRECT
		/*
		 * If the transfer is large, we can gain performance if
		 * we do process-to-process copies directly.
		 * If the write is non-blocking, we don't use the
		 * direct write mechanism.
		 *
		 * The direct write mechanism will detect the reader going
		 * away on us.
		 */
		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
		    (fp->f_flag & FNONBLOCK) == 0 &&
		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
			error = pipe_direct_write(wpipe, uio);
			if (error) {
				break;
			}
			continue;
		}
#endif

		/*
		 * Pipe buffered writes cannot be coincident with
		 * direct writes.  We wait until the currently executing
		 * direct write is completed before we start filling the
		 * pipe buffer.  We break out if a signal occurs or the
		 * reader goes away.
		 */
	retrywrite:
		while (wpipe->pipe_state & PIPE_DIRECTW) {
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}
			error = tsleep(wpipe, PRIBIO|PCATCH, "pipbww", 0);
			if (wpipe->pipe_state & PIPE_EOF)
				break;
			if (error)
				break;
		}
		if (wpipe->pipe_state & PIPE_EOF) {
			error = EPIPE;
			break;
		}

		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;

		/* Writes of size <= PIPE_BUF must be atomic. */
		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
			space = 0;

		if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
			if ((error = pipelock(wpipe, 1)) == 0) {
				int size;	/* Transfer size */
				int segsize;	/* first segment to transfer */

				/*
				 * It is possible for a direct write to
				 * slip in on us... handle it here...
				 */
				if (wpipe->pipe_state & PIPE_DIRECTW) {
					pipeunlock(wpipe);
					goto retrywrite;
				}
				/*
				 * If a process blocked in uiomove, our
				 * value for space might be bad.
				 *
				 * XXX will we be ok if the reader has gone
				 * away here?
				 */
				if (space > wpipe->pipe_buffer.size -
				    wpipe->pipe_buffer.cnt) {
					pipeunlock(wpipe);
					goto retrywrite;
				}

				/*
				 * Transfer size is minimum of uio transfer
				 * and free space in pipe buffer.
				 */
				if (space > uio->uio_resid)
					size = uio->uio_resid;
				else
					size = space;
				/*
				 * First segment to transfer is minimum of
				 * transfer size and contiguous space in
				 * pipe buffer.  If first segment to transfer
				 * is less than the transfer size, we've got
				 * a wraparound in the buffer.
				 */
				segsize = wpipe->pipe_buffer.size -
					wpipe->pipe_buffer.in;
				if (segsize > size)
					segsize = size;

				/* Transfer first segment */
				error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
						segsize, uio);

				if (error == 0 && segsize < size) {
					/*
					 * Transfer remaining part now, to
					 * support atomic writes.  Wraparound
					 * happened.
					 */
					if (wpipe->pipe_buffer.in + segsize !=
					    wpipe->pipe_buffer.size)
						panic("Expected pipe buffer wraparound disappeared");

					error = uiomove(&wpipe->pipe_buffer.buffer[0],
							size - segsize, uio);
				}
				if (error == 0) {
					wpipe->pipe_buffer.in += size;
					if (wpipe->pipe_buffer.in >=
					    wpipe->pipe_buffer.size) {
						if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size)
							panic("Expected wraparound bad");
						wpipe->pipe_buffer.in = size - segsize;
					}

					wpipe->pipe_buffer.cnt += size;
					if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size)
						panic("Pipe buffer overflow");
				}
				pipeunlock(wpipe);
			}
			if (error)
				break;
		} else {
			/*
			 * If the "read-side" has been blocked, wake it up now.
			 */
			if (wpipe->pipe_state & PIPE_WANTR) {
				wpipe->pipe_state &= ~PIPE_WANTR;
				wakeup(wpipe);
			}

			/*
			 * don't block on non-blocking I/O
			 */
			if (fp->f_flag & FNONBLOCK) {
				error = EAGAIN;
				break;
			}

			/*
			 * We have no more space and have something to offer,
			 * wake up select/poll.
			 */
			pipeselwakeup(wpipe);

			wpipe->pipe_state |= PIPE_WANTW;
			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) != 0) {
				break;
			}
			/*
			 * If read side wants to go away, we just issue a signal
			 * to ourselves.
			 */
			if (wpipe->pipe_state & PIPE_EOF) {
				error = EPIPE;
				break;
			}
		}
	}

	--wpipe->pipe_busy;
	if ((wpipe->pipe_busy == 0) &&
		(wpipe->pipe_state & PIPE_WANT)) {
		wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
		wakeup(wpipe);
	} else if (wpipe->pipe_buffer.cnt > 0) {
		/*
		 * If we have put any characters in the buffer, we wake up
		 * the reader.
		 */
		if (wpipe->pipe_state & PIPE_WANTR) {
			wpipe->pipe_state &= ~PIPE_WANTR;
			wakeup(wpipe);
		}
	}

	/*
	 * Don't return EPIPE if I/O was successful
	 */
	if ((wpipe->pipe_buffer.cnt == 0) &&
		(uio->uio_resid == 0) &&
		(error == EPIPE))
		error = 0;

	if (error == 0)
		vfs_timestamp(&wpipe->pipe_mtime);

	/*
	 * We have something to offer,
	 * wake up select/poll.
	 */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return (error);
}
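
/*
 * Illustrative sketch (not part of the kernel build): the atomicity
 * rule enforced above -- writes of at most PIPE_BUF bytes are never
 * interleaved with other writes -- is what makes multiple writers
 * per pipe usable.  A hypothetical userland demonstration:
 */
#if 0
#include <limits.h>
#include <string.h>
#include <unistd.h>

static void
writer(int fd, char tag)
{
	char rec[PIPE_BUF];	/* <= PIPE_BUF, so each write is atomic */
	int i;

	memset(rec, tag, sizeof(rec));
	for (i = 0; i < 100; i++)
		(void)write(fd, rec, sizeof(rec));
}

int
main(void)
{
	int fds[2];
	char buf[PIPE_BUF];
	ssize_t n;

	if (pipe(fds) < 0)
		return (1);
	if (fork() == 0) {
		writer(fds[1], 'A');	/* first writer */
		_exit(0);
	}
	if (fork() == 0) {
		writer(fds[1], 'B');	/* second writer */
		_exit(0);
	}
	close(fds[1]);
	/*
	 * The stream is a concatenation of whole records; within any
	 * one record the bytes are all 'A' or all 'B', never mixed.
	 */
	while ((n = read(fds[0], buf, sizeof(buf))) > 0)
		;
	return (0);
}
#endif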

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(fp, cmd, data, p)
	struct file *fp;
	u_long cmd;
	register caddr_t data;
	struct proc *p;
{
	register struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}
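
/*
 * Illustrative sketch (not part of the kernel build): exercising the
 * ioctls above from userland.  FIONREAD reports how many bytes a read
 * would return without blocking (the pending direct-write count when
 * one is in progress, otherwise the buffered count).  A hypothetical
 * example:
 */
#if 0
#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fds[2], nready, on = 1;

	if (pipe(fds) < 0)
		return (1);
	(void)write(fds[1], "hello", 5);
	if (ioctl(fds[0], FIONREAD, &nready) == 0)
		printf("%d bytes ready\n", nready);	/* prints 5 */
	/* Request SIGIO on readiness (an owner set via F_SETOWN). */
	(void)ioctl(fds[0], FIOASYNC, &on);
	return (0);
}
#endif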

int
pipe_poll(fp, events, cred, p)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct proc *p;
{
	register struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(p, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(p, &wpipe->pipe_sel);
			wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}
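
/*
 * Illustrative sketch (not part of the kernel build): the poll
 * semantics implemented above, seen from userland.  POLLIN is
 * reported when buffered or direct-write data is available (or at
 * EOF); POLLOUT requires at least PIPE_BUF bytes of free space;
 * POLLHUP is reported once the peer is gone.  A hypothetical example:
 */
#if 0
#include <poll.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	struct pollfd pfd;

	if (pipe(fds) < 0)
		return (1);
	(void)write(fds[1], "x", 1);

	pfd.fd = fds[0];
	pfd.events = POLLIN;
	(void)poll(&pfd, 1, 0);	/* revents now includes POLLIN */

	close(fds[1]);		/* write side gone... */
	(void)poll(&pfd, 1, 0);	/* ...POLLIN|POLLHUP after EOF */
	return (0);
}
#endif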

static int
pipe_stat(fp, ub, p)
	struct file *fp;
	struct stat *ub;
	struct proc *p;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/* ARGSUSED */
static int
pipe_close(fp, p)
	struct file *fp;
	struct proc *p;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
		 */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			ppipe->pipe_peer = NULL;
		}

		/*
		 * free resources
		 */
		if (cpipe->pipe_buffer.buffer) {
			if (cpipe->pipe_buffer.size > PIPE_SIZE)
				--nbigpipe;
			amountpipekva -= cpipe->pipe_buffer.size;
			kmem_free(kernel_map,
				(vm_offset_t)cpipe->pipe_buffer.buffer,
				cpipe->pipe_buffer.size);
		}
#ifndef PIPE_NODIRECT
		if (cpipe->pipe_map.kva) {
			amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
			kmem_free(kernel_map,
				cpipe->pipe_map.kva,
				cpipe->pipe_buffer.size + PAGE_SIZE);
		}
#endif
		zfree(pipe_zone, cpipe);
	}
}

static int
filt_pipeattach(struct knote *kn)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_INSERT_HEAD(&rpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;

	SLIST_REMOVE(&rpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
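
/*
 * Illustrative sketch (not part of the kernel build): registering the
 * knote filters above via kqueue.  EVFILT_READ fires with the readable
 * byte count in kn_data; EVFILT_WRITE fires once at least PIPE_BUF
 * bytes of buffer space are free.  A hypothetical userland example:
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

int
main(void)
{
	int fds[2], kq;
	struct kevent ev;
	struct timespec ts = { 0, 0 };

	if (pipe(fds) < 0 || (kq = kqueue()) < 0)
		return (1);
	EV_SET(&ev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &ev, 1, NULL, 0, NULL);

	(void)write(fds[1], "data", 4);
	if (kevent(kq, NULL, 0, &ev, 1, &ts) == 1) {
		/* ev.data is 4: the count set by filt_piperead(). */
	}
	return (0);
}
#endif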