xref: /freebsd/sys/security/audit/audit_pipe.c (revision 734e82fe33aa764367791a7d603b383996c6b40b)
1 /*-
2  * Copyright (c) 2006 Robert N. M. Watson
3  * Copyright (c) 2008-2009 Apple, Inc.
4  * All rights reserved.
5  *
6  * This software was developed by Robert Watson for the TrustedBSD Project.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 #include <sys/param.h>
32 #include <sys/condvar.h>
33 #include <sys/conf.h>
34 #include <sys/eventhandler.h>
35 #include <sys/filio.h>
36 #include <sys/kernel.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/poll.h>
41 #include <sys/proc.h>
42 #include <sys/queue.h>
43 #include <sys/rwlock.h>
44 #include <sys/selinfo.h>
45 #include <sys/sigio.h>
46 #include <sys/signal.h>
47 #include <sys/signalvar.h>
48 #include <sys/sx.h>
49 #include <sys/systm.h>
50 #include <sys/uio.h>
51 
52 #include <security/audit/audit.h>
53 #include <security/audit/audit_ioctl.h>
54 #include <security/audit/audit_private.h>
55 
56 /*
57  * Implementation of a clonable special device providing a live stream of BSM
58  * audit data.  Consumers receive a "tee" of the system audit trail by
59  * default, but may also define alternative event selections using ioctls.
60  * This interface provides unreliable but timely access to audit events.
61  * Consumers should be very careful to avoid introducing event cycles.
62  */
63 
/*
 * Memory types.  M_AUDIT_PIPE backs the per-open pipe structures,
 * M_AUDIT_PIPE_ENTRY backs both queue entries and the record buffers they
 * point at, and M_AUDIT_PIPE_PRESELECT backs per-auid preselection entries.
 */
static MALLOC_DEFINE(M_AUDIT_PIPE, "audit_pipe", "Audit pipes");
static MALLOC_DEFINE(M_AUDIT_PIPE_ENTRY, "audit_pipeent",
    "Audit pipe entries and buffers");
static MALLOC_DEFINE(M_AUDIT_PIPE_PRESELECT, "audit_pipe_presel",
    "Audit pipe preselection structure");
72 
/*
 * Audit pipe buffer parameters.  The limits are expressed in records, not
 * bytes: DEFAULT is the queue limit a newly allocated pipe starts with, and
 * MIN/MAX bound the values accepted by the AUDITPIPE_SET_QLIMIT ioctl.
 */
#define	AUDIT_PIPE_QLIMIT_DEFAULT	(128)
#define	AUDIT_PIPE_QLIMIT_MIN		(1)
#define	AUDIT_PIPE_QLIMIT_MAX		(1024)
79 
/*
 * Description of an entry in an audit_pipe.  Each entry owns a private copy
 * of one submitted record; both the entry and the copy are released by
 * audit_pipe_entry_free().
 */
struct audit_pipe_entry {
	void				*ape_record;	/* Copy of record. */
	u_int				 ape_record_len; /* Length in bytes. */
	TAILQ_ENTRY(audit_pipe_entry)	 ape_queue;	/* Link on ap_queue. */
};
88 
/*
 * Audit pipes allow processes to express "interest" in the set of records
 * that are delivered via the pipe.  They do this in a similar manner to the
 * mechanism for audit trail configuration, by expressing two global masks,
 * and optionally expressing per-auid masks.  The following data structure is
 * the per-auid mask description.  The global state is stored in the audit
 * pipe data structure.
 *
 * Entries live on the owning pipe's ap_preselect_list and are looked up by
 * linear search on app_auid.
 *
 * We may want to consider a more space/time-efficient data structure once
 * usage patterns for per-auid specifications are clear.
 */
struct audit_pipe_preselect {
	au_id_t					 app_auid;	/* Key. */
	au_mask_t				 app_mask;	/* Mask for auid. */
	TAILQ_ENTRY(audit_pipe_preselect)	 app_list;	/* List link. */
};
105 
/*
 * Description of an individual audit_pipe.  Consists largely of a bounded
 * length queue.
 *
 * ap_flags bits:
 *   AUDIT_PIPE_ASYNC - send SIGIO to the ap_sigio owner when a record is
 *                      appended.
 *   AUDIT_PIPE_NBIO  - read returns EAGAIN instead of sleeping when the
 *                      queue is empty.
 */
#define	AUDIT_PIPE_ASYNC	0x00000001
#define	AUDIT_PIPE_NBIO		0x00000002
struct audit_pipe {
	u_int				 ap_flags;

	struct selinfo			 ap_selinfo;
	struct sigio			*ap_sigio;

	/*
	 * Per-pipe mutex protecting most fields in this data structure.
	 */
	struct mtx			 ap_mtx;

	/*
	 * Per-pipe sleep lock serializing user-generated reads and flushes.
	 * uiomove() is called to copy out the current head record's data
	 * while the record remains in the queue, so we prevent other threads
	 * from removing it using this lock.
	 */
	struct sx			 ap_sx;

	/*
	 * Condition variable to signal when data has been delivered to a
	 * pipe.
	 */
	struct cv			 ap_cv;

	/*
	 * Various queue-related variables: qlen and qlimit are a count of
	 * records in the queue; qbyteslen is the number of bytes of data
	 * across all records, and qoffset is the amount read so far of the
	 * first record in the queue.  The number of bytes available for
	 * reading in the queue is qbyteslen - qoffset.
	 */
	u_int				 ap_qlen;
	u_int				 ap_qlimit;
	u_int				 ap_qbyteslen;
	u_int				 ap_qoffset;

	/*
	 * Per-pipe operation statistics.
	 */
	u_int64_t			 ap_inserts;	/* Records added. */
	u_int64_t			 ap_reads;	/* Records read. */
	u_int64_t			 ap_drops;	/* Records dropped. */

	/*
	 * Fields relating to pipe interest: global masks for unmatched
	 * processes (attributable, non-attributable), and a list of specific
	 * interest specifications by auid.
	 */
	int				 ap_preselect_mode;
	au_mask_t			 ap_preselect_flags;
	au_mask_t			 ap_preselect_naflags;
	TAILQ_HEAD(, audit_pipe_preselect)	ap_preselect_list;

	/*
	 * Current pending record list.  Protected by a combination of ap_mtx
	 * and ap_sx.  Note particularly that *both* locks are required to
	 * remove a record from the head of the queue, as an in-progress read
	 * may sleep while copying and therefore cannot hold ap_mtx.
	 */
	TAILQ_HEAD(, audit_pipe_entry)	 ap_queue;

	/*
	 * Global pipe list.
	 */
	TAILQ_ENTRY(audit_pipe)		 ap_list;
};
179 
/*
 * Wrappers for the per-pipe mutex (ap_mtx).  AUDIT_PIPE_MTX() yields the raw
 * mutex pointer for interfaces that take one directly, such as cv_wait_sig()
 * and knlist_init_mtx().
 */
#define	AUDIT_PIPE_LOCK(ap)		mtx_lock(&(ap)->ap_mtx)
#define	AUDIT_PIPE_LOCK_ASSERT(ap)	mtx_assert(&(ap)->ap_mtx, MA_OWNED)
#define	AUDIT_PIPE_LOCK_DESTROY(ap)	mtx_destroy(&(ap)->ap_mtx)
#define	AUDIT_PIPE_LOCK_INIT(ap)	mtx_init(&(ap)->ap_mtx, \
					    "audit_pipe_mtx", NULL, MTX_DEF)
#define	AUDIT_PIPE_UNLOCK(ap)		mtx_unlock(&(ap)->ap_mtx)
#define	AUDIT_PIPE_MTX(ap)		(&(ap)->ap_mtx)

/*
 * Wrappers for the per-pipe sx lock (ap_sx).  Only exclusive acquisition is
 * provided, and acquisition is interruptible: AUDIT_PIPE_SX_XLOCK_SIG()
 * returns non-zero if the lock was not obtained.
 */
#define	AUDIT_PIPE_SX_LOCK_DESTROY(ap)	sx_destroy(&(ap)->ap_sx)
#define	AUDIT_PIPE_SX_LOCK_INIT(ap)	sx_init(&(ap)->ap_sx, "audit_pipe_sx")
#define	AUDIT_PIPE_SX_XLOCK_ASSERT(ap)	sx_assert(&(ap)->ap_sx, SA_XLOCKED)
#define	AUDIT_PIPE_SX_XLOCK_SIG(ap)	sx_xlock_sig(&(ap)->ap_sx)
#define	AUDIT_PIPE_SX_XUNLOCK(ap)	sx_xunlock(&(ap)->ap_sx)
193 
/*
 * Global list of audit pipes, rwlock to protect it.  Individual record
 * queues on pipes are protected by per-pipe locks; these locks synchronize
 * between threads walking the list to deliver to individual pipes and add/
 * remove of pipes, and are mostly acquired for read.
 *
 * The submit and preselect paths additionally peek at the list head without
 * the lock as a fast path for the case where no pipes are open.
 */
static TAILQ_HEAD(, audit_pipe)	 audit_pipe_list;
static struct rwlock		 audit_pipe_lock;

#define	AUDIT_PIPE_LIST_LOCK_INIT()	rw_init(&audit_pipe_lock, \
					    "audit_pipe_list_lock")
#define	AUDIT_PIPE_LIST_LOCK_DESTROY()	rw_destroy(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_RLOCK()		rw_rlock(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_RUNLOCK()	rw_runlock(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_WLOCK()		rw_wlock(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_WLOCK_ASSERT()	rw_assert(&audit_pipe_lock, \
					    RA_WLOCKED)
#define	AUDIT_PIPE_LIST_WUNLOCK()	rw_wunlock(&audit_pipe_lock)
212 
/*
 * Audit pipe device.  A single device node is created by audit_pipe_init();
 * per-open state is attached to each open via devfs cdevpriv rather than to
 * the node itself.
 */
static struct cdev	*audit_pipe_dev;

/* Name of the device node and of the cdevsw entry. */
#define AUDIT_PIPE_NAME	"auditpipe"
219 
/*
 * Special device methods and definition.  No d_close or d_write method is
 * supplied: per-open teardown is performed by the cdevpriv destructor
 * registered in audit_pipe_open().
 */
static d_open_t		audit_pipe_open;
static d_read_t		audit_pipe_read;
static d_ioctl_t	audit_pipe_ioctl;
static d_poll_t		audit_pipe_poll;
static d_kqfilter_t	audit_pipe_kqfilter;

static struct cdevsw	audit_pipe_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	audit_pipe_open,
	.d_read =	audit_pipe_read,
	.d_ioctl =	audit_pipe_ioctl,
	.d_poll =	audit_pipe_poll,
	.d_kqfilter =	audit_pipe_kqfilter,
	.d_name =	AUDIT_PIPE_NAME,
};
238 
static int	audit_pipe_kqread(struct knote *note, long hint);
static void	audit_pipe_kqdetach(struct knote *note);

/*
 * kqueue filter operations for EVFILT_READ on an audit pipe.  There is no
 * f_attach method; the knote is added to the pipe's knlist directly by
 * audit_pipe_kqfilter().
 */
static struct filterops audit_pipe_read_filterops = {
	.f_isfd =	1,
	.f_attach =	NULL,
	.f_detach =	audit_pipe_kqdetach,
	.f_event =	audit_pipe_kqread,
};
248 
/*
 * Some global statistics on audit pipes.  audit_pipe_count and
 * audit_pipe_ever are updated with the global list lock write-held.
 * audit_pipe_records is incremented without any lock, and audit_pipe_drops
 * is incremented with only a per-pipe mutex held, so both are approximate
 * when several pipes or submitters are active.
 */
static int		audit_pipe_count;	/* Current number of pipes. */
static u_int64_t	audit_pipe_ever;	/* Pipes ever allocated. */
static u_int64_t	audit_pipe_records;	/* Records seen. */
static u_int64_t	audit_pipe_drops;	/* Global record drop count. */
256 
257 /*
258  * Free an audit pipe entry.
259  */
260 static void
261 audit_pipe_entry_free(struct audit_pipe_entry *ape)
262 {
263 
264 	free(ape->ape_record, M_AUDIT_PIPE_ENTRY);
265 	free(ape, M_AUDIT_PIPE_ENTRY);
266 }
267 
268 /*
269  * Find an audit pipe preselection specification for an auid, if any.
270  */
271 static struct audit_pipe_preselect *
272 audit_pipe_preselect_find(struct audit_pipe *ap, au_id_t auid)
273 {
274 	struct audit_pipe_preselect *app;
275 
276 	AUDIT_PIPE_LOCK_ASSERT(ap);
277 
278 	TAILQ_FOREACH(app, &ap->ap_preselect_list, app_list) {
279 		if (app->app_auid == auid)
280 			return (app);
281 	}
282 	return (NULL);
283 }
284 
285 /*
286  * Query the per-pipe mask for a specific auid.
287  */
288 static int
289 audit_pipe_preselect_get(struct audit_pipe *ap, au_id_t auid,
290     au_mask_t *maskp)
291 {
292 	struct audit_pipe_preselect *app;
293 	int error;
294 
295 	AUDIT_PIPE_LOCK(ap);
296 	app = audit_pipe_preselect_find(ap, auid);
297 	if (app != NULL) {
298 		*maskp = app->app_mask;
299 		error = 0;
300 	} else
301 		error = ENOENT;
302 	AUDIT_PIPE_UNLOCK(ap);
303 	return (error);
304 }
305 
306 /*
307  * Set the per-pipe mask for a specific auid.  Add a new entry if needed;
308  * otherwise, update the current entry.
309  */
310 static void
311 audit_pipe_preselect_set(struct audit_pipe *ap, au_id_t auid, au_mask_t mask)
312 {
313 	struct audit_pipe_preselect *app, *app_new;
314 
315 	/*
316 	 * Pessimistically assume that the auid doesn't already have a mask
317 	 * set, and allocate.  We will free it if it is unneeded.
318 	 */
319 	app_new = malloc(sizeof(*app_new), M_AUDIT_PIPE_PRESELECT, M_WAITOK);
320 	AUDIT_PIPE_LOCK(ap);
321 	app = audit_pipe_preselect_find(ap, auid);
322 	if (app == NULL) {
323 		app = app_new;
324 		app_new = NULL;
325 		app->app_auid = auid;
326 		TAILQ_INSERT_TAIL(&ap->ap_preselect_list, app, app_list);
327 	}
328 	app->app_mask = mask;
329 	AUDIT_PIPE_UNLOCK(ap);
330 	if (app_new != NULL)
331 		free(app_new, M_AUDIT_PIPE_PRESELECT);
332 }
333 
334 /*
335  * Delete a per-auid mask on an audit pipe.
336  */
337 static int
338 audit_pipe_preselect_delete(struct audit_pipe *ap, au_id_t auid)
339 {
340 	struct audit_pipe_preselect *app;
341 	int error;
342 
343 	AUDIT_PIPE_LOCK(ap);
344 	app = audit_pipe_preselect_find(ap, auid);
345 	if (app != NULL) {
346 		TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list);
347 		error = 0;
348 	} else
349 		error = ENOENT;
350 	AUDIT_PIPE_UNLOCK(ap);
351 	if (app != NULL)
352 		free(app, M_AUDIT_PIPE_PRESELECT);
353 	return (error);
354 }
355 
356 /*
357  * Delete all per-auid masks on an audit pipe.
358  */
359 static void
360 audit_pipe_preselect_flush_locked(struct audit_pipe *ap)
361 {
362 	struct audit_pipe_preselect *app;
363 
364 	AUDIT_PIPE_LOCK_ASSERT(ap);
365 
366 	while ((app = TAILQ_FIRST(&ap->ap_preselect_list)) != NULL) {
367 		TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list);
368 		free(app, M_AUDIT_PIPE_PRESELECT);
369 	}
370 }
371 
/*
 * Delete all per-auid masks on an audit pipe, acquiring and releasing the
 * pipe mutex around the operation.  Used by the
 * AUDITPIPE_FLUSH_PRESELECT_AUID ioctl.
 */
static void
audit_pipe_preselect_flush(struct audit_pipe *ap)
{

	AUDIT_PIPE_LOCK(ap);
	audit_pipe_preselect_flush_locked(ap);
	AUDIT_PIPE_UNLOCK(ap);
}
380 
381 /*-
382  * Determine whether a specific audit pipe matches a record with these
383  * properties.  Algorithm is as follows:
384  *
385  * - If the pipe is configured to track the default trail configuration, then
386  *   use the results of global preselection matching.
387  * - If not, search for a specifically configured auid entry matching the
388  *   event.  If an entry is found, use that.
389  * - Otherwise, use the default flags or naflags configured for the pipe.
390  */
391 static int
392 audit_pipe_preselect_check(struct audit_pipe *ap, au_id_t auid,
393     au_event_t event, au_class_t class, int sorf, int trail_preselect)
394 {
395 	struct audit_pipe_preselect *app;
396 
397 	AUDIT_PIPE_LOCK_ASSERT(ap);
398 
399 	switch (ap->ap_preselect_mode) {
400 	case AUDITPIPE_PRESELECT_MODE_TRAIL:
401 		return (trail_preselect);
402 
403 	case AUDITPIPE_PRESELECT_MODE_LOCAL:
404 		app = audit_pipe_preselect_find(ap, auid);
405 		if (app == NULL) {
406 			if (auid == AU_DEFAUDITID)
407 				return (au_preselect(event, class,
408 				    &ap->ap_preselect_naflags, sorf));
409 			else
410 				return (au_preselect(event, class,
411 				    &ap->ap_preselect_flags, sorf));
412 		} else
413 			return (au_preselect(event, class, &app->app_mask,
414 			    sorf));
415 
416 	default:
417 		panic("audit_pipe_preselect_check: mode %d",
418 		    ap->ap_preselect_mode);
419 	}
420 
421 	return (0);
422 }
423 
424 /*
425  * Determine whether there exists a pipe interested in a record with specific
426  * properties.
427  */
428 int
429 audit_pipe_preselect(au_id_t auid, au_event_t event, au_class_t class,
430     int sorf, int trail_preselect)
431 {
432 	struct audit_pipe *ap;
433 
434 	/* Lockless read to avoid acquiring the global lock if not needed. */
435 	if (TAILQ_EMPTY(&audit_pipe_list))
436 		return (0);
437 
438 	AUDIT_PIPE_LIST_RLOCK();
439 	TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) {
440 		AUDIT_PIPE_LOCK(ap);
441 		if (audit_pipe_preselect_check(ap, auid, event, class, sorf,
442 		    trail_preselect)) {
443 			AUDIT_PIPE_UNLOCK(ap);
444 			AUDIT_PIPE_LIST_RUNLOCK();
445 			return (1);
446 		}
447 		AUDIT_PIPE_UNLOCK(ap);
448 	}
449 	AUDIT_PIPE_LIST_RUNLOCK();
450 	return (0);
451 }
452 
453 /*
454  * Append individual record to a queue -- allocate queue-local buffer, and
455  * add to the queue.  If the queue is full or we can't allocate memory, drop
456  * the newest record.
457  */
458 static void
459 audit_pipe_append(struct audit_pipe *ap, void *record, u_int record_len)
460 {
461 	struct audit_pipe_entry *ape;
462 
463 	AUDIT_PIPE_LOCK_ASSERT(ap);
464 
465 	if (ap->ap_qlen >= ap->ap_qlimit) {
466 		ap->ap_drops++;
467 		audit_pipe_drops++;
468 		return;
469 	}
470 
471 	ape = malloc(sizeof(*ape), M_AUDIT_PIPE_ENTRY, M_NOWAIT | M_ZERO);
472 	if (ape == NULL) {
473 		ap->ap_drops++;
474 		audit_pipe_drops++;
475 		return;
476 	}
477 
478 	ape->ape_record = malloc(record_len, M_AUDIT_PIPE_ENTRY, M_NOWAIT);
479 	if (ape->ape_record == NULL) {
480 		free(ape, M_AUDIT_PIPE_ENTRY);
481 		ap->ap_drops++;
482 		audit_pipe_drops++;
483 		return;
484 	}
485 
486 	bcopy(record, ape->ape_record, record_len);
487 	ape->ape_record_len = record_len;
488 
489 	TAILQ_INSERT_TAIL(&ap->ap_queue, ape, ape_queue);
490 	ap->ap_inserts++;
491 	ap->ap_qlen++;
492 	ap->ap_qbyteslen += ape->ape_record_len;
493 	selwakeuppri(&ap->ap_selinfo, PSOCK);
494 	KNOTE_LOCKED(&ap->ap_selinfo.si_note, 0);
495 	if (ap->ap_flags & AUDIT_PIPE_ASYNC)
496 		pgsigio(&ap->ap_sigio, SIGIO, 0);
497 	cv_broadcast(&ap->ap_cv);
498 }
499 
500 /*
501  * audit_pipe_submit(): audit_worker submits audit records via this
502  * interface, which arranges for them to be delivered to pipe queues.
503  */
504 void
505 audit_pipe_submit(au_id_t auid, au_event_t event, au_class_t class, int sorf,
506     int trail_select, void *record, u_int record_len)
507 {
508 	struct audit_pipe *ap;
509 
510 	/*
511 	 * Lockless read to avoid lock overhead if pipes are not in use.
512 	 */
513 	if (TAILQ_FIRST(&audit_pipe_list) == NULL)
514 		return;
515 
516 	AUDIT_PIPE_LIST_RLOCK();
517 	TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) {
518 		AUDIT_PIPE_LOCK(ap);
519 		if (audit_pipe_preselect_check(ap, auid, event, class, sorf,
520 		    trail_select))
521 			audit_pipe_append(ap, record, record_len);
522 		AUDIT_PIPE_UNLOCK(ap);
523 	}
524 	AUDIT_PIPE_LIST_RUNLOCK();
525 
526 	/* Unlocked increment. */
527 	audit_pipe_records++;
528 }
529 
530 /*
531  * audit_pipe_submit_user(): the same as audit_pipe_submit(), except that
532  * since we don't currently have selection information available, it is
533  * delivered to the pipe unconditionally.
534  *
535  * XXXRW: This is a bug.  The BSM check routine for submitting a user record
536  * should parse that information and return it.
537  */
538 void
539 audit_pipe_submit_user(void *record, u_int record_len)
540 {
541 	struct audit_pipe *ap;
542 
543 	/*
544 	 * Lockless read to avoid lock overhead if pipes are not in use.
545 	 */
546 	if (TAILQ_FIRST(&audit_pipe_list) == NULL)
547 		return;
548 
549 	AUDIT_PIPE_LIST_RLOCK();
550 	TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) {
551 		AUDIT_PIPE_LOCK(ap);
552 		audit_pipe_append(ap, record, record_len);
553 		AUDIT_PIPE_UNLOCK(ap);
554 	}
555 	AUDIT_PIPE_LIST_RUNLOCK();
556 
557 	/* Unlocked increment. */
558 	audit_pipe_records++;
559 }
560 
/*
 * Allocate a new audit pipe.  Connects the pipe, on success, to the global
 * list and updates statistics.
 *
 * Returns the new pipe, or NULL if memory could not be allocated (the
 * allocation is M_NOWAIT).
 */
static struct audit_pipe *
audit_pipe_alloc(void)
{
	struct audit_pipe *ap;

	ap = malloc(sizeof(*ap), M_AUDIT_PIPE, M_NOWAIT | M_ZERO);
	if (ap == NULL)
		return (NULL);
	ap->ap_qlimit = AUDIT_PIPE_QLIMIT_DEFAULT;
	TAILQ_INIT(&ap->ap_queue);
	/* The knote list is tied to the per-pipe mutex. */
	knlist_init_mtx(&ap->ap_selinfo.si_note, AUDIT_PIPE_MTX(ap));
	AUDIT_PIPE_LOCK_INIT(ap);
	AUDIT_PIPE_SX_LOCK_INIT(ap);
	cv_init(&ap->ap_cv, "audit_pipe");

	/*
	 * Default flags, naflags, and auid-specific preselection settings to
	 * 0.  Initialize the mode to the global trail so that if praudit(1)
	 * is run on /dev/auditpipe, it sees events associated with the
	 * default trail.  Pipe-aware application can clear the flag, set
	 * custom masks, and flush the pipe as needed.
	 *
	 * The structure was allocated with M_ZERO, so the bzero() calls are
	 * belt-and-braces rather than strictly required.
	 */
	bzero(&ap->ap_preselect_flags, sizeof(ap->ap_preselect_flags));
	bzero(&ap->ap_preselect_naflags, sizeof(ap->ap_preselect_naflags));
	TAILQ_INIT(&ap->ap_preselect_list);
	ap->ap_preselect_mode = AUDITPIPE_PRESELECT_MODE_TRAIL;

	/*
	 * Add to global list and update global statistics.  From this point
	 * on the submit paths can see the pipe and start queueing records.
	 */
	AUDIT_PIPE_LIST_WLOCK();
	TAILQ_INSERT_HEAD(&audit_pipe_list, ap, ap_list);
	audit_pipe_count++;
	audit_pipe_ever++;
	AUDIT_PIPE_LIST_WUNLOCK();

	return (ap);
}
603 
604 /*
605  * Flush all records currently present in an audit pipe; assume mutex is held.
606  */
607 static void
608 audit_pipe_flush(struct audit_pipe *ap)
609 {
610 	struct audit_pipe_entry *ape;
611 
612 	AUDIT_PIPE_LOCK_ASSERT(ap);
613 
614 	while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL) {
615 		TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue);
616 		ap->ap_qbyteslen -= ape->ape_record_len;
617 		audit_pipe_entry_free(ape);
618 		ap->ap_qlen--;
619 	}
620 	ap->ap_qoffset = 0;
621 
622 	KASSERT(ap->ap_qlen == 0, ("audit_pipe_free: ap_qbyteslen"));
623 	KASSERT(ap->ap_qbyteslen == 0, ("audit_pipe_flush: ap_qbyteslen"));
624 }
625 
/*
 * Free an audit pipe; this means freeing all preselection state and all
 * records in the pipe.  Assumes global write lock and pipe mutex are held to
 * prevent any new records from being inserted during the free, and that the
 * audit pipe is still on the global list.
 *
 * Note that the pipe mutex is destroyed here while still held, and the pipe
 * memory is released, so the caller must not unlock or otherwise touch the
 * pipe afterwards.  The global list lock remains held on return.
 */
static void
audit_pipe_free(struct audit_pipe *ap)
{

	AUDIT_PIPE_LIST_WLOCK_ASSERT();
	AUDIT_PIPE_LOCK_ASSERT(ap);

	/* Release everything hanging off the pipe. */
	audit_pipe_preselect_flush_locked(ap);
	audit_pipe_flush(ap);
	/* Tear down synchronization primitives and notification state. */
	cv_destroy(&ap->ap_cv);
	AUDIT_PIPE_SX_LOCK_DESTROY(ap);
	AUDIT_PIPE_LOCK_DESTROY(ap);
	seldrain(&ap->ap_selinfo);
	knlist_destroy(&ap->ap_selinfo.si_note);
	/* Unhook from the global list before the memory goes away. */
	TAILQ_REMOVE(&audit_pipe_list, ap, ap_list);
	free(ap, M_AUDIT_PIPE);
	audit_pipe_count--;
}
650 
/*
 * Destructor for per-open pipe state.  Registered with
 * devfs_set_cdevpriv() in audit_pipe_open(), and also called directly from
 * there if registration fails.  arg is the struct audit_pipe to destroy.
 */
static void
audit_pipe_dtor(void *arg)
{
	struct audit_pipe *ap;

	ap = arg;
	/* Drop SIGIO ownership before taking any pipe locks. */
	funsetown(&ap->ap_sigio);
	AUDIT_PIPE_LIST_WLOCK();
	AUDIT_PIPE_LOCK(ap);
	/*
	 * audit_pipe_free() destroys the pipe mutex and frees the pipe, so
	 * there is deliberately no matching AUDIT_PIPE_UNLOCK() here.
	 */
	audit_pipe_free(ap);
	AUDIT_PIPE_LIST_WUNLOCK();
}
663 
664 /*
665  * Audit pipe open method.  Explicit privilege check isn't used as this
666  * allows file permissions on the special device to be used to grant audit
667  * review access.  Those file permissions should be managed carefully.
668  */
669 static int
670 audit_pipe_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
671 {
672 	struct audit_pipe *ap;
673 	int error;
674 
675 	ap = audit_pipe_alloc();
676 	if (ap == NULL)
677 		return (ENOMEM);
678 	fsetown(td->td_proc->p_pid, &ap->ap_sigio);
679 	error = devfs_set_cdevpriv(ap, audit_pipe_dtor);
680 	if (error != 0)
681 		audit_pipe_dtor(ap);
682 	return (error);
683 }
684 
/*
 * Audit pipe ioctl() routine.  Handle file descriptor and audit pipe layer
 * commands.
 *
 * The pipe operated on is the cdevpriv data of the current open.  data
 * points at the ioctl argument, whose layout depends on cmd.  Returns 0 on
 * success, the error from devfs_get_cdevpriv() if no pipe is attached,
 * EINVAL for out-of-range arguments, ENOENT from the per-auid lookups,
 * EINTR if AUDITPIPE_FLUSH is interrupted while waiting for the sx lock,
 * and ENOTTY for unrecognized commands.
 */
static int
audit_pipe_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
    struct thread *td)
{
	struct auditpipe_ioctl_preselect *aip;
	struct audit_pipe *ap;
	au_mask_t *maskp;
	int error, mode;
	au_id_t auid;

	error = devfs_get_cdevpriv((void **)&ap);
	if (error != 0)
		return (error);

	/*
	 * Audit pipe ioctls: first come standard device node ioctls, then
	 * manipulation of pipe settings, and finally, statistics query
	 * ioctls.
	 */
	switch (cmd) {
	case FIONBIO:
		AUDIT_PIPE_LOCK(ap);
		if (*(int *)data)
			ap->ap_flags |= AUDIT_PIPE_NBIO;
		else
			ap->ap_flags &= ~AUDIT_PIPE_NBIO;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case FIONREAD:
		/* Unread bytes: total queued minus what was read of the head. */
		AUDIT_PIPE_LOCK(ap);
		*(int *)data = ap->ap_qbyteslen - ap->ap_qoffset;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case FIOASYNC:
		AUDIT_PIPE_LOCK(ap);
		if (*(int *)data)
			ap->ap_flags |= AUDIT_PIPE_ASYNC;
		else
			ap->ap_flags &= ~AUDIT_PIPE_ASYNC;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case FIOSETOWN:
		error = fsetown(*(int *)data, &ap->ap_sigio);
		break;

	case FIOGETOWN:
		*(int *)data = fgetown(&ap->ap_sigio);
		error = 0;
		break;

	case AUDITPIPE_GET_QLEN:
		/* Lockless integer read. */
		*(u_int *)data = ap->ap_qlen;
		error = 0;
		break;

	case AUDITPIPE_GET_QLIMIT:
		/* Lockless integer read. */
		*(u_int *)data = ap->ap_qlimit;
		error = 0;
		break;

	case AUDITPIPE_SET_QLIMIT:
		/* Lockless integer write. */
		if (*(u_int *)data >= AUDIT_PIPE_QLIMIT_MIN &&
		    *(u_int *)data <= AUDIT_PIPE_QLIMIT_MAX) {
			ap->ap_qlimit = *(u_int *)data;
			error = 0;
		} else
			error = EINVAL;
		break;

	case AUDITPIPE_GET_QLIMIT_MIN:
		*(u_int *)data = AUDIT_PIPE_QLIMIT_MIN;
		error = 0;
		break;

	case AUDITPIPE_GET_QLIMIT_MAX:
		*(u_int *)data = AUDIT_PIPE_QLIMIT_MAX;
		error = 0;
		break;

	case AUDITPIPE_GET_PRESELECT_FLAGS:
		AUDIT_PIPE_LOCK(ap);
		maskp = (au_mask_t *)data;
		*maskp = ap->ap_preselect_flags;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case AUDITPIPE_SET_PRESELECT_FLAGS:
		AUDIT_PIPE_LOCK(ap);
		maskp = (au_mask_t *)data;
		ap->ap_preselect_flags = *maskp;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case AUDITPIPE_GET_PRESELECT_NAFLAGS:
		AUDIT_PIPE_LOCK(ap);
		maskp = (au_mask_t *)data;
		*maskp = ap->ap_preselect_naflags;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case AUDITPIPE_SET_PRESELECT_NAFLAGS:
		AUDIT_PIPE_LOCK(ap);
		maskp = (au_mask_t *)data;
		ap->ap_preselect_naflags = *maskp;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case AUDITPIPE_GET_PRESELECT_AUID:
		/* aip_auid is the lookup key; aip_mask receives the result. */
		aip = (struct auditpipe_ioctl_preselect *)data;
		error = audit_pipe_preselect_get(ap, aip->aip_auid,
		    &aip->aip_mask);
		break;

	case AUDITPIPE_SET_PRESELECT_AUID:
		aip = (struct auditpipe_ioctl_preselect *)data;
		audit_pipe_preselect_set(ap, aip->aip_auid, aip->aip_mask);
		error = 0;
		break;

	case AUDITPIPE_DELETE_PRESELECT_AUID:
		auid = *(au_id_t *)data;
		error = audit_pipe_preselect_delete(ap, auid);
		break;

	case AUDITPIPE_FLUSH_PRESELECT_AUID:
		audit_pipe_preselect_flush(ap);
		error = 0;
		break;

	case AUDITPIPE_GET_PRESELECT_MODE:
		AUDIT_PIPE_LOCK(ap);
		*(int *)data = ap->ap_preselect_mode;
		AUDIT_PIPE_UNLOCK(ap);
		error = 0;
		break;

	case AUDITPIPE_SET_PRESELECT_MODE:
		/*
		 * Only the two known modes are accepted; anything else
		 * would later trip the panic in the preselection check.
		 */
		mode = *(int *)data;
		switch (mode) {
		case AUDITPIPE_PRESELECT_MODE_TRAIL:
		case AUDITPIPE_PRESELECT_MODE_LOCAL:
			AUDIT_PIPE_LOCK(ap);
			ap->ap_preselect_mode = mode;
			AUDIT_PIPE_UNLOCK(ap);
			error = 0;
			break;

		default:
			error = EINVAL;
		}
		break;

	case AUDITPIPE_FLUSH:
		/*
		 * Take the sx lock first and the mutex second, the same
		 * order as the read path, so a flush cannot remove the
		 * record an in-progress read is copying out.
		 */
		if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0)
			return (EINTR);
		AUDIT_PIPE_LOCK(ap);
		audit_pipe_flush(ap);
		AUDIT_PIPE_UNLOCK(ap);
		AUDIT_PIPE_SX_XUNLOCK(ap);
		error = 0;
		break;

	case AUDITPIPE_GET_MAXAUDITDATA:
		*(u_int *)data = MAXAUDITDATA;
		error = 0;
		break;

	/*
	 * Statistics queries; all are lockless reads.
	 *
	 * NOTE(review): the ap_inserts/ap_reads/ap_drops counters are
	 * u_int64_t but are stored through a u_int pointer here, which
	 * truncates them.  If these ioctls are declared with a 64-bit
	 * argument, only part of the buffer is written -- confirm against
	 * the ioctl definitions in audit_ioctl.h.
	 */
	case AUDITPIPE_GET_INSERTS:
		*(u_int *)data = ap->ap_inserts;
		error = 0;
		break;

	case AUDITPIPE_GET_READS:
		*(u_int *)data = ap->ap_reads;
		error = 0;
		break;

	case AUDITPIPE_GET_DROPS:
		*(u_int *)data = ap->ap_drops;
		error = 0;
		break;

	case AUDITPIPE_GET_TRUNCATES:
		/* No truncation counter is kept; always reports zero. */
		*(u_int *)data = 0;
		error = 0;
		break;

	default:
		error = ENOTTY;
	}
	return (error);
}
892 
/*
 * Audit pipe read.  Read one or more partial or complete records to user
 * memory.
 *
 * Blocks until at least one record is queued unless the pipe is in
 * non-blocking mode.  Returns 0 on success, the error from
 * devfs_get_cdevpriv() if no pipe is attached, EINTR if acquiring the sx
 * lock is interrupted, EAGAIN if the pipe is non-blocking and empty, or the
 * error returned by cv_wait_sig() or uiomove().
 */
static int
audit_pipe_read(struct cdev *dev, struct uio *uio, int flag)
{
	struct audit_pipe_entry *ape;
	struct audit_pipe *ap;
	u_int toread;
	int error;

	error = devfs_get_cdevpriv((void **)&ap);
	if (error != 0)
		return (error);

	/*
	 * We hold an sx(9) lock over read and flush because we rely on the
	 * stability of a record in the queue during uiomove(9).
	 */
	if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0)
		return (EINTR);
	AUDIT_PIPE_LOCK(ap);
	while (TAILQ_EMPTY(&ap->ap_queue)) {
		if (ap->ap_flags & AUDIT_PIPE_NBIO) {
			AUDIT_PIPE_UNLOCK(ap);
			AUDIT_PIPE_SX_XUNLOCK(ap);
			return (EAGAIN);
		}
		/* Sleeps on the pipe mutex; the sx lock stays held. */
		error = cv_wait_sig(&ap->ap_cv, AUDIT_PIPE_MTX(ap));
		if (error) {
			AUDIT_PIPE_UNLOCK(ap);
			AUDIT_PIPE_SX_XUNLOCK(ap);
			return (error);
		}
	}

	/*
	 * Copy as many remaining bytes from the current record to userspace
	 * as we can.  Keep processing records until we run out of records in
	 * the queue, or until the user buffer runs out of space.
	 *
	 * Note: we rely on the SX lock to maintain ape's stability here.
	 */
	/* Incremented once per read call that reaches this point. */
	ap->ap_reads++;
	while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL &&
	    uio->uio_resid > 0) {
		AUDIT_PIPE_LOCK_ASSERT(ap);

		KASSERT(ape->ape_record_len > ap->ap_qoffset,
		    ("audit_pipe_read: record_len > qoffset (1)"));
		toread = MIN(ape->ape_record_len - ap->ap_qoffset,
		    uio->uio_resid);
		/*
		 * Drop the mutex across the copy-out.  Appends may still
		 * happen at the tail, but the head entry cannot be removed
		 * because we hold the sx lock.
		 */
		AUDIT_PIPE_UNLOCK(ap);
		error = uiomove((char *)ape->ape_record + ap->ap_qoffset,
		    toread, uio);
		if (error) {
			/* Mutex already released above; only the sx remains. */
			AUDIT_PIPE_SX_XUNLOCK(ap);
			return (error);
		}

		/*
		 * If the copy succeeded, update book-keeping, and if no
		 * bytes remain in the current record, free it.
		 */
		AUDIT_PIPE_LOCK(ap);
		KASSERT(TAILQ_FIRST(&ap->ap_queue) == ape,
		    ("audit_pipe_read: queue out of sync after uiomove"));
		ap->ap_qoffset += toread;
		KASSERT(ape->ape_record_len >= ap->ap_qoffset,
		    ("audit_pipe_read: record_len >= qoffset (2)"));
		if (ap->ap_qoffset == ape->ape_record_len) {
			TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue);
			ap->ap_qbyteslen -= ape->ape_record_len;
			audit_pipe_entry_free(ape);
			ap->ap_qlen--;
			ap->ap_qoffset = 0;
		}
	}
	AUDIT_PIPE_UNLOCK(ap);
	AUDIT_PIPE_SX_XUNLOCK(ap);
	return (0);
}
976 
977 /*
978  * Audit pipe poll.
979  */
980 static int
981 audit_pipe_poll(struct cdev *dev, int events, struct thread *td)
982 {
983 	struct audit_pipe *ap;
984 	int error, revents;
985 
986 	revents = 0;
987 	error = devfs_get_cdevpriv((void **)&ap);
988 	if (error != 0)
989 		return (error);
990 	if (events & (POLLIN | POLLRDNORM)) {
991 		AUDIT_PIPE_LOCK(ap);
992 		if (TAILQ_FIRST(&ap->ap_queue) != NULL)
993 			revents |= events & (POLLIN | POLLRDNORM);
994 		else
995 			selrecord(td, &ap->ap_selinfo);
996 		AUDIT_PIPE_UNLOCK(ap);
997 	}
998 	return (revents);
999 }
1000 
1001 /*
1002  * Audit pipe kqfilter.
1003  */
1004 static int
1005 audit_pipe_kqfilter(struct cdev *dev, struct knote *kn)
1006 {
1007 	struct audit_pipe *ap;
1008 	int error;
1009 
1010 	error = devfs_get_cdevpriv((void **)&ap);
1011 	if (error != 0)
1012 		return (error);
1013 	if (kn->kn_filter != EVFILT_READ)
1014 		return (EINVAL);
1015 
1016 	kn->kn_fop = &audit_pipe_read_filterops;
1017 	kn->kn_hook = ap;
1018 
1019 	AUDIT_PIPE_LOCK(ap);
1020 	knlist_add(&ap->ap_selinfo.si_note, kn, 1);
1021 	AUDIT_PIPE_UNLOCK(ap);
1022 	return (0);
1023 }
1024 
1025 /*
1026  * Return true if there are records available for reading on the pipe.
1027  */
1028 static int
1029 audit_pipe_kqread(struct knote *kn, long hint)
1030 {
1031 	struct audit_pipe *ap;
1032 
1033 	ap = (struct audit_pipe *)kn->kn_hook;
1034 	AUDIT_PIPE_LOCK_ASSERT(ap);
1035 
1036 	if (ap->ap_qlen != 0) {
1037 		kn->kn_data = ap->ap_qbyteslen - ap->ap_qoffset;
1038 		return (1);
1039 	} else {
1040 		kn->kn_data = 0;
1041 		return (0);
1042 	}
1043 }
1044 
1045 /*
1046  * Detach kqueue state from audit pipe.
1047  */
1048 static void
1049 audit_pipe_kqdetach(struct knote *kn)
1050 {
1051 	struct audit_pipe *ap;
1052 
1053 	ap = (struct audit_pipe *)kn->kn_hook;
1054 	AUDIT_PIPE_LOCK(ap);
1055 	knlist_remove(&ap->ap_selinfo.si_note, kn, 1);
1056 	AUDIT_PIPE_UNLOCK(ap);
1057 }
1058 
/*
 * Initialize the audit pipe system: set up the global pipe list and its
 * lock, then create the device node, owned by root:wheel with mode 0600.
 * Failure to create the node is treated as fatal.
 */
static void
audit_pipe_init(void *unused)
{

	TAILQ_INIT(&audit_pipe_list);
	AUDIT_PIPE_LIST_LOCK_INIT();
	audit_pipe_dev = make_dev(&audit_pipe_cdevsw, 0, UID_ROOT,
		GID_WHEEL, 0600, "%s", AUDIT_PIPE_NAME);
	if (audit_pipe_dev == NULL) {
		AUDIT_PIPE_LIST_LOCK_DESTROY();
		panic("Can't initialize audit pipe subsystem");
	}
}

/* Run audit_pipe_init() at boot, during the SI_SUB_DRIVERS stage. */
SYSINIT(audit_pipe_init, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, audit_pipe_init,
    NULL);
1078