xref: /freebsd/sys/security/audit/audit_pipe.c (revision 4b15965daa99044daf184221b7c283bf7f2d7e66)
1 /*-
2  * Copyright (c) 2006 Robert N. M. Watson
3  * Copyright (c) 2008-2009 Apple, Inc.
4  * All rights reserved.
5  *
6  * This software was developed by Robert Watson for the TrustedBSD Project.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/param.h>
31 #include <sys/condvar.h>
32 #include <sys/conf.h>
33 #include <sys/eventhandler.h>
34 #include <sys/filio.h>
35 #include <sys/kernel.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/poll.h>
40 #include <sys/proc.h>
41 #include <sys/queue.h>
42 #include <sys/rwlock.h>
43 #include <sys/selinfo.h>
44 #include <sys/sigio.h>
45 #include <sys/signal.h>
46 #include <sys/signalvar.h>
47 #include <sys/sx.h>
48 #include <sys/systm.h>
49 #include <sys/uio.h>
50 
51 #include <security/audit/audit.h>
52 #include <security/audit/audit_ioctl.h>
53 #include <security/audit/audit_private.h>
54 
/*
 * Implementation of a special device providing a live stream of BSM audit
 * data.  Each open of the device gets its own audit pipe, attached to the
 * open file as cdevpriv data.  Consumers receive a "tee" of the system audit
 * trail by default, but may also define alternative event selections using
 * ioctls.  This interface provides unreliable but timely access to audit
 * events: a record is dropped, rather than queued without bound, when a
 * pipe's queue is full or memory for it cannot be allocated.  Consumers
 * should be very careful to avoid introducing event cycles.
 */
62 
/*
 * Memory types.  M_AUDIT_PIPE backs struct audit_pipe itself,
 * M_AUDIT_PIPE_ENTRY backs both the queue entries and the record buffers
 * they point to, and M_AUDIT_PIPE_PRESELECT backs the per-auid preselection
 * masks.
 */
static MALLOC_DEFINE(M_AUDIT_PIPE, "audit_pipe", "Audit pipes");
static MALLOC_DEFINE(M_AUDIT_PIPE_ENTRY, "audit_pipeent",
    "Audit pipe entries and buffers");
static MALLOC_DEFINE(M_AUDIT_PIPE_PRESELECT, "audit_pipe_presel",
    "Audit pipe preselection structure");

/*
 * Audit pipe buffer parameters.  Queue limits count records, not bytes:
 * the limit given to a newly allocated pipe, and the inclusive bounds
 * accepted by the AUDITPIPE_SET_QLIMIT ioctl.
 */
#define	AUDIT_PIPE_QLIMIT_DEFAULT	(128)
#define	AUDIT_PIPE_QLIMIT_MIN		(1)
#define	AUDIT_PIPE_QLIMIT_MAX		(1024)
78 
/*
 * Description of an entry in an audit_pipe: one queued record.  Both the
 * entry and its record buffer are allocated from M_AUDIT_PIPE_ENTRY and are
 * released together by audit_pipe_entry_free().
 */
struct audit_pipe_entry {
	/* Private copy of the record bytes, made when it is queued. */
	void				*ape_record;
	/* Length of ape_record in bytes. */
	u_int				 ape_record_len;
	/* Linkage on the owning pipe's ap_queue. */
	TAILQ_ENTRY(audit_pipe_entry)	 ape_queue;
};
87 
/*
 * Audit pipes allow processes to express "interest" in the set of records
 * that are delivered via the pipe.  They do this in a similar manner to the
 * mechanism for audit trail configuration, by expressing two global masks,
 * and optionally expressing per-auid masks.  The following data structure is
 * the per-auid mask description.  The global state is stored in the audit
 * pipe data structure.
 *
 * We may want to consider a more space/time-efficient data structure once
 * usage patterns for per-auid specifications are clear; lookups are
 * currently a linear walk of the pipe's list.
 */
struct audit_pipe_preselect {
	/* Audit ID this mask applies to. */
	au_id_t					 app_auid;
	/* Event mask used for records attributed to app_auid. */
	au_mask_t				 app_mask;
	/* Linkage on the owning pipe's ap_preselect_list. */
	TAILQ_ENTRY(audit_pipe_preselect)	 app_list;
};
104 
/*
 * Description of an individual audit_pipe.  Consists largely of a bounded
 * length queue.
 *
 * ap_flags values:
 *   AUDIT_PIPE_ASYNC - send SIGIO to the owner in ap_sigio when a record is
 *                      appended to the queue.
 *   AUDIT_PIPE_NBIO  - a read on an empty queue returns EAGAIN instead of
 *                      sleeping.
 */
#define	AUDIT_PIPE_ASYNC	0x00000001
#define	AUDIT_PIPE_NBIO		0x00000002
struct audit_pipe {
	u_int				 ap_flags;

	/* select/poll/kqueue state and the SIGIO recipient. */
	struct selinfo			 ap_selinfo;
	struct sigio			*ap_sigio;

	/*
	 * Per-pipe mutex protecting most fields in this data structure.
	 */
	struct mtx			 ap_mtx;

	/*
	 * Per-pipe sleep lock serializing user-generated reads and flushes.
	 * uiomove() is called to copy out the current head record's data
	 * while the record remains in the queue, so we prevent other threads
	 * from removing it using this lock.
	 */
	struct sx			 ap_sx;

	/*
	 * Condition variable to signal when data has been delivered to a
	 * pipe.
	 */
	struct cv			 ap_cv;

	/*
	 * Various queue-related variables: qlen and qlimit are a count of
	 * records in the queue; qbyteslen is the number of bytes of data
	 * across all records, and qoffset is the amount read so far of the
	 * first record in the queue.  The number of bytes available for
	 * reading in the queue is qbyteslen - qoffset.
	 */
	u_int				 ap_qlen;
	u_int				 ap_qlimit;
	u_int				 ap_qbyteslen;
	u_int				 ap_qoffset;

	/*
	 * Per-pipe operation statistics.  Note that ap_reads is bumped once
	 * per read call that found data, not once per record copied out.
	 */
	u_int64_t			 ap_inserts;	/* Records added. */
	u_int64_t			 ap_reads;	/* Reads with data. */
	u_int64_t			 ap_drops;	/* Records dropped. */

	/*
	 * Fields relating to pipe interest: global masks for unmatched
	 * processes (attributable, non-attributable), and a list of specific
	 * interest specifications by auid.
	 */
	int				 ap_preselect_mode;
	au_mask_t			 ap_preselect_flags;
	au_mask_t			 ap_preselect_naflags;
	TAILQ_HEAD(, audit_pipe_preselect)	ap_preselect_list;

	/*
	 * Current pending record list.  Protected by a combination of ap_mtx
	 * and ap_sx.  Note particularly that *both* locks are required to
	 * remove a record from the head of the queue, as an in-progress read
	 * may sleep while copying and therefore cannot hold ap_mtx.
	 */
	TAILQ_HEAD(, audit_pipe_entry)	 ap_queue;

	/*
	 * Global pipe list.
	 */
	TAILQ_ENTRY(audit_pipe)		 ap_list;
};
178 
/*
 * Wrappers for the per-pipe mutex (ap_mtx).  AUDIT_PIPE_MTX() yields the
 * mutex pointer itself, for interfaces that take one directly (the knlist
 * and cv_wait_sig()).
 */
#define	AUDIT_PIPE_LOCK(ap)		mtx_lock(&(ap)->ap_mtx)
#define	AUDIT_PIPE_LOCK_ASSERT(ap)	mtx_assert(&(ap)->ap_mtx, MA_OWNED)
#define	AUDIT_PIPE_LOCK_DESTROY(ap)	mtx_destroy(&(ap)->ap_mtx)
#define	AUDIT_PIPE_LOCK_INIT(ap)	mtx_init(&(ap)->ap_mtx, \
					    "audit_pipe_mtx", NULL, MTX_DEF)
#define	AUDIT_PIPE_UNLOCK(ap)		mtx_unlock(&(ap)->ap_mtx)
#define	AUDIT_PIPE_MTX(ap)		(&(ap)->ap_mtx)

/*
 * Wrappers for the per-pipe sx lock (ap_sx), which is only ever taken
 * exclusively.  AUDIT_PIPE_SX_XLOCK_SIG() evaluates to the result of
 * sx_xlock_sig(); callers treat any non-zero value as an interrupted
 * acquisition.
 */
#define	AUDIT_PIPE_SX_LOCK_DESTROY(ap)	sx_destroy(&(ap)->ap_sx)
#define	AUDIT_PIPE_SX_LOCK_INIT(ap)	sx_init(&(ap)->ap_sx, "audit_pipe_sx")
#define	AUDIT_PIPE_SX_XLOCK_ASSERT(ap)	sx_assert(&(ap)->ap_sx, SA_XLOCKED)
#define	AUDIT_PIPE_SX_XLOCK_SIG(ap)	sx_xlock_sig(&(ap)->ap_sx)
#define	AUDIT_PIPE_SX_XUNLOCK(ap)	sx_xunlock(&(ap)->ap_sx)
192 
/*
 * Global list of audit pipes, and the rwlock that protects it.  Individual
 * record queues on pipes are protected by per-pipe locks; the rwlock
 * synchronizes between threads walking the list to deliver to individual
 * pipes and the addition/removal of pipes, and is mostly acquired for read.
 * Where both are held, the list lock is taken before a pipe's mutex.
 */
static TAILQ_HEAD(, audit_pipe)	 audit_pipe_list;
static struct rwlock		 audit_pipe_lock;

#define	AUDIT_PIPE_LIST_LOCK_INIT()	rw_init(&audit_pipe_lock, \
					    "audit_pipe_list_lock")
#define	AUDIT_PIPE_LIST_LOCK_DESTROY()	rw_destroy(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_RLOCK()		rw_rlock(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_RUNLOCK()	rw_runlock(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_WLOCK()		rw_wlock(&audit_pipe_lock)
#define	AUDIT_PIPE_LIST_WLOCK_ASSERT()	rw_assert(&audit_pipe_lock, \
					    RA_WLOCKED)
#define	AUDIT_PIPE_LIST_WUNLOCK()	rw_wunlock(&audit_pipe_lock)
211 
/*
 * Audit pipe device: the single device node created by audit_pipe_init(),
 * and the name it is created under.
 */
static struct cdev	*audit_pipe_dev;

#define AUDIT_PIPE_NAME	"auditpipe"
218 
/*
 * Special device methods and definition.  There is no close method:
 * per-open state is released by the cdevpriv destructor that
 * audit_pipe_open() registers.
 */
static d_open_t		audit_pipe_open;
static d_read_t		audit_pipe_read;
static d_ioctl_t	audit_pipe_ioctl;
static d_poll_t		audit_pipe_poll;
static d_kqfilter_t	audit_pipe_kqfilter;

static struct cdevsw	audit_pipe_cdevsw = {
	.d_version =	D_VERSION,
	.d_open =	audit_pipe_open,
	.d_read =	audit_pipe_read,
	.d_ioctl =	audit_pipe_ioctl,
	.d_poll =	audit_pipe_poll,
	.d_kqfilter =	audit_pipe_kqfilter,
	.d_name =	AUDIT_PIPE_NAME,
};
237 
/*
 * kqueue read filter for audit pipes.  No attach method is provided;
 * audit_pipe_kqfilter() installs these operations on the knote and adds the
 * knote to the pipe's knlist itself.
 */
static int	audit_pipe_kqread(struct knote *note, long hint);
static void	audit_pipe_kqdetach(struct knote *note);

static const struct filterops audit_pipe_read_filterops = {
	.f_isfd =	1,
	.f_attach =	NULL,
	.f_detach =	audit_pipe_kqdetach,
	.f_event =	audit_pipe_kqread,
};
247 
/*
 * Some global statistics on audit pipes.  The pipe counts are updated with
 * the global list write lock held.  The record and drop counters are
 * incremented without a global lock (the drop counter only under whichever
 * pipe's mutex is held at the time), so they are approximate.
 */
static int		audit_pipe_count;	/* Current number of pipes. */
static u_int64_t	audit_pipe_ever;	/* Pipes ever allocated. */
static u_int64_t	audit_pipe_records;	/* Records seen. */
static u_int64_t	audit_pipe_drops;	/* Global record drop count. */
255 
256 /*
257  * Free an audit pipe entry.
258  */
259 static void
260 audit_pipe_entry_free(struct audit_pipe_entry *ape)
261 {
262 
263 	free(ape->ape_record, M_AUDIT_PIPE_ENTRY);
264 	free(ape, M_AUDIT_PIPE_ENTRY);
265 }
266 
267 /*
268  * Find an audit pipe preselection specification for an auid, if any.
269  */
270 static struct audit_pipe_preselect *
271 audit_pipe_preselect_find(struct audit_pipe *ap, au_id_t auid)
272 {
273 	struct audit_pipe_preselect *app;
274 
275 	AUDIT_PIPE_LOCK_ASSERT(ap);
276 
277 	TAILQ_FOREACH(app, &ap->ap_preselect_list, app_list) {
278 		if (app->app_auid == auid)
279 			return (app);
280 	}
281 	return (NULL);
282 }
283 
284 /*
285  * Query the per-pipe mask for a specific auid.
286  */
287 static int
288 audit_pipe_preselect_get(struct audit_pipe *ap, au_id_t auid,
289     au_mask_t *maskp)
290 {
291 	struct audit_pipe_preselect *app;
292 	int error;
293 
294 	AUDIT_PIPE_LOCK(ap);
295 	app = audit_pipe_preselect_find(ap, auid);
296 	if (app != NULL) {
297 		*maskp = app->app_mask;
298 		error = 0;
299 	} else
300 		error = ENOENT;
301 	AUDIT_PIPE_UNLOCK(ap);
302 	return (error);
303 }
304 
305 /*
306  * Set the per-pipe mask for a specific auid.  Add a new entry if needed;
307  * otherwise, update the current entry.
308  */
309 static void
310 audit_pipe_preselect_set(struct audit_pipe *ap, au_id_t auid, au_mask_t mask)
311 {
312 	struct audit_pipe_preselect *app, *app_new;
313 
314 	/*
315 	 * Pessimistically assume that the auid doesn't already have a mask
316 	 * set, and allocate.  We will free it if it is unneeded.
317 	 */
318 	app_new = malloc(sizeof(*app_new), M_AUDIT_PIPE_PRESELECT, M_WAITOK);
319 	AUDIT_PIPE_LOCK(ap);
320 	app = audit_pipe_preselect_find(ap, auid);
321 	if (app == NULL) {
322 		app = app_new;
323 		app_new = NULL;
324 		app->app_auid = auid;
325 		TAILQ_INSERT_TAIL(&ap->ap_preselect_list, app, app_list);
326 	}
327 	app->app_mask = mask;
328 	AUDIT_PIPE_UNLOCK(ap);
329 	if (app_new != NULL)
330 		free(app_new, M_AUDIT_PIPE_PRESELECT);
331 }
332 
333 /*
334  * Delete a per-auid mask on an audit pipe.
335  */
336 static int
337 audit_pipe_preselect_delete(struct audit_pipe *ap, au_id_t auid)
338 {
339 	struct audit_pipe_preselect *app;
340 	int error;
341 
342 	AUDIT_PIPE_LOCK(ap);
343 	app = audit_pipe_preselect_find(ap, auid);
344 	if (app != NULL) {
345 		TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list);
346 		error = 0;
347 	} else
348 		error = ENOENT;
349 	AUDIT_PIPE_UNLOCK(ap);
350 	if (app != NULL)
351 		free(app, M_AUDIT_PIPE_PRESELECT);
352 	return (error);
353 }
354 
355 /*
356  * Delete all per-auid masks on an audit pipe.
357  */
358 static void
359 audit_pipe_preselect_flush_locked(struct audit_pipe *ap)
360 {
361 	struct audit_pipe_preselect *app;
362 
363 	AUDIT_PIPE_LOCK_ASSERT(ap);
364 
365 	while ((app = TAILQ_FIRST(&ap->ap_preselect_list)) != NULL) {
366 		TAILQ_REMOVE(&ap->ap_preselect_list, app, app_list);
367 		free(app, M_AUDIT_PIPE_PRESELECT);
368 	}
369 }
370 
/*
 * Unlocked entry point for flushing all per-auid masks on an audit pipe:
 * takes the pipe mutex around audit_pipe_preselect_flush_locked().
 */
static void
audit_pipe_preselect_flush(struct audit_pipe *ap)
{

	AUDIT_PIPE_LOCK(ap);
	audit_pipe_preselect_flush_locked(ap);
	AUDIT_PIPE_UNLOCK(ap);
}
379 
380 /*-
381  * Determine whether a specific audit pipe matches a record with these
382  * properties.  Algorithm is as follows:
383  *
384  * - If the pipe is configured to track the default trail configuration, then
385  *   use the results of global preselection matching.
386  * - If not, search for a specifically configured auid entry matching the
387  *   event.  If an entry is found, use that.
388  * - Otherwise, use the default flags or naflags configured for the pipe.
389  */
390 static int
391 audit_pipe_preselect_check(struct audit_pipe *ap, au_id_t auid,
392     au_event_t event, au_class_t class, int sorf, int trail_preselect)
393 {
394 	struct audit_pipe_preselect *app;
395 
396 	AUDIT_PIPE_LOCK_ASSERT(ap);
397 
398 	switch (ap->ap_preselect_mode) {
399 	case AUDITPIPE_PRESELECT_MODE_TRAIL:
400 		return (trail_preselect);
401 
402 	case AUDITPIPE_PRESELECT_MODE_LOCAL:
403 		app = audit_pipe_preselect_find(ap, auid);
404 		if (app == NULL) {
405 			if (auid == AU_DEFAUDITID)
406 				return (au_preselect(event, class,
407 				    &ap->ap_preselect_naflags, sorf));
408 			else
409 				return (au_preselect(event, class,
410 				    &ap->ap_preselect_flags, sorf));
411 		} else
412 			return (au_preselect(event, class, &app->app_mask,
413 			    sorf));
414 
415 	default:
416 		panic("audit_pipe_preselect_check: mode %d",
417 		    ap->ap_preselect_mode);
418 	}
419 
420 	return (0);
421 }
422 
423 /*
424  * Determine whether there exists a pipe interested in a record with specific
425  * properties.
426  */
427 int
428 audit_pipe_preselect(au_id_t auid, au_event_t event, au_class_t class,
429     int sorf, int trail_preselect)
430 {
431 	struct audit_pipe *ap;
432 
433 	/* Lockless read to avoid acquiring the global lock if not needed. */
434 	if (TAILQ_EMPTY(&audit_pipe_list))
435 		return (0);
436 
437 	AUDIT_PIPE_LIST_RLOCK();
438 	TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) {
439 		AUDIT_PIPE_LOCK(ap);
440 		if (audit_pipe_preselect_check(ap, auid, event, class, sorf,
441 		    trail_preselect)) {
442 			AUDIT_PIPE_UNLOCK(ap);
443 			AUDIT_PIPE_LIST_RUNLOCK();
444 			return (1);
445 		}
446 		AUDIT_PIPE_UNLOCK(ap);
447 	}
448 	AUDIT_PIPE_LIST_RUNLOCK();
449 	return (0);
450 }
451 
452 /*
453  * Append individual record to a queue -- allocate queue-local buffer, and
454  * add to the queue.  If the queue is full or we can't allocate memory, drop
455  * the newest record.
456  */
457 static void
458 audit_pipe_append(struct audit_pipe *ap, void *record, u_int record_len)
459 {
460 	struct audit_pipe_entry *ape;
461 
462 	AUDIT_PIPE_LOCK_ASSERT(ap);
463 
464 	if (ap->ap_qlen >= ap->ap_qlimit) {
465 		ap->ap_drops++;
466 		audit_pipe_drops++;
467 		return;
468 	}
469 
470 	ape = malloc(sizeof(*ape), M_AUDIT_PIPE_ENTRY, M_NOWAIT | M_ZERO);
471 	if (ape == NULL) {
472 		ap->ap_drops++;
473 		audit_pipe_drops++;
474 		return;
475 	}
476 
477 	ape->ape_record = malloc(record_len, M_AUDIT_PIPE_ENTRY, M_NOWAIT);
478 	if (ape->ape_record == NULL) {
479 		free(ape, M_AUDIT_PIPE_ENTRY);
480 		ap->ap_drops++;
481 		audit_pipe_drops++;
482 		return;
483 	}
484 
485 	bcopy(record, ape->ape_record, record_len);
486 	ape->ape_record_len = record_len;
487 
488 	TAILQ_INSERT_TAIL(&ap->ap_queue, ape, ape_queue);
489 	ap->ap_inserts++;
490 	ap->ap_qlen++;
491 	ap->ap_qbyteslen += ape->ape_record_len;
492 	selwakeuppri(&ap->ap_selinfo, PSOCK);
493 	KNOTE_LOCKED(&ap->ap_selinfo.si_note, 0);
494 	if (ap->ap_flags & AUDIT_PIPE_ASYNC)
495 		pgsigio(&ap->ap_sigio, SIGIO, 0);
496 	cv_broadcast(&ap->ap_cv);
497 }
498 
499 /*
500  * audit_pipe_submit(): audit_worker submits audit records via this
501  * interface, which arranges for them to be delivered to pipe queues.
502  */
503 void
504 audit_pipe_submit(au_id_t auid, au_event_t event, au_class_t class, int sorf,
505     int trail_select, void *record, u_int record_len)
506 {
507 	struct audit_pipe *ap;
508 
509 	/*
510 	 * Lockless read to avoid lock overhead if pipes are not in use.
511 	 */
512 	if (TAILQ_FIRST(&audit_pipe_list) == NULL)
513 		return;
514 
515 	AUDIT_PIPE_LIST_RLOCK();
516 	TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) {
517 		AUDIT_PIPE_LOCK(ap);
518 		if (audit_pipe_preselect_check(ap, auid, event, class, sorf,
519 		    trail_select))
520 			audit_pipe_append(ap, record, record_len);
521 		AUDIT_PIPE_UNLOCK(ap);
522 	}
523 	AUDIT_PIPE_LIST_RUNLOCK();
524 
525 	/* Unlocked increment. */
526 	audit_pipe_records++;
527 }
528 
529 /*
530  * audit_pipe_submit_user(): the same as audit_pipe_submit(), except that
531  * since we don't currently have selection information available, it is
532  * delivered to the pipe unconditionally.
533  *
534  * XXXRW: This is a bug.  The BSM check routine for submitting a user record
535  * should parse that information and return it.
536  */
537 void
538 audit_pipe_submit_user(void *record, u_int record_len)
539 {
540 	struct audit_pipe *ap;
541 
542 	/*
543 	 * Lockless read to avoid lock overhead if pipes are not in use.
544 	 */
545 	if (TAILQ_FIRST(&audit_pipe_list) == NULL)
546 		return;
547 
548 	AUDIT_PIPE_LIST_RLOCK();
549 	TAILQ_FOREACH(ap, &audit_pipe_list, ap_list) {
550 		AUDIT_PIPE_LOCK(ap);
551 		audit_pipe_append(ap, record, record_len);
552 		AUDIT_PIPE_UNLOCK(ap);
553 	}
554 	AUDIT_PIPE_LIST_RUNLOCK();
555 
556 	/* Unlocked increment. */
557 	audit_pipe_records++;
558 }
559 
560 /*
561  * Allocate a new audit pipe.  Connects the pipe, on success, to the global
562  * list and updates statistics.
563  */
564 static struct audit_pipe *
565 audit_pipe_alloc(void)
566 {
567 	struct audit_pipe *ap;
568 
569 	ap = malloc(sizeof(*ap), M_AUDIT_PIPE, M_NOWAIT | M_ZERO);
570 	if (ap == NULL)
571 		return (NULL);
572 	ap->ap_qlimit = AUDIT_PIPE_QLIMIT_DEFAULT;
573 	TAILQ_INIT(&ap->ap_queue);
574 	knlist_init_mtx(&ap->ap_selinfo.si_note, AUDIT_PIPE_MTX(ap));
575 	AUDIT_PIPE_LOCK_INIT(ap);
576 	AUDIT_PIPE_SX_LOCK_INIT(ap);
577 	cv_init(&ap->ap_cv, "audit_pipe");
578 
579 	/*
580 	 * Default flags, naflags, and auid-specific preselection settings to
581 	 * 0.  Initialize the mode to the global trail so that if praudit(1)
582 	 * is run on /dev/auditpipe, it sees events associated with the
583 	 * default trail.  Pipe-aware application can clear the flag, set
584 	 * custom masks, and flush the pipe as needed.
585 	 */
586 	bzero(&ap->ap_preselect_flags, sizeof(ap->ap_preselect_flags));
587 	bzero(&ap->ap_preselect_naflags, sizeof(ap->ap_preselect_naflags));
588 	TAILQ_INIT(&ap->ap_preselect_list);
589 	ap->ap_preselect_mode = AUDITPIPE_PRESELECT_MODE_TRAIL;
590 
591 	/*
592 	 * Add to global list and update global statistics.
593 	 */
594 	AUDIT_PIPE_LIST_WLOCK();
595 	TAILQ_INSERT_HEAD(&audit_pipe_list, ap, ap_list);
596 	audit_pipe_count++;
597 	audit_pipe_ever++;
598 	AUDIT_PIPE_LIST_WUNLOCK();
599 
600 	return (ap);
601 }
602 
603 /*
604  * Flush all records currently present in an audit pipe; assume mutex is held.
605  */
606 static void
607 audit_pipe_flush(struct audit_pipe *ap)
608 {
609 	struct audit_pipe_entry *ape;
610 
611 	AUDIT_PIPE_LOCK_ASSERT(ap);
612 
613 	while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL) {
614 		TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue);
615 		ap->ap_qbyteslen -= ape->ape_record_len;
616 		audit_pipe_entry_free(ape);
617 		ap->ap_qlen--;
618 	}
619 	ap->ap_qoffset = 0;
620 
621 	KASSERT(ap->ap_qlen == 0, ("audit_pipe_free: ap_qbyteslen"));
622 	KASSERT(ap->ap_qbyteslen == 0, ("audit_pipe_flush: ap_qbyteslen"));
623 }
624 
625 /*
626  * Free an audit pipe; this means freeing all preselection state and all
627  * records in the pipe.  Assumes global write lock and pipe mutex are held to
628  * prevent any new records from being inserted during the free, and that the
629  * audit pipe is still on the global list.
630  */
631 static void
632 audit_pipe_free(struct audit_pipe *ap)
633 {
634 
635 	AUDIT_PIPE_LIST_WLOCK_ASSERT();
636 	AUDIT_PIPE_LOCK_ASSERT(ap);
637 
638 	audit_pipe_preselect_flush_locked(ap);
639 	audit_pipe_flush(ap);
640 	cv_destroy(&ap->ap_cv);
641 	AUDIT_PIPE_SX_LOCK_DESTROY(ap);
642 	AUDIT_PIPE_LOCK_DESTROY(ap);
643 	seldrain(&ap->ap_selinfo);
644 	knlist_destroy(&ap->ap_selinfo.si_note);
645 	TAILQ_REMOVE(&audit_pipe_list, ap, ap_list);
646 	free(ap, M_AUDIT_PIPE);
647 	audit_pipe_count--;
648 }
649 
650 static void
651 audit_pipe_dtor(void *arg)
652 {
653 	struct audit_pipe *ap;
654 
655 	ap = arg;
656 	funsetown(&ap->ap_sigio);
657 	AUDIT_PIPE_LIST_WLOCK();
658 	AUDIT_PIPE_LOCK(ap);
659 	audit_pipe_free(ap);
660 	AUDIT_PIPE_LIST_WUNLOCK();
661 }
662 
663 /*
664  * Audit pipe open method.  Explicit privilege check isn't used as this
665  * allows file permissions on the special device to be used to grant audit
666  * review access.  Those file permissions should be managed carefully.
667  */
668 static int
669 audit_pipe_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
670 {
671 	struct audit_pipe *ap;
672 	int error;
673 
674 	ap = audit_pipe_alloc();
675 	if (ap == NULL)
676 		return (ENOMEM);
677 	fsetown(td->td_proc->p_pid, &ap->ap_sigio);
678 	error = devfs_set_cdevpriv(ap, audit_pipe_dtor);
679 	if (error != 0)
680 		audit_pipe_dtor(ap);
681 	return (error);
682 }
683 
684 /*
685  * Audit pipe ioctl() routine.  Handle file descriptor and audit pipe layer
686  * commands.
687  */
688 static int
689 audit_pipe_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
690     struct thread *td)
691 {
692 	struct auditpipe_ioctl_preselect *aip;
693 	struct audit_pipe *ap;
694 	au_mask_t *maskp;
695 	int error, mode;
696 	au_id_t auid;
697 
698 	error = devfs_get_cdevpriv((void **)&ap);
699 	if (error != 0)
700 		return (error);
701 
702 	/*
703 	 * Audit pipe ioctls: first come standard device node ioctls, then
704 	 * manipulation of pipe settings, and finally, statistics query
705 	 * ioctls.
706 	 */
707 	switch (cmd) {
708 	case FIONBIO:
709 		AUDIT_PIPE_LOCK(ap);
710 		if (*(int *)data)
711 			ap->ap_flags |= AUDIT_PIPE_NBIO;
712 		else
713 			ap->ap_flags &= ~AUDIT_PIPE_NBIO;
714 		AUDIT_PIPE_UNLOCK(ap);
715 		error = 0;
716 		break;
717 
718 	case FIONREAD:
719 		AUDIT_PIPE_LOCK(ap);
720 		*(int *)data = ap->ap_qbyteslen - ap->ap_qoffset;
721 		AUDIT_PIPE_UNLOCK(ap);
722 		error = 0;
723 		break;
724 
725 	case FIOASYNC:
726 		AUDIT_PIPE_LOCK(ap);
727 		if (*(int *)data)
728 			ap->ap_flags |= AUDIT_PIPE_ASYNC;
729 		else
730 			ap->ap_flags &= ~AUDIT_PIPE_ASYNC;
731 		AUDIT_PIPE_UNLOCK(ap);
732 		error = 0;
733 		break;
734 
735 	case FIOSETOWN:
736 		error = fsetown(*(int *)data, &ap->ap_sigio);
737 		break;
738 
739 	case FIOGETOWN:
740 		*(int *)data = fgetown(&ap->ap_sigio);
741 		error = 0;
742 		break;
743 
744 	case AUDITPIPE_GET_QLEN:
745 		*(u_int *)data = ap->ap_qlen;
746 		error = 0;
747 		break;
748 
749 	case AUDITPIPE_GET_QLIMIT:
750 		*(u_int *)data = ap->ap_qlimit;
751 		error = 0;
752 		break;
753 
754 	case AUDITPIPE_SET_QLIMIT:
755 		/* Lockless integer write. */
756 		if (*(u_int *)data >= AUDIT_PIPE_QLIMIT_MIN &&
757 		    *(u_int *)data <= AUDIT_PIPE_QLIMIT_MAX) {
758 			ap->ap_qlimit = *(u_int *)data;
759 			error = 0;
760 		} else
761 			error = EINVAL;
762 		break;
763 
764 	case AUDITPIPE_GET_QLIMIT_MIN:
765 		*(u_int *)data = AUDIT_PIPE_QLIMIT_MIN;
766 		error = 0;
767 		break;
768 
769 	case AUDITPIPE_GET_QLIMIT_MAX:
770 		*(u_int *)data = AUDIT_PIPE_QLIMIT_MAX;
771 		error = 0;
772 		break;
773 
774 	case AUDITPIPE_GET_PRESELECT_FLAGS:
775 		AUDIT_PIPE_LOCK(ap);
776 		maskp = (au_mask_t *)data;
777 		*maskp = ap->ap_preselect_flags;
778 		AUDIT_PIPE_UNLOCK(ap);
779 		error = 0;
780 		break;
781 
782 	case AUDITPIPE_SET_PRESELECT_FLAGS:
783 		AUDIT_PIPE_LOCK(ap);
784 		maskp = (au_mask_t *)data;
785 		ap->ap_preselect_flags = *maskp;
786 		AUDIT_PIPE_UNLOCK(ap);
787 		error = 0;
788 		break;
789 
790 	case AUDITPIPE_GET_PRESELECT_NAFLAGS:
791 		AUDIT_PIPE_LOCK(ap);
792 		maskp = (au_mask_t *)data;
793 		*maskp = ap->ap_preselect_naflags;
794 		AUDIT_PIPE_UNLOCK(ap);
795 		error = 0;
796 		break;
797 
798 	case AUDITPIPE_SET_PRESELECT_NAFLAGS:
799 		AUDIT_PIPE_LOCK(ap);
800 		maskp = (au_mask_t *)data;
801 		ap->ap_preselect_naflags = *maskp;
802 		AUDIT_PIPE_UNLOCK(ap);
803 		error = 0;
804 		break;
805 
806 	case AUDITPIPE_GET_PRESELECT_AUID:
807 		aip = (struct auditpipe_ioctl_preselect *)data;
808 		error = audit_pipe_preselect_get(ap, aip->aip_auid,
809 		    &aip->aip_mask);
810 		break;
811 
812 	case AUDITPIPE_SET_PRESELECT_AUID:
813 		aip = (struct auditpipe_ioctl_preselect *)data;
814 		audit_pipe_preselect_set(ap, aip->aip_auid, aip->aip_mask);
815 		error = 0;
816 		break;
817 
818 	case AUDITPIPE_DELETE_PRESELECT_AUID:
819 		auid = *(au_id_t *)data;
820 		error = audit_pipe_preselect_delete(ap, auid);
821 		break;
822 
823 	case AUDITPIPE_FLUSH_PRESELECT_AUID:
824 		audit_pipe_preselect_flush(ap);
825 		error = 0;
826 		break;
827 
828 	case AUDITPIPE_GET_PRESELECT_MODE:
829 		AUDIT_PIPE_LOCK(ap);
830 		*(int *)data = ap->ap_preselect_mode;
831 		AUDIT_PIPE_UNLOCK(ap);
832 		error = 0;
833 		break;
834 
835 	case AUDITPIPE_SET_PRESELECT_MODE:
836 		mode = *(int *)data;
837 		switch (mode) {
838 		case AUDITPIPE_PRESELECT_MODE_TRAIL:
839 		case AUDITPIPE_PRESELECT_MODE_LOCAL:
840 			AUDIT_PIPE_LOCK(ap);
841 			ap->ap_preselect_mode = mode;
842 			AUDIT_PIPE_UNLOCK(ap);
843 			error = 0;
844 			break;
845 
846 		default:
847 			error = EINVAL;
848 		}
849 		break;
850 
851 	case AUDITPIPE_FLUSH:
852 		if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0)
853 			return (EINTR);
854 		AUDIT_PIPE_LOCK(ap);
855 		audit_pipe_flush(ap);
856 		AUDIT_PIPE_UNLOCK(ap);
857 		AUDIT_PIPE_SX_XUNLOCK(ap);
858 		error = 0;
859 		break;
860 
861 	case AUDITPIPE_GET_MAXAUDITDATA:
862 		*(u_int *)data = MAXAUDITDATA;
863 		error = 0;
864 		break;
865 
866 	case AUDITPIPE_GET_INSERTS:
867 		*(u_int *)data = ap->ap_inserts;
868 		error = 0;
869 		break;
870 
871 	case AUDITPIPE_GET_READS:
872 		*(u_int *)data = ap->ap_reads;
873 		error = 0;
874 		break;
875 
876 	case AUDITPIPE_GET_DROPS:
877 		*(u_int *)data = ap->ap_drops;
878 		error = 0;
879 		break;
880 
881 	case AUDITPIPE_GET_TRUNCATES:
882 		*(u_int *)data = 0;
883 		error = 0;
884 		break;
885 
886 	default:
887 		error = ENOTTY;
888 	}
889 	return (error);
890 }
891 
892 /*
893  * Audit pipe read.  Read one or more partial or complete records to user
894  * memory.
895  */
896 static int
897 audit_pipe_read(struct cdev *dev, struct uio *uio, int flag)
898 {
899 	struct audit_pipe_entry *ape;
900 	struct audit_pipe *ap;
901 	u_int toread;
902 	int error;
903 
904 	error = devfs_get_cdevpriv((void **)&ap);
905 	if (error != 0)
906 		return (error);
907 
908 	/*
909 	 * We hold an sx(9) lock over read and flush because we rely on the
910 	 * stability of a record in the queue during uiomove(9).
911 	 */
912 	if (AUDIT_PIPE_SX_XLOCK_SIG(ap) != 0)
913 		return (EINTR);
914 	AUDIT_PIPE_LOCK(ap);
915 	while (TAILQ_EMPTY(&ap->ap_queue)) {
916 		if (ap->ap_flags & AUDIT_PIPE_NBIO) {
917 			AUDIT_PIPE_UNLOCK(ap);
918 			AUDIT_PIPE_SX_XUNLOCK(ap);
919 			return (EAGAIN);
920 		}
921 		error = cv_wait_sig(&ap->ap_cv, AUDIT_PIPE_MTX(ap));
922 		if (error) {
923 			AUDIT_PIPE_UNLOCK(ap);
924 			AUDIT_PIPE_SX_XUNLOCK(ap);
925 			return (error);
926 		}
927 	}
928 
929 	/*
930 	 * Copy as many remaining bytes from the current record to userspace
931 	 * as we can.  Keep processing records until we run out of records in
932 	 * the queue, or until the user buffer runs out of space.
933 	 *
934 	 * Note: we rely on the SX lock to maintain ape's stability here.
935 	 */
936 	ap->ap_reads++;
937 	while ((ape = TAILQ_FIRST(&ap->ap_queue)) != NULL &&
938 	    uio->uio_resid > 0) {
939 		AUDIT_PIPE_LOCK_ASSERT(ap);
940 
941 		KASSERT(ape->ape_record_len > ap->ap_qoffset,
942 		    ("audit_pipe_read: record_len > qoffset (1)"));
943 		toread = MIN(ape->ape_record_len - ap->ap_qoffset,
944 		    uio->uio_resid);
945 		AUDIT_PIPE_UNLOCK(ap);
946 		error = uiomove((char *)ape->ape_record + ap->ap_qoffset,
947 		    toread, uio);
948 		if (error) {
949 			AUDIT_PIPE_SX_XUNLOCK(ap);
950 			return (error);
951 		}
952 
953 		/*
954 		 * If the copy succeeded, update book-keeping, and if no
955 		 * bytes remain in the current record, free it.
956 		 */
957 		AUDIT_PIPE_LOCK(ap);
958 		KASSERT(TAILQ_FIRST(&ap->ap_queue) == ape,
959 		    ("audit_pipe_read: queue out of sync after uiomove"));
960 		ap->ap_qoffset += toread;
961 		KASSERT(ape->ape_record_len >= ap->ap_qoffset,
962 		    ("audit_pipe_read: record_len >= qoffset (2)"));
963 		if (ap->ap_qoffset == ape->ape_record_len) {
964 			TAILQ_REMOVE(&ap->ap_queue, ape, ape_queue);
965 			ap->ap_qbyteslen -= ape->ape_record_len;
966 			audit_pipe_entry_free(ape);
967 			ap->ap_qlen--;
968 			ap->ap_qoffset = 0;
969 		}
970 	}
971 	AUDIT_PIPE_UNLOCK(ap);
972 	AUDIT_PIPE_SX_XUNLOCK(ap);
973 	return (0);
974 }
975 
976 /*
977  * Audit pipe poll.
978  */
979 static int
980 audit_pipe_poll(struct cdev *dev, int events, struct thread *td)
981 {
982 	struct audit_pipe *ap;
983 	int error, revents;
984 
985 	revents = 0;
986 	error = devfs_get_cdevpriv((void **)&ap);
987 	if (error != 0)
988 		return (error);
989 	if (events & (POLLIN | POLLRDNORM)) {
990 		AUDIT_PIPE_LOCK(ap);
991 		if (TAILQ_FIRST(&ap->ap_queue) != NULL)
992 			revents |= events & (POLLIN | POLLRDNORM);
993 		else
994 			selrecord(td, &ap->ap_selinfo);
995 		AUDIT_PIPE_UNLOCK(ap);
996 	}
997 	return (revents);
998 }
999 
1000 /*
1001  * Audit pipe kqfilter.
1002  */
1003 static int
1004 audit_pipe_kqfilter(struct cdev *dev, struct knote *kn)
1005 {
1006 	struct audit_pipe *ap;
1007 	int error;
1008 
1009 	error = devfs_get_cdevpriv((void **)&ap);
1010 	if (error != 0)
1011 		return (error);
1012 	if (kn->kn_filter != EVFILT_READ)
1013 		return (EINVAL);
1014 
1015 	kn->kn_fop = &audit_pipe_read_filterops;
1016 	kn->kn_hook = ap;
1017 
1018 	AUDIT_PIPE_LOCK(ap);
1019 	knlist_add(&ap->ap_selinfo.si_note, kn, 1);
1020 	AUDIT_PIPE_UNLOCK(ap);
1021 	return (0);
1022 }
1023 
1024 /*
1025  * Return true if there are records available for reading on the pipe.
1026  */
1027 static int
1028 audit_pipe_kqread(struct knote *kn, long hint)
1029 {
1030 	struct audit_pipe *ap;
1031 
1032 	ap = (struct audit_pipe *)kn->kn_hook;
1033 	AUDIT_PIPE_LOCK_ASSERT(ap);
1034 
1035 	if (ap->ap_qlen != 0) {
1036 		kn->kn_data = ap->ap_qbyteslen - ap->ap_qoffset;
1037 		return (1);
1038 	} else {
1039 		kn->kn_data = 0;
1040 		return (0);
1041 	}
1042 }
1043 
1044 /*
1045  * Detach kqueue state from audit pipe.
1046  */
1047 static void
1048 audit_pipe_kqdetach(struct knote *kn)
1049 {
1050 	struct audit_pipe *ap;
1051 
1052 	ap = (struct audit_pipe *)kn->kn_hook;
1053 	AUDIT_PIPE_LOCK(ap);
1054 	knlist_remove(&ap->ap_selinfo.si_note, kn, 1);
1055 	AUDIT_PIPE_UNLOCK(ap);
1056 }
1057 
1058 /*
1059  * Initialize the audit pipe system.
1060  */
1061 static void
1062 audit_pipe_init(void *unused)
1063 {
1064 
1065 	TAILQ_INIT(&audit_pipe_list);
1066 	AUDIT_PIPE_LIST_LOCK_INIT();
1067 	audit_pipe_dev = make_dev(&audit_pipe_cdevsw, 0, UID_ROOT,
1068 		GID_WHEEL, 0600, "%s", AUDIT_PIPE_NAME);
1069 	if (audit_pipe_dev == NULL) {
1070 		AUDIT_PIPE_LIST_LOCK_DESTROY();
1071 		panic("Can't initialize audit pipe subsystem");
1072 	}
1073 }
1074 
1075 SYSINIT(audit_pipe_init, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, audit_pipe_init,
1076     NULL);
1077