xref: /freebsd/sys/security/audit/audit.c (revision 87569f75a91f298c52a71823c04d41cf53c88889)
1 /*
2  * Copyright (c) 1999-2005 Apple Computer, Inc.
3  * Copyright (c) 2006 Robert N. M. Watson
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1.  Redistributions of source code must retain the above copyright
10  *     notice, this list of conditions and the following disclaimer.
11  * 2.  Redistributions in binary form must reproduce the above copyright
12  *     notice, this list of conditions and the following disclaimer in the
13  *     documentation and/or other materials provided with the distribution.
14  * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
15  *     its contributors may be used to endorse or promote products derived
16  *     from this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
26  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
27  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * $FreeBSD$
31  */
32 
33 #include <sys/param.h>
34 #include <sys/condvar.h>
35 #include <sys/conf.h>
36 #include <sys/file.h>
37 #include <sys/filedesc.h>
38 #include <sys/fcntl.h>
39 #include <sys/ipc.h>
40 #include <sys/kernel.h>
41 #include <sys/kthread.h>
42 #include <sys/malloc.h>
43 #include <sys/mount.h>
44 #include <sys/namei.h>
45 #include <sys/proc.h>
46 #include <sys/queue.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/protosw.h>
50 #include <sys/domain.h>
51 #include <sys/sysproto.h>
52 #include <sys/sysent.h>
53 #include <sys/systm.h>
54 #include <sys/ucred.h>
55 #include <sys/uio.h>
56 #include <sys/un.h>
57 #include <sys/unistd.h>
58 #include <sys/vnode.h>
59 
60 #include <bsm/audit.h>
61 #include <bsm/audit_internal.h>
62 #include <bsm/audit_kevents.h>
63 
64 #include <netinet/in.h>
65 #include <netinet/in_pcb.h>
66 
67 #include <security/audit/audit.h>
68 #include <security/audit/audit_private.h>
69 
70 #include <vm/uma.h>
71 
/*
 * Debug tracing for the audit subsystem.
 *
 * AUDIT_PRINTF takes a single, fully parenthesized printf() argument list:
 *
 *	AUDIT_PRINTF(("value is %d\n", value));
 *
 * When AUDIT_EXCESSIVELY_VERBOSE is defined the arguments are handed to
 * printf(); otherwise the macro expands to nothing and its arguments are
 * never evaluated.  The output is very noisy under load, so leave the
 * define below commented out unless you are actively debugging.
 */
//#define	AUDIT_EXCESSIVELY_VERBOSE
#ifndef AUDIT_EXCESSIVELY_VERBOSE
#define	AUDIT_PRINTF(x)
#else
#define	AUDIT_PRINTF(x)	printf x
#endif
84 
85 static uma_zone_t audit_record_zone;
86 static MALLOC_DEFINE(M_AUDITPROC, "audit_proc", "Audit process storage");
87 MALLOC_DEFINE(M_AUDITDATA, "audit_data", "Audit data storage");
88 MALLOC_DEFINE(M_AUDITPATH, "audit_path", "Audit path storage");
89 MALLOC_DEFINE(M_AUDITTEXT, "audit_text", "Audit text storage");
90 
91 /*
92  * Audit control settings that are set/read by system calls and are
93  * hence non-static.
94  */
95 /*
96  * Define the audit control flags.
97  */
98 int					audit_enabled;
99 int					audit_suspended;
100 
101 /*
102  * Flags controlling behavior in low storage situations.
103  * Should we panic if a write fails?  Should we fail stop
104  * if we're out of disk space?
105  */
106 int					audit_panic_on_write_fail;
107 int					audit_fail_stop;
108 
109 /*
110  * Are we currently "failing stop" due to out of disk space?
111  */
112 static int				 audit_in_failure;
113 
114 /*
115  * Global audit statistics.
116  */
117 struct audit_fstat 			audit_fstat;
118 
119 /*
120  * Preselection mask for non-attributable events.
121  */
122 struct au_mask			 	audit_nae_mask;
123 
124 /*
125  * Mutex to protect global variables shared between various threads and
126  * processes.
127  */
128 static struct mtx			audit_mtx;
129 
130 /*
131  * Queue of audit records ready for delivery to disk.  We insert new
132  * records at the tail, and remove records from the head.  Also,
133  * a count of the number of records used for checking queue depth.
134  * In addition, a counter of records that we have allocated but are
135  * not yet in the queue, which is needed to estimate the total
136  * size of the combined set of records outstanding in the system.
137  */
138 static TAILQ_HEAD(, kaudit_record)	audit_q;
139 static int				audit_q_len;
140 static int				audit_pre_q_len;
141 
142 /*
143  * Audit queue control settings (minimum free, low/high water marks, etc.)
144  */
145 struct au_qctrl				audit_qctrl;
146 
147 /*
148  * Condition variable to signal to the worker that it has work to do:
149  * either new records are in the queue, or a log replacement is taking
150  * place.
151  */
152 static struct cv			audit_cv;
153 
154 /*
155  * Worker thread that will schedule disk I/O, etc.
156  */
157 static struct proc			*audit_thread;
158 
159 /*
160  * When an audit log is rotated, the actual rotation must be performed
161  * by the audit worker thread, as it may have outstanding writes on the
162  * current audit log.  audit_replacement_vp holds the vnode replacing
163  * the current vnode.  We can't let more than one replacement occur
164  * at a time, so if more than one thread requests a replacement, only
165  * one can have the replacement "in progress" at any given moment.  If
166  * a thread tries to replace the audit vnode and discovers a replacement
167  * is already in progress (i.e., audit_replacement_flag != 0), then it
168  * will sleep on audit_replacement_cv waiting its turn to perform a
169  * replacement.  When a replacement is completed, this cv is signalled
170  * by the worker thread so a waiting thread can start another replacement.
171  * We also store a credential to perform audit log write operations with.
172  *
173  * The current credential and vnode are thread-local to audit_worker.
174  */
175 static struct cv			audit_replacement_cv;
176 
177 static int				audit_replacement_flag;
178 static struct vnode			*audit_replacement_vp;
179 static struct ucred			*audit_replacement_cred;
180 
181 /*
182  * Condition variable on which audit_commit() sleeps while the audit
183  * queue is at or above the high water mark; the worker broadcasts on
184  * it once the queue length has dropped to the low water mark.
185  */
186 static struct cv			audit_commit_cv;
187 
188 /*
189  * Condition variable that auditing threads wait on when in fail-stop mode.
190  * Threads wait on this CV forever (and ever), never seeing the light of
191  * day again.
192  */
193 static struct cv			audit_fail_cv;
194 
195 /*
196  * Flags related to Kernel->user-space communication.
197  */
198 static int			audit_file_rotate_wait;
199 
200 /*
201  * Construct an audit record for the passed thread.
202  */
203 static int
204 audit_record_ctor(void *mem, int size, void *arg, int flags)
205 {
206 	struct kaudit_record *ar;
207 	struct thread *td;
208 
209 	KASSERT(sizeof(*ar) == size, ("audit_record_ctor: wrong size"));
210 
211 	td = arg;
212 	ar = mem;
213 	bzero(ar, sizeof(*ar));
214 	ar->k_ar.ar_magic = AUDIT_RECORD_MAGIC;
215 	nanotime(&ar->k_ar.ar_starttime);
216 
217 	/*
218 	 * Export the subject credential.
219 	 *
220 	 * XXXAUDIT: td_ucred access is OK without proc lock, but some other
221 	 * fields here may require the proc lock.
222 	 */
223 	cru2x(td->td_ucred, &ar->k_ar.ar_subj_cred);
224 	ar->k_ar.ar_subj_ruid = td->td_ucred->cr_ruid;
225 	ar->k_ar.ar_subj_rgid = td->td_ucred->cr_rgid;
226 	ar->k_ar.ar_subj_egid = td->td_ucred->cr_groups[0];
227 	ar->k_ar.ar_subj_auid = td->td_proc->p_au->ai_auid;
228 	ar->k_ar.ar_subj_asid = td->td_proc->p_au->ai_asid;
229 	ar->k_ar.ar_subj_pid = td->td_proc->p_pid;
230 	ar->k_ar.ar_subj_amask = td->td_proc->p_au->ai_mask;
231 	ar->k_ar.ar_subj_term = td->td_proc->p_au->ai_termid;
232 	bcopy(td->td_proc->p_comm, ar->k_ar.ar_subj_comm, MAXCOMLEN);
233 
234 	return (0);
235 }
236 
237 static void
238 audit_record_dtor(void *mem, int size, void *arg)
239 {
240 	struct kaudit_record *ar;
241 
242 	KASSERT(sizeof(*ar) == size, ("audit_record_dtor: wrong size"));
243 
244 	ar = mem;
245 	if (ar->k_ar.ar_arg_upath1 != NULL)
246 		free(ar->k_ar.ar_arg_upath1, M_AUDITPATH);
247 	if (ar->k_ar.ar_arg_upath2 != NULL)
248 		free(ar->k_ar.ar_arg_upath2, M_AUDITPATH);
249 	if (ar->k_ar.ar_arg_text != NULL)
250 		free(ar->k_ar.ar_arg_text, M_AUDITTEXT);
251 	if (ar->k_udata != NULL)
252 		free(ar->k_udata, M_AUDITDATA);
253 }
254 
255 /*
256  * XXXAUDIT: Should adjust comments below to make it clear that we get to
257  * this point only if we believe we have storage, so not having space here
258  * is a violation of invariants derived from administrative procedures.
259  * I.e., someone else has written to the audit partition, leaving less space
260  * than we accounted for.
261  */
262 static int
263 audit_record_write(struct vnode *vp, struct kaudit_record *ar,
264     struct ucred *cred, struct thread *td)
265 {
266 	int ret;
267 	long temp;
268 	struct au_record *bsm;
269 	struct vattr vattr;
270 	struct statfs *mnt_stat = &vp->v_mount->mnt_stat;
271 	int vfslocked;
272 
273 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
274 
275 	/*
276 	 * First, gather statistics on the audit log file and file system
277 	 * so that we know how we're doing on space.  In both cases,
278 	 * if we're unable to perform the operation, we drop the record
279 	 * and return.  However, this is arguably an assertion failure.
280 	 * XXX Need a FreeBSD equivalent.
281 	 */
282 	ret = VFS_STATFS(vp->v_mount, mnt_stat, td);
283 	if (ret)
284 		goto out;
285 
286 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
287 	ret = VOP_GETATTR(vp, &vattr, cred, td);
288 	VOP_UNLOCK(vp, 0, td);
289 	if (ret)
290 		goto out;
291 
292 	/* update the global stats struct */
293 	audit_fstat.af_currsz = vattr.va_size;
294 
295 	/*
296 	 * XXX Need to decide what to do if the trigger to the audit daemon
297 	 * fails.
298 	 */
299 
300 	/*
301 	 * If we fall below minimum free blocks (hard limit), tell the audit
302 	 * daemon to force a rotation off of the file system. We also stop
303 	 * writing, which means this audit record is probably lost.
304 	 * If we fall below the minimum percent free blocks (soft limit),
305 	 * then kindly suggest to the audit daemon to do something.
306 	 */
307 	if (mnt_stat->f_bfree < AUDIT_HARD_LIMIT_FREE_BLOCKS) {
308 		(void)send_trigger(AUDIT_TRIGGER_NO_SPACE);
309 		/* Hopefully userspace did something about all the previous
310 		 * triggers that were sent prior to this critical condition.
311 		 * If fail-stop is set, then we're done; goodnight Gracie.
312 		 */
313 		if (audit_fail_stop)
314 			panic("Audit log space exhausted and fail-stop set.");
315 		else {
316 			audit_suspended = 1;
317 			ret = ENOSPC;
318 			goto out;
319 		}
320 	} else
321 		/*
322 		 * Send a message to the audit daemon that disk space
323 		 * is getting low.
324 		 *
325 		 * XXXAUDIT: Check math and block size calculation here.
326 		 */
327 		if (audit_qctrl.aq_minfree != 0) {
328 			temp = mnt_stat->f_blocks / (100 /
329 			    audit_qctrl.aq_minfree);
330 			if (mnt_stat->f_bfree < temp)
331 				(void)send_trigger(AUDIT_TRIGGER_LOW_SPACE);
332 		}
333 
334 	/* Check if the current log file is full; if so, call for
335 	 * a log rotate. This is not an exact comparison; we may
336 	 * write some records over the limit. If that's not
337 	 * acceptable, then add a fudge factor here.
338 	 */
339 	if ((audit_fstat.af_filesz != 0) &&
340 	    (audit_file_rotate_wait == 0) &&
341 	    (vattr.va_size >= audit_fstat.af_filesz)) {
342 		audit_file_rotate_wait = 1;
343 		(void)send_trigger(AUDIT_TRIGGER_OPEN_NEW);
344 	}
345 
346 	/*
347 	 * If the estimated amount of audit data in the audit event queue
348 	 * (plus records allocated but not yet queued) has reached the
349 	 * amount of free space on the disk, then we need to go into an
350 	 * audit fail stop state, in which we do not permit the
351 	 * allocation/committing of any new audit records.  We continue to
352 	 * process packets but don't allow any activities that might
353 	 * generate new records.  In the future, we might want to detect
354 	 * when space is available again and allow operation to continue,
355 	 * but this behavior is sufficient to meet fail stop requirements
356 	 * in CAPP.
357 	 */
358 	if (audit_fail_stop &&
359 	    (unsigned long)
360 	    ((audit_q_len + audit_pre_q_len + 1) * MAX_AUDIT_RECORD_SIZE) /
361 	    mnt_stat->f_bsize >= (unsigned long)(mnt_stat->f_bfree)) {
362 		printf("audit_record_write: free space below size of audit "
363 		    "queue, failing stop\n");
364 		audit_in_failure = 1;
365 	}
366 
367 	/*
368 	 * If there is a user audit record attached to the kernel record,
369 	 * then write the user record.
370 	 */
371 	/* XXX Need to decide a few things here: IF the user audit
372 	 * record is written, but the write of the kernel record fails,
373 	 * what to do? Should the kernel record come before or after the
374 	 * user record? For now, we write the user record first, and
375 	 * we ignore errors.
376 	 */
377 	if (ar->k_ar_commit & AR_COMMIT_USER) {
378 		/*
379 		 * Try submitting the record to any active audit pipes.
380 		 */
381 		audit_pipe_submit((void *)ar->k_udata, ar->k_ulen);
382 
383 		/*
384 		 * And to disk.
385 		 */
386 		ret = vn_rdwr(UIO_WRITE, vp, (void *)ar->k_udata, ar->k_ulen,
387 		          (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL,
388 			  NULL, td);
389 		if (ret)
390 			goto out;
391 	}
392 
393 	/*
394 	 * Convert the internal kernel record to BSM format and write it
395 	 * out if everything's OK.
396 	 */
397 	if (!(ar->k_ar_commit & AR_COMMIT_KERNEL)) {
398 		ret = 0;
399 		goto out;
400 	}
401 
402 	/*
403 	 * XXXAUDIT: Should we actually allow this conversion to fail?  With
404 	 * sleeping memory allocation and invariants checks, perhaps not.
405 	 */
406 	ret = kaudit_to_bsm(ar, &bsm);
407 	if (ret == BSM_NOAUDIT) {
408 		ret = 0;
409 		goto out;
410 	}
411 
412 	/*
413 	 * XXX: We drop the record on BSM conversion failure, but really
414 	 * this is an assertion failure.
415 	 */
416 	if (ret == BSM_FAILURE) {
417 		AUDIT_PRINTF(("BSM conversion failure\n"));
418 		ret = EINVAL;
419 		goto out;
420 	}
421 
422 	/*
423 	 * Try submitting the record to any active audit pipes.
424 	 */
425 	audit_pipe_submit((void *)bsm->data, bsm->len);
426 
427 	/*
428 	 * XXX
429 	 * We should break the write functionality away from the BSM record
430 	 * generation and have the BSM generation done before this function
431 	 * is called. This function will then take the BSM record as a
432 	 * parameter.
433 	 */
434 	ret = (vn_rdwr(UIO_WRITE, vp, (void *)bsm->data, bsm->len,
435 	    (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, cred, NULL, NULL, td));
436 
437 	kau_free(bsm);
438 
439 out:
440 	/*
441 	 * When we're done processing the current record, we have to
442 	 * check to see if we're in a failure mode, and if so, whether
443 	 * this was the last record left to be drained.  If we're done
444 	 * draining, then we fsync the vnode and panic.
445 	 */
446 	if (audit_in_failure &&
447 	    audit_q_len == 0 && audit_pre_q_len == 0) {
448 		VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, td);
449 		(void)VOP_FSYNC(vp, MNT_WAIT, td);
450 		VOP_UNLOCK(vp, 0, td);
451 		panic("Audit store overflow; record queue drained.");
452 	}
453 
454 	VFS_UNLOCK_GIANT(vfslocked);
455 
456 	return (ret);
457 }
458 
459 /*
460  * The audit_worker thread is responsible for watching the event queue,
461  * dequeueing records, converting them to BSM format, and committing them to
462  * disk.  In order to minimize lock thrashing, records are dequeued in sets
463  * to a thread-local work queue.  In addition, the audit_work performs the
464  * actual exchange of audit log vnode pointer, as audit_vp is a thread-local
465  * variable.
466  */
467 static void
468 audit_worker(void *arg)
469 {
470 	int do_replacement_signal, error;
471 	TAILQ_HEAD(, kaudit_record) ar_worklist;
472 	struct kaudit_record *ar;
473 	struct vnode *audit_vp, *old_vp;
474 	int vfslocked;
475 
476 	struct ucred *audit_cred, *old_cred;
477 	struct thread *audit_td;
478 
479 	AUDIT_PRINTF(("audit_worker starting\n"));
480 
481 	/*
482 	 * These are thread-local variables requiring no synchronization.
483 	 */
484 	TAILQ_INIT(&ar_worklist);
485 	audit_cred = NULL;
486 	audit_td = curthread;
487 	audit_vp = NULL;
488 
489 	mtx_lock(&audit_mtx);
490 	while (1) {
491 		/*
492 		 * First priority: replace the audit log target if requested.
493 		 * Accessing the vnode here requires dropping the audit_mtx;
494 		 * in case another replacement was scheduled while the mutex
495 		 * was released, we loop.
496 		 *
497 		 * XXX It could well be we should drain existing records
498 		 * first to ensure that the timestamps and ordering
499 		 * are right.
500 		 */
501 		do_replacement_signal = 0;
502 		while (audit_replacement_flag != 0) {
503 			old_cred = audit_cred;
504 			old_vp = audit_vp;
505 			audit_cred = audit_replacement_cred;
506 			audit_vp = audit_replacement_vp;
507 			audit_replacement_cred = NULL;
508 			audit_replacement_vp = NULL;
509 			audit_replacement_flag = 0;
510 
511 			audit_enabled = (audit_vp != NULL);
512 
513 			/*
514 			 * XXX: What to do about write failures here?
515 			 */
516 			if (old_vp != NULL) {
517 				AUDIT_PRINTF(("Closing old audit file\n"));
518 				mtx_unlock(&audit_mtx);
519 				vfslocked = VFS_LOCK_GIANT(old_vp->v_mount);
520 				vn_close(old_vp, AUDIT_CLOSE_FLAGS, old_cred,
521 				    audit_td);
522 				VFS_UNLOCK_GIANT(vfslocked);
523 				crfree(old_cred);
524 				mtx_lock(&audit_mtx);
525 				old_cred = NULL;
526 				old_vp = NULL;
527 				AUDIT_PRINTF(("Audit file closed\n"));
528 			}
529 			if (audit_vp != NULL) {
530 				AUDIT_PRINTF(("Opening new audit file\n"));
531 			}
532 			do_replacement_signal = 1;
533 		}
534 		/*
535 		 * Signal that replacement have occurred to wake up and
536 		 * start any other replacements started in parallel.  We can
537 		 * continue about our business in the mean time.  We
538 		 * broadcast so that both new replacements can be inserted,
539 		 * but also so that the source(s) of replacement can return
540 		 * successfully.
541 		 */
542 		if (do_replacement_signal)
543 			cv_broadcast(&audit_replacement_cv);
544 
545 		/*
546 		 * Next, check to see if we have any records to drain into
547 		 * the vnode.  If not, go back to waiting for an event.
548 		 */
549 		if (TAILQ_EMPTY(&audit_q)) {
550 			AUDIT_PRINTF(("audit_worker waiting\n"));
551 			cv_wait(&audit_cv, &audit_mtx);
552 			AUDIT_PRINTF(("audit_worker woken up\n"));
553 	AUDIT_PRINTF(("audit_worker: new vp = %p; value of flag %d\n",
554 	    audit_replacement_vp, audit_replacement_flag));
555 			continue;
556 		}
557 
558 		/*
559 		 * If we have records, but there's no active vnode to write
560 		 * to, drain the record queue.  Generally, we prevent the
561 		 * unnecessary allocation of records elsewhere, but we need
562 		 * to allow for races between conditional allocation and
563 		 * queueing.  Go back to waiting when we're done.
564 		 */
565 		if (audit_vp == NULL) {
566 			while ((ar = TAILQ_FIRST(&audit_q))) {
567 				TAILQ_REMOVE(&audit_q, ar, k_q);
568 				uma_zfree(audit_record_zone, ar);
569 				audit_q_len--;
570 				/*
571 				 * XXXRW: Why broadcast if we hold the
572 				 * mutex and know that audit_vp is NULL?
573 				 */
574 				if (audit_q_len <= audit_qctrl.aq_lowater)
575 					cv_broadcast(&audit_commit_cv);
576 			}
577 			continue;
578 		}
579 
580 		/*
581 		 * We have both records to write and an active vnode to write
582 		 * to.  Dequeue a record, and start the write.  Eventually,
583 		 * it might make sense to dequeue several records and perform
584 		 * our own clustering, if the lower layers aren't doing it
585 		 * automatically enough.
586 		 */
587 		while ((ar = TAILQ_FIRST(&audit_q))) {
588 			TAILQ_REMOVE(&audit_q, ar, k_q);
589 			audit_q_len--;
590 			if (audit_q_len <= audit_qctrl.aq_lowater)
591 				cv_broadcast(&audit_commit_cv);
592 			TAILQ_INSERT_TAIL(&ar_worklist, ar, k_q);
593 		}
594 
595 		mtx_unlock(&audit_mtx);
596 		while ((ar = TAILQ_FIRST(&ar_worklist))) {
597 			TAILQ_REMOVE(&ar_worklist, ar, k_q);
598 			if (audit_vp != NULL) {
599 				error = audit_record_write(audit_vp, ar,
600 				    audit_cred, audit_td);
601 				if (error && audit_panic_on_write_fail)
602 					panic("audit_worker: write error %d\n",
603 					    error);
604 				else if (error)
605 					printf("audit_worker: write error %d\n",
606 					    error);
607 			}
608 			uma_zfree(audit_record_zone, ar);
609 		}
610 		mtx_lock(&audit_mtx);
611 	}
612 }
613 
614 /*
615  * Initialize the Audit subsystem: configuration state, work queue,
616  * synchronization primitives, worker thread, and trigger device node.  Also
617  * call into the BSM assembly code to initialize it.
618  */
619 static void
620 audit_init(void)
621 {
622 	int error;
623 
624 	printf("Security auditing service present\n");
625 	audit_enabled = 0;
626 	audit_suspended = 0;
627 	audit_panic_on_write_fail = 0;
628 	audit_fail_stop = 0;
629 	audit_in_failure = 0;
630 
631 	audit_replacement_vp = NULL;
632 	audit_replacement_cred = NULL;
633 	audit_replacement_flag = 0;
634 
635 	audit_fstat.af_filesz = 0;	/* '0' means unset, unbounded */
636 	audit_fstat.af_currsz = 0;
637 	audit_nae_mask.am_success = AU_NULL;
638 	audit_nae_mask.am_failure = AU_NULL;
639 
640 	TAILQ_INIT(&audit_q);
641 	audit_q_len = 0;
642 	audit_pre_q_len = 0;
643 	audit_qctrl.aq_hiwater = AQ_HIWATER;
644 	audit_qctrl.aq_lowater = AQ_LOWATER;
645 	audit_qctrl.aq_bufsz = AQ_BUFSZ;
646 	audit_qctrl.aq_minfree = AU_FS_MINFREE;
647 
648 	mtx_init(&audit_mtx, "audit_mtx", NULL, MTX_DEF);
649 	cv_init(&audit_cv, "audit_cv");
650 	cv_init(&audit_replacement_cv, "audit_replacement_cv");
651 	cv_init(&audit_commit_cv, "audit_commit_cv");
652 	cv_init(&audit_fail_cv, "audit_fail_cv");
653 
654 	audit_record_zone = uma_zcreate("audit_record_zone",
655 	    sizeof(struct kaudit_record), audit_record_ctor,
656 	    audit_record_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
657 
658 	/* Initialize the BSM audit subsystem. */
659 	kau_init();
660 
661 	audit_file_rotate_wait = 0;
662 	audit_trigger_init();
663 
664 	/* Register shutdown handler. */
665 	EVENTHANDLER_REGISTER(shutdown_pre_sync, audit_shutdown, NULL,
666 	    SHUTDOWN_PRI_FIRST);
667 
668 	error = kthread_create(audit_worker, NULL, &audit_thread, RFHIGHPID,
669 	    0, "audit_worker");
670 	if (error != 0)
671 		panic("audit_init: kthread_create returned %d", error);
672 }
673 
674 SYSINIT(audit_init, SI_SUB_AUDIT, SI_ORDER_FIRST, audit_init, NULL)
675 
676 /*
677  * audit_rotate_vnode() is called by a user or kernel thread to configure or
678  * de-configure auditing on a vnode.  The arguments are the replacement
679  * credential and vnode to substitute for the current credential and vnode,
680  * if any.  If either is set to NULL, both should be NULL, and this is used
681  * to indicate that audit is being disabled.  The real work is done in the
682  * audit_worker thread, but audit_rotate_vnode() waits synchronously for that
683  * to complete.
684  *
685  * The vnode should be referenced and opened by the caller.  The credential
686  * should be referenced.  audit_rotate_vnode() will own both references as of
687  * this call, so the caller should not release either.
688  *
689  * XXXAUDIT: Review synchronize communication logic.  Really, this is a
690  * message queue of depth 1.
691  *
692  * XXXAUDIT: Enhance the comments below to indicate that we are basically
693  * acquiring ownership of the communications queue, inserting our message,
694  * and waiting for an acknowledgement.
695  */
696 void
697 audit_rotate_vnode(struct ucred *cred, struct vnode *vp)
698 {
699 
700 	/*
701 	 * If other parallel log replacements have been requested, we wait
702 	 * until they've finished before continuing.
703 	 */
704 	mtx_lock(&audit_mtx);
705 	while (audit_replacement_flag != 0) {
706 		AUDIT_PRINTF(("audit_rotate_vnode: sleeping to wait for "
707 		    "flag\n"));
708 		cv_wait(&audit_replacement_cv, &audit_mtx);
709 		AUDIT_PRINTF(("audit_rotate_vnode: woken up (flag %d)\n",
710 		    audit_replacement_flag));
711 	}
712 	audit_replacement_cred = cred;
713 	audit_replacement_flag = 1;
714 	audit_replacement_vp = vp;
715 
716 	/*
717 	 * Wake up the audit worker to perform the exchange once we
718 	 * release the mutex.
719 	 */
720 	cv_signal(&audit_cv);
721 
722 	/*
723 	 * Wait for the audit_worker to broadcast that a replacement has
724 	 * taken place; we know that once this has happened, our vnode
725 	 * has been replaced in, so we can return successfully.
726 	 */
727 	AUDIT_PRINTF(("audit_rotate_vnode: waiting for news of "
728 	    "replacement\n"));
729 	cv_wait(&audit_replacement_cv, &audit_mtx);
730 	AUDIT_PRINTF(("audit_rotate_vnode: change acknowledged by "
731 	    "audit_worker (flag " "now %d)\n", audit_replacement_flag));
732 	mtx_unlock(&audit_mtx);
733 
734 	audit_file_rotate_wait = 0; /* We can now request another rotation */
735 }
736 
737 /*
738  * Drain the audit queue and close the log at shutdown.  Note that this can
739  * be called both from the system shutdown path and also from audit
740  * configuration syscalls, so 'arg' and 'howto' are ignored.
741  */
742 void
743 audit_shutdown(void *arg, int howto)
744 {
745 
746 	audit_rotate_vnode(NULL, NULL);
747 }
748 
749 /*
750  * Return the current thread's audit record, if any.
751  */
752 __inline__ struct kaudit_record *
753 currecord(void)
754 {
755 
756 	return (curthread->td_ar);
757 }
758 
759 /*
760  * MPSAFE
761  *
762  * XXXAUDIT: There are a number of races present in the code below due to
763  * release and re-grab of the mutex.  The code should be revised to become
764  * slightly less racy.
765  *
766  * XXXAUDIT: Shouldn't there be logic here to sleep waiting on available
767  * pre_q space, suspending the system call until there is room?
768  */
769 struct kaudit_record *
770 audit_new(int event, struct thread *td)
771 {
772 	struct kaudit_record *ar;
773 	int no_record;
774 
775 	mtx_lock(&audit_mtx);
776 	no_record = (audit_suspended || !audit_enabled);
777 	mtx_unlock(&audit_mtx);
778 	if (no_record)
779 		return (NULL);
780 
781 	/*
782 	 * XXX: The number of outstanding uncommitted audit records is
783 	 * limited to the number of concurrent threads servicing system
784 	 * calls in the kernel.
785 	 */
786 	ar = uma_zalloc_arg(audit_record_zone, td, M_WAITOK);
787 	ar->k_ar.ar_event = event;
788 
789 	mtx_lock(&audit_mtx);
790 	audit_pre_q_len++;
791 	mtx_unlock(&audit_mtx);
792 
793 	return (ar);
794 }
795 
796 /*
797  * MPSAFE
798  */
799 void
800 audit_commit(struct kaudit_record *ar, int error, int retval)
801 {
802 	int sorf;
803 	struct au_mask *aumask;
804 
805 	if (ar == NULL)
806 		return;
807 
808 	/*
809 	 * Decide whether to commit the audit record by checking the
810 	 * error value from the system call and using the appropriate
811 	 * audit mask.
812 	 *
813 	 * XXXAUDIT: Synchronize access to audit_nae_mask?
814 	 */
815 	if (ar->k_ar.ar_subj_auid == AU_DEFAUDITID)
816 		aumask = &audit_nae_mask;
817 	else
818 		aumask = &ar->k_ar.ar_subj_amask;
819 
820 	if (error)
821 		sorf = AU_PRS_FAILURE;
822 	else
823 		sorf = AU_PRS_SUCCESS;
824 
825 	switch(ar->k_ar.ar_event) {
826 
827 	case AUE_OPEN_RWTC:
828 		/* The open syscall always writes a AUE_OPEN_RWTC event; change
829 		 * it to the proper type of event based on the flags and the
830 		 * error value.
831 		 */
832 		ar->k_ar.ar_event = flags_and_error_to_openevent(
833 		    ar->k_ar.ar_arg_fflags, error);
834 		break;
835 
836 	case AUE_SYSCTL:
837 		ar->k_ar.ar_event = ctlname_to_sysctlevent(
838 		    ar->k_ar.ar_arg_ctlname, ar->k_ar.ar_valid_arg);
839 		break;
840 
841 	case AUE_AUDITON:
842 		/* Convert the auditon() command to an event */
843 		ar->k_ar.ar_event = auditon_command_event(ar->k_ar.ar_arg_cmd);
844 		break;
845 	}
846 
847 	if (au_preselect(ar->k_ar.ar_event, aumask, sorf) != 0)
848 		ar->k_ar_commit |= AR_COMMIT_KERNEL;
849 
850 	/*
851 	 * XXXRW: Why is this necessary?  Should we ever accept a record that
852 	 * we're not willing to commit?
853 	 */
854 	if ((ar->k_ar_commit & (AR_COMMIT_USER | AR_COMMIT_KERNEL)) == 0) {
855 		mtx_lock(&audit_mtx);
856 		audit_pre_q_len--;
857 		mtx_unlock(&audit_mtx);
858 		uma_zfree(audit_record_zone, ar);
859 		return;
860 	}
861 
862 	ar->k_ar.ar_errno = error;
863 	ar->k_ar.ar_retval = retval;
864 
865 	/*
866 	 * We might want to do some system-wide post-filtering
867 	 * here at some point.
868 	 */
869 
870 	/*
871 	 * Timestamp system call end.
872 	 */
873 	nanotime(&ar->k_ar.ar_endtime);
874 
875 	mtx_lock(&audit_mtx);
876 
877 	/*
878 	 * Note: it could be that some records initiated while audit was
879 	 * enabled should still be committed?
880 	 */
881 	if (audit_suspended || !audit_enabled) {
882 		audit_pre_q_len--;
883 		mtx_unlock(&audit_mtx);
884 		uma_zfree(audit_record_zone, ar);
885 		return;
886 	}
887 
888 	/*
889 	 * Constrain the number of committed audit records based on
890 	 * the configurable parameter.
891 	 */
892 	while (audit_q_len >= audit_qctrl.aq_hiwater) {
893 		AUDIT_PRINTF(("audit_commit: sleeping to wait for "
894 		   "audit queue to drain below high water mark\n"));
895 		cv_wait(&audit_commit_cv, &audit_mtx);
896 		AUDIT_PRINTF(("audit_commit: woke up waiting for "
897 		   "audit queue draining\n"));
898 	}
899 
900 	TAILQ_INSERT_TAIL(&audit_q, ar, k_q);
901 	audit_q_len++;
902 	audit_pre_q_len--;
903 	cv_signal(&audit_cv);
904 	mtx_unlock(&audit_mtx);
905 }
906 
907 /*
908  * audit_syscall_enter() is called on entry to each system call.  It is
909  * responsible for deciding whether or not to audit the call (preselection),
910  * and if so, allocating a per-thread audit record.  audit_new() will fill in
911  * basic thread/credential properties.
912  */
913 void
914 audit_syscall_enter(unsigned short code, struct thread *td)
915 {
916 	int audit_event;
917 	struct au_mask *aumask;
918 
919 	KASSERT(td->td_ar == NULL, ("audit_syscall_enter: td->td_ar != NULL"));
920 
921 	/*
922 	 * In FreeBSD, each ABI has its own system call table, and hence
923 	 * mapping of system call codes to audit events.  Convert the code to
924 	 * an audit event identifier using the process system call table
925 	 * reference.  In Darwin, there's only one, so we use the global
926 	 * symbol for the system call table.
927 	 *
928 	 * XXXAUDIT: Should we audit that a bad system call was made, and if
929 	 * so, how?
930 	 */
931 	if (code >= td->td_proc->p_sysent->sv_size)
932 		return;
933 
934 	audit_event = td->td_proc->p_sysent->sv_table[code].sy_auevent;
935 	if (audit_event == AUE_NULL)
936 		return;
937 
938 	/*
939 	 * Check which audit mask to use; either the kernel non-attributable
940 	 * event mask or the process audit mask.
941 	 */
942 	if (td->td_proc->p_au->ai_auid == AU_DEFAUDITID)
943 		aumask = &audit_nae_mask;
944 	else
945 		aumask = &td->td_proc->p_au->ai_mask;
946 
947 	/*
948 	 * Allocate an audit record, if preselection allows it, and store
949 	 * in the thread for later use.
950 	 */
951 	if (au_preselect(audit_event, aumask,
952 			AU_PRS_FAILURE | AU_PRS_SUCCESS)) {
953 		/*
954 		 * If we're out of space and need to suspend unprivileged
955 		 * processes, do that here rather than trying to allocate
956 		 * another audit record.
957 		 *
958 		 * XXXRW: We might wish to be able to continue here in the
959 		 * future, if the system recovers.  That should be possible
960 		 * by means of checking the condition in a loop around
961 		 * cv_wait().  It might be desirable to reevaluate whether an
962 		 * audit record is still required for this event by
963 		 * re-calling au_preselect().
964 		 */
965 		if (audit_in_failure && suser(td) != 0) {
966 			cv_wait(&audit_fail_cv, &audit_mtx);
967 			panic("audit_failing_stop: thread continued");
968 		}
969 		td->td_ar = audit_new(audit_event, td);
970 	} else
971 		td->td_ar = NULL;
972 }
973 
974 /*
975  * audit_syscall_exit() is called from the return of every system call, or in
976  * the event of exit1(), during the execution of exit1().  It is responsible
977  * for committing the audit record, if any, along with return condition.
978  */
979 void
980 audit_syscall_exit(int error, struct thread *td)
981 {
982 	int retval;
983 
984 	/*
985 	 * Commit the audit record as desired; once we pass the record
986 	 * into audit_commit(), the memory is owned by the audit
987 	 * subsystem.
988 	 * The return value from the system call is stored on the user
989 	 * thread. If there was an error, the return value is set to -1,
990 	 * imitating the behavior of the cerror routine.
991 	 */
992 	if (error)
993 		retval = -1;
994 	else
995 		retval = td->td_retval[0];
996 
997 	audit_commit(td->td_ar, error, retval);
998 	if (td->td_ar != NULL)
999 		AUDIT_PRINTF(("audit record committed by pid %d\n",
1000 			td->td_proc->p_pid));
1001 	td->td_ar = NULL;
1002 
1003 }
1004 
1005 /*
1006  * Allocate storage for a new process (init, or otherwise).
1007  */
1008 void
1009 audit_proc_alloc(struct proc *p)
1010 {
1011 
1012 	KASSERT(p->p_au == NULL, ("audit_proc_alloc: p->p_au != NULL (%d)",
1013 	    p->p_pid));
1014 	p->p_au = malloc(sizeof(*(p->p_au)), M_AUDITPROC, M_WAITOK);
1015 	/* XXXAUDIT: Zero?  Slab allocate? */
1016 	//printf("audit_proc_alloc: pid %d p_au %p\n", p->p_pid, p->p_au);
1017 }
1018 
1019 /*
1020  * Allocate storage for a new thread.
1021  */
1022 void
1023 audit_thread_alloc(struct thread *td)
1024 {
1025 
1026 	td->td_ar = NULL;
1027 }
1028 
1029 /*
1030  * Thread destruction.
1031  */
1032 void
1033 audit_thread_free(struct thread *td)
1034 {
1035 
1036 	KASSERT(td->td_ar == NULL, ("audit_thread_free: td_ar != NULL"));
1037 }
1038 
1039 /*
1040  * Initialize the audit information for the a process, presumably the first
1041  * process in the system.
1042  * XXX It is not clear what the initial values should be for audit ID,
1043  * session ID, etc.
1044  */
1045 void
1046 audit_proc_kproc0(struct proc *p)
1047 {
1048 
1049 	KASSERT(p->p_au != NULL, ("audit_proc_kproc0: p->p_au == NULL (%d)",
1050 	    p->p_pid));
1051 	//printf("audit_proc_kproc0: pid %d p_au %p\n", p->p_pid, p->p_au);
1052 	bzero(p->p_au, sizeof(*(p)->p_au));
1053 }
1054 
1055 void
1056 audit_proc_init(struct proc *p)
1057 {
1058 
1059 	KASSERT(p->p_au != NULL, ("audit_proc_init: p->p_au == NULL (%d)",
1060 	    p->p_pid));
1061 	//printf("audit_proc_init: pid %d p_au %p\n", p->p_pid, p->p_au);
1062 	bzero(p->p_au, sizeof(*(p)->p_au));
1063 	p->p_au->ai_auid = AU_DEFAUDITID;
1064 }
1065 
1066 /*
1067  * Copy the audit info from the parent process to the child process when
1068  * a fork takes place.
1069  */
1070 void
1071 audit_proc_fork(struct proc *parent, struct proc *child)
1072 {
1073 
1074 	PROC_LOCK_ASSERT(parent, MA_OWNED);
1075 	PROC_LOCK_ASSERT(child, MA_OWNED);
1076 	KASSERT(parent->p_au != NULL,
1077 	    ("audit_proc_fork: parent->p_au == NULL (%d)", parent->p_pid));
1078 	KASSERT(child->p_au != NULL,
1079 	    ("audit_proc_fork: child->p_au == NULL (%d)", child->p_pid));
1080 	//printf("audit_proc_fork: parent pid %d p_au %p\n", parent->p_pid,
1081 	//    parent->p_au);
1082 	//printf("audit_proc_fork: child pid %d p_au %p\n", child->p_pid,
1083 	//    child->p_au);
1084 	bcopy(parent->p_au, child->p_au, sizeof(*child->p_au));
1085 	/*
1086 	 * XXXAUDIT: Zero pointers to external memory, or assert they are
1087 	 * zero?
1088 	 */
1089 }
1090 
1091 /*
1092  * Free the auditing structure for the process.
1093  */
1094 void
1095 audit_proc_free(struct proc *p)
1096 {
1097 
1098 	KASSERT(p->p_au != NULL, ("p->p_au == NULL (%d)", p->p_pid));
1099 	//printf("audit_proc_free: pid %d p_au %p\n", p->p_pid, p->p_au);
1100 	/*
1101 	 * XXXAUDIT: Assert that external memory pointers are NULL?
1102 	 */
1103 	free(p->p_au, M_AUDITPROC);
1104 	p->p_au = NULL;
1105 }
1106