xref: /freebsd/sys/dev/hwpmc/hwpmc_logging.c (revision 63d1fd5970ec814904aa0f4580b10a0d302d08b2)
1 /*-
2  * Copyright (c) 2005-2007 Joseph Koshy
3  * Copyright (c) 2007 The FreeBSD Foundation
4  * All rights reserved.
5  *
6  * Portions of this software were developed by A. Joseph Koshy under
7  * sponsorship from the FreeBSD Foundation and Google, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  *
30  */
31 
32 /*
33  * Logging code for hwpmc(4)
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include <sys/param.h>
40 #include <sys/capsicum.h>
41 #include <sys/file.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/lock.h>
45 #include <sys/module.h>
46 #include <sys/mutex.h>
47 #include <sys/pmc.h>
48 #include <sys/pmckern.h>
49 #include <sys/pmclog.h>
50 #include <sys/proc.h>
51 #include <sys/signalvar.h>
52 #include <sys/sysctl.h>
53 #include <sys/systm.h>
54 #include <sys/uio.h>
55 #include <sys/unistd.h>
56 #include <sys/vnode.h>
57 
58 /*
59  * Sysctl tunables
60  */
61 
62 SYSCTL_DECL(_kern_hwpmc);
63 
64 /*
65  * kern.hwpmc.logbuffersize -- size of the per-cpu owner buffers.
66  */
67 
68 static int pmclog_buffer_size = PMC_LOG_BUFFER_SIZE;
69 #if (__FreeBSD_version < 1100000)
70 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "logbuffersize", &pmclog_buffer_size);
71 #endif
72 SYSCTL_INT(_kern_hwpmc, OID_AUTO, logbuffersize, CTLFLAG_RDTUN,
73     &pmclog_buffer_size, 0, "size of log buffers in kilobytes");
74 
75 /*
76  * kern.hwpmc.nbuffer -- number of global log buffers
77  */
78 
79 static int pmc_nlogbuffers = PMC_NLOGBUFFERS;
80 #if (__FreeBSD_version < 1100000)
81 TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nbuffers", &pmc_nlogbuffers);
82 #endif
83 SYSCTL_INT(_kern_hwpmc, OID_AUTO, nbuffers, CTLFLAG_RDTUN,
84     &pmc_nlogbuffers, 0, "number of global log buffers");
85 
86 /*
87  * Global log buffer list and associated spin lock.
88  */
89 
90 TAILQ_HEAD(, pmclog_buffer) pmc_bufferlist =
91 	TAILQ_HEAD_INITIALIZER(pmc_bufferlist);
92 static struct mtx pmc_bufferlist_mtx;	/* spin lock */
93 static struct mtx pmc_kthread_mtx;	/* sleep lock */
94 
/*
 * (Re)initialize the descriptor at the head of a log buffer.
 *
 * 'D' points at the start of an allocation of 1024*pmclog_buffer_size
 * bytes.  The descriptor occupies the front of that allocation; record
 * storage starts at the first uint32_t-aligned offset after it.  After
 * this macro runs the buffer is empty (plb_ptr == plb_base) and
 * plb_fence points one past the end of the allocation.
 *
 * The argument is evaluated exactly once.
 */
#define	PMCLOG_INIT_BUFFER_DESCRIPTOR(D) do {				\
		struct pmclog_buffer *const _plb = (D);			\
		const size_t _plb_hdrlen = roundup(sizeof(*_plb),	\
			sizeof(uint32_t));				\
		_plb->plb_fence = ((char *) _plb) +			\
			 1024*pmclog_buffer_size;			\
		_plb->plb_base  = _plb->plb_ptr = ((char *) _plb) +	\
			_plb_hdrlen;					\
	} while (0)
103 
104 
105 /*
106  * Log file record constructors.
107  */
/*
 * Build the first word of a log record: magic in the top byte, the
 * record type in the next byte and the record length in the low 16 bits.
 */
#define	_PMCLOG_TO_HEADER(T,L)						\
	((PMCLOG_HEADER_MAGIC << 24) |					\
	 (PMCLOG_TYPE_ ## T << 16)   |					\
	 ((L) & 0xFFFF))

/*
 * Reserve LEN bytes of space (rounded up to a multiple of 4) and
 * initialize the entry header.
 *
 * This macro opens a 'do {' block that declares the cursor '_le'; the
 * block is closed by PMCLOG_DESPATCH(), so the two must always be used
 * as a pair within one function.  If no space can be reserved, ACTION
 * is executed and must transfer control out of the block.
 */
#define	_PMCLOG_RESERVE(PO,TYPE,LEN,ACTION) do {			\
		uint32_t *_le;						\
		int _len = roundup((LEN), sizeof(uint32_t));		\
		if ((_le = pmclog_reserve((PO), _len)) == NULL) {	\
			ACTION;						\
		}							\
		*_le = _PMCLOG_TO_HEADER(TYPE,_len);			\
		_le += 3	/* skip over timestamp */

/* Reserve a record; silently return from the calling function on failure. */
#define	PMCLOG_RESERVE(P,T,L)		_PMCLOG_RESERVE(P,T,L,return)
/*
 * Reserve a record; on failure set the caller's 'error' variable to
 * ENOMEM and jump to the caller's 'error' label.
 */
#define	PMCLOG_RESERVE_WITH_ERROR(P,T,L) _PMCLOG_RESERVE(P,T,L,		\
	error=ENOMEM;goto error)

/* Append one 32-bit word at the cursor and advance it. */
#define	PMCLOG_EMIT32(V)	do { *_le++ = (V); } while (0)
/*
 * Append a 64-bit value as two 32-bit words, low word first.  The
 * argument is evaluated once and widened to 64 bits before shifting, so
 * expressions with side effects and operands narrower than 64 bits are
 * handled safely.
 */
#define	PMCLOG_EMIT64(V)	do { 					\
		const uint64_t _v64 = (uint64_t) (V);			\
		*_le++ = (uint32_t) (_v64 & 0xFFFFFFFF);		\
		*_le++ = (uint32_t) ((_v64 >> 32) & 0xFFFFFFFF);	\
	} while (0)


/* Emit a string.  Caution: does NOT update _le, so needs to be last */
#define	PMCLOG_EMITSTRING(S,L)	do { bcopy((S), _le, (L)); } while (0)
/* Emit L zero bytes.  Like PMCLOG_EMITSTRING, does NOT update _le. */
#define	PMCLOG_EMITNULLSTRING(L) do { bzero(_le, (L)); } while (0)

/*
 * Finish the record started by PMCLOG_RESERVE*(): release the owner
 * (which pmclog_reserve() left locked) and close the 'do {' block.
 */
#define	PMCLOG_DESPATCH(PO)						\
		pmclog_release((PO));					\
	} while (0)
141 
142 
143 /*
144  * Assertions about the log file format.
145  */
146 
147 CTASSERT(sizeof(struct pmclog_callchain) == 6*4 +
148     PMC_CALLCHAIN_DEPTH_MAX*sizeof(uintfptr_t));
149 CTASSERT(sizeof(struct pmclog_closelog) == 3*4);
150 CTASSERT(sizeof(struct pmclog_dropnotify) == 3*4);
151 CTASSERT(sizeof(struct pmclog_map_in) == PATH_MAX +
152     4*4 + sizeof(uintfptr_t));
153 CTASSERT(offsetof(struct pmclog_map_in,pl_pathname) ==
154     4*4 + sizeof(uintfptr_t));
155 CTASSERT(sizeof(struct pmclog_map_out) == 4*4 + 2*sizeof(uintfptr_t));
156 CTASSERT(sizeof(struct pmclog_pcsample) == 6*4 + sizeof(uintfptr_t));
157 CTASSERT(sizeof(struct pmclog_pmcallocate) == 6*4);
158 CTASSERT(sizeof(struct pmclog_pmcattach) == 5*4 + PATH_MAX);
159 CTASSERT(offsetof(struct pmclog_pmcattach,pl_pathname) == 5*4);
160 CTASSERT(sizeof(struct pmclog_pmcdetach) == 5*4);
161 CTASSERT(sizeof(struct pmclog_proccsw) == 5*4 + 8);
162 CTASSERT(sizeof(struct pmclog_procexec) == 5*4 + PATH_MAX +
163     sizeof(uintfptr_t));
164 CTASSERT(offsetof(struct pmclog_procexec,pl_pathname) == 5*4 +
165     sizeof(uintfptr_t));
166 CTASSERT(sizeof(struct pmclog_procexit) == 5*4 + 8);
167 CTASSERT(sizeof(struct pmclog_procfork) == 5*4);
168 CTASSERT(sizeof(struct pmclog_sysexit) == 4*4);
169 CTASSERT(sizeof(struct pmclog_userdata) == 4*4);
170 
171 /*
172  * Log buffer structure
173  */
174 
175 struct pmclog_buffer {
176 	TAILQ_ENTRY(pmclog_buffer) plb_next;
177 	char 		*plb_base;
178 	char		*plb_ptr;
179 	char 		*plb_fence;
180 };
181 
182 /*
183  * Prototypes
184  */
185 
/* Record space management. */
static int pmclog_get_buffer(struct pmc_owner *po);
static uint32_t *pmclog_reserve(struct pmc_owner *po, int length);
static void pmclog_release(struct pmc_owner *po);
static void pmclog_schedule_io(struct pmc_owner *po);

/* Helper kthread. */
static void pmclog_loop(void *arg);
static void pmclog_stop_kthread(struct pmc_owner *po);
192 
193 /*
194  * Helper functions
195  */
196 
197 /*
198  * Get a log buffer
199  */
200 
201 static int
202 pmclog_get_buffer(struct pmc_owner *po)
203 {
204 	struct pmclog_buffer *plb;
205 
206 	mtx_assert(&po->po_mtx, MA_OWNED);
207 
208 	KASSERT(po->po_curbuf == NULL,
209 	    ("[pmclog,%d] po=%p current buffer still valid", __LINE__, po));
210 
211 	mtx_lock_spin(&pmc_bufferlist_mtx);
212 	if ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL)
213 		TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next);
214 	mtx_unlock_spin(&pmc_bufferlist_mtx);
215 
216 	PMCDBG2(LOG,GTB,1, "po=%p plb=%p", po, plb);
217 
218 #ifdef	HWPMC_DEBUG
219 	if (plb)
220 		KASSERT(plb->plb_ptr == plb->plb_base &&
221 		    plb->plb_base < plb->plb_fence,
222 		    ("[pmclog,%d] po=%p buffer invariants: ptr=%p "
223 		    "base=%p fence=%p", __LINE__, po, plb->plb_ptr,
224 		    plb->plb_base, plb->plb_fence));
225 #endif
226 
227 	po->po_curbuf = plb;
228 
229 	/* update stats */
230 	atomic_add_int(&pmc_stats.pm_buffer_requests, 1);
231 	if (plb == NULL)
232 		atomic_add_int(&pmc_stats.pm_buffer_requests_failed, 1);
233 
234 	return (plb ? 0 : ENOMEM);
235 }
236 
237 /*
238  * Log handler loop.
239  *
240  * This function is executed by each pmc owner's helper thread.
241  */
242 
243 static void
244 pmclog_loop(void *arg)
245 {
246 	int error;
247 	struct pmc_owner *po;
248 	struct pmclog_buffer *lb;
249 	struct proc *p;
250 	struct ucred *ownercred;
251 	struct ucred *mycred;
252 	struct thread *td;
253 	struct uio auio;
254 	struct iovec aiov;
255 	size_t nbytes;
256 
257 	po = (struct pmc_owner *) arg;
258 	p = po->po_owner;
259 	td = curthread;
260 	mycred = td->td_ucred;
261 
262 	PROC_LOCK(p);
263 	ownercred = crhold(p->p_ucred);
264 	PROC_UNLOCK(p);
265 
266 	PMCDBG2(LOG,INI,1, "po=%p kt=%p", po, po->po_kthread);
267 	KASSERT(po->po_kthread == curthread->td_proc,
268 	    ("[pmclog,%d] proc mismatch po=%p po/kt=%p curproc=%p", __LINE__,
269 		po, po->po_kthread, curthread->td_proc));
270 
271 	lb = NULL;
272 
273 
274 	/*
275 	 * Loop waiting for I/O requests to be added to the owner
276 	 * struct's queue.  The loop is exited when the log file
277 	 * is deconfigured.
278 	 */
279 
280 	mtx_lock(&pmc_kthread_mtx);
281 
282 	for (;;) {
283 
284 		/* check if we've been asked to exit */
285 		if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
286 			break;
287 
288 		if (lb == NULL) { /* look for a fresh buffer to write */
289 			mtx_lock_spin(&po->po_mtx);
290 			if ((lb = TAILQ_FIRST(&po->po_logbuffers)) == NULL) {
291 				mtx_unlock_spin(&po->po_mtx);
292 
293 				/* No more buffers and shutdown required. */
294 				if (po->po_flags & PMC_PO_SHUTDOWN) {
295 					mtx_unlock(&pmc_kthread_mtx);
296 					/*
297 			 		 * Close the file to get PMCLOG_EOF
298 					 * error in pmclog(3).
299 					 */
300 					fo_close(po->po_file, curthread);
301 					mtx_lock(&pmc_kthread_mtx);
302 					break;
303 				}
304 
305 				(void) msleep(po, &pmc_kthread_mtx, PWAIT,
306 				    "pmcloop", 0);
307 				continue;
308 			}
309 
310 			TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next);
311 			mtx_unlock_spin(&po->po_mtx);
312 		}
313 
314 		mtx_unlock(&pmc_kthread_mtx);
315 
316 		/* process the request */
317 		PMCDBG3(LOG,WRI,2, "po=%p base=%p ptr=%p", po,
318 		    lb->plb_base, lb->plb_ptr);
319 		/* change our thread's credentials before issuing the I/O */
320 
321 		aiov.iov_base = lb->plb_base;
322 		aiov.iov_len  = nbytes = lb->plb_ptr - lb->plb_base;
323 
324 		auio.uio_iov    = &aiov;
325 		auio.uio_iovcnt = 1;
326 		auio.uio_offset = -1;
327 		auio.uio_resid  = nbytes;
328 		auio.uio_rw     = UIO_WRITE;
329 		auio.uio_segflg = UIO_SYSSPACE;
330 		auio.uio_td     = td;
331 
332 		/* switch thread credentials -- see kern_ktrace.c */
333 		td->td_ucred = ownercred;
334 		error = fo_write(po->po_file, &auio, ownercred, 0, td);
335 		td->td_ucred = mycred;
336 
337 		if (error) {
338 			/* XXX some errors are recoverable */
339 			/* send a SIGIO to the owner and exit */
340 			PROC_LOCK(p);
341 			kern_psignal(p, SIGIO);
342 			PROC_UNLOCK(p);
343 
344 			mtx_lock(&pmc_kthread_mtx);
345 
346 			po->po_error = error; /* save for flush log */
347 
348 			PMCDBG2(LOG,WRI,2, "po=%p error=%d", po, error);
349 
350 			break;
351 		}
352 
353 		mtx_lock(&pmc_kthread_mtx);
354 
355 		/* put the used buffer back into the global pool */
356 		PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
357 
358 		mtx_lock_spin(&pmc_bufferlist_mtx);
359 		TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
360 		mtx_unlock_spin(&pmc_bufferlist_mtx);
361 
362 		lb = NULL;
363 	}
364 
365 	wakeup_one(po->po_kthread);
366 	po->po_kthread = NULL;
367 
368 	mtx_unlock(&pmc_kthread_mtx);
369 
370 	/* return the current I/O buffer to the global pool */
371 	if (lb) {
372 		PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
373 
374 		mtx_lock_spin(&pmc_bufferlist_mtx);
375 		TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
376 		mtx_unlock_spin(&pmc_bufferlist_mtx);
377 	}
378 
379 	/*
380 	 * Exit this thread, signalling the waiter
381 	 */
382 
383 	crfree(ownercred);
384 
385 	kproc_exit(0);
386 }
387 
388 /*
 * Release a log entry and schedule an I/O if needed.
390  */
391 
392 static void
393 pmclog_release(struct pmc_owner *po)
394 {
395 	KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base,
396 	    ("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__,
397 		po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base));
398 	KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
399 	    ("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__,
400 		po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence));
401 
402 	/* schedule an I/O if we've filled a buffer */
403 	if (po->po_curbuf->plb_ptr >= po->po_curbuf->plb_fence)
404 		pmclog_schedule_io(po);
405 
406 	mtx_unlock_spin(&po->po_mtx);
407 
408 	PMCDBG1(LOG,REL,1, "po=%p", po);
409 }
410 
411 
412 /*
413  * Attempt to reserve 'length' bytes of space in an owner's log
414  * buffer.  The function returns a pointer to 'length' bytes of space
415  * if there was enough space or returns NULL if no space was
416  * available.  Non-null returns do so with the po mutex locked.  The
417  * caller must invoke pmclog_release() on the pmc owner structure
418  * when done.
419  */
420 
421 static uint32_t *
422 pmclog_reserve(struct pmc_owner *po, int length)
423 {
424 	uintptr_t newptr, oldptr;
425 	uint32_t *lh;
426 	struct timespec ts;
427 
428 	PMCDBG2(LOG,ALL,1, "po=%p len=%d", po, length);
429 
430 	KASSERT(length % sizeof(uint32_t) == 0,
431 	    ("[pmclog,%d] length not a multiple of word size", __LINE__));
432 
433 	mtx_lock_spin(&po->po_mtx);
434 
435 	/* No more data when shutdown in progress. */
436 	if (po->po_flags & PMC_PO_SHUTDOWN) {
437 		mtx_unlock_spin(&po->po_mtx);
438 		return (NULL);
439 	}
440 
441 	if (po->po_curbuf == NULL)
442 		if (pmclog_get_buffer(po) != 0) {
443 			mtx_unlock_spin(&po->po_mtx);
444 			return (NULL);
445 		}
446 
447 	KASSERT(po->po_curbuf != NULL,
448 	    ("[pmclog,%d] po=%p no current buffer", __LINE__, po));
449 
450 	KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base &&
451 	    po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
452 	    ("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p",
453 		__LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base,
454 		po->po_curbuf->plb_fence));
455 
456 	oldptr = (uintptr_t) po->po_curbuf->plb_ptr;
457 	newptr = oldptr + length;
458 
459 	KASSERT(oldptr != (uintptr_t) NULL,
460 	    ("[pmclog,%d] po=%p Null log buffer pointer", __LINE__, po));
461 
462 	/*
463 	 * If we have space in the current buffer, return a pointer to
464 	 * available space with the PO structure locked.
465 	 */
466 	if (newptr <= (uintptr_t) po->po_curbuf->plb_fence) {
467 		po->po_curbuf->plb_ptr = (char *) newptr;
468 		goto done;
469 	}
470 
471 	/*
472 	 * Otherwise, schedule the current buffer for output and get a
473 	 * fresh buffer.
474 	 */
475 	pmclog_schedule_io(po);
476 
477 	if (pmclog_get_buffer(po) != 0) {
478 		mtx_unlock_spin(&po->po_mtx);
479 		return (NULL);
480 	}
481 
482 	KASSERT(po->po_curbuf != NULL,
483 	    ("[pmclog,%d] po=%p no current buffer", __LINE__, po));
484 
485 	KASSERT(po->po_curbuf->plb_ptr != NULL,
486 	    ("[pmclog,%d] null return from pmc_get_log_buffer", __LINE__));
487 
488 	KASSERT(po->po_curbuf->plb_ptr == po->po_curbuf->plb_base &&
489 	    po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
490 	    ("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p",
491 		__LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base,
492 		po->po_curbuf->plb_fence));
493 
494 	oldptr = (uintptr_t) po->po_curbuf->plb_ptr;
495 
496  done:
497 	lh = (uint32_t *) oldptr;
498 	lh++;				/* skip header */
499 	getnanotime(&ts);		/* fill in the timestamp */
500 	*lh++ = ts.tv_sec & 0xFFFFFFFF;
501 	*lh++ = ts.tv_nsec & 0xFFFFFFF;
502 	return ((uint32_t *) oldptr);
503 }
504 
505 /*
506  * Schedule an I/O.
507  *
508  * Transfer the current buffer to the helper kthread.
509  */
510 
511 static void
512 pmclog_schedule_io(struct pmc_owner *po)
513 {
514 	KASSERT(po->po_curbuf != NULL,
515 	    ("[pmclog,%d] schedule_io with null buffer po=%p", __LINE__, po));
516 
517 	KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base,
518 	    ("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__,
519 		po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base));
520 	KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
521 	    ("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__,
522 		po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence));
523 
524 	PMCDBG1(LOG,SIO, 1, "po=%p", po);
525 
526 	mtx_assert(&po->po_mtx, MA_OWNED);
527 
528 	/*
529 	 * Add the current buffer to the tail of the buffer list and
530 	 * wakeup the helper.
531 	 */
532 	TAILQ_INSERT_TAIL(&po->po_logbuffers, po->po_curbuf, plb_next);
533 	po->po_curbuf = NULL;
534 	wakeup_one(po);
535 }
536 
537 /*
538  * Stop the helper kthread.
539  */
540 
541 static void
542 pmclog_stop_kthread(struct pmc_owner *po)
543 {
544 	/*
545 	 * Close the file to force the thread out of fo_write,
546 	 * unset flag, wakeup the helper thread,
547 	 * wait for it to exit
548 	 */
549 
550 	if (po->po_file != NULL)
551 		fo_close(po->po_file, curthread);
552 
553 	mtx_lock(&pmc_kthread_mtx);
554 	po->po_flags &= ~PMC_PO_OWNS_LOGFILE;
555 	wakeup_one(po);
556 	if (po->po_kthread)
557 		msleep(po->po_kthread, &pmc_kthread_mtx, PPAUSE, "pmckstp", 0);
558 	mtx_unlock(&pmc_kthread_mtx);
559 }
560 
561 /*
562  * Public functions
563  */
564 
565 /*
566  * Configure a log file for pmc owner 'po'.
567  *
568  * Parameter 'logfd' is a file handle referencing an open file in the
569  * owner process.  This file needs to have been opened for writing.
570  */
571 
572 int
573 pmclog_configure_log(struct pmc_mdep *md, struct pmc_owner *po, int logfd)
574 {
575 	int error;
576 	struct proc *p;
577 	cap_rights_t rights;
578 	/*
579 	 * As long as it is possible to get a LOR between pmc_sx lock and
580 	 * proctree/allproc sx locks used for adding a new process, assure
581 	 * the former is not held here.
582 	 */
583 	sx_assert(&pmc_sx, SA_UNLOCKED);
584 	PMCDBG2(LOG,CFG,1, "config po=%p logfd=%d", po, logfd);
585 
586 	p = po->po_owner;
587 
588 	/* return EBUSY if a log file was already present */
589 	if (po->po_flags & PMC_PO_OWNS_LOGFILE)
590 		return (EBUSY);
591 
592 	KASSERT(po->po_kthread == NULL,
593 	    ("[pmclog,%d] po=%p kthread (%p) already present", __LINE__, po,
594 		po->po_kthread));
595 	KASSERT(po->po_file == NULL,
596 	    ("[pmclog,%d] po=%p file (%p) already present", __LINE__, po,
597 		po->po_file));
598 
599 	/* get a reference to the file state */
600 	error = fget_write(curthread, logfd,
601 	    cap_rights_init(&rights, CAP_WRITE), &po->po_file);
602 	if (error)
603 		goto error;
604 
605 	/* mark process as owning a log file */
606 	po->po_flags |= PMC_PO_OWNS_LOGFILE;
607 	error = kproc_create(pmclog_loop, po, &po->po_kthread,
608 	    RFHIGHPID, 0, "hwpmc: proc(%d)", p->p_pid);
609 	if (error)
610 		goto error;
611 
612 	/* mark process as using HWPMCs */
613 	PROC_LOCK(p);
614 	p->p_flag |= P_HWPMC;
615 	PROC_UNLOCK(p);
616 
617 	/* create a log initialization entry */
618 	PMCLOG_RESERVE_WITH_ERROR(po, INITIALIZE,
619 	    sizeof(struct pmclog_initialize));
620 	PMCLOG_EMIT32(PMC_VERSION);
621 	PMCLOG_EMIT32(md->pmd_cputype);
622 	PMCLOG_DESPATCH(po);
623 
624 	return (0);
625 
626  error:
627 	/* shutdown the thread */
628 	if (po->po_kthread)
629 		pmclog_stop_kthread(po);
630 
631 	KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread not "
632 	    "stopped", __LINE__, po));
633 
634 	if (po->po_file)
635 		(void) fdrop(po->po_file, curthread);
636 	po->po_file  = NULL;	/* clear file and error state */
637 	po->po_error = 0;
638 
639 	return (error);
640 }
641 
642 
643 /*
644  * De-configure a log file.  This will throw away any buffers queued
645  * for this owner process.
646  */
647 
648 int
649 pmclog_deconfigure_log(struct pmc_owner *po)
650 {
651 	int error;
652 	struct pmclog_buffer *lb;
653 
654 	PMCDBG1(LOG,CFG,1, "de-config po=%p", po);
655 
656 	if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
657 		return (EINVAL);
658 
659 	KASSERT(po->po_sscount == 0,
660 	    ("[pmclog,%d] po=%p still owning SS PMCs", __LINE__, po));
661 	KASSERT(po->po_file != NULL,
662 	    ("[pmclog,%d] po=%p no log file", __LINE__, po));
663 
664 	/* stop the kthread, this will reset the 'OWNS_LOGFILE' flag */
665 	pmclog_stop_kthread(po);
666 
667 	KASSERT(po->po_kthread == NULL,
668 	    ("[pmclog,%d] po=%p kthread not stopped", __LINE__, po));
669 
670 	/* return all queued log buffers to the global pool */
671 	while ((lb = TAILQ_FIRST(&po->po_logbuffers)) != NULL) {
672 		TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next);
673 		PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
674 		mtx_lock_spin(&pmc_bufferlist_mtx);
675 		TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
676 		mtx_unlock_spin(&pmc_bufferlist_mtx);
677 	}
678 
679 	/* return the 'current' buffer to the global pool */
680 	if ((lb = po->po_curbuf) != NULL) {
681 		PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
682 		mtx_lock_spin(&pmc_bufferlist_mtx);
683 		TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
684 		mtx_unlock_spin(&pmc_bufferlist_mtx);
685 	}
686 
687 	/* drop a reference to the fd */
688 	error = fdrop(po->po_file, curthread);
689 	po->po_file  = NULL;
690 	po->po_error = 0;
691 
692 	return (error);
693 }
694 
695 /*
696  * Flush a process' log buffer.
697  */
698 
699 int
700 pmclog_flush(struct pmc_owner *po)
701 {
702 	int error;
703 	struct pmclog_buffer *lb;
704 
705 	PMCDBG1(LOG,FLS,1, "po=%p", po);
706 
707 	/*
708 	 * If there is a pending error recorded by the logger thread,
709 	 * return that.
710 	 */
711 	if (po->po_error)
712 		return (po->po_error);
713 
714 	error = 0;
715 
716 	/*
717 	 * Check that we do have an active log file.
718 	 */
719 	mtx_lock(&pmc_kthread_mtx);
720 	if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) {
721 		error = EINVAL;
722 		goto error;
723 	}
724 
725 	/*
726 	 * Schedule the current buffer if any and not empty.
727 	 */
728 	mtx_lock_spin(&po->po_mtx);
729 	lb = po->po_curbuf;
730 	if (lb && lb->plb_ptr != lb->plb_base) {
731 		pmclog_schedule_io(po);
732 	} else
733 		error = ENOBUFS;
734 	mtx_unlock_spin(&po->po_mtx);
735 
736  error:
737 	mtx_unlock(&pmc_kthread_mtx);
738 
739 	return (error);
740 }
741 
742 int
743 pmclog_close(struct pmc_owner *po)
744 {
745 
746 	PMCDBG1(LOG,CLO,1, "po=%p", po);
747 
748 	mtx_lock(&pmc_kthread_mtx);
749 
750 	/*
751 	 * Schedule the current buffer.
752 	 */
753 	mtx_lock_spin(&po->po_mtx);
754 	if (po->po_curbuf)
755 		pmclog_schedule_io(po);
756 	else
757 		wakeup_one(po);
758 	mtx_unlock_spin(&po->po_mtx);
759 
760 	/*
761 	 * Initiate shutdown: no new data queued,
762 	 * thread will close file on last block.
763 	 */
764 	po->po_flags |= PMC_PO_SHUTDOWN;
765 
766 	mtx_unlock(&pmc_kthread_mtx);
767 
768 	return (0);
769 }
770 
771 void
772 pmclog_process_callchain(struct pmc *pm, struct pmc_sample *ps)
773 {
774 	int n, recordlen;
775 	uint32_t flags;
776 	struct pmc_owner *po;
777 
778 	PMCDBG3(LOG,SAM,1,"pm=%p pid=%d n=%d", pm, ps->ps_pid,
779 	    ps->ps_nsamples);
780 
781 	recordlen = offsetof(struct pmclog_callchain, pl_pc) +
782 	    ps->ps_nsamples * sizeof(uintfptr_t);
783 	po = pm->pm_owner;
784 	flags = PMC_CALLCHAIN_TO_CPUFLAGS(ps->ps_cpu,ps->ps_flags);
785 	PMCLOG_RESERVE(po, CALLCHAIN, recordlen);
786 	PMCLOG_EMIT32(ps->ps_pid);
787 	PMCLOG_EMIT32(pm->pm_id);
788 	PMCLOG_EMIT32(flags);
789 	for (n = 0; n < ps->ps_nsamples; n++)
790 		PMCLOG_EMITADDR(ps->ps_pc[n]);
791 	PMCLOG_DESPATCH(po);
792 }
793 
794 void
795 pmclog_process_closelog(struct pmc_owner *po)
796 {
797 	PMCLOG_RESERVE(po,CLOSELOG,sizeof(struct pmclog_closelog));
798 	PMCLOG_DESPATCH(po);
799 }
800 
801 void
802 pmclog_process_dropnotify(struct pmc_owner *po)
803 {
804 	PMCLOG_RESERVE(po,DROPNOTIFY,sizeof(struct pmclog_dropnotify));
805 	PMCLOG_DESPATCH(po);
806 }
807 
808 void
809 pmclog_process_map_in(struct pmc_owner *po, pid_t pid, uintfptr_t start,
810     const char *path)
811 {
812 	int pathlen, recordlen;
813 
814 	KASSERT(path != NULL, ("[pmclog,%d] map-in, null path", __LINE__));
815 
816 	pathlen = strlen(path) + 1;	/* #bytes for path name */
817 	recordlen = offsetof(struct pmclog_map_in, pl_pathname) +
818 	    pathlen;
819 
820 	PMCLOG_RESERVE(po, MAP_IN, recordlen);
821 	PMCLOG_EMIT32(pid);
822 	PMCLOG_EMITADDR(start);
823 	PMCLOG_EMITSTRING(path,pathlen);
824 	PMCLOG_DESPATCH(po);
825 }
826 
827 void
828 pmclog_process_map_out(struct pmc_owner *po, pid_t pid, uintfptr_t start,
829     uintfptr_t end)
830 {
831 	KASSERT(start <= end, ("[pmclog,%d] start > end", __LINE__));
832 
833 	PMCLOG_RESERVE(po, MAP_OUT, sizeof(struct pmclog_map_out));
834 	PMCLOG_EMIT32(pid);
835 	PMCLOG_EMITADDR(start);
836 	PMCLOG_EMITADDR(end);
837 	PMCLOG_DESPATCH(po);
838 }
839 
840 void
841 pmclog_process_pmcallocate(struct pmc *pm)
842 {
843 	struct pmc_owner *po;
844 	struct pmc_soft *ps;
845 
846 	po = pm->pm_owner;
847 
848 	PMCDBG1(LOG,ALL,1, "pm=%p", pm);
849 
850 	if (PMC_TO_CLASS(pm) == PMC_CLASS_SOFT) {
851 		PMCLOG_RESERVE(po, PMCALLOCATEDYN,
852 		    sizeof(struct pmclog_pmcallocatedyn));
853 		PMCLOG_EMIT32(pm->pm_id);
854 		PMCLOG_EMIT32(pm->pm_event);
855 		PMCLOG_EMIT32(pm->pm_flags);
856 		ps = pmc_soft_ev_acquire(pm->pm_event);
857 		if (ps != NULL)
858 			PMCLOG_EMITSTRING(ps->ps_ev.pm_ev_name,PMC_NAME_MAX);
859 		else
860 			PMCLOG_EMITNULLSTRING(PMC_NAME_MAX);
861 		pmc_soft_ev_release(ps);
862 		PMCLOG_DESPATCH(po);
863 	} else {
864 		PMCLOG_RESERVE(po, PMCALLOCATE,
865 		    sizeof(struct pmclog_pmcallocate));
866 		PMCLOG_EMIT32(pm->pm_id);
867 		PMCLOG_EMIT32(pm->pm_event);
868 		PMCLOG_EMIT32(pm->pm_flags);
869 		PMCLOG_DESPATCH(po);
870 	}
871 }
872 
873 void
874 pmclog_process_pmcattach(struct pmc *pm, pid_t pid, char *path)
875 {
876 	int pathlen, recordlen;
877 	struct pmc_owner *po;
878 
879 	PMCDBG2(LOG,ATT,1,"pm=%p pid=%d", pm, pid);
880 
881 	po = pm->pm_owner;
882 
883 	pathlen = strlen(path) + 1;	/* #bytes for the string */
884 	recordlen = offsetof(struct pmclog_pmcattach, pl_pathname) + pathlen;
885 
886 	PMCLOG_RESERVE(po, PMCATTACH, recordlen);
887 	PMCLOG_EMIT32(pm->pm_id);
888 	PMCLOG_EMIT32(pid);
889 	PMCLOG_EMITSTRING(path, pathlen);
890 	PMCLOG_DESPATCH(po);
891 }
892 
893 void
894 pmclog_process_pmcdetach(struct pmc *pm, pid_t pid)
895 {
896 	struct pmc_owner *po;
897 
898 	PMCDBG2(LOG,ATT,1,"!pm=%p pid=%d", pm, pid);
899 
900 	po = pm->pm_owner;
901 
902 	PMCLOG_RESERVE(po, PMCDETACH, sizeof(struct pmclog_pmcdetach));
903 	PMCLOG_EMIT32(pm->pm_id);
904 	PMCLOG_EMIT32(pid);
905 	PMCLOG_DESPATCH(po);
906 }
907 
908 /*
909  * Log a context switch event to the log file.
910  */
911 
912 void
913 pmclog_process_proccsw(struct pmc *pm, struct pmc_process *pp, pmc_value_t v)
914 {
915 	struct pmc_owner *po;
916 
917 	KASSERT(pm->pm_flags & PMC_F_LOG_PROCCSW,
918 	    ("[pmclog,%d] log-process-csw called gratuitously", __LINE__));
919 
920 	PMCDBG3(LOG,SWO,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid,
921 	    v);
922 
923 	po = pm->pm_owner;
924 
925 	PMCLOG_RESERVE(po, PROCCSW, sizeof(struct pmclog_proccsw));
926 	PMCLOG_EMIT32(pm->pm_id);
927 	PMCLOG_EMIT64(v);
928 	PMCLOG_EMIT32(pp->pp_proc->p_pid);
929 	PMCLOG_DESPATCH(po);
930 }
931 
932 void
933 pmclog_process_procexec(struct pmc_owner *po, pmc_id_t pmid, pid_t pid,
934     uintfptr_t startaddr, char *path)
935 {
936 	int pathlen, recordlen;
937 
938 	PMCDBG3(LOG,EXC,1,"po=%p pid=%d path=\"%s\"", po, pid, path);
939 
940 	pathlen   = strlen(path) + 1;	/* #bytes for the path */
941 	recordlen = offsetof(struct pmclog_procexec, pl_pathname) + pathlen;
942 
943 	PMCLOG_RESERVE(po, PROCEXEC, recordlen);
944 	PMCLOG_EMIT32(pid);
945 	PMCLOG_EMITADDR(startaddr);
946 	PMCLOG_EMIT32(pmid);
947 	PMCLOG_EMITSTRING(path,pathlen);
948 	PMCLOG_DESPATCH(po);
949 }
950 
951 /*
952  * Log a process exit event (and accumulated pmc value) to the log file.
953  */
954 
955 void
956 pmclog_process_procexit(struct pmc *pm, struct pmc_process *pp)
957 {
958 	int ri;
959 	struct pmc_owner *po;
960 
961 	ri = PMC_TO_ROWINDEX(pm);
962 	PMCDBG3(LOG,EXT,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid,
963 	    pp->pp_pmcs[ri].pp_pmcval);
964 
965 	po = pm->pm_owner;
966 
967 	PMCLOG_RESERVE(po, PROCEXIT, sizeof(struct pmclog_procexit));
968 	PMCLOG_EMIT32(pm->pm_id);
969 	PMCLOG_EMIT64(pp->pp_pmcs[ri].pp_pmcval);
970 	PMCLOG_EMIT32(pp->pp_proc->p_pid);
971 	PMCLOG_DESPATCH(po);
972 }
973 
974 /*
975  * Log a fork event.
976  */
977 
978 void
979 pmclog_process_procfork(struct pmc_owner *po, pid_t oldpid, pid_t newpid)
980 {
981 	PMCLOG_RESERVE(po, PROCFORK, sizeof(struct pmclog_procfork));
982 	PMCLOG_EMIT32(oldpid);
983 	PMCLOG_EMIT32(newpid);
984 	PMCLOG_DESPATCH(po);
985 }
986 
987 /*
988  * Log a process exit event of the form suitable for system-wide PMCs.
989  */
990 
991 void
992 pmclog_process_sysexit(struct pmc_owner *po, pid_t pid)
993 {
994 	PMCLOG_RESERVE(po, SYSEXIT, sizeof(struct pmclog_sysexit));
995 	PMCLOG_EMIT32(pid);
996 	PMCLOG_DESPATCH(po);
997 }
998 
999 /*
1000  * Write a user log entry.
1001  */
1002 
1003 int
1004 pmclog_process_userlog(struct pmc_owner *po, struct pmc_op_writelog *wl)
1005 {
1006 	int error;
1007 
1008 	PMCDBG2(LOG,WRI,1, "writelog po=%p ud=0x%x", po, wl->pm_userdata);
1009 
1010 	error = 0;
1011 
1012 	PMCLOG_RESERVE_WITH_ERROR(po, USERDATA,
1013 	    sizeof(struct pmclog_userdata));
1014 	PMCLOG_EMIT32(wl->pm_userdata);
1015 	PMCLOG_DESPATCH(po);
1016 
1017  error:
1018 	return (error);
1019 }
1020 
1021 /*
1022  * Initialization.
1023  *
1024  * Create a pool of log buffers and initialize mutexes.
1025  */
1026 
1027 void
1028 pmclog_initialize()
1029 {
1030 	int n;
1031 	struct pmclog_buffer *plb;
1032 
1033 	if (pmclog_buffer_size <= 0) {
1034 		(void) printf("hwpmc: tunable logbuffersize=%d must be "
1035 		    "greater than zero.\n", pmclog_buffer_size);
1036 		pmclog_buffer_size = PMC_LOG_BUFFER_SIZE;
1037 	}
1038 
1039 	if (pmc_nlogbuffers <= 0) {
1040 		(void) printf("hwpmc: tunable nlogbuffers=%d must be greater "
1041 		    "than zero.\n", pmc_nlogbuffers);
1042 		pmc_nlogbuffers = PMC_NLOGBUFFERS;
1043 	}
1044 
1045 	/* create global pool of log buffers */
1046 	for (n = 0; n < pmc_nlogbuffers; n++) {
1047 		plb = malloc(1024 * pmclog_buffer_size, M_PMC,
1048 		    M_WAITOK|M_ZERO);
1049 		PMCLOG_INIT_BUFFER_DESCRIPTOR(plb);
1050 		TAILQ_INSERT_HEAD(&pmc_bufferlist, plb, plb_next);
1051 	}
1052 	mtx_init(&pmc_bufferlist_mtx, "pmc-buffer-list", "pmc-leaf",
1053 	    MTX_SPIN);
1054 	mtx_init(&pmc_kthread_mtx, "pmc-kthread", "pmc-sleep", MTX_DEF);
1055 }
1056 
1057 /*
1058  * Shutdown logging.
1059  *
 * Destroy mutexes and release memory back to the free pool.
1061  */
1062 
1063 void
1064 pmclog_shutdown()
1065 {
1066 	struct pmclog_buffer *plb;
1067 
1068 	mtx_destroy(&pmc_kthread_mtx);
1069 	mtx_destroy(&pmc_bufferlist_mtx);
1070 
1071 	while ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL) {
1072 		TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next);
1073 		free(plb, M_PMC);
1074 	}
1075 }
1076