xref: /titanic_52/usr/src/uts/common/os/dumpsubr.c (revision 4cca9c843b53e5c17fa12ad23a003df0587904fd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/vm.h>
30 #include <sys/proc.h>
31 #include <sys/file.h>
32 #include <sys/conf.h>
33 #include <sys/kmem.h>
34 #include <sys/mem.h>
35 #include <sys/mman.h>
36 #include <sys/vnode.h>
37 #include <sys/errno.h>
38 #include <sys/memlist.h>
39 #include <sys/dumphdr.h>
40 #include <sys/dumpadm.h>
41 #include <sys/ksyms.h>
42 #include <sys/compress.h>
43 #include <sys/stream.h>
44 #include <sys/strsun.h>
45 #include <sys/cmn_err.h>
46 #include <sys/bitmap.h>
47 #include <sys/modctl.h>
48 #include <sys/utsname.h>
49 #include <sys/systeminfo.h>
50 #include <sys/vmem.h>
51 #include <sys/log.h>
52 #include <sys/var.h>
53 #include <sys/debug.h>
54 #include <sys/sunddi.h>
55 #include <fs/fs_subr.h>
56 #include <sys/fs/snode.h>
57 #include <sys/ontrap.h>
58 #include <sys/panic.h>
59 #include <sys/dkio.h>
60 #include <sys/vtoc.h>
61 #include <sys/errorq.h>
62 #include <sys/fm/util.h>
63 #include <sys/fs/zfs.h>
64 
65 #include <vm/hat.h>
66 #include <vm/as.h>
67 #include <vm/page.h>
68 #include <vm/pvn.h>
69 #include <vm/seg.h>
70 #include <vm/seg_kmem.h>
71 #include <sys/clock_impl.h>
72 #include <sys/hold_page.h>
73 
74 #include <bzip2/bzlib.h>
75 
76 /*
77  * Crash dump time is dominated by disk write time.  To reduce this,
78  * the stronger compression method bzip2 is applied to reduce the dump
79  * size and hence reduce I/O time.  However, bzip2 is much more
80  * computationally expensive than the existing lzjb algorithm, so to
81  * avoid increasing compression time, CPUs that are otherwise idle
82  * during panic are employed to parallelize the compression task.
83  * Many helper CPUs are needed to prevent bzip2 from being a
84  * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
85  * parallelized instead. Lastly, I/O and compression are performed by
86  * different CPUs, and are hence overlapped in time, unlike the older
87  * serial code.
88  *
 * Another important consideration is the speed of the dump
 * device. Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * elevated for faster disks. The dump device speed is inferred from
 * the setting for dumpbuf.iosize; see dump_update_clevel.
95  */
96 
97 /*
98  * exported vars
99  */
/* dump_lock is asserted held by dumpbuf_resize() and dump_update_clevel() */
kmutex_t	dump_lock;		/* lock for dump configuration */
dumphdr_t	*dumphdr;		/* dump header */
/* starts as DUMP_KERNEL; dumpsys_get_maxmem() tests it against DUMP_ALL */
int		dump_conflags = DUMP_KERNEL; /* dump configuration flags */
vnode_t		*dumpvp;		/* dump device vnode pointer */
u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
char		*dumppath;		/* pathname of dump device */
/* NOTE(review): units are not stated in this file; presumably seconds */
int		dump_timeout = 120;	/* timeout for dumping pages */
/* reloaded from dump_timeout inside the dumpsys_get_maxmem() scan loops */
int		dump_timeleft;		/* portion of dump_timeout remaining */
int		dump_ioerr;		/* dump i/o error */
int		dump_check_used;	/* enable check for used pages */
110 
111 /*
112  * Tunables for dump compression and parallelism. These can be set via
113  * /etc/system.
114  *
115  * dump_ncpu_low	number of helpers for parallel lzjb
116  *	This is also the minimum configuration.
117  *
118  * dump_bzip2_level	bzip2 compression level: 1-9
119  *	Higher numbers give greater compression, but take more memory
120  *	and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
121  *
122  * dump_plat_mincpu	the cross-over limit for using bzip2 (per platform):
123  *	if dump_plat_mincpu == 0, then always do single threaded dump
124  *	if ncpu >= dump_plat_mincpu then try to use bzip2
125  *
126  * dump_metrics_on	if set, metrics are collected in the kernel, passed
127  *	to savecore via the dump file, and recorded by savecore in
128  *	METRICS.txt.
129  */
/*
 * dump_ncpu_low doubles as MINHELPERS: dump_update_clevel() allocates
 * page and lzjb buffers up front only for this many helpers.
 */
uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
/* passed to BZ2_bzCompressInitSize() when sizing the bzip2 reservation */
uint_t dump_bzip2_level = 1;	/* bzip2 level (1-9) */

/*
 * Use dump_plat_mincpu_default unless this variable is set by /etc/system.
 * MINCPU_NOT_SET is the "never assigned" sentinel; dump_update_clevel()
 * overwrites it with dump_plat_mincpu_default when it sees it.
 */
#define	MINCPU_NOT_SET	((uint_t)-1)
uint_t dump_plat_mincpu = MINCPU_NOT_SET;

/*
 * Tunables for pre-reserved heap. dump_update_clevel() hands
 * (ncmap * dump_kmem_permap) + (dump_kmem_pages * PAGESIZE)
 * to kmem_dump_init().
 */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 8;
140 
/*
 * Define multiple buffers per helper to avoid stalling.
 * For a parallel dump, dump_update_clevel() sizes the output buffer array
 * as NCBUF_PER_HELPER * nhelper and the mapping buffer array as
 * NCMAP_PER_HELPER * nhelper.
 */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/*
 * Minimum number of helpers configured. Note that MINHELPERS expands to
 * the run-time tunable dump_ncpu_low, so neither macro is a compile-time
 * constant.
 */
#define	MINHELPERS	(dump_ncpu_low)
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)

/*
 * Define constant parameters.
 *
 * DUMP_1KB, DUMP_1MB	byte-count units (2^10 and 2^20) as size_t
 *
 * CBUF_SIZE		size of an output buffer (2^17 bytes = 128KB)
 *
 * CBUF_MAPSHIFT	log2 of CBUF_MAPSIZE
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *			(2^22 bytes = 4MB)
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))
165 
166 /*
167  * Compression metrics are accumulated nano-second subtotals. The
168  * results are normalized by the number of pages dumped. A report is
169  * generated when dumpsys() completes and is saved in the dump image
170  * after the trailing dump header.
171  *
172  * Metrics are always collected. Set the variable dump_metrics_on to
173  * cause metrics to be saved in the crash file, where savecore will
174  * save it in the file METRICS.txt.
175  */
/*
 * X-macro listing every per-page metric. A user defines PERPAGE(x),
 * expands PERPAGES, then undefines PERPAGE again; adding a name here adds
 * it everywhere the list is expanded.
 */
#define	PERPAGES \
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
	PERPAGE(copy) PERPAGE(compress) \
	PERPAGE(write) \
	PERPAGE(inwait) PERPAGE(outwait)

/*
 * One hrtime_t member per metric named in PERPAGES. The same layout is
 * used both for accumulated totals and for the matching start timestamps
 * (see the "perpage" / "perpagets" member pairs in dumpsync_t and helper_t).
 */
typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;
	PERPAGES
#undef PERPAGE
} perpage_t;
187 
188 /*
189  * This macro controls the code generation for collecting dump
190  * performance information. By default, the code is generated, but
191  * automatic saving of the information is disabled. If dump_metrics_on
192  * is set to 1, the timing information is passed to savecore via the
193  * crash file, where it is appended to the file dump-dir/METRICS.txt.
194  */
195 #define	COLLECT_METRICS
196 
197 #ifdef COLLECT_METRICS
/*
 * Collection itself is unconditional when COLLECT_METRICS is defined; this
 * flag only controls whether the results are saved in the crash file.
 */
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */
199 
/*
 * Timing helpers. "v" names an accumulator object and "m" one of its
 * members; the matching start timestamp lives in a sibling object whose
 * name is "v" with "ts" appended (e.g. perpage / perpagets). Because "v"
 * is token-pasted it cannot be parenthesized.
 *
 * HRSTART / HRSTOP	record a start time, then add the elapsed
 *			gethrtime() delta to v.m
 * HRBEGIN / HREND	same, and HRBEGIN also adds "s" to v.size
 * HRNORM		divide v.m by n
 *
 * HRBEGIN is two statements, so it is wrapped in do { } while (0) to
 * remain a single statement under an unbraced if/else.
 */
#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
#define	HRBEGIN(v, m, s)	\
	do { v##ts.m = gethrtime(); v.size += (s); } while (0)
#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
#define	HRNORM(v, m, n)		v.m /= (n)
205 
206 #else
/* COLLECT_METRICS not defined: every timing macro expands to nothing */
#define	HRSTART(v, m)
#define	HRSTOP(v, m)
#define	HRBEGIN(v, m, s)
#define	HREND(v, m)
211 #define	HRNORM(v, m, n)
212 #endif	/* COLLECT_METRICS */
213 
214 /*
215  * Buffers for copying and compressing memory pages.
216  *
217  * cbuf_t buffer controllers: used for both input and output.
218  *
219  * The buffer state indicates how it is being used:
220  *
221  * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
222  * mapping input pages.
223  *
224  * CBUF_INREADY: input pages are mapped and ready for compression by a
225  * helper.
226  *
227  * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
228  *
229  * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
230  *
231  * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
232  * ready to write out.
233  *
234  * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
235  * (reports UE errors.)
236  */
237 
/*
 * Processing state of a cbuf_t. The first three states apply to mapping
 * (input) buffers, the last three to output buffers.
 */
typedef enum cbufstate {
	CBUF_FREEMAP,	/* VA range free for mapping input pages */
	CBUF_INREADY,	/* input pages mapped, ready for a helper */
	CBUF_USEDMAP,	/* helper consumed the mapping; needs unmap */
	CBUF_FREEBUF,	/* output buffer is available */
	CBUF_WRITE,	/* compressed block ready to write out */
	CBUF_ERRMSG	/* block of error messages from a helper */
} cbufstate_t;
246 
typedef struct cbuf cbuf_t;

/*
 * Buffer controller shared by input mappings and output buffers.
 * dump_update_clevel() sets size to CBUF_MAPSIZE and buf to a vmem range
 * for mapping buffers, and size to CBUF_SIZE and buf to kmem (or NULL
 * beyond the minimum configuration) for output buffers.
 */
struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};
260 
261 /*
262  * cqueue_t queues: a uni-directional channel for communication
263  * from the master to helper tasks or vice-versa using put and
264  * get primitives. Both mappings and data buffers are passed via
265  * queues. Producers close a queue when done. The number of
266  * active producers is reference counted so the consumer can
267  * detect end of data. Concurrent access is mediated by atomic
268  * operations for panic dump, or mutex/cv for live dump.
269  *
 * There are four queues, used as follows:
271  *
272  * Queue		Dataflow		NewState
273  * --------------------------------------------------
274  * mainq		master -> master	FREEMAP
275  * master has initialized or unmapped an input buffer
276  * --------------------------------------------------
277  * helperq		master -> helper	INREADY
278  * master has mapped input for use by helper
279  * --------------------------------------------------
280  * mainq		master <- helper	USEDMAP
281  * helper is done with input
282  * --------------------------------------------------
283  * freebufq		master -> helper	FREEBUF
284  * master has initialized or written an output buffer
285  * --------------------------------------------------
286  * mainq		master <- helper	WRITE
287  * block of compressed pages from a helper
288  * --------------------------------------------------
289  * mainq		master <- helper	ERRMSG
290  * error messages from a helper (memory error case)
291  * --------------------------------------------------
292  * writerq		master <- master	WRITE
293  * non-blocking queue of blocks to write
294  * --------------------------------------------------
295  */
/*
 * A queue of cbuf_t linked through cbuf_t.next. It carries both
 * synchronization schemes: mutex/cv for a live dump and a spin lock for
 * a panic dump.
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	/* incremented atomically by CQ_OPEN */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;
306 
307 /*
308  * Convenience macros for using the cqueue functions
309  * Note that the caller must have defined "dumpsync_t *ds"
310  */
/*
 * In each macro "q" is the name of a cqueue_t member of *ds (mainq,
 * helperq, freebufq or writerq), not an expression.
 */

/* true when the queue currently has no buffers linked */
#define	CQ_IS_EMPTY(q)					\
	(ds->q.first == NULL)

/* register one more producer on the queue */
#define	CQ_OPEN(q)					\
	atomic_inc_uint(&ds->q.open)

/* producer is done; forwards to dumpsys_close_cq() with the live flag */
#define	CQ_CLOSE(q)					\
	dumpsys_close_cq(&ds->q, ds->live)

/* hand buffer "cp" to the queue with new state "st" (per the table above) */
#define	CQ_PUT(q, cp, st)				\
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

/* take a buffer from the queue via dumpsys_get_cq() */
#define	CQ_GET(q)					\
	dumpsys_get_cq(&ds->q, ds->live)
325 
326 /*
327  * Dynamic state when dumpsys() is running.
328  */
/*
 * Counters, queues and timing shared by the main task and its helpers.
 * The CQ_* macros expect a local "dumpsync_t *ds" pointing at one of these.
 */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	/* passed by the CQ_* macros to select the queue locking scheme */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	/* start timestamps for perpage; named so HRSTART's v##ts finds it */
	perpage_t perpagets;
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */
353 
354 /*
355  * helper_t helpers: contains the context for a stream. CPUs run in
356  * parallel at dump time; each CPU creates a single stream of
357  * compression data.  Stream data is divided into CBUF_SIZE blocks.
358  * The blocks are written in order within a stream. But, blocks from
359  * multiple streams can be interleaved. Each stream is identified by a
360  * unique tag.
361  */
/*
 * Per-stream helper context. dump_update_clevel() allocates the array
 * zeroed, numbers the tags from 1, and pre-allocates "page" and "lzbuf"
 * (PAGESIZE each) only for the first MINHELPERS entries.
 */
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
	bz_stream bzstream;		/* bzip2 state */
} helper_t;

/* NOTE(review): presumably special values of helper_t.helper -- confirm */
#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */
380 
381 /*
382  * configuration vars for dumpsys
383  */
/*
 * Dump configuration. dump_update_clevel() rebuilds the helper, cmap and
 * cbuf arrays and the reserved VA; dumpsys_get_maxmem() later records how
 * much spare memory it found in found4m / foundsm.
 */
typedef struct dumpcfg {
	int	threshold;	/* ncpu threshold for bzip2 */
	int	nhelper;	/* number of helpers */
	int	nhelper_used;	/* actual number of helpers used */
	/* ncmap counts entries in cmap[], each a CBUF_MAPSIZE VA range */
	int	ncmap;		/* number VA pages for compression */
	int	ncbuf;		/* number of bufs for compression */
	int	ncbuf_used;	/* number of bufs in use */
	/* 0 = serial lzjb, else DUMP_CLEVEL_LZJB or DUMP_CLEVEL_BZIP2 */
	uint_t	clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t	*cmap;		/* array of input (map) buffers */
	cbuf_t	*cbuf;		/* array of output  buffers */
	ulong_t	*helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t	*bitmap;	/* bitmap for marking pages to dump */
	ulong_t	*rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t	bitmapsize;	/* size of bitmap */
	pgcnt_t	rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t	found4m;	/* number ranges allocated by dump */
	pgcnt_t	foundsm;	/* number small pages allocated by dump */
	pid_t	*pids;		/* list of process IDs at dump time */
	/* maxsize is rounded up to PAGESIZE, maxvmsize to CBUF_MAPSIZE */
	size_t	maxsize;	/* memory size needed at dump time */
	size_t	maxvmsize;	/* size of reserved VM */
	char	*maxvm;		/* reserved VM for spare pages */
	lock_t	helper_lock;	/* protect helper state */
	char	helpers_wanted;	/* flag to enable parallelism */
	/* when clear, dumpsys_get_maxmem() falls back to a serial dump */
	char	helper_present;	/* at least one helper showed up */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */
412 
413 /*
414  * The dump I/O buffer.
415  *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 * sized according to the optimum device transfer speed.
418  */
/*
 * State of the single dump I/O buffer. dumpbuf_resize() maintains
 * start, end and size together (end == start + size).
 */
typedef struct dumpbuf {
	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	/* also used by dump_update_clevel() to scale the bzip2 threshold */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

dumpbuf_t dumpbuf;		/* I/O buffer */
431 
432 /*
433  * The dump I/O buffer must be at least one page, at most xfer_size
434  * bytes, and should scale with physmem in between.  The transfer size
435  * passed in will either represent a global default (maxphys) or the
436  * best size for the device.  The size of the dumpbuf I/O buffer is
437  * limited by dumpbuf_limit (8MB by default) because the dump
438  * performance saturates beyond a certain size.  The default is to
439  * select 1/4096 of the memory.
440  */
/* right-shift applied to physmem in dumpbuf_iosize(): 12 selects 1/4096 */
static int	dumpbuf_fraction = 12;	/* memory size scale factor */
/* hard ceiling applied last in dumpbuf_iosize() */
static size_t	dumpbuf_limit = 8 * DUMP_1MB;	/* max I/O buf size */
443 
444 static size_t
445 dumpbuf_iosize(size_t xfer_size)
446 {
447 	size_t iosize = ptob(physmem >> dumpbuf_fraction);
448 
449 	if (iosize < PAGESIZE)
450 		iosize = PAGESIZE;
451 	else if (iosize > xfer_size)
452 		iosize = xfer_size;
453 	if (iosize > dumpbuf_limit)
454 		iosize = dumpbuf_limit;
455 	return (iosize & PAGEMASK);
456 }
457 
458 /*
459  * resize the I/O buffer
460  */
461 static void
462 dumpbuf_resize(void)
463 {
464 	char *old_buf = dumpbuf.start;
465 	size_t old_size = dumpbuf.size;
466 	char *new_buf;
467 	size_t new_size;
468 
469 	ASSERT(MUTEX_HELD(&dump_lock));
470 
471 	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
472 	if (new_size <= old_size)
473 		return; /* no need to reallocate buffer */
474 
475 	new_buf = kmem_alloc(new_size, KM_SLEEP);
476 	dumpbuf.size = new_size;
477 	dumpbuf.start = new_buf;
478 	dumpbuf.end = new_buf + new_size;
479 	kmem_free(old_buf, old_size);
480 }
481 
482 /*
483  * dump_update_clevel is called when dumpadm configures the dump device.
484  * 	Calculate number of helpers and buffers.
485  * 	Allocate the minimum configuration for now.
486  *
487  * When the dump file is configured we reserve a minimum amount of
488  * memory for use at crash time. But we reserve VA for all the memory
489  * we really want in order to do the fastest dump possible. The VA is
490  * backed by pages not being dumped, according to the bitmap. If
491  * there is insufficient spare memory, however, we fall back to the
492  * minimum.
493  *
494  * Live dump (savecore -L) always uses the minimum config.
495  *
496  * clevel 0 is single threaded lzjb
497  * clevel 1 is parallel lzjb
498  * clevel 2 is parallel bzip2
499  *
500  * The ncpu threshold is selected with dump_plat_mincpu.
501  * On OPL, set_platform_defaults() overrides the sun4u setting.
502  * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
503  *
504  * Architecture		Threshold	Algorithm
505  * sun4u       		<  51		parallel lzjb
506  * sun4u       		>= 51		parallel bzip2(*)
507  * sun4u OPL   		<  8		parallel lzjb
508  * sun4u OPL   		>= 8		parallel bzip2(*)
509  * sun4v       		<  128		parallel lzjb
510  * sun4v       		>= 128		parallel bzip2(*)
511  * x86			< 11		parallel lzjb
512  * x86			>= 11		parallel bzip2(*)
513  * 32-bit      		N/A		single-threaded lzjb
514  *
515  * (*) bzip2 is only chosen if there is sufficient available
516  * memory for buffers at dump time. See dumpsys_get_maxmem().
517  *
518  * Faster dump devices have larger I/O buffers. The threshold value is
519  * increased according to the size of the dump I/O buffer, because
520  * parallel lzjb performs better with faster disks. For buffers >= 1MB
521  * the threshold is 3X; for buffers >= 256K threshold is 2X.
522  *
523  * For parallel dumps, the number of helpers is ncpu-1. The CPU
524  * running panic runs the main task. For single-threaded dumps, the
525  * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
526  *
527  * Need multiple buffers per helper so that they do not block waiting
528  * for the main task.
529  *				parallel	single-threaded
530  * Number of output buffers:	nhelper*2		1
531  * Number of mapping buffers:	nhelper*4		1
532  *
533  */
534 static void
535 dump_update_clevel()
536 {
537 	int tag;
538 	size_t bz2size;
539 	helper_t *hp, *hpend;
540 	cbuf_t *cp, *cpend;
541 	dumpcfg_t *old = &dumpcfg;
542 	dumpcfg_t newcfg = *old;
543 	dumpcfg_t *new = &newcfg;
544 
545 	ASSERT(MUTEX_HELD(&dump_lock));
546 
547 	/*
548 	 * Free the previously allocated bufs and VM.
549 	 */
550 	if (old->helper != NULL) {
551 
552 		/* helpers */
553 		hpend = &old->helper[old->nhelper];
554 		for (hp = old->helper; hp != hpend; hp++) {
555 			if (hp->lzbuf != NULL)
556 				kmem_free(hp->lzbuf, PAGESIZE);
557 			if (hp->page != NULL)
558 				kmem_free(hp->page, PAGESIZE);
559 		}
560 		kmem_free(old->helper, old->nhelper * sizeof (helper_t));
561 
562 		/* VM space for mapping pages */
563 		cpend = &old->cmap[old->ncmap];
564 		for (cp = old->cmap; cp != cpend; cp++)
565 			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
566 		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));
567 
568 		/* output bufs */
569 		cpend = &old->cbuf[old->ncbuf];
570 		for (cp = old->cbuf; cp != cpend; cp++)
571 			if (cp->buf != NULL)
572 				kmem_free(cp->buf, cp->size);
573 		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));
574 
575 		/* reserved VM for dumpsys_get_maxmem */
576 		if (old->maxvmsize > 0)
577 			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
578 	}
579 
580 	/*
581 	 * Allocate memory and VM.
582 	 * One CPU runs dumpsys, the rest are helpers.
583 	 */
584 	new->nhelper = ncpus - 1;
585 	if (new->nhelper < 1)
586 		new->nhelper = 1;
587 
588 	if (new->nhelper > DUMP_MAX_NHELPER)
589 		new->nhelper = DUMP_MAX_NHELPER;
590 
591 	/* use platform default, unless /etc/system overrides */
592 	if (dump_plat_mincpu == MINCPU_NOT_SET)
593 		dump_plat_mincpu = dump_plat_mincpu_default;
594 
595 	/* increase threshold for faster disks */
596 	new->threshold = dump_plat_mincpu;
597 	if (dumpbuf.iosize >= DUMP_1MB)
598 		new->threshold *= 3;
599 	else if (dumpbuf.iosize >= (256 * DUMP_1KB))
600 		new->threshold *= 2;
601 
602 	/* figure compression level based upon the computed threshold. */
603 	if (dump_plat_mincpu == 0 || new->nhelper < 2) {
604 		new->clevel = 0;
605 		new->nhelper = 1;
606 	} else if ((new->nhelper + 1) >= new->threshold) {
607 		new->clevel = DUMP_CLEVEL_BZIP2;
608 	} else {
609 		new->clevel = DUMP_CLEVEL_LZJB;
610 	}
611 
612 	if (new->clevel == 0) {
613 		new->ncbuf = 1;
614 		new->ncmap = 1;
615 	} else {
616 		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
617 		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
618 	}
619 
620 	/*
621 	 * Allocate new data structures and buffers for MINHELPERS,
622 	 * and also figure the max desired size.
623 	 */
624 	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
625 	new->maxsize = 0;
626 	new->maxvmsize = 0;
627 	new->maxvm = NULL;
628 	tag = 1;
629 	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
630 	hpend = &new->helper[new->nhelper];
631 	for (hp = new->helper; hp != hpend; hp++) {
632 		hp->tag = tag++;
633 		if (hp < &new->helper[MINHELPERS]) {
634 			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
635 			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
636 		} else if (new->clevel < DUMP_CLEVEL_BZIP2) {
637 			new->maxsize += 2 * PAGESIZE;
638 		} else {
639 			new->maxsize += PAGESIZE;
640 		}
641 		if (new->clevel >= DUMP_CLEVEL_BZIP2)
642 			new->maxsize += bz2size;
643 	}
644 
645 	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
646 	cpend = &new->cbuf[new->ncbuf];
647 	for (cp = new->cbuf; cp != cpend; cp++) {
648 		cp->state = CBUF_FREEBUF;
649 		cp->size = CBUF_SIZE;
650 		if (cp < &new->cbuf[MINCBUFS])
651 			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
652 		else
653 			new->maxsize += cp->size;
654 	}
655 
656 	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
657 	cpend = &new->cmap[new->ncmap];
658 	for (cp = new->cmap; cp != cpend; cp++) {
659 		cp->state = CBUF_FREEMAP;
660 		cp->size = CBUF_MAPSIZE;
661 		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
662 		    0, 0, NULL, NULL, VM_SLEEP);
663 	}
664 
665 	/* reserve VA to be backed with spare pages at crash time */
666 	if (new->maxsize > 0) {
667 		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
668 		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
669 		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
670 		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
671 	}
672 
673 	/*
674 	 * Reserve memory for kmem allocation calls made during crash
675 	 * dump.  The hat layer allocates memory for each mapping
676 	 * created, and the I/O path allocates buffers and data structs.
677 	 * Add a few pages for safety.
678 	 */
679 	kmem_dump_init((new->ncmap * dump_kmem_permap) +
680 	    (dump_kmem_pages * PAGESIZE));
681 
682 	/* set new config pointers */
683 	*old = *new;
684 }
685 
686 /*
687  * Define a struct memlist walker to optimize bitnum to pfn
688  * lookup. The walker maintains the state of the list traversal.
689  */
/*
 * Cursor over the phys_install memlist. All page quantities are the
 * memlist's byte values shifted right by PAGESHIFT.
 */
typedef struct dumpmlw {
	struct memlist	*mp;		/* current memlist */
	/* sum of mppages over the memlists already walked past */
	pgcnt_t		basenum;	/* bitnum base offset */
	pgcnt_t		mppages;	/* current memlist size */
	/* refreshed by each successful dump_bitnum_to_pfn() lookup */
	pgcnt_t		mpleft;		/* size to end of current memlist */
	pfn_t		mpaddr;		/* first pfn in memlist */
} dumpmlw_t;
697 
698 /* initialize the walker */
699 static inline void
700 dump_init_memlist_walker(dumpmlw_t *pw)
701 {
702 	pw->mp = phys_install;
703 	pw->basenum = 0;
704 	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
705 	pw->mpleft = pw->mppages;
706 	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
707 }
708 
709 /*
710  * Lookup pfn given bitnum. The memlist can be quite long on some
711  * systems (e.g.: one per board). To optimize sequential lookups, the
712  * caller initializes and presents a memlist walker.
713  */
714 static pfn_t
715 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
716 {
717 	bitnum -= pw->basenum;
718 	while (pw->mp != NULL) {
719 		if (bitnum < pw->mppages) {
720 			pw->mpleft = pw->mppages - bitnum;
721 			return (pw->mpaddr + bitnum);
722 		}
723 		bitnum -= pw->mppages;
724 		pw->basenum += pw->mppages;
725 		pw->mp = pw->mp->ml_next;
726 		if (pw->mp != NULL) {
727 			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
728 			pw->mpleft = pw->mppages;
729 			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
730 		}
731 	}
732 	return (PFN_INVALID);
733 }
734 
735 static pgcnt_t
736 dump_pfn_to_bitnum(pfn_t pfn)
737 {
738 	struct memlist *mp;
739 	pgcnt_t bitnum = 0;
740 
741 	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
742 		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
743 		    pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
744 			return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
745 		bitnum += mp->ml_size >> PAGESHIFT;
746 	}
747 	return ((pgcnt_t)-1);
748 }
749 
750 /*
751  * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
752  * mapping of pfn to range index is imperfect because pfn and bitnum
753  * do not have the same phase. To make sure a CBUF_MAPSIZE range is
754  * covered, call this for both ends:
755  *	dump_set_used(base)
756  *	dump_set_used(base+CBUF_MAPNP-1)
757  *
758  * This is used during a panic dump to mark pages allocated by
759  * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
760  * page_get_mnode_freelist() to make sure pages used by dump are never
761  * allocated.
762  */
/*
 * Page index -> index of the CBUF_MAPNP-page range containing it. Despite
 * the parameter name, the callers below pass a bitnum, not a raw pfn.
 */
#define	CBUF_MAPP2R(pfn)	((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))
764 
765 static void
766 dump_set_used(pfn_t pfn)
767 {
768 
769 	pgcnt_t bitnum, rbitnum;
770 
771 	bitnum = dump_pfn_to_bitnum(pfn);
772 	ASSERT(bitnum != (pgcnt_t)-1);
773 
774 	rbitnum = CBUF_MAPP2R(bitnum);
775 	ASSERT(rbitnum < dumpcfg.rbitmapsize);
776 
777 	BT_SET(dumpcfg.rbitmap, rbitnum);
778 }
779 
780 int
781 dump_test_used(pfn_t pfn)
782 {
783 	pgcnt_t bitnum, rbitnum;
784 
785 	bitnum = dump_pfn_to_bitnum(pfn);
786 	ASSERT(bitnum != (pgcnt_t)-1);
787 
788 	rbitnum = CBUF_MAPP2R(bitnum);
789 	ASSERT(rbitnum < dumpcfg.rbitmapsize);
790 
791 	return (BT_TEST(dumpcfg.rbitmap, rbitnum));
792 }
793 
794 /*
795  * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
796  * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
797  */
798 static void *
799 dumpbzalloc(void *opaque, int items, int size)
800 {
801 	size_t *sz;
802 	char *ret;
803 
804 	ASSERT(opaque != NULL);
805 	sz = opaque;
806 	ret = dumpcfg.maxvm + *sz;
807 	*sz += items * size;
808 	*sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN);
809 	ASSERT(*sz <= dumpcfg.maxvmsize);
810 	return (ret);
811 }
812 
/*
 * bzip2 "free" callback. Deliberately empty: dumpbzalloc() only advances
 * an offset into dumpcfg.maxvm, so there is nothing to release per
 * allocation.
 */
/*ARGSUSED*/
static void
dumpbzfree(void *opaque, void *addr)
{
}
818 
819 /*
820  * Perform additional checks on the page to see if we can really use
821  * it. The kernel (kas) pages are always set in the bitmap. However,
822  * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
823  * bitmap. So we check for them.
824  */
825 static inline int
826 dump_pfn_check(pfn_t pfn)
827 {
828 	page_t *pp = page_numtopp_nolock(pfn);
829 	if (pp == NULL || pp->p_pagenum != pfn ||
830 #if defined(__sparc)
831 	    pp->p_vnode == &promvp ||
832 #else
833 	    PP_ISBOOTPAGES(pp) ||
834 #endif
835 	    pp->p_toxic != 0)
836 		return (0);
837 	return (1);
838 }
839 
840 /*
841  * Check a range to see if all contained pages are available and
842  * return non-zero if the range can be used.
843  */
844 static inline int
845 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
846 {
847 	for (; start < end; start++, pfn++) {
848 		if (BT_TEST(dumpcfg.bitmap, start))
849 			return (0);
850 		if (!dump_pfn_check(pfn))
851 			return (0);
852 	}
853 	return (1);
854 }
855 
856 /*
857  * dumpsys_get_maxmem() is called during panic. Find unused ranges
858  * and use them for buffers. If we find enough memory switch to
859  * parallel bzip2, otherwise use parallel lzjb.
860  *
861  * It searches the dump bitmap in 2 passes. The first time it looks
862  * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
863  */
static void
dumpsys_get_maxmem()
{
	dumpcfg_t *cfg = &dumpcfg;
	cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
	helper_t *endhp = &cfg->helper[cfg->nhelper];
	pgcnt_t bitnum, end;
	size_t sz, endsz, bz2size;
	pfn_t pfn, off;
	cbuf_t *cp;
	helper_t *hp, *ohp;
	dumpmlw_t mlw;
	int k;

	/*
	 * Fall back to doing a serial dump if no helpers showed
	 * up. It is possible for other CPUs to be stuck in PROM, or
	 * DRd out. panic("sync initiated") in sync_handler() is one
	 * case. A parallel dump will hang (dump time out) unless
	 * there is at least one helper CPU. At this point dumpsys()
	 * has done some I/O, which means there has been plenty of
	 * time for helpers to arrive.
	 */
	if (!cfg->helper_present) {
		cfg->clevel = 0;
		return;
	}

	/*
	 * There may be no point in looking for spare memory. If
	 * dumping all memory, then none is spare. If doing a serial
	 * dump, then already have buffers.
	 *
	 * In that case the compression level is capped at
	 * DUMP_CLEVEL_LZJB before returning.
	 */
	if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
	    (dump_conflags & DUMP_ALL) != 0) {
		if (cfg->clevel > DUMP_CLEVEL_LZJB)
			cfg->clevel = DUMP_CLEVEL_LZJB;
		return;
	}

	/*
	 * sz counts the bytes mapped so far at cfg->maxvm; found4m and
	 * foundsm count the large ranges and single pages taken.
	 */
	sz = 0;
	cfg->found4m = 0;
	cfg->foundsm = 0;

	/* bitmap of ranges used to estimate which pfns are being used */
	bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));

	/*
	 * Pass 1: find ranges that are not being dumped to use for
	 * buffers. Each iteration looks at up to CBUF_MAPNP bits; 'end'
	 * is where the next iteration resumes.
	 */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* skip partial range at end of mem segment */
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
			continue;
		}

		/*
		 * skip non aligned pages; shortening 'end' by the phase
		 * makes the next iteration start at bitnum + CBUF_MAPNP - off
		 */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (off != 0) {
			end -= off;
			continue;
		}

		/* skip ranges that dump_range_check() rejects */
		if (!dump_range_check(bitnum, end, pfn))
			continue;

		/* map the whole range at the next free offset in maxvm */
		ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
		hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
		    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
		sz += CBUF_MAPSIZE;
		cfg->found4m++;

		/* set the bitmap for both ends to be sure to cover the range */
		dump_set_used(pfn);
		dump_set_used(pfn + CBUF_MAPNP - 1);

		if (sz >= cfg->maxsize)
			goto foundmax;
	}

	/*
	 * Pass 2: add small pages if we can't find enough large pages.
	 * The bitmap is walked again from the start, one page at a time
	 * within each range.
	 */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/*
		 * Find any non-aligned pages at start and end of segment.
		 * A full, aligned range that pass 1 already marked as used
		 * is skipped entirely.
		 */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
		} else if (off != 0) {
			end -= off;
		} else if (cfg->found4m && dump_test_used(pfn)) {
			continue;
		}

		for (; bitnum < end; bitnum++, pfn++) {
			dump_timeleft = dump_timeout;
			/* page is part of the dump; cannot be borrowed */
			if (BT_TEST(dumpcfg.bitmap, bitnum))
				continue;
			if (!dump_pfn_check(pfn))
				continue;
			ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
			hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
			    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
			sz += PAGESIZE;
			cfg->foundsm++;
			dump_set_used(pfn);
			if (sz >= cfg->maxsize)
				goto foundmax;
		}
	}

	/*
	 * Fall back to lzjb if we did not get enough memory for bzip2.
	 * This point is only reached when both passes ended with
	 * sz < cfg->maxsize.
	 */
	endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper;
	if (sz < endsz) {
		cfg->clevel = DUMP_CLEVEL_LZJB;
	}

	/* Allocate memory for as many helpers as we can. */
foundmax:

	/*
	 * Byte offsets into memory found and mapped above. From here on
	 * endsz is the total mapped and sz is the amount handed out.
	 */
	endsz = sz;
	sz = 0;

	/* Set the size for bzip2 state. Only bzip2 needs it. */
	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);

	/* Skip the preallocated output buffers. */
	cp = &cfg->cbuf[MINCBUFS];

	/* Use this to move memory up from the preallocated helpers. */
	ohp = cfg->helper;

	/* Loop over all helpers and allocate memory. */
	for (hp = cfg->helper; hp < endhp; hp++) {

		/* Skip preallocated helpers by checking hp->page. */
		if (hp->page == NULL) {
			if (cfg->clevel <= DUMP_CLEVEL_LZJB) {
				/* lzjb needs 2 1-page buffers */
				if ((sz + (2 * PAGESIZE)) > endsz)
					break;
				hp->page = cfg->maxvm + sz;
				sz += PAGESIZE;
				hp->lzbuf = cfg->maxvm + sz;
				sz += PAGESIZE;

			} else if (ohp->lzbuf != NULL) {
				/* re-use the preallocated lzjb page for bzip2 */
				hp->page = ohp->lzbuf;
				ohp->lzbuf = NULL;
				++ohp;

			} else {
				/* bzip2 needs a 1-page buffer */
				if ((sz + PAGESIZE) > endsz)
					break;
				hp->page = cfg->maxvm + sz;
				sz += PAGESIZE;
			}
		}

		/*
		 * Add output buffers per helper. The number of
		 * buffers per helper is determined by the ratio of
		 * ncbuf to nhelper.
		 */
		for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
		    k < NCBUF_PER_HELPER; k++) {
			cp->state = CBUF_FREEBUF;
			cp->size = CBUF_SIZE;
			cp->buf = cfg->maxvm + sz;
			sz += CBUF_SIZE;
			++cp;
		}

		/*
		 * bzip2 needs compression state. Use the dumpbzalloc
		 * and dumpbzfree callbacks to allocate the memory.
		 * bzip2 does allocation only at init time.
		 *
		 * If the state does not fit, this helper's page is
		 * cleared and no further helpers are set up.
		 * NOTE(review): opaque points at sz during init, so
		 * dumpbzalloc presumably advances sz as it hands out
		 * memory -- confirm against dumpbzalloc().
		 */
		if (cfg->clevel >= DUMP_CLEVEL_BZIP2) {
			if ((sz + bz2size) > endsz) {
				hp->page = NULL;
				break;
			} else {
				hp->bzstream.opaque = &sz;
				hp->bzstream.bzalloc = dumpbzalloc;
				hp->bzstream.bzfree = dumpbzfree;
				(void) BZ2_bzCompressInit(&hp->bzstream,
				    dump_bzip2_level, 0, 0);
				hp->bzstream.opaque = NULL;
			}
		}
	}

	/* Finish allocating output buffers */
	for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		cp->buf = cfg->maxvm + sz;
		sz += CBUF_SIZE;
	}

	/* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
	if (cfg->found4m || cfg->foundsm)
		dump_check_used = 1;

	ASSERT(sz <= endsz);
}
1083 
1084 static void
1085 dumphdr_init(void)
1086 {
1087 	pgcnt_t npages = 0;
1088 
1089 	ASSERT(MUTEX_HELD(&dump_lock));
1090 
1091 	if (dumphdr == NULL) {
1092 		dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
1093 		dumphdr->dump_magic = DUMP_MAGIC;
1094 		dumphdr->dump_version = DUMP_VERSION;
1095 		dumphdr->dump_wordsize = DUMP_WORDSIZE;
1096 		dumphdr->dump_pageshift = PAGESHIFT;
1097 		dumphdr->dump_pagesize = PAGESIZE;
1098 		dumphdr->dump_utsname = utsname;
1099 		(void) strcpy(dumphdr->dump_platform, platform);
1100 		dumpbuf.size = dumpbuf_iosize(maxphys);
1101 		dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
1102 		dumpbuf.end = dumpbuf.start + dumpbuf.size;
1103 		dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
1104 		dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
1105 		LOCK_INIT_HELD(&dumpcfg.helper_lock);
1106 	}
1107 
1108 	npages = num_phys_pages();
1109 
1110 	if (dumpcfg.bitmapsize != npages) {
1111 		size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
1112 		void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
1113 		void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);
1114 
1115 		if (dumpcfg.bitmap != NULL)
1116 			kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.
1117 			    bitmapsize));
1118 		if (dumpcfg.rbitmap != NULL)
1119 			kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.
1120 			    rbitmapsize));
1121 		dumpcfg.bitmap = map;
1122 		dumpcfg.bitmapsize = npages;
1123 		dumpcfg.rbitmap = rmap;
1124 		dumpcfg.rbitmapsize = rlen;
1125 	}
1126 }
1127 
1128 /*
1129  * Establish a new dump device.
1130  */
1131 int
1132 dumpinit(vnode_t *vp, char *name, int justchecking)
1133 {
1134 	vnode_t *cvp;
1135 	vattr_t vattr;
1136 	vnode_t *cdev_vp;
1137 	int error = 0;
1138 
1139 	ASSERT(MUTEX_HELD(&dump_lock));
1140 
1141 	dumphdr_init();
1142 
1143 	cvp = common_specvp(vp);
1144 	if (cvp == dumpvp)
1145 		return (0);
1146 
1147 	/*
1148 	 * Determine whether this is a plausible dump device.  We want either:
1149 	 * (1) a real device that's not mounted and has a cb_dump routine, or
1150 	 * (2) a swapfile on some filesystem that has a vop_dump routine.
1151 	 */
1152 	if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
1153 		return (error);
1154 
1155 	vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
1156 	if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
1157 		if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
1158 			if (devopsp[getmajor(vattr.va_rdev)]->
1159 			    devo_cb_ops->cb_dump == nodev)
1160 				error = ENOTSUP;
1161 			else if (vfs_devismounted(vattr.va_rdev))
1162 				error = EBUSY;
1163 			if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
1164 			    ZFS_DRIVER) == 0 &&
1165 			    IS_SWAPVP(common_specvp(cvp)))
1166 					error = EBUSY;
1167 		} else {
1168 			if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
1169 			    !IS_SWAPVP(cvp))
1170 				error = ENOTSUP;
1171 		}
1172 	}
1173 
1174 	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
1175 		error = ENOSPC;
1176 
1177 	if (error || justchecking) {
1178 		(void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
1179 		    kcred, NULL);
1180 		return (error);
1181 	}
1182 
1183 	VN_HOLD(cvp);
1184 
1185 	if (dumpvp != NULL)
1186 		dumpfini();	/* unconfigure the old dump device */
1187 
1188 	dumpvp = cvp;
1189 	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
1190 	dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
1191 	(void) strcpy(dumppath, name);
1192 	dumpbuf.iosize = 0;
1193 
1194 	/*
1195 	 * If the dump device is a block device, attempt to open up the
1196 	 * corresponding character device and determine its maximum transfer
1197 	 * size.  We use this information to potentially resize dumpbuf to a
1198 	 * larger and more optimal size for performing i/o to the dump device.
1199 	 */
1200 	if (cvp->v_type == VBLK &&
1201 	    (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
1202 		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1203 			size_t blk_size;
1204 			struct dk_cinfo dki;
1205 			struct dk_minfo minf;
1206 
1207 			if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
1208 			    (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
1209 			    == 0 && minf.dki_lbsize != 0)
1210 				blk_size = minf.dki_lbsize;
1211 			else
1212 				blk_size = DEV_BSIZE;
1213 
1214 			if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
1215 			    FKIOCTL, kcred, NULL, NULL) == 0) {
1216 				dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
1217 				dumpbuf_resize();
1218 			}
1219 			/*
1220 			 * If we are working with a zvol then dumpify it
1221 			 * if it's not being used as swap.
1222 			 */
1223 			if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
1224 				if (IS_SWAPVP(common_specvp(cvp)))
1225 					error = EBUSY;
1226 				else if ((error = VOP_IOCTL(cdev_vp,
1227 				    DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
1228 				    NULL, NULL)) != 0)
1229 					dumpfini();
1230 			}
1231 
1232 			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1233 			    kcred, NULL);
1234 		}
1235 
1236 		VN_RELE(cdev_vp);
1237 	}
1238 
1239 	cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);
1240 
1241 	dump_update_clevel();
1242 
1243 	return (error);
1244 }
1245 
1246 void
1247 dumpfini(void)
1248 {
1249 	vattr_t vattr;
1250 	boolean_t is_zfs = B_FALSE;
1251 	vnode_t *cdev_vp;
1252 	ASSERT(MUTEX_HELD(&dump_lock));
1253 
1254 	kmem_free(dumppath, strlen(dumppath) + 1);
1255 
1256 	/*
1257 	 * Determine if we are using zvols for our dump device
1258 	 */
1259 	vattr.va_mask = AT_RDEV;
1260 	if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
1261 		is_zfs = (getmajor(vattr.va_rdev) ==
1262 		    ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
1263 	}
1264 
1265 	/*
1266 	 * If we have a zvol dump device then we call into zfs so
1267 	 * that it may have a chance to cleanup.
1268 	 */
1269 	if (is_zfs &&
1270 	    (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
1271 		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
1272 			(void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
1273 			    kcred, NULL, NULL);
1274 			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
1275 			    kcred, NULL);
1276 		}
1277 		VN_RELE(cdev_vp);
1278 	}
1279 
1280 	(void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);
1281 
1282 	VN_RELE(dumpvp);
1283 
1284 	dumpvp = NULL;
1285 	dumpvp_size = 0;
1286 	dumppath = NULL;
1287 }
1288 
1289 static offset_t
1290 dumpvp_flush(void)
1291 {
1292 	size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
1293 	hrtime_t iotime;
1294 	int err;
1295 
1296 	if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
1297 		dump_ioerr = ENOSPC;
1298 		dumpbuf.vp_off = dumpbuf.vp_limit;
1299 	} else if (size != 0) {
1300 		iotime = gethrtime();
1301 		dumpsync.iowait += iotime - dumpsync.iowaitts;
1302 		if (panicstr)
1303 			err = VOP_DUMP(dumpvp, dumpbuf.start,
1304 			    lbtodb(dumpbuf.vp_off), btod(size), NULL);
1305 		else
1306 			err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
1307 			    dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
1308 			    dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
1309 			    kcred, 0);
1310 		if (err && dump_ioerr == 0)
1311 			dump_ioerr = err;
1312 		dumpsync.iowaitts = gethrtime();
1313 		dumpsync.iotime += dumpsync.iowaitts - iotime;
1314 		dumpsync.nwrite += size;
1315 		dumpbuf.vp_off += size;
1316 	}
1317 	dumpbuf.cur = dumpbuf.start;
1318 	dump_timeleft = dump_timeout;
1319 	return (dumpbuf.vp_off);
1320 }
1321 
1322 /* maximize write speed by keeping seek offset aligned with size */
1323 void
1324 dumpvp_write(const void *va, size_t size)
1325 {
1326 	size_t len, off, sz;
1327 
1328 	while (size != 0) {
1329 		len = MIN(size, dumpbuf.end - dumpbuf.cur);
1330 		if (len == 0) {
1331 			off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
1332 			if (off == 0 || !ISP2(dumpbuf.size)) {
1333 				(void) dumpvp_flush();
1334 			} else {
1335 				sz = dumpbuf.size - off;
1336 				dumpbuf.cur = dumpbuf.start + sz;
1337 				(void) dumpvp_flush();
1338 				ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
1339 				dumpbuf.cur += off;
1340 			}
1341 		} else {
1342 			bcopy(va, dumpbuf.cur, len);
1343 			va = (char *)va + len;
1344 			dumpbuf.cur += len;
1345 			size -= len;
1346 		}
1347 	}
1348 }
1349 
/*
 * Three-argument write callback that forwards 'size' bytes at 'src' to
 * dumpvp_write(). The 'dst' argument is ignored.
 * NOTE(review): presumably handed to the ksyms snapshot code as its copy
 * routine -- confirm at the call site.
 */
/*ARGSUSED*/
static void
dumpvp_ksyms_write(const void *src, void *dst, size_t size)
{
	dumpvp_write(src, size);
}
1356 
1357 /*
1358  * Mark 'pfn' in the bitmap and dump its translation table entry.
1359  */
1360 void
1361 dump_addpage(struct as *as, void *va, pfn_t pfn)
1362 {
1363 	mem_vtop_t mem_vtop;
1364 	pgcnt_t bitnum;
1365 
1366 	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1367 		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1368 			dumphdr->dump_npages++;
1369 			BT_SET(dumpcfg.bitmap, bitnum);
1370 		}
1371 		dumphdr->dump_nvtop++;
1372 		mem_vtop.m_as = as;
1373 		mem_vtop.m_va = va;
1374 		mem_vtop.m_pfn = pfn;
1375 		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
1376 	}
1377 	dump_timeleft = dump_timeout;
1378 }
1379 
1380 /*
1381  * Mark 'pfn' in the bitmap
1382  */
1383 void
1384 dump_page(pfn_t pfn)
1385 {
1386 	pgcnt_t bitnum;
1387 
1388 	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
1389 		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
1390 			dumphdr->dump_npages++;
1391 			BT_SET(dumpcfg.bitmap, bitnum);
1392 		}
1393 	}
1394 	dump_timeleft = dump_timeout;
1395 }
1396 
1397 /*
1398  * Dump the <as, va, pfn> information for a given address space.
1399  * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
1400  */
1401 static void
1402 dump_as(struct as *as)
1403 {
1404 	struct seg *seg;
1405 
1406 	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
1407 	for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
1408 		if (seg->s_as != as)
1409 			break;
1410 		if (seg->s_ops == NULL)
1411 			continue;
1412 		SEGOP_DUMP(seg);
1413 	}
1414 	AS_LOCK_EXIT(as, &as->a_lock);
1415 
1416 	if (seg != NULL)
1417 		cmn_err(CE_WARN, "invalid segment %p in address space %p",
1418 		    (void *)seg, (void *)as);
1419 }
1420 
1421 static int
1422 dump_process(pid_t pid)
1423 {
1424 	proc_t *p = sprlock(pid);
1425 
1426 	if (p == NULL)
1427 		return (-1);
1428 	if (p->p_as != &kas) {
1429 		mutex_exit(&p->p_lock);
1430 		dump_as(p->p_as);
1431 		mutex_enter(&p->p_lock);
1432 	}
1433 
1434 	sprunlock(p);
1435 
1436 	return (0);
1437 }
1438 
1439 void
1440 dump_ereports(void)
1441 {
1442 	u_offset_t dumpvp_start;
1443 	erpt_dump_t ed;
1444 
1445 	if (dumpvp == NULL || dumphdr == NULL)
1446 		return;
1447 
1448 	dumpbuf.cur = dumpbuf.start;
1449 	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
1450 	dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
1451 	dumpbuf.vp_off = dumpvp_start;
1452 
1453 	fm_ereport_dump();
1454 	if (panicstr)
1455 		errorq_dump();
1456 
1457 	bzero(&ed, sizeof (ed)); /* indicate end of ereports */
1458 	dumpvp_write(&ed, sizeof (ed));
1459 	(void) dumpvp_flush();
1460 
1461 	if (!panicstr) {
1462 		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1463 		    (size_t)(dumpbuf.vp_off - dumpvp_start),
1464 		    B_INVAL | B_FORCE, kcred, NULL);
1465 	}
1466 }
1467 
1468 void
1469 dump_messages(void)
1470 {
1471 	log_dump_t ld;
1472 	mblk_t *mctl, *mdata;
1473 	queue_t *q, *qlast;
1474 	u_offset_t dumpvp_start;
1475 
1476 	if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
1477 		return;
1478 
1479 	dumpbuf.cur = dumpbuf.start;
1480 	dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
1481 	dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
1482 	dumpbuf.vp_off = dumpvp_start;
1483 
1484 	qlast = NULL;
1485 	do {
1486 		for (q = log_consq; q->q_next != qlast; q = q->q_next)
1487 			continue;
1488 		for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
1489 			dump_timeleft = dump_timeout;
1490 			mdata = mctl->b_cont;
1491 			ld.ld_magic = LOG_MAGIC;
1492 			ld.ld_msgsize = MBLKL(mctl->b_cont);
1493 			ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
1494 			ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
1495 			dumpvp_write(&ld, sizeof (ld));
1496 			dumpvp_write(mctl->b_rptr, MBLKL(mctl));
1497 			dumpvp_write(mdata->b_rptr, MBLKL(mdata));
1498 		}
1499 	} while ((qlast = q) != log_consq);
1500 
1501 	ld.ld_magic = 0;		/* indicate end of messages */
1502 	dumpvp_write(&ld, sizeof (ld));
1503 	(void) dumpvp_flush();
1504 	if (!panicstr) {
1505 		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
1506 		    (size_t)(dumpbuf.vp_off - dumpvp_start),
1507 		    B_INVAL | B_FORCE, kcred, NULL);
1508 	}
1509 }
1510 
1511 /*
1512  * The following functions are called on multiple CPUs during dump.
1513  * They must not use most kernel services, because all cross-calls are
1514  * disabled during panic. Therefore, blocking locks and cache flushes
1515  * will not work.
1516  */
1517 
/*
 * Copy pages, trapping ECC errors. Also, for robustness, trap data
 * access in case something goes wrong in the hat layer and the
 * mapping is broken.
 *
 * Copies one page (PAGESIZE bytes) from 'src' to 'dst' a long at a time.
 * A word that cannot be read is replaced in 'dst' by a marker pattern
 * and the copy carries on with the next word.
 *
 * Returns -1 if the whole page was copied without a trap, otherwise the
 * byte offset within the page of the first word that trapped.
 */
static int
dump_pagecopy(void *src, void *dst)
{
	long *wsrc = (long *)src;
	long *wdst = (long *)dst;
	const ulong_t ncopies = PAGESIZE / sizeof (long);
	/*
	 * NOTE(review): w and ueoff are volatile, presumably so that their
	 * values are still valid when on_trap() returns a second time
	 * after a trap -- confirm against on_trap() semantics.
	 */
	volatile int w = 0;
	volatile int ueoff = -1;
	on_trap_data_t otd;

	/*
	 * This branch runs only when on_trap() returns non-zero, which is
	 * presumably after a protected access below has trapped. At that
	 * point w is the index of the word that could not be copied.
	 */
	if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
		/* remember only the first failing offset */
		if (ueoff == -1)
			ueoff = w * sizeof (long);
		/* report "bad ECC" or "bad address" */
#ifdef _LP64
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc00badecc;
		else
			wdst[w++] = 0x00badadd00badadd;
#else
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc;
		else
			wdst[w++] = 0x00badadd;
#endif
	}
	/* copy, or resume copying after the word that was just marked */
	while (w < ncopies) {
		wdst[w] = wsrc[w];
		w++;
	}
	no_trap();
	return (ueoff);
}
1556 
1557 static void
1558 dumpsys_close_cq(cqueue_t *cq, int live)
1559 {
1560 	if (live) {
1561 		mutex_enter(&cq->mutex);
1562 		atomic_dec_uint(&cq->open);
1563 		cv_signal(&cq->cv);
1564 		mutex_exit(&cq->mutex);
1565 	} else {
1566 		atomic_dec_uint(&cq->open);
1567 	}
1568 }
1569 
1570 static inline void
1571 dumpsys_spinlock(lock_t *lp)
1572 {
1573 	uint_t backoff = 0;
1574 	int loop_count = 0;
1575 
1576 	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
1577 		if (++loop_count >= ncpus) {
1578 			backoff = mutex_lock_backoff(0);
1579 			loop_count = 0;
1580 		} else {
1581 			backoff = mutex_lock_backoff(backoff);
1582 		}
1583 		mutex_lock_delay(backoff);
1584 	}
1585 }
1586 
/*
 * Release a lock taken with dumpsys_spinlock() by clearing it.
 */
static inline void
dumpsys_spinunlock(lock_t *lp)
{
	lock_clear(lp);
}
1592 
1593 static inline void
1594 dumpsys_lock(cqueue_t *cq, int live)
1595 {
1596 	if (live)
1597 		mutex_enter(&cq->mutex);
1598 	else
1599 		dumpsys_spinlock(&cq->spinlock);
1600 }
1601 
1602 static inline void
1603 dumpsys_unlock(cqueue_t *cq, int live, int signal)
1604 {
1605 	if (live) {
1606 		if (signal)
1607 			cv_signal(&cq->cv);
1608 		mutex_exit(&cq->mutex);
1609 	} else {
1610 		dumpsys_spinunlock(&cq->spinlock);
1611 	}
1612 }
1613 
1614 static void
1615 dumpsys_wait_cq(cqueue_t *cq, int live)
1616 {
1617 	if (live) {
1618 		cv_wait(&cq->cv, &cq->mutex);
1619 	} else {
1620 		dumpsys_spinunlock(&cq->spinlock);
1621 		while (cq->open)
1622 			if (cq->first)
1623 				break;
1624 		dumpsys_spinlock(&cq->spinlock);
1625 	}
1626 }
1627 
1628 static void
1629 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
1630 {
1631 	if (cp == NULL)
1632 		return;
1633 
1634 	dumpsys_lock(cq, live);
1635 
1636 	if (cq->ts != 0) {
1637 		cq->empty += gethrtime() - cq->ts;
1638 		cq->ts = 0;
1639 	}
1640 
1641 	cp->state = newstate;
1642 	cp->next = NULL;
1643 	if (cq->last == NULL)
1644 		cq->first = cp;
1645 	else
1646 		cq->last->next = cp;
1647 	cq->last = cp;
1648 
1649 	dumpsys_unlock(cq, live, 1);
1650 }
1651 
1652 static cbuf_t *
1653 dumpsys_get_cq(cqueue_t *cq, int live)
1654 {
1655 	cbuf_t *cp;
1656 	hrtime_t now = gethrtime();
1657 
1658 	dumpsys_lock(cq, live);
1659 
1660 	/* CONSTCOND */
1661 	while (1) {
1662 		cp = (cbuf_t *)cq->first;
1663 		if (cp == NULL) {
1664 			if (cq->open == 0)
1665 				break;
1666 			dumpsys_wait_cq(cq, live);
1667 			continue;
1668 		}
1669 		cq->first = cp->next;
1670 		if (cq->first == NULL) {
1671 			cq->last = NULL;
1672 			cq->ts = now;
1673 		}
1674 		break;
1675 	}
1676 
1677 	dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
1678 	return (cp);
1679 }
1680 
1681 /*
1682  * Send an error message to the console. If the main task is running
1683  * just write the message via uprintf. If a helper is running the
1684  * message has to be put on a queue for the main task. Setting fmt to
1685  * NULL means flush the error message buffer. If fmt is not NULL, just
1686  * add the text to the existing buffer.
1687  */
1688 static void
1689 dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
1690 {
1691 	dumpsync_t *ds = hp->ds;
1692 	cbuf_t *cp = hp->cperr;
1693 	va_list adx;
1694 
1695 	if (hp->helper == MAINHELPER) {
1696 		if (fmt != NULL) {
1697 			if (ds->neednl) {
1698 				uprintf("\n");
1699 				ds->neednl = 0;
1700 			}
1701 			va_start(adx, fmt);
1702 			vuprintf(fmt, adx);
1703 			va_end(adx);
1704 		}
1705 	} else if (fmt == NULL) {
1706 		if (cp != NULL) {
1707 			CQ_PUT(mainq, cp, CBUF_ERRMSG);
1708 			hp->cperr = NULL;
1709 		}
1710 	} else {
1711 		if (hp->cperr == NULL) {
1712 			cp = CQ_GET(freebufq);
1713 			hp->cperr = cp;
1714 			cp->used = 0;
1715 		}
1716 		va_start(adx, fmt);
1717 		cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
1718 		    fmt, adx);
1719 		va_end(adx);
1720 		if ((cp->used + LOG_MSGSIZE) > cp->size) {
1721 			CQ_PUT(mainq, cp, CBUF_ERRMSG);
1722 			hp->cperr = NULL;
1723 		}
1724 	}
1725 }
1726 
1727 /*
1728  * Write an output buffer to the dump file. If the main task is
1729  * running just write the data. If a helper is running the output is
1730  * placed on a queue for the main task.
1731  */
1732 static void
1733 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
1734 {
1735 	dumpsync_t *ds = hp->ds;
1736 
1737 	if (hp->helper == MAINHELPER) {
1738 		HRSTART(ds->perpage, write);
1739 		dumpvp_write(cp->buf, used);
1740 		HRSTOP(ds->perpage, write);
1741 		CQ_PUT(freebufq, cp, CBUF_FREEBUF);
1742 	} else {
1743 		cp->used = used;
1744 		CQ_PUT(mainq, cp, CBUF_WRITE);
1745 	}
1746 }
1747 
1748 /*
1749  * Copy one page within the mapped range. The offset starts at 0 and
1750  * is relative to the first pfn. cp->buf + cp->off is the address of
1751  * the first pfn. If dump_pagecopy returns a UE offset, create an
1752  * error message.  Returns the offset to the next pfn in the range
1753  * selected by the bitmap.
1754  */
1755 static int
1756 dumpsys_copy_page(helper_t *hp, int offset)
1757 {
1758 	cbuf_t *cp = hp->cpin;
1759 	int ueoff;
1760 
1761 	ASSERT(cp->off + offset + PAGESIZE <= cp->size);
1762 	ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));
1763 
1764 	ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);
1765 
1766 	/* ueoff is the offset in the page to a UE error */
1767 	if (ueoff != -1) {
1768 		uint64_t pa = ptob(cp->pfn) + offset + ueoff;
1769 
1770 		dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
1771 		    CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
1772 	}
1773 
1774 	/*
1775 	 * Advance bitnum and offset to the next input page for the
1776 	 * next call to this function.
1777 	 */
1778 	offset += PAGESIZE;
1779 	cp->bitnum++;
1780 	while (cp->off + offset < cp->size) {
1781 		if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
1782 			break;
1783 		offset += PAGESIZE;
1784 		cp->bitnum++;
1785 	}
1786 
1787 	return (offset);
1788 }
1789 
/*
 * Read the helper queue, and copy one mapped page. Return 0 when
 * done. Return 1 when a page has been copied into hp->page.
 *
 * State kept in 'hp' between calls:
 *	hp->cpin	current input buffer, NULL when a new one is needed
 *	hp->in		offset of the next page to copy within that buffer
 *	hp->used	bytes of the buffer copied so far
 */
static int
dumpsys_sread(helper_t *hp)
{
	/* NOTE(review): ds looks unused but is presumably named by CQ_* */
	dumpsync_t *ds = hp->ds;

	/* CONSTCOND */
	while (1) {

		/* Find the next input buffer. */
		if (hp->cpin == NULL) {
			HRSTART(hp->perpage, inwait);

			/* CONSTCOND */
			while (1) {
				hp->cpin = CQ_GET(helperq);
				dump_timeleft = dump_timeout;

				/*
				 * NULL return means the helper queue
				 * is closed and empty.
				 */
				if (hp->cpin == NULL)
					break;

				/* Have input, check for dump I/O error. */
				if (!dump_ioerr)
					break;

				/*
				 * If an I/O error occurs, stay in the
				 * loop in order to empty the helper
				 * queue. Return the buffers to the
				 * main task to unmap and free it.
				 */
				hp->cpin->used = 0;
				CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			}
			HRSTOP(hp->perpage, inwait);

			/* Stop here when the helper queue is closed. */
			if (hp->cpin == NULL)
				break;

			/* Set the offset=0 to get the first pfn. */
			hp->in = 0;

			/* Set the total processed to 0 */
			hp->used = 0;
		}

		/* Process the next page. */
		if (hp->used < hp->cpin->used) {

			/*
			 * Get the next page from the input buffer and
			 * return a copy.
			 */
			ASSERT(hp->in != -1);
			HRSTART(hp->perpage, copy);
			hp->in = dumpsys_copy_page(hp, hp->in);
			hp->used += PAGESIZE;
			HRSTOP(hp->perpage, copy);
			break;

		} else {

			/*
			 * Done with the input. Flush the VM and
			 * return the buffer to the main task.
			 * The flush is only done by helpers during
			 * a panic dump. Any pending error message
			 * is queued before the buffer is returned.
			 */
			if (panicstr && hp->helper != MAINHELPER)
				hat_flush_range(kas.a_hat,
				    hp->cpin->buf, hp->cpin->size);
			dumpsys_errmsg(hp, NULL);
			CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			hp->cpin = NULL;
		}
	}

	/* non-NULL cpin means a page was copied into hp->page */
	return (hp->cpin != NULL);
}
1875 
/*
 * Compress size bytes starting at buf with bzip2
 * mode:
 *	BZ_RUN		add one more compressed page
 *	BZ_FINISH	no more input, flush the state
 *
 * Output is collected in hp->cpout. The first CSIZE bytes of each
 * output buffer are reserved for a tag+size word that is filled in just
 * before the buffer is handed to dumpsys_swrite(). A size of 0 leaves
 * the stream's input pointers untouched.
 */
static void
dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode)
{
	/* NOTE(review): ds looks unused but is presumably named by CQ_* */
	dumpsync_t *ds = hp->ds;
	const int CSIZE = sizeof (dumpcsize_t);
	bz_stream *ps = &hp->bzstream;
	int rc = 0;
	uint32_t csize;
	dumpcsize_t cs;

	/* Set input pointers to new input page */
	if (size > 0) {
		ps->avail_in = size;
		ps->next_in = buf;
	}

	/* CONSTCOND */
	while (1) {

		/* Quit when all input has been consumed */
		if (ps->avail_in == 0 && mode == BZ_RUN)
			break;

		/* Get a new output buffer */
		if (hp->cpout == NULL) {
			HRSTART(hp->perpage, outwait);
			hp->cpout = CQ_GET(freebufq);
			HRSTOP(hp->perpage, outwait);
			/* leave room at the front for the tag+size word */
			ps->avail_out = hp->cpout->size - CSIZE;
			ps->next_out = hp->cpout->buf + CSIZE;
		}

		/* Compress input, or finalize */
		HRSTART(hp->perpage, compress);
		rc = BZ2_bzCompress(ps, mode);
		HRSTOP(hp->perpage, compress);

		/* Check for error */
		if (mode == BZ_RUN && rc != BZ_RUN_OK) {
			dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n",
			    hp->helper, BZ2_bzErrorString(rc),
			    hp->cpin->pagenum);
			break;
		}

		/* Write the buffer if it is full, or we are flushing */
		if (ps->avail_out == 0 || mode == BZ_FINISH) {
			/* csize is the compressed data after the size word */
			csize = hp->cpout->size - CSIZE - ps->avail_out;
			cs = DUMP_SET_TAG(csize, hp->tag);
			if (csize > 0) {
				(void) memcpy(hp->cpout->buf, &cs, CSIZE);
				dumpsys_swrite(hp, hp->cpout, csize + CSIZE);
				hp->cpout = NULL;
			}
		}

		/* Check for final complete */
		if (mode == BZ_FINISH) {
			if (rc == BZ_STREAM_END)
				break;
			if (rc != BZ_FINISH_OK) {
				dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n",
				    hp->helper, BZ2_bzErrorString(rc));
				break;
			}
		}
	}

	/* Cleanup state and buffers */
	if (mode == BZ_FINISH) {

		/* Reset state so that it is re-usable. */
		(void) BZ2_bzCompressReset(&hp->bzstream);

		/*
		 * Give any unused output buffer to the main task. It is
		 * queued as CBUF_ERRMSG with used == 0.
		 */
		if (hp->cpout != NULL) {
			hp->cpout->used = 0;
			CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG);
			hp->cpout = NULL;
		}
	}
}
1964 
1965 static void
1966 dumpsys_bz2compress(helper_t *hp)
1967 {
1968 	dumpsync_t *ds = hp->ds;
1969 	dumpstreamhdr_t sh;
1970 
1971 	(void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
1972 	sh.stream_pagenum = (pgcnt_t)-1;
1973 	sh.stream_npages = 0;
1974 	hp->cpin = NULL;
1975 	hp->cpout = NULL;
1976 	hp->cperr = NULL;
1977 	hp->in = 0;
1978 	hp->out = 0;
1979 	hp->bzstream.avail_in = 0;
1980 
1981 	/* Bump reference to mainq while we are running */
1982 	CQ_OPEN(mainq);
1983 
1984 	/* Get one page at a time */
1985 	while (dumpsys_sread(hp)) {
1986 		if (sh.stream_pagenum != hp->cpin->pagenum) {
1987 			sh.stream_pagenum = hp->cpin->pagenum;
1988 			sh.stream_npages = btop(hp->cpin->used);
1989 			dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN);
1990 		}
1991 		dumpsys_bzrun(hp, hp->page, PAGESIZE, 0);
1992 	}
1993 
1994 	/* Done with input, flush any partial buffer */
1995 	if (sh.stream_pagenum != (pgcnt_t)-1) {
1996 		dumpsys_bzrun(hp, NULL, 0, BZ_FINISH);
1997 		dumpsys_errmsg(hp, NULL);
1998 	}
1999 
2000 	ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2001 
2002 	/* Decrement main queue count, we are done */
2003 	CQ_CLOSE(mainq);
2004 }
2005 
2006 /*
2007  * Compress with lzjb
2008  * write stream block if full or size==0
2009  * if csize==0 write stream header, else write <csize, data>
2010  * size==0 is a call to flush a buffer
2011  * hp->cpout is the buffer we are flushing or filling
2012  * hp->out is the next index to fill data
2013  * osize is either csize+data, or the size of a stream header
2014  */
2015 static void
2016 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
2017 {
2018 	dumpsync_t *ds = hp->ds;
2019 	const int CSIZE = sizeof (dumpcsize_t);
2020 	dumpcsize_t cs;
2021 	size_t osize = csize > 0 ? CSIZE + size : size;
2022 
2023 	/* If flush, and there is no buffer, just return */
2024 	if (size == 0 && hp->cpout == NULL)
2025 		return;
2026 
2027 	/* If flush, or cpout is full, write it out */
2028 	if (size == 0 ||
2029 	    hp->cpout != NULL && hp->out + osize > hp->cpout->size) {
2030 
2031 		/* Set tag+size word at the front of the stream block. */
2032 		cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
2033 		(void) memcpy(hp->cpout->buf, &cs, CSIZE);
2034 
2035 		/* Write block to dump file. */
2036 		dumpsys_swrite(hp, hp->cpout, hp->out);
2037 
2038 		/* Clear pointer to indicate we need a new buffer */
2039 		hp->cpout = NULL;
2040 
2041 		/* flushing, we are done */
2042 		if (size == 0)
2043 			return;
2044 	}
2045 
2046 	/* Get an output buffer if we dont have one. */
2047 	if (hp->cpout == NULL) {
2048 		HRSTART(hp->perpage, outwait);
2049 		hp->cpout = CQ_GET(freebufq);
2050 		HRSTOP(hp->perpage, outwait);
2051 		hp->out = CSIZE;
2052 	}
2053 
2054 	/* Store csize word. This is the size of compressed data. */
2055 	if (csize > 0) {
2056 		cs = DUMP_SET_TAG(csize, 0);
2057 		(void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
2058 		hp->out += CSIZE;
2059 	}
2060 
2061 	/* Store the data. */
2062 	(void) memcpy(hp->cpout->buf + hp->out, buf, size);
2063 	hp->out += size;
2064 }
2065 
2066 static void
2067 dumpsys_lzjbcompress(helper_t *hp)
2068 {
2069 	dumpsync_t *ds = hp->ds;
2070 	size_t csize;
2071 	dumpstreamhdr_t sh;
2072 
2073 	(void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
2074 	sh.stream_pagenum = (pfn_t)-1;
2075 	sh.stream_npages = 0;
2076 	hp->cpin = NULL;
2077 	hp->cpout = NULL;
2078 	hp->cperr = NULL;
2079 	hp->in = 0;
2080 	hp->out = 0;
2081 
2082 	/* Bump reference to mainq while we are running */
2083 	CQ_OPEN(mainq);
2084 
2085 	/* Get one page at a time */
2086 	while (dumpsys_sread(hp)) {
2087 
2088 		/* Create a stream header for each new input map */
2089 		if (sh.stream_pagenum != hp->cpin->pagenum) {
2090 			sh.stream_pagenum = hp->cpin->pagenum;
2091 			sh.stream_npages = btop(hp->cpin->used);
2092 			dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
2093 		}
2094 
2095 		/* Compress one page */
2096 		HRSTART(hp->perpage, compress);
2097 		csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2098 		HRSTOP(hp->perpage, compress);
2099 
2100 		/* Add csize+data to output block */
2101 		ASSERT(csize > 0 && csize <= PAGESIZE);
2102 		dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
2103 	}
2104 
2105 	/* Done with input, flush any partial buffer */
2106 	if (sh.stream_pagenum != (pfn_t)-1) {
2107 		dumpsys_lzjbrun(hp, 0, NULL, 0);
2108 		dumpsys_errmsg(hp, NULL);
2109 	}
2110 
2111 	ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);
2112 
2113 	/* Decrement main queue count, we are done */
2114 	CQ_CLOSE(mainq);
2115 }
2116 
2117 /*
2118  * Dump helper called from panic_idle() to compress pages.  CPUs in
2119  * this path must not call most kernel services.
2120  *
2121  * During panic, all but one of the CPUs is idle. These CPUs are used
2122  * as helpers working in parallel to copy and compress memory
2123  * pages. During a panic, however, these processors cannot call any
2124  * kernel services. This is because mutexes become no-ops during
2125  * panic, and, cross-call interrupts are inhibited.  Therefore, during
2126  * panic dump the helper CPUs communicate with the panic CPU using
2127  * memory variables. All memory mapping and I/O is performed by the
2128  * panic CPU.
2129  *
2130  * At dump configuration time, helper_lock is set and helpers_wanted
2131  * is 0. dumpsys() decides whether to set helpers_wanted before
2132  * clearing helper_lock.
2133  *
2134  * At panic time, idle CPUs spin-wait on helper_lock, then alternately
2135  * take the lock and become a helper, or return.
2136  */
2137 void
2138 dumpsys_helper()
2139 {
2140 	if (!dumpcfg.helper_present)
2141 		dumpcfg.helper_present = 1;
2142 	dumpsys_spinlock(&dumpcfg.helper_lock);
2143 	if (dumpcfg.helpers_wanted) {
2144 		helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2145 
2146 		for (hp = dumpcfg.helper; hp != hpend; hp++) {
2147 			if (hp->helper == FREEHELPER) {
2148 				hp->helper = CPU->cpu_id;
2149 				BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2150 
2151 				dumpsys_spinunlock(&dumpcfg.helper_lock);
2152 
2153 				if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2154 					dumpsys_lzjbcompress(hp);
2155 				else
2156 					dumpsys_bz2compress(hp);
2157 
2158 				hp->helper = DONEHELPER;
2159 				return;
2160 			}
2161 		}
2162 
2163 		/* No more helpers are needed. */
2164 		dumpcfg.helpers_wanted = 0;
2165 
2166 	}
2167 	dumpsys_spinunlock(&dumpcfg.helper_lock);
2168 }
2169 
2170 /*
2171  * No-wait helper callable in spin loops.
2172  *
2173  * Do not wait for helper_lock. Just check helpers_wanted. The caller
2174  * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s"
2175  * case.
2176  */
2177 void
2178 dumpsys_helper_nw()
2179 {
2180 	if (!dumpcfg.helper_present)
2181 		dumpcfg.helper_present = 1;
2182 	if (dumpcfg.helpers_wanted)
2183 		dumpsys_helper();
2184 }
2185 
2186 /*
2187  * Dump helper for live dumps.
2188  * These run as a system task.
2189  */
2190 static void
2191 dumpsys_live_helper(void *arg)
2192 {
2193 	helper_t *hp = arg;
2194 
2195 	BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid);
2196 	if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
2197 		dumpsys_lzjbcompress(hp);
2198 	else
2199 		dumpsys_bz2compress(hp);
2200 }
2201 
2202 /*
2203  * Compress one page with lzjb (single threaded case)
2204  */
2205 static void
2206 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp)
2207 {
2208 	dumpsync_t *ds = hp->ds;
2209 	uint32_t csize;
2210 
2211 	hp->helper = MAINHELPER;
2212 	hp->in = 0;
2213 	hp->used = 0;
2214 	hp->cpin = cp;
2215 	while (hp->used < cp->used) {
2216 		HRSTART(hp->perpage, copy);
2217 		hp->in = dumpsys_copy_page(hp, hp->in);
2218 		hp->used += PAGESIZE;
2219 		HRSTOP(hp->perpage, copy);
2220 
2221 		HRSTART(hp->perpage, compress);
2222 		csize = compress(hp->page, hp->lzbuf, PAGESIZE);
2223 		HRSTOP(hp->perpage, compress);
2224 
2225 		HRSTART(hp->perpage, write);
2226 		dumpvp_write(&csize, sizeof (csize));
2227 		dumpvp_write(hp->lzbuf, csize);
2228 		HRSTOP(hp->perpage, write);
2229 	}
2230 	CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
2231 	hp->cpin = NULL;
2232 }
2233 
2234 /*
2235  * Main task to dump pages. This is called on the dump CPU.
 *
 * arg is the dumpsync_t for this dump.  The task loops taking buffers
 * from mainq and acts on each one according to its state:
 *
 *   CBUF_FREEMAP  find the next page set in dumpcfg.bitmap, map a range
 *                 of physical memory starting there into the buffer and
 *                 pass it to a helper (or, when dumpcfg.clevel is 0,
 *                 compress it here with dumpsys_lzjb_page())
 *   CBUF_USEDMAP  account for the pages, unmap the buffer and recycle it
 *   CBUF_WRITE    move the buffer to writerq
 *   CBUF_ERRMSG   print the message and return the buffer to freebufq
 *
 * Whenever mainq is empty, buffers waiting on writerq are written to
 * the dump device.  The loop ends when CQ_GET(mainq) returns NULL and
 * writerq is empty.
2236  */
2237 static void
2238 dumpsys_main_task(void *arg)
2239 {
2240 	dumpsync_t *ds = arg;
2241 	pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
2242 	dumpmlw_t mlw;
2243 	cbuf_t *cp;
2244 	pgcnt_t baseoff, pfnoff;
2245 	pfn_t base, pfn;
2246 	int sec;
2247
2248 	dump_init_memlist_walker(&mlw);
2249
2250 	/* CONSTCOND */
2251 	while (1) {
2252
		/* Report progress each time the percentage advances. */
2253 		if (ds->percent > ds->percent_done) {
2254 			ds->percent_done = ds->percent;
2255 			sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000;
2256 			uprintf("^\r%2d:%02d %3d%% done",
2257 			    sec / 60, sec % 60, ds->percent);
2258 			ds->neednl = 1;
2259 		}
2260
		/* Nothing to process: use the time to write out data. */
2261 		while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) {
2262
2263 			/* the writerq never blocks */
2264 			cp = CQ_GET(writerq);
2265 			if (cp == NULL)
2266 				break;
2267
2268 			dump_timeleft = dump_timeout;
2269
2270 			HRSTART(ds->perpage, write);
2271 			dumpvp_write(cp->buf, cp->used);
2272 			HRSTOP(ds->perpage, write);
2273
2274 			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2275 		}
2276
2277 		/*
2278 		 * Wait here for some buffers to process. Returns NULL
2279 		 * when all helpers have terminated and all buffers
2280 		 * have been processed.
2281 		 */
2282 		cp = CQ_GET(mainq);
2283
2284 		if (cp == NULL) {
2285
2286 			/* Drain the write queue. */
2287 			if (!CQ_IS_EMPTY(writerq))
2288 				continue;
2289
2290 			/* Main task exits here. */
2291 			break;
2292 		}
2293
2294 		dump_timeleft = dump_timeout;
2295
2296 		switch (cp->state) {
2297
2298 		case CBUF_FREEMAP:
2299
2300 			/*
2301 			 * Note that we drop CBUF_FREEMAP buffers on
2302 			 * the floor (they will not be on any cqueue)
2303 			 * when we no longer need them.
2304 			 */
2305 			if (bitnum >= dumpcfg.bitmapsize)
2306 				break;
2307
			/* After an I/O error stop handing out new work. */
2308 			if (dump_ioerr) {
2309 				bitnum = dumpcfg.bitmapsize;
2310 				CQ_CLOSE(helperq);
2311 				break;
2312 			}
2313
			/* Advance to the next page selected for the dump. */
2314 			HRSTART(ds->perpage, bitmap);
2315 			for (; bitnum < dumpcfg.bitmapsize; bitnum++)
2316 				if (BT_TEST(dumpcfg.bitmap, bitnum))
2317 					break;
2318 			HRSTOP(ds->perpage, bitmap);
2319 			dump_timeleft = dump_timeout;
2320
2321 			if (bitnum >= dumpcfg.bitmapsize) {
2322 				CQ_CLOSE(helperq);
2323 				break;
2324 			}
2325
2326 			/*
2327 			 * Try to map CBUF_MAPSIZE ranges. Can't
2328 			 * assume that memory segment size is a
2329 			 * multiple of CBUF_MAPSIZE. Can't assume that
2330 			 * the segment starts on a CBUF_MAPSIZE
2331 			 * boundary.
2332 			 */
2333 			pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2334 			ASSERT(pfn != PFN_INVALID);
2335 			ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize);
2336
			/*
			 * base is pfn aligned down to CBUF_MAPNP, but not
			 * below mlw.mpaddr; baseoff is how far base then
			 * sits past the aligned boundary.
			 */
2337 			base = P2ALIGN(pfn, CBUF_MAPNP);
2338 			if (base < mlw.mpaddr) {
2339 				base = mlw.mpaddr;
2340 				baseoff = P2PHASE(base, CBUF_MAPNP);
2341 			} else {
2342 				baseoff = 0;
2343 			}
2344
			/*
			 * The mapping ends at whichever comes first: the
			 * mlw.mpleft pages following pfn, or the end of the
			 * CBUF_MAPNP-sized range.
			 */
2345 			pfnoff = pfn - base;
2346 			if (pfnoff + mlw.mpleft < CBUF_MAPNP) {
2347 				hibitnum = bitnum + mlw.mpleft;
2348 				cp->size = ptob(pfnoff + mlw.mpleft);
2349 			} else {
2350 				hibitnum = bitnum - pfnoff + CBUF_MAPNP -
2351 				    baseoff;
2352 				cp->size = CBUF_MAPSIZE - ptob(baseoff);
2353 			}
2354
2355 			cp->pfn = pfn;
2356 			cp->bitnum = bitnum++;
2357 			cp->pagenum = pagenum++;
2358 			cp->off = ptob(pfnoff);
2359
			/* Count the remaining selected pages in this range. */
2360 			for (; bitnum < hibitnum; bitnum++)
2361 				if (BT_TEST(dumpcfg.bitmap, bitnum))
2362 					pagenum++;
2363
2364 			dump_timeleft = dump_timeout;
2365 			cp->used = ptob(pagenum - cp->pagenum);
2366
2367 			HRSTART(ds->perpage, map);
2368 			hat_devload(kas.a_hat, cp->buf, cp->size, base,
2369 			    PROT_READ, HAT_LOAD_NOCONSIST);
2370 			HRSTOP(ds->perpage, map);
2371
2372 			ds->pages_mapped += btop(cp->size);
2373 			ds->pages_used += pagenum - cp->pagenum;
2374
			/*
			 * NOTE(review): presumably balanced by the
			 * CQ_CLOSE(mainq) in the CBUF_USEDMAP case below,
			 * keeping mainq open while this mapping is in
			 * flight -- confirm against the CQ_* definitions.
			 */
2375 			CQ_OPEN(mainq);
2376
2377 			/*
2378 			 * If there are no helpers the main task does
2379 			 * non-streams lzjb compress.
2380 			 */
2381 			if (dumpcfg.clevel == 0) {
2382 				dumpsys_lzjb_page(dumpcfg.helper, cp);
2383 				break;
2384 			}
2385
2386 			/* pass mapped pages to a helper */
2387 			CQ_PUT(helperq, cp, CBUF_INREADY);
2388
2389 			/* the last page was done */
2390 			if (bitnum >= dumpcfg.bitmapsize)
2391 				CQ_CLOSE(helperq);
2392
2393 			break;
2394
2395 		case CBUF_USEDMAP:
2396
2397 			ds->npages += btop(cp->used);
2398
2399 			HRSTART(ds->perpage, unmap);
2400 			hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
2401 			HRSTOP(ds->perpage, unmap);
2402
			/* Recycle the map buffer only if pages remain. */
2403 			if (bitnum < dumpcfg.bitmapsize)
2404 				CQ_PUT(mainq, cp, CBUF_FREEMAP);
2405 			CQ_CLOSE(mainq);
2406
2407 			ASSERT(ds->npages <= dumphdr->dump_npages);
2408 			ds->percent = ds->npages * 100LL / dumphdr->dump_npages;
2409 			break;
2410
2411 		case CBUF_WRITE:
2412
2413 			CQ_PUT(writerq, cp, CBUF_WRITE);
2414 			break;
2415
2416 		case CBUF_ERRMSG:
2417
2418 			if (cp->used > 0) {
				/* Force a newline and NUL at the buffer end. */
2419 				cp->buf[cp->size - 2] = '\n';
2420 				cp->buf[cp->size - 1] = '\0';
2421 				if (ds->neednl) {
2422 					uprintf("\n%s", cp->buf);
2423 					ds->neednl = 0;
2424 				} else {
2425 					uprintf("%s", cp->buf);
2426 				}
2427 				/* wait for console output */
2428 				drv_usecwait(200000);
2429 				dump_timeleft = dump_timeout;
2430 			}
2431 			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2432 			break;
2433
2434 		default:
2435 			uprintf("dump: unexpected buffer state %d, "
2436 			    "buffer will be lost\n", cp->state);
2437 			break;
2438
2439 		} /* end switch */
2440
2441 	} /* end while(1) */
2442 }
2443 
2444 #ifdef	COLLECT_METRICS
2445 size_t
2446 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size)
2447 {
2448 	dumpcfg_t *cfg = &dumpcfg;
2449 	int myid = CPU->cpu_seqid;
2450 	int i, compress_ratio;
2451 	int sec, iorate;
2452 	helper_t *hp, *hpend = &cfg->helper[cfg->nhelper];
2453 	char *e = buf + size;
2454 	char *p = buf;
2455 
2456 	sec = ds->elapsed / (1000 * 1000 * 1000ULL);
2457 	if (sec < 1)
2458 		sec = 1;
2459 
2460 	if (ds->iotime < 1)
2461 		ds->iotime = 1;
2462 	iorate = (ds->nwrite * 100000ULL) / ds->iotime;
2463 
2464 	compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1);
2465 
2466 #define	P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)
2467 
2468 	P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
2469 	P("Master cpu_id,%d\n", CPU->cpu_id);
2470 	P("dump_flags,0x%x\n", dumphdr->dump_flags);
2471 	P("dump_ioerr,%d\n", dump_ioerr);
2472 
2473 	P("Helpers:\n");
2474 	for (i = 0; i < ncpus; i++) {
2475 		if ((i & 15) == 0)
2476 			P(",,%03d,", i);
2477 		if (i == myid)
2478 			P("   M");
2479 		else if (BT_TEST(cfg->helpermap, i))
2480 			P("%4d", cpu_seq[i]->cpu_id);
2481 		else
2482 			P("   *");
2483 		if ((i & 15) == 15)
2484 			P("\n");
2485 	}
2486 
2487 	P("ncbuf_used,%d\n", cfg->ncbuf_used);
2488 	P("ncmap,%d\n", cfg->ncmap);
2489 
2490 	P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
2491 	P("Found small pages,%ld\n", cfg->foundsm);
2492 
2493 	P("Compression level,%d\n", cfg->clevel);
2494 	P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
2495 	    cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
2496 	P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
2497 	    100);
2498 	P("nhelper_used,%d\n", cfg->nhelper_used);
2499 
2500 	P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
2501 	P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
2502 	P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
2503 	P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
2504 	P("dumpbuf.size,%ld\n", dumpbuf.size);
2505 
2506 	P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
2507 	P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
2508 	P("Dump time,%d\n", sec);
2509 
2510 	if (ds->pages_mapped > 0)
2511 		P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
2512 		    / ds->pages_mapped));
2513 
2514 	P("\nPer-page metrics:\n");
2515 	if (ds->npages > 0) {
2516 		for (hp = cfg->helper; hp != hpend; hp++) {
2517 #define	PERPAGE(x)	ds->perpage.x += hp->perpage.x;
2518 			PERPAGES;
2519 #undef PERPAGE
2520 		}
2521 #define	PERPAGE(x) \
2522 		P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages));
2523 		PERPAGES;
2524 #undef PERPAGE
2525 		P("freebufq.empty,%d\n", (int)(ds->freebufq.empty /
2526 		    ds->npages));
2527 		P("helperq.empty,%d\n", (int)(ds->helperq.empty /
2528 		    ds->npages));
2529 		P("writerq.empty,%d\n", (int)(ds->writerq.empty /
2530 		    ds->npages));
2531 		P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages));
2532 
2533 		P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait /
2534 		    ds->npages));
2535 	}
2536 #undef P
2537 	if (p < e)
2538 		bzero(p, e - p);
2539 	return (p - buf);
2540 }
2541 #endif	/* COLLECT_METRICS */
2542 
2543 /*
2544  * Dump the system.
 *
 * Used for both panic dumps (panicstr set) and live dumps.  In outline:
 *
 *  - If no dump device or dump header is configured, print a message,
 *    release any waiting panic helpers, and return.
 *  - Reset the sync state and dump header, and pick the starting offset.
 *  - Write the symbol table and the translation map, then select the
 *    pages to dump according to dump_conflags and write the pfn table.
 *  - Write the page data by running dumpsys_main_task(), assisted by
 *    helpers (a taskq for a live dump, idle CPUs otherwise).
 *  - Write the end-of-stream tag, then the initial and terminal dump
 *    headers along with the data header and any metrics.
 *  - For a panic dump, finish by writing ereports and messages.
 *  - For a live dump, restore dumpcfg.clevel and close the VCHR vnode.
2545  */
2546 void
2547 dumpsys(void)
2548 {
2549 	dumpsync_t *ds = &dumpsync;
2550 	taskq_t *livetaskq = NULL;
2551 	pfn_t pfn;
2552 	pgcnt_t bitnum;
2553 	proc_t *p;
2554 	helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
2555 	cbuf_t *cp;
2556 	pid_t npids, pidx;
2557 	char *content;
2558 	char *buf;
2559 	size_t size;
2560 	int save_dump_clevel;
2561 	dumpmlw_t mlw;
2562 	dumpcsize_t datatag;
2563 	dumpdatahdr_t datahdr;
2564
2565 	if (dumpvp == NULL || dumphdr == NULL) {
2566 		uprintf("skipping system dump - no dump device configured\n");
		/* Let CPUs spinning in dumpsys_helper() go; none are needed. */
2567 		if (panicstr) {
2568 			dumpcfg.helpers_wanted = 0;
2569 			dumpsys_spinunlock(&dumpcfg.helper_lock);
2570 		}
2571 		return;
2572 	}
2573 	dumpbuf.cur = dumpbuf.start;
2574
2575 	/* clear the sync variables */
2576 	ASSERT(dumpcfg.nhelper > 0);
2577 	bzero(ds, sizeof (*ds));
2578 	ds->dumpcpu = CPU->cpu_id;
2579
2580 	/*
2581 	 * Calculate the starting block for dump.  If we're dumping on a
2582 	 * swap device, start 1/5 of the way in; otherwise, start at the
2583 	 * beginning.  And never use the first page -- it may be a disk label.
2584 	 */
2585 	if (dumpvp->v_flag & VISSWAP)
2586 		dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET);
2587 	else
2588 		dumphdr->dump_start = DUMP_OFFSET;
2589
2590 	dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED;
2591 	dumphdr->dump_crashtime = gethrestime_sec();
2592 	dumphdr->dump_npages = 0;
2593 	dumphdr->dump_nvtop = 0;
2594 	bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize));
2595 	dump_timeleft = dump_timeout;
2596
	/* Panic dump: not live; record the formatted panic string. */
2597 	if (panicstr) {
2598 		dumphdr->dump_flags &= ~DF_LIVE;
2599 		(void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL);
2600 		(void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL);
2601 		(void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE,
2602 		    panicstr, panicargs);
2603
2604 	}
2605
2606 	if (dump_conflags & DUMP_ALL)
2607 		content = "all";
2608 	else if (dump_conflags & DUMP_CURPROC)
2609 		content = "kernel + curproc";
2610 	else
2611 		content = "kernel";
2612 	uprintf("dumping to %s, offset %lld, content: %s\n", dumppath,
2613 	    dumphdr->dump_start, content);
2614
2615 	/* Make sure nodename is current */
2616 	bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN);
2617
2618 	/*
2619 	 * If this is a live dump, try to open a VCHR vnode for better
2620 	 * performance. We must take care to flush the buffer cache
2621 	 * first.
2622 	 */
2623 	if (!panicstr) {
2624 		vnode_t *cdev_vp, *cmn_cdev_vp;
2625
2626 		ASSERT(dumpbuf.cdev_vp == NULL);
2627 		cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR);
2628 		if (cdev_vp != NULL) {
2629 			cmn_cdev_vp = common_specvp(cdev_vp);
2630 			if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL)
2631 			    == 0) {
2632 				if (vn_has_cached_data(dumpvp))
2633 					(void) pvn_vplist_dirty(dumpvp, 0, NULL,
2634 					    B_INVAL | B_TRUNC, kcred);
2635 				dumpbuf.cdev_vp = cmn_cdev_vp;
2636 			} else {
2637 				VN_RELE(cdev_vp);
2638 			}
2639 		}
2640 	}
2641
2642 	/*
2643 	 * Store a hires timestamp so we can look it up during debugging.
2644 	 */
2645 	lbolt_debug_entry();
2646
2647 	/*
2648 	 * Leave room for the message and ereport save areas and terminal dump
2649 	 * header.
2650 	 */
2651 	dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET -
2652 	    DUMP_ERPTSIZE;
2653
2654 	/*
2655 	 * Write out the symbol table.  It's no longer compressed,
2656 	 * so its 'size' and 'csize' are equal.
2657 	 */
2658 	dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE;
2659 	dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize =
2660 	    ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX);
2661
2662 	/*
2663 	 * Write out the translation map.
2664 	 */
2665 	dumphdr->dump_map = dumpvp_flush();
2666 	dump_as(&kas);
2667 	dumphdr->dump_nvtop += dump_plat_addr();
2668
2669 	/*
2670 	 * call into hat, which may have unmapped pages that also need to
2671 	 * be in the dump
2672 	 */
2673 	hat_dump();
2674
2675 	if (dump_conflags & DUMP_ALL) {
		/* Snapshot the active pids under pidlock, dump them after. */
2676 		mutex_enter(&pidlock);
2677
2678 		for (npids = 0, p = practive; p != NULL; p = p->p_next)
2679 			dumpcfg.pids[npids++] = p->p_pid;
2680
2681 		mutex_exit(&pidlock);
2682
2683 		for (pidx = 0; pidx < npids; pidx++)
2684 			(void) dump_process(dumpcfg.pids[pidx]);
2685
2686 		dump_init_memlist_walker(&mlw);
2687 		for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2688 			dump_timeleft = dump_timeout;
2689 			pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2690 			/*
2691 			 * Some hypervisors do not have all pages available to
2692 			 * be accessed by the guest OS.  Check for page
2693 			 * accessibility.
2694 			 */
2695 			if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) !=
2696 			    PLAT_HOLD_OK)
2697 				continue;
2698 			BT_SET(dumpcfg.bitmap, bitnum);
2699 		}
2700 		dumphdr->dump_npages = dumpcfg.bitmapsize;
2701 		dumphdr->dump_flags |= DF_ALL;
2702
2703 	} else if (dump_conflags & DUMP_CURPROC) {
2704 		/*
2705 		 * Determine which pid is to be dumped.  If we're panicking, we
2706 		 * dump the process associated with panic_thread (if any).  If
2707 		 * this is a live dump, we dump the process associated with
2708 		 * curthread.
2709 		 */
2710 		npids = 0;
2711 		if (panicstr) {
2712 			if (panic_thread != NULL &&
2713 			    panic_thread->t_procp != NULL &&
2714 			    panic_thread->t_procp != &p0) {
2715 				dumpcfg.pids[npids++] =
2716 				    panic_thread->t_procp->p_pid;
2717 			}
2718 		} else {
2719 			dumpcfg.pids[npids++] = curthread->t_procp->p_pid;
2720 		}
2721
		/* No process, or dump_process() failed: kernel-only dump. */
2722 		if (npids && dump_process(dumpcfg.pids[0]) == 0)
2723 			dumphdr->dump_flags |= DF_CURPROC;
2724 		else
2725 			dumphdr->dump_flags |= DF_KERNEL;
2726
2727 	} else {
2728 		dumphdr->dump_flags |= DF_KERNEL;
2729 	}
2730
2731 	dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1;
2732
2733 	/*
2734 	 * Write out the pfn table.
2735 	 */
2736 	dumphdr->dump_pfn = dumpvp_flush();
2737 	dump_init_memlist_walker(&mlw);
2738 	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
2739 		dump_timeleft = dump_timeout;
2740 		if (!BT_TEST(dumpcfg.bitmap, bitnum))
2741 			continue;
2742 		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
2743 		ASSERT(pfn != PFN_INVALID);
2744 		dumpvp_write(&pfn, sizeof (pfn_t));
2745 	}
2746 	dump_plat_pfn();
2747
2748 	/*
2749 	 * Write out all the pages.
2750 	 * Map pages, copy them handling UEs, compress, and write them out.
2751 	 * Cooperate with any helpers running on CPUs in panic_idle().
2752 	 */
2753 	dumphdr->dump_data = dumpvp_flush();
2754
2755 	bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU));
2756 	ds->live = dumpcfg.clevel > 0 &&
2757 	    (dumphdr->dump_flags & DF_LIVE) != 0;
2758
	/*
	 * A live dump configured for bzip2 is run with lzjb instead; the
	 * configured level is saved here and restored at the end.
	 */
2759 	save_dump_clevel = dumpcfg.clevel;
2760 	if (panicstr)
2761 		dumpsys_get_maxmem();
2762 	else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2763 		dumpcfg.clevel = DUMP_CLEVEL_LZJB;
2764
	/*
	 * Reset every helper that has a page buffer; helpers without one
	 * are marked DONEHELPER and take no part in this dump.
	 */
2765 	dumpcfg.nhelper_used = 0;
2766 	for (hp = dumpcfg.helper; hp != hpend; hp++) {
2767 		if (hp->page == NULL) {
2768 			hp->helper = DONEHELPER;
2769 			continue;
2770 		}
2771 		++dumpcfg.nhelper_used;
2772 		hp->helper = FREEHELPER;
2773 		hp->taskqid = NULL;
2774 		hp->ds = ds;
2775 		bzero(&hp->perpage, sizeof (hp->perpage));
2776 		if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
2777 			(void) BZ2_bzCompressReset(&hp->bzstream);
2778 	}
2779
2780 	CQ_OPEN(freebufq);
2781 	CQ_OPEN(helperq);
2782
	/* Seed freebufq with the output buffers and mainq with the maps. */
2783 	dumpcfg.ncbuf_used = 0;
2784 	for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) {
2785 		if (cp->buf != NULL) {
2786 			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
2787 			++dumpcfg.ncbuf_used;
2788 		}
2789 	}
2790
2791 	for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++)
2792 		CQ_PUT(mainq, cp, CBUF_FREEMAP);
2793
2794 	ds->start = gethrtime();
2795 	ds->iowaitts = ds->start;
2796
2797 	/* start helpers */
2798 	if (ds->live) {
		/* Live dump: one taskq thread per usable helper. */
2799 		int n = dumpcfg.nhelper_used;
2800 		int pri = MINCLSYSPRI - 25;
2801
2802 		livetaskq = taskq_create("LiveDump", n, pri, n, n,
2803 		    TASKQ_PREPOPULATE);
2804 		for (hp = dumpcfg.helper; hp != hpend; hp++) {
2805 			if (hp->page == NULL)
2806 				continue;
2807 			hp->helper = hp - dumpcfg.helper;
2808 			hp->taskqid = taskq_dispatch(livetaskq,
2809 			    dumpsys_live_helper, (void *)hp, TQ_NOSLEEP);
2810 		}
2811
2812 	} else {
		/*
		 * Publish whether helpers are wanted, then release
		 * helper_lock so CPUs waiting in dumpsys_helper() proceed.
		 */
2813 		if (panicstr)
2814 			kmem_dump_begin();
2815 		dumpcfg.helpers_wanted = dumpcfg.clevel > 0;
2816 		dumpsys_spinunlock(&dumpcfg.helper_lock);
2817 	}
2818
2819 	/* run main task */
2820 	dumpsys_main_task(ds);
2821
2822 	ds->elapsed = gethrtime() - ds->start;
2823 	if (ds->elapsed < 1)
2824 		ds->elapsed = 1;
2825
2826 	if (livetaskq != NULL)
2827 		taskq_destroy(livetaskq);
2828
2829 	if (ds->neednl) {
2830 		uprintf("\n");
2831 		ds->neednl = 0;
2832 	}
2833
2834 	/* record actual pages dumped */
2835 	dumphdr->dump_npages = ds->npages;
2836
2837 	/* platform-specific data */
2838 	dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf);
2839
2840 	/* note any errors by clearing DF_COMPLETE */
	/*
	 * NOTE(review): dump_npages was just set to ds->npages plus the
	 * return of dump_plat_data(), so the second test below is true
	 * exactly when that return is nonzero -- confirm this is intended.
	 */
2841 	if (dump_ioerr || ds->npages < dumphdr->dump_npages)
2842 		dumphdr->dump_flags &= ~DF_COMPLETE;
2843
2844 	/* end of stream blocks */
2845 	datatag = 0;
2846 	dumpvp_write(&datatag, sizeof (datatag));
2847
2848 	bzero(&datahdr, sizeof (datahdr));
2849
2850 	/* buffer for metrics */
2851 	buf = dumpcfg.cbuf[0].buf;
2852 	size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) -
2853 	    sizeof (dumpdatahdr_t));
2854
2855 	/* finish the kmem intercepts, collect kmem verbose info */
2856 	if (panicstr) {
2857 		datahdr.dump_metrics = kmem_dump_finish(buf, size);
2858 		buf += datahdr.dump_metrics;
2859 		size -= datahdr.dump_metrics;
2860 	}
2861
2862 	/* compression info in data header */
2863 	datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC;
2864 	datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION;
2865 	datahdr.dump_maxcsize = CBUF_SIZE;
2866 	datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE;
2867 	datahdr.dump_nstreams = dumpcfg.nhelper_used;
2868 	datahdr.dump_clevel = dumpcfg.clevel;
2869 #ifdef COLLECT_METRICS
2870 	if (dump_metrics_on)
2871 		datahdr.dump_metrics += dumpsys_metrics(ds, buf, size);
2872 #endif
2873 	datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data;
2874
2875 	/*
2876 	 * Write out the initial and terminal dump headers.
2877 	 */
2878 	dumpbuf.vp_off = dumphdr->dump_start;
2879 	dumpvp_write(dumphdr, sizeof (dumphdr_t));
2880 	(void) dumpvp_flush();
2881
	/* The terminal header occupies the last DUMP_OFFSET bytes. */
2882 	dumpbuf.vp_limit = dumpvp_size;
2883 	dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET;
2884 	dumpvp_write(dumphdr, sizeof (dumphdr_t));
2885 	dumpvp_write(&datahdr, sizeof (dumpdatahdr_t));
2886 	dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics);
2887
2888 	(void) dumpvp_flush();
2889
2890 	uprintf("\r%3d%% done: %llu pages dumped, ",
2891 	    ds->percent_done, (u_longlong_t)ds->npages);
2892
2893 	if (dump_ioerr == 0) {
2894 		uprintf("dump succeeded\n");
2895 	} else {
2896 		uprintf("dump failed: error %d\n", dump_ioerr);
2897 #ifdef DEBUG
2898 		if (panicstr)
2899 			debug_enter("dump failed");
2900 #endif
2901 	}
2902
2903 	/*
2904 	 * Write out all undelivered messages.  This has to be the *last*
2905 	 * thing we do because the dump process itself emits messages.
2906 	 */
2907 	if (panicstr) {
2908 		dump_ereports();
2909 		dump_messages();
2910 	}
2911
2912 	delay(2 * hz);	/* let people see the 'done' message */
2913 	dump_timeleft = 0;
2914 	dump_ioerr = 0;
2915
2916 	/* restore settings after live dump completes */
2917 	if (!panicstr) {
2918 		dumpcfg.clevel = save_dump_clevel;
2919
2920 		/* release any VCHR open of the dump device */
2921 		if (dumpbuf.cdev_vp != NULL) {
2922 			(void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0,
2923 			    kcred, NULL);
2924 			VN_RELE(dumpbuf.cdev_vp);
2925 			dumpbuf.cdev_vp = NULL;
2926 		}
2927 	}
2928 }
2929 
2930 /*
2931  * This function is called whenever the memory size, as represented
2932  * by the phys_install list, changes.
 *
 * With dump_lock held it calls dumphdr_init(), dumpbuf_resize() and
 * dump_update_clevel(), in that order.
2933  */
2934 void
2935 dump_resize()
2936 {
2937 	mutex_enter(&dump_lock);
2938 	dumphdr_init();
2939 	dumpbuf_resize();
2940 	dump_update_clevel();
2941 	mutex_exit(&dump_lock);
2942 }
2943 
2944 /*
2945  * This function allows for dynamic resizing of a dump area. It assumes that
2946  * the underlying device has update its appropriate size(9P).
2947  */
2948 int
2949 dumpvp_resize()
2950 {
2951 	int error;
2952 	vattr_t vattr;
2953 
2954 	mutex_enter(&dump_lock);
2955 	vattr.va_mask = AT_SIZE;
2956 	if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
2957 		mutex_exit(&dump_lock);
2958 		return (error);
2959 	}
2960 
2961 	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
2962 		mutex_exit(&dump_lock);
2963 		return (ENOSPC);
2964 	}
2965 
2966 	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
2967 	mutex_exit(&dump_lock);
2968 	return (0);
2969 }
2970