1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/param.h> 28 #include <sys/systm.h> 29 #include <sys/vm.h> 30 #include <sys/proc.h> 31 #include <sys/file.h> 32 #include <sys/conf.h> 33 #include <sys/kmem.h> 34 #include <sys/mem.h> 35 #include <sys/mman.h> 36 #include <sys/vnode.h> 37 #include <sys/errno.h> 38 #include <sys/memlist.h> 39 #include <sys/dumphdr.h> 40 #include <sys/dumpadm.h> 41 #include <sys/ksyms.h> 42 #include <sys/compress.h> 43 #include <sys/stream.h> 44 #include <sys/strsun.h> 45 #include <sys/cmn_err.h> 46 #include <sys/bitmap.h> 47 #include <sys/modctl.h> 48 #include <sys/utsname.h> 49 #include <sys/systeminfo.h> 50 #include <sys/vmem.h> 51 #include <sys/log.h> 52 #include <sys/var.h> 53 #include <sys/debug.h> 54 #include <sys/sunddi.h> 55 #include <fs/fs_subr.h> 56 #include <sys/fs/snode.h> 57 #include <sys/ontrap.h> 58 #include <sys/panic.h> 59 #include <sys/dkio.h> 60 #include <sys/vtoc.h> 61 #include <sys/errorq.h> 62 #include <sys/fm/util.h> 63 #include <sys/fs/zfs.h> 64 65 #include <vm/hat.h> 66 #include <vm/as.h> 67 #include <vm/page.h> 68 #include <vm/pvn.h> 69 #include <vm/seg.h> 70 #include <vm/seg_kmem.h> 71 #include <sys/clock_impl.h> 72 #include <sys/hold_page.h> 73 74 #include <bzip2/bzlib.h> 75 76 /* 77 * Crash dump time is dominated by disk write time. To reduce this, 78 * the stronger compression method bzip2 is applied to reduce the dump 79 * size and hence reduce I/O time. However, bzip2 is much more 80 * computationally expensive than the existing lzjb algorithm, so to 81 * avoid increasing compression time, CPUs that are otherwise idle 82 * during panic are employed to parallelize the compression task. 83 * Many helper CPUs are needed to prevent bzip2 from being a 84 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is 85 * parallelized instead. Lastly, I/O and compression are performed by 86 * different CPUs, and are hence overlapped in time, unlike the older 87 * serial code. 88 * 89 * Another important consideration is the speed of the dump 90 * device. Faster disks need less CPUs in order to benefit from 91 * parallel lzjb versus parallel bzip2. Therefore, the CPU count 92 * threshold for switching from parallel lzjb to paralled bzip2 is 93 * elevated for faster disks. The dump device speed is adduced from 94 * the setting for dumpbuf.iosize, see dump_update_clevel. 95 */ 96 97 /* 98 * exported vars 99 */ 100 kmutex_t dump_lock; /* lock for dump configuration */ 101 dumphdr_t *dumphdr; /* dump header */ 102 int dump_conflags = DUMP_KERNEL; /* dump configuration flags */ 103 vnode_t *dumpvp; /* dump device vnode pointer */ 104 u_offset_t dumpvp_size; /* size of dump device, in bytes */ 105 char *dumppath; /* pathname of dump device */ 106 int dump_timeout = 120; /* timeout for dumping pages */ 107 int dump_timeleft; /* portion of dump_timeout remaining */ 108 int dump_ioerr; /* dump i/o error */ 109 int dump_check_used; /* enable check for used pages */ 110 char *dump_stack_scratch; /* scratch area for saving stack summary */ 111 112 /* 113 * Tunables for dump compression and parallelism. These can be set via 114 * /etc/system. 115 * 116 * dump_ncpu_low number of helpers for parallel lzjb 117 * This is also the minimum configuration. 118 * 119 * dump_bzip2_level bzip2 compression level: 1-9 120 * Higher numbers give greater compression, but take more memory 121 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB). 122 * 123 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform): 124 * if dump_plat_mincpu == 0, then always do single threaded dump 125 * if ncpu >= dump_plat_mincpu then try to use bzip2 126 * 127 * dump_metrics_on if set, metrics are collected in the kernel, passed 128 * to savecore via the dump file, and recorded by savecore in 129 * METRICS.txt. 130 */ 131 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */ 132 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */ 133 134 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */ 135 #define MINCPU_NOT_SET ((uint_t)-1) 136 uint_t dump_plat_mincpu = MINCPU_NOT_SET; 137 138 /* tunables for pre-reserved heap */ 139 uint_t dump_kmem_permap = 1024; 140 uint_t dump_kmem_pages = 8; 141 142 /* Define multiple buffers per helper to avoid stalling */ 143 #define NCBUF_PER_HELPER 2 144 #define NCMAP_PER_HELPER 4 145 146 /* minimum number of helpers configured */ 147 #define MINHELPERS (dump_ncpu_low) 148 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER) 149 150 /* 151 * Define constant parameters. 152 * 153 * CBUF_SIZE size of an output buffer 154 * 155 * CBUF_MAPSIZE size of virtual range for mapping pages 156 * 157 * CBUF_MAPNP size of virtual range in pages 158 * 159 */ 160 #define DUMP_1KB ((size_t)1 << 10) 161 #define DUMP_1MB ((size_t)1 << 20) 162 #define CBUF_SIZE ((size_t)1 << 17) 163 #define CBUF_MAPSHIFT (22) 164 #define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT) 165 #define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT)) 166 167 /* 168 * Compression metrics are accumulated nano-second subtotals. The 169 * results are normalized by the number of pages dumped. A report is 170 * generated when dumpsys() completes and is saved in the dump image 171 * after the trailing dump header. 172 * 173 * Metrics are always collected. Set the variable dump_metrics_on to 174 * cause metrics to be saved in the crash file, where savecore will 175 * save it in the file METRICS.txt. 176 */ 177 #define PERPAGES \ 178 PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \ 179 PERPAGE(copy) PERPAGE(compress) \ 180 PERPAGE(write) \ 181 PERPAGE(inwait) PERPAGE(outwait) 182 183 typedef struct perpage { 184 #define PERPAGE(x) hrtime_t x; 185 PERPAGES 186 #undef PERPAGE 187 } perpage_t; 188 189 /* 190 * This macro controls the code generation for collecting dump 191 * performance information. By default, the code is generated, but 192 * automatic saving of the information is disabled. If dump_metrics_on 193 * is set to 1, the timing information is passed to savecore via the 194 * crash file, where it is appended to the file dump-dir/METRICS.txt. 195 */ 196 #define COLLECT_METRICS 197 198 #ifdef COLLECT_METRICS 199 uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */ 200 201 #define HRSTART(v, m) v##ts.m = gethrtime() 202 #define HRSTOP(v, m) v.m += gethrtime() - v##ts.m 203 #define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s 204 #define HREND(v, m) v.m += gethrtime() - v##ts.m 205 #define HRNORM(v, m, n) v.m /= (n) 206 207 #else 208 #define HRSTART(v, m) 209 #define HRSTOP(v, m) 210 #define HRBEGIN(v, m, s) 211 #define HREND(v, m) 212 #define HRNORM(v, m, n) 213 #endif /* COLLECT_METRICS */ 214 215 /* 216 * Buffers for copying and compressing memory pages. 217 * 218 * cbuf_t buffer controllers: used for both input and output. 219 * 220 * The buffer state indicates how it is being used: 221 * 222 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for 223 * mapping input pages. 224 * 225 * CBUF_INREADY: input pages are mapped and ready for compression by a 226 * helper. 227 * 228 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap. 229 * 230 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available. 231 * 232 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper, 233 * ready to write out. 234 * 235 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper 236 * (reports UE errors.) 237 */ 238 239 typedef enum cbufstate { 240 CBUF_FREEMAP, 241 CBUF_INREADY, 242 CBUF_USEDMAP, 243 CBUF_FREEBUF, 244 CBUF_WRITE, 245 CBUF_ERRMSG 246 } cbufstate_t; 247 248 typedef struct cbuf cbuf_t; 249 250 struct cbuf { 251 cbuf_t *next; /* next in list */ 252 cbufstate_t state; /* processing state */ 253 size_t used; /* amount used */ 254 size_t size; /* mem size */ 255 char *buf; /* kmem or vmem */ 256 pgcnt_t pagenum; /* index to pfn map */ 257 pgcnt_t bitnum; /* first set bitnum */ 258 pfn_t pfn; /* first pfn in mapped range */ 259 int off; /* byte offset to first pfn */ 260 }; 261 262 static char dump_osimage_uuid[36 + 1]; 263 264 #define isdigit(ch) ((ch) >= '0' && (ch) <= '9') 265 #define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ 266 ((ch) >= 'A' && (ch) <= 'F')) 267 268 /* 269 * cqueue_t queues: a uni-directional channel for communication 270 * from the master to helper tasks or vice-versa using put and 271 * get primitives. Both mappings and data buffers are passed via 272 * queues. Producers close a queue when done. The number of 273 * active producers is reference counted so the consumer can 274 * detect end of data. Concurrent access is mediated by atomic 275 * operations for panic dump, or mutex/cv for live dump. 276 * 277 * There a four queues, used as follows: 278 * 279 * Queue Dataflow NewState 280 * -------------------------------------------------- 281 * mainq master -> master FREEMAP 282 * master has initialized or unmapped an input buffer 283 * -------------------------------------------------- 284 * helperq master -> helper INREADY 285 * master has mapped input for use by helper 286 * -------------------------------------------------- 287 * mainq master <- helper USEDMAP 288 * helper is done with input 289 * -------------------------------------------------- 290 * freebufq master -> helper FREEBUF 291 * master has initialized or written an output buffer 292 * -------------------------------------------------- 293 * mainq master <- helper WRITE 294 * block of compressed pages from a helper 295 * -------------------------------------------------- 296 * mainq master <- helper ERRMSG 297 * error messages from a helper (memory error case) 298 * -------------------------------------------------- 299 * writerq master <- master WRITE 300 * non-blocking queue of blocks to write 301 * -------------------------------------------------- 302 */ 303 typedef struct cqueue { 304 cbuf_t *volatile first; /* first in list */ 305 cbuf_t *last; /* last in list */ 306 hrtime_t ts; /* timestamp */ 307 hrtime_t empty; /* total time empty */ 308 kmutex_t mutex; /* live state lock */ 309 kcondvar_t cv; /* live wait var */ 310 lock_t spinlock; /* panic mode spin lock */ 311 volatile uint_t open; /* producer ref count */ 312 } cqueue_t; 313 314 /* 315 * Convenience macros for using the cqueue functions 316 * Note that the caller must have defined "dumpsync_t *ds" 317 */ 318 #define CQ_IS_EMPTY(q) \ 319 (ds->q.first == NULL) 320 321 #define CQ_OPEN(q) \ 322 atomic_inc_uint(&ds->q.open) 323 324 #define CQ_CLOSE(q) \ 325 dumpsys_close_cq(&ds->q, ds->live) 326 327 #define CQ_PUT(q, cp, st) \ 328 dumpsys_put_cq(&ds->q, cp, st, ds->live) 329 330 #define CQ_GET(q) \ 331 dumpsys_get_cq(&ds->q, ds->live) 332 333 /* 334 * Dynamic state when dumpsys() is running. 335 */ 336 typedef struct dumpsync { 337 pgcnt_t npages; /* subtotal of pages dumped */ 338 pgcnt_t pages_mapped; /* subtotal of pages mapped */ 339 pgcnt_t pages_used; /* subtotal of pages used per map */ 340 size_t nwrite; /* subtotal of bytes written */ 341 uint_t live; /* running live dump */ 342 uint_t neednl; /* will need to print a newline */ 343 uint_t percent; /* dump progress */ 344 uint_t percent_done; /* dump progress reported */ 345 cqueue_t freebufq; /* free kmem bufs for writing */ 346 cqueue_t mainq; /* input for main task */ 347 cqueue_t helperq; /* input for helpers */ 348 cqueue_t writerq; /* input for writer */ 349 hrtime_t start; /* start time */ 350 hrtime_t elapsed; /* elapsed time when completed */ 351 hrtime_t iotime; /* time spent writing nwrite bytes */ 352 hrtime_t iowait; /* time spent waiting for output */ 353 hrtime_t iowaitts; /* iowait timestamp */ 354 perpage_t perpage; /* metrics */ 355 perpage_t perpagets; 356 int dumpcpu; /* master cpu */ 357 } dumpsync_t; 358 359 static dumpsync_t dumpsync; /* synchronization vars */ 360 361 /* 362 * helper_t helpers: contains the context for a stream. CPUs run in 363 * parallel at dump time; each CPU creates a single stream of 364 * compression data. Stream data is divided into CBUF_SIZE blocks. 365 * The blocks are written in order within a stream. But, blocks from 366 * multiple streams can be interleaved. Each stream is identified by a 367 * unique tag. 368 */ 369 typedef struct helper { 370 int helper; /* bound helper id */ 371 int tag; /* compression stream tag */ 372 perpage_t perpage; /* per page metrics */ 373 perpage_t perpagets; /* per page metrics (timestamps) */ 374 taskqid_t taskqid; /* live dump task ptr */ 375 int in, out; /* buffer offsets */ 376 cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */ 377 dumpsync_t *ds; /* pointer to sync vars */ 378 size_t used; /* counts input consumed */ 379 char *page; /* buffer for page copy */ 380 char *lzbuf; /* lzjb output */ 381 bz_stream bzstream; /* bzip2 state */ 382 } helper_t; 383 384 #define MAINHELPER (-1) /* helper is also the main task */ 385 #define FREEHELPER (-2) /* unbound helper */ 386 #define DONEHELPER (-3) /* helper finished */ 387 388 /* 389 * configuration vars for dumpsys 390 */ 391 typedef struct dumpcfg { 392 int threshold; /* ncpu threshold for bzip2 */ 393 int nhelper; /* number of helpers */ 394 int nhelper_used; /* actual number of helpers used */ 395 int ncmap; /* number VA pages for compression */ 396 int ncbuf; /* number of bufs for compression */ 397 int ncbuf_used; /* number of bufs in use */ 398 uint_t clevel; /* dump compression level */ 399 helper_t *helper; /* array of helpers */ 400 cbuf_t *cmap; /* array of input (map) buffers */ 401 cbuf_t *cbuf; /* array of output buffers */ 402 ulong_t *helpermap; /* set of dumpsys helper CPU ids */ 403 ulong_t *bitmap; /* bitmap for marking pages to dump */ 404 ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */ 405 pgcnt_t bitmapsize; /* size of bitmap */ 406 pgcnt_t rbitmapsize; /* size of bitmap for ranges */ 407 pgcnt_t found4m; /* number ranges allocated by dump */ 408 pgcnt_t foundsm; /* number small pages allocated by dump */ 409 pid_t *pids; /* list of process IDs at dump time */ 410 size_t maxsize; /* memory size needed at dump time */ 411 size_t maxvmsize; /* size of reserved VM */ 412 char *maxvm; /* reserved VM for spare pages */ 413 lock_t helper_lock; /* protect helper state */ 414 char helpers_wanted; /* flag to enable parallelism */ 415 char helper_present; /* at least one helper showed up */ 416 } dumpcfg_t; 417 418 static dumpcfg_t dumpcfg; /* config vars */ 419 420 /* 421 * The dump I/O buffer. 422 * 423 * There is one I/O buffer used by dumpvp_write and dumvp_flush. It is 424 * sized according to the optimum device transfer speed. 425 */ 426 typedef struct dumpbuf { 427 vnode_t *cdev_vp; /* VCHR open of the dump device */ 428 len_t vp_limit; /* maximum write offset */ 429 offset_t vp_off; /* current dump device offset */ 430 char *cur; /* dump write pointer */ 431 char *start; /* dump buffer address */ 432 char *end; /* dump buffer end */ 433 size_t size; /* size of dumpbuf in bytes */ 434 size_t iosize; /* best transfer size for device */ 435 } dumpbuf_t; 436 437 dumpbuf_t dumpbuf; /* I/O buffer */ 438 439 /* 440 * The dump I/O buffer must be at least one page, at most xfer_size 441 * bytes, and should scale with physmem in between. The transfer size 442 * passed in will either represent a global default (maxphys) or the 443 * best size for the device. The size of the dumpbuf I/O buffer is 444 * limited by dumpbuf_limit (8MB by default) because the dump 445 * performance saturates beyond a certain size. The default is to 446 * select 1/4096 of the memory. 447 */ 448 static int dumpbuf_fraction = 12; /* memory size scale factor */ 449 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */ 450 451 static size_t 452 dumpbuf_iosize(size_t xfer_size) 453 { 454 size_t iosize = ptob(physmem >> dumpbuf_fraction); 455 456 if (iosize < PAGESIZE) 457 iosize = PAGESIZE; 458 else if (iosize > xfer_size) 459 iosize = xfer_size; 460 if (iosize > dumpbuf_limit) 461 iosize = dumpbuf_limit; 462 return (iosize & PAGEMASK); 463 } 464 465 /* 466 * resize the I/O buffer 467 */ 468 static void 469 dumpbuf_resize(void) 470 { 471 char *old_buf = dumpbuf.start; 472 size_t old_size = dumpbuf.size; 473 char *new_buf; 474 size_t new_size; 475 476 ASSERT(MUTEX_HELD(&dump_lock)); 477 478 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys)); 479 if (new_size <= old_size) 480 return; /* no need to reallocate buffer */ 481 482 new_buf = kmem_alloc(new_size, KM_SLEEP); 483 dumpbuf.size = new_size; 484 dumpbuf.start = new_buf; 485 dumpbuf.end = new_buf + new_size; 486 kmem_free(old_buf, old_size); 487 } 488 489 /* 490 * dump_update_clevel is called when dumpadm configures the dump device. 491 * Calculate number of helpers and buffers. 492 * Allocate the minimum configuration for now. 493 * 494 * When the dump file is configured we reserve a minimum amount of 495 * memory for use at crash time. But we reserve VA for all the memory 496 * we really want in order to do the fastest dump possible. The VA is 497 * backed by pages not being dumped, according to the bitmap. If 498 * there is insufficient spare memory, however, we fall back to the 499 * minimum. 500 * 501 * Live dump (savecore -L) always uses the minimum config. 502 * 503 * clevel 0 is single threaded lzjb 504 * clevel 1 is parallel lzjb 505 * clevel 2 is parallel bzip2 506 * 507 * The ncpu threshold is selected with dump_plat_mincpu. 508 * On OPL, set_platform_defaults() overrides the sun4u setting. 509 * The actual values are defined via DUMP_PLAT_*_MINCPU macros. 510 * 511 * Architecture Threshold Algorithm 512 * sun4u < 51 parallel lzjb 513 * sun4u >= 51 parallel bzip2(*) 514 * sun4u OPL < 8 parallel lzjb 515 * sun4u OPL >= 8 parallel bzip2(*) 516 * sun4v < 128 parallel lzjb 517 * sun4v >= 128 parallel bzip2(*) 518 * x86 < 11 parallel lzjb 519 * x86 >= 11 parallel bzip2(*) 520 * 32-bit N/A single-threaded lzjb 521 * 522 * (*) bzip2 is only chosen if there is sufficient available 523 * memory for buffers at dump time. See dumpsys_get_maxmem(). 524 * 525 * Faster dump devices have larger I/O buffers. The threshold value is 526 * increased according to the size of the dump I/O buffer, because 527 * parallel lzjb performs better with faster disks. For buffers >= 1MB 528 * the threshold is 3X; for buffers >= 256K threshold is 2X. 529 * 530 * For parallel dumps, the number of helpers is ncpu-1. The CPU 531 * running panic runs the main task. For single-threaded dumps, the 532 * panic CPU does lzjb compression (it is tagged as MAINHELPER.) 533 * 534 * Need multiple buffers per helper so that they do not block waiting 535 * for the main task. 536 * parallel single-threaded 537 * Number of output buffers: nhelper*2 1 538 * Number of mapping buffers: nhelper*4 1 539 * 540 */ 541 static void 542 dump_update_clevel() 543 { 544 int tag; 545 size_t bz2size; 546 helper_t *hp, *hpend; 547 cbuf_t *cp, *cpend; 548 dumpcfg_t *old = &dumpcfg; 549 dumpcfg_t newcfg = *old; 550 dumpcfg_t *new = &newcfg; 551 552 ASSERT(MUTEX_HELD(&dump_lock)); 553 554 /* 555 * Free the previously allocated bufs and VM. 556 */ 557 if (old->helper != NULL) { 558 559 /* helpers */ 560 hpend = &old->helper[old->nhelper]; 561 for (hp = old->helper; hp != hpend; hp++) { 562 if (hp->lzbuf != NULL) 563 kmem_free(hp->lzbuf, PAGESIZE); 564 if (hp->page != NULL) 565 kmem_free(hp->page, PAGESIZE); 566 } 567 kmem_free(old->helper, old->nhelper * sizeof (helper_t)); 568 569 /* VM space for mapping pages */ 570 cpend = &old->cmap[old->ncmap]; 571 for (cp = old->cmap; cp != cpend; cp++) 572 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE); 573 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t)); 574 575 /* output bufs */ 576 cpend = &old->cbuf[old->ncbuf]; 577 for (cp = old->cbuf; cp != cpend; cp++) 578 if (cp->buf != NULL) 579 kmem_free(cp->buf, cp->size); 580 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t)); 581 582 /* reserved VM for dumpsys_get_maxmem */ 583 if (old->maxvmsize > 0) 584 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize); 585 } 586 587 /* 588 * Allocate memory and VM. 589 * One CPU runs dumpsys, the rest are helpers. 590 */ 591 new->nhelper = ncpus - 1; 592 if (new->nhelper < 1) 593 new->nhelper = 1; 594 595 if (new->nhelper > DUMP_MAX_NHELPER) 596 new->nhelper = DUMP_MAX_NHELPER; 597 598 /* use platform default, unless /etc/system overrides */ 599 if (dump_plat_mincpu == MINCPU_NOT_SET) 600 dump_plat_mincpu = dump_plat_mincpu_default; 601 602 /* increase threshold for faster disks */ 603 new->threshold = dump_plat_mincpu; 604 if (dumpbuf.iosize >= DUMP_1MB) 605 new->threshold *= 3; 606 else if (dumpbuf.iosize >= (256 * DUMP_1KB)) 607 new->threshold *= 2; 608 609 /* figure compression level based upon the computed threshold. */ 610 if (dump_plat_mincpu == 0 || new->nhelper < 2) { 611 new->clevel = 0; 612 new->nhelper = 1; 613 } else if ((new->nhelper + 1) >= new->threshold) { 614 new->clevel = DUMP_CLEVEL_BZIP2; 615 } else { 616 new->clevel = DUMP_CLEVEL_LZJB; 617 } 618 619 if (new->clevel == 0) { 620 new->ncbuf = 1; 621 new->ncmap = 1; 622 } else { 623 new->ncbuf = NCBUF_PER_HELPER * new->nhelper; 624 new->ncmap = NCMAP_PER_HELPER * new->nhelper; 625 } 626 627 /* 628 * Allocate new data structures and buffers for MINHELPERS, 629 * and also figure the max desired size. 630 */ 631 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 632 new->maxsize = 0; 633 new->maxvmsize = 0; 634 new->maxvm = NULL; 635 tag = 1; 636 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP); 637 hpend = &new->helper[new->nhelper]; 638 for (hp = new->helper; hp != hpend; hp++) { 639 hp->tag = tag++; 640 if (hp < &new->helper[MINHELPERS]) { 641 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP); 642 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP); 643 } else if (new->clevel < DUMP_CLEVEL_BZIP2) { 644 new->maxsize += 2 * PAGESIZE; 645 } else { 646 new->maxsize += PAGESIZE; 647 } 648 if (new->clevel >= DUMP_CLEVEL_BZIP2) 649 new->maxsize += bz2size; 650 } 651 652 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP); 653 cpend = &new->cbuf[new->ncbuf]; 654 for (cp = new->cbuf; cp != cpend; cp++) { 655 cp->state = CBUF_FREEBUF; 656 cp->size = CBUF_SIZE; 657 if (cp < &new->cbuf[MINCBUFS]) 658 cp->buf = kmem_alloc(cp->size, KM_SLEEP); 659 else 660 new->maxsize += cp->size; 661 } 662 663 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP); 664 cpend = &new->cmap[new->ncmap]; 665 for (cp = new->cmap; cp != cpend; cp++) { 666 cp->state = CBUF_FREEMAP; 667 cp->size = CBUF_MAPSIZE; 668 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE, 669 0, 0, NULL, NULL, VM_SLEEP); 670 } 671 672 /* reserve VA to be backed with spare pages at crash time */ 673 if (new->maxsize > 0) { 674 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE); 675 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE); 676 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize, 677 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP); 678 } 679 680 /* 681 * Reserve memory for kmem allocation calls made during crash 682 * dump. The hat layer allocates memory for each mapping 683 * created, and the I/O path allocates buffers and data structs. 684 * Add a few pages for safety. 685 */ 686 kmem_dump_init((new->ncmap * dump_kmem_permap) + 687 (dump_kmem_pages * PAGESIZE)); 688 689 /* set new config pointers */ 690 *old = *new; 691 } 692 693 /* 694 * Define a struct memlist walker to optimize bitnum to pfn 695 * lookup. The walker maintains the state of the list traversal. 696 */ 697 typedef struct dumpmlw { 698 struct memlist *mp; /* current memlist */ 699 pgcnt_t basenum; /* bitnum base offset */ 700 pgcnt_t mppages; /* current memlist size */ 701 pgcnt_t mpleft; /* size to end of current memlist */ 702 pfn_t mpaddr; /* first pfn in memlist */ 703 } dumpmlw_t; 704 705 /* initialize the walker */ 706 static inline void 707 dump_init_memlist_walker(dumpmlw_t *pw) 708 { 709 pw->mp = phys_install; 710 pw->basenum = 0; 711 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 712 pw->mpleft = pw->mppages; 713 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 714 } 715 716 /* 717 * Lookup pfn given bitnum. The memlist can be quite long on some 718 * systems (e.g.: one per board). To optimize sequential lookups, the 719 * caller initializes and presents a memlist walker. 720 */ 721 static pfn_t 722 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw) 723 { 724 bitnum -= pw->basenum; 725 while (pw->mp != NULL) { 726 if (bitnum < pw->mppages) { 727 pw->mpleft = pw->mppages - bitnum; 728 return (pw->mpaddr + bitnum); 729 } 730 bitnum -= pw->mppages; 731 pw->basenum += pw->mppages; 732 pw->mp = pw->mp->ml_next; 733 if (pw->mp != NULL) { 734 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 735 pw->mpleft = pw->mppages; 736 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 737 } 738 } 739 return (PFN_INVALID); 740 } 741 742 static pgcnt_t 743 dump_pfn_to_bitnum(pfn_t pfn) 744 { 745 struct memlist *mp; 746 pgcnt_t bitnum = 0; 747 748 for (mp = phys_install; mp != NULL; mp = mp->ml_next) { 749 if (pfn >= (mp->ml_address >> PAGESHIFT) && 750 pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT)) 751 return (bitnum + pfn - (mp->ml_address >> PAGESHIFT)); 752 bitnum += mp->ml_size >> PAGESHIFT; 753 } 754 return ((pgcnt_t)-1); 755 } 756 757 /* 758 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The 759 * mapping of pfn to range index is imperfect because pfn and bitnum 760 * do not have the same phase. To make sure a CBUF_MAPSIZE range is 761 * covered, call this for both ends: 762 * dump_set_used(base) 763 * dump_set_used(base+CBUF_MAPNP-1) 764 * 765 * This is used during a panic dump to mark pages allocated by 766 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by 767 * page_get_mnode_freelist() to make sure pages used by dump are never 768 * allocated. 769 */ 770 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT)) 771 772 static void 773 dump_set_used(pfn_t pfn) 774 { 775 776 pgcnt_t bitnum, rbitnum; 777 778 bitnum = dump_pfn_to_bitnum(pfn); 779 ASSERT(bitnum != (pgcnt_t)-1); 780 781 rbitnum = CBUF_MAPP2R(bitnum); 782 ASSERT(rbitnum < dumpcfg.rbitmapsize); 783 784 BT_SET(dumpcfg.rbitmap, rbitnum); 785 } 786 787 int 788 dump_test_used(pfn_t pfn) 789 { 790 pgcnt_t bitnum, rbitnum; 791 792 bitnum = dump_pfn_to_bitnum(pfn); 793 ASSERT(bitnum != (pgcnt_t)-1); 794 795 rbitnum = CBUF_MAPP2R(bitnum); 796 ASSERT(rbitnum < dumpcfg.rbitmapsize); 797 798 return (BT_TEST(dumpcfg.rbitmap, rbitnum)); 799 } 800 801 /* 802 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library. 803 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit(). 804 */ 805 static void * 806 dumpbzalloc(void *opaque, int items, int size) 807 { 808 size_t *sz; 809 char *ret; 810 811 ASSERT(opaque != NULL); 812 sz = opaque; 813 ret = dumpcfg.maxvm + *sz; 814 *sz += items * size; 815 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN); 816 ASSERT(*sz <= dumpcfg.maxvmsize); 817 return (ret); 818 } 819 820 /*ARGSUSED*/ 821 static void 822 dumpbzfree(void *opaque, void *addr) 823 { 824 } 825 826 /* 827 * Perform additional checks on the page to see if we can really use 828 * it. The kernel (kas) pages are always set in the bitmap. However, 829 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the 830 * bitmap. So we check for them. 831 */ 832 static inline int 833 dump_pfn_check(pfn_t pfn) 834 { 835 page_t *pp = page_numtopp_nolock(pfn); 836 if (pp == NULL || pp->p_pagenum != pfn || 837 #if defined(__sparc) 838 pp->p_vnode == &promvp || 839 #else 840 PP_ISBOOTPAGES(pp) || 841 #endif 842 pp->p_toxic != 0) 843 return (0); 844 return (1); 845 } 846 847 /* 848 * Check a range to see if all contained pages are available and 849 * return non-zero if the range can be used. 850 */ 851 static inline int 852 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn) 853 { 854 for (; start < end; start++, pfn++) { 855 if (BT_TEST(dumpcfg.bitmap, start)) 856 return (0); 857 if (!dump_pfn_check(pfn)) 858 return (0); 859 } 860 return (1); 861 } 862 863 /* 864 * dumpsys_get_maxmem() is called during panic. Find unused ranges 865 * and use them for buffers. If we find enough memory switch to 866 * parallel bzip2, otherwise use parallel lzjb. 867 * 868 * It searches the dump bitmap in 2 passes. The first time it looks 869 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages. 870 */ 871 static void 872 dumpsys_get_maxmem() 873 { 874 dumpcfg_t *cfg = &dumpcfg; 875 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf]; 876 helper_t *endhp = &cfg->helper[cfg->nhelper]; 877 pgcnt_t bitnum, end; 878 size_t sz, endsz, bz2size; 879 pfn_t pfn, off; 880 cbuf_t *cp; 881 helper_t *hp, *ohp; 882 dumpmlw_t mlw; 883 int k; 884 885 /* 886 * Fall back to doing a serial dump if no helpers showed 887 * up. It is possible for other CPUs to be stuck in PROM, or 888 * DRd out. panic("sync initiated") in sync_handler() is one 889 * case. A parallel dump will hang (dump time out) unless 890 * there is at least one helper CPU. At this point dumpsys() 891 * has done some I/O, which means there has been plenty of 892 * time for helpers to arrive. 893 */ 894 if (!cfg->helper_present) { 895 cfg->clevel = 0; 896 return; 897 } 898 899 /* 900 * There may be no point in looking for spare memory. If 901 * dumping all memory, then none is spare. If doing a serial 902 * dump, then already have buffers. 903 */ 904 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB || 905 (dump_conflags & DUMP_ALL) != 0) { 906 if (cfg->clevel > DUMP_CLEVEL_LZJB) 907 cfg->clevel = DUMP_CLEVEL_LZJB; 908 return; 909 } 910 911 sz = 0; 912 cfg->found4m = 0; 913 cfg->foundsm = 0; 914 915 /* bitmap of ranges used to estimate which pfns are being used */ 916 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize)); 917 918 /* find ranges that are not being dumped to use for buffers */ 919 dump_init_memlist_walker(&mlw); 920 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 921 dump_timeleft = dump_timeout; 922 end = bitnum + CBUF_MAPNP; 923 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 924 ASSERT(pfn != PFN_INVALID); 925 926 /* skip partial range at end of mem segment */ 927 if (mlw.mpleft < CBUF_MAPNP) { 928 end = bitnum + mlw.mpleft; 929 continue; 930 } 931 932 /* skip non aligned pages */ 933 off = P2PHASE(pfn, CBUF_MAPNP); 934 if (off != 0) { 935 end -= off; 936 continue; 937 } 938 939 if (!dump_range_check(bitnum, end, pfn)) 940 continue; 941 942 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize); 943 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn, 944 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 945 sz += CBUF_MAPSIZE; 946 cfg->found4m++; 947 948 /* set the bitmap for both ends to be sure to cover the range */ 949 dump_set_used(pfn); 950 dump_set_used(pfn + CBUF_MAPNP - 1); 951 952 if (sz >= cfg->maxsize) 953 goto foundmax; 954 } 955 956 /* Add small pages if we can't find enough large pages. */ 957 dump_init_memlist_walker(&mlw); 958 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 959 dump_timeleft = dump_timeout; 960 end = bitnum + CBUF_MAPNP; 961 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 962 ASSERT(pfn != PFN_INVALID); 963 964 /* Find any non-aligned pages at start and end of segment. */ 965 off = P2PHASE(pfn, CBUF_MAPNP); 966 if (mlw.mpleft < CBUF_MAPNP) { 967 end = bitnum + mlw.mpleft; 968 } else if (off != 0) { 969 end -= off; 970 } else if (cfg->found4m && dump_test_used(pfn)) { 971 continue; 972 } 973 974 for (; bitnum < end; bitnum++, pfn++) { 975 dump_timeleft = dump_timeout; 976 if (BT_TEST(dumpcfg.bitmap, bitnum)) 977 continue; 978 if (!dump_pfn_check(pfn)) 979 continue; 980 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize); 981 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn, 982 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 983 sz += PAGESIZE; 984 cfg->foundsm++; 985 dump_set_used(pfn); 986 if (sz >= cfg->maxsize) 987 goto foundmax; 988 } 989 } 990 991 /* Fall back to lzjb if we did not get enough memory for bzip2. */ 992 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper; 993 if (sz < endsz) { 994 cfg->clevel = DUMP_CLEVEL_LZJB; 995 } 996 997 /* Allocate memory for as many helpers as we can. */ 998 foundmax: 999 1000 /* Byte offsets into memory found and mapped above */ 1001 endsz = sz; 1002 sz = 0; 1003 1004 /* Set the size for bzip2 state. Only bzip2 needs it. */ 1005 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 1006 1007 /* Skip the preallocate output buffers. */ 1008 cp = &cfg->cbuf[MINCBUFS]; 1009 1010 /* Use this to move memory up from the preallocated helpers. */ 1011 ohp = cfg->helper; 1012 1013 /* Loop over all helpers and allocate memory. */ 1014 for (hp = cfg->helper; hp < endhp; hp++) { 1015 1016 /* Skip preallocated helpers by checking hp->page. */ 1017 if (hp->page == NULL) { 1018 if (cfg->clevel <= DUMP_CLEVEL_LZJB) { 1019 /* lzjb needs 2 1-page buffers */ 1020 if ((sz + (2 * PAGESIZE)) > endsz) 1021 break; 1022 hp->page = cfg->maxvm + sz; 1023 sz += PAGESIZE; 1024 hp->lzbuf = cfg->maxvm + sz; 1025 sz += PAGESIZE; 1026 1027 } else if (ohp->lzbuf != NULL) { 1028 /* re-use the preallocted lzjb page for bzip2 */ 1029 hp->page = ohp->lzbuf; 1030 ohp->lzbuf = NULL; 1031 ++ohp; 1032 1033 } else { 1034 /* bzip2 needs a 1-page buffer */ 1035 if ((sz + PAGESIZE) > endsz) 1036 break; 1037 hp->page = cfg->maxvm + sz; 1038 sz += PAGESIZE; 1039 } 1040 } 1041 1042 /* 1043 * Add output buffers per helper. The number of 1044 * buffers per helper is determined by the ratio of 1045 * ncbuf to nhelper. 1046 */ 1047 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz && 1048 k < NCBUF_PER_HELPER; k++) { 1049 cp->state = CBUF_FREEBUF; 1050 cp->size = CBUF_SIZE; 1051 cp->buf = cfg->maxvm + sz; 1052 sz += CBUF_SIZE; 1053 ++cp; 1054 } 1055 1056 /* 1057 * bzip2 needs compression state. Use the dumpbzalloc 1058 * and dumpbzfree callbacks to allocate the memory. 1059 * bzip2 does allocation only at init time. 1060 */ 1061 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) { 1062 if ((sz + bz2size) > endsz) { 1063 hp->page = NULL; 1064 break; 1065 } else { 1066 hp->bzstream.opaque = &sz; 1067 hp->bzstream.bzalloc = dumpbzalloc; 1068 hp->bzstream.bzfree = dumpbzfree; 1069 (void) BZ2_bzCompressInit(&hp->bzstream, 1070 dump_bzip2_level, 0, 0); 1071 hp->bzstream.opaque = NULL; 1072 } 1073 } 1074 } 1075 1076 /* Finish allocating output buffers */ 1077 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) { 1078 cp->state = CBUF_FREEBUF; 1079 cp->size = CBUF_SIZE; 1080 cp->buf = cfg->maxvm + sz; 1081 sz += CBUF_SIZE; 1082 } 1083 1084 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */ 1085 if (cfg->found4m || cfg->foundsm) 1086 dump_check_used = 1; 1087 1088 ASSERT(sz <= endsz); 1089 } 1090 1091 static void 1092 dumphdr_init(void) 1093 { 1094 pgcnt_t npages = 0; 1095 1096 ASSERT(MUTEX_HELD(&dump_lock)); 1097 1098 if (dumphdr == NULL) { 1099 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP); 1100 dumphdr->dump_magic = DUMP_MAGIC; 1101 dumphdr->dump_version = DUMP_VERSION; 1102 dumphdr->dump_wordsize = DUMP_WORDSIZE; 1103 dumphdr->dump_pageshift = PAGESHIFT; 1104 dumphdr->dump_pagesize = PAGESIZE; 1105 dumphdr->dump_utsname = utsname; 1106 (void) strcpy(dumphdr->dump_platform, platform); 1107 dumpbuf.size = dumpbuf_iosize(maxphys); 1108 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); 1109 dumpbuf.end = dumpbuf.start + dumpbuf.size; 1110 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); 1111 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); 1112 LOCK_INIT_HELD(&dumpcfg.helper_lock); 1113 dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP); 1114 (void) strncpy(dumphdr->dump_uuid, dump_get_uuid(), 1115 sizeof (dumphdr->dump_uuid)); 1116 } 1117 1118 npages = num_phys_pages(); 1119 1120 if (dumpcfg.bitmapsize != npages) { 1121 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP)); 1122 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP); 1123 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP); 1124 1125 if (dumpcfg.bitmap != NULL) 1126 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg. 1127 bitmapsize)); 1128 if (dumpcfg.rbitmap != NULL) 1129 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg. 1130 rbitmapsize)); 1131 dumpcfg.bitmap = map; 1132 dumpcfg.bitmapsize = npages; 1133 dumpcfg.rbitmap = rmap; 1134 dumpcfg.rbitmapsize = rlen; 1135 } 1136 } 1137 1138 /* 1139 * Establish a new dump device. 1140 */ 1141 int 1142 dumpinit(vnode_t *vp, char *name, int justchecking) 1143 { 1144 vnode_t *cvp; 1145 vattr_t vattr; 1146 vnode_t *cdev_vp; 1147 int error = 0; 1148 1149 ASSERT(MUTEX_HELD(&dump_lock)); 1150 1151 dumphdr_init(); 1152 1153 cvp = common_specvp(vp); 1154 if (cvp == dumpvp) 1155 return (0); 1156 1157 /* 1158 * Determine whether this is a plausible dump device. We want either: 1159 * (1) a real device that's not mounted and has a cb_dump routine, or 1160 * (2) a swapfile on some filesystem that has a vop_dump routine. 1161 */ 1162 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0) 1163 return (error); 1164 1165 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV; 1166 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) { 1167 if (vattr.va_type == VBLK || vattr.va_type == VCHR) { 1168 if (devopsp[getmajor(vattr.va_rdev)]-> 1169 devo_cb_ops->cb_dump == nodev) 1170 error = ENOTSUP; 1171 else if (vfs_devismounted(vattr.va_rdev)) 1172 error = EBUSY; 1173 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip), 1174 ZFS_DRIVER) == 0 && 1175 IS_SWAPVP(common_specvp(cvp))) 1176 error = EBUSY; 1177 } else { 1178 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) || 1179 !IS_SWAPVP(cvp)) 1180 error = ENOTSUP; 1181 } 1182 } 1183 1184 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) 1185 error = ENOSPC; 1186 1187 if (error || justchecking) { 1188 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0, 1189 kcred, NULL); 1190 return (error); 1191 } 1192 1193 VN_HOLD(cvp); 1194 1195 if (dumpvp != NULL) 1196 dumpfini(); /* unconfigure the old dump device */ 1197 1198 dumpvp = cvp; 1199 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 1200 dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP); 1201 (void) strcpy(dumppath, name); 1202 dumpbuf.iosize = 0; 1203 1204 /* 1205 * If the dump device is a block device, attempt to open up the 1206 * corresponding character device and determine its maximum transfer 1207 * size. We use this information to potentially resize dumpbuf to a 1208 * larger and more optimal size for performing i/o to the dump device. 1209 */ 1210 if (cvp->v_type == VBLK && 1211 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) { 1212 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1213 size_t blk_size; 1214 struct dk_cinfo dki; 1215 struct dk_minfo minf; 1216 1217 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO, 1218 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL) 1219 == 0 && minf.dki_lbsize != 0) 1220 blk_size = minf.dki_lbsize; 1221 else 1222 blk_size = DEV_BSIZE; 1223 1224 if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki, 1225 FKIOCTL, kcred, NULL, NULL) == 0) { 1226 dumpbuf.iosize = dki.dki_maxtransfer * blk_size; 1227 dumpbuf_resize(); 1228 } 1229 /* 1230 * If we are working with a zvol then dumpify it 1231 * if it's not being used as swap. 1232 */ 1233 if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) { 1234 if (IS_SWAPVP(common_specvp(cvp))) 1235 error = EBUSY; 1236 else if ((error = VOP_IOCTL(cdev_vp, 1237 DKIOCDUMPINIT, NULL, FKIOCTL, kcred, 1238 NULL, NULL)) != 0) 1239 dumpfini(); 1240 } 1241 1242 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1243 kcred, NULL); 1244 } 1245 1246 VN_RELE(cdev_vp); 1247 } 1248 1249 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20); 1250 1251 dump_update_clevel(); 1252 1253 return (error); 1254 } 1255 1256 void 1257 dumpfini(void) 1258 { 1259 vattr_t vattr; 1260 boolean_t is_zfs = B_FALSE; 1261 vnode_t *cdev_vp; 1262 ASSERT(MUTEX_HELD(&dump_lock)); 1263 1264 kmem_free(dumppath, strlen(dumppath) + 1); 1265 1266 /* 1267 * Determine if we are using zvols for our dump device 1268 */ 1269 vattr.va_mask = AT_RDEV; 1270 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) { 1271 is_zfs = (getmajor(vattr.va_rdev) == 1272 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE; 1273 } 1274 1275 /* 1276 * If we have a zvol dump device then we call into zfs so 1277 * that it may have a chance to cleanup. 1278 */ 1279 if (is_zfs && 1280 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) { 1281 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1282 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL, 1283 kcred, NULL, NULL); 1284 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1285 kcred, NULL); 1286 } 1287 VN_RELE(cdev_vp); 1288 } 1289 1290 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL); 1291 1292 VN_RELE(dumpvp); 1293 1294 dumpvp = NULL; 1295 dumpvp_size = 0; 1296 dumppath = NULL; 1297 } 1298 1299 static offset_t 1300 dumpvp_flush(void) 1301 { 1302 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE); 1303 hrtime_t iotime; 1304 int err; 1305 1306 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) { 1307 dump_ioerr = ENOSPC; 1308 dumpbuf.vp_off = dumpbuf.vp_limit; 1309 } else if (size != 0) { 1310 iotime = gethrtime(); 1311 dumpsync.iowait += iotime - dumpsync.iowaitts; 1312 if (panicstr) 1313 err = VOP_DUMP(dumpvp, dumpbuf.start, 1314 lbtodb(dumpbuf.vp_off), btod(size), NULL); 1315 else 1316 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ? 1317 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size, 1318 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit, 1319 kcred, 0); 1320 if (err && dump_ioerr == 0) 1321 dump_ioerr = err; 1322 dumpsync.iowaitts = gethrtime(); 1323 dumpsync.iotime += dumpsync.iowaitts - iotime; 1324 dumpsync.nwrite += size; 1325 dumpbuf.vp_off += size; 1326 } 1327 dumpbuf.cur = dumpbuf.start; 1328 dump_timeleft = dump_timeout; 1329 return (dumpbuf.vp_off); 1330 } 1331 1332 /* maximize write speed by keeping seek offset aligned with size */ 1333 void 1334 dumpvp_write(const void *va, size_t size) 1335 { 1336 size_t len, off, sz; 1337 1338 while (size != 0) { 1339 len = MIN(size, dumpbuf.end - dumpbuf.cur); 1340 if (len == 0) { 1341 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size); 1342 if (off == 0 || !ISP2(dumpbuf.size)) { 1343 (void) dumpvp_flush(); 1344 } else { 1345 sz = dumpbuf.size - off; 1346 dumpbuf.cur = dumpbuf.start + sz; 1347 (void) dumpvp_flush(); 1348 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off); 1349 dumpbuf.cur += off; 1350 } 1351 } else { 1352 bcopy(va, dumpbuf.cur, len); 1353 va = (char *)va + len; 1354 dumpbuf.cur += len; 1355 size -= len; 1356 } 1357 } 1358 } 1359 1360 /*ARGSUSED*/ 1361 static void 1362 dumpvp_ksyms_write(const void *src, void *dst, size_t size) 1363 { 1364 dumpvp_write(src, size); 1365 } 1366 1367 /* 1368 * Mark 'pfn' in the bitmap and dump its translation table entry. 1369 */ 1370 void 1371 dump_addpage(struct as *as, void *va, pfn_t pfn) 1372 { 1373 mem_vtop_t mem_vtop; 1374 pgcnt_t bitnum; 1375 1376 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1377 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1378 dumphdr->dump_npages++; 1379 BT_SET(dumpcfg.bitmap, bitnum); 1380 } 1381 dumphdr->dump_nvtop++; 1382 mem_vtop.m_as = as; 1383 mem_vtop.m_va = va; 1384 mem_vtop.m_pfn = pfn; 1385 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 1386 } 1387 dump_timeleft = dump_timeout; 1388 } 1389 1390 /* 1391 * Mark 'pfn' in the bitmap 1392 */ 1393 void 1394 dump_page(pfn_t pfn) 1395 { 1396 pgcnt_t bitnum; 1397 1398 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1399 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1400 dumphdr->dump_npages++; 1401 BT_SET(dumpcfg.bitmap, bitnum); 1402 } 1403 } 1404 dump_timeleft = dump_timeout; 1405 } 1406 1407 /* 1408 * Dump the <as, va, pfn> information for a given address space. 1409 * SEGOP_DUMP() will call dump_addpage() for each page in the segment. 1410 */ 1411 static void 1412 dump_as(struct as *as) 1413 { 1414 struct seg *seg; 1415 1416 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1417 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 1418 if (seg->s_as != as) 1419 break; 1420 if (seg->s_ops == NULL) 1421 continue; 1422 SEGOP_DUMP(seg); 1423 } 1424 AS_LOCK_EXIT(as, &as->a_lock); 1425 1426 if (seg != NULL) 1427 cmn_err(CE_WARN, "invalid segment %p in address space %p", 1428 (void *)seg, (void *)as); 1429 } 1430 1431 static int 1432 dump_process(pid_t pid) 1433 { 1434 proc_t *p = sprlock(pid); 1435 1436 if (p == NULL) 1437 return (-1); 1438 if (p->p_as != &kas) { 1439 mutex_exit(&p->p_lock); 1440 dump_as(p->p_as); 1441 mutex_enter(&p->p_lock); 1442 } 1443 1444 sprunlock(p); 1445 1446 return (0); 1447 } 1448 1449 /* 1450 * The following functions (dump_summary(), dump_ereports(), and 1451 * dump_messages()), write data to an uncompressed area within the 1452 * crashdump. The layout of these is 1453 * 1454 * +------------------------------------------------------------+ 1455 * | compressed pages | summary | ereports | messages | 1456 * +------------------------------------------------------------+ 1457 * 1458 * With the advent of saving a compressed crash dump by default, we 1459 * need to save a little more data to describe the failure mode in 1460 * an uncompressed buffer available before savecore uncompresses 1461 * the dump. Initially this is a copy of the stack trace. Additional 1462 * summary information should be added here. 1463 */ 1464 1465 void 1466 dump_summary(void) 1467 { 1468 u_offset_t dumpvp_start; 1469 summary_dump_t sd; 1470 1471 if (dumpvp == NULL || dumphdr == NULL) 1472 return; 1473 1474 dumpbuf.cur = dumpbuf.start; 1475 1476 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE + 1477 DUMP_ERPTSIZE); 1478 dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE; 1479 dumpbuf.vp_off = dumpvp_start; 1480 1481 sd.sd_magic = SUMMARY_MAGIC; 1482 sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE); 1483 dumpvp_write(&sd, sizeof (sd)); 1484 dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE); 1485 1486 sd.sd_magic = 0; /* indicate end of summary */ 1487 dumpvp_write(&sd, sizeof (sd)); 1488 (void) dumpvp_flush(); 1489 } 1490 1491 void 1492 dump_ereports(void) 1493 { 1494 u_offset_t dumpvp_start; 1495 erpt_dump_t ed; 1496 1497 if (dumpvp == NULL || dumphdr == NULL) 1498 return; 1499 1500 dumpbuf.cur = dumpbuf.start; 1501 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE); 1502 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE; 1503 dumpbuf.vp_off = dumpvp_start; 1504 1505 fm_ereport_dump(); 1506 if (panicstr) 1507 errorq_dump(); 1508 1509 bzero(&ed, sizeof (ed)); /* indicate end of ereports */ 1510 dumpvp_write(&ed, sizeof (ed)); 1511 (void) dumpvp_flush(); 1512 1513 if (!panicstr) { 1514 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1515 (size_t)(dumpbuf.vp_off - dumpvp_start), 1516 B_INVAL | B_FORCE, kcred, NULL); 1517 } 1518 } 1519 1520 void 1521 dump_messages(void) 1522 { 1523 log_dump_t ld; 1524 mblk_t *mctl, *mdata; 1525 queue_t *q, *qlast; 1526 u_offset_t dumpvp_start; 1527 1528 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL) 1529 return; 1530 1531 dumpbuf.cur = dumpbuf.start; 1532 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET; 1533 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE; 1534 dumpbuf.vp_off = dumpvp_start; 1535 1536 qlast = NULL; 1537 do { 1538 for (q = log_consq; q->q_next != qlast; q = q->q_next) 1539 continue; 1540 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) { 1541 dump_timeleft = dump_timeout; 1542 mdata = mctl->b_cont; 1543 ld.ld_magic = LOG_MAGIC; 1544 ld.ld_msgsize = MBLKL(mctl->b_cont); 1545 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl)); 1546 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata)); 1547 dumpvp_write(&ld, sizeof (ld)); 1548 dumpvp_write(mctl->b_rptr, MBLKL(mctl)); 1549 dumpvp_write(mdata->b_rptr, MBLKL(mdata)); 1550 } 1551 } while ((qlast = q) != log_consq); 1552 1553 ld.ld_magic = 0; /* indicate end of messages */ 1554 dumpvp_write(&ld, sizeof (ld)); 1555 (void) dumpvp_flush(); 1556 if (!panicstr) { 1557 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1558 (size_t)(dumpbuf.vp_off - dumpvp_start), 1559 B_INVAL | B_FORCE, kcred, NULL); 1560 } 1561 } 1562 1563 /* 1564 * The following functions are called on multiple CPUs during dump. 1565 * They must not use most kernel services, because all cross-calls are 1566 * disabled during panic. Therefore, blocking locks and cache flushes 1567 * will not work. 1568 */ 1569 1570 /* 1571 * Copy pages, trapping ECC errors. Also, for robustness, trap data 1572 * access in case something goes wrong in the hat layer and the 1573 * mapping is broken. 1574 */ 1575 static int 1576 dump_pagecopy(void *src, void *dst) 1577 { 1578 long *wsrc = (long *)src; 1579 long *wdst = (long *)dst; 1580 const ulong_t ncopies = PAGESIZE / sizeof (long); 1581 volatile int w = 0; 1582 volatile int ueoff = -1; 1583 on_trap_data_t otd; 1584 1585 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) { 1586 if (ueoff == -1) 1587 ueoff = w * sizeof (long); 1588 /* report "bad ECC" or "bad address" */ 1589 #ifdef _LP64 1590 if (otd.ot_trap & OT_DATA_EC) 1591 wdst[w++] = 0x00badecc00badecc; 1592 else 1593 wdst[w++] = 0x00badadd00badadd; 1594 #else 1595 if (otd.ot_trap & OT_DATA_EC) 1596 wdst[w++] = 0x00badecc; 1597 else 1598 wdst[w++] = 0x00badadd; 1599 #endif 1600 } 1601 while (w < ncopies) { 1602 wdst[w] = wsrc[w]; 1603 w++; 1604 } 1605 no_trap(); 1606 return (ueoff); 1607 } 1608 1609 static void 1610 dumpsys_close_cq(cqueue_t *cq, int live) 1611 { 1612 if (live) { 1613 mutex_enter(&cq->mutex); 1614 atomic_dec_uint(&cq->open); 1615 cv_signal(&cq->cv); 1616 mutex_exit(&cq->mutex); 1617 } else { 1618 atomic_dec_uint(&cq->open); 1619 } 1620 } 1621 1622 static inline void 1623 dumpsys_spinlock(lock_t *lp) 1624 { 1625 uint_t backoff = 0; 1626 int loop_count = 0; 1627 1628 while (LOCK_HELD(lp) || !lock_spin_try(lp)) { 1629 if (++loop_count >= ncpus) { 1630 backoff = mutex_lock_backoff(0); 1631 loop_count = 0; 1632 } else { 1633 backoff = mutex_lock_backoff(backoff); 1634 } 1635 mutex_lock_delay(backoff); 1636 } 1637 } 1638 1639 static inline void 1640 dumpsys_spinunlock(lock_t *lp) 1641 { 1642 lock_clear(lp); 1643 } 1644 1645 static inline void 1646 dumpsys_lock(cqueue_t *cq, int live) 1647 { 1648 if (live) 1649 mutex_enter(&cq->mutex); 1650 else 1651 dumpsys_spinlock(&cq->spinlock); 1652 } 1653 1654 static inline void 1655 dumpsys_unlock(cqueue_t *cq, int live, int signal) 1656 { 1657 if (live) { 1658 if (signal) 1659 cv_signal(&cq->cv); 1660 mutex_exit(&cq->mutex); 1661 } else { 1662 dumpsys_spinunlock(&cq->spinlock); 1663 } 1664 } 1665 1666 static void 1667 dumpsys_wait_cq(cqueue_t *cq, int live) 1668 { 1669 if (live) { 1670 cv_wait(&cq->cv, &cq->mutex); 1671 } else { 1672 dumpsys_spinunlock(&cq->spinlock); 1673 while (cq->open) 1674 if (cq->first) 1675 break; 1676 dumpsys_spinlock(&cq->spinlock); 1677 } 1678 } 1679 1680 static void 1681 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live) 1682 { 1683 if (cp == NULL) 1684 return; 1685 1686 dumpsys_lock(cq, live); 1687 1688 if (cq->ts != 0) { 1689 cq->empty += gethrtime() - cq->ts; 1690 cq->ts = 0; 1691 } 1692 1693 cp->state = newstate; 1694 cp->next = NULL; 1695 if (cq->last == NULL) 1696 cq->first = cp; 1697 else 1698 cq->last->next = cp; 1699 cq->last = cp; 1700 1701 dumpsys_unlock(cq, live, 1); 1702 } 1703 1704 static cbuf_t * 1705 dumpsys_get_cq(cqueue_t *cq, int live) 1706 { 1707 cbuf_t *cp; 1708 hrtime_t now = gethrtime(); 1709 1710 dumpsys_lock(cq, live); 1711 1712 /* CONSTCOND */ 1713 while (1) { 1714 cp = (cbuf_t *)cq->first; 1715 if (cp == NULL) { 1716 if (cq->open == 0) 1717 break; 1718 dumpsys_wait_cq(cq, live); 1719 continue; 1720 } 1721 cq->first = cp->next; 1722 if (cq->first == NULL) { 1723 cq->last = NULL; 1724 cq->ts = now; 1725 } 1726 break; 1727 } 1728 1729 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0); 1730 return (cp); 1731 } 1732 1733 /* 1734 * Send an error message to the console. If the main task is running 1735 * just write the message via uprintf. If a helper is running the 1736 * message has to be put on a queue for the main task. Setting fmt to 1737 * NULL means flush the error message buffer. If fmt is not NULL, just 1738 * add the text to the existing buffer. 1739 */ 1740 static void 1741 dumpsys_errmsg(helper_t *hp, const char *fmt, ...) 1742 { 1743 dumpsync_t *ds = hp->ds; 1744 cbuf_t *cp = hp->cperr; 1745 va_list adx; 1746 1747 if (hp->helper == MAINHELPER) { 1748 if (fmt != NULL) { 1749 if (ds->neednl) { 1750 uprintf("\n"); 1751 ds->neednl = 0; 1752 } 1753 va_start(adx, fmt); 1754 vuprintf(fmt, adx); 1755 va_end(adx); 1756 } 1757 } else if (fmt == NULL) { 1758 if (cp != NULL) { 1759 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1760 hp->cperr = NULL; 1761 } 1762 } else { 1763 if (hp->cperr == NULL) { 1764 cp = CQ_GET(freebufq); 1765 hp->cperr = cp; 1766 cp->used = 0; 1767 } 1768 va_start(adx, fmt); 1769 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used, 1770 fmt, adx); 1771 va_end(adx); 1772 if ((cp->used + LOG_MSGSIZE) > cp->size) { 1773 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1774 hp->cperr = NULL; 1775 } 1776 } 1777 } 1778 1779 /* 1780 * Write an output buffer to the dump file. If the main task is 1781 * running just write the data. If a helper is running the output is 1782 * placed on a queue for the main task. 1783 */ 1784 static void 1785 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used) 1786 { 1787 dumpsync_t *ds = hp->ds; 1788 1789 if (hp->helper == MAINHELPER) { 1790 HRSTART(ds->perpage, write); 1791 dumpvp_write(cp->buf, used); 1792 HRSTOP(ds->perpage, write); 1793 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 1794 } else { 1795 cp->used = used; 1796 CQ_PUT(mainq, cp, CBUF_WRITE); 1797 } 1798 } 1799 1800 /* 1801 * Copy one page within the mapped range. The offset starts at 0 and 1802 * is relative to the first pfn. cp->buf + cp->off is the address of 1803 * the first pfn. If dump_pagecopy returns a UE offset, create an 1804 * error message. Returns the offset to the next pfn in the range 1805 * selected by the bitmap. 1806 */ 1807 static int 1808 dumpsys_copy_page(helper_t *hp, int offset) 1809 { 1810 cbuf_t *cp = hp->cpin; 1811 int ueoff; 1812 1813 ASSERT(cp->off + offset + PAGESIZE <= cp->size); 1814 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum)); 1815 1816 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page); 1817 1818 /* ueoff is the offset in the page to a UE error */ 1819 if (ueoff != -1) { 1820 uint64_t pa = ptob(cp->pfn) + offset + ueoff; 1821 1822 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n", 1823 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa); 1824 } 1825 1826 /* 1827 * Advance bitnum and offset to the next input page for the 1828 * next call to this function. 1829 */ 1830 offset += PAGESIZE; 1831 cp->bitnum++; 1832 while (cp->off + offset < cp->size) { 1833 if (BT_TEST(dumpcfg.bitmap, cp->bitnum)) 1834 break; 1835 offset += PAGESIZE; 1836 cp->bitnum++; 1837 } 1838 1839 return (offset); 1840 } 1841 1842 /* 1843 * Read the helper queue, and copy one mapped page. Return 0 when 1844 * done. Return 1 when a page has been copied into hp->page. 1845 */ 1846 static int 1847 dumpsys_sread(helper_t *hp) 1848 { 1849 dumpsync_t *ds = hp->ds; 1850 1851 /* CONSTCOND */ 1852 while (1) { 1853 1854 /* Find the next input buffer. */ 1855 if (hp->cpin == NULL) { 1856 HRSTART(hp->perpage, inwait); 1857 1858 /* CONSTCOND */ 1859 while (1) { 1860 hp->cpin = CQ_GET(helperq); 1861 dump_timeleft = dump_timeout; 1862 1863 /* 1864 * NULL return means the helper queue 1865 * is closed and empty. 1866 */ 1867 if (hp->cpin == NULL) 1868 break; 1869 1870 /* Have input, check for dump I/O error. */ 1871 if (!dump_ioerr) 1872 break; 1873 1874 /* 1875 * If an I/O error occurs, stay in the 1876 * loop in order to empty the helper 1877 * queue. Return the buffers to the 1878 * main task to unmap and free it. 1879 */ 1880 hp->cpin->used = 0; 1881 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1882 } 1883 HRSTOP(hp->perpage, inwait); 1884 1885 /* Stop here when the helper queue is closed. */ 1886 if (hp->cpin == NULL) 1887 break; 1888 1889 /* Set the offset=0 to get the first pfn. */ 1890 hp->in = 0; 1891 1892 /* Set the total processed to 0 */ 1893 hp->used = 0; 1894 } 1895 1896 /* Process the next page. */ 1897 if (hp->used < hp->cpin->used) { 1898 1899 /* 1900 * Get the next page from the input buffer and 1901 * return a copy. 1902 */ 1903 ASSERT(hp->in != -1); 1904 HRSTART(hp->perpage, copy); 1905 hp->in = dumpsys_copy_page(hp, hp->in); 1906 hp->used += PAGESIZE; 1907 HRSTOP(hp->perpage, copy); 1908 break; 1909 1910 } else { 1911 1912 /* 1913 * Done with the input. Flush the VM and 1914 * return the buffer to the main task. 1915 */ 1916 if (panicstr && hp->helper != MAINHELPER) 1917 hat_flush_range(kas.a_hat, 1918 hp->cpin->buf, hp->cpin->size); 1919 dumpsys_errmsg(hp, NULL); 1920 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1921 hp->cpin = NULL; 1922 } 1923 } 1924 1925 return (hp->cpin != NULL); 1926 } 1927 1928 /* 1929 * Compress size bytes starting at buf with bzip2 1930 * mode: 1931 * BZ_RUN add one more compressed page 1932 * BZ_FINISH no more input, flush the state 1933 */ 1934 static void 1935 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode) 1936 { 1937 dumpsync_t *ds = hp->ds; 1938 const int CSIZE = sizeof (dumpcsize_t); 1939 bz_stream *ps = &hp->bzstream; 1940 int rc = 0; 1941 uint32_t csize; 1942 dumpcsize_t cs; 1943 1944 /* Set input pointers to new input page */ 1945 if (size > 0) { 1946 ps->avail_in = size; 1947 ps->next_in = buf; 1948 } 1949 1950 /* CONSTCOND */ 1951 while (1) { 1952 1953 /* Quit when all input has been consumed */ 1954 if (ps->avail_in == 0 && mode == BZ_RUN) 1955 break; 1956 1957 /* Get a new output buffer */ 1958 if (hp->cpout == NULL) { 1959 HRSTART(hp->perpage, outwait); 1960 hp->cpout = CQ_GET(freebufq); 1961 HRSTOP(hp->perpage, outwait); 1962 ps->avail_out = hp->cpout->size - CSIZE; 1963 ps->next_out = hp->cpout->buf + CSIZE; 1964 } 1965 1966 /* Compress input, or finalize */ 1967 HRSTART(hp->perpage, compress); 1968 rc = BZ2_bzCompress(ps, mode); 1969 HRSTOP(hp->perpage, compress); 1970 1971 /* Check for error */ 1972 if (mode == BZ_RUN && rc != BZ_RUN_OK) { 1973 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n", 1974 hp->helper, BZ2_bzErrorString(rc), 1975 hp->cpin->pagenum); 1976 break; 1977 } 1978 1979 /* Write the buffer if it is full, or we are flushing */ 1980 if (ps->avail_out == 0 || mode == BZ_FINISH) { 1981 csize = hp->cpout->size - CSIZE - ps->avail_out; 1982 cs = DUMP_SET_TAG(csize, hp->tag); 1983 if (csize > 0) { 1984 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 1985 dumpsys_swrite(hp, hp->cpout, csize + CSIZE); 1986 hp->cpout = NULL; 1987 } 1988 } 1989 1990 /* Check for final complete */ 1991 if (mode == BZ_FINISH) { 1992 if (rc == BZ_STREAM_END) 1993 break; 1994 if (rc != BZ_FINISH_OK) { 1995 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n", 1996 hp->helper, BZ2_bzErrorString(rc)); 1997 break; 1998 } 1999 } 2000 } 2001 2002 /* Cleanup state and buffers */ 2003 if (mode == BZ_FINISH) { 2004 2005 /* Reset state so that it is re-usable. */ 2006 (void) BZ2_bzCompressReset(&hp->bzstream); 2007 2008 /* Give any unused outout buffer to the main task */ 2009 if (hp->cpout != NULL) { 2010 hp->cpout->used = 0; 2011 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG); 2012 hp->cpout = NULL; 2013 } 2014 } 2015 } 2016 2017 static void 2018 dumpsys_bz2compress(helper_t *hp) 2019 { 2020 dumpsync_t *ds = hp->ds; 2021 dumpstreamhdr_t sh; 2022 2023 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2024 sh.stream_pagenum = (pgcnt_t)-1; 2025 sh.stream_npages = 0; 2026 hp->cpin = NULL; 2027 hp->cpout = NULL; 2028 hp->cperr = NULL; 2029 hp->in = 0; 2030 hp->out = 0; 2031 hp->bzstream.avail_in = 0; 2032 2033 /* Bump reference to mainq while we are running */ 2034 CQ_OPEN(mainq); 2035 2036 /* Get one page at a time */ 2037 while (dumpsys_sread(hp)) { 2038 if (sh.stream_pagenum != hp->cpin->pagenum) { 2039 sh.stream_pagenum = hp->cpin->pagenum; 2040 sh.stream_npages = btop(hp->cpin->used); 2041 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN); 2042 } 2043 dumpsys_bzrun(hp, hp->page, PAGESIZE, 0); 2044 } 2045 2046 /* Done with input, flush any partial buffer */ 2047 if (sh.stream_pagenum != (pgcnt_t)-1) { 2048 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH); 2049 dumpsys_errmsg(hp, NULL); 2050 } 2051 2052 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2053 2054 /* Decrement main queue count, we are done */ 2055 CQ_CLOSE(mainq); 2056 } 2057 2058 /* 2059 * Compress with lzjb 2060 * write stream block if full or size==0 2061 * if csize==0 write stream header, else write <csize, data> 2062 * size==0 is a call to flush a buffer 2063 * hp->cpout is the buffer we are flushing or filling 2064 * hp->out is the next index to fill data 2065 * osize is either csize+data, or the size of a stream header 2066 */ 2067 static void 2068 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size) 2069 { 2070 dumpsync_t *ds = hp->ds; 2071 const int CSIZE = sizeof (dumpcsize_t); 2072 dumpcsize_t cs; 2073 size_t osize = csize > 0 ? CSIZE + size : size; 2074 2075 /* If flush, and there is no buffer, just return */ 2076 if (size == 0 && hp->cpout == NULL) 2077 return; 2078 2079 /* If flush, or cpout is full, write it out */ 2080 if (size == 0 || 2081 hp->cpout != NULL && hp->out + osize > hp->cpout->size) { 2082 2083 /* Set tag+size word at the front of the stream block. */ 2084 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag); 2085 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 2086 2087 /* Write block to dump file. */ 2088 dumpsys_swrite(hp, hp->cpout, hp->out); 2089 2090 /* Clear pointer to indicate we need a new buffer */ 2091 hp->cpout = NULL; 2092 2093 /* flushing, we are done */ 2094 if (size == 0) 2095 return; 2096 } 2097 2098 /* Get an output buffer if we dont have one. */ 2099 if (hp->cpout == NULL) { 2100 HRSTART(hp->perpage, outwait); 2101 hp->cpout = CQ_GET(freebufq); 2102 HRSTOP(hp->perpage, outwait); 2103 hp->out = CSIZE; 2104 } 2105 2106 /* Store csize word. This is the size of compressed data. */ 2107 if (csize > 0) { 2108 cs = DUMP_SET_TAG(csize, 0); 2109 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE); 2110 hp->out += CSIZE; 2111 } 2112 2113 /* Store the data. */ 2114 (void) memcpy(hp->cpout->buf + hp->out, buf, size); 2115 hp->out += size; 2116 } 2117 2118 static void 2119 dumpsys_lzjbcompress(helper_t *hp) 2120 { 2121 dumpsync_t *ds = hp->ds; 2122 size_t csize; 2123 dumpstreamhdr_t sh; 2124 2125 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2126 sh.stream_pagenum = (pfn_t)-1; 2127 sh.stream_npages = 0; 2128 hp->cpin = NULL; 2129 hp->cpout = NULL; 2130 hp->cperr = NULL; 2131 hp->in = 0; 2132 hp->out = 0; 2133 2134 /* Bump reference to mainq while we are running */ 2135 CQ_OPEN(mainq); 2136 2137 /* Get one page at a time */ 2138 while (dumpsys_sread(hp)) { 2139 2140 /* Create a stream header for each new input map */ 2141 if (sh.stream_pagenum != hp->cpin->pagenum) { 2142 sh.stream_pagenum = hp->cpin->pagenum; 2143 sh.stream_npages = btop(hp->cpin->used); 2144 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh)); 2145 } 2146 2147 /* Compress one page */ 2148 HRSTART(hp->perpage, compress); 2149 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2150 HRSTOP(hp->perpage, compress); 2151 2152 /* Add csize+data to output block */ 2153 ASSERT(csize > 0 && csize <= PAGESIZE); 2154 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize); 2155 } 2156 2157 /* Done with input, flush any partial buffer */ 2158 if (sh.stream_pagenum != (pfn_t)-1) { 2159 dumpsys_lzjbrun(hp, 0, NULL, 0); 2160 dumpsys_errmsg(hp, NULL); 2161 } 2162 2163 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2164 2165 /* Decrement main queue count, we are done */ 2166 CQ_CLOSE(mainq); 2167 } 2168 2169 /* 2170 * Dump helper called from panic_idle() to compress pages. CPUs in 2171 * this path must not call most kernel services. 2172 * 2173 * During panic, all but one of the CPUs is idle. These CPUs are used 2174 * as helpers working in parallel to copy and compress memory 2175 * pages. During a panic, however, these processors cannot call any 2176 * kernel services. This is because mutexes become no-ops during 2177 * panic, and, cross-call interrupts are inhibited. Therefore, during 2178 * panic dump the helper CPUs communicate with the panic CPU using 2179 * memory variables. All memory mapping and I/O is performed by the 2180 * panic CPU. 2181 * 2182 * At dump configuration time, helper_lock is set and helpers_wanted 2183 * is 0. dumpsys() decides whether to set helpers_wanted before 2184 * clearing helper_lock. 2185 * 2186 * At panic time, idle CPUs spin-wait on helper_lock, then alternately 2187 * take the lock and become a helper, or return. 2188 */ 2189 void 2190 dumpsys_helper() 2191 { 2192 if (!dumpcfg.helper_present) 2193 dumpcfg.helper_present = 1; 2194 dumpsys_spinlock(&dumpcfg.helper_lock); 2195 if (dumpcfg.helpers_wanted) { 2196 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2197 2198 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2199 if (hp->helper == FREEHELPER) { 2200 hp->helper = CPU->cpu_id; 2201 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2202 2203 dumpsys_spinunlock(&dumpcfg.helper_lock); 2204 2205 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2206 dumpsys_lzjbcompress(hp); 2207 else 2208 dumpsys_bz2compress(hp); 2209 2210 hp->helper = DONEHELPER; 2211 return; 2212 } 2213 } 2214 2215 /* No more helpers are needed. */ 2216 dumpcfg.helpers_wanted = 0; 2217 2218 } 2219 dumpsys_spinunlock(&dumpcfg.helper_lock); 2220 } 2221 2222 /* 2223 * No-wait helper callable in spin loops. 2224 * 2225 * Do not wait for helper_lock. Just check helpers_wanted. The caller 2226 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s" 2227 * case. 2228 */ 2229 void 2230 dumpsys_helper_nw() 2231 { 2232 if (!dumpcfg.helper_present) 2233 dumpcfg.helper_present = 1; 2234 if (dumpcfg.helpers_wanted) 2235 dumpsys_helper(); 2236 } 2237 2238 /* 2239 * Dump helper for live dumps. 2240 * These run as a system task. 2241 */ 2242 static void 2243 dumpsys_live_helper(void *arg) 2244 { 2245 helper_t *hp = arg; 2246 2247 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2248 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2249 dumpsys_lzjbcompress(hp); 2250 else 2251 dumpsys_bz2compress(hp); 2252 } 2253 2254 /* 2255 * Compress one page with lzjb (single threaded case) 2256 */ 2257 static void 2258 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) 2259 { 2260 dumpsync_t *ds = hp->ds; 2261 uint32_t csize; 2262 2263 hp->helper = MAINHELPER; 2264 hp->in = 0; 2265 hp->used = 0; 2266 hp->cpin = cp; 2267 while (hp->used < cp->used) { 2268 HRSTART(hp->perpage, copy); 2269 hp->in = dumpsys_copy_page(hp, hp->in); 2270 hp->used += PAGESIZE; 2271 HRSTOP(hp->perpage, copy); 2272 2273 HRSTART(hp->perpage, compress); 2274 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2275 HRSTOP(hp->perpage, compress); 2276 2277 HRSTART(hp->perpage, write); 2278 dumpvp_write(&csize, sizeof (csize)); 2279 dumpvp_write(hp->lzbuf, csize); 2280 HRSTOP(hp->perpage, write); 2281 } 2282 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 2283 hp->cpin = NULL; 2284 } 2285 2286 /* 2287 * Main task to dump pages. This is called on the dump CPU. 2288 */ 2289 static void 2290 dumpsys_main_task(void *arg) 2291 { 2292 dumpsync_t *ds = arg; 2293 pgcnt_t pagenum = 0, bitnum = 0, hibitnum; 2294 dumpmlw_t mlw; 2295 cbuf_t *cp; 2296 pgcnt_t baseoff, pfnoff; 2297 pfn_t base, pfn; 2298 int sec; 2299 2300 dump_init_memlist_walker(&mlw); 2301 2302 /* CONSTCOND */ 2303 while (1) { 2304 2305 if (ds->percent > ds->percent_done) { 2306 ds->percent_done = ds->percent; 2307 sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000; 2308 uprintf("^\r%2d:%02d %3d%% done", 2309 sec / 60, sec % 60, ds->percent); 2310 ds->neednl = 1; 2311 } 2312 2313 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { 2314 2315 /* the writerq never blocks */ 2316 cp = CQ_GET(writerq); 2317 if (cp == NULL) 2318 break; 2319 2320 dump_timeleft = dump_timeout; 2321 2322 HRSTART(ds->perpage, write); 2323 dumpvp_write(cp->buf, cp->used); 2324 HRSTOP(ds->perpage, write); 2325 2326 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2327 } 2328 2329 /* 2330 * Wait here for some buffers to process. Returns NULL 2331 * when all helpers have terminated and all buffers 2332 * have been processed. 2333 */ 2334 cp = CQ_GET(mainq); 2335 2336 if (cp == NULL) { 2337 2338 /* Drain the write queue. */ 2339 if (!CQ_IS_EMPTY(writerq)) 2340 continue; 2341 2342 /* Main task exits here. */ 2343 break; 2344 } 2345 2346 dump_timeleft = dump_timeout; 2347 2348 switch (cp->state) { 2349 2350 case CBUF_FREEMAP: 2351 2352 /* 2353 * Note that we drop CBUF_FREEMAP buffers on 2354 * the floor (they will not be on any cqueue) 2355 * when we no longer need them. 2356 */ 2357 if (bitnum >= dumpcfg.bitmapsize) 2358 break; 2359 2360 if (dump_ioerr) { 2361 bitnum = dumpcfg.bitmapsize; 2362 CQ_CLOSE(helperq); 2363 break; 2364 } 2365 2366 HRSTART(ds->perpage, bitmap); 2367 for (; bitnum < dumpcfg.bitmapsize; bitnum++) 2368 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2369 break; 2370 HRSTOP(ds->perpage, bitmap); 2371 dump_timeleft = dump_timeout; 2372 2373 if (bitnum >= dumpcfg.bitmapsize) { 2374 CQ_CLOSE(helperq); 2375 break; 2376 } 2377 2378 /* 2379 * Try to map CBUF_MAPSIZE ranges. Can't 2380 * assume that memory segment size is a 2381 * multiple of CBUF_MAPSIZE. Can't assume that 2382 * the segment starts on a CBUF_MAPSIZE 2383 * boundary. 2384 */ 2385 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2386 ASSERT(pfn != PFN_INVALID); 2387 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); 2388 2389 base = P2ALIGN(pfn, CBUF_MAPNP); 2390 if (base < mlw.mpaddr) { 2391 base = mlw.mpaddr; 2392 baseoff = P2PHASE(base, CBUF_MAPNP); 2393 } else { 2394 baseoff = 0; 2395 } 2396 2397 pfnoff = pfn - base; 2398 if (pfnoff + mlw.mpleft < CBUF_MAPNP) { 2399 hibitnum = bitnum + mlw.mpleft; 2400 cp->size = ptob(pfnoff + mlw.mpleft); 2401 } else { 2402 hibitnum = bitnum - pfnoff + CBUF_MAPNP - 2403 baseoff; 2404 cp->size = CBUF_MAPSIZE - ptob(baseoff); 2405 } 2406 2407 cp->pfn = pfn; 2408 cp->bitnum = bitnum++; 2409 cp->pagenum = pagenum++; 2410 cp->off = ptob(pfnoff); 2411 2412 for (; bitnum < hibitnum; bitnum++) 2413 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2414 pagenum++; 2415 2416 dump_timeleft = dump_timeout; 2417 cp->used = ptob(pagenum - cp->pagenum); 2418 2419 HRSTART(ds->perpage, map); 2420 hat_devload(kas.a_hat, cp->buf, cp->size, base, 2421 PROT_READ, HAT_LOAD_NOCONSIST); 2422 HRSTOP(ds->perpage, map); 2423 2424 ds->pages_mapped += btop(cp->size); 2425 ds->pages_used += pagenum - cp->pagenum; 2426 2427 CQ_OPEN(mainq); 2428 2429 /* 2430 * If there are no helpers the main task does 2431 * non-streams lzjb compress. 2432 */ 2433 if (dumpcfg.clevel == 0) { 2434 dumpsys_lzjb_page(dumpcfg.helper, cp); 2435 break; 2436 } 2437 2438 /* pass mapped pages to a helper */ 2439 CQ_PUT(helperq, cp, CBUF_INREADY); 2440 2441 /* the last page was done */ 2442 if (bitnum >= dumpcfg.bitmapsize) 2443 CQ_CLOSE(helperq); 2444 2445 break; 2446 2447 case CBUF_USEDMAP: 2448 2449 ds->npages += btop(cp->used); 2450 2451 HRSTART(ds->perpage, unmap); 2452 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); 2453 HRSTOP(ds->perpage, unmap); 2454 2455 if (bitnum < dumpcfg.bitmapsize) 2456 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2457 CQ_CLOSE(mainq); 2458 2459 ASSERT(ds->npages <= dumphdr->dump_npages); 2460 ds->percent = ds->npages * 100LL / dumphdr->dump_npages; 2461 break; 2462 2463 case CBUF_WRITE: 2464 2465 CQ_PUT(writerq, cp, CBUF_WRITE); 2466 break; 2467 2468 case CBUF_ERRMSG: 2469 2470 if (cp->used > 0) { 2471 cp->buf[cp->size - 2] = '\n'; 2472 cp->buf[cp->size - 1] = '\0'; 2473 if (ds->neednl) { 2474 uprintf("\n%s", cp->buf); 2475 ds->neednl = 0; 2476 } else { 2477 uprintf("%s", cp->buf); 2478 } 2479 /* wait for console output */ 2480 drv_usecwait(200000); 2481 dump_timeleft = dump_timeout; 2482 } 2483 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2484 break; 2485 2486 default: 2487 uprintf("dump: unexpected buffer state %d, " 2488 "buffer will be lost\n", cp->state); 2489 break; 2490 2491 } /* end switch */ 2492 2493 } /* end while(1) */ 2494 } 2495 2496 #ifdef COLLECT_METRICS 2497 size_t 2498 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) 2499 { 2500 dumpcfg_t *cfg = &dumpcfg; 2501 int myid = CPU->cpu_seqid; 2502 int i, compress_ratio; 2503 int sec, iorate; 2504 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; 2505 char *e = buf + size; 2506 char *p = buf; 2507 2508 sec = ds->elapsed / (1000 * 1000 * 1000ULL); 2509 if (sec < 1) 2510 sec = 1; 2511 2512 if (ds->iotime < 1) 2513 ds->iotime = 1; 2514 iorate = (ds->nwrite * 100000ULL) / ds->iotime; 2515 2516 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1); 2517 2518 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0) 2519 2520 P("Master cpu_seqid,%d\n", CPU->cpu_seqid); 2521 P("Master cpu_id,%d\n", CPU->cpu_id); 2522 P("dump_flags,0x%x\n", dumphdr->dump_flags); 2523 P("dump_ioerr,%d\n", dump_ioerr); 2524 2525 P("Helpers:\n"); 2526 for (i = 0; i < ncpus; i++) { 2527 if ((i & 15) == 0) 2528 P(",,%03d,", i); 2529 if (i == myid) 2530 P(" M"); 2531 else if (BT_TEST(cfg->helpermap, i)) 2532 P("%4d", cpu_seq[i]->cpu_id); 2533 else 2534 P(" *"); 2535 if ((i & 15) == 15) 2536 P("\n"); 2537 } 2538 2539 P("ncbuf_used,%d\n", cfg->ncbuf_used); 2540 P("ncmap,%d\n", cfg->ncmap); 2541 2542 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); 2543 P("Found small pages,%ld\n", cfg->foundsm); 2544 2545 P("Compression level,%d\n", cfg->clevel); 2546 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel", 2547 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb"); 2548 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % 2549 100); 2550 P("nhelper_used,%d\n", cfg->nhelper_used); 2551 2552 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); 2553 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); 2554 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); 2555 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); 2556 P("dumpbuf.size,%ld\n", dumpbuf.size); 2557 2558 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); 2559 P("Dump pages,%llu\n", (u_longlong_t)ds->npages); 2560 P("Dump time,%d\n", sec); 2561 2562 if (ds->pages_mapped > 0) 2563 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) 2564 / ds->pages_mapped)); 2565 2566 P("\nPer-page metrics:\n"); 2567 if (ds->npages > 0) { 2568 for (hp = cfg->helper; hp != hpend; hp++) { 2569 #define PERPAGE(x) ds->perpage.x += hp->perpage.x; 2570 PERPAGES; 2571 #undef PERPAGE 2572 } 2573 #define PERPAGE(x) \ 2574 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); 2575 PERPAGES; 2576 #undef PERPAGE 2577 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / 2578 ds->npages)); 2579 P("helperq.empty,%d\n", (int)(ds->helperq.empty / 2580 ds->npages)); 2581 P("writerq.empty,%d\n", (int)(ds->writerq.empty / 2582 ds->npages)); 2583 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); 2584 2585 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / 2586 ds->npages)); 2587 } 2588 #undef P 2589 if (p < e) 2590 bzero(p, e - p); 2591 return (p - buf); 2592 } 2593 #endif /* COLLECT_METRICS */ 2594 2595 /* 2596 * Dump the system. 2597 */ 2598 void 2599 dumpsys(void) 2600 { 2601 dumpsync_t *ds = &dumpsync; 2602 taskq_t *livetaskq = NULL; 2603 pfn_t pfn; 2604 pgcnt_t bitnum; 2605 proc_t *p; 2606 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2607 cbuf_t *cp; 2608 pid_t npids, pidx; 2609 char *content; 2610 char *buf; 2611 size_t size; 2612 int save_dump_clevel; 2613 dumpmlw_t mlw; 2614 dumpcsize_t datatag; 2615 dumpdatahdr_t datahdr; 2616 2617 if (dumpvp == NULL || dumphdr == NULL) { 2618 uprintf("skipping system dump - no dump device configured\n"); 2619 if (panicstr) { 2620 dumpcfg.helpers_wanted = 0; 2621 dumpsys_spinunlock(&dumpcfg.helper_lock); 2622 } 2623 return; 2624 } 2625 dumpbuf.cur = dumpbuf.start; 2626 2627 /* clear the sync variables */ 2628 ASSERT(dumpcfg.nhelper > 0); 2629 bzero(ds, sizeof (*ds)); 2630 ds->dumpcpu = CPU->cpu_id; 2631 2632 /* 2633 * Calculate the starting block for dump. If we're dumping on a 2634 * swap device, start 1/5 of the way in; otherwise, start at the 2635 * beginning. And never use the first page -- it may be a disk label. 2636 */ 2637 if (dumpvp->v_flag & VISSWAP) 2638 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET); 2639 else 2640 dumphdr->dump_start = DUMP_OFFSET; 2641 2642 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; 2643 dumphdr->dump_crashtime = gethrestime_sec(); 2644 dumphdr->dump_npages = 0; 2645 dumphdr->dump_nvtop = 0; 2646 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); 2647 dump_timeleft = dump_timeout; 2648 2649 if (panicstr) { 2650 dumphdr->dump_flags &= ~DF_LIVE; 2651 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL); 2652 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); 2653 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, 2654 panicstr, panicargs); 2655 2656 } 2657 2658 if (dump_conflags & DUMP_ALL) 2659 content = "all"; 2660 else if (dump_conflags & DUMP_CURPROC) 2661 content = "kernel + curproc"; 2662 else 2663 content = "kernel"; 2664 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, 2665 dumphdr->dump_start, content); 2666 2667 /* Make sure nodename is current */ 2668 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); 2669 2670 /* 2671 * If this is a live dump, try to open a VCHR vnode for better 2672 * performance. We must take care to flush the buffer cache 2673 * first. 2674 */ 2675 if (!panicstr) { 2676 vnode_t *cdev_vp, *cmn_cdev_vp; 2677 2678 ASSERT(dumpbuf.cdev_vp == NULL); 2679 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); 2680 if (cdev_vp != NULL) { 2681 cmn_cdev_vp = common_specvp(cdev_vp); 2682 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) 2683 == 0) { 2684 if (vn_has_cached_data(dumpvp)) 2685 (void) pvn_vplist_dirty(dumpvp, 0, NULL, 2686 B_INVAL | B_TRUNC, kcred); 2687 dumpbuf.cdev_vp = cmn_cdev_vp; 2688 } else { 2689 VN_RELE(cdev_vp); 2690 } 2691 } 2692 } 2693 2694 /* 2695 * Store a hires timestamp so we can look it up during debugging. 2696 */ 2697 lbolt_debug_entry(); 2698 2699 /* 2700 * Leave room for the message and ereport save areas and terminal dump 2701 * header. 2702 */ 2703 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - 2704 DUMP_ERPTSIZE; 2705 2706 /* 2707 * Write out the symbol table. It's no longer compressed, 2708 * so its 'size' and 'csize' are equal. 2709 */ 2710 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; 2711 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = 2712 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); 2713 2714 /* 2715 * Write out the translation map. 2716 */ 2717 dumphdr->dump_map = dumpvp_flush(); 2718 dump_as(&kas); 2719 dumphdr->dump_nvtop += dump_plat_addr(); 2720 2721 /* 2722 * call into hat, which may have unmapped pages that also need to 2723 * be in the dump 2724 */ 2725 hat_dump(); 2726 2727 if (dump_conflags & DUMP_ALL) { 2728 mutex_enter(&pidlock); 2729 2730 for (npids = 0, p = practive; p != NULL; p = p->p_next) 2731 dumpcfg.pids[npids++] = p->p_pid; 2732 2733 mutex_exit(&pidlock); 2734 2735 for (pidx = 0; pidx < npids; pidx++) 2736 (void) dump_process(dumpcfg.pids[pidx]); 2737 2738 dump_init_memlist_walker(&mlw); 2739 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2740 dump_timeleft = dump_timeout; 2741 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2742 /* 2743 * Some hypervisors do not have all pages available to 2744 * be accessed by the guest OS. Check for page 2745 * accessibility. 2746 */ 2747 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) != 2748 PLAT_HOLD_OK) 2749 continue; 2750 BT_SET(dumpcfg.bitmap, bitnum); 2751 } 2752 dumphdr->dump_npages = dumpcfg.bitmapsize; 2753 dumphdr->dump_flags |= DF_ALL; 2754 2755 } else if (dump_conflags & DUMP_CURPROC) { 2756 /* 2757 * Determine which pid is to be dumped. If we're panicking, we 2758 * dump the process associated with panic_thread (if any). If 2759 * this is a live dump, we dump the process associated with 2760 * curthread. 2761 */ 2762 npids = 0; 2763 if (panicstr) { 2764 if (panic_thread != NULL && 2765 panic_thread->t_procp != NULL && 2766 panic_thread->t_procp != &p0) { 2767 dumpcfg.pids[npids++] = 2768 panic_thread->t_procp->p_pid; 2769 } 2770 } else { 2771 dumpcfg.pids[npids++] = curthread->t_procp->p_pid; 2772 } 2773 2774 if (npids && dump_process(dumpcfg.pids[0]) == 0) 2775 dumphdr->dump_flags |= DF_CURPROC; 2776 else 2777 dumphdr->dump_flags |= DF_KERNEL; 2778 2779 } else { 2780 dumphdr->dump_flags |= DF_KERNEL; 2781 } 2782 2783 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1; 2784 2785 /* 2786 * Write out the pfn table. 2787 */ 2788 dumphdr->dump_pfn = dumpvp_flush(); 2789 dump_init_memlist_walker(&mlw); 2790 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2791 dump_timeleft = dump_timeout; 2792 if (!BT_TEST(dumpcfg.bitmap, bitnum)) 2793 continue; 2794 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2795 ASSERT(pfn != PFN_INVALID); 2796 dumpvp_write(&pfn, sizeof (pfn_t)); 2797 } 2798 dump_plat_pfn(); 2799 2800 /* 2801 * Write out all the pages. 2802 * Map pages, copy them handling UEs, compress, and write them out. 2803 * Cooperate with any helpers running on CPUs in panic_idle(). 2804 */ 2805 dumphdr->dump_data = dumpvp_flush(); 2806 2807 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); 2808 ds->live = dumpcfg.clevel > 0 && 2809 (dumphdr->dump_flags & DF_LIVE) != 0; 2810 2811 save_dump_clevel = dumpcfg.clevel; 2812 if (panicstr) 2813 dumpsys_get_maxmem(); 2814 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2815 dumpcfg.clevel = DUMP_CLEVEL_LZJB; 2816 2817 dumpcfg.nhelper_used = 0; 2818 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2819 if (hp->page == NULL) { 2820 hp->helper = DONEHELPER; 2821 continue; 2822 } 2823 ++dumpcfg.nhelper_used; 2824 hp->helper = FREEHELPER; 2825 hp->taskqid = NULL; 2826 hp->ds = ds; 2827 bzero(&hp->perpage, sizeof (hp->perpage)); 2828 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2829 (void) BZ2_bzCompressReset(&hp->bzstream); 2830 } 2831 2832 CQ_OPEN(freebufq); 2833 CQ_OPEN(helperq); 2834 2835 dumpcfg.ncbuf_used = 0; 2836 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { 2837 if (cp->buf != NULL) { 2838 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2839 ++dumpcfg.ncbuf_used; 2840 } 2841 } 2842 2843 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) 2844 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2845 2846 ds->start = gethrtime(); 2847 ds->iowaitts = ds->start; 2848 2849 /* start helpers */ 2850 if (ds->live) { 2851 int n = dumpcfg.nhelper_used; 2852 int pri = MINCLSYSPRI - 25; 2853 2854 livetaskq = taskq_create("LiveDump", n, pri, n, n, 2855 TASKQ_PREPOPULATE); 2856 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2857 if (hp->page == NULL) 2858 continue; 2859 hp->helper = hp - dumpcfg.helper; 2860 hp->taskqid = taskq_dispatch(livetaskq, 2861 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); 2862 } 2863 2864 } else { 2865 if (panicstr) 2866 kmem_dump_begin(); 2867 dumpcfg.helpers_wanted = dumpcfg.clevel > 0; 2868 dumpsys_spinunlock(&dumpcfg.helper_lock); 2869 } 2870 2871 /* run main task */ 2872 dumpsys_main_task(ds); 2873 2874 ds->elapsed = gethrtime() - ds->start; 2875 if (ds->elapsed < 1) 2876 ds->elapsed = 1; 2877 2878 if (livetaskq != NULL) 2879 taskq_destroy(livetaskq); 2880 2881 if (ds->neednl) { 2882 uprintf("\n"); 2883 ds->neednl = 0; 2884 } 2885 2886 /* record actual pages dumped */ 2887 dumphdr->dump_npages = ds->npages; 2888 2889 /* platform-specific data */ 2890 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); 2891 2892 /* note any errors by clearing DF_COMPLETE */ 2893 if (dump_ioerr || ds->npages < dumphdr->dump_npages) 2894 dumphdr->dump_flags &= ~DF_COMPLETE; 2895 2896 /* end of stream blocks */ 2897 datatag = 0; 2898 dumpvp_write(&datatag, sizeof (datatag)); 2899 2900 bzero(&datahdr, sizeof (datahdr)); 2901 2902 /* buffer for metrics */ 2903 buf = dumpcfg.cbuf[0].buf; 2904 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - 2905 sizeof (dumpdatahdr_t)); 2906 2907 /* finish the kmem intercepts, collect kmem verbose info */ 2908 if (panicstr) { 2909 datahdr.dump_metrics = kmem_dump_finish(buf, size); 2910 buf += datahdr.dump_metrics; 2911 size -= datahdr.dump_metrics; 2912 } 2913 2914 /* record in the header whether this is a fault-management panic */ 2915 if (panicstr) 2916 dumphdr->dump_fm_panic = is_fm_panic(); 2917 2918 /* compression info in data header */ 2919 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; 2920 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; 2921 datahdr.dump_maxcsize = CBUF_SIZE; 2922 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; 2923 datahdr.dump_nstreams = dumpcfg.nhelper_used; 2924 datahdr.dump_clevel = dumpcfg.clevel; 2925 #ifdef COLLECT_METRICS 2926 if (dump_metrics_on) 2927 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size); 2928 #endif 2929 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; 2930 2931 /* 2932 * Write out the initial and terminal dump headers. 2933 */ 2934 dumpbuf.vp_off = dumphdr->dump_start; 2935 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2936 (void) dumpvp_flush(); 2937 2938 dumpbuf.vp_limit = dumpvp_size; 2939 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; 2940 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2941 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); 2942 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); 2943 2944 (void) dumpvp_flush(); 2945 2946 uprintf("\r%3d%% done: %llu pages dumped, ", 2947 ds->percent_done, (u_longlong_t)ds->npages); 2948 2949 if (dump_ioerr == 0) { 2950 uprintf("dump succeeded\n"); 2951 } else { 2952 uprintf("dump failed: error %d\n", dump_ioerr); 2953 #ifdef DEBUG 2954 if (panicstr) 2955 debug_enter("dump failed"); 2956 #endif 2957 } 2958 2959 /* 2960 * Write out all undelivered messages. This has to be the *last* 2961 * thing we do because the dump process itself emits messages. 2962 */ 2963 if (panicstr) { 2964 dump_summary(); 2965 dump_ereports(); 2966 dump_messages(); 2967 } 2968 2969 delay(2 * hz); /* let people see the 'done' message */ 2970 dump_timeleft = 0; 2971 dump_ioerr = 0; 2972 2973 /* restore settings after live dump completes */ 2974 if (!panicstr) { 2975 dumpcfg.clevel = save_dump_clevel; 2976 2977 /* release any VCHR open of the dump device */ 2978 if (dumpbuf.cdev_vp != NULL) { 2979 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, 2980 kcred, NULL); 2981 VN_RELE(dumpbuf.cdev_vp); 2982 dumpbuf.cdev_vp = NULL; 2983 } 2984 } 2985 } 2986 2987 /* 2988 * This function is called whenever the memory size, as represented 2989 * by the phys_install list, changes. 2990 */ 2991 void 2992 dump_resize() 2993 { 2994 mutex_enter(&dump_lock); 2995 dumphdr_init(); 2996 dumpbuf_resize(); 2997 dump_update_clevel(); 2998 mutex_exit(&dump_lock); 2999 } 3000 3001 /* 3002 * This function allows for dynamic resizing of a dump area. It assumes that 3003 * the underlying device has update its appropriate size(9P). 3004 */ 3005 int 3006 dumpvp_resize() 3007 { 3008 int error; 3009 vattr_t vattr; 3010 3011 mutex_enter(&dump_lock); 3012 vattr.va_mask = AT_SIZE; 3013 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) { 3014 mutex_exit(&dump_lock); 3015 return (error); 3016 } 3017 3018 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) { 3019 mutex_exit(&dump_lock); 3020 return (ENOSPC); 3021 } 3022 3023 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 3024 mutex_exit(&dump_lock); 3025 return (0); 3026 } 3027 3028 int 3029 dump_set_uuid(const char *uuidstr) 3030 { 3031 const char *ptr; 3032 int i; 3033 3034 if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36) 3035 return (EINVAL); 3036 3037 /* uuid_parse is not common code so check manually */ 3038 for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) { 3039 switch (i) { 3040 case 8: 3041 case 13: 3042 case 18: 3043 case 23: 3044 if (*ptr != '-') 3045 return (EINVAL); 3046 break; 3047 3048 default: 3049 if (!isxdigit(*ptr)) 3050 return (EINVAL); 3051 break; 3052 } 3053 } 3054 3055 if (dump_osimage_uuid[0] != '\0') 3056 return (EALREADY); 3057 3058 (void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1); 3059 3060 cmn_err(CE_CONT, "?This Solaris instance has UUID %s", 3061 dump_osimage_uuid); 3062 3063 return (0); 3064 } 3065 3066 const char * 3067 dump_get_uuid(void) 3068 { 3069 return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : ""); 3070 } 3071