1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2018 Joyent, Inc. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/vm.h> 31 #include <sys/proc.h> 32 #include <sys/file.h> 33 #include <sys/conf.h> 34 #include <sys/kmem.h> 35 #include <sys/mem.h> 36 #include <sys/mman.h> 37 #include <sys/vnode.h> 38 #include <sys/errno.h> 39 #include <sys/memlist.h> 40 #include <sys/dumphdr.h> 41 #include <sys/dumpadm.h> 42 #include <sys/ksyms.h> 43 #include <sys/compress.h> 44 #include <sys/stream.h> 45 #include <sys/strsun.h> 46 #include <sys/cmn_err.h> 47 #include <sys/bitmap.h> 48 #include <sys/modctl.h> 49 #include <sys/utsname.h> 50 #include <sys/systeminfo.h> 51 #include <sys/vmem.h> 52 #include <sys/log.h> 53 #include <sys/var.h> 54 #include <sys/debug.h> 55 #include <sys/sunddi.h> 56 #include <fs/fs_subr.h> 57 #include <sys/fs/snode.h> 58 #include <sys/ontrap.h> 59 #include <sys/panic.h> 60 #include <sys/dkio.h> 61 #include <sys/vtoc.h> 62 #include <sys/errorq.h> 63 #include <sys/fm/util.h> 64 #include <sys/fs/zfs.h> 65 66 #include <vm/hat.h> 67 #include <vm/as.h> 68 #include <vm/page.h> 69 #include <vm/pvn.h> 70 #include <vm/seg.h> 71 #include <vm/seg_kmem.h> 72 #include <sys/clock_impl.h> 73 #include <sys/hold_page.h> 74 75 #include <bzip2/bzlib.h> 76 77 #define ONE_GIG (1024 * 1024 * 1024UL) 78 79 /* 80 * Crash dump time is dominated by disk write time. To reduce this, 81 * the stronger compression method bzip2 is applied to reduce the dump 82 * size and hence reduce I/O time. However, bzip2 is much more 83 * computationally expensive than the existing lzjb algorithm, so to 84 * avoid increasing compression time, CPUs that are otherwise idle 85 * during panic are employed to parallelize the compression task. 86 * Many helper CPUs are needed to prevent bzip2 from being a 87 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is 88 * parallelized instead. Lastly, I/O and compression are performed by 89 * different CPUs, and are hence overlapped in time, unlike the older 90 * serial code. 91 * 92 * Another important consideration is the speed of the dump 93 * device. Faster disks need less CPUs in order to benefit from 94 * parallel lzjb versus parallel bzip2. Therefore, the CPU count 95 * threshold for switching from parallel lzjb to paralled bzip2 is 96 * elevated for faster disks. The dump device speed is adduced from 97 * the setting for dumpbuf.iosize, see dump_update_clevel. 98 */ 99 100 /* 101 * exported vars 102 */ 103 kmutex_t dump_lock; /* lock for dump configuration */ 104 dumphdr_t *dumphdr; /* dump header */ 105 int dump_conflags = DUMP_KERNEL; /* dump configuration flags */ 106 vnode_t *dumpvp; /* dump device vnode pointer */ 107 u_offset_t dumpvp_size; /* size of dump device, in bytes */ 108 char *dumppath; /* pathname of dump device */ 109 int dump_timeout = 120; /* timeout for dumping pages */ 110 int dump_timeleft; /* portion of dump_timeout remaining */ 111 int dump_ioerr; /* dump i/o error */ 112 int dump_check_used; /* enable check for used pages */ 113 char *dump_stack_scratch; /* scratch area for saving stack summary */ 114 115 /* 116 * Tunables for dump compression and parallelism. These can be set via 117 * /etc/system. 118 * 119 * dump_ncpu_low number of helpers for parallel lzjb 120 * This is also the minimum configuration. 121 * 122 * dump_bzip2_level bzip2 compression level: 1-9 123 * Higher numbers give greater compression, but take more memory 124 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB). 125 * 126 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform): 127 * if dump_plat_mincpu == 0, then always do single threaded dump 128 * if ncpu >= dump_plat_mincpu then try to use bzip2 129 * 130 * dump_metrics_on if set, metrics are collected in the kernel, passed 131 * to savecore via the dump file, and recorded by savecore in 132 * METRICS.txt. 133 */ 134 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */ 135 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */ 136 137 /* Use dump_plat_mincpu_default unless this variable is set by /etc/system */ 138 #define MINCPU_NOT_SET ((uint_t)-1) 139 uint_t dump_plat_mincpu = MINCPU_NOT_SET; 140 141 /* tunables for pre-reserved heap */ 142 uint_t dump_kmem_permap = 1024; 143 uint_t dump_kmem_pages = 0; 144 145 /* Define multiple buffers per helper to avoid stalling */ 146 #define NCBUF_PER_HELPER 2 147 #define NCMAP_PER_HELPER 4 148 149 /* minimum number of helpers configured */ 150 #define MINHELPERS (dump_ncpu_low) 151 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER) 152 153 /* 154 * Define constant parameters. 155 * 156 * CBUF_SIZE size of an output buffer 157 * 158 * CBUF_MAPSIZE size of virtual range for mapping pages 159 * 160 * CBUF_MAPNP size of virtual range in pages 161 * 162 */ 163 #define DUMP_1KB ((size_t)1 << 10) 164 #define DUMP_1MB ((size_t)1 << 20) 165 #define CBUF_SIZE ((size_t)1 << 17) 166 #define CBUF_MAPSHIFT (22) 167 #define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT) 168 #define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT)) 169 170 /* 171 * Compression metrics are accumulated nano-second subtotals. The 172 * results are normalized by the number of pages dumped. A report is 173 * generated when dumpsys() completes and is saved in the dump image 174 * after the trailing dump header. 175 * 176 * Metrics are always collected. Set the variable dump_metrics_on to 177 * cause metrics to be saved in the crash file, where savecore will 178 * save it in the file METRICS.txt. 179 */ 180 #define PERPAGES \ 181 PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \ 182 PERPAGE(copy) PERPAGE(compress) \ 183 PERPAGE(write) \ 184 PERPAGE(inwait) PERPAGE(outwait) 185 186 typedef struct perpage { 187 #define PERPAGE(x) hrtime_t x; 188 PERPAGES 189 #undef PERPAGE 190 } perpage_t; 191 192 /* 193 * This macro controls the code generation for collecting dump 194 * performance information. By default, the code is generated, but 195 * automatic saving of the information is disabled. If dump_metrics_on 196 * is set to 1, the timing information is passed to savecore via the 197 * crash file, where it is appended to the file dump-dir/METRICS.txt. 198 */ 199 #define COLLECT_METRICS 200 201 #ifdef COLLECT_METRICS 202 uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */ 203 204 #define HRSTART(v, m) v##ts.m = gethrtime() 205 #define HRSTOP(v, m) v.m += gethrtime() - v##ts.m 206 #define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s 207 #define HREND(v, m) v.m += gethrtime() - v##ts.m 208 #define HRNORM(v, m, n) v.m /= (n) 209 210 #else 211 #define HRSTART(v, m) 212 #define HRSTOP(v, m) 213 #define HRBEGIN(v, m, s) 214 #define HREND(v, m) 215 #define HRNORM(v, m, n) 216 #endif /* COLLECT_METRICS */ 217 218 /* 219 * Buffers for copying and compressing memory pages. 220 * 221 * cbuf_t buffer controllers: used for both input and output. 222 * 223 * The buffer state indicates how it is being used: 224 * 225 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for 226 * mapping input pages. 227 * 228 * CBUF_INREADY: input pages are mapped and ready for compression by a 229 * helper. 230 * 231 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap. 232 * 233 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available. 234 * 235 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper, 236 * ready to write out. 237 * 238 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper 239 * (reports UE errors.) 240 */ 241 242 typedef enum cbufstate { 243 CBUF_FREEMAP, 244 CBUF_INREADY, 245 CBUF_USEDMAP, 246 CBUF_FREEBUF, 247 CBUF_WRITE, 248 CBUF_ERRMSG 249 } cbufstate_t; 250 251 typedef struct cbuf cbuf_t; 252 253 struct cbuf { 254 cbuf_t *next; /* next in list */ 255 cbufstate_t state; /* processing state */ 256 size_t used; /* amount used */ 257 size_t size; /* mem size */ 258 char *buf; /* kmem or vmem */ 259 pgcnt_t pagenum; /* index to pfn map */ 260 pgcnt_t bitnum; /* first set bitnum */ 261 pfn_t pfn; /* first pfn in mapped range */ 262 int off; /* byte offset to first pfn */ 263 }; 264 265 static char dump_osimage_uuid[36 + 1]; 266 267 #define isdigit(ch) ((ch) >= '0' && (ch) <= '9') 268 #define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \ 269 ((ch) >= 'A' && (ch) <= 'F')) 270 271 /* 272 * cqueue_t queues: a uni-directional channel for communication 273 * from the master to helper tasks or vice-versa using put and 274 * get primitives. Both mappings and data buffers are passed via 275 * queues. Producers close a queue when done. The number of 276 * active producers is reference counted so the consumer can 277 * detect end of data. Concurrent access is mediated by atomic 278 * operations for panic dump, or mutex/cv for live dump. 279 * 280 * There a four queues, used as follows: 281 * 282 * Queue Dataflow NewState 283 * -------------------------------------------------- 284 * mainq master -> master FREEMAP 285 * master has initialized or unmapped an input buffer 286 * -------------------------------------------------- 287 * helperq master -> helper INREADY 288 * master has mapped input for use by helper 289 * -------------------------------------------------- 290 * mainq master <- helper USEDMAP 291 * helper is done with input 292 * -------------------------------------------------- 293 * freebufq master -> helper FREEBUF 294 * master has initialized or written an output buffer 295 * -------------------------------------------------- 296 * mainq master <- helper WRITE 297 * block of compressed pages from a helper 298 * -------------------------------------------------- 299 * mainq master <- helper ERRMSG 300 * error messages from a helper (memory error case) 301 * -------------------------------------------------- 302 * writerq master <- master WRITE 303 * non-blocking queue of blocks to write 304 * -------------------------------------------------- 305 */ 306 typedef struct cqueue { 307 cbuf_t *volatile first; /* first in list */ 308 cbuf_t *last; /* last in list */ 309 hrtime_t ts; /* timestamp */ 310 hrtime_t empty; /* total time empty */ 311 kmutex_t mutex; /* live state lock */ 312 kcondvar_t cv; /* live wait var */ 313 lock_t spinlock; /* panic mode spin lock */ 314 volatile uint_t open; /* producer ref count */ 315 } cqueue_t; 316 317 /* 318 * Convenience macros for using the cqueue functions 319 * Note that the caller must have defined "dumpsync_t *ds" 320 */ 321 #define CQ_IS_EMPTY(q) \ 322 (ds->q.first == NULL) 323 324 #define CQ_OPEN(q) \ 325 atomic_inc_uint(&ds->q.open) 326 327 #define CQ_CLOSE(q) \ 328 dumpsys_close_cq(&ds->q, ds->live) 329 330 #define CQ_PUT(q, cp, st) \ 331 dumpsys_put_cq(&ds->q, cp, st, ds->live) 332 333 #define CQ_GET(q) \ 334 dumpsys_get_cq(&ds->q, ds->live) 335 336 /* 337 * Dynamic state when dumpsys() is running. 338 */ 339 typedef struct dumpsync { 340 pgcnt_t npages; /* subtotal of pages dumped */ 341 pgcnt_t pages_mapped; /* subtotal of pages mapped */ 342 pgcnt_t pages_used; /* subtotal of pages used per map */ 343 size_t nwrite; /* subtotal of bytes written */ 344 uint_t live; /* running live dump */ 345 uint_t neednl; /* will need to print a newline */ 346 uint_t percent; /* dump progress */ 347 uint_t percent_done; /* dump progress reported */ 348 int sec_done; /* dump progress last report time */ 349 cqueue_t freebufq; /* free kmem bufs for writing */ 350 cqueue_t mainq; /* input for main task */ 351 cqueue_t helperq; /* input for helpers */ 352 cqueue_t writerq; /* input for writer */ 353 hrtime_t start; /* start time */ 354 hrtime_t elapsed; /* elapsed time when completed */ 355 hrtime_t iotime; /* time spent writing nwrite bytes */ 356 hrtime_t iowait; /* time spent waiting for output */ 357 hrtime_t iowaitts; /* iowait timestamp */ 358 perpage_t perpage; /* metrics */ 359 perpage_t perpagets; 360 int dumpcpu; /* master cpu */ 361 } dumpsync_t; 362 363 static dumpsync_t dumpsync; /* synchronization vars */ 364 365 /* 366 * helper_t helpers: contains the context for a stream. CPUs run in 367 * parallel at dump time; each CPU creates a single stream of 368 * compression data. Stream data is divided into CBUF_SIZE blocks. 369 * The blocks are written in order within a stream. But, blocks from 370 * multiple streams can be interleaved. Each stream is identified by a 371 * unique tag. 372 */ 373 typedef struct helper { 374 int helper; /* bound helper id */ 375 int tag; /* compression stream tag */ 376 perpage_t perpage; /* per page metrics */ 377 perpage_t perpagets; /* per page metrics (timestamps) */ 378 taskqid_t taskqid; /* live dump task ptr */ 379 int in, out; /* buffer offsets */ 380 cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */ 381 dumpsync_t *ds; /* pointer to sync vars */ 382 size_t used; /* counts input consumed */ 383 char *page; /* buffer for page copy */ 384 char *lzbuf; /* lzjb output */ 385 bz_stream bzstream; /* bzip2 state */ 386 } helper_t; 387 388 #define MAINHELPER (-1) /* helper is also the main task */ 389 #define FREEHELPER (-2) /* unbound helper */ 390 #define DONEHELPER (-3) /* helper finished */ 391 392 /* 393 * configuration vars for dumpsys 394 */ 395 typedef struct dumpcfg { 396 int threshold; /* ncpu threshold for bzip2 */ 397 int nhelper; /* number of helpers */ 398 int nhelper_used; /* actual number of helpers used */ 399 int ncmap; /* number VA pages for compression */ 400 int ncbuf; /* number of bufs for compression */ 401 int ncbuf_used; /* number of bufs in use */ 402 uint_t clevel; /* dump compression level */ 403 helper_t *helper; /* array of helpers */ 404 cbuf_t *cmap; /* array of input (map) buffers */ 405 cbuf_t *cbuf; /* array of output buffers */ 406 ulong_t *helpermap; /* set of dumpsys helper CPU ids */ 407 ulong_t *bitmap; /* bitmap for marking pages to dump */ 408 ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */ 409 pgcnt_t bitmapsize; /* size of bitmap */ 410 pgcnt_t rbitmapsize; /* size of bitmap for ranges */ 411 pgcnt_t found4m; /* number ranges allocated by dump */ 412 pgcnt_t foundsm; /* number small pages allocated by dump */ 413 pid_t *pids; /* list of process IDs at dump time */ 414 size_t maxsize; /* memory size needed at dump time */ 415 size_t maxvmsize; /* size of reserved VM */ 416 char *maxvm; /* reserved VM for spare pages */ 417 lock_t helper_lock; /* protect helper state */ 418 char helpers_wanted; /* flag to enable parallelism */ 419 } dumpcfg_t; 420 421 static dumpcfg_t dumpcfg; /* config vars */ 422 423 /* 424 * The dump I/O buffer. 425 * 426 * There is one I/O buffer used by dumpvp_write and dumvp_flush. It is 427 * sized according to the optimum device transfer speed. 428 */ 429 typedef struct dumpbuf { 430 vnode_t *cdev_vp; /* VCHR open of the dump device */ 431 len_t vp_limit; /* maximum write offset */ 432 offset_t vp_off; /* current dump device offset */ 433 char *cur; /* dump write pointer */ 434 char *start; /* dump buffer address */ 435 char *end; /* dump buffer end */ 436 size_t size; /* size of dumpbuf in bytes */ 437 size_t iosize; /* best transfer size for device */ 438 } dumpbuf_t; 439 440 dumpbuf_t dumpbuf; /* I/O buffer */ 441 442 /* 443 * The dump I/O buffer must be at least one page, at most xfer_size 444 * bytes, and should scale with physmem in between. The transfer size 445 * passed in will either represent a global default (maxphys) or the 446 * best size for the device. The size of the dumpbuf I/O buffer is 447 * limited by dumpbuf_limit (8MB by default) because the dump 448 * performance saturates beyond a certain size. The default is to 449 * select 1/4096 of the memory. 450 */ 451 static int dumpbuf_fraction = 12; /* memory size scale factor */ 452 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */ 453 454 static size_t 455 dumpbuf_iosize(size_t xfer_size) 456 { 457 size_t iosize = ptob(physmem >> dumpbuf_fraction); 458 459 if (iosize < PAGESIZE) 460 iosize = PAGESIZE; 461 else if (iosize > xfer_size) 462 iosize = xfer_size; 463 if (iosize > dumpbuf_limit) 464 iosize = dumpbuf_limit; 465 return (iosize & PAGEMASK); 466 } 467 468 /* 469 * resize the I/O buffer 470 */ 471 static void 472 dumpbuf_resize(void) 473 { 474 char *old_buf = dumpbuf.start; 475 size_t old_size = dumpbuf.size; 476 char *new_buf; 477 size_t new_size; 478 479 ASSERT(MUTEX_HELD(&dump_lock)); 480 481 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys)); 482 if (new_size <= old_size) 483 return; /* no need to reallocate buffer */ 484 485 new_buf = kmem_alloc(new_size, KM_SLEEP); 486 dumpbuf.size = new_size; 487 dumpbuf.start = new_buf; 488 dumpbuf.end = new_buf + new_size; 489 kmem_free(old_buf, old_size); 490 } 491 492 /* 493 * dump_update_clevel is called when dumpadm configures the dump device. 494 * Calculate number of helpers and buffers. 495 * Allocate the minimum configuration for now. 496 * 497 * When the dump file is configured we reserve a minimum amount of 498 * memory for use at crash time. But we reserve VA for all the memory 499 * we really want in order to do the fastest dump possible. The VA is 500 * backed by pages not being dumped, according to the bitmap. If 501 * there is insufficient spare memory, however, we fall back to the 502 * minimum. 503 * 504 * Live dump (savecore -L) always uses the minimum config. 505 * 506 * clevel 0 is single threaded lzjb 507 * clevel 1 is parallel lzjb 508 * clevel 2 is parallel bzip2 509 * 510 * The ncpu threshold is selected with dump_plat_mincpu. 511 * On OPL, set_platform_defaults() overrides the sun4u setting. 512 * The actual values are defined via DUMP_PLAT_*_MINCPU macros. 513 * 514 * Architecture Threshold Algorithm 515 * sun4u < 51 parallel lzjb 516 * sun4u >= 51 parallel bzip2(*) 517 * sun4u OPL < 8 parallel lzjb 518 * sun4u OPL >= 8 parallel bzip2(*) 519 * sun4v < 128 parallel lzjb 520 * sun4v >= 128 parallel bzip2(*) 521 * x86 < 11 parallel lzjb 522 * x86 >= 11 parallel bzip2(*) 523 * 32-bit N/A single-threaded lzjb 524 * 525 * (*) bzip2 is only chosen if there is sufficient available 526 * memory for buffers at dump time. See dumpsys_get_maxmem(). 527 * 528 * Faster dump devices have larger I/O buffers. The threshold value is 529 * increased according to the size of the dump I/O buffer, because 530 * parallel lzjb performs better with faster disks. For buffers >= 1MB 531 * the threshold is 3X; for buffers >= 256K threshold is 2X. 532 * 533 * For parallel dumps, the number of helpers is ncpu-1. The CPU 534 * running panic runs the main task. For single-threaded dumps, the 535 * panic CPU does lzjb compression (it is tagged as MAINHELPER.) 536 * 537 * Need multiple buffers per helper so that they do not block waiting 538 * for the main task. 539 * parallel single-threaded 540 * Number of output buffers: nhelper*2 1 541 * Number of mapping buffers: nhelper*4 1 542 * 543 */ 544 static void 545 dump_update_clevel() 546 { 547 int tag; 548 size_t bz2size; 549 helper_t *hp, *hpend; 550 cbuf_t *cp, *cpend; 551 dumpcfg_t *old = &dumpcfg; 552 dumpcfg_t newcfg = *old; 553 dumpcfg_t *new = &newcfg; 554 555 ASSERT(MUTEX_HELD(&dump_lock)); 556 557 /* 558 * Free the previously allocated bufs and VM. 559 */ 560 if (old->helper != NULL) { 561 562 /* helpers */ 563 hpend = &old->helper[old->nhelper]; 564 for (hp = old->helper; hp != hpend; hp++) { 565 if (hp->lzbuf != NULL) 566 kmem_free(hp->lzbuf, PAGESIZE); 567 if (hp->page != NULL) 568 kmem_free(hp->page, PAGESIZE); 569 } 570 kmem_free(old->helper, old->nhelper * sizeof (helper_t)); 571 572 /* VM space for mapping pages */ 573 cpend = &old->cmap[old->ncmap]; 574 for (cp = old->cmap; cp != cpend; cp++) 575 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE); 576 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t)); 577 578 /* output bufs */ 579 cpend = &old->cbuf[old->ncbuf]; 580 for (cp = old->cbuf; cp != cpend; cp++) 581 if (cp->buf != NULL) 582 kmem_free(cp->buf, cp->size); 583 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t)); 584 585 /* reserved VM for dumpsys_get_maxmem */ 586 if (old->maxvmsize > 0) 587 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize); 588 } 589 590 /* 591 * Allocate memory and VM. 592 * One CPU runs dumpsys, the rest are helpers. 593 */ 594 new->nhelper = ncpus - 1; 595 if (new->nhelper < 1) 596 new->nhelper = 1; 597 598 if (new->nhelper > DUMP_MAX_NHELPER) 599 new->nhelper = DUMP_MAX_NHELPER; 600 601 /* use platform default, unless /etc/system overrides */ 602 if (dump_plat_mincpu == MINCPU_NOT_SET) 603 dump_plat_mincpu = dump_plat_mincpu_default; 604 605 /* increase threshold for faster disks */ 606 new->threshold = dump_plat_mincpu; 607 if (dumpbuf.iosize >= DUMP_1MB) 608 new->threshold *= 3; 609 else if (dumpbuf.iosize >= (256 * DUMP_1KB)) 610 new->threshold *= 2; 611 612 /* figure compression level based upon the computed threshold. */ 613 if (dump_plat_mincpu == 0 || new->nhelper < 2) { 614 new->clevel = 0; 615 new->nhelper = 1; 616 } else if ((new->nhelper + 1) >= new->threshold) { 617 new->clevel = DUMP_CLEVEL_BZIP2; 618 } else { 619 new->clevel = DUMP_CLEVEL_LZJB; 620 } 621 622 if (new->clevel == 0) { 623 new->ncbuf = 1; 624 new->ncmap = 1; 625 } else { 626 new->ncbuf = NCBUF_PER_HELPER * new->nhelper; 627 new->ncmap = NCMAP_PER_HELPER * new->nhelper; 628 } 629 630 /* 631 * Allocate new data structures and buffers for MINHELPERS, 632 * and also figure the max desired size. 633 */ 634 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 635 new->maxsize = 0; 636 new->maxvmsize = 0; 637 new->maxvm = NULL; 638 tag = 1; 639 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP); 640 hpend = &new->helper[new->nhelper]; 641 for (hp = new->helper; hp != hpend; hp++) { 642 hp->tag = tag++; 643 if (hp < &new->helper[MINHELPERS]) { 644 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP); 645 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP); 646 } else if (new->clevel < DUMP_CLEVEL_BZIP2) { 647 new->maxsize += 2 * PAGESIZE; 648 } else { 649 new->maxsize += PAGESIZE; 650 } 651 if (new->clevel >= DUMP_CLEVEL_BZIP2) 652 new->maxsize += bz2size; 653 } 654 655 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP); 656 cpend = &new->cbuf[new->ncbuf]; 657 for (cp = new->cbuf; cp != cpend; cp++) { 658 cp->state = CBUF_FREEBUF; 659 cp->size = CBUF_SIZE; 660 if (cp < &new->cbuf[MINCBUFS]) 661 cp->buf = kmem_alloc(cp->size, KM_SLEEP); 662 else 663 new->maxsize += cp->size; 664 } 665 666 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP); 667 cpend = &new->cmap[new->ncmap]; 668 for (cp = new->cmap; cp != cpend; cp++) { 669 cp->state = CBUF_FREEMAP; 670 cp->size = CBUF_MAPSIZE; 671 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE, 672 0, 0, NULL, NULL, VM_SLEEP); 673 } 674 675 /* reserve VA to be backed with spare pages at crash time */ 676 if (new->maxsize > 0) { 677 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE); 678 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE); 679 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize, 680 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP); 681 } 682 683 /* 684 * Reserve memory for kmem allocation calls made during crash dump. The 685 * hat layer allocates memory for each mapping created, and the I/O path 686 * allocates buffers and data structs. 687 * 688 * On larger systems, we easily exceed the lower amount, so we need some 689 * more space; the cut-over point is relatively arbitrary. If we run 690 * out, the only impact is that kmem state in the dump becomes 691 * inconsistent. 692 */ 693 694 if (dump_kmem_pages == 0) { 695 if (physmem > (16 * ONE_GIG) / PAGESIZE) 696 dump_kmem_pages = 20; 697 else 698 dump_kmem_pages = 8; 699 } 700 701 kmem_dump_init((new->ncmap * dump_kmem_permap) + 702 (dump_kmem_pages * PAGESIZE)); 703 704 /* set new config pointers */ 705 *old = *new; 706 } 707 708 /* 709 * Define a struct memlist walker to optimize bitnum to pfn 710 * lookup. The walker maintains the state of the list traversal. 711 */ 712 typedef struct dumpmlw { 713 struct memlist *mp; /* current memlist */ 714 pgcnt_t basenum; /* bitnum base offset */ 715 pgcnt_t mppages; /* current memlist size */ 716 pgcnt_t mpleft; /* size to end of current memlist */ 717 pfn_t mpaddr; /* first pfn in memlist */ 718 } dumpmlw_t; 719 720 /* initialize the walker */ 721 static inline void 722 dump_init_memlist_walker(dumpmlw_t *pw) 723 { 724 pw->mp = phys_install; 725 pw->basenum = 0; 726 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 727 pw->mpleft = pw->mppages; 728 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 729 } 730 731 /* 732 * Lookup pfn given bitnum. The memlist can be quite long on some 733 * systems (e.g.: one per board). To optimize sequential lookups, the 734 * caller initializes and presents a memlist walker. 735 */ 736 static pfn_t 737 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw) 738 { 739 bitnum -= pw->basenum; 740 while (pw->mp != NULL) { 741 if (bitnum < pw->mppages) { 742 pw->mpleft = pw->mppages - bitnum; 743 return (pw->mpaddr + bitnum); 744 } 745 bitnum -= pw->mppages; 746 pw->basenum += pw->mppages; 747 pw->mp = pw->mp->ml_next; 748 if (pw->mp != NULL) { 749 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 750 pw->mpleft = pw->mppages; 751 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 752 } 753 } 754 return (PFN_INVALID); 755 } 756 757 static pgcnt_t 758 dump_pfn_to_bitnum(pfn_t pfn) 759 { 760 struct memlist *mp; 761 pgcnt_t bitnum = 0; 762 763 for (mp = phys_install; mp != NULL; mp = mp->ml_next) { 764 if (pfn >= (mp->ml_address >> PAGESHIFT) && 765 pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT)) 766 return (bitnum + pfn - (mp->ml_address >> PAGESHIFT)); 767 bitnum += mp->ml_size >> PAGESHIFT; 768 } 769 return ((pgcnt_t)-1); 770 } 771 772 /* 773 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The 774 * mapping of pfn to range index is imperfect because pfn and bitnum 775 * do not have the same phase. To make sure a CBUF_MAPSIZE range is 776 * covered, call this for both ends: 777 * dump_set_used(base) 778 * dump_set_used(base+CBUF_MAPNP-1) 779 * 780 * This is used during a panic dump to mark pages allocated by 781 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by 782 * page_get_mnode_freelist() to make sure pages used by dump are never 783 * allocated. 784 */ 785 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT)) 786 787 static void 788 dump_set_used(pfn_t pfn) 789 { 790 791 pgcnt_t bitnum, rbitnum; 792 793 bitnum = dump_pfn_to_bitnum(pfn); 794 ASSERT(bitnum != (pgcnt_t)-1); 795 796 rbitnum = CBUF_MAPP2R(bitnum); 797 ASSERT(rbitnum < dumpcfg.rbitmapsize); 798 799 BT_SET(dumpcfg.rbitmap, rbitnum); 800 } 801 802 int 803 dump_test_used(pfn_t pfn) 804 { 805 pgcnt_t bitnum, rbitnum; 806 807 bitnum = dump_pfn_to_bitnum(pfn); 808 ASSERT(bitnum != (pgcnt_t)-1); 809 810 rbitnum = CBUF_MAPP2R(bitnum); 811 ASSERT(rbitnum < dumpcfg.rbitmapsize); 812 813 return (BT_TEST(dumpcfg.rbitmap, rbitnum)); 814 } 815 816 /* 817 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library. 818 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit(). 819 */ 820 static void * 821 dumpbzalloc(void *opaque, int items, int size) 822 { 823 size_t *sz; 824 char *ret; 825 826 ASSERT(opaque != NULL); 827 sz = opaque; 828 ret = dumpcfg.maxvm + *sz; 829 *sz += items * size; 830 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN); 831 ASSERT(*sz <= dumpcfg.maxvmsize); 832 return (ret); 833 } 834 835 /*ARGSUSED*/ 836 static void 837 dumpbzfree(void *opaque, void *addr) 838 { 839 } 840 841 /* 842 * Perform additional checks on the page to see if we can really use 843 * it. The kernel (kas) pages are always set in the bitmap. However, 844 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the 845 * bitmap. So we check for them. 846 */ 847 static inline int 848 dump_pfn_check(pfn_t pfn) 849 { 850 page_t *pp = page_numtopp_nolock(pfn); 851 if (pp == NULL || pp->p_pagenum != pfn || 852 #if defined(__sparc) 853 pp->p_vnode == &promvp || 854 #else 855 PP_ISBOOTPAGES(pp) || 856 #endif 857 pp->p_toxic != 0) 858 return (0); 859 return (1); 860 } 861 862 /* 863 * Check a range to see if all contained pages are available and 864 * return non-zero if the range can be used. 865 */ 866 static inline int 867 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn) 868 { 869 for (; start < end; start++, pfn++) { 870 if (BT_TEST(dumpcfg.bitmap, start)) 871 return (0); 872 if (!dump_pfn_check(pfn)) 873 return (0); 874 } 875 return (1); 876 } 877 878 /* 879 * dumpsys_get_maxmem() is called during panic. Find unused ranges 880 * and use them for buffers. If we find enough memory switch to 881 * parallel bzip2, otherwise use parallel lzjb. 882 * 883 * It searches the dump bitmap in 2 passes. The first time it looks 884 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages. 885 */ 886 static void 887 dumpsys_get_maxmem() 888 { 889 dumpcfg_t *cfg = &dumpcfg; 890 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf]; 891 helper_t *endhp = &cfg->helper[cfg->nhelper]; 892 pgcnt_t bitnum, end; 893 size_t sz, endsz, bz2size; 894 pfn_t pfn, off; 895 cbuf_t *cp; 896 helper_t *hp, *ohp; 897 dumpmlw_t mlw; 898 int k; 899 900 /* 901 * Setting dump_plat_mincpu to 0 at any time forces a serial 902 * dump. 903 */ 904 if (dump_plat_mincpu == 0) { 905 cfg->clevel = 0; 906 return; 907 } 908 909 /* 910 * There may be no point in looking for spare memory. If 911 * dumping all memory, then none is spare. If doing a serial 912 * dump, then already have buffers. 913 */ 914 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB || 915 (dump_conflags & DUMP_ALL) != 0) { 916 if (cfg->clevel > DUMP_CLEVEL_LZJB) 917 cfg->clevel = DUMP_CLEVEL_LZJB; 918 return; 919 } 920 921 sz = 0; 922 cfg->found4m = 0; 923 cfg->foundsm = 0; 924 925 /* bitmap of ranges used to estimate which pfns are being used */ 926 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize)); 927 928 /* find ranges that are not being dumped to use for buffers */ 929 dump_init_memlist_walker(&mlw); 930 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 931 dump_timeleft = dump_timeout; 932 end = bitnum + CBUF_MAPNP; 933 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 934 ASSERT(pfn != PFN_INVALID); 935 936 /* skip partial range at end of mem segment */ 937 if (mlw.mpleft < CBUF_MAPNP) { 938 end = bitnum + mlw.mpleft; 939 continue; 940 } 941 942 /* skip non aligned pages */ 943 off = P2PHASE(pfn, CBUF_MAPNP); 944 if (off != 0) { 945 end -= off; 946 continue; 947 } 948 949 if (!dump_range_check(bitnum, end, pfn)) 950 continue; 951 952 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize); 953 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn, 954 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 955 sz += CBUF_MAPSIZE; 956 cfg->found4m++; 957 958 /* set the bitmap for both ends to be sure to cover the range */ 959 dump_set_used(pfn); 960 dump_set_used(pfn + CBUF_MAPNP - 1); 961 962 if (sz >= cfg->maxsize) 963 goto foundmax; 964 } 965 966 /* Add small pages if we can't find enough large pages. */ 967 dump_init_memlist_walker(&mlw); 968 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 969 dump_timeleft = dump_timeout; 970 end = bitnum + CBUF_MAPNP; 971 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 972 ASSERT(pfn != PFN_INVALID); 973 974 /* Find any non-aligned pages at start and end of segment. */ 975 off = P2PHASE(pfn, CBUF_MAPNP); 976 if (mlw.mpleft < CBUF_MAPNP) { 977 end = bitnum + mlw.mpleft; 978 } else if (off != 0) { 979 end -= off; 980 } else if (cfg->found4m && dump_test_used(pfn)) { 981 continue; 982 } 983 984 for (; bitnum < end; bitnum++, pfn++) { 985 dump_timeleft = dump_timeout; 986 if (BT_TEST(dumpcfg.bitmap, bitnum)) 987 continue; 988 if (!dump_pfn_check(pfn)) 989 continue; 990 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize); 991 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn, 992 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 993 sz += PAGESIZE; 994 cfg->foundsm++; 995 dump_set_used(pfn); 996 if (sz >= cfg->maxsize) 997 goto foundmax; 998 } 999 } 1000 1001 /* Fall back to lzjb if we did not get enough memory for bzip2. */ 1002 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper; 1003 if (sz < endsz) { 1004 cfg->clevel = DUMP_CLEVEL_LZJB; 1005 } 1006 1007 /* Allocate memory for as many helpers as we can. */ 1008 foundmax: 1009 1010 /* Byte offsets into memory found and mapped above */ 1011 endsz = sz; 1012 sz = 0; 1013 1014 /* Set the size for bzip2 state. Only bzip2 needs it. */ 1015 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 1016 1017 /* Skip the preallocate output buffers. */ 1018 cp = &cfg->cbuf[MINCBUFS]; 1019 1020 /* Use this to move memory up from the preallocated helpers. */ 1021 ohp = cfg->helper; 1022 1023 /* Loop over all helpers and allocate memory. */ 1024 for (hp = cfg->helper; hp < endhp; hp++) { 1025 1026 /* Skip preallocated helpers by checking hp->page. */ 1027 if (hp->page == NULL) { 1028 if (cfg->clevel <= DUMP_CLEVEL_LZJB) { 1029 /* lzjb needs 2 1-page buffers */ 1030 if ((sz + (2 * PAGESIZE)) > endsz) 1031 break; 1032 hp->page = cfg->maxvm + sz; 1033 sz += PAGESIZE; 1034 hp->lzbuf = cfg->maxvm + sz; 1035 sz += PAGESIZE; 1036 1037 } else if (ohp->lzbuf != NULL) { 1038 /* re-use the preallocted lzjb page for bzip2 */ 1039 hp->page = ohp->lzbuf; 1040 ohp->lzbuf = NULL; 1041 ++ohp; 1042 1043 } else { 1044 /* bzip2 needs a 1-page buffer */ 1045 if ((sz + PAGESIZE) > endsz) 1046 break; 1047 hp->page = cfg->maxvm + sz; 1048 sz += PAGESIZE; 1049 } 1050 } 1051 1052 /* 1053 * Add output buffers per helper. The number of 1054 * buffers per helper is determined by the ratio of 1055 * ncbuf to nhelper. 1056 */ 1057 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz && 1058 k < NCBUF_PER_HELPER; k++) { 1059 cp->state = CBUF_FREEBUF; 1060 cp->size = CBUF_SIZE; 1061 cp->buf = cfg->maxvm + sz; 1062 sz += CBUF_SIZE; 1063 ++cp; 1064 } 1065 1066 /* 1067 * bzip2 needs compression state. Use the dumpbzalloc 1068 * and dumpbzfree callbacks to allocate the memory. 1069 * bzip2 does allocation only at init time. 1070 */ 1071 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) { 1072 if ((sz + bz2size) > endsz) { 1073 hp->page = NULL; 1074 break; 1075 } else { 1076 hp->bzstream.opaque = &sz; 1077 hp->bzstream.bzalloc = dumpbzalloc; 1078 hp->bzstream.bzfree = dumpbzfree; 1079 (void) BZ2_bzCompressInit(&hp->bzstream, 1080 dump_bzip2_level, 0, 0); 1081 hp->bzstream.opaque = NULL; 1082 } 1083 } 1084 } 1085 1086 /* Finish allocating output buffers */ 1087 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) { 1088 cp->state = CBUF_FREEBUF; 1089 cp->size = CBUF_SIZE; 1090 cp->buf = cfg->maxvm + sz; 1091 sz += CBUF_SIZE; 1092 } 1093 1094 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */ 1095 if (cfg->found4m || cfg->foundsm) 1096 dump_check_used = 1; 1097 1098 ASSERT(sz <= endsz); 1099 } 1100 1101 static void 1102 dumphdr_init(void) 1103 { 1104 pgcnt_t npages = 0; 1105 1106 ASSERT(MUTEX_HELD(&dump_lock)); 1107 1108 if (dumphdr == NULL) { 1109 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP); 1110 dumphdr->dump_magic = DUMP_MAGIC; 1111 dumphdr->dump_version = DUMP_VERSION; 1112 dumphdr->dump_wordsize = DUMP_WORDSIZE; 1113 dumphdr->dump_pageshift = PAGESHIFT; 1114 dumphdr->dump_pagesize = PAGESIZE; 1115 dumphdr->dump_utsname = utsname; 1116 (void) strcpy(dumphdr->dump_platform, platform); 1117 dumpbuf.size = dumpbuf_iosize(maxphys); 1118 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); 1119 dumpbuf.end = dumpbuf.start + dumpbuf.size; 1120 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); 1121 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); 1122 LOCK_INIT_HELD(&dumpcfg.helper_lock); 1123 dump_stack_scratch = kmem_alloc(STACK_BUF_SIZE, KM_SLEEP); 1124 (void) strncpy(dumphdr->dump_uuid, dump_get_uuid(), 1125 sizeof (dumphdr->dump_uuid)); 1126 } 1127 1128 npages = num_phys_pages(); 1129 1130 if (dumpcfg.bitmapsize != npages) { 1131 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP)); 1132 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP); 1133 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP); 1134 1135 if (dumpcfg.bitmap != NULL) 1136 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg. 1137 bitmapsize)); 1138 if (dumpcfg.rbitmap != NULL) 1139 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg. 1140 rbitmapsize)); 1141 dumpcfg.bitmap = map; 1142 dumpcfg.bitmapsize = npages; 1143 dumpcfg.rbitmap = rmap; 1144 dumpcfg.rbitmapsize = rlen; 1145 } 1146 } 1147 1148 /* 1149 * Establish a new dump device. 1150 */ 1151 int 1152 dumpinit(vnode_t *vp, char *name, int justchecking) 1153 { 1154 vnode_t *cvp; 1155 vattr_t vattr; 1156 vnode_t *cdev_vp; 1157 int error = 0; 1158 1159 ASSERT(MUTEX_HELD(&dump_lock)); 1160 1161 dumphdr_init(); 1162 1163 cvp = common_specvp(vp); 1164 if (cvp == dumpvp) 1165 return (0); 1166 1167 /* 1168 * Determine whether this is a plausible dump device. We want either: 1169 * (1) a real device that's not mounted and has a cb_dump routine, or 1170 * (2) a swapfile on some filesystem that has a vop_dump routine. 1171 */ 1172 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0) 1173 return (error); 1174 1175 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV; 1176 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) { 1177 if (vattr.va_type == VBLK || vattr.va_type == VCHR) { 1178 if (devopsp[getmajor(vattr.va_rdev)]-> 1179 devo_cb_ops->cb_dump == nodev) 1180 error = ENOTSUP; 1181 else if (vfs_devismounted(vattr.va_rdev)) 1182 error = EBUSY; 1183 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip), 1184 ZFS_DRIVER) == 0 && 1185 IS_SWAPVP(common_specvp(cvp))) 1186 error = EBUSY; 1187 } else { 1188 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) || 1189 !IS_SWAPVP(cvp)) 1190 error = ENOTSUP; 1191 } 1192 } 1193 1194 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) 1195 error = ENOSPC; 1196 1197 if (error || justchecking) { 1198 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0, 1199 kcred, NULL); 1200 return (error); 1201 } 1202 1203 VN_HOLD(cvp); 1204 1205 if (dumpvp != NULL) 1206 dumpfini(); /* unconfigure the old dump device */ 1207 1208 dumpvp = cvp; 1209 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 1210 dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP); 1211 (void) strcpy(dumppath, name); 1212 dumpbuf.iosize = 0; 1213 1214 /* 1215 * If the dump device is a block device, attempt to open up the 1216 * corresponding character device and determine its maximum transfer 1217 * size. We use this information to potentially resize dumpbuf to a 1218 * larger and more optimal size for performing i/o to the dump device. 1219 */ 1220 if (cvp->v_type == VBLK && 1221 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) { 1222 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1223 size_t blk_size; 1224 struct dk_cinfo dki; 1225 struct dk_minfo minf; 1226 1227 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO, 1228 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL) 1229 == 0 && minf.dki_lbsize != 0) 1230 blk_size = minf.dki_lbsize; 1231 else 1232 blk_size = DEV_BSIZE; 1233 1234 if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki, 1235 FKIOCTL, kcred, NULL, NULL) == 0) { 1236 dumpbuf.iosize = dki.dki_maxtransfer * blk_size; 1237 dumpbuf_resize(); 1238 } 1239 /* 1240 * If we are working with a zvol then dumpify it 1241 * if it's not being used as swap. 1242 */ 1243 if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) { 1244 if (IS_SWAPVP(common_specvp(cvp))) 1245 error = EBUSY; 1246 else if ((error = VOP_IOCTL(cdev_vp, 1247 DKIOCDUMPINIT, NULL, FKIOCTL, kcred, 1248 NULL, NULL)) != 0) 1249 dumpfini(); 1250 } 1251 1252 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1253 kcred, NULL); 1254 } 1255 1256 VN_RELE(cdev_vp); 1257 } 1258 1259 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20); 1260 1261 dump_update_clevel(); 1262 1263 return (error); 1264 } 1265 1266 void 1267 dumpfini(void) 1268 { 1269 vattr_t vattr; 1270 boolean_t is_zfs = B_FALSE; 1271 vnode_t *cdev_vp; 1272 ASSERT(MUTEX_HELD(&dump_lock)); 1273 1274 kmem_free(dumppath, strlen(dumppath) + 1); 1275 1276 /* 1277 * Determine if we are using zvols for our dump device 1278 */ 1279 vattr.va_mask = AT_RDEV; 1280 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) { 1281 is_zfs = (getmajor(vattr.va_rdev) == 1282 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE; 1283 } 1284 1285 /* 1286 * If we have a zvol dump device then we call into zfs so 1287 * that it may have a chance to cleanup. 1288 */ 1289 if (is_zfs && 1290 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) { 1291 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1292 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL, 1293 kcred, NULL, NULL); 1294 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1295 kcred, NULL); 1296 } 1297 VN_RELE(cdev_vp); 1298 } 1299 1300 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL); 1301 1302 VN_RELE(dumpvp); 1303 1304 dumpvp = NULL; 1305 dumpvp_size = 0; 1306 dumppath = NULL; 1307 } 1308 1309 static offset_t 1310 dumpvp_flush(void) 1311 { 1312 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE); 1313 hrtime_t iotime; 1314 int err; 1315 1316 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) { 1317 dump_ioerr = ENOSPC; 1318 dumpbuf.vp_off = dumpbuf.vp_limit; 1319 } else if (size != 0) { 1320 iotime = gethrtime(); 1321 dumpsync.iowait += iotime - dumpsync.iowaitts; 1322 if (panicstr) 1323 err = VOP_DUMP(dumpvp, dumpbuf.start, 1324 lbtodb(dumpbuf.vp_off), btod(size), NULL); 1325 else 1326 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ? 1327 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size, 1328 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit, 1329 kcred, 0); 1330 if (err && dump_ioerr == 0) 1331 dump_ioerr = err; 1332 dumpsync.iowaitts = gethrtime(); 1333 dumpsync.iotime += dumpsync.iowaitts - iotime; 1334 dumpsync.nwrite += size; 1335 dumpbuf.vp_off += size; 1336 } 1337 dumpbuf.cur = dumpbuf.start; 1338 dump_timeleft = dump_timeout; 1339 return (dumpbuf.vp_off); 1340 } 1341 1342 /* maximize write speed by keeping seek offset aligned with size */ 1343 void 1344 dumpvp_write(const void *va, size_t size) 1345 { 1346 size_t len, off, sz; 1347 1348 while (size != 0) { 1349 len = MIN(size, dumpbuf.end - dumpbuf.cur); 1350 if (len == 0) { 1351 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size); 1352 if (off == 0 || !ISP2(dumpbuf.size)) { 1353 (void) dumpvp_flush(); 1354 } else { 1355 sz = dumpbuf.size - off; 1356 dumpbuf.cur = dumpbuf.start + sz; 1357 (void) dumpvp_flush(); 1358 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off); 1359 dumpbuf.cur += off; 1360 } 1361 } else { 1362 bcopy(va, dumpbuf.cur, len); 1363 va = (char *)va + len; 1364 dumpbuf.cur += len; 1365 size -= len; 1366 } 1367 } 1368 } 1369 1370 /*ARGSUSED*/ 1371 static void 1372 dumpvp_ksyms_write(const void *src, void *dst, size_t size) 1373 { 1374 dumpvp_write(src, size); 1375 } 1376 1377 /* 1378 * Mark 'pfn' in the bitmap and dump its translation table entry. 1379 */ 1380 void 1381 dump_addpage(struct as *as, void *va, pfn_t pfn) 1382 { 1383 mem_vtop_t mem_vtop; 1384 pgcnt_t bitnum; 1385 1386 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1387 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1388 dumphdr->dump_npages++; 1389 BT_SET(dumpcfg.bitmap, bitnum); 1390 } 1391 dumphdr->dump_nvtop++; 1392 mem_vtop.m_as = as; 1393 mem_vtop.m_va = va; 1394 mem_vtop.m_pfn = pfn; 1395 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 1396 } 1397 dump_timeleft = dump_timeout; 1398 } 1399 1400 /* 1401 * Mark 'pfn' in the bitmap 1402 */ 1403 void 1404 dump_page(pfn_t pfn) 1405 { 1406 pgcnt_t bitnum; 1407 1408 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1409 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1410 dumphdr->dump_npages++; 1411 BT_SET(dumpcfg.bitmap, bitnum); 1412 } 1413 } 1414 dump_timeleft = dump_timeout; 1415 } 1416 1417 /* 1418 * Dump the <as, va, pfn> information for a given address space. 1419 * SEGOP_DUMP() will call dump_addpage() for each page in the segment. 1420 */ 1421 static void 1422 dump_as(struct as *as) 1423 { 1424 struct seg *seg; 1425 1426 AS_LOCK_ENTER(as, RW_READER); 1427 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 1428 if (seg->s_as != as) 1429 break; 1430 if (seg->s_ops == NULL) 1431 continue; 1432 SEGOP_DUMP(seg); 1433 } 1434 AS_LOCK_EXIT(as); 1435 1436 if (seg != NULL) 1437 cmn_err(CE_WARN, "invalid segment %p in address space %p", 1438 (void *)seg, (void *)as); 1439 } 1440 1441 static int 1442 dump_process(pid_t pid) 1443 { 1444 proc_t *p = sprlock(pid); 1445 1446 if (p == NULL) 1447 return (-1); 1448 if (p->p_as != &kas) { 1449 mutex_exit(&p->p_lock); 1450 dump_as(p->p_as); 1451 mutex_enter(&p->p_lock); 1452 } 1453 1454 sprunlock(p); 1455 1456 return (0); 1457 } 1458 1459 /* 1460 * The following functions (dump_summary(), dump_ereports(), and 1461 * dump_messages()), write data to an uncompressed area within the 1462 * crashdump. The layout of these is 1463 * 1464 * +------------------------------------------------------------+ 1465 * | compressed pages | summary | ereports | messages | 1466 * +------------------------------------------------------------+ 1467 * 1468 * With the advent of saving a compressed crash dump by default, we 1469 * need to save a little more data to describe the failure mode in 1470 * an uncompressed buffer available before savecore uncompresses 1471 * the dump. Initially this is a copy of the stack trace. Additional 1472 * summary information should be added here. 1473 */ 1474 1475 void 1476 dump_summary(void) 1477 { 1478 u_offset_t dumpvp_start; 1479 summary_dump_t sd; 1480 1481 if (dumpvp == NULL || dumphdr == NULL) 1482 return; 1483 1484 dumpbuf.cur = dumpbuf.start; 1485 1486 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE + 1487 DUMP_ERPTSIZE); 1488 dumpvp_start = dumpbuf.vp_limit - DUMP_SUMMARYSIZE; 1489 dumpbuf.vp_off = dumpvp_start; 1490 1491 sd.sd_magic = SUMMARY_MAGIC; 1492 sd.sd_ssum = checksum32(dump_stack_scratch, STACK_BUF_SIZE); 1493 dumpvp_write(&sd, sizeof (sd)); 1494 dumpvp_write(dump_stack_scratch, STACK_BUF_SIZE); 1495 1496 sd.sd_magic = 0; /* indicate end of summary */ 1497 dumpvp_write(&sd, sizeof (sd)); 1498 (void) dumpvp_flush(); 1499 } 1500 1501 void 1502 dump_ereports(void) 1503 { 1504 u_offset_t dumpvp_start; 1505 erpt_dump_t ed; 1506 1507 if (dumpvp == NULL || dumphdr == NULL) 1508 return; 1509 1510 dumpbuf.cur = dumpbuf.start; 1511 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE); 1512 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE; 1513 dumpbuf.vp_off = dumpvp_start; 1514 1515 fm_ereport_dump(); 1516 if (panicstr) 1517 errorq_dump(); 1518 1519 bzero(&ed, sizeof (ed)); /* indicate end of ereports */ 1520 dumpvp_write(&ed, sizeof (ed)); 1521 (void) dumpvp_flush(); 1522 1523 if (!panicstr) { 1524 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1525 (size_t)(dumpbuf.vp_off - dumpvp_start), 1526 B_INVAL | B_FORCE, kcred, NULL); 1527 } 1528 } 1529 1530 void 1531 dump_messages(void) 1532 { 1533 log_dump_t ld; 1534 mblk_t *mctl, *mdata; 1535 queue_t *q, *qlast; 1536 u_offset_t dumpvp_start; 1537 1538 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL) 1539 return; 1540 1541 dumpbuf.cur = dumpbuf.start; 1542 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET; 1543 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE; 1544 dumpbuf.vp_off = dumpvp_start; 1545 1546 qlast = NULL; 1547 do { 1548 for (q = log_consq; q->q_next != qlast; q = q->q_next) 1549 continue; 1550 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) { 1551 dump_timeleft = dump_timeout; 1552 mdata = mctl->b_cont; 1553 ld.ld_magic = LOG_MAGIC; 1554 ld.ld_msgsize = MBLKL(mctl->b_cont); 1555 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl)); 1556 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata)); 1557 dumpvp_write(&ld, sizeof (ld)); 1558 dumpvp_write(mctl->b_rptr, MBLKL(mctl)); 1559 dumpvp_write(mdata->b_rptr, MBLKL(mdata)); 1560 } 1561 } while ((qlast = q) != log_consq); 1562 1563 ld.ld_magic = 0; /* indicate end of messages */ 1564 dumpvp_write(&ld, sizeof (ld)); 1565 (void) dumpvp_flush(); 1566 if (!panicstr) { 1567 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1568 (size_t)(dumpbuf.vp_off - dumpvp_start), 1569 B_INVAL | B_FORCE, kcred, NULL); 1570 } 1571 } 1572 1573 /* 1574 * The following functions are called on multiple CPUs during dump. 1575 * They must not use most kernel services, because all cross-calls are 1576 * disabled during panic. Therefore, blocking locks and cache flushes 1577 * will not work. 1578 */ 1579 1580 /* 1581 * Copy pages, trapping ECC errors. Also, for robustness, trap data 1582 * access in case something goes wrong in the hat layer and the 1583 * mapping is broken. 1584 */ 1585 static int 1586 dump_pagecopy(void *src, void *dst) 1587 { 1588 long *wsrc = (long *)src; 1589 long *wdst = (long *)dst; 1590 const ulong_t ncopies = PAGESIZE / sizeof (long); 1591 volatile int w = 0; 1592 volatile int ueoff = -1; 1593 on_trap_data_t otd; 1594 1595 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) { 1596 if (ueoff == -1) 1597 ueoff = w * sizeof (long); 1598 /* report "bad ECC" or "bad address" */ 1599 #ifdef _LP64 1600 if (otd.ot_trap & OT_DATA_EC) 1601 wdst[w++] = 0x00badecc00badecc; 1602 else 1603 wdst[w++] = 0x00badadd00badadd; 1604 #else 1605 if (otd.ot_trap & OT_DATA_EC) 1606 wdst[w++] = 0x00badecc; 1607 else 1608 wdst[w++] = 0x00badadd; 1609 #endif 1610 } 1611 while (w < ncopies) { 1612 wdst[w] = wsrc[w]; 1613 w++; 1614 } 1615 no_trap(); 1616 return (ueoff); 1617 } 1618 1619 static void 1620 dumpsys_close_cq(cqueue_t *cq, int live) 1621 { 1622 if (live) { 1623 mutex_enter(&cq->mutex); 1624 atomic_dec_uint(&cq->open); 1625 cv_signal(&cq->cv); 1626 mutex_exit(&cq->mutex); 1627 } else { 1628 atomic_dec_uint(&cq->open); 1629 } 1630 } 1631 1632 static inline void 1633 dumpsys_spinlock(lock_t *lp) 1634 { 1635 uint_t backoff = 0; 1636 int loop_count = 0; 1637 1638 while (LOCK_HELD(lp) || !lock_spin_try(lp)) { 1639 if (++loop_count >= ncpus) { 1640 backoff = mutex_lock_backoff(0); 1641 loop_count = 0; 1642 } else { 1643 backoff = mutex_lock_backoff(backoff); 1644 } 1645 mutex_lock_delay(backoff); 1646 } 1647 } 1648 1649 static inline void 1650 dumpsys_spinunlock(lock_t *lp) 1651 { 1652 lock_clear(lp); 1653 } 1654 1655 static inline void 1656 dumpsys_lock(cqueue_t *cq, int live) 1657 { 1658 if (live) 1659 mutex_enter(&cq->mutex); 1660 else 1661 dumpsys_spinlock(&cq->spinlock); 1662 } 1663 1664 static inline void 1665 dumpsys_unlock(cqueue_t *cq, int live, int signal) 1666 { 1667 if (live) { 1668 if (signal) 1669 cv_signal(&cq->cv); 1670 mutex_exit(&cq->mutex); 1671 } else { 1672 dumpsys_spinunlock(&cq->spinlock); 1673 } 1674 } 1675 1676 static void 1677 dumpsys_wait_cq(cqueue_t *cq, int live) 1678 { 1679 if (live) { 1680 cv_wait(&cq->cv, &cq->mutex); 1681 } else { 1682 dumpsys_spinunlock(&cq->spinlock); 1683 while (cq->open) 1684 if (cq->first) 1685 break; 1686 dumpsys_spinlock(&cq->spinlock); 1687 } 1688 } 1689 1690 static void 1691 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live) 1692 { 1693 if (cp == NULL) 1694 return; 1695 1696 dumpsys_lock(cq, live); 1697 1698 if (cq->ts != 0) { 1699 cq->empty += gethrtime() - cq->ts; 1700 cq->ts = 0; 1701 } 1702 1703 cp->state = newstate; 1704 cp->next = NULL; 1705 if (cq->last == NULL) 1706 cq->first = cp; 1707 else 1708 cq->last->next = cp; 1709 cq->last = cp; 1710 1711 dumpsys_unlock(cq, live, 1); 1712 } 1713 1714 static cbuf_t * 1715 dumpsys_get_cq(cqueue_t *cq, int live) 1716 { 1717 cbuf_t *cp; 1718 hrtime_t now = gethrtime(); 1719 1720 dumpsys_lock(cq, live); 1721 1722 /* CONSTCOND */ 1723 while (1) { 1724 cp = (cbuf_t *)cq->first; 1725 if (cp == NULL) { 1726 if (cq->open == 0) 1727 break; 1728 dumpsys_wait_cq(cq, live); 1729 continue; 1730 } 1731 cq->first = cp->next; 1732 if (cq->first == NULL) { 1733 cq->last = NULL; 1734 cq->ts = now; 1735 } 1736 break; 1737 } 1738 1739 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0); 1740 return (cp); 1741 } 1742 1743 /* 1744 * Send an error message to the console. If the main task is running 1745 * just write the message via uprintf. If a helper is running the 1746 * message has to be put on a queue for the main task. Setting fmt to 1747 * NULL means flush the error message buffer. If fmt is not NULL, just 1748 * add the text to the existing buffer. 1749 */ 1750 static void 1751 dumpsys_errmsg(helper_t *hp, const char *fmt, ...) 1752 { 1753 dumpsync_t *ds = hp->ds; 1754 cbuf_t *cp = hp->cperr; 1755 va_list adx; 1756 1757 if (hp->helper == MAINHELPER) { 1758 if (fmt != NULL) { 1759 if (ds->neednl) { 1760 uprintf("\n"); 1761 ds->neednl = 0; 1762 } 1763 va_start(adx, fmt); 1764 vuprintf(fmt, adx); 1765 va_end(adx); 1766 } 1767 } else if (fmt == NULL) { 1768 if (cp != NULL) { 1769 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1770 hp->cperr = NULL; 1771 } 1772 } else { 1773 if (hp->cperr == NULL) { 1774 cp = CQ_GET(freebufq); 1775 hp->cperr = cp; 1776 cp->used = 0; 1777 } 1778 va_start(adx, fmt); 1779 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used, 1780 fmt, adx); 1781 va_end(adx); 1782 if ((cp->used + LOG_MSGSIZE) > cp->size) { 1783 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1784 hp->cperr = NULL; 1785 } 1786 } 1787 } 1788 1789 /* 1790 * Write an output buffer to the dump file. If the main task is 1791 * running just write the data. If a helper is running the output is 1792 * placed on a queue for the main task. 1793 */ 1794 static void 1795 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used) 1796 { 1797 dumpsync_t *ds = hp->ds; 1798 1799 if (hp->helper == MAINHELPER) { 1800 HRSTART(ds->perpage, write); 1801 dumpvp_write(cp->buf, used); 1802 HRSTOP(ds->perpage, write); 1803 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 1804 } else { 1805 cp->used = used; 1806 CQ_PUT(mainq, cp, CBUF_WRITE); 1807 } 1808 } 1809 1810 /* 1811 * Copy one page within the mapped range. The offset starts at 0 and 1812 * is relative to the first pfn. cp->buf + cp->off is the address of 1813 * the first pfn. If dump_pagecopy returns a UE offset, create an 1814 * error message. Returns the offset to the next pfn in the range 1815 * selected by the bitmap. 1816 */ 1817 static int 1818 dumpsys_copy_page(helper_t *hp, int offset) 1819 { 1820 cbuf_t *cp = hp->cpin; 1821 int ueoff; 1822 1823 ASSERT(cp->off + offset + PAGESIZE <= cp->size); 1824 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum)); 1825 1826 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page); 1827 1828 /* ueoff is the offset in the page to a UE error */ 1829 if (ueoff != -1) { 1830 uint64_t pa = ptob(cp->pfn) + offset + ueoff; 1831 1832 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n", 1833 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa); 1834 } 1835 1836 /* 1837 * Advance bitnum and offset to the next input page for the 1838 * next call to this function. 1839 */ 1840 offset += PAGESIZE; 1841 cp->bitnum++; 1842 while (cp->off + offset < cp->size) { 1843 if (BT_TEST(dumpcfg.bitmap, cp->bitnum)) 1844 break; 1845 offset += PAGESIZE; 1846 cp->bitnum++; 1847 } 1848 1849 return (offset); 1850 } 1851 1852 /* 1853 * Read the helper queue, and copy one mapped page. Return 0 when 1854 * done. Return 1 when a page has been copied into hp->page. 1855 */ 1856 static int 1857 dumpsys_sread(helper_t *hp) 1858 { 1859 dumpsync_t *ds = hp->ds; 1860 1861 /* CONSTCOND */ 1862 while (1) { 1863 1864 /* Find the next input buffer. */ 1865 if (hp->cpin == NULL) { 1866 HRSTART(hp->perpage, inwait); 1867 1868 /* CONSTCOND */ 1869 while (1) { 1870 hp->cpin = CQ_GET(helperq); 1871 dump_timeleft = dump_timeout; 1872 1873 /* 1874 * NULL return means the helper queue 1875 * is closed and empty. 1876 */ 1877 if (hp->cpin == NULL) 1878 break; 1879 1880 /* Have input, check for dump I/O error. */ 1881 if (!dump_ioerr) 1882 break; 1883 1884 /* 1885 * If an I/O error occurs, stay in the 1886 * loop in order to empty the helper 1887 * queue. Return the buffers to the 1888 * main task to unmap and free it. 1889 */ 1890 hp->cpin->used = 0; 1891 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1892 } 1893 HRSTOP(hp->perpage, inwait); 1894 1895 /* Stop here when the helper queue is closed. */ 1896 if (hp->cpin == NULL) 1897 break; 1898 1899 /* Set the offset=0 to get the first pfn. */ 1900 hp->in = 0; 1901 1902 /* Set the total processed to 0 */ 1903 hp->used = 0; 1904 } 1905 1906 /* Process the next page. */ 1907 if (hp->used < hp->cpin->used) { 1908 1909 /* 1910 * Get the next page from the input buffer and 1911 * return a copy. 1912 */ 1913 ASSERT(hp->in != -1); 1914 HRSTART(hp->perpage, copy); 1915 hp->in = dumpsys_copy_page(hp, hp->in); 1916 hp->used += PAGESIZE; 1917 HRSTOP(hp->perpage, copy); 1918 break; 1919 1920 } else { 1921 1922 /* 1923 * Done with the input. Flush the VM and 1924 * return the buffer to the main task. 1925 */ 1926 if (panicstr && hp->helper != MAINHELPER) 1927 hat_flush_range(kas.a_hat, 1928 hp->cpin->buf, hp->cpin->size); 1929 dumpsys_errmsg(hp, NULL); 1930 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1931 hp->cpin = NULL; 1932 } 1933 } 1934 1935 return (hp->cpin != NULL); 1936 } 1937 1938 /* 1939 * Compress size bytes starting at buf with bzip2 1940 * mode: 1941 * BZ_RUN add one more compressed page 1942 * BZ_FINISH no more input, flush the state 1943 */ 1944 static void 1945 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode) 1946 { 1947 dumpsync_t *ds = hp->ds; 1948 const int CSIZE = sizeof (dumpcsize_t); 1949 bz_stream *ps = &hp->bzstream; 1950 int rc = 0; 1951 uint32_t csize; 1952 dumpcsize_t cs; 1953 1954 /* Set input pointers to new input page */ 1955 if (size > 0) { 1956 ps->avail_in = size; 1957 ps->next_in = buf; 1958 } 1959 1960 /* CONSTCOND */ 1961 while (1) { 1962 1963 /* Quit when all input has been consumed */ 1964 if (ps->avail_in == 0 && mode == BZ_RUN) 1965 break; 1966 1967 /* Get a new output buffer */ 1968 if (hp->cpout == NULL) { 1969 HRSTART(hp->perpage, outwait); 1970 hp->cpout = CQ_GET(freebufq); 1971 HRSTOP(hp->perpage, outwait); 1972 ps->avail_out = hp->cpout->size - CSIZE; 1973 ps->next_out = hp->cpout->buf + CSIZE; 1974 } 1975 1976 /* Compress input, or finalize */ 1977 HRSTART(hp->perpage, compress); 1978 rc = BZ2_bzCompress(ps, mode); 1979 HRSTOP(hp->perpage, compress); 1980 1981 /* Check for error */ 1982 if (mode == BZ_RUN && rc != BZ_RUN_OK) { 1983 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n", 1984 hp->helper, BZ2_bzErrorString(rc), 1985 hp->cpin->pagenum); 1986 break; 1987 } 1988 1989 /* Write the buffer if it is full, or we are flushing */ 1990 if (ps->avail_out == 0 || mode == BZ_FINISH) { 1991 csize = hp->cpout->size - CSIZE - ps->avail_out; 1992 cs = DUMP_SET_TAG(csize, hp->tag); 1993 if (csize > 0) { 1994 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 1995 dumpsys_swrite(hp, hp->cpout, csize + CSIZE); 1996 hp->cpout = NULL; 1997 } 1998 } 1999 2000 /* Check for final complete */ 2001 if (mode == BZ_FINISH) { 2002 if (rc == BZ_STREAM_END) 2003 break; 2004 if (rc != BZ_FINISH_OK) { 2005 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n", 2006 hp->helper, BZ2_bzErrorString(rc)); 2007 break; 2008 } 2009 } 2010 } 2011 2012 /* Cleanup state and buffers */ 2013 if (mode == BZ_FINISH) { 2014 2015 /* Reset state so that it is re-usable. */ 2016 (void) BZ2_bzCompressReset(&hp->bzstream); 2017 2018 /* Give any unused outout buffer to the main task */ 2019 if (hp->cpout != NULL) { 2020 hp->cpout->used = 0; 2021 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG); 2022 hp->cpout = NULL; 2023 } 2024 } 2025 } 2026 2027 static void 2028 dumpsys_bz2compress(helper_t *hp) 2029 { 2030 dumpsync_t *ds = hp->ds; 2031 dumpstreamhdr_t sh; 2032 2033 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2034 sh.stream_pagenum = (pgcnt_t)-1; 2035 sh.stream_npages = 0; 2036 hp->cpin = NULL; 2037 hp->cpout = NULL; 2038 hp->cperr = NULL; 2039 hp->in = 0; 2040 hp->out = 0; 2041 hp->bzstream.avail_in = 0; 2042 2043 /* Bump reference to mainq while we are running */ 2044 CQ_OPEN(mainq); 2045 2046 /* Get one page at a time */ 2047 while (dumpsys_sread(hp)) { 2048 if (sh.stream_pagenum != hp->cpin->pagenum) { 2049 sh.stream_pagenum = hp->cpin->pagenum; 2050 sh.stream_npages = btop(hp->cpin->used); 2051 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN); 2052 } 2053 dumpsys_bzrun(hp, hp->page, PAGESIZE, 0); 2054 } 2055 2056 /* Done with input, flush any partial buffer */ 2057 if (sh.stream_pagenum != (pgcnt_t)-1) { 2058 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH); 2059 dumpsys_errmsg(hp, NULL); 2060 } 2061 2062 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2063 2064 /* Decrement main queue count, we are done */ 2065 CQ_CLOSE(mainq); 2066 } 2067 2068 /* 2069 * Compress with lzjb 2070 * write stream block if full or size==0 2071 * if csize==0 write stream header, else write <csize, data> 2072 * size==0 is a call to flush a buffer 2073 * hp->cpout is the buffer we are flushing or filling 2074 * hp->out is the next index to fill data 2075 * osize is either csize+data, or the size of a stream header 2076 */ 2077 static void 2078 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size) 2079 { 2080 dumpsync_t *ds = hp->ds; 2081 const int CSIZE = sizeof (dumpcsize_t); 2082 dumpcsize_t cs; 2083 size_t osize = csize > 0 ? CSIZE + size : size; 2084 2085 /* If flush, and there is no buffer, just return */ 2086 if (size == 0 && hp->cpout == NULL) 2087 return; 2088 2089 /* If flush, or cpout is full, write it out */ 2090 if (size == 0 || 2091 hp->cpout != NULL && hp->out + osize > hp->cpout->size) { 2092 2093 /* Set tag+size word at the front of the stream block. */ 2094 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag); 2095 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 2096 2097 /* Write block to dump file. */ 2098 dumpsys_swrite(hp, hp->cpout, hp->out); 2099 2100 /* Clear pointer to indicate we need a new buffer */ 2101 hp->cpout = NULL; 2102 2103 /* flushing, we are done */ 2104 if (size == 0) 2105 return; 2106 } 2107 2108 /* Get an output buffer if we dont have one. */ 2109 if (hp->cpout == NULL) { 2110 HRSTART(hp->perpage, outwait); 2111 hp->cpout = CQ_GET(freebufq); 2112 HRSTOP(hp->perpage, outwait); 2113 hp->out = CSIZE; 2114 } 2115 2116 /* Store csize word. This is the size of compressed data. */ 2117 if (csize > 0) { 2118 cs = DUMP_SET_TAG(csize, 0); 2119 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE); 2120 hp->out += CSIZE; 2121 } 2122 2123 /* Store the data. */ 2124 (void) memcpy(hp->cpout->buf + hp->out, buf, size); 2125 hp->out += size; 2126 } 2127 2128 static void 2129 dumpsys_lzjbcompress(helper_t *hp) 2130 { 2131 dumpsync_t *ds = hp->ds; 2132 size_t csize; 2133 dumpstreamhdr_t sh; 2134 2135 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2136 sh.stream_pagenum = (pfn_t)-1; 2137 sh.stream_npages = 0; 2138 hp->cpin = NULL; 2139 hp->cpout = NULL; 2140 hp->cperr = NULL; 2141 hp->in = 0; 2142 hp->out = 0; 2143 2144 /* Bump reference to mainq while we are running */ 2145 CQ_OPEN(mainq); 2146 2147 /* Get one page at a time */ 2148 while (dumpsys_sread(hp)) { 2149 2150 /* Create a stream header for each new input map */ 2151 if (sh.stream_pagenum != hp->cpin->pagenum) { 2152 sh.stream_pagenum = hp->cpin->pagenum; 2153 sh.stream_npages = btop(hp->cpin->used); 2154 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh)); 2155 } 2156 2157 /* Compress one page */ 2158 HRSTART(hp->perpage, compress); 2159 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2160 HRSTOP(hp->perpage, compress); 2161 2162 /* Add csize+data to output block */ 2163 ASSERT(csize > 0 && csize <= PAGESIZE); 2164 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize); 2165 } 2166 2167 /* Done with input, flush any partial buffer */ 2168 if (sh.stream_pagenum != (pfn_t)-1) { 2169 dumpsys_lzjbrun(hp, 0, NULL, 0); 2170 dumpsys_errmsg(hp, NULL); 2171 } 2172 2173 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2174 2175 /* Decrement main queue count, we are done */ 2176 CQ_CLOSE(mainq); 2177 } 2178 2179 /* 2180 * Dump helper called from panic_idle() to compress pages. CPUs in 2181 * this path must not call most kernel services. 2182 * 2183 * During panic, all but one of the CPUs is idle. These CPUs are used 2184 * as helpers working in parallel to copy and compress memory 2185 * pages. During a panic, however, these processors cannot call any 2186 * kernel services. This is because mutexes become no-ops during 2187 * panic, and, cross-call interrupts are inhibited. Therefore, during 2188 * panic dump the helper CPUs communicate with the panic CPU using 2189 * memory variables. All memory mapping and I/O is performed by the 2190 * panic CPU. 2191 * 2192 * At dump configuration time, helper_lock is set and helpers_wanted 2193 * is 0. dumpsys() decides whether to set helpers_wanted before 2194 * clearing helper_lock. 2195 * 2196 * At panic time, idle CPUs spin-wait on helper_lock, then alternately 2197 * take the lock and become a helper, or return. 2198 */ 2199 void 2200 dumpsys_helper() 2201 { 2202 dumpsys_spinlock(&dumpcfg.helper_lock); 2203 if (dumpcfg.helpers_wanted) { 2204 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2205 2206 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2207 if (hp->helper == FREEHELPER) { 2208 hp->helper = CPU->cpu_id; 2209 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2210 2211 dumpsys_spinunlock(&dumpcfg.helper_lock); 2212 2213 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2214 dumpsys_lzjbcompress(hp); 2215 else 2216 dumpsys_bz2compress(hp); 2217 2218 hp->helper = DONEHELPER; 2219 return; 2220 } 2221 } 2222 2223 /* No more helpers are needed. */ 2224 dumpcfg.helpers_wanted = 0; 2225 2226 } 2227 dumpsys_spinunlock(&dumpcfg.helper_lock); 2228 } 2229 2230 /* 2231 * No-wait helper callable in spin loops. 2232 * 2233 * Do not wait for helper_lock. Just check helpers_wanted. The caller 2234 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s" 2235 * case. 2236 */ 2237 void 2238 dumpsys_helper_nw() 2239 { 2240 if (dumpcfg.helpers_wanted) 2241 dumpsys_helper(); 2242 } 2243 2244 /* 2245 * Dump helper for live dumps. 2246 * These run as a system task. 2247 */ 2248 static void 2249 dumpsys_live_helper(void *arg) 2250 { 2251 helper_t *hp = arg; 2252 2253 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2254 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2255 dumpsys_lzjbcompress(hp); 2256 else 2257 dumpsys_bz2compress(hp); 2258 } 2259 2260 /* 2261 * Compress one page with lzjb (single threaded case) 2262 */ 2263 static void 2264 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) 2265 { 2266 dumpsync_t *ds = hp->ds; 2267 uint32_t csize; 2268 2269 hp->helper = MAINHELPER; 2270 hp->in = 0; 2271 hp->used = 0; 2272 hp->cpin = cp; 2273 while (hp->used < cp->used) { 2274 HRSTART(hp->perpage, copy); 2275 hp->in = dumpsys_copy_page(hp, hp->in); 2276 hp->used += PAGESIZE; 2277 HRSTOP(hp->perpage, copy); 2278 2279 HRSTART(hp->perpage, compress); 2280 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2281 HRSTOP(hp->perpage, compress); 2282 2283 HRSTART(hp->perpage, write); 2284 dumpvp_write(&csize, sizeof (csize)); 2285 dumpvp_write(hp->lzbuf, csize); 2286 HRSTOP(hp->perpage, write); 2287 } 2288 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 2289 hp->cpin = NULL; 2290 } 2291 2292 /* 2293 * Main task to dump pages. This is called on the dump CPU. 2294 */ 2295 static void 2296 dumpsys_main_task(void *arg) 2297 { 2298 dumpsync_t *ds = arg; 2299 pgcnt_t pagenum = 0, bitnum = 0, hibitnum; 2300 dumpmlw_t mlw; 2301 cbuf_t *cp; 2302 pgcnt_t baseoff, pfnoff; 2303 pfn_t base, pfn; 2304 int i, dumpserial; 2305 2306 /* 2307 * Fall back to serial mode if there are no helpers. 2308 * dump_plat_mincpu can be set to 0 at any time. 2309 * dumpcfg.helpermap must contain at least one member. 2310 */ 2311 dumpserial = 1; 2312 2313 if (dump_plat_mincpu != 0 && dumpcfg.clevel != 0) { 2314 for (i = 0; i < BT_BITOUL(NCPU); ++i) { 2315 if (dumpcfg.helpermap[i] != 0) { 2316 dumpserial = 0; 2317 break; 2318 } 2319 } 2320 } 2321 2322 if (dumpserial) { 2323 dumpcfg.clevel = 0; 2324 if (dumpcfg.helper[0].lzbuf == NULL) 2325 dumpcfg.helper[0].lzbuf = dumpcfg.helper[1].page; 2326 } 2327 2328 dump_init_memlist_walker(&mlw); 2329 2330 for (;;) { 2331 int sec = (gethrtime() - ds->start) / NANOSEC; 2332 2333 /* 2334 * Render a simple progress display on the system console to 2335 * make clear to the operator that the system has not hung. 2336 * Emit an update when dump progress has advanced by one 2337 * percent, or when no update has been drawn in the last 2338 * second. 2339 */ 2340 if (ds->percent > ds->percent_done || sec > ds->sec_done) { 2341 ds->sec_done = sec; 2342 ds->percent_done = ds->percent; 2343 uprintf("^\rdumping: %2d:%02d %3d%% done", 2344 sec / 60, sec % 60, ds->percent); 2345 ds->neednl = 1; 2346 } 2347 2348 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { 2349 2350 /* the writerq never blocks */ 2351 cp = CQ_GET(writerq); 2352 if (cp == NULL) 2353 break; 2354 2355 dump_timeleft = dump_timeout; 2356 2357 HRSTART(ds->perpage, write); 2358 dumpvp_write(cp->buf, cp->used); 2359 HRSTOP(ds->perpage, write); 2360 2361 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2362 } 2363 2364 /* 2365 * Wait here for some buffers to process. Returns NULL 2366 * when all helpers have terminated and all buffers 2367 * have been processed. 2368 */ 2369 cp = CQ_GET(mainq); 2370 2371 if (cp == NULL) { 2372 2373 /* Drain the write queue. */ 2374 if (!CQ_IS_EMPTY(writerq)) 2375 continue; 2376 2377 /* Main task exits here. */ 2378 break; 2379 } 2380 2381 dump_timeleft = dump_timeout; 2382 2383 switch (cp->state) { 2384 2385 case CBUF_FREEMAP: 2386 2387 /* 2388 * Note that we drop CBUF_FREEMAP buffers on 2389 * the floor (they will not be on any cqueue) 2390 * when we no longer need them. 2391 */ 2392 if (bitnum >= dumpcfg.bitmapsize) 2393 break; 2394 2395 if (dump_ioerr) { 2396 bitnum = dumpcfg.bitmapsize; 2397 CQ_CLOSE(helperq); 2398 break; 2399 } 2400 2401 HRSTART(ds->perpage, bitmap); 2402 for (; bitnum < dumpcfg.bitmapsize; bitnum++) 2403 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2404 break; 2405 HRSTOP(ds->perpage, bitmap); 2406 dump_timeleft = dump_timeout; 2407 2408 if (bitnum >= dumpcfg.bitmapsize) { 2409 CQ_CLOSE(helperq); 2410 break; 2411 } 2412 2413 /* 2414 * Try to map CBUF_MAPSIZE ranges. Can't 2415 * assume that memory segment size is a 2416 * multiple of CBUF_MAPSIZE. Can't assume that 2417 * the segment starts on a CBUF_MAPSIZE 2418 * boundary. 2419 */ 2420 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2421 ASSERT(pfn != PFN_INVALID); 2422 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); 2423 2424 base = P2ALIGN(pfn, CBUF_MAPNP); 2425 if (base < mlw.mpaddr) { 2426 base = mlw.mpaddr; 2427 baseoff = P2PHASE(base, CBUF_MAPNP); 2428 } else { 2429 baseoff = 0; 2430 } 2431 2432 pfnoff = pfn - base; 2433 if (pfnoff + mlw.mpleft < CBUF_MAPNP) { 2434 hibitnum = bitnum + mlw.mpleft; 2435 cp->size = ptob(pfnoff + mlw.mpleft); 2436 } else { 2437 hibitnum = bitnum - pfnoff + CBUF_MAPNP - 2438 baseoff; 2439 cp->size = CBUF_MAPSIZE - ptob(baseoff); 2440 } 2441 2442 cp->pfn = pfn; 2443 cp->bitnum = bitnum++; 2444 cp->pagenum = pagenum++; 2445 cp->off = ptob(pfnoff); 2446 2447 for (; bitnum < hibitnum; bitnum++) 2448 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2449 pagenum++; 2450 2451 dump_timeleft = dump_timeout; 2452 cp->used = ptob(pagenum - cp->pagenum); 2453 2454 HRSTART(ds->perpage, map); 2455 hat_devload(kas.a_hat, cp->buf, cp->size, base, 2456 PROT_READ, HAT_LOAD_NOCONSIST); 2457 HRSTOP(ds->perpage, map); 2458 2459 ds->pages_mapped += btop(cp->size); 2460 ds->pages_used += pagenum - cp->pagenum; 2461 2462 CQ_OPEN(mainq); 2463 2464 /* 2465 * If there are no helpers the main task does 2466 * non-streams lzjb compress. 2467 */ 2468 if (dumpserial) { 2469 dumpsys_lzjb_page(dumpcfg.helper, cp); 2470 break; 2471 } 2472 2473 /* pass mapped pages to a helper */ 2474 CQ_PUT(helperq, cp, CBUF_INREADY); 2475 2476 /* the last page was done */ 2477 if (bitnum >= dumpcfg.bitmapsize) 2478 CQ_CLOSE(helperq); 2479 2480 break; 2481 2482 case CBUF_USEDMAP: 2483 2484 ds->npages += btop(cp->used); 2485 2486 HRSTART(ds->perpage, unmap); 2487 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); 2488 HRSTOP(ds->perpage, unmap); 2489 2490 if (bitnum < dumpcfg.bitmapsize) 2491 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2492 CQ_CLOSE(mainq); 2493 2494 ASSERT(ds->npages <= dumphdr->dump_npages); 2495 ds->percent = ds->npages * 100LL / dumphdr->dump_npages; 2496 break; 2497 2498 case CBUF_WRITE: 2499 2500 CQ_PUT(writerq, cp, CBUF_WRITE); 2501 break; 2502 2503 case CBUF_ERRMSG: 2504 2505 if (cp->used > 0) { 2506 cp->buf[cp->size - 2] = '\n'; 2507 cp->buf[cp->size - 1] = '\0'; 2508 if (ds->neednl) { 2509 uprintf("\n%s", cp->buf); 2510 ds->neednl = 0; 2511 } else { 2512 uprintf("%s", cp->buf); 2513 } 2514 /* wait for console output */ 2515 drv_usecwait(200000); 2516 dump_timeleft = dump_timeout; 2517 } 2518 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2519 break; 2520 2521 default: 2522 uprintf("dump: unexpected buffer state %d, " 2523 "buffer will be lost\n", cp->state); 2524 break; 2525 2526 } /* end switch */ 2527 } 2528 } 2529 2530 #ifdef COLLECT_METRICS 2531 size_t 2532 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) 2533 { 2534 dumpcfg_t *cfg = &dumpcfg; 2535 int myid = CPU->cpu_seqid; 2536 int i, compress_ratio; 2537 int sec, iorate; 2538 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; 2539 char *e = buf + size; 2540 char *p = buf; 2541 2542 sec = ds->elapsed / (1000 * 1000 * 1000ULL); 2543 if (sec < 1) 2544 sec = 1; 2545 2546 if (ds->iotime < 1) 2547 ds->iotime = 1; 2548 iorate = (ds->nwrite * 100000ULL) / ds->iotime; 2549 2550 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1); 2551 2552 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0) 2553 2554 P("Master cpu_seqid,%d\n", CPU->cpu_seqid); 2555 P("Master cpu_id,%d\n", CPU->cpu_id); 2556 P("dump_flags,0x%x\n", dumphdr->dump_flags); 2557 P("dump_ioerr,%d\n", dump_ioerr); 2558 2559 P("Helpers:\n"); 2560 for (i = 0; i < ncpus; i++) { 2561 if ((i & 15) == 0) 2562 P(",,%03d,", i); 2563 if (i == myid) 2564 P(" M"); 2565 else if (BT_TEST(cfg->helpermap, i)) 2566 P("%4d", cpu_seq[i]->cpu_id); 2567 else 2568 P(" *"); 2569 if ((i & 15) == 15) 2570 P("\n"); 2571 } 2572 2573 P("ncbuf_used,%d\n", cfg->ncbuf_used); 2574 P("ncmap,%d\n", cfg->ncmap); 2575 2576 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); 2577 P("Found small pages,%ld\n", cfg->foundsm); 2578 2579 P("Compression level,%d\n", cfg->clevel); 2580 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel", 2581 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb"); 2582 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % 2583 100); 2584 P("nhelper_used,%d\n", cfg->nhelper_used); 2585 2586 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); 2587 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); 2588 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); 2589 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); 2590 P("dumpbuf.size,%ld\n", dumpbuf.size); 2591 2592 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); 2593 P("Dump pages,%llu\n", (u_longlong_t)ds->npages); 2594 P("Dump time,%d\n", sec); 2595 2596 if (ds->pages_mapped > 0) 2597 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) 2598 / ds->pages_mapped)); 2599 2600 P("\nPer-page metrics:\n"); 2601 if (ds->npages > 0) { 2602 for (hp = cfg->helper; hp != hpend; hp++) { 2603 #define PERPAGE(x) ds->perpage.x += hp->perpage.x; 2604 PERPAGES; 2605 #undef PERPAGE 2606 } 2607 #define PERPAGE(x) \ 2608 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); 2609 PERPAGES; 2610 #undef PERPAGE 2611 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / 2612 ds->npages)); 2613 P("helperq.empty,%d\n", (int)(ds->helperq.empty / 2614 ds->npages)); 2615 P("writerq.empty,%d\n", (int)(ds->writerq.empty / 2616 ds->npages)); 2617 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); 2618 2619 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / 2620 ds->npages)); 2621 } 2622 #undef P 2623 if (p < e) 2624 bzero(p, e - p); 2625 return (p - buf); 2626 } 2627 #endif /* COLLECT_METRICS */ 2628 2629 /* 2630 * Dump the system. 2631 */ 2632 void 2633 dumpsys(void) 2634 { 2635 dumpsync_t *ds = &dumpsync; 2636 taskq_t *livetaskq = NULL; 2637 pfn_t pfn; 2638 pgcnt_t bitnum; 2639 proc_t *p; 2640 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2641 cbuf_t *cp; 2642 pid_t npids, pidx; 2643 char *content; 2644 char *buf; 2645 size_t size; 2646 int save_dump_clevel; 2647 dumpmlw_t mlw; 2648 dumpcsize_t datatag; 2649 dumpdatahdr_t datahdr; 2650 2651 if (dumpvp == NULL || dumphdr == NULL) { 2652 uprintf("skipping system dump - no dump device configured\n"); 2653 if (panicstr) { 2654 dumpcfg.helpers_wanted = 0; 2655 dumpsys_spinunlock(&dumpcfg.helper_lock); 2656 } 2657 return; 2658 } 2659 dumpbuf.cur = dumpbuf.start; 2660 2661 /* clear the sync variables */ 2662 ASSERT(dumpcfg.nhelper > 0); 2663 bzero(ds, sizeof (*ds)); 2664 ds->dumpcpu = CPU->cpu_id; 2665 2666 /* 2667 * Calculate the starting block for dump. If we're dumping on a 2668 * swap device, start 1/5 of the way in; otherwise, start at the 2669 * beginning. And never use the first page -- it may be a disk label. 2670 */ 2671 if (dumpvp->v_flag & VISSWAP) 2672 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET); 2673 else 2674 dumphdr->dump_start = DUMP_OFFSET; 2675 2676 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; 2677 dumphdr->dump_crashtime = gethrestime_sec(); 2678 dumphdr->dump_npages = 0; 2679 dumphdr->dump_nvtop = 0; 2680 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); 2681 dump_timeleft = dump_timeout; 2682 2683 if (panicstr) { 2684 dumphdr->dump_flags &= ~DF_LIVE; 2685 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL); 2686 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); 2687 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, 2688 panicstr, panicargs); 2689 2690 } 2691 2692 if (dump_conflags & DUMP_ALL) 2693 content = "all"; 2694 else if (dump_conflags & DUMP_CURPROC) 2695 content = "kernel + curproc"; 2696 else 2697 content = "kernel"; 2698 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, 2699 dumphdr->dump_start, content); 2700 2701 /* Make sure nodename is current */ 2702 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); 2703 2704 /* 2705 * If this is a live dump, try to open a VCHR vnode for better 2706 * performance. We must take care to flush the buffer cache 2707 * first. 2708 */ 2709 if (!panicstr) { 2710 vnode_t *cdev_vp, *cmn_cdev_vp; 2711 2712 ASSERT(dumpbuf.cdev_vp == NULL); 2713 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); 2714 if (cdev_vp != NULL) { 2715 cmn_cdev_vp = common_specvp(cdev_vp); 2716 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) 2717 == 0) { 2718 if (vn_has_cached_data(dumpvp)) 2719 (void) pvn_vplist_dirty(dumpvp, 0, NULL, 2720 B_INVAL | B_TRUNC, kcred); 2721 dumpbuf.cdev_vp = cmn_cdev_vp; 2722 } else { 2723 VN_RELE(cdev_vp); 2724 } 2725 } 2726 } 2727 2728 /* 2729 * Store a hires timestamp so we can look it up during debugging. 2730 */ 2731 lbolt_debug_entry(); 2732 2733 /* 2734 * Leave room for the message and ereport save areas and terminal dump 2735 * header. 2736 */ 2737 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - 2738 DUMP_ERPTSIZE; 2739 2740 /* 2741 * Write out the symbol table. It's no longer compressed, 2742 * so its 'size' and 'csize' are equal. 2743 */ 2744 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; 2745 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = 2746 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); 2747 2748 /* 2749 * Write out the translation map. 2750 */ 2751 dumphdr->dump_map = dumpvp_flush(); 2752 dump_as(&kas); 2753 dumphdr->dump_nvtop += dump_plat_addr(); 2754 2755 /* 2756 * call into hat, which may have unmapped pages that also need to 2757 * be in the dump 2758 */ 2759 hat_dump(); 2760 2761 if (dump_conflags & DUMP_ALL) { 2762 mutex_enter(&pidlock); 2763 2764 for (npids = 0, p = practive; p != NULL; p = p->p_next) 2765 dumpcfg.pids[npids++] = p->p_pid; 2766 2767 mutex_exit(&pidlock); 2768 2769 for (pidx = 0; pidx < npids; pidx++) 2770 (void) dump_process(dumpcfg.pids[pidx]); 2771 2772 dump_init_memlist_walker(&mlw); 2773 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2774 dump_timeleft = dump_timeout; 2775 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2776 /* 2777 * Some hypervisors do not have all pages available to 2778 * be accessed by the guest OS. Check for page 2779 * accessibility. 2780 */ 2781 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) != 2782 PLAT_HOLD_OK) 2783 continue; 2784 BT_SET(dumpcfg.bitmap, bitnum); 2785 } 2786 dumphdr->dump_npages = dumpcfg.bitmapsize; 2787 dumphdr->dump_flags |= DF_ALL; 2788 2789 } else if (dump_conflags & DUMP_CURPROC) { 2790 /* 2791 * Determine which pid is to be dumped. If we're panicking, we 2792 * dump the process associated with panic_thread (if any). If 2793 * this is a live dump, we dump the process associated with 2794 * curthread. 2795 */ 2796 npids = 0; 2797 if (panicstr) { 2798 if (panic_thread != NULL && 2799 panic_thread->t_procp != NULL && 2800 panic_thread->t_procp != &p0) { 2801 dumpcfg.pids[npids++] = 2802 panic_thread->t_procp->p_pid; 2803 } 2804 } else { 2805 dumpcfg.pids[npids++] = curthread->t_procp->p_pid; 2806 } 2807 2808 if (npids && dump_process(dumpcfg.pids[0]) == 0) 2809 dumphdr->dump_flags |= DF_CURPROC; 2810 else 2811 dumphdr->dump_flags |= DF_KERNEL; 2812 2813 } else { 2814 dumphdr->dump_flags |= DF_KERNEL; 2815 } 2816 2817 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1; 2818 2819 /* 2820 * Write out the pfn table. 2821 */ 2822 dumphdr->dump_pfn = dumpvp_flush(); 2823 dump_init_memlist_walker(&mlw); 2824 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2825 dump_timeleft = dump_timeout; 2826 if (!BT_TEST(dumpcfg.bitmap, bitnum)) 2827 continue; 2828 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2829 ASSERT(pfn != PFN_INVALID); 2830 dumpvp_write(&pfn, sizeof (pfn_t)); 2831 } 2832 dump_plat_pfn(); 2833 2834 /* 2835 * Write out all the pages. 2836 * Map pages, copy them handling UEs, compress, and write them out. 2837 * Cooperate with any helpers running on CPUs in panic_idle(). 2838 */ 2839 dumphdr->dump_data = dumpvp_flush(); 2840 2841 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); 2842 ds->live = dumpcfg.clevel > 0 && 2843 (dumphdr->dump_flags & DF_LIVE) != 0; 2844 2845 save_dump_clevel = dumpcfg.clevel; 2846 if (panicstr) 2847 dumpsys_get_maxmem(); 2848 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2849 dumpcfg.clevel = DUMP_CLEVEL_LZJB; 2850 2851 dumpcfg.nhelper_used = 0; 2852 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2853 if (hp->page == NULL) { 2854 hp->helper = DONEHELPER; 2855 continue; 2856 } 2857 ++dumpcfg.nhelper_used; 2858 hp->helper = FREEHELPER; 2859 hp->taskqid = NULL; 2860 hp->ds = ds; 2861 bzero(&hp->perpage, sizeof (hp->perpage)); 2862 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2863 (void) BZ2_bzCompressReset(&hp->bzstream); 2864 } 2865 2866 CQ_OPEN(freebufq); 2867 CQ_OPEN(helperq); 2868 2869 dumpcfg.ncbuf_used = 0; 2870 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { 2871 if (cp->buf != NULL) { 2872 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2873 ++dumpcfg.ncbuf_used; 2874 } 2875 } 2876 2877 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) 2878 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2879 2880 ds->start = gethrtime(); 2881 ds->iowaitts = ds->start; 2882 2883 /* start helpers */ 2884 if (ds->live) { 2885 int n = dumpcfg.nhelper_used; 2886 int pri = MINCLSYSPRI - 25; 2887 2888 livetaskq = taskq_create("LiveDump", n, pri, n, n, 2889 TASKQ_PREPOPULATE); 2890 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2891 if (hp->page == NULL) 2892 continue; 2893 hp->helper = hp - dumpcfg.helper; 2894 hp->taskqid = taskq_dispatch(livetaskq, 2895 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); 2896 } 2897 2898 } else { 2899 if (panicstr) 2900 kmem_dump_begin(); 2901 dumpcfg.helpers_wanted = dumpcfg.clevel > 0; 2902 dumpsys_spinunlock(&dumpcfg.helper_lock); 2903 } 2904 2905 /* run main task */ 2906 dumpsys_main_task(ds); 2907 2908 ds->elapsed = gethrtime() - ds->start; 2909 if (ds->elapsed < 1) 2910 ds->elapsed = 1; 2911 2912 if (livetaskq != NULL) 2913 taskq_destroy(livetaskq); 2914 2915 if (ds->neednl) { 2916 uprintf("\n"); 2917 ds->neednl = 0; 2918 } 2919 2920 /* record actual pages dumped */ 2921 dumphdr->dump_npages = ds->npages; 2922 2923 /* platform-specific data */ 2924 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); 2925 2926 /* note any errors by clearing DF_COMPLETE */ 2927 if (dump_ioerr || ds->npages < dumphdr->dump_npages) 2928 dumphdr->dump_flags &= ~DF_COMPLETE; 2929 2930 /* end of stream blocks */ 2931 datatag = 0; 2932 dumpvp_write(&datatag, sizeof (datatag)); 2933 2934 bzero(&datahdr, sizeof (datahdr)); 2935 2936 /* buffer for metrics */ 2937 buf = dumpcfg.cbuf[0].buf; 2938 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - 2939 sizeof (dumpdatahdr_t)); 2940 2941 /* finish the kmem intercepts, collect kmem verbose info */ 2942 if (panicstr) { 2943 datahdr.dump_metrics = kmem_dump_finish(buf, size); 2944 buf += datahdr.dump_metrics; 2945 size -= datahdr.dump_metrics; 2946 } 2947 2948 /* record in the header whether this is a fault-management panic */ 2949 if (panicstr) 2950 dumphdr->dump_fm_panic = is_fm_panic(); 2951 2952 /* compression info in data header */ 2953 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; 2954 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; 2955 datahdr.dump_maxcsize = CBUF_SIZE; 2956 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; 2957 datahdr.dump_nstreams = dumpcfg.nhelper_used; 2958 datahdr.dump_clevel = dumpcfg.clevel; 2959 #ifdef COLLECT_METRICS 2960 if (dump_metrics_on) 2961 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size); 2962 #endif 2963 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; 2964 2965 /* 2966 * Write out the initial and terminal dump headers. 2967 */ 2968 dumpbuf.vp_off = dumphdr->dump_start; 2969 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2970 (void) dumpvp_flush(); 2971 2972 dumpbuf.vp_limit = dumpvp_size; 2973 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; 2974 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2975 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); 2976 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); 2977 2978 (void) dumpvp_flush(); 2979 2980 uprintf("\r%3d%% done: %llu pages dumped, ", 2981 ds->percent_done, (u_longlong_t)ds->npages); 2982 2983 if (dump_ioerr == 0) { 2984 uprintf("dump succeeded\n"); 2985 } else { 2986 uprintf("dump failed: error %d\n", dump_ioerr); 2987 #ifdef DEBUG 2988 if (panicstr) 2989 debug_enter("dump failed"); 2990 #endif 2991 } 2992 2993 /* 2994 * Write out all undelivered messages. This has to be the *last* 2995 * thing we do because the dump process itself emits messages. 2996 */ 2997 if (panicstr) { 2998 dump_summary(); 2999 dump_ereports(); 3000 dump_messages(); 3001 } 3002 3003 delay(2 * hz); /* let people see the 'done' message */ 3004 dump_timeleft = 0; 3005 dump_ioerr = 0; 3006 3007 /* restore settings after live dump completes */ 3008 if (!panicstr) { 3009 dumpcfg.clevel = save_dump_clevel; 3010 3011 /* release any VCHR open of the dump device */ 3012 if (dumpbuf.cdev_vp != NULL) { 3013 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, 3014 kcred, NULL); 3015 VN_RELE(dumpbuf.cdev_vp); 3016 dumpbuf.cdev_vp = NULL; 3017 } 3018 } 3019 } 3020 3021 /* 3022 * This function is called whenever the memory size, as represented 3023 * by the phys_install list, changes. 3024 */ 3025 void 3026 dump_resize() 3027 { 3028 mutex_enter(&dump_lock); 3029 dumphdr_init(); 3030 dumpbuf_resize(); 3031 dump_update_clevel(); 3032 mutex_exit(&dump_lock); 3033 } 3034 3035 /* 3036 * This function allows for dynamic resizing of a dump area. It assumes that 3037 * the underlying device has update its appropriate size(9P). 3038 */ 3039 int 3040 dumpvp_resize() 3041 { 3042 int error; 3043 vattr_t vattr; 3044 3045 mutex_enter(&dump_lock); 3046 vattr.va_mask = AT_SIZE; 3047 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) { 3048 mutex_exit(&dump_lock); 3049 return (error); 3050 } 3051 3052 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) { 3053 mutex_exit(&dump_lock); 3054 return (ENOSPC); 3055 } 3056 3057 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 3058 mutex_exit(&dump_lock); 3059 return (0); 3060 } 3061 3062 int 3063 dump_set_uuid(const char *uuidstr) 3064 { 3065 const char *ptr; 3066 int i; 3067 3068 if (uuidstr == NULL || strnlen(uuidstr, 36 + 1) != 36) 3069 return (EINVAL); 3070 3071 /* uuid_parse is not common code so check manually */ 3072 for (i = 0, ptr = uuidstr; i < 36; i++, ptr++) { 3073 switch (i) { 3074 case 8: 3075 case 13: 3076 case 18: 3077 case 23: 3078 if (*ptr != '-') 3079 return (EINVAL); 3080 break; 3081 3082 default: 3083 if (!isxdigit(*ptr)) 3084 return (EINVAL); 3085 break; 3086 } 3087 } 3088 3089 if (dump_osimage_uuid[0] != '\0') 3090 return (EALREADY); 3091 3092 (void) strncpy(dump_osimage_uuid, uuidstr, 36 + 1); 3093 3094 cmn_err(CE_CONT, "?This Solaris instance has UUID %s\n", 3095 dump_osimage_uuid); 3096 3097 return (0); 3098 } 3099 3100 const char * 3101 dump_get_uuid(void) 3102 { 3103 return (dump_osimage_uuid[0] != '\0' ? dump_osimage_uuid : ""); 3104 } 3105