/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/mem.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/memlist.h>
#include <sys/dumphdr.h>
#include <sys/dumpadm.h>
#include <sys/ksyms.h>
#include <sys/compress.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/modctl.h>
#include <sys/utsname.h>
#include <sys/systeminfo.h>
#include <sys/vmem.h>
#include <sys/log.h>
#include <sys/var.h>
#include <sys/debug.h>
#include <sys/sunddi.h>
#include <fs/fs_subr.h>
#include <sys/fs/snode.h>
#include <sys/ontrap.h>
#include <sys/panic.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/errorq.h>
#include <sys/fm/util.h>
#include <sys/fs/zfs.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <sys/clock_impl.h>
#include <sys/hold_page.h>

#include <bzip2/bzlib.h>

/*
 * Crash dump time is dominated by disk write time. To reduce this,
 * the stronger compression method bzip2 is applied to reduce the dump
 * size and hence reduce I/O time. However, bzip2 is much more
 * computationally expensive than the existing lzjb algorithm, so to
 * avoid increasing compression time, CPUs that are otherwise idle
 * during panic are employed to parallelize the compression task.
 * Many helper CPUs are needed to prevent bzip2 from being a
 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
 * parallelized instead. Lastly, I/O and compression are performed by
 * different CPUs, and are hence overlapped in time, unlike the older
 * serial code.
 *
 * Another important consideration is the speed of the dump
 * device. Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * raised for faster disks. The dump device speed is inferred from
 * the setting of dumpbuf.iosize; see dump_update_clevel().
 */

/*
 * exported vars
 */
kmutex_t	dump_lock;		/* lock for dump configuration */
dumphdr_t	*dumphdr;		/* dump header */
int		dump_conflags = DUMP_KERNEL;	/* dump configuration flags */
vnode_t		*dumpvp;		/* dump device vnode pointer */
u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
char		*dumppath;		/* pathname of dump device */
int		dump_timeout = 120;	/* timeout for dumping pages */
int		dump_timeleft;		/* portion of dump_timeout remaining */
int		dump_ioerr;		/* dump i/o error */
int		dump_check_used;	/* enable check for used pages */

/*
 * Tunables for dump compression and parallelism. These can be set via
 * /etc/system.
 *
 * dump_ncpu_low	number of helpers for parallel lzjb
 *	This is also the minimum configuration.
 *
 * dump_bzip2_level	bzip2 compression level: 1-9
 *	Higher numbers give greater compression, but take more memory
 *	and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
 *
 * dump_plat_mincpu	the cross-over limit for using bzip2 (per platform):
 *	if dump_plat_mincpu == 0, then always do a single-threaded dump
 *	if ncpu >= dump_plat_mincpu, then try to use bzip2
 *
 * dump_metrics_on	if set, metrics are collected in the kernel, passed
 *	to savecore via the dump file, and recorded by savecore in
 *	METRICS.txt.
 */
uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
uint_t dump_bzip2_level = 1;	/* bzip2 level (1-9) */

/* Use dump_plat_mincpu_default unless this variable is set by /etc/system */
#define	MINCPU_NOT_SET	((uint_t)-1)
uint_t dump_plat_mincpu = MINCPU_NOT_SET;
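
/*
 * For illustration only: /etc/system refers to the variables above by
 * name, so a hypothetical configuration that raises the helper minimum,
 * moves the bzip2 cross-over to 16 CPUs, and records metrics would be:
 *
 *	set dump_ncpu_low = 8
 *	set dump_plat_mincpu = 16
 *	set dump_metrics_on = 1
 */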
/* tunables for pre-reserved heap */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 8;

/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/* minimum number of helpers configured */
#define	MINHELPERS	(dump_ncpu_low)
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)

/*
 * Define constant parameters.
 *
 * CBUF_SIZE		size of an output buffer (128KB)
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages (4MB)
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))

/*
 * Compression metrics are accumulated nano-second subtotals. The
 * results are normalized by the number of pages dumped. A report is
 * generated when dumpsys() completes and is saved in the dump image
 * after the trailing dump header.
 *
 * Metrics are always collected. Set the variable dump_metrics_on to
 * cause metrics to be saved in the crash file, where savecore will
 * save them in the file METRICS.txt.
 */
#define	PERPAGES \
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
	PERPAGE(copy) PERPAGE(compress) \
	PERPAGE(write) \
	PERPAGE(inwait) PERPAGE(outwait)

typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;
	PERPAGES
#undef PERPAGE
} perpage_t;

/*
 * This macro controls the code generation for collecting dump
 * performance information. By default, the code is generated, but
 * automatic saving of the information is disabled. If dump_metrics_on
 * is set to 1, the timing information is passed to savecore via the
 * crash file, where it is appended to the file dump-dir/METRICS.txt.
 */
#define	COLLECT_METRICS

#ifdef COLLECT_METRICS
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */

#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
#define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
#define	HRNORM(v, m, n)		v.m /= (n)

#else
#define	HRSTART(v, m)
#define	HRSTOP(v, m)
#define	HRBEGIN(v, m, s)
#define	HREND(v, m)
#define	HRNORM(v, m, n)
#endif	/* COLLECT_METRICS */
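
/*
 * Usage sketch for the macros above: v names a perpage_t pair (v and
 * v##ts) and m names a member, so a timed phase brackets the work, as
 * the helpers do for each page copy later in this file:
 *
 *	HRSTART(hp->perpage, copy);
 *	hp->in = dumpsys_copy_page(hp, hp->in);
 *	HRSTOP(hp->perpage, copy);
 */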
/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 *
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors.)
 */

typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;

typedef struct cbuf cbuf_t;

struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};

/*
 * cqueue_t queues: a uni-directional channel for communication
 * from the master to helper tasks or vice-versa using put and
 * get primitives. Both mappings and data buffers are passed via
 * queues. Producers close a queue when done. The number of
 * active producers is reference counted so the consumer can
 * detect end of data. Concurrent access is mediated by atomic
 * operations for panic dump, or mutex/cv for live dump.
 *
 * There are four queues, used as follows:
 *
 * Queue	Dataflow		NewState
 * --------------------------------------------------
 * mainq	master -> master	FREEMAP
 *	master has initialized or unmapped an input buffer
 * --------------------------------------------------
 * helperq	master -> helper	INREADY
 *	master has mapped input for use by helper
 * --------------------------------------------------
 * mainq	master <- helper	USEDMAP
 *	helper is done with input
 * --------------------------------------------------
 * freebufq	master -> helper	FREEBUF
 *	master has initialized or written an output buffer
 * --------------------------------------------------
 * mainq	master <- helper	WRITE
 *	block of compressed pages from a helper
 * --------------------------------------------------
 * mainq	master <- helper	ERRMSG
 *	error messages from a helper (memory error case)
 * --------------------------------------------------
 * writerq	master <- master	WRITE
 *	non-blocking queue of blocks to write
 * --------------------------------------------------
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;

/*
 * Convenience macros for using the cqueue functions.
 * Note that the caller must have defined "dumpsync_t *ds".
 */
#define	CQ_IS_EMPTY(q) \
	(ds->q.first == NULL)

#define	CQ_OPEN(q) \
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q) \
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st) \
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q) \
	dumpsys_get_cq(&ds->q, ds->live)
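
/*
 * A sketch of the producer/consumer flow over these queues, as the
 * helper code later in this file uses them ("ds" must point at the
 * dumpsync_t, per the macros above):
 *
 *	CQ_OPEN(mainq);				// register as a producer
 *	while ((cp = CQ_GET(helperq)) != NULL) {	// NULL: closed+empty
 *		(process the mapped input in cp)
 *		CQ_PUT(mainq, cp, CBUF_USEDMAP);	// hand back to master
 *	}
 *	CQ_CLOSE(mainq);		// consumer can now see end of data
 */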
/*
 * Dynamic state when dumpsys() is running.
 */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	perpage_t perpagets;
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */

/*
 * helper_t helpers: contains the context for a stream. CPUs run in
 * parallel at dump time; each CPU creates a single stream of
 * compression data. Stream data is divided into CBUF_SIZE blocks.
 * The blocks are written in order within a stream. But, blocks from
 * multiple streams can be interleaved. Each stream is identified by a
 * unique tag.
 */
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
	bz_stream bzstream;		/* bzip2 state */
} helper_t;

#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */

/*
 * configuration vars for dumpsys
 */
typedef struct dumpcfg {
	int	threshold;	/* ncpu threshold for bzip2 */
	int	nhelper;	/* number of helpers */
	int	nhelper_used;	/* actual number of helpers used */
	int	ncmap;		/* number VA pages for compression */
	int	ncbuf;		/* number of bufs for compression */
	int	ncbuf_used;	/* number of bufs in use */
	uint_t	clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t	*cmap;		/* array of input (map) buffers */
	cbuf_t	*cbuf;		/* array of output buffers */
	ulong_t	*helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t	*bitmap;	/* bitmap for marking pages to dump */
	ulong_t	*rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t	bitmapsize;	/* size of bitmap */
	pgcnt_t	rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t	found4m;	/* number ranges allocated by dump */
	pgcnt_t	foundsm;	/* number small pages allocated by dump */
	pid_t	*pids;		/* list of process IDs at dump time */
	size_t	maxsize;	/* memory size needed at dump time */
	size_t	maxvmsize;	/* size of reserved VM */
	char	*maxvm;		/* reserved VM for spare pages */
	lock_t	helper_lock;	/* protect helper state */
	char	helpers_wanted;	/* flag to enable parallelism */
	char	helper_present;	/* at least one helper showed up */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */

/*
 * The dump I/O buffer.
 *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 * sized according to the optimum device transfer speed.
 */
typedef struct dumpbuf {
	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

dumpbuf_t dumpbuf;		/* I/O buffer */

/*
 * The dump I/O buffer must be at least one page, at most xfer_size
 * bytes, and should scale with physmem in between. The transfer size
 * passed in will either represent a global default (maxphys) or the
 * best size for the device. The size of the dumpbuf I/O buffer is
 * limited by dumpbuf_limit (8MB by default) because the dump
 * performance saturates beyond a certain size. The default is to
 * select 1/4096 of the memory.
 */
static int	dumpbuf_fraction = 12;	/* memory size scale factor */
static size_t	dumpbuf_limit = 8 * DUMP_1MB;	/* max I/O buf size */

static size_t
dumpbuf_iosize(size_t xfer_size)
{
	size_t iosize = ptob(physmem >> dumpbuf_fraction);

	if (iosize < PAGESIZE)
		iosize = PAGESIZE;
	else if (iosize > xfer_size)
		iosize = xfer_size;
	if (iosize > dumpbuf_limit)
		iosize = dumpbuf_limit;
	return (iosize & PAGEMASK);
}
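
/*
 * Worked example (illustrative): since physmem is in pages,
 * ptob(physmem >> 12) is 1/4096 of physical memory regardless of page
 * size. A 4GB machine therefore gets a 1MB I/O buffer (if xfer_size
 * permits), and machines above 32GB are capped at the 8MB
 * dumpbuf_limit.
 */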
/*
 * resize the I/O buffer
 */
static void
dumpbuf_resize(void)
{
	char *old_buf = dumpbuf.start;
	size_t old_size = dumpbuf.size;
	char *new_buf;
	size_t new_size;

	ASSERT(MUTEX_HELD(&dump_lock));

	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
	if (new_size <= old_size)
		return; /* no need to reallocate buffer */

	new_buf = kmem_alloc(new_size, KM_SLEEP);
	dumpbuf.size = new_size;
	dumpbuf.start = new_buf;
	dumpbuf.end = new_buf + new_size;
	kmem_free(old_buf, old_size);
}

/*
 * dump_update_clevel is called when dumpadm configures the dump device.
 * Calculate number of helpers and buffers.
 * Allocate the minimum configuration for now.
 *
 * When the dump file is configured we reserve a minimum amount of
 * memory for use at crash time. But we reserve VA for all the memory
 * we really want in order to do the fastest dump possible. The VA is
 * backed by pages not being dumped, according to the bitmap. If
 * there is insufficient spare memory, however, we fall back to the
 * minimum.
 *
 * Live dump (savecore -L) always uses the minimum config.
 *
 * clevel 0 is single threaded lzjb
 * clevel 1 is parallel lzjb
 * clevel 2 is parallel bzip2
 *
 * The ncpu threshold is selected with dump_plat_mincpu.
 * On OPL, set_platform_defaults() overrides the sun4u setting.
 * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
 *
 * Architecture	Threshold	Algorithm
 * sun4u	< 51		parallel lzjb
 * sun4u	>= 51		parallel bzip2(*)
 * sun4u OPL	< 8		parallel lzjb
 * sun4u OPL	>= 8		parallel bzip2(*)
 * sun4v	< 128		parallel lzjb
 * sun4v	>= 128		parallel bzip2(*)
 * x86		< 11		parallel lzjb
 * x86		>= 11		parallel bzip2(*)
 * 32-bit	N/A		single-threaded lzjb
 *
 * (*) bzip2 is only chosen if there is sufficient available
 * memory for buffers at dump time. See dumpsys_get_maxmem().
 *
 * Faster dump devices have larger I/O buffers. The threshold value is
 * increased according to the size of the dump I/O buffer, because
 * parallel lzjb performs better with faster disks. For buffers >= 1MB
 * the threshold is 3X; for buffers >= 256K threshold is 2X.
 *
 * For parallel dumps, the number of helpers is ncpu-1. The CPU
 * running panic runs the main task. For single-threaded dumps, the
 * panic CPU does lzjb compression (it is tagged as MAINHELPER.)
 *
 * Need multiple buffers per helper so that they do not block waiting
 * for the main task.
 *				parallel	single-threaded
 * Number of output buffers:	nhelper*2	1
 * Number of mapping buffers:	nhelper*4	1
 *
 */
static void
dump_update_clevel()
{
	int tag;
	size_t bz2size;
	helper_t *hp, *hpend;
	cbuf_t *cp, *cpend;
	dumpcfg_t *old = &dumpcfg;
	dumpcfg_t newcfg = *old;
	dumpcfg_t *new = &newcfg;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * Free the previously allocated bufs and VM.
	 */
	if (old->helper != NULL) {

		/* helpers */
		hpend = &old->helper[old->nhelper];
		for (hp = old->helper; hp != hpend; hp++) {
			if (hp->lzbuf != NULL)
				kmem_free(hp->lzbuf, PAGESIZE);
			if (hp->page != NULL)
				kmem_free(hp->page, PAGESIZE);
		}
		kmem_free(old->helper, old->nhelper * sizeof (helper_t));

		/* VM space for mapping pages */
		cpend = &old->cmap[old->ncmap];
		for (cp = old->cmap; cp != cpend; cp++)
			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));

		/* output bufs */
		cpend = &old->cbuf[old->ncbuf];
		for (cp = old->cbuf; cp != cpend; cp++)
			if (cp->buf != NULL)
				kmem_free(cp->buf, cp->size);
		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));

		/* reserved VM for dumpsys_get_maxmem */
		if (old->maxvmsize > 0)
			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
	}

	/*
	 * Allocate memory and VM.
	 * One CPU runs dumpsys, the rest are helpers.
	 */
	new->nhelper = ncpus - 1;
	if (new->nhelper < 1)
		new->nhelper = 1;

	if (new->nhelper > DUMP_MAX_NHELPER)
		new->nhelper = DUMP_MAX_NHELPER;

	/* use platform default, unless /etc/system overrides */
	if (dump_plat_mincpu == MINCPU_NOT_SET)
		dump_plat_mincpu = dump_plat_mincpu_default;

	/* increase threshold for faster disks */
	new->threshold = dump_plat_mincpu;
	if (dumpbuf.iosize >= DUMP_1MB)
		new->threshold *= 3;
	else if (dumpbuf.iosize >= (256 * DUMP_1KB))
		new->threshold *= 2;

	/* figure compression level based upon the computed threshold. */
	if (dump_plat_mincpu == 0 || new->nhelper < 2) {
		new->clevel = 0;
		new->nhelper = 1;
	} else if ((new->nhelper + 1) >= new->threshold) {
		new->clevel = DUMP_CLEVEL_BZIP2;
	} else {
		new->clevel = DUMP_CLEVEL_LZJB;
	}

	if (new->clevel == 0) {
		new->ncbuf = 1;
		new->ncmap = 1;
	} else {
		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
	}

	/*
	 * Allocate new data structures and buffers for MINHELPERS,
	 * and also figure the max desired size.
	 */
	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
	new->maxsize = 0;
	new->maxvmsize = 0;
	new->maxvm = NULL;
	tag = 1;
	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
	hpend = &new->helper[new->nhelper];
	for (hp = new->helper; hp != hpend; hp++) {
		hp->tag = tag++;
		if (hp < &new->helper[MINHELPERS]) {
			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
		} else if (new->clevel < DUMP_CLEVEL_BZIP2) {
			new->maxsize += 2 * PAGESIZE;
		} else {
			new->maxsize += PAGESIZE;
		}
		if (new->clevel >= DUMP_CLEVEL_BZIP2)
			new->maxsize += bz2size;
	}

	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cbuf[new->ncbuf];
	for (cp = new->cbuf; cp != cpend; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		if (cp < &new->cbuf[MINCBUFS])
			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
		else
			new->maxsize += cp->size;
	}

	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cmap[new->ncmap];
	for (cp = new->cmap; cp != cpend; cp++) {
		cp->state = CBUF_FREEMAP;
		cp->size = CBUF_MAPSIZE;
		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
		    0, 0, NULL, NULL, VM_SLEEP);
	}

	/* reserve VA to be backed with spare pages at crash time */
	if (new->maxsize > 0) {
		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
	}

	/*
	 * Reserve memory for kmem allocation calls made during crash
	 * dump. The hat layer allocates memory for each mapping
	 * created, and the I/O path allocates buffers and data structs.
	 * Add a few pages for safety.
	 */
	kmem_dump_init((new->ncmap * dump_kmem_permap) +
	    (dump_kmem_pages * PAGESIZE));

	/* set new config pointers */
	*old = *new;
}

/*
 * Define a struct memlist walker to optimize bitnum to pfn
 * lookup. The walker maintains the state of the list traversal.
 */
typedef struct dumpmlw {
	struct memlist	*mp;		/* current memlist */
	pgcnt_t		basenum;	/* bitnum base offset */
	pgcnt_t		mppages;	/* current memlist size */
	pgcnt_t		mpleft;		/* size to end of current memlist */
	pfn_t		mpaddr;		/* first pfn in memlist */
} dumpmlw_t;

/* initialize the walker */
static inline void
dump_init_memlist_walker(dumpmlw_t *pw)
{
	pw->mp = phys_install;
	pw->basenum = 0;
	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
	pw->mpleft = pw->mppages;
	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
}

/*
 * Lookup pfn given bitnum. The memlist can be quite long on some
 * systems (e.g.: one per board). To optimize sequential lookups, the
 * caller initializes and presents a memlist walker.
 */
static pfn_t
dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
{
	bitnum -= pw->basenum;
	while (pw->mp != NULL) {
		if (bitnum < pw->mppages) {
			pw->mpleft = pw->mppages - bitnum;
			return (pw->mpaddr + bitnum);
		}
		bitnum -= pw->mppages;
		pw->basenum += pw->mppages;
		pw->mp = pw->mp->ml_next;
		if (pw->mp != NULL) {
			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
			pw->mpleft = pw->mppages;
			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
		}
	}
	return (PFN_INVALID);
}
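
/*
 * Sequential lookup sketch (the pattern the dump code below uses):
 *
 *	dumpmlw_t mlw;
 *	pfn_t pfn;
 *
 *	dump_init_memlist_walker(&mlw);
 *	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
 *		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
 *		(use pfn; mlw.mpleft gives pages left in the segment)
 *	}
 */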
static pgcnt_t
dump_pfn_to_bitnum(pfn_t pfn)
{
	struct memlist *mp;
	pgcnt_t bitnum = 0;

	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
		    pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
			return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
		bitnum += mp->ml_size >> PAGESHIFT;
	}
	return ((pgcnt_t)-1);
}

/*
 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
 * mapping of pfn to range index is imperfect because pfn and bitnum
 * do not have the same phase. To make sure a CBUF_MAPSIZE range is
 * covered, call this for both ends:
 *	dump_set_used(base)
 *	dump_set_used(base+CBUF_MAPNP-1)
 *
 * This is used during a panic dump to mark pages allocated by
 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
 * page_get_mnode_freelist() to make sure pages used by dump are never
 * allocated.
 */
#define	CBUF_MAPP2R(pfn)	((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))

static void
dump_set_used(pfn_t pfn)
{

	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	BT_SET(dumpcfg.rbitmap, rbitnum);
}

int
dump_test_used(pfn_t pfn)
{
	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	return (BT_TEST(dumpcfg.rbitmap, rbitnum));
}

/*
 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
 */
static void *
dumpbzalloc(void *opaque, int items, int size)
{
	size_t *sz;
	char *ret;

	ASSERT(opaque != NULL);
	sz = opaque;
	ret = dumpcfg.maxvm + *sz;
	*sz += items * size;
	*sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN);
	ASSERT(*sz <= dumpcfg.maxvmsize);
	return (ret);
}

/*ARGSUSED*/
static void
dumpbzfree(void *opaque, void *addr)
{
}

/*
 * Perform additional checks on the page to see if we can really use
 * it. The kernel (kas) pages are always set in the bitmap. However,
 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
 * bitmap. So we check for them.
 */
static inline int
dump_pfn_check(pfn_t pfn)
{
	page_t *pp = page_numtopp_nolock(pfn);
	if (pp == NULL || pp->p_pagenum != pfn ||
#if defined(__sparc)
	    pp->p_vnode == &promvp ||
#else
	    PP_ISBOOTPAGES(pp) ||
#endif
	    pp->p_toxic != 0)
		return (0);
	return (1);
}

/*
 * Check a range to see if all contained pages are available and
 * return non-zero if the range can be used.
 */
static inline int
dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
{
	for (; start < end; start++, pfn++) {
		if (BT_TEST(dumpcfg.bitmap, start))
			return (0);
		if (!dump_pfn_check(pfn))
			return (0);
	}
	return (1);
}

/*
 * dumpsys_get_maxmem() is called during panic. Find unused ranges
 * and use them for buffers. If we find enough memory switch to
 * parallel bzip2, otherwise use parallel lzjb.
 *
 * It searches the dump bitmap in two passes. The first time it looks
 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
 */
static void
dumpsys_get_maxmem()
{
	dumpcfg_t *cfg = &dumpcfg;
	cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
	helper_t *endhp = &cfg->helper[cfg->nhelper];
	pgcnt_t bitnum, end;
	size_t sz, endsz, bz2size;
	pfn_t pfn, off;
	cbuf_t *cp;
	helper_t *hp, *ohp;
	dumpmlw_t mlw;
	int k;

	/*
	 * Fall back to doing a serial dump if no helpers showed
	 * up. It is possible for other CPUs to be stuck in PROM, or
	 * DRd out. panic("sync initiated") in sync_handler() is one
	 * case. A parallel dump will hang (dump time out) unless
	 * there is at least one helper CPU. At this point dumpsys()
	 * has done some I/O, which means there has been plenty of
	 * time for helpers to arrive.
	 */
	if (!cfg->helper_present) {
		cfg->clevel = 0;
		return;
	}

	/*
	 * There may be no point in looking for spare memory. If we are
	 * dumping all memory, then none is spare. If doing a serial
	 * dump, we already have buffers.
	 */
	if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
	    (dump_conflags & DUMP_ALL) != 0) {
		if (cfg->clevel > DUMP_CLEVEL_LZJB)
			cfg->clevel = DUMP_CLEVEL_LZJB;
		return;
	}

	sz = 0;
	cfg->found4m = 0;
	cfg->foundsm = 0;

	/* bitmap of ranges used to estimate which pfns are being used */
	bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));

	/* find ranges that are not being dumped to use for buffers */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* skip partial range at end of mem segment */
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
			continue;
		}

		/* skip non aligned pages */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (off != 0) {
			end -= off;
			continue;
		}

		if (!dump_range_check(bitnum, end, pfn))
			continue;

		ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
		hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
		    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
		sz += CBUF_MAPSIZE;
		cfg->found4m++;

		/* set the bitmap for both ends to be sure to cover the range */
		dump_set_used(pfn);
		dump_set_used(pfn + CBUF_MAPNP - 1);

		if (sz >= cfg->maxsize)
			goto foundmax;
	}

	/* Add small pages if we can't find enough large pages. */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* Find any non-aligned pages at start and end of segment. */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
		} else if (off != 0) {
			end -= off;
		} else if (cfg->found4m && dump_test_used(pfn)) {
			continue;
		}

		for (; bitnum < end; bitnum++, pfn++) {
			dump_timeleft = dump_timeout;
			if (BT_TEST(dumpcfg.bitmap, bitnum))
				continue;
			if (!dump_pfn_check(pfn))
				continue;
			ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
			hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
			    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
			sz += PAGESIZE;
			cfg->foundsm++;
			dump_set_used(pfn);
			if (sz >= cfg->maxsize)
				goto foundmax;
		}
	}

	/* Fall back to lzjb if we did not get enough memory for bzip2. */
	endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper;
	if (sz < endsz) {
		cfg->clevel = DUMP_CLEVEL_LZJB;
	}

	/* Allocate memory for as many helpers as we can. */
foundmax:

	/* Byte offsets into memory found and mapped above */
	endsz = sz;
	sz = 0;

	/* Set the size for bzip2 state. Only bzip2 needs it. */
	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);

	/* Skip the preallocated output buffers. */
	cp = &cfg->cbuf[MINCBUFS];

	/* Use this to move memory up from the preallocated helpers. */
	ohp = cfg->helper;

	/* Loop over all helpers and allocate memory. */
	for (hp = cfg->helper; hp < endhp; hp++) {

		/* Skip preallocated helpers by checking hp->page. */
		if (hp->page == NULL) {
			if (cfg->clevel <= DUMP_CLEVEL_LZJB) {
				/* lzjb needs 2 1-page buffers */
				if ((sz + (2 * PAGESIZE)) > endsz)
					break;
				hp->page = cfg->maxvm + sz;
				sz += PAGESIZE;
				hp->lzbuf = cfg->maxvm + sz;
				sz += PAGESIZE;

			} else if (ohp->lzbuf != NULL) {
				/* re-use the preallocated lzjb page for bzip2 */
				hp->page = ohp->lzbuf;
				ohp->lzbuf = NULL;
				++ohp;

			} else {
				/* bzip2 needs a 1-page buffer */
				if ((sz + PAGESIZE) > endsz)
					break;
				hp->page = cfg->maxvm + sz;
				sz += PAGESIZE;
			}
		}

		/*
		 * Add output buffers per helper. The number of
		 * buffers per helper is determined by the ratio of
		 * ncbuf to nhelper.
		 */
		for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
		    k < NCBUF_PER_HELPER; k++) {
			cp->state = CBUF_FREEBUF;
			cp->size = CBUF_SIZE;
			cp->buf = cfg->maxvm + sz;
			sz += CBUF_SIZE;
			++cp;
		}

		/*
		 * bzip2 needs compression state. Use the dumpbzalloc
		 * and dumpbzfree callbacks to allocate the memory.
		 * bzip2 does allocation only at init time.
		 */
		if (cfg->clevel >= DUMP_CLEVEL_BZIP2) {
			if ((sz + bz2size) > endsz) {
				hp->page = NULL;
				break;
			} else {
				hp->bzstream.opaque = &sz;
				hp->bzstream.bzalloc = dumpbzalloc;
				hp->bzstream.bzfree = dumpbzfree;
				(void) BZ2_bzCompressInit(&hp->bzstream,
				    dump_bzip2_level, 0, 0);
				hp->bzstream.opaque = NULL;
			}
		}
	}

	/* Finish allocating output buffers */
	for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		cp->buf = cfg->maxvm + sz;
		sz += CBUF_SIZE;
	}

	/* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
	if (cfg->found4m || cfg->foundsm)
		dump_check_used = 1;

	ASSERT(sz <= endsz);
}

static void
dumphdr_init(void)
{
	pgcnt_t npages = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	if (dumphdr == NULL) {
		dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
		dumphdr->dump_magic = DUMP_MAGIC;
		dumphdr->dump_version = DUMP_VERSION;
		dumphdr->dump_wordsize = DUMP_WORDSIZE;
		dumphdr->dump_pageshift = PAGESHIFT;
		dumphdr->dump_pagesize = PAGESIZE;
		dumphdr->dump_utsname = utsname;
		(void) strcpy(dumphdr->dump_platform, platform);
		dumpbuf.size = dumpbuf_iosize(maxphys);
		dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
		dumpbuf.end = dumpbuf.start + dumpbuf.size;
		dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
		dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
		LOCK_INIT_HELD(&dumpcfg.helper_lock);
	}

	npages = num_phys_pages();

	if (dumpcfg.bitmapsize != npages) {
		size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
		void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
		void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);

		if (dumpcfg.bitmap != NULL)
			kmem_free(dumpcfg.bitmap,
			    BT_SIZEOFMAP(dumpcfg.bitmapsize));
		if (dumpcfg.rbitmap != NULL)
			kmem_free(dumpcfg.rbitmap,
			    BT_SIZEOFMAP(dumpcfg.rbitmapsize));
		dumpcfg.bitmap = map;
		dumpcfg.bitmapsize = npages;
		dumpcfg.rbitmap = rmap;
		dumpcfg.rbitmapsize = rlen;
	}
}

/*
 * Establish a new dump device.
 */
int
dumpinit(vnode_t *vp, char *name, int justchecking)
{
	vnode_t *cvp;
	vattr_t vattr;
	vnode_t *cdev_vp;
	int error = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	dumphdr_init();

	cvp = common_specvp(vp);
	if (cvp == dumpvp)
		return (0);

	/*
	 * Determine whether this is a plausible dump device. We want either:
	 * (1) a real device that's not mounted and has a cb_dump routine, or
	 * (2) a swapfile on some filesystem that has a vop_dump routine.
	 */
	if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
		return (error);

	vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
	if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
		if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
			if (devopsp[getmajor(vattr.va_rdev)]->
			    devo_cb_ops->cb_dump == nodev)
				error = ENOTSUP;
			else if (vfs_devismounted(vattr.va_rdev))
				error = EBUSY;
			if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
			    ZFS_DRIVER) == 0 &&
			    IS_SWAPVP(common_specvp(cvp)))
				error = EBUSY;
		} else {
			if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
			    !IS_SWAPVP(cvp))
				error = ENOTSUP;
		}
	}

	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
		error = ENOSPC;

	if (error || justchecking) {
		(void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
		    kcred, NULL);
		return (error);
	}

	VN_HOLD(cvp);

	if (dumpvp != NULL)
		dumpfini();	/* unconfigure the old dump device */

	dumpvp = cvp;
	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
	(void) strcpy(dumppath, name);
	dumpbuf.iosize = 0;

	/*
	 * If the dump device is a block device, attempt to open up the
	 * corresponding character device and determine its maximum transfer
	 * size. We use this information to potentially resize dumpbuf to a
	 * larger and more optimal size for performing i/o to the dump device.
	 */
	if (cvp->v_type == VBLK &&
	    (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			size_t blk_size;
			struct dk_cinfo dki;
			struct dk_minfo minf;

			if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
			    (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
			    == 0 && minf.dki_lbsize != 0)
				blk_size = minf.dki_lbsize;
			else
				blk_size = DEV_BSIZE;

			if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
			    FKIOCTL, kcred, NULL, NULL) == 0) {
				dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
				dumpbuf_resize();
			}
			/*
			 * If we are working with a zvol then dumpify it
			 * if it's not being used as swap.
			 */
			if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
				if (IS_SWAPVP(common_specvp(cvp)))
					error = EBUSY;
				else if ((error = VOP_IOCTL(cdev_vp,
				    DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
				    NULL, NULL)) != 0)
					dumpfini();
			}

			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}

		VN_RELE(cdev_vp);
	}

	cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);

	dump_update_clevel();

	return (error);
}

void
dumpfini(void)
{
	vattr_t vattr;
	boolean_t is_zfs = B_FALSE;
	vnode_t *cdev_vp;
	ASSERT(MUTEX_HELD(&dump_lock));

	kmem_free(dumppath, strlen(dumppath) + 1);

	/*
	 * Determine if we are using zvols for our dump device
	 */
	vattr.va_mask = AT_RDEV;
	if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
		is_zfs = (getmajor(vattr.va_rdev) ==
		    ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
	}

	/*
	 * If we have a zvol dump device then we call into zfs so
	 * that it may have a chance to cleanup.
	 */
	if (is_zfs &&
	    (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			(void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
			    kcred, NULL, NULL);
			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}
		VN_RELE(cdev_vp);
	}

	(void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);

	VN_RELE(dumpvp);

	dumpvp = NULL;
	dumpvp_size = 0;
	dumppath = NULL;
}

static offset_t
dumpvp_flush(void)
{
	size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
	hrtime_t iotime;
	int err;

	if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
		dump_ioerr = ENOSPC;
		dumpbuf.vp_off = dumpbuf.vp_limit;
	} else if (size != 0) {
		iotime = gethrtime();
		dumpsync.iowait += iotime - dumpsync.iowaitts;
		if (panicstr)
			err = VOP_DUMP(dumpvp, dumpbuf.start,
			    lbtodb(dumpbuf.vp_off), btod(size), NULL);
		else
			err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
			    dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
			    dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
			    kcred, 0);
		if (err && dump_ioerr == 0)
			dump_ioerr = err;
		dumpsync.iowaitts = gethrtime();
		dumpsync.iotime += dumpsync.iowaitts - iotime;
		dumpsync.nwrite += size;
		dumpbuf.vp_off += size;
	}
	dumpbuf.cur = dumpbuf.start;
	dump_timeleft = dump_timeout;
	return (dumpbuf.vp_off);
}

/* maximize write speed by keeping seek offset aligned with size */
void
dumpvp_write(const void *va, size_t size)
{
	size_t len, off, sz;

	while (size != 0) {
		len = MIN(size, dumpbuf.end - dumpbuf.cur);
		if (len == 0) {
			off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
			if (off == 0 || !ISP2(dumpbuf.size)) {
				(void) dumpvp_flush();
			} else {
				sz = dumpbuf.size - off;
				dumpbuf.cur = dumpbuf.start + sz;
				(void) dumpvp_flush();
				ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
				dumpbuf.cur += off;
			}
		} else {
			bcopy(va, dumpbuf.cur, len);
			va = (char *)va + len;
			dumpbuf.cur += len;
			size -= len;
		}
	}
}

/*ARGSUSED*/
static void
dumpvp_ksyms_write(const void *src, void *dst, size_t size)
{
	dumpvp_write(src, size);
}

/*
 * Mark 'pfn' in the bitmap and dump its translation table entry.
 */
void
dump_addpage(struct as *as, void *va, pfn_t pfn)
{
	mem_vtop_t mem_vtop;
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
		dumphdr->dump_nvtop++;
		mem_vtop.m_as = as;
		mem_vtop.m_va = va;
		mem_vtop.m_pfn = pfn;
		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
	}
	dump_timeleft = dump_timeout;
}

/*
 * Mark 'pfn' in the bitmap
 */
void
dump_page(pfn_t pfn)
{
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
	}
	dump_timeleft = dump_timeout;
}

/*
 * Dump the <as, va, pfn> information for a given address space.
 * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
 */
static void
dump_as(struct as *as)
{
	struct seg *seg;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
		if (seg->s_as != as)
			break;
		if (seg->s_ops == NULL)
			continue;
		SEGOP_DUMP(seg);
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	if (seg != NULL)
		cmn_err(CE_WARN, "invalid segment %p in address space %p",
		    (void *)seg, (void *)as);
}

static int
dump_process(pid_t pid)
{
	proc_t *p = sprlock(pid);

	if (p == NULL)
		return (-1);
	if (p->p_as != &kas) {
		mutex_exit(&p->p_lock);
		dump_as(p->p_as);
		mutex_enter(&p->p_lock);
	}

	sprunlock(p);

	return (0);
}

void
dump_ereports(void)
{
	u_offset_t dumpvp_start;
	erpt_dump_t ed;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
	dumpbuf.vp_off = dumpvp_start;

	fm_ereport_dump();
	if (panicstr)
		errorq_dump();

	bzero(&ed, sizeof (ed)); /* indicate end of ereports */
	dumpvp_write(&ed, sizeof (ed));
	(void) dumpvp_flush();

	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

void
dump_messages(void)
{
	log_dump_t ld;
	mblk_t *mctl, *mdata;
	queue_t *q, *qlast;
	u_offset_t dumpvp_start;

	if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
	dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
	dumpbuf.vp_off = dumpvp_start;

	qlast = NULL;
	do {
		for (q = log_consq; q->q_next != qlast; q = q->q_next)
			continue;
		for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
			dump_timeleft = dump_timeout;
			mdata = mctl->b_cont;
			ld.ld_magic = LOG_MAGIC;
			ld.ld_msgsize = MBLKL(mctl->b_cont);
			ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
			ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
			dumpvp_write(&ld, sizeof (ld));
			dumpvp_write(mctl->b_rptr, MBLKL(mctl));
			dumpvp_write(mdata->b_rptr, MBLKL(mdata));
		}
	} while ((qlast = q) != log_consq);

	ld.ld_magic = 0;		/* indicate end of messages */
	dumpvp_write(&ld, sizeof (ld));
	(void) dumpvp_flush();
	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

/*
 * The following functions are called on multiple CPUs during dump.
 * They must not use most kernel services, because all cross-calls are
 * disabled during panic. Therefore, blocking locks and cache flushes
 * will not work.
 */

/*
 * Copy pages, trapping ECC errors. Also, for robustness, trap data
 * access in case something goes wrong in the hat layer and the
 * mapping is broken.
 */
static int
dump_pagecopy(void *src, void *dst)
{
	long *wsrc = (long *)src;
	long *wdst = (long *)dst;
	const ulong_t ncopies = PAGESIZE / sizeof (long);
	volatile int w = 0;
	volatile int ueoff = -1;
	on_trap_data_t otd;

	if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
		if (ueoff == -1)
			ueoff = w * sizeof (long);
		/* report "bad ECC" or "bad address" */
#ifdef _LP64
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc00badecc;
		else
			wdst[w++] = 0x00badadd00badadd;
#else
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc;
		else
			wdst[w++] = 0x00badadd;
#endif
	}
	while (w < ncopies) {
		wdst[w] = wsrc[w];
		w++;
	}
	no_trap();
	return (ueoff);
}

static void
dumpsys_close_cq(cqueue_t *cq, int live)
{
	if (live) {
		mutex_enter(&cq->mutex);
		atomic_dec_uint(&cq->open);
		cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		atomic_dec_uint(&cq->open);
	}
}

static inline void
dumpsys_spinlock(lock_t *lp)
{
	uint_t backoff = 0;
	int loop_count = 0;

	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
		if (++loop_count >= ncpus) {
			backoff = mutex_lock_backoff(0);
			loop_count = 0;
		} else {
			backoff = mutex_lock_backoff(backoff);
		}
		mutex_lock_delay(backoff);
	}
}

static inline void
dumpsys_spinunlock(lock_t *lp)
{
	lock_clear(lp);
}

static inline void
dumpsys_lock(cqueue_t *cq, int live)
{
	if (live)
		mutex_enter(&cq->mutex);
	else
		dumpsys_spinlock(&cq->spinlock);
}

static inline void
dumpsys_unlock(cqueue_t *cq, int live, int signal)
{
	if (live) {
		if (signal)
			cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
	}
}

static void
dumpsys_wait_cq(cqueue_t *cq, int live)
{
	if (live) {
		cv_wait(&cq->cv, &cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
		while (cq->open)
			if (cq->first)
				break;
		dumpsys_spinlock(&cq->spinlock);
	}
}

static void
dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
{
	if (cp == NULL)
		return;

	dumpsys_lock(cq, live);

	if (cq->ts != 0) {
		cq->empty += gethrtime() - cq->ts;
		cq->ts = 0;
	}

	cp->state = newstate;
	cp->next = NULL;
	if (cq->last == NULL)
		cq->first = cp;
	else
		cq->last->next = cp;
	cq->last = cp;

	dumpsys_unlock(cq, live, 1);
}

static cbuf_t *
dumpsys_get_cq(cqueue_t *cq, int live)
{
	cbuf_t *cp;
	hrtime_t now = gethrtime();

	dumpsys_lock(cq, live);

	/* CONSTCOND */
	while (1) {
		cp = (cbuf_t *)cq->first;
		if (cp == NULL) {
			if (cq->open == 0)
				break;
			dumpsys_wait_cq(cq, live);
			continue;
		}
		cq->first = cp->next;
		if (cq->first == NULL) {
			cq->last = NULL;
			cq->ts = now;
		}
		break;
	}

	dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
	return (cp);
}

/*
 * Send an error message to the console. If the main task is running
 * just write the message via uprintf. If a helper is running the
 * message has to be put on a queue for the main task.
 * Setting fmt to NULL means flush the error message buffer. If fmt is
 * not NULL, just add the text to the existing buffer.
 */
static void
dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
{
	dumpsync_t *ds = hp->ds;
	cbuf_t *cp = hp->cperr;
	va_list adx;

	if (hp->helper == MAINHELPER) {
		if (fmt != NULL) {
			if (ds->neednl) {
				uprintf("\n");
				ds->neednl = 0;
			}
			va_start(adx, fmt);
			vuprintf(fmt, adx);
			va_end(adx);
		}
	} else if (fmt == NULL) {
		if (cp != NULL) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	} else {
		if (hp->cperr == NULL) {
			cp = CQ_GET(freebufq);
			hp->cperr = cp;
			cp->used = 0;
		}
		va_start(adx, fmt);
		cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
		    fmt, adx);
		va_end(adx);
		if ((cp->used + LOG_MSGSIZE) > cp->size) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	}
}

/*
 * Write an output buffer to the dump file. If the main task is
 * running just write the data. If a helper is running the output is
 * placed on a queue for the main task.
 */
static void
dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
{
	dumpsync_t *ds = hp->ds;

	if (hp->helper == MAINHELPER) {
		HRSTART(ds->perpage, write);
		dumpvp_write(cp->buf, used);
		HRSTOP(ds->perpage, write);
		CQ_PUT(freebufq, cp, CBUF_FREEBUF);
	} else {
		cp->used = used;
		CQ_PUT(mainq, cp, CBUF_WRITE);
	}
}

/*
 * Copy one page within the mapped range. The offset starts at 0 and
 * is relative to the first pfn. cp->buf + cp->off is the address of
 * the first pfn. If dump_pagecopy returns a UE offset, create an
 * error message. Returns the offset to the next pfn in the range
 * selected by the bitmap.
 */
static int
dumpsys_copy_page(helper_t *hp, int offset)
{
	cbuf_t *cp = hp->cpin;
	int ueoff;

	ASSERT(cp->off + offset + PAGESIZE <= cp->size);
	ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));

	ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);

	/* ueoff is the offset in the page to a UE error */
	if (ueoff != -1) {
		uint64_t pa = ptob(cp->pfn) + offset + ueoff;

		dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
		    CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
	}

	/*
	 * Advance bitnum and offset to the next input page for the
	 * next call to this function.
	 */
	offset += PAGESIZE;
	cp->bitnum++;
	while (cp->off + offset < cp->size) {
		if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
			break;
		offset += PAGESIZE;
		cp->bitnum++;
	}

	return (offset);
}

/*
 * Read the helper queue, and copy one mapped page. Return 0 when
 * done. Return 1 when a page has been copied into hp->page.
 */
static int
dumpsys_sread(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;

	/* CONSTCOND */
	while (1) {

		/* Find the next input buffer. */
		if (hp->cpin == NULL) {
			HRSTART(hp->perpage, inwait);

			/* CONSTCOND */
			while (1) {
				hp->cpin = CQ_GET(helperq);
				dump_timeleft = dump_timeout;

				/*
				 * NULL return means the helper queue
				 * is closed and empty.
				 */
				if (hp->cpin == NULL)
					break;

				/* Have input, check for dump I/O error. */
				if (!dump_ioerr)
					break;

				/*
				 * If an I/O error occurs, stay in the
				 * loop in order to empty the helper
				 * queue. Return the buffers to the
				 * main task to unmap and free them.
				 */
				hp->cpin->used = 0;
				CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			}
			HRSTOP(hp->perpage, inwait);

			/* Stop here when the helper queue is closed. */
			if (hp->cpin == NULL)
				break;

			/* Set the offset to 0 to get the first pfn. */
			hp->in = 0;

			/* Set the total processed to 0 */
			hp->used = 0;
		}

		/* Process the next page. */
		if (hp->used < hp->cpin->used) {

			/*
			 * Get the next page from the input buffer and
			 * return a copy.
			 */
			ASSERT(hp->in != -1);
			HRSTART(hp->perpage, copy);
			hp->in = dumpsys_copy_page(hp, hp->in);
			hp->used += PAGESIZE;
			HRSTOP(hp->perpage, copy);
			break;

		} else {

			/*
			 * Done with the input. Flush the VM and
			 * return the buffer to the main task.
			 */
			if (panicstr && hp->helper != MAINHELPER)
				hat_flush_range(kas.a_hat,
				    hp->cpin->buf, hp->cpin->size);
			dumpsys_errmsg(hp, NULL);
			CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			hp->cpin = NULL;
		}
	}

	return (hp->cpin != NULL);
}

/*
 * Compress size bytes starting at buf with bzip2
 * mode:
 *	BZ_RUN		add one more compressed page
 *	BZ_FINISH	no more input, flush the state
 */
static void
dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode)
{
	dumpsync_t *ds = hp->ds;
	const int CSIZE = sizeof (dumpcsize_t);
	bz_stream *ps = &hp->bzstream;
	int rc = 0;
	uint32_t csize;
	dumpcsize_t cs;

	/* Set input pointers to new input page */
	if (size > 0) {
		ps->avail_in = size;
		ps->next_in = buf;
	}

	/* CONSTCOND */
	while (1) {

		/* Quit when all input has been consumed */
		if (ps->avail_in == 0 && mode == BZ_RUN)
			break;

		/* Get a new output buffer */
		if (hp->cpout == NULL) {
			HRSTART(hp->perpage, outwait);
			hp->cpout = CQ_GET(freebufq);
			HRSTOP(hp->perpage, outwait);
			ps->avail_out = hp->cpout->size - CSIZE;
			ps->next_out = hp->cpout->buf + CSIZE;
		}

		/* Compress input, or finalize */
		HRSTART(hp->perpage, compress);
		rc = BZ2_bzCompress(ps, mode);
		HRSTOP(hp->perpage, compress);

		/* Check for error */
		if (mode == BZ_RUN && rc != BZ_RUN_OK) {
			dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n",
			    hp->helper, BZ2_bzErrorString(rc),
			    hp->cpin->pagenum);
			break;
		}

		/* Write the buffer if it is full, or we are flushing */
		if (ps->avail_out == 0 || mode == BZ_FINISH) {
			csize = hp->cpout->size - CSIZE - ps->avail_out;
			cs = DUMP_SET_TAG(csize, hp->tag);
			if (csize > 0) {
				(void) memcpy(hp->cpout->buf, &cs, CSIZE);
				dumpsys_swrite(hp, hp->cpout, csize + CSIZE);
				hp->cpout = NULL;
			}
		}

		/* Check for final complete */
		if (mode == BZ_FINISH) {
			if (rc == BZ_STREAM_END)
				break;
			if (rc != BZ_FINISH_OK) {
				dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n",
				    hp->helper, BZ2_bzErrorString(rc));
				break;
			}
		}
	}

	/* Cleanup state and buffers */
	if (mode == BZ_FINISH) {

		/* Reset state so that it is re-usable. */
		/* Reset state so that it is re-usable. */
		(void) BZ2_bzCompressReset(&hp->bzstream);

		/* Give any unused output buffer to the main task */
		if (hp->cpout != NULL) {
			hp->cpout->used = 0;
			CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG);
			hp->cpout = NULL;
		}
	}
}

static void
dumpsys_bz2compress(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;
	dumpstreamhdr_t sh;

	(void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
	sh.stream_pagenum = (pgcnt_t)-1;
	sh.stream_npages = 0;
	hp->cpin = NULL;
	hp->cpout = NULL;
	hp->cperr = NULL;
	hp->in = 0;
	hp->out = 0;
	hp->bzstream.avail_in = 0;

	/* Bump reference to mainq while we are running */
	CQ_OPEN(mainq);

	/* Get one page at a time */
	while (dumpsys_sread(hp)) {
		if (sh.stream_pagenum != hp->cpin->pagenum) {
			sh.stream_pagenum = hp->cpin->pagenum;
			sh.stream_npages = btop(hp->cpin->used);
			dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN);
		}
		dumpsys_bzrun(hp, hp->page, PAGESIZE, BZ_RUN);
	}

	/* Done with input, flush any partial buffer */
	if (sh.stream_pagenum != (pgcnt_t)-1) {
		dumpsys_bzrun(hp, NULL, 0, BZ_FINISH);
		dumpsys_errmsg(hp, NULL);
	}

	ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);

	/* Decrement main queue count, we are done */
	CQ_CLOSE(mainq);
}

/*
 * Compress with lzjb.
 * Write the stream block if it is full, or if size == 0.
 * If csize == 0, write a stream header; else write <csize, data>.
 * size == 0 is a call to flush a buffer.
 * hp->cpout is the buffer we are flushing or filling.
 * hp->out is the next index to fill data.
 * osize is either csize + data, or the size of a stream header.
 */
static void
dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size)
{
	dumpsync_t *ds = hp->ds;
	const int CSIZE = sizeof (dumpcsize_t);
	dumpcsize_t cs;
	size_t osize = csize > 0 ? CSIZE + size : size;

	/* If flush, and there is no buffer, just return */
	if (size == 0 && hp->cpout == NULL)
		return;

	/* If flush, or cpout is full, write it out */
	if (size == 0 ||
	    (hp->cpout != NULL && hp->out + osize > hp->cpout->size)) {

		/* Set tag+size word at the front of the stream block. */
		cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag);
		(void) memcpy(hp->cpout->buf, &cs, CSIZE);

		/* Write block to dump file. */
		dumpsys_swrite(hp, hp->cpout, hp->out);

		/* Clear pointer to indicate we need a new buffer */
		hp->cpout = NULL;

		/* flushing, we are done */
		if (size == 0)
			return;
	}

	/* Get an output buffer if we don't have one. */
	if (hp->cpout == NULL) {
		HRSTART(hp->perpage, outwait);
		hp->cpout = CQ_GET(freebufq);
		HRSTOP(hp->perpage, outwait);
		hp->out = CSIZE;
	}

	/* Store the csize word. This is the size of compressed data. */
	if (csize > 0) {
		cs = DUMP_SET_TAG(csize, 0);
		(void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE);
		hp->out += CSIZE;
	}
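
	/*
	 * Illustrative sketch of the resulting stream block layout
	 * (see dumphdr.h for the authoritative dumpcsize_t encoding):
	 *
	 *	+----------+----------+--------+----------+--------+--
	 *	| tag+size | csize #1 | data 1 | csize #2 | data 2 | ...
	 *	+----------+----------+--------+----------+--------+--
	 *
	 * The leading tag+size word is patched in by the flush path
	 * above; each <csize, data> record is appended below.
	 */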
	/* Store the data. */
	(void) memcpy(hp->cpout->buf + hp->out, buf, size);
	hp->out += size;
}

static void
dumpsys_lzjbcompress(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;
	size_t csize;
	dumpstreamhdr_t sh;

	(void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC);
	sh.stream_pagenum = (pfn_t)-1;
	sh.stream_npages = 0;
	hp->cpin = NULL;
	hp->cpout = NULL;
	hp->cperr = NULL;
	hp->in = 0;
	hp->out = 0;

	/* Bump reference to mainq while we are running */
	CQ_OPEN(mainq);

	/* Get one page at a time */
	while (dumpsys_sread(hp)) {

		/* Create a stream header for each new input map */
		if (sh.stream_pagenum != hp->cpin->pagenum) {
			sh.stream_pagenum = hp->cpin->pagenum;
			sh.stream_npages = btop(hp->cpin->used);
			dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh));
		}

		/* Compress one page */
		HRSTART(hp->perpage, compress);
		csize = compress(hp->page, hp->lzbuf, PAGESIZE);
		HRSTOP(hp->perpage, compress);

		/* Add csize+data to output block */
		ASSERT(csize > 0 && csize <= PAGESIZE);
		dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize);
	}

	/* Done with input, flush any partial buffer */
	if (sh.stream_pagenum != (pfn_t)-1) {
		dumpsys_lzjbrun(hp, 0, NULL, 0);
		dumpsys_errmsg(hp, NULL);
	}

	ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL);

	/* Decrement main queue count, we are done */
	CQ_CLOSE(mainq);
}

/*
 * Dump helper called from panic_idle() to compress pages. CPUs in
 * this path must not call most kernel services.
 *
 * During panic, all but one of the CPUs are idle. These CPUs are used
 * as helpers working in parallel to copy and compress memory
 * pages. During a panic, however, these processors cannot call any
 * kernel services. This is because mutexes become no-ops during
 * panic, and cross-call interrupts are inhibited. Therefore, during
 * panic dump the helper CPUs communicate with the panic CPU using
 * memory variables. All memory mapping and I/O is performed by the
 * panic CPU.
 *
 * At dump configuration time, helper_lock is set and helpers_wanted
 * is 0. dumpsys() decides whether to set helpers_wanted before
 * clearing helper_lock.
 *
 * At panic time, idle CPUs spin-wait on helper_lock, then alternately
 * take the lock and become a helper, or return.
 */
void
dumpsys_helper()
{
	if (!dumpcfg.helper_present)
		dumpcfg.helper_present = 1;
	dumpsys_spinlock(&dumpcfg.helper_lock);
	if (dumpcfg.helpers_wanted) {
		helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];

		for (hp = dumpcfg.helper; hp != hpend; hp++) {
			if (hp->helper == FREEHELPER) {
				hp->helper = CPU->cpu_id;
				BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);

				dumpsys_spinunlock(&dumpcfg.helper_lock);

				if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
					dumpsys_lzjbcompress(hp);
				else
					dumpsys_bz2compress(hp);

				hp->helper = DONEHELPER;
				return;
			}
		}

		/* No more helpers are needed. */
		dumpcfg.helpers_wanted = 0;
	}
	dumpsys_spinunlock(&dumpcfg.helper_lock);
}
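
/*
 * Illustrative timeline of the handshake described above (a sketch,
 * not additional state):
 *
 *	configuration:	helper_lock locked, helpers_wanted == 0
 *	panic:		idle CPUs spin on helper_lock in dumpsys_helper()
 *	dumpsys():	sets helpers_wanted if clevel > 0, then unlocks
 *	idle CPUs:	each claims a FREEHELPER slot and compresses, or
 *			finds helpers_wanted clear and returns
 */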

/*
 * No-wait helper callable in spin loops.
 *
 * Do not wait for helper_lock. Just check helpers_wanted. The caller
 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s"
 * case.
 */
void
dumpsys_helper_nw()
{
	if (!dumpcfg.helper_present)
		dumpcfg.helper_present = 1;
	if (dumpcfg.helpers_wanted)
		dumpsys_helper();
}

/*
 * Dump helper for live dumps.
 * These run as a system task.
 */
static void
dumpsys_live_helper(void *arg)
{
	helper_t *hp = arg;

	BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid);
	if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
		dumpsys_lzjbcompress(hp);
	else
		dumpsys_bz2compress(hp);
}

/*
 * Compress one page with lzjb (single threaded case)
 */
static void
dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp)
{
	dumpsync_t *ds = hp->ds;
	uint32_t csize;

	hp->helper = MAINHELPER;
	hp->in = 0;
	hp->used = 0;
	hp->cpin = cp;
	while (hp->used < cp->used) {
		HRSTART(hp->perpage, copy);
		hp->in = dumpsys_copy_page(hp, hp->in);
		hp->used += PAGESIZE;
		HRSTOP(hp->perpage, copy);

		HRSTART(hp->perpage, compress);
		csize = compress(hp->page, hp->lzbuf, PAGESIZE);
		HRSTOP(hp->perpage, compress);

		HRSTART(hp->perpage, write);
		dumpvp_write(&csize, sizeof (csize));
		dumpvp_write(hp->lzbuf, csize);
		HRSTOP(hp->perpage, write);
	}
	CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
	hp->cpin = NULL;
}

/*
 * Main task to dump pages. This is called on the dump CPU.
 */
static void
dumpsys_main_task(void *arg)
{
	dumpsync_t *ds = arg;
	pgcnt_t pagenum = 0, bitnum = 0, hibitnum;
	dumpmlw_t mlw;
	cbuf_t *cp;
	pgcnt_t baseoff, pfnoff;
	pfn_t base, pfn;
	int sec;

	dump_init_memlist_walker(&mlw);

	/* CONSTCOND */
	while (1) {

		if (ds->percent > ds->percent_done) {
			ds->percent_done = ds->percent;
			sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000;
			uprintf("^\r%2d:%02d %3d%% done",
			    sec / 60, sec % 60, ds->percent);
			ds->neednl = 1;
		}

		while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) {

			/* the writerq never blocks */
			cp = CQ_GET(writerq);
			if (cp == NULL)
				break;

			dump_timeleft = dump_timeout;

			HRSTART(ds->perpage, write);
			dumpvp_write(cp->buf, cp->used);
			HRSTOP(ds->perpage, write);

			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
		}

		/*
		 * Wait here for some buffers to process. Returns NULL
		 * when all helpers have terminated and all buffers
		 * have been processed.
		 */
		cp = CQ_GET(mainq);

		if (cp == NULL) {

			/* Drain the write queue. */
			if (!CQ_IS_EMPTY(writerq))
				continue;

			/* Main task exits here. */
			break;
		}

		dump_timeleft = dump_timeout;

		switch (cp->state) {

		case CBUF_FREEMAP:

			/*
			 * Note that we drop CBUF_FREEMAP buffers on
			 * the floor (they will not be on any cqueue)
			 * when we no longer need them.
			 */
			if (bitnum >= dumpcfg.bitmapsize)
				break;

			if (dump_ioerr) {
				bitnum = dumpcfg.bitmapsize;
				CQ_CLOSE(helperq);
				break;
			}

			HRSTART(ds->perpage, bitmap);
			for (; bitnum < dumpcfg.bitmapsize; bitnum++)
				if (BT_TEST(dumpcfg.bitmap, bitnum))
					break;
			HRSTOP(ds->perpage, bitmap);
			dump_timeleft = dump_timeout;

			if (bitnum >= dumpcfg.bitmapsize) {
				CQ_CLOSE(helperq);
				break;
			}

			/*
			 * Try to map CBUF_MAPSIZE ranges. Can't
			 * assume that memory segment size is a
			 * multiple of CBUF_MAPSIZE. Can't assume that
			 * the segment starts on a CBUF_MAPSIZE
			 * boundary.
			 */
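			/*
			 * Worked example (illustrative, assuming 4 KB
			 * pages so that CBUF_MAPNP == 1024): if the
			 * first set pfn is 5000 and its segment begins
			 * at pfn 4500, then base = P2ALIGN(5000, 1024)
			 * = 4096 lies below the segment, so base is
			 * pulled up to 4500, giving baseoff = 404 and
			 * pfnoff = 500. The window then maps pfns
			 * 4500..5119, ending at the next CBUF_MAPNP
			 * boundary.
			 */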
			pfn = dump_bitnum_to_pfn(bitnum, &mlw);
			ASSERT(pfn != PFN_INVALID);
			ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize);

			base = P2ALIGN(pfn, CBUF_MAPNP);
			if (base < mlw.mpaddr) {
				base = mlw.mpaddr;
				baseoff = P2PHASE(base, CBUF_MAPNP);
			} else {
				baseoff = 0;
			}

			pfnoff = pfn - base;
			if (pfnoff + mlw.mpleft < CBUF_MAPNP) {
				hibitnum = bitnum + mlw.mpleft;
				cp->size = ptob(pfnoff + mlw.mpleft);
			} else {
				hibitnum = bitnum - pfnoff + CBUF_MAPNP -
				    baseoff;
				cp->size = CBUF_MAPSIZE - ptob(baseoff);
			}

			cp->pfn = pfn;
			cp->bitnum = bitnum++;
			cp->pagenum = pagenum++;
			cp->off = ptob(pfnoff);

			for (; bitnum < hibitnum; bitnum++)
				if (BT_TEST(dumpcfg.bitmap, bitnum))
					pagenum++;

			dump_timeleft = dump_timeout;
			cp->used = ptob(pagenum - cp->pagenum);

			HRSTART(ds->perpage, map);
			hat_devload(kas.a_hat, cp->buf, cp->size, base,
			    PROT_READ, HAT_LOAD_NOCONSIST);
			HRSTOP(ds->perpage, map);

			ds->pages_mapped += btop(cp->size);
			ds->pages_used += pagenum - cp->pagenum;

			CQ_OPEN(mainq);

			/*
			 * If there are no helpers the main task does
			 * non-streams lzjb compress.
			 */
			if (dumpcfg.clevel == 0) {
				dumpsys_lzjb_page(dumpcfg.helper, cp);
				break;
			}

			/* pass mapped pages to a helper */
			CQ_PUT(helperq, cp, CBUF_INREADY);

			/* the last page was done */
			if (bitnum >= dumpcfg.bitmapsize)
				CQ_CLOSE(helperq);

			break;

		case CBUF_USEDMAP:

			ds->npages += btop(cp->used);

			HRSTART(ds->perpage, unmap);
			hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD);
			HRSTOP(ds->perpage, unmap);

			if (bitnum < dumpcfg.bitmapsize)
				CQ_PUT(mainq, cp, CBUF_FREEMAP);
			CQ_CLOSE(mainq);

			ASSERT(ds->npages <= dumphdr->dump_npages);
			ds->percent = ds->npages * 100LL / dumphdr->dump_npages;
			break;

		case CBUF_WRITE:

			CQ_PUT(writerq, cp, CBUF_WRITE);
			break;

		case CBUF_ERRMSG:

			if (cp->used > 0) {
				cp->buf[cp->size - 2] = '\n';
				cp->buf[cp->size - 1] = '\0';
				if (ds->neednl) {
					uprintf("\n%s", cp->buf);
					ds->neednl = 0;
				} else {
					uprintf("%s", cp->buf);
				}
				/* wait for console output */
				drv_usecwait(200000);
				dump_timeleft = dump_timeout;
			}
			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
			break;

		default:
			uprintf("dump: unexpected buffer state %d, "
			    "buffer will be lost\n", cp->state);
			break;

		}	/* end switch */

	}	/* end while(1) */
}

#ifdef COLLECT_METRICS
size_t
dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size)
{
	dumpcfg_t *cfg = &dumpcfg;
	int myid = CPU->cpu_seqid;
	int i, compress_ratio;
	int sec, iorate;
	helper_t *hp, *hpend = &cfg->helper[cfg->nhelper];
	char *e = buf + size;
	char *p = buf;

	sec = ds->elapsed / (1000 * 1000 * 1000ULL);
	if (sec < 1)
		sec = 1;

	if (ds->iotime < 1)
		ds->iotime = 1;
	iorate = (ds->nwrite * 100000ULL) / ds->iotime;

	compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1);
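
	/*
	 * Both iorate and compress_ratio are fixed-point values scaled
	 * by 100 and printed below with "%d.%02d". Illustrative
	 * example: a compress_ratio of 412 prints as "4.12", i.e.
	 * roughly four input pages per page of output written.
	 */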
#define	P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0)

	P("Master cpu_seqid,%d\n", CPU->cpu_seqid);
	P("Master cpu_id,%d\n", CPU->cpu_id);
	P("dump_flags,0x%x\n", dumphdr->dump_flags);
	P("dump_ioerr,%d\n", dump_ioerr);

	P("Helpers:\n");
	for (i = 0; i < ncpus; i++) {
		if ((i & 15) == 0)
			P(",,%03d,", i);
		if (i == myid)
			P("   M");
		else if (BT_TEST(cfg->helpermap, i))
			P("%4d", cpu_seq[i]->cpu_id);
		else
			P("   *");
		if ((i & 15) == 15)
			P("\n");
	}

	P("ncbuf_used,%d\n", cfg->ncbuf_used);
	P("ncmap,%d\n", cfg->ncmap);

	P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m);
	P("Found small pages,%ld\n", cfg->foundsm);

	P("Compression level,%d\n", cfg->clevel);
	P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel",
	    cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb");
	P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio %
	    100);
	P("nhelper_used,%d\n", cfg->nhelper_used);

	P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100);
	P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite);
	P("..total nsec,%lld\n", (u_longlong_t)ds->iotime);
	P("dumpbuf.iosize,%ld\n", dumpbuf.iosize);
	P("dumpbuf.size,%ld\n", dumpbuf.size);

	P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec);
	P("Dump pages,%llu\n", (u_longlong_t)ds->npages);
	P("Dump time,%d\n", sec);

	if (ds->pages_mapped > 0)
		P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used)
		    / ds->pages_mapped));

	P("\nPer-page metrics:\n");
	if (ds->npages > 0) {
		for (hp = cfg->helper; hp != hpend; hp++) {
#define	PERPAGE(x)	ds->perpage.x += hp->perpage.x;
			PERPAGES;
#undef PERPAGE
		}
#define	PERPAGE(x) \
		P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages));
		PERPAGES;
#undef PERPAGE
		P("freebufq.empty,%d\n", (int)(ds->freebufq.empty /
		    ds->npages));
		P("helperq.empty,%d\n", (int)(ds->helperq.empty /
		    ds->npages));
		P("writerq.empty,%d\n", (int)(ds->writerq.empty /
		    ds->npages));
		P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages));

		P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait /
		    ds->npages));
	}
#undef P
	if (p < e)
		bzero(p, e - p);
	return (p - buf);
}
#endif	/* COLLECT_METRICS */

/*
 * Dump the system.
 */
void
dumpsys(void)
{
	dumpsync_t *ds = &dumpsync;
	taskq_t *livetaskq = NULL;
	pfn_t pfn;
	pgcnt_t bitnum;
	proc_t *p;
	helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];
	cbuf_t *cp;
	pid_t npids, pidx;
	char *content;
	char *buf;
	size_t size;
	int save_dump_clevel;
	dumpmlw_t mlw;
	dumpcsize_t datatag;
	dumpdatahdr_t datahdr;

	if (dumpvp == NULL || dumphdr == NULL) {
		uprintf("skipping system dump - no dump device configured\n");
		if (panicstr) {
			dumpcfg.helpers_wanted = 0;
			dumpsys_spinunlock(&dumpcfg.helper_lock);
		}
		return;
	}
	dumpbuf.cur = dumpbuf.start;

	/* clear the sync variables */
	ASSERT(dumpcfg.nhelper > 0);
	bzero(ds, sizeof (*ds));
	ds->dumpcpu = CPU->cpu_id;

	/*
	 * Calculate the starting block for dump. If we're dumping on a
	 * swap device, start 1/5 of the way in; otherwise, start at the
	 * beginning. And never use the first page -- it may be a disk label.
	 */
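	/*
	 * (Illustrative note: P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET)
	 * rounds the one-fifth mark up to a DUMP_OFFSET boundary, the
	 * intent presumably being to keep the dump away from swap
	 * blocks allocated near the front of the device.)
	 */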
	if (dumpvp->v_flag & VISSWAP)
		dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET);
	else
		dumphdr->dump_start = DUMP_OFFSET;

	dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED;
	dumphdr->dump_crashtime = gethrestime_sec();
	dumphdr->dump_npages = 0;
	dumphdr->dump_nvtop = 0;
	bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize));
	dump_timeleft = dump_timeout;

	if (panicstr) {
		dumphdr->dump_flags &= ~DF_LIVE;
		(void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL);
		(void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL);
		(void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE,
		    panicstr, panicargs);
	}

	if (dump_conflags & DUMP_ALL)
		content = "all";
	else if (dump_conflags & DUMP_CURPROC)
		content = "kernel + curproc";
	else
		content = "kernel";
	uprintf("dumping to %s, offset %lld, content: %s\n", dumppath,
	    dumphdr->dump_start, content);

	/* Make sure nodename is current */
	bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN);

	/*
	 * If this is a live dump, try to open a VCHR vnode for better
	 * performance. We must take care to flush the buffer cache
	 * first.
	 */
	if (!panicstr) {
		vnode_t *cdev_vp, *cmn_cdev_vp;

		ASSERT(dumpbuf.cdev_vp == NULL);
		cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR);
		if (cdev_vp != NULL) {
			cmn_cdev_vp = common_specvp(cdev_vp);
			if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL)
			    == 0) {
				if (vn_has_cached_data(dumpvp))
					(void) pvn_vplist_dirty(dumpvp, 0, NULL,
					    B_INVAL | B_TRUNC, kcred);
				dumpbuf.cdev_vp = cmn_cdev_vp;
			} else {
				VN_RELE(cdev_vp);
			}
		}
	}

	/*
	 * Store a hires timestamp so we can look it up during debugging.
	 */
	lbolt_debug_entry();

	/*
	 * Leave room for the message and ereport save areas and terminal dump
	 * header.
	 */
	dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET -
	    DUMP_ERPTSIZE;

	/*
	 * Write out the symbol table. It's no longer compressed,
	 * so its 'size' and 'csize' are equal.
	 */
	dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE;
	dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize =
	    ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX);

	/*
	 * Write out the translation map.
	 */
	dumphdr->dump_map = dumpvp_flush();
	dump_as(&kas);
	dumphdr->dump_nvtop += dump_plat_addr();

	/*
	 * Call into hat, which may have unmapped pages that also need to
	 * be in the dump.
	 */
	hat_dump();

	if (dump_conflags & DUMP_ALL) {
		mutex_enter(&pidlock);

		for (npids = 0, p = practive; p != NULL; p = p->p_next)
			dumpcfg.pids[npids++] = p->p_pid;

		mutex_exit(&pidlock);

		for (pidx = 0; pidx < npids; pidx++)
			(void) dump_process(dumpcfg.pids[pidx]);

		dump_init_memlist_walker(&mlw);
		for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
			dump_timeleft = dump_timeout;
			pfn = dump_bitnum_to_pfn(bitnum, &mlw);
			/*
			 * Some hypervisors do not have all pages available to
			 * be accessed by the guest OS. Check for page
			 * accessibility.
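			 *
			 * (For instance, pages grabbed by a hypervisor
			 * balloon driver may still appear in
			 * phys_install while no longer being backed by
			 * real memory; plat_hold_page() is presumed to
			 * report such pages as not OK to touch.)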
			 */
			if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) !=
			    PLAT_HOLD_OK)
				continue;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
		dumphdr->dump_npages = dumpcfg.bitmapsize;
		dumphdr->dump_flags |= DF_ALL;

	} else if (dump_conflags & DUMP_CURPROC) {
		/*
		 * Determine which pid is to be dumped. If we're panicking, we
		 * dump the process associated with panic_thread (if any). If
		 * this is a live dump, we dump the process associated with
		 * curthread.
		 */
		npids = 0;
		if (panicstr) {
			if (panic_thread != NULL &&
			    panic_thread->t_procp != NULL &&
			    panic_thread->t_procp != &p0) {
				dumpcfg.pids[npids++] =
				    panic_thread->t_procp->p_pid;
			}
		} else {
			dumpcfg.pids[npids++] = curthread->t_procp->p_pid;
		}

		if (npids && dump_process(dumpcfg.pids[0]) == 0)
			dumphdr->dump_flags |= DF_CURPROC;
		else
			dumphdr->dump_flags |= DF_KERNEL;

	} else {
		dumphdr->dump_flags |= DF_KERNEL;
	}

	dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1;

	/*
	 * Write out the pfn table.
	 */
	dumphdr->dump_pfn = dumpvp_flush();
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) {
		dump_timeleft = dump_timeout;
		if (!BT_TEST(dumpcfg.bitmap, bitnum))
			continue;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);
		dumpvp_write(&pfn, sizeof (pfn_t));
	}
	dump_plat_pfn();

	/*
	 * Write out all the pages.
	 * Map pages, copy them handling UEs, compress, and write them out.
	 * Cooperate with any helpers running on CPUs in panic_idle().
	 */
	dumphdr->dump_data = dumpvp_flush();

	bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU));
	ds->live = dumpcfg.clevel > 0 &&
	    (dumphdr->dump_flags & DF_LIVE) != 0;

	save_dump_clevel = dumpcfg.clevel;
	if (panicstr)
		dumpsys_get_maxmem();
	else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
		dumpcfg.clevel = DUMP_CLEVEL_LZJB;

	dumpcfg.nhelper_used = 0;
	for (hp = dumpcfg.helper; hp != hpend; hp++) {
		if (hp->page == NULL) {
			hp->helper = DONEHELPER;
			continue;
		}
		++dumpcfg.nhelper_used;
		hp->helper = FREEHELPER;
		hp->taskqid = NULL;
		hp->ds = ds;
		bzero(&hp->perpage, sizeof (hp->perpage));
		if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2)
			(void) BZ2_bzCompressReset(&hp->bzstream);
	}

	CQ_OPEN(freebufq);
	CQ_OPEN(helperq);

	dumpcfg.ncbuf_used = 0;
	for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) {
		if (cp->buf != NULL) {
			CQ_PUT(freebufq, cp, CBUF_FREEBUF);
			++dumpcfg.ncbuf_used;
		}
	}

	for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++)
		CQ_PUT(mainq, cp, CBUF_FREEMAP);

	ds->start = gethrtime();
	ds->iowaitts = ds->start;

	/* start helpers */
	if (ds->live) {
		int n = dumpcfg.nhelper_used;
		int pri = MINCLSYSPRI - 25;

		livetaskq = taskq_create("LiveDump", n, pri, n, n,
		    TASKQ_PREPOPULATE);
		for (hp = dumpcfg.helper; hp != hpend; hp++) {
			if (hp->page == NULL)
				continue;
			hp->helper = hp - dumpcfg.helper;
			hp->taskqid = taskq_dispatch(livetaskq,
			    dumpsys_live_helper, (void *)hp, TQ_NOSLEEP);
		}

	} else {
		if (panicstr)
			kmem_dump_begin();
		dumpcfg.helpers_wanted = dumpcfg.clevel > 0;
		dumpsys_spinunlock(&dumpcfg.helper_lock);
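
		/*
		 * (Descriptive note: kmem_dump_begin() redirects kmem
		 * allocations to the heap pre-reserved for dump time;
		 * dropping helper_lock then releases the idle CPUs
		 * spinning in dumpsys_helper() so they can claim
		 * helper slots.)
		 */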
	}

	/* run main task */
	dumpsys_main_task(ds);

	ds->elapsed = gethrtime() - ds->start;
	if (ds->elapsed < 1)
		ds->elapsed = 1;

	if (livetaskq != NULL)
		taskq_destroy(livetaskq);

	if (ds->neednl) {
		uprintf("\n");
		ds->neednl = 0;
	}

	/* record actual pages dumped */
	dumphdr->dump_npages = ds->npages;

	/* platform-specific data */
	dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf);

	/* note any errors by clearing DF_COMPLETE */
	if (dump_ioerr || ds->npages < dumphdr->dump_npages)
		dumphdr->dump_flags &= ~DF_COMPLETE;

	/* end of stream blocks */
	datatag = 0;
	dumpvp_write(&datatag, sizeof (datatag));

	bzero(&datahdr, sizeof (datahdr));

	/* buffer for metrics */
	buf = dumpcfg.cbuf[0].buf;
	size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) -
	    sizeof (dumpdatahdr_t));

	/* finish the kmem intercepts, collect kmem verbose info */
	if (panicstr) {
		datahdr.dump_metrics = kmem_dump_finish(buf, size);
		buf += datahdr.dump_metrics;
		size -= datahdr.dump_metrics;
	}

	/* compression info in data header */
	datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC;
	datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION;
	datahdr.dump_maxcsize = CBUF_SIZE;
	datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE;
	datahdr.dump_nstreams = dumpcfg.nhelper_used;
	datahdr.dump_clevel = dumpcfg.clevel;
#ifdef COLLECT_METRICS
	if (dump_metrics_on)
		datahdr.dump_metrics += dumpsys_metrics(ds, buf, size);
#endif
	datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data;

	/*
	 * Write out the initial and terminal dump headers.
	 */
	dumpbuf.vp_off = dumphdr->dump_start;
	dumpvp_write(dumphdr, sizeof (dumphdr_t));
	(void) dumpvp_flush();

	dumpbuf.vp_limit = dumpvp_size;
	dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET;
	dumpvp_write(dumphdr, sizeof (dumphdr_t));
	dumpvp_write(&datahdr, sizeof (dumpdatahdr_t));
	dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics);

	(void) dumpvp_flush();

	uprintf("\r%3d%% done: %llu pages dumped, ",
	    ds->percent_done, (u_longlong_t)ds->npages);

	if (dump_ioerr == 0) {
		uprintf("dump succeeded\n");
	} else {
		uprintf("dump failed: error %d\n", dump_ioerr);
#ifdef DEBUG
		if (panicstr)
			debug_enter("dump failed");
#endif
	}

	/*
	 * Write out all undelivered messages. This has to be the *last*
	 * thing we do because the dump process itself emits messages.
	 */
	if (panicstr) {
		dump_ereports();
		dump_messages();
	}

	delay(2 * hz);	/* let people see the 'done' message */
	dump_timeleft = 0;
	dump_ioerr = 0;

	/* restore settings after live dump completes */
	if (!panicstr) {
		dumpcfg.clevel = save_dump_clevel;

		/* release any VCHR open of the dump device */
		if (dumpbuf.cdev_vp != NULL) {
			(void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
			VN_RELE(dumpbuf.cdev_vp);
			dumpbuf.cdev_vp = NULL;
		}
	}
}

/*
 * This function is called whenever the memory size, as represented
 * by the phys_install list, changes.
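 *
 * (Illustrative note: the dynamic reconfiguration / memory hotplug
 * code is one such caller, invoking dump_resize() after it updates
 * phys_install.)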
 */
void
dump_resize()
{
	mutex_enter(&dump_lock);
	dumphdr_init();
	dumpbuf_resize();
	dump_update_clevel();
	mutex_exit(&dump_lock);
}

/*
 * This function allows for dynamic resizing of a dump area. It assumes that
 * the underlying device has already updated its size(9P) property.
 */
int
dumpvp_resize()
{
	int error;
	vattr_t vattr;

	mutex_enter(&dump_lock);
	vattr.va_mask = AT_SIZE;
	if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) {
		mutex_exit(&dump_lock);
		return (error);
	}

	if (vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) {
		mutex_exit(&dump_lock);
		return (ENOSPC);
	}

	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	mutex_exit(&dump_lock);
	return (0);
}