1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/vm.h> 31 #include <sys/proc.h> 32 #include <sys/file.h> 33 #include <sys/conf.h> 34 #include <sys/kmem.h> 35 #include <sys/mem.h> 36 #include <sys/mman.h> 37 #include <sys/vnode.h> 38 #include <sys/errno.h> 39 #include <sys/memlist.h> 40 #include <sys/dumphdr.h> 41 #include <sys/dumpadm.h> 42 #include <sys/ksyms.h> 43 #include <sys/compress.h> 44 #include <sys/stream.h> 45 #include <sys/strsun.h> 46 #include <sys/cmn_err.h> 47 #include <sys/bitmap.h> 48 #include <sys/modctl.h> 49 #include <sys/utsname.h> 50 #include <sys/systeminfo.h> 51 #include <sys/vmem.h> 52 #include <sys/log.h> 53 #include <sys/var.h> 54 #include <sys/debug.h> 55 #include <sys/sunddi.h> 56 #include <fs/fs_subr.h> 57 #include <sys/fs/snode.h> 58 #include <sys/ontrap.h> 59 #include <sys/panic.h> 60 #include <sys/dkio.h> 61 #include <sys/vtoc.h> 62 #include <sys/errorq.h> 63 #include <sys/fm/util.h> 64 #include <sys/fs/zfs.h> 65 66 #include <vm/hat.h> 67 #include <vm/as.h> 68 #include <vm/page.h> 69 #include <vm/pvn.h> 70 #include <vm/seg.h> 71 #include <vm/seg_kmem.h> 72 #include <sys/clock_impl.h> 73 #include <sys/hold_page.h> 74 75 #include <bzip2/bzlib.h> 76 77 /* 78 * Crash dump time is dominated by disk write time. To reduce this, 79 * the stronger compression method bzip2 is applied to reduce the dump 80 * size and hence reduce I/O time. However, bzip2 is much more 81 * computationally expensive than the existing lzjb algorithm, so to 82 * avoid increasing compression time, CPUs that are otherwise idle 83 * during panic are employed to parallelize the compression task. 84 * Many helper CPUs are needed to prevent bzip2 from being a 85 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is 86 * parallelized instead. Lastly, I/O and compression are performed by 87 * different CPUs, and are hence overlapped in time, unlike the older 88 * serial code. 89 * 90 * Another important consideration is the speed of the dump 91 * device. Faster disks need less CPUs in order to benefit from 92 * parallel lzjb versus parallel bzip2. Therefore, the CPU count 93 * threshold for switching from parallel lzjb to paralled bzip2 is 94 * elevated for faster disks. The dump device speed is adduced from 95 * the setting for dumpbuf.iosize, see dump_update_clevel. 96 */ 97 98 /* 99 * exported vars 100 */ 101 kmutex_t dump_lock; /* lock for dump configuration */ 102 dumphdr_t *dumphdr; /* dump header */ 103 int dump_conflags = DUMP_KERNEL; /* dump configuration flags */ 104 vnode_t *dumpvp; /* dump device vnode pointer */ 105 u_offset_t dumpvp_size; /* size of dump device, in bytes */ 106 char *dumppath; /* pathname of dump device */ 107 int dump_timeout = 120; /* timeout for dumping pages */ 108 int dump_timeleft; /* portion of dump_timeout remaining */ 109 int dump_ioerr; /* dump i/o error */ 110 int dump_check_used; /* enable check for used pages */ 111 112 /* 113 * Tunables for dump compression and parallelism. These can be set via 114 * /etc/system. 115 * 116 * dump_ncpu_low number of helpers for parallel lzjb 117 * This is also the minimum configuration. 118 * 119 * dump_bzip2_level bzip2 compression level: 1-9 120 * Higher numbers give greater compression, but take more memory 121 * and time. Memory used per helper is ~(dump_bzip2_level * 1MB). 122 * 123 * dump_plat_mincpu the cross-over limit for using bzip2 (per platform): 124 * if dump_plat_mincpu == 0, then always do single threaded dump 125 * if ncpu >= dump_plat_mincpu then try to use bzip2 126 * 127 * dump_metrics_on if set, metrics are collected in the kernel, passed 128 * to savecore via the dump file, and recorded by savecore in 129 * METRICS.txt. 130 */ 131 uint_t dump_ncpu_low = 4; /* minimum config for parallel lzjb */ 132 uint_t dump_bzip2_level = 1; /* bzip2 level (1-9) */ 133 134 /* tunables for pre-reserved heap */ 135 uint_t dump_kmem_permap = 1024; 136 uint_t dump_kmem_pages = 8; 137 138 /* Define multiple buffers per helper to avoid stalling */ 139 #define NCBUF_PER_HELPER 2 140 #define NCMAP_PER_HELPER 4 141 142 /* minimum number of helpers configured */ 143 #define MINHELPERS (dump_ncpu_low) 144 #define MINCBUFS (MINHELPERS * NCBUF_PER_HELPER) 145 146 /* 147 * Define constant parameters. 148 * 149 * CBUF_SIZE size of an output buffer 150 * 151 * CBUF_MAPSIZE size of virtual range for mapping pages 152 * 153 * CBUF_MAPNP size of virtual range in pages 154 * 155 */ 156 #define DUMP_1KB ((size_t)1 << 10) 157 #define DUMP_1MB ((size_t)1 << 20) 158 #define CBUF_SIZE ((size_t)1 << 17) 159 #define CBUF_MAPSHIFT (22) 160 #define CBUF_MAPSIZE ((size_t)1 << CBUF_MAPSHIFT) 161 #define CBUF_MAPNP ((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT)) 162 163 /* 164 * Compression metrics are accumulated nano-second subtotals. The 165 * results are normalized by the number of pages dumped. A report is 166 * generated when dumpsys() completes and is saved in the dump image 167 * after the trailing dump header. 168 * 169 * Metrics are always collected. Set the variable dump_metrics_on to 170 * cause metrics to be saved in the crash file, where savecore will 171 * save it in the file METRICS.txt. 172 */ 173 #define PERPAGES \ 174 PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \ 175 PERPAGE(copy) PERPAGE(compress) \ 176 PERPAGE(write) \ 177 PERPAGE(inwait) PERPAGE(outwait) 178 179 typedef struct perpage { 180 #define PERPAGE(x) hrtime_t x; 181 PERPAGES 182 #undef PERPAGE 183 } perpage_t; 184 185 /* 186 * This macro controls the code generation for collecting dump 187 * performance information. By default, the code is generated, but 188 * automatic saving of the information is disabled. If dump_metrics_on 189 * is set to 1, the timing information is passed to savecore via the 190 * crash file, where it is appended to the file dump-dir/METRICS.txt. 191 */ 192 #define COLLECT_METRICS 193 194 #ifdef COLLECT_METRICS 195 uint_t dump_metrics_on = 0; /* set to 1 to enable recording metrics */ 196 197 #define HRSTART(v, m) v##ts.m = gethrtime() 198 #define HRSTOP(v, m) v.m += gethrtime() - v##ts.m 199 #define HRBEGIN(v, m, s) v##ts.m = gethrtime(); v.size += s 200 #define HREND(v, m) v.m += gethrtime() - v##ts.m 201 #define HRNORM(v, m, n) v.m /= (n) 202 203 #else 204 #define HRSTART(v, m) 205 #define HRSTOP(v, m) 206 #define HRBEGIN(v, m, s) 207 #define HREND(v, m) 208 #define HRNORM(v, m, n) 209 #endif /* COLLECT_METRICS */ 210 211 /* 212 * Buffers for copying and compressing memory pages. 213 * 214 * cbuf_t buffer controllers: used for both input and output. 215 * 216 * The buffer state indicates how it is being used: 217 * 218 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for 219 * mapping input pages. 220 * 221 * CBUF_INREADY: input pages are mapped and ready for compression by a 222 * helper. 223 * 224 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap. 225 * 226 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available. 227 * 228 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper, 229 * ready to write out. 230 * 231 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper 232 * (reports UE errors.) 233 */ 234 235 typedef enum cbufstate { 236 CBUF_FREEMAP, 237 CBUF_INREADY, 238 CBUF_USEDMAP, 239 CBUF_FREEBUF, 240 CBUF_WRITE, 241 CBUF_ERRMSG 242 } cbufstate_t; 243 244 typedef struct cbuf cbuf_t; 245 246 struct cbuf { 247 cbuf_t *next; /* next in list */ 248 cbufstate_t state; /* processing state */ 249 size_t used; /* amount used */ 250 size_t size; /* mem size */ 251 char *buf; /* kmem or vmem */ 252 pgcnt_t pagenum; /* index to pfn map */ 253 pgcnt_t bitnum; /* first set bitnum */ 254 pfn_t pfn; /* first pfn in mapped range */ 255 int off; /* byte offset to first pfn */ 256 }; 257 258 /* 259 * cqueue_t queues: a uni-directional channel for communication 260 * from the master to helper tasks or vice-versa using put and 261 * get primitives. Both mappings and data buffers are passed via 262 * queues. Producers close a queue when done. The number of 263 * active producers is reference counted so the consumer can 264 * detect end of data. Concurrent access is mediated by atomic 265 * operations for panic dump, or mutex/cv for live dump. 266 * 267 * There a four queues, used as follows: 268 * 269 * Queue Dataflow NewState 270 * -------------------------------------------------- 271 * mainq master -> master FREEMAP 272 * master has initialized or unmapped an input buffer 273 * -------------------------------------------------- 274 * helperq master -> helper INREADY 275 * master has mapped input for use by helper 276 * -------------------------------------------------- 277 * mainq master <- helper USEDMAP 278 * helper is done with input 279 * -------------------------------------------------- 280 * freebufq master -> helper FREEBUF 281 * master has initialized or written an output buffer 282 * -------------------------------------------------- 283 * mainq master <- helper WRITE 284 * block of compressed pages from a helper 285 * -------------------------------------------------- 286 * mainq master <- helper ERRMSG 287 * error messages from a helper (memory error case) 288 * -------------------------------------------------- 289 * writerq master <- master WRITE 290 * non-blocking queue of blocks to write 291 * -------------------------------------------------- 292 */ 293 typedef struct cqueue { 294 cbuf_t *volatile first; /* first in list */ 295 cbuf_t *last; /* last in list */ 296 hrtime_t ts; /* timestamp */ 297 hrtime_t empty; /* total time empty */ 298 kmutex_t mutex; /* live state lock */ 299 kcondvar_t cv; /* live wait var */ 300 lock_t spinlock; /* panic mode spin lock */ 301 volatile uint_t open; /* producer ref count */ 302 } cqueue_t; 303 304 /* 305 * Convenience macros for using the cqueue functions 306 * Note that the caller must have defined "dumpsync_t *ds" 307 */ 308 #define CQ_IS_EMPTY(q) \ 309 (ds->q.first == NULL) 310 311 #define CQ_OPEN(q) \ 312 atomic_inc_uint(&ds->q.open) 313 314 #define CQ_CLOSE(q) \ 315 dumpsys_close_cq(&ds->q, ds->live) 316 317 #define CQ_PUT(q, cp, st) \ 318 dumpsys_put_cq(&ds->q, cp, st, ds->live) 319 320 #define CQ_GET(q) \ 321 dumpsys_get_cq(&ds->q, ds->live) 322 323 /* 324 * Dynamic state when dumpsys() is running. 325 */ 326 typedef struct dumpsync { 327 pgcnt_t npages; /* subtotal of pages dumped */ 328 pgcnt_t pages_mapped; /* subtotal of pages mapped */ 329 pgcnt_t pages_used; /* subtotal of pages used per map */ 330 size_t nwrite; /* subtotal of bytes written */ 331 uint_t live; /* running live dump */ 332 uint_t neednl; /* will need to print a newline */ 333 uint_t percent; /* dump progress */ 334 uint_t percent_done; /* dump progress reported */ 335 cqueue_t freebufq; /* free kmem bufs for writing */ 336 cqueue_t mainq; /* input for main task */ 337 cqueue_t helperq; /* input for helpers */ 338 cqueue_t writerq; /* input for writer */ 339 hrtime_t start; /* start time */ 340 hrtime_t elapsed; /* elapsed time when completed */ 341 hrtime_t iotime; /* time spent writing nwrite bytes */ 342 hrtime_t iowait; /* time spent waiting for output */ 343 hrtime_t iowaitts; /* iowait timestamp */ 344 perpage_t perpage; /* metrics */ 345 perpage_t perpagets; 346 int dumpcpu; /* master cpu */ 347 } dumpsync_t; 348 349 static dumpsync_t dumpsync; /* synchronization vars */ 350 351 /* 352 * helper_t helpers: contains the context for a stream. CPUs run in 353 * parallel at dump time; each CPU creates a single stream of 354 * compression data. Stream data is divided into CBUF_SIZE blocks. 355 * The blocks are written in order within a stream. But, blocks from 356 * multiple streams can be interleaved. Each stream is identified by a 357 * unique tag. 358 */ 359 typedef struct helper { 360 int helper; /* bound helper id */ 361 int tag; /* compression stream tag */ 362 perpage_t perpage; /* per page metrics */ 363 perpage_t perpagets; /* per page metrics (timestamps) */ 364 taskqid_t taskqid; /* live dump task ptr */ 365 int in, out; /* buffer offsets */ 366 cbuf_t *cpin, *cpout, *cperr; /* cbuf objects in process */ 367 dumpsync_t *ds; /* pointer to sync vars */ 368 size_t used; /* counts input consumed */ 369 char *page; /* buffer for page copy */ 370 char *lzbuf; /* lzjb output */ 371 bz_stream bzstream; /* bzip2 state */ 372 } helper_t; 373 374 #define MAINHELPER (-1) /* helper is also the main task */ 375 #define FREEHELPER (-2) /* unbound helper */ 376 #define DONEHELPER (-3) /* helper finished */ 377 378 /* 379 * configuration vars for dumpsys 380 */ 381 typedef struct dumpcfg { 382 int threshold; /* ncpu threshold for bzip2 */ 383 int nhelper; /* number of helpers */ 384 int nhelper_used; /* actual number of helpers used */ 385 int ncmap; /* number VA pages for compression */ 386 int ncbuf; /* number of bufs for compression */ 387 int ncbuf_used; /* number of bufs in use */ 388 uint_t clevel; /* dump compression level */ 389 helper_t *helper; /* array of helpers */ 390 cbuf_t *cmap; /* array of input (map) buffers */ 391 cbuf_t *cbuf; /* array of output buffers */ 392 ulong_t *helpermap; /* set of dumpsys helper CPU ids */ 393 ulong_t *bitmap; /* bitmap for marking pages to dump */ 394 ulong_t *rbitmap; /* bitmap for used CBUF_MAPSIZE ranges */ 395 pgcnt_t bitmapsize; /* size of bitmap */ 396 pgcnt_t rbitmapsize; /* size of bitmap for ranges */ 397 pgcnt_t found4m; /* number ranges allocated by dump */ 398 pgcnt_t foundsm; /* number small pages allocated by dump */ 399 pid_t *pids; /* list of process IDs at dump time */ 400 size_t maxsize; /* memory size needed at dump time */ 401 size_t maxvmsize; /* size of reserved VM */ 402 char *maxvm; /* reserved VM for spare pages */ 403 lock_t helper_lock; /* protect helper state */ 404 char helpers_wanted; /* flag to enable parallelism */ 405 } dumpcfg_t; 406 407 static dumpcfg_t dumpcfg; /* config vars */ 408 409 /* 410 * The dump I/O buffer. 411 * 412 * There is one I/O buffer used by dumpvp_write and dumvp_flush. It is 413 * sized according to the optimum device transfer speed. 414 */ 415 typedef struct dumpbuf { 416 vnode_t *cdev_vp; /* VCHR open of the dump device */ 417 len_t vp_limit; /* maximum write offset */ 418 offset_t vp_off; /* current dump device offset */ 419 char *cur; /* dump write pointer */ 420 char *start; /* dump buffer address */ 421 char *end; /* dump buffer end */ 422 size_t size; /* size of dumpbuf in bytes */ 423 size_t iosize; /* best transfer size for device */ 424 } dumpbuf_t; 425 426 dumpbuf_t dumpbuf; /* I/O buffer */ 427 428 /* 429 * The dump I/O buffer must be at least one page, at most xfer_size 430 * bytes, and should scale with physmem in between. The transfer size 431 * passed in will either represent a global default (maxphys) or the 432 * best size for the device. The size of the dumpbuf I/O buffer is 433 * limited by dumpbuf_limit (8MB by default) because the dump 434 * performance saturates beyond a certain size. The default is to 435 * select 1/4096 of the memory. 436 */ 437 static int dumpbuf_fraction = 12; /* memory size scale factor */ 438 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */ 439 440 static size_t 441 dumpbuf_iosize(size_t xfer_size) 442 { 443 size_t iosize = ptob(physmem >> dumpbuf_fraction); 444 445 if (iosize < PAGESIZE) 446 iosize = PAGESIZE; 447 else if (iosize > xfer_size) 448 iosize = xfer_size; 449 if (iosize > dumpbuf_limit) 450 iosize = dumpbuf_limit; 451 return (iosize & PAGEMASK); 452 } 453 454 /* 455 * resize the I/O buffer 456 */ 457 static void 458 dumpbuf_resize(void) 459 { 460 char *old_buf = dumpbuf.start; 461 size_t old_size = dumpbuf.size; 462 char *new_buf; 463 size_t new_size; 464 465 ASSERT(MUTEX_HELD(&dump_lock)); 466 467 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys)); 468 if (new_size <= old_size) 469 return; /* no need to reallocate buffer */ 470 471 new_buf = kmem_alloc(new_size, KM_SLEEP); 472 dumpbuf.size = new_size; 473 dumpbuf.start = new_buf; 474 dumpbuf.end = new_buf + new_size; 475 kmem_free(old_buf, old_size); 476 } 477 478 /* 479 * dump_update_clevel is called when dumpadm configures the dump device. 480 * Calculate number of helpers and buffers. 481 * Allocate the minimum configuration for now. 482 * 483 * When the dump file is configured we reserve a minimum amount of 484 * memory for use at crash time. But we reserve VA for all the memory 485 * we really want in order to do the fastest dump possible. The VA is 486 * backed by pages not being dumped, according to the bitmap. If 487 * there is insufficient spare memory, however, we fall back to the 488 * minimum. 489 * 490 * Live dump (savecore -L) always uses the minimum config. 491 * 492 * clevel 0 is single threaded lzjb 493 * clevel 1 is parallel lzjb 494 * clevel 2 is parallel bzip2 495 * 496 * The ncpu threshold is selected with dump_plat_mincpu. 497 * On OPL, set_platform_defaults() overrides the sun4u setting. 498 * The actual values are defined via DUMP_PLAT_*_MINCPU macros. 499 * 500 * Architecture Threshold Algorithm 501 * sun4u < 51 parallel lzjb 502 * sun4u >= 51 parallel bzip2(*) 503 * sun4u OPL < 8 parallel lzjb 504 * sun4u OPL >= 8 parallel bzip2(*) 505 * sun4v < 128 parallel lzjb 506 * sun4v >= 128 parallel bzip2(*) 507 * x86 < 11 parallel lzjb 508 * x86 >= 11 parallel bzip2(*) 509 * 32-bit N/A single-threaded lzjb 510 * 511 * (*) bzip2 is only chosen if there is sufficient available 512 * memory for buffers at dump time. See dumpsys_get_maxmem(). 513 * 514 * Faster dump devices have larger I/O buffers. The threshold value is 515 * increased according to the size of the dump I/O buffer, because 516 * parallel lzjb performs better with faster disks. For buffers >= 1MB 517 * the threshold is 3X; for buffers >= 256K threshold is 2X. 518 * 519 * For parallel dumps, the number of helpers is ncpu-1. The CPU 520 * running panic runs the main task. For single-threaded dumps, the 521 * panic CPU does lzjb compression (it is tagged as MAINHELPER.) 522 * 523 * Need multiple buffers per helper so that they do not block waiting 524 * for the main task. 525 * parallel single-threaded 526 * Number of output buffers: nhelper*2 1 527 * Number of mapping buffers: nhelper*4 1 528 * 529 */ 530 static void 531 dump_update_clevel() 532 { 533 int tag; 534 size_t bz2size; 535 helper_t *hp, *hpend; 536 cbuf_t *cp, *cpend; 537 dumpcfg_t *old = &dumpcfg; 538 dumpcfg_t newcfg = *old; 539 dumpcfg_t *new = &newcfg; 540 541 ASSERT(MUTEX_HELD(&dump_lock)); 542 543 /* 544 * Free the previously allocated bufs and VM. 545 */ 546 if (old->helper != NULL) { 547 548 /* helpers */ 549 hpend = &old->helper[old->nhelper]; 550 for (hp = old->helper; hp != hpend; hp++) { 551 if (hp->lzbuf != NULL) 552 kmem_free(hp->lzbuf, PAGESIZE); 553 if (hp->page != NULL) 554 kmem_free(hp->page, PAGESIZE); 555 } 556 kmem_free(old->helper, old->nhelper * sizeof (helper_t)); 557 558 /* VM space for mapping pages */ 559 cpend = &old->cmap[old->ncmap]; 560 for (cp = old->cmap; cp != cpend; cp++) 561 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE); 562 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t)); 563 564 /* output bufs */ 565 cpend = &old->cbuf[old->ncbuf]; 566 for (cp = old->cbuf; cp != cpend; cp++) 567 if (cp->buf != NULL) 568 kmem_free(cp->buf, cp->size); 569 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t)); 570 571 /* reserved VM for dumpsys_get_maxmem */ 572 if (old->maxvmsize > 0) 573 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize); 574 } 575 576 /* 577 * Allocate memory and VM. 578 * One CPU runs dumpsys, the rest are helpers. 579 */ 580 new->nhelper = ncpus - 1; 581 if (new->nhelper < 1) 582 new->nhelper = 1; 583 584 if (new->nhelper > DUMP_MAX_NHELPER) 585 new->nhelper = DUMP_MAX_NHELPER; 586 587 /* increase threshold for faster disks */ 588 new->threshold = dump_plat_mincpu; 589 if (dumpbuf.iosize >= DUMP_1MB) 590 new->threshold *= 3; 591 else if (dumpbuf.iosize >= (256 * DUMP_1KB)) 592 new->threshold *= 2; 593 594 /* figure compression level based upon the computed threshold. */ 595 if (dump_plat_mincpu == 0 || new->nhelper < 2) { 596 new->clevel = 0; 597 new->nhelper = 1; 598 } else if ((new->nhelper + 1) >= new->threshold) { 599 new->clevel = DUMP_CLEVEL_BZIP2; 600 } else { 601 new->clevel = DUMP_CLEVEL_LZJB; 602 } 603 604 if (new->clevel == 0) { 605 new->ncbuf = 1; 606 new->ncmap = 1; 607 } else { 608 new->ncbuf = NCBUF_PER_HELPER * new->nhelper; 609 new->ncmap = NCMAP_PER_HELPER * new->nhelper; 610 } 611 612 /* 613 * Allocate new data structures and buffers for MINHELPERS, 614 * and also figure the max desired size. 615 */ 616 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 617 new->maxsize = 0; 618 new->maxvmsize = 0; 619 new->maxvm = NULL; 620 tag = 1; 621 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP); 622 hpend = &new->helper[new->nhelper]; 623 for (hp = new->helper; hp != hpend; hp++) { 624 hp->tag = tag++; 625 if (hp < &new->helper[MINHELPERS]) { 626 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP); 627 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP); 628 } else if (new->clevel < DUMP_CLEVEL_BZIP2) { 629 new->maxsize += 2 * PAGESIZE; 630 } else { 631 new->maxsize += PAGESIZE; 632 } 633 if (new->clevel >= DUMP_CLEVEL_BZIP2) 634 new->maxsize += bz2size; 635 } 636 637 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP); 638 cpend = &new->cbuf[new->ncbuf]; 639 for (cp = new->cbuf; cp != cpend; cp++) { 640 cp->state = CBUF_FREEBUF; 641 cp->size = CBUF_SIZE; 642 if (cp < &new->cbuf[MINCBUFS]) 643 cp->buf = kmem_alloc(cp->size, KM_SLEEP); 644 else 645 new->maxsize += cp->size; 646 } 647 648 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP); 649 cpend = &new->cmap[new->ncmap]; 650 for (cp = new->cmap; cp != cpend; cp++) { 651 cp->state = CBUF_FREEMAP; 652 cp->size = CBUF_MAPSIZE; 653 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE, 654 0, 0, NULL, NULL, VM_SLEEP); 655 } 656 657 /* reserve VA to be backed with spare pages at crash time */ 658 if (new->maxsize > 0) { 659 new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE); 660 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE); 661 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize, 662 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP); 663 } 664 665 /* 666 * Reserve memory for kmem allocation calls made during crash 667 * dump. The hat layer allocates memory for each mapping 668 * created, and the I/O path allocates buffers and data structs. 669 * Add a few pages for safety. 670 */ 671 kmem_dump_init((new->ncmap * dump_kmem_permap) + 672 (dump_kmem_pages * PAGESIZE)); 673 674 /* set new config pointers */ 675 *old = *new; 676 } 677 678 /* 679 * Define a struct memlist walker to optimize bitnum to pfn 680 * lookup. The walker maintains the state of the list traversal. 681 */ 682 typedef struct dumpmlw { 683 struct memlist *mp; /* current memlist */ 684 pgcnt_t basenum; /* bitnum base offset */ 685 pgcnt_t mppages; /* current memlist size */ 686 pgcnt_t mpleft; /* size to end of current memlist */ 687 pfn_t mpaddr; /* first pfn in memlist */ 688 } dumpmlw_t; 689 690 /* initialize the walker */ 691 static inline void 692 dump_init_memlist_walker(dumpmlw_t *pw) 693 { 694 pw->mp = phys_install; 695 pw->basenum = 0; 696 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 697 pw->mpleft = pw->mppages; 698 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 699 } 700 701 /* 702 * Lookup pfn given bitnum. The memlist can be quite long on some 703 * systems (e.g.: one per board). To optimize sequential lookups, the 704 * caller initializes and presents a memlist walker. 705 */ 706 static pfn_t 707 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw) 708 { 709 bitnum -= pw->basenum; 710 while (pw->mp != NULL) { 711 if (bitnum < pw->mppages) { 712 pw->mpleft = pw->mppages - bitnum; 713 return (pw->mpaddr + bitnum); 714 } 715 bitnum -= pw->mppages; 716 pw->basenum += pw->mppages; 717 pw->mp = pw->mp->ml_next; 718 if (pw->mp != NULL) { 719 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 720 pw->mpleft = pw->mppages; 721 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 722 } 723 } 724 return (PFN_INVALID); 725 } 726 727 static pgcnt_t 728 dump_pfn_to_bitnum(pfn_t pfn) 729 { 730 struct memlist *mp; 731 pgcnt_t bitnum = 0; 732 733 for (mp = phys_install; mp != NULL; mp = mp->ml_next) { 734 if (pfn >= (mp->ml_address >> PAGESHIFT) && 735 pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT)) 736 return (bitnum + pfn - (mp->ml_address >> PAGESHIFT)); 737 bitnum += mp->ml_size >> PAGESHIFT; 738 } 739 return ((pgcnt_t)-1); 740 } 741 742 /* 743 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The 744 * mapping of pfn to range index is imperfect because pfn and bitnum 745 * do not have the same phase. To make sure a CBUF_MAPSIZE range is 746 * covered, call this for both ends: 747 * dump_set_used(base) 748 * dump_set_used(base+CBUF_MAPNP-1) 749 * 750 * This is used during a panic dump to mark pages allocated by 751 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by 752 * page_get_mnode_freelist() to make sure pages used by dump are never 753 * allocated. 754 */ 755 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT)) 756 757 static void 758 dump_set_used(pfn_t pfn) 759 { 760 761 pgcnt_t bitnum, rbitnum; 762 763 bitnum = dump_pfn_to_bitnum(pfn); 764 ASSERT(bitnum != (pgcnt_t)-1); 765 766 rbitnum = CBUF_MAPP2R(bitnum); 767 ASSERT(rbitnum < dumpcfg.rbitmapsize); 768 769 BT_SET(dumpcfg.rbitmap, rbitnum); 770 } 771 772 int 773 dump_test_used(pfn_t pfn) 774 { 775 pgcnt_t bitnum, rbitnum; 776 777 bitnum = dump_pfn_to_bitnum(pfn); 778 ASSERT(bitnum != (pgcnt_t)-1); 779 780 rbitnum = CBUF_MAPP2R(bitnum); 781 ASSERT(rbitnum < dumpcfg.rbitmapsize); 782 783 return (BT_TEST(dumpcfg.rbitmap, rbitnum)); 784 } 785 786 /* 787 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library. 788 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit(). 789 */ 790 static void * 791 dumpbzalloc(void *opaque, int items, int size) 792 { 793 size_t *sz; 794 char *ret; 795 796 ASSERT(opaque != NULL); 797 sz = opaque; 798 ret = dumpcfg.maxvm + *sz; 799 *sz += items * size; 800 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN); 801 ASSERT(*sz <= dumpcfg.maxvmsize); 802 return (ret); 803 } 804 805 /*ARGSUSED*/ 806 static void 807 dumpbzfree(void *opaque, void *addr) 808 { 809 } 810 811 /* 812 * Perform additional checks on the page to see if we can really use 813 * it. The kernel (kas) pages are always set in the bitmap. However, 814 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the 815 * bitmap. So we check for them. 816 */ 817 static inline int 818 dump_pfn_check(pfn_t pfn) 819 { 820 page_t *pp = page_numtopp_nolock(pfn); 821 if (pp == NULL || pp->p_pagenum != pfn || 822 #if defined(__sparc) 823 pp->p_vnode == &promvp || 824 #else 825 PP_ISBOOTPAGES(pp) || 826 #endif 827 pp->p_toxic != 0) 828 return (0); 829 return (1); 830 } 831 832 /* 833 * Check a range to see if all contained pages are available and 834 * return non-zero if the range can be used. 835 */ 836 static inline int 837 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn) 838 { 839 for (; start < end; start++, pfn++) { 840 if (BT_TEST(dumpcfg.bitmap, start)) 841 return (0); 842 if (!dump_pfn_check(pfn)) 843 return (0); 844 } 845 return (1); 846 } 847 848 /* 849 * dumpsys_get_maxmem() is called during panic. Find unused ranges 850 * and use them for buffers. If we find enough memory switch to 851 * parallel bzip2, otherwise use parallel lzjb. 852 * 853 * It searches the dump bitmap in 2 passes. The first time it looks 854 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages. 855 */ 856 static void 857 dumpsys_get_maxmem() 858 { 859 dumpcfg_t *cfg = &dumpcfg; 860 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf]; 861 helper_t *endhp = &cfg->helper[cfg->nhelper]; 862 pgcnt_t bitnum, end; 863 size_t sz, endsz, bz2size; 864 pfn_t pfn, off; 865 cbuf_t *cp; 866 helper_t *hp, *ohp; 867 dumpmlw_t mlw; 868 int k; 869 870 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB || 871 (dump_conflags & DUMP_ALL) != 0) 872 return; 873 874 sz = 0; 875 cfg->found4m = 0; 876 cfg->foundsm = 0; 877 878 /* bitmap of ranges used to estimate which pfns are being used */ 879 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize)); 880 881 /* find ranges that are not being dumped to use for buffers */ 882 dump_init_memlist_walker(&mlw); 883 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 884 dump_timeleft = dump_timeout; 885 end = bitnum + CBUF_MAPNP; 886 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 887 ASSERT(pfn != PFN_INVALID); 888 889 /* skip partial range at end of mem segment */ 890 if (mlw.mpleft < CBUF_MAPNP) { 891 end = bitnum + mlw.mpleft; 892 continue; 893 } 894 895 /* skip non aligned pages */ 896 off = P2PHASE(pfn, CBUF_MAPNP); 897 if (off != 0) { 898 end -= off; 899 continue; 900 } 901 902 if (!dump_range_check(bitnum, end, pfn)) 903 continue; 904 905 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize); 906 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn, 907 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 908 sz += CBUF_MAPSIZE; 909 cfg->found4m++; 910 911 /* set the bitmap for both ends to be sure to cover the range */ 912 dump_set_used(pfn); 913 dump_set_used(pfn + CBUF_MAPNP - 1); 914 915 if (sz >= cfg->maxsize) 916 goto foundmax; 917 } 918 919 /* Add small pages if we can't find enough large pages. */ 920 dump_init_memlist_walker(&mlw); 921 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 922 dump_timeleft = dump_timeout; 923 end = bitnum + CBUF_MAPNP; 924 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 925 ASSERT(pfn != PFN_INVALID); 926 927 /* Find any non-aligned pages at start and end of segment. */ 928 off = P2PHASE(pfn, CBUF_MAPNP); 929 if (mlw.mpleft < CBUF_MAPNP) { 930 end = bitnum + mlw.mpleft; 931 } else if (off != 0) { 932 end -= off; 933 } else if (cfg->found4m && dump_test_used(pfn)) { 934 continue; 935 } 936 937 for (; bitnum < end; bitnum++, pfn++) { 938 dump_timeleft = dump_timeout; 939 if (BT_TEST(dumpcfg.bitmap, bitnum)) 940 continue; 941 if (!dump_pfn_check(pfn)) 942 continue; 943 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize); 944 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn, 945 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 946 sz += PAGESIZE; 947 cfg->foundsm++; 948 dump_set_used(pfn); 949 if (sz >= cfg->maxsize) 950 goto foundmax; 951 } 952 } 953 954 /* Fall back to lzjb if we did not get enough memory for bzip2. */ 955 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper; 956 if (sz < endsz) { 957 cfg->clevel = DUMP_CLEVEL_LZJB; 958 } 959 960 /* Allocate memory for as many helpers as we can. */ 961 foundmax: 962 963 /* Byte offsets into memory found and mapped above */ 964 endsz = sz; 965 sz = 0; 966 967 /* Set the size for bzip2 state. Only bzip2 needs it. */ 968 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 969 970 /* Skip the preallocate output buffers. */ 971 cp = &cfg->cbuf[MINCBUFS]; 972 973 /* Use this to move memory up from the preallocated helpers. */ 974 ohp = cfg->helper; 975 976 /* Loop over all helpers and allocate memory. */ 977 for (hp = cfg->helper; hp < endhp; hp++) { 978 979 /* Skip preallocated helpers by checking hp->page. */ 980 if (hp->page == NULL) { 981 if (cfg->clevel <= DUMP_CLEVEL_LZJB) { 982 /* lzjb needs 2 1-page buffers */ 983 if ((sz + (2 * PAGESIZE)) > endsz) 984 break; 985 hp->page = cfg->maxvm + sz; 986 sz += PAGESIZE; 987 hp->lzbuf = cfg->maxvm + sz; 988 sz += PAGESIZE; 989 990 } else if (ohp->lzbuf != NULL) { 991 /* re-use the preallocted lzjb page for bzip2 */ 992 hp->page = ohp->lzbuf; 993 ohp->lzbuf = NULL; 994 ++ohp; 995 996 } else { 997 /* bzip2 needs a 1-page buffer */ 998 if ((sz + PAGESIZE) > endsz) 999 break; 1000 hp->page = cfg->maxvm + sz; 1001 sz += PAGESIZE; 1002 } 1003 } 1004 1005 /* 1006 * Add output buffers per helper. The number of 1007 * buffers per helper is determined by the ratio of 1008 * ncbuf to nhelper. 1009 */ 1010 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz && 1011 k < NCBUF_PER_HELPER; k++) { 1012 cp->state = CBUF_FREEBUF; 1013 cp->size = CBUF_SIZE; 1014 cp->buf = cfg->maxvm + sz; 1015 sz += CBUF_SIZE; 1016 ++cp; 1017 } 1018 1019 /* 1020 * bzip2 needs compression state. Use the dumpbzalloc 1021 * and dumpbzfree callbacks to allocate the memory. 1022 * bzip2 does allocation only at init time. 1023 */ 1024 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) { 1025 if ((sz + bz2size) > endsz) { 1026 hp->page = NULL; 1027 break; 1028 } else { 1029 hp->bzstream.opaque = &sz; 1030 hp->bzstream.bzalloc = dumpbzalloc; 1031 hp->bzstream.bzfree = dumpbzfree; 1032 (void) BZ2_bzCompressInit(&hp->bzstream, 1033 dump_bzip2_level, 0, 0); 1034 hp->bzstream.opaque = NULL; 1035 } 1036 } 1037 } 1038 1039 /* Finish allocating output buffers */ 1040 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) { 1041 cp->state = CBUF_FREEBUF; 1042 cp->size = CBUF_SIZE; 1043 cp->buf = cfg->maxvm + sz; 1044 sz += CBUF_SIZE; 1045 } 1046 1047 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */ 1048 if (cfg->found4m || cfg->foundsm) 1049 dump_check_used = 1; 1050 1051 ASSERT(sz <= endsz); 1052 } 1053 1054 static void 1055 dumphdr_init(void) 1056 { 1057 pgcnt_t npages = 0; 1058 1059 ASSERT(MUTEX_HELD(&dump_lock)); 1060 1061 if (dumphdr == NULL) { 1062 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP); 1063 dumphdr->dump_magic = DUMP_MAGIC; 1064 dumphdr->dump_version = DUMP_VERSION; 1065 dumphdr->dump_wordsize = DUMP_WORDSIZE; 1066 dumphdr->dump_pageshift = PAGESHIFT; 1067 dumphdr->dump_pagesize = PAGESIZE; 1068 dumphdr->dump_utsname = utsname; 1069 (void) strcpy(dumphdr->dump_platform, platform); 1070 dumpbuf.size = dumpbuf_iosize(maxphys); 1071 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); 1072 dumpbuf.end = dumpbuf.start + dumpbuf.size; 1073 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); 1074 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); 1075 LOCK_INIT_HELD(&dumpcfg.helper_lock); 1076 } 1077 1078 npages = num_phys_pages(); 1079 1080 if (dumpcfg.bitmapsize != npages) { 1081 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP)); 1082 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP); 1083 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP); 1084 1085 if (dumpcfg.bitmap != NULL) 1086 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg. 1087 bitmapsize)); 1088 if (dumpcfg.rbitmap != NULL) 1089 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg. 1090 rbitmapsize)); 1091 dumpcfg.bitmap = map; 1092 dumpcfg.bitmapsize = npages; 1093 dumpcfg.rbitmap = rmap; 1094 dumpcfg.rbitmapsize = rlen; 1095 } 1096 } 1097 1098 /* 1099 * Establish a new dump device. 1100 */ 1101 int 1102 dumpinit(vnode_t *vp, char *name, int justchecking) 1103 { 1104 vnode_t *cvp; 1105 vattr_t vattr; 1106 vnode_t *cdev_vp; 1107 int error = 0; 1108 1109 ASSERT(MUTEX_HELD(&dump_lock)); 1110 1111 dumphdr_init(); 1112 1113 cvp = common_specvp(vp); 1114 if (cvp == dumpvp) 1115 return (0); 1116 1117 /* 1118 * Determine whether this is a plausible dump device. We want either: 1119 * (1) a real device that's not mounted and has a cb_dump routine, or 1120 * (2) a swapfile on some filesystem that has a vop_dump routine. 1121 */ 1122 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0) 1123 return (error); 1124 1125 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV; 1126 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) { 1127 if (vattr.va_type == VBLK || vattr.va_type == VCHR) { 1128 if (devopsp[getmajor(vattr.va_rdev)]-> 1129 devo_cb_ops->cb_dump == nodev) 1130 error = ENOTSUP; 1131 else if (vfs_devismounted(vattr.va_rdev)) 1132 error = EBUSY; 1133 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip), 1134 ZFS_DRIVER) == 0 && 1135 IS_SWAPVP(common_specvp(cvp))) 1136 error = EBUSY; 1137 } else { 1138 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) || 1139 !IS_SWAPVP(cvp)) 1140 error = ENOTSUP; 1141 } 1142 } 1143 1144 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) 1145 error = ENOSPC; 1146 1147 if (error || justchecking) { 1148 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0, 1149 kcred, NULL); 1150 return (error); 1151 } 1152 1153 VN_HOLD(cvp); 1154 1155 if (dumpvp != NULL) 1156 dumpfini(); /* unconfigure the old dump device */ 1157 1158 dumpvp = cvp; 1159 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 1160 dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP); 1161 (void) strcpy(dumppath, name); 1162 dumpbuf.iosize = 0; 1163 1164 /* 1165 * If the dump device is a block device, attempt to open up the 1166 * corresponding character device and determine its maximum transfer 1167 * size. We use this information to potentially resize dumpbuf to a 1168 * larger and more optimal size for performing i/o to the dump device. 1169 */ 1170 if (cvp->v_type == VBLK && 1171 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) { 1172 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1173 size_t blk_size; 1174 struct dk_cinfo dki; 1175 struct dk_minfo minf; 1176 1177 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO, 1178 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL) 1179 == 0 && minf.dki_lbsize != 0) 1180 blk_size = minf.dki_lbsize; 1181 else 1182 blk_size = DEV_BSIZE; 1183 1184 if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki, 1185 FKIOCTL, kcred, NULL, NULL) == 0) { 1186 dumpbuf.iosize = dki.dki_maxtransfer * blk_size; 1187 dumpbuf_resize(); 1188 } 1189 /* 1190 * If we are working with a zvol then dumpify it 1191 * if it's not being used as swap. 1192 */ 1193 if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) { 1194 if (IS_SWAPVP(common_specvp(cvp))) 1195 error = EBUSY; 1196 else if ((error = VOP_IOCTL(cdev_vp, 1197 DKIOCDUMPINIT, NULL, FKIOCTL, kcred, 1198 NULL, NULL)) != 0) 1199 dumpfini(); 1200 } 1201 1202 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1203 kcred, NULL); 1204 } 1205 1206 VN_RELE(cdev_vp); 1207 } 1208 1209 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20); 1210 1211 dump_update_clevel(); 1212 1213 return (error); 1214 } 1215 1216 void 1217 dumpfini(void) 1218 { 1219 vattr_t vattr; 1220 boolean_t is_zfs = B_FALSE; 1221 vnode_t *cdev_vp; 1222 ASSERT(MUTEX_HELD(&dump_lock)); 1223 1224 kmem_free(dumppath, strlen(dumppath) + 1); 1225 1226 /* 1227 * Determine if we are using zvols for our dump device 1228 */ 1229 vattr.va_mask = AT_RDEV; 1230 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) { 1231 is_zfs = (getmajor(vattr.va_rdev) == 1232 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE; 1233 } 1234 1235 /* 1236 * If we have a zvol dump device then we call into zfs so 1237 * that it may have a chance to cleanup. 1238 */ 1239 if (is_zfs && 1240 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) { 1241 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1242 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL, 1243 kcred, NULL, NULL); 1244 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1245 kcred, NULL); 1246 } 1247 VN_RELE(cdev_vp); 1248 } 1249 1250 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL); 1251 1252 VN_RELE(dumpvp); 1253 1254 dumpvp = NULL; 1255 dumpvp_size = 0; 1256 dumppath = NULL; 1257 } 1258 1259 static offset_t 1260 dumpvp_flush(void) 1261 { 1262 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE); 1263 hrtime_t iotime; 1264 int err; 1265 1266 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) { 1267 dump_ioerr = ENOSPC; 1268 dumpbuf.vp_off = dumpbuf.vp_limit; 1269 } else if (size != 0) { 1270 iotime = gethrtime(); 1271 dumpsync.iowait += iotime - dumpsync.iowaitts; 1272 if (panicstr) 1273 err = VOP_DUMP(dumpvp, dumpbuf.start, 1274 lbtodb(dumpbuf.vp_off), btod(size), NULL); 1275 else 1276 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ? 1277 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size, 1278 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit, 1279 kcred, 0); 1280 if (err && dump_ioerr == 0) 1281 dump_ioerr = err; 1282 dumpsync.iowaitts = gethrtime(); 1283 dumpsync.iotime += dumpsync.iowaitts - iotime; 1284 dumpsync.nwrite += size; 1285 dumpbuf.vp_off += size; 1286 } 1287 dumpbuf.cur = dumpbuf.start; 1288 dump_timeleft = dump_timeout; 1289 return (dumpbuf.vp_off); 1290 } 1291 1292 /* maximize write speed by keeping seek offset aligned with size */ 1293 void 1294 dumpvp_write(const void *va, size_t size) 1295 { 1296 size_t len, off, sz; 1297 1298 while (size != 0) { 1299 len = MIN(size, dumpbuf.end - dumpbuf.cur); 1300 if (len == 0) { 1301 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size); 1302 if (off == 0 || !ISP2(dumpbuf.size)) { 1303 (void) dumpvp_flush(); 1304 } else { 1305 sz = dumpbuf.size - off; 1306 dumpbuf.cur = dumpbuf.start + sz; 1307 (void) dumpvp_flush(); 1308 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off); 1309 dumpbuf.cur += off; 1310 } 1311 } else { 1312 bcopy(va, dumpbuf.cur, len); 1313 va = (char *)va + len; 1314 dumpbuf.cur += len; 1315 size -= len; 1316 } 1317 } 1318 } 1319 1320 /*ARGSUSED*/ 1321 static void 1322 dumpvp_ksyms_write(const void *src, void *dst, size_t size) 1323 { 1324 dumpvp_write(src, size); 1325 } 1326 1327 /* 1328 * Mark 'pfn' in the bitmap and dump its translation table entry. 1329 */ 1330 void 1331 dump_addpage(struct as *as, void *va, pfn_t pfn) 1332 { 1333 mem_vtop_t mem_vtop; 1334 pgcnt_t bitnum; 1335 1336 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1337 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1338 dumphdr->dump_npages++; 1339 BT_SET(dumpcfg.bitmap, bitnum); 1340 } 1341 dumphdr->dump_nvtop++; 1342 mem_vtop.m_as = as; 1343 mem_vtop.m_va = va; 1344 mem_vtop.m_pfn = pfn; 1345 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 1346 } 1347 dump_timeleft = dump_timeout; 1348 } 1349 1350 /* 1351 * Mark 'pfn' in the bitmap 1352 */ 1353 void 1354 dump_page(pfn_t pfn) 1355 { 1356 pgcnt_t bitnum; 1357 1358 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1359 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1360 dumphdr->dump_npages++; 1361 BT_SET(dumpcfg.bitmap, bitnum); 1362 } 1363 } 1364 dump_timeleft = dump_timeout; 1365 } 1366 1367 /* 1368 * Dump the <as, va, pfn> information for a given address space. 1369 * SEGOP_DUMP() will call dump_addpage() for each page in the segment. 1370 */ 1371 static void 1372 dump_as(struct as *as) 1373 { 1374 struct seg *seg; 1375 1376 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1377 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 1378 if (seg->s_as != as) 1379 break; 1380 if (seg->s_ops == NULL) 1381 continue; 1382 SEGOP_DUMP(seg); 1383 } 1384 AS_LOCK_EXIT(as, &as->a_lock); 1385 1386 if (seg != NULL) 1387 cmn_err(CE_WARN, "invalid segment %p in address space %p", 1388 (void *)seg, (void *)as); 1389 } 1390 1391 static int 1392 dump_process(pid_t pid) 1393 { 1394 proc_t *p = sprlock(pid); 1395 1396 if (p == NULL) 1397 return (-1); 1398 if (p->p_as != &kas) { 1399 mutex_exit(&p->p_lock); 1400 dump_as(p->p_as); 1401 mutex_enter(&p->p_lock); 1402 } 1403 1404 sprunlock(p); 1405 1406 return (0); 1407 } 1408 1409 void 1410 dump_ereports(void) 1411 { 1412 u_offset_t dumpvp_start; 1413 erpt_dump_t ed; 1414 1415 if (dumpvp == NULL || dumphdr == NULL) 1416 return; 1417 1418 dumpbuf.cur = dumpbuf.start; 1419 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE); 1420 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE; 1421 dumpbuf.vp_off = dumpvp_start; 1422 1423 fm_ereport_dump(); 1424 if (panicstr) 1425 errorq_dump(); 1426 1427 bzero(&ed, sizeof (ed)); /* indicate end of ereports */ 1428 dumpvp_write(&ed, sizeof (ed)); 1429 (void) dumpvp_flush(); 1430 1431 if (!panicstr) { 1432 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1433 (size_t)(dumpbuf.vp_off - dumpvp_start), 1434 B_INVAL | B_FORCE, kcred, NULL); 1435 } 1436 } 1437 1438 void 1439 dump_messages(void) 1440 { 1441 log_dump_t ld; 1442 mblk_t *mctl, *mdata; 1443 queue_t *q, *qlast; 1444 u_offset_t dumpvp_start; 1445 1446 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL) 1447 return; 1448 1449 dumpbuf.cur = dumpbuf.start; 1450 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET; 1451 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE; 1452 dumpbuf.vp_off = dumpvp_start; 1453 1454 qlast = NULL; 1455 do { 1456 for (q = log_consq; q->q_next != qlast; q = q->q_next) 1457 continue; 1458 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) { 1459 dump_timeleft = dump_timeout; 1460 mdata = mctl->b_cont; 1461 ld.ld_magic = LOG_MAGIC; 1462 ld.ld_msgsize = MBLKL(mctl->b_cont); 1463 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl)); 1464 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata)); 1465 dumpvp_write(&ld, sizeof (ld)); 1466 dumpvp_write(mctl->b_rptr, MBLKL(mctl)); 1467 dumpvp_write(mdata->b_rptr, MBLKL(mdata)); 1468 } 1469 } while ((qlast = q) != log_consq); 1470 1471 ld.ld_magic = 0; /* indicate end of messages */ 1472 dumpvp_write(&ld, sizeof (ld)); 1473 (void) dumpvp_flush(); 1474 if (!panicstr) { 1475 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1476 (size_t)(dumpbuf.vp_off - dumpvp_start), 1477 B_INVAL | B_FORCE, kcred, NULL); 1478 } 1479 } 1480 1481 /* 1482 * The following functions are called on multiple CPUs during dump. 1483 * They must not use most kernel services, because all cross-calls are 1484 * disabled during panic. Therefore, blocking locks and cache flushes 1485 * will not work. 1486 */ 1487 1488 /* 1489 * Copy pages, trapping ECC errors. Also, for robustness, trap data 1490 * access in case something goes wrong in the hat layer and the 1491 * mapping is broken. 1492 */ 1493 static int 1494 dump_pagecopy(void *src, void *dst) 1495 { 1496 long *wsrc = (long *)src; 1497 long *wdst = (long *)dst; 1498 const ulong_t ncopies = PAGESIZE / sizeof (long); 1499 volatile int w = 0; 1500 volatile int ueoff = -1; 1501 on_trap_data_t otd; 1502 1503 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) { 1504 if (ueoff == -1) 1505 ueoff = w * sizeof (long); 1506 /* report "bad ECC" or "bad address" */ 1507 #ifdef _LP64 1508 if (otd.ot_trap & OT_DATA_EC) 1509 wdst[w++] = 0x00badecc00badecc; 1510 else 1511 wdst[w++] = 0x00badadd00badadd; 1512 #else 1513 if (otd.ot_trap & OT_DATA_EC) 1514 wdst[w++] = 0x00badecc; 1515 else 1516 wdst[w++] = 0x00badadd; 1517 #endif 1518 } 1519 while (w < ncopies) { 1520 wdst[w] = wsrc[w]; 1521 w++; 1522 } 1523 no_trap(); 1524 return (ueoff); 1525 } 1526 1527 static void 1528 dumpsys_close_cq(cqueue_t *cq, int live) 1529 { 1530 if (live) { 1531 mutex_enter(&cq->mutex); 1532 atomic_dec_uint(&cq->open); 1533 cv_signal(&cq->cv); 1534 mutex_exit(&cq->mutex); 1535 } else { 1536 atomic_dec_uint(&cq->open); 1537 } 1538 } 1539 1540 static inline void 1541 dumpsys_spinlock(lock_t *lp) 1542 { 1543 uint_t backoff = 0; 1544 int loop_count = 0; 1545 1546 while (LOCK_HELD(lp) || !lock_spin_try(lp)) { 1547 if (++loop_count >= ncpus) { 1548 backoff = mutex_lock_backoff(0); 1549 loop_count = 0; 1550 } else { 1551 backoff = mutex_lock_backoff(backoff); 1552 } 1553 mutex_lock_delay(backoff); 1554 } 1555 } 1556 1557 static inline void 1558 dumpsys_spinunlock(lock_t *lp) 1559 { 1560 lock_clear(lp); 1561 } 1562 1563 static inline void 1564 dumpsys_lock(cqueue_t *cq, int live) 1565 { 1566 if (live) 1567 mutex_enter(&cq->mutex); 1568 else 1569 dumpsys_spinlock(&cq->spinlock); 1570 } 1571 1572 static inline void 1573 dumpsys_unlock(cqueue_t *cq, int live, int signal) 1574 { 1575 if (live) { 1576 if (signal) 1577 cv_signal(&cq->cv); 1578 mutex_exit(&cq->mutex); 1579 } else { 1580 dumpsys_spinunlock(&cq->spinlock); 1581 } 1582 } 1583 1584 static void 1585 dumpsys_wait_cq(cqueue_t *cq, int live) 1586 { 1587 if (live) { 1588 cv_wait(&cq->cv, &cq->mutex); 1589 } else { 1590 dumpsys_spinunlock(&cq->spinlock); 1591 while (cq->open) 1592 if (cq->first) 1593 break; 1594 dumpsys_spinlock(&cq->spinlock); 1595 } 1596 } 1597 1598 static void 1599 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live) 1600 { 1601 if (cp == NULL) 1602 return; 1603 1604 dumpsys_lock(cq, live); 1605 1606 if (cq->ts != 0) { 1607 cq->empty += gethrtime() - cq->ts; 1608 cq->ts = 0; 1609 } 1610 1611 cp->state = newstate; 1612 cp->next = NULL; 1613 if (cq->last == NULL) 1614 cq->first = cp; 1615 else 1616 cq->last->next = cp; 1617 cq->last = cp; 1618 1619 dumpsys_unlock(cq, live, 1); 1620 } 1621 1622 static cbuf_t * 1623 dumpsys_get_cq(cqueue_t *cq, int live) 1624 { 1625 cbuf_t *cp; 1626 hrtime_t now = gethrtime(); 1627 1628 dumpsys_lock(cq, live); 1629 1630 /* CONSTCOND */ 1631 while (1) { 1632 cp = (cbuf_t *)cq->first; 1633 if (cp == NULL) { 1634 if (cq->open == 0) 1635 break; 1636 dumpsys_wait_cq(cq, live); 1637 continue; 1638 } 1639 cq->first = cp->next; 1640 if (cq->first == NULL) { 1641 cq->last = NULL; 1642 cq->ts = now; 1643 } 1644 break; 1645 } 1646 1647 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0); 1648 return (cp); 1649 } 1650 1651 /* 1652 * Send an error message to the console. If the main task is running 1653 * just write the message via uprintf. If a helper is running the 1654 * message has to be put on a queue for the main task. Setting fmt to 1655 * NULL means flush the error message buffer. If fmt is not NULL, just 1656 * add the text to the existing buffer. 1657 */ 1658 static void 1659 dumpsys_errmsg(helper_t *hp, const char *fmt, ...) 1660 { 1661 dumpsync_t *ds = hp->ds; 1662 cbuf_t *cp = hp->cperr; 1663 va_list adx; 1664 1665 if (hp->helper == MAINHELPER) { 1666 if (fmt != NULL) { 1667 if (ds->neednl) { 1668 uprintf("\n"); 1669 ds->neednl = 0; 1670 } 1671 va_start(adx, fmt); 1672 vuprintf(fmt, adx); 1673 va_end(adx); 1674 } 1675 } else if (fmt == NULL) { 1676 if (cp != NULL) { 1677 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1678 hp->cperr = NULL; 1679 } 1680 } else { 1681 if (hp->cperr == NULL) { 1682 cp = CQ_GET(freebufq); 1683 hp->cperr = cp; 1684 cp->used = 0; 1685 } 1686 va_start(adx, fmt); 1687 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used, 1688 fmt, adx); 1689 va_end(adx); 1690 if ((cp->used + LOG_MSGSIZE) > cp->size) { 1691 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1692 hp->cperr = NULL; 1693 } 1694 } 1695 } 1696 1697 /* 1698 * Write an output buffer to the dump file. If the main task is 1699 * running just write the data. If a helper is running the output is 1700 * placed on a queue for the main task. 1701 */ 1702 static void 1703 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used) 1704 { 1705 dumpsync_t *ds = hp->ds; 1706 1707 if (hp->helper == MAINHELPER) { 1708 HRSTART(ds->perpage, write); 1709 dumpvp_write(cp->buf, used); 1710 HRSTOP(ds->perpage, write); 1711 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 1712 } else { 1713 cp->used = used; 1714 CQ_PUT(mainq, cp, CBUF_WRITE); 1715 } 1716 } 1717 1718 /* 1719 * Copy one page within the mapped range. The offset starts at 0 and 1720 * is relative to the first pfn. cp->buf + cp->off is the address of 1721 * the first pfn. If dump_pagecopy returns a UE offset, create an 1722 * error message. Returns the offset to the next pfn in the range 1723 * selected by the bitmap. 1724 */ 1725 static int 1726 dumpsys_copy_page(helper_t *hp, int offset) 1727 { 1728 cbuf_t *cp = hp->cpin; 1729 int ueoff; 1730 1731 ASSERT(cp->off + offset + PAGESIZE <= cp->size); 1732 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum)); 1733 1734 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page); 1735 1736 /* ueoff is the offset in the page to a UE error */ 1737 if (ueoff != -1) { 1738 uint64_t pa = ptob(cp->pfn) + offset + ueoff; 1739 1740 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n", 1741 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa); 1742 } 1743 1744 /* 1745 * Advance bitnum and offset to the next input page for the 1746 * next call to this function. 1747 */ 1748 offset += PAGESIZE; 1749 cp->bitnum++; 1750 while (cp->off + offset < cp->size) { 1751 if (BT_TEST(dumpcfg.bitmap, cp->bitnum)) 1752 break; 1753 offset += PAGESIZE; 1754 cp->bitnum++; 1755 } 1756 1757 return (offset); 1758 } 1759 1760 /* 1761 * Read the helper queue, and copy one mapped page. Return 0 when 1762 * done. Return 1 when a page has been copied into hp->page. 1763 */ 1764 static int 1765 dumpsys_sread(helper_t *hp) 1766 { 1767 dumpsync_t *ds = hp->ds; 1768 1769 /* CONSTCOND */ 1770 while (1) { 1771 1772 /* Find the next input buffer. */ 1773 if (hp->cpin == NULL) { 1774 HRSTART(hp->perpage, inwait); 1775 1776 /* CONSTCOND */ 1777 while (1) { 1778 hp->cpin = CQ_GET(helperq); 1779 dump_timeleft = dump_timeout; 1780 1781 /* 1782 * NULL return means the helper queue 1783 * is closed and empty. 1784 */ 1785 if (hp->cpin == NULL) 1786 break; 1787 1788 /* Have input, check for dump I/O error. */ 1789 if (!dump_ioerr) 1790 break; 1791 1792 /* 1793 * If an I/O error occurs, stay in the 1794 * loop in order to empty the helper 1795 * queue. Return the buffers to the 1796 * main task to unmap and free it. 1797 */ 1798 hp->cpin->used = 0; 1799 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1800 } 1801 HRSTOP(hp->perpage, inwait); 1802 1803 /* Stop here when the helper queue is closed. */ 1804 if (hp->cpin == NULL) 1805 break; 1806 1807 /* Set the offset=0 to get the first pfn. */ 1808 hp->in = 0; 1809 1810 /* Set the total processed to 0 */ 1811 hp->used = 0; 1812 } 1813 1814 /* Process the next page. */ 1815 if (hp->used < hp->cpin->used) { 1816 1817 /* 1818 * Get the next page from the input buffer and 1819 * return a copy. 1820 */ 1821 ASSERT(hp->in != -1); 1822 HRSTART(hp->perpage, copy); 1823 hp->in = dumpsys_copy_page(hp, hp->in); 1824 hp->used += PAGESIZE; 1825 HRSTOP(hp->perpage, copy); 1826 break; 1827 1828 } else { 1829 1830 /* 1831 * Done with the input. Flush the VM and 1832 * return the buffer to the main task. 1833 */ 1834 if (panicstr && hp->helper != MAINHELPER) 1835 hat_flush_range(kas.a_hat, 1836 hp->cpin->buf, hp->cpin->size); 1837 dumpsys_errmsg(hp, NULL); 1838 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1839 hp->cpin = NULL; 1840 } 1841 } 1842 1843 return (hp->cpin != NULL); 1844 } 1845 1846 /* 1847 * Compress size bytes starting at buf with bzip2 1848 * mode: 1849 * BZ_RUN add one more compressed page 1850 * BZ_FINISH no more input, flush the state 1851 */ 1852 static void 1853 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode) 1854 { 1855 dumpsync_t *ds = hp->ds; 1856 const int CSIZE = sizeof (dumpcsize_t); 1857 bz_stream *ps = &hp->bzstream; 1858 int rc = 0; 1859 uint32_t csize; 1860 dumpcsize_t cs; 1861 1862 /* Set input pointers to new input page */ 1863 if (size > 0) { 1864 ps->avail_in = size; 1865 ps->next_in = buf; 1866 } 1867 1868 /* CONSTCOND */ 1869 while (1) { 1870 1871 /* Quit when all input has been consumed */ 1872 if (ps->avail_in == 0 && mode == BZ_RUN) 1873 break; 1874 1875 /* Get a new output buffer */ 1876 if (hp->cpout == NULL) { 1877 HRSTART(hp->perpage, outwait); 1878 hp->cpout = CQ_GET(freebufq); 1879 HRSTOP(hp->perpage, outwait); 1880 ps->avail_out = hp->cpout->size - CSIZE; 1881 ps->next_out = hp->cpout->buf + CSIZE; 1882 } 1883 1884 /* Compress input, or finalize */ 1885 HRSTART(hp->perpage, compress); 1886 rc = BZ2_bzCompress(ps, mode); 1887 HRSTOP(hp->perpage, compress); 1888 1889 /* Check for error */ 1890 if (mode == BZ_RUN && rc != BZ_RUN_OK) { 1891 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n", 1892 hp->helper, BZ2_bzErrorString(rc), 1893 hp->cpin->pagenum); 1894 break; 1895 } 1896 1897 /* Write the buffer if it is full, or we are flushing */ 1898 if (ps->avail_out == 0 || mode == BZ_FINISH) { 1899 csize = hp->cpout->size - CSIZE - ps->avail_out; 1900 cs = DUMP_SET_TAG(csize, hp->tag); 1901 if (csize > 0) { 1902 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 1903 dumpsys_swrite(hp, hp->cpout, csize + CSIZE); 1904 hp->cpout = NULL; 1905 } 1906 } 1907 1908 /* Check for final complete */ 1909 if (mode == BZ_FINISH) { 1910 if (rc == BZ_STREAM_END) 1911 break; 1912 if (rc != BZ_FINISH_OK) { 1913 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n", 1914 hp->helper, BZ2_bzErrorString(rc)); 1915 break; 1916 } 1917 } 1918 } 1919 1920 /* Cleanup state and buffers */ 1921 if (mode == BZ_FINISH) { 1922 1923 /* Reset state so that it is re-usable. */ 1924 (void) BZ2_bzCompressReset(&hp->bzstream); 1925 1926 /* Give any unused outout buffer to the main task */ 1927 if (hp->cpout != NULL) { 1928 hp->cpout->used = 0; 1929 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG); 1930 hp->cpout = NULL; 1931 } 1932 } 1933 } 1934 1935 static void 1936 dumpsys_bz2compress(helper_t *hp) 1937 { 1938 dumpsync_t *ds = hp->ds; 1939 dumpstreamhdr_t sh; 1940 1941 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 1942 sh.stream_pagenum = (pgcnt_t)-1; 1943 sh.stream_npages = 0; 1944 hp->cpin = NULL; 1945 hp->cpout = NULL; 1946 hp->cperr = NULL; 1947 hp->in = 0; 1948 hp->out = 0; 1949 hp->bzstream.avail_in = 0; 1950 1951 /* Bump reference to mainq while we are running */ 1952 CQ_OPEN(mainq); 1953 1954 /* Get one page at a time */ 1955 while (dumpsys_sread(hp)) { 1956 if (sh.stream_pagenum != hp->cpin->pagenum) { 1957 sh.stream_pagenum = hp->cpin->pagenum; 1958 sh.stream_npages = btop(hp->cpin->used); 1959 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN); 1960 } 1961 dumpsys_bzrun(hp, hp->page, PAGESIZE, 0); 1962 } 1963 1964 /* Done with input, flush any partial buffer */ 1965 if (sh.stream_pagenum != (pgcnt_t)-1) { 1966 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH); 1967 dumpsys_errmsg(hp, NULL); 1968 } 1969 1970 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 1971 1972 /* Decrement main queue count, we are done */ 1973 CQ_CLOSE(mainq); 1974 } 1975 1976 /* 1977 * Compress with lzjb 1978 * write stream block if full or size==0 1979 * if csize==0 write stream header, else write <csize, data> 1980 * size==0 is a call to flush a buffer 1981 * hp->cpout is the buffer we are flushing or filling 1982 * hp->out is the next index to fill data 1983 * osize is either csize+data, or the size of a stream header 1984 */ 1985 static void 1986 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size) 1987 { 1988 dumpsync_t *ds = hp->ds; 1989 const int CSIZE = sizeof (dumpcsize_t); 1990 dumpcsize_t cs; 1991 size_t osize = csize > 0 ? CSIZE + size : size; 1992 1993 /* If flush, and there is no buffer, just return */ 1994 if (size == 0 && hp->cpout == NULL) 1995 return; 1996 1997 /* If flush, or cpout is full, write it out */ 1998 if (size == 0 || 1999 hp->cpout != NULL && hp->out + osize > hp->cpout->size) { 2000 2001 /* Set tag+size word at the front of the stream block. */ 2002 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag); 2003 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 2004 2005 /* Write block to dump file. */ 2006 dumpsys_swrite(hp, hp->cpout, hp->out); 2007 2008 /* Clear pointer to indicate we need a new buffer */ 2009 hp->cpout = NULL; 2010 2011 /* flushing, we are done */ 2012 if (size == 0) 2013 return; 2014 } 2015 2016 /* Get an output buffer if we dont have one. */ 2017 if (hp->cpout == NULL) { 2018 HRSTART(hp->perpage, outwait); 2019 hp->cpout = CQ_GET(freebufq); 2020 HRSTOP(hp->perpage, outwait); 2021 hp->out = CSIZE; 2022 } 2023 2024 /* Store csize word. This is the size of compressed data. */ 2025 if (csize > 0) { 2026 cs = DUMP_SET_TAG(csize, 0); 2027 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE); 2028 hp->out += CSIZE; 2029 } 2030 2031 /* Store the data. */ 2032 (void) memcpy(hp->cpout->buf + hp->out, buf, size); 2033 hp->out += size; 2034 } 2035 2036 static void 2037 dumpsys_lzjbcompress(helper_t *hp) 2038 { 2039 dumpsync_t *ds = hp->ds; 2040 size_t csize; 2041 dumpstreamhdr_t sh; 2042 2043 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2044 sh.stream_pagenum = (pfn_t)-1; 2045 sh.stream_npages = 0; 2046 hp->cpin = NULL; 2047 hp->cpout = NULL; 2048 hp->cperr = NULL; 2049 hp->in = 0; 2050 hp->out = 0; 2051 2052 /* Bump reference to mainq while we are running */ 2053 CQ_OPEN(mainq); 2054 2055 /* Get one page at a time */ 2056 while (dumpsys_sread(hp)) { 2057 2058 /* Create a stream header for each new input map */ 2059 if (sh.stream_pagenum != hp->cpin->pagenum) { 2060 sh.stream_pagenum = hp->cpin->pagenum; 2061 sh.stream_npages = btop(hp->cpin->used); 2062 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh)); 2063 } 2064 2065 /* Compress one page */ 2066 HRSTART(hp->perpage, compress); 2067 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2068 HRSTOP(hp->perpage, compress); 2069 2070 /* Add csize+data to output block */ 2071 ASSERT(csize > 0 && csize <= PAGESIZE); 2072 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize); 2073 } 2074 2075 /* Done with input, flush any partial buffer */ 2076 if (sh.stream_pagenum != (pfn_t)-1) { 2077 dumpsys_lzjbrun(hp, 0, NULL, 0); 2078 dumpsys_errmsg(hp, NULL); 2079 } 2080 2081 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2082 2083 /* Decrement main queue count, we are done */ 2084 CQ_CLOSE(mainq); 2085 } 2086 2087 /* 2088 * Dump helper called from panic_idle() to compress pages. CPUs in 2089 * this path must not call most kernel services. 2090 * 2091 * During panic, all but one of the CPUs is idle. These CPUs are used 2092 * as helpers working in parallel to copy and compress memory 2093 * pages. During a panic, however, these processors cannot call any 2094 * kernel services. This is because mutexes become no-ops during 2095 * panic, and, cross-call interrupts are inhibited. Therefore, during 2096 * panic dump the helper CPUs communicate with the panic CPU using 2097 * memory variables. All memory mapping and I/O is performed by the 2098 * panic CPU. 2099 */ 2100 void 2101 dumpsys_helper() 2102 { 2103 dumpsys_spinlock(&dumpcfg.helper_lock); 2104 if (dumpcfg.helpers_wanted) { 2105 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2106 2107 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2108 if (hp->helper == FREEHELPER) { 2109 hp->helper = CPU->cpu_id; 2110 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2111 2112 dumpsys_spinunlock(&dumpcfg.helper_lock); 2113 2114 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2115 dumpsys_lzjbcompress(hp); 2116 else 2117 dumpsys_bz2compress(hp); 2118 2119 hp->helper = DONEHELPER; 2120 return; 2121 } 2122 } 2123 } 2124 dumpsys_spinunlock(&dumpcfg.helper_lock); 2125 } 2126 2127 /* 2128 * Dump helper for live dumps. 2129 * These run as a system task. 2130 */ 2131 static void 2132 dumpsys_live_helper(void *arg) 2133 { 2134 helper_t *hp = arg; 2135 2136 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2137 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2138 dumpsys_lzjbcompress(hp); 2139 else 2140 dumpsys_bz2compress(hp); 2141 } 2142 2143 /* 2144 * Compress one page with lzjb (single threaded case) 2145 */ 2146 static void 2147 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) 2148 { 2149 dumpsync_t *ds = hp->ds; 2150 uint32_t csize; 2151 2152 hp->helper = MAINHELPER; 2153 hp->in = 0; 2154 hp->used = 0; 2155 hp->cpin = cp; 2156 while (hp->used < cp->used) { 2157 HRSTART(hp->perpage, copy); 2158 hp->in = dumpsys_copy_page(hp, hp->in); 2159 hp->used += PAGESIZE; 2160 HRSTOP(hp->perpage, copy); 2161 2162 HRSTART(hp->perpage, compress); 2163 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2164 HRSTOP(hp->perpage, compress); 2165 2166 HRSTART(hp->perpage, write); 2167 dumpvp_write(&csize, sizeof (csize)); 2168 dumpvp_write(hp->lzbuf, csize); 2169 HRSTOP(hp->perpage, write); 2170 } 2171 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 2172 hp->cpin = NULL; 2173 } 2174 2175 /* 2176 * Main task to dump pages. This is called on the dump CPU. 2177 */ 2178 static void 2179 dumpsys_main_task(void *arg) 2180 { 2181 dumpsync_t *ds = arg; 2182 pgcnt_t pagenum = 0, bitnum = 0, hibitnum; 2183 dumpmlw_t mlw; 2184 cbuf_t *cp; 2185 pgcnt_t baseoff, pfnoff; 2186 pfn_t base, pfn; 2187 int sec; 2188 2189 dump_init_memlist_walker(&mlw); 2190 2191 /* CONSTCOND */ 2192 while (1) { 2193 2194 if (ds->percent > ds->percent_done) { 2195 ds->percent_done = ds->percent; 2196 sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000; 2197 uprintf("^\r%2d:%02d %3d%% done", 2198 sec / 60, sec % 60, ds->percent); 2199 ds->neednl = 1; 2200 } 2201 2202 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { 2203 2204 /* the writerq never blocks */ 2205 cp = CQ_GET(writerq); 2206 if (cp == NULL) 2207 break; 2208 2209 dump_timeleft = dump_timeout; 2210 2211 HRSTART(ds->perpage, write); 2212 dumpvp_write(cp->buf, cp->used); 2213 HRSTOP(ds->perpage, write); 2214 2215 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2216 } 2217 2218 /* 2219 * Wait here for some buffers to process. Returns NULL 2220 * when all helpers have terminated and all buffers 2221 * have been processed. 2222 */ 2223 cp = CQ_GET(mainq); 2224 2225 if (cp == NULL) { 2226 2227 /* Drain the write queue. */ 2228 if (!CQ_IS_EMPTY(writerq)) 2229 continue; 2230 2231 /* Main task exits here. */ 2232 break; 2233 } 2234 2235 dump_timeleft = dump_timeout; 2236 2237 switch (cp->state) { 2238 2239 case CBUF_FREEMAP: 2240 2241 /* 2242 * Note that we drop CBUF_FREEMAP buffers on 2243 * the floor (they will not be on any cqueue) 2244 * when we no longer need them. 2245 */ 2246 if (bitnum >= dumpcfg.bitmapsize) 2247 break; 2248 2249 if (dump_ioerr) { 2250 bitnum = dumpcfg.bitmapsize; 2251 CQ_CLOSE(helperq); 2252 break; 2253 } 2254 2255 HRSTART(ds->perpage, bitmap); 2256 for (; bitnum < dumpcfg.bitmapsize; bitnum++) 2257 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2258 break; 2259 HRSTOP(ds->perpage, bitmap); 2260 dump_timeleft = dump_timeout; 2261 2262 if (bitnum >= dumpcfg.bitmapsize) { 2263 CQ_CLOSE(helperq); 2264 break; 2265 } 2266 2267 /* 2268 * Try to map CBUF_MAPSIZE ranges. Can't 2269 * assume that memory segment size is a 2270 * multiple of CBUF_MAPSIZE. Can't assume that 2271 * the segment starts on a CBUF_MAPSIZE 2272 * boundary. 2273 */ 2274 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2275 ASSERT(pfn != PFN_INVALID); 2276 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); 2277 2278 base = P2ALIGN(pfn, CBUF_MAPNP); 2279 if (base < mlw.mpaddr) { 2280 base = mlw.mpaddr; 2281 baseoff = P2PHASE(base, CBUF_MAPNP); 2282 } else { 2283 baseoff = 0; 2284 } 2285 2286 pfnoff = pfn - base; 2287 if (pfnoff + mlw.mpleft < CBUF_MAPNP) { 2288 hibitnum = bitnum + mlw.mpleft; 2289 cp->size = ptob(pfnoff + mlw.mpleft); 2290 } else { 2291 hibitnum = bitnum - pfnoff + CBUF_MAPNP - 2292 baseoff; 2293 cp->size = CBUF_MAPSIZE - ptob(baseoff); 2294 } 2295 2296 cp->pfn = pfn; 2297 cp->bitnum = bitnum++; 2298 cp->pagenum = pagenum++; 2299 cp->off = ptob(pfnoff); 2300 2301 for (; bitnum < hibitnum; bitnum++) 2302 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2303 pagenum++; 2304 2305 dump_timeleft = dump_timeout; 2306 cp->used = ptob(pagenum - cp->pagenum); 2307 2308 HRSTART(ds->perpage, map); 2309 hat_devload(kas.a_hat, cp->buf, cp->size, base, 2310 PROT_READ, HAT_LOAD_NOCONSIST); 2311 HRSTOP(ds->perpage, map); 2312 2313 ds->pages_mapped += btop(cp->size); 2314 ds->pages_used += pagenum - cp->pagenum; 2315 2316 CQ_OPEN(mainq); 2317 2318 /* 2319 * If there are no helpers the main task does 2320 * non-streams lzjb compress. 2321 */ 2322 if (dumpcfg.clevel == 0) { 2323 dumpsys_lzjb_page(dumpcfg.helper, cp); 2324 break; 2325 } 2326 2327 /* pass mapped pages to a helper */ 2328 CQ_PUT(helperq, cp, CBUF_INREADY); 2329 2330 /* the last page was done */ 2331 if (bitnum >= dumpcfg.bitmapsize) 2332 CQ_CLOSE(helperq); 2333 2334 break; 2335 2336 case CBUF_USEDMAP: 2337 2338 ds->npages += btop(cp->used); 2339 2340 HRSTART(ds->perpage, unmap); 2341 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); 2342 HRSTOP(ds->perpage, unmap); 2343 2344 if (bitnum < dumpcfg.bitmapsize) 2345 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2346 CQ_CLOSE(mainq); 2347 2348 ASSERT(ds->npages <= dumphdr->dump_npages); 2349 ds->percent = ds->npages * 100LL / dumphdr->dump_npages; 2350 break; 2351 2352 case CBUF_WRITE: 2353 2354 CQ_PUT(writerq, cp, CBUF_WRITE); 2355 break; 2356 2357 case CBUF_ERRMSG: 2358 2359 if (cp->used > 0) { 2360 cp->buf[cp->size - 2] = '\n'; 2361 cp->buf[cp->size - 1] = '\0'; 2362 if (ds->neednl) { 2363 uprintf("\n%s", cp->buf); 2364 ds->neednl = 0; 2365 } else { 2366 uprintf("%s", cp->buf); 2367 } 2368 /* wait for console output */ 2369 drv_usecwait(200000); 2370 dump_timeleft = dump_timeout; 2371 } 2372 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2373 break; 2374 2375 default: 2376 uprintf("dump: unexpected buffer state %d, " 2377 "buffer will be lost\n", cp->state); 2378 break; 2379 2380 } /* end switch */ 2381 2382 } /* end while(1) */ 2383 } 2384 2385 #ifdef COLLECT_METRICS 2386 size_t 2387 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) 2388 { 2389 dumpcfg_t *cfg = &dumpcfg; 2390 int myid = CPU->cpu_seqid; 2391 int i, compress_ratio; 2392 int sec, iorate; 2393 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; 2394 char *e = buf + size; 2395 char *p = buf; 2396 2397 sec = ds->elapsed / (1000 * 1000 * 1000ULL); 2398 if (sec < 1) 2399 sec = 1; 2400 2401 if (ds->iotime < 1) 2402 ds->iotime = 1; 2403 iorate = (ds->nwrite * 100000ULL) / ds->iotime; 2404 2405 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1); 2406 2407 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0) 2408 2409 P("Master cpu_seqid,%d\n", CPU->cpu_seqid); 2410 P("Master cpu_id,%d\n", CPU->cpu_id); 2411 P("dump_flags,0x%x\n", dumphdr->dump_flags); 2412 P("dump_ioerr,%d\n", dump_ioerr); 2413 2414 P("Helpers:\n"); 2415 for (i = 0; i < ncpus; i++) { 2416 if ((i & 15) == 0) 2417 P(",,%03d,", i); 2418 if (i == myid) 2419 P(" M"); 2420 else if (BT_TEST(cfg->helpermap, i)) 2421 P("%4d", cpu_seq[i]->cpu_id); 2422 else 2423 P(" *"); 2424 if ((i & 15) == 15) 2425 P("\n"); 2426 } 2427 2428 P("ncbuf_used,%d\n", cfg->ncbuf_used); 2429 P("ncmap,%d\n", cfg->ncmap); 2430 2431 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); 2432 P("Found small pages,%ld\n", cfg->foundsm); 2433 2434 P("Compression level,%d\n", cfg->clevel); 2435 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel", 2436 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb"); 2437 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % 2438 100); 2439 P("nhelper_used,%d\n", cfg->nhelper_used); 2440 2441 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); 2442 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); 2443 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); 2444 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); 2445 P("dumpbuf.size,%ld\n", dumpbuf.size); 2446 2447 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); 2448 P("Dump pages,%llu\n", (u_longlong_t)ds->npages); 2449 P("Dump time,%d\n", sec); 2450 2451 if (ds->pages_mapped > 0) 2452 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) 2453 / ds->pages_mapped)); 2454 2455 P("\nPer-page metrics:\n"); 2456 if (ds->npages > 0) { 2457 for (hp = cfg->helper; hp != hpend; hp++) { 2458 #define PERPAGE(x) ds->perpage.x += hp->perpage.x; 2459 PERPAGES; 2460 #undef PERPAGE 2461 } 2462 #define PERPAGE(x) \ 2463 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); 2464 PERPAGES; 2465 #undef PERPAGE 2466 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / 2467 ds->npages)); 2468 P("helperq.empty,%d\n", (int)(ds->helperq.empty / 2469 ds->npages)); 2470 P("writerq.empty,%d\n", (int)(ds->writerq.empty / 2471 ds->npages)); 2472 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); 2473 2474 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / 2475 ds->npages)); 2476 } 2477 #undef P 2478 if (p < e) 2479 bzero(p, e - p); 2480 return (p - buf); 2481 } 2482 #endif /* COLLECT_METRICS */ 2483 2484 /* 2485 * Dump the system. 2486 */ 2487 void 2488 dumpsys(void) 2489 { 2490 dumpsync_t *ds = &dumpsync; 2491 taskq_t *livetaskq = NULL; 2492 pfn_t pfn; 2493 pgcnt_t bitnum; 2494 proc_t *p; 2495 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2496 cbuf_t *cp; 2497 pid_t npids, pidx; 2498 char *content; 2499 char *buf; 2500 size_t size; 2501 int save_dump_clevel; 2502 dumpmlw_t mlw; 2503 dumpcsize_t datatag; 2504 dumpdatahdr_t datahdr; 2505 2506 if (dumpvp == NULL || dumphdr == NULL) { 2507 uprintf("skipping system dump - no dump device configured\n"); 2508 if (panicstr) { 2509 dumpcfg.helpers_wanted = 0; 2510 dumpsys_spinunlock(&dumpcfg.helper_lock); 2511 } 2512 return; 2513 } 2514 dumpbuf.cur = dumpbuf.start; 2515 2516 /* clear the sync variables */ 2517 ASSERT(dumpcfg.nhelper > 0); 2518 bzero(ds, sizeof (*ds)); 2519 ds->dumpcpu = CPU->cpu_id; 2520 2521 /* 2522 * Calculate the starting block for dump. If we're dumping on a 2523 * swap device, start 1/5 of the way in; otherwise, start at the 2524 * beginning. And never use the first page -- it may be a disk label. 2525 */ 2526 if (dumpvp->v_flag & VISSWAP) 2527 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET); 2528 else 2529 dumphdr->dump_start = DUMP_OFFSET; 2530 2531 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; 2532 dumphdr->dump_crashtime = gethrestime_sec(); 2533 dumphdr->dump_npages = 0; 2534 dumphdr->dump_nvtop = 0; 2535 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); 2536 dump_timeleft = dump_timeout; 2537 2538 if (panicstr) { 2539 dumphdr->dump_flags &= ~DF_LIVE; 2540 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL); 2541 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); 2542 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, 2543 panicstr, panicargs); 2544 2545 } 2546 2547 if (dump_conflags & DUMP_ALL) 2548 content = "all"; 2549 else if (dump_conflags & DUMP_CURPROC) 2550 content = "kernel + curproc"; 2551 else 2552 content = "kernel"; 2553 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, 2554 dumphdr->dump_start, content); 2555 2556 /* Make sure nodename is current */ 2557 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); 2558 2559 /* 2560 * If this is a live dump, try to open a VCHR vnode for better 2561 * performance. We must take care to flush the buffer cache 2562 * first. 2563 */ 2564 if (!panicstr) { 2565 vnode_t *cdev_vp, *cmn_cdev_vp; 2566 2567 ASSERT(dumpbuf.cdev_vp == NULL); 2568 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); 2569 if (cdev_vp != NULL) { 2570 cmn_cdev_vp = common_specvp(cdev_vp); 2571 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) 2572 == 0) { 2573 if (vn_has_cached_data(dumpvp)) 2574 (void) pvn_vplist_dirty(dumpvp, 0, NULL, 2575 B_INVAL | B_TRUNC, kcred); 2576 dumpbuf.cdev_vp = cmn_cdev_vp; 2577 } else { 2578 VN_RELE(cdev_vp); 2579 } 2580 } 2581 } 2582 2583 /* 2584 * Store a hires timestamp so we can look it up during debugging. 2585 */ 2586 lbolt_debug_entry(); 2587 2588 /* 2589 * Leave room for the message and ereport save areas and terminal dump 2590 * header. 2591 */ 2592 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - 2593 DUMP_ERPTSIZE; 2594 2595 /* 2596 * Write out the symbol table. It's no longer compressed, 2597 * so its 'size' and 'csize' are equal. 2598 */ 2599 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; 2600 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = 2601 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); 2602 2603 /* 2604 * Write out the translation map. 2605 */ 2606 dumphdr->dump_map = dumpvp_flush(); 2607 dump_as(&kas); 2608 dumphdr->dump_nvtop += dump_plat_addr(); 2609 2610 /* 2611 * call into hat, which may have unmapped pages that also need to 2612 * be in the dump 2613 */ 2614 hat_dump(); 2615 2616 if (dump_conflags & DUMP_ALL) { 2617 mutex_enter(&pidlock); 2618 2619 for (npids = 0, p = practive; p != NULL; p = p->p_next) 2620 dumpcfg.pids[npids++] = p->p_pid; 2621 2622 mutex_exit(&pidlock); 2623 2624 for (pidx = 0; pidx < npids; pidx++) 2625 (void) dump_process(dumpcfg.pids[pidx]); 2626 2627 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2628 dump_timeleft = dump_timeout; 2629 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2630 /* 2631 * Some hypervisors do not have all pages available to 2632 * be accessed by the guest OS. Check for page 2633 * accessibility. 2634 */ 2635 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) != 2636 PLAT_HOLD_OK) 2637 continue; 2638 BT_SET(dumpcfg.bitmap, bitnum); 2639 } 2640 dumphdr->dump_npages = dumpcfg.bitmapsize; 2641 dumphdr->dump_flags |= DF_ALL; 2642 2643 } else if (dump_conflags & DUMP_CURPROC) { 2644 /* 2645 * Determine which pid is to be dumped. If we're panicking, we 2646 * dump the process associated with panic_thread (if any). If 2647 * this is a live dump, we dump the process associated with 2648 * curthread. 2649 */ 2650 npids = 0; 2651 if (panicstr) { 2652 if (panic_thread != NULL && 2653 panic_thread->t_procp != NULL && 2654 panic_thread->t_procp != &p0) { 2655 dumpcfg.pids[npids++] = 2656 panic_thread->t_procp->p_pid; 2657 } 2658 } else { 2659 dumpcfg.pids[npids++] = curthread->t_procp->p_pid; 2660 } 2661 2662 if (npids && dump_process(dumpcfg.pids[0]) == 0) 2663 dumphdr->dump_flags |= DF_CURPROC; 2664 else 2665 dumphdr->dump_flags |= DF_KERNEL; 2666 2667 } else { 2668 dumphdr->dump_flags |= DF_KERNEL; 2669 } 2670 2671 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1; 2672 2673 /* 2674 * Write out the pfn table. 2675 */ 2676 dumphdr->dump_pfn = dumpvp_flush(); 2677 dump_init_memlist_walker(&mlw); 2678 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2679 dump_timeleft = dump_timeout; 2680 if (!BT_TEST(dumpcfg.bitmap, bitnum)) 2681 continue; 2682 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2683 ASSERT(pfn != PFN_INVALID); 2684 dumpvp_write(&pfn, sizeof (pfn_t)); 2685 } 2686 dump_plat_pfn(); 2687 2688 /* 2689 * Write out all the pages. 2690 * Map pages, copy them handling UEs, compress, and write them out. 2691 * Cooperate with any helpers running on CPUs in panic_idle(). 2692 */ 2693 dumphdr->dump_data = dumpvp_flush(); 2694 2695 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); 2696 ds->live = dumpcfg.clevel > 0 && 2697 (dumphdr->dump_flags & DF_LIVE) != 0; 2698 2699 save_dump_clevel = dumpcfg.clevel; 2700 if (panicstr) 2701 dumpsys_get_maxmem(); 2702 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2703 dumpcfg.clevel = DUMP_CLEVEL_LZJB; 2704 2705 dumpcfg.nhelper_used = 0; 2706 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2707 if (hp->page == NULL) { 2708 hp->helper = DONEHELPER; 2709 continue; 2710 } 2711 ++dumpcfg.nhelper_used; 2712 hp->helper = FREEHELPER; 2713 hp->taskqid = NULL; 2714 hp->ds = ds; 2715 bzero(&hp->perpage, sizeof (hp->perpage)); 2716 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2717 (void) BZ2_bzCompressReset(&hp->bzstream); 2718 } 2719 2720 CQ_OPEN(freebufq); 2721 CQ_OPEN(helperq); 2722 2723 dumpcfg.ncbuf_used = 0; 2724 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { 2725 if (cp->buf != NULL) { 2726 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2727 ++dumpcfg.ncbuf_used; 2728 } 2729 } 2730 2731 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) 2732 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2733 2734 ds->start = gethrtime(); 2735 ds->iowaitts = ds->start; 2736 2737 /* start helpers */ 2738 if (ds->live) { 2739 int n = dumpcfg.nhelper_used; 2740 int pri = MINCLSYSPRI - 25; 2741 2742 livetaskq = taskq_create("LiveDump", n, pri, n, n, 2743 TASKQ_PREPOPULATE); 2744 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2745 if (hp->page == NULL) 2746 continue; 2747 hp->helper = hp - dumpcfg.helper; 2748 hp->taskqid = taskq_dispatch(livetaskq, 2749 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); 2750 } 2751 2752 } else { 2753 if (panicstr) 2754 kmem_dump_begin(); 2755 dumpcfg.helpers_wanted = dumpcfg.clevel > 0; 2756 dumpsys_spinunlock(&dumpcfg.helper_lock); 2757 } 2758 2759 /* run main task */ 2760 dumpsys_main_task(ds); 2761 2762 ds->elapsed = gethrtime() - ds->start; 2763 if (ds->elapsed < 1) 2764 ds->elapsed = 1; 2765 2766 if (livetaskq != NULL) 2767 taskq_destroy(livetaskq); 2768 2769 if (ds->neednl) { 2770 uprintf("\n"); 2771 ds->neednl = 0; 2772 } 2773 2774 /* record actual pages dumped */ 2775 dumphdr->dump_npages = ds->npages; 2776 2777 /* platform-specific data */ 2778 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); 2779 2780 /* note any errors by clearing DF_COMPLETE */ 2781 if (dump_ioerr || ds->npages < dumphdr->dump_npages) 2782 dumphdr->dump_flags &= ~DF_COMPLETE; 2783 2784 /* end of stream blocks */ 2785 datatag = 0; 2786 dumpvp_write(&datatag, sizeof (datatag)); 2787 2788 bzero(&datahdr, sizeof (datahdr)); 2789 2790 /* buffer for metrics */ 2791 buf = dumpcfg.cbuf[0].buf; 2792 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - 2793 sizeof (dumpdatahdr_t)); 2794 2795 /* finish the kmem intercepts, collect kmem verbose info */ 2796 if (panicstr) { 2797 datahdr.dump_metrics = kmem_dump_finish(buf, size); 2798 buf += datahdr.dump_metrics; 2799 size -= datahdr.dump_metrics; 2800 } 2801 2802 /* compression info in data header */ 2803 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; 2804 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; 2805 datahdr.dump_maxcsize = CBUF_SIZE; 2806 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; 2807 datahdr.dump_nstreams = dumpcfg.nhelper_used; 2808 datahdr.dump_clevel = dumpcfg.clevel; 2809 #ifdef COLLECT_METRICS 2810 if (dump_metrics_on) 2811 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size); 2812 #endif 2813 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; 2814 2815 /* 2816 * Write out the initial and terminal dump headers. 2817 */ 2818 dumpbuf.vp_off = dumphdr->dump_start; 2819 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2820 (void) dumpvp_flush(); 2821 2822 dumpbuf.vp_limit = dumpvp_size; 2823 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; 2824 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2825 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); 2826 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); 2827 2828 (void) dumpvp_flush(); 2829 2830 uprintf("\r%3d%% done: %llu pages dumped, ", 2831 ds->percent_done, (u_longlong_t)ds->npages); 2832 2833 if (dump_ioerr == 0) { 2834 uprintf("dump succeeded\n"); 2835 } else { 2836 uprintf("dump failed: error %d\n", dump_ioerr); 2837 #ifdef DEBUG 2838 if (panicstr) 2839 debug_enter("dump failed"); 2840 #endif 2841 } 2842 2843 /* 2844 * Write out all undelivered messages. This has to be the *last* 2845 * thing we do because the dump process itself emits messages. 2846 */ 2847 if (panicstr) { 2848 dump_ereports(); 2849 dump_messages(); 2850 } 2851 2852 delay(2 * hz); /* let people see the 'done' message */ 2853 dump_timeleft = 0; 2854 dump_ioerr = 0; 2855 2856 /* restore settings after live dump completes */ 2857 if (!panicstr) { 2858 dumpcfg.clevel = save_dump_clevel; 2859 2860 /* release any VCHR open of the dump device */ 2861 if (dumpbuf.cdev_vp != NULL) { 2862 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, 2863 kcred, NULL); 2864 VN_RELE(dumpbuf.cdev_vp); 2865 dumpbuf.cdev_vp = NULL; 2866 } 2867 } 2868 } 2869 2870 /* 2871 * This function is called whenever the memory size, as represented 2872 * by the phys_install list, changes. 2873 */ 2874 void 2875 dump_resize() 2876 { 2877 mutex_enter(&dump_lock); 2878 dumphdr_init(); 2879 dumpbuf_resize(); 2880 dump_update_clevel(); 2881 mutex_exit(&dump_lock); 2882 } 2883 2884 /* 2885 * This function allows for dynamic resizing of a dump area. It assumes that 2886 * the underlying device has update its appropriate size(9P). 2887 */ 2888 int 2889 dumpvp_resize() 2890 { 2891 int error; 2892 vattr_t vattr; 2893 2894 mutex_enter(&dump_lock); 2895 vattr.va_mask = AT_SIZE; 2896 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) { 2897 mutex_exit(&dump_lock); 2898 return (error); 2899 } 2900 2901 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) { 2902 mutex_exit(&dump_lock); 2903 return (ENOSPC); 2904 } 2905 2906 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 2907 mutex_exit(&dump_lock); 2908 return (0); 2909 } 2910