1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/systm.h> 30 #include <sys/vm.h> 31 #include <sys/proc.h> 32 #include <sys/file.h> 33 #include <sys/conf.h> 34 #include <sys/kmem.h> 35 #include <sys/mem.h> 36 #include <sys/mman.h> 37 #include <sys/vnode.h> 38 #include <sys/errno.h> 39 #include <sys/memlist.h> 40 #include <sys/dumphdr.h> 41 #include <sys/dumpadm.h> 42 #include <sys/ksyms.h> 43 #include <sys/compress.h> 44 #include <sys/stream.h> 45 #include <sys/strsun.h> 46 #include <sys/cmn_err.h> 47 #include <sys/bitmap.h> 48 #include <sys/modctl.h> 49 #include <sys/utsname.h> 50 #include <sys/systeminfo.h> 51 #include <sys/vmem.h> 52 #include <sys/log.h> 53 #include <sys/var.h> 54 #include <sys/debug.h> 55 #include <sys/sunddi.h> 56 #include <fs/fs_subr.h> 57 #include <sys/fs/snode.h> 58 #include <sys/ontrap.h> 59 #include <sys/panic.h> 60 #include <sys/dkio.h> 61 #include <sys/vtoc.h> 62 #include <sys/errorq.h> 63 #include <sys/fm/util.h> 64 #include <sys/fs/zfs.h> 65 66 #include 
<vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <sys/clock_impl.h>

#include <bzip2/bzlib.h>

/*
 * Crash dump time is dominated by disk write time.  To reduce this,
 * the stronger compression method bzip2 is applied to reduce the dump
 * size and hence reduce I/O time.  However, bzip2 is much more
 * computationally expensive than the existing lzjb algorithm, so to
 * avoid increasing compression time, CPUs that are otherwise idle
 * during panic are employed to parallelize the compression task.
 * Many helper CPUs are needed to prevent bzip2 from being a
 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
 * parallelized instead.  Lastly, I/O and compression are performed by
 * different CPUs, and are hence overlapped in time, unlike the older
 * serial code.
 *
 * Another important consideration is the speed of the dump
 * device.  Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2.  Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * elevated for faster disks.  The dump device speed is inferred from
 * the setting for dumpbuf.iosize, see dump_update_clevel.
 */

/*
 * exported vars
 */
kmutex_t	dump_lock;		/* lock for dump configuration */
dumphdr_t	*dumphdr;		/* dump header */
int		dump_conflags = DUMP_KERNEL; /* dump configuration flags */
vnode_t		*dumpvp;		/* dump device vnode pointer */
u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
char		*dumppath;		/* pathname of dump device */
int		dump_timeout = 120;	/* timeout for dumping pages */
int		dump_timeleft;		/* portion of dump_timeout remaining */
int		dump_ioerr;		/* dump i/o error */
int		dump_check_used;	/* enable check for used pages */

/*
 * Tunables for dump compression and parallelism.
 * These can be set via /etc/system.
 *
 * dump_ncpu_low	number of helpers for parallel lzjb
 *	This is also the minimum configuration.
 *
 * dump_bzip2_level	bzip2 compression level: 1-9
 *	Higher numbers give greater compression, but take more memory
 *	and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
 *
 * dump_plat_mincpu	the cross-over limit for using bzip2 (per platform):
 *	if dump_plat_mincpu == 0, then always do single threaded dump
 *	if ncpu >= dump_plat_mincpu then try to use bzip2
 *
 * dump_metrics_on	if set, metrics are collected in the kernel, passed
 *	to savecore via the dump file, and recorded by savecore in
 *	METRICS.txt.
 */
uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
uint_t dump_bzip2_level = 1;	/* bzip2 level (1-9) */

/*
 * Tunables for pre-reserved heap.  dump_update_clevel() passes
 * (ncmap * dump_kmem_permap) + (dump_kmem_pages * PAGESIZE) to
 * kmem_dump_init(): dump_kmem_permap is a per-mapping-buffer amount
 * and dump_kmem_pages is a count of extra pages.
 */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 8;

/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/*
 * Minimum number of helpers configured.  Note that these expand to the
 * current value of the dump_ncpu_low tunable, not to a constant.
 */
#define	MINHELPERS	(dump_ncpu_low)
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)

/*
 * Define constant parameters.
 *
 * CBUF_SIZE		size of an output buffer (1 << 17 bytes)
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *			(1 << CBUF_MAPSHIFT bytes)
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))

/*
 * Compression metrics are accumulated nano-second subtotals. The
 * results are normalized by the number of pages dumped. A report is
 * generated when dumpsys() completes and is saved in the dump image
 * after the trailing dump header.
 *
 * Metrics are always collected. Set the variable dump_metrics_on to
 * cause metrics to be saved in the crash file, where savecore will
 * save them in the file METRICS.txt.
 */
/* X-macro list of the per-page metrics; each entry names one counter. */
#define	PERPAGES \
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
	PERPAGE(copy) PERPAGE(compress) \
	PERPAGE(write) \
	PERPAGE(inwait) PERPAGE(outwait)

/* One hrtime_t member is generated for every entry in PERPAGES. */
typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;
	PERPAGES
#undef PERPAGE
} perpage_t;

/*
 * This macro controls the code generation for collecting dump
 * performance information. By default, the code is generated, but
 * automatic saving of the information is disabled. If dump_metrics_on
 * is set to 1, the timing information is passed to savecore via the
 * crash file, where it is appended to the file dump-dir/METRICS.txt.
 */
#define	COLLECT_METRICS

#ifdef COLLECT_METRICS
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */

/*
 * Timing helpers.  "v" names an accumulator and "v##ts" the matching
 * timestamp holder (e.g. perpage / perpagets); "m" is the member.
 *
 * NOTE(review): HRBEGIN expands to two statements (so it is not safe as
 * the sole body of an unbraced if/else) and updates v.size; perpage_t
 * as defined here has no "size" member -- confirm whether it is used.
 */
#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
#define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
#define	HRNORM(v, m, n)		v.m /= (n)

#else
#define	HRSTART(v, m)
#define	HRSTOP(v, m)
#define	HRBEGIN(v, m, s)
#define	HREND(v, m)
#define	HRNORM(v, m, n)
#endif	/* COLLECT_METRICS */

/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 *
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors.)
 */

typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;

typedef struct cbuf cbuf_t;

/*
 * Buffer controller.  The same structure describes both input mappings
 * (size CBUF_MAPSIZE, buf is a VA range) and output buffers (size
 * CBUF_SIZE, buf is memory); see the state descriptions above.
 */
struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};

/*
 * cqueue_t queues: a uni-directional channel for communication
 * from the master to helper tasks or vice-versa using put and
 * get primitives.  Both mappings and data buffers are passed via
 * queues. Producers close a queue when done. The number of
 * active producers is reference counted so the consumer can
 * detect end of data. Concurrent access is mediated by atomic
 * operations for panic dump, or mutex/cv for live dump.
 *
 * There are four queues, used as follows:
 *
 *	Queue		Dataflow		NewState
 *	--------------------------------------------------
 *	mainq		master -> master	FREEMAP
 *	master has initialized or unmapped an input buffer
 *	--------------------------------------------------
 *	helperq		master -> helper	INREADY
 *	master has mapped input for use by helper
 *	--------------------------------------------------
 *	mainq		master <- helper	USEDMAP
 *	helper is done with input
 *	--------------------------------------------------
 *	freebufq	master -> helper	FREEBUF
 *	master has initialized or written an output buffer
 *	--------------------------------------------------
 *	mainq		master <- helper	WRITE
 *	block of compressed pages from a helper
 *	--------------------------------------------------
 *	mainq		master <- helper	ERRMSG
 *	error messages from a helper (memory error case)
 *	--------------------------------------------------
 *	writerq		master <- master	WRITE
 *	non-blocking queue of blocks to write
 *	--------------------------------------------------
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;

/*
 * Convenience macros for using the cqueue functions.
 * Note that the caller must have defined "dumpsync_t *ds"; "q" is the
 * name of a cqueue_t member of *ds (freebufq, mainq, helperq, writerq),
 * not an expression.
 */
#define	CQ_IS_EMPTY(q)					\
	(ds->q.first == NULL)

#define	CQ_OPEN(q)					\
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q)					\
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st)				\
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q)					\
	dumpsys_get_cq(&ds->q, ds->live)

/*
 * Dynamic state when dumpsys() is running.
 */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	perpage_t perpagets;		/* metrics (timestamps) */
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */

/*
 * helper_t helpers: contains the context for a stream. CPUs run in
 * parallel at dump time; each CPU creates a single stream of
 * compression data.  Stream data is divided into CBUF_SIZE blocks.
 * The blocks are written in order within a stream. But, blocks from
 * multiple streams can be interleaved. Each stream is identified by a
 * unique tag.
 */
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
	bz_stream bzstream;		/* bzip2 state */
} helper_t;

/* Special values for helper_t.helper; other values are bound ids. */
#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */

/*
 * configuration vars for dumpsys
 */
typedef struct dumpcfg {
	int	threshold;	/* ncpu threshold for bzip2 */
	int	nhelper;	/* number of helpers */
	int	nhelper_used;	/* actual number of helpers used */
	int	ncmap;		/* number of CBUF_MAPSIZE VA map buffers */
	int	ncbuf;		/* number of bufs for compression */
	int	ncbuf_used;	/* number of bufs in use */
	uint_t	clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t	*cmap;		/* array of input (map) buffers */
	cbuf_t	*cbuf;		/* array of output buffers */
	ulong_t	*helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t	*bitmap;	/* bitmap for marking pages to dump */
	ulong_t	*rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t	bitmapsize;	/* size of bitmap */
	pgcnt_t	rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t found4m;	/* number ranges allocated by dump */
	pgcnt_t foundsm;	/* number small pages allocated by dump */
	pid_t	*pids;		/* list of process IDs at dump time */
	size_t	maxsize;	/* memory size needed at dump time */
	size_t	maxvmsize;	/* size of reserved VM */
	char	*maxvm;		/* reserved VM for spare pages */
	lock_t	helper_lock;	/* protect helper state */
	char	helpers_wanted;	/* flag to enable parallelism */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */

/*
 * The dump I/O buffer.
 *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 * sized according to the optimum device transfer speed.
 */
typedef struct dumpbuf {
	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

dumpbuf_t dumpbuf;		/* I/O buffer */

/*
 * The dump I/O buffer must be at least one page, at most xfer_size
 * bytes, and should scale with physmem in between.  The transfer size
 * passed in will either represent a global default (maxphys) or the
 * best size for the device.  The size of the dumpbuf I/O buffer is
 * limited by dumpbuf_limit (8MB by default) because the dump
 * performance saturates beyond a certain size.  The default is to
 * select 1/4096 of the memory.
435 */ 436 static int dumpbuf_fraction = 12; /* memory size scale factor */ 437 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */ 438 439 static size_t 440 dumpbuf_iosize(size_t xfer_size) 441 { 442 size_t iosize = ptob(physmem >> dumpbuf_fraction); 443 444 if (iosize < PAGESIZE) 445 iosize = PAGESIZE; 446 else if (iosize > xfer_size) 447 iosize = xfer_size; 448 if (iosize > dumpbuf_limit) 449 iosize = dumpbuf_limit; 450 return (iosize & PAGEMASK); 451 } 452 453 /* 454 * resize the I/O buffer 455 */ 456 static void 457 dumpbuf_resize(void) 458 { 459 char *old_buf = dumpbuf.start; 460 size_t old_size = dumpbuf.size; 461 char *new_buf; 462 size_t new_size; 463 464 ASSERT(MUTEX_HELD(&dump_lock)); 465 466 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys)); 467 if (new_size <= old_size) 468 return; /* no need to reallocate buffer */ 469 470 new_buf = kmem_alloc(new_size, KM_SLEEP); 471 dumpbuf.size = new_size; 472 dumpbuf.start = new_buf; 473 dumpbuf.end = new_buf + new_size; 474 kmem_free(old_buf, old_size); 475 } 476 477 /* 478 * dump_update_clevel is called when dumpadm configures the dump device. 479 * Calculate number of helpers and buffers. 480 * Allocate the minimum configuration for now. 481 * 482 * When the dump file is configured we reserve a minimum amount of 483 * memory for use at crash time. But we reserve VA for all the memory 484 * we really want in order to do the fastest dump possible. The VA is 485 * backed by pages not being dumped, according to the bitmap. If 486 * there is insufficient spare memory, however, we fall back to the 487 * minimum. 488 * 489 * Live dump (savecore -L) always uses the minimum config. 490 * 491 * clevel 0 is single threaded lzjb 492 * clevel 1 is parallel lzjb 493 * clevel 2 is parallel bzip2 494 * 495 * The ncpu threshold is selected with dump_plat_mincpu. 496 * On OPL, set_platform_defaults() overrides the sun4u setting. 497 * The actual values are defined via DUMP_PLAT_*_MINCPU macros. 
498 * 499 * Architecture Threshold Algorithm 500 * sun4u < 51 parallel lzjb 501 * sun4u >= 51 parallel bzip2(*) 502 * sun4u OPL < 8 parallel lzjb 503 * sun4u OPL >= 8 parallel bzip2(*) 504 * sun4v < 128 parallel lzjb 505 * sun4v >= 128 parallel bzip2(*) 506 * x86 < 11 parallel lzjb 507 * x86 >= 11 parallel bzip2(*) 508 * 32-bit N/A single-threaded lzjb 509 * 510 * (*) bzip2 is only chosen if there is sufficient available 511 * memory for buffers at dump time. See dumpsys_get_maxmem(). 512 * 513 * Faster dump devices have larger I/O buffers. The threshold value is 514 * increased according to the size of the dump I/O buffer, because 515 * parallel lzjb performs better with faster disks. For buffers >= 1MB 516 * the threshold is 3X; for buffers >= 256K threshold is 2X. 517 * 518 * For parallel dumps, the number of helpers is ncpu-1. The CPU 519 * running panic runs the main task. For single-threaded dumps, the 520 * panic CPU does lzjb compression (it is tagged as MAINHELPER.) 521 * 522 * Need multiple buffers per helper so that they do not block waiting 523 * for the main task. 524 * parallel single-threaded 525 * Number of output buffers: nhelper*2 1 526 * Number of mapping buffers: nhelper*4 1 527 * 528 */ 529 static void 530 dump_update_clevel() 531 { 532 int tag; 533 size_t bz2size; 534 helper_t *hp, *hpend; 535 cbuf_t *cp, *cpend; 536 dumpcfg_t *old = &dumpcfg; 537 dumpcfg_t newcfg = *old; 538 dumpcfg_t *new = &newcfg; 539 540 ASSERT(MUTEX_HELD(&dump_lock)); 541 542 /* 543 * Free the previously allocated bufs and VM. 
544 */ 545 if (old->helper != NULL) { 546 547 /* helpers */ 548 hpend = &old->helper[old->nhelper]; 549 for (hp = old->helper; hp != hpend; hp++) { 550 if (hp->lzbuf != NULL) 551 kmem_free(hp->lzbuf, PAGESIZE); 552 if (hp->page != NULL) 553 kmem_free(hp->page, PAGESIZE); 554 } 555 kmem_free(old->helper, old->nhelper * sizeof (helper_t)); 556 557 /* VM space for mapping pages */ 558 cpend = &old->cmap[old->ncmap]; 559 for (cp = old->cmap; cp != cpend; cp++) 560 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE); 561 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t)); 562 563 /* output bufs */ 564 cpend = &old->cbuf[old->ncbuf]; 565 for (cp = old->cbuf; cp != cpend; cp++) 566 if (cp->buf != NULL) 567 kmem_free(cp->buf, cp->size); 568 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t)); 569 570 /* reserved VM for dumpsys_get_maxmem */ 571 if (old->maxvmsize > 0) 572 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize); 573 } 574 575 /* 576 * Allocate memory and VM. 577 * One CPU runs dumpsys, the rest are helpers. 578 */ 579 new->nhelper = ncpus - 1; 580 if (new->nhelper < 1) 581 new->nhelper = 1; 582 583 if (new->nhelper > DUMP_MAX_NHELPER) 584 new->nhelper = DUMP_MAX_NHELPER; 585 586 /* increase threshold for faster disks */ 587 new->threshold = dump_plat_mincpu; 588 if (dumpbuf.iosize >= DUMP_1MB) 589 new->threshold *= 3; 590 else if (dumpbuf.iosize >= (256 * DUMP_1KB)) 591 new->threshold *= 2; 592 593 /* figure compression level based upon the computed threshold. 
*/ 594 if (dump_plat_mincpu == 0 || new->nhelper < 2) { 595 new->clevel = 0; 596 new->nhelper = 1; 597 } else if ((new->nhelper + 1) >= new->threshold) { 598 new->clevel = DUMP_CLEVEL_BZIP2; 599 } else { 600 new->clevel = DUMP_CLEVEL_LZJB; 601 } 602 603 if (new->clevel == 0) { 604 new->ncbuf = 1; 605 new->ncmap = 1; 606 } else { 607 new->ncbuf = NCBUF_PER_HELPER * new->nhelper; 608 new->ncmap = NCMAP_PER_HELPER * new->nhelper; 609 } 610 611 /* 612 * Allocate new data structures and buffers for MINHELPERS, 613 * and also figure the max desired size. 614 */ 615 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 616 new->maxsize = 0; 617 new->maxvmsize = 0; 618 new->maxvm = NULL; 619 tag = 1; 620 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP); 621 hpend = &new->helper[new->nhelper]; 622 for (hp = new->helper; hp != hpend; hp++) { 623 hp->tag = tag++; 624 if (hp < &new->helper[MINHELPERS]) { 625 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP); 626 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP); 627 } else if (new->clevel < DUMP_CLEVEL_BZIP2) { 628 new->maxsize += 2 * PAGESIZE; 629 } else { 630 new->maxsize += PAGESIZE; 631 } 632 if (new->clevel >= DUMP_CLEVEL_BZIP2) 633 new->maxsize += bz2size; 634 } 635 636 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP); 637 cpend = &new->cbuf[new->ncbuf]; 638 for (cp = new->cbuf; cp != cpend; cp++) { 639 cp->state = CBUF_FREEBUF; 640 cp->size = CBUF_SIZE; 641 if (cp < &new->cbuf[MINCBUFS]) 642 cp->buf = kmem_alloc(cp->size, KM_SLEEP); 643 else 644 new->maxsize += cp->size; 645 } 646 647 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP); 648 cpend = &new->cmap[new->ncmap]; 649 for (cp = new->cmap; cp != cpend; cp++) { 650 cp->state = CBUF_FREEMAP; 651 cp->size = CBUF_MAPSIZE; 652 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE, 653 0, 0, NULL, NULL, VM_SLEEP); 654 } 655 656 /* reserve VA to be backed with spare pages at crash time */ 657 if (new->maxsize > 0) { 658 new->maxsize 
= P2ROUNDUP(new->maxsize, PAGESIZE); 659 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE); 660 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize, 661 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP); 662 } 663 664 /* 665 * Reserve memory for kmem allocation calls made during crash 666 * dump. The hat layer allocates memory for each mapping 667 * created, and the I/O path allocates buffers and data structs. 668 * Add a few pages for safety. 669 */ 670 kmem_dump_init((new->ncmap * dump_kmem_permap) + 671 (dump_kmem_pages * PAGESIZE)); 672 673 /* set new config pointers */ 674 *old = *new; 675 } 676 677 /* 678 * Define a struct memlist walker to optimize bitnum to pfn 679 * lookup. The walker maintains the state of the list traversal. 680 */ 681 typedef struct dumpmlw { 682 struct memlist *mp; /* current memlist */ 683 pgcnt_t basenum; /* bitnum base offset */ 684 pgcnt_t mppages; /* current memlist size */ 685 pgcnt_t mpleft; /* size to end of current memlist */ 686 pfn_t mpaddr; /* first pfn in memlist */ 687 } dumpmlw_t; 688 689 /* initialize the walker */ 690 static inline void 691 dump_init_memlist_walker(dumpmlw_t *pw) 692 { 693 pw->mp = phys_install; 694 pw->basenum = 0; 695 pw->mppages = pw->mp->size >> PAGESHIFT; 696 pw->mpleft = pw->mppages; 697 pw->mpaddr = pw->mp->address >> PAGESHIFT; 698 } 699 700 /* 701 * Lookup pfn given bitnum. The memlist can be quite long on some 702 * systems (e.g.: one per board). To optimize sequential lookups, the 703 * caller initializes and presents a memlist walker. 
704 */ 705 static pfn_t 706 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw) 707 { 708 bitnum -= pw->basenum; 709 while (pw->mp != NULL) { 710 if (bitnum < pw->mppages) { 711 pw->mpleft = pw->mppages - bitnum; 712 return (pw->mpaddr + bitnum); 713 } 714 bitnum -= pw->mppages; 715 pw->basenum += pw->mppages; 716 pw->mp = pw->mp->next; 717 if (pw->mp != NULL) { 718 pw->mppages = pw->mp->size >> PAGESHIFT; 719 pw->mpleft = pw->mppages; 720 pw->mpaddr = pw->mp->address >> PAGESHIFT; 721 } 722 } 723 return (PFN_INVALID); 724 } 725 726 static pgcnt_t 727 dump_pfn_to_bitnum(pfn_t pfn) 728 { 729 struct memlist *mp; 730 pgcnt_t bitnum = 0; 731 732 for (mp = phys_install; mp != NULL; mp = mp->next) { 733 if (pfn >= (mp->address >> PAGESHIFT) && 734 pfn < ((mp->address + mp->size) >> PAGESHIFT)) 735 return (bitnum + pfn - (mp->address >> PAGESHIFT)); 736 bitnum += mp->size >> PAGESHIFT; 737 } 738 return ((pgcnt_t)-1); 739 } 740 741 /* 742 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The 743 * mapping of pfn to range index is imperfect because pfn and bitnum 744 * do not have the same phase. To make sure a CBUF_MAPSIZE range is 745 * covered, call this for both ends: 746 * dump_set_used(base) 747 * dump_set_used(base+CBUF_MAPNP-1) 748 * 749 * This is used during a panic dump to mark pages allocated by 750 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by 751 * page_get_mnode_freelist() to make sure pages used by dump are never 752 * allocated. 
753 */ 754 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT)) 755 756 static void 757 dump_set_used(pfn_t pfn) 758 { 759 760 pgcnt_t bitnum, rbitnum; 761 762 bitnum = dump_pfn_to_bitnum(pfn); 763 ASSERT(bitnum != (pgcnt_t)-1); 764 765 rbitnum = CBUF_MAPP2R(bitnum); 766 ASSERT(rbitnum < dumpcfg.rbitmapsize); 767 768 BT_SET(dumpcfg.rbitmap, rbitnum); 769 } 770 771 int 772 dump_test_used(pfn_t pfn) 773 { 774 pgcnt_t bitnum, rbitnum; 775 776 bitnum = dump_pfn_to_bitnum(pfn); 777 ASSERT(bitnum != (pgcnt_t)-1); 778 779 rbitnum = CBUF_MAPP2R(bitnum); 780 ASSERT(rbitnum < dumpcfg.rbitmapsize); 781 782 return (BT_TEST(dumpcfg.rbitmap, rbitnum)); 783 } 784 785 /* 786 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library. 787 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit(). 788 */ 789 static void * 790 dumpbzalloc(void *opaque, int items, int size) 791 { 792 size_t *sz; 793 char *ret; 794 795 ASSERT(opaque != NULL); 796 sz = opaque; 797 ret = dumpcfg.maxvm + *sz; 798 *sz += items * size; 799 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN); 800 ASSERT(*sz <= dumpcfg.maxvmsize); 801 return (ret); 802 } 803 804 /*ARGSUSED*/ 805 static void 806 dumpbzfree(void *opaque, void *addr) 807 { 808 } 809 810 /* 811 * Perform additional checks on the page to see if we can really use 812 * it. The kernel (kas) pages are always set in the bitmap. However, 813 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the 814 * bitmap. So we check for them. 
815 */ 816 static inline int 817 dump_pfn_check(pfn_t pfn) 818 { 819 page_t *pp = page_numtopp_nolock(pfn); 820 #if defined(__sparc) 821 extern struct vnode prom_ppages; 822 #endif 823 824 if (pp == NULL || pp->p_pagenum != pfn || 825 #if defined(__sparc) 826 pp->p_vnode == &prom_ppages || 827 #else 828 PP_ISBOOTPAGES(pp) || 829 #endif 830 pp->p_toxic != 0) 831 return (0); 832 return (1); 833 } 834 835 /* 836 * Check a range to see if all contained pages are available and 837 * return non-zero if the range can be used. 838 */ 839 static inline int 840 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn) 841 { 842 for (; start < end; start++, pfn++) { 843 if (BT_TEST(dumpcfg.bitmap, start)) 844 return (0); 845 if (!dump_pfn_check(pfn)) 846 return (0); 847 } 848 return (1); 849 } 850 851 /* 852 * dumpsys_get_maxmem() is called during panic. Find unused ranges 853 * and use them for buffers. If we find enough memory switch to 854 * parallel bzip2, otherwise use parallel lzjb. 855 * 856 * It searches the dump bitmap in 2 passes. The first time it looks 857 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages. 
858 */ 859 static void 860 dumpsys_get_maxmem() 861 { 862 dumpcfg_t *cfg = &dumpcfg; 863 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf]; 864 helper_t *endhp = &cfg->helper[cfg->nhelper]; 865 pgcnt_t bitnum, end; 866 size_t sz, endsz, bz2size; 867 pfn_t pfn, off; 868 cbuf_t *cp; 869 helper_t *hp, *ohp; 870 dumpmlw_t mlw; 871 int k; 872 873 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB || 874 (dump_conflags & DUMP_ALL) != 0) 875 return; 876 877 sz = 0; 878 cfg->found4m = 0; 879 cfg->foundsm = 0; 880 881 /* bitmap of ranges used to estimate which pfns are being used */ 882 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize)); 883 884 /* find ranges that are not being dumped to use for buffers */ 885 dump_init_memlist_walker(&mlw); 886 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 887 dump_timeleft = dump_timeout; 888 end = bitnum + CBUF_MAPNP; 889 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 890 ASSERT(pfn != PFN_INVALID); 891 892 /* skip partial range at end of mem segment */ 893 if (mlw.mpleft < CBUF_MAPNP) { 894 end = bitnum + mlw.mpleft; 895 continue; 896 } 897 898 /* skip non aligned pages */ 899 off = P2PHASE(pfn, CBUF_MAPNP); 900 if (off != 0) { 901 end -= off; 902 continue; 903 } 904 905 if (!dump_range_check(bitnum, end, pfn)) 906 continue; 907 908 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize); 909 hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn, 910 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 911 sz += CBUF_MAPSIZE; 912 cfg->found4m++; 913 914 /* set the bitmap for both ends to be sure to cover the range */ 915 dump_set_used(pfn); 916 dump_set_used(pfn + CBUF_MAPNP - 1); 917 918 if (sz >= cfg->maxsize) 919 goto foundmax; 920 } 921 922 /* Add small pages if we can't find enough large pages. 
*/ 923 dump_init_memlist_walker(&mlw); 924 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 925 dump_timeleft = dump_timeout; 926 end = bitnum + CBUF_MAPNP; 927 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 928 ASSERT(pfn != PFN_INVALID); 929 930 /* Find any non-aligned pages at start and end of segment. */ 931 off = P2PHASE(pfn, CBUF_MAPNP); 932 if (mlw.mpleft < CBUF_MAPNP) { 933 end = bitnum + mlw.mpleft; 934 } else if (off != 0) { 935 end -= off; 936 } else if (cfg->found4m && dump_test_used(pfn)) { 937 continue; 938 } 939 940 for (; bitnum < end; bitnum++, pfn++) { 941 dump_timeleft = dump_timeout; 942 if (BT_TEST(dumpcfg.bitmap, bitnum)) 943 continue; 944 if (!dump_pfn_check(pfn)) 945 continue; 946 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize); 947 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn, 948 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 949 sz += PAGESIZE; 950 cfg->foundsm++; 951 dump_set_used(pfn); 952 if (sz >= cfg->maxsize) 953 goto foundmax; 954 } 955 } 956 957 /* Fall back to lzjb if we did not get enough memory for bzip2. */ 958 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper; 959 if (sz < endsz) { 960 cfg->clevel = DUMP_CLEVEL_LZJB; 961 } 962 963 /* Allocate memory for as many helpers as we can. */ 964 foundmax: 965 966 /* Byte offsets into memory found and mapped above */ 967 endsz = sz; 968 sz = 0; 969 970 /* Set the size for bzip2 state. Only bzip2 needs it. */ 971 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 972 973 /* Skip the preallocate output buffers. */ 974 cp = &cfg->cbuf[MINCBUFS]; 975 976 /* Use this to move memory up from the preallocated helpers. */ 977 ohp = cfg->helper; 978 979 /* Loop over all helpers and allocate memory. */ 980 for (hp = cfg->helper; hp < endhp; hp++) { 981 982 /* Skip preallocated helpers by checking hp->page. 
*/ 983 if (hp->page == NULL) { 984 if (cfg->clevel <= DUMP_CLEVEL_LZJB) { 985 /* lzjb needs 2 1-page buffers */ 986 if ((sz + (2 * PAGESIZE)) > endsz) 987 break; 988 hp->page = cfg->maxvm + sz; 989 sz += PAGESIZE; 990 hp->lzbuf = cfg->maxvm + sz; 991 sz += PAGESIZE; 992 993 } else if (ohp->lzbuf != NULL) { 994 /* re-use the preallocted lzjb page for bzip2 */ 995 hp->page = ohp->lzbuf; 996 ohp->lzbuf = NULL; 997 ++ohp; 998 999 } else { 1000 /* bzip2 needs a 1-page buffer */ 1001 if ((sz + PAGESIZE) > endsz) 1002 break; 1003 hp->page = cfg->maxvm + sz; 1004 sz += PAGESIZE; 1005 } 1006 } 1007 1008 /* 1009 * Add output buffers per helper. The number of 1010 * buffers per helper is determined by the ratio of 1011 * ncbuf to nhelper. 1012 */ 1013 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz && 1014 k < NCBUF_PER_HELPER; k++) { 1015 cp->state = CBUF_FREEBUF; 1016 cp->size = CBUF_SIZE; 1017 cp->buf = cfg->maxvm + sz; 1018 sz += CBUF_SIZE; 1019 ++cp; 1020 } 1021 1022 /* 1023 * bzip2 needs compression state. Use the dumpbzalloc 1024 * and dumpbzfree callbacks to allocate the memory. 1025 * bzip2 does allocation only at init time. 1026 */ 1027 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) { 1028 if ((sz + bz2size) > endsz) { 1029 hp->page = NULL; 1030 break; 1031 } else { 1032 hp->bzstream.opaque = &sz; 1033 hp->bzstream.bzalloc = dumpbzalloc; 1034 hp->bzstream.bzfree = dumpbzfree; 1035 (void) BZ2_bzCompressInit(&hp->bzstream, 1036 dump_bzip2_level, 0, 0); 1037 hp->bzstream.opaque = NULL; 1038 } 1039 } 1040 } 1041 1042 /* Finish allocating output buffers */ 1043 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) { 1044 cp->state = CBUF_FREEBUF; 1045 cp->size = CBUF_SIZE; 1046 cp->buf = cfg->maxvm + sz; 1047 sz += CBUF_SIZE; 1048 } 1049 1050 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. 
*/ 1051 if (cfg->found4m || cfg->foundsm) 1052 dump_check_used = 1; 1053 1054 ASSERT(sz <= endsz); 1055 } 1056 1057 static void 1058 dumphdr_init(void) 1059 { 1060 pgcnt_t npages = 0; 1061 1062 ASSERT(MUTEX_HELD(&dump_lock)); 1063 1064 if (dumphdr == NULL) { 1065 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP); 1066 dumphdr->dump_magic = DUMP_MAGIC; 1067 dumphdr->dump_version = DUMP_VERSION; 1068 dumphdr->dump_wordsize = DUMP_WORDSIZE; 1069 dumphdr->dump_pageshift = PAGESHIFT; 1070 dumphdr->dump_pagesize = PAGESIZE; 1071 dumphdr->dump_utsname = utsname; 1072 (void) strcpy(dumphdr->dump_platform, platform); 1073 dumpbuf.size = dumpbuf_iosize(maxphys); 1074 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); 1075 dumpbuf.end = dumpbuf.start + dumpbuf.size; 1076 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); 1077 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); 1078 LOCK_INIT_HELD(&dumpcfg.helper_lock); 1079 } 1080 1081 npages = num_phys_pages(); 1082 1083 if (dumpcfg.bitmapsize != npages) { 1084 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP)); 1085 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP); 1086 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP); 1087 1088 if (dumpcfg.bitmap != NULL) 1089 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg. 1090 bitmapsize)); 1091 if (dumpcfg.rbitmap != NULL) 1092 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg. 1093 rbitmapsize)); 1094 dumpcfg.bitmap = map; 1095 dumpcfg.bitmapsize = npages; 1096 dumpcfg.rbitmap = rmap; 1097 dumpcfg.rbitmapsize = rlen; 1098 } 1099 } 1100 1101 /* 1102 * Establish a new dump device. 
1103 */ 1104 int 1105 dumpinit(vnode_t *vp, char *name, int justchecking) 1106 { 1107 vnode_t *cvp; 1108 vattr_t vattr; 1109 vnode_t *cdev_vp; 1110 int error = 0; 1111 1112 ASSERT(MUTEX_HELD(&dump_lock)); 1113 1114 dumphdr_init(); 1115 1116 cvp = common_specvp(vp); 1117 if (cvp == dumpvp) 1118 return (0); 1119 1120 /* 1121 * Determine whether this is a plausible dump device. We want either: 1122 * (1) a real device that's not mounted and has a cb_dump routine, or 1123 * (2) a swapfile on some filesystem that has a vop_dump routine. 1124 */ 1125 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0) 1126 return (error); 1127 1128 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV; 1129 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) { 1130 if (vattr.va_type == VBLK || vattr.va_type == VCHR) { 1131 if (devopsp[getmajor(vattr.va_rdev)]-> 1132 devo_cb_ops->cb_dump == nodev) 1133 error = ENOTSUP; 1134 else if (vfs_devismounted(vattr.va_rdev)) 1135 error = EBUSY; 1136 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip), 1137 ZFS_DRIVER) == 0 && 1138 IS_SWAPVP(common_specvp(cvp))) 1139 error = EBUSY; 1140 } else { 1141 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) || 1142 !IS_SWAPVP(cvp)) 1143 error = ENOTSUP; 1144 } 1145 } 1146 1147 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) 1148 error = ENOSPC; 1149 1150 if (error || justchecking) { 1151 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0, 1152 kcred, NULL); 1153 return (error); 1154 } 1155 1156 VN_HOLD(cvp); 1157 1158 if (dumpvp != NULL) 1159 dumpfini(); /* unconfigure the old dump device */ 1160 1161 dumpvp = cvp; 1162 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 1163 dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP); 1164 (void) strcpy(dumppath, name); 1165 dumpbuf.iosize = 0; 1166 1167 /* 1168 * If the dump device is a block device, attempt to open up the 1169 * corresponding character device and determine its maximum transfer 1170 * size. 
We use this information to potentially resize dumpbuf to a 1171 * larger and more optimal size for performing i/o to the dump device. 1172 */ 1173 if (cvp->v_type == VBLK && 1174 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) { 1175 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1176 size_t blk_size; 1177 struct dk_cinfo dki; 1178 struct dk_minfo minf; 1179 1180 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO, 1181 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL) 1182 == 0 && minf.dki_lbsize != 0) 1183 blk_size = minf.dki_lbsize; 1184 else 1185 blk_size = DEV_BSIZE; 1186 1187 if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki, 1188 FKIOCTL, kcred, NULL, NULL) == 0) { 1189 dumpbuf.iosize = dki.dki_maxtransfer * blk_size; 1190 dumpbuf_resize(); 1191 } 1192 /* 1193 * If we are working with a zvol then dumpify it 1194 * if it's not being used as swap. 1195 */ 1196 if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) { 1197 if (IS_SWAPVP(common_specvp(cvp))) 1198 error = EBUSY; 1199 else if ((error = VOP_IOCTL(cdev_vp, 1200 DKIOCDUMPINIT, NULL, FKIOCTL, kcred, 1201 NULL, NULL)) != 0) 1202 dumpfini(); 1203 } 1204 1205 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1206 kcred, NULL); 1207 } 1208 1209 VN_RELE(cdev_vp); 1210 } 1211 1212 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20); 1213 1214 dump_update_clevel(); 1215 1216 return (error); 1217 } 1218 1219 void 1220 dumpfini(void) 1221 { 1222 vattr_t vattr; 1223 boolean_t is_zfs = B_FALSE; 1224 vnode_t *cdev_vp; 1225 ASSERT(MUTEX_HELD(&dump_lock)); 1226 1227 kmem_free(dumppath, strlen(dumppath) + 1); 1228 1229 /* 1230 * Determine if we are using zvols for our dump device 1231 */ 1232 vattr.va_mask = AT_RDEV; 1233 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) { 1234 is_zfs = (getmajor(vattr.va_rdev) == 1235 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE; 1236 } 1237 1238 /* 1239 * If we have a zvol dump device then we call into zfs so 1240 * that it may have a chance to cleanup. 
1241 */ 1242 if (is_zfs && 1243 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) { 1244 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1245 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL, 1246 kcred, NULL, NULL); 1247 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1248 kcred, NULL); 1249 } 1250 VN_RELE(cdev_vp); 1251 } 1252 1253 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL); 1254 1255 VN_RELE(dumpvp); 1256 1257 dumpvp = NULL; 1258 dumpvp_size = 0; 1259 dumppath = NULL; 1260 } 1261 1262 static offset_t 1263 dumpvp_flush(void) 1264 { 1265 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE); 1266 hrtime_t iotime; 1267 int err; 1268 1269 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) { 1270 dump_ioerr = ENOSPC; 1271 dumpbuf.vp_off = dumpbuf.vp_limit; 1272 } else if (size != 0) { 1273 iotime = gethrtime(); 1274 dumpsync.iowait += iotime - dumpsync.iowaitts; 1275 if (panicstr) 1276 err = VOP_DUMP(dumpvp, dumpbuf.start, 1277 lbtodb(dumpbuf.vp_off), btod(size), NULL); 1278 else 1279 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ? 
1280 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size, 1281 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit, 1282 kcred, 0); 1283 if (err && dump_ioerr == 0) 1284 dump_ioerr = err; 1285 dumpsync.iowaitts = gethrtime(); 1286 dumpsync.iotime += dumpsync.iowaitts - iotime; 1287 dumpsync.nwrite += size; 1288 dumpbuf.vp_off += size; 1289 } 1290 dumpbuf.cur = dumpbuf.start; 1291 dump_timeleft = dump_timeout; 1292 return (dumpbuf.vp_off); 1293 } 1294 1295 /* maximize write speed by keeping seek offset aligned with size */ 1296 void 1297 dumpvp_write(const void *va, size_t size) 1298 { 1299 size_t len, off, sz; 1300 1301 while (size != 0) { 1302 len = MIN(size, dumpbuf.end - dumpbuf.cur); 1303 if (len == 0) { 1304 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size); 1305 if (off == 0 || !ISP2(dumpbuf.size)) { 1306 (void) dumpvp_flush(); 1307 } else { 1308 sz = dumpbuf.size - off; 1309 dumpbuf.cur = dumpbuf.start + sz; 1310 (void) dumpvp_flush(); 1311 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off); 1312 dumpbuf.cur += off; 1313 } 1314 } else { 1315 bcopy(va, dumpbuf.cur, len); 1316 va = (char *)va + len; 1317 dumpbuf.cur += len; 1318 size -= len; 1319 } 1320 } 1321 } 1322 1323 /*ARGSUSED*/ 1324 static void 1325 dumpvp_ksyms_write(const void *src, void *dst, size_t size) 1326 { 1327 dumpvp_write(src, size); 1328 } 1329 1330 /* 1331 * Mark 'pfn' in the bitmap and dump its translation table entry. 
1332 */ 1333 void 1334 dump_addpage(struct as *as, void *va, pfn_t pfn) 1335 { 1336 mem_vtop_t mem_vtop; 1337 pgcnt_t bitnum; 1338 1339 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1340 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1341 dumphdr->dump_npages++; 1342 BT_SET(dumpcfg.bitmap, bitnum); 1343 } 1344 dumphdr->dump_nvtop++; 1345 mem_vtop.m_as = as; 1346 mem_vtop.m_va = va; 1347 mem_vtop.m_pfn = pfn; 1348 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 1349 } 1350 dump_timeleft = dump_timeout; 1351 } 1352 1353 /* 1354 * Mark 'pfn' in the bitmap 1355 */ 1356 void 1357 dump_page(pfn_t pfn) 1358 { 1359 pgcnt_t bitnum; 1360 1361 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1362 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1363 dumphdr->dump_npages++; 1364 BT_SET(dumpcfg.bitmap, bitnum); 1365 } 1366 } 1367 dump_timeleft = dump_timeout; 1368 } 1369 1370 /* 1371 * Dump the <as, va, pfn> information for a given address space. 1372 * SEGOP_DUMP() will call dump_addpage() for each page in the segment. 
1373 */ 1374 static void 1375 dump_as(struct as *as) 1376 { 1377 struct seg *seg; 1378 1379 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1380 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 1381 if (seg->s_as != as) 1382 break; 1383 if (seg->s_ops == NULL) 1384 continue; 1385 SEGOP_DUMP(seg); 1386 } 1387 AS_LOCK_EXIT(as, &as->a_lock); 1388 1389 if (seg != NULL) 1390 cmn_err(CE_WARN, "invalid segment %p in address space %p", 1391 (void *)seg, (void *)as); 1392 } 1393 1394 static int 1395 dump_process(pid_t pid) 1396 { 1397 proc_t *p = sprlock(pid); 1398 1399 if (p == NULL) 1400 return (-1); 1401 if (p->p_as != &kas) { 1402 mutex_exit(&p->p_lock); 1403 dump_as(p->p_as); 1404 mutex_enter(&p->p_lock); 1405 } 1406 1407 sprunlock(p); 1408 1409 return (0); 1410 } 1411 1412 void 1413 dump_ereports(void) 1414 { 1415 u_offset_t dumpvp_start; 1416 erpt_dump_t ed; 1417 1418 if (dumpvp == NULL || dumphdr == NULL) 1419 return; 1420 1421 dumpbuf.cur = dumpbuf.start; 1422 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE); 1423 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE; 1424 dumpbuf.vp_off = dumpvp_start; 1425 1426 fm_ereport_dump(); 1427 if (panicstr) 1428 errorq_dump(); 1429 1430 bzero(&ed, sizeof (ed)); /* indicate end of ereports */ 1431 dumpvp_write(&ed, sizeof (ed)); 1432 (void) dumpvp_flush(); 1433 1434 if (!panicstr) { 1435 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1436 (size_t)(dumpbuf.vp_off - dumpvp_start), 1437 B_INVAL | B_FORCE, kcred, NULL); 1438 } 1439 } 1440 1441 void 1442 dump_messages(void) 1443 { 1444 log_dump_t ld; 1445 mblk_t *mctl, *mdata; 1446 queue_t *q, *qlast; 1447 u_offset_t dumpvp_start; 1448 1449 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL) 1450 return; 1451 1452 dumpbuf.cur = dumpbuf.start; 1453 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET; 1454 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE; 1455 dumpbuf.vp_off = dumpvp_start; 1456 1457 qlast = NULL; 1458 do { 1459 for (q = log_consq; q->q_next != qlast; q = 
q->q_next) 1460 continue; 1461 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) { 1462 dump_timeleft = dump_timeout; 1463 mdata = mctl->b_cont; 1464 ld.ld_magic = LOG_MAGIC; 1465 ld.ld_msgsize = MBLKL(mctl->b_cont); 1466 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl)); 1467 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata)); 1468 dumpvp_write(&ld, sizeof (ld)); 1469 dumpvp_write(mctl->b_rptr, MBLKL(mctl)); 1470 dumpvp_write(mdata->b_rptr, MBLKL(mdata)); 1471 } 1472 } while ((qlast = q) != log_consq); 1473 1474 ld.ld_magic = 0; /* indicate end of messages */ 1475 dumpvp_write(&ld, sizeof (ld)); 1476 (void) dumpvp_flush(); 1477 if (!panicstr) { 1478 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1479 (size_t)(dumpbuf.vp_off - dumpvp_start), 1480 B_INVAL | B_FORCE, kcred, NULL); 1481 } 1482 } 1483 1484 /* 1485 * The following functions are called on multiple CPUs during dump. 1486 * They must not use most kernel services, because all cross-calls are 1487 * disabled during panic. Therefore, blocking locks and cache flushes 1488 * will not work. 1489 */ 1490 1491 /* 1492 * Copy pages, trapping ECC errors. Also, for robustness, trap data 1493 * access in case something goes wrong in the hat layer and the 1494 * mapping is broken. 
1495 */ 1496 static int 1497 dump_pagecopy(void *src, void *dst) 1498 { 1499 long *wsrc = (long *)src; 1500 long *wdst = (long *)dst; 1501 const ulong_t ncopies = PAGESIZE / sizeof (long); 1502 volatile int w = 0; 1503 volatile int ueoff = -1; 1504 on_trap_data_t otd; 1505 1506 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) { 1507 if (ueoff == -1) 1508 ueoff = w * sizeof (long); 1509 /* report "bad ECC" or "bad address" */ 1510 #ifdef _LP64 1511 if (otd.ot_trap & OT_DATA_EC) 1512 wdst[w++] = 0x00badecc00badecc; 1513 else 1514 wdst[w++] = 0x00badadd00badadd; 1515 #else 1516 if (otd.ot_trap & OT_DATA_EC) 1517 wdst[w++] = 0x00badecc; 1518 else 1519 wdst[w++] = 0x00badadd; 1520 #endif 1521 } 1522 while (w < ncopies) { 1523 wdst[w] = wsrc[w]; 1524 w++; 1525 } 1526 no_trap(); 1527 return (ueoff); 1528 } 1529 1530 static void 1531 dumpsys_close_cq(cqueue_t *cq, int live) 1532 { 1533 if (live) { 1534 mutex_enter(&cq->mutex); 1535 atomic_dec_uint(&cq->open); 1536 cv_signal(&cq->cv); 1537 mutex_exit(&cq->mutex); 1538 } else { 1539 atomic_dec_uint(&cq->open); 1540 } 1541 } 1542 1543 static inline void 1544 dumpsys_spinlock(lock_t *lp) 1545 { 1546 uint_t backoff = 0; 1547 int loop_count = 0; 1548 1549 while (LOCK_HELD(lp) || !lock_spin_try(lp)) { 1550 if (++loop_count >= ncpus) { 1551 backoff = mutex_lock_backoff(0); 1552 loop_count = 0; 1553 } else { 1554 backoff = mutex_lock_backoff(backoff); 1555 } 1556 mutex_lock_delay(backoff); 1557 } 1558 } 1559 1560 static inline void 1561 dumpsys_spinunlock(lock_t *lp) 1562 { 1563 lock_clear(lp); 1564 } 1565 1566 static inline void 1567 dumpsys_lock(cqueue_t *cq, int live) 1568 { 1569 if (live) 1570 mutex_enter(&cq->mutex); 1571 else 1572 dumpsys_spinlock(&cq->spinlock); 1573 } 1574 1575 static inline void 1576 dumpsys_unlock(cqueue_t *cq, int live, int signal) 1577 { 1578 if (live) { 1579 if (signal) 1580 cv_signal(&cq->cv); 1581 mutex_exit(&cq->mutex); 1582 } else { 1583 dumpsys_spinunlock(&cq->spinlock); 1584 } 1585 } 1586 1587 
static void 1588 dumpsys_wait_cq(cqueue_t *cq, int live) 1589 { 1590 if (live) { 1591 cv_wait(&cq->cv, &cq->mutex); 1592 } else { 1593 dumpsys_spinunlock(&cq->spinlock); 1594 while (cq->open) 1595 if (cq->first) 1596 break; 1597 dumpsys_spinlock(&cq->spinlock); 1598 } 1599 } 1600 1601 static void 1602 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live) 1603 { 1604 if (cp == NULL) 1605 return; 1606 1607 dumpsys_lock(cq, live); 1608 1609 if (cq->ts != 0) { 1610 cq->empty += gethrtime() - cq->ts; 1611 cq->ts = 0; 1612 } 1613 1614 cp->state = newstate; 1615 cp->next = NULL; 1616 if (cq->last == NULL) 1617 cq->first = cp; 1618 else 1619 cq->last->next = cp; 1620 cq->last = cp; 1621 1622 dumpsys_unlock(cq, live, 1); 1623 } 1624 1625 static cbuf_t * 1626 dumpsys_get_cq(cqueue_t *cq, int live) 1627 { 1628 cbuf_t *cp; 1629 hrtime_t now = gethrtime(); 1630 1631 dumpsys_lock(cq, live); 1632 1633 /* CONSTCOND */ 1634 while (1) { 1635 cp = (cbuf_t *)cq->first; 1636 if (cp == NULL) { 1637 if (cq->open == 0) 1638 break; 1639 dumpsys_wait_cq(cq, live); 1640 continue; 1641 } 1642 cq->first = cp->next; 1643 if (cq->first == NULL) { 1644 cq->last = NULL; 1645 cq->ts = now; 1646 } 1647 break; 1648 } 1649 1650 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0); 1651 return (cp); 1652 } 1653 1654 /* 1655 * Send an error message to the console. If the main task is running 1656 * just write the message via uprintf. If a helper is running the 1657 * message has to be put on a queue for the main task. Setting fmt to 1658 * NULL means flush the error message buffer. If fmt is not NULL, just 1659 * add the text to the existing buffer. 1660 */ 1661 static void 1662 dumpsys_errmsg(helper_t *hp, const char *fmt, ...) 
1663 { 1664 dumpsync_t *ds = hp->ds; 1665 cbuf_t *cp = hp->cperr; 1666 va_list adx; 1667 1668 if (hp->helper == MAINHELPER) { 1669 if (fmt != NULL) { 1670 if (ds->neednl) { 1671 uprintf("\n"); 1672 ds->neednl = 0; 1673 } 1674 va_start(adx, fmt); 1675 vuprintf(fmt, adx); 1676 va_end(adx); 1677 } 1678 } else if (fmt == NULL) { 1679 if (cp != NULL) { 1680 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1681 hp->cperr = NULL; 1682 } 1683 } else { 1684 if (hp->cperr == NULL) { 1685 cp = CQ_GET(freebufq); 1686 hp->cperr = cp; 1687 cp->used = 0; 1688 } 1689 va_start(adx, fmt); 1690 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used, 1691 fmt, adx); 1692 va_end(adx); 1693 if ((cp->used + LOG_MSGSIZE) > cp->size) { 1694 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1695 hp->cperr = NULL; 1696 } 1697 } 1698 } 1699 1700 /* 1701 * Write an output buffer to the dump file. If the main task is 1702 * running just write the data. If a helper is running the output is 1703 * placed on a queue for the main task. 1704 */ 1705 static void 1706 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used) 1707 { 1708 dumpsync_t *ds = hp->ds; 1709 1710 if (hp->helper == MAINHELPER) { 1711 HRSTART(ds->perpage, write); 1712 dumpvp_write(cp->buf, used); 1713 HRSTOP(ds->perpage, write); 1714 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 1715 } else { 1716 cp->used = used; 1717 CQ_PUT(mainq, cp, CBUF_WRITE); 1718 } 1719 } 1720 1721 /* 1722 * Copy one page within the mapped range. The offset starts at 0 and 1723 * is relative to the first pfn. cp->buf + cp->off is the address of 1724 * the first pfn. If dump_pagecopy returns a UE offset, create an 1725 * error message. Returns the offset to the next pfn in the range 1726 * selected by the bitmap. 
1727 */ 1728 static int 1729 dumpsys_copy_page(helper_t *hp, int offset) 1730 { 1731 cbuf_t *cp = hp->cpin; 1732 int ueoff; 1733 1734 ASSERT(cp->off + offset + PAGESIZE <= cp->size); 1735 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum)); 1736 1737 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page); 1738 1739 /* ueoff is the offset in the page to a UE error */ 1740 if (ueoff != -1) { 1741 uint64_t pa = ptob(cp->pfn) + offset + ueoff; 1742 1743 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n", 1744 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa); 1745 } 1746 1747 /* 1748 * Advance bitnum and offset to the next input page for the 1749 * next call to this function. 1750 */ 1751 offset += PAGESIZE; 1752 cp->bitnum++; 1753 while (cp->off + offset < cp->size) { 1754 if (BT_TEST(dumpcfg.bitmap, cp->bitnum)) 1755 break; 1756 offset += PAGESIZE; 1757 cp->bitnum++; 1758 } 1759 1760 return (offset); 1761 } 1762 1763 /* 1764 * Read the helper queue, and copy one mapped page. Return 0 when 1765 * done. Return 1 when a page has been copied into hp->page. 1766 */ 1767 static int 1768 dumpsys_sread(helper_t *hp) 1769 { 1770 dumpsync_t *ds = hp->ds; 1771 1772 /* CONSTCOND */ 1773 while (1) { 1774 1775 /* Find the next input buffer. */ 1776 if (hp->cpin == NULL) { 1777 HRSTART(hp->perpage, inwait); 1778 1779 /* CONSTCOND */ 1780 while (1) { 1781 hp->cpin = CQ_GET(helperq); 1782 dump_timeleft = dump_timeout; 1783 1784 /* 1785 * NULL return means the helper queue 1786 * is closed and empty. 1787 */ 1788 if (hp->cpin == NULL) 1789 break; 1790 1791 /* Have input, check for dump I/O error. */ 1792 if (!dump_ioerr) 1793 break; 1794 1795 /* 1796 * If an I/O error occurs, stay in the 1797 * loop in order to empty the helper 1798 * queue. Return the buffers to the 1799 * main task to unmap and free it. 1800 */ 1801 hp->cpin->used = 0; 1802 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1803 } 1804 HRSTOP(hp->perpage, inwait); 1805 1806 /* Stop here when the helper queue is closed. 
*/ 1807 if (hp->cpin == NULL) 1808 break; 1809 1810 /* Set the offset=0 to get the first pfn. */ 1811 hp->in = 0; 1812 1813 /* Set the total processed to 0 */ 1814 hp->used = 0; 1815 } 1816 1817 /* Process the next page. */ 1818 if (hp->used < hp->cpin->used) { 1819 1820 /* 1821 * Get the next page from the input buffer and 1822 * return a copy. 1823 */ 1824 ASSERT(hp->in != -1); 1825 HRSTART(hp->perpage, copy); 1826 hp->in = dumpsys_copy_page(hp, hp->in); 1827 hp->used += PAGESIZE; 1828 HRSTOP(hp->perpage, copy); 1829 break; 1830 1831 } else { 1832 1833 /* 1834 * Done with the input. Flush the VM and 1835 * return the buffer to the main task. 1836 */ 1837 if (panicstr && hp->helper != MAINHELPER) 1838 hat_flush_range(kas.a_hat, 1839 hp->cpin->buf, hp->cpin->size); 1840 dumpsys_errmsg(hp, NULL); 1841 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1842 hp->cpin = NULL; 1843 } 1844 } 1845 1846 return (hp->cpin != NULL); 1847 } 1848 1849 /* 1850 * Compress size bytes starting at buf with bzip2 1851 * mode: 1852 * BZ_RUN add one more compressed page 1853 * BZ_FINISH no more input, flush the state 1854 */ 1855 static void 1856 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode) 1857 { 1858 dumpsync_t *ds = hp->ds; 1859 const int CSIZE = sizeof (dumpcsize_t); 1860 bz_stream *ps = &hp->bzstream; 1861 int rc = 0; 1862 uint32_t csize; 1863 dumpcsize_t cs; 1864 1865 /* Set input pointers to new input page */ 1866 if (size > 0) { 1867 ps->avail_in = size; 1868 ps->next_in = buf; 1869 } 1870 1871 /* CONSTCOND */ 1872 while (1) { 1873 1874 /* Quit when all input has been consumed */ 1875 if (ps->avail_in == 0 && mode == BZ_RUN) 1876 break; 1877 1878 /* Get a new output buffer */ 1879 if (hp->cpout == NULL) { 1880 HRSTART(hp->perpage, outwait); 1881 hp->cpout = CQ_GET(freebufq); 1882 HRSTOP(hp->perpage, outwait); 1883 ps->avail_out = hp->cpout->size - CSIZE; 1884 ps->next_out = hp->cpout->buf + CSIZE; 1885 } 1886 1887 /* Compress input, or finalize */ 1888 HRSTART(hp->perpage, 
compress); 1889 rc = BZ2_bzCompress(ps, mode); 1890 HRSTOP(hp->perpage, compress); 1891 1892 /* Check for error */ 1893 if (mode == BZ_RUN && rc != BZ_RUN_OK) { 1894 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n", 1895 hp->helper, BZ2_bzErrorString(rc), 1896 hp->cpin->pagenum); 1897 break; 1898 } 1899 1900 /* Write the buffer if it is full, or we are flushing */ 1901 if (ps->avail_out == 0 || mode == BZ_FINISH) { 1902 csize = hp->cpout->size - CSIZE - ps->avail_out; 1903 cs = DUMP_SET_TAG(csize, hp->tag); 1904 if (csize > 0) { 1905 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 1906 dumpsys_swrite(hp, hp->cpout, csize + CSIZE); 1907 hp->cpout = NULL; 1908 } 1909 } 1910 1911 /* Check for final complete */ 1912 if (mode == BZ_FINISH) { 1913 if (rc == BZ_STREAM_END) 1914 break; 1915 if (rc != BZ_FINISH_OK) { 1916 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n", 1917 hp->helper, BZ2_bzErrorString(rc)); 1918 break; 1919 } 1920 } 1921 } 1922 1923 /* Cleanup state and buffers */ 1924 if (mode == BZ_FINISH) { 1925 1926 /* Reset state so that it is re-usable. 
*/ 1927 (void) BZ2_bzCompressReset(&hp->bzstream); 1928 1929 /* Give any unused outout buffer to the main task */ 1930 if (hp->cpout != NULL) { 1931 hp->cpout->used = 0; 1932 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG); 1933 hp->cpout = NULL; 1934 } 1935 } 1936 } 1937 1938 static void 1939 dumpsys_bz2compress(helper_t *hp) 1940 { 1941 dumpsync_t *ds = hp->ds; 1942 dumpstreamhdr_t sh; 1943 1944 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 1945 sh.stream_pagenum = (pgcnt_t)-1; 1946 sh.stream_npages = 0; 1947 hp->cpin = NULL; 1948 hp->cpout = NULL; 1949 hp->cperr = NULL; 1950 hp->in = 0; 1951 hp->out = 0; 1952 hp->bzstream.avail_in = 0; 1953 1954 /* Bump reference to mainq while we are running */ 1955 CQ_OPEN(mainq); 1956 1957 /* Get one page at a time */ 1958 while (dumpsys_sread(hp)) { 1959 if (sh.stream_pagenum != hp->cpin->pagenum) { 1960 sh.stream_pagenum = hp->cpin->pagenum; 1961 sh.stream_npages = btop(hp->cpin->used); 1962 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN); 1963 } 1964 dumpsys_bzrun(hp, hp->page, PAGESIZE, 0); 1965 } 1966 1967 /* Done with input, flush any partial buffer */ 1968 if (sh.stream_pagenum != (pgcnt_t)-1) { 1969 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH); 1970 dumpsys_errmsg(hp, NULL); 1971 } 1972 1973 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 1974 1975 /* Decrement main queue count, we are done */ 1976 CQ_CLOSE(mainq); 1977 } 1978 1979 /* 1980 * Compress with lzjb 1981 * write stream block if full or size==0 1982 * if csize==0 write stream header, else write <csize, data> 1983 * size==0 is a call to flush a buffer 1984 * hp->cpout is the buffer we are flushing or filling 1985 * hp->out is the next index to fill data 1986 * osize is either csize+data, or the size of a stream header 1987 */ 1988 static void 1989 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size) 1990 { 1991 dumpsync_t *ds = hp->ds; 1992 const int CSIZE = sizeof (dumpcsize_t); 1993 dumpcsize_t cs; 1994 size_t osize = csize > 0 ? 
CSIZE + size : size; 1995 1996 /* If flush, and there is no buffer, just return */ 1997 if (size == 0 && hp->cpout == NULL) 1998 return; 1999 2000 /* If flush, or cpout is full, write it out */ 2001 if (size == 0 || 2002 hp->cpout != NULL && hp->out + osize > hp->cpout->size) { 2003 2004 /* Set tag+size word at the front of the stream block. */ 2005 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag); 2006 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 2007 2008 /* Write block to dump file. */ 2009 dumpsys_swrite(hp, hp->cpout, hp->out); 2010 2011 /* Clear pointer to indicate we need a new buffer */ 2012 hp->cpout = NULL; 2013 2014 /* flushing, we are done */ 2015 if (size == 0) 2016 return; 2017 } 2018 2019 /* Get an output buffer if we dont have one. */ 2020 if (hp->cpout == NULL) { 2021 HRSTART(hp->perpage, outwait); 2022 hp->cpout = CQ_GET(freebufq); 2023 HRSTOP(hp->perpage, outwait); 2024 hp->out = CSIZE; 2025 } 2026 2027 /* Store csize word. This is the size of compressed data. */ 2028 if (csize > 0) { 2029 cs = DUMP_SET_TAG(csize, 0); 2030 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE); 2031 hp->out += CSIZE; 2032 } 2033 2034 /* Store the data. 
*/ 2035 (void) memcpy(hp->cpout->buf + hp->out, buf, size); 2036 hp->out += size; 2037 } 2038 2039 static void 2040 dumpsys_lzjbcompress(helper_t *hp) 2041 { 2042 dumpsync_t *ds = hp->ds; 2043 size_t csize; 2044 dumpstreamhdr_t sh; 2045 2046 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2047 sh.stream_pagenum = (pfn_t)-1; 2048 sh.stream_npages = 0; 2049 hp->cpin = NULL; 2050 hp->cpout = NULL; 2051 hp->cperr = NULL; 2052 hp->in = 0; 2053 hp->out = 0; 2054 2055 /* Bump reference to mainq while we are running */ 2056 CQ_OPEN(mainq); 2057 2058 /* Get one page at a time */ 2059 while (dumpsys_sread(hp)) { 2060 2061 /* Create a stream header for each new input map */ 2062 if (sh.stream_pagenum != hp->cpin->pagenum) { 2063 sh.stream_pagenum = hp->cpin->pagenum; 2064 sh.stream_npages = btop(hp->cpin->used); 2065 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh)); 2066 } 2067 2068 /* Compress one page */ 2069 HRSTART(hp->perpage, compress); 2070 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2071 HRSTOP(hp->perpage, compress); 2072 2073 /* Add csize+data to output block */ 2074 ASSERT(csize > 0 && csize <= PAGESIZE); 2075 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize); 2076 } 2077 2078 /* Done with input, flush any partial buffer */ 2079 if (sh.stream_pagenum != (pfn_t)-1) { 2080 dumpsys_lzjbrun(hp, 0, NULL, 0); 2081 dumpsys_errmsg(hp, NULL); 2082 } 2083 2084 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2085 2086 /* Decrement main queue count, we are done */ 2087 CQ_CLOSE(mainq); 2088 } 2089 2090 /* 2091 * Dump helper called from panic_idle() to compress pages. CPUs in 2092 * this path must not call most kernel services. 2093 * 2094 * During panic, all but one of the CPUs is idle. These CPUs are used 2095 * as helpers working in parallel to copy and compress memory 2096 * pages. During a panic, however, these processors cannot call any 2097 * kernel services. 
This is because mutexes become no-ops during 2098 * panic, and, cross-call interrupts are inhibited. Therefore, during 2099 * panic dump the helper CPUs communicate with the panic CPU using 2100 * memory variables. All memory mapping and I/O is performed by the 2101 * panic CPU. 2102 */ 2103 void 2104 dumpsys_helper() 2105 { 2106 dumpsys_spinlock(&dumpcfg.helper_lock); 2107 if (dumpcfg.helpers_wanted) { 2108 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2109 2110 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2111 if (hp->helper == FREEHELPER) { 2112 hp->helper = CPU->cpu_id; 2113 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2114 2115 dumpsys_spinunlock(&dumpcfg.helper_lock); 2116 2117 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2118 dumpsys_lzjbcompress(hp); 2119 else 2120 dumpsys_bz2compress(hp); 2121 2122 hp->helper = DONEHELPER; 2123 return; 2124 } 2125 } 2126 } 2127 dumpsys_spinunlock(&dumpcfg.helper_lock); 2128 } 2129 2130 /* 2131 * Dump helper for live dumps. 2132 * These run as a system task. 
2133 */ 2134 static void 2135 dumpsys_live_helper(void *arg) 2136 { 2137 helper_t *hp = arg; 2138 2139 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2140 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2141 dumpsys_lzjbcompress(hp); 2142 else 2143 dumpsys_bz2compress(hp); 2144 } 2145 2146 /* 2147 * Compress one page with lzjb (single threaded case) 2148 */ 2149 static void 2150 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) 2151 { 2152 dumpsync_t *ds = hp->ds; 2153 uint32_t csize; 2154 2155 hp->helper = MAINHELPER; 2156 hp->in = 0; 2157 hp->used = 0; 2158 hp->cpin = cp; 2159 while (hp->used < cp->used) { 2160 HRSTART(hp->perpage, copy); 2161 hp->in = dumpsys_copy_page(hp, hp->in); 2162 hp->used += PAGESIZE; 2163 HRSTOP(hp->perpage, copy); 2164 2165 HRSTART(hp->perpage, compress); 2166 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2167 HRSTOP(hp->perpage, compress); 2168 2169 HRSTART(hp->perpage, write); 2170 dumpvp_write(&csize, sizeof (csize)); 2171 dumpvp_write(hp->lzbuf, csize); 2172 HRSTOP(hp->perpage, write); 2173 } 2174 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 2175 hp->cpin = NULL; 2176 } 2177 2178 /* 2179 * Main task to dump pages. This is called on the dump CPU. 
2180 */ 2181 static void 2182 dumpsys_main_task(void *arg) 2183 { 2184 dumpsync_t *ds = arg; 2185 pgcnt_t pagenum = 0, bitnum = 0, hibitnum; 2186 dumpmlw_t mlw; 2187 cbuf_t *cp; 2188 pgcnt_t baseoff, pfnoff; 2189 pfn_t base, pfn; 2190 int sec; 2191 2192 dump_init_memlist_walker(&mlw); 2193 2194 /* CONSTCOND */ 2195 while (1) { 2196 2197 if (ds->percent > ds->percent_done) { 2198 ds->percent_done = ds->percent; 2199 sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000; 2200 uprintf("^\r%2d:%02d %3d%% done", 2201 sec / 60, sec % 60, ds->percent); 2202 ds->neednl = 1; 2203 } 2204 2205 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { 2206 2207 /* the writerq never blocks */ 2208 cp = CQ_GET(writerq); 2209 if (cp == NULL) 2210 break; 2211 2212 dump_timeleft = dump_timeout; 2213 2214 HRSTART(ds->perpage, write); 2215 dumpvp_write(cp->buf, cp->used); 2216 HRSTOP(ds->perpage, write); 2217 2218 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2219 } 2220 2221 /* 2222 * Wait here for some buffers to process. Returns NULL 2223 * when all helpers have terminated and all buffers 2224 * have been processed. 2225 */ 2226 cp = CQ_GET(mainq); 2227 2228 if (cp == NULL) { 2229 2230 /* Drain the write queue. */ 2231 if (!CQ_IS_EMPTY(writerq)) 2232 continue; 2233 2234 /* Main task exits here. */ 2235 break; 2236 } 2237 2238 dump_timeleft = dump_timeout; 2239 2240 switch (cp->state) { 2241 2242 case CBUF_FREEMAP: 2243 2244 /* 2245 * Note that we drop CBUF_FREEMAP buffers on 2246 * the floor (they will not be on any cqueue) 2247 * when we no longer need them. 
2248 */ 2249 if (bitnum >= dumpcfg.bitmapsize) 2250 break; 2251 2252 if (dump_ioerr) { 2253 bitnum = dumpcfg.bitmapsize; 2254 CQ_CLOSE(helperq); 2255 break; 2256 } 2257 2258 HRSTART(ds->perpage, bitmap); 2259 for (; bitnum < dumpcfg.bitmapsize; bitnum++) 2260 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2261 break; 2262 HRSTOP(ds->perpage, bitmap); 2263 dump_timeleft = dump_timeout; 2264 2265 if (bitnum >= dumpcfg.bitmapsize) { 2266 CQ_CLOSE(helperq); 2267 break; 2268 } 2269 2270 /* 2271 * Try to map CBUF_MAPSIZE ranges. Can't 2272 * assume that memory segment size is a 2273 * multiple of CBUF_MAPSIZE. Can't assume that 2274 * the segment starts on a CBUF_MAPSIZE 2275 * boundary. 2276 */ 2277 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2278 ASSERT(pfn != PFN_INVALID); 2279 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); 2280 2281 base = P2ALIGN(pfn, CBUF_MAPNP); 2282 if (base < mlw.mpaddr) { 2283 base = mlw.mpaddr; 2284 baseoff = P2PHASE(base, CBUF_MAPNP); 2285 } else { 2286 baseoff = 0; 2287 } 2288 2289 pfnoff = pfn - base; 2290 if (pfnoff + mlw.mpleft < CBUF_MAPNP) { 2291 hibitnum = bitnum + mlw.mpleft; 2292 cp->size = ptob(pfnoff + mlw.mpleft); 2293 } else { 2294 hibitnum = bitnum - pfnoff + CBUF_MAPNP - 2295 baseoff; 2296 cp->size = CBUF_MAPSIZE - ptob(baseoff); 2297 } 2298 2299 cp->pfn = pfn; 2300 cp->bitnum = bitnum++; 2301 cp->pagenum = pagenum++; 2302 cp->off = ptob(pfnoff); 2303 2304 for (; bitnum < hibitnum; bitnum++) 2305 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2306 pagenum++; 2307 2308 dump_timeleft = dump_timeout; 2309 cp->used = ptob(pagenum - cp->pagenum); 2310 2311 HRSTART(ds->perpage, map); 2312 hat_devload(kas.a_hat, cp->buf, cp->size, base, 2313 PROT_READ, HAT_LOAD_NOCONSIST); 2314 HRSTOP(ds->perpage, map); 2315 2316 ds->pages_mapped += btop(cp->size); 2317 ds->pages_used += pagenum - cp->pagenum; 2318 2319 CQ_OPEN(mainq); 2320 2321 /* 2322 * If there are no helpers the main task does 2323 * non-streams lzjb compress. 
2324 */ 2325 if (dumpcfg.clevel == 0) { 2326 dumpsys_lzjb_page(dumpcfg.helper, cp); 2327 break; 2328 } 2329 2330 /* pass mapped pages to a helper */ 2331 CQ_PUT(helperq, cp, CBUF_INREADY); 2332 2333 /* the last page was done */ 2334 if (bitnum >= dumpcfg.bitmapsize) 2335 CQ_CLOSE(helperq); 2336 2337 break; 2338 2339 case CBUF_USEDMAP: 2340 2341 ds->npages += btop(cp->used); 2342 2343 HRSTART(ds->perpage, unmap); 2344 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); 2345 HRSTOP(ds->perpage, unmap); 2346 2347 if (bitnum < dumpcfg.bitmapsize) 2348 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2349 CQ_CLOSE(mainq); 2350 2351 ASSERT(ds->npages <= dumphdr->dump_npages); 2352 ds->percent = ds->npages * 100LL / dumphdr->dump_npages; 2353 break; 2354 2355 case CBUF_WRITE: 2356 2357 CQ_PUT(writerq, cp, CBUF_WRITE); 2358 break; 2359 2360 case CBUF_ERRMSG: 2361 2362 if (cp->used > 0) { 2363 cp->buf[cp->size - 2] = '\n'; 2364 cp->buf[cp->size - 1] = '\0'; 2365 if (ds->neednl) { 2366 uprintf("\n%s", cp->buf); 2367 ds->neednl = 0; 2368 } else { 2369 uprintf("%s", cp->buf); 2370 } 2371 /* wait for console output */ 2372 drv_usecwait(200000); 2373 dump_timeleft = dump_timeout; 2374 } 2375 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2376 break; 2377 2378 default: 2379 uprintf("dump: unexpected buffer state %d, " 2380 "buffer will be lost\n", cp->state); 2381 break; 2382 2383 } /* end switch */ 2384 2385 } /* end while(1) */ 2386 } 2387 2388 #ifdef COLLECT_METRICS 2389 size_t 2390 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) 2391 { 2392 dumpcfg_t *cfg = &dumpcfg; 2393 int myid = CPU->cpu_seqid; 2394 int i, compress_ratio; 2395 int sec, iorate; 2396 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; 2397 char *e = buf + size; 2398 char *p = buf; 2399 2400 sec = ds->elapsed / (1000 * 1000 * 1000ULL); 2401 if (sec < 1) 2402 sec = 1; 2403 2404 if (ds->iotime < 1) 2405 ds->iotime = 1; 2406 iorate = (ds->nwrite * 100000ULL) / ds->iotime; 2407 2408 compress_ratio = 100LL * ds->npages / 
btopr(ds->nwrite + 1); 2409 2410 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0) 2411 2412 P("Master cpu_seqid,%d\n", CPU->cpu_seqid); 2413 P("Master cpu_id,%d\n", CPU->cpu_id); 2414 P("dump_flags,0x%x\n", dumphdr->dump_flags); 2415 P("dump_ioerr,%d\n", dump_ioerr); 2416 2417 P("Helpers:\n"); 2418 for (i = 0; i < ncpus; i++) { 2419 if ((i & 15) == 0) 2420 P(",,%03d,", i); 2421 if (i == myid) 2422 P(" M"); 2423 else if (BT_TEST(cfg->helpermap, i)) 2424 P("%4d", cpu_seq[i]->cpu_id); 2425 else 2426 P(" *"); 2427 if ((i & 15) == 15) 2428 P("\n"); 2429 } 2430 2431 P("ncbuf_used,%d\n", cfg->ncbuf_used); 2432 P("ncmap,%d\n", cfg->ncmap); 2433 2434 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); 2435 P("Found small pages,%ld\n", cfg->foundsm); 2436 2437 P("Compression level,%d\n", cfg->clevel); 2438 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel", 2439 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb"); 2440 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % 2441 100); 2442 P("nhelper_used,%d\n", cfg->nhelper_used); 2443 2444 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); 2445 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); 2446 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); 2447 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); 2448 P("dumpbuf.size,%ld\n", dumpbuf.size); 2449 2450 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); 2451 P("Dump pages,%llu\n", (u_longlong_t)ds->npages); 2452 P("Dump time,%d\n", sec); 2453 2454 if (ds->pages_mapped > 0) 2455 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) 2456 / ds->pages_mapped)); 2457 2458 P("\nPer-page metrics:\n"); 2459 if (ds->npages > 0) { 2460 for (hp = cfg->helper; hp != hpend; hp++) { 2461 #define PERPAGE(x) ds->perpage.x += hp->perpage.x; 2462 PERPAGES; 2463 #undef PERPAGE 2464 } 2465 #define PERPAGE(x) \ 2466 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); 2467 PERPAGES; 
2468 #undef PERPAGE 2469 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / 2470 ds->npages)); 2471 P("helperq.empty,%d\n", (int)(ds->helperq.empty / 2472 ds->npages)); 2473 P("writerq.empty,%d\n", (int)(ds->writerq.empty / 2474 ds->npages)); 2475 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); 2476 2477 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / 2478 ds->npages)); 2479 } 2480 #undef P 2481 if (p < e) 2482 bzero(p, e - p); 2483 return (p - buf); 2484 } 2485 #endif /* COLLECT_METRICS */ 2486 2487 /* 2488 * Dump the system. 2489 */ 2490 void 2491 dumpsys(void) 2492 { 2493 dumpsync_t *ds = &dumpsync; 2494 taskq_t *livetaskq = NULL; 2495 pfn_t pfn; 2496 pgcnt_t bitnum; 2497 proc_t *p; 2498 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2499 cbuf_t *cp; 2500 pid_t npids, pidx; 2501 char *content; 2502 char *buf; 2503 size_t size; 2504 int save_dump_clevel; 2505 dumpmlw_t mlw; 2506 dumpcsize_t datatag; 2507 dumpdatahdr_t datahdr; 2508 2509 if (dumpvp == NULL || dumphdr == NULL) { 2510 uprintf("skipping system dump - no dump device configured\n"); 2511 if (panicstr) { 2512 dumpcfg.helpers_wanted = 0; 2513 dumpsys_spinunlock(&dumpcfg.helper_lock); 2514 } 2515 return; 2516 } 2517 dumpbuf.cur = dumpbuf.start; 2518 2519 /* clear the sync variables */ 2520 ASSERT(dumpcfg.nhelper > 0); 2521 bzero(ds, sizeof (*ds)); 2522 ds->dumpcpu = CPU->cpu_id; 2523 2524 /* 2525 * Calculate the starting block for dump. If we're dumping on a 2526 * swap device, start 1/5 of the way in; otherwise, start at the 2527 * beginning. And never use the first page -- it may be a disk label. 
2528 */ 2529 if (dumpvp->v_flag & VISSWAP) 2530 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET); 2531 else 2532 dumphdr->dump_start = DUMP_OFFSET; 2533 2534 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; 2535 dumphdr->dump_crashtime = gethrestime_sec(); 2536 dumphdr->dump_npages = 0; 2537 dumphdr->dump_nvtop = 0; 2538 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); 2539 dump_timeleft = dump_timeout; 2540 2541 if (panicstr) { 2542 dumphdr->dump_flags &= ~DF_LIVE; 2543 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL); 2544 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); 2545 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, 2546 panicstr, panicargs); 2547 2548 } 2549 2550 if (dump_conflags & DUMP_ALL) 2551 content = "all"; 2552 else if (dump_conflags & DUMP_CURPROC) 2553 content = "kernel + curproc"; 2554 else 2555 content = "kernel"; 2556 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, 2557 dumphdr->dump_start, content); 2558 2559 /* Make sure nodename is current */ 2560 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); 2561 2562 /* 2563 * If this is a live dump, try to open a VCHR vnode for better 2564 * performance. We must take care to flush the buffer cache 2565 * first. 2566 */ 2567 if (!panicstr) { 2568 vnode_t *cdev_vp, *cmn_cdev_vp; 2569 2570 ASSERT(dumpbuf.cdev_vp == NULL); 2571 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); 2572 if (cdev_vp != NULL) { 2573 cmn_cdev_vp = common_specvp(cdev_vp); 2574 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) 2575 == 0) { 2576 if (vn_has_cached_data(dumpvp)) 2577 (void) pvn_vplist_dirty(dumpvp, 0, NULL, 2578 B_INVAL | B_TRUNC, kcred); 2579 dumpbuf.cdev_vp = cmn_cdev_vp; 2580 } else { 2581 VN_RELE(cdev_vp); 2582 } 2583 } 2584 } 2585 2586 /* 2587 * Store a hires timestamp so we can look it up during debugging. 
2588 */ 2589 lbolt_debug_entry(); 2590 2591 /* 2592 * Leave room for the message and ereport save areas and terminal dump 2593 * header. 2594 */ 2595 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - 2596 DUMP_ERPTSIZE; 2597 2598 /* 2599 * Write out the symbol table. It's no longer compressed, 2600 * so its 'size' and 'csize' are equal. 2601 */ 2602 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; 2603 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = 2604 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); 2605 2606 /* 2607 * Write out the translation map. 2608 */ 2609 dumphdr->dump_map = dumpvp_flush(); 2610 dump_as(&kas); 2611 dumphdr->dump_nvtop += dump_plat_addr(); 2612 2613 /* 2614 * call into hat, which may have unmapped pages that also need to 2615 * be in the dump 2616 */ 2617 hat_dump(); 2618 2619 if (dump_conflags & DUMP_ALL) { 2620 mutex_enter(&pidlock); 2621 2622 for (npids = 0, p = practive; p != NULL; p = p->p_next) 2623 dumpcfg.pids[npids++] = p->p_pid; 2624 2625 mutex_exit(&pidlock); 2626 2627 for (pidx = 0; pidx < npids; pidx++) 2628 (void) dump_process(dumpcfg.pids[pidx]); 2629 2630 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2631 dump_timeleft = dump_timeout; 2632 BT_SET(dumpcfg.bitmap, bitnum); 2633 } 2634 dumphdr->dump_npages = dumpcfg.bitmapsize; 2635 dumphdr->dump_flags |= DF_ALL; 2636 2637 } else if (dump_conflags & DUMP_CURPROC) { 2638 /* 2639 * Determine which pid is to be dumped. If we're panicking, we 2640 * dump the process associated with panic_thread (if any). If 2641 * this is a live dump, we dump the process associated with 2642 * curthread. 
2643 */ 2644 npids = 0; 2645 if (panicstr) { 2646 if (panic_thread != NULL && 2647 panic_thread->t_procp != NULL && 2648 panic_thread->t_procp != &p0) { 2649 dumpcfg.pids[npids++] = 2650 panic_thread->t_procp->p_pid; 2651 } 2652 } else { 2653 dumpcfg.pids[npids++] = curthread->t_procp->p_pid; 2654 } 2655 2656 if (npids && dump_process(dumpcfg.pids[0]) == 0) 2657 dumphdr->dump_flags |= DF_CURPROC; 2658 else 2659 dumphdr->dump_flags |= DF_KERNEL; 2660 2661 } else { 2662 dumphdr->dump_flags |= DF_KERNEL; 2663 } 2664 2665 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1; 2666 2667 /* 2668 * Write out the pfn table. 2669 */ 2670 dumphdr->dump_pfn = dumpvp_flush(); 2671 dump_init_memlist_walker(&mlw); 2672 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2673 dump_timeleft = dump_timeout; 2674 if (!BT_TEST(dumpcfg.bitmap, bitnum)) 2675 continue; 2676 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2677 ASSERT(pfn != PFN_INVALID); 2678 dumpvp_write(&pfn, sizeof (pfn_t)); 2679 } 2680 dump_plat_pfn(); 2681 2682 /* 2683 * Write out all the pages. 2684 * Map pages, copy them handling UEs, compress, and write them out. 2685 * Cooperate with any helpers running on CPUs in panic_idle(). 
2686 */ 2687 dumphdr->dump_data = dumpvp_flush(); 2688 2689 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); 2690 ds->live = dumpcfg.clevel > 0 && 2691 (dumphdr->dump_flags & DF_LIVE) != 0; 2692 2693 save_dump_clevel = dumpcfg.clevel; 2694 if (panicstr) 2695 dumpsys_get_maxmem(); 2696 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2697 dumpcfg.clevel = DUMP_CLEVEL_LZJB; 2698 2699 dumpcfg.nhelper_used = 0; 2700 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2701 if (hp->page == NULL) { 2702 hp->helper = DONEHELPER; 2703 continue; 2704 } 2705 ++dumpcfg.nhelper_used; 2706 hp->helper = FREEHELPER; 2707 hp->taskqid = NULL; 2708 hp->ds = ds; 2709 bzero(&hp->perpage, sizeof (hp->perpage)); 2710 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2711 (void) BZ2_bzCompressReset(&hp->bzstream); 2712 } 2713 2714 CQ_OPEN(freebufq); 2715 CQ_OPEN(helperq); 2716 2717 dumpcfg.ncbuf_used = 0; 2718 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { 2719 if (cp->buf != NULL) { 2720 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2721 ++dumpcfg.ncbuf_used; 2722 } 2723 } 2724 2725 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) 2726 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2727 2728 ds->start = gethrtime(); 2729 ds->iowaitts = ds->start; 2730 2731 /* start helpers */ 2732 if (ds->live) { 2733 int n = dumpcfg.nhelper_used; 2734 int pri = MINCLSYSPRI - 25; 2735 2736 livetaskq = taskq_create("LiveDump", n, pri, n, n, 2737 TASKQ_PREPOPULATE); 2738 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2739 if (hp->page == NULL) 2740 continue; 2741 hp->helper = hp - dumpcfg.helper; 2742 hp->taskqid = taskq_dispatch(livetaskq, 2743 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); 2744 } 2745 2746 } else { 2747 if (panicstr) 2748 kmem_dump_begin(); 2749 dumpcfg.helpers_wanted = dumpcfg.clevel > 0; 2750 dumpsys_spinunlock(&dumpcfg.helper_lock); 2751 } 2752 2753 /* run main task */ 2754 dumpsys_main_task(ds); 2755 2756 ds->elapsed = gethrtime() - ds->start; 2757 if (ds->elapsed < 1) 2758 ds->elapsed = 1; 
2759 2760 if (livetaskq != NULL) 2761 taskq_destroy(livetaskq); 2762 2763 if (ds->neednl) { 2764 uprintf("\n"); 2765 ds->neednl = 0; 2766 } 2767 2768 /* record actual pages dumped */ 2769 dumphdr->dump_npages = ds->npages; 2770 2771 /* platform-specific data */ 2772 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); 2773 2774 /* note any errors by clearing DF_COMPLETE */ 2775 if (dump_ioerr || ds->npages < dumphdr->dump_npages) 2776 dumphdr->dump_flags &= ~DF_COMPLETE; 2777 2778 /* end of stream blocks */ 2779 datatag = 0; 2780 dumpvp_write(&datatag, sizeof (datatag)); 2781 2782 bzero(&datahdr, sizeof (datahdr)); 2783 2784 /* buffer for metrics */ 2785 buf = dumpcfg.cbuf[0].buf; 2786 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - 2787 sizeof (dumpdatahdr_t)); 2788 2789 /* finish the kmem intercepts, collect kmem verbose info */ 2790 if (panicstr) { 2791 datahdr.dump_metrics = kmem_dump_finish(buf, size); 2792 buf += datahdr.dump_metrics; 2793 size -= datahdr.dump_metrics; 2794 } 2795 2796 /* compression info in data header */ 2797 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; 2798 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; 2799 datahdr.dump_maxcsize = CBUF_SIZE; 2800 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; 2801 datahdr.dump_nstreams = dumpcfg.nhelper_used; 2802 datahdr.dump_clevel = dumpcfg.clevel; 2803 #ifdef COLLECT_METRICS 2804 if (dump_metrics_on) 2805 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size); 2806 #endif 2807 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; 2808 2809 /* 2810 * Write out the initial and terminal dump headers. 
2811 */ 2812 dumpbuf.vp_off = dumphdr->dump_start; 2813 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2814 (void) dumpvp_flush(); 2815 2816 dumpbuf.vp_limit = dumpvp_size; 2817 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; 2818 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2819 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); 2820 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); 2821 2822 (void) dumpvp_flush(); 2823 2824 uprintf("\r%3d%% done: %llu pages dumped, ", 2825 ds->percent_done, (u_longlong_t)ds->npages); 2826 2827 if (dump_ioerr == 0) { 2828 uprintf("dump succeeded\n"); 2829 } else { 2830 uprintf("dump failed: error %d\n", dump_ioerr); 2831 #ifdef DEBUG 2832 if (panicstr) 2833 debug_enter("dump failed"); 2834 #endif 2835 } 2836 2837 /* 2838 * Write out all undelivered messages. This has to be the *last* 2839 * thing we do because the dump process itself emits messages. 2840 */ 2841 if (panicstr) { 2842 dump_ereports(); 2843 dump_messages(); 2844 } 2845 2846 delay(2 * hz); /* let people see the 'done' message */ 2847 dump_timeleft = 0; 2848 dump_ioerr = 0; 2849 2850 /* restore settings after live dump completes */ 2851 if (!panicstr) { 2852 dumpcfg.clevel = save_dump_clevel; 2853 2854 /* release any VCHR open of the dump device */ 2855 if (dumpbuf.cdev_vp != NULL) { 2856 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, 2857 kcred, NULL); 2858 VN_RELE(dumpbuf.cdev_vp); 2859 dumpbuf.cdev_vp = NULL; 2860 } 2861 } 2862 } 2863 2864 /* 2865 * This function is called whenever the memory size, as represented 2866 * by the phys_install list, changes. 2867 */ 2868 void 2869 dump_resize() 2870 { 2871 mutex_enter(&dump_lock); 2872 dumphdr_init(); 2873 dumpbuf_resize(); 2874 dump_update_clevel(); 2875 mutex_exit(&dump_lock); 2876 } 2877 2878 /* 2879 * This function allows for dynamic resizing of a dump area. It assumes that 2880 * the underlying device has update its appropriate size(9P). 
2881 */ 2882 int 2883 dumpvp_resize() 2884 { 2885 int error; 2886 vattr_t vattr; 2887 2888 mutex_enter(&dump_lock); 2889 vattr.va_mask = AT_SIZE; 2890 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) { 2891 mutex_exit(&dump_lock); 2892 return (error); 2893 } 2894 2895 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) { 2896 mutex_exit(&dump_lock); 2897 return (ENOSPC); 2898 } 2899 2900 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 2901 mutex_exit(&dump_lock); 2902 return (0); 2903 } 2904