/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/mem.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/memlist.h>
#include <sys/dumphdr.h>
#include <sys/dumpadm.h>
#include <sys/ksyms.h>
#include <sys/compress.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/modctl.h>
#include <sys/utsname.h>
#include <sys/systeminfo.h>
#include <sys/vmem.h>
#include <sys/log.h>
#include <sys/var.h>
#include <sys/debug.h>
#include <sys/sunddi.h>
#include <fs/fs_subr.h>
#include <sys/fs/snode.h>
#include <sys/ontrap.h>
#include <sys/panic.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/errorq.h>
#include <sys/fm/util.h>
#include <sys/fs/zfs.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <sys/clock_impl.h>

#include <bzip2/bzlib.h>

/*
 * Crash dump time is dominated by disk write time.  To reduce this,
 * the stronger compression method bzip2 is applied to reduce the dump
 * size and hence reduce I/O time.  However, bzip2 is much more
 * computationally expensive than the existing lzjb algorithm, so to
 * avoid increasing compression time, CPUs that are otherwise idle
 * during panic are employed to parallelize the compression task.
 * Many helper CPUs are needed to prevent bzip2 from being a
 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
 * parallelized instead. Lastly, I/O and compression are performed by
 * different CPUs, and are hence overlapped in time, unlike the older
 * serial code.
 *
 * Another important consideration is the speed of the dump
 * device.  Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2.  Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * elevated for faster disks.  The dump device speed is inferred from
 * the setting for dumpbuf.iosize, see dump_update_clevel.
 */

/*
 * exported vars
 */
kmutex_t	dump_lock;		/* lock for dump configuration */
dumphdr_t	*dumphdr;		/* dump header */
int		dump_conflags = DUMP_KERNEL; /* dump configuration flags */
vnode_t		*dumpvp;		/* dump device vnode pointer */
u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
char		*dumppath;		/* pathname of dump device */
int		dump_timeout = 120;	/* timeout for dumping pages */
int		dump_timeleft;		/* portion of dump_timeout remaining */
int		dump_ioerr;		/* dump i/o error */
int		dump_check_used;	/* enable check for used pages */

/*
 * Tunables for dump compression and parallelism. These can be set via
 * /etc/system.
 *
 * dump_ncpu_low	number of helpers for parallel lzjb
 *	This is also the minimum configuration.
 *
 * dump_bzip2_level	bzip2 compression level: 1-9
 *	Higher numbers give greater compression, but take more memory
 *	and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
 *
 * dump_plat_mincpu	the cross-over limit for using bzip2 (per platform):
 *	if dump_plat_mincpu == 0, then always do single threaded dump
 *	if ncpu >= dump_plat_mincpu then try to use bzip2
 *
 * dump_metrics_on	if set, metrics are collected in the kernel, passed
 *	to savecore via the dump file, and recorded by savecore in
 *	METRICS.txt.
 */
uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
uint_t dump_bzip2_level = 1;	/* bzip2 level (1-9) */

/*
 * tunables for pre-reserved heap; both feed the size passed to
 * kmem_dump_init() in dump_update_clevel()
 */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 8;

/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/*
 * minimum number of helpers configured; note these expand to the
 * run-time tunable dump_ncpu_low, so they are not compile-time constants
 */
#define	MINHELPERS	(dump_ncpu_low)
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)

/*
 * Define constant parameters.
 *
 * CBUF_SIZE		size of an output buffer (1 << 17 = 128KB)
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *			(1 << CBUF_MAPSHIFT = 4MB)
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))

/*
 * Compression metrics are accumulated nano-second subtotals. The
 * results are normalized by the number of pages dumped. A report is
 * generated when dumpsys() completes and is saved in the dump image
 * after the trailing dump header.
*
 * Metrics are always collected. Set the variable dump_metrics_on to
 * cause metrics to be saved in the crash file, where savecore will
 * save it in the file METRICS.txt.
 */
/*
 * X-macro list of the per-page metric names.  Each user defines
 * PERPAGE(x) before expanding PERPAGES and undefines it afterwards.
 */
#define	PERPAGES \
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
	PERPAGE(copy) PERPAGE(compress) \
	PERPAGE(write) \
	PERPAGE(inwait) PERPAGE(outwait)

/* one hrtime_t field per metric named in PERPAGES */
typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;
	PERPAGES
#undef PERPAGE
} perpage_t;

/*
 * This macro controls the code generation for collecting dump
 * performance information. By default, the code is generated, but
 * automatic saving of the information is disabled. If dump_metrics_on
 * is set to 1, the timing information is passed to savecore via the
 * crash file, where it is appended to the file dump-dir/METRICS.txt.
 */
#define	COLLECT_METRICS

#ifdef COLLECT_METRICS
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */

/*
 * Timing helpers.  "v" names an accumulator and the macros paste "ts"
 * onto it (v##ts) to reach the companion timestamp variable, so an
 * accumulator "x.perpage" requires a sibling "x.perpagets".
 *
 * NOTE(review): HRBEGIN expands to two statements and is therefore not
 * safe as the sole body of an unbraced if/else.
 */
#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
#define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
#define	HRNORM(v, m, n)		v.m /= (n)

#else
/* metrics compiled out: every helper expands to nothing */
#define	HRSTART(v, m)
#define	HRSTOP(v, m)
#define	HRBEGIN(v, m, s)
#define	HREND(v, m)
#define	HRNORM(v, m, n)
#endif	/* COLLECT_METRICS */

/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
*
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors.)
 */

/* processing state of a cbuf_t; see the state descriptions above */
typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;

typedef struct cbuf cbuf_t;

/*
 * Buffer controller.  The same structure describes both input
 * mappings (buf is a CBUF_MAPSIZE virtual range) and output buffers
 * (buf is CBUF_SIZE bytes of memory); "state" tells which.
 */
struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};

/*
 * cqueue_t queues: a uni-directional channel for communication
 * from the master to helper tasks or vice-versa using put and
 * get primitives.  Both mappings and data buffers are passed via
 * queues. Producers close a queue when done. The number of
 * active producers is reference counted so the consumer can
 * detect end of data. Concurrent access is mediated by atomic
 * operations for panic dump, or mutex/cv for live dump.
*
 * There are four queues, used as follows:
 *
 *	Queue		Dataflow		NewState
 *	--------------------------------------------------
 *	mainq		master -> master	FREEMAP
 *	master has initialized or unmapped an input buffer
 *	--------------------------------------------------
 *	helperq		master -> helper	INREADY
 *	master has mapped input for use by helper
 *	--------------------------------------------------
 *	mainq		master <- helper	USEDMAP
 *	helper is done with input
 *	--------------------------------------------------
 *	freebufq	master -> helper	FREEBUF
 *	master has initialized or written an output buffer
 *	--------------------------------------------------
 *	mainq		master <- helper	WRITE
 *	block of compressed pages from a helper
 *	--------------------------------------------------
 *	mainq		master <- helper	ERRMSG
 *	error messages from a helper (memory error case)
 *	--------------------------------------------------
 *	writerq		master <- master	WRITE
 *	non-blocking queue of blocks to write
 *	--------------------------------------------------
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;

/*
 * Convenience macros for using the cqueue functions
 * Note that the caller must have defined "dumpsync_t *ds"
 * ("q" is the name of a cqueue_t member of *ds, e.g. mainq)
 */
#define	CQ_IS_EMPTY(q)					\
	(ds->q.first == NULL)

#define	CQ_OPEN(q)					\
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q)					\
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st)				\
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q)					\
	dumpsys_get_cq(&ds->q, ds->live)

/*
 * Dynamic state when dumpsys() is running.
 */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	perpage_t perpagets;		/* metrics (timestamps) */
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */

/*
 * helper_t helpers: contains the context for a stream. CPUs run in
 * parallel at dump time; each CPU creates a single stream of
 * compression data.  Stream data is divided into CBUF_SIZE blocks.
 * The blocks are written in order within a stream. But, blocks from
 * multiple streams can be interleaved. Each stream is identified by a
 * unique tag.
*/
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
	bz_stream bzstream;		/* bzip2 state */
} helper_t;

/* special (negative) values for helper_t.helper */
#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */

/*
 * configuration vars for dumpsys
 */
typedef struct dumpcfg {
	int	threshold;	/* ncpu threshold for bzip2 */
	int	nhelper;	/* number of helpers */
	int	nhelper_used;	/* actual number of helpers used */
	int	ncmap;		/* number VA pages for compression */
	int	ncbuf;		/* number of bufs for compression */
	int	ncbuf_used;	/* number of bufs in use */
	uint_t	clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t	*cmap;		/* array of input (map) buffers */
	cbuf_t	*cbuf;		/* array of output  buffers */
	ulong_t	*helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t	*bitmap;	/* bitmap for marking pages to dump */
	ulong_t	*rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t	bitmapsize;	/* size of bitmap */
	pgcnt_t	rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t found4m;	/* number ranges allocated by dump */
	pgcnt_t foundsm;	/* number small pages allocated by dump */
	pid_t	*pids;		/* list of process IDs at dump time */
	size_t	maxsize;	/* memory size needed at dump time */
	size_t	maxvmsize;	/* size of reserved VM */
	char	*maxvm;		/* reserved VM for spare pages */
	lock_t	helper_lock;	/* protect helper state */
	char	helpers_wanted;	/* flag to enable parallelism */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */

/*
 * The dump I/O buffer.
 *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 * sized according to the optimum device transfer speed.
 */
typedef struct dumpbuf {
	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

dumpbuf_t dumpbuf;		/* I/O buffer */

/*
 * The dump I/O buffer must be at least one page, at most xfer_size
 * bytes, and should scale with physmem in between.  The transfer size
 * passed in will either represent a global default (maxphys) or the
 * best size for the device.  The size of the dumpbuf I/O buffer is
 * limited by dumpbuf_limit (8MB by default) because the dump
 * performance saturates beyond a certain size.  The default is to
 * select 1/4096 of the memory.
435 */ 436 static int dumpbuf_fraction = 12; /* memory size scale factor */ 437 static size_t dumpbuf_limit = 8 * DUMP_1MB; /* max I/O buf size */ 438 439 static size_t 440 dumpbuf_iosize(size_t xfer_size) 441 { 442 size_t iosize = ptob(physmem >> dumpbuf_fraction); 443 444 if (iosize < PAGESIZE) 445 iosize = PAGESIZE; 446 else if (iosize > xfer_size) 447 iosize = xfer_size; 448 if (iosize > dumpbuf_limit) 449 iosize = dumpbuf_limit; 450 return (iosize & PAGEMASK); 451 } 452 453 /* 454 * resize the I/O buffer 455 */ 456 static void 457 dumpbuf_resize(void) 458 { 459 char *old_buf = dumpbuf.start; 460 size_t old_size = dumpbuf.size; 461 char *new_buf; 462 size_t new_size; 463 464 ASSERT(MUTEX_HELD(&dump_lock)); 465 466 new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys)); 467 if (new_size <= old_size) 468 return; /* no need to reallocate buffer */ 469 470 new_buf = kmem_alloc(new_size, KM_SLEEP); 471 dumpbuf.size = new_size; 472 dumpbuf.start = new_buf; 473 dumpbuf.end = new_buf + new_size; 474 kmem_free(old_buf, old_size); 475 } 476 477 /* 478 * dump_update_clevel is called when dumpadm configures the dump device. 479 * Calculate number of helpers and buffers. 480 * Allocate the minimum configuration for now. 481 * 482 * When the dump file is configured we reserve a minimum amount of 483 * memory for use at crash time. But we reserve VA for all the memory 484 * we really want in order to do the fastest dump possible. The VA is 485 * backed by pages not being dumped, according to the bitmap. If 486 * there is insufficient spare memory, however, we fall back to the 487 * minimum. 488 * 489 * Live dump (savecore -L) always uses the minimum config. 490 * 491 * clevel 0 is single threaded lzjb 492 * clevel 1 is parallel lzjb 493 * clevel 2 is parallel bzip2 494 * 495 * The ncpu threshold is selected with dump_plat_mincpu. 496 * On OPL, set_platform_defaults() overrides the sun4u setting. 497 * The actual values are defined via DUMP_PLAT_*_MINCPU macros. 
498 * 499 * Architecture Threshold Algorithm 500 * sun4u < 51 parallel lzjb 501 * sun4u >= 51 parallel bzip2(*) 502 * sun4u OPL < 8 parallel lzjb 503 * sun4u OPL >= 8 parallel bzip2(*) 504 * sun4v < 128 parallel lzjb 505 * sun4v >= 128 parallel bzip2(*) 506 * x86 < 11 parallel lzjb 507 * x86 >= 11 parallel bzip2(*) 508 * 32-bit N/A single-threaded lzjb 509 * 510 * (*) bzip2 is only chosen if there is sufficient available 511 * memory for buffers at dump time. See dumpsys_get_maxmem(). 512 * 513 * Faster dump devices have larger I/O buffers. The threshold value is 514 * increased according to the size of the dump I/O buffer, because 515 * parallel lzjb performs better with faster disks. For buffers >= 1MB 516 * the threshold is 3X; for buffers >= 256K threshold is 2X. 517 * 518 * For parallel dumps, the number of helpers is ncpu-1. The CPU 519 * running panic runs the main task. For single-threaded dumps, the 520 * panic CPU does lzjb compression (it is tagged as MAINHELPER.) 521 * 522 * Need multiple buffers per helper so that they do not block waiting 523 * for the main task. 524 * parallel single-threaded 525 * Number of output buffers: nhelper*2 1 526 * Number of mapping buffers: nhelper*4 1 527 * 528 */ 529 static void 530 dump_update_clevel() 531 { 532 int tag; 533 size_t bz2size; 534 helper_t *hp, *hpend; 535 cbuf_t *cp, *cpend; 536 dumpcfg_t *old = &dumpcfg; 537 dumpcfg_t newcfg = *old; 538 dumpcfg_t *new = &newcfg; 539 540 ASSERT(MUTEX_HELD(&dump_lock)); 541 542 /* 543 * Free the previously allocated bufs and VM. 
544 */ 545 if (old->helper != NULL) { 546 547 /* helpers */ 548 hpend = &old->helper[old->nhelper]; 549 for (hp = old->helper; hp != hpend; hp++) { 550 if (hp->lzbuf != NULL) 551 kmem_free(hp->lzbuf, PAGESIZE); 552 if (hp->page != NULL) 553 kmem_free(hp->page, PAGESIZE); 554 } 555 kmem_free(old->helper, old->nhelper * sizeof (helper_t)); 556 557 /* VM space for mapping pages */ 558 cpend = &old->cmap[old->ncmap]; 559 for (cp = old->cmap; cp != cpend; cp++) 560 vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE); 561 kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t)); 562 563 /* output bufs */ 564 cpend = &old->cbuf[old->ncbuf]; 565 for (cp = old->cbuf; cp != cpend; cp++) 566 if (cp->buf != NULL) 567 kmem_free(cp->buf, cp->size); 568 kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t)); 569 570 /* reserved VM for dumpsys_get_maxmem */ 571 if (old->maxvmsize > 0) 572 vmem_xfree(heap_arena, old->maxvm, old->maxvmsize); 573 } 574 575 /* 576 * Allocate memory and VM. 577 * One CPU runs dumpsys, the rest are helpers. 578 */ 579 new->nhelper = ncpus - 1; 580 if (new->nhelper < 1) 581 new->nhelper = 1; 582 583 if (new->nhelper > DUMP_MAX_NHELPER) 584 new->nhelper = DUMP_MAX_NHELPER; 585 586 /* increase threshold for faster disks */ 587 new->threshold = dump_plat_mincpu; 588 if (dumpbuf.iosize >= DUMP_1MB) 589 new->threshold *= 3; 590 else if (dumpbuf.iosize >= (256 * DUMP_1KB)) 591 new->threshold *= 2; 592 593 /* figure compression level based upon the computed threshold. 
*/ 594 if (dump_plat_mincpu == 0 || new->nhelper < 2) { 595 new->clevel = 0; 596 new->nhelper = 1; 597 } else if ((new->nhelper + 1) >= new->threshold) { 598 new->clevel = DUMP_CLEVEL_BZIP2; 599 } else { 600 new->clevel = DUMP_CLEVEL_LZJB; 601 } 602 603 if (new->clevel == 0) { 604 new->ncbuf = 1; 605 new->ncmap = 1; 606 } else { 607 new->ncbuf = NCBUF_PER_HELPER * new->nhelper; 608 new->ncmap = NCMAP_PER_HELPER * new->nhelper; 609 } 610 611 /* 612 * Allocate new data structures and buffers for MINHELPERS, 613 * and also figure the max desired size. 614 */ 615 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 616 new->maxsize = 0; 617 new->maxvmsize = 0; 618 new->maxvm = NULL; 619 tag = 1; 620 new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP); 621 hpend = &new->helper[new->nhelper]; 622 for (hp = new->helper; hp != hpend; hp++) { 623 hp->tag = tag++; 624 if (hp < &new->helper[MINHELPERS]) { 625 hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP); 626 hp->page = kmem_alloc(PAGESIZE, KM_SLEEP); 627 } else if (new->clevel < DUMP_CLEVEL_BZIP2) { 628 new->maxsize += 2 * PAGESIZE; 629 } else { 630 new->maxsize += PAGESIZE; 631 } 632 if (new->clevel >= DUMP_CLEVEL_BZIP2) 633 new->maxsize += bz2size; 634 } 635 636 new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP); 637 cpend = &new->cbuf[new->ncbuf]; 638 for (cp = new->cbuf; cp != cpend; cp++) { 639 cp->state = CBUF_FREEBUF; 640 cp->size = CBUF_SIZE; 641 if (cp < &new->cbuf[MINCBUFS]) 642 cp->buf = kmem_alloc(cp->size, KM_SLEEP); 643 else 644 new->maxsize += cp->size; 645 } 646 647 new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP); 648 cpend = &new->cmap[new->ncmap]; 649 for (cp = new->cmap; cp != cpend; cp++) { 650 cp->state = CBUF_FREEMAP; 651 cp->size = CBUF_MAPSIZE; 652 cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE, 653 0, 0, NULL, NULL, VM_SLEEP); 654 } 655 656 /* reserve VA to be backed with spare pages at crash time */ 657 if (new->maxsize > 0) { 658 new->maxsize 
= P2ROUNDUP(new->maxsize, PAGESIZE); 659 new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE); 660 new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize, 661 CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP); 662 } 663 664 /* 665 * Reserve memory for kmem allocation calls made during crash 666 * dump. The hat layer allocates memory for each mapping 667 * created, and the I/O path allocates buffers and data structs. 668 * Add a few pages for safety. 669 */ 670 kmem_dump_init((new->ncmap * dump_kmem_permap) + 671 (dump_kmem_pages * PAGESIZE)); 672 673 /* set new config pointers */ 674 *old = *new; 675 } 676 677 /* 678 * Define a struct memlist walker to optimize bitnum to pfn 679 * lookup. The walker maintains the state of the list traversal. 680 */ 681 typedef struct dumpmlw { 682 struct memlist *mp; /* current memlist */ 683 pgcnt_t basenum; /* bitnum base offset */ 684 pgcnt_t mppages; /* current memlist size */ 685 pgcnt_t mpleft; /* size to end of current memlist */ 686 pfn_t mpaddr; /* first pfn in memlist */ 687 } dumpmlw_t; 688 689 /* initialize the walker */ 690 static inline void 691 dump_init_memlist_walker(dumpmlw_t *pw) 692 { 693 pw->mp = phys_install; 694 pw->basenum = 0; 695 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 696 pw->mpleft = pw->mppages; 697 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 698 } 699 700 /* 701 * Lookup pfn given bitnum. The memlist can be quite long on some 702 * systems (e.g.: one per board). To optimize sequential lookups, the 703 * caller initializes and presents a memlist walker. 
704 */ 705 static pfn_t 706 dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw) 707 { 708 bitnum -= pw->basenum; 709 while (pw->mp != NULL) { 710 if (bitnum < pw->mppages) { 711 pw->mpleft = pw->mppages - bitnum; 712 return (pw->mpaddr + bitnum); 713 } 714 bitnum -= pw->mppages; 715 pw->basenum += pw->mppages; 716 pw->mp = pw->mp->ml_next; 717 if (pw->mp != NULL) { 718 pw->mppages = pw->mp->ml_size >> PAGESHIFT; 719 pw->mpleft = pw->mppages; 720 pw->mpaddr = pw->mp->ml_address >> PAGESHIFT; 721 } 722 } 723 return (PFN_INVALID); 724 } 725 726 static pgcnt_t 727 dump_pfn_to_bitnum(pfn_t pfn) 728 { 729 struct memlist *mp; 730 pgcnt_t bitnum = 0; 731 732 for (mp = phys_install; mp != NULL; mp = mp->ml_next) { 733 if (pfn >= (mp->ml_address >> PAGESHIFT) && 734 pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT)) 735 return (bitnum + pfn - (mp->ml_address >> PAGESHIFT)); 736 bitnum += mp->ml_size >> PAGESHIFT; 737 } 738 return ((pgcnt_t)-1); 739 } 740 741 /* 742 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The 743 * mapping of pfn to range index is imperfect because pfn and bitnum 744 * do not have the same phase. To make sure a CBUF_MAPSIZE range is 745 * covered, call this for both ends: 746 * dump_set_used(base) 747 * dump_set_used(base+CBUF_MAPNP-1) 748 * 749 * This is used during a panic dump to mark pages allocated by 750 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by 751 * page_get_mnode_freelist() to make sure pages used by dump are never 752 * allocated. 
753 */ 754 #define CBUF_MAPP2R(pfn) ((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT)) 755 756 static void 757 dump_set_used(pfn_t pfn) 758 { 759 760 pgcnt_t bitnum, rbitnum; 761 762 bitnum = dump_pfn_to_bitnum(pfn); 763 ASSERT(bitnum != (pgcnt_t)-1); 764 765 rbitnum = CBUF_MAPP2R(bitnum); 766 ASSERT(rbitnum < dumpcfg.rbitmapsize); 767 768 BT_SET(dumpcfg.rbitmap, rbitnum); 769 } 770 771 int 772 dump_test_used(pfn_t pfn) 773 { 774 pgcnt_t bitnum, rbitnum; 775 776 bitnum = dump_pfn_to_bitnum(pfn); 777 ASSERT(bitnum != (pgcnt_t)-1); 778 779 rbitnum = CBUF_MAPP2R(bitnum); 780 ASSERT(rbitnum < dumpcfg.rbitmapsize); 781 782 return (BT_TEST(dumpcfg.rbitmap, rbitnum)); 783 } 784 785 /* 786 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library. 787 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit(). 788 */ 789 static void * 790 dumpbzalloc(void *opaque, int items, int size) 791 { 792 size_t *sz; 793 char *ret; 794 795 ASSERT(opaque != NULL); 796 sz = opaque; 797 ret = dumpcfg.maxvm + *sz; 798 *sz += items * size; 799 *sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN); 800 ASSERT(*sz <= dumpcfg.maxvmsize); 801 return (ret); 802 } 803 804 /*ARGSUSED*/ 805 static void 806 dumpbzfree(void *opaque, void *addr) 807 { 808 } 809 810 /* 811 * Perform additional checks on the page to see if we can really use 812 * it. The kernel (kas) pages are always set in the bitmap. However, 813 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the 814 * bitmap. So we check for them. 815 */ 816 static inline int 817 dump_pfn_check(pfn_t pfn) 818 { 819 page_t *pp = page_numtopp_nolock(pfn); 820 if (pp == NULL || pp->p_pagenum != pfn || 821 #if defined(__sparc) 822 pp->p_vnode == &promvp || 823 #else 824 PP_ISBOOTPAGES(pp) || 825 #endif 826 pp->p_toxic != 0) 827 return (0); 828 return (1); 829 } 830 831 /* 832 * Check a range to see if all contained pages are available and 833 * return non-zero if the range can be used. 
834 */ 835 static inline int 836 dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn) 837 { 838 for (; start < end; start++, pfn++) { 839 if (BT_TEST(dumpcfg.bitmap, start)) 840 return (0); 841 if (!dump_pfn_check(pfn)) 842 return (0); 843 } 844 return (1); 845 } 846 847 /* 848 * dumpsys_get_maxmem() is called during panic. Find unused ranges 849 * and use them for buffers. If we find enough memory switch to 850 * parallel bzip2, otherwise use parallel lzjb. 851 * 852 * It searches the dump bitmap in 2 passes. The first time it looks 853 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages. 854 */ 855 static void 856 dumpsys_get_maxmem() 857 { 858 dumpcfg_t *cfg = &dumpcfg; 859 cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf]; 860 helper_t *endhp = &cfg->helper[cfg->nhelper]; 861 pgcnt_t bitnum, end; 862 size_t sz, endsz, bz2size; 863 pfn_t pfn, off; 864 cbuf_t *cp; 865 helper_t *hp, *ohp; 866 dumpmlw_t mlw; 867 int k; 868 869 if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB || 870 (dump_conflags & DUMP_ALL) != 0) 871 return; 872 873 sz = 0; 874 cfg->found4m = 0; 875 cfg->foundsm = 0; 876 877 /* bitmap of ranges used to estimate which pfns are being used */ 878 bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize)); 879 880 /* find ranges that are not being dumped to use for buffers */ 881 dump_init_memlist_walker(&mlw); 882 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 883 dump_timeleft = dump_timeout; 884 end = bitnum + CBUF_MAPNP; 885 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 886 ASSERT(pfn != PFN_INVALID); 887 888 /* skip partial range at end of mem segment */ 889 if (mlw.mpleft < CBUF_MAPNP) { 890 end = bitnum + mlw.mpleft; 891 continue; 892 } 893 894 /* skip non aligned pages */ 895 off = P2PHASE(pfn, CBUF_MAPNP); 896 if (off != 0) { 897 end -= off; 898 continue; 899 } 900 901 if (!dump_range_check(bitnum, end, pfn)) 902 continue; 903 904 ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize); 905 hat_devload(kas.a_hat, cfg->maxvm + sz, 
CBUF_MAPSIZE, pfn, 906 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 907 sz += CBUF_MAPSIZE; 908 cfg->found4m++; 909 910 /* set the bitmap for both ends to be sure to cover the range */ 911 dump_set_used(pfn); 912 dump_set_used(pfn + CBUF_MAPNP - 1); 913 914 if (sz >= cfg->maxsize) 915 goto foundmax; 916 } 917 918 /* Add small pages if we can't find enough large pages. */ 919 dump_init_memlist_walker(&mlw); 920 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) { 921 dump_timeleft = dump_timeout; 922 end = bitnum + CBUF_MAPNP; 923 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 924 ASSERT(pfn != PFN_INVALID); 925 926 /* Find any non-aligned pages at start and end of segment. */ 927 off = P2PHASE(pfn, CBUF_MAPNP); 928 if (mlw.mpleft < CBUF_MAPNP) { 929 end = bitnum + mlw.mpleft; 930 } else if (off != 0) { 931 end -= off; 932 } else if (cfg->found4m && dump_test_used(pfn)) { 933 continue; 934 } 935 936 for (; bitnum < end; bitnum++, pfn++) { 937 dump_timeleft = dump_timeout; 938 if (BT_TEST(dumpcfg.bitmap, bitnum)) 939 continue; 940 if (!dump_pfn_check(pfn)) 941 continue; 942 ASSERT((sz + PAGESIZE) <= cfg->maxvmsize); 943 hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn, 944 PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST); 945 sz += PAGESIZE; 946 cfg->foundsm++; 947 dump_set_used(pfn); 948 if (sz >= cfg->maxsize) 949 goto foundmax; 950 } 951 } 952 953 /* Fall back to lzjb if we did not get enough memory for bzip2. */ 954 endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper; 955 if (sz < endsz) { 956 cfg->clevel = DUMP_CLEVEL_LZJB; 957 } 958 959 /* Allocate memory for as many helpers as we can. */ 960 foundmax: 961 962 /* Byte offsets into memory found and mapped above */ 963 endsz = sz; 964 sz = 0; 965 966 /* Set the size for bzip2 state. Only bzip2 needs it. */ 967 bz2size = BZ2_bzCompressInitSize(dump_bzip2_level); 968 969 /* Skip the preallocate output buffers. 
*/ 970 cp = &cfg->cbuf[MINCBUFS]; 971 972 /* Use this to move memory up from the preallocated helpers. */ 973 ohp = cfg->helper; 974 975 /* Loop over all helpers and allocate memory. */ 976 for (hp = cfg->helper; hp < endhp; hp++) { 977 978 /* Skip preallocated helpers by checking hp->page. */ 979 if (hp->page == NULL) { 980 if (cfg->clevel <= DUMP_CLEVEL_LZJB) { 981 /* lzjb needs 2 1-page buffers */ 982 if ((sz + (2 * PAGESIZE)) > endsz) 983 break; 984 hp->page = cfg->maxvm + sz; 985 sz += PAGESIZE; 986 hp->lzbuf = cfg->maxvm + sz; 987 sz += PAGESIZE; 988 989 } else if (ohp->lzbuf != NULL) { 990 /* re-use the preallocted lzjb page for bzip2 */ 991 hp->page = ohp->lzbuf; 992 ohp->lzbuf = NULL; 993 ++ohp; 994 995 } else { 996 /* bzip2 needs a 1-page buffer */ 997 if ((sz + PAGESIZE) > endsz) 998 break; 999 hp->page = cfg->maxvm + sz; 1000 sz += PAGESIZE; 1001 } 1002 } 1003 1004 /* 1005 * Add output buffers per helper. The number of 1006 * buffers per helper is determined by the ratio of 1007 * ncbuf to nhelper. 1008 */ 1009 for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz && 1010 k < NCBUF_PER_HELPER; k++) { 1011 cp->state = CBUF_FREEBUF; 1012 cp->size = CBUF_SIZE; 1013 cp->buf = cfg->maxvm + sz; 1014 sz += CBUF_SIZE; 1015 ++cp; 1016 } 1017 1018 /* 1019 * bzip2 needs compression state. Use the dumpbzalloc 1020 * and dumpbzfree callbacks to allocate the memory. 1021 * bzip2 does allocation only at init time. 
1022 */ 1023 if (cfg->clevel >= DUMP_CLEVEL_BZIP2) { 1024 if ((sz + bz2size) > endsz) { 1025 hp->page = NULL; 1026 break; 1027 } else { 1028 hp->bzstream.opaque = &sz; 1029 hp->bzstream.bzalloc = dumpbzalloc; 1030 hp->bzstream.bzfree = dumpbzfree; 1031 (void) BZ2_bzCompressInit(&hp->bzstream, 1032 dump_bzip2_level, 0, 0); 1033 hp->bzstream.opaque = NULL; 1034 } 1035 } 1036 } 1037 1038 /* Finish allocating output buffers */ 1039 for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) { 1040 cp->state = CBUF_FREEBUF; 1041 cp->size = CBUF_SIZE; 1042 cp->buf = cfg->maxvm + sz; 1043 sz += CBUF_SIZE; 1044 } 1045 1046 /* Enable IS_DUMP_PAGE macro, which checks for pages we took. */ 1047 if (cfg->found4m || cfg->foundsm) 1048 dump_check_used = 1; 1049 1050 ASSERT(sz <= endsz); 1051 } 1052 1053 static void 1054 dumphdr_init(void) 1055 { 1056 pgcnt_t npages = 0; 1057 1058 ASSERT(MUTEX_HELD(&dump_lock)); 1059 1060 if (dumphdr == NULL) { 1061 dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP); 1062 dumphdr->dump_magic = DUMP_MAGIC; 1063 dumphdr->dump_version = DUMP_VERSION; 1064 dumphdr->dump_wordsize = DUMP_WORDSIZE; 1065 dumphdr->dump_pageshift = PAGESHIFT; 1066 dumphdr->dump_pagesize = PAGESIZE; 1067 dumphdr->dump_utsname = utsname; 1068 (void) strcpy(dumphdr->dump_platform, platform); 1069 dumpbuf.size = dumpbuf_iosize(maxphys); 1070 dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); 1071 dumpbuf.end = dumpbuf.start + dumpbuf.size; 1072 dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); 1073 dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); 1074 LOCK_INIT_HELD(&dumpcfg.helper_lock); 1075 } 1076 1077 npages = num_phys_pages(); 1078 1079 if (dumpcfg.bitmapsize != npages) { 1080 size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP)); 1081 void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP); 1082 void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP); 1083 1084 if (dumpcfg.bitmap != NULL) 1085 kmem_free(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg. 
1086 bitmapsize)); 1087 if (dumpcfg.rbitmap != NULL) 1088 kmem_free(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg. 1089 rbitmapsize)); 1090 dumpcfg.bitmap = map; 1091 dumpcfg.bitmapsize = npages; 1092 dumpcfg.rbitmap = rmap; 1093 dumpcfg.rbitmapsize = rlen; 1094 } 1095 } 1096 1097 /* 1098 * Establish a new dump device. 1099 */ 1100 int 1101 dumpinit(vnode_t *vp, char *name, int justchecking) 1102 { 1103 vnode_t *cvp; 1104 vattr_t vattr; 1105 vnode_t *cdev_vp; 1106 int error = 0; 1107 1108 ASSERT(MUTEX_HELD(&dump_lock)); 1109 1110 dumphdr_init(); 1111 1112 cvp = common_specvp(vp); 1113 if (cvp == dumpvp) 1114 return (0); 1115 1116 /* 1117 * Determine whether this is a plausible dump device. We want either: 1118 * (1) a real device that's not mounted and has a cb_dump routine, or 1119 * (2) a swapfile on some filesystem that has a vop_dump routine. 1120 */ 1121 if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0) 1122 return (error); 1123 1124 vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV; 1125 if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) { 1126 if (vattr.va_type == VBLK || vattr.va_type == VCHR) { 1127 if (devopsp[getmajor(vattr.va_rdev)]-> 1128 devo_cb_ops->cb_dump == nodev) 1129 error = ENOTSUP; 1130 else if (vfs_devismounted(vattr.va_rdev)) 1131 error = EBUSY; 1132 if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip), 1133 ZFS_DRIVER) == 0 && 1134 IS_SWAPVP(common_specvp(cvp))) 1135 error = EBUSY; 1136 } else { 1137 if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) || 1138 !IS_SWAPVP(cvp)) 1139 error = ENOTSUP; 1140 } 1141 } 1142 1143 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) 1144 error = ENOSPC; 1145 1146 if (error || justchecking) { 1147 (void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0, 1148 kcred, NULL); 1149 return (error); 1150 } 1151 1152 VN_HOLD(cvp); 1153 1154 if (dumpvp != NULL) 1155 dumpfini(); /* unconfigure the old dump device */ 1156 1157 dumpvp = cvp; 1158 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 1159 dumppath 
= kmem_alloc(strlen(name) + 1, KM_SLEEP); 1160 (void) strcpy(dumppath, name); 1161 dumpbuf.iosize = 0; 1162 1163 /* 1164 * If the dump device is a block device, attempt to open up the 1165 * corresponding character device and determine its maximum transfer 1166 * size. We use this information to potentially resize dumpbuf to a 1167 * larger and more optimal size for performing i/o to the dump device. 1168 */ 1169 if (cvp->v_type == VBLK && 1170 (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) { 1171 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1172 size_t blk_size; 1173 struct dk_cinfo dki; 1174 struct dk_minfo minf; 1175 1176 if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO, 1177 (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL) 1178 == 0 && minf.dki_lbsize != 0) 1179 blk_size = minf.dki_lbsize; 1180 else 1181 blk_size = DEV_BSIZE; 1182 1183 if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki, 1184 FKIOCTL, kcred, NULL, NULL) == 0) { 1185 dumpbuf.iosize = dki.dki_maxtransfer * blk_size; 1186 dumpbuf_resize(); 1187 } 1188 /* 1189 * If we are working with a zvol then dumpify it 1190 * if it's not being used as swap. 
1191 */ 1192 if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) { 1193 if (IS_SWAPVP(common_specvp(cvp))) 1194 error = EBUSY; 1195 else if ((error = VOP_IOCTL(cdev_vp, 1196 DKIOCDUMPINIT, NULL, FKIOCTL, kcred, 1197 NULL, NULL)) != 0) 1198 dumpfini(); 1199 } 1200 1201 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1202 kcred, NULL); 1203 } 1204 1205 VN_RELE(cdev_vp); 1206 } 1207 1208 cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20); 1209 1210 dump_update_clevel(); 1211 1212 return (error); 1213 } 1214 1215 void 1216 dumpfini(void) 1217 { 1218 vattr_t vattr; 1219 boolean_t is_zfs = B_FALSE; 1220 vnode_t *cdev_vp; 1221 ASSERT(MUTEX_HELD(&dump_lock)); 1222 1223 kmem_free(dumppath, strlen(dumppath) + 1); 1224 1225 /* 1226 * Determine if we are using zvols for our dump device 1227 */ 1228 vattr.va_mask = AT_RDEV; 1229 if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) { 1230 is_zfs = (getmajor(vattr.va_rdev) == 1231 ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE; 1232 } 1233 1234 /* 1235 * If we have a zvol dump device then we call into zfs so 1236 * that it may have a chance to cleanup. 
1237 */ 1238 if (is_zfs && 1239 (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) { 1240 if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) { 1241 (void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL, 1242 kcred, NULL, NULL); 1243 (void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0, 1244 kcred, NULL); 1245 } 1246 VN_RELE(cdev_vp); 1247 } 1248 1249 (void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL); 1250 1251 VN_RELE(dumpvp); 1252 1253 dumpvp = NULL; 1254 dumpvp_size = 0; 1255 dumppath = NULL; 1256 } 1257 1258 static offset_t 1259 dumpvp_flush(void) 1260 { 1261 size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE); 1262 hrtime_t iotime; 1263 int err; 1264 1265 if (dumpbuf.vp_off + size > dumpbuf.vp_limit) { 1266 dump_ioerr = ENOSPC; 1267 dumpbuf.vp_off = dumpbuf.vp_limit; 1268 } else if (size != 0) { 1269 iotime = gethrtime(); 1270 dumpsync.iowait += iotime - dumpsync.iowaitts; 1271 if (panicstr) 1272 err = VOP_DUMP(dumpvp, dumpbuf.start, 1273 lbtodb(dumpbuf.vp_off), btod(size), NULL); 1274 else 1275 err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ? 
1276 dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size, 1277 dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit, 1278 kcred, 0); 1279 if (err && dump_ioerr == 0) 1280 dump_ioerr = err; 1281 dumpsync.iowaitts = gethrtime(); 1282 dumpsync.iotime += dumpsync.iowaitts - iotime; 1283 dumpsync.nwrite += size; 1284 dumpbuf.vp_off += size; 1285 } 1286 dumpbuf.cur = dumpbuf.start; 1287 dump_timeleft = dump_timeout; 1288 return (dumpbuf.vp_off); 1289 } 1290 1291 /* maximize write speed by keeping seek offset aligned with size */ 1292 void 1293 dumpvp_write(const void *va, size_t size) 1294 { 1295 size_t len, off, sz; 1296 1297 while (size != 0) { 1298 len = MIN(size, dumpbuf.end - dumpbuf.cur); 1299 if (len == 0) { 1300 off = P2PHASE(dumpbuf.vp_off, dumpbuf.size); 1301 if (off == 0 || !ISP2(dumpbuf.size)) { 1302 (void) dumpvp_flush(); 1303 } else { 1304 sz = dumpbuf.size - off; 1305 dumpbuf.cur = dumpbuf.start + sz; 1306 (void) dumpvp_flush(); 1307 ovbcopy(dumpbuf.start + sz, dumpbuf.start, off); 1308 dumpbuf.cur += off; 1309 } 1310 } else { 1311 bcopy(va, dumpbuf.cur, len); 1312 va = (char *)va + len; 1313 dumpbuf.cur += len; 1314 size -= len; 1315 } 1316 } 1317 } 1318 1319 /*ARGSUSED*/ 1320 static void 1321 dumpvp_ksyms_write(const void *src, void *dst, size_t size) 1322 { 1323 dumpvp_write(src, size); 1324 } 1325 1326 /* 1327 * Mark 'pfn' in the bitmap and dump its translation table entry. 
1328 */ 1329 void 1330 dump_addpage(struct as *as, void *va, pfn_t pfn) 1331 { 1332 mem_vtop_t mem_vtop; 1333 pgcnt_t bitnum; 1334 1335 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1336 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1337 dumphdr->dump_npages++; 1338 BT_SET(dumpcfg.bitmap, bitnum); 1339 } 1340 dumphdr->dump_nvtop++; 1341 mem_vtop.m_as = as; 1342 mem_vtop.m_va = va; 1343 mem_vtop.m_pfn = pfn; 1344 dumpvp_write(&mem_vtop, sizeof (mem_vtop_t)); 1345 } 1346 dump_timeleft = dump_timeout; 1347 } 1348 1349 /* 1350 * Mark 'pfn' in the bitmap 1351 */ 1352 void 1353 dump_page(pfn_t pfn) 1354 { 1355 pgcnt_t bitnum; 1356 1357 if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) { 1358 if (!BT_TEST(dumpcfg.bitmap, bitnum)) { 1359 dumphdr->dump_npages++; 1360 BT_SET(dumpcfg.bitmap, bitnum); 1361 } 1362 } 1363 dump_timeleft = dump_timeout; 1364 } 1365 1366 /* 1367 * Dump the <as, va, pfn> information for a given address space. 1368 * SEGOP_DUMP() will call dump_addpage() for each page in the segment. 
1369 */ 1370 static void 1371 dump_as(struct as *as) 1372 { 1373 struct seg *seg; 1374 1375 AS_LOCK_ENTER(as, &as->a_lock, RW_READER); 1376 for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) { 1377 if (seg->s_as != as) 1378 break; 1379 if (seg->s_ops == NULL) 1380 continue; 1381 SEGOP_DUMP(seg); 1382 } 1383 AS_LOCK_EXIT(as, &as->a_lock); 1384 1385 if (seg != NULL) 1386 cmn_err(CE_WARN, "invalid segment %p in address space %p", 1387 (void *)seg, (void *)as); 1388 } 1389 1390 static int 1391 dump_process(pid_t pid) 1392 { 1393 proc_t *p = sprlock(pid); 1394 1395 if (p == NULL) 1396 return (-1); 1397 if (p->p_as != &kas) { 1398 mutex_exit(&p->p_lock); 1399 dump_as(p->p_as); 1400 mutex_enter(&p->p_lock); 1401 } 1402 1403 sprunlock(p); 1404 1405 return (0); 1406 } 1407 1408 void 1409 dump_ereports(void) 1410 { 1411 u_offset_t dumpvp_start; 1412 erpt_dump_t ed; 1413 1414 if (dumpvp == NULL || dumphdr == NULL) 1415 return; 1416 1417 dumpbuf.cur = dumpbuf.start; 1418 dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE); 1419 dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE; 1420 dumpbuf.vp_off = dumpvp_start; 1421 1422 fm_ereport_dump(); 1423 if (panicstr) 1424 errorq_dump(); 1425 1426 bzero(&ed, sizeof (ed)); /* indicate end of ereports */ 1427 dumpvp_write(&ed, sizeof (ed)); 1428 (void) dumpvp_flush(); 1429 1430 if (!panicstr) { 1431 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1432 (size_t)(dumpbuf.vp_off - dumpvp_start), 1433 B_INVAL | B_FORCE, kcred, NULL); 1434 } 1435 } 1436 1437 void 1438 dump_messages(void) 1439 { 1440 log_dump_t ld; 1441 mblk_t *mctl, *mdata; 1442 queue_t *q, *qlast; 1443 u_offset_t dumpvp_start; 1444 1445 if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL) 1446 return; 1447 1448 dumpbuf.cur = dumpbuf.start; 1449 dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET; 1450 dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE; 1451 dumpbuf.vp_off = dumpvp_start; 1452 1453 qlast = NULL; 1454 do { 1455 for (q = log_consq; q->q_next != qlast; q = 
q->q_next) 1456 continue; 1457 for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) { 1458 dump_timeleft = dump_timeout; 1459 mdata = mctl->b_cont; 1460 ld.ld_magic = LOG_MAGIC; 1461 ld.ld_msgsize = MBLKL(mctl->b_cont); 1462 ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl)); 1463 ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata)); 1464 dumpvp_write(&ld, sizeof (ld)); 1465 dumpvp_write(mctl->b_rptr, MBLKL(mctl)); 1466 dumpvp_write(mdata->b_rptr, MBLKL(mdata)); 1467 } 1468 } while ((qlast = q) != log_consq); 1469 1470 ld.ld_magic = 0; /* indicate end of messages */ 1471 dumpvp_write(&ld, sizeof (ld)); 1472 (void) dumpvp_flush(); 1473 if (!panicstr) { 1474 (void) VOP_PUTPAGE(dumpvp, dumpvp_start, 1475 (size_t)(dumpbuf.vp_off - dumpvp_start), 1476 B_INVAL | B_FORCE, kcred, NULL); 1477 } 1478 } 1479 1480 /* 1481 * The following functions are called on multiple CPUs during dump. 1482 * They must not use most kernel services, because all cross-calls are 1483 * disabled during panic. Therefore, blocking locks and cache flushes 1484 * will not work. 1485 */ 1486 1487 /* 1488 * Copy pages, trapping ECC errors. Also, for robustness, trap data 1489 * access in case something goes wrong in the hat layer and the 1490 * mapping is broken. 
1491 */ 1492 static int 1493 dump_pagecopy(void *src, void *dst) 1494 { 1495 long *wsrc = (long *)src; 1496 long *wdst = (long *)dst; 1497 const ulong_t ncopies = PAGESIZE / sizeof (long); 1498 volatile int w = 0; 1499 volatile int ueoff = -1; 1500 on_trap_data_t otd; 1501 1502 if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) { 1503 if (ueoff == -1) 1504 ueoff = w * sizeof (long); 1505 /* report "bad ECC" or "bad address" */ 1506 #ifdef _LP64 1507 if (otd.ot_trap & OT_DATA_EC) 1508 wdst[w++] = 0x00badecc00badecc; 1509 else 1510 wdst[w++] = 0x00badadd00badadd; 1511 #else 1512 if (otd.ot_trap & OT_DATA_EC) 1513 wdst[w++] = 0x00badecc; 1514 else 1515 wdst[w++] = 0x00badadd; 1516 #endif 1517 } 1518 while (w < ncopies) { 1519 wdst[w] = wsrc[w]; 1520 w++; 1521 } 1522 no_trap(); 1523 return (ueoff); 1524 } 1525 1526 static void 1527 dumpsys_close_cq(cqueue_t *cq, int live) 1528 { 1529 if (live) { 1530 mutex_enter(&cq->mutex); 1531 atomic_dec_uint(&cq->open); 1532 cv_signal(&cq->cv); 1533 mutex_exit(&cq->mutex); 1534 } else { 1535 atomic_dec_uint(&cq->open); 1536 } 1537 } 1538 1539 static inline void 1540 dumpsys_spinlock(lock_t *lp) 1541 { 1542 uint_t backoff = 0; 1543 int loop_count = 0; 1544 1545 while (LOCK_HELD(lp) || !lock_spin_try(lp)) { 1546 if (++loop_count >= ncpus) { 1547 backoff = mutex_lock_backoff(0); 1548 loop_count = 0; 1549 } else { 1550 backoff = mutex_lock_backoff(backoff); 1551 } 1552 mutex_lock_delay(backoff); 1553 } 1554 } 1555 1556 static inline void 1557 dumpsys_spinunlock(lock_t *lp) 1558 { 1559 lock_clear(lp); 1560 } 1561 1562 static inline void 1563 dumpsys_lock(cqueue_t *cq, int live) 1564 { 1565 if (live) 1566 mutex_enter(&cq->mutex); 1567 else 1568 dumpsys_spinlock(&cq->spinlock); 1569 } 1570 1571 static inline void 1572 dumpsys_unlock(cqueue_t *cq, int live, int signal) 1573 { 1574 if (live) { 1575 if (signal) 1576 cv_signal(&cq->cv); 1577 mutex_exit(&cq->mutex); 1578 } else { 1579 dumpsys_spinunlock(&cq->spinlock); 1580 } 1581 } 1582 1583 
static void 1584 dumpsys_wait_cq(cqueue_t *cq, int live) 1585 { 1586 if (live) { 1587 cv_wait(&cq->cv, &cq->mutex); 1588 } else { 1589 dumpsys_spinunlock(&cq->spinlock); 1590 while (cq->open) 1591 if (cq->first) 1592 break; 1593 dumpsys_spinlock(&cq->spinlock); 1594 } 1595 } 1596 1597 static void 1598 dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live) 1599 { 1600 if (cp == NULL) 1601 return; 1602 1603 dumpsys_lock(cq, live); 1604 1605 if (cq->ts != 0) { 1606 cq->empty += gethrtime() - cq->ts; 1607 cq->ts = 0; 1608 } 1609 1610 cp->state = newstate; 1611 cp->next = NULL; 1612 if (cq->last == NULL) 1613 cq->first = cp; 1614 else 1615 cq->last->next = cp; 1616 cq->last = cp; 1617 1618 dumpsys_unlock(cq, live, 1); 1619 } 1620 1621 static cbuf_t * 1622 dumpsys_get_cq(cqueue_t *cq, int live) 1623 { 1624 cbuf_t *cp; 1625 hrtime_t now = gethrtime(); 1626 1627 dumpsys_lock(cq, live); 1628 1629 /* CONSTCOND */ 1630 while (1) { 1631 cp = (cbuf_t *)cq->first; 1632 if (cp == NULL) { 1633 if (cq->open == 0) 1634 break; 1635 dumpsys_wait_cq(cq, live); 1636 continue; 1637 } 1638 cq->first = cp->next; 1639 if (cq->first == NULL) { 1640 cq->last = NULL; 1641 cq->ts = now; 1642 } 1643 break; 1644 } 1645 1646 dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0); 1647 return (cp); 1648 } 1649 1650 /* 1651 * Send an error message to the console. If the main task is running 1652 * just write the message via uprintf. If a helper is running the 1653 * message has to be put on a queue for the main task. Setting fmt to 1654 * NULL means flush the error message buffer. If fmt is not NULL, just 1655 * add the text to the existing buffer. 1656 */ 1657 static void 1658 dumpsys_errmsg(helper_t *hp, const char *fmt, ...) 
1659 { 1660 dumpsync_t *ds = hp->ds; 1661 cbuf_t *cp = hp->cperr; 1662 va_list adx; 1663 1664 if (hp->helper == MAINHELPER) { 1665 if (fmt != NULL) { 1666 if (ds->neednl) { 1667 uprintf("\n"); 1668 ds->neednl = 0; 1669 } 1670 va_start(adx, fmt); 1671 vuprintf(fmt, adx); 1672 va_end(adx); 1673 } 1674 } else if (fmt == NULL) { 1675 if (cp != NULL) { 1676 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1677 hp->cperr = NULL; 1678 } 1679 } else { 1680 if (hp->cperr == NULL) { 1681 cp = CQ_GET(freebufq); 1682 hp->cperr = cp; 1683 cp->used = 0; 1684 } 1685 va_start(adx, fmt); 1686 cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used, 1687 fmt, adx); 1688 va_end(adx); 1689 if ((cp->used + LOG_MSGSIZE) > cp->size) { 1690 CQ_PUT(mainq, cp, CBUF_ERRMSG); 1691 hp->cperr = NULL; 1692 } 1693 } 1694 } 1695 1696 /* 1697 * Write an output buffer to the dump file. If the main task is 1698 * running just write the data. If a helper is running the output is 1699 * placed on a queue for the main task. 1700 */ 1701 static void 1702 dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used) 1703 { 1704 dumpsync_t *ds = hp->ds; 1705 1706 if (hp->helper == MAINHELPER) { 1707 HRSTART(ds->perpage, write); 1708 dumpvp_write(cp->buf, used); 1709 HRSTOP(ds->perpage, write); 1710 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 1711 } else { 1712 cp->used = used; 1713 CQ_PUT(mainq, cp, CBUF_WRITE); 1714 } 1715 } 1716 1717 /* 1718 * Copy one page within the mapped range. The offset starts at 0 and 1719 * is relative to the first pfn. cp->buf + cp->off is the address of 1720 * the first pfn. If dump_pagecopy returns a UE offset, create an 1721 * error message. Returns the offset to the next pfn in the range 1722 * selected by the bitmap. 
1723 */ 1724 static int 1725 dumpsys_copy_page(helper_t *hp, int offset) 1726 { 1727 cbuf_t *cp = hp->cpin; 1728 int ueoff; 1729 1730 ASSERT(cp->off + offset + PAGESIZE <= cp->size); 1731 ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum)); 1732 1733 ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page); 1734 1735 /* ueoff is the offset in the page to a UE error */ 1736 if (ueoff != -1) { 1737 uint64_t pa = ptob(cp->pfn) + offset + ueoff; 1738 1739 dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n", 1740 CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa); 1741 } 1742 1743 /* 1744 * Advance bitnum and offset to the next input page for the 1745 * next call to this function. 1746 */ 1747 offset += PAGESIZE; 1748 cp->bitnum++; 1749 while (cp->off + offset < cp->size) { 1750 if (BT_TEST(dumpcfg.bitmap, cp->bitnum)) 1751 break; 1752 offset += PAGESIZE; 1753 cp->bitnum++; 1754 } 1755 1756 return (offset); 1757 } 1758 1759 /* 1760 * Read the helper queue, and copy one mapped page. Return 0 when 1761 * done. Return 1 when a page has been copied into hp->page. 1762 */ 1763 static int 1764 dumpsys_sread(helper_t *hp) 1765 { 1766 dumpsync_t *ds = hp->ds; 1767 1768 /* CONSTCOND */ 1769 while (1) { 1770 1771 /* Find the next input buffer. */ 1772 if (hp->cpin == NULL) { 1773 HRSTART(hp->perpage, inwait); 1774 1775 /* CONSTCOND */ 1776 while (1) { 1777 hp->cpin = CQ_GET(helperq); 1778 dump_timeleft = dump_timeout; 1779 1780 /* 1781 * NULL return means the helper queue 1782 * is closed and empty. 1783 */ 1784 if (hp->cpin == NULL) 1785 break; 1786 1787 /* Have input, check for dump I/O error. */ 1788 if (!dump_ioerr) 1789 break; 1790 1791 /* 1792 * If an I/O error occurs, stay in the 1793 * loop in order to empty the helper 1794 * queue. Return the buffers to the 1795 * main task to unmap and free it. 1796 */ 1797 hp->cpin->used = 0; 1798 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1799 } 1800 HRSTOP(hp->perpage, inwait); 1801 1802 /* Stop here when the helper queue is closed. 
*/ 1803 if (hp->cpin == NULL) 1804 break; 1805 1806 /* Set the offset=0 to get the first pfn. */ 1807 hp->in = 0; 1808 1809 /* Set the total processed to 0 */ 1810 hp->used = 0; 1811 } 1812 1813 /* Process the next page. */ 1814 if (hp->used < hp->cpin->used) { 1815 1816 /* 1817 * Get the next page from the input buffer and 1818 * return a copy. 1819 */ 1820 ASSERT(hp->in != -1); 1821 HRSTART(hp->perpage, copy); 1822 hp->in = dumpsys_copy_page(hp, hp->in); 1823 hp->used += PAGESIZE; 1824 HRSTOP(hp->perpage, copy); 1825 break; 1826 1827 } else { 1828 1829 /* 1830 * Done with the input. Flush the VM and 1831 * return the buffer to the main task. 1832 */ 1833 if (panicstr && hp->helper != MAINHELPER) 1834 hat_flush_range(kas.a_hat, 1835 hp->cpin->buf, hp->cpin->size); 1836 dumpsys_errmsg(hp, NULL); 1837 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 1838 hp->cpin = NULL; 1839 } 1840 } 1841 1842 return (hp->cpin != NULL); 1843 } 1844 1845 /* 1846 * Compress size bytes starting at buf with bzip2 1847 * mode: 1848 * BZ_RUN add one more compressed page 1849 * BZ_FINISH no more input, flush the state 1850 */ 1851 static void 1852 dumpsys_bzrun(helper_t *hp, void *buf, size_t size, int mode) 1853 { 1854 dumpsync_t *ds = hp->ds; 1855 const int CSIZE = sizeof (dumpcsize_t); 1856 bz_stream *ps = &hp->bzstream; 1857 int rc = 0; 1858 uint32_t csize; 1859 dumpcsize_t cs; 1860 1861 /* Set input pointers to new input page */ 1862 if (size > 0) { 1863 ps->avail_in = size; 1864 ps->next_in = buf; 1865 } 1866 1867 /* CONSTCOND */ 1868 while (1) { 1869 1870 /* Quit when all input has been consumed */ 1871 if (ps->avail_in == 0 && mode == BZ_RUN) 1872 break; 1873 1874 /* Get a new output buffer */ 1875 if (hp->cpout == NULL) { 1876 HRSTART(hp->perpage, outwait); 1877 hp->cpout = CQ_GET(freebufq); 1878 HRSTOP(hp->perpage, outwait); 1879 ps->avail_out = hp->cpout->size - CSIZE; 1880 ps->next_out = hp->cpout->buf + CSIZE; 1881 } 1882 1883 /* Compress input, or finalize */ 1884 HRSTART(hp->perpage, 
compress); 1885 rc = BZ2_bzCompress(ps, mode); 1886 HRSTOP(hp->perpage, compress); 1887 1888 /* Check for error */ 1889 if (mode == BZ_RUN && rc != BZ_RUN_OK) { 1890 dumpsys_errmsg(hp, "%d: BZ_RUN error %s at page %lx\n", 1891 hp->helper, BZ2_bzErrorString(rc), 1892 hp->cpin->pagenum); 1893 break; 1894 } 1895 1896 /* Write the buffer if it is full, or we are flushing */ 1897 if (ps->avail_out == 0 || mode == BZ_FINISH) { 1898 csize = hp->cpout->size - CSIZE - ps->avail_out; 1899 cs = DUMP_SET_TAG(csize, hp->tag); 1900 if (csize > 0) { 1901 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 1902 dumpsys_swrite(hp, hp->cpout, csize + CSIZE); 1903 hp->cpout = NULL; 1904 } 1905 } 1906 1907 /* Check for final complete */ 1908 if (mode == BZ_FINISH) { 1909 if (rc == BZ_STREAM_END) 1910 break; 1911 if (rc != BZ_FINISH_OK) { 1912 dumpsys_errmsg(hp, "%d: BZ_FINISH error %s\n", 1913 hp->helper, BZ2_bzErrorString(rc)); 1914 break; 1915 } 1916 } 1917 } 1918 1919 /* Cleanup state and buffers */ 1920 if (mode == BZ_FINISH) { 1921 1922 /* Reset state so that it is re-usable. 
*/ 1923 (void) BZ2_bzCompressReset(&hp->bzstream); 1924 1925 /* Give any unused outout buffer to the main task */ 1926 if (hp->cpout != NULL) { 1927 hp->cpout->used = 0; 1928 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG); 1929 hp->cpout = NULL; 1930 } 1931 } 1932 } 1933 1934 static void 1935 dumpsys_bz2compress(helper_t *hp) 1936 { 1937 dumpsync_t *ds = hp->ds; 1938 dumpstreamhdr_t sh; 1939 1940 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 1941 sh.stream_pagenum = (pgcnt_t)-1; 1942 sh.stream_npages = 0; 1943 hp->cpin = NULL; 1944 hp->cpout = NULL; 1945 hp->cperr = NULL; 1946 hp->in = 0; 1947 hp->out = 0; 1948 hp->bzstream.avail_in = 0; 1949 1950 /* Bump reference to mainq while we are running */ 1951 CQ_OPEN(mainq); 1952 1953 /* Get one page at a time */ 1954 while (dumpsys_sread(hp)) { 1955 if (sh.stream_pagenum != hp->cpin->pagenum) { 1956 sh.stream_pagenum = hp->cpin->pagenum; 1957 sh.stream_npages = btop(hp->cpin->used); 1958 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN); 1959 } 1960 dumpsys_bzrun(hp, hp->page, PAGESIZE, 0); 1961 } 1962 1963 /* Done with input, flush any partial buffer */ 1964 if (sh.stream_pagenum != (pgcnt_t)-1) { 1965 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH); 1966 dumpsys_errmsg(hp, NULL); 1967 } 1968 1969 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 1970 1971 /* Decrement main queue count, we are done */ 1972 CQ_CLOSE(mainq); 1973 } 1974 1975 /* 1976 * Compress with lzjb 1977 * write stream block if full or size==0 1978 * if csize==0 write stream header, else write <csize, data> 1979 * size==0 is a call to flush a buffer 1980 * hp->cpout is the buffer we are flushing or filling 1981 * hp->out is the next index to fill data 1982 * osize is either csize+data, or the size of a stream header 1983 */ 1984 static void 1985 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size) 1986 { 1987 dumpsync_t *ds = hp->ds; 1988 const int CSIZE = sizeof (dumpcsize_t); 1989 dumpcsize_t cs; 1990 size_t osize = csize > 0 ? 
CSIZE + size : size; 1991 1992 /* If flush, and there is no buffer, just return */ 1993 if (size == 0 && hp->cpout == NULL) 1994 return; 1995 1996 /* If flush, or cpout is full, write it out */ 1997 if (size == 0 || 1998 hp->cpout != NULL && hp->out + osize > hp->cpout->size) { 1999 2000 /* Set tag+size word at the front of the stream block. */ 2001 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag); 2002 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 2003 2004 /* Write block to dump file. */ 2005 dumpsys_swrite(hp, hp->cpout, hp->out); 2006 2007 /* Clear pointer to indicate we need a new buffer */ 2008 hp->cpout = NULL; 2009 2010 /* flushing, we are done */ 2011 if (size == 0) 2012 return; 2013 } 2014 2015 /* Get an output buffer if we dont have one. */ 2016 if (hp->cpout == NULL) { 2017 HRSTART(hp->perpage, outwait); 2018 hp->cpout = CQ_GET(freebufq); 2019 HRSTOP(hp->perpage, outwait); 2020 hp->out = CSIZE; 2021 } 2022 2023 /* Store csize word. This is the size of compressed data. */ 2024 if (csize > 0) { 2025 cs = DUMP_SET_TAG(csize, 0); 2026 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE); 2027 hp->out += CSIZE; 2028 } 2029 2030 /* Store the data. 
*/ 2031 (void) memcpy(hp->cpout->buf + hp->out, buf, size); 2032 hp->out += size; 2033 } 2034 2035 static void 2036 dumpsys_lzjbcompress(helper_t *hp) 2037 { 2038 dumpsync_t *ds = hp->ds; 2039 size_t csize; 2040 dumpstreamhdr_t sh; 2041 2042 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2043 sh.stream_pagenum = (pfn_t)-1; 2044 sh.stream_npages = 0; 2045 hp->cpin = NULL; 2046 hp->cpout = NULL; 2047 hp->cperr = NULL; 2048 hp->in = 0; 2049 hp->out = 0; 2050 2051 /* Bump reference to mainq while we are running */ 2052 CQ_OPEN(mainq); 2053 2054 /* Get one page at a time */ 2055 while (dumpsys_sread(hp)) { 2056 2057 /* Create a stream header for each new input map */ 2058 if (sh.stream_pagenum != hp->cpin->pagenum) { 2059 sh.stream_pagenum = hp->cpin->pagenum; 2060 sh.stream_npages = btop(hp->cpin->used); 2061 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh)); 2062 } 2063 2064 /* Compress one page */ 2065 HRSTART(hp->perpage, compress); 2066 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2067 HRSTOP(hp->perpage, compress); 2068 2069 /* Add csize+data to output block */ 2070 ASSERT(csize > 0 && csize <= PAGESIZE); 2071 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize); 2072 } 2073 2074 /* Done with input, flush any partial buffer */ 2075 if (sh.stream_pagenum != (pfn_t)-1) { 2076 dumpsys_lzjbrun(hp, 0, NULL, 0); 2077 dumpsys_errmsg(hp, NULL); 2078 } 2079 2080 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2081 2082 /* Decrement main queue count, we are done */ 2083 CQ_CLOSE(mainq); 2084 } 2085 2086 /* 2087 * Dump helper called from panic_idle() to compress pages. CPUs in 2088 * this path must not call most kernel services. 2089 * 2090 * During panic, all but one of the CPUs is idle. These CPUs are used 2091 * as helpers working in parallel to copy and compress memory 2092 * pages. During a panic, however, these processors cannot call any 2093 * kernel services. 
This is because mutexes become no-ops during
 * panic, and, cross-call interrupts are inhibited.  Therefore, during
 * panic dump the helper CPUs communicate with the panic CPU using
 * memory variables. All memory mapping and I/O is performed by the
 * panic CPU.
 *
 * Under dumpcfg.helper_lock, the calling CPU claims the first helper
 * slot still marked FREEHELPER, records itself in dumpcfg.helpermap,
 * and then runs the compression loop selected by dumpcfg.clevel with
 * the lock dropped.  If helpers are not wanted or no slot is free,
 * the function simply returns.
 */
void
dumpsys_helper(void)
{
	dumpsys_spinlock(&dumpcfg.helper_lock);
	if (dumpcfg.helpers_wanted) {
		helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper];

		for (hp = dumpcfg.helper; hp != hpend; hp++) {
			if (hp->helper == FREEHELPER) {
				/* Claim the slot before dropping the lock. */
				hp->helper = CPU->cpu_id;
				BT_SET(dumpcfg.helpermap, CPU->cpu_seqid);

				dumpsys_spinunlock(&dumpcfg.helper_lock);

				if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2)
					dumpsys_lzjbcompress(hp);
				else
					dumpsys_bz2compress(hp);

				hp->helper = DONEHELPER;
				return;
			}
		}
	}
	dumpsys_spinunlock(&dumpcfg.helper_lock);
}

/*
 * Dump helper for live dumps.
 * These run as a system task.
2129 */ 2130 static void 2131 dumpsys_live_helper(void *arg) 2132 { 2133 helper_t *hp = arg; 2134 2135 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2136 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2137 dumpsys_lzjbcompress(hp); 2138 else 2139 dumpsys_bz2compress(hp); 2140 } 2141 2142 /* 2143 * Compress one page with lzjb (single threaded case) 2144 */ 2145 static void 2146 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) 2147 { 2148 dumpsync_t *ds = hp->ds; 2149 uint32_t csize; 2150 2151 hp->helper = MAINHELPER; 2152 hp->in = 0; 2153 hp->used = 0; 2154 hp->cpin = cp; 2155 while (hp->used < cp->used) { 2156 HRSTART(hp->perpage, copy); 2157 hp->in = dumpsys_copy_page(hp, hp->in); 2158 hp->used += PAGESIZE; 2159 HRSTOP(hp->perpage, copy); 2160 2161 HRSTART(hp->perpage, compress); 2162 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2163 HRSTOP(hp->perpage, compress); 2164 2165 HRSTART(hp->perpage, write); 2166 dumpvp_write(&csize, sizeof (csize)); 2167 dumpvp_write(hp->lzbuf, csize); 2168 HRSTOP(hp->perpage, write); 2169 } 2170 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 2171 hp->cpin = NULL; 2172 } 2173 2174 /* 2175 * Main task to dump pages. This is called on the dump CPU. 
2176 */ 2177 static void 2178 dumpsys_main_task(void *arg) 2179 { 2180 dumpsync_t *ds = arg; 2181 pgcnt_t pagenum = 0, bitnum = 0, hibitnum; 2182 dumpmlw_t mlw; 2183 cbuf_t *cp; 2184 pgcnt_t baseoff, pfnoff; 2185 pfn_t base, pfn; 2186 int sec; 2187 2188 dump_init_memlist_walker(&mlw); 2189 2190 /* CONSTCOND */ 2191 while (1) { 2192 2193 if (ds->percent > ds->percent_done) { 2194 ds->percent_done = ds->percent; 2195 sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000; 2196 uprintf("^\r%2d:%02d %3d%% done", 2197 sec / 60, sec % 60, ds->percent); 2198 ds->neednl = 1; 2199 } 2200 2201 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { 2202 2203 /* the writerq never blocks */ 2204 cp = CQ_GET(writerq); 2205 if (cp == NULL) 2206 break; 2207 2208 dump_timeleft = dump_timeout; 2209 2210 HRSTART(ds->perpage, write); 2211 dumpvp_write(cp->buf, cp->used); 2212 HRSTOP(ds->perpage, write); 2213 2214 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2215 } 2216 2217 /* 2218 * Wait here for some buffers to process. Returns NULL 2219 * when all helpers have terminated and all buffers 2220 * have been processed. 2221 */ 2222 cp = CQ_GET(mainq); 2223 2224 if (cp == NULL) { 2225 2226 /* Drain the write queue. */ 2227 if (!CQ_IS_EMPTY(writerq)) 2228 continue; 2229 2230 /* Main task exits here. */ 2231 break; 2232 } 2233 2234 dump_timeleft = dump_timeout; 2235 2236 switch (cp->state) { 2237 2238 case CBUF_FREEMAP: 2239 2240 /* 2241 * Note that we drop CBUF_FREEMAP buffers on 2242 * the floor (they will not be on any cqueue) 2243 * when we no longer need them. 
2244 */ 2245 if (bitnum >= dumpcfg.bitmapsize) 2246 break; 2247 2248 if (dump_ioerr) { 2249 bitnum = dumpcfg.bitmapsize; 2250 CQ_CLOSE(helperq); 2251 break; 2252 } 2253 2254 HRSTART(ds->perpage, bitmap); 2255 for (; bitnum < dumpcfg.bitmapsize; bitnum++) 2256 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2257 break; 2258 HRSTOP(ds->perpage, bitmap); 2259 dump_timeleft = dump_timeout; 2260 2261 if (bitnum >= dumpcfg.bitmapsize) { 2262 CQ_CLOSE(helperq); 2263 break; 2264 } 2265 2266 /* 2267 * Try to map CBUF_MAPSIZE ranges. Can't 2268 * assume that memory segment size is a 2269 * multiple of CBUF_MAPSIZE. Can't assume that 2270 * the segment starts on a CBUF_MAPSIZE 2271 * boundary. 2272 */ 2273 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2274 ASSERT(pfn != PFN_INVALID); 2275 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); 2276 2277 base = P2ALIGN(pfn, CBUF_MAPNP); 2278 if (base < mlw.mpaddr) { 2279 base = mlw.mpaddr; 2280 baseoff = P2PHASE(base, CBUF_MAPNP); 2281 } else { 2282 baseoff = 0; 2283 } 2284 2285 pfnoff = pfn - base; 2286 if (pfnoff + mlw.mpleft < CBUF_MAPNP) { 2287 hibitnum = bitnum + mlw.mpleft; 2288 cp->size = ptob(pfnoff + mlw.mpleft); 2289 } else { 2290 hibitnum = bitnum - pfnoff + CBUF_MAPNP - 2291 baseoff; 2292 cp->size = CBUF_MAPSIZE - ptob(baseoff); 2293 } 2294 2295 cp->pfn = pfn; 2296 cp->bitnum = bitnum++; 2297 cp->pagenum = pagenum++; 2298 cp->off = ptob(pfnoff); 2299 2300 for (; bitnum < hibitnum; bitnum++) 2301 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2302 pagenum++; 2303 2304 dump_timeleft = dump_timeout; 2305 cp->used = ptob(pagenum - cp->pagenum); 2306 2307 HRSTART(ds->perpage, map); 2308 hat_devload(kas.a_hat, cp->buf, cp->size, base, 2309 PROT_READ, HAT_LOAD_NOCONSIST); 2310 HRSTOP(ds->perpage, map); 2311 2312 ds->pages_mapped += btop(cp->size); 2313 ds->pages_used += pagenum - cp->pagenum; 2314 2315 CQ_OPEN(mainq); 2316 2317 /* 2318 * If there are no helpers the main task does 2319 * non-streams lzjb compress. 
2320 */ 2321 if (dumpcfg.clevel == 0) { 2322 dumpsys_lzjb_page(dumpcfg.helper, cp); 2323 break; 2324 } 2325 2326 /* pass mapped pages to a helper */ 2327 CQ_PUT(helperq, cp, CBUF_INREADY); 2328 2329 /* the last page was done */ 2330 if (bitnum >= dumpcfg.bitmapsize) 2331 CQ_CLOSE(helperq); 2332 2333 break; 2334 2335 case CBUF_USEDMAP: 2336 2337 ds->npages += btop(cp->used); 2338 2339 HRSTART(ds->perpage, unmap); 2340 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); 2341 HRSTOP(ds->perpage, unmap); 2342 2343 if (bitnum < dumpcfg.bitmapsize) 2344 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2345 CQ_CLOSE(mainq); 2346 2347 ASSERT(ds->npages <= dumphdr->dump_npages); 2348 ds->percent = ds->npages * 100LL / dumphdr->dump_npages; 2349 break; 2350 2351 case CBUF_WRITE: 2352 2353 CQ_PUT(writerq, cp, CBUF_WRITE); 2354 break; 2355 2356 case CBUF_ERRMSG: 2357 2358 if (cp->used > 0) { 2359 cp->buf[cp->size - 2] = '\n'; 2360 cp->buf[cp->size - 1] = '\0'; 2361 if (ds->neednl) { 2362 uprintf("\n%s", cp->buf); 2363 ds->neednl = 0; 2364 } else { 2365 uprintf("%s", cp->buf); 2366 } 2367 /* wait for console output */ 2368 drv_usecwait(200000); 2369 dump_timeleft = dump_timeout; 2370 } 2371 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2372 break; 2373 2374 default: 2375 uprintf("dump: unexpected buffer state %d, " 2376 "buffer will be lost\n", cp->state); 2377 break; 2378 2379 } /* end switch */ 2380 2381 } /* end while(1) */ 2382 } 2383 2384 #ifdef COLLECT_METRICS 2385 size_t 2386 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) 2387 { 2388 dumpcfg_t *cfg = &dumpcfg; 2389 int myid = CPU->cpu_seqid; 2390 int i, compress_ratio; 2391 int sec, iorate; 2392 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; 2393 char *e = buf + size; 2394 char *p = buf; 2395 2396 sec = ds->elapsed / (1000 * 1000 * 1000ULL); 2397 if (sec < 1) 2398 sec = 1; 2399 2400 if (ds->iotime < 1) 2401 ds->iotime = 1; 2402 iorate = (ds->nwrite * 100000ULL) / ds->iotime; 2403 2404 compress_ratio = 100LL * ds->npages / 
btopr(ds->nwrite + 1); 2405 2406 #define P(...) (p += p < e ? snprintf(p, e - p, __VA_ARGS__) : 0) 2407 2408 P("Master cpu_seqid,%d\n", CPU->cpu_seqid); 2409 P("Master cpu_id,%d\n", CPU->cpu_id); 2410 P("dump_flags,0x%x\n", dumphdr->dump_flags); 2411 P("dump_ioerr,%d\n", dump_ioerr); 2412 2413 P("Helpers:\n"); 2414 for (i = 0; i < ncpus; i++) { 2415 if ((i & 15) == 0) 2416 P(",,%03d,", i); 2417 if (i == myid) 2418 P(" M"); 2419 else if (BT_TEST(cfg->helpermap, i)) 2420 P("%4d", cpu_seq[i]->cpu_id); 2421 else 2422 P(" *"); 2423 if ((i & 15) == 15) 2424 P("\n"); 2425 } 2426 2427 P("ncbuf_used,%d\n", cfg->ncbuf_used); 2428 P("ncmap,%d\n", cfg->ncmap); 2429 2430 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); 2431 P("Found small pages,%ld\n", cfg->foundsm); 2432 2433 P("Compression level,%d\n", cfg->clevel); 2434 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel", 2435 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb"); 2436 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % 2437 100); 2438 P("nhelper_used,%d\n", cfg->nhelper_used); 2439 2440 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); 2441 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); 2442 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); 2443 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); 2444 P("dumpbuf.size,%ld\n", dumpbuf.size); 2445 2446 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); 2447 P("Dump pages,%llu\n", (u_longlong_t)ds->npages); 2448 P("Dump time,%d\n", sec); 2449 2450 if (ds->pages_mapped > 0) 2451 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) 2452 / ds->pages_mapped)); 2453 2454 P("\nPer-page metrics:\n"); 2455 if (ds->npages > 0) { 2456 for (hp = cfg->helper; hp != hpend; hp++) { 2457 #define PERPAGE(x) ds->perpage.x += hp->perpage.x; 2458 PERPAGES; 2459 #undef PERPAGE 2460 } 2461 #define PERPAGE(x) \ 2462 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); 2463 PERPAGES; 
2464 #undef PERPAGE 2465 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / 2466 ds->npages)); 2467 P("helperq.empty,%d\n", (int)(ds->helperq.empty / 2468 ds->npages)); 2469 P("writerq.empty,%d\n", (int)(ds->writerq.empty / 2470 ds->npages)); 2471 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); 2472 2473 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / 2474 ds->npages)); 2475 } 2476 #undef P 2477 if (p < e) 2478 bzero(p, e - p); 2479 return (p - buf); 2480 } 2481 #endif /* COLLECT_METRICS */ 2482 2483 /* 2484 * Dump the system. 2485 */ 2486 void 2487 dumpsys(void) 2488 { 2489 dumpsync_t *ds = &dumpsync; 2490 taskq_t *livetaskq = NULL; 2491 pfn_t pfn; 2492 pgcnt_t bitnum; 2493 proc_t *p; 2494 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2495 cbuf_t *cp; 2496 pid_t npids, pidx; 2497 char *content; 2498 char *buf; 2499 size_t size; 2500 int save_dump_clevel; 2501 dumpmlw_t mlw; 2502 dumpcsize_t datatag; 2503 dumpdatahdr_t datahdr; 2504 2505 if (dumpvp == NULL || dumphdr == NULL) { 2506 uprintf("skipping system dump - no dump device configured\n"); 2507 if (panicstr) { 2508 dumpcfg.helpers_wanted = 0; 2509 dumpsys_spinunlock(&dumpcfg.helper_lock); 2510 } 2511 return; 2512 } 2513 dumpbuf.cur = dumpbuf.start; 2514 2515 /* clear the sync variables */ 2516 ASSERT(dumpcfg.nhelper > 0); 2517 bzero(ds, sizeof (*ds)); 2518 ds->dumpcpu = CPU->cpu_id; 2519 2520 /* 2521 * Calculate the starting block for dump. If we're dumping on a 2522 * swap device, start 1/5 of the way in; otherwise, start at the 2523 * beginning. And never use the first page -- it may be a disk label. 
2524 */ 2525 if (dumpvp->v_flag & VISSWAP) 2526 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET); 2527 else 2528 dumphdr->dump_start = DUMP_OFFSET; 2529 2530 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; 2531 dumphdr->dump_crashtime = gethrestime_sec(); 2532 dumphdr->dump_npages = 0; 2533 dumphdr->dump_nvtop = 0; 2534 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); 2535 dump_timeleft = dump_timeout; 2536 2537 if (panicstr) { 2538 dumphdr->dump_flags &= ~DF_LIVE; 2539 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL); 2540 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); 2541 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, 2542 panicstr, panicargs); 2543 2544 } 2545 2546 if (dump_conflags & DUMP_ALL) 2547 content = "all"; 2548 else if (dump_conflags & DUMP_CURPROC) 2549 content = "kernel + curproc"; 2550 else 2551 content = "kernel"; 2552 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, 2553 dumphdr->dump_start, content); 2554 2555 /* Make sure nodename is current */ 2556 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); 2557 2558 /* 2559 * If this is a live dump, try to open a VCHR vnode for better 2560 * performance. We must take care to flush the buffer cache 2561 * first. 2562 */ 2563 if (!panicstr) { 2564 vnode_t *cdev_vp, *cmn_cdev_vp; 2565 2566 ASSERT(dumpbuf.cdev_vp == NULL); 2567 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); 2568 if (cdev_vp != NULL) { 2569 cmn_cdev_vp = common_specvp(cdev_vp); 2570 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) 2571 == 0) { 2572 if (vn_has_cached_data(dumpvp)) 2573 (void) pvn_vplist_dirty(dumpvp, 0, NULL, 2574 B_INVAL | B_TRUNC, kcred); 2575 dumpbuf.cdev_vp = cmn_cdev_vp; 2576 } else { 2577 VN_RELE(cdev_vp); 2578 } 2579 } 2580 } 2581 2582 /* 2583 * Store a hires timestamp so we can look it up during debugging. 
2584 */ 2585 lbolt_debug_entry(); 2586 2587 /* 2588 * Leave room for the message and ereport save areas and terminal dump 2589 * header. 2590 */ 2591 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - 2592 DUMP_ERPTSIZE; 2593 2594 /* 2595 * Write out the symbol table. It's no longer compressed, 2596 * so its 'size' and 'csize' are equal. 2597 */ 2598 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; 2599 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = 2600 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); 2601 2602 /* 2603 * Write out the translation map. 2604 */ 2605 dumphdr->dump_map = dumpvp_flush(); 2606 dump_as(&kas); 2607 dumphdr->dump_nvtop += dump_plat_addr(); 2608 2609 /* 2610 * call into hat, which may have unmapped pages that also need to 2611 * be in the dump 2612 */ 2613 hat_dump(); 2614 2615 if (dump_conflags & DUMP_ALL) { 2616 mutex_enter(&pidlock); 2617 2618 for (npids = 0, p = practive; p != NULL; p = p->p_next) 2619 dumpcfg.pids[npids++] = p->p_pid; 2620 2621 mutex_exit(&pidlock); 2622 2623 for (pidx = 0; pidx < npids; pidx++) 2624 (void) dump_process(dumpcfg.pids[pidx]); 2625 2626 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2627 dump_timeleft = dump_timeout; 2628 BT_SET(dumpcfg.bitmap, bitnum); 2629 } 2630 dumphdr->dump_npages = dumpcfg.bitmapsize; 2631 dumphdr->dump_flags |= DF_ALL; 2632 2633 } else if (dump_conflags & DUMP_CURPROC) { 2634 /* 2635 * Determine which pid is to be dumped. If we're panicking, we 2636 * dump the process associated with panic_thread (if any). If 2637 * this is a live dump, we dump the process associated with 2638 * curthread. 
2639 */ 2640 npids = 0; 2641 if (panicstr) { 2642 if (panic_thread != NULL && 2643 panic_thread->t_procp != NULL && 2644 panic_thread->t_procp != &p0) { 2645 dumpcfg.pids[npids++] = 2646 panic_thread->t_procp->p_pid; 2647 } 2648 } else { 2649 dumpcfg.pids[npids++] = curthread->t_procp->p_pid; 2650 } 2651 2652 if (npids && dump_process(dumpcfg.pids[0]) == 0) 2653 dumphdr->dump_flags |= DF_CURPROC; 2654 else 2655 dumphdr->dump_flags |= DF_KERNEL; 2656 2657 } else { 2658 dumphdr->dump_flags |= DF_KERNEL; 2659 } 2660 2661 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1; 2662 2663 /* 2664 * Write out the pfn table. 2665 */ 2666 dumphdr->dump_pfn = dumpvp_flush(); 2667 dump_init_memlist_walker(&mlw); 2668 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2669 dump_timeleft = dump_timeout; 2670 if (!BT_TEST(dumpcfg.bitmap, bitnum)) 2671 continue; 2672 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2673 ASSERT(pfn != PFN_INVALID); 2674 dumpvp_write(&pfn, sizeof (pfn_t)); 2675 } 2676 dump_plat_pfn(); 2677 2678 /* 2679 * Write out all the pages. 2680 * Map pages, copy them handling UEs, compress, and write them out. 2681 * Cooperate with any helpers running on CPUs in panic_idle(). 
2682 */ 2683 dumphdr->dump_data = dumpvp_flush(); 2684 2685 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); 2686 ds->live = dumpcfg.clevel > 0 && 2687 (dumphdr->dump_flags & DF_LIVE) != 0; 2688 2689 save_dump_clevel = dumpcfg.clevel; 2690 if (panicstr) 2691 dumpsys_get_maxmem(); 2692 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2693 dumpcfg.clevel = DUMP_CLEVEL_LZJB; 2694 2695 dumpcfg.nhelper_used = 0; 2696 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2697 if (hp->page == NULL) { 2698 hp->helper = DONEHELPER; 2699 continue; 2700 } 2701 ++dumpcfg.nhelper_used; 2702 hp->helper = FREEHELPER; 2703 hp->taskqid = NULL; 2704 hp->ds = ds; 2705 bzero(&hp->perpage, sizeof (hp->perpage)); 2706 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2707 (void) BZ2_bzCompressReset(&hp->bzstream); 2708 } 2709 2710 CQ_OPEN(freebufq); 2711 CQ_OPEN(helperq); 2712 2713 dumpcfg.ncbuf_used = 0; 2714 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { 2715 if (cp->buf != NULL) { 2716 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2717 ++dumpcfg.ncbuf_used; 2718 } 2719 } 2720 2721 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) 2722 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2723 2724 ds->start = gethrtime(); 2725 ds->iowaitts = ds->start; 2726 2727 /* start helpers */ 2728 if (ds->live) { 2729 int n = dumpcfg.nhelper_used; 2730 int pri = MINCLSYSPRI - 25; 2731 2732 livetaskq = taskq_create("LiveDump", n, pri, n, n, 2733 TASKQ_PREPOPULATE); 2734 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2735 if (hp->page == NULL) 2736 continue; 2737 hp->helper = hp - dumpcfg.helper; 2738 hp->taskqid = taskq_dispatch(livetaskq, 2739 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); 2740 } 2741 2742 } else { 2743 if (panicstr) 2744 kmem_dump_begin(); 2745 dumpcfg.helpers_wanted = dumpcfg.clevel > 0; 2746 dumpsys_spinunlock(&dumpcfg.helper_lock); 2747 } 2748 2749 /* run main task */ 2750 dumpsys_main_task(ds); 2751 2752 ds->elapsed = gethrtime() - ds->start; 2753 if (ds->elapsed < 1) 2754 ds->elapsed = 1; 
2755 2756 if (livetaskq != NULL) 2757 taskq_destroy(livetaskq); 2758 2759 if (ds->neednl) { 2760 uprintf("\n"); 2761 ds->neednl = 0; 2762 } 2763 2764 /* record actual pages dumped */ 2765 dumphdr->dump_npages = ds->npages; 2766 2767 /* platform-specific data */ 2768 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); 2769 2770 /* note any errors by clearing DF_COMPLETE */ 2771 if (dump_ioerr || ds->npages < dumphdr->dump_npages) 2772 dumphdr->dump_flags &= ~DF_COMPLETE; 2773 2774 /* end of stream blocks */ 2775 datatag = 0; 2776 dumpvp_write(&datatag, sizeof (datatag)); 2777 2778 bzero(&datahdr, sizeof (datahdr)); 2779 2780 /* buffer for metrics */ 2781 buf = dumpcfg.cbuf[0].buf; 2782 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - 2783 sizeof (dumpdatahdr_t)); 2784 2785 /* finish the kmem intercepts, collect kmem verbose info */ 2786 if (panicstr) { 2787 datahdr.dump_metrics = kmem_dump_finish(buf, size); 2788 buf += datahdr.dump_metrics; 2789 size -= datahdr.dump_metrics; 2790 } 2791 2792 /* compression info in data header */ 2793 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; 2794 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; 2795 datahdr.dump_maxcsize = CBUF_SIZE; 2796 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; 2797 datahdr.dump_nstreams = dumpcfg.nhelper_used; 2798 datahdr.dump_clevel = dumpcfg.clevel; 2799 #ifdef COLLECT_METRICS 2800 if (dump_metrics_on) 2801 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size); 2802 #endif 2803 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; 2804 2805 /* 2806 * Write out the initial and terminal dump headers. 
2807 */ 2808 dumpbuf.vp_off = dumphdr->dump_start; 2809 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2810 (void) dumpvp_flush(); 2811 2812 dumpbuf.vp_limit = dumpvp_size; 2813 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; 2814 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2815 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); 2816 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); 2817 2818 (void) dumpvp_flush(); 2819 2820 uprintf("\r%3d%% done: %llu pages dumped, ", 2821 ds->percent_done, (u_longlong_t)ds->npages); 2822 2823 if (dump_ioerr == 0) { 2824 uprintf("dump succeeded\n"); 2825 } else { 2826 uprintf("dump failed: error %d\n", dump_ioerr); 2827 #ifdef DEBUG 2828 if (panicstr) 2829 debug_enter("dump failed"); 2830 #endif 2831 } 2832 2833 /* 2834 * Write out all undelivered messages. This has to be the *last* 2835 * thing we do because the dump process itself emits messages. 2836 */ 2837 if (panicstr) { 2838 dump_ereports(); 2839 dump_messages(); 2840 } 2841 2842 delay(2 * hz); /* let people see the 'done' message */ 2843 dump_timeleft = 0; 2844 dump_ioerr = 0; 2845 2846 /* restore settings after live dump completes */ 2847 if (!panicstr) { 2848 dumpcfg.clevel = save_dump_clevel; 2849 2850 /* release any VCHR open of the dump device */ 2851 if (dumpbuf.cdev_vp != NULL) { 2852 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, 2853 kcred, NULL); 2854 VN_RELE(dumpbuf.cdev_vp); 2855 dumpbuf.cdev_vp = NULL; 2856 } 2857 } 2858 } 2859 2860 /* 2861 * This function is called whenever the memory size, as represented 2862 * by the phys_install list, changes. 2863 */ 2864 void 2865 dump_resize() 2866 { 2867 mutex_enter(&dump_lock); 2868 dumphdr_init(); 2869 dumpbuf_resize(); 2870 dump_update_clevel(); 2871 mutex_exit(&dump_lock); 2872 } 2873 2874 /* 2875 * This function allows for dynamic resizing of a dump area. It assumes that 2876 * the underlying device has update its appropriate size(9P). 
2877 */ 2878 int 2879 dumpvp_resize() 2880 { 2881 int error; 2882 vattr_t vattr; 2883 2884 mutex_enter(&dump_lock); 2885 vattr.va_mask = AT_SIZE; 2886 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) { 2887 mutex_exit(&dump_lock); 2888 return (error); 2889 } 2890 2891 if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) { 2892 mutex_exit(&dump_lock); 2893 return (ENOSPC); 2894 } 2895 2896 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 2897 mutex_exit(&dump_lock); 2898 return (0); 2899 } 2900