/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/mem.h>
#include <sys/mman.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/memlist.h>
#include <sys/dumphdr.h>
#include <sys/dumpadm.h>
#include <sys/ksyms.h>
#include <sys/compress.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/modctl.h>
#include <sys/utsname.h>
#include <sys/systeminfo.h>
#include <sys/vmem.h>
#include <sys/log.h>
#include <sys/var.h>
#include <sys/debug.h>
#include <sys/sunddi.h>
#include <fs/fs_subr.h>
#include <sys/fs/snode.h>
#include <sys/ontrap.h>
#include <sys/panic.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/errorq.h>
#include <sys/fm/util.h>
#include <sys/fs/zfs.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <sys/clock_impl.h>
#include <sys/hold_page.h>

#include <bzip2/bzlib.h>

/*
 * Crash dump time is dominated by disk write time. To reduce this,
 * the stronger compression method bzip2 is applied to reduce the dump
 * size and hence reduce I/O time. However, bzip2 is much more
 * computationally expensive than the existing lzjb algorithm, so to
 * avoid increasing compression time, CPUs that are otherwise idle
 * during panic are employed to parallelize the compression task.
 * Many helper CPUs are needed to prevent bzip2 from being a
 * bottleneck, and on systems with too few CPUs, the lzjb algorithm is
 * parallelized instead. Lastly, I/O and compression are performed by
 * different CPUs, and are hence overlapped in time, unlike the older
 * serial code.
 *
 * Another important consideration is the speed of the dump
 * device. Faster disks need fewer CPUs in order to benefit from
 * parallel lzjb versus parallel bzip2. Therefore, the CPU count
 * threshold for switching from parallel lzjb to parallel bzip2 is
 * elevated for faster disks. The dump device speed is deduced from
 * the setting for dumpbuf.iosize, see dump_update_clevel.
 */
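
/*
 * Illustrative sizing sketch (assumed rates, for intuition only): if
 * the dump device sustains ~100 MB/s and a single CPU runs bzip2 at
 * ~10 MB/s, roughly ten busy helpers are needed before I/O, rather
 * than compression, becomes the bottleneck again. This is why bzip2
 * is only selected on machines with many CPUs, and why the cross-over
 * point rises with dump device speed.
 */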

/*
 * exported vars
 */
kmutex_t	dump_lock;		/* lock for dump configuration */
dumphdr_t	*dumphdr;		/* dump header */
int		dump_conflags = DUMP_KERNEL;	/* dump configuration flags */
vnode_t		*dumpvp;		/* dump device vnode pointer */
u_offset_t	dumpvp_size;		/* size of dump device, in bytes */
char		*dumppath;		/* pathname of dump device */
int		dump_timeout = 120;	/* timeout for dumping pages */
int		dump_timeleft;		/* portion of dump_timeout remaining */
int		dump_ioerr;		/* dump i/o error */
int		dump_check_used;	/* enable check for used pages */

/*
 * Tunables for dump compression and parallelism. These can be set via
 * /etc/system.
 *
 * dump_ncpu_low	number of helpers for parallel lzjb
 *	This is also the minimum configuration.
 *
 * dump_bzip2_level	bzip2 compression level: 1-9
 *	Higher numbers give greater compression, but take more memory
 *	and time. Memory used per helper is ~(dump_bzip2_level * 1MB).
 *
 * dump_plat_mincpu	the cross-over limit for using bzip2 (per platform):
 *	if dump_plat_mincpu == 0, then always do single threaded dump
 *	if ncpu >= dump_plat_mincpu then try to use bzip2
 *
 * dump_metrics_on	if set, metrics are collected in the kernel, passed
 *	to savecore via the dump file, and recorded by savecore in
 *	METRICS.txt.
 */
uint_t dump_ncpu_low = 4;	/* minimum config for parallel lzjb */
uint_t dump_bzip2_level = 1;	/* bzip2 level (1-9) */

/* tunables for pre-reserved heap */
uint_t dump_kmem_permap = 1024;
uint_t dump_kmem_pages = 8;

/* Define multiple buffers per helper to avoid stalling */
#define	NCBUF_PER_HELPER	2
#define	NCMAP_PER_HELPER	4

/* minimum number of helpers configured */
#define	MINHELPERS	(dump_ncpu_low)
#define	MINCBUFS	(MINHELPERS * NCBUF_PER_HELPER)

/*
 * Define constant parameters.
 *
 * CBUF_SIZE		size of an output buffer
 *
 * CBUF_MAPSIZE		size of virtual range for mapping pages
 *
 * CBUF_MAPNP		size of virtual range in pages
 *
 */
#define	DUMP_1KB	((size_t)1 << 10)
#define	DUMP_1MB	((size_t)1 << 20)
#define	CBUF_SIZE	((size_t)1 << 17)
#define	CBUF_MAPSHIFT	(22)
#define	CBUF_MAPSIZE	((size_t)1 << CBUF_MAPSHIFT)
#define	CBUF_MAPNP	((size_t)1 << (CBUF_MAPSHIFT - PAGESHIFT))

/*
 * Compression metrics are accumulated nanosecond subtotals. The
 * results are normalized by the number of pages dumped. A report is
 * generated when dumpsys() completes and is saved in the dump image
 * after the trailing dump header.
 *
 * Metrics are always collected. Set the variable dump_metrics_on to
 * cause metrics to be saved in the crash file, where savecore will
 * save them in the file METRICS.txt.
 */
#define	PERPAGES \
	PERPAGE(bitmap) PERPAGE(map) PERPAGE(unmap) \
	PERPAGE(copy) PERPAGE(compress) \
	PERPAGE(write) \
	PERPAGE(inwait) PERPAGE(outwait)

typedef struct perpage {
#define	PERPAGE(x) hrtime_t x;
	PERPAGES
#undef PERPAGE
} perpage_t;
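
/*
 * Illustrative expansion (not compiled) of the PERPAGES X-macro
 * above: perpage_t gets one hrtime_t accumulator per metric, i.e.
 *
 *	typedef struct perpage {
 *		hrtime_t bitmap; hrtime_t map; hrtime_t unmap;
 *		hrtime_t copy; hrtime_t compress;
 *		hrtime_t write;
 *		hrtime_t inwait; hrtime_t outwait;
 *	} perpage_t;
 *
 * Keeping the metric list in a single macro keeps this struct and
 * every loop that normalizes or reports the metrics in sync.
 */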

/*
 * This macro controls the code generation for collecting dump
 * performance information. By default, the code is generated, but
 * automatic saving of the information is disabled. If dump_metrics_on
 * is set to 1, the timing information is passed to savecore via the
 * crash file, where it is appended to the file dump-dir/METRICS.txt.
 */
#define	COLLECT_METRICS

#ifdef COLLECT_METRICS
uint_t dump_metrics_on = 0;	/* set to 1 to enable recording metrics */

#define	HRSTART(v, m)		v##ts.m = gethrtime()
#define	HRSTOP(v, m)		v.m += gethrtime() - v##ts.m
#define	HRBEGIN(v, m, s)	v##ts.m = gethrtime(); v.size += s
#define	HREND(v, m)		v.m += gethrtime() - v##ts.m
#define	HRNORM(v, m, n)		v.m /= (n)

#else
#define	HRSTART(v, m)
#define	HRSTOP(v, m)
#define	HRBEGIN(v, m, s)
#define	HREND(v, m)
#define	HRNORM(v, m, n)
#endif	/* COLLECT_METRICS */

/*
 * Buffers for copying and compressing memory pages.
 *
 * cbuf_t buffer controllers: used for both input and output.
 *
 * The buffer state indicates how it is being used:
 *
 * CBUF_FREEMAP: CBUF_MAPSIZE virtual address range is available for
 * mapping input pages.
 *
 * CBUF_INREADY: input pages are mapped and ready for compression by a
 * helper.
 *
 * CBUF_USEDMAP: mapping has been consumed by a helper. Needs unmap.
 *
 * CBUF_FREEBUF: CBUF_SIZE output buffer, which is available.
 *
 * CBUF_WRITE: CBUF_SIZE block of compressed pages from a helper,
 * ready to write out.
 *
 * CBUF_ERRMSG: CBUF_SIZE block of error messages from a helper
 * (reports UE errors).
 */

typedef enum cbufstate {
	CBUF_FREEMAP,
	CBUF_INREADY,
	CBUF_USEDMAP,
	CBUF_FREEBUF,
	CBUF_WRITE,
	CBUF_ERRMSG
} cbufstate_t;

typedef struct cbuf cbuf_t;

struct cbuf {
	cbuf_t *next;			/* next in list */
	cbufstate_t state;		/* processing state */
	size_t used;			/* amount used */
	size_t size;			/* mem size */
	char *buf;			/* kmem or vmem */
	pgcnt_t pagenum;		/* index to pfn map */
	pgcnt_t bitnum;			/* first set bitnum */
	pfn_t pfn;			/* first pfn in mapped range */
	int off;			/* byte offset to first pfn */
};
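
/*
 * Illustrative life cycles for the two kinds of cbufs (a sketch
 * derived from the states above; the queue table below shows which
 * task drives each transition):
 *
 *	map buffers:	FREEMAP -> INREADY -> USEDMAP -> FREEMAP ...
 *	output buffers:	FREEBUF -> WRITE or ERRMSG -> FREEBUF ...
 */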

/*
 * cqueue_t queues: a uni-directional channel for communication
 * from the master to helper tasks or vice-versa using put and
 * get primitives. Both mappings and data buffers are passed via
 * queues. Producers close a queue when done. The number of
 * active producers is reference counted so the consumer can
 * detect end of data. Concurrent access is mediated by atomic
 * operations for panic dump, or mutex/cv for live dump.
 *
 * There are four queues, used as follows:
 *
 * Queue	Dataflow		NewState
 * --------------------------------------------------
 * mainq	master -> master	FREEMAP
 * master has initialized or unmapped an input buffer
 * --------------------------------------------------
 * helperq	master -> helper	INREADY
 * master has mapped input for use by helper
 * --------------------------------------------------
 * mainq	master <- helper	USEDMAP
 * helper is done with input
 * --------------------------------------------------
 * freebufq	master -> helper	FREEBUF
 * master has initialized or written an output buffer
 * --------------------------------------------------
 * mainq	master <- helper	WRITE
 * block of compressed pages from a helper
 * --------------------------------------------------
 * mainq	master <- helper	ERRMSG
 * error messages from a helper (memory error case)
 * --------------------------------------------------
 * writerq	master <- master	WRITE
 * non-blocking queue of blocks to write
 * --------------------------------------------------
 */
typedef struct cqueue {
	cbuf_t *volatile first;		/* first in list */
	cbuf_t *last;			/* last in list */
	hrtime_t ts;			/* timestamp */
	hrtime_t empty;			/* total time empty */
	kmutex_t mutex;			/* live state lock */
	kcondvar_t cv;			/* live wait var */
	lock_t spinlock;		/* panic mode spin lock */
	volatile uint_t open;		/* producer ref count */
} cqueue_t;

/*
 * Convenience macros for using the cqueue functions.
 * Note that the caller must have defined "dumpsync_t *ds".
 */
#define	CQ_IS_EMPTY(q) \
	(ds->q.first == NULL)

#define	CQ_OPEN(q) \
	atomic_inc_uint(&ds->q.open)

#define	CQ_CLOSE(q) \
	dumpsys_close_cq(&ds->q, ds->live)

#define	CQ_PUT(q, cp, st) \
	dumpsys_put_cq(&ds->q, cp, st, ds->live)

#define	CQ_GET(q) \
	dumpsys_get_cq(&ds->q, ds->live)

/*
 * Dynamic state when dumpsys() is running.
 */
typedef struct dumpsync {
	pgcnt_t npages;			/* subtotal of pages dumped */
	pgcnt_t pages_mapped;		/* subtotal of pages mapped */
	pgcnt_t pages_used;		/* subtotal of pages used per map */
	size_t nwrite;			/* subtotal of bytes written */
	uint_t live;			/* running live dump */
	uint_t neednl;			/* will need to print a newline */
	uint_t percent;			/* dump progress */
	uint_t percent_done;		/* dump progress reported */
	cqueue_t freebufq;		/* free kmem bufs for writing */
	cqueue_t mainq;			/* input for main task */
	cqueue_t helperq;		/* input for helpers */
	cqueue_t writerq;		/* input for writer */
	hrtime_t start;			/* start time */
	hrtime_t elapsed;		/* elapsed time when completed */
	hrtime_t iotime;		/* time spent writing nwrite bytes */
	hrtime_t iowait;		/* time spent waiting for output */
	hrtime_t iowaitts;		/* iowait timestamp */
	perpage_t perpage;		/* metrics */
	perpage_t perpagets;
	int dumpcpu;			/* master cpu */
} dumpsync_t;

static dumpsync_t dumpsync;		/* synchronization vars */
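
/*
 * Illustrative producer/consumer sketch (assumes "dumpsync_t *ds" is
 * in scope, as the CQ_* macros require; process() is hypothetical):
 *
 *	CQ_OPEN(mainq);				// register as producer
 *	CQ_PUT(mainq, cp, CBUF_WRITE);		// hand off a buffer
 *	CQ_CLOSE(mainq);			// done producing
 *
 *	while ((cp = CQ_GET(mainq)) != NULL)	// NULL: closed and empty
 *		process(cp);
 */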

/*
 * helper_t helpers: contains the context for a stream. CPUs run in
 * parallel at dump time; each CPU creates a single stream of
 * compression data. Stream data is divided into CBUF_SIZE blocks.
 * The blocks are written in order within a stream. But, blocks from
 * multiple streams can be interleaved. Each stream is identified by a
 * unique tag.
 */
typedef struct helper {
	int helper;			/* bound helper id */
	int tag;			/* compression stream tag */
	perpage_t perpage;		/* per page metrics */
	perpage_t perpagets;		/* per page metrics (timestamps) */
	taskqid_t taskqid;		/* live dump task ptr */
	int in, out;			/* buffer offsets */
	cbuf_t *cpin, *cpout, *cperr;	/* cbuf objects in process */
	dumpsync_t *ds;			/* pointer to sync vars */
	size_t used;			/* counts input consumed */
	char *page;			/* buffer for page copy */
	char *lzbuf;			/* lzjb output */
	bz_stream bzstream;		/* bzip2 state */
} helper_t;

#define	MAINHELPER	(-1)		/* helper is also the main task */
#define	FREEHELPER	(-2)		/* unbound helper */
#define	DONEHELPER	(-3)		/* helper finished */

/*
 * configuration vars for dumpsys
 */
typedef struct dumpcfg {
	int	threshold;	/* ncpu threshold for bzip2 */
	int	nhelper;	/* number of helpers */
	int	nhelper_used;	/* actual number of helpers used */
	int	ncmap;		/* number of VA pages for compression */
	int	ncbuf;		/* number of bufs for compression */
	int	ncbuf_used;	/* number of bufs in use */
	uint_t	clevel;		/* dump compression level */
	helper_t *helper;	/* array of helpers */
	cbuf_t	*cmap;		/* array of input (map) buffers */
	cbuf_t	*cbuf;		/* array of output buffers */
	ulong_t	*helpermap;	/* set of dumpsys helper CPU ids */
	ulong_t	*bitmap;	/* bitmap for marking pages to dump */
	ulong_t	*rbitmap;	/* bitmap for used CBUF_MAPSIZE ranges */
	pgcnt_t	bitmapsize;	/* size of bitmap */
	pgcnt_t	rbitmapsize;	/* size of bitmap for ranges */
	pgcnt_t	found4m;	/* number of ranges allocated by dump */
	pgcnt_t	foundsm;	/* number of small pages allocated by dump */
	pid_t	*pids;		/* list of process IDs at dump time */
	size_t	maxsize;	/* memory size needed at dump time */
	size_t	maxvmsize;	/* size of reserved VM */
	char	*maxvm;		/* reserved VM for spare pages */
	lock_t	helper_lock;	/* protect helper state */
	char	helpers_wanted;	/* flag to enable parallelism */
} dumpcfg_t;

static dumpcfg_t dumpcfg;	/* config vars */

/*
 * The dump I/O buffer.
 *
 * There is one I/O buffer used by dumpvp_write and dumpvp_flush. It is
 * sized according to the optimum device transfer speed.
 */
typedef struct dumpbuf {
	vnode_t	*cdev_vp;	/* VCHR open of the dump device */
	len_t	vp_limit;	/* maximum write offset */
	offset_t vp_off;	/* current dump device offset */
	char	*cur;		/* dump write pointer */
	char	*start;		/* dump buffer address */
	char	*end;		/* dump buffer end */
	size_t	size;		/* size of dumpbuf in bytes */
	size_t	iosize;		/* best transfer size for device */
} dumpbuf_t;

dumpbuf_t dumpbuf;		/* I/O buffer */

/*
 * The dump I/O buffer must be at least one page, at most xfer_size
 * bytes, and should scale with physmem in between. The transfer size
 * passed in will either represent a global default (maxphys) or the
 * best size for the device. The size of the dumpbuf I/O buffer is
 * limited by dumpbuf_limit (8MB by default) because the dump
 * performance saturates beyond a certain size. The default is to
 * select 1/4096 of the memory.
 */
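
/*
 * Worked example (assumes 4KB pages): on a machine with 4GB of
 * physical memory, physmem is 1M pages, so ptob(1M >> 12) = ptob(256)
 * = 1MB, i.e. 1/4096 of memory, which dumpbuf_iosize() below then
 * clamps to [PAGESIZE, MIN(xfer_size, dumpbuf_limit)] and rounds down
 * to a page multiple.
 */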
static int	dumpbuf_fraction = 12;	/* memory size scale factor */
static size_t	dumpbuf_limit = 8 * DUMP_1MB;	/* max I/O buf size */

static size_t
dumpbuf_iosize(size_t xfer_size)
{
	size_t iosize = ptob(physmem >> dumpbuf_fraction);

	if (iosize < PAGESIZE)
		iosize = PAGESIZE;
	else if (iosize > xfer_size)
		iosize = xfer_size;
	if (iosize > dumpbuf_limit)
		iosize = dumpbuf_limit;
	return (iosize & PAGEMASK);
}

/*
 * resize the I/O buffer
 */
static void
dumpbuf_resize(void)
{
	char *old_buf = dumpbuf.start;
	size_t old_size = dumpbuf.size;
	char *new_buf;
	size_t new_size;

	ASSERT(MUTEX_HELD(&dump_lock));

	new_size = dumpbuf_iosize(MAX(dumpbuf.iosize, maxphys));
	if (new_size <= old_size)
		return;	/* no need to reallocate buffer */

	new_buf = kmem_alloc(new_size, KM_SLEEP);
	dumpbuf.size = new_size;
	dumpbuf.start = new_buf;
	dumpbuf.end = new_buf + new_size;
	kmem_free(old_buf, old_size);
}

/*
 * dump_update_clevel is called when dumpadm configures the dump device.
 * Calculate number of helpers and buffers.
 * Allocate the minimum configuration for now.
 *
 * When the dump file is configured we reserve a minimum amount of
 * memory for use at crash time. But we reserve VA for all the memory
 * we really want in order to do the fastest dump possible. The VA is
 * backed by pages not being dumped, according to the bitmap. If
 * there is insufficient spare memory, however, we fall back to the
 * minimum.
 *
 * Live dump (savecore -L) always uses the minimum config.
 *
 * clevel 0 is single threaded lzjb
 * clevel 1 is parallel lzjb
 * clevel 2 is parallel bzip2
 *
 * The ncpu threshold is selected with dump_plat_mincpu.
 * On OPL, set_platform_defaults() overrides the sun4u setting.
 * The actual values are defined via DUMP_PLAT_*_MINCPU macros.
 *
 * Architecture		Threshold	Algorithm
 * sun4u		< 51		parallel lzjb
 * sun4u		>= 51		parallel bzip2(*)
 * sun4u OPL		< 8		parallel lzjb
 * sun4u OPL		>= 8		parallel bzip2(*)
 * sun4v		< 128		parallel lzjb
 * sun4v		>= 128		parallel bzip2(*)
 * x86			< 11		parallel lzjb
 * x86			>= 11		parallel bzip2(*)
 * 32-bit		N/A		single-threaded lzjb
 *
 * (*) bzip2 is only chosen if there is sufficient available
 * memory for buffers at dump time. See dumpsys_get_maxmem().
 *
 * Faster dump devices have larger I/O buffers. The threshold value is
 * increased according to the size of the dump I/O buffer, because
 * parallel lzjb performs better with faster disks. For buffers >= 1MB
 * the threshold is 3X; for buffers >= 256K the threshold is 2X.
 *
 * For parallel dumps, the number of helpers is ncpu-1. The CPU
 * running panic runs the main task. For single-threaded dumps, the
 * panic CPU does lzjb compression (it is tagged as MAINHELPER).
 *
 * Need multiple buffers per helper so that they do not block waiting
 * for the main task.
 *				parallel	single-threaded
 * Number of output buffers:	nhelper*2	1
 * Number of mapping buffers:	nhelper*4	1
 *
 */
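
/*
 * Worked configuration example (assumed machine): on a 16-CPU x86 box
 * with dump_plat_mincpu == 11 and an I/O buffer of 1MB or more, the
 * threshold is 3 * 11 = 33 and nhelper is 15; since nhelper + 1 = 16
 * is below the threshold, the code below selects DUMP_CLEVEL_LZJB
 * with ncbuf = 2 * 15 = 30 output buffers and ncmap = 4 * 15 = 60
 * mapping buffers.
 */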
static void
dump_update_clevel()
{
	int tag;
	size_t bz2size;
	helper_t *hp, *hpend;
	cbuf_t *cp, *cpend;
	dumpcfg_t *old = &dumpcfg;
	dumpcfg_t newcfg = *old;
	dumpcfg_t *new = &newcfg;

	ASSERT(MUTEX_HELD(&dump_lock));

	/*
	 * Free the previously allocated bufs and VM.
	 */
	if (old->helper != NULL) {

		/* helpers */
		hpend = &old->helper[old->nhelper];
		for (hp = old->helper; hp != hpend; hp++) {
			if (hp->lzbuf != NULL)
				kmem_free(hp->lzbuf, PAGESIZE);
			if (hp->page != NULL)
				kmem_free(hp->page, PAGESIZE);
		}
		kmem_free(old->helper, old->nhelper * sizeof (helper_t));

		/* VM space for mapping pages */
		cpend = &old->cmap[old->ncmap];
		for (cp = old->cmap; cp != cpend; cp++)
			vmem_xfree(heap_arena, cp->buf, CBUF_MAPSIZE);
		kmem_free(old->cmap, old->ncmap * sizeof (cbuf_t));

		/* output bufs */
		cpend = &old->cbuf[old->ncbuf];
		for (cp = old->cbuf; cp != cpend; cp++)
			if (cp->buf != NULL)
				kmem_free(cp->buf, cp->size);
		kmem_free(old->cbuf, old->ncbuf * sizeof (cbuf_t));

		/* reserved VM for dumpsys_get_maxmem */
		if (old->maxvmsize > 0)
			vmem_xfree(heap_arena, old->maxvm, old->maxvmsize);
	}

	/*
	 * Allocate memory and VM.
	 * One CPU runs dumpsys, the rest are helpers.
	 */
	new->nhelper = ncpus - 1;
	if (new->nhelper < 1)
		new->nhelper = 1;

	if (new->nhelper > DUMP_MAX_NHELPER)
		new->nhelper = DUMP_MAX_NHELPER;

	/* increase threshold for faster disks */
	new->threshold = dump_plat_mincpu;
	if (dumpbuf.iosize >= DUMP_1MB)
		new->threshold *= 3;
	else if (dumpbuf.iosize >= (256 * DUMP_1KB))
		new->threshold *= 2;

	/* figure compression level based upon the computed threshold. */
	if (dump_plat_mincpu == 0 || new->nhelper < 2) {
		new->clevel = 0;
		new->nhelper = 1;
	} else if ((new->nhelper + 1) >= new->threshold) {
		new->clevel = DUMP_CLEVEL_BZIP2;
	} else {
		new->clevel = DUMP_CLEVEL_LZJB;
	}

	if (new->clevel == 0) {
		new->ncbuf = 1;
		new->ncmap = 1;
	} else {
		new->ncbuf = NCBUF_PER_HELPER * new->nhelper;
		new->ncmap = NCMAP_PER_HELPER * new->nhelper;
	}

	/*
	 * Allocate new data structures and buffers for MINHELPERS,
	 * and also figure the max desired size.
	 */
	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);
	new->maxsize = 0;
	new->maxvmsize = 0;
	new->maxvm = NULL;
	tag = 1;
	new->helper = kmem_zalloc(new->nhelper * sizeof (helper_t), KM_SLEEP);
	hpend = &new->helper[new->nhelper];
	for (hp = new->helper; hp != hpend; hp++) {
		hp->tag = tag++;
		if (hp < &new->helper[MINHELPERS]) {
			hp->lzbuf = kmem_alloc(PAGESIZE, KM_SLEEP);
			hp->page = kmem_alloc(PAGESIZE, KM_SLEEP);
		} else if (new->clevel < DUMP_CLEVEL_BZIP2) {
			new->maxsize += 2 * PAGESIZE;
		} else {
			new->maxsize += PAGESIZE;
		}
		if (new->clevel >= DUMP_CLEVEL_BZIP2)
			new->maxsize += bz2size;
	}

	new->cbuf = kmem_zalloc(new->ncbuf * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cbuf[new->ncbuf];
	for (cp = new->cbuf; cp != cpend; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		if (cp < &new->cbuf[MINCBUFS])
			cp->buf = kmem_alloc(cp->size, KM_SLEEP);
		else
			new->maxsize += cp->size;
	}

	new->cmap = kmem_zalloc(new->ncmap * sizeof (cbuf_t), KM_SLEEP);
	cpend = &new->cmap[new->ncmap];
	for (cp = new->cmap; cp != cpend; cp++) {
		cp->state = CBUF_FREEMAP;
		cp->size = CBUF_MAPSIZE;
		cp->buf = vmem_xalloc(heap_arena, CBUF_MAPSIZE, CBUF_MAPSIZE,
		    0, 0, NULL, NULL, VM_SLEEP);
	}

	/* reserve VA to be backed with spare pages at crash time */
	if (new->maxsize > 0) {
		new->maxsize = P2ROUNDUP(new->maxsize, PAGESIZE);
		new->maxvmsize = P2ROUNDUP(new->maxsize, CBUF_MAPSIZE);
		new->maxvm = vmem_xalloc(heap_arena, new->maxvmsize,
		    CBUF_MAPSIZE, 0, 0, NULL, NULL, VM_SLEEP);
	}

	/*
	 * Reserve memory for kmem allocation calls made during crash
	 * dump. The hat layer allocates memory for each mapping
	 * created, and the I/O path allocates buffers and data structs.
	 * Add a few pages for safety.
	 */
	kmem_dump_init((new->ncmap * dump_kmem_permap) +
	    (dump_kmem_pages * PAGESIZE));

	/* set new config pointers */
	*old = *new;
}

/*
 * Define a struct memlist walker to optimize bitnum to pfn
 * lookup. The walker maintains the state of the list traversal.
 */
typedef struct dumpmlw {
	struct memlist	*mp;		/* current memlist */
	pgcnt_t		basenum;	/* bitnum base offset */
	pgcnt_t		mppages;	/* current memlist size */
	pgcnt_t		mpleft;		/* size to end of current memlist */
	pfn_t		mpaddr;		/* first pfn in memlist */
} dumpmlw_t;

/* initialize the walker */
static inline void
dump_init_memlist_walker(dumpmlw_t *pw)
{
	pw->mp = phys_install;
	pw->basenum = 0;
	pw->mppages = pw->mp->ml_size >> PAGESHIFT;
	pw->mpleft = pw->mppages;
	pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
}
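
/*
 * Illustrative usage sketch (a simplified version of the scan loops
 * in dumpsys_get_maxmem() below): initialize a walker once, then feed
 * it monotonically increasing bitnums for cheap sequential lookups:
 *
 *	dumpmlw_t mlw;
 *	pgcnt_t bitnum;
 *	pfn_t pfn;
 *
 *	dump_init_memlist_walker(&mlw);
 *	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++)
 *		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
 */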

/*
 * Lookup pfn given bitnum. The memlist can be quite long on some
 * systems (e.g.: one per board). To optimize sequential lookups, the
 * caller initializes and presents a memlist walker.
 */
static pfn_t
dump_bitnum_to_pfn(pgcnt_t bitnum, dumpmlw_t *pw)
{
	bitnum -= pw->basenum;
	while (pw->mp != NULL) {
		if (bitnum < pw->mppages) {
			pw->mpleft = pw->mppages - bitnum;
			return (pw->mpaddr + bitnum);
		}
		bitnum -= pw->mppages;
		pw->basenum += pw->mppages;
		pw->mp = pw->mp->ml_next;
		if (pw->mp != NULL) {
			pw->mppages = pw->mp->ml_size >> PAGESHIFT;
			pw->mpleft = pw->mppages;
			pw->mpaddr = pw->mp->ml_address >> PAGESHIFT;
		}
	}
	return (PFN_INVALID);
}

static pgcnt_t
dump_pfn_to_bitnum(pfn_t pfn)
{
	struct memlist *mp;
	pgcnt_t bitnum = 0;

	for (mp = phys_install; mp != NULL; mp = mp->ml_next) {
		if (pfn >= (mp->ml_address >> PAGESHIFT) &&
		    pfn < ((mp->ml_address + mp->ml_size) >> PAGESHIFT))
			return (bitnum + pfn - (mp->ml_address >> PAGESHIFT));
		bitnum += mp->ml_size >> PAGESHIFT;
	}
	return ((pgcnt_t)-1);
}

/*
 * Set/test bitmap for a CBUF_MAPSIZE range which includes pfn. The
 * mapping of pfn to range index is imperfect because pfn and bitnum
 * do not have the same phase. To make sure a CBUF_MAPSIZE range is
 * covered, call this for both ends:
 *	dump_set_used(base)
 *	dump_set_used(base+CBUF_MAPNP-1)
 *
 * This is used during a panic dump to mark pages allocated by
 * dumpsys_get_maxmem(). The macro IS_DUMP_PAGE(pp) is used by
 * page_get_mnode_freelist() to make sure pages used by dump are never
 * allocated.
 */
#define	CBUF_MAPP2R(pfn)	((pfn) >> (CBUF_MAPSHIFT - PAGESHIFT))

static void
dump_set_used(pfn_t pfn)
{
	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	BT_SET(dumpcfg.rbitmap, rbitnum);
}

int
dump_test_used(pfn_t pfn)
{
	pgcnt_t bitnum, rbitnum;

	bitnum = dump_pfn_to_bitnum(pfn);
	ASSERT(bitnum != (pgcnt_t)-1);

	rbitnum = CBUF_MAPP2R(bitnum);
	ASSERT(rbitnum < dumpcfg.rbitmapsize);

	return (BT_TEST(dumpcfg.rbitmap, rbitnum));
}

/*
 * dumpbzalloc and dumpbzfree are callbacks from the bzip2 library.
 * dumpsys_get_maxmem() uses them for BZ2_bzCompressInit().
 */
static void *
dumpbzalloc(void *opaque, int items, int size)
{
	size_t *sz;
	char *ret;

	ASSERT(opaque != NULL);
	sz = opaque;
	ret = dumpcfg.maxvm + *sz;
	*sz += items * size;
	*sz = P2ROUNDUP(*sz, BZ2_BZALLOC_ALIGN);
	ASSERT(*sz <= dumpcfg.maxvmsize);
	return (ret);
}

/*ARGSUSED*/
static void
dumpbzfree(void *opaque, void *addr)
{
}

/*
 * Perform additional checks on the page to see if we can really use
 * it. The kernel (kas) pages are always set in the bitmap. However,
 * boot memory pages (prom_ppages or P_BOOTPAGES) are not in the
 * bitmap. So we check for them.
 */
static inline int
dump_pfn_check(pfn_t pfn)
{
	page_t *pp = page_numtopp_nolock(pfn);

	if (pp == NULL || pp->p_pagenum != pfn ||
#if defined(__sparc)
	    pp->p_vnode == &promvp ||
#else
	    PP_ISBOOTPAGES(pp) ||
#endif
	    pp->p_toxic != 0)
		return (0);
	return (1);
}
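
/*
 * Illustrative sketch of how the bzip2 callbacks above are wired up
 * (this is what dumpsys_get_maxmem() does below): the opaque pointer
 * holds a running byte offset into the reserved VA, so each bzalloc
 * call carves the next aligned chunk out of dumpcfg.maxvm:
 *
 *	size_t sz = 0;
 *	hp->bzstream.opaque = &sz;
 *	hp->bzstream.bzalloc = dumpbzalloc;
 *	hp->bzstream.bzfree = dumpbzfree;
 *	(void) BZ2_bzCompressInit(&hp->bzstream, dump_bzip2_level, 0, 0);
 *	// sz is now the number of bytes consumed from maxvm
 */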

/*
 * Check a range to see if all contained pages are available and
 * return non-zero if the range can be used.
 */
static inline int
dump_range_check(pgcnt_t start, pgcnt_t end, pfn_t pfn)
{
	for (; start < end; start++, pfn++) {
		if (BT_TEST(dumpcfg.bitmap, start))
			return (0);
		if (!dump_pfn_check(pfn))
			return (0);
	}
	return (1);
}

/*
 * dumpsys_get_maxmem() is called during panic. Find unused ranges
 * and use them for buffers. If we find enough memory switch to
 * parallel bzip2, otherwise use parallel lzjb.
 *
 * It searches the dump bitmap in 2 passes. The first time it looks
 * for CBUF_MAPSIZE ranges. On the second pass it uses small pages.
 */
static void
dumpsys_get_maxmem()
{
	dumpcfg_t *cfg = &dumpcfg;
	cbuf_t *endcp = &cfg->cbuf[cfg->ncbuf];
	helper_t *endhp = &cfg->helper[cfg->nhelper];
	pgcnt_t bitnum, end;
	size_t sz, endsz, bz2size;
	pfn_t pfn, off;
	cbuf_t *cp;
	helper_t *hp, *ohp;
	dumpmlw_t mlw;
	int k;

	if (cfg->maxsize == 0 || cfg->clevel < DUMP_CLEVEL_LZJB ||
	    (dump_conflags & DUMP_ALL) != 0) {
		if (cfg->clevel > DUMP_CLEVEL_LZJB)
			cfg->clevel = DUMP_CLEVEL_LZJB;
		return;
	}

	sz = 0;
	cfg->found4m = 0;
	cfg->foundsm = 0;

	/* bitmap of ranges used to estimate which pfns are being used */
	bzero(dumpcfg.rbitmap, BT_SIZEOFMAP(dumpcfg.rbitmapsize));

	/* find ranges that are not being dumped to use for buffers */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* skip partial range at end of mem segment */
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
			continue;
		}

		/* skip non-aligned pages */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (off != 0) {
			end -= off;
			continue;
		}

		if (!dump_range_check(bitnum, end, pfn))
			continue;

		ASSERT((sz + CBUF_MAPSIZE) <= cfg->maxvmsize);
		hat_devload(kas.a_hat, cfg->maxvm + sz, CBUF_MAPSIZE, pfn,
		    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
		sz += CBUF_MAPSIZE;
		cfg->found4m++;

		/* set the bitmap for both ends to be sure to cover the range */
		dump_set_used(pfn);
		dump_set_used(pfn + CBUF_MAPNP - 1);

		if (sz >= cfg->maxsize)
			goto foundmax;
	}

	/* Add small pages if we can't find enough large pages. */
	dump_init_memlist_walker(&mlw);
	for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum = end) {
		dump_timeleft = dump_timeout;
		end = bitnum + CBUF_MAPNP;
		pfn = dump_bitnum_to_pfn(bitnum, &mlw);
		ASSERT(pfn != PFN_INVALID);

		/* Find any non-aligned pages at start and end of segment. */
		off = P2PHASE(pfn, CBUF_MAPNP);
		if (mlw.mpleft < CBUF_MAPNP) {
			end = bitnum + mlw.mpleft;
		} else if (off != 0) {
			end -= off;
		} else if (cfg->found4m && dump_test_used(pfn)) {
			continue;
		}

		for (; bitnum < end; bitnum++, pfn++) {
			dump_timeleft = dump_timeout;
			if (BT_TEST(dumpcfg.bitmap, bitnum))
				continue;
			if (!dump_pfn_check(pfn))
				continue;
			ASSERT((sz + PAGESIZE) <= cfg->maxvmsize);
			hat_devload(kas.a_hat, cfg->maxvm + sz, PAGESIZE, pfn,
			    PROT_READ | PROT_WRITE, HAT_LOAD_NOCONSIST);
			sz += PAGESIZE;
			cfg->foundsm++;
			dump_set_used(pfn);
			if (sz >= cfg->maxsize)
				goto foundmax;
		}
	}

	/* Fall back to lzjb if we did not get enough memory for bzip2. */
	endsz = (cfg->maxsize * cfg->threshold) / cfg->nhelper;
	if (sz < endsz) {
		cfg->clevel = DUMP_CLEVEL_LZJB;
	}

	/* Allocate memory for as many helpers as we can. */
foundmax:

	/* Byte offsets into memory found and mapped above */
	endsz = sz;
	sz = 0;

	/* Set the size for bzip2 state. Only bzip2 needs it. */
	bz2size = BZ2_bzCompressInitSize(dump_bzip2_level);

	/* Skip the preallocated output buffers. */
	cp = &cfg->cbuf[MINCBUFS];

	/* Use this to move memory up from the preallocated helpers. */
	ohp = cfg->helper;

	/* Loop over all helpers and allocate memory. */
	for (hp = cfg->helper; hp < endhp; hp++) {

		/* Skip preallocated helpers by checking hp->page. */
		if (hp->page == NULL) {
			if (cfg->clevel <= DUMP_CLEVEL_LZJB) {
				/* lzjb needs 2 1-page buffers */
				if ((sz + (2 * PAGESIZE)) > endsz)
					break;
				hp->page = cfg->maxvm + sz;
				sz += PAGESIZE;
				hp->lzbuf = cfg->maxvm + sz;
				sz += PAGESIZE;

			} else if (ohp->lzbuf != NULL) {
				/* re-use the preallocated lzjb page for bzip2 */
				hp->page = ohp->lzbuf;
				ohp->lzbuf = NULL;
				++ohp;

			} else {
				/* bzip2 needs a 1-page buffer */
				if ((sz + PAGESIZE) > endsz)
					break;
				hp->page = cfg->maxvm + sz;
				sz += PAGESIZE;
			}
		}

		/*
		 * Add output buffers per helper. The number of
		 * buffers per helper is determined by the ratio of
		 * ncbuf to nhelper.
		 */
		for (k = 0; cp < endcp && (sz + CBUF_SIZE) <= endsz &&
		    k < NCBUF_PER_HELPER; k++) {
			cp->state = CBUF_FREEBUF;
			cp->size = CBUF_SIZE;
			cp->buf = cfg->maxvm + sz;
			sz += CBUF_SIZE;
			++cp;
		}

		/*
		 * bzip2 needs compression state. Use the dumpbzalloc
		 * and dumpbzfree callbacks to allocate the memory.
		 * bzip2 does allocation only at init time.
		 */
		if (cfg->clevel >= DUMP_CLEVEL_BZIP2) {
			if ((sz + bz2size) > endsz) {
				hp->page = NULL;
				break;
			} else {
				hp->bzstream.opaque = &sz;
				hp->bzstream.bzalloc = dumpbzalloc;
				hp->bzstream.bzfree = dumpbzfree;
				(void) BZ2_bzCompressInit(&hp->bzstream,
				    dump_bzip2_level, 0, 0);
				hp->bzstream.opaque = NULL;
			}
		}
	}

	/* Finish allocating output buffers */
	for (; cp < endcp && (sz + CBUF_SIZE) <= endsz; cp++) {
		cp->state = CBUF_FREEBUF;
		cp->size = CBUF_SIZE;
		cp->buf = cfg->maxvm + sz;
		sz += CBUF_SIZE;
	}

	/* Enable IS_DUMP_PAGE macro, which checks for pages we took. */
	if (cfg->found4m || cfg->foundsm)
		dump_check_used = 1;

	ASSERT(sz <= endsz);
}

static void
dumphdr_init(void)
{
	pgcnt_t npages = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	if (dumphdr == NULL) {
		dumphdr = kmem_zalloc(sizeof (dumphdr_t), KM_SLEEP);
		dumphdr->dump_magic = DUMP_MAGIC;
		dumphdr->dump_version = DUMP_VERSION;
		dumphdr->dump_wordsize = DUMP_WORDSIZE;
		dumphdr->dump_pageshift = PAGESHIFT;
		dumphdr->dump_pagesize = PAGESIZE;
		dumphdr->dump_utsname = utsname;
		(void) strcpy(dumphdr->dump_platform, platform);
		dumpbuf.size = dumpbuf_iosize(maxphys);
		dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP);
		dumpbuf.end = dumpbuf.start + dumpbuf.size;
		dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP);
		dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP);
		LOCK_INIT_HELD(&dumpcfg.helper_lock);
	}

	npages = num_phys_pages();

	if (dumpcfg.bitmapsize != npages) {
		size_t rlen = CBUF_MAPP2R(P2ROUNDUP(npages, CBUF_MAPNP));
		void *map = kmem_alloc(BT_SIZEOFMAP(npages), KM_SLEEP);
		void *rmap = kmem_alloc(BT_SIZEOFMAP(rlen), KM_SLEEP);

		if (dumpcfg.bitmap != NULL)
			kmem_free(dumpcfg.bitmap,
			    BT_SIZEOFMAP(dumpcfg.bitmapsize));
		if (dumpcfg.rbitmap != NULL)
			kmem_free(dumpcfg.rbitmap,
			    BT_SIZEOFMAP(dumpcfg.rbitmapsize));
		dumpcfg.bitmap = map;
		dumpcfg.bitmapsize = npages;
		dumpcfg.rbitmap = rmap;
		dumpcfg.rbitmapsize = rlen;
	}
}

/*
 * Establish a new dump device.
 */
int
dumpinit(vnode_t *vp, char *name, int justchecking)
{
	vnode_t *cvp;
	vattr_t vattr;
	vnode_t *cdev_vp;
	int error = 0;

	ASSERT(MUTEX_HELD(&dump_lock));

	dumphdr_init();

	cvp = common_specvp(vp);
	if (cvp == dumpvp)
		return (0);

	/*
	 * Determine whether this is a plausible dump device. We want either:
	 * (1) a real device that's not mounted and has a cb_dump routine, or
	 * (2) a swapfile on some filesystem that has a vop_dump routine.
	 */
	if ((error = VOP_OPEN(&cvp, FREAD | FWRITE, kcred, NULL)) != 0)
		return (error);

	vattr.va_mask = AT_SIZE | AT_TYPE | AT_RDEV;
	if ((error = VOP_GETATTR(cvp, &vattr, 0, kcred, NULL)) == 0) {
		if (vattr.va_type == VBLK || vattr.va_type == VCHR) {
			if (devopsp[getmajor(vattr.va_rdev)]->
			    devo_cb_ops->cb_dump == nodev)
				error = ENOTSUP;
			else if (vfs_devismounted(vattr.va_rdev))
				error = EBUSY;
			if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
			    ZFS_DRIVER) == 0 &&
			    IS_SWAPVP(common_specvp(cvp)))
				error = EBUSY;
		} else {
			if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
			    !IS_SWAPVP(cvp))
				error = ENOTSUP;
		}
	}

	if (error == 0 && vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE)
		error = ENOSPC;

	if (error || justchecking) {
		(void) VOP_CLOSE(cvp, FREAD | FWRITE, 1, (offset_t)0,
		    kcred, NULL);
		return (error);
	}

	VN_HOLD(cvp);

	if (dumpvp != NULL)
		dumpfini();	/* unconfigure the old dump device */

	dumpvp = cvp;
	dumpvp_size = vattr.va_size & -DUMP_OFFSET;
	dumppath = kmem_alloc(strlen(name) + 1, KM_SLEEP);
	(void) strcpy(dumppath, name);
	dumpbuf.iosize = 0;

	/*
	 * If the dump device is a block device, attempt to open up the
	 * corresponding character device and determine its maximum transfer
	 * size. We use this information to potentially resize dumpbuf to a
	 * larger and more optimal size for performing i/o to the dump device.
	 */
	if (cvp->v_type == VBLK &&
	    (cdev_vp = makespecvp(VTOS(cvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			size_t blk_size;
			struct dk_cinfo dki;
			struct dk_minfo minf;

			if (VOP_IOCTL(cdev_vp, DKIOCGMEDIAINFO,
			    (intptr_t)&minf, FKIOCTL, kcred, NULL, NULL)
			    == 0 && minf.dki_lbsize != 0)
				blk_size = minf.dki_lbsize;
			else
				blk_size = DEV_BSIZE;

			if (VOP_IOCTL(cdev_vp, DKIOCINFO, (intptr_t)&dki,
			    FKIOCTL, kcred, NULL, NULL) == 0) {
				dumpbuf.iosize = dki.dki_maxtransfer * blk_size;
				dumpbuf_resize();
			}
			/*
			 * If we are working with a zvol then dumpify it
			 * if it's not being used as swap.
			 */
			if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
				if (IS_SWAPVP(common_specvp(cvp)))
					error = EBUSY;
				else if ((error = VOP_IOCTL(cdev_vp,
				    DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
				    NULL, NULL)) != 0)
					dumpfini();
			}

			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}

		VN_RELE(cdev_vp);
	}

	cmn_err(CE_CONT, "?dump on %s size %llu MB\n", name, dumpvp_size >> 20);

	dump_update_clevel();

	return (error);
}

void
dumpfini(void)
{
	vattr_t vattr;
	boolean_t is_zfs = B_FALSE;
	vnode_t *cdev_vp;
	ASSERT(MUTEX_HELD(&dump_lock));

	kmem_free(dumppath, strlen(dumppath) + 1);

	/*
	 * Determine if we are using zvols for our dump device
	 */
	vattr.va_mask = AT_RDEV;
	if (VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL) == 0) {
		is_zfs = (getmajor(vattr.va_rdev) ==
		    ddi_name_to_major(ZFS_DRIVER)) ? B_TRUE : B_FALSE;
	}

	/*
	 * If we have a zvol dump device then we call into zfs so
	 * that it may have a chance to cleanup.
	 */
	if (is_zfs &&
	    (cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR)) != NULL) {
		if (VOP_OPEN(&cdev_vp, FREAD | FWRITE, kcred, NULL) == 0) {
			(void) VOP_IOCTL(cdev_vp, DKIOCDUMPFINI, NULL, FKIOCTL,
			    kcred, NULL, NULL);
			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
			    kcred, NULL);
		}
		VN_RELE(cdev_vp);
	}

	(void) VOP_CLOSE(dumpvp, FREAD | FWRITE, 1, (offset_t)0, kcred, NULL);

	VN_RELE(dumpvp);

	dumpvp = NULL;
	dumpvp_size = 0;
	dumppath = NULL;
}

static offset_t
dumpvp_flush(void)
{
	size_t size = P2ROUNDUP(dumpbuf.cur - dumpbuf.start, PAGESIZE);
	hrtime_t iotime;
	int err;

	if (dumpbuf.vp_off + size > dumpbuf.vp_limit) {
		dump_ioerr = ENOSPC;
		dumpbuf.vp_off = dumpbuf.vp_limit;
	} else if (size != 0) {
		iotime = gethrtime();
		dumpsync.iowait += iotime - dumpsync.iowaitts;
		if (panicstr)
			err = VOP_DUMP(dumpvp, dumpbuf.start,
			    lbtodb(dumpbuf.vp_off), btod(size), NULL);
		else
			err = vn_rdwr(UIO_WRITE, dumpbuf.cdev_vp != NULL ?
			    dumpbuf.cdev_vp : dumpvp, dumpbuf.start, size,
			    dumpbuf.vp_off, UIO_SYSSPACE, 0, dumpbuf.vp_limit,
			    kcred, 0);
		if (err && dump_ioerr == 0)
			dump_ioerr = err;
		dumpsync.iowaitts = gethrtime();
		dumpsync.iotime += dumpsync.iowaitts - iotime;
		dumpsync.nwrite += size;
		dumpbuf.vp_off += size;
	}
	dumpbuf.cur = dumpbuf.start;
	dump_timeleft = dump_timeout;
	return (dumpbuf.vp_off);
}

/* maximize write speed by keeping seek offset aligned with size */
void
dumpvp_write(const void *va, size_t size)
{
	size_t len, off, sz;

	while (size != 0) {
		len = MIN(size, dumpbuf.end - dumpbuf.cur);
		if (len == 0) {
			off = P2PHASE(dumpbuf.vp_off, dumpbuf.size);
			if (off == 0 || !ISP2(dumpbuf.size)) {
				(void) dumpvp_flush();
			} else {
				sz = dumpbuf.size - off;
				dumpbuf.cur = dumpbuf.start + sz;
				(void) dumpvp_flush();
				ovbcopy(dumpbuf.start + sz, dumpbuf.start, off);
				dumpbuf.cur += off;
			}
		} else {
			bcopy(va, dumpbuf.cur, len);
			va = (char *)va + len;
			dumpbuf.cur += len;
			size -= len;
		}
	}
}

/*ARGSUSED*/
static void
dumpvp_ksyms_write(const void *src, void *dst, size_t size)
{
	dumpvp_write(src, size);
}

/*
 * Mark 'pfn' in the bitmap and dump its translation table entry.
 */
void
dump_addpage(struct as *as, void *va, pfn_t pfn)
{
	mem_vtop_t mem_vtop;
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
		dumphdr->dump_nvtop++;
		mem_vtop.m_as = as;
		mem_vtop.m_va = va;
		mem_vtop.m_pfn = pfn;
		dumpvp_write(&mem_vtop, sizeof (mem_vtop_t));
	}
	dump_timeleft = dump_timeout;
}

/*
 * Mark 'pfn' in the bitmap
 */
void
dump_page(pfn_t pfn)
{
	pgcnt_t bitnum;

	if ((bitnum = dump_pfn_to_bitnum(pfn)) != (pgcnt_t)-1) {
		if (!BT_TEST(dumpcfg.bitmap, bitnum)) {
			dumphdr->dump_npages++;
			BT_SET(dumpcfg.bitmap, bitnum);
		}
	}
	dump_timeleft = dump_timeout;
}
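
/*
 * Worked example of the alignment logic in dumpvp_write() above
 * (assumes dumpbuf.size is 1MB): if vp_off sits at 1MB + 256KB when
 * the buffer fills, only the first 768KB is flushed, bringing the
 * device offset to the next 1MB boundary; the remaining 256KB is slid
 * to the front of the buffer with ovbcopy() so subsequent flushes
 * stay size-aligned on the device.
 */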

/*
 * Dump the <as, va, pfn> information for a given address space.
 * SEGOP_DUMP() will call dump_addpage() for each page in the segment.
 */
static void
dump_as(struct as *as)
{
	struct seg *seg;

	AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
	for (seg = AS_SEGFIRST(as); seg; seg = AS_SEGNEXT(as, seg)) {
		if (seg->s_as != as)
			break;
		if (seg->s_ops == NULL)
			continue;
		SEGOP_DUMP(seg);
	}
	AS_LOCK_EXIT(as, &as->a_lock);

	if (seg != NULL)
		cmn_err(CE_WARN, "invalid segment %p in address space %p",
		    (void *)seg, (void *)as);
}

static int
dump_process(pid_t pid)
{
	proc_t *p = sprlock(pid);

	if (p == NULL)
		return (-1);
	if (p->p_as != &kas) {
		mutex_exit(&p->p_lock);
		dump_as(p->p_as);
		mutex_enter(&p->p_lock);
	}

	sprunlock(p);

	return (0);
}

void
dump_ereports(void)
{
	u_offset_t dumpvp_start;
	erpt_dump_t ed;

	if (dumpvp == NULL || dumphdr == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - (DUMP_OFFSET + DUMP_LOGSIZE);
	dumpvp_start = dumpbuf.vp_limit - DUMP_ERPTSIZE;
	dumpbuf.vp_off = dumpvp_start;

	fm_ereport_dump();
	if (panicstr)
		errorq_dump();

	bzero(&ed, sizeof (ed));	/* indicate end of ereports */
	dumpvp_write(&ed, sizeof (ed));
	(void) dumpvp_flush();

	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

void
dump_messages(void)
{
	log_dump_t ld;
	mblk_t *mctl, *mdata;
	queue_t *q, *qlast;
	u_offset_t dumpvp_start;

	if (dumpvp == NULL || dumphdr == NULL || log_consq == NULL)
		return;

	dumpbuf.cur = dumpbuf.start;
	dumpbuf.vp_limit = dumpvp_size - DUMP_OFFSET;
	dumpvp_start = dumpbuf.vp_limit - DUMP_LOGSIZE;
	dumpbuf.vp_off = dumpvp_start;

	qlast = NULL;
	do {
		for (q = log_consq; q->q_next != qlast; q = q->q_next)
			continue;
		for (mctl = q->q_first; mctl != NULL; mctl = mctl->b_next) {
			dump_timeleft = dump_timeout;
			mdata = mctl->b_cont;
			ld.ld_magic = LOG_MAGIC;
			ld.ld_msgsize = MBLKL(mctl->b_cont);
			ld.ld_csum = checksum32(mctl->b_rptr, MBLKL(mctl));
			ld.ld_msum = checksum32(mdata->b_rptr, MBLKL(mdata));
			dumpvp_write(&ld, sizeof (ld));
			dumpvp_write(mctl->b_rptr, MBLKL(mctl));
			dumpvp_write(mdata->b_rptr, MBLKL(mdata));
		}
	} while ((qlast = q) != log_consq);

	ld.ld_magic = 0;		/* indicate end of messages */
	dumpvp_write(&ld, sizeof (ld));
	(void) dumpvp_flush();
	if (!panicstr) {
		(void) VOP_PUTPAGE(dumpvp, dumpvp_start,
		    (size_t)(dumpbuf.vp_off - dumpvp_start),
		    B_INVAL | B_FORCE, kcred, NULL);
	}
}

/*
 * The following functions are called on multiple CPUs during dump.
 * They must not use most kernel services, because all cross-calls are
 * disabled during panic. Therefore, blocking locks and cache flushes
 * will not work.
 */

/*
 * Copy pages, trapping ECC errors. Also, for robustness, trap data
 * access in case something goes wrong in the hat layer and the
 * mapping is broken.
 */
static int
dump_pagecopy(void *src, void *dst)
{
	long *wsrc = (long *)src;
	long *wdst = (long *)dst;
	const ulong_t ncopies = PAGESIZE / sizeof (long);
	volatile int w = 0;
	volatile int ueoff = -1;
	on_trap_data_t otd;

	if (on_trap(&otd, OT_DATA_EC | OT_DATA_ACCESS)) {
		if (ueoff == -1)
			ueoff = w * sizeof (long);
		/* report "bad ECC" or "bad address" */
#ifdef _LP64
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc00badecc;
		else
			wdst[w++] = 0x00badadd00badadd;
#else
		if (otd.ot_trap & OT_DATA_EC)
			wdst[w++] = 0x00badecc;
		else
			wdst[w++] = 0x00badadd;
#endif
	}
	while (w < ncopies) {
		wdst[w] = wsrc[w];
		w++;
	}
	no_trap();
	return (ueoff);
}

static void
dumpsys_close_cq(cqueue_t *cq, int live)
{
	if (live) {
		mutex_enter(&cq->mutex);
		atomic_dec_uint(&cq->open);
		cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		atomic_dec_uint(&cq->open);
	}
}

static inline void
dumpsys_spinlock(lock_t *lp)
{
	uint_t backoff = 0;
	int loop_count = 0;

	while (LOCK_HELD(lp) || !lock_spin_try(lp)) {
		if (++loop_count >= ncpus) {
			backoff = mutex_lock_backoff(0);
			loop_count = 0;
		} else {
			backoff = mutex_lock_backoff(backoff);
		}
		mutex_lock_delay(backoff);
	}
}

static inline void
dumpsys_spinunlock(lock_t *lp)
{
	lock_clear(lp);
}

static inline void
dumpsys_lock(cqueue_t *cq, int live)
{
	if (live)
		mutex_enter(&cq->mutex);
	else
		dumpsys_spinlock(&cq->spinlock);
}

static inline void
dumpsys_unlock(cqueue_t *cq, int live, int signal)
{
	if (live) {
		if (signal)
			cv_signal(&cq->cv);
		mutex_exit(&cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
	}
}

static void
dumpsys_wait_cq(cqueue_t *cq, int live)
{
	if (live) {
		cv_wait(&cq->cv, &cq->mutex);
	} else {
		dumpsys_spinunlock(&cq->spinlock);
		while (cq->open)
			if (cq->first)
				break;
		dumpsys_spinlock(&cq->spinlock);
	}
}

static void
dumpsys_put_cq(cqueue_t *cq, cbuf_t *cp, int newstate, int live)
{
	if (cp == NULL)
		return;

	dumpsys_lock(cq, live);

	if (cq->ts != 0) {
		cq->empty += gethrtime() - cq->ts;
		cq->ts = 0;
	}

	cp->state = newstate;
	cp->next = NULL;
	if (cq->last == NULL)
		cq->first = cp;
	else
		cq->last->next = cp;
	cq->last = cp;

	dumpsys_unlock(cq, live, 1);
}

static cbuf_t *
dumpsys_get_cq(cqueue_t *cq, int live)
{
	cbuf_t *cp;
	hrtime_t now = gethrtime();

	dumpsys_lock(cq, live);

	/* CONSTCOND */
	while (1) {
		cp = (cbuf_t *)cq->first;
		if (cp == NULL) {
			if (cq->open == 0)
				break;
			dumpsys_wait_cq(cq, live);
			continue;
		}
		cq->first = cp->next;
		if (cq->first == NULL) {
			cq->last = NULL;
			cq->ts = now;
		}
		break;
	}

	dumpsys_unlock(cq, live, cq->first != NULL || cq->open == 0);
	return (cp);
}

/*
 * Send an error message to the console. If the main task is running,
 * just write the message via uprintf. If a helper is running, the
 * message has to be put on a queue for the main task. Setting fmt to
 * NULL means flush the error message buffer. If fmt is not NULL, just
 * add the text to the existing buffer.
 */
static void
dumpsys_errmsg(helper_t *hp, const char *fmt, ...)
{
	dumpsync_t *ds = hp->ds;
	cbuf_t *cp = hp->cperr;
	va_list adx;

	if (hp->helper == MAINHELPER) {
		if (fmt != NULL) {
			if (ds->neednl) {
				uprintf("\n");
				ds->neednl = 0;
			}
			va_start(adx, fmt);
			vuprintf(fmt, adx);
			va_end(adx);
		}
	} else if (fmt == NULL) {
		if (cp != NULL) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	} else {
		if (hp->cperr == NULL) {
			cp = CQ_GET(freebufq);
			hp->cperr = cp;
			cp->used = 0;
		}
		va_start(adx, fmt);
		cp->used += vsnprintf(cp->buf + cp->used, cp->size - cp->used,
		    fmt, adx);
		va_end(adx);
		if ((cp->used + LOG_MSGSIZE) > cp->size) {
			CQ_PUT(mainq, cp, CBUF_ERRMSG);
			hp->cperr = NULL;
		}
	}
}

/*
 * Write an output buffer to the dump file. If the main task is
 * running just write the data. If a helper is running the output is
 * placed on a queue for the main task.
 */
static void
dumpsys_swrite(helper_t *hp, cbuf_t *cp, size_t used)
{
	dumpsync_t *ds = hp->ds;

	if (hp->helper == MAINHELPER) {
		HRSTART(ds->perpage, write);
		dumpvp_write(cp->buf, used);
		HRSTOP(ds->perpage, write);
		CQ_PUT(freebufq, cp, CBUF_FREEBUF);
	} else {
		cp->used = used;
		CQ_PUT(mainq, cp, CBUF_WRITE);
	}
}

/*
 * Copy one page within the mapped range. The offset starts at 0 and
 * is relative to the first pfn. cp->buf + cp->off is the address of
 * the first pfn. If dump_pagecopy returns a UE offset, create an
 * error message. Returns the offset to the next pfn in the range
 * selected by the bitmap.
 */
static int
dumpsys_copy_page(helper_t *hp, int offset)
{
	cbuf_t *cp = hp->cpin;
	int ueoff;

	ASSERT(cp->off + offset + PAGESIZE <= cp->size);
	ASSERT(BT_TEST(dumpcfg.bitmap, cp->bitnum));

	ueoff = dump_pagecopy(cp->buf + cp->off + offset, hp->page);

	/* ueoff is the offset in the page to a UE error */
	if (ueoff != -1) {
		uint64_t pa = ptob(cp->pfn) + offset + ueoff;

		dumpsys_errmsg(hp, "cpu %d: memory error at PA 0x%08x.%08x\n",
		    CPU->cpu_id, (uint32_t)(pa >> 32), (uint32_t)pa);
	}

	/*
	 * Advance bitnum and offset to the next input page for the
	 * next call to this function.
	 */
	offset += PAGESIZE;
	cp->bitnum++;
	while (cp->off + offset < cp->size) {
		if (BT_TEST(dumpcfg.bitmap, cp->bitnum))
			break;
		offset += PAGESIZE;
		cp->bitnum++;
	}

	return (offset);
}
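
/*
 * Worked example of the advance above: if the bitmap selects pages 0,
 * 1, and 3 of a mapped range (page 2 is not being dumped), the first
 * call copies page 0 and returns PAGESIZE; the second copies page 1
 * and returns 3 * PAGESIZE, skipping the unset page; the third copies
 * page 3.
 */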

/*
 * Read the helper queue, and copy one mapped page. Return 0 when
 * done. Return 1 when a page has been copied into hp->page.
 */
static int
dumpsys_sread(helper_t *hp)
{
	dumpsync_t *ds = hp->ds;

	/* CONSTCOND */
	while (1) {

		/* Find the next input buffer. */
		if (hp->cpin == NULL) {
			HRSTART(hp->perpage, inwait);

			/* CONSTCOND */
			while (1) {
				hp->cpin = CQ_GET(helperq);
				dump_timeleft = dump_timeout;

				/*
				 * NULL return means the helper queue
				 * is closed and empty.
				 */
				if (hp->cpin == NULL)
					break;

				/* Have input, check for dump I/O error. */
				if (!dump_ioerr)
					break;

				/*
				 * If an I/O error occurs, stay in the
				 * loop in order to empty the helper
				 * queue. Return the buffers to the
				 * main task to unmap and free them.
				 */
				hp->cpin->used = 0;
				CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			}
			HRSTOP(hp->perpage, inwait);

			/* Stop here when the helper queue is closed. */
			if (hp->cpin == NULL)
				break;

			/* Set the offset=0 to get the first pfn. */
			hp->in = 0;

			/* Set the total processed to 0 */
			hp->used = 0;
		}

		/* Process the next page. */
		if (hp->used < hp->cpin->used) {

			/*
			 * Get the next page from the input buffer and
			 * return a copy.
			 */
			ASSERT(hp->in != -1);
			HRSTART(hp->perpage, copy);
			hp->in = dumpsys_copy_page(hp, hp->in);
			hp->used += PAGESIZE;
			HRSTOP(hp->perpage, copy);
			break;

		} else {

			/*
			 * Done with the input. Flush the VM and
			 * return the buffer to the main task.
			 */
			if (panicstr && hp->helper != MAINHELPER)
				hat_flush_range(kas.a_hat,
				    hp->cpin->buf, hp->cpin->size);
			dumpsys_errmsg(hp, NULL);
			CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP);
			hp->cpin = NULL;
		}
	}

	return (hp->cpin != NULL);
}
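
/*
 * Illustrative layout of the stream blocks emitted by the compressors
 * below (a sketch derived from the DUMP_SET_TAG calls in
 * dumpsys_bzrun() and dumpsys_lzjbrun()): each block written to the
 * dump device begins with a dumpcsize_t word carrying the helper's
 * stream tag and the payload size, so savecore can demultiplex blocks
 * from interleaved streams:
 *
 *	+--------------------------+---------------------------+
 *	| DUMP_SET_TAG(size, tag)  | size bytes of compressed   |
 *	| (dumpcsize_t)            | stream data                |
 *	+--------------------------+---------------------------+
 */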
*/ 1926 (void) BZ2_bzCompressReset(&hp->bzstream); 1927 1928 /* Give any unused output buffer to the main task */ 1929 if (hp->cpout != NULL) { 1930 hp->cpout->used = 0; 1931 CQ_PUT(mainq, hp->cpout, CBUF_ERRMSG); 1932 hp->cpout = NULL; 1933 } 1934 } 1935 } 1936 1937 static void 1938 dumpsys_bz2compress(helper_t *hp) 1939 { 1940 dumpsync_t *ds = hp->ds; 1941 dumpstreamhdr_t sh; 1942 1943 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 1944 sh.stream_pagenum = (pgcnt_t)-1; 1945 sh.stream_npages = 0; 1946 hp->cpin = NULL; 1947 hp->cpout = NULL; 1948 hp->cperr = NULL; 1949 hp->in = 0; 1950 hp->out = 0; 1951 hp->bzstream.avail_in = 0; 1952 1953 /* Bump reference to mainq while we are running */ 1954 CQ_OPEN(mainq); 1955 1956 /* Get one page at a time */ 1957 while (dumpsys_sread(hp)) { 1958 if (sh.stream_pagenum != hp->cpin->pagenum) { 1959 sh.stream_pagenum = hp->cpin->pagenum; 1960 sh.stream_npages = btop(hp->cpin->used); 1961 dumpsys_bzrun(hp, &sh, sizeof (sh), BZ_RUN); 1962 } 1963 dumpsys_bzrun(hp, hp->page, PAGESIZE, BZ_RUN); 1964 } 1965 1966 /* Done with input, flush any partial buffer */ 1967 if (sh.stream_pagenum != (pgcnt_t)-1) { 1968 dumpsys_bzrun(hp, NULL, 0, BZ_FINISH); 1969 dumpsys_errmsg(hp, NULL); 1970 } 1971 1972 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 1973 1974 /* Decrement main queue count, we are done */ 1975 CQ_CLOSE(mainq); 1976 } 1977 1978 /* 1979 * Compress with lzjb. 1980 * Write out the stream block when it is full or when size == 0. 1981 * If csize == 0, buf holds a stream header; otherwise write <csize, data>. 1982 * A call with size == 0 flushes the current buffer. 1983 * hp->cpout is the buffer we are flushing or filling. 1984 * hp->out is the next index at which data is stored. 1985 * osize is either csize+data, or the size of a stream header. 1986 */ 1987 static void 1988 dumpsys_lzjbrun(helper_t *hp, size_t csize, void *buf, size_t size) 1989 { 1990 dumpsync_t *ds = hp->ds; 1991 const int CSIZE = sizeof (dumpcsize_t); 1992 dumpcsize_t cs; 1993 size_t osize = csize > 0 ? CSIZE + size : size; 1994 1995 /* If flush, and there is no buffer, just return */ 1996 if (size == 0 && hp->cpout == NULL) 1997 return; 1998 1999 /* If flush, or cpout is full, write it out */ 2000 if (size == 0 || 2001 (hp->cpout != NULL && hp->out + osize > hp->cpout->size)) { 2002 2003 /* Set tag+size word at the front of the stream block. */ 2004 cs = DUMP_SET_TAG(hp->out - CSIZE, hp->tag); 2005 (void) memcpy(hp->cpout->buf, &cs, CSIZE); 2006 2007 /* Write block to dump file. */ 2008 dumpsys_swrite(hp, hp->cpout, hp->out); 2009 2010 /* Clear pointer to indicate we need a new buffer */ 2011 hp->cpout = NULL; 2012 2013 /* flushing, we are done */ 2014 if (size == 0) 2015 return; 2016 } 2017 2018 /* Get an output buffer if we don't have one. */ 2019 if (hp->cpout == NULL) { 2020 HRSTART(hp->perpage, outwait); 2021 hp->cpout = CQ_GET(freebufq); 2022 HRSTOP(hp->perpage, outwait); 2023 hp->out = CSIZE; 2024 } 2025 2026 /* Store csize word. This is the size of compressed data. */ 2027 if (csize > 0) { 2028 cs = DUMP_SET_TAG(csize, 0); 2029 (void) memcpy(hp->cpout->buf + hp->out, &cs, CSIZE); 2030 hp->out += CSIZE; 2031 } 2032 2033 /* Store the data.
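 * A filled stream block then has the layout sketched below (the
 * leading dumpcsize_t tag+size word is patched in by the flush path
 * above, just before dumpsys_swrite()):
 *
 *	+----------+-----------+-------+------+-------+------+--
 *	| tag+size | streamhdr | csize | data | csize | data | ...
 *	+----------+-----------+-------+------+-------+------+--
 *
 * where a raw dumpstreamhdr_t (written with csize == 0) typically
 * starts each new input range, followed by <csize, data> records
 * for the compressed pages.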
*/ 2034 (void) memcpy(hp->cpout->buf + hp->out, buf, size); 2035 hp->out += size; 2036 } 2037 2038 static void 2039 dumpsys_lzjbcompress(helper_t *hp) 2040 { 2041 dumpsync_t *ds = hp->ds; 2042 size_t csize; 2043 dumpstreamhdr_t sh; 2044 2045 (void) strcpy(sh.stream_magic, DUMP_STREAM_MAGIC); 2046 sh.stream_pagenum = (pgcnt_t)-1; 2047 sh.stream_npages = 0; 2048 hp->cpin = NULL; 2049 hp->cpout = NULL; 2050 hp->cperr = NULL; 2051 hp->in = 0; 2052 hp->out = 0; 2053 2054 /* Bump reference to mainq while we are running */ 2055 CQ_OPEN(mainq); 2056 2057 /* Get one page at a time */ 2058 while (dumpsys_sread(hp)) { 2059 2060 /* Create a stream header for each new input map */ 2061 if (sh.stream_pagenum != hp->cpin->pagenum) { 2062 sh.stream_pagenum = hp->cpin->pagenum; 2063 sh.stream_npages = btop(hp->cpin->used); 2064 dumpsys_lzjbrun(hp, 0, &sh, sizeof (sh)); 2065 } 2066 2067 /* Compress one page */ 2068 HRSTART(hp->perpage, compress); 2069 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2070 HRSTOP(hp->perpage, compress); 2071 2072 /* Add csize+data to output block */ 2073 ASSERT(csize > 0 && csize <= PAGESIZE); 2074 dumpsys_lzjbrun(hp, csize, hp->lzbuf, csize); 2075 } 2076 2077 /* Done with input, flush any partial buffer */ 2078 if (sh.stream_pagenum != (pgcnt_t)-1) { 2079 dumpsys_lzjbrun(hp, 0, NULL, 0); 2080 dumpsys_errmsg(hp, NULL); 2081 } 2082 2083 ASSERT(hp->cpin == NULL && hp->cpout == NULL && hp->cperr == NULL); 2084 2085 /* Decrement main queue count, we are done */ 2086 CQ_CLOSE(mainq); 2087 } 2088 2089 /* 2090 * Dump helper called from panic_idle() to compress pages. CPUs in 2091 * this path must not call most kernel services. 2092 * 2093 * During panic, all but one of the CPUs are idle. These CPUs are used 2094 * as helpers working in parallel to copy and compress memory 2095 * pages. During a panic, however, these processors cannot call any 2096 * kernel services. This is because mutexes become no-ops during 2097 * panic, and cross-call interrupts are inhibited. Therefore, during 2098 * panic dump the helper CPUs communicate with the panic CPU using 2099 * memory variables. All memory mapping and I/O is performed by the 2100 * panic CPU. 2101 * 2102 * At dump configuration time, helper_lock is set and helpers_wanted 2103 * is 0. dumpsys() decides whether to set helpers_wanted before 2104 * clearing helper_lock. 2105 * 2106 * At panic time, idle CPUs spin-wait on helper_lock, then take the 2107 * lock one at a time and either become a helper or return. 2108 */ 2109 void 2110 dumpsys_helper() 2111 { 2112 dumpsys_spinlock(&dumpcfg.helper_lock); 2113 if (dumpcfg.helpers_wanted) { 2114 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2115 2116 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2117 if (hp->helper == FREEHELPER) { 2118 hp->helper = CPU->cpu_id; 2119 BT_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2120 2121 dumpsys_spinunlock(&dumpcfg.helper_lock); 2122 2123 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2124 dumpsys_lzjbcompress(hp); 2125 else 2126 dumpsys_bz2compress(hp); 2127 2128 hp->helper = DONEHELPER; 2129 return; 2130 } 2131 } 2132 2133 /* No more helpers are needed. */ 2134 dumpcfg.helpers_wanted = 0; 2135 2136 } 2137 dumpsys_spinunlock(&dumpcfg.helper_lock); 2138 } 2139 2140 /* 2141 * No-wait helper callable in spin loops. 2142 * 2143 * Do not wait for helper_lock. Just check helpers_wanted. The caller 2144 * may decide to continue. This is the "c)ontinue, s)ync, r)eset? s" 2145 * case.
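 * Blocking on helper_lock here could hang a caller that is merely
 * polling, so only helpers_wanted is checked. A stale reading is
 * harmless: dumpsys_helper() re-checks the flag after taking the
 * lock.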
2146 */ 2147 void 2148 dumpsys_helper_nw() 2149 { 2150 if (dumpcfg.helpers_wanted) 2151 dumpsys_helper(); 2152 } 2153 2154 /* 2155 * Dump helper for live dumps. 2156 * These run as a system task. 2157 */ 2158 static void 2159 dumpsys_live_helper(void *arg) 2160 { 2161 helper_t *hp = arg; 2162 2163 BT_ATOMIC_SET(dumpcfg.helpermap, CPU->cpu_seqid); 2164 if (dumpcfg.clevel < DUMP_CLEVEL_BZIP2) 2165 dumpsys_lzjbcompress(hp); 2166 else 2167 dumpsys_bz2compress(hp); 2168 } 2169 2170 /* 2171 * Compress one page with lzjb (single threaded case) 2172 */ 2173 static void 2174 dumpsys_lzjb_page(helper_t *hp, cbuf_t *cp) 2175 { 2176 dumpsync_t *ds = hp->ds; 2177 uint32_t csize; 2178 2179 hp->helper = MAINHELPER; 2180 hp->in = 0; 2181 hp->used = 0; 2182 hp->cpin = cp; 2183 while (hp->used < cp->used) { 2184 HRSTART(hp->perpage, copy); 2185 hp->in = dumpsys_copy_page(hp, hp->in); 2186 hp->used += PAGESIZE; 2187 HRSTOP(hp->perpage, copy); 2188 2189 HRSTART(hp->perpage, compress); 2190 csize = compress(hp->page, hp->lzbuf, PAGESIZE); 2191 HRSTOP(hp->perpage, compress); 2192 2193 HRSTART(hp->perpage, write); 2194 dumpvp_write(&csize, sizeof (csize)); 2195 dumpvp_write(hp->lzbuf, csize); 2196 HRSTOP(hp->perpage, write); 2197 } 2198 CQ_PUT(mainq, hp->cpin, CBUF_USEDMAP); 2199 hp->cpin = NULL; 2200 } 2201 2202 /* 2203 * Main task to dump pages. This is called on the dump CPU. 2204 */ 2205 static void 2206 dumpsys_main_task(void *arg) 2207 { 2208 dumpsync_t *ds = arg; 2209 pgcnt_t pagenum = 0, bitnum = 0, hibitnum; 2210 dumpmlw_t mlw; 2211 cbuf_t *cp; 2212 pgcnt_t baseoff, pfnoff; 2213 pfn_t base, pfn; 2214 int sec; 2215 2216 dump_init_memlist_walker(&mlw); 2217 2218 /* CONSTCOND */ 2219 while (1) { 2220 2221 if (ds->percent > ds->percent_done) { 2222 ds->percent_done = ds->percent; 2223 sec = (gethrtime() - ds->start) / 1000 / 1000 / 1000; 2224 uprintf("^\r%2d:%02d %3d%% done", 2225 sec / 60, sec % 60, ds->percent); 2226 ds->neednl = 1; 2227 } 2228 2229 while (CQ_IS_EMPTY(mainq) && !CQ_IS_EMPTY(writerq)) { 2230 2231 /* the writerq never blocks */ 2232 cp = CQ_GET(writerq); 2233 if (cp == NULL) 2234 break; 2235 2236 dump_timeleft = dump_timeout; 2237 2238 HRSTART(ds->perpage, write); 2239 dumpvp_write(cp->buf, cp->used); 2240 HRSTOP(ds->perpage, write); 2241 2242 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2243 } 2244 2245 /* 2246 * Wait here for some buffers to process. Returns NULL 2247 * when all helpers have terminated and all buffers 2248 * have been processed. 2249 */ 2250 cp = CQ_GET(mainq); 2251 2252 if (cp == NULL) { 2253 2254 /* Drain the write queue. */ 2255 if (!CQ_IS_EMPTY(writerq)) 2256 continue; 2257 2258 /* Main task exits here. */ 2259 break; 2260 } 2261 2262 dump_timeleft = dump_timeout; 2263 2264 switch (cp->state) { 2265 2266 case CBUF_FREEMAP: 2267 2268 /* 2269 * Note that we drop CBUF_FREEMAP buffers on 2270 * the floor (they will not be on any cqueue) 2271 * when we no longer need them. 2272 */ 2273 if (bitnum >= dumpcfg.bitmapsize) 2274 break; 2275 2276 if (dump_ioerr) { 2277 bitnum = dumpcfg.bitmapsize; 2278 CQ_CLOSE(helperq); 2279 break; 2280 } 2281 2282 HRSTART(ds->perpage, bitmap); 2283 for (; bitnum < dumpcfg.bitmapsize; bitnum++) 2284 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2285 break; 2286 HRSTOP(ds->perpage, bitmap); 2287 dump_timeleft = dump_timeout; 2288 2289 if (bitnum >= dumpcfg.bitmapsize) { 2290 CQ_CLOSE(helperq); 2291 break; 2292 } 2293 2294 /* 2295 * Try to map CBUF_MAPSIZE ranges. Can't 2296 * assume that memory segment size is a 2297 * multiple of CBUF_MAPSIZE. 
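 * (Worked example, assuming 4 KB pages so that CBUF_MAPNP == 1024:
 * if a segment begins at pfn 1100 and the first marked pfn is 1500,
 * P2ALIGN yields base 1024, which is below the segment start, so
 * base is clipped to 1100 with baseoff == 76; the mapping then
 * covers pfns 1100 through 2047 and cp->size is ptob(1024 - 76).)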
Can't assume that 2298 * the segment starts on a CBUF_MAPSIZE 2299 * boundary. 2300 */ 2301 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2302 ASSERT(pfn != PFN_INVALID); 2303 ASSERT(bitnum + mlw.mpleft <= dumpcfg.bitmapsize); 2304 2305 base = P2ALIGN(pfn, CBUF_MAPNP); 2306 if (base < mlw.mpaddr) { 2307 base = mlw.mpaddr; 2308 baseoff = P2PHASE(base, CBUF_MAPNP); 2309 } else { 2310 baseoff = 0; 2311 } 2312 2313 pfnoff = pfn - base; 2314 if (pfnoff + mlw.mpleft < CBUF_MAPNP) { 2315 hibitnum = bitnum + mlw.mpleft; 2316 cp->size = ptob(pfnoff + mlw.mpleft); 2317 } else { 2318 hibitnum = bitnum - pfnoff + CBUF_MAPNP - 2319 baseoff; 2320 cp->size = CBUF_MAPSIZE - ptob(baseoff); 2321 } 2322 2323 cp->pfn = pfn; 2324 cp->bitnum = bitnum++; 2325 cp->pagenum = pagenum++; 2326 cp->off = ptob(pfnoff); 2327 2328 for (; bitnum < hibitnum; bitnum++) 2329 if (BT_TEST(dumpcfg.bitmap, bitnum)) 2330 pagenum++; 2331 2332 dump_timeleft = dump_timeout; 2333 cp->used = ptob(pagenum - cp->pagenum); 2334 2335 HRSTART(ds->perpage, map); 2336 hat_devload(kas.a_hat, cp->buf, cp->size, base, 2337 PROT_READ, HAT_LOAD_NOCONSIST); 2338 HRSTOP(ds->perpage, map); 2339 2340 ds->pages_mapped += btop(cp->size); 2341 ds->pages_used += pagenum - cp->pagenum; 2342 2343 CQ_OPEN(mainq); 2344 2345 /* 2346 * If there are no helpers the main task does 2347 * non-streams lzjb compress. 2348 */ 2349 if (dumpcfg.clevel == 0) { 2350 dumpsys_lzjb_page(dumpcfg.helper, cp); 2351 break; 2352 } 2353 2354 /* pass mapped pages to a helper */ 2355 CQ_PUT(helperq, cp, CBUF_INREADY); 2356 2357 /* the last page was done */ 2358 if (bitnum >= dumpcfg.bitmapsize) 2359 CQ_CLOSE(helperq); 2360 2361 break; 2362 2363 case CBUF_USEDMAP: 2364 2365 ds->npages += btop(cp->used); 2366 2367 HRSTART(ds->perpage, unmap); 2368 hat_unload(kas.a_hat, cp->buf, cp->size, HAT_UNLOAD); 2369 HRSTOP(ds->perpage, unmap); 2370 2371 if (bitnum < dumpcfg.bitmapsize) 2372 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2373 CQ_CLOSE(mainq); 2374 2375 ASSERT(ds->npages <= dumphdr->dump_npages); 2376 ds->percent = ds->npages * 100LL / dumphdr->dump_npages; 2377 break; 2378 2379 case CBUF_WRITE: 2380 2381 CQ_PUT(writerq, cp, CBUF_WRITE); 2382 break; 2383 2384 case CBUF_ERRMSG: 2385 2386 if (cp->used > 0) { 2387 cp->buf[cp->size - 2] = '\n'; 2388 cp->buf[cp->size - 1] = '\0'; 2389 if (ds->neednl) { 2390 uprintf("\n%s", cp->buf); 2391 ds->neednl = 0; 2392 } else { 2393 uprintf("%s", cp->buf); 2394 } 2395 /* wait for console output */ 2396 drv_usecwait(200000); 2397 dump_timeleft = dump_timeout; 2398 } 2399 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2400 break; 2401 2402 default: 2403 uprintf("dump: unexpected buffer state %d, " 2404 "buffer will be lost\n", cp->state); 2405 break; 2406 2407 } /* end switch */ 2408 2409 } /* end while(1) */ 2410 } 2411 2412 #ifdef COLLECT_METRICS 2413 size_t 2414 dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) 2415 { 2416 dumpcfg_t *cfg = &dumpcfg; 2417 int myid = CPU->cpu_seqid; 2418 int i, compress_ratio; 2419 int sec, iorate; 2420 helper_t *hp, *hpend = &cfg->helper[cfg->nhelper]; 2421 char *e = buf + size; 2422 char *p = buf; 2423 2424 sec = ds->elapsed / (1000 * 1000 * 1000ULL); 2425 if (sec < 1) 2426 sec = 1; 2427 2428 if (ds->iotime < 1) 2429 ds->iotime = 1; 2430 iorate = (ds->nwrite * 100000ULL) / ds->iotime; 2431 2432 compress_ratio = 100LL * ds->npages / btopr(ds->nwrite + 1); 2433 2434 #define P(...) (p += p < e ? 
snprintf(p, e - p, __VA_ARGS__) : 0) 2435 2436 P("Master cpu_seqid,%d\n", CPU->cpu_seqid); 2437 P("Master cpu_id,%d\n", CPU->cpu_id); 2438 P("dump_flags,0x%x\n", dumphdr->dump_flags); 2439 P("dump_ioerr,%d\n", dump_ioerr); 2440 2441 P("Helpers:\n"); 2442 for (i = 0; i < ncpus; i++) { 2443 if ((i & 15) == 0) 2444 P(",,%03d,", i); 2445 if (i == myid) 2446 P(" M"); 2447 else if (BT_TEST(cfg->helpermap, i)) 2448 P("%4d", cpu_seq[i]->cpu_id); 2449 else 2450 P(" *"); 2451 if ((i & 15) == 15) 2452 P("\n"); 2453 } 2454 2455 P("ncbuf_used,%d\n", cfg->ncbuf_used); 2456 P("ncmap,%d\n", cfg->ncmap); 2457 2458 P("Found %ldM ranges,%ld\n", (CBUF_MAPSIZE / DUMP_1MB), cfg->found4m); 2459 P("Found small pages,%ld\n", cfg->foundsm); 2460 2461 P("Compression level,%d\n", cfg->clevel); 2462 P("Compression type,%s %s\n", cfg->clevel == 0 ? "serial" : "parallel", 2463 cfg->clevel >= DUMP_CLEVEL_BZIP2 ? "bzip2" : "lzjb"); 2464 P("Compression ratio,%d.%02d\n", compress_ratio / 100, compress_ratio % 2465 100); 2466 P("nhelper_used,%d\n", cfg->nhelper_used); 2467 2468 P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); 2469 P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); 2470 P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); 2471 P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); 2472 P("dumpbuf.size,%ld\n", dumpbuf.size); 2473 2474 P("Dump pages/sec,%llu\n", (u_longlong_t)ds->npages / sec); 2475 P("Dump pages,%llu\n", (u_longlong_t)ds->npages); 2476 P("Dump time,%d\n", sec); 2477 2478 if (ds->pages_mapped > 0) 2479 P("per-cent map utilization,%d\n", (int)((100 * ds->pages_used) 2480 / ds->pages_mapped)); 2481 2482 P("\nPer-page metrics:\n"); 2483 if (ds->npages > 0) { 2484 for (hp = cfg->helper; hp != hpend; hp++) { 2485 #define PERPAGE(x) ds->perpage.x += hp->perpage.x; 2486 PERPAGES; 2487 #undef PERPAGE 2488 } 2489 #define PERPAGE(x) \ 2490 P("%s nsec/page,%d\n", #x, (int)(ds->perpage.x / ds->npages)); 2491 PERPAGES; 2492 #undef PERPAGE 2493 P("freebufq.empty,%d\n", (int)(ds->freebufq.empty / 2494 ds->npages)); 2495 P("helperq.empty,%d\n", (int)(ds->helperq.empty / 2496 ds->npages)); 2497 P("writerq.empty,%d\n", (int)(ds->writerq.empty / 2498 ds->npages)); 2499 P("mainq.empty,%d\n", (int)(ds->mainq.empty / ds->npages)); 2500 2501 P("I/O wait nsec/page,%llu\n", (u_longlong_t)(ds->iowait / 2502 ds->npages)); 2503 } 2504 #undef P 2505 if (p < e) 2506 bzero(p, e - p); 2507 return (p - buf); 2508 } 2509 #endif /* COLLECT_METRICS */ 2510 2511 /* 2512 * Dump the system. 2513 */ 2514 void 2515 dumpsys(void) 2516 { 2517 dumpsync_t *ds = &dumpsync; 2518 taskq_t *livetaskq = NULL; 2519 pfn_t pfn; 2520 pgcnt_t bitnum; 2521 proc_t *p; 2522 helper_t *hp, *hpend = &dumpcfg.helper[dumpcfg.nhelper]; 2523 cbuf_t *cp; 2524 pid_t npids, pidx; 2525 char *content; 2526 char *buf; 2527 size_t size; 2528 int save_dump_clevel; 2529 dumpmlw_t mlw; 2530 dumpcsize_t datatag; 2531 dumpdatahdr_t datahdr; 2532 2533 if (dumpvp == NULL || dumphdr == NULL) { 2534 uprintf("skipping system dump - no dump device configured\n"); 2535 if (panicstr) { 2536 dumpcfg.helpers_wanted = 0; 2537 dumpsys_spinunlock(&dumpcfg.helper_lock); 2538 } 2539 return; 2540 } 2541 dumpbuf.cur = dumpbuf.start; 2542 2543 /* clear the sync variables */ 2544 ASSERT(dumpcfg.nhelper > 0); 2545 bzero(ds, sizeof (*ds)); 2546 ds->dumpcpu = CPU->cpu_id; 2547 2548 /* 2549 * Calculate the starting block for dump. If we're dumping on a 2550 * swap device, start 1/5 of the way in; otherwise, start at the 2551 * beginning. 
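(P2ROUNDUP rounds that 1/5 point up to the next DUMP_OFFSET boundary, so the dump start stays DUMP_OFFSET-aligned.)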
And never use the first page -- it may be a disk label. 2552 */ 2553 if (dumpvp->v_flag & VISSWAP) 2554 dumphdr->dump_start = P2ROUNDUP(dumpvp_size / 5, DUMP_OFFSET); 2555 else 2556 dumphdr->dump_start = DUMP_OFFSET; 2557 2558 dumphdr->dump_flags = DF_VALID | DF_COMPLETE | DF_LIVE | DF_COMPRESSED; 2559 dumphdr->dump_crashtime = gethrestime_sec(); 2560 dumphdr->dump_npages = 0; 2561 dumphdr->dump_nvtop = 0; 2562 bzero(dumpcfg.bitmap, BT_SIZEOFMAP(dumpcfg.bitmapsize)); 2563 dump_timeleft = dump_timeout; 2564 2565 if (panicstr) { 2566 dumphdr->dump_flags &= ~DF_LIVE; 2567 (void) VOP_DUMPCTL(dumpvp, DUMP_FREE, NULL, NULL); 2568 (void) VOP_DUMPCTL(dumpvp, DUMP_ALLOC, NULL, NULL); 2569 (void) vsnprintf(dumphdr->dump_panicstring, DUMP_PANICSIZE, 2570 panicstr, panicargs); 2571 2572 } 2573 2574 if (dump_conflags & DUMP_ALL) 2575 content = "all"; 2576 else if (dump_conflags & DUMP_CURPROC) 2577 content = "kernel + curproc"; 2578 else 2579 content = "kernel"; 2580 uprintf("dumping to %s, offset %lld, content: %s\n", dumppath, 2581 dumphdr->dump_start, content); 2582 2583 /* Make sure nodename is current */ 2584 bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); 2585 2586 /* 2587 * If this is a live dump, try to open a VCHR vnode for better 2588 * performance. We must take care to flush the buffer cache 2589 * first. 2590 */ 2591 if (!panicstr) { 2592 vnode_t *cdev_vp, *cmn_cdev_vp; 2593 2594 ASSERT(dumpbuf.cdev_vp == NULL); 2595 cdev_vp = makespecvp(VTOS(dumpvp)->s_dev, VCHR); 2596 if (cdev_vp != NULL) { 2597 cmn_cdev_vp = common_specvp(cdev_vp); 2598 if (VOP_OPEN(&cmn_cdev_vp, FREAD | FWRITE, kcred, NULL) 2599 == 0) { 2600 if (vn_has_cached_data(dumpvp)) 2601 (void) pvn_vplist_dirty(dumpvp, 0, NULL, 2602 B_INVAL | B_TRUNC, kcred); 2603 dumpbuf.cdev_vp = cmn_cdev_vp; 2604 } else { 2605 VN_RELE(cdev_vp); 2606 } 2607 } 2608 } 2609 2610 /* 2611 * Store a hires timestamp so we can look it up during debugging. 2612 */ 2613 lbolt_debug_entry(); 2614 2615 /* 2616 * Leave room for the message and ereport save areas and terminal dump 2617 * header. 2618 */ 2619 dumpbuf.vp_limit = dumpvp_size - DUMP_LOGSIZE - DUMP_OFFSET - 2620 DUMP_ERPTSIZE; 2621 2622 /* 2623 * Write out the symbol table. It's no longer compressed, 2624 * so its 'size' and 'csize' are equal. 2625 */ 2626 dumpbuf.vp_off = dumphdr->dump_ksyms = dumphdr->dump_start + PAGESIZE; 2627 dumphdr->dump_ksyms_size = dumphdr->dump_ksyms_csize = 2628 ksyms_snapshot(dumpvp_ksyms_write, NULL, LONG_MAX); 2629 2630 /* 2631 * Write out the translation map. 2632 */ 2633 dumphdr->dump_map = dumpvp_flush(); 2634 dump_as(&kas); 2635 dumphdr->dump_nvtop += dump_plat_addr(); 2636 2637 /* 2638 * call into hat, which may have unmapped pages that also need to 2639 * be in the dump 2640 */ 2641 hat_dump(); 2642 2643 if (dump_conflags & DUMP_ALL) { 2644 mutex_enter(&pidlock); 2645 2646 for (npids = 0, p = practive; p != NULL; p = p->p_next) 2647 dumpcfg.pids[npids++] = p->p_pid; 2648 2649 mutex_exit(&pidlock); 2650 2651 for (pidx = 0; pidx < npids; pidx++) 2652 (void) dump_process(dumpcfg.pids[pidx]); 2653 2654 dump_init_memlist_walker(&mlw); 2655 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2656 dump_timeleft = dump_timeout; 2657 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2658 /* 2659 * Some hypervisors do not have all pages available to 2660 * be accessed by the guest OS. Check for page 2661 * accessibility. 
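 * A pfn refused by plat_hold_page() is simply left out of the
 * bitmap, and hence out of the dump; PLAT_HOLD_NO_LOCK means the
 * page is only probed here, not locked.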
2662 */ 2663 if (plat_hold_page(pfn, PLAT_HOLD_NO_LOCK, NULL) != 2664 PLAT_HOLD_OK) 2665 continue; 2666 BT_SET(dumpcfg.bitmap, bitnum); 2667 } 2668 dumphdr->dump_npages = dumpcfg.bitmapsize; 2669 dumphdr->dump_flags |= DF_ALL; 2670 2671 } else if (dump_conflags & DUMP_CURPROC) { 2672 /* 2673 * Determine which pid is to be dumped. If we're panicking, we 2674 * dump the process associated with panic_thread (if any). If 2675 * this is a live dump, we dump the process associated with 2676 * curthread. 2677 */ 2678 npids = 0; 2679 if (panicstr) { 2680 if (panic_thread != NULL && 2681 panic_thread->t_procp != NULL && 2682 panic_thread->t_procp != &p0) { 2683 dumpcfg.pids[npids++] = 2684 panic_thread->t_procp->p_pid; 2685 } 2686 } else { 2687 dumpcfg.pids[npids++] = curthread->t_procp->p_pid; 2688 } 2689 2690 if (npids && dump_process(dumpcfg.pids[0]) == 0) 2691 dumphdr->dump_flags |= DF_CURPROC; 2692 else 2693 dumphdr->dump_flags |= DF_KERNEL; 2694 2695 } else { 2696 dumphdr->dump_flags |= DF_KERNEL; 2697 } 2698 2699 dumphdr->dump_hashmask = (1 << highbit(dumphdr->dump_nvtop - 1)) - 1; 2700 2701 /* 2702 * Write out the pfn table. 2703 */ 2704 dumphdr->dump_pfn = dumpvp_flush(); 2705 dump_init_memlist_walker(&mlw); 2706 for (bitnum = 0; bitnum < dumpcfg.bitmapsize; bitnum++) { 2707 dump_timeleft = dump_timeout; 2708 if (!BT_TEST(dumpcfg.bitmap, bitnum)) 2709 continue; 2710 pfn = dump_bitnum_to_pfn(bitnum, &mlw); 2711 ASSERT(pfn != PFN_INVALID); 2712 dumpvp_write(&pfn, sizeof (pfn_t)); 2713 } 2714 dump_plat_pfn(); 2715 2716 /* 2717 * Write out all the pages. 2718 * Map pages, copy them handling UEs, compress, and write them out. 2719 * Cooperate with any helpers running on CPUs in panic_idle(). 2720 */ 2721 dumphdr->dump_data = dumpvp_flush(); 2722 2723 bzero(dumpcfg.helpermap, BT_SIZEOFMAP(NCPU)); 2724 ds->live = dumpcfg.clevel > 0 && 2725 (dumphdr->dump_flags & DF_LIVE) != 0; 2726 2727 save_dump_clevel = dumpcfg.clevel; 2728 if (panicstr) 2729 dumpsys_get_maxmem(); 2730 else if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2731 dumpcfg.clevel = DUMP_CLEVEL_LZJB; 2732 2733 dumpcfg.nhelper_used = 0; 2734 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2735 if (hp->page == NULL) { 2736 hp->helper = DONEHELPER; 2737 continue; 2738 } 2739 ++dumpcfg.nhelper_used; 2740 hp->helper = FREEHELPER; 2741 hp->taskqid = NULL; 2742 hp->ds = ds; 2743 bzero(&hp->perpage, sizeof (hp->perpage)); 2744 if (dumpcfg.clevel >= DUMP_CLEVEL_BZIP2) 2745 (void) BZ2_bzCompressReset(&hp->bzstream); 2746 } 2747 2748 CQ_OPEN(freebufq); 2749 CQ_OPEN(helperq); 2750 2751 dumpcfg.ncbuf_used = 0; 2752 for (cp = dumpcfg.cbuf; cp != &dumpcfg.cbuf[dumpcfg.ncbuf]; cp++) { 2753 if (cp->buf != NULL) { 2754 CQ_PUT(freebufq, cp, CBUF_FREEBUF); 2755 ++dumpcfg.ncbuf_used; 2756 } 2757 } 2758 2759 for (cp = dumpcfg.cmap; cp != &dumpcfg.cmap[dumpcfg.ncmap]; cp++) 2760 CQ_PUT(mainq, cp, CBUF_FREEMAP); 2761 2762 ds->start = gethrtime(); 2763 ds->iowaitts = ds->start; 2764 2765 /* start helpers */ 2766 if (ds->live) { 2767 int n = dumpcfg.nhelper_used; 2768 int pri = MINCLSYSPRI - 25; 2769 2770 livetaskq = taskq_create("LiveDump", n, pri, n, n, 2771 TASKQ_PREPOPULATE); 2772 for (hp = dumpcfg.helper; hp != hpend; hp++) { 2773 if (hp->page == NULL) 2774 continue; 2775 hp->helper = hp - dumpcfg.helper; 2776 hp->taskqid = taskq_dispatch(livetaskq, 2777 dumpsys_live_helper, (void *)hp, TQ_NOSLEEP); 2778 } 2779 2780 } else { 2781 if (panicstr) 2782 kmem_dump_begin(); 2783 dumpcfg.helpers_wanted = dumpcfg.clevel > 0; 2784 dumpsys_spinunlock(&dumpcfg.helper_lock); 
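		/*
		 * Dropping helper_lock is what releases the CPUs
		 * spinning in dumpsys_helper(); from here on they
		 * communicate with the main task only through the
		 * shared cbuf queues and flags.
		 */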
2785 } 2786 2787 /* run main task */ 2788 dumpsys_main_task(ds); 2789 2790 ds->elapsed = gethrtime() - ds->start; 2791 if (ds->elapsed < 1) 2792 ds->elapsed = 1; 2793 2794 if (livetaskq != NULL) 2795 taskq_destroy(livetaskq); 2796 2797 if (ds->neednl) { 2798 uprintf("\n"); 2799 ds->neednl = 0; 2800 } 2801 2802 /* record actual pages dumped */ 2803 dumphdr->dump_npages = ds->npages; 2804 2805 /* platform-specific data */ 2806 dumphdr->dump_npages += dump_plat_data(dumpcfg.cbuf[0].buf); 2807 2808 /* note any errors by clearing DF_COMPLETE */ 2809 if (dump_ioerr || ds->npages < dumphdr->dump_npages) 2810 dumphdr->dump_flags &= ~DF_COMPLETE; 2811 2812 /* end of stream blocks */ 2813 datatag = 0; 2814 dumpvp_write(&datatag, sizeof (datatag)); 2815 2816 bzero(&datahdr, sizeof (datahdr)); 2817 2818 /* buffer for metrics */ 2819 buf = dumpcfg.cbuf[0].buf; 2820 size = MIN(dumpcfg.cbuf[0].size, DUMP_OFFSET - sizeof (dumphdr_t) - 2821 sizeof (dumpdatahdr_t)); 2822 2823 /* finish the kmem intercepts, collect kmem verbose info */ 2824 if (panicstr) { 2825 datahdr.dump_metrics = kmem_dump_finish(buf, size); 2826 buf += datahdr.dump_metrics; 2827 size -= datahdr.dump_metrics; 2828 } 2829 2830 /* compression info in data header */ 2831 datahdr.dump_datahdr_magic = DUMP_DATAHDR_MAGIC; 2832 datahdr.dump_datahdr_version = DUMP_DATAHDR_VERSION; 2833 datahdr.dump_maxcsize = CBUF_SIZE; 2834 datahdr.dump_maxrange = CBUF_MAPSIZE / PAGESIZE; 2835 datahdr.dump_nstreams = dumpcfg.nhelper_used; 2836 datahdr.dump_clevel = dumpcfg.clevel; 2837 #ifdef COLLECT_METRICS 2838 if (dump_metrics_on) 2839 datahdr.dump_metrics += dumpsys_metrics(ds, buf, size); 2840 #endif 2841 datahdr.dump_data_csize = dumpvp_flush() - dumphdr->dump_data; 2842 2843 /* 2844 * Write out the initial and terminal dump headers. 2845 */ 2846 dumpbuf.vp_off = dumphdr->dump_start; 2847 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2848 (void) dumpvp_flush(); 2849 2850 dumpbuf.vp_limit = dumpvp_size; 2851 dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; 2852 dumpvp_write(dumphdr, sizeof (dumphdr_t)); 2853 dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); 2854 dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); 2855 2856 (void) dumpvp_flush(); 2857 2858 uprintf("\r%3d%% done: %llu pages dumped, ", 2859 ds->percent_done, (u_longlong_t)ds->npages); 2860 2861 if (dump_ioerr == 0) { 2862 uprintf("dump succeeded\n"); 2863 } else { 2864 uprintf("dump failed: error %d\n", dump_ioerr); 2865 #ifdef DEBUG 2866 if (panicstr) 2867 debug_enter("dump failed"); 2868 #endif 2869 } 2870 2871 /* 2872 * Write out all undelivered messages. This has to be the *last* 2873 * thing we do because the dump process itself emits messages. 2874 */ 2875 if (panicstr) { 2876 dump_ereports(); 2877 dump_messages(); 2878 } 2879 2880 delay(2 * hz); /* let people see the 'done' message */ 2881 dump_timeleft = 0; 2882 dump_ioerr = 0; 2883 2884 /* restore settings after live dump completes */ 2885 if (!panicstr) { 2886 dumpcfg.clevel = save_dump_clevel; 2887 2888 /* release any VCHR open of the dump device */ 2889 if (dumpbuf.cdev_vp != NULL) { 2890 (void) VOP_CLOSE(dumpbuf.cdev_vp, FREAD | FWRITE, 1, 0, 2891 kcred, NULL); 2892 VN_RELE(dumpbuf.cdev_vp); 2893 dumpbuf.cdev_vp = NULL; 2894 } 2895 } 2896 } 2897 2898 /* 2899 * This function is called whenever the memory size, as represented 2900 * by the phys_install list, changes. 
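 * This happens, for example, when memory is added to or removed from
 * the system by dynamic reconfiguration.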
2901 */ 2902 void 2903 dump_resize() 2904 { 2905 mutex_enter(&dump_lock); 2906 dumphdr_init(); 2907 dumpbuf_resize(); 2908 dump_update_clevel(); 2909 mutex_exit(&dump_lock); 2910 } 2911 2912 /* 2913 * This function allows for dynamic resizing of a dump area. It assumes 2914 * that the underlying device has already updated its size(9P) property. 2915 */ 2916 int 2917 dumpvp_resize() 2918 { 2919 int error; 2920 vattr_t vattr; 2921 2922 mutex_enter(&dump_lock); 2923 vattr.va_mask = AT_SIZE; 2924 if ((error = VOP_GETATTR(dumpvp, &vattr, 0, kcred, NULL)) != 0) { 2925 mutex_exit(&dump_lock); 2926 return (error); 2927 } 2928 2929 if (vattr.va_size < 2 * DUMP_LOGSIZE + DUMP_ERPTSIZE) { 2930 mutex_exit(&dump_lock); 2931 return (ENOSPC); 2932 } 2933 2934 dumpvp_size = vattr.va_size & -DUMP_OFFSET; 2935 mutex_exit(&dump_lock); 2936 return (0); 2937 } 2938
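/*
 * Usage sketch (hypothetical caller): a driver whose dump device has
 * grown would first update its size(9P) property and then pick up the
 * new size with:
 *
 *	int err = dumpvp_resize();
 *	if (err != 0)
 *		cmn_err(CE_WARN, "dump device resize failed: %d", err);
 */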