1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/cmn_err.h> 28 #include <sys/vmem.h> 29 #include <sys/kmem.h> 30 #include <sys/systm.h> 31 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 32 #include <sys/errno.h> 33 #include <sys/memnode.h> 34 #include <sys/memlist.h> 35 #include <sys/memlist_impl.h> 36 #include <sys/tuneable.h> 37 #include <sys/proc.h> 38 #include <sys/disp.h> 39 #include <sys/debug.h> 40 #include <sys/vm.h> 41 #include <sys/callb.h> 42 #include <sys/memlist_plat.h> /* for installed_top_size() */ 43 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 44 #include <sys/dumphdr.h> /* for dump_resize() */ 45 #include <sys/atomic.h> /* for use in stats collection */ 46 #include <sys/rwlock.h> 47 #include <sys/cpuvar.h> 48 #include <vm/seg_kmem.h> 49 #include <vm/seg_kpm.h> 50 #include <vm/page.h> 51 #include <vm/vm_dep.h> 52 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 53 #include <sys/sunddi.h> 54 #include <sys/mem_config.h> 55 #include <sys/mem_cage.h> 56 #include <sys/lgrp.h> 57 #include <sys/ddi.h> 58 #include <sys/modctl.h> 59 60 extern struct memlist *phys_avail; 61 62 extern void mem_node_add(pfn_t, pfn_t); 63 extern void mem_node_del(pfn_t, pfn_t); 64 65 extern uint_t page_ctrs_adjust(int); 66 void page_ctrs_cleanup(void); 67 static void kphysm_setup_post_add(pgcnt_t); 68 static int kphysm_setup_pre_del(pgcnt_t); 69 static void kphysm_setup_post_del(pgcnt_t, int); 70 71 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 72 73 static int delspan_reserve(pfn_t, pgcnt_t); 74 static void delspan_unreserve(pfn_t, pgcnt_t); 75 76 kmutex_t memseg_lists_lock; 77 struct memseg *memseg_va_avail; 78 struct memseg *memseg_alloc(void); 79 static struct memseg *memseg_delete_junk; 80 static struct memseg *memseg_edit_junk; 81 void memseg_remap_init(void); 82 static void memseg_remap_to_dummy(struct memseg *); 83 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 84 static struct memseg *memseg_reuse(pgcnt_t); 85 86 static struct kmem_cache *memseg_cache; 87 88 /* 89 * Interfaces to manage externally allocated 90 * page_t memory (metadata) for a memseg. 
91 */ 92 #pragma weak memseg_alloc_meta 93 #pragma weak memseg_free_meta 94 #pragma weak memseg_get_metapfn 95 #pragma weak memseg_remap_meta 96 97 extern int ppvm_enable; 98 extern page_t *ppvm_base; 99 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *); 100 extern void memseg_free_meta(void *, pgcnt_t); 101 extern pfn_t memseg_get_metapfn(void *, pgcnt_t); 102 extern void memseg_remap_meta(struct memseg *); 103 static int memseg_is_dynamic(struct memseg *); 104 static int memseg_includes_meta(struct memseg *); 105 pfn_t memseg_get_start(struct memseg *); 106 static void memseg_cpu_vm_flush(void); 107 108 int meta_alloc_enable; 109 110 /* 111 * Add a chunk of memory to the system. 112 * base: starting PAGESIZE page of new memory. 113 * npgs: length in PAGESIZE pages. 114 * 115 * Adding mem this way doesn't increase the size of the hash tables; 116 * growing them would be too hard. This should be OK, but adding memory 117 * dynamically most likely means more hash misses, since the tables will 118 * be smaller than they otherwise would be. 119 */ 120 #ifdef DEBUG 121 static int memseg_debug; 122 #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args) 123 #else 124 #define MEMSEG_DEBUG(...) 125 #endif 126 127 int 128 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 129 { 130 page_t *pp; 131 page_t *opp, *oepp, *segpp; 132 struct memseg *seg; 133 uint64_t avmem; 134 pfn_t pfn; 135 pfn_t pt_base = base; 136 pgcnt_t tpgs = npgs; 137 pgcnt_t metapgs = 0; 138 int exhausted; 139 pfn_t pnum; 140 int mnode; 141 caddr_t vaddr; 142 int reuse; 143 int mlret; 144 int rv; 145 int flags; 146 int meta_alloc = 0; 147 void *mapva; 148 void *metabase = (void *)base; 149 pgcnt_t nkpmpgs = 0; 150 offset_t kpm_pages_off; 151 152 cmn_err(CE_CONT, 153 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 154 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 155 156 /* 157 * Add this span in the delete list to prevent interactions. 158 */ 159 if (!delspan_reserve(base, npgs)) { 160 return (KPHYSM_ESPAN); 161 } 162 /* 163 * Check to see if any of the memory span has been added 164 * by trying an add to the installed memory list. This 165 * forms the interlocking process for add. 166 */ 167 168 memlist_write_lock(); 169 170 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 171 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 172 173 if (mlret == MEML_SPANOP_OK) 174 installed_top_size(phys_install, &physmax, &physinstalled); 175 176 memlist_write_unlock(); 177 178 if (mlret != MEML_SPANOP_OK) { 179 if (mlret == MEML_SPANOP_EALLOC) { 180 delspan_unreserve(pt_base, tpgs); 181 return (KPHYSM_ERESOURCE); 182 } else if (mlret == MEML_SPANOP_ESPAN) { 183 delspan_unreserve(pt_base, tpgs); 184 return (KPHYSM_ESPAN); 185 } else { 186 delspan_unreserve(pt_base, tpgs); 187 return (KPHYSM_ERESOURCE); 188 } 189 } 190 191 if (meta_alloc_enable) { 192 /* 193 * Allocate the page_t's from existing memory; 194 * if that fails, allocate from the incoming memory. 195 */ 196 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs); 197 if (rv == KPHYSM_OK) { 198 ASSERT(metapgs); 199 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 200 meta_alloc = 1; 201 goto mapalloc; 202 } 203 } 204 205 /* 206 * We store the page_t's for this new memory in the first 207 * few pages of the chunk. Here, we go and get'em ... 
	 */

	/*
	 * The expression after the '-' gives the number of pages
	 * that will fit in the new memory based on a requirement
	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
	 */
	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	npgs -= metapgs;
	base += metapgs;

	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
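
	/*
	 * Worked example (illustrative only, using hypothetical sizes):
	 * with PAGESIZE = 8192 and sizeof (page_t) = 120, a span of
	 * npgs = 1000 pages gives
	 *
	 *	metapgs = 1000 - (1000 * 8192) / (8192 + 120)
	 *		= 1000 - 985 = 15
	 *
	 * so 15 pages hold the page_t metadata for the remaining 985
	 * pages (985 * 120 bytes rounds up to 15 pages), which is what
	 * the ASSERT above checks.
	 */
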
	exhausted = (metapgs == 0 || npgs == 0);

	if (kpm_enable && !exhausted) {
		pgcnt_t start, end, nkpmpgs_prelim;
		size_t ptsz;

		/*
		 * A viable kpm large page mapping must not overlap two
		 * dynamic memsegs. Therefore the total size is checked
		 * to be at least kpm_pgsz and also whether start and end
		 * points are at least kpm_pgsz aligned.
		 */
		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
		    pmodkpmp(base + npgs)) {

			kphysm_addmem_error_undospan(pt_base, tpgs);

			/*
			 * There is no specific error code for violating
			 * kpm granularity constraints.
			 */
			return (KPHYSM_ENOTVIABLE);
		}

		start = kpmptop(ptokpmp(base));
		end = kpmptop(ptokpmp(base + npgs));
		nkpmpgs_prelim = ptokpmp(end - start);
		ptsz = npgs * sizeof (page_t);
		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
		exhausted = (tpgs <= metapgs);
		if (!exhausted) {
			npgs = tpgs - metapgs;
			base = pt_base + metapgs;

			/* final nkpmpgs */
			start = kpmptop(ptokpmp(base));
			nkpmpgs = ptokpmp(end - start);
			kpm_pages_off = ptsz +
			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
		}
	}

	/*
	 * Is the memory area supplied too small?
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);
		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}

mapalloc:
	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);

	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		if (meta_alloc)
			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 */
	pnum = pt_base + tpgs - 1;
	mem_node_add_range(pt_base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	PAGE_CTRS_ADJUST(base, npgs, rv);
	if (rv) {

		mem_node_del_range(pt_base, pnum);

		/* cleanup the page counters */
		page_ctrs_cleanup();

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	if (meta_alloc) {
		seg = memseg_reuse(0);
		reuse = 1;	/* force unmapping of temp mapva */
		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
		/*
		 * There is a 1:1 fixed relationship between a pfn
		 * and a page_t VA. The pfn is used as an index into
		 * the ppvm_base page_t table in order to calculate
		 * the page_t base address for a given pfn range.
		 */
		segpp = ppvm_base + base;
	} else {
		seg = memseg_reuse(metapgs);
		reuse = (seg != NULL);
		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
		segpp = pp;
	}

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 *
	 * If a memseg is reused, invalidate memseg pointers in
	 * all cpu vm caches. We need to do this since the check
	 *	pp >= seg->pages && pp < seg->epages
	 * used in various places is not atomic and so the first compare
	 * can happen before reuse and the second compare after reuse.
	 * The invalidation ensures that a memseg is not dereferenced while
	 * its page/pfn pointers are changing.
	 */
	if (seg == NULL) {
		seg = memseg_alloc();
		ASSERT(seg != NULL);
		seg->msegflags = flags;
		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		seg->pages = segpp;
	} else {
		ASSERT(seg->msegflags == flags);
		ASSERT(seg->pages_base == seg->pages_end);
		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		if (meta_alloc) {
			memseg_cpu_vm_flush();
			seg->pages = segpp;
		}
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;

	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready to be freed.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			if (meta_alloc)
				pfn = memseg_get_metapfn(metabase,
				    (pgcnt_t)pnum);
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process has still got a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now that total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 *	physmax - highest page frame number.
	 *	physinstalled - number of pages currently installed (done earlier)
	 *	maxmem - max free pages in the system
	 *	physmem - physical memory pages available
	 *	availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);		/* Successfully added system memory */

}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size
 * if size is required.
 * When the metadata area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
 */
struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
	int type;
	struct memseg **segpp, *seg;

	mutex_enter(&memseg_lists_lock);

	segpp = &memseg_va_avail;
	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
		caddr_t end;

		/*
		 * Make sure we are reusing the right segment type.
		 */
		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;

		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
		    != type)
			continue;

		if (kpm_enable)
			end = hat_kpm_mseg_reuse(seg);
		else
			end = (caddr_t)seg->epages;

		/*
		 * Check for the right size if it is provided.
629 */ 630 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) { 631 *segpp = seg->lnext; 632 seg->lnext = NULL; 633 break; 634 } 635 } 636 mutex_exit(&memseg_lists_lock); 637 638 return (seg); 639 } 640 641 static uint_t handle_gen; 642 643 struct memdelspan { 644 struct memdelspan *mds_next; 645 pfn_t mds_base; 646 pgcnt_t mds_npgs; 647 uint_t *mds_bitmap; 648 uint_t *mds_bitmap_retired; 649 }; 650 651 #define NBPBMW (sizeof (uint_t) * NBBY) 652 #define MDS_BITMAPBYTES(MDSP) \ 653 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 654 655 struct transit_list { 656 struct transit_list *trl_next; 657 struct memdelspan *trl_spans; 658 int trl_collect; 659 }; 660 661 struct transit_list_head { 662 kmutex_t trh_lock; 663 struct transit_list *trh_head; 664 }; 665 666 static struct transit_list_head transit_list_head; 667 668 struct mem_handle; 669 static void transit_list_collect(struct mem_handle *, int); 670 static void transit_list_insert(struct transit_list *); 671 static void transit_list_remove(struct transit_list *); 672 673 #ifdef DEBUG 674 #define MEM_DEL_STATS 675 #endif /* DEBUG */ 676 677 #ifdef MEM_DEL_STATS 678 static int mem_del_stat_print = 0; 679 struct mem_del_stat { 680 uint_t nloop; 681 uint_t need_free; 682 uint_t free_loop; 683 uint_t free_low; 684 uint_t free_failed; 685 uint_t ncheck; 686 uint_t nopaget; 687 uint_t lockfail; 688 uint_t nfree; 689 uint_t nreloc; 690 uint_t nrelocfail; 691 uint_t already_done; 692 uint_t first_notfree; 693 uint_t npplocked; 694 uint_t nlockreloc; 695 uint_t nnorepl; 696 uint_t nmodreloc; 697 uint_t ndestroy; 698 uint_t nputpage; 699 uint_t nnoreclaim; 700 uint_t ndelay; 701 uint_t demotefail; 702 uint64_t nticks_total; 703 uint64_t nticks_pgrp; 704 uint_t retired; 705 uint_t toxic; 706 uint_t failing; 707 uint_t modtoxic; 708 uint_t npplkdtoxic; 709 uint_t gptlmodfail; 710 uint_t gptllckfail; 711 }; 712 /* 713 * The stat values are only incremented in the delete thread 714 * so no locking or atomic required. 715 */ 716 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 717 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 718 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 719 static void mem_del_stat_print_func(struct mem_handle *); 720 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 721 #else /* MEM_DEL_STATS */ 722 #define MDSTAT_INCR(MHP, FLD) 723 #define MDSTAT_TOTAL(MHP, ntck) 724 #define MDSTAT_PGRP(MHP, ntck) 725 #define MDSTAT_PRINT(MHP) 726 #endif /* MEM_DEL_STATS */ 727 728 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 729 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 730 731 /* 732 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 733 * The mutex may not be required for other fields, dependent on mh_state. 
 */
struct mem_handle {
	kmutex_t mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t mh_exthandle;
	mhnd_state_t mh_state;
	struct transit_list mh_transit;
	pgcnt_t mh_phys_pages;
	pgcnt_t mh_vm_pages;
	pgcnt_t mh_hold_todo;
	void (*mh_delete_complete)(void *, int error);
	void *mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t mh_cv;
	kthread_id_t mh_thread_id;
	page_t *mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next is changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
847 */ 848 ASSERT(mhp->mh_state == MHND_FREE); 849 mhp->mh_state = MHND_INIT; 850 *xmhp = mhp->mh_exthandle; 851 mutex_exit(&mhp->mh_mutex); 852 return (KPHYSM_OK); 853 } 854 855 static int 856 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 857 { 858 pfn_t e1, e2; 859 860 e1 = b1 + l1; 861 e2 = b2 + l2; 862 863 return (!(b2 >= e1 || b1 >= e2)); 864 } 865 866 static int can_remove_pgs(pgcnt_t); 867 868 static struct memdelspan * 869 span_to_install(pfn_t base, pgcnt_t npgs) 870 { 871 struct memdelspan *mdsp; 872 struct memdelspan *mdsp_new; 873 uint64_t address, size, thislen; 874 struct memlist *mlp; 875 876 mdsp_new = NULL; 877 878 address = (uint64_t)base << PAGESHIFT; 879 size = (uint64_t)npgs << PAGESHIFT; 880 while (size != 0) { 881 memlist_read_lock(); 882 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 883 if (address >= (mlp->address + mlp->size)) 884 continue; 885 if ((address + size) > mlp->address) 886 break; 887 } 888 if (mlp == NULL) { 889 address += size; 890 size = 0; 891 thislen = 0; 892 } else { 893 if (address < mlp->address) { 894 size -= (mlp->address - address); 895 address = mlp->address; 896 } 897 ASSERT(address >= mlp->address); 898 if ((address + size) > (mlp->address + mlp->size)) { 899 thislen = mlp->size - (address - mlp->address); 900 } else { 901 thislen = size; 902 } 903 } 904 memlist_read_unlock(); 905 /* TODO: phys_install could change now */ 906 if (thislen == 0) 907 continue; 908 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 909 mdsp->mds_base = btop(address); 910 mdsp->mds_npgs = btop(thislen); 911 mdsp->mds_next = mdsp_new; 912 mdsp_new = mdsp; 913 address += thislen; 914 size -= thislen; 915 } 916 return (mdsp_new); 917 } 918 919 static void 920 free_delspans(struct memdelspan *mdsp) 921 { 922 struct memdelspan *amdsp; 923 924 while ((amdsp = mdsp) != NULL) { 925 mdsp = amdsp->mds_next; 926 kmem_free(amdsp, sizeof (struct memdelspan)); 927 } 928 } 929 930 /* 931 * Concatenate lists. No list ordering is required. 932 */ 933 934 static void 935 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 936 { 937 while (*mdspp != NULL) 938 mdspp = &(*mdspp)->mds_next; 939 940 *mdspp = mdsp; 941 } 942 943 /* 944 * Given a new list of delspans, check there is no overlap with 945 * all existing span activity (add or delete) and then concatenate 946 * the new spans to the given list. 947 * Return 1 for OK, 0 if overlapping. 
948 */ 949 static int 950 delspan_insert( 951 struct transit_list *my_tlp, 952 struct memdelspan *mdsp_new) 953 { 954 struct transit_list_head *trh; 955 struct transit_list *tlp; 956 int ret; 957 958 trh = &transit_list_head; 959 960 ASSERT(my_tlp != NULL); 961 ASSERT(mdsp_new != NULL); 962 963 ret = 1; 964 mutex_enter(&trh->trh_lock); 965 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 966 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 967 struct memdelspan *mdsp; 968 969 for (mdsp = tlp->trl_spans; mdsp != NULL; 970 mdsp = mdsp->mds_next) { 971 struct memdelspan *nmdsp; 972 973 for (nmdsp = mdsp_new; nmdsp != NULL; 974 nmdsp = nmdsp->mds_next) { 975 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 976 nmdsp->mds_base, nmdsp->mds_npgs)) { 977 ret = 0; 978 goto done; 979 } 980 } 981 } 982 } 983 done: 984 if (ret != 0) { 985 if (my_tlp->trl_spans == NULL) 986 transit_list_insert(my_tlp); 987 delspan_concat(&my_tlp->trl_spans, mdsp_new); 988 } 989 mutex_exit(&trh->trh_lock); 990 return (ret); 991 } 992 993 static void 994 delspan_remove( 995 struct transit_list *my_tlp, 996 pfn_t base, 997 pgcnt_t npgs) 998 { 999 struct transit_list_head *trh; 1000 struct memdelspan *mdsp; 1001 1002 trh = &transit_list_head; 1003 1004 ASSERT(my_tlp != NULL); 1005 1006 mutex_enter(&trh->trh_lock); 1007 if ((mdsp = my_tlp->trl_spans) != NULL) { 1008 if (npgs == 0) { 1009 my_tlp->trl_spans = NULL; 1010 free_delspans(mdsp); 1011 transit_list_remove(my_tlp); 1012 } else { 1013 struct memdelspan **prv; 1014 1015 prv = &my_tlp->trl_spans; 1016 while (mdsp != NULL) { 1017 pfn_t p_end; 1018 1019 p_end = mdsp->mds_base + mdsp->mds_npgs; 1020 if (mdsp->mds_base >= base && 1021 p_end <= (base + npgs)) { 1022 *prv = mdsp->mds_next; 1023 mdsp->mds_next = NULL; 1024 free_delspans(mdsp); 1025 } else { 1026 prv = &mdsp->mds_next; 1027 } 1028 mdsp = *prv; 1029 } 1030 if (my_tlp->trl_spans == NULL) 1031 transit_list_remove(my_tlp); 1032 } 1033 } 1034 mutex_exit(&trh->trh_lock); 1035 } 1036 1037 /* 1038 * Reserve interface for add to stop delete before add finished. 1039 * This list is only accessed through the delspan_insert/remove 1040 * functions and so is fully protected by the mutex in struct transit_list. 1041 */ 1042 1043 static struct transit_list reserve_transit; 1044 1045 static int 1046 delspan_reserve(pfn_t base, pgcnt_t npgs) 1047 { 1048 struct memdelspan *mdsp; 1049 int ret; 1050 1051 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 1052 mdsp->mds_base = base; 1053 mdsp->mds_npgs = npgs; 1054 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 1055 free_delspans(mdsp); 1056 } 1057 return (ret); 1058 } 1059 1060 static void 1061 delspan_unreserve(pfn_t base, pgcnt_t npgs) 1062 { 1063 delspan_remove(&reserve_transit, base, npgs); 1064 } 1065 1066 /* 1067 * Return whether memseg was created by kphysm_add_memory_dynamic(). 
1068 */ 1069 static int 1070 memseg_is_dynamic(struct memseg *seg) 1071 { 1072 return (seg->msegflags & MEMSEG_DYNAMIC); 1073 } 1074 1075 int 1076 kphysm_del_span( 1077 memhandle_t handle, 1078 pfn_t base, 1079 pgcnt_t npgs) 1080 { 1081 struct mem_handle *mhp; 1082 struct memseg *seg; 1083 struct memdelspan *mdsp; 1084 struct memdelspan *mdsp_new; 1085 pgcnt_t phys_pages, vm_pages; 1086 pfn_t p_end; 1087 page_t *pp; 1088 int ret; 1089 1090 mhp = kphysm_lookup_mem_handle(handle); 1091 if (mhp == NULL) { 1092 return (KPHYSM_EHANDLE); 1093 } 1094 if (mhp->mh_state != MHND_INIT) { 1095 mutex_exit(&mhp->mh_mutex); 1096 return (KPHYSM_ESEQUENCE); 1097 } 1098 1099 /* 1100 * Intersect the span with the installed memory list (phys_install). 1101 */ 1102 mdsp_new = span_to_install(base, npgs); 1103 if (mdsp_new == NULL) { 1104 /* 1105 * No physical memory in this range. Is this an 1106 * error? If an attempt to start the delete is made 1107 * for OK returns from del_span such as this, start will 1108 * return an error. 1109 * Could return KPHYSM_ENOWORK. 1110 */ 1111 /* 1112 * It is assumed that there are no error returns 1113 * from span_to_install() due to kmem_alloc failure. 1114 */ 1115 mutex_exit(&mhp->mh_mutex); 1116 return (KPHYSM_OK); 1117 } 1118 /* 1119 * Does this span overlap an existing span? 1120 */ 1121 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1122 /* 1123 * Differentiate between already on list for this handle 1124 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1125 */ 1126 ret = KPHYSM_EBUSY; 1127 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1128 mdsp = mdsp->mds_next) { 1129 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1130 base, npgs)) { 1131 ret = KPHYSM_EDUP; 1132 break; 1133 } 1134 } 1135 mutex_exit(&mhp->mh_mutex); 1136 free_delspans(mdsp_new); 1137 return (ret); 1138 } 1139 /* 1140 * At this point the spans in mdsp_new have been inserted into the 1141 * list of spans for this handle and thereby to the global list of 1142 * spans being processed. Each of these spans must now be checked 1143 * for relocatability. As a side-effect segments in the memseg list 1144 * may be split. 1145 * 1146 * Note that mdsp_new can no longer be used as it is now part of 1147 * a larger list. Select elements of this larger list based 1148 * on base and npgs. 1149 */ 1150 restart: 1151 phys_pages = 0; 1152 vm_pages = 0; 1153 ret = KPHYSM_OK; 1154 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1155 mdsp = mdsp->mds_next) { 1156 pgcnt_t pages_checked; 1157 1158 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1159 continue; 1160 } 1161 p_end = mdsp->mds_base + mdsp->mds_npgs; 1162 /* 1163 * The pages_checked count is a hack. All pages should be 1164 * checked for relocatability. Those not covered by memsegs 1165 * should be tested with arch_kphysm_del_span_ok(). 1166 */ 1167 pages_checked = 0; 1168 for (seg = memsegs; seg; seg = seg->next) { 1169 pfn_t mseg_start; 1170 1171 if (seg->pages_base >= p_end || 1172 seg->pages_end <= mdsp->mds_base) { 1173 /* Span and memseg don't overlap. */ 1174 continue; 1175 } 1176 mseg_start = memseg_get_start(seg); 1177 /* Check that segment is suitable for delete. */ 1178 if (memseg_includes_meta(seg)) { 1179 /* 1180 * Check that this segment is completely 1181 * within the span. 
1182 */ 1183 if (mseg_start < mdsp->mds_base || 1184 seg->pages_end > p_end) { 1185 ret = KPHYSM_EBUSY; 1186 break; 1187 } 1188 pages_checked += seg->pages_end - mseg_start; 1189 } else { 1190 /* 1191 * If this segment is larger than the span, 1192 * try to split it. After the split, it 1193 * is necessary to restart. 1194 */ 1195 if (seg->pages_base < mdsp->mds_base || 1196 seg->pages_end > p_end) { 1197 pfn_t abase; 1198 pgcnt_t anpgs; 1199 int s_ret; 1200 1201 /* Split required. */ 1202 if (mdsp->mds_base < seg->pages_base) 1203 abase = seg->pages_base; 1204 else 1205 abase = mdsp->mds_base; 1206 if (p_end > seg->pages_end) 1207 anpgs = seg->pages_end - abase; 1208 else 1209 anpgs = p_end - abase; 1210 s_ret = kphysm_split_memseg(abase, 1211 anpgs); 1212 if (s_ret == 0) { 1213 /* Split failed. */ 1214 ret = KPHYSM_ERESOURCE; 1215 break; 1216 } 1217 goto restart; 1218 } 1219 pages_checked += 1220 seg->pages_end - seg->pages_base; 1221 } 1222 /* 1223 * The memseg is wholly within the delete span. 1224 * The individual pages can now be checked. 1225 */ 1226 /* Cage test. */ 1227 for (pp = seg->pages; pp < seg->epages; pp++) { 1228 if (PP_ISNORELOC(pp)) { 1229 ret = KPHYSM_ENONRELOC; 1230 break; 1231 } 1232 } 1233 if (ret != KPHYSM_OK) { 1234 break; 1235 } 1236 phys_pages += (seg->pages_end - mseg_start); 1237 vm_pages += MSEG_NPAGES(seg); 1238 } 1239 if (ret != KPHYSM_OK) 1240 break; 1241 if (pages_checked != mdsp->mds_npgs) { 1242 ret = KPHYSM_ENONRELOC; 1243 break; 1244 } 1245 } 1246 1247 if (ret == KPHYSM_OK) { 1248 mhp->mh_phys_pages += phys_pages; 1249 mhp->mh_vm_pages += vm_pages; 1250 } else { 1251 /* 1252 * Keep holding the mh_mutex to prevent it going away. 1253 */ 1254 delspan_remove(&mhp->mh_transit, base, npgs); 1255 } 1256 mutex_exit(&mhp->mh_mutex); 1257 return (ret); 1258 } 1259 1260 int 1261 kphysm_del_span_query( 1262 pfn_t base, 1263 pgcnt_t npgs, 1264 memquery_t *mqp) 1265 { 1266 struct memdelspan *mdsp; 1267 struct memdelspan *mdsp_new; 1268 int done_first_nonreloc; 1269 1270 mqp->phys_pages = 0; 1271 mqp->managed = 0; 1272 mqp->nonrelocatable = 0; 1273 mqp->first_nonrelocatable = 0; 1274 mqp->last_nonrelocatable = 0; 1275 1276 mdsp_new = span_to_install(base, npgs); 1277 /* 1278 * It is OK to proceed here if mdsp_new == NULL. 1279 */ 1280 done_first_nonreloc = 0; 1281 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1282 pfn_t sbase; 1283 pgcnt_t snpgs; 1284 1285 mqp->phys_pages += mdsp->mds_npgs; 1286 sbase = mdsp->mds_base; 1287 snpgs = mdsp->mds_npgs; 1288 while (snpgs != 0) { 1289 struct memseg *lseg, *seg; 1290 pfn_t p_end; 1291 page_t *pp; 1292 pfn_t mseg_start; 1293 1294 p_end = sbase + snpgs; 1295 /* 1296 * Find the lowest addressed memseg that starts 1297 * after sbase and account for it. 1298 * This is to catch dynamic memsegs whose start 1299 * is hidden. 1300 */ 1301 seg = NULL; 1302 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1303 if ((lseg->pages_base >= sbase) || 1304 (lseg->pages_base < p_end && 1305 lseg->pages_end > sbase)) { 1306 if (seg == NULL || 1307 seg->pages_base > lseg->pages_base) 1308 seg = lseg; 1309 } 1310 } 1311 if (seg != NULL) { 1312 mseg_start = memseg_get_start(seg); 1313 /* 1314 * Now have the full extent of the memseg so 1315 * do the range check. 1316 */ 1317 if (mseg_start >= p_end || 1318 seg->pages_end <= sbase) { 1319 /* Span does not overlap memseg. */ 1320 seg = NULL; 1321 } 1322 } 1323 /* 1324 * Account for gap either before the segment if 1325 * there is one or to the end of the span. 
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}
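
/*
 * Illustrative sketch only (not part of the build; the MEM_CONFIG_EXAMPLE
 * guard and example_* names are hypothetical): how a caller might use the
 * memquery_t results filled in by kphysm_del_span_query() above to decide
 * whether attempting a delete is worthwhile.
 */
#ifdef MEM_CONFIG_EXAMPLE
static int
example_span_deletable(pfn_t base, pgcnt_t npgs)
{
	memquery_t mq;

	if (kphysm_del_span_query(base, npgs, &mq) != KPHYSM_OK)
		return (0);
	/* Nothing installed in the span, so nothing to delete. */
	if (mq.phys_pages == 0)
		return (0);
	/*
	 * Non-relocatable pages (e.g. cage pages) in the span would
	 * make a subsequent delete fail with KPHYSM_ENONRELOC.
	 */
	return (mq.nonrelocatable == 0);
}
#endif	/* MEM_CONFIG_EXAMPLE */
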
/*
 * This release function can be called at any stage as follows:
 *	_gethandle only called
 *	_span(s) only called
 *	_start called but failed
 *	delete thread exited
 */
int
kphysm_del_release(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTFINISHED);
	case MHND_FREE:
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_DONE:
		break;
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Set state so that we can wait if necessary.
	 * Also this means that we have read/write access to all
	 * fields except mh_exthandle and mh_state.
	 */
	mhp->mh_state = MHND_RELEASE;
	/*
	 * The mem_handle cannot be de-allocated by any other operation
	 * now, so no need to hold mh_mutex.
	 */
	mutex_exit(&mhp->mh_mutex);

	delspan_remove(&mhp->mh_transit, 0, 0);
	mhp->mh_phys_pages = 0;
	mhp->mh_vm_pages = 0;
	mhp->mh_hold_todo = 0;
	mhp->mh_delete_complete = NULL;
	mhp->mh_delete_complete_arg = NULL;
	mhp->mh_cancel = 0;

	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_RELEASE);
	mhp->mh_state = MHND_FREE;

	kphysm_free_mem_handle(mhp);

	return (KPHYSM_OK);
}

/*
 * This cancel function can only be called with the thread running.
 */
int
kphysm_del_cancel(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTRUNNING);
	}
	/*
	 * Set the cancel flag and wake the delete thread up.
	 * The thread may be waiting on I/O, so the effect of the cancel
	 * may be delayed.
	 */
	if (mhp->mh_cancel == 0) {
		mhp->mh_cancel = KPHYSM_ECANCELLED;
		cv_signal(&mhp->mh_cv);
	}
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}

int
kphysm_del_status(
	memhandle_t handle,
	memdelstat_t *mdstp)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Calling kphysm_del_status() is allowed before the delete
	 * is started to allow for status display.
	 */
	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
	    mhp->mh_state != MHND_RUNNING) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTRUNNING);
	}
	mdstp->phys_pages = mhp->mh_phys_pages;
	mdstp->managed = mhp->mh_vm_pages;
	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}
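
/*
 * Illustrative sketch only (not part of the build; the MEM_CONFIG_EXAMPLE
 * guard and example_* names are hypothetical): the usual handle life cycle
 * for a memory delete using the interfaces above together with
 * kphysm_del_start() (defined later in this file). A real caller would
 * also call kphysm_del_release() again once the completion callback
 * reports that the delete thread has finished.
 */
#ifdef MEM_CONFIG_EXAMPLE
static void
example_del_done(void *arg, int error)
{
	/* error is KPHYSM_OK on success, or the reason for failure. */
}

static int
example_start_delete(pfn_t base, pgcnt_t npgs)
{
	memhandle_t mh;
	int ret;

	if ((ret = kphysm_del_gethandle(&mh)) != KPHYSM_OK)
		return (ret);
	if ((ret = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
		(void) kphysm_del_release(mh);
		return (ret);
	}
	ret = kphysm_del_start(mh, example_del_done, NULL);
	if (ret != KPHYSM_OK)
		(void) kphysm_del_release(mh);
	return (ret);
}
#endif	/* MEM_CONFIG_EXAMPLE */
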
static int mem_delete_additional_pages = 100;

static int
can_remove_pgs(pgcnt_t npgs)
{
	/*
	 * If all pageable pages were paged out, freemem would
	 * equal availrmem. There is a minimum requirement for
	 * availrmem.
	 */
	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
	    < npgs)
		return (0);
	/* TODO: check swap space, etc. */
	return (1);
}

static int
get_availrmem(pgcnt_t npgs)
{
	int ret;

	mutex_enter(&freemem_lock);
	ret = can_remove_pgs(npgs);
	if (ret != 0)
		availrmem -= npgs;
	mutex_exit(&freemem_lock);
	return (ret);
}

static void
put_availrmem(pgcnt_t npgs)
{
	mutex_enter(&freemem_lock);
	availrmem += npgs;
	mutex_exit(&freemem_lock);
}

#define	FREEMEM_INCR	100
static pgcnt_t freemem_incr = FREEMEM_INCR;
#define	DEL_FREE_WAIT_FRAC	4
#define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

#define	DEL_BUSY_WAIT_FRAC	20
#define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)

static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);

static pgcnt_t
delthr_get_freemem(struct mem_handle *mhp)
{
	pgcnt_t free_get;
	int ret;

	ASSERT(MUTEX_HELD(&mhp->mh_mutex));

	MDSTAT_INCR(mhp, need_free);
	/*
	 * Get up to freemem_incr pages.
	 */
	free_get = freemem_incr;
	if (free_get > mhp->mh_hold_todo)
		free_get = mhp->mh_hold_todo;
	/*
	 * Take free_get pages away from freemem,
	 * waiting if necessary.
	 */

	while (!mhp->mh_cancel) {
		mutex_exit(&mhp->mh_mutex);
		MDSTAT_INCR(mhp, free_loop);
		/*
		 * Duplicate test from page_create_throttle()
		 * but don't override with !PG_WAIT.
		 */
		if (freemem < (free_get + throttlefree)) {
			MDSTAT_INCR(mhp, free_low);
			ret = 0;
		} else {
			ret = page_create_wait(free_get, 0);
			if (ret == 0) {
				/* EMPTY */
				MDSTAT_INCR(mhp, free_failed);
			}
		}
		if (ret != 0) {
			mutex_enter(&mhp->mh_mutex);
			return (free_get);
		}

		/*
		 * Put pressure on pageout.
		 */
		page_needfree(free_get);
		cv_signal(&proc_pageout->p_cv);

		mutex_enter(&mhp->mh_mutex);
		(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
		    DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK);
		mutex_exit(&mhp->mh_mutex);
		page_needfree(-(spgcnt_t)free_get);

		mutex_enter(&mhp->mh_mutex);
	}
	return (0);
}

#define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
#define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
/*
 * This function is run as a helper thread for delete_memory_thread.
 * It is needed in order to force kaio cleanup, so that pages used in kaio
 * will be unlocked and subsequently relocated by delete_memory_thread.
 * The address of the delete_memory_thread's mem_handle is passed in to
 * this thread function, and is used to set the mh_aio_cleanup_done member
 * prior to calling thread_exit().
 */
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned;
	int n = 0;
	struct mem_handle *mhp;
	volatile uint_t *pcancel;

	mhp = (struct mem_handle *)amhp;
	ASSERT(mhp != NULL);
	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
	if (modload("sys", "kaio") == -1) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
		thread_exit();
	}
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN,
		    "aio_cleanup_dr_delete_memory not found in kaio");
		thread_exit();
	}
	do {
		cleaned = 0;
		mutex_enter(&pidlock);
		for (procp = practive; (*pcancel == 0) && (procp != NULL);
		    procp = procp->p_next) {
			mutex_enter(&procp->p_lock);
			if (procp->p_aio != NULL) {
				/* cleanup proc's outstanding kaio */
				cleaned +=
				    (*aio_cleanup_dr_delete_memory)(procp);
			}
			mutex_exit(&procp->p_lock);
		}
		mutex_exit(&pidlock);
		if ((*pcancel == 0) &&
		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
			/* delay a bit before retrying all procs again */
			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
			n = 0;
		}
	} while (*pcancel == 0);
	mhp->mh_aio_cleanup_done = 1;
	thread_exit();
}

static void
delete_memory_thread(caddr_t amhp)
{
	struct mem_handle *mhp;
	struct memdelspan *mdsp;
	callb_cpr_t cprinfo;
	page_t *pp_targ;
	spgcnt_t freemem_left;
	void (*del_complete_funcp)(void *, int error);
	void *del_complete_arg;
	int comp_code;
	int ret;
	int first_scan;
1720 uint_t szc; 1721 #ifdef MEM_DEL_STATS 1722 uint64_t start_total, ntick_total; 1723 uint64_t start_pgrp, ntick_pgrp; 1724 #endif /* MEM_DEL_STATS */ 1725 1726 mhp = (struct mem_handle *)amhp; 1727 1728 #ifdef MEM_DEL_STATS 1729 start_total = ddi_get_lbolt(); 1730 #endif /* MEM_DEL_STATS */ 1731 1732 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1733 callb_generic_cpr, "memdel"); 1734 1735 mutex_enter(&mhp->mh_mutex); 1736 ASSERT(mhp->mh_state == MHND_STARTING); 1737 1738 mhp->mh_state = MHND_RUNNING; 1739 mhp->mh_thread_id = curthread; 1740 1741 mhp->mh_hold_todo = mhp->mh_vm_pages; 1742 mutex_exit(&mhp->mh_mutex); 1743 1744 /* Allocate the remap pages now, if necessary. */ 1745 memseg_remap_init(); 1746 1747 /* 1748 * Subtract from availrmem now if possible as availrmem 1749 * may not be available by the end of the delete. 1750 */ 1751 if (!get_availrmem(mhp->mh_vm_pages)) { 1752 comp_code = KPHYSM_ENOTVIABLE; 1753 mutex_enter(&mhp->mh_mutex); 1754 goto early_exit; 1755 } 1756 1757 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1758 1759 mutex_enter(&mhp->mh_mutex); 1760 1761 if (ret != 0) { 1762 mhp->mh_cancel = KPHYSM_EREFUSED; 1763 goto refused; 1764 } 1765 1766 transit_list_collect(mhp, 1); 1767 1768 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1769 mdsp = mdsp->mds_next) { 1770 ASSERT(mdsp->mds_bitmap == NULL); 1771 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1772 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1773 KM_SLEEP); 1774 } 1775 1776 first_scan = 1; 1777 freemem_left = 0; 1778 /* 1779 * Start dr_aio_cleanup_thread, which periodically iterates 1780 * through the process list and invokes aio cleanup. This 1781 * is needed in order to avoid a deadly embrace between the 1782 * delete_memory_thread (waiting on writer lock for page, with the 1783 * exclusive-wanted bit set), kaio read request threads (waiting for a 1784 * reader lock on the same page that is wanted by the 1785 * delete_memory_thread), and threads waiting for kaio completion 1786 * (blocked on spt_amp->lock). 1787 */ 1788 mhp->mh_dr_aio_cleanup_cancel = 0; 1789 mhp->mh_aio_cleanup_done = 0; 1790 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1791 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1792 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1793 pgcnt_t collected; 1794 1795 MDSTAT_INCR(mhp, nloop); 1796 collected = 0; 1797 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1798 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1799 pfn_t pfn, p_end; 1800 1801 p_end = mdsp->mds_base + mdsp->mds_npgs; 1802 for (pfn = mdsp->mds_base; (pfn < p_end) && 1803 (mhp->mh_cancel == 0); pfn++) { 1804 page_t *pp, *tpp, *tpp_targ; 1805 pgcnt_t bit; 1806 struct vnode *vp; 1807 u_offset_t offset; 1808 int mod, result; 1809 spgcnt_t pgcnt; 1810 1811 bit = pfn - mdsp->mds_base; 1812 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1813 (1 << (bit % NBPBMW))) != 0) { 1814 MDSTAT_INCR(mhp, already_done); 1815 continue; 1816 } 1817 if (freemem_left == 0) { 1818 freemem_left += delthr_get_freemem(mhp); 1819 if (freemem_left == 0) 1820 break; 1821 } 1822 1823 /* 1824 * Release mh_mutex - some of this 1825 * stuff takes some time (eg PUTPAGE). 1826 */ 1827 1828 mutex_exit(&mhp->mh_mutex); 1829 MDSTAT_INCR(mhp, ncheck); 1830 1831 pp = page_numtopp_nolock(pfn); 1832 if (pp == NULL) { 1833 /* 1834 * Not covered by a page_t - will 1835 * be dealt with elsewhere. 
1836 */ 1837 MDSTAT_INCR(mhp, nopaget); 1838 mutex_enter(&mhp->mh_mutex); 1839 mdsp->mds_bitmap[bit / NBPBMW] |= 1840 (1 << (bit % NBPBMW)); 1841 continue; 1842 } 1843 1844 if (!page_try_reclaim_lock(pp, SE_EXCL, 1845 SE_EXCL_WANTED | SE_RETIRED)) { 1846 /* 1847 * Page in use elsewhere. Skip it. 1848 */ 1849 MDSTAT_INCR(mhp, lockfail); 1850 mutex_enter(&mhp->mh_mutex); 1851 continue; 1852 } 1853 /* 1854 * See if the cage expanded into the delete. 1855 * This can happen as we have to allow the 1856 * cage to expand. 1857 */ 1858 if (PP_ISNORELOC(pp)) { 1859 page_unlock(pp); 1860 mutex_enter(&mhp->mh_mutex); 1861 mhp->mh_cancel = KPHYSM_ENONRELOC; 1862 break; 1863 } 1864 if (PP_RETIRED(pp)) { 1865 /* 1866 * Page has been retired and is 1867 * not part of the cage so we 1868 * can now do the accounting for 1869 * it. 1870 */ 1871 MDSTAT_INCR(mhp, retired); 1872 mutex_enter(&mhp->mh_mutex); 1873 mdsp->mds_bitmap[bit / NBPBMW] 1874 |= (1 << (bit % NBPBMW)); 1875 mdsp->mds_bitmap_retired[bit / 1876 NBPBMW] |= 1877 (1 << (bit % NBPBMW)); 1878 mhp->mh_hold_todo--; 1879 continue; 1880 } 1881 ASSERT(freemem_left != 0); 1882 if (PP_ISFREE(pp)) { 1883 /* 1884 * Like page_reclaim() only 'freemem' 1885 * processing is already done. 1886 */ 1887 MDSTAT_INCR(mhp, nfree); 1888 free_page_collect: 1889 if (PP_ISAGED(pp)) { 1890 page_list_sub(pp, 1891 PG_FREE_LIST); 1892 } else { 1893 page_list_sub(pp, 1894 PG_CACHE_LIST); 1895 } 1896 PP_CLRFREE(pp); 1897 PP_CLRAGED(pp); 1898 collected++; 1899 mutex_enter(&mhp->mh_mutex); 1900 page_delete_collect(pp, mhp); 1901 mdsp->mds_bitmap[bit / NBPBMW] |= 1902 (1 << (bit % NBPBMW)); 1903 freemem_left--; 1904 continue; 1905 } 1906 ASSERT(pp->p_vnode != NULL); 1907 if (first_scan) { 1908 MDSTAT_INCR(mhp, first_notfree); 1909 page_unlock(pp); 1910 mutex_enter(&mhp->mh_mutex); 1911 continue; 1912 } 1913 /* 1914 * Keep stats on pages encountered that 1915 * are marked for retirement. 1916 */ 1917 if (PP_TOXIC(pp)) { 1918 MDSTAT_INCR(mhp, toxic); 1919 } else if (PP_PR_REQ(pp)) { 1920 MDSTAT_INCR(mhp, failing); 1921 } 1922 /* 1923 * In certain cases below, special exceptions 1924 * are made for pages that are toxic. This 1925 * is because the current meaning of toxic 1926 * is that an uncorrectable error has been 1927 * previously associated with the page. 1928 */ 1929 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1930 if (!PP_TOXIC(pp)) { 1931 /* 1932 * Must relocate locked in 1933 * memory pages. 1934 */ 1935 #ifdef MEM_DEL_STATS 1936 start_pgrp = ddi_get_lbolt(); 1937 #endif /* MEM_DEL_STATS */ 1938 /* 1939 * Lock all constituent pages 1940 * of a large page to ensure 1941 * that p_szc won't change. 
1942 */ 1943 if (!group_page_trylock(pp, 1944 SE_EXCL)) { 1945 MDSTAT_INCR(mhp, 1946 gptllckfail); 1947 page_unlock(pp); 1948 mutex_enter( 1949 &mhp->mh_mutex); 1950 continue; 1951 } 1952 MDSTAT_INCR(mhp, npplocked); 1953 pp_targ = 1954 page_get_replacement_page( 1955 pp, NULL, 0); 1956 if (pp_targ != NULL) { 1957 #ifdef MEM_DEL_STATS 1958 ntick_pgrp = 1959 (uint64_t) 1960 ddi_get_lbolt() - 1961 start_pgrp; 1962 #endif /* MEM_DEL_STATS */ 1963 MDSTAT_PGRP(mhp, 1964 ntick_pgrp); 1965 MDSTAT_INCR(mhp, 1966 nlockreloc); 1967 goto reloc; 1968 } 1969 group_page_unlock(pp); 1970 page_unlock(pp); 1971 #ifdef MEM_DEL_STATS 1972 ntick_pgrp = 1973 (uint64_t)ddi_get_lbolt() - 1974 start_pgrp; 1975 #endif /* MEM_DEL_STATS */ 1976 MDSTAT_PGRP(mhp, ntick_pgrp); 1977 MDSTAT_INCR(mhp, nnorepl); 1978 mutex_enter(&mhp->mh_mutex); 1979 continue; 1980 } else { 1981 /* 1982 * Cannot do anything about 1983 * this page because it is 1984 * toxic. 1985 */ 1986 MDSTAT_INCR(mhp, npplkdtoxic); 1987 page_unlock(pp); 1988 mutex_enter(&mhp->mh_mutex); 1989 continue; 1990 } 1991 } 1992 /* 1993 * Unload the mappings and check if mod bit 1994 * is set. 1995 */ 1996 ASSERT(!PP_ISKAS(pp)); 1997 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1998 mod = hat_ismod(pp); 1999 2000 #ifdef MEM_DEL_STATS 2001 start_pgrp = ddi_get_lbolt(); 2002 #endif /* MEM_DEL_STATS */ 2003 if (mod && !PP_TOXIC(pp)) { 2004 /* 2005 * Lock all constituent pages 2006 * of a large page to ensure 2007 * that p_szc won't change. 2008 */ 2009 if (!group_page_trylock(pp, SE_EXCL)) { 2010 MDSTAT_INCR(mhp, gptlmodfail); 2011 page_unlock(pp); 2012 mutex_enter(&mhp->mh_mutex); 2013 continue; 2014 } 2015 pp_targ = page_get_replacement_page(pp, 2016 NULL, 0); 2017 if (pp_targ != NULL) { 2018 MDSTAT_INCR(mhp, nmodreloc); 2019 #ifdef MEM_DEL_STATS 2020 ntick_pgrp = 2021 (uint64_t)ddi_get_lbolt() - 2022 start_pgrp; 2023 #endif /* MEM_DEL_STATS */ 2024 MDSTAT_PGRP(mhp, ntick_pgrp); 2025 goto reloc; 2026 } 2027 group_page_unlock(pp); 2028 } 2029 2030 if (!page_try_demote_pages(pp)) { 2031 MDSTAT_INCR(mhp, demotefail); 2032 page_unlock(pp); 2033 #ifdef MEM_DEL_STATS 2034 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2035 start_pgrp; 2036 #endif /* MEM_DEL_STATS */ 2037 MDSTAT_PGRP(mhp, ntick_pgrp); 2038 mutex_enter(&mhp->mh_mutex); 2039 continue; 2040 } 2041 2042 /* 2043 * Regular 'page-out'. 2044 */ 2045 if (!mod) { 2046 MDSTAT_INCR(mhp, ndestroy); 2047 page_destroy(pp, 1); 2048 /* 2049 * page_destroy was called with 2050 * dontfree. As long as p_lckcnt 2051 * and p_cowcnt are both zero, the 2052 * only additional action of 2053 * page_destroy with !dontfree is to 2054 * call page_free, so we can collect 2055 * the page here. 2056 */ 2057 collected++; 2058 #ifdef MEM_DEL_STATS 2059 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2060 start_pgrp; 2061 #endif /* MEM_DEL_STATS */ 2062 MDSTAT_PGRP(mhp, ntick_pgrp); 2063 mutex_enter(&mhp->mh_mutex); 2064 page_delete_collect(pp, mhp); 2065 mdsp->mds_bitmap[bit / NBPBMW] |= 2066 (1 << (bit % NBPBMW)); 2067 continue; 2068 } 2069 /* 2070 * The page is toxic and the mod bit is 2071 * set, we cannot do anything here to deal 2072 * with it. 
2073 */ 2074 if (PP_TOXIC(pp)) { 2075 page_unlock(pp); 2076 #ifdef MEM_DEL_STATS 2077 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2078 start_pgrp; 2079 #endif /* MEM_DEL_STATS */ 2080 MDSTAT_PGRP(mhp, ntick_pgrp); 2081 MDSTAT_INCR(mhp, modtoxic); 2082 mutex_enter(&mhp->mh_mutex); 2083 continue; 2084 } 2085 MDSTAT_INCR(mhp, nputpage); 2086 vp = pp->p_vnode; 2087 offset = pp->p_offset; 2088 VN_HOLD(vp); 2089 page_unlock(pp); 2090 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2091 B_INVAL|B_FORCE, kcred, NULL); 2092 VN_RELE(vp); 2093 #ifdef MEM_DEL_STATS 2094 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2095 start_pgrp; 2096 #endif /* MEM_DEL_STATS */ 2097 MDSTAT_PGRP(mhp, ntick_pgrp); 2098 /* 2099 * Try to get the page back immediately 2100 * so that it can be collected. 2101 */ 2102 pp = page_numtopp_nolock(pfn); 2103 if (pp == NULL) { 2104 MDSTAT_INCR(mhp, nnoreclaim); 2105 /* 2106 * This should not happen as this 2107 * thread is deleting the page. 2108 * If this code is generalized, this 2109 * becomes a reality. 2110 */ 2111 #ifdef DEBUG 2112 cmn_err(CE_WARN, 2113 "delete_memory_thread(0x%p) " 2114 "pfn 0x%lx has no page_t", 2115 (void *)mhp, pfn); 2116 #endif /* DEBUG */ 2117 mutex_enter(&mhp->mh_mutex); 2118 continue; 2119 } 2120 if (page_try_reclaim_lock(pp, SE_EXCL, 2121 SE_EXCL_WANTED | SE_RETIRED)) { 2122 if (PP_ISFREE(pp)) { 2123 goto free_page_collect; 2124 } 2125 page_unlock(pp); 2126 } 2127 MDSTAT_INCR(mhp, nnoreclaim); 2128 mutex_enter(&mhp->mh_mutex); 2129 continue; 2130 2131 reloc: 2132 /* 2133 * Got some freemem and a target 2134 * page, so move the data to avoid 2135 * I/O and lock problems. 2136 */ 2137 ASSERT(!page_iolock_assert(pp)); 2138 MDSTAT_INCR(mhp, nreloc); 2139 /* 2140 * page_relocate() will return pgcnt: the 2141 * number of consecutive pages relocated. 2142 * If it is successful, pp will be a 2143 * linked list of the page structs that 2144 * were relocated. If page_relocate() is 2145 * unsuccessful, pp will be unmodified. 2146 */ 2147 #ifdef MEM_DEL_STATS 2148 start_pgrp = ddi_get_lbolt(); 2149 #endif /* MEM_DEL_STATS */ 2150 result = page_relocate(&pp, &pp_targ, 0, 0, 2151 &pgcnt, NULL); 2152 #ifdef MEM_DEL_STATS 2153 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2154 start_pgrp; 2155 #endif /* MEM_DEL_STATS */ 2156 MDSTAT_PGRP(mhp, ntick_pgrp); 2157 if (result != 0) { 2158 MDSTAT_INCR(mhp, nrelocfail); 2159 /* 2160 * We did not succeed. We need 2161 * to give the pp_targ pages back. 2162 * page_free(pp_targ, 1) without 2163 * the freemem accounting. 2164 */ 2165 group_page_unlock(pp); 2166 page_free_replacement_page(pp_targ); 2167 page_unlock(pp); 2168 mutex_enter(&mhp->mh_mutex); 2169 continue; 2170 } 2171 2172 /* 2173 * We will then collect pgcnt pages. 2174 */ 2175 ASSERT(pgcnt > 0); 2176 mutex_enter(&mhp->mh_mutex); 2177 /* 2178 * We need to make sure freemem_left is 2179 * large enough. 2180 */ 2181 while ((freemem_left < pgcnt) && 2182 (!mhp->mh_cancel)) { 2183 freemem_left += 2184 delthr_get_freemem(mhp); 2185 } 2186 2187 /* 2188 * Do not proceed if mh_cancel is set. 2189 */ 2190 if (mhp->mh_cancel) { 2191 while (pp_targ != NULL) { 2192 /* 2193 * Unlink and unlock each page. 2194 */ 2195 tpp_targ = pp_targ; 2196 page_sub(&pp_targ, tpp_targ); 2197 page_unlock(tpp_targ); 2198 } 2199 /* 2200 * We need to give the pp pages back. 2201 * page_free(pp, 1) without the 2202 * freemem accounting. 
2203 */ 2204 page_free_replacement_page(pp); 2205 break; 2206 } 2207 2208 /* Now remove pgcnt from freemem_left */ 2209 freemem_left -= pgcnt; 2210 ASSERT(freemem_left >= 0); 2211 szc = pp->p_szc; 2212 while (pp != NULL) { 2213 /* 2214 * pp and pp_targ were passed back as 2215 * a linked list of pages. 2216 * Unlink and unlock each page. 2217 */ 2218 tpp_targ = pp_targ; 2219 page_sub(&pp_targ, tpp_targ); 2220 page_unlock(tpp_targ); 2221 /* 2222 * The original page is now free 2223 * so remove it from the linked 2224 * list and collect it. 2225 */ 2226 tpp = pp; 2227 page_sub(&pp, tpp); 2228 pfn = page_pptonum(tpp); 2229 collected++; 2230 ASSERT(PAGE_EXCL(tpp)); 2231 ASSERT(tpp->p_vnode == NULL); 2232 ASSERT(!hat_page_is_mapped(tpp)); 2233 ASSERT(tpp->p_szc == szc); 2234 tpp->p_szc = 0; 2235 page_delete_collect(tpp, mhp); 2236 bit = pfn - mdsp->mds_base; 2237 mdsp->mds_bitmap[bit / NBPBMW] |= 2238 (1 << (bit % NBPBMW)); 2239 } 2240 ASSERT(pp_targ == NULL); 2241 } 2242 } 2243 first_scan = 0; 2244 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2245 (collected == 0)) { 2246 /* 2247 * This code is needed as we cannot wait 2248 * for a page to be locked OR the delete to 2249 * be cancelled. Also, we must delay so 2250 * that other threads get a chance to run 2251 * on our cpu, otherwise page locks may be 2252 * held indefinitely by those threads. 2253 */ 2254 MDSTAT_INCR(mhp, ndelay); 2255 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2256 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, 2257 DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK); 2258 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2259 } 2260 } 2261 /* stop the dr aio cleanup thread */ 2262 mhp->mh_dr_aio_cleanup_cancel = 1; 2263 transit_list_collect(mhp, 0); 2264 if (freemem_left != 0) { 2265 /* Return any surplus. */ 2266 page_create_putback(freemem_left); 2267 freemem_left = 0; 2268 } 2269 #ifdef MEM_DEL_STATS 2270 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2271 #endif /* MEM_DEL_STATS */ 2272 MDSTAT_TOTAL(mhp, ntick_total); 2273 MDSTAT_PRINT(mhp); 2274 2275 /* 2276 * If the memory delete was cancelled, exclusive-wanted bits must 2277 * be cleared. If there are retired pages being deleted, they need 2278 * to be unretired. 2279 */ 2280 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2281 mdsp = mdsp->mds_next) { 2282 pfn_t pfn, p_end; 2283 2284 p_end = mdsp->mds_base + mdsp->mds_npgs; 2285 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2286 page_t *pp; 2287 pgcnt_t bit; 2288 2289 bit = pfn - mdsp->mds_base; 2290 if (mhp->mh_cancel) { 2291 pp = page_numtopp_nolock(pfn); 2292 if (pp != NULL) { 2293 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2294 (1 << (bit % NBPBMW))) == 0) { 2295 page_lock_clr_exclwanted(pp); 2296 } 2297 } 2298 } else { 2299 pp = NULL; 2300 } 2301 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2302 (1 << (bit % NBPBMW))) != 0) { 2303 /* do we already have pp? */ 2304 if (pp == NULL) { 2305 pp = page_numtopp_nolock(pfn); 2306 } 2307 ASSERT(pp != NULL); 2308 ASSERT(PP_RETIRED(pp)); 2309 if (mhp->mh_cancel != 0) { 2310 page_unlock(pp); 2311 /* 2312 * To satisfy ASSERT below in 2313 * cancel code. 
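* (I.e. the ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages) check in
* the cancel path below; incrementing mh_hold_todo keeps the
* accounting balanced for this retired page.)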
2314 */ 2315 mhp->mh_hold_todo++; 2316 } else { 2317 (void) page_unretire_pp(pp, 2318 PR_UNR_CLEAN); 2319 } 2320 } 2321 } 2322 } 2323 /* 2324 * Free retired page bitmap and collected page bitmap 2325 */ 2326 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2327 mdsp = mdsp->mds_next) { 2328 ASSERT(mdsp->mds_bitmap_retired != NULL); 2329 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2330 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2331 ASSERT(mdsp->mds_bitmap != NULL); 2332 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2333 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2334 } 2335 2336 /* wait for our dr aio cancel thread to exit */ 2337 while (!(mhp->mh_aio_cleanup_done)) { 2338 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2339 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2340 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2341 } 2342 refused: 2343 if (mhp->mh_cancel != 0) { 2344 page_t *pp; 2345 2346 comp_code = mhp->mh_cancel; 2347 /* 2348 * Go through list of deleted pages (mh_deleted) freeing 2349 * them. 2350 */ 2351 while ((pp = mhp->mh_deleted) != NULL) { 2352 mhp->mh_deleted = pp->p_next; 2353 mhp->mh_hold_todo++; 2354 mutex_exit(&mhp->mh_mutex); 2355 /* Restore p_next. */ 2356 pp->p_next = pp->p_prev; 2357 if (PP_ISFREE(pp)) { 2358 cmn_err(CE_PANIC, 2359 "page %p is free", 2360 (void *)pp); 2361 } 2362 page_free(pp, 1); 2363 mutex_enter(&mhp->mh_mutex); 2364 } 2365 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2366 2367 mutex_exit(&mhp->mh_mutex); 2368 put_availrmem(mhp->mh_vm_pages); 2369 mutex_enter(&mhp->mh_mutex); 2370 2371 goto t_exit; 2372 } 2373 2374 /* 2375 * All the pages are no longer in use and are exclusively locked. 2376 */ 2377 2378 mhp->mh_deleted = NULL; 2379 2380 kphysm_del_cleanup(mhp); 2381 2382 /* 2383 * mem_node_del_range needs to be after kphysm_del_cleanup so 2384 * that the mem_node_config[] will remain intact for the cleanup. 2385 */ 2386 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2387 mdsp = mdsp->mds_next) { 2388 mem_node_del_range(mdsp->mds_base, 2389 mdsp->mds_base + mdsp->mds_npgs - 1); 2390 } 2391 /* cleanup the page counters */ 2392 page_ctrs_cleanup(); 2393 2394 comp_code = KPHYSM_OK; 2395 2396 t_exit: 2397 mutex_exit(&mhp->mh_mutex); 2398 kphysm_setup_post_del(mhp->mh_vm_pages, 2399 (comp_code == KPHYSM_OK) ? 0 : 1); 2400 mutex_enter(&mhp->mh_mutex); 2401 2402 early_exit: 2403 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2404 mhp->mh_state = MHND_DONE; 2405 del_complete_funcp = mhp->mh_delete_complete; 2406 del_complete_arg = mhp->mh_delete_complete_arg; 2407 CALLB_CPR_EXIT(&cprinfo); 2408 (*del_complete_funcp)(del_complete_arg, comp_code); 2409 thread_exit(); 2410 /*NOTREACHED*/ 2411 } 2412 2413 /* 2414 * Start the delete of the memory from the system. 
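* The handle must be in the MHND_INIT state and have at least one
* span queued. The completion callback and its argument are
* recorded, and the actual work is handed off to
* delete_memory_thread.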
2415 */ 2416 int 2417 kphysm_del_start( 2418 memhandle_t handle, 2419 void (*complete)(void *, int), 2420 void *complete_arg) 2421 { 2422 struct mem_handle *mhp; 2423 2424 mhp = kphysm_lookup_mem_handle(handle); 2425 if (mhp == NULL) { 2426 return (KPHYSM_EHANDLE); 2427 } 2428 switch (mhp->mh_state) { 2429 case MHND_FREE: 2430 ASSERT(mhp->mh_state != MHND_FREE); 2431 mutex_exit(&mhp->mh_mutex); 2432 return (KPHYSM_EHANDLE); 2433 case MHND_INIT: 2434 break; 2435 case MHND_STARTING: 2436 case MHND_RUNNING: 2437 mutex_exit(&mhp->mh_mutex); 2438 return (KPHYSM_ESEQUENCE); 2439 case MHND_DONE: 2440 mutex_exit(&mhp->mh_mutex); 2441 return (KPHYSM_ESEQUENCE); 2442 case MHND_RELEASE: 2443 mutex_exit(&mhp->mh_mutex); 2444 return (KPHYSM_ESEQUENCE); 2445 default: 2446 #ifdef DEBUG 2447 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2448 (void *)mhp, mhp->mh_state); 2449 #endif /* DEBUG */ 2450 mutex_exit(&mhp->mh_mutex); 2451 return (KPHYSM_EHANDLE); 2452 } 2453 2454 if (mhp->mh_transit.trl_spans == NULL) { 2455 mutex_exit(&mhp->mh_mutex); 2456 return (KPHYSM_ENOWORK); 2457 } 2458 2459 ASSERT(complete != NULL); 2460 mhp->mh_delete_complete = complete; 2461 mhp->mh_delete_complete_arg = complete_arg; 2462 mhp->mh_state = MHND_STARTING; 2463 /* 2464 * Release the mutex in case thread_create sleeps. 2465 */ 2466 mutex_exit(&mhp->mh_mutex); 2467 2468 /* 2469 * The "obvious" process for this thread is pageout (proc_pageout) 2470 * but this gives the thread too much power over freemem 2471 * which results in freemem starvation. 2472 */ 2473 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2474 TS_RUN, maxclsyspri - 1); 2475 2476 return (KPHYSM_OK); 2477 } 2478 2479 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2480 static caddr_t pp_dummy; 2481 static pgcnt_t pp_dummy_npages; 2482 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ 2483 2484 static void 2485 memseg_remap_init_pages(page_t *pages, page_t *epages) 2486 { 2487 page_t *pp; 2488 2489 for (pp = pages; pp < epages; pp++) { 2490 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2491 pp->p_offset = (u_offset_t)-1; 2492 page_iolock_init(pp); 2493 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2494 continue; 2495 page_lock_delete(pp); 2496 } 2497 } 2498 2499 void 2500 memseg_remap_init() 2501 { 2502 mutex_enter(&pp_dummy_lock); 2503 if (pp_dummy == NULL) { 2504 uint_t dpages; 2505 int i; 2506 2507 /* 2508 * dpages starts off as the size of the structure and 2509 * ends up as the minimum number of pages that will 2510 * hold a whole number of page_t structures. 2511 */ 2512 dpages = sizeof (page_t); 2513 ASSERT(dpages != 0); 2514 ASSERT(dpages <= MMU_PAGESIZE); 2515 2516 while ((dpages & 1) == 0) 2517 dpages >>= 1; 2518 2519 pp_dummy_npages = dpages; 2520 /* 2521 * Allocate pp_dummy pages directly from static_arena, 2522 * since these are whole page allocations and are 2523 * referenced by physical address. This also has the 2524 * nice fringe benefit of hiding the memory from 2525 * ::findleaks since it doesn't deal well with allocated 2526 * kernel heap memory that doesn't have any mappings. 
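* The pfn of each dummy page is recorded in pp_dummy_pfn[] below so
* that remap_to_dummy() can later remap page_t ranges to these pages
* by physical address, even after the kmem mappings are unloaded.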
2527 */ 2528 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2529 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2530 bzero(pp_dummy, ptob(pp_dummy_npages)); 2531 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2532 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2533 pp_dummy_npages, KM_SLEEP); 2534 for (i = 0; i < pp_dummy_npages; i++) { 2535 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2536 &pp_dummy[MMU_PAGESIZE * i]); 2537 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2538 } 2539 /* 2540 * Initialize the page_t's to a known 'deleted' state 2541 * that matches the state of deleted pages. 2542 */ 2543 memseg_remap_init_pages((page_t *)pp_dummy, 2544 (page_t *)(pp_dummy + ptob(pp_dummy_npages))); 2545 /* Remove kmem mappings for the pages for safety. */ 2546 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2547 HAT_UNLOAD_UNLOCK); 2548 /* Leave pp_dummy pointer set as flag that init is done. */ 2549 } 2550 mutex_exit(&pp_dummy_lock); 2551 } 2552 2553 /* 2554 * Remap a page-aligned range of page_t's to dummy pages. 2555 */ 2556 void 2557 remap_to_dummy(caddr_t va, pgcnt_t metapgs) 2558 { 2559 int phase; 2560 2561 ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE)); 2562 2563 /* 2564 * We may start remapping at a non-zero page offset 2565 * within the dummy pages since the low/high ends 2566 * of the outgoing pp's could be shared by other 2567 * memsegs (see memseg_remap_meta). 2568 */ 2569 phase = btop((uint64_t)va) % pp_dummy_npages; 2570 ASSERT(PAGESIZE % sizeof (page_t) || phase == 0); 2571 2572 while (metapgs != 0) { 2573 pgcnt_t n; 2574 int i, j; 2575 2576 n = pp_dummy_npages; 2577 if (n > metapgs) 2578 n = metapgs; 2579 for (i = 0; i < n; i++) { 2580 j = (i + phase) % pp_dummy_npages; 2581 hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j], 2582 PROT_READ, 2583 HAT_LOAD | HAT_LOAD_NOCONSIST | 2584 HAT_LOAD_REMAP); 2585 va += ptob(1); 2586 } 2587 metapgs -= n; 2588 } 2589 } 2590 2591 static void 2592 memseg_remap_to_dummy(struct memseg *seg) 2593 { 2594 caddr_t pp; 2595 pgcnt_t metapgs; 2596 2597 ASSERT(memseg_is_dynamic(seg)); 2598 ASSERT(pp_dummy != NULL); 2599 2600 2601 if (!memseg_includes_meta(seg)) { 2602 memseg_remap_meta(seg); 2603 return; 2604 } 2605 2606 pp = (caddr_t)seg->pages; 2607 metapgs = seg->pages_base - memseg_get_start(seg); 2608 ASSERT(metapgs != 0); 2609 2610 seg->pages_end = seg->pages_base; 2611 2612 remap_to_dummy(pp, metapgs); 2613 } 2614 2615 /* 2616 * Transition all the deleted pages to the deleted state so that 2617 * page_lock will not wait. The page_lock_delete call will 2618 * also wake up any waiters. 2619 */ 2620 static void 2621 memseg_lock_delete_all(struct memseg *seg) 2622 { 2623 page_t *pp; 2624 2625 for (pp = seg->pages; pp < seg->epages; pp++) { 2626 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2627 page_lock_delete(pp); 2628 } 2629 } 2630 2631 static void 2632 kphysm_del_cleanup(struct mem_handle *mhp) 2633 { 2634 struct memdelspan *mdsp; 2635 struct memseg *seg; 2636 struct memseg **segpp; 2637 struct memseg *seglist; 2638 pfn_t p_end; 2639 uint64_t avmem; 2640 pgcnt_t avpgs; 2641 pgcnt_t npgs; 2642 2643 avpgs = mhp->mh_vm_pages; 2644 2645 memsegs_lock(1); 2646 2647 /* 2648 * remove from main segment list.
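* Each memseg that lies wholly within one of the deleted spans is
* unlinked from the global memsegs chain and gathered on a private
* seglist; its page_t metadata is remapped or set aside once the
* locks have been dropped.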
2649 */ 2650 npgs = 0; 2651 seglist = NULL; 2652 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2653 mdsp = mdsp->mds_next) { 2654 p_end = mdsp->mds_base + mdsp->mds_npgs; 2655 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2656 if (seg->pages_base >= p_end || 2657 seg->pages_end <= mdsp->mds_base) { 2658 /* Span and memseg don't overlap. */ 2659 segpp = &((*segpp)->next); 2660 continue; 2661 } 2662 ASSERT(seg->pages_base >= mdsp->mds_base); 2663 ASSERT(seg->pages_end <= p_end); 2664 2665 PLCNT_MODIFY_MAX(seg->pages_base, 2666 seg->pages_base - seg->pages_end); 2667 2668 /* Hide the memseg from future scans. */ 2669 hat_kpm_delmem_mseg_update(seg, segpp); 2670 *segpp = seg->next; 2671 membar_producer(); /* TODO: Needed? */ 2672 npgs += MSEG_NPAGES(seg); 2673 2674 /* 2675 * Leave the deleted segment's next pointer intact 2676 * in case a memsegs scanning loop is walking this 2677 * segment concurrently. 2678 */ 2679 seg->lnext = seglist; 2680 seglist = seg; 2681 } 2682 } 2683 2684 build_pfn_hash(); 2685 2686 ASSERT(npgs < total_pages); 2687 total_pages -= npgs; 2688 2689 /* 2690 * Recalculate the paging parameters now total_pages has changed. 2691 * This will also cause the clock hands to be reset before next use. 2692 */ 2693 setupclock(1); 2694 2695 memsegs_unlock(1); 2696 2697 mutex_exit(&mhp->mh_mutex); 2698 2699 while ((seg = seglist) != NULL) { 2700 pfn_t mseg_start; 2701 pfn_t mseg_base, mseg_end; 2702 pgcnt_t mseg_npgs; 2703 int mlret; 2704 2705 seglist = seg->lnext; 2706 2707 /* 2708 * Put the page_t's into the deleted state to stop 2709 * cv_wait()s on the pages. When we remap, the dummy 2710 * page_t's will be in the same state. 2711 */ 2712 memseg_lock_delete_all(seg); 2713 /* 2714 * Collect up information based on pages_base and pages_end 2715 * early so that we can flag early that the memseg has been 2716 * deleted by setting pages_end == pages_base. 2717 */ 2718 mseg_base = seg->pages_base; 2719 mseg_end = seg->pages_end; 2720 mseg_npgs = MSEG_NPAGES(seg); 2721 mseg_start = memseg_get_start(seg); 2722 2723 if (memseg_is_dynamic(seg)) { 2724 /* Remap the meta data to our special dummy area. */ 2725 memseg_remap_to_dummy(seg); 2726 2727 mutex_enter(&memseg_lists_lock); 2728 seg->lnext = memseg_va_avail; 2729 memseg_va_avail = seg; 2730 mutex_exit(&memseg_lists_lock); 2731 } else { 2732 /* 2733 * For memory whose page_ts were allocated 2734 * at boot, we need to find a new use for 2735 * the page_t memory. 2736 * For the moment, just leak it. 2737 * (It is held in the memseg_delete_junk list.) 2738 */ 2739 seg->pages_end = seg->pages_base; 2740 2741 mutex_enter(&memseg_lists_lock); 2742 seg->lnext = memseg_delete_junk; 2743 memseg_delete_junk = seg; 2744 mutex_exit(&memseg_lists_lock); 2745 } 2746 2747 /* Must not use seg now as it could be re-used. 
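* (It is now on either the memseg_va_avail list or the
* memseg_delete_junk list.)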
*/ 2748 2749 memlist_write_lock(); 2750 2751 mlret = memlist_delete_span( 2752 (uint64_t)(mseg_base) << PAGESHIFT, 2753 (uint64_t)(mseg_npgs) << PAGESHIFT, 2754 &phys_avail); 2755 ASSERT(mlret == MEML_SPANOP_OK); 2756 2757 mlret = memlist_delete_span( 2758 (uint64_t)(mseg_start) << PAGESHIFT, 2759 (uint64_t)(mseg_end - mseg_start) << 2760 PAGESHIFT, 2761 &phys_install); 2762 ASSERT(mlret == MEML_SPANOP_OK); 2763 phys_install_has_changed(); 2764 2765 memlist_write_unlock(); 2766 } 2767 2768 memlist_read_lock(); 2769 installed_top_size(phys_install, &physmax, &physinstalled); 2770 memlist_read_unlock(); 2771 2772 mutex_enter(&freemem_lock); 2773 maxmem -= avpgs; 2774 physmem -= avpgs; 2775 /* availrmem is adjusted during the delete. */ 2776 availrmem_initial -= avpgs; 2777 2778 mutex_exit(&freemem_lock); 2779 2780 dump_resize(); 2781 2782 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2783 "(0x%" PRIx64 ")\n", 2784 physinstalled << (PAGESHIFT - 10), 2785 (uint64_t)physinstalled << PAGESHIFT); 2786 2787 avmem = (uint64_t)freemem << PAGESHIFT; 2788 cmn_err(CE_CONT, "?kphysm_delete: " 2789 "avail mem = %" PRId64 "\n", avmem); 2790 2791 /* 2792 * Update lgroup generation number on single lgroup systems 2793 */ 2794 if (nlgrps == 1) 2795 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2796 2797 /* Successfully deleted system memory */ 2798 mutex_enter(&mhp->mh_mutex); 2799 } 2800 2801 static uint_t mdel_nullvp_waiter; 2802 2803 static void 2804 page_delete_collect( 2805 page_t *pp, 2806 struct mem_handle *mhp) 2807 { 2808 if (pp->p_vnode) { 2809 page_hashout(pp, (kmutex_t *)NULL); 2810 /* do not do PP_SETAGED(pp); */ 2811 } else { 2812 kmutex_t *sep; 2813 2814 sep = page_se_mutex(pp); 2815 mutex_enter(sep); 2816 if (CV_HAS_WAITERS(&pp->p_cv)) { 2817 mdel_nullvp_waiter++; 2818 cv_broadcast(&pp->p_cv); 2819 } 2820 mutex_exit(sep); 2821 } 2822 ASSERT(pp->p_next == pp->p_prev); 2823 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2824 pp->p_next = mhp->mh_deleted; 2825 mhp->mh_deleted = pp; 2826 ASSERT(mhp->mh_hold_todo != 0); 2827 mhp->mh_hold_todo--; 2828 } 2829 2830 static void 2831 transit_list_collect(struct mem_handle *mhp, int v) 2832 { 2833 struct transit_list_head *trh; 2834 2835 trh = &transit_list_head; 2836 mutex_enter(&trh->trh_lock); 2837 mhp->mh_transit.trl_collect = v; 2838 mutex_exit(&trh->trh_lock); 2839 } 2840 2841 static void 2842 transit_list_insert(struct transit_list *tlp) 2843 { 2844 struct transit_list_head *trh; 2845 2846 trh = &transit_list_head; 2847 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2848 tlp->trl_next = trh->trh_head; 2849 trh->trh_head = tlp; 2850 } 2851 2852 static void 2853 transit_list_remove(struct transit_list *tlp) 2854 { 2855 struct transit_list_head *trh; 2856 struct transit_list **tlpp; 2857 2858 trh = &transit_list_head; 2859 tlpp = &trh->trh_head; 2860 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2861 while (*tlpp != NULL && *tlpp != tlp) 2862 tlpp = &(*tlpp)->trl_next; 2863 ASSERT(*tlpp != NULL); 2864 if (*tlpp == tlp) 2865 *tlpp = tlp->trl_next; 2866 tlp->trl_next = NULL; 2867 } 2868 2869 static struct transit_list * 2870 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2871 { 2872 struct transit_list *tlp; 2873 2874 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2875 struct memdelspan *mdsp; 2876 2877 for (mdsp = tlp->trl_spans; mdsp != NULL; 2878 mdsp = mdsp->mds_next) { 2879 if (pfnum >= mdsp->mds_base && 2880 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2881 return (tlp); 2882 } 2883 } 2884 } 2885 return (NULL); 2886 } 2887 2888 int 
2889 pfn_is_being_deleted(pfn_t pfnum) 2890 { 2891 struct transit_list_head *trh; 2892 struct transit_list *tlp; 2893 int ret; 2894 2895 trh = &transit_list_head; 2896 if (trh->trh_head == NULL) 2897 return (0); 2898 2899 mutex_enter(&trh->trh_lock); 2900 tlp = pfnum_to_transit_list(trh, pfnum); 2901 ret = (tlp != NULL && tlp->trl_collect); 2902 mutex_exit(&trh->trh_lock); 2903 2904 return (ret); 2905 } 2906 2907 #ifdef MEM_DEL_STATS 2908 extern int hz; 2909 static void 2910 mem_del_stat_print_func(struct mem_handle *mhp) 2911 { 2912 uint64_t tmp; 2913 2914 if (mem_del_stat_print) { 2915 printf("memory delete loop %x/%x, statistics%s\n", 2916 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2917 (uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2918 (mhp->mh_cancel ? " (cancelled)" : "")); 2919 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2920 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2921 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2922 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2923 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2924 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2925 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2926 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2927 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2928 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2929 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2930 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2931 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2932 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2933 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2934 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2935 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2936 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2937 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2938 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2939 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2940 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2941 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2942 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2943 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2944 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2945 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2946 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2947 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2948 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2949 printf( 2950 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2951 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2952 2953 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2954 printf( 2955 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2956 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2957 } 2958 } 2959 #endif /* MEM_DEL_STATS */ 2960 2961 struct mem_callback { 2962 kphysm_setup_vector_t *vec; 2963 void *arg; 2964 }; 2965 2966 #define NMEMCALLBACKS 100 2967 2968 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2969 static uint_t nmemcallbacks; 2970 static krwlock_t mem_callback_rwlock; 2971 2972 int 2973 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2974 { 2975 uint_t i, found; 2976 2977 /* 2978 * This test will become more complicated when the version must 2979 * change. 
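*
* For reference, the vector checked here is normally supplied by the
* client as a statically initialized kphysm_setup_vector_t. A
* minimal sketch (the foo_* names are hypothetical):
*
*	static void foo_post_add(void *arg, pgcnt_t delta_pages);
*	static int foo_pre_del(void *arg, pgcnt_t delta_pages);
*	static void foo_post_del(void *arg, pgcnt_t delta_pages,
*	    int cancelled);
*
*	static kphysm_setup_vector_t foo_vec = {
*		.version = KPHYSM_SETUP_VECTOR_VERSION,
*		.post_add = foo_post_add,
*		.pre_del = foo_pre_del,
*		.post_del = foo_post_del
*	};
*
*	(void) kphysm_setup_func_register(&foo_vec, foo_arg);
*
* A matching kphysm_setup_func_unregister(&foo_vec, foo_arg) call
* removes the registration again.
*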
2980 */ 2981 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2982 return (EINVAL); 2983 2984 if (vec->post_add == NULL || vec->pre_del == NULL || 2985 vec->post_del == NULL) 2986 return (EINVAL); 2987 2988 rw_enter(&mem_callback_rwlock, RW_WRITER); 2989 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2990 if (mem_callbacks[i].vec == NULL && found == 0) 2991 found = i + 1; 2992 if (mem_callbacks[i].vec == vec && 2993 mem_callbacks[i].arg == arg) { 2994 #ifdef DEBUG 2995 /* Catch this in DEBUG kernels. */ 2996 cmn_err(CE_WARN, "kphysm_setup_func_register" 2997 "(0x%p, 0x%p) duplicate registration from 0x%p", 2998 (void *)vec, arg, (void *)caller()); 2999 #endif /* DEBUG */ 3000 rw_exit(&mem_callback_rwlock); 3001 return (EEXIST); 3002 } 3003 } 3004 if (found != 0) { 3005 i = found - 1; 3006 } else { 3007 ASSERT(nmemcallbacks < NMEMCALLBACKS); 3008 if (nmemcallbacks == NMEMCALLBACKS) { 3009 rw_exit(&mem_callback_rwlock); 3010 return (ENOMEM); 3011 } 3012 i = nmemcallbacks++; 3013 } 3014 mem_callbacks[i].vec = vec; 3015 mem_callbacks[i].arg = arg; 3016 rw_exit(&mem_callback_rwlock); 3017 return (0); 3018 } 3019 3020 void 3021 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 3022 { 3023 uint_t i; 3024 3025 rw_enter(&mem_callback_rwlock, RW_WRITER); 3026 for (i = 0; i < nmemcallbacks; i++) { 3027 if (mem_callbacks[i].vec == vec && 3028 mem_callbacks[i].arg == arg) { 3029 mem_callbacks[i].vec = NULL; 3030 mem_callbacks[i].arg = NULL; 3031 if (i == (nmemcallbacks - 1)) 3032 nmemcallbacks--; 3033 break; 3034 } 3035 } 3036 rw_exit(&mem_callback_rwlock); 3037 } 3038 3039 static void 3040 kphysm_setup_post_add(pgcnt_t delta_pages) 3041 { 3042 uint_t i; 3043 3044 rw_enter(&mem_callback_rwlock, RW_READER); 3045 for (i = 0; i < nmemcallbacks; i++) { 3046 if (mem_callbacks[i].vec != NULL) { 3047 (*mem_callbacks[i].vec->post_add) 3048 (mem_callbacks[i].arg, delta_pages); 3049 } 3050 } 3051 rw_exit(&mem_callback_rwlock); 3052 } 3053 3054 /* 3055 * Note the locking between pre_del and post_del: The reader lock is held 3056 * between the two calls to stop the set of functions from changing. 3057 */ 3058 3059 static int 3060 kphysm_setup_pre_del(pgcnt_t delta_pages) 3061 { 3062 uint_t i; 3063 int ret; 3064 int aret; 3065 3066 ret = 0; 3067 rw_enter(&mem_callback_rwlock, RW_READER); 3068 for (i = 0; i < nmemcallbacks; i++) { 3069 if (mem_callbacks[i].vec != NULL) { 3070 aret = (*mem_callbacks[i].vec->pre_del) 3071 (mem_callbacks[i].arg, delta_pages); 3072 ret |= aret; 3073 } 3074 } 3075 3076 return (ret); 3077 } 3078 3079 static void 3080 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 3081 { 3082 uint_t i; 3083 3084 for (i = 0; i < nmemcallbacks; i++) { 3085 if (mem_callbacks[i].vec != NULL) { 3086 (*mem_callbacks[i].vec->post_del) 3087 (mem_callbacks[i].arg, delta_pages, cancelled); 3088 } 3089 } 3090 rw_exit(&mem_callback_rwlock); 3091 } 3092 3093 static int 3094 kphysm_split_memseg( 3095 pfn_t base, 3096 pgcnt_t npgs) 3097 { 3098 struct memseg *seg; 3099 struct memseg **segpp; 3100 pgcnt_t size_low, size_high; 3101 struct memseg *seg_low, *seg_mid, *seg_high; 3102 3103 /* 3104 * Lock the memsegs list against other updates now 3105 */ 3106 memsegs_lock(1); 3107 3108 /* 3109 * Find boot time memseg that wholly covers this area. 3110 */ 3111 3112 /* First find the memseg with page 'base' in it. 
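* Only a memseg that wholly contains [base, base + npgs) and does
* not include its own page_t metadata qualifies; otherwise the
* split is refused below.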
*/ 3113 for (segpp = &memsegs; (seg = *segpp) != NULL; 3114 segpp = &((*segpp)->next)) { 3115 if (base >= seg->pages_base && base < seg->pages_end) 3116 break; 3117 } 3118 if (seg == NULL) { 3119 memsegs_unlock(1); 3120 return (0); 3121 } 3122 if (memseg_includes_meta(seg)) { 3123 memsegs_unlock(1); 3124 return (0); 3125 } 3126 if ((base + npgs) > seg->pages_end) { 3127 memsegs_unlock(1); 3128 return (0); 3129 } 3130 3131 /* 3132 * Work out the size of the two segments that will 3133 * surround the new segment, one for low address 3134 * and one for high. 3135 */ 3136 ASSERT(base >= seg->pages_base); 3137 size_low = base - seg->pages_base; 3138 ASSERT(seg->pages_end >= (base + npgs)); 3139 size_high = seg->pages_end - (base + npgs); 3140 3141 /* 3142 * Sanity check. 3143 */ 3144 if ((size_low + size_high) == 0) { 3145 memsegs_unlock(1); 3146 return (0); 3147 } 3148 3149 /* 3150 * Allocate the new structures. The old memseg will not be freed 3151 * as there may be a reference to it. 3152 */ 3153 seg_low = NULL; 3154 seg_high = NULL; 3155 3156 if (size_low != 0) 3157 seg_low = memseg_alloc(); 3158 3159 seg_mid = memseg_alloc(); 3160 3161 if (size_high != 0) 3162 seg_high = memseg_alloc(); 3163 3164 /* 3165 * All allocation done now. 3166 */ 3167 if (size_low != 0) { 3168 seg_low->pages = seg->pages; 3169 seg_low->epages = seg_low->pages + size_low; 3170 seg_low->pages_base = seg->pages_base; 3171 seg_low->pages_end = seg_low->pages_base + size_low; 3172 seg_low->next = seg_mid; 3173 seg_low->msegflags = seg->msegflags; 3174 } 3175 if (size_high != 0) { 3176 seg_high->pages = seg->epages - size_high; 3177 seg_high->epages = seg_high->pages + size_high; 3178 seg_high->pages_base = seg->pages_end - size_high; 3179 seg_high->pages_end = seg_high->pages_base + size_high; 3180 seg_high->next = seg->next; 3181 seg_high->msegflags = seg->msegflags; 3182 } 3183 3184 seg_mid->pages = seg->pages + size_low; 3185 seg_mid->pages_base = seg->pages_base + size_low; 3186 seg_mid->epages = seg->epages - size_high; 3187 seg_mid->pages_end = seg->pages_end - size_high; 3188 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3189 seg_mid->msegflags = seg->msegflags; 3190 3191 /* 3192 * Update hat_kpm specific info of all involved memsegs and 3193 * allow hat_kpm specific global chain updates. 3194 */ 3195 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3196 3197 /* 3198 * At this point we have two equivalent memseg sub-chains, 3199 * seg and seg_low/seg_mid/seg_high, which both chain on to 3200 * the same place in the global chain. By re-writing the pointer 3201 * in the previous element we switch atomically from using the old 3202 * (seg) to the new. 3203 */ 3204 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3205 3206 membar_enter(); 3207 3208 build_pfn_hash(); 3209 memsegs_unlock(1); 3210 3211 /* 3212 * We leave the old segment, 'seg', intact as there may be 3213 * references to it. Also, as the value of total_pages has not 3214 * changed and the memsegs list is effectively the same when 3215 * accessed via the old or the new pointer, we do not have to 3216 * cause pageout_scanner() to re-evaluate its hand pointers. 3217 * 3218 * We currently do not re-use or reclaim the page_t memory. 3219 * If we do, then this may have to change. 3220 */ 3221 3222 mutex_enter(&memseg_lists_lock); 3223 seg->lnext = memseg_edit_junk; 3224 memseg_edit_junk = seg; 3225 mutex_exit(&memseg_lists_lock); 3226 3227 return (1); 3228 } 3229 3230 /* 3231 * The sfmmu hat layer (e.g.) 
accesses some parts of the memseg 3232 * structure using physical addresses. Therefore a kmem_cache is 3233 * used with KMC_NOHASH to avoid page crossings within a memseg 3234 * structure. KMC_NOHASH requires that no external (outside of 3235 * slab) information is allowed. This, in turn, implies that the 3236 * cache's slabsize must be exactly a single page, since per-slab 3237 * information (e.g. the freelist for the slab) is kept at the 3238 * end of the slab, where it is easy to locate. This should be 3239 * changed when a more explicit kmem_cache interface/flag 3240 * becomes available. 3241 */ 3242 void 3243 mem_config_init() 3244 { 3245 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3246 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3247 } 3248 3249 struct memseg * 3250 memseg_alloc() 3251 { 3252 struct memseg *seg; 3253 3254 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3255 bzero(seg, sizeof (struct memseg)); 3256 3257 return (seg); 3258 } 3259 3260 /* 3261 * Return whether the page_t memory for this memseg 3262 * is included in the memseg itself. 3263 */ 3264 static int 3265 memseg_includes_meta(struct memseg *seg) 3266 { 3267 return (seg->msegflags & MEMSEG_META_INCL); 3268 } 3269 3270 pfn_t 3271 memseg_get_start(struct memseg *seg) 3272 { 3273 pfn_t pt_start; 3274 3275 if (memseg_includes_meta(seg)) { 3276 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 3277 3278 /* Meta data is required to be at the beginning */ 3279 ASSERT(pt_start < seg->pages_base); 3280 } else 3281 pt_start = seg->pages_base; 3282 3283 return (pt_start); 3284 } 3285 3286 /* 3287 * Invalidate memseg pointers in cpu private vm data caches. 3288 */ 3289 static void 3290 memseg_cpu_vm_flush() 3291 { 3292 cpu_t *cp; 3293 vm_cpu_data_t *vc; 3294 3295 mutex_enter(&cpu_lock); 3296 pause_cpus(NULL); 3297 3298 cp = cpu_list; 3299 do { 3300 vc = cp->cpu_vm_data; 3301 vc->vc_pnum_memseg = NULL; 3302 vc->vc_pnext_memseg = NULL; 3303 3304 } while ((cp = cp->cpu_next) != cpu_list); 3305 3306 start_cpus(); 3307 mutex_exit(&cpu_lock); 3308 } 3309