1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * Copyright 2017 Joyent, Inc. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/cmn_err.h> 29 #include <sys/vmem.h> 30 #include <sys/kmem.h> 31 #include <sys/systm.h> 32 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 33 #include <sys/errno.h> 34 #include <sys/memnode.h> 35 #include <sys/memlist.h> 36 #include <sys/memlist_impl.h> 37 #include <sys/tuneable.h> 38 #include <sys/proc.h> 39 #include <sys/disp.h> 40 #include <sys/debug.h> 41 #include <sys/vm.h> 42 #include <sys/callb.h> 43 #include <sys/memlist_plat.h> /* for installed_top_size() */ 44 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 45 #include <sys/dumphdr.h> /* for dump_resize() */ 46 #include <sys/atomic.h> /* for use in stats collection */ 47 #include <sys/rwlock.h> 48 #include <sys/cpuvar.h> 49 #include <vm/seg_kmem.h> 50 #include <vm/seg_kpm.h> 51 #include <vm/page.h> 52 #include <vm/vm_dep.h> 53 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 54 #include <sys/sunddi.h> 55 #include <sys/mem_config.h> 56 #include <sys/mem_cage.h> 57 #include <sys/lgrp.h> 58 #include <sys/ddi.h> 59 #include <sys/modctl.h> 60 61 extern struct memlist *phys_avail; 62 63 extern uint_t page_ctrs_adjust(int); 64 void page_ctrs_cleanup(void); 65 static void kphysm_setup_post_add(pgcnt_t); 66 static int kphysm_setup_pre_del(pgcnt_t); 67 static void kphysm_setup_post_del(pgcnt_t, int); 68 69 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 70 71 static int delspan_reserve(pfn_t, pgcnt_t); 72 static void delspan_unreserve(pfn_t, pgcnt_t); 73 74 kmutex_t memseg_lists_lock; 75 struct memseg *memseg_va_avail; 76 struct memseg *memseg_alloc(void); 77 static struct memseg *memseg_delete_junk; 78 static struct memseg *memseg_edit_junk; 79 void memseg_remap_init(void); 80 static void memseg_remap_to_dummy(struct memseg *); 81 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 82 static struct memseg *memseg_reuse(pgcnt_t); 83 84 static struct kmem_cache *memseg_cache; 85 86 /* 87 * Interfaces to manage externally allocated 88 * page_t memory (metadata) for a memseg. 
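 * These are declared as weak symbols below so that a platform may
 * optionally provide implementations; the code here uses them only
 * when the metadata is being managed externally (see meta_alloc_enable
 * and the MEMSEG_META_ALLOC flag).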
89 */ 90 #pragma weak memseg_alloc_meta 91 #pragma weak memseg_free_meta 92 #pragma weak memseg_get_metapfn 93 #pragma weak memseg_remap_meta 94 95 extern int ppvm_enable; 96 extern page_t *ppvm_base; 97 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *); 98 extern void memseg_free_meta(void *, pgcnt_t); 99 extern pfn_t memseg_get_metapfn(void *, pgcnt_t); 100 extern void memseg_remap_meta(struct memseg *); 101 static int memseg_is_dynamic(struct memseg *); 102 static int memseg_includes_meta(struct memseg *); 103 pfn_t memseg_get_start(struct memseg *); 104 static void memseg_cpu_vm_flush(void); 105 106 int meta_alloc_enable; 107 108 #ifdef DEBUG 109 static int memseg_debug; 110 #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args) 111 #else 112 #define MEMSEG_DEBUG(...) 113 #endif 114 115 /* 116 * Add a chunk of memory to the system. 117 * base: starting PAGESIZE page of new memory. 118 * npgs: length in PAGESIZE pages. 119 * 120 * Adding mem this way doesn't increase the size of the hash tables; 121 * growing them would be too hard. This should be OK, but adding memory 122 * dynamically most likely means more hash misses, since the tables will 123 * be smaller than they otherwise would be. 124 */ 125 int 126 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 127 { 128 page_t *pp; 129 page_t *opp, *oepp, *segpp; 130 struct memseg *seg; 131 uint64_t avmem; 132 pfn_t pfn; 133 pfn_t pt_base = base; 134 pgcnt_t tpgs = npgs; 135 pgcnt_t metapgs = 0; 136 int exhausted; 137 pfn_t pnum; 138 int mnode; 139 caddr_t vaddr; 140 int reuse; 141 int mlret; 142 int rv; 143 int flags; 144 int meta_alloc = 0; 145 void *mapva; 146 void *metabase = (void *)base; 147 pgcnt_t nkpmpgs = 0; 148 offset_t kpm_pages_off = 0; 149 150 cmn_err(CE_CONT, 151 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 152 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 153 154 /* 155 * Add this span in the delete list to prevent interactions. 156 */ 157 if (!delspan_reserve(base, npgs)) { 158 return (KPHYSM_ESPAN); 159 } 160 /* 161 * Check to see if any of the memory span has been added 162 * by trying an add to the installed memory list. This 163 * forms the interlocking process for add. 164 */ 165 166 memlist_write_lock(); 167 168 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 169 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 170 171 if (mlret == MEML_SPANOP_OK) 172 installed_top_size(phys_install, &physmax, &physinstalled); 173 174 memlist_write_unlock(); 175 176 if (mlret != MEML_SPANOP_OK) { 177 if (mlret == MEML_SPANOP_EALLOC) { 178 delspan_unreserve(pt_base, tpgs); 179 return (KPHYSM_ERESOURCE); 180 } else if (mlret == MEML_SPANOP_ESPAN) { 181 delspan_unreserve(pt_base, tpgs); 182 return (KPHYSM_ESPAN); 183 } else { 184 delspan_unreserve(pt_base, tpgs); 185 return (KPHYSM_ERESOURCE); 186 } 187 } 188 189 if (meta_alloc_enable) { 190 /* 191 * Allocate the page_t's from existing memory; 192 * if that fails, allocate from the incoming memory. 193 */ 194 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs); 195 if (rv == KPHYSM_OK) { 196 ASSERT(metapgs); 197 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 198 meta_alloc = 1; 199 goto mapalloc; 200 } 201 } 202 203 /* 204 * We store the page_t's for this new memory in the first 205 * few pages of the chunk. Here, we go and get'em ... 
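         * (In other words, metapgs is sized so that roughly sizeof (page_t)
         * bytes of metadata are reserved for every page that remains usable;
         * the ASSERT below double-checks that the reserved area really is
         * large enough to hold the page_t array.)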
206 */ 207 208 /* 209 * The expression after the '-' gives the number of pages 210 * that will fit in the new memory based on a requirement 211 * of (PAGESIZE + sizeof (page_t)) bytes per page. 212 */ 213 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 214 (PAGESIZE + sizeof (page_t))); 215 216 npgs -= metapgs; 217 base += metapgs; 218 219 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 220 221 exhausted = (metapgs == 0 || npgs == 0); 222 223 if (kpm_enable && !exhausted) { 224 pgcnt_t start, end, nkpmpgs_prelim; 225 size_t ptsz; 226 227 /* 228 * A viable kpm large page mapping must not overlap two 229 * dynamic memsegs. Therefore the total size is checked 230 * to be at least kpm_pgsz and also whether start and end 231 * points are at least kpm_pgsz aligned. 232 */ 233 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 234 pmodkpmp(base + npgs)) { 235 236 kphysm_addmem_error_undospan(pt_base, tpgs); 237 238 /* 239 * There is no specific error code for violating 240 * kpm granularity constraints. 241 */ 242 return (KPHYSM_ENOTVIABLE); 243 } 244 245 start = kpmptop(ptokpmp(base)); 246 end = kpmptop(ptokpmp(base + npgs)); 247 nkpmpgs_prelim = ptokpmp(end - start); 248 ptsz = npgs * sizeof (page_t); 249 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 250 exhausted = (tpgs <= metapgs); 251 if (!exhausted) { 252 npgs = tpgs - metapgs; 253 base = pt_base + metapgs; 254 255 /* final nkpmpgs */ 256 start = kpmptop(ptokpmp(base)); 257 nkpmpgs = ptokpmp(end - start); 258 kpm_pages_off = ptsz + 259 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 260 } 261 } 262 263 /* 264 * Is memory area supplied too small? 265 */ 266 if (exhausted) { 267 kphysm_addmem_error_undospan(pt_base, tpgs); 268 /* 269 * There is no specific error code for 'too small'. 270 */ 271 return (KPHYSM_ERESOURCE); 272 } 273 274 mapalloc: 275 /* 276 * We may re-use a previously allocated VA space for the page_ts 277 * eventually, but we need to initialize and lock the pages first. 278 */ 279 280 /* 281 * Get an address in the kernel address map, map 282 * the page_t pages and see if we can touch them. 283 */ 284 285 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 286 if (mapva == NULL) { 287 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 288 " Can't allocate VA for page_ts"); 289 290 if (meta_alloc) 291 memseg_free_meta(metabase, metapgs); 292 kphysm_addmem_error_undospan(pt_base, tpgs); 293 294 return (KPHYSM_ERESOURCE); 295 } 296 pp = mapva; 297 298 if (physmax < (pt_base + tpgs)) 299 physmax = (pt_base + tpgs); 300 301 /* 302 * In the remapping code we map one page at a time so we must do 303 * the same here to match mapping sizes. 
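         * (When the metadata was allocated externally, the backing pfns need
         * not be contiguous, which is why the loop below looks up each pfn
         * individually with memseg_get_metapfn() in the meta_alloc case.)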
         */
        pfn = pt_base;
        vaddr = (caddr_t)pp;
        for (pnum = 0; pnum < metapgs; pnum++) {
                if (meta_alloc)
                        pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
                hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
                    PROT_READ | PROT_WRITE,
                    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
                pfn++;
                vaddr += ptob(1);
        }

        if (ddi_peek32((dev_info_t *)NULL,
            (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

                cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
                    " Can't access pp array at 0x%p [phys 0x%lx]",
                    (void *)pp, pt_base);

                hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
                    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

                vmem_free(heap_arena, mapva, ptob(metapgs));
                if (meta_alloc)
                        memseg_free_meta(metabase, metapgs);
                kphysm_addmem_error_undospan(pt_base, tpgs);

                return (KPHYSM_EFAULT);
        }

        /*
         * Add this memory slice to its memory node translation.
         *
         * Note that right now, each node may have only one slice;
         * this may change with COD or in larger SSM systems with
         * nested latency groups, so we must not assume that the
         * node does not yet exist.
         *
         * Note that there may be multiple memory nodes associated with
         * a single lgrp node on x86 systems.
         */
        pnum = pt_base + tpgs - 1;
        mem_node_add_range(pt_base, pnum);

        /*
         * Allocate or resize page counters as necessary to accommodate
         * the increase in memory pages.
         */
        mnode = PFN_2_MEM_NODE(pnum);
        PAGE_CTRS_ADJUST(base, npgs, rv);
        if (rv) {

                mem_node_del_range(pt_base, pnum);

                /* cleanup the page counters */
                page_ctrs_cleanup();

                hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
                    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

                vmem_free(heap_arena, mapva, ptob(metapgs));
                if (meta_alloc)
                        memseg_free_meta(metabase, metapgs);
                kphysm_addmem_error_undospan(pt_base, tpgs);

                return (KPHYSM_ERESOURCE);
        }

        /*
         * Update the phys_avail memory list.
         * The phys_install list was done at the start.
         */

        memlist_write_lock();

        mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
            (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
        ASSERT(mlret == MEML_SPANOP_OK);

        memlist_write_unlock();

        /* See if we can find a memseg to re-use. */
        if (meta_alloc) {
                seg = memseg_reuse(0);
                reuse = 1;      /* force unmapping of temp mapva */
                flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
                /*
                 * There is a 1:1 fixed relationship between a pfn
                 * and a page_t VA.  The pfn is used as an index into
                 * the ppvm_base page_t table in order to calculate
                 * the page_t base address for a given pfn range.
                 */
                segpp = ppvm_base + base;
        } else {
                seg = memseg_reuse(metapgs);
                reuse = (seg != NULL);
                flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
                segpp = pp;
        }

        /*
         * Initialize the memseg structure representing this memory
         * and add it to the existing list of memsegs.  Do some basic
         * initialization and add the memory to the system.
         * In order to prevent lock deadlocks, the add_physmem()
         * code is repeated here, but split into several stages.
         *
         * If a memseg is reused, invalidate memseg pointers in
         * all cpu vm caches.  We need to do this since the check
         *      pp >= seg->pages && pp < seg->epages
         * used in various places is not atomic and so the first compare
         * can happen before reuse and the second compare after reuse.
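         * (That is, a thread could evaluate the first compare against the
         * old seg->pages, lose the race with the reuse, and then evaluate
         * the second compare against the new seg->epages, wrongly concluding
         * that the page belongs to this memseg.)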
         * The invalidation ensures that a memseg is not dereferenced while
         * its page/pfn pointers are changing.
         */
        if (seg == NULL) {
                seg = memseg_alloc();
                ASSERT(seg != NULL);
                seg->msegflags = flags;
                MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
                    (void *)seg, (void *)(seg->pages));
                seg->pages = segpp;
        } else {
                ASSERT(seg->msegflags == flags);
                ASSERT(seg->pages_base == seg->pages_end);
                MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
                    (void *)seg, (void *)(seg->pages));
                if (meta_alloc) {
                        memseg_cpu_vm_flush();
                        seg->pages = segpp;
                }
        }

        seg->epages = seg->pages + npgs;
        seg->pages_base = base;
        seg->pages_end = base + npgs;

        /*
         * Initialize metadata.  The page_ts are set to locked state
         * ready to be freed.
         */
        bzero((caddr_t)pp, ptob(metapgs));

        pfn = seg->pages_base;
        /* Save the original pp base in case we reuse a memseg. */
        opp = pp;
        oepp = opp + npgs;
        for (pp = opp; pp < oepp; pp++) {
                pp->p_pagenum = pfn;
                pfn++;
                page_iolock_init(pp);
                while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
                        continue;
                pp->p_offset = (u_offset_t)-1;
        }

        if (reuse) {
                /* Remap our page_ts to the re-used memseg VA space. */
                pfn = pt_base;
                vaddr = (caddr_t)seg->pages;
                for (pnum = 0; pnum < metapgs; pnum++) {
                        if (meta_alloc)
                                pfn = memseg_get_metapfn(metabase,
                                    (pgcnt_t)pnum);
                        hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
                            PROT_READ | PROT_WRITE,
                            HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
                        pfn++;
                        vaddr += ptob(1);
                }

                hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
                    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

                vmem_free(heap_arena, mapva, ptob(metapgs));
        }

        hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

        memsegs_lock(1);

        /*
         * The new memseg is inserted at the beginning of the list.
         * Not only does this save searching for the tail, but in the
         * case of a re-used memseg, it solves the problem of what
         * happens if some process has still got a pointer to the
         * memseg and follows the next pointer to continue traversing
         * the memsegs list.
         */

        hat_kpm_addmem_mseg_insert(seg);

        seg->next = memsegs;
        membar_producer();

        hat_kpm_addmem_memsegs_update(seg);

        memsegs = seg;

        build_pfn_hash();

        total_pages += npgs;

        /*
         * Recalculate the paging parameters now that total_pages has
         * changed.  This will also cause the clock hands to be reset
         * before next use.
         */
        setupclock();

        memsegs_unlock(1);

        PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

        /*
         * Free the pages outside the lock to avoid locking loops.
         */
        for (pp = seg->pages; pp < seg->epages; pp++) {
                page_free(pp, 1);
        }

        /*
         * Now that we've updated the appropriate memory lists we
         * need to reset a number of globals, since we've increased memory.
         * Several have already been updated for us as noted above.  The
         * globals we're interested in at this point are:
         *      physmax - highest page frame number.
         *      physinstalled - number of pages currently installed
         *          (done earlier)
         *      maxmem - max free pages in the system
         *      physmem - physical memory pages available
         *      availrmem - real memory available
         */

        mutex_enter(&freemem_lock);
        maxmem += npgs;
        physmem += npgs;
        availrmem += npgs;
        availrmem_initial += npgs;

        mutex_exit(&freemem_lock);

        dump_resize();

        page_freelist_coalesce_all(mnode);

        kphysm_setup_post_add(npgs);

        cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
            "(0x%" PRIx64 ")\n",
            physinstalled << (PAGESHIFT - 10),
            (uint64_t)physinstalled << PAGESHIFT);

        avmem = (uint64_t)freemem << PAGESHIFT;
        cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
            "avail mem = %" PRId64 "\n", avmem);

        /*
         * Update lgroup generation number on single lgroup systems
         */
        if (nlgrps == 1)
                lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

        /*
         * Inform DDI of update
         */
        ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
            (uint64_t)(tpgs) << PAGESHIFT);

        delspan_unreserve(pt_base, tpgs);

        return (KPHYSM_OK);     /* Successfully added system memory */
}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
        int mlret;

        /* Unreserve memory span. */
        memlist_write_lock();

        mlret = memlist_delete_span(
            (uint64_t)(pt_base) << PAGESHIFT,
            (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

        ASSERT(mlret == MEML_SPANOP_OK);
        phys_install_has_changed();
        installed_top_size(phys_install, &physmax, &physinstalled);

        memlist_write_unlock();
        delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size
 * if size is required.
 * When the metadata area has its own virtual address space
 * we will need to manage this more carefully and do best-fit
 * allocations, possibly splitting an available area.
 */
struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
        int type;
        struct memseg **segpp, *seg;

        mutex_enter(&memseg_lists_lock);

        segpp = &memseg_va_avail;
        for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
                caddr_t end;

                /*
                 * Make sure we are reusing the right segment type.
                 */
                type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;

                if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
                    != type)
                        continue;

                if (kpm_enable)
                        end = hat_kpm_mseg_reuse(seg);
                else
                        end = (caddr_t)seg->epages;

                /*
                 * Check for the right size if it is provided.
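                 * (A metapgs of zero means the caller keeps its metadata
                 * externally (MEMSEG_META_ALLOC), so any segment of that
                 * type will do and only the type check above applies.)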
636 */ 637 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) { 638 *segpp = seg->lnext; 639 seg->lnext = NULL; 640 break; 641 } 642 } 643 mutex_exit(&memseg_lists_lock); 644 645 return (seg); 646 } 647 648 static uint_t handle_gen; 649 650 struct memdelspan { 651 struct memdelspan *mds_next; 652 pfn_t mds_base; 653 pgcnt_t mds_npgs; 654 uint_t *mds_bitmap; 655 uint_t *mds_bitmap_retired; 656 }; 657 658 #define NBPBMW (sizeof (uint_t) * NBBY) 659 #define MDS_BITMAPBYTES(MDSP) \ 660 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 661 662 struct transit_list { 663 struct transit_list *trl_next; 664 struct memdelspan *trl_spans; 665 int trl_collect; 666 }; 667 668 struct transit_list_head { 669 kmutex_t trh_lock; 670 struct transit_list *trh_head; 671 }; 672 673 static struct transit_list_head transit_list_head; 674 675 struct mem_handle; 676 static void transit_list_collect(struct mem_handle *, int); 677 static void transit_list_insert(struct transit_list *); 678 static void transit_list_remove(struct transit_list *); 679 680 #ifdef DEBUG 681 #define MEM_DEL_STATS 682 #endif /* DEBUG */ 683 684 #ifdef MEM_DEL_STATS 685 static int mem_del_stat_print = 0; 686 struct mem_del_stat { 687 uint_t nloop; 688 uint_t need_free; 689 uint_t free_loop; 690 uint_t free_low; 691 uint_t free_failed; 692 uint_t ncheck; 693 uint_t nopaget; 694 uint_t lockfail; 695 uint_t nfree; 696 uint_t nreloc; 697 uint_t nrelocfail; 698 uint_t already_done; 699 uint_t first_notfree; 700 uint_t npplocked; 701 uint_t nlockreloc; 702 uint_t nnorepl; 703 uint_t nmodreloc; 704 uint_t ndestroy; 705 uint_t nputpage; 706 uint_t nnoreclaim; 707 uint_t ndelay; 708 uint_t demotefail; 709 uint64_t nticks_total; 710 uint64_t nticks_pgrp; 711 uint_t retired; 712 uint_t toxic; 713 uint_t failing; 714 uint_t modtoxic; 715 uint_t npplkdtoxic; 716 uint_t gptlmodfail; 717 uint_t gptllckfail; 718 }; 719 /* 720 * The stat values are only incremented in the delete thread 721 * so no locking or atomic required. 722 */ 723 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 724 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 725 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 726 static void mem_del_stat_print_func(struct mem_handle *); 727 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 728 #else /* MEM_DEL_STATS */ 729 #define MDSTAT_INCR(MHP, FLD) 730 #define MDSTAT_TOTAL(MHP, ntck) 731 #define MDSTAT_PGRP(MHP, ntck) 732 #define MDSTAT_PRINT(MHP) 733 #endif /* MEM_DEL_STATS */ 734 735 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 736 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 737 738 /* 739 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 740 * The mutex may not be required for other fields, dependent on mh_state. 
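 * The usual life cycle of a handle (pieced together from the state
 * checks in the routines below, so treat it as a rough sketch) is:
 *      MHND_FREE -> MHND_INIT                  (kphysm_del_gethandle)
 *      MHND_INIT -> MHND_STARTING -> MHND_RUNNING
 *                                              (kphysm_del_start/delete thread)
 *      MHND_RUNNING -> MHND_DONE               (delete_memory_thread completes)
 *      MHND_INIT or MHND_DONE -> MHND_RELEASE -> MHND_FREE
 *                                              (kphysm_del_release)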
 */
struct mem_handle {
        kmutex_t mh_mutex;
        struct mem_handle *mh_next;
        memhandle_t mh_exthandle;
        mhnd_state_t mh_state;
        struct transit_list mh_transit;
        pgcnt_t mh_phys_pages;
        pgcnt_t mh_vm_pages;
        pgcnt_t mh_hold_todo;
        void (*mh_delete_complete)(void *, int error);
        void *mh_delete_complete_arg;
        volatile uint_t mh_cancel;
        volatile uint_t mh_dr_aio_cleanup_cancel;
        volatile uint_t mh_aio_cleanup_done;
        kcondvar_t mh_cv;
        kthread_id_t mh_thread_id;
        page_t *mh_deleted;     /* link through p_next */
#ifdef MEM_DEL_STATS
        struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle()
{
        struct mem_handle *mhp;

        mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
        mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
        mutex_enter(&mem_handle_list_mutex);
        mutex_enter(&mhp->mh_mutex);
        /* handle_gen is protected by list mutex. */
        mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
        mhp->mh_next = mem_handle_head;
        mem_handle_head = mhp;
        mutex_exit(&mem_handle_list_mutex);

        return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
        struct mem_handle **mhpp;

        ASSERT(mutex_owned(&mhp->mh_mutex));
        ASSERT(mhp->mh_state == MHND_FREE);
        /*
         * Exit the mutex to preserve locking order.  This is OK
         * here as once in the FREE state, the handle cannot
         * be found by a lookup.
         */
        mutex_exit(&mhp->mh_mutex);

        mutex_enter(&mem_handle_list_mutex);
        mhpp = &mem_handle_head;
        while (*mhpp != NULL && *mhpp != mhp)
                mhpp = &(*mhpp)->mh_next;
        ASSERT(*mhpp == mhp);
        /*
         * No need to lock the handle (mh_mutex) as only
         * mh_next is changing and this is the only thread that
         * can be referencing mhp.
         */
        *mhpp = mhp->mh_next;
        mutex_exit(&mem_handle_list_mutex);

        mutex_destroy(&mhp->mh_mutex);
        kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
        struct mem_handle *mhp;

        mutex_enter(&mem_handle_list_mutex);
        for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
                if (mhp->mh_exthandle == handle) {
                        mutex_enter(&mhp->mh_mutex);
                        /*
                         * The state of the handle could have been changed
                         * by kphysm_del_release() while waiting for mh_mutex.
                         */
                        if (mhp->mh_state == MHND_FREE) {
                                mutex_exit(&mhp->mh_mutex);
                                continue;
                        }
                        break;
                }
        }
        mutex_exit(&mem_handle_list_mutex);
        return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
        struct mem_handle *mhp;

        mhp = kphysm_allocate_mem_handle();
        /*
         * The handle is allocated using KM_SLEEP, so cannot fail.
         * If the implementation is changed, the correct error to return
         * here would be KPHYSM_ENOHANDLES.
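         * A typical caller sequence, for illustration only (error handling
         * omitted; see mem_config.h for the exact kphysm_del_start()
         * signature):
         *      (void) kphysm_del_gethandle(&h);
         *      (void) kphysm_del_span(h, base, npgs);
         *      (void) kphysm_del_start(h, complete_cb, complete_arg);
         *      ... wait for the completion callback ...
         *      (void) kphysm_del_release(h);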
854 */ 855 ASSERT(mhp->mh_state == MHND_FREE); 856 mhp->mh_state = MHND_INIT; 857 *xmhp = mhp->mh_exthandle; 858 mutex_exit(&mhp->mh_mutex); 859 return (KPHYSM_OK); 860 } 861 862 static int 863 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 864 { 865 pfn_t e1, e2; 866 867 e1 = b1 + l1; 868 e2 = b2 + l2; 869 870 return (!(b2 >= e1 || b1 >= e2)); 871 } 872 873 static int can_remove_pgs(pgcnt_t); 874 875 static struct memdelspan * 876 span_to_install(pfn_t base, pgcnt_t npgs) 877 { 878 struct memdelspan *mdsp; 879 struct memdelspan *mdsp_new; 880 uint64_t address, size, thislen; 881 struct memlist *mlp; 882 883 mdsp_new = NULL; 884 885 address = (uint64_t)base << PAGESHIFT; 886 size = (uint64_t)npgs << PAGESHIFT; 887 while (size != 0) { 888 memlist_read_lock(); 889 for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) { 890 if (address >= (mlp->ml_address + mlp->ml_size)) 891 continue; 892 if ((address + size) > mlp->ml_address) 893 break; 894 } 895 if (mlp == NULL) { 896 address += size; 897 size = 0; 898 thislen = 0; 899 } else { 900 if (address < mlp->ml_address) { 901 size -= (mlp->ml_address - address); 902 address = mlp->ml_address; 903 } 904 ASSERT(address >= mlp->ml_address); 905 if ((address + size) > 906 (mlp->ml_address + mlp->ml_size)) { 907 thislen = 908 mlp->ml_size - (address - mlp->ml_address); 909 } else { 910 thislen = size; 911 } 912 } 913 memlist_read_unlock(); 914 /* TODO: phys_install could change now */ 915 if (thislen == 0) 916 continue; 917 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 918 mdsp->mds_base = btop(address); 919 mdsp->mds_npgs = btop(thislen); 920 mdsp->mds_next = mdsp_new; 921 mdsp_new = mdsp; 922 address += thislen; 923 size -= thislen; 924 } 925 return (mdsp_new); 926 } 927 928 static void 929 free_delspans(struct memdelspan *mdsp) 930 { 931 struct memdelspan *amdsp; 932 933 while ((amdsp = mdsp) != NULL) { 934 mdsp = amdsp->mds_next; 935 kmem_free(amdsp, sizeof (struct memdelspan)); 936 } 937 } 938 939 /* 940 * Concatenate lists. No list ordering is required. 941 */ 942 943 static void 944 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 945 { 946 while (*mdspp != NULL) 947 mdspp = &(*mdspp)->mds_next; 948 949 *mdspp = mdsp; 950 } 951 952 /* 953 * Given a new list of delspans, check there is no overlap with 954 * all existing span activity (add or delete) and then concatenate 955 * the new spans to the given list. 956 * Return 1 for OK, 0 if overlapping. 
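 * Both adds (via the reserve_transit list) and deletes (via each
 * handle's mh_transit list) funnel through here, which is what keeps
 * a delete from starting on a span that is still being added, and
 * vice versa.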
957 */ 958 static int 959 delspan_insert( 960 struct transit_list *my_tlp, 961 struct memdelspan *mdsp_new) 962 { 963 struct transit_list_head *trh; 964 struct transit_list *tlp; 965 int ret; 966 967 trh = &transit_list_head; 968 969 ASSERT(my_tlp != NULL); 970 ASSERT(mdsp_new != NULL); 971 972 ret = 1; 973 mutex_enter(&trh->trh_lock); 974 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 975 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 976 struct memdelspan *mdsp; 977 978 for (mdsp = tlp->trl_spans; mdsp != NULL; 979 mdsp = mdsp->mds_next) { 980 struct memdelspan *nmdsp; 981 982 for (nmdsp = mdsp_new; nmdsp != NULL; 983 nmdsp = nmdsp->mds_next) { 984 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 985 nmdsp->mds_base, nmdsp->mds_npgs)) { 986 ret = 0; 987 goto done; 988 } 989 } 990 } 991 } 992 done: 993 if (ret != 0) { 994 if (my_tlp->trl_spans == NULL) 995 transit_list_insert(my_tlp); 996 delspan_concat(&my_tlp->trl_spans, mdsp_new); 997 } 998 mutex_exit(&trh->trh_lock); 999 return (ret); 1000 } 1001 1002 static void 1003 delspan_remove( 1004 struct transit_list *my_tlp, 1005 pfn_t base, 1006 pgcnt_t npgs) 1007 { 1008 struct transit_list_head *trh; 1009 struct memdelspan *mdsp; 1010 1011 trh = &transit_list_head; 1012 1013 ASSERT(my_tlp != NULL); 1014 1015 mutex_enter(&trh->trh_lock); 1016 if ((mdsp = my_tlp->trl_spans) != NULL) { 1017 if (npgs == 0) { 1018 my_tlp->trl_spans = NULL; 1019 free_delspans(mdsp); 1020 transit_list_remove(my_tlp); 1021 } else { 1022 struct memdelspan **prv; 1023 1024 prv = &my_tlp->trl_spans; 1025 while (mdsp != NULL) { 1026 pfn_t p_end; 1027 1028 p_end = mdsp->mds_base + mdsp->mds_npgs; 1029 if (mdsp->mds_base >= base && 1030 p_end <= (base + npgs)) { 1031 *prv = mdsp->mds_next; 1032 mdsp->mds_next = NULL; 1033 free_delspans(mdsp); 1034 } else { 1035 prv = &mdsp->mds_next; 1036 } 1037 mdsp = *prv; 1038 } 1039 if (my_tlp->trl_spans == NULL) 1040 transit_list_remove(my_tlp); 1041 } 1042 } 1043 mutex_exit(&trh->trh_lock); 1044 } 1045 1046 /* 1047 * Reserve interface for add to stop delete before add finished. 1048 * This list is only accessed through the delspan_insert/remove 1049 * functions and so is fully protected by the mutex in struct transit_list. 1050 */ 1051 1052 static struct transit_list reserve_transit; 1053 1054 static int 1055 delspan_reserve(pfn_t base, pgcnt_t npgs) 1056 { 1057 struct memdelspan *mdsp; 1058 int ret; 1059 1060 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 1061 mdsp->mds_base = base; 1062 mdsp->mds_npgs = npgs; 1063 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 1064 free_delspans(mdsp); 1065 } 1066 return (ret); 1067 } 1068 1069 static void 1070 delspan_unreserve(pfn_t base, pgcnt_t npgs) 1071 { 1072 delspan_remove(&reserve_transit, base, npgs); 1073 } 1074 1075 /* 1076 * Return whether memseg was created by kphysm_add_memory_dynamic(). 
1077 */ 1078 static int 1079 memseg_is_dynamic(struct memseg *seg) 1080 { 1081 return (seg->msegflags & MEMSEG_DYNAMIC); 1082 } 1083 1084 int 1085 kphysm_del_span( 1086 memhandle_t handle, 1087 pfn_t base, 1088 pgcnt_t npgs) 1089 { 1090 struct mem_handle *mhp; 1091 struct memseg *seg; 1092 struct memdelspan *mdsp; 1093 struct memdelspan *mdsp_new; 1094 pgcnt_t phys_pages, vm_pages; 1095 pfn_t p_end; 1096 page_t *pp; 1097 int ret; 1098 1099 mhp = kphysm_lookup_mem_handle(handle); 1100 if (mhp == NULL) { 1101 return (KPHYSM_EHANDLE); 1102 } 1103 if (mhp->mh_state != MHND_INIT) { 1104 mutex_exit(&mhp->mh_mutex); 1105 return (KPHYSM_ESEQUENCE); 1106 } 1107 1108 /* 1109 * Intersect the span with the installed memory list (phys_install). 1110 */ 1111 mdsp_new = span_to_install(base, npgs); 1112 if (mdsp_new == NULL) { 1113 /* 1114 * No physical memory in this range. Is this an 1115 * error? If an attempt to start the delete is made 1116 * for OK returns from del_span such as this, start will 1117 * return an error. 1118 * Could return KPHYSM_ENOWORK. 1119 */ 1120 /* 1121 * It is assumed that there are no error returns 1122 * from span_to_install() due to kmem_alloc failure. 1123 */ 1124 mutex_exit(&mhp->mh_mutex); 1125 return (KPHYSM_OK); 1126 } 1127 /* 1128 * Does this span overlap an existing span? 1129 */ 1130 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1131 /* 1132 * Differentiate between already on list for this handle 1133 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1134 */ 1135 ret = KPHYSM_EBUSY; 1136 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1137 mdsp = mdsp->mds_next) { 1138 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1139 base, npgs)) { 1140 ret = KPHYSM_EDUP; 1141 break; 1142 } 1143 } 1144 mutex_exit(&mhp->mh_mutex); 1145 free_delspans(mdsp_new); 1146 return (ret); 1147 } 1148 /* 1149 * At this point the spans in mdsp_new have been inserted into the 1150 * list of spans for this handle and thereby to the global list of 1151 * spans being processed. Each of these spans must now be checked 1152 * for relocatability. As a side-effect segments in the memseg list 1153 * may be split. 1154 * 1155 * Note that mdsp_new can no longer be used as it is now part of 1156 * a larger list. Select elements of this larger list based 1157 * on base and npgs. 1158 */ 1159 restart: 1160 phys_pages = 0; 1161 vm_pages = 0; 1162 ret = KPHYSM_OK; 1163 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1164 mdsp = mdsp->mds_next) { 1165 pgcnt_t pages_checked; 1166 1167 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1168 continue; 1169 } 1170 p_end = mdsp->mds_base + mdsp->mds_npgs; 1171 /* 1172 * The pages_checked count is a hack. All pages should be 1173 * checked for relocatability. Those not covered by memsegs 1174 * should be tested with arch_kphysm_del_span_ok(). 1175 */ 1176 pages_checked = 0; 1177 for (seg = memsegs; seg; seg = seg->next) { 1178 pfn_t mseg_start; 1179 1180 if (seg->pages_base >= p_end || 1181 seg->pages_end <= mdsp->mds_base) { 1182 /* Span and memseg don't overlap. */ 1183 continue; 1184 } 1185 mseg_start = memseg_get_start(seg); 1186 /* Check that segment is suitable for delete. */ 1187 if (memseg_includes_meta(seg)) { 1188 /* 1189 * Check that this segment is completely 1190 * within the span. 
1191 */ 1192 if (mseg_start < mdsp->mds_base || 1193 seg->pages_end > p_end) { 1194 ret = KPHYSM_EBUSY; 1195 break; 1196 } 1197 pages_checked += seg->pages_end - mseg_start; 1198 } else { 1199 /* 1200 * If this segment is larger than the span, 1201 * try to split it. After the split, it 1202 * is necessary to restart. 1203 */ 1204 if (seg->pages_base < mdsp->mds_base || 1205 seg->pages_end > p_end) { 1206 pfn_t abase; 1207 pgcnt_t anpgs; 1208 int s_ret; 1209 1210 /* Split required. */ 1211 if (mdsp->mds_base < seg->pages_base) 1212 abase = seg->pages_base; 1213 else 1214 abase = mdsp->mds_base; 1215 if (p_end > seg->pages_end) 1216 anpgs = seg->pages_end - abase; 1217 else 1218 anpgs = p_end - abase; 1219 s_ret = kphysm_split_memseg(abase, 1220 anpgs); 1221 if (s_ret == 0) { 1222 /* Split failed. */ 1223 ret = KPHYSM_ERESOURCE; 1224 break; 1225 } 1226 goto restart; 1227 } 1228 pages_checked += 1229 seg->pages_end - seg->pages_base; 1230 } 1231 /* 1232 * The memseg is wholly within the delete span. 1233 * The individual pages can now be checked. 1234 */ 1235 /* Cage test. */ 1236 for (pp = seg->pages; pp < seg->epages; pp++) { 1237 if (PP_ISNORELOC(pp)) { 1238 ret = KPHYSM_ENONRELOC; 1239 break; 1240 } 1241 } 1242 if (ret != KPHYSM_OK) { 1243 break; 1244 } 1245 phys_pages += (seg->pages_end - mseg_start); 1246 vm_pages += MSEG_NPAGES(seg); 1247 } 1248 if (ret != KPHYSM_OK) 1249 break; 1250 if (pages_checked != mdsp->mds_npgs) { 1251 ret = KPHYSM_ENONRELOC; 1252 break; 1253 } 1254 } 1255 1256 if (ret == KPHYSM_OK) { 1257 mhp->mh_phys_pages += phys_pages; 1258 mhp->mh_vm_pages += vm_pages; 1259 } else { 1260 /* 1261 * Keep holding the mh_mutex to prevent it going away. 1262 */ 1263 delspan_remove(&mhp->mh_transit, base, npgs); 1264 } 1265 mutex_exit(&mhp->mh_mutex); 1266 return (ret); 1267 } 1268 1269 int 1270 kphysm_del_span_query( 1271 pfn_t base, 1272 pgcnt_t npgs, 1273 memquery_t *mqp) 1274 { 1275 struct memdelspan *mdsp; 1276 struct memdelspan *mdsp_new; 1277 int done_first_nonreloc; 1278 1279 mqp->phys_pages = 0; 1280 mqp->managed = 0; 1281 mqp->nonrelocatable = 0; 1282 mqp->first_nonrelocatable = 0; 1283 mqp->last_nonrelocatable = 0; 1284 1285 mdsp_new = span_to_install(base, npgs); 1286 /* 1287 * It is OK to proceed here if mdsp_new == NULL. 1288 */ 1289 done_first_nonreloc = 0; 1290 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1291 pfn_t sbase; 1292 pgcnt_t snpgs; 1293 1294 mqp->phys_pages += mdsp->mds_npgs; 1295 sbase = mdsp->mds_base; 1296 snpgs = mdsp->mds_npgs; 1297 while (snpgs != 0) { 1298 struct memseg *lseg, *seg; 1299 pfn_t p_end; 1300 page_t *pp; 1301 pfn_t mseg_start; 1302 1303 p_end = sbase + snpgs; 1304 /* 1305 * Find the lowest addressed memseg that starts 1306 * after sbase and account for it. 1307 * This is to catch dynamic memsegs whose start 1308 * is hidden. 1309 */ 1310 seg = NULL; 1311 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1312 if ((lseg->pages_base >= sbase) || 1313 (lseg->pages_base < p_end && 1314 lseg->pages_end > sbase)) { 1315 if (seg == NULL || 1316 seg->pages_base > lseg->pages_base) 1317 seg = lseg; 1318 } 1319 } 1320 if (seg != NULL) { 1321 mseg_start = memseg_get_start(seg); 1322 /* 1323 * Now have the full extent of the memseg so 1324 * do the range check. 1325 */ 1326 if (mseg_start >= p_end || 1327 seg->pages_end <= sbase) { 1328 /* Span does not overlap memseg. */ 1329 seg = NULL; 1330 } 1331 } 1332 /* 1333 * Account for gap either before the segment if 1334 * there is one or to the end of the span. 
1335 */ 1336 if (seg == NULL || mseg_start > sbase) { 1337 pfn_t a_end; 1338 1339 a_end = (seg == NULL) ? p_end : mseg_start; 1340 /* 1341 * Check with arch layer for relocatability. 1342 */ 1343 if (arch_kphysm_del_span_ok(sbase, 1344 (a_end - sbase))) { 1345 /* 1346 * No non-relocatble pages in this 1347 * area, avoid the fine-grained 1348 * test. 1349 */ 1350 snpgs -= (a_end - sbase); 1351 sbase = a_end; 1352 } 1353 while (sbase < a_end) { 1354 if (!arch_kphysm_del_span_ok(sbase, 1355 1)) { 1356 mqp->nonrelocatable++; 1357 if (!done_first_nonreloc) { 1358 mqp-> 1359 first_nonrelocatable 1360 = sbase; 1361 done_first_nonreloc = 1; 1362 } 1363 mqp->last_nonrelocatable = 1364 sbase; 1365 } 1366 sbase++; 1367 snpgs--; 1368 } 1369 } 1370 if (seg != NULL) { 1371 ASSERT(mseg_start <= sbase); 1372 if (seg->pages_base != mseg_start && 1373 seg->pages_base > sbase) { 1374 pgcnt_t skip_pgs; 1375 1376 /* 1377 * Skip the page_t area of a 1378 * dynamic memseg. 1379 */ 1380 skip_pgs = seg->pages_base - sbase; 1381 if (snpgs <= skip_pgs) { 1382 sbase += snpgs; 1383 snpgs = 0; 1384 continue; 1385 } 1386 snpgs -= skip_pgs; 1387 sbase += skip_pgs; 1388 } 1389 ASSERT(snpgs != 0); 1390 ASSERT(seg->pages_base <= sbase); 1391 /* 1392 * The individual pages can now be checked. 1393 */ 1394 for (pp = seg->pages + 1395 (sbase - seg->pages_base); 1396 snpgs != 0 && pp < seg->epages; pp++) { 1397 mqp->managed++; 1398 if (PP_ISNORELOC(pp)) { 1399 mqp->nonrelocatable++; 1400 if (!done_first_nonreloc) { 1401 mqp-> 1402 first_nonrelocatable 1403 = sbase; 1404 done_first_nonreloc = 1; 1405 } 1406 mqp->last_nonrelocatable = 1407 sbase; 1408 } 1409 sbase++; 1410 snpgs--; 1411 } 1412 } 1413 } 1414 } 1415 1416 free_delspans(mdsp_new); 1417 1418 return (KPHYSM_OK); 1419 } 1420 1421 /* 1422 * This release function can be called at any stage as follows: 1423 * _gethandle only called 1424 * _span(s) only called 1425 * _start called but failed 1426 * delete thread exited 1427 */ 1428 int 1429 kphysm_del_release(memhandle_t handle) 1430 { 1431 struct mem_handle *mhp; 1432 1433 mhp = kphysm_lookup_mem_handle(handle); 1434 if (mhp == NULL) { 1435 return (KPHYSM_EHANDLE); 1436 } 1437 switch (mhp->mh_state) { 1438 case MHND_STARTING: 1439 case MHND_RUNNING: 1440 mutex_exit(&mhp->mh_mutex); 1441 return (KPHYSM_ENOTFINISHED); 1442 case MHND_FREE: 1443 ASSERT(mhp->mh_state != MHND_FREE); 1444 mutex_exit(&mhp->mh_mutex); 1445 return (KPHYSM_EHANDLE); 1446 case MHND_INIT: 1447 break; 1448 case MHND_DONE: 1449 break; 1450 case MHND_RELEASE: 1451 mutex_exit(&mhp->mh_mutex); 1452 return (KPHYSM_ESEQUENCE); 1453 default: 1454 #ifdef DEBUG 1455 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1456 (void *)mhp, mhp->mh_state); 1457 #endif /* DEBUG */ 1458 mutex_exit(&mhp->mh_mutex); 1459 return (KPHYSM_EHANDLE); 1460 } 1461 /* 1462 * Set state so that we can wait if necessary. 1463 * Also this means that we have read/write access to all 1464 * fields except mh_exthandle and mh_state. 1465 */ 1466 mhp->mh_state = MHND_RELEASE; 1467 /* 1468 * The mem_handle cannot be de-allocated by any other operation 1469 * now, so no need to hold mh_mutex. 
1470 */ 1471 mutex_exit(&mhp->mh_mutex); 1472 1473 delspan_remove(&mhp->mh_transit, 0, 0); 1474 mhp->mh_phys_pages = 0; 1475 mhp->mh_vm_pages = 0; 1476 mhp->mh_hold_todo = 0; 1477 mhp->mh_delete_complete = NULL; 1478 mhp->mh_delete_complete_arg = NULL; 1479 mhp->mh_cancel = 0; 1480 1481 mutex_enter(&mhp->mh_mutex); 1482 ASSERT(mhp->mh_state == MHND_RELEASE); 1483 mhp->mh_state = MHND_FREE; 1484 1485 kphysm_free_mem_handle(mhp); 1486 1487 return (KPHYSM_OK); 1488 } 1489 1490 /* 1491 * This cancel function can only be called with the thread running. 1492 */ 1493 int 1494 kphysm_del_cancel(memhandle_t handle) 1495 { 1496 struct mem_handle *mhp; 1497 1498 mhp = kphysm_lookup_mem_handle(handle); 1499 if (mhp == NULL) { 1500 return (KPHYSM_EHANDLE); 1501 } 1502 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1503 mutex_exit(&mhp->mh_mutex); 1504 return (KPHYSM_ENOTRUNNING); 1505 } 1506 /* 1507 * Set the cancel flag and wake the delete thread up. 1508 * The thread may be waiting on I/O, so the effect of the cancel 1509 * may be delayed. 1510 */ 1511 if (mhp->mh_cancel == 0) { 1512 mhp->mh_cancel = KPHYSM_ECANCELLED; 1513 cv_signal(&mhp->mh_cv); 1514 } 1515 mutex_exit(&mhp->mh_mutex); 1516 return (KPHYSM_OK); 1517 } 1518 1519 int 1520 kphysm_del_status( 1521 memhandle_t handle, 1522 memdelstat_t *mdstp) 1523 { 1524 struct mem_handle *mhp; 1525 1526 mhp = kphysm_lookup_mem_handle(handle); 1527 if (mhp == NULL) { 1528 return (KPHYSM_EHANDLE); 1529 } 1530 /* 1531 * Calling kphysm_del_status() is allowed before the delete 1532 * is started to allow for status display. 1533 */ 1534 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1535 mhp->mh_state != MHND_RUNNING) { 1536 mutex_exit(&mhp->mh_mutex); 1537 return (KPHYSM_ENOTRUNNING); 1538 } 1539 mdstp->phys_pages = mhp->mh_phys_pages; 1540 mdstp->managed = mhp->mh_vm_pages; 1541 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1542 mutex_exit(&mhp->mh_mutex); 1543 return (KPHYSM_OK); 1544 } 1545 1546 static int mem_delete_additional_pages = 100; 1547 1548 static int 1549 can_remove_pgs(pgcnt_t npgs) 1550 { 1551 /* 1552 * If all pageable pages were paged out, freemem would 1553 * equal availrmem. There is a minimum requirement for 1554 * availrmem. 1555 */ 1556 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1557 < npgs) 1558 return (0); 1559 /* TODO: check swap space, etc. */ 1560 return (1); 1561 } 1562 1563 static int 1564 get_availrmem(pgcnt_t npgs) 1565 { 1566 int ret; 1567 1568 mutex_enter(&freemem_lock); 1569 ret = can_remove_pgs(npgs); 1570 if (ret != 0) 1571 availrmem -= npgs; 1572 mutex_exit(&freemem_lock); 1573 return (ret); 1574 } 1575 1576 static void 1577 put_availrmem(pgcnt_t npgs) 1578 { 1579 mutex_enter(&freemem_lock); 1580 availrmem += npgs; 1581 mutex_exit(&freemem_lock); 1582 } 1583 1584 #define FREEMEM_INCR 100 1585 static pgcnt_t freemem_incr = FREEMEM_INCR; 1586 #define DEL_FREE_WAIT_FRAC 4 1587 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1588 1589 #define DEL_BUSY_WAIT_FRAC 20 1590 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1591 1592 static void kphysm_del_cleanup(struct mem_handle *); 1593 1594 static void page_delete_collect(page_t *, struct mem_handle *); 1595 1596 static pgcnt_t 1597 delthr_get_freemem(struct mem_handle *mhp) 1598 { 1599 pgcnt_t free_get; 1600 int ret; 1601 1602 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1603 1604 MDSTAT_INCR(mhp, need_free); 1605 /* 1606 * Get up to freemem_incr pages. 
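         * (freemem_incr defaults to FREEMEM_INCR, i.e. 100 pages, so memory
         * is pulled out of freemem in modest chunks rather than all at once.)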
1607 */ 1608 free_get = freemem_incr; 1609 if (free_get > mhp->mh_hold_todo) 1610 free_get = mhp->mh_hold_todo; 1611 /* 1612 * Take free_get pages away from freemem, 1613 * waiting if necessary. 1614 */ 1615 1616 while (!mhp->mh_cancel) { 1617 mutex_exit(&mhp->mh_mutex); 1618 MDSTAT_INCR(mhp, free_loop); 1619 /* 1620 * Duplicate test from page_create_throttle() 1621 * but don't override with !PG_WAIT. 1622 */ 1623 if (freemem < (free_get + throttlefree)) { 1624 MDSTAT_INCR(mhp, free_low); 1625 ret = 0; 1626 } else { 1627 ret = page_create_wait(free_get, 0); 1628 if (ret == 0) { 1629 /* EMPTY */ 1630 MDSTAT_INCR(mhp, free_failed); 1631 } 1632 } 1633 if (ret != 0) { 1634 mutex_enter(&mhp->mh_mutex); 1635 return (free_get); 1636 } 1637 1638 /* 1639 * Put pressure on pageout. 1640 */ 1641 page_needfree(free_get); 1642 WAKE_PAGEOUT_SCANNER(delthr); 1643 1644 mutex_enter(&mhp->mh_mutex); 1645 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, 1646 DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK); 1647 mutex_exit(&mhp->mh_mutex); 1648 page_needfree(-(spgcnt_t)free_get); 1649 1650 mutex_enter(&mhp->mh_mutex); 1651 } 1652 return (0); 1653 } 1654 1655 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1656 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1657 /* 1658 * This function is run as a helper thread for delete_memory_thread. 1659 * It is needed in order to force kaio cleanup, so that pages used in kaio 1660 * will be unlocked and subsequently relocated by delete_memory_thread. 1661 * The address of the delete_memory_threads's mem_handle is passed in to 1662 * this thread function, and is used to set the mh_aio_cleanup_done member 1663 * prior to calling thread_exit(). 1664 */ 1665 static void 1666 dr_aio_cleanup_thread(caddr_t amhp) 1667 { 1668 proc_t *procp; 1669 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1670 int cleaned; 1671 int n = 0; 1672 struct mem_handle *mhp; 1673 volatile uint_t *pcancel; 1674 1675 mhp = (struct mem_handle *)amhp; 1676 ASSERT(mhp != NULL); 1677 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1678 if (modload("sys", "kaio") == -1) { 1679 mhp->mh_aio_cleanup_done = 1; 1680 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1681 thread_exit(); 1682 } 1683 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1684 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1685 if (aio_cleanup_dr_delete_memory == NULL) { 1686 mhp->mh_aio_cleanup_done = 1; 1687 cmn_err(CE_WARN, 1688 "aio_cleanup_dr_delete_memory not found in kaio"); 1689 thread_exit(); 1690 } 1691 do { 1692 cleaned = 0; 1693 mutex_enter(&pidlock); 1694 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1695 procp = procp->p_next) { 1696 mutex_enter(&procp->p_lock); 1697 if (procp->p_aio != NULL) { 1698 /* cleanup proc's outstanding kaio */ 1699 cleaned += 1700 (*aio_cleanup_dr_delete_memory)(procp); 1701 } 1702 mutex_exit(&procp->p_lock); 1703 } 1704 mutex_exit(&pidlock); 1705 if ((*pcancel == 0) && 1706 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1707 /* delay a bit before retrying all procs again */ 1708 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1709 n = 0; 1710 } 1711 } while (*pcancel == 0); 1712 mhp->mh_aio_cleanup_done = 1; 1713 thread_exit(); 1714 } 1715 1716 static void 1717 delete_memory_thread(caddr_t amhp) 1718 { 1719 struct mem_handle *mhp; 1720 struct memdelspan *mdsp; 1721 callb_cpr_t cprinfo; 1722 page_t *pp_targ; 1723 spgcnt_t freemem_left; 1724 void (*del_complete_funcp)(void *, int error); 1725 void *del_complete_arg; 1726 int comp_code; 1727 int ret; 1728 int first_scan; 
1729 uint_t szc; 1730 #ifdef MEM_DEL_STATS 1731 uint64_t start_total, ntick_total; 1732 uint64_t start_pgrp, ntick_pgrp; 1733 #endif /* MEM_DEL_STATS */ 1734 1735 mhp = (struct mem_handle *)amhp; 1736 1737 #ifdef MEM_DEL_STATS 1738 start_total = ddi_get_lbolt(); 1739 #endif /* MEM_DEL_STATS */ 1740 1741 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1742 callb_generic_cpr, "memdel"); 1743 1744 mutex_enter(&mhp->mh_mutex); 1745 ASSERT(mhp->mh_state == MHND_STARTING); 1746 1747 mhp->mh_state = MHND_RUNNING; 1748 mhp->mh_thread_id = curthread; 1749 1750 mhp->mh_hold_todo = mhp->mh_vm_pages; 1751 mutex_exit(&mhp->mh_mutex); 1752 1753 /* Allocate the remap pages now, if necessary. */ 1754 memseg_remap_init(); 1755 1756 /* 1757 * Subtract from availrmem now if possible as availrmem 1758 * may not be available by the end of the delete. 1759 */ 1760 if (!get_availrmem(mhp->mh_vm_pages)) { 1761 comp_code = KPHYSM_ENOTVIABLE; 1762 mutex_enter(&mhp->mh_mutex); 1763 goto early_exit; 1764 } 1765 1766 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1767 1768 mutex_enter(&mhp->mh_mutex); 1769 1770 if (ret != 0) { 1771 mhp->mh_cancel = KPHYSM_EREFUSED; 1772 goto refused; 1773 } 1774 1775 transit_list_collect(mhp, 1); 1776 1777 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1778 mdsp = mdsp->mds_next) { 1779 ASSERT(mdsp->mds_bitmap == NULL); 1780 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1781 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1782 KM_SLEEP); 1783 } 1784 1785 first_scan = 1; 1786 freemem_left = 0; 1787 /* 1788 * Start dr_aio_cleanup_thread, which periodically iterates 1789 * through the process list and invokes aio cleanup. This 1790 * is needed in order to avoid a deadly embrace between the 1791 * delete_memory_thread (waiting on writer lock for page, with the 1792 * exclusive-wanted bit set), kaio read request threads (waiting for a 1793 * reader lock on the same page that is wanted by the 1794 * delete_memory_thread), and threads waiting for kaio completion 1795 * (blocked on spt_amp->lock). 1796 */ 1797 mhp->mh_dr_aio_cleanup_cancel = 0; 1798 mhp->mh_aio_cleanup_done = 0; 1799 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1800 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1801 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1802 pgcnt_t collected; 1803 1804 MDSTAT_INCR(mhp, nloop); 1805 collected = 0; 1806 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1807 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1808 pfn_t pfn, p_end; 1809 1810 p_end = mdsp->mds_base + mdsp->mds_npgs; 1811 for (pfn = mdsp->mds_base; (pfn < p_end) && 1812 (mhp->mh_cancel == 0); pfn++) { 1813 page_t *pp, *tpp, *tpp_targ; 1814 pgcnt_t bit; 1815 struct vnode *vp; 1816 u_offset_t offset; 1817 int mod, result; 1818 spgcnt_t pgcnt; 1819 1820 bit = pfn - mdsp->mds_base; 1821 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1822 (1 << (bit % NBPBMW))) != 0) { 1823 MDSTAT_INCR(mhp, already_done); 1824 continue; 1825 } 1826 if (freemem_left == 0) { 1827 freemem_left += delthr_get_freemem(mhp); 1828 if (freemem_left == 0) 1829 break; 1830 } 1831 1832 /* 1833 * Release mh_mutex - some of this 1834 * stuff takes some time (eg PUTPAGE). 1835 */ 1836 1837 mutex_exit(&mhp->mh_mutex); 1838 MDSTAT_INCR(mhp, ncheck); 1839 1840 pp = page_numtopp_nolock(pfn); 1841 if (pp == NULL) { 1842 /* 1843 * Not covered by a page_t - will 1844 * be dealt with elsewhere. 
1845 */ 1846 MDSTAT_INCR(mhp, nopaget); 1847 mutex_enter(&mhp->mh_mutex); 1848 mdsp->mds_bitmap[bit / NBPBMW] |= 1849 (1 << (bit % NBPBMW)); 1850 continue; 1851 } 1852 1853 if (!page_try_reclaim_lock(pp, SE_EXCL, 1854 SE_EXCL_WANTED | SE_RETIRED)) { 1855 /* 1856 * Page in use elsewhere. Skip it. 1857 */ 1858 MDSTAT_INCR(mhp, lockfail); 1859 mutex_enter(&mhp->mh_mutex); 1860 continue; 1861 } 1862 /* 1863 * See if the cage expanded into the delete. 1864 * This can happen as we have to allow the 1865 * cage to expand. 1866 */ 1867 if (PP_ISNORELOC(pp)) { 1868 page_unlock(pp); 1869 mutex_enter(&mhp->mh_mutex); 1870 mhp->mh_cancel = KPHYSM_ENONRELOC; 1871 break; 1872 } 1873 if (PP_RETIRED(pp)) { 1874 /* 1875 * Page has been retired and is 1876 * not part of the cage so we 1877 * can now do the accounting for 1878 * it. 1879 */ 1880 MDSTAT_INCR(mhp, retired); 1881 mutex_enter(&mhp->mh_mutex); 1882 mdsp->mds_bitmap[bit / NBPBMW] 1883 |= (1 << (bit % NBPBMW)); 1884 mdsp->mds_bitmap_retired[bit / 1885 NBPBMW] |= 1886 (1 << (bit % NBPBMW)); 1887 mhp->mh_hold_todo--; 1888 continue; 1889 } 1890 ASSERT(freemem_left != 0); 1891 if (PP_ISFREE(pp)) { 1892 /* 1893 * Like page_reclaim() only 'freemem' 1894 * processing is already done. 1895 */ 1896 MDSTAT_INCR(mhp, nfree); 1897 free_page_collect: 1898 if (PP_ISAGED(pp)) { 1899 page_list_sub(pp, 1900 PG_FREE_LIST); 1901 } else { 1902 page_list_sub(pp, 1903 PG_CACHE_LIST); 1904 } 1905 PP_CLRFREE(pp); 1906 PP_CLRAGED(pp); 1907 collected++; 1908 mutex_enter(&mhp->mh_mutex); 1909 page_delete_collect(pp, mhp); 1910 mdsp->mds_bitmap[bit / NBPBMW] |= 1911 (1 << (bit % NBPBMW)); 1912 freemem_left--; 1913 continue; 1914 } 1915 ASSERT(pp->p_vnode != NULL); 1916 if (first_scan) { 1917 MDSTAT_INCR(mhp, first_notfree); 1918 page_unlock(pp); 1919 mutex_enter(&mhp->mh_mutex); 1920 continue; 1921 } 1922 /* 1923 * Keep stats on pages encountered that 1924 * are marked for retirement. 1925 */ 1926 if (PP_TOXIC(pp)) { 1927 MDSTAT_INCR(mhp, toxic); 1928 } else if (PP_PR_REQ(pp)) { 1929 MDSTAT_INCR(mhp, failing); 1930 } 1931 /* 1932 * In certain cases below, special exceptions 1933 * are made for pages that are toxic. This 1934 * is because the current meaning of toxic 1935 * is that an uncorrectable error has been 1936 * previously associated with the page. 1937 */ 1938 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1939 if (!PP_TOXIC(pp)) { 1940 /* 1941 * Must relocate locked in 1942 * memory pages. 1943 */ 1944 #ifdef MEM_DEL_STATS 1945 start_pgrp = ddi_get_lbolt(); 1946 #endif /* MEM_DEL_STATS */ 1947 /* 1948 * Lock all constituent pages 1949 * of a large page to ensure 1950 * that p_szc won't change. 
1951 */ 1952 if (!group_page_trylock(pp, 1953 SE_EXCL)) { 1954 MDSTAT_INCR(mhp, 1955 gptllckfail); 1956 page_unlock(pp); 1957 mutex_enter( 1958 &mhp->mh_mutex); 1959 continue; 1960 } 1961 MDSTAT_INCR(mhp, npplocked); 1962 pp_targ = 1963 page_get_replacement_page( 1964 pp, NULL, 0); 1965 if (pp_targ != NULL) { 1966 #ifdef MEM_DEL_STATS 1967 ntick_pgrp = 1968 (uint64_t) 1969 ddi_get_lbolt() - 1970 start_pgrp; 1971 #endif /* MEM_DEL_STATS */ 1972 MDSTAT_PGRP(mhp, 1973 ntick_pgrp); 1974 MDSTAT_INCR(mhp, 1975 nlockreloc); 1976 goto reloc; 1977 } 1978 group_page_unlock(pp); 1979 page_unlock(pp); 1980 #ifdef MEM_DEL_STATS 1981 ntick_pgrp = 1982 (uint64_t)ddi_get_lbolt() - 1983 start_pgrp; 1984 #endif /* MEM_DEL_STATS */ 1985 MDSTAT_PGRP(mhp, ntick_pgrp); 1986 MDSTAT_INCR(mhp, nnorepl); 1987 mutex_enter(&mhp->mh_mutex); 1988 continue; 1989 } else { 1990 /* 1991 * Cannot do anything about 1992 * this page because it is 1993 * toxic. 1994 */ 1995 MDSTAT_INCR(mhp, npplkdtoxic); 1996 page_unlock(pp); 1997 mutex_enter(&mhp->mh_mutex); 1998 continue; 1999 } 2000 } 2001 /* 2002 * Unload the mappings and check if mod bit 2003 * is set. 2004 */ 2005 ASSERT(!PP_ISKAS(pp)); 2006 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 2007 mod = hat_ismod(pp); 2008 2009 #ifdef MEM_DEL_STATS 2010 start_pgrp = ddi_get_lbolt(); 2011 #endif /* MEM_DEL_STATS */ 2012 if (mod && !PP_TOXIC(pp)) { 2013 /* 2014 * Lock all constituent pages 2015 * of a large page to ensure 2016 * that p_szc won't change. 2017 */ 2018 if (!group_page_trylock(pp, SE_EXCL)) { 2019 MDSTAT_INCR(mhp, gptlmodfail); 2020 page_unlock(pp); 2021 mutex_enter(&mhp->mh_mutex); 2022 continue; 2023 } 2024 pp_targ = page_get_replacement_page(pp, 2025 NULL, 0); 2026 if (pp_targ != NULL) { 2027 MDSTAT_INCR(mhp, nmodreloc); 2028 #ifdef MEM_DEL_STATS 2029 ntick_pgrp = 2030 (uint64_t)ddi_get_lbolt() - 2031 start_pgrp; 2032 #endif /* MEM_DEL_STATS */ 2033 MDSTAT_PGRP(mhp, ntick_pgrp); 2034 goto reloc; 2035 } 2036 group_page_unlock(pp); 2037 } 2038 2039 if (!page_try_demote_pages(pp)) { 2040 MDSTAT_INCR(mhp, demotefail); 2041 page_unlock(pp); 2042 #ifdef MEM_DEL_STATS 2043 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2044 start_pgrp; 2045 #endif /* MEM_DEL_STATS */ 2046 MDSTAT_PGRP(mhp, ntick_pgrp); 2047 mutex_enter(&mhp->mh_mutex); 2048 continue; 2049 } 2050 2051 /* 2052 * Regular 'page-out'. 2053 */ 2054 if (!mod) { 2055 MDSTAT_INCR(mhp, ndestroy); 2056 page_destroy(pp, 1); 2057 /* 2058 * page_destroy was called with 2059 * dontfree. As long as p_lckcnt 2060 * and p_cowcnt are both zero, the 2061 * only additional action of 2062 * page_destroy with !dontfree is to 2063 * call page_free, so we can collect 2064 * the page here. 2065 */ 2066 collected++; 2067 #ifdef MEM_DEL_STATS 2068 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2069 start_pgrp; 2070 #endif /* MEM_DEL_STATS */ 2071 MDSTAT_PGRP(mhp, ntick_pgrp); 2072 mutex_enter(&mhp->mh_mutex); 2073 page_delete_collect(pp, mhp); 2074 mdsp->mds_bitmap[bit / NBPBMW] |= 2075 (1 << (bit % NBPBMW)); 2076 continue; 2077 } 2078 /* 2079 * The page is toxic and the mod bit is 2080 * set, we cannot do anything here to deal 2081 * with it. 
2082 */ 2083 if (PP_TOXIC(pp)) { 2084 page_unlock(pp); 2085 #ifdef MEM_DEL_STATS 2086 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2087 start_pgrp; 2088 #endif /* MEM_DEL_STATS */ 2089 MDSTAT_PGRP(mhp, ntick_pgrp); 2090 MDSTAT_INCR(mhp, modtoxic); 2091 mutex_enter(&mhp->mh_mutex); 2092 continue; 2093 } 2094 MDSTAT_INCR(mhp, nputpage); 2095 vp = pp->p_vnode; 2096 offset = pp->p_offset; 2097 VN_HOLD(vp); 2098 page_unlock(pp); 2099 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2100 B_INVAL|B_FORCE, kcred, NULL); 2101 VN_RELE(vp); 2102 #ifdef MEM_DEL_STATS 2103 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2104 start_pgrp; 2105 #endif /* MEM_DEL_STATS */ 2106 MDSTAT_PGRP(mhp, ntick_pgrp); 2107 /* 2108 * Try to get the page back immediately 2109 * so that it can be collected. 2110 */ 2111 pp = page_numtopp_nolock(pfn); 2112 if (pp == NULL) { 2113 MDSTAT_INCR(mhp, nnoreclaim); 2114 /* 2115 * This should not happen as this 2116 * thread is deleting the page. 2117 * If this code is generalized, this 2118 * becomes a reality. 2119 */ 2120 #ifdef DEBUG 2121 cmn_err(CE_WARN, 2122 "delete_memory_thread(0x%p) " 2123 "pfn 0x%lx has no page_t", 2124 (void *)mhp, pfn); 2125 #endif /* DEBUG */ 2126 mutex_enter(&mhp->mh_mutex); 2127 continue; 2128 } 2129 if (page_try_reclaim_lock(pp, SE_EXCL, 2130 SE_EXCL_WANTED | SE_RETIRED)) { 2131 if (PP_ISFREE(pp)) { 2132 goto free_page_collect; 2133 } 2134 page_unlock(pp); 2135 } 2136 MDSTAT_INCR(mhp, nnoreclaim); 2137 mutex_enter(&mhp->mh_mutex); 2138 continue; 2139 2140 reloc: 2141 /* 2142 * Got some freemem and a target 2143 * page, so move the data to avoid 2144 * I/O and lock problems. 2145 */ 2146 ASSERT(!page_iolock_assert(pp)); 2147 MDSTAT_INCR(mhp, nreloc); 2148 /* 2149 * page_relocate() will return pgcnt: the 2150 * number of consecutive pages relocated. 2151 * If it is successful, pp will be a 2152 * linked list of the page structs that 2153 * were relocated. If page_relocate() is 2154 * unsuccessful, pp will be unmodified. 2155 */ 2156 #ifdef MEM_DEL_STATS 2157 start_pgrp = ddi_get_lbolt(); 2158 #endif /* MEM_DEL_STATS */ 2159 result = page_relocate(&pp, &pp_targ, 0, 0, 2160 &pgcnt, NULL); 2161 #ifdef MEM_DEL_STATS 2162 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2163 start_pgrp; 2164 #endif /* MEM_DEL_STATS */ 2165 MDSTAT_PGRP(mhp, ntick_pgrp); 2166 if (result != 0) { 2167 MDSTAT_INCR(mhp, nrelocfail); 2168 /* 2169 * We did not succeed. We need 2170 * to give the pp_targ pages back. 2171 * page_free(pp_targ, 1) without 2172 * the freemem accounting. 2173 */ 2174 group_page_unlock(pp); 2175 page_free_replacement_page(pp_targ); 2176 page_unlock(pp); 2177 mutex_enter(&mhp->mh_mutex); 2178 continue; 2179 } 2180 2181 /* 2182 * We will then collect pgcnt pages. 2183 */ 2184 ASSERT(pgcnt > 0); 2185 mutex_enter(&mhp->mh_mutex); 2186 /* 2187 * We need to make sure freemem_left is 2188 * large enough. 2189 */ 2190 while ((freemem_left < pgcnt) && 2191 (!mhp->mh_cancel)) { 2192 freemem_left += 2193 delthr_get_freemem(mhp); 2194 } 2195 2196 /* 2197 * Do not proceed if mh_cancel is set. 2198 */ 2199 if (mhp->mh_cancel) { 2200 while (pp_targ != NULL) { 2201 /* 2202 * Unlink and unlock each page. 2203 */ 2204 tpp_targ = pp_targ; 2205 page_sub(&pp_targ, tpp_targ); 2206 page_unlock(tpp_targ); 2207 } 2208 /* 2209 * We need to give the pp pages back. 2210 * page_free(pp, 1) without the 2211 * freemem accounting. 
2212 */ 2213 page_free_replacement_page(pp); 2214 break; 2215 } 2216 2217 /* Now remove pgcnt from freemem_left */ 2218 freemem_left -= pgcnt; 2219 ASSERT(freemem_left >= 0); 2220 szc = pp->p_szc; 2221 while (pp != NULL) { 2222 /* 2223 * pp and pp_targ were passed back as 2224 * a linked list of pages. 2225 * Unlink and unlock each page. 2226 */ 2227 tpp_targ = pp_targ; 2228 page_sub(&pp_targ, tpp_targ); 2229 page_unlock(tpp_targ); 2230 /* 2231 * The original page is now free 2232 * so remove it from the linked 2233 * list and collect it. 2234 */ 2235 tpp = pp; 2236 page_sub(&pp, tpp); 2237 pfn = page_pptonum(tpp); 2238 collected++; 2239 ASSERT(PAGE_EXCL(tpp)); 2240 ASSERT(tpp->p_vnode == NULL); 2241 ASSERT(!hat_page_is_mapped(tpp)); 2242 ASSERT(tpp->p_szc == szc); 2243 tpp->p_szc = 0; 2244 page_delete_collect(tpp, mhp); 2245 bit = pfn - mdsp->mds_base; 2246 mdsp->mds_bitmap[bit / NBPBMW] |= 2247 (1 << (bit % NBPBMW)); 2248 } 2249 ASSERT(pp_targ == NULL); 2250 } 2251 } 2252 first_scan = 0; 2253 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2254 (collected == 0)) { 2255 /* 2256 * This code is needed as we cannot wait 2257 * for a page to be locked OR the delete to 2258 * be cancelled. Also, we must delay so 2259 * that other threads get a chance to run 2260 * on our cpu, otherwise page locks may be 2261 * held indefinitely by those threads. 2262 */ 2263 MDSTAT_INCR(mhp, ndelay); 2264 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2265 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, 2266 DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK); 2267 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2268 } 2269 } 2270 /* stop the dr aio cleanup thread */ 2271 mhp->mh_dr_aio_cleanup_cancel = 1; 2272 transit_list_collect(mhp, 0); 2273 if (freemem_left != 0) { 2274 /* Return any surplus. */ 2275 page_create_putback(freemem_left); 2276 freemem_left = 0; 2277 } 2278 #ifdef MEM_DEL_STATS 2279 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2280 #endif /* MEM_DEL_STATS */ 2281 MDSTAT_TOTAL(mhp, ntick_total); 2282 MDSTAT_PRINT(mhp); 2283 2284 /* 2285 * If the memory delete was cancelled, exclusive-wanted bits must 2286 * be cleared. If there are retired pages being deleted, they need 2287 * to be unretired. 2288 */ 2289 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2290 mdsp = mdsp->mds_next) { 2291 pfn_t pfn, p_end; 2292 2293 p_end = mdsp->mds_base + mdsp->mds_npgs; 2294 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2295 page_t *pp; 2296 pgcnt_t bit; 2297 2298 bit = pfn - mdsp->mds_base; 2299 if (mhp->mh_cancel) { 2300 pp = page_numtopp_nolock(pfn); 2301 if (pp != NULL) { 2302 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2303 (1 << (bit % NBPBMW))) == 0) { 2304 page_lock_clr_exclwanted(pp); 2305 } 2306 } 2307 } else { 2308 pp = NULL; 2309 } 2310 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2311 (1 << (bit % NBPBMW))) != 0) { 2312 /* do we already have pp? */ 2313 if (pp == NULL) { 2314 pp = page_numtopp_nolock(pfn); 2315 } 2316 ASSERT(pp != NULL); 2317 ASSERT(PP_RETIRED(pp)); 2318 if (mhp->mh_cancel != 0) { 2319 page_unlock(pp); 2320 /* 2321 * To satisfy ASSERT below in 2322 * cancel code. 
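 * (The cancel path below asserts mh_hold_todo == mh_vm_pages
 * after the deleted pages are returned; retired pages were
 * never collected, so they are accounted for here instead.)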
2323 */ 2324 mhp->mh_hold_todo++; 2325 } else { 2326 (void) page_unretire_pp(pp, 2327 PR_UNR_CLEAN); 2328 } 2329 } 2330 } 2331 } 2332 /* 2333 * Free retired page bitmap and collected page bitmap 2334 */ 2335 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2336 mdsp = mdsp->mds_next) { 2337 ASSERT(mdsp->mds_bitmap_retired != NULL); 2338 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2339 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2340 ASSERT(mdsp->mds_bitmap != NULL); 2341 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2342 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2343 } 2344 2345 /* wait for our dr aio cancel thread to exit */ 2346 while (!(mhp->mh_aio_cleanup_done)) { 2347 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2348 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2349 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2350 } 2351 refused: 2352 if (mhp->mh_cancel != 0) { 2353 page_t *pp; 2354 2355 comp_code = mhp->mh_cancel; 2356 /* 2357 * Go through list of deleted pages (mh_deleted) freeing 2358 * them. 2359 */ 2360 while ((pp = mhp->mh_deleted) != NULL) { 2361 mhp->mh_deleted = pp->p_next; 2362 mhp->mh_hold_todo++; 2363 mutex_exit(&mhp->mh_mutex); 2364 /* Restore p_next. */ 2365 pp->p_next = pp->p_prev; 2366 if (PP_ISFREE(pp)) { 2367 cmn_err(CE_PANIC, 2368 "page %p is free", 2369 (void *)pp); 2370 } 2371 page_free(pp, 1); 2372 mutex_enter(&mhp->mh_mutex); 2373 } 2374 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2375 2376 mutex_exit(&mhp->mh_mutex); 2377 put_availrmem(mhp->mh_vm_pages); 2378 mutex_enter(&mhp->mh_mutex); 2379 2380 goto t_exit; 2381 } 2382 2383 /* 2384 * All the pages are no longer in use and are exclusively locked. 2385 */ 2386 2387 mhp->mh_deleted = NULL; 2388 2389 kphysm_del_cleanup(mhp); 2390 2391 /* 2392 * mem_node_del_range needs to be after kphysm_del_cleanup so 2393 * that the mem_node_config[] will remain intact for the cleanup. 2394 */ 2395 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2396 mdsp = mdsp->mds_next) { 2397 mem_node_del_range(mdsp->mds_base, 2398 mdsp->mds_base + mdsp->mds_npgs - 1); 2399 } 2400 /* cleanup the page counters */ 2401 page_ctrs_cleanup(); 2402 2403 comp_code = KPHYSM_OK; 2404 2405 t_exit: 2406 mutex_exit(&mhp->mh_mutex); 2407 kphysm_setup_post_del(mhp->mh_vm_pages, 2408 (comp_code == KPHYSM_OK) ? 0 : 1); 2409 mutex_enter(&mhp->mh_mutex); 2410 2411 early_exit: 2412 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2413 mhp->mh_state = MHND_DONE; 2414 del_complete_funcp = mhp->mh_delete_complete; 2415 del_complete_arg = mhp->mh_delete_complete_arg; 2416 CALLB_CPR_EXIT(&cprinfo); 2417 (*del_complete_funcp)(del_complete_arg, comp_code); 2418 thread_exit(); 2419 /*NOTREACHED*/ 2420 } 2421 2422 /* 2423 * Start the delete of the memory from the system. 
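 *
 * kphysm_del_start() validates the handle state, records the
 * caller's completion callback and creates delete_memory_thread
 * to do the actual work; the callback is invoked with the
 * completion code when the delete finishes or is cancelled.
 *
 * A rough caller-side sketch (the callback, its argument and the
 * error handling are illustrative only; see <sys/mem_config.h>
 * for the authoritative kphysm_del_* prototypes):
 *
 *	static void
 *	my_del_done(void *arg, int error_code)
 *	{
 *		... record error_code, signal the waiter ...
 *	}
 *
 *	if (kphysm_del_gethandle(&mh) != KPHYSM_OK)
 *		return (error);
 *	if (kphysm_del_span(mh, base_pfn, npgs) != KPHYSM_OK ||
 *	    kphysm_del_start(mh, my_del_done, my_arg) != KPHYSM_OK) {
 *		(void) kphysm_del_release(mh);
 *		return (error);
 *	}
 *	... wait for my_del_done(), then kphysm_del_release(mh) ...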
2424 */ 2425 int 2426 kphysm_del_start( 2427 memhandle_t handle, 2428 void (*complete)(void *, int), 2429 void *complete_arg) 2430 { 2431 struct mem_handle *mhp; 2432 2433 mhp = kphysm_lookup_mem_handle(handle); 2434 if (mhp == NULL) { 2435 return (KPHYSM_EHANDLE); 2436 } 2437 switch (mhp->mh_state) { 2438 case MHND_FREE: 2439 ASSERT(mhp->mh_state != MHND_FREE); 2440 mutex_exit(&mhp->mh_mutex); 2441 return (KPHYSM_EHANDLE); 2442 case MHND_INIT: 2443 break; 2444 case MHND_STARTING: 2445 case MHND_RUNNING: 2446 mutex_exit(&mhp->mh_mutex); 2447 return (KPHYSM_ESEQUENCE); 2448 case MHND_DONE: 2449 mutex_exit(&mhp->mh_mutex); 2450 return (KPHYSM_ESEQUENCE); 2451 case MHND_RELEASE: 2452 mutex_exit(&mhp->mh_mutex); 2453 return (KPHYSM_ESEQUENCE); 2454 default: 2455 #ifdef DEBUG 2456 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2457 (void *)mhp, mhp->mh_state); 2458 #endif /* DEBUG */ 2459 mutex_exit(&mhp->mh_mutex); 2460 return (KPHYSM_EHANDLE); 2461 } 2462 2463 if (mhp->mh_transit.trl_spans == NULL) { 2464 mutex_exit(&mhp->mh_mutex); 2465 return (KPHYSM_ENOWORK); 2466 } 2467 2468 ASSERT(complete != NULL); 2469 mhp->mh_delete_complete = complete; 2470 mhp->mh_delete_complete_arg = complete_arg; 2471 mhp->mh_state = MHND_STARTING; 2472 /* 2473 * Release the mutex in case thread_create sleeps. 2474 */ 2475 mutex_exit(&mhp->mh_mutex); 2476 2477 /* 2478 * The "obvious" process for this thread is pageout (proc_pageout) 2479 * but this gives the thread too much power over freemem 2480 * which results in freemem starvation. 2481 */ 2482 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2483 TS_RUN, maxclsyspri - 1); 2484 2485 return (KPHYSM_OK); 2486 } 2487 2488 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2489 static caddr_t pp_dummy; 2490 static pgcnt_t pp_dummy_npages; 2491 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ 2492 2493 static void 2494 memseg_remap_init_pages(page_t *pages, page_t *epages) 2495 { 2496 page_t *pp; 2497 2498 for (pp = pages; pp < epages; pp++) { 2499 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2500 pp->p_offset = (u_offset_t)-1; 2501 page_iolock_init(pp); 2502 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2503 continue; 2504 page_lock_delete(pp); 2505 } 2506 } 2507 2508 void 2509 memseg_remap_init() 2510 { 2511 mutex_enter(&pp_dummy_lock); 2512 if (pp_dummy == NULL) { 2513 uint_t dpages; 2514 int i; 2515 2516 /* 2517 * dpages starts off as the size of the structure and 2518 * ends up as the minimum number of pages that will 2519 * hold a whole number of page_t structures. 2520 */ 2521 dpages = sizeof (page_t); 2522 ASSERT(dpages != 0); 2523 ASSERT(dpages <= MMU_PAGESIZE); 2524 2525 while ((dpages & 1) == 0) 2526 dpages >>= 1; 2527 2528 pp_dummy_npages = dpages; 2529 /* 2530 * Allocate pp_dummy pages directly from static_arena, 2531 * since these are whole page allocations and are 2532 * referenced by physical address. This also has the 2533 * nice fringe benefit of hiding the memory from 2534 * ::findleaks since it doesn't deal well with allocated 2535 * kernel heap memory that doesn't have any mappings. 
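 * The allocation is PAGESIZE aligned so that each of the
 * pp_dummy_npages pages is a distinct kernel page whose pfn can
 * be looked up with hat_getpfnum() below and later used as a
 * remap target by remap_to_dummy().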
2536 */ 2537 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2538 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2539 bzero(pp_dummy, ptob(pp_dummy_npages)); 2540 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2541 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2542 pp_dummy_npages, KM_SLEEP); 2543 for (i = 0; i < pp_dummy_npages; i++) { 2544 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2545 &pp_dummy[MMU_PAGESIZE * i]); 2546 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2547 } 2548 /* 2549 * Initialize the page_t's to a known 'deleted' state 2550 * that matches the state of deleted pages. 2551 */ 2552 memseg_remap_init_pages((page_t *)pp_dummy, 2553 (page_t *)(pp_dummy + ptob(pp_dummy_npages))); 2554 /* Remove kmem mappings for the pages for safety. */ 2555 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2556 HAT_UNLOAD_UNLOCK); 2557 /* Leave the pp_dummy pointer set as a flag that init is done. */ 2558 } 2559 mutex_exit(&pp_dummy_lock); 2560 } 2561 2562 /* 2563 * Remap a page-aligned range of page_t's to dummy pages. 2564 */ 2565 void 2566 remap_to_dummy(caddr_t va, pgcnt_t metapgs) 2567 { 2568 int phase; 2569 2570 ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE)); 2571 2572 /* 2573 * We may start remapping at a non-zero page offset 2574 * within the dummy pages since the low/high ends 2575 * of the outgoing pp's could be shared by other 2576 * memsegs (see memseg_remap_meta). 2577 */ 2578 phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages; 2579 /*CONSTCOND*/ 2580 ASSERT(PAGESIZE % sizeof (page_t) || phase == 0); 2581 2582 while (metapgs != 0) { 2583 pgcnt_t n; 2584 int i, j; 2585 2586 n = pp_dummy_npages; 2587 if (n > metapgs) 2588 n = metapgs; 2589 for (i = 0; i < n; i++) { 2590 j = (i + phase) % pp_dummy_npages; 2591 hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j], 2592 PROT_READ, 2593 HAT_LOAD | HAT_LOAD_NOCONSIST | 2594 HAT_LOAD_REMAP); 2595 va += ptob(1); 2596 } 2597 metapgs -= n; 2598 } 2599 } 2600 2601 static void 2602 memseg_remap_to_dummy(struct memseg *seg) 2603 { 2604 caddr_t pp; 2605 pgcnt_t metapgs; 2606 2607 ASSERT(memseg_is_dynamic(seg)); 2608 ASSERT(pp_dummy != NULL); 2609 2610 2611 if (!memseg_includes_meta(seg)) { 2612 memseg_remap_meta(seg); 2613 return; 2614 } 2615 2616 pp = (caddr_t)seg->pages; 2617 metapgs = seg->pages_base - memseg_get_start(seg); 2618 ASSERT(metapgs != 0); 2619 2620 seg->pages_end = seg->pages_base; 2621 2622 remap_to_dummy(pp, metapgs); 2623 } 2624 2625 /* 2626 * Transition all the deleted pages to the deleted state so that 2627 * page_lock will not wait. The page_lock_delete call will 2628 * also wake up any waiters. 2629 */ 2630 static void 2631 memseg_lock_delete_all(struct memseg *seg) 2632 { 2633 page_t *pp; 2634 2635 for (pp = seg->pages; pp < seg->epages; pp++) { 2636 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2637 page_lock_delete(pp); 2638 } 2639 } 2640 2641 static void 2642 kphysm_del_cleanup(struct mem_handle *mhp) 2643 { 2644 struct memdelspan *mdsp; 2645 struct memseg *seg; 2646 struct memseg **segpp; 2647 struct memseg *seglist; 2648 pfn_t p_end; 2649 uint64_t avmem; 2650 pgcnt_t avpgs; 2651 pgcnt_t npgs; 2652 2653 avpgs = mhp->mh_vm_pages; 2654 2655 memsegs_lock(1); 2656 2657 /* 2658 * Remove from the main segment list.
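 * Walk the global memsegs list and unhook every memseg that is
 * wholly contained in one of the delete spans, collecting the
 * unhooked memsegs on a private list (linked through lnext) for
 * the remapping and memlist updates below.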
2659 */ 2660 npgs = 0; 2661 seglist = NULL; 2662 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2663 mdsp = mdsp->mds_next) { 2664 p_end = mdsp->mds_base + mdsp->mds_npgs; 2665 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2666 if (seg->pages_base >= p_end || 2667 seg->pages_end <= mdsp->mds_base) { 2668 /* Span and memseg don't overlap. */ 2669 segpp = &((*segpp)->next); 2670 continue; 2671 } 2672 ASSERT(seg->pages_base >= mdsp->mds_base); 2673 ASSERT(seg->pages_end <= p_end); 2674 2675 PLCNT_MODIFY_MAX(seg->pages_base, 2676 seg->pages_base - seg->pages_end); 2677 2678 /* Hide the memseg from future scans. */ 2679 hat_kpm_delmem_mseg_update(seg, segpp); 2680 *segpp = seg->next; 2681 membar_producer(); /* TODO: Needed? */ 2682 npgs += MSEG_NPAGES(seg); 2683 2684 /* 2685 * Leave the deleted segment's next pointer intact 2686 * in case a memsegs scanning loop is walking this 2687 * segment concurrently. 2688 */ 2689 seg->lnext = seglist; 2690 seglist = seg; 2691 } 2692 } 2693 2694 build_pfn_hash(); 2695 2696 ASSERT(npgs < total_pages); 2697 total_pages -= npgs; 2698 2699 /* 2700 * Recalculate the paging parameters now total_pages has changed. 2701 * This will also cause the clock hands to be reset before next use. 2702 */ 2703 setupclock(); 2704 2705 memsegs_unlock(1); 2706 2707 mutex_exit(&mhp->mh_mutex); 2708 2709 while ((seg = seglist) != NULL) { 2710 pfn_t mseg_start; 2711 pfn_t mseg_base, mseg_end; 2712 pgcnt_t mseg_npgs; 2713 int mlret; 2714 2715 seglist = seg->lnext; 2716 2717 /* 2718 * Put the page_t's into the deleted state to stop 2719 * cv_wait()s on the pages. When we remap, the dummy 2720 * page_t's will be in the same state. 2721 */ 2722 memseg_lock_delete_all(seg); 2723 /* 2724 * Collect up information based on pages_base and pages_end 2725 * early so that we can flag early that the memseg has been 2726 * deleted by setting pages_end == pages_base. 2727 */ 2728 mseg_base = seg->pages_base; 2729 mseg_end = seg->pages_end; 2730 mseg_npgs = MSEG_NPAGES(seg); 2731 mseg_start = memseg_get_start(seg); 2732 2733 if (memseg_is_dynamic(seg)) { 2734 /* Remap the meta data to our special dummy area. */ 2735 memseg_remap_to_dummy(seg); 2736 2737 mutex_enter(&memseg_lists_lock); 2738 seg->lnext = memseg_va_avail; 2739 memseg_va_avail = seg; 2740 mutex_exit(&memseg_lists_lock); 2741 } else { 2742 /* 2743 * For memory whose page_ts were allocated 2744 * at boot, we need to find a new use for 2745 * the page_t memory. 2746 * For the moment, just leak it. 2747 * (It is held in the memseg_delete_junk list.) 2748 */ 2749 seg->pages_end = seg->pages_base; 2750 2751 mutex_enter(&memseg_lists_lock); 2752 seg->lnext = memseg_delete_junk; 2753 memseg_delete_junk = seg; 2754 mutex_exit(&memseg_lists_lock); 2755 } 2756 2757 /* Must not use seg now as it could be re-used. 
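 * (Everything needed below was captured above in mseg_start,
 * mseg_base, mseg_end and mseg_npgs, and the memseg itself is
 * now on either memseg_va_avail or memseg_delete_junk.)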
*/ 2758 2759 memlist_write_lock(); 2760 2761 mlret = memlist_delete_span( 2762 (uint64_t)(mseg_base) << PAGESHIFT, 2763 (uint64_t)(mseg_npgs) << PAGESHIFT, 2764 &phys_avail); 2765 ASSERT(mlret == MEML_SPANOP_OK); 2766 2767 mlret = memlist_delete_span( 2768 (uint64_t)(mseg_start) << PAGESHIFT, 2769 (uint64_t)(mseg_end - mseg_start) << 2770 PAGESHIFT, 2771 &phys_install); 2772 ASSERT(mlret == MEML_SPANOP_OK); 2773 phys_install_has_changed(); 2774 2775 memlist_write_unlock(); 2776 } 2777 2778 memlist_read_lock(); 2779 installed_top_size(phys_install, &physmax, &physinstalled); 2780 memlist_read_unlock(); 2781 2782 mutex_enter(&freemem_lock); 2783 maxmem -= avpgs; 2784 physmem -= avpgs; 2785 /* availrmem is adjusted during the delete. */ 2786 availrmem_initial -= avpgs; 2787 2788 mutex_exit(&freemem_lock); 2789 2790 dump_resize(); 2791 2792 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2793 "(0x%" PRIx64 ")\n", 2794 physinstalled << (PAGESHIFT - 10), 2795 (uint64_t)physinstalled << PAGESHIFT); 2796 2797 avmem = (uint64_t)freemem << PAGESHIFT; 2798 cmn_err(CE_CONT, "?kphysm_delete: " 2799 "avail mem = %" PRId64 "\n", avmem); 2800 2801 /* 2802 * Update lgroup generation number on single lgroup systems 2803 */ 2804 if (nlgrps == 1) 2805 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2806 2807 /* Successfully deleted system memory */ 2808 mutex_enter(&mhp->mh_mutex); 2809 } 2810 2811 static uint_t mdel_nullvp_waiter; 2812 2813 static void 2814 page_delete_collect( 2815 page_t *pp, 2816 struct mem_handle *mhp) 2817 { 2818 if (pp->p_vnode) { 2819 page_hashout(pp, (kmutex_t *)NULL); 2820 /* do not do PP_SETAGED(pp); */ 2821 } else { 2822 kmutex_t *sep; 2823 2824 sep = page_se_mutex(pp); 2825 mutex_enter(sep); 2826 if (CV_HAS_WAITERS(&pp->p_cv)) { 2827 mdel_nullvp_waiter++; 2828 cv_broadcast(&pp->p_cv); 2829 } 2830 mutex_exit(sep); 2831 } 2832 ASSERT(pp->p_next == pp->p_prev); 2833 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2834 pp->p_next = mhp->mh_deleted; 2835 mhp->mh_deleted = pp; 2836 ASSERT(mhp->mh_hold_todo != 0); 2837 mhp->mh_hold_todo--; 2838 } 2839 2840 static void 2841 transit_list_collect(struct mem_handle *mhp, int v) 2842 { 2843 struct transit_list_head *trh; 2844 2845 trh = &transit_list_head; 2846 mutex_enter(&trh->trh_lock); 2847 mhp->mh_transit.trl_collect = v; 2848 mutex_exit(&trh->trh_lock); 2849 } 2850 2851 static void 2852 transit_list_insert(struct transit_list *tlp) 2853 { 2854 struct transit_list_head *trh; 2855 2856 trh = &transit_list_head; 2857 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2858 tlp->trl_next = trh->trh_head; 2859 trh->trh_head = tlp; 2860 } 2861 2862 static void 2863 transit_list_remove(struct transit_list *tlp) 2864 { 2865 struct transit_list_head *trh; 2866 struct transit_list **tlpp; 2867 2868 trh = &transit_list_head; 2869 tlpp = &trh->trh_head; 2870 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2871 while (*tlpp != NULL && *tlpp != tlp) 2872 tlpp = &(*tlpp)->trl_next; 2873 ASSERT(*tlpp != NULL); 2874 if (*tlpp == tlp) 2875 *tlpp = tlp->trl_next; 2876 tlp->trl_next = NULL; 2877 } 2878 2879 static struct transit_list * 2880 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2881 { 2882 struct transit_list *tlp; 2883 2884 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2885 struct memdelspan *mdsp; 2886 2887 for (mdsp = tlp->trl_spans; mdsp != NULL; 2888 mdsp = mdsp->mds_next) { 2889 if (pfnum >= mdsp->mds_base && 2890 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2891 return (tlp); 2892 } 2893 } 2894 } 2895 return (NULL); 2896 } 2897 2898 int 
2899 pfn_is_being_deleted(pfn_t pfnum) 2900 { 2901 struct transit_list_head *trh; 2902 struct transit_list *tlp; 2903 int ret; 2904 2905 trh = &transit_list_head; 2906 if (trh->trh_head == NULL) 2907 return (0); 2908 2909 mutex_enter(&trh->trh_lock); 2910 tlp = pfnum_to_transit_list(trh, pfnum); 2911 ret = (tlp != NULL && tlp->trl_collect); 2912 mutex_exit(&trh->trh_lock); 2913 2914 return (ret); 2915 } 2916 2917 #ifdef MEM_DEL_STATS 2918 extern int hz; 2919 static void 2920 mem_del_stat_print_func(struct mem_handle *mhp) 2921 { 2922 uint64_t tmp; 2923 2924 if (mem_del_stat_print) { 2925 printf("memory delete loop %x/%x, statistics%s\n", 2926 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2927 (uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2928 (mhp->mh_cancel ? " (cancelled)" : "")); 2929 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2930 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2931 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2932 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2933 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2934 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2935 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2936 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2937 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2938 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2939 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2940 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2941 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2942 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2943 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2944 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2945 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2946 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2947 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2948 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2949 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2950 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2951 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2952 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2953 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2954 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2955 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2956 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2957 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2958 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2959 printf( 2960 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2961 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2962 2963 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2964 printf( 2965 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2966 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2967 } 2968 } 2969 #endif /* MEM_DEL_STATS */ 2970 2971 struct mem_callback { 2972 kphysm_setup_vector_t *vec; 2973 void *arg; 2974 }; 2975 2976 #define NMEMCALLBACKS 100 2977 2978 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2979 static uint_t nmemcallbacks; 2980 static krwlock_t mem_callback_rwlock; 2981 2982 int 2983 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2984 { 2985 uint_t i, found; 2986 2987 /* 2988 * This test will become more complicated when the version must 2989 * change. 
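 *
 * For reference, a minimal registration sketch from the client
 * side (the foo_* names are illustrative; the initializer assumes
 * the field order declared in <sys/mem_config.h>, and a non-zero
 * return from pre_del refuses the delete):
 *
 *	static void
 *	foo_post_add(void *arg, pgcnt_t delta_pages)
 *	{ ... }
 *
 *	static int
 *	foo_pre_del(void *arg, pgcnt_t delta_pages)
 *	{
 *		return (foo_busy(arg) ? EBUSY : 0);
 *	}
 *
 *	static void
 *	foo_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
 *	{ ... }
 *
 *	static kphysm_setup_vector_t foo_vec = {
 *		KPHYSM_SETUP_VECTOR_VERSION,
 *		foo_post_add,
 *		foo_pre_del,
 *		foo_post_del
 *	};
 *
 *	error = kphysm_setup_func_register(&foo_vec, foo_softc);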
2990 */ 2991 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2992 return (EINVAL); 2993 2994 if (vec->post_add == NULL || vec->pre_del == NULL || 2995 vec->post_del == NULL) 2996 return (EINVAL); 2997 2998 rw_enter(&mem_callback_rwlock, RW_WRITER); 2999 for (i = 0, found = 0; i < nmemcallbacks; i++) { 3000 if (mem_callbacks[i].vec == NULL && found == 0) 3001 found = i + 1; 3002 if (mem_callbacks[i].vec == vec && 3003 mem_callbacks[i].arg == arg) { 3004 #ifdef DEBUG 3005 /* Catch this in DEBUG kernels. */ 3006 cmn_err(CE_WARN, "kphysm_setup_func_register" 3007 "(0x%p, 0x%p) duplicate registration from 0x%p", 3008 (void *)vec, arg, (void *)caller()); 3009 #endif /* DEBUG */ 3010 rw_exit(&mem_callback_rwlock); 3011 return (EEXIST); 3012 } 3013 } 3014 if (found != 0) { 3015 i = found - 1; 3016 } else { 3017 ASSERT(nmemcallbacks < NMEMCALLBACKS); 3018 if (nmemcallbacks == NMEMCALLBACKS) { 3019 rw_exit(&mem_callback_rwlock); 3020 return (ENOMEM); 3021 } 3022 i = nmemcallbacks++; 3023 } 3024 mem_callbacks[i].vec = vec; 3025 mem_callbacks[i].arg = arg; 3026 rw_exit(&mem_callback_rwlock); 3027 return (0); 3028 } 3029 3030 void 3031 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 3032 { 3033 uint_t i; 3034 3035 rw_enter(&mem_callback_rwlock, RW_WRITER); 3036 for (i = 0; i < nmemcallbacks; i++) { 3037 if (mem_callbacks[i].vec == vec && 3038 mem_callbacks[i].arg == arg) { 3039 mem_callbacks[i].vec = NULL; 3040 mem_callbacks[i].arg = NULL; 3041 if (i == (nmemcallbacks - 1)) 3042 nmemcallbacks--; 3043 break; 3044 } 3045 } 3046 rw_exit(&mem_callback_rwlock); 3047 } 3048 3049 static void 3050 kphysm_setup_post_add(pgcnt_t delta_pages) 3051 { 3052 uint_t i; 3053 3054 rw_enter(&mem_callback_rwlock, RW_READER); 3055 for (i = 0; i < nmemcallbacks; i++) { 3056 if (mem_callbacks[i].vec != NULL) { 3057 (*mem_callbacks[i].vec->post_add) 3058 (mem_callbacks[i].arg, delta_pages); 3059 } 3060 } 3061 rw_exit(&mem_callback_rwlock); 3062 } 3063 3064 /* 3065 * Note the locking between pre_del and post_del: The reader lock is held 3066 * between the two calls to stop the set of functions from changing. 3067 */ 3068 3069 static int 3070 kphysm_setup_pre_del(pgcnt_t delta_pages) 3071 { 3072 uint_t i; 3073 int ret; 3074 int aret; 3075 3076 ret = 0; 3077 rw_enter(&mem_callback_rwlock, RW_READER); 3078 for (i = 0; i < nmemcallbacks; i++) { 3079 if (mem_callbacks[i].vec != NULL) { 3080 aret = (*mem_callbacks[i].vec->pre_del) 3081 (mem_callbacks[i].arg, delta_pages); 3082 ret |= aret; 3083 } 3084 } 3085 3086 return (ret); 3087 } 3088 3089 static void 3090 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 3091 { 3092 uint_t i; 3093 3094 for (i = 0; i < nmemcallbacks; i++) { 3095 if (mem_callbacks[i].vec != NULL) { 3096 (*mem_callbacks[i].vec->post_del) 3097 (mem_callbacks[i].arg, delta_pages, cancelled); 3098 } 3099 } 3100 rw_exit(&mem_callback_rwlock); 3101 } 3102 3103 static int 3104 kphysm_split_memseg( 3105 pfn_t base, 3106 pgcnt_t npgs) 3107 { 3108 struct memseg *seg; 3109 struct memseg **segpp; 3110 pgcnt_t size_low, size_high; 3111 struct memseg *seg_low, *seg_mid, *seg_high; 3112 3113 /* 3114 * Lock the memsegs list against other updates now 3115 */ 3116 memsegs_lock(1); 3117 3118 /* 3119 * Find boot time memseg that wholly covers this area. 3120 */ 3121 3122 /* First find the memseg with page 'base' in it. 
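 * The span must then lie entirely within that one memseg, and the
 * memseg must not embed its own page_t metadata; otherwise the
 * split is refused (returns 0) and the memsegs list is left
 * untouched.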
*/ 3123 for (segpp = &memsegs; (seg = *segpp) != NULL; 3124 segpp = &((*segpp)->next)) { 3125 if (base >= seg->pages_base && base < seg->pages_end) 3126 break; 3127 } 3128 if (seg == NULL) { 3129 memsegs_unlock(1); 3130 return (0); 3131 } 3132 if (memseg_includes_meta(seg)) { 3133 memsegs_unlock(1); 3134 return (0); 3135 } 3136 if ((base + npgs) > seg->pages_end) { 3137 memsegs_unlock(1); 3138 return (0); 3139 } 3140 3141 /* 3142 * Work out the size of the two segments that will 3143 * surround the new segment, one for low address 3144 * and one for high. 3145 */ 3146 ASSERT(base >= seg->pages_base); 3147 size_low = base - seg->pages_base; 3148 ASSERT(seg->pages_end >= (base + npgs)); 3149 size_high = seg->pages_end - (base + npgs); 3150 3151 /* 3152 * Sanity check. 3153 */ 3154 if ((size_low + size_high) == 0) { 3155 memsegs_unlock(1); 3156 return (0); 3157 } 3158 3159 /* 3160 * Allocate the new structures. The old memseg will not be freed 3161 * as there may be a reference to it. 3162 */ 3163 seg_low = NULL; 3164 seg_high = NULL; 3165 3166 if (size_low != 0) 3167 seg_low = memseg_alloc(); 3168 3169 seg_mid = memseg_alloc(); 3170 3171 if (size_high != 0) 3172 seg_high = memseg_alloc(); 3173 3174 /* 3175 * All allocation done now. 3176 */ 3177 if (size_low != 0) { 3178 seg_low->pages = seg->pages; 3179 seg_low->epages = seg_low->pages + size_low; 3180 seg_low->pages_base = seg->pages_base; 3181 seg_low->pages_end = seg_low->pages_base + size_low; 3182 seg_low->next = seg_mid; 3183 seg_low->msegflags = seg->msegflags; 3184 } 3185 if (size_high != 0) { 3186 seg_high->pages = seg->epages - size_high; 3187 seg_high->epages = seg_high->pages + size_high; 3188 seg_high->pages_base = seg->pages_end - size_high; 3189 seg_high->pages_end = seg_high->pages_base + size_high; 3190 seg_high->next = seg->next; 3191 seg_high->msegflags = seg->msegflags; 3192 } 3193 3194 seg_mid->pages = seg->pages + size_low; 3195 seg_mid->pages_base = seg->pages_base + size_low; 3196 seg_mid->epages = seg->epages - size_high; 3197 seg_mid->pages_end = seg->pages_end - size_high; 3198 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3199 seg_mid->msegflags = seg->msegflags; 3200 3201 /* 3202 * Update hat_kpm specific info of all involved memsegs and 3203 * allow hat_kpm specific global chain updates. 3204 */ 3205 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3206 3207 /* 3208 * At this point we have two equivalent memseg sub-chains, 3209 * seg and seg_low/seg_mid/seg_high, which both chain on to 3210 * the same place in the global chain. By re-writing the pointer 3211 * in the previous element we switch atomically from using the old 3212 * (seg) to the new. 3213 */ 3214 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3215 3216 membar_enter(); 3217 3218 build_pfn_hash(); 3219 memsegs_unlock(1); 3220 3221 /* 3222 * We leave the old segment, 'seg', intact as there may be 3223 * references to it. Also, as the value of total_pages has not 3224 * changed and the memsegs list is effectively the same when 3225 * accessed via the old or the new pointer, we do not have to 3226 * cause pageout_scanner() to re-evaluate its hand pointers. 3227 * 3228 * We currently do not re-use or reclaim the page_t memory. 3229 * If we do, then this may have to change. 3230 */ 3231 3232 mutex_enter(&memseg_lists_lock); 3233 seg->lnext = memseg_edit_junk; 3234 memseg_edit_junk = seg; 3235 mutex_exit(&memseg_lists_lock); 3236 3237 return (1); 3238 } 3239 3240 /* 3241 * The sfmmu hat layer (e.g.) 
accesses some parts of the memseg 3242 * structure using physical addresses. Therefore a kmem_cache is 3243 * used with KMC_NOHASH to avoid page crossings within a memseg 3244 * structure. KMC_NOHASH requires that no external (outside of 3245 * slab) information is allowed. This, in turn, implies that the 3246 * cache's slabsize must be exactly a single page, since per-slab 3247 * information (e.g. the freelist for the slab) is kept at the 3248 * end of the slab, where it is easy to locate. This should be 3249 * changed when a more obvious kmem_cache interface/flag becomes 3250 * available. 3251 */ 3252 void 3253 mem_config_init() 3254 { 3255 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3256 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3257 } 3258 3259 struct memseg * 3260 memseg_alloc() 3261 { 3262 struct memseg *seg; 3263 3264 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3265 bzero(seg, sizeof (struct memseg)); 3266 3267 return (seg); 3268 } 3269 3270 /* 3271 * Return whether the page_t memory for this memseg 3272 * is included in the memseg itself. 3273 */ 3274 static int 3275 memseg_includes_meta(struct memseg *seg) 3276 { 3277 return (seg->msegflags & MEMSEG_META_INCL); 3278 } 3279 3280 pfn_t 3281 memseg_get_start(struct memseg *seg) 3282 { 3283 pfn_t pt_start; 3284 3285 if (memseg_includes_meta(seg)) { 3286 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 3287 3288 /* Meta data is required to be at the beginning */ 3289 ASSERT(pt_start < seg->pages_base); 3290 } else 3291 pt_start = seg->pages_base; 3292 3293 return (pt_start); 3294 } 3295 3296 /* 3297 * Invalidate memseg pointers in cpu private vm data caches. 3298 */ 3299 static void 3300 memseg_cpu_vm_flush() 3301 { 3302 cpu_t *cp; 3303 vm_cpu_data_t *vc; 3304 3305 mutex_enter(&cpu_lock); 3306 pause_cpus(NULL, NULL); 3307 3308 cp = cpu_list; 3309 do { 3310 vc = cp->cpu_vm_data; 3311 vc->vc_pnum_memseg = NULL; 3312 vc->vc_pnext_memseg = NULL; 3313 3314 } while ((cp = cp->cpu_next) != cpu_list); 3315 3316 start_cpus(); 3317 mutex_exit(&cpu_lock); 3318 } 3319