1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/cmn_err.h> 28 #include <sys/vmem.h> 29 #include <sys/kmem.h> 30 #include <sys/systm.h> 31 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 32 #include <sys/errno.h> 33 #include <sys/memnode.h> 34 #include <sys/memlist.h> 35 #include <sys/memlist_impl.h> 36 #include <sys/tuneable.h> 37 #include <sys/proc.h> 38 #include <sys/disp.h> 39 #include <sys/debug.h> 40 #include <sys/vm.h> 41 #include <sys/callb.h> 42 #include <sys/memlist_plat.h> /* for installed_top_size() */ 43 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 44 #include <sys/dumphdr.h> /* for dump_resize() */ 45 #include <sys/atomic.h> /* for use in stats collection */ 46 #include <sys/rwlock.h> 47 #include <sys/cpuvar.h> 48 #include <vm/seg_kmem.h> 49 #include <vm/seg_kpm.h> 50 #include <vm/page.h> 51 #include <vm/vm_dep.h> 52 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 53 #include <sys/sunddi.h> 54 #include <sys/mem_config.h> 55 #include <sys/mem_cage.h> 56 #include 
<sys/lgrp.h> 57 #include <sys/ddi.h> 58 #include <sys/modctl.h> 59 60 extern struct memlist *phys_avail; 61 62 extern void mem_node_add(pfn_t, pfn_t); 63 extern void mem_node_del(pfn_t, pfn_t); 64 65 extern uint_t page_ctrs_adjust(int); 66 static void kphysm_setup_post_add(pgcnt_t); 67 static int kphysm_setup_pre_del(pgcnt_t); 68 static void kphysm_setup_post_del(pgcnt_t, int); 69 70 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 71 72 static int delspan_reserve(pfn_t, pgcnt_t); 73 static void delspan_unreserve(pfn_t, pgcnt_t); 74 75 kmutex_t memseg_lists_lock; 76 struct memseg *memseg_va_avail; 77 struct memseg *memseg_alloc(void); 78 static struct memseg *memseg_delete_junk; 79 static struct memseg *memseg_edit_junk; 80 void memseg_remap_init(void); 81 static void memseg_remap_to_dummy(struct memseg *); 82 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 83 static struct memseg *memseg_reuse(pgcnt_t); 84 85 static struct kmem_cache *memseg_cache; 86 87 /* 88 * Interfaces to manage externally allocated 89 * page_t memory (metadata) for a memseg. 90 */ 91 #pragma weak memseg_alloc_meta 92 #pragma weak memseg_free_meta 93 #pragma weak memseg_get_metapfn 94 #pragma weak memseg_remap_meta 95 96 extern int ppvm_enable; 97 extern page_t *ppvm_base; 98 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *); 99 extern void memseg_free_meta(void *, pgcnt_t); 100 extern pfn_t memseg_get_metapfn(void *, pgcnt_t); 101 extern void memseg_remap_meta(struct memseg *); 102 static int memseg_is_dynamic(struct memseg *); 103 static int memseg_includes_meta(struct memseg *); 104 static pfn_t memseg_get_start(struct memseg *); 105 static void memseg_cpu_vm_flush(void); 106 107 int meta_alloc_enable; 108 109 /* 110 * Add a chunk of memory to the system. 111 * base: starting PAGESIZE page of new memory. 112 * npgs: length in PAGESIZE pages. 113 * 114 * Adding mem this way doesn't increase the size of the hash tables; 115 * growing them would be too hard. 
 * This should be OK, but adding memory
 * dynamically most likely means more hash misses, since the tables will
 * be smaller than they otherwise would be.
 */
#ifdef DEBUG
static int memseg_debug;
#define	MEMSEG_DEBUG(args...) if (memseg_debug) printf(args)
#else
#define	MEMSEG_DEBUG(...)
#endif

int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
	page_t *pp;
	page_t		*opp, *oepp, *segpp;
	struct memseg	*seg;
	uint64_t	avmem;
	pfn_t		pfn;
	pfn_t		pt_base = base;		/* original span start (incl. metadata) */
	pgcnt_t		tpgs = npgs;		/* original span length (incl. metadata) */
	pgcnt_t		metapgs = 0;		/* pages consumed by page_t metadata */
	int		exhausted;
	pfn_t		pnum;
	int		mnode;
	caddr_t		vaddr;
	int		reuse;
	int		mlret;
	int		rv;
	int		flags;
	int		meta_alloc = 0;		/* 1 if metadata came from existing memory */
	void		*mapva;
	void		*metabase = (void *)base;
	pgcnt_t		nkpmpgs = 0;
	offset_t	kpm_pages_off;

	cmn_err(CE_CONT,
	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);

	/*
	 * Add this span in the delete list to prevent interactions.
	 */
	if (!delspan_reserve(base, npgs)) {
		return (KPHYSM_ESPAN);
	}
	/*
	 * Check to see if any of the memory span has been added
	 * by trying an add to the installed memory list. This
	 * forms the interlocking process for add.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	if (mlret == MEML_SPANOP_OK)
		installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();

	if (mlret != MEML_SPANOP_OK) {
		if (mlret == MEML_SPANOP_EALLOC) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		} else if (mlret == MEML_SPANOP_ESPAN) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ESPAN);
		} else {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		}
	}

	if (meta_alloc_enable) {
		/*
		 * Allocate the page_t's from existing memory;
		 * if that fails, allocate from the incoming memory.
		 */
		rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs);
		if (rv == KPHYSM_OK) {
			ASSERT(metapgs);
			ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);
			meta_alloc = 1;
			goto mapalloc;
		}
	}

	/*
	 * We store the page_t's for this new memory in the first
	 * few pages of the chunk. Here, we go and get'em ...
	 */

	/*
	 * The expression after the '-' gives the number of pages
	 * that will fit in the new memory based on a requirement
	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
	 */
	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	npgs -= metapgs;
	base += metapgs;

	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);

	exhausted = (metapgs == 0 || npgs == 0);

	if (kpm_enable && !exhausted) {
		pgcnt_t start, end, nkpmpgs_prelim;
		size_t ptsz;

		/*
		 * A viable kpm large page mapping must not overlap two
		 * dynamic memsegs. Therefore the total size is checked
		 * to be at least kpm_pgsz and also whether start and end
		 * points are at least kpm_pgsz aligned.
		 */
		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
		    pmodkpmp(base + npgs)) {

			kphysm_addmem_error_undospan(pt_base, tpgs);

			/*
			 * There is no specific error code for violating
			 * kpm granularity constraints.
			 */
			return (KPHYSM_ENOTVIABLE);
		}

		start = kpmptop(ptokpmp(base));
		end = kpmptop(ptokpmp(base + npgs));
		nkpmpgs_prelim = ptokpmp(end - start);
		ptsz = npgs * sizeof (page_t);
		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
		exhausted = (tpgs <= metapgs);
		if (!exhausted) {
			npgs = tpgs - metapgs;
			base = pt_base + metapgs;

			/* final nkpmpgs */
			start = kpmptop(ptokpmp(base));
			nkpmpgs = ptokpmp(end - start);
			kpm_pages_off = ptsz +
			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
		}
	}

	/*
	 * Is memory area supplied too small?
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);
		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}

mapalloc:
	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);

	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		if (meta_alloc)
			pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum);
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	/*
	 * Probe the newly mapped metadata pages; a failed peek means the
	 * physical memory is not actually accessible.
	 */
	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 */
	pnum = pt_base + tpgs - 1;
	mem_node_add_range(pt_base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	PAGE_CTRS_ADJUST(base, npgs, rv);
	if (rv) {
		/* Roll back the node range and metadata mappings on failure. */
		mem_node_del_range(pt_base, pnum);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
		if (meta_alloc)
			memseg_free_meta(metabase, metapgs);
		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	if (meta_alloc) {
		seg = memseg_reuse(0);
		reuse = 1;	/* force unmapping of temp mapva */
		flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC;
		/*
		 * There is a 1:1 fixed relationship between a pfn
		 * and a page_t VA. The pfn is used as an index into
		 * the ppvm_base page_t table in order to calculate
		 * the page_t base address for a given pfn range.
		 */
		segpp = ppvm_base + base;
	} else {
		seg = memseg_reuse(metapgs);
		reuse = (seg != NULL);
		flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL;
		segpp = pp;
	}

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 *
	 * If a memseg is reused, invalidate memseg pointers in
	 * all cpu vm caches.  We need to do this since the check
	 *	pp >= seg->pages && pp < seg->epages
	 * used in various places is not atomic and so the first compare
	 * can happen before reuse and the second compare after reuse.
	 * The invalidation ensures that a memseg is not dereferenced while
	 * its page/pfn pointers are changing.
	 */
	if (seg == NULL) {
		seg = memseg_alloc();
		ASSERT(seg != NULL);
		seg->msegflags = flags;
		MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		seg->pages = segpp;
	} else {
		ASSERT(seg->msegflags == flags);
		ASSERT(seg->pages_base == seg->pages_end);
		MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p",
		    (void *)seg, (void *)(seg->pages));
		if (meta_alloc) {
			memseg_cpu_vm_flush();
			seg->pages = segpp;
		}
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;

	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready to be freed.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			if (meta_alloc)
				pfn = memseg_get_metapfn(metabase,
				    (pgcnt_t)pnum);
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process has still got a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 *	physmax - highest page frame number.
	 *	physinstalled - number of pages currently installed (done earlier)
	 *	maxmem - max free pages in the system
	 *	physmem - physical memory pages available
	 *	availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);		/* Successfully added system memory */

}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size
 * if size is required.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
 */
struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
	int type;
	struct memseg **segpp, *seg;

	mutex_enter(&memseg_lists_lock);

	segpp = &memseg_va_avail;
	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
		caddr_t end;

		/*
		 * Make sure we are reusing the right segment type.
		 * metapgs != 0 means the caller keeps metadata inline
		 * (MEMSEG_META_INCL); metapgs == 0 means externally
		 * allocated metadata (MEMSEG_META_ALLOC).
		 */
		type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC;

		if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC))
		    != type)
			continue;

		if (kpm_enable)
			end = hat_kpm_mseg_reuse(seg);
		else
			end = (caddr_t)seg->epages;

		/*
		 * Check for the right size if it is provided.
		 */
		if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) {
			/* Unlink from the avail list before returning it. */
			*segpp = seg->lnext;
			seg->lnext = NULL;
			break;
		}
	}
	mutex_exit(&memseg_lists_lock);

	return (seg);
}

/* Monotonic generator for external memhandle_t values (see below). */
static uint_t handle_gen;

/* One contiguous pfn span undergoing add/delete processing. */
struct memdelspan {
	struct memdelspan *mds_next;
	pfn_t		mds_base;
	pgcnt_t		mds_npgs;
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

/* Bits per bitmap word, and bytes needed to cover a span's pages. */
#define	NBPBMW		(sizeof (uint_t) * NBBY)
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))

struct transit_list {
	struct transit_list	*trl_next;
	struct memdelspan	*trl_spans;
	int			trl_collect;
};

struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);

#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
/* Per-handle counters describing progress of a delete operation. */
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */

/* Life-cycle states of a delete handle, in normal order of progression. */
typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t	mh_exthandle;
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

/*
 * Allocate a new handle, link it onto the global list, and return it
 * with mh_mutex held.
 */
static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}

/*
 * Unlink and destroy a handle.  Caller holds mh_mutex and the handle
 * must already be in the MHND_FREE state.
 */
static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
	 */
	ASSERT(mhp->mh_state == MHND_FREE);
	mhp->mh_state = MHND_INIT;
	*xmhp = mhp->mh_exthandle;
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}

/*
 * Return non-zero when the two half-open pfn ranges [b1, b1+l1) and
 * [b2, b2+l2) intersect.
 */
static int
overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
{
	pfn_t e1, e2;

	e1 = b1 + l1;
	e2 = b2 + l2;

	return (!(b2 >= e1 || b1 >= e2));
}

static int can_remove_pgs(pgcnt_t);

/*
 * Intersect [base, base+npgs) with phys_install, returning a newly
 * allocated list of memdelspans covering the installed portions.
 */
static struct memdelspan *
span_to_install(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	uint64_t address, size, thislen;
	struct memlist *mlp;

	mdsp_new = NULL;

	address = (uint64_t)base << PAGESHIFT;
	size = (uint64_t)npgs << PAGESHIFT;
	while (size != 0) {
		memlist_read_lock();
		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
			if (address >= (mlp->address + mlp->size))
				continue;
			if ((address + size) > mlp->address)
				break;
		}
		if (mlp == NULL) {
			address += size;
			size = 0;
			thislen = 0;
		} else {
			if (address < mlp->address) {
				size -= (mlp->address - address);
				address = mlp->address;
			}
			ASSERT(address >= mlp->address);
			if ((address + size) > (mlp->address + mlp->size)) {
				thislen = mlp->size - (address - mlp->address);
			} else {
				thislen = size;
			}
		}
		memlist_read_unlock();
		/* TODO: phys_install could change now */
		if (thislen == 0)
			continue;
		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
		mdsp->mds_base = btop(address);
		mdsp->mds_npgs = btop(thislen);
		mdsp->mds_next = mdsp_new;
		mdsp_new = mdsp;
		address += thislen;
		size -= thislen;
	}
	return (mdsp_new);
}

/* Free an entire list of memdelspans. */
static void
free_delspans(struct memdelspan *mdsp)
{
	struct memdelspan *amdsp;

	while ((amdsp = mdsp) != NULL) {
		mdsp = amdsp->mds_next;
		kmem_free(amdsp, sizeof (struct memdelspan));
	}
}

/*
 * Concatenate lists.
 * No list ordering is required.
 */

static void
delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
{
	while (*mdspp != NULL)
		mdspp = &(*mdspp)->mds_next;

	*mdspp = mdsp;
}

/*
 * Given a new list of delspans, check there is no overlap with
 * all existing span activity (add or delete) and then concatenate
 * the new spans to the given list.
 * Return 1 for OK, 0 if overlapping.
 */
static int
delspan_insert(
	struct transit_list *my_tlp,
	struct memdelspan *mdsp_new)
{
	struct transit_list_head *trh;
	struct transit_list *tlp;
	int ret;

	trh = &transit_list_head;

	ASSERT(my_tlp != NULL);
	ASSERT(mdsp_new != NULL);

	ret = 1;
	mutex_enter(&trh->trh_lock);
	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
		struct memdelspan *mdsp;

		for (mdsp = tlp->trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			struct memdelspan *nmdsp;

			for (nmdsp = mdsp_new; nmdsp != NULL;
			    nmdsp = nmdsp->mds_next) {
				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
				    nmdsp->mds_base, nmdsp->mds_npgs)) {
					ret = 0;
					goto done;
				}
			}
		}
	}
done:
	if (ret != 0) {
		if (my_tlp->trl_spans == NULL)
			transit_list_insert(my_tlp);
		delspan_concat(&my_tlp->trl_spans, mdsp_new);
	}
	mutex_exit(&trh->trh_lock);
	return (ret);
}

/*
 * Remove spans contained in [base, base+npgs) from my_tlp; npgs == 0
 * removes all spans.  The transit list itself is unlinked when it
 * becomes empty.
 */
static void
delspan_remove(
	struct transit_list *my_tlp,
	pfn_t base,
	pgcnt_t npgs)
{
	struct transit_list_head *trh;
	struct memdelspan *mdsp;

	trh = &transit_list_head;

	ASSERT(my_tlp != NULL);

	mutex_enter(&trh->trh_lock);
	if ((mdsp = my_tlp->trl_spans) != NULL) {
		if (npgs == 0) {
			my_tlp->trl_spans = NULL;
			free_delspans(mdsp);
			transit_list_remove(my_tlp);
		} else {
			struct memdelspan **prv;

			prv = &my_tlp->trl_spans;
			while (mdsp != NULL) {
				pfn_t p_end;

				p_end = mdsp->mds_base + mdsp->mds_npgs;
				if (mdsp->mds_base >= base &&
				    p_end <= (base + npgs)) {
					*prv = mdsp->mds_next;
					mdsp->mds_next = NULL;
					free_delspans(mdsp);
				} else {
					prv = &mdsp->mds_next;
				}
				mdsp = *prv;
			}
			if (my_tlp->trl_spans == NULL)
				transit_list_remove(my_tlp);
		}
	}
	mutex_exit(&trh->trh_lock);
}

/*
 * Reserve interface for add to stop delete before add finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the mutex in struct transit_list.
 */

static struct transit_list reserve_transit;

static int
delspan_reserve(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	int ret;

	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
	mdsp->mds_base = base;
	mdsp->mds_npgs = npgs;
	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
		free_delspans(mdsp);
	}
	return (ret);
}

static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}

/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 */
static int
memseg_is_dynamic(struct memseg *seg)
{
	return (seg->msegflags & MEMSEG_DYNAMIC);
}

int
kphysm_del_span(
	memhandle_t handle,
	pfn_t base,
	pgcnt_t npgs)
{
	struct mem_handle *mhp;
	struct memseg *seg;
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	pgcnt_t phys_pages, vm_pages;
	pfn_t p_end;
	page_t *pp;
	int ret;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_INIT) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	}

	/*
	 * Intersect the span with the installed memory list (phys_install).
	 */
	mdsp_new = span_to_install(base, npgs);
	if (mdsp_new == NULL) {
		/*
		 * No physical memory in this range. Is this an
		 * error? If an attempt to start the delete is made
		 * for OK returns from del_span such as this, start will
		 * return an error.
		 * Could return KPHYSM_ENOWORK.
		 */
		/*
		 * It is assumed that there are no error returns
		 * from span_to_install() due to kmem_alloc failure.
		 */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_OK);
	}
	/*
	 * Does this span overlap an existing span?
	 */
	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
		/*
		 * Differentiate between already on list for this handle
		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
		 */
		ret = KPHYSM_EBUSY;
		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
			    base, npgs)) {
				ret = KPHYSM_EDUP;
				break;
			}
		}
		mutex_exit(&mhp->mh_mutex);
		free_delspans(mdsp_new);
		return (ret);
	}
	/*
	 * At this point the spans in mdsp_new have been inserted into the
	 * list of spans for this handle and thereby to the global list of
	 * spans being processed. Each of these spans must now be checked
	 * for relocatability. As a side-effect segments in the memseg list
	 * may be split.
	 *
	 * Note that mdsp_new can no longer be used as it is now part of
	 * a larger list. Select elements of this larger list based
	 * on base and npgs.
	 */
restart:
	phys_pages = 0;
	vm_pages = 0;
	ret = KPHYSM_OK;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pgcnt_t pages_checked;

		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
			continue;
		}
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * The pages_checked count is a hack. All pages should be
		 * checked for relocatability. Those not covered by memsegs
		 * should be tested with arch_kphysm_del_span_ok().
		 */
		pages_checked = 0;
		for (seg = memsegs; seg; seg = seg->next) {
			pfn_t mseg_start;

			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				continue;
			}
			mseg_start = memseg_get_start(seg);
			/* Check that segment is suitable for delete. */
			if (memseg_includes_meta(seg)) {
				/*
				 * Check that this segment is completely
				 * within the span.
				 */
				if (mseg_start < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					ret = KPHYSM_EBUSY;
					break;
				}
				pages_checked += seg->pages_end - mseg_start;
			} else {
				/*
				 * If this segment is larger than the span,
				 * try to split it. After the split, it
				 * is necessary to restart.
				 */
				if (seg->pages_base < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					pfn_t abase;
					pgcnt_t anpgs;
					int s_ret;

					/* Split required.  */
					if (mdsp->mds_base < seg->pages_base)
						abase = seg->pages_base;
					else
						abase = mdsp->mds_base;
					if (p_end > seg->pages_end)
						anpgs = seg->pages_end - abase;
					else
						anpgs = p_end - abase;
					s_ret = kphysm_split_memseg(abase,
					    anpgs);
					if (s_ret == 0) {
						/* Split failed. */
						ret = KPHYSM_ERESOURCE;
						break;
					}
					goto restart;
				}
				pages_checked +=
				    seg->pages_end - seg->pages_base;
			}
			/*
			 * The memseg is wholly within the delete span.
			 * The individual pages can now be checked.
			 */
			/* Cage test. */
			for (pp = seg->pages; pp < seg->epages; pp++) {
				if (PP_ISNORELOC(pp)) {
					ret = KPHYSM_ENONRELOC;
					break;
				}
			}
			if (ret != KPHYSM_OK) {
				break;
			}
			phys_pages += (seg->pages_end - mseg_start);
			vm_pages += MSEG_NPAGES(seg);
		}
		if (ret != KPHYSM_OK)
			break;
		if (pages_checked != mdsp->mds_npgs) {
			ret = KPHYSM_ENONRELOC;
			break;
		}
	}

	if (ret == KPHYSM_OK) {
		mhp->mh_phys_pages += phys_pages;
		mhp->mh_vm_pages += vm_pages;
	} else {
		/*
		 * Keep holding the mh_mutex to prevent it going away.
1249 */ 1250 delspan_remove(&mhp->mh_transit, base, npgs); 1251 } 1252 mutex_exit(&mhp->mh_mutex); 1253 return (ret); 1254 } 1255 1256 int 1257 kphysm_del_span_query( 1258 pfn_t base, 1259 pgcnt_t npgs, 1260 memquery_t *mqp) 1261 { 1262 struct memdelspan *mdsp; 1263 struct memdelspan *mdsp_new; 1264 int done_first_nonreloc; 1265 1266 mqp->phys_pages = 0; 1267 mqp->managed = 0; 1268 mqp->nonrelocatable = 0; 1269 mqp->first_nonrelocatable = 0; 1270 mqp->last_nonrelocatable = 0; 1271 1272 mdsp_new = span_to_install(base, npgs); 1273 /* 1274 * It is OK to proceed here if mdsp_new == NULL. 1275 */ 1276 done_first_nonreloc = 0; 1277 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1278 pfn_t sbase; 1279 pgcnt_t snpgs; 1280 1281 mqp->phys_pages += mdsp->mds_npgs; 1282 sbase = mdsp->mds_base; 1283 snpgs = mdsp->mds_npgs; 1284 while (snpgs != 0) { 1285 struct memseg *lseg, *seg; 1286 pfn_t p_end; 1287 page_t *pp; 1288 pfn_t mseg_start; 1289 1290 p_end = sbase + snpgs; 1291 /* 1292 * Find the lowest addressed memseg that starts 1293 * after sbase and account for it. 1294 * This is to catch dynamic memsegs whose start 1295 * is hidden. 1296 */ 1297 seg = NULL; 1298 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1299 if ((lseg->pages_base >= sbase) || 1300 (lseg->pages_base < p_end && 1301 lseg->pages_end > sbase)) { 1302 if (seg == NULL || 1303 seg->pages_base > lseg->pages_base) 1304 seg = lseg; 1305 } 1306 } 1307 if (seg != NULL) { 1308 mseg_start = memseg_get_start(seg); 1309 /* 1310 * Now have the full extent of the memseg so 1311 * do the range check. 1312 */ 1313 if (mseg_start >= p_end || 1314 seg->pages_end <= sbase) { 1315 /* Span does not overlap memseg. */ 1316 seg = NULL; 1317 } 1318 } 1319 /* 1320 * Account for gap either before the segment if 1321 * there is one or to the end of the span. 1322 */ 1323 if (seg == NULL || mseg_start > sbase) { 1324 pfn_t a_end; 1325 1326 a_end = (seg == NULL) ? 
p_end : mseg_start; 1327 /* 1328 * Check with arch layer for relocatability. 1329 */ 1330 if (arch_kphysm_del_span_ok(sbase, 1331 (a_end - sbase))) { 1332 /* 1333 * No non-relocatble pages in this 1334 * area, avoid the fine-grained 1335 * test. 1336 */ 1337 snpgs -= (a_end - sbase); 1338 sbase = a_end; 1339 } 1340 while (sbase < a_end) { 1341 if (!arch_kphysm_del_span_ok(sbase, 1342 1)) { 1343 mqp->nonrelocatable++; 1344 if (!done_first_nonreloc) { 1345 mqp-> 1346 first_nonrelocatable 1347 = sbase; 1348 done_first_nonreloc = 1; 1349 } 1350 mqp->last_nonrelocatable = 1351 sbase; 1352 } 1353 sbase++; 1354 snpgs--; 1355 } 1356 } 1357 if (seg != NULL) { 1358 ASSERT(mseg_start <= sbase); 1359 if (seg->pages_base != mseg_start && 1360 seg->pages_base > sbase) { 1361 pgcnt_t skip_pgs; 1362 1363 /* 1364 * Skip the page_t area of a 1365 * dynamic memseg. 1366 */ 1367 skip_pgs = seg->pages_base - sbase; 1368 if (snpgs <= skip_pgs) { 1369 sbase += snpgs; 1370 snpgs = 0; 1371 continue; 1372 } 1373 snpgs -= skip_pgs; 1374 sbase += skip_pgs; 1375 } 1376 ASSERT(snpgs != 0); 1377 ASSERT(seg->pages_base <= sbase); 1378 /* 1379 * The individual pages can now be checked. 
1380 */ 1381 for (pp = seg->pages + 1382 (sbase - seg->pages_base); 1383 snpgs != 0 && pp < seg->epages; pp++) { 1384 mqp->managed++; 1385 if (PP_ISNORELOC(pp)) { 1386 mqp->nonrelocatable++; 1387 if (!done_first_nonreloc) { 1388 mqp-> 1389 first_nonrelocatable 1390 = sbase; 1391 done_first_nonreloc = 1; 1392 } 1393 mqp->last_nonrelocatable = 1394 sbase; 1395 } 1396 sbase++; 1397 snpgs--; 1398 } 1399 } 1400 } 1401 } 1402 1403 free_delspans(mdsp_new); 1404 1405 return (KPHYSM_OK); 1406 } 1407 1408 /* 1409 * This release function can be called at any stage as follows: 1410 * _gethandle only called 1411 * _span(s) only called 1412 * _start called but failed 1413 * delete thread exited 1414 */ 1415 int 1416 kphysm_del_release(memhandle_t handle) 1417 { 1418 struct mem_handle *mhp; 1419 1420 mhp = kphysm_lookup_mem_handle(handle); 1421 if (mhp == NULL) { 1422 return (KPHYSM_EHANDLE); 1423 } 1424 switch (mhp->mh_state) { 1425 case MHND_STARTING: 1426 case MHND_RUNNING: 1427 mutex_exit(&mhp->mh_mutex); 1428 return (KPHYSM_ENOTFINISHED); 1429 case MHND_FREE: 1430 ASSERT(mhp->mh_state != MHND_FREE); 1431 mutex_exit(&mhp->mh_mutex); 1432 return (KPHYSM_EHANDLE); 1433 case MHND_INIT: 1434 break; 1435 case MHND_DONE: 1436 break; 1437 case MHND_RELEASE: 1438 mutex_exit(&mhp->mh_mutex); 1439 return (KPHYSM_ESEQUENCE); 1440 default: 1441 #ifdef DEBUG 1442 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1443 (void *)mhp, mhp->mh_state); 1444 #endif /* DEBUG */ 1445 mutex_exit(&mhp->mh_mutex); 1446 return (KPHYSM_EHANDLE); 1447 } 1448 /* 1449 * Set state so that we can wait if necessary. 1450 * Also this means that we have read/write access to all 1451 * fields except mh_exthandle and mh_state. 1452 */ 1453 mhp->mh_state = MHND_RELEASE; 1454 /* 1455 * The mem_handle cannot be de-allocated by any other operation 1456 * now, so no need to hold mh_mutex. 
1457 */ 1458 mutex_exit(&mhp->mh_mutex); 1459 1460 delspan_remove(&mhp->mh_transit, 0, 0); 1461 mhp->mh_phys_pages = 0; 1462 mhp->mh_vm_pages = 0; 1463 mhp->mh_hold_todo = 0; 1464 mhp->mh_delete_complete = NULL; 1465 mhp->mh_delete_complete_arg = NULL; 1466 mhp->mh_cancel = 0; 1467 1468 mutex_enter(&mhp->mh_mutex); 1469 ASSERT(mhp->mh_state == MHND_RELEASE); 1470 mhp->mh_state = MHND_FREE; 1471 1472 kphysm_free_mem_handle(mhp); 1473 1474 return (KPHYSM_OK); 1475 } 1476 1477 /* 1478 * This cancel function can only be called with the thread running. 1479 */ 1480 int 1481 kphysm_del_cancel(memhandle_t handle) 1482 { 1483 struct mem_handle *mhp; 1484 1485 mhp = kphysm_lookup_mem_handle(handle); 1486 if (mhp == NULL) { 1487 return (KPHYSM_EHANDLE); 1488 } 1489 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1490 mutex_exit(&mhp->mh_mutex); 1491 return (KPHYSM_ENOTRUNNING); 1492 } 1493 /* 1494 * Set the cancel flag and wake the delete thread up. 1495 * The thread may be waiting on I/O, so the effect of the cancel 1496 * may be delayed. 1497 */ 1498 if (mhp->mh_cancel == 0) { 1499 mhp->mh_cancel = KPHYSM_ECANCELLED; 1500 cv_signal(&mhp->mh_cv); 1501 } 1502 mutex_exit(&mhp->mh_mutex); 1503 return (KPHYSM_OK); 1504 } 1505 1506 int 1507 kphysm_del_status( 1508 memhandle_t handle, 1509 memdelstat_t *mdstp) 1510 { 1511 struct mem_handle *mhp; 1512 1513 mhp = kphysm_lookup_mem_handle(handle); 1514 if (mhp == NULL) { 1515 return (KPHYSM_EHANDLE); 1516 } 1517 /* 1518 * Calling kphysm_del_status() is allowed before the delete 1519 * is started to allow for status display. 
1520 */ 1521 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1522 mhp->mh_state != MHND_RUNNING) { 1523 mutex_exit(&mhp->mh_mutex); 1524 return (KPHYSM_ENOTRUNNING); 1525 } 1526 mdstp->phys_pages = mhp->mh_phys_pages; 1527 mdstp->managed = mhp->mh_vm_pages; 1528 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1529 mutex_exit(&mhp->mh_mutex); 1530 return (KPHYSM_OK); 1531 } 1532 1533 static int mem_delete_additional_pages = 100; 1534 1535 static int 1536 can_remove_pgs(pgcnt_t npgs) 1537 { 1538 /* 1539 * If all pageable pages were paged out, freemem would 1540 * equal availrmem. There is a minimum requirement for 1541 * availrmem. 1542 */ 1543 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1544 < npgs) 1545 return (0); 1546 /* TODO: check swap space, etc. */ 1547 return (1); 1548 } 1549 1550 static int 1551 get_availrmem(pgcnt_t npgs) 1552 { 1553 int ret; 1554 1555 mutex_enter(&freemem_lock); 1556 ret = can_remove_pgs(npgs); 1557 if (ret != 0) 1558 availrmem -= npgs; 1559 mutex_exit(&freemem_lock); 1560 return (ret); 1561 } 1562 1563 static void 1564 put_availrmem(pgcnt_t npgs) 1565 { 1566 mutex_enter(&freemem_lock); 1567 availrmem += npgs; 1568 mutex_exit(&freemem_lock); 1569 } 1570 1571 #define FREEMEM_INCR 100 1572 static pgcnt_t freemem_incr = FREEMEM_INCR; 1573 #define DEL_FREE_WAIT_FRAC 4 1574 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1575 1576 #define DEL_BUSY_WAIT_FRAC 20 1577 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1578 1579 static void kphysm_del_cleanup(struct mem_handle *); 1580 1581 static void page_delete_collect(page_t *, struct mem_handle *); 1582 1583 static pgcnt_t 1584 delthr_get_freemem(struct mem_handle *mhp) 1585 { 1586 pgcnt_t free_get; 1587 int ret; 1588 1589 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1590 1591 MDSTAT_INCR(mhp, need_free); 1592 /* 1593 * Get up to freemem_incr pages. 
1594 */ 1595 free_get = freemem_incr; 1596 if (free_get > mhp->mh_hold_todo) 1597 free_get = mhp->mh_hold_todo; 1598 /* 1599 * Take free_get pages away from freemem, 1600 * waiting if necessary. 1601 */ 1602 1603 while (!mhp->mh_cancel) { 1604 mutex_exit(&mhp->mh_mutex); 1605 MDSTAT_INCR(mhp, free_loop); 1606 /* 1607 * Duplicate test from page_create_throttle() 1608 * but don't override with !PG_WAIT. 1609 */ 1610 if (freemem < (free_get + throttlefree)) { 1611 MDSTAT_INCR(mhp, free_low); 1612 ret = 0; 1613 } else { 1614 ret = page_create_wait(free_get, 0); 1615 if (ret == 0) { 1616 /* EMPTY */ 1617 MDSTAT_INCR(mhp, free_failed); 1618 } 1619 } 1620 if (ret != 0) { 1621 mutex_enter(&mhp->mh_mutex); 1622 return (free_get); 1623 } 1624 1625 /* 1626 * Put pressure on pageout. 1627 */ 1628 page_needfree(free_get); 1629 cv_signal(&proc_pageout->p_cv); 1630 1631 mutex_enter(&mhp->mh_mutex); 1632 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1633 (lbolt + DEL_FREE_WAIT_TICKS)); 1634 mutex_exit(&mhp->mh_mutex); 1635 page_needfree(-(spgcnt_t)free_get); 1636 1637 mutex_enter(&mhp->mh_mutex); 1638 } 1639 return (0); 1640 } 1641 1642 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1643 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1644 /* 1645 * This function is run as a helper thread for delete_memory_thread. 1646 * It is needed in order to force kaio cleanup, so that pages used in kaio 1647 * will be unlocked and subsequently relocated by delete_memory_thread. 1648 * The address of the delete_memory_threads's mem_handle is passed in to 1649 * this thread function, and is used to set the mh_aio_cleanup_done member 1650 * prior to calling thread_exit(). 
1651 */ 1652 static void 1653 dr_aio_cleanup_thread(caddr_t amhp) 1654 { 1655 proc_t *procp; 1656 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1657 int cleaned; 1658 int n = 0; 1659 struct mem_handle *mhp; 1660 volatile uint_t *pcancel; 1661 1662 mhp = (struct mem_handle *)amhp; 1663 ASSERT(mhp != NULL); 1664 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1665 if (modload("sys", "kaio") == -1) { 1666 mhp->mh_aio_cleanup_done = 1; 1667 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1668 thread_exit(); 1669 } 1670 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1671 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1672 if (aio_cleanup_dr_delete_memory == NULL) { 1673 mhp->mh_aio_cleanup_done = 1; 1674 cmn_err(CE_WARN, 1675 "aio_cleanup_dr_delete_memory not found in kaio"); 1676 thread_exit(); 1677 } 1678 do { 1679 cleaned = 0; 1680 mutex_enter(&pidlock); 1681 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1682 procp = procp->p_next) { 1683 mutex_enter(&procp->p_lock); 1684 if (procp->p_aio != NULL) { 1685 /* cleanup proc's outstanding kaio */ 1686 cleaned += 1687 (*aio_cleanup_dr_delete_memory)(procp); 1688 } 1689 mutex_exit(&procp->p_lock); 1690 } 1691 mutex_exit(&pidlock); 1692 if ((*pcancel == 0) && 1693 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1694 /* delay a bit before retrying all procs again */ 1695 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1696 n = 0; 1697 } 1698 } while (*pcancel == 0); 1699 mhp->mh_aio_cleanup_done = 1; 1700 thread_exit(); 1701 } 1702 1703 static void 1704 delete_memory_thread(caddr_t amhp) 1705 { 1706 struct mem_handle *mhp; 1707 struct memdelspan *mdsp; 1708 callb_cpr_t cprinfo; 1709 page_t *pp_targ; 1710 spgcnt_t freemem_left; 1711 void (*del_complete_funcp)(void *, int error); 1712 void *del_complete_arg; 1713 int comp_code; 1714 int ret; 1715 int first_scan; 1716 uint_t szc; 1717 #ifdef MEM_DEL_STATS 1718 uint64_t start_total, ntick_total; 1719 uint64_t start_pgrp, ntick_pgrp; 1720 #endif 
/* MEM_DEL_STATS */ 1721 1722 mhp = (struct mem_handle *)amhp; 1723 1724 #ifdef MEM_DEL_STATS 1725 start_total = ddi_get_lbolt(); 1726 #endif /* MEM_DEL_STATS */ 1727 1728 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1729 callb_generic_cpr, "memdel"); 1730 1731 mutex_enter(&mhp->mh_mutex); 1732 ASSERT(mhp->mh_state == MHND_STARTING); 1733 1734 mhp->mh_state = MHND_RUNNING; 1735 mhp->mh_thread_id = curthread; 1736 1737 mhp->mh_hold_todo = mhp->mh_vm_pages; 1738 mutex_exit(&mhp->mh_mutex); 1739 1740 /* Allocate the remap pages now, if necessary. */ 1741 memseg_remap_init(); 1742 1743 /* 1744 * Subtract from availrmem now if possible as availrmem 1745 * may not be available by the end of the delete. 1746 */ 1747 if (!get_availrmem(mhp->mh_vm_pages)) { 1748 comp_code = KPHYSM_ENOTVIABLE; 1749 mutex_enter(&mhp->mh_mutex); 1750 goto early_exit; 1751 } 1752 1753 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1754 1755 mutex_enter(&mhp->mh_mutex); 1756 1757 if (ret != 0) { 1758 mhp->mh_cancel = KPHYSM_EREFUSED; 1759 goto refused; 1760 } 1761 1762 transit_list_collect(mhp, 1); 1763 1764 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1765 mdsp = mdsp->mds_next) { 1766 ASSERT(mdsp->mds_bitmap == NULL); 1767 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1768 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1769 KM_SLEEP); 1770 } 1771 1772 first_scan = 1; 1773 freemem_left = 0; 1774 /* 1775 * Start dr_aio_cleanup_thread, which periodically iterates 1776 * through the process list and invokes aio cleanup. This 1777 * is needed in order to avoid a deadly embrace between the 1778 * delete_memory_thread (waiting on writer lock for page, with the 1779 * exclusive-wanted bit set), kaio read request threads (waiting for a 1780 * reader lock on the same page that is wanted by the 1781 * delete_memory_thread), and threads waiting for kaio completion 1782 * (blocked on spt_amp->lock). 
1783 */ 1784 mhp->mh_dr_aio_cleanup_cancel = 0; 1785 mhp->mh_aio_cleanup_done = 0; 1786 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1787 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1788 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1789 pgcnt_t collected; 1790 1791 MDSTAT_INCR(mhp, nloop); 1792 collected = 0; 1793 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1794 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1795 pfn_t pfn, p_end; 1796 1797 p_end = mdsp->mds_base + mdsp->mds_npgs; 1798 for (pfn = mdsp->mds_base; (pfn < p_end) && 1799 (mhp->mh_cancel == 0); pfn++) { 1800 page_t *pp, *tpp, *tpp_targ; 1801 pgcnt_t bit; 1802 struct vnode *vp; 1803 u_offset_t offset; 1804 int mod, result; 1805 spgcnt_t pgcnt; 1806 1807 bit = pfn - mdsp->mds_base; 1808 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1809 (1 << (bit % NBPBMW))) != 0) { 1810 MDSTAT_INCR(mhp, already_done); 1811 continue; 1812 } 1813 if (freemem_left == 0) { 1814 freemem_left += delthr_get_freemem(mhp); 1815 if (freemem_left == 0) 1816 break; 1817 } 1818 1819 /* 1820 * Release mh_mutex - some of this 1821 * stuff takes some time (eg PUTPAGE). 1822 */ 1823 1824 mutex_exit(&mhp->mh_mutex); 1825 MDSTAT_INCR(mhp, ncheck); 1826 1827 pp = page_numtopp_nolock(pfn); 1828 if (pp == NULL) { 1829 /* 1830 * Not covered by a page_t - will 1831 * be dealt with elsewhere. 1832 */ 1833 MDSTAT_INCR(mhp, nopaget); 1834 mutex_enter(&mhp->mh_mutex); 1835 mdsp->mds_bitmap[bit / NBPBMW] |= 1836 (1 << (bit % NBPBMW)); 1837 continue; 1838 } 1839 1840 if (!page_try_reclaim_lock(pp, SE_EXCL, 1841 SE_EXCL_WANTED | SE_RETIRED)) { 1842 /* 1843 * Page in use elsewhere. Skip it. 1844 */ 1845 MDSTAT_INCR(mhp, lockfail); 1846 mutex_enter(&mhp->mh_mutex); 1847 continue; 1848 } 1849 /* 1850 * See if the cage expanded into the delete. 1851 * This can happen as we have to allow the 1852 * cage to expand. 
1853 */ 1854 if (PP_ISNORELOC(pp)) { 1855 page_unlock(pp); 1856 mutex_enter(&mhp->mh_mutex); 1857 mhp->mh_cancel = KPHYSM_ENONRELOC; 1858 break; 1859 } 1860 if (PP_RETIRED(pp)) { 1861 /* 1862 * Page has been retired and is 1863 * not part of the cage so we 1864 * can now do the accounting for 1865 * it. 1866 */ 1867 MDSTAT_INCR(mhp, retired); 1868 mutex_enter(&mhp->mh_mutex); 1869 mdsp->mds_bitmap[bit / NBPBMW] 1870 |= (1 << (bit % NBPBMW)); 1871 mdsp->mds_bitmap_retired[bit / 1872 NBPBMW] |= 1873 (1 << (bit % NBPBMW)); 1874 mhp->mh_hold_todo--; 1875 continue; 1876 } 1877 ASSERT(freemem_left != 0); 1878 if (PP_ISFREE(pp)) { 1879 /* 1880 * Like page_reclaim() only 'freemem' 1881 * processing is already done. 1882 */ 1883 MDSTAT_INCR(mhp, nfree); 1884 free_page_collect: 1885 if (PP_ISAGED(pp)) { 1886 page_list_sub(pp, 1887 PG_FREE_LIST); 1888 } else { 1889 page_list_sub(pp, 1890 PG_CACHE_LIST); 1891 } 1892 PP_CLRFREE(pp); 1893 PP_CLRAGED(pp); 1894 collected++; 1895 mutex_enter(&mhp->mh_mutex); 1896 page_delete_collect(pp, mhp); 1897 mdsp->mds_bitmap[bit / NBPBMW] |= 1898 (1 << (bit % NBPBMW)); 1899 freemem_left--; 1900 continue; 1901 } 1902 ASSERT(pp->p_vnode != NULL); 1903 if (first_scan) { 1904 MDSTAT_INCR(mhp, first_notfree); 1905 page_unlock(pp); 1906 mutex_enter(&mhp->mh_mutex); 1907 continue; 1908 } 1909 /* 1910 * Keep stats on pages encountered that 1911 * are marked for retirement. 1912 */ 1913 if (PP_TOXIC(pp)) { 1914 MDSTAT_INCR(mhp, toxic); 1915 } else if (PP_PR_REQ(pp)) { 1916 MDSTAT_INCR(mhp, failing); 1917 } 1918 /* 1919 * In certain cases below, special exceptions 1920 * are made for pages that are toxic. This 1921 * is because the current meaning of toxic 1922 * is that an uncorrectable error has been 1923 * previously associated with the page. 1924 */ 1925 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1926 if (!PP_TOXIC(pp)) { 1927 /* 1928 * Must relocate locked in 1929 * memory pages. 
1930 */ 1931 #ifdef MEM_DEL_STATS 1932 start_pgrp = ddi_get_lbolt(); 1933 #endif /* MEM_DEL_STATS */ 1934 /* 1935 * Lock all constituent pages 1936 * of a large page to ensure 1937 * that p_szc won't change. 1938 */ 1939 if (!group_page_trylock(pp, 1940 SE_EXCL)) { 1941 MDSTAT_INCR(mhp, 1942 gptllckfail); 1943 page_unlock(pp); 1944 mutex_enter( 1945 &mhp->mh_mutex); 1946 continue; 1947 } 1948 MDSTAT_INCR(mhp, npplocked); 1949 pp_targ = 1950 page_get_replacement_page( 1951 pp, NULL, 0); 1952 if (pp_targ != NULL) { 1953 #ifdef MEM_DEL_STATS 1954 ntick_pgrp = 1955 (uint64_t) 1956 ddi_get_lbolt() - 1957 start_pgrp; 1958 #endif /* MEM_DEL_STATS */ 1959 MDSTAT_PGRP(mhp, 1960 ntick_pgrp); 1961 MDSTAT_INCR(mhp, 1962 nlockreloc); 1963 goto reloc; 1964 } 1965 group_page_unlock(pp); 1966 page_unlock(pp); 1967 #ifdef MEM_DEL_STATS 1968 ntick_pgrp = 1969 (uint64_t)ddi_get_lbolt() - 1970 start_pgrp; 1971 #endif /* MEM_DEL_STATS */ 1972 MDSTAT_PGRP(mhp, ntick_pgrp); 1973 MDSTAT_INCR(mhp, nnorepl); 1974 mutex_enter(&mhp->mh_mutex); 1975 continue; 1976 } else { 1977 /* 1978 * Cannot do anything about 1979 * this page because it is 1980 * toxic. 1981 */ 1982 MDSTAT_INCR(mhp, npplkdtoxic); 1983 page_unlock(pp); 1984 mutex_enter(&mhp->mh_mutex); 1985 continue; 1986 } 1987 } 1988 /* 1989 * Unload the mappings and check if mod bit 1990 * is set. 1991 */ 1992 ASSERT(!PP_ISKAS(pp)); 1993 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1994 mod = hat_ismod(pp); 1995 1996 #ifdef MEM_DEL_STATS 1997 start_pgrp = ddi_get_lbolt(); 1998 #endif /* MEM_DEL_STATS */ 1999 if (mod && !PP_TOXIC(pp)) { 2000 /* 2001 * Lock all constituent pages 2002 * of a large page to ensure 2003 * that p_szc won't change. 
2004 */ 2005 if (!group_page_trylock(pp, SE_EXCL)) { 2006 MDSTAT_INCR(mhp, gptlmodfail); 2007 page_unlock(pp); 2008 mutex_enter(&mhp->mh_mutex); 2009 continue; 2010 } 2011 pp_targ = page_get_replacement_page(pp, 2012 NULL, 0); 2013 if (pp_targ != NULL) { 2014 MDSTAT_INCR(mhp, nmodreloc); 2015 #ifdef MEM_DEL_STATS 2016 ntick_pgrp = 2017 (uint64_t)ddi_get_lbolt() - 2018 start_pgrp; 2019 #endif /* MEM_DEL_STATS */ 2020 MDSTAT_PGRP(mhp, ntick_pgrp); 2021 goto reloc; 2022 } 2023 group_page_unlock(pp); 2024 } 2025 2026 if (!page_try_demote_pages(pp)) { 2027 MDSTAT_INCR(mhp, demotefail); 2028 page_unlock(pp); 2029 #ifdef MEM_DEL_STATS 2030 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2031 start_pgrp; 2032 #endif /* MEM_DEL_STATS */ 2033 MDSTAT_PGRP(mhp, ntick_pgrp); 2034 mutex_enter(&mhp->mh_mutex); 2035 continue; 2036 } 2037 2038 /* 2039 * Regular 'page-out'. 2040 */ 2041 if (!mod) { 2042 MDSTAT_INCR(mhp, ndestroy); 2043 page_destroy(pp, 1); 2044 /* 2045 * page_destroy was called with 2046 * dontfree. As long as p_lckcnt 2047 * and p_cowcnt are both zero, the 2048 * only additional action of 2049 * page_destroy with !dontfree is to 2050 * call page_free, so we can collect 2051 * the page here. 2052 */ 2053 collected++; 2054 #ifdef MEM_DEL_STATS 2055 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2056 start_pgrp; 2057 #endif /* MEM_DEL_STATS */ 2058 MDSTAT_PGRP(mhp, ntick_pgrp); 2059 mutex_enter(&mhp->mh_mutex); 2060 page_delete_collect(pp, mhp); 2061 mdsp->mds_bitmap[bit / NBPBMW] |= 2062 (1 << (bit % NBPBMW)); 2063 continue; 2064 } 2065 /* 2066 * The page is toxic and the mod bit is 2067 * set, we cannot do anything here to deal 2068 * with it. 
2069 */ 2070 if (PP_TOXIC(pp)) { 2071 page_unlock(pp); 2072 #ifdef MEM_DEL_STATS 2073 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2074 start_pgrp; 2075 #endif /* MEM_DEL_STATS */ 2076 MDSTAT_PGRP(mhp, ntick_pgrp); 2077 MDSTAT_INCR(mhp, modtoxic); 2078 mutex_enter(&mhp->mh_mutex); 2079 continue; 2080 } 2081 MDSTAT_INCR(mhp, nputpage); 2082 vp = pp->p_vnode; 2083 offset = pp->p_offset; 2084 VN_HOLD(vp); 2085 page_unlock(pp); 2086 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2087 B_INVAL|B_FORCE, kcred, NULL); 2088 VN_RELE(vp); 2089 #ifdef MEM_DEL_STATS 2090 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2091 start_pgrp; 2092 #endif /* MEM_DEL_STATS */ 2093 MDSTAT_PGRP(mhp, ntick_pgrp); 2094 /* 2095 * Try to get the page back immediately 2096 * so that it can be collected. 2097 */ 2098 pp = page_numtopp_nolock(pfn); 2099 if (pp == NULL) { 2100 MDSTAT_INCR(mhp, nnoreclaim); 2101 /* 2102 * This should not happen as this 2103 * thread is deleting the page. 2104 * If this code is generalized, this 2105 * becomes a reality. 2106 */ 2107 #ifdef DEBUG 2108 cmn_err(CE_WARN, 2109 "delete_memory_thread(0x%p) " 2110 "pfn 0x%lx has no page_t", 2111 (void *)mhp, pfn); 2112 #endif /* DEBUG */ 2113 mutex_enter(&mhp->mh_mutex); 2114 continue; 2115 } 2116 if (page_try_reclaim_lock(pp, SE_EXCL, 2117 SE_EXCL_WANTED | SE_RETIRED)) { 2118 if (PP_ISFREE(pp)) { 2119 goto free_page_collect; 2120 } 2121 page_unlock(pp); 2122 } 2123 MDSTAT_INCR(mhp, nnoreclaim); 2124 mutex_enter(&mhp->mh_mutex); 2125 continue; 2126 2127 reloc: 2128 /* 2129 * Got some freemem and a target 2130 * page, so move the data to avoid 2131 * I/O and lock problems. 2132 */ 2133 ASSERT(!page_iolock_assert(pp)); 2134 MDSTAT_INCR(mhp, nreloc); 2135 /* 2136 * page_relocate() will return pgcnt: the 2137 * number of consecutive pages relocated. 2138 * If it is successful, pp will be a 2139 * linked list of the page structs that 2140 * were relocated. If page_relocate() is 2141 * unsuccessful, pp will be unmodified. 
2142 */ 2143 #ifdef MEM_DEL_STATS 2144 start_pgrp = ddi_get_lbolt(); 2145 #endif /* MEM_DEL_STATS */ 2146 result = page_relocate(&pp, &pp_targ, 0, 0, 2147 &pgcnt, NULL); 2148 #ifdef MEM_DEL_STATS 2149 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2150 start_pgrp; 2151 #endif /* MEM_DEL_STATS */ 2152 MDSTAT_PGRP(mhp, ntick_pgrp); 2153 if (result != 0) { 2154 MDSTAT_INCR(mhp, nrelocfail); 2155 /* 2156 * We did not succeed. We need 2157 * to give the pp_targ pages back. 2158 * page_free(pp_targ, 1) without 2159 * the freemem accounting. 2160 */ 2161 group_page_unlock(pp); 2162 page_free_replacement_page(pp_targ); 2163 page_unlock(pp); 2164 mutex_enter(&mhp->mh_mutex); 2165 continue; 2166 } 2167 2168 /* 2169 * We will then collect pgcnt pages. 2170 */ 2171 ASSERT(pgcnt > 0); 2172 mutex_enter(&mhp->mh_mutex); 2173 /* 2174 * We need to make sure freemem_left is 2175 * large enough. 2176 */ 2177 while ((freemem_left < pgcnt) && 2178 (!mhp->mh_cancel)) { 2179 freemem_left += 2180 delthr_get_freemem(mhp); 2181 } 2182 2183 /* 2184 * Do not proceed if mh_cancel is set. 2185 */ 2186 if (mhp->mh_cancel) { 2187 while (pp_targ != NULL) { 2188 /* 2189 * Unlink and unlock each page. 2190 */ 2191 tpp_targ = pp_targ; 2192 page_sub(&pp_targ, tpp_targ); 2193 page_unlock(tpp_targ); 2194 } 2195 /* 2196 * We need to give the pp pages back. 2197 * page_free(pp, 1) without the 2198 * freemem accounting. 2199 */ 2200 page_free_replacement_page(pp); 2201 break; 2202 } 2203 2204 /* Now remove pgcnt from freemem_left */ 2205 freemem_left -= pgcnt; 2206 ASSERT(freemem_left >= 0); 2207 szc = pp->p_szc; 2208 while (pp != NULL) { 2209 /* 2210 * pp and pp_targ were passed back as 2211 * a linked list of pages. 2212 * Unlink and unlock each page. 2213 */ 2214 tpp_targ = pp_targ; 2215 page_sub(&pp_targ, tpp_targ); 2216 page_unlock(tpp_targ); 2217 /* 2218 * The original page is now free 2219 * so remove it from the linked 2220 * list and collect it. 
2221 */ 2222 tpp = pp; 2223 page_sub(&pp, tpp); 2224 pfn = page_pptonum(tpp); 2225 collected++; 2226 ASSERT(PAGE_EXCL(tpp)); 2227 ASSERT(tpp->p_vnode == NULL); 2228 ASSERT(!hat_page_is_mapped(tpp)); 2229 ASSERT(tpp->p_szc == szc); 2230 tpp->p_szc = 0; 2231 page_delete_collect(tpp, mhp); 2232 bit = pfn - mdsp->mds_base; 2233 mdsp->mds_bitmap[bit / NBPBMW] |= 2234 (1 << (bit % NBPBMW)); 2235 } 2236 ASSERT(pp_targ == NULL); 2237 } 2238 } 2239 first_scan = 0; 2240 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2241 (collected == 0)) { 2242 /* 2243 * This code is needed as we cannot wait 2244 * for a page to be locked OR the delete to 2245 * be cancelled. Also, we must delay so 2246 * that other threads get a chance to run 2247 * on our cpu, otherwise page locks may be 2248 * held indefinitely by those threads. 2249 */ 2250 MDSTAT_INCR(mhp, ndelay); 2251 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2252 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 2253 (lbolt + DEL_BUSY_WAIT_TICKS)); 2254 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2255 } 2256 } 2257 /* stop the dr aio cleanup thread */ 2258 mhp->mh_dr_aio_cleanup_cancel = 1; 2259 transit_list_collect(mhp, 0); 2260 if (freemem_left != 0) { 2261 /* Return any surplus. */ 2262 page_create_putback(freemem_left); 2263 freemem_left = 0; 2264 } 2265 #ifdef MEM_DEL_STATS 2266 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2267 #endif /* MEM_DEL_STATS */ 2268 MDSTAT_TOTAL(mhp, ntick_total); 2269 MDSTAT_PRINT(mhp); 2270 2271 /* 2272 * If the memory delete was cancelled, exclusive-wanted bits must 2273 * be cleared. If there are retired pages being deleted, they need 2274 * to be unretired. 
2275 */ 2276 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2277 mdsp = mdsp->mds_next) { 2278 pfn_t pfn, p_end; 2279 2280 p_end = mdsp->mds_base + mdsp->mds_npgs; 2281 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2282 page_t *pp; 2283 pgcnt_t bit; 2284 2285 bit = pfn - mdsp->mds_base; 2286 if (mhp->mh_cancel) { 2287 pp = page_numtopp_nolock(pfn); 2288 if (pp != NULL) { 2289 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2290 (1 << (bit % NBPBMW))) == 0) { 2291 page_lock_clr_exclwanted(pp); 2292 } 2293 } 2294 } else { 2295 pp = NULL; 2296 } 2297 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2298 (1 << (bit % NBPBMW))) != 0) { 2299 /* do we already have pp? */ 2300 if (pp == NULL) { 2301 pp = page_numtopp_nolock(pfn); 2302 } 2303 ASSERT(pp != NULL); 2304 ASSERT(PP_RETIRED(pp)); 2305 if (mhp->mh_cancel != 0) { 2306 page_unlock(pp); 2307 /* 2308 * To satisfy ASSERT below in 2309 * cancel code. 2310 */ 2311 mhp->mh_hold_todo++; 2312 } else { 2313 (void) page_unretire_pp(pp, 2314 PR_UNR_CLEAN); 2315 } 2316 } 2317 } 2318 } 2319 /* 2320 * Free retired page bitmap and collected page bitmap 2321 */ 2322 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2323 mdsp = mdsp->mds_next) { 2324 ASSERT(mdsp->mds_bitmap_retired != NULL); 2325 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2326 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2327 ASSERT(mdsp->mds_bitmap != NULL); 2328 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2329 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2330 } 2331 2332 /* wait for our dr aio cancel thread to exit */ 2333 while (!(mhp->mh_aio_cleanup_done)) { 2334 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2335 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2336 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2337 } 2338 refused: 2339 if (mhp->mh_cancel != 0) { 2340 page_t *pp; 2341 2342 comp_code = mhp->mh_cancel; 2343 /* 2344 * Go through list of deleted pages (mh_deleted) freeing 2345 * them. 
2346 */ 2347 while ((pp = mhp->mh_deleted) != NULL) { 2348 mhp->mh_deleted = pp->p_next; 2349 mhp->mh_hold_todo++; 2350 mutex_exit(&mhp->mh_mutex); 2351 /* Restore p_next. */ 2352 pp->p_next = pp->p_prev; 2353 if (PP_ISFREE(pp)) { 2354 cmn_err(CE_PANIC, 2355 "page %p is free", 2356 (void *)pp); 2357 } 2358 page_free(pp, 1); 2359 mutex_enter(&mhp->mh_mutex); 2360 } 2361 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2362 2363 mutex_exit(&mhp->mh_mutex); 2364 put_availrmem(mhp->mh_vm_pages); 2365 mutex_enter(&mhp->mh_mutex); 2366 2367 goto t_exit; 2368 } 2369 2370 /* 2371 * All the pages are no longer in use and are exclusively locked. 2372 */ 2373 2374 mhp->mh_deleted = NULL; 2375 2376 kphysm_del_cleanup(mhp); 2377 2378 /* 2379 * mem_node_del_range needs to be after kphysm_del_cleanup so 2380 * that the mem_node_config[] will remain intact for the cleanup. 2381 */ 2382 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2383 mdsp = mdsp->mds_next) { 2384 mem_node_del_range(mdsp->mds_base, 2385 mdsp->mds_base + mdsp->mds_npgs - 1); 2386 } 2387 2388 comp_code = KPHYSM_OK; 2389 2390 t_exit: 2391 mutex_exit(&mhp->mh_mutex); 2392 kphysm_setup_post_del(mhp->mh_vm_pages, 2393 (comp_code == KPHYSM_OK) ? 0 : 1); 2394 mutex_enter(&mhp->mh_mutex); 2395 2396 early_exit: 2397 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2398 mhp->mh_state = MHND_DONE; 2399 del_complete_funcp = mhp->mh_delete_complete; 2400 del_complete_arg = mhp->mh_delete_complete_arg; 2401 CALLB_CPR_EXIT(&cprinfo); 2402 (*del_complete_funcp)(del_complete_arg, comp_code); 2403 thread_exit(); 2404 /*NOTREACHED*/ 2405 } 2406 2407 /* 2408 * Start the delete of the memory from the system. 
2409 */ 2410 int 2411 kphysm_del_start( 2412 memhandle_t handle, 2413 void (*complete)(void *, int), 2414 void *complete_arg) 2415 { 2416 struct mem_handle *mhp; 2417 2418 mhp = kphysm_lookup_mem_handle(handle); 2419 if (mhp == NULL) { 2420 return (KPHYSM_EHANDLE); 2421 } 2422 switch (mhp->mh_state) { 2423 case MHND_FREE: 2424 ASSERT(mhp->mh_state != MHND_FREE); 2425 mutex_exit(&mhp->mh_mutex); 2426 return (KPHYSM_EHANDLE); 2427 case MHND_INIT: 2428 break; 2429 case MHND_STARTING: 2430 case MHND_RUNNING: 2431 mutex_exit(&mhp->mh_mutex); 2432 return (KPHYSM_ESEQUENCE); 2433 case MHND_DONE: 2434 mutex_exit(&mhp->mh_mutex); 2435 return (KPHYSM_ESEQUENCE); 2436 case MHND_RELEASE: 2437 mutex_exit(&mhp->mh_mutex); 2438 return (KPHYSM_ESEQUENCE); 2439 default: 2440 #ifdef DEBUG 2441 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2442 (void *)mhp, mhp->mh_state); 2443 #endif /* DEBUG */ 2444 mutex_exit(&mhp->mh_mutex); 2445 return (KPHYSM_EHANDLE); 2446 } 2447 2448 if (mhp->mh_transit.trl_spans == NULL) { 2449 mutex_exit(&mhp->mh_mutex); 2450 return (KPHYSM_ENOWORK); 2451 } 2452 2453 ASSERT(complete != NULL); 2454 mhp->mh_delete_complete = complete; 2455 mhp->mh_delete_complete_arg = complete_arg; 2456 mhp->mh_state = MHND_STARTING; 2457 /* 2458 * Release the mutex in case thread_create sleeps. 2459 */ 2460 mutex_exit(&mhp->mh_mutex); 2461 2462 /* 2463 * The "obvious" process for this thread is pageout (proc_pageout) 2464 * but this gives the thread too much power over freemem 2465 * which results in freemem starvation. 2466 */ 2467 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2468 TS_RUN, maxclsyspri - 1); 2469 2470 return (KPHYSM_OK); 2471 } 2472 2473 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2474 static caddr_t pp_dummy; 2475 static pgcnt_t pp_dummy_npages; 2476 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. 
 */

/*
 * Initialize the page_t's in [pages, epages) to a 'deleted' state:
 * invalid pfn, invalid offset, and a deleted page lock so that
 * page_lock() on them will not block (see memseg_remap_init below).
 */
static void
memseg_remap_init_pages(page_t *pages, page_t *epages)
{
	page_t *pp;

	for (pp = pages; pp < epages; pp++) {
		pp->p_pagenum = PFN_INVALID;	/* XXXX */
		pp->p_offset = (u_offset_t)-1;
		page_iolock_init(pp);
		/* Take the exclusive lock before marking the page deleted. */
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		page_lock_delete(pp);
	}
}

/*
 * One-time setup of the dummy pages that deleted memsegs' page_t
 * metadata gets remapped to.  Idempotent: serialized by pp_dummy_lock,
 * with pp_dummy != NULL serving as the "already initialized" flag.
 */
void
memseg_remap_init()
{
	mutex_enter(&pp_dummy_lock);
	if (pp_dummy == NULL) {
		uint_t dpages;
		int i;

		/*
		 * dpages starts off as the size of the structure and
		 * ends up as the minimum number of pages that will
		 * hold a whole number of page_t structures.
		 */
		dpages = sizeof (page_t);
		ASSERT(dpages != 0);
		ASSERT(dpages <= MMU_PAGESIZE);

		/* Strip factors of two, leaving the odd factor. */
		while ((dpages & 1) == 0)
			dpages >>= 1;

		pp_dummy_npages = dpages;
		/*
		 * Allocate pp_dummy pages directly from static_arena,
		 * since these are whole page allocations and are
		 * referenced by physical address. This also has the
		 * nice fringe benefit of hiding the memory from
		 * ::findleaks since it doesn't deal well with allocated
		 * kernel heap memory that doesn't have any mappings.
		 */
		pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages),
		    PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP);
		bzero(pp_dummy, ptob(pp_dummy_npages));
		ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0);
		/* Record the backing pfn of each dummy page. */
		pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) *
		    pp_dummy_npages, KM_SLEEP);
		for (i = 0; i < pp_dummy_npages; i++) {
			pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat,
			    &pp_dummy[MMU_PAGESIZE * i]);
			ASSERT(pp_dummy_pfn[i] != PFN_INVALID);
		}
		/*
		 * Initialize the page_t's to a known 'deleted' state
		 * that matches the state of deleted pages.
		 */
		memseg_remap_init_pages((page_t *)pp_dummy,
		    (page_t *)(pp_dummy + ptob(pp_dummy_npages)));
		/* Remove kmem mappings for the pages for safety. */
		hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages),
		    HAT_UNLOAD_UNLOCK);
		/* Leave pp_dummy pointer set as flag that init is done. */
	}
	mutex_exit(&pp_dummy_lock);
}

/*
 * Remap a page-aligned range of page_t's to dummy pages.
 */
void
remap_to_dummy(caddr_t va, pgcnt_t metapgs)
{
	int phase;

	ASSERT(IS_P2ALIGNED((uint64_t)va, PAGESIZE));

	/*
	 * We may start remapping at a non-zero page offset
	 * within the dummy pages since the low/high ends
	 * of the outgoing pp's could be shared by other
	 * memsegs (see memseg_remap_meta).
	 */
	phase = btop((uint64_t)va) % pp_dummy_npages;
	ASSERT(PAGESIZE % sizeof (page_t) || phase == 0);

	while (metapgs != 0) {
		pgcnt_t n;
		int i, j;

		/* Remap up to one full cycle of dummy pages per pass. */
		n = pp_dummy_npages;
		if (n > metapgs)
			n = metapgs;
		for (i = 0; i < n; i++) {
			j = (i + phase) % pp_dummy_npages;
			hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j],
			    PROT_READ,
			    HAT_LOAD | HAT_LOAD_NOCONSIST |
			    HAT_LOAD_REMAP);
			va += ptob(1);
		}
		metapgs -= n;
	}
}

/*
 * Redirect a deleted dynamic memseg's page_t metadata to the dummy
 * pages.  If the memseg does not embed its own metadata, defer to
 * memseg_remap_meta(); otherwise mark the memseg empty
 * (pages_end == pages_base) and remap its metadata pages.
 */
static void
memseg_remap_to_dummy(struct memseg *seg)
{
	caddr_t pp;
	pgcnt_t metapgs;

	ASSERT(memseg_is_dynamic(seg));
	ASSERT(pp_dummy != NULL);


	if (!memseg_includes_meta(seg)) {
		memseg_remap_meta(seg);
		return;
	}

	pp = (caddr_t)seg->pages;
	metapgs = seg->pages_base - memseg_get_start(seg);
	ASSERT(metapgs != 0);

	seg->pages_end = seg->pages_base;

	remap_to_dummy(pp, metapgs);
}

/*
 * Transition all the deleted pages to the deleted state so that
 * page_lock will not wait. The page_lock_delete call will
 * also wake up any waiters.
2613 */ 2614 static void 2615 memseg_lock_delete_all(struct memseg *seg) 2616 { 2617 page_t *pp; 2618 2619 for (pp = seg->pages; pp < seg->epages; pp++) { 2620 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2621 page_lock_delete(pp); 2622 } 2623 } 2624 2625 static void 2626 kphysm_del_cleanup(struct mem_handle *mhp) 2627 { 2628 struct memdelspan *mdsp; 2629 struct memseg *seg; 2630 struct memseg **segpp; 2631 struct memseg *seglist; 2632 pfn_t p_end; 2633 uint64_t avmem; 2634 pgcnt_t avpgs; 2635 pgcnt_t npgs; 2636 2637 avpgs = mhp->mh_vm_pages; 2638 2639 memsegs_lock(1); 2640 2641 /* 2642 * remove from main segment list. 2643 */ 2644 npgs = 0; 2645 seglist = NULL; 2646 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2647 mdsp = mdsp->mds_next) { 2648 p_end = mdsp->mds_base + mdsp->mds_npgs; 2649 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2650 if (seg->pages_base >= p_end || 2651 seg->pages_end <= mdsp->mds_base) { 2652 /* Span and memseg don't overlap. */ 2653 segpp = &((*segpp)->next); 2654 continue; 2655 } 2656 ASSERT(seg->pages_base >= mdsp->mds_base); 2657 ASSERT(seg->pages_end <= p_end); 2658 2659 PLCNT_MODIFY_MAX(seg->pages_base, 2660 seg->pages_base - seg->pages_end); 2661 2662 /* Hide the memseg from future scans. */ 2663 hat_kpm_delmem_mseg_update(seg, segpp); 2664 *segpp = seg->next; 2665 membar_producer(); /* TODO: Needed? */ 2666 npgs += MSEG_NPAGES(seg); 2667 2668 /* 2669 * Leave the deleted segment's next pointer intact 2670 * in case a memsegs scanning loop is walking this 2671 * segment concurrently. 2672 */ 2673 seg->lnext = seglist; 2674 seglist = seg; 2675 } 2676 } 2677 2678 build_pfn_hash(); 2679 2680 ASSERT(npgs < total_pages); 2681 total_pages -= npgs; 2682 2683 /* 2684 * Recalculate the paging parameters now total_pages has changed. 2685 * This will also cause the clock hands to be reset before next use. 
2686 */ 2687 setupclock(1); 2688 2689 memsegs_unlock(1); 2690 2691 mutex_exit(&mhp->mh_mutex); 2692 2693 while ((seg = seglist) != NULL) { 2694 pfn_t mseg_start; 2695 pfn_t mseg_base, mseg_end; 2696 pgcnt_t mseg_npgs; 2697 int mlret; 2698 2699 seglist = seg->lnext; 2700 2701 /* 2702 * Put the page_t's into the deleted state to stop 2703 * cv_wait()s on the pages. When we remap, the dummy 2704 * page_t's will be in the same state. 2705 */ 2706 memseg_lock_delete_all(seg); 2707 /* 2708 * Collect up information based on pages_base and pages_end 2709 * early so that we can flag early that the memseg has been 2710 * deleted by setting pages_end == pages_base. 2711 */ 2712 mseg_base = seg->pages_base; 2713 mseg_end = seg->pages_end; 2714 mseg_npgs = MSEG_NPAGES(seg); 2715 mseg_start = memseg_get_start(seg); 2716 2717 if (memseg_is_dynamic(seg)) { 2718 /* Remap the meta data to our special dummy area. */ 2719 memseg_remap_to_dummy(seg); 2720 2721 mutex_enter(&memseg_lists_lock); 2722 seg->lnext = memseg_va_avail; 2723 memseg_va_avail = seg; 2724 mutex_exit(&memseg_lists_lock); 2725 } else { 2726 /* 2727 * For memory whose page_ts were allocated 2728 * at boot, we need to find a new use for 2729 * the page_t memory. 2730 * For the moment, just leak it. 2731 * (It is held in the memseg_delete_junk list.) 2732 */ 2733 seg->pages_end = seg->pages_base; 2734 2735 mutex_enter(&memseg_lists_lock); 2736 seg->lnext = memseg_delete_junk; 2737 memseg_delete_junk = seg; 2738 mutex_exit(&memseg_lists_lock); 2739 } 2740 2741 /* Must not use seg now as it could be re-used. 
*/ 2742 2743 memlist_write_lock(); 2744 2745 mlret = memlist_delete_span( 2746 (uint64_t)(mseg_base) << PAGESHIFT, 2747 (uint64_t)(mseg_npgs) << PAGESHIFT, 2748 &phys_avail); 2749 ASSERT(mlret == MEML_SPANOP_OK); 2750 2751 mlret = memlist_delete_span( 2752 (uint64_t)(mseg_start) << PAGESHIFT, 2753 (uint64_t)(mseg_end - mseg_start) << 2754 PAGESHIFT, 2755 &phys_install); 2756 ASSERT(mlret == MEML_SPANOP_OK); 2757 phys_install_has_changed(); 2758 2759 memlist_write_unlock(); 2760 } 2761 2762 memlist_read_lock(); 2763 installed_top_size(phys_install, &physmax, &physinstalled); 2764 memlist_read_unlock(); 2765 2766 mutex_enter(&freemem_lock); 2767 maxmem -= avpgs; 2768 physmem -= avpgs; 2769 /* availrmem is adjusted during the delete. */ 2770 availrmem_initial -= avpgs; 2771 2772 mutex_exit(&freemem_lock); 2773 2774 dump_resize(); 2775 2776 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2777 "(0x%" PRIx64 ")\n", 2778 physinstalled << (PAGESHIFT - 10), 2779 (uint64_t)physinstalled << PAGESHIFT); 2780 2781 avmem = (uint64_t)freemem << PAGESHIFT; 2782 cmn_err(CE_CONT, "?kphysm_delete: " 2783 "avail mem = %" PRId64 "\n", avmem); 2784 2785 /* 2786 * Update lgroup generation number on single lgroup systems 2787 */ 2788 if (nlgrps == 1) 2789 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2790 2791 /* Successfully deleted system memory */ 2792 mutex_enter(&mhp->mh_mutex); 2793 } 2794 2795 static uint_t mdel_nullvp_waiter; 2796 2797 static void 2798 page_delete_collect( 2799 page_t *pp, 2800 struct mem_handle *mhp) 2801 { 2802 if (pp->p_vnode) { 2803 page_hashout(pp, (kmutex_t *)NULL); 2804 /* do not do PP_SETAGED(pp); */ 2805 } else { 2806 kmutex_t *sep; 2807 2808 sep = page_se_mutex(pp); 2809 mutex_enter(sep); 2810 if (CV_HAS_WAITERS(&pp->p_cv)) { 2811 mdel_nullvp_waiter++; 2812 cv_broadcast(&pp->p_cv); 2813 } 2814 mutex_exit(sep); 2815 } 2816 ASSERT(pp->p_next == pp->p_prev); 2817 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2818 pp->p_next = mhp->mh_deleted; 2819 
mhp->mh_deleted = pp; 2820 ASSERT(mhp->mh_hold_todo != 0); 2821 mhp->mh_hold_todo--; 2822 } 2823 2824 static void 2825 transit_list_collect(struct mem_handle *mhp, int v) 2826 { 2827 struct transit_list_head *trh; 2828 2829 trh = &transit_list_head; 2830 mutex_enter(&trh->trh_lock); 2831 mhp->mh_transit.trl_collect = v; 2832 mutex_exit(&trh->trh_lock); 2833 } 2834 2835 static void 2836 transit_list_insert(struct transit_list *tlp) 2837 { 2838 struct transit_list_head *trh; 2839 2840 trh = &transit_list_head; 2841 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2842 tlp->trl_next = trh->trh_head; 2843 trh->trh_head = tlp; 2844 } 2845 2846 static void 2847 transit_list_remove(struct transit_list *tlp) 2848 { 2849 struct transit_list_head *trh; 2850 struct transit_list **tlpp; 2851 2852 trh = &transit_list_head; 2853 tlpp = &trh->trh_head; 2854 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2855 while (*tlpp != NULL && *tlpp != tlp) 2856 tlpp = &(*tlpp)->trl_next; 2857 ASSERT(*tlpp != NULL); 2858 if (*tlpp == tlp) 2859 *tlpp = tlp->trl_next; 2860 tlp->trl_next = NULL; 2861 } 2862 2863 static struct transit_list * 2864 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2865 { 2866 struct transit_list *tlp; 2867 2868 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2869 struct memdelspan *mdsp; 2870 2871 for (mdsp = tlp->trl_spans; mdsp != NULL; 2872 mdsp = mdsp->mds_next) { 2873 if (pfnum >= mdsp->mds_base && 2874 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2875 return (tlp); 2876 } 2877 } 2878 } 2879 return (NULL); 2880 } 2881 2882 int 2883 pfn_is_being_deleted(pfn_t pfnum) 2884 { 2885 struct transit_list_head *trh; 2886 struct transit_list *tlp; 2887 int ret; 2888 2889 trh = &transit_list_head; 2890 if (trh->trh_head == NULL) 2891 return (0); 2892 2893 mutex_enter(&trh->trh_lock); 2894 tlp = pfnum_to_transit_list(trh, pfnum); 2895 ret = (tlp != NULL && tlp->trl_collect); 2896 mutex_exit(&trh->trh_lock); 2897 2898 return (ret); 2899 } 2900 2901 #ifdef 
MEM_DEL_STATS
extern int hz;
/*
 * Print the accumulated delete-loop statistics for this operation.
 * Output is gated by the mem_del_stat_print tunable.
 */
static void
mem_del_stat_print_func(struct mem_handle *mhp)
{
	uint64_t tmp;

	if (mem_del_stat_print) {
		printf("memory delete loop %x/%x, statistics%s\n",
		    (uint_t)mhp->mh_transit.trl_spans->mds_base,
		    (uint_t)mhp->mh_transit.trl_spans->mds_npgs,
		    (mhp->mh_cancel ? " (cancelled)" : ""));
		printf("\t%8u nloop\n", mhp->mh_delstat.nloop);
		printf("\t%8u need_free\n", mhp->mh_delstat.need_free);
		printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop);
		printf("\t%8u free_low\n", mhp->mh_delstat.free_low);
		printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed);
		printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck);
		printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget);
		printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail);
		printf("\t%8u nfree\n", mhp->mh_delstat.nfree);
		printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc);
		printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail);
		printf("\t%8u already_done\n", mhp->mh_delstat.already_done);
		printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree);
		printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked);
		printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc);
		printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl);
		printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc);
		printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy);
		printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage);
		printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim);
		printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay);
		printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail);
		printf("\t%8u retired\n", mhp->mh_delstat.retired);
		printf("\t%8u toxic\n", mhp->mh_delstat.toxic);
		printf("\t%8u failing\n", mhp->mh_delstat.failing);
		printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic);
		printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic);
		printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail);
		printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail);
		tmp = mhp->mh_delstat.nticks_total / hz;  /* seconds */
		printf(
		    "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n",
		    mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60);

		tmp = mhp->mh_delstat.nticks_pgrp / hz;  /* seconds */
		printf(
		    "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n",
		    mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60);
	}
}
#endif /* MEM_DEL_STATS */

/* A registered memory-change callback: the vector plus its client arg. */
struct mem_callback {
	kphysm_setup_vector_t	*vec;
	void			*arg;
};

#define	NMEMCALLBACKS		100

static struct mem_callback mem_callbacks[NMEMCALLBACKS];
static uint_t nmemcallbacks;	/* high-water mark of slots ever used */
static krwlock_t mem_callback_rwlock;

/*
 * Register a (vector, arg) pair to be notified of memory add/delete
 * events.  Returns 0 on success, EINVAL for a bad vector, EEXIST for a
 * duplicate registration, or ENOMEM when the fixed table is full.
 */
int
kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg)
{
	uint_t i, found;

	/*
	 * This test will become more complicated when the version must
	 * change.
	 */
	if (vec->version != KPHYSM_SETUP_VECTOR_VERSION)
		return (EINVAL);

	if (vec->post_add == NULL || vec->pre_del == NULL ||
	    vec->post_del == NULL)
		return (EINVAL);

	rw_enter(&mem_callback_rwlock, RW_WRITER);
	for (i = 0, found = 0; i < nmemcallbacks; i++) {
		/* Remember the first free slot (found is slot index + 1). */
		if (mem_callbacks[i].vec == NULL && found == 0)
			found = i + 1;
		if (mem_callbacks[i].vec == vec &&
		    mem_callbacks[i].arg == arg) {
#ifdef DEBUG
			/* Catch this in DEBUG kernels. */
			cmn_err(CE_WARN, "kphysm_setup_func_register"
			    "(0x%p, 0x%p) duplicate registration from 0x%p",
			    (void *)vec, arg, (void *)caller());
#endif /* DEBUG */
			rw_exit(&mem_callback_rwlock);
			return (EEXIST);
		}
	}
	if (found != 0) {
		/* Re-use a previously vacated slot. */
		i = found - 1;
	} else {
		ASSERT(nmemcallbacks < NMEMCALLBACKS);
		if (nmemcallbacks == NMEMCALLBACKS) {
			rw_exit(&mem_callback_rwlock);
			return (ENOMEM);
		}
		i = nmemcallbacks++;
	}
	mem_callbacks[i].vec = vec;
	mem_callbacks[i].arg = arg;
	rw_exit(&mem_callback_rwlock);
	return (0);
}

/*
 * Remove a previously registered (vector, arg) pair.  Unmatched
 * unregistrations are silently ignored.
 */
void
kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg)
{
	uint_t i;

	rw_enter(&mem_callback_rwlock, RW_WRITER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec == vec &&
		    mem_callbacks[i].arg == arg) {
			mem_callbacks[i].vec = NULL;
			mem_callbacks[i].arg = NULL;
			/* Only shrink the count when the last slot frees. */
			if (i == (nmemcallbacks - 1))
				nmemcallbacks--;
			break;
		}
	}
	rw_exit(&mem_callback_rwlock);
}

/* Invoke every registered post_add callback with the page delta. */
static void
kphysm_setup_post_add(pgcnt_t delta_pages)
{
	uint_t i;

	rw_enter(&mem_callback_rwlock, RW_READER);
	for (i = 0; i < nmemcallbacks; i++) {
		if (mem_callbacks[i].vec != NULL) {
			(*mem_callbacks[i].vec->post_add)
			    (mem_callbacks[i].arg, delta_pages);
		}
	}
	rw_exit(&mem_callback_rwlock);
}

/*
 * Note the locking between pre_del and post_del: The reader lock is held
 * between the two calls to stop the set of functions from changing.
3051 */ 3052 3053 static int 3054 kphysm_setup_pre_del(pgcnt_t delta_pages) 3055 { 3056 uint_t i; 3057 int ret; 3058 int aret; 3059 3060 ret = 0; 3061 rw_enter(&mem_callback_rwlock, RW_READER); 3062 for (i = 0; i < nmemcallbacks; i++) { 3063 if (mem_callbacks[i].vec != NULL) { 3064 aret = (*mem_callbacks[i].vec->pre_del) 3065 (mem_callbacks[i].arg, delta_pages); 3066 ret |= aret; 3067 } 3068 } 3069 3070 return (ret); 3071 } 3072 3073 static void 3074 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 3075 { 3076 uint_t i; 3077 3078 for (i = 0; i < nmemcallbacks; i++) { 3079 if (mem_callbacks[i].vec != NULL) { 3080 (*mem_callbacks[i].vec->post_del) 3081 (mem_callbacks[i].arg, delta_pages, cancelled); 3082 } 3083 } 3084 rw_exit(&mem_callback_rwlock); 3085 } 3086 3087 static int 3088 kphysm_split_memseg( 3089 pfn_t base, 3090 pgcnt_t npgs) 3091 { 3092 struct memseg *seg; 3093 struct memseg **segpp; 3094 pgcnt_t size_low, size_high; 3095 struct memseg *seg_low, *seg_mid, *seg_high; 3096 3097 /* 3098 * Lock the memsegs list against other updates now 3099 */ 3100 memsegs_lock(1); 3101 3102 /* 3103 * Find boot time memseg that wholly covers this area. 3104 */ 3105 3106 /* First find the memseg with page 'base' in it. */ 3107 for (segpp = &memsegs; (seg = *segpp) != NULL; 3108 segpp = &((*segpp)->next)) { 3109 if (base >= seg->pages_base && base < seg->pages_end) 3110 break; 3111 } 3112 if (seg == NULL) { 3113 memsegs_unlock(1); 3114 return (0); 3115 } 3116 if (memseg_includes_meta(seg)) { 3117 memsegs_unlock(1); 3118 return (0); 3119 } 3120 if ((base + npgs) > seg->pages_end) { 3121 memsegs_unlock(1); 3122 return (0); 3123 } 3124 3125 /* 3126 * Work out the size of the two segments that will 3127 * surround the new segment, one for low address 3128 * and one for high. 
3129 */ 3130 ASSERT(base >= seg->pages_base); 3131 size_low = base - seg->pages_base; 3132 ASSERT(seg->pages_end >= (base + npgs)); 3133 size_high = seg->pages_end - (base + npgs); 3134 3135 /* 3136 * Sanity check. 3137 */ 3138 if ((size_low + size_high) == 0) { 3139 memsegs_unlock(1); 3140 return (0); 3141 } 3142 3143 /* 3144 * Allocate the new structures. The old memseg will not be freed 3145 * as there may be a reference to it. 3146 */ 3147 seg_low = NULL; 3148 seg_high = NULL; 3149 3150 if (size_low != 0) 3151 seg_low = memseg_alloc(); 3152 3153 seg_mid = memseg_alloc(); 3154 3155 if (size_high != 0) 3156 seg_high = memseg_alloc(); 3157 3158 /* 3159 * All allocation done now. 3160 */ 3161 if (size_low != 0) { 3162 seg_low->pages = seg->pages; 3163 seg_low->epages = seg_low->pages + size_low; 3164 seg_low->pages_base = seg->pages_base; 3165 seg_low->pages_end = seg_low->pages_base + size_low; 3166 seg_low->next = seg_mid; 3167 seg_low->msegflags = seg->msegflags; 3168 } 3169 if (size_high != 0) { 3170 seg_high->pages = seg->epages - size_high; 3171 seg_high->epages = seg_high->pages + size_high; 3172 seg_high->pages_base = seg->pages_end - size_high; 3173 seg_high->pages_end = seg_high->pages_base + size_high; 3174 seg_high->next = seg->next; 3175 seg_high->msegflags = seg->msegflags; 3176 } 3177 3178 seg_mid->pages = seg->pages + size_low; 3179 seg_mid->pages_base = seg->pages_base + size_low; 3180 seg_mid->epages = seg->epages - size_high; 3181 seg_mid->pages_end = seg->pages_end - size_high; 3182 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3183 seg_mid->msegflags = seg->msegflags; 3184 3185 /* 3186 * Update hat_kpm specific info of all involved memsegs and 3187 * allow hat_kpm specific global chain updates. 
3188 */ 3189 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3190 3191 /* 3192 * At this point we have two equivalent memseg sub-chains, 3193 * seg and seg_low/seg_mid/seg_high, which both chain on to 3194 * the same place in the global chain. By re-writing the pointer 3195 * in the previous element we switch atomically from using the old 3196 * (seg) to the new. 3197 */ 3198 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3199 3200 membar_enter(); 3201 3202 build_pfn_hash(); 3203 memsegs_unlock(1); 3204 3205 /* 3206 * We leave the old segment, 'seg', intact as there may be 3207 * references to it. Also, as the value of total_pages has not 3208 * changed and the memsegs list is effectively the same when 3209 * accessed via the old or the new pointer, we do not have to 3210 * cause pageout_scanner() to re-evaluate its hand pointers. 3211 * 3212 * We currently do not re-use or reclaim the page_t memory. 3213 * If we do, then this may have to change. 3214 */ 3215 3216 mutex_enter(&memseg_lists_lock); 3217 seg->lnext = memseg_edit_junk; 3218 memseg_edit_junk = seg; 3219 mutex_exit(&memseg_lists_lock); 3220 3221 return (1); 3222 } 3223 3224 /* 3225 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3226 * structure using physical addresses. Therefore a kmem_cache is 3227 * used with KMC_NOHASH to avoid page crossings within a memseg 3228 * structure. KMC_NOHASH requires that no external (outside of 3229 * slab) information is allowed. This, in turn, implies that the 3230 * cache's slabsize must be exactly a single page, since per-slab 3231 * information (e.g. the freelist for the slab) is kept at the 3232 * end of the slab, where it is easy to locate. Should be changed 3233 * when a more obvious kmem_cache interface/flag will become 3234 * available. 
3235 */ 3236 void 3237 mem_config_init() 3238 { 3239 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3240 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3241 } 3242 3243 struct memseg * 3244 memseg_alloc() 3245 { 3246 struct memseg *seg; 3247 3248 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3249 bzero(seg, sizeof (struct memseg)); 3250 3251 return (seg); 3252 } 3253 3254 /* 3255 * Return whether the page_t memory for this memseg 3256 * is included in the memseg itself. 3257 */ 3258 static int 3259 memseg_includes_meta(struct memseg *seg) 3260 { 3261 return (seg->msegflags & MEMSEG_META_INCL); 3262 } 3263 3264 pfn_t 3265 memseg_get_start(struct memseg *seg) 3266 { 3267 pfn_t pt_start; 3268 3269 if (memseg_includes_meta(seg)) { 3270 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 3271 3272 /* Meta data is required to be at the beginning */ 3273 ASSERT(pt_start < seg->pages_base); 3274 } else 3275 pt_start = seg->pages_base; 3276 3277 return (pt_start); 3278 } 3279 3280 /* 3281 * Invalidate memseg pointers in cpu private vm data caches. 3282 */ 3283 static void 3284 memseg_cpu_vm_flush() 3285 { 3286 cpu_t *cp; 3287 vm_cpu_data_t *vc; 3288 3289 mutex_enter(&cpu_lock); 3290 pause_cpus(NULL); 3291 3292 cp = cpu_list; 3293 do { 3294 vc = cp->cpu_vm_data; 3295 vc->vc_pnum_memseg = NULL; 3296 vc->vc_pnext_memseg = NULL; 3297 3298 } while ((cp = cp->cpu_next) != cpu_list); 3299 3300 start_cpus(); 3301 mutex_exit(&cpu_lock); 3302 } 3303