1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/cmn_err.h> 28 #include <sys/vmem.h> 29 #include <sys/kmem.h> 30 #include <sys/systm.h> 31 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 32 #include <sys/errno.h> 33 #include <sys/memnode.h> 34 #include <sys/memlist.h> 35 #include <sys/memlist_impl.h> 36 #include <sys/tuneable.h> 37 #include <sys/proc.h> 38 #include <sys/disp.h> 39 #include <sys/debug.h> 40 #include <sys/vm.h> 41 #include <sys/callb.h> 42 #include <sys/memlist_plat.h> /* for installed_top_size() */ 43 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 44 #include <sys/dumphdr.h> /* for dump_resize() */ 45 #include <sys/atomic.h> /* for use in stats collection */ 46 #include <sys/rwlock.h> 47 #include <sys/cpuvar.h> 48 #include <vm/seg_kmem.h> 49 #include <vm/seg_kpm.h> 50 #include <vm/page.h> 51 #include <vm/vm_dep.h> 52 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 53 #include <sys/sunddi.h> 54 #include <sys/mem_config.h> 55 #include <sys/mem_cage.h> 56 #include 
<sys/lgrp.h> 57 #include <sys/ddi.h> 58 #include <sys/modctl.h> 59 60 extern struct memlist *phys_avail; 61 62 extern uint_t page_ctrs_adjust(int); 63 void page_ctrs_cleanup(void); 64 static void kphysm_setup_post_add(pgcnt_t); 65 static int kphysm_setup_pre_del(pgcnt_t); 66 static void kphysm_setup_post_del(pgcnt_t, int); 67 68 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 69 70 static int delspan_reserve(pfn_t, pgcnt_t); 71 static void delspan_unreserve(pfn_t, pgcnt_t); 72 73 kmutex_t memseg_lists_lock; 74 struct memseg *memseg_va_avail; 75 struct memseg *memseg_alloc(void); 76 static struct memseg *memseg_delete_junk; 77 static struct memseg *memseg_edit_junk; 78 void memseg_remap_init(void); 79 static void memseg_remap_to_dummy(struct memseg *); 80 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 81 static struct memseg *memseg_reuse(pgcnt_t); 82 83 static struct kmem_cache *memseg_cache; 84 85 /* 86 * Interfaces to manage externally allocated 87 * page_t memory (metadata) for a memseg. 88 */ 89 #pragma weak memseg_alloc_meta 90 #pragma weak memseg_free_meta 91 #pragma weak memseg_get_metapfn 92 #pragma weak memseg_remap_meta 93 94 extern int ppvm_enable; 95 extern page_t *ppvm_base; 96 extern int memseg_alloc_meta(pfn_t, pgcnt_t, void **, pgcnt_t *); 97 extern void memseg_free_meta(void *, pgcnt_t); 98 extern pfn_t memseg_get_metapfn(void *, pgcnt_t); 99 extern void memseg_remap_meta(struct memseg *); 100 static int memseg_is_dynamic(struct memseg *); 101 static int memseg_includes_meta(struct memseg *); 102 pfn_t memseg_get_start(struct memseg *); 103 static void memseg_cpu_vm_flush(void); 104 105 int meta_alloc_enable; 106 107 #ifdef DEBUG 108 static int memseg_debug; 109 #define MEMSEG_DEBUG(args...) if (memseg_debug) printf(args) 110 #else 111 #define MEMSEG_DEBUG(...) 112 #endif 113 114 /* 115 * Add a chunk of memory to the system. 116 * base: starting PAGESIZE page of new memory. 117 * npgs: length in PAGESIZE pages. 
118 * 119 * Adding mem this way doesn't increase the size of the hash tables; 120 * growing them would be too hard. This should be OK, but adding memory 121 * dynamically most likely means more hash misses, since the tables will 122 * be smaller than they otherwise would be. 123 */ 124 int 125 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 126 { 127 page_t *pp; 128 page_t *opp, *oepp, *segpp; 129 struct memseg *seg; 130 uint64_t avmem; 131 pfn_t pfn; 132 pfn_t pt_base = base; 133 pgcnt_t tpgs = npgs; 134 pgcnt_t metapgs = 0; 135 int exhausted; 136 pfn_t pnum; 137 int mnode; 138 caddr_t vaddr; 139 int reuse; 140 int mlret; 141 int rv; 142 int flags; 143 int meta_alloc = 0; 144 void *mapva; 145 void *metabase = (void *)base; 146 pgcnt_t nkpmpgs = 0; 147 offset_t kpm_pages_off; 148 149 cmn_err(CE_CONT, 150 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 151 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 152 153 /* 154 * Add this span in the delete list to prevent interactions. 155 */ 156 if (!delspan_reserve(base, npgs)) { 157 return (KPHYSM_ESPAN); 158 } 159 /* 160 * Check to see if any of the memory span has been added 161 * by trying an add to the installed memory list. This 162 * forms the interlocking process for add. 
163 */ 164 165 memlist_write_lock(); 166 167 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 168 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 169 170 if (mlret == MEML_SPANOP_OK) 171 installed_top_size(phys_install, &physmax, &physinstalled); 172 173 memlist_write_unlock(); 174 175 if (mlret != MEML_SPANOP_OK) { 176 if (mlret == MEML_SPANOP_EALLOC) { 177 delspan_unreserve(pt_base, tpgs); 178 return (KPHYSM_ERESOURCE); 179 } else if (mlret == MEML_SPANOP_ESPAN) { 180 delspan_unreserve(pt_base, tpgs); 181 return (KPHYSM_ESPAN); 182 } else { 183 delspan_unreserve(pt_base, tpgs); 184 return (KPHYSM_ERESOURCE); 185 } 186 } 187 188 if (meta_alloc_enable) { 189 /* 190 * Allocate the page_t's from existing memory; 191 * if that fails, allocate from the incoming memory. 192 */ 193 rv = memseg_alloc_meta(base, npgs, &metabase, &metapgs); 194 if (rv == KPHYSM_OK) { 195 ASSERT(metapgs); 196 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 197 meta_alloc = 1; 198 goto mapalloc; 199 } 200 } 201 202 /* 203 * We store the page_t's for this new memory in the first 204 * few pages of the chunk. Here, we go and get'em ... 205 */ 206 207 /* 208 * The expression after the '-' gives the number of pages 209 * that will fit in the new memory based on a requirement 210 * of (PAGESIZE + sizeof (page_t)) bytes per page. 211 */ 212 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 213 (PAGESIZE + sizeof (page_t))); 214 215 npgs -= metapgs; 216 base += metapgs; 217 218 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 219 220 exhausted = (metapgs == 0 || npgs == 0); 221 222 if (kpm_enable && !exhausted) { 223 pgcnt_t start, end, nkpmpgs_prelim; 224 size_t ptsz; 225 226 /* 227 * A viable kpm large page mapping must not overlap two 228 * dynamic memsegs. Therefore the total size is checked 229 * to be at least kpm_pgsz and also whether start and end 230 * points are at least kpm_pgsz aligned. 
231 */ 232 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 233 pmodkpmp(base + npgs)) { 234 235 kphysm_addmem_error_undospan(pt_base, tpgs); 236 237 /* 238 * There is no specific error code for violating 239 * kpm granularity constraints. 240 */ 241 return (KPHYSM_ENOTVIABLE); 242 } 243 244 start = kpmptop(ptokpmp(base)); 245 end = kpmptop(ptokpmp(base + npgs)); 246 nkpmpgs_prelim = ptokpmp(end - start); 247 ptsz = npgs * sizeof (page_t); 248 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 249 exhausted = (tpgs <= metapgs); 250 if (!exhausted) { 251 npgs = tpgs - metapgs; 252 base = pt_base + metapgs; 253 254 /* final nkpmpgs */ 255 start = kpmptop(ptokpmp(base)); 256 nkpmpgs = ptokpmp(end - start); 257 kpm_pages_off = ptsz + 258 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 259 } 260 } 261 262 /* 263 * Is memory area supplied too small? 264 */ 265 if (exhausted) { 266 kphysm_addmem_error_undospan(pt_base, tpgs); 267 /* 268 * There is no specific error code for 'too small'. 269 */ 270 return (KPHYSM_ERESOURCE); 271 } 272 273 mapalloc: 274 /* 275 * We may re-use a previously allocated VA space for the page_ts 276 * eventually, but we need to initialize and lock the pages first. 277 */ 278 279 /* 280 * Get an address in the kernel address map, map 281 * the page_t pages and see if we can touch them. 282 */ 283 284 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 285 if (mapva == NULL) { 286 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 287 " Can't allocate VA for page_ts"); 288 289 if (meta_alloc) 290 memseg_free_meta(metabase, metapgs); 291 kphysm_addmem_error_undospan(pt_base, tpgs); 292 293 return (KPHYSM_ERESOURCE); 294 } 295 pp = mapva; 296 297 if (physmax < (pt_base + tpgs)) 298 physmax = (pt_base + tpgs); 299 300 /* 301 * In the remapping code we map one page at a time so we must do 302 * the same here to match mapping sizes. 
303 */ 304 pfn = pt_base; 305 vaddr = (caddr_t)pp; 306 for (pnum = 0; pnum < metapgs; pnum++) { 307 if (meta_alloc) 308 pfn = memseg_get_metapfn(metabase, (pgcnt_t)pnum); 309 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 310 PROT_READ | PROT_WRITE, 311 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 312 pfn++; 313 vaddr += ptob(1); 314 } 315 316 if (ddi_peek32((dev_info_t *)NULL, 317 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) { 318 319 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 320 " Can't access pp array at 0x%p [phys 0x%lx]", 321 (void *)pp, pt_base); 322 323 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 324 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 325 326 vmem_free(heap_arena, mapva, ptob(metapgs)); 327 if (meta_alloc) 328 memseg_free_meta(metabase, metapgs); 329 kphysm_addmem_error_undospan(pt_base, tpgs); 330 331 return (KPHYSM_EFAULT); 332 } 333 334 /* 335 * Add this memory slice to its memory node translation. 336 * 337 * Note that right now, each node may have only one slice; 338 * this may change with COD or in larger SSM systems with 339 * nested latency groups, so we must not assume that the 340 * node does not yet exist. 341 * 342 * Note that there may be multiple memory nodes associated with 343 * a single lgrp node on x86 systems. 344 */ 345 pnum = pt_base + tpgs - 1; 346 mem_node_add_range(pt_base, pnum); 347 348 /* 349 * Allocate or resize page counters as necessary to accommodate 350 * the increase in memory pages. 
351 */ 352 mnode = PFN_2_MEM_NODE(pnum); 353 PAGE_CTRS_ADJUST(base, npgs, rv); 354 if (rv) { 355 356 mem_node_del_range(pt_base, pnum); 357 358 /* cleanup the page counters */ 359 page_ctrs_cleanup(); 360 361 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 362 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 363 364 vmem_free(heap_arena, mapva, ptob(metapgs)); 365 if (meta_alloc) 366 memseg_free_meta(metabase, metapgs); 367 kphysm_addmem_error_undospan(pt_base, tpgs); 368 369 return (KPHYSM_ERESOURCE); 370 } 371 372 /* 373 * Update the phys_avail memory list. 374 * The phys_install list was done at the start. 375 */ 376 377 memlist_write_lock(); 378 379 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT, 380 (uint64_t)(npgs) << PAGESHIFT, &phys_avail); 381 ASSERT(mlret == MEML_SPANOP_OK); 382 383 memlist_write_unlock(); 384 385 /* See if we can find a memseg to re-use. */ 386 if (meta_alloc) { 387 seg = memseg_reuse(0); 388 reuse = 1; /* force unmapping of temp mapva */ 389 flags = MEMSEG_DYNAMIC | MEMSEG_META_ALLOC; 390 /* 391 * There is a 1:1 fixed relationship between a pfn 392 * and a page_t VA. The pfn is used as an index into 393 * the ppvm_base page_t table in order to calculate 394 * the page_t base address for a given pfn range. 395 */ 396 segpp = ppvm_base + base; 397 } else { 398 seg = memseg_reuse(metapgs); 399 reuse = (seg != NULL); 400 flags = MEMSEG_DYNAMIC | MEMSEG_META_INCL; 401 segpp = pp; 402 } 403 404 /* 405 * Initialize the memseg structure representing this memory 406 * and add it to the existing list of memsegs. Do some basic 407 * initialization and add the memory to the system. 408 * In order to prevent lock deadlocks, the add_physmem() 409 * code is repeated here, but split into several stages. 410 * 411 * If a memseg is reused, invalidate memseg pointers in 412 * all cpu vm caches. 
We need to do this this since the check 413 * pp >= seg->pages && pp < seg->epages 414 * used in various places is not atomic and so the first compare 415 * can happen before reuse and the second compare after reuse. 416 * The invalidation ensures that a memseg is not deferenced while 417 * it's page/pfn pointers are changing. 418 */ 419 if (seg == NULL) { 420 seg = memseg_alloc(); 421 ASSERT(seg != NULL); 422 seg->msegflags = flags; 423 MEMSEG_DEBUG("memseg_get: alloc seg=0x%p, pages=0x%p", 424 (void *)seg, (void *)(seg->pages)); 425 seg->pages = segpp; 426 } else { 427 ASSERT(seg->msegflags == flags); 428 ASSERT(seg->pages_base == seg->pages_end); 429 MEMSEG_DEBUG("memseg_get: reuse seg=0x%p, pages=0x%p", 430 (void *)seg, (void *)(seg->pages)); 431 if (meta_alloc) { 432 memseg_cpu_vm_flush(); 433 seg->pages = segpp; 434 } 435 } 436 437 seg->epages = seg->pages + npgs; 438 seg->pages_base = base; 439 seg->pages_end = base + npgs; 440 441 /* 442 * Initialize metadata. The page_ts are set to locked state 443 * ready to be freed. 444 */ 445 bzero((caddr_t)pp, ptob(metapgs)); 446 447 pfn = seg->pages_base; 448 /* Save the original pp base in case we reuse a memseg. */ 449 opp = pp; 450 oepp = opp + npgs; 451 for (pp = opp; pp < oepp; pp++) { 452 pp->p_pagenum = pfn; 453 pfn++; 454 page_iolock_init(pp); 455 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 456 continue; 457 pp->p_offset = (u_offset_t)-1; 458 } 459 460 if (reuse) { 461 /* Remap our page_ts to the re-used memseg VA space. 
*/ 462 pfn = pt_base; 463 vaddr = (caddr_t)seg->pages; 464 for (pnum = 0; pnum < metapgs; pnum++) { 465 if (meta_alloc) 466 pfn = memseg_get_metapfn(metabase, 467 (pgcnt_t)pnum); 468 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 469 PROT_READ | PROT_WRITE, 470 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 471 pfn++; 472 vaddr += ptob(1); 473 } 474 475 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 476 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 477 478 vmem_free(heap_arena, mapva, ptob(metapgs)); 479 } 480 481 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 482 483 memsegs_lock(1); 484 485 /* 486 * The new memseg is inserted at the beginning of the list. 487 * Not only does this save searching for the tail, but in the 488 * case of a re-used memseg, it solves the problem of what 489 * happens if some process has still got a pointer to the 490 * memseg and follows the next pointer to continue traversing 491 * the memsegs list. 492 */ 493 494 hat_kpm_addmem_mseg_insert(seg); 495 496 seg->next = memsegs; 497 membar_producer(); 498 499 hat_kpm_addmem_memsegs_update(seg); 500 501 memsegs = seg; 502 503 build_pfn_hash(); 504 505 total_pages += npgs; 506 507 /* 508 * Recalculate the paging parameters now total_pages has changed. 509 * This will also cause the clock hands to be reset before next use. 510 */ 511 setupclock(1); 512 513 memsegs_unlock(1); 514 515 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs); 516 517 /* 518 * Free the pages outside the lock to avoid locking loops. 519 */ 520 for (pp = seg->pages; pp < seg->epages; pp++) { 521 page_free(pp, 1); 522 } 523 524 /* 525 * Now that we've updated the appropriate memory lists we 526 * need to reset a number of globals, since we've increased memory. 527 * Several have already been updated for us as noted above. The 528 * globals we're interested in at this point are: 529 * physmax - highest page frame number. 
530 * physinstalled - number of pages currently installed (done earlier) 531 * maxmem - max free pages in the system 532 * physmem - physical memory pages available 533 * availrmem - real memory available 534 */ 535 536 mutex_enter(&freemem_lock); 537 maxmem += npgs; 538 physmem += npgs; 539 availrmem += npgs; 540 availrmem_initial += npgs; 541 542 mutex_exit(&freemem_lock); 543 544 dump_resize(); 545 546 page_freelist_coalesce_all(mnode); 547 548 kphysm_setup_post_add(npgs); 549 550 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK " 551 "(0x%" PRIx64 ")\n", 552 physinstalled << (PAGESHIFT - 10), 553 (uint64_t)physinstalled << PAGESHIFT); 554 555 avmem = (uint64_t)freemem << PAGESHIFT; 556 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: " 557 "avail mem = %" PRId64 "\n", avmem); 558 559 /* 560 * Update lgroup generation number on single lgroup systems 561 */ 562 if (nlgrps == 1) 563 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 564 565 /* 566 * Inform DDI of update 567 */ 568 ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT, 569 (uint64_t)(tpgs) << PAGESHIFT); 570 571 delspan_unreserve(pt_base, tpgs); 572 573 return (KPHYSM_OK); /* Successfully added system memory */ 574 } 575 576 /* 577 * There are various error conditions in kphysm_add_memory_dynamic() 578 * which require a rollback of already changed global state. 579 */ 580 static void 581 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs) 582 { 583 int mlret; 584 585 /* Unreserve memory span. */ 586 memlist_write_lock(); 587 588 mlret = memlist_delete_span( 589 (uint64_t)(pt_base) << PAGESHIFT, 590 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 591 592 ASSERT(mlret == MEML_SPANOP_OK); 593 phys_install_has_changed(); 594 installed_top_size(phys_install, &physmax, &physinstalled); 595 596 memlist_write_unlock(); 597 delspan_unreserve(pt_base, tpgs); 598 } 599 600 /* 601 * Only return an available memseg of exactly the right size 602 * if size is required. 
603 * When the meta data area has it's own virtual address space 604 * we will need to manage this more carefully and do best fit 605 * allocations, possibly splitting an available area. 606 */ 607 struct memseg * 608 memseg_reuse(pgcnt_t metapgs) 609 { 610 int type; 611 struct memseg **segpp, *seg; 612 613 mutex_enter(&memseg_lists_lock); 614 615 segpp = &memseg_va_avail; 616 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { 617 caddr_t end; 618 619 /* 620 * Make sure we are reusing the right segment type. 621 */ 622 type = metapgs ? MEMSEG_META_INCL : MEMSEG_META_ALLOC; 623 624 if ((seg->msegflags & (MEMSEG_META_INCL | MEMSEG_META_ALLOC)) 625 != type) 626 continue; 627 628 if (kpm_enable) 629 end = hat_kpm_mseg_reuse(seg); 630 else 631 end = (caddr_t)seg->epages; 632 633 /* 634 * Check for the right size if it is provided. 635 */ 636 if (!metapgs || btopr(end - (caddr_t)seg->pages) == metapgs) { 637 *segpp = seg->lnext; 638 seg->lnext = NULL; 639 break; 640 } 641 } 642 mutex_exit(&memseg_lists_lock); 643 644 return (seg); 645 } 646 647 static uint_t handle_gen; 648 649 struct memdelspan { 650 struct memdelspan *mds_next; 651 pfn_t mds_base; 652 pgcnt_t mds_npgs; 653 uint_t *mds_bitmap; 654 uint_t *mds_bitmap_retired; 655 }; 656 657 #define NBPBMW (sizeof (uint_t) * NBBY) 658 #define MDS_BITMAPBYTES(MDSP) \ 659 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 660 661 struct transit_list { 662 struct transit_list *trl_next; 663 struct memdelspan *trl_spans; 664 int trl_collect; 665 }; 666 667 struct transit_list_head { 668 kmutex_t trh_lock; 669 struct transit_list *trh_head; 670 }; 671 672 static struct transit_list_head transit_list_head; 673 674 struct mem_handle; 675 static void transit_list_collect(struct mem_handle *, int); 676 static void transit_list_insert(struct transit_list *); 677 static void transit_list_remove(struct transit_list *); 678 679 #ifdef DEBUG 680 #define MEM_DEL_STATS 681 #endif /* DEBUG */ 682 683 #ifdef MEM_DEL_STATS 
684 static int mem_del_stat_print = 0; 685 struct mem_del_stat { 686 uint_t nloop; 687 uint_t need_free; 688 uint_t free_loop; 689 uint_t free_low; 690 uint_t free_failed; 691 uint_t ncheck; 692 uint_t nopaget; 693 uint_t lockfail; 694 uint_t nfree; 695 uint_t nreloc; 696 uint_t nrelocfail; 697 uint_t already_done; 698 uint_t first_notfree; 699 uint_t npplocked; 700 uint_t nlockreloc; 701 uint_t nnorepl; 702 uint_t nmodreloc; 703 uint_t ndestroy; 704 uint_t nputpage; 705 uint_t nnoreclaim; 706 uint_t ndelay; 707 uint_t demotefail; 708 uint64_t nticks_total; 709 uint64_t nticks_pgrp; 710 uint_t retired; 711 uint_t toxic; 712 uint_t failing; 713 uint_t modtoxic; 714 uint_t npplkdtoxic; 715 uint_t gptlmodfail; 716 uint_t gptllckfail; 717 }; 718 /* 719 * The stat values are only incremented in the delete thread 720 * so no locking or atomic required. 721 */ 722 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 723 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 724 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 725 static void mem_del_stat_print_func(struct mem_handle *); 726 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 727 #else /* MEM_DEL_STATS */ 728 #define MDSTAT_INCR(MHP, FLD) 729 #define MDSTAT_TOTAL(MHP, ntck) 730 #define MDSTAT_PGRP(MHP, ntck) 731 #define MDSTAT_PRINT(MHP) 732 #endif /* MEM_DEL_STATS */ 733 734 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 735 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 736 737 /* 738 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 739 * The mutex may not be required for other fields, dependent on mh_state. 
740 */ 741 struct mem_handle { 742 kmutex_t mh_mutex; 743 struct mem_handle *mh_next; 744 memhandle_t mh_exthandle; 745 mhnd_state_t mh_state; 746 struct transit_list mh_transit; 747 pgcnt_t mh_phys_pages; 748 pgcnt_t mh_vm_pages; 749 pgcnt_t mh_hold_todo; 750 void (*mh_delete_complete)(void *, int error); 751 void *mh_delete_complete_arg; 752 volatile uint_t mh_cancel; 753 volatile uint_t mh_dr_aio_cleanup_cancel; 754 volatile uint_t mh_aio_cleanup_done; 755 kcondvar_t mh_cv; 756 kthread_id_t mh_thread_id; 757 page_t *mh_deleted; /* link through p_next */ 758 #ifdef MEM_DEL_STATS 759 struct mem_del_stat mh_delstat; 760 #endif /* MEM_DEL_STATS */ 761 }; 762 763 static struct mem_handle *mem_handle_head; 764 static kmutex_t mem_handle_list_mutex; 765 766 static struct mem_handle * 767 kphysm_allocate_mem_handle() 768 { 769 struct mem_handle *mhp; 770 771 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP); 772 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL); 773 mutex_enter(&mem_handle_list_mutex); 774 mutex_enter(&mhp->mh_mutex); 775 /* handle_gen is protected by list mutex. */ 776 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen); 777 mhp->mh_next = mem_handle_head; 778 mem_handle_head = mhp; 779 mutex_exit(&mem_handle_list_mutex); 780 781 return (mhp); 782 } 783 784 static void 785 kphysm_free_mem_handle(struct mem_handle *mhp) 786 { 787 struct mem_handle **mhpp; 788 789 ASSERT(mutex_owned(&mhp->mh_mutex)); 790 ASSERT(mhp->mh_state == MHND_FREE); 791 /* 792 * Exit the mutex to preserve locking order. This is OK 793 * here as once in the FREE state, the handle cannot 794 * be found by a lookup. 795 */ 796 mutex_exit(&mhp->mh_mutex); 797 798 mutex_enter(&mem_handle_list_mutex); 799 mhpp = &mem_handle_head; 800 while (*mhpp != NULL && *mhpp != mhp) 801 mhpp = &(*mhpp)->mh_next; 802 ASSERT(*mhpp == mhp); 803 /* 804 * No need to lock the handle (mh_mutex) as only 805 * mh_next changing and this is the only thread that 806 * can be referncing mhp. 
807 */ 808 *mhpp = mhp->mh_next; 809 mutex_exit(&mem_handle_list_mutex); 810 811 mutex_destroy(&mhp->mh_mutex); 812 kmem_free(mhp, sizeof (struct mem_handle)); 813 } 814 815 /* 816 * This function finds the internal mem_handle corresponding to an 817 * external handle and returns it with the mh_mutex held. 818 */ 819 static struct mem_handle * 820 kphysm_lookup_mem_handle(memhandle_t handle) 821 { 822 struct mem_handle *mhp; 823 824 mutex_enter(&mem_handle_list_mutex); 825 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) { 826 if (mhp->mh_exthandle == handle) { 827 mutex_enter(&mhp->mh_mutex); 828 /* 829 * The state of the handle could have been changed 830 * by kphysm_del_release() while waiting for mh_mutex. 831 */ 832 if (mhp->mh_state == MHND_FREE) { 833 mutex_exit(&mhp->mh_mutex); 834 continue; 835 } 836 break; 837 } 838 } 839 mutex_exit(&mem_handle_list_mutex); 840 return (mhp); 841 } 842 843 int 844 kphysm_del_gethandle(memhandle_t *xmhp) 845 { 846 struct mem_handle *mhp; 847 848 mhp = kphysm_allocate_mem_handle(); 849 /* 850 * The handle is allocated using KM_SLEEP, so cannot fail. 851 * If the implementation is changed, the correct error to return 852 * here would be KPHYSM_ENOHANDLES. 
853 */ 854 ASSERT(mhp->mh_state == MHND_FREE); 855 mhp->mh_state = MHND_INIT; 856 *xmhp = mhp->mh_exthandle; 857 mutex_exit(&mhp->mh_mutex); 858 return (KPHYSM_OK); 859 } 860 861 static int 862 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 863 { 864 pfn_t e1, e2; 865 866 e1 = b1 + l1; 867 e2 = b2 + l2; 868 869 return (!(b2 >= e1 || b1 >= e2)); 870 } 871 872 static int can_remove_pgs(pgcnt_t); 873 874 static struct memdelspan * 875 span_to_install(pfn_t base, pgcnt_t npgs) 876 { 877 struct memdelspan *mdsp; 878 struct memdelspan *mdsp_new; 879 uint64_t address, size, thislen; 880 struct memlist *mlp; 881 882 mdsp_new = NULL; 883 884 address = (uint64_t)base << PAGESHIFT; 885 size = (uint64_t)npgs << PAGESHIFT; 886 while (size != 0) { 887 memlist_read_lock(); 888 for (mlp = phys_install; mlp != NULL; mlp = mlp->ml_next) { 889 if (address >= (mlp->ml_address + mlp->ml_size)) 890 continue; 891 if ((address + size) > mlp->ml_address) 892 break; 893 } 894 if (mlp == NULL) { 895 address += size; 896 size = 0; 897 thislen = 0; 898 } else { 899 if (address < mlp->ml_address) { 900 size -= (mlp->ml_address - address); 901 address = mlp->ml_address; 902 } 903 ASSERT(address >= mlp->ml_address); 904 if ((address + size) > 905 (mlp->ml_address + mlp->ml_size)) { 906 thislen = 907 mlp->ml_size - (address - mlp->ml_address); 908 } else { 909 thislen = size; 910 } 911 } 912 memlist_read_unlock(); 913 /* TODO: phys_install could change now */ 914 if (thislen == 0) 915 continue; 916 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 917 mdsp->mds_base = btop(address); 918 mdsp->mds_npgs = btop(thislen); 919 mdsp->mds_next = mdsp_new; 920 mdsp_new = mdsp; 921 address += thislen; 922 size -= thislen; 923 } 924 return (mdsp_new); 925 } 926 927 static void 928 free_delspans(struct memdelspan *mdsp) 929 { 930 struct memdelspan *amdsp; 931 932 while ((amdsp = mdsp) != NULL) { 933 mdsp = amdsp->mds_next; 934 kmem_free(amdsp, sizeof (struct memdelspan)); 935 } 936 } 937 938 
/* 939 * Concatenate lists. No list ordering is required. 940 */ 941 942 static void 943 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 944 { 945 while (*mdspp != NULL) 946 mdspp = &(*mdspp)->mds_next; 947 948 *mdspp = mdsp; 949 } 950 951 /* 952 * Given a new list of delspans, check there is no overlap with 953 * all existing span activity (add or delete) and then concatenate 954 * the new spans to the given list. 955 * Return 1 for OK, 0 if overlapping. 956 */ 957 static int 958 delspan_insert( 959 struct transit_list *my_tlp, 960 struct memdelspan *mdsp_new) 961 { 962 struct transit_list_head *trh; 963 struct transit_list *tlp; 964 int ret; 965 966 trh = &transit_list_head; 967 968 ASSERT(my_tlp != NULL); 969 ASSERT(mdsp_new != NULL); 970 971 ret = 1; 972 mutex_enter(&trh->trh_lock); 973 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 974 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 975 struct memdelspan *mdsp; 976 977 for (mdsp = tlp->trl_spans; mdsp != NULL; 978 mdsp = mdsp->mds_next) { 979 struct memdelspan *nmdsp; 980 981 for (nmdsp = mdsp_new; nmdsp != NULL; 982 nmdsp = nmdsp->mds_next) { 983 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 984 nmdsp->mds_base, nmdsp->mds_npgs)) { 985 ret = 0; 986 goto done; 987 } 988 } 989 } 990 } 991 done: 992 if (ret != 0) { 993 if (my_tlp->trl_spans == NULL) 994 transit_list_insert(my_tlp); 995 delspan_concat(&my_tlp->trl_spans, mdsp_new); 996 } 997 mutex_exit(&trh->trh_lock); 998 return (ret); 999 } 1000 1001 static void 1002 delspan_remove( 1003 struct transit_list *my_tlp, 1004 pfn_t base, 1005 pgcnt_t npgs) 1006 { 1007 struct transit_list_head *trh; 1008 struct memdelspan *mdsp; 1009 1010 trh = &transit_list_head; 1011 1012 ASSERT(my_tlp != NULL); 1013 1014 mutex_enter(&trh->trh_lock); 1015 if ((mdsp = my_tlp->trl_spans) != NULL) { 1016 if (npgs == 0) { 1017 my_tlp->trl_spans = NULL; 1018 free_delspans(mdsp); 1019 transit_list_remove(my_tlp); 1020 } else { 1021 
struct memdelspan **prv; 1022 1023 prv = &my_tlp->trl_spans; 1024 while (mdsp != NULL) { 1025 pfn_t p_end; 1026 1027 p_end = mdsp->mds_base + mdsp->mds_npgs; 1028 if (mdsp->mds_base >= base && 1029 p_end <= (base + npgs)) { 1030 *prv = mdsp->mds_next; 1031 mdsp->mds_next = NULL; 1032 free_delspans(mdsp); 1033 } else { 1034 prv = &mdsp->mds_next; 1035 } 1036 mdsp = *prv; 1037 } 1038 if (my_tlp->trl_spans == NULL) 1039 transit_list_remove(my_tlp); 1040 } 1041 } 1042 mutex_exit(&trh->trh_lock); 1043 } 1044 1045 /* 1046 * Reserve interface for add to stop delete before add finished. 1047 * This list is only accessed through the delspan_insert/remove 1048 * functions and so is fully protected by the mutex in struct transit_list. 1049 */ 1050 1051 static struct transit_list reserve_transit; 1052 1053 static int 1054 delspan_reserve(pfn_t base, pgcnt_t npgs) 1055 { 1056 struct memdelspan *mdsp; 1057 int ret; 1058 1059 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 1060 mdsp->mds_base = base; 1061 mdsp->mds_npgs = npgs; 1062 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 1063 free_delspans(mdsp); 1064 } 1065 return (ret); 1066 } 1067 1068 static void 1069 delspan_unreserve(pfn_t base, pgcnt_t npgs) 1070 { 1071 delspan_remove(&reserve_transit, base, npgs); 1072 } 1073 1074 /* 1075 * Return whether memseg was created by kphysm_add_memory_dynamic(). 
1076 */ 1077 static int 1078 memseg_is_dynamic(struct memseg *seg) 1079 { 1080 return (seg->msegflags & MEMSEG_DYNAMIC); 1081 } 1082 1083 int 1084 kphysm_del_span( 1085 memhandle_t handle, 1086 pfn_t base, 1087 pgcnt_t npgs) 1088 { 1089 struct mem_handle *mhp; 1090 struct memseg *seg; 1091 struct memdelspan *mdsp; 1092 struct memdelspan *mdsp_new; 1093 pgcnt_t phys_pages, vm_pages; 1094 pfn_t p_end; 1095 page_t *pp; 1096 int ret; 1097 1098 mhp = kphysm_lookup_mem_handle(handle); 1099 if (mhp == NULL) { 1100 return (KPHYSM_EHANDLE); 1101 } 1102 if (mhp->mh_state != MHND_INIT) { 1103 mutex_exit(&mhp->mh_mutex); 1104 return (KPHYSM_ESEQUENCE); 1105 } 1106 1107 /* 1108 * Intersect the span with the installed memory list (phys_install). 1109 */ 1110 mdsp_new = span_to_install(base, npgs); 1111 if (mdsp_new == NULL) { 1112 /* 1113 * No physical memory in this range. Is this an 1114 * error? If an attempt to start the delete is made 1115 * for OK returns from del_span such as this, start will 1116 * return an error. 1117 * Could return KPHYSM_ENOWORK. 1118 */ 1119 /* 1120 * It is assumed that there are no error returns 1121 * from span_to_install() due to kmem_alloc failure. 1122 */ 1123 mutex_exit(&mhp->mh_mutex); 1124 return (KPHYSM_OK); 1125 } 1126 /* 1127 * Does this span overlap an existing span? 1128 */ 1129 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1130 /* 1131 * Differentiate between already on list for this handle 1132 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 
1133 */ 1134 ret = KPHYSM_EBUSY; 1135 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1136 mdsp = mdsp->mds_next) { 1137 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1138 base, npgs)) { 1139 ret = KPHYSM_EDUP; 1140 break; 1141 } 1142 } 1143 mutex_exit(&mhp->mh_mutex); 1144 free_delspans(mdsp_new); 1145 return (ret); 1146 } 1147 /* 1148 * At this point the spans in mdsp_new have been inserted into the 1149 * list of spans for this handle and thereby to the global list of 1150 * spans being processed. Each of these spans must now be checked 1151 * for relocatability. As a side-effect segments in the memseg list 1152 * may be split. 1153 * 1154 * Note that mdsp_new can no longer be used as it is now part of 1155 * a larger list. Select elements of this larger list based 1156 * on base and npgs. 1157 */ 1158 restart: 1159 phys_pages = 0; 1160 vm_pages = 0; 1161 ret = KPHYSM_OK; 1162 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1163 mdsp = mdsp->mds_next) { 1164 pgcnt_t pages_checked; 1165 1166 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1167 continue; 1168 } 1169 p_end = mdsp->mds_base + mdsp->mds_npgs; 1170 /* 1171 * The pages_checked count is a hack. All pages should be 1172 * checked for relocatability. Those not covered by memsegs 1173 * should be tested with arch_kphysm_del_span_ok(). 1174 */ 1175 pages_checked = 0; 1176 for (seg = memsegs; seg; seg = seg->next) { 1177 pfn_t mseg_start; 1178 1179 if (seg->pages_base >= p_end || 1180 seg->pages_end <= mdsp->mds_base) { 1181 /* Span and memseg don't overlap. */ 1182 continue; 1183 } 1184 mseg_start = memseg_get_start(seg); 1185 /* Check that segment is suitable for delete. */ 1186 if (memseg_includes_meta(seg)) { 1187 /* 1188 * Check that this segment is completely 1189 * within the span. 
1190 */ 1191 if (mseg_start < mdsp->mds_base || 1192 seg->pages_end > p_end) { 1193 ret = KPHYSM_EBUSY; 1194 break; 1195 } 1196 pages_checked += seg->pages_end - mseg_start; 1197 } else { 1198 /* 1199 * If this segment is larger than the span, 1200 * try to split it. After the split, it 1201 * is necessary to restart. 1202 */ 1203 if (seg->pages_base < mdsp->mds_base || 1204 seg->pages_end > p_end) { 1205 pfn_t abase; 1206 pgcnt_t anpgs; 1207 int s_ret; 1208 1209 /* Split required. */ 1210 if (mdsp->mds_base < seg->pages_base) 1211 abase = seg->pages_base; 1212 else 1213 abase = mdsp->mds_base; 1214 if (p_end > seg->pages_end) 1215 anpgs = seg->pages_end - abase; 1216 else 1217 anpgs = p_end - abase; 1218 s_ret = kphysm_split_memseg(abase, 1219 anpgs); 1220 if (s_ret == 0) { 1221 /* Split failed. */ 1222 ret = KPHYSM_ERESOURCE; 1223 break; 1224 } 1225 goto restart; 1226 } 1227 pages_checked += 1228 seg->pages_end - seg->pages_base; 1229 } 1230 /* 1231 * The memseg is wholly within the delete span. 1232 * The individual pages can now be checked. 1233 */ 1234 /* Cage test. */ 1235 for (pp = seg->pages; pp < seg->epages; pp++) { 1236 if (PP_ISNORELOC(pp)) { 1237 ret = KPHYSM_ENONRELOC; 1238 break; 1239 } 1240 } 1241 if (ret != KPHYSM_OK) { 1242 break; 1243 } 1244 phys_pages += (seg->pages_end - mseg_start); 1245 vm_pages += MSEG_NPAGES(seg); 1246 } 1247 if (ret != KPHYSM_OK) 1248 break; 1249 if (pages_checked != mdsp->mds_npgs) { 1250 ret = KPHYSM_ENONRELOC; 1251 break; 1252 } 1253 } 1254 1255 if (ret == KPHYSM_OK) { 1256 mhp->mh_phys_pages += phys_pages; 1257 mhp->mh_vm_pages += vm_pages; 1258 } else { 1259 /* 1260 * Keep holding the mh_mutex to prevent it going away. 
1261 */ 1262 delspan_remove(&mhp->mh_transit, base, npgs); 1263 } 1264 mutex_exit(&mhp->mh_mutex); 1265 return (ret); 1266 } 1267 1268 int 1269 kphysm_del_span_query( 1270 pfn_t base, 1271 pgcnt_t npgs, 1272 memquery_t *mqp) 1273 { 1274 struct memdelspan *mdsp; 1275 struct memdelspan *mdsp_new; 1276 int done_first_nonreloc; 1277 1278 mqp->phys_pages = 0; 1279 mqp->managed = 0; 1280 mqp->nonrelocatable = 0; 1281 mqp->first_nonrelocatable = 0; 1282 mqp->last_nonrelocatable = 0; 1283 1284 mdsp_new = span_to_install(base, npgs); 1285 /* 1286 * It is OK to proceed here if mdsp_new == NULL. 1287 */ 1288 done_first_nonreloc = 0; 1289 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1290 pfn_t sbase; 1291 pgcnt_t snpgs; 1292 1293 mqp->phys_pages += mdsp->mds_npgs; 1294 sbase = mdsp->mds_base; 1295 snpgs = mdsp->mds_npgs; 1296 while (snpgs != 0) { 1297 struct memseg *lseg, *seg; 1298 pfn_t p_end; 1299 page_t *pp; 1300 pfn_t mseg_start; 1301 1302 p_end = sbase + snpgs; 1303 /* 1304 * Find the lowest addressed memseg that starts 1305 * after sbase and account for it. 1306 * This is to catch dynamic memsegs whose start 1307 * is hidden. 1308 */ 1309 seg = NULL; 1310 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1311 if ((lseg->pages_base >= sbase) || 1312 (lseg->pages_base < p_end && 1313 lseg->pages_end > sbase)) { 1314 if (seg == NULL || 1315 seg->pages_base > lseg->pages_base) 1316 seg = lseg; 1317 } 1318 } 1319 if (seg != NULL) { 1320 mseg_start = memseg_get_start(seg); 1321 /* 1322 * Now have the full extent of the memseg so 1323 * do the range check. 1324 */ 1325 if (mseg_start >= p_end || 1326 seg->pages_end <= sbase) { 1327 /* Span does not overlap memseg. */ 1328 seg = NULL; 1329 } 1330 } 1331 /* 1332 * Account for gap either before the segment if 1333 * there is one or to the end of the span. 1334 */ 1335 if (seg == NULL || mseg_start > sbase) { 1336 pfn_t a_end; 1337 1338 a_end = (seg == NULL) ? 
p_end : mseg_start; 1339 /* 1340 * Check with arch layer for relocatability. 1341 */ 1342 if (arch_kphysm_del_span_ok(sbase, 1343 (a_end - sbase))) { 1344 /* 1345 * No non-relocatble pages in this 1346 * area, avoid the fine-grained 1347 * test. 1348 */ 1349 snpgs -= (a_end - sbase); 1350 sbase = a_end; 1351 } 1352 while (sbase < a_end) { 1353 if (!arch_kphysm_del_span_ok(sbase, 1354 1)) { 1355 mqp->nonrelocatable++; 1356 if (!done_first_nonreloc) { 1357 mqp-> 1358 first_nonrelocatable 1359 = sbase; 1360 done_first_nonreloc = 1; 1361 } 1362 mqp->last_nonrelocatable = 1363 sbase; 1364 } 1365 sbase++; 1366 snpgs--; 1367 } 1368 } 1369 if (seg != NULL) { 1370 ASSERT(mseg_start <= sbase); 1371 if (seg->pages_base != mseg_start && 1372 seg->pages_base > sbase) { 1373 pgcnt_t skip_pgs; 1374 1375 /* 1376 * Skip the page_t area of a 1377 * dynamic memseg. 1378 */ 1379 skip_pgs = seg->pages_base - sbase; 1380 if (snpgs <= skip_pgs) { 1381 sbase += snpgs; 1382 snpgs = 0; 1383 continue; 1384 } 1385 snpgs -= skip_pgs; 1386 sbase += skip_pgs; 1387 } 1388 ASSERT(snpgs != 0); 1389 ASSERT(seg->pages_base <= sbase); 1390 /* 1391 * The individual pages can now be checked. 
1392 */ 1393 for (pp = seg->pages + 1394 (sbase - seg->pages_base); 1395 snpgs != 0 && pp < seg->epages; pp++) { 1396 mqp->managed++; 1397 if (PP_ISNORELOC(pp)) { 1398 mqp->nonrelocatable++; 1399 if (!done_first_nonreloc) { 1400 mqp-> 1401 first_nonrelocatable 1402 = sbase; 1403 done_first_nonreloc = 1; 1404 } 1405 mqp->last_nonrelocatable = 1406 sbase; 1407 } 1408 sbase++; 1409 snpgs--; 1410 } 1411 } 1412 } 1413 } 1414 1415 free_delspans(mdsp_new); 1416 1417 return (KPHYSM_OK); 1418 } 1419 1420 /* 1421 * This release function can be called at any stage as follows: 1422 * _gethandle only called 1423 * _span(s) only called 1424 * _start called but failed 1425 * delete thread exited 1426 */ 1427 int 1428 kphysm_del_release(memhandle_t handle) 1429 { 1430 struct mem_handle *mhp; 1431 1432 mhp = kphysm_lookup_mem_handle(handle); 1433 if (mhp == NULL) { 1434 return (KPHYSM_EHANDLE); 1435 } 1436 switch (mhp->mh_state) { 1437 case MHND_STARTING: 1438 case MHND_RUNNING: 1439 mutex_exit(&mhp->mh_mutex); 1440 return (KPHYSM_ENOTFINISHED); 1441 case MHND_FREE: 1442 ASSERT(mhp->mh_state != MHND_FREE); 1443 mutex_exit(&mhp->mh_mutex); 1444 return (KPHYSM_EHANDLE); 1445 case MHND_INIT: 1446 break; 1447 case MHND_DONE: 1448 break; 1449 case MHND_RELEASE: 1450 mutex_exit(&mhp->mh_mutex); 1451 return (KPHYSM_ESEQUENCE); 1452 default: 1453 #ifdef DEBUG 1454 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1455 (void *)mhp, mhp->mh_state); 1456 #endif /* DEBUG */ 1457 mutex_exit(&mhp->mh_mutex); 1458 return (KPHYSM_EHANDLE); 1459 } 1460 /* 1461 * Set state so that we can wait if necessary. 1462 * Also this means that we have read/write access to all 1463 * fields except mh_exthandle and mh_state. 1464 */ 1465 mhp->mh_state = MHND_RELEASE; 1466 /* 1467 * The mem_handle cannot be de-allocated by any other operation 1468 * now, so no need to hold mh_mutex. 
1469 */ 1470 mutex_exit(&mhp->mh_mutex); 1471 1472 delspan_remove(&mhp->mh_transit, 0, 0); 1473 mhp->mh_phys_pages = 0; 1474 mhp->mh_vm_pages = 0; 1475 mhp->mh_hold_todo = 0; 1476 mhp->mh_delete_complete = NULL; 1477 mhp->mh_delete_complete_arg = NULL; 1478 mhp->mh_cancel = 0; 1479 1480 mutex_enter(&mhp->mh_mutex); 1481 ASSERT(mhp->mh_state == MHND_RELEASE); 1482 mhp->mh_state = MHND_FREE; 1483 1484 kphysm_free_mem_handle(mhp); 1485 1486 return (KPHYSM_OK); 1487 } 1488 1489 /* 1490 * This cancel function can only be called with the thread running. 1491 */ 1492 int 1493 kphysm_del_cancel(memhandle_t handle) 1494 { 1495 struct mem_handle *mhp; 1496 1497 mhp = kphysm_lookup_mem_handle(handle); 1498 if (mhp == NULL) { 1499 return (KPHYSM_EHANDLE); 1500 } 1501 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1502 mutex_exit(&mhp->mh_mutex); 1503 return (KPHYSM_ENOTRUNNING); 1504 } 1505 /* 1506 * Set the cancel flag and wake the delete thread up. 1507 * The thread may be waiting on I/O, so the effect of the cancel 1508 * may be delayed. 1509 */ 1510 if (mhp->mh_cancel == 0) { 1511 mhp->mh_cancel = KPHYSM_ECANCELLED; 1512 cv_signal(&mhp->mh_cv); 1513 } 1514 mutex_exit(&mhp->mh_mutex); 1515 return (KPHYSM_OK); 1516 } 1517 1518 int 1519 kphysm_del_status( 1520 memhandle_t handle, 1521 memdelstat_t *mdstp) 1522 { 1523 struct mem_handle *mhp; 1524 1525 mhp = kphysm_lookup_mem_handle(handle); 1526 if (mhp == NULL) { 1527 return (KPHYSM_EHANDLE); 1528 } 1529 /* 1530 * Calling kphysm_del_status() is allowed before the delete 1531 * is started to allow for status display. 
1532 */ 1533 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1534 mhp->mh_state != MHND_RUNNING) { 1535 mutex_exit(&mhp->mh_mutex); 1536 return (KPHYSM_ENOTRUNNING); 1537 } 1538 mdstp->phys_pages = mhp->mh_phys_pages; 1539 mdstp->managed = mhp->mh_vm_pages; 1540 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1541 mutex_exit(&mhp->mh_mutex); 1542 return (KPHYSM_OK); 1543 } 1544 1545 static int mem_delete_additional_pages = 100; 1546 1547 static int 1548 can_remove_pgs(pgcnt_t npgs) 1549 { 1550 /* 1551 * If all pageable pages were paged out, freemem would 1552 * equal availrmem. There is a minimum requirement for 1553 * availrmem. 1554 */ 1555 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1556 < npgs) 1557 return (0); 1558 /* TODO: check swap space, etc. */ 1559 return (1); 1560 } 1561 1562 static int 1563 get_availrmem(pgcnt_t npgs) 1564 { 1565 int ret; 1566 1567 mutex_enter(&freemem_lock); 1568 ret = can_remove_pgs(npgs); 1569 if (ret != 0) 1570 availrmem -= npgs; 1571 mutex_exit(&freemem_lock); 1572 return (ret); 1573 } 1574 1575 static void 1576 put_availrmem(pgcnt_t npgs) 1577 { 1578 mutex_enter(&freemem_lock); 1579 availrmem += npgs; 1580 mutex_exit(&freemem_lock); 1581 } 1582 1583 #define FREEMEM_INCR 100 1584 static pgcnt_t freemem_incr = FREEMEM_INCR; 1585 #define DEL_FREE_WAIT_FRAC 4 1586 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1587 1588 #define DEL_BUSY_WAIT_FRAC 20 1589 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1590 1591 static void kphysm_del_cleanup(struct mem_handle *); 1592 1593 static void page_delete_collect(page_t *, struct mem_handle *); 1594 1595 static pgcnt_t 1596 delthr_get_freemem(struct mem_handle *mhp) 1597 { 1598 pgcnt_t free_get; 1599 int ret; 1600 1601 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1602 1603 MDSTAT_INCR(mhp, need_free); 1604 /* 1605 * Get up to freemem_incr pages. 
1606 */ 1607 free_get = freemem_incr; 1608 if (free_get > mhp->mh_hold_todo) 1609 free_get = mhp->mh_hold_todo; 1610 /* 1611 * Take free_get pages away from freemem, 1612 * waiting if necessary. 1613 */ 1614 1615 while (!mhp->mh_cancel) { 1616 mutex_exit(&mhp->mh_mutex); 1617 MDSTAT_INCR(mhp, free_loop); 1618 /* 1619 * Duplicate test from page_create_throttle() 1620 * but don't override with !PG_WAIT. 1621 */ 1622 if (freemem < (free_get + throttlefree)) { 1623 MDSTAT_INCR(mhp, free_low); 1624 ret = 0; 1625 } else { 1626 ret = page_create_wait(free_get, 0); 1627 if (ret == 0) { 1628 /* EMPTY */ 1629 MDSTAT_INCR(mhp, free_failed); 1630 } 1631 } 1632 if (ret != 0) { 1633 mutex_enter(&mhp->mh_mutex); 1634 return (free_get); 1635 } 1636 1637 /* 1638 * Put pressure on pageout. 1639 */ 1640 page_needfree(free_get); 1641 cv_signal(&proc_pageout->p_cv); 1642 1643 mutex_enter(&mhp->mh_mutex); 1644 (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, 1645 DEL_FREE_WAIT_TICKS, TR_CLOCK_TICK); 1646 mutex_exit(&mhp->mh_mutex); 1647 page_needfree(-(spgcnt_t)free_get); 1648 1649 mutex_enter(&mhp->mh_mutex); 1650 } 1651 return (0); 1652 } 1653 1654 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1655 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1656 /* 1657 * This function is run as a helper thread for delete_memory_thread. 1658 * It is needed in order to force kaio cleanup, so that pages used in kaio 1659 * will be unlocked and subsequently relocated by delete_memory_thread. 1660 * The address of the delete_memory_threads's mem_handle is passed in to 1661 * this thread function, and is used to set the mh_aio_cleanup_done member 1662 * prior to calling thread_exit(). 
 */
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned;
	int n = 0;	/* consecutive passes made without a delay */
	struct mem_handle *mhp;
	volatile uint_t *pcancel;

	mhp = (struct mem_handle *)amhp;
	ASSERT(mhp != NULL);
	/* Polled via a volatile pointer; set by delete_memory_thread. */
	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
	if (modload("sys", "kaio") == -1) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
		thread_exit();
	}
	/* The cleanup routine is looked up by name at run time. */
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN,
		    "aio_cleanup_dr_delete_memory not found in kaio");
		thread_exit();
	}
	do {
		cleaned = 0;
		mutex_enter(&pidlock);
		for (procp = practive; (*pcancel == 0) && (procp != NULL);
		    procp = procp->p_next) {
			mutex_enter(&procp->p_lock);
			if (procp->p_aio != NULL) {
				/* cleanup proc's outstanding kaio */
				cleaned +=
				    (*aio_cleanup_dr_delete_memory)(procp);
			}
			mutex_exit(&procp->p_lock);
		}
		mutex_exit(&pidlock);
		/*
		 * Pause when a pass cleaned nothing, or after
		 * DR_AIO_CLEANUP_MAXLOOPS_NODELAY passes in a row.
		 */
		if ((*pcancel == 0) &&
		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
			/* delay a bit before retrying all procs again */
			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
			n = 0;
		}
	} while (*pcancel == 0);
	mhp->mh_aio_cleanup_done = 1;
	thread_exit();
}

/*
 * Thread body that carries out a memory delete.  amhp is the
 * struct mem_handle describing the delete.
 *
 * Outline:
 *  - move the handle from MHND_STARTING to MHND_RUNNING;
 *  - take mh_vm_pages out of availrmem (KPHYSM_ENOTVIABLE if refused);
 *  - call kphysm_setup_pre_del() (non-zero result: KPHYSM_EREFUSED);
 *  - allocate per-span bitmaps and start dr_aio_cleanup_thread;
 *  - repeatedly scan every pfn of every span, collecting each page via
 *    page_delete_collect() once it is free, destroyed or relocated, until
 *    mh_hold_todo reaches zero or mh_cancel is set;
 *  - on cancel, free the collected pages and restore availrmem; otherwise
 *    run kphysm_del_cleanup() and remove the ranges from the mem nodes;
 *  - set MHND_DONE and call the completion callback with the result code.
 *
 * mh_mutex is held around accesses to the handle and the span bitmaps
 * and is dropped around the per-page work.
 */
static void
delete_memory_thread(caddr_t amhp)
{
	struct mem_handle *mhp;
	struct memdelspan *mdsp;
	callb_cpr_t cprinfo;
	page_t *pp_targ;
	spgcnt_t freemem_left;
	void (*del_complete_funcp)(void *, int error);
	void *del_complete_arg;
	int comp_code;
	int ret;
	int first_scan;
	uint_t szc;
#ifdef MEM_DEL_STATS
	uint64_t start_total, ntick_total;
	uint64_t start_pgrp, ntick_pgrp;
#endif /* MEM_DEL_STATS */

	mhp = (struct mem_handle *)amhp;

#ifdef MEM_DEL_STATS
	start_total = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */

	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
	    callb_generic_cpr, "memdel");

	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_STARTING);

	mhp->mh_state = MHND_RUNNING;
	mhp->mh_thread_id = curthread;

	/* Every managed page of the delete still has to be collected. */
	mhp->mh_hold_todo = mhp->mh_vm_pages;
	mutex_exit(&mhp->mh_mutex);

	/* Allocate the remap pages now, if necessary. */
	memseg_remap_init();

	/*
	 * Subtract from availrmem now if possible as availrmem
	 * may not be available by the end of the delete.
	 */
	if (!get_availrmem(mhp->mh_vm_pages)) {
		comp_code = KPHYSM_ENOTVIABLE;
		mutex_enter(&mhp->mh_mutex);
		goto early_exit;
	}

	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);

	mutex_enter(&mhp->mh_mutex);

	if (ret != 0) {
		mhp->mh_cancel = KPHYSM_EREFUSED;
		goto refused;
	}

	transit_list_collect(mhp, 1);

	/*
	 * Two bitmaps per span, indexed by (pfn - mds_base): mds_bitmap
	 * marks pfns that need no further work, mds_bitmap_retired marks
	 * those that were found retired.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap == NULL);
		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
		    KM_SLEEP);
	}

	first_scan = 1;
	freemem_left = 0;
	/*
	 * Start dr_aio_cleanup_thread, which periodically iterates
	 * through the process list and invokes aio cleanup.  This
	 * is needed in order to avoid a deadly embrace between the
	 * delete_memory_thread (waiting on writer lock for page, with the
	 * exclusive-wanted bit set), kaio read request threads (waiting for a
	 * reader lock on the same page that is wanted by the
	 * delete_memory_thread), and threads waiting for kaio completion
	 * (blocked on spt_amp->lock).
	 */
	mhp->mh_dr_aio_cleanup_cancel = 0;
	mhp->mh_aio_cleanup_done = 0;
	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
	/*
	 * Main collection loop.  mh_mutex is held at the top of every
	 * iteration of the loops below; each path that drops it re-takes
	 * it before "continue" or "break".
	 */
	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
		pgcnt_t collected;

		MDSTAT_INCR(mhp, nloop);
		collected = 0;
		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
			pfn_t pfn, p_end;

			p_end = mdsp->mds_base + mdsp->mds_npgs;
			for (pfn = mdsp->mds_base; (pfn < p_end) &&
			    (mhp->mh_cancel == 0); pfn++) {
				page_t *pp, *tpp, *tpp_targ;
				pgcnt_t bit;
				struct vnode *vp;
				u_offset_t offset;
				int mod, result;
				spgcnt_t pgcnt;

				bit = pfn - mdsp->mds_base;
				if ((mdsp->mds_bitmap[bit / NBPBMW] &
				    (1 << (bit % NBPBMW))) != 0) {
					MDSTAT_INCR(mhp, already_done);
					continue;
				}
				if (freemem_left == 0) {
					freemem_left += delthr_get_freemem(mhp);
					/* 0 here means the delete was cancelled. */
					if (freemem_left == 0)
						break;
				}

				/*
				 * Release mh_mutex - some of this
				 * stuff takes some time (eg PUTPAGE).
				 */

				mutex_exit(&mhp->mh_mutex);
				MDSTAT_INCR(mhp, ncheck);

				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					/*
					 * Not covered by a page_t - will
					 * be dealt with elsewhere.
					 */
					MDSTAT_INCR(mhp, nopaget);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}

				if (!page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					/*
					 * Page in use elsewhere.  Skip it.
					 */
					MDSTAT_INCR(mhp, lockfail);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * See if the cage expanded into the delete.
				 * This can happen as we have to allow the
				 * cage to expand.
				 */
				if (PP_ISNORELOC(pp)) {
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					mhp->mh_cancel = KPHYSM_ENONRELOC;
					break;
				}
				if (PP_RETIRED(pp)) {
					/*
					 * Page has been retired and is
					 * not part of the cage so we
					 * can now do the accounting for
					 * it.
					 */
					MDSTAT_INCR(mhp, retired);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW]
					    |= (1 << (bit % NBPBMW));
					mdsp->mds_bitmap_retired[bit /
					    NBPBMW] |=
					    (1 << (bit % NBPBMW));
					mhp->mh_hold_todo--;
					continue;
				}
				ASSERT(freemem_left != 0);
				if (PP_ISFREE(pp)) {
					/*
					 * Like page_reclaim() only 'freemem'
					 * processing is already done.
					 */
					MDSTAT_INCR(mhp, nfree);
				/* Also entered by goto after a PUTPAGE. */
				free_page_collect:
					if (PP_ISAGED(pp)) {
						page_list_sub(pp,
						    PG_FREE_LIST);
					} else {
						page_list_sub(pp,
						    PG_CACHE_LIST);
					}
					PP_CLRFREE(pp);
					PP_CLRAGED(pp);
					collected++;
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					freemem_left--;
					continue;
				}
				ASSERT(pp->p_vnode != NULL);
				/* The first pass only collects free pages. */
				if (first_scan) {
					MDSTAT_INCR(mhp, first_notfree);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * Keep stats on pages encountered that
				 * are marked for retirement.
				 */
				if (PP_TOXIC(pp)) {
					MDSTAT_INCR(mhp, toxic);
				} else if (PP_PR_REQ(pp)) {
					MDSTAT_INCR(mhp, failing);
				}
				/*
				 * In certain cases below, special exceptions
				 * are made for pages that are toxic.  This
				 * is because the current meaning of toxic
				 * is that an uncorrectable error has been
				 * previously associated with the page.
				 */
				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
					if (!PP_TOXIC(pp)) {
						/*
						 * Must relocate locked in
						 * memory pages.
						 */
#ifdef MEM_DEL_STATS
						start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
						/*
						 * Lock all constituent pages
						 * of a large page to ensure
						 * that p_szc won't change.
						 */
						if (!group_page_trylock(pp,
						    SE_EXCL)) {
							MDSTAT_INCR(mhp,
							    gptllckfail);
							page_unlock(pp);
							mutex_enter(
							    &mhp->mh_mutex);
							continue;
						}
						MDSTAT_INCR(mhp, npplocked);
						pp_targ =
						    page_get_replacement_page(
						    pp, NULL, 0);
						if (pp_targ != NULL) {
#ifdef MEM_DEL_STATS
							ntick_pgrp =
							    (uint64_t)
							    ddi_get_lbolt() -
							    start_pgrp;
#endif /* MEM_DEL_STATS */
							MDSTAT_PGRP(mhp,
							    ntick_pgrp);
							MDSTAT_INCR(mhp,
							    nlockreloc);
							goto reloc;
						}
						group_page_unlock(pp);
						page_unlock(pp);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						MDSTAT_INCR(mhp, nnorepl);
						mutex_enter(&mhp->mh_mutex);
						continue;
					} else {
						/*
						 * Cannot do anything about
						 * this page because it is
						 * toxic.
						 */
						MDSTAT_INCR(mhp, npplkdtoxic);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
				}
				/*
				 * Unload the mappings and check if mod bit
				 * is set.
				 */
				ASSERT(!PP_ISKAS(pp));
				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
				mod = hat_ismod(pp);

#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				if (mod && !PP_TOXIC(pp)) {
					/*
					 * Lock all constituent pages
					 * of a large page to ensure
					 * that p_szc won't change.
					 */
					if (!group_page_trylock(pp, SE_EXCL)) {
						MDSTAT_INCR(mhp, gptlmodfail);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
					pp_targ = page_get_replacement_page(pp,
					    NULL, 0);
					if (pp_targ != NULL) {
						MDSTAT_INCR(mhp, nmodreloc);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						goto reloc;
					}
					/* No replacement: try page-out below. */
					group_page_unlock(pp);
				}

				if (!page_try_demote_pages(pp)) {
					MDSTAT_INCR(mhp, demotefail);
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * Regular 'page-out'.
				 */
				if (!mod) {
					MDSTAT_INCR(mhp, ndestroy);
					page_destroy(pp, 1);
					/*
					 * page_destroy was called with
					 * dontfree. As long as p_lckcnt
					 * and p_cowcnt are both zero, the
					 * only additional action of
					 * page_destroy with !dontfree is to
					 * call page_free, so we can collect
					 * the page here.
					 */
					collected++;
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}
				/*
				 * The page is toxic and the mod bit is
				 * set, we cannot do anything here to deal
				 * with it.
				 */
				if (PP_TOXIC(pp)) {
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					MDSTAT_INCR(mhp, modtoxic);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				MDSTAT_INCR(mhp, nputpage);
				/*
				 * Hold the vnode across the page unlock so
				 * that it can be passed to VOP_PUTPAGE().
				 */
				vp = pp->p_vnode;
				offset = pp->p_offset;
				VN_HOLD(vp);
				page_unlock(pp);
				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
				    B_INVAL|B_FORCE, kcred, NULL);
				VN_RELE(vp);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				/*
				 * Try to get the page back immediately
				 * so that it can be collected.
				 */
				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					MDSTAT_INCR(mhp, nnoreclaim);
					/*
					 * This should not happen as this
					 * thread is deleting the page.
					 * If this code is generalized, this
					 * becomes a reality.
					 */
#ifdef DEBUG
					cmn_err(CE_WARN,
					    "delete_memory_thread(0x%p) "
					    "pfn 0x%lx has no page_t",
					    (void *)mhp, pfn);
#endif /* DEBUG */
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				if (page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					if (PP_ISFREE(pp)) {
						goto free_page_collect;
					}
					page_unlock(pp);
				}
				MDSTAT_INCR(mhp, nnoreclaim);
				mutex_enter(&mhp->mh_mutex);
				continue;

			reloc:
				/*
				 * Got some freemem and a target
				 * page, so move the data to avoid
				 * I/O and lock problems.
				 */
				ASSERT(!page_iolock_assert(pp));
				MDSTAT_INCR(mhp, nreloc);
				/*
				 * page_relocate() will return pgcnt: the
				 * number of consecutive pages relocated.
				 * If it is successful, pp will be a
				 * linked list of the page structs that
				 * were relocated. If page_relocate() is
				 * unsuccessful, pp will be unmodified.
				 */
#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				result = page_relocate(&pp, &pp_targ, 0, 0,
				    &pgcnt, NULL);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				if (result != 0) {
					MDSTAT_INCR(mhp, nrelocfail);
					/*
					 * We did not succeed. We need
					 * to give the pp_targ pages back.
					 * page_free(pp_targ, 1) without
					 * the freemem accounting.
					 */
					group_page_unlock(pp);
					page_free_replacement_page(pp_targ);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * We will then collect pgcnt pages.
				 */
				ASSERT(pgcnt > 0);
				mutex_enter(&mhp->mh_mutex);
				/*
				 * We need to make sure freemem_left is
				 * large enough.
				 */
				while ((freemem_left < pgcnt) &&
				    (!mhp->mh_cancel)) {
					freemem_left +=
					    delthr_get_freemem(mhp);
				}

				/*
				 * Do not proceed if mh_cancel is set.
				 */
				if (mhp->mh_cancel) {
					while (pp_targ != NULL) {
						/*
						 * Unlink and unlock each page.
						 */
						tpp_targ = pp_targ;
						page_sub(&pp_targ, tpp_targ);
						page_unlock(tpp_targ);
					}
					/*
					 * We need to give the pp pages back.
					 * page_free(pp, 1) without the
					 * freemem accounting.
					 */
					page_free_replacement_page(pp);
					break;
				}

				/* Now remove pgcnt from freemem_left */
				freemem_left -= pgcnt;
				ASSERT(freemem_left >= 0);
				szc = pp->p_szc;
				while (pp != NULL) {
					/*
					 * pp and pp_targ were passed back as
					 * a linked list of pages.
					 * Unlink and unlock each page.
					 */
					tpp_targ = pp_targ;
					page_sub(&pp_targ, tpp_targ);
					page_unlock(tpp_targ);
					/*
					 * The original page is now free
					 * so remove it from the linked
					 * list and collect it.
					 */
					tpp = pp;
					page_sub(&pp, tpp);
					/*
					 * Note: this overwrites the scan
					 * loop's pfn with that of the page
					 * just collected.
					 */
					pfn = page_pptonum(tpp);
					collected++;
					ASSERT(PAGE_EXCL(tpp));
					ASSERT(tpp->p_vnode == NULL);
					ASSERT(!hat_page_is_mapped(tpp));
					ASSERT(tpp->p_szc == szc);
					tpp->p_szc = 0;
					page_delete_collect(tpp, mhp);
					bit = pfn - mdsp->mds_base;
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
				}
				ASSERT(pp_targ == NULL);
			}
		}
		first_scan = 0;
		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
		    (collected == 0)) {
			/*
			 * This code is needed as we cannot wait
			 * for a page to be locked OR the delete to
			 * be cancelled.  Also, we must delay so
			 * that other threads get a chance to run
			 * on our cpu, otherwise page locks may be
			 * held indefinitely by those threads.
			 */
			MDSTAT_INCR(mhp, ndelay);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex,
			    DEL_BUSY_WAIT_TICKS, TR_CLOCK_TICK);
			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
		}
	}
	/* stop the dr aio cleanup thread */
	mhp->mh_dr_aio_cleanup_cancel = 1;
	transit_list_collect(mhp, 0);
	if (freemem_left != 0) {
		/* Return any surplus. */
		page_create_putback(freemem_left);
		freemem_left = 0;
	}
#ifdef MEM_DEL_STATS
	ntick_total = (uint64_t)ddi_get_lbolt() - start_total;
#endif /* MEM_DEL_STATS */
	MDSTAT_TOTAL(mhp, ntick_total);
	MDSTAT_PRINT(mhp);

	/*
	 * If the memory delete was cancelled, exclusive-wanted bits must
	 * be cleared. If there are retired pages being deleted, they need
	 * to be unretired.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pfn_t pfn, p_end;

		p_end = mdsp->mds_base + mdsp->mds_npgs;
		for (pfn = mdsp->mds_base; pfn < p_end; pfn++) {
			page_t *pp;
			pgcnt_t bit;

			bit = pfn - mdsp->mds_base;
			if (mhp->mh_cancel) {
				pp = page_numtopp_nolock(pfn);
				if (pp != NULL) {
					/* Only for pages not marked done. */
					if ((mdsp->mds_bitmap[bit / NBPBMW] &
					    (1 << (bit % NBPBMW))) == 0) {
						page_lock_clr_exclwanted(pp);
					}
				}
			} else {
				pp = NULL;
			}
			if ((mdsp->mds_bitmap_retired[bit / NBPBMW] &
			    (1 << (bit % NBPBMW))) != 0) {
				/* do we already have pp? */
				if (pp == NULL) {
					pp = page_numtopp_nolock(pfn);
				}
				ASSERT(pp != NULL);
				ASSERT(PP_RETIRED(pp));
				if (mhp->mh_cancel != 0) {
					page_unlock(pp);
					/*
					 * To satisfy ASSERT below in
					 * cancel code.
					 */
					mhp->mh_hold_todo++;
				} else {
					(void) page_unretire_pp(pp,
					    PR_UNR_CLEAN);
				}
			}
		}
	}
	/*
	 * Free retired page bitmap and collected page bitmap
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap_retired != NULL);
		kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap_retired = NULL;	/* Paranoia. */
		ASSERT(mdsp->mds_bitmap != NULL);
		kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp));
		mdsp->mds_bitmap = NULL;	/* Paranoia. */
	}

	/* wait for our dr aio cancel thread to exit */
	while (!(mhp->mh_aio_cleanup_done)) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
		CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
	}
refused:
	/* Also reached directly when kphysm_setup_pre_del() refused. */
	if (mhp->mh_cancel != 0) {
		page_t *pp;

		comp_code = mhp->mh_cancel;
		/*
		 * Go through list of deleted pages (mh_deleted) freeing
		 * them.
		 */
		while ((pp = mhp->mh_deleted) != NULL) {
			mhp->mh_deleted = pp->p_next;
			mhp->mh_hold_todo++;
			mutex_exit(&mhp->mh_mutex);
			/* Restore p_next. */
			pp->p_next = pp->p_prev;
			if (PP_ISFREE(pp)) {
				cmn_err(CE_PANIC,
				    "page %p is free",
				    (void *)pp);
			}
			page_free(pp, 1);
			mutex_enter(&mhp->mh_mutex);
		}
		ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages);

		/* Undo the get_availrmem() done at the start. */
		mutex_exit(&mhp->mh_mutex);
		put_availrmem(mhp->mh_vm_pages);
		mutex_enter(&mhp->mh_mutex);

		goto t_exit;
	}

	/*
	 * All the pages are no longer in use and are exclusively locked.
	 */

	mhp->mh_deleted = NULL;

	kphysm_del_cleanup(mhp);

	/*
	 * mem_node_del_range needs to be after kphysm_del_cleanup so
	 * that the mem_node_config[] will remain intact for the cleanup.
	 */
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		mem_node_del_range(mdsp->mds_base,
		    mdsp->mds_base + mdsp->mds_npgs - 1);
	}
	/* cleanup the page counters */
	page_ctrs_cleanup();

	comp_code = KPHYSM_OK;

t_exit:
	mutex_exit(&mhp->mh_mutex);
	/* Second argument is 0 for a successful delete, 1 otherwise. */
	kphysm_setup_post_del(mhp->mh_vm_pages,
	    (comp_code == KPHYSM_OK) ? 0 : 1);
	mutex_enter(&mhp->mh_mutex);

early_exit:
	/* mhp->mh_mutex exited by CALLB_CPR_EXIT() */
	mhp->mh_state = MHND_DONE;
	/* Copy the callback out while the mutex is still held. */
	del_complete_funcp = mhp->mh_delete_complete;
	del_complete_arg = mhp->mh_delete_complete_arg;
	CALLB_CPR_EXIT(&cprinfo);
	(*del_complete_funcp)(del_complete_arg, comp_code);
	thread_exit();
	/*NOTREACHED*/
}

/*
 * Start the delete of the memory from the system.
2423 */ 2424 int 2425 kphysm_del_start( 2426 memhandle_t handle, 2427 void (*complete)(void *, int), 2428 void *complete_arg) 2429 { 2430 struct mem_handle *mhp; 2431 2432 mhp = kphysm_lookup_mem_handle(handle); 2433 if (mhp == NULL) { 2434 return (KPHYSM_EHANDLE); 2435 } 2436 switch (mhp->mh_state) { 2437 case MHND_FREE: 2438 ASSERT(mhp->mh_state != MHND_FREE); 2439 mutex_exit(&mhp->mh_mutex); 2440 return (KPHYSM_EHANDLE); 2441 case MHND_INIT: 2442 break; 2443 case MHND_STARTING: 2444 case MHND_RUNNING: 2445 mutex_exit(&mhp->mh_mutex); 2446 return (KPHYSM_ESEQUENCE); 2447 case MHND_DONE: 2448 mutex_exit(&mhp->mh_mutex); 2449 return (KPHYSM_ESEQUENCE); 2450 case MHND_RELEASE: 2451 mutex_exit(&mhp->mh_mutex); 2452 return (KPHYSM_ESEQUENCE); 2453 default: 2454 #ifdef DEBUG 2455 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2456 (void *)mhp, mhp->mh_state); 2457 #endif /* DEBUG */ 2458 mutex_exit(&mhp->mh_mutex); 2459 return (KPHYSM_EHANDLE); 2460 } 2461 2462 if (mhp->mh_transit.trl_spans == NULL) { 2463 mutex_exit(&mhp->mh_mutex); 2464 return (KPHYSM_ENOWORK); 2465 } 2466 2467 ASSERT(complete != NULL); 2468 mhp->mh_delete_complete = complete; 2469 mhp->mh_delete_complete_arg = complete_arg; 2470 mhp->mh_state = MHND_STARTING; 2471 /* 2472 * Release the mutex in case thread_create sleeps. 2473 */ 2474 mutex_exit(&mhp->mh_mutex); 2475 2476 /* 2477 * The "obvious" process for this thread is pageout (proc_pageout) 2478 * but this gives the thread too much power over freemem 2479 * which results in freemem starvation. 2480 */ 2481 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2482 TS_RUN, maxclsyspri - 1); 2483 2484 return (KPHYSM_OK); 2485 } 2486 2487 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2488 static caddr_t pp_dummy; 2489 static pgcnt_t pp_dummy_npages; 2490 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. 
*/ 2491 2492 static void 2493 memseg_remap_init_pages(page_t *pages, page_t *epages) 2494 { 2495 page_t *pp; 2496 2497 for (pp = pages; pp < epages; pp++) { 2498 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2499 pp->p_offset = (u_offset_t)-1; 2500 page_iolock_init(pp); 2501 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2502 continue; 2503 page_lock_delete(pp); 2504 } 2505 } 2506 2507 void 2508 memseg_remap_init() 2509 { 2510 mutex_enter(&pp_dummy_lock); 2511 if (pp_dummy == NULL) { 2512 uint_t dpages; 2513 int i; 2514 2515 /* 2516 * dpages starts off as the size of the structure and 2517 * ends up as the minimum number of pages that will 2518 * hold a whole number of page_t structures. 2519 */ 2520 dpages = sizeof (page_t); 2521 ASSERT(dpages != 0); 2522 ASSERT(dpages <= MMU_PAGESIZE); 2523 2524 while ((dpages & 1) == 0) 2525 dpages >>= 1; 2526 2527 pp_dummy_npages = dpages; 2528 /* 2529 * Allocate pp_dummy pages directly from static_arena, 2530 * since these are whole page allocations and are 2531 * referenced by physical address. This also has the 2532 * nice fringe benefit of hiding the memory from 2533 * ::findleaks since it doesn't deal well with allocated 2534 * kernel heap memory that doesn't have any mappings. 2535 */ 2536 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2537 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2538 bzero(pp_dummy, ptob(pp_dummy_npages)); 2539 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2540 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2541 pp_dummy_npages, KM_SLEEP); 2542 for (i = 0; i < pp_dummy_npages; i++) { 2543 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2544 &pp_dummy[MMU_PAGESIZE * i]); 2545 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2546 } 2547 /* 2548 * Initialize the page_t's to a known 'deleted' state 2549 * that matches the state of deleted pages. 
2550 */ 2551 memseg_remap_init_pages((page_t *)pp_dummy, 2552 (page_t *)(pp_dummy + ptob(pp_dummy_npages))); 2553 /* Remove kmem mappings for the pages for safety. */ 2554 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2555 HAT_UNLOAD_UNLOCK); 2556 /* Leave pp_dummy pointer set as flag that init is done. */ 2557 } 2558 mutex_exit(&pp_dummy_lock); 2559 } 2560 2561 /* 2562 * Remap a page-aglined range of page_t's to dummy pages. 2563 */ 2564 void 2565 remap_to_dummy(caddr_t va, pgcnt_t metapgs) 2566 { 2567 int phase; 2568 2569 ASSERT(IS_P2ALIGNED((uint64_t)(uintptr_t)va, PAGESIZE)); 2570 2571 /* 2572 * We may start remapping at a non-zero page offset 2573 * within the dummy pages since the low/high ends 2574 * of the outgoing pp's could be shared by other 2575 * memsegs (see memseg_remap_meta). 2576 */ 2577 phase = btop((uint64_t)(uintptr_t)va) % pp_dummy_npages; 2578 /*CONSTCOND*/ 2579 ASSERT(PAGESIZE % sizeof (page_t) || phase == 0); 2580 2581 while (metapgs != 0) { 2582 pgcnt_t n; 2583 int i, j; 2584 2585 n = pp_dummy_npages; 2586 if (n > metapgs) 2587 n = metapgs; 2588 for (i = 0; i < n; i++) { 2589 j = (i + phase) % pp_dummy_npages; 2590 hat_devload(kas.a_hat, va, ptob(1), pp_dummy_pfn[j], 2591 PROT_READ, 2592 HAT_LOAD | HAT_LOAD_NOCONSIST | 2593 HAT_LOAD_REMAP); 2594 va += ptob(1); 2595 } 2596 metapgs -= n; 2597 } 2598 } 2599 2600 static void 2601 memseg_remap_to_dummy(struct memseg *seg) 2602 { 2603 caddr_t pp; 2604 pgcnt_t metapgs; 2605 2606 ASSERT(memseg_is_dynamic(seg)); 2607 ASSERT(pp_dummy != NULL); 2608 2609 2610 if (!memseg_includes_meta(seg)) { 2611 memseg_remap_meta(seg); 2612 return; 2613 } 2614 2615 pp = (caddr_t)seg->pages; 2616 metapgs = seg->pages_base - memseg_get_start(seg); 2617 ASSERT(metapgs != 0); 2618 2619 seg->pages_end = seg->pages_base; 2620 2621 remap_to_dummy(pp, metapgs); 2622 } 2623 2624 /* 2625 * Transition all the deleted pages to the deleted state so that 2626 * page_lock will not wait. 
The page_lock_delete call will 2627 * also wake up any waiters. 2628 */ 2629 static void 2630 memseg_lock_delete_all(struct memseg *seg) 2631 { 2632 page_t *pp; 2633 2634 for (pp = seg->pages; pp < seg->epages; pp++) { 2635 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2636 page_lock_delete(pp); 2637 } 2638 } 2639 2640 static void 2641 kphysm_del_cleanup(struct mem_handle *mhp) 2642 { 2643 struct memdelspan *mdsp; 2644 struct memseg *seg; 2645 struct memseg **segpp; 2646 struct memseg *seglist; 2647 pfn_t p_end; 2648 uint64_t avmem; 2649 pgcnt_t avpgs; 2650 pgcnt_t npgs; 2651 2652 avpgs = mhp->mh_vm_pages; 2653 2654 memsegs_lock(1); 2655 2656 /* 2657 * remove from main segment list. 2658 */ 2659 npgs = 0; 2660 seglist = NULL; 2661 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2662 mdsp = mdsp->mds_next) { 2663 p_end = mdsp->mds_base + mdsp->mds_npgs; 2664 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2665 if (seg->pages_base >= p_end || 2666 seg->pages_end <= mdsp->mds_base) { 2667 /* Span and memseg don't overlap. */ 2668 segpp = &((*segpp)->next); 2669 continue; 2670 } 2671 ASSERT(seg->pages_base >= mdsp->mds_base); 2672 ASSERT(seg->pages_end <= p_end); 2673 2674 PLCNT_MODIFY_MAX(seg->pages_base, 2675 seg->pages_base - seg->pages_end); 2676 2677 /* Hide the memseg from future scans. */ 2678 hat_kpm_delmem_mseg_update(seg, segpp); 2679 *segpp = seg->next; 2680 membar_producer(); /* TODO: Needed? */ 2681 npgs += MSEG_NPAGES(seg); 2682 2683 /* 2684 * Leave the deleted segment's next pointer intact 2685 * in case a memsegs scanning loop is walking this 2686 * segment concurrently. 2687 */ 2688 seg->lnext = seglist; 2689 seglist = seg; 2690 } 2691 } 2692 2693 build_pfn_hash(); 2694 2695 ASSERT(npgs < total_pages); 2696 total_pages -= npgs; 2697 2698 /* 2699 * Recalculate the paging parameters now total_pages has changed. 2700 * This will also cause the clock hands to be reset before next use. 
2701 */ 2702 setupclock(1); 2703 2704 memsegs_unlock(1); 2705 2706 mutex_exit(&mhp->mh_mutex); 2707 2708 while ((seg = seglist) != NULL) { 2709 pfn_t mseg_start; 2710 pfn_t mseg_base, mseg_end; 2711 pgcnt_t mseg_npgs; 2712 int mlret; 2713 2714 seglist = seg->lnext; 2715 2716 /* 2717 * Put the page_t's into the deleted state to stop 2718 * cv_wait()s on the pages. When we remap, the dummy 2719 * page_t's will be in the same state. 2720 */ 2721 memseg_lock_delete_all(seg); 2722 /* 2723 * Collect up information based on pages_base and pages_end 2724 * early so that we can flag early that the memseg has been 2725 * deleted by setting pages_end == pages_base. 2726 */ 2727 mseg_base = seg->pages_base; 2728 mseg_end = seg->pages_end; 2729 mseg_npgs = MSEG_NPAGES(seg); 2730 mseg_start = memseg_get_start(seg); 2731 2732 if (memseg_is_dynamic(seg)) { 2733 /* Remap the meta data to our special dummy area. */ 2734 memseg_remap_to_dummy(seg); 2735 2736 mutex_enter(&memseg_lists_lock); 2737 seg->lnext = memseg_va_avail; 2738 memseg_va_avail = seg; 2739 mutex_exit(&memseg_lists_lock); 2740 } else { 2741 /* 2742 * For memory whose page_ts were allocated 2743 * at boot, we need to find a new use for 2744 * the page_t memory. 2745 * For the moment, just leak it. 2746 * (It is held in the memseg_delete_junk list.) 2747 */ 2748 seg->pages_end = seg->pages_base; 2749 2750 mutex_enter(&memseg_lists_lock); 2751 seg->lnext = memseg_delete_junk; 2752 memseg_delete_junk = seg; 2753 mutex_exit(&memseg_lists_lock); 2754 } 2755 2756 /* Must not use seg now as it could be re-used. 
*/ 2757 2758 memlist_write_lock(); 2759 2760 mlret = memlist_delete_span( 2761 (uint64_t)(mseg_base) << PAGESHIFT, 2762 (uint64_t)(mseg_npgs) << PAGESHIFT, 2763 &phys_avail); 2764 ASSERT(mlret == MEML_SPANOP_OK); 2765 2766 mlret = memlist_delete_span( 2767 (uint64_t)(mseg_start) << PAGESHIFT, 2768 (uint64_t)(mseg_end - mseg_start) << 2769 PAGESHIFT, 2770 &phys_install); 2771 ASSERT(mlret == MEML_SPANOP_OK); 2772 phys_install_has_changed(); 2773 2774 memlist_write_unlock(); 2775 } 2776 2777 memlist_read_lock(); 2778 installed_top_size(phys_install, &physmax, &physinstalled); 2779 memlist_read_unlock(); 2780 2781 mutex_enter(&freemem_lock); 2782 maxmem -= avpgs; 2783 physmem -= avpgs; 2784 /* availrmem is adjusted during the delete. */ 2785 availrmem_initial -= avpgs; 2786 2787 mutex_exit(&freemem_lock); 2788 2789 dump_resize(); 2790 2791 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2792 "(0x%" PRIx64 ")\n", 2793 physinstalled << (PAGESHIFT - 10), 2794 (uint64_t)physinstalled << PAGESHIFT); 2795 2796 avmem = (uint64_t)freemem << PAGESHIFT; 2797 cmn_err(CE_CONT, "?kphysm_delete: " 2798 "avail mem = %" PRId64 "\n", avmem); 2799 2800 /* 2801 * Update lgroup generation number on single lgroup systems 2802 */ 2803 if (nlgrps == 1) 2804 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2805 2806 /* Successfully deleted system memory */ 2807 mutex_enter(&mhp->mh_mutex); 2808 } 2809 2810 static uint_t mdel_nullvp_waiter; 2811 2812 static void 2813 page_delete_collect( 2814 page_t *pp, 2815 struct mem_handle *mhp) 2816 { 2817 if (pp->p_vnode) { 2818 page_hashout(pp, (kmutex_t *)NULL); 2819 /* do not do PP_SETAGED(pp); */ 2820 } else { 2821 kmutex_t *sep; 2822 2823 sep = page_se_mutex(pp); 2824 mutex_enter(sep); 2825 if (CV_HAS_WAITERS(&pp->p_cv)) { 2826 mdel_nullvp_waiter++; 2827 cv_broadcast(&pp->p_cv); 2828 } 2829 mutex_exit(sep); 2830 } 2831 ASSERT(pp->p_next == pp->p_prev); 2832 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2833 pp->p_next = mhp->mh_deleted; 2834 
mhp->mh_deleted = pp; 2835 ASSERT(mhp->mh_hold_todo != 0); 2836 mhp->mh_hold_todo--; 2837 } 2838 2839 static void 2840 transit_list_collect(struct mem_handle *mhp, int v) 2841 { 2842 struct transit_list_head *trh; 2843 2844 trh = &transit_list_head; 2845 mutex_enter(&trh->trh_lock); 2846 mhp->mh_transit.trl_collect = v; 2847 mutex_exit(&trh->trh_lock); 2848 } 2849 2850 static void 2851 transit_list_insert(struct transit_list *tlp) 2852 { 2853 struct transit_list_head *trh; 2854 2855 trh = &transit_list_head; 2856 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2857 tlp->trl_next = trh->trh_head; 2858 trh->trh_head = tlp; 2859 } 2860 2861 static void 2862 transit_list_remove(struct transit_list *tlp) 2863 { 2864 struct transit_list_head *trh; 2865 struct transit_list **tlpp; 2866 2867 trh = &transit_list_head; 2868 tlpp = &trh->trh_head; 2869 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2870 while (*tlpp != NULL && *tlpp != tlp) 2871 tlpp = &(*tlpp)->trl_next; 2872 ASSERT(*tlpp != NULL); 2873 if (*tlpp == tlp) 2874 *tlpp = tlp->trl_next; 2875 tlp->trl_next = NULL; 2876 } 2877 2878 static struct transit_list * 2879 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2880 { 2881 struct transit_list *tlp; 2882 2883 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2884 struct memdelspan *mdsp; 2885 2886 for (mdsp = tlp->trl_spans; mdsp != NULL; 2887 mdsp = mdsp->mds_next) { 2888 if (pfnum >= mdsp->mds_base && 2889 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2890 return (tlp); 2891 } 2892 } 2893 } 2894 return (NULL); 2895 } 2896 2897 int 2898 pfn_is_being_deleted(pfn_t pfnum) 2899 { 2900 struct transit_list_head *trh; 2901 struct transit_list *tlp; 2902 int ret; 2903 2904 trh = &transit_list_head; 2905 if (trh->trh_head == NULL) 2906 return (0); 2907 2908 mutex_enter(&trh->trh_lock); 2909 tlp = pfnum_to_transit_list(trh, pfnum); 2910 ret = (tlp != NULL && tlp->trl_collect); 2911 mutex_exit(&trh->trh_lock); 2912 2913 return (ret); 2914 } 2915 2916 #ifdef 
MEM_DEL_STATS 2917 extern int hz; 2918 static void 2919 mem_del_stat_print_func(struct mem_handle *mhp) 2920 { 2921 uint64_t tmp; 2922 2923 if (mem_del_stat_print) { 2924 printf("memory delete loop %x/%x, statistics%s\n", 2925 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2926 (uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2927 (mhp->mh_cancel ? " (cancelled)" : "")); 2928 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2929 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2930 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2931 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2932 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2933 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2934 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2935 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2936 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2937 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2938 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2939 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2940 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2941 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2942 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2943 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2944 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2945 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2946 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2947 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2948 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2949 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2950 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2951 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2952 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2953 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2954 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2955 printf("\t%8u 
gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2956 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2957 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2958 printf( 2959 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2960 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2961 2962 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2963 printf( 2964 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2965 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2966 } 2967 } 2968 #endif /* MEM_DEL_STATS */ 2969 2970 struct mem_callback { 2971 kphysm_setup_vector_t *vec; 2972 void *arg; 2973 }; 2974 2975 #define NMEMCALLBACKS 100 2976 2977 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2978 static uint_t nmemcallbacks; 2979 static krwlock_t mem_callback_rwlock; 2980 2981 int 2982 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2983 { 2984 uint_t i, found; 2985 2986 /* 2987 * This test will become more complicated when the version must 2988 * change. 2989 */ 2990 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2991 return (EINVAL); 2992 2993 if (vec->post_add == NULL || vec->pre_del == NULL || 2994 vec->post_del == NULL) 2995 return (EINVAL); 2996 2997 rw_enter(&mem_callback_rwlock, RW_WRITER); 2998 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2999 if (mem_callbacks[i].vec == NULL && found == 0) 3000 found = i + 1; 3001 if (mem_callbacks[i].vec == vec && 3002 mem_callbacks[i].arg == arg) { 3003 #ifdef DEBUG 3004 /* Catch this in DEBUG kernels. 
*/ 3005 cmn_err(CE_WARN, "kphysm_setup_func_register" 3006 "(0x%p, 0x%p) duplicate registration from 0x%p", 3007 (void *)vec, arg, (void *)caller()); 3008 #endif /* DEBUG */ 3009 rw_exit(&mem_callback_rwlock); 3010 return (EEXIST); 3011 } 3012 } 3013 if (found != 0) { 3014 i = found - 1; 3015 } else { 3016 ASSERT(nmemcallbacks < NMEMCALLBACKS); 3017 if (nmemcallbacks == NMEMCALLBACKS) { 3018 rw_exit(&mem_callback_rwlock); 3019 return (ENOMEM); 3020 } 3021 i = nmemcallbacks++; 3022 } 3023 mem_callbacks[i].vec = vec; 3024 mem_callbacks[i].arg = arg; 3025 rw_exit(&mem_callback_rwlock); 3026 return (0); 3027 } 3028 3029 void 3030 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 3031 { 3032 uint_t i; 3033 3034 rw_enter(&mem_callback_rwlock, RW_WRITER); 3035 for (i = 0; i < nmemcallbacks; i++) { 3036 if (mem_callbacks[i].vec == vec && 3037 mem_callbacks[i].arg == arg) { 3038 mem_callbacks[i].vec = NULL; 3039 mem_callbacks[i].arg = NULL; 3040 if (i == (nmemcallbacks - 1)) 3041 nmemcallbacks--; 3042 break; 3043 } 3044 } 3045 rw_exit(&mem_callback_rwlock); 3046 } 3047 3048 static void 3049 kphysm_setup_post_add(pgcnt_t delta_pages) 3050 { 3051 uint_t i; 3052 3053 rw_enter(&mem_callback_rwlock, RW_READER); 3054 for (i = 0; i < nmemcallbacks; i++) { 3055 if (mem_callbacks[i].vec != NULL) { 3056 (*mem_callbacks[i].vec->post_add) 3057 (mem_callbacks[i].arg, delta_pages); 3058 } 3059 } 3060 rw_exit(&mem_callback_rwlock); 3061 } 3062 3063 /* 3064 * Note the locking between pre_del and post_del: The reader lock is held 3065 * between the two calls to stop the set of functions from changing. 
3066 */ 3067 3068 static int 3069 kphysm_setup_pre_del(pgcnt_t delta_pages) 3070 { 3071 uint_t i; 3072 int ret; 3073 int aret; 3074 3075 ret = 0; 3076 rw_enter(&mem_callback_rwlock, RW_READER); 3077 for (i = 0; i < nmemcallbacks; i++) { 3078 if (mem_callbacks[i].vec != NULL) { 3079 aret = (*mem_callbacks[i].vec->pre_del) 3080 (mem_callbacks[i].arg, delta_pages); 3081 ret |= aret; 3082 } 3083 } 3084 3085 return (ret); 3086 } 3087 3088 static void 3089 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 3090 { 3091 uint_t i; 3092 3093 for (i = 0; i < nmemcallbacks; i++) { 3094 if (mem_callbacks[i].vec != NULL) { 3095 (*mem_callbacks[i].vec->post_del) 3096 (mem_callbacks[i].arg, delta_pages, cancelled); 3097 } 3098 } 3099 rw_exit(&mem_callback_rwlock); 3100 } 3101 3102 static int 3103 kphysm_split_memseg( 3104 pfn_t base, 3105 pgcnt_t npgs) 3106 { 3107 struct memseg *seg; 3108 struct memseg **segpp; 3109 pgcnt_t size_low, size_high; 3110 struct memseg *seg_low, *seg_mid, *seg_high; 3111 3112 /* 3113 * Lock the memsegs list against other updates now 3114 */ 3115 memsegs_lock(1); 3116 3117 /* 3118 * Find boot time memseg that wholly covers this area. 3119 */ 3120 3121 /* First find the memseg with page 'base' in it. */ 3122 for (segpp = &memsegs; (seg = *segpp) != NULL; 3123 segpp = &((*segpp)->next)) { 3124 if (base >= seg->pages_base && base < seg->pages_end) 3125 break; 3126 } 3127 if (seg == NULL) { 3128 memsegs_unlock(1); 3129 return (0); 3130 } 3131 if (memseg_includes_meta(seg)) { 3132 memsegs_unlock(1); 3133 return (0); 3134 } 3135 if ((base + npgs) > seg->pages_end) { 3136 memsegs_unlock(1); 3137 return (0); 3138 } 3139 3140 /* 3141 * Work out the size of the two segments that will 3142 * surround the new segment, one for low address 3143 * and one for high. 
3144 */ 3145 ASSERT(base >= seg->pages_base); 3146 size_low = base - seg->pages_base; 3147 ASSERT(seg->pages_end >= (base + npgs)); 3148 size_high = seg->pages_end - (base + npgs); 3149 3150 /* 3151 * Sanity check. 3152 */ 3153 if ((size_low + size_high) == 0) { 3154 memsegs_unlock(1); 3155 return (0); 3156 } 3157 3158 /* 3159 * Allocate the new structures. The old memseg will not be freed 3160 * as there may be a reference to it. 3161 */ 3162 seg_low = NULL; 3163 seg_high = NULL; 3164 3165 if (size_low != 0) 3166 seg_low = memseg_alloc(); 3167 3168 seg_mid = memseg_alloc(); 3169 3170 if (size_high != 0) 3171 seg_high = memseg_alloc(); 3172 3173 /* 3174 * All allocation done now. 3175 */ 3176 if (size_low != 0) { 3177 seg_low->pages = seg->pages; 3178 seg_low->epages = seg_low->pages + size_low; 3179 seg_low->pages_base = seg->pages_base; 3180 seg_low->pages_end = seg_low->pages_base + size_low; 3181 seg_low->next = seg_mid; 3182 seg_low->msegflags = seg->msegflags; 3183 } 3184 if (size_high != 0) { 3185 seg_high->pages = seg->epages - size_high; 3186 seg_high->epages = seg_high->pages + size_high; 3187 seg_high->pages_base = seg->pages_end - size_high; 3188 seg_high->pages_end = seg_high->pages_base + size_high; 3189 seg_high->next = seg->next; 3190 seg_high->msegflags = seg->msegflags; 3191 } 3192 3193 seg_mid->pages = seg->pages + size_low; 3194 seg_mid->pages_base = seg->pages_base + size_low; 3195 seg_mid->epages = seg->epages - size_high; 3196 seg_mid->pages_end = seg->pages_end - size_high; 3197 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3198 seg_mid->msegflags = seg->msegflags; 3199 3200 /* 3201 * Update hat_kpm specific info of all involved memsegs and 3202 * allow hat_kpm specific global chain updates. 
3203 */ 3204 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3205 3206 /* 3207 * At this point we have two equivalent memseg sub-chains, 3208 * seg and seg_low/seg_mid/seg_high, which both chain on to 3209 * the same place in the global chain. By re-writing the pointer 3210 * in the previous element we switch atomically from using the old 3211 * (seg) to the new. 3212 */ 3213 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3214 3215 membar_enter(); 3216 3217 build_pfn_hash(); 3218 memsegs_unlock(1); 3219 3220 /* 3221 * We leave the old segment, 'seg', intact as there may be 3222 * references to it. Also, as the value of total_pages has not 3223 * changed and the memsegs list is effectively the same when 3224 * accessed via the old or the new pointer, we do not have to 3225 * cause pageout_scanner() to re-evaluate its hand pointers. 3226 * 3227 * We currently do not re-use or reclaim the page_t memory. 3228 * If we do, then this may have to change. 3229 */ 3230 3231 mutex_enter(&memseg_lists_lock); 3232 seg->lnext = memseg_edit_junk; 3233 memseg_edit_junk = seg; 3234 mutex_exit(&memseg_lists_lock); 3235 3236 return (1); 3237 } 3238 3239 /* 3240 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3241 * structure using physical addresses. Therefore a kmem_cache is 3242 * used with KMC_NOHASH to avoid page crossings within a memseg 3243 * structure. KMC_NOHASH requires that no external (outside of 3244 * slab) information is allowed. This, in turn, implies that the 3245 * cache's slabsize must be exactly a single page, since per-slab 3246 * information (e.g. the freelist for the slab) is kept at the 3247 * end of the slab, where it is easy to locate. Should be changed 3248 * when a more obvious kmem_cache interface/flag will become 3249 * available. 
3250 */ 3251 void 3252 mem_config_init() 3253 { 3254 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3255 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3256 } 3257 3258 struct memseg * 3259 memseg_alloc() 3260 { 3261 struct memseg *seg; 3262 3263 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3264 bzero(seg, sizeof (struct memseg)); 3265 3266 return (seg); 3267 } 3268 3269 /* 3270 * Return whether the page_t memory for this memseg 3271 * is included in the memseg itself. 3272 */ 3273 static int 3274 memseg_includes_meta(struct memseg *seg) 3275 { 3276 return (seg->msegflags & MEMSEG_META_INCL); 3277 } 3278 3279 pfn_t 3280 memseg_get_start(struct memseg *seg) 3281 { 3282 pfn_t pt_start; 3283 3284 if (memseg_includes_meta(seg)) { 3285 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 3286 3287 /* Meta data is required to be at the beginning */ 3288 ASSERT(pt_start < seg->pages_base); 3289 } else 3290 pt_start = seg->pages_base; 3291 3292 return (pt_start); 3293 } 3294 3295 /* 3296 * Invalidate memseg pointers in cpu private vm data caches. 3297 */ 3298 static void 3299 memseg_cpu_vm_flush() 3300 { 3301 cpu_t *cp; 3302 vm_cpu_data_t *vc; 3303 3304 mutex_enter(&cpu_lock); 3305 pause_cpus(NULL, NULL); 3306 3307 cp = cpu_list; 3308 do { 3309 vc = cp->cpu_vm_data; 3310 vc->vc_pnum_memseg = NULL; 3311 vc->vc_pnext_memseg = NULL; 3312 3313 } while ((cp = cp->cpu_next) != cpu_list); 3314 3315 start_cpus(); 3316 mutex_exit(&cpu_lock); 3317 } 3318