1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/cmn_err.h> 30 #include <sys/vmem.h> 31 #include <sys/kmem.h> 32 #include <sys/systm.h> 33 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 34 #include <sys/errno.h> 35 #include <sys/memnode.h> 36 #include <sys/memlist.h> 37 #include <sys/memlist_impl.h> 38 #include <sys/tuneable.h> 39 #include <sys/proc.h> 40 #include <sys/disp.h> 41 #include <sys/debug.h> 42 #include <sys/vm.h> 43 #include <sys/callb.h> 44 #include <sys/memlist_plat.h> /* for installed_top_size() */ 45 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 46 #include <sys/dumphdr.h> /* for dump_resize() */ 47 #include <sys/atomic.h> /* for use in stats collection */ 48 #include <sys/rwlock.h> 49 #include <sys/cpuvar.h> 50 #include <vm/seg_kmem.h> 51 #include <vm/seg_kpm.h> 52 #include <vm/page.h> 53 #include <vm/vm_dep.h> 54 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 55 #include <sys/sunddi.h> 56 #include <sys/mem_config.h> 57 #include <sys/mem_cage.h> 58 #include <sys/lgrp.h> 59 #include <sys/ddi.h> 60 #include <sys/modctl.h> 61 62 extern struct memlist *phys_avail; 63 64 extern void mem_node_add(pfn_t, pfn_t); 65 extern void mem_node_del(pfn_t, pfn_t); 66 67 extern uint_t page_ctrs_adjust(int); 68 static void kphysm_setup_post_add(pgcnt_t); 69 static int kphysm_setup_pre_del(pgcnt_t); 70 static void kphysm_setup_post_del(pgcnt_t, int); 71 72 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 73 74 static int delspan_reserve(pfn_t, pgcnt_t); 75 static void delspan_unreserve(pfn_t, pgcnt_t); 76 77 static kmutex_t memseg_lists_lock; 78 static struct memseg *memseg_va_avail; 79 static struct memseg *memseg_delete_junk; 80 static struct memseg *memseg_edit_junk; 81 void memseg_remap_init(void); 82 static void memseg_remap_to_dummy(caddr_t, pgcnt_t); 83 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 84 static struct memseg *memseg_reuse(pgcnt_t); 85 86 static struct kmem_cache *memseg_cache; 87 88 /* 89 * Add a chunk of memory to the system. page_t's for this memory 90 * are allocated in the first few pages of the chunk. 91 * base: starting PAGESIZE page of new memory. 92 * npgs: length in PAGESIZE pages. 93 * 94 * Adding mem this way doesn't increase the size of the hash tables; 95 * growing them would be too hard. This should be OK, but adding memory 96 * dynamically most likely means more hash misses, since the tables will 97 * be smaller than they otherwise would be. 
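 *
 * A rough illustration of the metadata overhead (the figures below are
 * illustrative only and depend on the platform's sizeof (page_t)):
 * every usable page consumes PAGESIZE bytes of the span plus
 * sizeof (page_t) bytes of page_t metadata, so roughly
 * sizeof (page_t) / (PAGESIZE + sizeof (page_t)) of the span becomes
 * page_t pages.  With 8K pages and a page_t of around 128 bytes that
 * works out to something like 1/65th of the added span.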
98 */ 99 int 100 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 101 { 102 page_t *pp; 103 page_t *opp, *oepp; 104 struct memseg *seg; 105 uint64_t avmem; 106 pfn_t pfn; 107 pfn_t pt_base = base; 108 pgcnt_t tpgs = npgs; 109 pgcnt_t metapgs; 110 int exhausted; 111 pfn_t pnum; 112 int mnode; 113 caddr_t vaddr; 114 int reuse; 115 int mlret; 116 void *mapva; 117 pgcnt_t nkpmpgs = 0; 118 offset_t kpm_pages_off; 119 120 cmn_err(CE_CONT, 121 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 122 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 123 124 /* 125 * Add this span in the delete list to prevent interactions. 126 */ 127 if (!delspan_reserve(base, npgs)) { 128 return (KPHYSM_ESPAN); 129 } 130 /* 131 * Check to see if any of the memory span has been added 132 * by trying an add to the installed memory list. This 133 * forms the interlocking process for add. 134 */ 135 136 memlist_write_lock(); 137 138 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 139 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 140 141 if (mlret == MEML_SPANOP_OK) 142 installed_top_size(phys_install, &physmax, &physinstalled); 143 144 memlist_write_unlock(); 145 146 if (mlret != MEML_SPANOP_OK) { 147 if (mlret == MEML_SPANOP_EALLOC) { 148 delspan_unreserve(pt_base, tpgs); 149 return (KPHYSM_ERESOURCE); 150 } else 151 if (mlret == MEML_SPANOP_ESPAN) { 152 delspan_unreserve(pt_base, tpgs); 153 return (KPHYSM_ESPAN); 154 } else { 155 delspan_unreserve(pt_base, tpgs); 156 return (KPHYSM_ERESOURCE); 157 } 158 } 159 160 /* 161 * We store the page_t's for this new memory in the first 162 * few pages of the chunk. Here, we go and get'em ... 163 */ 164 165 /* 166 * The expression after the '-' gives the number of pages 167 * that will fit in the new memory based on a requirement 168 * of (PAGESIZE + sizeof (page_t)) bytes per page. 169 */ 170 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 171 (PAGESIZE + sizeof (page_t))); 172 173 npgs -= metapgs; 174 base += metapgs; 175 176 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 177 178 exhausted = (metapgs == 0 || npgs == 0); 179 180 if (kpm_enable && !exhausted) { 181 pgcnt_t start, end, nkpmpgs_prelim; 182 size_t ptsz; 183 184 /* 185 * A viable kpm large page mapping must not overlap two 186 * dynamic memsegs. Therefore the total size is checked 187 * to be at least kpm_pgsz and also whether start and end 188 * points are at least kpm_pgsz aligned. 189 */ 190 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 191 pmodkpmp(base + npgs)) { 192 193 kphysm_addmem_error_undospan(pt_base, tpgs); 194 195 /* 196 * There is no specific error code for violating 197 * kpm granularity constraints. 198 */ 199 return (KPHYSM_ENOTVIABLE); 200 } 201 202 start = kpmptop(ptokpmp(base)); 203 end = kpmptop(ptokpmp(base + npgs)); 204 nkpmpgs_prelim = ptokpmp(end - start); 205 ptsz = npgs * sizeof (page_t); 206 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 207 exhausted = (tpgs <= metapgs); 208 if (!exhausted) { 209 npgs = tpgs - metapgs; 210 base = pt_base + metapgs; 211 212 /* final nkpmpgs */ 213 start = kpmptop(ptokpmp(base)); 214 nkpmpgs = ptokpmp(end - start); 215 kpm_pages_off = ptsz + 216 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 217 } 218 } 219 220 /* 221 * Is memory area supplied too small? 222 */ 223 if (exhausted) { 224 kphysm_addmem_error_undospan(pt_base, tpgs); 225 226 /* 227 * There is no specific error code for 'too small'. 
228 */ 229 return (KPHYSM_ERESOURCE); 230 } 231 232 /* 233 * We may re-use a previously allocated VA space for the page_ts 234 * eventually, but we need to initialize and lock the pages first. 235 */ 236 237 /* 238 * Get an address in the kernel address map, map 239 * the page_t pages and see if we can touch them. 240 */ 241 242 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 243 if (mapva == NULL) { 244 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 245 " Can't allocate VA for page_ts"); 246 247 kphysm_addmem_error_undospan(pt_base, tpgs); 248 249 return (KPHYSM_ERESOURCE); 250 } 251 pp = mapva; 252 253 if (physmax < (pt_base + tpgs)) 254 physmax = (pt_base + tpgs); 255 256 /* 257 * In the remapping code we map one page at a time so we must do 258 * the same here to match mapping sizes. 259 */ 260 pfn = pt_base; 261 vaddr = (caddr_t)pp; 262 for (pnum = 0; pnum < metapgs; pnum++) { 263 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 264 PROT_READ | PROT_WRITE, 265 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 266 pfn++; 267 vaddr += ptob(1); 268 } 269 270 if (ddi_peek32((dev_info_t *)NULL, 271 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) { 272 273 cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:" 274 " Can't access pp array at 0x%p [phys 0x%lx]", 275 (void *)pp, pt_base); 276 277 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 278 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 279 280 vmem_free(heap_arena, mapva, ptob(metapgs)); 281 282 kphysm_addmem_error_undospan(pt_base, tpgs); 283 284 return (KPHYSM_EFAULT); 285 } 286 287 /* 288 * Add this memory slice to its memory node translation. 289 * 290 * Note that right now, each node may have only one slice; 291 * this may change with COD or in larger SSM systems with 292 * nested latency groups, so we must not assume that the 293 * node does not yet exist. 294 */ 295 pnum = base + npgs - 1; 296 mem_node_add_slice(base, pnum); 297 298 /* 299 * Allocate or resize page counters as necessary to accommodate 300 * the increase in memory pages. 301 */ 302 mnode = PFN_2_MEM_NODE(pnum); 303 if (page_ctrs_adjust(mnode) != 0) { 304 305 mem_node_pre_del_slice(base, pnum); 306 mem_node_post_del_slice(base, pnum, 0); 307 308 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 309 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 310 311 vmem_free(heap_arena, mapva, ptob(metapgs)); 312 313 kphysm_addmem_error_undospan(pt_base, tpgs); 314 315 return (KPHYSM_ERESOURCE); 316 } 317 318 /* 319 * Update the phys_avail memory list. 320 * The phys_install list was done at the start. 321 */ 322 323 memlist_write_lock(); 324 325 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT, 326 (uint64_t)(npgs) << PAGESHIFT, &phys_avail); 327 ASSERT(mlret == MEML_SPANOP_OK); 328 329 memlist_write_unlock(); 330 331 /* See if we can find a memseg to re-use. */ 332 seg = memseg_reuse(metapgs); 333 334 reuse = (seg != NULL); 335 336 /* 337 * Initialize the memseg structure representing this memory 338 * and add it to the existing list of memsegs. Do some basic 339 * initialization and add the memory to the system. 340 * In order to prevent lock deadlocks, the add_physmem() 341 * code is repeated here, but split into several stages. 
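	 * The stages, in order, are: allocate a new memseg (unless
	 * memseg_reuse() supplied one), fill in the page_t range, lock
	 * every page_t exclusively, remap the page_ts into the re-used
	 * VA space when applicable, insert the memseg at the head of
	 * the memsegs list, and finally free the pages so that they
	 * land on the freelists.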
342 */ 343 if (seg == NULL) { 344 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 345 bzero(seg, sizeof (struct memseg)); 346 seg->msegflags = MEMSEG_DYNAMIC; 347 seg->pages = pp; 348 } else { 349 /*EMPTY*/ 350 ASSERT(seg->msegflags & MEMSEG_DYNAMIC); 351 } 352 353 seg->epages = seg->pages + npgs; 354 seg->pages_base = base; 355 seg->pages_end = base + npgs; 356 357 /* 358 * Initialize metadata. The page_ts are set to locked state 359 * ready to be freed. 360 */ 361 bzero((caddr_t)pp, ptob(metapgs)); 362 363 pfn = seg->pages_base; 364 /* Save the original pp base in case we reuse a memseg. */ 365 opp = pp; 366 oepp = opp + npgs; 367 for (pp = opp; pp < oepp; pp++) { 368 pp->p_pagenum = pfn; 369 pfn++; 370 page_iolock_init(pp); 371 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 372 continue; 373 pp->p_offset = (u_offset_t)-1; 374 } 375 376 if (reuse) { 377 /* Remap our page_ts to the re-used memseg VA space. */ 378 pfn = pt_base; 379 vaddr = (caddr_t)seg->pages; 380 for (pnum = 0; pnum < metapgs; pnum++) { 381 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 382 PROT_READ | PROT_WRITE, 383 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 384 pfn++; 385 vaddr += ptob(1); 386 } 387 388 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 389 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 390 391 vmem_free(heap_arena, mapva, ptob(metapgs)); 392 } 393 394 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 395 396 memsegs_lock(1); 397 398 /* 399 * The new memseg is inserted at the beginning of the list. 400 * Not only does this save searching for the tail, but in the 401 * case of a re-used memseg, it solves the problem of what 402 * happens of some process has still got a pointer to the 403 * memseg and follows the next pointer to continue traversing 404 * the memsegs list. 405 */ 406 407 hat_kpm_addmem_mseg_insert(seg); 408 409 seg->next = memsegs; 410 membar_producer(); 411 412 hat_kpm_addmem_memsegs_update(seg); 413 414 memsegs = seg; 415 416 build_pfn_hash(); 417 418 total_pages += npgs; 419 420 /* 421 * Recalculate the paging parameters now total_pages has changed. 422 * This will also cause the clock hands to be reset before next use. 423 */ 424 setupclock(1); 425 426 memsegs_unlock(1); 427 428 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs); 429 430 /* 431 * Free the pages outside the lock to avoid locking loops. 432 */ 433 for (pp = seg->pages; pp < seg->epages; pp++) { 434 page_free(pp, 1); 435 } 436 437 /* 438 * Now that we've updated the appropriate memory lists we 439 * need to reset a number of globals, since we've increased memory. 440 * Several have already been updated for us as noted above. The 441 * globals we're interested in at this point are: 442 * physmax - highest page frame number. 
 *	physinstalled - number of pages currently installed (done earlier)
 *	maxmem - max free pages in the system
 *	physmem - physical memory pages available
 *	availrmem - real memory available
 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);		/* Successfully added system memory */

}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
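 *
 * Re-using a memseg also allows kphysm_add_memory_dynamic() to remap
 * the new page_t pages onto the retained VA range (the HAT_LOAD_REMAP
 * loop above) rather than keeping a fresh heap_arena allocation.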
 */
static struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
	struct memseg **segpp, *seg;

	mutex_enter(&memseg_lists_lock);

	segpp = &memseg_va_avail;
	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
		caddr_t end;

		if (kpm_enable)
			end = hat_kpm_mseg_reuse(seg);
		else
			end = (caddr_t)seg->epages;

		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
			*segpp = seg->lnext;
			seg->lnext = NULL;
			break;
		}
	}
	mutex_exit(&memseg_lists_lock);

	return (seg);
}

static uint_t handle_gen;

struct memdelspan {
	struct memdelspan *mds_next;
	pfn_t		mds_base;
	pgcnt_t		mds_npgs;
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

#define	NBPBMW		(sizeof (uint_t) * NBBY)
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))

struct transit_list {
	struct transit_list	*trl_next;
	struct memdelspan	*trl_spans;
	int			trl_collect;
};

struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);

#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */

typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
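 * The usual life cycle of a handle is MHND_FREE -> MHND_INIT
 * (kphysm_del_gethandle) -> MHND_STARTING (kphysm_del_start) ->
 * MHND_RUNNING -> MHND_DONE (delete_memory_thread) -> MHND_RELEASE ->
 * MHND_FREE (kphysm_del_release).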
 */
struct mem_handle {
	kmutex_t mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t mh_exthandle;
	mhnd_state_t mh_state;
	struct transit_list mh_transit;
	pgcnt_t mh_phys_pages;
	pgcnt_t mh_vm_pages;
	pgcnt_t mh_hold_todo;
	void (*mh_delete_complete)(void *, int error);
	void *mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t mh_cv;
	kthread_id_t mh_thread_id;
	page_t *mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next is changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
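	 *
	 * An illustrative (not prescriptive) caller sequence for the
	 * delete interfaces, where my_complete and my_arg are
	 * caller-supplied names:
	 *
	 *	(void) kphysm_del_gethandle(&h);
	 *	error = kphysm_del_span(h, base, npgs);
	 *	error = kphysm_del_start(h, my_complete, my_arg);
	 *	... my_complete() is called when the delete finishes ...
	 *	error = kphysm_del_release(h);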
746 */ 747 ASSERT(mhp->mh_state == MHND_FREE); 748 mhp->mh_state = MHND_INIT; 749 *xmhp = mhp->mh_exthandle; 750 mutex_exit(&mhp->mh_mutex); 751 return (KPHYSM_OK); 752 } 753 754 static int 755 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 756 { 757 pfn_t e1, e2; 758 759 e1 = b1 + l1; 760 e2 = b2 + l2; 761 762 return (!(b2 >= e1 || b1 >= e2)); 763 } 764 765 static int can_remove_pgs(pgcnt_t); 766 767 static struct memdelspan * 768 span_to_install(pfn_t base, pgcnt_t npgs) 769 { 770 struct memdelspan *mdsp; 771 struct memdelspan *mdsp_new; 772 uint64_t address, size, thislen; 773 struct memlist *mlp; 774 775 mdsp_new = NULL; 776 777 address = (uint64_t)base << PAGESHIFT; 778 size = (uint64_t)npgs << PAGESHIFT; 779 while (size != 0) { 780 memlist_read_lock(); 781 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 782 if (address >= (mlp->address + mlp->size)) 783 continue; 784 if ((address + size) > mlp->address) 785 break; 786 } 787 if (mlp == NULL) { 788 address += size; 789 size = 0; 790 thislen = 0; 791 } else { 792 if (address < mlp->address) { 793 size -= (mlp->address - address); 794 address = mlp->address; 795 } 796 ASSERT(address >= mlp->address); 797 if ((address + size) > (mlp->address + mlp->size)) { 798 thislen = mlp->size - (address - mlp->address); 799 } else { 800 thislen = size; 801 } 802 } 803 memlist_read_unlock(); 804 /* TODO: phys_install could change now */ 805 if (thislen == 0) 806 continue; 807 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 808 mdsp->mds_base = btop(address); 809 mdsp->mds_npgs = btop(thislen); 810 mdsp->mds_next = mdsp_new; 811 mdsp_new = mdsp; 812 address += thislen; 813 size -= thislen; 814 } 815 return (mdsp_new); 816 } 817 818 static void 819 free_delspans(struct memdelspan *mdsp) 820 { 821 struct memdelspan *amdsp; 822 823 while ((amdsp = mdsp) != NULL) { 824 mdsp = amdsp->mds_next; 825 kmem_free(amdsp, sizeof (struct memdelspan)); 826 } 827 } 828 829 /* 830 * Concatenate lists. No list ordering is required. 831 */ 832 833 static void 834 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 835 { 836 while (*mdspp != NULL) 837 mdspp = &(*mdspp)->mds_next; 838 839 *mdspp = mdsp; 840 } 841 842 /* 843 * Given a new list of delspans, check there is no overlap with 844 * all existing span activity (add or delete) and then concatenate 845 * the new spans to the given list. 846 * Return 1 for OK, 0 if overlapping. 
847 */ 848 static int 849 delspan_insert( 850 struct transit_list *my_tlp, 851 struct memdelspan *mdsp_new) 852 { 853 struct transit_list_head *trh; 854 struct transit_list *tlp; 855 int ret; 856 857 trh = &transit_list_head; 858 859 ASSERT(my_tlp != NULL); 860 ASSERT(mdsp_new != NULL); 861 862 ret = 1; 863 mutex_enter(&trh->trh_lock); 864 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 865 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 866 struct memdelspan *mdsp; 867 868 for (mdsp = tlp->trl_spans; mdsp != NULL; 869 mdsp = mdsp->mds_next) { 870 struct memdelspan *nmdsp; 871 872 for (nmdsp = mdsp_new; nmdsp != NULL; 873 nmdsp = nmdsp->mds_next) { 874 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 875 nmdsp->mds_base, nmdsp->mds_npgs)) { 876 ret = 0; 877 goto done; 878 } 879 } 880 } 881 } 882 done: 883 if (ret != 0) { 884 if (my_tlp->trl_spans == NULL) 885 transit_list_insert(my_tlp); 886 delspan_concat(&my_tlp->trl_spans, mdsp_new); 887 } 888 mutex_exit(&trh->trh_lock); 889 return (ret); 890 } 891 892 static void 893 delspan_remove( 894 struct transit_list *my_tlp, 895 pfn_t base, 896 pgcnt_t npgs) 897 { 898 struct transit_list_head *trh; 899 struct memdelspan *mdsp; 900 901 trh = &transit_list_head; 902 903 ASSERT(my_tlp != NULL); 904 905 mutex_enter(&trh->trh_lock); 906 if ((mdsp = my_tlp->trl_spans) != NULL) { 907 if (npgs == 0) { 908 my_tlp->trl_spans = NULL; 909 free_delspans(mdsp); 910 transit_list_remove(my_tlp); 911 } else { 912 struct memdelspan **prv; 913 914 prv = &my_tlp->trl_spans; 915 while (mdsp != NULL) { 916 pfn_t p_end; 917 918 p_end = mdsp->mds_base + mdsp->mds_npgs; 919 if (mdsp->mds_base >= base && 920 p_end <= (base + npgs)) { 921 *prv = mdsp->mds_next; 922 mdsp->mds_next = NULL; 923 free_delspans(mdsp); 924 } else { 925 prv = &mdsp->mds_next; 926 } 927 mdsp = *prv; 928 } 929 if (my_tlp->trl_spans == NULL) 930 transit_list_remove(my_tlp); 931 } 932 } 933 mutex_exit(&trh->trh_lock); 934 } 935 936 /* 937 * Reserve interface for add to stop delete before add finished. 938 * This list is only accessed through the delspan_insert/remove 939 * functions and so is fully protected by the mutex in struct transit_list. 940 */ 941 942 static struct transit_list reserve_transit; 943 944 static int 945 delspan_reserve(pfn_t base, pgcnt_t npgs) 946 { 947 struct memdelspan *mdsp; 948 int ret; 949 950 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 951 mdsp->mds_base = base; 952 mdsp->mds_npgs = npgs; 953 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 954 free_delspans(mdsp); 955 } 956 return (ret); 957 } 958 959 static void 960 delspan_unreserve(pfn_t base, pgcnt_t npgs) 961 { 962 delspan_remove(&reserve_transit, base, npgs); 963 } 964 965 /* 966 * Return whether memseg was created by kphysm_add_memory_dynamic(). 967 * If this is the case and startp non zero, return also the start pfn 968 * of the meta data via startp. 
969 */ 970 static int 971 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 972 { 973 pfn_t pt_start; 974 975 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 976 return (0); 977 978 /* Meta data is required to be at the beginning */ 979 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 980 981 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 982 if (startp != NULL) 983 *startp = pt_start; 984 985 return (1); 986 } 987 988 int 989 kphysm_del_span( 990 memhandle_t handle, 991 pfn_t base, 992 pgcnt_t npgs) 993 { 994 struct mem_handle *mhp; 995 struct memseg *seg; 996 struct memdelspan *mdsp; 997 struct memdelspan *mdsp_new; 998 pgcnt_t phys_pages, vm_pages; 999 pfn_t p_end; 1000 page_t *pp; 1001 int ret; 1002 1003 mhp = kphysm_lookup_mem_handle(handle); 1004 if (mhp == NULL) { 1005 return (KPHYSM_EHANDLE); 1006 } 1007 if (mhp->mh_state != MHND_INIT) { 1008 mutex_exit(&mhp->mh_mutex); 1009 return (KPHYSM_ESEQUENCE); 1010 } 1011 1012 /* 1013 * Intersect the span with the installed memory list (phys_install). 1014 */ 1015 mdsp_new = span_to_install(base, npgs); 1016 if (mdsp_new == NULL) { 1017 /* 1018 * No physical memory in this range. Is this an 1019 * error? If an attempt to start the delete is made 1020 * for OK returns from del_span such as this, start will 1021 * return an error. 1022 * Could return KPHYSM_ENOWORK. 1023 */ 1024 /* 1025 * It is assumed that there are no error returns 1026 * from span_to_install() due to kmem_alloc failure. 1027 */ 1028 mutex_exit(&mhp->mh_mutex); 1029 return (KPHYSM_OK); 1030 } 1031 /* 1032 * Does this span overlap an existing span? 1033 */ 1034 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1035 /* 1036 * Differentiate between already on list for this handle 1037 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1038 */ 1039 ret = KPHYSM_EBUSY; 1040 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1041 mdsp = mdsp->mds_next) { 1042 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1043 base, npgs)) { 1044 ret = KPHYSM_EDUP; 1045 break; 1046 } 1047 } 1048 mutex_exit(&mhp->mh_mutex); 1049 free_delspans(mdsp_new); 1050 return (ret); 1051 } 1052 /* 1053 * At this point the spans in mdsp_new have been inserted into the 1054 * list of spans for this handle and thereby to the global list of 1055 * spans being processed. Each of these spans must now be checked 1056 * for relocatability. As a side-effect segments in the memseg list 1057 * may be split. 1058 * 1059 * Note that mdsp_new can no longer be used as it is now part of 1060 * a larger list. Select elements of this larger list based 1061 * on base and npgs. 1062 */ 1063 restart: 1064 phys_pages = 0; 1065 vm_pages = 0; 1066 ret = KPHYSM_OK; 1067 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1068 mdsp = mdsp->mds_next) { 1069 pgcnt_t pages_checked; 1070 1071 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1072 continue; 1073 } 1074 p_end = mdsp->mds_base + mdsp->mds_npgs; 1075 /* 1076 * The pages_checked count is a hack. All pages should be 1077 * checked for relocatability. Those not covered by memsegs 1078 * should be tested with arch_kphysm_del_span_ok(). 1079 */ 1080 pages_checked = 0; 1081 for (seg = memsegs; seg; seg = seg->next) { 1082 pfn_t mseg_start; 1083 1084 if (seg->pages_base >= p_end || 1085 seg->pages_end <= mdsp->mds_base) { 1086 /* Span and memseg don't overlap. */ 1087 continue; 1088 } 1089 /* Check that segment is suitable for delete. 
*/ 1090 if (memseg_is_dynamic(seg, &mseg_start)) { 1091 /* 1092 * Can only delete whole added segments 1093 * for the moment. 1094 * Check that this is completely within the 1095 * span. 1096 */ 1097 if (mseg_start < mdsp->mds_base || 1098 seg->pages_end > p_end) { 1099 ret = KPHYSM_EBUSY; 1100 break; 1101 } 1102 pages_checked += seg->pages_end - mseg_start; 1103 } else { 1104 /* 1105 * Set mseg_start for accounting below. 1106 */ 1107 mseg_start = seg->pages_base; 1108 /* 1109 * If this segment is larger than the span, 1110 * try to split it. After the split, it 1111 * is necessary to restart. 1112 */ 1113 if (seg->pages_base < mdsp->mds_base || 1114 seg->pages_end > p_end) { 1115 pfn_t abase; 1116 pgcnt_t anpgs; 1117 int s_ret; 1118 1119 /* Split required. */ 1120 if (mdsp->mds_base < seg->pages_base) 1121 abase = seg->pages_base; 1122 else 1123 abase = mdsp->mds_base; 1124 if (p_end > seg->pages_end) 1125 anpgs = seg->pages_end - abase; 1126 else 1127 anpgs = p_end - abase; 1128 s_ret = kphysm_split_memseg(abase, 1129 anpgs); 1130 if (s_ret == 0) { 1131 /* Split failed. */ 1132 ret = KPHYSM_ERESOURCE; 1133 break; 1134 } 1135 goto restart; 1136 } 1137 pages_checked += 1138 seg->pages_end - seg->pages_base; 1139 } 1140 /* 1141 * The memseg is wholly within the delete span. 1142 * The individual pages can now be checked. 1143 */ 1144 /* Cage test. */ 1145 for (pp = seg->pages; pp < seg->epages; pp++) { 1146 if (PP_ISNORELOC(pp)) { 1147 ret = KPHYSM_ENONRELOC; 1148 break; 1149 } 1150 } 1151 if (ret != KPHYSM_OK) { 1152 break; 1153 } 1154 phys_pages += (seg->pages_end - mseg_start); 1155 vm_pages += MSEG_NPAGES(seg); 1156 } 1157 if (ret != KPHYSM_OK) 1158 break; 1159 if (pages_checked != mdsp->mds_npgs) { 1160 ret = KPHYSM_ENONRELOC; 1161 break; 1162 } 1163 } 1164 1165 if (ret == KPHYSM_OK) { 1166 mhp->mh_phys_pages += phys_pages; 1167 mhp->mh_vm_pages += vm_pages; 1168 } else { 1169 /* 1170 * Keep holding the mh_mutex to prevent it going away. 1171 */ 1172 delspan_remove(&mhp->mh_transit, base, npgs); 1173 } 1174 mutex_exit(&mhp->mh_mutex); 1175 return (ret); 1176 } 1177 1178 int 1179 kphysm_del_span_query( 1180 pfn_t base, 1181 pgcnt_t npgs, 1182 memquery_t *mqp) 1183 { 1184 struct memdelspan *mdsp; 1185 struct memdelspan *mdsp_new; 1186 int done_first_nonreloc; 1187 1188 mqp->phys_pages = 0; 1189 mqp->managed = 0; 1190 mqp->nonrelocatable = 0; 1191 mqp->first_nonrelocatable = 0; 1192 mqp->last_nonrelocatable = 0; 1193 1194 mdsp_new = span_to_install(base, npgs); 1195 /* 1196 * It is OK to proceed here if mdsp_new == NULL. 1197 */ 1198 done_first_nonreloc = 0; 1199 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1200 pfn_t sbase; 1201 pgcnt_t snpgs; 1202 1203 mqp->phys_pages += mdsp->mds_npgs; 1204 sbase = mdsp->mds_base; 1205 snpgs = mdsp->mds_npgs; 1206 while (snpgs != 0) { 1207 struct memseg *lseg, *seg; 1208 pfn_t p_end; 1209 page_t *pp; 1210 pfn_t mseg_start; 1211 1212 p_end = sbase + snpgs; 1213 /* 1214 * Find the lowest addressed memseg that starts 1215 * after sbase and account for it. 1216 * This is to catch dynamic memsegs whose start 1217 * is hidden. 
1218 */ 1219 seg = NULL; 1220 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1221 if ((lseg->pages_base >= sbase) || 1222 (lseg->pages_base < p_end && 1223 lseg->pages_end > sbase)) { 1224 if (seg == NULL || 1225 seg->pages_base > lseg->pages_base) 1226 seg = lseg; 1227 } 1228 } 1229 if (seg != NULL) { 1230 if (!memseg_is_dynamic(seg, &mseg_start)) { 1231 mseg_start = seg->pages_base; 1232 } 1233 /* 1234 * Now have the full extent of the memseg so 1235 * do the range check. 1236 */ 1237 if (mseg_start >= p_end || 1238 seg->pages_end <= sbase) { 1239 /* Span does not overlap memseg. */ 1240 seg = NULL; 1241 } 1242 } 1243 /* 1244 * Account for gap either before the segment if 1245 * there is one or to the end of the span. 1246 */ 1247 if (seg == NULL || mseg_start > sbase) { 1248 pfn_t a_end; 1249 1250 a_end = (seg == NULL) ? p_end : mseg_start; 1251 /* 1252 * Check with arch layer for relocatability. 1253 */ 1254 if (arch_kphysm_del_span_ok(sbase, 1255 (a_end - sbase))) { 1256 /* 1257 * No non-relocatble pages in this 1258 * area, avoid the fine-grained 1259 * test. 1260 */ 1261 snpgs -= (a_end - sbase); 1262 sbase = a_end; 1263 } 1264 while (sbase < a_end) { 1265 if (!arch_kphysm_del_span_ok(sbase, 1266 1)) { 1267 mqp->nonrelocatable++; 1268 if (!done_first_nonreloc) { 1269 mqp-> 1270 first_nonrelocatable 1271 = sbase; 1272 done_first_nonreloc = 1; 1273 } 1274 mqp->last_nonrelocatable = 1275 sbase; 1276 } 1277 sbase++; 1278 snpgs--; 1279 } 1280 } 1281 if (seg != NULL) { 1282 ASSERT(mseg_start <= sbase); 1283 if (seg->pages_base != mseg_start && 1284 seg->pages_base > sbase) { 1285 pgcnt_t skip_pgs; 1286 1287 /* 1288 * Skip the page_t area of a 1289 * dynamic memseg. 1290 */ 1291 skip_pgs = seg->pages_base - sbase; 1292 if (snpgs <= skip_pgs) { 1293 sbase += snpgs; 1294 snpgs = 0; 1295 continue; 1296 } 1297 snpgs -= skip_pgs; 1298 sbase += skip_pgs; 1299 } 1300 ASSERT(snpgs != 0); 1301 ASSERT(seg->pages_base <= sbase); 1302 /* 1303 * The individual pages can now be checked. 
1304 */ 1305 for (pp = seg->pages + 1306 (sbase - seg->pages_base); 1307 snpgs != 0 && pp < seg->epages; pp++) { 1308 mqp->managed++; 1309 if (PP_ISNORELOC(pp)) { 1310 mqp->nonrelocatable++; 1311 if (!done_first_nonreloc) { 1312 mqp-> 1313 first_nonrelocatable 1314 = sbase; 1315 done_first_nonreloc = 1; 1316 } 1317 mqp->last_nonrelocatable = 1318 sbase; 1319 } 1320 sbase++; 1321 snpgs--; 1322 } 1323 } 1324 } 1325 } 1326 1327 free_delspans(mdsp_new); 1328 1329 return (KPHYSM_OK); 1330 } 1331 1332 /* 1333 * This release function can be called at any stage as follows: 1334 * _gethandle only called 1335 * _span(s) only called 1336 * _start called but failed 1337 * delete thread exited 1338 */ 1339 int 1340 kphysm_del_release(memhandle_t handle) 1341 { 1342 struct mem_handle *mhp; 1343 1344 mhp = kphysm_lookup_mem_handle(handle); 1345 if (mhp == NULL) { 1346 return (KPHYSM_EHANDLE); 1347 } 1348 switch (mhp->mh_state) { 1349 case MHND_STARTING: 1350 case MHND_RUNNING: 1351 mutex_exit(&mhp->mh_mutex); 1352 return (KPHYSM_ENOTFINISHED); 1353 case MHND_FREE: 1354 ASSERT(mhp->mh_state != MHND_FREE); 1355 mutex_exit(&mhp->mh_mutex); 1356 return (KPHYSM_EHANDLE); 1357 case MHND_INIT: 1358 break; 1359 case MHND_DONE: 1360 break; 1361 case MHND_RELEASE: 1362 mutex_exit(&mhp->mh_mutex); 1363 return (KPHYSM_ESEQUENCE); 1364 default: 1365 #ifdef DEBUG 1366 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1367 (void *)mhp, mhp->mh_state); 1368 #endif /* DEBUG */ 1369 mutex_exit(&mhp->mh_mutex); 1370 return (KPHYSM_EHANDLE); 1371 } 1372 /* 1373 * Set state so that we can wait if necessary. 1374 * Also this means that we have read/write access to all 1375 * fields except mh_exthandle and mh_state. 1376 */ 1377 mhp->mh_state = MHND_RELEASE; 1378 /* 1379 * The mem_handle cannot be de-allocated by any other operation 1380 * now, so no need to hold mh_mutex. 1381 */ 1382 mutex_exit(&mhp->mh_mutex); 1383 1384 delspan_remove(&mhp->mh_transit, 0, 0); 1385 mhp->mh_phys_pages = 0; 1386 mhp->mh_vm_pages = 0; 1387 mhp->mh_hold_todo = 0; 1388 mhp->mh_delete_complete = NULL; 1389 mhp->mh_delete_complete_arg = NULL; 1390 mhp->mh_cancel = 0; 1391 1392 mutex_enter(&mhp->mh_mutex); 1393 ASSERT(mhp->mh_state == MHND_RELEASE); 1394 mhp->mh_state = MHND_FREE; 1395 1396 kphysm_free_mem_handle(mhp); 1397 1398 return (KPHYSM_OK); 1399 } 1400 1401 /* 1402 * This cancel function can only be called with the thread running. 1403 */ 1404 int 1405 kphysm_del_cancel(memhandle_t handle) 1406 { 1407 struct mem_handle *mhp; 1408 1409 mhp = kphysm_lookup_mem_handle(handle); 1410 if (mhp == NULL) { 1411 return (KPHYSM_EHANDLE); 1412 } 1413 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1414 mutex_exit(&mhp->mh_mutex); 1415 return (KPHYSM_ENOTRUNNING); 1416 } 1417 /* 1418 * Set the cancel flag and wake the delete thread up. 1419 * The thread may be waiting on I/O, so the effect of the cancel 1420 * may be delayed. 1421 */ 1422 if (mhp->mh_cancel == 0) { 1423 mhp->mh_cancel = KPHYSM_ECANCELLED; 1424 cv_signal(&mhp->mh_cv); 1425 } 1426 mutex_exit(&mhp->mh_mutex); 1427 return (KPHYSM_OK); 1428 } 1429 1430 int 1431 kphysm_del_status( 1432 memhandle_t handle, 1433 memdelstat_t *mdstp) 1434 { 1435 struct mem_handle *mhp; 1436 1437 mhp = kphysm_lookup_mem_handle(handle); 1438 if (mhp == NULL) { 1439 return (KPHYSM_EHANDLE); 1440 } 1441 /* 1442 * Calling kphysm_del_status() is allowed before the delete 1443 * is started to allow for status display. 
1444 */ 1445 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1446 mhp->mh_state != MHND_RUNNING) { 1447 mutex_exit(&mhp->mh_mutex); 1448 return (KPHYSM_ENOTRUNNING); 1449 } 1450 mdstp->phys_pages = mhp->mh_phys_pages; 1451 mdstp->managed = mhp->mh_vm_pages; 1452 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1453 mutex_exit(&mhp->mh_mutex); 1454 return (KPHYSM_OK); 1455 } 1456 1457 static int mem_delete_additional_pages = 100; 1458 1459 static int 1460 can_remove_pgs(pgcnt_t npgs) 1461 { 1462 /* 1463 * If all pageable pages were paged out, freemem would 1464 * equal availrmem. There is a minimum requirement for 1465 * availrmem. 1466 */ 1467 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1468 < npgs) 1469 return (0); 1470 /* TODO: check swap space, etc. */ 1471 return (1); 1472 } 1473 1474 static int 1475 get_availrmem(pgcnt_t npgs) 1476 { 1477 int ret; 1478 1479 mutex_enter(&freemem_lock); 1480 ret = can_remove_pgs(npgs); 1481 if (ret != 0) 1482 availrmem -= npgs; 1483 mutex_exit(&freemem_lock); 1484 return (ret); 1485 } 1486 1487 static void 1488 put_availrmem(pgcnt_t npgs) 1489 { 1490 mutex_enter(&freemem_lock); 1491 availrmem += npgs; 1492 mutex_exit(&freemem_lock); 1493 } 1494 1495 #define FREEMEM_INCR 100 1496 static pgcnt_t freemem_incr = FREEMEM_INCR; 1497 #define DEL_FREE_WAIT_FRAC 4 1498 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1499 1500 #define DEL_BUSY_WAIT_FRAC 20 1501 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1502 1503 static void kphysm_del_cleanup(struct mem_handle *); 1504 1505 static void page_delete_collect(page_t *, struct mem_handle *); 1506 1507 static pgcnt_t 1508 delthr_get_freemem(struct mem_handle *mhp) 1509 { 1510 pgcnt_t free_get; 1511 int ret; 1512 1513 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1514 1515 MDSTAT_INCR(mhp, need_free); 1516 /* 1517 * Get up to freemem_incr pages. 1518 */ 1519 free_get = freemem_incr; 1520 if (free_get > mhp->mh_hold_todo) 1521 free_get = mhp->mh_hold_todo; 1522 /* 1523 * Take free_get pages away from freemem, 1524 * waiting if necessary. 1525 */ 1526 1527 while (!mhp->mh_cancel) { 1528 mutex_exit(&mhp->mh_mutex); 1529 MDSTAT_INCR(mhp, free_loop); 1530 /* 1531 * Duplicate test from page_create_throttle() 1532 * but don't override with !PG_WAIT. 1533 */ 1534 if (freemem < (free_get + throttlefree)) { 1535 MDSTAT_INCR(mhp, free_low); 1536 ret = 0; 1537 } else { 1538 ret = page_create_wait(free_get, 0); 1539 if (ret == 0) { 1540 /* EMPTY */ 1541 MDSTAT_INCR(mhp, free_failed); 1542 } 1543 } 1544 if (ret != 0) { 1545 mutex_enter(&mhp->mh_mutex); 1546 return (free_get); 1547 } 1548 1549 /* 1550 * Put pressure on pageout. 1551 */ 1552 page_needfree(free_get); 1553 cv_signal(&proc_pageout->p_cv); 1554 1555 mutex_enter(&mhp->mh_mutex); 1556 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1557 (lbolt + DEL_FREE_WAIT_TICKS)); 1558 mutex_exit(&mhp->mh_mutex); 1559 page_needfree(-(spgcnt_t)free_get); 1560 1561 mutex_enter(&mhp->mh_mutex); 1562 } 1563 return (0); 1564 } 1565 1566 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1567 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1568 /* 1569 * This function is run as a helper thread for delete_memory_thread. 1570 * It is needed in order to force kaio cleanup, so that pages used in kaio 1571 * will be unlocked and subsequently relocated by delete_memory_thread. 
1572 * The address of the delete_memory_threads's mem_handle is passed in to 1573 * this thread function, and is used to set the mh_aio_cleanup_done member 1574 * prior to calling thread_exit(). 1575 */ 1576 static void 1577 dr_aio_cleanup_thread(caddr_t amhp) 1578 { 1579 proc_t *procp; 1580 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1581 int cleaned; 1582 int n = 0; 1583 struct mem_handle *mhp; 1584 volatile uint_t *pcancel; 1585 1586 mhp = (struct mem_handle *)amhp; 1587 ASSERT(mhp != NULL); 1588 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1589 if (modload("sys", "kaio") == -1) { 1590 mhp->mh_aio_cleanup_done = 1; 1591 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1592 thread_exit(); 1593 } 1594 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1595 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1596 if (aio_cleanup_dr_delete_memory == NULL) { 1597 mhp->mh_aio_cleanup_done = 1; 1598 cmn_err(CE_WARN, 1599 "aio_cleanup_dr_delete_memory not found in kaio"); 1600 thread_exit(); 1601 } 1602 do { 1603 cleaned = 0; 1604 mutex_enter(&pidlock); 1605 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1606 procp = procp->p_next) { 1607 mutex_enter(&procp->p_lock); 1608 if (procp->p_aio != NULL) { 1609 /* cleanup proc's outstanding kaio */ 1610 cleaned += 1611 (*aio_cleanup_dr_delete_memory)(procp); 1612 } 1613 mutex_exit(&procp->p_lock); 1614 } 1615 mutex_exit(&pidlock); 1616 if ((*pcancel == 0) && 1617 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1618 /* delay a bit before retrying all procs again */ 1619 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1620 n = 0; 1621 } 1622 } while (*pcancel == 0); 1623 mhp->mh_aio_cleanup_done = 1; 1624 thread_exit(); 1625 } 1626 1627 static void 1628 delete_memory_thread(caddr_t amhp) 1629 { 1630 struct mem_handle *mhp; 1631 struct memdelspan *mdsp; 1632 callb_cpr_t cprinfo; 1633 page_t *pp_targ; 1634 spgcnt_t freemem_left; 1635 void (*del_complete_funcp)(void *, int error); 1636 void *del_complete_arg; 1637 int comp_code; 1638 int ret; 1639 int first_scan; 1640 uint_t szc; 1641 #ifdef MEM_DEL_STATS 1642 uint64_t start_total, ntick_total; 1643 uint64_t start_pgrp, ntick_pgrp; 1644 #endif /* MEM_DEL_STATS */ 1645 1646 mhp = (struct mem_handle *)amhp; 1647 1648 #ifdef MEM_DEL_STATS 1649 start_total = ddi_get_lbolt(); 1650 #endif /* MEM_DEL_STATS */ 1651 1652 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1653 callb_generic_cpr, "memdel"); 1654 1655 mutex_enter(&mhp->mh_mutex); 1656 ASSERT(mhp->mh_state == MHND_STARTING); 1657 1658 mhp->mh_state = MHND_RUNNING; 1659 mhp->mh_thread_id = curthread; 1660 1661 mhp->mh_hold_todo = mhp->mh_vm_pages; 1662 mutex_exit(&mhp->mh_mutex); 1663 1664 /* Allocate the remap pages now, if necessary. */ 1665 memseg_remap_init(); 1666 1667 /* 1668 * Subtract from availrmem now if possible as availrmem 1669 * may not be available by the end of the delete. 
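	 * can_remove_pgs() refuses the subtraction if it would take
	 * availrmem below tune.t_minarmem plus
	 * mem_delete_additional_pages.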
1670 */ 1671 if (!get_availrmem(mhp->mh_vm_pages)) { 1672 comp_code = KPHYSM_ENOTVIABLE; 1673 mutex_enter(&mhp->mh_mutex); 1674 goto early_exit; 1675 } 1676 1677 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1678 1679 mutex_enter(&mhp->mh_mutex); 1680 1681 if (ret != 0) { 1682 mhp->mh_cancel = KPHYSM_EREFUSED; 1683 goto refused; 1684 } 1685 1686 transit_list_collect(mhp, 1); 1687 1688 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1689 mdsp = mdsp->mds_next) { 1690 ASSERT(mdsp->mds_bitmap == NULL); 1691 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1692 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1693 KM_SLEEP); 1694 } 1695 1696 first_scan = 1; 1697 freemem_left = 0; 1698 /* 1699 * Start dr_aio_cleanup_thread, which periodically iterates 1700 * through the process list and invokes aio cleanup. This 1701 * is needed in order to avoid a deadly embrace between the 1702 * delete_memory_thread (waiting on writer lock for page, with the 1703 * exclusive-wanted bit set), kaio read request threads (waiting for a 1704 * reader lock on the same page that is wanted by the 1705 * delete_memory_thread), and threads waiting for kaio completion 1706 * (blocked on spt_amp->lock). 1707 */ 1708 mhp->mh_dr_aio_cleanup_cancel = 0; 1709 mhp->mh_aio_cleanup_done = 0; 1710 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1711 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1712 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1713 pgcnt_t collected; 1714 1715 MDSTAT_INCR(mhp, nloop); 1716 collected = 0; 1717 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1718 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1719 pfn_t pfn, p_end; 1720 1721 if (first_scan) { 1722 mem_node_pre_del_slice(mdsp->mds_base, 1723 mdsp->mds_base + mdsp->mds_npgs - 1); 1724 } 1725 1726 p_end = mdsp->mds_base + mdsp->mds_npgs; 1727 for (pfn = mdsp->mds_base; (pfn < p_end) && 1728 (mhp->mh_cancel == 0); pfn++) { 1729 page_t *pp, *tpp, *tpp_targ; 1730 pgcnt_t bit; 1731 struct vnode *vp; 1732 u_offset_t offset; 1733 int mod, result; 1734 spgcnt_t pgcnt; 1735 1736 bit = pfn - mdsp->mds_base; 1737 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1738 (1 << (bit % NBPBMW))) != 0) { 1739 MDSTAT_INCR(mhp, already_done); 1740 continue; 1741 } 1742 if (freemem_left == 0) { 1743 freemem_left += delthr_get_freemem(mhp); 1744 if (freemem_left == 0) 1745 break; 1746 } 1747 1748 /* 1749 * Release mh_mutex - some of this 1750 * stuff takes some time (eg PUTPAGE). 1751 */ 1752 1753 mutex_exit(&mhp->mh_mutex); 1754 MDSTAT_INCR(mhp, ncheck); 1755 1756 pp = page_numtopp_nolock(pfn); 1757 if (pp == NULL) { 1758 /* 1759 * Not covered by a page_t - will 1760 * be dealt with elsewhere. 1761 */ 1762 MDSTAT_INCR(mhp, nopaget); 1763 mutex_enter(&mhp->mh_mutex); 1764 mdsp->mds_bitmap[bit / NBPBMW] |= 1765 (1 << (bit % NBPBMW)); 1766 continue; 1767 } 1768 1769 if (!page_try_reclaim_lock(pp, SE_EXCL, 1770 SE_EXCL_WANTED | SE_RETIRED)) { 1771 /* 1772 * Page in use elsewhere. Skip it. 1773 */ 1774 MDSTAT_INCR(mhp, lockfail); 1775 mutex_enter(&mhp->mh_mutex); 1776 continue; 1777 } 1778 /* 1779 * See if the cage expanded into the delete. 1780 * This can happen as we have to allow the 1781 * cage to expand. 1782 */ 1783 if (PP_ISNORELOC(pp)) { 1784 page_unlock(pp); 1785 mutex_enter(&mhp->mh_mutex); 1786 mhp->mh_cancel = KPHYSM_ENONRELOC; 1787 break; 1788 } 1789 if (PP_RETIRED(pp)) { 1790 /* 1791 * Page has been retired and is 1792 * not part of the cage so we 1793 * can now do the accounting for 1794 * it. 
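				 * The page is also marked in
				 * mds_bitmap_retired so the post-scan
				 * pass can find it again: it is
				 * unretired on a successful delete or
				 * simply unlocked if the delete is
				 * cancelled.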
1795 */ 1796 MDSTAT_INCR(mhp, retired); 1797 mutex_enter(&mhp->mh_mutex); 1798 mdsp->mds_bitmap[bit / NBPBMW] 1799 |= (1 << (bit % NBPBMW)); 1800 mdsp->mds_bitmap_retired[bit / 1801 NBPBMW] |= 1802 (1 << (bit % NBPBMW)); 1803 mhp->mh_hold_todo--; 1804 continue; 1805 } 1806 ASSERT(freemem_left != 0); 1807 if (PP_ISFREE(pp)) { 1808 /* 1809 * Like page_reclaim() only 'freemem' 1810 * processing is already done. 1811 */ 1812 MDSTAT_INCR(mhp, nfree); 1813 free_page_collect: 1814 if (PP_ISAGED(pp)) { 1815 page_list_sub(pp, 1816 PG_FREE_LIST); 1817 } else { 1818 page_list_sub(pp, 1819 PG_CACHE_LIST); 1820 } 1821 PP_CLRFREE(pp); 1822 PP_CLRAGED(pp); 1823 collected++; 1824 mutex_enter(&mhp->mh_mutex); 1825 page_delete_collect(pp, mhp); 1826 mdsp->mds_bitmap[bit / NBPBMW] |= 1827 (1 << (bit % NBPBMW)); 1828 freemem_left--; 1829 continue; 1830 } 1831 ASSERT(pp->p_vnode != NULL); 1832 if (first_scan) { 1833 MDSTAT_INCR(mhp, first_notfree); 1834 page_unlock(pp); 1835 mutex_enter(&mhp->mh_mutex); 1836 continue; 1837 } 1838 /* 1839 * Keep stats on pages encountered that 1840 * are marked for retirement. 1841 */ 1842 if (PP_TOXIC(pp)) { 1843 MDSTAT_INCR(mhp, toxic); 1844 } else if (PP_PR_REQ(pp)) { 1845 MDSTAT_INCR(mhp, failing); 1846 } 1847 /* 1848 * In certain cases below, special exceptions 1849 * are made for pages that are toxic. This 1850 * is because the current meaning of toxic 1851 * is that an uncorrectable error has been 1852 * previously associated with the page. 1853 */ 1854 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1855 if (!PP_TOXIC(pp)) { 1856 /* 1857 * Must relocate locked in 1858 * memory pages. 1859 */ 1860 #ifdef MEM_DEL_STATS 1861 start_pgrp = ddi_get_lbolt(); 1862 #endif /* MEM_DEL_STATS */ 1863 /* 1864 * Lock all constituent pages 1865 * of a large page to ensure 1866 * that p_szc won't change. 1867 */ 1868 if (!group_page_trylock(pp, 1869 SE_EXCL)) { 1870 MDSTAT_INCR(mhp, 1871 gptllckfail); 1872 page_unlock(pp); 1873 mutex_enter( 1874 &mhp->mh_mutex); 1875 continue; 1876 } 1877 MDSTAT_INCR(mhp, npplocked); 1878 pp_targ = 1879 page_get_replacement_page( 1880 pp, NULL, 0); 1881 if (pp_targ != NULL) { 1882 #ifdef MEM_DEL_STATS 1883 ntick_pgrp = 1884 (uint64_t) 1885 ddi_get_lbolt() - 1886 start_pgrp; 1887 #endif /* MEM_DEL_STATS */ 1888 MDSTAT_PGRP(mhp, 1889 ntick_pgrp); 1890 MDSTAT_INCR(mhp, 1891 nlockreloc); 1892 goto reloc; 1893 } 1894 group_page_unlock(pp); 1895 page_unlock(pp); 1896 #ifdef MEM_DEL_STATS 1897 ntick_pgrp = 1898 (uint64_t)ddi_get_lbolt() - 1899 start_pgrp; 1900 #endif /* MEM_DEL_STATS */ 1901 MDSTAT_PGRP(mhp, ntick_pgrp); 1902 MDSTAT_INCR(mhp, nnorepl); 1903 mutex_enter(&mhp->mh_mutex); 1904 continue; 1905 } else { 1906 /* 1907 * Cannot do anything about 1908 * this page because it is 1909 * toxic. 1910 */ 1911 MDSTAT_INCR(mhp, npplkdtoxic); 1912 page_unlock(pp); 1913 mutex_enter(&mhp->mh_mutex); 1914 continue; 1915 } 1916 } 1917 /* 1918 * Unload the mappings and check if mod bit 1919 * is set. 1920 */ 1921 ASSERT(!PP_ISKAS(pp)); 1922 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1923 mod = hat_ismod(pp); 1924 1925 #ifdef MEM_DEL_STATS 1926 start_pgrp = ddi_get_lbolt(); 1927 #endif /* MEM_DEL_STATS */ 1928 if (mod && !PP_TOXIC(pp)) { 1929 /* 1930 * Lock all constituent pages 1931 * of a large page to ensure 1932 * that p_szc won't change. 
1933 */ 1934 if (!group_page_trylock(pp, SE_EXCL)) { 1935 MDSTAT_INCR(mhp, gptlmodfail); 1936 page_unlock(pp); 1937 mutex_enter(&mhp->mh_mutex); 1938 continue; 1939 } 1940 pp_targ = page_get_replacement_page(pp, 1941 NULL, 0); 1942 if (pp_targ != NULL) { 1943 MDSTAT_INCR(mhp, nmodreloc); 1944 #ifdef MEM_DEL_STATS 1945 ntick_pgrp = 1946 (uint64_t)ddi_get_lbolt() - 1947 start_pgrp; 1948 #endif /* MEM_DEL_STATS */ 1949 MDSTAT_PGRP(mhp, ntick_pgrp); 1950 goto reloc; 1951 } 1952 group_page_unlock(pp); 1953 } 1954 1955 if (!page_try_demote_pages(pp)) { 1956 MDSTAT_INCR(mhp, demotefail); 1957 page_unlock(pp); 1958 #ifdef MEM_DEL_STATS 1959 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1960 start_pgrp; 1961 #endif /* MEM_DEL_STATS */ 1962 MDSTAT_PGRP(mhp, ntick_pgrp); 1963 mutex_enter(&mhp->mh_mutex); 1964 continue; 1965 } 1966 1967 /* 1968 * Regular 'page-out'. 1969 */ 1970 if (!mod) { 1971 MDSTAT_INCR(mhp, ndestroy); 1972 page_destroy(pp, 1); 1973 /* 1974 * page_destroy was called with 1975 * dontfree. As long as p_lckcnt 1976 * and p_cowcnt are both zero, the 1977 * only additional action of 1978 * page_destroy with !dontfree is to 1979 * call page_free, so we can collect 1980 * the page here. 1981 */ 1982 collected++; 1983 #ifdef MEM_DEL_STATS 1984 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1985 start_pgrp; 1986 #endif /* MEM_DEL_STATS */ 1987 MDSTAT_PGRP(mhp, ntick_pgrp); 1988 mutex_enter(&mhp->mh_mutex); 1989 page_delete_collect(pp, mhp); 1990 mdsp->mds_bitmap[bit / NBPBMW] |= 1991 (1 << (bit % NBPBMW)); 1992 continue; 1993 } 1994 /* 1995 * The page is toxic and the mod bit is 1996 * set, we cannot do anything here to deal 1997 * with it. 1998 */ 1999 if (PP_TOXIC(pp)) { 2000 page_unlock(pp); 2001 #ifdef MEM_DEL_STATS 2002 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2003 start_pgrp; 2004 #endif /* MEM_DEL_STATS */ 2005 MDSTAT_PGRP(mhp, ntick_pgrp); 2006 MDSTAT_INCR(mhp, modtoxic); 2007 mutex_enter(&mhp->mh_mutex); 2008 continue; 2009 } 2010 MDSTAT_INCR(mhp, nputpage); 2011 vp = pp->p_vnode; 2012 offset = pp->p_offset; 2013 VN_HOLD(vp); 2014 page_unlock(pp); 2015 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2016 B_INVAL|B_FORCE, kcred, NULL); 2017 VN_RELE(vp); 2018 #ifdef MEM_DEL_STATS 2019 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2020 start_pgrp; 2021 #endif /* MEM_DEL_STATS */ 2022 MDSTAT_PGRP(mhp, ntick_pgrp); 2023 /* 2024 * Try to get the page back immediately 2025 * so that it can be collected. 2026 */ 2027 pp = page_numtopp_nolock(pfn); 2028 if (pp == NULL) { 2029 MDSTAT_INCR(mhp, nnoreclaim); 2030 /* 2031 * This should not happen as this 2032 * thread is deleting the page. 2033 * If this code is generalized, this 2034 * becomes a reality. 2035 */ 2036 #ifdef DEBUG 2037 cmn_err(CE_WARN, 2038 "delete_memory_thread(0x%p) " 2039 "pfn 0x%lx has no page_t", 2040 (void *)mhp, pfn); 2041 #endif /* DEBUG */ 2042 mutex_enter(&mhp->mh_mutex); 2043 continue; 2044 } 2045 if (page_try_reclaim_lock(pp, SE_EXCL, 2046 SE_EXCL_WANTED | SE_RETIRED)) { 2047 if (PP_ISFREE(pp)) { 2048 goto free_page_collect; 2049 } 2050 page_unlock(pp); 2051 } 2052 MDSTAT_INCR(mhp, nnoreclaim); 2053 mutex_enter(&mhp->mh_mutex); 2054 continue; 2055 2056 reloc: 2057 /* 2058 * Got some freemem and a target 2059 * page, so move the data to avoid 2060 * I/O and lock problems. 2061 */ 2062 ASSERT(!page_iolock_assert(pp)); 2063 MDSTAT_INCR(mhp, nreloc); 2064 /* 2065 * page_relocate() will return pgcnt: the 2066 * number of consecutive pages relocated. 
2067 * If it is successful, pp will be a 2068 * linked list of the page structs that 2069 * were relocated. If page_relocate() is 2070 * unsuccessful, pp will be unmodified. 2071 */ 2072 #ifdef MEM_DEL_STATS 2073 start_pgrp = ddi_get_lbolt(); 2074 #endif /* MEM_DEL_STATS */ 2075 result = page_relocate(&pp, &pp_targ, 0, 0, 2076 &pgcnt, NULL); 2077 #ifdef MEM_DEL_STATS 2078 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2079 start_pgrp; 2080 #endif /* MEM_DEL_STATS */ 2081 MDSTAT_PGRP(mhp, ntick_pgrp); 2082 if (result != 0) { 2083 MDSTAT_INCR(mhp, nrelocfail); 2084 /* 2085 * We did not succeed. We need 2086 * to give the pp_targ pages back. 2087 * page_free(pp_targ, 1) without 2088 * the freemem accounting. 2089 */ 2090 group_page_unlock(pp); 2091 page_free_replacement_page(pp_targ); 2092 page_unlock(pp); 2093 mutex_enter(&mhp->mh_mutex); 2094 continue; 2095 } 2096 2097 /* 2098 * We will then collect pgcnt pages. 2099 */ 2100 ASSERT(pgcnt > 0); 2101 mutex_enter(&mhp->mh_mutex); 2102 /* 2103 * We need to make sure freemem_left is 2104 * large enough. 2105 */ 2106 while ((freemem_left < pgcnt) && 2107 (!mhp->mh_cancel)) { 2108 freemem_left += 2109 delthr_get_freemem(mhp); 2110 } 2111 2112 /* 2113 * Do not proceed if mh_cancel is set. 2114 */ 2115 if (mhp->mh_cancel) { 2116 while (pp_targ != NULL) { 2117 /* 2118 * Unlink and unlock each page. 2119 */ 2120 tpp_targ = pp_targ; 2121 page_sub(&pp_targ, tpp_targ); 2122 page_unlock(tpp_targ); 2123 } 2124 /* 2125 * We need to give the pp pages back. 2126 * page_free(pp, 1) without the 2127 * freemem accounting. 2128 */ 2129 page_free_replacement_page(pp); 2130 break; 2131 } 2132 2133 /* Now remove pgcnt from freemem_left */ 2134 freemem_left -= pgcnt; 2135 ASSERT(freemem_left >= 0); 2136 szc = pp->p_szc; 2137 while (pp != NULL) { 2138 /* 2139 * pp and pp_targ were passed back as 2140 * a linked list of pages. 2141 * Unlink and unlock each page. 2142 */ 2143 tpp_targ = pp_targ; 2144 page_sub(&pp_targ, tpp_targ); 2145 page_unlock(tpp_targ); 2146 /* 2147 * The original page is now free 2148 * so remove it from the linked 2149 * list and collect it. 2150 */ 2151 tpp = pp; 2152 page_sub(&pp, tpp); 2153 pfn = page_pptonum(tpp); 2154 collected++; 2155 ASSERT(PAGE_EXCL(tpp)); 2156 ASSERT(tpp->p_vnode == NULL); 2157 ASSERT(!hat_page_is_mapped(tpp)); 2158 ASSERT(tpp->p_szc == szc); 2159 tpp->p_szc = 0; 2160 page_delete_collect(tpp, mhp); 2161 bit = pfn - mdsp->mds_base; 2162 mdsp->mds_bitmap[bit / NBPBMW] |= 2163 (1 << (bit % NBPBMW)); 2164 } 2165 ASSERT(pp_targ == NULL); 2166 } 2167 } 2168 first_scan = 0; 2169 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2170 (collected == 0)) { 2171 /* 2172 * This code is needed as we cannot wait 2173 * for a page to be locked OR the delete to 2174 * be cancelled. Also, we must delay so 2175 * that other threads get a chance to run 2176 * on our cpu, otherwise page locks may be 2177 * held indefinitely by those threads. 2178 */ 2179 MDSTAT_INCR(mhp, ndelay); 2180 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2181 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 2182 (lbolt + DEL_BUSY_WAIT_TICKS)); 2183 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2184 } 2185 } 2186 /* stop the dr aio cleanup thread */ 2187 mhp->mh_dr_aio_cleanup_cancel = 1; 2188 transit_list_collect(mhp, 0); 2189 if (freemem_left != 0) { 2190 /* Return any surplus. 
*/ 2191 page_create_putback(freemem_left); 2192 freemem_left = 0; 2193 } 2194 #ifdef MEM_DEL_STATS 2195 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2196 #endif /* MEM_DEL_STATS */ 2197 MDSTAT_TOTAL(mhp, ntick_total); 2198 MDSTAT_PRINT(mhp); 2199 2200 /* 2201 * If the memory delete was cancelled, exclusive-wanted bits must 2202 * be cleared. If there are retired pages being deleted, they need 2203 * to be unretired. 2204 */ 2205 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2206 mdsp = mdsp->mds_next) { 2207 pfn_t pfn, p_end; 2208 2209 p_end = mdsp->mds_base + mdsp->mds_npgs; 2210 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2211 page_t *pp; 2212 pgcnt_t bit; 2213 2214 bit = pfn - mdsp->mds_base; 2215 if (mhp->mh_cancel) { 2216 pp = page_numtopp_nolock(pfn); 2217 if (pp != NULL) { 2218 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2219 (1 << (bit % NBPBMW))) == 0) { 2220 page_lock_clr_exclwanted(pp); 2221 } 2222 } 2223 } else { 2224 pp = NULL; 2225 } 2226 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2227 (1 << (bit % NBPBMW))) != 0) { 2228 /* do we already have pp? */ 2229 if (pp == NULL) { 2230 pp = page_numtopp_nolock(pfn); 2231 } 2232 ASSERT(pp != NULL); 2233 ASSERT(PP_RETIRED(pp)); 2234 if (mhp->mh_cancel != 0) { 2235 page_unlock(pp); 2236 /* 2237 * To satisfy ASSERT below in 2238 * cancel code. 2239 */ 2240 mhp->mh_hold_todo++; 2241 } else { 2242 (void) page_unretire_pp(pp, 2243 PR_UNR_CLEAN); 2244 } 2245 } 2246 } 2247 } 2248 /* 2249 * Free retired page bitmap and collected page bitmap 2250 */ 2251 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2252 mdsp = mdsp->mds_next) { 2253 ASSERT(mdsp->mds_bitmap_retired != NULL); 2254 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2255 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2256 ASSERT(mdsp->mds_bitmap != NULL); 2257 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2258 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2259 } 2260 2261 /* wait for our dr aio cancel thread to exit */ 2262 while (!(mhp->mh_aio_cleanup_done)) { 2263 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2264 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2265 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2266 } 2267 refused: 2268 if (mhp->mh_cancel != 0) { 2269 page_t *pp; 2270 2271 comp_code = mhp->mh_cancel; 2272 /* 2273 * Go through list of deleted pages (mh_deleted) freeing 2274 * them. 2275 */ 2276 while ((pp = mhp->mh_deleted) != NULL) { 2277 mhp->mh_deleted = pp->p_next; 2278 mhp->mh_hold_todo++; 2279 mutex_exit(&mhp->mh_mutex); 2280 /* Restore p_next. */ 2281 pp->p_next = pp->p_prev; 2282 if (PP_ISFREE(pp)) { 2283 cmn_err(CE_PANIC, 2284 "page %p is free", 2285 (void *)pp); 2286 } 2287 page_free(pp, 1); 2288 mutex_enter(&mhp->mh_mutex); 2289 } 2290 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2291 2292 mutex_exit(&mhp->mh_mutex); 2293 put_availrmem(mhp->mh_vm_pages); 2294 mutex_enter(&mhp->mh_mutex); 2295 2296 goto t_exit; 2297 } 2298 2299 /* 2300 * All the pages are no longer in use and are exclusively locked. 2301 */ 2302 2303 mhp->mh_deleted = NULL; 2304 2305 kphysm_del_cleanup(mhp); 2306 2307 /* 2308 * mem_node_post_del_slice needs to be after kphysm_del_cleanup so 2309 * that the mem_node_config[] will remain intact for the cleanup. 
2310 */ 2311 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2312 mdsp = mdsp->mds_next) { 2313 mem_node_post_del_slice(mdsp->mds_base, 2314 mdsp->mds_base + mdsp->mds_npgs - 1, 0); 2315 } 2316 2317 comp_code = KPHYSM_OK; 2318 2319 t_exit: 2320 mutex_exit(&mhp->mh_mutex); 2321 kphysm_setup_post_del(mhp->mh_vm_pages, 2322 (comp_code == KPHYSM_OK) ? 0 : 1); 2323 mutex_enter(&mhp->mh_mutex); 2324 2325 early_exit: 2326 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2327 mhp->mh_state = MHND_DONE; 2328 del_complete_funcp = mhp->mh_delete_complete; 2329 del_complete_arg = mhp->mh_delete_complete_arg; 2330 CALLB_CPR_EXIT(&cprinfo); 2331 (*del_complete_funcp)(del_complete_arg, comp_code); 2332 thread_exit(); 2333 /*NOTREACHED*/ 2334 } 2335 2336 /* 2337 * Start the delete of the memory from the system. 2338 */ 2339 int 2340 kphysm_del_start( 2341 memhandle_t handle, 2342 void (*complete)(void *, int), 2343 void *complete_arg) 2344 { 2345 struct mem_handle *mhp; 2346 2347 mhp = kphysm_lookup_mem_handle(handle); 2348 if (mhp == NULL) { 2349 return (KPHYSM_EHANDLE); 2350 } 2351 switch (mhp->mh_state) { 2352 case MHND_FREE: 2353 ASSERT(mhp->mh_state != MHND_FREE); 2354 mutex_exit(&mhp->mh_mutex); 2355 return (KPHYSM_EHANDLE); 2356 case MHND_INIT: 2357 break; 2358 case MHND_STARTING: 2359 case MHND_RUNNING: 2360 mutex_exit(&mhp->mh_mutex); 2361 return (KPHYSM_ESEQUENCE); 2362 case MHND_DONE: 2363 mutex_exit(&mhp->mh_mutex); 2364 return (KPHYSM_ESEQUENCE); 2365 case MHND_RELEASE: 2366 mutex_exit(&mhp->mh_mutex); 2367 return (KPHYSM_ESEQUENCE); 2368 default: 2369 #ifdef DEBUG 2370 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2371 (void *)mhp, mhp->mh_state); 2372 #endif /* DEBUG */ 2373 mutex_exit(&mhp->mh_mutex); 2374 return (KPHYSM_EHANDLE); 2375 } 2376 2377 if (mhp->mh_transit.trl_spans == NULL) { 2378 mutex_exit(&mhp->mh_mutex); 2379 return (KPHYSM_ENOWORK); 2380 } 2381 2382 ASSERT(complete != NULL); 2383 mhp->mh_delete_complete = complete; 2384 mhp->mh_delete_complete_arg = complete_arg; 2385 mhp->mh_state = MHND_STARTING; 2386 /* 2387 * Release the mutex in case thread_create sleeps. 2388 */ 2389 mutex_exit(&mhp->mh_mutex); 2390 2391 /* 2392 * The "obvious" process for this thread is pageout (proc_pageout) 2393 * but this gives the thread too much power over freemem 2394 * which results in freemem starvation. 2395 */ 2396 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2397 TS_RUN, maxclsyspri - 1); 2398 2399 return (KPHYSM_OK); 2400 } 2401 2402 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2403 static caddr_t pp_dummy; 2404 static pgcnt_t pp_dummy_npages; 2405 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ 2406 2407 static void 2408 memseg_remap_init_pages(page_t *pages, page_t *epages) 2409 { 2410 page_t *pp; 2411 2412 for (pp = pages; pp < epages; pp++) { 2413 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2414 pp->p_offset = (u_offset_t)-1; 2415 page_iolock_init(pp); 2416 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2417 continue; 2418 page_lock_delete(pp); 2419 } 2420 } 2421 2422 void 2423 memseg_remap_init() 2424 { 2425 mutex_enter(&pp_dummy_lock); 2426 if (pp_dummy == NULL) { 2427 uint_t dpages; 2428 int i; 2429 2430 /* 2431 * dpages starts off as the size of the structure and 2432 * ends up as the minimum number of pages that will 2433 * hold a whole number of page_t structures. 
2434 */ 2435 dpages = sizeof (page_t); 2436 ASSERT(dpages != 0); 2437 ASSERT(dpages <= MMU_PAGESIZE); 2438 2439 while ((dpages & 1) == 0) 2440 dpages >>= 1; 2441 2442 pp_dummy_npages = dpages; 2443 /* 2444 * Allocate pp_dummy pages directly from static_arena, 2445 * since these are whole page allocations and are 2446 * referenced by physical address. This also has the 2447 * nice fringe benefit of hiding the memory from 2448 * ::findleaks since it doesn't deal well with allocated 2449 * kernel heap memory that doesn't have any mappings. 2450 */ 2451 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2452 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2453 bzero(pp_dummy, ptob(pp_dummy_npages)); 2454 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2455 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2456 pp_dummy_npages, KM_SLEEP); 2457 for (i = 0; i < pp_dummy_npages; i++) { 2458 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2459 &pp_dummy[MMU_PAGESIZE * i]); 2460 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2461 } 2462 /* 2463 * Initialize the page_t's to a known 'deleted' state 2464 * that matches the state of deleted pages. 2465 */ 2466 memseg_remap_init_pages((page_t *)pp_dummy, 2467 (page_t *)(pp_dummy + ptob(pp_dummy_npages))); 2468 /* Remove kmem mappings for the pages for safety. */ 2469 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2470 HAT_UNLOAD_UNLOCK); 2471 /* Leave pp_dummy pointer set as flag that init is done. */ 2472 } 2473 mutex_exit(&pp_dummy_lock); 2474 } 2475 2476 static void 2477 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs) 2478 { 2479 ASSERT(pp_dummy != NULL); 2480 2481 while (metapgs != 0) { 2482 pgcnt_t n; 2483 int i; 2484 2485 n = pp_dummy_npages; 2486 if (n > metapgs) 2487 n = metapgs; 2488 for (i = 0; i < n; i++) { 2489 hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i], 2490 PROT_READ, 2491 HAT_LOAD | HAT_LOAD_NOCONSIST | 2492 HAT_LOAD_REMAP); 2493 pp += ptob(1); 2494 } 2495 metapgs -= n; 2496 } 2497 } 2498 2499 /* 2500 * Transition all the deleted pages to the deleted state so that 2501 * page_lock will not wait. The page_lock_delete call will 2502 * also wake up any waiters. 2503 */ 2504 static void 2505 memseg_lock_delete_all(struct memseg *seg) 2506 { 2507 page_t *pp; 2508 2509 for (pp = seg->pages; pp < seg->epages; pp++) { 2510 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2511 page_lock_delete(pp); 2512 } 2513 } 2514 2515 static void 2516 kphysm_del_cleanup(struct mem_handle *mhp) 2517 { 2518 struct memdelspan *mdsp; 2519 struct memseg *seg; 2520 struct memseg **segpp; 2521 struct memseg *seglist; 2522 pfn_t p_end; 2523 uint64_t avmem; 2524 pgcnt_t avpgs; 2525 pgcnt_t npgs; 2526 2527 avpgs = mhp->mh_vm_pages; 2528 2529 memsegs_lock(1); 2530 2531 /* 2532 * remove from main segment list. 2533 */ 2534 npgs = 0; 2535 seglist = NULL; 2536 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2537 mdsp = mdsp->mds_next) { 2538 p_end = mdsp->mds_base + mdsp->mds_npgs; 2539 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2540 if (seg->pages_base >= p_end || 2541 seg->pages_end <= mdsp->mds_base) { 2542 /* Span and memseg don't overlap. */ 2543 segpp = &((*segpp)->next); 2544 continue; 2545 } 2546 ASSERT(seg->pages_base >= mdsp->mds_base); 2547 ASSERT(seg->pages_end <= p_end); 2548 2549 PLCNT_MODIFY_MAX(seg->pages_base, 2550 seg->pages_base - seg->pages_end); 2551 2552 /* Hide the memseg from future scans. */ 2553 hat_kpm_delmem_mseg_update(seg, segpp); 2554 *segpp = seg->next; 2555 membar_producer(); /* TODO: Needed? 
*/ 2556 npgs += MSEG_NPAGES(seg); 2557 2558 /* 2559 * Leave the deleted segment's next pointer intact 2560 * in case a memsegs scanning loop is walking this 2561 * segment concurrently. 2562 */ 2563 seg->lnext = seglist; 2564 seglist = seg; 2565 } 2566 } 2567 2568 build_pfn_hash(); 2569 2570 ASSERT(npgs < total_pages); 2571 total_pages -= npgs; 2572 2573 /* 2574 * Recalculate the paging parameters now total_pages has changed. 2575 * This will also cause the clock hands to be reset before next use. 2576 */ 2577 setupclock(1); 2578 2579 memsegs_unlock(1); 2580 2581 mutex_exit(&mhp->mh_mutex); 2582 2583 while ((seg = seglist) != NULL) { 2584 pfn_t mseg_start; 2585 pfn_t mseg_base, mseg_end; 2586 pgcnt_t mseg_npgs; 2587 page_t *pp; 2588 pgcnt_t metapgs; 2589 int dynamic; 2590 int mlret; 2591 2592 seglist = seg->lnext; 2593 2594 /* 2595 * Put the page_t's into the deleted state to stop 2596 * cv_wait()s on the pages. When we remap, the dummy 2597 * page_t's will be in the same state. 2598 */ 2599 memseg_lock_delete_all(seg); 2600 /* 2601 * Collect up information based on pages_base and pages_end 2602 * early so that we can flag early that the memseg has been 2603 * deleted by setting pages_end == pages_base. 2604 */ 2605 mseg_base = seg->pages_base; 2606 mseg_end = seg->pages_end; 2607 mseg_npgs = MSEG_NPAGES(seg); 2608 dynamic = memseg_is_dynamic(seg, &mseg_start); 2609 2610 seg->pages_end = seg->pages_base; 2611 2612 if (dynamic) { 2613 pp = seg->pages; 2614 metapgs = mseg_base - mseg_start; 2615 ASSERT(metapgs != 0); 2616 2617 /* Remap the meta data to our special dummy area. */ 2618 memseg_remap_to_dummy((caddr_t)pp, metapgs); 2619 2620 mutex_enter(&memseg_lists_lock); 2621 seg->lnext = memseg_va_avail; 2622 memseg_va_avail = seg; 2623 mutex_exit(&memseg_lists_lock); 2624 } else { 2625 /* 2626 * Set for clean-up below. 2627 */ 2628 mseg_start = seg->pages_base; 2629 /* 2630 * For memory whose page_ts were allocated 2631 * at boot, we need to find a new use for 2632 * the page_t memory. 2633 * For the moment, just leak it. 2634 * (It is held in the memseg_delete_junk list.) 2635 */ 2636 2637 mutex_enter(&memseg_lists_lock); 2638 seg->lnext = memseg_delete_junk; 2639 memseg_delete_junk = seg; 2640 mutex_exit(&memseg_lists_lock); 2641 } 2642 2643 /* Must not use seg now as it could be re-used. */ 2644 2645 memlist_write_lock(); 2646 2647 mlret = memlist_delete_span( 2648 (uint64_t)(mseg_base) << PAGESHIFT, 2649 (uint64_t)(mseg_npgs) << PAGESHIFT, 2650 &phys_avail); 2651 ASSERT(mlret == MEML_SPANOP_OK); 2652 2653 mlret = memlist_delete_span( 2654 (uint64_t)(mseg_start) << PAGESHIFT, 2655 (uint64_t)(mseg_end - mseg_start) << 2656 PAGESHIFT, 2657 &phys_install); 2658 ASSERT(mlret == MEML_SPANOP_OK); 2659 phys_install_has_changed(); 2660 2661 memlist_write_unlock(); 2662 } 2663 2664 memlist_read_lock(); 2665 installed_top_size(phys_install, &physmax, &physinstalled); 2666 memlist_read_unlock(); 2667 2668 mutex_enter(&freemem_lock); 2669 maxmem -= avpgs; 2670 physmem -= avpgs; 2671 /* availrmem is adjusted during the delete. 
*/ 2672 availrmem_initial -= avpgs; 2673 2674 mutex_exit(&freemem_lock); 2675 2676 dump_resize(); 2677 2678 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2679 "(0x%" PRIx64 ")\n", 2680 physinstalled << (PAGESHIFT - 10), 2681 (uint64_t)physinstalled << PAGESHIFT); 2682 2683 avmem = (uint64_t)freemem << PAGESHIFT; 2684 cmn_err(CE_CONT, "?kphysm_delete: " 2685 "avail mem = %" PRId64 "\n", avmem); 2686 2687 /* 2688 * Update lgroup generation number on single lgroup systems 2689 */ 2690 if (nlgrps == 1) 2691 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2692 2693 /* Successfully deleted system memory */ 2694 mutex_enter(&mhp->mh_mutex); 2695 } 2696 2697 static uint_t mdel_nullvp_waiter; 2698 2699 static void 2700 page_delete_collect( 2701 page_t *pp, 2702 struct mem_handle *mhp) 2703 { 2704 if (pp->p_vnode) { 2705 page_hashout(pp, (kmutex_t *)NULL); 2706 /* do not do PP_SETAGED(pp); */ 2707 } else { 2708 kmutex_t *sep; 2709 2710 sep = page_se_mutex(pp); 2711 mutex_enter(sep); 2712 if (CV_HAS_WAITERS(&pp->p_cv)) { 2713 mdel_nullvp_waiter++; 2714 cv_broadcast(&pp->p_cv); 2715 } 2716 mutex_exit(sep); 2717 } 2718 ASSERT(pp->p_next == pp->p_prev); 2719 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2720 pp->p_next = mhp->mh_deleted; 2721 mhp->mh_deleted = pp; 2722 ASSERT(mhp->mh_hold_todo != 0); 2723 mhp->mh_hold_todo--; 2724 } 2725 2726 static void 2727 transit_list_collect(struct mem_handle *mhp, int v) 2728 { 2729 struct transit_list_head *trh; 2730 2731 trh = &transit_list_head; 2732 mutex_enter(&trh->trh_lock); 2733 mhp->mh_transit.trl_collect = v; 2734 mutex_exit(&trh->trh_lock); 2735 } 2736 2737 static void 2738 transit_list_insert(struct transit_list *tlp) 2739 { 2740 struct transit_list_head *trh; 2741 2742 trh = &transit_list_head; 2743 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2744 tlp->trl_next = trh->trh_head; 2745 trh->trh_head = tlp; 2746 } 2747 2748 static void 2749 transit_list_remove(struct transit_list *tlp) 2750 { 2751 struct transit_list_head *trh; 2752 struct transit_list **tlpp; 2753 2754 trh = &transit_list_head; 2755 tlpp = &trh->trh_head; 2756 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2757 while (*tlpp != NULL && *tlpp != tlp) 2758 tlpp = &(*tlpp)->trl_next; 2759 ASSERT(*tlpp != NULL); 2760 if (*tlpp == tlp) 2761 *tlpp = tlp->trl_next; 2762 tlp->trl_next = NULL; 2763 } 2764 2765 static struct transit_list * 2766 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2767 { 2768 struct transit_list *tlp; 2769 2770 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2771 struct memdelspan *mdsp; 2772 2773 for (mdsp = tlp->trl_spans; mdsp != NULL; 2774 mdsp = mdsp->mds_next) { 2775 if (pfnum >= mdsp->mds_base && 2776 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2777 return (tlp); 2778 } 2779 } 2780 } 2781 return (NULL); 2782 } 2783 2784 int 2785 pfn_is_being_deleted(pfn_t pfnum) 2786 { 2787 struct transit_list_head *trh; 2788 struct transit_list *tlp; 2789 int ret; 2790 2791 trh = &transit_list_head; 2792 if (trh->trh_head == NULL) 2793 return (0); 2794 2795 mutex_enter(&trh->trh_lock); 2796 tlp = pfnum_to_transit_list(trh, pfnum); 2797 ret = (tlp != NULL && tlp->trl_collect); 2798 mutex_exit(&trh->trh_lock); 2799 2800 return (ret); 2801 } 2802 2803 #ifdef MEM_DEL_STATS 2804 extern int hz; 2805 static void 2806 mem_del_stat_print_func(struct mem_handle *mhp) 2807 { 2808 uint64_t tmp; 2809 2810 if (mem_del_stat_print) { 2811 printf("memory delete loop %x/%x, statistics%s\n", 2812 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2813 
(uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2814 (mhp->mh_cancel ? " (cancelled)" : "")); 2815 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2816 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2817 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2818 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2819 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2820 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2821 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2822 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2823 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2824 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2825 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2826 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2827 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2828 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2829 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2830 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2831 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2832 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2833 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2834 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2835 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2836 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2837 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2838 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2839 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2840 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2841 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2842 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2843 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2844 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2845 printf( 2846 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2847 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2848 2849 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2850 printf( 2851 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2852 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2853 } 2854 } 2855 #endif /* MEM_DEL_STATS */ 2856 2857 struct mem_callback { 2858 kphysm_setup_vector_t *vec; 2859 void *arg; 2860 }; 2861 2862 #define NMEMCALLBACKS 100 2863 2864 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2865 static uint_t nmemcallbacks; 2866 static krwlock_t mem_callback_rwlock; 2867 2868 int 2869 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2870 { 2871 uint_t i, found; 2872 2873 /* 2874 * This test will become more complicated when the version must 2875 * change. 2876 */ 2877 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2878 return (EINVAL); 2879 2880 if (vec->post_add == NULL || vec->pre_del == NULL || 2881 vec->post_del == NULL) 2882 return (EINVAL); 2883 2884 rw_enter(&mem_callback_rwlock, RW_WRITER); 2885 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2886 if (mem_callbacks[i].vec == NULL && found == 0) 2887 found = i + 1; 2888 if (mem_callbacks[i].vec == vec && 2889 mem_callbacks[i].arg == arg) { 2890 #ifdef DEBUG 2891 /* Catch this in DEBUG kernels. 
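			 * (The duplicate registration is still rejected with
			 * EEXIST on non-DEBUG kernels; only this warning is
			 * DEBUG-only.)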
*/ 2892 cmn_err(CE_WARN, "kphysm_setup_func_register" 2893 "(0x%p, 0x%p) duplicate registration from 0x%p", 2894 (void *)vec, arg, (void *)caller()); 2895 #endif /* DEBUG */ 2896 rw_exit(&mem_callback_rwlock); 2897 return (EEXIST); 2898 } 2899 } 2900 if (found != 0) { 2901 i = found - 1; 2902 } else { 2903 ASSERT(nmemcallbacks < NMEMCALLBACKS); 2904 if (nmemcallbacks == NMEMCALLBACKS) { 2905 rw_exit(&mem_callback_rwlock); 2906 return (ENOMEM); 2907 } 2908 i = nmemcallbacks++; 2909 } 2910 mem_callbacks[i].vec = vec; 2911 mem_callbacks[i].arg = arg; 2912 rw_exit(&mem_callback_rwlock); 2913 return (0); 2914 } 2915 2916 void 2917 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 2918 { 2919 uint_t i; 2920 2921 rw_enter(&mem_callback_rwlock, RW_WRITER); 2922 for (i = 0; i < nmemcallbacks; i++) { 2923 if (mem_callbacks[i].vec == vec && 2924 mem_callbacks[i].arg == arg) { 2925 mem_callbacks[i].vec = NULL; 2926 mem_callbacks[i].arg = NULL; 2927 if (i == (nmemcallbacks - 1)) 2928 nmemcallbacks--; 2929 break; 2930 } 2931 } 2932 rw_exit(&mem_callback_rwlock); 2933 } 2934 2935 static void 2936 kphysm_setup_post_add(pgcnt_t delta_pages) 2937 { 2938 uint_t i; 2939 2940 rw_enter(&mem_callback_rwlock, RW_READER); 2941 for (i = 0; i < nmemcallbacks; i++) { 2942 if (mem_callbacks[i].vec != NULL) { 2943 (*mem_callbacks[i].vec->post_add) 2944 (mem_callbacks[i].arg, delta_pages); 2945 } 2946 } 2947 rw_exit(&mem_callback_rwlock); 2948 } 2949 2950 /* 2951 * Note the locking between pre_del and post_del: The reader lock is held 2952 * between the two calls to stop the set of functions from changing. 2953 */ 2954 2955 static int 2956 kphysm_setup_pre_del(pgcnt_t delta_pages) 2957 { 2958 uint_t i; 2959 int ret; 2960 int aret; 2961 2962 ret = 0; 2963 rw_enter(&mem_callback_rwlock, RW_READER); 2964 for (i = 0; i < nmemcallbacks; i++) { 2965 if (mem_callbacks[i].vec != NULL) { 2966 aret = (*mem_callbacks[i].vec->pre_del) 2967 (mem_callbacks[i].arg, delta_pages); 2968 ret |= aret; 2969 } 2970 } 2971 2972 return (ret); 2973 } 2974 2975 static void 2976 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 2977 { 2978 uint_t i; 2979 2980 for (i = 0; i < nmemcallbacks; i++) { 2981 if (mem_callbacks[i].vec != NULL) { 2982 (*mem_callbacks[i].vec->post_del) 2983 (mem_callbacks[i].arg, delta_pages, cancelled); 2984 } 2985 } 2986 rw_exit(&mem_callback_rwlock); 2987 } 2988 2989 static int 2990 kphysm_split_memseg( 2991 pfn_t base, 2992 pgcnt_t npgs) 2993 { 2994 struct memseg *seg; 2995 struct memseg **segpp; 2996 pgcnt_t size_low, size_high; 2997 struct memseg *seg_low, *seg_mid, *seg_high; 2998 2999 /* 3000 * Lock the memsegs list against other updates now 3001 */ 3002 memsegs_lock(1); 3003 3004 /* 3005 * Find boot time memseg that wholly covers this area. 3006 */ 3007 3008 /* First find the memseg with page 'base' in it. */ 3009 for (segpp = &memsegs; (seg = *segpp) != NULL; 3010 segpp = &((*segpp)->next)) { 3011 if (base >= seg->pages_base && base < seg->pages_end) 3012 break; 3013 } 3014 if (seg == NULL) { 3015 memsegs_unlock(1); 3016 return (0); 3017 } 3018 if (memseg_is_dynamic(seg, (pfn_t *)NULL)) { 3019 memsegs_unlock(1); 3020 return (0); 3021 } 3022 if ((base + npgs) > seg->pages_end) { 3023 memsegs_unlock(1); 3024 return (0); 3025 } 3026 3027 /* 3028 * Work out the size of the two segments that will 3029 * surround the new segment, one for low address 3030 * and one for high. 
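	 * For instance (hypothetical pfns): a boot-time memseg covering
	 * pfns [0x1000, 0x5000) split at base == 0x2000 with npgs ==
	 * 0x1000 gives size_low == 0x1000 and size_high == 0x2000, so
	 * both a low and a high segment are created around the new
	 * middle segment.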
3031 */ 3032 ASSERT(base >= seg->pages_base); 3033 size_low = base - seg->pages_base; 3034 ASSERT(seg->pages_end >= (base + npgs)); 3035 size_high = seg->pages_end - (base + npgs); 3036 3037 /* 3038 * Sanity check. 3039 */ 3040 if ((size_low + size_high) == 0) { 3041 memsegs_unlock(1); 3042 return (0); 3043 } 3044 3045 /* 3046 * Allocate the new structures. The old memseg will not be freed 3047 * as there may be a reference to it. 3048 */ 3049 seg_low = NULL; 3050 seg_high = NULL; 3051 3052 if (size_low != 0) { 3053 seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3054 bzero(seg_low, sizeof (struct memseg)); 3055 } 3056 3057 seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3058 bzero(seg_mid, sizeof (struct memseg)); 3059 3060 if (size_high != 0) { 3061 seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3062 bzero(seg_high, sizeof (struct memseg)); 3063 } 3064 3065 /* 3066 * All allocation done now. 3067 */ 3068 if (size_low != 0) { 3069 seg_low->pages = seg->pages; 3070 seg_low->epages = seg_low->pages + size_low; 3071 seg_low->pages_base = seg->pages_base; 3072 seg_low->pages_end = seg_low->pages_base + size_low; 3073 seg_low->next = seg_mid; 3074 } 3075 if (size_high != 0) { 3076 seg_high->pages = seg->epages - size_high; 3077 seg_high->epages = seg_high->pages + size_high; 3078 seg_high->pages_base = seg->pages_end - size_high; 3079 seg_high->pages_end = seg_high->pages_base + size_high; 3080 seg_high->next = seg->next; 3081 } 3082 3083 seg_mid->pages = seg->pages + size_low; 3084 seg_mid->pages_base = seg->pages_base + size_low; 3085 seg_mid->epages = seg->epages - size_high; 3086 seg_mid->pages_end = seg->pages_end - size_high; 3087 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3088 3089 /* 3090 * Update hat_kpm specific info of all involved memsegs and 3091 * allow hat_kpm specific global chain updates. 3092 */ 3093 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3094 3095 /* 3096 * At this point we have two equivalent memseg sub-chains, 3097 * seg and seg_low/seg_mid/seg_high, which both chain on to 3098 * the same place in the global chain. By re-writing the pointer 3099 * in the previous element we switch atomically from using the old 3100 * (seg) to the new. 3101 */ 3102 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3103 3104 membar_enter(); 3105 3106 build_pfn_hash(); 3107 memsegs_unlock(1); 3108 3109 /* 3110 * We leave the old segment, 'seg', intact as there may be 3111 * references to it. Also, as the value of total_pages has not 3112 * changed and the memsegs list is effectively the same when 3113 * accessed via the old or the new pointer, we do not have to 3114 * cause pageout_scanner() to re-evaluate its hand pointers. 3115 * 3116 * We currently do not re-use or reclaim the page_t memory. 3117 * If we do, then this may have to change. 3118 */ 3119 3120 mutex_enter(&memseg_lists_lock); 3121 seg->lnext = memseg_edit_junk; 3122 memseg_edit_junk = seg; 3123 mutex_exit(&memseg_lists_lock); 3124 3125 return (1); 3126 } 3127 3128 /* 3129 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3130 * structure using physical addresses. Therefore a kmem_cache is 3131 * used with KMC_NOHASH to avoid page crossings within a memseg 3132 * structure. KMC_NOHASH requires that no external (outside of 3133 * slab) information is allowed. This, in turn, implies that the 3134 * cache's slabsize must be exactly a single page, since per-slab 3135 * information (e.g. 
the freelist for the slab) is kept at the 3136 end of the slab, where it is easy to locate. This should be changed 3137 when a more obvious kmem_cache interface/flag becomes 3138 available. 3139 */ 3140 void 3141 mem_config_init() 3142 { 3143 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3144 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3145 } 3146
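
/*
 * Illustrative sketch only (not part of the original file): how a kernel
 * client might register for the memory add/delete callbacks dispatched
 * above.  The example_* names and the MEM_CONFIG_EXAMPLE guard are
 * hypothetical; the vector fields and the registration calls are the ones
 * used by kphysm_setup_func_register()/_unregister() in this file.
 */
#ifdef MEM_CONFIG_EXAMPLE	/* hypothetical guard, never defined */

/*ARGSUSED*/
static void
example_post_add(void *arg, pgcnt_t delta_pages)
{
	/* delta_pages of memory were added; e.g. grow private caches. */
}

/*ARGSUSED*/
static int
example_pre_del(void *arg, pgcnt_t delta_pages)
{
	/*
	 * Results from all registered pre_del callbacks are OR-ed
	 * together by kphysm_setup_pre_del(); a non-zero return
	 * objects to the pending delete.
	 */
	return (0);
}

/*ARGSUSED*/
static void
example_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/* cancelled != 0 means the delete did not complete. */
}

static kphysm_setup_vector_t example_vec = {
	.version = KPHYSM_SETUP_VECTOR_VERSION,
	.post_add = example_post_add,
	.pre_del = example_pre_del,
	.post_del = example_post_del
};

static void
example_attach(void)
{
	(void) kphysm_setup_func_register(&example_vec, NULL);
}

static void
example_detach(void)
{
	kphysm_setup_func_unregister(&example_vec, NULL);
}

#endif	/* MEM_CONFIG_EXAMPLE */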