/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>	/* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>	/* for installed_top_size() */
#include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>	/* for dump_resize() */
#include <sys/atomic.h>		/* for use in stats collection */
#include <sys/rwlock.h>
#include <sys/cpuvar.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>
#include <sys/modctl.h>

extern void memlist_read_lock(void);
extern void memlist_read_unlock(void);
extern void memlist_write_lock(void);
extern void memlist_write_unlock(void);

extern struct memlist *phys_avail;

extern void mem_node_add(pfn_t, pfn_t);
extern void mem_node_del(pfn_t, pfn_t);

extern uint_t page_ctrs_adjust(int);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

static kmutex_t memseg_lists_lock;
static struct memseg *memseg_va_avail;
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

static struct kmem_cache *memseg_cache;

/*
 * Add a chunk of memory to the system.  page_t's for this memory
 * are allocated in the first few pages of the chunk.
 * base: starting PAGESIZE page of new memory.
 * npgs: length in PAGESIZE pages.
 *
 * Adding mem this way doesn't increase the size of the hash tables;
 * growing them would be too hard.
This should be OK, but adding memory 101 * dynamically most likely means more hash misses, since the tables will 102 * be smaller than they otherwise would be. 103 */ 104 int 105 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 106 { 107 page_t *pp; 108 page_t *opp, *oepp; 109 struct memseg *seg; 110 uint64_t avmem; 111 pfn_t pfn; 112 pfn_t pt_base = base; 113 pgcnt_t tpgs = npgs; 114 pgcnt_t metapgs; 115 int exhausted; 116 pfn_t pnum; 117 int mnode; 118 caddr_t vaddr; 119 int reuse; 120 int mlret; 121 void *mapva; 122 pgcnt_t nkpmpgs = 0; 123 offset_t kpm_pages_off; 124 125 cmn_err(CE_CONT, 126 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 127 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 128 129 /* 130 * Add this span in the delete list to prevent interactions. 131 */ 132 if (!delspan_reserve(base, npgs)) { 133 return (KPHYSM_ESPAN); 134 } 135 /* 136 * Check to see if any of the memory span has been added 137 * by trying an add to the installed memory list. This 138 * forms the interlocking process for add. 139 */ 140 141 memlist_write_lock(); 142 143 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 144 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 145 146 if (mlret == MEML_SPANOP_OK) 147 installed_top_size(phys_install, &physmax, &physinstalled); 148 149 memlist_write_unlock(); 150 151 if (mlret != MEML_SPANOP_OK) { 152 if (mlret == MEML_SPANOP_EALLOC) { 153 delspan_unreserve(pt_base, tpgs); 154 return (KPHYSM_ERESOURCE); 155 } else 156 if (mlret == MEML_SPANOP_ESPAN) { 157 delspan_unreserve(pt_base, tpgs); 158 return (KPHYSM_ESPAN); 159 } else { 160 delspan_unreserve(pt_base, tpgs); 161 return (KPHYSM_ERESOURCE); 162 } 163 } 164 165 /* 166 * We store the page_t's for this new memory in the first 167 * few pages of the chunk. Here, we go and get'em ... 168 */ 169 170 /* 171 * The expression after the '-' gives the number of pages 172 * that will fit in the new memory based on a requirement 173 * of (PAGESIZE + sizeof (page_t)) bytes per page. 174 */ 175 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 176 (PAGESIZE + sizeof (page_t))); 177 178 npgs -= metapgs; 179 base += metapgs; 180 181 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 182 183 exhausted = (metapgs == 0 || npgs == 0); 184 185 if (kpm_enable && !exhausted) { 186 pgcnt_t start, end, nkpmpgs_prelim; 187 size_t ptsz; 188 189 /* 190 * A viable kpm large page mapping must not overlap two 191 * dynamic memsegs. Therefore the total size is checked 192 * to be at least kpm_pgsz and also whether start and end 193 * points are at least kpm_pgsz aligned. 194 */ 195 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 196 pmodkpmp(base + npgs)) { 197 198 kphysm_addmem_error_undospan(pt_base, tpgs); 199 200 /* 201 * There is no specific error code for violating 202 * kpm granularity constraints. 203 */ 204 return (KPHYSM_ENOTVIABLE); 205 } 206 207 start = kpmptop(ptokpmp(base)); 208 end = kpmptop(ptokpmp(base + npgs)); 209 nkpmpgs_prelim = ptokpmp(end - start); 210 ptsz = npgs * sizeof (page_t); 211 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 212 exhausted = (tpgs <= metapgs); 213 if (!exhausted) { 214 npgs = tpgs - metapgs; 215 base = pt_base + metapgs; 216 217 /* final nkpmpgs */ 218 start = kpmptop(ptokpmp(base)); 219 nkpmpgs = ptokpmp(end - start); 220 kpm_pages_off = ptsz + 221 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 222 } 223 } 224 225 /* 226 * Is memory area supplied too small? 
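	 *
	 * (Illustrative sizing note, not from the original source: the
	 * metapgs calculation above reserves roughly sizeof (page_t)
	 * bytes of metadata per added page.  Assuming, purely for
	 * example, an 8K PAGESIZE and a 128-byte page_t:
	 *
	 *	metapgs = npgs - (npgs * 8192) / (8192 + 128)
	 *	        ~= npgs / 65
	 *
	 * so about one page in 65 is consumed by page_t structures,
	 * and a span of only a few pages can end up "exhausted" with
	 * no pages left over to add.  Actual structure sizes are
	 * platform and build dependent.)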
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);

		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}

	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);

	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 */
	pnum = base + npgs - 1;
	mem_node_add_slice(base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	if (page_ctrs_adjust(mnode) != 0) {

		mem_node_pre_del_slice(base, pnum);
		mem_node_post_del_slice(base, pnum, 0);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	seg = memseg_reuse(metapgs);

	reuse = (seg != NULL);

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 */
	if (seg == NULL) {
		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg, sizeof (struct memseg));
		seg->msegflags = MEMSEG_DYNAMIC;
		seg->pages = pp;
	} else {
		/*EMPTY*/
		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;

	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready to be freed.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process still has a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now that total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 * physmax - highest page frame number.
	 * physinstalled - number of pages currently installed (done earlier)
	 * maxmem - max free pages in the system
	 * physmem - physical memory pages available
	 * availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);		/* Successfully added system memory */

}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
515 */ 516 static struct memseg * 517 memseg_reuse(pgcnt_t metapgs) 518 { 519 struct memseg **segpp, *seg; 520 521 mutex_enter(&memseg_lists_lock); 522 523 segpp = &memseg_va_avail; 524 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { 525 caddr_t end; 526 527 if (kpm_enable) 528 end = hat_kpm_mseg_reuse(seg); 529 else 530 end = (caddr_t)seg->epages; 531 532 if (btopr(end - (caddr_t)seg->pages) == metapgs) { 533 *segpp = seg->lnext; 534 seg->lnext = NULL; 535 break; 536 } 537 } 538 mutex_exit(&memseg_lists_lock); 539 540 return (seg); 541 } 542 543 static uint_t handle_gen; 544 545 struct memdelspan { 546 struct memdelspan *mds_next; 547 pfn_t mds_base; 548 pgcnt_t mds_npgs; 549 uint_t *mds_bitmap; 550 uint_t *mds_bitmap_retired; 551 }; 552 553 #define NBPBMW (sizeof (uint_t) * NBBY) 554 #define MDS_BITMAPBYTES(MDSP) \ 555 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 556 557 struct transit_list { 558 struct transit_list *trl_next; 559 struct memdelspan *trl_spans; 560 int trl_collect; 561 }; 562 563 struct transit_list_head { 564 kmutex_t trh_lock; 565 struct transit_list *trh_head; 566 }; 567 568 static struct transit_list_head transit_list_head; 569 570 struct mem_handle; 571 static void transit_list_collect(struct mem_handle *, int); 572 static void transit_list_insert(struct transit_list *); 573 static void transit_list_remove(struct transit_list *); 574 575 #ifdef DEBUG 576 #define MEM_DEL_STATS 577 #endif /* DEBUG */ 578 579 #ifdef MEM_DEL_STATS 580 static int mem_del_stat_print = 0; 581 struct mem_del_stat { 582 uint_t nloop; 583 uint_t need_free; 584 uint_t free_loop; 585 uint_t free_low; 586 uint_t free_failed; 587 uint_t ncheck; 588 uint_t nopaget; 589 uint_t lockfail; 590 uint_t nfree; 591 uint_t nreloc; 592 uint_t nrelocfail; 593 uint_t already_done; 594 uint_t first_notfree; 595 uint_t npplocked; 596 uint_t nlockreloc; 597 uint_t nnorepl; 598 uint_t nmodreloc; 599 uint_t ndestroy; 600 uint_t nputpage; 601 uint_t nnoreclaim; 602 uint_t ndelay; 603 uint_t demotefail; 604 uint64_t nticks_total; 605 uint64_t nticks_pgrp; 606 uint_t retired; 607 uint_t toxic; 608 uint_t failing; 609 uint_t modtoxic; 610 uint_t npplkdtoxic; 611 uint_t gptlmodfail; 612 uint_t gptllckfail; 613 }; 614 /* 615 * The stat values are only incremented in the delete thread 616 * so no locking or atomic required. 617 */ 618 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 619 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 620 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 621 static void mem_del_stat_print_func(struct mem_handle *); 622 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 623 #else /* MEM_DEL_STATS */ 624 #define MDSTAT_INCR(MHP, FLD) 625 #define MDSTAT_TOTAL(MHP, ntck) 626 #define MDSTAT_PGRP(MHP, ntck) 627 #define MDSTAT_PRINT(MHP) 628 #endif /* MEM_DEL_STATS */ 629 630 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 631 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 632 633 /* 634 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 635 * The mutex may not be required for other fields, dependent on mh_state. 
 */
struct mem_handle {
	kmutex_t mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t mh_exthandle;
	mhnd_state_t mh_state;
	struct transit_list mh_transit;
	pgcnt_t mh_phys_pages;
	pgcnt_t mh_vm_pages;
	pgcnt_t mh_hold_todo;
	void (*mh_delete_complete)(void *, int error);
	void *mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t mh_cv;
	kthread_id_t mh_thread_id;
	page_t *mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next is changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
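	 *
	 * (Illustrative use of the delete interface, not part of the
	 * original comment; my_done_cb and my_arg are hypothetical
	 * caller-supplied values:
	 *
	 *	memhandle_t h;
	 *
	 *	if (kphysm_del_gethandle(&h) != KPHYSM_OK)
	 *		return;
	 *	if (kphysm_del_span(h, base, npgs) == KPHYSM_OK)
	 *		(void) kphysm_del_start(h, my_done_cb, my_arg);
	 *	else
	 *		(void) kphysm_del_release(h);
	 *
	 * After the completion callback reports the result, the caller
	 * is still expected to call kphysm_del_release() to free the
	 * handle.)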
749 */ 750 ASSERT(mhp->mh_state == MHND_FREE); 751 mhp->mh_state = MHND_INIT; 752 *xmhp = mhp->mh_exthandle; 753 mutex_exit(&mhp->mh_mutex); 754 return (KPHYSM_OK); 755 } 756 757 static int 758 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 759 { 760 pfn_t e1, e2; 761 762 e1 = b1 + l1; 763 e2 = b2 + l2; 764 765 return (!(b2 >= e1 || b1 >= e2)); 766 } 767 768 static int can_remove_pgs(pgcnt_t); 769 770 static struct memdelspan * 771 span_to_install(pfn_t base, pgcnt_t npgs) 772 { 773 struct memdelspan *mdsp; 774 struct memdelspan *mdsp_new; 775 uint64_t address, size, thislen; 776 struct memlist *mlp; 777 778 mdsp_new = NULL; 779 780 address = (uint64_t)base << PAGESHIFT; 781 size = (uint64_t)npgs << PAGESHIFT; 782 while (size != 0) { 783 memlist_read_lock(); 784 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 785 if (address >= (mlp->address + mlp->size)) 786 continue; 787 if ((address + size) > mlp->address) 788 break; 789 } 790 if (mlp == NULL) { 791 address += size; 792 size = 0; 793 thislen = 0; 794 } else { 795 if (address < mlp->address) { 796 size -= (mlp->address - address); 797 address = mlp->address; 798 } 799 ASSERT(address >= mlp->address); 800 if ((address + size) > (mlp->address + mlp->size)) { 801 thislen = mlp->size - (address - mlp->address); 802 } else { 803 thislen = size; 804 } 805 } 806 memlist_read_unlock(); 807 /* TODO: phys_install could change now */ 808 if (thislen == 0) 809 continue; 810 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 811 mdsp->mds_base = btop(address); 812 mdsp->mds_npgs = btop(thislen); 813 mdsp->mds_next = mdsp_new; 814 mdsp_new = mdsp; 815 address += thislen; 816 size -= thislen; 817 } 818 return (mdsp_new); 819 } 820 821 static void 822 free_delspans(struct memdelspan *mdsp) 823 { 824 struct memdelspan *amdsp; 825 826 while ((amdsp = mdsp) != NULL) { 827 mdsp = amdsp->mds_next; 828 kmem_free(amdsp, sizeof (struct memdelspan)); 829 } 830 } 831 832 /* 833 * Concatenate lists. No list ordering is required. 834 */ 835 836 static void 837 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 838 { 839 while (*mdspp != NULL) 840 mdspp = &(*mdspp)->mds_next; 841 842 *mdspp = mdsp; 843 } 844 845 /* 846 * Given a new list of delspans, check there is no overlap with 847 * all existing span activity (add or delete) and then concatenate 848 * the new spans to the given list. 849 * Return 1 for OK, 0 if overlapping. 
850 */ 851 static int 852 delspan_insert( 853 struct transit_list *my_tlp, 854 struct memdelspan *mdsp_new) 855 { 856 struct transit_list_head *trh; 857 struct transit_list *tlp; 858 int ret; 859 860 trh = &transit_list_head; 861 862 ASSERT(my_tlp != NULL); 863 ASSERT(mdsp_new != NULL); 864 865 ret = 1; 866 mutex_enter(&trh->trh_lock); 867 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 868 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 869 struct memdelspan *mdsp; 870 871 for (mdsp = tlp->trl_spans; mdsp != NULL; 872 mdsp = mdsp->mds_next) { 873 struct memdelspan *nmdsp; 874 875 for (nmdsp = mdsp_new; nmdsp != NULL; 876 nmdsp = nmdsp->mds_next) { 877 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 878 nmdsp->mds_base, nmdsp->mds_npgs)) { 879 ret = 0; 880 goto done; 881 } 882 } 883 } 884 } 885 done: 886 if (ret != 0) { 887 if (my_tlp->trl_spans == NULL) 888 transit_list_insert(my_tlp); 889 delspan_concat(&my_tlp->trl_spans, mdsp_new); 890 } 891 mutex_exit(&trh->trh_lock); 892 return (ret); 893 } 894 895 static void 896 delspan_remove( 897 struct transit_list *my_tlp, 898 pfn_t base, 899 pgcnt_t npgs) 900 { 901 struct transit_list_head *trh; 902 struct memdelspan *mdsp; 903 904 trh = &transit_list_head; 905 906 ASSERT(my_tlp != NULL); 907 908 mutex_enter(&trh->trh_lock); 909 if ((mdsp = my_tlp->trl_spans) != NULL) { 910 if (npgs == 0) { 911 my_tlp->trl_spans = NULL; 912 free_delspans(mdsp); 913 transit_list_remove(my_tlp); 914 } else { 915 struct memdelspan **prv; 916 917 prv = &my_tlp->trl_spans; 918 while (mdsp != NULL) { 919 pfn_t p_end; 920 921 p_end = mdsp->mds_base + mdsp->mds_npgs; 922 if (mdsp->mds_base >= base && 923 p_end <= (base + npgs)) { 924 *prv = mdsp->mds_next; 925 mdsp->mds_next = NULL; 926 free_delspans(mdsp); 927 } else { 928 prv = &mdsp->mds_next; 929 } 930 mdsp = *prv; 931 } 932 if (my_tlp->trl_spans == NULL) 933 transit_list_remove(my_tlp); 934 } 935 } 936 mutex_exit(&trh->trh_lock); 937 } 938 939 /* 940 * Reserve interface for add to stop delete before add finished. 941 * This list is only accessed through the delspan_insert/remove 942 * functions and so is fully protected by the mutex in struct transit_list. 943 */ 944 945 static struct transit_list reserve_transit; 946 947 static int 948 delspan_reserve(pfn_t base, pgcnt_t npgs) 949 { 950 struct memdelspan *mdsp; 951 int ret; 952 953 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 954 mdsp->mds_base = base; 955 mdsp->mds_npgs = npgs; 956 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 957 free_delspans(mdsp); 958 } 959 return (ret); 960 } 961 962 static void 963 delspan_unreserve(pfn_t base, pgcnt_t npgs) 964 { 965 delspan_remove(&reserve_transit, base, npgs); 966 } 967 968 /* 969 * Return whether memseg was created by kphysm_add_memory_dynamic(). 970 * If this is the case and startp non zero, return also the start pfn 971 * of the meta data via startp. 
972 */ 973 static int 974 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 975 { 976 pfn_t pt_start; 977 978 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 979 return (0); 980 981 /* Meta data is required to be at the beginning */ 982 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 983 984 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 985 if (startp != NULL) 986 *startp = pt_start; 987 988 return (1); 989 } 990 991 int 992 kphysm_del_span( 993 memhandle_t handle, 994 pfn_t base, 995 pgcnt_t npgs) 996 { 997 struct mem_handle *mhp; 998 struct memseg *seg; 999 struct memdelspan *mdsp; 1000 struct memdelspan *mdsp_new; 1001 pgcnt_t phys_pages, vm_pages; 1002 pfn_t p_end; 1003 page_t *pp; 1004 int ret; 1005 1006 mhp = kphysm_lookup_mem_handle(handle); 1007 if (mhp == NULL) { 1008 return (KPHYSM_EHANDLE); 1009 } 1010 if (mhp->mh_state != MHND_INIT) { 1011 mutex_exit(&mhp->mh_mutex); 1012 return (KPHYSM_ESEQUENCE); 1013 } 1014 1015 /* 1016 * Intersect the span with the installed memory list (phys_install). 1017 */ 1018 mdsp_new = span_to_install(base, npgs); 1019 if (mdsp_new == NULL) { 1020 /* 1021 * No physical memory in this range. Is this an 1022 * error? If an attempt to start the delete is made 1023 * for OK returns from del_span such as this, start will 1024 * return an error. 1025 * Could return KPHYSM_ENOWORK. 1026 */ 1027 /* 1028 * It is assumed that there are no error returns 1029 * from span_to_install() due to kmem_alloc failure. 1030 */ 1031 mutex_exit(&mhp->mh_mutex); 1032 return (KPHYSM_OK); 1033 } 1034 /* 1035 * Does this span overlap an existing span? 1036 */ 1037 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1038 /* 1039 * Differentiate between already on list for this handle 1040 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1041 */ 1042 ret = KPHYSM_EBUSY; 1043 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1044 mdsp = mdsp->mds_next) { 1045 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1046 base, npgs)) { 1047 ret = KPHYSM_EDUP; 1048 break; 1049 } 1050 } 1051 mutex_exit(&mhp->mh_mutex); 1052 free_delspans(mdsp_new); 1053 return (ret); 1054 } 1055 /* 1056 * At this point the spans in mdsp_new have been inserted into the 1057 * list of spans for this handle and thereby to the global list of 1058 * spans being processed. Each of these spans must now be checked 1059 * for relocatability. As a side-effect segments in the memseg list 1060 * may be split. 1061 * 1062 * Note that mdsp_new can no longer be used as it is now part of 1063 * a larger list. Select elements of this larger list based 1064 * on base and npgs. 1065 */ 1066 restart: 1067 phys_pages = 0; 1068 vm_pages = 0; 1069 ret = KPHYSM_OK; 1070 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1071 mdsp = mdsp->mds_next) { 1072 pgcnt_t pages_checked; 1073 1074 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1075 continue; 1076 } 1077 p_end = mdsp->mds_base + mdsp->mds_npgs; 1078 /* 1079 * The pages_checked count is a hack. All pages should be 1080 * checked for relocatability. Those not covered by memsegs 1081 * should be tested with arch_kphysm_del_span_ok(). 1082 */ 1083 pages_checked = 0; 1084 for (seg = memsegs; seg; seg = seg->next) { 1085 pfn_t mseg_start; 1086 1087 if (seg->pages_base >= p_end || 1088 seg->pages_end <= mdsp->mds_base) { 1089 /* Span and memseg don't overlap. */ 1090 continue; 1091 } 1092 /* Check that segment is suitable for delete. 
*/ 1093 if (memseg_is_dynamic(seg, &mseg_start)) { 1094 /* 1095 * Can only delete whole added segments 1096 * for the moment. 1097 * Check that this is completely within the 1098 * span. 1099 */ 1100 if (mseg_start < mdsp->mds_base || 1101 seg->pages_end > p_end) { 1102 ret = KPHYSM_EBUSY; 1103 break; 1104 } 1105 pages_checked += seg->pages_end - mseg_start; 1106 } else { 1107 /* 1108 * Set mseg_start for accounting below. 1109 */ 1110 mseg_start = seg->pages_base; 1111 /* 1112 * If this segment is larger than the span, 1113 * try to split it. After the split, it 1114 * is necessary to restart. 1115 */ 1116 if (seg->pages_base < mdsp->mds_base || 1117 seg->pages_end > p_end) { 1118 pfn_t abase; 1119 pgcnt_t anpgs; 1120 int s_ret; 1121 1122 /* Split required. */ 1123 if (mdsp->mds_base < seg->pages_base) 1124 abase = seg->pages_base; 1125 else 1126 abase = mdsp->mds_base; 1127 if (p_end > seg->pages_end) 1128 anpgs = seg->pages_end - abase; 1129 else 1130 anpgs = p_end - abase; 1131 s_ret = kphysm_split_memseg(abase, 1132 anpgs); 1133 if (s_ret == 0) { 1134 /* Split failed. */ 1135 ret = KPHYSM_ERESOURCE; 1136 break; 1137 } 1138 goto restart; 1139 } 1140 pages_checked += 1141 seg->pages_end - seg->pages_base; 1142 } 1143 /* 1144 * The memseg is wholly within the delete span. 1145 * The individual pages can now be checked. 1146 */ 1147 /* Cage test. */ 1148 for (pp = seg->pages; pp < seg->epages; pp++) { 1149 if (PP_ISNORELOC(pp)) { 1150 ret = KPHYSM_ENONRELOC; 1151 break; 1152 } 1153 } 1154 if (ret != KPHYSM_OK) { 1155 break; 1156 } 1157 phys_pages += (seg->pages_end - mseg_start); 1158 vm_pages += MSEG_NPAGES(seg); 1159 } 1160 if (ret != KPHYSM_OK) 1161 break; 1162 if (pages_checked != mdsp->mds_npgs) { 1163 ret = KPHYSM_ENONRELOC; 1164 break; 1165 } 1166 } 1167 1168 if (ret == KPHYSM_OK) { 1169 mhp->mh_phys_pages += phys_pages; 1170 mhp->mh_vm_pages += vm_pages; 1171 } else { 1172 /* 1173 * Keep holding the mh_mutex to prevent it going away. 1174 */ 1175 delspan_remove(&mhp->mh_transit, base, npgs); 1176 } 1177 mutex_exit(&mhp->mh_mutex); 1178 return (ret); 1179 } 1180 1181 int 1182 kphysm_del_span_query( 1183 pfn_t base, 1184 pgcnt_t npgs, 1185 memquery_t *mqp) 1186 { 1187 struct memdelspan *mdsp; 1188 struct memdelspan *mdsp_new; 1189 int done_first_nonreloc; 1190 1191 mqp->phys_pages = 0; 1192 mqp->managed = 0; 1193 mqp->nonrelocatable = 0; 1194 mqp->first_nonrelocatable = 0; 1195 mqp->last_nonrelocatable = 0; 1196 1197 mdsp_new = span_to_install(base, npgs); 1198 /* 1199 * It is OK to proceed here if mdsp_new == NULL. 1200 */ 1201 done_first_nonreloc = 0; 1202 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1203 pfn_t sbase; 1204 pgcnt_t snpgs; 1205 1206 mqp->phys_pages += mdsp->mds_npgs; 1207 sbase = mdsp->mds_base; 1208 snpgs = mdsp->mds_npgs; 1209 while (snpgs != 0) { 1210 struct memseg *lseg, *seg; 1211 pfn_t p_end; 1212 page_t *pp; 1213 pfn_t mseg_start; 1214 1215 p_end = sbase + snpgs; 1216 /* 1217 * Find the lowest addressed memseg that starts 1218 * after sbase and account for it. 1219 * This is to catch dynamic memsegs whose start 1220 * is hidden. 
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				if (!memseg_is_dynamic(seg, &mseg_start)) {
					mseg_start = seg->pages_base;
				}
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
1307 */ 1308 for (pp = seg->pages + 1309 (sbase - seg->pages_base); 1310 snpgs != 0 && pp < seg->epages; pp++) { 1311 mqp->managed++; 1312 if (PP_ISNORELOC(pp)) { 1313 mqp->nonrelocatable++; 1314 if (!done_first_nonreloc) { 1315 mqp-> 1316 first_nonrelocatable 1317 = sbase; 1318 done_first_nonreloc = 1; 1319 } 1320 mqp->last_nonrelocatable = 1321 sbase; 1322 } 1323 sbase++; 1324 snpgs--; 1325 } 1326 } 1327 } 1328 } 1329 1330 free_delspans(mdsp_new); 1331 1332 return (KPHYSM_OK); 1333 } 1334 1335 /* 1336 * This release function can be called at any stage as follows: 1337 * _gethandle only called 1338 * _span(s) only called 1339 * _start called but failed 1340 * delete thread exited 1341 */ 1342 int 1343 kphysm_del_release(memhandle_t handle) 1344 { 1345 struct mem_handle *mhp; 1346 1347 mhp = kphysm_lookup_mem_handle(handle); 1348 if (mhp == NULL) { 1349 return (KPHYSM_EHANDLE); 1350 } 1351 switch (mhp->mh_state) { 1352 case MHND_STARTING: 1353 case MHND_RUNNING: 1354 mutex_exit(&mhp->mh_mutex); 1355 return (KPHYSM_ENOTFINISHED); 1356 case MHND_FREE: 1357 ASSERT(mhp->mh_state != MHND_FREE); 1358 mutex_exit(&mhp->mh_mutex); 1359 return (KPHYSM_EHANDLE); 1360 case MHND_INIT: 1361 break; 1362 case MHND_DONE: 1363 break; 1364 case MHND_RELEASE: 1365 mutex_exit(&mhp->mh_mutex); 1366 return (KPHYSM_ESEQUENCE); 1367 default: 1368 #ifdef DEBUG 1369 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1370 (void *)mhp, mhp->mh_state); 1371 #endif /* DEBUG */ 1372 mutex_exit(&mhp->mh_mutex); 1373 return (KPHYSM_EHANDLE); 1374 } 1375 /* 1376 * Set state so that we can wait if necessary. 1377 * Also this means that we have read/write access to all 1378 * fields except mh_exthandle and mh_state. 1379 */ 1380 mhp->mh_state = MHND_RELEASE; 1381 /* 1382 * The mem_handle cannot be de-allocated by any other operation 1383 * now, so no need to hold mh_mutex. 1384 */ 1385 mutex_exit(&mhp->mh_mutex); 1386 1387 delspan_remove(&mhp->mh_transit, 0, 0); 1388 mhp->mh_phys_pages = 0; 1389 mhp->mh_vm_pages = 0; 1390 mhp->mh_hold_todo = 0; 1391 mhp->mh_delete_complete = NULL; 1392 mhp->mh_delete_complete_arg = NULL; 1393 mhp->mh_cancel = 0; 1394 1395 mutex_enter(&mhp->mh_mutex); 1396 ASSERT(mhp->mh_state == MHND_RELEASE); 1397 mhp->mh_state = MHND_FREE; 1398 1399 kphysm_free_mem_handle(mhp); 1400 1401 return (KPHYSM_OK); 1402 } 1403 1404 /* 1405 * This cancel function can only be called with the thread running. 1406 */ 1407 int 1408 kphysm_del_cancel(memhandle_t handle) 1409 { 1410 struct mem_handle *mhp; 1411 1412 mhp = kphysm_lookup_mem_handle(handle); 1413 if (mhp == NULL) { 1414 return (KPHYSM_EHANDLE); 1415 } 1416 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1417 mutex_exit(&mhp->mh_mutex); 1418 return (KPHYSM_ENOTRUNNING); 1419 } 1420 /* 1421 * Set the cancel flag and wake the delete thread up. 1422 * The thread may be waiting on I/O, so the effect of the cancel 1423 * may be delayed. 1424 */ 1425 if (mhp->mh_cancel == 0) { 1426 mhp->mh_cancel = KPHYSM_ECANCELLED; 1427 cv_signal(&mhp->mh_cv); 1428 } 1429 mutex_exit(&mhp->mh_mutex); 1430 return (KPHYSM_OK); 1431 } 1432 1433 int 1434 kphysm_del_status( 1435 memhandle_t handle, 1436 memdelstat_t *mdstp) 1437 { 1438 struct mem_handle *mhp; 1439 1440 mhp = kphysm_lookup_mem_handle(handle); 1441 if (mhp == NULL) { 1442 return (KPHYSM_EHANDLE); 1443 } 1444 /* 1445 * Calling kphysm_del_status() is allowed before the delete 1446 * is started to allow for status display. 
1447 */ 1448 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1449 mhp->mh_state != MHND_RUNNING) { 1450 mutex_exit(&mhp->mh_mutex); 1451 return (KPHYSM_ENOTRUNNING); 1452 } 1453 mdstp->phys_pages = mhp->mh_phys_pages; 1454 mdstp->managed = mhp->mh_vm_pages; 1455 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1456 mutex_exit(&mhp->mh_mutex); 1457 return (KPHYSM_OK); 1458 } 1459 1460 static int mem_delete_additional_pages = 100; 1461 1462 static int 1463 can_remove_pgs(pgcnt_t npgs) 1464 { 1465 /* 1466 * If all pageable pages were paged out, freemem would 1467 * equal availrmem. There is a minimum requirement for 1468 * availrmem. 1469 */ 1470 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1471 < npgs) 1472 return (0); 1473 /* TODO: check swap space, etc. */ 1474 return (1); 1475 } 1476 1477 static int 1478 get_availrmem(pgcnt_t npgs) 1479 { 1480 int ret; 1481 1482 mutex_enter(&freemem_lock); 1483 ret = can_remove_pgs(npgs); 1484 if (ret != 0) 1485 availrmem -= npgs; 1486 mutex_exit(&freemem_lock); 1487 return (ret); 1488 } 1489 1490 static void 1491 put_availrmem(pgcnt_t npgs) 1492 { 1493 mutex_enter(&freemem_lock); 1494 availrmem += npgs; 1495 mutex_exit(&freemem_lock); 1496 } 1497 1498 #define FREEMEM_INCR 100 1499 static pgcnt_t freemem_incr = FREEMEM_INCR; 1500 #define DEL_FREE_WAIT_FRAC 4 1501 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1502 1503 #define DEL_BUSY_WAIT_FRAC 20 1504 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1505 1506 static void kphysm_del_cleanup(struct mem_handle *); 1507 1508 static void page_delete_collect(page_t *, struct mem_handle *); 1509 1510 static pgcnt_t 1511 delthr_get_freemem(struct mem_handle *mhp) 1512 { 1513 pgcnt_t free_get; 1514 int ret; 1515 1516 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1517 1518 MDSTAT_INCR(mhp, need_free); 1519 /* 1520 * Get up to freemem_incr pages. 1521 */ 1522 free_get = freemem_incr; 1523 if (free_get > mhp->mh_hold_todo) 1524 free_get = mhp->mh_hold_todo; 1525 /* 1526 * Take free_get pages away from freemem, 1527 * waiting if necessary. 1528 */ 1529 1530 while (!mhp->mh_cancel) { 1531 mutex_exit(&mhp->mh_mutex); 1532 MDSTAT_INCR(mhp, free_loop); 1533 /* 1534 * Duplicate test from page_create_throttle() 1535 * but don't override with !PG_WAIT. 1536 */ 1537 if (freemem < (free_get + throttlefree)) { 1538 MDSTAT_INCR(mhp, free_low); 1539 ret = 0; 1540 } else { 1541 ret = page_create_wait(free_get, 0); 1542 if (ret == 0) { 1543 /* EMPTY */ 1544 MDSTAT_INCR(mhp, free_failed); 1545 } 1546 } 1547 if (ret != 0) { 1548 mutex_enter(&mhp->mh_mutex); 1549 return (free_get); 1550 } 1551 1552 /* 1553 * Put pressure on pageout. 1554 */ 1555 page_needfree(free_get); 1556 cv_signal(&proc_pageout->p_cv); 1557 1558 mutex_enter(&mhp->mh_mutex); 1559 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1560 (lbolt + DEL_FREE_WAIT_TICKS)); 1561 mutex_exit(&mhp->mh_mutex); 1562 page_needfree(-(spgcnt_t)free_get); 1563 1564 mutex_enter(&mhp->mh_mutex); 1565 } 1566 return (0); 1567 } 1568 1569 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1570 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1571 /* 1572 * This function is run as a helper thread for delete_memory_thread. 1573 * It is needed in order to force kaio cleanup, so that pages used in kaio 1574 * will be unlocked and subsequently relocated by delete_memory_thread. 
1575 * The address of the delete_memory_threads's mem_handle is passed in to 1576 * this thread function, and is used to set the mh_aio_cleanup_done member 1577 * prior to calling thread_exit(). 1578 */ 1579 static void 1580 dr_aio_cleanup_thread(caddr_t amhp) 1581 { 1582 proc_t *procp; 1583 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1584 int cleaned; 1585 int n = 0; 1586 struct mem_handle *mhp; 1587 volatile uint_t *pcancel; 1588 1589 mhp = (struct mem_handle *)amhp; 1590 ASSERT(mhp != NULL); 1591 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1592 if (modload("sys", "kaio") == -1) { 1593 mhp->mh_aio_cleanup_done = 1; 1594 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1595 thread_exit(); 1596 } 1597 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1598 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1599 if (aio_cleanup_dr_delete_memory == NULL) { 1600 mhp->mh_aio_cleanup_done = 1; 1601 cmn_err(CE_WARN, 1602 "aio_cleanup_dr_delete_memory not found in kaio"); 1603 thread_exit(); 1604 } 1605 do { 1606 cleaned = 0; 1607 mutex_enter(&pidlock); 1608 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1609 procp = procp->p_next) { 1610 mutex_enter(&procp->p_lock); 1611 if (procp->p_aio != NULL) { 1612 /* cleanup proc's outstanding kaio */ 1613 cleaned += 1614 (*aio_cleanup_dr_delete_memory)(procp); 1615 } 1616 mutex_exit(&procp->p_lock); 1617 } 1618 mutex_exit(&pidlock); 1619 if ((*pcancel == 0) && 1620 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1621 /* delay a bit before retrying all procs again */ 1622 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1623 n = 0; 1624 } 1625 } while (*pcancel == 0); 1626 mhp->mh_aio_cleanup_done = 1; 1627 thread_exit(); 1628 } 1629 1630 static void 1631 delete_memory_thread(caddr_t amhp) 1632 { 1633 struct mem_handle *mhp; 1634 struct memdelspan *mdsp; 1635 callb_cpr_t cprinfo; 1636 page_t *pp_targ; 1637 spgcnt_t freemem_left; 1638 void (*del_complete_funcp)(void *, int error); 1639 void *del_complete_arg; 1640 int comp_code; 1641 int ret; 1642 int first_scan; 1643 uint_t szc; 1644 #ifdef MEM_DEL_STATS 1645 uint64_t start_total, ntick_total; 1646 uint64_t start_pgrp, ntick_pgrp; 1647 #endif /* MEM_DEL_STATS */ 1648 1649 mhp = (struct mem_handle *)amhp; 1650 1651 #ifdef MEM_DEL_STATS 1652 start_total = ddi_get_lbolt(); 1653 #endif /* MEM_DEL_STATS */ 1654 1655 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1656 callb_generic_cpr, "memdel"); 1657 1658 mutex_enter(&mhp->mh_mutex); 1659 ASSERT(mhp->mh_state == MHND_STARTING); 1660 1661 mhp->mh_state = MHND_RUNNING; 1662 mhp->mh_thread_id = curthread; 1663 1664 mhp->mh_hold_todo = mhp->mh_vm_pages; 1665 mutex_exit(&mhp->mh_mutex); 1666 1667 /* Allocate the remap pages now, if necessary. */ 1668 memseg_remap_init(); 1669 1670 /* 1671 * Subtract from availrmem now if possible as availrmem 1672 * may not be available by the end of the delete. 
1673 */ 1674 if (!get_availrmem(mhp->mh_vm_pages)) { 1675 comp_code = KPHYSM_ENOTVIABLE; 1676 mutex_enter(&mhp->mh_mutex); 1677 goto early_exit; 1678 } 1679 1680 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1681 1682 mutex_enter(&mhp->mh_mutex); 1683 1684 if (ret != 0) { 1685 mhp->mh_cancel = KPHYSM_EREFUSED; 1686 goto refused; 1687 } 1688 1689 transit_list_collect(mhp, 1); 1690 1691 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1692 mdsp = mdsp->mds_next) { 1693 ASSERT(mdsp->mds_bitmap == NULL); 1694 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1695 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1696 KM_SLEEP); 1697 } 1698 1699 first_scan = 1; 1700 freemem_left = 0; 1701 /* 1702 * Start dr_aio_cleanup_thread, which periodically iterates 1703 * through the process list and invokes aio cleanup. This 1704 * is needed in order to avoid a deadly embrace between the 1705 * delete_memory_thread (waiting on writer lock for page, with the 1706 * exclusive-wanted bit set), kaio read request threads (waiting for a 1707 * reader lock on the same page that is wanted by the 1708 * delete_memory_thread), and threads waiting for kaio completion 1709 * (blocked on spt_amp->lock). 1710 */ 1711 mhp->mh_dr_aio_cleanup_cancel = 0; 1712 mhp->mh_aio_cleanup_done = 0; 1713 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1714 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1715 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1716 pgcnt_t collected; 1717 1718 MDSTAT_INCR(mhp, nloop); 1719 collected = 0; 1720 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1721 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1722 pfn_t pfn, p_end; 1723 1724 if (first_scan) { 1725 mem_node_pre_del_slice(mdsp->mds_base, 1726 mdsp->mds_base + mdsp->mds_npgs - 1); 1727 } 1728 1729 p_end = mdsp->mds_base + mdsp->mds_npgs; 1730 for (pfn = mdsp->mds_base; (pfn < p_end) && 1731 (mhp->mh_cancel == 0); pfn++) { 1732 page_t *pp, *tpp, *tpp_targ; 1733 pgcnt_t bit; 1734 struct vnode *vp; 1735 u_offset_t offset; 1736 int mod, result; 1737 spgcnt_t pgcnt; 1738 1739 bit = pfn - mdsp->mds_base; 1740 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1741 (1 << (bit % NBPBMW))) != 0) { 1742 MDSTAT_INCR(mhp, already_done); 1743 continue; 1744 } 1745 if (freemem_left == 0) { 1746 freemem_left += delthr_get_freemem(mhp); 1747 if (freemem_left == 0) 1748 break; 1749 } 1750 1751 /* 1752 * Release mh_mutex - some of this 1753 * stuff takes some time (eg PUTPAGE). 1754 */ 1755 1756 mutex_exit(&mhp->mh_mutex); 1757 MDSTAT_INCR(mhp, ncheck); 1758 1759 pp = page_numtopp_nolock(pfn); 1760 if (pp == NULL) { 1761 /* 1762 * Not covered by a page_t - will 1763 * be dealt with elsewhere. 1764 */ 1765 MDSTAT_INCR(mhp, nopaget); 1766 mutex_enter(&mhp->mh_mutex); 1767 mdsp->mds_bitmap[bit / NBPBMW] |= 1768 (1 << (bit % NBPBMW)); 1769 continue; 1770 } 1771 1772 if (!page_try_reclaim_lock(pp, SE_EXCL, 1773 SE_EXCL_WANTED)) { 1774 if (page_isretired(pp)) { 1775 /* 1776 * Page has been retired. 1777 * 1778 * Its shared lock can and 1779 * must be upgraded to an 1780 * exclusive lock in order 1781 * to hashout the page when 1782 * the delete completes. 1783 */ 1784 page_lock_clr_exclwanted(pp); 1785 if (!page_tryupgrade(pp)) { 1786 mutex_enter( 1787 &mhp->mh_mutex); 1788 continue; 1789 } 1790 } else { 1791 /* 1792 * Page in use elsewhere. 1793 */ 1794 MDSTAT_INCR(mhp, lockfail); 1795 mutex_enter(&mhp->mh_mutex); 1796 continue; 1797 } 1798 } 1799 /* 1800 * See if the cage expanded into the delete. 
1801 * This can happen as we have to allow the 1802 * cage to expand. 1803 */ 1804 if (PP_ISNORELOC(pp)) { 1805 if (page_isretired(pp)) 1806 page_downgrade(pp); 1807 else 1808 page_unlock(pp); 1809 mutex_enter(&mhp->mh_mutex); 1810 mhp->mh_cancel = KPHYSM_ENONRELOC; 1811 break; 1812 } 1813 if (page_isretired(pp)) { 1814 /* 1815 * Page has been retired and is 1816 * not part of the cage so we 1817 * can now do the accounting for 1818 * it. 1819 */ 1820 MDSTAT_INCR(mhp, retired); 1821 mutex_enter(&mhp->mh_mutex); 1822 mdsp->mds_bitmap[bit / NBPBMW] 1823 |= (1 << (bit % NBPBMW)); 1824 mdsp->mds_bitmap_retired[bit / 1825 NBPBMW] |= 1826 (1 << (bit % NBPBMW)); 1827 mhp->mh_hold_todo--; 1828 continue; 1829 } 1830 ASSERT(freemem_left != 0); 1831 if (PP_ISFREE(pp)) { 1832 /* 1833 * Like page_reclaim() only 'freemem' 1834 * processing is already done. 1835 */ 1836 MDSTAT_INCR(mhp, nfree); 1837 free_page_collect: 1838 if (PP_ISAGED(pp)) { 1839 page_list_sub(pp, 1840 PG_FREE_LIST); 1841 } else { 1842 page_list_sub(pp, 1843 PG_CACHE_LIST); 1844 } 1845 PP_CLRFREE(pp); 1846 PP_CLRAGED(pp); 1847 collected++; 1848 mutex_enter(&mhp->mh_mutex); 1849 page_delete_collect(pp, mhp); 1850 mdsp->mds_bitmap[bit / NBPBMW] |= 1851 (1 << (bit % NBPBMW)); 1852 freemem_left--; 1853 continue; 1854 } 1855 ASSERT(pp->p_vnode != NULL); 1856 if (first_scan) { 1857 MDSTAT_INCR(mhp, first_notfree); 1858 page_unlock(pp); 1859 mutex_enter(&mhp->mh_mutex); 1860 continue; 1861 } 1862 /* 1863 * Keep stats on pages encountered that 1864 * are toxic or failing but not retired. 1865 */ 1866 if (page_istoxic(pp)) { 1867 MDSTAT_INCR(mhp, toxic); 1868 } else if (page_isfailing(pp)) { 1869 MDSTAT_INCR(mhp, failing); 1870 } 1871 /* 1872 * In certain cases below, special exceptions 1873 * are made for pages that are toxic. This 1874 * is because the current meaning of toxic 1875 * is that an uncorrectable error has been 1876 * previously associated with the page. 1877 */ 1878 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1879 if (!page_istoxic(pp)) { 1880 /* 1881 * Must relocate locked in 1882 * memory pages. 1883 */ 1884 #ifdef MEM_DEL_STATS 1885 start_pgrp = ddi_get_lbolt(); 1886 #endif /* MEM_DEL_STATS */ 1887 /* 1888 * Lock all constituent pages 1889 * of a large page to ensure 1890 * that p_szc won't change. 1891 */ 1892 if (!group_page_trylock(pp, 1893 SE_EXCL)) { 1894 MDSTAT_INCR(mhp, 1895 gptllckfail); 1896 page_unlock(pp); 1897 mutex_enter( 1898 &mhp->mh_mutex); 1899 continue; 1900 } 1901 MDSTAT_INCR(mhp, npplocked); 1902 pp_targ = 1903 page_get_replacement_page( 1904 pp, NULL, 0); 1905 if (pp_targ != NULL) { 1906 #ifdef MEM_DEL_STATS 1907 ntick_pgrp = 1908 (uint64_t) 1909 ddi_get_lbolt() - 1910 start_pgrp; 1911 #endif /* MEM_DEL_STATS */ 1912 MDSTAT_PGRP(mhp, 1913 ntick_pgrp); 1914 MDSTAT_INCR(mhp, 1915 nlockreloc); 1916 goto reloc; 1917 } 1918 group_page_unlock(pp); 1919 page_unlock(pp); 1920 #ifdef MEM_DEL_STATS 1921 ntick_pgrp = 1922 (uint64_t)ddi_get_lbolt() - 1923 start_pgrp; 1924 #endif /* MEM_DEL_STATS */ 1925 MDSTAT_PGRP(mhp, ntick_pgrp); 1926 MDSTAT_INCR(mhp, nnorepl); 1927 mutex_enter(&mhp->mh_mutex); 1928 continue; 1929 } else { 1930 /* 1931 * Cannot do anything about 1932 * this page because it is 1933 * toxic. 1934 */ 1935 MDSTAT_INCR(mhp, npplkdtoxic); 1936 page_unlock(pp); 1937 mutex_enter(&mhp->mh_mutex); 1938 continue; 1939 } 1940 } 1941 /* 1942 * Unload the mappings and check if mod bit 1943 * is set. 
1944 */ 1945 ASSERT(pp->p_vnode != &kvp); 1946 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1947 mod = hat_ismod(pp); 1948 1949 #ifdef MEM_DEL_STATS 1950 start_pgrp = ddi_get_lbolt(); 1951 #endif /* MEM_DEL_STATS */ 1952 if (mod && !page_istoxic(pp)) { 1953 /* 1954 * Lock all constituent pages 1955 * of a large page to ensure 1956 * that p_szc won't change. 1957 */ 1958 if (!group_page_trylock(pp, SE_EXCL)) { 1959 MDSTAT_INCR(mhp, gptlmodfail); 1960 page_unlock(pp); 1961 mutex_enter(&mhp->mh_mutex); 1962 continue; 1963 } 1964 pp_targ = page_get_replacement_page(pp, 1965 NULL, 0); 1966 if (pp_targ != NULL) { 1967 MDSTAT_INCR(mhp, nmodreloc); 1968 #ifdef MEM_DEL_STATS 1969 ntick_pgrp = 1970 (uint64_t)ddi_get_lbolt() - 1971 start_pgrp; 1972 #endif /* MEM_DEL_STATS */ 1973 MDSTAT_PGRP(mhp, ntick_pgrp); 1974 goto reloc; 1975 } 1976 group_page_unlock(pp); 1977 } 1978 1979 if (!page_try_demote_pages(pp)) { 1980 MDSTAT_INCR(mhp, demotefail); 1981 page_unlock(pp); 1982 #ifdef MEM_DEL_STATS 1983 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1984 start_pgrp; 1985 #endif /* MEM_DEL_STATS */ 1986 MDSTAT_PGRP(mhp, ntick_pgrp); 1987 mutex_enter(&mhp->mh_mutex); 1988 continue; 1989 } 1990 1991 /* 1992 * Regular 'page-out'. 1993 */ 1994 if (!mod) { 1995 MDSTAT_INCR(mhp, ndestroy); 1996 page_destroy(pp, 1); 1997 /* 1998 * page_destroy was called with 1999 * dontfree. As long as p_lckcnt 2000 * and p_cowcnt are both zero, the 2001 * only additional action of 2002 * page_destroy with !dontfree is to 2003 * call page_free, so we can collect 2004 * the page here. 2005 */ 2006 collected++; 2007 #ifdef MEM_DEL_STATS 2008 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2009 start_pgrp; 2010 #endif /* MEM_DEL_STATS */ 2011 MDSTAT_PGRP(mhp, ntick_pgrp); 2012 mutex_enter(&mhp->mh_mutex); 2013 page_delete_collect(pp, mhp); 2014 mdsp->mds_bitmap[bit / NBPBMW] |= 2015 (1 << (bit % NBPBMW)); 2016 continue; 2017 } 2018 /* 2019 * The page is toxic and the mod bit is 2020 * set, we cannot do anything here to deal 2021 * with it. 2022 */ 2023 if (page_istoxic(pp)) { 2024 page_unlock(pp); 2025 #ifdef MEM_DEL_STATS 2026 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2027 start_pgrp; 2028 #endif /* MEM_DEL_STATS */ 2029 MDSTAT_PGRP(mhp, ntick_pgrp); 2030 MDSTAT_INCR(mhp, modtoxic); 2031 mutex_enter(&mhp->mh_mutex); 2032 continue; 2033 } 2034 MDSTAT_INCR(mhp, nputpage); 2035 vp = pp->p_vnode; 2036 offset = pp->p_offset; 2037 VN_HOLD(vp); 2038 page_unlock(pp); 2039 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2040 B_INVAL|B_FORCE, kcred); 2041 VN_RELE(vp); 2042 #ifdef MEM_DEL_STATS 2043 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2044 start_pgrp; 2045 #endif /* MEM_DEL_STATS */ 2046 MDSTAT_PGRP(mhp, ntick_pgrp); 2047 /* 2048 * Try to get the page back immediately 2049 * so that it can be collected. 2050 */ 2051 pp = page_numtopp_nolock(pfn); 2052 if (pp == NULL) { 2053 MDSTAT_INCR(mhp, nnoreclaim); 2054 /* 2055 * This should not happen as this 2056 * thread is deleting the page. 2057 * If this code is generalized, this 2058 * becomes a reality. 
2059 */ 2060 #ifdef DEBUG 2061 cmn_err(CE_WARN, 2062 "delete_memory_thread(0x%p) " 2063 "pfn 0x%lx has no page_t", 2064 (void *)mhp, pfn); 2065 #endif /* DEBUG */ 2066 mutex_enter(&mhp->mh_mutex); 2067 continue; 2068 } 2069 if (page_try_reclaim_lock(pp, SE_EXCL, 2070 SE_EXCL_WANTED)) { 2071 if (PP_ISFREE(pp)) { 2072 goto free_page_collect; 2073 } 2074 page_unlock(pp); 2075 } 2076 MDSTAT_INCR(mhp, nnoreclaim); 2077 mutex_enter(&mhp->mh_mutex); 2078 continue; 2079 2080 reloc: 2081 /* 2082 * Got some freemem and a target 2083 * page, so move the data to avoid 2084 * I/O and lock problems. 2085 */ 2086 ASSERT(!page_iolock_assert(pp)); 2087 MDSTAT_INCR(mhp, nreloc); 2088 /* 2089 * page_relocate() will return pgcnt: the 2090 * number of consecutive pages relocated. 2091 * If it is successful, pp will be a 2092 * linked list of the page structs that 2093 * were relocated. If page_relocate() is 2094 * unsuccessful, pp will be unmodified. 2095 */ 2096 #ifdef MEM_DEL_STATS 2097 start_pgrp = ddi_get_lbolt(); 2098 #endif /* MEM_DEL_STATS */ 2099 result = page_relocate(&pp, &pp_targ, 0, 0, 2100 &pgcnt, NULL); 2101 #ifdef MEM_DEL_STATS 2102 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2103 start_pgrp; 2104 #endif /* MEM_DEL_STATS */ 2105 MDSTAT_PGRP(mhp, ntick_pgrp); 2106 if (result != 0) { 2107 MDSTAT_INCR(mhp, nrelocfail); 2108 /* 2109 * We did not succeed. We need 2110 * to give the pp_targ pages back. 2111 * page_free(pp_targ, 1) without 2112 * the freemem accounting. 2113 */ 2114 group_page_unlock(pp); 2115 page_free_replacement_page(pp_targ); 2116 page_unlock(pp); 2117 mutex_enter(&mhp->mh_mutex); 2118 continue; 2119 } 2120 2121 /* 2122 * We will then collect pgcnt pages. 2123 */ 2124 ASSERT(pgcnt > 0); 2125 mutex_enter(&mhp->mh_mutex); 2126 /* 2127 * We need to make sure freemem_left is 2128 * large enough. 2129 */ 2130 while ((freemem_left < pgcnt) && 2131 (!mhp->mh_cancel)) { 2132 freemem_left += 2133 delthr_get_freemem(mhp); 2134 } 2135 2136 /* 2137 * Do not proceed if mh_cancel is set. 2138 */ 2139 if (mhp->mh_cancel) { 2140 while (pp_targ != NULL) { 2141 /* 2142 * Unlink and unlock each page. 2143 */ 2144 tpp_targ = pp_targ; 2145 page_sub(&pp_targ, tpp_targ); 2146 page_unlock(tpp_targ); 2147 } 2148 /* 2149 * We need to give the pp pages back. 2150 * page_free(pp, 1) without the 2151 * freemem accounting. 2152 */ 2153 page_free_replacement_page(pp); 2154 break; 2155 } 2156 2157 /* Now remove pgcnt from freemem_left */ 2158 freemem_left -= pgcnt; 2159 ASSERT(freemem_left >= 0); 2160 szc = pp->p_szc; 2161 while (pp != NULL) { 2162 /* 2163 * pp and pp_targ were passed back as 2164 * a linked list of pages. 2165 * Unlink and unlock each page. 2166 */ 2167 tpp_targ = pp_targ; 2168 page_sub(&pp_targ, tpp_targ); 2169 page_unlock(tpp_targ); 2170 /* 2171 * The original page is now free 2172 * so remove it from the linked 2173 * list and collect it. 2174 */ 2175 tpp = pp; 2176 page_sub(&pp, tpp); 2177 pfn = page_pptonum(tpp); 2178 collected++; 2179 ASSERT(PAGE_EXCL(tpp)); 2180 ASSERT(tpp->p_vnode == NULL); 2181 ASSERT(!hat_page_is_mapped(tpp)); 2182 ASSERT(tpp->p_szc == szc); 2183 tpp->p_szc = 0; 2184 page_delete_collect(tpp, mhp); 2185 bit = pfn - mdsp->mds_base; 2186 mdsp->mds_bitmap[bit / NBPBMW] |= 2187 (1 << (bit % NBPBMW)); 2188 } 2189 ASSERT(pp_targ == NULL); 2190 } 2191 } 2192 first_scan = 0; 2193 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2194 (collected == 0)) { 2195 /* 2196 * This code is needed as we cannot wait 2197 * for a page to be locked OR the delete to 2198 * be cancelled. 
Also, we must delay so 2199 * that other threads get a chance to run 2200 * on our cpu, otherwise page locks may be 2201 * held indefinitely by those threads. 2202 */ 2203 MDSTAT_INCR(mhp, ndelay); 2204 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2205 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 2206 (lbolt + DEL_BUSY_WAIT_TICKS)); 2207 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2208 } 2209 } 2210 /* stop the dr aio cleanup thread */ 2211 mhp->mh_dr_aio_cleanup_cancel = 1; 2212 transit_list_collect(mhp, 0); 2213 if (freemem_left != 0) { 2214 /* Return any surplus. */ 2215 page_create_putback(freemem_left); 2216 freemem_left = 0; 2217 } 2218 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2219 mdsp = mdsp->mds_next) { 2220 mem_node_post_del_slice(mdsp->mds_base, 2221 mdsp->mds_base + mdsp->mds_npgs - 1, 2222 (mhp->mh_cancel != 0)); 2223 } 2224 #ifdef MEM_DEL_STATS 2225 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2226 #endif /* MEM_DEL_STATS */ 2227 MDSTAT_TOTAL(mhp, ntick_total); 2228 MDSTAT_PRINT(mhp); 2229 2230 /* 2231 * If the memory delete was cancelled, exclusive-wanted bits must 2232 * be cleared, and also any retired pages that 2233 * were accounted for above must have their exclusive lock 2234 * downgraded to a shared lock to return them to their previous 2235 * state. 2236 * Otherwise, if the memory delete has completed, retired pages 2237 * must be hashed out. 2238 */ 2239 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2240 mdsp = mdsp->mds_next) { 2241 pfn_t pfn, p_end; 2242 2243 p_end = mdsp->mds_base + mdsp->mds_npgs; 2244 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2245 page_t *pp; 2246 pgcnt_t bit; 2247 2248 bit = pfn - mdsp->mds_base; 2249 if (mhp->mh_cancel) { 2250 pp = page_numtopp_nolock(pfn); 2251 if (pp != NULL) { 2252 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2253 (1 << (bit % NBPBMW))) == 0) { 2254 page_lock_clr_exclwanted(pp); 2255 } 2256 } 2257 } else { 2258 pp = NULL; 2259 } 2260 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2261 (1 << (bit % NBPBMW))) != 0) { 2262 /* do we already have pp? */ 2263 if (pp == NULL) { 2264 pp = page_numtopp_nolock(pfn); 2265 } 2266 ASSERT(pp != NULL); 2267 ASSERT(page_isretired(pp)); 2268 if (mhp->mh_cancel != 0) { 2269 page_downgrade(pp); 2270 /* 2271 * To satisfy ASSERT below in 2272 * cancel code. 2273 */ 2274 mhp->mh_hold_todo++; 2275 } else { 2276 page_hashout(pp, (kmutex_t *)NULL); 2277 } 2278 } 2279 } 2280 } 2281 /* 2282 * Free retired page bitmap and collected page bitmap 2283 */ 2284 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2285 mdsp = mdsp->mds_next) { 2286 ASSERT(mdsp->mds_bitmap_retired != NULL); 2287 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2288 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2289 ASSERT(mdsp->mds_bitmap != NULL); 2290 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2291 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2292 } 2293 2294 /* wait for our dr aio cancel thread to exit */ 2295 while (!(mhp->mh_aio_cleanup_done)) { 2296 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2297 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2298 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2299 } 2300 refused: 2301 if (mhp->mh_cancel != 0) { 2302 page_t *pp; 2303 2304 comp_code = mhp->mh_cancel; 2305 /* 2306 * Go through list of deleted pages (mh_deleted) freeing 2307 * them. 2308 */ 2309 while ((pp = mhp->mh_deleted) != NULL) { 2310 mhp->mh_deleted = pp->p_next; 2311 mhp->mh_hold_todo++; 2312 mutex_exit(&mhp->mh_mutex); 2313 /* Restore p_next. 
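page_delete_collect() threaded the page onto mh_deleted through p_next but left p_prev alone, so p_prev still holds the page's original self/NULL link; copying it back undoes that threading before page_free().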
*/ 2314 pp->p_next = pp->p_prev; 2315 if (PP_ISFREE(pp)) { 2316 cmn_err(CE_PANIC, 2317 "page %p is free", 2318 (void *)pp); 2319 } 2320 page_free(pp, 1); 2321 mutex_enter(&mhp->mh_mutex); 2322 } 2323 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2324 2325 mutex_exit(&mhp->mh_mutex); 2326 put_availrmem(mhp->mh_vm_pages); 2327 mutex_enter(&mhp->mh_mutex); 2328 2329 goto t_exit; 2330 } 2331 2332 /* 2333 * All the pages are no longer in use and are exclusively locked. 2334 */ 2335 2336 mhp->mh_deleted = NULL; 2337 2338 kphysm_del_cleanup(mhp); 2339 2340 comp_code = KPHYSM_OK; 2341 2342 t_exit: 2343 mutex_exit(&mhp->mh_mutex); 2344 kphysm_setup_post_del(mhp->mh_vm_pages, 2345 (comp_code == KPHYSM_OK) ? 0 : 1); 2346 mutex_enter(&mhp->mh_mutex); 2347 2348 early_exit: 2349 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2350 mhp->mh_state = MHND_DONE; 2351 del_complete_funcp = mhp->mh_delete_complete; 2352 del_complete_arg = mhp->mh_delete_complete_arg; 2353 CALLB_CPR_EXIT(&cprinfo); 2354 (*del_complete_funcp)(del_complete_arg, comp_code); 2355 thread_exit(); 2356 /*NOTREACHED*/ 2357 } 2358 2359 /* 2360 * Start the delete of the memory from the system. 2361 */ 2362 int 2363 kphysm_del_start( 2364 memhandle_t handle, 2365 void (*complete)(void *, int), 2366 void *complete_arg) 2367 { 2368 struct mem_handle *mhp; 2369 2370 mhp = kphysm_lookup_mem_handle(handle); 2371 if (mhp == NULL) { 2372 return (KPHYSM_EHANDLE); 2373 } 2374 switch (mhp->mh_state) { 2375 case MHND_FREE: 2376 ASSERT(mhp->mh_state != MHND_FREE); 2377 mutex_exit(&mhp->mh_mutex); 2378 return (KPHYSM_EHANDLE); 2379 case MHND_INIT: 2380 break; 2381 case MHND_STARTING: 2382 case MHND_RUNNING: 2383 mutex_exit(&mhp->mh_mutex); 2384 return (KPHYSM_ESEQUENCE); 2385 case MHND_DONE: 2386 mutex_exit(&mhp->mh_mutex); 2387 return (KPHYSM_ESEQUENCE); 2388 case MHND_RELEASE: 2389 mutex_exit(&mhp->mh_mutex); 2390 return (KPHYSM_ESEQUENCE); 2391 default: 2392 #ifdef DEBUG 2393 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2394 (void *)mhp, mhp->mh_state); 2395 #endif /* DEBUG */ 2396 mutex_exit(&mhp->mh_mutex); 2397 return (KPHYSM_EHANDLE); 2398 } 2399 2400 if (mhp->mh_transit.trl_spans == NULL) { 2401 mutex_exit(&mhp->mh_mutex); 2402 return (KPHYSM_ENOWORK); 2403 } 2404 2405 ASSERT(complete != NULL); 2406 mhp->mh_delete_complete = complete; 2407 mhp->mh_delete_complete_arg = complete_arg; 2408 mhp->mh_state = MHND_STARTING; 2409 /* 2410 * Release the mutex in case thread_create sleeps. 2411 */ 2412 mutex_exit(&mhp->mh_mutex); 2413 2414 /* 2415 * The "obvious" process for this thread is pageout (proc_pageout) 2416 * but this gives the thread too much power over freemem 2417 * which results in freemem starvation. 2418 */ 2419 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2420 TS_RUN, maxclsyspri - 1); 2421 2422 return (KPHYSM_OK); 2423 } 2424 2425 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2426 static caddr_t pp_dummy; 2427 static pgcnt_t pp_dummy_npages; 2428 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. 
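One entry per pp_dummy page, captured with hat_getpfnum() at init time so that memseg_remap_to_dummy() can hat_devload() the dummy frames without further VA-to-PA lookups.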
*/ 2429 2430 static void 2431 memseg_remap_init_pages(page_t *pages, page_t *epages) 2432 { 2433 page_t *pp; 2434 2435 for (pp = pages; pp < epages; pp++) { 2436 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2437 pp->p_offset = (u_offset_t)-1; 2438 page_iolock_init(pp); 2439 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2440 continue; 2441 page_lock_delete(pp); 2442 } 2443 } 2444 2445 void 2446 memseg_remap_init() 2447 { 2448 mutex_enter(&pp_dummy_lock); 2449 if (pp_dummy == NULL) { 2450 uint_t dpages; 2451 int i; 2452 2453 /* 2454 * dpages starts off as the size of the structure and 2455 * ends up as the minimum number of pages that will 2456 * hold a whole number of page_t structures. 2457 */ 2458 dpages = sizeof (page_t); 2459 ASSERT(dpages != 0); 2460 ASSERT(dpages <= MMU_PAGESIZE); 2461 2462 while ((dpages & 1) == 0) 2463 dpages >>= 1; 2464 2465 pp_dummy_npages = dpages; 2466 /* 2467 * Allocate pp_dummy pages directly from static_arena, 2468 * since these are whole page allocations and are 2469 * referenced by physical address. This also has the 2470 * nice fringe benefit of hiding the memory from 2471 * ::findleaks since it doesn't deal well with allocated 2472 * kernel heap memory that doesn't have any mappings. 2473 */ 2474 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2475 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2476 bzero(pp_dummy, ptob(pp_dummy_npages)); 2477 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2478 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2479 pp_dummy_npages, KM_SLEEP); 2480 for (i = 0; i < pp_dummy_npages; i++) { 2481 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2482 &pp_dummy[MMU_PAGESIZE * i]); 2483 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2484 } 2485 /* 2486 * Initialize the page_t's to a known 'deleted' state 2487 * that matches the state of deleted pages. 2488 */ 2489 memseg_remap_init_pages((page_t *)pp_dummy, 2490 (page_t *)(pp_dummy + 2491 ptob(pp_dummy_npages))); 2492 /* Remove kmem mappings for the pages for safety. */ 2493 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2494 HAT_UNLOAD_UNLOCK); 2495 /* Leave pp_dummy pointer set as flag that init is done. */ 2496 } 2497 mutex_exit(&pp_dummy_lock); 2498 } 2499 2500 static void 2501 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs) 2502 { 2503 ASSERT(pp_dummy != NULL); 2504 2505 while (metapgs != 0) { 2506 pgcnt_t n; 2507 int i; 2508 2509 n = pp_dummy_npages; 2510 if (n > metapgs) 2511 n = metapgs; 2512 for (i = 0; i < n; i++) { 2513 hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i], 2514 PROT_READ, 2515 HAT_LOAD | HAT_LOAD_NOCONSIST | 2516 HAT_LOAD_REMAP); 2517 pp += ptob(1); 2518 } 2519 metapgs -= n; 2520 } 2521 } 2522 2523 /* 2524 * Transition all the deleted pages to the deleted state so that 2525 * page_lock will not wait. The page_lock_delete call will 2526 * also wake up any waiters. 2527 */ 2528 static void 2529 memseg_lock_delete_all(struct memseg *seg) 2530 { 2531 page_t *pp; 2532 2533 for (pp = seg->pages; pp < seg->epages; pp++) { 2534 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2535 page_lock_delete(pp); 2536 } 2537 } 2538 2539 static void 2540 kphysm_del_cleanup(struct mem_handle *mhp) 2541 { 2542 struct memdelspan *mdsp; 2543 struct memseg *seg; 2544 struct memseg **segpp; 2545 struct memseg *seglist; 2546 pfn_t p_end; 2547 uint64_t avmem; 2548 pgcnt_t avpgs; 2549 pgcnt_t npgs; 2550 2551 avpgs = mhp->mh_vm_pages; 2552 2553 memsegs_lock(1); 2554 2555 /* 2556 * remove from main segment list. 
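Every memseg that falls wholly within one of the delete spans is unhooked from the global memsegs chain and gathered, via lnext, onto a private seglist for the per-segment teardown below.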
2557 */ 2558 npgs = 0; 2559 seglist = NULL; 2560 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2561 mdsp = mdsp->mds_next) { 2562 p_end = mdsp->mds_base + mdsp->mds_npgs; 2563 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2564 if (seg->pages_base >= p_end || 2565 seg->pages_end <= mdsp->mds_base) { 2566 /* Span and memseg don't overlap. */ 2567 segpp = &((*segpp)->next); 2568 continue; 2569 } 2570 ASSERT(seg->pages_base >= mdsp->mds_base); 2571 ASSERT(seg->pages_end <= p_end); 2572 2573 /* Hide the memseg from future scans. */ 2574 hat_kpm_delmem_mseg_update(seg, segpp); 2575 *segpp = seg->next; 2576 membar_producer(); /* TODO: Needed? */ 2577 npgs += MSEG_NPAGES(seg); 2578 2579 /* 2580 * Leave the deleted segment's next pointer intact 2581 * in case a memsegs scanning loop is walking this 2582 * segment concurrently. 2583 */ 2584 seg->lnext = seglist; 2585 seglist = seg; 2586 } 2587 } 2588 2589 build_pfn_hash(); 2590 2591 ASSERT(npgs < total_pages); 2592 total_pages -= npgs; 2593 2594 /* 2595 * Recalculate the paging parameters now total_pages has changed. 2596 * This will also cause the clock hands to be reset before next use. 2597 */ 2598 setupclock(1); 2599 2600 memsegs_unlock(1); 2601 2602 mutex_exit(&mhp->mh_mutex); 2603 2604 while ((seg = seglist) != NULL) { 2605 pfn_t mseg_start; 2606 pfn_t mseg_base, mseg_end; 2607 pgcnt_t mseg_npgs; 2608 page_t *pp; 2609 pgcnt_t metapgs; 2610 int dynamic; 2611 int mlret; 2612 2613 seglist = seg->lnext; 2614 2615 /* 2616 * Put the page_t's into the deleted state to stop 2617 * cv_wait()s on the pages. When we remap, the dummy 2618 * page_t's will be in the same state. 2619 */ 2620 memseg_lock_delete_all(seg); 2621 /* 2622 * Collect up information based on pages_base and pages_end 2623 * early so that we can flag early that the memseg has been 2624 * deleted by setting pages_end == pages_base. 2625 */ 2626 mseg_base = seg->pages_base; 2627 mseg_end = seg->pages_end; 2628 mseg_npgs = MSEG_NPAGES(seg); 2629 dynamic = memseg_is_dynamic(seg, &mseg_start); 2630 2631 seg->pages_end = seg->pages_base; 2632 2633 if (dynamic) { 2634 pp = seg->pages; 2635 metapgs = mseg_base - mseg_start; 2636 ASSERT(metapgs != 0); 2637 2638 /* Remap the meta data to our special dummy area. */ 2639 memseg_remap_to_dummy((caddr_t)pp, metapgs); 2640 2641 mutex_enter(&memseg_lists_lock); 2642 seg->lnext = memseg_va_avail; 2643 memseg_va_avail = seg; 2644 mutex_exit(&memseg_lists_lock); 2645 } else { 2646 /* 2647 * Set for clean-up below. 2648 */ 2649 mseg_start = seg->pages_base; 2650 /* 2651 * For memory whose page_ts were allocated 2652 * at boot, we need to find a new use for 2653 * the page_t memory. 2654 * For the moment, just leak it. 2655 * (It is held in the memseg_delete_junk list.) 2656 */ 2657 2658 mutex_enter(&memseg_lists_lock); 2659 seg->lnext = memseg_delete_junk; 2660 memseg_delete_junk = seg; 2661 mutex_exit(&memseg_lists_lock); 2662 } 2663 2664 /* Must not use seg now as it could be re-used. 
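It has just been placed on memseg_va_avail or memseg_delete_junk, so only the values copied into the local mseg_* variables above may be referenced from here on.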
*/ 2665 2666 memlist_write_lock(); 2667 2668 mlret = memlist_delete_span( 2669 (uint64_t)(mseg_base) << PAGESHIFT, 2670 (uint64_t)(mseg_npgs) << PAGESHIFT, 2671 &phys_avail); 2672 ASSERT(mlret == MEML_SPANOP_OK); 2673 2674 mlret = memlist_delete_span( 2675 (uint64_t)(mseg_start) << PAGESHIFT, 2676 (uint64_t)(mseg_end - mseg_start) << 2677 PAGESHIFT, 2678 &phys_install); 2679 ASSERT(mlret == MEML_SPANOP_OK); 2680 phys_install_has_changed(); 2681 2682 memlist_write_unlock(); 2683 } 2684 2685 memlist_read_lock(); 2686 installed_top_size(phys_install, &physmax, &physinstalled); 2687 memlist_read_unlock(); 2688 2689 mutex_enter(&freemem_lock); 2690 maxmem -= avpgs; 2691 physmem -= avpgs; 2692 /* availrmem is adjusted during the delete. */ 2693 availrmem_initial -= avpgs; 2694 2695 mutex_exit(&freemem_lock); 2696 2697 dump_resize(); 2698 2699 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2700 "(0x%" PRIx64 ")\n", 2701 physinstalled << (PAGESHIFT - 10), 2702 (uint64_t)physinstalled << PAGESHIFT); 2703 2704 avmem = (uint64_t)freemem << PAGESHIFT; 2705 cmn_err(CE_CONT, "?kphysm_delete: " 2706 "avail mem = %" PRId64 "\n", avmem); 2707 2708 /* 2709 * Update lgroup generation number on single lgroup systems 2710 */ 2711 if (nlgrps == 1) 2712 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2713 2714 /* Successfully deleted system memory */ 2715 mutex_enter(&mhp->mh_mutex); 2716 } 2717 2718 static uint_t mdel_nullvp_waiter; 2719 2720 static void 2721 page_delete_collect( 2722 page_t *pp, 2723 struct mem_handle *mhp) 2724 { 2725 if (pp->p_vnode) { 2726 page_hashout(pp, (kmutex_t *)NULL); 2727 /* do not do PP_SETAGED(pp); */ 2728 } else { 2729 kmutex_t *sep; 2730 2731 sep = page_se_mutex(pp); 2732 mutex_enter(sep); 2733 if (CV_HAS_WAITERS(&pp->p_cv)) { 2734 mdel_nullvp_waiter++; 2735 cv_broadcast(&pp->p_cv); 2736 } 2737 mutex_exit(sep); 2738 } 2739 ASSERT(pp->p_next == pp->p_prev); 2740 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2741 pp->p_next = mhp->mh_deleted; 2742 mhp->mh_deleted = pp; 2743 ASSERT(mhp->mh_hold_todo != 0); 2744 mhp->mh_hold_todo--; 2745 } 2746 2747 static void 2748 transit_list_collect(struct mem_handle *mhp, int v) 2749 { 2750 struct transit_list_head *trh; 2751 2752 trh = &transit_list_head; 2753 mutex_enter(&trh->trh_lock); 2754 mhp->mh_transit.trl_collect = v; 2755 mutex_exit(&trh->trh_lock); 2756 } 2757 2758 static void 2759 transit_list_insert(struct transit_list *tlp) 2760 { 2761 struct transit_list_head *trh; 2762 2763 trh = &transit_list_head; 2764 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2765 tlp->trl_next = trh->trh_head; 2766 trh->trh_head = tlp; 2767 } 2768 2769 static void 2770 transit_list_remove(struct transit_list *tlp) 2771 { 2772 struct transit_list_head *trh; 2773 struct transit_list **tlpp; 2774 2775 trh = &transit_list_head; 2776 tlpp = &trh->trh_head; 2777 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2778 while (*tlpp != NULL && *tlpp != tlp) 2779 tlpp = &(*tlpp)->trl_next; 2780 ASSERT(*tlpp != NULL); 2781 if (*tlpp == tlp) 2782 *tlpp = tlp->trl_next; 2783 tlp->trl_next = NULL; 2784 } 2785 2786 static struct transit_list * 2787 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2788 { 2789 struct transit_list *tlp; 2790 2791 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2792 struct memdelspan *mdsp; 2793 2794 for (mdsp = tlp->trl_spans; mdsp != NULL; 2795 mdsp = mdsp->mds_next) { 2796 if (pfnum >= mdsp->mds_base && 2797 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2798 return (tlp); 2799 } 2800 } 2801 } 2802 return (NULL); 2803 } 2804 2805 int 
2806 pfn_is_being_deleted(pfn_t pfnum) 2807 { 2808 struct transit_list_head *trh; 2809 struct transit_list *tlp; 2810 int ret; 2811 2812 trh = &transit_list_head; 2813 if (trh->trh_head == NULL) 2814 return (0); 2815 2816 mutex_enter(&trh->trh_lock); 2817 tlp = pfnum_to_transit_list(trh, pfnum); 2818 ret = (tlp != NULL && tlp->trl_collect); 2819 mutex_exit(&trh->trh_lock); 2820 2821 return (ret); 2822 } 2823 2824 #ifdef MEM_DEL_STATS 2825 extern int hz; 2826 static void 2827 mem_del_stat_print_func(struct mem_handle *mhp) 2828 { 2829 uint64_t tmp; 2830 2831 if (mem_del_stat_print) { 2832 printf("memory delete loop %x/%x, statistics%s\n", 2833 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2834 (uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2835 (mhp->mh_cancel ? " (cancelled)" : "")); 2836 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2837 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2838 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2839 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2840 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2841 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2842 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2843 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2844 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2845 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2846 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2847 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2848 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2849 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2850 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2851 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2852 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2853 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2854 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2855 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2856 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2857 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2858 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2859 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2860 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2861 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2862 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2863 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2864 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2865 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2866 printf( 2867 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2868 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2869 2870 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2871 printf( 2872 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2873 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2874 } 2875 } 2876 #endif /* MEM_DEL_STATS */ 2877 2878 struct mem_callback { 2879 kphysm_setup_vector_t *vec; 2880 void *arg; 2881 }; 2882 2883 #define NMEMCALLBACKS 100 2884 2885 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2886 static uint_t nmemcallbacks; 2887 static krwlock_t mem_callback_rwlock; 2888 2889 int 2890 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2891 { 2892 uint_t i, found; 2893 2894 /* 2895 * This test will become more complicated when the version must 2896 * change. 
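Until then, KPHYSM_SETUP_VECTOR_VERSION is the only accepted value.
 *
 * For reference, a client registers a callback vector roughly as in
 * the sketch below; my_post_add, my_pre_del, my_post_del and my_arg
 * are hypothetical client-side names, not anything defined here:
 *
 *	static kphysm_setup_vector_t my_vec;
 *
 *	my_vec.version = KPHYSM_SETUP_VECTOR_VERSION;
 *	my_vec.post_add = my_post_add;
 *	my_vec.pre_del = my_pre_del;
 *	my_vec.post_del = my_post_del;
 *	if (kphysm_setup_func_register(&my_vec, my_arg) != 0)
 *		cmn_err(CE_WARN, "mem callback registration failed");
 *
 * The vector and argument must stay valid until the matching
 * kphysm_setup_func_unregister(&my_vec, my_arg) call.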
2897 */ 2898 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2899 return (EINVAL); 2900 2901 if (vec->post_add == NULL || vec->pre_del == NULL || 2902 vec->post_del == NULL) 2903 return (EINVAL); 2904 2905 rw_enter(&mem_callback_rwlock, RW_WRITER); 2906 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2907 if (mem_callbacks[i].vec == NULL && found == 0) 2908 found = i + 1; 2909 if (mem_callbacks[i].vec == vec && 2910 mem_callbacks[i].arg == arg) { 2911 #ifdef DEBUG 2912 /* Catch this in DEBUG kernels. */ 2913 cmn_err(CE_WARN, "kphysm_setup_func_register" 2914 "(0x%p, 0x%p) duplicate registration from 0x%p", 2915 (void *)vec, arg, (void *)caller()); 2916 #endif /* DEBUG */ 2917 rw_exit(&mem_callback_rwlock); 2918 return (EEXIST); 2919 } 2920 } 2921 if (found != 0) { 2922 i = found - 1; 2923 } else { 2924 ASSERT(nmemcallbacks < NMEMCALLBACKS); 2925 if (nmemcallbacks == NMEMCALLBACKS) { 2926 rw_exit(&mem_callback_rwlock); 2927 return (ENOMEM); 2928 } 2929 i = nmemcallbacks++; 2930 } 2931 mem_callbacks[i].vec = vec; 2932 mem_callbacks[i].arg = arg; 2933 rw_exit(&mem_callback_rwlock); 2934 return (0); 2935 } 2936 2937 void 2938 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 2939 { 2940 uint_t i; 2941 2942 rw_enter(&mem_callback_rwlock, RW_WRITER); 2943 for (i = 0; i < nmemcallbacks; i++) { 2944 if (mem_callbacks[i].vec == vec && 2945 mem_callbacks[i].arg == arg) { 2946 mem_callbacks[i].vec = NULL; 2947 mem_callbacks[i].arg = NULL; 2948 if (i == (nmemcallbacks - 1)) 2949 nmemcallbacks--; 2950 break; 2951 } 2952 } 2953 rw_exit(&mem_callback_rwlock); 2954 } 2955 2956 static void 2957 kphysm_setup_post_add(pgcnt_t delta_pages) 2958 { 2959 uint_t i; 2960 2961 rw_enter(&mem_callback_rwlock, RW_READER); 2962 for (i = 0; i < nmemcallbacks; i++) { 2963 if (mem_callbacks[i].vec != NULL) { 2964 (*mem_callbacks[i].vec->post_add) 2965 (mem_callbacks[i].arg, delta_pages); 2966 } 2967 } 2968 rw_exit(&mem_callback_rwlock); 2969 } 2970 2971 /* 2972 * Note the locking between pre_del and post_del: The reader lock is held 2973 * between the two calls to stop the set of functions from changing. 2974 */ 2975 2976 static int 2977 kphysm_setup_pre_del(pgcnt_t delta_pages) 2978 { 2979 uint_t i; 2980 int ret; 2981 int aret; 2982 2983 ret = 0; 2984 rw_enter(&mem_callback_rwlock, RW_READER); 2985 for (i = 0; i < nmemcallbacks; i++) { 2986 if (mem_callbacks[i].vec != NULL) { 2987 aret = (*mem_callbacks[i].vec->pre_del) 2988 (mem_callbacks[i].arg, delta_pages); 2989 ret |= aret; 2990 } 2991 } 2992 2993 return (ret); 2994 } 2995 2996 static void 2997 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 2998 { 2999 uint_t i; 3000 3001 for (i = 0; i < nmemcallbacks; i++) { 3002 if (mem_callbacks[i].vec != NULL) { 3003 (*mem_callbacks[i].vec->post_del) 3004 (mem_callbacks[i].arg, delta_pages, cancelled); 3005 } 3006 } 3007 rw_exit(&mem_callback_rwlock); 3008 } 3009 3010 static int 3011 kphysm_split_memseg( 3012 pfn_t base, 3013 pgcnt_t npgs) 3014 { 3015 struct memseg *seg; 3016 struct memseg **segpp; 3017 pgcnt_t size_low, size_high; 3018 struct memseg *seg_low, *seg_mid, *seg_high; 3019 3020 /* 3021 * Lock the memsegs list against other updates now 3022 */ 3023 memsegs_lock(1); 3024 3025 /* 3026 * Find boot time memseg that wholly covers this area. 3027 */ 3028 3029 /* First find the memseg with page 'base' in it. 
*/ 3030 for (segpp = &memsegs; (seg = *segpp) != NULL; 3031 segpp = &((*segpp)->next)) { 3032 if (base >= seg->pages_base && base < seg->pages_end) 3033 break; 3034 } 3035 if (seg == NULL) { 3036 memsegs_unlock(1); 3037 return (0); 3038 } 3039 if (memseg_is_dynamic(seg, (pfn_t *)NULL)) { 3040 memsegs_unlock(1); 3041 return (0); 3042 } 3043 if ((base + npgs) > seg->pages_end) { 3044 memsegs_unlock(1); 3045 return (0); 3046 } 3047 3048 /* 3049 * Work out the size of the two segments that will 3050 * surround the new segment, one for low address 3051 * and one for high. 3052 */ 3053 ASSERT(base >= seg->pages_base); 3054 size_low = base - seg->pages_base; 3055 ASSERT(seg->pages_end >= (base + npgs)); 3056 size_high = seg->pages_end - (base + npgs); 3057 3058 /* 3059 * Sanity check. 3060 */ 3061 if ((size_low + size_high) == 0) { 3062 memsegs_unlock(1); 3063 return (0); 3064 } 3065 3066 /* 3067 * Allocate the new structures. The old memseg will not be freed 3068 * as there may be a reference to it. 3069 */ 3070 seg_low = NULL; 3071 seg_high = NULL; 3072 3073 if (size_low != 0) { 3074 seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3075 bzero(seg_low, sizeof (struct memseg)); 3076 } 3077 3078 seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3079 bzero(seg_mid, sizeof (struct memseg)); 3080 3081 if (size_high != 0) { 3082 seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3083 bzero(seg_high, sizeof (struct memseg)); 3084 } 3085 3086 /* 3087 * All allocation done now. 3088 */ 3089 if (size_low != 0) { 3090 seg_low->pages = seg->pages; 3091 seg_low->epages = seg_low->pages + size_low; 3092 seg_low->pages_base = seg->pages_base; 3093 seg_low->pages_end = seg_low->pages_base + size_low; 3094 seg_low->next = seg_mid; 3095 } 3096 if (size_high != 0) { 3097 seg_high->pages = seg->epages - size_high; 3098 seg_high->epages = seg_high->pages + size_high; 3099 seg_high->pages_base = seg->pages_end - size_high; 3100 seg_high->pages_end = seg_high->pages_base + size_high; 3101 seg_high->next = seg->next; 3102 } 3103 3104 seg_mid->pages = seg->pages + size_low; 3105 seg_mid->pages_base = seg->pages_base + size_low; 3106 seg_mid->epages = seg->epages - size_high; 3107 seg_mid->pages_end = seg->pages_end - size_high; 3108 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3109 3110 /* 3111 * Update hat_kpm specific info of all involved memsegs and 3112 * allow hat_kpm specific global chain updates. 3113 */ 3114 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3115 3116 /* 3117 * At this point we have two equivalent memseg sub-chains, 3118 * seg and seg_low/seg_mid/seg_high, which both chain on to 3119 * the same place in the global chain. By re-writing the pointer 3120 * in the previous element we switch atomically from using the old 3121 * (seg) to the new. 3122 */ 3123 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3124 3125 membar_enter(); 3126 3127 build_pfn_hash(); 3128 memsegs_unlock(1); 3129 3130 /* 3131 * We leave the old segment, 'seg', intact as there may be 3132 * references to it. Also, as the value of total_pages has not 3133 * changed and the memsegs list is effectively the same when 3134 * accessed via the old or the new pointer, we do not have to 3135 * cause pageout_scanner() to re-evaluate its hand pointers. 3136 * 3137 * We currently do not re-use or reclaim the page_t memory. 3138 * If we do, then this may have to change. 
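For now the superseded memseg is simply parked on the memseg_edit_junk list below, so any stale reference to it still points at valid, if idle, memory.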
3139 */ 3140 3141 mutex_enter(&memseg_lists_lock); 3142 seg->lnext = memseg_edit_junk; 3143 memseg_edit_junk = seg; 3144 mutex_exit(&memseg_lists_lock); 3145 3146 return (1); 3147 } 3148 3149 /* 3150 * The memsegs lock is only taken when modifying the memsegs list 3151 * and rebuilding the pfn hash table (after boot). 3152 * No lock is needed for read as memseg structure are never de-allocated 3153 * and the pointer linkage is never updated until the memseg is ready. 3154 */ 3155 krwlock_t memsegslock; 3156 3157 void 3158 memsegs_lock(int writer) 3159 { 3160 rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER); 3161 } 3162 3163 /*ARGSUSED*/ 3164 void 3165 memsegs_unlock(int writer) 3166 { 3167 rw_exit(&memsegslock); 3168 } 3169 3170 /* 3171 * memlist (phys_install, phys_avail) locking. 3172 */ 3173 3174 /* 3175 * A read/write lock might be better here. 3176 */ 3177 static kmutex_t memlists_mutex; 3178 3179 void 3180 memlist_read_lock() 3181 { 3182 mutex_enter(&memlists_mutex); 3183 } 3184 3185 void 3186 memlist_read_unlock() 3187 { 3188 mutex_exit(&memlists_mutex); 3189 } 3190 3191 void 3192 memlist_write_lock() 3193 { 3194 mutex_enter(&memlists_mutex); 3195 } 3196 3197 void 3198 memlist_write_unlock() 3199 { 3200 mutex_exit(&memlists_mutex); 3201 } 3202 3203 /* 3204 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3205 * structure using physical addresses. Therefore a kmem_cache is 3206 * used with KMC_NOHASH to avoid page crossings within a memseg 3207 * structure. KMC_NOHASH requires that no external (outside of 3208 * slab) information is allowed. This, in turn, implies that the 3209 * cache's slabsize must be exactly a single page, since per-slab 3210 * information (e.g. the freelist for the slab) is kept at the 3211 * end of the slab, where it is easy to locate. Should be changed 3212 * when a more obvious kmem_cache interface/flag will become 3213 * available. 3214 */ 3215 void 3216 mem_config_init() 3217 { 3218 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3219 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3220 } 3221