/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>	/* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>	/* for installed_top_size() */
#include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>	/* for dump_resize() */
#include <sys/atomic.h>		/* for use in stats collection */
#include <sys/rwlock.h>
#include <sys/cpuvar.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>
#include <sys/modctl.h>

extern struct memlist *phys_avail;

extern void mem_node_add(pfn_t, pfn_t);
extern void mem_node_del(pfn_t, pfn_t);

extern uint_t page_ctrs_adjust(int);
static void kphysm_setup_post_add(pgcnt_t);
static int kphysm_setup_pre_del(pgcnt_t);
static void kphysm_setup_post_del(pgcnt_t, int);

static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs);

static int delspan_reserve(pfn_t, pgcnt_t);
static void delspan_unreserve(pfn_t, pgcnt_t);

static kmutex_t memseg_lists_lock;
static struct memseg *memseg_va_avail;
static struct memseg *memseg_delete_junk;
static struct memseg *memseg_edit_junk;
void memseg_remap_init(void);
static void memseg_remap_to_dummy(caddr_t, pgcnt_t);
static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t);
static struct memseg *memseg_reuse(pgcnt_t);

static struct kmem_cache *memseg_cache;

/*
 * Add a chunk of memory to the system.  page_t's for this memory
 * are allocated in the first few pages of the chunk.
 * base: starting PAGESIZE page of new memory.
 * npgs: length in PAGESIZE pages.
 *
 * Adding mem this way doesn't increase the size of the hash tables;
 * growing them would be too hard.  This should be OK, but adding memory
 * dynamically most likely means more hash misses, since the tables will
 * be smaller than they otherwise would be.
 */
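/*
 * Illustrative only (not taken from any caller in this file): a platform
 * DR driver that has just configured a memory board would typically call
 *
 *	ret = kphysm_add_memory_dynamic(btop(board_pa), btop(board_sz));
 *	if (ret != KPHYSM_OK)
 *		... back the board out and report the error ...
 *
 * with the pfn and page count derived from the physical address range
 * the platform reports for the board.
 */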
int
kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs)
{
	page_t *pp;
	page_t *opp, *oepp;
	struct memseg *seg;
	uint64_t avmem;
	pfn_t pfn;
	pfn_t pt_base = base;
	pgcnt_t tpgs = npgs;
	pgcnt_t metapgs;
	int exhausted;
	pfn_t pnum;
	int mnode;
	caddr_t vaddr;
	int reuse;
	int mlret;
	void *mapva;
	pgcnt_t nkpmpgs = 0;
	offset_t kpm_pages_off;

	cmn_err(CE_CONT,
	    "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n",
	    npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT);

	/*
	 * Add this span in the delete list to prevent interactions.
	 */
	if (!delspan_reserve(base, npgs)) {
		return (KPHYSM_ESPAN);
	}
	/*
	 * Check to see if any of the memory span has been added
	 * by trying an add to the installed memory list. This
	 * forms the interlocking process for add.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	if (mlret == MEML_SPANOP_OK)
		installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();

	if (mlret != MEML_SPANOP_OK) {
		if (mlret == MEML_SPANOP_EALLOC) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		} else
		if (mlret == MEML_SPANOP_ESPAN) {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ESPAN);
		} else {
			delspan_unreserve(pt_base, tpgs);
			return (KPHYSM_ERESOURCE);
		}
	}

	/*
	 * We store the page_t's for this new memory in the first
	 * few pages of the chunk. Here, we go and get'em ...
	 */

	/*
	 * The expression after the '-' gives the number of pages
	 * that will fit in the new memory based on a requirement
	 * of (PAGESIZE + sizeof (page_t)) bytes per page.
	 */
	metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) /
	    (PAGESIZE + sizeof (page_t)));

	npgs -= metapgs;
	base += metapgs;

	ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs);

	exhausted = (metapgs == 0 || npgs == 0);

	if (kpm_enable && !exhausted) {
		pgcnt_t start, end, nkpmpgs_prelim;
		size_t ptsz;

		/*
		 * A viable kpm large page mapping must not overlap two
		 * dynamic memsegs. Therefore the total size is checked
		 * to be at least kpm_pgsz and also whether start and end
		 * points are at least kpm_pgsz aligned.
		 */
		if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) ||
		    pmodkpmp(base + npgs)) {

			kphysm_addmem_error_undospan(pt_base, tpgs);

			/*
			 * There is no specific error code for violating
			 * kpm granularity constraints.
			 */
			return (KPHYSM_ENOTVIABLE);
		}

		start = kpmptop(ptokpmp(base));
		end = kpmptop(ptokpmp(base + npgs));
		nkpmpgs_prelim = ptokpmp(end - start);
		ptsz = npgs * sizeof (page_t);
		metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ);
		exhausted = (tpgs <= metapgs);
		if (!exhausted) {
			npgs = tpgs - metapgs;
			base = pt_base + metapgs;

			/* final nkpmpgs */
			start = kpmptop(ptokpmp(base));
			nkpmpgs = ptokpmp(end - start);
			kpm_pages_off = ptsz +
			    (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ;
		}
	}

	/*
	 * Is memory area supplied too small?
	 */
	if (exhausted) {
		kphysm_addmem_error_undospan(pt_base, tpgs);

		/*
		 * There is no specific error code for 'too small'.
		 */
		return (KPHYSM_ERESOURCE);
	}
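
	/*
	 * Rough illustration of the sizing above (the numbers are
	 * assumptions for illustration, not taken from this file):
	 * with an 8 KB PAGESIZE and a page_t of roughly 120 bytes, a
	 * span of 64K pages loses on the order of 950 pages to page_t
	 * metadata, i.e. a bit under 1.5% of the span, and somewhat
	 * more once the kpm_pages area is added in above.
	 */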

	/*
	 * We may re-use a previously allocated VA space for the page_ts
	 * eventually, but we need to initialize and lock the pages first.
	 */

	/*
	 * Get an address in the kernel address map, map
	 * the page_t pages and see if we can touch them.
	 */

	mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP);
	if (mapva == NULL) {
		cmn_err(CE_WARN, "kphysm_add_memory_dynamic:"
		    " Can't allocate VA for page_ts");

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}
	pp = mapva;

	if (physmax < (pt_base + tpgs))
		physmax = (pt_base + tpgs);

	/*
	 * In the remapping code we map one page at a time so we must do
	 * the same here to match mapping sizes.
	 */
	pfn = pt_base;
	vaddr = (caddr_t)pp;
	for (pnum = 0; pnum < metapgs; pnum++) {
		hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
		    PROT_READ | PROT_WRITE,
		    HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST);
		pfn++;
		vaddr += ptob(1);
	}

	if (ddi_peek32((dev_info_t *)NULL,
	    (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) {

		cmn_err(CE_PANIC, "kphysm_add_memory_dynamic:"
		    " Can't access pp array at 0x%p [phys 0x%lx]",
		    (void *)pp, pt_base);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_EFAULT);
	}

	/*
	 * Add this memory slice to its memory node translation.
	 *
	 * Note that right now, each node may have only one slice;
	 * this may change with COD or in larger SSM systems with
	 * nested latency groups, so we must not assume that the
	 * node does not yet exist.
	 */
	pnum = base + npgs - 1;
	mem_node_add_slice(base, pnum);

	/*
	 * Allocate or resize page counters as necessary to accommodate
	 * the increase in memory pages.
	 */
	mnode = PFN_2_MEM_NODE(pnum);
	if (page_ctrs_adjust(mnode) != 0) {

		mem_node_pre_del_slice(base, pnum);
		mem_node_post_del_slice(base, pnum, 0);

		hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));

		kphysm_addmem_error_undospan(pt_base, tpgs);

		return (KPHYSM_ERESOURCE);
	}

	/*
	 * Update the phys_avail memory list.
	 * The phys_install list was done at the start.
	 */

	memlist_write_lock();

	mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT,
	    (uint64_t)(npgs) << PAGESHIFT, &phys_avail);
	ASSERT(mlret == MEML_SPANOP_OK);

	memlist_write_unlock();

	/* See if we can find a memseg to re-use. */
	seg = memseg_reuse(metapgs);

	reuse = (seg != NULL);

	/*
	 * Initialize the memseg structure representing this memory
	 * and add it to the existing list of memsegs. Do some basic
	 * initialization and add the memory to the system.
	 * In order to prevent lock deadlocks, the add_physmem()
	 * code is repeated here, but split into several stages.
	 */
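	/*
	 * The stages that follow: pick up (or allocate) a memseg, fill in
	 * its page range, zero and exclusively lock the new page_t's,
	 * remap them into a re-used VA range if one was found, update the
	 * kpm structures, insert the memseg at the head of the memsegs
	 * list under memsegs_lock(), and only then free the pages into
	 * the system outside that lock.
	 */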
	if (seg == NULL) {
		seg = kmem_cache_alloc(memseg_cache, KM_SLEEP);
		bzero(seg, sizeof (struct memseg));
		seg->msegflags = MEMSEG_DYNAMIC;
		seg->pages = pp;
	} else {
		/*EMPTY*/
		ASSERT(seg->msegflags & MEMSEG_DYNAMIC);
	}

	seg->epages = seg->pages + npgs;
	seg->pages_base = base;
	seg->pages_end = base + npgs;

	/*
	 * Initialize metadata. The page_ts are set to locked state
	 * ready to be freed.
	 */
	bzero((caddr_t)pp, ptob(metapgs));

	pfn = seg->pages_base;
	/* Save the original pp base in case we reuse a memseg. */
	opp = pp;
	oepp = opp + npgs;
	for (pp = opp; pp < oepp; pp++) {
		pp->p_pagenum = pfn;
		pfn++;
		page_iolock_init(pp);
		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
			continue;
		pp->p_offset = (u_offset_t)-1;
	}

	if (reuse) {
		/* Remap our page_ts to the re-used memseg VA space. */
		pfn = pt_base;
		vaddr = (caddr_t)seg->pages;
		for (pnum = 0; pnum < metapgs; pnum++) {
			hat_devload(kas.a_hat, vaddr, ptob(1), pfn,
			    PROT_READ | PROT_WRITE,
			    HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST);
			pfn++;
			vaddr += ptob(1);
		}

		hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs),
		    HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK);

		vmem_free(heap_arena, mapva, ptob(metapgs));
	}

	hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off);

	memsegs_lock(1);

	/*
	 * The new memseg is inserted at the beginning of the list.
	 * Not only does this save searching for the tail, but in the
	 * case of a re-used memseg, it solves the problem of what
	 * happens if some process has still got a pointer to the
	 * memseg and follows the next pointer to continue traversing
	 * the memsegs list.
	 */

	hat_kpm_addmem_mseg_insert(seg);

	seg->next = memsegs;
	membar_producer();

	hat_kpm_addmem_memsegs_update(seg);

	memsegs = seg;

	build_pfn_hash();

	total_pages += npgs;

	/*
	 * Recalculate the paging parameters now total_pages has changed.
	 * This will also cause the clock hands to be reset before next use.
	 */
	setupclock(1);

	memsegs_unlock(1);

	PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs);

	/*
	 * Free the pages outside the lock to avoid locking loops.
	 */
	for (pp = seg->pages; pp < seg->epages; pp++) {
		page_free(pp, 1);
	}

	/*
	 * Now that we've updated the appropriate memory lists we
	 * need to reset a number of globals, since we've increased memory.
	 * Several have already been updated for us as noted above. The
	 * globals we're interested in at this point are:
	 * physmax - highest page frame number.
	 * physinstalled - number of pages currently installed (done earlier)
	 * maxmem - max free pages in the system
	 * physmem - physical memory pages available
	 * availrmem - real memory available
	 */

	mutex_enter(&freemem_lock);
	maxmem += npgs;
	physmem += npgs;
	availrmem += npgs;
	availrmem_initial += npgs;

	mutex_exit(&freemem_lock);

	dump_resize();

	page_freelist_coalesce_all(mnode);

	kphysm_setup_post_add(npgs);

	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK "
	    "(0x%" PRIx64 ")\n",
	    physinstalled << (PAGESHIFT - 10),
	    (uint64_t)physinstalled << PAGESHIFT);

	avmem = (uint64_t)freemem << PAGESHIFT;
	cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: "
	    "avail mem = %" PRId64 "\n", avmem);

	/*
	 * Update lgroup generation number on single lgroup systems
	 */
	if (nlgrps == 1)
		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);

	delspan_unreserve(pt_base, tpgs);
	return (KPHYSM_OK);		/* Successfully added system memory */

}

/*
 * There are various error conditions in kphysm_add_memory_dynamic()
 * which require a rollback of already changed global state.
 */
static void
kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs)
{
	int mlret;

	/* Unreserve memory span. */
	memlist_write_lock();

	mlret = memlist_delete_span(
	    (uint64_t)(pt_base) << PAGESHIFT,
	    (uint64_t)(tpgs) << PAGESHIFT, &phys_install);

	ASSERT(mlret == MEML_SPANOP_OK);
	phys_install_has_changed();
	installed_top_size(phys_install, &physmax, &physinstalled);

	memlist_write_unlock();
	delspan_unreserve(pt_base, tpgs);
}

/*
 * Only return an available memseg of exactly the right size.
 * When the meta data area has its own virtual address space
 * we will need to manage this more carefully and do best fit
 * allocations, possibly splitting an available area.
 */
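/*
 * Note: memseg_va_avail presumably accumulates entries when dynamically
 * added memsegs are deleted and their page_t VA range is retained (the
 * delete-side bookkeeping is not shown in this excerpt); the add path
 * only consumes an entry of exactly matching size here.
 */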
static struct memseg *
memseg_reuse(pgcnt_t metapgs)
{
	struct memseg **segpp, *seg;

	mutex_enter(&memseg_lists_lock);

	segpp = &memseg_va_avail;
	for (; (seg = *segpp) != NULL; segpp = &seg->lnext) {
		caddr_t end;

		if (kpm_enable)
			end = hat_kpm_mseg_reuse(seg);
		else
			end = (caddr_t)seg->epages;

		if (btopr(end - (caddr_t)seg->pages) == metapgs) {
			*segpp = seg->lnext;
			seg->lnext = NULL;
			break;
		}
	}
	mutex_exit(&memseg_lists_lock);

	return (seg);
}

static uint_t handle_gen;

struct memdelspan {
	struct memdelspan *mds_next;
	pfn_t		mds_base;
	pgcnt_t		mds_npgs;
	uint_t		*mds_bitmap;
	uint_t		*mds_bitmap_retired;
};

#define	NBPBMW		(sizeof (uint_t) * NBBY)
#define	MDS_BITMAPBYTES(MDSP) \
	((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t))

struct transit_list {
	struct transit_list	*trl_next;
	struct memdelspan	*trl_spans;
	int			trl_collect;
};

struct transit_list_head {
	kmutex_t		trh_lock;
	struct transit_list	*trh_head;
};

static struct transit_list_head transit_list_head;

struct mem_handle;
static void transit_list_collect(struct mem_handle *, int);
static void transit_list_insert(struct transit_list *);
static void transit_list_remove(struct transit_list *);

#ifdef DEBUG
#define	MEM_DEL_STATS
#endif /* DEBUG */

#ifdef MEM_DEL_STATS
static int mem_del_stat_print = 0;
struct mem_del_stat {
	uint_t	nloop;
	uint_t	need_free;
	uint_t	free_loop;
	uint_t	free_low;
	uint_t	free_failed;
	uint_t	ncheck;
	uint_t	nopaget;
	uint_t	lockfail;
	uint_t	nfree;
	uint_t	nreloc;
	uint_t	nrelocfail;
	uint_t	already_done;
	uint_t	first_notfree;
	uint_t	npplocked;
	uint_t	nlockreloc;
	uint_t	nnorepl;
	uint_t	nmodreloc;
	uint_t	ndestroy;
	uint_t	nputpage;
	uint_t	nnoreclaim;
	uint_t	ndelay;
	uint_t	demotefail;
	uint64_t nticks_total;
	uint64_t nticks_pgrp;
	uint_t	retired;
	uint_t	toxic;
	uint_t	failing;
	uint_t	modtoxic;
	uint_t	npplkdtoxic;
	uint_t	gptlmodfail;
	uint_t	gptllckfail;
};
/*
 * The stat values are only incremented in the delete thread
 * so no locking or atomic required.
 */
#define	MDSTAT_INCR(MHP, FLD)	(MHP)->mh_delstat.FLD++
#define	MDSTAT_TOTAL(MHP, ntck)	((MHP)->mh_delstat.nticks_total += (ntck))
#define	MDSTAT_PGRP(MHP, ntck)	((MHP)->mh_delstat.nticks_pgrp += (ntck))
static void mem_del_stat_print_func(struct mem_handle *);
#define	MDSTAT_PRINT(MHP)	mem_del_stat_print_func((MHP))
#else /* MEM_DEL_STATS */
#define	MDSTAT_INCR(MHP, FLD)
#define	MDSTAT_TOTAL(MHP, ntck)
#define	MDSTAT_PGRP(MHP, ntck)
#define	MDSTAT_PRINT(MHP)
#endif /* MEM_DEL_STATS */

typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING,
	MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t;

/*
 * mh_mutex must be taken to examine or change mh_exthandle and mh_state.
 * The mutex may not be required for other fields, dependent on mh_state.
 */
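/*
 * Handle life cycle, as implemented by the routines below:
 *
 *	MHND_FREE -> MHND_INIT		kphysm_del_gethandle()
 *	MHND_INIT -> MHND_STARTING	kphysm_del_start()
 *	MHND_STARTING -> MHND_RUNNING	delete_memory_thread()
 *	MHND_RUNNING -> MHND_DONE	delete_memory_thread() (done/cancel)
 *	MHND_INIT or MHND_DONE -> MHND_RELEASE -> MHND_FREE
 *					kphysm_del_release()
 */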
struct mem_handle {
	kmutex_t	mh_mutex;
	struct mem_handle *mh_next;
	memhandle_t	mh_exthandle;
	mhnd_state_t	mh_state;
	struct transit_list mh_transit;
	pgcnt_t		mh_phys_pages;
	pgcnt_t		mh_vm_pages;
	pgcnt_t		mh_hold_todo;
	void		(*mh_delete_complete)(void *, int error);
	void		*mh_delete_complete_arg;
	volatile uint_t mh_cancel;
	volatile uint_t mh_dr_aio_cleanup_cancel;
	volatile uint_t mh_aio_cleanup_done;
	kcondvar_t	mh_cv;
	kthread_id_t	mh_thread_id;
	page_t		*mh_deleted;	/* link through p_next */
#ifdef MEM_DEL_STATS
	struct mem_del_stat mh_delstat;
#endif /* MEM_DEL_STATS */
};

static struct mem_handle *mem_handle_head;
static kmutex_t mem_handle_list_mutex;

static struct mem_handle *
kphysm_allocate_mem_handle()
{
	struct mem_handle *mhp;

	mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP);
	mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&mem_handle_list_mutex);
	mutex_enter(&mhp->mh_mutex);
	/* handle_gen is protected by list mutex. */
	mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen);
	mhp->mh_next = mem_handle_head;
	mem_handle_head = mhp;
	mutex_exit(&mem_handle_list_mutex);

	return (mhp);
}

static void
kphysm_free_mem_handle(struct mem_handle *mhp)
{
	struct mem_handle **mhpp;

	ASSERT(mutex_owned(&mhp->mh_mutex));
	ASSERT(mhp->mh_state == MHND_FREE);
	/*
	 * Exit the mutex to preserve locking order. This is OK
	 * here as once in the FREE state, the handle cannot
	 * be found by a lookup.
	 */
	mutex_exit(&mhp->mh_mutex);

	mutex_enter(&mem_handle_list_mutex);
	mhpp = &mem_handle_head;
	while (*mhpp != NULL && *mhpp != mhp)
		mhpp = &(*mhpp)->mh_next;
	ASSERT(*mhpp == mhp);
	/*
	 * No need to lock the handle (mh_mutex) as only
	 * mh_next is changing and this is the only thread that
	 * can be referencing mhp.
	 */
	*mhpp = mhp->mh_next;
	mutex_exit(&mem_handle_list_mutex);

	mutex_destroy(&mhp->mh_mutex);
	kmem_free(mhp, sizeof (struct mem_handle));
}

/*
 * This function finds the internal mem_handle corresponding to an
 * external handle and returns it with the mh_mutex held.
 */
static struct mem_handle *
kphysm_lookup_mem_handle(memhandle_t handle)
{
	struct mem_handle *mhp;

	mutex_enter(&mem_handle_list_mutex);
	for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) {
		if (mhp->mh_exthandle == handle) {
			mutex_enter(&mhp->mh_mutex);
			/*
			 * The state of the handle could have been changed
			 * by kphysm_del_release() while waiting for mh_mutex.
			 */
			if (mhp->mh_state == MHND_FREE) {
				mutex_exit(&mhp->mh_mutex);
				continue;
			}
			break;
		}
	}
	mutex_exit(&mem_handle_list_mutex);
	return (mhp);
}

int
kphysm_del_gethandle(memhandle_t *xmhp)
{
	struct mem_handle *mhp;

	mhp = kphysm_allocate_mem_handle();
	/*
	 * The handle is allocated using KM_SLEEP, so cannot fail.
	 * If the implementation is changed, the correct error to return
	 * here would be KPHYSM_ENOHANDLES.
	 */
	ASSERT(mhp->mh_state == MHND_FREE);
	mhp->mh_state = MHND_INIT;
	*xmhp = mhp->mh_exthandle;
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}
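
/*
 * Typical caller sequence, sketched from the routines in this file
 * (error handling and platform specifics omitted):
 *
 *	(void) kphysm_del_gethandle(&h);
 *	(void) kphysm_del_span(h, base_pfn, npgs);	repeatable
 *	(void) kphysm_del_start(h, my_complete, my_arg);
 *	... my_complete(my_arg, error) runs when the delete finishes,
 *	... after which the caller calls kphysm_del_release(h).
 *
 * kphysm_del_status() and kphysm_del_cancel() may be used while a
 * delete is pending or running.
 */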

static int
overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2)
{
	pfn_t e1, e2;

	e1 = b1 + l1;
	e2 = b2 + l2;

	return (!(b2 >= e1 || b1 >= e2));
}

static int can_remove_pgs(pgcnt_t);

static struct memdelspan *
span_to_install(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	uint64_t address, size, thislen;
	struct memlist *mlp;

	mdsp_new = NULL;

	address = (uint64_t)base << PAGESHIFT;
	size = (uint64_t)npgs << PAGESHIFT;
	while (size != 0) {
		memlist_read_lock();
		for (mlp = phys_install; mlp != NULL; mlp = mlp->next) {
			if (address >= (mlp->address + mlp->size))
				continue;
			if ((address + size) > mlp->address)
				break;
		}
		if (mlp == NULL) {
			address += size;
			size = 0;
			thislen = 0;
		} else {
			if (address < mlp->address) {
				size -= (mlp->address - address);
				address = mlp->address;
			}
			ASSERT(address >= mlp->address);
			if ((address + size) > (mlp->address + mlp->size)) {
				thislen = mlp->size - (address - mlp->address);
			} else {
				thislen = size;
			}
		}
		memlist_read_unlock();
		/* TODO: phys_install could change now */
		if (thislen == 0)
			continue;
		mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
		mdsp->mds_base = btop(address);
		mdsp->mds_npgs = btop(thislen);
		mdsp->mds_next = mdsp_new;
		mdsp_new = mdsp;
		address += thislen;
		size -= thislen;
	}
	return (mdsp_new);
}

static void
free_delspans(struct memdelspan *mdsp)
{
	struct memdelspan *amdsp;

	while ((amdsp = mdsp) != NULL) {
		mdsp = amdsp->mds_next;
		kmem_free(amdsp, sizeof (struct memdelspan));
	}
}

/*
 * Concatenate lists. No list ordering is required.
 */

static void
delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp)
{
	while (*mdspp != NULL)
		mdspp = &(*mdspp)->mds_next;

	*mdspp = mdsp;
}

/*
 * Given a new list of delspans, check there is no overlap with
 * all existing span activity (add or delete) and then concatenate
 * the new spans to the given list.
 * Return 1 for OK, 0 if overlapping.
 */
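/*
 * Both the add path (via delspan_reserve()/reserve_transit) and each
 * delete handle (via its mh_transit list) register their spans through
 * this function, and the overlap scan below walks every transit_list
 * under trh_lock; that is what keeps concurrent add and delete
 * operations off overlapping pfn ranges.
 */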
static int
delspan_insert(
	struct transit_list *my_tlp,
	struct memdelspan *mdsp_new)
{
	struct transit_list_head *trh;
	struct transit_list *tlp;
	int ret;

	trh = &transit_list_head;

	ASSERT(my_tlp != NULL);
	ASSERT(mdsp_new != NULL);

	ret = 1;
	mutex_enter(&trh->trh_lock);
	/* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */
	for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) {
		struct memdelspan *mdsp;

		for (mdsp = tlp->trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			struct memdelspan *nmdsp;

			for (nmdsp = mdsp_new; nmdsp != NULL;
			    nmdsp = nmdsp->mds_next) {
				if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
				    nmdsp->mds_base, nmdsp->mds_npgs)) {
					ret = 0;
					goto done;
				}
			}
		}
	}
done:
	if (ret != 0) {
		if (my_tlp->trl_spans == NULL)
			transit_list_insert(my_tlp);
		delspan_concat(&my_tlp->trl_spans, mdsp_new);
	}
	mutex_exit(&trh->trh_lock);
	return (ret);
}

static void
delspan_remove(
	struct transit_list *my_tlp,
	pfn_t base,
	pgcnt_t npgs)
{
	struct transit_list_head *trh;
	struct memdelspan *mdsp;

	trh = &transit_list_head;

	ASSERT(my_tlp != NULL);

	mutex_enter(&trh->trh_lock);
	if ((mdsp = my_tlp->trl_spans) != NULL) {
		if (npgs == 0) {
			my_tlp->trl_spans = NULL;
			free_delspans(mdsp);
			transit_list_remove(my_tlp);
		} else {
			struct memdelspan **prv;

			prv = &my_tlp->trl_spans;
			while (mdsp != NULL) {
				pfn_t p_end;

				p_end = mdsp->mds_base + mdsp->mds_npgs;
				if (mdsp->mds_base >= base &&
				    p_end <= (base + npgs)) {
					*prv = mdsp->mds_next;
					mdsp->mds_next = NULL;
					free_delspans(mdsp);
				} else {
					prv = &mdsp->mds_next;
				}
				mdsp = *prv;
			}
			if (my_tlp->trl_spans == NULL)
				transit_list_remove(my_tlp);
		}
	}
	mutex_exit(&trh->trh_lock);
}

/*
 * Reserve interface for add to stop delete before add finished.
 * This list is only accessed through the delspan_insert/remove
 * functions and so is fully protected by the mutex in struct
 * transit_list_head.
 */

static struct transit_list reserve_transit;

static int
delspan_reserve(pfn_t base, pgcnt_t npgs)
{
	struct memdelspan *mdsp;
	int ret;

	mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP);
	mdsp->mds_base = base;
	mdsp->mds_npgs = npgs;
	if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) {
		free_delspans(mdsp);
	}
	return (ret);
}

static void
delspan_unreserve(pfn_t base, pgcnt_t npgs)
{
	delspan_remove(&reserve_transit, base, npgs);
}

/*
 * Return whether memseg was created by kphysm_add_memory_dynamic().
 * If this is the case and startp is non-zero, also return the start pfn
 * of the meta data via startp.
 */
static int
memseg_is_dynamic(struct memseg *seg, pfn_t *startp)
{
	pfn_t pt_start;

	if ((seg->msegflags & MEMSEG_DYNAMIC) == 0)
		return (0);

	/* Meta data is required to be at the beginning */
	ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base);

	pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages);
	if (startp != NULL)
		*startp = pt_start;

	return (1);
}

int
kphysm_del_span(
	memhandle_t handle,
	pfn_t base,
	pgcnt_t npgs)
{
	struct mem_handle *mhp;
	struct memseg *seg;
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	pgcnt_t phys_pages, vm_pages;
	pfn_t p_end;
	page_t *pp;
	int ret;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_INIT) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	}

	/*
	 * Intersect the span with the installed memory list (phys_install).
	 */
	mdsp_new = span_to_install(base, npgs);
	if (mdsp_new == NULL) {
		/*
		 * No physical memory in this range. Is this an
		 * error? If an attempt to start the delete is made
		 * for OK returns from del_span such as this, start will
		 * return an error.
		 * Could return KPHYSM_ENOWORK.
		 */
		/*
		 * It is assumed that there are no error returns
		 * from span_to_install() due to kmem_alloc failure.
		 */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_OK);
	}
	/*
	 * Does this span overlap an existing span?
	 */
	if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) {
		/*
		 * Differentiate between already on list for this handle
		 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY).
		 */
		ret = KPHYSM_EBUSY;
		for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
		    mdsp = mdsp->mds_next) {
			if (overlapping(mdsp->mds_base, mdsp->mds_npgs,
			    base, npgs)) {
				ret = KPHYSM_EDUP;
				break;
			}
		}
		mutex_exit(&mhp->mh_mutex);
		free_delspans(mdsp_new);
		return (ret);
	}
	/*
	 * At this point the spans in mdsp_new have been inserted into the
	 * list of spans for this handle and thereby to the global list of
	 * spans being processed. Each of these spans must now be checked
	 * for relocatability. As a side-effect segments in the memseg list
	 * may be split.
	 *
	 * Note that mdsp_new can no longer be used as it is now part of
	 * a larger list. Select elements of this larger list based
	 * on base and npgs.
	 */
restart:
	phys_pages = 0;
	vm_pages = 0;
	ret = KPHYSM_OK;
	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		pgcnt_t pages_checked;

		if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) {
			continue;
		}
		p_end = mdsp->mds_base + mdsp->mds_npgs;
		/*
		 * The pages_checked count is a hack. All pages should be
		 * checked for relocatability. Those not covered by memsegs
		 * should be tested with arch_kphysm_del_span_ok().
		 */
		pages_checked = 0;
		for (seg = memsegs; seg; seg = seg->next) {
			pfn_t mseg_start;

			if (seg->pages_base >= p_end ||
			    seg->pages_end <= mdsp->mds_base) {
				/* Span and memseg don't overlap. */
				continue;
			}
			/* Check that segment is suitable for delete. */
			if (memseg_is_dynamic(seg, &mseg_start)) {
				/*
				 * Can only delete whole added segments
				 * for the moment.
				 * Check that this is completely within the
				 * span.
				 */
				if (mseg_start < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					ret = KPHYSM_EBUSY;
					break;
				}
				pages_checked += seg->pages_end - mseg_start;
			} else {
				/*
				 * Set mseg_start for accounting below.
				 */
				mseg_start = seg->pages_base;
				/*
				 * If this segment is larger than the span,
				 * try to split it. After the split, it
				 * is necessary to restart.
				 */
				if (seg->pages_base < mdsp->mds_base ||
				    seg->pages_end > p_end) {
					pfn_t abase;
					pgcnt_t anpgs;
					int s_ret;

					/* Split required. */
					if (mdsp->mds_base < seg->pages_base)
						abase = seg->pages_base;
					else
						abase = mdsp->mds_base;
					if (p_end > seg->pages_end)
						anpgs = seg->pages_end - abase;
					else
						anpgs = p_end - abase;
					s_ret = kphysm_split_memseg(abase,
					    anpgs);
					if (s_ret == 0) {
						/* Split failed. */
						ret = KPHYSM_ERESOURCE;
						break;
					}
					goto restart;
				}
				pages_checked +=
				    seg->pages_end - seg->pages_base;
			}
			/*
			 * The memseg is wholly within the delete span.
			 * The individual pages can now be checked.
			 */
			/* Cage test. */
			for (pp = seg->pages; pp < seg->epages; pp++) {
				if (PP_ISNORELOC(pp)) {
					ret = KPHYSM_ENONRELOC;
					break;
				}
			}
			if (ret != KPHYSM_OK) {
				break;
			}
			phys_pages += (seg->pages_end - mseg_start);
			vm_pages += MSEG_NPAGES(seg);
		}
		if (ret != KPHYSM_OK)
			break;
		if (pages_checked != mdsp->mds_npgs) {
			ret = KPHYSM_ENONRELOC;
			break;
		}
	}

	if (ret == KPHYSM_OK) {
		mhp->mh_phys_pages += phys_pages;
		mhp->mh_vm_pages += vm_pages;
	} else {
		/*
		 * Keep holding the mh_mutex to prevent it going away.
		 */
		delspan_remove(&mhp->mh_transit, base, npgs);
	}
	mutex_exit(&mhp->mh_mutex);
	return (ret);
}

int
kphysm_del_span_query(
	pfn_t base,
	pgcnt_t npgs,
	memquery_t *mqp)
{
	struct memdelspan *mdsp;
	struct memdelspan *mdsp_new;
	int done_first_nonreloc;

	mqp->phys_pages = 0;
	mqp->managed = 0;
	mqp->nonrelocatable = 0;
	mqp->first_nonrelocatable = 0;
	mqp->last_nonrelocatable = 0;

	mdsp_new = span_to_install(base, npgs);
	/*
	 * It is OK to proceed here if mdsp_new == NULL.
	 */
	done_first_nonreloc = 0;
	for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) {
		pfn_t sbase;
		pgcnt_t snpgs;

		mqp->phys_pages += mdsp->mds_npgs;
		sbase = mdsp->mds_base;
		snpgs = mdsp->mds_npgs;
		while (snpgs != 0) {
			struct memseg *lseg, *seg;
			pfn_t p_end;
			page_t *pp;
			pfn_t mseg_start;

			p_end = sbase + snpgs;
			/*
			 * Find the lowest addressed memseg that starts
			 * after sbase and account for it.
			 * This is to catch dynamic memsegs whose start
			 * is hidden.
			 */
			seg = NULL;
			for (lseg = memsegs; lseg != NULL; lseg = lseg->next) {
				if ((lseg->pages_base >= sbase) ||
				    (lseg->pages_base < p_end &&
				    lseg->pages_end > sbase)) {
					if (seg == NULL ||
					    seg->pages_base > lseg->pages_base)
						seg = lseg;
				}
			}
			if (seg != NULL) {
				if (!memseg_is_dynamic(seg, &mseg_start)) {
					mseg_start = seg->pages_base;
				}
				/*
				 * Now have the full extent of the memseg so
				 * do the range check.
				 */
				if (mseg_start >= p_end ||
				    seg->pages_end <= sbase) {
					/* Span does not overlap memseg. */
					seg = NULL;
				}
			}
			/*
			 * Account for gap either before the segment if
			 * there is one or to the end of the span.
			 */
			if (seg == NULL || mseg_start > sbase) {
				pfn_t a_end;

				a_end = (seg == NULL) ? p_end : mseg_start;
				/*
				 * Check with arch layer for relocatability.
				 */
				if (arch_kphysm_del_span_ok(sbase,
				    (a_end - sbase))) {
					/*
					 * No non-relocatable pages in this
					 * area, avoid the fine-grained
					 * test.
					 */
					snpgs -= (a_end - sbase);
					sbase = a_end;
				}
				while (sbase < a_end) {
					if (!arch_kphysm_del_span_ok(sbase,
					    1)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
			if (seg != NULL) {
				ASSERT(mseg_start <= sbase);
				if (seg->pages_base != mseg_start &&
				    seg->pages_base > sbase) {
					pgcnt_t skip_pgs;

					/*
					 * Skip the page_t area of a
					 * dynamic memseg.
					 */
					skip_pgs = seg->pages_base - sbase;
					if (snpgs <= skip_pgs) {
						sbase += snpgs;
						snpgs = 0;
						continue;
					}
					snpgs -= skip_pgs;
					sbase += skip_pgs;
				}
				ASSERT(snpgs != 0);
				ASSERT(seg->pages_base <= sbase);
				/*
				 * The individual pages can now be checked.
				 */
				for (pp = seg->pages +
				    (sbase - seg->pages_base);
				    snpgs != 0 && pp < seg->epages; pp++) {
					mqp->managed++;
					if (PP_ISNORELOC(pp)) {
						mqp->nonrelocatable++;
						if (!done_first_nonreloc) {
							mqp->
							    first_nonrelocatable
							    = sbase;
							done_first_nonreloc = 1;
						}
						mqp->last_nonrelocatable =
						    sbase;
					}
					sbase++;
					snpgs--;
				}
			}
		}
	}

	free_delspans(mdsp_new);

	return (KPHYSM_OK);
}

/*
 * This release function can be called at any stage as follows:
 *	_gethandle only called
 *	_span(s) only called
 *	_start called but failed
 *	delete thread exited
 */
int
kphysm_del_release(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	switch (mhp->mh_state) {
	case MHND_STARTING:
	case MHND_RUNNING:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTFINISHED);
	case MHND_FREE:
		ASSERT(mhp->mh_state != MHND_FREE);
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	case MHND_INIT:
		break;
	case MHND_DONE:
		break;
	case MHND_RELEASE:
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ESEQUENCE);
	default:
#ifdef DEBUG
		cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d",
		    (void *)mhp, mhp->mh_state);
#endif /* DEBUG */
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Set state so that we can wait if necessary.
	 * Also this means that we have read/write access to all
	 * fields except mh_exthandle and mh_state.
	 */
	mhp->mh_state = MHND_RELEASE;
	/*
	 * The mem_handle cannot be de-allocated by any other operation
	 * now, so no need to hold mh_mutex.
	 */
	mutex_exit(&mhp->mh_mutex);

	delspan_remove(&mhp->mh_transit, 0, 0);
	mhp->mh_phys_pages = 0;
	mhp->mh_vm_pages = 0;
	mhp->mh_hold_todo = 0;
	mhp->mh_delete_complete = NULL;
	mhp->mh_delete_complete_arg = NULL;
	mhp->mh_cancel = 0;

	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_RELEASE);
	mhp->mh_state = MHND_FREE;

	kphysm_free_mem_handle(mhp);

	return (KPHYSM_OK);
}

/*
 * This cancel function can only be called with the thread running.
 */
int
kphysm_del_cancel(memhandle_t handle)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTRUNNING);
	}
	/*
	 * Set the cancel flag and wake the delete thread up.
	 * The thread may be waiting on I/O, so the effect of the cancel
	 * may be delayed.
	 */
	if (mhp->mh_cancel == 0) {
		mhp->mh_cancel = KPHYSM_ECANCELLED;
		cv_signal(&mhp->mh_cv);
	}
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}

int
kphysm_del_status(
	memhandle_t handle,
	memdelstat_t *mdstp)
{
	struct mem_handle *mhp;

	mhp = kphysm_lookup_mem_handle(handle);
	if (mhp == NULL) {
		return (KPHYSM_EHANDLE);
	}
	/*
	 * Calling kphysm_del_status() is allowed before the delete
	 * is started to allow for status display.
	 */
	if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING &&
	    mhp->mh_state != MHND_RUNNING) {
		mutex_exit(&mhp->mh_mutex);
		return (KPHYSM_ENOTRUNNING);
	}
	mdstp->phys_pages = mhp->mh_phys_pages;
	mdstp->managed = mhp->mh_vm_pages;
	mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo;
	mutex_exit(&mhp->mh_mutex);
	return (KPHYSM_OK);
}

static int mem_delete_additional_pages = 100;

static int
can_remove_pgs(pgcnt_t npgs)
{
	/*
	 * If all pageable pages were paged out, freemem would
	 * equal availrmem. There is a minimum requirement for
	 * availrmem.
	 */
	if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages))
	    < npgs)
		return (0);
	/* TODO: check swap space, etc. */
	return (1);
}

static int
get_availrmem(pgcnt_t npgs)
{
	int ret;

	mutex_enter(&freemem_lock);
	ret = can_remove_pgs(npgs);
	if (ret != 0)
		availrmem -= npgs;
	mutex_exit(&freemem_lock);
	return (ret);
}

static void
put_availrmem(pgcnt_t npgs)
{
	mutex_enter(&freemem_lock);
	availrmem += npgs;
	mutex_exit(&freemem_lock);
}

#define	FREEMEM_INCR	100
static pgcnt_t freemem_incr = FREEMEM_INCR;
#define	DEL_FREE_WAIT_FRAC	4
#define	DEL_FREE_WAIT_TICKS	((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC)

#define	DEL_BUSY_WAIT_FRAC	20
#define	DEL_BUSY_WAIT_TICKS	((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC)

static void kphysm_del_cleanup(struct mem_handle *);

static void page_delete_collect(page_t *, struct mem_handle *);

static pgcnt_t
delthr_get_freemem(struct mem_handle *mhp)
{
	pgcnt_t free_get;
	int ret;

	ASSERT(MUTEX_HELD(&mhp->mh_mutex));

	MDSTAT_INCR(mhp, need_free);
	/*
	 * Get up to freemem_incr pages.
	 */
	free_get = freemem_incr;
	if (free_get > mhp->mh_hold_todo)
		free_get = mhp->mh_hold_todo;
	/*
	 * Take free_get pages away from freemem,
	 * waiting if necessary.
	 */

	while (!mhp->mh_cancel) {
		mutex_exit(&mhp->mh_mutex);
		MDSTAT_INCR(mhp, free_loop);
		/*
		 * Duplicate test from page_create_throttle()
		 * but don't override with !PG_WAIT.
		 */
		if (freemem < (free_get + throttlefree)) {
			MDSTAT_INCR(mhp, free_low);
			ret = 0;
		} else {
			ret = page_create_wait(free_get, 0);
			if (ret == 0) {
				/* EMPTY */
				MDSTAT_INCR(mhp, free_failed);
			}
		}
		if (ret != 0) {
			mutex_enter(&mhp->mh_mutex);
			return (free_get);
		}

		/*
		 * Put pressure on pageout.
		 */
		page_needfree(free_get);
		cv_signal(&proc_pageout->p_cv);

		mutex_enter(&mhp->mh_mutex);
		(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
		    (lbolt + DEL_FREE_WAIT_TICKS));
		mutex_exit(&mhp->mh_mutex);
		page_needfree(-(spgcnt_t)free_get);

		mutex_enter(&mhp->mh_mutex);
	}
	return (0);
}

#define	DR_AIO_CLEANUP_DELAY	25000	/* 0.025secs, in usec */
#define	DR_AIO_CLEANUP_MAXLOOPS_NODELAY	100
/*
 * This function is run as a helper thread for delete_memory_thread.
 * It is needed in order to force kaio cleanup, so that pages used in kaio
 * will be unlocked and subsequently relocated by delete_memory_thread.
 * The address of the delete_memory_thread's mem_handle is passed in to
 * this thread function, and is used to set the mh_aio_cleanup_done member
 * prior to calling thread_exit().
 */
static void
dr_aio_cleanup_thread(caddr_t amhp)
{
	proc_t *procp;
	int (*aio_cleanup_dr_delete_memory)(proc_t *);
	int cleaned;
	int n = 0;
	struct mem_handle *mhp;
	volatile uint_t *pcancel;

	mhp = (struct mem_handle *)amhp;
	ASSERT(mhp != NULL);
	pcancel = &mhp->mh_dr_aio_cleanup_cancel;
	if (modload("sys", "kaio") == -1) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio");
		thread_exit();
	}
	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
	if (aio_cleanup_dr_delete_memory == NULL) {
		mhp->mh_aio_cleanup_done = 1;
		cmn_err(CE_WARN,
		    "aio_cleanup_dr_delete_memory not found in kaio");
		thread_exit();
	}
	do {
		cleaned = 0;
		mutex_enter(&pidlock);
		for (procp = practive; (*pcancel == 0) && (procp != NULL);
		    procp = procp->p_next) {
			mutex_enter(&procp->p_lock);
			if (procp->p_aio != NULL) {
				/* cleanup proc's outstanding kaio */
				cleaned +=
				    (*aio_cleanup_dr_delete_memory)(procp);
			}
			mutex_exit(&procp->p_lock);
		}
		mutex_exit(&pidlock);
		if ((*pcancel == 0) &&
		    (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) {
			/* delay a bit before retrying all procs again */
			delay(drv_usectohz(DR_AIO_CLEANUP_DELAY));
			n = 0;
		}
	} while (*pcancel == 0);
	mhp->mh_aio_cleanup_done = 1;
	thread_exit();
}

static void
delete_memory_thread(caddr_t amhp)
{
	struct mem_handle *mhp;
	struct memdelspan *mdsp;
	callb_cpr_t cprinfo;
	page_t *pp_targ;
	spgcnt_t freemem_left;
	void (*del_complete_funcp)(void *, int error);
	void *del_complete_arg;
	int comp_code;
	int ret;
	int first_scan;
	uint_t szc;
#ifdef MEM_DEL_STATS
	uint64_t start_total, ntick_total;
	uint64_t start_pgrp, ntick_pgrp;
#endif /* MEM_DEL_STATS */

	mhp = (struct mem_handle *)amhp;

#ifdef MEM_DEL_STATS
	start_total = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */

	CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex,
	    callb_generic_cpr, "memdel");

	mutex_enter(&mhp->mh_mutex);
	ASSERT(mhp->mh_state == MHND_STARTING);

	mhp->mh_state = MHND_RUNNING;
	mhp->mh_thread_id = curthread;

	mhp->mh_hold_todo = mhp->mh_vm_pages;
	mutex_exit(&mhp->mh_mutex);

	/* Allocate the remap pages now, if necessary. */
	memseg_remap_init();

	/*
	 * Subtract from availrmem now if possible as availrmem
	 * may not be available by the end of the delete.
	 */
	if (!get_availrmem(mhp->mh_vm_pages)) {
		comp_code = KPHYSM_ENOTVIABLE;
		mutex_enter(&mhp->mh_mutex);
		goto early_exit;
	}

	ret = kphysm_setup_pre_del(mhp->mh_vm_pages);

	mutex_enter(&mhp->mh_mutex);

	if (ret != 0) {
		mhp->mh_cancel = KPHYSM_EREFUSED;
		goto refused;
	}

	transit_list_collect(mhp, 1);

	for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL;
	    mdsp = mdsp->mds_next) {
		ASSERT(mdsp->mds_bitmap == NULL);
		mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP);
		mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp),
		    KM_SLEEP);
	}

	first_scan = 1;
	freemem_left = 0;
	/*
	 * Start dr_aio_cleanup_thread, which periodically iterates
	 * through the process list and invokes aio cleanup. This
	 * is needed in order to avoid a deadly embrace between the
	 * delete_memory_thread (waiting on writer lock for page, with the
	 * exclusive-wanted bit set), kaio read request threads (waiting for a
	 * reader lock on the same page that is wanted by the
	 * delete_memory_thread), and threads waiting for kaio completion
	 * (blocked on spt_amp->lock).
	 */
	mhp->mh_dr_aio_cleanup_cancel = 0;
	mhp->mh_aio_cleanup_done = 0;
	(void) thread_create(NULL, 0, dr_aio_cleanup_thread,
	    (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1);
	while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) {
		pgcnt_t collected;

		MDSTAT_INCR(mhp, nloop);
		collected = 0;
		for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) &&
		    (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) {
			pfn_t pfn, p_end;

			if (first_scan) {
				mem_node_pre_del_slice(mdsp->mds_base,
				    mdsp->mds_base + mdsp->mds_npgs - 1);
			}

			p_end = mdsp->mds_base + mdsp->mds_npgs;
			for (pfn = mdsp->mds_base; (pfn < p_end) &&
			    (mhp->mh_cancel == 0); pfn++) {
				page_t *pp, *tpp, *tpp_targ;
				pgcnt_t bit;
				struct vnode *vp;
				u_offset_t offset;
				int mod, result;
				spgcnt_t pgcnt;

				bit = pfn - mdsp->mds_base;
				if ((mdsp->mds_bitmap[bit / NBPBMW] &
				    (1 << (bit % NBPBMW))) != 0) {
					MDSTAT_INCR(mhp, already_done);
					continue;
				}
				if (freemem_left == 0) {
					freemem_left += delthr_get_freemem(mhp);
					if (freemem_left == 0)
						break;
				}

				/*
				 * Release mh_mutex - some of this
				 * stuff takes some time (eg PUTPAGE).
				 */

				mutex_exit(&mhp->mh_mutex);
				MDSTAT_INCR(mhp, ncheck);

				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					/*
					 * Not covered by a page_t - will
					 * be dealt with elsewhere.
					 */
					MDSTAT_INCR(mhp, nopaget);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}

				if (!page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					/*
					 * Page in use elsewhere. Skip it.
					 */
					MDSTAT_INCR(mhp, lockfail);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * See if the cage expanded into the delete.
				 * This can happen as we have to allow the
				 * cage to expand.
				 */
				if (PP_ISNORELOC(pp)) {
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					mhp->mh_cancel = KPHYSM_ENONRELOC;
					break;
				}
				if (PP_RETIRED(pp)) {
					/*
					 * Page has been retired and is
					 * not part of the cage so we
					 * can now do the accounting for
					 * it.
					 */
					MDSTAT_INCR(mhp, retired);
					mutex_enter(&mhp->mh_mutex);
					mdsp->mds_bitmap[bit / NBPBMW]
					    |= (1 << (bit % NBPBMW));
					mdsp->mds_bitmap_retired[bit /
					    NBPBMW] |=
					    (1 << (bit % NBPBMW));
					mhp->mh_hold_todo--;
					continue;
				}
				ASSERT(freemem_left != 0);
				if (PP_ISFREE(pp)) {
					/*
					 * Like page_reclaim() only 'freemem'
					 * processing is already done.
					 */
					MDSTAT_INCR(mhp, nfree);
free_page_collect:
					if (PP_ISAGED(pp)) {
						page_list_sub(pp,
						    PG_FREE_LIST);
					} else {
						page_list_sub(pp,
						    PG_CACHE_LIST);
					}
					PP_CLRFREE(pp);
					PP_CLRAGED(pp);
					collected++;
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					freemem_left--;
					continue;
				}
				ASSERT(pp->p_vnode != NULL);
				if (first_scan) {
					MDSTAT_INCR(mhp, first_notfree);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				/*
				 * Keep stats on pages encountered that
				 * are marked for retirement.
				 */
				if (PP_TOXIC(pp)) {
					MDSTAT_INCR(mhp, toxic);
				} else if (PP_PR_REQ(pp)) {
					MDSTAT_INCR(mhp, failing);
				}
				/*
				 * In certain cases below, special exceptions
				 * are made for pages that are toxic.  This
				 * is because the current meaning of toxic
				 * is that an uncorrectable error has been
				 * previously associated with the page.
				 */
				if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
					if (!PP_TOXIC(pp)) {
						/*
						 * Must relocate locked in
						 * memory pages.
						 */
#ifdef MEM_DEL_STATS
						start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
						/*
						 * Lock all constituent pages
						 * of a large page to ensure
						 * that p_szc won't change.
						 */
						if (!group_page_trylock(pp,
						    SE_EXCL)) {
							MDSTAT_INCR(mhp,
							    gptllckfail);
							page_unlock(pp);
							mutex_enter(
							    &mhp->mh_mutex);
							continue;
						}
						MDSTAT_INCR(mhp, npplocked);
						pp_targ =
						    page_get_replacement_page(
						    pp, NULL, 0);
						if (pp_targ != NULL) {
#ifdef MEM_DEL_STATS
							ntick_pgrp =
							    (uint64_t)
							    ddi_get_lbolt() -
							    start_pgrp;
#endif /* MEM_DEL_STATS */
							MDSTAT_PGRP(mhp,
							    ntick_pgrp);
							MDSTAT_INCR(mhp,
							    nlockreloc);
							goto reloc;
						}
						group_page_unlock(pp);
						page_unlock(pp);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						MDSTAT_INCR(mhp, nnorepl);
						mutex_enter(&mhp->mh_mutex);
						continue;
					} else {
						/*
						 * Cannot do anything about
						 * this page because it is
						 * toxic.
						 */
						MDSTAT_INCR(mhp, npplkdtoxic);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
				}
				/*
				 * Unload the mappings and check if mod bit
				 * is set.
				 */
				ASSERT(!PP_ISKAS(pp));
				(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
				mod = hat_ismod(pp);

#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				if (mod && !PP_TOXIC(pp)) {
					/*
					 * Lock all constituent pages
					 * of a large page to ensure
					 * that p_szc won't change.
					 */
					if (!group_page_trylock(pp, SE_EXCL)) {
						MDSTAT_INCR(mhp, gptlmodfail);
						page_unlock(pp);
						mutex_enter(&mhp->mh_mutex);
						continue;
					}
					pp_targ = page_get_replacement_page(pp,
					    NULL, 0);
					if (pp_targ != NULL) {
						MDSTAT_INCR(mhp, nmodreloc);
#ifdef MEM_DEL_STATS
						ntick_pgrp =
						    (uint64_t)ddi_get_lbolt() -
						    start_pgrp;
#endif /* MEM_DEL_STATS */
						MDSTAT_PGRP(mhp, ntick_pgrp);
						goto reloc;
					}
					group_page_unlock(pp);
				}

				if (!page_try_demote_pages(pp)) {
					MDSTAT_INCR(mhp, demotefail);
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * Regular 'page-out'.
				 */
				if (!mod) {
					MDSTAT_INCR(mhp, ndestroy);
					page_destroy(pp, 1);
					/*
					 * page_destroy was called with
					 * dontfree. As long as p_lckcnt
					 * and p_cowcnt are both zero, the
					 * only additional action of
					 * page_destroy with !dontfree is to
					 * call page_free, so we can collect
					 * the page here.
					 */
					collected++;
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					mutex_enter(&mhp->mh_mutex);
					page_delete_collect(pp, mhp);
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
					continue;
				}
				/*
				 * The page is toxic and the mod bit is
				 * set, we cannot do anything here to deal
				 * with it.
				 */
				if (PP_TOXIC(pp)) {
					page_unlock(pp);
#ifdef MEM_DEL_STATS
					ntick_pgrp = (uint64_t)ddi_get_lbolt() -
					    start_pgrp;
#endif /* MEM_DEL_STATS */
					MDSTAT_PGRP(mhp, ntick_pgrp);
					MDSTAT_INCR(mhp, modtoxic);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				MDSTAT_INCR(mhp, nputpage);
				vp = pp->p_vnode;
				offset = pp->p_offset;
				VN_HOLD(vp);
				page_unlock(pp);
				(void) VOP_PUTPAGE(vp, offset, PAGESIZE,
				    B_INVAL|B_FORCE, kcred, NULL);
				VN_RELE(vp);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				/*
				 * Try to get the page back immediately
				 * so that it can be collected.
				 */
				pp = page_numtopp_nolock(pfn);
				if (pp == NULL) {
					MDSTAT_INCR(mhp, nnoreclaim);
					/*
					 * This should not happen as this
					 * thread is deleting the page.
					 * If this code is generalized, this
					 * becomes a reality.
					 */
#ifdef DEBUG
					cmn_err(CE_WARN,
					    "delete_memory_thread(0x%p) "
					    "pfn 0x%lx has no page_t",
					    (void *)mhp, pfn);
#endif /* DEBUG */
					mutex_enter(&mhp->mh_mutex);
					continue;
				}
				if (page_try_reclaim_lock(pp, SE_EXCL,
				    SE_EXCL_WANTED | SE_RETIRED)) {
					if (PP_ISFREE(pp)) {
						goto free_page_collect;
					}
					page_unlock(pp);
				}
				MDSTAT_INCR(mhp, nnoreclaim);
				mutex_enter(&mhp->mh_mutex);
				continue;

reloc:
				/*
				 * Got some freemem and a target
				 * page, so move the data to avoid
				 * I/O and lock problems.
				 */
				ASSERT(!page_iolock_assert(pp));
				MDSTAT_INCR(mhp, nreloc);
				/*
				 * page_relocate() will return pgcnt: the
				 * number of consecutive pages relocated.
				 * If it is successful, pp will be a
				 * linked list of the page structs that
				 * were relocated. If page_relocate() is
				 * unsuccessful, pp will be unmodified.
				 */
#ifdef MEM_DEL_STATS
				start_pgrp = ddi_get_lbolt();
#endif /* MEM_DEL_STATS */
				result = page_relocate(&pp, &pp_targ, 0, 0,
				    &pgcnt, NULL);
#ifdef MEM_DEL_STATS
				ntick_pgrp = (uint64_t)ddi_get_lbolt() -
				    start_pgrp;
#endif /* MEM_DEL_STATS */
				MDSTAT_PGRP(mhp, ntick_pgrp);
				if (result != 0) {
					MDSTAT_INCR(mhp, nrelocfail);
					/*
					 * We did not succeed. We need
					 * to give the pp_targ pages back.
					 * page_free(pp_targ, 1) without
					 * the freemem accounting.
					 */
					group_page_unlock(pp);
					page_free_replacement_page(pp_targ);
					page_unlock(pp);
					mutex_enter(&mhp->mh_mutex);
					continue;
				}

				/*
				 * We will then collect pgcnt pages.
				 */
				ASSERT(pgcnt > 0);
				mutex_enter(&mhp->mh_mutex);
				/*
				 * We need to make sure freemem_left is
				 * large enough.
				 */
				while ((freemem_left < pgcnt) &&
				    (!mhp->mh_cancel)) {
					freemem_left +=
					    delthr_get_freemem(mhp);
				}

				/*
				 * Do not proceed if mh_cancel is set.
				 */
				if (mhp->mh_cancel) {
					while (pp_targ != NULL) {
						/*
						 * Unlink and unlock each page.
						 */
						tpp_targ = pp_targ;
						page_sub(&pp_targ, tpp_targ);
						page_unlock(tpp_targ);
					}
					/*
					 * We need to give the pp pages back.
					 * page_free(pp, 1) without the
					 * freemem accounting.
					 */
					page_free_replacement_page(pp);
					break;
				}

				/* Now remove pgcnt from freemem_left */
				freemem_left -= pgcnt;
				ASSERT(freemem_left >= 0);
				szc = pp->p_szc;
				while (pp != NULL) {
					/*
					 * pp and pp_targ were passed back as
					 * a linked list of pages.
					 * Unlink and unlock each page.
					 */
					tpp_targ = pp_targ;
					page_sub(&pp_targ, tpp_targ);
					page_unlock(tpp_targ);
					/*
					 * The original page is now free
					 * so remove it from the linked
					 * list and collect it.
					 */
					tpp = pp;
					page_sub(&pp, tpp);
					pfn = page_pptonum(tpp);
					collected++;
					ASSERT(PAGE_EXCL(tpp));
					ASSERT(tpp->p_vnode == NULL);
					ASSERT(!hat_page_is_mapped(tpp));
					ASSERT(tpp->p_szc == szc);
					tpp->p_szc = 0;
					page_delete_collect(tpp, mhp);
					bit = pfn - mdsp->mds_base;
					mdsp->mds_bitmap[bit / NBPBMW] |=
					    (1 << (bit % NBPBMW));
				}
				ASSERT(pp_targ == NULL);
			}
		}
		first_scan = 0;
		if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) &&
		    (collected == 0)) {
			/*
			 * This code is needed as we cannot wait
			 * for a page to be locked OR the delete to
			 * be cancelled. Also, we must delay so
			 * that other threads get a chance to run
			 * on our cpu, otherwise page locks may be
			 * held indefinitely by those threads.
			 */
			MDSTAT_INCR(mhp, ndelay);
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			(void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex,
			    (lbolt + DEL_BUSY_WAIT_TICKS));
			CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex);
		}
	}
	/* stop the dr aio cleanup thread */
	mhp->mh_dr_aio_cleanup_cancel = 1;
	transit_list_collect(mhp, 0);
	if (freemem_left != 0) {
		/* Return any surplus. */
*/ 2191 page_create_putback(freemem_left); 2192 freemem_left = 0; 2193 } 2194 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2195 mdsp = mdsp->mds_next) { 2196 mem_node_post_del_slice(mdsp->mds_base, 2197 mdsp->mds_base + mdsp->mds_npgs - 1, 2198 (mhp->mh_cancel != 0)); 2199 } 2200 #ifdef MEM_DEL_STATS 2201 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2202 #endif /* MEM_DEL_STATS */ 2203 MDSTAT_TOTAL(mhp, ntick_total); 2204 MDSTAT_PRINT(mhp); 2205 2206 /* 2207 * If the memory delete was cancelled, exclusive-wanted bits must 2208 * be cleared. If there are retired pages being deleted, they need 2209 * to be unretired. 2210 */ 2211 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2212 mdsp = mdsp->mds_next) { 2213 pfn_t pfn, p_end; 2214 2215 p_end = mdsp->mds_base + mdsp->mds_npgs; 2216 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2217 page_t *pp; 2218 pgcnt_t bit; 2219 2220 bit = pfn - mdsp->mds_base; 2221 if (mhp->mh_cancel) { 2222 pp = page_numtopp_nolock(pfn); 2223 if (pp != NULL) { 2224 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2225 (1 << (bit % NBPBMW))) == 0) { 2226 page_lock_clr_exclwanted(pp); 2227 } 2228 } 2229 } else { 2230 pp = NULL; 2231 } 2232 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2233 (1 << (bit % NBPBMW))) != 0) { 2234 /* do we already have pp? */ 2235 if (pp == NULL) { 2236 pp = page_numtopp_nolock(pfn); 2237 } 2238 ASSERT(pp != NULL); 2239 ASSERT(PP_RETIRED(pp)); 2240 if (mhp->mh_cancel != 0) { 2241 page_unlock(pp); 2242 /* 2243 * To satisfy ASSERT below in 2244 * cancel code. 2245 */ 2246 mhp->mh_hold_todo++; 2247 } else { 2248 (void) page_unretire_pp(pp, 2249 PR_UNR_CLEAN); 2250 } 2251 } 2252 } 2253 } 2254 /* 2255 * Free retired page bitmap and collected page bitmap 2256 */ 2257 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2258 mdsp = mdsp->mds_next) { 2259 ASSERT(mdsp->mds_bitmap_retired != NULL); 2260 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2261 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2262 ASSERT(mdsp->mds_bitmap != NULL); 2263 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2264 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2265 } 2266 2267 /* wait for our dr aio cancel thread to exit */ 2268 while (!(mhp->mh_aio_cleanup_done)) { 2269 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2270 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2271 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2272 } 2273 refused: 2274 if (mhp->mh_cancel != 0) { 2275 page_t *pp; 2276 2277 comp_code = mhp->mh_cancel; 2278 /* 2279 * Go through list of deleted pages (mh_deleted) freeing 2280 * them. 2281 */ 2282 while ((pp = mhp->mh_deleted) != NULL) { 2283 mhp->mh_deleted = pp->p_next; 2284 mhp->mh_hold_todo++; 2285 mutex_exit(&mhp->mh_mutex); 2286 /* Restore p_next. */ 2287 pp->p_next = pp->p_prev; 2288 if (PP_ISFREE(pp)) { 2289 cmn_err(CE_PANIC, 2290 "page %p is free", 2291 (void *)pp); 2292 } 2293 page_free(pp, 1); 2294 mutex_enter(&mhp->mh_mutex); 2295 } 2296 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2297 2298 mutex_exit(&mhp->mh_mutex); 2299 put_availrmem(mhp->mh_vm_pages); 2300 mutex_enter(&mhp->mh_mutex); 2301 2302 goto t_exit; 2303 } 2304 2305 /* 2306 * All the pages are no longer in use and are exclusively locked. 2307 */ 2308 2309 mhp->mh_deleted = NULL; 2310 2311 kphysm_del_cleanup(mhp); 2312 2313 comp_code = KPHYSM_OK; 2314 2315 t_exit: 2316 mutex_exit(&mhp->mh_mutex); 2317 kphysm_setup_post_del(mhp->mh_vm_pages, 2318 (comp_code == KPHYSM_OK) ? 
0 : 1); 2319 mutex_enter(&mhp->mh_mutex); 2320 2321 early_exit: 2322 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2323 mhp->mh_state = MHND_DONE; 2324 del_complete_funcp = mhp->mh_delete_complete; 2325 del_complete_arg = mhp->mh_delete_complete_arg; 2326 CALLB_CPR_EXIT(&cprinfo); 2327 (*del_complete_funcp)(del_complete_arg, comp_code); 2328 thread_exit(); 2329 /*NOTREACHED*/ 2330 } 2331 2332 /* 2333 * Start the delete of the memory from the system. 2334 */ 2335 int 2336 kphysm_del_start( 2337 memhandle_t handle, 2338 void (*complete)(void *, int), 2339 void *complete_arg) 2340 { 2341 struct mem_handle *mhp; 2342 2343 mhp = kphysm_lookup_mem_handle(handle); 2344 if (mhp == NULL) { 2345 return (KPHYSM_EHANDLE); 2346 } 2347 switch (mhp->mh_state) { 2348 case MHND_FREE: 2349 ASSERT(mhp->mh_state != MHND_FREE); 2350 mutex_exit(&mhp->mh_mutex); 2351 return (KPHYSM_EHANDLE); 2352 case MHND_INIT: 2353 break; 2354 case MHND_STARTING: 2355 case MHND_RUNNING: 2356 mutex_exit(&mhp->mh_mutex); 2357 return (KPHYSM_ESEQUENCE); 2358 case MHND_DONE: 2359 mutex_exit(&mhp->mh_mutex); 2360 return (KPHYSM_ESEQUENCE); 2361 case MHND_RELEASE: 2362 mutex_exit(&mhp->mh_mutex); 2363 return (KPHYSM_ESEQUENCE); 2364 default: 2365 #ifdef DEBUG 2366 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2367 (void *)mhp, mhp->mh_state); 2368 #endif /* DEBUG */ 2369 mutex_exit(&mhp->mh_mutex); 2370 return (KPHYSM_EHANDLE); 2371 } 2372 2373 if (mhp->mh_transit.trl_spans == NULL) { 2374 mutex_exit(&mhp->mh_mutex); 2375 return (KPHYSM_ENOWORK); 2376 } 2377 2378 ASSERT(complete != NULL); 2379 mhp->mh_delete_complete = complete; 2380 mhp->mh_delete_complete_arg = complete_arg; 2381 mhp->mh_state = MHND_STARTING; 2382 /* 2383 * Release the mutex in case thread_create sleeps. 2384 */ 2385 mutex_exit(&mhp->mh_mutex); 2386 2387 /* 2388 * The "obvious" process for this thread is pageout (proc_pageout) 2389 * but this gives the thread too much power over freemem 2390 * which results in freemem starvation. 2391 */ 2392 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2393 TS_RUN, maxclsyspri - 1); 2394 2395 return (KPHYSM_OK); 2396 } 2397 2398 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2399 static caddr_t pp_dummy; 2400 static pgcnt_t pp_dummy_npages; 2401 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ 2402 2403 static void 2404 memseg_remap_init_pages(page_t *pages, page_t *epages) 2405 { 2406 page_t *pp; 2407 2408 for (pp = pages; pp < epages; pp++) { 2409 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2410 pp->p_offset = (u_offset_t)-1; 2411 page_iolock_init(pp); 2412 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2413 continue; 2414 page_lock_delete(pp); 2415 } 2416 } 2417 2418 void 2419 memseg_remap_init() 2420 { 2421 mutex_enter(&pp_dummy_lock); 2422 if (pp_dummy == NULL) { 2423 uint_t dpages; 2424 int i; 2425 2426 /* 2427 * dpages starts off as the size of the structure and 2428 * ends up as the minimum number of pages that will 2429 * hold a whole number of page_t structures. 2430 */ 2431 dpages = sizeof (page_t); 2432 ASSERT(dpages != 0); 2433 ASSERT(dpages <= MMU_PAGESIZE); 2434 2435 while ((dpages & 1) == 0) 2436 dpages >>= 1; 2437 2438 pp_dummy_npages = dpages; 2439 /* 2440 * Allocate pp_dummy pages directly from static_arena, 2441 * since these are whole page allocations and are 2442 * referenced by physical address. 
This also has the 2443 * nice fringe benefit of hiding the memory from 2444 * ::findleaks since it doesn't deal well with allocated 2445 * kernel heap memory that doesn't have any mappings. 2446 */ 2447 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2448 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2449 bzero(pp_dummy, ptob(pp_dummy_npages)); 2450 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2451 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2452 pp_dummy_npages, KM_SLEEP); 2453 for (i = 0; i < pp_dummy_npages; i++) { 2454 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2455 &pp_dummy[MMU_PAGESIZE * i]); 2456 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2457 } 2458 /* 2459 * Initialize the page_t's to a known 'deleted' state 2460 * that matches the state of deleted pages. 2461 */ 2462 memseg_remap_init_pages((page_t *)pp_dummy, 2463 (page_t *)(pp_dummy + 2464 ptob(pp_dummy_npages))); 2465 /* Remove kmem mappings for the pages for safety. */ 2466 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2467 HAT_UNLOAD_UNLOCK); 2468 /* Leave pp_dummy pointer set as flag that init is done. */ 2469 } 2470 mutex_exit(&pp_dummy_lock); 2471 } 2472 2473 static void 2474 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs) 2475 { 2476 ASSERT(pp_dummy != NULL); 2477 2478 while (metapgs != 0) { 2479 pgcnt_t n; 2480 int i; 2481 2482 n = pp_dummy_npages; 2483 if (n > metapgs) 2484 n = metapgs; 2485 for (i = 0; i < n; i++) { 2486 hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i], 2487 PROT_READ, 2488 HAT_LOAD | HAT_LOAD_NOCONSIST | 2489 HAT_LOAD_REMAP); 2490 pp += ptob(1); 2491 } 2492 metapgs -= n; 2493 } 2494 } 2495 2496 /* 2497 * Transition all the deleted pages to the deleted state so that 2498 * page_lock will not wait. The page_lock_delete call will 2499 * also wake up any waiters. 2500 */ 2501 static void 2502 memseg_lock_delete_all(struct memseg *seg) 2503 { 2504 page_t *pp; 2505 2506 for (pp = seg->pages; pp < seg->epages; pp++) { 2507 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2508 page_lock_delete(pp); 2509 } 2510 } 2511 2512 static void 2513 kphysm_del_cleanup(struct mem_handle *mhp) 2514 { 2515 struct memdelspan *mdsp; 2516 struct memseg *seg; 2517 struct memseg **segpp; 2518 struct memseg *seglist; 2519 pfn_t p_end; 2520 uint64_t avmem; 2521 pgcnt_t avpgs; 2522 pgcnt_t npgs; 2523 2524 avpgs = mhp->mh_vm_pages; 2525 2526 memsegs_lock(1); 2527 2528 /* 2529 * remove from main segment list. 2530 */ 2531 npgs = 0; 2532 seglist = NULL; 2533 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2534 mdsp = mdsp->mds_next) { 2535 p_end = mdsp->mds_base + mdsp->mds_npgs; 2536 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2537 if (seg->pages_base >= p_end || 2538 seg->pages_end <= mdsp->mds_base) { 2539 /* Span and memseg don't overlap. */ 2540 segpp = &((*segpp)->next); 2541 continue; 2542 } 2543 ASSERT(seg->pages_base >= mdsp->mds_base); 2544 ASSERT(seg->pages_end <= p_end); 2545 2546 PLCNT_MODIFY_MAX(seg->pages_base, 2547 seg->pages_base - seg->pages_end); 2548 2549 /* Hide the memseg from future scans. */ 2550 hat_kpm_delmem_mseg_update(seg, segpp); 2551 *segpp = seg->next; 2552 membar_producer(); /* TODO: Needed? */ 2553 npgs += MSEG_NPAGES(seg); 2554 2555 /* 2556 * Leave the deleted segment's next pointer intact 2557 * in case a memsegs scanning loop is walking this 2558 * segment concurrently. 
2559 */ 2560 seg->lnext = seglist; 2561 seglist = seg; 2562 } 2563 } 2564 2565 build_pfn_hash(); 2566 2567 ASSERT(npgs < total_pages); 2568 total_pages -= npgs; 2569 2570 /* 2571 * Recalculate the paging parameters now total_pages has changed. 2572 * This will also cause the clock hands to be reset before next use. 2573 */ 2574 setupclock(1); 2575 2576 memsegs_unlock(1); 2577 2578 mutex_exit(&mhp->mh_mutex); 2579 2580 while ((seg = seglist) != NULL) { 2581 pfn_t mseg_start; 2582 pfn_t mseg_base, mseg_end; 2583 pgcnt_t mseg_npgs; 2584 page_t *pp; 2585 pgcnt_t metapgs; 2586 int dynamic; 2587 int mlret; 2588 2589 seglist = seg->lnext; 2590 2591 /* 2592 * Put the page_t's into the deleted state to stop 2593 * cv_wait()s on the pages. When we remap, the dummy 2594 * page_t's will be in the same state. 2595 */ 2596 memseg_lock_delete_all(seg); 2597 /* 2598 * Collect up information based on pages_base and pages_end 2599 * early so that we can flag early that the memseg has been 2600 * deleted by setting pages_end == pages_base. 2601 */ 2602 mseg_base = seg->pages_base; 2603 mseg_end = seg->pages_end; 2604 mseg_npgs = MSEG_NPAGES(seg); 2605 dynamic = memseg_is_dynamic(seg, &mseg_start); 2606 2607 seg->pages_end = seg->pages_base; 2608 2609 if (dynamic) { 2610 pp = seg->pages; 2611 metapgs = mseg_base - mseg_start; 2612 ASSERT(metapgs != 0); 2613 2614 /* Remap the meta data to our special dummy area. */ 2615 memseg_remap_to_dummy((caddr_t)pp, metapgs); 2616 2617 mutex_enter(&memseg_lists_lock); 2618 seg->lnext = memseg_va_avail; 2619 memseg_va_avail = seg; 2620 mutex_exit(&memseg_lists_lock); 2621 } else { 2622 /* 2623 * Set for clean-up below. 2624 */ 2625 mseg_start = seg->pages_base; 2626 /* 2627 * For memory whose page_ts were allocated 2628 * at boot, we need to find a new use for 2629 * the page_t memory. 2630 * For the moment, just leak it. 2631 * (It is held in the memseg_delete_junk list.) 2632 */ 2633 2634 mutex_enter(&memseg_lists_lock); 2635 seg->lnext = memseg_delete_junk; 2636 memseg_delete_junk = seg; 2637 mutex_exit(&memseg_lists_lock); 2638 } 2639 2640 /* Must not use seg now as it could be re-used. */ 2641 2642 memlist_write_lock(); 2643 2644 mlret = memlist_delete_span( 2645 (uint64_t)(mseg_base) << PAGESHIFT, 2646 (uint64_t)(mseg_npgs) << PAGESHIFT, 2647 &phys_avail); 2648 ASSERT(mlret == MEML_SPANOP_OK); 2649 2650 mlret = memlist_delete_span( 2651 (uint64_t)(mseg_start) << PAGESHIFT, 2652 (uint64_t)(mseg_end - mseg_start) << 2653 PAGESHIFT, 2654 &phys_install); 2655 ASSERT(mlret == MEML_SPANOP_OK); 2656 phys_install_has_changed(); 2657 2658 memlist_write_unlock(); 2659 } 2660 2661 memlist_read_lock(); 2662 installed_top_size(phys_install, &physmax, &physinstalled); 2663 memlist_read_unlock(); 2664 2665 mutex_enter(&freemem_lock); 2666 maxmem -= avpgs; 2667 physmem -= avpgs; 2668 /* availrmem is adjusted during the delete. 
*/ 2669 availrmem_initial -= avpgs; 2670 2671 mutex_exit(&freemem_lock); 2672 2673 dump_resize(); 2674 2675 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2676 "(0x%" PRIx64 ")\n", 2677 physinstalled << (PAGESHIFT - 10), 2678 (uint64_t)physinstalled << PAGESHIFT); 2679 2680 avmem = (uint64_t)freemem << PAGESHIFT; 2681 cmn_err(CE_CONT, "?kphysm_delete: " 2682 "avail mem = %" PRId64 "\n", avmem); 2683 2684 /* 2685 * Update lgroup generation number on single lgroup systems 2686 */ 2687 if (nlgrps == 1) 2688 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2689 2690 /* Successfully deleted system memory */ 2691 mutex_enter(&mhp->mh_mutex); 2692 } 2693 2694 static uint_t mdel_nullvp_waiter; 2695 2696 static void 2697 page_delete_collect( 2698 page_t *pp, 2699 struct mem_handle *mhp) 2700 { 2701 if (pp->p_vnode) { 2702 page_hashout(pp, (kmutex_t *)NULL); 2703 /* do not do PP_SETAGED(pp); */ 2704 } else { 2705 kmutex_t *sep; 2706 2707 sep = page_se_mutex(pp); 2708 mutex_enter(sep); 2709 if (CV_HAS_WAITERS(&pp->p_cv)) { 2710 mdel_nullvp_waiter++; 2711 cv_broadcast(&pp->p_cv); 2712 } 2713 mutex_exit(sep); 2714 } 2715 ASSERT(pp->p_next == pp->p_prev); 2716 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2717 pp->p_next = mhp->mh_deleted; 2718 mhp->mh_deleted = pp; 2719 ASSERT(mhp->mh_hold_todo != 0); 2720 mhp->mh_hold_todo--; 2721 } 2722 2723 static void 2724 transit_list_collect(struct mem_handle *mhp, int v) 2725 { 2726 struct transit_list_head *trh; 2727 2728 trh = &transit_list_head; 2729 mutex_enter(&trh->trh_lock); 2730 mhp->mh_transit.trl_collect = v; 2731 mutex_exit(&trh->trh_lock); 2732 } 2733 2734 static void 2735 transit_list_insert(struct transit_list *tlp) 2736 { 2737 struct transit_list_head *trh; 2738 2739 trh = &transit_list_head; 2740 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2741 tlp->trl_next = trh->trh_head; 2742 trh->trh_head = tlp; 2743 } 2744 2745 static void 2746 transit_list_remove(struct transit_list *tlp) 2747 { 2748 struct transit_list_head *trh; 2749 struct transit_list **tlpp; 2750 2751 trh = &transit_list_head; 2752 tlpp = &trh->trh_head; 2753 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2754 while (*tlpp != NULL && *tlpp != tlp) 2755 tlpp = &(*tlpp)->trl_next; 2756 ASSERT(*tlpp != NULL); 2757 if (*tlpp == tlp) 2758 *tlpp = tlp->trl_next; 2759 tlp->trl_next = NULL; 2760 } 2761 2762 static struct transit_list * 2763 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2764 { 2765 struct transit_list *tlp; 2766 2767 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2768 struct memdelspan *mdsp; 2769 2770 for (mdsp = tlp->trl_spans; mdsp != NULL; 2771 mdsp = mdsp->mds_next) { 2772 if (pfnum >= mdsp->mds_base && 2773 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2774 return (tlp); 2775 } 2776 } 2777 } 2778 return (NULL); 2779 } 2780 2781 int 2782 pfn_is_being_deleted(pfn_t pfnum) 2783 { 2784 struct transit_list_head *trh; 2785 struct transit_list *tlp; 2786 int ret; 2787 2788 trh = &transit_list_head; 2789 if (trh->trh_head == NULL) 2790 return (0); 2791 2792 mutex_enter(&trh->trh_lock); 2793 tlp = pfnum_to_transit_list(trh, pfnum); 2794 ret = (tlp != NULL && tlp->trl_collect); 2795 mutex_exit(&trh->trh_lock); 2796 2797 return (ret); 2798 } 2799 2800 #ifdef MEM_DEL_STATS 2801 extern int hz; 2802 static void 2803 mem_del_stat_print_func(struct mem_handle *mhp) 2804 { 2805 uint64_t tmp; 2806 2807 if (mem_del_stat_print) { 2808 printf("memory delete loop %x/%x, statistics%s\n", 2809 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2810 
(uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2811 (mhp->mh_cancel ? " (cancelled)" : "")); 2812 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2813 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2814 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2815 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2816 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2817 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2818 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2819 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2820 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2821 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2822 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2823 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2824 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2825 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2826 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2827 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2828 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2829 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2830 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2831 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2832 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2833 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2834 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2835 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2836 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2837 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2838 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2839 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2840 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2841 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2842 printf( 2843 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2844 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2845 2846 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2847 printf( 2848 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2849 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2850 } 2851 } 2852 #endif /* MEM_DEL_STATS */ 2853 2854 struct mem_callback { 2855 kphysm_setup_vector_t *vec; 2856 void *arg; 2857 }; 2858 2859 #define NMEMCALLBACKS 100 2860 2861 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2862 static uint_t nmemcallbacks; 2863 static krwlock_t mem_callback_rwlock; 2864 2865 int 2866 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2867 { 2868 uint_t i, found; 2869 2870 /* 2871 * This test will become more complicated when the version must 2872 * change. 2873 */ 2874 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2875 return (EINVAL); 2876 2877 if (vec->post_add == NULL || vec->pre_del == NULL || 2878 vec->post_del == NULL) 2879 return (EINVAL); 2880 2881 rw_enter(&mem_callback_rwlock, RW_WRITER); 2882 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2883 if (mem_callbacks[i].vec == NULL && found == 0) 2884 found = i + 1; 2885 if (mem_callbacks[i].vec == vec && 2886 mem_callbacks[i].arg == arg) { 2887 #ifdef DEBUG 2888 /* Catch this in DEBUG kernels. 
*/ 2889 cmn_err(CE_WARN, "kphysm_setup_func_register" 2890 "(0x%p, 0x%p) duplicate registration from 0x%p", 2891 (void *)vec, arg, (void *)caller()); 2892 #endif /* DEBUG */ 2893 rw_exit(&mem_callback_rwlock); 2894 return (EEXIST); 2895 } 2896 } 2897 if (found != 0) { 2898 i = found - 1; 2899 } else { 2900 ASSERT(nmemcallbacks < NMEMCALLBACKS); 2901 if (nmemcallbacks == NMEMCALLBACKS) { 2902 rw_exit(&mem_callback_rwlock); 2903 return (ENOMEM); 2904 } 2905 i = nmemcallbacks++; 2906 } 2907 mem_callbacks[i].vec = vec; 2908 mem_callbacks[i].arg = arg; 2909 rw_exit(&mem_callback_rwlock); 2910 return (0); 2911 } 2912 2913 void 2914 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 2915 { 2916 uint_t i; 2917 2918 rw_enter(&mem_callback_rwlock, RW_WRITER); 2919 for (i = 0; i < nmemcallbacks; i++) { 2920 if (mem_callbacks[i].vec == vec && 2921 mem_callbacks[i].arg == arg) { 2922 mem_callbacks[i].vec = NULL; 2923 mem_callbacks[i].arg = NULL; 2924 if (i == (nmemcallbacks - 1)) 2925 nmemcallbacks--; 2926 break; 2927 } 2928 } 2929 rw_exit(&mem_callback_rwlock); 2930 } 2931 2932 static void 2933 kphysm_setup_post_add(pgcnt_t delta_pages) 2934 { 2935 uint_t i; 2936 2937 rw_enter(&mem_callback_rwlock, RW_READER); 2938 for (i = 0; i < nmemcallbacks; i++) { 2939 if (mem_callbacks[i].vec != NULL) { 2940 (*mem_callbacks[i].vec->post_add) 2941 (mem_callbacks[i].arg, delta_pages); 2942 } 2943 } 2944 rw_exit(&mem_callback_rwlock); 2945 } 2946 2947 /* 2948 * Note the locking between pre_del and post_del: The reader lock is held 2949 * between the two calls to stop the set of functions from changing. 2950 */ 2951 2952 static int 2953 kphysm_setup_pre_del(pgcnt_t delta_pages) 2954 { 2955 uint_t i; 2956 int ret; 2957 int aret; 2958 2959 ret = 0; 2960 rw_enter(&mem_callback_rwlock, RW_READER); 2961 for (i = 0; i < nmemcallbacks; i++) { 2962 if (mem_callbacks[i].vec != NULL) { 2963 aret = (*mem_callbacks[i].vec->pre_del) 2964 (mem_callbacks[i].arg, delta_pages); 2965 ret |= aret; 2966 } 2967 } 2968 2969 return (ret); 2970 } 2971 2972 static void 2973 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 2974 { 2975 uint_t i; 2976 2977 for (i = 0; i < nmemcallbacks; i++) { 2978 if (mem_callbacks[i].vec != NULL) { 2979 (*mem_callbacks[i].vec->post_del) 2980 (mem_callbacks[i].arg, delta_pages, cancelled); 2981 } 2982 } 2983 rw_exit(&mem_callback_rwlock); 2984 } 2985 2986 static int 2987 kphysm_split_memseg( 2988 pfn_t base, 2989 pgcnt_t npgs) 2990 { 2991 struct memseg *seg; 2992 struct memseg **segpp; 2993 pgcnt_t size_low, size_high; 2994 struct memseg *seg_low, *seg_mid, *seg_high; 2995 2996 /* 2997 * Lock the memsegs list against other updates now 2998 */ 2999 memsegs_lock(1); 3000 3001 /* 3002 * Find boot time memseg that wholly covers this area. 3003 */ 3004 3005 /* First find the memseg with page 'base' in it. */ 3006 for (segpp = &memsegs; (seg = *segpp) != NULL; 3007 segpp = &((*segpp)->next)) { 3008 if (base >= seg->pages_base && base < seg->pages_end) 3009 break; 3010 } 3011 if (seg == NULL) { 3012 memsegs_unlock(1); 3013 return (0); 3014 } 3015 if (memseg_is_dynamic(seg, (pfn_t *)NULL)) { 3016 memsegs_unlock(1); 3017 return (0); 3018 } 3019 if ((base + npgs) > seg->pages_end) { 3020 memsegs_unlock(1); 3021 return (0); 3022 } 3023 3024 /* 3025 * Work out the size of the two segments that will 3026 * surround the new segment, one for low address 3027 * and one for high. 
3028 */ 3029 ASSERT(base >= seg->pages_base); 3030 size_low = base - seg->pages_base; 3031 ASSERT(seg->pages_end >= (base + npgs)); 3032 size_high = seg->pages_end - (base + npgs); 3033 3034 /* 3035 * Sanity check. 3036 */ 3037 if ((size_low + size_high) == 0) { 3038 memsegs_unlock(1); 3039 return (0); 3040 } 3041 3042 /* 3043 * Allocate the new structures. The old memseg will not be freed 3044 * as there may be a reference to it. 3045 */ 3046 seg_low = NULL; 3047 seg_high = NULL; 3048 3049 if (size_low != 0) { 3050 seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3051 bzero(seg_low, sizeof (struct memseg)); 3052 } 3053 3054 seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3055 bzero(seg_mid, sizeof (struct memseg)); 3056 3057 if (size_high != 0) { 3058 seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3059 bzero(seg_high, sizeof (struct memseg)); 3060 } 3061 3062 /* 3063 * All allocation done now. 3064 */ 3065 if (size_low != 0) { 3066 seg_low->pages = seg->pages; 3067 seg_low->epages = seg_low->pages + size_low; 3068 seg_low->pages_base = seg->pages_base; 3069 seg_low->pages_end = seg_low->pages_base + size_low; 3070 seg_low->next = seg_mid; 3071 } 3072 if (size_high != 0) { 3073 seg_high->pages = seg->epages - size_high; 3074 seg_high->epages = seg_high->pages + size_high; 3075 seg_high->pages_base = seg->pages_end - size_high; 3076 seg_high->pages_end = seg_high->pages_base + size_high; 3077 seg_high->next = seg->next; 3078 } 3079 3080 seg_mid->pages = seg->pages + size_low; 3081 seg_mid->pages_base = seg->pages_base + size_low; 3082 seg_mid->epages = seg->epages - size_high; 3083 seg_mid->pages_end = seg->pages_end - size_high; 3084 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3085 3086 /* 3087 * Update hat_kpm specific info of all involved memsegs and 3088 * allow hat_kpm specific global chain updates. 3089 */ 3090 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3091 3092 /* 3093 * At this point we have two equivalent memseg sub-chains, 3094 * seg and seg_low/seg_mid/seg_high, which both chain on to 3095 * the same place in the global chain. By re-writing the pointer 3096 * in the previous element we switch atomically from using the old 3097 * (seg) to the new. 3098 */ 3099 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3100 3101 membar_enter(); 3102 3103 build_pfn_hash(); 3104 memsegs_unlock(1); 3105 3106 /* 3107 * We leave the old segment, 'seg', intact as there may be 3108 * references to it. Also, as the value of total_pages has not 3109 * changed and the memsegs list is effectively the same when 3110 * accessed via the old or the new pointer, we do not have to 3111 * cause pageout_scanner() to re-evaluate its hand pointers. 3112 * 3113 * We currently do not re-use or reclaim the page_t memory. 3114 * If we do, then this may have to change. 3115 */ 3116 3117 mutex_enter(&memseg_lists_lock); 3118 seg->lnext = memseg_edit_junk; 3119 memseg_edit_junk = seg; 3120 mutex_exit(&memseg_lists_lock); 3121 3122 return (1); 3123 } 3124 3125 /* 3126 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3127 * structure using physical addresses. Therefore a kmem_cache is 3128 * used with KMC_NOHASH to avoid page crossings within a memseg 3129 * structure. KMC_NOHASH requires that no external (outside of 3130 * slab) information is allowed. This, in turn, implies that the 3131 * cache's slabsize must be exactly a single page, since per-slab 3132 * information (e.g. 
the freelist for the slab) is kept at the 3133 * end of the slab, where it is easy to locate. This should be changed 3134 * when a more suitable kmem_cache interface/flag becomes 3135 * available. 3136 */ 3137 void 3138 mem_config_init() 3139 { 3140 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3141 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3142 } 3143
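/*
 * Editorial sketch, not part of the original file: one way a caller
 * might drive the asynchronous delete started by kphysm_del_start()
 * above.  The handle is assumed to have been prepared beforehand via
 * the companion gethandle/span interfaces; mydrv_del_done() and
 * mydrv_statep are hypothetical names.
 */
#if 0
static void
mydrv_del_done(void *arg, int error_code)
{
	/*
	 * Runs in the context of delete_memory_thread() once the
	 * delete finishes.  error_code is KPHYSM_OK on success or
	 * the cancellation/failure code otherwise.
	 */
}

static int
mydrv_start_delete(memhandle_t handle, void *mydrv_statep)
{
	int ret;

	ret = kphysm_del_start(handle, mydrv_del_done, mydrv_statep);
	/*
	 * KPHYSM_ENOWORK means no spans were attached to the handle;
	 * KPHYSM_ESEQUENCE means a delete is already starting,
	 * running or done for this handle.
	 */
	return (ret);
}
#endif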
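/*
 * Editorial sketch, not part of the original file: a client of the
 * kphysm_setup_func_register()/kphysm_setup_func_unregister()
 * interface above supplies a kphysm_setup_vector_t whose post_add,
 * pre_del and post_del entry points are called around dynamic memory
 * adds and deletes.  All mydrv_* names are hypothetical.
 */
#if 0
static void
mydrv_post_add(void *arg, pgcnt_t delta_pages)
{
	/* delta_pages of new memory are now available. */
}

static int
mydrv_pre_del(void *arg, pgcnt_t delta_pages)
{
	/* Return non-zero to veto the proposed delete. */
	return (0);
}

static void
mydrv_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
{
	/* cancelled != 0 means the delete did not complete. */
}

static kphysm_setup_vector_t mydrv_mem_vec;

static int
mydrv_register_mem_callbacks(void *mydrv_softc)
{
	mydrv_mem_vec.version = KPHYSM_SETUP_VECTOR_VERSION;
	mydrv_mem_vec.post_add = mydrv_post_add;
	mydrv_mem_vec.pre_del = mydrv_pre_del;
	mydrv_mem_vec.post_del = mydrv_post_del;

	/* EEXIST indicates a duplicate (vec, arg) registration. */
	return (kphysm_setup_func_register(&mydrv_mem_vec, mydrv_softc));
}

static void
mydrv_unregister_mem_callbacks(void *mydrv_softc)
{
	kphysm_setup_func_unregister(&mydrv_mem_vec, mydrv_softc);
}
#endif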
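/*
 * Editorial sketch, not part of the original file: pfn_is_being_deleted()
 * above lets low-level code avoid relying on a physical page while a
 * collecting delete of its span is in progress.  The surrounding
 * function is hypothetical.
 */
#if 0
static int
mydrv_may_cache_pfn(pfn_t pfnum)
{
	/*
	 * Skip pages that belong to a transit list whose delete is
	 * actively collecting pages; they may disappear underneath us.
	 */
	if (pfn_is_being_deleted(pfnum))
		return (0);
	return (1);
}
#endif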