1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/types.h> 27 #include <sys/cmn_err.h> 28 #include <sys/vmem.h> 29 #include <sys/kmem.h> 30 #include <sys/systm.h> 31 #include <sys/machsystm.h> /* for page_freelist_coalesce() */ 32 #include <sys/errno.h> 33 #include <sys/memnode.h> 34 #include <sys/memlist.h> 35 #include <sys/memlist_impl.h> 36 #include <sys/tuneable.h> 37 #include <sys/proc.h> 38 #include <sys/disp.h> 39 #include <sys/debug.h> 40 #include <sys/vm.h> 41 #include <sys/callb.h> 42 #include <sys/memlist_plat.h> /* for installed_top_size() */ 43 #include <sys/condvar_impl.h> /* for CV_HAS_WAITERS() */ 44 #include <sys/dumphdr.h> /* for dump_resize() */ 45 #include <sys/atomic.h> /* for use in stats collection */ 46 #include <sys/rwlock.h> 47 #include <sys/cpuvar.h> 48 #include <vm/seg_kmem.h> 49 #include <vm/seg_kpm.h> 50 #include <vm/page.h> 51 #include <vm/vm_dep.h> 52 #define SUNDDI_IMPL /* so sunddi.h will not redefine splx() et al */ 53 #include <sys/sunddi.h> 54 #include <sys/mem_config.h> 55 #include <sys/mem_cage.h> 56 #include <sys/lgrp.h> 57 #include <sys/ddi.h> 58 #include <sys/modctl.h> 59 60 extern struct memlist *phys_avail; 61 62 extern void mem_node_add(pfn_t, pfn_t); 63 extern void mem_node_del(pfn_t, pfn_t); 64 65 extern uint_t page_ctrs_adjust(int); 66 static void kphysm_setup_post_add(pgcnt_t); 67 static int kphysm_setup_pre_del(pgcnt_t); 68 static void kphysm_setup_post_del(pgcnt_t, int); 69 70 static int kphysm_split_memseg(pfn_t base, pgcnt_t npgs); 71 72 static int delspan_reserve(pfn_t, pgcnt_t); 73 static void delspan_unreserve(pfn_t, pgcnt_t); 74 75 static kmutex_t memseg_lists_lock; 76 static struct memseg *memseg_va_avail; 77 static struct memseg *memseg_delete_junk; 78 static struct memseg *memseg_edit_junk; 79 void memseg_remap_init(void); 80 static void memseg_remap_to_dummy(caddr_t, pgcnt_t); 81 static void kphysm_addmem_error_undospan(pfn_t, pgcnt_t); 82 static struct memseg *memseg_reuse(pgcnt_t); 83 84 static struct kmem_cache *memseg_cache; 85 86 /* 87 * Add a chunk of memory to the system. page_t's for this memory 88 * are allocated in the first few pages of the chunk. 89 * base: starting PAGESIZE page of new memory. 90 * npgs: length in PAGESIZE pages. 91 * 92 * Adding mem this way doesn't increase the size of the hash tables; 93 * growing them would be too hard. This should be OK, but adding memory 94 * dynamically most likely means more hash misses, since the tables will 95 * be smaller than they otherwise would be. 
96 */ 97 int 98 kphysm_add_memory_dynamic(pfn_t base, pgcnt_t npgs) 99 { 100 page_t *pp; 101 page_t *opp, *oepp; 102 struct memseg *seg; 103 uint64_t avmem; 104 pfn_t pfn; 105 pfn_t pt_base = base; 106 pgcnt_t tpgs = npgs; 107 pgcnt_t metapgs; 108 int exhausted; 109 pfn_t pnum; 110 int mnode; 111 caddr_t vaddr; 112 int reuse; 113 int mlret; 114 void *mapva; 115 pgcnt_t nkpmpgs = 0; 116 offset_t kpm_pages_off; 117 118 cmn_err(CE_CONT, 119 "?kphysm_add_memory_dynamic: adding %ldK at 0x%" PRIx64 "\n", 120 npgs << (PAGESHIFT - 10), (uint64_t)base << PAGESHIFT); 121 122 /* 123 * Add this span in the delete list to prevent interactions. 124 */ 125 if (!delspan_reserve(base, npgs)) { 126 return (KPHYSM_ESPAN); 127 } 128 /* 129 * Check to see if any of the memory span has been added 130 * by trying an add to the installed memory list. This 131 * forms the interlocking process for add. 132 */ 133 134 memlist_write_lock(); 135 136 mlret = memlist_add_span((uint64_t)(pt_base) << PAGESHIFT, 137 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 138 139 if (mlret == MEML_SPANOP_OK) 140 installed_top_size(phys_install, &physmax, &physinstalled); 141 142 memlist_write_unlock(); 143 144 if (mlret != MEML_SPANOP_OK) { 145 if (mlret == MEML_SPANOP_EALLOC) { 146 delspan_unreserve(pt_base, tpgs); 147 return (KPHYSM_ERESOURCE); 148 } else 149 if (mlret == MEML_SPANOP_ESPAN) { 150 delspan_unreserve(pt_base, tpgs); 151 return (KPHYSM_ESPAN); 152 } else { 153 delspan_unreserve(pt_base, tpgs); 154 return (KPHYSM_ERESOURCE); 155 } 156 } 157 158 /* 159 * We store the page_t's for this new memory in the first 160 * few pages of the chunk. Here, we go and get'em ... 161 */ 162 163 /* 164 * The expression after the '-' gives the number of pages 165 * that will fit in the new memory based on a requirement 166 * of (PAGESIZE + sizeof (page_t)) bytes per page. 167 */ 168 metapgs = npgs - (((uint64_t)(npgs) << PAGESHIFT) / 169 (PAGESIZE + sizeof (page_t))); 170 171 npgs -= metapgs; 172 base += metapgs; 173 174 ASSERT(btopr(npgs * sizeof (page_t)) <= metapgs); 175 176 exhausted = (metapgs == 0 || npgs == 0); 177 178 if (kpm_enable && !exhausted) { 179 pgcnt_t start, end, nkpmpgs_prelim; 180 size_t ptsz; 181 182 /* 183 * A viable kpm large page mapping must not overlap two 184 * dynamic memsegs. Therefore the total size is checked 185 * to be at least kpm_pgsz and also whether start and end 186 * points are at least kpm_pgsz aligned. 187 */ 188 if (ptokpmp(tpgs) < 1 || pmodkpmp(pt_base) || 189 pmodkpmp(base + npgs)) { 190 191 kphysm_addmem_error_undospan(pt_base, tpgs); 192 193 /* 194 * There is no specific error code for violating 195 * kpm granularity constraints. 196 */ 197 return (KPHYSM_ENOTVIABLE); 198 } 199 200 start = kpmptop(ptokpmp(base)); 201 end = kpmptop(ptokpmp(base + npgs)); 202 nkpmpgs_prelim = ptokpmp(end - start); 203 ptsz = npgs * sizeof (page_t); 204 metapgs = btopr(ptsz + nkpmpgs_prelim * KPMPAGE_T_SZ); 205 exhausted = (tpgs <= metapgs); 206 if (!exhausted) { 207 npgs = tpgs - metapgs; 208 base = pt_base + metapgs; 209 210 /* final nkpmpgs */ 211 start = kpmptop(ptokpmp(base)); 212 nkpmpgs = ptokpmp(end - start); 213 kpm_pages_off = ptsz + 214 (nkpmpgs_prelim - nkpmpgs) * KPMPAGE_T_SZ; 215 } 216 } 217 218 /* 219 * Is memory area supplied too small? 220 */ 221 if (exhausted) { 222 kphysm_addmem_error_undospan(pt_base, tpgs); 223 224 /* 225 * There is no specific error code for 'too small'. 
226 */ 227 return (KPHYSM_ERESOURCE); 228 } 229 230 /* 231 * We may re-use a previously allocated VA space for the page_ts 232 * eventually, but we need to initialize and lock the pages first. 233 */ 234 235 /* 236 * Get an address in the kernel address map, map 237 * the page_t pages and see if we can touch them. 238 */ 239 240 mapva = vmem_alloc(heap_arena, ptob(metapgs), VM_NOSLEEP); 241 if (mapva == NULL) { 242 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 243 " Can't allocate VA for page_ts"); 244 245 kphysm_addmem_error_undospan(pt_base, tpgs); 246 247 return (KPHYSM_ERESOURCE); 248 } 249 pp = mapva; 250 251 if (physmax < (pt_base + tpgs)) 252 physmax = (pt_base + tpgs); 253 254 /* 255 * In the remapping code we map one page at a time so we must do 256 * the same here to match mapping sizes. 257 */ 258 pfn = pt_base; 259 vaddr = (caddr_t)pp; 260 for (pnum = 0; pnum < metapgs; pnum++) { 261 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 262 PROT_READ | PROT_WRITE, 263 HAT_LOAD | HAT_LOAD_LOCK | HAT_LOAD_NOCONSIST); 264 pfn++; 265 vaddr += ptob(1); 266 } 267 268 if (ddi_peek32((dev_info_t *)NULL, 269 (int32_t *)pp, (int32_t *)0) == DDI_FAILURE) { 270 271 cmn_err(CE_WARN, "kphysm_add_memory_dynamic:" 272 " Can't access pp array at 0x%p [phys 0x%lx]", 273 (void *)pp, pt_base); 274 275 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 276 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 277 278 vmem_free(heap_arena, mapva, ptob(metapgs)); 279 280 kphysm_addmem_error_undospan(pt_base, tpgs); 281 282 return (KPHYSM_EFAULT); 283 } 284 285 /* 286 * Add this memory slice to its memory node translation. 287 * 288 * Note that right now, each node may have only one slice; 289 * this may change with COD or in larger SSM systems with 290 * nested latency groups, so we must not assume that the 291 * node does not yet exist. 292 */ 293 pnum = base + npgs - 1; 294 mem_node_add_slice(base, pnum); 295 296 /* 297 * Allocate or resize page counters as necessary to accommodate 298 * the increase in memory pages. 299 */ 300 mnode = PFN_2_MEM_NODE(pnum); 301 if (page_ctrs_adjust(mnode) != 0) { 302 303 mem_node_pre_del_slice(base, pnum); 304 mem_node_post_del_slice(base, pnum, 0); 305 306 hat_unload(kas.a_hat, (caddr_t)pp, ptob(metapgs), 307 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 308 309 vmem_free(heap_arena, mapva, ptob(metapgs)); 310 311 kphysm_addmem_error_undospan(pt_base, tpgs); 312 313 return (KPHYSM_ERESOURCE); 314 } 315 316 /* 317 * Update the phys_avail memory list. 318 * The phys_install list was done at the start. 319 */ 320 321 memlist_write_lock(); 322 323 mlret = memlist_add_span((uint64_t)(base) << PAGESHIFT, 324 (uint64_t)(npgs) << PAGESHIFT, &phys_avail); 325 ASSERT(mlret == MEML_SPANOP_OK); 326 327 memlist_write_unlock(); 328 329 /* See if we can find a memseg to re-use. */ 330 seg = memseg_reuse(metapgs); 331 332 reuse = (seg != NULL); 333 334 /* 335 * Initialize the memseg structure representing this memory 336 * and add it to the existing list of memsegs. Do some basic 337 * initialization and add the memory to the system. 338 * In order to prevent lock deadlocks, the add_physmem() 339 * code is repeated here, but split into several stages. 
340 */ 341 if (seg == NULL) { 342 seg = kmem_cache_alloc(memseg_cache, KM_SLEEP); 343 bzero(seg, sizeof (struct memseg)); 344 seg->msegflags = MEMSEG_DYNAMIC; 345 seg->pages = pp; 346 } else { 347 /*EMPTY*/ 348 ASSERT(seg->msegflags & MEMSEG_DYNAMIC); 349 } 350 351 seg->epages = seg->pages + npgs; 352 seg->pages_base = base; 353 seg->pages_end = base + npgs; 354 355 /* 356 * Initialize metadata. The page_ts are set to locked state 357 * ready to be freed. 358 */ 359 bzero((caddr_t)pp, ptob(metapgs)); 360 361 pfn = seg->pages_base; 362 /* Save the original pp base in case we reuse a memseg. */ 363 opp = pp; 364 oepp = opp + npgs; 365 for (pp = opp; pp < oepp; pp++) { 366 pp->p_pagenum = pfn; 367 pfn++; 368 page_iolock_init(pp); 369 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 370 continue; 371 pp->p_offset = (u_offset_t)-1; 372 } 373 374 if (reuse) { 375 /* Remap our page_ts to the re-used memseg VA space. */ 376 pfn = pt_base; 377 vaddr = (caddr_t)seg->pages; 378 for (pnum = 0; pnum < metapgs; pnum++) { 379 hat_devload(kas.a_hat, vaddr, ptob(1), pfn, 380 PROT_READ | PROT_WRITE, 381 HAT_LOAD_REMAP | HAT_LOAD | HAT_LOAD_NOCONSIST); 382 pfn++; 383 vaddr += ptob(1); 384 } 385 386 hat_unload(kas.a_hat, (caddr_t)opp, ptob(metapgs), 387 HAT_UNLOAD_UNMAP|HAT_UNLOAD_UNLOCK); 388 389 vmem_free(heap_arena, mapva, ptob(metapgs)); 390 } 391 392 hat_kpm_addmem_mseg_update(seg, nkpmpgs, kpm_pages_off); 393 394 memsegs_lock(1); 395 396 /* 397 * The new memseg is inserted at the beginning of the list. 398 * Not only does this save searching for the tail, but in the 399 * case of a re-used memseg, it solves the problem of what 400 * happens if some process has still got a pointer to the 401 * memseg and follows the next pointer to continue traversing 402 * the memsegs list. 403 */ 404 405 hat_kpm_addmem_mseg_insert(seg); 406 407 seg->next = memsegs; 408 membar_producer(); 409 410 hat_kpm_addmem_memsegs_update(seg); 411 412 memsegs = seg; 413 414 build_pfn_hash(); 415 416 total_pages += npgs; 417 418 /* 419 * Recalculate the paging parameters now total_pages has changed. 420 * This will also cause the clock hands to be reset before next use. 421 */ 422 setupclock(1); 423 424 memsegs_unlock(1); 425 426 PLCNT_MODIFY_MAX(seg->pages_base, (long)npgs); 427 428 /* 429 * Free the pages outside the lock to avoid locking loops. 430 */ 431 for (pp = seg->pages; pp < seg->epages; pp++) { 432 page_free(pp, 1); 433 } 434 435 /* 436 * Now that we've updated the appropriate memory lists we 437 * need to reset a number of globals, since we've increased memory. 438 * Several have already been updated for us as noted above. The 439 * globals we're interested in at this point are: 440 * physmax - highest page frame number. 
441 * physinstalled - number of pages currently installed (done earlier) 442 * maxmem - max free pages in the system 443 * physmem - physical memory pages available 444 * availrmem - real memory available 445 */ 446 447 mutex_enter(&freemem_lock); 448 maxmem += npgs; 449 physmem += npgs; 450 availrmem += npgs; 451 availrmem_initial += npgs; 452 453 mutex_exit(&freemem_lock); 454 455 dump_resize(); 456 457 page_freelist_coalesce_all(mnode); 458 459 kphysm_setup_post_add(npgs); 460 461 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: mem = %ldK " 462 "(0x%" PRIx64 ")\n", 463 physinstalled << (PAGESHIFT - 10), 464 (uint64_t)physinstalled << PAGESHIFT); 465 466 avmem = (uint64_t)freemem << PAGESHIFT; 467 cmn_err(CE_CONT, "?kphysm_add_memory_dynamic: " 468 "avail mem = %" PRId64 "\n", avmem); 469 470 /* 471 * Update lgroup generation number on single lgroup systems 472 */ 473 if (nlgrps == 1) 474 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 475 476 delspan_unreserve(pt_base, tpgs); 477 return (KPHYSM_OK); /* Successfully added system memory */ 478 479 } 480 481 /* 482 * There are various error conditions in kphysm_add_memory_dynamic() 483 * which require a rollback of already changed global state. 484 */ 485 static void 486 kphysm_addmem_error_undospan(pfn_t pt_base, pgcnt_t tpgs) 487 { 488 int mlret; 489 490 /* Unreserve memory span. */ 491 memlist_write_lock(); 492 493 mlret = memlist_delete_span( 494 (uint64_t)(pt_base) << PAGESHIFT, 495 (uint64_t)(tpgs) << PAGESHIFT, &phys_install); 496 497 ASSERT(mlret == MEML_SPANOP_OK); 498 phys_install_has_changed(); 499 installed_top_size(phys_install, &physmax, &physinstalled); 500 501 memlist_write_unlock(); 502 delspan_unreserve(pt_base, tpgs); 503 } 504 505 /* 506 * Only return an available memseg of exactly the right size. 507 * When the meta data area has it's own virtual address space 508 * we will need to manage this more carefully and do best fit 509 * allocations, possibly splitting an available area. 
510 */ 511 static struct memseg * 512 memseg_reuse(pgcnt_t metapgs) 513 { 514 struct memseg **segpp, *seg; 515 516 mutex_enter(&memseg_lists_lock); 517 518 segpp = &memseg_va_avail; 519 for (; (seg = *segpp) != NULL; segpp = &seg->lnext) { 520 caddr_t end; 521 522 if (kpm_enable) 523 end = hat_kpm_mseg_reuse(seg); 524 else 525 end = (caddr_t)seg->epages; 526 527 if (btopr(end - (caddr_t)seg->pages) == metapgs) { 528 *segpp = seg->lnext; 529 seg->lnext = NULL; 530 break; 531 } 532 } 533 mutex_exit(&memseg_lists_lock); 534 535 return (seg); 536 } 537 538 static uint_t handle_gen; 539 540 struct memdelspan { 541 struct memdelspan *mds_next; 542 pfn_t mds_base; 543 pgcnt_t mds_npgs; 544 uint_t *mds_bitmap; 545 uint_t *mds_bitmap_retired; 546 }; 547 548 #define NBPBMW (sizeof (uint_t) * NBBY) 549 #define MDS_BITMAPBYTES(MDSP) \ 550 ((((MDSP)->mds_npgs + NBPBMW - 1) / NBPBMW) * sizeof (uint_t)) 551 552 struct transit_list { 553 struct transit_list *trl_next; 554 struct memdelspan *trl_spans; 555 int trl_collect; 556 }; 557 558 struct transit_list_head { 559 kmutex_t trh_lock; 560 struct transit_list *trh_head; 561 }; 562 563 static struct transit_list_head transit_list_head; 564 565 struct mem_handle; 566 static void transit_list_collect(struct mem_handle *, int); 567 static void transit_list_insert(struct transit_list *); 568 static void transit_list_remove(struct transit_list *); 569 570 #ifdef DEBUG 571 #define MEM_DEL_STATS 572 #endif /* DEBUG */ 573 574 #ifdef MEM_DEL_STATS 575 static int mem_del_stat_print = 0; 576 struct mem_del_stat { 577 uint_t nloop; 578 uint_t need_free; 579 uint_t free_loop; 580 uint_t free_low; 581 uint_t free_failed; 582 uint_t ncheck; 583 uint_t nopaget; 584 uint_t lockfail; 585 uint_t nfree; 586 uint_t nreloc; 587 uint_t nrelocfail; 588 uint_t already_done; 589 uint_t first_notfree; 590 uint_t npplocked; 591 uint_t nlockreloc; 592 uint_t nnorepl; 593 uint_t nmodreloc; 594 uint_t ndestroy; 595 uint_t nputpage; 596 uint_t nnoreclaim; 597 uint_t ndelay; 598 uint_t demotefail; 599 uint64_t nticks_total; 600 uint64_t nticks_pgrp; 601 uint_t retired; 602 uint_t toxic; 603 uint_t failing; 604 uint_t modtoxic; 605 uint_t npplkdtoxic; 606 uint_t gptlmodfail; 607 uint_t gptllckfail; 608 }; 609 /* 610 * The stat values are only incremented in the delete thread 611 * so no locking or atomic required. 612 */ 613 #define MDSTAT_INCR(MHP, FLD) (MHP)->mh_delstat.FLD++ 614 #define MDSTAT_TOTAL(MHP, ntck) ((MHP)->mh_delstat.nticks_total += (ntck)) 615 #define MDSTAT_PGRP(MHP, ntck) ((MHP)->mh_delstat.nticks_pgrp += (ntck)) 616 static void mem_del_stat_print_func(struct mem_handle *); 617 #define MDSTAT_PRINT(MHP) mem_del_stat_print_func((MHP)) 618 #else /* MEM_DEL_STATS */ 619 #define MDSTAT_INCR(MHP, FLD) 620 #define MDSTAT_TOTAL(MHP, ntck) 621 #define MDSTAT_PGRP(MHP, ntck) 622 #define MDSTAT_PRINT(MHP) 623 #endif /* MEM_DEL_STATS */ 624 625 typedef enum mhnd_state {MHND_FREE = 0, MHND_INIT, MHND_STARTING, 626 MHND_RUNNING, MHND_DONE, MHND_RELEASE} mhnd_state_t; 627 628 /* 629 * mh_mutex must be taken to examine or change mh_exthandle and mh_state. 630 * The mutex may not be required for other fields, dependent on mh_state. 
631 */ 632 struct mem_handle { 633 kmutex_t mh_mutex; 634 struct mem_handle *mh_next; 635 memhandle_t mh_exthandle; 636 mhnd_state_t mh_state; 637 struct transit_list mh_transit; 638 pgcnt_t mh_phys_pages; 639 pgcnt_t mh_vm_pages; 640 pgcnt_t mh_hold_todo; 641 void (*mh_delete_complete)(void *, int error); 642 void *mh_delete_complete_arg; 643 volatile uint_t mh_cancel; 644 volatile uint_t mh_dr_aio_cleanup_cancel; 645 volatile uint_t mh_aio_cleanup_done; 646 kcondvar_t mh_cv; 647 kthread_id_t mh_thread_id; 648 page_t *mh_deleted; /* link through p_next */ 649 #ifdef MEM_DEL_STATS 650 struct mem_del_stat mh_delstat; 651 #endif /* MEM_DEL_STATS */ 652 }; 653 654 static struct mem_handle *mem_handle_head; 655 static kmutex_t mem_handle_list_mutex; 656 657 static struct mem_handle * 658 kphysm_allocate_mem_handle() 659 { 660 struct mem_handle *mhp; 661 662 mhp = kmem_zalloc(sizeof (struct mem_handle), KM_SLEEP); 663 mutex_init(&mhp->mh_mutex, NULL, MUTEX_DEFAULT, NULL); 664 mutex_enter(&mem_handle_list_mutex); 665 mutex_enter(&mhp->mh_mutex); 666 /* handle_gen is protected by list mutex. */ 667 mhp->mh_exthandle = (memhandle_t)(uintptr_t)(++handle_gen); 668 mhp->mh_next = mem_handle_head; 669 mem_handle_head = mhp; 670 mutex_exit(&mem_handle_list_mutex); 671 672 return (mhp); 673 } 674 675 static void 676 kphysm_free_mem_handle(struct mem_handle *mhp) 677 { 678 struct mem_handle **mhpp; 679 680 ASSERT(mutex_owned(&mhp->mh_mutex)); 681 ASSERT(mhp->mh_state == MHND_FREE); 682 /* 683 * Exit the mutex to preserve locking order. This is OK 684 * here as once in the FREE state, the handle cannot 685 * be found by a lookup. 686 */ 687 mutex_exit(&mhp->mh_mutex); 688 689 mutex_enter(&mem_handle_list_mutex); 690 mhpp = &mem_handle_head; 691 while (*mhpp != NULL && *mhpp != mhp) 692 mhpp = &(*mhpp)->mh_next; 693 ASSERT(*mhpp == mhp); 694 /* 695 * No need to lock the handle (mh_mutex) as only 696 * mh_next changing and this is the only thread that 697 * can be referncing mhp. 698 */ 699 *mhpp = mhp->mh_next; 700 mutex_exit(&mem_handle_list_mutex); 701 702 mutex_destroy(&mhp->mh_mutex); 703 kmem_free(mhp, sizeof (struct mem_handle)); 704 } 705 706 /* 707 * This function finds the internal mem_handle corresponding to an 708 * external handle and returns it with the mh_mutex held. 709 */ 710 static struct mem_handle * 711 kphysm_lookup_mem_handle(memhandle_t handle) 712 { 713 struct mem_handle *mhp; 714 715 mutex_enter(&mem_handle_list_mutex); 716 for (mhp = mem_handle_head; mhp != NULL; mhp = mhp->mh_next) { 717 if (mhp->mh_exthandle == handle) { 718 mutex_enter(&mhp->mh_mutex); 719 /* 720 * The state of the handle could have been changed 721 * by kphysm_del_release() while waiting for mh_mutex. 722 */ 723 if (mhp->mh_state == MHND_FREE) { 724 mutex_exit(&mhp->mh_mutex); 725 continue; 726 } 727 break; 728 } 729 } 730 mutex_exit(&mem_handle_list_mutex); 731 return (mhp); 732 } 733 734 int 735 kphysm_del_gethandle(memhandle_t *xmhp) 736 { 737 struct mem_handle *mhp; 738 739 mhp = kphysm_allocate_mem_handle(); 740 /* 741 * The handle is allocated using KM_SLEEP, so cannot fail. 742 * If the implementation is changed, the correct error to return 743 * here would be KPHYSM_ENOHANDLES. 
744 */ 745 ASSERT(mhp->mh_state == MHND_FREE); 746 mhp->mh_state = MHND_INIT; 747 *xmhp = mhp->mh_exthandle; 748 mutex_exit(&mhp->mh_mutex); 749 return (KPHYSM_OK); 750 } 751 752 static int 753 overlapping(pfn_t b1, pgcnt_t l1, pfn_t b2, pgcnt_t l2) 754 { 755 pfn_t e1, e2; 756 757 e1 = b1 + l1; 758 e2 = b2 + l2; 759 760 return (!(b2 >= e1 || b1 >= e2)); 761 } 762 763 static int can_remove_pgs(pgcnt_t); 764 765 static struct memdelspan * 766 span_to_install(pfn_t base, pgcnt_t npgs) 767 { 768 struct memdelspan *mdsp; 769 struct memdelspan *mdsp_new; 770 uint64_t address, size, thislen; 771 struct memlist *mlp; 772 773 mdsp_new = NULL; 774 775 address = (uint64_t)base << PAGESHIFT; 776 size = (uint64_t)npgs << PAGESHIFT; 777 while (size != 0) { 778 memlist_read_lock(); 779 for (mlp = phys_install; mlp != NULL; mlp = mlp->next) { 780 if (address >= (mlp->address + mlp->size)) 781 continue; 782 if ((address + size) > mlp->address) 783 break; 784 } 785 if (mlp == NULL) { 786 address += size; 787 size = 0; 788 thislen = 0; 789 } else { 790 if (address < mlp->address) { 791 size -= (mlp->address - address); 792 address = mlp->address; 793 } 794 ASSERT(address >= mlp->address); 795 if ((address + size) > (mlp->address + mlp->size)) { 796 thislen = mlp->size - (address - mlp->address); 797 } else { 798 thislen = size; 799 } 800 } 801 memlist_read_unlock(); 802 /* TODO: phys_install could change now */ 803 if (thislen == 0) 804 continue; 805 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 806 mdsp->mds_base = btop(address); 807 mdsp->mds_npgs = btop(thislen); 808 mdsp->mds_next = mdsp_new; 809 mdsp_new = mdsp; 810 address += thislen; 811 size -= thislen; 812 } 813 return (mdsp_new); 814 } 815 816 static void 817 free_delspans(struct memdelspan *mdsp) 818 { 819 struct memdelspan *amdsp; 820 821 while ((amdsp = mdsp) != NULL) { 822 mdsp = amdsp->mds_next; 823 kmem_free(amdsp, sizeof (struct memdelspan)); 824 } 825 } 826 827 /* 828 * Concatenate lists. No list ordering is required. 829 */ 830 831 static void 832 delspan_concat(struct memdelspan **mdspp, struct memdelspan *mdsp) 833 { 834 while (*mdspp != NULL) 835 mdspp = &(*mdspp)->mds_next; 836 837 *mdspp = mdsp; 838 } 839 840 /* 841 * Given a new list of delspans, check there is no overlap with 842 * all existing span activity (add or delete) and then concatenate 843 * the new spans to the given list. 844 * Return 1 for OK, 0 if overlapping. 
845 */ 846 static int 847 delspan_insert( 848 struct transit_list *my_tlp, 849 struct memdelspan *mdsp_new) 850 { 851 struct transit_list_head *trh; 852 struct transit_list *tlp; 853 int ret; 854 855 trh = &transit_list_head; 856 857 ASSERT(my_tlp != NULL); 858 ASSERT(mdsp_new != NULL); 859 860 ret = 1; 861 mutex_enter(&trh->trh_lock); 862 /* ASSERT(my_tlp->trl_spans == NULL || tlp_in_list(trh, my_tlp)); */ 863 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 864 struct memdelspan *mdsp; 865 866 for (mdsp = tlp->trl_spans; mdsp != NULL; 867 mdsp = mdsp->mds_next) { 868 struct memdelspan *nmdsp; 869 870 for (nmdsp = mdsp_new; nmdsp != NULL; 871 nmdsp = nmdsp->mds_next) { 872 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 873 nmdsp->mds_base, nmdsp->mds_npgs)) { 874 ret = 0; 875 goto done; 876 } 877 } 878 } 879 } 880 done: 881 if (ret != 0) { 882 if (my_tlp->trl_spans == NULL) 883 transit_list_insert(my_tlp); 884 delspan_concat(&my_tlp->trl_spans, mdsp_new); 885 } 886 mutex_exit(&trh->trh_lock); 887 return (ret); 888 } 889 890 static void 891 delspan_remove( 892 struct transit_list *my_tlp, 893 pfn_t base, 894 pgcnt_t npgs) 895 { 896 struct transit_list_head *trh; 897 struct memdelspan *mdsp; 898 899 trh = &transit_list_head; 900 901 ASSERT(my_tlp != NULL); 902 903 mutex_enter(&trh->trh_lock); 904 if ((mdsp = my_tlp->trl_spans) != NULL) { 905 if (npgs == 0) { 906 my_tlp->trl_spans = NULL; 907 free_delspans(mdsp); 908 transit_list_remove(my_tlp); 909 } else { 910 struct memdelspan **prv; 911 912 prv = &my_tlp->trl_spans; 913 while (mdsp != NULL) { 914 pfn_t p_end; 915 916 p_end = mdsp->mds_base + mdsp->mds_npgs; 917 if (mdsp->mds_base >= base && 918 p_end <= (base + npgs)) { 919 *prv = mdsp->mds_next; 920 mdsp->mds_next = NULL; 921 free_delspans(mdsp); 922 } else { 923 prv = &mdsp->mds_next; 924 } 925 mdsp = *prv; 926 } 927 if (my_tlp->trl_spans == NULL) 928 transit_list_remove(my_tlp); 929 } 930 } 931 mutex_exit(&trh->trh_lock); 932 } 933 934 /* 935 * Reserve interface for add to stop delete before add finished. 936 * This list is only accessed through the delspan_insert/remove 937 * functions and so is fully protected by the mutex in struct transit_list. 938 */ 939 940 static struct transit_list reserve_transit; 941 942 static int 943 delspan_reserve(pfn_t base, pgcnt_t npgs) 944 { 945 struct memdelspan *mdsp; 946 int ret; 947 948 mdsp = kmem_zalloc(sizeof (struct memdelspan), KM_SLEEP); 949 mdsp->mds_base = base; 950 mdsp->mds_npgs = npgs; 951 if ((ret = delspan_insert(&reserve_transit, mdsp)) == 0) { 952 free_delspans(mdsp); 953 } 954 return (ret); 955 } 956 957 static void 958 delspan_unreserve(pfn_t base, pgcnt_t npgs) 959 { 960 delspan_remove(&reserve_transit, base, npgs); 961 } 962 963 /* 964 * Return whether memseg was created by kphysm_add_memory_dynamic(). 965 * If this is the case and startp non zero, return also the start pfn 966 * of the meta data via startp. 
967 */ 968 static int 969 memseg_is_dynamic(struct memseg *seg, pfn_t *startp) 970 { 971 pfn_t pt_start; 972 973 if ((seg->msegflags & MEMSEG_DYNAMIC) == 0) 974 return (0); 975 976 /* Meta data is required to be at the beginning */ 977 ASSERT(hat_getpfnum(kas.a_hat, (caddr_t)seg->epages) < seg->pages_base); 978 979 pt_start = hat_getpfnum(kas.a_hat, (caddr_t)seg->pages); 980 if (startp != NULL) 981 *startp = pt_start; 982 983 return (1); 984 } 985 986 int 987 kphysm_del_span( 988 memhandle_t handle, 989 pfn_t base, 990 pgcnt_t npgs) 991 { 992 struct mem_handle *mhp; 993 struct memseg *seg; 994 struct memdelspan *mdsp; 995 struct memdelspan *mdsp_new; 996 pgcnt_t phys_pages, vm_pages; 997 pfn_t p_end; 998 page_t *pp; 999 int ret; 1000 1001 mhp = kphysm_lookup_mem_handle(handle); 1002 if (mhp == NULL) { 1003 return (KPHYSM_EHANDLE); 1004 } 1005 if (mhp->mh_state != MHND_INIT) { 1006 mutex_exit(&mhp->mh_mutex); 1007 return (KPHYSM_ESEQUENCE); 1008 } 1009 1010 /* 1011 * Intersect the span with the installed memory list (phys_install). 1012 */ 1013 mdsp_new = span_to_install(base, npgs); 1014 if (mdsp_new == NULL) { 1015 /* 1016 * No physical memory in this range. Is this an 1017 * error? If an attempt to start the delete is made 1018 * for OK returns from del_span such as this, start will 1019 * return an error. 1020 * Could return KPHYSM_ENOWORK. 1021 */ 1022 /* 1023 * It is assumed that there are no error returns 1024 * from span_to_install() due to kmem_alloc failure. 1025 */ 1026 mutex_exit(&mhp->mh_mutex); 1027 return (KPHYSM_OK); 1028 } 1029 /* 1030 * Does this span overlap an existing span? 1031 */ 1032 if (delspan_insert(&mhp->mh_transit, mdsp_new) == 0) { 1033 /* 1034 * Differentiate between already on list for this handle 1035 * (KPHYSM_EDUP) and busy elsewhere (KPHYSM_EBUSY). 1036 */ 1037 ret = KPHYSM_EBUSY; 1038 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1039 mdsp = mdsp->mds_next) { 1040 if (overlapping(mdsp->mds_base, mdsp->mds_npgs, 1041 base, npgs)) { 1042 ret = KPHYSM_EDUP; 1043 break; 1044 } 1045 } 1046 mutex_exit(&mhp->mh_mutex); 1047 free_delspans(mdsp_new); 1048 return (ret); 1049 } 1050 /* 1051 * At this point the spans in mdsp_new have been inserted into the 1052 * list of spans for this handle and thereby to the global list of 1053 * spans being processed. Each of these spans must now be checked 1054 * for relocatability. As a side-effect segments in the memseg list 1055 * may be split. 1056 * 1057 * Note that mdsp_new can no longer be used as it is now part of 1058 * a larger list. Select elements of this larger list based 1059 * on base and npgs. 1060 */ 1061 restart: 1062 phys_pages = 0; 1063 vm_pages = 0; 1064 ret = KPHYSM_OK; 1065 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1066 mdsp = mdsp->mds_next) { 1067 pgcnt_t pages_checked; 1068 1069 if (!overlapping(mdsp->mds_base, mdsp->mds_npgs, base, npgs)) { 1070 continue; 1071 } 1072 p_end = mdsp->mds_base + mdsp->mds_npgs; 1073 /* 1074 * The pages_checked count is a hack. All pages should be 1075 * checked for relocatability. Those not covered by memsegs 1076 * should be tested with arch_kphysm_del_span_ok(). 1077 */ 1078 pages_checked = 0; 1079 for (seg = memsegs; seg; seg = seg->next) { 1080 pfn_t mseg_start; 1081 1082 if (seg->pages_base >= p_end || 1083 seg->pages_end <= mdsp->mds_base) { 1084 /* Span and memseg don't overlap. */ 1085 continue; 1086 } 1087 /* Check that segment is suitable for delete. 
*/ 1088 if (memseg_is_dynamic(seg, &mseg_start)) { 1089 /* 1090 * Can only delete whole added segments 1091 * for the moment. 1092 * Check that this is completely within the 1093 * span. 1094 */ 1095 if (mseg_start < mdsp->mds_base || 1096 seg->pages_end > p_end) { 1097 ret = KPHYSM_EBUSY; 1098 break; 1099 } 1100 pages_checked += seg->pages_end - mseg_start; 1101 } else { 1102 /* 1103 * Set mseg_start for accounting below. 1104 */ 1105 mseg_start = seg->pages_base; 1106 /* 1107 * If this segment is larger than the span, 1108 * try to split it. After the split, it 1109 * is necessary to restart. 1110 */ 1111 if (seg->pages_base < mdsp->mds_base || 1112 seg->pages_end > p_end) { 1113 pfn_t abase; 1114 pgcnt_t anpgs; 1115 int s_ret; 1116 1117 /* Split required. */ 1118 if (mdsp->mds_base < seg->pages_base) 1119 abase = seg->pages_base; 1120 else 1121 abase = mdsp->mds_base; 1122 if (p_end > seg->pages_end) 1123 anpgs = seg->pages_end - abase; 1124 else 1125 anpgs = p_end - abase; 1126 s_ret = kphysm_split_memseg(abase, 1127 anpgs); 1128 if (s_ret == 0) { 1129 /* Split failed. */ 1130 ret = KPHYSM_ERESOURCE; 1131 break; 1132 } 1133 goto restart; 1134 } 1135 pages_checked += 1136 seg->pages_end - seg->pages_base; 1137 } 1138 /* 1139 * The memseg is wholly within the delete span. 1140 * The individual pages can now be checked. 1141 */ 1142 /* Cage test. */ 1143 for (pp = seg->pages; pp < seg->epages; pp++) { 1144 if (PP_ISNORELOC(pp)) { 1145 ret = KPHYSM_ENONRELOC; 1146 break; 1147 } 1148 } 1149 if (ret != KPHYSM_OK) { 1150 break; 1151 } 1152 phys_pages += (seg->pages_end - mseg_start); 1153 vm_pages += MSEG_NPAGES(seg); 1154 } 1155 if (ret != KPHYSM_OK) 1156 break; 1157 if (pages_checked != mdsp->mds_npgs) { 1158 ret = KPHYSM_ENONRELOC; 1159 break; 1160 } 1161 } 1162 1163 if (ret == KPHYSM_OK) { 1164 mhp->mh_phys_pages += phys_pages; 1165 mhp->mh_vm_pages += vm_pages; 1166 } else { 1167 /* 1168 * Keep holding the mh_mutex to prevent it going away. 1169 */ 1170 delspan_remove(&mhp->mh_transit, base, npgs); 1171 } 1172 mutex_exit(&mhp->mh_mutex); 1173 return (ret); 1174 } 1175 1176 int 1177 kphysm_del_span_query( 1178 pfn_t base, 1179 pgcnt_t npgs, 1180 memquery_t *mqp) 1181 { 1182 struct memdelspan *mdsp; 1183 struct memdelspan *mdsp_new; 1184 int done_first_nonreloc; 1185 1186 mqp->phys_pages = 0; 1187 mqp->managed = 0; 1188 mqp->nonrelocatable = 0; 1189 mqp->first_nonrelocatable = 0; 1190 mqp->last_nonrelocatable = 0; 1191 1192 mdsp_new = span_to_install(base, npgs); 1193 /* 1194 * It is OK to proceed here if mdsp_new == NULL. 1195 */ 1196 done_first_nonreloc = 0; 1197 for (mdsp = mdsp_new; mdsp != NULL; mdsp = mdsp->mds_next) { 1198 pfn_t sbase; 1199 pgcnt_t snpgs; 1200 1201 mqp->phys_pages += mdsp->mds_npgs; 1202 sbase = mdsp->mds_base; 1203 snpgs = mdsp->mds_npgs; 1204 while (snpgs != 0) { 1205 struct memseg *lseg, *seg; 1206 pfn_t p_end; 1207 page_t *pp; 1208 pfn_t mseg_start; 1209 1210 p_end = sbase + snpgs; 1211 /* 1212 * Find the lowest addressed memseg that starts 1213 * after sbase and account for it. 1214 * This is to catch dynamic memsegs whose start 1215 * is hidden. 
1216 */ 1217 seg = NULL; 1218 for (lseg = memsegs; lseg != NULL; lseg = lseg->next) { 1219 if ((lseg->pages_base >= sbase) || 1220 (lseg->pages_base < p_end && 1221 lseg->pages_end > sbase)) { 1222 if (seg == NULL || 1223 seg->pages_base > lseg->pages_base) 1224 seg = lseg; 1225 } 1226 } 1227 if (seg != NULL) { 1228 if (!memseg_is_dynamic(seg, &mseg_start)) { 1229 mseg_start = seg->pages_base; 1230 } 1231 /* 1232 * Now have the full extent of the memseg so 1233 * do the range check. 1234 */ 1235 if (mseg_start >= p_end || 1236 seg->pages_end <= sbase) { 1237 /* Span does not overlap memseg. */ 1238 seg = NULL; 1239 } 1240 } 1241 /* 1242 * Account for gap either before the segment if 1243 * there is one or to the end of the span. 1244 */ 1245 if (seg == NULL || mseg_start > sbase) { 1246 pfn_t a_end; 1247 1248 a_end = (seg == NULL) ? p_end : mseg_start; 1249 /* 1250 * Check with arch layer for relocatability. 1251 */ 1252 if (arch_kphysm_del_span_ok(sbase, 1253 (a_end - sbase))) { 1254 /* 1255 * No non-relocatble pages in this 1256 * area, avoid the fine-grained 1257 * test. 1258 */ 1259 snpgs -= (a_end - sbase); 1260 sbase = a_end; 1261 } 1262 while (sbase < a_end) { 1263 if (!arch_kphysm_del_span_ok(sbase, 1264 1)) { 1265 mqp->nonrelocatable++; 1266 if (!done_first_nonreloc) { 1267 mqp-> 1268 first_nonrelocatable 1269 = sbase; 1270 done_first_nonreloc = 1; 1271 } 1272 mqp->last_nonrelocatable = 1273 sbase; 1274 } 1275 sbase++; 1276 snpgs--; 1277 } 1278 } 1279 if (seg != NULL) { 1280 ASSERT(mseg_start <= sbase); 1281 if (seg->pages_base != mseg_start && 1282 seg->pages_base > sbase) { 1283 pgcnt_t skip_pgs; 1284 1285 /* 1286 * Skip the page_t area of a 1287 * dynamic memseg. 1288 */ 1289 skip_pgs = seg->pages_base - sbase; 1290 if (snpgs <= skip_pgs) { 1291 sbase += snpgs; 1292 snpgs = 0; 1293 continue; 1294 } 1295 snpgs -= skip_pgs; 1296 sbase += skip_pgs; 1297 } 1298 ASSERT(snpgs != 0); 1299 ASSERT(seg->pages_base <= sbase); 1300 /* 1301 * The individual pages can now be checked. 
1302 */ 1303 for (pp = seg->pages + 1304 (sbase - seg->pages_base); 1305 snpgs != 0 && pp < seg->epages; pp++) { 1306 mqp->managed++; 1307 if (PP_ISNORELOC(pp)) { 1308 mqp->nonrelocatable++; 1309 if (!done_first_nonreloc) { 1310 mqp-> 1311 first_nonrelocatable 1312 = sbase; 1313 done_first_nonreloc = 1; 1314 } 1315 mqp->last_nonrelocatable = 1316 sbase; 1317 } 1318 sbase++; 1319 snpgs--; 1320 } 1321 } 1322 } 1323 } 1324 1325 free_delspans(mdsp_new); 1326 1327 return (KPHYSM_OK); 1328 } 1329 1330 /* 1331 * This release function can be called at any stage as follows: 1332 * _gethandle only called 1333 * _span(s) only called 1334 * _start called but failed 1335 * delete thread exited 1336 */ 1337 int 1338 kphysm_del_release(memhandle_t handle) 1339 { 1340 struct mem_handle *mhp; 1341 1342 mhp = kphysm_lookup_mem_handle(handle); 1343 if (mhp == NULL) { 1344 return (KPHYSM_EHANDLE); 1345 } 1346 switch (mhp->mh_state) { 1347 case MHND_STARTING: 1348 case MHND_RUNNING: 1349 mutex_exit(&mhp->mh_mutex); 1350 return (KPHYSM_ENOTFINISHED); 1351 case MHND_FREE: 1352 ASSERT(mhp->mh_state != MHND_FREE); 1353 mutex_exit(&mhp->mh_mutex); 1354 return (KPHYSM_EHANDLE); 1355 case MHND_INIT: 1356 break; 1357 case MHND_DONE: 1358 break; 1359 case MHND_RELEASE: 1360 mutex_exit(&mhp->mh_mutex); 1361 return (KPHYSM_ESEQUENCE); 1362 default: 1363 #ifdef DEBUG 1364 cmn_err(CE_WARN, "kphysm_del_release(0x%p) state corrupt %d", 1365 (void *)mhp, mhp->mh_state); 1366 #endif /* DEBUG */ 1367 mutex_exit(&mhp->mh_mutex); 1368 return (KPHYSM_EHANDLE); 1369 } 1370 /* 1371 * Set state so that we can wait if necessary. 1372 * Also this means that we have read/write access to all 1373 * fields except mh_exthandle and mh_state. 1374 */ 1375 mhp->mh_state = MHND_RELEASE; 1376 /* 1377 * The mem_handle cannot be de-allocated by any other operation 1378 * now, so no need to hold mh_mutex. 1379 */ 1380 mutex_exit(&mhp->mh_mutex); 1381 1382 delspan_remove(&mhp->mh_transit, 0, 0); 1383 mhp->mh_phys_pages = 0; 1384 mhp->mh_vm_pages = 0; 1385 mhp->mh_hold_todo = 0; 1386 mhp->mh_delete_complete = NULL; 1387 mhp->mh_delete_complete_arg = NULL; 1388 mhp->mh_cancel = 0; 1389 1390 mutex_enter(&mhp->mh_mutex); 1391 ASSERT(mhp->mh_state == MHND_RELEASE); 1392 mhp->mh_state = MHND_FREE; 1393 1394 kphysm_free_mem_handle(mhp); 1395 1396 return (KPHYSM_OK); 1397 } 1398 1399 /* 1400 * This cancel function can only be called with the thread running. 1401 */ 1402 int 1403 kphysm_del_cancel(memhandle_t handle) 1404 { 1405 struct mem_handle *mhp; 1406 1407 mhp = kphysm_lookup_mem_handle(handle); 1408 if (mhp == NULL) { 1409 return (KPHYSM_EHANDLE); 1410 } 1411 if (mhp->mh_state != MHND_STARTING && mhp->mh_state != MHND_RUNNING) { 1412 mutex_exit(&mhp->mh_mutex); 1413 return (KPHYSM_ENOTRUNNING); 1414 } 1415 /* 1416 * Set the cancel flag and wake the delete thread up. 1417 * The thread may be waiting on I/O, so the effect of the cancel 1418 * may be delayed. 1419 */ 1420 if (mhp->mh_cancel == 0) { 1421 mhp->mh_cancel = KPHYSM_ECANCELLED; 1422 cv_signal(&mhp->mh_cv); 1423 } 1424 mutex_exit(&mhp->mh_mutex); 1425 return (KPHYSM_OK); 1426 } 1427 1428 int 1429 kphysm_del_status( 1430 memhandle_t handle, 1431 memdelstat_t *mdstp) 1432 { 1433 struct mem_handle *mhp; 1434 1435 mhp = kphysm_lookup_mem_handle(handle); 1436 if (mhp == NULL) { 1437 return (KPHYSM_EHANDLE); 1438 } 1439 /* 1440 * Calling kphysm_del_status() is allowed before the delete 1441 * is started to allow for status display. 
1442 */ 1443 if (mhp->mh_state != MHND_INIT && mhp->mh_state != MHND_STARTING && 1444 mhp->mh_state != MHND_RUNNING) { 1445 mutex_exit(&mhp->mh_mutex); 1446 return (KPHYSM_ENOTRUNNING); 1447 } 1448 mdstp->phys_pages = mhp->mh_phys_pages; 1449 mdstp->managed = mhp->mh_vm_pages; 1450 mdstp->collected = mhp->mh_vm_pages - mhp->mh_hold_todo; 1451 mutex_exit(&mhp->mh_mutex); 1452 return (KPHYSM_OK); 1453 } 1454 1455 static int mem_delete_additional_pages = 100; 1456 1457 static int 1458 can_remove_pgs(pgcnt_t npgs) 1459 { 1460 /* 1461 * If all pageable pages were paged out, freemem would 1462 * equal availrmem. There is a minimum requirement for 1463 * availrmem. 1464 */ 1465 if ((availrmem - (tune.t_minarmem + mem_delete_additional_pages)) 1466 < npgs) 1467 return (0); 1468 /* TODO: check swap space, etc. */ 1469 return (1); 1470 } 1471 1472 static int 1473 get_availrmem(pgcnt_t npgs) 1474 { 1475 int ret; 1476 1477 mutex_enter(&freemem_lock); 1478 ret = can_remove_pgs(npgs); 1479 if (ret != 0) 1480 availrmem -= npgs; 1481 mutex_exit(&freemem_lock); 1482 return (ret); 1483 } 1484 1485 static void 1486 put_availrmem(pgcnt_t npgs) 1487 { 1488 mutex_enter(&freemem_lock); 1489 availrmem += npgs; 1490 mutex_exit(&freemem_lock); 1491 } 1492 1493 #define FREEMEM_INCR 100 1494 static pgcnt_t freemem_incr = FREEMEM_INCR; 1495 #define DEL_FREE_WAIT_FRAC 4 1496 #define DEL_FREE_WAIT_TICKS ((hz+DEL_FREE_WAIT_FRAC-1)/DEL_FREE_WAIT_FRAC) 1497 1498 #define DEL_BUSY_WAIT_FRAC 20 1499 #define DEL_BUSY_WAIT_TICKS ((hz+DEL_BUSY_WAIT_FRAC-1)/DEL_BUSY_WAIT_FRAC) 1500 1501 static void kphysm_del_cleanup(struct mem_handle *); 1502 1503 static void page_delete_collect(page_t *, struct mem_handle *); 1504 1505 static pgcnt_t 1506 delthr_get_freemem(struct mem_handle *mhp) 1507 { 1508 pgcnt_t free_get; 1509 int ret; 1510 1511 ASSERT(MUTEX_HELD(&mhp->mh_mutex)); 1512 1513 MDSTAT_INCR(mhp, need_free); 1514 /* 1515 * Get up to freemem_incr pages. 1516 */ 1517 free_get = freemem_incr; 1518 if (free_get > mhp->mh_hold_todo) 1519 free_get = mhp->mh_hold_todo; 1520 /* 1521 * Take free_get pages away from freemem, 1522 * waiting if necessary. 1523 */ 1524 1525 while (!mhp->mh_cancel) { 1526 mutex_exit(&mhp->mh_mutex); 1527 MDSTAT_INCR(mhp, free_loop); 1528 /* 1529 * Duplicate test from page_create_throttle() 1530 * but don't override with !PG_WAIT. 1531 */ 1532 if (freemem < (free_get + throttlefree)) { 1533 MDSTAT_INCR(mhp, free_low); 1534 ret = 0; 1535 } else { 1536 ret = page_create_wait(free_get, 0); 1537 if (ret == 0) { 1538 /* EMPTY */ 1539 MDSTAT_INCR(mhp, free_failed); 1540 } 1541 } 1542 if (ret != 0) { 1543 mutex_enter(&mhp->mh_mutex); 1544 return (free_get); 1545 } 1546 1547 /* 1548 * Put pressure on pageout. 1549 */ 1550 page_needfree(free_get); 1551 cv_signal(&proc_pageout->p_cv); 1552 1553 mutex_enter(&mhp->mh_mutex); 1554 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 1555 (lbolt + DEL_FREE_WAIT_TICKS)); 1556 mutex_exit(&mhp->mh_mutex); 1557 page_needfree(-(spgcnt_t)free_get); 1558 1559 mutex_enter(&mhp->mh_mutex); 1560 } 1561 return (0); 1562 } 1563 1564 #define DR_AIO_CLEANUP_DELAY 25000 /* 0.025secs, in usec */ 1565 #define DR_AIO_CLEANUP_MAXLOOPS_NODELAY 100 1566 /* 1567 * This function is run as a helper thread for delete_memory_thread. 1568 * It is needed in order to force kaio cleanup, so that pages used in kaio 1569 * will be unlocked and subsequently relocated by delete_memory_thread. 
1570 * The address of the delete_memory_threads's mem_handle is passed in to 1571 * this thread function, and is used to set the mh_aio_cleanup_done member 1572 * prior to calling thread_exit(). 1573 */ 1574 static void 1575 dr_aio_cleanup_thread(caddr_t amhp) 1576 { 1577 proc_t *procp; 1578 int (*aio_cleanup_dr_delete_memory)(proc_t *); 1579 int cleaned; 1580 int n = 0; 1581 struct mem_handle *mhp; 1582 volatile uint_t *pcancel; 1583 1584 mhp = (struct mem_handle *)amhp; 1585 ASSERT(mhp != NULL); 1586 pcancel = &mhp->mh_dr_aio_cleanup_cancel; 1587 if (modload("sys", "kaio") == -1) { 1588 mhp->mh_aio_cleanup_done = 1; 1589 cmn_err(CE_WARN, "dr_aio_cleanup_thread: cannot load kaio"); 1590 thread_exit(); 1591 } 1592 aio_cleanup_dr_delete_memory = (int (*)(proc_t *)) 1593 modgetsymvalue("aio_cleanup_dr_delete_memory", 0); 1594 if (aio_cleanup_dr_delete_memory == NULL) { 1595 mhp->mh_aio_cleanup_done = 1; 1596 cmn_err(CE_WARN, 1597 "aio_cleanup_dr_delete_memory not found in kaio"); 1598 thread_exit(); 1599 } 1600 do { 1601 cleaned = 0; 1602 mutex_enter(&pidlock); 1603 for (procp = practive; (*pcancel == 0) && (procp != NULL); 1604 procp = procp->p_next) { 1605 mutex_enter(&procp->p_lock); 1606 if (procp->p_aio != NULL) { 1607 /* cleanup proc's outstanding kaio */ 1608 cleaned += 1609 (*aio_cleanup_dr_delete_memory)(procp); 1610 } 1611 mutex_exit(&procp->p_lock); 1612 } 1613 mutex_exit(&pidlock); 1614 if ((*pcancel == 0) && 1615 (!cleaned || (++n == DR_AIO_CLEANUP_MAXLOOPS_NODELAY))) { 1616 /* delay a bit before retrying all procs again */ 1617 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 1618 n = 0; 1619 } 1620 } while (*pcancel == 0); 1621 mhp->mh_aio_cleanup_done = 1; 1622 thread_exit(); 1623 } 1624 1625 static void 1626 delete_memory_thread(caddr_t amhp) 1627 { 1628 struct mem_handle *mhp; 1629 struct memdelspan *mdsp; 1630 callb_cpr_t cprinfo; 1631 page_t *pp_targ; 1632 spgcnt_t freemem_left; 1633 void (*del_complete_funcp)(void *, int error); 1634 void *del_complete_arg; 1635 int comp_code; 1636 int ret; 1637 int first_scan; 1638 uint_t szc; 1639 #ifdef MEM_DEL_STATS 1640 uint64_t start_total, ntick_total; 1641 uint64_t start_pgrp, ntick_pgrp; 1642 #endif /* MEM_DEL_STATS */ 1643 1644 mhp = (struct mem_handle *)amhp; 1645 1646 #ifdef MEM_DEL_STATS 1647 start_total = ddi_get_lbolt(); 1648 #endif /* MEM_DEL_STATS */ 1649 1650 CALLB_CPR_INIT(&cprinfo, &mhp->mh_mutex, 1651 callb_generic_cpr, "memdel"); 1652 1653 mutex_enter(&mhp->mh_mutex); 1654 ASSERT(mhp->mh_state == MHND_STARTING); 1655 1656 mhp->mh_state = MHND_RUNNING; 1657 mhp->mh_thread_id = curthread; 1658 1659 mhp->mh_hold_todo = mhp->mh_vm_pages; 1660 mutex_exit(&mhp->mh_mutex); 1661 1662 /* Allocate the remap pages now, if necessary. */ 1663 memseg_remap_init(); 1664 1665 /* 1666 * Subtract from availrmem now if possible as availrmem 1667 * may not be available by the end of the delete. 
1668 */ 1669 if (!get_availrmem(mhp->mh_vm_pages)) { 1670 comp_code = KPHYSM_ENOTVIABLE; 1671 mutex_enter(&mhp->mh_mutex); 1672 goto early_exit; 1673 } 1674 1675 ret = kphysm_setup_pre_del(mhp->mh_vm_pages); 1676 1677 mutex_enter(&mhp->mh_mutex); 1678 1679 if (ret != 0) { 1680 mhp->mh_cancel = KPHYSM_EREFUSED; 1681 goto refused; 1682 } 1683 1684 transit_list_collect(mhp, 1); 1685 1686 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 1687 mdsp = mdsp->mds_next) { 1688 ASSERT(mdsp->mds_bitmap == NULL); 1689 mdsp->mds_bitmap = kmem_zalloc(MDS_BITMAPBYTES(mdsp), KM_SLEEP); 1690 mdsp->mds_bitmap_retired = kmem_zalloc(MDS_BITMAPBYTES(mdsp), 1691 KM_SLEEP); 1692 } 1693 1694 first_scan = 1; 1695 freemem_left = 0; 1696 /* 1697 * Start dr_aio_cleanup_thread, which periodically iterates 1698 * through the process list and invokes aio cleanup. This 1699 * is needed in order to avoid a deadly embrace between the 1700 * delete_memory_thread (waiting on writer lock for page, with the 1701 * exclusive-wanted bit set), kaio read request threads (waiting for a 1702 * reader lock on the same page that is wanted by the 1703 * delete_memory_thread), and threads waiting for kaio completion 1704 * (blocked on spt_amp->lock). 1705 */ 1706 mhp->mh_dr_aio_cleanup_cancel = 0; 1707 mhp->mh_aio_cleanup_done = 0; 1708 (void) thread_create(NULL, 0, dr_aio_cleanup_thread, 1709 (caddr_t)mhp, 0, &p0, TS_RUN, maxclsyspri - 1); 1710 while ((mhp->mh_hold_todo != 0) && (mhp->mh_cancel == 0)) { 1711 pgcnt_t collected; 1712 1713 MDSTAT_INCR(mhp, nloop); 1714 collected = 0; 1715 for (mdsp = mhp->mh_transit.trl_spans; (mdsp != NULL) && 1716 (mhp->mh_cancel == 0); mdsp = mdsp->mds_next) { 1717 pfn_t pfn, p_end; 1718 1719 if (first_scan) { 1720 mem_node_pre_del_slice(mdsp->mds_base, 1721 mdsp->mds_base + mdsp->mds_npgs - 1); 1722 } 1723 1724 p_end = mdsp->mds_base + mdsp->mds_npgs; 1725 for (pfn = mdsp->mds_base; (pfn < p_end) && 1726 (mhp->mh_cancel == 0); pfn++) { 1727 page_t *pp, *tpp, *tpp_targ; 1728 pgcnt_t bit; 1729 struct vnode *vp; 1730 u_offset_t offset; 1731 int mod, result; 1732 spgcnt_t pgcnt; 1733 1734 bit = pfn - mdsp->mds_base; 1735 if ((mdsp->mds_bitmap[bit / NBPBMW] & 1736 (1 << (bit % NBPBMW))) != 0) { 1737 MDSTAT_INCR(mhp, already_done); 1738 continue; 1739 } 1740 if (freemem_left == 0) { 1741 freemem_left += delthr_get_freemem(mhp); 1742 if (freemem_left == 0) 1743 break; 1744 } 1745 1746 /* 1747 * Release mh_mutex - some of this 1748 * stuff takes some time (eg PUTPAGE). 1749 */ 1750 1751 mutex_exit(&mhp->mh_mutex); 1752 MDSTAT_INCR(mhp, ncheck); 1753 1754 pp = page_numtopp_nolock(pfn); 1755 if (pp == NULL) { 1756 /* 1757 * Not covered by a page_t - will 1758 * be dealt with elsewhere. 1759 */ 1760 MDSTAT_INCR(mhp, nopaget); 1761 mutex_enter(&mhp->mh_mutex); 1762 mdsp->mds_bitmap[bit / NBPBMW] |= 1763 (1 << (bit % NBPBMW)); 1764 continue; 1765 } 1766 1767 if (!page_try_reclaim_lock(pp, SE_EXCL, 1768 SE_EXCL_WANTED | SE_RETIRED)) { 1769 /* 1770 * Page in use elsewhere. Skip it. 1771 */ 1772 MDSTAT_INCR(mhp, lockfail); 1773 mutex_enter(&mhp->mh_mutex); 1774 continue; 1775 } 1776 /* 1777 * See if the cage expanded into the delete. 1778 * This can happen as we have to allow the 1779 * cage to expand. 1780 */ 1781 if (PP_ISNORELOC(pp)) { 1782 page_unlock(pp); 1783 mutex_enter(&mhp->mh_mutex); 1784 mhp->mh_cancel = KPHYSM_ENONRELOC; 1785 break; 1786 } 1787 if (PP_RETIRED(pp)) { 1788 /* 1789 * Page has been retired and is 1790 * not part of the cage so we 1791 * can now do the accounting for 1792 * it. 
1793 */ 1794 MDSTAT_INCR(mhp, retired); 1795 mutex_enter(&mhp->mh_mutex); 1796 mdsp->mds_bitmap[bit / NBPBMW] 1797 |= (1 << (bit % NBPBMW)); 1798 mdsp->mds_bitmap_retired[bit / 1799 NBPBMW] |= 1800 (1 << (bit % NBPBMW)); 1801 mhp->mh_hold_todo--; 1802 continue; 1803 } 1804 ASSERT(freemem_left != 0); 1805 if (PP_ISFREE(pp)) { 1806 /* 1807 * Like page_reclaim() only 'freemem' 1808 * processing is already done. 1809 */ 1810 MDSTAT_INCR(mhp, nfree); 1811 free_page_collect: 1812 if (PP_ISAGED(pp)) { 1813 page_list_sub(pp, 1814 PG_FREE_LIST); 1815 } else { 1816 page_list_sub(pp, 1817 PG_CACHE_LIST); 1818 } 1819 PP_CLRFREE(pp); 1820 PP_CLRAGED(pp); 1821 collected++; 1822 mutex_enter(&mhp->mh_mutex); 1823 page_delete_collect(pp, mhp); 1824 mdsp->mds_bitmap[bit / NBPBMW] |= 1825 (1 << (bit % NBPBMW)); 1826 freemem_left--; 1827 continue; 1828 } 1829 ASSERT(pp->p_vnode != NULL); 1830 if (first_scan) { 1831 MDSTAT_INCR(mhp, first_notfree); 1832 page_unlock(pp); 1833 mutex_enter(&mhp->mh_mutex); 1834 continue; 1835 } 1836 /* 1837 * Keep stats on pages encountered that 1838 * are marked for retirement. 1839 */ 1840 if (PP_TOXIC(pp)) { 1841 MDSTAT_INCR(mhp, toxic); 1842 } else if (PP_PR_REQ(pp)) { 1843 MDSTAT_INCR(mhp, failing); 1844 } 1845 /* 1846 * In certain cases below, special exceptions 1847 * are made for pages that are toxic. This 1848 * is because the current meaning of toxic 1849 * is that an uncorrectable error has been 1850 * previously associated with the page. 1851 */ 1852 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1853 if (!PP_TOXIC(pp)) { 1854 /* 1855 * Must relocate locked in 1856 * memory pages. 1857 */ 1858 #ifdef MEM_DEL_STATS 1859 start_pgrp = ddi_get_lbolt(); 1860 #endif /* MEM_DEL_STATS */ 1861 /* 1862 * Lock all constituent pages 1863 * of a large page to ensure 1864 * that p_szc won't change. 1865 */ 1866 if (!group_page_trylock(pp, 1867 SE_EXCL)) { 1868 MDSTAT_INCR(mhp, 1869 gptllckfail); 1870 page_unlock(pp); 1871 mutex_enter( 1872 &mhp->mh_mutex); 1873 continue; 1874 } 1875 MDSTAT_INCR(mhp, npplocked); 1876 pp_targ = 1877 page_get_replacement_page( 1878 pp, NULL, 0); 1879 if (pp_targ != NULL) { 1880 #ifdef MEM_DEL_STATS 1881 ntick_pgrp = 1882 (uint64_t) 1883 ddi_get_lbolt() - 1884 start_pgrp; 1885 #endif /* MEM_DEL_STATS */ 1886 MDSTAT_PGRP(mhp, 1887 ntick_pgrp); 1888 MDSTAT_INCR(mhp, 1889 nlockreloc); 1890 goto reloc; 1891 } 1892 group_page_unlock(pp); 1893 page_unlock(pp); 1894 #ifdef MEM_DEL_STATS 1895 ntick_pgrp = 1896 (uint64_t)ddi_get_lbolt() - 1897 start_pgrp; 1898 #endif /* MEM_DEL_STATS */ 1899 MDSTAT_PGRP(mhp, ntick_pgrp); 1900 MDSTAT_INCR(mhp, nnorepl); 1901 mutex_enter(&mhp->mh_mutex); 1902 continue; 1903 } else { 1904 /* 1905 * Cannot do anything about 1906 * this page because it is 1907 * toxic. 1908 */ 1909 MDSTAT_INCR(mhp, npplkdtoxic); 1910 page_unlock(pp); 1911 mutex_enter(&mhp->mh_mutex); 1912 continue; 1913 } 1914 } 1915 /* 1916 * Unload the mappings and check if mod bit 1917 * is set. 1918 */ 1919 ASSERT(!PP_ISKAS(pp)); 1920 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1921 mod = hat_ismod(pp); 1922 1923 #ifdef MEM_DEL_STATS 1924 start_pgrp = ddi_get_lbolt(); 1925 #endif /* MEM_DEL_STATS */ 1926 if (mod && !PP_TOXIC(pp)) { 1927 /* 1928 * Lock all constituent pages 1929 * of a large page to ensure 1930 * that p_szc won't change. 
1931 */ 1932 if (!group_page_trylock(pp, SE_EXCL)) { 1933 MDSTAT_INCR(mhp, gptlmodfail); 1934 page_unlock(pp); 1935 mutex_enter(&mhp->mh_mutex); 1936 continue; 1937 } 1938 pp_targ = page_get_replacement_page(pp, 1939 NULL, 0); 1940 if (pp_targ != NULL) { 1941 MDSTAT_INCR(mhp, nmodreloc); 1942 #ifdef MEM_DEL_STATS 1943 ntick_pgrp = 1944 (uint64_t)ddi_get_lbolt() - 1945 start_pgrp; 1946 #endif /* MEM_DEL_STATS */ 1947 MDSTAT_PGRP(mhp, ntick_pgrp); 1948 goto reloc; 1949 } 1950 group_page_unlock(pp); 1951 } 1952 1953 if (!page_try_demote_pages(pp)) { 1954 MDSTAT_INCR(mhp, demotefail); 1955 page_unlock(pp); 1956 #ifdef MEM_DEL_STATS 1957 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1958 start_pgrp; 1959 #endif /* MEM_DEL_STATS */ 1960 MDSTAT_PGRP(mhp, ntick_pgrp); 1961 mutex_enter(&mhp->mh_mutex); 1962 continue; 1963 } 1964 1965 /* 1966 * Regular 'page-out'. 1967 */ 1968 if (!mod) { 1969 MDSTAT_INCR(mhp, ndestroy); 1970 page_destroy(pp, 1); 1971 /* 1972 * page_destroy was called with 1973 * dontfree. As long as p_lckcnt 1974 * and p_cowcnt are both zero, the 1975 * only additional action of 1976 * page_destroy with !dontfree is to 1977 * call page_free, so we can collect 1978 * the page here. 1979 */ 1980 collected++; 1981 #ifdef MEM_DEL_STATS 1982 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 1983 start_pgrp; 1984 #endif /* MEM_DEL_STATS */ 1985 MDSTAT_PGRP(mhp, ntick_pgrp); 1986 mutex_enter(&mhp->mh_mutex); 1987 page_delete_collect(pp, mhp); 1988 mdsp->mds_bitmap[bit / NBPBMW] |= 1989 (1 << (bit % NBPBMW)); 1990 continue; 1991 } 1992 /* 1993 * The page is toxic and the mod bit is 1994 * set, we cannot do anything here to deal 1995 * with it. 1996 */ 1997 if (PP_TOXIC(pp)) { 1998 page_unlock(pp); 1999 #ifdef MEM_DEL_STATS 2000 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2001 start_pgrp; 2002 #endif /* MEM_DEL_STATS */ 2003 MDSTAT_PGRP(mhp, ntick_pgrp); 2004 MDSTAT_INCR(mhp, modtoxic); 2005 mutex_enter(&mhp->mh_mutex); 2006 continue; 2007 } 2008 MDSTAT_INCR(mhp, nputpage); 2009 vp = pp->p_vnode; 2010 offset = pp->p_offset; 2011 VN_HOLD(vp); 2012 page_unlock(pp); 2013 (void) VOP_PUTPAGE(vp, offset, PAGESIZE, 2014 B_INVAL|B_FORCE, kcred, NULL); 2015 VN_RELE(vp); 2016 #ifdef MEM_DEL_STATS 2017 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2018 start_pgrp; 2019 #endif /* MEM_DEL_STATS */ 2020 MDSTAT_PGRP(mhp, ntick_pgrp); 2021 /* 2022 * Try to get the page back immediately 2023 * so that it can be collected. 2024 */ 2025 pp = page_numtopp_nolock(pfn); 2026 if (pp == NULL) { 2027 MDSTAT_INCR(mhp, nnoreclaim); 2028 /* 2029 * This should not happen as this 2030 * thread is deleting the page. 2031 * If this code is generalized, this 2032 * becomes a reality. 2033 */ 2034 #ifdef DEBUG 2035 cmn_err(CE_WARN, 2036 "delete_memory_thread(0x%p) " 2037 "pfn 0x%lx has no page_t", 2038 (void *)mhp, pfn); 2039 #endif /* DEBUG */ 2040 mutex_enter(&mhp->mh_mutex); 2041 continue; 2042 } 2043 if (page_try_reclaim_lock(pp, SE_EXCL, 2044 SE_EXCL_WANTED | SE_RETIRED)) { 2045 if (PP_ISFREE(pp)) { 2046 goto free_page_collect; 2047 } 2048 page_unlock(pp); 2049 } 2050 MDSTAT_INCR(mhp, nnoreclaim); 2051 mutex_enter(&mhp->mh_mutex); 2052 continue; 2053 2054 reloc: 2055 /* 2056 * Got some freemem and a target 2057 * page, so move the data to avoid 2058 * I/O and lock problems. 2059 */ 2060 ASSERT(!page_iolock_assert(pp)); 2061 MDSTAT_INCR(mhp, nreloc); 2062 /* 2063 * page_relocate() will return pgcnt: the 2064 * number of consecutive pages relocated. 
2065 * If it is successful, pp will be a 2066 * linked list of the page structs that 2067 * were relocated. If page_relocate() is 2068 * unsuccessful, pp will be unmodified. 2069 */ 2070 #ifdef MEM_DEL_STATS 2071 start_pgrp = ddi_get_lbolt(); 2072 #endif /* MEM_DEL_STATS */ 2073 result = page_relocate(&pp, &pp_targ, 0, 0, 2074 &pgcnt, NULL); 2075 #ifdef MEM_DEL_STATS 2076 ntick_pgrp = (uint64_t)ddi_get_lbolt() - 2077 start_pgrp; 2078 #endif /* MEM_DEL_STATS */ 2079 MDSTAT_PGRP(mhp, ntick_pgrp); 2080 if (result != 0) { 2081 MDSTAT_INCR(mhp, nrelocfail); 2082 /* 2083 * We did not succeed. We need 2084 * to give the pp_targ pages back. 2085 * page_free(pp_targ, 1) without 2086 * the freemem accounting. 2087 */ 2088 group_page_unlock(pp); 2089 page_free_replacement_page(pp_targ); 2090 page_unlock(pp); 2091 mutex_enter(&mhp->mh_mutex); 2092 continue; 2093 } 2094 2095 /* 2096 * We will then collect pgcnt pages. 2097 */ 2098 ASSERT(pgcnt > 0); 2099 mutex_enter(&mhp->mh_mutex); 2100 /* 2101 * We need to make sure freemem_left is 2102 * large enough. 2103 */ 2104 while ((freemem_left < pgcnt) && 2105 (!mhp->mh_cancel)) { 2106 freemem_left += 2107 delthr_get_freemem(mhp); 2108 } 2109 2110 /* 2111 * Do not proceed if mh_cancel is set. 2112 */ 2113 if (mhp->mh_cancel) { 2114 while (pp_targ != NULL) { 2115 /* 2116 * Unlink and unlock each page. 2117 */ 2118 tpp_targ = pp_targ; 2119 page_sub(&pp_targ, tpp_targ); 2120 page_unlock(tpp_targ); 2121 } 2122 /* 2123 * We need to give the pp pages back. 2124 * page_free(pp, 1) without the 2125 * freemem accounting. 2126 */ 2127 page_free_replacement_page(pp); 2128 break; 2129 } 2130 2131 /* Now remove pgcnt from freemem_left */ 2132 freemem_left -= pgcnt; 2133 ASSERT(freemem_left >= 0); 2134 szc = pp->p_szc; 2135 while (pp != NULL) { 2136 /* 2137 * pp and pp_targ were passed back as 2138 * a linked list of pages. 2139 * Unlink and unlock each page. 2140 */ 2141 tpp_targ = pp_targ; 2142 page_sub(&pp_targ, tpp_targ); 2143 page_unlock(tpp_targ); 2144 /* 2145 * The original page is now free 2146 * so remove it from the linked 2147 * list and collect it. 2148 */ 2149 tpp = pp; 2150 page_sub(&pp, tpp); 2151 pfn = page_pptonum(tpp); 2152 collected++; 2153 ASSERT(PAGE_EXCL(tpp)); 2154 ASSERT(tpp->p_vnode == NULL); 2155 ASSERT(!hat_page_is_mapped(tpp)); 2156 ASSERT(tpp->p_szc == szc); 2157 tpp->p_szc = 0; 2158 page_delete_collect(tpp, mhp); 2159 bit = pfn - mdsp->mds_base; 2160 mdsp->mds_bitmap[bit / NBPBMW] |= 2161 (1 << (bit % NBPBMW)); 2162 } 2163 ASSERT(pp_targ == NULL); 2164 } 2165 } 2166 first_scan = 0; 2167 if ((mhp->mh_cancel == 0) && (mhp->mh_hold_todo != 0) && 2168 (collected == 0)) { 2169 /* 2170 * This code is needed as we cannot wait 2171 * for a page to be locked OR the delete to 2172 * be cancelled. Also, we must delay so 2173 * that other threads get a chance to run 2174 * on our cpu, otherwise page locks may be 2175 * held indefinitely by those threads. 2176 */ 2177 MDSTAT_INCR(mhp, ndelay); 2178 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2179 (void) cv_timedwait(&mhp->mh_cv, &mhp->mh_mutex, 2180 (lbolt + DEL_BUSY_WAIT_TICKS)); 2181 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2182 } 2183 } 2184 /* stop the dr aio cleanup thread */ 2185 mhp->mh_dr_aio_cleanup_cancel = 1; 2186 transit_list_collect(mhp, 0); 2187 if (freemem_left != 0) { 2188 /* Return any surplus. 
*/ 2189 page_create_putback(freemem_left); 2190 freemem_left = 0; 2191 } 2192 #ifdef MEM_DEL_STATS 2193 ntick_total = (uint64_t)ddi_get_lbolt() - start_total; 2194 #endif /* MEM_DEL_STATS */ 2195 MDSTAT_TOTAL(mhp, ntick_total); 2196 MDSTAT_PRINT(mhp); 2197 2198 /* 2199 * If the memory delete was cancelled, exclusive-wanted bits must 2200 * be cleared. If there are retired pages being deleted, they need 2201 * to be unretired. 2202 */ 2203 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2204 mdsp = mdsp->mds_next) { 2205 pfn_t pfn, p_end; 2206 2207 p_end = mdsp->mds_base + mdsp->mds_npgs; 2208 for (pfn = mdsp->mds_base; pfn < p_end; pfn++) { 2209 page_t *pp; 2210 pgcnt_t bit; 2211 2212 bit = pfn - mdsp->mds_base; 2213 if (mhp->mh_cancel) { 2214 pp = page_numtopp_nolock(pfn); 2215 if (pp != NULL) { 2216 if ((mdsp->mds_bitmap[bit / NBPBMW] & 2217 (1 << (bit % NBPBMW))) == 0) { 2218 page_lock_clr_exclwanted(pp); 2219 } 2220 } 2221 } else { 2222 pp = NULL; 2223 } 2224 if ((mdsp->mds_bitmap_retired[bit / NBPBMW] & 2225 (1 << (bit % NBPBMW))) != 0) { 2226 /* do we already have pp? */ 2227 if (pp == NULL) { 2228 pp = page_numtopp_nolock(pfn); 2229 } 2230 ASSERT(pp != NULL); 2231 ASSERT(PP_RETIRED(pp)); 2232 if (mhp->mh_cancel != 0) { 2233 page_unlock(pp); 2234 /* 2235 * To satisfy ASSERT below in 2236 * cancel code. 2237 */ 2238 mhp->mh_hold_todo++; 2239 } else { 2240 (void) page_unretire_pp(pp, 2241 PR_UNR_CLEAN); 2242 } 2243 } 2244 } 2245 } 2246 /* 2247 * Free retired page bitmap and collected page bitmap 2248 */ 2249 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2250 mdsp = mdsp->mds_next) { 2251 ASSERT(mdsp->mds_bitmap_retired != NULL); 2252 kmem_free(mdsp->mds_bitmap_retired, MDS_BITMAPBYTES(mdsp)); 2253 mdsp->mds_bitmap_retired = NULL; /* Paranoia. */ 2254 ASSERT(mdsp->mds_bitmap != NULL); 2255 kmem_free(mdsp->mds_bitmap, MDS_BITMAPBYTES(mdsp)); 2256 mdsp->mds_bitmap = NULL; /* Paranoia. */ 2257 } 2258 2259 /* wait for our dr aio cancel thread to exit */ 2260 while (!(mhp->mh_aio_cleanup_done)) { 2261 CALLB_CPR_SAFE_BEGIN(&cprinfo); 2262 delay(drv_usectohz(DR_AIO_CLEANUP_DELAY)); 2263 CALLB_CPR_SAFE_END(&cprinfo, &mhp->mh_mutex); 2264 } 2265 refused: 2266 if (mhp->mh_cancel != 0) { 2267 page_t *pp; 2268 2269 comp_code = mhp->mh_cancel; 2270 /* 2271 * Go through list of deleted pages (mh_deleted) freeing 2272 * them. 2273 */ 2274 while ((pp = mhp->mh_deleted) != NULL) { 2275 mhp->mh_deleted = pp->p_next; 2276 mhp->mh_hold_todo++; 2277 mutex_exit(&mhp->mh_mutex); 2278 /* Restore p_next. */ 2279 pp->p_next = pp->p_prev; 2280 if (PP_ISFREE(pp)) { 2281 cmn_err(CE_PANIC, 2282 "page %p is free", 2283 (void *)pp); 2284 } 2285 page_free(pp, 1); 2286 mutex_enter(&mhp->mh_mutex); 2287 } 2288 ASSERT(mhp->mh_hold_todo == mhp->mh_vm_pages); 2289 2290 mutex_exit(&mhp->mh_mutex); 2291 put_availrmem(mhp->mh_vm_pages); 2292 mutex_enter(&mhp->mh_mutex); 2293 2294 goto t_exit; 2295 } 2296 2297 /* 2298 * All the pages are no longer in use and are exclusively locked. 2299 */ 2300 2301 mhp->mh_deleted = NULL; 2302 2303 kphysm_del_cleanup(mhp); 2304 2305 /* 2306 * mem_node_post_del_slice needs to be after kphysm_del_cleanup so 2307 * that the mem_node_config[] will remain intact for the cleanup. 
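 * (Presumably because the cleanup still translates pfns to memory nodes, e.g. for the per-node page counters, while unlinking the memsegs.)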
2308 */ 2309 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2310 mdsp = mdsp->mds_next) { 2311 mem_node_post_del_slice(mdsp->mds_base, 2312 mdsp->mds_base + mdsp->mds_npgs - 1, 0); 2313 } 2314 2315 comp_code = KPHYSM_OK; 2316 2317 t_exit: 2318 mutex_exit(&mhp->mh_mutex); 2319 kphysm_setup_post_del(mhp->mh_vm_pages, 2320 (comp_code == KPHYSM_OK) ? 0 : 1); 2321 mutex_enter(&mhp->mh_mutex); 2322 2323 early_exit: 2324 /* mhp->mh_mutex exited by CALLB_CPR_EXIT() */ 2325 mhp->mh_state = MHND_DONE; 2326 del_complete_funcp = mhp->mh_delete_complete; 2327 del_complete_arg = mhp->mh_delete_complete_arg; 2328 CALLB_CPR_EXIT(&cprinfo); 2329 (*del_complete_funcp)(del_complete_arg, comp_code); 2330 thread_exit(); 2331 /*NOTREACHED*/ 2332 } 2333 2334 /* 2335 * Start the delete of the memory from the system. 2336 */ 2337 int 2338 kphysm_del_start( 2339 memhandle_t handle, 2340 void (*complete)(void *, int), 2341 void *complete_arg) 2342 { 2343 struct mem_handle *mhp; 2344 2345 mhp = kphysm_lookup_mem_handle(handle); 2346 if (mhp == NULL) { 2347 return (KPHYSM_EHANDLE); 2348 } 2349 switch (mhp->mh_state) { 2350 case MHND_FREE: 2351 ASSERT(mhp->mh_state != MHND_FREE); 2352 mutex_exit(&mhp->mh_mutex); 2353 return (KPHYSM_EHANDLE); 2354 case MHND_INIT: 2355 break; 2356 case MHND_STARTING: 2357 case MHND_RUNNING: 2358 mutex_exit(&mhp->mh_mutex); 2359 return (KPHYSM_ESEQUENCE); 2360 case MHND_DONE: 2361 mutex_exit(&mhp->mh_mutex); 2362 return (KPHYSM_ESEQUENCE); 2363 case MHND_RELEASE: 2364 mutex_exit(&mhp->mh_mutex); 2365 return (KPHYSM_ESEQUENCE); 2366 default: 2367 #ifdef DEBUG 2368 cmn_err(CE_WARN, "kphysm_del_start(0x%p) state corrupt %d", 2369 (void *)mhp, mhp->mh_state); 2370 #endif /* DEBUG */ 2371 mutex_exit(&mhp->mh_mutex); 2372 return (KPHYSM_EHANDLE); 2373 } 2374 2375 if (mhp->mh_transit.trl_spans == NULL) { 2376 mutex_exit(&mhp->mh_mutex); 2377 return (KPHYSM_ENOWORK); 2378 } 2379 2380 ASSERT(complete != NULL); 2381 mhp->mh_delete_complete = complete; 2382 mhp->mh_delete_complete_arg = complete_arg; 2383 mhp->mh_state = MHND_STARTING; 2384 /* 2385 * Release the mutex in case thread_create sleeps. 2386 */ 2387 mutex_exit(&mhp->mh_mutex); 2388 2389 /* 2390 * The "obvious" process for this thread is pageout (proc_pageout) 2391 * but this gives the thread too much power over freemem 2392 * which results in freemem starvation. 2393 */ 2394 (void) thread_create(NULL, 0, delete_memory_thread, mhp, 0, &p0, 2395 TS_RUN, maxclsyspri - 1); 2396 2397 return (KPHYSM_OK); 2398 } 2399 2400 static kmutex_t pp_dummy_lock; /* Protects init. of pp_dummy. */ 2401 static caddr_t pp_dummy; 2402 static pgcnt_t pp_dummy_npages; 2403 static pfn_t *pp_dummy_pfn; /* Array of dummy pfns. */ 2404 2405 static void 2406 memseg_remap_init_pages(page_t *pages, page_t *epages) 2407 { 2408 page_t *pp; 2409 2410 for (pp = pages; pp < epages; pp++) { 2411 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2412 pp->p_offset = (u_offset_t)-1; 2413 page_iolock_init(pp); 2414 while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) 2415 continue; 2416 page_lock_delete(pp); 2417 } 2418 } 2419 2420 void 2421 memseg_remap_init() 2422 { 2423 mutex_enter(&pp_dummy_lock); 2424 if (pp_dummy == NULL) { 2425 uint_t dpages; 2426 int i; 2427 2428 /* 2429 * dpages starts off as the size of the structure and 2430 * ends up as the minimum number of pages that will 2431 * hold a whole number of page_t structures. 
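 * For example (illustrative size only): if sizeof (page_t) were 104 bytes (8 * 13), the loop below shifts out the factors of two to leave 13, and 13 pages hold exactly (13 * PAGESIZE) / 104 page_t structures with no remainder.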
2432 */ 2433 dpages = sizeof (page_t); 2434 ASSERT(dpages != 0); 2435 ASSERT(dpages <= MMU_PAGESIZE); 2436 2437 while ((dpages & 1) == 0) 2438 dpages >>= 1; 2439 2440 pp_dummy_npages = dpages; 2441 /* 2442 * Allocate pp_dummy pages directly from static_arena, 2443 * since these are whole page allocations and are 2444 * referenced by physical address. This also has the 2445 * nice fringe benefit of hiding the memory from 2446 * ::findleaks since it doesn't deal well with allocated 2447 * kernel heap memory that doesn't have any mappings. 2448 */ 2449 pp_dummy = vmem_xalloc(static_arena, ptob(pp_dummy_npages), 2450 PAGESIZE, 0, 0, NULL, NULL, VM_SLEEP); 2451 bzero(pp_dummy, ptob(pp_dummy_npages)); 2452 ASSERT(((uintptr_t)pp_dummy & MMU_PAGEOFFSET) == 0); 2453 pp_dummy_pfn = kmem_alloc(sizeof (*pp_dummy_pfn) * 2454 pp_dummy_npages, KM_SLEEP); 2455 for (i = 0; i < pp_dummy_npages; i++) { 2456 pp_dummy_pfn[i] = hat_getpfnum(kas.a_hat, 2457 &pp_dummy[MMU_PAGESIZE * i]); 2458 ASSERT(pp_dummy_pfn[i] != PFN_INVALID); 2459 } 2460 /* 2461 * Initialize the page_t's to a known 'deleted' state 2462 * that matches the state of deleted pages. 2463 */ 2464 memseg_remap_init_pages((page_t *)pp_dummy, 2465 (page_t *)(pp_dummy + ptob(pp_dummy_npages))); 2466 /* Remove kmem mappings for the pages for safety. */ 2467 hat_unload(kas.a_hat, pp_dummy, ptob(pp_dummy_npages), 2468 HAT_UNLOAD_UNLOCK); 2469 /* Leave pp_dummy pointer set as flag that init is done. */ 2470 } 2471 mutex_exit(&pp_dummy_lock); 2472 } 2473 2474 static void 2475 memseg_remap_to_dummy(caddr_t pp, pgcnt_t metapgs) 2476 { 2477 ASSERT(pp_dummy != NULL); 2478 2479 while (metapgs != 0) { 2480 pgcnt_t n; 2481 int i; 2482 2483 n = pp_dummy_npages; 2484 if (n > metapgs) 2485 n = metapgs; 2486 for (i = 0; i < n; i++) { 2487 hat_devload(kas.a_hat, pp, ptob(1), pp_dummy_pfn[i], 2488 PROT_READ, 2489 HAT_LOAD | HAT_LOAD_NOCONSIST | 2490 HAT_LOAD_REMAP); 2491 pp += ptob(1); 2492 } 2493 metapgs -= n; 2494 } 2495 } 2496 2497 /* 2498 * Transition all the deleted pages to the deleted state so that 2499 * page_lock will not wait. The page_lock_delete call will 2500 * also wake up any waiters. 2501 */ 2502 static void 2503 memseg_lock_delete_all(struct memseg *seg) 2504 { 2505 page_t *pp; 2506 2507 for (pp = seg->pages; pp < seg->epages; pp++) { 2508 pp->p_pagenum = PFN_INVALID; /* XXXX */ 2509 page_lock_delete(pp); 2510 } 2511 } 2512 2513 static void 2514 kphysm_del_cleanup(struct mem_handle *mhp) 2515 { 2516 struct memdelspan *mdsp; 2517 struct memseg *seg; 2518 struct memseg **segpp; 2519 struct memseg *seglist; 2520 pfn_t p_end; 2521 uint64_t avmem; 2522 pgcnt_t avpgs; 2523 pgcnt_t npgs; 2524 2525 avpgs = mhp->mh_vm_pages; 2526 2527 memsegs_lock(1); 2528 2529 /* 2530 * remove from main segment list. 2531 */ 2532 npgs = 0; 2533 seglist = NULL; 2534 for (mdsp = mhp->mh_transit.trl_spans; mdsp != NULL; 2535 mdsp = mdsp->mds_next) { 2536 p_end = mdsp->mds_base + mdsp->mds_npgs; 2537 for (segpp = &memsegs; (seg = *segpp) != NULL; ) { 2538 if (seg->pages_base >= p_end || 2539 seg->pages_end <= mdsp->mds_base) { 2540 /* Span and memseg don't overlap. */ 2541 segpp = &((*segpp)->next); 2542 continue; 2543 } 2544 ASSERT(seg->pages_base >= mdsp->mds_base); 2545 ASSERT(seg->pages_end <= p_end); 2546 2547 PLCNT_MODIFY_MAX(seg->pages_base, 2548 seg->pages_base - seg->pages_end); 2549 2550 /* Hide the memseg from future scans. */ 2551 hat_kpm_delmem_mseg_update(seg, segpp); 2552 *segpp = seg->next; 2553 membar_producer(); /* TODO: Needed? 
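 The intent appears to be to make the *segpp unlink visible before the later stores, for threads walking memsegs without the lock.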
*/ 2554 npgs += MSEG_NPAGES(seg); 2555 2556 /* 2557 * Leave the deleted segment's next pointer intact 2558 * in case a memsegs scanning loop is walking this 2559 * segment concurrently. 2560 */ 2561 seg->lnext = seglist; 2562 seglist = seg; 2563 } 2564 } 2565 2566 build_pfn_hash(); 2567 2568 ASSERT(npgs < total_pages); 2569 total_pages -= npgs; 2570 2571 /* 2572 * Recalculate the paging parameters now total_pages has changed. 2573 * This will also cause the clock hands to be reset before next use. 2574 */ 2575 setupclock(1); 2576 2577 memsegs_unlock(1); 2578 2579 mutex_exit(&mhp->mh_mutex); 2580 2581 while ((seg = seglist) != NULL) { 2582 pfn_t mseg_start; 2583 pfn_t mseg_base, mseg_end; 2584 pgcnt_t mseg_npgs; 2585 page_t *pp; 2586 pgcnt_t metapgs; 2587 int dynamic; 2588 int mlret; 2589 2590 seglist = seg->lnext; 2591 2592 /* 2593 * Put the page_t's into the deleted state to stop 2594 * cv_wait()s on the pages. When we remap, the dummy 2595 * page_t's will be in the same state. 2596 */ 2597 memseg_lock_delete_all(seg); 2598 /* 2599 * Collect up information based on pages_base and pages_end 2600 * early so that we can flag early that the memseg has been 2601 * deleted by setting pages_end == pages_base. 2602 */ 2603 mseg_base = seg->pages_base; 2604 mseg_end = seg->pages_end; 2605 mseg_npgs = MSEG_NPAGES(seg); 2606 dynamic = memseg_is_dynamic(seg, &mseg_start); 2607 2608 seg->pages_end = seg->pages_base; 2609 2610 if (dynamic) { 2611 pp = seg->pages; 2612 metapgs = mseg_base - mseg_start; 2613 ASSERT(metapgs != 0); 2614 2615 /* Remap the meta data to our special dummy area. */ 2616 memseg_remap_to_dummy((caddr_t)pp, metapgs); 2617 2618 mutex_enter(&memseg_lists_lock); 2619 seg->lnext = memseg_va_avail; 2620 memseg_va_avail = seg; 2621 mutex_exit(&memseg_lists_lock); 2622 } else { 2623 /* 2624 * Set for clean-up below. 2625 */ 2626 mseg_start = seg->pages_base; 2627 /* 2628 * For memory whose page_ts were allocated 2629 * at boot, we need to find a new use for 2630 * the page_t memory. 2631 * For the moment, just leak it. 2632 * (It is held in the memseg_delete_junk list.) 2633 */ 2634 2635 mutex_enter(&memseg_lists_lock); 2636 seg->lnext = memseg_delete_junk; 2637 memseg_delete_junk = seg; 2638 mutex_exit(&memseg_lists_lock); 2639 } 2640 2641 /* Must not use seg now as it could be re-used. */ 2642 2643 memlist_write_lock(); 2644 2645 mlret = memlist_delete_span( 2646 (uint64_t)(mseg_base) << PAGESHIFT, 2647 (uint64_t)(mseg_npgs) << PAGESHIFT, 2648 &phys_avail); 2649 ASSERT(mlret == MEML_SPANOP_OK); 2650 2651 mlret = memlist_delete_span( 2652 (uint64_t)(mseg_start) << PAGESHIFT, 2653 (uint64_t)(mseg_end - mseg_start) << 2654 PAGESHIFT, 2655 &phys_install); 2656 ASSERT(mlret == MEML_SPANOP_OK); 2657 phys_install_has_changed(); 2658 2659 memlist_write_unlock(); 2660 } 2661 2662 memlist_read_lock(); 2663 installed_top_size(phys_install, &physmax, &physinstalled); 2664 memlist_read_unlock(); 2665 2666 mutex_enter(&freemem_lock); 2667 maxmem -= avpgs; 2668 physmem -= avpgs; 2669 /* availrmem is adjusted during the delete. 
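Only availrmem_initial needs to be brought in line here.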
*/ 2670 availrmem_initial -= avpgs; 2671 2672 mutex_exit(&freemem_lock); 2673 2674 dump_resize(); 2675 2676 cmn_err(CE_CONT, "?kphysm_delete: mem = %ldK " 2677 "(0x%" PRIx64 ")\n", 2678 physinstalled << (PAGESHIFT - 10), 2679 (uint64_t)physinstalled << PAGESHIFT); 2680 2681 avmem = (uint64_t)freemem << PAGESHIFT; 2682 cmn_err(CE_CONT, "?kphysm_delete: " 2683 "avail mem = %" PRId64 "\n", avmem); 2684 2685 /* 2686 * Update lgroup generation number on single lgroup systems 2687 */ 2688 if (nlgrps == 1) 2689 lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0); 2690 2691 /* Successfully deleted system memory */ 2692 mutex_enter(&mhp->mh_mutex); 2693 } 2694 2695 static uint_t mdel_nullvp_waiter; 2696 2697 static void 2698 page_delete_collect( 2699 page_t *pp, 2700 struct mem_handle *mhp) 2701 { 2702 if (pp->p_vnode) { 2703 page_hashout(pp, (kmutex_t *)NULL); 2704 /* do not do PP_SETAGED(pp); */ 2705 } else { 2706 kmutex_t *sep; 2707 2708 sep = page_se_mutex(pp); 2709 mutex_enter(sep); 2710 if (CV_HAS_WAITERS(&pp->p_cv)) { 2711 mdel_nullvp_waiter++; 2712 cv_broadcast(&pp->p_cv); 2713 } 2714 mutex_exit(sep); 2715 } 2716 ASSERT(pp->p_next == pp->p_prev); 2717 ASSERT(pp->p_next == NULL || pp->p_next == pp); 2718 pp->p_next = mhp->mh_deleted; 2719 mhp->mh_deleted = pp; 2720 ASSERT(mhp->mh_hold_todo != 0); 2721 mhp->mh_hold_todo--; 2722 } 2723 2724 static void 2725 transit_list_collect(struct mem_handle *mhp, int v) 2726 { 2727 struct transit_list_head *trh; 2728 2729 trh = &transit_list_head; 2730 mutex_enter(&trh->trh_lock); 2731 mhp->mh_transit.trl_collect = v; 2732 mutex_exit(&trh->trh_lock); 2733 } 2734 2735 static void 2736 transit_list_insert(struct transit_list *tlp) 2737 { 2738 struct transit_list_head *trh; 2739 2740 trh = &transit_list_head; 2741 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2742 tlp->trl_next = trh->trh_head; 2743 trh->trh_head = tlp; 2744 } 2745 2746 static void 2747 transit_list_remove(struct transit_list *tlp) 2748 { 2749 struct transit_list_head *trh; 2750 struct transit_list **tlpp; 2751 2752 trh = &transit_list_head; 2753 tlpp = &trh->trh_head; 2754 ASSERT(MUTEX_HELD(&trh->trh_lock)); 2755 while (*tlpp != NULL && *tlpp != tlp) 2756 tlpp = &(*tlpp)->trl_next; 2757 ASSERT(*tlpp != NULL); 2758 if (*tlpp == tlp) 2759 *tlpp = tlp->trl_next; 2760 tlp->trl_next = NULL; 2761 } 2762 2763 static struct transit_list * 2764 pfnum_to_transit_list(struct transit_list_head *trh, pfn_t pfnum) 2765 { 2766 struct transit_list *tlp; 2767 2768 for (tlp = trh->trh_head; tlp != NULL; tlp = tlp->trl_next) { 2769 struct memdelspan *mdsp; 2770 2771 for (mdsp = tlp->trl_spans; mdsp != NULL; 2772 mdsp = mdsp->mds_next) { 2773 if (pfnum >= mdsp->mds_base && 2774 pfnum < (mdsp->mds_base + mdsp->mds_npgs)) { 2775 return (tlp); 2776 } 2777 } 2778 } 2779 return (NULL); 2780 } 2781 2782 int 2783 pfn_is_being_deleted(pfn_t pfnum) 2784 { 2785 struct transit_list_head *trh; 2786 struct transit_list *tlp; 2787 int ret; 2788 2789 trh = &transit_list_head; 2790 if (trh->trh_head == NULL) 2791 return (0); 2792 2793 mutex_enter(&trh->trh_lock); 2794 tlp = pfnum_to_transit_list(trh, pfnum); 2795 ret = (tlp != NULL && tlp->trl_collect); 2796 mutex_exit(&trh->trh_lock); 2797 2798 return (ret); 2799 } 2800 2801 #ifdef MEM_DEL_STATS 2802 extern int hz; 2803 static void 2804 mem_del_stat_print_func(struct mem_handle *mhp) 2805 { 2806 uint64_t tmp; 2807 2808 if (mem_del_stat_print) { 2809 printf("memory delete loop %x/%x, statistics%s\n", 2810 (uint_t)mhp->mh_transit.trl_spans->mds_base, 2811 
(uint_t)mhp->mh_transit.trl_spans->mds_npgs, 2812 (mhp->mh_cancel ? " (cancelled)" : "")); 2813 printf("\t%8u nloop\n", mhp->mh_delstat.nloop); 2814 printf("\t%8u need_free\n", mhp->mh_delstat.need_free); 2815 printf("\t%8u free_loop\n", mhp->mh_delstat.free_loop); 2816 printf("\t%8u free_low\n", mhp->mh_delstat.free_low); 2817 printf("\t%8u free_failed\n", mhp->mh_delstat.free_failed); 2818 printf("\t%8u ncheck\n", mhp->mh_delstat.ncheck); 2819 printf("\t%8u nopaget\n", mhp->mh_delstat.nopaget); 2820 printf("\t%8u lockfail\n", mhp->mh_delstat.lockfail); 2821 printf("\t%8u nfree\n", mhp->mh_delstat.nfree); 2822 printf("\t%8u nreloc\n", mhp->mh_delstat.nreloc); 2823 printf("\t%8u nrelocfail\n", mhp->mh_delstat.nrelocfail); 2824 printf("\t%8u already_done\n", mhp->mh_delstat.already_done); 2825 printf("\t%8u first_notfree\n", mhp->mh_delstat.first_notfree); 2826 printf("\t%8u npplocked\n", mhp->mh_delstat.npplocked); 2827 printf("\t%8u nlockreloc\n", mhp->mh_delstat.nlockreloc); 2828 printf("\t%8u nnorepl\n", mhp->mh_delstat.nnorepl); 2829 printf("\t%8u nmodreloc\n", mhp->mh_delstat.nmodreloc); 2830 printf("\t%8u ndestroy\n", mhp->mh_delstat.ndestroy); 2831 printf("\t%8u nputpage\n", mhp->mh_delstat.nputpage); 2832 printf("\t%8u nnoreclaim\n", mhp->mh_delstat.nnoreclaim); 2833 printf("\t%8u ndelay\n", mhp->mh_delstat.ndelay); 2834 printf("\t%8u demotefail\n", mhp->mh_delstat.demotefail); 2835 printf("\t%8u retired\n", mhp->mh_delstat.retired); 2836 printf("\t%8u toxic\n", mhp->mh_delstat.toxic); 2837 printf("\t%8u failing\n", mhp->mh_delstat.failing); 2838 printf("\t%8u modtoxic\n", mhp->mh_delstat.modtoxic); 2839 printf("\t%8u npplkdtoxic\n", mhp->mh_delstat.npplkdtoxic); 2840 printf("\t%8u gptlmodfail\n", mhp->mh_delstat.gptlmodfail); 2841 printf("\t%8u gptllckfail\n", mhp->mh_delstat.gptllckfail); 2842 tmp = mhp->mh_delstat.nticks_total / hz; /* seconds */ 2843 printf( 2844 "\t%"PRIu64" nticks_total - %"PRIu64" min %"PRIu64" sec\n", 2845 mhp->mh_delstat.nticks_total, tmp / 60, tmp % 60); 2846 2847 tmp = mhp->mh_delstat.nticks_pgrp / hz; /* seconds */ 2848 printf( 2849 "\t%"PRIu64" nticks_pgrp - %"PRIu64" min %"PRIu64" sec\n", 2850 mhp->mh_delstat.nticks_pgrp, tmp / 60, tmp % 60); 2851 } 2852 } 2853 #endif /* MEM_DEL_STATS */ 2854 2855 struct mem_callback { 2856 kphysm_setup_vector_t *vec; 2857 void *arg; 2858 }; 2859 2860 #define NMEMCALLBACKS 100 2861 2862 static struct mem_callback mem_callbacks[NMEMCALLBACKS]; 2863 static uint_t nmemcallbacks; 2864 static krwlock_t mem_callback_rwlock; 2865 2866 int 2867 kphysm_setup_func_register(kphysm_setup_vector_t *vec, void *arg) 2868 { 2869 uint_t i, found; 2870 2871 /* 2872 * This test will become more complicated when the version must 2873 * change. 2874 */ 2875 if (vec->version != KPHYSM_SETUP_VECTOR_VERSION) 2876 return (EINVAL); 2877 2878 if (vec->post_add == NULL || vec->pre_del == NULL || 2879 vec->post_del == NULL) 2880 return (EINVAL); 2881 2882 rw_enter(&mem_callback_rwlock, RW_WRITER); 2883 for (i = 0, found = 0; i < nmemcallbacks; i++) { 2884 if (mem_callbacks[i].vec == NULL && found == 0) 2885 found = i + 1; 2886 if (mem_callbacks[i].vec == vec && 2887 mem_callbacks[i].arg == arg) { 2888 #ifdef DEBUG 2889 /* Catch this in DEBUG kernels. 
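The duplicate registration still fails with EEXIST on all kernels; DEBUG kernels additionally log this warning.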
*/ 2890 cmn_err(CE_WARN, "kphysm_setup_func_register" 2891 "(0x%p, 0x%p) duplicate registration from 0x%p", 2892 (void *)vec, arg, (void *)caller()); 2893 #endif /* DEBUG */ 2894 rw_exit(&mem_callback_rwlock); 2895 return (EEXIST); 2896 } 2897 } 2898 if (found != 0) { 2899 i = found - 1; 2900 } else { 2901 ASSERT(nmemcallbacks < NMEMCALLBACKS); 2902 if (nmemcallbacks == NMEMCALLBACKS) { 2903 rw_exit(&mem_callback_rwlock); 2904 return (ENOMEM); 2905 } 2906 i = nmemcallbacks++; 2907 } 2908 mem_callbacks[i].vec = vec; 2909 mem_callbacks[i].arg = arg; 2910 rw_exit(&mem_callback_rwlock); 2911 return (0); 2912 } 2913 2914 void 2915 kphysm_setup_func_unregister(kphysm_setup_vector_t *vec, void *arg) 2916 { 2917 uint_t i; 2918 2919 rw_enter(&mem_callback_rwlock, RW_WRITER); 2920 for (i = 0; i < nmemcallbacks; i++) { 2921 if (mem_callbacks[i].vec == vec && 2922 mem_callbacks[i].arg == arg) { 2923 mem_callbacks[i].vec = NULL; 2924 mem_callbacks[i].arg = NULL; 2925 if (i == (nmemcallbacks - 1)) 2926 nmemcallbacks--; 2927 break; 2928 } 2929 } 2930 rw_exit(&mem_callback_rwlock); 2931 } 2932 2933 static void 2934 kphysm_setup_post_add(pgcnt_t delta_pages) 2935 { 2936 uint_t i; 2937 2938 rw_enter(&mem_callback_rwlock, RW_READER); 2939 for (i = 0; i < nmemcallbacks; i++) { 2940 if (mem_callbacks[i].vec != NULL) { 2941 (*mem_callbacks[i].vec->post_add) 2942 (mem_callbacks[i].arg, delta_pages); 2943 } 2944 } 2945 rw_exit(&mem_callback_rwlock); 2946 } 2947 2948 /* 2949 * Note the locking between pre_del and post_del: The reader lock is held 2950 * between the two calls to stop the set of functions from changing. 2951 */ 2952 2953 static int 2954 kphysm_setup_pre_del(pgcnt_t delta_pages) 2955 { 2956 uint_t i; 2957 int ret; 2958 int aret; 2959 2960 ret = 0; 2961 rw_enter(&mem_callback_rwlock, RW_READER); 2962 for (i = 0; i < nmemcallbacks; i++) { 2963 if (mem_callbacks[i].vec != NULL) { 2964 aret = (*mem_callbacks[i].vec->pre_del) 2965 (mem_callbacks[i].arg, delta_pages); 2966 ret |= aret; 2967 } 2968 } 2969 2970 return (ret); 2971 } 2972 2973 static void 2974 kphysm_setup_post_del(pgcnt_t delta_pages, int cancelled) 2975 { 2976 uint_t i; 2977 2978 for (i = 0; i < nmemcallbacks; i++) { 2979 if (mem_callbacks[i].vec != NULL) { 2980 (*mem_callbacks[i].vec->post_del) 2981 (mem_callbacks[i].arg, delta_pages, cancelled); 2982 } 2983 } 2984 rw_exit(&mem_callback_rwlock); 2985 } 2986 2987 static int 2988 kphysm_split_memseg( 2989 pfn_t base, 2990 pgcnt_t npgs) 2991 { 2992 struct memseg *seg; 2993 struct memseg **segpp; 2994 pgcnt_t size_low, size_high; 2995 struct memseg *seg_low, *seg_mid, *seg_high; 2996 2997 /* 2998 * Lock the memsegs list against other updates now 2999 */ 3000 memsegs_lock(1); 3001 3002 /* 3003 * Find boot time memseg that wholly covers this area. 3004 */ 3005 3006 /* First find the memseg with page 'base' in it. */ 3007 for (segpp = &memsegs; (seg = *segpp) != NULL; 3008 segpp = &((*segpp)->next)) { 3009 if (base >= seg->pages_base && base < seg->pages_end) 3010 break; 3011 } 3012 if (seg == NULL) { 3013 memsegs_unlock(1); 3014 return (0); 3015 } 3016 if (memseg_is_dynamic(seg, (pfn_t *)NULL)) { 3017 memsegs_unlock(1); 3018 return (0); 3019 } 3020 if ((base + npgs) > seg->pages_end) { 3021 memsegs_unlock(1); 3022 return (0); 3023 } 3024 3025 /* 3026 * Work out the size of the two segments that will 3027 * surround the new segment, one for low address 3028 * and one for high. 
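 * For example (illustrative pfns only): carving [0x10000, 0x18000) out of a memseg spanning [0x8000, 0x20000) gives size_low = 0x8000 pages below and size_high = 0x8000 pages above the new middle segment.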
3029 */ 3030 ASSERT(base >= seg->pages_base); 3031 size_low = base - seg->pages_base; 3032 ASSERT(seg->pages_end >= (base + npgs)); 3033 size_high = seg->pages_end - (base + npgs); 3034 3035 /* 3036 * Sanity check. 3037 */ 3038 if ((size_low + size_high) == 0) { 3039 memsegs_unlock(1); 3040 return (0); 3041 } 3042 3043 /* 3044 * Allocate the new structures. The old memseg will not be freed 3045 * as there may be a reference to it. 3046 */ 3047 seg_low = NULL; 3048 seg_high = NULL; 3049 3050 if (size_low != 0) { 3051 seg_low = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3052 bzero(seg_low, sizeof (struct memseg)); 3053 } 3054 3055 seg_mid = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3056 bzero(seg_mid, sizeof (struct memseg)); 3057 3058 if (size_high != 0) { 3059 seg_high = kmem_cache_alloc(memseg_cache, KM_SLEEP); 3060 bzero(seg_high, sizeof (struct memseg)); 3061 } 3062 3063 /* 3064 * All allocation done now. 3065 */ 3066 if (size_low != 0) { 3067 seg_low->pages = seg->pages; 3068 seg_low->epages = seg_low->pages + size_low; 3069 seg_low->pages_base = seg->pages_base; 3070 seg_low->pages_end = seg_low->pages_base + size_low; 3071 seg_low->next = seg_mid; 3072 } 3073 if (size_high != 0) { 3074 seg_high->pages = seg->epages - size_high; 3075 seg_high->epages = seg_high->pages + size_high; 3076 seg_high->pages_base = seg->pages_end - size_high; 3077 seg_high->pages_end = seg_high->pages_base + size_high; 3078 seg_high->next = seg->next; 3079 } 3080 3081 seg_mid->pages = seg->pages + size_low; 3082 seg_mid->pages_base = seg->pages_base + size_low; 3083 seg_mid->epages = seg->epages - size_high; 3084 seg_mid->pages_end = seg->pages_end - size_high; 3085 seg_mid->next = (seg_high != NULL) ? seg_high : seg->next; 3086 3087 /* 3088 * Update hat_kpm specific info of all involved memsegs and 3089 * allow hat_kpm specific global chain updates. 3090 */ 3091 hat_kpm_split_mseg_update(seg, segpp, seg_low, seg_mid, seg_high); 3092 3093 /* 3094 * At this point we have two equivalent memseg sub-chains, 3095 * seg and seg_low/seg_mid/seg_high, which both chain on to 3096 * the same place in the global chain. By re-writing the pointer 3097 * in the previous element we switch atomically from using the old 3098 * (seg) to the new. 3099 */ 3100 *segpp = (seg_low != NULL) ? seg_low : seg_mid; 3101 3102 membar_enter(); 3103 3104 build_pfn_hash(); 3105 memsegs_unlock(1); 3106 3107 /* 3108 * We leave the old segment, 'seg', intact as there may be 3109 * references to it. Also, as the value of total_pages has not 3110 * changed and the memsegs list is effectively the same when 3111 * accessed via the old or the new pointer, we do not have to 3112 * cause pageout_scanner() to re-evaluate its hand pointers. 3113 * 3114 * We currently do not re-use or reclaim the page_t memory. 3115 * If we do, then this may have to change. 3116 */ 3117 3118 mutex_enter(&memseg_lists_lock); 3119 seg->lnext = memseg_edit_junk; 3120 memseg_edit_junk = seg; 3121 mutex_exit(&memseg_lists_lock); 3122 3123 return (1); 3124 } 3125 3126 /* 3127 * The sfmmu hat layer (e.g.) accesses some parts of the memseg 3128 * structure using physical addresses. Therefore a kmem_cache is 3129 * used with KMC_NOHASH to avoid page crossings within a memseg 3130 * structure. KMC_NOHASH requires that no external (outside of 3131 * slab) information is allowed. This, in turn, implies that the 3132 * cache's slabsize must be exactly a single page, since per-slab 3133 * information (e.g. 
the freelist for the slab) is kept at the 3134 * end of the slab, where it is easy to locate. This should be changed 3135 * when a more obvious kmem_cache interface/flag becomes 3136 * available. 3137 */ 3138 void 3139 mem_config_init() 3140 { 3141 memseg_cache = kmem_cache_create("memseg_cache", sizeof (struct memseg), 3142 0, NULL, NULL, NULL, NULL, static_arena, KMC_NOHASH); 3143 } 3144
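/*
 * Illustrative sketch only: a kernel client that wants to be told about
 * capacity changes would register a kphysm_setup_vector_t roughly as
 * below.  The xx_* names and xx_softstate argument are hypothetical, and
 * the initializer order assumes the declaration in <sys/mem_config.h>
 * (version, post_add, pre_del, post_del), matching the checks in
 * kphysm_setup_func_register() above.
 *
 *	static void
 *	xx_post_add(void *arg, pgcnt_t delta_pages)
 *	{
 *		... react to the added capacity ...
 *	}
 *
 *	static int
 *	xx_pre_del(void *arg, pgcnt_t delta_pages)
 *	{
 *		return (0);		... 0 means no objection ...
 *	}
 *
 *	static void
 *	xx_post_del(void *arg, pgcnt_t delta_pages, int cancelled)
 *	{
 *		... cancelled != 0 if the delete was aborted ...
 *	}
 *
 *	static kphysm_setup_vector_t xx_vec = {
 *		KPHYSM_SETUP_VECTOR_VERSION,
 *		xx_post_add,
 *		xx_pre_del,
 *		xx_post_del
 *	};
 *
 *	(void) kphysm_setup_func_register(&xx_vec, xx_softstate);
 *	...
 *	kphysm_setup_func_unregister(&xx_vec, xx_softstate);
 */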